diff --git a/CMakeLists.txt b/CMakeLists.txt
index f43eab264d20..409d1a716a8a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -829,6 +829,7 @@ if(NOT DEFINED ENV{CONDA_BUILD})
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python
   )
   add_dependencies(tvm_cython tvm)
+  message(STATUS "Add Cython build into the default build step")
 endif()
 
 # Installation rules
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
deleted file mode 100644
index 44c54b1cf297..000000000000
--- a/apps/benchmark/README.md
+++ /dev/null
@@ -1,156 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-
-# Performance Benchmark
-
-## Results
-
-See results on wiki page https://github.com/apache/tvm/wiki/Benchmark
-
-## How to Reproduce
-
-To obtain the best performance, we always do auto-tuning for the specific devices and get
-the parameters for used kernels. To enable easy reproduction of our results, we release
-pre-tuned parameters for popular networks on some common devices.
-TVM will download related tuning cache files during compilation.
-
-If you don't have the following listed devices, you can still run these scripts.
-You can pick the one that is most similar to your device as argument.
-In general, the performance should also be good.
-
-It is recommended that you run tuning by yourself if you have your customized network or devices.
-Please follow the tutorial for
-[NVIDIA GPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_conv2d_cuda.html),
-[ARM CPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html),
-[Mobile GPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_mobile_gpu.html) and
-[Adreno GPU](https://www.qualcomm.com/products/features/adreno).
-
-### NVIDIA GPU
-
-Build TVM with LLVM and CUDA enabled. [Help](https://tvm.apache.org/docs/install/from_source.html)
-
-```bash
-python3 gpu_imagenet_bench.py --model 1080ti
-python3 gpu_imagenet_bench.py --model titanx
-
-# For NVIDIA Jetson TX2, you can run the following command directly on the board,
-# or use cross compilation and RPC like what we do for ARM CPU.
-python3 gpu_imagenet_bench.py --model tx2
-```
-
-### ARM CPU & Mali GPU
-For embedded devices, we use RPC infrastructure in TVM to make the management easy.
-You need to use it for reproducing benchmark results.
-
-**Note**: We use llvm-4.0 in our tuning environment. Mismatch of the LLVM version during tuning and deployment can influence the performance, so you have to use a same version for reproduction.
-
-0. Build TVM with LLVM enabled. [Help](https://tvm.apache.org/docs/install/from_source.html)
-
-1. Start an RPC Tracker on the host machine
-```bash
-python3 -m tvm.exec.rpc_tracker
-```
-
-2. Register devices to the tracker
-* For Linux device
-  * Build tvm runtime on your device [Help](https://tvm.apache.org/docs/tutorials/frontend/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
-  * Register your device to tracker by
-  ```bash
-  python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
-  ```
-  replace `[HOST_IP]` with the IP address of the host machine, `[DEVICE_KEY]` with the name of device.
-
-  E.g. Here is an example command for RK3399,
-  `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
-
-* For Android device
-   * Build and install tvm RPC apk on your device [Help](https://github.com/apache/tvm/tree/main/apps/android_rpc).
-     Make sure you can pass the android rpc test. Then you have alreadly known how to register.
-
-3. Verify the device registration
-  We can query all registered devices by
-  ```bash
-  python3 -m tvm.exec.query_rpc_tracker
-  ```
-  You should be able to find your devices in `Queue Status`. Make sure the registration is correct before going ahead.
-
-  For our test environment, one sample output can be
-  ```bash
-  Queue Status
-  ----------------------------------
-  key          total  free  pending
-  ----------------------------------
-  mate10pro    1      1     0
-  p20pro       2      2     0
-  pixel2       2      2     0
-  rk3399       2      2     0
-  rasp3b       8      8     0
-  ```
-
-4. Run benchmark
-  ```bash
-  # ARM CPU
-  python3 arm_cpu_imagenet_bench.py --model rasp3b --rpc-key rasp3b
-  python3 arm_cpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
-  python3 arm_cpu_imagenet_bench.py --model pixel2 --rpc-key pixel2
-  python3 arm_cpu_imagenet_bench.py --model p20pro --rpc-key p20pro
-  python3 arm_cpu_imagenet_bench.py --model mate10pro --rpc-key mate10pro
-  ```
-
-  ```bash
-  # Mali GPU
-  # NOTE: To make the test environment more stable, we close GUI and lock the frequency
-  sudo /etc/init.d/lightdm stop
-  sudo -i
-  echo performance > /sys/class/misc/mali0/device/devfreq/ff9a0000.gpu/governor
-  python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
-  python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399 --dtype float16
-  ```
-
-### AMD GPU
-
-Build TVM with LLVM and ROCm enabled. [Help](https://tvm.apache.org/docs/install/from_source.html)
-```bash
-python3 gpu_imagenet_bench.py --model gfx900 --target rocm
-```
-
-### Adreno GPU
-
-Adreno benchmarks are automated over the docker - [ci_adreno](https://github.com/apache/tvm/blob/main/docker/Dockerfile.ci_adreno).
-Adreno docker share the Android devices from host. It is adviced to have host adb version same as docker, which is ```1.0.41```
-
-Below command runs all (OpenCL native, CLML SDK) the benchmarks over given Android device.
-```bash
-export ANDROID_SERIAL=<ADB ID>
-./tests/scripts/ci.py adreno -b
-```
-Below command runs all OpenCL native benchmarks over given Android device.
-```bash
-export ANDROID_SERIAL=<ADB ID>
-./tests/scripts/ci.py adreno -n
-```
-CLML SDK benchmarks require CLML SDK path to be exported and the SDK version should match with target device's SDK version.
-
-Below command runs all CLML SDK benchmarks over given Android device.
-```bash
-export ADRENO_OPENCL=<CLML SDK PATH>
-export ANDROID_SERIAL=<ADB ID>
-./tests/scripts/ci.py adreno -c
-```
-
-Note: Tuning cache is implicite through tophub repo for all the benchmarks and is tuned over Snapdragon Gen 1.
diff --git a/apps/benchmark/adreno/adreno_gpu_bench_clml.py b/apps/benchmark/adreno/adreno_gpu_bench_clml.py
deleted file mode 100755
index a7e2e5e9c202..000000000000
--- a/apps/benchmark/adreno/adreno_gpu_bench_clml.py
+++ /dev/null
@@ -1,305 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Benchmark script for various models on Adreno GPU.
-"""
-import argparse
-
-import numpy as np
-
-import os
-import sys
-import tvm
-from tvm import te
-from tvm.relay import testing
-from tvm.contrib.utils import tempdir
-from tvm.relay.op.contrib import clml
-import tvm.contrib.graph_executor as runtime
-from tvm import relay
-from tvm import autotvm
-from tvm.contrib import utils, ndk
-
-
-def get_network(name, batch_size, dtype="float32"):
-    """Get the symbol definition and random weight of a network
-
-    Parameters
-    ----------
-    name: str
-        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
-    batch_size: int
-        batch size
-    dtype: str
-        Data type
-
-    Returns
-    -------
-    net: tvm.IRModule
-        The relay function of network definition
-    params: dict
-        The random parameters for benchmark
-    input_shape: tuple
-        The shape of input tensor
-    output_shape: tuple
-        The shape of output tensor
-    """
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == "mobilenet":
-        net, params = testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == "inception_v3":
-        input_shape = (batch_size, 3, 299, 299)
-        net, params = testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif "resnet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.resnet.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "vgg" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.vgg.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "densenet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.densenet.get_workload(
-            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "squeezenet" in name:
-        version = name.split("_v")[1]
-        net, params = testing.squeezenet.get_workload(
-            batch_size=batch_size, version=version, dtype=dtype
-        )
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return net, params, input_shape, output_shape
-
-
-def print_progress(msg):
-    """print progress message
-
-    Parameters
-    ----------
-    msg: str
-        The message to print
-    """
-    sys.stdout.write(msg + "\r")
-    sys.stdout.flush()
-
-
-def tune_tasks(
-    tasks,
-    measure_option,
-    tuner="xgb",
-    n_trial=1024,
-    early_stopping=None,
-    log_filename="tuning.log",
-):
-    from tvm.autotvm.tuner import XGBTuner
-
-    tmp_log_file = log_filename + ".tmp"
-
-    for i, tsk in enumerate(reversed(tasks)):
-        print("Task: ", tsk)
-        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
-
-        # create tuner
-        if tuner == "xgb":
-            tuner_obj = XGBTuner(tsk, loss_type="reg")
-        elif tuner == "xgb_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob")
-        elif tuner == "xgb_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar")
-        elif tuner == "xgb_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve")
-        elif tuner == "xgb_rank":
-            tuner_obj = XGBTuner(tsk, loss_type="rank")
-        elif tuner == "xgb_rank_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
-        elif tuner == "xgb_rank_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar")
-        elif tuner == "xgb_rank_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve")
-        elif tuner == "xgb_rank_binary":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary")
-        elif tuner == "xgb_rank_binary_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob")
-        elif tuner == "xgb_rank_binary_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar")
-        elif tuner == "xgb_rank_binary_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve")
-        elif tuner == "ga":
-            tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == "random":
-            tuner_obj = RandomTuner(tsk)
-        elif tuner == "gridsearch":
-            tuner_obj = GridSearchTuner(tsk)
-        else:
-            raise ValueError("Invalid tuner: " + tuner)
-
-        tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(
-            n_trial=tsk_trial,
-            early_stopping=early_stopping,
-            measure_option=measure_option,
-            callbacks=[
-                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                autotvm.callback.log_to_file(tmp_log_file),
-            ],
-        )
-
-        autotvm.record.pick_best(tmp_log_file, log_filename)
-
-
-def evaluate_network(network, target, target_host, dtype, repeat):
-    print_progress(network)
-    net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype)
-
-    # Auto Tuning
-    tune_log = "adreno-" + network + "-" + dtype + ".log"
-    tuning_options = {
-        "log_filename": tune_log,
-        "early_stopping": None,
-        "measure_option": autotvm.measure_option(
-            builder=autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=15),
-            runner=autotvm.RPCRunner(
-                args.rpc_key,
-                host=args.host,
-                port=args.port,
-                number=3,
-                timeout=600,
-            ),
-        ),
-    }
-    if args.tune:
-        tasks = autotvm.task.extract_from_program(
-            net, target=target, target_host=target_host, params=params
-        )
-        tune_tasks(tasks, **tuning_options)
-
-    print_progress("%-20s building..." % network)
-
-    # Build the tuning log
-    if os.path.exists(tune_log):
-        with autotvm.apply_history_best(tune_log):
-            with tvm.transform.PassContext(opt_level=3):
-                net = clml.partition_for_clml(net, params)
-                lib = relay.build(
-                    net, target=tvm.target.Target(target, host=target_host), params=params
-                )
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            net = clml.partition_for_clml(net, params)
-
-            lib = relay.build(
-                net, target=tvm.target.Target(target, host=target_host), params=params
-            )
-
-    tmp = tempdir()
-
-    filename = "%s.so" % network
-    lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared)
-
-    # upload library and params
-    print_progress("%-20s uploading..." % network)
-
-    # connect to remote device
-    tracker = tvm.rpc.connect_tracker(args.host, args.port)
-    remote = tracker.request(args.rpc_key)
-
-    dev = remote.device(str(target), 0)
-    remote.upload(tmp.relpath(filename))
-
-    rlib = remote.load_module(filename)
-    module = runtime.GraphModule(rlib["default"](dev))
-    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-    module.set_input("data", data_tvm)
-
-    # evaluate
-    print_progress("%-20s evaluating..." % network)
-    ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat)
-    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-    print(
-        "%-20s %-19s (%s)"
-        % (network + "-" + dtype, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))
-    )
-    return (np.mean(prof_res), np.std(prof_res))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--network",
-        type=str,
-        choices=[
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            "vgg-16",
-            "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ],
-        help="The name of neural network",
-    )
-    parser.add_argument("--host", type=str, default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=9190)
-    parser.add_argument("--rpc-key", type=str, default="android")
-    parser.add_argument("--repeat", type=int, default=30)
-    parser.add_argument("--tune", type=bool, default=False)
-    args = parser.parse_args()
-
-    if args.network is None:
-        networks = [
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            # "vgg-16",
-            # "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ]
-    else:
-        networks = [args.network]
-
-    target = "opencl"
-    target_host = "llvm -mtriple=arm64-linux-android"
-
-    print("--------------------------------------------------")
-    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
-    print("--------------------------------------------------")
-
-    results = {}
-
-    for network in networks:
-        ftime = evaluate_network(network, target, target_host, "float32", args.repeat)
-        results[network + "-float32"] = ftime
-        ftime = evaluate_network(network, target, target_host, "float16", args.repeat)
-        results[network + "-float16"] = ftime
-
-    print("----------------------------------------------------------------------")
-    print("%-30s %-30s" % ("Network Name", "Mean Inference Time        (std dev)"))
-    print("----------------------------------------------------------------------")
-    for key, val in results.items():
-        print("%-30s %-30s (%s)" % (key, "%.2f ms" % val[0], "%.2f ms" % val[1]))
diff --git a/apps/benchmark/adreno/adreno_gpu_bench_texture.py b/apps/benchmark/adreno/adreno_gpu_bench_texture.py
deleted file mode 100755
index 5c4ee3bb6e43..000000000000
--- a/apps/benchmark/adreno/adreno_gpu_bench_texture.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Benchmark script for various models on Adreno GPU.
-"""
-import argparse
-
-import numpy as np
-
-import os
-import sys
-import tvm
-from tvm import te
-from tvm.relay import testing
-from tvm.contrib.utils import tempdir
-import tvm.contrib.graph_executor as runtime
-from tvm import relay
-from tvm import autotvm
-from tvm.contrib import utils, ndk
-
-
-def get_network(name, batch_size, dtype="float32"):
-    """Get the symbol definition and random weight of a network
-
-    Parameters
-    ----------
-    name: str
-        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
-    batch_size: int
-        batch size
-    dtype: str
-        Data type
-
-    Returns
-    -------
-    net: tvm.IRModule
-        The relay function of network definition
-    params: dict
-        The random parameters for benchmark
-    input_shape: tuple
-        The shape of input tensor
-    output_shape: tuple
-        The shape of output tensor
-    """
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == "mobilenet":
-        net, params = testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == "inception_v3":
-        input_shape = (batch_size, 3, 299, 299)
-        net, params = testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif "resnet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.resnet.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "vgg" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.vgg.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "densenet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.densenet.get_workload(
-            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "squeezenet" in name:
-        version = name.split("_v")[1]
-        net, params = testing.squeezenet.get_workload(
-            batch_size=batch_size, version=version, dtype=dtype
-        )
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return net, params, input_shape, output_shape
-
-
-def print_progress(msg):
-    """print progress message
-
-    Parameters
-    ----------
-    msg: str
-        The message to print
-    """
-    sys.stdout.write(msg + "\r")
-    sys.stdout.flush()
-
-
-def tune_tasks(
-    tasks,
-    measure_option,
-    tuner="xgb",
-    n_trial=1024,
-    early_stopping=None,
-    log_filename="tuning.log",
-):
-    from tvm.autotvm.tuner import XGBTuner
-
-    tmp_log_file = log_filename + ".tmp"
-
-    for i, tsk in enumerate(reversed(tasks)):
-        print("Task: ", tsk)
-        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
-
-        # create tuner
-        if tuner == "xgb":
-            tuner_obj = XGBTuner(tsk, loss_type="reg")
-        elif tuner == "xgb_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob")
-        elif tuner == "xgb_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar")
-        elif tuner == "xgb_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve")
-        elif tuner == "xgb_rank":
-            tuner_obj = XGBTuner(tsk, loss_type="rank")
-        elif tuner == "xgb_rank_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
-        elif tuner == "xgb_rank_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar")
-        elif tuner == "xgb_rank_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve")
-        elif tuner == "xgb_rank_binary":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary")
-        elif tuner == "xgb_rank_binary_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob")
-        elif tuner == "xgb_rank_binary_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar")
-        elif tuner == "xgb_rank_binary_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve")
-        elif tuner == "ga":
-            tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == "random":
-            tuner_obj = RandomTuner(tsk)
-        elif tuner == "gridsearch":
-            tuner_obj = GridSearchTuner(tsk)
-        else:
-            raise ValueError("Invalid tuner: " + tuner)
-
-        tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(
-            n_trial=tsk_trial,
-            early_stopping=early_stopping,
-            measure_option=measure_option,
-            callbacks=[
-                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                autotvm.callback.log_to_file(tmp_log_file),
-            ],
-        )
-
-        autotvm.record.pick_best(tmp_log_file, log_filename)
-
-
-def evaluate_network(network, target, target_host, dtype, repeat):
-    print_progress(network)
-    net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype)
-
-    # Auto Tuning
-    tune_log = "adreno-" + network + "-" + dtype + ".log"
-    tuning_options = {
-        "log_filename": tune_log,
-        "early_stopping": None,
-        "measure_option": autotvm.measure_option(
-            builder=autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=15),
-            runner=autotvm.RPCRunner(
-                args.rpc_key,
-                host=args.host,
-                port=args.port,
-                number=3,
-                timeout=600,
-            ),
-        ),
-    }
-    if args.tune:
-        tasks = autotvm.task.extract_from_program(
-            net, target=target, target_host=target_host, params=params
-        )
-        tune_tasks(tasks, **tuning_options)
-
-    print_progress("%-20s building..." % network)
-
-    # Build the tuning log
-    if os.path.exists(tune_log):
-        with autotvm.apply_history_best(tune_log):
-            with tvm.transform.PassContext(opt_level=3):
-                lib = relay.build(
-                    net, target=tvm.target.Target(target, host=target_host), params=params
-                )
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(
-                net, target=tvm.target.Target(target, host=target_host), params=params
-            )
-
-    tmp = tempdir()
-
-    filename = "%s.so" % network
-    lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared)
-
-    # upload library and params
-    print_progress("%-20s uploading..." % network)
-
-    # connect to remote device
-    tracker = tvm.rpc.connect_tracker(args.host, args.port)
-    remote = tracker.request(args.rpc_key)
-
-    dev = remote.device(str(target), 0)
-    remote.upload(tmp.relpath(filename))
-
-    rlib = remote.load_module(filename)
-    module = runtime.GraphModule(rlib["default"](dev))
-    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-    module.set_input("data", data_tvm)
-
-    # evaluate
-    print_progress("%-20s evaluating..." % network)
-    ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat)
-    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-    print(
-        "%-20s %-19s (%s)"
-        % (network + "-" + dtype, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))
-    )
-    return (np.mean(prof_res), np.std(prof_res))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--network",
-        type=str,
-        choices=[
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            "vgg-16",
-            "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ],
-        help="The name of neural network",
-    )
-    parser.add_argument("--host", type=str, default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=9190)
-    parser.add_argument("--rpc-key", type=str, default="android")
-    parser.add_argument("--repeat", type=int, default=30)
-    parser.add_argument("--tune", type=bool, default=False)
-    args = parser.parse_args()
-
-    if args.network is None:
-        networks = [
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            "vgg-16",
-            "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ]
-    else:
-        networks = [args.network]
-
-    target = "opencl -device=adreno"
-    target_host = "llvm -mtriple=arm64-linux-android"
-
-    print("--------------------------------------------------")
-    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
-    print("--------------------------------------------------")
-
-    results = {}
-
-    for network in networks:
-        ftime = evaluate_network(network, target, target_host, "float32", args.repeat)
-        results[network + "-float32"] = ftime
-        ftime = evaluate_network(network, target, target_host, "float16", args.repeat)
-        results[network + "-float16"] = ftime
-
-    print("----------------------------------------------------------------------")
-    print("%-30s %-30s" % ("Network Name", "Mean Inference Time        (std dev)"))
-    print("----------------------------------------------------------------------")
-    for key, val in results.items():
-        print("%-30s %-30s (%s)" % (key, "%.2f ms" % val[0], "%.2f ms" % val[1]))
diff --git a/apps/benchmark/adreno/bench.sh b/apps/benchmark/adreno/bench.sh
deleted file mode 100755
index 2c8ff288202f..000000000000
--- a/apps/benchmark/adreno/bench.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -euxo pipefail
-
-source tests/scripts/setup-pytest-env.sh
-export PYTHONPATH=${PYTHONPATH}:${TVM_PATH}/apps/extension/python
-export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
-
-export TVM_TRACKER_HOST=127.0.0.1
-export TVM_TRACKER_PORT=$(((RANDOM % 100) + 9100))
-export RPC_DEVICE_KEY="android"
-export TVM_NDK_CC="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang"
-
-env PYTHONPATH=python python3 -m tvm.exec.rpc_tracker --host "${TVM_TRACKER_HOST}" --port "${TVM_TRACKER_PORT}" &
-TRACKER_PID=$!
-sleep 5   # Wait for tracker to bind
-
-export ANDROID_SERIAL=$2
-
-adb shell "mkdir -p /data/local/tmp/tvm_ci"
-adb push build-adreno-target/tvm_rpc /data/local/tmp/tvm_ci/tvm_rpc_ci
-adb push build-adreno-target/libtvm_runtime.so /data/local/tmp/tvm_ci
-
-adb reverse tcp:${TVM_TRACKER_PORT} tcp:${TVM_TRACKER_PORT}
-adb forward tcp:5000 tcp:5000
-adb forward tcp:5001 tcp:5001
-adb forward tcp:5002 tcp:5002
-env adb shell "cd /data/local/tmp/tvm_ci; killall -9 tvm_rpc_ci; sleep 2; LD_LIBRARY_PATH=/data/local/tmp/tvm_ci/ ./tvm_rpc_ci server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" &
-DEVICE_PID=$!
-sleep 5 # Wait for the device connections
-trap "{ kill ${TRACKER_PID}; kill ${DEVICE_PID}; }" 0
-
-# cleanup pycache
-find . -type f -path "*.pyc" | xargs rm -f
-
-if [ "texture" == $1 ] ; then
-    python3 apps/benchmark/adreno/adreno_gpu_bench_texture.py --host ${TVM_TRACKER_HOST} --port ${TVM_TRACKER_PORT} --rpc-key ${RPC_DEVICE_KEY}
-fi
-
-if [ "clml" == $1 ] ; then
-    python3 apps/benchmark/adreno/adreno_gpu_bench_clml.py --host ${TVM_TRACKER_HOST} --port ${TVM_TRACKER_PORT} --rpc-key ${RPC_DEVICE_KEY}
-fi
-
-
-kill ${TRACKER_PID}
-kill ${DEVICE_PID}
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
deleted file mode 100644
index c618a89c8f1a..000000000000
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Benchmark script for ImageNet models on ARM CPU.
-see README.md for the usage and results of this script.
-"""
-import argparse
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.contrib.utils import tempdir
-import tvm.contrib.graph_executor as runtime
-from tvm import relay
-
-from util import get_network, print_progress
-
-
-def evaluate_network(network, target, target_host, repeat):
-    # connect to remote device
-    tracker = tvm.rpc.connect_tracker(args.host, args.port)
-    remote = tracker.request(args.rpc_key)
-
-    print_progress(network)
-    net, params, input_shape, output_shape = get_network(network, batch_size=1)
-
-    print_progress("%-20s building..." % network)
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(net, target=tvm.target.Target(target, host=target_host), params=params)
-
-    tmp = tempdir()
-    if "android" in str(target):
-        from tvm.contrib import ndk
-
-        filename = "%s.so" % network
-        lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared)
-    else:
-        filename = "%s.tar" % network
-        lib.export_library(tmp.relpath(filename))
-
-    # upload library and params
-    print_progress("%-20s uploading..." % network)
-    dev = remote.device(str(target), 0)
-    remote.upload(tmp.relpath(filename))
-
-    rlib = remote.load_module(filename)
-    module = runtime.GraphModule(rlib["default"](dev))
-    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-    module.set_input("data", data_tvm)
-
-    # evaluate
-    print_progress("%-20s evaluating..." % network)
-    ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat)
-    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-    print(
-        "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--network",
-        type=str,
-        choices=[
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            "vgg-16",
-            "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ],
-        help="The name of neural network",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        choices=["rk3399", "mate10", "mate10pro", "p20", "p20pro", "pixel2", "rasp3b", "pynq"],
-        default="rk3399",
-        help="The model of the test device. If your device is not listed in "
-        "the choices list, pick the most similar one as argument.",
-    )
-    parser.add_argument("--host", type=str, default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=9190)
-    parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--repeat", type=int, default=10)
-    args = parser.parse_args()
-
-    dtype = "float32"
-
-    if args.network is None:
-        networks = ["squeezenet_v1.1", "mobilenet", "resnet-18", "vgg-16"]
-    else:
-        networks = [args.network]
-
-    target = tvm.target.arm_cpu(model=args.model)
-    target_host = None
-
-    print("--------------------------------------------------")
-    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
-    print("--------------------------------------------------")
-    for network in networks:
-        evaluate_network(network, target, target_host, args.repeat)
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
deleted file mode 100644
index 6407f766cb76..000000000000
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Benchmark script for ImageNet models on GPU.
-see README.md for the usage and results of this script.
-"""
-import argparse
-import threading
-
-import numpy as np
-
-import tvm
-from tvm import te
-import tvm.contrib.graph_executor as runtime
-from tvm import relay
-
-from util import get_network
-
-
-def benchmark(network, target):
-    net, params, input_shape, output_shape = get_network(network, batch_size=1)
-
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(net, target=target, params=params)
-
-    # create runtime
-    dev = tvm.device(str(target), 0)
-    module = runtime.GraphModule(lib["default"](dev))
-    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-    module.set_input("data", data_tvm)
-
-    # evaluate
-    ftimer = module.module.time_evaluator("run", dev, number=1, repeat=args.repeat)
-    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-    print(
-        "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--network",
-        type=str,
-        choices=[
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            "vgg-16",
-            "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ],
-        help="The name of neural network",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        choices=["amd_apu"],
-        default="amd_apu",
-        help="The name of the test device. If your device is not listed in "
-        "the choices list, pick the most similar one as argument.",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        choices=["1080ti", "titanx", "tx2", "gfx900", "v1000"],
-        default="1080ti",
-        help="The model of the test device. If your device is not listed in "
-        "the choices list, pick the most similar one as argument.",
-    )
-    parser.add_argument("--repeat", type=int, default=600)
-    parser.add_argument(
-        "--target",
-        type=str,
-        choices=["cuda", "opencl", "rocm", "nvptx", "metal", "vulkan"],
-        default="cuda",
-        help="The tvm compilation target",
-    )
-    parser.add_argument("--thread", type=int, default=1, help="The number of threads to be run.")
-    args = parser.parse_args()
-
-    dtype = "float32"
-
-    if args.network is None:
-        networks = ["resnet-50", "mobilenet", "vgg-19", "inception_v3"]
-    else:
-        networks = [args.network]
-
-    target = tvm.target.Target("%s -device=%s -model=%s" % (args.target, args.device, args.model))
-
-    print("--------------------------------------------------")
-    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
-    print("--------------------------------------------------")
-    for network in networks:
-        if args.thread == 1:
-            benchmark(network, target)
-        else:
-            threads = list()
-            for n in range(args.thread):
-                thread = threading.Thread(
-                    target=benchmark, args=([network, target]), name="thread%d" % n
-                )
-                threads.append(thread)
-
-            for thread in threads:
-                thread.start()
-
-            for thread in threads:
-                thread.join()
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
deleted file mode 100644
index 83a6e0b10947..000000000000
--- a/apps/benchmark/mobile_gpu_imagenet_bench.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Benchmark script for ImageNet models on mobile GPU.
-see README.md for the usage and results of this script.
-"""
-import argparse
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.contrib.utils import tempdir
-import tvm.contrib.graph_executor as runtime
-from tvm import relay
-
-from util import get_network, print_progress
-
-
-def evaluate_network(network, target, target_host, dtype, repeat):
-    # connect to remote device
-    tracker = tvm.rpc.connect_tracker(args.host, args.port)
-    remote = tracker.request(args.rpc_key)
-
-    print_progress(network)
-    net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype)
-
-    print_progress("%-20s building..." % network)
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(net, target=tvm.target.Target(target, host=target_host), params=params)
-
-    tmp = tempdir()
-    if "android" in str(target) or "android" in str(target_host):
-        from tvm.contrib import ndk
-
-        filename = "%s.so" % network
-        lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared)
-    else:
-        filename = "%s.tar" % network
-        lib.export_library(tmp.relpath(filename))
-
-    # upload library and params
-    print_progress("%-20s uploading..." % network)
-    dev = remote.device(str(target), 0)
-    remote.upload(tmp.relpath(filename))
-
-    rlib = remote.load_module(filename)
-    module = runtime.GraphModule(rlib["default"](dev))
-    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-    module.set_input("data", data_tvm)
-
-    # evaluate
-    print_progress("%-20s evaluating..." % network)
-    ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat)
-    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-    print(
-        "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--network",
-        type=str,
-        choices=[
-            "resnet-18",
-            "resnet-34",
-            "resnet-50",
-            "vgg-16",
-            "vgg-19",
-            "densenet-121",
-            "inception_v3",
-            "mobilenet",
-            "squeezenet_v1.0",
-            "squeezenet_v1.1",
-        ],
-        help="The name of neural network",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        choices=["rk3399"],
-        default="rk3399",
-        help="The model of the test device. If your device is not listed in "
-        "the choices list, pick the most similar one as argument.",
-    )
-    parser.add_argument("--host", type=str, default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=9190)
-    parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--repeat", type=int, default=30)
-    parser.add_argument("--dtype", type=str, default="float32")
-    args = parser.parse_args()
-
-    if args.network is None:
-        networks = ["squeezenet_v1.1", "mobilenet", "resnet-18", "vgg-16"]
-    else:
-        networks = [args.network]
-
-    target = tvm.target.mali(model=args.model)
-    target_host = tvm.target.arm_cpu(model=args.model)
-
-    print("--------------------------------------------------")
-    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
-    print("--------------------------------------------------")
-
-    for network in networks:
-        evaluate_network(network, target, target_host, args.dtype, args.repeat)
diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
deleted file mode 100644
index 4e9bfa8d9e42..000000000000
--- a/apps/benchmark/util.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility for benchmark"""
-
-import sys
-from tvm import relay
-from tvm.relay import testing
-
-
-def get_network(name, batch_size, dtype="float32"):
-    """Get the symbol definition and random weight of a network
-
-    Parameters
-    ----------
-    name: str
-        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
-    batch_size: int
-        batch size
-    dtype: str
-        Data type
-
-    Returns
-    -------
-    net: tvm.IRModule
-        The relay function of network definition
-    params: dict
-        The random parameters for benchmark
-    input_shape: tuple
-        The shape of input tensor
-    output_shape: tuple
-        The shape of output tensor
-    """
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == "mobilenet":
-        net, params = testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == "inception_v3":
-        input_shape = (batch_size, 3, 299, 299)
-        net, params = testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif "resnet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.resnet.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "vgg" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.vgg.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "densenet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.densenet.get_workload(
-            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "squeezenet" in name:
-        version = name.split("_v")[1]
-        net, params = testing.squeezenet.get_workload(
-            batch_size=batch_size, version=version, dtype=dtype
-        )
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return net, params, input_shape, output_shape
-
-
-def print_progress(msg):
-    """print progress message
-
-    Parameters
-    ----------
-    msg: str
-        The message to print
-    """
-    sys.stdout.write(msg + "\r")
-    sys.stdout.flush()
diff --git a/apps/cpp_clml/CMakeLists.txt b/apps/cpp_clml/CMakeLists.txt
deleted file mode 100644
index 8c0fd53bf9f4..000000000000
--- a/apps/cpp_clml/CMakeLists.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-cmake_minimum_required(VERSION 3.13)
-
-project(clml_run VERSION 2.0)
-
-if(NOT DEFINED CMAKE_TOOLCHAIN_FILE)
-  message( FATAL_ERROR "CMAKE_TOOLCHAIN_FILE Not set, forcing exit. Suggested value: {ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake." )
-endif(NOT DEFINED CMAKE_TOOLCHAIN_FILE)
-
-if(NOT DEFINED ANDROID_ABI)
-  message( FATAL_ERROR "ANDROID_ABI Not set, forcing exit. Suggested value(s): arm64-v8a (64), armeabi-v7a (32)" )
-endif(NOT DEFINED ANDROID_ABI)
-
-if(NOT DEFINED CLML_SDK)
-  message( FATAL_ERROR "CLML_SDK Not set, forcing exit." )
-endif(NOT DEFINED CLML_SDK)
-
-if (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY STREQUAL "ONLY")
-  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
-endif()
-
-find_library(CLML_LIBRARIES NAMES libOpenCL.so NO_DEFAULT_PATH PATHS ${CLML_SDK}/lib ${CLML_SDK}/lib64)
-
-# CMake/Android variables
-set( ANDROID_STL  c++_static CACHE STRING "Target Android STL") # default
-
-# Source variables
-set( OPENCL_INCLUDE_DIRS  ${CLML_SDK} CACHE PATH "filepath to OpenCL headers")
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED True)
-
-#we do not want to pass -fno-exceptions
-if(${CMAKE_CXX_FLAGS} MATCHES "-fno-exceptions")
-  message ( WARNING "Disabling -fno-exceptions")
-  string(REGEX REPLACE "-fno-exceptions" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-endif()
-
-#we do not want to pass -fno-rtti
-if(${CMAKE_CXX_FLAGS} MATCHES "-fno-rtti")
-  message ( WARNING "Disabling -fno-rtti")
-  string(REGEX REPLACE "-fno-rtti" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-endif()
-
-set(COMMON_SOURCE_FILES
-        clml_models.cc
-        clml_runner.cc
-        clml_runner.h
-        main.cc
-        ../../3rdparty/cnpy/cnpy.cpp
-        )
-
-include_directories(
-        src
-        ${OPENCL_INCLUDE_DIRS}
-        "../../3rdparty/dmlc-core/include"
-        "../../3rdparty/cnpy/"
-        )
-
-add_executable(clml_run ${COMMON_SOURCE_FILES})
-target_link_options(clml_run PRIVATE -Wl,--unresolved-symbols=ignore-in-shared-libs)
-target_link_libraries(clml_run ${CLML_LIBRARIES} z)
diff --git a/apps/cpp_clml/README.md b/apps/cpp_clml/README.md
deleted file mode 100644
index 3200492122ab..000000000000
--- a/apps/cpp_clml/README.md
+++ /dev/null
@@ -1,145 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# OpenCLML Debug Tool
-
-Tool to generate OpenCLML source file given a model from any framework and compile it as a native application that runs on Android target.
-This tool helps to debug or triage OpenCLML offloaded sub graphs as a standalone application.
-
-### Codegen
-
-Models can be downloaded from well known frameworks like Tensorflow, PyTorch, TFLite, Onnx ..etc.
-Assuming  ```resnet50.h5``` is a Keras ResNet50 model file, use the below command to generate a OpenCLML source for the model.
-
-```bash
-python3 scripts/clml_codegen.py resnet50.h5
-```
-
-Above command generates ```clml_models.cc``` and ```clml_params.npz```.
-```clml_models.cc``` contains cpp representation of all OpenCLML subgraphs offloaded by TVM compilation. This file will be used to build tool ```clml_run```.
-```clml_params.npz``` is a numpy dump of all params involved in all sub graphs of TVM module. This file to be copied to target.
-
-### Build Tool
-
-Copy the generated models source ```clml_models.cc``` under ```cpp_clml```.
-
-Below commands will compile the tool ```clml_run``` from generated source and other static dependents.
-
-```bash
-cmake -S . -B build_64 -D ANDROID_ABI=arm64-v8a -D CLML_SDK=<CLML SDK PATH> -D CMAKE_TOOLCHAIN_FILE=<ANDROID NDK PATH>/build/cmake/android.toolchain.cmake -D ANDROID_PLATFORM=latest
-cmake --build build_64
-```
-
-### Run the tool
-
-Copy ```clml_params.npz``` and ```clml_run``` to the target Android device
-
-```bash
-Android:/data/local/tmp $ ./clml_run --dump-meta
-Input         =
-Output        =
-Params        =
-DumpMeta      = 1
-.....
-Subgraph Name: tvmgen_default_clml_main_1
-    Input Count  : 1
-    Output Count : 1
-    Input MetaInfo
-        Input: tvmgen_default_clml_main_1_input_0
-            Dtype : float32
-            Shape : [1, 1, 1, 2048]
-    Output MetaInfo
-        Output: tvmgen_default_clml_main_1_layer_out_5
-            Dtype : float32
-            Shape : [1, 1000]
-
-Subgraph Name: tvmgen_default_clml_main_0
-    Input Count  : 1
-    Output Count : 1
-    Input MetaInfo
-        Input: tvmgen_default_clml_main_0_input_0
-            Dtype : float32
-            Shape : [1, 3, 230, 230]
-    Output MetaInfo
-        Output: tvmgen_default_clml_main_0_layer_out_406
-            Dtype : float32
-            Shape : [1, 2048, 1, 1]
-.....
-```
-
-The meta information above indicates that the ResNet50 model is partitioned such a way that there exists two OpenCLML subgraphs.
-
-Below command runs the models by setting the parameters from ```clml_params.npz```.
-
-```bash
-Android:/data/local/tmp $ ./clml_run --params=./clml_params.npz
-Input         =
-Output        =
-Params        = ./clml_params.npz
-DumpMeta      = 1
-......
-CLMLRunner Loading Params:./clml_params.npz
-CLMLRunner Loading Params:./clml_params.npz
-CLMLRunner::Run :tvmgen_default_clml_main_1
-CLMLRunner::Run :tvmgen_default_clml_main_0
-......
-```
-
-Below command can set the model inputs from ```input.npz```  and can output sub graph outputs to ```output.npz```.
-```input.npz``` should have numpy arrays for ```tvmgen_default_clml_main_1_input_0``` from sub graph ```tvmgen_default_clml_main_1``` and ```tvmgen_default_clml_main_0_input_0``` from sub graph ```tvmgen_default_clml_main_0```.
-
-```bash
-Android:/data/local/tmp $ ./clml_run --params=./clml_params.npz --input=./input.npz --output=./output.npz                                                                       <
-Input         = ./input.npz
-Output        = ./output.npz
-Params        = ./clml_params.npz
-DumpMeta      = 0
-Call Build Modules
-CLMLRunner Constructor: Input:./input.npz Output:./output.npz Params:./clml_params.npz
-CLML Target version:3
-CLMLRunner Loading Params:./clml_params.npz
-CLMLRunner Loading Inputs:./input.npz
-Set Input For:tvmgen_default_clml_main_1_input_0
-
-CLMLRunner Constructor: Input:./input.npz Output:./output.npz Params:./clml_params.npz
-CLML Target version:3
-CLMLRunner Loading Params:./clml_params.npz
-CLMLRunner Loading Inputs:./input.npz
-Set Input For:tvmgen_default_clml_main_0_input_0
-
-Loop Through the Modules
-CLMLRunner::Run :tvmgen_default_clml_main_1
-Saving Output:tvmgen_default_clml_main_1_layer_out_5
-CLMLRunner::Run :tvmgen_default_clml_main_0
-Saving Output:tvmgen_default_clml_main_0_layer_out_406
-......
-```
-
-The generated output file ```output.npz``` contains all the output from all sub modules.
-In this case it contains ```tvmgen_default_clml_main_1_layer_out_5``` for sub graph ```tvmgen_default_clml_main_1``` and ```tvmgen_default_clml_main_0_layer_out_406``` for sub graph ```tvmgen_default_clml_main_0``` as shown below.
-
-
-```bash
-Android:/data/local/tmp $ unzip -l output.npz
-Archive:  output.npz
-  Length      Date    Time    Name
----------  ---------- -----   ----
-     4080  1980-00-00 00:00   tvmgen_default_clml_main_1_layer_out_5.npy
-     8272  1980-00-00 00:00   tvmgen_default_clml_main_0_layer_out_406.npy
----------                     -------
-    12352                     2 files
-```
diff --git a/apps/cpp_clml/clml_runner.cc b/apps/cpp_clml/clml_runner.cc
deleted file mode 100644
index 0a5508635e0a..000000000000
--- a/apps/cpp_clml/clml_runner.cc
+++ /dev/null
@@ -1,822 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file clml_runner.cc
- * \brief CLML model runner implementation.
- */
-
-#include "clml_runner.h"
-
-#include <fstream>
-#include <iostream>
-#include <streambuf>
-#include <string>
-
-namespace tvm {
-namespace runtime {
-
-/*!
- * \brief Constructor for CLMLRunner.
- * \param name is unique name for the sub graph or this CLML Runner.
- * \param args tool or utility arguments.
- * \param arg_platform_id is the OpenCL platform.
- * \param arg_context is the OpenCL context.
- * \param arg_device_id is the OpenCL device_id.
- * \param arg_queue is the OpenCL queue.
- */
-CLMLRunner::CLMLRunner(std::string name, ToolArgs& args, cl_platform_id arg_platform_id,
-                       cl_context arg_context, cl_device_id arg_device_id,
-                       cl_command_queue arg_queue)
-    : r_args(args),
-      r_name(name),
-      platform(arg_platform_id),
-      context(arg_context),
-      device_id(arg_device_id),
-      queue(arg_queue) {
-  LOG(INFO) << "CLMLRunner Constructor:" << name << " Input:" << r_args.input
-            << " Output:" << r_args.output << " Params:" << r_args.params;
-  cl_int result;
-
-  // Query and Get CLML Interface
-  static const cl_uint MAX_VERSIONS = 256;
-  cl_int majorVersions[MAX_VERSIONS];
-  cl_int minorVersions[MAX_VERSIONS];
-  cl_uint numVersions = 0;
-  result = clQueryMLInterfaceVersionsQCOM(nullptr, nullptr, 0, &numVersions);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-  CLML_SDK_TEST_AND_EXIT(numVersions > 0u);
-  CLML_SDK_TEST_AND_EXIT(numVersions <= MAX_VERSIONS);
-
-  result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  for (cl_uint i = 0; i < numVersions; ++i) {
-    if (majorVersions[i] == CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
-      this->h_ClmlIntf = GET_ML_INTERFACE(0);
-      LOG(INFO) << "CLML Target version:" << majorVersions[i];
-      break;
-    }
-  }
-  CLML_SDK_TEST_AND_EXIT(this->h_ClmlIntf != nullptr);
-
-  result = h_ClmlIntf->clCreateMLTuningCacheQCOM(&tuning_cache);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  if (!r_args.params.empty()) {
-    LOG(INFO) << "CLMLRunner Loading Params:" << r_args.params;
-    npz_params = cnpy::npz_load(r_args.params);
-  } else {
-    LOG(INFO) << "CLMLRunner : No parameters supplied";
-  }
-
-  if (!r_args.input.empty()) {
-    LOG(INFO) << "CLMLRunner Loading Inputs:" << r_args.input;
-    npz_input = cnpy::npz_load(r_args.input);
-  } else {
-    LOG(INFO) << "CLMLRunner : No Input's given. Asuming a dry-run.";
-  }
-}
-
-/*!
- * \brief Call one cycle of execution for the model.
- * \return 0 on success else error code.
- */
-int CLMLRunner::Run(void) {
-  LOG(INFO) << "CLMLRunner::Run :" << GetModName();
-  cl_int result;
-
-  for (size_t i = 0; i < this->function.size(); ++i) {
-    result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->function[i], this->descriptorSet, 0,
-                                           nullptr, nullptr);
-    CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-  }
-  if (!r_args.output.empty()) {
-    for (auto it = this->outputs.begin(); it != this->outputs.end(); it++) {
-      auto out_name = it->first;
-      auto out_desc = it->second;
-      auto dtype = outputs_dtypes[out_name];
-      auto shape = outputs_shapes[out_name];
-      size_t size = 1;
-      for (auto si : shape) size *= si;
-      if (dtype == "float32") {
-        void* data = (void*)malloc(size * 4);
-        CopyDataFromCLMLTensor(out_desc, data);
-        LOG(INFO) << "Saving Output:" << out_name;
-        cnpy::npz_save<float>(r_args.output, out_name, (float*)data, shape, "a");
-        free(data);
-      } else if (dtype == "int8") {
-        void* data = (void*)malloc(size);
-        CopyDataFromCLMLTensor(out_desc, data);
-        LOG(INFO) << "Saving Output:" << out_name;
-        cnpy::npz_save<int8_t>(r_args.output, out_name, (int8_t*)data, shape, "a");
-        free(data);
-      } else {
-        LOG(WARNING) << "Unsupported dtype to dump :" << dtype;
-      }
-    }
-  }
-  return 0;
-}
-
-/*!
- * \brief Set meta information.
- * \param minfo is the meta information of the sub graph.
- */
-void CLMLRunner::SetMetaInfo(std::string minfo) { this->meta_info = minfo; }
-
-/*!
- * \brief Print the meta information.
- */
-void CLMLRunner::PrintMetaInfo(void) { LOG(INFO) << "\n" << this->meta_info; }
-
-/*!
- * \brief Copy the bytedata into tensor.
- * \param tensor is tensor descriptor to copy data.
- * \param data is pointer to bytedata.
- * \param layout is source data layout
- */
-void CLMLRunner::CopyDataToCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor,
-                                      void* data, cl_ml_tensor_layout_qcom layout) {
-  cl_int result = 0;
-  cl_event evt = nullptr;
-  result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(this->queue, data, layout, tensor->tensor,
-                                                      tensor->memory,
-                                                      0,        // n waitlist
-                                                      nullptr,  // waitlist
-                                                      &evt);    // event
-  CLML_SDK_TEST_AND_EXIT((evt != nullptr) && result == CL_SUCCESS);
-}
-
-/*!
- * \brief Copy the bytedata into tensor.
- * \param tensor is tensor descriptor to copy data.
- * \param data is pointer to bytedata.
- * \param layout is source data layout
- */
-void CLMLRunner::CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor,
-                                        void* data, cl_ml_tensor_layout_qcom layout) {
-  cl_int result = 0;
-  cl_event readEvent = nullptr;
-  // Read the output tensor
-  result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(this->queue, tensor->tensor, tensor->memory,
-                                                     data, layout,
-                                                     0,            // n waitlist
-                                                     nullptr,      // waitlist
-                                                     &readEvent);  // event
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-  result = clWaitForEvents(1, &readEvent);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-}
-
-/*!
- * \brief Allocate backing memory for tensor descriptor.
- * \param pTensorMemDesc is tensor descriptor.
- * \return memory alocation status (CL_SUCCESS or error code).
- */
-cl_int CLMLRunner::AllocateTensorMemory(
-    std::shared_ptr<cl_ml_tensor_memory_desc_qcom> pTensorMemDesc) {
-  uint32_t size = 0;
-  cl_int result = CL_OUT_OF_HOST_MEMORY;
-  cl_mem buffer = nullptr;
-
-  result = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, pTensorMemDesc->tensor, &size);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, nullptr, &result);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  pTensorMemDesc->memory = buffer;
-
-  return result;
-}
-
-/*!
- * \brief Allocate memory for all tensor dectiptor in storage map.
- * Also set data for tensors given params and input numpy dumps
- */
-void CLMLRunner::AllocateMemAndPopulateParams(void) {
-  cl_int result;
-  for (auto it = this->storage_map.begin(); it != this->storage_map.end(); it++) {
-    auto node_id = it->first;
-    auto tensor_desc = it->second;
-
-    AllocateTensorMemory(tensor_desc);
-
-    if (npz_params.find(node_id) != npz_params.end()) {
-      CopyDataToCLMLTensor(tensor_desc, npz_params[node_id].data<char>());
-    }
-
-    if (npz_input.find(node_id) != npz_input.end()) {
-      LOG(INFO) << "Set Input For:" << node_id;
-      CopyDataToCLMLTensor(tensor_desc, npz_input[node_id].data<char>());
-    }
-
-    this->tensorMemDescs.push_back(*tensor_desc);
-  }
-  if (!r_args.dump_meta) {
-    // Cross check all params
-    for (auto nid : consts) {
-      if (npz_params.find(nid) == npz_params.end()) {
-        LOG(WARNING) << "Param not found in npz:" << nid;
-      }
-    }
-  }
-  // Initialize Tensor Descriptors
-  result = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&this->descriptorSet);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  result = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(
-      this->descriptorSet, static_cast<uint32_t>(this->tensorMemDescs.size()),
-      this->tensorMemDescs.data());
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-}
-
-/*!
- * \brief Initializes an unused tensor.
- * It is used across operators.
- */
-void CLMLRunner::MakeUnusedTensor(void) {
-  cl_int result;
-  cl_ml_tensor_desc_qcom desc = {};
-  desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-  this->unusedTensor = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-  result = this->h_ClmlIntf->clCreateMLTensorQCOM(this->context, nullptr, &desc,
-                                                  &(this->unusedTensor->tensor));
-  CLML_SDK_TEST_AND_EXIT(this->unusedTensor && result == CL_SUCCESS);
-}
-
-/*!
- * \brief Convert string datatype to cl channel type.
- * \param dtype the datatype as string.
- * \return cl channel type corresponding to the datatype.
- */
-cl_channel_type MakeCLDataType(const std::string& dtype) {
-  if (dtype == "float32") {
-    return CL_FLOAT;
-  } else if (dtype == "float16") {
-    return CL_HALF_FLOAT;
-  } else {
-    LOG(FATAL) << "Datatype: " << dtype << " unsupported by CLML runtime";
-  }
-  return CL_FLOAT;
-}
-
-/*!
- * \brief Map operator arthemetic mode based on data type and accumulation type.
- * \param data_type is cl channel type for computation.
- * \param acc_tpe is cl channel type for accumulation.
- * \return the arthemetic mode.
- */
-cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
-                                        const cl_channel_type& acc_type = CL_FLOAT) {
-  if (data_type == CL_FLOAT && acc_type == CL_FLOAT) {
-    return CL_ARITHMETIC_MODE_FP32_QCOM;
-  } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) {
-    return CL_ARITHMETIC_MODE_FP16_ACC32_QCOM;
-  } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) {
-    return CL_ARITHMETIC_MODE_FP16_QCOM;
-  } else {
-    LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
-  }
-}
-
-/*!
- * \brief Creates a tensor descriptor.
- * \param shape is shape of tensor.
- * \param dtype tensor data type as string.
- * \param layout is the data layout to be used.
- * \return newly created tensor descriptor.
- */
-std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CLMLRunner::MakeCLMLTensor(
-    std::vector<size_t> shape, std::string dtype, cl_ml_tensor_layout_qcom layout) {
-  cl_int result;
-  tensor_dims_t dims;
-  // Make sure the tensors with dimensions less than 4 are padded with 1.
-  shape.push_back(1);
-  shape.push_back(1);
-  shape.push_back(1);
-
-  dims.n = shape[0];
-  dims.c = shape[1];
-  dims.h = shape[2];
-  dims.w = shape[3];
-  cl_channel_type cl_dtype = MakeCLDataType(dtype);
-  auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-  cl_ml_tensor_desc_qcom desc = {
-      cl_dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {0}};
-  result =
-      this->h_ClmlIntf->clCreateMLTensorQCOM(this->context, nullptr, &desc, &tensor_dsc->tensor);
-  CLML_SDK_TEST_AND_EXIT(tensor_dsc->tensor && result == CL_SUCCESS);
-  return tensor_dsc;
-}
-
-/*!
- * \brief Convolution2D implementation.
- * \param input_desc is input tensor descriptor.
- * \param weight_desc is the kernel as tensor descriptor.
- * \param bias_desc is bias as tensor descriptor.
- * \param output_desc is the placeholder for convolution output.
- * \param padding padding to be applied on input tensor.
- * \param dilation is convolution dilation parameter.
- * \param strides is convolution strides parameter.
- * \param groups number of groups.
- * \param mode is it normal convolution of depthwise convolution.
- * \param activation activation to be applied on result.
- * \param has_bias is bias tensor valid.
- * \param has_activation is activation to be applied.
- * \param dtype operator data type.
- */
-void CLMLRunner::MakeConv2D(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                            std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
-                            std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias_desc,
-                            std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                            std::vector<cl_uint> padding, std::vector<cl_uint> dilation,
-                            std::vector<cl_uint> strides, int groups, cl_convolution_mode_qcom mode,
-                            cl_activation_function_qcom activation, bool has_bias, bool has_act,
-                            std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_int result;
-  if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
-    CLML_SDK_TEST_AND_EXIT(groups == 1);  // CLML convolution only supports group size of 1
-  } else {
-    groups = 1;  // Don't need to pass groups to depthwise
-  }
-  cl_ml_op_activation_desc_qcom act_desc = {activation, CL_PROPAGATE_NAN_QCOM, cl_arithmetic_mode};
-  cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {padding[0], padding[1]};
-  cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {padding[2], padding[3]};
-  cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {strides[0], strides[1]};
-  cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {dilation[0], dilation[1]};
-
-  cl_ml_op_convolution_desc_qcom conv_desc{mode,
-                                           static_cast<cl_uint>(groups),
-                                           4,
-                                           {clml_padding_b[0], clml_padding_b[1]},
-                                           {clml_padding_a[0], clml_padding_a[1]},
-                                           {clml_strides[0], clml_strides[1]},
-                                           {clml_dilation[0], clml_dilation[1]},
-                                           0,
-                                           cl_arithmetic_mode};
-  cl_ml_op_qcom op = nullptr;
-  if (!has_act) {
-    result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM(
-        this->context, 0, &conv_desc, input_desc->tensor, weight_desc->tensor, bias_desc->tensor,
-        output_desc->tensor, &op, tuning_cache);
-    CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  } else {
-    result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM(
-        this->context, 0, &conv_desc, &act_desc, input_desc->tensor, weight_desc->tensor,
-        bias_desc->tensor, nullptr, output_desc->tensor, &op, tuning_cache);
-    CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  }
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Fused Convolution2D+BatchNorm implementation.
- * \param input_desc is input tensor descriptor.
- * \param weight_desc is the kernel as tensor descriptor.
- * \param bias_desc is bias as tensor descriptor.
- * \param output_desc is the placeholder for convolution output.
- * \param bn_scale fused batchnorm scale tensor descriptor.
- * \param bn_bias fused batchnorm scale tensor descriptor.
- * \param bn_mean fused batchnorm mean tensor descriptor.
- * \param bn_var fused batchnorm variance tensor descriptor.
- * \param bn_attrs batchnorm other attributes.
- * \param padding padding to be applied on input tensor.
- * \param dilation is convolution dilation parameter.
- * \param strides is convolution strides parameter.
- * \param groups number of groups.
- * \param mode is it normal convolution of depthwise convolution.
- * \param activation activation to be applied on result.
- * \param has_bias is bias tensor valid.
- * \param has_activation is activation to be applied.
- * \param dtype operator data type.
- */
-void CLMLRunner::MakeConv2DWithBN(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias_desc,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_scale,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_bias,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_mean,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_var,
-                                  std::vector<float> bn_attrs, std::vector<cl_uint> padding,
-                                  std::vector<cl_uint> dilation, std::vector<cl_uint> strides,
-                                  int groups, cl_convolution_mode_qcom mode,
-                                  cl_activation_function_qcom activation, bool has_bias,
-                                  bool has_act, std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_int result;
-  if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
-    CLML_SDK_TEST_AND_EXIT(groups == 1);  // CLML convolution only supports group size of 1
-  } else {
-    groups = 1;  // Don't need to pass groups to depthwise
-  }
-  cl_ml_op_activation_desc_qcom act_desc = {activation, CL_PROPAGATE_NAN_QCOM, cl_arithmetic_mode};
-  cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {padding[0], padding[1]};
-  cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {padding[2], padding[3]};
-  cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {strides[0], strides[1]};
-  cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {dilation[0], dilation[1]};
-
-  cl_ml_op_convolution_desc_qcom conv_desc{mode,
-                                           static_cast<cl_uint>(groups),
-                                           4,
-                                           {clml_padding_b[0], clml_padding_b[1]},
-                                           {clml_padding_a[0], clml_padding_a[1]},
-                                           {clml_strides[0], clml_strides[1]},
-                                           {clml_dilation[0], clml_dilation[1]},
-                                           0,
-                                           cl_arithmetic_mode};
-  cl_ml_op_qcom op = nullptr;
-  cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
-  if (!has_act) {
-    result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
-        this->context, 0, &conv_desc, &bn_desc, input_desc->tensor, weight_desc->tensor,
-        bias_desc->tensor, output_desc->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor,
-        bn_bias->tensor, &op, tuning_cache);
-    CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  } else {
-    result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM(
-        this->context, 0, &conv_desc, &bn_desc, &act_desc, input_desc->tensor, weight_desc->tensor,
-        bias_desc->tensor, output_desc->tensor, nullptr, bn_mean->tensor, bn_var->tensor,
-        bn_scale->tensor, bn_bias->tensor, &op, tuning_cache);
-    CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  }
-  this->function.push_back(op);
-}
-
-/*!
- * \brief All types of ReLU(6) implementation.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param relu_type the pf ReLU activation.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeRelu(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                          std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                          cl_activation_function_qcom relu_type, std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-  cl_ml_op_activation_desc_qcom act_desc = {relu_type, CL_PROPAGATE_NAN_QCOM, cl_arithmetic_mode};
-
-  result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(
-      this->context, 0, &act_desc, input_desc->tensor, this->unusedTensor->tensor,
-      output_desc->tensor, &op, tuning_cache);
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Batch Normalization operator implementation.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param bn_scale fused batchnorm scale tensor descriptor.
- * \param bn_bias fused batchnorm scale tensor descriptor.
- * \param bn_mean fused batchnorm mean tensor descriptor.
- * \param bn_var fused batchnorm variance tensor descriptor.
- * \param bn_attrs batchnorm other attributes.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeBatchNorm(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                               std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                               std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_scale,
-                               std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_bias,
-                               std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_mean,
-                               std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_var,
-                               std::vector<float> bn_attrs, std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
-
-  result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
-      this->context, 0, &bn_desc, input_desc->tensor, bn_mean->tensor, bn_var->tensor,
-      bn_scale->tensor, bn_bias->tensor, output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief All types of Pool2D operator implementation.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param pool_size pooling window size.
- * \param strides stride for pooling.
- * \param padding is the input padding.
- * \param pool_type is type of poling (max, avg ...etc).
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakePool2D(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                            std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                            std::vector<cl_uint> pool_size, std::vector<cl_uint> strides,
-                            std::vector<cl_uint> padding, std::string pool_type,
-                            std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_ml_op_pooling_desc_qcom pool_desc = {
-      pool_type == "nn.max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
-                                   : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
-      4,  // reserved
-      {padding[0], padding[1]},
-      {padding[2], padding[3]},
-      {strides[0], strides[1]},
-      {pool_size[0], pool_size[1]},
-      CL_PROPAGATE_NAN_QCOM,
-      cl_arithmetic_mode,
-  };
-
-  result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(
-      this->context, 0, &pool_desc, input_desc->tensor, this->unusedTensor->tensor,
-      output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief All types of Global Pooling 2D operator implementation.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param in_shape is the input tensor shape.
- * \param pool_type is the pool type (max or avg).
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeGlobalPool2D(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                                  std::vector<cl_uint> in_shape, std::string pool_type,
-                                  std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-  cl_ml_op_pooling_desc_qcom pool_desc = {
-      pool_type == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
-                                          : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
-      4,  // reserved
-      {0, 0},
-      {0, 0},
-      {1, 1},
-      {in_shape[2], in_shape[3]},
-      CL_PROPAGATE_NAN_QCOM,
-      cl_arithmetic_mode,
-  };
-
-  result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(
-      this->context, 0, &pool_desc, input_desc->tensor, this->unusedTensor->tensor,
-      output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Reshape Operator.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeReshape(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                             std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                             std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  result = h_ClmlIntf->clCreateMLOpReshapeQCOM(this->context, 0, input_desc->tensor,
-                                               output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Concatenate operator implementation.
- * \param in_list list of input tensor descriptors to concatenate.
- * \param output_desc output tensor descriptor.
- * \param axis is the dimention on which we join the tensors.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeConcatenate(
-    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> in_list,
-    std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, int axis, std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[in_list.size()];
-  for (int i = 0; i < in_list.size(); i++) {
-    concatInputs[i] = in_list[i]->tensor;
-  }
-  cl_ml_op_concat_desc_qcom concatDesc = {1, (cl_uint)in_list.size(), cl_arithmetic_mode};
-  result = h_ClmlIntf->clCreateMLOpConcatQCOM(this->context, 0, &concatDesc, concatInputs,
-                                              output_desc->tensor, &op, tuning_cache);
-  delete[] concatInputs;
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Dense operator implementation.
- * \param input_desc input tensor descriptor.
- * \param weight_desc weight tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param bias_desc bias tensor descriptor.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeDense(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                           std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
-                           std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                           std::vector<cl_uint> in_shape, std::vector<cl_uint> wt_shape,
-                           std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-  cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
-
-  if (in_shape[1] == wt_shape[1]) {
-    b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
-  }
-
-  cl_ml_op_gemm_desc_qcom gemmDesc = {in_shape[0],                  // m
-                                      wt_shape[0],                  // n
-                                      wt_shape[1],                  // k
-                                      CL_GEMM_TRANSFORM_NONE_QCOM,  // A transform
-                                      b_transform,                  // B transform
-                                      {{1.0}, CL_FLOAT},            // alpha
-                                      {{0.0}, CL_FLOAT},            // beta
-                                      cl_arithmetic_mode};
-
-  result =
-      h_ClmlIntf->clCreateMLOpGemmQCOM(this->context, 0, &gemmDesc, input_desc->tensor,
-                                       weight_desc->tensor, output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief SoftMax operator implementation.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeSoftMax(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                             std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                             std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
-                                             CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode};
-
-  result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(this->context, 0, &softmax_desc, input_desc->tensor,
-                                               output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief .
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param pad_mode type of padding to be applied (constant, edge, reflect ...etc).
- * \param padding amount of padding.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakePad(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                         std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                         std::string pad_mode, std::vector<cl_uint> padding, std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_pad_mode_qcom clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
-  if (pad_mode == "constant")
-    clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
-  else if (pad_mode == "edge")
-    clml_pad_mode = CL_PAD_MODE_SYMMETRIC_QCOM;
-  else if (pad_mode == "reflect")
-    clml_pad_mode = CL_PAD_MODE_REFLECT_QCOM;
-  else
-    LOG(FATAL) << "Padding mode not supported by CLML:" << pad_mode;
-
-  cl_ml_op_pad_desc_qcom pad_desc{clml_pad_mode,
-                                  {0, 0},
-                                  {padding[0], padding[1], padding[2], padding[3], 0, 0, 0, 0},
-                                  cl_arithmetic_mode};
-
-  result = h_ClmlIntf->clCreateMLOpPadQCOM(this->context, 0, &pad_desc, input_desc->tensor,
-                                           output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Batch Flatten operator implementation.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeBatchFlatten(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                                  std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  result = h_ClmlIntf->clCreateMLOpReshapeQCOM(this->context, 0, input_desc->tensor,
-                                               output_desc->tensor, &op, tuning_cache);
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief Clip operator.
- * \param input_desc input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param a_max is the upper bound to clip.
- * \param a_min is the lower bound to clip.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeClip(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                          std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, float a_max,
-                          float a_min, std::string dtype) {
-  LOG(INFO) << "MakeClip called";
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_ml_op_clip_desc_qcom clip_desc = {
-      CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};
-
-  result = h_ClmlIntf->clCreateMLOpClipQCOM(this->context, 0, &clip_desc, input_desc->tensor,
-                                            output_desc->tensor, &op, tuning_cache);
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-/*!
- * \brief All types of Binary operators.
- * \param input_a first input tensor descriptor.
- * \param input_b second input tensor descriptor.
- * \param output_desc output tensor descriptor.
- * \param op_name is the binary operator.
- * \param dtype operator datatype.
- */
-void CLMLRunner::MakeBinaryOp(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_a,
-                              std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_b,
-                              std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                              std::string op_name, std::string dtype) {
-  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
-  cl_ml_op_qcom op = nullptr;
-  cl_int result;
-
-  cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
-  if (op_name == "subtract")
-    binary_op = CL_TENSOR_OP_SUB_QCOM;
-  else if (op_name == "multiply")
-    binary_op = CL_TENSOR_OP_MUL_QCOM;
-  else if (op_name == "divide")
-    binary_op = CL_TENSOR_OP_DIV_QCOM;
-  else if (op_name == "minimum")
-    binary_op = CL_TENSOR_OP_MIN_QCOM;
-  else if (op_name == "maximum")
-    binary_op = CL_TENSOR_OP_MAX_QCOM;
-  cl_ml_op_binary_desc_qcom add_desc = {
-      binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
-
-  result =
-      h_ClmlIntf->clCreateMLOpBinaryQCOM(this->context, 0, &add_desc, input_a->tensor,
-                                         input_b->tensor, output_desc->tensor, &op, tuning_cache);
-
-  CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
-  this->function.push_back(op);
-}
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/apps/cpp_clml/clml_runner.h b/apps/cpp_clml/clml_runner.h
deleted file mode 100644
index a1e78fcb66be..000000000000
--- a/apps/cpp_clml/clml_runner.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file clml_runner.h
- * \brief CLML model runner.
- */
-#ifndef CLML_APPS_CPP_RCLML_RUNNER_H_
-#define CLML_APPS_CPP_RCLML_RUNNER_H_
-
-#include <csignal>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <string>
-#if defined(__linux__) || defined(__ANDROID__)
-#include <unistd.h>
-#endif
-
-#include <CL/cl_qcom_ml_ops.h>
-#include <cnpy.h>
-#include <dmlc/io.h>
-
-#include "CL/cl.h"
-
-#define CLML_SDK_TEST_AND_EXIT(expression)                                                      \
-  {                                                                                             \
-    {                                                                                           \
-      int _n_ = !(expression);                                                                  \
-      if (_n_) {                                                                                \
-        fprintf(stderr, "Error on line %d of %s\nFailing expression: %s\n", __LINE__, __FILE__, \
-                #expression);                                                                   \
-        exit(1);                                                                                \
-      }                                                                                         \
-    }                                                                                           \
-  }
-
-#define CAT_I(a, b) a##b
-#define CAT(a, b) CAT_I(a, b)
-#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
-#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
-
-namespace tvm {
-namespace runtime {
-
-/**
- * \brief Tensor dimensions, batch, channel, height, width
- *
- */
-struct tensor_dims_t {
-  uint32_t n, c, h, w;
-};
-
-/*!
- * \brief Tool Arguments.
- * \arg input Numpy file for the model input
- * \arg output Numpy file name to dump the model output as numpy
- * \arg parsms Numpy file holding the params for models
- */
-struct ToolArgs {
-  std::string input;
-  std::string output;
-  std::string params;
-  bool dump_meta = false;
-};
-
-/*!
- * \brief encapsulates CLML Runner functionality for the sub graph
- */
-class CLMLRunner {
- public:
-  /*! \brief Constructor */
-  CLMLRunner(std::string name, ToolArgs& args, cl_platform_id arg_platform_id,
-             cl_context arg_context, cl_device_id arg_device_id, cl_command_queue arg_queue);
-
-  /*! \brief Returns the name for this sub graph */
-  std::string GetModName(void) { return r_name; }
-  /*! \brief Executes one cycle all CLML ops */
-  int Run(void);
-  /*! \brief set meta information */
-  void SetMetaInfo(std::string minfo);
-  /*! \brief Print function to show all meta information */
-  void PrintMetaInfo(void);
-  /*! \brief initializes the unusedTensor */
-  void MakeUnusedTensor(void);
-  /*! \brief Copy given bytestream of data to the tensor */
-  void CopyDataToCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
-                            cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM);
-  /*! \brief Copy tensor data to data in expected layout format */
-  void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
-                              cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM);
-  /*! \brief Allocates memory for the tensor descriptor */
-  cl_int AllocateTensorMemory(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> pTensorMemDesc);
-  /*!
-   * \brief Allocates memory for all tensor descriptor in storage map.
-   * Also initializes the parameter nodes, inputs from given numpy dumps if provided.
-   */
-  void AllocateMemAndPopulateParams(void);
-  /*! \brief Create a tensor descriptor given it's shape, dtype and layout */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
-      std::vector<size_t> shape, std::string dtype = "float32",
-      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM);
-  /*! \brief Conv2D layer implementattion */
-  void MakeConv2D(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
-                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias_desc,
-                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                  std::vector<cl_uint> padding, std::vector<cl_uint> dilation,
-                  std::vector<cl_uint> strides, int groups, cl_convolution_mode_qcom mode,
-                  cl_activation_function_qcom activation, bool has_bias, bool has_act,
-                  std::string dtype);
-
-  /*! \brief Conv2D with Fused BatchNorm layer implementattion */
-  void MakeConv2DWithBN(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias_desc,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_scale,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_bias,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_mean,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_var,
-                        std::vector<float> bn_attrs, std::vector<cl_uint> padding,
-                        std::vector<cl_uint> dilation, std::vector<cl_uint> strides, int groups,
-                        cl_convolution_mode_qcom mode, cl_activation_function_qcom activation,
-                        bool has_bias, bool has_act, std::string dtype);
-
-  /*! \brief ReLU layer implementattion */
-  void MakeRelu(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                cl_activation_function_qcom relu_type, std::string dtype);
-
-  /*! \brief Batch Normalization layer implementattion */
-  void MakeBatchNorm(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                     std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                     std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_scale,
-                     std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_bias,
-                     std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_mean,
-                     std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bn_var,
-                     std::vector<float> bn_attrs, std::string dtype);
-
-  /*! \brief Pool2D (with all variants) layer implementattion */
-  void MakePool2D(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                  std::vector<cl_uint> pool_size, std::vector<cl_uint> strides,
-                  std::vector<cl_uint> padding, std::string pool_type, std::string dtype);
-
-  /*! \brief GlobalPool2D (with all variants) layer implementattion */
-  void MakeGlobalPool2D(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                        std::vector<cl_uint> in_shape, std::string pool_type, std::string dtype);
-
-  /*! \brief Reshape layer implementattion */
-  void MakeReshape(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                   std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, std::string dtype);
-
-  /*! \brief Concatenate layer implementattion */
-  void MakeConcatenate(std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> in_list,
-                       std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, int axis,
-                       std::string dtype);
-
-  /*! \brief Dense layer implementattion */
-  void MakeDense(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                 std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
-                 std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                 std::vector<cl_uint> in_shape, std::vector<cl_uint> wt_shape, std::string dtype);
-
-  /*! \brief SoftMax layer implementattion */
-  void MakeSoftMax(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                   std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, std::string dtype);
-
-  /*! \brief Pad layer implementattion */
-  void MakePad(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-               std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, std::string pad_mode,
-               std::vector<cl_uint> padding, std::string dtype);
-
-  /*! \brief Batch Flatten layer implementattion */
-  void MakeBatchFlatten(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                        std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
-                        std::string dtype);
-
-  /*! \brief Clip layer implementattion */
-  void MakeClip(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
-                std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, float a_max,
-                float a_min, std::string dtype);
-
-  /*! \brief Binary Operator (with all types) layer implementattion */
-  void MakeBinaryOp(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_a,
-                    std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_b,
-                    std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc, std::string op_name,
-                    std::string dtype);
-
-  /*! \brief Vector of created operators */
-  std::vector<cl_ml_op_qcom> function;
-  /*! \brief Vector of graph's input tensor descriptors */
-  std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> inputs;
-  /*! \brief Map of graph's output tensor descriptors with names */
-  std::map<std::string, std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> outputs;
-  /*! \brief Map of graph's output tensor names and dtypes */
-  std::map<std::string, std::string> outputs_dtypes;
-  /*! \brief Map of graph's output tensor names and shapes */
-  std::map<std::string, std::vector<size_t>> outputs_shapes;
-  /*! \brief Overall storage map for all tensor descriptors involved */
-  std::map<std::string, std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> storage_map;
-  /*! \brief List of const tensor of the graph */
-  std::vector<std::string> consts;
-  /*! \brief List of all memory descriptor in graph */
-  std::vector<cl_ml_tensor_memory_desc_qcom> tensorMemDescs;
-  /*! \brief Tensor memory descriptor set */
-  cl_ml_tensor_mem_desc_set_qcom descriptorSet;
-  /*! \brief Unused tensor used across various ops */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> unusedTensor;
-
-  /*! \brief  ML API interface */
-  GET_ML_API_INTERFACE* h_ClmlIntf = nullptr;
-  /*! \brief  Tuning cache object */
-  cl_ml_tuningcache_qcom tuning_cache = nullptr;
-  /*! \brief  Flag to inticate a tuning run */
-  bool is_tuning_run;
-  /*! \brief  The tuning file for loading or storing cache */
-  char* tuning_file;
-
-  /*! \brief  OpenCL platform */
-  cl_platform_id platform{nullptr};
-  /*! \brief  OpenCL context */
-  cl_context context{nullptr};
-  /*! \brief  OpenCL device */
-  cl_device_id device_id{nullptr};
-  /*! \brief  OpenCL Queue */
-  cl_command_queue queue{nullptr};
-  /*! \brief  Numpy object for params */
-  cnpy::npz_t npz_params;
-  /*! \brief  Numpy object for inputs */
-  cnpy::npz_t npz_input;
-
- private:
-  /*! \brief unique name for the runner */
-  std::string r_name;
-  /*! \brief arguments */
-  ToolArgs r_args;
-  /*! \brief Holds meta information from clml codegen */
-  std::string meta_info;
-};
-
-}  // namespace runtime
-}  // namespace tvm
-#endif  // CLML_APPS_CPP_RCLML_RUNNER_H_
diff --git a/apps/cpp_clml/main.cc b/apps/cpp_clml/main.cc
deleted file mode 100644
index b918618a1772..000000000000
--- a/apps/cpp_clml/main.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file main.cc
- * \brief CLML Model execution application.
- */
-
-#include "clml_runner.h"
-
-using namespace tvm::runtime;
-
-/*!
- * \brief Auto generated model file (clml_models.cc) entry function definition.
- * \param args The tool arguments to forward
- * \param arg_platform OpenCL platform
- * \param arg_context OpenCL context
- * \param arg_device_id OpenCL device id
- * \param queue OpenCL queue
- * \return List of CLMLRunner objects corresponding to all sub graphs of a TVM module.
- */
-std::vector<CLMLRunner> BuildModules(ToolArgs& args, cl_platform_id arg_platform,
-                                     cl_context arg_context, cl_device_id arg_device_id,
-                                     cl_command_queue queue);
-
-static const std::string kUsage =
-    "Command line usage\n"
-    "--input        - Numpy file for the model input (optional and we use random of not given)\n"
-    "--output       - Numpy file name to dump the model output as numpy\n"
-    "--params       - Numpy file with params\n"
-    "--dump-meta    - Dump model meta information\n"
-    "\n"
-    "  Example\n"
-    "  ./clml_run --dump-meta\n"
-    "  ./clml_run --params=clmlparams.npz\n"
-    "  ./clml_run --input=input.npz --output=output.npz --params=clml_params.npz\n"
-    "\n";
-
-/*!
- * \brief PrintArgs print the contents of ToolArgs
- * \param args ToolArgs structure
- */
-void PrintArgs(const ToolArgs& args) {
-  LOG(INFO) << "Input         = " << args.input;
-  LOG(INFO) << "Output        = " << args.output;
-  LOG(INFO) << "Params        = " << args.params;
-  LOG(INFO) << "DumpMeta      = " << args.dump_meta;
-}
-
-#if defined(__linux__) || defined(__ANDROID__)
-/*!
- * \brief CtrlCHandler, exits if Ctrl+C is pressed
- * \param s signal
- */
-void CtrlCHandler(int s) {
-  LOG(INFO) << "User pressed Ctrl+C, Exiting";
-  exit(1);
-}
-
-/*!
- * \brief HandleCtrlC Register for handling Ctrl+C event.
- */
-void HandleCtrlC() {
-  // Ctrl+C handler
-  struct sigaction sigIntHandler;
-  sigIntHandler.sa_handler = CtrlCHandler;
-  sigemptyset(&sigIntHandler.sa_mask);
-  sigIntHandler.sa_flags = 0;
-  sigaction(SIGINT, &sigIntHandler, nullptr);
-}
-#endif
-/*!
- * \brief GetCmdOption Parse and find the command option.
- * \param argc arg counter
- * \param argv arg values
- * \param option command line option to search for.
- * \param key whether the option itself is key
- * \return value corresponding to option.
- */
-std::string GetCmdOption(int argc, char* argv[], std::string option, bool key = false) {
-  std::string cmd;
-  for (int i = 1; i < argc; ++i) {
-    std::string arg = argv[i];
-    if (arg.find(option) == 0) {
-      if (key) {
-        cmd = argv[i];
-        return cmd;
-      }
-      // We assume "=" is the end of option.
-      // ICHECK_EQ(*option.rbegin(), '=');
-      cmd = arg.substr(arg.find('=') + 1);
-      return cmd;
-    }
-  }
-  return cmd;
-}
-
-/*!
- * \brief ParseCmdArgs parses the command line arguments.
- * \param argc arg counter
- * \param argv arg values
- * \param args the output structure which holds the parsed values
- */
-void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
-  const std::string input = GetCmdOption(argc, argv, "--input=");
-  if (!input.empty()) {
-    args.input = input;
-  }
-
-  const std::string output = GetCmdOption(argc, argv, "--output=");
-  if (!output.empty()) {
-    args.output = output;
-  }
-
-  const std::string params = GetCmdOption(argc, argv, "--params=");
-  if (!params.empty()) {
-    args.params = params;
-  }
-
-  const std::string pmeta = GetCmdOption(argc, argv, "--dump-meta", true);
-  if (!pmeta.empty()) {
-    args.dump_meta = true;
-  }
-}
-
-/*!
- * \brief Check CLML extension availability in the CL device.
- * \param platform_id OpenCL platform
- * \param device_id OpenCL device id
- * \return true if extension present else false.
- */
-bool ExtensionStringPresent(cl_platform_id platform_id, cl_device_id device_id) {
-  cl_int result = 0;
-  size_t reqd_size = 0;
-  result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size);
-  CLML_SDK_TEST_AND_EXIT(reqd_size > 0u && result == CL_SUCCESS);
-
-  std::vector<char> buf(reqd_size);
-  result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, buf.data(), nullptr);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  std::string extensions(buf.data());
-  LOG(WARNING) << "OpenCL Extensions:" << extensions;
-  return (extensions.find("cl_qcom_ml_ops") != std::string::npos);
-}
-
-/*!
- * \brief Loads and Executes the model on given Target.
- * \param args tool arguments
- * \return result of operation.
- */
-int ExecuteModel(ToolArgs& args) {
-#if defined(__linux__) || defined(__ANDROID__)
-  // Ctrl+C handler
-  HandleCtrlC();
-#endif
-
-  // Init OpenCL Environment
-  cl_int result;
-  cl_event readEvent = nullptr;
-  cl_platform_id platform = nullptr;
-  cl_context context = nullptr;
-  cl_device_id device_id = nullptr;
-  cl_command_queue queue = nullptr;
-
-  // Initialize Context and Command Queue
-  result = clGetPlatformIDs(1, &platform, nullptr);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  uint32_t num_devices = 0;
-  result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS && num_devices == 1);
-
-  result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, nullptr);
-  CLML_SDK_TEST_AND_EXIT(device_id && result == CL_SUCCESS);
-
-  CLML_SDK_TEST_AND_EXIT(ExtensionStringPresent(platform, device_id) == true);
-
-  context = clCreateContext(0, 1, &device_id, nullptr, nullptr, &result);
-  CLML_SDK_TEST_AND_EXIT(result == CL_SUCCESS);
-
-  cl_command_queue_properties queue_props = 0;
-
-  queue = clCreateCommandQueue(context, device_id, queue_props, &result);
-  CLML_SDK_TEST_AND_EXIT(queue && result == CL_SUCCESS);
-
-  // Populate the runner with model
-  LOG(INFO) << "Call Build Modules\n";
-  auto runners = BuildModules(args, platform, context, device_id, queue);
-
-  LOG(INFO) << "Loop Through the Modules";
-  for (auto runner : runners) {
-    if (args.dump_meta) {
-      // Print Meta Information
-      runner.PrintMetaInfo();
-    }
-
-    // Run the model
-    runner.Run();
-  }
-
-  return 0;
-}
-
-/*!
- * \brief main The main function.
- * \param argc arg counter
- * \param argv arg values
- * \return result of operation.
- */
-int main(int argc, char* argv[]) {
-  if (argc <= 1) {
-    LOG(INFO) << kUsage;
-    return 0;
-  }
-
-  ToolArgs args;
-  ParseCmdArgs(argc, argv, args);
-  PrintArgs(args);
-
-  if (ExecuteModel(args)) {
-    PrintArgs(args);
-    LOG(INFO) << kUsage;
-    return -1;
-  }
-  return 0;
-}
diff --git a/apps/cpp_clml/scripts/clml_codegen.py b/apps/cpp_clml/scripts/clml_codegen.py
deleted file mode 100644
index 7540812ed58b..000000000000
--- a/apps/cpp_clml/scripts/clml_codegen.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import sys
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.driver import tvmc
-from tvm.relay.op.contrib import clml
-from tvm.contrib import utils
-from string import Template
-
-
-def main():
-    print("CLML Codegen")
-    if len(sys.argv) != 2:
-        print("Usage: python clml_codegen.py <model_path>")
-        return
-
-    tvmc_model = tvmc.load(sys.argv[1])
-    mod = tvmc_model.mod
-    params = tvmc_model.params
-    with tvm.transform.PassContext(opt_level=3):
-        mod = tvmc.transform.convert_graph_layout(mod, "NCHW")
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        clml_mod = clml.partition_for_clml(mod, params)
-        libm = relay.build(
-            clml_mod,
-            target="opencl",
-            target_host="llvm -mtriple=aarch64-linux-gnu",
-            params=params,
-        )
-
-        # Extract CLML related params
-        (clml_params_save, gen_src) = clml.CLMLGenSrc(libm).get_artifacts()
-        np.savez("clml_params.npz", **clml_params_save)
-
-        f_src = open("../clml_models.cc", "w")
-        f_src.write("\n".join(gen_src))
-        f_src.close()
-        os.popen("clang-format-15 -i ../clml_models.cc")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/apps/cpp_clml/scripts/clml_codegen_json.py b/apps/cpp_clml/scripts/clml_codegen_json.py
deleted file mode 100644
index c3fbf835d8ee..000000000000
--- a/apps/cpp_clml/scripts/clml_codegen_json.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import sys
-import json
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.driver import tvmc
-from tvm.relay.op.contrib import clml
-from tvm.contrib import utils
-from string import Template
-
-
-def main():
-    print("CLML Codegen From JSON")
-    if len(sys.argv) != 3:
-        print("Usage: python clml_codegen_json.py <json path> <outfile path>")
-        return
-
-    with open(sys.argv[1], "r") as file:
-        codegen = json.load(file)
-        (_, gen_src) = clml.CLMLGenSrc(codegen).get_artifacts()
-
-        f_src = open(sys.argv[2], "w")
-        f_src.write("\n".join(gen_src))
-        f_src.close()
-        os.popen("clang-format-15 -i " + sys.argv[2])
-
-
-if __name__ == "__main__":
-    main()
diff --git a/apps/cpp_clml/scripts/compare_npy.py b/apps/cpp_clml/scripts/compare_npy.py
deleted file mode 100644
index 8e3c3a8b630f..000000000000
--- a/apps/cpp_clml/scripts/compare_npy.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys
-import numpy as np
-
-
-def main():
-    print("Compare given numpy array in npz files")
-    if len(sys.argv) != 4:
-        print("Usage: python compare_npy.py <npz file 1> <npz file 2> <np array to cpmpare>")
-        return
-
-    in1 = np.load(sys.argv[1])
-    in2 = np.load(sys.argv[2])
-
-    print(sys.argv[1] + "->" + sys.argv[3] + ":", in1[sys.argv[3]].shape)
-    print(sys.argv[2] + "->" + sys.argv[3] + ":", in1[sys.argv[3]].shape)
-
-    np.testing.assert_allclose(in1[sys.argv[3]], in2[sys.argv[3]], rtol=1e-5, atol=1e-5)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index be0015b17ae1..706313c6d23c 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -101,106 +101,12 @@ copy the shared object with the model and the model JSON file over to the
 device (both are obtained from relay).  Also, copy all input files for the
 model as well.
 
-The following snippet illustrates how to obtain the shared object and the
-JSON file from a TFLite model (using Inception V3 as an example):
-
-```
-# Skipped imports, etc.
-
-with open("inception_v3.tflite", "rb") as f:
-    tflite_model_buf = f.read()
-tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-
-shape_dict = { "input": [1,299,299,3] }
-dtype_dict = { "input": "float32" }
-
-mod, params = relay.frontend.from_tflite(
-    tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
-)
-
-target = tvm.target.hexagon('v68')
-with tvm.transform.PassContext(opt_level=3):
-    lib = relay.build(mod, tvm.target.Target(target, host=target), params=params, mod_name="default")
-
-# Save model.so and model.json:
-with open('model.json', 'w') as f:
-    f.write(lib.get_graph_json())
-lib.get_lib().save('model.so')
-```
-
-The final thing is to prepare a JSON configuration file for the launcher.
-The JSON has two attributes describing the model: `model-library` and
-`model-json`, and an attribute `inputs`, which is a list of records, one
-for each input file.
-An input file record has three attributes: `file`, `shape`, and `dtype`.
-
-Below is an example of the input config file for Inception V3:
-```
-{
-  "model-library": "inceptionv3-float32.so",
-  "model-json": "inceptionv3-float32.json",
-  "inputs" : [
-    {
-      "file": "panda_299x299_fp.dat",
-      "shape": [1,299,299,3],
-      "dtype": "float32"
-    }
-  ]
-}
-```
-
-The launcher will then create the output JSON file (with the name given via
-`--out_config`) containing information about the execution time and the model
-outputs. The output JSON file has three attributes: "pcycles", "usecs" that
-contain the execution duration in terms of processor cycles and microseconds
-respectivaly, and an attribute `outputs`, which is a list of output file records
-whose syntax is identical to the input file records in the input file.
-A sample output JSON from running the Inception V3 model may look like
-```
-{
-  "pcycles": 112965680178,
-  "usecs": 79532302,
-  "outputs": [
-    {
-      "file": "output0.dat",
-      "shape": [1, 1001],
-      "dtype": "float32"
-    }
-  ]
-}
-```
-
-When using AoT, the `target` needs to be `llvm`:
-```
-aot_target = "llvm -keys=hexagon -mattr=+hvxv69,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv69 -mtriple=hexagon"
-aot_host_target = aot_target
-```
-
-Build the relay module specifying AoT as executor and CPP as runtime, and save it via `export_library`:
-```
-lowered = tvm.relay.build(
-    relay_mod,
-    params=params,
-    target=tvm.target.Target(aot_target, host=aot_host_target),
-    runtime=Runtime("cpp"),
-    executor=Executor("aot", {"unpacked-api": False, "interface-api": "packed"}),
-)
-
-lowered.export_library("model-aot.so", fcompile=tvm.contrib.hexagon.link_shared)
-```
-
-
 ## Profiling using hexagon launcher
 
 ### Enabling lightweight profiling (LWP) instrumentation
 
 This profiling option can be used to get function and loop level processor cycles.
-This needs to be enabled explicitly while compiling a model. For example:
-
-```
-with tvm.transform.PassContext(config={'tir.instrument_lwp':True} ):
-    lib = relay.build(...)
-```
+This needs to be enabled explicitly while compiling a model, by setting `tir.instrument_lwp` to `True` in the `tvm.transform.PassContext` config.
 
 Here, `instrument_lwp` is used to enable the tir pass which instruments the code with the builtin calls.
 
diff --git a/apps/ios_rpc/tests/ios_rpc_mobilenet.py b/apps/ios_rpc/tests/ios_rpc_mobilenet.py
deleted file mode 100644
index 85a430317765..000000000000
--- a/apps/ios_rpc/tests/ios_rpc_mobilenet.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import os
-import re
-import sys
-
-import coremltools
-import numpy as np
-import tvm
-from PIL import Image
-from tvm import relay, rpc
-from tvm.contrib import coreml_runtime, graph_executor, utils, xcode
-from tvm.contrib.download import download_testdata
-from tvm.contrib.target import coreml as _coreml
-from tvm.relay import transform
-from tvm.relay.expr_functor import ExprMutator
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-from tvm.relay.quantize.quantize import prerequisite_optimize
-
-# Change target configuration, this is setting for iphone6s
-# arch = "x86_64"
-# sdk = "iphonesimulator"
-arch = "arm64"
-sdk = "iphoneos"
-target_host = "llvm -mtriple=%s-apple-darwin" % arch
-
-MODES = {"proxy": rpc.connect, "tracker": rpc.connect_tracker, "standalone": rpc.connect}
-
-
-# override metal compiler to compile to iphone
-@tvm.register_func("tvm_callback_metal_compile")
-def compile_metal(src, target):
-    return xcode.compile_metal(src, sdk=sdk)
-
-
-def prepare_input():
-    from torchvision import transforms
-
-    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
-    img_name = "cat.png"
-    synset_url = "".join(
-        [
-            "https://gist.githubusercontent.com/zhreshold/",
-            "4d0b62f3d01426887599d4f7ede23ee5/raw/",
-            "596b27d23537e5a1b5751d2b0481ef172f58b539/",
-            "imagenet1000_clsid_to_human.txt",
-        ]
-    )
-    synset_name = "imagenet1000_clsid_to_human.txt"
-    img_path = download_testdata(img_url, img_name, module="data")
-    synset_path = download_testdata(synset_url, synset_name, module="data")
-    with open(synset_path) as f:
-        synset = eval(f.read())
-        input_image = Image.open(img_path)
-
-    preprocess = transforms.Compose(
-        [
-            transforms.Resize(256),
-            transforms.CenterCrop(224),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-        ]
-    )
-    input_tensor = preprocess(input_image)
-    input_batch = input_tensor.unsqueeze(0)
-    return input_batch.detach().cpu().numpy(), synset
-
-
-def get_model(model_name, data_shape):
-    import torch
-    import torchvision
-
-    torch_model = getattr(torchvision.models, model_name)(weights="IMAGENET1K_V1").eval()
-    input_data = torch.randn(data_shape)
-    scripted_model = torch.jit.trace(torch_model, input_data)
-
-    input_infos = [("data", input_data.shape)]
-    mod, params = relay.frontend.from_pytorch(scripted_model, input_infos)
-
-    # we want a probability so add a softmax operator
-    func = mod["main"]
-    func = relay.Function(
-        func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs
-    )
-
-    return func, params
-
-
-def test_mobilenet(host, port, key, mode):
-    temp = utils.tempdir()
-    image, synset = prepare_input()
-    model, params = get_model("mobilenet_v2", image.shape)
-
-    def run(mod, target):
-        with relay.build_config(opt_level=3):
-            lib = relay.build(
-                mod, target=tvm.target.Target(target, host=target_host), params=params
-            )
-        path_dso = temp.relpath("deploy.dylib")
-        lib.export_library(path_dso, fcompile=xcode.create_dylib, arch=arch, sdk=sdk)
-
-        # connect to the proxy
-        if mode == "tracker":
-            remote = MODES[mode](host, port).request(key)
-        else:
-            remote = MODES[mode](host, port, key=key)
-        remote.upload(path_dso)
-
-        if target == "metal":
-            dev = remote.metal(0)
-        else:
-            dev = remote.cpu(0)
-        lib = remote.load_module("deploy.dylib")
-        m = graph_executor.GraphModule(lib["default"](dev))
-
-        m.set_input("data", tvm.nd.array(image, dev))
-        m.run()
-        tvm_output = m.get_output(0)
-        top1 = np.argmax(tvm_output.numpy()[0])
-        print("TVM prediction top-1:", top1, synset[top1])
-
-        # evaluate
-        ftimer = m.module.time_evaluator("run", dev, number=3, repeat=10)
-        prof_res = np.array(ftimer().results) * 1000
-        print("%-19s (%s)" % ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
-
-    def annotate(func, compiler):
-        """
-        An annotator for Core ML.
-        """
-        # Bind free variables to the constant values.
-        bind_dict = {}
-        for arg in func.params:
-            name = arg.name_hint
-            if name in params:
-                bind_dict[arg] = relay.const(params[name])
-
-        func = relay.bind(func, bind_dict)
-
-        # Annotate the entire graph for Core ML
-        mod = tvm.IRModule()
-        mod["main"] = func
-
-        seq = tvm.transform.Sequential(
-            [
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.FoldScaleAxis(),
-                transform.AnnotateTarget(compiler),
-                transform.MergeCompilerRegions(),
-                transform.PartitionGraph(),
-            ]
-        )
-
-        with relay.build_config(opt_level=3):
-            mod = seq(mod)
-
-        return mod
-
-    # CPU
-    run(model, target_host)
-    # Metal
-    run(model, "metal")
-    # CoreML
-    run(annotate(model, "coremlcompiler"), target_host)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Demo app demonstrates how ios_rpc works.")
-    parser.add_argument("--host", required=True, type=str, help="Adress of rpc server")
-    parser.add_argument("--port", type=int, default=9090, help="rpc port (default: 9090)")
-    parser.add_argument("--key", type=str, default="iphone", help="device key (default: iphone)")
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="tracker",
-        help="type of RPC connection (default: tracker), possible values: {}".format(
-            ", ".join(MODES.keys())
-        ),
-    )
-
-    args = parser.parse_args()
-    assert args.mode in MODES.keys()
-    test_mobilenet(args.host, args.port, args.key, args.mode)
diff --git a/apps/relax_examples/e2e_auto_tir.py b/apps/relax_examples/e2e_auto_tir.py
deleted file mode 100644
index 8113f942d166..000000000000
--- a/apps/relax_examples/e2e_auto_tir.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import datetime
-import os
-import csv
-import json
-import argparse
-import logging
-from typing import Dict
-import numpy as np  # type: ignore
-
-import tvm
-from tvm import relay, relax, runtime, transform
-from tvm.ir.module import IRModule
-from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.relax.testing import relay_translator
-from tvm.target.target import Target
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--workload",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--input-shape",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--rpc-port",
-        type=int,
-        default=None,
-    )
-    args.add_argument(
-        "--rpc-key",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--cache-dir",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--rpc-timeout-sec",
-        type=int,
-        default=180,
-    )
-    args.add_argument("--num-measurement-repeats", type=int, default=5)
-    args.add_argument("--num-measurements", type=int, default=10)
-    args.add_argument("--results-file", type=str, required=False, default=None)
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.input_shape = json.loads(parsed.input_shape)
-    if parsed.target.attrs.get("mtriple", None) == "aarch64-linux-gnu":
-        parsed.alloc_repeat = 3
-    else:
-        parsed.alloc_repeat = 1
-    if parsed.rpc_host and parsed.rpc_port and parsed.rpc_key:
-        parsed.rpc_config = ms.runner.RPCConfig(
-            tracker_host=parsed.rpc_host,
-            tracker_port=parsed.rpc_port,
-            tracker_key=parsed.rpc_key,
-            session_timeout_sec=parsed.rpc_timeout_sec,
-        )
-        parsed.workers = parsed.rpc_config.count_num_servers(allow_missing=False)
-    else:
-        # check all rpc configs are None
-        assert (
-            (parsed.rpc_host is None) and (parsed.rpc_port is None) and (parsed.rpc_key is None)
-        ), "Please set all 'rpc_host', 'rpc_port' and 'rpc_key' to use PRC server"
-        parsed.rpc_config = None
-        parsed.workers = 1
-    return parsed
-
-
-logging.basicConfig()
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-ARGS = _parse_args()
-
-
-def apply_opt_before_tuning(
-    relay_mod: IRModule, params: Dict[str, runtime.NDArray], target: Target
-):
-    with transform.PassContext(opt_level=3):
-        main_func = relay_mod["main"]
-        bind_main_func = relay.build_module.bind_params_by_name(main_func, params)
-        relay_mod = IRModule.from_expr(bind_main_func)
-        relay_mod = relay.transform.SimplifyInference()(relay_mod)
-        relay_mod = relay.transform.FoldConstant()(relay_mod)
-        relay_mod = relay.transform.FoldScaleAxis()(relay_mod)
-        relay_mod = relay.transform.CanonicalizeOps()(relay_mod)
-        relay_mod = relay.transform.AlterOpLayout()(relay_mod)
-        relay_mod = relay.transform.FoldConstant()(relay_mod)
-
-        relax_mod = relay_translator.from_relay(relay_mod["main"], target=target)
-        relax_mod = relax.transform.AnnotateTIROpPattern()(relax_mod)
-        relax_mod = relax.transform.FuseOps()(relax_mod)
-        relax_mod = relax.transform.FuseTIR()(relax_mod)
-    return relax_mod
-
-
-def f_measurement(
-    rt_mod: runtime.Module, device: runtime.ndarray.Device, input_data: Dict[str, runtime.NDArray]
-):
-    vm = relax.VirtualMachine(rt_mod, device=device)
-    vm.save_function("main", "measure_func", **input_data, include_return=False)
-    evaluator = vm.time_evaluator(
-        func_name="measure_func",
-        dev=device,
-        repeat=ARGS.num_measurement_repeats,
-        number=ARGS.num_measurements,
-        min_repeat_ms=500,
-    )
-    return evaluator()
-
-
-def get_runner():
-    runner_config = {
-        "evaluator_config": ms.runner.EvaluatorConfig(
-            number=3,
-            repeat=1,
-            min_repeat_ms=100,
-            enable_cpu_cache_flush=False,
-        ),
-        "alloc_repeat": ARGS.alloc_repeat,
-    }
-    if ARGS.rpc_config:
-        runner = ms.runner.RPCRunner(
-            rpc_config=ARGS.rpc_config, max_workers=ARGS.workers, **runner_config
-        )
-    else:
-        runner = ms.runner.LocalRunner(**runner_config)
-
-    return runner
-
-
-def main():
-    relay_mod, params, (input_name, input_shape, input_dtype) = get_network(
-        ARGS.workload,
-        ARGS.input_shape,
-        cache_dir=ARGS.cache_dir,
-    )
-    input_info = {input_name: input_shape}
-    input_data = {}
-    for input_name, input_shape in input_info.items():
-        print(f"  input_name: {input_name}")
-        print(f"  input_shape: {input_shape}")
-        print(f"  input_dtype: {input_dtype}")
-
-    # translate the ResNet model from Relay to Relax
-    relax_mod = apply_opt_before_tuning(relay_mod, params, target=ARGS.target)
-    assert isinstance(relax_mod, tvm.IRModule)
-
-    db = ms.relax_integration.tune_relax(
-        mod=relax_mod,
-        target=ARGS.target,
-        params=params,
-        num_trials_per_iter=64,
-        max_trials_per_task=ARGS.num_trials,
-        max_trials_global=ARGS.num_trials,
-        runner=get_runner(),
-        work_dir=ARGS.work_dir,
-    )
-    executable = ms.relax_integration.compile_relax(
-        db,
-        mod=relax_mod,
-        target=ARGS.target,
-        params=params,
-    )
-
-    for input_name, input_shape in input_info.items():
-        if input_dtype.startswith("float"):
-            input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype)
-        else:
-            input_data[input_name] = np.random.randint(
-                low=0, high=10000, size=input_shape, dtype=input_dtype
-            )
-
-    # for documentation purposes
-    start_time = datetime.datetime.now()
-
-    if ARGS.rpc_config:
-        result = run_module_via_rpc(
-            rpc_config=ARGS.rpc_config,
-            lib=executable.mod,
-            dev_type=ARGS.target.kind.name,
-            args=input_data,
-            continuation=f_measurement,
-        )
-    else:
-        dev = tvm.device(ARGS.target.kind.name)
-        result = f_measurement(executable.mod, dev, input_data)
-
-    print(result)
-
-    if not ARGS.results_file:
-        return
-
-    out_path = os.path.abspath(os.path.expanduser(ARGS.results_file))
-    with open(out_path, "w") as out_file:
-        writer = csv.writer(out_file)
-        # write experiment parameters at the top as a record
-        writer.writerow(["start", str(start_time)])
-        writer.writerow(["workload", ARGS.workload])
-        writer.writerow(["input_shape", ARGS.input_shape])
-        writer.writerow(["target", ARGS.target])
-        writer.writerow(["num_measurement_repeats", ARGS.num_measurement_repeats])
-        for res in result.results:
-            writer.writerow([str(res)])
-
-
-if __name__ == "__main__":
-    main()
diff --git a/apps/relax_examples/mlp.py b/apps/relax_examples/mlp.py
deleted file mode 100644
index 2a81b61543fd..000000000000
--- a/apps/relax_examples/mlp.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Example code on creating, compiling, and running an MLP model in relax
-
-
-import tvm
-from tvm import relax, tir, topi
-import numpy as np
-
-
-def build_mlp(data, weight):
-    bb = relax.BlockBuilder()
-
-    with bb.function("mlp", [data, weight]):
-        gv0 = bb.emit_te(tvm.contrib.cblas.matmul, data, weight, transa=False, transb=False)
-        gv1 = bb.emit_te(topi.nn.relu, gv0)
-        bb.emit_func_output(gv1)
-
-    mod = bb.get()
-    return mod
-
-
-if __name__ == "__main__":
-    # symbolic dimensions
-    n, m = tir.Var("n", "int64"), tir.Var("m", "int64")
-    # create data and weight variables
-    data = relax.Var("data", relax.TensorStructInfo([n, m], "float32"))
-    weight = relax.Var("weight", relax.TensorStructInfo([m, n], "float32"))
-
-    # construct a mlp model
-    mod = build_mlp(data, weight)
-
-    # build and create vm executor
-    target = tvm.target.Target("llvm", host="llvm")
-    ex = relax.build(mod, target)
-    vm = relax.VirtualMachine(ex, tvm.cpu())
-
-    # run the mlp model on relax vm
-    data = tvm.nd.array(np.random.rand(16, 32).astype(np.float32))
-    weight = tvm.nd.array(np.random.rand(32, 16).astype(np.float32))
-    res = vm["mlp"](data, weight)
-    print(res)
diff --git a/apps/relax_examples/nn_module.py b/apps/relax_examples/nn_module.py
deleted file mode 100644
index 57a13e4fb51b..000000000000
--- a/apps/relax_examples/nn_module.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Example code on creating, compiling, and running a neural network with pytorch-like API
-
-
-import tvm
-from tvm.relay import Call
-from tvm import relax, tir
-from tvm.relax.testing import nn
-from tvm.script import relax as R
-import numpy as np
-
-
-if __name__ == "__main__":
-    builder = relax.BlockBuilder()
-
-    # a symbolic variable to represent minibatch size
-    n = tir.Var("n", "int64")
-    input_size = 784
-    hidden_sizes = [128, 32]
-    output_size = 10
-
-    # build a three linear-layer neural network for a classification task
-    with builder.function("main"):
-        model = nn.Sequential(
-            nn.Linear(input_size, hidden_sizes[0]),
-            nn.ReLU(),
-            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
-            nn.ReLU(),
-            nn.Linear(hidden_sizes[1], output_size),
-            nn.LogSoftmax(),
-        )
-        data = nn.Placeholder((n, input_size), name="data")
-        output = model(data)
-        params = [data] + model.parameters()
-        builder.emit_func_output(output, params=params)
-
-    # get and print the IRmodule being built
-    mod = builder.get()
-    mod.show()
-
-    # build the IRModule and create relax vm
-    target = tvm.target.Target("llvm", host="llvm")
-    ex = relax.build(mod, target)
-    vm = relax.VirtualMachine(ex, tvm.cpu())
-
-    # init parameters
-    params = nn.init_params(mod)
-
-    # run the model on relax vm
-    # the input data has a minibatch size of 3
-    data = tvm.nd.array(np.random.rand(3, input_size).astype(np.float32))
-    res = vm["main"](data, *params)
-    print(res)
diff --git a/apps/relax_examples/resnet.py b/apps/relax_examples/resnet.py
deleted file mode 100644
index 6c7350d77847..000000000000
--- a/apps/relax_examples/resnet.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example ResNet workload by translating the Relay program to Relax"""
-
-import tvm
-import tvm.testing
-from tvm.relay import testing
-from tvm import relax, relay
-from tvm.relax.testing import relay_translator, nn
-from tvm.runtime import vm as vm_rt
-from tvm.script import relax as R
-import numpy as np
-
-if __name__ == "__main__":
-    relay_mod, _ = testing.resnet.get_workload(num_layers=50, batch_size=1, dtype="float32")
-
-    # translate the ResNet model from Relay to Relax
-    target = tvm.target.Target("llvm", host="llvm")
-    relax_mod = relay_translator.from_relay(relay_mod["main"], target)
-
-    # print the ResNet IRmodule got translated
-    relax_mod.show()
-
-    # build the IRModule and create relax vm
-    ex = relax.build(relax_mod, target)
-    vm = relax.VirtualMachine(ex, tvm.cpu())
-
-    # init weights and run the model on relax vm
-    shape = (1, 3, 224, 224)
-    data = tvm.nd.array(np.random.rand(*shape).astype(np.float32))
-    params = nn.init_params(relax_mod)
-    res = vm["main"](data, *params)
-
-    # check correctness by comparing with relay result
-    exe = relay.vm.compile(relay_mod, target)
-    relay_vm = vm_rt.VirtualMachine(exe, tvm.cpu())
-    inputs = [data] + params
-    expected_output = relay_vm.run(*inputs)
-    tvm.testing.assert_allclose(res.numpy(), expected_output.numpy(), rtol=1e-4, atol=1e-4)
diff --git a/apps/wasm-standalone/.gitignore b/apps/wasm-standalone/.gitignore
deleted file mode 100644
index 54fb6c73048d..000000000000
--- a/apps/wasm-standalone/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Built packages
-**/lib/
-
-
-#Added by cargo
-
-**/target/
-**/Cargo.lock
diff --git a/apps/wasm-standalone/README.md b/apps/wasm-standalone/README.md
deleted file mode 100644
index 34c844368029..000000000000
--- a/apps/wasm-standalone/README.md
+++ /dev/null
@@ -1,201 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# WebAssembly Standalone for Deep Learning Framework with TVM Runtime
-
-#### Experimental notice: This project is still *experimental* and only serves as a proof of concept for running deep learning frameworks on [WebAssembly runtime](https://github.com/bytecodealliance/wasmtime) with [TVM stack](https://tvm.apache.org/).
-
-- [WebAssembly Standalone for Deep Learning Framework with TVM Runtime](#webassembly-standalone-for-deep-learning-framework-with-tvm-runtime)
-    - [Motivation](#motivation)
-    - [Framework Landscape](#framework-landscape)
-    - [Project Status](#project-status)
-    - [PoC Guidelines](#poc-guidelines)
-        - [Pre-installation](#pre-installation)
-        - [Build ResNet50 model](#build-resnet50-model)
-        - [Build wasm-graph package](#build-wasm-graph-package)
-        - [Test](#test)
-    - [Future Work](#future-work)
-        - [More networks support](#more-networks-support)
-        - [Performance benchmark](#performance-benchmark)
-        - [Native TVM Rust runtime support](#native-tvm-rust-runtime-support)
-    - [Appendix](#appendix)
-        - [System packages install](#system-packages-install)
-
-## Motivation
-
-<img src="https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tvm_support_list.png" alt="TVM hardware support" width="600"/>
-
-As demonstrated in TVM runtime [tutorials](https://tvm.apache.org/docs/tutorials/get_started/relay_quick_start.html), TVM already supports WASM as the optional hardware backend, so we can leverage the features of WebAssembly (portability, security) and TVM runtime (domain-specific, optimization) to build a flexible and auto-optimized graph compiler for all deep learning frameworks.
-
-## Framework Landscape
-
-The figures below demonstrate the whole landscape of running deep learning frameworks on WASM runtime with TVM compiler stack.
-
-* WASM graph generation
-    ```
-       _ _ _ _ _ _ _ _ _ _        _ _ _ _ _ _ _        _ _ _ _ _ _ _ _ _ _ _ _
-      |                   |      |             |      |                       |
-      |  Framework Model  | ---> |  ONNX Model | ---> |  TVM Relay Python API |
-      |_ _ _ _ _ _ _ _ _ _|      |_ _ _ _ _ _ _|      |_ _ _ _ _ _ _ _ _ _ _ _|
-                                                                 ||
-                                                                 \/
-                 _ _ _ _ _ _ _ _ _ _ _                  _ _ _ _ _ _ _ _ _ _ _
-                |                     |                |                     |
-                | WASM Graph Builder  |                |  TVM Compiler Stack |
-                |    (TVM runtime)    |                |_ _ _ _ _ _ _ _ _ _ _|
-                |_ _ _ _ _ _ _ _ _ _ _|                          ||
-                          ||                                     \/
-      _ _ _ _ _ _ _ _ _   ||   _ _ _ _ _ _ _ _ _ _            _ _ _ _ _
-     |                 |  \/  |                   |  llvm-ar |         |
-     | wasm_graph.wasm | <--- | libgraph_wasm32.a | <------- | graph.o |
-     |_ _ _ _ _ _ _ _ _|      |_ _ _ _ _ _ _ _ _ _|          |_ _ _ _ _|
-    ```
-
-* WASM graph loading
-    ```
-         _ _ _ _ _ _ _ _ _ _ _
-        |                     |
-        |  WASM Graph Loader  |
-        |   (WASM runtime)    |
-        |_ _ _ _ _ _ _ _ _ _ _|
-                  ||
-                  \/
-          _ _ _ _ _ _ _ _ _ _
-         |                   |
-         |  wasm_graph.wasm  |
-         |_ _ _ _ _ _ _ _ _ _|
-    ```
-
-## Project Status
-
-This project should be considered **experimental** at the very early stage, all rich features are under active development. Here is the current operator support matrix:
-
-| Model Name | Status |
-| ---------- | ------ |
-| ResNet50 | ✔️ |
-| LeNet | <center>&mdash;</center> |
-
-**NOTICE**: Currently this project is ONLY tested on Ubuntu system, so `Ubuntu 16.04+` should be prepared as the testing environment.
-
-## PoC Guidelines
-
-### Pre-installation
-
-* Rust
-
-    Before running this demo, please make sure [Rust](#system-packages-install) has been installed.
-
-    After Rust installed, execute the code below to add `wasm32-wasi` target:
-    ```shell
-    rustup target add wasm32-wasi
-    ```
-
-* TVM
-
-    Please follow TVM [installations](https://tvm.apache.org/docs/install/index.html) for the detailed instruction.
-
-* LLVM
-
-    `LLVM 10.0` or later is REQUIRED.
-
-### Build ResNet50 model
-
-- Build DL library in the WebAssembly format.
-
-  - Compile the model
-
-    ```
-    cd wasm-graph/tools && LLVM_AR=llvm-ar-10 python ./build_graph_lib.py -O3
-    ```
-
-### Build wasm-graph package
-
-```shell
-cd wasm-graph && cargo build --release
-cp ./target/wasm32-wasi/release/wasm_graph.wasm ./lib/wasm_graph_resnet50.wasm
-```
-
-### Test
-
-Before running this demo, please make sure [`Rust`](#system-packages-install) has been installed.
-
-Next run the command below to install the runtime package for testing (`rust` REQUIRED):
-
-```shell
-cd wasm-runtime/tests/test_graph_resnet50 && cargo build
-```
-
-Check the usage of `test_graph_resnet50`:
-
-```shell
-~# ./target/debug/test_graph_resnet50 -h
-
-Usage: ./target/debug/test_graph_resnet50 [options]
-
-Options:
-    -g, --wasm-graph-file FILE_PATH
-                        set the path to wasm graph file
-    -i, --input-data-file FILE_PATH
-                        set the path to input image file
-    -l, --label-class-file FILE_PATH
-                        set the path to label class file
-    -h, --help          print this help menu
-```
-
-Next perform model inference using these commands below:
-```
-$ cp ../../../wasm-graph/lib/wasm_graph_resnet50.wasm ./
-$ wget -O cat.png https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true
-$ wget -O synset.csv https://raw.githubusercontent.com/kazum/tvm-wasm/master/synset.csv
-$ ./target/debug/test_graph_resnet50 -g ./wasm_graph_resnet50.wasm -i ./cat.png -l ./synset.csv
-original image dimensions: (256, 256)
-resized image dimensions: (224, 224)
-input image belongs to the class `tiger cat`
-```
-
-Note: this example also works without WASI support. Please modify `wasm-graph/.cargo/config` to change the target to
-`wasm32-unknown-unknown` and uncomment the raw wasm engine in `wasm-runtime/src/graph.rs` to run in pure wasm32. SIMD
-may not be supported without WASI support. You may also need to delete ` -mattr=+simd128` in the
-[build script](apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py).
-
-## Future Work
-
-### More networks support
-TODO
-
-### Performance benchmark
-
-We are working on several improvements on performances:
-* WebAssembly simd128 support (**Done**)
-* Auto-tvm enhancement for llvm target
-
-### Native TVM Rust runtime support
-TODO
-
-## Appendix
-
-### System packages install
-
-* Rust (latest version)
-
-    If you are running Windows, to install Rust, download and run the [RUST-INIT.EXE](https://win.rustup.rs/), and then follow the onscreen instructions.
-
-    If you are a Linux user, run the following in your terminal, then follow the on-screen instructions to install Rust.
-
-    ```shell
-    curl https://sh.rustup.rs -sSf | sh
-    ```
diff --git a/apps/wasm-standalone/wasm-graph/.cargo/config b/apps/wasm-standalone/wasm-graph/.cargo/config
deleted file mode 100644
index b01a37beeb90..000000000000
--- a/apps/wasm-standalone/wasm-graph/.cargo/config
+++ /dev/null
@@ -1,3 +0,0 @@
-[build]
-target = "wasm32-wasi"
-rustflags = ["-C", "link-arg=--whole-archive", "-C", "link-arg=-lgraph_wasm32"]
diff --git a/apps/wasm-standalone/wasm-graph/Cargo.toml b/apps/wasm-standalone/wasm-graph/Cargo.toml
deleted file mode 100644
index cea491b2f128..000000000000
--- a/apps/wasm-standalone/wasm-graph/Cargo.toml
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "wasm-graph"
-version = "0.1.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-description = "WebAssembly graph to deep learning frameworks using TVM"
-readme = "README.md"
-repository = "https://github.com/apache/tvm"
-license = "Apache-2.0"
-keywords = ["wasm", "machine learning", "tvm"]
-
-[profile.release]
-lto = true
-opt-level = 's'
-
-[lib]
-crate-type = ['cdylib']
-
-[dependencies]
-serde = "1.0.53"
-serde_derive = "1.0.53"
-serde_json = "1.0.53"
-ndarray = "0.12"
-tvm-sys = { path = "../../../rust/tvm-sys" }
-tvm-graph-rt = { path = "../../../rust/tvm-graph-rt" }
-lazy_static = "1.1.1"
diff --git a/apps/wasm-standalone/wasm-graph/build.rs b/apps/wasm-standalone/wasm-graph/build.rs
deleted file mode 100644
index 8f9b36df3379..000000000000
--- a/apps/wasm-standalone/wasm-graph/build.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-fn main() {
-    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/lib");
-    println!("cargo:rustc-link-search=native={}", out_dir);
-}
diff --git a/apps/wasm-standalone/wasm-graph/src/lib.rs b/apps/wasm-standalone/wasm-graph/src/lib.rs
deleted file mode 100644
index 92a3d5c2f3b0..000000000000
--- a/apps/wasm-standalone/wasm-graph/src/lib.rs
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#[macro_use]
-extern crate lazy_static;
-#[macro_use]
-extern crate serde_derive;
-
-mod types;
-mod utils;
-
-use std::{collections::HashMap, convert::TryFrom, env, sync::Mutex};
-
-use tvm_graph_rt::{Graph, GraphExecutor, SystemLibModule, Tensor as TVMTensor};
-
-use types::Tensor;
-
-extern "C" {
-    fn __wasm_call_ctors();
-}
-
-lazy_static! {
-    static ref SYSLIB: SystemLibModule = SystemLibModule::default();
-    static ref GRAPH_EXECUTOR: Mutex<GraphExecutor<'static, 'static>> = {
-        unsafe {
-            // This is necessary to invoke TVMBackendRegisterSystemLibSymbol
-            // API calls.
-            __wasm_call_ctors();
-        }
-        let graph = Graph::try_from(include_str!(concat!(
-            env!("CARGO_MANIFEST_DIR"),
-            "/lib/graph.json"
-        )))
-        .unwrap();
-
-        let params_bytes =
-            include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/lib/graph.params"));
-        let params = tvm_graph_rt::load_param_dict(params_bytes)
-            .unwrap()
-            .into_iter()
-            .map(|(k, v)| (k, v.to_owned()))
-            .collect::<HashMap<String, TVMTensor<'static>>>();
-
-        let mut exec = GraphExecutor::new(graph, &*SYSLIB).unwrap();
-
-        exec.load_params(params);
-
-        Mutex::new(exec)
-    };
-}
-
-#[no_mangle]
-pub extern "C" fn run(wasm_addr: i32, in_size: i32) -> i32 {
-    let in_tensor = unsafe { utils::load_input(wasm_addr, in_size as usize) };
-    let input: TVMTensor = in_tensor.as_dltensor().into();
-
-    // since this executor is not multi-threaded, we can acquire lock once
-    let mut executor = GRAPH_EXECUTOR.lock().unwrap();
-
-    executor.set_input("data", input);
-
-    executor.run();
-
-    let output = executor.get_output(0).unwrap().as_dltensor(false);
-
-    let out_tensor: Tensor = output.into();
-    let out_size = unsafe { utils::store_output(wasm_addr, out_tensor) };
-    out_size as i32
-}
diff --git a/apps/wasm-standalone/wasm-graph/src/types.rs b/apps/wasm-standalone/wasm-graph/src/types.rs
deleted file mode 100644
index f08b7be84990..000000000000
--- a/apps/wasm-standalone/wasm-graph/src/types.rs
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    any::TypeId,
-    os::raw::{c_int, c_void},
-    slice,
-};
-pub use tvm_sys::ffi::DLTensor;
-use tvm_sys::ffi::{
-    DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDevice, DLDeviceType_kDLCPU,
-};
-
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
-pub enum DataType {
-    FP32,
-    INT32,
-    INT8,
-}
-
-impl DataType {
-    pub fn as_dldtype(&self) -> DLDataType {
-        match self {
-            DataType::INT32 => DLDataType {
-                code: DLDataTypeCode_kDLInt as u8,
-                bits: 32u8,
-                lanes: 1u16,
-            },
-            DataType::INT8 => DLDataType {
-                code: DLDataTypeCode_kDLInt as u8,
-                bits: 8u8,
-                lanes: 1u16,
-            },
-            DataType::FP32 => DLDataType {
-                code: DLDataTypeCode_kDLFloat as u8,
-                bits: 32u8,
-                lanes: 1u16,
-            },
-        }
-    }
-
-    /// Returns whether this `DataType` represents primitive type `T`.
-    pub fn is_type<T: 'static>(&self) -> bool {
-        let typ = TypeId::of::<T>();
-        typ == TypeId::of::<i32>() || typ == TypeId::of::<i8>() || typ == TypeId::of::<f32>()
-    }
-}
-
-impl From<DLDataType> for DataType {
-    fn from(dl_dtype: DLDataType) -> Self {
-        if dl_dtype.code == DLDataTypeCode_kDLInt as u8 && dl_dtype.bits == 32u8 {
-            DataType::INT32
-        } else if dl_dtype.code == DLDataTypeCode_kDLInt as u8 && dl_dtype.bits == 8u8 {
-            DataType::INT8
-        } else if dl_dtype.code == DLDataTypeCode_kDLFloat as u8 && dl_dtype.bits == 32u8 {
-            DataType::FP32
-        } else {
-            DataType::FP32
-        }
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Tensor {
-    pub(crate) dtype: DataType,
-    pub(crate) shape: Vec<i64>,
-    pub(crate) strides: Option<Vec<usize>>,
-    pub(crate) data: Vec<u8>,
-}
-
-#[allow(dead_code)]
-impl Tensor {
-    pub fn new(dtype: DataType, shape: Vec<i64>, strides: Vec<usize>, data: Vec<u8>) -> Self {
-        Tensor {
-            dtype,
-            shape,
-            strides: Some(strides),
-            data,
-        }
-    }
-
-    pub fn dtype(&self) -> DataType {
-        self.dtype.clone()
-    }
-
-    pub fn ndim(&self) -> usize {
-        self.shape.len()
-    }
-
-    pub fn shape(&self) -> Vec<i64> {
-        self.shape.clone()
-    }
-
-    pub fn data(&self) -> Vec<u8> {
-        self.data.clone()
-    }
-
-    pub fn as_dltensor(&self) -> DLTensor {
-        DLTensor {
-            data: self.data.as_ptr() as *mut c_void,
-            device: DLDevice {
-                device_type: DLDeviceType_kDLCPU,
-                device_id: 0 as c_int,
-            },
-            ndim: self.shape.len() as c_int,
-            dtype: self.dtype().as_dldtype(),
-            shape: self.shape.as_ptr() as *mut i64,
-            strides: self.strides.as_ref().unwrap().as_ptr() as *mut i64,
-            byte_offset: 0,
-            ..Default::default()
-        }
-    }
-
-    /// Returns the data of this `Tensor` as a `Vec`.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `Tensor` does not contain elements of type `T`.
-    pub fn to_vec<T: 'static + std::fmt::Debug + Clone>(&self) -> Vec<T> {
-        assert!(self.dtype().is_type::<T>());
-
-        unsafe {
-            slice::from_raw_parts(
-                self.data().as_ptr() as *const T,
-                self.shape().iter().map(|v| *v as usize).product::<usize>() as usize,
-            )
-            .to_vec()
-        }
-    }
-}
-
-impl Default for Tensor {
-    fn default() -> Self {
-        Self {
-            dtype: DataType::FP32,
-            shape: Vec::new(),
-            strides: None,
-            data: Vec::new(),
-        }
-    }
-}
-
-impl From<DLTensor> for Tensor {
-    fn from(dlt: DLTensor) -> Self {
-        unsafe {
-            let shape = slice::from_raw_parts_mut(dlt.shape, dlt.ndim as usize).to_vec();
-            let size = shape.iter().map(|v| *v as usize).product::<usize>() as usize;
-            let itemsize: usize = (dlt.dtype.bits >> 3).into();
-            let data = slice::from_raw_parts(dlt.data as *const u8, size * itemsize).to_vec();
-
-            Self {
-                dtype: DataType::from(dlt.dtype),
-                shape,
-                strides: if dlt.strides.is_null() {
-                    None
-                } else {
-                    Some(
-                        slice::from_raw_parts_mut(dlt.strides as *mut usize, dlt.ndim as usize)
-                            .to_vec(),
-                    )
-                },
-                data,
-            }
-        }
-    }
-}
diff --git a/apps/wasm-standalone/wasm-graph/src/utils.rs b/apps/wasm-standalone/wasm-graph/src/utils.rs
deleted file mode 100644
index 92d386e3062a..000000000000
--- a/apps/wasm-standalone/wasm-graph/src/utils.rs
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use super::types::*;
-use serde_json;
-use std::ptr;
-
-pub unsafe fn load_input(in_addr: i32, in_size: usize) -> Tensor {
-    let in_addr = in_addr as *mut u8;
-
-    println!("DEBUG: in_addr {:?}, in_size {:?}", in_addr, in_size);
-
-    let data_vec = unsafe { std::slice::from_raw_parts(in_addr, in_size) };
-
-    let input = serde_json::from_slice(&data_vec);
-    match input {
-        Ok(result) => {
-            println!("DEBUG: SER SUCCEED!!! and Ok");
-            result
-        }
-        Err(e) => {
-            panic!("DEBUG: SER SUCCEED!!! but Err, {:?}", &e);
-        }
-    }
-}
-
-pub unsafe fn store_output(out_addr: i32, output: Tensor) -> usize {
-    let out_addr = out_addr as *mut u8;
-
-    let data_vec = serde_json::to_vec(&output).unwrap();
-    let data_size = data_vec.len();
-    for i in 0..data_size {
-        ptr::write(out_addr.offset(i as isize), *data_vec.get(i).unwrap());
-    }
-
-    data_size
-}
diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
deleted file mode 100755
index f08ee07731ac..000000000000
--- a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Builds a simple resnet50 graph for testing."""
-import argparse
-import os
-import subprocess
-import sys
-
-import onnx
-import tvm
-from tvm import relay, runtime
-from tvm.contrib.download import download_testdata
-from tvm.contrib import graph_executor
-
-from PIL import Image
-import numpy as np
-import tvm.relay as relay
-
-# This example uses resnet50-v2-7 model
-model_url = (
-    "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/"
-    "vision/classification/resnet/model/"
-    "resnet50-v2-7.onnx"
-)
-
-
-def build_graph_lib(opt_level):
-    """Compiles the pre-trained model with TVM"""
-    out_dir = os.path.join(sys.path[0], "../lib")
-    if not os.path.exists(out_dir):
-        os.makedirs(out_dir)
-
-    # Follow the tutorial to download and compile the model
-    model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx")
-    onnx_model = onnx.load(model_path)
-
-    img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
-    img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
-
-    # Resize it to 224x224
-    resized_image = Image.open(img_path).resize((224, 224))
-    img_data = np.asarray(resized_image).astype("float32")
-
-    # Our input image is in HWC layout while ONNX expects CHW input, so convert the array
-    img_data = np.transpose(img_data, (2, 0, 1))
-
-    # Normalize according to the ImageNet input specification
-    imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
-    imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
-    norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev
-
-    # Add the batch dimension, as we are expecting 4-dimensional input: NCHW.
-    img_data = np.expand_dims(norm_img_data, axis=0)
-
-    input_name = "data"
-    shape_dict = {input_name: img_data.shape}
-
-    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
-    target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128"
-
-    with tvm.transform.PassContext(opt_level=opt_level):
-        factory = relay.build(
-            mod,
-            target=target,
-            params=params,
-            runtime=tvm.relay.backend.Runtime("cpp", {"system-lib": True}),
-        )
-
-    # Save the model artifacts to obj_file
-    obj_file = os.path.join(out_dir, "graph.o")
-    factory.get_lib().save(obj_file)
-
-    # Run llvm-ar to archive obj_file into lib_file
-    lib_file = os.path.join(out_dir, "libgraph_wasm32.a")
-    cmds = [os.environ.get("LLVM_AR", "llvm-ar-10"), "rcs", lib_file, obj_file]
-    subprocess.run(cmds)
-
-    # Save the json and params
-    with open(os.path.join(out_dir, "graph.json"), "w") as f_graph:
-        f_graph.write(factory.get_graph_json())
-    with open(os.path.join(out_dir, "graph.params"), "wb") as f_params:
-        f_params.write(runtime.save_param_dict(factory.get_params()))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="ONNX model build example")
-    parser.add_argument(
-        "-O",
-        "--opt-level",
-        type=int,
-        default=0,
-        help="level of optimization. 0 is non-optimized and 3 is the highest level",
-    )
-    args = parser.parse_args()
-
-    build_graph_lib(args.opt_level)
diff --git a/apps/wasm-standalone/wasm-runtime/Cargo.toml b/apps/wasm-standalone/wasm-runtime/Cargo.toml
deleted file mode 100644
index d3f860170d4e..000000000000
--- a/apps/wasm-standalone/wasm-runtime/Cargo.toml
+++ /dev/null
@@ -1,35 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "wasm-runtime"
-version = "0.1.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-description = "WebAssembly runtime to deep learning frameworks using wasmtime"
-repository = "https://github.com/apache/tvm"
-license = "Apache-2.0"
-keywords = ["wasm", "machine learning", "wasmtime"]
-
-[dependencies]
-wasmtime = "0.28.0"
-wasmtime-wasi = "0.28.0"
-anyhow = "1.0.31"
-serde = "1.0.53"
-serde_json = "1.0.53"
-serde_derive = "1.0.53"
-ndarray = "0.12"
diff --git a/apps/wasm-standalone/wasm-runtime/src/graph.rs b/apps/wasm-standalone/wasm-runtime/src/graph.rs
deleted file mode 100644
index bfa1c2f19c56..000000000000
--- a/apps/wasm-standalone/wasm-runtime/src/graph.rs
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use anyhow::Result;
-use wasmtime::*;
-use wasmtime_wasi::{WasiCtx, WasiCtxBuilder};
-
-use super::Tensor;
-
-pub struct GraphExecutor {
-    pub(crate) wasm_addr: i32,
-    pub(crate) input_size: i32,
-    pub(crate) output_size: i32,
-    pub(crate) store: Option<Store<WasiCtx>>,
-    // None-WASI version:
-    // pub(crate) store: Option<Store<()>>,
-    pub(crate) instance: Option<Instance>,
-}
-
-#[allow(dead_code)]
-impl GraphExecutor {
-    pub fn new() -> Self {
-        Self {
-            wasm_addr: 0,
-            input_size: 0,
-            output_size: 0,
-            store: None,
-            instance: None,
-        }
-    }
-
-    pub fn instantiate(&mut self, wasm_graph_file: String) -> Result<()> {
-        // It seems WASI in this example is not necessary
-
-        // None WASI version: works with no SIMD
-        // let engine = Engine::new(Config::new().wasm_simd(true)).unwrap();
-        // let mut store = Store::new(&engine, ());
-        // let module = Module::from_file(store.engine(), &wasm_graph_file)?;
-
-        // let instance = Instance::new(&mut store, &module, &[])?;
-
-        // self.instance = Some(instance);
-        // self.store = Some(store);
-
-        // Ok(())
-
-        // WASI version:
-        let engine = Engine::new(Config::new().wasm_simd(true)).unwrap();
-        // First set up our linker which is going to be linking modules together. We
-        // want our linker to have wasi available, so we set that up here as well.
-        let mut linker = Linker::new(&engine);
-        wasmtime_wasi::add_to_linker(&mut linker, |s| s)?;
-        // Create an instance of `Wasi` which contains a `WasiCtx`. Note that
-        // `WasiCtx` provides a number of ways to configure what the target program
-        // will have access to.
-        let wasi = WasiCtxBuilder::new()
-            .inherit_stdio()
-            .inherit_args()?
-            .build();
-        let mut store = Store::new(&engine, wasi);
-
-        let module = Module::from_file(&engine, &wasm_graph_file)?;
-        self.instance = Some(linker.instantiate(&mut store, &module)?);
-        self.store = Some(store);
-
-        Ok(())
-    }
-
-    pub fn set_input(&mut self, input_data: Tensor) -> Result<()> {
-        let memory = self
-            .instance
-            .as_ref()
-            .unwrap()
-            .get_memory(self.store.as_mut().unwrap(), "memory")
-            .ok_or_else(|| anyhow::format_err!("failed to find `memory` export"))?;
-
-        // Specify the wasm address to access the wasm memory.
-        let wasm_addr = memory.data_size(self.store.as_mut().unwrap());
-
-        // Serialize the data into a JSON string.
-        let in_data = serde_json::to_vec(&input_data)?;
-        let in_size = in_data.len();
-
-        // Grow up memory size according to in_size to avoid memory leak.
-        memory.grow(self.store.as_mut().unwrap(), (in_size >> 16) as u32 + 1)?;
-
-        memory.write(self.store.as_mut().unwrap(), wasm_addr, &in_data)?;
-
-        self.wasm_addr = wasm_addr as i32;
-        self.input_size = in_size as i32;
-
-        Ok(())
-    }
-
-    pub fn run(&mut self) -> Result<()> {
-        // Invoke `run` export.
-        let run = self
-            .instance
-            .as_ref()
-            .unwrap()
-            .get_func(self.store.as_mut().unwrap(), "run")
-            .ok_or_else(|| anyhow::format_err!("failed to find `run` function export!"))?;
-
-        let params = [Val::I32(self.wasm_addr), Val::I32(self.input_size)];
-        let out_size = run.call(self.store.as_mut().unwrap(), &params[..])?;
-        let out_size = (*out_size)[0].unwrap_i32();
-        if out_size == 0 {
-            panic!("graph run failed!");
-        }
-
-        self.output_size = out_size;
-        Ok(())
-    }
-
-    pub fn get_output(&mut self) -> Result<Tensor> {
-        let memory = self
-            .instance
-            .as_ref()
-            .unwrap()
-            .get_memory(self.store.as_mut().unwrap(), "memory")
-            .ok_or_else(|| anyhow::format_err!("failed to find `memory` export"))?;
-
-        let mut out_data = vec![0 as u8; self.output_size as _];
-        memory.read(
-            self.store.as_mut().unwrap(),
-            self.wasm_addr as _,
-            &mut out_data,
-        )?;
-
-        let out_vec: Tensor = serde_json::from_slice(&out_data).unwrap();
-        Ok(out_vec)
-    }
-}
-
-impl Default for GraphExecutor {
-    fn default() -> Self {
-        Self::new()
-    }
-}
diff --git a/apps/wasm-standalone/wasm-runtime/src/lib.rs b/apps/wasm-standalone/wasm-runtime/src/lib.rs
deleted file mode 100644
index fa41cade035d..000000000000
--- a/apps/wasm-standalone/wasm-runtime/src/lib.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#[macro_use]
-extern crate serde_derive;
-
-mod graph;
-mod types;
-
-pub use graph::GraphExecutor;
-pub use types::Tensor;
diff --git a/apps/wasm-standalone/wasm-runtime/src/types.rs b/apps/wasm-standalone/wasm-runtime/src/types.rs
deleted file mode 100644
index 762a75d3c910..000000000000
--- a/apps/wasm-standalone/wasm-runtime/src/types.rs
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{any::TypeId, mem, slice};
-
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
-pub enum DataType {
-    FP32,
-    INT32,
-    INT8,
-}
-
-impl DataType {
-    /// Returns whether this `DataType` represents primitive type `T`.
-    pub fn is_type<T: 'static>(&self) -> bool {
-        let typ = TypeId::of::<T>();
-        typ == TypeId::of::<i32>() || typ == TypeId::of::<i8>() || typ == TypeId::of::<f32>()
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Tensor {
-    pub(crate) dtype: DataType,
-    pub(crate) shape: Vec<i64>,
-    pub(crate) strides: Option<Vec<usize>>,
-    pub(crate) data: Vec<u8>,
-}
-
-#[allow(dead_code)]
-impl Tensor {
-    pub fn new(dtype: DataType, shape: Vec<i64>, strides: Vec<usize>, data: Vec<u8>) -> Self {
-        Tensor {
-            dtype,
-            shape,
-            strides: Some(strides),
-            data,
-        }
-    }
-
-    pub fn dtype(&self) -> DataType {
-        self.dtype.clone()
-    }
-
-    pub fn ndim(&self) -> usize {
-        self.shape.len()
-    }
-
-    pub fn shape(&self) -> Vec<i64> {
-        self.shape.clone()
-    }
-
-    pub fn data(&self) -> Vec<u8> {
-        self.data.clone()
-    }
-
-    /// Returns the data of this `Tensor` as a `Vec`.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `Tensor` does not contain elements of type `T`.
-    pub fn to_vec<T: 'static + std::fmt::Debug + Clone>(&self) -> Vec<T> {
-        assert!(self.dtype().is_type::<T>());
-
-        unsafe {
-            slice::from_raw_parts(
-                self.data().as_ptr() as *const T,
-                self.shape().iter().map(|v| *v as usize).product::<usize>() as usize,
-            )
-            .to_vec()
-        }
-    }
-}
-
-impl Default for Tensor {
-    fn default() -> Self {
-        Self {
-            dtype: DataType::FP32,
-            shape: Vec::new(),
-            strides: None,
-            data: Vec::new(),
-        }
-    }
-}
-
-/// `From` conversions to `Tensor` for `ndarray::Array`.
-/// Takes a reference to the `ndarray` since `Tensor` is not owned.
-macro_rules! impl_tensor_from_ndarray {
-    ($type:ty, $typecode:expr) => {
-        impl<D: ndarray::Dimension> From<ndarray::Array<$type, D>> for Tensor {
-            fn from(arr: ndarray::Array<$type, D>) -> Self {
-                Tensor {
-                    dtype: $typecode,
-                    shape: arr.shape().iter().map(|v| *v as i64).collect(),
-                    strides: Some(arr.strides().iter().map(|v| *v as usize).collect()),
-                    data: unsafe {
-                        slice::from_raw_parts(
-                            arr.as_ptr() as *const u8,
-                            arr.len() * mem::size_of::<$type>(),
-                        )
-                        .to_vec()
-                    },
-                }
-            }
-        }
-    };
-}
-
-impl_tensor_from_ndarray!(f32, DataType::FP32);
-impl_tensor_from_ndarray!(i32, DataType::INT32);
-impl_tensor_from_ndarray!(i8, DataType::INT8);
diff --git a/apps/wasm-standalone/wasm-runtime/tests/test_graph_resnet50/Cargo.toml b/apps/wasm-standalone/wasm-runtime/tests/test_graph_resnet50/Cargo.toml
deleted file mode 100644
index 67ffe3429363..000000000000
--- a/apps/wasm-standalone/wasm-runtime/tests/test_graph_resnet50/Cargo.toml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "test_graph_resnet50"
-version = "0.1.0"
-license = "Apache-2.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-getopts = "0.2.21"
-ndarray = "0.12"
-csv = "1.1"
-image = "0.20"
-wasm-runtime = { path = "../../" }
diff --git a/apps/wasm-standalone/wasm-runtime/tests/test_graph_resnet50/src/main.rs b/apps/wasm-standalone/wasm-runtime/tests/test_graph_resnet50/src/main.rs
deleted file mode 100644
index befac124e9e4..000000000000
--- a/apps/wasm-standalone/wasm-runtime/tests/test_graph_resnet50/src/main.rs
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use getopts::Options;
-use image::{FilterType, GenericImageView};
-use ndarray::Array;
-use std::{collections::HashMap, env, fs::File, io::BufReader};
-use wasm_runtime::{GraphExecutor, Tensor};
-
-const IMG_HEIGHT: usize = 224;
-const IMG_WIDTH: usize = 224;
-
-fn print_usage(program: &str, opts: Options) {
-    let brief = format!("Usage: {} [options]", program);
-    print!("{}", opts.usage(&brief));
-}
-
-fn main() {
-    let args: Vec<String> = env::args().collect();
-    let program = args[0].clone();
-
-    let mut opts = Options::new();
-    opts.optopt(
-        "g",
-        "wasm-graph-file",
-        "set the path to wasm graph file",
-        "FILE_PATH",
-    );
-    opts.optopt(
-        "i",
-        "input-data-file",
-        "set the path to input image file",
-        "FILE_PATH",
-    );
-    opts.optopt(
-        "l",
-        "label-class-file",
-        "set the path to label class file",
-        "FILE_PATH",
-    );
-    opts.optflag("h", "help", "print this help menu");
-    let matches = match opts.parse(&args[1..]) {
-        Ok(m) => m,
-        Err(f) => panic!(f.to_string()),
-    };
-    if matches.opt_present("h") {
-        print_usage(&program, opts);
-        return;
-    }
-    let wasm_graph_file: String = match matches.opt_str("g") {
-        Some(s) => s,
-        None => String::from(""),
-    };
-    let input_data_file: String = match matches.opt_str("i") {
-        Some(s) => s,
-        None => String::from(""),
-    };
-    let label_class_file: String = match matches.opt_str("l") {
-        Some(s) => s,
-        None => String::from(""),
-    };
-    let img = image::open(input_data_file).unwrap();
-    let input = data_preprocess(img);
-
-    let mut graph_exec = GraphExecutor::new();
-    graph_exec.instantiate(wasm_graph_file).unwrap();
-    graph_exec.set_input(input).unwrap();
-    graph_exec.run().unwrap();
-    let output: Tensor = match graph_exec.get_output() {
-        Ok(m) => m,
-        Err(f) => panic!(f.to_string()),
-    };
-    output_assert(output, label_class_file);
-}
-
-fn data_preprocess(img: image::DynamicImage) -> Tensor {
-    println!("original image dimensions: {:?}", img.dimensions());
-    let img = img
-        .resize_exact(IMG_HEIGHT as u32, IMG_WIDTH as u32, FilterType::Nearest)
-        .to_rgb();
-    println!("resized image dimensions: {:?}", img.dimensions());
-    let mut pixels: Vec<f32> = vec![];
-    for pixel in img.pixels() {
-        let tmp = pixel.data;
-        // normalize the RGB channels using mean, std of imagenet1k
-        let tmp = [
-            (tmp[0] as f32 - 123.0) / 58.395, // R
-            (tmp[1] as f32 - 117.0) / 57.12,  // G
-            (tmp[2] as f32 - 104.0) / 57.375, // B
-        ];
-        for e in &tmp {
-            pixels.push(*e);
-        }
-    }
-
-    // (H,W,C) -> (C,H,W)
-    let arr = Array::from_shape_vec((IMG_HEIGHT, IMG_WIDTH, 3), pixels).unwrap();
-    let arr = arr.permuted_axes([2, 0, 1]);
-    let arr = Array::from_iter(arr.into_iter().copied().map(|v| v));
-
-    Tensor::from(arr)
-}
-
-fn output_assert(out_tensor: Tensor, label_class_file: String) {
-    let output = out_tensor.to_vec::<f32>();
-
-    // Find the maximum entry in the output and its index.
-    let mut argmax = -1;
-    let mut max_prob = 0.;
-    for i in 0..output.len() {
-        if output[i] > max_prob {
-            max_prob = output[i];
-            argmax = i as i32;
-        }
-    }
-
-    // Create a hash map of (class id, class name)
-    let mut synset: HashMap<i32, String> = HashMap::new();
-    let mut rdr = csv::ReaderBuilder::new().from_reader(BufReader::new(
-        File::open(label_class_file.as_str()).unwrap(),
-    ));
-
-    for result in rdr.records() {
-        let record = result.unwrap();
-        let id: i32 = record[0].parse().unwrap();
-        let cls = record[1].to_string();
-        synset.insert(id, cls);
-    }
-
-    println!(
-        "input image belongs to the class `{}`",
-        synset
-            .get(&argmax)
-            .expect("cannot find the class id for argmax")
-    );
-}
diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy
index 0eab7ac6cbd4..5e48cc65004b 100644
--- a/ci/jenkins/generated/arm_jenkinsfile.groovy
+++ b/ci/jenkins/generated/arm_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.773823
+// Generated at 2025-02-15T10:14:10.162250
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the newest build of the job. JOB_NAME may be a folder
+  // path (multibranch), so resolve by full name; BUILD_NUMBER is a String.
+  def lastBuild = Jenkins.instance.getItemByFullName(env.JOB_NAME)?.getLastBuild()
+  return lastBuild != null && lastBuild.getNumber().toString() == env.BUILD_NUMBER
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
@@ -523,11 +530,17 @@ def build() {
     try {
         run_build('ARM-GRAVITON3-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('ARM-GRAVITON3')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('ARM-GRAVITON3')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
@@ -725,108 +738,6 @@ def shard_run_integration_aarch64_4_of_4(node_type) {
 
 
 
-def shard_run_topi_aarch64_1_of_2(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=topi: aarch64',
-            'TVM_NUM_SHARDS=2',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              cpp_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-                label: 'Run test_arm_compute_lib test',
-              )
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('topi: aarch64 1 of 2')
-  }
-}
-
-def shard_run_topi_aarch64_2_of_2(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=topi: aarch64',
-            'TVM_NUM_SHARDS=2',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-                label: 'Run test_arm_compute_lib test',
-              )
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('topi: aarch64 2 of 2')
-  }
-}
-
-
 def test() {
   stage('Test') {
     environment {
@@ -837,66 +748,68 @@ def test() {
       try {
       shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'integration: aarch64 2 of 4': {
       try {
       shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'integration: aarch64 3 of 4': {
       try {
       shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'integration: aarch64 4 of 4': {
       try {
       shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3')
-      }
-    },
-    'topi: aarch64 1 of 2': {
-      try {
-      shard_run_topi_aarch64_1_of_2('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_topi_aarch64_1_of_2('ARM-GRAVITON3')
-      }
-    },
-    'topi: aarch64 2 of 2': {
-      try {
-      shard_run_topi_aarch64_2_of_2('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_topi_aarch64_2_of_2('ARM-GRAVITON3')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     )
diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy
index 647ded7f264f..b54fdf51ca3c 100644
--- a/ci/jenkins/generated/cpu_jenkinsfile.groovy
+++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.787826
+// Generated at 2025-02-15T10:14:10.181874
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the newest build of the job. JOB_NAME may be a folder
+  // path (multibranch), so resolve by full name; BUILD_NUMBER is a String.
+  def lastBuild = Jenkins.instance.getItemByFullName(env.JOB_NAME)?.getLastBuild()
+  return lastBuild != null && lastBuild.getNumber().toString() == env.BUILD_NUMBER
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
@@ -528,11 +535,17 @@ def build() {
     try {
         run_build('CPU-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('CPU')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('CPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
@@ -781,55 +794,85 @@ def test() {
       try {
       shard_run_integration_CPU_1_of_4('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_CPU_1_of_4('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_CPU_1_of_4('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'integration: CPU 2 of 4': {
       try {
       shard_run_integration_CPU_2_of_4('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_CPU_2_of_4('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_CPU_2_of_4('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'integration: CPU 3 of 4': {
       try {
       shard_run_integration_CPU_3_of_4('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_CPU_3_of_4('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_CPU_3_of_4('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'integration: CPU 4 of 4': {
       try {
       shard_run_integration_CPU_4_of_4('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_integration_CPU_4_of_4('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_integration_CPU_4_of_4('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'unittest: CPU 1 of 1': {
       try {
       shard_run_unittest_CPU_1_of_1('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_unittest_CPU_1_of_1('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_unittest_CPU_1_of_1('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     )
diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy
index b7947ab2bf5d..daad2188ff26 100644
--- a/ci/jenkins/generated/docker_jenkinsfile.groovy
+++ b/ci/jenkins/generated/docker_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.715947
+// Generated at 2025-02-15T10:12:52.000152
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the newest build of the job. JOB_NAME may be a folder
+  // path (multibranch), so resolve by full name; BUILD_NUMBER is a String.
+  def lastBuild = Jenkins.instance.getItemByFullName(env.JOB_NAME)?.getLastBuild()
+  return lastBuild != null && lastBuild.getNumber().toString() == env.BUILD_NUMBER
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy b/ci/jenkins/generated/gpu_jenkinsfile.groovy
index 8c169ffd36d1..20f016dcdee3 100644
--- a/ci/jenkins/generated/gpu_jenkinsfile.groovy
+++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.812601
+// Generated at 2025-02-15T12:03:28.800680
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the newest build of the job. JOB_NAME may be a folder
+  // path (multibranch), so resolve by full name; BUILD_NUMBER is a String.
+  def lastBuild = Jenkins.instance.getItemByFullName(env.JOB_NAME)?.getLastBuild()
+  return lastBuild != null && lastBuild.getNumber().toString() == env.BUILD_NUMBER
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
@@ -529,11 +536,17 @@ def build() {
     try {
         run_build('CPU-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('CPU')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('CPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
@@ -541,7 +554,7 @@ build()
 
 
 
-def shard_run_unittest_GPU_1_of_3(node_type) {
+def shard_run_unittest_GPU_1_of_2(node_type) {
   echo 'Begin running on node_type ' + node_type
   if (!skip_ci && is_docs_only_build != 1) {
     node(node_type) {
@@ -553,37 +566,15 @@ def shard_run_unittest_GPU_1_of_3(node_type) {
           withEnv([
             'PLATFORM=gpu',
             'TEST_STEP_NAME=unittest: GPU',
-            'TVM_NUM_SHARDS=3',
+            'TVM_NUM_SHARDS=2',
             'TVM_SHARD_INDEX=0',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu2",
-                  label: 'Download artifacts from S3',
-                )
-
-              sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
-              // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
-              // make_cpp_tests(ci_gpu, 'build')
-              // cpp_unittest(ci_gpu)
-
-              sh "rm -rf build"
-              sh(
                   script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
                   label: 'Download artifacts from S3',
                 )
 
               ci_setup(ci_gpu)
-              sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-              make_cpp_tests(ci_gpu, 'build')
-              cpp_unittest(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} python3 ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --sccache-region us-west-2 --cmake-target opencl-cpptest --build-dir build",
-                label: 'Make OpenCL cpp unit tests',
-              )
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_opencl_cpp_unittest.sh",
-                label: 'Run OpenCL cpp unit tests',
-              )
               sh (
                 script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
                 label: 'Run Python GPU unit tests',
@@ -609,11 +600,11 @@ def shard_run_unittest_GPU_1_of_3(node_type) {
     }
     echo 'End running on node_type ' + node_type
   } else {
-    Utils.markStageSkippedForConditional('unittest: GPU 1 of 3')
+    Utils.markStageSkippedForConditional('unittest: GPU 1 of 2')
   }
 }
 
-def shard_run_unittest_GPU_2_of_3(node_type) {
+def shard_run_unittest_GPU_2_of_2(node_type) {
   echo 'Begin running on node_type ' + node_type
   if (!skip_ci && is_docs_only_build != 1) {
     node(node_type) {
@@ -625,7 +616,7 @@ def shard_run_unittest_GPU_2_of_3(node_type) {
           withEnv([
             'PLATFORM=gpu',
             'TEST_STEP_NAME=unittest: GPU',
-            'TVM_NUM_SHARDS=3',
+            'TVM_NUM_SHARDS=2',
             'TVM_SHARD_INDEX=1',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh(
@@ -663,201 +654,12 @@ def shard_run_unittest_GPU_2_of_3(node_type) {
     }
     echo 'End running on node_type ' + node_type
   } else {
-    Utils.markStageSkippedForConditional('unittest: GPU 2 of 3')
-  }
-}
-
-def shard_run_unittest_GPU_3_of_3(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_gpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=gpu',
-            'TEST_STEP_NAME=unittest: GPU',
-            'TVM_NUM_SHARDS=3',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
-                label: 'Run Python GPU unit tests',
-              )
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
-                label: 'Run Python GPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('unittest: GPU 3 of 3')
+    Utils.markStageSkippedForConditional('unittest: GPU 2 of 2')
   }
 }
 
 
 
-def shard_run_topi_GPU_1_of_3(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_gpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=gpu',
-            'TEST_STEP_NAME=topi: GPU',
-            'TVM_NUM_SHARDS=3',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('topi: GPU 1 of 3')
-  }
-}
-
-def shard_run_topi_GPU_2_of_3(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_gpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=gpu',
-            'TEST_STEP_NAME=topi: GPU',
-            'TVM_NUM_SHARDS=3',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('topi: GPU 2 of 3')
-  }
-}
-
-def shard_run_topi_GPU_3_of_3(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_gpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=gpu',
-            'TEST_STEP_NAME=topi: GPU',
-            'TVM_NUM_SHARDS=3',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('topi: GPU 3 of 3')
-  }
-}
-
-
 
 def shard_run_docs_GPU_1_of_1(node_type) {
   echo 'Begin running on node_type ' + node_type
@@ -922,81 +724,55 @@ def test() {
       SKIP_SLOW_TESTS = "${skip_slow_tests}"
     }
     parallel(
-    'unittest: GPU 1 of 3': {
-      try {
-      shard_run_unittest_GPU_1_of_3('GPU-SPOT')
-      } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_unittest_GPU_1_of_3('GPU')
-      }
-    },
-    'unittest: GPU 2 of 3': {
-      try {
-      shard_run_unittest_GPU_2_of_3('GPU-SPOT')
-      } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_unittest_GPU_2_of_3('GPU')
-      }
-    },
-    'unittest: GPU 3 of 3': {
+    'unittest: GPU 1 of 2': {
       try {
-      shard_run_unittest_GPU_3_of_3('GPU-SPOT')
+      shard_run_unittest_GPU_1_of_2('GPU-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_unittest_GPU_3_of_3('GPU')
-      }
-    },
-    'topi: GPU 1 of 3': {
-      try {
-      shard_run_topi_GPU_1_of_3('GPU-SPOT')
-      } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_topi_GPU_1_of_3('GPU')
-      }
-    },
-    'topi: GPU 2 of 3': {
-      try {
-      shard_run_topi_GPU_2_of_3('GPU-SPOT')
-      } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_topi_GPU_2_of_3('GPU')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_unittest_GPU_1_of_2('GPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
-    'topi: GPU 3 of 3': {
+    'unittest: GPU 2 of 2': {
       try {
-      shard_run_topi_GPU_3_of_3('GPU-SPOT')
+      shard_run_unittest_GPU_2_of_2('GPU-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_topi_GPU_3_of_3('GPU')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_unittest_GPU_2_of_2('GPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'docs: GPU 1 of 1': {
       try {
       shard_run_docs_GPU_1_of_1('GPU-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_docs_GPU_1_of_1('GPU')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_docs_GPU_1_of_1('GPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     )
diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
index 0155a7a843a9..da20f33bbb3d 100644
--- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy
+++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.760323
+// Generated at 2025-02-15T10:14:10.056677
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the job's newest build; BUILD_NUMBER is a String, so cast it before comparing with the int from getNumber()
+  def job = Jenkins.instance.getItem(env.JOB_NAME)
+  def lastBuild = job.getLastBuild()
+  return lastBuild.getNumber() == (env.BUILD_NUMBER as int)
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
@@ -527,11 +534,17 @@ def build() {
     try {
         run_build('CPU-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('CPU')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('CPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
@@ -920,88 +933,136 @@ def test() {
       try {
       shard_run_test_Hexagon_1_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_1_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_1_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 2 of 8': {
       try {
       shard_run_test_Hexagon_2_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_2_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_2_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 3 of 8': {
       try {
       shard_run_test_Hexagon_3_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_3_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_3_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 4 of 8': {
       try {
       shard_run_test_Hexagon_4_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_4_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_4_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 5 of 8': {
       try {
       shard_run_test_Hexagon_5_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_5_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_5_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 6 of 8': {
       try {
       shard_run_test_Hexagon_6_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_6_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_6_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 7 of 8': {
       try {
       shard_run_test_Hexagon_7_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_7_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_7_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'test: Hexagon 8 of 8': {
       try {
       shard_run_test_Hexagon_8_of_8('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_test_Hexagon_8_of_8('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_test_Hexagon_8_of_8('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     )
diff --git a/ci/jenkins/generated/i386_jenkinsfile.groovy b/ci/jenkins/generated/i386_jenkinsfile.groovy
index 565109193695..993f1d3e8323 100644
--- a/ci/jenkins/generated/i386_jenkinsfile.groovy
+++ b/ci/jenkins/generated/i386_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.734278
+// Generated at 2025-02-15T10:14:10.142167
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the job's newest build; BUILD_NUMBER is a String, so cast it before comparing with the int from getNumber()
+  def job = Jenkins.instance.getItem(env.JOB_NAME)
+  def lastBuild = job.getLastBuild()
+  return lastBuild.getNumber() == (env.BUILD_NUMBER as int)
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
@@ -523,11 +530,17 @@ def build() {
     try {
         run_build('CPU-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('CPU')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('CPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
@@ -689,33 +702,51 @@ def test() {
       try {
       shard_run_python_i386_1_of_3('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_python_i386_1_of_3('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_python_i386_1_of_3('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'python: i386 2 of 3': {
       try {
       shard_run_python_i386_2_of_3('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_python_i386_2_of_3('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_python_i386_2_of_3('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     'python: i386 3 of 3': {
       try {
       shard_run_python_i386_3_of_3('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        shard_run_python_i386_3_of_3('CPU-SMALL')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          shard_run_python_i386_3_of_3('CPU-SMALL')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     )
diff --git a/ci/jenkins/generated/lint_jenkinsfile.groovy b/ci/jenkins/generated/lint_jenkinsfile.groovy
index d85f6af857f6..a1750eb853d5 100644
--- a/ci/jenkins/generated/lint_jenkinsfile.groovy
+++ b/ci/jenkins/generated/lint_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.747933
+// Generated at 2025-02-15T10:12:51.981152
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the job's newest build; BUILD_NUMBER is a String, so cast it before comparing with the int from getNumber()
+  def job = Jenkins.instance.getItem(env.JOB_NAME)
+  def lastBuild = job.getLastBuild()
+  return lastBuild.getNumber() == (env.BUILD_NUMBER as int)
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
diff --git a/ci/jenkins/generated/wasm_jenkinsfile.groovy b/ci/jenkins/generated/wasm_jenkinsfile.groovy
index a0cb13d55849..407f4c80049d 100644
--- a/ci/jenkins/generated/wasm_jenkinsfile.groovy
+++ b/ci/jenkins/generated/wasm_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-09T12:21:01.801132
+// Generated at 2025-02-15T10:14:10.202706
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -279,6 +279,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True when this run is the job's newest build; BUILD_NUMBER is a String, so cast it before comparing with the int from getNumber()
+  def job = Jenkins.instance.getItem(env.JOB_NAME)
+  def lastBuild = job.getLastBuild()
+  return lastBuild.getNumber() == (env.BUILD_NUMBER as int)
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
@@ -525,11 +532,17 @@ def build() {
     try {
         run_build('CPU-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('CPU')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('CPU')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
diff --git a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
index f8695091058d..aa999408a7e2 100644
--- a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
@@ -53,27 +53,5 @@
   )
 {% endcall %}
 
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="topi: aarch64",
-  ws="tvm/ut-python-arm",
-  platform="arm",
-  docker_image="ci_arm",
-  num_shards=2,
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='arm') }}
-  ci_setup(ci_arm)
-  {% if shard_index == 1 %}
-  cpp_unittest(ci_arm)
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-    label: 'Run test_arm_compute_lib test',
-  )
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-    label: 'Run TOPI tests',
-  )
-{% endcall %}
 
 {{ m.invoke_tests(node="ARM-GRAVITON3", test_method_names=test_method_names) -}}
diff --git a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2
index e57e8c043303..2769ae2c5d87 100644
--- a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2
@@ -40,37 +40,14 @@
 
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="unittest: GPU",
-  num_shards=3,
+  num_shards=2,
   ws="tvm/ut-python-gpu",
   platform="gpu",
   docker_image="ci_gpu",
   test_method_names=test_method_names,
 ) %}
-  {% if shard_index == 1 %}
-  {{ m.download_artifacts(tag='gpu2') }}
-  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
-  // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
-  // make_cpp_tests(ci_gpu, 'build')
-  // cpp_unittest(ci_gpu)
-
-  sh "rm -rf build"
   {{ m.download_artifacts(tag='gpu') }}
   ci_setup(ci_gpu)
-  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-  make_cpp_tests(ci_gpu, 'build')
-  cpp_unittest(ci_gpu)
-  sh (
-    script: "${docker_run} ${ci_gpu} python3 ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --sccache-region us-west-2 --cmake-target opencl-cpptest --build-dir build",
-    label: 'Make OpenCL cpp unit tests',
-  )
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_opencl_cpp_unittest.sh",
-    label: 'Run OpenCL cpp unit tests',
-  )
-  {% else %}
-  {{ m.download_artifacts(tag='gpu') }}
-  ci_setup(ci_gpu)
-  {% endif %}
   {% if shard_index == 2 or num_shards < 2 %}
   sh (
     script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
@@ -87,21 +64,6 @@
   )
 {% endcall %}
 
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="topi: GPU",
-  num_shards=3,
-  ws="tvm/topi-python-gpu",
-  platform="gpu",
-  docker_image="ci_gpu",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='gpu') }}
-  ci_setup(ci_gpu)
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-    label: 'Run TOPI tests',
-  )
-{% endcall %}
 
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="docs: GPU",
diff --git a/ci/jenkins/templates/utils/Prepare.groovy.j2 b/ci/jenkins/templates/utils/Prepare.groovy.j2
index 3f27b98861f6..68e6569d4e46 100644
--- a/ci/jenkins/templates/utils/Prepare.groovy.j2
+++ b/ci/jenkins/templates/utils/Prepare.groovy.j2
@@ -155,6 +155,13 @@ def cancel_previous_build() {
   }
 }
 
+def is_last_build() {
+  // True iff this run is the job's newest build; JOB_NAME may be a nested full name (folder/branch).
+  def job = Jenkins.instance.getItemByFullName(env.JOB_NAME)
+  def lastBuild = job.getLastBuild()
+  return lastBuild.getNumber() == env.BUILD_NUMBER.toInteger()
+}
+
 def checkout_trusted_files() {
   // trust everything from branch builds
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
diff --git a/ci/jenkins/templates/utils/macros.j2 b/ci/jenkins/templates/utils/macros.j2
index 81eeaa1fdff8..ee90b043ccbc 100644
--- a/ci/jenkins/templates/utils/macros.j2
+++ b/ci/jenkins/templates/utils/macros.j2
@@ -96,11 +96,17 @@ def build() {
     try {
         run_build('{{ node }}-SPOT')
     } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        run_build('{{ node }}')
+        if (is_last_build()) {
+          // retry if we are currently at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          run_build('{{ node }}')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
     }
   }
 }
@@ -119,11 +125,17 @@ def test() {
       try {
       {{ method_name }}('{{ node }}-SPOT')
       } catch (Throwable ex) {
-        // mark the current stage as success
-        // and try again via on demand node
-        echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-        currentBuild.result = 'SUCCESS'
-        {{ method_name }}('{{ node }}')
+        if (is_last_build()) {
+          // retry if at last build
+          // mark the current stage as success
+          // and try again via on demand node
+          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
+          currentBuild.result = 'SUCCESS'
+          {{ method_name }}('{{ node }}')
+        } else {
+          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
+          throw ex
+        }
       }
     },
     {% endfor %}
diff --git a/docs/arch/introduction_to_module_serialization.rst b/docs/arch/introduction_to_module_serialization.rst
index 6b2f2addaf9a..0737c6648534 100644
--- a/docs/arch/introduction_to_module_serialization.rst
+++ b/docs/arch/introduction_to_module_serialization.rst
@@ -22,40 +22,6 @@ When to deploy TVM runtime module, no matter whether it is CPU or GPU, TVM only
 shared library. The key is our unified module serialization mechanism. This document will introduce TVM module
 serialization format standard and implementation details.
 
-*********************
-Module Export Example
-*********************
-
-Let us build one ResNet-18 workload for GPU as an example first.
-
-.. code:: python
-
-   from tvm import relay
-   from tvm.relay import testing
-   from tvm.contrib import utils
-   import tvm
-
-   # Resnet18 workload
-   resnet18_mod, resnet18_params = relay.testing.resnet.get_workload(num_layers=18)
-
-   # build
-   with relay.build_config(opt_level=3):
-       _, resnet18_lib, _ = relay.build_module.build(resnet18_mod, "cuda", params=resnet18_params)
-
-   # create one tempory directory
-   temp = utils.tempdir()
-
-   # path lib
-   file_name = "deploy.so"
-   path_lib = temp.relpath(file_name)
-
-   # export library
-   resnet18_lib.export_library(path_lib)
-
-   # load it back
-   loaded_lib = tvm.runtime.load_module(path_lib)
-   assert loaded_lib.type_key == "library"
-   assert loaded_lib.imported_modules[0].type_key == "cuda"
 
 *************
 Serialization
diff --git a/docs/conf.py b/docs/conf.py
index cffb616f3b22..6f9567c9b148 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -487,7 +487,6 @@ def force_gc(gallery_conf, fname):
 tvm_alias_check_map = {
     "tvm.te": ["tvm.tir"],
     "tvm.tir": ["tvm.ir", "tvm.runtime"],
-    "tvm.relay": ["tvm.ir", "tvm.tir"],
 }
 
 ## Setup header and other configs
diff --git a/docs/reference/api/python/auto_scheduler.rst b/docs/reference/api/python/auto_scheduler.rst
deleted file mode 100644
index 8fa182307352..000000000000
--- a/docs/reference/api/python/auto_scheduler.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.auto_scheduler
-------------------
-.. automodule:: tvm.auto_scheduler
-   :members:
-   :imported-members:
-   :autosummary:
diff --git a/docs/reference/api/python/autotvm.rst b/docs/reference/api/python/autotvm.rst
deleted file mode 100644
index 5bde9ac47962..000000000000
--- a/docs/reference/api/python/autotvm.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.autotvm
------------
-.. automodule:: tvm.autotvm
-.. autofunction:: tvm.autotvm.apply_history_best
-
-tvm.autotvm.measure
-~~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.autotvm.measure.measure
-
-.. autoclass:: tvm.autotvm.measure.MeasureInput
-    :members:
-
-.. autoclass:: tvm.autotvm.measure.MeasureResult
-    :members:
-
-.. autofunction:: tvm.autotvm.measure.measure_option
-
-.. autofunction:: tvm.autotvm.measure.create_measure_batch
-
-.. autoclass:: tvm.autotvm.measure.measure_methods.LocalBuilder
-
-.. autoclass:: tvm.autotvm.measure.measure_methods.RPCRunner
-
-.. autoclass:: tvm.autotvm.measure.measure_methods.LocalRunner
-
-tvm.autotvm.tuner
-~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.autotvm.tuner
-    :members:
-
-.. autoclass:: tvm.autotvm.tuner.Tuner
-    :members:
-
-.. autoclass:: tvm.autotvm.tuner.RandomTuner
-    :members:
-    :inherited-members:
-
-.. autoclass:: tvm.autotvm.tuner.GridSearchTuner
-    :members:
-    :inherited-members:
-
-.. autoclass:: tvm.autotvm.tuner.GATuner
-    :members:
-    :inherited-members:
-
-.. autoclass:: tvm.autotvm.tuner.XGBTuner
-    :members:
-    :inherited-members:
-
-.. automodule:: tvm.autotvm.tuner.callback
-    :members:
-
-tvm.autotvm.task
-~~~~~~~~~~~~~~~~
-.. automodule:: tvm.autotvm.task
-    :members:
-
-.. automodule:: tvm.autotvm.task.task
-    :members:
-
-.. automodule:: tvm.autotvm.task.space
-    :members:
-
-.. automodule:: tvm.autotvm.task.dispatcher
-    :members:
-
-.. automodule:: tvm.autotvm.task.topi_integration
-    :members:
-
-tvm.autotvm.record
-~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.autotvm.record
-    :members:
diff --git a/docs/reference/api/python/graph_executor.rst b/docs/reference/api/python/graph_executor.rst
deleted file mode 100644
index 1af93e88458d..000000000000
--- a/docs/reference/api/python/graph_executor.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.contrib.graph_executor
---------------------------
-.. automodule:: tvm.contrib.graph_executor
-    :members:
diff --git a/docs/reference/api/python/index.rst b/docs/reference/api/python/index.rst
index c4082354a8de..2e0a9c7f2fb9 100644
--- a/docs/reference/api/python/index.rst
+++ b/docs/reference/api/python/index.rst
@@ -85,21 +85,3 @@ Python API
 
     rpc
     contrib
-
-.. toctree::
-    :maxdepth: 1
-    :caption: Legacy
-
-    relay/index
-    relay/frontend
-    relay/nn
-    relay/vision
-    relay/image
-    relay/transform
-    relay/analysis
-    relay/backend
-    relay/dataflow_pattern
-    relay/testing
-    autotvm
-    auto_scheduler
-    graph_executor
diff --git a/docs/reference/api/python/relay/analysis.rst b/docs/reference/api/python/relay/analysis.rst
deleted file mode 100644
index ee2ae2fe1e7b..000000000000
--- a/docs/reference/api/python/relay/analysis.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay.analysis
-----------------------
-
-.. automodule:: tvm.relay.analysis
-    :members:
-    :imported-members:
-    :exclude-members: Object, RelayExpr, IRModule, Type
-    :autosummary:
diff --git a/docs/reference/api/python/relay/backend.rst b/docs/reference/api/python/relay/backend.rst
deleted file mode 100644
index e717ee10ffab..000000000000
--- a/docs/reference/api/python/relay/backend.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay.backend
------------------
-
-.. automodule:: tvm.relay.backend
-
-.. automodule:: tvm.relay.backend.interpreter
-    :members:
-
-.. automodule:: tvm.relay.backend.te_compiler
-    :members:
-
-.. automodule:: tvm.relay.backend.graph_executor_codegen
-    :members:
-
-.. automodule:: tvm.relay.backend.vm
-    :members:
diff --git a/docs/reference/api/python/relay/dataflow_pattern.rst b/docs/reference/api/python/relay/dataflow_pattern.rst
deleted file mode 100644
index fe1d4e95e507..000000000000
--- a/docs/reference/api/python/relay/dataflow_pattern.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay.dataflow_pattern
---------------------------
-
-.. automodule:: tvm.relay.dataflow_pattern
-    :members:
-    :imported-members:
-    :exclude-members: Object, Node
-    :autosummary:
diff --git a/docs/reference/api/python/relay/frontend.rst b/docs/reference/api/python/relay/frontend.rst
deleted file mode 100644
index a62e36e7b7b8..000000000000
--- a/docs/reference/api/python/relay/frontend.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-
-tvm.relay.frontend
-------------------
-
-.. automodule:: tvm.relay.frontend
-    :members:
-    :imported-members:
-    :autosummary:
diff --git a/docs/reference/api/python/relay/image.rst b/docs/reference/api/python/relay/image.rst
deleted file mode 100644
index 3e5162718441..000000000000
--- a/docs/reference/api/python/relay/image.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-
-tvm.relay.image
----------------
-
-.. automodule:: tvm.relay.image
-   :members:
-   :imported-members:
-   :exclude-members: Expr, Constant
-   :autosummary:
diff --git a/docs/reference/api/python/relay/index.rst b/docs/reference/api/python/relay/index.rst
deleted file mode 100644
index 399bba3e5b11..000000000000
--- a/docs/reference/api/python/relay/index.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay
----------
-
-.. automodule:: tvm.relay
-    :members:
-    :imported-members:
-    :exclude-members: RelayExpr,
-      Type, TypeKind,
-      TypeVar, GlobalTypeVar, TypeConstraint, FuncType, TupleType, IncompleteType,
-      TypeCall, TypeRelation, TensorType, RelayRefType, GlobalVar, SourceName,
-      Span, Var, Op, Constructor
-    :noindex: TypeData
-    :autosummary:
diff --git a/docs/reference/api/python/relay/nn.rst b/docs/reference/api/python/relay/nn.rst
deleted file mode 100644
index b54b752ba115..000000000000
--- a/docs/reference/api/python/relay/nn.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay.nn
-------------
-.. automodule:: tvm.relay.nn
-    :members:
-    :imported-members:
-    :autosummary:
diff --git a/docs/reference/api/python/relay/testing.rst b/docs/reference/api/python/relay/testing.rst
deleted file mode 100644
index 1d0bfe08e183..000000000000
--- a/docs/reference/api/python/relay/testing.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay.testing
------------------
-
-.. automodule:: tvm.relay.testing
-    :members:
-    :imported-members:
-    :autosummary:
-
-.. automodule:: tvm.relay.testing.mlp
-    :members:
-
-.. automodule:: tvm.relay.testing.resnet
-    :members:
-
-.. automodule:: tvm.relay.testing.dcgan
-    :members:
-
-.. automodule:: tvm.relay.testing.mobilenet
-    :members:
-
-.. automodule:: tvm.relay.testing.lstm
-    :members:
-
-.. automodule:: tvm.relay.testing.inception_v3
-    :members:
-
-.. automodule:: tvm.relay.testing.squeezenet
-    :members:
-
-.. automodule:: tvm.relay.testing.vgg
-    :members:
-
-.. automodule:: tvm.relay.testing.densenet
-    :members:
diff --git a/docs/reference/api/python/relay/transform.rst b/docs/reference/api/python/relay/transform.rst
deleted file mode 100644
index 4a8747606eb2..000000000000
--- a/docs/reference/api/python/relay/transform.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.relay.transform
-----------------------
-
-.. automodule:: tvm.relay.transform
-    :members:
-    :imported-members:
-    :autosummary:
-    :exclude-members: FunctionPass
diff --git a/docs/reference/api/python/relay/vision.rst b/docs/reference/api/python/relay/vision.rst
deleted file mode 100644
index f6bedee8ce01..000000000000
--- a/docs/reference/api/python/relay/vision.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-
-tvm.relay.vision
-----------------
-
-.. automodule:: tvm.relay.vision
-    :members:
-    :imported-members:
-    :autosummary:
diff --git a/docs/reference/api/python/topi.rst b/docs/reference/api/python/topi.rst
index 0528844d682a..ce44e07dedf4 100644
--- a/docs/reference/api/python/topi.rst
+++ b/docs/reference/api/python/topi.rst
@@ -37,11 +37,3 @@ tvm.topi.image
    :members:
    :imported-members:
    :autosummary:
-
-
-tvm.topi.sparse
-~~~~~~~~~~~~~~~
-.. automodule:: tvm.topi.sparse
-   :members:
-   :imported-members:
-   :autosummary:
diff --git a/golang/sample/gen_mobilenet_lib.py b/golang/sample/gen_mobilenet_lib.py
deleted file mode 100644
index 12f215b4fd9c..000000000000
--- a/golang/sample/gen_mobilenet_lib.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-from tvm import relay, transform, runtime
-from tvm.contrib.download import download_testdata
-
-
-################################################
-# Utils for downloading and extracting zip files
-# ----------------------------------------------
-def extract(path):
-    import tarfile
-
-    if path.endswith("tgz") or path.endswith("gz"):
-        dir_path = os.path.dirname(path)
-        tar = tarfile.open(path)
-        tar.extractall(path=dir_path)
-        tar.close()
-    else:
-        raise RuntimeError("Could not decompress the file: " + path)
-
-
-###################################
-# Download TFLite pre-trained model
-# ---------------------------------
-
-model_url = "https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.4_224.tgz"
-model_path = download_testdata(model_url, "mobilenet_v2_1.4_224.tgz", module=["tf", "official"])
-model_dir = os.path.dirname(model_path)
-extract(model_path)
-
-# now we have mobilenet_v2_1.4_224.tflite on disk
-model_file = os.path.join(model_dir, "mobilenet_v2_1.4_224.tflite")
-
-# get TFLite model from buffer
-tflite_model_buf = open(model_file, "rb").read()
-try:
-    import tflite
-
-    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
-except AttributeError:
-    import tflite.Model
-
-    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-
-
-##############################
-# Load Neural Network in Relay
-# ----------------------------
-
-# TFLite input tensor name, shape and type
-input_tensor = "input"
-input_shape = (1, 224, 224, 3)
-input_dtype = "float32"
-
-# parse TFLite model and convert into Relay computation graph
-mod, params = relay.frontend.from_tflite(
-    tflite_model, shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype}
-)
-
-#############
-# Compilation
-# -----------
-
-target = "llvm"
-
-# Build with Relay
-with transform.PassContext(opt_level=3):
-    graph, lib, params = relay.build_module.build(mod, target, params=params)
-
-###############################################
-# Save the graph, lib and parameters into files
-# ---------------------------------------------
-
-lib.export_library("./mobilenet.so")
-print("lib export succeefully")
-
-with open("./mobilenet.json", "w") as fo:
-    fo.write(graph)
-
-with open("./mobilenet.params", "wb") as fo:
-    fo.write(runtime.save_param_dict(params))
diff --git a/jvm/core/src/test/scripts/test_graph_executor.py b/jvm/core/src/test/scripts/test_graph_executor.py
deleted file mode 100644
index 676b008205ca..000000000000
--- a/jvm/core/src/test/scripts/test_graph_executor.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-
-import tvm
-from tvm import te
-import json
-from tvm.contrib import graph_executor
-
-
-def dump_graph_lib(target_dir):
-    dim = 4
-    A = te.placeholder((dim,), name="A")
-    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    sched = te.create_schedule(B.op)
-
-    node0 = {"op": "null", "name": "x", "inputs": []}
-    node1 = {
-        "op": "tvm_op",
-        "name": "add",
-        "inputs": [[0, 0, 0]],
-        "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"},
-    }
-    nodes = [node0, node1]
-    arg_nodes = [0]
-    node_row_ptr = [0, 1, 2]
-    outputs = [[1, 0, 0]]
-    shape = (4,)
-    attrs = {
-        "shape": ["list_shape", [shape, shape]],
-        "dltype": ["list_str", ["float32", "float32"]],
-        "storage_id": ["list_int", [0, 1]],
-    }
-    graph = {
-        "nodes": nodes,
-        "arg_nodes": arg_nodes,
-        "node_row_ptr": node_row_ptr,
-        "heads": outputs,
-        "attrs": attrs,
-    }
-
-    graph = json.dumps(graph)
-    mlib = tvm.build(sched, [A, B], "llvm", name="myadd")
-
-    mlib.export_library(os.path.join(target_dir, "graph_addone_lib.so"))
-    with open(os.path.join(target_dir, "graph_addone.json"), "w") as fo:
-        fo.write(graph)
-
-
-if __name__ == "__main__":
-    import sys
-
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    dump_graph_lib(sys.argv[1])
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 1d04772e9e3e..ab11f33cc035 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -63,9 +63,6 @@
 # tvm.driver
 from .driver import build, lower
 
-# tvm.parser
-from . import parser
-
 # others
 from . import arith
 
@@ -78,7 +75,6 @@
 # Relay and Relax contain modules that are only available in compiler package
 # Do not import them if TVM is built with runtime only
 if not _RUNTIME_ONLY:
-    from . import relay
     from . import relax
 
 # NOTE: This file should be python2 compatible so we can
diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py
deleted file mode 100644
index 97ac323662bb..000000000000
--- a/python/tvm/auto_scheduler/__init__.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import, redefined-builtin
-""" Namespace for TVM Auto-scheduler. """
-
-from . import (
-    compute_dag,
-    dispatcher,
-    feature,
-    loop_state,
-    measure,
-    measure_record,
-    relay_integration,
-    search_policy,
-    search_task,
-    task_scheduler,
-    utils,
-    workload_registry,
-)
-
-# Shortcut
-from .compute_dag import (
-    ComputeDAG,
-    LayoutRewriteOption,
-    get_shape_from_rewritten_layout,
-)
-from .cost_model import RandomModel, XGBModel
-from .dispatcher import ApplyHistoryBest, ApplyHistoryBestOrSample, DispatchContext
-from .measure import (
-    LocalBuilder,
-    LocalRPCMeasureContext,
-    LocalRunner,
-    MeasureInput,
-    MeasureResult,
-    RPCRunner,
-    register_task_input_check_func,
-)
-from .measure_record import (
-    RecordReader,
-    RecordToFile,
-    load_best_record,
-    load_records,
-    save_records,
-)
-from .relay_integration import (
-    extract_tasks,
-    is_auto_scheduler_enabled,
-    remove_index_check,
-    rewrite_compute_body,
-    rewrite_tensor_shape,
-)
-from .search_policy import (
-    EmptyPolicy,
-    PreloadCustomSketchRule,
-    PreloadMeasuredStates,
-    SketchPolicy,
-)
-from .search_task import (
-    HardwareParams,
-    SearchTask,
-    TuningOptions,
-    auto_schedule,
-    create_task,
-)
-from .task_scheduler import TaskScheduler
-from .workload_registry import make_workload_key, register_workload
diff --git a/python/tvm/auto_scheduler/_ffi_api.py b/python/tvm/auto_scheduler/_ffi_api.py
deleted file mode 100644
index d7b874f71e0f..000000000000
--- a/python/tvm/auto_scheduler/_ffi_api.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Register FFI APIs from C++ for the namespace tvm.auto_scheduler. """
-import tvm._ffi
-
-
-tvm._ffi._init_api("auto_scheduler", __name__)
diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py
deleted file mode 100644
index c212d143f987..000000000000
--- a/python/tvm/auto_scheduler/compute_dag.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-""" The auto-scheduler's computational graph and related program analyses. """
-
-import hashlib
-import json
-
-import tvm._ffi
-from tvm.runtime import Object
-from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON
-
-from . import _ffi_api
-from .loop_state import State, StateObject
-from .utils import get_const_tuple
-from .workload_registry import workload_key_to_tensors
-
-
-class LayoutRewriteOption:
-    """
-    Options for applying layout rewrite.
-
-    The NO_REWRITE and INSERT_TRANSFORM_STAGE are expected to be used when tuning a standalone op,
-    and the REWRITE_FOR_PRE_TRANSFORMED is expected to be used when tuning ops inside a network.
-    """
-
-    # Do not perform layout rewrite
-    NO_REWRITE = 0
-    # Insert layout transformation stages for input placeholders in the compute DAG
-    INSERT_TRANSFORM_STAGE = 1
-    # Do not insert layout transformation stages and assume the input placeholders
-    # are pre-transformed.
-    # Note: The lowered function with this option does not accept the origial input shapes,
-    # so this option must be used along with `AutoSchedulerLayoutRewrite` pass in Relay.
-    REWRITE_FOR_PRE_TRANSFORMED = 2
-
-    @staticmethod
-    def get_target_default(target, in_relay_integration=False):
-        """Get the default layout rewrite option for the specified target.
-        Currently we only enable layout rewrite for cpu / mali backend for now
-
-        Parameters
-        ----------
-        target: tvm.target.Target
-            The compilation target.
-        in_relay_integration: bool
-            If this check is ask for relay integration.
-
-        Returns
-        -------
-        layout_rewrite_option: LayoutRewriteOption
-            The default layout rewrite option for the specified target.
-        """
-        layout_rewrite_option = LayoutRewriteOption.NO_REWRITE
-        if target.kind.name == "llvm" or (
-            "device" in target.attrs and target.attrs["device"] == "mali"
-        ):
-            layout_rewrite_option = (
-                LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED
-                if in_relay_integration
-                else LayoutRewriteOption.INSERT_TRANSFORM_STAGE
-            )
-
-        return layout_rewrite_option
-
-
-@tvm._ffi.register_object("auto_scheduler.ComputeDAG")
-class ComputeDAG(Object):
-    """
-    The auto-scheduler's computational graph and related program analyses.
-
-    We convert a compute declaration described by `tvm.compute` (could be a single operator or a
-    subgraph) to a ComputeDAG. It keeps the input/output tensors, all operations in the DAG, and
-    some static analysis results for the DAG (e.g. the total float operation count,
-    consumer/producer relations of operations, whether an operation stage should
-    be tiled/compute inlined).
-    These analyses can help the search policy to make decisions during the search.
-    ComputeDAG is also responsible for the interaction between auto-scheduler's `LoopState` and
-    TVM schedule (e.g. applying the `LoopState` transform steps to a TVM schedule, providing
-    `LoopState` with extra information got from TVM schedule).
-
-    Parameters
-    ----------
-    compute : Union[List[Tensor], str, tvm.te.Schedule]
-        Input/output tensors or workload key for a compute declaration.
-    """
-
-    def __init__(self, compute_or_sche):
-        if isinstance(compute_or_sche, str):
-            compute = workload_key_to_tensors(compute_or_sche)
-            sche = None
-        elif isinstance(compute_or_sche, (list, tvm.ir.container.Array)):
-            for item in compute_or_sche:
-                if not isinstance(item, tvm.te.Tensor):
-                    raise ValueError(
-                        "The input of ComputeDAG should be a list of Tensor, but got %s"
-                        % type(item)
-                    )
-            compute = compute_or_sche
-            sche = None
-        elif isinstance(compute_or_sche, tvm.te.Schedule):
-            compute = None
-            sche = compute_or_sche
-        else:
-            raise ValueError(
-                "Invalid compute type: %s. ComputeDAG expects string, list of Tensor, or Schedule"
-                % type(compute_or_sche)
-            )
-        self.__init_handle_by_constructor__(_ffi_api.ComputeDAG, compute, sche)
-
-    def get_init_state(self):
-        """Get the init state of this ComputeDAG.
-
-        Returns
-        -------
-        state : State
-            The initial State without any transform steps.
-        """
-        return State(self.init_state, self)
-
-    def apply_steps_from_state(self, state, layout_rewrite=LayoutRewriteOption.NO_REWRITE):
-        """
-        Apply the history transform steps from a State to get a TVM schedule.
-
-        Parameters
-        ----------
-        state : Union[State, StateObject]
-            The state from which we get transform steps.
-
-        layout_rewrite: LayoutRewriteOption = NoRewrite
-            Rewrite the layout of placeholders specified by "layout_free_placeholders" attr
-            to make it most friendly for the generated schedule to read from.
-
-        Returns
-        -------
-            A `te.schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
-        """
-        state_obj = state if isinstance(state, StateObject) else state.state_object
-        return _ffi_api.ComputeDAGApplyStepsFromState(self, state_obj, layout_rewrite)
-
-    def print_python_code_from_state(self, state):
-        """
-        Print transform steps in the history of a State as TVM's python schedule code.
-
-        This is used to print transformation steps for debugging.
-        Use `apply_steps_from_state` if you want to get a schedule for code generation.
-
-        Parameters
-        ----------
-        state : Union[State, StateObject]
-            The state from which we get transform steps.
-
-        Returns
-        -------
-        str : Str
-            The Python schedule code.
-        """
-        state_obj = state if isinstance(state, StateObject) else state.state_object
-        return _ffi_api.ComputeDAGPrintPythonCodeFromState(self, state_obj)
-
-    def infer_bound_from_state(self, state):
-        """
-        Infer and fill the bound of all iterators of a state.
-
-        The states may lose complete bound information after some transform steps
-        (e.g., compute_at).
-        We can call this function to infer and fill all the bound information.
-        This function calls TVM InferBound pass internally to get the bound.
-        The returned state of this function is guaranteed to have complete iterator extent
-        information.
-
-        Parameters
-        ----------
-        state : Union[State, StateObject]
-            The state from which we get transform steps.
-
-        Returns
-        -------
-        updated_state : State
-            The State with complete bound information.
-        """
-        state_obj = state if isinstance(state, StateObject) else state.state_object
-        updated_state = State(_ffi_api.ComputeDAGInferBoundFromState(self, state_obj), self)
-        # Copy the stage_id_map from the original state to make sure the old indices are still
-        # valid
-        if isinstance(state, State):
-            for k, v in state.stage_id_map.items():
-                updated_state.stage_id_map[k] = v
-        return updated_state
-
-    def rewrite_layout_from_state(self, state):
-        """
-        Rewrite the layout of the DAG according to the history transform steps of a state.
-
-        Parameters
-        ----------
-        state : Union[State, StateObject]
-            The state from which we get transform steps.
-
-        Returns
-        -------
-        updated_dag : ComputeDAG
-            The compute dag with rewritten layout.
-        """
-        state_obj = state if isinstance(state, StateObject) else state.state_object
-        return _ffi_api.ComputeDAGRewriteLayoutFromState(self, state_obj)
-
-    def workload_key(self):
-        """Return the workload key of this compute DAG.
-        The workload key is a JSON string from a tuple of (hash of DAG, tensor shapes...)
-
-        Returns
-        -------
-        key: str
-            The workload key of this compute DAG
-        """
-        str_dag = _ffi_api.ComputeDAGPrintDAG(self, True)
-        hash_func = tvm._ffi.get_global_func(
-            "auto_scheduler.compute_dag.hash_func", allow_missing=True
-        )
-
-        if hash_func is None:
-            str_dag = str_dag.encode("utf-8")
-            hash_key = hashlib.md5(str_dag).hexdigest()
-        else:
-            hash_key = hash_func(str_dag)
-
-        io_shapes = []
-        for tensor in self.tensors:
-            io_shapes.append(get_const_tuple(tensor.shape))
-        return json.dumps([hash_key] + io_shapes)
-
-    def __str__(self):
-        # pretty print
-        MAX_LINE_WIDTH = 256
-
-        raw_lines = super().__str__().split("\n")
-        lines = []
-        for line in raw_lines:
-            if len(line) > MAX_LINE_WIDTH:
-                line = (
-                    line[: MAX_LINE_WIDTH // 2] + " ..(OMITTED).. " + line[-MAX_LINE_WIDTH // 2 :]
-                )
-            lines.append(line)
-        return "\n".join(lines)
-
-    def __getstate__(self):
-        return {"tensors": SaveJSON(self.tensors)}
-
-    def __setstate__(self, state):
-        # Since we always use tensors to recover the ComputeDAG, we do not support
-        # (de)serialization of the ComputeDAG constructed by a schedule.
-        self.__init_handle_by_constructor__(_ffi_api.ComputeDAG, LoadJSON(state["tensors"]), None)
-
-
-def get_shape_from_rewritten_layout(rewritten_layout, axis_names):
-    """Get the orginal shape from a rewritten layout string.
-
-    Parameters
-    ----------
-    rewritten_layout: str
-        The layout after rewrite
-    axis_names: List[str]
-        Specify the order of axes by names
-
-    Returns
-    -------
-    shape: List[PrimExpr]
-        The original shape
-    """
-    return _ffi_api.GetShapeFromRewrittenLayout(rewritten_layout, axis_names)
diff --git a/python/tvm/auto_scheduler/cost_model/__init__.py b/python/tvm/auto_scheduler/cost_model/__init__.py
deleted file mode 100644
index 56e4a5f9128b..000000000000
--- a/python/tvm/auto_scheduler/cost_model/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import, redefined-builtin
-""" Cost model that estimates the performance of programs """
-
-from .cost_model import RandomModel
-from .xgb_model import XGBModel
diff --git a/python/tvm/auto_scheduler/cost_model/cost_model.py b/python/tvm/auto_scheduler/cost_model/cost_model.py
deleted file mode 100644
index 9ef4bcac7a99..000000000000
--- a/python/tvm/auto_scheduler/cost_model/cost_model.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Cost models that estimate the performance of programs """
-import ctypes
-import numpy as np
-
-import tvm._ffi
-from tvm.runtime import Object
-from .. import _ffi_api
-
-
-@tvm._ffi.register_object("auto_scheduler.CostModel")
-class CostModel(Object):
-    """The base class for cost model"""
-
-
-@tvm._ffi.register_object("auto_scheduler.RandomModel")
-class RandomModel(CostModel):
-    """A model that returns random estimation for all inputs"""
-
-    def __init__(self):
-        self.__init_handle_by_constructor__(_ffi_api.RandomModel)
-
-    def update(self, inputs, results):
-        """Update the cost model according to new measurement results (training data).
-
-        Parameters
-        ----------
-        inputs : List[auto_scheduler.measure.MeasureInput]
-            The measurement inputs
-        results : List[auto_scheduler.measure.MeasureResult]
-            The measurement results
-        """
-        _ffi_api.CostModelUpdate(self, inputs, results)
-
-    def predict(self, search_task, states):
-        """Predict the scores of states
-
-        Parameters
-        ----------
-        search_task : SearchTask
-            The search task of states
-        states : List[State]
-            The input states
-
-        Returns
-        -------
-        scores: List[float]
-            The predicted scores for all states
-        """
-        return [x.value for x in _ffi_api.CostModelPredict(self, search_task, states)]
-
-
-@tvm._ffi.register_func("auto_scheduler.cost_model.random_fill_float")
-def random_fill_float(size, return_ptr):
-    """Fills a c++ float array with random numbers in [0, 1]
-
-    Parameters
-    ----------
-    size: int
-        The size of the array
-    return_ptr:
-        A pointer to a c++ float array
-    """
-    if size == 0:
-        return
-    return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float))
-    array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(size,))
-    array_wrapper[:] = np.random.uniform(0, 1, (size,))
-
-
-@tvm._ffi.register_object("auto_scheduler.PythonBasedModel")
-class PythonBasedModel(CostModel):
-    """Base class for cost models implemented in python"""
-
-    def __init__(self):
-        def update_func(inputs, results):
-            self.update(inputs, results)
-
-        def predict_func(task, states, return_ptr):
-            return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float))
-            array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(len(states),))
-            array_wrapper[:] = self.predict(task, states)
-
-        def predict_stage_func(task, states, return_ptr):
-            ret = self.predict_stages(task, states)
-            return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float))
-            array_wrapper = np.ctypeslib.as_array(return_ptr, shape=ret.shape)
-            array_wrapper[:] = ret
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.PythonBasedModel, update_func, predict_func, predict_stage_func
-        )
-
-    def update(self, inputs, results):
-        """Update the cost model according to new measurement results (training data).
-
-        Parameters
-        ----------
-        inputs : List[auto_scheduler.measure.MeasureInput]
-            The measurement inputs
-        results : List[auto_scheduler.measure.MeasureResult]
-            The measurement results
-        """
-        raise NotImplementedError
-
-    def predict(self, task, states):
-        """Predict the scores of states
-
-        Parameters
-        ----------
-        search_task : SearchTask
-            The search task of states
-        states : List[State]
-            The input states
-
-        Returns
-        -------
-        scores: List[float]
-            The predicted scores for all states
-        """
-        raise NotImplementedError
-
-    def predict_stages(self, task, states):
-        """Predict the scores of all stages in states. This is the breakdown version of `predict`.
-
-        Parameters
-        ----------
-        search_task : SearchTask
-            The search task of states
-        states : List[State]
-            The input states
-
-        Returns
-        -------
-        scores: List[float]
-            The predicted scores for all stages in all states in the packed format
-
-        Note
-        ----
-        For faster data copy between c++ and python, the python part returns scores in a
-        single flatten array using a packed format. The c++ part then unpacks the flatten array.
-
-        The packed format is:
-        {
-          float  scores[N];                 // scores[i] is the score for states[i].
-          int    n_stage_0;                 // the number of stages in states[0]
-          float  stage_scores_0[[n_stage_0] // the scores for all stages in states[0]
-          int    n_stage_1;                 // the number of stages in states[1]
-          float  stage_scores_1[n_stage_1]; // the scores for all stages in states[1]
-          ...
-          int    n_stage_i;                 // the number of stages in states[i]
-          float  stage_scores_1[n_stage_i]; // the scores for all stages in states[i]
-          ...  // until i == N - 1
-        }
-        To implement this format, we also store int as float, so we can store all numbers
-        into a single float array.
-        """
-        raise NotImplementedError
diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py
deleted file mode 100644
index c7cdb15634e1..000000000000
--- a/python/tvm/auto_scheduler/cost_model/xgb_model.py
+++ /dev/null
@@ -1,683 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""Cost model based on xgboost"""
-import multiprocessing
-import logging
-from typing import Dict
-from collections import defaultdict
-
-import numpy as np
-
-from tvm.autotvm.tuner.metric import max_curve
-from .cost_model import PythonBasedModel
-from ..feature import get_per_store_features_from_measure_pairs, get_per_store_features_from_states
-from ..measure_record import RecordReader
-
-try:
-    from xgboost.callback import TrainingCallback  # type: ignore
-except ImportError:
-
-    class TrainingCallback:  # type: ignore
-        pass
-
-
-xgb = None
-
-logger = logging.getLogger("auto_scheduler")
-
-
-class XGBDMatrixContext:
-    """A global context to hold additional attributes of xgb.DMatrix"""
-
-    def __init__(self):
-        self.context_dict = defaultdict(dict)
-
-    def get(self, key, matrix, default=None):
-        """
-        Get an attribute of a xgb.DMatrix
-        Parameters
-        ----------
-        key: str
-            The name of the attribute
-        matrix: xgb.DMatrix
-            The matrix
-        default: Optional[Any]
-            The default value if the item does not exist
-        """
-        return self.context_dict[key].get(matrix.handle.value, default)
-
-    def set(self, key, matrix, value):
-        """
-        Set an attribute for a xgb.DMatrix
-        Parameters
-        ----------
-        key: str
-            The name of the attribute
-        matrix: xgb.DMatrix
-            The matrix
-        value: Optional[Any]
-            The new value
-        """
-        self.context_dict[key][matrix.handle.value] = value
-
-
-dmatrix_context = XGBDMatrixContext()
-
-
-class XGBModel(PythonBasedModel):
-    """Train a XGBoost model to predict the normalized throughputs of programs.
-    Let the normalized throughput be the score of a program (higher is better). We predict
-    the (approximate) score of a program = the sum of the scores of all stages in this program.
-    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
-    where score_si is the score of Stage i in Program P.
-    We extract feature for each stage and let the xgboost predict the score for each stage.
-    We then sum up the predictions as the score of the whole program.
-    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - y)^2,
-    where P is the program and y is the normalized throughput according to
-    the ground truth (measurement).
-    XGBoost does not support this loss function because `score(P)` is a sum of the prediction
-    of several samples, so we implemented a custom loss function and call it pack-sum-rmse.
-    It is called "pack-sum" because we combine several samples into a "pack" and sum up
-    their predictions.
-
-    Parameters
-    ----------
-    verbose_eval: int = 25
-        Print training log every `verbose_eval` iterations.
-    num_warmup_sample: int = 100
-        The minimum number of samples to start to use the trained model.
-        If the number of samples is less than this number, the model outputs random predictions.
-    seed: Optional[int]
-        The random seed
-    model_file: Optional[str]
-        If is not None, save model to this file after every update.
-    adaptive_training: bool = False
-        Whether to use adaptive training, which reduces the training frequency when there are
-        too many logs.
-    """
-
-    def __init__(
-        self,
-        verbose_eval=25,
-        num_warmup_sample=100,
-        seed=None,
-        model_file=None,
-        adaptive_training=False,
-    ):
-        global xgb
-        try:
-            if xgb is None:
-                xgb = __import__("xgboost")
-        except ImportError:
-            # add "from Node" to silence
-            # "During handling of the above exception, another exception occurred"
-            raise ImportError(
-                "XGBoost is required for XGBModel. "
-                "Please install its python package first. "
-                "Help: (https://xgboost.readthedocs.io/en/latest/) "
-            ) from None
-
-        self.xgb_params = {
-            "max_depth": 10,
-            "gamma": 0.001,
-            "min_child_weight": 0,
-            "eta": 0.2,
-            # todo(merrymercy): automatically decrease learning rate when the loss is too large
-            "n_gpus": 0,
-            "nthread": multiprocessing.cpu_count() // 2,
-            "verbosity": 0,
-            "seed": seed or 43,
-            "disable_default_eval_metric": 1,
-        }
-        self.bst = None
-        self.plan_size = 32
-        self.num_warmup_sample = num_warmup_sample
-        self.verbose_eval = verbose_eval
-        self.model_file = model_file
-        self.adaptive_training = adaptive_training
-
-        super().__init__()
-
-        # cache measurement input/result pairs and extracted features
-        self.inputs = []
-        self.results = []
-        self.last_train_length = 0
-        self.inputs_feature_cache = []
-
-    def update(self, inputs, results):
-        """Update the cost model according to new measurement results (training data).
-        XGBoost does not support incremental training, so we re-train a new model every time.
-        Parameters
-        ----------
-        inputs : List[MeasureInput]
-            The measurement inputs
-        results : List[MeasureResult]
-            The measurement results
-        """
-        if len(inputs) <= 0:
-            return
-        assert len(inputs) == len(results)
-
-        self.inputs.extend(inputs)
-        self.results.extend(results)
-
-        if (
-            self.adaptive_training
-            and len(self.inputs) - self.last_train_length < self.last_train_length / 5
-        ):
-            # Set a training threshold related to `last_train_length` to reduce the training
-            # overhead when there're too many logs
-            return
-        self.last_train_length = len(self.inputs)
-
-        # extract feature
-        n_cached = len(self.inputs_feature_cache)
-        features, normalized_throughputs, task_ids = get_per_store_features_from_measure_pairs(
-            self.inputs, self.results, skip_first_n_feature_extraction=n_cached
-        )
-        if n_cached > 0:
-            features = list(features)
-            features[:n_cached] = self.inputs_feature_cache
-            features = np.array(features, dtype=object)
-        self.inputs_feature_cache = features
-        dtrain = pack_sum_xgbmatrix(
-            features, normalized_throughputs, task_ids, normalized_throughputs
-        )
-
-        # train xgb model
-        self.bst = xgb.train(
-            self.xgb_params,
-            dtrain,
-            num_boost_round=10000,
-            obj=pack_sum_square_error,
-            callbacks=[
-                CustomCallback(
-                    stopping_rounds=50,
-                    metric="tr-p-rmse",
-                    fevals=[pack_sum_rmse, pack_sum_average_peak_score(self.plan_size)],
-                    evals=[(dtrain, "tr")],
-                    maximize=False,
-                    verbose_eval=self.verbose_eval,
-                )
-            ],
-        )
-
-        # Update the model file if it has been set
-        if self.model_file:
-            self.save(self.model_file)
-
-    def predict(self, task, states):
-        """Predict the scores of states
-        Parameters
-        ----------
-        search_task : SearchTask
-            The search task of states
-        statse : List[State]
-            The input states
-        Returns
-        -------
-        scores: List[float]
-            The predicted scores for all states
-        """
-        features = get_per_store_features_from_states(states, task)
-        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
-            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
-            raw_preds = self.bst.predict(dtest)
-            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
-        else:
-            ret = np.random.uniform(0, 1, (len(states),))
-
-        # Predict -inf for invalid states that failed to be lowered.
-        for idx, feature in enumerate(features):
-            if feature.min() == feature.max() == 0:
-                ret[idx] = float("-inf")
-
-        return ret
-
-    def predict_stages(self, task, states):
-        """Predict the scores of all stages in states. This is the breakdown version of `predict`.
-
-        Parameters
-        ----------
-        search_task : SearchTask
-            The search task of states
-        statse : List[State]
-            The input states
-
-        Returns
-        -------
-        scores: List[float]
-            The predicted scores for all stages in all states in the packed format
-
-        Note
-        ----
-        For faster data copy between c++ and python, the python part returns scores in a
-        single flatten array using a packed format. The c++ part then unpacks the flatten array.
-        The packed format is:
-        {
-
-          float  scores[N];                 // scores[i] is the score for states[i].
-          int    n_stage_0;                 // the number of stages in states[0]
-          float  stage_scores_0[[n_stage_0] // the scores for all stages in states[0]
-          int    n_stage_1;                 // the number of stages in states[1]
-          float  stage_scores_1[n_stage_1]; // the scores for all stages in states[1]
-          ...
-          int    n_stage_i;                 // the number of stages in states[i]
-          float  stage_scores_1[n_stage_i]; // the scores for all stages in states[i]
-          ...  // untill i == N - 1
-
-        }
-        To implement this format, we also store int as float, so we can store all numbers
-        into a single float array.
-        """
-        features = get_per_store_features_from_states(states, task)
-        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
-            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
-            raw_preds = self.bst.predict(dtest)
-            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
-            stage_scores = [[] for _ in range(len(states))]
-            for pred, pack_id in zip(raw_preds, pack_ids):
-                stage_scores[pack_id].append(pred)
-            for idx, stage_score in enumerate(stage_scores):
-                breakdown = np.append(breakdown, len(stage_score))
-                breakdown = np.concatenate((breakdown, np.array(stage_score)))
-        else:
-            breakdown = np.concatenate(
-                (np.random.uniform(0, 1, (len(states),)), np.zeros(len(states)))
-            )
-
-        # Predict 0 for invalid states that failed to be lowered.
-        for idx, feature in enumerate(features):
-            if feature.min() == feature.max() == 0:
-                breakdown[idx] = float("-inf")
-
-        return breakdown
-
-    def update_from_file(self, file_name, n_lines=None):
-        """Load measure records from a log file to update the cost model.
-        This function can be used to pre-train the cost model with history log files.
-        Parameters
-        ----------
-        file_name: str
-            The filename
-        n_lines: Optional[int]
-            Only load first n lines of the log file
-        """
-        inputs, results = RecordReader(file_name).read_lines(n_lines)
-        logger.info("XGBModel: Loaded %s measurement records from %s", len(inputs), file_name)
-        self.update(inputs, results)
-
-    def save(self, file_name: str):
-        """Save the model to a file
-        Parameters
-        ----------
-        file_name: str
-            The filename
-        """
-        self.bst.save_model(file_name)
-
-    def load(self, file_name: str):
-        """Load the model from a file
-        Parameters
-        ----------
-        file_name: str
-            The filename
-        """
-        if self.bst is None:
-            self.bst = xgb.Booster(self.xgb_params)
-        self.bst.load_model(file_name)
-        self.num_warmup_sample = -1
-
-
-def feature_to_pack_sum_xgbmatrix(xs):
-    """Convert an extracted multi-stage feature vector to a xgbmatrx in pack-sum format
-    Parameters
-    ----------
-    xs: np.ndarray
-        The feature vector
-    Returns
-    -------
-    dmatrix: xgb.DMatrix
-        The DMatrix
-    pack_ids: List[int]
-        pack ids information
-    """
-    x_flatten = []
-    pack_ids = []
-
-    for ct, x in enumerate(xs):
-        for row in x:
-            x_flatten.append(row)
-            pack_ids.append(ct)
-
-    return xgb.DMatrix(np.array(x_flatten)), pack_ids
-
-
-def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None):
-    """Convert (feature, label) pairs into a xgb matrix with pack-sum format
-    Parameters
-    ----------
-    xs: np.ndarray
-        The feature vector
-    ys: np.ndarray
-        The normaizlied throughput
-    gids: Optional[List[int]]
-        Group id (task id)
-    weights: Optional[np.ndarray]
-        The weight of samples
-    Returns
-    -------
-    dmatrix: xgb.DMatrix
-        The DMatrix with pack-sum information
-    """
-    if gids is not None:
-        # sort by group
-        indices = gids.argsort()
-        xs, ys = xs[indices], ys[indices]
-        group_sizes = np.bincount(gids)
-        if weights is not None:
-            weights = weights[indices]
-    else:
-        # assume it has only one group
-        group_sizes = [len(xs)]
-
-    x_flatten = []
-    y_flatten = []
-    weights_flatten = []
-    pack_ids = []
-
-    if weights is not None:
-        for ct, (x, y, w) in enumerate(zip(xs, ys, weights)):
-            for row in x:
-                x_flatten.append(row)
-                y_flatten.append(y)
-                weights_flatten.append(w)
-                pack_ids.append(ct)
-    else:
-        for ct, (x, y) in enumerate(zip(xs, ys)):
-            for row in x:
-                x_flatten.append(row)
-                y_flatten.append(y)
-                pack_ids.append(ct)
-
-    ret = xgb.DMatrix(np.array(x_flatten), y_flatten)
-    if weights is not None:
-        ret.set_weight(weights_flatten)
-    dmatrix_context.set("pack_ids", ret, np.array(pack_ids))
-    dmatrix_context.set("group_sizes", ret, group_sizes)
-    return ret
-
-
-def predict_throughput_pack_sum(raw_preds, pack_ids):
-    """Predict the throughputs for predictions in pack-sum format
-    Parameters
-    ----------
-    raw_preds: np.ndarray
-        The raw predictions
-    pack_ids: List[int]
-        The pack id for predictions
-    Returns
-    -------
-    throughputs: np.ndarray
-        The throughput
-    """
-    sum_pred = np.bincount(pack_ids, weights=raw_preds)
-    return sum_pred
-
-
-def pack_sum_square_error(preds, dtrain):
-    """Implement square error loss on pack-sum format as
-     a custom objective function for xgboost.
-    Parameters
-    ----------
-    preds: np.ndarray
-        The predicitons
-    dtrain: xgb.DMatrix
-        The training set
-    Returns
-    -------
-    gradient: np.ndarray
-    hessian: np.ndarray
-        gradient and hessian according to the xgboost format
-    """
-    pack_ids = dmatrix_context.get("pack_ids", dtrain)
-    weight = dtrain.get_weight()
-
-    sum_pred = np.bincount(pack_ids, weights=preds)
-    x = sum_pred[pack_ids]
-    y = dtrain.get_label()
-    gradient = x - y
-    hessian = np.ones_like(gradient)
-
-    if len(weight) == 0:
-        return gradient, hessian
-
-    return gradient * weight, hessian * weight
-
-
-def pack_sum_rmse(raw_preds, labels):
-    """Evaluate RMSE (rooted mean square error) in the pack-sum format
-    Parameters
-    ----------
-    raw_preds: np.ndarray
-        The raw prediction
-    labels: xgb.DMatrix
-        The groud-truth label matrix
-    Returns
-    -------
-    name: str
-    score: float
-        The name and score of this metric
-    """
-    pack_ids = dmatrix_context.get("pack_ids", labels)
-    preds = predict_throughput_pack_sum(raw_preds, pack_ids)[pack_ids]
-    return "p-rmse", np.sqrt(np.mean(np.square((preds - labels.get_label()))))
-
-
-def pack_sum_average_peak_score(N):
-    """Return the evaluation function for average-peak-score@N
-    Parameters
-    ----------
-    N: int
-        The "N" in "average-peak-score@N"
-    Returns
-    -------
-    The evaluation function
-    """
-
-    def feval(preds, labels):
-        """Evaluate average-peak-score@N in the pack-sum format
-        Parameters
-        ----------
-        raw_preds: np.ndarray
-            The raw prediction
-        labels: xgb.DMatrix
-            The groud-truth label matrix
-        Returns
-        -------
-        name: str
-        score: float
-        The name and score of this metric
-        """
-        group_sizes = dmatrix_context.get("group_sizes", labels, [len(preds)])
-        pack_ids = dmatrix_context.get("pack_ids", labels)
-
-        preds = predict_throughput_pack_sum(preds, pack_ids)
-        labels = (
-            np.bincount(pack_ids, weights=labels.get_label())
-            / np.unique(pack_ids, return_counts=True)[1]
-        )
-
-        scores = []
-        offset = 0
-        for size in group_sizes:
-            preds_group = preds[offset : offset + size]
-            labels_group = labels[offset : offset + size]
-            offset += size
-
-            trials = np.argsort(preds_group)[::-1][:N]
-            trial_scores = labels_group[trials]
-            curve = max_curve(trial_scores) / np.max(labels_group)
-            scores.append(np.mean(curve))
-        return f"a-peak@{N}", np.mean(scores)
-
-    return feval
-
-
-class XGBoostCallback(TrainingCallback):
-    """Base class for XGBoost callbacks."""
-
-    def __call__(self, env: "xgb.core.CallbackEnv"):
-        # Compatibility with xgboost < 1.3
-        return self.after_iteration(env.model, env.iteration, env.evaluation_result_list)
-
-    def after_iteration(self, model: "xgb.Booster", epoch: int, evals_log: Dict):
-        raise NotImplementedError
-
-
-class CustomCallback(XGBoostCallback):
-    """
-    Callback function for xgboost.
-    Support custom evaluation function and early-stopping.
-    """
-
-    def __init__(
-        self,
-        stopping_rounds,
-        metric,
-        fevals,
-        evals=(),
-        log_file=None,
-        maximize=False,
-        verbose_eval=True,
-        skip_every=2,
-    ):
-        """Init function"""
-        self.stopping_rounds = stopping_rounds
-        self.metric = metric
-        self.metric_shortname = metric.split("-")[1]
-        self.fevals = fevals
-        self.evals = evals
-        self.log_file = log_file
-        self.maximize = maximize
-        self.verbose_eval = verbose_eval
-        self.skip_every = skip_every
-        self.state = {}
-
-    def after_iteration(self, model: "xgb.Booster", epoch: int, evals_log: Dict):
-        """Run after each iteration.  Return True when training should stop."""
-        # pylint:disable = import-outside-toplevel
-        try:
-            from xgboost.callback import _fmt_metric  # type: ignore
-        except ImportError:
-            # Compatibility with xgboost >= 1.6
-            def _fmt_metric(value, show_stdv=True):
-                """format metric string"""
-                if len(value) == 2:
-                    return f"{value[0]}:{value[1]:.5f}"
-                if len(value) == 3:
-                    if show_stdv:
-                        return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}"
-                    return f"{value[0]}:{value[1]:.5f}"
-                raise ValueError("wrong metric value", value)
-
-        ##### init state #####
-        if not self.state:
-            self.state["maximize_score"] = self.maximize
-            self.state["best_iteration"] = 0
-            if self.maximize:
-                self.state["best_score"] = float("-inf")
-            else:
-                self.state["best_score"] = float("inf")
-
-            assert model is not None
-            if model.attr("best_score") is not None:
-                self.state["best_score"] = float(model.attr("best_score"))
-                self.state["best_iteration"] = int(model.attr("best_iteration"))
-                self.state["best_msg"] = model.attr("best_msg")
-            else:
-                model.set_attr(best_iteration=str(self.state["best_iteration"]))
-                model.set_attr(best_score=str(self.state["best_score"]))
-        res_dict = {}
-
-        if epoch % self.skip_every == 1:
-            return False
-
-        ##### evaluation #####
-        for feval in self.fevals:
-            bst_eval = model.eval_set(self.evals, epoch, feval)
-            res = [x.split(":") for x in bst_eval.split()]
-            for kv in res[1:]:
-                res_dict[kv[0]] = [float(kv[1])]
-
-        eval_res = []
-        keys = list(res_dict.keys())
-        keys.sort(key=lambda x: x if self.metric_shortname not in x else "a" + x)
-        for key in keys:
-            v = res_dict[key]
-            eval_res.append([key] + v)
-
-        ##### print eval result #####
-        if (
-            not isinstance(self.verbose_eval, bool)
-            and self.verbose_eval
-            and epoch % self.verbose_eval == 0
-        ):
-            infos = [f"XGB iter: {epoch:3d}"]
-            for item in eval_res:
-                if "null" in item[0]:
-                    continue
-                infos.append(f"{item[0]}: {item[1]:.6f}")
-
-            logger.debug("\t".join(infos))
-            if self.log_file:
-                with open(self.log_file, "a") as fout:
-                    fout.write("\t".join(infos) + "\n")
-
-        ##### choose score and do early stopping #####
-        score = None
-        for item in eval_res:
-            if item[0] == self.metric:
-                score = item[1]
-                break
-        assert score is not None
-
-        best_score = self.state["best_score"]
-        best_iteration = self.state["best_iteration"]
-        maximize_score = self.state["maximize_score"]
-
-        if (maximize_score and score > best_score) or (not maximize_score and score < best_score):
-            msg = f"[{epoch}] " + "\t".join([_fmt_metric(x) for x in eval_res])
-            self.state["best_msg"] = msg
-            self.state["best_score"] = score
-            self.state["best_iteration"] = epoch
-            # save the property to attributes, so they will occur in checkpoint.
-            if model is not None:
-                model.set_attr(
-                    best_score=str(self.state["best_score"]),
-                    best_iteration=str(self.state["best_iteration"]),
-                    best_msg=self.state["best_msg"],
-                )
-        elif epoch - best_iteration >= self.stopping_rounds:
-            best_msg = self.state["best_msg"]
-            if self.verbose_eval:
-                logger.debug("XGB stopped. Best iteration: %s ", best_msg)
-            return True
-
-        return False
diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py
deleted file mode 100644
index 3384850502c7..000000000000
--- a/python/tvm/auto_scheduler/dispatcher.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-The global context that dispatches best schedules to workloads.
-
-In auto-scheduler, a state (loop_state.py::StateObject) saves the
-schedule configuration by its transform_steps, so a state is used
-as a schedule configuration here.
-"""
-# pylint: disable=invalid-name
-
-import logging
-import pathlib
-from collections.abc import Iterable
-
-import numpy as np
-
-from tvm.contrib.utils import tempdir
-from tvm.tir.expr import FloatImm
-from .cost_model import RandomModel, XGBModel
-from .measure import LocalRPCMeasureContext
-from .measure_record import RecordToFile, load_records
-from .search_policy import PreloadMeasuredStates, SketchPolicy
-from .search_task import SearchTask, TuningOptions
-from .utils import calc_workload_dis_factor, decode_workload_key
-
-logger = logging.getLogger("auto_scheduler")
-
-
-class DispatchContext(object):
-    """
-    Base class of dispatch context.
-    """
-
-    current = None
-
-    def __init__(self):
-        self._old_ctx = DispatchContext.current
-
-    def query(self, target, workload_key, has_complex_op, dag, func_name):
-        """
-        Query the context to get the specific config for a workload.
-        If this function cannot find the result inside this context, it will query the result
-        from the upper contexts.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload_key : str
-            The workload key
-        has_complex_op: bool
-            Whether this workload has at least one complex op.
-        dag: ComputeDAG
-            The ComputeDAG of the workload.
-        func_name: str
-            The function name of this workload.
-
-        Returns
-        -------
-        state : StateObject
-            The state that stores schedule configuration for the workload
-        """
-        ret = self._query_inside(target, workload_key, func_name)
-        if ret is None:
-            ret = self._old_ctx.query(target, workload_key, has_complex_op, dag, func_name)
-        return ret
-
-    def update(self, target, workload_key, state):
-        """
-        Update the config for a workload
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload_key : str
-            The current workload_key.
-        state : StateObject
-            The state that stores schedule configuration for the workload
-        """
-        raise NotImplementedError()
-
-    def _query_inside(self, target, workload_key, func_name):
-        """
-        Query the context to get the specific config for a workload.
-        This function only query config inside this context.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload_key : str
-            The current workload_key.
-        func_name: str
-            The function name of this workload.
-
-        Returns
-        -------
-        state : StateObject
-            The schedule configuration for the workload
-        """
-        raise NotImplementedError()
-
-    def __enter__(self):
-        self._old_ctx = DispatchContext.current
-        DispatchContext.current = self
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        DispatchContext.current = self._old_ctx
-
-
-class ApplyHistoryBest(DispatchContext):
-    """
-    Apply the history best config
-
-    Parameters
-    ----------
-    records : str, list of str, or iterator of (auto_scheduler.measure.MeasureInput,\
-                                                auto_scheduler.measure.MeasureResult)
-        Collection of tuning records.
-        If is str, then it should be the filename of a records log file.
-        Each row of this file is an encoded record pair. If it is an iterator,
-        it can either be a set of str filenames which will be applied jointly,
-        or a set of (input, result) tuples.
-    n_lines: Optional[int]
-        if it is not None, only load the first `n_lines` lines of log.
-    include_compatible: bool
-        When set to True, compatible records will also be considered.
-    """
-
-    def __init__(self, records, n_lines=None, include_compatible=False):
-        super(ApplyHistoryBest, self).__init__()
-        self.include_compatible = include_compatible
-
-        # Dict[str (target key),
-        #   Dict[str (workload hash),
-        #     Dict[tuple (workload args), tuple (State, cost)]]]
-        self.best_by_targetkey = {}
-        self.best_by_model = {}
-        self._best_user_defined = {}
-
-        self.load(records, n_lines)
-
-    @staticmethod
-    def get_workload_entry(best_records, target_key, workload_key):
-        """Get the entry of the target key and workload key hash in the given best record map.
-
-        Parameters
-        ----------
-        best_records: Dict[str, Dict[str, Dict[str, Any]]]
-            The best record map.
-        target_key: str
-            The first key to the best_records.
-        workload_key: str
-            The workload key that can be decoded to workload hash and args.
-
-        Returns
-        -------
-        entry: Dict[str, Any]
-            The entry in best_records with target key and workload hash.
-        workload_hash: str
-            The workload hash decoded from workload_key.
-        workload_args: Tuple[Any, ...]
-            The hashable tuple of workload args decoded from workload_key.
-        """
-        workload_hash, workload_args = decode_workload_key(workload_key)
-        if target_key not in best_records:
-            best_records[target_key] = {}
-        if workload_hash not in best_records[target_key]:
-            best_records[target_key][workload_hash] = {}
-        return best_records[target_key][workload_hash], workload_hash, workload_args
-
-    def load(self, records, n_lines=None):
-        """Load records to this dispatch context
-
-        Parameters
-        ----------
-        records : str or iterator of (auto_scheduler.measure.MeasureInput,\
-                                      auto_scheduler.measure.MeasureResult)
-            Collection of tuning records.
-            If is str, then it should be the filename of a records log file.
-            Each row of this file is an encoded record pair. Otherwise, it is an iterator.
-        n_lines: Optional[int]
-            if it is not None, only load the first `n_lines` lines of log
-        """
-        joint_records = []
-        if not isinstance(records, Iterable) or isinstance(records, str):
-            records = [records]
-
-        for rec in records:
-            if isinstance(rec, pathlib.Path):
-                rec = str(rec)
-
-            if isinstance(rec, str):
-                rec = load_records(rec)
-                joint_records += rec
-            else:
-                if rec is not None:
-                    joint_records.append(rec)
-
-        if not joint_records:
-            return
-
-        best_by_targetkey = self.best_by_targetkey
-        best_by_model = self.best_by_model
-
-        counter = 0
-        for inp, res in joint_records:
-            if n_lines is not None and counter >= n_lines:
-                break
-            counter += 1
-            if res.error_no != 0:
-                continue
-
-            costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
-            cost = np.mean(costs)
-
-            # use target keys in tvm target system as key to build best map
-            for k in inp.task.target.keys:
-                entry, _, workload_args = self.get_workload_entry(
-                    best_by_targetkey, k, inp.task.workload_key
-                )
-                if workload_args not in entry:
-                    entry[workload_args] = (inp.state, cost)
-                else:
-                    _, other_cost = entry[workload_args]
-                    if other_cost > cost:
-                        entry[workload_args] = (inp.state, cost)
-
-            # use model as key to build best map
-            entry, _, workload_args = self.get_workload_entry(
-                best_by_model, inp.task.target.model, inp.task.workload_key
-            )
-            if workload_args not in entry:
-                if inp.task.target.model != "unknown":
-                    entry[workload_args] = (inp.state, cost)
-            else:
-                _, other_cost = entry[workload_args]
-                if other_cost > cost:
-                    entry[workload_args] = (inp.state, cost)
-
-        logger.debug("Finish loading %d records", counter)
-
-    def _query_inside(self, target, workload_key, func_name):
-        if target is None:
-            raise RuntimeError(
-                "Need a target context to find the history best. "
-                "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
-                " above the dispatcher call. So does other target. "
-            )
-
-        def match_record(best_records, target_key, workload_key):
-            """The helper function to match the record in the given map
-            and return the matched state, or None if no match.
-            """
-            ret = None
-
-            entry, workload_hash, workload_args = self.get_workload_entry(
-                best_records, target_key, workload_key
-            )
-            if workload_args in entry:
-                ret = entry[workload_args][0]
-            elif self.include_compatible:
-                best_cost = float("inf")
-                for args, val in entry.items():
-                    dis_f = calc_workload_dis_factor(
-                        (workload_hash, workload_args), (workload_hash, args)
-                    )
-                    if dis_f == float("inf"):
-                        continue
-
-                    state, cost = val
-                    cost *= dis_f
-                    if ret is None or cost < best_cost:
-                        best_cost = cost
-                        ret = state
-            return ret
-
-        # first try matching by model
-        ret = match_record(self._best_user_defined, target.model, workload_key)
-        if ret is not None:
-            return ret
-        ret = match_record(self.best_by_model, target.model, workload_key)
-        if ret is not None:
-            return ret
-
-        # then try matching by target key
-        for k in target.keys:
-            ret = match_record(self._best_user_defined, k, workload_key)
-            if ret is not None:
-                return ret
-            ret = match_record(self.best_by_targetkey, k, workload_key)
-            if ret is not None:
-                return ret
-
-        return None
-
-    def update(self, target, workload_key, state):
-        entry, _, workload_args = self.get_workload_entry(
-            self._best_user_defined, target.model, workload_key
-        )
-        entry[workload_args] = (state, 1)
-
-        for k in target.keys:
-            entry, _, _ = self.get_workload_entry(self._best_user_defined, k, workload_key)
-            entry[workload_args] = (state, 1)
-
-
-class ApplyHistoryBestOrSample(ApplyHistoryBest):
-    """
-    Apply the history best config, or sample a valid schedule if no config is found.
-
-    Parameters
-    ----------
-    records : str or iterator of (auto_scheduler.measure.MeasureInput,\
-                                  auto_scheduler.measure.MeasureResult)
-        Collection of tuning records.
-        If is str, then it should be the filename of a records log file.
-        Each row of this file is an encoded record pair. Otherwise, it is an iterator.
-    sample_simple_workloads: bool
-        When False, sampling will not apply to simple workloads (w/o reduction).
-    cost_model_file: str
-        The filename of the pre-trained XGBoost cost model. If not present, then random
-        model will be used.
-    num_measure: int
-        Meausre the top-N rank of sampled schedules on the device. The default -1 means
-        no measurement and simply return the top-1 schedule ranked by the cost model.
-    """
-
-    def __init__(
-        self, records, sample_simple_workloads=False, cost_model_file=None, num_measure=-1
-    ):
-        self.sample_simple_workloads = sample_simple_workloads
-        self.num_measure = num_measure
-        self.log_dir = tempdir()
-        if cost_model_file is None:
-            self.cost_model = RandomModel()
-        else:
-            self.cost_model = XGBModel()
-            self.cost_model.load(cost_model_file)
-
-        super(ApplyHistoryBestOrSample, self).__init__(
-            records, n_lines=None, include_compatible=True
-        )
-
-    def query(self, target, workload_key, has_complex_op, dag, func_name):
-        if has_complex_op or self.sample_simple_workloads:
-            ret = self._query_inside(target, workload_key, func_name)
-        else:
-            ret = super(ApplyHistoryBestOrSample, self)._query_inside(
-                target, workload_key, func_name
-            )
-
-        if ret is None:
-            ret = self._old_ctx.query(target, workload_key, has_complex_op, dag, func_name)
-        return ret
-
-    def _query_inside(self, target, workload_key, func_name):
-        ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key, func_name)
-        if ret is not None:
-            return ret
-
-        # Sampling valid schedules when no existing records can be used.
-        task = SearchTask(workload_key=workload_key, target=target)
-        measure_ctx = LocalRPCMeasureContext(min_repeat_ms=300)
-
-        log_file = self.log_dir.relpath(f"{decode_workload_key(workload_key)[0]}.log")
-
-        while ret is None:
-            tune_option = TuningOptions(
-                num_measure_trials=self.num_measure,
-                runner=measure_ctx.runner,
-                measure_callbacks=[RecordToFile(log_file)],
-                verbose=0,
-            )
-            search_policy = SketchPolicy(
-                task,
-                self.cost_model,
-                params={
-                    "eps_greedy": 0.01,
-                    "sample_init_min_population": 64,
-                    "evolutionary_search_num_iters": 0,
-                },
-                init_search_callbacks=[PreloadMeasuredStates(log_file)],
-                verbose=0,
-            )
-            task.tune(tune_option, search_policy)
-
-            # Load the sampled records and query again.
-            self.load(log_file)
-            ret = super(ApplyHistoryBestOrSample, self)._query_inside(
-                target, workload_key, func_name
-            )
-
-        del measure_ctx
-        return ret
-
-
-class FallbackContext(DispatchContext):
-    """
-    A fallback dispatch context.
-    This is used as the root context.
-    """
-
-    def __init__(self):
-        super(FallbackContext, self).__init__()
-        self.memory = {}
-
-        # Verbose level:
-        # 0: Completely silent.
-        # 1: Warning the missing configs for querying complex tasks.
-        # 2: Warning the missing configs for querying all tasks.
-        self.verbose = 1
-
-        # a set to prevent print duplicated message
-        self.messages = set()
-
-    def query(self, target, workload_key, has_complex_op, dag, func_name):
-        key = (str(target), workload_key)
-        if key in self.memory:
-            return self.memory[key]
-
-        if self.verbose == 2 or (has_complex_op and self.verbose == 1):
-            msg = (
-                f"-----------------------------------\n"
-                f"{func_name}\n"
-                f"Cannot find tuned schedules for target={target}, workload_key={workload_key}. "
-                f"A fallback TOPI schedule is used, "
-                f"which may bring great performance regression or even compilation failure. "
-                f"Compute DAG info:\n{dag}"
-            )
-            if msg not in self.messages:
-                self.messages.add(msg)
-                logger.warning(msg)
-
-        state = None
-
-        # cache this config to avoid duplicated warning message
-        self.memory[key] = state
-        return state
-
-    def _query_inside(self, target, workload_key, func_name):
-        _ = target = workload_key = func_name
-        raise RuntimeError("This function should never be called")
-
-    def update(self, target, workload_key, state):
-        key = (str(target), workload_key)
-        self.memory[key] = state
-
-
-DispatchContext.current = FallbackContext()
diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py
deleted file mode 100644
index ea62560a6f6e..000000000000
--- a/python/tvm/auto_scheduler/feature.py
+++ /dev/null
@@ -1,329 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""""
-Python API for Feature extraction. The extracted features vector are used by cost models.
-
-We extract one feature vector per BufferStoreNode statement in a TIR Stmt,
-so we call this feature as "per-store" feature.
-The cost model also does prediction for each BufferStoreNode statement and aggregates
-the predicted score of each BufferStoreNode as the score of a TIR Stmt.
-
-The feature specification is defined by `src/auto_scheduler/feature.cc::FeatureSet`
-"""
-
-from typing import List, Tuple, Union, Optional, Dict
-import struct
-
-import numpy as np
-
-from .loop_state import State, StateObject
-from .measure import MeasureInput, MeasureResult
-from . import _ffi_api
-from ..tir import PrimFunc
-
-# The maximum number of extracted buffers for one statement
-DEFAULT_MAX_N_BUFS = 5
-
-# The length of the feature vector
-DEFAULT_FEATURE_VEC_LEN = 164
-
-# The size of int and float in bytes
-SIZE_OF_INT32 = 4
-SIZE_OF_FLOAT32 = 4
-
-
-def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Unpack the flatten feature (in byte array format) from c++
-
-    Parameters
-    ----------
-    byte_arr: bytearray
-        The two-dimensional feature vector in serialized byte array format
-
-    Returns
-    -------
-    features: np.ndarray
-        Feature vectors
-    normalized_throughputs: np.ndarray
-        Normalized throughputs
-    task_ids: np.ndarray
-        Task ids
-
-    Note
-    ----
-    For faster data copy between c++ and python, the c++ part returns features in a single
-    flatten array using a packed format. The python part then unpacks the flatten array.
-
-    The packed format for n records is:
-    {
-      int   n;
-      int   sizes[n+2];           // The sizes for the following arrays
-
-      float features_0[size[0]];  // The features for record 0
-      float features_1[size[1]];  // The features for record 1
-      ...
-      float features_i[size[i]];  // The features for record i
-      ... // until i == n - 1
-
-      float throughputs[sizes[n]];  // The normalized throughputs for n records
-      int   task_ids[size[n+1]];    // The task ids for n records
-
-    }
-    To implement this format, we also store int as float, so we can store all numbers
-    into a single float array.
-    """
-    vec_len = DEFAULT_FEATURE_VEC_LEN
-
-    # unpack sizes
-    offset = 0
-    n = struct.unpack_from("1i", byte_arr, offset=offset)[0]
-    offset += SIZE_OF_INT32
-
-    sizes = struct.unpack_from(f"{n + 2}i", byte_arr, offset=offset)
-    offset += SIZE_OF_INT32 * (n + 2)
-
-    # unpack features
-    features = []
-    for size in sizes[:-2]:
-        row = []
-
-        # Now, we need to unpack the feature for multiple statements.
-        # The format is:
-        # {
-        #   int   n_stage;                        // The number of stages
-        #   float feature_vecs[n_stage][vec_len]  // The feature vector for each stage
-        # }
-        # where vec_len can be calculated by `(size - 1) / n_stmts`
-
-        if size == 0:
-            # failed during lowering
-            features.append(np.zeros((1, vec_len)))
-        else:
-            n_stmts = struct.unpack_from("f", byte_arr, offset=offset)
-            offset += SIZE_OF_FLOAT32
-
-            n_stmts = int(n_stmts[0] + 0.5)
-            tmp_vec_len = (size - 1) // n_stmts
-            assert (
-                tmp_vec_len == vec_len
-            ), f"The length of feature vector is wrong. Expected {vec_len} but got {tmp_vec_len}."
-            assert tmp_vec_len * n_stmts == size - 1
-            for _ in range(n_stmts):
-                x = struct.unpack_from(f"{vec_len}f", byte_arr, offset=offset)
-                offset += vec_len * SIZE_OF_FLOAT32
-                row.append(x)
-
-            features.append(np.array(row))
-
-    # unpack normalized_throughputs
-    m = sizes[-2]
-    normalized_throughputs = struct.unpack_from(f"{m}f", byte_arr, offset=offset)
-    offset += m * SIZE_OF_FLOAT32
-
-    # unpack task_ids
-    m = sizes[-1]
-    task_ids = struct.unpack_from(f"{m}i", byte_arr, offset=offset)
-    offset += m * SIZE_OF_INT32
-
-    assert offset == len(byte_arr), f"{offset} vs {len(byte_arr)}"
-    return np.array(features, dtype=object), np.array(normalized_throughputs), np.array(task_ids)
-
-
-def get_per_store_features_from_file(
-    filename: str, max_lines: int, max_n_bufs: Optional[int] = None
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Get per-store features from a log file
-
-    Parameters
-    ----------
-    filename: str
-        The input filename
-    max_lines: int
-        Only extract the first n lines of the file
-    max_n_bufs: Optional[int]
-        The maximum number of extracted buffers for one statement
-
-    Returns
-    -------
-    features: np.ndarray
-        Feature vectors
-    normalized_throughputs: np.ndarray
-        Normalized throughputs
-    task_ids: np.ndarray
-        Task ids
-    """
-    byte_arr = _ffi_api.GetPerStoreFeaturesFromFile(
-        filename, max_lines, max_n_bufs or DEFAULT_MAX_N_BUFS
-    )
-    return unpack_feature(byte_arr)
-
-
-def get_per_store_features_from_measure_pairs(
-    inputs: List[MeasureInput],
-    results: List[MeasureResult],
-    skip_first_n_feature_extraction: int = 0,
-    max_n_bufs: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Get per-store features from measurement input/result pairs
-
-    Parameters
-    ----------
-    inputs: List[MeasureInput]
-        The measure inputs
-    results: List[MeasureResult]
-        The measure results
-    skip_first_n_feature_extraction: int
-        Skip feature extraction for the first n states
-    max_n_bufs: int
-        The maximum number of extracted buffers for one statement
-
-    Returns
-    -------
-    features: np.ndarray
-        Feature vectors
-    normalized_throughputs: np.ndarray
-        Normalized throughputs
-    task_ids: np.ndarray
-        Task ids
-    """
-    byte_arr = _ffi_api.GetPerStoreFeaturesFromMeasurePairs(
-        inputs, results, skip_first_n_feature_extraction, max_n_bufs or DEFAULT_MAX_N_BUFS
-    )
-    return unpack_feature(byte_arr)
-
-
-def get_per_store_features_from_states(
-    states: List[Union[State, StateObject]], task: "SearchTask", max_n_bufs: Optional[int] = None
-) -> np.ndarray:
-    """Get per-store features from measurement input/result pairs
-
-    Parameters
-    ----------
-    states: List[Union[State, StateObject]]
-        The input states
-    task: SearchTask
-        The search task of the input states
-    max_n_bufs: Optional[int]
-        The maximum number of extracted buffers for one statement
-
-    Returns
-    -------
-    features: np.ndarray
-        Feature vectors
-    """
-    if isinstance(states[0], State):
-        state_objects = [s.state_object for s in states]
-    elif isinstance(states[0], StateObject):
-        state_objects = states
-    byte_arr = _ffi_api.GetPerStoreFeaturesFromStates(
-        state_objects, task, max_n_bufs or DEFAULT_MAX_N_BUFS
-    )
-    return unpack_feature(byte_arr)[0]
-
-
-def get_per_store_feature_names(max_n_bufs: Optional[int] = None) -> List[str]:
-    """Get the name of every element in the feature vector. Use this for debug and inspection.
-
-    Parameters
-    ----------
-    max_n_bufs: int
-        The maximum number of extracted buffers for one statement
-
-    Returns
-    -------
-    names: List[str]
-        The names of elements in the flatten feature vector
-    """
-    return _ffi_api.GetPerStoreFeatureNames(max_n_bufs or DEFAULT_MAX_N_BUFS)
-
-
-def features_from_primfunc(
-    func: PrimFunc,
-    cache_line_bytes: int = 64,
-    max_n_bufs: Optional[int] = None,
-    log_scale: bool = False,
-) -> Optional[np.ndarray]:
-    """Extract performance features from a PrimFunc.
-
-    Parameters
-    ----------
-    func: PrimFunc
-        PrimFunc from which features will be extracted. Each store operation to
-        a unique buffer in the function will result in one row of features in
-        the output.
-
-    cache_line_bytes: int, optional
-        Size of a cache line in bytes. Defaults to 64 which is the size for
-        most x86 processors.
-
-    max_n_bufs: int, optional
-        Maximum number of buffers in generated features. This determines the
-        length of the resulting feature vector.
-
-    log_scale: bool
-        Should entries in the feature vector be scaled by log2(x + 1). Defaults
-        to False. Use True if using features with a cost model.
-
-    Returns
-    -------
-    Optional[np.ndarray]
-        Output features, one row per store into a unique buffer statement in `func`.
-    """
-    return _ffi_api.FeaturesFromPrimFunc(
-        func, cache_line_bytes, max_n_bufs or DEFAULT_MAX_N_BUFS, log_scale
-    ).numpy()
-
-
-def named_features_from_primfunc(
-    func: PrimFunc,
-    cache_line_bytes: int = 64,
-    max_n_bufs: Optional[int] = None,
-    log_scale: bool = False,
-) -> Optional[Dict[str, np.ndarray]]:
-    """Extract performance features and associated names from a PrimFunc.
-
-    Parameters
-    ----------
-    func: PrimFunc
-        PrimFunc from which features will be extracted. Each store operation to
-        a unique buffer in the function will result in one row of features in
-        the output.
-
-    cache_line_bytes: int, optional
-        Size of a cache line in bytes. Defaults to 64 which is the size for
-        most x86 processors.
-
-    max_n_bufs: int, optional
-        Maximum number of buffers in generated features. This determines the
-        length of the resulting feature vector.
-
-    log_scale: bool
-        Should entries in the feature vector be scaled by log2(x + 1). Defaults
-        to False. Use True if using features with a cost model.
-
-    Returns
-    -------
-    Optional[Dict[str, np.ndarray]]
-        Mapping from feature name to features. One element per store into a
-        unique buffer statement in `func`.
-    """
-    features = features_from_primfunc(func, cache_line_bytes, max_n_bufs, log_scale)
-    names = get_per_store_feature_names(max_n_bufs)
-    if features.shape[0] == 0:
-        return None
-    return {name: features[:, i] for i, name in enumerate(names)}
diff --git a/python/tvm/auto_scheduler/loop_state.py b/python/tvm/auto_scheduler/loop_state.py
deleted file mode 100644
index 03cc00def6b7..000000000000
--- a/python/tvm/auto_scheduler/loop_state.py
+++ /dev/null
@@ -1,618 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import
-
-"""
-The definition of the "state" in the search.
-Each LoopState corresponds to a schedule for its ComputeDAG.
-A LoopState consists of: 1. a current loop structure; 2. a list of transformation steps used to
-construct the loop structure.
-The loop structure keeps a preview of how the schedule will finally look like after lowering the
-current state (e.g. number of iterators, the extent of each iterator, the compute_at locations
-...).
-During the schedule search process, the loop structure can provide search policy with necessary
-information on how to manipulate the current state.
-The transform history is a sequence of `TransformStep` which will finally be mapped to TVM
-schedule primitives. The steps are also used for the serialization of a state.
-The LoopState can be seen as a lightweight loop structure IR specifically for schedule search.
-We don't use the existing TVM IR but to extend a new structure on it is because:
-1. We want fast incremental change to the loop structures. The search policy needs to get the
-immediate loop structures update rather than after TVM lowering;
-2. We want serializable transform history for replay, backtracking, and mutation;
-3. We may create some macro schedule primitives that represent the combination of several
-TVM schedule primitives.
-When the search is finished, we will lower the state to TVM IR with TVM's schedule primitives.
-Since we share a lot of common objects during search, the transformation is implemented in
-copy on write style. All objects are immutable, which is similar to TVM IR.
-"""
-
-import tvm._ffi
-from tvm.te.tensor import Operation, Tensor
-from tvm.runtime import Object
-from . import _ffi_api
-
-
-@tvm._ffi.register_object("auto_scheduler.Iterator")
-class Iterator(Object):
-    """A loop iterator structure."""
-
-
-@tvm._ffi.register_object("auto_scheduler.Stage")
-class Stage(Object):
-    """A stage in the compute declaration. Similar to tvm.te.schedule.Stage."""
-
-    # Static trans table for compute_at location
-    # This is used to transform the compute_at location to C++ enum
-    COMPUTE_AT_TRANS_TABLE = {"root": 0, "inlined": 1, "iter": 2}
-
-
-@tvm._ffi.register_object("auto_scheduler.State")
-class StateObject(Object):
-    """The internal State object"""
-
-    def __eq__(self, other):
-        return _ffi_api.StateEqual(self, other)
-
-
-class State:
-    """
-    A state in the search process. It consists of the current loop structure
-    and a list of transformation steps used to construct it.
-    Each State corresponds to a specific schedule for its ComputeDAG.
-    Parameters
-    ----------
-    state_object : StateObject
-        The StateObject corresponding to C++ internal State object.
-    dag : ComputeDAG
-        The original ComputeDAG of this State.
-    Notes
-    -----
-    This is a wrapper class of StateObject to deal with copy-on-write property
-    """
-
-    # Static trans table for thread bind and annotation
-    # This is used to transform the annotation name to C++ enum
-    ANNOTATION_TRANS_TABLE = {
-        "none": 0,
-        "unroll": 1,
-        "vectorize": 2,
-        "parallel": 3,
-        "vthread": 4,
-        "blockIdx.x": 5,
-        "threadIdx.x": 6,
-        "blockIdx.y": 7,
-        "threadIdx.y": 8,
-        "blockIdx.z": 9,
-        "threadIdx.z": 10,
-        "tensorize": 11,
-    }
-
-    def __init__(self, state_object, dag):
-        self.state_object = state_object
-        self.compute_dag = dag
-
-        self.stage_id_map = {}  # A dict maps operation to stage id
-        self._update_stage_id_map()
-
-    @property
-    def stages(self):
-        """
-        Returns
-        -------
-        stages : List[Stage]
-        """
-        return self.state_object.stages
-
-    @property
-    def transform_steps(self):
-        """
-        Returns
-        -------
-        transform_steps : List[transform_steps]
-        """
-        return self.state_object.transform_steps
-
-    @property
-    def stage_ops(self):
-        """
-        Returns
-        -------
-        ops: List[Operation]
-        """
-        return [stage.op for stage in self.stages]
-
-    def bind(self, stage, iterator, thread_name):
-        """Schedule primitive corresponding to `te.Stage.bind`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be binded, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to be binded.
-        thread_name : str
-            The thread type to be binded. Candidates:
-            - vthread
-            - blockIdx.x
-            - threadIdx.x
-            - blockIdx.y
-            - threadIdx.y
-            - blockIdx.z
-            - threadIdx.z
-        Returns
-        -------
-        res_it : Iterator
-            The binded Iterator.
-        """
-        if not thread_name in State.ANNOTATION_TRANS_TABLE.keys():
-            raise ValueError("Invalid thread_name: ", thread_name)
-
-        self.state_object, res = _ffi_api.StateBind(
-            self.state_object,
-            self._resolve_stage_id(stage),
-            iterator,
-            State.ANNOTATION_TRANS_TABLE[thread_name],
-        )
-        return res
-
-    def parallel(self, stage, iterator):
-        """Schedule primitive corresponding to `te.Stage.parallel`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be paralleled, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to be paralleled.
-        Returns
-        -------
-        res_it : Iterator
-            The paralleled Iterator.
-        """
-        self.state_object, res = _ffi_api.StateParallel(
-            self.state_object, self._resolve_stage_id(stage), iterator
-        )
-        return res
-
-    def unroll(self, stage, iterator, max_unroll=None):
-        """Schedule primitive corresponding to `te.Stage.unroll`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be unrolled, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to be unrolled.
-        max_unroll : Optional[int]
-            The max unroll limit. Iterator with extent larger than this limit will be skipped.
-        Returns
-        -------
-        res_it : Iterator
-            The unrolled Iterator.
-        """
-        self.state_object, res = _ffi_api.StateUnroll(
-            self.state_object,
-            self._resolve_stage_id(stage),
-            iterator,
-            max_unroll if max_unroll else -1,
-        )
-        return res
-
-    def vectorize(self, stage, iterator):
-        """Schedule primitive corresponding to `te.Stage.vectorize`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be vectorized, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to be vectorized.
-        Returns
-        -------
-        res_it : Iterator
-            The vectorized Iterator.
-        """
-        self.state_object, res = _ffi_api.StateVectorize(
-            self.state_object, self._resolve_stage_id(stage), iterator
-        )
-        return res
-
-    def fuse(self, stage, iters):
-        """Schedule primitive corresponding to `te.Stage.fuse`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be fused, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iters : List[Iterator]
-            The iterators to be fused.
-        Returns
-        -------
-        res_it : Iterator
-            The fused Iterator.
-        Notes
-        -----
-        If the iterators to be fused have stages attached at them(by compute_at), the fused
-        result will become the new attach point.
-        """
-        self.state_object, res = _ffi_api.StateFuse(
-            self.state_object, self._resolve_stage_id(stage), iters
-        )
-        return res
-
-    def pragma(self, stage, iterator, pragma_type):
-        """Schedule primitive corresponding to `te.Stage.pragma`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to add pragma, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to add pragma.
-        pragma_type : str
-            The pragma string.
-        """
-        self.state_object = _ffi_api.StatePragma(
-            self.state_object, self._resolve_stage_id(stage), iterator, pragma_type
-        )
-
-    def reorder(self, stage, order):
-        """Schedule primitive corresponding to `te.Stage.reorder`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be reordered, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        order : List[Iterator]
-            Iterators in the expected order.
-        """
-        self.state_object = _ffi_api.StateReorder(
-            self.state_object, self._resolve_stage_id(stage), order
-        )
-
-    def split(self, stage, iterator, lengths, inner_to_outer=True):
-        """Schedule primitive corresponding to `te.Stage.split`.
-        See also the `te.Stage` for more details.
-        This API supports multiple split factors. (e.g. with 2 split factors, the original iterator
-        will be split to 3 parts, use `inner_to_outer` to control the split order)
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be split, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to be split.
-        lengths: List[int]
-            The multiple split factors. Can be None to be filled by search policy.
-        inner_to_outer: boolean = True
-            Whether the factor go from inner to outer, or from outer to inner.
-        Returns
-        -------
-        res_its : List[Iterator]
-            The splitted new Iterators.
-        Notes
-        -----
-        If we do split on an iterator which has stages attached at it(by compute_at), the inner
-        most iterator of split results will become the new attach point.
-        """
-        self.state_object, res = _ffi_api.StateSplit(
-            self.state_object, self._resolve_stage_id(stage), iterator, lengths, inner_to_outer
-        )
-        return res
-
-    def follow_split(self, stage, iterator, src_step_id, n_split):
-        """The schedule primitive similar to split, but uses split factors from previous steps.
-        This step splits the iterator by the same factors as the given SplitStep.
-        Notes
-        ------
-            This step is useful in a scenario that we have subgraph Dense -> Relu,
-            and we want to compute the Dense stage at ReLU. In this case, we need them to have
-            the same tiling structure of common outer loops.
-            The follow_split step could be used here to split the Dense stage and makes sure its
-            splitting factors are the same as the given split step for the ReLU stage.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be split, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to split.
-        src_step_id : int
-            The index of the split step to be followed in the history.
-        n_split : int
-            The number of split level.
-        Returns
-        -------
-        res_its : List[Iterator]
-            The splitted new Iterators.
-        """
-
-        self.state_object, res = _ffi_api.StateFollowSplit(
-            self.state_object, self._resolve_stage_id(stage), iterator, src_step_id, n_split
-        )
-        return res
-
-    def follow_fused_split(self, stage, iterator, src_step_ids, level, factor_or_nparts):
-        """Schedule primitive extends to split step.
-        This step is used to split an iterator by the same factors
-        as the given list of SplitSteps and FuseSteps.
-        Notes
-        ------
-            This step is useful in a scenario that we have a subgraph
-            in GPU schedule: Input -> Dense
-            for i.0@j.0 = ... : Bind to blockIdx.x
-                for i.1@j.1 = ... : Bind to threadIdx.x
-                    for i.2@j.2 = ...
-                        Input_shared = Input ...
-                        for k = ...
-                            Dense = ...
-            We intend to apply cooperative fetching with the input stage, while the threadIdx.x
-            axis is bound to an iterator generated by split & fuse step.
-            The follow_fused_step is used split the iterator to 2 parts, while the split factor
-            matches the final extent of the threadIdx.x bound iterator.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be split, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The iterator to split.
-        src_step_ids : List[int]
-            The indices of the split steps to be followed in the history.
-        level : int
-            Use the length in this split level.
-        factor_or_nparts : bool
-            True to use `factor` for split from inner to outer,
-            False to use `nparts` for split from outer to inner.
-        Returns
-        -------
-        res_its : List[Iterator]
-            The splitted new Iterators.
-        """
-
-        self.state_object, res = _ffi_api.StateFollowFusedSplit(
-            self.state_object,
-            self._resolve_stage_id(stage),
-            iterator,
-            src_step_ids,
-            level,
-            factor_or_nparts,
-        )
-        return res
-
-    def storage_align(self, stage, iterator, factor, offset):
-        """Schedule primitive corresponding to `te.Stage.storage_align`.
-        See also the `te.Stage` for  more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be storage aligned, which can be specified by the integer index,
-            Operation, or output tensor of the stage.
-        iterator : Iterator
-            The iterator to be aligned.
-        factor : int
-            The factor in alignment specification.
-        offset : int
-            The offset in the alignment specification.
-        """
-        self.state_object = _ffi_api.StateStorageAlign(
-            self.state_object, self._resolve_stage_id(stage), iterator, factor, offset
-        )
-
-    def compute_at(self, stage, target_stage, target_iter):
-        """Schedule primitive corresponding to `te.Stage.compute_at`.
-        See also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The source Stage of computed at, which can be specified by the integer index,
-            Operation, or output tensor of the stage.
-        target_stage : Union[int, Operation, Tensor]
-            The target stage of compute_at, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        target_iter : Iterator
-            The target Iterator of compute_at.
-        Notes
-        -----
-        After compute_at, we need careful dependency analysis to compute the accurate bound
-        information. However, it is relatively expensive and complicated, so we just fill "None"
-        as bound for the newly created iterators.
-        Call ComputeDAG::InferBound on the returned state to get the complete bound information.
-        """
-        self.state_object = _ffi_api.StateComputeAt(
-            self.state_object,
-            self._resolve_stage_id(stage),
-            self._resolve_stage_id(target_stage),
-            target_iter,
-        )
-
-    def compute_inline(self, stage):
-        """Schedule primitive corresponding to `te.Stage.compute_inline`, see also the `te.Stage`
-        for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be marked compute inlined, which can be specified by the integer index,
-            Operation, or output tensor of the stage.
-        """
-        self.state_object = _ffi_api.StateComputeInline(
-            self.state_object, self._resolve_stage_id(stage)
-        )
-
-    def compute_root(self, stage):
-        """Schedule primitive corresponding to `te.Stage.compute_root`.
-        Ssee also the `te.Stage` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be marked compute at root, which can be specified by the integer index,
-            Operation, or output tensor of the stage.
-        Notes
-        -----
-        After compute_root, we need careful dependency analysis to compute the accurate bound
-        information. However, it is relatively expensive and complicated, so we just fill "None"
-        as bound for the newly created iterators.
-        Call ComputeDAG::InferBound on the returned state to get the complete bound information.
-        """
-        self.state_object = _ffi_api.StateComputeRoot(
-            self.state_object, self._resolve_stage_id(stage)
-        )
-
-    def cache_read(self, stage, scope_name, reader_stages):
-        """Schedule primitive corresponding to `te.Schedule.cache_read`.
-        See also the `te.Schedule` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be cache_read, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        scope_name : str
-            The scope name of the newly added read stage.
-        reader_stages : List[Union[int, Operation, Tensor]]
-            The reader stages. Each of the list can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        Returns
-        -------
-        new_stage_op : Operator
-            The Operator of the new added stage.
-        Notes
-        -----
-        Cache read step will insert an extra stage to the original ComputeDAG (at the back of the
-        target stage).
-        """
-        reader_stage_ids = [self._resolve_stage_id(i) for i in reader_stages]
-        self.state_object, new_stage_id = _ffi_api.StateCacheRead(
-            self.state_object,
-            self._resolve_stage_id(stage),
-            scope_name,
-            reader_stage_ids,
-            self.compute_dag,
-        )
-        # Add a new stage will change all ops behind the added stage. But we still want to keep the
-        # original ops map, apply stage id offset to stage_id_map to make them work.
-        self._apply_stage_id_offset(int(new_stage_id))
-        self._update_stage_id_map()
-        return self.stages[int(new_stage_id)].op
-
-    def cache_write(self, stage, scope_name):
-        """Schedule primitive corresponding to `te.Schedule.cache_write`.
-        See also the `te.Schedule` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be cache_write, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        scope_name : str
-            The scope name of the newly added compute stage.
-        Returns
-        -------
-        new_stage_op : Operator
-            The Operator of the new added stage.
-        Notes
-        -----
-        Cache write step will insert an extra stage to the original ComputeDAG (in the front of the
-        target stage).
-        This step will cache write all output tensors of the target stage.
-        """
-        self.state_object, new_stage_id = _ffi_api.StateCacheWrite(
-            self.state_object, self._resolve_stage_id(stage), scope_name, self.compute_dag
-        )
-        # Add a new stage will change all ops behind the added stage. But we still want to keep the
-        # original ops map, apply stage id offset to stage_id_map to make them work.
-        self._apply_stage_id_offset(int(new_stage_id))
-        self._update_stage_id_map()
-        return self.stages[int(new_stage_id)].op
-
-    def rfactor(self, stage, iterator, factor_iter_id):
-        """Schedule primitive corresponding to `te.Schedule.rfactor`.
-        See also the `te.Schedule` for more details.
-        Parameters
-        ----------
-        stage : Union[int, Operation, Tensor]
-            The Stage to be factored, which can be specified by the integer index, Operation,
-            or output tensor of the stage.
-        iterator : Iterator
-            The reduction iterator to be factored.
-        factor_iter_id : int
-            The position where the new iterator is placed.
-        Returns
-        -------
-        new_stage_op : Operator
-            The Operator of the new added stage.
-        Notes
-        -----
-        Rfactor step will insert an extra stage to the original ComputeDAG (in the front of the
-        target stage).
-        """
-        self.state_object, new_stage_id = _ffi_api.StateRfactor(
-            self.state_object,
-            self._resolve_stage_id(stage),
-            iterator,
-            factor_iter_id,
-            self.compute_dag,
-        )
-        # Add a new stage will change all ops behind the added stage. But we still want to keep the
-        # original ops map, apply stage id offset to stage_id_map to make them work.
-        self._apply_stage_id_offset(int(new_stage_id))
-        self._update_stage_id_map()
-        return self.stages[int(new_stage_id)].op
-
-    def copy(self):
-        """Do deep copy of this State."""
-        state = State(self.state_object, self.compute_dag)
-        state.stage_id_map = self.stage_id_map.copy()
-        return state
-
-    def _resolve_stage_id(self, stage_id):
-        if isinstance(stage_id, Operation):
-            return self.stage_id_map[stage_id]
-        if isinstance(stage_id, Tensor):
-            return self.stage_id_map[stage_id.op]
-        if isinstance(stage_id, int):
-            return stage_id
-        raise ValueError(
-            "Invalid stage: " + stage_id + " . Expect to be a int, Operation or Tensor"
-        )
-
-    def _update_stage_id_map(self):
-        for index, stage in enumerate(self.stages):
-            self.stage_id_map[stage.op] = index
-
-    def _apply_stage_id_offset(self, start_id, offset=1):
-        for key, value in self.stage_id_map.items():
-            if value >= start_id:
-                self.stage_id_map[key] = value + offset
-
-    def __getitem__(self, key):
-        if isinstance(key, Tensor):
-            key = key.op
-        if isinstance(key, Operation):
-            return self.stages[self.stage_id_map[key]]
-        raise ValueError("Invalid item: " + key + " . Expect to be a Operation or Tensor")
-
-    def __str__(self):
-        return str(self.state_object)
-
-    def __eq__(self, other):
-        return _ffi_api.StateEqual(self.state_object, other.state_object)
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
deleted file mode 100644
index fa5f06c38f6d..000000000000
--- a/python/tvm/auto_scheduler/measure.py
+++ /dev/null
@@ -1,1334 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Distributed measurement infrastructure to measure the runtime costs of tensor programs.
-
-These functions are responsible for building the tvm module, uploading it to
-remote devices, recording the running time costs, and checking the correctness of the output.
-
-We separate the measurement into two steps: build and run.
-A builder builds the executable binary files and a runner runs the binary files to
-get the measurement results. The flow of data structures is
-
-  .               `ProgramBuilder`                 `ProgramRunner`
-  `MeasureInput` -----------------> `BuildResult` ----------------> `MeasureResult`
-
-We implement these in python to utilize python's multiprocessing and error handling.
-"""
-
-import logging
-import multiprocessing
-import os
-import shutil
-import tempfile
-import time
-
-import tvm._ffi
-from tvm.autotvm.env import AutotvmGlobalScope, reset_global_scope
-from tvm.contrib import ndk, tar
-from tvm.contrib.popen_pool import PopenPoolExecutor, PopenWorker, StatusKind
-from tvm.driver import build_module
-from tvm.ir import transform
-from tvm.runtime import Object, module, ndarray
-from tvm.target import Target
-
-from . import _ffi_api
-from .loop_state import StateObject
-from .utils import (
-    call_func_with_timeout,
-    check_remote,
-    get_const_tuple,
-    get_func_name,
-    make_traceback_info,
-    request_remote,
-)
-from .workload_registry import (
-    deserialize_workload_registry_entry,
-    serialize_workload_registry_entry,
-)
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("auto_scheduler")
-
-# The time cost for measurements with errors
-# We use 1e10 instead of sys.float_info.max for better readability in log
-MAX_FLOAT = 1e10
-
-
-class BuildFunc:
-    """store build_func name and callable to class variable.
-    name: str = "default"
-        The name of registered build function.
-    build_func: callable = tar.tar
-        The callable of registered build function.
-    """
-
-    name = "default"
-    build_func = tar.tar
-
-
-@tvm._ffi.register_object("auto_scheduler.MeasureCallback")
-class MeasureCallback(Object):
-    """The base class of measurement callback functions."""
-
-
-@tvm._ffi.register_object("auto_scheduler.PythonBasedMeasureCallback")
-class PythonBasedMeasureCallback(MeasureCallback):
-    """Base class for measure callbacks implemented in python"""
-
-    def __init__(self):
-        def callback_func(policy, inputs, results):
-            self.callback(policy, inputs, results)
-
-        self.__init_handle_by_constructor__(_ffi_api.PythonBasedMeasureCallback, callback_func)
-
-    def callback(self, policy, inputs, results):
-        """The callback function.
-
-        Parameters
-        ----------
-        policy: auto_scheduler.search_policy.SearchPolicy
-            The search policy.
-        inputs : List[auto_scheduler.measure.MeasureInput]
-            The measurement inputs
-        results : List[auto_scheduler.measure.MeasureResult]
-            The measurement results
-        """
-        raise NotImplementedError
-
-
-@tvm._ffi.register_object("auto_scheduler.MeasureInput")
-class MeasureInput(Object):
-    """Store the input of a measurement.
-
-    Parameters
-    ----------
-    task : SearchTask
-        The SearchTask of this measurement.
-    state : Union[State, StateObject]
-        The State to be measured.
-    """
-
-    def __init__(self, task, state):
-        state = state if isinstance(state, StateObject) else state.state_object
-        self.__init_handle_by_constructor__(_ffi_api.MeasureInput, task, state)
-
-    def serialize(self):
-        """Custom serialization to workaround MeasureInput not exposing all its
-        members to the TVM ffi interface.
-
-        Note that we do not implement __getstate__ as it does not seem to work
-        with initialization of the workload registry (maybe because of
-        initialization order?).
-        """
-        return [
-            _ffi_api.SerializeMeasureInput(self),
-            serialize_workload_registry_entry(self.task.workload_key),
-        ]
-
-    @staticmethod
-    def deserialize(data):
-        inp = _ffi_api.DeserializeMeasureInput(data[0])
-        deserialize_workload_registry_entry(data[1])
-        return recover_measure_input(inp)
-
-
-@tvm._ffi.register_object("auto_scheduler.BuildResult")
-class BuildResult(Object):
-    """Store the result of a build.
-
-    Parameters
-    ----------
-    filename : Optional[str]
-        The filename of built binary file.
-    args : List[Tensor]
-        The arguments.
-    error_no : int
-        The error code.
-    error_msg : Optional[str]
-        The error message if there is any error.
-    time_cost : float
-        The time cost of build.
-    """
-
-    def __init__(self, filename, args, error_no, error_msg, time_cost):
-        filename = filename if filename else ""
-        error_msg = error_msg if error_msg else ""
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.BuildResult, filename, args, error_no, error_msg, time_cost
-        )
-
-
-@tvm._ffi.register_object("auto_scheduler.MeasureResult")
-class MeasureResult(Object):
-    """Store the results of a measurement.
-
-    Parameters
-    ----------
-    costs : List[float]
-        The time costs of execution.
-    error_no : int
-        The error code.
-    error_msg : Optional[str]
-        The error message if there is any error.
-    all_cost : float
-        The time cost of build and run.
-    timestamp : float
-        The time stamps of this measurement.
-    """
-
-    def __init__(self, costs, error_no, error_msg, all_cost, timestamp):
-        error_msg = error_msg if error_msg else ""
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.MeasureResult, costs, error_no, error_msg, all_cost, timestamp
-        )
-
-
-def recover_measure_input(inp, rebuild_state=False):
-    """
-    Recover a deserialized MeasureInput by rebuilding the missing fields.
-    1. Rebuid the compute_dag in inp.task
-    2. (Optional) Rebuild the stages in inp.state
-
-    Parameters
-    ----------
-    inp: MeasureInput
-        The deserialized MeasureInput
-    rebuild_state: bool = False
-        Whether rebuild the stages in MeasureInput.State
-
-    Returns
-    -------
-    new_input: MeasureInput
-        The fully recovered MeasureInput with all fields rebuilt.
-    """
-    # pylint: disable=import-outside-toplevel
-    from .search_task import SearchTask  # lazily import to avoid recursive dependency
-
-    task = inp.task
-    task.target, task.target_host = Target.canon_target_and_host(task.target, task.target_host)
-    new_task = SearchTask(
-        workload_key=task.workload_key,
-        target=task.target,
-        hardware_params=task.hardware_params,
-        layout_rewrite_option=task.layout_rewrite_option,
-        task_inputs=list(task.task_input_names),
-    )
-
-    if rebuild_state:
-        new_state = new_task.compute_dag.infer_bound_from_state(inp.state)
-    else:
-        new_state = inp.state
-
-    return MeasureInput(new_task, new_state)
-
-
-@tvm._ffi.register_object("auto_scheduler.ProgramBuilder")
-class ProgramBuilder(Object):
-    """The base class of ProgramBuilders."""
-
-    def build(self, measure_inputs, verbose=1):
-        """Build programs and return results.
-
-        Parameters
-        ----------
-        measure_inputs : List[MeasureInput]
-            A List of MeasureInput.
-        verbose: int = 1
-            Verbosity level. 0 for silent, 1 to output information during program building.
-
-        Returns
-        -------
-        res : List[BuildResult]
-        """
-        return _ffi_api.ProgramBuilderBuild(self, measure_inputs, verbose)
-
-
-@tvm._ffi.register_object("auto_scheduler.ProgramRunner")
-class ProgramRunner(Object):
-    """The base class of ProgramRunners."""
-
-    def run(self, measure_inputs, build_results, verbose=1):
-        """Run measurement and return results.
-
-        Parameters
-        ----------
-        measure_inputs : List[MeasureInput]
-            A List of MeasureInput.
-        build_results : List[BuildResult]
-            A List of BuildResult to be ran.
-        verbose: int = 1
-            Verbosity level. 0 for silent, 1 to output information during program running.
-
-        Returns
-        -------
-        res : List[MeasureResult]
-        """
-        return _ffi_api.ProgramRunnerRun(self, measure_inputs, build_results, verbose)
-
-
-@tvm._ffi.register_object("auto_scheduler.ProgramMeasurer")
-class ProgramMeasurer(Object):
-    """
-    Measurer that measures the time costs of tvm programs
-    This class combines ProgramBuilder and ProgramRunner, and provides a simpler API.
-
-    Parameters
-    ----------
-    builder : ProgramBuilder
-        The ProgramBuilder to build programs
-    runner : ProgramRunner
-        The ProgramRunner to measure programs.
-    callbacks : List[MeasureCallback]
-        Callbacks to be called after each measurement batch
-    verbose : int
-        The Verbosity level: 0 for silent, 1 to output information during program
-    max_continuous_error : Optional[int]
-        The number of allowed maximum continuous error before stop the tuning
-    """
-
-    def __init__(self, builder, runner, callbacks, verbose, max_continuous_error=None):
-        max_continuous_error = max_continuous_error or -1  # -1 means using the default value
-        self.__init_handle_by_constructor__(
-            _ffi_api.ProgramMeasurer, builder, runner, callbacks, verbose, max_continuous_error
-        )
-
-
-@tvm._ffi.register_object("auto_scheduler.LocalBuilder")
-class LocalBuilder(ProgramBuilder):
-    """LocalBuilder use local CPU cores to build programs in parallel.
-
-    Parameters
-    ----------
-    timeout : int = 15
-        The timeout limit (in second) for each build thread.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    n_parallel : int = multiprocessing.cpu_count()
-        Number of threads used to build in parallel.
-    build_func: callable or str = "default"
-        If is 'default', use default build function
-        If is 'ndk', use function for android ndk
-        If is callable, use it as custom build function, expect lib_format field.
-    """
-
-    def __init__(self, timeout=15, n_parallel=multiprocessing.cpu_count(), build_func="default"):
-        if build_func == "default":
-            BuildFunc.name = "default"
-            BuildFunc.build_func = tar.tar
-        elif build_func == "ndk":
-            BuildFunc.name = "ndk"
-            BuildFunc.build_func = ndk.create_shared
-        elif callable(build_func):
-            BuildFunc.name = "custom"
-            BuildFunc.build_func = build_func
-        else:
-            raise ValueError("Invalid build_func" + build_func)
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.LocalBuilder, timeout, n_parallel, BuildFunc.name
-        )
-
-
-@tvm._ffi.register_object("auto_scheduler.LocalRunner")
-class LocalRunner(ProgramRunner):
-    """LocalRunner that uses local CPU/GPU to measures the time cost of programs.
-
-    Parameters
-    ----------
-    timeout : int = 10
-        The timeout limit (in second) for each run.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    number : int = 3
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int = 1
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first "1" is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms : int = 100
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval : float = 0.0
-        The cool down interval between two measurements in seconds.
-    enable_cpu_cache_flush: bool = False
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    device: int = 0
-        Which device to run on if multiple are available.
-    """
-
-    def __init__(
-        self,
-        timeout=10,
-        number=3,
-        repeat=1,
-        min_repeat_ms=100,
-        cooldown_interval=0.0,
-        enable_cpu_cache_flush=False,
-        device=0,
-    ):
-        if enable_cpu_cache_flush:
-            number = 1
-            min_repeat_ms = 0
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.LocalRunner,
-            timeout,
-            number,
-            repeat,
-            min_repeat_ms,
-            cooldown_interval,
-            enable_cpu_cache_flush,
-            device,
-        )
-
-
-@tvm._ffi.register_object("auto_scheduler.RPCRunner")
-class RPCRunner(ProgramRunner):
-    """RPCRunner that uses RPC call to measures the time cost of programs on remote devices.
-    Or sometime we may need to use RPC even in local running to insulate the thread environment.
-    (e.g. running CUDA programs)
-
-    Parameters
-    ----------
-    key : str
-        The key of the device registered in the RPC tracker.
-    host : str
-        The host address of the RPC Tracker.
-    port : int
-        The port of RPC Tracker.
-    priority : int = 1
-        The priority of this run request, larger is more prior.
-    n_parallel : int = 1
-        The number of tasks run in parallel.
-    timeout : int = 10
-        The timeout limit (in second) for each run.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    number : int = 3
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int = 1
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first "1" is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms : int = 100
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval : float = 0.0
-        The cool down interval between two measurements in seconds.
-    enable_cpu_cache_flush: bool = False
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    device: int = 0
-        Which device to run on if multiple are available.
-    """
-
-    def __init__(
-        self,
-        key,
-        host,
-        port,
-        priority=1,
-        n_parallel=1,
-        timeout=10,
-        number=3,
-        repeat=1,
-        min_repeat_ms=100,
-        cooldown_interval=0.0,
-        enable_cpu_cache_flush=False,
-        device=0,
-    ):
-        self.__init_handle_by_constructor__(
-            _ffi_api.RPCRunner,
-            key,
-            host,
-            port,
-            priority,
-            n_parallel,
-            timeout,
-            number,
-            repeat,
-            min_repeat_ms,
-            cooldown_interval,
-            enable_cpu_cache_flush,
-            device,
-        )
-
-        if check_remote(key, host, port, priority, timeout):
-            print("Get devices for measurement successfully!")
-        else:
-            raise RuntimeError(
-                "Cannot get remote devices from the tracker. "
-                "Please check the status of tracker by "
-                "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
-                "and make sure you have free devices on the queue status."
-            )
-
-
-class LocalRPCMeasureContext:
-    """A context wrapper for running RPCRunner locally.
-    This will launch a local RPC Tracker and local RPC Server.
-
-    Parameters
-    ----------
-    priority : int = 1
-        The priority of this run request, larger is more prior.
-    n_parallel : int = 1
-        The number of tasks run in parallel.
-    timeout : int = 10
-        The timeout limit (in second) for each run.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    number : int = 3
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int = 1
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first "1" is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval : float = 0.0
-        The cool down interval between two measurements in seconds.
-    enable_cpu_cache_flush: bool = False
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    device: int = 0
-        Which device to run on if multiple are available.
-    """
-
-    def __init__(
-        self,
-        priority=1,
-        n_parallel=1,
-        timeout=10,
-        number=3,
-        repeat=1,
-        min_repeat_ms=0,
-        cooldown_interval=0.0,
-        enable_cpu_cache_flush=False,
-        device=0,
-    ):
-        # pylint: disable=import-outside-toplevel
-        from tvm.rpc.server import Server
-        from tvm.rpc.tracker import Tracker
-
-        self.tracker = Tracker(port=9000, port_end=10000, silent=True)
-        device_key = f"$local$device${self.tracker.port}"
-        self.server = Server(
-            port=self.tracker.port,
-            port_end=10000,
-            key=device_key,
-            silent=True,
-            tracker_addr=("127.0.0.1", self.tracker.port),
-        )
-        self.runner = RPCRunner(
-            device_key,
-            "127.0.0.1",
-            self.tracker.port,
-            priority,
-            n_parallel,
-            timeout,
-            number,
-            repeat,
-            min_repeat_ms,
-            cooldown_interval,
-            enable_cpu_cache_flush,
-            device,
-        )
-        # Wait for the processes to start
-        time.sleep(0.5)
-
-    def __del__(self):
-        # Close the tracker and server before exit
-        self.tracker.terminate()
-        self.server.terminate()
-        time.sleep(0.5)
-
-
-class MeasureErrorNo(object):
-    """Error type for MeasureResult."""
-
-    NO_ERROR = 0  # No error
-    INSTANTIATION_ERROR = 1  # Errors happen when apply transform steps from init state
-    COMPILE_HOST = 2  # Errors happen when compiling code on host (e.g., tvm.build)
-    COMPILE_DEVICE = 3  # Errors happen when compiling code on device
-    # (e.g. OpenCL JIT on the device)
-    RUNTIME_DEVICE = 4  # Errors happen when run program on device
-    WRONG_ANSWER = 5  # Answer is wrong when compared to a reference output
-    BUILD_TIMEOUT = 6  # Timeout during compilation
-    RUN_TIMEOUT = 7  # Timeout during run
-    UNKNOWN_ERROR = 8  # Unknown error
-
-
-def _local_build_worker(inp_serialized, build_func, verbose):
-    tic = time.time()
-    inp = MeasureInput.deserialize(inp_serialized)
-    task = inp.task
-    task.target, task.target_host = Target.canon_target_and_host(task.target, task.target_host)
-
-    error_no = MeasureErrorNo.NO_ERROR
-    error_msg = None
-    args = []
-
-    try:
-        sch, args = task.compute_dag.apply_steps_from_state(
-            inp.state, layout_rewrite=task.layout_rewrite_option
-        )
-    # pylint: disable=broad-except
-    except Exception:
-        error_no = MeasureErrorNo.INSTANTIATION_ERROR
-        error_msg = make_traceback_info()
-
-    if error_no == 0:
-        dirname = tempfile.mkdtemp()
-        filename = os.path.join(dirname, "tmp_func." + build_func.output_format)
-
-        try:
-            with transform.PassContext().current():
-                func = build_module.build(sch, args, target=task.target)
-            func.export_library(filename, fcompile=build_func)
-        # pylint: disable=broad-except
-        except Exception:
-            error_no = MeasureErrorNo.COMPILE_HOST
-            error_msg = make_traceback_info()
-    else:
-        filename = ""
-
-    if verbose >= 1:
-        if error_no == MeasureErrorNo.NO_ERROR:
-            print(".", end="", flush=True)
-        else:
-            print(".E", end="", flush=True)  # Build error
-
-    return filename, args, error_no, error_msg, time.time() - tic
-
-
-def local_build_worker(args):
-    """
-    Build function of LocalBuilder to be ran in the Builder thread pool.
-
-    Parameters
-    ----------
-    args: Tuple[MeasureInput, callable, int]
-        inputs, build-func, verbose args passed to local_builder_build
-
-    Returns
-    -------
-    res : BuildResult
-        The build result of this Builder thread.
-    """
-    inp, build_func, verbose = args
-
-    return _local_build_worker(inp, build_func, verbose)
-
-
-@tvm._ffi.register_func("auto_scheduler.local_builder.build")
-def local_builder_build(inputs, timeout, n_parallel, build_func="default", verbose=1):
-    """
-    Build function of LocalBuilder to build the MeasureInputs to runnable modules.
-
-    Parameters
-    ----------
-    inputs : List[MeasureInput]
-        The MeasureInputs to be built.
-    timeout : int
-        The timeout limit (in second) for each build thread.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    n_parallel : int
-        Number of threads used to build in parallel.
-    build_func : str = 'default'
-        The name of build function to process the built module.
-    verbose: int = 1
-        Verbosity level. 0 for silent, 1 to output information during program building.
-
-    Returns
-    -------
-    res : List[BuildResult]
-        The build results of these MeasureInputs.
-    """
-    assert build_func == BuildFunc.name, (
-        "BuildFunc.name: " + BuildFunc.name + ", but args is: " + build_func
-    )
-    executor = PopenPoolExecutor(
-        n_parallel, timeout, reset_global_scope, (AutotvmGlobalScope.current,)
-    )
-    tuple_res = executor.map_with_error_catching(
-        local_build_worker, [(i.serialize(), BuildFunc.build_func, verbose) for i in inputs]
-    )
-
-    results = []
-    for res in tuple_res:
-        if res.status == StatusKind.COMPLETE:
-            results.append(BuildResult(*res.value))
-        elif res.status == StatusKind.TIMEOUT:
-            if verbose >= 1:
-                print(".T", end="", flush=True)  # Build timeout
-            results.append(BuildResult(None, [], MeasureErrorNo.BUILD_TIMEOUT, None, timeout))
-        elif res.status == StatusKind.EXCEPTION:
-            if verbose >= 1:
-                print(".E", end="", flush=True)  # Build error
-            results.append(
-                BuildResult(None, [], MeasureErrorNo.COMPILE_HOST, repr(res.value), timeout)
-            )
-        else:
-            raise ValueError("Result status is not expected. Unreachable branch")
-
-    return results
-
-
-TASK_INPUT_CHECK_FUNC_REGISTRY = {}
-
-
-def register_task_input_check_func(func_name, f=None, override=False):
-    """Register a function that checks the input buffer map.
-
-    The input function should take a list of Tensor wich indicate the Input/output Tensor of a TVM
-    subgraph and return a Map from the input Tensor to its buffer name.
-
-    Parameters
-    ----------
-    func_name : Union[Function, str]
-        The check function that returns the compute declaration Tensors or its function name.
-    f : Optional[Function]
-        The check function to be registered.
-    override : boolean = False
-        Whether to override existing entry.
-
-    Examples
-    --------
-    .. code-block:: python
-
-      @auto_scheduler.register_task_input_check_func
-      def check_task_input_by_placeholder_name(args : List[Tensor]):
-          tensor_input_map = {}
-          for arg in args:
-              if isinstance(arg.op, tvm.te.PlaceholderOp):
-                  if arg.op.name != "placeholder":
-                      tensor_input_map[arg] = arg.op.name
-          return tensor_input_map
-    """
-    global TASK_INPUT_CHECK_FUNC_REGISTRY
-
-    if callable(func_name):
-        f = func_name
-        func_name = get_func_name(f)
-    if not isinstance(func_name, str):
-        raise ValueError("expect string function name")
-
-    def register(myf):
-        """internal register function"""
-        if func_name in TASK_INPUT_CHECK_FUNC_REGISTRY and not override:
-            raise RuntimeError(f"{func_name} has been registered already")
-        TASK_INPUT_CHECK_FUNC_REGISTRY[func_name] = myf
-        return myf
-
-    if f:
-        return register(f)
-    return register
-
-
-def prepare_input_map(args, workload_key=None):
-    """This function deals with special task inputs. Map the input Tensor of a TVM subgraph
-    to a specific buffer name in the global buffer map.
-
-    Parameters
-    ----------
-    args : List[Tensor]
-        Input/output Tensor of a TVM subgraph.
-
-    workload_key: Optional[str]
-        The workload for which these inputs are being prepared.  This
-        is used to identify if an input is being provided by (see
-        `register_task_input_buffer`).
-
-    Returns
-    -------
-    Dict[Tensor, str] :
-        Map from the input Tensor to its buffer name.
-
-    Notes
-    -----
-    The buffer name is specially designed, and these buffer should be provided in
-    `SearchTask(..., task_inputs={...})`.
-    """
-    # pylint: disable=import-outside-toplevel
-
-    global TASK_INPUT_CHECK_FUNC_REGISTRY
-
-    from .search_task import TASK_INPUT_BUFFER_TABLE
-
-    # A dict that maps the input tensor arg to a buffer name
-    tensor_input_map = {}
-
-    # Case 0: Check placeholder name
-    for arg in args:
-        if isinstance(arg.op, tvm.te.PlaceholderOp):
-            if (
-                workload_key
-                and workload_key in TASK_INPUT_BUFFER_TABLE
-                and arg.op.name in TASK_INPUT_BUFFER_TABLE[workload_key]
-            ):
-                tensor_input_map[arg] = arg.op.name
-
-    # Case 1: Check specific tensor inputs
-    for func_name in TASK_INPUT_CHECK_FUNC_REGISTRY:
-        func = TASK_INPUT_CHECK_FUNC_REGISTRY[func_name]
-        tensor_input_map.update(func(args))
-
-    return tensor_input_map
-
-
-def prepare_runner_args(inp, build_res):
-    """This function prepares the pre-defined arguments in `TASK_INPUT_BUFFER_TABLE` for local/rpc
-    runner in main process
-
-    Parameters
-    ----------
-    inp : MeasureInput
-        Measure input to be measured.
-
-    build_res : BuildResult
-        Build result to be measured.
-
-    Returns
-    -------
-    List[Optional[numpy.ndarray]] :
-        List of arguments for running the program. If the argument does not have a pre-defined input
-        buffer, None is added to the list as a placeholder.
-
-    """
-    # pylint: disable=import-outside-toplevel
-    from .search_task import get_task_input_buffer  # lazily import to avoid recursive dependency
-
-    task_input_names = inp.task.task_input_names
-    tensor_input_map = prepare_input_map(build_res.args, inp.task.workload_key)
-    if not task_input_names:
-        tensor_input_map = {}
-    args = []
-    task_inputs_count = 0
-    for arg in build_res.args:
-        if arg in tensor_input_map:
-            tensor_name = tensor_input_map[arg]
-            if tensor_name in task_input_names:
-                task_input_buffer = get_task_input_buffer(inp.task.workload_key, tensor_name)
-                # convert tvm.NDArray to picklable numpy.ndarray
-                args.append(task_input_buffer.numpy())
-                task_inputs_count += 1
-            else:
-                raise ValueError(
-                    f"{tensor_name} not found in task_inputs, "
-                    f"should provide with `SearchTask(..., task_inputs={{...}})`"
-                )
-        else:
-            args.append(None)
-    if task_inputs_count != len(task_input_names):
-        raise RuntimeError("task_inputs not fully matched, check if there's any unexpected error")
-    return args
-
-
-def _timed_eval_func(
-    inp_serialized,
-    build_res,
-    args,
-    number,
-    repeat,
-    min_repeat_ms,
-    cooldown_interval,
-    enable_cpu_cache_flush,
-    verbose,
-    device,
-):
-    inp = MeasureInput.deserialize(inp_serialized)
-    tic = time.time()
-    error_no = 0
-    error_msg = None
-    try:
-        func = module.load_module(build_res.filename)
-        dev = ndarray.device(str(inp.task.target), device)
-        # Limitation:
-        # We can not get PackFunction directly in the remote mode as it is wrapped
-        # under the std::function. We could lift the restriction later once we fold
-        # the PackedFunc as an object. Currently, we pass function name to work
-        # around it.
-        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
-        time_f = func.time_evaluator(
-            func.entry_name,
-            dev,
-            number=number,
-            repeat=repeat,
-            min_repeat_ms=min_repeat_ms,
-            f_preproc=f_prepare,
-        )
-    # pylint: disable=broad-except
-    except Exception:
-        costs = (MAX_FLOAT,)
-        error_no = MeasureErrorNo.COMPILE_DEVICE
-        error_msg = make_traceback_info()
-
-    if error_no == 0:
-        try:
-            random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True)
-            assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake"
-            assert len(args) == len(build_res.args)
-            loc_args = []
-            # pylint: disable=consider-using-enumerate
-            for idx in range(len(args)):
-                if args[idx] is None:
-                    build_res_arg = build_res.args[idx]
-                    empty_array = ndarray.empty(
-                        get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev
-                    )
-                    random_fill(empty_array)
-                    loc_args.append(empty_array)
-                else:
-                    loc_args.append(ndarray.array(args[idx], dev))
-            dev.sync()
-            costs = time_f(*loc_args).results
-        # pylint: disable=broad-except
-        except Exception:
-            costs = (MAX_FLOAT,)
-            error_no = MeasureErrorNo.RUNTIME_DEVICE
-            error_msg = make_traceback_info()
-
-    shutil.rmtree(os.path.dirname(build_res.filename))
-    toc = time.time()
-    time.sleep(cooldown_interval)
-
-    if verbose >= 1:
-        if error_no == MeasureErrorNo.NO_ERROR:
-            print("*", end="", flush=True)
-        else:
-            print("*E", end="", flush=True)  # Run error
-    return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc
-
-
-@tvm._ffi.register_func("auto_scheduler.local_runner.run")
-def local_run(
-    inputs,
-    build_results,
-    timeout=10,
-    number=3,
-    repeat=1,
-    min_repeat_ms=0,
-    cooldown_interval=0,
-    enable_cpu_cache_flush=False,
-    verbose=1,
-    device=0,
-):
-    """
-    Run function of LocalRunner to test the performance of the input BuildResults.
-
-    Parameters
-    ----------
-    inputs : List[MeasureInput]
-        The MeasureInputs to be measured.
-    build_results : List[BuildResult]
-        The BuildResults to be measured.
-    timeout : int = 10
-        The timeout limit (in second) for each run.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    number : int = 3
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int = 1
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first "1" is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval : float = 0.0
-        The cool down interval between two measurements in seconds.
-    enable_cpu_cache_flush: bool = False
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    verbose: int = 1
-        Verbosity level. 0 for silent, 1 to output information during program measuring.
-    device: int = 0
-        Which device to run on if multiple are available.
-
-    Returns
-    -------
-    res : List[MeasureResult]
-        The measure results of these MeasureInputs.
-    """
-
-    measure_results = []
-    assert len(inputs) == len(build_results), "Measure input size should be equal to build results"
-    worker = PopenWorker()
-    for inp, build_res in zip(inputs, build_results):
-        if build_res.error_no != 0:
-            res = (
-                (MAX_FLOAT,),
-                build_res.error_no,
-                build_res.error_msg,
-                build_res.time_cost,
-                time.time(),
-            )
-        else:
-            args = prepare_runner_args(inp, build_res)
-            res = call_func_with_timeout(
-                worker,
-                timeout,
-                _timed_eval_func,
-                args=(
-                    inp.serialize(),
-                    build_res,
-                    args,
-                    number,
-                    repeat,
-                    min_repeat_ms,
-                    cooldown_interval,
-                    enable_cpu_cache_flush,
-                    verbose,
-                    device,
-                ),
-            )
-            if isinstance(res, TimeoutError):
-                if verbose >= 1:
-                    print("*T", end="", flush=True)  # Run timeout
-                res = (
-                    (MAX_FLOAT,),
-                    MeasureErrorNo.RUN_TIMEOUT,
-                    None,
-                    build_res.time_cost + timeout,
-                    time.time(),
-                )
-            elif isinstance(res, Exception):
-                if verbose >= 1:
-                    print("*E", end="", flush=True)  # Run error
-                res = (
-                    (MAX_FLOAT,),
-                    MeasureErrorNo.RUNTIME_DEVICE,
-                    str(res),
-                    build_res.time_cost + timeout,
-                    time.time(),
-                )
-
-        measure_results.append(MeasureResult(*res))
-
-    if verbose >= 1:
-        print("", flush=True)
-
-    return measure_results
-
-
-def _rpc_run(
-    inp_serialized,
-    build_res,
-    args,
-    key,
-    host,
-    port,
-    priority,
-    timeout,
-    number,
-    repeat,
-    min_repeat_ms,
-    cooldown_interval,
-    enable_cpu_cache_flush,
-    verbose,
-    device,
-):
-    inp = MeasureInput.deserialize(inp_serialized)
-    tic = time.time()
-    error_no = 0
-    error_msg = None
-    try:
-        # upload built module
-        remote = request_remote(key, host, port, priority, timeout)
-        remote.upload(build_res.filename)
-        func = remote.load_module(os.path.split(build_res.filename)[1])
-        dev = remote.device(str(inp.task.target), device)
-        # Limitation:
-        # We can not get PackFunction directly in the remote mode as it is wrapped
-        # under the std::function. We could lift the restriction later once we fold
-        # the PackedFunc as an object. Currently, we pass function name to work
-        # around it.
-        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
-        time_f = func.time_evaluator(
-            func.entry_name,
-            dev,
-            number=number,
-            repeat=repeat,
-            min_repeat_ms=min_repeat_ms,
-            f_preproc=f_prepare,
-        )
-    # pylint: disable=broad-except
-    except Exception:
-        costs = (MAX_FLOAT,)
-        error_no = MeasureErrorNo.COMPILE_DEVICE
-        error_msg = make_traceback_info()
-
-    if error_no == 0:
-        try:
-            stream = dev.create_raw_stream()
-            dev.set_raw_stream(stream)
-            random_fill = remote.get_function("tvm.contrib.random.random_fill")
-            assert (
-                random_fill
-            ), "Please make sure USE_RANDOM is ON in the config.cmake on the remote devices"
-
-            assert len(args) == len(build_res.args)
-            loc_args = []
-            # pylint: disable=consider-using-enumerate
-            for idx in range(len(args)):
-                if args[idx] is None:
-                    build_res_arg = build_res.args[idx]
-                    empty_array = ndarray.empty(
-                        get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev
-                    )
-                    random_fill(empty_array)
-                    loc_args.append(empty_array)
-                else:
-                    loc_args.append(ndarray.array(args[idx], dev))
-            dev.sync()
-
-            # First run for check that the kernel is correct
-            func.entry_func(*loc_args)
-            dev.sync()
-
-            costs = time_f(*loc_args).results
-
-            # clean up remote files
-            remote.remove(build_res.filename)
-            remote.remove(os.path.splitext(build_res.filename)[0] + ".so")
-            remote.remove("")
-            dev.free_raw_stream(stream)
-        # pylint: disable=broad-except
-        except Exception:
-            dev.free_raw_stream(stream)
-            costs = (MAX_FLOAT,)
-            error_no = MeasureErrorNo.RUNTIME_DEVICE
-            error_msg = make_traceback_info()
-
-    shutil.rmtree(os.path.dirname(build_res.filename))
-    toc = time.time()
-
-    time.sleep(cooldown_interval)
-    if verbose >= 1:
-        if error_no == MeasureErrorNo.NO_ERROR:
-            print("*", end="")
-        else:
-            print("*E", end="")  # Run error
-
-    return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc
-
-
-def _rpc_run_worker(args):
-    """Function to be ran in the RPCRunner thread pool.
-
-    Parameters
-    ----------
-    args : Tuple[MeasureInput, BuildResult, ...]
-        Single input and build result plus the rest of the arguments to `rpc_runner_run`.
-
-    Returns
-    -------
-    res : MeasureResult
-        The measure result of this Runner thread.
-    """
-    _, build_res, _, _, _, _, _, timeout, _, _, _, _, _, verbose, _ = args
-    if build_res.error_no != MeasureErrorNo.NO_ERROR:
-        return (
-            (MAX_FLOAT,),
-            build_res.error_no,
-            build_res.error_msg,
-            build_res.time_cost,
-            time.time(),
-        )
-
-    try:
-        res = _rpc_run(*args)
-    # pylint: disable=broad-except
-    except Exception:
-        if verbose >= 1:
-            print("*E", end="")  # Run error
-        res = (
-            (MAX_FLOAT,),
-            MeasureErrorNo.RUNTIME_DEVICE,
-            make_traceback_info(),
-            build_res.time_cost + timeout,
-            time.time(),
-        )
-
-    return res
-
-
-@tvm._ffi.register_func("auto_scheduler.rpc_runner.run")
-def rpc_runner_run(
-    inputs,
-    build_results,
-    key,
-    host,
-    port,
-    priority=1,
-    n_parallel=1,
-    timeout=10,
-    number=3,
-    repeat=1,
-    min_repeat_ms=0,
-    cooldown_interval=0.0,
-    enable_cpu_cache_flush=False,
-    verbose=1,
-    device=0,
-):
-    """Run function of RPCRunner to test the performance of the input BuildResults.
-
-    Parameters
-    ----------
-    inputs : List[MeasureInput]
-        The MeasureInputs to be measured.
-    build_results : List[BuildResult]
-        The BuildResults to be measured.
-    key : str
-        The key of the device registered in the RPC tracker.
-    host : str
-        The host address of the RPC Tracker.
-    port : int
-        The port of RPC Tracker.
-    priority : int = 1
-        The priority of this run request, larger is more prior.
-    n_parallel : int = 1
-        The number of tasks run in parallel.
-    timeout : int = 10
-        The timeout limit (in second) for each run.
-        This is used in a wrapper of the multiprocessing.Process.join().
-    number : int = 3
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int = 1
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first "1" is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval : float = 0.0
-        The cool down interval between two measurements in seconds.
-    enable_cpu_cache_flush: bool = False
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    verbose: int = 1
-        Verbosity level. 0 for silent, 1 to output information during program measuring.
-    device: int = 0
-        Which device to run on if multiple are available.
-
-    Returns
-    -------
-    res : List[MeasureResult]
-        The measure results of these MeasureInputs.
-    """
-    assert len(inputs) == len(build_results), "Measure input size should be equal to build results"
-    # This pool is not doing computationally intensive work, so we can use threads
-    executor = PopenPoolExecutor(n_parallel)
-    tuple_res = executor.map_with_error_catching(
-        _rpc_run_worker,
-        [
-            (
-                inp.serialize(),
-                build_res,
-                prepare_runner_args(inp, build_res),
-                key,
-                host,
-                port,
-                priority,
-                timeout,
-                number,
-                repeat,
-                min_repeat_ms,
-                cooldown_interval,
-                enable_cpu_cache_flush,
-                verbose,
-                device,
-            )
-            for inp, build_res in zip(inputs, build_results)
-        ],
-    )
-
-    results = []
-    for i, res in enumerate(tuple_res):
-        if res.status == StatusKind.COMPLETE:
-            results.append(MeasureResult(*res.value))
-        else:
-            assert res.status == StatusKind.TIMEOUT
-            if verbose >= 1:
-                print("*T", end="")  # Run timeout
-            build_res = build_results[i]
-            results.append(
-                MeasureResult(
-                    (MAX_FLOAT,),
-                    MeasureErrorNo.RUN_TIMEOUT,
-                    None,
-                    build_res.time_cost + timeout,
-                    time.time(),
-                )
-            )
-
-    if verbose >= 1:
-        print("")
-
-    return results
diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py
deleted file mode 100644
index aaa8fdcd9138..000000000000
--- a/python/tvm/auto_scheduler/measure_record.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, pointless-string-statement
-
-""" Serialization and other I/O support for measurement records (tuning logs). """
-import argparse
-import logging
-import os
-import itertools
-
-import numpy as np
-
-import tvm._ffi
-from tvm.runtime import Object
-from .measure import MeasureErrorNo, MeasureCallback
-from .utils import calc_workload_dis_factor, decode_workload_key
-from . import _ffi_api
-
-logger = logging.getLogger("auto_scheduler")
-
-
-@tvm._ffi.register_object("auto_scheduler.RecordToFile")
-class RecordToFile(MeasureCallback):
-    """
-    A measurement callback that writes measurement records into a file.
-
-    Parameters
-    ----------
-    filename : str
-        File name for this callback to write log to.
-    """
-
-    def __init__(self, filename):
-        dirname = os.path.dirname(os.path.abspath(filename))
-        if not os.path.exists(dirname):
-            os.makedirs(dirname)
-        self.__init_handle_by_constructor__(_ffi_api.RecordToFile, filename)
-
-
-@tvm._ffi.register_object("auto_scheduler.RecordReader")
-class RecordReader(Object):
-    """
-    Reader of the json log file.
-
-    Parameters
-    ----------
-    filename : str
-        File name for this reader to load log from.
-    """
-
-    def __init__(self, filename):
-        if not os.path.exists(filename):
-            logger.warning("%s does not exist!", filename)
-        # a set to prevent print duplicated message
-        self.messages = set()
-        self.__init_handle_by_constructor__(_ffi_api.RecordReader, filename)
-
-    def check_workload_key(self, inputs):
-        """Check and throw warnings for records with old format workload key.
-
-        Parameters
-        ----------
-        inputs: List[MeasureInput]
-            The measure inputs to be checked.
-
-        Notes
-        -----
-        This checker could be deprecated in the future.
-        """
-        for inp in inputs:
-            _, args = decode_workload_key(inp.task.workload_key)
-            if args is None:
-                continue
-            if not args:
-                msg = (
-                    "MeasureInput with old format workload key %s should be updated "
-                    "using the script from https://github.com/apache/tvm/pull/7317."
-                    % inp.task.workload_key
-                )
-                if msg not in self.messages:
-                    self.messages.add(msg)
-                    logger.warning(msg)
-
-    def read_lines(self, max_lines=None, skip_lines=0):
-        """Read multiple lines from the log file.
-
-        Parameters
-        ----------
-        max_lines : Optional[int]
-            The maximum number of lines. None to read all lines.
-        skip_lines : int = 0
-            Skip the first n lines.
-
-        Returns
-        -------
-        inputs : List[auto_scheduler.measure.MeasureInput]
-            The MeasureInputs loaded from the log file.
-        results : List[auto_scheduler.measure.MeasureResult]
-            The MeasureResults loaded from the log file.
-
-        Notes
-        -----
-        Some unimportant and expensive fields in the returned MeasureInput are not deserialized
-        for faster read speed (e.g. input.task.compute_dag, input.state.stages).
-        If you want to use them, you can call the :code:`recover_measure_input` below
-        to rebuild these fields.
-        """
-        inputs, results = _ffi_api.RecordReaderReadLines(
-            self, max_lines if max_lines else -1, skip_lines
-        )
-        self.check_workload_key(inputs)
-        return inputs, results
-
-    def __iter__(self):
-        while True:
-            ret = _ffi_api.RecordReaderReadNext(self)
-            if not ret:
-                break
-            self.check_workload_key([ret[0]])
-            yield ret[0], ret[1]  # (input, result)
-
-
-def load_record_from_string(record):
-    """
-    Load the measure record from string.
-
-    Parameters
-    ----------
-    record: str
-        A record string, including the serialized MeausreInput and MeasureResult.
-
-    Returns
-    -------
-    ret: Tuple[MeasureInput, MeasureResult]
-        A tuple of MeasureInput, MeasureResult.
-    """
-    return _ffi_api.ReadMeasureRecord(record)
-
-
-def dump_record_to_string(inp, res):
-    """
-    Dump the measure record to a string.
-
-    Parameters
-    ----------
-    inp: MeasureInput
-        The measure input.
-
-    res: MeasureResult
-        The measure result.
-
-    Returns
-    -------
-    ret: str
-        The dumped string.
-    """
-    return _ffi_api.WriteMeasureRecords(inp, res)
-
-
-def load_records(filename):
-    """
-    Load measurement records from a file.
-
-    Parameters
-    ----------
-    filename : str
-        File name to load log from.
-
-    Returns
-    -------
-    logs : List[auto_scheduler.measure.MeasureInput, auto_scheduler.measure.MeasureResult]
-
-    Notes
-    -----
-    Some unimportant and expensive fields in the returned MeasureInput are not deserialized
-    for faster read speed (e.g., input.task.compute_dag, input.state.stages).
-    If you want to use them, you can call the :code:`recover_measure_input` below
-    to rebuild these fields.
-    """
-    return zip(*RecordReader(filename).read_lines())
-
-
-def save_records(filename, inputs, results):
-    """
-    Append measure records to file.
-
-    Parameters
-    ----------
-    filename : str
-        File name to write log to.
-    inputs: List[MeasureInputs]
-        The MeasureInputs to be written.
-    results: List[MeasureResults]
-        The MeasureResults to be written.
-    """
-    dirname = os.path.dirname(os.path.abspath(filename))
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
-    _ffi_api.SaveRecords(filename, inputs, results)
-
-
-def load_best_record(filename, workload_key=None, target=None, include_compatible=False):
-    """Return the best measurement pair form a log file. This may return none results if
-    there is no legal measure pair with the specified workload_key/target found from the log file.
-
-    Parameters
-    ----------
-    filename : str
-        File name to load log from.
-    workload_key : Optional[str]
-        The workload key of the compute declaration.
-        With `None`, this returns the best measure pair of all workloads.
-    target : Optional[tvm.target.Target]
-        The target device.
-        With `None`, this returns the best measure pair of all target devices.
-    include_compatible: bool
-        When set to True, all compatible records in the log file will be considered.
-
-    Returns
-    -------
-    input : auto_scheduler.measure.MeasureInput
-        The best State's MeasureInput from this log fine.
-    result : auto_scheduler.measure.MeasureResult
-        The best State's MeasureResult from this log fine.
-    """
-    log_reader = RecordReader(filename)
-    best_cost = 1e30
-    best_inp = None
-    best_res = None
-
-    for inp, res in log_reader:
-        if res.error_no != MeasureErrorNo.NO_ERROR:
-            continue
-        if target and inp.task.target.kind.name != target.kind.name:
-            continue
-
-        costs = [v.value for v in res.costs]
-        cost = np.mean(costs)
-
-        if workload_key is not None:
-            dis_f = calc_workload_dis_factor(
-                decode_workload_key(workload_key), decode_workload_key(inp.task.workload_key)
-            )
-            if dis_f == float("inf"):
-                continue
-            if not include_compatible and dis_f != 1:
-                continue
-
-            # Since different workloads have different FLOPS, we multiply the factor to
-            # eliminate this difference, which is basically the concept of throughput.
-            cost *= dis_f
-
-        if cost < best_cost:
-            best_cost = cost
-            best_inp = inp
-            best_res = res
-
-    return best_inp, best_res
-
-
-def distill_record_file(in_file, out_file):
-    """
-    Pick the best entries from a record file and store them to another file.
-    This function distills the useful log entries from a large log file.
-    If out_file already exists, the best entries from both
-    in_file and out_file will be saved.
-
-    Parameters
-    ----------
-    in_file: str
-        The filename of input
-    out_file: str or file
-        The filename of output
-    """
-    # pylint: disable=import-outside-toplevel
-    from .dispatcher import ApplyHistoryBest
-
-    context = load_records(in_file)
-
-    dirname = os.path.dirname(os.path.abspath(out_file))
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
-
-    if os.path.isfile(out_file):
-        out_context = load_records(out_file)
-        context = itertools.chain(context, out_context)
-
-    def measure_input_str_key(inp):
-        return _ffi_api.SerializeMeasureInput(inp)
-
-    # Dict[target key,
-    #   Dict[workload hash,
-    #     Dict[workload args, (cost, (MeasureInput, MeasureResult))]]]
-    # Full type: Dict[str, Dict[str, Dict[Tuple, Tuple[float, Tuple[Measureinput, MeasureResult]]]]]
-    best_records = {}
-
-    for inp, res in context:
-        if res.error_no != 0:
-            continue
-
-        # Keep the best record for each target and workload.
-        costs = [x.value for x in res.costs if isinstance(x, tvm.tir.expr.FloatImm)]
-        cost = np.mean(costs)
-        for k in inp.task.target.keys:
-            entry, _, workload_args = ApplyHistoryBest.get_workload_entry(
-                best_records, k, inp.task.workload_key
-            )
-            if workload_args not in entry or cost < entry[workload_args][0]:
-                entry[workload_args] = (cost, (inp, res))
-
-    # Remove duplications by multiple target keys.
-    out_records = {}
-    for target_entry in best_records.values():
-        for workload_entry in target_entry.values():
-            for _, (inp, res) in workload_entry.values():
-                out_records[measure_input_str_key(inp)] = (inp, res)
-
-    inputs = []
-    results = []
-    for inp, res in out_records.values():
-        inputs.append(inp)
-        results.append(res)
-
-    # create a new file and save the best records
-    open(out_file, "w")
-    save_records(out_file, inputs, results)
-    logger.info("Extract %d best records from %s to %s", len(inputs), in_file, out_file)
-
-
-def main():
-    """The main function for CLI."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--mode", choices=["distill"], default="distill")
-    parser.add_argument("-i", "--input", type=str, help="input file")
-    parser.add_argument("-o", "--output", type=str, default=None, help="output file")
-
-    args = parser.parse_args()
-    logging.basicConfig()
-    logger.setLevel(logging.INFO)
-
-    if args.mode == "distill":
-        args.output = args.output or args.input + ".best.json"
-        distill_record_file(args.input, args.output)
-
-
-"""
-Usage:
-* Distill the best entries from a large log file
-e.g. python -m tvm.auto_scheduler.measure_record --mode distill -i input.json
-"""
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
deleted file mode 100644
index 973cbf19bece..000000000000
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ /dev/null
@@ -1,493 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-variable,invalid-name
-
-"""
-Integrate auto_scheduler into relay. It implements the following items:
-1. Extract search tasks from a relay program
-2. Provide auto-scheduling for all TOPI compute functions
-"""
-
-import json
-import logging
-import threading
-import traceback
-
-import tvm
-from tvm import autotvm, transform
-from tvm._ffi.base import TVMError
-from tvm.ir.transform import PassContext
-from tvm.runtime import convert_to_object
-from tvm.target import Target
-from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor
-from tvm.tir import Reduce
-from tvm.tir import expr as _expr
-
-from . import _ffi_api
-from .compute_dag import ComputeDAG, LayoutRewriteOption
-from .dispatcher import DispatchContext
-from .search_task import SearchTask
-from .utils import get_const_tuple
-from .workload_registry import register_workload_tensors
-
-logger = logging.getLogger("auto_scheduler")
-
-
-def call_all_topi_funcs(mod, params, target, error_list, opt_level=3):
-    """Call all TOPI compute to extract auto_scheduler tasks in a Relay program"""
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    # Turn off AutoTVM config not found warnings
-    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
-    autotvm.GLOBAL_SCOPE.silent = True
-
-    with transform.PassContext(
-        opt_level=opt_level,
-        config={
-            "relay.backend.use_auto_scheduler": True,
-        },
-        disabled_pass={"AutoSchedulerLayoutRewrite"},
-    ):
-        compiler = relay.vm.VMCompiler()
-        if params:
-            compiler.set_params(params)
-        mod = tvm.IRModule.from_expr(mod) if isinstance(mod, relay.Function) else mod
-        try:
-            compiler.lower(mod, target)
-        except TVMError:
-            error_list.append(f"{traceback.format_exc()}")
-        finally:
-            autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent
-
-
-def extract_tasks(
-    mod,
-    params,
-    target,
-    target_host=None,
-    hardware_params=None,
-    include_simple_tasks=False,
-    dump_workload_to_dag_log=None,
-    opt_level=3,
-    other_targets=None,
-):
-    """Extract tuning tasks from a relay program.
-
-    Parameters
-    ----------
-    mod: tvm.IRModule or relay.function.Function
-        The module or function to tune
-    params: dict of str to numpy array
-        The associated parameters of the program
-    target: Union[tvm.target.Target, str]
-        The compilation target
-    target_host: Optional[Union[tvm.target.Target, str]]
-        The host compilation target
-    hardware_params : Optional[HardwareParams]
-        Hardware parameters used for the search tasks
-    include_simple_tasks: bool
-        Whether to extract simple tasks that do not include complicated ops.
-    dump_workload_to_dag_log: Optional[str]
-        A file to dump an association between the workload keys and the actual DAG
-    opt_level : Optional[int]
-        The optimization level of the task extractions.
-    other_targets: Optional[List[tvm.target.Target]]
-        Other targets for call_all_topi_funcs, e.g., cutlass target.
-
-    Returns
-    -------
-    tasks: List[SearchTask]
-        The tasks in this network
-    weights: List[int]
-        The weight (i.e. the number of appearance) of extracted tasks
-    """
-    # pylint: disable=import-outside-toplevel
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    # Run the compiler to collect all TOPI calls during compilation.
-    env = TracingEnvironment(
-        TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY
-    )
-
-    dispatch_ctx = DispatchContext.current
-    old_verbose = dispatch_ctx.verbose
-    dispatch_ctx.verbose = 0
-
-    targets = [target]
-    if other_targets is not None:
-        targets += other_targets
-    errors = []
-    with env:
-        # Wrap build call in a new thread to avoid the conflict
-        # between python's multiprocessing and tvm's thread pool
-        build_thread = threading.Thread(
-            target=call_all_topi_funcs, args=(mod, params, targets, errors, opt_level)
-        )
-        build_thread.start()
-        build_thread.join()
-
-    if errors:
-        error_strings = ["Task extraction had the following errors:"] + errors
-        raise TVMError("\n".join(error_strings))
-
-    dispatch_ctx.verbose = old_verbose
-
-    # create search tasks
-    tasks = []
-    weights = []
-    for wkl_key, (weight, func_names) in env.wkl_key_to_weight.items():
-        tasks.append(
-            SearchTask(
-                workload_key=wkl_key,
-                target=target,
-                hardware_params=hardware_params,
-                # When auto scheduler is used in end to end network, try to apply layout rewrite
-                # to improve the overall performance
-                layout_rewrite_option=LayoutRewriteOption.get_target_default(target, True),
-                task_inputs=(
-                    env.wkl_key_to_input_names[wkl_key]
-                    if wkl_key in env.wkl_key_to_input_names
-                    else None
-                ),
-                task_inputs_save_to_file=True,
-                desc=",".join(func_names),
-            )
-        )
-        weights.append(int(weight))
-
-    if dump_workload_to_dag_log is not None:
-        with open(dump_workload_to_dag_log, "w") as f:
-            json.dump({task.workload_key: str(task.compute_dag) for task in tasks}, f)
-
-    return tasks, weights
-
-
-class TracingMode:
-    """Two modes for tracing"""
-
-    EXTRACT_TASK = 0  # trace all topi calls to extract tasks
-    # same as EXTRACT_TASK but ignore the task without complex ops
-    EXTRACT_COMPLEX_TASK_ONLY = 1
-    PREPARE_LAYOUT_REWRITE = 2  # trace topi calls to prepare layout rewrite
-
-
-class TracingEnvironment:
-    """Global environment for tracing all topi function calls"""
-
-    current = None
-
-    def __init__(self, tracing_mode):
-        self.tracing_mode = tracing_mode
-        self.relay_disable_build_cache = "false"
-        self.func_name_to_wkl_key = {}
-        self.wkl_key_to_weight = {}
-        self.wkl_key_to_input_names = {}
-
-    def __enter__(self):
-        TracingEnvironment.current = self
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        TracingEnvironment.current = None
-
-    def add_workload_key(self, func_name, workload_key):
-        """Add the workload key of a search task.
-
-        Parameters
-        ----------
-        func_name: str
-            The function name of the task.
-
-        workload_key: str
-            The workload key of a task.
-        """
-        self.func_name_to_wkl_key[func_name] = workload_key
-        if workload_key not in self.wkl_key_to_weight:
-            self.wkl_key_to_weight[workload_key] = (0, set())
-        weight, func_names = self.wkl_key_to_weight[workload_key]
-        func_names.add(func_name)
-        self.wkl_key_to_weight[workload_key] = (weight + 1, func_names)
-
-    def add_workload_input_names(self, workload_key, input_names):
-        """Add special task inputs to this workload.
-
-        Parameters
-        ----------
-        workload_key : str
-            The workload key of a task.
-
-        input_names : List[str]
-            A list of input names.
-        """
-        self.wkl_key_to_input_names[workload_key] = input_names
-
-
-@tvm._ffi.register_func("auto_scheduler.enter_layout_rewrite")
-def enter_layout_rewrite():
-    """Enter layout rewrite tracing environment"""
-    env = TracingEnvironment(TracingMode.PREPARE_LAYOUT_REWRITE)
-    env.__enter__()
-
-
-@tvm._ffi.register_func("auto_scheduler.exit_layout_rewrite")
-def exit_layout_rewrite():
-    """Exit layout rewrite tracing environment"""
-    env = TracingEnvironment.current
-    env.__exit__(None, None, None)
-
-
-def traverse_to_get_io_tensors(outs):
-    """Traverse from a list of output tensors to get input/output tensors and
-    other useful information.
-
-    Parameters
-    ----------
-    outs: List[Tensor]
-        The output tensors
-
-    Returns
-    -------
-    io_tensors: List[Tensor]
-        The input and output tensors with static shape
-    has_layout_free: bool
-        Whether the compute DAG has layout_free placeholders
-    has_complex_op: bool
-        Whether the topi compute function includes at least one complex (reduce) op
-    """
-    layout_free_ops = []
-    inputs = []
-
-    has_complex_op = False
-    visited = set()
-
-    def traverse(t):
-        nonlocal has_complex_op
-
-        # We cannot directly add tensors to the set, because the comparison of
-        # two tensors with ndim=0 is ambiguous.
-        assert t.handle is not None
-        if t.handle.value in visited:
-            return
-        if isinstance(t.op, PlaceholderOp):
-            inputs.append(t)
-        elif isinstance(t.op, ComputeOp):
-            has_complex_op = has_complex_op or any([isinstance(e, Reduce) for e in t.op.body])
-            if "layout_free_placeholders" in t.op.attrs:
-                layout_free_ops.append(t.op)
-            for x in t.op.input_tensors:
-                traverse(x)
-        visited.add(t.handle.value)
-
-    for t in outs:
-        traverse(t)
-
-    io_tensors = inputs + list(outs)
-    for tensor in io_tensors:
-        # Reject the compute if any of its I/O tensors has dynamic shape.
-        if any([not isinstance(v, int) for v in get_const_tuple(tensor.shape)]):
-            return ([], False, False)
-
-    return (io_tensors, len(layout_free_ops) > 0, has_complex_op)
-
-
-@tvm._ffi.register_func("auto_scheduler.relay_integration.auto_schedule_topi_compute")
-def auto_schedule_topi(func_name, outs):
-    """Use auto-scheduler to schedule any topi compute function.
-
-    Note: This is used internally for relay integration. Do
-    not use this as a general user-facing API.
-
-    Parameters
-    ----------
-    func_name: str
-        The name of the function being scheduled.
-
-    outs: List[Tensor]
-        The output tensors of topi compute functions
-
-    Returns
-    -------
-    sch: Optional[te.Schedule]
-        A tuned schedule or none (if not tuned) in the final build mode;
-        None in the tracing mode so that the fallback topi schedule will be used.
-    """
-
-    # pylint: disable=import-outside-toplevel
-    from tvm.auto_scheduler.measure import (  # lazily import to avoid recursive dependency
-        prepare_input_map,
-    )
-
-    io_tensors, has_layout_free, has_complex_op = traverse_to_get_io_tensors(outs)
-    if not io_tensors:  # The compute includes dynamic shapes which are not supported yet.
-        return None
-
-    try:
-        dag = ComputeDAG(io_tensors)
-    except tvm.error.TVMError as err:
-        logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err))
-        return None
-
-    workload_key = dag.workload_key()
-    key = register_workload_tensors(workload_key, io_tensors)
-    target = tvm.target.Target.current()
-
-    dispatch_ctx = DispatchContext.current
-    state = dispatch_ctx.query(target, key, has_complex_op, dag, func_name)
-    schedule = None
-
-    env = TracingEnvironment.current
-    if env is None:
-        # in the final build mode
-        if state is None:
-            return None
-
-        schedule, _ = dag.apply_steps_from_state(state)
-        return schedule
-
-    if env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]:
-        # in the task extraction mode
-        if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK:
-            env.add_workload_key(func_name, key)
-            input_map = prepare_input_map(io_tensors, workload_key)
-            if input_map:
-                env.add_workload_input_names(key, list(input_map.values()))
-    elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
-        # in prepare_layout_rewrite mode
-        if (
-            LayoutRewriteOption.get_target_default(target, True) != LayoutRewriteOption.NO_REWRITE
-            and has_layout_free
-        ):
-            if state is None:
-                return None
-
-            # rewrite the layout and update the context for the new dag
-            new_dag = dag.rewrite_layout_from_state(state)
-            new_key = new_dag.workload_key()
-            if new_key != key:
-                dispatch_ctx.update(target, new_key, state)
-    else:
-        raise ValueError("Invalid tracing mode: " + env.tracing_mode)
-
-    return schedule
-
-
-@tvm._ffi.register_func("auto_scheduler.relay_integration.te_compiler_update_weights")
-def te_compiler_update_weights(function_weights):
-    """A callback for updating the weights of extracted tasks. When using the TE compiler
-    that avoids compiling the same function multiple times by caching, all extracted tasks
-    have weight 1, so the TE compiler invokes this callback at the end. In this case,
-    we override existing weights with the use_count in TE compiler cache.
-
-    Parameters
-    ----------
-    function_weights: Dict[str, int]
-        Mapping from function names to their weights.
-    """
-    env = TracingEnvironment.current
-    if env is not None:
-        # Override this map with the weights in the TE compiler.
-        env.wkl_key_to_weight = {}
-
-        for func_name, weight in function_weights.items():
-            # If the function name is not in the map, then it means we are not interested in
-            # this function during task extraction (e.g., a function without reduction).
-            if func_name not in env.func_name_to_wkl_key:
-                continue
-
-            workload_key = env.func_name_to_wkl_key[func_name]
-            if workload_key not in env.wkl_key_to_weight:
-                env.wkl_key_to_weight[workload_key] = (0, set())
-
-            # Note that the function appears multiple times in a model will be renamed
-            # to make sure function names are unique, so we use the workload key generated
-            # from the function's TE compute to determine their weights.
-            old_weight, func_names = env.wkl_key_to_weight[workload_key]
-            func_names.add(func_name)
-            env.wkl_key_to_weight[workload_key] = (old_weight + weight, func_names)
-
-
-def tensor_no_check_call(self, *indices):
-    """An indexing function without any check.
-    This is the same as `tvm.te.Tensor::__call__` except that the safety
-    check is removed.
-    """
-    indices = convert_to_object(indices)
-    args = []
-    for x in indices:
-        if isinstance(x, _expr.PrimExpr):
-            args.append(x)
-        elif isinstance(x, _expr.IterVar):
-            args.append(x.var)
-        else:
-            raise ValueError("The indices must be expression")
-
-    return _expr.ProducerLoad(self, args)
-
-
-def remove_index_check(tensor):
-    """Remove the safety check in the indexing function for a tensor.
-    This is done by monkey patching its indexing function.
-    After removing the check, we are allowed to create a
-    temporary wrong IR and fix it later in other places.
-
-    Parameters
-    ----------
-    tensor: Tensor
-      The tensor to remove index check.
-    """
-    # Monkey patch the indexing function
-    tensor.__call__ = tensor_no_check_call.__get__(tensor, Tensor)
-
-
-def rewrite_compute_body(compute_tensor, new_layout):
-    """Rewrite the body of a ComputeOp according to a new layout of a placeholder"""
-    op = compute_tensor.op
-
-    # Get layout free placeholders
-    layout_free_placeholders = op.attrs["layout_free_placeholders"]
-    assert len(layout_free_placeholders) == 1, "Only support one layout free placeholder"
-    placeholder_op = layout_free_placeholders[0].op
-
-    # Rewrite the index expression in body
-    body = []
-    for b in op.body:
-        body.append(_ffi_api.RewriteIndexForNewLayout(placeholder_op, new_layout, b))
-    op_node = tvm.te._ffi_api.ComputeOp(op.name, op.tag, op.attrs, op.axis, body)
-
-    num = op_node.num_outputs
-    outputs = tuple(op_node.output(i) for i in range(num))
-    return outputs[0] if num == 1 else outputs
-
-
-def rewrite_tensor_shape(tensor, shape):
-    """Rewrite the tensor shape"""
-    _ffi_api.RewriteTensorShape(tensor, shape)
-
-
-def is_auto_scheduler_enabled():
-    """Return whether the auto-scheduler is enabled.
-
-    Parameters
-    ----------
-    enabled: bool
-        Whether the auto-scheduler is enabled
-    """
-    return PassContext.current().config.get(
-        "relay.backend.use_auto_scheduler",
-        False,
-    )
diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
deleted file mode 100644
index 4b12c031834f..000000000000
--- a/python/tvm/auto_scheduler/search_policy.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-The search policies of TVM auto-scheduler.
-
-The auto-scheduler constructs a search space according to the compute declaration.
-It then randomly samples programs from the search space and uses evolutionary search with a
-learned cost model to fine tune the sampled programs.
-The final optimized programs are sent to actual hardware for measurement.
-The above process is repeated until the auto-scheduler runs out of time budget.
-
-Reference:
-L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor
-Programs for Deep Learning." (OSDI 2020).
-"""
-
-import random
-
-import tvm._ffi
-from tvm.runtime import Object
-from .cost_model import RandomModel
-from . import _ffi_api
-
-
-@tvm._ffi.register_object("auto_scheduler.SearchCallback")
-class SearchCallback(Object):
-    """Callback function before or after search process"""
-
-
-@tvm._ffi.register_object("auto_scheduler.PreloadMeasuredStates")
-class PreloadMeasuredStates(SearchCallback):
-    """A SearchCallback to load measured states from the log file for a search policy.
-
-    This can resume the state of the search policy:
-        - Making sure an already measured state in former searches will never be measured again.
-        - The history states can be used to speed up the search process(e.g. SketchPolicy uses
-          history states as starting point to perform Evolutionary Search).
-
-    Parameters
-    ----------
-    filename : str
-        The name of the record file.
-    """
-
-    def __init__(self, filename):
-        self.__init_handle_by_constructor__(_ffi_api.PreloadMeasuredStates, filename)
-
-
-@tvm._ffi.register_object("auto_scheduler.PreloadCustomSketchRule")
-class PreloadCustomSketchRule(SearchCallback):
-    """
-    A SearchCallback for SketchSearchPolicy that allows users to add
-    custom sketch rule.
-
-    Notes
-    -----
-    This is an advanced feature. Make sure you're clear how it works and this should only be used
-    in SketchSearchPolicy.
-
-    Parameters
-    ----------
-    meet_condition_func: Callable
-        A function with `(policy, state, stage_id) -> int`. Should return one of the result
-        enumeration.
-    apply_func: Callable
-        A function with `(policy, state, stage_id) -> [[State, int], ...]`.
-    rule_name: str = "CustomSketchRule"
-        The name of this custom sketch rule.
-    """
-
-    # Result enumeration of the condition function.
-    PASS = 0  # Skip this rule and continue to try the next rules.
-    APPLY = 1  # Apply this rule and continue to try the next rules.
-    APPLY_AND_SKIP_REST = 2  # Apply this rule and skip the rest rules.
-
-    def __init__(self, meet_condition_func, apply_func, rule_name="CustomSketchRule"):
-        self.__init_handle_by_constructor__(
-            _ffi_api.PreloadCustomSketchRule, meet_condition_func, apply_func, rule_name
-        )
-
-
-@tvm._ffi.register_object("auto_scheduler.SearchPolicy")
-class SearchPolicy(Object):
-    """The base class of search policies."""
-
-    def continue_search_one_round(self, num_measure, measurer):
-        """
-        Continue the search by doing an additional search round.
-
-        Parameters
-        ----------
-        num_measure: int
-            The number of programs to measure in this round
-        measurer: ProgramMeasurer
-            The program measurer to measure programs
-
-        Returns
-        -------
-        inputs: List[MeasureInput]
-            The inputs of measurments in this search round
-        results: List[MeasureResult]
-            The results of measurments in this search round
-        """
-        return _ffi_api.SearchPolicyContinueSearchOneRound(self, num_measure, measurer)
-
-    def set_verbose(self, verbose):
-        """
-        Set the verbosity level of the search policy.
-
-        Parameters
-        ----------
-        verbose: int
-            The verbosity level
-        """
-        return _ffi_api.SearchPolicySetVerbose(self, verbose)
-
-
-@tvm._ffi.register_object("auto_scheduler.EmptyPolicy")
-class EmptyPolicy(SearchPolicy):
-    """A simple example of the search policy which always returns
-    the initial naive schedule (state).
-
-    Parameters
-    ----------
-    task : SearchTask
-        The SearchTask for the computation declaration.
-    init_search_callbacks : Optional[List[SearchCallback]]
-        Callback functions called before the search process.
-    """
-
-    def __init__(self, task, init_search_callbacks=None):
-        self.__init_handle_by_constructor__(_ffi_api.EmptyPolicy, task, init_search_callbacks)
-
-
-@tvm._ffi.register_object("auto_scheduler.SketchPolicy")
-class SketchPolicy(SearchPolicy):
-    """The search policy that searches in a hierarchical search space defined by sketches.
-    The policy randomly samples programs from the space defined by sketches and use evolutionary
-    search to fine-tune them.
-
-    Parameters
-    ----------
-    task : SearchTask
-        The SearchTask for the computation declaration.
-    program_cost_model : CostModel = RandomModel()
-        The cost model to estimate the complete schedules.
-    params : Optional[Dict[str, Any]]
-        Parameters of the search policy.
-        See `src/auto_scheduler/search_policy/sketch_search_policy.h` for the definitions.
-        See `DEFAULT_PARAMS` below to find the default values.
-    seed : Optional[int]
-        Random seed.
-    verbose : int = 1
-        Verbosity level. 0 for silent, 1 to output information during schedule search.
-    init_search_callbacks : Optional[List[SearchCallback]]
-        Callback functions called before the search process, usually used to do extra
-        initializations.
-        Possible callbacks:
-
-          - auto_scheduler.PreloadMeasuredStates
-          - auto_scheduler.PreloadCustomSketchRule
-    """
-
-    DEFAULT_PARAMS = {
-        "eps_greedy": 0.05,
-        "retry_search_one_round_on_empty": 1,
-        "sample_init_min_population": 50,
-        "sample_init_use_measured_ratio": 0.2,
-        "evolutionary_search_population": 2048,
-        "evolutionary_search_num_iters": 4,
-        "evolutionary_search_mutation_prob": 0.85,
-        "cpu_multi_level_tiling_structure": "SSRSRS",
-        "gpu_multi_level_tiling_structure": "SSSRRSRS",
-        # Notice: the default thread bind policy of GPU assumes the tiling structure to have at
-        # least 3 spatial tiling levels in outermost
-        "max_innermost_split_factor": 64,
-        "max_vectorize_size": 16,
-        "disable_change_compute_location": 0,
-    }
-
-    def __init__(
-        self,
-        task,
-        program_cost_model=RandomModel(),
-        params=None,
-        seed=None,
-        verbose=1,
-        init_search_callbacks=None,
-    ):
-        if params is None:
-            params = SketchPolicy.DEFAULT_PARAMS
-        else:
-            for key, value in SketchPolicy.DEFAULT_PARAMS.items():
-                if key not in params:
-                    params[key] = value
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.SketchPolicy,
-            task,
-            program_cost_model,
-            params,
-            seed or random.randint(1, 1 << 30),
-            verbose,
-            init_search_callbacks,
-        )
-
-    def generate_sketches(self, print_for_debug=False):
-        """Generate the sketches.
-        This python interface is mainly used for debugging and testing.
-        The actual search is all done in c++.
-
-        Parameters
-        ----------
-        print_for_debug : bool = False
-            Whether print out the sketches for debug.
-
-        Returns
-        -------
-        sketches : List[State]
-            The generated sketches of this search task.
-        """
-        sketches = _ffi_api.SketchPolicyGenerateSketches(self)
-        if print_for_debug:
-            for i, s in enumerate(sketches):
-                print("=" * 20 + f" {i} " + "=" * 20)
-                print(s)
-        return sketches
-
-    def sample_initial_population(self):
-        """Sample initial population.
-        This python interface is mainly used for debugging and testing.
-        The actual search is all done in c++.
-
-        Returns
-        -------
-        states: List[State]
-            The sampled states
-        """
-        states = _ffi_api.SketchPolicySampleInitialPopulation(self)
-        return states
-
-    def evolutionary_search(self, init_populations, out_size):
-        """Perform evolutionary search.
-        This python interface is mainly used for debugging and testing.
-        The actual search is all done in c++.
-
-        Parameters
-        ----------
-        init_populations: List[State]
-            The initial population states
-        out_size : int
-            The size of generated states
-
-        Returns
-        -------
-        states: List[State]
-            The generated states
-        """
-        states = _ffi_api.SketchPolicyEvolutionarySearch(self, init_populations, out_size)
-        return states
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
deleted file mode 100644
index 767baf916d58..000000000000
--- a/python/tvm/auto_scheduler/search_task.py
+++ /dev/null
@@ -1,649 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" The definiton of SearchTask """
-
-import json
-
-import os
-import logging
-import numpy as np
-
-import tvm._ffi
-from tvm.runtime import Object, ndarray
-
-from tvm.driver.build_module import build
-from tvm.target import Target
-from .measure import LocalBuilder, LocalRunner
-from .measure_record import load_best_record
-from .workload_registry import make_workload_key
-from .compute_dag import ComputeDAG, LayoutRewriteOption
-from .cost_model import XGBModel
-from .search_policy import SketchPolicy
-from .workload_registry import WORKLOAD_FUNC_REGISTRY, register_workload_tensors
-from . import _ffi_api
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("auto_scheduler")
-
-
-@tvm._ffi.register_object("auto_scheduler.HardwareParams")
-class HardwareParams(Object):
-    """The parameters of target hardware used to guide the search policy.
-
-    When a parameter isn't provided, it will instead use the
-    current machine's default value if target is specified.
-    TODO(jcf94): This is considered to be merged with the new Target specification:
-    https://discuss.tvm.apache.org/t/rfc-tvm-target-specification/6844
-    Parameters
-    ----------
-    num_cores : int, optional
-        The number of device cores.
-    vector_unit_bytes : int, optional
-        The width of vector units in bytes.
-    cache_line_bytes : int, optional
-        The size of cache line in bytes.
-    max_shared_memory_per_block : int, optional
-        The max shared memory per block in bytes.
-    max_local_memory_per_block : int, optional
-        The max local memory per block in bytes.
-    max_threads_per_block : int, optional
-        The max number of threads per block.
-    max_vthread_extent : int, optional
-        The max vthread extent.
-    warp_size : int, optional
-        The thread numbers of a warp.
-    target : str or Target, optional
-        The compilation target. Used to determine default values if provided.
-    target_host : str or Target, optional
-        The compilation target host. Used to determine default values if provided.
-    """
-
-    def __init__(
-        self,
-        num_cores=None,
-        vector_unit_bytes=None,
-        cache_line_bytes=None,
-        max_shared_memory_per_block=None,
-        max_local_memory_per_block=None,
-        max_threads_per_block=None,
-        max_vthread_extent=None,
-        warp_size=None,
-        target=None,
-        target_host=None,
-    ):
-        # If target is provided, get the default paramters for this machine.
-        if target is not None:
-            if isinstance(target, str):
-                target = tvm.target.Target(target)
-            if isinstance(target_host, str):
-                target_host = tvm.target.Target(target_host)
-            default_params = _ffi_api.GetDefaultHardwareParams(target, target_host)
-
-            if num_cores is None:
-                num_cores = default_params.num_cores
-            if vector_unit_bytes is None:
-                vector_unit_bytes = default_params.vector_unit_bytes
-            if cache_line_bytes is None:
-                cache_line_bytes = default_params.cache_line_bytes
-            if max_shared_memory_per_block is None:
-                max_shared_memory_per_block = default_params.max_shared_memory_per_block
-            if max_local_memory_per_block is None:
-                max_local_memory_per_block = default_params.max_local_memory_per_block
-            if max_threads_per_block is None:
-                max_threads_per_block = default_params.max_threads_per_block
-            if max_vthread_extent is None:
-                max_vthread_extent = default_params.max_vthread_extent
-            if warp_size is None:
-                warp_size = default_params.warp_size
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.HardwareParams,
-            num_cores,
-            vector_unit_bytes,
-            cache_line_bytes,
-            max_shared_memory_per_block,
-            max_local_memory_per_block,
-            max_threads_per_block,
-            max_vthread_extent,
-            warp_size,
-        )
-
-    def __str__(self):
-        """Pretty printing for hardware parameter configuration."""
-        format_str = (
-            "HardwareParams:\n"
-            f"  num_cores: {self.num_cores}\n"
-            f"  vector_unit_bytes: {self.vector_unit_bytes}\n"
-            f"  cache_line_bytes: {self.cache_line_bytes}\n"
-            f"  max_shared_memory_per_block: {self.max_shared_memory_per_block}\n"
-            f"  max_local_memory_per_block: {self.max_local_memory_per_block}\n"
-            f"  max_threads_per_block: {self.max_threads_per_block}\n"
-            f"  max_vthread_extent: {self.max_vthread_extent}\n"
-            f"  warp_size: {self.warp_size}\n"
-        )
-        return format_str
-
-
-@tvm._ffi.register_object("auto_scheduler.TuningOptions")
-class TuningOptions(Object):
-    """This controls the options of performance tuning.
-
-    Parameters
-    ----------
-    num_measure_trials: int = 0
-        The number of measurement trials.
-        The search policy measures `num_measure_trials` schedules in total and returns the best one
-        among them.
-        With `num_measure_trials` == 0, the policy will do the schedule search but won't involve
-        measurement. This can be used to get a runnable schedule quickly without auto-tuning.
-    early_stopping: Optional[int]
-        Stop the tuning early if getting no improvement after n measurements.
-    num_measures_per_round: int = 64
-        The number of schedules to be measured at each search round.
-        The whole schedule search process will try a total number of `num_measure_trials` in several
-        rounds.
-    verbose: int = 1
-        Verbosity level. 0 for silent, 1 to output information during schedule search.
-    builder: Union[ProgramBuilder, str] = 'local'
-        ProgramBuilder which builds the program.
-    runner: Union[ProgramRunner, str] = 'local'
-        ProgramRunner which runs the program and measures time costs.
-    measure_callbacks: Optional[List[MeasureCallback]]
-        Callback functions called after each measurement.
-        Candidates:
-        - auto_scheduler.RecordToFile
-    """
-
-    def __init__(
-        self,
-        num_measure_trials=0,
-        early_stopping=None,
-        num_measures_per_round=64,
-        verbose=1,
-        builder="local",
-        runner="local",
-        measure_callbacks=None,
-    ):
-        if isinstance(builder, str):
-            if builder == "local":
-                builder = LocalBuilder()
-            else:
-                raise ValueError("Invalid builder: " + builder)
-        elif not isinstance(builder, tvm.auto_scheduler.measure.ProgramBuilder):
-            raise ValueError(
-                "Invalid builder: "
-                + builder
-                + " . TuningOptions expects a ProgramBuilder or string."
-            )
-
-        if isinstance(runner, str):
-            if runner == "local":
-                runner = LocalRunner()
-            else:
-                raise ValueError("Invalid runner: " + runner)
-        elif not isinstance(runner, tvm.auto_scheduler.measure.ProgramRunner):
-            raise ValueError(
-                "Invalid runner: " + runner + " . TuningOptions expects a ProgramRunner or string."
-            )
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.TuningOptions,
-            num_measure_trials,
-            early_stopping or -1,
-            num_measures_per_round,
-            verbose,
-            builder,
-            runner,
-            measure_callbacks,
-        )
-
-
-# The map stores special registered buffer for measurement.
-# This can be used for sparse workloads when we cannot use random tensors for measurment.
-# {
-#     "workload_key_0": {
-#         "task_input_0": Tensor(...),
-#         "task_input_1": Tensor(...)
-#     },
-#     "workload_key_1": {
-#         "task_input_2": Tensor(...),
-#         "task_input_3": Tensor(...)
-#     },
-#     ...
-# }
-TASK_INPUT_BUFFER_TABLE = {}
-
-
-def _save_buffer_to_file(buffer_name, buffer_data):
-    """Save the current Tensor buffer to a numpy file.
-
-    File name will be: {buffer_name}.{buffer_shape}_{buffer_data_type}.npy
-    """
-    np_data = buffer_data.numpy()
-
-    buffer_name += "."
-    for i in np_data.shape:
-        buffer_name += f"{i}_"
-    buffer_name += f"{np_data.dtype}.npy"
-
-    np_data.tofile(buffer_name, " ")
-
-
-def _try_load_buffer_from_file(buffer_name):
-    """Try to load buffer from a numpy file, if not found, return None.
-
-    File name has a same format as `_save_buffer_to_file`.
-    """
-    filelist = os.listdir()
-
-    for file in filelist:
-        if file.startswith(buffer_name + "."):
-            meta_info = file.split(".")[-2].split("_")
-            shape = [int(i) for i in meta_info[:-1]]
-            dtype = meta_info[-1]
-            buffer_data = np.fromfile(file, dtype=dtype, sep=" ")
-            buffer_data = buffer_data.reshape(shape)
-            return ndarray.array(buffer_data)
-
-    return None
-
-
-def register_task_input_buffer(
-    workload_key, input_name, input_data, overwrite=False, save_to_file=False
-):
-    """Register special buffer for measurement.
-
-    Parameters
-    ----------
-    workload_key : str
-        The workload key of the SearchTask.
-
-    input_name : str
-        The name of input buffer.
-
-    input_data : tvm.nd.NDArray
-        The input Tensor data.
-
-    overwrite : bool = False
-        Whether to overwrite the data if a name has already registered.
-
-    save_to_file : bool = False
-        Whether to save the data to a local file as well. This can be reused to resume the last
-        tuning process.
-
-    Returns
-    -------
-    tvm.nd.NDArray
-        The actual registered Tensor data of this input_name. With `overwrite` set to False, will
-        return the original one if the name has already registered before.
-    """
-    global TASK_INPUT_BUFFER_TABLE
-
-    if workload_key not in TASK_INPUT_BUFFER_TABLE:
-        TASK_INPUT_BUFFER_TABLE[workload_key] = {}
-    input_table = TASK_INPUT_BUFFER_TABLE[workload_key]
-
-    if not overwrite:
-        if input_name not in input_table.keys():
-            # Try to load buffer data from local file
-            tensor_from_file = _try_load_buffer_from_file(input_name)
-            if tensor_from_file:
-                input_table[input_name] = tensor_from_file
-        elif input_name in input_table.keys():
-            raise RuntimeError(
-                "Tensor %s exists in TASK_INPUT_BUFFER_TABLE, %s"
-                % (input_name, "set overwrite to True or this Tensor will not be registered")
-            )
-
-    input_table[input_name] = input_data
-    if save_to_file:
-        _save_buffer_to_file(input_name, input_data)
-    return input_data
-
-
-def get_task_input_buffer(workload_key, input_name):
-    """Get special buffer for measurement.
-
-    The buffers are registered by `register_task_input_buffer`.
-
-    Parameters
-    ----------
-    workload_key : str
-        The workload key of the SearchTask.
-
-    input_name : str
-        The name of input buffer.
-
-    Returns
-    -------
-    tvm.nd.NDArray
-        The registered input buffer.
-    """
-    global TASK_INPUT_BUFFER_TABLE
-
-    if workload_key not in TASK_INPUT_BUFFER_TABLE:
-        TASK_INPUT_BUFFER_TABLE[workload_key] = {}
-    input_table = TASK_INPUT_BUFFER_TABLE[workload_key]
-
-    if input_name not in input_table:
-        # Try to load buffer data from local file
-        tensor_from_file = _try_load_buffer_from_file(input_name)
-        if tensor_from_file:
-            input_table[input_name] = tensor_from_file
-
-    # Then check for the default table, the input names extracted from a relay model will be
-    # stored here for we're not able to get the workload_key at that time
-    if input_name not in input_table:
-        input_table = TASK_INPUT_BUFFER_TABLE["default"]
-
-    if input_name in input_table:
-        return input_table[input_name]
-
-    raise ValueError(
-        f"{input_name} not found in TASK_INPUT_BUFFER_TABLE, "
-        f"should provide with `SearchTask(..., task_inputs={{...}})`"
-    )
-
-
-@tvm._ffi.register_object("auto_scheduler.SearchTask")
-class SearchTask(Object):
-    """The computation information and hardware parameters for a schedule search task.
-
-    Parameters
-    ----------
-    func : Union[Function, str]
-        The function that returns the compute declaration Tensors.
-        Can be the a function or the function name.
-    args : Union[Tuple[Any, ...], List[Any]]
-        The args of the function.
-    compute_dag : ComputeDAG
-        The ComputeDAG for the corresponding compute declaration.
-    workload_key : str
-        The workload key for the corresponding compute declaration.
-    target : any target-like object, see Target.canon_target
-        The target device of this search task.
-    target_host : None or any target-like object, see Target.canon_target
-        The target host device of this search task.
-    hardware_params : Optional[HardwareParams]
-        Hardware parameters used in this search task.
-    layout_rewrite_option : Optional[LayoutRewriteOption]
-        The layout rewrite option used for measuring programs. If None, the default value will be
-        set depending on the specified target.
-        Auto_scheduler will find a better schedule for the specified layout rewrite option.
-        The NO_REWRITE and INSERT_TRANSFORM_STAGE are expected to be used when tuning a standalone
-        op, and the REWRITE_FOR_PRE_TRANSFORMED is expected to be used when tuning ops inside a
-        network.
-    task_inputs : Union[Dict[str, tvm.nd.NDArray], List[str]]
-        A dict maps the input names to input tensors or a list of input names.
-        Some special Tensor used as inputs in program measuring. Usually we do not need to care
-        about it, but for special workloads like Sparse computation the Sparse Tensor input are
-        meaningful that we cannot use random input directly.
-    task_inputs_overwrite : bool = False
-        Whether to overwrite the data if a name has already in the global table.
-    task_inputs_save_to_file : bool = False
-        Whether to save the data to a local file as well. This can be reused to resume the last
-        tuning process.
-    desc: str = ""
-        The description string of this task.
-
-    Examples
-    --------
-    .. code-block:: python
-
-      # We support two ways to create a search task
-
-      # Way 1: create a task by a workload generation function.
-      # The `workload_func` is a function decorated by @auto_scheduler.register_workload
-      task = SearchTask(func=workload_func, args=args, target=target)
-
-      # Way 2: create a task by a workload_key.
-      # The `workload_key` is a string, which can be either a hash key or a json-serialized
-      # tuple(func, args).
-      task = SearchTask(workload_key=workload_key, target=target)
-    """
-
-    def __init__(
-        self,
-        func=None,
-        args=None,
-        compute_dag=None,
-        workload_key=None,
-        target=None,
-        target_host=None,
-        hardware_params=None,
-        layout_rewrite_option=None,
-        task_inputs=None,
-        task_inputs_overwrite=False,
-        task_inputs_save_to_file=False,
-        desc="",
-    ):
-        assert (
-            func is not None or workload_key is not None
-        ), "Either a workload generation function or a workload key should be provided"
-
-        if func is not None:
-            workload_key = make_workload_key(func, args)
-        if compute_dag is None:
-            compute_dag = ComputeDAG(workload_key)
-
-        assert target is not None, "Must specify a target."
-
-        target, target_host = Target.canon_target_and_host(target, target_host)
-
-        if layout_rewrite_option is None:
-            layout_rewrite_option = LayoutRewriteOption.get_target_default(target)
-
-        task_input_names = []
-        if isinstance(task_inputs, list):
-            task_input_names = task_inputs
-        elif isinstance(task_inputs, dict):
-            for input_name in task_inputs:
-                register_task_input_buffer(
-                    workload_key,
-                    input_name,
-                    task_inputs[input_name],
-                    task_inputs_overwrite,
-                    task_inputs_save_to_file,
-                )
-                task_input_names.append(input_name)
-        elif task_inputs is not None:
-            raise ValueError("task_inputs should be a dict or a list.")
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.SearchTask,
-            compute_dag,
-            workload_key,
-            target,
-            target_host,
-            hardware_params,
-            layout_rewrite_option,
-            task_input_names,
-            desc,
-        )
-
-    def tune(self, tuning_options, search_policy=None, adaptive_training=False):
-        """Run auto scheduling search for a task
-
-        Parameters
-        ----------
-        tuning_options : TuningOptions
-            Tuning and measurement options.
-        search_policy : Optional[SearchPolicy]
-            The search policy to be used for schedule search.
-        """
-        if search_policy is None:
-            cost_model = XGBModel(adaptive_training=adaptive_training)
-            search_policy = SketchPolicy(self, cost_model)
-
-        _ffi_api.AutoSchedule(search_policy, tuning_options)
-
-    def apply_best(self, log_file, include_compatible=False, layout_rewrite_option=None):
-        """Apply the history best from a log file and return the schedule.
-
-        Parameters
-        ----------
-        log_file : str
-           The name of the log file.
-        include_compatible: bool
-            When set to True, all compatible records in the log file will be considered.
-        layout_rewrite_option : Optional[LayoutRewriteOption]
-           The layout rewrite option.
-
-
-        Returns
-        -------
-            A `te.Schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
-        """
-        inp, _ = load_best_record(
-            log_file, self.workload_key, include_compatible=include_compatible
-        )
-        if inp is None:
-            raise RuntimeError(
-                f"Cannot find any valid schedule for {self.workload_key} in file {log_file}"
-            )
-
-        sch, args = self.compute_dag.apply_steps_from_state(
-            inp.state, layout_rewrite_option or self.layout_rewrite_option
-        )
-        return sch, args
-
-    def print_best(self, log_file, print_mode="schedule"):
-        """Print the best schedule as python schedule API code or CUDA source code.
-
-        Parameters
-        ----------
-        log_file : str
-           The name of the log file
-        print_mode: str
-           if "schedule", print the best schedule as python schedule API code.
-           if "cuda", print the best schedule as CUDA source code.
-
-        Returns
-        -------
-        code: str
-            The best schedule code in python API or CUDA source code
-        """
-        inp, _ = load_best_record(log_file, self.workload_key)
-        if inp is None:
-            raise RuntimeError(
-                f"Cannot find any valid schedule for {self.workload_key} in file {log_file}"
-            )
-
-        if print_mode == "schedule":
-            return self.compute_dag.print_python_code_from_state(inp.state)
-        if print_mode == "cuda":
-            assert self.target.kind.name == "cuda"
-            sch, args = self.compute_dag.apply_steps_from_state(inp.state)
-            func = build(sch, args, "cuda")
-            return func.imported_modules[0].get_source()
-        raise ValueError(f"Invalid print_mode: {print_mode}")
-
-    def __getstate__(self):
-        self.target, self.target_host = Target.canon_target_and_host(self.target, self.target_host)
-        return {
-            "compute_dag": self.compute_dag,
-            "workload_key": self.workload_key,
-            "target": self.target,
-            "target_host": self.target_host,
-            "hardware_params": self.hardware_params,
-            "layout_rewrite_option": self.layout_rewrite_option,
-            "task_input_names": self.task_input_names,
-            "desc": self.desc,
-        }
-
-    def __setstate__(self, state):
-        # Register the workload if needed
-        try:
-            workload = json.loads(state["workload_key"])
-        except Exception:  # pylint: disable=broad-except
-            raise RuntimeError(f"Invalid workload key {state['workload_key']}")
-
-        # workload[0] is either the compute function name or the ComputeDAG hash.
-        # The compute functions are already registered when importing TVM, so here
-        # we only register the ComputeDAG workloads. If the same workload has
-        # already been registered, the later registration overrides the previous one.
-        if workload[0] not in WORKLOAD_FUNC_REGISTRY:
-            register_workload_tensors(state["workload_key"], state["compute_dag"].tensors)
-
-        state["target"], state["target_host"] = Target.canon_target_and_host(
-            state["target"], state["target_host"]
-        )
-        self.__init_handle_by_constructor__(
-            _ffi_api.SearchTask,
-            state["compute_dag"],
-            state["workload_key"],
-            state["target"],
-            state["target"].host,
-            state["hardware_params"],
-            state["layout_rewrite_option"],
-            state["task_input_names"],
-            state["desc"],
-        )
-
-
-def create_task(func, args, target, target_host=None, hardware_params=None):
-    """THIS API IS DEPRECATED.
-
-    Create a search task.
-
-    Parameters
-    ----------
-    func : Union[Function, str]
-        The function that returns the compute declaration Tensors.
-        Can be the a function or the function name.
-    args : Union[Tuple[Any, ...], List[Any]]
-        The args of the function.
-    target : Union[tvm.target.Target, str]
-        The target device of this search task.
-    target_host : Optional[Union[tvm.target.Target, str]]
-        The target host device of this search task.
-    hardware_params : Optional[HardwareParams]
-        Hardware parameters used in this search task.
-
-    Returns
-    -------
-        SearchTask: the created task
-    """
-    raise ValueError(
-        'The API "auto_scheduler.create_task" is deprecated.'
-        "See https://github.com/apache/tvm/pull/7028 for the upgrade guide"
-    )
-
-
-def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()):
-    """THIS API IS DEPRECATED.
-
-    Run auto scheduling search for a task.
-
-    Parameters
-    ----------
-    task : SearchTask
-        The SearchTask for the computation declaration.
-    search_policy : Optional[SearchPolicy]
-        The search policy to be used for schedule search.
-    tuning_options : Optional[TuningOptions]
-        Tuning and measurement options.
-
-    Returns
-    -------
-        A `te.Schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
-    """
-    raise ValueError(
-        'The API "auto_scheduler.create_task" is deprecated.'
-        "See https://github.com/apache/tvm/pull/7028 for the upgrade guide."
-    )
diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
deleted file mode 100644
index 58457daad0b6..000000000000
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ /dev/null
@@ -1,658 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-""" The task scheduler that allocates the time resources when tuning multiple tasks together
-
-The details of the "gradient" strategy below can be found in the section 6 of this paper:
-L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor
-Programs for Deep Learning." (OSDI 2020).
-"""
-import os
-import time
-import math
-import logging
-
-import numpy as np
-
-from .search_policy import SearchPolicy, SketchPolicy, PreloadMeasuredStates
-from .cost_model import RandomModel, XGBModel
-from .utils import array_mean
-from .measure import ProgramMeasurer
-from .measure_record import RecordReader
-from . import _ffi_api
-
-logger = logging.getLogger("auto_scheduler")
-
-
-def make_search_policies(
-    search_policy,
-    search_policy_params,
-    tasks,
-    num_measures_per_round,
-    verbose,
-    load_model_file=None,
-    load_log_file=None,
-    adaptive_training=False,
-):
-    """Make a list of search policies for a list of search tasks.
-    It creates one policy per task.
-
-    Parameters
-    ----------
-    search_policy: Union[str, List[SearchPolicy]]
-        The name of search policy.
-    search_policy_params: Dict[str, Any]]
-        The parameters of the search policy.
-    tasks: List[SearchTask]
-        The list of all tasks
-    num_measures_per_round: int
-        The number of schedules to be measured at each search round.
-        This should be the same as `TuningOptions.num_measures_per_round`
-    verbose: int
-        The verbosity level. 0 for silent.
-    load_model_file: Optional[str]
-        Load pre-trained model from this file. If this is None, the cost model will
-        be trained from scratch.
-    load_log_file: Optional[str]
-        Load measurement records from this file. If it is not None, the status of the
-        task scheduler, search policies and cost models will be restored according to this file.
-    adaptive_training: bool = False
-        Option used by XGBModel to reduce the model training frequency when there're too
-        many logs.
-
-    Returns
-    -------
-    policies: List[SearchPolicy]
-        The list of search policies
-    """
-    if search_policy == "default":
-        search_policy = "sketch.xgb"
-
-    if isinstance(search_policy, str):
-        policy_type, model_type = search_policy.split(".")
-        if model_type == "xgb":
-            cost_model = XGBModel(
-                num_warmup_sample=len(tasks) * num_measures_per_round,
-                model_file=load_model_file,
-                adaptive_training=adaptive_training,
-            )
-            if load_model_file and os.path.isfile(load_model_file):
-                logger.info("TaskScheduler: Load pretrained model...")
-                cost_model.load(load_model_file)
-            elif load_log_file:
-                logger.info("TaskScheduler: Reload measured states and train the model...")
-                cost_model.update_from_file(load_log_file)
-        elif model_type == "random":
-            cost_model = RandomModel()
-        else:
-            raise ValueError("Invalid search policy: " + search_policy)
-
-        if policy_type == "sketch":
-            if load_log_file:
-                # use the log file to restore the status of search policies.
-                init_search_callbacks = [PreloadMeasuredStates(load_log_file)]
-            else:
-                init_search_callbacks = None
-            search_policies = [
-                SketchPolicy(
-                    task,
-                    cost_model,
-                    params=search_policy_params,
-                    verbose=verbose,
-                    init_search_callbacks=init_search_callbacks,
-                )
-                for task in tasks
-            ]
-        else:
-            raise ValueError("Invalid search policy: " + search_policy)
-    else:
-        # check type
-        assert isinstance(search_policy, (tuple, list))
-        for item in search_policy:
-            assert isinstance(item, SearchPolicy)
-        search_policies = search_policy
-
-    return search_policies
-
-
-def derive_similarity_tag(dag, log_base=1.618):
-    """Derive the tag for similarity check from one computational DAG.
-    The DAGs with the same tag are considered as similar tasks.
-
-    The tag format is <op1-tag>_<op2-tag> ... <log(flop)>.
-
-    If the tag is "", then the task is not considered to be similar to any other tasks.
-
-    Parameters
-    ----------
-    dag: ComputeDAG
-        The input computational DAG
-    log_base: float = 1.618
-        The base of log to normalize FLOPS
-
-    Returns
-    -------
-    tag: str
-        The tag of this computational DAG.
-    """
-    ret = ""
-    for op in dag.ops:
-        tag = op.attrs.get("auto_scheduler_task_scheduler_tag", None)
-        if tag:
-            ret += op.attrs["auto_scheduler_task_scheduler_tag"] + "_"
-    if ret:
-        ret += "%d" % int(math.log(dag.flop_ct + 1, log_base))
-    return ret
-
-
-class TaskScheduler:
-    """
-    Allocate the time resources when tuning multiple tasks together.
-    This implements two strategies: "round-robin" and "gradient".
-
-    Parameters
-    ----------
-    tasks: List[SearchTask]
-        All tasks to tune
-    task_weights: Optional[List[float]]
-        The weights of tasks.
-        If provided, the task scheduler will set the objective function to
-        sum(weight[t] * latency[t]), where weight[t] is the weight of a task
-        and the lantecy[t] is the lantecy of the task.
-        If not provided, the task scheduer will assign equal weights to all
-        tasks (i.e., the objective function is sum(latency[t])).
-    objective_func: Optional[Callable[List[float] -> float]]
-        The objective function to be minimized.
-        The objective function accepts the current latencies of all tasks and returns the
-        objective.
-        If not provided, the objective is the weighted sum of the latencies of all tasks.
-    strategy: str = "gradient"
-        The scheduling strategy.
-        "round-robin": Tune tasks in round robin order.
-        "gradient" : Tune tasks with gradient descent.
-    load_model_file: Optional[str]
-        Load pre-trained model from this file. If this is None, the cost model will
-        be trained from scratch.
-    load_log_file: Optional[str]
-        Load measurement records from this file. If it is not None, the status of the
-        task scheduler, search policies and cost models will be restored according to this file.
-    verbose: int = 1
-        The level of verbosity. 0 means silent.
-    alpha: float = 0.2
-        The parameter used for 'gradient' strategy
-    beta: float = 2
-        The parameter used for 'gradient' strategy
-    backward_window_size: int = 3
-        The parameter used for 'gradient' strategy
-    callbacks: Optional[List[TaskSchedulerCallback]]
-        The task scheduler callbacks that will be called before and after tuning a task.
-        If None, PrintTableInfo and LogEstimatedLatency callback will be used.
-    """
-
-    def __init__(
-        self,
-        tasks,
-        task_weights=None,
-        objective_func=None,
-        strategy="gradient",
-        load_model_file: str = None,
-        load_log_file: str = None,
-        alpha: float = 0.2,
-        beta: float = 2,
-        gamma: float = 0.5,
-        backward_window_size: int = 3,
-        callbacks=None,
-    ):
-        self.tasks = tasks
-        if objective_func:  # use custom objective function
-            self.objective_func = objective_func
-        else:  # use weighted sum
-            if task_weights:
-                self.objective_func = lambda costs: sum(c * w for c, w in zip(costs, task_weights))
-            else:
-                self.objective_func = sum
-
-        self.strategy = strategy
-        self.load_log_file = load_log_file
-        self.load_model_file = load_model_file
-        self.alpha = alpha
-        self.beta = beta
-        self.gamma = gamma
-        self.backward_window_size = backward_window_size
-        self.callbacks = (
-            callbacks
-            if callbacks is not None
-            else [PrintTableInfo(), LogEstimatedLatency("total_latency.tsv")]
-        )
-
-        assert len(self.tasks) != 0, "No tasks"
-        assert self.strategy in ["round-robin", "gradient"]
-
-        # task_cts[i] saves how many times task i is tuned
-        self.task_cts = [0 for _ in range(len(self.tasks))]
-
-        # task_best_cts[i] saves the round task i found the best latency
-        self.task_best_cts = [0 for _ in range(len(self.tasks))]
-
-        # task_costs_history[i] saves the latency history of task i
-        self.task_costs_history = [[] for _ in range(len(self.tasks))]
-
-        # best_costs[i] saves the best latency of task i
-        self.best_costs = 1e10 * np.ones(len(self.tasks))
-        self.cur_score = self._compute_score(self.best_costs)
-
-        self.tune_option = self.measurer = self.search_policies = None
-        self.ct = self.best_ct = self.best_score = self.tic = None
-        self.num_measures_per_round = None
-        self.dead_tasks = set()
-
-        # Build similarity groups
-        self.task_tags = []  # task_id -> tag
-        self.tag_to_group_id = {}  # tag -> group_id
-        self.group_task_ids = []  # group_id -> all task ids in this group
-        self.flop_cts = []  # task_id -> the number of floating ops
-        for i, task in enumerate(self.tasks):
-            tag = derive_similarity_tag(task.compute_dag)
-            self.task_tags.append(tag)
-            self.flop_cts.append(task.compute_dag.flop_ct)
-            if not tag:
-                continue
-
-            if tag not in self.tag_to_group_id:
-                self.tag_to_group_id[tag] = len(self.tag_to_group_id)
-                self.group_task_ids.append([])
-            self.group_task_ids[self.tag_to_group_id[tag]].append(i)
-
-    def tune(
-        self,
-        tune_option,
-        search_policy="default",
-        search_policy_params=None,
-        adaptive_training=False,
-        per_task_early_stopping=None,
-    ):
-        """Tune a batch of tasks together.
-
-        Parameters
-        ----------
-        tune_option: TuningOptions
-            The tuning options applied to all tasks.
-        search_policy: : Union[str, List[SearchPolicy]] = "default"
-            The list of search policies.
-            If it is str,
-            "default" for the default policy (SketchPolicy + XGBModel),
-            "sketch.xgb" for SketchPolicy + XGBModel,
-            "sketch.random" for SketchPolicy + RandomModel.
-        search_policy_params : Optional[Dict[str, Any]]
-            The parameters of the search policy
-        adaptive_training : bool = False
-            Option used by XGBModel to reduce the model training frequency when there're
-            too many logs.
-        per_task_early_stopping : Optional[int]
-            Stop tuning a task early if getting no improvement after n measurements.
-        """
-        # init members
-        self.tune_option = tune_option
-        self.early_stopping_all = (
-            1e20 if tune_option.early_stopping < 0 else tune_option.early_stopping
-        )
-        self.early_stopping_task = (
-            1e20 if per_task_early_stopping is None else per_task_early_stopping
-        )
-
-        self.measurer = ProgramMeasurer(
-            tune_option.builder,
-            tune_option.runner,
-            tune_option.measure_callbacks,
-            tune_option.verbose,
-        )
-        self.ct = self.best_ct = 0
-        self.tic = time.time()
-
-        # reset num_measures_per_round to make sure every task is tuned at least once
-        self.num_measures_per_round = min(
-            tune_option.num_measures_per_round, tune_option.num_measure_trials // len(self.tasks)
-        )
-        if self.num_measures_per_round <= 0:
-            raise ValueError(
-                "num_measure_trials is too small. Please set it to a higher value."
-                f"It should be at least {len(self.tasks)} for this model."
-            )
-
-        # restore the status of the task scheduler from a log file
-        if self.load_log_file:
-            self._restore_status(self.load_log_file, self.num_measures_per_round)
-
-        # make one search policy for one task
-        self.search_policies = make_search_policies(
-            search_policy,
-            search_policy_params,
-            self.tasks,
-            self.num_measures_per_round,
-            tune_option.verbose,
-            self.load_model_file,
-            self.load_log_file,
-            adaptive_training,
-        )
-
-        # do a round robin first to warm up
-        for idx in range(len(self.tasks)):
-            # skip warming up this task if it has been tuned before (restored from the log file)
-            if not self.task_cts[idx]:
-                self._tune_task(idx)
-        self.best_ct = self.ct
-        self.best_score = self.cur_score
-
-        # put task without schedule on warm up to dead state
-        for task_idx, cost in enumerate(self.best_costs):
-            if cost == 1e10:
-                self.dead_tasks.add(task_idx)
-
-        # use the specific strategy to choose workload to tune
-        task_idx = -1
-        while self.ct < tune_option.num_measure_trials and len(self.dead_tasks) < len(self.tasks):
-            if self.strategy == "round-robin":
-                task_idx = (task_idx + 1) % len(self.tasks)
-                while task_idx in self.dead_tasks:
-                    task_idx = (task_idx + 1) % len(self.tasks)
-            elif self.strategy == "gradient":
-                gradients = []
-
-                for i in range(len(self.tasks)):
-                    if i in self.dead_tasks:
-                        gradients.append(0)
-                        continue
-
-                    # compute gradient from chain rule : (delta f / delta g_i)
-                    delta = 1e-4
-                    new_costs = list(self.best_costs)
-                    new_costs[i] -= delta
-                    chain_grad = (
-                        self._compute_score(self.best_costs) - self._compute_score(new_costs)
-                    ) / delta
-
-                    # compute (g_i(t_i) - g(t_i - \Delta t)) / (\Delta t)
-                    if (
-                        self.task_cts[i] - 1 < len(self.task_costs_history[i])
-                        and self.task_cts[i] - 1 - self.backward_window_size >= 0
-                    ):
-                        backward_grad = (
-                            self.task_costs_history[i][self.task_cts[i] - 1]
-                            - self.task_costs_history[i][
-                                self.task_cts[i] - 1 - self.backward_window_size
-                            ]
-                        ) / self.backward_window_size
-                    else:
-                        backward_grad = 0
-
-                    # compute (g_i(t_i + \Delta t) - g(t_i)) / (\Delta t)
-                    g_next_1 = self.best_costs[i] - (self.best_costs[i] / self.task_cts[i])
-
-                    g_next_2 = self.beta * 1e30
-                    group_id = self.tag_to_group_id.get(self.task_tags[i], None)
-                    if group_id is not None and len(self.group_task_ids[group_id]) > 1:
-                        best_flops = max(
-                            [
-                                self.flop_cts[j] / self.best_costs[j]
-                                for j in self.group_task_ids[group_id]
-                            ]
-                        )
-                        g_next_2 = self.beta * self.flop_cts[i] / best_flops
-
-                    g_next = min(g_next_1, g_next_2)
-                    forward_grad = g_next - self.best_costs[i]
-
-                    # combine all grads
-                    grad = chain_grad * (
-                        self.alpha * backward_grad + (1 - self.alpha) * forward_grad
-                    )
-                    assert grad <= 0
-                    gradients.append(grad)
-
-                if max(gradients) == min(gradients):
-                    task_idx = np.random.choice(len(gradients))
-                else:
-                    task_idx = np.argmin(gradients)
-            else:
-                raise ValueError("Invalid strategy: " + self.strategy)
-
-            self._tune_task(task_idx)
-            self._adjust_similarity_group(task_idx)
-
-            if self.cur_score < self.best_score:
-                self.best_score = self.cur_score
-                self.best_ct = self.ct
-            elif self.ct - self.best_ct >= self.early_stopping_all and all(
-                cost < 1e9 for cost in self.best_costs
-            ):
-                if self.tune_option.verbose >= 1:
-                    print(
-                        "Stop early since no performance improvement in the last "
-                        + str(self.early_stopping_all)
-                        + " measurement trials."
-                    )
-                break
-
-    def _tune_task(self, task_idx):
-        """Tune the select task for one round"""
-
-        # Run pre-tune callbacks
-        for callback in self.callbacks:
-            callback.pre_tune(self, task_idx)
-
-        measure_inputs, measure_results = self.search_policies[task_idx].continue_search_one_round(
-            self.num_measures_per_round, self.measurer
-        )
-
-        self.task_cts[task_idx] += 1
-
-        for res in measure_results:
-            cost = array_mean(res.costs)
-            if cost < self.best_costs[task_idx]:
-                self.task_best_cts[task_idx] = self.task_cts[task_idx]
-                self.best_costs[task_idx] = cost
-
-        # Stop tuning this task in the rest of the process if its search space has been
-        # fully explored or it has no improvement for a long while.
-        no_change_trials = (
-            self.task_cts[task_idx] - self.task_best_cts[task_idx]
-        ) * self.num_measures_per_round
-        if len(measure_inputs) == 0 or no_change_trials > self.early_stopping_task:
-            self.dead_tasks.add(task_idx)
-
-        self.task_costs_history[task_idx].append(self.best_costs[task_idx])
-
-        self.ct += len(measure_inputs)
-        self.cur_score = self._compute_score(self.best_costs)
-
-        # Run post-tune callbacks
-        for callback in self.callbacks:
-            callback.post_tune(self, task_idx)
-
-    def _compute_score(self, costs):
-        """compute the objective function"""
-        # Make sure to return float.
-        score = self.objective_func(costs)
-        return score.value if hasattr(score, "value") else score
-
-    def _adjust_similarity_group(self, task_idx):
-        """adjust the similarity group for the selected task"""
-        group_id = self.tag_to_group_id.get(self.task_tags[task_idx], None)
-        if group_id is None or len(self.group_task_ids[group_id]) <= 1:
-            return
-
-        group_ids = self.group_task_ids[group_id]
-        best_group_flops = max([self.flop_cts[j] / self.best_costs[j] for j in group_ids])
-        cur_flops = self.flop_cts[task_idx] / self.best_costs[task_idx]
-
-        # if we tune a task for many times but it still cannot achieve
-        # a similar speed to the fastest one in its group, this means this task
-        # is actually not similar to other tasks in its group.
-        # So we will remove it from its original group.
-        if cur_flops < best_group_flops / self.beta and self.task_cts[task_idx] > 5 + max(
-            self.task_cts[j] for j in group_ids if j != task_idx
-        ):
-            self.task_tags[task_idx] = None
-            group_ids.remove(task_idx)
-
-    def _restore_status(self, log_file, num_measures_per_round):
-        """restore task_cts and best_costs from a log file"""
-        str_target = str(self.tasks[0].target)
-        workload_key_to_task_id = {t.workload_key: i for i, t in enumerate(self.tasks)}
-        total_ct = -1
-
-        for total_ct, (inp, res) in enumerate(RecordReader(log_file)):
-            if str(inp.task.target) != str_target:
-                continue
-            task_idx = workload_key_to_task_id.get(inp.task.workload_key, None)
-            if task_idx is None:
-                continue
-
-            self.task_cts[task_idx] += 1
-
-            if res.error_no == 0:
-                cost = array_mean(res.costs)
-                if cost < self.best_costs[task_idx]:
-                    self.best_costs[task_idx] = cost
-                    self.task_best_cts[task_idx] = self.task_cts[task_idx]
-
-        for idx in range(len(self.tasks)):
-            if self.task_cts[idx] - self.task_best_cts[idx] > self.early_stopping_task:
-                self.dead_tasks.add(idx)
-
-            # The computation of taks_cts is just an estimation.
-            # The estimation may not be accurate if the log file is changed externally or
-            # `num_measures_per_round` is different from the last tuning.
-            self.task_cts[idx] = int(self.task_cts[idx] / num_measures_per_round + 0.5)
-            self.task_best_cts[idx] = int(self.task_best_cts[idx] / num_measures_per_round + 0.5)
-            self.task_costs_history[idx].append(self.best_costs[idx])
-
-        self.cur_score = self._compute_score(self.best_costs)
-
-        logger.info("TaskScheduler: Loaded %d measurement records from %s", total_ct + 1, log_file)
-
-
-class TaskSchedulerCallback:
-    """The base class of task scheduler callback functions."""
-
-    def pre_tune(self, task_scheduler, task_id):
-        """The callback before tuning each task.
-
-        Parameters
-        ----------
-        task_scheduler: TaskScheduler
-            The task scheduler.
-        task_id: int
-            The task ID going to be tuned.
-        """
-        # Do nothing by default
-
-    def post_tune(self, task_scheduler, task_id):
-        """The callback after tuning each task.
-
-        Parameters
-        ----------
-        task_scheduler: TaskScheduler
-            The task scheduler.
-        task_id: int
-            The task ID be tuned.
-        """
-        # Do nothing by default
-
-
-class PrintTableInfo(TaskSchedulerCallback):
-    """The callback that prints a table of current progress."""
-
-    def pre_tune(self, task_scheduler, task_id):
-        if task_scheduler.tune_option.verbose < 1:
-            return
-
-        _ffi_api.PrintTitle("Task Scheduler")
-        print(
-            "|  ID  "
-            "|                       Task Description                        "
-            "| Latency (ms) | Speed (GFLOPS) | Trials |"
-        )
-        print(
-            "----------------------------------------------------------------"
-            "-------------------------------------------------"
-        )
-
-        # content
-        for i in range(len(task_scheduler.tasks)):
-            id_str = f"{i}"
-            latency_str = (
-                "%.3f" % (1e3 * task_scheduler.best_costs[i])
-                if task_scheduler.best_costs[i] < 1e9
-                else "-"
-            )
-            task_desc = task_scheduler.tasks[i].desc
-            speed_str = (
-                "%.2f"
-                % (task_scheduler.tasks[i].compute_dag.flop_ct / task_scheduler.best_costs[i] / 1e9)
-                if task_scheduler.best_costs[i] < 1e9
-                else "-"
-            )
-            trials_str = "%d" % (task_scheduler.task_cts[i] * task_scheduler.num_measures_per_round)
-            print(
-                "| %4s | %61s | %12s | % 14s | %6s |"
-                % (id_str, task_desc, latency_str, speed_str, trials_str)
-            )
-        print(
-            "----------------------------------------------------------------"
-            "-------------------------------------------------"
-        )
-
-        # overall info
-        if all(cost < 1e9 for cost in task_scheduler.best_costs):
-            total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3)
-        else:
-            total_latency_str = "-"
-        print(
-            "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t"
-            % (total_latency_str, task_scheduler.ct, time.time() - task_scheduler.tic, task_id)
-        )
-
-
-class LogEstimatedLatency(TaskSchedulerCallback):
-    """Log the estimated latency to the file after tuning a task.
-
-    Parameters
-    ----------
-    log_file: str
-        The log file path.
-    """
-
-    def __init__(self, log_file):
-        if os.path.exists(log_file):  # Remove existing log
-            os.remove(log_file)
-
-        self.log_file = log_file
-
-    def post_tune(self, task_scheduler, task_id):
-        if all(cost < 1e9 for cost in task_scheduler.best_costs):
-            total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3)
-        else:
-            total_latency_str = "N/A"
-
-        with open(self.log_file, "a") as filep:
-            filep.write(
-                "ElapsedTime(s)\t%.0f\tEstimatedLatency(ms)\t%s\tTrials\t%d\n"
-                % (time.time() - task_scheduler.tic, total_latency_str, task_scheduler.ct)
-            )
-            filep.flush()
diff --git a/python/tvm/auto_scheduler/testing/__init__.py b/python/tvm/auto_scheduler/testing/__init__.py
deleted file mode 100644
index 2bbcf8317de3..000000000000
--- a/python/tvm/auto_scheduler/testing/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import, redefined-builtin
-"""Testing utilities in auto scheduler."""
-
-# NOTE: Do not import any module here by default
diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py b/python/tvm/auto_scheduler/testing/tune_onnx.py
deleted file mode 100644
index 334b5d6726b7..000000000000
--- a/python/tvm/auto_scheduler/testing/tune_onnx.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import argparse
-import json
-import os
-import onnx  # type: ignore
-
-import tvm
-from tvm import auto_scheduler
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer
-from tvm.meta_schedule.utils import cpu_count
-from tvm.relay.frontend import from_onnx
-from tvm.support import describe
-from tvm.testing.utils import strtobool
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--model-name",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--onnx-path",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--input-shape",
-        type=str,
-        required=True,
-        help='example: `[{"name": "input1", "dtype": "int64", "shape": [1, 1, 8]}]',
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-port",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-key",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--number",
-        type=int,
-        default=3,
-    )
-    args.add_argument(
-        "--repeat",
-        type=int,
-        default=1,
-    )
-    args.add_argument(
-        "--min-repeat-ms",
-        type=int,
-        default=100,
-    )
-    args.add_argument(
-        "--adaptive-training",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        default=True,
-    )
-    args.add_argument(
-        "--cpu-flush",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    args.add_argument(
-        "--backend",
-        type=str,
-        choices=["graph", "vm"],
-        help="example: graph / vm",
-        required=True,
-    )
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.input_shape = json.loads(parsed.input_shape)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=600,
-    )
-    return parsed
-
-
-ARGS = _parse_args()
-
-
-def main():
-    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")
-
-    runner = auto_scheduler.RPCRunner(
-        key=ARGS.rpc_key,
-        host=ARGS.rpc_host,
-        port=ARGS.rpc_port,
-        n_parallel=cpu_count(logical=True),
-        number=ARGS.number,
-        repeat=ARGS.repeat,
-        min_repeat_ms=ARGS.min_repeat_ms,
-        enable_cpu_cache_flush=ARGS.cpu_flush,
-        timeout=ARGS.rpc_config.session_timeout_sec,
-    )
-
-    if ARGS.target.kind.name == "llvm":
-        hardware_params = auto_scheduler.HardwareParams(
-            num_cores=int(ARGS.target.attrs["num-cores"]),
-            target=ARGS.target,
-        )
-    elif ARGS.target.kind.name == "cuda":
-        hardware_params = auto_scheduler.HardwareParams(
-            num_cores=-1,
-            vector_unit_bytes=16,
-            cache_line_bytes=64,
-            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
-            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
-            # The value `max_local_memory_per_block` is not used in AutoScheduler,
-            # but is required by the API.
-            max_local_memory_per_block=12345678,
-            max_vthread_extent=8,
-            warp_size=32,
-        )
-    else:
-        raise NotImplementedError(f"Unsupported target {ARGS.target}")
-
-    describe()
-    print(f"Workload: {ARGS.model_name}")
-    onnx_model = onnx.load(ARGS.onnx_path)
-    shape_dict = {}
-    for item in ARGS.input_shape:
-        print(f"  input_name : {item['name']}")
-        print(f"  input_shape: {item['shape']}")
-        print(f"  input_dtype: {item['dtype']}")
-        shape_dict[item["name"]] = item["shape"]
-    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
-    input_data = {
-        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape
-    }
-
-    with ms.Profiler() as profiler:
-        tasks, task_weights = auto_scheduler.extract_tasks(
-            mod["main"],
-            params,
-            target=ARGS.target,
-            hardware_params=hardware_params,
-        )
-        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
-            print(
-                f"==== Task {idx}: {task.desc} "
-                f"(weight {task_weight} key: {task.workload_key}) ====="
-            )
-            print(task.compute_dag)
-
-        if ARGS.num_trials > 0:
-            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-            tuner.tune(
-                auto_scheduler.TuningOptions(
-                    num_measure_trials=ARGS.num_trials,
-                    runner=runner,
-                    measure_callbacks=[
-                        auto_scheduler.RecordToFile(log_file),
-                    ],
-                ),
-                adaptive_training=ARGS.adaptive_training,
-            )
-
-        relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend]
-        with auto_scheduler.ApplyHistoryBest(log_file):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_auto_scheduler": True},
-            ):
-                lib = relay_build(
-                    mod,
-                    target=ARGS.target,
-                    params=params,
-                )
-    print("Tuning Time:")
-    print(profiler.table())
-
-    run_module_via_rpc(
-        rpc_config=ARGS.rpc_config,
-        lib=lib,
-        dev_type=ARGS.target.kind.name,
-        args=input_data,
-        continuation=create_timer(ARGS.backend),
-        backend=ARGS.backend,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
deleted file mode 100644
index babec2cf50c4..000000000000
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import argparse
-import json
-import os
-
-import tvm
-from tvm import auto_scheduler
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data
-from tvm.meta_schedule.utils import cpu_count
-from tvm.support import describe
-from tvm.testing.utils import strtobool
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--workload",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--input-shape",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-port",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-key",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--layout",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--cache-dir",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--number",
-        type=int,
-        default=3,
-    )
-    args.add_argument(
-        "--repeat",
-        type=int,
-        default=1,
-    )
-    args.add_argument(
-        "--min-repeat-ms",
-        type=int,
-        default=100,
-    )
-    args.add_argument(
-        "--adaptive-training",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        default=True,
-    )
-    args.add_argument(
-        "--cpu-flush",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    args.add_argument(
-        "--backend",
-        type=str,
-        choices=["graph", "vm"],
-        help="example: graph / vm",
-        required=True,
-    )
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.input_shape = json.loads(parsed.input_shape)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=600,
-    )
-    return parsed
-
-
-ARGS = _parse_args()
-
-
-def main():
-    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
-
-    runner = auto_scheduler.RPCRunner(
-        key=ARGS.rpc_key,
-        host=ARGS.rpc_host,
-        port=ARGS.rpc_port,
-        n_parallel=cpu_count(logical=True),
-        number=ARGS.number,
-        repeat=ARGS.repeat,
-        min_repeat_ms=ARGS.min_repeat_ms,
-        enable_cpu_cache_flush=ARGS.cpu_flush,
-        timeout=ARGS.rpc_config.session_timeout_sec,
-    )
-
-    if ARGS.target.kind.name == "llvm":
-        hardware_params = auto_scheduler.HardwareParams(
-            num_cores=int(ARGS.target.attrs["num-cores"]),
-            target=ARGS.target,
-        )
-    elif ARGS.target.kind.name == "cuda":
-        hardware_params = auto_scheduler.HardwareParams(
-            num_cores=-1,
-            vector_unit_bytes=16,
-            cache_line_bytes=64,
-            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
-            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
-            # The value `max_local_memory_per_block` is not used in AutoScheduler,
-            # but is required by the API.
-            max_local_memory_per_block=12345678,
-            max_vthread_extent=8,
-            warp_size=32,
-        )
-    else:
-        raise NotImplementedError(f"Unsupported target {ARGS.target}")
-
-    describe()
-    print(f"Workload: {ARGS.workload}")
-    mod, params, (input_name, input_shape, input_dtype) = get_network(
-        ARGS.workload,
-        ARGS.input_shape,
-        layout=ARGS.layout,
-        cache_dir=ARGS.cache_dir,
-    )
-    input_info = [
-        {
-            "name": input_name,
-            "shape": input_shape,
-            "dtype": input_dtype,
-        },
-    ]
-    input_data = {
-        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in input_info
-    }
-    for item in input_info:
-        print(f"  input_name : {item['name']}")
-        print(f"  input_shape: {item['shape']}")
-        print(f"  input_dtype: {item['dtype']}")
-
-    with ms.Profiler() as profiler:
-        with ms.Profiler.timeit("TaskExtraction"):
-            tasks, task_weights = auto_scheduler.extract_tasks(
-                mod["main"],
-                params,
-                target=ARGS.target,
-                hardware_params=hardware_params,
-            )
-            for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
-                print(
-                    f"==== Task {idx}: {task.desc} "
-                    f"(weight {task_weight} key: {task.workload_key}) ====="
-                )
-                print(task.compute_dag)
-
-        with ms.Profiler.timeit("Tuning"):
-            if ARGS.num_trials > 0:
-                tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-                tuner.tune(
-                    auto_scheduler.TuningOptions(
-                        num_measure_trials=ARGS.num_trials,
-                        runner=runner,
-                        measure_callbacks=[
-                            auto_scheduler.RecordToFile(log_file),
-                        ],
-                    ),
-                    adaptive_training=ARGS.adaptive_training,
-                )
-
-        relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend]
-        with ms.Profiler.timeit("PostTuningCompilation"):
-            with auto_scheduler.ApplyHistoryBest(log_file):
-                with tvm.transform.PassContext(
-                    opt_level=3,
-                    config={"relay.backend.use_auto_scheduler": True},
-                ):
-                    lib = relay_build(
-                        mod,
-                        target=ARGS.target,
-                        params=params,
-                    )
-    print("Tuning Time:")
-    print(profiler.table())
-
-    run_module_via_rpc(
-        rpc_config=ARGS.rpc_config,
-        lib=lib,
-        dev_type=ARGS.target.kind.name,
-        args=input_data,
-        continuation=create_timer(ARGS.backend),
-        backend=ARGS.backend,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/auto_scheduler/testing/tune_te.py b/python/tvm/auto_scheduler/testing/tune_te.py
deleted file mode 100644
index 9452d88a4e65..000000000000
--- a/python/tvm/auto_scheduler/testing/tune_te.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import argparse
-import os
-
-import tvm
-from tvm import auto_scheduler
-from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.te_workload import CONFIGS
-from tvm.meta_schedule.utils import cpu_count
-from tvm.support import describe
-from tvm.testing.utils import strtobool
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--workload",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-port",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-key",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--number",
-        type=int,
-        default=3,
-    )
-    args.add_argument(
-        "--repeat",
-        type=int,
-        default=1,
-    )
-    args.add_argument(
-        "--min-repeat-ms",
-        type=int,
-        default=100,
-    )
-    args.add_argument(
-        "--adaptive-training",
-        type=lambda x: bool(strtobool(x)),
-        required=False,
-        help="example: True / False",
-        default=True,
-    )
-    args.add_argument(
-        "--cpu-flush",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=60,
-    )
-    return parsed
-
-
-ARGS = _parse_args()
-
-
-def main():
-    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
-
-    runner = auto_scheduler.RPCRunner(
-        key=ARGS.rpc_key,
-        host=ARGS.rpc_host,
-        port=ARGS.rpc_port,
-        n_parallel=cpu_count(logical=True),
-        number=ARGS.number,
-        repeat=ARGS.repeat,
-        min_repeat_ms=ARGS.min_repeat_ms,
-        enable_cpu_cache_flush=ARGS.cpu_flush,
-        timeout=ARGS.rpc_config.session_timeout_sec,
-    )
-
-    if ARGS.target.kind.name == "llvm":
-        hardware_params = auto_scheduler.HardwareParams(
-            num_cores=int(ARGS.target.attrs["num-cores"]),
-            target=ARGS.target,
-        )
-    elif ARGS.target.kind.name == "cuda":
-        hardware_params = auto_scheduler.HardwareParams(
-            num_cores=-1,
-            vector_unit_bytes=16,
-            cache_line_bytes=64,
-            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
-            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
-            # The value `max_local_memory_per_block` is not used in AutoScheduler,
-            # but is required by the API.
-            max_local_memory_per_block=12345678,
-            max_vthread_extent=8,
-            warp_size=32,
-        )
-    else:
-        raise NotImplementedError(f"Unsupported target {ARGS.target}")
-
-    describe()
-    print(f"Workload: {ARGS.workload}")
-    with ms.Profiler() as profiler:
-        # Same as MetaSchedule Tune TE
-        # Does not count ApplyHistoryBest time
-
-        workload_func, params = CONFIGS[ARGS.workload]
-        params = params[0]  # type: ignore
-        workload_func = auto_scheduler.register_workload(workload_func)
-
-        task = auto_scheduler.SearchTask(
-            func=workload_func,
-            args=params,
-            target=ARGS.target,
-            hardware_params=hardware_params,
-        )
-        # Inspect the computational graph
-        print("Computational DAG:")
-        print(task.compute_dag)
-        tune_option = auto_scheduler.TuningOptions(
-            num_measure_trials=ARGS.num_trials,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-            verbose=2,
-            runner=runner,
-        )
-        if ARGS.num_trials > 0:
-            print("Running AutoTuning:")
-            task.tune(tune_option, adaptive_training=ARGS.adaptive_training)
-
-    print("Tuning Time:")
-    print(profiler.table())
-
-    print("History Best:")
-    print(task.print_best(log_file))
-
-    sch, args = task.apply_best(log_file)
-    print("Lowered TIR:")
-    print(tvm.lower(sch, args, simple_mode=True))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
deleted file mode 100644
index 4d05fc4856c8..000000000000
--- a/python/tvm/auto_scheduler/utils.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-""" Common utilities for auto_scheduler. """
-
-from typing import Hashable
-import json
-import signal
-import threading
-import traceback
-import os
-
-import numpy as np
-
-try:
-    import psutil
-except ImportError:
-    psutil = None
-
-import tvm
-from tvm import rpc
-from tvm.tir import expr
-from tvm.tir.transform import Simplify
-from tvm.ir.transform import Sequential
-from ..te import Tensor, placeholder
-
-
-def decode_workload_key(workload_key):
-    """Decode the workload key from a string to the name and arguments. The wokrload key
-    is expected to be a list of "[func_name/hash, args ...]" in a JSON string. If not,
-    then simply return the workload key as the name without arguments.
-
-    Parameters
-    ----------
-    workload_key: str
-        The workload key in string. Format: "[func_name/hash, args ...]".
-
-    Returns
-    -------
-    name: str
-        The workload function name or the DAG hash.
-    args: Optional[Tuple[Any, ...]]
-        The flatten arguments in a tuple, or None if the workload key format is not decodeable.
-    """
-
-    def flatten_list(inp):
-        ret = []
-        for elt in inp:
-            if isinstance(elt, list):
-                ret += flatten_list(elt)
-            else:
-                ret.append(elt)
-        return ret
-
-    try:
-        key_list = json.loads(workload_key)
-        if isinstance(key_list, list) and len(key_list) >= 1:
-            return key_list[0], tuple(flatten_list(key_list[1:]))
-    except json.decoder.JSONDecodeError:
-        pass
-    return workload_key, None
-
-
-def calc_workload_dis_factor(target_workload_pair, workload_pair):
-    """Calculate the distance factor of the workload to the target workload.
-    If two workloads are not compatible at all (i.e., different compute DAG or function),
-    then the distance factor is "inf". Otherwise, we calculate the factor by traversing
-    the workload arguments, which are the arguments of the compute function,
-    or the output shapes for the ComputeDAG. The factor is calculated by the following rules:
-
-    1. For non-zero integer values: `product(target_arg / candidate_arg)`.
-    2. For non-integer or zero values: "inf" if not equal else 1.
-
-    As a result, factor=1 is the optimal when two workloads are identical.
-
-    Parameters
-    ----------
-    target_workload_pair: Tuple[str, Optional[Tuple[Any, ...]]]
-        The target workload pair: (hash, argument tuple).
-
-    workload_pair: Tuple[str, Optional[Tuple[Any, ...]]]
-        The candidate workload pair: (hash, argument tuple).
-
-    Returns
-    -------
-    dis_f: float
-        The distance factor.
-    """
-    target_key, target_args = target_workload_pair
-    target_args = target_args if target_args is not None else []
-    key, args = workload_pair
-    args = args if args is not None else []
-
-    # Not even the same func/DAG.
-    if key != target_key or len(target_args) != len(args):
-        return float("inf")
-
-    dis_f = 1
-    for target_arg, arg in zip(target_args, args):
-        if isinstance(target_arg, int):
-            if target_arg == 0 or arg == 0:
-                if target_arg != arg:
-                    return float("inf")
-            elif target_arg % arg != 0:
-                return float("inf")
-            else:
-                dis_f *= target_arg / arg
-        elif target_arg != arg:
-            return float("inf")
-    return dis_f
-
-
-def get_func_name(func):
-    """Get name of a function.
-
-    Parameters
-    ----------
-    func: Function
-        The input function.
-
-    Returns
-    -------
-    name: str
-        The function name.
-    """
-    return func.func_name if hasattr(func, "func_name") else func.__qualname__
-
-
-def get_const_int(exp):
-    """Verifies expr is integer and get the constant value.
-
-    Parameters
-    ----------
-    exp : Union[tvm.tir.expr, int]
-        The input expression.
-
-    Returns
-    -------
-    out_value : int
-        The output.
-    """
-    if isinstance(exp, int):
-        return exp
-    if not isinstance(exp, expr.IntImm):
-        opt = Sequential([Simplify()])
-        exp = opt(exp)
-    if not isinstance(exp, expr.IntImm):
-        raise ValueError("Expect value to be constant int")
-    return exp.value
-
-
-def get_const_tuple(in_tuple):
-    """Verifies input tuple is IntImm, returns tuple of int.
-
-    Parameters
-    ----------
-    in_tuple : Tuple[tvm.tir.expr]
-        The input.
-
-    Returns
-    -------
-    out_tuple : Tuple[Union[int,tvm.tir.Var,tvm.tir.Any]]
-        The output tuple of int. The dynamic shape variables (Var or Any) will be preserved.
-    """
-    ret = []
-    for elem in in_tuple:
-        if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)):
-            ret.append(elem)
-        else:
-            ret.append(get_const_int(elem))
-    return tuple(ret)
-
-
-def list_to_tuple(x):
-    """Convert a list to a tuple recursively."""
-    assert isinstance(x, list)
-    return tuple(list_to_tuple(y) if isinstance(y, list) else y for y in x)
-
-
-def serialize_args(args):
-    """
-    Serialize arguments of a function to a hashable and jsonable tuple.
-    Currently this is mainly used for tvm.tensor.Tensor
-    """
-    ret = []
-    if args is None:
-        return tuple(ret)
-
-    for t in args:
-        if isinstance(t, Tensor):
-            t = ("TENSOR", get_const_tuple(t.shape), t.dtype)
-        elif isinstance(t, list):
-            t = list_to_tuple(t)
-
-        assert isinstance(t, Hashable), str(t) + " is not hashable"
-        ret.append(t)
-
-    return tuple(ret)
-
-
-def deserialize_args(args):
-    """The inverse function of :code:`serialize_args`"""
-    ret = []
-    for t in args:
-        if isinstance(t, (tuple, list)) and t[0] == "TENSOR":
-            ret.append(placeholder(shape=t[1], dtype=t[2]))
-        else:
-            ret.append(t)
-    return ret
-
-
-def kill_child_processes(parent_pid, sig=signal.SIGTERM):
-    """kill all child processes recursively"""
-    if not psutil:
-        raise ImportError("psutil not found, try `pip install psutil` to fix this")
-
-    try:
-        parent = psutil.Process(parent_pid)
-    except psutil.NoSuchProcess:
-        return
-
-    try:
-        children = parent.children(recursive=True)
-        for process in children:
-            process.send_signal(sig)
-    except psutil.NoSuchProcess:
-        return
-
-
-# The maximum length of traceback information
-MAX_TRACEBACK_INFO_LEN = 512
-
-
-def make_traceback_info():
-    """Get the error message from traceback."""
-    info = str(traceback.format_exc())
-    if len(info) > MAX_TRACEBACK_INFO_LEN:
-        info = (
-            info[: MAX_TRACEBACK_INFO_LEN // 2] + "\n...\n" + info[-MAX_TRACEBACK_INFO_LEN // 2 :]
-        )
-    return info
-
-
-class PropagatingThread(threading.Thread):
-    """A thread that propagates the exception to the main thread"""
-
-    def run(self):
-        self.exc = None
-        try:
-            self.ret = self._target(*self._args, **self._kwargs)
-        except Exception as e:  # pylint: disable=broad-except
-            self.exc = e
-
-    def join(self, timeout=None):
-        super(PropagatingThread, self).join(timeout)
-        if self.exc:
-            raise self.exc
-        return self.ret
-
-
-def call_func_with_thread(func, args, kwargs):
-    """Call a function within a new thread"""
-    res = []
-
-    def wrapper():
-        res.append(func(*args, **kwargs))
-
-    t = PropagatingThread(target=wrapper)
-    t.start()
-    t.join()
-    return res[0]
-
-
-def call_func_with_timeout(
-    worker, timeout, func, args=(), kwargs=None
-):  # pylint: disable=unused-argument
-    """Call a function with timeout"""
-    worker.send(func, args, kwargs, timeout)
-    try:
-        res = worker.recv()
-    except Exception:  # pylint: disable=broad-except
-        res = Exception(make_traceback_info())
-
-    return res
-
-
-def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
-    """Request a remote session.
-
-    Parameters
-    ----------
-    device_key : str
-        The device key of registered device in tracker.
-    host : Optional[str]
-        The host address of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_HOST".
-    port : Optional[int]
-        The port of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_PORT".
-    priority : int = 1
-        The priority of this request, larger is more prior.
-    timeout : int = 60
-        The timeout of this session in second.
-
-    Returns
-    -------
-    remote : RPCSession
-        The connected remote RPCSession.
-    """
-    # connect to the tracker
-    host = host or os.environ["TVM_TRACKER_HOST"]
-    port = port or int(os.environ["TVM_TRACKER_PORT"])
-
-    tracker = rpc.connect_tracker(host, port)
-    remote = tracker.request(device_key, priority=priority, session_timeout=timeout)
-    return remote
-
-
-def check_remote(device_key, host=None, port=None, priority=100, timeout=10):
-    """
-    Check the availability of a remote device.
-
-    Parameters
-    ----------
-    device_key: str
-        device key of registered device in tracker.
-    host: Optional[str]
-        The host address of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_HOST".
-    port: Optional[int]
-        The port address of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_PORT".
-    priority: int = 100
-        The priority of this request, larger is more prior.
-    timeout: int = 10
-        The timeout of this check in seconds.
-
-    Returns
-    -------
-    available: bool
-        True if can find available device.
-    """
-
-    def _check():
-        request_remote(device_key, host, port, priority)
-
-    t = threading.Thread(target=_check)
-    t.start()
-    t.join(timeout)
-    return not t.is_alive()
-
-
-def array_mean(arr):
-    """Compute mean of the elments in a TVM Array<PrimExpr>
-
-    Parameters
-    ----------
-    arr: Array
-        A TVM Array<PrimExpr>
-
-    Returns
-    -------
-    mean: float
-        The mean of the elements in the array
-    """
-    return sum(x.value for x in arr) / len(arr)
-
-
-def to_str_round(x, decimal=6):
-    """Convert an object to str and round float numbers
-
-    Parameters
-    ----------
-    x: Union[str, list, int, float, np.ndarray]
-        The input object
-    decimal: int
-        The precision of decimal fraction
-
-    Returns
-    -------
-    ret: str
-        The string format of these objects
-    """
-    if isinstance(x, str):
-        return x
-    if isinstance(x, (list, tuple, np.ndarray)):
-        return "[" + ", ".join([to_str_round(y, decimal=decimal) for y in x]) + "]"
-    if isinstance(x, dict):
-        return str({k: to_str_round(v) for k, v in x.items()})
-    if isinstance(x, int):
-        return str(x)
-    if isinstance(x, (np.float32, np.float64, float)):
-        format_str = f"%.{decimal}f"
-        return format_str % x
-    raise ValueError(f"Invalid value: {str(x)}\ttype: {type(x)}")
diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
deleted file mode 100644
index 62ba2245b002..000000000000
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ /dev/null
@@ -1,280 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""
-Workload registration and serialization.
-
-We use a json string to represent a workload (a computation graph).
-The format of the string is `[func_name, [args...]]`.
-The dag should be the return value of this `func_name(*args)`.
-
-Rationale: The workload is actually a compute dag defined by tvm dsl. But serializing compute dags
-and matching them efficiently is not easy. Therefore, we use the above string to encode a compute
-dag.
-These strings are efficient for serialization/matching and won't be too long.
-When we need the dag, we decode the string and call the function, which will return the dag.
-"""
-
-import json
-import logging
-import pickle
-
-import tvm._ffi
-from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON
-
-from .utils import deserialize_args, get_func_name, serialize_args
-
-logger = logging.getLogger("auto_scheduler")
-
-# Global workload function and hash key registry
-# It stores two types of workload:
-# 1. User registered tasks. This type of workload is registered
-#    by the decorator "register_workload"
-# 2. Extracted tasks from a relay program. This type of workload is
-#    registered by function "register_workload_tensors".
-#
-# For 1, the dictionary maps a function name to its function pointer
-# For 2, the dictionary maps a hash key to a list of input/output tensors
-WORKLOAD_FUNC_REGISTRY = {}
-
-
-def register_workload(func_name, f=None, override=False):
-    """Register a function that generates a certain workload.
-
-    The input function should take hashable and jsonable arguments
-    (int, float, tuple of int, tvm.tensor.Tensor, ...) and return a list of tvm.tensor.Tensor.
-
-    Parameters
-    ----------
-    func_name : Union[Function, str]
-        The generation function that returns the compute declaration Tensors or its function name.
-    f : Optional[Function]
-        The generation function to be registered.
-    override : boolean = False
-        Whether to override existing entry.
-
-    Examples
-    --------
-    .. code-block:: python
-
-      @auto_scheduler.register_workload
-      def matmul(N, M, K):
-          A = te.placeholder((N, K), name='A')
-          B = te.placeholder((K, M), name='B')
-          k = te.reduce_axis((0, K), name='k')
-          C = te.compute((N, M), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name='C')
-          return [A, B, C]
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    if callable(func_name):
-        f = func_name
-        func_name = get_func_name(f)
-    if not isinstance(func_name, str):
-        raise ValueError("expect string function name")
-
-    def register(myf):
-        """internal register function"""
-        if func_name in WORKLOAD_FUNC_REGISTRY and not override:
-            raise RuntimeError(f"{func_name} has been registered already")
-        WORKLOAD_FUNC_REGISTRY[func_name] = myf
-        return myf
-
-    if f:
-        return register(f)
-    return register
-
-
-def register_workload_tensors(workload_key, tensors, override=True):
-    """Register a workload by provding input/output tensors. Since this function is used
-    when extracting/deserializing tasks, it expects duplicated registrations by default.
-
-    Parameters
-    ----------
-    workload_key: str
-        The wokrload key of the compute DAG in JSON string.
-    tensors: List[Tensor]
-        The input/output tensors of a compute DAG
-    override : boolean = True
-        Whether to override existing entry.
-
-    Returns
-    -------
-    workload_key: str
-        The wokrload key of the compute DAG in JSON string.
-    """
-    register_workload(workload_key, override=override)(tensors)
-    return workload_key
-
-
-def make_workload_key(func, args):
-    """Make a workload key by function and arguments.
-
-    Parameters
-    ----------
-    func : Union[Function, str]
-        The function that returns the compute declaration Tensors.
-        Can be the a function or the function name.
-    args : Args
-        The args of the function.
-
-    Returns
-    -------
-    workload_key : str
-        The workload key of the function.
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    if callable(func):
-        func_name = get_func_name(func)
-    elif isinstance(func, str):
-        func_name = func
-    else:
-        raise ValueError(
-            "Invalid function: "
-            + str(func)
-            + " . `make_workload_key` expects a callable function or its function name"
-        )
-
-    if not func_name in WORKLOAD_FUNC_REGISTRY:
-        raise ValueError(
-            f"{func} is not registered. "
-            f"Please register it with @auto_scheduler.register_workload"
-        )
-
-    args = serialize_args(args)
-
-    return json.dumps((func_name,) + args)
-
-
-@tvm._ffi.register_func("auto_scheduler.workload_key_to_tensors")
-def workload_key_to_tensors(workload_key):
-    """Get the input/output tensors from the workload key.
-
-    This method is usually used to create a ComputeDAG by workload key.
-
-    Parameters
-    ----------
-    workload_key : str
-        The input workload key in JSON string. The format is either (func_name, arguments...)
-        for compute functions, or (hash, shapes...) for ComputeDAG.
-
-    Returns
-    -------
-    tensors : List[Tensor]
-        The registered compute declaration Tensors.
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    # We register ComputeDAG with both hash and argumetns, which are fixed in ComputeDAG,
-    # so we use an entire workload key to query the ComputeDAG.
-    if workload_key in WORKLOAD_FUNC_REGISTRY:
-        return WORKLOAD_FUNC_REGISTRY[workload_key]
-
-    # We register compute function with only the function name since
-    # it does not bind to specific arguments, so we use the function name to query
-    # the function and call the function with arguments to get the tensors.
-    workload = json.loads(workload_key)
-    name = workload[0]
-    value = WORKLOAD_FUNC_REGISTRY[name]
-    assert callable(value)
-
-    args = deserialize_args(workload[1:])
-    result = value(*args)
-    if isinstance(result, tuple):
-        result = list(result)
-    return result
-
-
-def serialize_workload_registry_entry(workload_key):
-    """
-    Serialize a workload registry entry.
-
-    This is used when the start method of multiprocessing is spawn.
-    We need to serialize the entry and register it in the new processes.
-
-    Parameters
-    ----------
-    workload_key : str
-        The workload key
-
-    Returns
-    -------
-    data: Tuple
-        The serialized pickable data
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    if workload_key in WORKLOAD_FUNC_REGISTRY:
-        sname = workload_key
-    else:
-        workload = json.loads(workload_key)
-        sname = workload[0]
-
-    svalue = WORKLOAD_FUNC_REGISTRY[sname]
-    if not callable(svalue):
-        # pylint: disable=assignment-from-no-return
-        svalue = SaveJSON(svalue)
-
-    return sname, svalue
-
-
-def deserialize_workload_registry_entry(data):
-    """
-    Deserialize a workload registry entry.
-    This should be used along with :code:`serialize_workload_registry_entry`
-
-    Parameters
-    ----------
-    data: Tuple
-        The return value of :code:`serialize_workload_registry_entry`
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    name, value = data
-    if name not in WORKLOAD_FUNC_REGISTRY:
-        # pylint: disable=assignment-from-no-return
-        if not callable(value):
-            value = LoadJSON(value)
-        WORKLOAD_FUNC_REGISTRY[name] = value
-
-
-def save_workload_func_registry(filename):
-    """Dump workload function registry to a pickle binary file.
-
-    Parameters
-    ----------
-    filename : str
-        The filename to dump workload function registry to.
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    pickle.dump(WORKLOAD_FUNC_REGISTRY, open(filename, "wb"))
-
-
-def load_workload_func_registry(filename):
-    """Load workload function registry from a pickle binary file.
-
-    Parameters
-    ----------
-    filename : str
-        The filename to load workload function registry from.
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    WORKLOAD_FUNC_REGISTRY = pickle.load(open(filename, "rb"))
diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
deleted file mode 100644
index 5a7d00960ecd..000000000000
--- a/python/tvm/autotvm/__init__.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The auto-tuning module of tvm
-
-This module includes:
-
-* Tuning space definition API
-
-* Efficient auto-tuners
-
-* Tuning result and database support
-
-* Distributed measurement to scale up tuning
-"""
-
-from . import database
-from . import feature
-from . import measure
-from . import record
-from . import task
-from . import tuner
-from . import utils
-from . import env
-from . import tophub
-
-# some shortcuts
-from .measure import (
-    measure_option,
-    MeasureInput,
-    MeasureResult,
-    MeasureErrorNo,
-    LocalBuilder,
-    LocalRunner,
-    RPCRunner,
-)
-from .tuner import callback
-from .task import (
-    get_config,
-    create,
-    ConfigSpace,
-    ConfigEntity,
-    register_topi_compute,
-    register_topi_schedule,
-    template,
-    DispatchContext,
-    FallbackContext,
-    ApplyHistoryBest as apply_history_best,
-    ApplyGraphBest as apply_graph_best,
-    ApplyFixedConfig as apply_fixed_config,
-)
-from .env import GLOBAL_SCOPE
diff --git a/python/tvm/autotvm/database.py b/python/tvm/autotvm/database.py
deleted file mode 100644
index 7246f81d6a59..000000000000
--- a/python/tvm/autotvm/database.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=consider-using-enumerate, invalid-name, use-list-literal
-"""
-Database of MeasureInput/MeasureResult pair.
-This can be used for replaying measurement.
-"""
-import os
-
-from .record import encode, decode, measure_str_key
-
-
-class Database(object):
-    """
-    Base class for a record database object.
-    """
-
-    def load(self, inp, get_all=False):
-        """
-        Load a result based on an input's string key
-
-        Parameters
-        ----------
-        inp: MeasureInput
-            to be translated into key for RedisDB
-        get_all: bool, optional
-            Whether the latest result (or all matching results) should be returned
-
-        Returns
-        -------
-        rec: MeasureResult if previously saved, otherwise None
-        """
-        raise NotImplementedError()
-
-    def save(self, inp, res, extend=False):
-        """
-        Save a result based on an input's string key
-
-        Parameters
-        ----------
-        inp: MeasureInput
-            to be translated into key for RedisDB
-        res: MeasureResult
-            to associate with key
-        extend:
-            Whether to extend existing MeasureResults if they exist
-        """
-        raise NotImplementedError()
-
-
-def filter_inputs(db, measure_inputs, retry=False):
-    """
-    Filter a measure_inputs batch based on saved db results
-
-    Parameters
-    ----------
-    db: Database
-        database object
-    measure_inputs: Array of MeasureInput
-        measure_inputs as expected in measure_batch
-    retry: bool
-        whether to retry if the saved result is a failure
-
-    Returns
-    -------
-    partial_results: Array of MeasureResult
-        a full list of result, where None denotes no corresponding saved result
-    unsaved: Array of MeasureInput
-        a list that only contains unsaved inputs
-    """
-    partial_results = list()
-    unsaved = list()
-    for inp in measure_inputs:
-        res = db.load(inp)
-        if res is None or (retry and res.error_no != 0):
-            unsaved.append(inp)
-            partial_results.append(None)
-        else:
-            partial_results.append(res)
-    return partial_results, unsaved
-
-
-class RedisDatabase(Database):
-    """
-    Redis version of record database
-    """
-
-    REDIS_PROD = 15
-    REDIS_LOCA = 14
-    REDIS_TEST = 13  # for unit test
-    REDIS_NIGHT_TEMP = 12  # for nightly report (will be flushed after every workload)
-
-    MAGIC_SPLIT = "$"
-
-    def __init__(self, db_index=REDIS_PROD):
-        # pylint: disable=import-outside-toplevel
-        import redis
-
-        if db_index == RedisDatabase.REDIS_TEST:
-            host = "127.0.0.1"
-        else:
-            host = os.environ.get("TVM_FLEET_HOST")
-        self.db = redis.StrictRedis(host=host, port=6379, db=db_index)
-        self.db_index = db_index
-
-    def set(self, key, value):
-        self.db.set(key, value)
-
-    def get(self, key):
-        current = self.db.get(key)
-        return current.decode() if isinstance(current, bytes) else current
-
-    def load(self, inp, get_all=False):
-        current = self.get(measure_str_key(inp))
-        if current is not None:
-            records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)]
-            results = [rec[1] for rec in records if rec is not None]
-            if get_all:
-                return results
-            return max(results, key=lambda result: result.timestamp)
-        return current
-
-    def save(self, inp, res, extend=False):
-        current = self.get(measure_str_key(inp))
-        if not extend or current is None:
-            self.set(measure_str_key(inp), RedisDatabase.MAGIC_SPLIT.join([encode(inp, res)]))
-        else:
-            current = current.split(RedisDatabase.MAGIC_SPLIT)
-            self.set(
-                measure_str_key(inp), RedisDatabase.MAGIC_SPLIT.join(current + [encode(inp, res)])
-            )
-
-    def filter(self, func):
-        """
-        Dump all of the records that match the given rule
-
-        Parameters
-        ----------
-        func: callable
-            The signature of the function is (MeasureInput, [MeasureResult]) -> bool
-
-        Returns
-        -------
-        list of records in tuple (MeasureInput, MeasureResult) matching the rule
-
-        Examples
-        --------
-        get records for a target
-        >>> db.filter(lambda inp, results: "cuda" in inp.target.keys)
-        get records with errors
-        >>> db.filter(lambda inp, results: any(r.error_no != 0 for r in results))
-        """
-        matched_records = list()
-        # may consider filtering in iterator in the future
-        for key in self.db.keys():
-            current = self.get(key)
-            try:
-                records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)]
-                records = [rec for rec in records if rec is not None]
-            except TypeError:  # got a badly formatted/old format record
-                continue
-
-            if not records:
-                continue
-            inps, results = zip(*records)
-            inp = inps[0]
-            if not func(inp, results):
-                continue
-            result = max(results, key=lambda res: res.timestamp)
-            matched_records.append((inp, result))
-        return matched_records
-
-    def flush(self):
-        self.db.flushdb()
-
-
-class DummyDatabase(RedisDatabase):
-    """
-    A database based on python dictionary for testing.
-    """
-
-    def __init__(self):
-        # pylint: disable=super-init-not-called
-        self.db = {}
-
-    def set(self, key, value):
-        self.db[key] = value
-
-    def flush(self):
-        self.db = {}
diff --git a/python/tvm/autotvm/env.py b/python/tvm/autotvm/env.py
deleted file mode 100644
index 52ec8828bc1e..000000000000
--- a/python/tvm/autotvm/env.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=global-variable-not-assigned
-"""Global configuration/variable scope for autotvm"""
-
-
-class AutotvmGlobalScope(object):
-    """The global autotvm scope."""
-
-    current = None
-
-    def __init__(self):
-        self._old = AutotvmGlobalScope.current
-        AutotvmGlobalScope.current = self
-
-        self.in_tuning = False
-        self.silent = False
-
-    def deep_copy(self, global_scope):
-        """Deep copy from another instance of AutotvmGlobalScope."""
-        self._old = AutotvmGlobalScope.current
-
-        self.in_tuning = global_scope.in_tuning
-        self.silent = global_scope.silent
-
-
-GLOBAL_SCOPE = AutotvmGlobalScope()
-
-
-def reset_global_scope(global_scope):
-    """Reset global autotvm state. This is needed to initialize PopenPool workers."""
-    global GLOBAL_SCOPE
-    GLOBAL_SCOPE.deep_copy(global_scope)
-    AutotvmGlobalScope.current = global_scope
diff --git a/python/tvm/autotvm/feature.py b/python/tvm/autotvm/feature.py
deleted file mode 100644
index 1b66d79d0e5e..000000000000
--- a/python/tvm/autotvm/feature.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,
-"""Extract feature of iter vars
-
-There are two types of feature
-1) Itervar feature
-   This feature is extracted based on loop variables.
-   Different loop structures will result in different shapes of feature
-2) Curve sample feature (relation feature)
-   This feature is extracted by sampling relation curve.
-   This feature is invariant of loop structure.
-"""
-
-import struct
-import numpy as np
-import tvm._ffi
-
-from tvm.target import Target
-from tvm.driver import build_module
-
-
-def ana_lower(sch, args, binds=None, simple_mode=True):
-    """Do lower while keeping all axes in IR
-    i.e. Do not eliminate loop with extent of 1, do not vectorize, unroll or inject virtual threads
-    """
-    sch = sch.normalize()
-    # Phase 0
-    context = tvm.transform.PassContext(config={"tir.debug_keep_trivial_loop": True})
-    with context:
-        mod = build_module.schedule_to_module(sch, args, binds=binds)
-
-    mod = tvm.tir.transform.StorageFlatten(64)(mod._move())
-    mod = tvm.tir.transform.Simplify()(mod._move())
-    assert simple_mode
-    return mod["main"].body
-
-
-try:
-    _get_buffer_curve_sample_flatten = tvm._ffi.get_global_func(
-        "autotvm.feature.GetCurveSampleFeatureFlatten"
-    )
-    _get_itervar_feature = tvm._ffi.get_global_func("autotvm.feature.GetItervarFeature")
-    _get_itervar_feature_flatten = tvm._ffi.get_global_func(
-        "autotvm.feature.GetItervarFeatureFlatten"
-    )
-except ValueError as e:
-
-    def raise_error(*args, **kwargs):  # pylint: disable=unused-argument
-        raise RuntimeError("Cannot load autotvm c++ API")
-
-    _get_buffer_curve_sample_flatten = (
-        _get_itervar_feature
-    ) = _get_itervar_feature_flatten = raise_error
-
-
-def get_itervar_feature(sch, args, take_log=False):
-    """get features of iter vars
-
-    Parameters
-    ----------
-    sch: tvm.te.schedule.Schedule
-    args: Array of te.tensor.Tensor
-        the buffer args for lower
-    take_log: bool
-        whether take log of numerical statics
-
-    Returns
-    -------
-    features of every axis in the IR, see doc/features.md for detail
-    """
-    stmt = ana_lower(sch, args, simple_mode=True)
-    feas = _get_itervar_feature(stmt, take_log)
-
-    # convert tvm node to python type
-    ret = []
-    for row in feas:
-        tmp = []
-        tmp.append([row[0][0].value, row[0][1]])
-        for item in row[1:]:
-            tmp.append([item[0].value] + [x.value for x in item[1:]])
-        ret.append(tmp)
-    return ret
-
-
-def flatten_itervar_feature(fea):
-    """flatten features into one-dimensional feature vectors
-
-    Parameters
-    ----------
-    fea: list
-        return value of get_itervar_feature
-
-    Returns
-    -------
-    flatten_feature: np.ndarray
-        one-dimensional vector
-    """
-    flatten = []
-    for axis in fea:
-        for pair in axis[1:]:
-            flatten.append(pair[1:])
-    return np.concatenate(flatten)
-
-
-def get_itervar_feature_flatten(sch, args, take_log=True):
-    """get flatten features of iter vars
-    this is equivalent to get_itervar_feature + flatten_itervar_feature, but much faster.
-
-    Parameters
-    ----------
-    sch: tvm.te.schedule.Schedule
-    args: Array of te.tensor.Tensor
-        the buffer args for lower
-    take_log: bool
-        whether take log of numerical statics
-
-    Returns
-    -------
-    flatten_feature: np.ndarray
-        one-dimensional vector
-    """
-    stmt = ana_lower(sch, args, simple_mode=True)
-    feas = _get_itervar_feature_flatten(stmt, take_log)
-    feas = struct.unpack(f"{len(feas) // 4}f", feas)
-    return feas
-
-
-def get_flatten_name(fea):
-    """Get names of feature after flatten.
-
-    Parameters
-    ----------
-    fea: list or str
-        return value of get_itervar_feature or a line of logfile
-
-    Returns
-    -------
-    feature_names: Array of str
-    """
-
-    feature_name = {
-        "_attr_": ["length", "nest_level", "topdown", "bottomup"] + [f"ann_{i}" for i in range(20)],
-        "_arith_": ["add", "mul", "div"],
-        "buf_touch": ["stride", "mod", "count", "reuse", "T_count", "T_reuse"],
-    }
-
-    if isinstance(fea, str):
-        # pylint: disable=import-outside-toplevel
-        from .record import decode
-
-        # flatten line to feature
-        line = fea
-        ret = decode(line)
-        if ret is None:
-            raise ValueError("Unsupported AutoTVM log format")
-        inp, _ = ret
-        target = Target(inp.target)
-        with target:
-            s, args = inp.template.instantiate(inp.config)
-        fea = get_itervar_feature(s, args)
-
-    names = []
-    ct = 0
-    for row in fea:
-        var_name = str(row[0][1])
-        for pair in row[1:]:
-            key = pair[0]
-            if key in feature_name:
-                name_list = feature_name[key]
-            else:
-                name_list = feature_name["buf_touch"]
-
-            for i in range(len((pair[1:]))):
-                names.append(".".join([f"f{ct}", var_name, key, name_list[i]]))
-                ct += 1
-    return names
-
-
-def get_buffer_curve_sample_flatten(sch, args, sample_n=30):
-    """
-    Get flatten curve sample feature (relation feature)
-
-    Parameters
-    ----------
-    sch: tvm.te.schedule.Schedule
-    args: Array of te.tensor.Tensor
-        the buffer args for lower
-    sample_n: int
-        number of sample points along one dimension
-
-    Returns
-    -------
-    flatten_feature: np.ndarray
-        one-dimensional vector
-    """
-    stmt = ana_lower(sch, args, simple_mode=True)
-    feas = _get_buffer_curve_sample_flatten(stmt, sample_n, False)
-    feas = struct.unpack(f"{len(feas) // 4}f", feas)
-    return feas
diff --git a/python/tvm/autotvm/graph_tuner/__init__.py b/python/tvm/autotvm/graph_tuner/__init__.py
deleted file mode 100644
index d590db0e7c48..000000000000
--- a/python/tvm/autotvm/graph_tuner/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Autotvm graph tuner API."""
-from __future__ import absolute_import as _abs
-
-from . import _base
-from . import base_graph_tuner
-
-from .base_graph_tuner import BaseGraphTuner
-from .dynamic_programming_tuner import DPTuner
-from .pbqp_tuner import PBQPTuner
diff --git a/python/tvm/autotvm/graph_tuner/_base.py b/python/tvm/autotvm/graph_tuner/_base.py
deleted file mode 100644
index ae220bb5e2f8..000000000000
--- a/python/tvm/autotvm/graph_tuner/_base.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Helper functions and global data"""
-
-
-# We set a large time to represent an invalid layout-transformation.
-# This number is set to be 10e9 seconds to align with autotvm.
-INVALID_LAYOUT_TIME = 10e9
-
-MAX_OUTPUT_NODES = 16
-
-OPT_OUT_OP = ["layout_transform"]
diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
deleted file mode 100644
index 7e975201c86a..000000000000
--- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
+++ /dev/null
@@ -1,591 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-instance-attributes,too-many-branches,too-many-nested-blocks,invalid-name,unused-argument,unused-variable,no-member,no-value-for-parameter
-"""Base class for graph tuner."""
-import logging
-from abc import abstractmethod
-
-import numpy as np
-from tvm import topi
-
-import tvm
-from tvm import te
-from tvm import autotvm, relay
-from tvm.autotvm.task import get_config
-from tvm.autotvm.record import encode, load_from_file
-from tvm.autotvm.measure import MeasureResult, MeasureInput
-from tvm.target import Target
-
-from ...target import Target
-from .utils import (
-    is_boundary_node,
-    get_in_nodes,
-    get_out_nodes,
-    has_multiple_inputs,
-    bind_inputs,
-    expr2graph,
-)
-from ._base import INVALID_LAYOUT_TIME
-
-from ._base import OPT_OUT_OP
-
-
-def get_infer_layout(task_name):
-    if task_name.startswith("conv2d"):
-        return topi.nn.conv2d_infer_layout
-    if task_name.startswith("depthwise_conv2d"):
-        return topi.nn.depthwise_conv2d_infer_layout
-    raise ValueError(f"Cannot find infer layout for task {task_name}")
-
-
-@autotvm.template("layout_transform")
-def layout_transform(*args):
-    """Autotvm layout transform template."""
-    cfg = get_config()
-    cfg.add_flop(-1)
-    data = args[0]
-    out = topi.layout_transform(*args)
-    sch = topi.generic.schedule_injective([out])
-    return sch, [data, out]
-
-
-class BaseGraphTuner(object):
-    """Class to search schedules considering both kernel execution time and
-    layout transformation time.
-
-    Before creating a Graph Executor instance, schedule candidates for all kernels in
-    graph should be provided through tensor-level tuning.
-    """
-
-    def __init__(
-        self,
-        graph,
-        input_shapes,
-        records,
-        target_ops,
-        target,
-        max_sch_num=20,
-        dtype="float32",
-        verbose=True,
-        log_file="graph_tuner.log",
-        log_level=logging.DEBUG,
-        name="graph_tuner",
-    ):
-        """Create a GlobalTuner instance. Local schedule searching for all nodes with
-        target_op in the input graph and layout transformation benchmark need to be
-        executed before initialization.
-
-        graph : tvm.relay.function.Function
-            Input graph
-
-        input_shapes : dict of str to tuple.
-            Input shapes of graph
-
-        records : str or iterator of (MeasureInput, MeasureResult)
-            Collection of kernel level tuning records.
-            If it is str, then it should be the filename of a records log file.
-                       Each row of this file is an encoded record pair.
-            Otherwise, it is an iterator.
-
-        target_ops : List of tvm.ir.Op
-            Target tuning operators.
-
-        target : str or tvm.target
-            Compilation target.
-
-        max_sch_num : int, optional
-            Maximum number of schedule candidates for each workload.
-
-        dtype : str, optional
-            Data type.
-
-        log_file : str, optional
-            graph tuner log file name
-
-        name : str, optional
-            Name of global tuner.
-        """
-        self._node_list = []
-        self._layout_transform_perf_records = {}
-        self._layout_transform_interlayer_cost = {}
-        self._input_shapes = input_shapes
-        self._target_ops = target_ops
-
-        self._name = name
-        self._max_sch_num = max_sch_num
-        self._optimal_sch_dict = {}
-        self._records = records
-        self._dtype = dtype
-        if isinstance(target, str):
-            target = Target(target)
-        self._target = target
-        self._optimal_record_dict = {}
-
-        # Set up logger
-        self._verbose = verbose
-        self._logger = logging.getLogger(name + "_logger")
-        need_file_handler = need_console_handler = True
-        for handler in self._logger.handlers:
-            if handler.__class__.__name__ == "FileHandler":
-                need_file_handler = False
-            if handler.__class__.__name__ == "StreamHandler":
-                need_console_handler = False
-        self._log_level = log_level
-        self._log_file = log_file
-        self._formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
-        self._logger.setLevel(log_level)
-        if need_file_handler:
-            file_handler = logging.FileHandler(log_file)
-            file_handler.setFormatter(self._formatter)
-            self._logger.addHandler(file_handler)
-        if self._verbose and need_console_handler:
-            console_handler = logging.StreamHandler()
-            console_handler.setFormatter(self._formatter)
-            self._logger.addHandler(console_handler)
-            self._logger.setLevel(log_level)
-            self._logger.propagate = False
-
-        # Generate workload and schedule dictionaries.
-        if isinstance(graph, tvm.IRModule):
-            graph = graph["main"]
-
-        if isinstance(graph, relay.function.Function):
-            node_dict = {}
-            graph = bind_inputs(graph, input_shapes, dtype)
-            expr2graph(graph, self._target_ops, node_dict, self._node_list, target)
-        else:
-            raise RuntimeError(f"Unsupported graph type: {type(graph)}")
-
-        self._graph = graph
-        self._in_nodes_dict = get_in_nodes(self._node_list, self._target_ops, input_shapes.keys())
-        if len(self._in_nodes_dict) == 0:
-            raise RuntimeError(
-                f"Could not find any input nodes with whose "
-                f"operator is one of {self._target_ops}"
-            )
-        self._out_nodes_dict = get_out_nodes(self._in_nodes_dict)
-        self._fetch_cfg()
-        self._opt_out_op = OPT_OUT_OP
-
-        # Setup infer_layout for elemwise-like nodes
-        # Note: graph tuner currently only supports tuning of single input and single output
-        # op as target op, such as conv2d, dense and conv2d_transpose. In this case, we can
-        # reuse infer_layout function from target ops for elemwise-like nodes. The behavior
-        # is to modify the first tensor shape of input workload to the output shape of
-        # elemwise-like node, and use infer_layout function from input op to generate layouts.
-        input_names = self._input_shapes.keys()
-        for idx in sorted(self._in_nodes_dict.keys()):
-            if has_multiple_inputs(self._node_list, idx, input_names, self._opt_out_op):
-                node_entry = self._node_list[idx]
-                node_entry["topi_op"] = []
-                node_entry["workloads"] = []
-                for input_idx in self._in_nodes_dict[idx]:
-                    input_node = self._node_list[input_idx]
-                    if not is_boundary_node(input_node, input_names):
-                        input_topi_op = input_node["topi_op"][0]
-                        node_entry["topi_op"].append(input_topi_op)
-                        # Only replace the first input tensor
-                        input_workload = input_node["workloads"][0]
-                        first_tensor = input_workload[1]
-                        dtype = first_tensor[-1]
-                        new_shape = tuple([val.value for val in node_entry["types"][0].shape])
-                        actual_workload = (
-                            (input_workload[0],)
-                            + (("TENSOR", new_shape, dtype),)
-                            + input_workload[2:]
-                        )
-                        node_entry["workloads"].append(actual_workload)
-                        if "record_candidates" not in node_entry:
-                            node_entry["record_candidates"] = input_node["record_candidates"]
-                    else:
-                        node_entry["topi_op"].append(None)
-                        node_entry["workloads"].append(None)
-
-    def _fetch_cfg(self):
-        """Read and pre-process input schedules."""
-        if isinstance(self._records, str):
-            records = load_from_file(self._records)
-        else:
-            records = self._records
-        cfg_dict = {}
-        for record in records:
-            in_measure, _ = record
-            workload = in_measure.task.workload
-            if workload not in cfg_dict:
-                cfg_dict[workload] = []
-            cfg_dict[workload].append(record)
-
-        cache_dict = {}
-        for key in self._in_nodes_dict:
-            node_entry = self._node_list[key]
-            if node_entry["op"] not in self._target_ops:
-                continue
-            workload = node_entry["workloads"][0]
-            if workload in cache_dict:
-                node_entry["record_candidates"] = cache_dict[workload]
-                continue
-            record_candidates = []
-            infer_layout_func = get_infer_layout(node_entry["topi_op"][0])
-            layout_tracking_dict = {}
-            for record in cfg_dict[workload]:
-                in_measure, out_measure = record
-                workload = in_measure.task.workload
-                cfg = in_measure.config
-                # For multiple cfgs which produces the same in/out layouts,
-                # only the most efficient one is preserved.
-                with self._target:
-                    layouts = infer_layout_func(workload, cfg)
-                    if layouts in layout_tracking_dict:
-                        cost = out_measure.costs[0]
-                        current_best_cost = layout_tracking_dict[layouts][1].costs[0]
-                        if cost < current_best_cost:
-                            layout_tracking_dict[layouts] = record
-                    else:
-                        layout_tracking_dict[layouts] = record
-            sorted_records = sorted(
-                layout_tracking_dict.values(), key=lambda item: item[1].costs[0]
-            )
-            for i in range(min(self._max_sch_num, len(sorted_records))):
-                record_candidates.append(sorted_records[i])
-            node_entry["record_candidates"] = record_candidates
-            cache_dict[workload] = record_candidates
-
-    def _iterate_layout_transform(self, callback):
-        """Iterate all possible layout transformations and execute callback for each
-        iteration. callback function accepts 6 arguments: from_node_idx, to_node_idx,
-        from_sch_idx, to_sch_idx, args which represent the argument list of layout
-        transformation and is_valid showing whether this is a valid layout transformation.
-        """
-        input_names = self._input_shapes.keys()
-        pair_tracker = set()
-        for key, val in self._in_nodes_dict.items():
-            node_entry = self._node_list[key]
-            target_input_idx = -1
-            target_input_pos = -1
-            if has_multiple_inputs(self._node_list, key, input_names, self._opt_out_op):
-                for i, item in enumerate(val):
-                    node = self._node_list[item]
-                    if not is_boundary_node(node, input_names):
-                        target_input_idx = item
-                        target_input_pos = i
-                        break
-
-            for i, item in enumerate(val):
-                i_idx = item
-                in_node_entry = self._node_list[i_idx]
-                if is_boundary_node(in_node_entry, input_names):
-                    continue
-
-                if node_entry["op"] in self._target_ops:
-                    o_idx = key
-                    o_infer_layout_func = get_infer_layout(node_entry["topi_op"][0])
-                    o_wkl = node_entry["workloads"][0]
-                    i_topi_op = in_node_entry["topi_op"][0]
-                    i_wkl = in_node_entry["workloads"][0]
-                    pivot = 0
-                    while not i_wkl:
-                        pivot += 1
-                        i_topi_op = in_node_entry["topi_op"][pivot]
-                        i_wkl = in_node_entry["workloads"][pivot]
-                    i_infer_layout_func = get_infer_layout(i_topi_op)
-                else:
-                    o_idx = target_input_idx
-                    if i <= target_input_pos:
-                        continue
-                    o_infer_layout_func = get_infer_layout(node_entry["topi_op"][0])
-                    o_wkl = node_entry["workloads"][target_input_pos]
-                    i_infer_layout_func = get_infer_layout(node_entry["topi_op"][i])
-                    i_wkl = node_entry["workloads"][i]
-
-                if (i_idx, o_idx) in pair_tracker:
-                    continue
-                pair_tracker.add((i_idx, o_idx))
-
-                for m, i_record in enumerate(in_node_entry["record_candidates"]):
-                    for n, o_record in enumerate(node_entry["record_candidates"]):
-                        i_cfg, o_cfg = i_record[0].config, o_record[0].config
-                        with self._target:
-                            i_input_info, i_output_info = i_infer_layout_func(i_wkl, i_cfg)
-                            o_input_info, o_output_info = o_infer_layout_func(o_wkl, o_cfg)
-                        if (
-                            len(i_input_info) > 1
-                            or len(i_output_info) > 1
-                            or len(o_input_info) > 1
-                            or len(o_output_info) > 1
-                        ):
-                            raise RuntimeError(
-                                "Graph tuner only supports target operator "
-                                "with single input and single output. "
-                                "Please check target_ops argument."
-                            )
-
-                        in_shape, in_layout = i_output_info[0]
-                        if node_entry["op"] in self._target_ops:
-                            _, out_layout = o_input_info[0]
-                        else:
-                            _, out_layout = o_output_info[0]
-                        data_placeholder = te.placeholder(in_shape, name="data", dtype=self._dtype)
-                        args = [data_placeholder, in_layout, out_layout]
-                        callback(i_idx, o_idx, m, n, args)
-
-    def _create_matrix_callback(self, from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
-        """Create dictionary containing matrix format of layout transformation
-        between nodes."""
-        in_layout, out_layout = args[1], args[2]
-        ltf_workload = autotvm.task.args_to_workload(args, "layout_transform")
-        idx_pair_key = (from_node_idx, to_node_idx)
-
-        if in_layout == out_layout:
-            layout_transform_time = 0
-        else:
-            layout_transform_time = self._layout_transform_perf_records[ltf_workload][1].costs[0]
-
-        if idx_pair_key not in self._layout_transform_interlayer_cost:
-            self._layout_transform_interlayer_cost[idx_pair_key] = []
-        if len(self._layout_transform_interlayer_cost[idx_pair_key]) <= from_sch_idx:
-            self._layout_transform_interlayer_cost[idx_pair_key].append([])
-        self._layout_transform_interlayer_cost[idx_pair_key][from_sch_idx].append(
-            layout_transform_time
-        )
-
-    def benchmark_layout_transform(
-        self,
-        min_exec_num=100,
-        timeout=10,
-        use_rpc=False,
-        device_key=None,
-        host="127.0.0.1",
-        port=9190,
-        n_parallel=1,
-        build_func="default",
-        layout_records=None,
-        target_host=None,
-        infer_layout=False,
-        runner=None,
-    ):
-        """Benchmark all possible layout transformation in the graph,
-        given a set of schedule candidates for each workload of target operator.
-
-        Parameters
-        ----------
-        min_exec_num : int, optional
-            Minimum number of execution. Final execution time is the average of
-            all execution time.
-
-        timeout : int, optional
-            Time out for each execution.
-
-        use_rpc : boolean, optional
-            Whether to use rpc mode for benchmarking.
-
-        device_key : str, optional
-            Remote device key which can be queried by
-            python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
-
-        host : str, optional
-            IP address used to create RPC tracker on host machine.
-
-        port : int, optional
-            Port number used to create RPC tracker on host machine.
-
-        n_parallel: int, optional
-            The number of measurement task that can run in parallel.
-            Set this according to the number of cpu cores (for compilation) and
-            the number of devices you have (for measuring generate code).
-
-        build_func: str or callable, optional
-            'default': call default builder. This works for normal target (llvm, cuda)
-
-            'ndk': use Android NDK to create shared library. Use this for android target.
-
-            callable: customized build function for other backends (e.g. VTA).
-                      See autotvm/measure/measure_methods.py::default_build_func for example.
-
-        layout_records : str or iterator of (MeasureInput, MeasureResult). optional
-            Collection of layout_transform benchmarking records.
-            If is str, then it should be the filename of a records log file.
-                   Each row of this file is an encoded record pair.
-            Otherwise, it is an iterator.
-
-            If this argument is set, graph tuner will first check whether layout_transform
-            workload already exists in records and skip benchmarking if possible.
-
-        target_host : str, optional
-            str or :any:`tvm.target.Target` optional
-            Host compilation target, if target is device.
-            When TVM compiles device specific program such as CUDA,
-            we also need host(CPU) side code to interact with the driver
-            setup the dimensions and parameters correctly.
-            target_host is used to specify the host side codegen target.
-            By default, llvm is used if it is enabled,
-            otherwise a stackvm intepreter is used.
-
-        infer_layout : bool, optional
-            Whether to infer layout transformation time if it doesn't exist in records, instead
-            of benchmarking on target device.
-
-            This might bring performance loss comparing to benchmarking layout transformation.
-        runner : Runner, optional
-            Accept a user-supplied runner
-        """
-        self._logger.info("Start to benchmark layout transformation...")
-        self._target, target_host = Target.canon_target_and_host(self._target, target_host)
-
-        if layout_records is None and infer_layout:
-            raise RuntimeError("Requires some records to infer layout transformation time.")
-
-        if isinstance(layout_records, str):
-            layout_records = load_from_file(layout_records)
-            if not layout_records and infer_layout:
-                raise RuntimeError("Records must be non-empty to infer layout transformation time.")
-
-        if isinstance(layout_records, str):
-            layout_records = load_from_file(layout_records)
-        num_flops, total_time = 0, 0
-        if layout_records is not None:
-            for record in layout_records:
-                ltf_wkl = record[0].task.workload
-                self._layout_transform_perf_records[ltf_wkl] = record
-                input_shape = ltf_wkl[1][1]
-                flops = np.prod(input_shape)
-                num_flops += flops
-                total_time += record[1].costs[0]
-        avg_time = total_time / num_flops if num_flops > 0 else 0
-
-        args_list = []
-
-        def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
-            """Callback function to fetch layout transform args"""
-            _, in_layout, out_layout = args
-            if in_layout != out_layout:
-                args_list.append(args)
-
-        self._iterate_layout_transform(_fetch_args_callback)
-
-        def _log_to_list(record_list):
-            """Callback to log result to a list."""
-
-            def _callback(_, inputs, results):
-                """Callback implementation"""
-                record_list.append((inputs[0], results[0]))
-
-            return _callback
-
-        builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
-        if use_rpc:
-            if device_key is None:
-                raise RuntimeError("device_key need to be set to use rpc tracker mode.")
-            runner = autotvm.measure.RPCRunner(
-                device_key,
-                host,
-                port,
-                n_parallel=n_parallel,
-                number=min_exec_num,
-                repeat=1,
-                timeout=timeout,
-            )
-        elif not runner:
-            runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
-        measure_option = autotvm.measure_option(builder=builder, runner=runner)
-        for args in args_list:
-            data, in_layout, out_layout = args
-            ltf_workload = autotvm.task.args_to_workload(args, "layout_transform")
-            if ltf_workload in self._layout_transform_perf_records:
-                continue
-
-            if infer_layout:
-                input_shape = ltf_workload[1][1]
-                flops = 1
-                for i in input_shape:
-                    flops *= i
-
-                # Rule out invalid layout transformations
-                out = topi.layout_transform(data, in_layout, out_layout)
-                out_flops = 1
-                for i in topi.utils.get_const_tuple(out.shape):
-                    out_flops *= i
-
-                if flops != out_flops:
-                    inferred_time = INVALID_LAYOUT_TIME
-                else:
-                    inferred_time = flops * avg_time
-
-                record_input = MeasureInput(target=self._target, task=None, config=None)
-                record_output = MeasureResult(
-                    costs=(inferred_time,), error_no=0, all_cost=-1, timestamp=-1
-                )
-                self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
-                continue
-
-            records = []
-            task = autotvm.task.create("layout_transform", args=args, target=self._target)
-            tuner = autotvm.tuner.GridSearchTuner(task)
-            tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)])
-            if not isinstance(records[0][1].costs[0], float):
-                records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
-            self._layout_transform_perf_records[ltf_workload] = records[0]
-
-        self._iterate_layout_transform(self._create_matrix_callback)
-        self._logger.info("Benchmarking layout transformation successful.")
-
-    @property
-    def layout_transform_perf_records(self):
-        """Get layout transformation dictionary for input graph.
-
-        Returns
-        -------
-        layout_transform_perf_records : dict of tuple to (MeasureInput, MeasureResult)
-            Layout transformation dictionary for input graph.
-        """
-        return self._layout_transform_perf_records
-
-    def get_optimal_records(self):
-        """Convert optimal record dictionary to a list of records
-        with ascending order of node index in graph.
-
-        Returns
-        -------
-        sch_list : list of tuple
-            List of records with ascending order of node index in graph.
-        """
-        ordered_index_list = sorted(self._optimal_record_dict.keys())
-        ret = []
-        for index in ordered_index_list:
-            node_entry = self._node_list[index]
-            if node_entry["op"] not in self._target_ops:
-                continue
-            ret.append(node_entry["record_candidates"][self._optimal_record_dict[index]])
-        return ret
-
-    def write_opt_sch2record_file(self, record_file="graph_opt_schedule.log"):
-        """Write graph level optimal schedules into file.
-
-        Parameters
-        ----------
-        record_file : str, optional
-            Output schedule file.
-        """
-        with open(record_file, "a") as out_file:
-            records = self.get_optimal_records()
-            for record in records:
-                out_file.write(encode(record[0], record[1]) + "\n")
-        msg = f"Writing optimal schedules to {record_file} successfully."
-        self._logger.info(msg)
-
-    @abstractmethod
-    def run(self, **kwargs):
-        """Run graph tuning."""
diff --git a/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py b/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py
deleted file mode 100644
index 2d7560272e6d..000000000000
--- a/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py
+++ /dev/null
@@ -1,371 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=too-many-instance-attributes,too-many-branches,too-many-statements,too-many-arguments,too-many-locals,invalid-name
-"""Stage class for dynamic programming tuner"""
-import numpy as np
-
-from .utils import is_boundary_node
-
-
-class DPStage(object):
-    """Class to represent node in Markov decision process. A stage has states
-    to represent different schedules of the current node. Since in this problem
-    the action is the schedule selected for current node, action can be fully
-    represented by states. No extra attribute needs for action.
-
-    In most cases, instance of this class should be created through DPTuner.
-    """
-
-    def __init__(
-        self,
-        idx,
-        input_shapes,
-        node_list,
-        counted_nodes_set,
-        layout_transform_interlayer_cost,
-        stage_dict,
-        in_nodes_dict,
-        out_nodes_dict,
-        dep_dict,
-        target_ops,
-        dtype="float32",
-    ):
-        """Initialize a stage and create all states.
-
-        Parameters
-        ----------
-        idx : int
-            Index for current node.
-
-        input_shapes : dict of string to tuple of int
-            Input shapes for current graph.
-
-        node_list : list of dict
-            List of all nodes for current graph.
-
-        counted_nodes_set : set of int
-            Global set recording whether the execution time of a node has been counted.
-
-        layout_transform_interlayer_cost : dict of tuple to list
-            Dictionary maps node index pair to layout transformation time between them.
-
-        stage_dict : dict of int to Stage
-            Global dictionary for all stages mapping node index to stage.
-
-        in_nodes_dict : dict of int to list of int
-            Dictionary maps node index to corresponding input node index.
-
-        out_nodes_dict : dict of int to list of int
-            Dictionary maps node index to corresponding output node index.
-
-        dep_dict : dict of int to set of int
-            Dictionary maps node index to dependent node index.
-
-        target_ops : list of str
-            Target operators
-
-        dtype : str, optional
-            Data type.
-        """
-        self._global_input_shapes = input_shapes
-        self._global_input_names = input_shapes.keys()
-        self._global_node_list = node_list
-        self._global_counted_nodes_set = counted_nodes_set
-        self._global_layout_transform_interlayer_cost = layout_transform_interlayer_cost
-        self._global_stage_dict = stage_dict
-        self._global_in_nodes_dict = in_nodes_dict
-        self._global_out_nodes_dict = out_nodes_dict
-        self._global_dep_dict = dep_dict
-
-        self._idx = idx
-        self._node_entry = self._global_node_list[idx]
-        self._target_ops = target_ops
-        self._wkl = self._node_entry["workloads"][0]
-        self._record_list = self._node_entry["record_candidates"]
-        self._dep = []
-        self._dtype = dtype
-        self._states = None
-        self._full_states = None
-        self._full_states_idx = None
-        self._create_states()
-
-    def _create_states(self):
-        """Create states."""
-        node = self._global_node_list[self._idx]
-        if node["op"] in self._target_ops:
-            self._create_op_states()
-        else:
-            self._create_multi_inputs_states()
-
-    def _create_op_states(self):
-        """State creation routine for nodes with target_op."""
-        input_idx = self._global_in_nodes_dict[self._idx][0]
-        input_node_entry = self._global_node_list[input_idx]
-        if is_boundary_node(input_node_entry, self._global_input_names):
-            self._full_states = np.array([record[1].costs[0] for record in self._record_list])
-            self._states = self._full_states
-        else:
-            input_stage = self._global_stage_dict[input_idx]
-            input_dep = input_stage.dep
-            input_states = input_stage.states
-            input_flatten_states = input_states.flatten()
-            input_record_list = input_node_entry["record_candidates"]
-            num_schedules = len(self._record_list)
-            num_input_schedules = len(input_record_list)
-            num_input_states = input_flatten_states.shape[0]
-
-            full_states_shape = tuple(
-                [num_schedules, num_input_schedules]
-                + [
-                    len(self._global_node_list[dep_idx]["record_candidates"])
-                    for dep_idx in input_dep
-                ]
-            )
-            self._full_states = np.zeros(full_states_shape).flatten().astype("float32")
-            self._full_states_idx = [self._idx, input_idx] + input_dep
-            dep_multiplier = 1
-            for i in range(2, len(full_states_shape)):
-                dep_multiplier *= full_states_shape[i]
-            input_node_time_counted = input_idx in self._global_counted_nodes_set
-
-            for i in range(num_schedules):
-                current_sch_time = float(self._record_list[i][1].costs[0])
-                for j in range(num_input_states):
-                    input_sch_idx = j // dep_multiplier
-                    layout_transform_time = self._global_layout_transform_interlayer_cost[
-                        (input_idx, self._idx)
-                    ][input_sch_idx][i]
-
-                    if input_node_time_counted:
-                        total_time = current_sch_time + layout_transform_time
-                    else:
-                        total_time = (
-                            current_sch_time + layout_transform_time + input_flatten_states[j]
-                        )
-                    current_state_idx = i * num_input_states + j
-                    self._full_states[current_state_idx] = total_time
-
-            if not input_node_time_counted:
-                self._global_counted_nodes_set.add(input_idx)
-            self._full_states = self._full_states.reshape(full_states_shape)
-
-            # If out degree of input node is 1, we can remove the dimension of input node,
-            # since the states of input node will not be needed any more. Otherwise, input
-            # node should become a dependency.
-            if len(self._global_out_nodes_dict[input_idx]) == 1:
-                self._states = np.amin(self._full_states, axis=1)
-                self._dep = list(input_dep)
-            else:
-                self._states = self._full_states
-                self._dep = [
-                    input_idx,
-                ] + input_dep
-
-        # Update global dependency dictionary.
-        # This is to monitor the dependency states to decide
-        # when a dependency can be eliminated, so that total
-        # number of states can be largely reduced.
-        for dep_idx in self._dep:
-            self._global_dep_dict[dep_idx].remove(self._idx)
-            for child in self._global_out_nodes_dict[self._idx]:
-                self._global_dep_dict[dep_idx].add(child)
-        if len(self._global_out_nodes_dict[self._idx]) > 1:
-            self._global_dep_dict[self._idx] = set()
-            for child in self._global_out_nodes_dict[self._idx]:
-                self._global_dep_dict[self._idx].add(child)
-
-    def _create_multi_inputs_states(self):
-        """State creation routine for multi_input operator
-
-        In tvm, layout transformation for an elemwise-like follow the rule which
-        all input operators transform their layouts to the leftmost input operator
-        layout. For example:
-                            elemwise-sum
-                            |    |    |
-                            |    |    |
-                           op0  op1  op2
-        In this block, the possible layout transformations are: op1 -> op0 and op2 -> op0.
-        In graph tuning, a 3-D array with shape (k0, k1, k2) can represent the layout
-        transformations between these three nodes. It is also possible some earlier states
-        belong to other nodes(We name them as dependency) are required for dynamic programming.
-        The final states array for this elemwise-sum can be with shape (e0, k0, k1, e1, k2).
-        To iterate through all states, we first align the shape of op0, op1 and op2 to be
-        (e0, k0, k1, e1, k2) by broadcasting the original states. We also record the axis of
-        each input node in the states array, together with the multiplier. For example,
-        the axis index for op0 is 1, and multiplier is k1 * e1 * k2. If current iterating index
-        in the flatten array is i, the index of op0 can be computed as:
-        i % (k0 * k1 * e1 * k2) // (k1 * e1 * k2).
-        """
-        full_input_node_list = list(self._global_in_nodes_dict[self._idx])
-        input_index_list = []
-        # Remove input and ruled_out nodes
-        for input_idx in full_input_node_list:
-            input_node = self._global_node_list[input_idx]
-            if not is_boundary_node(input_node, self._global_input_names):
-                input_index_list.append(input_idx)
-
-        # Generate new states
-        states_list, aligned_node_list = DPStage.align_states(
-            input_index_list, self._global_stage_dict, self._global_node_list
-        )
-        target_node_idx, target_major_axis, target_multiplier, target_states = states_list[0]
-        aligned_shape = target_states.shape
-        self._full_states = np.zeros(aligned_shape).astype("float32").flatten()
-        self._full_states_idx = list(aligned_node_list)
-        num_states = self._full_states.shape[0]
-        node_time_counted = [item[0] in self._global_counted_nodes_set for item in states_list]
-        target_states = target_states.flatten()
-        src_states_list = [states_list[i][3].flatten() for i in range(1, len(states_list))]
-
-        for i in range(num_states):
-            target_sch_idx = (
-                i % (target_multiplier * aligned_shape[target_major_axis])
-            ) // target_multiplier
-            if node_time_counted[0]:
-                new_state = 0
-            else:
-                new_state = target_states[i]
-
-            for j in range(1, len(states_list)):
-                src_states = src_states_list[j - 1]
-                src_node_idx, src_major_axis, src_multiplier, _ = states_list[j]
-                src_sch_idx = (
-                    i % (src_multiplier * aligned_shape[src_major_axis])
-                ) // src_multiplier
-                layout_transform_time = self._global_layout_transform_interlayer_cost[
-                    (src_node_idx, target_node_idx)
-                ][src_sch_idx][target_sch_idx]
-
-                if node_time_counted[j]:
-                    new_state += layout_transform_time
-                else:
-                    new_state += layout_transform_time + src_states[i]
-                self._full_states[i] = new_state
-
-        for i, node_counted in enumerate(node_time_counted):
-            if not node_counted:
-                self._global_counted_nodes_set.add(states_list[i][0])
-        self._full_states = self._full_states.reshape(aligned_shape)
-
-        # Remove dependency to reduce states
-        reduced_states = np.array(self._full_states)
-        reduced_states_transpose = [states_list[0][1]]
-        reduced_states_dep_list = []
-        self._dep = []
-        for i in range(len(reduced_states.shape)):
-            if i != states_list[0][1]:
-                reduced_states_transpose.append(i)
-                reduced_states_dep_list.append(aligned_node_list[i])
-        reduced_states = np.transpose(reduced_states, reduced_states_transpose)
-        shift = 0
-        for i, dep in enumerate(reduced_states_dep_list):
-            if dep not in self._global_dep_dict or len(self._global_dep_dict[dep]) == 1:
-                self._global_dep_dict.pop(dep, None)
-                reduced_states = np.amin(reduced_states, axis=i + 1 - shift)
-                shift += 1
-            else:
-                self._dep.append(dep)
-        self._states = reduced_states
-
-        # Update dependency
-        for dep in self._dep:
-            self._global_dep_dict[dep].remove(self._idx)
-            for child in self._global_out_nodes_dict[self._idx]:
-                self._global_dep_dict[dep].add(child)
-        if len(self._global_out_nodes_dict[self._idx]) > 1:
-            self._global_dep_dict[self._idx] = set()
-            for child in self._global_out_nodes_dict[self._idx]:
-                self._global_dep_dict[self._idx].add(child)
-
-    @property
-    def dep(self):
-        """Get dependency list."""
-        return self._dep
-
-    @property
-    def states(self):
-        """Get states."""
-        return self._states
-
-    @property
-    def full_states(self):
-        """Get complete states."""
-        return self._full_states
-
-    @property
-    def full_states_idx(self):
-        """Get node index of complete states."""
-        return self._full_states_idx
-
-    @staticmethod
-    def align_states(input_index_list, stage_dict, node_list):
-        """Align all input node states shapes to be the same and transpose/reshape properly.
-
-        This is used in creating multi_input operator states.
-
-        Parameters
-        ----------
-        input_index_list : list of int
-            List of input node index.
-
-        stage_dict : dict of int to Stage
-            Global dictionary of node index to stage.
-
-        node_list : list of dict
-            List of all nodes for current graph.
-
-        Returns
-        -------
-        states_list : list of tuple
-            List of aligned states.
-
-        aligned_node_list : list in int
-            List of node index for aligned states.
-        """
-        aligned_node_list = list(input_index_list)
-        states_list = []
-        for input_idx in input_index_list:
-            input_node_stage = stage_dict[input_idx]
-            for dep_idx in input_node_stage.dep:
-                if dep_idx not in aligned_node_list:
-                    aligned_node_list.append(dep_idx)
-        aligned_shape = []
-        for idx in aligned_node_list:
-            aligned_shape.append(len(node_list[idx]["record_candidates"]))
-        for input_idx in input_index_list:
-            input_node_stage = stage_dict[input_idx]
-            input_node_shape_idx_list = [input_idx] + input_node_stage.dep
-            transpose_idx_list = []
-            reshape_list = []
-            major_axis = -1
-            for i, idx in enumerate(aligned_node_list):
-                if input_idx == idx:
-                    major_axis = i
-                if idx in input_node_shape_idx_list:
-                    transpose_idx_list.append(idx)
-                    reshape_list.append(aligned_shape[i])
-                else:
-                    reshape_list.append(1)
-            transpose_list = [input_node_shape_idx_list.index(idx) for idx in transpose_idx_list]
-            input_node_states = np.transpose(input_node_stage.states, tuple(transpose_list))
-            input_node_states = np.reshape(input_node_states, tuple(reshape_list))
-            input_node_states = np.broadcast_to(input_node_states, aligned_shape)
-            multiplier = 1
-            for i in range(major_axis + 1, len(aligned_shape)):
-                multiplier *= aligned_shape[i]
-            states_list.append((input_idx, major_axis, multiplier, input_node_states))
-        return states_list, aligned_node_list
diff --git a/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py b/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
deleted file mode 100644
index 97253e406be1..000000000000
--- a/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-error,too-many-locals,too-many-statements,too-many-branches,unused-variable
-"""Dynamic programming tuner."""
-import sys
-import numpy as np
-
-from ._base import MAX_OUTPUT_NODES
-from .base_graph_tuner import BaseGraphTuner
-from .dynamic_programming_stage import DPStage
-from .utils import has_multiple_inputs, is_boundary_node
-
-if sys.version_info[0] == 3:
-    import queue
-else:
-    import Queue as queue
-
-
-class DPTuner(BaseGraphTuner):
-    """Tuner which uses dynamic programming to solve MDP problem.
-
-    Note: currently dynamic programming is used to solve this MDP problem. However,
-    this problem is intrinsically non-polynomial. DP can't apply for more complicated
-    models, such as networks with many element-wise sum operators. In this case, switch
-    to heuristic algorithm such as PBQP tuner.
-    """
-
-    def __init__(self, *args, **kwargs):
-        """Create a dynamic programming tuner."""
-        super(DPTuner, self).__init__(*args, **kwargs)
-        self._num_states = self._max_num_states = None
-        self._stage_dict = {}
-        self._dep_dict = {}
-        self._counted_nodes_set = set()
-
-        self._global_data_dict = {
-            "dtype": self._dtype,
-            "counted_nodes_set": self._counted_nodes_set,
-            "stage_dict": self._stage_dict,
-            "in_nodes_dict": self._in_nodes_dict,
-            "out_nodes_dict": self._out_nodes_dict,
-            "dep_dict": self._dep_dict,
-            "node_list": self._node_list,
-            "input_shapes": self._input_shapes,
-            "layout_transform_interlayer_cost": self._layout_transform_interlayer_cost,
-        }
-
-    def _check_num_states(self, num_states):
-        """Track the number of states."""
-        self._num_states += num_states
-        if self._max_num_states is not None:
-            if self._num_states > self._max_num_states:
-                raise RuntimeError(
-                    "Too many states detected while running dynamic "
-                    "programming: got %d states but upper limit is %d."
-                    % (self._num_states, self._max_num_states)
-                )
-
-    def _forward(self):
-        """Forward pass in DP to generate states for all stages."""
-        self._logger.info("Start forward pass...")
-        for node_idx in sorted(self._in_nodes_dict.keys()):
-            stage = DPStage(idx=node_idx, target_ops=self._target_ops, **self._global_data_dict)
-            self._check_num_states(stage.full_states.size)
-            self._stage_dict[node_idx] = stage
-        self._logger.info("Finished forward pass.")
-
-    def _backward(self):
-        """Backward pass in DP to generate optimal solution."""
-        self._logger.info("Start backward pass...")
-        input_names = self._input_shapes.keys()
-        optimal_record_dict = {}
-        # Pick optimal schedule for output nodes
-        output_idx_list = []
-        for key, val in self._out_nodes_dict.items():
-            if not val:
-                output_idx_list.append(key)
-
-        # Restrict number of output nodes to avoid numpy reshape error
-        if len(output_idx_list) > MAX_OUTPUT_NODES:
-            msg = (
-                "The number of outputs in graph is larger than upper "
-                "limit: %s vs %s. Usually this is caused by too many "
-                "LAYOUT_FIXED_OP in graph. Switch to greedily select schedule."
-                "No action required at this moment. We will continuously improve graph tuner"
-                % (len(output_idx_list), MAX_OUTPUT_NODES)
-            )
-            self._logger.warning(msg)
-            self._optimal_record_dict = {key: 0 for key in self._in_nodes_dict}
-            return
-
-        states_list, aligned_node_list = DPStage.align_states(
-            output_idx_list, self._stage_dict, self._node_list
-        )
-        num_states = states_list[0][3].size
-        self._check_num_states(num_states * len(output_idx_list))
-        aligned_node_shape = states_list[0][3].shape
-        min_time = 0
-        min_pos = -1
-        for states in states_list:
-            min_time += np.amax(states[3])
-        flatten_states_list = [current_states[3].flatten() for current_states in states_list]
-        for i in range(num_states):
-            current_time = 0
-            for j, current_states in enumerate(states_list):
-                current_time += flatten_states_list[j][i]
-            if min_time > current_time:
-                min_time = current_time
-                min_pos = i
-        for i, states in enumerate(states_list):
-            current_major_axis = states[1]
-            current_sch_idx = (
-                min_pos % (states[2] * aligned_node_shape[current_major_axis])
-            ) // states[2]
-            optimal_record_dict[aligned_node_list[i]] = current_sch_idx
-        # Pick optimal schedule for dependencies of output nodes
-        for i in range(len(states_list), len(aligned_node_list)):
-            multiplier = 1
-            for j in range(i + 1, len(aligned_node_list)):
-                multiplier *= aligned_node_shape[j]
-            optimal_record_dict[aligned_node_list[i]] = (
-                min_pos // multiplier % aligned_node_shape[i]
-            )
-
-        # Backward pass to get optimal schedules for other nodes
-        bfs_q = queue.Queue()
-        visited = set()
-        for out_idx in output_idx_list:
-            bfs_q.put(out_idx)
-        while not bfs_q.empty():
-            node_idx = bfs_q.get()
-            visited.add(node_idx)
-            node = self._node_list[node_idx]
-            if is_boundary_node(node, input_names):
-                continue
-            optimal_sch_idx = optimal_record_dict[node_idx]
-            full_states = self._stage_dict[node_idx].full_states
-            if not has_multiple_inputs(self._node_list, node_idx, input_names, self._opt_out_op):
-                input_idx = self._in_nodes_dict[node_idx][0]
-                input_node = self._node_list[input_idx]
-                if is_boundary_node(input_node, input_names):
-                    continue
-                if input_idx not in visited:
-                    bfs_q.put(input_idx)
-                    if input_idx not in optimal_record_dict:
-                        dep_list = self._stage_dict[node_idx].dep
-                        dep_idx = tuple([optimal_record_dict[item] for item in dep_list])
-                        tmp = np.argmin(full_states, axis=1)
-                        optimal_input_sch_idx = tmp[(optimal_sch_idx,) + dep_idx]
-                        optimal_record_dict[input_idx] = optimal_input_sch_idx
-            else:
-                input_idx_list = self._in_nodes_dict[node_idx]
-                optimal_record_dict[input_idx_list[0]] = optimal_sch_idx
-                full_states_idx = self._stage_dict[node_idx].full_states_idx
-                tmp = full_states[optimal_sch_idx]
-                new_states_idx, new_states_pos = [], []
-                visited_states_idx, visited_states_pos = [], []
-                for i in range(1, len(full_states_idx)):
-                    if full_states_idx[i] in optimal_record_dict:
-                        visited_states_idx.append(full_states_idx[i])
-                        visited_states_pos.append(i - 1)
-                    else:
-                        new_states_idx.append(full_states_idx[i])
-                        new_states_pos.append(i - 1)
-                if visited_states_idx:
-                    tmp = np.transpose(tmp, tuple(visited_states_pos + new_states_pos))
-                    tmp = tmp[tuple([optimal_record_dict[idx] for idx in visited_states_idx])]
-                min_pos = np.argmin(tmp)
-                multiplier = 1
-                for i in range(len(new_states_idx)):
-                    multiplier *= full_states.shape[new_states_pos[i] + 1]
-                for pos, idx in zip(new_states_pos, new_states_idx):
-                    multiplier //= full_states.shape[pos + 1]
-                    optimal_record_dict[idx] = min_pos // multiplier
-                    min_pos %= multiplier
-                for input_idx in input_idx_list:
-                    if input_idx not in visited:
-                        bfs_q.put(input_idx)
-
-        self._optimal_record_dict = optimal_record_dict
-        for node_idx, _ in self._in_nodes_dict.items():
-            if self._node_list[node_idx]["op"] not in self._target_ops:
-                continue
-        self._logger.info("Finished backward pass...")
-
-    def run(self, **kwargs):
-        """Run dynamic programming solver."""
-        max_num_states = None if "max_num_states" not in kwargs else kwargs["max_num_states"]
-        self._num_states = 0
-        self._max_num_states = max_num_states
-        self._logger.info("Start to run dynamic programming algorithm...")
-        self._forward()
-        self._backward()
-        self._logger.info("Finished DPExecutor run.")
diff --git a/python/tvm/autotvm/graph_tuner/pbqp_tuner.py b/python/tvm/autotvm/graph_tuner/pbqp_tuner.py
deleted file mode 100644
index c02cb2a5adcf..000000000000
--- a/python/tvm/autotvm/graph_tuner/pbqp_tuner.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, unnecessary-list-index-lookup
-"""Partitioned Boolean Quadratic Programming Tuner"""
-from ._base import INVALID_LAYOUT_TIME
-from .base_graph_tuner import BaseGraphTuner
-from .utils import is_boundary_node, has_multiple_inputs
-
-
-class PBQPTuner(BaseGraphTuner):
-    """An approximation method to deal with intractably
-    large size of graph tuning problem.
-
-    This graph coloring algorithm mainly comes from:
-
-    Lang Hames and Bernhard Scholz.
-    Nearly optimal register allocation with pbqp.JMLC 2006.
-    LNCS, vol.4228,pp. 346-361, 2016
-    """
-
-    def __init__(self, *args, **kwargs):
-        """Create a partitioned boolean quadratic programming tuner."""
-        super(PBQPTuner, self).__init__(*args, **kwargs)
-
-        # Remove input and ruled_out nodes
-        input_names = self._input_shapes.keys()
-        for node_idx in self._out_nodes_dict:
-            node = self._node_list[node_idx]
-            if is_boundary_node(node, input_names):
-                for out_node_idx in self._out_nodes_dict[node_idx]:
-                    self._in_nodes_dict[out_node_idx].remove(node_idx)
-
-        self._adj_dict = {}
-        for node_idx in self._in_nodes_dict:
-            self._adj_dict[node_idx] = list(self._in_nodes_dict[node_idx]) + list(
-                self._out_nodes_dict[node_idx]
-            )
-
-        self._record_cost_dict = {}
-        for key in self._in_nodes_dict:
-            self._record_cost_dict[key] = []
-            for record in self._node_list[key]["record_candidates"]:
-                self._record_cost_dict[key].append(record[1].costs[0])
-
-        self._max_degree = -1
-        self._node_degree_dict = {}
-        for node_idx in self._in_nodes_dict:
-            node_degree = self._get_degree(node_idx)
-            self._node_degree_dict[node_idx] = node_degree
-            self._max_degree = max(self._max_degree, node_degree)
-
-        self._stack = []
-        self._buckets = [[] for _ in range(self._max_degree + 2)]
-        for node_idx in sorted(self._in_nodes_dict):
-            node_degree = self._get_degree(node_idx)
-            self._buckets[node_degree].append(node_idx)
-
-        self._is_optimal = True
-
-    def _get_degree(self, node_idx):
-        """Get node degree."""
-        return len(self._adj_dict[node_idx])
-
-    def _reorder_adj_nodes(self, node_idx):
-        """Update buckets list with current adjacency list."""
-        for adj_node in self._adj_dict[node_idx]:
-            current_degree = self._get_degree(adj_node)
-            prev_degree = self._node_degree_dict[adj_node]
-            if prev_degree != current_degree:
-                self._buckets[prev_degree].remove(adj_node)
-                self._buckets[current_degree].insert(0, adj_node)
-                self._node_degree_dict[adj_node] = current_degree
-
-    def _remove_node(self, node_idx):
-        """Remove node from graph. Update adjacency list accordingly."""
-        node_degree = self._get_degree(node_idx)
-        self._buckets[node_degree].remove(node_idx)
-        for adj_node in self._adj_dict[node_idx]:
-            self._adj_dict[adj_node].remove(node_idx)
-
-    def _insert_edge(self, node_x, node_y, adj_cost_matrix):
-        """Insert an edge between two nodes."""
-        self._layout_transform_interlayer_cost[(node_x, node_y)] = adj_cost_matrix
-        self._layout_transform_interlayer_cost[(node_y, node_x)] = []
-        for i in range(len(adj_cost_matrix[0])):
-            self._layout_transform_interlayer_cost[(node_y, node_x)].append([])
-            for cost_vec in adj_cost_matrix:
-                self._layout_transform_interlayer_cost[(node_y, node_x)][i].append(cost_vec[i])
-
-        self._adj_dict[node_x].append(node_y)
-        self._adj_dict[node_y].append(node_x)
-
-    def _backward_insert_node(self, node_idx):
-        """Reinsert node in backward pass."""
-        for adj_node in self._adj_dict[node_idx]:
-            self._adj_dict[adj_node].append(node_idx)
-
-    def _RI_reduction(self, node_idx):
-        """Reduce nodes with degree 1."""
-        adj_node = self._adj_dict[node_idx][0]
-        ltf_matrix = self._layout_transform_interlayer_cost[(adj_node, node_idx)]
-        for i, cost_vec in enumerate(ltf_matrix):
-            min_cost = INVALID_LAYOUT_TIME
-            for j, cost in enumerate(cost_vec):
-                min_cost = min(min_cost, cost + self._record_cost_dict[node_idx][j])
-            self._record_cost_dict[adj_node][i] += min_cost
-        self._remove_node(node_idx)
-        self._reorder_adj_nodes(node_idx)
-        self._stack.append(node_idx)
-
-    def _RII_reduction(self, node_idx):
-        """Reduce nodes with degree 2."""
-        adj_node_x, adj_node_y = self._adj_dict[node_idx]
-        ltf_matrix_x = self._layout_transform_interlayer_cost[(adj_node_x, node_idx)]
-        ltf_matrix_y = self._layout_transform_interlayer_cost[(adj_node_y, node_idx)]
-        delta_matrix = [[] for _ in range(len(ltf_matrix_x))]
-        for i, cost_vec_x in enumerate(ltf_matrix_x):
-            for j, cost_vec_y in enumerate(ltf_matrix_y):
-                min_cost = INVALID_LAYOUT_TIME
-                for k in range(len(self._record_cost_dict[node_idx])):
-                    min_cost = min(
-                        min_cost,
-                        cost_vec_x[k] + cost_vec_y[k] + self._record_cost_dict[node_idx][k],
-                    )
-                delta_matrix[i].append(min_cost)
-
-        if adj_node_x == adj_node_y:
-            for i, delta_row in enumerate(delta_matrix):
-                self._record_cost_dict[adj_node_x][i] += delta_row[i]
-        elif adj_node_x in self._adj_dict[adj_node_y]:
-            for i, _ in enumerate(delta_matrix):
-                for j, delta in enumerate(delta_matrix[i]):
-                    self._layout_transform_interlayer_cost[(adj_node_x, adj_node_y)][i][j] += delta
-                    self._layout_transform_interlayer_cost[(adj_node_y, adj_node_x)][j][i] += delta
-        else:
-            self._insert_edge(adj_node_x, adj_node_y, delta_matrix)
-
-        self._remove_node(node_idx)
-        self._reorder_adj_nodes(node_idx)
-        self._stack.append(node_idx)
-
-    def _RN_reduction(self, node_idx):
-        """Reduce nodes with degree greater than 2."""
-        min_cost = INVALID_LAYOUT_TIME
-        record_idx = -1
-
-        for i, record_cost in enumerate(self._record_cost_dict[node_idx]):
-            current_cost = record_cost
-            for adj_node in self._adj_dict[node_idx]:
-                ltf_matrix = self._layout_transform_interlayer_cost[(node_idx, adj_node)]
-                adj_record_cost = list(self._record_cost_dict[adj_node])
-                for j, ltf_cost in enumerate(ltf_matrix[i]):
-                    adj_record_cost[j] += ltf_cost
-                current_cost += min(adj_record_cost)
-            if current_cost < min_cost:
-                min_cost = current_cost
-                record_idx = i
-
-        if record_idx < 0:
-            raise RuntimeError(
-                f"Can't find a soltuion for node {node_idx} when applying RN reduction"
-            )
-        self._optimal_record_dict[node_idx] = record_idx
-        self._is_optimal = False
-
-        for adj_node in self._adj_dict[node_idx]:
-            ltf_matrix = self._layout_transform_interlayer_cost[(node_idx, adj_node)]
-            for i, ltf_cost in enumerate(ltf_matrix[record_idx]):
-                self._record_cost_dict[adj_node][i] += ltf_cost
-
-        self._remove_node(node_idx)
-        self._reorder_adj_nodes(node_idx)
-        self._stack.append(node_idx)
-
-    def _forward(self):
-        """Forward pass in PBQP to reduce nodes."""
-        while True:
-            if self._buckets[1]:
-                node_idx = self._buckets[1][0]
-                self._RI_reduction(node_idx)
-            elif self._max_degree >= 2 and self._buckets[2]:
-                node_idx = self._buckets[2][0]
-                self._RII_reduction(node_idx)
-            elif self._max_degree >= 3:
-                max_degree_node = -1
-                for i in range(self._max_degree, 2, -1):
-                    if self._buckets[i]:
-                        max_degree_node = self._buckets[i][0]
-                        self._RN_reduction(max_degree_node)
-                        break
-                if max_degree_node < 0:
-                    break
-            else:
-                break
-
-    def _backward(self):
-        """Backward pass in PBQP to generate optimal solution."""
-        # Solve nodes left in the forward graph
-        for node_idx in self._buckets[0]:
-            record_costs = self._record_cost_dict[node_idx]
-            min_cost = min(record_costs)
-            self._optimal_record_dict[node_idx] = record_costs.index(min_cost)
-
-        # Solve nodes with one or two degrees
-        for node_idx in reversed(self._stack):
-            self._backward_insert_node(node_idx)
-            if node_idx not in self._optimal_record_dict:
-                record_costs = list(self._record_cost_dict[node_idx])
-                for adj_node in self._adj_dict[node_idx]:
-                    adj_optimal_idx = self._optimal_record_dict[adj_node]
-                    for i, _ in enumerate(record_costs):
-                        record_costs[i] += self._layout_transform_interlayer_cost[
-                            (node_idx, adj_node)
-                        ][i][adj_optimal_idx]
-                min_cost = min(record_costs)
-                self._optimal_record_dict[node_idx] = record_costs.index(min_cost)
-
-    def run(self, **kwargs):
-        """Run partitioned boolean quadratic programming tuner."""
-        self._logger.info("Start to run PBQP algorithm...")
-        # Define virtual record lists and layout transformaton matrices
-        # for multi-input nodes.
-        input_names = self._input_shapes.keys()
-        temp = {}
-        for key, val in self._in_nodes_dict.items():
-            target_input_idx = -1
-            target_input_pos = -1
-            if has_multiple_inputs(self._node_list, key, input_names, self._opt_out_op):
-                for i, item in enumerate(val):
-                    node = self._node_list[item]
-                    if not is_boundary_node(node, input_names):
-                        target_input_idx = item
-                        target_input_pos = i
-                        break
-
-                # Skip boundary operator
-                if target_input_idx < 0:
-                    continue
-
-                temp[(target_input_idx, key)] = []
-                record_candidates = self._node_list[target_input_idx]["record_candidates"]
-                for j in range(len(record_candidates)):
-                    temp[(target_input_idx, key)].append([])
-                    for k in range(len(record_candidates)):
-                        temp[(target_input_idx, key)][j].append(
-                            0 if j == k else INVALID_LAYOUT_TIME
-                        )
-
-                for j in range(target_input_pos + 1, len(val)):
-                    input_idx = val[j]
-                    input_node = self._node_list[input_idx]
-                    if is_boundary_node(input_node, input_names):
-                        continue
-                    temp[(input_idx, key)] = self._layout_transform_interlayer_cost[
-                        (input_idx, target_input_idx)
-                    ]
-        self._layout_transform_interlayer_cost.update(temp)
-
-        # Create reverse layout transformation matrices
-        temp = {}
-        for idx_pair, ltf_matrix in self._layout_transform_interlayer_cost.items():
-            reverse_key = (idx_pair[1], idx_pair[0])
-            reverse_matrix = [[] for _ in range(len(ltf_matrix[0]))]
-            for i, _ in enumerate(ltf_matrix):
-                for j, ltf in enumerate(ltf_matrix[i]):
-                    reverse_matrix[j].append(ltf)
-            temp[reverse_key] = reverse_matrix
-        self._layout_transform_interlayer_cost.update(temp)
-
-        self._forward()
-        self._backward()
-        is_optimal = "optimal" if self._is_optimal else "sub-optimal"
-        msg = f"Finished PBQPExecutor run. Got {is_optimal} solution."
-        self._logger.info(msg)
diff --git a/python/tvm/autotvm/graph_tuner/utils/__init__.py b/python/tvm/autotvm/graph_tuner/utils/__init__.py
deleted file mode 100644
index 21a16b8dcab1..000000000000
--- a/python/tvm/autotvm/graph_tuner/utils/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Graph tuner utility functions"""
-from __future__ import absolute_import
-
-from . import traverse_graph
-from . import utils
-
-from .traverse_graph import expr2graph, get_direct_ancestor, get_in_nodes, get_out_nodes
-from .utils import has_multiple_inputs, is_boundary_node, bind_inputs
diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
deleted file mode 100644
index 0c1ce36ba941..000000000000
--- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
+++ /dev/null
@@ -1,334 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=too-many-locals,too-many-statements,too-many-branches,protected-access
-"""API for graph traversing."""
-import threading
-import re
-
-import tvm
-from tvm import relay, autotvm
-from tvm.relay import transform
-from tvm.relay.expr import Call, TupleGetItem, Var, Constant, Tuple
-from tvm.relay.function import Function
-from tvm.relay.ty import TupleType, TensorType
-from tvm.autotvm.task import TaskExtractEnv
-
-from .utils import has_multiple_inputs, is_boundary_node, is_skipped_node
-from .._base import OPT_OUT_OP
-
-
-def expr2graph(expr, target_ops, node_dict, node_list, tvm_target):
-    """Convert relay expr to graph data structure
-    and fetch workloads of target operators.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr.Function
-        Input relay function expression.
-
-    target_ops: List of tvm.ir.Op
-        List of target relay ops
-
-    node_dict : dictionary from tvm.relay.Expr to int
-        Dictionary to record node index
-
-    node_list : list of dictionary
-        List of nodes which contains all expr in the input relay function.
-        Each node will be stored as a dictionary in the format of
-        {"op": str, "node": tvm.relay.expr, "inputs": [int], "types": [tvm.relay.Type],
-         "name": str, "workloads": [tuple], "topi_op": [function]}
-
-    tvm_target : tvm.target
-        The TVM target object.
-    """
-    # TODO(@kevinthesun, @icemelon9): Currently graph tuning pass relies on the fact
-    #   that # autotvm tasks == # ops. But this won't be true after having relay op
-    #   strategy. We need to find a solution to fix this.
-    env = TaskExtractEnv.get(allow_duplicate=True)
-    env.reset(target_ops)
-    # pylint: disable=not-context-manager
-    with env:
-        _expr2graph_impl(expr, target_ops, node_dict, node_list, tvm_target)
-        task_pos = 0
-        for node_entry in node_list:
-            if node_entry["op"] in target_ops:
-                task_name, args = env.task_collection[task_pos]
-                task = autotvm.task.create(task_name, args, target=tvm_target)
-                node_entry["workloads"] = [task.workload]
-                node_entry["topi_op"] = [task_name]
-                task_pos += 1
-
-
-def _infer_type(node):
-    """A method to infer the type of a relay expression."""
-    mod = tvm.IRModule.from_expr(node)
-    mod = transform.InferType()(mod)
-    entry = mod["main"]
-    return entry if isinstance(node, relay.Function) else entry.body
-
-
-def _replace_device_with_tracing(target):
-    """This is to replace -device=XXX with -device=tracing in the tvm_target string.
-    It is a stand-along function for testability.
-    We need to have device=tracing in order to fetch the workloads, it is not used
-    for anything beyond that so it is safe to override the device here only."""
-    target = str(target)
-    if "-device" in target:
-        return re.sub("-device=[^\\-$]+", "-device=tracing ", target).strip(" ")
-    return target + " -device=tracing"
-
-
-def _expr2graph_impl(expr, target_ops, node_dict, node_list, tvm_target):
-    """Implementation to convert relay expr to graph data structure"""
-
-    def _traverse_expr(node):
-        if node in node_dict:
-            return
-        node_index = len(node_list)
-        node_entry = {"node": node, "inputs": [], "types": [], "op": None, "name": None}
-
-        if isinstance(node, Call):
-            op = node.op
-            node_entry["op"] = node.op
-            for arg in node.args:
-                in_node_idx = node_dict[arg]
-                if isinstance(arg, (Tuple, TupleGetItem)):
-                    node_entry["inputs"] += node_list[in_node_idx]["inputs"]
-                else:
-                    node_entry["inputs"].append([in_node_idx, 0, 0])
-            infer_out = _infer_type(node)
-            out_type = infer_out._checked_type_
-            if isinstance(out_type, TensorType):
-                node_entry["types"].append(out_type)
-            elif isinstance(out_type, TupleType):
-                for tupe_type in out_type.fields:
-                    node_entry["types"].append(tupe_type)
-            else:
-                raise RuntimeError(
-                    f"Unsupported output type {type(out_type)} in operator {op.name}"
-                )
-
-            # Utilize tracing target to fetch workload with topo-order.
-            # Since we only need workload, dummy target can be used to
-            # create task.
-            if op in target_ops:
-                params = []
-                for i, input_idx in enumerate(node_entry["inputs"]):
-                    input_node_entry = node_list[input_idx[0]]
-                    input_type = input_node_entry["types"][input_idx[1]]
-                    if not isinstance(input_node_entry["node"], (Var, Constant, Call)):
-                        raise RuntimeError(
-                            "Graph tuner can only tune target "
-                            "operators with input node of type "
-                            "relay.expr.Var/Constant/Call. Now "
-                            "find a target op %s with input type %s"
-                            % (op, str(type(input_node_entry["node"])))
-                        )
-                    free_var = relay.Var(f"var_{i}", input_type)
-                    params.append(free_var)
-                call = relay.Call(node.op, params, node.attrs)
-                mod = tvm.IRModule.from_expr(relay.Function(params, call))
-                relay.backend.te_compiler.get().clear()
-                tracing_target = _replace_device_with_tracing(tvm_target)
-                build_thread = threading.Thread(target=relay.build, args=(mod, tracing_target))
-                build_thread.start()
-                build_thread.join()
-        elif isinstance(node, Var):
-            node_entry["name"] = node.name_hint
-            node_entry["types"] = [node.type_annotation]
-        elif isinstance(node, Function):
-            # Ignore root node since it equals to input function expression
-            if node != expr:
-                _expr2graph_impl(node, target_ops, node_dict, node_list, tvm_target)
-            return
-        elif isinstance(node, TupleGetItem):
-            in_node_idx = node_dict[node.tuple_value]
-            node_entry["inputs"].append([in_node_idx, node.index, 0])
-        elif isinstance(node, Tuple):
-            for tuple_item in node:
-                in_node_idx = node_dict[tuple_item]
-                if isinstance(tuple_item, TupleGetItem):
-                    node_entry["inputs"] += node_list[in_node_idx]["inputs"]
-                elif isinstance(tuple_item, Tuple):
-                    raise RuntimeError("Graph tuner doesn't support nested tuple.")
-                else:
-                    node_entry["inputs"].append([in_node_idx, 0, 0])
-        elif isinstance(node, Constant):
-            node_entry["name"] = "Constant_" + str(node_index)
-            node_entry["types"] = [node.checked_type]
-        elif isinstance(node, tvm.ir.Op):
-            return
-        else:
-            raise RuntimeError(f"Not supported relay node type in graph tuning: {type(node)}")
-        node_dict[node] = node_index
-        node_list.append(node_entry)
-
-    relay.analysis.post_order_visit(expr, _traverse_expr)
-
-
-def get_direct_ancestor(node_list, visited_dict, target_ops, node_idx, input_names):
-    """Given a node_list in relay function and a node index, return the
-    closest ancestor which has op_name as operator name or is multi_input operator.
-
-    If node has multiple inputs, multiple ancestor nodes will be returned.
-
-    Parameters
-    ----------
-    node_list : list of dict of str to object
-        List of all nodes in a graph.
-
-    visited_dict : dict of int to int
-        Nodes and corresponding ancestors which have been visited.
-
-    target_ops: List of str
-        List of target relay base op name
-
-    node_idx : int
-        Input node index.
-
-    input_names : list of str
-        Names of graph input nodes.
-
-    Returns
-    -------
-    out : list of int
-        List of ancestor node index.
-    """
-    if node_idx in visited_dict:
-        return visited_dict[node_idx]
-    node = node_list[node_idx]
-    if is_boundary_node(node, input_names):
-        return [node_idx]
-
-    node_direct_ancestor = []
-    for item_idx in node["inputs"]:
-        item = node_list[item_idx[0]]
-        is_multiple_inputs = has_multiple_inputs(node_list, item_idx[0], input_names, OPT_OUT_OP)
-        if item["op"] in target_ops or is_multiple_inputs:
-            node_direct_ancestor.append(item_idx[0])
-        else:
-            tmp = get_direct_ancestor(node_list, visited_dict, target_ops, item_idx[0], input_names)
-            for tmp_item in tmp:
-                if tmp_item not in node_direct_ancestor:
-                    node_direct_ancestor.append(tmp_item)
-    visited_dict[node_idx] = node_direct_ancestor
-    return node_direct_ancestor
-
-
-def get_in_nodes(node_list, target_ops, input_names):
-    """Create a dictionary mapping from op_name nodes or multi-input
-    nodes to closest input ancestors.
-
-    Parameters
-    ----------
-    node_list : list of dict of str to object
-        List of all nodes in a graph.
-
-    target_ops: List of str
-        List of target relay op
-
-    input_names : list of str
-        Names of graph input nodes.
-
-    Returns
-    -------
-    out : dict of int to list of int
-        Dictionary maps node index to closest input ancestors.
-    """
-
-    visited_dict = {}
-    in_node_dict = {}
-    for i, node in enumerate(node_list):
-        if is_boundary_node(node, input_names) or is_skipped_node(node):
-            continue
-        get_direct_ancestor(node_list, visited_dict, target_ops, i, input_names)
-    for key, val in visited_dict.items():
-        node = node_list[key]
-        is_multiple_inputs = has_multiple_inputs(node_list, key, input_names, OPT_OUT_OP)
-        if node["op"] in target_ops or is_multiple_inputs:
-            in_node_dict[key] = val
-
-    # Reduce boundary nodes
-    out_node_dict = get_out_nodes(in_node_dict)
-    has_reduced_node = True
-    while has_reduced_node:
-        boundary_nodes = []
-        for key, val in in_node_dict.items():
-            node = node_list[key]
-            is_boundary = True
-            # Target ops can't be boundary nodes
-            if node["op"] not in target_ops:
-                for input_idx in val:
-                    in_node = node_list[input_idx]
-                    if not is_boundary_node(in_node, input_names) and input_idx in in_node_dict:
-                        is_boundary = False
-                    else:
-                        val.remove(input_idx)
-                    if is_boundary:
-                        boundary_nodes.append(key)
-        if boundary_nodes:
-            for idx in boundary_nodes:
-                if idx in in_node_dict:
-                    del in_node_dict[idx]
-        else:
-            has_reduced_node = False
-
-    # Remove empty nodes to ignore pre-computed sub-graph
-    has_empty_node = True
-    while has_empty_node:
-        empty_nodes = []
-        for key, val in in_node_dict.items():
-            if not val:
-                empty_nodes.append(key)
-        if empty_nodes:
-            has_empty_node = True
-            for node in empty_nodes:
-                del in_node_dict[node]
-                if node in out_node_dict:
-                    for out_node in out_node_dict[node]:
-                        in_node_dict[out_node].remove(node)
-        else:
-            has_empty_node = False
-
-    return in_node_dict
-
-
-def get_out_nodes(in_node_dict):
-    """Create output dictionary from input dictionary.
-
-    Parameters
-    ----------
-    in_node_dict : dict of int to list of int
-        Dictionary maps node index to closest input ancestors.
-        It can be created with get_in_nodes.
-
-    Returns
-    -------
-    out : dict of int to list of int
-        Dictionary maps node index to closest output nodes.
-    """
-    out_node_dict = {}
-    for key in in_node_dict:
-        out_node_dict[key] = []
-    for key, val in in_node_dict.items():
-        for item in val:
-            if item in out_node_dict:
-                out_node_dict[item].append(key)
-            else:
-                out_node_dict[item] = [key]
-
-    return out_node_dict
diff --git a/python/tvm/autotvm/graph_tuner/utils/utils.py b/python/tvm/autotvm/graph_tuner/utils/utils.py
deleted file mode 100644
index 54e0d1cb36b2..000000000000
--- a/python/tvm/autotvm/graph_tuner/utils/utils.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=eval-used,invalid-name,too-many-arguments
-"""Utility functions"""
-import tvm
-from tvm import relay
-from tvm.relay import transform
-
-
-def has_multiple_inputs(node_list, node_idx, input_names, opt_out_op):
-    """Check whether a node has multiple input nodes
-    except variable nodes.
-
-    Parameters
-    ----------
-    node_list : list of dict of str to object
-        List of all nodes in a graph.
-
-    node_idx : int
-        Node index to be checked.
-
-    input_names : list of str
-        List of input names of graph.
-
-    Returns
-    -------
-    out : bool
-        Whether the specified node has multiple input nodes
-    """
-    num_inputs = 0
-    node = node_list[node_idx]
-    for in_idx in node["inputs"]:
-        in_idx = in_idx[0]
-        in_node = node_list[in_idx]
-        # Exclude parameter nodes
-        if in_node["op"] is not None and in_node["op"].name in opt_out_op:
-            increase = False
-            for t_idx in in_node["inputs"]:
-                increase = has_multiple_inputs(node_list, t_idx[0], input_names, opt_out_op)
-            if increase:
-                num_inputs += 1
-        elif in_node["op"] is not None or ("name" in in_node and in_node["name"] in input_names):
-            num_inputs += 1
-    return num_inputs > 1
-
-
-def is_boundary_node(node_entry, input_names):
-    """Whether a node is a boundary node.
-    Currently input node and nodes in LAYOUT_FIXED_OP are
-    counted as boundary.
-
-    Parameters
-    ----------
-    node_entry : dict
-        Node entry.
-
-    input_names : list of str
-        List of input names of graph.
-
-    Returns
-    -------
-    out : bool
-        whether node is a boundary node.
-    """
-    # Operators dependent on original layouts.
-    _LAYOUT_FIXED_OP = [
-        relay.op.get(name)
-        for name in (
-            "nn.batch_flatten",
-            "transpose",
-            "reshape",
-            "vision.multibox_prior",
-            "vision.multibox_transform_loc",
-            "where",
-            "vision.non_max_suppression",
-            "strided_slice",
-        )
-    ]
-
-    out = node_entry["op"] in _LAYOUT_FIXED_OP or (
-        "name" in node_entry and node_entry["name"] in input_names
-    )
-    return out
-
-
-def is_skipped_node(node_entry):
-    """Whether a node is not counted.
-
-    Parameters
-    ----------
-    node_entry : dict
-        Node entry.
-
-    Returns
-    -------
-    out : bool
-        whether node is skipped.
-    """
-    # Operators not counted in graph tuner.
-    return isinstance(node_entry["node"], relay.Tuple)
-
-
-def bind_inputs(expr, input_shapes=None, input_dtypes="float32"):
-    """Bind input variables of a relay function expression
-    to new shapes and/or dtypes.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr.Function
-        Input relay function expression.
-
-    input_shapes : dict of str to tuple of int, optional
-        Input shapes.
-
-    input_dtypes : str or dict of str to str, optional
-        Input dtypes.
-
-    Returns
-    -------
-    out : tvm.relay.Expr.Function
-        Bind relay function expression.
-    """
-    if input_shapes is None:
-        return expr
-    if isinstance(input_dtypes, str):
-        input_dtypes = {key: input_dtypes for key in input_shapes.keys()}
-
-    updated_input_dict = {}
-    for input_name in input_shapes.keys():
-        updated_input = relay.var(
-            input_name, shape=input_shapes[input_name], dtype=input_dtypes[input_name]
-        )
-        updated_input_dict[input_name] = updated_input
-
-    rebind_dict = {}
-    for var in expr.params:
-        if var.name_hint in updated_input_dict:
-            rebind_dict[var] = updated_input_dict[var.name_hint]
-    updated_expr = relay.expr.bind(expr, rebind_dict)
-
-    mod = tvm.IRModule.from_expr(updated_expr)
-    mod = transform.InferType()(mod)
-    entry = mod["main"]
-    return entry if isinstance(updated_expr, relay.Function) else entry.body
diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
deleted file mode 100644
index 10b0843402ea..000000000000
--- a/python/tvm/autotvm/measure/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Distributed executor infrastructure to scale up the tuning"""
-
-from .measure import (
-    MeasureInput,
-    MeasureResult,
-    MeasureErrorNo,
-    measure_option,
-    create_measure_batch,
-)
-from .measure_methods import (
-    LocalBuilder,
-    LocalRunner,
-    RPCRunner,
-    default_module_loader,
-    request_remote,
-)
-from .executor import Executor
diff --git a/python/tvm/autotvm/measure/executor.py b/python/tvm/autotvm/measure/executor.py
deleted file mode 100644
index f8eca7298c64..000000000000
--- a/python/tvm/autotvm/measure/executor.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Abstraction for asynchronous job execution """
-
-
-class Executor(object):
-    """
-    Base abstract executor interface for asynchronous job submission.
-    Allows submit asynchronous jobs and returns the Future object.
-    """
-
-    # timeout for jobs that may hang
-    DEFAULT_TIMEOUT = 120
-
-    def submit(self, func, *args, **kwargs):
-        """
-        Pass task (function, arguments) to the Executor.
-
-        Parameters
-        ----------
-        func : callable
-            function to be run by a worker
-        args : list or tuple, optional
-            arguments passed to the function
-        kwargs : dict, optional
-            The keyword arguments
-
-        Returns
-        -------
-        future : Future
-            Future object wrapping the task which can be used to
-            collect the task's result.
-        """
-        raise NotImplementedError()
-
-
-class Future(object):
-    """
-    Base class of the future object.
-    The implementations can return object of subclass of this.
-    This objects encapsulates the asynchronous execution of task
-    submitted to another thread, or another worker for execution.
-
-    Future objects store the state of tasks--can be polled for
-    result or a blocking call to retrieve the result can be used.
-    """
-
-    def done(self):
-        """
-        Return True if job was successfully cancelled or finished running.
-        """
-        raise NotImplementedError()
-
-    def get(self, timeout=None):
-        """
-        Get the result. This will block until the result is available.
-
-        Parameters
-        ----------
-        timeout : int or float, optional
-            Maximum number of seconds to wait before it timeouts.
-            If not specified, it means we block until the result is available.
-
-        Returns
-        -------
-        result : Any
-            The result returned by the submitted function.
-
-        Raises
-        ------
-        TimeoutError : if the result call timeouts.
-        """
-        raise NotImplementedError()
-
-
-class FutureError(RuntimeError):
-    """Base error class of all future events"""
-
-
-# pylint:disable=redefined-builtin
-class TimeoutError(FutureError):
-    """Error raised when a task is timeout."""
-
-
-class ExecutionError(FutureError):
-    """
-    Error raised when future execution crashes or failed.
-    """
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
deleted file mode 100644
index c9b82cd81c54..000000000000
--- a/python/tvm/autotvm/measure/measure.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
-"""User facing API for specifying how to measure the generated code"""
-import enum
-import multiprocessing
-from collections import namedtuple
-
-
-class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
-    """
-    Stores all the necessary inputs for a measurement.
-
-    Parameters
-    ----------
-    target : tvm.target.Target
-        The target device
-    task : task.Task
-        Task function
-    config : ConfigEntity
-        Specific configuration.
-    """
-
-
-class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])):
-    """
-    Stores all the results of a measurement
-
-    Parameters
-    ----------
-    costs: Array of float or Array of Exception
-        If no error occurs during measurement, it is an array of measured running times.
-        If an error occurs during measurement, it is an array of the exception objections.
-    error_no: int
-        Denote error type, defined by MeasureErrorNo
-    all_cost: float
-        All cost of this measure, including rpc, compilation, test runs
-    timestamp: float
-        The absolute time stamp when we finish measurement.
-    """
-
-    def __repr__(self):
-        error_no_str = (
-            str(MeasureErrorNo(self.error_no))
-            if isinstance(self.error_no, (MeasureErrorNo, int))
-            else str(self.error_no)
-        )
-        return (
-            f"{self.__class__.__name__}(costs={self.costs!r}, error_no={error_no_str}, "
-            f"all_cost={self.all_cost}, timestamp={self.timestamp!r})"
-        )
-
-
-class MeasureErrorNo(enum.IntEnum):
-    """Error type for MeasureResult"""
-
-    NO_ERROR = 0  # no error
-    INSTANTIATION_ERROR = 1  # actively detected error in instantiating a template with a config
-    COMPILE_HOST = 2  # error when compiling code on host (e.g. tvm.build)
-    COMPILE_DEVICE = 3  # error when compiling code on device (e.g. OpenCL JIT on the device)
-    RUNTIME_DEVICE = 4  # error when run program on device
-    WRONG_ANSWER = 5  # answer is wrong when compared to a golden output
-    BUILD_TIMEOUT = 6  # timeout during compilation
-    RUN_TIMEOUT = 7  # timeout during run
-    UNKNOWN_ERROR = 8  # unknown error
-
-
-class Builder(object):
-    """Builder that builds programs in tuning
-
-    Parameters
-    ----------
-    timeout: float, optional
-        The timeout of a build task
-    n_parallel: int, optional
-        The number of tasks submitted in parallel
-        By default it will use all cpu cores
-    build_kwargs: dict, optional
-        Keyword args given to the build function.
-    """
-
-    def __init__(self, timeout=10, n_parallel=None, build_kwargs=None):
-        self.timeout = timeout
-        self.n_parallel = n_parallel or multiprocessing.cpu_count()
-        self.user_build_kwargs = build_kwargs if build_kwargs is not None else {}
-        self.runner_build_kwargs = None
-        self.task = None
-
-    def set_task(self, task, build_kwargs=None):
-        """
-        Initialize for a new tuning task
-
-        Parameters
-        ----------
-        task: Task
-            The tuning task
-        build_kwargs: dict, optional
-            The additional kwargs for build function
-        """
-        self.task = task
-        self.build_kwargs = dict(build_kwargs.items()) if build_kwargs is not None else {}
-        if any(k in self.build_kwargs for k in self.user_build_kwargs):
-            logging.warn(
-                "Overriding these runner-supplied kwargs with user-supplied:\n%s",
-                "\n".join(
-                    f" * {k}: from {build_kwargs[k]!r} to {self.user_build_kwargs[k]!r}"
-                    for k in sorted([k for k in build_kwargs if k in self.user_build_kwargs])
-                ),
-            )
-        for k, v in self.user_build_kwargs.items():
-            self.build_kwargs[k] = v
-
-    def build(self, measure_inputs):
-        """Build programs
-
-        Parameters
-        ----------
-        measure_inputs: List of MeasureInput
-            The measure input
-
-        Returns
-        -------
-        build_results: List of BuildResult
-            The build result.
-        """
-        raise NotImplementedError()
-
-
-class Runner(object):
-    """Runner that runs and measures the time cost of a generated program in tuning
-
-    Parameters
-    ----------
-    timeout: float, optional
-        The timeout of a build task
-    n_parallel: int, optional
-        The number of tasks submitted in parallel
-        By default it will use all cpu cores
-    """
-
-    def __init__(self, timeout=5, n_parallel=None):
-        self.timeout = timeout
-        self.n_parallel = n_parallel or multiprocessing.cpu_count()
-        self.task = None
-
-    def set_task(self, task):
-        """
-        Initialize for a new tuning task
-
-        Parameters
-        ----------
-        task: Task
-            The tuning task
-        """
-        self.task = task
-
-    def get_build_kwargs(self):
-        """
-        Get device specific build arguments (e.g. maximum shared memory size)
-
-        Returns
-        ----------
-        kwargs: dict
-            The additional keyword arguments
-        """
-        raise NotImplementedError()
-
-    def run(self, measure_inputs, build_results):
-        """Run amd measure built programs
-
-        Parameters
-        ----------
-        measure_inputs: List of MeasureInput
-            The raw measure input
-        build_results: List of BuildResults
-            The build results
-
-        Returns
-        -------
-        measure_results: List of MeasureResult
-            The final results of measurement
-        """
-        raise NotImplementedError()
-
-
-def measure_option(builder, runner):
-    """
-    Set options for measure. To measure a config, we will build it and run it.
-    So we have to set options for these two steps.
-    They have their own options on timeout, parallel, etc.
-
-    Parameters
-    ----------
-    builder: Builder
-        Specify how to build programs
-    runner: Runner
-        Specify how to run programs
-
-    Examples
-    --------
-    # example setting for using local devices
-    >>> measure_option = autotvm.measure_option(
-    >>>     builder=autotvm.LocalBuilder(),      # use all local cpu cores for compilation
-    >>>     runner=autotvm.LocalRunner(          # measure them sequentially
-    >>>         number=10,
-    >>>         timeout=5)
-    >>> )
-
-    # example setting for using remote devices
-    >>> measure_option = autotvm.measure_option(
-    >>>    builder=autotvm.LocalBuilder(),  # use all local cpu cores for compilation
-    >>>    runner=autotvm.RPCRunner(
-    >>>        'rasp3b', 'locahost', 9190, # device key, host and port of the rpc tracker
-    >>>        number=4,
-    >>>        timeout=4) # timeout of a run on the device. RPC request waiting time is excluded.
-    >>>)
-
-    Note
-    ----
-    To make measurement results accurate, you should pick the correct value for the argument
-    `number` and `repeat` in Runner(). Some devices need a certain minimum running time to
-    "warm up," such as GPUs that need time to reach a performance power state.
-    Using `min_repeat_ms` can dynamically adjusts `number`, so it is recommended.
-    The typical value for NVIDIA GPU is 150 ms.
-    """
-    # pylint: disable=import-outside-toplevel
-    from .measure_methods import LocalBuilder, LocalRunner
-
-    if isinstance(builder, str):
-        if builder == "local":
-            builder = LocalBuilder()
-        else:
-            raise ValueError("Invalid builder: " + builder)
-
-    if isinstance(runner, str):
-        if runner == "local":
-            runner = LocalRunner()
-        else:
-            raise ValueError("Invalid runner: " + runner)
-
-    opt = {
-        "builder": builder,
-        "runner": runner,
-    }
-
-    return opt
-
-
-def create_measure_batch(task, option):
-    """Get a standard measure_batch function.
-
-    Parameters
-    ----------
-    task: tvm.autotvm.task.Task
-        The tuning task
-    option: dict
-        The option for measuring generated code.
-        You should use the return value of function :any:`measure_option` for this argument.
-
-    Returns
-    -------
-    measure_batch: callable
-        a callback function to measure a batch of configs
-    """
-    builder = option["builder"]
-    runner = option["runner"]
-
-    attach_objects = runner.set_task(task)
-
-    # feed device related information from runner to builder
-    # (e.g. max shared memory for validity checking)
-    build_kwargs = runner.get_build_kwargs()
-    builder.set_task(task, build_kwargs)
-
-    def measure_batch(measure_inputs):
-        build_results = builder.build(measure_inputs)
-        results = runner.run(measure_inputs, build_results)
-        return results
-
-    measure_batch.n_parallel = builder.n_parallel
-    measure_batch.attach_objects = attach_objects
-    return measure_batch
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
deleted file mode 100644
index 6a8d0f5e3c5c..000000000000
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ /dev/null
@@ -1,843 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-function-args,too-many-nested-blocks
-"""
-Functions that run on executor for measurement.
-
-These functions are responsible for building the tvm module, uploading it to
-remote devices, recording the running time costs, and checking the correctness of the output.
-"""
-
-import contextlib
-import logging
-import os
-import shutil
-import tempfile
-import threading
-import time
-import traceback
-import typing
-import warnings
-from collections import namedtuple
-from random import getrandbits
-
-import tvm._ffi
-import tvm.ir.transform
-from tvm import nd
-from tvm import rpc as _rpc
-from tvm.autotvm.env import AutotvmGlobalScope, reset_global_scope
-from tvm.contrib import ndk, stackvm, tar
-from tvm.contrib.popen_pool import PopenPoolExecutor
-from tvm.driver import build
-from tvm.error import TVMError
-from tvm.target import Target
-
-from ..env import AutotvmGlobalScope
-from ..task.space import InstantiationError
-from ..utils import get_const_tuple
-from .measure import Builder, MeasureErrorNo, MeasureResult, Runner
-
-logger = logging.getLogger("autotvm")
-
-
-class BuildResult(namedtuple("BuildResult", ("filename", "arg_info", "error", "time_cost"))):
-    """
-    Stores all the necessary inputs for a measurement.
-
-    Parameters
-    ----------
-    filename : str
-        The filename of generated library
-    arg_info : Tuple
-        The shape and dtype information of tvm tensor arguments
-    error : Exception
-        The error happens during compilation.
-    time_cost : float
-        The time cost of building
-    """
-
-
-class LocalBuilder(Builder):
-    """Run compilation on local machine
-
-    Parameters
-    ----------
-    timeout: float
-        The timeout of a compilation
-    n_parallel: int
-        The number of tasks run in parallel. "None" will use all cpu cores
-    build_kwargs: dict
-        If supplied, additional kwargs passed to build_func. Overrides any build_kwargs supplied
-        by the Runner.
-    build_func: callable or str
-        If is 'default', use default build function
-        If is 'ndk', use function for android ndk
-        If id 'stackvm', use function for stackvm
-        If is callable, use it as custom build function, expect lib_format field.
-    do_fork: bool
-        If False, do not fork when building. Requires n_parallel=1.
-    runtime: Optional[Runtime]
-        Specify the runtime to generate artifacts for
-    """
-
-    def __init__(
-        self,
-        timeout=10,
-        n_parallel=None,
-        build_kwargs=None,
-        build_func="default",
-        do_fork=False,
-        runtime=None,
-    ):
-        super(LocalBuilder, self).__init__(timeout, n_parallel, build_kwargs)
-
-        if isinstance(build_func, str):
-            if build_func == "default":
-                build_func = tar.tar
-            elif build_func == "ndk":
-                build_func = ndk.create_shared
-            elif build_func == "stackvm":
-                build_func = stackvm.build
-            else:
-                raise ValueError("Invalid build_func" + build_func)
-        self.build_func = _WrappedBuildFunc(build_func, runtime)
-        if not do_fork:
-            assert n_parallel in (
-                None,
-                1,
-            ), f"if do_fork=False, need n_parallel=None or 1; got {n_parallel}"
-        self.executor = PopenPoolExecutor(
-            timeout=timeout, initializer=reset_global_scope, initargs=(AutotvmGlobalScope.current,)
-        )
-        self.tmp_dir = tempfile.mkdtemp()
-
-    def build(self, measure_inputs):
-        results = []
-
-        shutil.rmtree(self.tmp_dir, ignore_errors=True)
-        self.tmp_dir = tempfile.mkdtemp()
-
-        for i in range(0, len(measure_inputs), self.n_parallel):
-            futures = []
-            for inp in measure_inputs[i : i + self.n_parallel]:
-                ret = self.executor.submit(self.build_func, inp, self.tmp_dir, **self.build_kwargs)
-                futures.append(ret)
-
-            for future in futures:
-                try:
-                    res = future.result()
-                    if res.error is not None:
-                        assert len(res.error) == 2, (
-                            f"BuildResult errors should be a 2-tuple, but it is a {len(res.error)}"
-                            "-tuple. This should not happen!"
-                        )
-                        tb, exception = res.error
-                        # instantiation error
-                        if isinstance(exception, InstantiationError):
-                            res = MeasureResult(
-                                (tb, exception),
-                                MeasureErrorNo.INSTANTIATION_ERROR,
-                                res.time_cost,
-                                time.time(),
-                            )
-
-                        else:
-                            if "InstantiationError" in str(exception):
-                                msg = str(exception)
-                                try:
-                                    msg = msg.split("\n")[-2].split(": ")[1]
-                                except Exception:  # pylint: disable=broad-except
-                                    pass
-                                res = MeasureResult(
-                                    (tb, InstantiationError(msg)),
-                                    MeasureErrorNo.INSTANTIATION_ERROR,
-                                    res.time_cost,
-                                    time.time(),
-                                )
-
-                            else:  # tvm error
-                                res = MeasureResult(
-                                    (tb, res.error),
-                                    MeasureErrorNo.COMPILE_HOST,
-                                    res.time_cost,
-                                    time.time(),
-                                )
-                except TimeoutError as ex:
-                    tb = traceback.format_exc()
-                    res = MeasureResult(
-                        (tb, ex), MeasureErrorNo.BUILD_TIMEOUT, self.timeout, time.time()
-                    )
-                except ChildProcessError as ex:
-                    tb = traceback.format_exc()
-                    res = MeasureResult(
-                        (tb, ex), MeasureErrorNo.RUNTIME_DEVICE, self.timeout, time.time()
-                    )
-
-                results.append(res)
-
-        return results
-
-
-class RPCRunner(Runner):
-    """Run generated code on remove devices.
-    This function will ask a RPC Tracker to get device for measurement.
-
-    Parameters
-    ----------
-    timeout: float
-        The timeout of a RPCRunner measurement task
-    n_parallel: int
-        The number of tasks run in parallel. "None" will use all cpu cores
-    key: str
-        The key of the device registered in the tracker
-    host: str
-        The host address of RPC Tracker
-    port: int
-        The port of RPC Tracker
-    number: int
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int, optional
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first "1" is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms: int, optional
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval: float, optional
-        The cool down interval between two measurements.
-    enable_cpu_cache_flush: bool
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    module_loader : ModuleLoader
-        If given, a context manager that loads the module to be timed into the remote runtime.
-        If not given, default_module_loader is used.
-    """
-
-    def __init__(
-        self,
-        key,
-        host,
-        port,
-        priority=1,
-        timeout=10,
-        n_parallel=None,
-        number=4,
-        repeat=3,
-        min_repeat_ms=0,
-        cooldown_interval=0.1,
-        enable_cpu_cache_flush=False,
-        module_loader=None,
-    ):
-        super(RPCRunner, self).__init__(timeout, n_parallel)
-
-        self.key = key
-        self.host = host
-        self.port = port
-        self.priority = priority
-        self.timeout = timeout
-
-        self.number = number
-        self.repeat = repeat
-        self.min_repeat_ms = min_repeat_ms
-        self._ref_input = None
-
-        self.enable_cpu_cache_flush = enable_cpu_cache_flush
-        self.cooldown_interval = cooldown_interval
-        self.module_loader = module_loader
-
-        self.executor = PopenPoolExecutor(
-            timeout=timeout * (self.n_parallel + 1),
-            initializer=reset_global_scope,
-            initargs=(AutotvmGlobalScope.current,),
-        )
-
-    @property
-    def ref_input(self):
-        """
-        Fixed input for tuning special operators, e.g., sparse operators
-        requiring indices as input.
-        """
-        return self._ref_input
-
-    @ref_input.setter
-    def ref_input(self, val):
-        if val is not None:
-            warnings.warn(
-                "You are specifying fixed input for tuning the operator. "
-                "Be sure your input always fits the operator. Some "
-                "operators may conduct layout transformation during tuning, "
-                "thus can lead to unexpected behaviors. ",
-                RuntimeWarning,
-            )
-        self._ref_input = val
-
-    def set_task(self, task):
-        self.task = task
-
-        if check_remote(task.target, self.key, self.host, self.port):
-            logger.info("Get devices for measurement successfully!")
-        else:
-            raise RuntimeError(
-                "Cannot get remote devices from the tracker. "
-                "Please check the status of tracker by "
-                "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
-                "and make sure you have free devices on the queue status."
-            )
-
-    def get_build_kwargs(self):
-        kwargs = {"checks": {}}
-        if (
-            "cuda" in self.task.target.keys
-            or "opencl" in self.task.target.keys
-            or "rocm" in self.task.target.keys
-            or "vulkan" in self.task.target.keys
-        ):
-            remote = request_remote(self.key, self.host, self.port)
-            dev = remote.device(str(self.task.target), 0)
-            max_dims = dev.max_thread_dimensions
-            kwargs["checks"]["gpu"] = {
-                "max_shared_memory_per_block": dev.max_shared_memory_per_block,
-                "max_threads_per_block": dev.max_threads_per_block,
-                "max_thread_x": max_dims[0],
-                "max_thread_y": max_dims[1],
-                "max_thread_z": max_dims[2],
-            }
-        if "hexagon" in self.task.target.keys:
-            kwargs["checks"]["hexagon"] = {"vtcm_capacity": self.task.target.vtcm_capacity}
-
-        return kwargs
-
-    def run(self, measure_inputs, build_results):
-        results = []
-        remote_kwargs = dict(
-            device_key=self.key,
-            host=self.host,
-            port=self.port,
-            priority=self.priority,
-            timeout=self.timeout,
-        )
-
-        for i in range(0, len(measure_inputs), self.n_parallel):
-            futures = []
-            for measure_inp, build_res in zip(
-                measure_inputs[i : i + self.n_parallel], build_results[i : i + self.n_parallel]
-            ):
-                module_loader = (
-                    self.module_loader
-                    if self.module_loader is not None
-                    else default_module_loader()
-                )
-                ret = self.executor.submit(
-                    run_through_rpc,
-                    measure_inp,
-                    build_res,
-                    self.number,
-                    self.repeat,
-                    self.min_repeat_ms,
-                    self.cooldown_interval,
-                    remote_kwargs,
-                    self.ref_input,
-                    self.enable_cpu_cache_flush,
-                    module_loader,
-                )
-                futures.append(ret)
-
-            for future in futures:
-                try:
-                    res = future.result()
-                    results.append(res)
-                except Exception as ex:  # pylint: disable=broad-except
-                    tb = traceback.format_exc()
-                    results.append(
-                        MeasureResult(
-                            (tb, ex), MeasureErrorNo.RUN_TIMEOUT, self.timeout, time.time()
-                        )
-                    )
-
-        return results
-
-
-class LocalRunner(RPCRunner):
-    """Run generated code on local devices.
-
-    Parameters
-    ----------
-    timeout: float
-        The timeout of a compilation
-    number: int
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int, optional
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms: int, optional
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval: float, optional
-        The cool down interval between two measurements.
-    enable_cpu_cache_flush: bool
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    Note
-    ----
-    This is a "fake" local mode. We start a silent rpc tracker and rpc server
-    for the user. In this way we reuse timeout/isolation mechanism in RPC infrastructure.
-    """
-
-    def __init__(
-        self,
-        timeout=10,
-        number=4,
-        repeat=3,
-        min_repeat_ms=0,
-        cooldown_interval=0.1,
-        enable_cpu_cache_flush=False,
-        module_loader=None,
-    ):
-        super(LocalRunner, self).__init__(
-            "",
-            None,
-            None,
-            0,
-            timeout=timeout,
-            n_parallel=1,
-            number=number,
-            repeat=repeat,
-            min_repeat_ms=min_repeat_ms,
-            cooldown_interval=cooldown_interval,
-            enable_cpu_cache_flush=enable_cpu_cache_flush,
-            module_loader=module_loader,
-        )
-        self.tracker = None
-        self.server = None
-
-    def set_task(self, task):
-        # pylint: disable=import-outside-toplevel
-        from ...rpc.server import Server
-        from ...rpc.tracker import Tracker
-
-        self.task = task
-        tracker = Tracker(port=9000, port_end=10000, silent=True)
-        device_key = f"$local$device${tracker.port}"
-        server = Server(
-            port=9000,
-            port_end=10000,
-            key=device_key,
-            silent=True,
-            tracker_addr=("127.0.0.1", tracker.port),
-        )
-        self.key = device_key
-        self.host = "127.0.0.1"
-        self.port = tracker.port
-
-        super(LocalRunner, self).set_task(task)
-        return server, tracker
-
-
-def _build_func_common(measure_input, runtime=None, checks=None, build_option=None):
-    """Common part for building a configuration"""
-    target, task, config = measure_input
-    target, task.target_host = Target.canon_target_and_host(target, task.target_host)
-    checks = checks or {}
-    with target:
-        s, args = task.instantiate(config)
-
-        # check invalidity of template and code hash consistency
-        if not config.valid():
-            raise InstantiationError(config.errors)
-
-        current_pass_context: tvm.ir.transform.PassContext = tvm.ir.transform.PassContext.current()
-        current_config = dict(current_pass_context.config)
-        if build_option is not None:
-            current_config.update(build_option)
-
-        if "tir.add_lower_pass" in current_config:
-            current_add_lower_pass = list(current_config["tir.add_lower_pass"])
-        else:
-            current_add_lower_pass = []
-        if checks.get("gpu"):
-            current_add_lower_pass.append((2, gpu_verify_pass(**checks.get("gpu"))))
-        if checks.get("hexagon"):
-            current_add_lower_pass.append((2, vtcm_verify_pass(**checks.get("hexagon"))))
-        current_config["tir.add_lower_pass"] = current_add_lower_pass
-
-        with tvm.ir.transform.PassContext(
-            opt_level=current_pass_context.opt_level,
-            required_pass=current_pass_context.required_pass,
-            disabled_pass=current_pass_context.disabled_pass,
-            instruments=current_pass_context.instruments,
-            config=current_config,
-        ):
-            func = build(s, args, target=target, runtime=runtime)
-    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
-
-
-class _WrappedBuildFunc:
-    """
-    Wrap build_func to a function that can be used in measure.
-
-    Note: this is a class instead of a closure so that it can be pickled when
-    using multiprocessing.
-
-    Parameters
-    ----------
-    build_func : The compilation function
-        We expect fcompile to contain an attr "output_format".
-    runtime : Optional[Runtime]
-        The runtime to generate artifacts for
-
-    Returns
-    -------
-    wrapped_build_func : callable
-        The wrapped build function
-    """
-
-    def __init__(self, build_func, runtime=None):
-        if not hasattr(build_func, "output_format"):
-            raise AttributeError("Expect build_func to have the attribute output_format.")
-        self.build_func = build_func
-        self.runtime = runtime
-
-    def __call__(self, measure_input, tmp_dir, **kwargs):
-        """
-        Wrapped build func.
-
-        Parameters
-        ----------
-        measure_input: MeasureInput
-            The input of measurement
-
-        tmp_dir: str
-            The path of temporary directory to export generated library
-        """
-        tic = time.time()
-        try:
-            filename = os.path.join(
-                tmp_dir, f"tmp_func_{getrandbits(64):0x}.{self.build_func.output_format}"
-            )
-            # TODO(tvm-team) consider linline _build_func_common
-            func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
-            func.export_library(filename, fcompile=self.build_func)
-        except Exception as e:  # pylint: disable=broad-except
-            tb = traceback.format_exc()
-            return BuildResult(None, None, (tb, e), time.time() - tic)
-        return BuildResult(filename, arg_info, None, time.time() - tic)
-
-
-ModuleLoader = typing.Callable[
-    [dict, dict], typing.ContextManager[typing.Tuple[tvm.rpc.RPCSession, tvm.runtime.Module]]
-]
-
-
-def run_through_rpc(
-    measure_input,
-    build_result,
-    number,
-    repeat,
-    min_repeat_ms,
-    cooldown_interval,
-    remote_kwargs,
-    ref_input,
-    enable_cpu_cache_flush=False,
-    module_loader=None,
-):
-    """Run a generated library through rpc
-
-    Parameters
-    ----------
-    measure_input: MeasureInput
-        The raw measure input
-    build_result: BuildResult
-        The result returned from Builder. This contains the path to the generated library.
-    number: int
-        The number of times to run the generated code for taking average.
-        We call these runs as one `repeat` of measurement.
-    repeat : int, optional
-        The number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up and will be discarded.
-        The returned result contains `repeat` costs,
-        each of which is an average of `number` costs.
-    min_repeat_ms: int, optional
-        The minimum duration of one `repeat` in milliseconds.
-        By default, one `repeat` contains `number` runs. If this parameter is set,
-        the parameters `number` will be dynamically adjusted to meet the
-        minimum duration requirement of one `repeat`.
-        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
-        will be automatically increased.
-    cooldown_interval: float
-        The cool down interval between two measurements
-    remote_kwargs: dict
-        Passed to module_loader(). Ultimately, keyword args to request_remote().
-    ref_input: List of np.ndarray
-        The reference input used for tuning. Empty for randomly filled input.
-    enable_cpu_cache_flush: bool
-        Whether to flush cache on CPU between repeated measurements.
-        Flushing cache can make the measured latency of one operator closer to
-        its actual latency during end-to-end inference.
-        To make this option effective, the argument `number` should also be set to 1.
-        This is only has effect on CPU task.
-    module_loader: ModuleLoader
-        A function that returns a ContextManager used to establish and teardown the remote session.
-    """
-    if isinstance(build_result, MeasureResult):
-        return build_result
-
-    tic = time.time()
-    errno = MeasureErrorNo.NO_ERROR
-    try:
-        # upload built module
-        with module_loader(remote_kwargs, build_result) as (remote, mod):
-            dev = remote.device(str(measure_input.target), 0)
-
-            # Limitation:
-            # We can not get PackFunction directly in the remote mode as it is wrapped
-            # under the std::function. We could lift the restriction later once we fold
-            # the PackedFunc as an object. Currently, we pass function name to work
-            # around it.
-            f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
-            time_f = mod.time_evaluator(
-                mod.entry_name,
-                dev,
-                number=number,
-                repeat=repeat,
-                min_repeat_ms=min_repeat_ms,
-                f_preproc=f_prepare,
-            )
-
-            if ref_input:
-                args = [nd.array(x, device=dev) for x in ref_input]
-            else:
-                try:
-                    random_fill = remote.get_function("tvm.contrib.random.random_fill")
-                except AttributeError:
-                    raise AttributeError(
-                        "Please make sure USE_RANDOM is ON in the config.cmake "
-                        "on the remote devices"
-                    )
-                args = [nd.empty(x[0], x[1], dev) for x in build_result.arg_info]
-                if "scatter" not in measure_input.task.name:
-                    # the index tensor of scatter op cannot be randomly initialized
-                    for arg in args:
-                        random_fill(arg)
-                dev.sync()
-
-            costs = time_f(*args).results
-
-        if len(costs) > 2:  # remove largest and smallest value to reduce variance
-            costs = list(costs)
-            costs.sort()
-            costs = tuple(costs[1:-1])
-    except TVMError as exc:
-        msg = str(exc)
-        if "Stack trace returned" in msg:
-            msg = msg[: msg.index("Stack trace returned")]
-        if "CUDA Source" in msg:
-            msg = msg[: msg.index("CUDA Source")]
-        costs = (traceback.format_exc(), RuntimeError(msg[:1024]))
-        errno = MeasureErrorNo.RUNTIME_DEVICE
-    tstamp = time.time()
-    time.sleep(cooldown_interval)
-    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
-
-
-class DefaultModuleLoader:
-    """See default_module_loader(). A pickleable emulation of the original function closure."""
-
-    def __init__(self, pre_load_function=None) -> None:
-        self.pre_load_function = pre_load_function
-
-    @contextlib.contextmanager
-    def __call__(self, remote_kwargs, build_result):
-        remote = request_remote(**remote_kwargs)
-        if self.pre_load_function is not None:
-            self.pre_load_function(remote, build_result)
-
-        remote.upload(build_result.filename)
-        try:
-            yield remote, remote.load_module(os.path.split(build_result.filename)[1])
-
-        finally:
-            # clean up remote files
-            remote.remove(build_result.filename)
-            remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
-            remote.remove("")
-
-
-def default_module_loader(pre_load_function=None):
-    """Returns a default function that can be passed as module_loader to run_through_rpc.
-
-    Parameters
-    ----------
-    pre_load_function : Optional[Function[tvm.rpc.Session, tvm.runtime.Module]]
-        Invoked after a session is established and before the default code-loading RPC calls are
-        issued. Allows performing pre-upload actions, e.g. resetting the remote runtime environment.
-
-    Returns
-    -------
-    DefaultModuleLoader :
-        A callable that can be passed as module_loader to run_through_rpc.
-    """
-
-    # This was a function with a closure before but that couldn't be pickled!
-    # We need pickle to work for using python's multiprocessing on some platforms.
-    return DefaultModuleLoader(pre_load_function)
-
-
-def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
-    """Request a remote session
-
-    Parameters
-    ----------
-    device_key: string
-        The device key of registered device in tracker
-    host: host, optional
-        The host address of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-    port: int, optional
-        The port of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this session (units: second)
-
-    Returns
-    ------
-    session: RPCSession
-    """
-    # connect to the tracker
-    host = host or os.environ["TVM_TRACKER_HOST"]
-    port = port or int(os.environ["TVM_TRACKER_PORT"])
-
-    tracker = _rpc.connect_tracker(host, port)
-    remote = tracker.request(device_key, priority=priority, session_timeout=timeout)
-    return remote
-
-
-def check_remote(target, device_key, host=None, port=None, priority=100, timeout=10):
-    """
-    Check the availability of a remote device
-
-    Parameters
-    ----------
-    target: Target
-        The wanted compilation target
-    device_key: string
-        device key of registered device in tracker
-    host: host, optional
-        The host address of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-    port: int, optional
-        The port address of rpc tracker.
-        If is none, will use environment variable "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this check (units: seconds).
-
-    Returns
-    -------
-    available: bool
-        True if can find available device
-    """
-
-    def _check():
-        logger.debug("waiting for device...")
-        remote = request_remote(device_key, host, port, priority)
-        dev = remote.device(str(target))
-        while not dev.exist:  # wait until we get an available device
-            pass
-        logger.debug("device available")
-
-    t = threading.Thread(target=_check)
-    t.start()
-    t.join(timeout)
-
-    remote = request_remote(device_key, host, port, priority)
-    dev = remote.device(str(target))
-    return dev.exist
-
-
-def set_cuda_target_arch(arch):
-    """THIS API IS DEPRECATED.
-
-    set target architecture of nvcc compiler
-
-    Parameters
-    ----------
-    arch: str or list
-        The argument of nvcc -arch. (e.g. "sm_51", "sm_62")
-        it can also be a count of gencode arguments pass to nvcc command line,
-        e.g., ["-gencode", "arch=compute_52,code=sm_52", "-gencode", "arch=compute_70,code=sm_70"]
-    """
-    raise ValueError(
-        "The API 'autotvm.measure.set_cuda_target_arch' is deprecated."
-        "Try specifying it by adding '-arch=sm_xx' to your target, such as 'cuda -arch=sm_86'."
-        "See https://github.com/apache/tvm/pull/9544 for the upgrade guide."
-    )
-
-
-def gpu_verify_pass(**kwargs):
-    """Verify the validity of a gpu kernel.
-    This pass will check memory usage and number of threads per block.
-    """
-
-    def verify_pass(f, *_):
-        valid = tvm.tir.analysis.verify_gpu_code(f, kwargs)
-        if not valid:
-            raise InstantiationError("Skipped because of invalid gpu kernel")
-        return f
-
-    return tvm.tir.transform.prim_func_pass(verify_pass, opt_level=0)
-
-
-def vtcm_verify_pass(**kwargs):
-    """Verify the validity of a hexagon kernel.
-    This pass will check vtcm memory usage.
-    """
-
-    def verify_pass(f, *_):
-        sizes = tvm.tir.analysis.calculate_allocated_bytes(f)
-        vtcm_capacity = kwargs.get("vtcm_capacity", 0)
-        vtcm_allocated = sizes.get("global.vtcm", 0)
-        if 0 < vtcm_capacity < vtcm_allocated:
-            raise InstantiationError("Skipped because of invalid vtcm memory usage limit")
-
-        return f
-
-    return tvm.tir.transform.prim_func_pass(verify_pass, opt_level=0)
diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
deleted file mode 100644
index cde78d1dbc31..000000000000
--- a/python/tvm/autotvm/record.py
+++ /dev/null
@@ -1,378 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=superfluous-parens, redefined-outer-name, redefined-outer-name,pointless-string-statement
-# pylint: disable=consider-using-enumerate,invalid-name
-"""Tuning record and serialization format"""
-
-import argparse
-import base64
-from io import TextIOBase
-import logging
-import pickle
-import json
-import time
-from typing import Union
-import os
-import itertools
-from collections import OrderedDict
-import numpy as np
-
-from .. import build, lower
-from ..target import Target
-from ..contrib import popen_pool
-from .. import __version__
-from . import task
-from .task import ConfigEntity, ApplyHistoryBest
-from .measure import MeasureInput, MeasureResult
-
-AUTOTVM_LOG_VERSION = 0.2
-_old_version_warning = True
-logger = logging.getLogger("autotvm")
-
-try:  # convert unicode to str for python2
-    _unicode = unicode
-except NameError:
-    _unicode = ()
-
-try:
-    _long = long
-except NameError:
-    _long = int
-
-
-def measure_str_key(inp, include_config=True):
-    """get unique str key for MeasureInput
-
-    Parameters
-    ----------
-    inp: autotvm.measure.MeasureInput
-        input for the measure
-    include_config: bool, optional
-        whether includes config in the str key
-
-    Returns
-    -------
-    key: str
-        The str representation of key
-    """
-    config_str = str(inp.config) if include_config else ""
-    return "".join(
-        [str(inp.target), inp.task.name, str(inp.task.args), str(inp.task.kwargs), config_str]
-    )
-
-
-def encode(inp, result, protocol="json"):
-    """encode (MeasureInput, MeasureResult) pair to a string
-
-    Parameters
-    ----------
-    inp: autotvm.measure.MeasureInput
-    result: autotvm.measure.MeasureResult
-        pair of input/result
-    protocol: str
-        log protocol, json or pickle
-
-    Returns
-    -------
-    row: str
-        a row in the logger file
-    """
-
-    if protocol == "json":
-        json_dict = {
-            "input": (str(inp.target), inp.task.name, inp.task.args, inp.task.kwargs),
-            "config": inp.config.to_json_dict(),
-            "result": (
-                result.costs if result.error_no == 0 else (1e9,),
-                result.error_no,
-                result.all_cost,
-                result.timestamp,
-            ),
-            "version": AUTOTVM_LOG_VERSION,
-            "tvm_version": __version__,
-        }
-        return json.dumps(json_dict)
-    if protocol == "pickle":
-        row = (
-            str(inp.target),
-            str(
-                base64.b64encode(
-                    pickle.dumps([inp.task.name, inp.task.args, inp.task.kwargs])
-                ).decode()
-            ),
-            str(base64.b64encode(pickle.dumps(inp.config)).decode()),
-            str(base64.b64encode(pickle.dumps(tuple(result))).decode()),
-            str(AUTOTVM_LOG_VERSION),
-            str(__version__),
-        )
-        return "\t".join(row)
-
-    raise RuntimeError("Invalid log protocol: " + protocol)
-
-
-def decode(row, protocol="json"):
-    """Decode encoded record string to python object
-
-    Parameters
-    ----------
-    row : str
-        a row in the logger file
-
-    protocol : str
-        log protocol, json or pickle
-
-    Returns
-    -------
-    ret : tuple(autotvm.measure.MeasureInput, autotvm.measure.MeasureResult), or None
-        The tuple of input and result, or None if input uses old version log format.
-    """
-    # pylint: disable=unused-variable
-    global _old_version_warning
-
-    if protocol == "json":
-        row = json.loads(row)
-        if "v" in row and row["v"] == 0.1:
-            if _old_version_warning:
-                logger.warning("AutoTVM log version 0.1 is no longer supported.")
-                _old_version_warning = False
-            return None
-
-        tgt, task_name, task_args, task_kwargs = row["input"]
-        tgt = str(tgt)
-        if "-target" in tgt:
-            logger.warning('"-target" is deprecated, use "-mtriple" instead.')
-            tgt = tgt.replace("-target", "-mtriple")
-        tgt = Target(str(tgt))
-
-        def clean_json_to_python(x):
-            """1. Convert all list in x to tuple (hashable)
-            2. Convert unicode to str for python2
-            """
-            if isinstance(x, list):
-                return tuple([clean_json_to_python(a) for a in x])
-            if isinstance(x, _unicode):
-                return str(x)
-            if isinstance(x, (_long, int)):
-                return int(x)
-            return x
-
-        tsk = task.Task(clean_json_to_python(task_name), clean_json_to_python(task_args))
-        config = ConfigEntity.from_json_dict(row["config"])
-        inp = MeasureInput(tgt, tsk, config)
-        result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["result"]])
-        config.cost = np.mean(result.costs)
-
-        return inp, result
-    if protocol == "pickle":
-        items = row.split("\t")
-        if len(items) == 4:
-            if _old_version_warning:
-                logger.warning("AutoTVM log version 0.1 is no longer supported.")
-                _old_version_warning = False
-            return None
-        tgt = Target(items[0])
-        task_tuple = pickle.loads(base64.b64decode(items[1].encode()))
-        config = pickle.loads(base64.b64decode(items[2].encode()))
-        result = MeasureResult(*pickle.loads(base64.b64decode(items[3].encode())))
-        config.cost = np.mean(result.costs)
-
-        tsk = task.Task(task_tuple[0], task_tuple[1])
-        return MeasureInput(tgt, tsk, config), result
-
-    raise RuntimeError("Invalid log protocol: " + protocol)
-
-
-def load_from_buffer(file: TextIOBase):
-    """Generator: load records from buffer.
-    This is a generator that yields the records.
-
-    Parameters
-    ----------
-    file: io.TextIOBase
-
-    Yields
-    ------
-    input: autotvm.measure.MeasureInput
-    result: autotvm.measure.MeasureResult
-    """
-    for row in file:
-        if row and not row.startswith("#"):
-            ret = decode(row)
-            if ret is None:
-                continue
-            yield ret
-
-
-def load_from_file(filepath: Union[str, bytes, os.PathLike]):
-    """Generator: load records from path.
-    This is a generator that yields the records.
-
-    Parameters
-    ----------
-    filepath: str, bytes, or os.PathLike
-
-    Yields
-    ------
-    input: autotvm.measure.MeasureInput
-    result: autotvm.measure.MeasureResult
-    """
-    with open(filepath) as f:
-        for row in f:
-            if row and not row.startswith("#"):
-                ret = decode(row)
-                if ret is None:
-                    continue
-                yield ret
-
-
-def split_workload(in_file, clean=True):
-    """Split a log file into separate files, each of which contains only a single workload
-    This function can also delete duplicated records in log file
-
-    Parameters
-    ----------
-    in_file: str
-        input filename
-    clean: bool
-        whether delete duplicated items
-    """
-    tic = time.time()
-    lines = list(open(in_file).readlines())
-
-    logger.info("start converting...")
-    pool = popen_pool.PopenPoolExecutor()
-    lines = [rec for rec in pool.map(decode, lines) if rec is not None]
-    logger.info("map done %.2f", time.time() - tic)
-
-    wkl_dict = OrderedDict()
-    for inp, res in lines:
-        wkl = measure_str_key(inp, False)
-        if wkl not in wkl_dict:
-            wkl_dict[wkl] = []
-        wkl_dict[wkl].append([inp, res])
-
-    if clean:
-        for i, (k, v) in enumerate(wkl_dict.items()):
-            # clean duplicated items
-            added = set()
-            cleaned = []
-            for inp, res in v:
-                str_key = measure_str_key(inp)
-                if str_key in added:
-                    continue
-                added.add(str_key)
-                cleaned.append([inp, res])
-
-            # write to file
-            logger.info("Key: %s\tValid: %d\tDup: %d\t", k, len(cleaned), len(v) - len(cleaned))
-            with open(args.i + f".{i:03d}.wkl", "w") as fout:
-                for inp, res in cleaned:
-                    fout.write(encode(inp, res) + "\n")
-    else:
-        for i, (k, v) in enumerate(wkl_dict.items()):
-            logger.info("Key: %s\tNum: %d", k, len(v))
-            with open(args.i + f".{i:03d}.wkl", "w") as fout:
-                for inp, res in v:
-                    fout.write(encode(inp, res) + "\n")
-
-
-def pick_best(in_file, out_file):
-    """
-    Pick the best entries from a file and store them to another file.
-    This function distills the useful log entries from a large log file.
-    If out_file already exists, the best entries from both
-    in_file and out_file will be saved.
-
-    Parameters
-    ----------
-    in_file: str
-        The filename of input
-    out_file: str or file
-        The filename of output
-    """
-    context = load_from_file(in_file)
-    if os.path.isfile(out_file):
-        out_context = load_from_file(out_file)
-        context = itertools.chain(context, out_context)
-    context, context_clone = itertools.tee(context)
-    best_context = ApplyHistoryBest(context)
-    best_set = set()
-
-    for v in best_context.best_by_model.values():
-        best_set.add(measure_str_key(v[0]))
-
-    for v in best_context.best_by_targetkey.values():
-        best_set.add(measure_str_key(v[0]))
-
-    logger.info("Extract %d best records from the %s", len(best_set), in_file)
-    fout = open(out_file, "w") if isinstance(out_file, str) else out_file
-
-    for inp, res in context_clone:
-        if measure_str_key(inp) in best_set:
-            fout.write(encode(inp, res) + "\n")
-            best_set.remove(measure_str_key(inp))
-
-
-"""
-Usage:
-This record executable module has three modes.
-
-* Print log file in readable format
-e.g. python -m tvm.autotvm.record --mode read --i collect_conv.log --begin 0 --end 5 --ir --code
-
-* Extract history best from a large log file
-e.g. python -m tvm.autotvm.record --mode pick --i collect.log
-
-* Split a log file into separate files, each of which contains only a single wkl
-e.g. python -m tvm.autotvm.record --mode split --i collect.log
-"""
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--mode", choices=["read", "pick", "split"], default="read")
-    parser.add_argument("--i", type=str, help="input file")
-    parser.add_argument("--o", type=str, default=None, help="output file")
-    parser.add_argument("--begin", type=int, default=0)
-    parser.add_argument("--end", type=int, default=5)
-    parser.add_argument("--ir", action="store_true")
-    parser.add_argument("--code", action="store_true")
-
-    args = parser.parse_args()
-    logging.basicConfig(level=logging.INFO)
-
-    if args.mode == "pick":
-        args.o = args.o or args.i + ".best.log"
-        pick_best(args.i, args.o)
-    elif args.mode == "read":
-        for i, (inp, result) in enumerate(load_from_file(args.i)):
-            if args.begin <= i < args.end:
-                with inp.target:
-                    s, arg_bufs = inp.task.instantiate(inp.config)
-
-                print("")
-                print(inp.target, inp.task, inp.config)
-                print(result)
-
-                if args.ir:
-                    with inp.target:
-                        print(lower(s, arg_bufs, simple_mode=True))
-
-                if args.code:
-                    with inp.target:
-                        func = build(s, arg_bufs)
-                        print(func.imported_modules[0].get_source())
-    elif args.mode == "split":
-        split_workload(args.i)
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
deleted file mode 100644
index 3949d324c4df..000000000000
--- a/python/tvm/autotvm/task/__init__.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Task is a tunable composition of template functions.
-
-Tuner takes a tunable task and optimizes the joint configuration
-space of all the template functions in the task.
-This module defines the task data structure, as well as a collection(zoo)
-of typical tasks of interest.
-"""
-
-from .task import (
-    Task,
-    create,
-    get_config,
-    args_to_workload,
-    template,
-    serialize_args,
-    deserialize_args,
-)
-from .space import ConfigSpace, ConfigEntity
-from .code_hash import attach_code_hash, attach_code_hash_to_arg
-from .dispatcher import (
-    DispatchContext,
-    ApplyConfig,
-    ApplyFixedConfig,
-    ApplyHistoryBest,
-    FallbackContext,
-    clear_fallback_cache,
-    ApplyGraphBest,
-)
-
-from .topi_integration import (
-    register_topi_compute,
-    register_topi_schedule,
-    TaskExtractEnv,
-    get_workload,
-)
-from .relay_integration import extract_from_program, extract_from_multiple_program
diff --git a/python/tvm/autotvm/task/code_hash.py b/python/tvm/autotvm/task/code_hash.py
deleted file mode 100644
index 2bd053da7244..000000000000
--- a/python/tvm/autotvm/task/code_hash.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Decorator functions for hashing schedule code
-
-code hashing is used to check the consistence of schedule code and the parameters loaded from log
-"""
-import functools
-import inspect
-import zlib
-
-from tvm.te import schedule
-
-
-def attach_code_hash(s):
-    """Decorator for attaching a code hash to a schedule
-
-    Parameters
-    ----------
-    s: Schedule
-        tvm.te.schedule.Schedule to attach the hash to
-    """
-
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            func(*args, **kwargs)
-            raw_hash = zlib.crc32("".join(inspect.getsourcelines(func)[0]).encode())
-            s.code_hash = hex(raw_hash)[2:]
-
-        return wrapper
-
-    return decorator
-
-
-def attach_code_hash_to_arg(arg_idx=1):
-    """Decorator for attaching a code hash to a schedule
-
-    Parameters
-    ----------
-    arg_idx: int
-        index of the argument (expected to be a Schedule) to attach the code
-        hash to
-    """
-
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            func(*args, **kwargs)
-            assert isinstance(args[arg_idx], schedule.Schedule)
-            raw_hash = zlib.crc32("".join(inspect.getsourcelines(func)[0]).encode())
-            args[arg_idx].code_hash = hex(raw_hash)[2:]
-
-        return wrapper
-
-    return decorator
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
deleted file mode 100644
index f3d5c290f454..000000000000
--- a/python/tvm/autotvm/task/dispatcher.py
+++ /dev/null
@@ -1,524 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Template dispatcher module.
-
-A dispatcher is a function that can contains multiple behaviors.
-Its specific behavior is can be controlled by DispatchContext.
-
-DispatchContext is used in two ways, usually via different implementation
-of the DispatchContext base class.
-
-- During search, we can use it to pass the current proposal from tuner.
-- During evaluation, we can use it to set pick the best policy.
-"""
-# pylint: disable=invalid-name
-
-from __future__ import absolute_import as _abs
-
-from io import TextIOBase
-import logging
-from os import PathLike
-from pathlib import Path
-from typing import List, Iterable, Tuple, Union
-
-import numpy as np
-
-from .space import FallbackConfigEntity
-from .. import env as _env
-from ..measure import MeasureInput, MeasureResult
-
-logger = logging.getLogger("autotvm")
-
-Records = Union[
-    Union[str, bytes, Path],  # Path-like objects
-    TextIOBase,  # File-like objects
-    Iterable[Tuple[MeasureInput, MeasureResult]],
-]
-
-
-class DispatchContext(object):
-    """
-    Base class of dispatch context.
-
-    DispatchContext enables the target and workload
-    specific dispatch mechanism for templates.
-    """
-
-    current = None
-    # a set to prevent print duplicated message
-    warning_messages = set()
-
-    def __init__(self):
-        self._old_ctx = DispatchContext.current
-
-    def query(self, target, workload):
-        """
-        Query the context to get the specific config for a template.
-        If cannot find the result inside this context, this function will query it
-        from the upper contexts.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-
-        Returns
-        -------
-        cfg : ConfigSpace
-            The specific configuration.
-        """
-        ret = self._query_inside(target, workload)
-        if ret is None:
-            ret = self._old_ctx.query(target, workload)
-        return ret
-
-    def update(self, target, workload, cfg):
-        """
-        Update context with a specific config.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-        cfg : ConfigSpace
-            The specific configuration.
-
-        Note
-        ----
-        This interface is for cases when TVM decides to replace an operator in the graph.
-        For example, `AlterOpLayout` pass (enables when `opt_level = 3`) replaces `NCHW`
-        convolution with `NCHW[x]c` implementation on x86 CPUs.
-        Thus in TOPI, we first query schedule using original `NCHW` workload,
-        then update the dispatcher with the new `NCHW[x]c` workload.
-        So that later on, `NCHW[x]c` convolution can get schedule from the dispatcher using
-        its own workload directly.
-
-        .. code-block:: python
-
-            @conv2d_alter_layout.register("cpu")
-            def _alter_conv2d_layout(attrs, inputs, tinfo):
-                workload = get_conv2d_workload(...)
-                dispatch_ctx = autotvm.task.DispatchContext.current
-                target = tvm.target.Target.current()
-                config = dispatch_ctx.query(target, workload)
-
-                # Get conv2d_NCHWc workload from config
-                # new_workload = ...
-                # new_inputs = ...
-                # new_attrs = ...
-
-                # Store altered operator's config
-                dispatch_ctx.update(target, new_workload, config)
-                return sym.contrib.conv2d_NCHWc(*new_inputs, **new_attrs)
-
-        We directly store `config` back because `conv2d_NCHW` and `conv2d_NCHWc`
-        share the same schedule parameters.
-        One can construct a new `ConfigEntity` if this is not the case.
-        """
-        raise NotImplementedError()
-
-    def _query_inside(self, target, workload):
-        """
-        Query the context to get the specific config for a template.
-        This function only query config inside this context.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-
-        Returns
-        -------
-        cfg : ConfigSpace
-            The specific configuration.
-        """
-        raise NotImplementedError()
-
-    def __enter__(self):
-        self._old_ctx = DispatchContext.current
-        DispatchContext.current = self
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        DispatchContext.current = self._old_ctx
-
-
-class ApplyConfig(DispatchContext):
-    """Apply a deterministic config entity for all queries.
-
-    Parameters
-    ----------
-    config : ConfigSpace or ConfigEntity
-        The specific configuration we care about.
-    """
-
-    def __init__(self, config):
-        super(ApplyConfig, self).__init__()
-        self._config = config
-        self.workload = None
-
-    def _query_inside(self, target, workload):
-        """Override query"""
-        self.workload = workload
-        return self._config
-
-    def update(self, target, workload, cfg):
-        """Override update"""
-        self.workload = workload
-        self._config = cfg
-
-
-class ApplyFixedConfig(DispatchContext):
-    """Apply a config of a deterministic schedule.
-    This is used for building a single Relay operator with deterministic schedule
-    for testing schedules at Relay level.
-
-    Parameters
-    ----------
-    tasks : list[tvm.autotvm.task.task.Task]
-        List of autoTVM tasks.
-    schedule_names : str, List[str]
-        Name of schedules to use.
-    """
-
-    def __init__(self, tasks, schedule_names: Union[str, List[str]]):
-        super(ApplyFixedConfig, self).__init__()
-        if isinstance(schedule_names, str):
-            self._schedule_names = list(schedule_names)
-        elif isinstance(schedule_names, list):
-            self._schedule_names = schedule_names
-        else:
-            raise RuntimeError("Incorrect type: " + schedule_names)
-        self._tasks = tasks
-        self.workload = None
-
-    def _query_inside(self, target, workload):
-        """Override query"""
-        self.workload = workload
-
-        # Create a config from correct task
-        for task in self._tasks:
-            if task.name == workload[0]:
-                config = task.config_space.get(0)
-                break
-
-        if not config:
-            raise RuntimeError(f"workload: {str(workload)} does not exist in {str(self._tasks)}")
-        # Add low cost to the target schedule and high cost to others.
-        if workload[0] in self._schedule_names:
-            config.cost = 1e-6
-        else:
-            config.cost = 100000
-        return config
-
-    def update(self, target, workload, cfg):
-        """Override update"""
-        self.workload = workload
-        self._config = cfg
-
-
-class ApplyHistoryBest(DispatchContext):
-    """
-    Apply the history best config
-
-    Parameters
-    ----------
-    records : None, Records, or iterator of Records objects, where a
-              Records object is a path-like object, a file-like object,
-              or an iterator of (MeasureInput, MeasureResult).
-
-        Collection of tuning records. If multiple Records objects are passed, their
-        contents will be merged.
-    """
-
-    def __init__(self, records: Union[None, Records, Iterable[Records]]):
-        super(ApplyHistoryBest, self).__init__()
-
-        self.best_by_targetkey = {}
-        self.best_by_model = {}
-        self._best_user_defined = {}
-
-        if records:
-            self.load(records)
-
-    def load(self, records: Union[Records, Iterable[Records]]):
-        """Load records to this dispatch context
-
-        Parameters
-        ----------
-        records : str, list of str, or iterator of (autotvm.measure.MeasureInput,\
-                                                    autotvm.measure.MeasureResult)
-
-            Collection of tuning records. If multiple Records objects are passed, their
-            contents will be merged.
-        """
-        # pylint: disable=import-outside-toplevel
-        from ..record import load_from_file, load_from_buffer
-
-        def _unpack_records(
-            records: Union[Records, Iterable[Records]]
-        ) -> List[Tuple[MeasureInput, MeasureResult]]:
-
-            if isinstance(records, (str, bytes, PathLike)):
-                return load_from_file(records)
-
-            if isinstance(records, TextIOBase):
-                return load_from_buffer(records)
-
-            joint_records = []
-            for record in records:
-                if isinstance(record, Tuple) and isinstance(record[0], MeasureInput):
-                    joint_records.append(record)
-                else:
-                    joint_records += _unpack_records(record)
-
-            return joint_records
-
-        flattened_records = _unpack_records(records)
-        if not flattened_records:
-            return
-
-        best_by_targetkey = self.best_by_targetkey
-        best_by_model = self.best_by_model
-
-        counter = 0
-        for inp, res in flattened_records:
-            counter += 1
-            if res.error_no != 0:
-                continue
-
-            # use target keys in tvm target system as key to build best map
-            for k in inp.target.keys:
-                key = (k, inp.task.workload)
-                if key not in best_by_targetkey:
-                    best_by_targetkey[key] = (inp, res)
-                else:
-                    _, other_res = best_by_targetkey[key]
-                    if np.mean(other_res.costs) > np.mean(res.costs):
-                        best_by_targetkey[key] = (inp, res)
-
-            # use model as key to build best map
-            key = (inp.target.model, inp.task.workload)
-            if key not in best_by_model:
-                if inp.target.model != "unknown":
-                    best_by_model[key] = (inp, res)
-            else:
-                _, other_res = best_by_model[key]
-                if np.mean(other_res.costs) > np.mean(res.costs):
-                    best_by_model[key] = (inp, res)
-
-        logger.debug("Finish loading %d records", counter)
-
-    def _query_inside(self, target, workload):
-        if target is None:
-            raise RuntimeError(
-                "Need a target context to find the history best. "
-                "Hint: If your target is llvm, use `with tvm.target.Target('llvm'):`"
-                " above the dispatcher call. So does other target. "
-            )
-
-        # first try matching by model
-        key = (target.model, workload)
-        if key in self._best_user_defined:
-            return self._best_user_defined[key]
-        if key in self.best_by_model:
-            inp, _ = self.best_by_model[key]
-            return inp.config
-
-        # then try matching by target key
-        for k in target.keys:
-            key = (k, workload)
-            if key in self._best_user_defined:
-                return self._best_user_defined[key]
-            if key in self.best_by_targetkey:
-                inp, _ = self.best_by_targetkey[key]
-                return inp.config
-
-        return None
-
-    def update(self, target, workload, cfg):
-        model = target.model
-        key = (model, workload)
-        # assume user provided config is the best
-        cfg.cost = 0
-        self._best_user_defined[key] = cfg
-
-        for k in target.keys:
-            key = (k, workload)
-            self._best_user_defined[key] = cfg
-
-
-class FallbackContext(DispatchContext):
-    """
-    A fallback dispatch context.
-
-    Any tunable template can be called under this context.
-    This is the root context.
-    """
-
-    def __init__(self):
-        super(FallbackContext, self).__init__()
-        self.memory = {}
-
-    def _query_inside(self, target, workload):
-        key = (str(target), workload)
-        if key in self.memory:
-            return self.memory[key]
-
-        if not _env.GLOBAL_SCOPE.silent:
-            msg = (
-                f"Cannot find config for target={target}, workload={workload}. A fallback "
-                f"configuration is used, which may bring great performance regression."
-            )
-            if msg not in DispatchContext.warning_messages:
-                DispatchContext.warning_messages.add(msg)
-                logger.warning(msg)
-        cfg = FallbackConfigEntity()
-
-        # cache this config
-        self.memory[key] = cfg
-        return cfg
-
-    def clear_cache(self, target, workload):
-        """Clear fallback cache. Pass the same argument as _query_inside to this function
-        to clean the cache.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-        """
-        key = (str(target), workload)
-        if key in self.memory:
-            del self.memory[key]
-
-    def update(self, target, workload, cfg):
-        key = (str(target), workload)
-        self.memory[key] = cfg
-
-
-DispatchContext.current = FallbackContext()
-
-
-def clear_fallback_cache(target, workload):
-    """Clear fallback cache. Pass the same argument as _query_inside to this function
-    to clean the cache.
-
-    Parameters
-    ----------
-    target: Target
-        The current target
-    workload : Workload
-        The current workload.
-
-    Note
-    ----
-    This is used in alter_op_layout to clear the bad cache created before call topi compute function
-    """
-    context = DispatchContext.current
-    while not isinstance(context, FallbackContext):
-        context = context._old_ctx
-    context.clear_cache(target, workload)
-
-
-class ApplyGraphBest(DispatchContext):
-    """Load the graph level tuning optimal schedules.
-
-    The input records should be in the ascending order of
-    node index for target operator. Usually this can be obtained
-    with graph tuner.
-
-    This context maintains an internal counter to indicate the current
-    node index.
-    """
-
-    def __init__(self, records: Records):
-        """
-        Parameters
-        ----------
-        records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult)
-            Collection of tuning records.
-            If is str, then it should be the filename of a records log file.
-                   Each row of this file is an encoded record pair.
-            Otherwise, it is an iterator.
-        """
-        # pylint: disable=import-outside-toplevel
-        from ..record import load_from_file, load_from_buffer
-
-        super(ApplyGraphBest, self).__init__()
-        if isinstance(records, (str, bytes, PathLike)):
-            records = load_from_file(records)
-        elif isinstance(records, TextIOBase):
-            records = load_from_buffer(records)
-        else:
-            records = list(records)
-
-        self._records = list(records)
-        self._counter = 0
-        self._global_cfg_dict = {}
-
-    def _query_inside(self, target, workload):
-        """
-        Query the context to get config from records.
-
-        Parameters
-        ----------
-        target : Target
-            The current target
-        workload : Workload
-            The current workload.
-
-        Returns
-        -------
-        cfg : ConfigSpace
-            The specific configuration.
-        """
-        if self._counter < len(self._records):
-            cfg = self._records[self._counter][0].config
-            wkl = self._records[self._counter][0].task.workload
-            if workload is not None:
-                assert wkl == workload
-            self._counter += 1
-            self.update(target, wkl, cfg)
-            cfg.workload = wkl
-            return cfg
-        key = (str(target), workload)
-        if key not in self._global_cfg_dict:
-            msg = (
-                f"Config for target={target}, workload={workload} is missing in ApplyGraphBest "
-                f"context. A fallback configuration is used, which may bring great performance "
-                f"regression."
-            )
-            logger.warning(msg)
-            cfg = FallbackConfigEntity()
-            self._global_cfg_dict[key] = cfg
-        else:
-            cfg = self._global_cfg_dict[key]
-        return cfg
-
-    def update(self, target, workload, cfg):
-        key = (str(target), workload)
-        self._global_cfg_dict[key] = cfg
diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py
deleted file mode 100644
index 4ee92641917b..000000000000
--- a/python/tvm/autotvm/task/relay_integration.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-variable,invalid-name, not-context-manager
-"""
-Decorator and utilities for the integration with TOPI and Relay
-99.9% copy-paste of implementation by @MerryMercy
-
-"""
-import threading
-import logging
-
-import tvm
-from tvm.autotvm.task.dispatcher import DispatchContext, FallbackContext
-from tvm.target import Target
-from .task import create
-from .topi_integration import TaskExtractEnv
-
-logger = logging.getLogger("autotvm")
-
-
-def _lower(mod, target, params, opt_level=3):
-    """Helper to lower VTA properly."""
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    # Alter op layout code has been written expecting that tuning is applied
-    # without it, so we disable AlterOpLayout to maintain that behavior.
-    with tvm.transform.PassContext(opt_level=opt_level, disabled_pass={"AlterOpLayout"}):
-        compiler = relay.vm.VMCompiler()
-        if params:
-            compiler.set_params(params)
-        compiler.lower(mod, target=target)
-
-
-def extract_from_program(mod, params, target, target_host=None, ops=None):
-    """Extract tuning tasks from a relay program.
-
-    This function is the single program version of extract_from_multiple_program.
-
-    Parameters
-    ----------
-    mod: tvm.IRModule or relay.function.Function
-        The module or function to tune
-    params: dict of str to numpy array
-        The associated parameters of the program
-    target: tvm.target.Target
-        The compilation target
-    target_host: tvm.target.Target
-        The host compilation target
-    ops: List[tvm.ir.Op] or None
-        List of relay ops to be tuned. If not specified, all tunable ops will be extracted.
-
-    Returns
-    -------
-    task: Array of autotvm.task.Task
-        collected tasks
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    return extract_from_multiple_program([mod], [params], target, ops=ops)
-
-
-def extract_from_multiple_program(mods, params, target, target_host=None, ops=None):
-    """Extract tuning tasks from multiple relay programs.
-
-    This function collects tuning tasks by building a list of programs
-    with a "tracing" target and tracing all the calls to topi.
-
-    Parameters
-    ----------
-    mods: List[tvm.IRModule] or List[relay.function.Function]
-        The list of modules or functions to tune
-    params: List of dict of str to numpy array
-        The associated parameters of the programs
-    target: tvm.target.Target
-        The compilation target
-    target_host: tvm.target.Target
-        The host compilation target
-    ops: List[tvm.ir.Op] or None
-        List of relay ops to be tuned.  If not specified, all tunable ops will be extracted.
-
-    Returns
-    -------
-    task: Array of autotvm.task.Task
-        collected tasks
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-    from tvm import topi
-
-    env = TaskExtractEnv.get()
-
-    # merge target and target host
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    # run compiler to collect all TOPI calls during compilation
-    env.reset(ops)
-    with env:
-        # disable logger temporarily
-        old_state = logger.disabled
-        logger.disabled = True
-
-        for mod, param in zip(mods, params):
-            if isinstance(mod, relay.function.Function):
-                mod = tvm.IRModule.from_expr(mod)
-            assert isinstance(
-                mod, tvm.IRModule
-            ), "only support relay Module or Function to be tuned"
-            relay.backend.te_compiler.get().clear()
-            # wrap build call in thread to avoid multiprocessing problems
-            build_thread = threading.Thread(target=_lower, args=(mod, target, param))
-            build_thread.start()
-            build_thread.join()
-            relay.backend.te_compiler.get().clear()
-            # Clear the warning message cache in FallbackContext
-            if isinstance(DispatchContext.current, FallbackContext):
-                DispatchContext.current.memory = {}
-                DispatchContext.warning_messages = set()
-
-        logger.disabled = old_state
-
-    # create tasks for target
-    tasks = []
-    for task_name, args in env.get_tasks():
-        try:
-            tsk = create(task_name, args, target=target)
-            tasks.append(tsk)
-        except topi.InvalidShapeError:
-            logger.warning("Invalid shape during AutoTVM task creation")
-
-    return tasks
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
deleted file mode 100644
index e81bad694683..000000000000
--- a/python/tvm/autotvm/task/space.py
+++ /dev/null
@@ -1,1444 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=too-few-public-methods,invalid-name,unused-argument,arguments-differ
-# pylint: disable=consider-using-enumerate,too-many-lines, invalid-sequence-index
-"""
-Template configuration space.
-
-Each template function can be parameterized by a ConfigSpace.
-The space is declared when we invoke the template function with ConfigSpace.
-During evaluation, we pass in a ConfigEntity, which contains a specific
-entity in the space. This entity contains deterministic parameters.
-"""
-from __future__ import absolute_import as _abs
-
-import itertools
-import functools
-import math
-from collections import namedtuple, OrderedDict
-from random import randrange
-import numpy as np
-
-from tvm.te import schedule, thread_axis
-from tvm.tir import expr
-from tvm.autotvm.utils import get_const_int
-
-Axis = namedtuple("Axis", ["space", "index"])
-
-try:
-    _long = long
-except NameError:
-    _long = int
-
-
-class InstantiationError(ValueError):
-    """Actively detected error in instantiating a template with a config,
-    raised by cfg.raise_error
-    e.g. too many unrolling, too many threads in a block
-    """
-
-
-class TransformSpace(object):
-    """Base class for transform space
-    TransformSpace is the node in the computation graph of axes
-
-    .. note::
-
-        We can regard our schedule code as a transformation graph of axes.
-        Starting from raw axes in the definition of te.compute, we can transform these axes
-        by some operators. The operator includes 'split', 'reorder' and 'annotate'.
-        Each operator has some tunable parameters (e.g. the split factor).
-        Then the tuning process is just to find good parameters of these op.
-
-    So all the combinations of the parameters of these op form our search space.
-
-    Naming convention:
-    We call the set of all possible values as XXXSpace. (XXX can be Split, Reorder, Config ...)
-    We call a specific entity in a space as XXXEntity.
-    """
-
-    def __init__(self):
-        self.ins = []
-        self.num_output = 0
-        self.entities = []
-
-    def __len__(self):
-        return len(self.entities)
-
-    def __getitem__(self, index):
-        """Get an entity of the space by index
-
-        Parameters
-        ----------
-        index: int
-
-        Returns
-        -------
-        transform entity
-        """
-        return self.entities[index]
-
-    @staticmethod
-    def get_num_output():
-        """get number of output axes after this transform
-
-        Returns
-        -------
-        n: int
-            number of output axes
-        """
-        return 0
-
-
-class VirtualAxis(TransformSpace):
-    """Axis placeholder in template
-
-    Parameters
-    ----------
-    var: int or tvm.te.schedule.IterVar
-        If is int, return a virtual axis whose length is the provided argument.
-        If is IterVar, return a virtual axis whose length is extracted from
-        the IterVar's extent domain.
-
-    name: str
-    """
-
-    name_ct = 0
-
-    def __init__(self, var, name=None):
-        super(VirtualAxis, self).__init__()
-        self.num_output = 1
-
-        if name is None:
-            name = f"axis_{VirtualAxis.name_ct}"
-            VirtualAxis.name_ct += 1
-
-        self.name = name
-        if isinstance(var, (int, _long)):
-            self.length = var
-        elif isinstance(var, schedule.IterVar):
-            self.name = var.var.name
-            if var.dom is None:
-                self.length = -1
-            else:
-                self.length = get_const_int(var.dom.extent)
-        elif isinstance(var, VirtualAxis):
-            self.length = var.length
-        else:
-            raise RuntimeError("Invalid type of axis: " + str(type(var)))
-
-    @staticmethod
-    def get_num_output(var, name=None):
-        return 1
-
-    def __repr__(self):
-        return f"vaxis({self.name})"
-
-
-def get_factors(n):
-    """return all factors of an integer
-
-    Parameters
-    ----------
-    n: int
-        integer to factorize
-
-    Returns
-    -------
-    factors: list
-        List of all factors
-    """
-    step = 2 if n % 2 else 1
-    ret = list(
-        set(
-            functools.reduce(
-                list.__add__,
-                ([i, n // i] for i in range(1, int(math.sqrt(n)) + 1, step) if n % i == 0),
-            )
-        )
-    )
-    ret.sort()
-    return ret
-
-
-def get_pow2s(n):
-    """return all power-of-two numbers that are less or equal than the integer
-
-    Parameters
-    ----------
-    n: int
-        integer for reference
-
-    Returns
-    -------
-    factors: list
-        List of all power-of-two numbers
-    """
-    return [2**x for x in range(math.floor(math.log2(n)) + 1)]
-
-
-class SplitSpace(TransformSpace):
-    """Split an axis for several times"""
-
-    def __init__(self, axes, policy, **kwargs):
-        super(SplitSpace, self).__init__()
-        axis = axes[0]
-
-        self.policy = policy
-        self.entities = []
-
-        max_factor = kwargs.get("max_factor", 1 << 31)
-        fil = kwargs.get("filter", lambda x: True)
-        self.product = axis.length
-        self.num_output = kwargs.get("num_outputs", 0)
-        assert self.num_output > 0
-
-        if policy == "candidate":
-            for size in kwargs["candidate"]:
-                assert len(size) == self.num_output
-                self.entities.append(SplitEntity(size))
-        else:
-            if policy == "verbose":
-                # Include factors and power-of-twos. May generate tails.
-                divisibles = get_factors(self.product)
-                pow2s = get_pow2s(self.product)
-                factors = [x for x in list(set(divisibles) | set(pow2s)) if x <= max_factor]
-            elif policy == "factors":
-                # Include divisible factors. Guarantee no tails.
-                factors = [x for x in get_factors(self.product) if x <= max_factor]
-            elif policy == "power2":
-                # Include less, equal, and round-up power-of-two numbers. May generate tails.
-                factors = [x for x in get_pow2s(self.product) if x <= max_factor]
-            else:
-                raise RuntimeError(f"Invalid policy: {policy}")
-
-            # Enforce the product of all split factors equals to the axis length
-            no_tail = kwargs.get("no_tail", policy == "factors")
-
-            # Generate split entity by enumerating candidate factors.
-            self.factors = factors
-            self._generate_space(0, [None] * (self.num_output - 1), enforce_no_tail=no_tail)
-
-        self.entities = list(filter(fil, self.entities))
-
-    def _generate_space(self, now, tmp_stack, enforce_no_tail=False):
-        """Generate space by DFS"""
-        if now == self.num_output - 1:
-            prod = functools.reduce(lambda x, y: x * y, tmp_stack)
-            if prod > self.product:
-                return
-            if self.product % prod == 0 or (not enforce_no_tail and prod < self.product):
-                self.entities.append(SplitEntity([-1] + tmp_stack[::-1]))
-        else:
-            for factor in self.factors:
-                tmp_stack[now] = factor
-                self._generate_space(now + 1, tmp_stack, enforce_no_tail)
-
-    @staticmethod
-    def get_num_output(axes, policy, **kwargs):
-        return kwargs["num_outputs"]
-
-    def __repr__(self):
-        return "Split(policy=%s, product=%d, num_outputs=%d) len=%d" % (
-            self.policy,
-            self.product,
-            self.num_output,
-            len(self),
-        )
-
-
-class SplitEntity(object):
-    """
-    A split operation with detailed parameters
-    that can apply to an axis
-
-    Parameters
-    ----------
-    size: Array of int
-        the size of every axis after split.
-        e.g. an axis of extent 128, we split it into 3 axes, a possible
-        size is [4, 4, 8] (4x4x8 = 128).
-    """
-
-    def __init__(self, size):
-        self.size = size
-
-    def apply(self, sch, op, axis):
-        """Apply split to an axis
-
-        Parameters
-        ----------
-        sch: tvm.te.schedule.Schedule
-            The tvm schedule
-        op: tvm.te.Operation
-            The stage to be applied
-        axis: tvm.te.schedule.IterVar
-            axis to split
-
-        Returns
-        -------
-        axes : list of Axis
-            The transformed axes.
-        """
-        ret = []
-        for i in range(1, len(self.size)):
-            ax0, ax1 = sch[op].split(axis, int(np.prod(self.size[i:])))
-            ret.append(ax0)
-            axis = ax1
-        return ret + [axis]
-
-    def __repr__(self):
-        return str(self.size)
-
-
-class ReorderSpace(TransformSpace):
-    """The parameter space for ordering an array of axes"""
-
-    def __init__(self, axes, policy, **kwargs):
-        super(ReorderSpace, self).__init__()
-        self.ins = axes
-        self.policy = policy
-        self.num_output = len(axes)
-
-        if policy == "identity":
-            self.entities = [ReorderEntity(range(len(axes)))]
-        elif policy == "all":
-            self.entities = [ReorderEntity(x) for x in itertools.permutations(range(len(axes)))]
-        elif policy == "interval_all":
-            begin, end = kwargs["interval"]
-            sub_space = list(itertools.permutations(range(begin, end)))
-            prefix, suffix = tuple(range(begin)), tuple(range(end, len(axes)))
-            self.entities = [ReorderEntity(prefix + x + suffix) for x in sub_space]
-        elif policy == "candidate":
-            candidate = kwargs["candidate"]
-            for can in candidate:
-                perm = [axes.index(x) for x in can]
-                self.entities.append(ReorderEntity(perm))
-        elif policy == "interleave":
-            spatial, reduce = kwargs["spatial"], kwargs["reduce"]
-
-            spatial = [[axes.index(x) for x in ch] for ch in spatial]
-            reduce = [[axes.index(x) for x in ch] for ch in reduce]
-
-            outer_merged = self._merge_chain([x[:-1] for x in spatial])
-            inner_merged = self._merge_chain([x[-1:] for x in spatial] + reduce)
-
-            for o in outer_merged:
-                for i in inner_merged:
-                    self.entities.append(ReorderEntity(o + i))
-        elif policy == "interleave_cuda":
-            spatial, reduce = kwargs["spatial"], kwargs["reduce"]
-
-            spatial = [[axes.index(x) for x in ch] for ch in spatial]
-            reduce = [[axes.index(x) for x in ch] for ch in reduce]
-
-            outer_merged = self._merge_chain([x[:-1] for x in spatial])
-            reduce_merged = self._merge_chain(reduce)
-            inner_merged = [x[-1] for x in spatial]
-
-            for o in outer_merged:
-                for r in reduce_merged:
-                    self.entities.append(ReorderEntity(o + r + inner_merged))
-        else:
-            raise RuntimeError("Invalid policy: " + policy)
-
-    @staticmethod
-    def get_num_output(axes, policy, **kwargs):
-        return len(axes)
-
-    def __repr__(self):
-        return f"Reorder(policy={self.policy}) len={len(self)}"
-
-    def _merge_chain(self, chains):
-        """generate all combinations of merge some chains"""
-        merged = []
-        tmp_pt = [0] * len(chains)
-        tmp_stack = []
-
-        size = np.sum([len(x) for x in chains])
-        self._merge_dfs(chains, size, tmp_pt, tmp_stack, merged)
-        return merged
-
-    def _merge_dfs(self, chains, size, tmp_pt, tmp_stack, merged):
-        if np.sum(tmp_pt) == size:
-            merged.append(list(tmp_stack))
-            return
-
-        for i in range(len(chains)):
-            # use i == np.argmax(....) here to take spatial order into consideration
-            # if we don't want to consider spatial order, we can use tmp_pt[i] == np.max(....)
-            if tmp_pt[i] < len(chains[i]) and (
-                i == np.argmax([len(chains[x]) - tmp_pt[x] for x in range(len(chains))])
-            ):
-                tmp_stack.append(chains[i][tmp_pt[i]])
-                tmp_pt[i] += 1
-                self._merge_dfs(chains, size, tmp_pt, tmp_stack, merged)
-                tmp_pt[i] -= 1
-                tmp_stack.pop()
-
-
-class ReorderEntity(object):
-    """A reorder operation with detailed parameters that can apply to axes
-
-    Parameters
-    ----------
-    perm: Array of int
-        define the permutation
-    """
-
-    def __init__(self, perm):
-        self.perm = perm
-
-    def apply(self, sch, op, axes):
-        """Apply reorder to an array of axes
-
-        Parameters
-        ----------
-        sch: tvm.te.schedule.Schedule
-            The tvm schedule
-        op: tvm.te.Operation
-            The stage to be applied
-        axis: tvm.te.schedule.IterVar
-            axis to split
-
-        Returns
-        -------
-        axes : list of Axis
-            The transformed axes.
-        """
-        if len(axes) == len(self.perm):
-            new_order = [axes[i] for i in self.perm]
-        else:
-            new_order = [axes[i] for i in self.perm if i < len(axes)]
-        sch[op].reorder(*new_order)
-        return new_order
-
-    def __repr__(self):
-        return str(self.perm)
-
-
-class AnnotateSpace(TransformSpace):
-    """The parameter space for annotating an array of axes"""
-
-    def __init__(self, axes, policy, **kwargs):
-        super(AnnotateSpace, self).__init__()
-
-        self.ins = axes
-        self.policy = policy
-        self.num_output = len(axes)
-
-        if policy == "bind_gpu":
-            self.num_axis = len(axes)
-            if self.num_axis >= 6:
-                self.entities.append(
-                    AnnotateEntity(
-                        ["fuse"] * (self.num_axis - 6)
-                        + [
-                            "blockIdx.z",
-                            "blockIdx.y",
-                            "blockIdx.x",
-                            "threadIdx.z",
-                            "threadIdx.y",
-                            "threadIdx.x",
-                        ]
-                    )
-                )
-            elif self.num_axis >= 4:
-                self.entities.append(
-                    AnnotateEntity(
-                        ["fuse"] * (self.num_axis - 4)
-                        + ["blockIdx.y", "blockIdx.x", "threadIdx.y", "threadIdx.x"]
-                    )
-                )
-            elif self.num_axis >= 2:
-                self.entities.append(
-                    AnnotateEntity(["fuse"] * (self.num_axis - 2) + ["blockIdx.x", "threadIdx.x"])
-                )
-            else:
-                raise RuntimeError("Unhandled case in bind_gpu")
-        elif policy == "bind_gpu_virtual":
-            self.num_axis = len(axes)
-            if self.num_axis >= 9:
-                self.entities.append(
-                    AnnotateEntity(
-                        ["fuse"] * (self.num_axis - 9)
-                        + [
-                            "blockIdx.z",
-                            "blockIdx.y",
-                            "blockIdx.x",
-                            "vthread",
-                            "vthread",
-                            "vthread",
-                            "threadIdx.z",
-                            "threadIdx.y",
-                            "threadIdx.x",
-                        ]
-                    )
-                )
-            elif self.num_axis >= 6:
-                self.entities.append(
-                    AnnotateEntity(
-                        ["fuse"] * (self.num_axis - 6)
-                        + [
-                            "blockIdx.y",
-                            "blockIdx.x",
-                            "vthread",
-                            "vthread",
-                            "threadIdx.y",
-                            "threadIdx.x",
-                        ]
-                    )
-                )
-            elif self.num_axis >= 3:
-                self.entities.append(
-                    AnnotateEntity(
-                        ["fuse"] * (self.num_axis - 3) + ["blockIdx.x", "vthread", "threadIdx.x"]
-                    )
-                )
-            else:
-                raise RuntimeError("Unhandled case in bind_gpu")
-        elif policy == "locate_cache":
-            self.num_axis = len(axes)
-            num_anchor = kwargs["num_anchor"]
-            self.anns = list(itertools.combinations(range(self.num_axis), num_anchor))
-            self.entities = [AnnotateEntity(x) for x in self.anns]
-        else:  # none, vec, unroll, try_vec, try_unroll, try_vec_unroll, ...
-            anns = policy.replace("try", "none").split("_")
-
-            for ann in anns:
-                if ann not in ["none", "unroll", "vec"]:
-                    raise RuntimeError("Invalid policy: " + policy)
-
-            self.num_axis = len(axes)
-            self.anns = [anns] * self.num_axis
-            self._generate_space(0, [""] * self.num_axis)
-
-    def _generate_space(self, now, tmp_stack):
-        """Generate space by DFS"""
-        if now == self.num_axis:
-            # only vectorize inner most dimension
-            vec_ct = tmp_stack.count("vec")
-            if vec_ct in (0, 1):
-                self.entities.append(AnnotateEntity(list(tmp_stack)))
-        else:
-            for ann in self.anns[now]:
-                tmp_stack[now] = ann
-                self._generate_space(now + 1, tmp_stack)
-
-    @staticmethod
-    def get_num_output(axes, policy, **kwargs):
-        return len(axes)
-
-    def __repr__(self):
-        return f"Annotate(policy={self.policy}) len={len(self)}"
-
-
-class AnnotateEntity(object):
-    """An annotation operation with detailed parameters that can apply to axes
-
-    Parameters
-    ----------
-    anns: Array of string
-        The annotations of axes
-    """
-
-    def __init__(self, anns):
-        self.anns = anns
-
-    def apply(
-        self, sch, op, axes, axis_lens=None, max_unroll=None, vec_size=None, cfg=None, source=None
-    ):
-        """Apply annotation to an array of axes
-
-        Parameters
-        ----------
-        sch: tvm.te.schedule.Schedule
-            The tvm schedule
-        op: tvm.te.Operation
-            The stage to be applied
-        axes: Array of tvm.te.schedule.IterVar
-            axis to split
-        axis_lens: Array of int, optional
-            the length of axes
-        max_unroll: int, optional
-            maximum unroll step
-        vec_size: Array of int, optional
-            valid vector lanes for vectorization
-        cfg: ConfigEntity, optional
-            cfg for recording error
-        source: Array of Array tensor, optional
-            source tensor for attaching cache
-
-        Returns
-        -------
-        axes : list of tvm.te.schedule.IterVar
-            The transformed axes
-        """
-        if source is not None:  # special case : attach cache_read/cache_write
-            for src, to in zip(source, self.anns):
-                for t in src:
-                    sch[t].compute_at(sch[op], axes[to])
-        else:  # other cases
-            for i, ann in enumerate(self.anns):
-                if ann == "none":
-                    pass
-                elif ann == "unroll":
-                    if max_unroll and axis_lens[i] > max_unroll:
-                        cfg.raise_error("Too large factor for unrolling")
-                    sch[op].unroll(axes[i])
-                elif ann == "vec":
-                    if vec_size and axis_lens[i] not in vec_size:
-                        cfg.raise_error("Wrong size of lanes in vectorization")
-                    sch[op].vectorize(axes[i])
-                elif ann == "blockIdx.x":
-                    sch[op].bind(axes[i], thread_axis("blockIdx.x"))
-                elif ann == "blockIdx.y":
-                    sch[op].bind(axes[i], thread_axis("blockIdx.y"))
-                elif ann == "blockIdx.z":
-                    sch[op].bind(axes[i], thread_axis("blockIdx.z"))
-                elif ann == "threadIdx.x":
-                    sch[op].bind(axes[i], thread_axis("threadIdx.x"))
-                elif ann == "threadIdx.y":
-                    sch[op].bind(axes[i], thread_axis("threadIdx.y"))
-                elif ann == "threadIdx.z":
-                    sch[op].bind(axes[i], thread_axis("threadIdx.z"))
-                elif ann == "vthread":
-                    sch[op].bind(axes[i], thread_axis("vthread"))
-                elif ann == "fuse":
-                    assert i < len(axes) - 1
-                    axes[i + 1] = sch[op].fuse(axes[i], axes[i + 1])
-                else:
-                    raise RuntimeError("Invalid annotation " + ann)
-        return axes
-
-    def __repr__(self):
-        return str(self.anns)
-
-
-class OtherOptionSpace(TransformSpace):
-    """The parameter space for general option"""
-
-    def __init__(self, axes, policy, **kwargs):
-        super(OtherOptionSpace, self).__init__()
-
-        candidate = kwargs["candidate"]
-        self.entities = [OtherOptionEntity(x) for x in candidate]
-
-    @staticmethod
-    def get_num_output(axes, policy, **kwargs):
-        return 0
-
-    def __repr__(self):
-        return f"OtherOption({self.entities}) len={len(self)}"
-
-
-class OtherOptionEntity(object):
-    """The parameter entity for general option, with a detailed value"""
-
-    def __init__(self, val):
-        self.val = val
-
-    def __repr__(self):
-        return str(self.val)
-
-
-class ConfigSpace(object):
-    """The configuration space of a schedule. Pass it as config in template to
-    collect transformation space and build transform graph of axes
-    """
-
-    def __init__(self):
-        # private dict to provide sugar
-        self.space_map = OrderedDict()  # name -> space
-        self._collect = True
-        self._length = None
-        self._range_length = None
-        self._dims = None
-        self._entity_map = OrderedDict()  # name -> entity
-        self._constraints = []
-        self.errors = []
-        self.code_hash = None
-        self.flop = 0
-        self.cost = None
-        self.is_fallback = False
-        self._shared_filter = None
-        self._shared_filter_cache = None
-
-    @staticmethod
-    def axis(var):
-        """get a virtual axis (axis placeholder)
-
-        Parameters
-        ----------
-        var: int or tvm.te.schedule.IterVar
-            If is int, return an axis whose length is the provided argument.
-            If is IterVar, return an axis whose length is extracted from the
-            IterVar's extent domain.
-        """
-        return VirtualAxis(var)
-
-    reduce_axis = axis
-
-    def define_split(self, name, axis, policy="factors", **kwargs):
-        """Define a new tunable knob which splits an axis into a list of axes
-
-        Parameters
-        ----------
-        name: str
-            name to index the entity of this space
-        axis: tvm.te.schedule.IterVar
-            axis to split
-        policy: str
-            name of policy.
-            If is 'factors', the tuner will try all divisible factors.
-            If is 'power2', the tuner will try power-of-two factors less or equal to the length.
-            If is 'verbose', the tuner will try all candidates in above two policies.
-            If is 'candidate', try given candidates.
-        **kwargs:
-            extra arguments for policy
-
-            ``max_factor``:
-                the maximum split factor (`int`).
-            ``filter``:
-                see examples below for how to use filter (`Callable[[int], bool]`).
-            ``num_outputs``:
-                the total number of axis after split (`int`).
-            ``no_tail``:
-                should we only include divisible numbers as split factors (`bool`).
-            ``candidate``:
-                (policy=candidate) manual candidate list (`List`).
-
-        Examples
-        --------
-        >>> # use custom candidates
-        >>> cfg.define_split('tile_x', x, policy='candidate', num_outputs=3,
-        >>>   candidate=[[1, 4, 4], [4, 1, 4]])
-
-        >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4
-        >>> cfg.define_split('tile_y', y, policy='factors', num_outputs=3,
-        >>>   filter=lambda x: x.size[-1] <= 4)
-        """
-        axes = [axis]
-        return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs)
-
-    def define_reorder(self, name, axes, policy, **kwargs):
-        """Define a new tunable knob which reorders a list of axes
-
-        Parameters
-        ----------
-        name: str
-            name to index the entity of this space
-        axes: Array of tvm.te.schedule.IterVar
-            axes to reorder
-        policy: str
-            name of policy
-            If is 'identity', do an identity permutation.
-            If is 'all', try all permutations.
-            If is 'interval_all', try all permutations of an interval of axes.
-            If is 'candidate', try listed candidate.
-            If is 'interleave', interleave chains of spatial axes and chains of reduction axes.
-        kwargs: dict
-            extra arguments for policy
-        """
-        return self._add_new_transform(ReorderSpace, name, axes, policy, **kwargs)
-
-    def define_annotate(self, name, axes, policy, **kwargs):
-        """Define a new tunable knob which annotates a list of axes
-
-        Parameters
-        ----------
-        name: str
-            name to index the entity of this space
-        axes: Array of tvm.te.schedule.IterVar
-            axes to annotate
-        policy: str
-            name of policy
-            If is 'unroll', unroll the axes.
-            If is 'try_unroll', try to unroll the axes.
-            If is 'try_unroll_vec', try to unroll or vectorize the axes.
-            If is 'bind_gpu', bind the first few axes to gpu threads.
-            If is 'locate_cache', choose n axes to attach shared/local cache.
-        kwargs: dict
-            extra arguments for policy
-        """
-        return self._add_new_transform(AnnotateSpace, name, axes, policy, **kwargs)
-
-    def define_knob(self, name, candidate):
-        """Define a tunable knob with a list of candidates
-
-        Parameters
-        ----------
-        name: str
-            name key of that option
-        candidate: list
-            list of candidates
-        """
-        return self._add_new_transform(OtherOptionSpace, name, [], None, candidate=candidate)
-
-    def add_flop(self, flop):
-        """Add float operation statistics for this tuning task
-
-        Parameters
-        ---------
-        flop: int or float or IntImm or FloatImm
-            number of float operations
-        """
-        if isinstance(flop, (expr.IntImm, expr.FloatImm)):
-            flop = flop.value
-        self.flop += float(flop)
-
-    def raise_error(self, msg):
-        """register error in config
-        Using this to actively detect error when scheduling.
-        Otherwise these error will occur during runtime, which
-        will cost more time.
-
-        Parameters
-        ----------
-        msg: str
-        """
-        self.errors.append(msg)
-
-    def valid(self):
-        """Check whether the config meets all the constraints
-
-        .. note::
-
-            This check should be called after instantiation of task,
-            because the ConfigEntity/ConfigSpace collects errors during instantiation
-
-        Returns
-        -------
-        valid: bool
-            whether the config meets all the constraints
-        """
-        return not bool(self.errors)
-
-    def is_index_valid(self, index):
-        """Checks if the index satisfies the multi_filter condition
-
-        Parameters
-        ----------
-        index: int
-            index from the range of the space
-
-        Returns
-        -------
-        valid: bool
-            whether the index meets all the constraints
-        """
-        assert 0 <= index < self.range_length
-        if self._shared_filter is None:
-            return True
-        if self._shared_filter_cache is None:
-            self._make_shared_filter_cache()
-        return self._shared_filter_cache[index]
-
-    def multi_filter(self, filter):  # pylint: disable=redefined-builtin
-        """The filter can restrict combination of parameters in difference to the knob filter,
-        that restricts only single parameter
-
-        Parameters
-        ----------
-        filter: function
-            predicate with one argument (Callable[[int], bool])
-
-        .. note::
-
-            Using this filter causes additional restrictions on the use of __len__.
-            Normally, it define the count of valid indexes and the range of space, but when
-            multi_filter enabled, it requires to use __len__ for getting the count of valid
-            indexes or range_length for the range of space. It is recommended to use:
-            ``is_index_valid``, ``get_next_index``, ``get_rand_index`` to bypass the space
-
-        Examples
-        --------
-        >>> # Pre-requisites
-        >>> candidates = [[16, 64], [32, 32], [64, 16]]
-        >>> filter = lambda v: v.size[0] != 16
-        >>> multi_filter = lambda e: (e["tile_x"].size[0] + e["tile_y"].size[0]) <= 64
-
-        >>> # Case 1 - without filtering
-        >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates)
-        >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates)
-        >>> # [('tile_x', [16, 64]), ('tile_y', [16, 64])],None,0
-        >>> # [('tile_x', [32, 32]), ('tile_y', [16, 64])],None,1
-        >>> # [('tile_x', [64, 16]), ('tile_y', [16, 64])],None,2
-        >>> # [('tile_x', [16, 64]), ('tile_y', [32, 32])],None,3
-        >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,4
-        >>> # [('tile_x', [64, 16]), ('tile_y', [32, 32])],None,5
-        >>> # [('tile_x', [16, 64]), ('tile_y', [64, 16])],None,6
-        >>> # [('tile_x', [32, 32]), ('tile_y', [64, 16])],None,7
-        >>> # [('tile_x', [64, 16]), ('tile_y', [64, 16])],None,8
-
-        >>> # Case 2 - with filter
-        >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates,
-        >>>   filter=filter)
-        >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates,
-        >>>   filter=filter)
-        >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,0
-        >>> # [('tile_x', [64, 16]), ('tile_y', [32, 32])],None,1
-        >>> # [('tile_x', [32, 32]), ('tile_y', [64, 16])],None,2
-        >>> # [('tile_x', [64, 16]), ('tile_y', [64, 16])],None,3
-
-        >>> # Case 3 - with filter and multi_filter
-        >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates,
-        >>>   filter=filter)
-        >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates,
-        >>>   filter=filter)
-        >>> cfg.multi_filter(filter=multi_filter)
-        >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,0
-        """
-        if self._collect:
-            self.clear_cache()
-            self._shared_filter = filter
-
-    @property
-    def range_length(self):
-        """Length of the index range in the space"""
-        if self._range_length is None:
-            self._range_length = int(np.prod([len(x) for x in self.space_map.values()]))
-        return self._range_length
-
-    @property
-    def dims(self):
-        """Dimensions in the space"""
-        if self._dims is None:
-            self._dims = [len(x) for x in self.space_map.values()]
-        return self._dims
-
-    def subrange_length(self, start, end):
-        """Returns the number of valid indexes within the limited range from [start, end]
-
-        Parameters
-        ----------
-        start: int
-            start of subrange, inclusive
-        end: int
-            end of subrange, exclusive
-
-        Returns
-        -------
-        count: int
-            number of valid indexes
-        """
-        assert 0 <= start <= end <= self.range_length
-        if self._shared_filter is None:
-            return end - start
-        if self._shared_filter_cache is None:
-            self._make_shared_filter_cache()
-        return self._shared_filter_cache[start:end].count(True)
-
-    def get_rand_index(self, start=None, end=None, to_exclude=None):
-        """Returns a random valid index unlisted to exclusion
-
-        Parameters
-        ----------
-        start: int, optional
-            specifying at which position to start, inclusive
-        end: int, optional
-            specifying at which position to end, exclusive
-        to_exclude: list, optional
-            determines unsuitable values
-
-        Returns
-        -------
-        rand: int
-            random index in the space
-
-        .. note::
-
-            Excluding all valid space indexes will lead to an infinite loop.
-
-        """
-        start = start or 0
-        end = end or self.range_length
-        while True:
-            index = randrange(start, end)
-            if self.is_index_valid(index) and index not in (to_exclude or []):
-                return index
-
-    def get_next_index(self, index, n=1, start=None, end=None):
-        """Returns the nth valid next index or None if out of range
-
-        Parameters
-        ----------
-        index: int
-            specifying at which position to start, inclusive
-        n: int, optional
-            step by using to find the next index, for the opposite
-            direction a negative number should be used
-        start: list, optional
-            start of subrange, inclusive
-        end: list, optional
-            end of subrange, exclusive
-
-        Returns
-        -------
-        next: int
-            next index in the space
-        """
-        assert n != 0
-        start = start or 0
-        end = end or self.range_length
-        if self._shared_filter is None:
-            index += n
-            if start <= index < end:
-                return index
-            return None
-        trend = 1 if n > 0 else -1
-        counter = abs(n)
-        while counter != 0:
-            index += trend
-            if index < start or index >= end:
-                return None
-            if self.is_index_valid(index):
-                counter -= 1
-        return index
-
-    def clear_cache(self):
-        """Clears the cache of index validity"""
-        del self._shared_filter_cache
-        self._dims = None
-        self._length = None
-        self._range_length = None
-        self._shared_filter_cache = None
-
-    def _make_shared_filter_cache(self):
-        def apply(t):
-            entities = OrderedDict()
-            for name, space in self.space_map.items():
-                entities[name] = space[t % len(space)]
-                t //= len(space)
-            return bool(self._shared_filter(entities))
-
-        self._shared_filter_cache = tuple(apply(i) for i in range(self.range_length))
-        self._length = self._shared_filter_cache.count(True)
-
-    def point2knob(self, point):
-        """Convert point form (single integer) to knob (vector)
-
-        Parameters
-        ----------
-        point: int
-            point to convert
-
-        Returns
-        -------
-        knob: list
-            knob representation of the point
-        """
-        knob = []
-        for dim in self.dims:
-            knob.append(point % dim)
-            point //= dim
-        return knob
-
-    def knob2point(self, knob):
-        """Convert knob form (vector) to point form (single integer)
-
-        Parameters
-        ----------
-        knob: list
-            knob to convert
-
-        Returns
-        -------
-        point: int
-            point of the knob representation
-        """
-        point = 0
-        for j, k in enumerate(knob):
-            point += int(np.prod(self.dims[:j])) * k
-        return point
-
-    def sample_ints(self, m):
-        """
-        Sample m different integer numbers from [0, self.range_length) without replacement
-        This function is an alternative of `np.random.choice` when self.range_length > 2 ^ 32, in
-        which case numpy does not work.
-
-        Parameters
-        ----------
-        m: int
-            The number of sampled int
-
-        Returns
-        -------
-        ints: an numpy array of size m
-        """
-        assert m <= len(self)
-        vis = set()
-        while len(vis) < m:
-            new = randrange(0, self.range_length)
-            if self.is_index_valid(new):
-                vis.add(new)
-        return np.fromiter(vis, int, len(vis))
-
-    def random_walk(self, point):
-        """random walk as local transition
-
-        Parameters
-        ----------
-        point: int
-            index of the ConfigEntity
-
-        Returns
-        -------
-        new_point: int
-            new neighborhood index
-        """
-        # transform to knob form
-        old_knob = self.point2knob(point)
-        new_knob = old_knob.copy()
-        new_point = self.knob2point(new_knob)
-        # mutate
-        while new_knob == old_knob or not self.is_index_valid(new_point):
-            from_i = np.random.randint(len(old_knob))
-            to_v = np.random.randint(self.dims[from_i])
-            new_knob[from_i] = to_v
-            new_point = self.knob2point(new_knob)
-        # transform to index form
-        return new_point
-
-    def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
-        """Add a new transform space in template"""
-        # if we do not have tuned info (_collect == True) but defined KNOB value
-        # for "default" scheduling before call of _add_new_transform, in this case
-        # no need to create new space and override previously pointed KNOB values
-        if kwargs.get("filter"):
-            self.clear_cache()
-        if self._collect and not (self.is_fallback and name in self._entity_map):
-            # convert schedule axis to space definition axis
-            axes = [x if isinstance(x, (VirtualAxis, Axis)) else self.axis(x) for x in axes]
-
-            # add subspace (knob)
-            space = space_class(axes, policy, **kwargs)
-            self.space_map[name] = space
-            self._entity_map[name] = space[0]
-            return [Axis(space, i) for i in range(space.num_output)]
-        return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))]
-
-    def __len__(self):
-        """Returns the number of valid indexes in the space"""
-        if self._shared_filter is None:
-            return self.range_length
-        if self._shared_filter_cache is None:
-            self._make_shared_filter_cache()
-        return self._length
-
-    def get(self, index):
-        """Get a config entity with detailed parameters from this space
-
-        Parameters
-        ----------
-        index: int
-            index in the space
-
-        Returns
-        -------
-        config: ConfigEntity
-            config corresponds to the index
-        """
-        if index < 0 or index >= self.range_length:
-            raise IndexError(f"Index out of range: size {self.range_length}, got index {index}")
-        if not self.is_index_valid(index):
-            raise IndexError(
-                f"Index does not correspond to the multi-filter condition, got index {index}. "
-                f"Use is_index_valid to pre-check"
-            )
-        entities = OrderedDict()
-        t = index
-        for name, space in self.space_map.items():
-            entities[name] = space[t % len(space)]
-            t //= len(space)
-        ret = ConfigEntity(index, self.code_hash, entities, self._constraints)
-        return ret
-
-    def __iter__(self):
-        return self._entity_map.__iter__()
-
-    def __getitem__(self, name):
-        """get the transform entity(knob) of this entity by name
-           do not use this to get a ConfigEntity of this space (should use ConfigSpace.get instead)
-
-        Parameters
-        ----------
-        name: str
-            name of the transform
-        """
-        return self._entity_map[name]
-
-    def __repr__(self):
-        res = f"ConfigSpace (len={len(self)}, range_length={self.range_length}, space_map=\n"
-        for i, (name, space) in enumerate(self.space_map.items()):
-            res += f"  {i:2d} {name}: {space}\n"
-        return res + ")"
-
-
-_ann_to_number = {
-    "none": 0,
-    "vec": 1,
-    "unroll": 2,
-    "blockIdx.x": 3,
-    "blockIdx.y": 4,
-    "blockIdx.z": 5,
-    "threadIdx.x": 6,
-    "threadIdx.y": 7,
-    "threadIdx.z": 8,
-    "vthread": 9,
-    "fuse": 10,
-}
-
-
-class ConfigEntity(ConfigSpace):
-    """A configuration with detailed parameters
-
-    Parameters
-    ----------
-    index: int
-        index of this config in space
-    code_hash: str
-        hash of schedule code
-    entity_map: dict
-        map name to transform entity
-    constraints : list
-        List of constraints
-    """
-
-    def __init__(self, index, code_hash, entity_map, constraints):
-        super(ConfigEntity, self).__init__()
-        self.index = index
-        self._collect = False
-        self._entity_map = entity_map
-        self._space_map = None
-        self._constraints = constraints
-        self.code_hash = code_hash
-
-    def get_flatten_feature(self):
-        """flatten entities to a numerical one-dimensional feature vector
-
-        Returns
-        -------
-        fea: np.array
-            one dimensional float32 array
-        """
-        fea = []
-        for _, v in self._entity_map.items():
-            if isinstance(v, SplitEntity):
-                fea.extend(v.size)
-            elif isinstance(v, ReorderEntity):
-                # use a naive way: directly copy the permutation
-                fea.extend(v.perm)
-            elif isinstance(v, AnnotateEntity):
-                # one-hot encoding
-                for ann in v.anns:
-                    tmp = [0] * len(_ann_to_number)
-                    tmp[_ann_to_number[ann]] = 1
-                    fea.extend(tmp)
-            elif isinstance(v, OtherOptionEntity):
-                fea.append(v.val)
-        return np.array(fea, dtype=np.float32)
-
-    def get_other_option(self):
-        """
-        Returns
-        -------
-        other_option: dict
-            other tunable parameters (tunable parameters defined by `cfg.define_knob`)
-        """
-        return {x: x.val for x in self._entity_map.values() if isinstance(x, OtherOptionEntity)}
-
-    def to_json_dict(self):
-        """convert to a json serializable dictionary
-
-        Return
-        ------
-        json_dict: dict
-            a json serializable dictionary
-        """
-        ret = {}
-        ret["index"] = int(self.index)
-        ret["code_hash"] = self.code_hash
-        entity_map = []
-        for k, v in self._entity_map.items():
-            if isinstance(v, SplitEntity):
-                entity_map.append((k, "sp", v.size))
-            elif isinstance(v, ReorderEntity):
-                entity_map.append((k, "re", v.perm))
-            elif isinstance(v, AnnotateEntity):
-                entity_map.append((k, "an", v.anns))
-            elif isinstance(v, OtherOptionEntity):
-                entity_map.append((k, "ot", v.val))
-            else:
-                raise RuntimeError("Invalid entity instance: " + v)
-        ret["entity"] = entity_map
-        return ret
-
-    @staticmethod
-    def from_json_dict(json_dict):
-        """Build a ConfigEntity from json serializable dictionary
-
-        Parameters
-        ----------
-        json_dict: dict
-            Json serializable dictionary. This should be the return value
-            of :any:`to_json_dict`.
-
-        Returns
-        -------
-        config: ConfigEntity
-            The corresponding config object
-
-        """
-        index = json_dict["index"]
-        code_hash = json_dict["code_hash"]
-        constraints = []
-        entity_map = OrderedDict()
-
-        for item in json_dict["entity"]:
-            key, knob_type, knob_args = item
-            if knob_type == "sp":
-                entity = SplitEntity(knob_args)
-            elif knob_type == "re":
-                entity = ReorderEntity(knob_args)
-            elif knob_type == "an":
-                entity = AnnotateEntity(knob_args)
-            elif knob_type == "ot":
-                entity = OtherOptionEntity(knob_args)
-            else:
-                raise RuntimeError("Invalid config knob type: " + knob_type)
-            entity_map[str(key)] = entity
-
-        return ConfigEntity(index, code_hash, entity_map, constraints)
-
-    def __repr__(self):
-        return f"{str(self._entity_map)[12:-1]},{self.code_hash},{self.index}"
-
-
-class FallbackConfigEntity(ConfigSpace):
-    """The config entity created to support fallback"""
-
-    def __init__(self):
-        super(FallbackConfigEntity, self).__init__()
-        self.is_fallback = True
-
-    def fallback_split(self, name, constraints):
-        """Fallback a split knob
-
-        Parameters
-        ----------
-        name: str
-            name of the knob
-        constraints: List of int
-            The maximum tile size for every dimension. Value `-1` means no constraint.
-
-        Examples
-        --------
-        If you use cfg.define_split('tile_0', 128, num_outputs=3),
-        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [4, 8, 4]
-
-        If you use cfg.define_split('tile_0', 49, num_outputs=3),
-        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [7, 7, 1]
-        """
-        space = self.space_map[name]
-        assert isinstance(space, SplitSpace)
-        assert len(constraints) == space.num_output
-
-        # '-1' means no constraint
-        constraints = [x if x != -1 else 1e10 for x in constraints]
-
-        entity = self._entity_map[name]
-        now = space.product
-
-        for i in reversed(range(space.num_output)):
-            factors = get_factors(now)
-
-            find = len(factors) - 1
-            for j, f in enumerate(factors):
-                if f > constraints[i]:
-                    find = j - 1
-                    break
-
-            if find >= 0:
-                entity.size[i] = factors[find]
-                now //= factors[find]
-            else:
-                raise RuntimeError("Cannot find feasible fallback split entity for node: " + name)
-
-    def fallback_with_reference_log(self, ref_log):
-        """A data driven fallback mechanism.
-        We use tuned parameters from TopHub as reference data.
-        For an unseen shape, we find the most similar tuned one from TopHub and
-        mimic its parameters.
-        Note that we are not matching by workload (e.g., input size, kernel size),
-        but instead matching by configuration space. The idea is that if two workloads have
-        similar configuration space, their optimal configurations are also likely to be similar.
-
-        Parameters
-        ----------
-        ref_log: List of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult)
-            The reference log
-        """
-        knob_names = [x for x in self.space_map.keys() if isinstance(self.space_map[x], SplitSpace)]
-
-        # find best match config in reference data by matching tiling factors
-        factor_list = []
-        for knob_name in knob_names:
-            factor_list.append(get_factors(self.space_map[knob_name].product))
-
-        best_match_cfg = None
-        best_match_score = 0
-        for inp, _ in ref_log:
-            match_score = 0
-            for i, knob_name in enumerate(knob_names):
-                factors = get_factors(int(np.prod(inp.config[knob_name].size)))
-                match_score += float(len(set(factor_list[i]).intersection(factors))) / len(
-                    factor_list[i]
-                )
-
-                if match_score > best_match_score:
-                    best_match_score, best_match_cfg = match_score, inp.config
-
-        if best_match_cfg is None:
-            return
-
-        # mimic its tiling strategy
-        for knob_name in knob_names:
-            constraint = list(best_match_cfg[knob_name].size)
-            constraint[0] = -1
-            self.fallback_split(knob_name, constraint)
-
-        # copy other knobs
-        for knob_name in self.space_map.keys():
-            if not isinstance(self.space_map[knob_name], SplitSpace):
-                self._entity_map[knob_name] = best_match_cfg[knob_name]
-
-    def __setitem__(self, name, entity):
-        """set the entity(knob) of by name
-
-        Parameters
-        ----------
-        name: str
-            name of the entity
-        entity: SplitEntity, ReorderEntity, AnnotateEntity, OtherOptionEntity
-            value of the entity
-        """
-        self._entity_map[name] = entity
-
-    def __repr__(self):
-        return f"{str(self._entity_map)[12:-1]},{self.code_hash}"
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
deleted file mode 100644
index 575325c80e5b..000000000000
--- a/python/tvm/autotvm/task/task.py
+++ /dev/null
@@ -1,628 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-variable,not-callable
-"""Definition of task function.
-
-Task can be constructed from tuple of func, args, and kwargs.
-func is a state-less function, or a string that
-registers the standard task.
-"""
-import functools
-
-import numpy as np
-
-from tvm import runtime
-from tvm.ir import container
-from tvm.target import Target
-from tvm.te import placeholder, tensor
-from tvm.tir import expr
-
-
-from ..utils import get_const_int, get_const_tuple
-from .dispatcher import ApplyConfig, DispatchContext
-from .space import ConfigSpace
-
-
-def _lookup_task(name):
-    task = TASK_TABLE.get(name)
-    if task is None:
-        # Unable to find the given task. This might be because we are
-        # creating a task based on a name that has not been imported.
-        # Rather than raising an exception here, we return a dummy
-        # task which cannot be invoked.
-        task = MissingTask(name)
-    return task
-
-
-def serialize_args(args):
-    """serialize arguments of a topi function to a hashable tuple.
-
-    Parameters
-    ----------
-    args: list of hashable or Tensor
-    """
-
-    def _encode(x):
-        if isinstance(x, tensor.Tensor):
-            return ("TENSOR", get_const_tuple(x.shape), x.dtype)
-        if isinstance(x, (tuple, list, container.Array)):
-            return tuple([_encode(a) for a in x])
-        if isinstance(x, (str, int, float, expr.Var, expr.Any)):
-            return x
-        if isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)):
-            return x.value
-        if isinstance(x, runtime.container.String):
-            return str(x)
-        if x is None:
-            return None
-        raise RuntimeError(
-            f'Do not support type "{type(x)}" in argument. Consider to use'
-            f"primitive types or tvm.tir.Var only"
-        )
-
-    ret = []
-    for t in args:
-        ret.append(_encode(t))
-    return tuple(ret)
-
-
-def deserialize_args(args):
-    """The inverse function of :code:`serialize_args`.
-
-    Parameters
-    ----------
-    args: list of hashable or Tensor
-    """
-    ret = []
-    for t in args:
-        if isinstance(t, tuple) and t[0] == "TENSOR":
-            ret.append(placeholder(shape=t[1], dtype=t[2]))
-        else:
-            ret.append(t)
-    return ret
-
-
-def args_to_workload(args, task_name=None):
-    """Convert argument list to hashable workload tuple.
-    This function will convert list to tuple, tvm node to python value and
-    flatten te.tensor.Tensor to a tuple
-
-    Parameters
-    ----------
-    task_name : str
-        The AutoTVM task name
-
-    args : list of args
-        The arguments to the function
-
-    Returns
-    -------
-    ret: hashable
-        The hashable value
-    """
-    return (task_name,) + serialize_args(args) if task_name is not None else serialize_args(args)
-
-
-class Task(object):
-    """A Tunable Task
-
-    Parameters
-    ----------
-    name: str
-        The name of the task.
-    args: Tuple
-        Positional argument of func
-    """
-
-    def __init__(self, name, args):
-        self.name = name
-        self.args = args
-        self.kwargs = {}  # currently unused
-
-        # init null config space
-        self.config_space = None
-        self.func = _lookup_task(name)
-
-        # auxiliary info, available after `init_space` is called
-        self.flop = None
-        self.target = None
-        self.target_host = None
-
-    @property
-    def workload(self):
-        return (self.name,) + serialize_args(self.args)
-
-    def instantiate(self, config):
-        """Instantiate this task function (template) with a config.
-        Returns corresponding schedule.
-
-        Parameters
-        ----------
-        config: template.ConfigEntity
-            parameter config for this template
-
-        Returns
-        -------
-        sch: tvm.te.schedule.Schedule
-            The tvm schedule
-        arg_bufs: Array of te.tensor.Tensor
-            The input/output buffers
-        """
-        config.flop = 0
-        with ApplyConfig(config):
-            sch, arg_bufs = self.func(*self.args, **self.kwargs)
-        if not self.flop:
-            config.flop = config.flop or compute_flop(sch)
-            self.flop = config.flop
-        return sch, arg_bufs
-
-    def __getstate__(self):
-        # custom pickle implementation is required for
-        # some unpickable local task functions.
-        # So we only pickle the name of the function
-        # and restore the function by name when unpickling it.
-        import cloudpickle  # pylint: disable=import-outside-toplevel
-
-        self.target, self.target_host = Target.canon_target_and_host(self.target, self.target_host)
-        return {
-            "name": self.name,
-            "args": self.args,
-            "kwargs": self.kwargs,
-            "config_space": self.config_space,
-            "flop": self.flop,
-            "target": self.target,
-            "target_host": self.target_host,
-            "func": cloudpickle.dumps(self.func),
-        }
-
-    def __setstate__(self, state):
-        import cloudpickle  # pylint: disable=import-outside-toplevel
-
-        self.name = state["name"]
-        self.args = state["args"]
-        self.kwargs = state["kwargs"]
-        self.config_space = state["config_space"]
-        self.func = cloudpickle.loads(state["func"])
-        self.flop = state["flop"]
-        self.target, self.target_host = Target.canon_target_and_host(
-            state["target"], state["target_host"]
-        )
-
-    def __repr__(self):
-        return "Task(func_name=%s, args=%s, kwargs=%s, workload=%s)" % (
-            self.name,
-            self.args,
-            self.kwargs,
-            self.workload,
-        )
-
-
-TASK_TABLE = {}
-
-
-class TaskTemplate(object):
-    """
-    Task template is used to creates a tunable AutoTVM task.
-
-    It can be defined by a pair of compute and schedule function using
-    `_register_task_compute` and `_register_task_schedule`,
-    or by a customized task creation function that is more flexible using
-    `_register_customized_task`.
-
-    Note that when customized func is registered, compute and schedule function
-    will be ignored
-    """
-
-    def __init__(self):
-        self.fcompute = None
-        self.fschedule = None
-        self.fcustomized = None
-
-    def __call__(self, *args, **kwargs):
-        args = deserialize_args(args)
-        if self.fcustomized is None:
-            return self._default_func(*args, **kwargs)
-        assert callable(self.fcustomized)
-        return self.fcustomized(*args, **kwargs)
-
-    def _default_func(self, *args, **kwargs):
-        assert callable(self.fcompute) and callable(self.fschedule)
-        out = self.fcompute(*args, **kwargs)
-        arg_bufs = [out] + self._get_inputs(out)
-        s = self.fschedule([out])
-        return s, arg_bufs
-
-    @staticmethod
-    def _get_inputs(out):
-        inputs = []
-        queue = [out]
-        hash_set = set()
-        while queue:
-            t = queue.pop(0)
-            if isinstance(t.op, tensor.PlaceholderOp):
-                inputs.append(t)
-            else:
-                input_tensors = [t for t in t.op.input_tensors if t not in hash_set]
-                queue.extend(input_tensors)
-                hash_set.update(input_tensors)
-        return inputs
-
-
-class MissingTask(TaskTemplate):
-    """
-    Dummy task template for a task lookup which cannot be resolved.
-    This can occur if the task being requested from _lookup_task()
-    has not been imported in this run.
-    """
-
-    def __init__(self, taskname: str):
-        super().__init__()
-        self._taskname = taskname
-
-    def __call__(self, *args, **kwargs):
-        raise RuntimeError(
-            f"Attempting to invoke a missing task {self._taskname}."
-            "It is possible that the function is registered in a "
-            "Python module that is not imported in this run, or the log is out-of-date."
-        )
-
-
-def _register_task_compute(name, func=None):
-    """Register compute function to autotvm task
-
-    Parameters
-    ----------
-    name: str
-        The task name
-
-    func: None or callable
-        If it is None, return a decorator.
-        If is callable, decorate this function.
-
-    Returns
-    -------
-    decorator: callable
-        A decorator
-    """
-
-    def _do_reg(f):
-        if name not in TASK_TABLE:
-            TASK_TABLE[name] = TaskTemplate()
-        tmpl = TASK_TABLE[name]
-        if tmpl.fcompute is not None:
-            raise ValueError(f"Compute is already registered in autoTVM task {name}")
-        tmpl.fcompute = f
-        return f
-
-    if func:
-        return _do_reg(func)
-    return _do_reg
-
-
-def _register_task_schedule(name, func=None):
-    """Register schedule function to autotvm task
-
-    Parameters
-    ----------
-    name: str
-        The task name
-
-    func: None or callable
-        If it is None, return a decorator.
-        If is callable, decorate this function.
-
-    Returns
-    -------
-    decorator: callable
-        A decorator
-    """
-
-    def _do_reg(f):
-        if name not in TASK_TABLE:
-            TASK_TABLE[name] = TaskTemplate()
-        tmpl = TASK_TABLE[name]
-        if tmpl.fschedule is not None:
-            raise ValueError(f"Schedule is already registered in autoTVM task {name}")
-        tmpl.fschedule = f
-        return f
-
-    if func:
-        return _do_reg(func)
-    return _do_reg
-
-
-def _register_customized_task(name, func=None):
-    """Register a customized function to AutoTVM task.
-
-    Parameters
-    ----------
-    name: str
-        The task name
-
-    func: None or callable
-        If it is None, return a decorator.
-        If is callable, decorate this function.
-
-    Returns
-    -------
-    decorator: callable
-        A decorator
-    """
-
-    def _do_reg(f):
-        if name not in TASK_TABLE:
-            TASK_TABLE[name] = TaskTemplate()
-        tmpl = TASK_TABLE[name]
-        if tmpl.fcustomized is not None:
-            raise ValueError(f"Customized func is already registered in autoTVM task {name}")
-        tmpl.fcustomized = f
-        return f
-
-    if func:
-        return _do_reg(func)
-    return _do_reg
-
-
-def template(task_name, func=None):
-    """Decorate a function as a tunable schedule template.
-
-    Parameters
-    ----------
-    task_name: str
-        The task name
-
-    func: None or callable
-        A callable template function.
-        If it is None, return a decorator.
-        If is callable, decorate this function.
-
-    Returns
-    -------
-    func: callable
-        The decorated function
-
-    Examples
-    --------
-    The following code is a tunable template for a blocked matrix multiplication
-
-    .. code-block:: python
-
-        @autotvm.template("matmul")
-        def matmul(N, L, M, dtype):
-            A = te.placeholder((N, L), name='A', dtype=dtype)
-            B = te.placeholder((L, M), name='B', dtype=dtype)
-
-            k = te.reduce_axis((0, L), name='k')
-            C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
-            s = te.create_schedule(C.op)
-
-            # schedule
-            y, x = s[C].op.axis
-            k = s[C].op.reduce_axis[0]
-
-            ##### define space begin #####
-            cfg = autotvm.get_config()
-            cfg.define_split("tile_y", y, num_outputs=2)
-            cfg.define_split("tile_x", x, num_outputs=2)
-            ##### define space end #####
-
-            # schedule according to config
-            yo, yi = cfg["tile_y"].apply(s, C, y)
-            xo, xi = cfg["tile_x"].apply(s, C, x)
-
-            s[C].reorder(yo, xo, k, yi, xi)
-
-            return s, [A, B, C]
-    """
-
-    def _decorate(f):
-        @functools.wraps(f)
-        def wrapper(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            workload = args_to_workload(args, task_name)
-            tgt = Target.current()
-            cfg = DispatchContext.current.query(tgt, workload)
-            with ApplyConfig(cfg):
-                return f(*args, **kwargs)
-
-        _register_customized_task(task_name, f)
-        return wrapper
-
-    if func:
-        return _decorate(func)
-    return _decorate
-
-
-def create(task_name, args, target, target_host=None):
-    """Create a tuning task and initialize its search space
-
-    Parameters
-    ----------
-    task_name : str
-        The AutoTVM task name
-    args : List
-        Positional arguments
-    target : Target
-        The compilation target
-    target_host: Target, optional
-        The compilation target for host side
-
-    Returns
-    -------
-    tsk: Task
-        a task object
-    """
-    args = serialize_args(args)
-    ret = Task(task_name, args)
-
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    # init config space
-    ret.config_space = ConfigSpace()
-
-    ctx = ApplyConfig(ret.config_space)
-    with ctx:
-        with target:
-            sch, _ = ret.func(*args)
-            ret.config_space.code_hash = getattr(sch, "code_hash", None)
-
-    ret.flop = ret.config_space.flop or compute_flop(sch)
-    ret.target = target
-    ret.target_host = target_host
-
-    return ret
-
-
-def get_config():
-    """Get current config object
-
-    Returns
-    -------
-    cfg: ConfigSpace or ConfigEntity
-        The current config
-    """
-    tgt = Target.current(allow_none=True)
-    return DispatchContext.current.query(tgt, None)
-
-
-class FlopCalculationError(RuntimeError):
-    """Error happens when estimating FLOP for a compute op"""
-
-
-def compute_flop(sch):
-    """Calculate number of FLOP (floating number operations) of the compute ops in a schedule
-
-    Parameters
-    ----------
-    sch: tvm.te.schedule.Schedule
-        schedule
-
-    Returns
-    -------
-    flop: int
-        number of FLOP in this schedule
-    """
-
-    def _prod_length(axes):
-        """compute product of the lengths of a list of axes"""
-        try:
-            num_iter = int(np.prod([get_const_int(axis.dom.extent) for axis in axes]))
-        except ValueError:
-            raise FlopCalculationError("The length of axis is not constant. ")
-        return num_iter
-
-    def _count_flop(exp):
-        """compute flop for a single expression"""
-        if isinstance(exp, expr.Reduce):
-            num_iter = _prod_length(exp.axis)
-            combiner = exp.combiner.result
-            source = exp.source
-            if len(combiner) != 1:
-                raise FlopCalculationError("Found multiple output in the combiner of reduce op")
-            if len(source) != 1:
-                raise FlopCalculationError("Found multiple output in the source of reduce op")
-            return num_iter * (_count_flop(combiner[0]) + _count_flop(source[0]))
-        if isinstance(exp, (expr.FloatImm, expr.IntImm)):
-            return 0
-        if isinstance(exp, expr.Cast):
-            return _count_flop(exp.value)
-        if isinstance(exp, expr.Var):
-            return 0
-        if isinstance(
-            exp,
-            (
-                expr.Add,
-                expr.Sub,
-                expr.Mul,
-                expr.Div,
-                expr.Mod,
-                expr.FloorDiv,
-                expr.FloorMod,
-                expr.Max,
-                expr.Min,
-                expr.EQ,
-                expr.NE,
-                expr.LT,
-                expr.LE,
-                expr.GT,
-                expr.GE,
-                expr.And,
-                expr.Or,
-                expr.Not,
-            ),
-        ):
-            base = 1
-
-            if isinstance(exp, expr.Not):  # unary
-                return base + _count_flop(exp.a)
-
-            return base + _count_flop(exp.a) + _count_flop(exp.b)
-        if isinstance(exp, expr.Select):
-            return _count_flop(exp.condition) + max(
-                _count_flop(exp.true_value), _count_flop(exp.false_value)
-            )
-        if isinstance(exp, expr.ProducerLoad):
-            # Ignore flops from indexing expressions.
-            return 0
-
-        if isinstance(exp, expr.Call):
-            return sum([_count_flop(x) for x in exp.args])
-
-        raise FlopCalculationError("Found unsupported operator in the compute expr")
-
-    def traverse(ops):
-        """accumulate flops"""
-        ret = 0
-        for op in ops:
-            if isinstance(op, tensor.ComputeOp):
-                num_element = _prod_length(op.axis)
-
-                body = op.body
-                if len(body) != 1:
-                    raise FlopCalculationError("Found multiple output in the compute")
-                exp = body[0]
-
-                ret += num_element * _count_flop(exp)
-                ret += traverse([t.op for t in op.input_tensors])
-
-            elif isinstance(op, tensor.PlaceholderOp):
-                pass
-            else:
-                raise FlopCalculationError(
-                    f"{op.name} is not supported by autotvm. "
-                    "Only support te.compute currently. "
-                    "Other ops like tvm.te.scan/te.extern is not supported"
-                )
-        return ret
-
-    try:
-        ret = traverse(sch.outputs)
-    except FlopCalculationError as exc:
-        raise RuntimeError(
-            "FLOP estimator fails for this operator. Error msg: "
-            + str(exc)
-            + ". Please use `cfg.add_flop` to manually set "
-            "FLOP for this operator"
-        )
-
-    if ret == 0:
-        raise RuntimeError(
-            "Cannot find float number operation in this operator. "
-            "Please use `cfg.add_flop` to manually set "
-            "FLOP for this operator"
-        )
-    return ret
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
deleted file mode 100644
index a4f3636edbbe..000000000000
--- a/python/tvm/autotvm/task/topi_integration.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-variable,invalid-name,unused-argument
-"""
-Decorators for registering tunable templates to TOPI.
-
-These decorators can make your simple implementation be able to use different configurations
-for different workloads.
-Here we directly use all arguments to the TOPI call as "workload", so make sure all the arguments
-(except tvm.te.Tensor) in you calls are hashable. For tvm.te.Tensor,
-we will serialize it to a hashable tuple.
-
-See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
-"""
-import functools
-
-import tvm.te._ffi_api
-from tvm.target import Target
-from tvm.te import tensor
-
-from .task import (
-    args_to_workload,
-    serialize_args,
-    DispatchContext,
-    _register_task_compute,
-    _register_task_schedule,
-)
-
-
-# Task extractor for relay program
-class TaskExtractEnv:
-    """Global environment for extracting tuning tasks from graph"""
-
-    current = None
-    registered = None
-
-    def __init__(self, allow_duplicate=False):
-        self.allow_duplicate = allow_duplicate
-        self.task_collection = []
-        self.wanted_relay_ops = None
-        self.modified_funcs = []
-        self.tracing = False
-
-    def __enter__(self):
-        self.task_collection = []
-        self.tracing = True
-
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.tracing = False
-
-    def reset(self, wanted_relay_ops=None):
-        """Reset task collections
-
-        Parameters
-        ----------
-        wanted_relay_ops: List of tvm.ir.Op
-            The relay ops to be extracted
-        """
-        self.task_collection = []
-        self.wanted_relay_ops = wanted_relay_ops
-
-    def add_task(self, task_name, args):
-        """Add AutoTVM task
-
-        Parameters
-        ----------
-        task_name: str
-            AutoTVM task name.
-
-        args: tuple
-            Arguments to the TOPI function.
-        """
-        key = (task_name, serialize_args(args))
-        if self.allow_duplicate or key not in self.task_collection:
-            self.task_collection.append(key)
-
-    def get_tasks(self):
-        """Get collected tasks
-
-        Returns
-        -------
-        tasks: List of tuple(name, args)
-            A list of tasks extracted from the graph
-        """
-        return self.task_collection
-
-    @staticmethod
-    def get(allow_duplicate=False):
-        """Get the single instance of TaskExtractEnv
-
-        Parameters
-        ----------
-        allow_duplicate : boolean
-            Whether to fetch all workloads in the network,
-            even though some of them are the same. This is
-            useful for graph tuning.
-
-        Returns
-        -------
-        env: TaskExtractEnv
-            The single instance of TaskExtractEnv
-        """
-        if not TaskExtractEnv.current:
-            TaskExtractEnv.current = TaskExtractEnv(allow_duplicate)
-        else:
-            TaskExtractEnv.current.allow_duplicate = allow_duplicate
-        return TaskExtractEnv.current
-
-
-def register_topi_compute(task_name, func=None):
-    """Register a tunable template for a topi compute function.
-
-    The registration will wrap this topi compute to take `cfg` as the first argument,
-    followed by the original argument list. It uses all its argument as workload and
-    stores this "workload" to its final ComputeOp, which can be used to reconstruct
-    "workload" in the following topi_schedule call.
-
-    Parameters
-    ----------
-    task_name: str
-        The AutoTVM task name
-
-    func: None or callable
-        If it is None, return a decorator.
-        If is callable, decorate this function.
-
-    Returns
-    -------
-    decorator: callable
-        A decorator
-
-    Examples
-    --------
-    See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
-    """
-
-    def _decorate(topi_compute):
-        @functools.wraps(topi_compute)
-        @_register_task_compute(task_name)
-        def wrapper(*args, **kwargs):
-            """wrapper function for topi compute"""
-            assert not kwargs, "Do not support kwargs in template function call"
-            task_env = TaskExtractEnv.current
-            if task_env is not None and task_env.tracing:
-                task_env.add_task(task_name, args)
-            workload = args_to_workload(args, task_name)
-            tgt = Target.current()
-            cfg = DispatchContext.current.query(tgt, workload)
-            node = topi_compute(cfg, *args)
-
-            # attach workload to return op
-            op = node.op
-            attrs = {}
-            for k, v in node.op.attrs.items():
-                attrs[k] = v
-            attrs["workload"] = workload
-            if isinstance(op, tensor.ComputeOp):
-                op = tvm.te._ffi_api.ComputeOp(op.name, op.tag, attrs, op.axis, op.body)
-            elif isinstance(op, tensor.ExternOp):
-                op = tvm.te._ffi_api.ExternOp(
-                    op.name,
-                    op.tag,
-                    attrs,
-                    op.inputs,
-                    op.input_placeholders,
-                    op.output_placeholders,
-                    op.body,
-                )
-            else:
-                raise RuntimeError("Unsupported op type: " + str(type(op)))
-
-            if isinstance(node, tensor.Tensor):
-                return op.output(0)
-            return [op.output(i) for i in range(len(node))]
-
-        return wrapper
-
-    if func:
-        return _decorate(func)
-    return _decorate
-
-
-def register_topi_schedule(task_name, func=None):
-    """Register a tunable template for a topi schedule function.
-
-    The registration will wrap this topi schedule to take `cfg` as the first argument,
-    followed by the original argument list.
-
-    Note that this function will try to find "workload" from all the ComputeOp in the input.
-    You can attach "workload" to your compute op by using :any:`register_topi_compute`.
-
-    The task name has to be the same as that of the corresponding topi compute function.
-
-    Parameters
-    ----------
-    task_name: str
-        The AutoTVM task name
-
-    func: None or callable
-        If it is None, return a decorator.
-        If is callable, decorate this function.
-
-    Returns
-    -------
-    decorator: callable
-        A decorator
-
-    Examples
-    --------
-    See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
-    """
-
-    def _decorate(topi_schedule):
-        @functools.wraps(topi_schedule)
-        @_register_task_schedule(task_name)
-        def wrapper(outs, *args, **kwargs):
-            """wrapper function for topi schedule"""
-            workload = get_workload(outs, task_name)
-            if workload is None:
-                raise RuntimeError(
-                    f"Cannot find TOPI workload {task_name}. "
-                    "Is it registered with `register_topi_compute`?"
-                )
-            tgt = Target.current()
-            cfg = DispatchContext.current.query(tgt, workload)
-            return topi_schedule(cfg, outs, *args, **kwargs)
-
-        return wrapper
-
-    if func:
-        return _decorate(func)
-    return _decorate
-
-
-def get_workload(outs, task_name=None):
-    """Retrieve the workload from outputs"""
-    visited = set()
-
-    def traverse(tensors):
-        """traverse all ops to find attached workload"""
-        for t in tensors:
-            op = t.op
-            if op in visited:
-                continue
-            visited.add(op)
-            wkl = traverse(op.input_tensors)
-            if wkl is not None:
-                return wkl
-
-            if "workload" in op.attrs:
-                ret = args_to_workload(op.attrs["workload"])
-                if task_name is None or ret[0] == task_name:
-                    return ret
-        return None
-
-    outs = [outs] if isinstance(outs, tensor.Tensor) else outs
-    return traverse(outs)
diff --git a/python/tvm/autotvm/testing/__init__.py b/python/tvm/autotvm/testing/__init__.py
deleted file mode 100644
index 972d0cbaae5c..000000000000
--- a/python/tvm/autotvm/testing/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Testing utilities for autotvm"""
diff --git a/python/tvm/autotvm/testing/tune_relay.py b/python/tvm/autotvm/testing/tune_relay.py
deleted file mode 100644
index 916b2a800b2d..000000000000
--- a/python/tvm/autotvm/testing/tune_relay.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import argparse
-import json
-import os
-import warnings
-
-import tvm
-from tvm import autotvm
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.autotvm.graph_tuner import DPTuner
-from tvm.autotvm.tuner import XGBTuner
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data
-from tvm.support import describe
-from tvm.testing.utils import strtobool
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--workload",
-        type=str,
-        required=True,
-        help="The name of the workload to tune. Supported models: "
-        "https://github.com/apache/tvm/blob/main/python/tvm/meta_schedule/testing/relay_workload.py#L303-L322",  # pylint: disable=line-too-long
-    )
-    args.add_argument(
-        "--input-shape",
-        type=str,
-        required=True,
-        help="The input shape of the workload. Example: '[1, 3, 224, 224]'",
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-        help="The target device to tune. "
-        "Example: 'aws/cpu/c5.9xlarge', 'nvidia/nvidia-v100', 'nvidia/geforce-rtx-3090'",
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-        help="The number of trials per kernel. Example: 800",
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        required=True,
-        help="The host address of the RPC tracker. Example: 192.168.6.66",
-    )
-    args.add_argument(
-        "--rpc-port", type=int, required=True, help="The port of the RPC tracker. Example: 4445"
-    )
-    args.add_argument(
-        "--rpc-key", type=str, required=True, help="The key of the RPC tracker. Example: '3090ti'"
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-        help="The working directory to store the tuning logs. Example: '/tmp/tune_relay'",
-    )
-    args.add_argument(
-        "--layout",
-        type=str,
-        default=None,
-        help="The layout of the workload. Example: 'NCHW', 'NHWC'",
-    )
-    args.add_argument("--cache-dir", type=str, default=None)
-    args.add_argument("--number", type=int, default=3)
-    args.add_argument("--repeat", type=int, default=1)
-    args.add_argument("--min-repeat-ms", type=int, default=100)
-    args.add_argument(
-        "--cpu-flush",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    args.add_argument(
-        "--graph-tuner",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    args.add_argument(
-        "--backend", type=str, choices=["graph", "vm"], help="example: graph / vm", required=True
-    )
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.input_shape = json.loads(parsed.input_shape)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=600,
-    )
-    return parsed
-
-
-ARGS = _parse_args()
-
-
-def main():
-    if ARGS.target.kind.name != "llvm" and ARGS.graph_tuner:
-        raise ValueError("GraphTuner only supports llvm target")
-    if ARGS.target.kind.name != "llvm" and ARGS.cpu_flush:
-        raise ValueError("cpu_flush only supports llvm target")
-    if ARGS.target.kind.name == "llvm" and not ARGS.cpu_flush:
-        warnings.warn("cpu_flush is not enabled for llvm target")
-
-    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
-    graph_opt_sch_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}_graph_opt.log")
-    measure_option = autotvm.measure_option(
-        builder=autotvm.LocalBuilder(),
-        runner=autotvm.RPCRunner(
-            key=ARGS.rpc_key,
-            host=ARGS.rpc_host,
-            port=ARGS.rpc_port,
-            number=ARGS.number,
-            repeat=ARGS.repeat,
-            min_repeat_ms=ARGS.min_repeat_ms,
-            enable_cpu_cache_flush=ARGS.cpu_flush,
-        ),
-    )
-    describe()
-    print(f"Workload: {ARGS.workload}")
-    mod, params, (input_name, input_shape, input_dtype) = get_network(
-        ARGS.workload, ARGS.input_shape, layout=ARGS.layout, cache_dir=ARGS.cache_dir
-    )
-    input_info = [{"name": input_name, "shape": input_shape, "dtype": input_dtype}]
-    input_data = {
-        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in input_info
-    }
-    for item in input_info:
-        print(f"  input_name : {item['name']}")
-        print(f"  input_shape: {item['shape']}")
-        print(f"  input_dtype: {item['dtype']}")
-
-    with ms.Profiler() as profiler:
-        with ms.Profiler.timeit("TaskExtraction"):
-            # extract workloads from relay program
-            tasks = autotvm.task.extract_from_program(
-                mod["main"],
-                target=ARGS.target,
-                params=params,
-                ops=(
-                    relay.op.get("nn.conv2d"),
-                    relay.op.get("nn.conv3d"),
-                    relay.op.get("nn.conv2d_transpose"),
-                    relay.op.get("nn.dense"),
-                    relay.op.get("nn.batch_matmul"),
-                ),
-            )
-            for i, task in enumerate(tasks):
-                print(f"Task {i} {task.name}: {task}")
-
-        with ms.Profiler.timeit("Tuning"):
-            if ARGS.num_trials > 0:
-                for i, task in enumerate(tasks):
-                    prefix = f"[Task {i + 1:2d}/{len(tasks):2d}] "
-                    tuner_obj = XGBTuner(task, loss_type="reg")
-                    n_trial = min(len(task.config_space), ARGS.num_trials)
-                    tuner_obj.tune(
-                        n_trial=n_trial,
-                        early_stopping=800,
-                        measure_option=measure_option,
-                        callbacks=[
-                            autotvm.callback.progress_bar(n_trial, prefix=prefix),
-                            autotvm.callback.log_to_file(log_file),
-                        ],
-                    )
-                if ARGS.graph_tuner:
-                    executor = DPTuner(
-                        graph=mod["main"],
-                        input_shapes={input_name: input_shape},
-                        records=log_file,
-                        target_ops=[relay.op.get("nn.conv2d")],
-                        target=ARGS.target,
-                    )
-                    executor.benchmark_layout_transform(min_exec_num=1000)
-                    executor.run()
-                    executor.write_opt_sch2record_file(graph_opt_sch_file)
-
-        relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend]
-        with ms.Profiler.timeit("PostTuningCompilation"):
-            if ARGS.graph_tuner:
-                ctx = autotvm.apply_graph_best(graph_opt_sch_file)
-            else:
-                ctx = autotvm.apply_history_best(log_file)
-            with ctx:
-                print("compile...")
-                with tvm.transform.PassContext(opt_level=3):
-                    lib = relay_build(mod, target=ARGS.target, params=params)
-    print("Tuning Time:")
-    print(profiler.table())
-
-    run_module_via_rpc(
-        rpc_config=ARGS.rpc_config,
-        lib=lib,
-        dev_type=ARGS.target.kind.name,
-        args=input_data,
-        continuation=create_timer(ARGS.backend),
-        backend=ARGS.backend,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
deleted file mode 100644
index 3cbb7ff0e103..000000000000
--- a/python/tvm/autotvm/tophub.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: skip-file
-"""
-TopHub: Tensor Operator Hub
-To get the best performance, we typically need auto-tuning for the specific devices.
-TVM releases pre-tuned parameters in TopHub for some common networks and hardware targets.
-TVM will download these parameters for you when you call relay.build.
-"""
-
-import logging
-from os import getenv
-import sys
-from pathlib import Path
-from tvm.ir.container import Array
-
-from .task import ApplyHistoryBest
-from ..target import Target
-from ..contrib.download import download
-from .record import load_from_file
-from .utils import EmptyContext
-
-# environment variable to read TopHub location
-AUTOTVM_TOPHUB_LOC_VAR = "TOPHUB_LOCATION"
-
-# default location of TopHub
-AUTOTVM_TOPHUB_DEFAULT_LOC = "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub"
-
-# value of AUTOTVM_TOPHUB_LOC_VAR to specify to not read from TopHub
-AUTOTVM_TOPHUB_NONE_LOC = "NONE"
-
-# root path to store TopHub files
-AUTOTVM_TOPHUB_ROOT_PATH = Path(Path("~").expanduser(), ".tvm", "tophub")
-
-# the version of each package
-PACKAGE_VERSION = {
-    "arm_cpu": "v0.08",
-    "llvm": "v0.04",
-    "cuda": "v0.10",
-    "rocm": "v0.05",
-    "opencl": "v0.04",
-    "mali": "v0.06",
-    "intel_graphics": "v0.02",
-    "vta": "v0.10",
-    "amd_apu": "v0.01",
-    "adreno": "v0.01",
-}
-
-logger = logging.getLogger("autotvm")
-
-
-def _alias(name):
-    """convert alias for some packages"""
-    table = {
-        "vtacpu": "vta",
-        "webgpu": "opencl",
-        "vulkan": "opencl",
-        "nvptx": "cuda",
-        "amd_apu": "amd_apu",
-        "adreno": "adreno",
-    }
-    return table.get(name, name)
-
-
-def _get_tophub_location():
-    location = getenv(AUTOTVM_TOPHUB_LOC_VAR, None)
-    return AUTOTVM_TOPHUB_DEFAULT_LOC if location is None else location
-
-
-def context(target, extra_files=None):
-    """Return the dispatch context with pre-tuned parameters.
-    This function will load the corresponding *.log files in AUTOTVM_TOPHUB_ROOT_PATH.
-    If cannot find them, it will download them from TopHub github repo.
-    Users can also add their own files in argument `extra_files`.
-
-    Parameters
-    ----------
-    target: Target or List of Target
-        The compilation targets
-    extra_files: list of str, optional
-        Extra log files to load
-    """
-    tophub_location = _get_tophub_location()
-    if tophub_location == AUTOTVM_TOPHUB_NONE_LOC:
-        return EmptyContext()
-
-    best_context = ApplyHistoryBest([])
-
-    targets = target if isinstance(target, (Array, list, tuple)) else [target]
-
-    for tgt in targets:
-        if isinstance(tgt, str):
-            tgt = Target(tgt)
-
-        # The TOPHUB file names rely on Target's device or kind. Both these types of
-        # information exist in Target.keys, but rules of filling this filed is not explicitly
-        # defined, we are afraid to rely only on Target.keys. At the same time Target.device
-        # is filled only if device was pointed explicitly in target string, that is not mandatory
-        # and in some cases we need to get information about device from Target.keys
-        # In priority order we verify:
-        # 1) Target.device
-        # 2) Target.keys
-        # 3) Target.kind
-        possible_names = []
-        device = tgt.attrs.get("device", "")
-        if device != "":
-            possible_names.append(_alias(device))
-        possible_names.extend(tgt.keys)
-        possible_names.append(tgt.kind.name)
-
-        all_packages = list(PACKAGE_VERSION.keys())
-        for name in possible_names:
-            name = _alias(name)
-            if name in all_packages:
-                if not check_backend(tophub_location, name):
-                    continue
-
-                filename = f"{name}_{PACKAGE_VERSION[name]}.log"
-                best_context.load(Path(AUTOTVM_TOPHUB_ROOT_PATH, filename))
-                break  # only load one file to avoid some fallback template mismatch problem
-
-    if extra_files:
-        for filename in extra_files:
-            best_context.load(filename)
-
-    return best_context
-
-
-def check_backend(tophub_location, backend):
-    """Check whether have pre-tuned parameters of the certain target.
-    If not, will download it.
-
-    Parameters
-    ----------
-    backend: str
-        The name of backend.
-
-    Returns
-    ----------
-    success: bool
-        Whether the check is successful.
-    """
-    backend = _alias(backend)
-    assert backend in PACKAGE_VERSION, f'Cannot find backend "{backend}" in TopHub'
-
-    version = PACKAGE_VERSION[backend]
-    package_name = f"{backend}_{version}.log"
-    if Path(AUTOTVM_TOPHUB_ROOT_PATH, package_name).is_file():
-        return True
-
-    # pylint: disable=import-outside-toplevel
-    if sys.version_info >= (3,):
-        import urllib.request as urllib2
-    else:
-        import urllib2
-    try:
-        download_package(tophub_location, package_name)
-        return True
-    except urllib2.URLError as e:
-        logging.warning("Failed to download tophub package for %s: %s", backend, e)
-        return False
-
-
-def download_package(tophub_location, package_name):
-    """Download pre-tuned parameters of operators for a backend
-
-    Parameters
-    ----------
-    tophub_location: str
-        The location to download TopHub parameters from
-
-    package_name: str
-        The name of package
-    """
-    rootpath = Path(AUTOTVM_TOPHUB_ROOT_PATH)
-    rootpath.mkdir(parents=True, exist_ok=True)
-
-    download_url = f"{tophub_location}/{package_name}"
-    logger.info("Download pre-tuned parameters package from %s", download_url)
-    download(download_url, Path(rootpath, package_name), overwrite=True)
-
-
-# global cache for load_reference_log
-REFERENCE_LOG_CACHE = {}
-
-
-def load_reference_log(backend, model, workload_name):
-    """Load reference log from TopHub to support fallback in template.
-    Template will use these reference logs to choose fallback config.
-
-    Parameters
-    ----------
-    backend: str
-        The backend name
-    model: str
-        The name of the device model
-    workload_name: str
-        The name of the workload. (The first item in the workload tuple)
-    """
-
-    backend = _alias(backend)
-    if backend not in PACKAGE_VERSION:
-        return []
-    version = PACKAGE_VERSION[backend]
-    package_name = f"{backend}_{version}.log"
-    filename = Path(AUTOTVM_TOPHUB_ROOT_PATH, package_name)
-
-    global REFERENCE_LOG_CACHE
-    key = (backend, model, workload_name)
-
-    if key not in REFERENCE_LOG_CACHE:
-        tmp = []
-        # If TOPHUB_LOCATION is not AUTOTVM_TOPHUB_NONE_LOC,
-        # Download the config file from tophub if not exists.
-        if not Path(filename).exists():
-            tophub_location = _get_tophub_location()
-            if tophub_location != AUTOTVM_TOPHUB_NONE_LOC:
-                download_package(tophub_location, package_name)
-        if Path(filename).is_file():  # in case download failed
-            find = False
-            inp = None
-            counts = {}
-            for inp, res in load_from_file(filename):
-                counts[inp.target.model] = counts.get(inp.target.model, 0) + 1
-                if model == inp.target.model:
-                    find = True
-                    break
-            # if device model is not find, use the device model with the most tuned workloads
-            if not find and counts:
-                model = max(counts.items(), key=lambda k: k[1])[0]
-
-            for inp, res in load_from_file(filename):
-                if model == inp.target.model and inp.task.workload[0] == workload_name:
-                    tmp.append((inp, res))
-        REFERENCE_LOG_CACHE[key] = tmp
-
-    return REFERENCE_LOG_CACHE[key]
diff --git a/python/tvm/autotvm/tuner/__init__.py b/python/tvm/autotvm/tuner/__init__.py
deleted file mode 100644
index a1f71d5bf51b..000000000000
--- a/python/tvm/autotvm/tuner/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-A tuner takes a task as input. It proposes some promising :any:`ConfigEntity`
-in the :any:`ConfigSpace` and measure them on the real hardware. Then it
-proposed the next batch of :any:`ConfigEntity` according to the measure results.
-This tuning loop is repeated.
-"""
-
-from . import callback
-from .ga_tuner import GATuner
-from .index_based_tuner import GridSearchTuner, RandomTuner
-from .tuner import Tuner
-from .xgboost_tuner import XGBTuner
-from .droplet_tuner import DropletTuner
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
deleted file mode 100644
index 40ee24e077b4..000000000000
--- a/python/tvm/autotvm/tuner/callback.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=consider-using-enumerate,invalid-name
-"""Namespace of callback utilities of AutoTVM"""
-import sys
-import time
-import logging
-
-import numpy as np
-
-from .. import record
-from ..utils import format_si_prefix
-
-logger = logging.getLogger("autotvm")
-
-
-def log_to_file(file_out, protocol="json"):
-    """Log the tuning records into file.
-    The rows of the log are stored in the format of autotvm.record.encode.
-
-    Parameters
-    ----------
-    file_out : File or str
-        The file to log to.
-    protocol: str, optional
-        The log protocol. Can be 'json' or 'pickle'
-
-    Returns
-    -------
-    callback : callable
-        Callback function to do the logging.
-    """
-
-    def _callback(_, inputs, results):
-        """Callback implementation"""
-        if isinstance(file_out, str):
-            with open(file_out, "a") as f:
-                for inp, result in zip(inputs, results):
-                    f.write(record.encode(inp, result, protocol) + "\n")
-        else:
-            for inp, result in zip(inputs, results):
-                file_out.write(record.encode(inp, result, protocol) + "\n")
-
-    # pylint: disable=import-outside-toplevel
-    from pathlib import Path
-
-    if isinstance(file_out, Path):
-        file_out = str(file_out)
-
-    return _callback
-
-
-def log_to_database(db):
-    """Save the tuning records to a database object.
-
-    Parameters
-    ----------
-    db: Database
-        The database
-    """
-
-    def _callback(_, inputs, results):
-        """Callback implementation"""
-        for inp, result in zip(inputs, results):
-            db.save(inp, result)
-
-    return _callback
-
-
-class Monitor(object):
-    """A monitor to collect statistic during tuning"""
-
-    def __init__(self):
-        self.scores = []
-        self.timestamps = []
-
-    def __call__(self, tuner, inputs, results):
-        for inp, res in zip(inputs, results):
-            if res.error_no == 0:
-                flops = inp.task.flop / np.mean(res.costs)
-                self.scores.append(flops)
-            else:
-                self.scores.append(0)
-
-            self.timestamps.append(res.timestamp)
-
-    def reset(self):
-        self.scores = []
-        self.timestamps = []
-
-    def trial_scores(self):
-        """get scores (currently is flops) of all trials"""
-        return np.array(self.scores)
-
-    def trial_timestamps(self):
-        """get wall clock time stamp of all trials"""
-        return np.array(self.timestamps)
-
-
-def progress_bar(total, prefix="", si_prefix="G"):
-    """Display progress bar for tuning
-
-    Parameters
-    ----------
-    total: int
-        The total number of trials
-    prefix: str
-        The prefix of output message
-    si_prefix: str
-        SI prefix for flops
-    """
-
-    class _Context(object):
-        """Context to store local variables"""
-
-        def __init__(self):
-            self.best_flops = 0
-            self.cur_flops = 0
-            self.ct = 0
-            self.total = total
-
-        def __del__(self):
-            if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
-                sys.stdout.write(" Done.\n")
-
-    ctx = _Context()
-    tic = time.time()
-
-    # Validate si_prefix argument
-    format_si_prefix(0, si_prefix)
-
-    if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
-        sys.stdout.write(
-            "\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) "
-            "| %.2f s" % (prefix, 0, 0, si_prefix, 0, total, time.time() - tic)
-        )
-        sys.stdout.flush()
-
-    def _callback(tuner, inputs, results):
-        ctx.ct += len(inputs)
-
-        flops = 0
-        for inp, res in zip(inputs, results):
-            if res.error_no == 0:
-                flops = inp.task.flop / np.mean(res.costs)
-
-        if not logger.isEnabledFor(logging.DEBUG):  # only print progress bar in non-debug mode
-            ctx.cur_flops = flops
-            ctx.best_flops = tuner.best_flops
-
-            sys.stdout.write(
-                "\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) "
-                "| %.2f s"
-                % (
-                    prefix,
-                    format_si_prefix(ctx.cur_flops, si_prefix),
-                    format_si_prefix(ctx.best_flops, si_prefix),
-                    si_prefix,
-                    ctx.ct,
-                    ctx.total,
-                    time.time() - tic,
-                )
-            )
-            sys.stdout.flush()
-
-    return _callback
diff --git a/python/tvm/autotvm/tuner/droplet_tuner.py b/python/tvm/autotvm/tuner/droplet_tuner.py
deleted file mode 100644
index d115353d773e..000000000000
--- a/python/tvm/autotvm/tuner/droplet_tuner.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tuner with droplet algorithm"""
-
-import logging
-import os
-
-import numpy as np
-
-from .tuner import Tuner
-
-LOGGER = logging.getLogger("autotvm")
-
-
-class DropletTuner(Tuner):
-    """Tuner with droplet algorithm.
-
-    Parameters
-    ----------
-    start_position: list of int
-        position initial of the space, the default is [0, 0, ..., 0]
-    pvalue: float
-        statistical value to confidence level, the default is 0.05
-    """
-
-    def __init__(self, task, start_position=None, pvalue=0.05):
-        super(DropletTuner, self).__init__(task)
-
-        # space info
-        self.space = task.config_space
-        self.dims = []
-
-        for _, v in self.space.space_map.items():
-            self.dims.append(len(v))
-        if len(self.dims) == 0:
-            self.dims.append(1)
-
-        # start position
-        start_position = [0] * len(self.dims) if start_position is None else start_position
-        self.best_choice = (-1, [0] * len(self.dims), [99999])
-        self.visited = set([self.space.knob2point(start_position)])
-        self.execution, self.total_execution, self.pvalue = 1, max(self.dims), pvalue
-        self.step, self.iter, self.batch = 1, 0, max(16, os.cpu_count())
-        self.next = [(self.space.knob2point(start_position), start_position)]
-
-    def num_to_bin(self, value, factor=1):
-        bin_format = str(0) * (len(self.dims) - len(bin(value)[2:])) + bin(value)[2:]
-        return [int(i) * factor for i in bin_format]
-
-    def search_space(self, factor=1):
-        search_space = []
-        for i in range(2 ** len(self.dims) - 1, 0, -1):
-            search_space += [self.num_to_bin(i, factor)] + [self.num_to_bin(i, -factor)]
-        return search_space
-
-    def next_pos(self, new_positions):
-        "returns the neighbors of the best solution"
-        next_set = []
-        for p in new_positions:
-            if len(next_set) > self.batch:
-                break
-            new_p = [
-                (x + y) % self.dims[i] if (x + y > 0) else 0
-                for i, (x, y) in enumerate(zip(p, self.best_choice[1]))
-            ]
-            idx_p = self.space.knob2point(new_p)
-            if idx_p not in self.visited:
-                self.visited.add(idx_p)
-                next_set.append((idx_p, new_p))
-        return next_set
-
-    def p_value(self, elem_1, elem_2):
-        if len(elem_1) <= 1 or len(elem_2) <= 1:
-            return True
-
-        from scipy import stats  # pylint: disable=import-outside-toplevel
-
-        return stats.ttest_ind(np.array(elem_1), np.array(elem_2)).pvalue <= self.pvalue
-
-    def next_batch(self, batch_size):
-        ret, self.batch = [], batch_size
-        for i in range(batch_size):
-            if i >= len(self.next):
-                break
-            if self.space.is_index_valid(self.next[i][0]):
-                ret.append(self.space.get(self.next[i][0]))
-        return ret
-
-    def speculation(self):
-        # Gradient descending direction prediction and search space filling
-        while len(self.next) < self.batch and self.execution < self.total_execution:
-            self.execution += self.step
-            self.next += self.next_pos(self.search_space(self.execution))
-
-    def update(self, inputs, results):
-        found_best_pos, count_valids = False, 0
-        for i, (_, res) in enumerate(zip(inputs, results)):
-            try:
-                if np.mean(self.best_choice[2]) > np.mean(res.costs) and self.p_value(
-                    self.best_choice[2], res.costs
-                ):
-                    self.best_choice = (self.next[i][0], self.next[i][1], res.costs)
-                    found_best_pos = True
-                count_valids += 1
-            except TypeError:
-                LOGGER.debug("Solution is not valid")
-                continue
-            else:
-                continue
-
-        self.next = self.next[self.batch : -1]
-        if found_best_pos:
-            self.next += self.next_pos(self.search_space())
-            self.execution = 1
-        self.speculation()
-        # stop, because all neighborhoods are invalid.
-        if count_valids == 0 and self.iter > 3:
-            self.next = []
-            LOGGER.warning(
-                f"Warning: early termination due to an all-invalid neighborhood \
-                after {self.iter} iterations"
-            )
-
-    def has_next(self):
-        return len(self.next) > 0
-
-    def load_history(self, data_set, min_seed_records=500):
-        pass
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
deleted file mode 100644
index ad5b87ac5d70..000000000000
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=consider-using-enumerate,invalid-name,abstract-method
-
-"""Tuner with genetic algorithm"""
-
-import numpy as np
-
-from .tuner import Tuner
-
-
-class GATuner(Tuner):
-    """Tuner with genetic algorithm.
-    This tuner does not have a cost model so it always run measurement on real machines.
-    This tuner expands the :code:`ConfigEntity` as gene.
-
-    Parameters
-    ----------
-    pop_size: int
-        number of genes in one generation
-    elite_num: int
-        number of elite to keep
-    mutation_prob: float
-        probability of mutation of a knob in a gene
-    """
-
-    def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1):
-        super(GATuner, self).__init__(task)
-
-        # algorithm configurations
-        self.pop_size = pop_size
-        self.elite_num = elite_num
-        self.mutation_prob = mutation_prob
-
-        assert elite_num <= pop_size, "The number of elites must be less than population size"
-
-        # random initialization
-        self.pop_size = min(self.pop_size, len(self.space))
-        self.elite_num = min(self.pop_size, self.elite_num)
-        self.visited = set(self.space.sample_ints(self.pop_size))
-
-        # current generation
-        self.genes = [self.space.point2knob(idx) for idx in self.visited]
-        self.scores = []
-        self.elites = []
-        self.elite_scores = []
-        self.trial_pt = 0
-
-    def next_batch(self, batch_size):
-        ret = []
-        while len(ret) < batch_size and self.has_next():
-            gene = self.genes[self.trial_pt % self.pop_size]
-            self.trial_pt += 1
-            ret.append(self.space.get(self.space.knob2point(gene)))
-        return ret
-
-    def update(self, inputs, results):
-        for inp, res in zip(inputs, results):
-            if res.error_no == 0:
-                y = inp.task.flop / np.mean(res.costs)
-                self.scores.append(y)
-            else:
-                self.scores.append(0.0)
-
-        if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space):
-            next_genes = []
-            # There is no reason to crossover or mutate since the size of the unvisited
-            # is no larger than the size of the population.
-            if len(self.space) - len(self.visited) <= self.pop_size:
-                for idx in range(self.space.range_length):
-                    if self.space.is_index_valid(idx) and idx not in self.visited:
-                        next_genes.append(self.space.point2knob(idx))
-                        self.visited.add(idx)
-            else:
-                genes = self.genes + self.elites
-                scores = np.array(self.scores[: len(self.genes)] + self.elite_scores)
-
-                # reserve elite
-                self.elites, self.elite_scores = [], []
-                elite_indexes = np.argpartition(scores, -self.elite_num)[-self.elite_num :]
-                for ind in elite_indexes:
-                    self.elites.append(genes[ind])
-                    self.elite_scores.append(scores[ind])
-
-                indices = np.arange(len(genes))
-                scores += 1e-8
-                scores /= np.max(scores)
-                probs = scores / np.sum(scores)
-                while len(next_genes) < self.pop_size:
-                    # cross over
-                    p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)
-                    p1, p2 = genes[p1], genes[p2]
-                    point = np.random.randint(len(self.space.dims))
-                    tmp_gene = p1[:point] + p2[point:]
-                    # mutation
-                    for j, dim in enumerate(self.space.dims):
-                        if np.random.random() < self.mutation_prob:
-                            tmp_gene[j] = np.random.randint(dim)
-
-                    if self.space.is_index_valid(self.space.knob2point(tmp_gene)):
-                        next_genes.append(tmp_gene)
-                        self.visited.add(self.space.knob2point(tmp_gene))
-            self.genes = next_genes
-            self.trial_pt = 0
-            self.scores = []
-
-    def has_next(self):
-        return len(self.visited) - (len(self.genes) - self.trial_pt) < len(self.space)
-
-    def load_history(self, data_set, min_seed_records=500):
-        pass
diff --git a/python/tvm/autotvm/tuner/index_based_tuner.py b/python/tvm/autotvm/tuner/index_based_tuner.py
deleted file mode 100644
index 881728bc9b34..000000000000
--- a/python/tvm/autotvm/tuner/index_based_tuner.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=abstract-method
-"""Grid search tuner and random tuner"""
-
-from .tuner import Tuner
-
-
-class IndexBaseTuner(Tuner):
-    """Base class for index based tuner
-    This type of tuner determine the next batch of configs based on config indices.
-
-    Parameters
-    ----------
-    task: autotvm.task.Task
-        The tuning task
-
-    range_idx: Optional[Tuple[int, int]]
-        A tuple of index range that this tuner can select from [begin_idx, end_idx]
-    """
-
-    def __init__(self, task, range_idx=None):
-        super(IndexBaseTuner, self).__init__(task)
-        assert range_idx is None or isinstance(
-            range_idx, tuple
-        ), "range_idx must be None or (int, int)"
-
-        self.visited = []
-        self.begin_idx, self.end_idx = range_idx or (0, self.space.range_length - 1)
-        assert self.begin_idx >= 0, "Start index must be positive"
-        self.end_idx += 1  # Further end_idx is exclusive
-        assert (
-            self.end_idx <= self.space.range_length
-        ), "Finish index must be less the space range length "
-        self.range_length = self.end_idx - self.begin_idx
-        assert self.range_length > 0, "Index range must be positive"
-        self.visited_max = self.space.subrange_length(self.begin_idx, self.end_idx)
-
-    def has_next(self):
-        return len(self.visited) < self.visited_max
-
-    def load_history(self, data_set, min_seed_records=500):
-        pass
-
-
-class GridSearchTuner(IndexBaseTuner):
-    """Enumerate the search space in a grid search order"""
-
-    def __init__(self, task, range_idx=None):
-        super(GridSearchTuner, self).__init__(task, range_idx)
-
-        self.index = self.begin_idx
-        if not self.space.is_index_valid(self.index):
-            self.index = self.space.get_next_index(
-                self.index, start=self.begin_idx, end=self.end_idx
-            )
-
-    def next_batch(self, batch_size):
-        ret = []
-        while len(ret) < batch_size and self.has_next():
-            self.visited.append(self.index)
-            ret.append(self.space.get(self.index))
-            self.index = self.space.get_next_index(
-                self.index, start=self.begin_idx, end=self.end_idx
-            )
-        return ret
-
-
-class RandomTuner(IndexBaseTuner):
-    """Enumerate the search space in a random order
-
-    Parameters
-    ----------
-    task: autotvm.task.Task
-        Tuning Task
-
-    range_idx: Optional[Tuple[int, int]]
-        A tuple of index range to random
-    """
-
-    def next_batch(self, batch_size):
-        ret = []
-        while len(ret) < batch_size and self.has_next():
-            index = self.space.get_rand_index(self.begin_idx, self.end_idx, to_exclude=self.visited)
-            self.visited.append(index)
-            ret.append(self.space.get(index))
-        return ret
diff --git a/python/tvm/autotvm/tuner/metric.py b/python/tvm/autotvm/tuner/metric.py
deleted file mode 100644
index f6932f80d3e3..000000000000
--- a/python/tvm/autotvm/tuner/metric.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Metrics for evaluating tuning process"""
-
-import numpy as np
-
-from ..utils import get_rank
-
-
-def max_curve(trial_scores):
-    """f(n) = max([s[i] fo i < n])
-
-    Parameters
-    ----------
-    trial_scores: Array of float
-        the score of i th trial
-
-    Returns
-    -------
-    curve: Array of float
-        function values
-    """
-    ret = np.empty(len(trial_scores))
-    keep = -1e9
-    for i, score in enumerate(trial_scores):
-        keep = max(keep, score)
-        ret[i] = keep
-    return ret
-
-
-def mean_curve(trial_scores):
-    """f(n) = mean([s[i] fo i < n])
-
-    Parameters
-    ----------
-    trial_scores: Array of float
-        the score of i th trial
-
-    Returns
-    -------
-    curve: Array of float
-        function values
-    """
-    ret = np.empty(len(trial_scores))
-    keep = 0
-    for i, score in enumerate(trial_scores):
-        keep += score
-        ret[i] = keep / (i + 1)
-    return ret
-
-
-def recall_curve(trial_ranks, top=None):
-    """
-    if top is None, f(n) = sum([I(rank[i] < n) for i < n]) / n
-    if top is K,    f(n) = sum([I(rank[i] < K) for i < n]) / K
-
-    Parameters
-    ----------
-    trial_ranks: Array of int
-        the rank of i th trial in labels
-    top: int or None
-        top-n recall
-
-    Returns
-    -------
-    curve: Array of float
-        function values
-    """
-    if not isinstance(trial_ranks, np.ndarray):
-        trial_ranks = np.array(trial_ranks)
-
-    ret = np.zeros(len(trial_ranks))
-    if top is None:
-        for i in range(len(trial_ranks)):
-            ret[i] = np.sum(trial_ranks[:i] <= i) / (i + 1)
-    else:
-        for i in range(len(trial_ranks)):
-            ret[i] = 1.0 * np.sum(trial_ranks[:i] < top) / top
-    return ret
-
-
-def cover_curve(trial_ranks):
-    """
-    f(n) = max k s.t. {1,2,...,k} is a subset of {ranks[i] for i < n}
-
-    Parameters
-    ----------
-    trial_ranks: Array of int
-        the rank of i th trial in labels
-
-    Returns
-    -------
-    curve: Array of float
-        function values
-    """
-    ret = np.empty(len(trial_ranks))
-    keep = -1
-    cover = set()
-    for i, rank in enumerate(trial_ranks):
-        cover.add(rank)
-        while keep + 1 in cover:
-            keep += 1
-        ret[i] = keep + 1
-    return ret / len(trial_ranks)
-
-
-def average_recall(preds, labels, N):
-    """evaluate average recall-n for predictions and labels"""
-    trials = np.argsort(preds)[::-1]
-    ranks = get_rank(labels[trials])
-    curve = recall_curve(ranks)
-    return np.sum(curve[:N]) / N
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
deleted file mode 100644
index 0841e9a76528..000000000000
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ /dev/null
@@ -1,356 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return,invalid-name,consider-using-enumerate,abstract-method
-"""Base class for model-based tuner
-This type of tuner will fit a cost model and use some optimization methods to
-find optimums points of cost model in space.
-"""
-import gc
-
-import numpy as np
-
-from .tuner import Tuner
-from ..env import GLOBAL_SCOPE
-
-
-class FeatureCache(object):
-    """Feature cache manager for cache sharing between different cost models"""
-
-    def __init__(self):
-        self.feature_cache = {}
-
-    def get(self, key):
-        """Get feature cache dictionary for a key
-
-        Parameters
-        ----------
-        key: str
-            The key of a feature type
-
-        Returns
-        -------
-        fea_cache: dict
-            cache dictionary
-        """
-        if key not in self.feature_cache:
-            self.feature_cache[key] = {}
-
-        return self.feature_cache[key]
-
-    def size(self, key):
-        """ " Get the size of a feature cache dictionary
-
-        Parameters
-        ----------
-        key: str
-            The key of a feature type
-
-        Returns
-        -------
-        n: int
-        """
-        return len(self.feature_cache.get(key, tuple()))
-
-    def clear(self, key):
-        """Clear feature cache for a key
-
-        Parameters
-        ----------
-        key: str
-            The key of a feature type
-        """
-        del self.feature_cache[key]
-        self.feature_cache[key] = {}
-        gc.collect()
-
-
-class CostModel(object):
-    """Cost model to predict the speed of a config"""
-
-    def __init__(self):
-        pass
-
-    def fit(self, xs, ys, plan_size):
-        """Fit to training data
-
-        Parameters
-        ----------
-        xs: Array of int
-            indexes of configs in the config space
-        ys: Array of float
-            The speed (flop, float number operations per second)
-        plan_size: int
-            The plan size of tuner
-        """
-        raise NotImplementedError()
-
-    def fit_log(self, records, plan_size, min_seed_records=500):
-        """Fit training data from log.
-
-        Parameters
-        ----------
-        records: Array of Tuple(MeasureInput, MeasureResult)
-            The tuning records
-        plan_size: int
-            The plan size of tuner
-        min_seed_records: int
-            Defaults to 500. Indicates the minimum number of records to
-            train the tuner with. If there are less than `min_seed_records`
-            number of records in `data_set`, no training of the tuner
-            will be done.
-        """
-        raise NotImplementedError()
-
-    def predict(self, xs, output_margin=False):
-        """Predict the speed of configs
-
-        Parameters
-        ----------
-        xs: Array of int
-            The indexes of configs to predict
-        output_margin: bool, optional
-            Whether output the untransformed margin.
-            When a model is used as base model, it should output untransformed margin
-
-        Returns
-        -------
-        preds: Array of float
-            The prediction
-        """
-        raise NotImplementedError()
-
-    def load_basemodel(self, base_model):
-        """Load base model for transfer learning
-
-        Parameters
-        ----------
-        base_model: CostModel
-                base model
-        """
-        raise NotImplementedError()
-
-    def spawn_base_model(self):
-        """Clone a base model with the same parameters.
-        The base model is used to fit history data in transfer learning.
-
-        Returns
-        -------
-        model: CostModel
-            A model with the same hyperparameter (argument)
-        """
-        raise NotImplementedError()
-
-
-class ModelOptimizer(object):
-    """Optimizer used to find optimal points of cost model"""
-
-    def __init__(self):
-        pass
-
-    def find_maximums(self, model, num, exclusive):
-        """Find maximum of a cost model
-
-        Note we use cost model to predict GFLOPS, so we should find the maximum
-
-        Parameters
-        ----------
-        model: CostModel
-            Cost model
-        num: int
-            The number of returned maximum points
-        exclusive: set, optional
-            The excluded set of this optimizer. Return results won't include any
-            elements in this set.
-        """
-        raise NotImplementedError()
-
-
-class ModelBasedTuner(Tuner):
-    """Base class for model based tuner
-    This type of tuner will fit a cost model and use an optimizer to
-    find the maximums of the cost model as next trials
-
-    Parameters
-    ----------
-    task: autotvm.task.Task
-        The tuning task
-    cost_model: CostModel
-        The cost model that predicts the speed of a config (IR)
-    model_optimizer:
-        The optimizer to find local optimum points of cost model in tuning search space
-    plan_size: int
-        Tuner will re-fit model per `plan_size` new measure samples
-    diversity_filter_ratio: int or float, optional
-        If is not None, the tuner will first select
-        top-(plan_size * diversity_filter_ratio) candidates according to the cost model
-        and then pick plan_size of them according to the diversity metric.
-    """
-
-    def __init__(self, task, cost_model, model_optimizer, plan_size, diversity_filter_ratio=None):
-        super(ModelBasedTuner, self).__init__(task)
-
-        # space
-        self.task = task
-        self.target = task.target
-        self.plan_size = plan_size
-
-        self.cost_model = cost_model
-        self.model_optimizer = model_optimizer
-        self.diversity_filter_ratio = diversity_filter_ratio
-
-        if self.diversity_filter_ratio:
-            assert self.diversity_filter_ratio >= 1, (
-                "Diversity filter ratio " "must be larger than one"
-            )
-
-        # trial plan
-        self.trials = []
-        self.trial_pt = 0
-        self.visited = set()
-
-        # observed samples
-        self.xs = []
-        self.ys = []
-        self.flops_max = 0.0
-        self.train_ct = 0
-
-    def next_batch(self, batch_size):
-        ret = []
-        while len(ret) < batch_size and self.has_next():
-            while self.trial_pt < len(self.trials):
-                index = self.trials[self.trial_pt]
-                if index not in self.visited and self.space.is_index_valid(index):
-                    break
-                self.trial_pt += 1
-
-            if self.trial_pt >= len(self.trials) - int(0.05 * self.plan_size):
-                # if the trial list is empty or
-                # the tuner is doing the last 5% trials (e-greedy), choose randomly
-                index = self.space.get_rand_index(to_exclude=self.visited)
-            ret.append(self.space.get(index))
-            self.visited.add(index)
-        return ret
-
-    def update(self, inputs, results):
-        for inp, res in zip(inputs, results):
-            index = inp.config.index
-            if res.error_no == 0:
-                self.xs.append(index)
-                flops = inp.task.flop / np.mean(res.costs)
-                self.flops_max = max(self.flops_max, flops)
-                self.ys.append(flops)
-            else:
-                self.xs.append(index)
-                self.ys.append(0.0)
-            # Usually the update function is called during the tune loop
-            # after the index is already added to the visited set.
-            # However, adding the index to visited again here enables us
-            # to also use this update function to resume tuning progress in
-            # case of interruption.
-            assert self.space.is_index_valid(index)
-            self.visited.add(index)
-        # if we have enough new training samples
-        if len(self.xs) >= self.plan_size * (self.train_ct + 1) and self.flops_max > 1e-6:
-            self.cost_model.fit(self.xs, self.ys, self.plan_size)
-            if self.diversity_filter_ratio:
-                candidate = self.model_optimizer.find_maximums(
-                    self.cost_model, self.plan_size * self.diversity_filter_ratio, self.visited
-                )
-                scores = self.cost_model.predict(candidate)
-                knobs = [self.space.point2knob(x) for x in candidate]
-                pick_index = submodular_pick(0 * scores, knobs, self.plan_size, knob_weight=1)
-                maximums = np.array(candidate)[pick_index]
-            else:
-                maximums = self.model_optimizer.find_maximums(
-                    self.cost_model, self.plan_size, self.visited
-                )
-
-            self.trials = maximums
-            self.trial_pt = 0
-            self.train_ct += 1
-
-    def load_history(self, data_set, min_seed_records=500):
-        # set in_tuning as True to make the feature extraction consistent
-        GLOBAL_SCOPE.in_tuning = True
-
-        # fit base model
-        base_model = self.cost_model.spawn_base_model()
-        success = base_model.fit_log(data_set, self.plan_size, min_seed_records)
-
-        if not success:
-            GLOBAL_SCOPE.in_tuning = False
-            return
-
-        # use base model to select initial points
-        if not self.trials:
-            # no plan yet, use base model to select initial trials
-            maximums = self.model_optimizer.find_maximums(base_model, self.plan_size, self.visited)
-            self.trials = maximums
-            self.trial_pt = 0
-
-        self.cost_model.load_basemodel(base_model)
-        GLOBAL_SCOPE.in_tuning = False
-
-    def has_next(self):
-        return len(self.visited) < len(self.space)
-
-
-def submodular_pick(scores, knobs, n_pick, knob_weight=1.0):
-    """Run greedy optimization to pick points with regard to both score and diversity.
-    DiversityScore = knob_weight * number of unique knobs in the selected set
-    Obj = sum(scores[i] for i in pick) + DiversityScore
-    Note that this objective function is a monotone submodular function.
-
-    Parameters
-    ----------
-    scores: Array of float
-        score of every points
-    knobs: Array of Array of int
-        feature vector (tunable knobs) of every points
-    n_pick: int
-        number of points to pick
-    knob_weight: float
-        weight of an unique knob feature
-    """
-    n = len(scores)
-    assert n == len(knobs)
-    n_knobs = len(knobs[0])
-
-    knobs_set = [set() for _ in range(n_knobs)]
-
-    ret = []
-    remain = list(range(len(scores)))
-
-    for _ in range(n_pick):
-        max_x = -1
-        max_delta = -1e9
-
-        for x in remain:
-            tmp_delta = scores[x]
-            for i in range(n_knobs):
-                if knobs[x][i] not in knobs_set[i]:
-                    tmp_delta += knob_weight
-
-            if tmp_delta > max_delta:
-                max_delta, max_x = tmp_delta, x
-
-        ret.append(max_x)
-        remain.remove(max_x)
-        for i in range(n_knobs):
-            knobs_set[i].add(knobs[max_x][i])
-
-    return ret
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
deleted file mode 100644
index 518fc0e45eb2..000000000000
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=consider-using-enumerate, invalid-name, invalid-sequence-index
-"""
-Cost model optimizer based on simulated annealing
-"""
-
-import heapq
-import logging
-import time
-
-import numpy as np
-
-from .model_based_tuner import ModelOptimizer
-
-logger = logging.getLogger("autotvm")
-
-
-class SimulatedAnnealingOptimizer(ModelOptimizer):
-    """parallel simulated annealing optimization algorithm
-
-    Parameters
-    ----------
-    task: Task
-        The tuning task
-    n_iter: int
-        The number of iterations of simulated annealing
-    temp: float or Array of float
-        If is a single float, then use a constant temperature.
-        If is an Array, then perform linear cooling from temp[0] to temp[1]
-    early_stop: int, optional
-        Stop iteration if the optimal set do not change in `early_stop` rounds
-    log_interval: int, optional
-        Print log every `log_interval` iterations
-    """
-
-    def __init__(
-        self,
-        task,
-        n_iter=500,
-        temp=(1, 0),
-        persistent=True,
-        parallel_size=128,
-        early_stop=50,
-        log_interval=50,
-    ):
-        super(SimulatedAnnealingOptimizer, self).__init__()
-        self.task = task
-        self.n_iter = n_iter
-        self.temp = temp
-        self.persistent = persistent
-        self.parallel_size = min(parallel_size, len(self.task.config_space))
-        self.early_stop = early_stop or 1e9
-        self.log_interval = log_interval
-        self.points = None
-
-    def find_maximums(self, model, num, exclusive):
-        tic = time.time()
-        temp, n_iter, early_stop, log_interval = (
-            self.temp,
-            self.n_iter,
-            self.early_stop,
-            self.log_interval,
-        )
-
-        if self.persistent and self.points is not None:
-            points = self.points
-        else:
-            points = self.task.config_space.sample_ints(self.parallel_size)
-
-        scores = model.predict(points)
-
-        # build heap and insert initial points
-        heap_items = [(float("-inf"), -1 - i) for i in range(num)]
-        heapq.heapify(heap_items)
-        in_heap = set(exclusive)
-        in_heap.update([x[1] for x in heap_items])
-
-        for s, p in zip(scores, points):
-            if s > heap_items[0][0] and p not in in_heap:
-                pop = heapq.heapreplace(heap_items, (s, p))
-                in_heap.remove(pop[1])
-                in_heap.add(p)
-
-        k = 0
-        k_last_modify = 0
-
-        if isinstance(temp, (tuple, list, np.ndarray)):
-            t = temp[0]
-            cool = 1.0 * (temp[0] - temp[1]) / (n_iter + 1)
-        else:
-            t = temp
-            cool = 0
-
-        while k < n_iter and k < k_last_modify + early_stop:
-            new_points = np.empty_like(points)
-            for i, p in enumerate(points):
-                new_points[i] = self.task.config_space.random_walk(p)
-
-            new_scores = model.predict(new_points)
-
-            ac_prob = np.exp(np.minimum((new_scores - scores) / (t + 1e-5), 1))
-            ac_index = np.random.random(len(ac_prob)) < ac_prob
-
-            points[ac_index] = new_points[ac_index]
-            scores[ac_index] = new_scores[ac_index]
-
-            for s, p in zip(new_scores, new_points):
-                if s > heap_items[0][0] and p not in in_heap:
-                    pop = heapq.heapreplace(heap_items, (s, p))
-                    in_heap.remove(pop[1])
-                    in_heap.add(p)
-                    k_last_modify = k
-
-            k += 1
-            t -= cool
-
-            if log_interval and k % log_interval == 0:
-                t_str = f"{t:.2f}"
-                logger.debug(
-                    "SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\ttemp: %s\t"
-                    "elapsed: %.2f",
-                    k,
-                    k_last_modify,
-                    heap_items[0][0],
-                    np.max([v for v, _ in heap_items]),
-                    t_str,
-                    time.time() - tic,
-                )
-
-        heap_items.sort(key=lambda item: -item[0])
-        heap_items = [x for x in heap_items if x[0] >= 0]
-        logger.debug(
-            "SA iter: %d\tlast_update: %d\telapsed: %.2f", k, k_last_modify, time.time() - tic
-        )
-        logger.debug("SA Maximums: %s", heap_items)
-
-        if self.persistent:
-            self.points = points
-
-        return [x[1] for x in heap_items]
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
deleted file mode 100644
index a758a5d4cd9c..000000000000
--- a/python/tvm/autotvm/tuner/tuner.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, no-self-use, invalid-name
-"""Base class of tuner"""
-import logging
-import tempfile
-
-import numpy as np
-
-from ..measure import MeasureInput, create_measure_batch
-from ..utils import format_si_prefix
-
-from ..env import GLOBAL_SCOPE
-
-logger = logging.getLogger("autotvm")
-
-
-class Tuner(object):
-    """Base class for tuners
-
-    Parameters
-    ----------
-    task: autotvm.task.Task
-        Tuning Task
-    """
-
-    def __init__(self, task, **kwargs):
-        self.param = kwargs
-        self.recorder = None
-
-        self.task = task
-        self.space = self.task.config_space
-
-        # keep the current best
-        self.best_config = None
-        self.best_flops = 0
-        self.best_measure_pair = None
-        self.best_iter = 0
-        self.error_ct_threshold = 150
-
-        # time to leave
-        self.ttl = None
-        self.n_trial = None
-        self.early_stopping = None
-
-    def has_next(self):
-        """Whether has next untried config in the space
-
-        Returns
-        -------
-        has_next: bool
-        """
-        raise NotImplementedError()
-
-    def next_batch(self, batch_size):
-        """get the next batch of configs to be measure on real hardware
-
-        Parameters
-        ----------
-        batch_size: int
-            The size of the batch
-
-        Returns
-        -------
-        a batch of configs
-        """
-        raise NotImplementedError()
-
-    def update(self, inputs, results):
-        """Update parameters of the tuner according to measurement results
-
-        Parameters
-        ----------
-        inputs: Array of autotvm.measure.MeasureInput
-            The input for measurement
-        results: Array of autotvm.measure.MeasureResult
-            result for measurement
-        """
-
-    def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_prefix="G"):
-        """Begin tuning
-
-        Parameters
-        ----------
-        n_trial: int
-            Maximum number of configs to try (measure on real hardware)
-        measure_option: dict
-            The options for how to measure generated code.
-            You should use the return value ot autotvm.measure_option for this argument.
-        early_stopping: int, optional
-            Early stop the tuning when not finding better configs in this number of trials
-        callbacks: List of callable
-            A list of callback functions. The signature of callback function is
-            (Tuner, List of MeasureInput, List of MeasureResult)
-            with no return value. These callback functions will be called on
-            every measurement pair. See autotvm/tuner/callback.py for some examples.
-        si_prefix: str
-            One of tvm.autotvm.utils.SI_PREFIXES. The SI prefix to use when reporting FLOPS.
-        """
-        measure_batch = create_measure_batch(self.task, measure_option)
-        n_parallel = getattr(measure_batch, "n_parallel", 1)
-        early_stopping = early_stopping or 1e9
-        self.n_trial = n_trial
-        self.early_stopping = early_stopping
-
-        # Validate si_prefix arg
-        format_si_prefix(0, si_prefix)
-
-        old_level = logger.level
-
-        GLOBAL_SCOPE.in_tuning = True
-        i = error_ct = 0
-        errors = []
-        while i < n_trial:
-            if not self.has_next():
-                break
-
-            configs = self.next_batch(min(n_parallel, n_trial - i))
-
-            inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
-            results = measure_batch(inputs)
-
-            # keep best config
-            for k, (inp, res) in enumerate(zip(inputs, results)):
-                config = inp.config
-                if res.error_no == 0:
-                    flops = inp.task.flop / np.mean(res.costs)
-                    error_ct = 0
-                    result_msg = res
-                else:
-                    flops = 0
-                    error_ct += 1
-                    tb, error = res.costs
-                    if isinstance(error, str):
-                        errors.append(tb + "\n" + error)
-                    else:
-                        errors.append(tb + "\n" + str(error))
-                    result_msg = errors[-1]
-
-                if flops > self.best_flops:
-                    self.best_flops = flops
-                    self.best_config = config
-                    self.best_measure_pair = (inp, res)
-                    self.best_iter = i + k
-
-                logger.debug(
-                    "No: %d\t%sFLOPS: %.2f/%.2f\tresult: %s\t%s",
-                    i + k + 1,
-                    si_prefix,
-                    format_si_prefix(flops, si_prefix),
-                    format_si_prefix(self.best_flops, si_prefix),
-                    result_msg,
-                    config,
-                )
-
-            i += len(results)
-            self.ttl = min(early_stopping + self.best_iter, n_trial) - i
-
-            self.update(inputs, results)
-            for callback in callbacks:
-                callback(self, inputs, results)
-
-            if i >= self.best_iter + early_stopping:
-                logger.debug("Early stopped. Best iter: %d.", self.best_iter)
-                break
-
-            if error_ct > self.error_ct_threshold:
-                logging.basicConfig()
-                logger.warning("Too many errors happen in the tuning. Switching to debug mode.")
-                logger.setLevel(logging.DEBUG)
-            else:
-                logger.setLevel(old_level)
-
-        if error_ct == i:
-            _, f = tempfile.mkstemp(prefix="tvm_tuning_errors_", suffix=".log", text=True)
-            with open(f, "w") as file:
-                file.write("\n".join(errors))
-            logging.warning(
-                "Could not find any valid schedule for task %s. "
-                "A file containing the errors has been written to %s.",
-                self.task,
-                f,
-            )
-        GLOBAL_SCOPE.in_tuning = False
-        del measure_batch
-
-    def reset(self):
-        """reset the status of tuner"""
-        self.best_config = None
-        self.best_flops = 0
-        self.best_measure_pair = None
-
-    def load_history(self, data_set, min_seed_records=500):
-        """load history data for transfer learning
-
-        Parameters
-        ----------
-        data_set: Array of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) pair
-            Previous tuning records
-        min_seed_records: int
-            Defaults to 500. Indicates the minimum number of records to
-            train the tuner with. If there are less than `min_seed_records`
-            number of records in `data_set`, no training of the tuner
-            will be done.
-        """
-        raise NotImplementedError()
-
-    def set_error_threshold(self, threshold):
-        """Modify error counter threshold, which controls switch to debug mode
-
-        Parameters
-        ----------
-        threshold: New threshold value
-        """
-        self.error_ct_threshold = threshold
diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py
deleted file mode 100644
index 048eecf10e25..000000000000
--- a/python/tvm/autotvm/tuner/xgboost_cost_model.py
+++ /dev/null
@@ -1,706 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""XGBoost as cost model"""
-
-import logging
-import time
-
-from typing import Dict
-
-import numpy as np
-from tvm.contrib.popen_pool import PopenPoolExecutor, StatusKind
-
-from .. import feature
-from ..utils import get_rank
-from .metric import cover_curve, max_curve, recall_curve
-from .model_based_tuner import CostModel, FeatureCache
-
-try:
-    from xgboost.callback import TrainingCallback  # type: ignore
-except ImportError:
-
-    class TrainingCallback:  # type: ignore
-        pass
-
-
-xgb = None
-
-logger = logging.getLogger("autotvm")
-
-
-class XGBoostCostModel(CostModel):
-    """XGBoost as cost model
-
-    Parameters
-    ----------
-    task: Task
-        The tuning task
-    feature_type: str, optional
-        If is 'itervar', use features extracted from IterVar (loop variable).
-        If is 'knob', use flatten ConfigEntity directly.
-        If is 'curve', use sampled curve feature (relation feature).
-
-        Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' are good.
-                                'itervar' is more accurate but 'knob' is much faster.
-                                There are some constraints on 'itervar', if you meet
-                                problems with feature extraction when using 'itervar',
-                                you can switch to 'knob'.
-
-        For cross-shape tuning (e.g. many convolutions with different shapes),
-                               'itervar' and 'curve' has better transferability,
-                               'knob' is faster.
-        For cross-device or cross-operator tuning, you can use 'curve' only.
-    loss_type: str
-        If is 'reg', use regression loss to train cost model.
-                     The cost model predicts the normalized flops.
-        If is 'rank', use pairwise rank loss to train cost model.
-                     The cost model predicts relative rank score.
-        If is 'rank-binary', use pairwise rank loss with binarized labels to train cost model.
-                     The cost model predicts relative rank score.
-    num_threads: int, optional
-        The number of threads.
-    log_interval: int, optional
-        If is not none, the cost model will print training log every `log_interval` iterations.
-    upper_model: XGBoostCostModel, optional
-        The upper model used in transfer learning
-    """
-
-    def __init__(
-        self,
-        task,
-        feature_type,
-        loss_type="reg",
-        num_threads=None,
-        log_interval=25,
-        upper_model=None,
-    ):
-        global xgb
-        super(XGBoostCostModel, self).__init__()
-        try:
-            if xgb is None:
-                xgb = __import__("xgboost")
-        except ImportError:
-            raise ImportError(
-                "XGBoost is required for XGBoostCostModel. "
-                "Please install its python package first. "
-                "Help: (https://xgboost.readthedocs.io/en/latest/) "
-            )
-
-        self.task = task
-        self.target = task.target
-        self.space = task.config_space
-
-        self.fea_type = feature_type
-        self.loss_type = loss_type
-        self.num_threads = num_threads
-        self.log_interval = log_interval
-
-        self.loss_type = loss_type
-
-        if loss_type == "reg":
-            self.xgb_params = {
-                "max_depth": 3,
-                "gamma": 0.0001,
-                "min_child_weight": 1,
-                "subsample": 1.0,
-                "eta": 0.3,
-                "lambda": 1.00,
-                "alpha": 0,
-                "objective": "reg:linear",
-            }
-        elif loss_type in ("rank", "rank-binary"):
-            self.xgb_params = {
-                "max_depth": 3,
-                "gamma": 0.0001,
-                "min_child_weight": 1,
-                "subsample": 1.0,
-                "eta": 0.3,
-                "lambda": 1.00,
-                "alpha": 0,
-                "objective": "rank:pairwise",
-            }
-        else:
-            raise RuntimeError("Invalid loss type: " + loss_type)
-
-        self.xgb_params["verbosity"] = 0
-        if num_threads:
-            self.xgb_params["nthread"] = num_threads
-        self.bst = None
-
-        if feature_type == "itervar":
-            self.feature_extract_func = _extract_itervar_feature_index
-        elif feature_type == "knob":
-            self.feature_extract_func = _extract_knob_feature_index
-        elif feature_type == "curve":
-            self.feature_extract_func = _extract_curve_feature_index
-        else:
-            raise RuntimeError("Invalid feature type " + feature_type)
-
-        if upper_model:  # share a same feature cache with upper model
-            self.feature_cache = upper_model.feature_cache
-        else:
-            self.feature_cache = FeatureCache()
-        self.upper_model = upper_model
-        self.feature_extra_ct = 0
-        self.pool = None
-        self.base_model = None
-
-        self._sample_size = 0
-        self._reset_pool(self.space, self.target, self.task)
-
-    def _reset_pool(self, space, target, task):
-        """reset processing pool for feature extraction"""
-
-        if self.upper_model:  # base model will reuse upper model's pool,
-            self.upper_model._reset_pool(space, target, task)
-            return
-
-        self._close_pool()
-
-        self.pool = PopenPoolExecutor(
-            max_workers=self.num_threads,
-            initializer=_extract_popen_initializer,
-            initargs=(space, target, task),
-        )
-
-    def _close_pool(self):
-        if self.pool:
-            self.pool = None
-
-    def _get_pool(self):
-        if self.upper_model:
-            return self.upper_model._get_pool()
-        return self.pool
-
-    def _base_model_discount(self):
-        return 1.0 / (2 ** (self._sample_size / 64.0))
-
-    def fit(self, xs, ys, plan_size):
-        tic = time.time()
-        self._reset_pool(self.space, self.target, self.task)
-
-        x_train = self._get_feature(xs)
-        y_train = np.array(ys)
-        y_max = np.max(y_train)
-        y_train = y_train / max(y_max, 1e-8)
-
-        valid_index = y_train > 1e-6
-        index = np.random.permutation(len(x_train))
-        dtrain = xgb.DMatrix(x_train[index], y_train[index])
-        self._sample_size = len(x_train)
-
-        if self.base_model:
-            discount = self._base_model_discount()
-            if discount < 0.05:  # discard base model
-                self.base_model.upper_model = None
-                self.base_model = None
-            else:
-                dtrain.set_base_margin(discount * self.base_model.predict(xs, output_margin=True))
-
-        self.bst = xgb.train(
-            self.xgb_params,
-            dtrain,
-            num_boost_round=8000,
-            callbacks=[
-                CustomCallback(
-                    stopping_rounds=20,
-                    metric=f"tr-a-recall@{plan_size}",
-                    evals=[(dtrain, "tr")],
-                    maximize=True,
-                    fevals=[xgb_average_recalln_curve_score(plan_size)],
-                    verbose_eval=self.log_interval,
-                    loss_type=self.loss_type,
-                )
-            ],
-        )
-
-        logger.debug(
-            "XGB train: %.2f\tobs: %d\terror: %d\tn_cache: %d",
-            time.time() - tic,
-            len(xs),
-            len(xs) - np.sum(valid_index),
-            self.feature_cache.size(self.fea_type),
-        )
-
-    def fit_log(self, records, plan_size, min_seed_records=500):
-        tic = time.time()
-
-        # filter data, only pick the data with a same task
-        data = []
-        for inp, res in records:
-            if inp.task.name == self.task.name:
-                data.append((inp, res))
-
-        logger.debug("XGB load %d entries from history log file", len(data))
-
-        # extract feature
-        self._reset_pool(self.space, self.target, self.task)
-        pool = self._get_pool()
-        if self.fea_type == "itervar":
-            feature_extract_func = _extract_itervar_feature_log
-        elif self.fea_type == "knob":
-            feature_extract_func = _extract_knob_feature_log
-        elif self.fea_type == "curve":
-            feature_extract_func = _extract_curve_feature_log
-        else:
-            raise RuntimeError("Invalid feature type: " + self.fea_type)
-        result = pool.map_with_error_catching(feature_extract_func, data)
-        result = list(result)  # store results so we can iterate through them twice
-
-        # get maximum feature length
-        fea_len = -1
-        for res in result:
-            if res.status != StatusKind.COMPLETE:
-                continue
-            x, _ = res.value
-            fea_len = max(fea_len, x.shape[0])
-
-        xs, ys = [], []
-        for res in result:
-            if res.status != StatusKind.COMPLETE:
-                continue
-            x, y = res.value
-            # Features may not be the same size, pad them until they are
-            if fea_len > len(x):
-                xs.append(np.pad(x, (0, fea_len - len(x))))
-            else:
-                xs.append(x)
-            ys.append(y)
-
-        if len(xs) < min_seed_records:  # no enough samples
-            return False
-
-        xs, ys = np.array(xs), np.array(ys)
-        x_train = xs
-        y_train = ys
-        y_max = np.max(y_train)
-        y_train = y_train / max(y_max, 1e-8)
-
-        index = np.random.permutation(len(x_train))
-        dtrain = xgb.DMatrix(x_train[index], y_train[index])
-
-        plan_size *= 2
-        self.bst = xgb.train(
-            self.xgb_params,
-            dtrain,
-            num_boost_round=400,
-            callbacks=[
-                CustomCallback(
-                    stopping_rounds=100,
-                    metric=f"tr-a-recall@{plan_size}",
-                    evals=[(dtrain, "tr")],
-                    maximize=True,
-                    fevals=[xgb_average_recalln_curve_score(plan_size)],
-                    verbose_eval=self.log_interval,
-                    loss_type=self.loss_type,
-                )
-            ],
-        )
-
-        logger.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))
-
-        return True
-
-    def predict(self, xs, output_margin=False):
-        feas = self._get_feature(xs)
-        dtest = xgb.DMatrix(feas)
-
-        if self.base_model:
-            dtest.set_base_margin(
-                self._base_model_discount() * self.base_model.predict(xs, output_margin=True)
-            )
-
-        return self.bst.predict(dtest, output_margin=output_margin)
-
-    def load_basemodel(self, base_model):
-        self.base_model = base_model
-        self.base_model._close_pool()
-        self.base_model.upper_model = self
-
-    def spawn_base_model(self):
-        return XGBoostCostModel(
-            self.task, self.fea_type, self.loss_type, self.num_threads, self.log_interval, self
-        )
-
-    def _get_feature(self, indexes):
-        """get features for indexes, run extraction if we do not have cache for them"""
-        # free feature cache
-        if self.feature_cache.size(self.fea_type) >= 100000:
-            self.feature_cache.clear(self.fea_type)
-
-        fea_cache = self.feature_cache.get(self.fea_type)
-
-        indexes = np.array(indexes)
-        need_extract = [x for x in indexes if x not in fea_cache]
-
-        if need_extract:
-            pool = self._get_pool()
-            feas = pool.map_with_error_catching(self.feature_extract_func, need_extract)
-            for i, fea in zip(need_extract, feas):
-                fea_cache[i] = fea.value if fea.status == StatusKind.COMPLETE else None
-
-        feature_len = -1
-        for idx in indexes:
-            if fea_cache[idx] is not None:
-                feature_len = max(fea_cache[idx].shape[-1], feature_len)
-
-        ret = np.empty((len(indexes), feature_len), dtype=np.float32)
-        for i, ii in enumerate(indexes):
-            t = fea_cache[ii]
-            if t is not None and t.shape[0] < feature_len:
-                t = np.pad(t, (0, feature_len - t.shape[0]))
-            ret[i, :] = t if t is not None else 0
-        return ret
-
-    def __del__(self):
-        self._close_pool()
-
-
-# Global variables for passing arguments to extract functions.
-_extract_space = None
-_extract_target = None
-_extract_task = None
-
-
-def _extract_popen_initializer(space, target, task):
-    global _extract_space, _extract_target, _extract_task
-    _extract_space = space
-    _extract_target = target
-    _extract_task = task
-
-
-def _extract_itervar_feature_index(args):
-    """extract iteration var feature for an index in extract_space"""
-    config = _extract_space.get(args)
-    with _extract_target:
-        sch, fargs = _extract_task.instantiate(config)
-
-    fea = feature.get_itervar_feature_flatten(sch, fargs, take_log=True)
-    fea = np.concatenate((fea, list(config.get_other_option().values())))
-    return fea
-
-
-def _extract_itervar_feature_log(arg):
-    """extract iteration var feature for log items"""
-    inp, res = arg
-    config = inp.config
-    with inp.target:
-        sch, args = inp.task.instantiate(config)
-    fea = feature.get_itervar_feature_flatten(sch, args, take_log=True)
-    x = np.concatenate((fea, list(config.get_other_option().values())))
-
-    if res.error_no == 0:
-        y = inp.task.flop / np.mean(res.costs)
-    else:
-        y = 0.0
-    return x, y
-
-
-def _extract_knob_feature_index(args):
-    """extract knob feature for an index in extract_space"""
-    config = _extract_space.get(args)
-
-    return config.get_flatten_feature()
-
-
-def _extract_knob_feature_log(arg):
-    """extract knob feature for log items"""
-    inp, res = arg
-    config = inp.config
-    x = config.get_flatten_feature()
-
-    if res.error_no == 0:
-        with inp.target:  # necessary, for calculating flops of this task
-            inp.task.instantiate(config)
-        y = inp.task.flop / np.mean(res.costs)
-    else:
-        y = 0.0
-    return x, y
-
-
-def _extract_curve_feature_index(args):
-    """extract sampled curve feature for an index in extract_space"""
-    config = _extract_space.get(args)
-    with _extract_target:
-        sch, fargs = _extract_task.instantiate(config)
-
-    fea = feature.get_buffer_curve_sample_flatten(sch, fargs, sample_n=20)
-    fea = np.concatenate((fea, list(config.get_other_option().values())))
-    return np.array(fea)
-
-
-def _extract_curve_feature_log(arg):
-    """extract sampled curve feature for log items"""
-    inp, res = arg
-    config = inp.config
-    with inp.target:
-        sch, args = inp.task.instantiate(config)
-    fea = feature.get_buffer_curve_sample_flatten(sch, args, sample_n=20)
-    x = np.concatenate((fea, list(config.get_other_option().values())))
-
-    if res.error_no == 0:
-        y = inp.task.flop / np.mean(res.costs)
-    else:
-        y = 0.0
-    return x, y
-
-
-def _binarize_evals(evals):
-    """binarize evaluation labels"""
-    bin_evals = []
-    for evalset in evals:
-        # binarize labels in xgb.dmatrix copy
-        barray = evalset[0].get_data().copy()
-        blabel = evalset[0].get_label().copy()
-        blabel[blabel < 0.5] = 0.0
-        blabel[blabel >= 0.5] = 1.0
-        # pylint: disable=R1721
-        bin_evals.append(tuple([xgb.DMatrix(barray, blabel)] + [e for e in evalset[1:]]))
-    return bin_evals
-
-
-class XGBoostCallback(TrainingCallback):
-    """Base class for XGBoost callbacks."""
-
-    def __call__(self, env: "xgb.core.CallbackEnv"):
-        # Compatibility with xgboost < 1.3
-        return self.after_iteration(env.model, env.iteration, env.evaluation_result_list)
-
-    def after_iteration(self, model: "xgb.Booster", epoch: int, evals_log: Dict):
-        raise NotImplementedError
-
-
-class CustomCallback(XGBoostCallback):
-    """
-    Callback function for xgboost.
-    Support custom evaluation function and early-stopping.
-    """
-
-    def __init__(
-        self,
-        stopping_rounds,
-        metric,
-        fevals,
-        loss_type="reg",
-        evals=(),
-        log_file=None,
-        maximize=False,
-        verbose_eval=True,
-        skip_every=2,
-    ):
-        """Init function"""
-        self.stopping_rounds = stopping_rounds
-        self.metric = metric
-        self.metric_shortname = metric.split("-")[1]
-        self.fevals = fevals
-        self.evals = evals
-        self.log_file = log_file
-        self.maximize = maximize
-        self.verbose_eval = verbose_eval
-        self.loss_type = loss_type
-        self.skip_every = skip_every
-        self.state = {}
-
-    def after_iteration(self, model: "xgb.Booster", epoch: int, evals_log: Dict):
-        """Run after each iteration.  Return True when training should stop."""
-        # pylint:disable = import-outside-toplevel
-        try:
-            from xgboost.callback import _fmt_metric  # type: ignore
-        except ImportError:
-            # Compatibility with xgboost >= 1.6
-            def _fmt_metric(value, show_stdv=True):
-                """format metric string"""
-                if len(value) == 2:
-                    return f"{value[0]}:{value[1]:.5f}"
-                if len(value) == 3:
-                    if show_stdv:
-                        return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}"
-                    return f"{value[0]}:{value[1]:.5f}"
-                raise ValueError("wrong metric value", value)
-
-        ##### init state #####
-        if not self.state:
-            self.state["maximize_score"] = self.maximize
-            self.state["best_iteration"] = 0
-            if self.maximize:
-                self.state["best_score"] = float("-inf")
-            else:
-                self.state["best_score"] = float("inf")
-
-            assert model is not None
-            if model.attr("best_score") is not None:
-                self.state["best_score"] = float(model.attr("best_score"))
-                self.state["best_iteration"] = int(model.attr("best_iteration"))
-                self.state["best_msg"] = model.attr("best_msg")
-            else:
-                model.set_attr(best_iteration=str(self.state["best_iteration"]))
-                model.set_attr(best_score=str(self.state["best_score"]))
-        res_dict = {}
-
-        if epoch % self.skip_every == 1:
-            return False
-
-        ##### evaluation #####
-        mod_evals = self.evals
-        if self.loss_type == "rank-binary":
-            mod_evals = _binarize_evals(self.evals)
-
-        if self.loss_type == "rank" and int(xgb.__version__[0]) >= 2:
-            # since xgboost pr#8931
-            raise RuntimeError(
-                "Use 'rank-binary' instead of 'rank' loss_type with xgboost %s >= 2.0.0"
-                % xgb.__version__
-            )
-
-        for feval in self.fevals:
-            bst_eval = model.eval_set(mod_evals, epoch, feval)
-            res = [x.split(":") for x in bst_eval.split()]
-            for kv in res[1:]:
-                res_dict[kv[0]] = [float(kv[1])]
-
-        eval_res = []
-        keys = list(res_dict.keys())
-        keys.sort(key=lambda x: x if self.metric_shortname not in x else "a" + x)
-        for key in keys:
-            v = res_dict[key]
-            eval_res.append([key] + v)
-
-        ##### print eval result #####
-        if (
-            not isinstance(self.verbose_eval, bool)
-            and self.verbose_eval
-            and epoch % self.verbose_eval == 0
-        ):
-            infos = [f"XGB iter: {epoch:3d}"]
-            for item in eval_res:
-                if "null" in item[0]:
-                    continue
-                infos.append(f"{item[0]}: {item[1]:.6f}")
-
-            logger.debug("\t".join(infos))
-            if self.log_file:
-                with open(self.log_file, "a") as fout:
-                    fout.write("\t".join(infos) + "\n")
-
-        ##### choose score and do early stopping #####
-        score = None
-        for item in eval_res:
-            if item[0] == self.metric:
-                score = item[1]
-                break
-        assert score is not None
-
-        best_score = self.state["best_score"]
-        best_iteration = self.state["best_iteration"]
-        maximize_score = self.state["maximize_score"]
-
-        if (maximize_score and score > best_score) or (not maximize_score and score < best_score):
-            msg = f"[{epoch}] " + "\t".join([_fmt_metric(x) for x in eval_res])
-            self.state["best_msg"] = msg
-            self.state["best_score"] = score
-            self.state["best_iteration"] = epoch
-            # save the property to attributes, so they will occur in checkpoint.
-            if model is not None:
-                model.set_attr(
-                    best_score=str(self.state["best_score"]),
-                    best_iteration=str(self.state["best_iteration"]),
-                    best_msg=self.state["best_msg"],
-                )
-        elif epoch - best_iteration >= self.stopping_rounds:
-            best_msg = self.state["best_msg"]
-            if self.verbose_eval:
-                logger.debug("XGB stopped. Best iteration: %s ", best_msg)
-            return True
-
-        return False
-
-
-# feval wrapper for xgboost
-def xgb_max_curve_score(N):
-    """evaluate max curve score for xgb"""
-
-    def feval(preds, labels):
-        labels = labels.get_label()
-        trials = np.argsort(preds)[::-1]
-        scores = labels[trials]
-        curve = max_curve(scores)
-        return f"Smax@{N}", curve[N] / np.max(labels)
-
-    return feval
-
-
-def xgb_recalln_curve_score(N):
-    """evaluate recall-n curve score for xgb"""
-
-    def feval(preds, labels):
-        labels = labels.get_label()
-        trials = np.argsort(preds)[::-1]
-        ranks = get_rank(labels[trials])
-        curve = recall_curve(ranks)
-        return f"recall@{N}", curve[N]
-
-    return feval
-
-
-def xgb_average_recalln_curve_score(N):
-    """evaluate average recall-n curve score for xgb"""
-
-    def feval(preds, labels):
-        labels = labels.get_label()
-        trials = np.argsort(preds)[::-1]
-        ranks = get_rank(labels[trials])
-        curve = recall_curve(ranks)
-        return f"a-recall@{N}", np.sum(curve[:N]) / N
-
-    return feval
-
-
-def xgb_recallk_curve_score(N, topk):
-    """evaluate recall-k curve score for xgb"""
-
-    def feval(preds, labels):
-        labels = labels.get_label()
-        trials = np.argsort(preds)[::-1]
-        ranks = get_rank(labels[trials])
-        curve = recall_curve(ranks, topk)
-        return f"recall@{topk}", curve[N]
-
-    return feval
-
-
-def xgb_cover_curve_score(N):
-    """evaluate cover curve score for xgb"""
-
-    def feval(preds, labels):
-        labels = labels.get_label()
-        trials = np.argsort(preds)[::-1]
-        ranks = get_rank(labels[trials])
-        curve = cover_curve(ranks)
-        return f"cover@{N}", curve[N]
-
-    return feval
-
-
-def xgb_null_score(_):
-    """empty score function for xgb"""
-
-    def feval(__, ___):
-        return "null", 0
-
-    return feval
diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py
deleted file mode 100644
index 0e77bf674bac..000000000000
--- a/python/tvm/autotvm/tuner/xgboost_tuner.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tuner that uses xgboost as cost model"""
-
-from .model_based_tuner import ModelBasedTuner, ModelOptimizer
-from .xgboost_cost_model import XGBoostCostModel
-from .sa_model_optimizer import SimulatedAnnealingOptimizer
-
-
-class XGBTuner(ModelBasedTuner):
-    """Tuner that uses xgboost as cost model
-
-    Parameters
-    ----------
-    task: Task
-        The tuning task
-    plan_size: int
-        The size of a plan. After `plan_size` trials, the tuner will refit a new cost model
-        and do planing for the next `plan_size` trials.
-    feature_type: str, optional
-        If is 'itervar', use features extracted from IterVar (loop variable).
-        If is 'knob', use flatten ConfigEntity directly.
-        If is 'curve', use sampled curve feature (relation feature).
-
-        Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' are good.
-        'itervar' is more accurate but 'knob' is much faster.
-        There are some constraints on 'itervar', if you meet
-        problems with feature extraction when using 'itervar',
-        you can switch to 'knob'.
-
-        For cross-shape tuning (e.g. many convolutions with different shapes),
-        'itervar' and 'curve' has better transferability,
-        'knob' is faster.
-
-        For cross-device or cross-operator tuning, you can use 'curve' only.
-    loss_type: str
-        If is 'reg', use regression loss to train cost model.
-        The cost model predicts the normalized flops.
-        If is 'rank', use pairwise rank loss to train cost model.
-        The cost model predicts relative rank score.
-        If is 'rank-binary', use pairwise rank loss with binarized labels to train cost model.
-        The cost model predicts relative rank score.
-
-    num_threads: int, optional
-        The number of threads.
-
-    optimizer: str or ModelOptimizer, optional
-        If is 'sa', use a default simulated annealing optimizer.
-        Otherwise it should be a ModelOptimizer object.
-
-    diversity_filter_ratio: int or float, optional
-        If is not None, the tuner will first select
-        top-(plan_size * diversity_filter_ratio) candidates according to the cost model
-        and then pick batch_size of them according to the diversity metric.
-
-    log_interval: int = 50
-        The verbose level.
-        If is 0, output nothing.
-        Otherwise, output debug information every `verbose` iterations.
-    """
-
-    def __init__(
-        self,
-        task,
-        plan_size=64,
-        feature_type="itervar",
-        loss_type="reg",
-        num_threads=None,
-        optimizer="sa",
-        diversity_filter_ratio=None,
-        log_interval=50,
-    ):
-        cost_model = XGBoostCostModel(
-            task,
-            feature_type=feature_type,
-            loss_type=loss_type,
-            num_threads=num_threads,
-            log_interval=log_interval // 2,
-        )
-        if optimizer == "sa":
-            optimizer = SimulatedAnnealingOptimizer(task, log_interval=log_interval)
-        else:
-            assert isinstance(optimizer, ModelOptimizer), (
-                "Optimizer must be " "a supported name string" "or a ModelOptimizer object."
-            )
-
-        super(XGBTuner, self).__init__(
-            task, cost_model, optimizer, plan_size, diversity_filter_ratio
-        )
-
-    def tune(self, *args, **kwargs):  # pylint: disable=arguments-differ
-        super(XGBTuner, self).tune(*args, **kwargs)
-
-        # manually close pool to avoid multiprocessing issues
-        self.cost_model._close_pool()
diff --git a/python/tvm/autotvm/utils.py b/python/tvm/autotvm/utils.py
deleted file mode 100644
index 75db5208adbe..000000000000
--- a/python/tvm/autotvm/utils.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Utilities"""
-import logging
-import time
-
-import numpy as np
-import tvm.arith
-from tvm.tir import expr
-from tvm.contrib.popen_pool import PopenPoolExecutor
-
-logger = logging.getLogger("autotvm")
-
-
-class EmptyContext(object):
-    """An empty context"""
-
-    def __enter__(self):
-        pass
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
-
-def get_rank(values):
-    """get rank of items
-
-    Parameters
-    ----------
-    values: Array
-
-    Returns
-    -------
-    ranks: Array of int
-        the rank of this item in the input (the largest value ranks first)
-    """
-    tmp = np.argsort(-values)
-    ranks = np.empty_like(tmp)
-    ranks[tmp] = np.arange(len(tmp))
-    return ranks
-
-
-def pool_map(func, args, batch_size, verbose=False, pool=None):
-    """A wrapper of multiprocessing.pool.Pool.map to support small-batch mapping
-    for large argument list. This can reduce memory usage
-
-    Parameters
-    ----------
-    func: Func(arg) -> np.ndarray
-        mapping function
-    args: List
-        list of arguments
-    batch_size: int
-        batch size in mapping
-    verbose: bool, optional
-        whether print progress
-    pool: multiprocessing.Pool, optional
-        pool objection
-
-    Returns
-    -------
-    converted numpy array
-    """
-
-    ret = None
-    tic = time.time()
-    local_pool = pool or PopenPoolExecutor()
-    if verbose:
-        logger.info("mapping begin")
-    for i in range(0, len(args), batch_size):
-        if verbose:
-            logger.info("mapping %d/%d elapsed %.2f", i, len(args), time.time() - tic)
-        tmp = np.array(local_pool.map(func, args[i : i + batch_size]))
-        ret = tmp if ret is None else np.concatenate((ret, tmp))
-    if verbose:
-        logger.info("mapping done")
-    if not pool:
-        local_pool.close()
-    return ret
-
-
-def get_func_name(func):
-    """Get name of a function
-
-    Parameters
-    ----------
-    func: Function
-        The function
-    Returns
-    -------
-    name: str
-        The name
-    """
-
-    return func.func_name if hasattr(func, "func_name") else func.__name__
-
-
-def get_const_int(exp):
-    """Verifies expr is integer and get the constant value.
-
-    Parameters
-    ----------
-    exp : tvm.Expr or int
-        The input expression.
-
-    Returns
-    -------
-    out_value : int
-        The output.
-    """
-    if isinstance(exp, int):
-        return exp
-    if not isinstance(exp, (expr.IntImm,)):
-        ana = tvm.arith.Analyzer()
-        exp = ana.simplify(exp)
-    if not isinstance(exp, (expr.IntImm,)):
-        raise ValueError("Expect value to be constant int")
-    return exp.value
-
-
-def get_const_tuple(in_tuple):
-    """Verifies input tuple is IntImm or Var, returns tuple of int or Var.
-
-    Parameters
-    ----------
-    in_tuple : tuple of Expr
-        The input.
-
-    Returns
-    -------
-    out_tuple : tuple of int
-        The output.
-    """
-    ret = []
-    for elem in in_tuple:
-        if isinstance(elem, expr.Var):
-            ret.append(elem)
-        elif not isinstance(elem, (expr.IntImm, int)):
-            ana = tvm.arith.Analyzer()
-            elem = ana.simplify(elem)
-            if not isinstance(elem, (expr.IntImm)):
-                ret.append(elem)
-        else:
-            ret.append(get_const_int(elem))
-    return tuple(ret)
-
-
-SI_PREFIXES = "yzafpn\xb5m kMGTPEZY"
-YOCTO_EXP10 = -24
-
-
-def format_si_prefix(x, si_prefix):
-    exp10 = 10 ** (SI_PREFIXES.index(si_prefix) * 3 + YOCTO_EXP10)
-    return float(x) / exp10
diff --git a/python/tvm/contrib/cuda_graph/__init__.py b/python/tvm/contrib/cuda_graph/__init__.py
deleted file mode 100644
index 13a83393a912..000000000000
--- a/python/tvm/contrib/cuda_graph/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_executor.py b/python/tvm/contrib/cuda_graph/cuda_graph_executor.py
deleted file mode 100644
index d047316eb564..000000000000
--- a/python/tvm/contrib/cuda_graph/cuda_graph_executor.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Graph executor with CUDA Graph"""
-import tvm._ffi
-
-from tvm._ffi.base import string_types
-from tvm.contrib import graph_executor
-
-
-def create(graph_json_str, libmod, device):
-    """Create a runtime executor module given a graph and module.
-
-    Parameters
-    ----------
-    graph_json_str : str
-        The graph to be deployed in json format output by json graph.
-        The graph can contain operator(tvm_op) that points to the name
-        of PackedFunc in the libmod.
-
-    libmod : tvm.runtime.Module
-        The module of the corresponding function
-
-    device : Device
-        The device to deploy the module, only supports CUDA GPU
-
-    Returns
-    -------
-    graph_module : GraphModuleCudaGraph
-        CUDA graph executor module that can be used to execute the graph.
-
-    Note
-    ----
-    See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_executor.GraphModuleCudaGraph`
-    for examples to directly construct a GraphModuleCudaGraph from an exported
-    relay compiled library.
-    """
-    assert isinstance(graph_json_str, string_types)
-    try:
-        dev, num_rpc_dev, device_type_id = graph_executor.get_device(libmod, device)
-        if num_rpc_dev == len(dev):
-            fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor_cuda_graph.create")
-        else:
-            fcreate = tvm._ffi.get_global_func("tvm.graph_executor_cuda_graph.create")
-    except ValueError:
-        raise ValueError(
-            "To enable CUDA graph support (experimental), please set "
-            "'(USE_GRAPH_EXECUTOR_CUGRAPH ON)' in config.cmake and rebuild TVM"
-        )
-
-    return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id))
-
-
-class GraphModuleCudaGraph(graph_executor.GraphModule):
-    """CUDA graph executor module.
-
-    This is a CUDA graph executor wrapper over the TVM runtime.
-    Runtime interfaces are wrapped with CUDA graph functionalities.
-
-    Parameters
-    ----------
-    module : Module
-        The internal tvm module that holds the actual graph functions.
-    """
-
-    def __init__(self, module):
-        self._start_capture = module["start_capture"]
-        self._end_capture = module["end_capture"]
-        self._run_cuda_graph = module["run_cuda_graph"]
-        self._cuda_graph_captured = False
-        graph_executor.GraphModule.__init__(self, module)
-
-    def capture_cuda_graph(self):
-        """Capture a CUDA graph for tvm_op graph
-
-        This should be called before run_cuda_graph() to capture and
-        instantiate a CUDA graph instance.
-        """
-        self._run()  # call cuModuleLoadData before cudaStream API
-        self._start_capture()
-        self._run()
-        self._end_capture()
-        self._cuda_graph_captured = True
-
-    def run_cuda_graph(self):
-        """Run the CUDA graph for tvm_op graph
-
-        Run the captured CUDA graph instance instead of the
-        for-loop kernel launch of default graph executor
-        """
-        self._run_cuda_graph()
-
-    def run(self, **input_dict):
-        """A run wrapper for graph capture / launch, user can just
-        change default graph executor to cuda graph executor, and
-        the first call will capture a cuda graph for future launch
-
-        Parameters
-        ----------
-        input_dict: dict of str to NDArray
-            List of input values to be feed to
-        """
-        if input_dict:
-            self.set_input(**input_dict)
-        if not self._cuda_graph_captured:
-            self.capture_cuda_graph()
-        else:
-            self._run_cuda_graph()
-
-    def debug_get_output(self, node, out):
-        """Run graph up to node and get the output to out
-
-        Parameters
-        ----------
-        node : int / str
-            The node index or name
-
-        out : NDArray
-            The output array container
-        """
-        raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.")
diff --git a/python/tvm/contrib/cutlass/__init__.py b/python/tvm/contrib/cutlass/__init__.py
index 4b56ac4e164a..5c3ee3ae2556 100644
--- a/python/tvm/contrib/cutlass/__init__.py
+++ b/python/tvm/contrib/cutlass/__init__.py
@@ -15,4 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.
 """BYOC support for CUTLASS."""
-from .build import has_cutlass, num_cutlass_partitions, finalize_modules, finalize_modules_vm
+from .build import has_cutlass, num_cutlass_partitions, finalize_modules
diff --git a/python/tvm/contrib/cutlass/build.py b/python/tvm/contrib/cutlass/build.py
index 5c09c79bd906..ba598e9b225e 100644
--- a/python/tvm/contrib/cutlass/build.py
+++ b/python/tvm/contrib/cutlass/build.py
@@ -25,7 +25,7 @@
 from typing import Optional, Sequence
 
 import tvm
-from tvm import relax, relay, runtime
+from tvm import relax, runtime
 from tvm._ffi.registry import register_func
 from tvm.contrib.nvcc import get_cuda_version
 from tvm.topi.utils import get_const_tuple
@@ -89,35 +89,6 @@ def _get_cutlass_compile_options(sm, threads, use_fast_math=False):
     return kwargs
 
 
-class OpAnnotator(tvm.relay.ExprVisitor):
-    """Annotates partitioned functions with shape and dtype information."""
-
-    def __init__(self):
-        super().__init__()
-        self.signature = {}
-
-    def visit_call(self, call):
-        op = call.op
-        if isinstance(op, relay.Function) and "Composite" in op.attrs:
-            self.signature["op_type"] = op.attrs["Composite"]
-            for i, arg in enumerate(op.params):
-                self.signature[f"arg{i}_shape"] = arg.checked_type.shape
-                self.signature[f"arg{i}_dtype"] = arg.checked_type.dtype
-            self.signature["ret_shape"] = op.ret_type.shape
-            self.signature["ret_dtype"] = op.ret_type.dtype
-            self.visit(op.body)
-
-        elif isinstance(op, tvm.ir.Op) and op.name in [
-            "nn.conv2d",
-            "nn.conv2d_transpose",
-            "nn.conv2d_backward_weight",
-        ]:
-            self.op_attrs = call.attrs
-
-        for arg in call.args:
-            self.visit(arg)
-
-
 def select_gemm_kernel(
     cutlass_profiler,
     op_type,
@@ -389,148 +360,6 @@ def tune_cutlass_kernels(
     return mod, num_cutlass_partition
 
 
-def tune_cutlass_function(
-    func,
-    use_3xtf32,
-    split_k_slices,
-    profile_all_alignments,
-    find_first_valid,
-    use_multiprocessing,
-    gemm_profiler,
-    conv2d_profiler,
-):
-    """Given a function intended to be offloaded to CUTLASS,  profile each workload to select which
-    kernels to emit.
-
-    Parameters
-    ----------
-    func : IRModule
-        The Relay Function to tune for.
-
-    use_3xtf32 : bool
-        Wheter or not use slower but very accurate (compared to tf32) 3xtf32 mode for
-        fp32 inputs on tensorcore.
-
-    split_k_slices : list of int
-        Split factor candidates for split-K GEMM. If split-K > 1, the GEMM K-loop is computed in
-        parallel accross split-K blocks, and a seperate global reduction kernel is launched to
-        accumulate partial reductions. The profiler will pick the best split-k factor from the
-        given candidate list. Note that the larger split-K factor requires a larger workspace.
-        Currently, parallel split-k has been tested only for wgrad. For GEMM and other conv2d
-        kinds, split_k_slices is ignored.
-
-    profile_all_alignments : bool
-        When True, profile all kernal variants with smaller alignments than the largest possible.
-
-    find_first_valid : bool
-        Whether or not profile all candidate kernels, or stop profiling after
-        the first applicable kernel is found.
-
-    use_multiprocessing : bool
-        Whether or not compile profiler executables for different kernels in parallel.
-
-    gemm_profiler : CutlassGemmProfiler
-        Profiler for dense operators. May cache results between tuned functions.
-
-    conv2d_profiler : CutlassConv2DProfiler
-        Profiler for conv2d operators. May cach results between tuned functions.
-
-    Returns
-    -------
-    annot_func : Function
-        The input function with attributes capturing the best CUTLASS kernel found by tuning.
-    """
-    annotator = OpAnnotator()
-    annotator.visit(func)
-    out_shape = annotator.signature["ret_shape"]
-    out_dtype = annotator.signature["ret_dtype"]
-    op_type = annotator.signature["op_type"]
-
-    new_attrs = {"op_type": op_type}
-    new_attrs.update(annotator.signature)
-    new_attrs.update(func.attrs)
-    arg0_shape = new_attrs["arg0_shape"]
-    arg1_shape = new_attrs["arg1_shape"]
-    arg0_dtype = new_attrs["arg0_dtype"]
-    arg1_dtype = new_attrs["arg1_dtype"]
-
-    if "conv2d" in op_type:
-        new_attrs["padding"] = annotator.op_attrs.padding
-        new_attrs["strides"] = annotator.op_attrs.strides
-        new_attrs["dilation"] = annotator.op_attrs.dilation
-
-        if "conv2d_transpose" in op_type:
-            d_shape = out_shape
-            w_shape = arg1_shape
-        elif "conv2d_backward_weight" in op_type:
-            d_shape = arg1_shape
-            w_shape = out_shape
-        else:
-            d_shape = arg0_shape
-            w_shape = arg1_shape
-
-        new_attrs.update(
-            handle_conv2d(
-                conv2d_profiler,
-                op_type,
-                d_shape,
-                w_shape,
-                annotator.op_attrs.padding,
-                annotator.op_attrs.strides,
-                annotator.op_attrs.dilation,
-                out_dtype,
-                arg0_dtype,
-                arg1_dtype,
-                use_3xtf32,
-                split_k_slices,
-                profile_all_alignments,
-                find_first_valid,
-                use_multiprocessing,
-            )
-        )
-    elif "batch_matmul" in op_type:
-        new_attrs.update(
-            handle_batch_matmul(
-                gemm_profiler,
-                op_type,
-                arg0_shape,
-                arg1_shape,
-                out_dtype,
-                arg0_dtype,
-                arg1_dtype,
-                use_3xtf32,
-                find_first_valid,
-                use_multiprocessing,
-            )
-        )
-    elif "dense" in op_type:
-        new_attrs.update(
-            handle_dense(
-                gemm_profiler,
-                op_type,
-                arg0_shape,
-                arg1_shape,
-                out_dtype,
-                arg0_dtype,
-                arg1_dtype,
-                use_3xtf32,
-                find_first_valid,
-                use_multiprocessing,
-            )
-        )
-    else:
-        raise ValueError(f"{op_type} unsupported composite")
-
-    new_attrs = tvm.ir.make_node("DictAttrs", **new_attrs)
-    return relay.Function(
-        func.params,
-        func.body,
-        ret_type=func.ret_type,
-        type_params=func.type_params,
-        attrs=new_attrs,
-    )
-
-
 def _get_call_node(expr: relax.Expr, op_name: str) -> Optional[relax.Call]:
     node = None
 
@@ -1047,52 +876,6 @@ def compile_cutlass_module(c_source_module, options):
     return tvm.runtime.load_static_library(lib_path, function_names)
 
 
-@register_func("relay.ext.cutlass.compile_for_cutlass")
-def compile_for_cutlass(mod, cutlass_target):
-    """Given an IRModule with at least one Compiler='cutlass' Relay function, return a
-    LibraryModule with all such functions compiled into their PackedFunc-compatible form.
-     - First runs CUTLASS tuning to decide on the best kernels, which itself requires the
-       repeated compilation and execution of CUDA code using nvcc. The results of this
-       is captured as annotation on each relevant function. Kernel performance is cached
-       overall all functions.
-     - Then generates a single CSourceModule containing C code implementing all the
-       Compiler='cutlass' Relay functions, accounting for the tuning done above.
-     - Then compiles that CSourceModule with the appropriate nvcc arguments to yield
-       a static .o library. An export_library step will be required on the final runtime
-       module to link that library into the overall .so library.
-     See CompileForCutlass in src/relay/backend/contrib/cutlass/codegen.cc for where this
-     helper function is used to implement the RelayToTIR pass hook for CUTLASS."""
-
-    # Recover options from the current 'cutlass' Target
-    assert cutlass_target.kind.name == "cutlass"
-    tuning_config = {
-        key: cutlass_target.attrs.get(key)
-        for key in [
-            "sm",
-            "use_3xtf32",
-            "split_k_slices",
-            "profile_all_alignments",
-            "find_first_valid",
-            "use_multiprocessing",
-        ]
-    }
-    compile_config = {
-        key: cutlass_target.attrs.get(key) for key in ["sm", "threads", "use_fast_math"]
-    }
-    tmp_dir = cutlass_target.attrs.get("tmp_dir")
-    compile_config["tmp_dir"] = tmp_dir
-
-    # Tune
-    logger.info("Tuning for CUTLASS")
-    mod, _ = tune_cutlass_kernels(mod, tmp_dir=tmp_dir, **tuning_config)
-
-    # Compile
-    logger.info("Creating CSource module for CUTLASS")
-    create_c_source_module = tvm._ffi.get_global_func("relay.ext.cutlass.create_c_source_module")
-    c_module = create_c_source_module(mod)
-    return compile_cutlass_module(c_module, compile_config)
-
-
 def finalize_modules(lib, lib_path="compile.so", tmp_dir="./tmp"):
     """Returns lib with any C source, LLVM and static library modules complied and linked in ready
     for use by the graph or AOT executors. This method is not specific to CUTLASS, however it does
@@ -1102,7 +885,7 @@ def finalize_modules(lib, lib_path="compile.so", tmp_dir="./tmp"):
     Parameters
     ----------
     lib : runtime.Module
-        The output from relay.build.
+        The output from build.
 
     lib_path : string
         The path to a shared library which will be generated as the result of the build process.
@@ -1119,38 +902,3 @@ def finalize_modules(lib, lib_path="compile.so", tmp_dir="./tmp"):
     lib_path = os.path.join(tmp_dir, lib_path)
     lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc")
     return runtime.load_module(lib_path)
-
-
-def finalize_modules_vm(vm_exec, lib_path="compile.so", vmcode_path="vmcode.ro", tmp_dir="./tmp"):
-    """Returns vm_exec with any C source, LLVM and static library modules compiled and linked in
-    ready for use by the VM executor. This method is not specific to CUTLASS, however it does
-    assume nvcc will be used for final compilation and linking. It is provided here for
-    convenience.
-
-    Parameters
-    ----------
-    vm_exec : vm.Executable
-        The output from relay.vm.compile containing compiled host code and kernels.
-
-    lib_path : string
-        The path to a shared library which will be generated as the result of the build process.
-
-    vmcode_path : string
-        The path where the VM bytecode will be serialized to as a side-effect.
-
-    tmp_dir : string
-        A temporary directory where intermediate compiled artifacts will be stored.
-
-    Returns
-    -------
-    updated_vm_exec : vm.Executable
-        The updated VM executable with all compilation and linking completed.
-    """
-    code, lib = vm_exec.save()
-    lib_path = os.path.join(tmp_dir, lib_path)
-    vmcode_path = os.path.join(tmp_dir, vmcode_path)
-    lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc")
-    with open(vmcode_path, "wb") as fo:
-        fo.write(code)
-    lib = tvm.runtime.load_module(lib_path)
-    return tvm.runtime.vm.Executable.load_exec(code, lib)
diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py
deleted file mode 100644
index 0db7877b5e42..000000000000
--- a/python/tvm/contrib/graph_executor.py
+++ /dev/null
@@ -1,549 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Minimum graph executor that executes graph containing TVM PackedFunc."""
-import numpy as np
-import tvm._ffi
-
-from tvm.rpc import _ffi_api as _rpc_ffi_api
-from tvm.rpc import base as rpc_base
-from tvm._ffi.base import string_types
-from tvm._ffi.runtime_ctypes import Device
-
-
-def create(graph_json_str, libmod, device):
-    """Create a runtime executor module given a graph and module.
-
-    Parameters
-    ----------
-    graph_json_str : str
-        The graph to be deployed in json format output by json graph.
-        The graph can contain operator(tvm_op) that points to the name
-        of PackedFunc in the libmod.
-
-    libmod : tvm.runtime.Module
-        The module of the corresponding function
-
-    device : Device or list of Device
-        The device to deploy the module. It can be local or remote when there
-        is only one Device. Otherwise, the first device in the list will
-        be used as this purpose. All device should be given for heterogeneous
-        execution.
-
-    Returns
-    -------
-    graph_module : GraphModule
-        Runtime graph module that can be used to execute the graph.
-
-    Note
-    ----
-    See also :py:class:`tvm.contrib.graph_executor.GraphModule`
-    for examples to directly construct a GraphModule from an exported
-    relay compiled library.
-    """
-    assert isinstance(graph_json_str, string_types)
-
-    dev, num_rpc_dev, device_type_id = get_device(libmod, device)
-
-    if num_rpc_dev == len(dev):
-        fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor.create")
-    else:
-        fcreate = tvm._ffi.get_global_func("tvm.graph_executor.create")
-
-    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
-
-
-def get_device(libmod, device):
-    """Parse and validate all the device(s).
-
-    Parameters
-    ----------
-    libmod : tvm.runtime.Module
-        The module of the corresponding function
-
-    device : Device or list of Device
-
-    Returns
-    -------
-    device : list of Device
-    num_rpc_dev : Number of rpc devices
-    device_type_id : List of device type and device id
-    """
-
-    if isinstance(device, Device):
-        device = [device]
-    elif not isinstance(device, (list, tuple)):
-        raise ValueError("dev has to be the type of Device or a list of Device")
-    for cur_dev in device:
-        if not isinstance(cur_dev, Device):
-            raise ValueError("dev has to be the type of Device or a list of Device")
-
-    # device_type_id[0], device_type_id[1] are used as the primary/fallback
-    # device type and id. All other ones are used as device for
-    # heterogeneous execution.
-    num_rpc_dev = 0
-    device_type_id = []
-    for cur_dev in device:
-        device_type = cur_dev.device_type
-        if device_type >= rpc_base.RPC_SESS_MASK:
-            assert libmod.type_key == "rpc"
-            assert _rpc_ffi_api.SessTableIndex(libmod) == cur_dev._rpc_sess._tbl_index
-            num_rpc_dev += 1
-            device_type = cur_dev.device_type % rpc_base.RPC_SESS_MASK
-        device_type_id.append(device_type)
-        device_type_id.append(cur_dev.device_id)
-
-    if 0 < num_rpc_dev < len(device):
-        raise ValueError("Either all or none of the devices should be rpc.")
-    return device, num_rpc_dev, device_type_id
-
-
-class GraphModule(object):
-    """Wrapper runtime module.
-
-    This is a thin wrapper of the underlying TVM module.
-    you can also directly call set_input, run, and get_output
-    of underlying module functions
-
-    Parameters
-    ----------
-    module : tvm.runtime.Module
-        The internal tvm module that holds the actual graph functions.
-
-    Attributes
-    ----------
-    module : tvm.runtime.Module
-        The internal tvm module that holds the actual graph functions.
-
-    Examples
-    --------
-
-    .. code-block:: python
-
-        import tvm
-        from tvm import relay
-        from tvm.contrib import graph_executor
-
-        # build the library using graph executor
-        lib = relay.build(...)
-        lib.export_library("compiled_lib.so")
-        # load it back as a runtime
-        lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
-        # Call the library factory function for default and create
-        # a new runtime.Module, wrap with graph module.
-        gmod = graph_executor.GraphModule(lib["default"](dev))
-        # use the graph module.
-        gmod.set_input("x", data)
-        gmod.run()
-    """
-
-    def __init__(self, module):
-        self.module = module
-        self._set_input = module["set_input"]
-
-        # TODO(shingjan): The graph_executor in C doesn't have
-        # set_input/output_zero_copy implemented.
-        try:
-            self._set_input_zero_copy = module["set_input_zero_copy"]
-        except AttributeError:
-            self._set_input_zero_copy = lambda *_: (_ for _ in ()).throw(
-                Exception("set_input_zero_copy is not implemented for C graph executor")
-            )
-        try:
-            self._set_output_zero_copy = module["set_output_zero_copy"]
-        except AttributeError:
-            self._set_output_zero_copy = lambda *_: (_ for _ in ()).throw(
-                Exception("set_output_zero_copy is not implemented for C graph executor")
-            )
-        self._run = module["run"]
-        self._get_output = module["get_output"]
-        self._get_input = module["get_input"]
-        self._get_num_outputs = module["get_num_outputs"]
-        self._get_input_index = module["get_input_index"]
-        self._get_output_index = module["get_output_index"]
-        self._get_input_info = module["get_input_info"]
-        self._get_output_info = module["get_output_info"]
-        self._get_num_inputs = module["get_num_inputs"]
-        self._load_params = module["load_params"]
-        self._share_params = module["share_params"]
-
-    def set_input(self, key=None, value=None, **params):
-        """Set inputs to the module via kwargs
-
-        Parameters
-        ----------
-        key : int or str
-           The input key
-
-        value : the input value.
-           The input value
-
-        params : dict of str to NDArray
-           Additional arguments
-        """
-        if key is not None:
-            v = self._get_input(key)
-            if v is None:
-                raise RuntimeError(f"Could not find '{key}' in graph's inputs")
-            v.copyfrom(value)
-
-        if params:
-            # upload big arrays first to avoid memory issue in rpc mode
-            keys = list(params.keys())
-            keys.sort(key=lambda x: -np.prod(params[x].shape))
-            for k in keys:
-                # TODO(zhiics) Skip the weights for submodule in a better way.
-                # We should use ConstLoaderModule for initialization and remove
-                # params from set_input
-                val = self._get_input(k)
-                if val:
-                    self._get_input(k).copyfrom(params[k])
-
-    def set_input_zero_copy(self, key=None, value=None, **params):
-        """Set inputs to the module via kwargs with zero memory copy
-
-        Parameters
-        ----------
-        key : int or str
-           The input key
-
-        value : the input value in DLPack
-           The input value
-
-        params : dict of str to NDArray
-           Additional arguments
-        """
-        if key is not None:
-            self._set_input_zero_copy(key, value)
-
-        if params:
-            keys = list(params.keys())
-
-            for k in keys:
-                # TODO(zhiics) Skip the weights for submodule in a better way.
-                # We should use ConstLoaderModule for initialization and remove
-                # params from set_input
-                val = self._get_input(k)
-                if val:
-                    self._set_input_zero_copy(k, params[k])
-
-    def set_output_zero_copy(self, key, value):
-        """Set outputs to the module with zero memory copy
-
-        Parameters
-        ----------
-        key : int or str
-           The output key
-
-        value : the output value in DLPack
-           The output value
-        """
-        self._set_output_zero_copy(key, value)
-
-    def run(self, **input_dict):
-        """Run forward execution of the graph
-
-        Parameters
-        ----------
-        input_dict: dict of str to NDArray
-            List of input values to be feed to
-        """
-        if input_dict:
-            self.set_input(**input_dict)
-        self._run()
-
-    def get_num_outputs(self):
-        """Get the number of outputs from the graph
-
-        Returns
-        -------
-        count : int
-            The number of outputs.
-        """
-        return self._get_num_outputs()
-
-    def get_num_inputs(self):
-        """Get the number of inputs to the graph
-
-        Returns
-        -------
-        count : int
-            The number of inputs.
-        """
-        return self._get_num_inputs()
-
-    def get_input(self, index, out=None):
-        """Get index-th input to out
-
-        Parameters
-        ----------
-        index : int
-            The input index
-
-        out : NDArray
-            The output array container
-        """
-        if out:
-            self._get_input(index).copyto(out)
-            return out
-
-        return self._get_input(index)
-
-    def get_input_index(self, name):
-        """Get inputs index via input name.
-
-        Parameters
-        ----------
-        name : str
-           The input key name
-
-        Returns
-        -------
-        index: int
-            The input index. -1 will be returned if the given input name is not found.
-        """
-        return self._get_input_index(name)
-
-    def get_output_index(self, name):
-        """Get outputs index via output name.
-
-        Parameters
-        ----------
-        name : str
-           The output key name
-
-        Returns
-        -------
-        index: int
-            The output index. -1 will be returned if the given output name is not found.
-        """
-        return self._get_output_index(name)
-
-    def get_input_info(self):
-        """Return the 'shape' and 'dtype' dictionaries of the graph.
-
-        .. note::
-            We can't simply get the input tensors from a TVM graph
-            because weight tensors are treated equivalently. Therefore, to
-            find the input tensors we look at the 'arg_nodes' in the graph
-            (which are either weights or inputs) and check which ones don't
-            appear in the params (where the weights are stored). These nodes
-            are therefore inferred to be input tensors.
-
-        Returns
-        -------
-        shape_dict : Map
-            Shape dictionary - {input_name: tuple}.
-        dtype_dict : Map
-            dtype dictionary - {input_name: dtype}.
-        """
-        input_info = self._get_input_info()
-        assert "shape" in input_info
-        shape_dict = input_info["shape"]
-        assert "dtype" in input_info
-        dtype_dict = input_info["dtype"]
-
-        return shape_dict, dtype_dict
-
-    def get_output_info(self):
-        """Return the 'shape' and 'dtype' dictionaries of the graph.
-
-        Returns
-        -------
-        shape_dict : Map
-            Shape dictionary - {output_name: tuple}.
-        dtype_dict : Map
-            dtype dictionary - {output_name: dtype}.
-        """
-        output_info = self._get_output_info()
-        assert "shape" in output_info
-        shape_dict = output_info["shape"]
-        assert "dtype" in output_info
-        dtype_dict = output_info["dtype"]
-
-        return shape_dict, dtype_dict
-
-    def get_output(self, index, out=None):
-        """Get index-th output to out
-
-        Parameters
-        ----------
-        index : int
-            The output index
-
-        out : NDArray
-            The output array container
-        """
-        if out:
-            self._get_output(index, out)
-            return out
-
-        return self._get_output(index)
-
-    def debug_get_output(self, node, out):
-        """Run graph up to node and get the output to out
-
-        Parameters
-        ----------
-        node : int / str
-            The node index or name
-
-        out : NDArray
-            The output array container
-        """
-        raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.")
-
-    def load_params(self, params_bytes):
-        """Load parameters from serialized byte array of parameter dict.
-
-        Parameters
-        ----------
-        params_bytes : bytearray
-            The serialized parameter dict.
-        """
-        self._load_params(bytearray(params_bytes))
-
-    def share_params(self, other, params_bytes):
-        """Share parameters from pre-existing GraphExecutor instance.
-
-        Parameters
-        ----------
-        other: GraphExecutor
-            The parent GraphExecutor from which this instance should share
-            it's parameters.
-        params_bytes : bytearray
-            The serialized parameter dict (used only for the parameter names).
-        """
-        self._share_params(other.module, bytearray(params_bytes))
-
-    def __getitem__(self, key):
-        """Get internal module function
-
-        Parameters
-        ----------
-        key : str
-            The key to the module.
-        """
-        return self.module[key]
-
-    def benchmark(
-        self,
-        device,
-        func_name="run",
-        repeat=5,
-        number=5,
-        min_repeat_ms=None,
-        limit_zero_time_iterations=100,
-        end_to_end=False,
-        cooldown_interval_ms=0,
-        repeats_to_cooldown=1,
-        **kwargs,
-    ):
-        """Calculate runtime of a function by repeatedly calling it.
-
-        Use this function to get an accurate measurement of the runtime of a function. The function
-        is run multiple times in order to account for variability in measurements, processor speed
-        or other external factors.  Mean, median, standard deviation, min and max runtime are all
-        reported.  On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
-        synchonization and data transfer operations are not counted towards the runtime. This allows
-        for fair comparison of runtimes across different functions and models. The `end_to_end` flag
-        switches this behavior to include data transfer operations in the runtime.
-
-        The benchmarking loop looks approximately like so:
-
-        .. code-block:: python
-
-            for r in range(repeat):
-                time_start = now()
-                for n in range(number):
-                    func_name()
-                time_end = now()
-                total_times.append((time_end - time_start)/number)
-
-
-        Parameters
-        ----------
-        func_name : str
-            The function to benchmark. This is ignored if `end_to_end` is true.
-
-        repeat : int
-            Number of times to run the outer loop of the timing code (see above). The output will
-            contain `repeat` number of datapoints.
-
-        number : int
-            Number of times to run the inner loop of the timing code. This inner loop is run in
-            between the timer starting and stopping. In order to amortize any timing overhead,
-            `number` should be increased when the runtime of the function is small (less than a 1/10
-            of a millisecond).
-
-        min_repeat_ms : Optional[int]
-            If set, the inner loop will be run until it takes longer than `min_repeat_ms`
-            milliseconds. This can be used to ensure that the function is run enough to get an
-            accurate measurement.
-
-        limit_zero_time_iterations : Optional[int]
-            The maximum number of repeats when measured time is equal to 0.
-            It helps to avoid hanging during measurements.
-
-        end_to_end : bool
-            If set, include time to transfer input tensors to the device and time to transfer
-            returned tensors in the total runtime. This will give accurate timings for end to end
-            workloads.
-
-        cooldown_interval_ms: Optional[int]
-            The cooldown interval in milliseconds between the number of repeats defined by
-            `repeats_to_cooldown`.
-
-        repeats_to_cooldown: Optional[int]
-            The number of repeats before the cooldown is activated.
-
-        kwargs : Dict[str, Object]
-            Named arguments to the function. These are cached before running timing code, so that
-            data transfer costs are not counted in the runtime.
-
-        Returns
-        -------
-        timing_results : BenchmarkResult
-            Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
-            access the individual runtimes (in seconds).
-        """
-        min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
-        if end_to_end:
-            # Have to unpack kwargs into a single list
-            args = []
-            for k, v in kwargs.items():
-                args.append(k)
-                args.append(v)
-            return self.module.time_evaluator(
-                "run_from_inputs",
-                device,
-                repeat=repeat,
-                number=number,
-                min_repeat_ms=min_repeat_ms,
-                limit_zero_time_iterations=limit_zero_time_iterations,
-            )(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args)
-        if kwargs:
-            self.set_input(**kwargs)
-        return self.module.time_evaluator(
-            func_name,
-            device,
-            repeat=repeat,
-            number=number,
-            min_repeat_ms=min_repeat_ms,
-            limit_zero_time_iterations=limit_zero_time_iterations,
-            cooldown_interval_ms=cooldown_interval_ms,
-            repeats_to_cooldown=repeats_to_cooldown,
-        )()
diff --git a/python/tvm/contrib/hexagon/__init__.py b/python/tvm/contrib/hexagon/__init__.py
index b2e4bbdd7945..33278916d3ae 100644
--- a/python/tvm/contrib/hexagon/__init__.py
+++ b/python/tvm/contrib/hexagon/__init__.py
@@ -17,4 +17,3 @@
 """Hexagon APIs."""
 
 from .tools import *
-from .transform import *
diff --git a/python/tvm/contrib/hexagon/transform.py b/python/tvm/contrib/hexagon/transform.py
deleted file mode 100644
index 6800f4cea0fb..000000000000
--- a/python/tvm/contrib/hexagon/transform.py
+++ /dev/null
@@ -1,504 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Hexagon-specific IR transformations"""
-
-import functools as ft
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.relay.dataflow_pattern import (
-    DFPatternCallback,
-    is_constant,
-    is_op,
-    is_tuple,
-    rewrite,
-    wildcard,
-)
-from tvm.topi.utils import get_const_tuple
-from tvm.relay.expr import Call
-from tvm.runtime import ndarray as nd
-from ..._ffi.registry import register_func
-
-### VTCM
-
-vtcm_size = 4 * 1024 * 1024  # pylint: disable=invalid-name
-
-
-@register_func("tvm.info.mem.local.vtcm")
-def mem_info_vtcm():
-    # pylint: disable=bad-whitespace
-    return tvm.ir.make_node(
-        "MemoryInfo",
-        unit_bits=8,
-        max_num_bits=vtcm_size * 8,
-        max_simd_bits=128 * 8,
-        head_address=tvm.runtime.const(100, "uint32"),
-    )
-
-
-def lower_vtcm_(get_alloc, get_free, def_align, func, mod, ctx):  # pylint: disable=unused-argument
-    """Generic VTCM allocation
-
-    Parameters
-    ----------
-    get_alloc : function: tir.Allocate, int -> tir.expr (dtype='handle')
-      The VTCM allocation function. It takes an Allocate statement, and the required
-      alignment, and returns a pointer to the allocated VTCM buffer.
-    get_free : function: tir.expr (dtype='handle') -> None
-      The VTCM deallocation function. It takes the address of the allocated buffer
-      and frees it. It returns no value.
-    def_align : int
-      The default alignment that will be passed to the allocation function, if the
-      program does not specify the alignment via a 'storage_alignment' attribute.
-    func : tir.PrimFunc
-    mod : tvm.IRModule
-    ctx : transform.PassContext
-
-    Returns
-    -------
-    stmt : tvm.stmt
-        Transformed function body.
-    """
-
-    vtcm_buffers = []
-    alignments = {}
-
-    def buf_align(var):
-        """Determine the alignment of the buffer with variable 'var'."""
-        if var in alignments and alignments[var]:
-            return alignments[var][-1]
-        return def_align
-
-    def visit(stmt):
-        """Collect information about VTCM buffers and their alignments."""
-        if isinstance(stmt, tvm.tir.AttrStmt):
-            if stmt.attr_key == "storage_alignment":
-                if not stmt.node in alignments:
-                    alignments[stmt.node] = []
-                alignments[stmt.node].append(stmt.value)
-        elif isinstance(stmt, tvm.tir.Allocate):
-            scope = stmt.buffer_var.type_annotation.storage_scope
-            if scope == "local.vtcm":
-                vtcm_buffers.append(stmt.buffer_var)
-
-    def mutate(stmt):
-        """Insert calls to VTCM allocation and deallocation routines."""
-        if isinstance(stmt, tvm.tir.AttrStmt):
-            if stmt.attr_key == "storage_alignment":
-                alignments[stmt.node].pop()
-            return stmt
-        if isinstance(stmt, tvm.tir.Allocate):
-            var = stmt.buffer_var
-            scope = var.type_annotation.storage_scope
-            is_vtcm = var in vtcm_buffers
-            if scope == "local.vtcm":
-                vtcm_buffers.pop()
-            if is_vtcm:
-                is_null = tvm.tir.call_intrin("bool", tvm.ir.Op.get("tir.isnullptr"), var)
-                throw_error = tvm.tir.call_intrin(
-                    "int32", tvm.ir.Op.get("tir.tvm_throw_last_error")
-                )
-                body_w_free = tvm.tir.SeqStmt([stmt.body, tvm.tir.Evaluate(get_free(var))])
-                body_w_check = tvm.tir.IfThenElse(
-                    is_null, tvm.tir.Evaluate(throw_error), body_w_free
-                )
-                return tvm.tir.LetStmt(
-                    stmt.buffer_var, get_alloc(stmt, buf_align(var)), body_w_check
-                )
-            return stmt
-        raise ValueError("Wrong argument type (" + type(stmt) + ") to 'mutate'")
-
-    f = func.with_body(
-        tvm.tir.stmt_functor.ir_transform(
-            func.body, visit, mutate, ["tir.Allocate", "tir.AttrStmt"]
-        )
-    )
-    return f
-
-
-def ir_lower_vtcm():
-    """Create a VTCM lowering pass.
-
-    VTCM memory has to be allocated using special functions.
-    """
-
-    def get_alloc(stmt, align):
-        assert isinstance(stmt, tvm.tir.Allocate)
-        return tvm.tir.call_extern(
-            "handle",
-            "HexagonBackendAllocateVTCM",
-            ft.reduce(lambda x, y: x * y, stmt.extents, 1),
-            align,
-        )
-
-    def get_free(var):
-        return tvm.tir.call_extern("handle", "HexagonBackendFreeVTCM", var)
-
-    # pylint: disable=bad-whitespace
-    @tvm.tir.transform.prim_func_pass(opt_level=0, name="Lower VTCM pass")
-    def transform(func, mod, ctx):
-        return lower_vtcm_(get_alloc, get_free, 2048, func, mod, ctx)
-
-    return transform
-
-
-def ir_lower_vtcm_pass():
-    return [(3, ir_lower_vtcm())]
-
-
-class qdistilbert_rewrite(DFPatternCallback):
-    """
-    A callback to replace the below pattern:
-    Pattern:
-    %35 = strided_slice(%34, begin=[0, 0, 0], end=[1, 128, 64], strides=[1, 1, 1], axes=None);
-    %44 = reshape(%35, newshape=[-1, 64]);
-    <snip>
-    %42 = strided_slice(%41, begin=[0, 0, 0], end=[1, 64, 128], strides=[1, 1, 1], axes=None);
-    %43 = reshape(%42, newshape=[64, 128]);
-    %45 = transpose(%43, axes=[1, 0]);
-    <snip>
-    %46 = qnn.dense(%44, %45, 13, 1, 0.0541715f, 0.0489368f, units=None, out_dtype="int32");
-    %47 = qnn.requantize(%46, 0.00265098f, 0, 0.728874f, -14, axis=1, out_dtype="int8");
-    <snip>
-    %125 = expand_dims(%47, axis=0) /* ty=Tensor[(1, 128, 128), int8] */;
-    < The above pattern repeats 12 times, which is the batch size >
-
-    %137 = (%125, %126, %127, %128, %129, %130, %131, %132, %133, %134, %135, %136);
-    %138 = concatenate(%137);
-
-    """
-
-    def __init__(self):
-        super(qdistilbert_rewrite, self).__init__()
-        self.A = wildcard()  # Tensor A
-        self.B = wildcard()  # Tensor B
-        self.batch = 12  # Number of time pattern repeats or Batch size
-
-        self.d = []  # List of dense quantization parameters
-        self.q = []  # List of requantize parameters
-        L = []  # List of patterns
-
-        z = tvm.tir.IntImm("int64", 0)
-        s1 = tvm.tir.IntImm("int64", 1)
-
-        for i in range(self.batch):
-            x = tvm.tir.IntImm("int64", i)
-
-            self.d.append([is_constant(), is_constant(), is_constant(), is_constant()])
-            self.q.append([is_constant(), is_constant(), is_constant(), is_constant()])
-
-            pat_a = is_op("strided_slice")(self.A).has_attr(
-                {"begin": [x, z, z], "strides": [s1, s1, s1]}
-            )
-            pat_a = is_op("reshape")(pat_a)
-
-            pat_b = is_op("strided_slice")(self.B).has_attr(
-                {"begin": [x, z, z], "strides": [s1, s1, s1]}
-            )
-            pat_b = is_op("reshape")(pat_b)
-            pat_b = is_op("transpose")(pat_b)
-
-            pat = is_op("qnn.dense")(
-                pat_a, pat_b, self.d[i][0], self.d[i][1], self.d[i][2], self.d[i][3]
-            )
-            pat = is_op("qnn.requantize")(
-                pat, self.q[i][0], self.q[i][1], self.q[i][2], self.q[i][3]
-            )
-            pat = is_op("expand_dims")(pat)
-            L.append(pat)
-
-        T = is_tuple(L)
-        self.pattern = is_op("concatenate")(T)
-
-    def check_quant_params(self, node_map):
-        """checking if dense and requant params are the same across patterns"""
-        r = self.batch
-        x1 = [node_map[self.d[0][i]][0].data.numpy().item() for i in range(4)]
-        x2 = [node_map[self.q[0][i]][0].data.numpy().item() for i in range(4)]
-        for i in range(1, r):
-            for j in range(4):
-                y1 = node_map[self.d[i][j]][0].data.numpy().item()
-                y2 = node_map[self.q[i][j]][0].data.numpy().item()
-                if x1[j] != y1 or x2[j] != y2:
-                    return False
-        return True
-
-    def callback(self, pre, post, node_map):
-        A = node_map[self.A][0]
-        B = node_map[self.B][0]
-
-        if not self.check_quant_params(node_map):
-            return post
-
-        [a0, a1, a2] = [0, 0, 0]  # Tensor A shape
-        [b0, b1, b2] = [0, 0, 0]  # Tensor B shape
-
-        if isinstance(A, relay.expr.Call) and isinstance(B, relay.expr.Call):
-            if A.checked_type is None or B.checked_type is None:
-                # Need infer pass to be run before this pass
-                return post
-            if len(A.checked_type.shape) == 3 and len(B.checked_type.shape) == 3:
-                [a0, a1, a2] = A.checked_type.shape
-                [b0, b1, b2] = B.checked_type.shape
-
-        if isinstance(A, relay.Var) and isinstance(B, relay.Var):
-            if len(A.type_annotation.shape) == 3 and len(B.type_annotation.shape) == 3:
-                [a0, a1, a2] = A.type_annotation.shape
-                [b0, b1, b2] = B.type_annotation.shape
-
-        # Check if the batch size is same as expected tensor size
-        if (a0 != self.batch) or (b0 != self.batch):
-            return post
-
-        for i in range(self.batch):
-            # end=(x, pa1, pa2) attribute of strided_slice for Tensor A
-            pa1 = pre.args[0][i].args[0].args[0].args[0].args[0].attrs.end[1].value
-            pa2 = pre.args[0][i].args[0].args[0].args[0].args[0].attrs.end[2].value
-
-            # end=(x, pb1, pb2) attribute of strided_slice for Tensor B
-            pb1 = pre.args[0][i].args[0].args[0].args[1].args[0].args[0].attrs.end[1].value
-            pb2 = pre.args[0][i].args[0].args[0].args[1].args[0].args[0].attrs.end[2].value
-
-            if a1 != pa1 or a2 != pa2 or b1 != pb1 or b2 != pb2:
-                return post
-
-        d = [node_map[self.d[0][i]][0] for i in range(4)]
-        q = [node_map[self.q[0][i]][0] for i in range(4)]
-
-        out = relay.op.transpose(B, axes=[0, 2, 1])
-        out = relay.qnn.op.batch_matmul(A, out, d[0], d[1], d[2], d[3], out_dtype="int32")
-        out = relay.qnn.op.requantize(out, q[0], q[1], q[2], q[3], out_dtype="int8")
-        return out
-
-
-def rewrite_qdistilbert(mod):
-    """Rewrite the Quantized Distilbert to reduce computational complexity."""
-    mod["main"] = rewrite(qdistilbert_rewrite(), mod["main"])
-    return mod
-
-
-class remove_empty_pad_callback(DFPatternCallback):
-    """
-    A callback to remove empty pad op from the below pattern:
-    Pattern:
-    %0 = cast(0f, dtype="float16");
-    %1 = nn.pad(%inp, %0, pad_width=[[0i64, 0i64], [0i64, 0i64]]);
-    nn.matmul(%1, %inp2, units=None)
-
-    """
-
-    def __init__(self):
-        super(remove_empty_pad_callback, self).__init__()
-        self.A = wildcard()
-        self.B = wildcard()
-        self.a = is_op("nn.pad")(self.A, wildcard()).has_attr({"pad_width": ((0, 0), (0, 0))})
-        self.pattern = is_op("nn.matmul")(self.a, self.B)
-
-    def callback(self, pre, post, node_map):
-        A = node_map[self.A][0]
-        B = node_map[self.B][0]
-        return relay.nn.matmul(A, B)
-
-
-def remove_empty_pad(mod):
-    """Remove the empty pad operator."""
-    mod["main"] = rewrite(remove_empty_pad_callback(), mod["main"])
-    return mod
-
-
-class simplify_qnn_concat_in_func(DFPatternCallback):
-
-    """
-    Propagate qnn.concat's quantization params to its inputs,
-    and try to avoid redundant requantization while doing so.
-
-    Replace
-    def @main(%q1: Tensor[(1, 64, 35, 35), uint8],
-        %q2: Tensor[(1, 64, 35, 35), uint8], %q3: Tensor[(1, 32, 35, 35), uint8]) {
-        %0 = nn.max_pool2d(%q1, pool_size=[3, 3], padding=[1, 1, 1, 1], layout="NHWC");
-        %1 = qnn.requantize(%q2, 0.000109401f, 0, 0.00345f, 0, axis=1, out_dtype="uint8");
-        %2 = (%0, %1, %q3);
-        %3 = (0.0425042f, 0.00345f, 0.0486874f);
-        %4 = (0, 0, 0);
-        qnn.concatenate(%2, %3, %4, 0.0486874f, 0, axis=1)
-    }
-
-    with
-
-    def @main(%q1: Tensor[(1, 64, 35, 35), uint8],
-        %q2: Tensor[(1, 64, 35, 35), uint8], %q3: Tensor[(1, 32, 35, 35), uint8]) {
-        %0 = nn.max_pool2d(%q1, pool_size=[3, 3], padding=[1, 1, 1, 1], layout="NHWC");
-        %1 = qnn.requantize(%0, 0.0425042f, 0, 0.0486874f, 0, axis=1, out_dtype="uint8");
-        %2 = qnn.requantize(%q2, 0.000109401f, 0, 0.0486874f, 0, axis=1, out_dtype="uint8");
-        %3 = (%1, %2, %q3);
-        concatenate(%3, axis=1)
-    }
-    """
-
-    def __init__(self):
-        super(simplify_qnn_concat_in_func, self).__init__()
-        self.qvals = wildcard()
-        self.scales = wildcard()
-        self.zps = wildcard()
-        self.out_scale = wildcard()
-        self.out_zp = wildcard()
-        self.pattern = is_op("qnn.concatenate")(
-            self.qvals, self.scales, self.zps, self.out_scale, self.out_zp
-        )
-
-    def callback(self, pre, post, node_map):
-        in_qvals = node_map[self.qvals][0]
-        in_scales = node_map[self.scales][0]
-        in_zps = node_map[self.zps][0]
-        new_qvals = []
-        for i in range(len(in_qvals)):
-            new_requant_args = []
-            # TODO Generalize for all qnn ops
-            if isinstance(in_qvals[i], Call) and (in_qvals[i].op.name == "qnn.requantize"):
-                # propagate scale/zp of qnn.concat to this requantize op
-                for j in range(3):
-                    new_requant_args.append(in_qvals[i].args[j])
-                new_requant_args += [node_map[self.out_scale][0], node_map[self.out_zp][0]]
-                new_qvals.append(relay.qnn.op.requantize(*new_requant_args, **(in_qvals[i].attrs)))
-            else:
-                # simply create a new requantize op if there is a change in quantization params
-                # if not, just retain the old qval
-                if (in_scales[i] == node_map[self.out_scale][0]) and (
-                    in_zps[i] == node_map[self.out_zp][0]
-                ):
-                    new_qvals.append(in_qvals[i])
-                else:
-                    new_requant_args += [
-                        in_qvals[i],
-                        in_scales[i],
-                        in_zps[i],
-                        node_map[self.out_scale][0],
-                        node_map[self.out_zp][0],
-                    ]
-                    new_qvals.append(
-                        relay.qnn.op.requantize(
-                            *new_requant_args,
-                            axis=post.attrs["axis"],
-                            out_dtype=post.checked_type.dtype,
-                        )
-                    )
-
-        new_op = relay.op.concatenate(
-            new_qvals,
-            node_map[self.pattern][0].attrs["axis"],
-        )
-        return new_op
-
-
-# Right now context is ignored
-@tvm.transform.module_pass(opt_level=1)
-def simplify_qnn_concat(mod, _=None):
-    for global_var in mod.functions.keys():
-        mod[global_var] = rewrite(simplify_qnn_concat_in_func(), mod[global_var])
-    return mod
-
-
-class simplify_conv_pat_in_func(DFPatternCallback):
-
-    """
-    Simplify Mul->Sub->Conv->bias_add to Conv->bias_add->add sequence if
-    one of the inputs to Mul and Sub are constant scalars.
-
-    Replace
-    def @main(%q1: Tensor[(1, 128, 128, 3), float16])
-        %0 = multiply(%q1, c1_const_scalar)  /* ty=Tensor[(1, 128, 128, 3), float16] */;
-        %1 = subtract(%0, c2_const_scalar) /* ty=Tensor[(1, 128, 128, 3), float16] */
-        %2 = transpose(%1, axes=[0,3,1,2])
-            /* ty=Tensor[(1, 3, 128, 128), float16] */
-        %3 = nn.conv2d(%2, weights, ...) .
-        %4 = nn.bias_add(%3, bias)
-    }
-
-    with
-
-    def @main(%q1: Tensor[(1, 128, 128, 3), float16])
-        %0 = transpose(%q1, axes=[0, 3, 1, 2])
-            /* ty=Tensor[(1, 3, 128, 128), float16] */;
-        %1 = multiply(c1, weights) /* ty=Tensor[(64, 3, 3, 3), float16] */;
-        %2 = nn.conv2d(%0, %1, padding=[1, 1, 1, 1],
-            channels=64, kernel_size=[3, 3])
-            /* ty=Tensor[(1, 64, 128, 128), float16] */;
-        %3 = subtract(%0 shaped zero_tensor, c2)
-            /* ty=Tensor[(1, 3, 128, 128), float16] */;
-        %4 = nn.bias_add(%2, bias) /* ty=Tensor[(1, 64, 128, 128), float16] */;
-        %5 = nn.conv2d(%3, weights, padding=[1, 1, 1, 1],
-            channels=64, kernel_size=[3, 3])
-            /* ty=Tensor[(1, 64, 128, 128), float16] */;
-        add(%4, %5) /* ty=Tensor[(1, 64, 128, 128), float16] */
-
-    Why is it legal? Ignore the transpose in the above pattern.
-    res[p,q,r,s] = Conv(a*c1 - c2, W)
-                 = SUM{i=[0,c-1], j=[0,kh-1], k=[0,kw-1]}
-                    {(a[p,i,r+j,s+k] * c1 - c2) * W[q,i,j,k]}
-                 = SUM{i=[0,c-1], j=[0,kh-1], k=[0,kw-1]}
-                    {a[p,i,r+j,s+k] * c1 * W[q,i,j,k]} - c2 * W[q,i,j,k]}
-                 = Conv(a, W*c1) + Conv(0-c2, W)
-
-
-    }
-
-    In the above, %1, %3, %5 are constants and can be folded, so we're
-    left with 4 ops, as opposed to the original 5 ops
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.inp = wildcard()
-        self.mul = is_op("multiply")(self.inp, is_constant().has_shape(()))
-        self.sub = is_op("subtract")(self.mul, is_constant().has_shape(()))
-        self.act = is_op("transpose")(self.sub)
-        self.weights = is_constant()
-        self.conv2d_op = is_op("nn.conv2d")(self.act, self.weights)
-        self.pattern = is_op("nn.bias_add")(self.conv2d_op, is_constant())
-
-    def callback(self, pre, post, node_map):
-        new_transpose = relay.transpose((node_map[self.inp][0]), **((node_map[self.act][0]).attrs))
-        new_weights = relay.multiply((node_map[self.mul][0].args[1]), (node_map[self.weights][0]))
-        new_conv2d = relay.nn.conv2d(
-            new_transpose, new_weights, **((node_map[self.conv2d_op][0]).attrs)
-        )
-        new_bias_add = relay.nn.bias_add(new_conv2d, (node_map[self.pattern][0].args[1]))
-
-        zero_tensor = relay.Constant(
-            nd.array(
-                np.zeros(
-                    get_const_tuple((node_map[self.act][0]).checked_type.shape),
-                    dtype=(node_map[self.act][0]).checked_type.dtype,
-                )
-            )
-        )
-        negated = relay.subtract(zero_tensor, (node_map[self.sub][0].args[1]))
-        const_conv2d = relay.nn.conv2d(
-            negated, (node_map[self.weights][0]), **((node_map[self.conv2d_op][0]).attrs)
-        )
-        return relay.add(new_bias_add, const_conv2d)
-
-
-# Right now context is ignored
-@tvm.transform.module_pass(opt_level=1)
-def simplify_conv_pat(mod, _=None):
-    """top level function for conv pattern simplification"""
-    for global_var in mod.functions.keys():
-        mod[global_var] = rewrite(simplify_conv_pat_in_func(), mod[global_var])
-    return mod
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
deleted file mode 100644
index d6be16653c67..000000000000
--- a/python/tvm/contrib/pipeline_executor.py
+++ /dev/null
@@ -1,345 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Pipeline executor that executes a series of modules in a pipeline fashion."""
-import json
-import os
-import time
-from tvm import runtime
-from tvm._ffi import get_global_func
-from tvm.contrib import graph_executor
-
-
-def pipeline_executor_enabled():
-    """Check if the pipeline executor is enabled.
-
-    Return
-    -------
-    enable: bool
-        Return whether the pipeline executor is enabled.
-    """
-    return get_global_func("tvm.pipeline_executor.create", allow_missing=True) is not None
-
-
-class PipelineModule(object):
-    """Wrapper of runtime module, caller can use this module to set parameters and get outputs.
-
-    Parameters
-    ----------
-    module : Union[PipelineExecutorFactoryModule, Module]
-        Common interface for pipeline executor factory modules or Module.
-    """
-
-    def __init__(self, module):
-        if isinstance(module, PipelineExecutorFactoryModule):
-            self.module = module.get_pipeline_executor_module()
-        else:
-            self.module = module
-        # Get the packed functions from the pipeline executor.
-        self._get_params_group_pipeline_map = self.module["get_params_group_pipeline_map"]
-        self._run = self.module["run"]
-        self._set_param = self.module["set_param"]
-        self._set_input = self.module["set_input"]
-        self._get_input = self.module["get_input"]
-        self._get_output = self.module["get_output"]
-        self._get_num_outputs = self.module["get_num_outputs"]
-        self._get_num_inputs = self.module["get_num_inputs"]
-        self._get_input_pipeline_map = self.module["get_input_pipeline_map"]
-        self._get_pipe_execute_count = self.module["get_execute_count"]
-
-    def run(self):
-        """Run the pipeline executor."""
-        self._run()
-
-    def get_input_pipeline_map(self, name):
-        """Using the "name" to get the corresponding subgraph index and also get the "input name"
-        of the corresponding subgraph interface.
-        Returns
-        -------
-        input map: Array[str]
-            Returning the index and "input name" of the subgraph.
-        """
-        return self._get_input_pipeline_map(name)
-
-    def get_params_group_pipeline_map(self, name):
-        """Use the name of the parameters group to get the corresponding runtime module index.
-
-        Parameters
-        ----------
-        name: str
-            The parameter group name.
-
-        Returns
-        -------
-        module_index: int
-            The index of the runtime module.
-        """
-        return self._get_params_group_pipeline_map(name)
-
-    def set_input(self, key, value):
-        """Set the input via input name.
-
-        Parameters
-        ----------
-        key : str
-            The input name
-        value : array_like.
-            The input value
-        """
-        self._set_input(key, value)
-
-    def set_params(self, params_group_name, params_data):
-        """Set the parameter group value given the parameter group name. Note that the parameter
-        group name is declared in the pipeline executor config.
-
-        Parameters
-        ----------
-        params_group_name : str
-            The parameters group name.
-
-        params_data : Dict[str, NDArray]
-            A map from parameter name to data.
-        """
-        if not params_data:
-            raise RuntimeError('"params_data is empty!"')
-
-        for key, val in params_data.items():
-            self._set_param(params_group_name, key, val)
-
-    def get_input(self, key):
-        """Get the input via an input name.
-        Parameters
-        ----------
-        key : str
-            The input key
-        Returns
-        -------
-        data : NDArray
-            The input data.
-        """
-        return self._get_input(key)
-
-    def get_output(self, synchronize=True, sleep_interval=0.001):
-        """Get the output.
-        Returns
-        -------
-        data : Array[NDArray]
-            A list of output data.
-        synchronize : BOOL
-            Whether to do a synchronize poll.
-        sleep_interval : Float32
-            When doing the synchronize loop poll, how many seconds the loop should sleep for yield.
-        """
-        outputs = []
-        if not synchronize:
-            outputs = self._get_output()
-        else:
-            while not outputs:
-                outputs = self._get_output()
-                time.sleep(sleep_interval)
-
-        return outputs
-
-    @property
-    def num_executing_pipeline(self):
-        """Getting the count of running pipeline.
-        Returns
-        -------
-        count : int
-            The count of running pipeline.
-        """
-        return self._get_pipe_execute_count()
-
-    @property
-    def num_outputs(self):
-        """Get the number of outputs.
-        Returns
-        -------
-        count : int
-            The number of outputs.
-        """
-        return self._get_num_outputs()
-
-    @property
-    def num_inputs(self):
-        """Get the number of inputs
-        Returns
-        -------
-        count : int
-            The number of inputs
-        """
-        return self._get_num_inputs()
-
-    @staticmethod
-    def load_library(config_file_name):
-        """Import files to create a pipeline executor.
-
-        Parameters
-        ----------
-        config_file_name : str
-            Path and name of the configuration file, the configuration file contains the
-            disk path of the parameter file, library file, and JSON file.
-        """
-        with open(config_file_name, "r") as file_handle:
-            config = file_handle.read()
-        config = json.loads(config)
-        if "load_config" not in config or "pipeline_config" not in config:
-            raise RuntimeError(
-                f'"load_config" or "pipeline_config" is missing in {config_file_name}'
-            )
-
-        # The config file used to load library, prameters, and JSON files.
-        with open(config["load_config"], "r") as file_handle:
-            load_config = file_handle.read()
-
-        # The config file used to load pipeline compute config.
-        with open(config["pipeline_config"], "r") as file_handle:
-            pipeline_config = file_handle.read()
-
-        # Load a PipelineExecutor from the disk files.
-        load_library = get_global_func("tvm.pipeline_executor.load", allow_missing=False)
-        module = load_library(load_config, pipeline_config)
-
-        return PipelineModule(module)
-
-
-class PipelineExecutorFactoryModule(object):
-    """Common interface for pipeline executor factory modules.
-
-    Parameters
-    ----------
-    pipeline_mods : List[GraphExecutorFactoryModule]
-        List of GraphExecutorFactoryModule.
-
-    mod_config : Dict[int, Dict[str, Any]]
-        Modules dependency configuration information.
-
-    """
-
-    def __init__(self, pipeline_mods, mods_config):
-        self.pipeline_mods = pipeline_mods
-        self.mods_config = mods_config
-        self.module = None
-
-    def get_pipeline_executor_module(self):
-        """Get the pipeline executor module.
-
-        Returns
-        -------
-        module : Module
-            Common interface for pipeline executor factory Module.
-        """
-        if not self.module:
-            graph_executors, config = self.graph_executor_create(
-                self.pipeline_mods, self.mods_config
-            )
-            self.pipeline_create = get_global_func(
-                "tvm.pipeline_executor.create", allow_missing=False
-            )
-            self.module = self.pipeline_create(graph_executors, config)
-        return self.module
-
-    def graph_executor_create(self, pipeline_mods, mod_config):
-        """Create graph_executor list and return configuration as a json string.
-
-        Parameters
-        ----------
-        pipeline_mods : List[GraphExecutorFactoryModule]
-          List of GraphExecutorFactoryModule
-
-        mod_config : Dict[str, Any]
-            Modules dependency configuration information.
-
-        Returns
-        -------
-        mods : List[Module]
-            The Module list.
-
-        mod_config : str
-            The Modudle configuration.
-        """
-        # Should store modules in the list named 'mods' in index order.
-        mods = [None for _ in range(len(pipeline_mods))]
-        for lib_index in pipeline_mods:
-            pipeline_lib = pipeline_mods[lib_index]["lib"]
-            dev = pipeline_mods[lib_index]["dev"]
-            lib = graph_executor.GraphModule(pipeline_lib["default"](dev))
-            # Return a module list sorted by lib_index.
-            mods[lib_index] = lib.module
-
-        return mods, json.dumps(mod_config)
-
-    def export_library(self, directory_path):
-        """Export the pipeline executor into disk files.
-
-        Parameters
-        ----------
-        directory_path : str
-            Export the files to this directory.
-        """
-        if not self.pipeline_mods:
-            raise RuntimeError("The pipeline executor has not been initialized.")
-
-        # Check if the directory_path exists.
-        if not os.path.exists(directory_path):
-            raise RuntimeError("The directory {directory_path} does not exist.")
-        # Create an load configuration.
-        load_config_file_name = f"{directory_path}/load_config"
-        pipeline_config_file_name = f"{directory_path}/pipeline_config"
-        config = {}
-        config["load_config"] = load_config_file_name
-        config["pipeline_config"] = pipeline_config_file_name
-        load_config = []
-        # Export the library, JSON, and parameter into files, then export these files path
-        # into a configuration file.
-        for lib_index in self.pipeline_mods:
-            mconfig = {}
-            mconfig["mod_idx"] = lib_index
-            mconfig["lib_name"] = f"{directory_path}/lib{lib_index}.so"
-            mconfig["json_name"] = f"{directory_path}/json{lib_index}"
-            mconfig["params_name"] = f"{directory_path}/params{lib_index}"
-            mconfig["dev"] = (
-                f"{self.pipeline_mods[lib_index]['dev'].device_type},"
-                f"{self.pipeline_mods[lib_index]['dev'].device_id}"
-            )
-            # Get the graph, lib, and parameters from GraphExecutorFactoryModule.
-            lib = self.pipeline_mods[lib_index]["lib"]
-            # Export the lib, graph, and parameters to disk.
-            if self.pipeline_mods[lib_index]["export_cc"]:
-                lib.export_library(
-                    mconfig["lib_name"], cc=self.pipeline_mods[lib_index]["export_cc"]
-                )
-            else:
-                lib.export_library(mconfig["lib_name"])
-
-            with open(mconfig["json_name"], "w") as file_handle:
-                file_handle.write(lib.graph_json)
-            with open(mconfig["params_name"], "wb") as file_handle:
-                file_handle.write(runtime.save_param_dict(lib.params))
-
-            load_config.append(mconfig)
-
-        with open(load_config_file_name, "w") as file_handle:
-            json.dump(load_config, file_handle)
-
-        with open(pipeline_config_file_name, "w") as file_handle:
-            json.dump(self.mods_config, file_handle)
-
-        config_file_name = f"{directory_path}/config"
-        with open(config_file_name, "w") as file_handle:
-            json.dump(config, file_handle)
-
-        return config_file_name
diff --git a/python/tvm/contrib/pipeline_executor_build.py b/python/tvm/contrib/pipeline_executor_build.py
deleted file mode 100644
index 9a16d1b7afaa..000000000000
--- a/python/tvm/contrib/pipeline_executor_build.py
+++ /dev/null
@@ -1,674 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=f-string-without-interpolation
-"""Pipeline executor that executes a series of modules in a pipeline fashion."""
-import json
-import os
-import tvm._ffi
-from tvm import relay
-from tvm.relay.transform import InferType
-from tvm.contrib.pipeline_executor import PipelineExecutorFactoryModule
-
-
-def pipeline_executor_build_enabled():
-    """Check if the pipeline executor build is enabled.
-
-    Return
-    -------
-    enable: bool
-        Return whether the pipeline executor is enabled.
-    """
-    return tvm.contrib.pipeline_executor.pipeline_executor_enabled()
-
-
-def build(pipe_configs):
-    """Build modules used in the pipeline executor, then use these modules and configuration
-    to create a pipeline executor.
-
-    Parameters
-    ----------
-    pipe_configs: PipelineConfig
-        Build Configuration information.
-
-    Returns
-    -------
-    ret: PipelineExecutorFactoryModule
-        Common interface for pipeline executor factory modules.
-    """
-    libs = {}
-    config = pipe_configs.get_config()
-    if "module_connection" not in config:
-        raise RuntimeError('"module_connection" is missing')
-    if "input_connection" not in config:
-        raise RuntimeError('"input_connection" is missing')
-    if "param_connection" not in config:
-        raise RuntimeError('"param_connection" is missing')
-
-    mod_n_configs = config["module_connection"]
-    config_len = len(mod_n_configs)
-    module_string_config = [{} for _ in range(config_len)]
-    # Use hardware configurations to build backend modules for each subgraph.
-    for ir_mod, mod_config in mod_n_configs.items():
-        pipe_config = mod_config["pipeline"].copy()
-        mod_idx = pipe_config["mod_idx"]
-        dev = mod_config["dev"]
-        target = mod_config["target"]
-        build_func = relay.build
-        # Callers may need to use a customized building function to wrap the pre-building logic
-        # and the backend building logic. For example, in order to support a backend which only
-        # can do "int8" computation, the caller may need to merge the "quantization" logic
-        # into the building logic to creat a customized building function.
-        if "build" in mod_config and mod_config["build"]:
-            build_func = mod_config["build"]
-
-        lib = build_func(
-            ir_mod,
-            target,
-            params=mod_config["params"],
-            target_host=mod_config["target_host"],
-            mod_name=mod_config["mod_name"],
-        )
-
-        pipe_config["dev"] = f"{dev.device_type},{dev.device_id}"
-        # Use "mod_idx" as the key to create a "module_connection" map which is not only
-        # for the module index but also for the module connection used to build the pipeline.
-        module_string_config[mod_idx] = pipe_config
-        libs[mod_idx] = {
-            "lib": lib,
-            "dev": dev,
-            "fcompile": mod_config["fcompile"],
-            "export_cc": mod_config["export_cc"],
-        }
-
-    # Creating a text form configuration to record the "input_connection" and the
-    # "module_connection" information. The "input_connection" is used to record the
-    # map of global input and subgraph input, and the "module_connection" is used to
-    # record module dependency.
-    string_config = {}
-    string_config["param_connection"] = config["param_connection"]
-    string_config["input_connection"] = config["input_connection"]
-    string_config["module_connection"] = module_string_config
-
-    return PipelineExecutorFactoryModule(libs, string_config)
-
-
-def export_library(factory, directory_path):
-    """Export the pipeline executor into disk files.
-
-    Parameters
-    ----------
-    factory : PipelineExecutorFactoryModule
-        The pipeline executor factory
-    directory_path : str
-        Export the files to this directory.
-    """
-    if not factory.pipeline_mods:
-        raise RuntimeError("The pipeline executor has not been initialized.")
-
-    # Check if the directory_path exists.
-    if not directory_path or not os.path.exists(directory_path):
-        raise RuntimeError("The directory {directory_path} does not exist.")
-    # Create an load configuration.
-    load_config_file_name = f"{directory_path}/load_config"
-    pipeline_config_file_name = f"{directory_path}/pipeline_config"
-    config = {}
-    config["load_config"] = load_config_file_name
-    config["pipeline_config"] = pipeline_config_file_name
-    load_config = []
-    # Export the library, JSON, and parameter into files, then export these files path
-    # into a configuration file.
-    for lib_index in factory.pipeline_mods:
-        mconfig = {}
-        mconfig["mod_idx"] = lib_index
-        mconfig["lib_name"] = f"{directory_path}/lib{lib_index}.so"
-        mconfig["json_name"] = f"{directory_path}/json{lib_index}"
-        mconfig["params_name"] = f"{directory_path}/params{lib_index}"
-        lib_config = factory.pipeline_mods[lib_index]
-        mconfig["dev"] = f"{lib_config['dev'].device_type}," f"{lib_config['dev'].device_id}"
-        fcompile = lib_config["fcompile"]
-        if not fcompile:
-            fcompile = False
-
-        # Get the graph, lib, and parameters from GraphExecutorFactoryModule.
-        lib = factory.pipeline_mods[lib_index]["lib"]
-        # Export the lib, graph, and parameters to disk.
-        lib.export_library(mconfig["lib_name"], fcompile=fcompile)
-        with open(mconfig["json_name"], "w") as file_handle:
-            file_handle.write(lib.graph_json)
-        with open(mconfig["params_name"], "wb") as file_handle:
-            file_handle.write(relay.save_param_dict(lib.params))
-
-        load_config.append(mconfig)
-
-    with open(load_config_file_name, "w") as file_handle:
-        json.dump(load_config, file_handle)
-
-    with open(pipeline_config_file_name, "w") as file_handle:
-        json.dump(factory.mods_config, file_handle)
-
-    config_file_name = f"{directory_path}/config"
-    with open(config_file_name, "w") as file_handle:
-        json.dump(config, file_handle)
-
-    return config_file_name
-
-
-class PipelineConfig(object):
-    """Pipeline configuration information, this class contains the DAG that expresses
-    the dependency of each module involved in a pipeline and the parameters for building
-    each module.
-    """
-
-    class Binding:
-        """This class defines the module connections information.
-        The type can only be "input" or "output".
-
-        Parameters
-        ----------
-        owner : ModuleWrapper
-            The class who owns this interface.
-
-        io_type : str
-            The I/O type of this interface. It can only be "input" or "output".
-
-        name : str/integer
-            Name, for input it is string such as "data0", for output it is an integer such as 0.
-
-        data_type: TensorType
-            The data type of this interface.
-        """
-
-        def __init__(self, owner, io_type, name, data_type=None):
-            self.io_owner = owner
-            self.io_type = io_type
-            self.name = str(name)
-            # Child interfaces that depend on this interface.
-            self.bindings = []
-            # Parents interfaces that this interface depend on.
-            self.parents = []
-
-            self.data_type = data_type
-
-        def get_name(self):
-            # Return name of this interface and the name of owner who owns this interface.
-            owner_name = ""
-            if isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
-                owner_name = self.io_owner.name
-
-            return owner_name, self.name
-
-        def get_owner_idx(self):
-            # If the owner is ModuleWrapper return the owner index, if not return 0.
-            if isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
-                return self.io_owner.idx
-
-            return -1
-
-        def is_pipeline_executor_interface(self):
-            """The pipeline interface is used to interact with the caller. There are two types
-            of interfaces, one is 'input' another is 'output'. The pipeline input interface
-            is responsible for passing parameters to the internal module interface, and the
-            pipeline output interface is responsible for outputting the results computed by
-            the pipeline executor to the caller.
-            """
-            return not isinstance(self.io_owner, PipelineConfig.ModuleWrapper)
-
-        def __repr__(self):
-            # Geting the binding information in the form of text.
-            str_format = f"  |{self.name}: "
-            for binding in self.bindings:
-                mname, dname = binding.get_name()
-                str_format += f"{mname}:{dname} "
-
-            return str_format
-
-        def check_binding_dict(self, connection_dict):
-            """Checking the binding dictionary.
-            Parameter
-            ---------
-            connection_dict : Dict[str, Any]
-                It is a dictionary of module connections.
-            """
-            if "interface_name" not in connection_dict:
-                raise RuntimeError('"inteface_name" is missing in global config!"')
-            if "connection" not in connection_dict:
-                raise RuntimeError(f'"connection" is missing!"')
-            # The global interface mapping should be one-to-one.
-            if not connection_dict["connection"]:
-                raise RuntimeError("The global interface map is empty!")
-            if len(connection_dict["connection"]) > 1:
-                raise RuntimeError("A global interface maps multiple module interfaces!")
-            if "mod_idx" not in connection_dict["connection"][0]:
-                raise RuntimeError('"mod_idx" is missing!')
-
-        def get_binding_dict(self):
-            """Returning the binding information in the form of dictionary.
-            Returns
-            -------
-            data : Dict[str, Any]
-                The binding information is in the form of dictionary.
-            """
-            dict_format = {"interface_name": self.name, "connection": []}
-            for binding in self.bindings:
-                _, dname = binding.get_name()
-                midx = binding.get_owner_idx()
-                dict_format["connection"].append({"mod_idx": midx, "interface_name": dname})
-
-            self.check_binding_dict(dict_format)
-            return dict_format
-
-        def check_dag_acyclic(self, start, inputs):
-            """This is to check whether the DAG containing these input interfaces is acyclic.
-            Parameters
-            ----------
-            start: ModuleWrapper
-                The starting node of the cycle check algorithm.
-
-            inputs: Binding
-                These interfaces are used to connect to each other to build DAG.
-
-            Return
-            ------
-                Return true if there is no cycle in the DAG.
-            """
-            for binding in inputs.values():
-                if start == binding.io_owner:
-                    return False
-                for p in binding.parents:
-                    if not self.check_dag_acyclic(start, p.io_owner.input_bindings.bindings):
-                        return False
-
-            return True
-
-        def connect(self, binding):
-            """Connect the current interface to the destination interface.
-            Correct connections are as follows: 1. the pipeline input connected to a module input,
-            2. the module output connected to a pipeline output, 3. the module output connected to
-            a module input.
-
-            Parameters
-            ----------
-            binding: Binding
-                The destination of this connection.
-            """
-
-            # Check whether the binding setting is correct or not.
-            if self.io_owner == binding.io_owner:
-                raise RuntimeError("Can not bind itself.")
-
-            if self.io_type == "param" and not self.is_pipeline_executor_interface():
-                raise RuntimeError(
-                    'The "param" binding can only be used by a pipeline executor interface!'
-                )
-
-            if not self.is_pipeline_executor_interface() and self.io_type == "input":
-                raise RuntimeError("Module can only bind from output interface!")
-
-            if self.io_type == "param" and binding.io_type != "param":
-                raise RuntimeError(
-                    'A global "param" interface can only be bind with a module "param" interface!'
-                )
-
-            if (
-                not self.is_pipeline_executor_interface()
-                and not binding.is_pipeline_executor_interface()
-                and binding.io_type == "output"
-            ):
-                raise RuntimeError("Can not bind module output with another module output!")
-
-            if (
-                not self.is_pipeline_executor_interface()
-                and binding.is_pipeline_executor_interface()
-                and binding.io_type == "input"
-            ):
-                raise RuntimeError("Can not bind module output with pipeline input!")
-
-            if self.is_pipeline_executor_interface() and self.io_type == "output":
-                raise RuntimeError("Global output can not be used as binding start point.")
-
-            if (
-                self.is_pipeline_executor_interface()
-                and self.io_type == "input"
-                and binding.io_type != "input"
-            ):
-                raise RuntimeError("Global input can only bind with module input.")
-
-            self.bindings.append(binding)
-            if not self.is_pipeline_executor_interface():
-                # Check whether the data types of the source and destination are the same.
-                if (
-                    isinstance(binding.io_owner, PipelineConfig.ModuleWrapper)
-                    and self.data_type != binding.data_type
-                ):
-                    raise RuntimeError(
-                        f"Illegal type (%s vs. %s): binding type is not same!"
-                        % (self.data_type, binding.data_type)
-                    )
-
-                binding.parents.append(self)
-
-                # Do acyclic check after increasing the in-degree of child node by setting
-                # current interface as a parent of the child node.
-
-                if not self.check_dag_acyclic(
-                    binding.io_owner, self.io_owner.input_bindings.bindings
-                ):
-                    raise RuntimeError("Illegal connection: Cause a cycle!")
-
-    class BindingList:
-        """Container for bindings(input or output interface).
-
-        Parameters
-        ----------
-        owner : ModuleWrapper/PipelineConfig
-            The owner of this class can be ModuleWrapper or PipelineConfig.
-
-        io_type : str
-            The type of this class can be "input" or "output".
-        """
-
-        def __init__(self, owner, io_type):
-            self.bindings = {}
-            self.io_owner = owner
-            self.binding_type = io_type
-
-        def get_binding_data_type(self, key):
-            if isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
-                return self.io_owner.get_data_type(key, self.binding_type)
-            return None
-
-        def __getitem__(self, key):
-            if key not in self.bindings:
-                data_type = self.get_binding_data_type(key)
-                if not data_type and isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
-                    raise RuntimeError(f"Can not find {key} in binding list {self.binding_type}.")
-
-                self.bindings[key] = PipelineConfig.Binding(
-                    self.io_owner, self.binding_type, key, data_type
-                )
-
-            return self.bindings[key]
-
-    class ModuleWrapper:
-        """This class is a wrapper representing the module and contains information such as
-        module information, binding information and building information.
-        """
-
-        def __init__(self, mod=None):
-            self.target_host = None
-            self.build_func = None
-            self.params = None
-            self.target = None
-            self.fcompile = None
-            self.name = None
-            self.dev = None
-            self.export_cc = None
-            self.cpu_affinity = ""
-            self.idx = None
-            self.mod = mod
-            self.input_params = InferType()(mod)["main"].params
-            self.output_type = InferType()(mod)["main"].checked_type.ret_type
-            self.input_bindings = PipelineConfig.BindingList(self, "input")
-            self.output_bindings = PipelineConfig.BindingList(self, "output")
-            self.param_binding = PipelineConfig.Binding(self, "param", "param")
-
-        def __eq__(self, other):
-            if isinstance(other, PipelineConfig.ModuleWrapper):
-                return self.mod == other.mod
-
-            return False
-
-        def __getitem__(self, key):
-            if isinstance(key, str):
-                if key == "input":
-                    return self.input_bindings
-
-                if key == "output":
-                    return self.output_bindings
-
-                if key == "param":
-                    return self.param_binding
-
-                raise RuntimeError(f"{key} not found!")
-
-            raise RuntimeError('The data type of "key" is not supported!')
-
-        def get_data_type(self, key, interface_type):
-            """Get the module interface data type according to the key value and interface type.
-            Parameters
-            ----------
-            key: str
-                The interface name.
-
-            interface_type:
-                The interface type.
-
-            Return
-            -------
-                Return data type.
-            """
-            if interface_type == "input":
-                for param in self.input_params:
-                    if param.name_hint == key:
-                        return param._checked_type_
-
-            if interface_type == "output":
-                if isinstance(self.output_type, tvm.ir.type.TupleType):
-                    if int(key) < len(self.output_type.fields):
-                        return self.output_type.fields[int(key)]
-                elif int(key) == 0:
-                    return self.output_type
-
-            return None
-
-        def set_idx_name(self, idx):
-            # Set the index value and generate the module name.
-            self.idx = idx
-            self.name = f"mod{str(idx)}"
-
-        def is_root_mod(self):
-            """Check whether this node is the root node in DAG, this function is used
-            in topological sort.
-            """
-            return all([not b.parents for b in self.input_bindings.bindings.values()])
-
-        def remove_self_from_bindings(self):
-            """Remove the current node from child dependencies to reduce the in-degree
-            of child node, this function is used in topological sort.
-            """
-            for binding in self.output_bindings.bindings.values():
-                for child in binding.bindings:
-                    if binding in child.parents:
-                        child.parents.remove(binding)
-
-    def __init__(self):
-        self.mod_wrapper = {}
-        self.input_bindings = self.BindingList(self, "input")
-        self.output_bindings = self.BindingList(self, "output")
-        # There is a map of global parameters group and module index.
-        self.param_group_bindings = self.BindingList(self, "param")
-
-    def __str__(self):
-        # Get configuration information as a string.
-
-        # Use topological sort to get correct module order.
-        self.dag_topology_sort()
-        # Getting the parameters dependencies.
-        param_dump = "Params\n"
-        for param_name in self.param_group_bindings.bindings:
-            inf = self.param_group_bindings.bindings[param_name]
-            param_dump += str(inf) + "\n"
-        # Get the input dependencies.
-        input_dump = "\nInputs\n"
-        for input_name in self.input_bindings.bindings:
-            inf = self.input_bindings.bindings[input_name]
-            input_dump += str(inf) + "\n"
-
-        # Get the connections information of each module.
-        output = {}
-        connections_dump = "\nconnections\n"
-        for mod in self.mod_wrapper:
-            for interface in self.mod_wrapper[mod].output_bindings.bindings.values():
-                if interface.bindings:
-                    mname, dname = interface.get_name()
-                    iname = mname + ".output(" + dname + ")->"
-                    for dep in interface.bindings:
-                        dep_mname, dep_dname = dep.get_name()
-                        if isinstance(dep.io_owner, PipelineConfig.ModuleWrapper):
-                            iname += f" {dep_mname}.{dep_dname}"
-                            connections_dump += f"  |{iname}\n"
-                        else:
-                            output[dep_dname] = f"{mname}.output({dname})"
-
-        # Get the output dependencies.
-        output_dump = "\noutput\n"
-        for name in sorted(output.keys()):
-            output_dump += f"  |output({name}) : {output[name]}\n"
-
-        return param_dump + input_dump + output_dump + connections_dump
-
-    def __getitem__(self, key):
-        if isinstance(key, tvm.ir.module.IRModule):
-            if key not in self.mod_wrapper:
-                self.mod_wrapper[key] = self.ModuleWrapper(key)
-            return self.mod_wrapper[key]
-
-        if isinstance(key, str):
-            if key == "input":
-                return self.input_bindings
-            if key == "output":
-                return self.output_bindings
-            if key == "param_group":
-                return self.param_group_bindings
-
-            raise RuntimeError(f"{key} not found!")
-
-        raise RuntimeError(f'The key type "{type(key)}" is not supported!')
-
-    def get_config(self):
-        """Get the configuration information in dictionary form, this configuration
-        will be used to create pipeline executor.
-        """
-
-        # Use topological sort to get the correct order of modules.
-        self.dag_topology_sort()
-        mconfig = {}
-        module_connection = {}
-        for mod in self.mod_wrapper:
-            # Generate pipeline configuration.
-            mconf = {}
-            output_conf = []
-            module = self.mod_wrapper[mod]
-            for _, binding in module.output_bindings.bindings.items():
-                dep_conf = []
-                output = {}
-                if binding.bindings:
-                    for dep in binding.bindings:
-                        dep_item = {}
-                        _, dname = dep.get_name()
-                        if dep.is_pipeline_executor_interface():
-                            dep_item["global_output_index"] = int(dname)
-                        else:
-                            dep_item["mod_idx"] = dep.get_owner_idx()
-                            dep_item["input_name"] = dname
-                        dep_conf.append(dep_item)
-
-                # The value of output_idx start from 0.
-                output["output_idx"] = int(binding.name)
-                output["dependencies"] = dep_conf
-                output_conf.append(output)
-
-            mconf["mod_idx"] = module.idx
-            mconf["cpu_affinity"] = module.cpu_affinity
-            mconf["output"] = output_conf
-
-            module_connection[mod] = {
-                "pipeline": mconf,
-                "target_host": module.target_host,
-                "mod_name": "default",
-                "build": module.build_func,
-                "params": module.params,
-                "target": module.target,
-                "fcompile": module.fcompile,
-                "dev": module.dev,
-                "export_cc": module.export_cc,
-            }
-
-        # Creating a map including pipeline inputs and subgraph inputs.
-        input_connection = []
-        for input_name in self.input_bindings.bindings:
-            input_dict = self.input_bindings.bindings[input_name].get_binding_dict()
-            if "interface_name" not in input_dict["connection"][0]:
-                raise RuntimeError("interface_name is missing in connection config!")
-            # Creating the map including global interfaces and subgraph interfaces.
-            input_map = {
-                "global_interface_name": input_dict["interface_name"],
-                "mod_idx": input_dict["connection"][0]["mod_idx"],
-                "module_interface_name": input_dict["connection"][0]["interface_name"],
-            }
-            input_connection.append(input_map)
-
-        # Create a map including global parameters groups and modules.
-        param_connection = []
-        for param_name in self.param_group_bindings.bindings:
-            param_dict = self.param_group_bindings.bindings[param_name].get_binding_dict()
-            param_map = {
-                "global_param_name": param_dict["interface_name"],
-                "mod_idx": param_dict["connection"][0]["mod_idx"],
-            }
-            param_connection.append(param_map)
-
-        mconfig["module_connection"] = module_connection
-        mconfig["input_connection"] = input_connection
-        mconfig["param_connection"] = param_connection
-        return mconfig
-
-    def dag_topology_sort(self):
-        """Use topological sort to get order of pipeline modules."""
-        mlist = []
-        mod_wrapper = self.mod_wrapper.copy()
-        while mod_wrapper:
-            temp_list = []
-            for mod, wrapper in mod_wrapper.items():
-                if wrapper.is_root_mod():
-                    temp_list.append(mod)
-                    wrapper.remove_self_from_bindings()
-
-            for mod in temp_list:
-                mod_wrapper.pop(mod, None)
-
-            mlist += temp_list
-
-        mod_wrapper_sort = {}
-        for mod, i in zip(mlist, range(len(mlist))):
-            self.mod_wrapper[mod].set_idx_name(i)
-            mod_wrapper_sort[mod] = self.mod_wrapper[mod]
-
-        self.mod_wrapper = mod_wrapper_sort
-
-    def get_mod_idx(self, mod):
-        # Return the module index.
-        idx = self.mod_wrapper[mod].idx
-        return idx
-
-    def pipe_input(self, name):
-        # Return the input interface according to the name.
-        return self.input_bindings[name]
-
-    def pipe_output(self, idx):
-        # Return the output interface according to the name.
-        return self.output_bindings[idx]
diff --git a/python/tvm/contrib/relay_viz/__init__.py b/python/tvm/contrib/relay_viz/__init__.py
deleted file mode 100644
index fb4dac226d57..000000000000
--- a/python/tvm/contrib/relay_viz/__init__.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Relay IR Visualizer"""
-from typing import Dict
-import tvm
-from tvm import relay
-from .interface import (
-    Plotter,
-    VizGraph,
-    VizParser,
-)
-from .terminal import (
-    TermPlotter,
-    TermVizParser,
-)
-from .dot import (
-    DotPlotter,
-    DotVizParser,
-)
-
-
-class RelayVisualizer:
-    """Relay IR Visualizer
-
-    Parameters
-    ----------
-    relay_mod: tvm.IRModule
-        Relay IR module.
-    relay_param: None | Dict[str, tvm.runtime.NDArray]
-        Relay parameter dictionary. Default `None`.
-    plotter: Plotter
-        An instance of class inheriting from Plotter interface.
-        Default is an instance of `terminal.TermPlotter`.
-    parser: VizParser
-        An instance of class inheriting from VizParser interface.
-        Default is an instance of `terminal.TermVizParser`.
-    """
-
-    def __init__(
-        self,
-        relay_mod: tvm.IRModule,
-        relay_param: Dict[str, tvm.runtime.NDArray] = None,
-        plotter: Plotter = None,
-        parser: VizParser = None,
-    ):
-        self._plotter = plotter if plotter is not None else TermPlotter()
-        self._relay_param = relay_param if relay_param is not None else {}
-        self._parser = parser if parser is not None else TermVizParser()
-
-        global_vars = relay_mod.get_global_vars()
-        graph_names = []
-        # If we have main function, put it to the first.
-        # Then main function can be shown on the top.
-        for gv_node in global_vars:
-            if gv_node.name_hint == "main":
-                graph_names.insert(0, gv_node.name_hint)
-            else:
-                graph_names.append(gv_node.name_hint)
-
-        node_to_id = {}
-        # callback to generate an unique string-ID for nodes.
-        # node_count_offset ensure each node ID is still unique across subgraph.
-        node_count_offset = 0
-
-        def traverse_expr(node):
-            if node in node_to_id:
-                return
-            node_to_id[node] = str(len(node_to_id) + node_count_offset)
-
-        for name in graph_names:
-            node_count_offset += len(node_to_id)
-            node_to_id.clear()
-            relay.analysis.post_order_visit(relay_mod[name], traverse_expr)
-            graph = self._plotter.create_graph(name)
-            self._add_nodes(graph, node_to_id)
-
-    def _add_nodes(self, graph: VizGraph, node_to_id: Dict[relay.Expr, str]):
-        """add nodes and to the graph.
-
-        Parameters
-        ----------
-        graph : VizGraph
-            a VizGraph for nodes to be added to.
-
-        node_to_id : Dict[relay.expr, str]
-            a mapping from nodes to an unique ID.
-
-        relay_param : Dict[str, tvm.runtime.NDarray]
-            relay parameter dictionary.
-        """
-        for node in node_to_id:
-            viz_node, viz_edges = self._parser.get_node_edges(node, self._relay_param, node_to_id)
-            if viz_node is not None:
-                graph.node(viz_node)
-            for edge in viz_edges:
-                graph.edge(edge)
-
-    def render(self, filename: str = None) -> None:
-        self._plotter.render(filename=filename)
diff --git a/python/tvm/contrib/relay_viz/dot.py b/python/tvm/contrib/relay_viz/dot.py
deleted file mode 100644
index a9e98189a85a..000000000000
--- a/python/tvm/contrib/relay_viz/dot.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Visualize Relay IR by Graphviz DOT language."""
-
-from typing import (
-    Any,
-    Callable,
-    Dict,
-)
-from .interface import (
-    DefaultVizParser,
-    Plotter,
-    VizEdge,
-    VizGraph,
-    VizNode,
-)
-
-try:
-    import graphviz
-except ImportError:
-    # add "from None" to silence
-    # "During handling of the above exception, another exception occurred"
-    raise ImportError(
-        "The graphviz package is required for DOT renderer. "
-        "Please install it first. For example, pip3 install graphviz"
-    ) from None
-
-DotVizParser = DefaultVizParser
-
-
-class DotGraph(VizGraph):
-    """DOT graph for relay IR.
-
-    See also :py:class:`tvm.contrib.relay_viz.dot.DotPlotter`
-
-    Parameters
-    ----------
-    name: str
-        name of this graph.
-    graph_attr: Optional[Dict[str, str]]
-        key-value pairs for the graph.
-    node_attr: Optional[Dict[str, str]]
-        key-value pairs for all nodes.
-    edge_attr: Optional[Dict[str, str]]
-        key-value pairs for all edges.
-    get_node_attr: Optional[Callable[[VizNode], Dict[str, str]]]
-        A callable returning attributes for the node.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        graph_attr: Dict[str, str] = None,
-        node_attr: Dict[str, str] = None,
-        edge_attr: Dict[str, str] = None,
-        get_node_attr: Callable[[VizNode], Dict[str, str]] = None,
-    ):
-        self._name = name
-        self._get_node_attr = self._default_get_node_attr
-        if get_node_attr is not None:
-            self._get_node_attr = get_node_attr
-
-        # graphviz recognizes the subgraph as a cluster subgraph
-        # by the name starting with "cluster" (all lowercase)
-        self._digraph = graphviz.Digraph(
-            name=f"cluster_{self._name}",
-            graph_attr=graph_attr,
-            node_attr=node_attr,
-            edge_attr=edge_attr,
-        )
-        self._digraph.attr(label=self._name)
-
-    def node(self, viz_node: VizNode) -> None:
-        """Add a node to the underlying graph.
-        Nodes in a Relay IR Module are expected to be added in the post-order.
-
-        Parameters
-        ----------
-        viz_node : VizNode
-            A `VizNode` instance.
-        """
-        self._digraph.node(
-            viz_node.identity,
-            f"{viz_node.type_name}\n{viz_node.detail}",
-            **self._get_node_attr(viz_node),
-        )
-
-    def edge(self, viz_edge: VizEdge) -> None:
-        """Add an edge to the underlying graph.
-
-        Parameters
-        ----------
-        viz_edge : VizEdge
-            A `VizEdge` instance.
-        """
-        self._digraph.edge(viz_edge.start, viz_edge.end)
-
-    @property
-    def digraph(self):
-        return self._digraph
-
-    @staticmethod
-    def _default_get_node_attr(node: VizNode):
-        if "Var" in node.type_name:
-            return {"shape": "ellipse"}
-        return {"shape": "box"}
-
-
-class DotPlotter(Plotter):
-    """DOT language graph plotter
-
-    The plotter accepts various graphviz attributes for graphs, nodes, and edges.
-    Please refer to https://graphviz.org/doc/info/attrs.html for available attributes.
-
-    Parameters
-    ----------
-    graph_attr: Optional[Dict[str, str]]
-        key-value pairs for all graphs.
-    node_attr: Optional[Dict[str, str]]
-        key-value pairs for all nodes.
-    edge_attr: Optional[Dict[str, str]]
-        key-value pairs for all edges.
-    get_node_attr: Optional[Callable[[VizNode], Dict[str, str]]]
-        A callable returning attributes for a specific node.
-    render_kwargs: Optional[Dict[str, Any]]
-        keyword arguments directly passed to `graphviz.Digraph.render()`.
-
-    Examples
-    --------
-
-    .. code-block:: python
-
-        from tvm.contrib import relay_viz
-        from tvm.relay.testing import resnet
-
-        mod, param = resnet.get_workload(num_layers=18)
-        # graphviz attributes
-        graph_attr = {"color": "red"}
-        node_attr = {"color": "blue"}
-        edge_attr = {"color": "black"}
-
-        # VizNode is passed to the callback.
-        # We want to color NCHW conv2d nodes. Also give Var a different shape.
-        def get_node_attr(node):
-            if "nn.conv2d" in node.type_name and "NCHW" in node.detail:
-                return {
-                    "fillcolor": "green",
-                    "style": "filled",
-                    "shape": "box",
-                }
-            if "Var" in node.type_name:
-                return {"shape": "ellipse"}
-            return {"shape": "box"}
-
-        # Create plotter and pass it to viz. Then render the graph.
-        dot_plotter = relay_viz.DotPlotter(
-            graph_attr=graph_attr,
-            node_attr=node_attr,
-            edge_attr=edge_attr,
-            get_node_attr=get_node_attr)
-
-        viz = relay_viz.RelayVisualizer(
-            mod,
-            relay_param=param,
-            plotter=dot_plotter,
-            parser=relay_viz.DotVizParser())
-        viz.render("hello")
-    """
-
-    def __init__(
-        self,
-        graph_attr: Dict[str, str] = None,
-        node_attr: Dict[str, str] = None,
-        edge_attr: Dict[str, str] = None,
-        get_node_attr: Callable[[VizNode], Dict[str, str]] = None,
-        render_kwargs: Dict[str, Any] = None,
-    ):
-        self._name_to_graph = {}
-        self._graph_attr = graph_attr
-        self._node_attr = node_attr
-        self._edge_attr = edge_attr
-        self._get_node_attr = get_node_attr
-
-        self._render_kwargs = {} if render_kwargs is None else render_kwargs
-
-    def create_graph(self, name):
-        self._name_to_graph[name] = DotGraph(
-            name, self._graph_attr, self._node_attr, self._edge_attr, self._get_node_attr
-        )
-        return self._name_to_graph[name]
-
-    def render(self, filename: str = None):
-        """render the graph generated from the Relay IR module.
-
-        This function is a thin wrapper of `graphviz.Digraph.render()`.
-        """
-        # Create or update the filename
-        if filename is not None:
-            self._render_kwargs["filename"] = filename
-        # default cleanup
-        if "cleanup" not in self._render_kwargs:
-            self._render_kwargs["cleanup"] = True
-
-        root_graph = graphviz.Digraph()
-        for graph in self._name_to_graph.values():
-            root_graph.subgraph(graph.digraph)
-        root_graph.render(**self._render_kwargs)
diff --git a/python/tvm/contrib/relay_viz/interface.py b/python/tvm/contrib/relay_viz/interface.py
deleted file mode 100644
index 8df188fcf42e..000000000000
--- a/python/tvm/contrib/relay_viz/interface.py
+++ /dev/null
@@ -1,326 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Abstract class used by :py:class:`tvm.contrib.relay_viz.RelayVisualizer`."""
-import abc
-from typing import (
-    Dict,
-    Union,
-    Tuple,
-    List,
-)
-
-import tvm
-from tvm import relay
-
-UNKNOWN_TYPE = "unknown"
-
-
-class VizNode:
-    """VizNode carry node information for `VizGraph` interface.
-
-    Parameters
-    ----------
-    node_id: str
-        Unique identifier for this node.
-    node_type: str
-        Type of this node.
-    node_detail: str
-        Any supplement for this node such as attributes.
-    """
-
-    def __init__(self, node_id: str, node_type: str, node_detail: str):
-        self._id = node_id
-        self._type = node_type
-        self._detail = node_detail
-
-    @property
-    def identity(self) -> str:
-        return self._id
-
-    @property
-    def type_name(self) -> str:
-        return self._type
-
-    @property
-    def detail(self) -> str:
-        return self._detail
-
-    def __repr__(self) -> str:
-        detail = self._detail.replace("\n", ", ")
-        return f"VizNode(identity: {self._id}, type_name: {self._type}, detail: {detail}"
-
-
-class VizEdge:
-    """VizEdge connect two `VizNode`.
-
-    Parameters
-    ----------
-    start_node: str
-        The identifier of the node starting the edge.
-    end_node: str
-        The identifier of the node ending the edge.
-    """
-
-    def __init__(self, start_node: str, end_node: str):
-        self._start_node = start_node
-        self._end_node = end_node
-
-    @property
-    def start(self) -> str:
-        return self._start_node
-
-    @property
-    def end(self) -> str:
-        return self._end_node
-
-
-class VizParser(abc.ABC):
-    """VizParser parses out a VizNode and VizEdges from a `relay.Expr`."""
-
-    @abc.abstractmethod
-    def get_node_edges(
-        self,
-        node: relay.Expr,
-        relay_param: Dict[str, tvm.runtime.NDArray],
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        """Get VizNode and VizEdges for a `relay.Expr`.
-
-        Parameters
-        ----------
-        node : relay.Expr
-            relay.Expr which will be parsed and generate a node and edges.
-
-        relay_param: Dict[str, tvm.runtime.NDArray]
-            relay parameters dictionary.
-
-        node_to_id : Dict[relay.Expr, str]
-            This is a mapping from relay.Expr to a unique id, generated by `RelayVisualizer`.
-
-        Returns
-        -------
-        rv1 : Union[VizNode, None]
-            VizNode represent the relay.Expr. If the relay.Expr is not intended to introduce a node
-            to the graph, return None.
-
-        rv2 : List[VizEdge]
-            a list of VizEdges to describe the connectivity of the relay.Expr.
-            Can be empty list to indicate no connectivity.
-        """
-
-
-class VizGraph(abc.ABC):
-    """Abstract class for graph, which is composed of nodes and edges."""
-
-    @abc.abstractmethod
-    def node(self, viz_node: VizNode) -> None:
-        """Add a node to the underlying graph.
-        Nodes in a Relay IR Module are expected to be added in the post-order.
-
-        Parameters
-        ----------
-        viz_node : VizNode
-            A `VizNode` instance.
-        """
-
-    @abc.abstractmethod
-    def edge(self, viz_edge: VizEdge) -> None:
-        """Add an edge to the underlying graph.
-
-        Parameters
-        ----------
-        viz_edge : VizEdge
-            A `VizEdge` instance.
-        """
-
-
-class DefaultVizParser(VizParser):
-    """DefaultVizParser provde a set of logics to parse a various relay types.
-    These logics are inspired and heavily based on
-    `visualize` function in https://tvm.apache.org/2020/07/14/bert-pytorch-tvm
-    """
-
-    def get_node_edges(
-        self,
-        node: relay.Expr,
-        relay_param: Dict[str, tvm.runtime.NDArray],
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        if isinstance(node, relay.Function):
-            return self._function(node, node_to_id)
-        if isinstance(node, relay.expr.Call):
-            return self._call(node, node_to_id)
-        if isinstance(node, relay.expr.Var):
-            return self._var(node, relay_param, node_to_id)
-        if isinstance(node, relay.expr.Tuple):
-            return self._tuple(node, node_to_id)
-        if isinstance(node, relay.expr.TupleGetItem):
-            return self._tuple_get_item(node, node_to_id)
-        if isinstance(node, relay.expr.Constant):
-            return self._constant(node, node_to_id)
-        # GlobalVar possibly mean another global relay function,
-        # which is expected to in "Graph" level, not in "Node" level.
-        if isinstance(node, (relay.expr.GlobalVar, tvm.ir.Op)):
-            return None, []
-
-        viz_node = VizNode(node_to_id[node], UNKNOWN_TYPE, f"don't know how to parse {type(node)}")
-        viz_edges = []
-        return viz_node, viz_edges
-
-    def _var(
-        self,
-        node: relay.Expr,
-        relay_param: Dict[str, tvm.runtime.NDArray],
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        """Render rule for a relay var node"""
-
-        node_id = node_to_id[node]
-        name_hint = node.name_hint
-        node_detail = f"name_hint: {name_hint}"
-        node_type = "Var(Param)" if name_hint in relay_param else "Var(Input)"
-
-        if node.type_annotation is not None:
-            if hasattr(node.type_annotation, "shape"):
-                shape = tuple(map(int, node.type_annotation.shape))
-                dtype = node.type_annotation.dtype
-                node_detail = f"{node_detail}\nshape: {shape}\ndtype: {dtype}"
-            else:
-                node_detail = f"{node_detail}\ntype_annotation: {node.type_annotation}"
-
-        # only node
-        viz_node = VizNode(node_id, node_type, node_detail)
-        viz_edges = []
-        return viz_node, viz_edges
-
-    def _function(
-        self,
-        node: relay.Expr,
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        """Render rule for a relay function node"""
-        func_attrs = node.attrs
-        node_details = [f"{k}: {func_attrs.get_str(k)}" for k in func_attrs.keys()]
-        # "Composite" might from relay.transform.MergeComposite
-        if "Composite" in func_attrs.keys():
-            name = func_attrs["Composite"]
-        else:
-            name = ""
-
-        node_id = node_to_id[node]
-
-        # Body -> FunctionNode
-        viz_node = VizNode(node_id, f"Func {name}", "\n".join(node_details))
-        viz_edges = [VizEdge(node_to_id[node.body], node_id)]
-        return viz_node, viz_edges
-
-    def _call(
-        self,
-        node: relay.Expr,
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        """Render rule for a relay call node"""
-        node_id = node_to_id[node]
-        op_name = UNKNOWN_TYPE
-        node_detail = []
-        if isinstance(node.op, tvm.ir.Op):
-            op_name = node.op.name
-            if node.attrs:
-                node_detail = [f"{k}: {node.attrs.get_str(k)}" for k in node.attrs.keys()]
-        elif isinstance(node.op, relay.Function):
-            func_attrs = node.op.attrs
-            op_name = "Anonymous Func"
-            node_detail = [f"{k}: {func_attrs.get_str(k)}" for k in func_attrs.keys()]
-            # "Composite" might from relay.transform.MergeComposite
-            if "Composite" in func_attrs.keys():
-                op_name = func_attrs["Composite"]
-        elif isinstance(node.op, relay.GlobalVar):
-            op_name = "GlobalVar"
-            node_detail = [f"GlobalVar.name_hint: {node.op.name_hint}"]
-        else:
-            op_name = str(type(node.op)).split(".")[-1].split("'")[0]
-
-        # Arguments -> CallNode
-        viz_node = VizNode(node_id, f"Call {op_name}", "\n".join(node_detail))
-        args = [node_to_id[arg] for arg in node.args]
-        viz_edges = [VizEdge(arg, node_id) for arg in args]
-        return viz_node, viz_edges
-
-    def _tuple(
-        self,
-        node: relay.Expr,
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        node_id = node_to_id[node]
-
-        # Fields -> TupleNode
-        viz_node = VizNode(node_id, "Tuple", "")
-        viz_edges = [VizEdge(node_to_id[field], node_id) for field in node.fields]
-        return viz_node, viz_edges
-
-    def _tuple_get_item(
-        self,
-        node: relay.Expr,
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        node_id = node_to_id[node]
-
-        # Tuple -> TupleGetItemNode
-        viz_node = VizNode(node_id, "TupleGetItem", f"idx: {node.index}")
-        viz_edges = [VizEdge(node_to_id[node.tuple_value], node_id)]
-        return viz_node, viz_edges
-
-    def _constant(
-        self,
-        node: relay.Expr,
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        node_id = node_to_id[node]
-        node_detail = f"shape: {node.data.shape}, dtype: {node.data.dtype}"
-
-        # only node
-        viz_node = VizNode(node_id, "Const", node_detail)
-        viz_edges = []
-        return viz_node, viz_edges
-
-
-class Plotter(abc.ABC):
-    """Plotter can render a collection of Graph interfaces to a file."""
-
-    @abc.abstractmethod
-    def create_graph(self, name: str) -> VizGraph:
-        """Create a VizGraph
-
-        Parameters
-        ----------
-        name : str
-            the name of the graph
-
-        Return
-        ------
-        rv1: an instance of class inheriting from VizGraph interface.
-        """
-
-    @abc.abstractmethod
-    def render(self, filename: str) -> None:
-        """Render the graph as a file.
-
-        Parameters
-        ----------
-        filename : str
-            see the definition of implemented class.
-        """
diff --git a/python/tvm/contrib/relay_viz/terminal.py b/python/tvm/contrib/relay_viz/terminal.py
deleted file mode 100644
index f137bbf9d41c..000000000000
--- a/python/tvm/contrib/relay_viz/terminal.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Visualize Relay IR in AST text-form."""
-
-from collections import deque
-from typing import (
-    Dict,
-    Union,
-    Tuple,
-    List,
-)
-import tvm
-from tvm import relay
-from .interface import (
-    DefaultVizParser,
-    Plotter,
-    VizEdge,
-    VizGraph,
-    VizNode,
-    VizParser,
-)
-
-
-class TermVizParser(VizParser):
-    """`TermVizParser` parse nodes and edges for `TermPlotter`."""
-
-    def __init__(self):
-        self._default_parser = DefaultVizParser()
-
-    def get_node_edges(
-        self,
-        node: relay.Expr,
-        relay_param: Dict[str, tvm.runtime.NDArray],
-        node_to_id: Dict[relay.Expr, str],
-    ) -> Tuple[Union[VizNode, None], List[VizEdge]]:
-        """Parse a node and edges from a relay.Expr."""
-        if isinstance(node, relay.Call):
-            return self._call(node, node_to_id)
-        if isinstance(node, relay.Let):
-            return self._let(node, node_to_id)
-        if isinstance(node, relay.GlobalVar):
-            return self._global_var(node, node_to_id)
-        if isinstance(node, relay.If):
-            return self._if(node, node_to_id)
-        if isinstance(node, tvm.ir.Op):
-            return self._op(node, node_to_id)
-        if isinstance(node, relay.Function):
-            return self._function(node, node_to_id)
-
-        # Leverage logics from default parser.
-        return self._default_parser.get_node_edges(node, relay_param, node_to_id)
-
-    def _call(self, node, node_to_id):
-        node_id = node_to_id[node]
-        viz_node = VizNode(node_id, "Call", "")
-        viz_edges = [VizEdge(node_to_id[node.op], node_id)]
-        for arg in node.args:
-            arg_id = node_to_id[arg]
-            viz_edges.append(VizEdge(arg_id, node_id))
-        return viz_node, viz_edges
-
-    def _let(self, node, node_to_id):
-        node_id = node_to_id[node]
-        viz_node = VizNode(node_id, "Let", "(var, val, body)")
-        viz_edges = [
-            VizEdge(node_to_id[node.var], node_id),
-            VizEdge(node_to_id[node.value], node_id),
-            VizEdge(node_to_id[node.body], node_id),
-        ]
-        return viz_node, viz_edges
-
-    def _global_var(self, node, node_to_id):
-        node_id = node_to_id[node]
-        viz_node = VizNode(node_id, "GlobalVar", node.name_hint)
-        viz_edges = []
-        return viz_node, viz_edges
-
-    def _if(self, node, node_to_id):
-        node_id = node_to_id[node]
-        viz_node = VizNode(node_id, "If", "(cond, true, false)")
-        viz_edges = [
-            VizEdge(node_to_id[node.cond], node_id),
-            VizEdge(node_to_id[node.true_branch], node_id),
-            VizEdge(node_to_id[node.false_branch], node_id),
-        ]
-        return viz_node, viz_edges
-
-    def _op(self, node, node_to_id):
-        node_id = node_to_id[node]
-        op_name = node.name
-        viz_node = VizNode(node_id, op_name, "")
-        viz_edges = []
-        return viz_node, viz_edges
-
-    def _function(self, node, node_to_id):
-        node_id = node_to_id[node]
-        viz_node = VizNode(node_id, "Func", str(node.params))
-        viz_edges = [VizEdge(node_to_id[node.body], node_id)]
-        return viz_node, viz_edges
-
-
-class TermNode:
-    """TermNode is aimed to generate text more suitable for terminal visualization."""
-
-    def __init__(self, viz_node: VizNode):
-        self.type = viz_node.type_name
-        # We don't want too many lines in a terminal.
-        self.other_info = viz_node.detail.replace("\n", ", ")
-
-
-class TermGraph(VizGraph):
-    """Terminal graph for a relay IR Module
-
-    Parameters
-    ----------
-    name: str
-        name of this graph.
-    """
-
-    def __init__(self, name: str):
-        self._name = name
-        # A graph in adjacency list form.
-        # The key is source node, and the value is a list of destination nodes.
-        self._graph = {}
-        # a hash table for quick searching.
-        self._id_to_term_node = {}
-        # node_id in reversed post order
-        # That mean, root is the first node.
-        self._node_id_rpo = deque()
-
-    def node(self, viz_node: VizNode) -> None:
-        """Add a node to the underlying graph.
-        Nodes in a Relay IR Module are expected to be added in the post-order.
-
-        Parameters
-        ----------
-        viz_node : VizNode
-            A `VizNode` instance.
-        """
-
-        self._node_id_rpo.appendleft(viz_node.identity)
-
-        if viz_node.identity not in self._graph:
-            # Add the node into the graph.
-            self._graph[viz_node.identity] = []
-
-        # Create TermNode from VizNode
-        node = TermNode(viz_node)
-        self._id_to_term_node[viz_node.identity] = node
-
-    def edge(self, viz_edge: VizEdge) -> None:
-        """Add an edge to the terminal graph.
-
-        Parameters
-        ----------
-        viz_edge : VizEdge
-            A `VizEdge` instance.
-        """
-        # Take CallNode as an example, instead of "arguments point to CallNode",
-        # we want "CallNode points to arguments" in ast-dump form.
-        #
-        # The direction of edge is typically controlled by the implemented VizParser.
-        # Reverse start/end here simply because we leverage default parser implementation.
-        if viz_edge.end in self._graph:
-            self._graph[viz_edge.end].append(viz_edge.start)
-        else:
-            self._graph[viz_edge.end] = [viz_edge.start]
-
-    def render(self) -> str:
-        """Draw a terminal graph
-
-        Returns
-        -------
-        rv1: str
-            text representing a graph.
-        """
-        lines = []
-        seen_node = set()
-
-        def gen_line(indent, n_id):
-            if (indent, n_id) in seen_node:
-                return
-            seen_node.add((indent, n_id))
-
-            conn_symbol = ["|--", "`--"]
-            last = len(self._graph[n_id]) - 1
-            for i, next_n_id in enumerate(self._graph[n_id]):
-                node = self._id_to_term_node[next_n_id]
-                lines.append(
-                    f"{indent}{conn_symbol[1 if i==last else 0]}{node.type} {node.other_info}"
-                )
-                next_indent = indent
-                # increase indent for the next level.
-                next_indent += "   " if (i == last) else "|  "
-                gen_line(next_indent, next_n_id)
-
-        first_node_id = self._node_id_rpo[0]
-        first_node = self._id_to_term_node[first_node_id]
-        lines.append(f"@{self._name}({first_node.other_info})")
-        gen_line("", first_node_id)
-
-        return "\n".join(lines)
-
-
-class TermPlotter(Plotter):
-    """Terminal plotter"""
-
-    def __init__(self):
-        self._name_to_graph = {}
-
-    def create_graph(self, name):
-        self._name_to_graph[name] = TermGraph(name)
-        return self._name_to_graph[name]
-
-    def render(self, filename):
-        """If filename is None, print to stdio. Otherwise, write to the filename."""
-        lines = []
-        for name in self._name_to_graph:
-            text_graph = self._name_to_graph[name].render()
-            lines.append(text_graph)
-        if filename is None:
-            print("\n".join(lines))
-        else:
-            with open(filename, "w") as out_file:
-                out_file.write("\n".join(lines))
diff --git a/python/tvm/contrib/target/coreml.py b/python/tvm/contrib/target/coreml.py
index 8ff9e2210c14..d8846ce5f1cd 100644
--- a/python/tvm/contrib/target/coreml.py
+++ b/python/tvm/contrib/target/coreml.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel, missing-function-docstring
 """Utility to compile CoreML models"""
 
 import os
diff --git a/python/tvm/contrib/target/onnx.py b/python/tvm/contrib/target/onnx.py
deleted file mode 100644
index 239bf1e4b187..000000000000
--- a/python/tvm/contrib/target/onnx.py
+++ /dev/null
@@ -1,1110 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines, redefined-builtin
-"""Relay to ONNX codegen """
-
-import os
-import struct
-import copy
-import numpy
-import onnx
-import onnx.utils
-from onnx import numpy_helper, OperatorSetIdProto, defs
-from onnx import TensorProto
-import tvm
-from tvm import relay
-import tvm._ffi
-from tvm.relay.expr_functor import ExprVisitor
-from tvm.relay.ty import TupleType, TensorType
-
-ONNX_OPSET_VERSONS_SUPPORTED = [11]
-
-
-def run_onnx_optimizer(onnx_model):
-    """Run ONNX's optimization routines.
-
-    ONNX Optimizer was moved to an external library in
-    version 1.9.  Attempt to use the optimizer in onnx if
-    it is available, fall back to the standalone
-    onnxoptimizer otherwise, and return the model
-    unoptimized if neither are available.
-
-    """
-    try:
-        onnx_polish_model = onnx.utils.polish_model
-    except AttributeError:
-        pass
-    else:
-        return onnx_polish_model(onnx_model)
-
-    try:
-        # pylint: disable=import-outside-toplevel
-        import onnxoptimizer
-    except ImportError:
-        pass
-    else:
-        return onnxoptimizer.optimize(onnx_model)
-
-    return onnx_model
-
-
-def tvm_array_to_list(arr):
-    return tuple(x.value for x in arr)
-
-
-def get_onnx_version():
-    return onnx.__version__
-
-
-def get_node_shape(node):
-    return tuple("Any" if isinstance(i, tvm.tir.Any) else int(i) for i in node.shape)
-
-
-def infer_type(node):
-    """A method to infer the type of a relay expression."""
-    mod = tvm.IRModule.from_expr(node)
-    mod = relay.transform.InferType()(mod)
-    entry = mod["main"]
-    return entry if isinstance(node, relay.Function) else entry.body
-
-
-def call_node_infer_type(node):
-    """infer the output types of call node"""
-    infer_out = infer_type(node)
-    out_type = infer_out._checked_type_
-    if isinstance(out_type, TensorType):
-        types = [out_type]
-    elif isinstance(out_type, TupleType):
-        types = list(out_type.fields)
-    else:
-        raise RuntimeError(f"Unsupported output type {type(out_type)} in operator {node.op.name}")
-
-    return types
-
-
-def add_input(data, name, prefix, model_container):
-    input_name = f"{prefix}_{name}"
-    dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[data.dtype]
-    tensor_value_info = onnx.helper.make_tensor_value_info(input_name, dtype, shape=data.shape)
-    model_container.add_inputs([tensor_value_info])
-    data_tensor = numpy_helper.from_array(data, input_name)
-    model_container.add_initializers([data_tensor])
-    return input_name
-
-
-class OpConverter(object):
-    """Operator converter Base Class."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        """convert Relay attributes to ONNX attributes.
-        The derived classes should implement this method
-        if attributes are required by the operator
-        otherwise by default no attributes are passed
-        """
-        return {}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-        onnx_node = onnx.helper.make_node(
-            cls.__name__, node_entry["input_names"], node_entry["output_names"], **attrs
-        )
-        model_container.add_nodes([onnx_node])
-
-
-def rename(op_name):
-    """This method creates dynamic operator of name op_name with empty attributes"""
-    return type(op_name, (OpConverter,), {})
-
-
-class Reshape(object):
-    """Operator converter for Reshape."""
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        """Converts Relay operator Reshape to ONNX operator.
-        Relay operator accepts shape as attribute but ONNX operator
-        accepts it as a input.
-        """
-        name = node_entry["name"]
-        shape = numpy.asarray(
-            [a.value for a in node_entry["relay_node"].attrs.newshape], dtype=numpy.int64
-        )
-
-        input_names = [
-            node_entry["input_names"][0],
-            add_input(shape, name, "shape", model_container),
-        ]
-
-        node = onnx.helper.make_node(cls.__name__, input_names, node_entry["output_names"])
-        model_container.add_nodes([node])
-
-
-class Conv(OpConverter):
-    """Operator converter for Conv."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {
-            "group": attrs.get_int("groups"),
-            "pads": attrs.get_int_tuple("padding"),
-            "strides": attrs.get_int_tuple("strides"),
-            "dilations": attrs.get_int_tuple("dilation"),
-            "kernel_shape": attrs.get_int_tuple("kernel_size"),
-        }
-
-
-class ConvTranspose(OpConverter):
-    """Operator converter for ConvTranspose."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {
-            "group": attrs.get_int("groups"),
-            "pads": attrs.get_int_tuple("padding"),
-            "strides": attrs.get_int_tuple("strides"),
-            "dilations": attrs.get_int_tuple("dilation"),
-            "kernel_shape": attrs.get_int_tuple("kernel_size"),
-            "output_padding": attrs.get_int_tuple("output_padding"),
-        }
-
-
-class MaxPool(OpConverter):
-    """Operator converter for MaxPool."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {
-            "pads": attrs.get_int_tuple("padding"),
-            "strides": attrs.get_int_tuple("strides"),
-            "kernel_shape": attrs.get_int_tuple("pool_size"),
-            "ceil_mode": 1 if attrs.ceil_mode else 0,
-        }
-
-
-class Transpose(OpConverter):
-    """Operator converter for Transpose."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"perm": attrs.get_int_tuple("axes")} if attrs["axes"] else {}
-
-
-class MatMul(OpConverter):
-    """Operator converter for MatMul."""
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        inter_output_name = f"inter{node_entry['name']}"
-        transpose_node = onnx.helper.make_node(
-            Transpose.__name__, [node_entry["input_names"][1]], [inter_output_name], perm=(1, 0)
-        )
-        model_container.add_nodes([transpose_node])
-
-        inputs = [node_entry["input_names"][0], inter_output_name]
-        matmul_node = onnx.helper.make_node(cls.__name__, inputs, node_entry["output_names"])
-        model_container.add_nodes([matmul_node])
-
-
-class Flatten(OpConverter):
-    """Operator converter for Flatten."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"axis": 1}
-
-
-class BatchNormalization(OpConverter):
-    """Operator converter for BatchNormalization."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"epsilon": float(attrs.get_str("epsilon")), "axis": float(attrs.get_int("axis"))}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        """Converts Relay operator batch_norm to ONNX operator.
-        Relay operator has property axis to handle data in NHWC format.
-        """
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-        transpose_out_name = node_entry["input_names"][0]
-        inter_output_names = [node_entry["output_names"][0]]
-        # axis==3 means channel is specified along the 3rd axis
-        if attrs["axis"] == 3:
-            transpose_out_name = f"transpose_{node_entry['name']}"
-            node_transposed = onnx.helper.make_node(
-                Transpose.__name__,
-                [node_entry["input_names"][0]],
-                [transpose_out_name],
-                perm=[0, 3, 1, 2],
-            )
-            model_container.add_nodes([node_transposed])
-            inter_output_names = [f"batch_norm_{node_entry['name']}"]
-
-        input_names = [transpose_out_name] + node_entry["input_names"][1:]
-        batch_norm_node = onnx.helper.make_node(
-            cls.__name__, input_names, inter_output_names, epsilon=attrs["epsilon"]
-        )
-        model_container.add_nodes([batch_norm_node])
-
-        if attrs["axis"] == 3:
-            node_transposed = onnx.helper.make_node(
-                Transpose.__name__,
-                inter_output_names,
-                [node_entry["output_names"][0]],
-                perm=[0, 2, 3, 1],
-            )
-            model_container.add_nodes([node_transposed])
-
-
-class Dropout(OpConverter):
-    """Operator converter for Dropout."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"ratio": float(attrs.get_str("rate"))}
-
-
-class AveragePool(MaxPool):
-    """Operator converter for AveragePool."""
-
-
-class Concat(OpConverter):
-    """Operator converter for Concat."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"axis": attrs.get_int("axis")}
-
-
-class BiasAdd(OpConverter):
-    """Operator converter for BiasAdd."""
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node_entry can not be a Tuple"
-        input_node = input_node[0]
-        data_ndim = len(input_node["types"][0].shape)
-        axis = node_entry["relay_node"].attrs.get_int("axis")
-        if axis < 0:
-            axis = axis + data_ndim
-        new_axes = data_ndim - axis - 1
-        if new_axes:
-            inter_output_name = f"inter{node_entry['name']}"
-            unsqueeze_node = onnx.helper.make_node(
-                "Unsqueeze",
-                [node_entry["input_names"][1]],
-                [inter_output_name],
-                axes=tuple(range(1, new_axes + 1)),
-            )
-            model_container.add_nodes([unsqueeze_node])
-        else:
-            inter_output_name = node_entry["input_names"][1]
-
-        inputs = [node_entry["input_names"][0], inter_output_name]
-        matmul_node = onnx.helper.make_node("Add", inputs, node_entry["output_names"])
-        model_container.add_nodes([matmul_node])
-
-
-class ReduceMean(OpConverter):
-    """Operator converter for ReduceMean."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {
-            "axes": attrs.axis,
-            "keepdims": 0 if bool(attrs.get_int("keepdims", 0)) is False else 1,
-        }
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node can not be a Tuple"
-        input_node = input_node[0]
-        shape = input_node["types"][0].shape
-        axis = node_entry["relay_node"].attrs.axis
-        axis = list(range(shape.size())) if not axis else tvm_array_to_list(axis)
-        exclude = 0 if not bool(node_entry["relay_node"].attrs.exclude) else 1
-        keepdims = 0 if not bool(node_entry["relay_node"].attrs.keepdims) else 1
-        if exclude:
-            all_axis = list(range(len(shape)))
-            axis = set(all_axis) - set(axis)
-
-        node = onnx.helper.make_node(
-            cls.__name__,
-            node_entry["input_names"],
-            node_entry["output_names"],
-            axes=axis,
-            keepdims=keepdims,
-        )
-        model_container.add_nodes([node])
-
-
-class Pad(OpConverter):
-    """Operator converter for Pad."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        before = []
-        after = []
-        for axis_pads in attrs.pad_width:
-            before.append(axis_pads[0])
-            after.append(axis_pads[1])
-        pads = before + after
-        pads = numpy.asarray(pads, dtype=pads[0].dtype)
-        return {"pads": pads, "mode": attrs.get_str("pad_mode")}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        """Converts Relay operator Pad to ONNX operator.
-        Relay operator accepts pads as attribute but ONNX operator
-        accepts it as a input.
-        """
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-
-        name = node_entry["name"]
-        pad_data = numpy.asarray(attrs["pads"], dtype=attrs["pads"][0].dtype).astype(numpy.int64)
-
-        input_names = [
-            node_entry["input_names"][0],
-            add_input(pad_data, name, "pads", model_container),
-            node_entry["input_names"][1],
-        ]
-
-        node = onnx.helper.make_node(
-            cls.__name__, input_names, node_entry["output_names"], mode=attrs["mode"]
-        )
-        model_container.add_nodes([node])
-
-
-class Softmax(OpConverter):
-    """Operator converter for SoftMax."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"axis": attrs.axis}
-
-
-class Squeeze(OpConverter):
-    """Operator converter for Squeeze."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"axes": attrs.axis}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node can not be a Tuple"
-        input_node = input_node[0]
-        shape = input_node["types"][0].shape
-        axis = node_entry["relay_node"].attrs.get_int("axis")
-        if not axis:
-            axis = []
-            for axis_idx, val in enumerate(shape):
-                if val.value == 1:
-                    axis.append(axis_idx)
-        else:
-            axis = node_entry["relay_node"].attrs.get_int_tuple("axis")
-
-        node = onnx.helper.make_node(
-            cls.__name__, node_entry["input_names"], node_entry["output_names"], axes=axis
-        )
-        model_container.add_nodes([node])
-
-
-class Slice(OpConverter):
-    """Operator converter for Slice."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {
-            "starts": attrs.get_int_tuple("begin"),
-            "ends": attrs.get_int_tuple("end"),
-            "steps": attrs.get_int_tuple("strides"),
-            "slice_mode": attrs.get_str("slice_mode"),
-        }
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-
-        name = node_entry["name"]
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node can not be a Tuple"
-        input_node = input_node[0]
-        shape = input_node["types"][0].shape
-
-        starts = list(attrs["starts"])
-        ends = list(attrs["ends"])
-        steps = list(attrs["steps"])
-        starts += [0] * (len(shape) - len(starts))
-        ends += [shape[i] + 1 for i in range(len(ends), len(shape))]
-        axes = list(range(len(shape)))
-
-        if attrs["slice_mode"] == "size":
-            ends = [
-                starts[i] + (shape[i] + 1 if ends[i] < 0 else ends[i]) for i in range(len(shape))
-            ]
-            steps = [1] * len(shape)
-        else:
-            steps += [1] * (len(shape) - len(steps))
-
-        starts = numpy.asarray(starts).astype(numpy.int64)
-        ends = numpy.asarray(ends).astype(numpy.int64)
-        axes = numpy.asarray(axes).astype(numpy.int64)
-        steps = numpy.asarray(steps).astype(numpy.int64)
-
-        input_names = []
-        input_names.append(add_input(starts, name, "starts", model_container))
-        input_names.append(add_input(ends, name, "ends", model_container))
-        input_names.append(add_input(axes, name, "axes", model_container))
-        input_names.append(add_input(steps, name, "steps", model_container))
-
-        input_names = [node_entry["input_names"][0]] + input_names
-
-        slice_node = onnx.helper.make_node(cls.__name__, input_names, node_entry["output_names"])
-        model_container.add_nodes([slice_node])
-
-
-class Split(OpConverter):
-    """Operator converter for Split."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        indices_or_sections = attrs["indices_or_sections"]
-
-        if isinstance(indices_or_sections, (list, tvm.ir.container.Array)):
-            indices_or_sections = attrs.get_int_tuple("indices_or_sections")
-        if isinstance(indices_or_sections, tvm.ir.PrimExpr):
-            indices_or_sections = indices_or_sections.value
-
-        return {"indices_or_section": indices_or_sections, "axis": attrs.get_int("axis")}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node can not be a Tuple"
-        input_node = input_node[0]
-        shape = get_node_shape(input_node["types"][0])
-
-        indices_or_sect = attrs["indices_or_section"]
-        axis = attrs["axis"]
-        axis_length = shape[axis]
-
-        if isinstance(indices_or_sect, int):
-            split = [axis_length // indices_or_sect] * indices_or_sect
-        else:
-            split = []
-            for i in range(len(indices_or_sect) + 1):
-                if i == 0:
-                    split.append(indices_or_sect[0])
-                elif i == len(indices_or_sect):
-                    split.append(axis_length - indices_or_sect[-1])
-                else:
-                    split.append(indices_or_sect[i] - indices_or_sect[i - 1])
-
-        slice_node = onnx.helper.make_node(
-            cls.__name__,
-            node_entry["input_names"],
-            node_entry["output_names"],
-            split=split,
-            axis=axis,
-        )
-        model_container.add_nodes([slice_node])
-
-
-class LayoutTransform(OpConverter):
-    """Operator converter for Layouttransform"""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        src_layout = attrs.get_str("src_layout")
-        dst_layout = attrs.get_str("dst_layout")
-
-        perm = [src_layout.index(c) for c in dst_layout]
-        return {"perm": tuple(perm)}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-        onnx_node = onnx.helper.make_node(
-            "Transpose", node_entry["input_names"], node_entry["output_names"], **attrs
-        )
-        model_container.add_nodes([onnx_node])
-
-
-class Clip(OpConverter):
-    """Operator converter for Clip."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"min": attrs.a_min, "max": attrs.a_max}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-
-        name = node_entry["name"]
-
-        min_val = numpy.asarray(attrs["min"]).astype(numpy.float32)
-        max_val = numpy.asarray(attrs["max"]).astype(numpy.float32)
-
-        input_names = []
-        input_names.append(add_input(min_val, name, "min", model_container))
-        input_names.append(add_input(max_val, name, "max", model_container))
-
-        input_names = [node_entry["input_names"][0]] + input_names
-
-        node = onnx.helper.make_node(cls.__name__, input_names, node_entry["output_names"])
-        model_container.add_nodes([node])
-
-
-class Expand(OpConverter):
-    """Operator converter for Expand_dims."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"axis": attrs.axis, "num_newaxis": attrs.num_newaxis}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-
-        name = node_entry["name"]
-
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node_entry can not be a Tuple"
-        input_node = input_node[0]
-        data_shape = input_node["types"][0].shape
-        new_shape = list(data_shape)
-
-        for _ in range(attrs["num_newaxis"]):
-            new_shape.insert(attrs["axis"], 1)
-
-        new_shape = numpy.asarray(new_shape).astype(numpy.int64)
-        input_names = []
-        input_names.append(add_input(new_shape, name, "shape", model_container))
-
-        input_names = [node_entry["input_names"][0]] + input_names
-
-        node = onnx.helper.make_node(cls.__name__, input_names, node_entry["output_names"])
-        model_container.add_nodes([node])
-
-
-class ConstantOfShapeZeros(OpConverter):
-    """Operator converter for ConstantOfShape."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"value": 0}
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node can not be a Tuple"
-        input_node = input_node[0]
-        dtype = input_node["types"][0].dtype
-
-        name = node_entry["name"]
-        shape = [val.value for val in input_node["types"][0].shape]
-        shape = numpy.asarray(shape).astype(numpy.int64)
-
-        input_names = []
-        input_names.append(add_input(shape, name, "shape", model_container))
-
-        dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(dtype)]
-        tensor_value = onnx.helper.make_tensor("value", dtype, [1], [attrs["value"]])
-
-        node = onnx.helper.make_node(
-            "ConstantOfShape", input_names, node_entry["output_names"], value=tensor_value
-        )
-        model_container.add_nodes([node])
-
-
-class ConstantOfShapeOnes(ConstantOfShapeZeros):
-    """Operator converter for ConstantOfShape."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"value": 1}
-
-
-class LRN(OpConverter):
-    """Operator converter for LRN."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        """axis attr is not supported as an argument in onnx.
-        Onnx only supports axis=1 (channels)."""
-        if attrs.get_int("axis") != 1:
-            raise RuntimeError(
-                f"Unsupported axis {attrs.get_int('axis')} in operator relay lrn operator. "
-                f"Only axis = 1 is supported by Onnx."
-            )
-
-        return {"alpha": attrs.alpha, "beta": attrs.beta, "bias": attrs.bias, "size": attrs.size}
-
-
-class Cast(OpConverter):
-    """Operator converter for Cast."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        return {"to": getattr(TensorProto, attrs.dtype.upper())}
-
-
-class Resize(OpConverter):
-    """Operator converter for Resize."""
-
-    @classmethod
-    def convert_attributes(cls, attrs):
-        method = attrs.get_str("method")
-        if method == "nearest_neighbor":
-            mode = b"nearest"
-        elif "linear" in method:  # linear / bilinear
-            mode = b"linear"
-        elif "cubic" in method:  # cubic / bicubic
-            mode = b"cubic"
-        else:
-            raise RuntimeError(f"Unsupported method {method} in operator Resize")
-
-        coord_trans = attrs.get_str("coordinate_transformation_mode")
-        if coord_trans == "half_pixel":
-            coord_trans = b"half_pixel"
-        elif coord_trans == "align_corners":
-            coord_trans = b"align_corners"
-        elif coord_trans == "asymmetric":
-            coord_trans = b"asymmetric"
-        else:
-            raise RuntimeError(
-                f"Unsupported coordinate transform mode {coord_trans} in operator Resize"
-            )
-
-        rounding_method = attrs.get_str("rounding_method")
-        if rounding_method == "round":
-            rounding_method = b"round_prefer_ceil"
-        elif rounding_method == "floor":
-            rounding_method = b"floor"
-        elif rounding_method == "ceil":
-            rounding_method = b"ceil"
-        else:
-            raise RuntimeError(f"Unsupported rounding method {rounding_method} in operator Resize")
-
-        size = attrs.get_int_tuple("size")
-
-        return {
-            "mode": mode,
-            "coord_trans": coord_trans,
-            "size": size,
-            "nearest_mode": rounding_method,
-        }
-
-    @classmethod
-    def convert(cls, node_entry, model_container, node_dict):
-        attrs = cls.convert_attributes(node_entry["relay_node"].attrs)
-
-        name = node_entry["name"]
-        input_node = node_dict[node_entry["inputs"][0]]
-        assert len(input_node) == 1, "input node can not be a Tuple"
-        input_node = input_node[0]
-        input_shape = input_node["types"][0].shape
-
-        # (TBD) needed in opset 11
-        roi = [0] * len(input_shape) + [1] * len(input_shape)
-        roi_array = numpy.asarray(roi).astype(numpy.float64)
-        roi_node = add_input(roi_array, name, "roi", model_container)
-
-        out_size = attrs["size"]
-
-        # (onnx) rank of scale / size must match rank of X
-        # relay size node contains only spatial dimensions
-        # pad with 1s to match rank
-        match_rank_pad = len(input_shape) - len(out_size)
-        out_size_full_rank = input_shape[:match_rank_pad] + list(out_size)
-        out_size_array = numpy.asarray(out_size_full_rank).astype(numpy.int64)
-
-        input_size_array = numpy.asarray(list(input_shape)).astype(numpy.int64)
-
-        scale_array = numpy.divide(out_size_array, input_size_array).astype(numpy.float32)
-        scale_node = add_input(scale_array, name, "scales", model_container)
-
-        input_names = [node_entry["input_names"][0], roi_node, scale_node]
-
-        resize_node = onnx.helper.make_node(
-            cls.__name__,
-            input_names,
-            node_entry["output_names"],
-            mode=attrs["mode"],
-            coordinate_transformation_mode=attrs["coord_trans"],
-            nearest_mode=attrs["nearest_mode"],
-        )
-        model_container.add_nodes([resize_node])
-
-
-relay_to_onnx_op_mapping = {
-    "reshape": Reshape,
-    "nn.conv2d": Conv,
-    "nn.conv2d_transpose": ConvTranspose,
-    "add": rename("Add"),
-    "nn.relu": rename("Relu"),
-    "transpose": Transpose,
-    "nn.dense": MatMul,
-    "nn.max_pool2d": MaxPool,
-    "nn.batch_flatten": Flatten,
-    "multiply": rename("Mul"),
-    "nn.bias_add": BiasAdd,
-    "nn.batch_norm": BatchNormalization,
-    "nn.global_avg_pool2d": rename("GlobalAveragePool"),
-    "concatenate": Concat,
-    "nn.dropout": Dropout,
-    "nn.avg_pool2d": AveragePool,
-    "divide": rename("Div"),
-    "mean": ReduceMean,
-    "nn.pad": Pad,
-    "nn.softmax": Softmax,
-    "squeeze": Squeeze,
-    "strided_slice": Slice,
-    "greater": rename("Greater"),
-    "less": rename("Less"),
-    "equal": rename("Equal"),
-    "zeros_like": ConstantOfShapeZeros,
-    "ones_like": ConstantOfShapeOnes,
-    "subtract": rename("Sub"),
-    "split": Split,
-    "exp": rename("Exp"),
-    "layout_transform": LayoutTransform,
-    "clip": Clip,
-    "expand_dims": Expand,
-    "nn.lrn": LRN,
-    "sigmoid": rename("Sigmoid"),
-    "copy": rename("Identity"),
-    "round": rename("Round"),
-    "cast": Cast,
-    "image.resize2d": Resize,
-}
-
-
-class ModelContainer(object):
-    """A container class to hold  different attributes of ONNX model graph"""
-
-    def __init__(self, name, opset_version):
-        self._name = name
-        self._opset_version = opset_version
-        self._inputs = []
-        self._outputs = []
-        self._nodes = []
-        self._initializers = []
-
-    def add_inputs(self, inputs):
-        self._inputs.extend(inputs)
-
-    def add_outputs(self, outputs):
-        self._outputs.extend(outputs)
-
-    def add_nodes(self, nodes):
-        self._nodes.extend(nodes)
-
-    def add_initializers(self, initializers):
-        self._initializers.extend(initializers)
-
-    def _get_opsets(self):
-        opsets = []
-        imp = OperatorSetIdProto()
-        imp.version = self._opset_version
-        opsets.append(imp)
-        return opsets
-
-    def make_model(self):
-        """Creates the onnx model from the graph"""
-        onnx_graph = onnx.helper.make_graph(
-            self._nodes, self._name, self._inputs, self._outputs, self._initializers
-        )
-        kwargs = {}
-        kwargs["opset_imports"] = self._get_opsets()
-        kwargs["producer_name"] = "TVM Relay"
-        kwargs["producer_version"] = tvm.__version__
-
-        return onnx.helper.make_model(onnx_graph, **kwargs)
-
-
-class RelayToONNXConverter(ExprVisitor):
-    """A helper class to traverse the Relay graph and convert Relay nodes to ONNX model
-
-    Parameters
-    ----------
-    name : str
-       name of the model
-
-    params : dict
-        dict of the parameter names and NDarray values
-
-    opset_version : int
-        target onnx opset version
-
-    """
-
-    def __init__(self, name, params, opset_version):
-        super().__init__()
-        self._name = name
-        self._mc = ModelContainer(name, opset_version)
-        self._params = params
-        self._node_dict = {}
-        self._node_count = 0
-        self.last_node = None
-
-    @classmethod
-    def _get_node_entry(cls, relay_node, name):
-        return {
-            "relay_node": relay_node,
-            "inputs": [relay_node],  # inputs in the form of relay nodes
-            "types": [],  # output types in case of call nodes else self type
-            "name": name,  # name of the node
-            "input_names": [name],  # input names in case of call nodes else self name
-            "output_names": [name],  # output names in case of call nodes else self name
-            "op": None,  # op name in case of call node else None
-        }
-
-    def convert_to_onnx(self, func):
-        """Traverse Relay graph and generate a ONNX model"""
-
-        self.visit(func)
-        self._add_output(self._node_dict[self.last_node])
-        model = self._mc.make_model()
-        return run_onnx_optimizer(model)
-
-    def visit(self, expr):
-        self._node_count += 1
-        super().visit(expr)
-
-    def visit_constant(self, const):
-        node_index = self._node_count
-        name = self._name + "_const_" + str(node_index)
-        node_entry = self._get_node_entry(const, name)
-        node_entry["types"] = [const.checked_type]
-
-        self._add_constant_input(node_entry, node_index)
-        self._node_dict[const] = [node_entry]
-
-    def visit_var(self, var):
-        node_index = self._node_count
-        node_entry = self._get_node_entry(var, var.name_hint)
-        node_entry["types"] = [var.type_annotation]
-
-        self._add_input(node_entry, node_index)
-        self._node_dict[var] = [node_entry]
-
-    def visit_tuple(self, tup):
-        self._node_dict[tup] = []
-        for f in tup.fields:
-            self.visit(f)
-            self._node_dict[tup].extend(self._node_dict[f])
-
-        self.last_node = tup
-
-    def visit_tuple_getitem(self, t):
-        self.visit(t.tuple_value)
-        tup_node = self._node_dict[t.tuple_value]
-        if len(tup_node) > 1:
-            self._node_dict[t] = tup_node[t.index]
-        else:
-            node_entry = copy.deepcopy(tup_node[0])
-            output_names = [node_entry["output_names"][t.index]]
-            node_entry["output_names"] = output_names
-            self._node_dict[t] = [node_entry]
-        self.last_node = t
-
-    def visit_call(self, call):
-        node_index = self._node_count
-        op = call.op
-        name = f"{op}_{node_index}"
-        node_entry = self._get_node_entry(call, name)
-
-        node_entry["op"] = op
-        node_entry["input_names"] = []
-        node_entry["inputs"] = []
-        node_entry["output_names"] = None
-        for input_arg in call.args:
-            self.visit(input_arg)
-            input_names = []
-            for arg_node_entry in self._node_dict[input_arg]:
-                input_names.extend(arg_node_entry["output_names"])
-            node_entry["input_names"].extend(input_names)
-            node_entry["inputs"].extend([input_arg])
-
-        node_entry["types"] = call_node_infer_type(call)
-        node_entry["output_names"] = []
-        for i in range(len(node_entry["types"])):
-            node_entry["output_names"].append(name + str(i))
-        self.last_node = call
-        self._add_node(node_entry, node_index)
-        self._node_dict[call] = [node_entry]
-
-    def _add_node(self, node_entry, idx):
-        """Convert Relay operator node to ONNX operator and add it to container nodes list"""
-        if node_entry["op"].name not in relay_to_onnx_op_mapping:
-            raise NotImplementedError(
-                f"Currently the operator '{node_entry['op'].name}' is " "not supported."
-            )
-        converter = relay_to_onnx_op_mapping[node_entry["op"].name]()
-
-        return converter.convert(node_entry, self._mc, self._node_dict)
-
-    def _add_params(self, node_entry, idx):
-        """Add param value to initializer and name to inputs"""
-        param_name = node_entry["name"]
-        assert param_name in self._params, (
-            f"The parameter {param_name} is not present" "in params dict provided."
-        )
-        value = self._params[param_name]
-        numpy_array = value.numpy()
-        tensor = numpy_helper.from_array(numpy_array, param_name)
-        self._mc.add_initializers([tensor])
-        dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[numpy_array.dtype]
-        input = onnx.helper.make_tensor_value_info(param_name, dtype, shape=numpy_array.shape)
-        self._mc.add_inputs([input])
-
-    def _add_constant_input(self, node_entry, idx):
-        """Create named input for constant and add it to container inputs.
-        If input is a parameter then add to param
-        """
-        node = node_entry["relay_node"]
-        param_name = node_entry["name"]
-        self._params[param_name] = node.data
-        self._add_params(node_entry, idx)
-
-    def _add_input(self, node_entry, idx):
-        """Add input node to container inputs. If input is a parameter then add to param"""
-        if node_entry["name"] in self._params:
-            self._add_params(node_entry, idx)
-        else:
-            node_type = node_entry["types"][0]
-            dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(node_type.dtype)]
-            input = onnx.helper.make_tensor_value_info(
-                node_entry["name"], dtype, shape=get_node_shape(node_type)
-            )
-            self._mc.add_inputs([input])
-
-    def _add_output(self, node_entries):
-        """Add output node to container outputs."""
-
-        for node_entry in node_entries:
-            for node_type, output_name in zip(node_entry["types"], node_entry["output_names"]):
-                dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(node_type.dtype)]
-                output = onnx.helper.make_tensor_value_info(
-                    output_name, dtype, shape=get_node_shape(node_type)
-                )
-                self._mc.add_outputs([output])
-
-
-def to_onnx(relay_ir, params, name, opset_version=11, path=None):
-    """Convert a Relay Function Module into an equivalent ONNX and serialize it to the path
-
-    Parameters
-    ----------
-    relay_ir : tvm.ir.IRModule or tvm.relay.Function
-        The relay module object
-
-    params : dict
-        dict of the parameter names and NDarray values
-
-    name : str
-        name of the output ONNX graph
-
-    opset_version : int
-        target onnx opset version
-
-    path : str
-        The path where ONNX model will be saved
-
-    Returns
-    -------
-    onnx_model : onnx.ModelProto
-        converted ONNX model as a ModelProto.
-
-    """
-
-    if opset_version not in ONNX_OPSET_VERSONS_SUPPORTED:
-        raise NotImplementedError("Currently only opset version 11 is supported.")
-
-    if opset_version > defs.onnx_opset_version():
-        raise Exception(
-            f"The ONNX package installed of version {get_onnx_version()} does not support the "
-            f"opset version {opset_version}. Upgrade the ONNX package to latest version."
-        )
-
-    func = relay_ir["main"] if isinstance(relay_ir, tvm.ir.IRModule) else relay_ir
-    converter = RelayToONNXConverter(name, params, opset_version)
-    onnx_model = converter.convert_to_onnx(func)
-
-    if path:
-        onnx.save(onnx_model, path)
-    return onnx_model
-
-
-@tvm._ffi.register_func("relay.ext.onnx")
-def onnx_compiler(func):
-    """Create a runtime module for ONNX from Relay Function
-
-    :param func: Relay function
-    :return: runtime module for ONNX
-    """
-
-    assert isinstance(func, tvm.relay.function.Function)
-    name = str(func.attrs.global_symbol)
-    model = to_onnx(func, {}, name)
-    const_vars = [const.name for const in model.graph.initializer]
-    name_bytes = bytes(name, "utf-8")
-    name_size = struct.pack("I", len(name_bytes))
-    model_serialized = model.SerializeToString()
-    model_size = struct.pack("I", model.ByteSize())
-    data = b"" + name_size + name_bytes + model_size + model_serialized
-
-    runtime_func = "runtime.ONNXModuleCreate"
-    fcreate = tvm._ffi.get_global_func(runtime_func)
-    return fcreate(data.hex(), name, const_vars)
-
-
-@tvm._ffi.register_func("relay.ext.onnx.save_to_file")
-def save_to_file(hex_str, path=None, fmt="onnx"):
-    """Store the ONNX subgraphs in the path folder
-
-    :param hex_str: Subgrah names and corresponding serialized onnx hex string
-    :param path: path to which ONNX files to be stored
-                It is assumed that path exists
-    :param fmt: extension of the files to be stored
-    """
-    onnx_ir = bytes.fromhex(hex_str)
-
-    offset = 0
-    while offset < len(onnx_ir):
-        stop = offset + 4
-        (name_size,) = struct.unpack("I", onnx_ir[offset:stop])
-        name = onnx_ir[stop : stop + name_size].decode("utf-8")
-        stop = stop + name_size
-        (model_size,) = struct.unpack("I", onnx_ir[stop : stop + 4])
-        stop = stop + 4
-        model_serialized = onnx_ir[stop : stop + model_size]
-        offset = stop + model_size
-
-        model_onnx = onnx.load_model_from_string(model_serialized)
-        onnx.save(model_onnx, f"{path}{os.path.sep}{name}.{fmt}")
diff --git a/python/tvm/contrib/torch/__init__.py b/python/tvm/contrib/torch/__init__.py
deleted file mode 100644
index c3dd34d47044..000000000000
--- a/python/tvm/contrib/torch/__init__.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wrong-import-position,redefined-builtin,invalid-name
-"""Module container of Pytorch custom class"""
-import os
-import platform
-import warnings
-import torch
-from tvm._ffi import libinfo
-
-
-def _load_platform_specific_library(lib_name):
-    system = platform.system()
-    if system == "Darwin":
-        lib_file_name = lib_name + ".dylib"
-    elif system == "Windows":
-        lib_file_name = lib_name + ".dll"
-    else:
-        lib_file_name = lib_name + ".so"
-    lib_path = libinfo.find_lib_path()[0]
-    lib_dir = os.path.dirname(lib_path)
-    lib_file_path = os.path.join(lib_dir, lib_file_name)
-    try:
-        torch.classes.load_library(lib_file_path)
-    except OSError as err:
-        errmsg = str(err)
-        if errmsg.find("undefined symbol") != -1:
-            reason = " ".join(
-                (
-                    "Got undefined symbol error,",
-                    "which might be due to the CXXABI incompatibility.",
-                )
-            )
-        else:
-            reason = errmsg
-        warnings.warn(
-            f"The library {lib_name} is not built successfully. {reason}",
-            RuntimeWarning,
-        )
-
-
-_load_platform_specific_library("libpt_tvmdsoop")
-_load_platform_specific_library("libpt_tvmdsoop_new")
-
-from . import module
-
-GraphModule = module.GraphModule
-VMModule = module.VMModule
-TraceTvmModule = module.TraceTvmModule
-
-from . import pytorch_tvm
-
-PyTorchTVMModule = pytorch_tvm.PyTorchTVMModule
-compile = pytorch_tvm.compile
-
-from . import as_torch
-
-TVMScriptIRModule = as_torch.OperatorModuleWrapper
-as_torch = as_torch.as_torch
-
-from . import optimize_torch
-
-GraphExecutorFactoryWrapper = optimize_torch.GraphExecutorFactoryWrapper
-optimize_torch = optimize_torch.optimize_torch
diff --git a/python/tvm/contrib/torch/as_torch.py b/python/tvm/contrib/torch/as_torch.py
deleted file mode 100644
index c78d7bb831c5..000000000000
--- a/python/tvm/contrib/torch/as_torch.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# pylint: disable=inconsistent-return-statements
-#!/usr/bin/env python
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-module-docstring
-# pylint: disable=missing-class-docstring
-# pylint: disable=missing-function-docstring
-"""
-as_torch: a decorator, which is used to wrap the TVMScript code to `torch.nn.module`.
-"""
-import tempfile
-from typing import Callable, List, Optional, Union
-
-# isort: off
-from typing_extensions import Literal
-
-# isort: on
-
-import torch
-import torch.utils.dlpack
-
-import tvm
-from tvm import meta_schedule as ms
-from tvm.target.target import Target
-from tvm.tir import PrimFunc
-
-
-# python wrapper for OperatorModule
-class OperatorModuleWrapper(torch.nn.Module):
-    def __init__(
-        self,
-        module: Union[
-            tvm.ir.module.IRModule,
-            tvm.tir.function.PrimFunc,
-        ],
-    ):
-        super().__init__()
-        self.rt_module = None  # runtime module
-        self.ir_module = module  # IR modules
-
-    def tune(
-        self,
-        target: Union[str, Target] = "cpu",
-        max_trials_global: int = 32,
-        *,
-        num_trials_per_iter: int = 32,
-        builder: ms.Builder.BuilderType = "local",
-        runner: ms.Runner.RunnerType = "local",
-        database: ms.Database.DatabaseType = "json",
-        cost_model: ms.CostModel.CostModelType = "xgb",
-        measure_callbacks: ms.MeasureCallback.CallbackListType = "default",
-        task_scheduler: ms.TaskScheduler.TaskSchedulerType = "round-robin",
-        space: ms.SpaceGenerator.SpaceGeneratorType = "post-order-apply",
-        strategy: ms.SearchStrategy.SearchStrategyType = "replay-trace",
-        num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical",
-        seed: Optional[int] = None,
-    ) -> None:
-        """
-        Tune the TVMScript code.
-
-        Parameters
-        ----------
-        config: Optional[TuneConfig]
-            The tuning configuration.
-
-        target : Optional[str, Target]
-            The target to tune for.
-        """
-        if target == "cpu":
-            target = f"llvm --num-cores {ms.utils.cpu_count(logical=False)}"
-
-        with tempfile.TemporaryDirectory() as work_dir:
-            database = ms.tir_integration.tune_tir(
-                mod=self.ir_module,
-                target=target,
-                work_dir=work_dir,
-                max_trials_global=max_trials_global,
-                num_trials_per_iter=num_trials_per_iter,
-                builder=builder,
-                runner=runner,
-                database=database,
-                cost_model=cost_model,
-                measure_callbacks=measure_callbacks,
-                task_scheduler=task_scheduler,
-                space=space,
-                strategy=strategy,
-                num_tuning_cores=num_tuning_cores,
-                seed=seed,
-            )
-            sch = ms.tir_integration.compile_tir(database, self.ir_module, target)
-            self.ir_module = sch.mod
-            self.build(target)
-
-    def script(self):
-        return self.ir_module.script()
-
-    def build(self, target=None):
-        runtime_module = tvm.build(self.ir_module, target=target)
-        func = tvm.get_global_func("tvmtorch.save_runtime_mod", allow_missing=True)
-
-        if func is None:
-            raise ValueError('as_torch requires the flag /"USE_PT_TVMDSOOP/" set in config.cmake')
-        func(runtime_module)
-
-        self.rt_module = torch.classes.tvm_torch.OperatorModuleWrapper()
-
-    def forward(self, *torch_inputs: List[torch.Tensor]) -> List[torch.Tensor]:
-        if self.rt_module is None:
-            if torch_inputs[0].is_cuda:
-                self.build(target="cuda")
-            elif torch_inputs[0].device.type == "cpu":
-                self.build()
-            else:
-                raise Exception(f"the target {torch_inputs[0].device.type} is not supported yet")
-
-        return self.rt_module.forward(torch_inputs)
-
-
-def as_torch(func: Union[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Callable]):
-    """A decorator of converting TensorIR to PyTorch nn.Module.
-
-    Parameters
-    ----------
-    func: Optional[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Callable]
-        The function written by TVMScript.
-
-    Returns
-    -------
-    mod : Union[OperatorModuleWrapper, Callable]
-        It will return an object, or a templated function of OperatorModuleWrapper,
-        which is the subclass of the original nn.Module.
-
-    """
-    if isinstance(func, (tvm.ir.module.IRModule, PrimFunc)):
-        return OperatorModuleWrapper(func)
-    if callable(func):
-
-        def func_get_param(*args, **kwargs):
-            return OperatorModuleWrapper(func(*args, **kwargs))
-
-        return func_get_param
diff --git a/python/tvm/contrib/torch/module.py b/python/tvm/contrib/torch/module.py
deleted file mode 100644
index cfa3ad264c3a..000000000000
--- a/python/tvm/contrib/torch/module.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Module container of PyTorch custom class"""
-import warnings
-from typing import List
-
-import torch
-
-
-class GraphModule(torch.nn.Module):
-    r"""Module container of Pytorch class which wraps exported
-    TVM op implementation library to be called on Pytorch side"""
-
-    @classmethod
-    def shape_repr(cls, input_shapes):
-        return torch.ops.tvm_dsoop.tvm_shape_repr(input_shapes)
-
-    def __init__(self, num_inputs, num_outputs, device=None):
-        warnings.warn(
-            "This module will be removed at TVM version 0.11",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        super().__init__()
-        self.dummy_param = torch.nn.Parameter(torch.empty(0))
-        self.engine = None
-
-        if device is not None:
-            self.to(device)
-        self.engine = torch.classes.tvm_dsoop.TvmGraphModule(num_inputs, num_outputs, self.device)
-
-    def init(self, input_shapes, lib_path, graph_path, params_path):
-        r"""Load tvm module"""
-        self.engine.load_tvm_module(input_shapes, lib_path, graph_path, params_path)
-
-    def forward(self, inputs: List[torch.Tensor]):
-        r"""Call tvm module to forward"""
-        return self.engine.forward(inputs)
-
-    @property
-    def device(self):
-        r"""Get the device string"""
-        return str(self.dummy_param.device)
-
-    def _apply(self, fn):
-        r"""Override to device function, manually move tvm module to desired device"""
-        super()._apply(fn)
-        if self.engine is not None:
-            self.engine.to(self.device)
-        return self
-
-
-class VMModule(torch.nn.Module):
-    r"""Module container of Pytorch class which wraps exported
-    TVM op implementation library to be called on Pytorch side"""
-
-    @classmethod
-    def shape_repr(cls, input_shapes):
-        return torch.ops.tvm_dsoop.tvm_shape_repr(input_shapes)
-
-    def __init__(self, num_inputs, num_outputs, device=None):
-        warnings.warn(
-            "This module will be removed at TVM version 0.11",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        super().__init__()
-        self.dummy_param = torch.nn.Parameter(torch.empty(0))
-        self.engine = None
-
-        if device is not None:
-            self.to(device)
-        self.engine = torch.classes.tvm_dsoop.TvmVMModule(num_inputs, num_outputs, self.device)
-
-    def init(self, input_shapes, lib_path, code_path):
-        r"""Load tvm module"""
-        self.engine.load_tvm_module(input_shapes, lib_path, code_path)
-
-    def forward(self, inputs: List[torch.Tensor]):
-        r"""Call tvm module to forward"""
-        return self.engine.forward(inputs)
-
-    @property
-    def device(self):
-        r"""Get the device string"""
-        return str(self.dummy_param.device)
-
-    def _apply(self, fn):
-        r"""Override to device function, manually move tvm module to desired device"""
-        super()._apply(fn)
-        if self.engine is not None:
-            self.engine.to(self.device)
-        return self
-
-
-class TraceTvmModule(torch.nn.Module):
-    r"""Wrapper for trace GraphModule
-
-    GraphModule and VMModule only supports List[Tensor] inputs and cannot be traced.
-    This is a wrapper class for trace GraphModule or VMModule in order to support
-    arbitrary number of inputs
-
-    Example:
-        import tvm.contrib.torch
-        tvm_module = tvm.contrib.torch.GraphModule(1, 1, 'cuda:0')
-        tvm_module.init(input_shapes, lib_path, graph_path, params_path)
-
-        trace_wrapper = tvm.contrib.torch.TraceGraphModule(torch.jit.script(tvm_module))
-        traced = torch.jit.trace(trace_wrapper, example_inputs)
-    """
-
-    def __init__(self, tvm_module):
-        warnings.warn(
-            "This module will be removed at TVM version 0.11",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        super().__init__()
-        self.tvm_module = tvm_module
-
-    def forward(self, *inputs):
-        outputs = self.tvm_module(inputs)
-        return outputs[0] if len(outputs) == 1 else tuple(outputs)
diff --git a/python/tvm/contrib/torch/optimize_torch.py b/python/tvm/contrib/torch/optimize_torch.py
deleted file mode 100644
index dfe35f2aae19..000000000000
--- a/python/tvm/contrib/torch/optimize_torch.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# pylint: disable=inconsistent-return-statements
-#!/usr/bin/env python
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-module-docstring
-# pylint: disable=missing-class-docstring
-# pylint: disable=missing-function-docstring
-"""
-optimize_torch: a function similar to `torch.jit.trace`,
-which is used to optimize the `torch.nn.module` by TVM metaSchedule,
-and returns a custom TorchScript operator
-"""
-
-import contextlib
-import tempfile
-from typing import Optional, Tuple, Union
-import base64
-import torch
-import torch.utils.dlpack
-import tvm
-import tvm._ffi
-from tvm._ffi import register_func
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm._ffi import get_global_func
-from tvm.target import Target
-
-
-class GraphExecutorFactoryWrapper(torch.nn.Module):
-    def __init__(self, module: tvm.runtime.Module):
-        super().__init__()
-        self.inner_module = module
-
-    def forward(self, *torch_inputs: Tuple[torch.Tensor]):
-        ret = self.inner_module.forward(torch_inputs)
-        if len(ret) == 1:
-            return ret[0]
-        return ret
-
-
-def optimize_torch(
-    func,
-    example_inputs,
-    *,
-    max_trials_global: int,
-    work_dir=None,
-    target: Union[str, Target] = "cpu",
-    max_trials_per_task: Optional[int] = None,
-    num_trials_per_iter: int = 64,
-    builder: ms.Builder.BuilderType = "local",
-    runner: ms.Runner.RunnerType = "local",
-    database: ms.Database.DatabaseType = "json",
-    cost_model: ms.CostModel.CostModelType = "xgb",
-    measure_callbacks: ms.MeasureCallback.CallbackListType = "default",
-    task_scheduler: ms.TaskScheduler.TaskSchedulerType = "gradient",
-    space: ms.SpaceGenerator.SpaceGeneratorType = "post-order-apply",
-    strategy: ms.SearchStrategy.SearchStrategyType = "evolutionary",
-    seed: Optional[int] = None,
-):
-    """Load PyTorch model that could be traced by TorchScript, then optimize it via MetaSchedule.
-
-    Parameters
-    ----------
-    func : callable or torch.nn.Module
-        A Python function or nn.Module that could run by TorchScript's trace.
-        (ie: torch.jit.trace(model, input))
-    example_inputs : tuple or torch.Tensor
-        Inputs to `torch.jit.trace`.
-    max_trials_global : int
-        The maximum number of trials to run globally.
-    work_dir : Optional[str]
-        The working directory to save intermediate results.
-    target : Optional[Union[str, Target]]
-        The target of the compilation.
-        If user doesn't set the target, the module will be built for the CPU target.
-    max_trials_per_task : Optional[int]
-        The maximum number of trials to run per task.
-    num_trials_per_iter : int
-        The number of trials to run per iteration
-    builder : Builder.BuilderType
-        The builder.
-    runner : Runner.RunnerType
-        The runner.
-    database : Database.DatabaseType
-        The database.
-    cost_model : CostModel.CostModelType
-        The cost model.
-    measure_callbacks : MeasureCallback.CallbackListType
-        The measure callbacks.
-    task_scheduler : TaskScheduler.TaskSchedulerType
-        The task scheduler.
-    space : SpaceGenerator.SpaceGeneratorType
-        The space generator to use.
-    strategy : SearchStrategy.SearchStrategyType
-        The search strategy to use.
-    seed : Optional[int]
-        The random seed to use.
-
-    Returns
-    -------
-    mod : GraphExecutorFactoryWrapper
-        It will return an object of GraphExecutorFactoryWrapper,
-        which is the subclass of the original nn.Module.
-    """
-
-    if target == "cpu":
-        target = f"llvm --num-cores {ms.utils.cpu_count(logical=False)}"
-    if not isinstance(target, Target):
-        target = Target(target)
-
-    # If `func` is already a traced module this statement makes no effect
-    jit_mod = torch.jit.trace(func, example_inputs)
-    if isinstance(example_inputs, torch.Tensor):
-        example_inputs = [example_inputs]
-    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
-    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)  # IRmodule
-
-    if work_dir:
-        context_manager = contextlib.nullcontext(work_dir)
-    else:
-        context_manager = tempfile.TemporaryDirectory()
-    with context_manager as work_dir:  # pylint: disable=redefined-argument-from-local
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            params=params,
-            target=target,
-            work_dir=work_dir,
-            max_trials_global=max_trials_global,
-            max_trials_per_task=max_trials_per_task,
-            num_trials_per_iter=num_trials_per_iter,
-            builder=builder,
-            runner=runner,
-            database=database,
-            cost_model=cost_model,
-            measure_callbacks=measure_callbacks,
-            task_scheduler=task_scheduler,
-            space=space,
-            strategy=strategy,
-            seed=seed,
-        )
-        executor_factory = ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=target,
-            params=params,
-            backend="graph",
-        )
-
-    save_runtime_mod = get_global_func("tvmtorch.save_runtime_mod", allow_missing=True)
-    if save_runtime_mod is None:
-        raise ValueError('optimize_torch requires the flag /"USE_PT_TVMDSOOP/" set in config.cmake')
-    save_runtime_mod(executor_factory.module)
-
-    return GraphExecutorFactoryWrapper(torch.classes.tvm_torch.GraphExecutorFactoryWrapper())
-
-
-@register_func("export_runtime_module")
-def save_to_base64(obj) -> bytes:
-    with tempfile.NamedTemporaryFile(suffix=".so") as tmpfile:
-        obj.export_library(tmpfile.name)
-        with open(tmpfile.name, "rb") as temp_file:
-            return base64.b64encode(temp_file.read())
diff --git a/python/tvm/contrib/torch/pytorch_tvm.py b/python/tvm/contrib/torch/pytorch_tvm.py
deleted file mode 100644
index 30b0dd4f8c0e..000000000000
--- a/python/tvm/contrib/torch/pytorch_tvm.py
+++ /dev/null
@@ -1,292 +0,0 @@
-#!/usr/bin/env python
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=redefined-builtin
-"""`compile` api that convert torch module to torch tvm module"""
-import os
-import warnings
-import tvm
-import tvm.testing
-from tvm import relay, autotvm
-from tvm.runtime import load_module
-from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
-from tvm.contrib import graph_executor
-from tvm.contrib.debugger import debug_executor
-from . import GraphModule
-
-
-def tune_tasks(
-    tasks,
-    measure_option,
-    tuner="xgb",
-    n_trial=1000,
-    early_stopping=None,
-    log_filename="tuning.log",
-    use_transfer_learning=True,
-):
-    """Tune tasks and generate tuning log to file"""
-    # create tmp log file
-    tmp_log_file = log_filename + ".tmp"
-    if os.path.exists(tmp_log_file):
-        os.remove(tmp_log_file)
-
-    for i, tsk in enumerate(reversed(tasks)):
-        prefix = f"[Task {i + 1:2d}/{len(tasks):2d}] "
-
-        # create tuner
-        if tuner == "xgb":
-            tuner_obj = XGBTuner(tsk, loss_type="reg")
-        elif tuner == "xgb_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob")
-        elif tuner == "xgb_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar")
-        elif tuner == "xgb_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve")
-        elif tuner == "xgb_rank":
-            tuner_obj = XGBTuner(tsk, loss_type="rank")
-        elif tuner == "xgb_rank_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
-        elif tuner == "xgb_rank_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar")
-        elif tuner == "xgb_rank_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve")
-        elif tuner == "xgb_rank_binary":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary")
-        elif tuner == "xgb_rank_binary_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob")
-        elif tuner == "xgb_rank_binary_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar")
-        elif tuner == "xgb_rank_binary_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve")
-        elif tuner == "ga":
-            tuner_obj = GATuner(tsk, pop_size=100)
-        elif tuner == "random":
-            tuner_obj = RandomTuner(tsk)
-        elif tuner == "gridsearch":
-            tuner_obj = GridSearchTuner(tsk)
-        else:
-            raise ValueError("Invalid tuner: " + tuner)
-
-        if use_transfer_learning:
-            if os.path.isfile(tmp_log_file):
-                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
-
-        # do tuning
-        tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(
-            n_trial=tsk_trial,
-            early_stopping=early_stopping,
-            measure_option=measure_option,
-            callbacks=[
-                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                autotvm.callback.log_to_file(tmp_log_file),
-            ],
-        )
-
-    # pick best records to a cache file
-    if not os.path.exists(log_filename):
-        with open(log_filename, "w", encoding="utf-8"):
-            pass
-    if os.path.exists(tmp_log_file):
-        autotvm.record.pick_best(tmp_log_file, log_filename)
-        os.remove(tmp_log_file)
-
-
-def get_tuning_opt(log_file="tuning.log", n_trial=200):
-    """Returns tuning options"""
-    tuning_opt = {
-        "log_filename": log_file,
-        "tuner": "random",
-        "n_trial": n_trial,
-        "early_stopping": 60,
-        "measure_option": autotvm.measure_option(
-            builder=autotvm.LocalBuilder(timeout=10),
-            runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
-        ),
-    }
-    return tuning_opt
-
-
-TVM_ASSETS = ["mod.so", "graph.json", "params"]
-
-
-class PyTorchTVMModule:
-    """Helper class for compiling pytorch module to tvm module"""
-
-    def __init__(self, target="cuda", device=tvm.cuda(0)) -> None:
-        self.script_module = None
-        self.input_infos = None
-        self.default_dtype = "float32"
-        self.mod = None
-        self.params = None
-        self.tasks = None
-        self.target = target
-        self.dev = device
-        self.log_file = None
-        self.tvm_module = None
-        self.tvm_graph = None
-        self.tvm_lib = None
-        self.tvm_params = None
-
-    def from_pytorch(self, script_module, input_infos, default_dtype="float32"):
-        self.script_module = script_module
-        self.input_infos = input_infos
-        self.default_dtype = default_dtype
-        self.mod, self.params = relay.frontend.from_pytorch(
-            script_module, input_infos, default_dtype=default_dtype
-        )
-
-    def tune_tvm(self, log_file="tuning.log", n_trial=200):
-        self.tasks = autotvm.task.extract_from_program(
-            self.mod["main"],
-            target=self.target,
-            params=self.params,
-        )
-        self.log_file = log_file
-        tuning_opt = get_tuning_opt(log_file, n_trial)
-        tune_tasks(self.tasks, **tuning_opt)
-
-    def build_tvm(self, export_dir, debug_runtime=False):
-        tvm_mod = self._build_tvm(debug_runtime)
-        self._export_tvm(export_dir)
-        return tvm_mod
-
-    def _build_tvm(self, debug_runtime=False):
-        # compile kernels with history best records
-        with autotvm.apply_history_best(self.log_file):
-            with tvm.transform.PassContext(opt_level=3):
-                self.tvm_graph, self.tvm_lib, self.tvm_params = relay.build(
-                    self.mod, target=self.target, params=self.params
-                )
-
-        if not debug_runtime:
-            self.tvm_module = graph_executor.create(self.tvm_graph, self.tvm_lib, device=self.dev)
-        else:
-            self.tvm_module = debug_executor.create(self.tvm_graph, self.tvm_lib, device=self.dev)
-        self.tvm_module.set_input(**self.tvm_params)
-        return self.tvm_module
-
-    def _export_tvm(self, export_dir):
-        if not os.path.isdir(export_dir):
-            os.makedirs(export_dir)
-        self.export_dir = export_dir
-        self.tvm_lib.export_library(os.path.join(export_dir, TVM_ASSETS[0]))
-        with open(os.path.join(export_dir, TVM_ASSETS[1]), "w", encoding="utf8") as fout:
-            fout.write(self.tvm_graph)
-        with open(os.path.join(export_dir, TVM_ASSETS[2]), "wb") as fout:
-            fout.write(relay.save_param_dict(self.tvm_params))
-
-    def load_tvm(self, export_dir):
-        """Load tvm module from export directory"""
-        self.export_dir = export_dir
-        self.tvm_lib = load_module(os.path.join(export_dir, TVM_ASSETS[0]))
-        with open(os.path.join(export_dir, TVM_ASSETS[1]), "r", encoding="utf8") as f:
-            self.tvm_graph = f.read()
-        with open(os.path.join(export_dir, TVM_ASSETS[2]), "rb") as f:
-            self.tvm_params = relay.load_param_dict(f.read())
-
-        self.tvm_module = graph_executor.create(self.tvm_graph, self.tvm_lib, device=self.dev)
-        self.tvm_module.set_input(**self.tvm_params)
-        return self.tvm_module
-
-    def build_pytorch_module(self, num_inputs, num_outputs, input_infos=None):
-        """Build pytorch module containing TVM Graph Module"""
-        warnings.warn(
-            " ".join(
-                (
-                    "This function will be removed at TVM version 0.11,",
-                    "we suggest users to use `optimized_torch` for tuning Torch modules instead.",
-                )
-            ),
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        assert self.export_dir, "you must build_tvm or load_tvm before"
-        input_infos = input_infos or self.input_infos
-        assert input_infos
-        assert len(input_infos) == num_inputs
-        assets = [os.path.join(self.export_dir, i) for i in TVM_ASSETS]
-        input_shapes = [i[1] for i in input_infos]
-
-        def _tvm_dev_to_pt_dev(device):
-            """convert tvm device to pytorch device string"""
-            if tvm.runtime.Device.MASK2STR[device.device_type] == "cpu":
-                return "cpu"
-            if tvm.runtime.Device.MASK2STR[device.device_type] == "cuda":
-                return f"cuda:{device.device_id}"
-            raise ValueError(f"unsupported device for pt graph module: {device}")
-
-        mod = GraphModule(num_inputs=num_inputs, num_outputs=num_outputs).to(
-            _tvm_dev_to_pt_dev(self.dev)
-        )
-        mod.init(input_shapes, *assets)
-        return mod
-
-
-def compile(script_module, option):
-    """
-    example:
-    option = {
-        "input_infos": [
-            ("x", (1, 3, 244, 244)),
-        ],
-        "default_dtype": "float16",
-        "export_dir": "pytorch_compiled",
-        "num_outputs": 1,
-        "tuning_n_trials": 20,  # set zero to skip tuning
-        "tuning_log_file": "tuning.log",
-        "target": "llvm",
-        "device": tvm.cpu(),
-    }
-    script_module = torch.jit.script(model)
-    pytorch_tvm_module = compile(script_module, option)
-    pytorch_tvm_module("model_tvm.pt")
-    """
-    warnings.warn(
-        " ".join(
-            (
-                "This function will be removed at TVM version 0.11,",
-                "we suggest users to use `optimized_torch` for tuning Torch modules instead.",
-            )
-        ),
-        DeprecationWarning,
-        stacklevel=2,
-    )
-    input_infos = option["input_infos"]
-    default_dtype = option.get("default_dtype", "float32")
-    export_dir = option.get("export_dir", "pytorch_compiled")
-    tuning_log_file = option.get("tuning_log_file", "tuning.log")
-    tuning_n_trials = option.get("tuning_n_trials", 20)
-    num_outputs = option.get("num_outputs", 1)
-    target = option.get("target", "cuda")
-    device = option.get("device", tvm.cuda(0))
-
-    mod = PyTorchTVMModule(target=target, device=device)
-    print("Converting...")
-
-    mod.log_file = tuning_log_file
-    mod.from_pytorch(script_module, input_infos, default_dtype)
-
-    if tuning_n_trials > 0:
-        print("Tuning...")
-        mod.tune_tvm(log_file=tuning_log_file, n_trial=tuning_n_trials)
-
-    print("Building...")
-    mod.build_tvm(export_dir)
-    pytorch_mod = mod.build_pytorch_module(num_inputs=len(input_infos), num_outputs=num_outputs)
-    return pytorch_mod
diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py
index 2b68600197e4..d12367330dde 100644
--- a/python/tvm/contrib/xcode.py
+++ b/python/tvm/contrib/xcode.py
@@ -16,7 +16,6 @@
 # under the License.
 # pylint: disable=invalid-name
 """Utility to invoke Xcode compiler toolchain"""
-from __future__ import absolute_import as _abs
 
 import os
 import sys
diff --git a/python/tvm/dlight/benchmark/utils.py b/python/tvm/dlight/benchmark/utils.py
index 72e0ac8de016..8edb2addae1c 100644
--- a/python/tvm/dlight/benchmark/utils.py
+++ b/python/tvm/dlight/benchmark/utils.py
@@ -37,7 +37,7 @@ def get_func_name_from_gv(gv: tvm.ir.GlobalVar) -> str:  # pylint: disable=inval
     result : str
         The global variable name without the prefix "...@".
     """
-    return gv.astext().split("@")[1] if "@" in gv.astext() else gv.astext()
+    return gv.name_hint
 
 
 def dym_var_sample_str(sample: Dict[Union[str, tvm.relax.expr.Call], int]) -> str:
diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 08af27e32f04..fb325de1d3ab 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -257,7 +257,7 @@ def build(
             target_input_mod = {}
             for tgt in target_mod.keys():
                 tir_mod = tvm.IRModule(target_mod[tgt])
-                tir_mod.with_attrs(input_mod.attrs)
+                tir_mod = tir_mod.with_attrs(input_mod.attrs)
                 target_input_mod[tgt] = tir_mod
         else:
             target_input_mod = {target: input_mod}
diff --git a/python/tvm/driver/tvmc/__init__.py b/python/tvm/driver/tvmc/__init__.py
deleted file mode 100644
index f7798a851251..000000000000
--- a/python/tvm/driver/tvmc/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=redefined-builtin,wrong-import-position
-"""
-TVMC - TVM driver command-line interface
-"""
-
-
-class TVMCException(Exception):
-    """TVMC Exception"""
-
-
-class TVMCImportError(TVMCException):
-    """TVMC TVMCImportError"""
-
-
-from . import runner
-from . import autotuner
-from . import compiler
-from . import result_utils
-from .frontends import load_model as load
-from .compiler import compile_model as compile
-from .runner import run_module as run
-from .autotuner import tune_model as tune
-from .model import TVMCModel, TVMCPackage, TVMCResult
diff --git a/python/tvm/driver/tvmc/__main__.py b/python/tvm/driver/tvmc/__main__.py
deleted file mode 100644
index 55235a6adfdd..000000000000
--- a/python/tvm/driver/tvmc/__main__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-TVMC - TVM driver command-line interface
-"""
-
-from tvm.driver import tvmc
-
-if __name__ == "__main__":
-    tvmc.main.main()
diff --git a/python/tvm/driver/tvmc/arguments.py b/python/tvm/driver/tvmc/arguments.py
deleted file mode 100644
index 57b6ee2f967a..000000000000
--- a/python/tvm/driver/tvmc/arguments.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-TVMC Argument Parsing
-"""
-
-import argparse
-
-from tvm.driver.tvmc import TVMCException
-
-
-class TVMCSuppressedArgumentParser(argparse.ArgumentParser):
-    """
-    A silent ArgumentParser class.
-    This class is meant to be used as a helper for creating dynamic parsers in
-    TVMC. It will create a "supressed" parser based on an existing one (parent)
-    which does not include a help message, does not print a usage message (even
-    when -h or --help is passed) and does not exit on invalid choice parse
-    errors but rather throws a TVMCException so it can be handled and the
-    dynamic parser construction is not interrupted prematurely.
-    """
-
-    def __init__(self, parent, **kwargs):
-        # Don't add '-h' or '--help' options to the newly created parser. Don't print usage message.
-        # 'add_help=False' won't supress existing '-h' and '--help' options from the parser (and its
-        # subparsers) present in 'parent'. However that class is meant to be used with the main
-        # parser, which is created with `add_help=False` - the help is added only later. Hence it
-        # the newly created parser won't have help options added in its (main) root parser. The
-        # subparsers in the main parser will eventually have help activated, which is enough for its
-        # use in TVMC.
-        super().__init__(parents=[parent], add_help=False, usage=argparse.SUPPRESS, **kwargs)
-
-    def exit(self, status=0, message=None):
-        # Don't exit on error when parsing the command line.
-        # This won't catch all the errors generated when parsing tho. For instance, it won't catch
-        # errors due to missing required arguments. But this will catch "error: invalid choice",
-        # which is what it's necessary for its use in TVMC.
-        raise TVMCException()
diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py
deleted file mode 100644
index ad4f8ae6169b..000000000000
--- a/python/tvm/driver/tvmc/autotuner.py
+++ /dev/null
@@ -1,864 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""
-Provides support to auto-tuning networks using AutoTVM.
-"""
-import os.path
-import logging
-import time
-from copy import deepcopy
-from typing import Any, Optional, Dict, List, Union
-
-from urllib.parse import urlparse
-
-import tvm
-from tvm import autotvm, auto_scheduler
-from tvm.auto_scheduler.search_task import HardwareParams
-from tvm.autotvm.tuner import GATuner
-from tvm.autotvm.tuner import GridSearchTuner
-from tvm.autotvm.tuner import RandomTuner
-from tvm.autotvm.tuner import XGBTuner
-from tvm.target import Target
-
-from . import TVMCException, composite_target, frontends
-from .main import register_parser
-from .model import TVMCModel
-from .target import target_from_cli, generate_target_args, reconstruct_target_args
-from .shape_parser import parse_shape_string
-from .transform import generate_transform_args, parse_graph_transform_args, apply_graph_transforms
-
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-@register_parser
-def add_tune_parser(subparsers, _, json_params):
-    """Include parser for 'tune' subcommand"""
-
-    parser = subparsers.add_parser("tune", help="auto-tune a model")
-    parser.set_defaults(func=drive_tune)
-    parser.add_argument(
-        "--early-stopping",
-        type=int,
-        help="minimum number of trials before early stopping",
-    )
-
-    # There is some extra processing required to define the actual default value
-    # for --min-repeat-ms. This is done in `tune_model`.
-    parser.add_argument(
-        "--min-repeat-ms",
-        default=None,
-        type=int,
-        help="minimum time to run each trial, in milliseconds. "
-        "Defaults to 0 on x86 and 1000 on all other targets",
-    )
-    parser.add_argument(
-        "--model-format",
-        choices=frontends.get_frontend_names(),
-        help="specify input model format",
-    )
-    parser.add_argument(
-        "--number",
-        default=10,
-        type=int,
-        help="number of runs a single repeat is made of. "
-        "The final number of tuning executions is: "
-        "(1 + number * repeat)",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        required=True,
-        help="output file to store the tuning records for the tuning process",
-    )
-    parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.")
-    parser.add_argument(
-        "--parallel",
-        default=4,
-        type=int,
-        help="the maximum number of parallel devices to use when tuning",
-    )
-    parser.add_argument(
-        "--repeat",
-        type=int,
-        default=1,
-        help="how many times to repeat each measurement",
-    )
-    parser.add_argument(
-        "--rpc-key",
-        help="the RPC tracker key of the target device. "
-        "Required when --rpc-tracker is provided.",
-    )
-    parser.add_argument(
-        "--rpc-tracker",
-        help="hostname (required) and port (optional, defaults to 9090) of the RPC tracker, "
-        "e.g. '192.168.0.100:9999'",
-    )
-
-    generate_target_args(parser)
-    parser.add_argument(
-        "--target-host",
-        help="the host compilation target.",
-    )
-
-    parser.add_argument("--timeout", type=int, default=10, help="compilation timeout, in seconds")
-    parser.add_argument(
-        "--trials",
-        type=int,
-        default=1000,
-        help="the maximum number of tuning trials to perform",
-    )
-    parser.add_argument(
-        "--tuning-records",
-        metavar="PATH",
-        help="path to an auto-tuning log file by AutoTVM.",
-    )
-    generate_transform_args(parser)
-    parser.add_argument(
-        "--enable-autoscheduler",
-        help="enable tuning the graph through the AutoScheduler tuner",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--tasks",
-        default="all",
-        help="which tasks should be tuned, i.e. 0 0,2 3-5 all list",
-    )
-
-    auto_scheduler_group = parser.add_argument_group(
-        "AutoScheduler options",
-        "AutoScheduler options, used when --enable-autoscheduler is provided",
-    )
-
-    auto_scheduler_group.add_argument(
-        "--cache-line-bytes",
-        type=int,
-        help="the size of cache line in bytes. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--num-cores",
-        type=int,
-        help="the number of device cores. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--vector-unit-bytes",
-        type=int,
-        help="the width of vector units in bytes. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--max-shared-memory-per-block",
-        type=int,
-        help="the max shared memory per block in bytes. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--max-local-memory-per-block",
-        type=int,
-        help="the max local memory per block in bytes. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--max-threads-per-block",
-        type=int,
-        help="the max number of threads per block. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--max-vthread-extent",
-        type=int,
-        help="the max vthread extent. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--warp-size",
-        type=int,
-        help="the thread numbers of a warp. "
-        "If not specified, it will be autoset for the current machine.",
-    )
-    auto_scheduler_group.add_argument(
-        "--include-simple-tasks",
-        help="whether to extract simple tasks that do not include complicated ops",
-        action="store_true",
-    )
-    auto_scheduler_group.add_argument(
-        "--log-estimated-latency",
-        help="whether to log the estimated latency to the file after tuning a task",
-        action="store_true",
-    )
-    autotvm_group = parser.add_argument_group(
-        "AutoTVM options",
-        "AutoTVM options, used when the AutoScheduler is not enabled",
-    )
-    autotvm_group.add_argument(
-        "--tuner",
-        choices=[
-            "ga",
-            "gridsearch",
-            "random",
-            "xgb",
-            "xgb_knob",
-            "xgb_itervar",
-            "xgb_curve",
-            "xgb_rank",
-            "xgb_rank_knob",
-            "xgb_rank_itervar",
-            "xgb_rank_curve",
-            "xgb_rank_binary",
-            "xgb_rank_binary_knob",
-            "xgb_rank_binary_itervar",
-            "xgb_rank_binary_curve",
-        ],
-        default="xgb",
-        help="type of tuner to use when tuning with autotvm.",
-    )
-    # TODO (@leandron) This is a path to a physical file, but
-    #     can be improved in future to add integration with a modelzoo
-    #     or URL, for example.
-    parser.add_argument("FILE", help="path to the input model file")
-    parser.add_argument(
-        "--input-shapes",
-        help="specify non-generic shapes for model to run, format is "
-        '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"',
-        type=parse_shape_string,
-    )
-
-    for one_entry in json_params:
-        parser.set_defaults(**one_entry)
-
-
-def drive_tune(args):
-    """Invoke auto-tuning with command line arguments
-
-    Parameters
-    ----------
-    args: argparse.Namespace
-        Arguments from command line parser.
-    """
-    if not os.path.isfile(args.FILE):
-        raise TVMCException(
-            f"Input file '{args.FILE}' doesn't exist, is a broken symbolic link, or a directory."
-        )
-
-    tvmc_model = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes)
-
-    # Specify hardware parameters, although they'll only be used if autoscheduling.
-    hardware_params = auto_scheduler.HardwareParams(
-        num_cores=args.num_cores,
-        vector_unit_bytes=args.vector_unit_bytes,
-        cache_line_bytes=args.cache_line_bytes,
-        max_shared_memory_per_block=args.max_shared_memory_per_block,
-        max_local_memory_per_block=args.max_local_memory_per_block,
-        max_threads_per_block=args.max_threads_per_block,
-        max_vthread_extent=args.max_vthread_extent,
-        warp_size=args.warp_size,
-        target=args.target,
-        target_host=args.target_host,
-    )
-
-    if args.rpc_tracker:
-        parsed_url = urlparse("//%s" % args.rpc_tracker)
-        rpc_hostname = parsed_url.hostname
-        rpc_port = parsed_url.port or 9090
-        logger.info("RPC tracker hostname: %s", rpc_hostname)
-        logger.info("RPC tracker port: %s", rpc_port)
-
-        if not args.rpc_key:
-            raise TVMCException("need to provide an RPC tracker key (--rpc-key) for remote tuning")
-    else:
-        rpc_hostname = None
-        rpc_port = None
-
-    transform_args = parse_graph_transform_args(args)
-
-    tune_model(
-        tvmc_model,
-        args.target,
-        tuning_records=args.output,
-        prior_records=args.tuning_records,
-        enable_autoscheduler=args.enable_autoscheduler,
-        rpc_key=args.rpc_key,
-        hostname=rpc_hostname,
-        port=rpc_port,
-        trials=args.trials,
-        target_host=args.target_host,
-        tuner=args.tuner,
-        min_repeat_ms=args.min_repeat_ms,
-        early_stopping=args.early_stopping,
-        timeout=args.timeout,
-        repeat=args.repeat,
-        number=args.number,
-        parallel=args.parallel,
-        hardware_params=hardware_params,
-        include_simple_tasks=args.include_simple_tasks,
-        log_estimated_latency=args.log_estimated_latency,
-        additional_target_options=reconstruct_target_args(args),
-        tasks_filter=args.tasks,
-        **transform_args,
-    )
-
-
-def filter_tasks(
-    tasks: Union[List[auto_scheduler.SearchTask], List[autotvm.task.Task]],
-    expr: str,
-):
-    """Utility to filter a list of tasks (AutoTVM or AutoScheduler) based on
-    a user-supplied string expression.
-
-    Parameters
-    ----------
-    tasks: list
-        A list of extracted AutoTVM or AutoScheduler tasks.
-    expr: str
-        User-supplied expression to be used for filtering.
-    """
-    assert isinstance(expr, str), "Expected filter expression of string type"
-    assert len(expr) > 0, "Got empty filter expression"
-
-    # groups of keywords are comma-separated
-    splitted = expr.split(",")
-
-    do_list = False
-    do_filter = False
-    selected = []
-    for item in splitted:
-        if item in ["list", "help"]:
-            do_list = True
-        elif item in ["all"]:
-            selected = list(range(len(tasks)))
-        else:
-            do_filter = True
-            if "-" in item:
-                assert item.count("-") == 1, "Malformed range expression"
-                assert len(item) > 1, "Missing lhs or rhs for range expression"
-                lhs, rhs = item.split("-")[:2]
-                lhs = int(lhs) if lhs else 0
-                rhs = int(rhs) if rhs else len(tasks) - 1
-                assert 0 <= lhs < len(tasks), "Left-hand side expression out of range"
-                assert 0 <= rhs < len(tasks), "Right-hand side expression out of range"
-                selected.extend(list(range(lhs, rhs + 1)))
-            else:
-                assert isinstance(item, str)
-                idx = int(item)
-                assert 0 <= idx < len(tasks), "Task index out of range"
-                selected.append(idx)
-
-    if do_filter:
-        # remove duplicates
-        selected = list(set(selected))
-        tasks = [task for i, task in enumerate(tasks) if i in selected]
-
-    return tasks, do_list
-
-
-def gen_task_list(
-    tasks: Union[List[auto_scheduler.SearchTask], List[autotvm.task.Task]],
-    enable_autoscheduler: bool,
-):
-    """Utility for printing a list of tasks (AutoTVM or AutoScheduler)
-    to the terminal.
-
-    Parameters
-    ----------
-    tasks: list
-        A list of extracted AutoTVM or AutoScheduler tasks.
-    enable_autoscheduler: bool
-        Wether the tasks are extracted with AutoScheduler or AutoTVM.
-    """
-    ret = "Available Tasks for tuning:\n"
-
-    def _trunc_helper(text, length):
-        return text if len(text) < length else text[: length - 3] + "..."
-
-    ret += "\n".join(
-        [
-            "  {}. {}".format(
-                i, _trunc_helper("Unnamed" if len(task.desc) == 0 else task.desc, 100)
-            )
-            if enable_autoscheduler
-            else "  {}. {} (len={})".format(
-                i,
-                _trunc_helper(str(task), 100),
-                "?" if task.config_space is None else len(task.config_space),
-            )
-            for i, task in enumerate(tasks)
-        ]
-    )
-    return ret
-
-
-def tune_model(
-    tvmc_model: TVMCModel,
-    target: str,
-    tuning_records: Optional[str] = None,
-    prior_records: Optional[str] = None,
-    enable_autoscheduler: bool = False,
-    rpc_key: Optional[str] = None,
-    hostname: Optional[str] = None,
-    port: Optional[Union[int, str]] = 9090,
-    trials: int = 10000,
-    target_host: Optional[str] = None,
-    tuner: str = "xgb",
-    min_repeat_ms: Optional[int] = None,
-    early_stopping: Optional[int] = None,
-    timeout: int = 10,
-    repeat: int = 1,
-    number: int = 10,
-    parallel: int = 4,
-    hardware_params: Optional[HardwareParams] = None,
-    include_simple_tasks: bool = False,
-    log_estimated_latency: bool = False,
-    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
-    tasks_filter: str = "all",
-    desired_layout: Optional[str] = None,
-    desired_layout_ops: Optional[List[str]] = None,
-    mixed_precision: bool = False,
-    mixed_precision_ops: Optional[List[str]] = None,
-    mixed_precision_calculation_type: Optional[str] = None,
-    mixed_precision_acc_type: Optional[str] = None,
-):
-    """Use tuning to automatically optimize the functions in a model.
-
-    Parameters
-    ----------
-    tvmc_model : TVMCModel
-        The model to be optimized.
-    target : str
-        Compilation target as plain string, inline JSON or path to a JSON file.
-    tuning_records: str, optional
-        The path to a file that tuning results will be saved to. If not specified,
-        a temporary file will be used.
-    prior_records: str, optional
-        A path to previous tuning results that will be used to hot-start the tuning
-        cost model if provided.
-    enable_autoscheduler : bool, optional
-        When true, use autoscheduling rather than autotvm. This should produce
-        faster kernels for compatible model-target pairs.
-    rpc_key : str, optional
-        The RPC tracker key of the target device. Required when rpc_tracker is provided.
-    hostname : str, optional
-        The IP address of an RPC tracker, used when benchmarking remotely.
-    port : int or str, optional
-        The port of the RPC tracker to connect to. Defaults to 9090.
-    trials : int, optional
-        The number of schedules to try out for the entire model. Note that the default
-        value is chosen as a decent average for most models, but larger models may need
-        more trials to reach a good result while smaller models will converge with fewer
-        trials.
-    tuner : str, optional
-        The type of tuner to use when tuning with autotvm. Can be one of
-        "ga", "gridsearch", "random", "xgb", "xgb_knob", "xgb_itervar", "xgb_curve",
-        "xgb_rank", "xgb_rank_knob", "xgb_rank_itervar", "xgb_rank_binary", "xgb_rank_binary_knob",
-        "xgb_rank_binary_itervar" and "xgb_rank_binary_curve".
-    min_repeat_ms : int, optional
-        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
-    early_stopping : int, optional
-        When specified, stop tuning after this number of trials if results aren't improving.
-    timeout : int, optional,
-        If a kernel trial lasts longer than this duration in seconds, it will be
-        considered a failure.
-    repeat : int, optional
-        How many times each measurement should be repeated.
-    number : int, optional
-        The number of runs a single repeat is made of.
-    parallel : int, optional
-        The maximum number of parallel devices to use when tuning.
-    hardware_params : auto_scheduler.HardwareParams, optional
-        When using the autoscheduler, this object defines the configuration of the target hardware.
-    include_simple_tasks : bool, optional
-        Whether to extract simple operations or only computationally intensive ones when using
-        the autoscheduler.
-    log_estimated_latency : bool, optional
-        If using the autoscheduler, write the estimated latency at each step of tuning to file.
-    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
-        Additional target options in a dictionary to combine with initial Target arguments
-    tasks_filter : str, optional
-        Filter which tasks should be tuned or output a list of the extracted tasks.
-        Examples: 0 0,2 3-5 all list
-    desired_layout: str, optional
-        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
-        will have their layout set to this format. Tasks will then be tuned using this
-        specified layout.
-    desired_layout_ops: list[str], optional
-        The list of operators to be transformed with desired layout.
-    mixed_precision: bool
-        To enable mixed precision transformation.
-    mixed_precision_ops: list[str], optional
-        The list of operators to be converted to mixed precision.
-    mixed_precision_calculation_type: str
-        The calculation dtype to be used while mixed precision.
-    mixed_precision_acc_type: str
-        The accumulation data type to be used while mixed precision.
-
-    Returns
-    -------
-    tuning_records : str
-        The path to the produced tuning log file.
-    """
-    transform_args = parse_graph_transform_args(locals())
-    target, extra_targets = target_from_cli(target, additional_target_options)
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source
-    # model is fixed. For now, creating a clone avoids the issue.
-    mod = deepcopy(tvmc_model.mod)
-    params = tvmc_model.params
-
-    with tvm.transform.PassContext(opt_level=3):
-        if tuning_records is None:
-            tuning_records = tvmc_model.default_tuning_records_path()
-
-        for codegen_from_cli in extra_targets:
-            codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
-            partition_function = codegen["pass_pipeline"]
-            mod = partition_function(mod, params, **codegen_from_cli["opts"])
-
-        # min_repeat_ms should be:
-        # a. the value provided by the user, if any, or
-        # b. 0ms in case target is "cpu"; otherwise 1000ms
-        if min_repeat_ms is None:
-            min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
-            logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms)
-
-        if rpc_key:
-            if hostname is None or port is None:
-                raise TVMCException(
-                    "You must provide a hostname and port to connect to a remote RPC device."
-                )
-            if isinstance(port, str):
-                port = int(port)
-
-            logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port)
-
-            runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
-            runner = runner_ctor(
-                key=rpc_key,
-                host=hostname,
-                port=port,
-                number=number,
-                repeat=repeat,
-                n_parallel=parallel,
-                timeout=timeout,
-                min_repeat_ms=min_repeat_ms,
-            )
-        else:
-            logger.info("Starting localhost tuning.")
-            runner_ctor = (
-                auto_scheduler.LocalRPCMeasureContext
-                if enable_autoscheduler
-                else autotvm.LocalRunner
-            )
-            local_server = runner_ctor(
-                number=number,
-                repeat=repeat,
-                timeout=timeout,
-                min_repeat_ms=min_repeat_ms,
-            )
-
-            # For autoscheduling on some devices, we need to maintain a
-            # LocalRPCMeasureContext object.
-            if enable_autoscheduler:
-                runner = local_server.runner
-            else:
-                runner = local_server
-
-        if enable_autoscheduler:
-            tasks, weights = autoscheduler_get_tuning_tasks(
-                mod=mod,
-                params=params,
-                target=target,
-                transform_args=transform_args,
-                hardware_params=hardware_params,
-                include_simple_tasks=include_simple_tasks,
-            )
-        else:
-            tasks = autotvm_get_tuning_tasks(
-                mod=mod,
-                params=params,
-                target=target,
-                transform_args=transform_args,
-            )
-
-        # Filter extracted tasks by provided user expression
-        if tasks_filter:
-            tasks, do_list = filter_tasks(tasks, tasks_filter)
-            if do_list:
-                print(gen_task_list(tasks, enable_autoscheduler))
-                return None
-        if len(tasks) == 0:
-            logger.info("No tasks have been selected for tuning.")
-            return None
-        else:
-            logger.info("Selected %s tasks for tuning.", len(tasks))
-
-        if enable_autoscheduler:
-            # Create the autoscheduler tuning options
-            tuning_options = auto_scheduler.TuningOptions(
-                num_measure_trials=trials,
-                measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
-                runner=runner,
-                early_stopping=early_stopping,
-            )
-
-            logger.info("Autoscheduling with configuration: %s", tuning_options)
-
-            # Schedule the tasks (i.e., produce a schedule for each task)
-            schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency)
-        else:
-            # In autotvm, trials is specified per task. We can convert the per-model input
-            # provided to per-task trials by dividing by the number of tasks.
-            trials = int(max(1, trials / max(len(tasks), 1)))
-            logger.info("Autotuning with %d trials per task.", trials)
-
-            tuning_options = {
-                "tuner": tuner,
-                "trials": trials,
-                "early_stopping": early_stopping,
-                "measure_option": autotvm.measure_option(
-                    builder=autotvm.LocalBuilder(build_func="default"), runner=runner
-                ),
-                "tuning_records": prior_records,
-            }
-            logger.info("Autotuning with configuration: %s", tuning_options)
-
-            tune_tasks(tasks, tuning_records, **tuning_options)
-
-        return tuning_records
-
-
-def autotvm_get_tuning_tasks(
-    mod: tvm.IRModule,
-    params: Dict[str, tvm.nd.NDArray],
-    target: str,
-    target_host: Optional[str] = None,
-    transform_args: Optional[Dict[str, Any]] = None,
-):
-    """Get the autotvm tuning tasks for a given relay module.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The relay module from which to extract tuning tasks.
-    params : dict
-        The params for the relay module.
-    target : tvm.target.Target
-        The compilation target.
-    target_host : str, optional
-        The compilation target for the host.
-    transform_args: dict, optional
-        Graph transformation arguments that are applied to the relay module.
-
-    Returns
-    -------
-    tasks : list of autotvm.Tasks
-        list of tasks to be tuned
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    mod = apply_graph_transforms(mod, transform_args, params)
-
-    tasks = autotvm.task.extract_from_program(
-        mod["main"],
-        target=target,
-        params=params,
-    )
-
-    return tasks
-
-
-def autoscheduler_get_tuning_tasks(
-    mod: tvm.IRModule,
-    params: Dict[str, tvm.nd.NDArray],
-    target: str,
-    target_host: Optional[str] = None,
-    transform_args: Optional[Dict[str, Any]] = None,
-    hardware_params: Optional[HardwareParams] = None,
-    include_simple_tasks: bool = False,
-):
-    """Get the autoscheduler tuning tasks for a given relay module.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The relay module from which to extract tuning tasks.
-    params : dict
-        The params for the relay module.
-    target : tvm.target.Target
-        The compilation target.
-    target_host : str, optional
-        The compilation target for the host.
-    transform_args: dict, optional
-        Graph transformation arguments that are applied to the relay module.
-    hardware_params : Optional[HardwareParams]
-        Hardware parameters used for the search tasks
-
-    Returns
-    -------
-    tasks : list of autotvm.Tasks
-        list of tasks to be tuned
-    weights : List[int]
-        the weight (i.e. the number of appearance) of extracted tasks
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    mod = apply_graph_transforms(mod, transform_args, params)
-
-    # Extract the tasks
-    tasks, task_weights = auto_scheduler.extract_tasks(
-        mod["main"],
-        params,
-        target=target,
-        hardware_params=hardware_params,
-        include_simple_tasks=include_simple_tasks,
-    )
-
-    return tasks, task_weights
-
-
-def schedule_tasks(
-    tasks: List[auto_scheduler.SearchTask],
-    task_weights: List[float],
-    tuning_options: auto_scheduler.TuningOptions,
-    prior_records: Optional[str] = None,
-    log_estimated_latency: bool = False,
-):
-    """Generate the schedules for the different tasks (i.e., subgraphs) contained in the module.
-    Store the schedules in a json file that will be used later by the compiler.
-
-    Parameters
-    ----------
-    tasks : list
-        A list of auto_scheduler.SearchTask to tune.
-    task_weights : list
-        The weight (i.e. the number of appearance) of extracted tasks
-    tuning_options: auto_scheduler.TuningOptions
-        The options of tuning
-    prior_records : str, optional
-        The json file used to preload the autoscheduler
-    log_estimated_latency : bool, optional
-        If true, writes the estimated runtime of the model during each step of tuning to file.
-    """
-    if not log_estimated_latency:
-        callbacks = [auto_scheduler.task_scheduler.PrintTableInfo()]
-    else:
-        callbacks = [
-            auto_scheduler.task_scheduler.PrintTableInfo(),
-            auto_scheduler.task_scheduler.LogEstimatedLatency(("total_latency.tsv")),
-        ]
-
-    # Create the scheduler
-    tuner = auto_scheduler.TaskScheduler(
-        tasks, task_weights, load_log_file=prior_records, callbacks=callbacks
-    )
-
-    # Tune the tasks
-    tuner.tune(tuning_options)
-
-
-def tune_tasks(
-    tasks: List[autotvm.task.Task],
-    log_file: str,
-    measure_option: autotvm.measure_option,
-    tuner: str,
-    trials: int,
-    early_stopping: Optional[int] = None,
-    tuning_records: Optional[str] = None,
-):
-    """Tune a list of tasks and output the history to a log file.
-
-    Parameters
-    ----------
-    tasks : list
-        A list of autotvm.Tasks to tune.
-    log_file : str
-        A file to output the tuning history, in JSON.
-    measure_option : autotvm.measure_option
-        Options to build and run a tuning task.
-    tuner : str
-        Which tuner to use.
-    trials : int
-        The maximum number of tuning trials to perform.
-    early_stopping : int, optional
-        The minimum number of tuning trials to perform.
-        This will be equal to 'trials' if not specified.
-    tuning_records: str, optional
-        Path to the file produced by the tuning, to be used during
-        tuning.
-    """
-    if not tasks:
-        logger.warning("there were no tasks found to be tuned")
-        return
-
-    if not early_stopping:
-        early_stopping = trials
-
-    for i, tsk in enumerate(tasks):
-        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
-
-        # Create a tuner
-        if tuner == "xgb":
-            tuner_obj = XGBTuner(tsk, loss_type="reg")
-        elif tuner == "xgb_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob")
-        elif tuner == "xgb_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar")
-        elif tuner == "xgb_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve")
-        elif tuner == "xgb_rank":
-            tuner_obj = XGBTuner(tsk, loss_type="rank")
-        elif tuner == "xgb_rank_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
-        elif tuner == "xgb_rank_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar")
-        elif tuner == "xgb_rank_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve")
-        elif tuner == "xgb_rank_binary":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary")
-        elif tuner == "xgb_rank_binary_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob")
-        elif tuner == "xgb_rank_binary_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar")
-        elif tuner == "xgb_rank_binary_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve")
-        elif tuner == "ga":
-            tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == "random":
-            tuner_obj = RandomTuner(tsk)
-        elif tuner == "gridsearch":
-            tuner_obj = GridSearchTuner(tsk)
-        else:
-            raise TVMCException("invalid tuner: %s " % tuner)
-
-        # If transfer learning is being used, load the existing results
-        if tuning_records and os.path.exists(tuning_records):
-            logger.info("loading tuning records from %s", tuning_records)
-            start_time = time.time()
-            tuner_obj.load_history(autotvm.record.load_from_file(tuning_records))
-            logging.info("loaded history in %.2f sec(s)", time.time() - start_time)
-
-        tuner_obj.tune(
-            n_trial=min(trials, len(tsk.config_space)),
-            early_stopping=early_stopping,
-            measure_option=measure_option,
-            callbacks=[
-                autotvm.callback.progress_bar(min(trials, len(tsk.config_space)), prefix=prefix),
-                autotvm.callback.log_to_file(log_file),
-            ],
-        )
diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py
deleted file mode 100644
index 0dbae47294ad..000000000000
--- a/python/tvm/driver/tvmc/compiler.py
+++ /dev/null
@@ -1,743 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""
-Provides support to compile networks both AOT and JIT.
-"""
-import logging
-import os.path
-import re
-import itertools
-from copy import deepcopy
-from typing import Any, Optional, Dict, List, Union, Callable, Sequence
-from pathlib import Path
-from collections import defaultdict
-
-import tvm
-from tvm import autotvm, auto_scheduler
-from tvm import relay
-from tvm.driver.tvmc.registry import generate_registry_args, reconstruct_registry_entity
-from tvm.ir.instrument import PassInstrument, PassTimingInstrument, PassPrintingInstrument
-from tvm.ir.memory_pools import WorkspaceMemoryPools
-from tvm.target import Target
-from tvm.relay.backend import Executor, Runtime
-from tvm.relay.analysis.operations_distribution import analyze_operations_distribution
-from tvm.relay.transform.suffixes import tag_suffixes
-
-from . import composite_target, frontends, TVMCException
-from .model import TVMCModel, TVMCPackage
-from .main import register_parser
-from .target import target_from_cli, generate_target_args, reconstruct_target_args
-from .pass_config import parse_configs
-from .pass_list import parse_pass_list_str
-from .transform import generate_transform_args, parse_graph_transform_args, apply_graph_transforms
-from .shape_parser import parse_shape_string
-from .workspace_pools import generate_workspace_pools_args, workspace_pools_recombobulate
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-@register_parser
-def add_compile_parser(subparsers, _, json_params):
-    """Include parser for 'compile' subcommand"""
-
-    parser = subparsers.add_parser("compile", help="compile a model.")
-    parser.set_defaults(func=drive_compile)
-    parser.add_argument(
-        "--cross-compiler",
-        default="",
-        help="the cross compiler to generate target libraries, e.g. 'aarch64-linux-gnu-gcc'.",
-    )
-    parser.add_argument(
-        "--cross-compiler-options",
-        default="",
-        help="the cross compiler options to generate target libraries, e.g. '-mfpu=neon-vfpv4'.",
-    )
-    generate_transform_args(parser)
-    parser.add_argument(
-        "--dump-code",
-        metavar="FORMAT",
-        default="",
-        help="comma separated list of formats to export the input model, e.g. 'asm,ll,tir,relay'.",
-    )
-    parser.add_argument(
-        "--dump-offloads",
-        default="",
-        help="output a mapping of which operations of the initial Relay "
-        "will be transferred to which backend, indicating the composite "
-        "that includes those operations, "
-        "e.g. '--dump-offloads -' to dump to the console, "
-        "e.g. '--dump-offloads <path_to_file>' to dump to the file. "
-        "If not presented, no output is done. ",
-    )
-    parser.add_argument(
-        "--model-format",
-        choices=frontends.get_frontend_names(),
-        help="specify input model format.",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        default="module.tar",
-        help="output the compiled module to a specified archive. Defaults to 'module.tar'.",
-    )
-    parser.add_argument(
-        "-f",
-        "--output-format",
-        choices=["so"],
-        default="so",
-        help="output format. Use 'so' for shared object. Defaults to 'so'.",
-    )
-    parser.add_argument(
-        "--pass-config",
-        action="append",
-        metavar=("name=value"),
-        help="configurations to be used at compile time. This option can be provided multiple "
-        "times, each one to set one configuration value, "
-        "e.g. '--pass-config relay.backend.use_auto_scheduler=0', "
-        "e.g. '--pass-config tir.add_lower_pass=opt_level1,pass1,opt_level2,pass2'.",
-    )
-
-    generate_target_args(parser)
-    parser.add_argument(
-        "--tuning-records",
-        metavar="PATH",
-        default="",
-        help="path to an auto-tuning log file by AutoTVM. If not presented, "
-        "the fallback/tophub configs will be used.",
-    )
-    generate_registry_args(parser, Executor, "graph")
-    generate_registry_args(parser, Runtime, "cpp")
-
-    parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.")
-    # TODO (@leandron) This is a path to a physical file, but
-    #     can be improved in future to add integration with a modelzoo
-    #     or URL, for example.
-    parser.add_argument("FILE", help="path to the input model file.")
-    parser.add_argument(
-        "-O",
-        "--opt-level",
-        default=3,
-        type=int,
-        choices=range(0, 4),
-        metavar="[0-3]",
-        help="specify which optimization level to use. Defaults to '3'.",
-    )
-    parser.add_argument(
-        "--input-shapes",
-        help="specify non-generic shapes for model to run, format is "
-        '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]".',
-        type=parse_shape_string,
-        default=None,
-    )
-    parser.add_argument(
-        "--disabled-pass",
-        help="disable specific passes, comma-separated list of pass names.",
-        type=parse_pass_list_str,
-        default="",
-    )
-    parser.add_argument(
-        "--module-name",
-        default="default",
-        help="The output module name. Defaults to 'default'.",
-    )
-    parser.add_argument(
-        "--print-pass-times",
-        action="store_true",
-        help="print compilation time per pass",
-    )
-    parser.add_argument(
-        "--print-ir-before",
-        help="print IR before each named pass of a comma-separated list of pass names."
-        "e.g. '--print-ir-before [tir.SplitHostDevice,tir.ConvertSSA]' ",
-        default="",
-    )
-    parser.add_argument(
-        "--print-ir-after",
-        help="print IR after each named pass of a comma-separated list of pass names."
-        "e.g. '--print-ir-after [tir.SplitHostDevice,tir.ConvertSSA]' ",
-        default="",
-    )
-    for one_entry in json_params:
-        parser.set_defaults(**one_entry)
-
-    generate_workspace_pools_args(parser)
-
-
-def drive_compile(args):
-    """Invoke tvmc.compiler module with command line arguments
-
-    Parameters
-    ----------
-    args: argparse.Namespace
-        Arguments from command line parser.
-
-    Returns
-    -------
-    int
-        Zero if successfully completed
-
-    """
-
-    if not os.path.isfile(args.FILE):
-        raise TVMCException(
-            f"Input file '{args.FILE}' doesn't exist, is a broken symbolic link, or a directory."
-        )
-
-    tvmc_model = frontends.load_model(args.FILE, args.model_format, args.input_shapes)
-
-    dump_code = [x.strip() for x in args.dump_code.split(",")] if args.dump_code else None
-
-    dump_offloads = args.dump_offloads if args.dump_offloads else ""
-
-    additional_targets = reconstruct_target_args(args)
-    workspace_pools_target, extra_targets = target_from_cli(args.target, additional_targets)
-    transform_args = parse_graph_transform_args(args)
-
-    compile_model(
-        tvmc_model,
-        args.target,
-        opt_level=args.opt_level,
-        executor=reconstruct_registry_entity(args, Executor),
-        runtime=reconstruct_registry_entity(args, Runtime),
-        tuning_records=args.tuning_records,
-        package_path=args.output,
-        cross=args.cross_compiler,
-        cross_options=args.cross_compiler_options,
-        output_format=args.output_format,
-        dump_code=dump_code,
-        dump_offloads=dump_offloads,
-        target_host=None,
-        disabled_pass=args.disabled_pass,
-        pass_context_configs=args.pass_config,
-        mod_name=args.module_name,
-        additional_target_options=additional_targets,
-        workspace_pools=(
-            workspace_pools_recombobulate(args, [workspace_pools_target], extra_targets)
-        ),
-        print_pass_times=args.print_pass_times,
-        print_ir_before=args.print_ir_before,
-        print_ir_after=args.print_ir_after,
-        **transform_args,
-    )
-
-    return 0
-
-
-def compile_model(
-    tvmc_model: TVMCModel,
-    target: str,
-    opt_level: int = 3,
-    executor: Optional[Executor] = Executor("graph"),
-    runtime: Optional[Runtime] = Runtime("cpp"),
-    tuning_records: Optional[str] = None,
-    package_path: Optional[str] = None,
-    cross: Optional[Union[str, Callable]] = None,
-    cross_options: Optional[str] = None,
-    output_format: str = "so",
-    dump_code: Optional[List[str]] = None,
-    dump_offloads: str = "",
-    target_host: Optional[str] = None,
-    disabled_pass: Optional[str] = None,
-    pass_context_configs: Optional[List[str]] = None,
-    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
-    use_vm: bool = False,
-    mod_name: Optional[str] = "default",
-    workspace_pools: Optional[WorkspaceMemoryPools] = None,
-    print_pass_times: bool = False,
-    print_ir_before: Optional[List[str]] = None,
-    print_ir_after: Optional[List[str]] = None,
-    instruments: Optional[Sequence[PassInstrument]] = None,
-    desired_layout: Optional[str] = None,
-    desired_layout_ops: Optional[List[str]] = None,
-    mixed_precision: bool = False,
-    mixed_precision_ops: Optional[List[str]] = None,
-    mixed_precision_calculation_type: Optional[str] = None,
-    mixed_precision_acc_type: Optional[str] = None,
-):
-    """Compile a model from a supported framework into a TVM module.
-
-    This function takes a union of the arguments of both frontends.load_model
-    and compiler.compile_relay. The resulting TVM module can be executed using
-    the graph executor.
-
-    Parameters
-    ----------
-    tvmc_model : TVMCModel
-        The model object that should be compiled.
-    target : str
-        The target for which to compile. Can be a plain string or
-        a path.
-    opt_level : int
-        The option that controls various sorts of optimizations.
-    tuning_records : str
-        A path to tuning records produced using tvmc.tune. When provided,
-        compilation will use more optimized kernels leading to better results.
-    package_path : str, optional
-        The path to export the compiled model to. If not provided it will
-        be saved in a temporary directory.
-    cross : str or callable object, optional
-        Function that performs the actual compilation
-    cross_options : str, optional
-        Command line options to be passed to the cross compiler.
-    output_format : str
-        What format to use when saving the function library. Must be one of "so" or "tar".
-        When compiling for a remote device without a cross compiler, "tar" will likely work better.
-    dump_code : list[str], optional
-        Dump the generated code for the specified source types, on
-        the requested target. Choose from: ["asm", "ll", "tir", "relay"].
-    dump_offloads : str
-        Dump the information about the partition of input model's layers by external codegen.
-        Can be '' to not dump at all, '-' to dump to the console
-        or '<path_to_file>' to dump to the specified file.
-    target_host : str, optional
-        The target of the host machine if host-side code
-        needs to be generated.
-    disabled_pass: str, optional
-        Comma-separated list of passes which needs to be disabled
-        during compilation.
-    pass_context_configs: list[str], optional
-        List of strings containing a set of configurations to be passed to the
-        PassContext.
-    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
-        Additional target options in a dictionary to combine with initial Target arguments
-    use_vm: bool
-        Whether to use the VM to compile the model as opposed to the graph executor
-    mod_name: str, optional
-        The module name
-    workspace_pools: WorkspaceMemoryPools, optional
-        Specification of WorkspacePoolInfo objects to be used as workspace memory in the
-        compilation.
-    print_pass_times: bool
-        To enable printing a breakdown of compilation times by pass. Disabled by default.
-    print_ir_before: list[str], optional
-        To print IR before each named pass of a comma-separated list of passes.
-    print_ir_after: list[str], optional
-        To print IR after each named pass of a comma-separated list of passes.
-    instruments: Optional[Sequence[PassInstrument]]
-        The list of pass instrument implementations.
-    desired_layout: str, optional
-        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
-        will have their layout set to this format. Tasks will then be tuned using this
-        specified layout.
-    desired_layout_ops: list[str], optional
-        The list of operators to be transformed with desired layout.
-    mixed_precision: bool
-        To enable mixed precision transformation. Disabled by default.
-    mixed_precision_ops: list[str], optional
-        The list of operators to be converted to mixed precision.
-        Set to ["nn.conv2d", "nn.dense"] by default
-    mixed_precision_calculation_type: str
-        The calculation dtype to be used while mixed precision. Set to "float16" by default.
-    mixed_precision_acc_type: str
-        The accumulation data type to be used while mixed precision. Set to "float16" by default.
-
-    Returns
-    -------
-    compiled_model : TVMCPackage
-        The compiled TVMCModel ready to be run.
-
-    """
-    mod, params = tvmc_model.mod, tvmc_model.params
-
-    if dump_code is None:
-        dump_code = []
-    if not isinstance(dump_code, list):
-        dump_code = [dump_code]
-    dumps = {}
-
-    config = parse_configs(pass_context_configs)
-    if "tir" in dump_code:
-        config, dumps = add_tir_to_dumps(config, dumps)
-
-    initial_relay = None
-    if dump_offloads != "":
-        # add suffixes to the span field for calls in Relay
-        mod = tag_suffixes(mod)
-        # remember initial Relay
-        initial_relay = deepcopy(mod)
-
-    tvm_target, extra_targets = target_from_cli(target, additional_target_options)
-    tvm_target, target_host = Target.canon_target_and_host(tvm_target, target_host)
-
-    partition_functions = []
-    partition_opts = []
-    for codegen_from_cli in extra_targets:
-        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
-        partition_functions.append(codegen["pass_pipeline"])
-        partition_opts.append(codegen_from_cli["opts"])
-        if codegen["config_key"] is not None:
-            config[codegen["config_key"]] = codegen_from_cli["opts"]
-
-    if print_pass_times:
-        timing_inst = PassTimingInstrument()
-        instruments = [timing_inst] if instruments is None else [timing_inst] + instruments
-
-    if print_ir_before or print_ir_after:
-        print_ir_instr = PassPrintingInstrument(
-            print_before_pass_names=print_ir_before, print_after_pass_names=print_ir_after
-        )
-        instruments = [print_ir_instr] if instruments is None else [print_ir_instr] + instruments
-
-    with tvm.transform.PassContext(
-        opt_level=opt_level,
-        config=config,
-        disabled_pass=disabled_pass,
-        instruments=instruments,
-    ):
-        transform_args = parse_graph_transform_args(locals())
-        mod = apply_graph_transforms(mod, transform_args, params)
-
-        for partition_function, opts in zip(partition_functions, partition_opts):
-            mod = partition_function(mod, params, mod_name=mod_name, **opts)
-
-        if initial_relay:
-            # dump which operations are offloaded to which backend
-            dump_operation_offloads(mod, initial_relay, dump_offloads)
-
-        if tuning_records and os.path.exists(tuning_records):
-            logger.debug("tuning records file provided: %s", tuning_records)
-
-            use_autoscheduler = True
-            try:
-                auto_scheduler.load_records(tuning_records)
-            except tvm._ffi.base.TVMError:
-                use_autoscheduler = False
-
-            if use_autoscheduler:
-                with auto_scheduler.ApplyHistoryBest(tuning_records):
-                    config["relay.backend.use_auto_scheduler"] = True
-                    logger.debug("building relay graph with autoscheduler")
-                    graph_module = build(
-                        mod,
-                        tvm_target=tvm_target,
-                        executor=executor,
-                        runtime=runtime,
-                        params=params,
-                        use_vm=use_vm,
-                        mod_name=mod_name,
-                        workspace_pools=workspace_pools,
-                    )
-            else:
-                with autotvm.apply_history_best(tuning_records):
-                    logger.debug("building relay graph with tuning records")
-                    graph_module = build(
-                        mod,
-                        tvm_target=tvm_target,
-                        executor=executor,
-                        runtime=runtime,
-                        params=params,
-                        use_vm=use_vm,
-                        mod_name=mod_name,
-                        workspace_pools=workspace_pools,
-                    )
-        else:
-            logger.debug("building relay graph (no tuning records provided)")
-            graph_module = build(
-                mod,
-                tvm_target=tvm_target,
-                executor=executor,
-                runtime=runtime,
-                params=params,
-                use_vm=use_vm,
-                mod_name=mod_name,
-                workspace_pools=workspace_pools,
-            )
-
-        # Generate output dump files with sources
-        for source_type in dump_code:
-            if source_type == "relay":
-                dumps[source_type] = str(mod)
-            elif source_type == "tir":
-                dumps[source_type] = "\n".join(dumps[source_type])
-            else:
-                lib = graph_module.lib if use_vm else graph_module.get_lib()
-                # TODO lib.get_source call have inconsistent behavior for unsupported
-                #      formats (@leandron).
-                try:
-                    dumps[source_type] = lib.get_source(source_type)
-                except tvm.TVMError:
-                    pass
-                for smod in lib.imported_modules:
-                    try:
-                        if smod.type_key not in dumps:
-                            dumps[smod.type_key] = ""
-                        else:
-                            dumps[smod.type_key] += "\n"
-                        dumps[smod.type_key] += smod.get_source()
-                    except tvm.TVMError:
-                        print(f"Imported module {smod.type_key} doesn't support source dump")
-
-        # Create a new tvmc model package object from the graph definition.
-        package_path = tvmc_model.export_package(
-            graph_module, package_path, cross, cross_options, output_format
-        )
-
-        # Write dumps to file.
-        if dumps:
-            save_dumps(package_path, dumps)
-
-        # Print compilation times per pass
-        if print_pass_times:
-            print("Compilation time breakdown by pass:")
-            print(timing_inst.render())
-
-        return TVMCPackage(package_path)
-
-
-def build(
-    mod: tvm.IRModule,
-    tvm_target: str,
-    executor: Executor,
-    runtime: Runtime,
-    params: Dict[str, tvm.nd.NDArray],
-    use_vm: bool,
-    mod_name: str,
-    workspace_pools: Optional[WorkspaceMemoryPools],
-):
-    """
-    Builds the model with the provided executor.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The relay module corresponding to this model.
-    tvm_target : str
-        The target for which to compile. Can be a plain string or
-        a path.
-    executor : Executor
-        The graph executor to build the model if use_vm is not True
-    runtime : Runtime
-        The runtime configuration.
-    params : dict
-        A parameter dictionary for the model.
-    use_vm: bool
-        Whether to use the VM to compile the model as opposed to the graph executor
-    mod_name: str
-        The module name
-
-    """
-    if use_vm:
-        logger.debug("building with vm compile")
-        return relay.vm.compile(mod, target=tvm_target, params=params)
-    logger.debug("building with relay build")
-    return relay.build(
-        mod,
-        target=tvm_target,
-        executor=executor,
-        runtime=runtime,
-        params=params,
-        mod_name=mod_name,
-        workspace_memory_pools=workspace_pools,
-    )
-
-
-def add_tir_to_dumps(config, dumps):
-    """
-    Creates a debug pass that dumps TIR functions as a list of strings.
-    """
-    key = "tir"
-    phase = 3  # final TIR phase before codegen
-    dumps[key] = []
-
-    @tvm.tir.transform.prim_func_pass(opt_level=0)
-    def _dump_tir_pass(tir_func, _, __):
-        dumps[key].append(str(tir_func))
-        return tir_func
-
-    tir_lower_passes = config.get("tir.add_lower_pass", [])
-    tir_lower_passes.append((phase, _dump_tir_pass))
-    config["tir.add_lower_pass"] = tir_lower_passes
-
-    return config, dumps
-
-
-def save_dumps(module_name: str, dumps: Dict[str, str], dump_root: str = "."):
-    """
-    Serialize dump files to the disk.
-
-    Parameters
-    ----------
-    module_name : str
-        File name, referring to the module that generated
-        the dump contents
-    dumps : dict
-        The output contents to be saved into the files
-    dump_root : str, optional
-        Path in which dump files will be created
-    """
-
-    for dump_format in dumps:
-        dump_name = module_name + "." + dump_format
-        with open(Path(dump_root, dump_name), "w") as f:
-            f.write(dumps[dump_format])
-
-
-def dump_operation_offloads(mod: tvm.ir.IRModule, initial_mod: tvm.ir.IRModule, dump_path: str):
-    """This helper function forms a line-by-line output of the initial Relay lines,
-    indicating which operations are ported to which target,
-    and indicating the composite that includes those operations;
-    the 'generic' target refers to operations uploaded to the host, e.g
-    'target1        <-     target1.qnn_conv2d'
-    'target1        <-          %0 = qnn.conv2d(%tfl.quantize, %v_param_1, ...'
-    'target1        <-          %1 = nn.bias_add(%0, %v_param_2, axis=3);'
-    'target1        <-          %2 = qnn.requantize(%1, meta[relay.Constant]...'
-    'target2        <-     target2.reshape'
-    'target2        <-          %3 = reshape(%2, newshape=[1, 1001]);'
-    'generic        <-     %4 = nn.pad(%3, -128f, pad_width=[[0, 0], [1, 1]...'
-
-    Parameters
-    ----------
-    mod : tvm.ir.IRModule
-        The partitioned IRModule with external global functions.
-    initial_mod : tvm.ir.IRModule
-        The initial IRModule that gets generated from a relay frontend.
-    dump_path: str
-        Value of the "dump_offloads" compiler atribute.
-        Could be dash ("-") or file path or empty string for
-        printing to console, file or doing nothing respectively.
-    """
-    print_to_console = dump_path == "-"
-    save_to_file = all([dump_path != "-", dump_path != ""])
-
-    if print_to_console or save_to_file:
-        operations_distribution = analyze_operations_distribution(mod)
-
-        def annotate_f(x):
-            ret = ""
-            if isinstance(x, relay.Call):
-                # if there is no x.span.source_name.name in operations_distribution,
-                # this could mean that the span was not copied during the application of passes
-                # to the Relay, in which case we can not associate the initial Relay string
-                # with the resulting Relay call
-                source_name = x.span.source_name.name
-                suffix = tvm.relay.transform.suffixes.SUFFIX_STRING
-                result = re.search(r"(.*)(" + suffix + r")(.*)", source_name)
-                func_id = result.group(1)
-                if func_id in operations_distribution:
-                    compiler_name, op_name = operations_distribution[func_id]
-                    ret = (
-                        f", compiler_name: {compiler_name}, op_name: {op_name}, "
-                        f"func_id: {func_id}"
-                    )
-                else:
-                    ret = ", compiler_name: unknown, op_name: unknown, func_id: unknown"
-            elif isinstance(x, (relay.Tuple, relay.TupleGetItem)):
-                ret = ", compiler_name: none, op_name: none, func_id: none"
-
-            return ret
-
-        initial_relay_astext = initial_mod.astext(show_meta_data=False, annotate=annotate_f).split(
-            "\n"
-        )
-
-        # funcs_list is a list of internal composite/function IDs.
-        # funcs_list helps keep the order of lines from the initial Relay.
-        funcs_list = []
-
-        # target_statistic is a mapping of the target name to the
-        # number of initial Relay calls offloaded on the target
-        target_statistic = defaultdict(int)
-
-        # funcs_dict is a mapping of the generated analyze_operations_distribution
-        # internal composite/function IDs to a list, where:
-        # 1st element is
-        #   (1a): "generic"|"unknown"|"none"* or
-        #   (1b): specific target name, like "ethos-u" or "cmsis-nn"
-        # 2nd element is
-        #   (2a): corresponding initial Relay line for the case (1a) or
-        #   (2b): the name of the target composite functon in the other case (1b)
-        # 3rd element or subsequent ones are presented only for the case (2b)
-        # and are the initial Relay's lines included in the corresponding
-        # target composite functon
-        #
-        # *Description of what is meant by "generic"|"unknown"|"none":
-        # "generic" means that operation will be run on a host
-        # "unknown" means that unique identifier of this Relay line not found in the partitioned
-        #           Relay and therefore not present in the operations_distribution dictionary
-        # "none" means that this Relay line is not relay.Call
-        funcs_dict = {}
-
-        # Here we group together initial Relay lines from the one composite
-        counter = itertools.count()
-        for s in initial_relay_astext:
-            result = re.search(
-                r"(compiler_name: )(.*)(, op_name: )(.*)(, func_id: )((.*)(?=;)|(.*))", s
-            )
-            if result:
-                target_name = result.group(2)
-                op_name = result.group(4)
-                func_id = result.group(6)
-                if target_name != "none":
-                    target_statistic[target_name] += 1
-
-                # create an identifier for each "unknown" or "none" case to keep the lines order
-                if func_id == "unknown" or func_id == "none" or target_name == "generic":
-                    func_id = str(next(counter) * -1)
-
-                if func_id not in funcs_dict:
-                    funcs_list.append(func_id)
-                    funcs_dict[func_id] = [target_name]
-                    if target_name not in ["unknown", "generic", "none"]:
-                        funcs_dict[func_id].append(op_name)
-
-                s = re.sub(r", compiler_name: (.*)", "", s).lstrip()
-                funcs_dict[func_id].append(s)
-
-        # Here we prepare the output for printing.
-        # The output in most cases keeps the original order of the Relay lines
-        # but some lines are moved to be in the corresponding composite group
-        output = []
-        total = 0
-        output.append("Total number of operators and distribution by targets")
-        output.append("Total:")
-        for target, statistic in target_statistic.items():
-            total += statistic
-            output.append(f"{target}: {statistic}")
-        output[1] += f" {total}"
-        output[len(target_statistic) + 1] += "\n"
-
-        for func_id in funcs_list:
-            _list = funcs_dict[func_id]
-
-            if _list[0] != "none":
-                output.append(f"{_list[0]:<15}<-{' ':5}{_list[1]}")
-            else:
-                output.append(f"{' ':>22}{_list[1]}")
-
-            if _list[0] == "unknown":
-                output.append(
-                    "Warning: The above line means that some pass(es) \
-                              in Relay partitioning"
-                )
-                output.append("do not copy the span when the call is recreated")
-                output.append(
-                    "and a line from initial Relay could not be associated \
-                              with the resulting Relay"
-                )
-            for el in _list[2:]:
-                output.append(f"{_list[0]:<15}<-{' ':10}{el}")
-
-        if print_to_console:
-            print("\n" + "\n".join(output))
-        if save_to_file:
-            file_path = os.path.abspath(dump_path)
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            with open(file_path, "w") as f:
-                f.write("\n".join(output))
-                f.write("\n")
diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
deleted file mode 100644
index 68f544f06aa2..000000000000
--- a/python/tvm/driver/tvmc/composite_target.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Provides support to composite target on TVMC.
-"""
-import logging
-
-# Make sure Vitis AI codegen is registered
-import tvm.contrib.target.vitis_ai  # pylint: disable=unused-import
-
-from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
-from tvm.relay.op.contrib.bnns import partition_for_bnns
-from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai
-from tvm.relay.op.contrib.clml import partition_for_clml
-from tvm.relay.op.contrib.mrvl import partition_for_mrvl
-
-
-from tvm.driver.tvmc import TVMCException
-
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-# Global dictionary to map targets
-#
-# Options
-# -------
-# config_key : str
-#   The configuration key to be used in the PassContext (if any).
-# pass_pipeline : Callable
-#   A function to transform a Module before compilation, mainly used
-#   for partitioning for the target currently.
-REGISTERED_CODEGEN = {
-    "compute-library": {
-        "config_key": None,
-        "pass_default": False,
-        "default_target": None,
-        "pass_pipeline": partition_for_arm_compute_lib,
-    },
-    "bnns": {
-        "config_key": None,
-        "pass_default": False,
-        "default_target": None,
-        "pass_pipeline": partition_for_bnns,
-    },
-    "vitis-ai": {
-        "config_key": "relay.ext.vitis_ai.options",
-        "pass_default": False,
-        "default_target": None,
-        "pass_pipeline": partition_for_vitis_ai,
-    },
-    "clml": {
-        "config_key": None,
-        "pass_default": False,
-        "default_target": None,
-        "pass_pipeline": partition_for_clml,
-    },
-    "mrvl": {
-        "config_key": "relay.ext.mrvl.options",
-        "pass_default": True,
-        "default_target": "llvm",
-        "pass_pipeline": partition_for_mrvl,
-    },
-}
-
-
-def get_codegen_names():
-    """Return a list of all registered codegens.
-
-    Returns
-    -------
-    list of str
-        all registered targets
-    """
-    return list(REGISTERED_CODEGEN.keys())
-
-
-def get_codegen_by_target(name):
-    """Return a codegen entry by name.
-
-    Parameters
-    ----------
-    name : str
-        The name of the target for which the codegen info should be retrieved.
-
-    Returns
-    -------
-    dict
-        requested target codegen information
-    """
-    try:
-        return REGISTERED_CODEGEN[name]
-    except KeyError:
-        raise TVMCException("Composite target %s is not defined in TVMC." % name)
diff --git a/python/tvm/driver/tvmc/config_options.py b/python/tvm/driver/tvmc/config_options.py
deleted file mode 100644
index 6ba89d650ec4..000000000000
--- a/python/tvm/driver/tvmc/config_options.py
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/usr/bin/env python
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-manipulate json config file to work with TVMC
-"""
-import os
-import json
-
-from tvm._ffi import libinfo
-from tvm.driver.tvmc import TVMCException
-
-CONFIGS_JSON_DIR = None
-
-
-class ConfigsJsonNotFoundError(TVMCException):
-    """Raised when the JSON configs dirtree cannot be found."""
-
-
-def get_configs_json_dir() -> str:
-    """Find the 'configs' directory, containing the JSON files used to configure tvmc
-    with persistent argument settings.
-
-    Returns
-    -------
-    str :
-        The path to the 'configs' directory
-    """
-    global CONFIGS_JSON_DIR
-    if CONFIGS_JSON_DIR is None:
-
-        # If a non-default location for the build directory is used, e.g. set via TVM_LIBRARY_PATH
-        # we need to provide the user a way to overwrite CONFIGS_JSON_DIR as well.
-        if os.environ.get("TVM_CONFIGS_JSON_DIR", None):
-            user_config_dir = os.environ["TVM_CONFIGS_JSON_DIR"]
-            if os.path.isdir(user_config_dir):
-                CONFIGS_JSON_DIR = user_config_dir
-                return CONFIGS_JSON_DIR
-
-        candidate_paths = []
-        candidate_paths.extend(libinfo.find_lib_path())
-        # When running from source, the configs directory will be located one directory above the
-        # native libraries, so covering that case.
-        candidate_paths.extend(
-            [os.path.abspath(os.path.join(lib_path, "..")) for lib_path in libinfo.find_lib_path()]
-        )
-        candidate_paths.extend(
-            [
-                os.path.abspath(os.path.join(lib_path, "../.."))
-                for lib_path in libinfo.find_lib_path()
-            ]
-        )
-        for path in candidate_paths:
-            configs_path = os.path.join(os.path.dirname(path), "configs")
-            if os.path.isdir(configs_path):
-                CONFIGS_JSON_DIR = configs_path
-                break
-
-        else:
-            raise ConfigsJsonNotFoundError()
-
-    return CONFIGS_JSON_DIR
-
-
-def find_json_file(name, path):
-    """search for json file given file name a path
-
-    Parameters
-    ----------
-    name: string
-        the file name need to be searched
-    path: string
-        path to search at
-
-    Returns
-    -------
-    string
-        the full path to that file
-
-    """
-    match = ""
-    for root, _dirs, files in os.walk(path):
-        if name in files:
-            match = os.path.join(root, name)
-            break
-
-    return match
-
-
-def read_and_convert_json_into_dict(config_args):
-    """Read json configuration file and return a dictionary with all parameters
-
-    Parameters
-    ----------
-    args: argparse.Namespace
-        Arguments from command line parser holding the json file path.
-
-    Returns
-    -------
-    dictionary
-        dictionary with all the json arguments keys and values
-
-    """
-    try:
-        if ".json" not in config_args.config:
-            config_args.config = config_args.config.strip() + ".json"
-        if os.path.isfile(config_args.config):
-            json_config_file = config_args.config
-        else:
-            config_dir = get_configs_json_dir()
-            json_config_file = find_json_file(config_args.config, config_dir)
-        return json.load(open(json_config_file, "rb"))
-
-    except FileNotFoundError:
-        raise TVMCException(
-            f"File {config_args.config} does not exist at {config_dir} or is wrong format."
-        )
-
-
-def parse_target_from_json(one_target, command_line_list):
-    """parse the targets out of the json file struct
-
-    Parameters
-    ----------
-    one_target: dict
-        dictionary with all target's details
-    command_line_list: list
-        list to update with target parameters
-    """
-    target_kind, *sub_type = [
-        one_target[key] if key == "kind" else (key, one_target[key]) for key in one_target
-    ]
-
-    internal_dict = {}
-    if sub_type:
-        sub_target_type = sub_type[0][0]
-        target_value = sub_type[0][1]
-        internal_dict[f"target_{target_kind}_{sub_target_type}"] = target_value
-        command_line_list.append(internal_dict)
-
-    return target_kind
-
-
-def convert_config_json_to_cli(json_params):
-    """convert all configuration keys & values from dictionary to cli format
-
-    Parameters
-    ----------
-    args: dictionary
-        dictionary with all configuration keys & values.
-
-    Returns
-    -------
-    int
-        list of configuration values in cli format
-
-    """
-    command_line_list = []
-    for param_key in json_params:
-        if param_key == "targets":
-            target_list = [
-                parse_target_from_json(one_target, command_line_list)
-                for one_target in json_params[param_key]
-            ]
-
-            internal_dict = {}
-            internal_dict["target"] = ", ".join(map(str, target_list))
-            command_line_list.append(internal_dict)
-
-        elif param_key in ("executor", "runtime"):
-            for key, value in json_params[param_key].items():
-                if key == "kind":
-                    kind = f"{value}_"
-                    new_dict_key = param_key
-                else:
-                    new_dict_key = f"{param_key}_{kind}{key}"
-
-                internal_dict = {}
-                internal_dict[new_dict_key.replace("-", "_")] = value
-                command_line_list.append(internal_dict)
-
-        elif isinstance(json_params[param_key], dict):
-            internal_dict = {}
-            modify_param_key = param_key.replace("-", "_")
-            internal_dict[modify_param_key] = []
-            for key, value in json_params[param_key].items():
-                internal_dict[modify_param_key].append(f"{key}={value}")
-            command_line_list.append(internal_dict)
-
-        else:
-            internal_dict = {}
-            internal_dict[param_key.replace("-", "_")] = json_params[param_key]
-            command_line_list.append(internal_dict)
-
-    return command_line_list
diff --git a/python/tvm/driver/tvmc/fmtopt.py b/python/tvm/driver/tvmc/fmtopt.py
deleted file mode 100644
index 7f27826d77bf..000000000000
--- a/python/tvm/driver/tvmc/fmtopt.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Utils to format help text for project options.
-"""
-from textwrap import TextWrapper
-
-
-# Maximum column length for accommodating option name and its choices.
-# Help text is placed after it in a new line.
-MAX_OPTNAME_CHOICES_TEXT_COL_LEN = 80
-
-
-# Maximum column length for accommodating help text.
-# 0 turns off formatting for the help text.
-MAX_HELP_TEXT_COL_LEN = 0
-
-
-# Justification of help text placed below option name + choices text.
-HELP_TEXT_JUST = 2
-
-
-def format_option(option_text, help_text, default_text, required=True):
-    """Format option name, choices, and default text into a single help text.
-
-    Parameters
-    ----------
-    options_text: str
-        String containing the option name and option's choices formatted as:
-        optname={opt0, opt1, ...}
-
-    help_text: str
-        Help text string.
-
-    default_text: str
-        Default text string.
-
-    required: bool
-        Flag that controls if a "(required)" text mark needs to be added to the final help text to
-        inform if the option is a required one.
-
-    Returns
-    -------
-    help_text_just: str
-       Single justified help text formatted as:
-       optname={opt0, opt1, ... }
-         HELP_TEXT. "(required)" | "Defaults to 'DEFAULT'."
-
-    """
-    optname, choices_text = option_text.split("=", 1)
-
-    # Prepare optname + choices text chunck.
-
-    optname_len = len(optname)
-    wrapper = TextWrapper(width=MAX_OPTNAME_CHOICES_TEXT_COL_LEN - optname_len)
-    choices_lines = wrapper.wrap(choices_text)
-
-    # Set first choices line which merely appends to optname string.
-    # No justification is necessary for the first line since first
-    # line was wrapped based on MAX_OPTNAME_CHOICES_TEXT_COL_LEN - optname_len,
-    # i.e. considering optname_len, hence only append justified choices_lines[0] line.
-    choices_just_lines = [optname + "=" + choices_lines[0]]
-
-    # Justify the remaining lines based on first optname + '='.
-    for line in choices_lines[1:]:
-        line_len = len(line)
-        line_just = line.rjust(
-            optname_len + 1 + line_len
-        )  # add 1 to align after '{' in the line above
-        choices_just_lines.append(line_just)
-
-    choices_text_just_chunk = "\n".join(choices_just_lines)
-
-    # Prepare help text chunck.
-
-    help_text = help_text[0].lower() + help_text[1:]
-    if MAX_HELP_TEXT_COL_LEN > 0:
-        wrapper = TextWrapper(width=MAX_HELP_TEXT_COL_LEN)
-        help_text_lines = wrapper.wrap(help_text)
-    else:
-        # Don't format help text.
-        help_text_lines = [help_text]
-
-    help_text_just_lines = []
-    for line in help_text_lines:
-        line_len = len(line)
-        line_just = line.rjust(HELP_TEXT_JUST + line_len)
-        help_text_just_lines.append(line_just)
-
-    help_text_just_chunk = "\n".join(help_text_just_lines)
-
-    # An option might be required for one method but optional for another one.
-    # If the option is required for one method it means there is no default for
-    # it when used in that method, hence suppress default text in that case.
-    if default_text and not required:
-        help_text_just_chunk += " " + default_text
-
-    if required:
-        help_text_just_chunk += " (required)"
-
-    help_text_just = choices_text_just_chunk + "\n" + help_text_just_chunk
-    return help_text_just
diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py
deleted file mode 100644
index 2da548356446..000000000000
--- a/python/tvm/driver/tvmc/frontends.py
+++ /dev/null
@@ -1,478 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Provides support to parse models from different frameworks into Relay networks.
-
-Frontend classes do lazy-loading of modules on purpose, to reduce time spent on
-loading the tool.
-"""
-import logging
-import os
-import sys
-import re
-import importlib
-from abc import ABC
-from abc import abstractmethod
-from typing import Optional, List, Dict
-from pathlib import Path
-
-import numpy as np
-
-from tvm import relay
-from tvm import parser
-from tvm.driver.tvmc import TVMCException, TVMCImportError
-from tvm.driver.tvmc.model import TVMCModel
-
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-class Frontend(ABC):
-    """Abstract class for command line driver frontend.
-
-    Provide a unified way to import models (as files), and deal
-    with any required preprocessing to create a TVM module from it."""
-
-    @staticmethod
-    @abstractmethod
-    def name():
-        """Frontend name"""
-
-    @staticmethod
-    @abstractmethod
-    def suffixes():
-        """File suffixes (extensions) used by this frontend"""
-
-    @abstractmethod
-    def load(self, path, shape_dict=None, **kwargs):
-        """Load a model from a given path.
-
-        Parameters
-        ----------
-        path: str
-            Path to a file
-        shape_dict: dict, optional
-            Mapping from input names to their shapes.
-
-        Returns
-        -------
-        mod : tvm.IRModule
-            The produced relay module.
-        params : dict
-            The parameters (weights) for the relay module.
-
-        """
-
-
-def lazy_import(pkg_name, from_pkg_name=None, hide_stderr=False):
-    """Lazy import a frontend package or subpackage"""
-    try:
-        return importlib.import_module(pkg_name, package=from_pkg_name)
-    except ImportError as error:
-        raise TVMCImportError(pkg_name) from error
-    finally:
-        if hide_stderr:
-            sys.stderr = stderr
-
-
-class KerasFrontend(Frontend):
-    """Keras frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "keras"
-
-    @staticmethod
-    def suffixes():
-        return ["h5"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        # pylint: disable=C0103
-        tf = lazy_import("tensorflow")
-        keras = lazy_import("keras", from_pkg_name="tensorflow")
-
-        # tvm build currently imports keras directly instead of tensorflow.keras
-        try:
-            model = keras.models.load_model(path)
-        except ValueError as err:
-            raise TVMCException(str(err))
-
-        # There are two flavours of keras model, sequential and
-        # functional, TVM expects a functional model, so convert
-        # if required:
-        if self.is_sequential_p(model):
-            model = self.sequential_to_functional(model)
-
-        in_shapes = []
-        for layer in model._input_layers:
-            if tf.executing_eagerly():
-                in_shapes.append(tuple(dim if dim is not None else 1 for dim in layer.input.shape))
-            else:
-                in_shapes.append(
-                    tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape)
-                )
-
-        inputs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
-        input_shapes = {name: x.shape for (name, x) in zip(model.input_names, inputs)}
-        if shape_dict is not None:
-            input_shapes.update(shape_dict)
-        kwargs.setdefault("layout", "NHWC")
-        return relay.frontend.from_keras(model, input_shapes, **kwargs)
-
-    def is_sequential_p(self, model):
-        keras = lazy_import("keras", from_pkg_name="tensorflow")
-        return isinstance(model, keras.models.Sequential)
-
-    def sequential_to_functional(self, model):
-        keras = lazy_import("keras", from_pkg_name="tensorflow")
-        assert self.is_sequential_p(model)
-        input_layer = keras.layers.Input(batch_shape=model.layers[0].input_shape)
-        prev_layer = input_layer
-        for layer in model.layers:
-            prev_layer = layer(prev_layer)
-        model = keras.models.Model([input_layer], [prev_layer])
-        return model
-
-
-class OnnxFrontend(Frontend):
-    """ONNX frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "onnx"
-
-    @staticmethod
-    def suffixes():
-        return ["onnx"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        onnx = lazy_import("onnx")
-
-        # pylint: disable=E1101
-        model = onnx.load(path)
-
-        return relay.frontend.from_onnx(model, shape=shape_dict, **kwargs)
-
-
-class TensorflowFrontend(Frontend):
-    """TensorFlow frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "pb"
-
-    @staticmethod
-    def suffixes():
-        return ["pb"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        tf = lazy_import("tensorflow")
-        tf_testing = lazy_import("tvm.relay.testing.tf")
-
-        with tf.io.gfile.GFile(path, "rb") as tf_graph:
-            content = tf_graph.read()
-
-        graph_def = tf.compat.v1.GraphDef()
-        graph_def.ParseFromString(content)
-        graph_def = tf_testing.ProcessGraphDefParam(graph_def)
-
-        logger.debug("parse TensorFlow model and convert into Relay computation graph")
-        return relay.frontend.from_tensorflow(graph_def, shape=shape_dict, **kwargs)
-
-
-class TFLiteFrontend(Frontend):
-    """TFLite frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "tflite"
-
-    @staticmethod
-    def suffixes():
-        return ["tflite"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        model = lazy_import("tflite.Model")
-
-        with open(path, "rb") as tf_graph:
-            content = tf_graph.read()
-
-        # tflite.Model.Model is tflite.Model in 1.14 and 2.1.0
-        try:
-            tflite_model = model.Model.GetRootAsModel(content, 0)
-        except AttributeError:
-            tflite_model = model.GetRootAsModel(content, 0)
-
-        try:
-            version = tflite_model.Version()
-            logger.debug("tflite version %s", version)
-        except Exception:
-            raise TVMCException("input file not tflite")
-
-        if version != 3:
-            raise TVMCException("input file not tflite version 3")
-
-        logger.debug("parse TFLite model and convert into Relay computation graph")
-        mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, **kwargs)
-        return mod, params
-
-
-class PyTorchFrontend(Frontend):
-    """PyTorch frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "pytorch"
-
-    @staticmethod
-    def suffixes():
-        # Torch Script is a zip file, but can be named pth
-        return ["pth", "zip"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        torch = lazy_import("torch")
-
-        if shape_dict is None:
-            raise TVMCException("--input-shapes must be specified for %s" % self.name())
-
-        traced_model = torch.jit.load(path)
-        traced_model.eval()  # Switch to inference mode
-
-        # Convert shape dictionary to list for Pytorch frontend compatibility
-        input_shapes = list(shape_dict.items())
-
-        logger.debug("parse Torch model and convert into Relay computation graph")
-        return relay.frontend.from_pytorch(
-            traced_model, input_shapes, keep_quantized_weight=True, **kwargs
-        )
-
-
-class PaddleFrontend(Frontend):
-    """PaddlePaddle frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "paddle"
-
-    @staticmethod
-    def suffixes():
-        return ["pdmodel"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        # pylint: disable=C0415
-        import paddle
-
-        paddle.enable_static()
-        paddle.disable_signal_handler()
-
-        if not os.path.exists(path):
-            raise TVMCException("File {} is not exist.".format(path))
-        if not path.endswith(".pdmodel"):
-            raise TVMCException("Path of model file should be endwith suffixes '.pdmodel'.")
-        prefix = "".join(path.strip().split(".")[:-1])
-        params_file_path = prefix + ".pdiparams"
-        if not os.path.exists(params_file_path):
-            raise TVMCException("File {} is not exist.".format(params_file_path))
-
-        # pylint: disable=E1101
-        exe = paddle.static.Executor(paddle.CPUPlace())
-        prog, _, _ = paddle.static.load_inference_model(prefix, exe)
-
-        return relay.frontend.from_paddle(prog, shape_dict=shape_dict, **kwargs)
-
-
-class RelayFrontend(Frontend):
-    """Relay frontend for TVMC"""
-
-    @staticmethod
-    def name():
-        return "relay"
-
-    @staticmethod
-    def suffixes():
-        return ["relay"]
-
-    def load(self, path, shape_dict=None, **kwargs):
-        with open(path, "r", encoding="utf-8") as relay_text:
-            text = relay_text.read()
-        if shape_dict is None:
-            logger.warning(
-                "Specify --input-shapes to ensure that model inputs "
-                "will not be considered as constants."
-            )
-
-        def _validate_text(text):
-            """Check the provided file contents.
-            The relay.txt artifact contained in the MLF is missing the version header and
-            the metadata which is required to use meta[relay.Constant]."""
-
-            if re.compile(r".*\#\[version\.*").match(text) is None:
-                raise TVMCException(
-                    "The relay model does not include the required version information."
-                )
-            if re.compile(r".*meta\[.+\].*", re.DOTALL).match(text):
-                if "#[metadata]" not in text:
-                    raise TVMCException(
-                        "The relay model does not include the required #[metadata] section. "
-                        "Use ir_mod.astext(show_meta_data=True) to export compatible code."
-                    )
-
-        _validate_text(text)
-
-        ir_mod = parser.fromtext(text)
-
-        if shape_dict:
-            input_names = shape_dict.keys()
-        else:
-            input_names = []
-
-        def _gen_params(ir_mod, skip_names=None):
-            """Populate the all the params in the mode with ones."""
-            main_func = ir_mod["main"]
-            shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params}
-            type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params}
-            params = {}
-            for name, shape in shape_dict.items():
-                if skip_names and name in skip_names:
-                    continue
-
-                if "int" in type_dict[name]:
-                    data = np.random.randint(128, size=shape, dtype=type_dict[name])
-                else:
-                    data = np.random.uniform(-1, 1, size=shape).astype(type_dict[name])
-                params[name] = data
-            return params
-
-        params = _gen_params(ir_mod, skip_names=input_names)
-
-        return ir_mod, params
-
-
-ALL_FRONTENDS = [
-    KerasFrontend,
-    OnnxFrontend,
-    TensorflowFrontend,
-    TFLiteFrontend,
-    PyTorchFrontend,
-    PaddleFrontend,
-    RelayFrontend,
-]
-
-
-def get_frontend_names():
-    """Return the names of all supported frontends
-
-    Returns
-    -------
-    list : list of str
-        A list of frontend names as strings
-
-    """
-    return [frontend.name() for frontend in ALL_FRONTENDS]
-
-
-def get_frontend_by_name(name: str):
-    """
-    This function will try to get a frontend instance, based
-    on the name provided.
-
-    Parameters
-    ----------
-    name : str
-        the name of a given frontend
-
-    Returns
-    -------
-    frontend : tvm.driver.tvmc.Frontend
-        An instance of the frontend that matches with
-        the file extension provided in `path`.
-
-    """
-
-    for frontend in ALL_FRONTENDS:
-        if name == frontend.name():
-            return frontend()
-
-    raise TVMCException(
-        "unrecognized frontend '{0}'. Choose from: {1}".format(name, get_frontend_names())
-    )
-
-
-def guess_frontend(path: str):
-    """
-    This function will try to imply which framework is being used,
-    based on the extension of the file provided in the path parameter.
-
-    Parameters
-    ----------
-    path : str
-        The path to the model file.
-
-    Returns
-    -------
-    frontend : tvm.driver.tvmc.Frontend
-        An instance of the frontend that matches with
-        the file extension provided in `path`.
-
-    """
-
-    suffix = Path(path).suffix.lower()
-    if suffix.startswith("."):
-        suffix = suffix[1:]
-
-    for frontend in ALL_FRONTENDS:
-        if suffix in frontend.suffixes():
-            return frontend()
-
-    raise TVMCException("failed to infer the model format. Please specify --model-format")
-
-
-def load_model(
-    path: str,
-    model_format: Optional[str] = None,
-    shape_dict: Optional[Dict[str, List[int]]] = None,
-    **kwargs,
-):
-    """Load a model from a supported framework and convert it
-    into an equivalent relay representation.
-
-    Parameters
-    ----------
-    path : str
-        The path to the model file.
-    model_format : str, optional
-        The underlying framework used to create the model.
-        If not specified, this will be inferred from the file type.
-    shape_dict : dict, optional
-        Mapping from input names to their shapes.
-
-    Returns
-    -------
-    tvmc_model : TVMCModel
-        The produced model package.
-
-    """
-
-    if model_format is not None:
-        frontend = get_frontend_by_name(model_format)
-    else:
-        frontend = guess_frontend(path)
-
-    mod, params = frontend.load(path, shape_dict, **kwargs)
-
-    return TVMCModel(mod, params)
diff --git a/python/tvm/driver/tvmc/main.py b/python/tvm/driver/tvmc/main.py
deleted file mode 100644
index b5d039c75afb..000000000000
--- a/python/tvm/driver/tvmc/main.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env python
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=use-a-generator
-"""
-TVMC - TVM driver command-line interface
-"""
-import argparse
-import logging
-import sys
-
-import tvm
-
-from tvm.driver.tvmc import TVMCException, TVMCImportError
-from tvm.driver.tvmc.config_options import (
-    read_and_convert_json_into_dict,
-    convert_config_json_to_cli,
-)
-
-REGISTERED_PARSER = []
-
-
-def register_parser(make_subparser):
-    """
-    Utility function to register a subparser for tvmc.
-
-    Functions decorated with `tvm.driver.tvmc.main.register_parser` will be invoked
-    with a parameter containing the subparser instance they need to add itself to,
-    as a parser.
-
-    Example
-    -------
-
-        @register_parser
-        def _example_parser(main_subparser):
-            subparser = main_subparser.add_parser('example', help='...')
-            ...
-
-    """
-    REGISTERED_PARSER.append(make_subparser)
-    return make_subparser
-
-
-def _main(argv):
-    """TVM command line interface."""
-
-    parser = argparse.ArgumentParser(
-        prog="tvmc",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        description="TVM compiler driver",
-        epilog=__doc__,
-        # Help action will be added later, after all subparsers are created,
-        # so it doesn't interfere with the creation of the dynamic subparsers.
-        add_help=False,
-    )
-
-    parser.add_argument("--config", default="default", help="configuration json file")
-    config_arg, argv = parser.parse_known_args(argv)
-
-    json_param_dict = read_and_convert_json_into_dict(config_arg)
-    json_config_values = convert_config_json_to_cli(json_param_dict)
-
-    parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity")
-    parser.add_argument("--version", action="store_true", help="print the version and exit")
-
-    subparser = parser.add_subparsers(title="commands")
-    for make_subparser in REGISTERED_PARSER:
-        make_subparser(subparser, parser, json_config_values)
-
-    # Finally, add help for the main parser.
-    parser.add_argument("-h", "--help", action="help", help="show this help message and exit.")
-
-    args = parser.parse_args(argv)
-    args.verbose = min(args.verbose, 3)
-
-    # See the meaning of the logging levels at
-    # https://docs.python.org/3/library/logging.html#logging-levels
-    logging.basicConfig(stream=sys.stdout)
-    logging.getLogger("TVMC").setLevel(40 - args.verbose * 10)
-
-    if args.version:
-        sys.stdout.write("%s\n" % tvm.__version__)
-        return 0
-
-    if not hasattr(args, "func"):
-        # In case no valid subcommand is provided, show usage and exit
-        parser.print_help(sys.stderr)
-        return 1
-
-    try:
-        return args.func(args)
-    except TVMCImportError as err:
-        sys.stderr.write(
-            f'Package "{err}" is not installed. ' f'Hint: "pip install tlcpack[tvmc]".'
-        )
-        return 5
-    except TVMCException as err:
-        sys.stderr.write("Error: %s\n" % err)
-        return 4
-
-
-def main():
-    sys.exit(_main(sys.argv[1:]))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/driver/tvmc/model.py b/python/tvm/driver/tvmc/model.py
deleted file mode 100644
index 73cc8da71cbf..000000000000
--- a/python/tvm/driver/tvmc/model.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=consider-using-with,broad-exception-raised,consider-using-from-import
-"""
-This file contains the definition of a set of classes that wrap the outputs
-of TVMC functions to create a simpler and more intuitive API.
-
-There is one class for each required stage of a TVM workflow.
-The TVMCModel represents the result of importing a model into TVM, it
-contains the precompiled graph definition and parameters that define
-what the model does.
-
-Compiling a TVMCModel produces a TVMCPackage, which contains the generated
-artifacts that allow the model to be run on the target hardware.
-
-Running a TVMCPackage produces a TVMCResult, which contains the outputs of
-the model and the measured runtime.
-
-Examples
---------
-The following code shows a full lifecycle for a model using tvmc, first the
-model is imported from an exterior framework, in this case onnx, then it
-is tuned to find the best schedules on CPU, then compiled into a TVMCPackage,
-and finally run.
-
-.. code-block:: python
-    tvmc_model = tvmc.load("my_model.onnx")
-    tuning_records = tvmc.tune(tvmc_model, target="llvm")
-    tvmc_package = tvmc.compile(tvmc_model, target="llvm", tuning_records=tuning_records)
-    result = tvmc.run(tvmc_package, device="cpu")
-    print(result)
-"""
-import os
-import tarfile
-from typing import Optional, Union, Dict, Callable, TextIO
-import numpy as np
-
-import tvm
-import tvm.contrib.cc
-from tvm import relay
-from tvm.contrib import utils
-from tvm.driver.tvmc import TVMCException
-from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule
-from tvm.runtime.module import BenchmarkResult
-from tvm.runtime.vm import Executable
-
-
-class TVMCModel(object):
-    """Initialize a TVMC model from a relay model definition or a saved file.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule, optional
-        The relay module corresponding to this model.
-    params : dict, optional
-        A parameter dictionary for the model.
-    model_path: str, optional
-        An alternative way to load a TVMCModel, the path to a previously
-        saved model.
-    """
-
-    def __init__(
-        self,
-        mod: Optional[tvm.IRModule] = None,
-        params: Optional[Dict[str, tvm.nd.NDArray]] = None,
-        model_path: Optional[str] = None,
-    ):
-        if (mod is None or params is None) and (model_path is None):
-            raise TVMCException(
-                "Either mod and params must be provided "
-                "or a path to a previously saved TVMCModel"
-            )
-        self._tmp_dir = utils.tempdir()
-        if model_path is not None:
-            self.load(model_path)
-        else:
-            self.mod = mod
-            self.params = params if params else {}
-
-    def save(self, model_path: str):
-        """Save the TVMCModel to disk.
-
-        Note that this saves the graph representation,
-        the parameters, and the tuning records if applicable. It will not save any
-        compiled artifacts.
-
-        Parameters
-        ----------
-        model_path : str
-            A full path to save this TVMCModel to including the output file name.
-            The file will be saved as a tar file so using a ".tar" extension is advised.
-        """
-        temp = self._tmp_dir
-
-        # Save relay graph
-        relay_name = "model.json"
-        relay_path = temp.relpath(relay_name)
-        with open(relay_path, "w") as relay_file:
-            relay_file.write(tvm.ir.save_json(self.mod))
-
-        # Save params
-        params_name = "model.params"
-        params_path = temp.relpath(params_name)
-        with open(params_path, "wb") as params_file:
-            params_file.write(relay.save_param_dict(self.params))
-
-        # Create a tar file.
-        with tarfile.open(model_path, "w") as tar:
-            tar.add(relay_path, relay_name)
-            tar.add(params_path, params_name)
-            # If default tuning records exist, save them as well.
-            if os.path.exists(self.default_tuning_records_path()):
-                tar.add(self.default_tuning_records_path(), "tuning_records")
-            # Also save the compiled package if it can be found.
-            if os.path.exists(self.default_package_path()):
-                tar.add(self.default_package_path(), "model_package.tar")
-
-    def load(self, model_path: str):
-        """Load a TVMCModel from disk.
-
-        Parameters
-        ----------
-        model_path : str
-            A path to load the TVMCModel from.
-        """
-        temp = self._tmp_dir
-        t = tarfile.open(model_path)
-        t.extractall(temp.relpath("."))
-
-        # Load relay IR.
-        relay_path = temp.relpath("model.json")
-        with open(relay_path, "r") as relay_file:
-            self.mod = tvm.ir.load_json(relay_file.read())
-
-        # Load parameter dictionary.
-        params_path = temp.relpath("model.params")
-        with open(params_path, "rb") as params_file:
-            self.params = relay.load_param_dict(params_file.read())
-
-    def default_tuning_records_path(self):
-        """Get a full path for storing tuning records in this model's temporary direcotry
-
-        Note that when this path is used, the tuning records will be saved and loaded
-        when calling `save` and `load`.
-
-        Returns
-        -------
-        records_path: str
-            A path to the default location for tuning records.
-        """
-        return self._tmp_dir.relpath("tuning_records")
-
-    def default_package_path(self):
-        """Get a full path for storing a compiled package in this model's temporary direcotry
-
-        Note that when this path is used, the package will be saved and loaded
-        when calling `save` and `load`.
-
-        Returns
-        -------
-        records_path: str
-            A path to the default location for tuning records.
-        """
-        return self._tmp_dir.relpath("model_package.tar")
-
-    def export_vm_format(
-        self,
-        vm_exec: Executable,
-        package_path: Optional[str] = None,
-        lib_format: str = "so",
-    ):
-        """Save this TVMCModel compiled via vm to file.
-        Parameters
-        ----------
-        vm_exec : vm.Executable
-            The VM Executable containing compiled the compiled artifacts needed to run this model.
-        package_path : str, None
-            Where the model should be saved. Note that it will be packaged as a .tar file.
-            If not provided, the package will be saved to a generically named file in tmp.
-        lib_format : str
-            How to export the modules function library. Must be one of "so" or "tar".
-
-        Returns
-        -------
-        package_path : str
-            The path that the package was saved to.
-        """
-        lib_name = "lib." + lib_format
-        temp = self._tmp_dir
-        if package_path is None:
-            package_path = self.default_package_path()
-
-        path_lib = temp.relpath(lib_name)
-        vm_exec.mod.export_library(path_lib)
-        self.lib_path = path_lib
-        # Package up all the temp files into a tar file.
-        with tarfile.open(package_path, "w") as tar:
-            tar.add(path_lib, lib_name)
-
-        return package_path
-
-    def export_classic_format(
-        self,
-        executor_factory: GraphExecutorFactoryModule,
-        package_path: Optional[str] = None,
-        cross: Optional[Union[str, Callable]] = None,
-        cross_options: Optional[str] = None,
-        lib_format: str = "so",
-    ):
-        """Save this TVMCModel to file.
-        Parameters
-        ----------
-        executor_factory : GraphExecutorFactoryModule
-            The factory containing compiled the compiled artifacts needed to run this model.
-        package_path : str, None
-            Where the model should be saved. Note that it will be packaged as a .tar file.
-            If not provided, the package will be saved to a generically named file in tmp.
-        cross : str or callable object, optional
-            Function that performs the actual compilation.
-        cross_options : str, optional
-            Command line options to be passed to the cross compiler.
-        lib_format : str
-            How to export the modules function library. Must be one of "so" or "tar".
-
-        Returns
-        -------
-        package_path : str
-            The path that the package was saved to.
-        """
-        lib_name = "mod." + lib_format
-        graph_name = "mod.json"
-        param_name = "mod.params"
-
-        temp = self._tmp_dir
-        if package_path is None:
-            package_path = self.default_package_path()
-        path_lib = temp.relpath(lib_name)
-
-        if not cross:
-            executor_factory.get_lib().export_library(path_lib)
-        else:
-            if not cross_options:
-                executor_factory.get_lib().export_library(
-                    path_lib, fcompile=tvm.contrib.cc.cross_compiler(cross)
-                )
-            else:
-                executor_factory.get_lib().export_library(
-                    path_lib,
-                    fcompile=tvm.contrib.cc.cross_compiler(cross, options=cross_options.split(" ")),
-                )
-        self.lib_path = path_lib
-
-        with open(temp.relpath(graph_name), "w") as graph_file:
-            graph_file.write(executor_factory.get_graph_json())
-
-        with open(temp.relpath(param_name), "wb") as params_file:
-            params_file.write(relay.save_param_dict(executor_factory.get_params()))
-
-        # Package up all the temp files into a tar file.
-        with tarfile.open(package_path, "w") as tar:
-            tar.add(path_lib, lib_name)
-            tar.add(temp.relpath(graph_name), graph_name)
-            tar.add(temp.relpath(param_name), param_name)
-
-        return package_path
-
-    def export_package(
-        self,
-        executor_factory: Union[GraphExecutorFactoryModule, Executable],
-        package_path: Optional[str] = None,
-        cross: Optional[Union[str, Callable]] = None,
-        cross_options: Optional[str] = None,
-        output_format: str = "so",
-    ):
-        """Save this TVMCModel to file.
-        Parameters
-        ----------
-        executor_factory : GraphExecutorFactoryModule
-            The factory containing the compiled artifacts needed to run this model.
-        package_path : str, None
-            Where the model should be saved. Note that it will be packaged as a .tar file.
-            If not provided, the package will be saved to a generically named file in tmp.
-        cross : str or callable object, optional
-            Function that performs the actual compilation.
-        cross_options : str, optional
-            Command line options to be passed to the cross compiler.
-        output_format : str
-            How to save the modules function library. Must be one of "so" and "tar" to save
-            using the classic forma.
-
-        Returns
-        -------
-        package_path : str
-            The path that the package was saved to.
-        """
-        if output_format not in ["so", "tar"]:
-            raise TVMCException("Only 'so' and 'tar' output formats are supported.")
-
-        if isinstance(executor_factory, Executable):
-            package_path = self.export_vm_format(executor_factory, package_path, output_format)
-        elif output_format in ["so", "tar"]:
-            package_path = self.export_classic_format(
-                executor_factory, package_path, cross, cross_options, output_format
-            )
-
-        return package_path
-
-    def summary(self, file: TextIO = None):
-        """Print the IR corressponding to this model.
-
-        Arguments
-        ---------
-        file: Writable, optional
-            If specified, the summary will be written to this file.
-        """
-        print(self.mod, file=file)
-
-
-class TVMCPackage(object):
-    """Load a saved TVMCPackage from disk.
-
-    Parameters
-    ----------
-    package_path : str
-        The path to the saved TVMCPackage that will be loaded.
-
-    use_vm : bool
-        Whether the graph module was compiled with vm or not.
-    """
-
-    def __init__(self, package_path: str):
-        self._tmp_dir = utils.tempdir()
-        self.package_path = package_path
-        self.import_package(self.package_path)
-
-    def import_package(self, package_path: str):
-        """Load a TVMCPackage from a previously exported TVMCModel.
-
-        Parameters
-        ----------
-        package_path : str
-            The path to the saved TVMCPackage.
-        """
-        temp = self._tmp_dir
-        t = tarfile.open(package_path)
-        t.extractall(temp.relpath("."))
-
-        # Classic format
-        classic_lib_name_so = "mod.so"
-        classic_lib_name_tar = "mod.tar"
-
-        # VM format
-        vm_lib_name_so = "lib.so"
-        vm_lib_name_tar = "lib.tar"
-
-        if os.path.exists(temp.relpath(classic_lib_name_so)):
-            self.lib_name = classic_lib_name_so
-            self.type = "classic"
-        elif os.path.exists(temp.relpath(classic_lib_name_tar)):
-            self.lib_name = classic_lib_name_tar
-            self.type = "classic"
-        elif os.path.exists(temp.relpath(vm_lib_name_so)):
-            self.lib_name = vm_lib_name_so
-            self.type = "vm"
-        elif os.path.exists(temp.relpath(vm_lib_name_tar)):
-            self.lib_name = vm_lib_name_tar
-            self.type = "vm"
-        else:
-            raise TVMCException("Couldn't find exported library in the package.")
-
-        self.lib_path = temp.relpath(self.lib_name)
-
-        graph, params = None, None
-        self.executor_type = "vm"
-        if self.type == "classic":
-            graph = temp.relpath("mod.json")
-            params = temp.relpath("mod.params")
-            self.executor_type = "graph"
-
-        if params is not None:
-            with open(params, "rb") as param_file:
-                self.params = bytearray(param_file.read())
-        else:
-            self.params = None
-
-        if graph is not None:
-            with open(graph) as graph_file:
-                self.graph = graph_file.read()
-        else:
-            self.graph = None
-
-
-class TVMCResult(object):
-    """A class that stores the results of tvmc.run and provides helper utilities."""
-
-    def __init__(self, outputs: Dict[str, np.ndarray], times: BenchmarkResult):
-        """Create a convenience wrapper around the output of tvmc.run
-
-        Parameters
-        ----------
-        outputs : dict
-            Outputs dictionary mapping the name of the output to its numpy value.
-        times : BenchmarkResult
-            The execution times measured by the time evaluator in seconds to produce outputs.
-        """
-        self.outputs = outputs
-        self.times = times
-
-    def format_times(self):
-        """Format the mean, max, min and std of the execution times.
-
-        This has the effect of producing a small table that looks like:
-        .. code-block::
-            Execution time summary:
-            mean (ms)  median (ms) max (ms)    min (ms)    std (ms)
-            0.14310      0.14310   0.16161     0.12933    0.01004
-
-        Returns
-        -------
-        str
-            A formatted string containing the statistics.
-        """
-        return str(self.times)
-
-    def get_output(self, name: str):
-        """A helper function to grab one of the outputs by name.
-
-        Parameters
-        ----------
-        name : str
-            The name of the output to return
-
-        Returns
-        -------
-        output : np.ndarray
-            The output corresponding to name.
-        """
-        return self.outputs[name]
-
-    def save(self, output_path: str):
-        """Save the numpy outputs to disk as a .npz file.
-
-        Parameters
-        ----------
-        output_path : str
-            The path to save the numpy results to.
-        """
-        np.savez(output_path, **self.outputs)
-
-    def __str__(self):
-        stat_table = self.format_times()
-        output_keys = f"Output Names:\n {list(self.outputs.keys())}"
-        return stat_table + "\n" + output_keys
diff --git a/python/tvm/driver/tvmc/pass_config.py b/python/tvm/driver/tvmc/pass_config.py
deleted file mode 100644
index dde5b9c659d8..000000000000
--- a/python/tvm/driver/tvmc/pass_config.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-TVMC PassContext Interface
-"""
-
-import importlib
-
-import tvm
-from tvm.driver.tvmc import TVMCException
-
-
-def load_function(full_name):
-    """Dynamic loading a function by the full name.
-    Parameters
-    ----------
-    full_name: str
-        The name of a PackedFunc or a string of the form "path.to.module.func"
-        that indicates the module that can be imported.
-        You must be aware of the load order here, it first tries to find it via
-        TVM global function, if not find, try to import it by "importlib.import_module".
-    Returns
-    -------
-    func: function or PackedFunc
-        The loaded fucntion.
-    """
-    global_func = tvm.get_global_func(full_name, allow_missing=True)
-    if global_func is not None:
-        return global_func
-
-    # split full name "path.to.module.func" into two parts ["path.to.module", "func"]
-    module_name, func_name = full_name.rsplit(".", 1)
-
-    # import module and find the function
-    module = importlib.import_module(module_name)
-    if hasattr(module, func_name):
-        return getattr(module, func_name)
-
-    raise TVMCException(f"No function '{func_name}' found in module '{module_name}'.")
-
-
-def get_pass_config_value(name, value, config_type):
-    """Get a PassContext configuration value, based on its config data type.
-
-    Parameters
-    ----------
-    name: str
-        config identifier name.
-    value: str
-        value assigned to the config, provided via command line.
-    config_type: str
-        data type defined to the config, as string.
-
-    Returns
-    -------
-    parsed_value: bool, int or str
-        a representation of the input value, converted to the type
-        specified by config_type.
-    """
-
-    parsed_value = None
-
-    if config_type == "IntImm":
-        # "Bool" configurations in the PassContext are recognized as
-        # IntImm, so deal with this case here
-        mapping_values = {
-            "false": False,
-            "true": True,
-        }
-
-        if value.isdigit():
-            parsed_value = int(value)
-        else:
-            # if not an int, accept only values on the mapping table, case insensitive
-            parsed_value = mapping_values.get(value.lower(), None)
-
-        if parsed_value is None:
-            raise TVMCException(f"Invalid value '{value}' for configuration '{name}'.")
-
-    elif config_type == "runtime.String":
-        parsed_value = value
-
-    elif config_type == "Array":
-        if name == "tir.add_lower_pass":
-            pass_list = value.split(",")
-            if len(pass_list) % 2 != 0:
-                raise TVMCException(
-                    f"The configuration of '{name}' must be of the form "
-                    "'tir.add_lower_pass=opt_level1,pass1,opt_evel2,pass2'"
-                )
-
-            parsed_value = []
-            for i in range(0, len(pass_list), 2):
-                level, pass_func = pass_list[i].strip(), pass_list[i + 1].strip()
-                try:
-                    level = int(level)
-                except ValueError:
-                    raise TVMCException(f"Only integer is allow for configuration '{name}'.")
-
-                # TODO (@leeexyz) We should parse configurations of each tir Pass.
-                #     For now, we only use the defaults. Currently, There are four config nodes:
-                #     `tir.transform.LoopPartitionConfig`
-                #     `tir.transform.UnrollLoopConfig`
-                #     `tir.transform.HoistIfThenElseConfig`
-                #     `tir.transform.InjectDoubleBufferConfig`
-                # loading pass func and calling it to get the Pass
-                pass_func = load_function(pass_func)()
-                parsed_value.append((level, pass_func))
-        else:
-            raise TVMCException(f"Unsupported configuration '{name}' for '{config_type}' type.")
-
-    else:
-        # not raise here cause we alreay checked before calling this function
-        pass
-
-    return parsed_value
-
-
-def parse_configs(input_configs):
-    """Parse configuration values set via command line.
-
-    Parameters
-    ----------
-    input_configs: list of str
-        list of configurations provided via command line.
-
-    Returns
-    -------
-    pass_context_configs: dict
-        a dict containing key-value configs to be used in the PassContext.
-    """
-    if not input_configs:
-        return {}
-
-    all_configs = tvm.ir.transform.PassContext.list_configs()
-    supported_config_types = ("IntImm", "runtime.String", "Array")
-    supported_configs = [
-        name for name in all_configs.keys() if all_configs[name]["type"] in supported_config_types
-    ]
-
-    pass_context_configs = {}
-
-    for config in input_configs:
-        if not config:
-            raise TVMCException(
-                f"Invalid format for configuration '{config}', use <config>=<value>"
-            )
-
-        # Each config is expected to be provided as "name=value"
-        try:
-            name, value = config.split("=")
-            name = name.strip()
-            value = value.strip()
-        except ValueError:
-            raise TVMCException(
-                f"Invalid format for configuration '{config}', use <config>=<value>"
-            )
-
-        if name not in all_configs:
-            raise TVMCException(
-                f"Configuration '{name}' is not defined in TVM. "
-                f"These are the existing configurations: {', '.join(all_configs)}"
-            )
-
-        if name not in supported_configs:
-            raise TVMCException(
-                f"Configuration '{name}' uses a data type not supported by TVMC. "
-                f"The following configurations are supported: {', '.join(supported_configs)}"
-            )
-
-        config_type = all_configs[name]["type"]
-        parsed_value = get_pass_config_value(name, value, config_type)
-
-        if config_type == "Array" and name in pass_context_configs:
-            # merge configs if the configuration exists
-            pass_context_configs[name].extend(parsed_value)
-        else:
-            pass_context_configs[name] = parsed_value
-
-    return pass_context_configs
diff --git a/python/tvm/driver/tvmc/pass_list.py b/python/tvm/driver/tvmc/pass_list.py
deleted file mode 100644
index 09ec6aaf9102..000000000000
--- a/python/tvm/driver/tvmc/pass_list.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language
-"""
-TVMC Pass List Management
-"""
-
-import argparse
-
-import tvm
-from tvm._ffi import registry
-
-
-def parse_pass_list_str(input_string):
-    """Parse an input string for existing passes
-
-    Parameters
-    ----------
-    input_string: str
-        Possibly comma-separated string with the names of passes
-
-    Returns
-    -------
-    list: a list of existing passes.
-    """
-    _prefix = "relay._transform."
-    pass_list = input_string.split(",")
-    missing_list = [
-        p.strip()
-        for p in pass_list
-        if len(p.strip()) > 0 and tvm.get_global_func(_prefix + p.strip(), True) is None
-    ]
-    if len(missing_list) > 0:
-        available_list = [
-            n[len(_prefix) :] for n in registry.list_global_func_names() if n.startswith(_prefix)
-        ]
-        raise argparse.ArgumentTypeError(
-            "Following passes are not registered within tvm: {}. Available: {}.".format(
-                ", ".join(missing_list), ", ".join(sorted(available_list))
-            )
-        )
-    return pass_list
diff --git a/python/tvm/driver/tvmc/project.py b/python/tvm/driver/tvmc/project.py
deleted file mode 100644
index d9b22a2d6fc3..000000000000
--- a/python/tvm/driver/tvmc/project.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-TVMC Project Generation Functions
-"""
-
-import os
-import pathlib
-from collections import defaultdict
-from typing import Union
-
-from . import TVMCException
-from .fmtopt import format_option
-
-
-def get_project_options(project_info):
-    """Get all project options as returned by Project API 'server_info_query'
-    and return them in a dict indexed by the API method they belong to.
-
-
-    Parameters
-    ----------
-    project_info: dict of list
-        a dict of lists as returned by Project API 'server_info_query' among
-        which there is a list called 'project_options' containing all the
-        project options available for a given project/platform.
-
-    Returns
-    -------
-    options_by_method: dict of list
-        a dict indexed by the API method names (e.g. "generate_project",
-        "build", "flash", or "open_transport") of lists containing all the
-        options (plus associated metadata and formatted help text) that belong
-        to a method.
-
-        The metadata associated to the options include the field 'choices' and
-        'required' which are convenient for parsers.
-
-        The formatted help text field 'help_text' is a string that contains the
-        name of the option, the choices for the option, and the option's default
-        value.
-    """
-    options = project_info["project_options"]
-
-    options_by_method = defaultdict(list)
-    for opt in options:
-        # Get list of methods associated with an option based on the
-        # existance of a 'required' or 'optional' lists. API specification
-        # guarantees at least one of these lists will exist. If a list does
-        # not exist it's returned as None by the API.
-        metadata = ["required", "optional"]
-        option_methods = [(opt[md], bool(md == "required")) for md in metadata if opt[md]]
-        for methods, is_opt_required in option_methods:
-            for method in methods:
-                name = opt["name"]
-
-                # Only for boolean options set 'choices' accordingly to the
-                # option type. API returns 'choices' associated to them
-                # as None but 'choices' can be deduced from 'type' in this case.
-                if opt["type"] == "bool":
-                    opt["choices"] = ["true", "false"]
-
-                if opt["choices"]:
-                    choices = "{" + ", ".join(opt["choices"]) + "}"
-                else:
-                    choices = opt["name"].upper()
-                option_choices_text = f"{name}={choices}"
-
-                help_text = opt["help"][0].lower() + opt["help"][1:]
-
-                if opt["default"]:
-                    default_text = f"Defaults to '{opt['default']}'."
-                else:
-                    default_text = None
-
-                formatted_help_text = format_option(
-                    option_choices_text, help_text, default_text, is_opt_required
-                )
-
-                option = {
-                    "name": opt["name"],
-                    "choices": opt["choices"],
-                    "help_text": formatted_help_text,
-                    "required": is_opt_required,
-                }
-                options_by_method[method].append(option)
-
-    return options_by_method
-
-
-def get_options(options):
-    """Get option and option value from the list options returned by the parser.
-
-    Parameters
-    ----------
-    options: list of str
-        list of strings of the form "option=value" as returned by the parser.
-
-    Returns
-    -------
-    opts: dict
-        dict indexed by option names and associated values.
-    """
-
-    opts = {}
-    for option in options:
-        try:
-            k, v = option.split("=")
-            opts[k] = v
-        except ValueError:
-            raise TVMCException(f"Invalid option format: {option}. Please use OPTION=VALUE.")
-
-    return opts
-
-
-def check_options(options, valid_options):
-    """Check if an option (required or optional) is valid. i.e. in the list of valid options.
-
-    Parameters
-    ----------
-    options: dict
-        dict indexed by option name of options and options values to be checked.
-
-    valid_options: list of dict
-        list of all valid options and choices for a platform.
-
-    Returns
-    -------
-    None. Raise TVMCException if check fails, i.e. if an option is not in the list of valid options.
-
-    """
-    required_options = [opt["name"] for opt in valid_options if opt["required"]]
-    for required_option in required_options:
-        if required_option not in options:
-            raise TVMCException(
-                f"Option '{required_option}' is required but was not specified. Use --list-options "
-                "to see all required options."
-            )
-
-    remaining_options = set(options) - set(required_options)
-    optional_options = [opt["name"] for opt in valid_options if not opt["required"]]
-    for option in remaining_options:
-        if option not in optional_options:
-            raise TVMCException(
-                f"Option '{option}' is invalid. Use --list-options to see all available options."
-            )
-
-
-def check_options_choices(options, valid_options):
-    """Check if an option value is among the option's choices, when choices exist.
-
-    Parameters
-    ----------
-    options: dict
-        dict indexed by option name of options and options values to be checked.
-
-    valid_options: list of dict
-        list of all valid options and choices for a platform.
-
-    Returns
-    -------
-    None. Raise TVMCException if check fails, i.e. if an option value is not valid.
-
-    """
-    # Dict of all valid options and associated valid choices.
-    # Options with no choices are excluded from the dict.
-    valid_options_choices = {
-        opt["name"]: opt["choices"] for opt in valid_options if opt["choices"] is not None
-    }
-
-    for option in options:
-        if option in valid_options_choices:
-            if options[option] not in valid_options_choices[option]:
-                raise TVMCException(
-                    f"Choice '{options[option]}' for option '{option}' is invalid. "
-                    "Use --list-options to see all available choices for that option."
-                )
-
-
-def get_and_check_options(passed_options, valid_options):
-    """Get options and check if they are valid.  If choices exist for them, check values against it.
-
-    Parameters
-    ----------
-    passed_options: list of str
-        list of strings in the "key=value" form as captured by argparse.
-
-    valid_option: list
-        list with all options available for a given API method / project as returned by
-        get_project_options().
-
-    Returns
-    -------
-    opts: dict
-        dict indexed by option names and associated values.
-
-    Or None if passed_options is None.
-
-    """
-
-    if passed_options is None:
-        # No options to check
-        return None
-
-    # From a list of k=v strings, make a dict options[k]=v
-    opts = get_options(passed_options)
-    # Check if passed options are valid
-    check_options(opts, valid_options)
-    # Check (when a list of choices exists) if the passed values are valid
-    check_options_choices(opts, valid_options)
-
-    return opts
-
-
-def get_project_dir(project_dir: Union[pathlib.Path, str]) -> str:
-    """Get project directory path"""
-    if not os.path.isabs(project_dir):
-        return os.path.abspath(project_dir)
-    return project_dir
diff --git a/python/tvm/driver/tvmc/registry.py b/python/tvm/driver/tvmc/registry.py
deleted file mode 100644
index b76202a730a2..000000000000
--- a/python/tvm/driver/tvmc/registry.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-This file contains functions for processing registry based inputs for the TVMC CLI
-"""
-
-from tvm.driver.tvmc import TVMCException
-
-# We can't tell the type inside an Array but all current options are
-# strings so it can default to that. runtime.BoxBool is used to
-# distinguish from runtime.BoxInt.
-INTERNAL_TO_NATIVE_TYPE = {
-    "runtime.String": str,
-    "runtime.BoxBool": bool,
-    "runtime.BoxFloat": float,
-    "runtime.BoxInt": int,
-    "Array": str,
-}
-INTERNAL_TO_HELP = {
-    "runtime.String": " string",
-    "runtime.BoxBool": " bool",
-    "runtime.BoxInt": " int",
-    "runtime.BoxFloat": " float",
-    "Array": " options",
-}
-
-
-def _generate_registry_option_args(parser, registry, name):
-    target_group = parser.add_argument_group(f"{registry.flag_registry_name} {name}")
-    for option_name, option_type in registry.list_registered_options(name).items():
-        if option_type in INTERNAL_TO_NATIVE_TYPE:
-            target_group.add_argument(
-                f"--{registry.flag_registry_name}-{name}-{option_name}",
-                type=INTERNAL_TO_NATIVE_TYPE[option_type],
-                help=(
-                    f"{registry.flag_registry_name.title()} "
-                    + "{name} {option_name}{INTERNAL_TO_HELP[option_type]}"
-                ),
-            )
-
-
-def generate_registry_args(parser, registry, default=None):
-    """Walks through the given registry and generates arguments for each of the available options"""
-    parser.add_argument(
-        f"--{registry.flag_registry_name}",
-        help=f"{registry.flag_registry_name.title()} to compile the model with",
-        required=False,
-        default=default,
-    )
-    names = registry.list_registered()
-
-    for name in names:
-        _generate_registry_option_args(parser, registry, name)
-
-
-def _reconstruct_registry_options(args, registry, name):
-    options = {}
-    for option, option_type in registry.list_registered_options(name).items():
-        if option_type in INTERNAL_TO_NATIVE_TYPE:
-            var_name = f"{registry.flag_registry_name}_{name}_{option.replace('-', '_')}"
-            option_value = getattr(args, var_name)
-            if option_value is not None:
-                options[option] = option_value
-    return options
-
-
-def reconstruct_registry_entity(args, registry):
-    """Reconstructs an entity from arguments generated from a registry"""
-    possible_names = registry.list_registered()
-    name = getattr(args, registry.flag_registry_name)
-    if name is None:
-        return None
-
-    if name not in possible_names:
-        raise TVMCException(f'{registry.flag_registry_name.title()} "{name}" is not defined')
-
-    reconstructed = {
-        possible_name: _reconstruct_registry_options(args, registry, possible_name)
-        for possible_name in possible_names
-    }
-
-    for possible_name in possible_names:
-        if possible_name != name and reconstructed[possible_name]:
-            first_option = list(reconstructed[possible_name])[0]
-            raise TVMCException(
-                f"Passed --{registry.flag_registry_name}-{possible_name}-{first_option} "
-                f"but did not specify {possible_name} executor"
-            )
-
-    return registry(name, reconstructed[name])
diff --git a/python/tvm/driver/tvmc/result_utils.py b/python/tvm/driver/tvmc/result_utils.py
deleted file mode 100644
index 10d3159c8969..000000000000
--- a/python/tvm/driver/tvmc/result_utils.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-This file contains utility functions for processing the outputs
-of TVMC models. These utilities are likely to be task specific,
-overtime more will be added to support more machine learning tasks.
-
-Examples
---------
-The following code shows how one might postprocess
-the output of a classification model.
-
-.. code-block:: python
-    result = tvmc.run(tvmc_package, device="cpu")
-    top_results = result_utils.get_top_results(max_results=5)
-"""
-import numpy as np
-from .model import TVMCResult
-
-
-def get_top_results(result: TVMCResult, max_results: int):
-    """Return the top n results from the output tensor.
-
-    This function is primarily for image classification and will
-    not necessarily generalize.
-
-    Parameters
-    ----------
-    result : TVMCResult
-        The output of a TVMCModel
-    max_results : int
-        Number of results to return
-
-    Returns
-    -------
-    top_results : np.array
-        Results array of shape (2, n).
-        The first row is the indices and the second is the values.
-
-    """
-    output = np.copy(result.outputs["output_0"])
-    sorted_labels = output.argsort()[0][-max_results:][::-1]
-    output.sort()
-    sorted_values = output[0][-max_results:][::-1]
-    top_results = np.array([sorted_labels, sorted_values])
-    return top_results
diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py
deleted file mode 100644
index 4c47a56147b6..000000000000
--- a/python/tvm/driver/tvmc/runner.py
+++ /dev/null
@@ -1,578 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=consider-using-from-import
-"""
-Provides support to run compiled networks both locally and remotely.
-"""
-from contextlib import ExitStack
-import logging
-import pathlib
-from typing import Dict, Optional, Union
-from tarfile import ReadError
-import json
-
-import numpy as np
-
-import tvm
-from tvm import rpc
-from tvm.runtime import vm
-from tvm.autotvm.measure import request_remote
-from tvm.contrib import graph_executor as executor
-from tvm.contrib.debugger import debug_executor
-from tvm.runtime import profiler_vm
-from tvm.relay.param_dict import load_param_dict
-from . import TVMCException
-
-from .main import register_parser
-from .model import TVMCPackage, TVMCResult
-from .result_utils import get_top_results
-from .tracker import tracker_host_port_from_cli
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-@register_parser
-def add_run_parser(subparsers, main_parser, json_params):  # pylint: disable=unused-argument
-    """Include parser for 'run' subcommand"""
-
-    parser = subparsers.add_parser("run", help="run a compiled module", conflict_handler="resolve")
-    parser.set_defaults(func=drive_run)
-
-    # TODO --device needs to be extended and tested to support other targets,
-    #      like 'webgpu', etc (@leandron)
-    parser.add_argument(
-        "--device",
-        choices=["cpu", "cuda", "cl", "metal", "vulkan", "rocm"],
-        default="cpu",
-        help="target device to run the compiled module. Defaults to 'cpu'",
-    )
-    parser.add_argument(
-        "--fill-mode",
-        choices=["zeros", "ones", "random"],
-        default="random",
-        help="fill all input tensors with values. In case --inputs/-i is provided, "
-        "they will take precedence over --fill-mode. Any remaining inputs will be "
-        "filled using the chosen fill mode. Defaults to 'random'",
-    )
-    parser.add_argument("-i", "--inputs", help="path to the .npz input file")
-    parser.add_argument("-o", "--outputs", help="path to the .npz output file")
-    parser.add_argument(
-        "--print-time",
-        action="store_true",
-        help="record and print the execution time(s). Enabling print-time will result "
-        " in (1 + repeat * number) executions of the model.",
-    )
-    parser.add_argument(
-        "--print-top",
-        metavar="N",
-        type=int,
-        help="print the top n values and indices of the output tensor",
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="generate profiling data from the runtime execution. "
-        "Using --profile requires the Graph Executor Debug enabled on TVM. "
-        "Profiling may also have an impact on inference time, "
-        "making it take longer to be generated.",
-    )
-    parser.add_argument(
-        "--profile-options",
-        default="table,sort,aggregate,col_sums",
-        help="Additional options for profiling. Table dump is default"
-        "comma seperated string of table,csv,json,sort,aggregate,col_sums",
-    )
-    parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.")
-    parser.add_argument(
-        "--end-to-end",
-        action="store_true",
-        help="Measure data transfers as well as model execution. This can provide a "
-        "more realistic performance measurement in many cases. Requires "
-        "'--print-time' to be specified.",
-    )
-    parser.add_argument(
-        "--repeat",
-        metavar="N",
-        type=int,
-        default=1,
-        help="How many times to repeat the run. Requires '--print-time' to be "
-        "specified. Defaults to '1'",
-    )
-    parser.add_argument(
-        "--number",
-        metavar="N",
-        type=int,
-        default=1,
-        help="The number of runs to measure within each repeat. Requires "
-        "'--print-time' to be specified. Defaults to '1'",
-    )
-    parser.add_argument(
-        "--rpc-key",
-        help="the RPC tracker key of the target device.",
-    )
-    parser.add_argument(
-        "--rpc-tracker",
-        help="hostname (required) and port (optional, defaults to 9090) of the RPC tracker, "
-        "e.g. '192.168.0.100:9999'.",
-    )
-    parser.add_argument(
-        "PATH",
-        help="path to the compiled module file or to the project directory if '--device micro' "
-        "is selected.",
-    )
-
-
-def drive_run(args):
-    """Invoke runner module with command line arguments
-
-    Parameters
-    ----------
-    args: argparse.Namespace
-        Arguments from command line parser.
-    """
-
-    path = pathlib.Path(args.PATH)
-
-    try:
-        tvmc_package = TVMCPackage(package_path=path)
-    except IsADirectoryError:
-        raise TVMCException(f"File {path} must be an archive, not a directory.")
-    except FileNotFoundError:
-        raise TVMCException(f"File {path} does not exist.")
-    except ReadError:
-        raise TVMCException(f"Could not read model from archive {path}!")
-
-    rpc_hostname, rpc_port = tracker_host_port_from_cli(args.rpc_tracker)
-
-    try:
-        inputs = np.load(args.inputs) if args.inputs else {}
-    except IOError as ex:
-        raise TVMCException("Error loading inputs file: %s" % ex)
-
-    result = run_module(
-        tvmc_package,
-        args.device,
-        hostname=rpc_hostname,
-        port=rpc_port,
-        rpc_key=args.rpc_key,
-        inputs=inputs,
-        fill_mode=args.fill_mode,
-        benchmark=args.print_time,
-        repeat=args.repeat,
-        number=args.number,
-        profile=args.profile,
-        profile_options=args.profile_options,
-        end_to_end=args.end_to_end,
-    )
-
-    if args.print_time:
-        stat_table = result.format_times()
-        # print here is intentional
-        print(stat_table)
-
-    if args.print_top:
-        top_results = get_top_results(result, args.print_top)
-        # print here is intentional
-        print(top_results)
-
-    if args.outputs:
-        # Save the outputs
-        result.save(args.outputs)
-
-
-def get_input_info(graph_str: str, params: Dict[str, tvm.nd.NDArray]):
-    """Return the 'shape' and 'dtype' dictionaries for the input
-    tensors of a compiled module.
-
-    .. note::
-        We can't simply get the input tensors from a TVM graph
-        because weight tensors are treated equivalently. Therefore, to
-        find the input tensors we look at the 'arg_nodes' in the graph
-        (which are either weights or inputs) and check which ones don't
-        appear in the params (where the weights are stored). These nodes
-        are therefore inferred to be input tensors.
-
-    .. note::
-        There exists a more recent API to retrieve the input information
-        directly from the module. However, this isn't supported when using
-        with RPC due to a lack of support for Array and Map datatypes.
-        Therefore, this function exists only as a fallback when RPC is in
-        use. If RPC isn't being used, please use the more recent API.
-
-    Parameters
-    ----------
-    graph_str : str
-        JSON graph of the module serialized as a string.
-    params : dict
-        Parameter dictionary mapping name to value.
-
-    Returns
-    -------
-    shape_dict : dict
-        Shape dictionary - {input_name: tuple}.
-    dtype_dict : dict
-        dtype dictionary - {input_name: dtype}.
-    """
-
-    shape_dict = {}
-    dtype_dict = {}
-    params_dict = load_param_dict(params)
-    param_names = [k for (k, v) in params_dict.items()]
-    graph = json.loads(graph_str)
-    for node_id in graph["arg_nodes"]:
-        node = graph["nodes"][node_id]
-        # If a node is not in the params, infer it to be an input node
-        name = node["name"]
-        if name not in param_names:
-            shape_dict[name] = graph["attrs"]["shape"][1][node_id]
-            dtype_dict[name] = graph["attrs"]["dltype"][1][node_id]
-
-    return shape_dict, dtype_dict
-
-
-def generate_tensor_data(shape: tuple, dtype: str, fill_mode: str):
-    """Generate data to produce a tensor of given shape and dtype.
-
-    Random data generation depends on the dtype. For int8 types,
-    random integers in the range 0->255 are generated. For all other
-    types, random floats are generated in the range -1->1 and then
-    cast to the appropriate dtype.
-
-    This is used to quickly generate some data to input the models, as
-    a way to check that compiled module is sane for running.
-
-    Parameters
-    ----------
-    shape : tuple
-        The shape of the tensor.
-    dtype : str
-        The dtype of the tensor.
-    fill_mode : str
-        The fill-mode to use, either "zeros", "ones" or "random".
-
-    Returns
-    -------
-    tensor : np.array
-        The generated tensor as a np.array.
-    """
-    if fill_mode == "zeros":
-        tensor = np.zeros(shape=shape, dtype=dtype)
-    elif fill_mode == "ones":
-        tensor = np.ones(shape=shape, dtype=dtype)
-    elif fill_mode == "random":
-        if "int8" in dtype:
-            tensor = np.random.randint(128, size=shape, dtype=dtype)
-        else:
-            tensor = np.random.uniform(-1, 1, size=shape).astype(dtype)
-    else:
-        raise TVMCException("unknown fill-mode: {}".format(fill_mode))
-
-    return tensor
-
-
-def make_inputs_dict(
-    shape_dict: tvm.container.Map,
-    dtype_dict: tvm.container.Map,
-    inputs: Optional[Dict[str, np.ndarray]] = None,
-    fill_mode: str = "random",
-):
-    """Make the inputs dictionary for a graph.
-
-    Use data from 'inputs' where specified. For input tensors
-    where no data has been given, generate data according to the
-    chosen fill-mode.
-
-    Parameters
-    ----------
-    shape_dict : Map
-        Shape dictionary - {input_name: tuple}.
-    dtype_dict : Map
-        dtype dictionary - {input_name: dtype}.
-    inputs : dict, optional
-        A dictionary that maps input names to numpy values.
-    fill_mode : str, optional
-        The fill-mode to use when generating tensor data.
-        Can be either "zeros", "ones" or "random".
-
-    Returns
-    -------
-    inputs_dict : dict
-        Complete inputs dictionary - {input_name: np.array}.
-    """
-    logger.debug("creating inputs dict")
-
-    if inputs is None:
-        inputs = {}
-
-    # First check all the keys in inputs exist in the graph
-    for input_name in inputs:
-        if input_name not in shape_dict.keys():
-            raise TVMCException(
-                "the input tensor '{}' is not in the graph. Expected inputs: '{}'".format(
-                    input_name, list(shape_dict.keys())
-                )
-            )
-
-    # Now construct the input dict, generating tensors where no
-    # data already exists in 'inputs'
-    inputs_dict = {}
-    for input_name in shape_dict:
-        if input_name in inputs.keys():
-            logger.debug("setting input '%s' with user input data", input_name)
-            inputs_dict[input_name] = inputs[input_name]
-        else:
-            # container.ShapleTuple -> tuple
-            shape = tuple(shape_dict[input_name])
-            # container.String -> str
-            dtype = str(dtype_dict[input_name])
-
-            logger.debug(
-                "generating data for input '%s' (shape: %s, dtype: %s), using fill-mode '%s'",
-                input_name,
-                shape,
-                dtype,
-                fill_mode,
-            )
-            data = generate_tensor_data(shape, dtype, fill_mode)
-            inputs_dict[input_name] = data
-
-    return inputs_dict
-
-
-def run_module(
-    tvmc_package: TVMCPackage,
-    device: str,
-    hostname: Optional[str] = None,
-    port: Union[int, str] = 9090,
-    rpc_key: Optional[str] = None,
-    inputs: Optional[Dict[str, np.ndarray]] = None,
-    fill_mode: str = "random",
-    benchmark: bool = False,
-    repeat: int = 10,
-    number: int = 10,
-    profile: bool = False,
-    profile_options: str = "table,sort,aggregate,col_sums",
-    end_to_end: bool = False,
-):
-    """Run a compiled graph executor module locally or remotely with
-    optional input values.
-
-    If input tensors are not specified explicitly, they can be filled
-    with zeroes, ones or random data.
-
-    Parameters
-    ----------
-    tvmc_package: TVMCPackage
-        The compiled model package object that will be run.
-    device: str,
-        the device (e.g. "cpu" or "cuda") to be targeted by the RPC
-        session, local or remote).
-    hostname : str, optional
-        The hostname of the target device on which to run.
-    port : int, optional
-        The port of the target device on which to run.
-    rpc_key : str, optional
-        The tracker key of the target device. If this is set, it
-        will be assumed that remote points to a tracker.
-    inputs : dict, optional
-        A dictionary that maps input names to numpy values. If not provided,
-        inputs will be generated using the fill_mode argument.
-    fill_mode : str, optional
-        The fill-mode to use when generating data for input tensors.
-        Valid options are "zeros", "ones" and "random".
-        Defaults to "random".
-    benchmark : bool, optional
-        Whether to benchmark the execution of the module. Enabling benchmark will
-        result in (1 + repeat * number) executions of the model.
-    repeat : int, optional
-        How many times to repeat the run. Requires `benchmark` to be set to True.
-    number : int, optional
-        The number of runs to measure within each repeat.
-        Requires `benchmark` to be set to True.
-    profile : bool
-        Whether to profile the run with the debug executor.
-    profile_options : string
-        Additional options for profiling
-    end_to_end : bool
-        Whether to measure the time of memory copies as well as model
-        execution. Turning this on can provide a more realistic estimate
-        of how long running the model in production would take.
-        Requires `benchmark` to be set to True.
-
-    Returns
-    -------
-    TVMCResult
-        The results of the run, including the output data.
-    """
-    if not isinstance(tvmc_package, TVMCPackage):
-        raise TVMCException(
-            "This model doesn't seem to have been compiled yet. "
-            "Try calling tvmc.compile on the model before running it."
-        )
-
-    with ExitStack() as stack:
-
-        if hostname:
-            if isinstance(port, str):
-                port = int(port)
-            # Remote RPC
-            if rpc_key:
-                logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
-                session = request_remote(rpc_key, hostname, port, timeout=1000)
-            else:
-                logger.debug("Running on remote RPC with no key.")
-                session = rpc.connect(hostname, port)
-        elif device == "micro":
-            # Remote RPC (running on a micro target)
-            logger.debug("Running on remote RPC (micro target).")
-            try:
-                session = tvm.micro.Session(project_.transport())
-                stack.enter_context(session)
-            except:
-                raise TVMCException("Could not open a session with the micro target.")
-        else:
-            # Local
-            logger.debug("Running a local session.")
-            session = rpc.LocalSession()
-
-        session.upload(tvmc_package.lib_path)
-        lib = session.load_module(tvmc_package.lib_name)
-
-        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
-        logger.debug("Device is %s.", device)
-        if device == "cuda":
-            dev = session.cuda()
-        elif device == "cl":
-            dev = session.cl()
-        elif device == "metal":
-            dev = session.metal()
-        elif device == "vulkan":
-            dev = session.vulkan()
-        elif device == "rocm":
-            dev = session.rocm()
-        else:
-            assert device == "cpu"
-            dev = session.cpu()
-
-        if tvmc_package.type == "vm":
-            assert inputs is not None, "vm runner requires inputs to be provided as a dict"
-
-            input_tensor = {}
-            for e, i in inputs.items():
-                input_tensor[e] = tvm.nd.array(i, dev)
-
-            if profile:
-                logger.debug("Creating vm with profile enabled.")
-                exe = profiler_vm.VirtualMachineProfiler(lib, dev)
-                res = exe.profile(**input_tensor, func_name="main")
-                # This print is intentional
-                print(res)
-            else:
-                exe = vm.VirtualMachine(lib, dev)
-
-            exe_outputs = exe.invoke("main", **input_tensor)
-
-            if benchmark:
-                times = exe.benchmark(
-                    dev,
-                    **input_tensor,
-                    func_name="main",
-                    repeat=repeat,
-                    number=number,
-                    end_to_end=end_to_end,
-                )
-            else:
-                exe.run(**input_tensor)
-                times = []
-
-            # Special handling if the output only has a single value
-            if not isinstance(exe_outputs, list):
-                exe_outputs = [exe_outputs]
-
-            outputs = {}
-            for i, val in enumerate(exe_outputs):
-                output_name = "output_{}".format(i)
-                outputs[output_name] = val.numpy()
-        else:
-            # TODO(gromero): Adjust for micro targets.
-            if profile:
-                logger.debug("Creating runtime with profiling enabled.")
-                module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
-            else:
-                logger.debug("Creating runtime with profiling disabled.")
-                module = executor.create(tvmc_package.graph, lib, dev)
-
-            if tvmc_package.executor_type == "graph":
-                logger.debug("Loading params into the runtime module.")
-                module.load_params(tvmc_package.params)
-
-            logger.debug("Collecting graph input shape and type:")
-
-            if isinstance(session, tvm.rpc.client.RPCSession):
-                # RPC does not support datatypes such as Array and Map,
-                # fallback to obtaining input information from graph json.
-                shape_dict, dtype_dict = get_input_info(tvmc_package.graph, tvmc_package.params)
-            else:
-                shape_dict, dtype_dict = module.get_input_info()
-
-            logger.debug("Graph input shape: %s", shape_dict)
-            logger.debug("Graph input type: %s", dtype_dict)
-
-            inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)
-
-            logger.debug("Setting inputs to the module.")
-            module.set_input(**inputs_dict)
-
-            # Run must be called explicitly if profiling
-            if profile:
-                logger.info("Running the module with profiling enabled.")
-                report = module.profile()
-                # This print is intentional
-                if profile_options.find("table") != -1:
-                    is_sort = profile_options.find("sort") != -1
-                    is_aggr = profile_options.find("aggregate") != -1
-                    is_sum = profile_options.find("col_sums") != -1
-                    print(report.table(sort=is_sort, aggregate=is_aggr, col_sums=is_sum))
-                if profile_options.find("csv") != -1:
-                    print(report.csv())
-                if profile_options.find("json") != -1:
-                    print(report.json())
-
-            if not benchmark or device == "micro":
-                # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
-                # fixed module.benchmark() can be used instead and this if/else can
-                # be removed.
-                module.run()
-                times = []
-            else:
-                # Call the benchmarking function of the executor.
-                # Optionally measure e2e data transfers from the
-                # CPU to device memory overheads (e.g. PCIE
-                # overheads if the device is a discrete GPU).
-                if end_to_end:
-                    dev = session.cpu()
-                times = module.benchmark(dev, number=number, repeat=repeat, end_to_end=end_to_end)
-
-            logger.debug("Collecting the output tensors.")
-            num_outputs = module.get_num_outputs()
-            outputs = {}
-            for i in range(num_outputs):
-                output_name = "output_{}".format(i)
-                outputs[output_name] = module.get_output(i).numpy()
-
-        return TVMCResult(outputs, times)
diff --git a/python/tvm/driver/tvmc/shape_parser.py b/python/tvm/driver/tvmc/shape_parser.py
deleted file mode 100644
index 24b7727703d6..000000000000
--- a/python/tvm/driver/tvmc/shape_parser.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-TVMC Shape Parsing
-"""
-
-import argparse
-import re
-
-from tvm import relay
-
-
-def parse_shape_string(inputs_string):
-    """Parse an input shape dictionary string to a usable dictionary.
-
-    Parameters
-    ----------
-    inputs_string: str
-        A string of the form "input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]" that
-        indicates the desired shape for specific model inputs. Colons, forward slashes and dots
-        within input_names are supported. Spaces are supported inside of dimension arrays.
-
-    Returns
-    -------
-    shape_dict: dict
-        A dictionary mapping input names to their shape for use in relay frontend converters.
-    """
-
-    # Create a regex pattern that extracts each separate input mapping.
-    # We want to be able to handle:
-    # * Spaces inside arrays
-    # * forward slashes inside names (but not at the beginning or end)
-    # * colons inside names (but not at the beginning or end)
-    # * dots inside names
-    pattern = r"(?:\w+\/)?[:\w.]+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]"
-    input_mappings = re.findall(pattern, inputs_string)
-    if not input_mappings:
-        raise argparse.ArgumentTypeError(
-            "--input-shapes argument must be of the form "
-            '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"'
-        )
-    shape_dict = {}
-    for mapping in input_mappings:
-        # Remove whitespace.
-        mapping = mapping.replace(" ", "")
-        # Split mapping into name and shape.
-        name, shape_string = mapping.rsplit(":", 1)
-        # Convert shape string into a list of integers or Anys if negative.
-        shape = [int(x) if int(x) > 0 else relay.Any() for x in shape_string.strip("][").split(",")]
-        # Add parsed mapping to shape dictionary.
-        shape_dict[name] = shape
-
-    return shape_dict
diff --git a/python/tvm/driver/tvmc/target.py b/python/tvm/driver/tvmc/target.py
deleted file mode 100644
index 4cfaf130e4db..000000000000
--- a/python/tvm/driver/tvmc/target.py
+++ /dev/null
@@ -1,431 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-This file contains functions for processing target inputs for the TVMC CLI
-"""
-
-import os
-import logging
-import json
-import re
-
-import tvm
-from tvm.driver import tvmc
-from tvm.driver.tvmc import TVMCException
-from tvm.driver.tvmc.composite_target import get_codegen_by_target, get_codegen_names
-from tvm.ir.attrs import make_node, _ffi_api as attrs_api
-from tvm.ir.transform import PassContext
-from tvm.target import Target, TargetKind
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-# We can't tell the type inside an Array but all current options are strings so
-# it can default to that. Bool is used alongside Integer but aren't distinguished
-# between as both are represented by IntImm
-INTERNAL_TO_NATIVE_TYPE = {"runtime.String": str, "IntImm": int, "Array": str}
-INTERNAL_TO_HELP = {"runtime.String": " string", "IntImm": "", "Array": " options"}
-
-
-def _valid_target_kinds():
-    codegen_names = tvmc.composite_target.get_codegen_names()
-    return filter(lambda target: target not in codegen_names, Target.list_kinds())
-
-
-def _generate_target_kind_args(parser, kind_name):
-    target_group = parser.add_argument_group(f"target {kind_name}")
-    for target_option, target_type in TargetKind.options_from_name(kind_name).items():
-        if target_type in INTERNAL_TO_NATIVE_TYPE:
-            target_group.add_argument(
-                f"--target-{kind_name}-{target_option}",
-                type=INTERNAL_TO_NATIVE_TYPE[target_type],
-                help=f"target {kind_name} {target_option}{INTERNAL_TO_HELP[target_type]}",
-            )
-
-
-def _generate_codegen_args(parser, codegen_name):
-    codegen = get_codegen_by_target(codegen_name)
-    pass_configs = PassContext.list_configs()
-
-    if codegen["config_key"] is not None and codegen["config_key"] in pass_configs:
-        target_group = parser.add_argument_group(f"target {codegen_name}")
-        attrs = make_node(pass_configs[codegen["config_key"]]["type"])
-        fields = attrs_api.AttrsListFieldInfo(attrs)
-        for field in fields:
-            for tvm_type, python_type in INTERNAL_TO_NATIVE_TYPE.items():
-                if field.type_info.startswith(tvm_type):
-                    target_option = field.name
-                    default_value = None
-
-                    # Retrieve the default value string from attrs(field) of config node
-                    # Eg: "default=target_cpu_name"
-                    target_option_default_str = field.type_info.split("default=")[1]
-
-                    # Extract the defalut value based on the tvm type
-                    if target_option_default_str and tvm_type == "runtime.String":
-                        default_value = target_option_default_str
-                    elif target_option_default_str and tvm_type == "IntImm":
-                        # Extract the numeric value from the python Int string, Eg: T.int64(8)
-                        str_slice = target_option_default_str.split("(")[1]
-                        default_value = str_slice.split(")")[0]
-
-                    if codegen["pass_default"] is False:
-                        default_value = None
-
-                    target_group.add_argument(
-                        f"--target-{codegen_name}-{target_option}",
-                        type=python_type,
-                        help=field.description,
-                        default=default_value,
-                    )
-
-
-def generate_target_args(parser):
-    """Walks through the TargetKind registry and generates arguments for each Target's options"""
-    parser.add_argument(
-        "--target",
-        help="compilation target as plain string, inline JSON or path to a JSON file",
-        required=False,
-    )
-    for target_kind in _valid_target_kinds():
-        _generate_target_kind_args(parser, target_kind)
-    for codegen_name in get_codegen_names():
-        _generate_codegen_args(parser, codegen_name)
-
-
-def _reconstruct_target_kind_args(args, kind_name):
-    kind_options = {}
-    for target_option, target_type in TargetKind.options_from_name(kind_name).items():
-        if target_type in INTERNAL_TO_NATIVE_TYPE:
-            var_name = f"target_{kind_name.replace('-', '_')}_{target_option.replace('-', '_')}"
-            option_value = getattr(args, var_name)
-            if option_value is not None:
-                kind_options[target_option] = getattr(args, var_name)
-    return kind_options
-
-
-def _reconstruct_codegen_args(args, codegen_name):
-    codegen = get_codegen_by_target(codegen_name)
-    pass_configs = PassContext.list_configs()
-    codegen_options = {}
-    default_tgt = codegen["default_target"]
-
-    # Do not fetch codegen options, if the default target alone is choosen by user
-    if codegen_name not in args.target and default_tgt is not None and default_tgt in args.target:
-        return codegen_options
-
-    if codegen["config_key"] is not None and codegen["config_key"] in pass_configs:
-        attrs = make_node(pass_configs[codegen["config_key"]]["type"])
-        fields = attrs_api.AttrsListFieldInfo(attrs)
-        for field in fields:
-            for tvm_type in INTERNAL_TO_NATIVE_TYPE:
-                if field.type_info.startswith(tvm_type):
-                    target_option = field.name
-                    var_name = (
-                        f"target_{codegen_name.replace('-', '_')}_{target_option.replace('-', '_')}"
-                    )
-                    option_value = getattr(args, var_name)
-                    if option_value is not None:
-                        codegen_options[target_option] = option_value
-    return codegen_options
-
-
-def reconstruct_target_args(args):
-    """Reconstructs the target options from the arguments"""
-    reconstructed = {}
-    for target_kind in _valid_target_kinds():
-        kind_options = _reconstruct_target_kind_args(args, target_kind)
-        if kind_options:
-            reconstructed[target_kind] = kind_options
-
-    for codegen_name in get_codegen_names():
-        codegen_options = _reconstruct_codegen_args(args, codegen_name)
-        if codegen_options:
-            reconstructed[codegen_name] = codegen_options
-    return reconstructed
-
-
-def validate_targets(parse_targets, additional_target_options=None):
-    """
-    Apply a series of validations in the targets provided via CLI.
-    """
-    tvm_target_kinds = tvm.target.Target.list_kinds()
-    targets = [t["name"] for t in parse_targets]
-
-    if len(targets) > len(set(targets)):
-        raise TVMCException("Duplicate target definitions are not allowed")
-
-    if targets[-1] not in tvm_target_kinds:
-        tvm_target_names = ", ".join(tvm_target_kinds)
-        raise TVMCException(
-            f"The last target needs to be a TVM target. Choices: {tvm_target_names}"
-        )
-
-    tvm_targets = [t for t in targets if t in _valid_target_kinds()]
-    if len(tvm_targets) > 2:
-        verbose_tvm_targets = ", ".join(tvm_targets)
-        raise TVMCException(
-            "Only two of the following targets can be used at a time. "
-            f"Found: {verbose_tvm_targets}."
-        )
-
-    if additional_target_options is not None:
-        for target_name in additional_target_options:
-            if not any([target for target in parse_targets if target["name"] == target_name]):
-                first_option = list(additional_target_options[target_name].keys())[0]
-                raise TVMCException(
-                    f"Passed --target-{target_name}-{first_option}"
-                    f" but did not specify {target_name} target"
-                )
-
-
-def tokenize_target(target):
-    """
-    Extract a list of tokens from a target specification text.
-
-    It covers some corner-cases that are not covered by the built-in
-    module 'shlex', such as the use of "+" as a punctuation character.
-
-
-    Example
-    -------
-
-    For the input `foo -op1=v1 -op2="v ,2", bar -op3=v-4` we
-    should obtain:
-
-        ["foo", "-op1=v1", "-op2="v ,2"", ",", "bar", "-op3=v-4"]
-
-    Parameters
-    ----------
-    target : str
-        Target options sent via CLI arguments
-
-    Returns
-    -------
-    list of str
-        a list of parsed tokens extracted from the target string
-    """
-
-    # Regex to tokenize the "--target" value. It is split into five parts
-    # to match with:
-    #  1. target and option names e.g. llvm, -mattr=, -mcpu=
-    #  2. option values, all together, without quotes e.g. -mattr=+foo,+opt
-    #  3. option values, when single quotes are used e.g. -mattr='+foo, +opt'
-    #  4. option values, when double quotes are used e.g. -mattr="+foo ,+opt"
-    #  5. commas that separate different targets e.g. "my-target, llvm"
-    target_pattern = (
-        r"(\-{0,2}[\w\-]+\=?"
-        r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*"
-        r"|[\'][\w\+\-,\s\.]+[\']"
-        r"|[\"][\w\+\-,\s\.]+[\"])*"
-        r"|,)"
-    )
-
-    return re.findall(target_pattern, target)
-
-
-def parse_target(target):
-    """
-    Parse a plain string of targets provided via a command-line
-    argument.
-
-    To send more than one codegen, a comma-separated list
-    is expected. Options start with -<option_name>=<value>.
-
-    We use python standard library 'shlex' to parse the argument in
-    a POSIX compatible way, so that if options are defined as
-    strings with spaces or commas, for example, this is considered
-    and parsed accordingly.
-
-
-    Example
-    -------
-
-    For the input `--target="foo -op1=v1 -op2="v ,2", bar -op3=v-4"` we
-    should obtain:
-
-      [
-        {
-            name: "foo",
-            opts: {"op1":"v1", "op2":"v ,2"},
-            raw: 'foo -op1=v1 -op2="v ,2"'
-        },
-        {
-            name: "bar",
-            opts: {"op3":"v-4"},
-            raw: 'bar -op3=v-4'
-        }
-      ]
-
-    Parameters
-    ----------
-    target : str
-        Target options sent via CLI arguments
-
-    Returns
-    -------
-    codegens : list of dict
-        This list preserves the order in which codegens were
-        provided via command line. Each Dict contains three keys:
-        'name', containing the name of the codegen; 'opts' containing
-        a key-value for all options passed via CLI; 'raw',
-        containing the plain string for this codegen
-    """
-    codegen_names = tvmc.composite_target.get_codegen_names()
-    codegens = []
-
-    tvm_target_kinds = tvm.target.Target.list_kinds()
-    parsed_tokens = tokenize_target(target)
-
-    split_codegens = []
-    current_codegen = []
-    split_codegens.append(current_codegen)
-    for token in parsed_tokens:
-        # every time there is a comma separating
-        # two codegen definitions, prepare for
-        # a new codegen
-        if token == ",":
-            current_codegen = []
-            split_codegens.append(current_codegen)
-        else:
-            # collect a new token for the current
-            # codegen being parsed
-            current_codegen.append(token)
-
-    # at this point we have a list of lists,
-    # each item on the first list is a codegen definition
-    # in the comma-separated values
-    for codegen_def in split_codegens:
-        # the first is expected to be the name
-        name = codegen_def[0]
-        is_tvm_target = name in tvm_target_kinds and name not in codegen_names
-        raw_target = " ".join(codegen_def)
-        all_opts = codegen_def[1:] if len(codegen_def) > 1 else []
-        opts = {}
-        for opt in all_opts:
-            try:
-                # deal with -- prefixed flags
-                if opt.startswith("--"):
-                    opt_name = opt[2:]
-                    opt_value = True
-                else:
-                    opt = opt[1:] if opt.startswith("-") else opt
-                    opt_name, opt_value = opt.split("=", maxsplit=1)
-
-                    # remove quotes from the value: quotes are only parsed if they match,
-                    # so it is safe to assume that if the string starts with quote, it ends
-                    # with quote.
-                    opt_value = opt_value[1:-1] if opt_value[0] in ('"', "'") else opt_value
-            except ValueError:
-                raise ValueError(f"Error when parsing '{opt}'")
-
-            opts[opt_name] = opt_value
-
-        codegens.append(
-            {"name": name, "opts": opts, "raw": raw_target, "is_tvm_target": is_tvm_target}
-        )
-
-    return codegens
-
-
-def is_inline_json(target):
-    try:
-        json.loads(target)
-        return True
-    except json.decoder.JSONDecodeError:
-        return False
-
-
-def _combine_target_options(target, additional_target_options=None):
-    if additional_target_options is None:
-        return target
-    if target["name"] in additional_target_options:
-        target["opts"].update(additional_target_options[target["name"]])
-    return target
-
-
-def _recombobulate_target(target):
-    name = target["name"]
-    opts = " ".join([f"-{key}={value}" for key, value in target["opts"].items()])
-    return f"{name} {opts}"
-
-
-def target_from_cli(target, additional_target_options=None):
-    """
-    Create a tvm.target.Target instance from a
-    command line interface (CLI) string.
-
-    Parameters
-    ----------
-    target : str
-        compilation target as plain string,
-        inline JSON or path to a JSON file
-
-    additional_target_options: Optional[Dict[str, Dict[str,str]]]
-        dictionary of additional target options to be
-        combined with parsed targets
-
-    Returns
-    -------
-    tvm.target.Target
-        an instance of target device information
-    extra_targets : list of dict
-        This list preserves the order in which extra targets were
-        provided via command line. Each Dict contains three keys:
-        'name', containing the name of the codegen; 'opts' containing
-        a key-value for all options passed via CLI; 'raw',
-        containing the plain string for this codegen
-    """
-    extra_targets = []
-
-    if os.path.isfile(target):
-        with open(target) as target_file:
-            logger.debug("target input is a path: %s", target)
-            target = "".join(target_file.readlines())
-    elif is_inline_json(target):
-        logger.debug("target input is inline JSON: %s", target)
-    else:
-        logger.debug("target input is plain text: %s", target)
-        try:
-            parsed_targets = parse_target(target)
-        except ValueError as error:
-            raise TVMCException(f"Error parsing target string '{target}'.\nThe error was: {error}")
-
-        validate_targets(parsed_targets, additional_target_options)
-        tvm_targets = [
-            _combine_target_options(t, additional_target_options)
-            for t in parsed_targets
-            if t["is_tvm_target"]
-        ]
-
-        # Validated target strings have 1 or 2 tvm targets, otherwise
-        # `validate_targets` above will fail.
-        if len(tvm_targets) == 1:
-            target = _recombobulate_target(tvm_targets[0])
-            target_host = None
-        else:
-            assert len(tvm_targets) == 2
-            target = _recombobulate_target(tvm_targets[0])
-            target_host = _recombobulate_target(tvm_targets[1])
-
-        extra_targets = [
-            _combine_target_options(t, additional_target_options)
-            for t in parsed_targets
-            if not t["is_tvm_target"]
-        ]
-
-    return tvm.target.Target(target, host=target_host), extra_targets
diff --git a/python/tvm/driver/tvmc/tracker.py b/python/tvm/driver/tvmc/tracker.py
deleted file mode 100644
index 65fda42ac541..000000000000
--- a/python/tvm/driver/tvmc/tracker.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language
-"""
-TVMC Remote Tracker
-"""
-
-import logging
-from urllib.parse import urlparse
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-def tracker_host_port_from_cli(rpc_tracker_str):
-    """Extract hostname and (optional) port from strings
-    like "1.2.3.4:9090" or "4.3.2.1".
-
-    Used as a helper function to cover --rpc-tracker
-    command line argument, in different subcommands.
-
-    Parameters
-    ----------
-    rpc_tracker_str : str
-        hostname (or IP address) and port of the RPC tracker,
-        in the format 'hostname[:port]'.
-
-    Returns
-    -------
-    rpc_hostname : str or None
-        hostname or IP address, extracted from input.
-    rpc_port : int or None
-        port number extracted from input (9090 default).
-    """
-
-    rpc_hostname = rpc_port = None
-
-    if rpc_tracker_str:
-        parsed_url = urlparse("//%s" % rpc_tracker_str)
-        rpc_hostname = parsed_url.hostname
-        rpc_port = parsed_url.port or 9090
-        logger.info("RPC tracker hostname: %s", rpc_hostname)
-        logger.info("RPC tracker port: %s", rpc_port)
-
-    return rpc_hostname, rpc_port
diff --git a/python/tvm/driver/tvmc/transform.py b/python/tvm/driver/tvmc/transform.py
deleted file mode 100644
index 253c624e6ed4..000000000000
--- a/python/tvm/driver/tvmc/transform.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language
-# pylint: disable=unused-argument
-"""
-TVMC Graph Transforms
-"""
-
-from tvm import relay, transform
-from tvm.driver.tvmc import TVMCException
-
-
-def generate_mixed_precision_rule(acc_dtype):
-    def _mixed_precision_rule(call_node: "relay.Call", mixed_precision_type: str):
-        return [
-            relay.transform.mixed_precision.MIXED_PRECISION_ALWAYS,
-            acc_dtype,
-            mixed_precision_type,
-        ]
-
-    return _mixed_precision_rule
-
-
-class MixedPrecision(object):
-    """Temporarily changes attr of ops to enable required precision."""
-
-    def __init__(self, ops, acc_type):
-        """Saves the required info for RAII pattern usage.
-
-        Parameters
-        ----------
-        ops : list
-            list of operators
-        acc_type: str
-            Output or accumulation precision to be used.
-        """
-        self.older_attr = {}
-        self.ops = ops
-        self.acc_type = acc_type
-        self.attr_key = "FTVMMixedPrecisionConversionType"
-
-    def __enter__(self):
-        for op_name in self.ops:
-            op = relay.op.get(op_name)
-            self.older_attr[op_name] = op.get_attr(self.attr_key)
-            op.reset_attr(self.attr_key)
-            op.set_attr(self.attr_key, generate_mixed_precision_rule(self.acc_type))
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        for op_name in self.ops:
-            op = relay.op.get(op_name)
-            op.reset_attr(self.attr_key)
-            if self.older_attr[op_name]:
-                op.set_attr(self.attr_key, self.older_attr[op_name])
-
-
-def convert_to_mixed_precision(mod, ops=None, calculation_type="float16", acc_type="float16"):
-    """Converts the operator datatypes
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The relay module to convert.
-    ops : list
-        List of operators to be precision converted.
-    calculation_type: str
-        Input precision to be used.
-    acc_type: str
-        Output or accumulation precision to be used.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The converted module.
-    """
-
-    if ops is None:
-        ops = ["nn.conv2d", "nn.dense"]
-
-    with MixedPrecision(ops, acc_type):
-        seq = transform.Sequential(
-            [relay.transform.InferType(), relay.transform.ToMixedPrecision(calculation_type)]
-        )
-        with transform.PassContext(
-            config={"relay.ToMixedPrecision.keep_orig_output_dtype": True}, opt_level=3
-        ):
-            try:
-                return seq(mod)
-            except Exception as err:
-                raise TVMCException("Error converting mixed precision : {0}".format(str(err)))
-
-
-def convert_graph_layout(mod, desired_layouts, ops=None):
-    """Alter the layout of the input graph.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The relay module to convert.
-    desired_layouts : list[str]
-        The layouts to convert to.
-        Expects either a single element or one str per operator.
-        Can be only data layouts or combination of both, e.g. NHWC:HWIO
-    ops : list
-        List of operators to be layout converted.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The converted module.
-    """
-    if ops is None:
-        ops = ["nn.conv2d", "nn.conv2d_transpose", "qnn.conv2d"]
-
-    if not isinstance(desired_layouts, list):
-        # For backwards compatibility
-        assert isinstance(desired_layouts, str)
-        desired_layouts = [desired_layouts]
-
-    if len(desired_layouts) != len(ops):
-        if len(desired_layouts) != 1:
-            raise TVMCException(
-                "Expected 1 or {} layouts but got {}".format(len(ops), len(desired_layouts))
-            )
-        desired_layouts = desired_layouts * len(ops)
-
-    def layout_helper(layout):
-        if ":" in layout:
-            data_layout, kernel_layout = layout.split(":", 1)
-        else:
-            data_layout = layout
-            kernel_layout = "default"
-        return [data_layout, kernel_layout]
-
-    desired_layouts = {op: layout_helper(desired_layouts[i]) for i, op in enumerate(ops)}
-
-    # Convert the layout of the graph where possible.
-    seq = transform.Sequential(
-        [
-            relay.transform.RemoveUnusedFunctions(),
-            relay.transform.ConvertLayout(desired_layouts),
-            relay.transform.FoldConstant(),
-        ]
-    )
-
-    try:
-        return seq(mod)
-    except Exception as err:
-        raise TVMCException("Error converting layouts: {}".format(str(err)))
-
-
-def apply_graph_transforms(mod, args, params=None):
-    """Alter the layout of the input graph.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The relay module to convert.
-    args : dict
-        The transform arguments.
-    params: dict
-        Module params
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The converted module.
-    """
-    if not args:
-        return mod
-
-    # AlterLayout
-    if args.get("desired_layout", None):
-        mod = convert_graph_layout(
-            mod, args["desired_layout"], args.get("desired_layout_ops", None)
-        )
-
-    # ToMixedPrecision
-    if args.get("mixed_precision", False):
-        mod = relay.quantize.prerequisite_optimize(mod, params)
-        mod = convert_to_mixed_precision(
-            mod,
-            args.get("mixed_precision_ops"),
-            args.get("mixed_precision_calculation_type"),
-            args.get("mixed_precision_acc_type"),
-        )
-    return mod
-
-
-def parse_graph_transform_args(args):
-    """Parse incoming options for graph transform arguments.
-
-    Parameters
-    ----------
-    args: argparse.Namespace or dict
-        Arguments.
-
-    Returns
-    -------
-    transform_args : dict
-        Graph transform arguments
-    """
-
-    if not isinstance(args, dict):
-        args = vars(args)
-
-    transform_args = [
-        "desired_layout",
-        "desired_layout_ops",
-        "mixed_precision",
-        "mixed_precision_ops",
-        "mixed_precision_calculation_type",
-        "mixed_precision_acc_type",
-    ]
-    transform_args = {key: args.get(key, None) for key in transform_args}
-    return transform_args
-
-
-def generate_transform_args(parser):
-    """Add graph transform related args"""
-
-    # AlterLayout
-    parser.add_argument(
-        "--desired-layout",
-        nargs="+",
-        help="Change the data/kernel layout of the graph. (i.e. NCHW or NHWC:HWIO)"
-        "This option can be provided multiple times to specify per-operator layouts, "
-        "e.g. '--desired-layout NHWC:HWIO' (Apply same layout for every operator)."
-        "e.g. '--desired-layout-ops nn.conv2d nn.avg_pool2d --desired-layout NCHW NHWC'.",
-    )
-    parser.add_argument(
-        "--desired-layout-ops",
-        default=["nn.conv2d", "nn.conv2d_transpose", "qnn.conv2d"],
-        nargs="+",
-        help="List of operators to be layout converted.",
-    )
-
-    # ToMixedPrecision
-    parser.add_argument(
-        "--mixed-precision",
-        help="Enable mixed precision conversion",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--mixed-precision-ops",
-        default=["nn.conv2d", "nn.dense"],
-        nargs="+",
-        help="List of operators to be converted to mixed precision",
-    )
-    parser.add_argument(
-        "--mixed-precision-calculation-type",
-        choices=["float16", "float32"],
-        default="float16",
-        help="Calculation precision type",
-    )
-    parser.add_argument(
-        "--mixed-precision-acc-type",
-        choices=["float16", "float32"],
-        default="float16",
-        help="Accumulator precision type",
-    )
diff --git a/python/tvm/driver/tvmc/workspace_pools.py b/python/tvm/driver/tvmc/workspace_pools.py
deleted file mode 100644
index fe304f7fc0af..000000000000
--- a/python/tvm/driver/tvmc/workspace_pools.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Functions for processing dynamic workspace pool TVMC args
-"""
-
-
-import logging
-import re
-
-from tvm.driver.tvmc import TVMCException
-from tvm.target import Target
-from tvm.ir.memory_pools import PoolInfoProperties, WorkspaceMemoryPools, WorkspacePoolInfo
-
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("TVMC")
-
-
-def generate_workspace_pools_args(parser):
-    """Generates arguments for each Workspace Pools's options"""
-    parser.add_argument(
-        "--workspace-pools",
-        help="""The name of the memory pool
-                Example usage: --workspace-pools=flash""",
-    )
-    parser.add_argument(
-        "--workspace-pools-targets",
-        help="""The name of the targets specified for the memory pool
-                Example usage: --workspace-pools-targets=flash:llvm""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-size-hint-bytes",
-        nargs="?",
-        help="""The expected size hint to be used by the allocator.
-                Example usage: --workspace-pools-size-hint-bytes=flash:8""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-clock-frequency-hz",
-        nargs="?",
-        help="""The clock frequency that the memory pool runs at in Hz.
-                Example usage: --workspace-pools-clock-frequency-hz=flash:70000000""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-read-bandwidth-bytes-per-cycle",
-        nargs="?",
-        help="""The read bandwidth of the memory pool in bytes/cycle.
-                Example usage: --workspace-pools-read-bandwidth-bytes-per-cycle=flash:4""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-write-bandwidth-bytes-per-cycle",
-        nargs="?",
-        help="""The write bandwidth of the memory pool in bytes/cycle.
-                Example usage: --workspace-pools-write-bandwidth-bytes-per-cycle=flash:8""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-read-latency-cycles",
-        nargs="?",
-        help="""The read latency of the memory pool in cycles.
-                Example usage: --workspace-pools-read-latency-cycles=flash:4""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-write-latency-cycles",
-        nargs="?",
-        help="""The write latency of the memory pool in cycles.
-                Example usage: --workspace-pools-write-latency-cycles=flash:8""",
-        action="append",
-    )
-    parser.add_argument(
-        "--workspace-pools-target-burst-bytes",
-        help="""The burst length of the memory pool in bytes per target.
-                Example usage: --workspace-pools-target-burst-bytes=flash:accel:1""",
-        action="append",
-    )
-
-
-def _parse_target_burst(attr_str, pool_name):
-    if pool_name not in attr_str:
-        return {}
-
-    return {target: int(attr_str[pool_name][target]) for target in attr_str[pool_name]}
-
-
-def _parse_target_string(attr_str, targets, pool_name):
-    if attr_str is None:
-        raise TVMCException(f'No target specified for Workspace Pool "{pool_name}"')
-
-    target_name = [re.split(",", attr_str)]
-    matched_targets = [
-        target
-        for target in targets
-        if any(target.kind.name in target_string_match for target_string_match in target_name[0])
-    ]
-    if not matched_targets:
-        raise TVMCException(f'Workspace Pool "{pool_name}" using undefined Target "{target_name}"')
-    return matched_targets
-
-
-def _split_pools_to_pool_names(attr_str):
-    return re.split(",", attr_str) if attr_str else []
-
-
-def _parse_target_attributes_of_pool_name(attr_str, targets):
-    if not targets or attr_str is None:
-        return {}
-
-    target_attributes = {}
-    for pool_values in attr_str:
-        pool_name, target_name, target_value = re.split(":", pool_values)
-        if pool_name not in target_attributes:
-            target_attributes[pool_name] = {}
-
-        matched_targets = [target for target in targets if target_name == target.kind.name]
-        if matched_targets:
-            target_attributes[pool_name][matched_targets[0]] = target_value
-        else:
-            raise TVMCException(
-                "The workspace pool target specification "
-                "needs to contain a subset of the same TVM "
-                "targets as when specifying targets to use."
-            )
-    return target_attributes
-
-
-def _parse_attribute_of_pool_name(attr_str):
-    return dict(pool.split(":", maxsplit=1) for pool in attr_str) if attr_str else {}
-
-
-def workspace_pools_recombobulate(parsed, targets, extra_target):
-    """Reconstructs the Workspace Pools args and returns a WorkspaceMemoryPool object"""
-    WORKSPACE_POOL_PARAMS = [
-        "workspace_pools_size_hint_bytes",
-        "workspace_pools_targets",
-        "workspace_pools_clock_frequency_hz",
-        "workspace_pools_read_bandwidth_bytes_per_cycle",
-        "workspace_pools_write_bandwidth_bytes_per_cycle",
-        "workspace_pools_read_latency_cycles",
-        "workspace_pools_write_latency_cycles",
-    ]
-    WORKSPACE_POOL_TARGET_PARAMS = [
-        "workspace_pools_target_burst_bytes",
-    ]
-
-    workspace_pools = _split_pools_to_pool_names(parsed.workspace_pools)
-    if not workspace_pools:
-        return None
-
-    parse_attribute_to_pool_name = {
-        workspace_pool_param: _parse_attribute_of_pool_name(getattr(parsed, workspace_pool_param))
-        for workspace_pool_param in WORKSPACE_POOL_PARAMS
-    }
-    parse_target_burst_bytes_to_pool = {
-        workspace_pool_param: _parse_target_attributes_of_pool_name(
-            getattr(parsed, workspace_pool_param), targets
-        )
-        for workspace_pool_param in WORKSPACE_POOL_TARGET_PARAMS
-    }
-
-    # Load extra targets from CLI
-    additional_targets = []
-
-    for t in extra_target:
-        additional_targets.append(Target(t["raw"], host=targets[0].host or targets[0]))
-
-    target = targets + additional_targets
-    if targets[0].host:
-        target.append(targets[0].host)
-
-    return WorkspaceMemoryPools(
-        [
-            WorkspacePoolInfo(
-                pool_name,
-                targets=_parse_target_string(
-                    parse_attribute_to_pool_name["workspace_pools_targets"].get(pool_name),
-                    target,
-                    pool_name,
-                ),
-                pool_info_properties=PoolInfoProperties(
-                    size_hint_bytes=int(
-                        parse_attribute_to_pool_name["workspace_pools_size_hint_bytes"].get(
-                            pool_name, -1
-                        )
-                    ),
-                    clock_frequency_hz=int(
-                        parse_attribute_to_pool_name["workspace_pools_clock_frequency_hz"].get(
-                            pool_name, -1
-                        )
-                    ),
-                    read_bandwidth_bytes_per_cycle=int(
-                        parse_attribute_to_pool_name[
-                            "workspace_pools_read_bandwidth_bytes_per_cycle"
-                        ].get(pool_name, -1)
-                    ),
-                    write_bandwidth_bytes_per_cycle=int(
-                        parse_attribute_to_pool_name[
-                            "workspace_pools_write_bandwidth_bytes_per_cycle"
-                        ].get(pool_name, -1)
-                    ),
-                    read_latency_cycles=int(
-                        parse_attribute_to_pool_name["workspace_pools_read_latency_cycles"].get(
-                            pool_name, 0
-                        )
-                    ),
-                    write_latency_cycles=int(
-                        parse_attribute_to_pool_name["workspace_pools_write_latency_cycles"].get(
-                            pool_name, 0
-                        )
-                    ),
-                    target_burst_bytes=_parse_target_burst(
-                        parse_target_burst_bytes_to_pool["workspace_pools_target_burst_bytes"],
-                        pool_name,
-                    ),
-                ),
-            )
-            for pool_name in workspace_pools
-        ]
-    )
diff --git a/python/tvm/exec/autotvm_log_editor.py b/python/tvm/exec/autotvm_log_editor.py
deleted file mode 100644
index 04d6aa6af87b..000000000000
--- a/python/tvm/exec/autotvm_log_editor.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Pick best log entries from a large file and store them to a small file"""
-
-import argparse
-import os
-import logging
-import warnings
-
-from .. import autotvm
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--act", type=str, choices=["pick-best"], required=True, help="The action")
-    parser.add_argument("--i", type=str, help="The input file or directory", required=True)
-    parser.add_argument("--o", type=str, help="The output file")
-
-    args = parser.parse_args()
-    logging.basicConfig(level=logging.INFO)
-
-    if args.act == "pick-best":
-        if os.path.isfile(args.i):
-            args.o = args.o or args.i + ".best.log"
-            autotvm.record.pick_best(args.i, args.o)
-        elif os.path.isdir(args.i):
-            args.o = args.o or "best.log"
-            tmp_filename = args.o + ".tmp"
-
-            with open(tmp_filename, "w") as tmp_fout:
-                for filename in os.listdir(args.i):
-                    if filename.endswith(".log"):
-                        try:
-                            autotvm.record.pick_best(filename, tmp_fout)
-                        except Exception:  # pylint: disable=broad-except
-                            warnings.warn("Ignore invalid file %s" % filename)
-
-            logging.info("Run final filter...")
-            autotvm.record.pick_best(tmp_filename, args.o)
-            os.remove(tmp_filename)
-            logging.info("Output to %s ...", args.o)
-        else:
-            raise ValueError("Invalid input file: " + args.i)
-    else:
-        raise ValueError("Invalid action " + args.act)
diff --git a/python/tvm/ir/__init__.py b/python/tvm/ir/__init__.py
index fdac74a0b4ec..e7376f4c1f0d 100644
--- a/python/tvm/ir/__init__.py
+++ b/python/tvm/ir/__init__.py
@@ -18,7 +18,6 @@
 """Common data structures across all IR variants."""
 
 from . import diagnostics, instrument, transform
-from .adt import Constructor, TypeData
 from .affine_type import TensorAffineType, TupleAffineType
 from .attrs import Attrs, DictAttrs, make_node
 from .base import (
diff --git a/python/tvm/ir/adt.py b/python/tvm/ir/adt.py
deleted file mode 100644
index 5ca026f727f0..000000000000
--- a/python/tvm/ir/adt.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Algebraic data type definitions."""
-import tvm._ffi
-
-from .type import Type
-from .expr import RelayExpr
-from . import _ffi_api
-
-
-@tvm._ffi.register_object("relay.Constructor")
-class Constructor(RelayExpr):
-    """Relay ADT constructor.
-
-    Parameters
-    ----------
-    name_hint : str
-        Name of constructor (only a hint).
-
-    inputs : List[Type]
-        Input types.
-
-    belong_to : GlobalTypeVar
-        Denotes which ADT the constructor belongs to.
-    """
-
-    def __init__(self, name_hint, inputs, belong_to):
-        self.__init_handle_by_constructor__(_ffi_api.Constructor, name_hint, inputs, belong_to)
-
-    def __call__(self, *args):
-        """Call the constructor.
-
-        Parameters
-        ----------
-        args: List[RelayExpr]
-            The arguments to the constructor.
-
-        Returns
-        -------
-        call: RelayExpr
-            A call to the constructor.
-        """
-        # pylint: disable=import-outside-toplevel
-        from tvm import relay
-
-        return relay.Call(self, args)
-
-
-@tvm._ffi.register_object("relay.TypeData")
-class TypeData(Type):
-    """Stores the definition for an Algebraic Data Type (ADT) in Relay.
-
-    Note that ADT definitions are treated as type-level functions because
-    the type parameters need to be given for an instance of the ADT. Thus,
-    any global type var that is an ADT header needs to be wrapped in a
-    type call that passes in the type params.
-
-    Parameters
-    ----------
-    header: GlobalTypeVar
-        The name of the ADT.
-        ADTs with the same constructors but different names are
-        treated as different types.
-
-    type_vars: List[TypeVar]
-        Type variables that appear in constructors.
-
-    constructors: List[Constructor]
-        The constructors for the ADT.
-    """
-
-    def __init__(self, header, type_vars, constructors):
-        self.__init_handle_by_constructor__(_ffi_api.TypeData, header, type_vars, constructors)
diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index 263976fa98ff..1dcb9f6cf600 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -16,7 +16,7 @@
 # under the License.
 """Common expressions data structures in the IR."""
 from numbers import Number
-from typing import Callable, Optional
+from typing import Optional
 
 import tvm._ffi
 
@@ -106,14 +106,9 @@ def __call__(self, *args: RelayExpr) -> BaseExpr:
 
         # TODO(@relax-team): replace with Relax base class after it's introduced
         if all(isinstance(x, RelayExpr) for x in args):
-            if all(is_relax_expr(x) for x in args):
-                from tvm import relax
+            from tvm import relax
 
-                return relax.Call(self, args)
-            else:
-                from tvm import relay
-
-                return relay.Call(self, args)
+            return relax.Call(self, args)
 
         elif all(isinstance(x, (Number, PrimExpr)) for x in args):
             return tvm.tir.call_tir(self, *args)
@@ -121,36 +116,6 @@ def __call__(self, *args: RelayExpr) -> BaseExpr:
         arg_types = [type(x) for x in args]
         raise RuntimeError(f"Do not know how to handle GlobalVar.__call__ for types {arg_types}")
 
-    def astext(
-        self, show_meta_data: bool = True, annotate: Optional[Callable[[Object], str]] = None
-    ) -> str:
-        """Get the text format of the expression.
-
-        Parameters
-        ----------
-        show_meta_data : bool
-            Whether to include meta data section in the text
-            if there is meta data.
-
-        annotate: Optional[Object->str]
-            Optionally annotate function to provide additional
-            information in the comment block.
-
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
-        Notes
-        -----
-        The meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big (e.g constant weights),
-        so it can be helpful to skip printing the meta data section.
-        """
-        from tvm.relay import astext  # pylint: disable=import-outside-toplevel
-
-        return astext(self, show_meta_data, annotate)
-
 
 @tvm._ffi.register_object
 class Range(Node, Scriptable):
diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py
index 3c76dbfdd839..9ee87c5224e6 100644
--- a/python/tvm/ir/module.py
+++ b/python/tvm/ir/module.py
@@ -395,31 +395,3 @@ def with_attrs(self, attr_map: Union[DictAttrs, Dict[str, Object]]) -> "IRModule
             attr_map = attr_map._dict()
 
         return _ffi_api.Module_WithAttrs(self, attr_map)
-
-    def astext(self, show_meta_data=True, annotate=None):
-        """Get the text format of the expression.
-
-        Parameters
-        ----------
-        show_meta_data : bool
-            Whether to include meta data section in the text
-            if there is meta data.
-
-        annotate: Optional[Object->str]
-            Optionally annotate function to provide additional
-            information in the comment block.
-
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
-        Notes
-        -----
-        The meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big (e.g constant weights),
-        so it can be helpful to skip printing the meta data section.
-        """
-        from tvm.relay import astext  # pylint: disable=import-outside-toplevel
-
-        return astext(self, show_meta_data, annotate)
diff --git a/python/tvm/ir/op.py b/python/tvm/ir/op.py
index 70aba979518e..3ab5bb55c051 100644
--- a/python/tvm/ir/op.py
+++ b/python/tvm/ir/op.py
@@ -29,34 +29,6 @@ class Op(RelayExpr):
     def __init__(self):
         raise RuntimeError("Cannot create op, use get instead")
 
-    def astext(self, show_meta_data=True, annotate=None):
-        """Get the text format of the expression.
-
-        Parameters
-        ----------
-        show_meta_data : bool
-            Whether to include meta data section in the text
-            if there is meta data.
-
-        annotate: Optional[Object->str]
-            Optionally annotate function to provide additional
-            information in the comment block.
-
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
-        Notes
-        -----
-        The meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big (e.g constant weights),
-        so it can be helpful to skip printing the meta data section.
-        """
-        from tvm.relay import astext  # pylint: disable=import-outside-toplevel
-
-        return astext(self, show_meta_data, annotate)
-
     @staticmethod
     def get(op_name):
         """Get the Op for a given name
diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py
index b44dbe45e0b7..c09871f60810 100644
--- a/python/tvm/meta_schedule/__init__.py
+++ b/python/tvm/meta_schedule/__init__.py
@@ -24,7 +24,6 @@
     measure_callback,
     mutator,
     postproc,
-    relay_integration,
     relax_integration,
     runner,
     schedule,
@@ -44,7 +43,6 @@
 from .mutator import Mutator
 from .postproc import Postproc
 from .profiler import Profiler
-from .relay_integration import is_meta_schedule_enabled
 from .runner import Runner
 from .schedule_rule import ScheduleRule
 from .search_strategy import MeasureCandidate, SearchStrategy
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
deleted file mode 100644
index 1fd7b5d73e82..000000000000
--- a/python/tvm/meta_schedule/relay_integration.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""MetaSchedule-Relay integration"""
-from contextlib import contextmanager
-from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
-
-# isort: off
-from typing_extensions import Literal
-
-# isort: on
-import numpy as np  # type: ignore
-
-from tvm import nd
-from tvm._ffi import get_global_func
-from tvm.ir import IRModule, transform
-from tvm.ir.instrument import PassInstrument
-from tvm.runtime import NDArray
-from tvm.target import Target
-
-from .builder import Builder
-from .cost_model import CostModel
-from .database import Database
-from .extracted_task import ExtractedTask
-from .logging import get_loggers_from_work_dir
-from .measure_callback import MeasureCallback
-from .profiler import Profiler
-from .runner import Runner
-from .search_strategy import SearchStrategy
-from .space_generator import SpaceGenerator
-from .task_scheduler import TaskScheduler
-from .tune import tune_tasks
-from .tune_context import TuneContext
-from .utils import fork_seed
-
-if TYPE_CHECKING:
-    from tvm import relay
-
-_extract_task = get_global_func(  # pylint: disable=invalid-name
-    "relay.backend.MetaScheduleExtractTask",
-    allow_missing=True,
-)
-
-
-@contextmanager
-def _autotvm_silencer():
-    """A context manager that silences autotvm warnings."""
-    from tvm import autotvm  # pylint: disable=import-outside-toplevel
-
-    silent = autotvm.GLOBAL_SCOPE.silent
-    autotvm.GLOBAL_SCOPE.silent = True
-    try:
-        yield
-    finally:
-        autotvm.GLOBAL_SCOPE.silent = silent
-
-
-def _normalize_params(
-    mod: IRModule,
-    target: Union[Target, str],
-    params: Optional[Dict[str, NDArray]],
-    pass_config: Mapping[str, Any],
-    executor: Optional["relay.backend.Executor"],
-    runtime: Optional["relay.backend.Runtime"],
-) -> Tuple[
-    IRModule,
-    Target,
-    Dict[str, NDArray],
-    Dict[str, Any],
-    Optional["relay.backend.Executor"],
-    Optional["relay.backend.Runtime"],
-]:
-    from tvm import relay  # pylint: disable=import-outside-toplevel
-
-    if isinstance(mod, relay.Function):
-        mod = IRModule.from_expr(mod)
-    if not isinstance(target, Target):
-        target = Target(target)
-    if params is None:
-        params = {}
-    relay_params = {}
-    for name, param in params.items():
-        if isinstance(param, np.ndarray):
-            param = nd.array(param)
-        relay_params[name] = param
-
-    if executor is None:
-        executor = relay.backend.Executor("graph")
-
-    if runtime is None:
-        runtime = relay.backend.Runtime("cpp")
-
-    if mod.get_attr("executor") is None:
-        mod = mod.with_attr("executor", executor)
-    else:
-        executor = mod.get_attr("executor")
-
-    pass_config = dict(pass_config)
-    return mod, target, relay_params, pass_config, executor, runtime
-
-
-def extract_tasks(
-    mod: IRModule,
-    target: Union[Target, str],
-    params: Optional[Dict[str, NDArray]],
-    *,
-    opt_level: int = 3,
-    pass_config: Mapping[str, Any] = MappingProxyType(
-        {
-            "relay.backend.use_meta_schedule": True,
-            "relay.backend.tir_converter": "default",
-        }
-    ),
-    executor: Optional["relay.backend.Executor"] = None,
-    runtime: Optional["relay.backend.Runtime"] = None,
-    module_equality: str = "structural",
-    disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
-    instruments: Optional[Sequence[PassInstrument]] = None,
-) -> List[ExtractedTask]:
-    """Extract tuning tasks from a relay program.
-
-    Parameters
-    ----------
-    mod : IRModule
-        The module or function to tune
-    target : tvm.target.Target
-        The compilation target
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        The associated parameters of the program
-    opt_level : int
-        The optimization level of the compilation
-    pass_config : Mapping[str, Any]
-        The pass configuration
-    executor : Optional[relay.backend.Executor]
-        The executor to use
-    runtime : Optional[relay.backend.Runtime]
-        The runtime to use
-    module_equality : Optional[str]
-        A string to specify the module equality testing and hashing method.
-        It must be one of the followings:
-          - "structural": Use StructuralEqual/Hash
-          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
-                              equality testing and hashing.
-          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
-                            given module. The "ignore-ndarray" varint is used for the extracted
-                            blocks or in case no anchor block is found.
-                            For the definition of the anchor block, see tir/analysis/analysis.py.
-    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
-        The list of disabled passes
-    instruments : Optional[Sequence[PassInstrument]]
-        The list of pass instrument implementations.
-
-    Returns
-    -------
-    tasks: List[ExtractedTask]
-        The tasks extracted from this network
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import autotvm
-
-    # pylint: enable=import-outside-toplevel
-    mod, target, params, pass_config, _ex, _rt = _normalize_params(
-        mod,
-        target,
-        params,
-        pass_config,
-        executor,
-        runtime,
-    )
-    if target.kind.name != "cuda" and isinstance(
-        autotvm.DispatchContext.current, autotvm.FallbackContext
-    ):
-        tophub_context = autotvm.tophub.context(target)
-    else:
-        tophub_context = autotvm.utils.EmptyContext()
-    with Profiler.timeit("TaskExtraction"):
-        with target, _autotvm_silencer(), tophub_context:
-            with transform.PassContext(
-                opt_level=opt_level,
-                config=pass_config,
-                disabled_pass=disabled_pass,
-                instruments=instruments,
-            ):
-                return list(_extract_task(mod, target, params, module_equality))
-
-
-def extracted_tasks_to_tune_contexts(
-    extracted_tasks: List[ExtractedTask],
-    work_dir: str,
-    space: SpaceGenerator.SpaceGeneratorType = "post-order-apply",
-    strategy: SearchStrategy.SearchStrategyType = "evolutionary",
-    num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical",
-    seed: Optional[int] = None,
-) -> Tuple[List[TuneContext], List[float]]:
-    """Convert ExtractedTask to TuneContext.
-
-    Parameters
-    ----------
-    tasks : List[ExtractedTask]
-        The tasks to be converted
-    work_dir : str
-        The working directory to store logs and databases
-    space : SpaceGenerator.SpaceGeneratorType
-        The space generator to use.
-    strategy : SearchStrategy.SearchStrategyType
-        The search strategy to use.
-    num_tuning_cores : Union[Literal["physical", "logical"], int]
-        The number of CPU cores to use during tuning.
-    seed : Optional[int]
-        The random seed to use.
-
-    Returns
-    -------
-    tasks : List[TuneContext]
-        The converted tasks
-    task_weights : List[float]
-        The weights of the tasks
-    """
-    tasks: List[TuneContext] = []
-    task_weights: List[float] = []
-    for task, logger, rand_state in zip(
-        extracted_tasks,
-        get_loggers_from_work_dir(work_dir, [t.task_name for t in extracted_tasks]),
-        fork_seed(seed, n=len(extracted_tasks)),
-    ):
-        tasks.append(
-            TuneContext(
-                mod=task.dispatched[0],
-                target=task.target,
-                space_generator=space,
-                search_strategy=strategy,
-                task_name=task.task_name,
-                logger=logger,
-                rand_state=rand_state,
-                num_threads=num_tuning_cores,
-            ).clone()
-        )
-        task_weights.append(task.weight)
-    return tasks, task_weights
-
-
-def tune_relay(
-    mod: IRModule,
-    params: Dict[str, NDArray],
-    target: Union[str, Target],
-    work_dir: str,
-    max_trials_global: int,
-    *,
-    max_trials_per_task: Optional[int] = None,
-    num_trials_per_iter: int = 64,
-    builder: Builder.BuilderType = "local",
-    runner: Runner.RunnerType = "local",
-    database: Database.DatabaseType = "json",
-    cost_model: CostModel.CostModelType = "xgb",
-    measure_callbacks: MeasureCallback.CallbackListType = "default",
-    task_scheduler: TaskScheduler.TaskSchedulerType = "gradient",
-    space: SpaceGenerator.SpaceGeneratorType = "post-order-apply",
-    strategy: SearchStrategy.SearchStrategyType = "evolutionary",
-    seed: Optional[int] = None,
-    module_equality: str = "structural",
-    num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical",
-    opt_level: int = 3,
-    disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
-    instruments: Optional[Sequence[PassInstrument]] = None,
-    post_optimization: Optional[bool] = False,
-) -> Database:
-    """Tune a Relay program.
-
-    Parameters
-    ----------
-    mod : Union[IRModule, tir.PrimFunc]
-        The module or function to tune
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        The associated parameters of the program
-    target : Union[Target, str]
-        The compilation target
-    work_dir : str
-        The working directory to store the tuning records
-    max_trials_global : int
-        The maximum number of trials to run
-    max_trials_per_task : Optional[int]
-        The maximum number of trials to run for each task
-    num_trials_per_iter : int
-        The number of trials to run per iteration
-    builder : BuilderType
-        The builder to use
-    runner : RunnerType
-        The runner to use
-    database : DatabaseType
-        The database to use
-    cost_model : CostModelType
-        The cost model to use
-    measure_callbacks : CallbackListType
-        The measure callbacks to use
-    task_scheduler : TaskSchedulerType
-        The task scheduler to use
-    space : SpaceGeneratorType
-        The space generator to use
-    strategy : SearchStrategyType
-        The search strategy to use
-    seed : Optional[int]
-        The random seed
-    module_equality : Optional[str]
-        A string to specify the module equality testing and hashing method.
-        It must be one of the followings:
-          - "structural": Use StructuralEqual/Hash
-          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
-                              equality testing and hashing.
-          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
-                            given module. The "ignore-ndarray" varint is used for the extracted
-                            blocks or in case no anchor block is found.
-                            For the definition of the anchor block, see tir/analysis/analysis.py.
-    num_tuning_cores : Union[Literal["physical", "logical"], int]
-        The number of CPU cores to use during tuning.
-    opt_level : int
-        The optimization level of the compilation
-    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
-        The list of disabled passes during tasks extraction
-    instruments : Optional[Sequence[PassInstrument]]
-        The list of pass instrument implementations.
-    post_optimization : Optional[Bool]
-        Generate post-optimization using Droplet Search as exploitation space.
-
-    Returns
-    -------
-    database : Database
-        The database that contains the tuning records
-    """
-    tasks, task_weights = extracted_tasks_to_tune_contexts(
-        extracted_tasks=extract_tasks(
-            mod,
-            target,
-            params,
-            opt_level=opt_level,
-            module_equality=module_equality,
-            disabled_pass=disabled_pass,
-            instruments=instruments,
-        ),
-        work_dir=work_dir,
-        space=space,
-        strategy=strategy,
-        seed=seed,
-        num_tuning_cores=num_tuning_cores,
-    )
-    return tune_tasks(
-        tasks=tasks,
-        task_weights=task_weights,
-        work_dir=work_dir,
-        max_trials_global=max_trials_global,
-        max_trials_per_task=max_trials_per_task,
-        num_trials_per_iter=num_trials_per_iter,
-        builder=builder,
-        runner=runner,
-        database=database,
-        cost_model=cost_model,
-        measure_callbacks=measure_callbacks,
-        task_scheduler=task_scheduler,
-        module_equality=module_equality,
-        post_optimization=post_optimization,
-    )
-
-
-def compile_relay(
-    database: Database,
-    mod: IRModule,
-    target: Union[Target, str],
-    params: Optional[Dict[str, NDArray]],
-    *,
-    backend: Literal["graph", "vm"] = "graph",
-    opt_level: int = 3,
-    pass_config: Mapping[str, Any] = MappingProxyType(
-        {
-            "relay.backend.use_meta_schedule": True,
-            "relay.backend.tir_converter": "default",
-        }
-    ),
-    executor: Optional["relay.backend.Executor"] = None,
-    disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
-    runtime: Optional["relay.backend.Runtime"] = None,
-    instruments: Optional[Sequence[PassInstrument]] = None,
-):
-    """Compile a relay program with a MetaSchedule database.
-
-    Parameters
-    ----------
-    database : Database
-        The database to use
-    mod : IRModule
-        The Relay program to be compiled
-    target : tvm.target.Target
-        The compilation target
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        The associated parameters of the program
-    backend : str
-        The backend to use. Builtin backends:
-            - "graph"
-            - "vm"
-    opt_level : int
-        The optimization level of the compilation
-    pass_config : Mapping[str, Any]
-        The pass configuration
-    executor : Optional[relay.backend.Executor]
-        The executor to use in relay.build. It is not supported by RelayVM.
-    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
-        The list of disabled passes
-    runtime : Optional[relay.backend.Runtime]
-        The runtime to use in relay.build. It is not supported by RelayVM.
-    instruments : Optional[Sequence[PassInstrument]]
-        The list of pass instrument implementations.
-
-    Returns
-    -------
-    lib : Union[Module, tvm.runtime.vm.Executable]
-        The built runtime module or vm Executable for the given relay workload.
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    # pylint: enable=import-outside-toplevel
-    mod, target, params, pass_config, executor, runtime = _normalize_params(
-        mod, target, params, pass_config, executor, runtime
-    )
-    pass_config.setdefault("relay.backend.use_meta_schedule_dispatch", True)
-    with Profiler.timeit("PostTuningCompilation"):
-        with target, _autotvm_silencer(), database:
-            with transform.PassContext(
-                opt_level=opt_level,
-                config=pass_config,
-                disabled_pass=disabled_pass,
-                instruments=instruments,
-            ):
-                if backend == "graph":
-                    return relay.build(
-                        mod, target=target, params=params, executor=executor, runtime=runtime
-                    )
-                elif backend == "vm":
-                    return relay.vm.compile(mod, target=target, params=params)
-                else:
-                    raise ValueError(f"Unknown backend: {backend}")
-
-
-def is_meta_schedule_enabled() -> bool:
-    """Return whether the meta-schedule is enabled.
-
-    Returns
-    -------
-    enabled: bool
-        Whether the meta schedule is enabled
-    """
-    return transform.PassContext.current().config.get(
-        "relay.backend.use_meta_schedule",
-        False,
-    )
diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
deleted file mode 100644
index 20abcfce3dc1..000000000000
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Workloads in Relay IR"""
-# pylint: disable=import-outside-toplevel
-import logging
-import multiprocessing
-import os
-import pickle
-from typing import Any, Dict, List, Optional, Tuple
-
-import tvm
-import tvm.relay.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.ir import IRModule
-from tvm.runtime import NDArray, load_param_dict, save_param_dict
-from tvm.target import Target
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-
-def _get_network(
-    args: Tuple[str, List[int], Optional[str]]
-) -> Tuple[IRModule, bytearray, Tuple[str, List[int], str]]:
-    name: str
-    input_shape: List[int]
-    layout: Optional[str]
-    name, input_shape, layout = args
-
-    if layout == "None":
-        layout = None
-
-    mod: IRModule
-    if name in [
-        "resnet_18",
-        "resnet_50",
-        "wide_resnet_50",
-        "resnext_50",
-        "mobilenet_v2",
-        "mobilenet_v3",
-        "inception_v3",
-        "densenet_121",
-        "resnet3d_18",
-        "vgg_16",
-    ]:
-        import torch  # type: ignore
-        from torchvision import models  # type: ignore
-
-        assert layout is None or layout in ["NCHW", "NHWC"]
-
-        params: Dict[str, Any] = {}
-        if name in ["resnet_18", "resnet_50"]:
-            model = getattr(models, name.replace("_", ""))
-        elif name == "wide_resnet_50":
-            model = getattr(models, "wide_resnet50_2")
-        elif name == "resnext_50":
-            model = getattr(models, "resnext50_32x4d")
-        elif name == "mobilenet_v2":
-            model = getattr(models, name)
-        elif name == "mobilenet_v3":
-            model = getattr(models, name + "_large")
-        elif name == "inception_v3":
-            model = getattr(models, name)
-            params["aux_logits"] = False
-        elif name == "densenet_121":
-            model = getattr(models, name.replace("_", ""))
-        elif name == "resnet3d_18":
-            model = models.video.r3d_18
-        elif name == "vgg_16":
-            model = getattr(models, name.replace("_", ""))
-        try:
-            model = model(**params, weights=None)
-        except TypeError:
-            model = model(**params, pretrained=False)
-
-        dtype = "float32"
-        input_data = torch.randn(input_shape).type(  # pylint: disable=no-member
-            {
-                "float32": torch.float32,  # pylint: disable=no-member
-            }[dtype]
-        )
-        scripted_model = torch.jit.trace(model, input_data).eval()  # type: ignore
-        input_name = "input0"
-        shape_list = [(input_name, input_shape)]
-        mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
-        passes = [relay.transform.RemoveUnusedFunctions()]
-        if layout is None or layout == "NHWC":
-            # PyTorch is imported as NCHW by default
-            passes.append(
-                relay.transform.ConvertLayout(
-                    {
-                        "nn.conv2d": ["NHWC", "default"],
-                        "nn.conv3d": ["NDHWC", "default"],
-                        "nn.max_pool2d": ["NHWC", "default"],
-                        "nn.avg_pool2d": ["NHWC", "default"],
-                    }
-                )
-            )
-        with tvm.transform.PassContext(opt_level=3):
-            mod = tvm.transform.Sequential(passes)(mod)
-        inputs = (input_name, input_shape, dtype)
-    elif name in ["bert_tiny", "bert_base", "bert_medium", "bert_large"]:
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"
-        # pip3 install transformers==3.5 torch==1.7
-        import torch  # type: ignore
-        import transformers  # type: ignore
-
-        assert layout is None
-
-        config_dict = {
-            "bert_tiny": transformers.BertConfig(
-                num_hidden_layers=6,
-                hidden_size=512,
-                intermediate_size=2048,
-                num_attention_heads=8,
-                return_dict=False,
-            ),
-            "bert_base": transformers.BertConfig(
-                num_hidden_layers=12,
-                hidden_size=768,
-                intermediate_size=3072,
-                num_attention_heads=12,
-                return_dict=False,
-            ),
-            "bert_medium": transformers.BertConfig(
-                num_hidden_layers=12,
-                hidden_size=1024,
-                intermediate_size=4096,
-                num_attention_heads=16,
-                return_dict=False,
-            ),
-            "bert_large": transformers.BertConfig(
-                num_hidden_layers=24,
-                hidden_size=1024,
-                intermediate_size=4096,
-                num_attention_heads=16,
-                return_dict=False,
-            ),
-        }
-        configuration = config_dict[name]
-        model = transformers.BertModel(configuration)
-        input_name = "input_ids"
-        input_dtype = "int64"
-        a = torch.randint(10000, input_shape)  # pylint: disable=no-member
-        model.eval()
-        scripted_model = torch.jit.trace(model, [a], strict=False)  # type: ignore
-        input_name = "input_ids"
-        shape_list = [(input_name, input_shape)]
-        mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
-        mod = relay.transform.FastMath()(mod)
-        mod = relay.transform.CombineParallelBatchMatmul()(mod)
-        inputs = (input_name, input_shape, input_dtype)
-    elif name == "dcgan":
-        assert layout is None
-
-        output_shape = input_shape
-        batch_size = output_shape[0]
-        oshape = output_shape[1:]
-        mod, params = relay.testing.dcgan.get_workload(
-            batch_size=batch_size,
-            oshape=oshape,
-            layout="NHWC",
-        )
-        inputs = ("data", [100], "float32")
-    else:
-        raise ValueError("Invalid name: " + name)
-
-    params_bytearray: bytearray = save_param_dict(params)
-    return mod, params_bytearray, inputs
-
-
-def _load_cache(cache_dir: Optional[str], filename: str) -> Optional[List[Any]]:
-    if cache_dir is None:
-        return None
-    path = os.path.join(os.path.expanduser(cache_dir), filename)
-    if not os.path.exists(path):
-        return None
-    logger.info("Loaded from cached: %s", path)
-    with open(path, "rb") as i_f:
-        return pickle.load(i_f)
-
-
-def _save_cache(cache_dir: Optional[str], filename: str, objects: List[Any]) -> None:
-    if cache_dir is None:
-        return
-    path = os.path.join(os.path.expanduser(cache_dir), filename)
-    with open(path, "wb") as o_f:
-        pickle.dump(objects, o_f)
-
-
-def get_network(
-    name: str,
-    input_shape: List[int],
-    *,
-    layout: Optional[str] = None,
-    cache_dir: Optional[str] = None,
-) -> Tuple[IRModule, Dict[str, NDArray], Tuple[str, List[int], str]]:
-    """Get the symbol definition and random weight of a network
-
-    Parameters
-    ----------
-    name : str
-        The name of the network.
-    input_shape : List[int]
-        The shape of the input tensor.
-    layout : Optional[str]
-        The layout of the input tensor. For vision models, the layout is by default NHWC.
-    cache_dir : Optional[str], optional
-        The directory to cache the generated network.
-        If not specified, the cache will be disabled.
-
-    Returns
-    -------
-    mod : IRModule
-        The IRModule representing the network.
-    params : Dict[str, NDArray]
-        The parameters of the networks.
-    inputs : Tuple[str, List[int], str]
-        The name, shape and dtype of the input tensor.
-    """
-    mod: IRModule
-    params: Dict[str, NDArray]
-    inputs: Tuple[str, List[int], str]
-    params_bytearray: bytearray
-
-    filename = f'relay-{name}-{layout}-{",".join(str(i) for i in input_shape)}.json'
-    cached = _load_cache(cache_dir, filename)
-    if cached is None:
-        with multiprocessing.Pool(processes=1) as pool:
-            result = pool.map(_get_network, [(name, input_shape, layout)])
-        ((mod, params_bytearray, inputs),) = result
-        cached = [mod, params_bytearray, inputs]
-        _save_cache(cache_dir, filename, cached)
-    mod, params_bytearray, inputs = cached
-    params = load_param_dict(params_bytearray)
-    return mod, params, inputs
-
-
-def extract_from_relay(
-    mod: IRModule,
-    target: Target,
-    params: Optional[Dict[str, NDArray]],
-    name: str,
-    input_shape: List[int],
-    *,
-    cache_dir: Optional[str] = None,
-) -> List[ms.ExtractedTask]:
-    """Extract the tasks from a network.
-
-    Parameters
-    ----------
-    mod : IRModule
-        The IRModule representing the network.
-    target : Target
-        The target that the network will be deployed to.
-    params : Optional[Dict[str, NDArray]]
-        The parameters of the networks.
-    name : str
-        The name of the network.
-    input_shape : List[int]
-        The shape of the input tensor.
-    cache_dir : Optional[str]
-        The directory to cache the generated network.
-        If not specified, the cache will be disabled.
-
-    Returns
-    -------
-    extracted_tasks : List[ExtractedTask]
-        The extracted tasks.
-    """
-    filename = f'tasks-{target.kind.name}-{name}-{",".join(str(i) for i in input_shape)}.json'
-    extracted_tasks = _load_cache(cache_dir, filename)
-    if extracted_tasks is None:
-        extracted_tasks = ms.relay_integration.extract_tasks(
-            mod=mod,
-            target=target,
-            params=params,
-        )
-        extracted_tasks = list(extracted_tasks)
-        _save_cache(cache_dir, filename, extracted_tasks)
-    return extracted_tasks
-
-
-SUPPORTED = [
-    # TorchVision
-    "resnet_18",
-    "resnet_50",
-    "mobilenet_v2",
-    "mobilenet_v3",
-    "wide_resnet_50",
-    "resnext_50",
-    "resnet3d_18",
-    "inception_v3",
-    "densenet_121",
-    "vgg_16",
-    # Transformer
-    "bert_tiny",
-    "bert_base",
-    "bert_medium",
-    "bert_large",
-    # Relay testing
-    "dcgan",
-]
diff --git a/python/tvm/meta_schedule/testing/te_workload.py b/python/tvm/meta_schedule/testing/te_workload.py
index cdc430087542..792224161d2b 100644
--- a/python/tvm/meta_schedule/testing/te_workload.py
+++ b/python/tvm/meta_schedule/testing/te_workload.py
@@ -549,41 +549,6 @@ def conv2d_winograd_nhwc(  # pylint: disable=invalid-name,missing-docstring
     return (data, weight, out)
 
 
-def conv2d_winograd_nchw(  # pylint: disable=invalid-name,missing-docstring
-    N: int,
-    H: int,
-    W: int,
-    CI: int,
-    CO: int,
-    kernel_size: int,
-    stride: int = 1,
-    padding: int = 1,
-    dilation: int = 1,
-) -> Tuple[te.Tensor, te.Tensor, te.Tensor]:
-    from tvm.topi.cuda.conv2d_winograd import (  # pylint: disable=import-outside-toplevel
-        _infer_tile_size,
-    )
-    from tvm.topi.nn.conv2d import (  # pylint: disable=import-outside-toplevel
-        _conv2d_winograd_nchw_impl,
-    )
-
-    data = te.placeholder((N, CI, H, W), "float32", name="data")
-    weight = te.placeholder((kernel_size, kernel_size, CI, CO), "float32", name="weight")
-    out = _conv2d_winograd_nchw_impl(
-        data,
-        weight,
-        stride,
-        padding,
-        dilation,
-        "float32",
-        pre_computed=True,
-        auto_scheduler_rewritten_layout="",
-        meta_schedule_original_shape=None,
-        tile_size=_infer_tile_size(data, weight),
-    )
-    return (data, weight, out)
-
-
 def matmul(
     n: int, m: int, k: int, in_dtype: str = "float32", out_dtype: str = "float32"
 ) -> Tuple[te.Tensor, te.Tensor, te.Tensor]:
@@ -868,10 +833,4 @@ def create_te_workload(name: str, idx: int) -> tir.PrimFunc:
             (1, 14, 14, 128, 128, 6),
         ],
     ),
-    "C2D_WIN_NCHW": (
-        conv2d_winograd_nchw,
-        [
-            (1, 56, 56, 64, 64, 6),
-        ],
-    ),
 }
diff --git a/python/tvm/meta_schedule/testing/tlcbench.py b/python/tvm/meta_schedule/testing/tlcbench.py
deleted file mode 100644
index 2e9f9f52b1fc..000000000000
--- a/python/tvm/meta_schedule/testing/tlcbench.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,import-outside-toplevel
-# type: ignore
-"""Model loader for TLCBench."""
-import logging
-import multiprocessing
-import os
-
-import tvm
-from tvm import relay
-from tvm.contrib.download import download_testdata
-
-log = logging.getLogger(__name__)
-
-
-def _convert(args):
-    onnx_model, shape_dict, json_path, params_path = args
-    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)
-
-    seq = tvm.transform.Sequential(
-        [relay.transform.InferType(), relay.transform.FakeQuantizationToInteger(use_qat=True)]
-    )
-    mod = seq(mod)
-
-    with open(json_path, "w") as fo:
-        fo.write(tvm.ir.save_json(mod))
-
-    with open(params_path, "wb") as fo:
-        fo.write(relay.save_param_dict(params))
-
-
-def convert_to_qnn(onnx_path, json_path, params_path, input_info):
-    """Run the ONNX frontend and the FQ2I pass. The output is serialized to disk."""
-    import onnx
-
-    onnx_model = onnx.load(onnx_path)
-
-    shape_dict = dict(input_info)
-
-    log.info("Converting te ONNX model to Relay and running the FQ2I pass, it may take a while...")
-
-    with multiprocessing.Pool(processes=1) as pool:
-        pool.map(_convert, [(onnx_model, shape_dict, json_path, params_path)])
-
-
-def deserialize_relay(json_path, params_path):
-    with open(json_path, "r") as fi:
-        mod = tvm.ir.load_json(fi.read())
-
-    with open(params_path, "rb") as fi:
-        params = relay.load_param_dict(fi.read())
-    return mod, params
-
-
-def load_quantized_bert_base(batch_size=1, seq_len=384):
-    """
-    Load the quantized bert-base model from TLCBench, possibly downloading it from github
-    and caching the converted int8 QNN module to disk.
-
-    In addition to returing the relay module and its parameters, it also returns input name
-    and shape information, which can be used at the deployment time as follows:
-
-    ```
-    mod, params, input_info = load_quantized_bert_base()
-
-    ...
-
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    for name, shape in input_info:
-        arr = np.random.uniform(1, 10, size=shape).astype("int64")
-        runtime.set_input(name, arr)
-
-    runtime.run()
-    ```
-
-    """
-    url = "https://github.com/tlc-pack/TLCBench/raw/main/models/bert-base-qat.onnx"
-    log.info("Downloading quantized bert-base model.")
-    onnx_path = download_testdata(url, "bert-base-qat.onnx", module="tlcbench")
-    data_dir = os.path.dirname(onnx_path)
-
-    json_path = os.path.join(data_dir, "bert_base_int8_b%d_s%d.json" % (batch_size, seq_len))
-    params_path = os.path.join(data_dir, "bert_base_int8_b%d_s%d.params" % (batch_size, seq_len))
-
-    # Input names and order encoded in the ONNX model
-    input_info = [
-        ("input_ids", (batch_size, seq_len)),
-        ("segment_ids", (batch_size, seq_len)),
-        ("input_mask", (batch_size, seq_len)),
-    ]
-
-    if not os.path.exists(json_path) or not os.path.exists(params_path):
-        convert_to_qnn(onnx_path, json_path, params_path, input_info)
-
-    def deserialize():
-        try:
-            return deserialize_relay(json_path, params_path)
-        except ValueError:
-            # A serialized Relay json file may become invalid after TVM bump
-            # Update the serialized model and try loading again
-            convert_to_qnn(onnx_path, json_path, params_path, input_info)
-            return deserialize_relay(json_path, params_path)
-
-    mod, params = deserialize()
-
-    return mod, params, input_info
diff --git a/python/tvm/parser.py b/python/tvm/parser.py
deleted file mode 100644
index b79682d8907b..000000000000
--- a/python/tvm/parser.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""The legacy TVM parser """
-from .ir.base import deprecated
-
-# pylint: disable=import-outside-toplevel
-
-
-@deprecated("tvm.parser.parse", "tvm.relay.parse")
-def parse(*args, **kwargs):
-    """Deprecated, use `tvm.relay.parse` instead"""
-    from tvm.relay import parse as _impl
-
-    return _impl(*args, **kwargs)
-
-
-@deprecated("tvm.parser.parse_expr", "tvm.relay.parse_expr")
-def parse_expr(*args, **kwargs):
-    """Deprecated, use `tvm.relay.parse_expr` instead"""
-    from tvm.relay import parse_expr as _impl
-
-    return _impl(*args, **kwargs)
-
-
-@deprecated("tvm.parser.fromtext", "tvm.relay.fromtext")
-def fromtext(*args, **kwargs):
-    """Deprecated, use `tvm.relay.fromtext` instead"""
-    from tvm.relay import fromtext as _impl
-
-    return _impl(*args, **kwargs)
-
-
-@deprecated("tvm.parser.SpanCheck", "tvm.relay.SpanCheck")
-def SpanCheck(*args, **kwargs):
-    """Deprecated, use `tvm.relay.SpanCheck` instead"""
-    from tvm.relay import SpanCheck as _impl
-
-    return _impl(*args, **kwargs)
diff --git a/python/tvm/relax/backend/dispatch_sort_scan.py b/python/tvm/relax/backend/dispatch_sort_scan.py
index e37869c40c46..b5a94619c228 100644
--- a/python/tvm/relax/backend/dispatch_sort_scan.py
+++ b/python/tvm/relax/backend/dispatch_sort_scan.py
@@ -79,10 +79,10 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
             kwargs = {}
             with tgt:
                 if can_use_thrust(tgt, "tvm.contrib.thrust.sort"):
-                    te_func = topi.cuda.sort_thrust
+                    te_func = topi.gpu.sort_thrust
                     kwargs["workspace"] = self.allocate_workspace(call)
                 elif self.is_gpu_target(tgt):
-                    te_func = topi.cuda.sort
+                    te_func = topi.gpu.sort
             return self.builder_.call_te(
                 te_func, call.args[0], call.attrs.axis, not call.attrs.descending, **kwargs
             )
@@ -92,10 +92,10 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
             kwargs = {}
             with tgt:
                 if can_use_thrust(tgt, "tvm.contrib.thrust.sort"):
-                    te_func = topi.cuda.argsort_thrust
+                    te_func = topi.gpu.argsort_thrust
                     kwargs["workspace"] = self.allocate_workspace(call)
                 elif self.is_gpu_target(tgt):
-                    te_func = topi.cuda.argsort
+                    te_func = topi.gpu.argsort
             return self.builder_.call_te(
                 te_func,
                 call.args[0],
@@ -109,10 +109,10 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
             te_func = topi.topk
             kwargs = {}
             if can_use_thrust(tgt, "tvm.contrib.thrust.sort"):
-                te_func = topi.cuda.topk_thrust
+                te_func = topi.gpu.topk_thrust
                 kwargs["workspace"] = self.allocate_workspace(call)
             elif self.is_gpu_target(tgt):
-                te_func = topi.cuda.topk
+                te_func = topi.gpu.topk
             tir_call = self.builder_.call_te(
                 te_func,
                 call.args[0],
@@ -176,11 +176,11 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
 
             with tgt:
                 if call.op.name == "relax.cumsum":
-                    te_func = topi.cuda.cumsum if self.is_gpu_target(tgt) else topi.cumsum
+                    te_func = topi.gpu.cumsum if self.is_gpu_target(tgt) else topi.cumsum
                     if can_use_thrust(tgt, "tvm.contrib.thrust.sum_scan"):
                         kwargs["workspace"] = self.allocate_workspace(call)
                 elif call.op.name == "relax.cumprod":
-                    te_func = topi.cuda.cumprod if self.is_gpu_target(tgt) else topi.cumprod
+                    te_func = topi.gpu.cumprod if self.is_gpu_target(tgt) else topi.cumprod
                 else:
                     raise ValueError(f"Unsupported op: {call.op.name}")
                 tir_call = self.builder_.call_te(
diff --git a/python/tvm/relax/testing/__init__.py b/python/tvm/relax/testing/__init__.py
index dc43d6c1f8ee..2e2e87266c47 100644
--- a/python/tvm/relax/testing/__init__.py
+++ b/python/tvm/relax/testing/__init__.py
@@ -18,7 +18,6 @@
 """The Relax testing namespace containing nn and translator."""
 
 from .nn import *
-from .relay_translator import *
 from .ast_printer import dump_ast
 from .matmul import *
 from .attention import *
diff --git a/python/tvm/relax/testing/relay_translator.py b/python/tvm/relax/testing/relay_translator.py
deleted file mode 100644
index 7b09c9ad5770..000000000000
--- a/python/tvm/relax/testing/relay_translator.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, invalid-name, no-else-return,
-# pylint: disable=too-many-nested-blocks, unused-variable
-"""Relay to Relax translator."""
-
-from typing import Any, Dict, List, Optional, Sequence
-
-import tvm
-from tvm import relax, relay
-from tvm.ir.module import IRModule
-from tvm.ir.instrument import PassInstrument
-from tvm.relax.testing import nn
-from tvm.relay.backend.te_compiler import select_implementation
-from tvm.runtime import NDArray
-from tvm.target import Target
-from tvm.meta_schedule.relay_integration import _autotvm_silencer
-
-
-def from_relay(
-    func: relay.Function,
-    target: Target,
-    relay_params: Optional[Dict[str, NDArray]] = None,
-    *,
-    opt_level: int = 3,
-    pass_config: Optional[Dict[str, Any]] = None,
-    instruments: Optional[Sequence[PassInstrument]] = None,
-    disabled_pass: Optional[List[str]] = None,
-    translate_op_with_tir: Optional[Dict[str, tvm.tir.PrimFunc]] = None,
-    append_op_attrs: bool = False,
-) -> IRModule:
-    """Convert a Relay function into a Relax program.
-
-    Parameters
-    ----------
-    func : relay.Function
-        Relay function to be converted.
-
-    target: Target
-        The target to compile the model, used for selecting topi functions.
-
-    relay_params: Optional[Dict[str, NDArray]]
-        Parameters to bind.
-
-    opt_level: int
-        The optimization level.
-
-    pass_config: Optional[Dict[str, Any]]
-        Pass configuration.
-
-    instruments : Optional[Sequence[PassInstrument]]
-        The list of pass instrument implementations to be passed onto relay
-        while calling relay passes
-
-    disabled_pass: Optional[List[str]]
-        Passes to disable.
-
-    translate_op_with_tir: Optional[Dict[str, tvm.tir.PrimFunc]]
-        Dict that maps op names to user-defined PrimFuncs.
-        Takes relay operator names and forces them to user-defined PrimFuncs during translation.
-
-    append_op_attrs: bool
-        Append relay op attrs to generated prim_funcs
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The Relax IRModule for compilation
-    """
-    # A map to store the mapping of Relay Expr to its corresponding Relax var
-    var_map = {}
-    # The output of the function
-    output_var = None
-
-    if not isinstance(target, Target):
-        target = Target(target)
-    if disabled_pass is None:
-        disabled_pass = []
-    if pass_config is None:
-        pass_config = {
-            "relay.FuseOps.max_depth": 1,  # Disable relay fusion
-            "relay.backend.use_meta_schedule": True,
-            "relay.backend.use_meta_schedule_dispatch": True,
-        }
-
-    if relay_params:
-        func = relay.build_module.bind_params_by_name(func, relay_params)
-
-    params = []
-    tir_var_map: Dict[tvm.tir.Var, tvm.tir.PrimExpr] = dict()
-
-    def convert_shape(shape: List[tvm.tir.PrimExpr]) -> List[tvm.tir.PrimExpr]:
-        """Convert the relay shape to relax shape by changing Any dim to symbolic dim"""
-        ret = []
-        for dim in shape:
-            if isinstance(dim, tvm.tir.IntImm):
-                ret.append(tvm.tir.IntImm("int64", int(dim)))
-            elif isinstance(dim, tvm.tir.Any):
-                ret.append(tvm.tir.Var("d", "int64"))
-            else:
-                ret.append(dim)
-        return ret
-
-    def _copy_undefined_var_in_shape(sinfo: relax.TensorStructInfo):
-        def _visit_expr(e: tvm.tir.PrimExpr):
-            if isinstance(e, tvm.tir.Var) and e not in tir_var_map:
-                new_var = tvm.tir.Var(e.name, e.dtype)
-                tir_var_map[e] = new_var
-
-        assert isinstance(
-            sinfo.shape, relax.ShapeExpr
-        ), "arg with TensorStructInfo in Relay translator must have ShapeExpr shape"
-        for shape_value in sinfo.shape.values:
-            tvm.tir.stmt_functor.post_order_visit(shape_value, _visit_expr)
-
-    def visit_func(node):
-        nonlocal output_var
-        if isinstance(node, relay.Var):
-            if isinstance(node.type_annotation, relay.TensorType):
-                var_map[node] = nn.Placeholder(
-                    tuple(convert_shape(node.type_annotation.shape)),
-                    node.type_annotation.dtype,
-                    node.name_hint,
-                )
-                params.append(var_map[node])
-            else:
-                raise TypeError("The type of relay.Var to be translated must be of TensorType.")
-        elif isinstance(node, relay.Call):
-            args = node.args
-            new_args = []
-            te_inputs = []
-            for arg in args:
-                if arg in var_map:
-                    arg_expr = var_map[arg]
-                    if isinstance(arg_expr.struct_info, relax.TensorStructInfo):
-                        _copy_undefined_var_in_shape(arg_expr.struct_info)
-                        new_args.append(arg_expr)
-                        te_inputs.append(tvm.relax.expr.te_tensor(arg_expr, tir_var_map))
-                    elif isinstance(arg_expr.struct_info, relax.TupleStructInfo):
-                        n_tensor = len(arg_expr.struct_info.fields)
-                        bound_tuple = bb.lookup_binding(arg_expr)
-                        if isinstance(bound_tuple, relax.Tuple):
-                            assert len(bound_tuple) == n_tensor
-                        for i in range(n_tensor):
-                            if isinstance(bound_tuple, relax.Tuple):
-                                item = bb.emit(bound_tuple[i])
-                            else:
-                                item = bb.emit(relax.TupleGetItem(arg_expr, i))
-
-                            assert isinstance(item.struct_info, relax.TensorStructInfo), (
-                                "Relay translator doesn't support Call "
-                                "argument being nested Tensor tuple."
-                            )
-                            _copy_undefined_var_in_shape(item.struct_info)
-                            new_args.append(item)
-                            te_inputs.append(tvm.relax.expr.te_tensor(item, tir_var_map))
-                    else:
-                        raise TypeError(
-                            f"CallTIR argument type being {type(arg_expr.checked_type)} is not "
-                            "supported."
-                        )
-
-            op_name = node.op.name
-            attrs = node.attrs
-            out_type = node.checked_type
-
-            op_attrs_map = {}
-            if append_op_attrs:
-                func_attr_map = {"op_name": op_name}
-                if attrs:
-                    for attr in attrs.keys():
-                        func_attr_map[attr] = attrs[attr]
-
-                op_attrs_map["op_attrs"] = func_attr_map
-
-            if translate_op_with_tir and op_name in translate_op_with_tir:
-                tir_gvar = bb.add_func(translate_op_with_tir[op_name], op_name)
-                call = relax.call_tir(
-                    tir_gvar, new_args, relax.TensorStructInfo(out_type.shape, out_type.dtype)
-                )
-                var = bb.emit(call)
-            else:
-                with target:
-                    best_impl, outputs = select_implementation(
-                        node.op,
-                        attrs,
-                        te_inputs,
-                        out_type,
-                        target,
-                        use_autotvm=False,
-                    )
-                    compute_func = best_impl.compute
-                    name_hint = op_name.split(".")[-1]
-                    var = bb.emit_te(
-                        compute_func,
-                        attrs,
-                        new_args,
-                        node.checked_type,
-                        primfunc_name_hint=name_hint,
-                        primfunc_attrs=op_attrs_map,
-                    )
-
-            output_var = var
-            var_map[node] = var
-        elif isinstance(node, relay.Constant):
-            # fill the shape and checked_type fields of the Constant
-            new_constant = relax.Constant(node.data)
-            var_map[node] = new_constant
-        elif isinstance(node, relay.Tuple):
-            new_fields = []
-            for field in node.fields:
-                if field in var_map:
-                    new_fields.append(var_map[field])
-                else:
-                    raise RuntimeError("field is not in var_map.")
-            new_tuple = relax.Tuple(new_fields)
-            new_tuple_var = relax.BlockBuilder.current().emit(new_tuple)
-            var_map[node] = new_tuple_var
-            output_var = new_tuple_var
-        elif isinstance(node, relay.TupleGetItem):
-            if node.tuple_value in var_map:
-                new_tuple = var_map[node.tuple_value]
-                new_tuple_get_item_node = relax.TupleGetItem(new_tuple, node.index)
-                new_tuple_get_item_var = relax.BlockBuilder.current().emit(new_tuple_get_item_node)
-                var_map[node] = new_tuple_get_item_var
-                output_var = new_tuple_get_item_var
-            else:
-                raise RuntimeError("tuple is not in var_map")
-        elif isinstance(node, relay.Function):
-            cur_bb = relax.BlockBuilder.current()
-            gv = cur_bb.emit_output(output_var)
-            df_block = cur_bb._end_block()
-            cur_bb._func._blocks.append(df_block)
-            cur_bb.emit_func_output(gv, params)
-        elif isinstance(node, tvm.ir.Op):
-            pass
-        else:
-            raise TypeError("{} is not supported yet.".format(str(type(node))))
-
-    # List of subset of relay->relay optimizations
-    # See src/relay/backend/utils.cc::GetPassPrefix() for full list
-    seq = tvm.get_global_func("relay.backend.GetPassPrefixSeq")(True, True)
-
-    # Since optimization passes and OpStrategy are highly context-dependent,
-    # we match the exact same context with `extract_task_from_relay()` env
-    with target, _autotvm_silencer(), tvm.transform.PassContext(
-        opt_level=opt_level,
-        config=pass_config,
-        disabled_pass=disabled_pass,
-        instruments=instruments,
-    ):
-        mod = tvm.IRModule.from_expr(func)
-        mod = seq(mod)
-        bb = relax.BlockBuilder()
-        with bb.function("main"):
-            bb._begin_dataflow_block()
-            relay.analysis.post_order_visit(mod["main"], visit_func)
-
-    return bb.get()
diff --git a/python/tvm/relax/testing/transform.py b/python/tvm/relax/testing/transform.py
index 42dbd37d2931..02c79bd4fa6e 100644
--- a/python/tvm/relax/testing/transform.py
+++ b/python/tvm/relax/testing/transform.py
@@ -21,115 +21,13 @@
 import os
 from typing import Dict, List, Set, Tuple
 import tvm
-from tvm import ir, relax
-from tvm.ir import transform
 from tvm.ir.module import IRModule
-from tvm.ir.transform import PassContext
-from tvm.relax import PyExprMutator
 from tvm.relax.expr import Call, DataflowBlock, Var
-from tvm.relay.backend.te_compiler import select_implementation
 from tvm.runtime.object import Object
-from tvm.target import Target
-
-
-@ir.transform.module_pass(opt_level=0)
-class LowerWithRelayOpStrategyPass(transform.Pass):
-    """Lower Relax Op into TIR by using Relay OpStrategy.
-
-    Since operators like conv2d, add, matmul are relay-, relax- independent,
-    this pass assumes we can always find relay op equivalent for such relax ops,
-    and use Relay Op Strategy (legacy) to perform lowering and find the TOPI implementation.
-
-    Parameters
-    ----------
-    target : Target
-        target info
-
-    Returns
-    -------
-    pass : transform.Pass
-        lowering pass
-    """
-
-    def __init__(self, target: Target):
-        self.target = target
-
-    def transform_module(self, mod: IRModule, ctx: PassContext) -> IRModule:
-        """Implement lowering mechanism.
-
-        Parameters
-        ----------
-        mod : IRModule
-            Input IRModule with Relax ops
-
-        ctx: PassContext
-            Pass context
-
-        Returns
-        -------
-        out_mod : IRModule
-            Output IRModule with lowered TIR functions
-        """
-        target = self.target
-
-        @relax.expr_functor.mutator
-        class Lowerer(PyExprMutator):
-            """Mutator that performs lowering."""
-
-            def visit_call_(self, call_node: Call):
-                # Ignore function calls
-                # We only target calls for operators
-                if isinstance(call_node.op, (relax.GlobalVar, relax.expr.ExternFunc)):
-                    return call_node
-
-                # Current relax op name simply adds "relax." prefix to relay op name.
-                # Thus, remove "relax." prefix to deduce relay op name.
-                relay_op_name = call_node.op.name[6:]
-                # Check if equivalent relay op exists. If not, return the original call.
-                if relay_op_name in ir.Op.list_op_names():
-                    relay_op = ir.Op.get(relay_op_name)
-
-                    # Todo(relax-team): to be revisited - support dyn shape or deprecate.
-                    tir_var_map = dict()
-                    te_inputs = [relax.expr.te_tensor(arg, tir_var_map) for arg in call_node.args]
-                    best_impl_tuple = select_implementation(
-                        relay_op,
-                        call_node.attrs,
-                        te_inputs,
-                        call_node.checked_type,
-                        target,
-                        use_autotvm=False,
-                    )
-                    compute_func = best_impl_tuple[0].compute
-                    # Extract the name of the operator without the prefix
-                    # e.g., for relay op "nn.conv2d", name_hint would be conv2d
-                    name_hint = relay_op_name.split(".")[-1]
-
-                    return self.builder_.call_te(
-                        compute_func,
-                        call_node.attrs,
-                        call_node.args,
-                        call_node.attrs,
-                        primfunc_name_hint=name_hint,
-                    )
-                else:
-                    return call_node
-
-            # TOOD(@team): transform() wapper is necessary to include TIR functions.
-            # IMO, this is bit unintuitive. Can we improve this?
-            def transform(self):
-                for gv, func in mod.functions_items():
-                    if isinstance(func, relax.Function):
-                        updated_func = self.visit_expr(func)
-                        self.builder_.update_func(gv, updated_func)
-                new_mod = self.builder_.get()
-                new_mod = new_mod.with_attrs(mod.attrs) if mod.attrs else new_mod
-                return new_mod
-
-        return Lowerer().transform()
 
 
 def ApplyEmptyCppMutator() -> tvm.ir.transform.Pass:
+    """Return the pass created by the registered ApplyEmptyCppMutator global function."""
     packed_func = tvm.get_global_func("relax.testing.transform.ApplyEmptyCppMutator")
     return packed_func()
 
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
deleted file mode 100644
index ef2b515c3be2..000000000000
--- a/python/tvm/relay/__init__.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name
-"""The Relay IR namespace containing the IR definition and compiler."""
-import os
-from sys import setrecursionlimit
-
-from . import base
-from . import ty
-from . import expr
-from . import function
-from . import type_functor
-from . import expr_functor
-from . import adt
-from . import prelude
-from . import loops
-from . import scope_builder
-from .base import pretty_print, astext
-
-from . import transform
-from . import analysis
-from . import collage
-from .build_module import build, create_executor, optimize
-from .transform import build_config
-from . import debug
-from . import param_dict
-from .backend import vm
-
-# Root operators
-from .op import nn
-from .op import image
-from .op import annotation
-from .op import vision
-from .op import contrib
-from .op import dyn
-from .op import random
-from .op.reduce import *
-from .op.tensor import *
-from .op.transform import *
-from .op.algorithm import *
-from . import frontend
-from . import backend
-from . import quantize
-from . import data_dep_optimization
-
-# Dialects
-from . import qnn
-
-from .scope_builder import ScopeBuilder
-
-# Load Memory Passes
-from .transform import memory_plan
-
-# Parser
-from .parser import parse, parse_expr, fromtext, SpanCheck
-
-# Required to traverse large programs
-setrecursionlimit(10000)
-
-# Span
-Span = base.Span
-SequentialSpan = base.SequentialSpan
-SourceName = base.SourceName
-
-# Type
-Type = ty.Type
-TupleType = ty.TupleType
-TensorType = ty.TensorType
-TypeKind = ty.TypeKind
-TypeVar = ty.TypeVar
-ShapeVar = ty.ShapeVar
-TypeConstraint = ty.TypeConstraint
-FuncType = ty.FuncType
-TypeRelation = ty.TypeRelation
-IncompleteType = ty.IncompleteType
-scalar_type = ty.scalar_type
-RefType = ty.RefType
-GlobalTypeVar = ty.GlobalTypeVar
-TypeCall = ty.TypeCall
-Any = ty.Any
-
-# Expr
-Expr = expr.RelayExpr
-Constant = expr.Constant
-Tuple = expr.Tuple
-Var = expr.Var
-GlobalVar = expr.GlobalVar
-Function = function.Function
-Call = expr.Call
-Let = expr.Let
-If = expr.If
-TupleGetItem = expr.TupleGetItem
-RefCreate = expr.RefCreate
-RefRead = expr.RefRead
-RefWrite = expr.RefWrite
-
-# ADT
-Pattern = adt.Pattern
-PatternWildcard = adt.PatternWildcard
-PatternVar = adt.PatternVar
-PatternConstructor = adt.PatternConstructor
-PatternTuple = adt.PatternTuple
-Constructor = adt.Constructor
-TypeData = adt.TypeData
-Clause = adt.Clause
-Match = adt.Match
-
-# helper functions
-var = expr.var
-const = expr.const
-bind = expr.bind
-
-# TypeFunctor
-TypeFunctor = type_functor.TypeFunctor
-TypeVisitor = type_functor.TypeVisitor
-TypeMutator = type_functor.TypeMutator
-
-# ExprFunctor
-ExprFunctor = expr_functor.ExprFunctor
-ExprVisitor = expr_functor.ExprVisitor
-ExprMutator = expr_functor.ExprMutator
-
-# Prelude
-Prelude = prelude.Prelude
-
-# Scope Builder
-ScopeBuilder = scope_builder.ScopeBuilder
-
-# Param Serialization
-save_param_dict = param_dict.save_param_dict
-load_param_dict = param_dict.load_param_dict
diff --git a/python/tvm/relay/_build_module.py b/python/tvm/relay/_build_module.py
deleted file mode 100644
index 9ee92e0035fa..000000000000
--- a/python/tvm/relay/_build_module.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable
-"""The interface for building Relay functions exposed from C++."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.build_module", __name__)
diff --git a/python/tvm/relay/_ffi_api.py b/python/tvm/relay/_ffi_api.py
deleted file mode 100644
index 8e9b46a14d35..000000000000
--- a/python/tvm/relay/_ffi_api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for Relay program IR."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.ir", __name__)
diff --git a/python/tvm/relay/_ffi_api_parser.py b/python/tvm/relay/_ffi_api_parser.py
deleted file mode 100644
index 731b926b5655..000000000000
--- a/python/tvm/relay/_ffi_api_parser.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for Relay parser."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.parser", __name__)
diff --git a/python/tvm/relay/_make.py b/python/tvm/relay/_make.py
deleted file mode 100644
index 351f7c6575ce..000000000000
--- a/python/tvm/relay/_make.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-The constructors for all Relay AST nodes exposed from C++.
-
-This module includes MyPy type signatures for all of the
-exposed modules.
-"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay._make", __name__)
diff --git a/python/tvm/relay/adt.py b/python/tvm/relay/adt.py
deleted file mode 100644
index df12aaece2da..000000000000
--- a/python/tvm/relay/adt.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import
-"""Algebraic data types in Relay."""
-from tvm.ir import Constructor, TypeData
-from tvm.runtime import Object
-import tvm._ffi
-
-from .base import RelayNode
-from . import _ffi_api
-from .ty import Type
-from .expr import ExprWithOp, RelayExpr, Call
-
-
-class Pattern(RelayNode):
-    """Base type for pattern matching constructs."""
-
-
-@tvm._ffi.register_object("relay.PatternWildcard")
-class PatternWildcard(Pattern):
-    """Wildcard pattern in Relay: Matches any ADT and binds nothing."""
-
-    def __init__(self):
-        """Constructs a wildcard pattern.
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        wildcard: PatternWildcard
-            a wildcard pattern.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.PatternWildcard)
-
-
-@tvm._ffi.register_object("relay.PatternVar")
-class PatternVar(Pattern):
-    """Variable pattern in Relay: Matches anything and binds it to the variable."""
-
-    def __init__(self, var):
-        """Construct a variable pattern.
-
-        Parameters
-        ----------
-        var: tvm.relay.Var
-
-        Returns
-        -------
-        pv: PatternVar
-            A variable pattern.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.PatternVar, var)
-
-
-@tvm._ffi.register_object("relay.PatternConstructor")
-class PatternConstructor(Pattern):
-    """Constructor pattern in Relay: Matches an ADT of the given constructor, binds recursively."""
-
-    def __init__(self, constructor, patterns=None):
-        """Construct a constructor pattern.
-
-        Parameters
-        ----------
-        constructor: Constructor
-            The constructor.
-        patterns: Optional[List[Pattern]]
-            Optional subpatterns: for each field of the constructor,
-            match to the given subpattern (treated as a variable pattern by default).
-
-        Returns
-        -------
-        wildcard: PatternWildcard
-            a wildcard pattern.
-        """
-        if patterns is None:
-            patterns = []
-        self.__init_handle_by_constructor__(_ffi_api.PatternConstructor, constructor, patterns)
-
-
-@tvm._ffi.register_object("relay.PatternTuple")
-class PatternTuple(Pattern):
-    """Constructor pattern in Relay: Matches a tuple, binds recursively."""
-
-    def __init__(self, patterns=None):
-        """Construct a tuple pattern.
-
-        Parameters
-        ----------
-        patterns: Optional[List[Pattern]]
-            Optional subpatterns: for each field of the constructor,
-            match to the given subpattern (treated as a variable pattern by default).
-
-        Returns
-        -------
-        wildcard: PatternWildcard
-            a wildcard pattern.
-        """
-        if patterns is None:
-            patterns = []
-        self.__init_handle_by_constructor__(_ffi_api.PatternTuple, patterns)
-
-
-@tvm._ffi.register_object("relay.Clause")
-class Clause(Object):
-    """Clause for pattern matching in Relay."""
-
-    def __init__(self, lhs, rhs):
-        """Construct a clause.
-
-        Parameters
-        ----------
-        lhs: tvm.relay.Pattern
-            Left-hand side of match clause.
-        rhs: tvm.relay.Expr
-            Right-hand side of match clause.
-
-        Returns
-        -------
-        clause: Clause
-            The Clause.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.Clause, lhs, rhs)
-
-
-@tvm._ffi.register_object("relay.Match")
-class Match(ExprWithOp):
-    """Pattern matching expression in Relay."""
-
-    def __init__(self, data, clauses, complete=True):
-        """Construct a Match.
-
-        Parameters
-        ----------
-        data: tvm.relay.Expr
-            The value being deconstructed and matched.
-
-        clauses: List[tvm.relay.Clause]
-            The pattern match clauses.
-
-        complete: Optional[Bool]
-            Should the match be complete (cover all cases)?
-            If yes, the type checker will generate an error if there are any missing cases.
-
-        Returns
-        -------
-        match: tvm.relay.Expr
-            The match expression.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.Match, data, clauses, complete)
diff --git a/python/tvm/relay/analysis/__init__.py b/python/tvm/relay/analysis/__init__.py
deleted file mode 100644
index ae642e44cf8d..000000000000
--- a/python/tvm/relay/analysis/__init__.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name
-"""The Relay IR namespace containing the analysis passes."""
-# Analysis passes
-from .analysis import *
-
-# Annotations
-from .annotated_regions import AnnotatedRegionSet
-
-# Call graph
-from . import call_graph
-from .call_graph import CallGraph
-
-# Feature
-from . import feature
-from . import sparse_dense
-from . import sparse_conv2d
-
-# Utilities
-from .count_layers import count_layers
diff --git a/python/tvm/relay/analysis/_ffi_api.py b/python/tvm/relay/analysis/_ffi_api.py
deleted file mode 100644
index 20b03c396e70..000000000000
--- a/python/tvm/relay/analysis/_ffi_api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for Relay program analysis."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.analysis", __name__)
diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py
deleted file mode 100644
index 12f659f0037c..000000000000
--- a/python/tvm/relay/analysis/analysis.py
+++ /dev/null
@@ -1,471 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return
-# pylint: disable=unidiomatic-typecheck
-"""
-This file contains the set of passes for Relay, which exposes an interface for
-configuring the passes and scripting them in Python.
-"""
-from ...ir import IRModule
-from ...relay import transform, build_module
-from ...runtime.ndarray import cpu
-
-from . import _ffi_api
-from .feature import Feature
-
-
-def post_order_visit(expr, fvisit):
-    """Recursively visit the ir in post DFS order node,
-    apply fvisit. Each node is guaranteed to be visited
-    only once.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression.
-
-    fvisit : function
-        The visitor function to be applied.
-    """
-    return _ffi_api.post_order_visit(expr, fvisit)
-
-
-def well_formed(expr):
-    """Check that each Var is only bound once (well formed).
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    well_form : bool
-        Whether the input expression is well formed
-    """
-    return _ffi_api.well_formed(expr)
-
-
-def check_kind(t, mod=None):
-    """Check that the type is well kinded and return the kind.
-    For example, this mean type cannot has tensor of tensor, or is a tuple type
-    of 2 shapes.
-
-    Parameters
-    ----------
-    t : tvm.relay.Type
-        The type to check
-
-    mod : Optional[tvm.IRModule]
-        The global module.
-
-    Returns
-    -------
-    kind : Kind
-        the kind of t
-
-    Examples
-    --------
-    .. code:: python
-
-        assert check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Shape)])) == Shape
-        assert check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Type)])) == Type
-    """
-    if mod is not None:
-        return _ffi_api.check_kind(t, mod)
-    else:
-        return _ffi_api.check_kind(t)
-
-
-def check_constant(expr):
-    """Check whether an expression is constant
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    result : bool
-        Whether the expression is constant.
-    """
-    return _ffi_api.check_constant(expr)
-
-
-def check_basic_block_normal_form(expr):
-    """Check whether an expression is in the basic block form
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    result : bool
-        Whether the expression is in the basic block form.
-    """
-    return _ffi_api.check_basic_block_normal_form(expr)
-
-
-def free_vars(expr):
-    """Get free Vars from expression expr in Post DFS order.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    free : List[tvm.relay.Var]
-        The list of free variables in post DFS order.
-
-    Note
-    ----
-    The fact that Vars are post-DFS ordred are useful in
-    neural networks: usually this means weights of previous
-    are ordered first.
-    """
-    return _ffi_api.free_vars(expr)
-
-
-def bound_vars(expr):
-    """Get bound vars from expression expr in post-DFS order.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    free : List[tvm.relay.Var]
-        The list of bound variables in post-DFS order.
-    """
-    return _ffi_api.bound_vars(expr)
-
-
-def all_vars(expr):
-    """Get all vars from expression expr in post-DFS order.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    free : List[tvm.relay.Var]
-        The list of all variables in post-DFS order.
-    """
-    return _ffi_api.all_vars(expr)
-
-
-def free_type_vars(expr, mod=None):
-    """Get free type variables from expression/type e
-
-    Parameters
-    ----------
-    expr : Union[tvm.relay.Expr,tvm.relay.Type]
-        The input expression/type
-
-    mod : Optional[tvm.IRModule]
-        The global module
-
-    Returns
-    -------
-    free : List[tvm.relay.TypeVar]
-        The list of free type variables in post-DFS order
-    """
-    use_mod = mod if mod is not None else IRModule()
-    return _ffi_api.free_type_vars(expr, use_mod)
-
-
-def bound_type_vars(expr, mod=None):
-    """Get bound type variables from expression/type e
-
-    Parameters
-    ----------
-    expr : Union[tvm.relay.Expr,tvm.relay.Type]
-        The input expression/type
-
-    mod : Optional[tvm.IRModule]
-        The global module
-
-    Returns
-    -------
-    free : List[tvm.relay.TypeVar]
-        The list of bound type variables in post-DFS order
-    """
-    use_mod = mod if mod is not None else IRModule()
-    return _ffi_api.bound_type_vars(expr, use_mod)
-
-
-def all_type_vars(expr, mod=None):
-    """Get all type variables from expression/type e
-
-    Parameters
-    ----------
-    expr : Union[tvm.relay.Expr,tvm.relay.Type]
-        The input expression/type
-
-    mod : Optional[tvm.IRModule]
-        The global module
-
-    Returns
-    -------
-    free : List[tvm.relay.TypeVar]
-        The list of all type variables in post-DFS order
-    """
-    use_mod = mod if mod is not None else IRModule()
-    return _ffi_api.all_type_vars(expr, use_mod)
-
-
-def all_dtypes(expr):
-    """Collect set of all data types used in `expr`.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression
-
-    Returns
-    -------
-    ret : Set[String]
-        Set of data types used in the expression (e.g., `{'int8', 'int32'}`)
-    """
-    return set(_ffi_api.all_dtypes(expr))
-
-
-def get_total_mac_number(expr):
-    """
-    Count the number of MACs (multiply-accumulate) of a model
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression.
-
-    Returns
-    -------
-    result : int64
-      The number of MACs (multiply-accumulate) of a model
-    """
-    return _ffi_api.GetTotalMacNumber(expr)
-
-
-def unmatched_cases(match, mod=None):
-    """
-    Finds cases that the match expression does not catch, if any.
-
-    Parameters
-    ----------
-    match : tvm.relay.Match
-        The match expression
-
-    mod : Optional[tvm.IRModule]
-        The module (defaults to an empty module)
-
-    Returns
-    -------
-    missing_patterns : [tvm.relay.Pattern]
-        Patterns that the match expression does not catch.
-    """
-    return _ffi_api.unmatched_cases(match, mod)
-
-
-def detect_feature(a, b=None):
-    """
-    Detect the feature used in a relay program.
-
-    Parameters
-    ----------
-    a : Union[tvm.relay.Expr, tvm.IRModule]
-      The input expression or module.
-
-    b : Optional[Union[tvm.relay.Expr, tvm.IRModule]]
-      The input expression or module.
-      The two arguments cannot both be expression or module.
-
-    Returns
-    -------
-    features : Set[Feature]
-      Features used in the program.
-    """
-    if isinstance(a, IRModule):
-        a, b = b, a
-    return {Feature(int(x)) for x in _ffi_api.detect_feature(a, b)}
-
-
-def extract_fused_functions(mod):
-    """Pass to extract IRModule of only fused primitive functions.
-
-    The ExtractFusedFunctions pass invokes SimplifyInference, FuseOps(3),
-    and ExtractFusedFunctions in that order
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-
-    Returns
-    -------
-    ret : Dict[int, tvm.relay.function.Function]
-        A module containing only fused primitive functions
-    """
-    ret_mod = _ffi_api.ExtractFusedFunctions()(mod)
-    ret = {}
-    for hash_, func in ret_mod.functions.items():
-        ret[hash_] = func
-    return ret
-
-
-def list_op_freqs(mod):
-    """Pass to extract unique operator names and how frequently they appear
-    in an IRModule. Fused functions are traversed to count the operators
-    that compose them.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-
-    Returns
-    -------
-    ret : Dict[str, int]
-        Dict of unique operator names to frequency
-    """
-    return _ffi_api.ExtractOperators(mod)
-
-
-def list_fake_quantized_op_freqs(mod):
-    """Pass to extract fake quantized op names and the frequency that they appear
-    in fake quantized regions of an IRModule.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-
-    Returns
-    -------
-    ret : Dict[str, int]
-        Dict of fake quantized operator names to frequency
-    """
-    return _ffi_api.ExtractFakeQuantizedOps(mod)
-
-
-def search_fc_transpose(expr):
-    """Search fc weight name in the patten: y = nn.dense(x, transpose(w, [1, 0]))
-
-    This function is used in the data_dep_optimization.simplify_fc_transpose method
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-
-    Returns
-    -------
-    ret : Array[String]
-        Array of weight variable name in pattern y = nn.dense(x, transpose(w, [1, 0]))
-    """
-    ret = _ffi_api.search_fc_transpose(expr)
-    return ret
-
-
-def get_calibration_data(mod, data):
-    """Get the calibration data of a given relay graph
-
-    This pass uses the graph executor to get the calibration data of a module, which
-    includes the input and output values of each function. The returned data uses
-    the GlobalVar of each function as a key. Users can further access the inputs and
-    outputs by using `inputs` or  `outputs` as the key.
-
-    Following are some limitations:
-    1. The input module (graph) cannot have control flows.
-    2. The input arguments of each function cannot be tuples (outputs can be tuples).
-    3. We only handle top-level functions (i.e., nested function is not handled).
-    4. We only handle functions with `Compiler` attribute being set.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The input module for collecting the calibration data
-
-    data : Dict[str, NDArray]
-        The input data for running the module
-
-    Returns
-    -------
-    data : Dict[tvm.relay.GlobalVar, Dict[str, NDArray]]
-    """
-    output_map = _ffi_api.get_calibrate_output_map(mod)
-
-    mod = _ffi_api.get_calibrate_module(mod)
-    mod = transform.Inline()(mod)
-
-    ref_res = build_module.create_executor("graph", mod=mod, device=cpu(0)).evaluate()(**data)
-
-    calib_data = {}
-    for gvar, indices in output_map.items():
-        offset = int(indices[0])
-        in_len = int(indices[1])
-        out_len = int(indices[2])
-        value = {
-            "inputs": ref_res[offset : offset + in_len],
-            "outputs": ref_res[offset + in_len : offset + in_len + out_len],
-        }
-        calib_data[gvar] = value
-
-    return calib_data
-
-
-def extract_intermdeiate_expr(mod, expr_id):
-    """Extract Relay Expr by its expression ID
-
-    This function is used for extracting Relay Expr
-    by its expression ID of the main function
-    that we can see in `print(mod["main"])`.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-
-    expr_id : the Expr ID that we want to extract
-
-    Returns
-    -------
-    ret : Extracted IRModule
-
-    Examples
-    --------
-    .. code-block:: python
-
-        # Suppose our module is printed like this:
-        # def @main(%x: Tensor[(1, 1, 5, 1), float32], %w1, %w2) {
-        #   %0 = nn.conv2d(%x, %w1, padding=[1, 1, 1, 1], channels=1, kernel_size=[3, 3]);
-        #   %1 = nn.conv2d(%0, %w2, padding=[1, 1, 1, 1], channels=1, kernel_size=[3, 3]);
-        #   %2 = add(%0, %1);
-        #   %3 = split(%2, indices_or_sections=1);
-        #   %4 = %3.0;
-        #   add(%4, 1f)
-        # }
-        # if we want to extract `%1 = nn.conv2d`
-        from tvm import relay
-
-        relay.analysis.extract_intermdeiate_expr(mod, 1)
-    """
-    return _ffi_api.ExtractIntermediateExpr(mod, expr_id)
diff --git a/python/tvm/relay/analysis/annotated_regions.py b/python/tvm/relay/analysis/annotated_regions.py
deleted file mode 100644
index a18ccb97836b..000000000000
--- a/python/tvm/relay/analysis/annotated_regions.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import
-"""Regions used in Relay."""
-
-from ...runtime import Object
-from . import _ffi_api
-
-
-class AnnotatedRegionSet(Object):
-    """Class to represent a relay expression split into regions."""
-
-    def __init__(self, expr, region_begin_op, region_end_op):
-        """Construct regions from an expression.
-
-        Parameters
-        ----------
-        expr : tvm.relay.Expr
-            The expression from which to construct the regions.
-        region_begin_op : tvm.ir.Op
-            The region begin annotation.
-        region_end_op : tvm.ir.Op
-            The region end annotation.
-
-        """
-        self.__init_handle_by_constructor__(
-            _ffi_api.AnnotatedRegionSet, expr, region_begin_op, region_end_op
-        )
-
-    def __len__(self):
-        return len(self.regions)
-
-    def get_region(self, expr):
-        """Get the region an expression belongs to.
-
-        Parameters
-        ----------
-        expr : tvm.relay.Expr
-            The expression.
-
-        Returns
-        -------
-        region
-            The region containing the expression.
-            None if not found.
-        """
-        return _ffi_api.GetRegion(self, expr)
diff --git a/python/tvm/relay/analysis/call_graph.py b/python/tvm/relay/analysis/call_graph.py
deleted file mode 100644
index fd9704d0af1f..000000000000
--- a/python/tvm/relay/analysis/call_graph.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import
-"""Call graph used in Relay."""
-
-from ...ir import IRModule
-from ...runtime import Object
-from ..expr import GlobalVar
-from . import _ffi_api
-
-
-class CallGraph(Object):
-    """Class to represent a call graph."""
-
-    def __init__(self, module):
-        """Construct a call graph.
-
-        Parameters
-        ----------
-        module : tvm.ir.IRModule
-            The IR module used to create a call graph
-
-        Returns
-        -------
-        call_graph: CallGraph
-            A constructed call graph.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.CallGraph, module)
-
-    @property
-    def module(self):
-        """Return the contained Relay IR module.
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        ret : tvm.ir.IRModule
-            The contained IRModule
-        """
-        return _ffi_api.GetModule(self)
-
-    def ref_count(self, var):
-        """Return the number of references to the global var
-
-        Parameters
-        ----------
-        var : Union[String, tvm.relay.GlobalVar]
-
-        Returns
-        -------
-        ret : int
-            The number reference to the global var
-        """
-        var = self._get_global_var(var)
-        return _ffi_api.GetRefCountGlobalVar(self, var)
-
-    def global_call_count(self, var):
-        """Return the number of global function calls from a given global var.
-
-        Parameters
-        ----------
-        var : Union[String, tvm.relay.GlobalVar]
-
-        Returns
-        -------
-        ret : int
-            The number of global function calls from the given var.
-        """
-        var = self._get_global_var(var)
-        return _ffi_api.GetGlobalVarCallCount(self, var)
-
-    def is_recursive(self, var):
-        """Return if the function corresponding to a var is a recursive
-        function.
-
-        Parameters
-        ----------
-        var : Union[String, tvm.relay.GlobalVar]
-
-        Returns
-        -------
-        ret : Boolean
-            If the function corresponding to var is recurisve.
-        """
-        var = self._get_global_var(var)
-        return _ffi_api.IsRecursive(self, var)
-
-    def _get_global_var(self, var):
-        """Return the global var using a given name or GlobalVar.
-
-        Parameters
-        ----------
-        var : Union[String, tvm.relay.GlobalVar]
-
-        Returns
-        -------
-        ret : tvm.relay.GlobalVar
-            The global var.
-        """
-        if isinstance(var, str):
-            mod = self.module
-            var = mod.get_global_var(var)
-
-        if isinstance(var, GlobalVar):
-            return var
-        else:
-            raise TypeError("var should be either a string or GlobalVar")
-
-    def print_var(self, var):
-        """Print a call graph of a global function by name or by variable.
-
-        Parameters
-        ----------
-        var: Union[String, tvm.relay.GlobalVar]
-            The name or global variable.
-
-        Returns
-        -------
-        ret : String
-            The call graph represented in string.
-        """
-        var = self._get_global_var(var)
-        return _ffi_api.PrintCallGraphGlobalVar(self, var)
-
-    def __str__(self):
-        """Print the call graph in the topological order."""
-        return _ffi_api.PrintCallGraph(self)
diff --git a/python/tvm/relay/analysis/count_layers.py b/python/tvm/relay/analysis/count_layers.py
deleted file mode 100644
index 93d4f2766284..000000000000
--- a/python/tvm/relay/analysis/count_layers.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities that enable counting the number of layers in a graph."""
-import tvm
-from tvm import relay
-from ..expr_functor import ExprVisitor
-
-
-class LayerCounter(ExprVisitor):
-    """A visitor pass that computes the deepest chain of specified ops in graph."""
-
-    def __init__(self, valid_ops):
-        self.depth_count = 0
-        self.deepest_count = 0
-        self.valid_ops = [relay.op.get(op) for op in valid_ops]
-        super().__init__()
-
-    def visit_call(self, call):
-        if call.op in self.valid_ops:
-            self.depth_count += 1
-        current_count = self.depth_count
-        self.deepest_count = max(self.deepest_count, current_count)
-        for arg in call.args:
-            self.visit(arg)
-            self.depth_count = current_count
-
-    def count(self):
-        return self.deepest_count
-
-
-def count_layers(expr, valid_ops):
-    """Determine the number of layers of specified ops in a graph.
-    This pass computes only the deepest chain of ops rather than the
-    total number of ops in a graph. Thus, if there are two parallel
-    convolutions (for example), they would be considered a single layer.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr, tvm.relay.Function, or tvm.ir.IRModule.
-        The input expression.
-
-    valid_ops: List[str]
-        A list of the operations that should be included in the count.
-
-    Returns
-    -------
-    layer_count : int
-        The number of layers of the specified operations found in the graph.
-    """
-    if isinstance(expr, tvm.ir.IRModule):
-        expr = expr["main"]
-    count_pass = LayerCounter(valid_ops)
-    count_pass.visit(expr)
-    return count_pass.count()
diff --git a/python/tvm/relay/analysis/feature.py b/python/tvm/relay/analysis/feature.py
deleted file mode 100644
index 0e264a0eef7d..000000000000
--- a/python/tvm/relay/analysis/feature.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-"""The type nodes of the Relay language."""
-from enum import IntEnum
-
-
-class Feature(IntEnum):
-    """The features a program might contain."""
-
-    fVar = 0
-    fGlobalVar = 1
-    fConstant = 2
-    fTuple = 3
-    fTupleGetItem = 4
-    fFunction = 5
-    fOp = 6
-    fCall = 7
-    fLet = 8
-    fIf = 9
-    fRefCreate = 10
-    fRefRead = 11
-    fRefWrite = 12
-    fConstructor = 13
-    fMatch = 14
-    """ Whether any non-atom fragment of the program is shared, making the program a graph. """
-    fGraph = 15
-    """ Whether there is local fixpoint in the program. """
-    fLetRec = 16
diff --git a/python/tvm/relay/analysis/operations_distribution.py b/python/tvm/relay/analysis/operations_distribution.py
deleted file mode 100644
index 769f9ee88fb6..000000000000
--- a/python/tvm/relay/analysis/operations_distribution.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities that enable analyze Relay and get mappings for
-the unique identifier of the Relay line to the tuple of
-compiler name, composite name and composite/function identifier."""
-import re
-
-import tvm
-from tvm import relay
-from tvm.relay.expr_functor import ExprVisitor
-
-
-class AnalyzeOperationsDistribution(ExprVisitor):
-    """A visitor pass that maintains the dictionary unique_op_ids where
-    the tuple (compiler name, composite name) corresponds to the unique
-    identifier of the Relay line. The identifier will allow us to link
-    the lines of the initial Relay with the information about operators
-    offloading, which is present in the partitioned Relay
-    TVMC compiler adds a unique Relay line identifier as a suffix to the
-    call span field using the tag_suffixes pass if the --dump-offloads
-    option is specified.
-
-    Attributes
-    ----------
-    unique_op_ids : Dict[str, str]
-        Mapping the unique identifier of the Relay line obtained from
-        the "span" field of the Call and the tuple of compiler name,
-        composite name.
-    func_name : str
-        The name of the composite in the partitioned Relay or
-        'generic' in case the Call has not been included in any composite.
-    compiler_name : str
-        A name of the compiler (e.g. 'ethos-u' or 'cmsis-nn') or 'generic'
-        in case the Call has not been included in any composite.
-    """
-
-    def __init__(self):
-        self.unique_op_ids = {}
-        self.func_name = ""
-        self.compiler_name = ""
-        super().__init__()
-
-    def extract(self, call: relay.Call):
-        self.compiler_name = "generic"
-        self.func_name = "generic"
-        if "Compiler" in call.attrs:
-            self.compiler_name = call.attrs["Compiler"]
-        self.visit(call)
-
-    def visit_call(self, call: relay.Call):
-        if isinstance(call.op, tvm.ir.Op):
-            if call.span:
-                src = call.span.source_name.name
-                suffix = tvm.relay.transform.suffixes.SUFFIX_STRING
-                result = re.search(r"(.*)(" + suffix + r")(.*)", src)
-                res = result.group(1)
-                self.unique_op_ids[res] = [self.compiler_name, self.func_name]
-        if isinstance(call.op, relay.Function):
-            self.func_name = call.op.attrs["Composite"]
-        super().visit_call(call)
-
-
-def analyze_operations_distribution(mod):
-    """Traverses the partitioned graph to get the unique identifier
-    of the Relay line from the Call's span field.
-    The result is maintained in the dictionary unique_op_ids where
-    the unique indicator obtained from the op's span corresponds to
-    the tuple (compiler name, composite name).
-    With this information we can annotate the textual representation
-    of the initial Relay by indicating into which target composite
-    and function the operators are converted
-
-    Parameters
-    ----------
-    mod : tvm.ir.IRModule
-        The partitioned Relay graph usually obtained with
-        partition_for_<target> function
-
-    Returns
-    -------
-    unique_op_ids : Dict[str, str]
-        Mapping from the unique identifier of the Relay line to the tuple of
-        compiler name, composite name.
-    """
-    analyze = AnalyzeOperationsDistribution()
-    for _, func in mod.functions.items():
-        analyze.extract(func)
-    return analyze.unique_op_ids
diff --git a/python/tvm/relay/analysis/sparse_conv2d.py b/python/tvm/relay/analysis/sparse_conv2d.py
deleted file mode 100644
index 043cff989a33..000000000000
--- a/python/tvm/relay/analysis/sparse_conv2d.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return
-# pylint: disable=unidiomatic-typecheck
-"""
-This file contains helper functions for convert dense model
-to block sparse model
-"""
-from collections import namedtuple
-
-import numpy as np
-
-import tvm
-
-from . import _ffi_api
-
-SparseAnalysisResult = namedtuple(
-    "SparseAnalysisResult",
-    [
-        "weight_name",
-        "weight_shape",
-    ],
-)
-
-
-def _search_conv2d_op_weight(expr):
-    """Search name of weight in all ```nn.conv2d``` operator
-       This is a helpful function to determine which param need
-       to be converted to sparse
-
-    Parameters
-    ----------
-    expr : relay.Expr
-        Expr will be searched
-
-    Returns
-    -------
-    ret : Array[String]
-        name of weight in all ``nn.conv2d``` operator
-    """
-    return _ffi_api.search_conv2d_op_weight(expr)
-
-
-def process_params(
-    expr, params, block_size, sparsity_threshold, layout, kernel_size, reg_task_input=True
-):
-    """Process parameters of conv2d from dense to sparse.
-
-    Parameters
-    ----------
-    expr : Relay.Expr
-        Expr of the network
-    params : Dict[String, tvm.nd.array]
-        parameters of the network
-    block_size : Tuple(int, int)
-        Blocksize in BSR matrix
-    sparsity_threshold : float
-        Minimal sparsity requirement for converting to sparse operation
-    layout : str
-        layout of network
-
-    Returns
-    -------
-    ret : Namedtuple[weight_name: Array[String], weight_shape: Array[Array[IntImm]]]
-        return names of qualified conv2d weight and the shape in BSR format
-    """
-
-    # pylint: disable=import-outside-toplevel
-    import scipy.sparse as sp
-
-    from tvm.auto_scheduler.search_task import (  # lazily import to avoid recursive dependency
-        register_task_input_buffer,
-    )
-
-    memo = SparseAnalysisResult(weight_name=[], weight_shape=[])
-    weight_names = _search_conv2d_op_weight(expr)
-    for name in weight_names:
-        name = str(name)
-        w_np = params[name].numpy()
-
-        if layout == "NHWC":  # HWIO
-            weight_kernel = (w_np.shape[0], w_np.shape[1])
-        elif layout == "NCHW":  # OIHW
-            weight_kernel = (w_np.shape[2], w_np.shape[3])
-        if weight_kernel[0] != weight_kernel[1]:
-            continue
-
-        if weight_kernel[0] == kernel_size == 1:
-            sparsity = 1.0 - (np.count_nonzero(w_np) / w_np.size)
-            if sparsity < sparsity_threshold:
-                continue
-            if layout == "NHWC":
-                w_np = w_np.squeeze().T
-            elif layout == "NCHW":
-                w_np = w_np.squeeze()
-
-            sparse_weight = sp.bsr_matrix(w_np, blocksize=block_size)
-
-            # when bs_c=1, remove this dim
-            if block_size[1] == 1:
-                sparse_weight_data = sparse_weight.data.reshape(
-                    sparse_weight.data.shape[0], block_size[0]
-                )
-            else:
-                sparse_weight_data = sparse_weight.data
-        elif weight_kernel[0] == kernel_size == 3:
-            if layout == "NHWC":  # HWIO
-                w_np = w_np.reshape((-1, w_np.shape[-1])).T
-            elif layout == "NCHW":  # OIHW
-                w_np = w_np.reshape((w_np.shape[0], -1))
-            sparse_weight = sp.bsr_matrix(w_np, blocksize=block_size)
-            if 1 - (sparse_weight.nnz / w_np.size) < sparsity_threshold:
-                continue
-            sparse_weight_data = sparse_weight.data
-        else:
-            continue
-
-        # remove dense weight
-        del params[name]
-        memo.weight_name.append(name)
-        memo.weight_shape.append(
-            list(sparse_weight_data.shape)
-            + list(sparse_weight.indices.shape)
-            + list(sparse_weight.indptr.shape)
-        )
-        params[name + ".data"] = tvm.nd.array(sparse_weight_data)
-        params[name + ".indices"] = tvm.nd.array(sparse_weight.indices)
-        params[name + ".indptr"] = tvm.nd.array(sparse_weight.indptr)
-
-        if reg_task_input:
-            prefix = "sparse_conv2d_bsr_%d_%d_%d_%d_%d_%d_" % (
-                w_np.shape[0],
-                w_np.shape[1],
-                block_size[0],
-                block_size[1],
-                sparse_weight.indices.shape[0],
-                sparse_weight.indptr.shape[0],
-            )
-            register_task_input_buffer(
-                "default",
-                prefix + "W_data",
-                tvm.runtime.ndarray.array(sparse_weight_data),
-                overwrite=True,
-            )
-            register_task_input_buffer(
-                "default",
-                prefix + "W_indices",
-                tvm.runtime.ndarray.array(sparse_weight.indices),
-                overwrite=True,
-            )
-            register_task_input_buffer(
-                "default",
-                prefix + "W_indptr",
-                tvm.runtime.ndarray.array(sparse_weight.indptr),
-                overwrite=True,
-            )
-    ret = SparseAnalysisResult(
-        weight_name=tvm.runtime.convert(memo.weight_name),
-        weight_shape=tvm.runtime.convert(memo.weight_shape),
-    )
-    return ret
diff --git a/python/tvm/relay/analysis/sparse_dense.py b/python/tvm/relay/analysis/sparse_dense.py
deleted file mode 100644
index 16a724813de8..000000000000
--- a/python/tvm/relay/analysis/sparse_dense.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return
-# pylint: disable=unidiomatic-typecheck
-"""
-This file contains helper functions for convert dense model
-to block sparse model
-"""
-from collections import namedtuple
-
-import numpy as np
-
-import tvm
-
-from . import _ffi_api
-
-SparseAnalysisResult = namedtuple(
-    "SparseAnalysisResult",
-    [
-        "weight_name",
-        "weight_shape",
-    ],
-)
-
-
-def _search_dense_op_weight(expr):
-    """Search name of weight in all ```nn.dense``` operator
-       This is a helpful function to determine which param need
-       to be converted to sparse
-
-    Parameters
-    ----------
-    expr : relay.Expr
-        Expr will be searched
-
-    Returns
-    -------
-    ret : Array[String]
-        name of weight in all ``nn.dense``` operator
-    """
-    return _ffi_api.search_dense_op_weight(expr)
-
-
-def process_params(expr, params, block_size, sparsity_threshold):
-    """[summary]
-
-    Parameters
-    ----------
-    expr : Relay.Expr
-        Expr of the network
-    params : Dict[String, tvm.nd.array]
-        parameters of the network
-    block_size : Tuple(int, int)
-        Blocksize in BSR matrix
-    sparsity_threshold : float
-        Minimal sparsity requirement for converting to sparse operation
-
-    Returns
-    -------
-    ret : Namedtuple[weight_name: Array[String], weight_shape: Array[Array[IntImm]]]
-        return names of qualified dense weight and the shape in BSR format
-    """
-
-    # pylint: disable=import-outside-toplevel
-    import scipy.sparse as sp
-
-    from tvm.auto_scheduler.search_task import (  # lazily import to avoid recursive dependency
-        register_task_input_buffer,
-    )
-
-    memo = SparseAnalysisResult(weight_name=[], weight_shape=[])
-    weight_names = _search_dense_op_weight(expr)
-    for name in weight_names:
-        name = str(name)
-        w_np = params[name].numpy()
-        sparsity = 1.0 - (np.count_nonzero(w_np) / w_np.size)
-        if sparsity >= sparsity_threshold:
-            sparse_weight = sp.bsr_matrix(w_np, blocksize=block_size)
-            # remove dense weight
-            del params[name]
-            memo.weight_name.append(name)
-            memo.weight_shape.append(
-                list(sparse_weight.data.shape)
-                + list(sparse_weight.indices.shape)
-                + list(sparse_weight.indptr.shape)
-            )
-            params[name + ".data"] = tvm.nd.array(sparse_weight.data)
-            params[name + ".indices"] = tvm.nd.array(sparse_weight.indices)
-            params[name + ".indptr"] = tvm.nd.array(sparse_weight.indptr)
-
-            prefix = "sparse_dense_bsr_%d_%d_%d_%d_%d_%d_" % (
-                w_np.shape[0],
-                w_np.shape[1],
-                block_size[0],
-                block_size[1],
-                sparse_weight.indices.shape[0],
-                sparse_weight.indptr.shape[0],
-            )
-            register_task_input_buffer(
-                "default",
-                prefix + "W_data",
-                tvm.runtime.ndarray.array(sparse_weight.data),
-                overwrite=True,
-            )
-            register_task_input_buffer(
-                "default",
-                prefix + "W_indices",
-                tvm.runtime.ndarray.array(sparse_weight.indices),
-                overwrite=True,
-            )
-            register_task_input_buffer(
-                "default",
-                prefix + "W_indptr",
-                tvm.runtime.ndarray.array(sparse_weight.indptr),
-                overwrite=True,
-            )
-    ret = SparseAnalysisResult(
-        weight_name=tvm.runtime.convert(memo.weight_name),
-        weight_shape=tvm.runtime.convert(memo.weight_shape),
-    )
-    return ret
diff --git a/python/tvm/relay/backend/__init__.py b/python/tvm/relay/backend/__init__.py
deleted file mode 100644
index b6a402b0f30f..000000000000
--- a/python/tvm/relay/backend/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Backend codegen modules for relay."""
-from . import te_compiler
-from .executor import Executor
-from .runtime import Runtime
diff --git a/python/tvm/relay/backend/_aot.py b/python/tvm/relay/backend/_aot.py
deleted file mode 100644
index 437cd71c4c35..000000000000
--- a/python/tvm/relay/backend/_aot.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The AOT FFI namespace.
-"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.backend.aot", __name__)
diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py
deleted file mode 100644
index b377eefdb2c5..000000000000
--- a/python/tvm/relay/backend/_backend.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The interface of expr function exposed from C++."""
-import tvm._ffi
-import tvm.driver
-
-
-@tvm._ffi.register_func("relay.backend.build")
-def build(mod, target, target_host=None):
-    """Backend build function.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule or Dict[str, tvm.IRModule]
-        Input module
-
-    target : tvm.Target
-        The target to run the code on.
-
-    target_host : tvm.Target
-        The host target.
-
-    Returns
-    -------
-    module : tvm.Module
-        The runtime module.
-    """
-    target_host = None if target_host == "" else target_host
-    return tvm.driver.build(mod, target=target, target_host=target_host)
-
-
-@tvm._ffi.register_func("relay._tensor_value_repr")
-def _tensor_value_repr(tvalue):
-    return str(tvalue.data.numpy())
-
-
-@tvm._ffi.register_func("relay._constant_repr")
-def _tensor_constant_repr(tvalue):
-    dtype = tvm.runtime.DataType(tvalue.data.dtype)
-    if tvm.target.datatype.get_type_registered(dtype.type_code):
-        return "custom tensor of type " + dtype.type_code
-    return str(tvalue.data.numpy())
-
-
-tvm._ffi._init_api("relay.backend", __name__)
diff --git a/python/tvm/relay/backend/_vm.py b/python/tvm/relay/backend/_vm.py
deleted file mode 100644
index cffbbdccde5a..000000000000
--- a/python/tvm/relay/backend/_vm.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The Relay virtual machine FFI namespace.
-"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay._vm", __name__)
diff --git a/python/tvm/relay/backend/aot.py b/python/tvm/relay/backend/aot.py
deleted file mode 100644
index 778c9b4164dd..000000000000
--- a/python/tvm/relay/backend/aot.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""AOT passes"""
-from typing import Dict
-
-from tvm import IRModule
-from tvm.relay.backend import Executor
-from tvm.ir.transform import Pass
-from .utils import CallType
-
-from . import _aot
-
-
-def AOTLowerMain(mod_name: str, config: object, call_type: CallType) -> Pass:
-    """Lower a Relay main function into an AOT TIR main function.
-
-    Parameters
-    ----------
-    mod_name: str
-        The name of the module.
-    config : CompilationConfig
-        The compilation configuration.
-    call_type : CallType
-        The calling convention to use.
-
-    Returns
-    -------
-    Pass
-        The AOTLowerMain pass.
-
-    """
-    return _aot.AOTLowerMain(mod_name, config, call_type.value)
-
-
-def CreateFunctionMetadata(
-    mod: IRModule, workspace_byte_alignment: int, constant_byte_alignment: int
-) -> Dict[str, object]:
-    """Create the function metadata (FunctionInfos) from an AOT module.
-
-    Parameters
-    ----------
-    mod : IRModule
-        The IRModule.
-    workspace_byte_alignment : int
-        The alignment of the workspace buffer in bytes.
-    constant_byte_alignment : int
-        The alignment of the constant buffer in bytes.
-
-    Returns
-    -------
-    Dict[str, FunctionInfo]
-        A map between function names and FunctionInfos.
-
-    """
-    return _aot.CreateFunctionMetadata(mod, workspace_byte_alignment, constant_byte_alignment)
-
-
-def CreateExecutorMetadata(
-    mod: IRModule,
-    mod_name: str,
-    executor: Executor,
-    workspace_byte_alignment: int,
-    constant_byte_alignment: int,
-) -> object:
-    """Create the executor metadata from an AOT module.
-
-    Parameters
-    ----------
-    mod : IRModule
-        The IRModule.
-    mod_name : str
-        The name of the module.
-    executor : Executor
-        The executor configuration.
-    workspace_byte_alignment : int
-        The alignment of the workspace buffer in bytes.
-    constant_byte_alignment : int
-        The alignment of the constant buffer in bytes.
-
-    Returns
-    -------
-    ExecutorCodegenMetadata
-        The executor metadata.
-
-    """
-    return _aot.CreateExecutorMetadata(
-        mod, mod_name, executor, workspace_byte_alignment, constant_byte_alignment
-    )
diff --git a/python/tvm/relay/backend/contrib/__init__.py b/python/tvm/relay/backend/contrib/__init__.py
deleted file mode 100644
index 599b24592c4f..000000000000
--- a/python/tvm/relay/backend/contrib/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""External backend codegen modules for Relay."""
diff --git a/python/tvm/relay/backend/contrib/uma/__init__.py b/python/tvm/relay/backend/contrib/uma/__init__.py
deleted file mode 100644
index 061a42e23a87..000000000000
--- a/python/tvm/relay/backend/contrib/uma/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""UMA modules for Relay."""
-
-from .backend import UMABackend
-from .api.utils import uma_available
-
-__all__ = ["UMABackend", "uma_available"]
diff --git a/python/tvm/relay/backend/contrib/uma/api/__init__.py b/python/tvm/relay/backend/contrib/uma/api/__init__.py
deleted file mode 100644
index f826a56016fa..000000000000
--- a/python/tvm/relay/backend/contrib/uma/api/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""UMA: Universal Modular Accelerator Interface API"""
-
-from .codegen import UMACodegen
-from .lower import UMALower
-from .partitioner import UMAPartitioner
-
-
-__all__ = ["UMACodegen", "UMALower", "UMAPartitioner"]
diff --git a/python/tvm/relay/backend/contrib/uma/api/_ffi_api.py b/python/tvm/relay/backend/contrib/uma/api/_ffi_api.py
deleted file mode 100644
index 5f67cb7ec246..000000000000
--- a/python/tvm/relay/backend/contrib/uma/api/_ffi_api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for relay transformation passes."""
-import tvm._ffi  # type: ignore
-
-tvm._ffi._init_api("relay.ext.uma", __name__)
diff --git a/python/tvm/relay/backend/contrib/uma/api/codegen.py b/python/tvm/relay/backend/contrib/uma/api/codegen.py
deleted file mode 100644
index 8bbb77c91b44..000000000000
--- a/python/tvm/relay/backend/contrib/uma/api/codegen.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Codegen base class of the Universal Modular Accelerator Interface (UMA)"""
-
-from typing import Callable, Optional
-import tvm
-
-
-class UMACodegen(object):
-    """
-    Codegen base class of the Universal Modular Accelerator Interface (UMA)
-    """
-
-    def __init__(self, target_name: str) -> None:
-        self.target_name = target_name
-
-    def _register_codegen(
-        self, fmt: str = "c", includes: Optional[Callable[[], str]] = None, **kwargs
-    ) -> None:
-        """Registration codegen in UMA.
-
-        Parameters
-        ----------
-        fmt: str
-            format of codegen. Currently only "c" is supported.
-        includes : OptionalCallable[[], str]]
-            user-defined function that adds C-#include statement to UMA C-Code.
-        """
-        if fmt == "c":
-            self._register_c_codegen(includes, **kwargs)
-        else:
-            raise RuntimeError(f'Unsupported codegen format "{fmt}"')
-
-    def _register_c_codegen(self, includes: Optional[Callable[[], str]] = None) -> None:
-        """Registration of UMA helper functions, e.g. includes and replace_call_extern.
-
-        Parameters
-        ----------
-        includes : OptionalCallable[[], str]]
-            user-defined function that adds C-#include statement to UMA C-Code.
-        """
-        if includes is not None:
-            tvm._ffi.register_func(
-                f"relay.ext.uma.codegen_c_includes_{self.target_name}",
-                includes,
-                override=True,
-            )
-
-    def register(self) -> None:
-        pass
diff --git a/python/tvm/relay/backend/contrib/uma/api/lower.py b/python/tvm/relay/backend/contrib/uma/api/lower.py
deleted file mode 100644
index 334b6d101f82..000000000000
--- a/python/tvm/relay/backend/contrib/uma/api/lower.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Lowering base class of the Universal Modular Accelerator Interface (UMA)"""
-
-from typing import List, Tuple, Callable, Optional
-
-import tvm
-from tvm import relay, te
-from tvm.relay.op.op import register_strategy
-from . import _ffi_api
-from .utils import PassPhase
-
-OperatorStrategies = List[
-    Tuple[
-        str,
-        Callable[
-            [tvm.ir.Attrs, tvm.ir.Array, tvm.ir.TensorType, tvm.target.Target],
-            tvm.relay.op.op.OpStrategy,
-        ],
-        Optional[int],
-    ]
-]
-
-
-class UMALower:
-    """Lowering base class of the Universal Modular Accelerator Interface (UMA)."""
-
-    def __init__(self, target_name: str) -> None:
-        self.target_name = target_name
-        self._operator_strategies: OperatorStrategies = []
-        self._tir_passes: List[Tuple[PassPhase, tvm.tir.transform.PrimFuncPass]] = []
-
-    def _lower_relay_to_tir(self, relay_prim_func: relay.Function) -> tvm.tir.PrimFunc:
-        """Lower a Relay primitive function to a S-TIR primitive function.
-
-        Parameters
-        ----------
-        prim_func : tvm.relay.Function
-            The Relay function to lower.
-
-        Returns
-        -------
-        out : tvm.tir.PrimFunc
-            The lowered schedulable TensorIR primitive function.
-
-        """
-
-        def _get_tensors(te_cached_func):
-            return list(te_cached_func.inputs) + list(te_cached_func.outputs)
-
-        lower_to_te = tvm._ffi.get_global_func("relay.backend.LowerToTE")
-        te_cached_func = lower_to_te(relay_prim_func)
-        x = _get_tensors(te_cached_func)
-        tir_prim_func = te.create_prim_func(x)
-        tir_prim_func = tir_prim_func.with_attr(
-            "global_symbol", relay_prim_func.attrs["global_symbol"]
-        )
-
-        compiler_attr = relay_prim_func.attrs["Compiler"]
-        target = tvm.target.Target.current()
-        if target.kind.name != compiler_attr:
-            target = tvm.target.Target(compiler_attr)
-
-        tir_prim_func = tir_prim_func.with_attr("target", target)
-        tir_prim_func = tir_prim_func.with_attr("relay_attrs", relay_prim_func.attrs)
-        return tir_prim_func
-
-    def _lower_stir_to_nstir(self, prim_func: tvm.tir.PrimFunc) -> tvm.tir.PrimFunc:
-        """Lower a S-TIR primitive function to a NS-TIR primitive function.
-
-        Parameters
-        ----------
-        prim_func : tvm.tir.PrimFunc
-            The primitive function to lower.
-
-        Returns
-        -------
-        out : tvm.tir.PrimFunc
-            The lowered non-schedulable TensorIR primitive function.
-
-        """
-        curr_ctxt = tvm.transform.PassContext().current()
-        assert "tir.add_lower_pass" not in curr_ctxt.config
-
-        pass_map = {
-            PassPhase.TIR_PHASE_0: 0,
-            PassPhase.TIR_PHASE_1: 1,
-            PassPhase.TIR_PHASE_2: 2,
-            PassPhase.TIR_PHASE_3: 3,
-        }
-        lower_passes = [(pass_map[k], v) for k, v in self._tir_passes]
-
-        with tvm.transform.PassContext(
-            opt_level=curr_ctxt.opt_level,
-            required_pass=curr_ctxt.required_pass,
-            disabled_pass=curr_ctxt.disabled_pass,
-            instruments=curr_ctxt.instruments,
-            config={**dict(curr_ctxt.config), "tir.add_lower_pass": lower_passes},
-        ):
-            mod = tvm.lower(tvm.ir.IRModule.from_expr(prim_func))
-        prim_func = mod[prim_func.attrs["global_symbol"]]
-        return prim_func
-
-    def relay_to_tir(self, mod: tvm.ir.IRModule) -> tvm.ir.IRModule:
-        """
-        This is the hook for python-based lowering of a Relay module which lowers NPU
-        external functions to TIR.
-
-        Parameters
-        ----------
-        mod : tvm.ir.IRModule
-            This is the Relay module.
-
-        Returns
-        -------
-        mod : tvm.ir.IRModule
-            The Relay module with scheduled NPU external functions.
-        """
-        mod = _ffi_api.OutlineCompilerFunctions(self.target_name)(mod)
-        for gvar, func in mod.functions.items():
-            if "Compiler" in func.attrs and func.attrs["Compiler"] == self.target_name:
-                func = self._lower_relay_to_tir(func)
-                func = self._lower_stir_to_nstir(func)
-                mod.update_func(gvar, func)
-        return mod
-
-    def register(self) -> None:
-        """Register all relevant relay-to-tir functions."""
-        tvm._ffi.register_func(f"relay.ext.uma.{self.target_name}.relay_to_tir", self.relay_to_tir)
-        for op, strategy, plevel in self._operator_strategies:
-            register_strategy(op, strategy, plevel)
diff --git a/python/tvm/relay/backend/contrib/uma/api/partitioner.py b/python/tvm/relay/backend/contrib/uma/api/partitioner.py
deleted file mode 100644
index 48cac81d13d8..000000000000
--- a/python/tvm/relay/backend/contrib/uma/api/partitioner.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Partitioner base class of the Universal Modular Accelerator Interface (UMA)"""
-
-from typing import Callable, Dict, List, Tuple, Optional
-
-import tvm
-from tvm import relay
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.op.contrib.register import register_pattern_table
-from .utils import PassPhase
-
-
-PatternTable = List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]
-
-
-class UMAPartitioner:
-    """Partitioner base class of the Universal Modular Accelerator Interface (UMA)."""
-
-    def __init__(self, target_name: str, merge_compiler_regions: bool = True) -> None:
-        self.target_name = target_name
-        self.merge_compiler_regions = merge_compiler_regions
-
-        self._relay_passes: List[Tuple[PassPhase, tvm.transform.Pass]] = []
-        self._patterns: PatternTable = []
-
-    def add_pattern(
-        self,
-        name: str,
-        pattern: tvm.relay.dataflow_pattern.DFPattern,
-        predicate: Optional[Callable] = None,
-    ) -> None:
-        """Add pattern to UMA partitioner
-
-        Parameters
-        ----------
-        name : str
-            relay name of pattern
-
-        pattern: tvm.relay.dataflow_pattern.DFPattern
-            pattern description as DFPattern
-
-        predicate: Optional[Callable]
-            Optional predicate
-
-        """
-
-        name = self.target_name + "." + name
-        if predicate:
-            self._patterns.append((name, pattern, predicate))
-        else:
-            self._patterns.append((name, pattern))
-
-    def _pattern_table(self) -> PatternTable:
-        return self._patterns
-
-    def register(self) -> None:
-        """Register all relevant relay-to-relay functions."""
-        register_pattern_table(self.target_name, self._pattern_table)
-
-    def partition(
-        self, mod: tvm.IRModule, params: Optional[Dict[str, tvm.runtime.NDArray]] = None
-    ) -> tvm.IRModule:
-        """Partition the relay graph in parts supported and unsupported by the
-        target hardware accelerator.
-
-        Parameters
-        ----------
-        mod : tvm.IRModule
-            The relay module to be partitioned.
-
-        params: Optional[Dict[str, tvm.runtime.NDArray]]
-
-        Returns
-        -------
-        out : tvm.IRModule
-            The partitioned relay module.
-
-        """
-        if params:
-            mod["main"] = bind_params_by_name(mod["main"], params)
-
-        pass_sequence = []
-        pass_sequence.extend(
-            [p[1] for p in self._relay_passes if p[0] == PassPhase.PRE_PARTITIONING]
-        )
-        pass_sequence.append(relay.transform.MergeComposite(self._pattern_table()))
-        pass_sequence.append(relay.transform.AnnotateTarget(self.target_name))
-        if self.merge_compiler_regions:
-            pass_sequence.append(relay.transform.MergeCompilerRegions())
-        pass_sequence.append(relay.transform.PartitionGraph())
-        pass_sequence.extend(
-            [p[1] for p in self._relay_passes if p[0] == PassPhase.POST_PARTITIONING_0]
-        )
-
-        sequential_passes = tvm.transform.Sequential(pass_sequence)
-        mod = sequential_passes(mod)
-
-        # Defunctionalize the partitioned functions to allow lowering
-        for gvar, func in mod.functions.items():
-            mod.update_func(gvar, relay.transform.Defunctionalization(func, mod))
-
-        post_partition_passes_1 = tvm.transform.Sequential(
-            [p[1] for p in self._relay_passes if p[0] == PassPhase.POST_PARTITIONING_1]
-        )
-        mod = post_partition_passes_1(mod)
-
-        return mod
diff --git a/python/tvm/relay/backend/contrib/uma/api/utils.py b/python/tvm/relay/backend/contrib/uma/api/utils.py
deleted file mode 100644
index 42a25ea7630c..000000000000
--- a/python/tvm/relay/backend/contrib/uma/api/utils.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Utility methods for the Universal Modular Accelerator Interface (UMA)"""
-
-from enum import Enum, auto
-import uuid
-
-import tvm
-import tvm.tir
-from tvm.contrib import utils, clang
-
-
-def uma_available() -> bool:
-    registration_func = tvm.get_global_func(
-        "relay.backend.contrib.uma.RegisterTarget", allow_missing=True
-    )
-    return registration_func is not None
-
-
-class PassPhase(Enum):
-    """
-    UMA pass phases:
-
-    PRE_PARTITIONING: prior to UMA partitioning
-    POST_PARTITIONING_0: after UMA partitioning, before Defunctionalization
-    POST_PARTITIONING_1: after UMA partitioning and after Defunctionalization
-    TIR_PHASE_0: Generates the raw IR and loop levels.
-    TIR_PHASE_1: Flattens the array storage.
-    TIR_PHASE_2: Transforms loops, like unroll, vectorization and thread-binding.
-    TIR_PHASE_3: Does some cleanup work.
-
-    Reference to TIR phases: src/driver/driver_api.c
-    """
-
-    PRE_PARTITIONING = auto()
-    POST_PARTITIONING_0 = auto()
-    POST_PARTITIONING_1 = auto()
-    TIR_PHASE_0 = auto()
-    TIR_PHASE_1 = auto()
-    TIR_PHASE_2 = auto()
-    TIR_PHASE_3 = auto()
-
-
-def _c_to_llvm(c_code: str) -> str:
-    unique_filename = str(uuid.uuid4())
-    temp = utils.tempdir()
-    ll_path = temp.relpath(f"{unique_filename}.ll")
-    ll_code = clang.create_llvm([c_code], output=ll_path)
-    return ll_code
-
-
-def add_llvm_to_block(
-    sch: tvm.tir.Schedule, block_name: str, c_code_str: str = ""
-) -> tvm.tir.Schedule:
-    block = sch.get_block(block_name)
-    loops = sch.get_loops(block)
-    assert len(loops) > 0
-    sch.annotate(loops[0], "pragma_import_llvm", _c_to_llvm(c_code_str))
-    return sch
diff --git a/python/tvm/relay/backend/contrib/uma/backend.py b/python/tvm/relay/backend/contrib/uma/backend.py
deleted file mode 100644
index 8aa6931939df..000000000000
--- a/python/tvm/relay/backend/contrib/uma/backend.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unnecessary-ellipsis
-"""Backend base class of the Universal Modular Accelerator Interface (UMA)"""
-
-from abc import ABC, abstractmethod
-from typing import Union, Dict, Callable, Optional, Any
-
-import tvm
-from tvm.relay.backend.contrib.uma.api.codegen import UMACodegen
-from tvm.relay.backend.contrib.uma.api.lower import UMALower
-from tvm.relay.backend.contrib.uma.api.partitioner import UMAPartitioner
-from tvm.relay.backend.contrib.uma.api.utils import PassPhase
-
-
-class UMABackend(ABC):
-    """Backend base class of the Universal Modular Accelerator Interface (UMA)"""
-
-    def __init__(self, merge_compiler_regions: bool = True) -> None:
-        self._target_attrs: Dict = {}
-        self._target_preprocessor: Callable[[str], Dict[str, Any]] = None
-        self._relay_to_relay = UMAPartitioner(self.target_name, merge_compiler_regions)
-        self._relay_to_tir = UMALower(self.target_name)
-        self._tir_to_runtime = UMACodegen(self.target_name)
-
-    @property
-    @abstractmethod
-    def target_name(self) -> str:
-        """Name of the hardware target.
-
-        Returns
-        -------
-        out : str
-            The hardware target name.
-        """
-        ...
-
-    # Target configuration
-    def _register_target_attr(
-        self,
-        name: str,
-        default: Optional[Union[str, int, bool]] = "",
-    ) -> None:
-        """Register a target attribute name that can be used during target instantiation.
-        Parameters
-        ----------
-        name: str
-           The name of the target attribute.
-
-        default: Optional[Union[str, int, bool]]
-            A default value for the attribute.
-            If none is provided, the attribute will be treated as a string.
-
-        Example
-        -------
-        Here is an example of how two attribute options are registered.
-
-        .. code-block:: python
-
-            self._register_target_attr("attrA", default=0)
-            self._register_target_attr("attrB", default=False)
-        """
-        self._target_attrs[name] = default
-
-    # Relay to Relay function registration
-    def _register_relay_pass(self, phase: PassPhase, relay_pass: tvm.transform.Pass) -> None:
-        """Registers a relay pass at the given phase in the lowering process.
-
-        Parameters
-        ----------
-        phase: PassPhase
-           The phase at which the pass is registered.
-
-        relay_pass: tvm.transform.Pass
-            The relay pass to be registered.
-
-        Example
-        -------
-        Here is an example of how two relay passes are registered.
-        Passes of the same phase are executed in the order they are registered.
-
-        .. code-block:: python
-
-            self._register_relay_pass(PassPhase.PRE_PARTITIONING, MyPassA)
-            self._register_relay_pass(PassPhase.POST_PARTITIONING, MyPassB)
-
-        Where a relay pass can look like this:
-
-        .. code-block:: python
-
-            @tvm.ir.transform.module_pass(opt_level=0)
-            class MyPassA:
-                def transform_module(self, mod, ctx):
-                    # My pass functionality...
-                    return mod
-        """
-        self._relay_to_relay._relay_passes.append((phase, relay_pass))
-
-    def _register_pattern(
-        self,
-        name: str,
-        pattern: tvm.relay.dataflow_pattern.DFPattern,
-        predicate: Optional[Callable] = None,
-    ) -> None:
-        """Registers a dataflow pattern that is used to partition the relay graph.
-
-        Parameters
-        ----------
-        name: str
-           The name of the pattern
-
-        pattern: tvm.relay.dataflow_pattern.DFPattern
-            Relay DFPattern
-
-        predicate: Optional[Callable]
-            Optional predicate for Relay DFPattern
-        Example
-        -------
-        Here is an example of how two dataflow patterns are registered.
-        During partioning, patterns are searched in order of registration.
-
-        .. code-block:: python
-
-            self._register_pattern("conv1d", conv1d_pattern)
-            self._register_pattern("conv2d", conv2d_pattern)
-
-        Where a dataflow pattern can look like this:
-
-        .. code-block:: python
-
-            conv1d_pattern = is_op("nn.conv1d")(wildcard(), wildcard())
-            optional_bias = lambda x: is_op("nn.bias_add")(x, wildcard())
-            optional_relu = lambda x: is_op("nn.relu")(x)
-            conv1d_pattern = conv1d_pattern.optional(optional_bias).optional(optional_relu)
-        """
-        self._relay_to_relay.add_pattern(name, pattern, predicate)
-
-    # Relay to TIR function registration
-    def _register_operator_strategy(
-        self,
-        op: str,
-        strategy: Callable[
-            [tvm.ir.Attrs, tvm.ir.Array, tvm.ir.TensorType, tvm.target.Target],
-            tvm.relay.op.op.OpStrategy,
-        ],
-        plevel: Optional[int] = 11,
-    ) -> None:
-        """Registers an operator strategy that is used to partition the relay graph.
-
-        Parameters
-        ----------
-        op: str
-           The name of the operator for which this strategy will be registered.
-
-        strategy: Callable[[tvm.ir.Attrs, tvm.ir.Array, tvm.ir.TensorType, tvm.target.Target],
-                            tvm.relay.op.op.OpStrategy]
-            The strategy function.
-
-        plevel: Optional[int] = 11
-            The priority level of the strategy. Higher plevel equals higher priorization.
-            The TVM default for topi strategies is 10 so by default new UMA strategies are
-            always used.
-
-        Example
-        -------
-        Here is an example of how two operator strategies are registered.
-
-        .. code-block:: python
-
-            self._register_operator_strategy("nn.conv1d", custom_conv1d_strategy)
-            self._register_operator_strategy("nn.conv2d", custom_conv2d_strategy)
-
-        Where a strategy function can look like this:
-
-        .. code-block:: python
-
-            @relay.op.strategy.override_native_generic_func("custom_conv1d_strategy")
-            def custom_conv1d_strategy(attrs, inputs, out_type, target):
-                strategy = _op.OpStrategy()
-                strategy.add_implementation(
-                    wrap_compute_conv1d(custom_conv1d_compute),
-                    wrap_topi_schedule(custom_conv1d_schedule),
-                    name="custom_conv1d.generic",
-                return strategy
-        """
-        self._relay_to_tir._operator_strategies.append((op, strategy, plevel))
-
-    def _register_tir_pass(
-        self, phase: PassPhase, tir_pass: tvm.tir.transform.PrimFuncPass
-    ) -> None:
-        """Registers a TIR pass at the given phase in the lowering process.
-
-        Parameters
-        ----------
-        phase: PassPhase
-           The phase at which the pass is registered.
-
-        tir_pass: tvm.tir.transform.PrimFuncPass
-            The TIR pass to be registered.
-        Example
-        -------
-        Here is an example of how two TIR passes are registered.
-        Passes of the same phase are executed in the order they are registered.
-
-        .. code-block:: python
-
-            self._register_tir_pass(PassPhase.TIR_PHASE_0, MyPassA)
-            self._register_tir_pass(PassPhase.TIR_PHASE_1, MyPassB)
-
-        Where a TIR pass can look like this:
-
-        .. code-block:: python
-
-            @tvm.tir.transform.prim_func_pass(opt_level=0)
-            class MyPassA:
-                def transform_function(self, func, mod, ctx):
-                    # My pass functionality...
-                    return func
-        """
-        self._relay_to_tir._tir_passes.append((phase, tir_pass))
-
-    # TIR to runtime function registration
-    def _register_codegen(self, fmt: str = "c", **kwargs) -> None:
-        """Registers a codegen which is used in place of the default C-codegen.
-
-        Parameters
-        ----------
-        fmt: str
-            The codegen format. For now, only C-codegen is supported by UMA.
-
-        **kwargs
-            Keyword arguments for the chosen codegen.
-
-        Example
-        -------
-        Here is an example of how the custom C-codegen is registered and configured.
-        Passes of the same phase are executed in the order they are registered.
-
-        .. code-block:: python
-
-            self._register_codegen(
-                fmt="c", includes=gen_includes
-            )
-
-        The C-codegen currently provides one hook which allows the user to insert code through
-        the python API.
-            - `includes` hooks into the include stream and allows insertion of custom includes.
-
-
-        The code generation functions can look like this:
-
-        .. code-block:: python
-
-            def gen_includes() -> str:
-                includes = "#include <my_custom_header.h>\n"
-                return includes
-        """
-        self._tir_to_runtime._register_codegen(fmt, **kwargs)
-
-    # Backend functions
-    def register(self) -> None:
-        """
-        Registering UMABackend:
-         registering target attributes, relay_to_relay, relay_to_tir and tir_to_runtime
-        """
-        registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget")
-
-        for _, attr in self._target_attrs.items():
-            if attr is None:
-                raise ValueError("Target attribute None is not supported.")
-        # skip if target is already registered
-        if self.target_name not in tvm.target.Target.list_kinds():
-            registration_func(self.target_name, self._target_attrs)
-            self._relay_to_relay.register()
-            self._relay_to_tir.register()
-            self._tir_to_runtime.register()
-
-    def partition(
-        self, mod: tvm.IRModule, params: Optional[Dict[str, tvm.runtime.NDArray]] = None
-    ) -> tvm.IRModule:
-        return self._relay_to_relay.partition(mod, params)
diff --git a/python/tvm/relay/backend/executor.py b/python/tvm/relay/backend/executor.py
deleted file mode 100644
index 854473f662c0..000000000000
--- a/python/tvm/relay/backend/executor.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=len-as-condition,no-else-return,invalid-name
-"""Executor configuration"""
-
-import tvm
-from tvm.runtime import Object
-
-from . import _backend
-
-
-@tvm._ffi.register_object
-class Executor(Object):
-    """Executor configuration"""
-
-    flag_registry_name = "executor"
-
-    def __init__(self, name, options=None) -> None:
-        if options is None:
-            options = {}
-        self.__init_handle_by_constructor__(_backend.CreateExecutor, name, options)
-        self._init_wrapper()
-
-    # Note:  sometimes the _attrs field is not properly populated,
-    # most likely since __new__ is called instead of __init__ in tvm/_ffi/_ctypes/object.py
-    def _init_wrapper(self):
-        self._attrs = _backend.GetExecutorAttrs(self)
-        self._init_wrapper_called = True
-
-    def _check_init_wrapper(self):
-        if not (hasattr(self, "_init_wrapper_called") and self._init_wrapper_called):
-            self._init_wrapper()
-
-    def __contains__(self, name):
-        self._check_init_wrapper()
-        return name in self._attrs
-
-    def __getitem__(self, name):
-        self._check_init_wrapper()
-        return self._attrs[name]
-
-    def __eq__(self, other):
-        self._check_init_wrapper()
-        return str(other) == str(self) and dict(other._attrs) == dict(self._attrs)
-
-    @staticmethod
-    def list_registered():
-        """Returns a list of possible executors"""
-        return list(_backend.ListExecutors())
-
-    @staticmethod
-    def list_registered_options(executor):
-        """Returns the dict of available option names and types"""
-        return dict(_backend.ListExecutorOptions(str(executor)))
diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py
deleted file mode 100644
index dd77a3f9bd9a..000000000000
--- a/python/tvm/relay/backend/executor_factory.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Executor factory modules."""
-from abc import abstractmethod
-import warnings
-
-from ..._ffi.base import string_types
-from ..._ffi.registry import get_global_func
-from ...runtime import ndarray
-
-
-class ExecutorFactoryModule:
-    """Common interface for executor factory modules
-    This class describes the common API of different
-    factory modules
-    """
-
-    @abstractmethod
-    def get_executor_config(self):
-        """Return the internal configuration the executor uses to execute the network"""
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_params(self):
-        """Return the compiled parameters."""
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_lib(self):
-        """Return the generated library"""
-        raise NotImplementedError
-
-    def __getitem__(self, item):
-        return self.module.__getitem__(item)
-
-    def __iter__(self):
-        warnings.warn(
-            "legacy graph executor behavior of producing json / lib / params will be "
-            "removed in the next release."
-            " Please see documents of tvm.contrib.graph_executor.GraphModule for the "
-            " new recommended usage.",
-            DeprecationWarning,
-            2,
-        )
-        return self
-
-    def __next__(self):
-        if self.iter_cnt > 2:
-            raise StopIteration
-
-        objs = [self.get_executor_config(), self.lib, self.params]
-        obj = objs[self.iter_cnt]
-        self.iter_cnt += 1
-        return obj
-
-
-class AOTExecutorFactoryModule(ExecutorFactoryModule):
-    """AOT executor factory module.
-
-    Attributes
-    ----------
-    ir_mod : :py:class:`~tvm.IRModule`
-        The IR module to build.
-    lowered_ir_mods : dict[Target, IRModule]
-        The IR modules lowered per Target.
-    target : tvm.Target
-        The Target used to build this module.
-    executor : tvm.relay.backend.Executor
-        Internal representation of the Executor
-    runtime : tvm.relay.backend.Runtime
-        Internal representation of the Runtime
-    libmod : tvm.Module
-        The module of the corresponding function
-    libmod_name: str
-        The name of module
-    params : dict of str to NDArray
-        The parameters of module
-    function_metadata : Map of String to FunctionInfo
-        This holds a map function names to their information
-    devices : List[str]
-        List of devices used in the module
-    """
-
-    def __init__(
-        self,
-        ir_mod,
-        lowered_ir_mods,
-        target,
-        executor,
-        runtime,
-        libmod,
-        libmod_name,
-        params,
-        function_metadata,
-        executor_codegen_metadata,
-        devices,
-    ):
-        fcreate = get_global_func("tvm.aot_executor_factory.create")
-        args = []
-        for k, v in params.items():
-            args.append(k)
-            args.append(ndarray.array(v))
-
-        self.module = fcreate(libmod, libmod_name, *args)
-        self.ir_mod = ir_mod
-        self.lowered_ir_mods = lowered_ir_mods
-        self.target = target
-        self.executor = executor
-        self.runtime = runtime
-        self.lib = libmod
-        self.libmod_name = libmod_name
-        self.params = params
-        self.iter_cnt = 0
-        self.function_metadata = function_metadata
-        self.executor_codegen_metadata = executor_codegen_metadata
-        self.devices = devices
-
-    def get_devices(self):
-        return self.devices
-
-    def get_params(self):
-        return self.params
-
-    def get_executor_config(self):
-        return None
-
-    def get_lib(self):
-        return self.lib
-
-    def export_library(self, file_name, fcompile=None, addons=None, **kwargs):
-        return self.module.export_library(file_name, fcompile=fcompile, addons=addons, **kwargs)
-
-
-class GraphExecutorFactoryModule(ExecutorFactoryModule):
-    """Graph executor factory module.
-    This is a module of graph executor factory
-
-    Attributes
-    ----------
-    ir_mod : :py:class:`~tvm.IRModule`
-        The IR module to build.
-    target : tvm.Target
-        The Target used to build this module.
-    executor : tvm.relay.backend.Executor
-        Internal representation of the Executor
-    graph_json_str : the json graph to be deployed in json format output by graph compiler.
-        The graph can contain operator(tvm_op) that points to the name of
-        PackedFunc in the libmod.
-    libmod : tvm.Module
-        The module of the corresponding function
-    libmod_name: str
-        The name of module
-    params : dict of str to NDArray
-        The parameters of module
-    function_metadata : Map of String to FunctionInfo
-        This holds a map function names to their information
-    """
-
-    def __init__(
-        self,
-        ir_mod,
-        target,
-        executor,
-        graph_json_str,
-        libmod,
-        libmod_name,
-        params,
-        function_metadata,
-    ):
-        assert isinstance(graph_json_str, string_types)
-        fcreate = get_global_func("tvm.graph_executor_factory.create")
-        args = []
-        for k, v in params.items():
-            args.append(k)
-            args.append(ndarray.array(v))
-
-        self.ir_mod = ir_mod
-        self.target = target
-        self.executor = executor
-        self.module = fcreate(graph_json_str, libmod, libmod_name, *args)
-        self.graph_json = graph_json_str
-        self.lib = libmod
-        self.libmod_name = libmod_name
-        self.params = params
-        self.iter_cnt = 0
-        self.function_metadata = function_metadata
-
-    def export_library(self, file_name, fcompile=None, addons=None, **kwargs):
-        return self.module.export_library(file_name, fcompile=fcompile, addons=addons, **kwargs)
-
-    def get_devices(self):
-        return []
-
-    def get_params(self):
-        return self.params
-
-    def get_graph_json(self):
-        return self.graph_json
-
-    def get_executor_config(self):
-        return self.graph_json
-
-    def get_lib(self):
-        return self.lib
diff --git a/python/tvm/relay/backend/graph_executor_codegen.py b/python/tvm/relay/backend/graph_executor_codegen.py
deleted file mode 100644
index aff41c76f89c..000000000000
--- a/python/tvm/relay/backend/graph_executor_codegen.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-A compiler from a Relay expression to TVM's graph executor.
-
-The compiler is built from a few pieces.
-
-First we define a compiler from a single Relay expression to the
-graph language. We require the expression to be a function.
-The function's parameters correspond to the placeholder/inputs
-and model parameters found in the computation graph representation.
-The body of the function represents the computation graph.
-
-The compiler's output is a program in the graph language, which is composed of
-Node, NodeRef, InputNode, OpNode. This "little language" represents programs in
-TVM's graph format.
-
-To connect to the graph executor, we use a printer that converts our graph format
-into TVM's JSON format. The resulting string can be loaded by
-contrib.graph_executor or any other TVM runtime compatible systems.
-"""
-from tvm.runtime.ndarray import empty
-from tvm.relay import _build_module
-from tvm.target import Target
-from .utils import mangle_module_name
-
-
-class GraphExecutorCodegen(object):
-    """The compiler from Relay to the TVM runtime system."""
-
-    def __init__(self, mod, target):
-        self._mod = _build_module._GraphExecutorCodegen()
-        self._init = self._mod["init"]
-        self._codegen = self._mod["codegen"]
-        self._get_graph_json = self._mod["get_graph_json"]
-        self._list_params_name = self._mod["list_params_name"]
-        self._get_param_by_name = self._mod["get_param_by_name"]
-        self._get_irmodule = self._mod["get_irmodule"]
-        self._setup(mod, target)
-
-    def _setup(self, mod, target):
-        raw_targets = Target.canon_multi_target_and_host(target)
-        self._init(mod, raw_targets)
-
-    def codegen(self, ir_module, func):
-        """Compile a single function into a graph.
-
-        Parameters
-        ----------
-        ir_module: tvm.ir.Module
-            The module to compile
-        func: tvm.relay.Expr
-            The function to compile.
-
-        Returns
-        -------
-        graph_json : str
-            The graph json that can be consumed by runtime.
-        mod : IRModule or Dict[Target, IRModule]
-            The lowered functions.
-        params : Dict[str, tvm.nd.NDArray]
-            Additional constant parameters.
-        """
-        default_mod_name = mangle_module_name("default")
-        self._codegen(ir_module, func, default_mod_name)
-        graph_json = self._get_graph_json()
-        lowered_func = self._get_irmodule()
-        param_names = self._list_params_name()
-        params = {}
-        for key in param_names:
-            arr = self._get_param_by_name(key)
-            param = empty(arr.shape, dtype=arr.dtype, device=arr.device)
-            arr.copyto(param)
-            params[key] = param
-        return graph_json, lowered_func, params
diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py
deleted file mode 100644
index 80a8880fbc37..000000000000
--- a/python/tvm/relay/backend/interpreter.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, broad-exception-raised
-"""The Python interface to the Relay reference interpreter."""
-from __future__ import absolute_import
-
-import numpy as np
-
-import tvm._ffi
-from tvm.runtime import container, Object
-
-from . import _backend
-from .. import _make, analysis
-from ... import nd
-from ..expr import Tuple, RefCreate, Call, Constant, GlobalVar, const
-from ..function import Function
-from ..scope_builder import ScopeBuilder
-
-
-@tvm._ffi.register_object("relay.ConstructorValue")
-class ConstructorValue(Object):
-    def __init__(self, tag, fields, constructor):
-        self.__init_handle_by_constructor__(_make.ConstructorValue, tag, fields, constructor)
-
-
-@tvm._ffi.register_object("relay.RefValue")
-class RefValue(Object):
-    def __init__(self, value):
-        self.__init_handle_by_constructor__(_make.RefValue, value)
-
-
-def _arg_to_ast(mod, arg):
-    if isinstance(arg, nd.NDArray):
-        return Constant(arg.copyto(nd.cpu(0)))
-    elif isinstance(arg, container.ADT):
-        return Tuple([_arg_to_ast(mod, field) for field in arg])
-    elif isinstance(arg, tuple):
-        return Tuple([_arg_to_ast(mod, field) for field in arg])
-    elif isinstance(arg, RefValue):
-        return RefCreate(_arg_to_ast(mod, arg.value))
-    elif isinstance(arg, ConstructorValue):
-        return Call(mod.get_constructor(arg.tag), [_arg_to_ast(mod, field) for field in arg.fields])
-    elif isinstance(arg, np.ndarray):
-        return Constant(nd.array(arg))
-    elif isinstance(arg, Constant):
-        return arg
-    else:
-        return const(arg)
-
-
-class Executor(object):
-    """An abstract interface for executing Relay programs."""
-
-    def _convert_args(self, expr, args, kwargs):
-        """
-        Convert the combination of arguments and keyword arguments
-        into a sequence of arguments that may be passed to
-        a Relay evaluator.
-
-        We first provide all positional arguments, and then attempt
-        to fill in the remaining arguments using the keyword arguments. We
-        map the keyword arguments to the corresponding parameters, if there
-        is an ambiguity between positional and keyword arguments this
-        procedure will raise an error.
-
-        Parameters
-        ----------
-        expr: relay.Expr
-            The expression to evaluate
-
-        args: List[tvm.nd.NDArray]
-            The arguments to pass to the evaluator.
-
-        kwargs: Dict[str, tvm.NDArrray]
-            The keyword arguments to pass to the evaluator.
-
-        Returns:
-            args: List[tvm.nd.NDArray]
-                The new arguments with all keyword arguments placed in the correct slot.
-        """
-        assert expr is not None
-
-        if not kwargs:
-            return args
-
-        if kwargs and not isinstance(expr, Function):
-            raise Exception(
-                f"can only supply keyword parameters for a relay.Function, found {expr}"
-            )
-
-        params = expr.params
-        param_names = [p.name_hint for p in params]
-        num_of_args = len(args)
-
-        cargs = list(args)[:]
-        for i, name in enumerate(param_names):
-            if i < num_of_args:
-                if kwargs.get(name):
-                    raise Exception(
-                        f"duplicate argument supplied in "
-                        f"both positional args (at position: {i}), "
-                        f"and keyword argument (with name: {name})"
-                    )
-            else:
-                cargs.append(kwargs[name])
-
-        if len(cargs) != len(params):
-            raise Exception(
-                f"insufficient arguments, expected " f"{len(cargs)}, provided {len(params)}"
-            )
-
-        return tuple(cargs)
-
-    def _make_executor(self, expr=None):
-        """
-        Construct a Python function that implements the evaluation
-        of expression.
-
-        Parameters
-        ----------
-        expr: Optional[relay.Expr]
-            The Relay expression to execute.
-
-        Returns
-        -------
-        executor: function,
-            A Python function which implements the behavior of `expr`.
-        """
-        raise NotImplementedError()
-
-    def evaluate(self, expr=None, binds=None):
-        """
-        Evaluate a Relay expression on the executor.
-
-        Parameters
-        ----------
-        expr: Optional[tvm.relay.Expr]
-            The expression to evaluate.
-
-        binds: Optional[Map[tvm.relay.Var, tvm.relay.Expr]]
-            Additional binding of free variable.
-
-        Returns
-        -------
-        val : Union[function, Object]
-            The evaluation result.
-        """
-        if binds:
-            scope_builder = ScopeBuilder()
-            for key, value in binds.items():
-                scope_builder.let(key, _arg_to_ast(self.mod, value))
-            scope_builder.ret(expr)
-            expr = scope_builder.get()
-
-        if not expr:
-            return self._make_executor()
-
-        if isinstance(expr, Function):
-            assert not analysis.free_vars(expr)
-
-        if isinstance(expr, (Function, GlobalVar)):
-            return self._make_executor(expr)
-
-        # normal expression evaluated by running a function.
-        # TODO(mbs): This should really be type rather than syntax driven.
-        func = Function([], expr)
-        return self._make_executor(func)()
-
-
-class Interpreter(Executor):
-    """
-    Simple interpreter interface.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The module to support the execution.
-
-    device : Device
-        The runtime device to run the code on.
-
-    target : tvm.Target
-        The target option to build the function. Only homogeneous execution is supported.
-
-    CAUTION: Despite the API the module is prepared upon each call to evaluate
-    rather than once in create_executor.
-    That is:
-    .. code-block:: python
-
-        executor = relay.create_executor(kind="debug", mod=module)
-        a = executor.evaluate(expr)(args1)
-        b = executor.evaluate(expr)(args2)
-
-    will prepare all the bindings in module twice. For efficiency, try to hoist
-    calls to evaluate as high as possible, preferably immediately after create_executor:
-    .. code-block:: python
-
-        func = relay.create_executor(kind="debug", mod=module).evaluate(expr)
-        a = func(args1)
-        b = func(args2)
-    """
-
-    def __init__(self, mod, device, target):
-        self.mod = mod
-        self.device = device
-        self.target = target
-
-    def _make_executor(self, expr=None):
-        if expr is None or isinstance(expr, GlobalVar):
-            assert self.mod is not None
-
-        if expr is None:
-            # A missing expr denotes 'main' in the given module.
-            expr = self.mod.get_global_var("main")
-
-        # Evaluate expr to a packed function we can efficiently re-apply
-        # to Relay arguments.
-        func = _backend.EvalFunction(self.mod, expr, self.device, self.target)
-
-        def _apply_args(*args, **kwargs):
-            if isinstance(expr, GlobalVar):
-                # When expanding args, look inside the actual global definition so kwargs
-                # can be matched.
-                args = self._convert_args(self.mod[expr.name_hint], args, kwargs)
-            else:
-                args = self._convert_args(expr, args, kwargs)
-            # Reflect python arguments up into Relay.
-            relay_args = []
-            for arg in args:
-                relay_args.append(_arg_to_ast(self.mod, arg))
-            # Apply func to Relay args
-            return func(relay_args)
-
-        return _apply_args
diff --git a/python/tvm/relay/backend/name_transforms.py b/python/tvm/relay/backend/name_transforms.py
deleted file mode 100644
index bbf51a8e24b1..000000000000
--- a/python/tvm/relay/backend/name_transforms.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Name transformation functions for use in code generation
-"""
-
-from typing import List, Union
-
-from tvm import TVMError
-from . import _backend
-
-
-def to_c_function_style(original_name: str):
-    """Transform a name to the C function style assuming it is
-    appropriately constructed using the prefixing functions
-
-    Parameters
-    ----------
-    original_name : str
-        Original name to transform
-    """
-    return _backend.ToCFunctionStyle(original_name)
-
-
-def to_c_variable_style(original_name: str):
-    """Transform a name to the C variable style assuming it is
-    appropriately constructed using the prefixing functions
-
-    Parameters
-    ----------
-    original_name : str
-        Original name to transform
-    """
-    return _backend.ToCVariableStyle(original_name)
-
-
-def to_c_constant_style(original_name: str):
-    """Transform a name to the C constant style assuming it is
-    appropriately constructed using the prefixing functions
-
-    Parameters
-    ----------
-    original_name : str
-        Original name to transform
-    """
-    return _backend.ToCConstantStyle(original_name)
-
-
-def _preprocess_names(names: Union[List[str], str]):
-    """Preprocesses name strings into format for C++ functions
-
-    Parameters
-    ----------
-    names : Union[List[str], str]
-        List of names to combine to form a combined name or the name itself
-    """
-    if isinstance(names, str):
-        if names == "":
-            raise TVMError("Name is empty")
-        return [names]
-    return names
-
-
-def prefix_name(names: Union[List[str], str]):
-    """Apply TVM-specific prefix to a function name
-
-    Parameters
-    ----------
-    names : Union[List[str], str]
-        List of names to combine to form a combined name or the name itself
-    """
-
-    return _backend.PrefixName(_preprocess_names(names))
-
-
-def prefix_generated_name(names: Union[List[str], str]):
-    """Apply generated TVM-specific prefix to a function name
-
-    Parameters
-    ----------
-    names : Union[List[str], str]
-        List of names to combine to form a combined name or the name itself
-    """
-
-    return _backend.PrefixGeneratedName(_preprocess_names(names))
diff --git a/python/tvm/relay/backend/runtime.py b/python/tvm/relay/backend/runtime.py
deleted file mode 100644
index b93c8076e698..000000000000
--- a/python/tvm/relay/backend/runtime.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=len-as-condition,no-else-return,invalid-name
-"""Runtime configuration"""
-
-import tvm
-from tvm.runtime import Object
-
-from . import _backend
-
-
-@tvm._ffi.register_object
-class Runtime(Object):
-    """Runtime configuration"""
-
-    flag_registry_name = "runtime"
-
-    def __init__(self, name, options=None) -> None:
-        if options is None:
-            options = {}
-        self.__init_handle_by_constructor__(_backend.CreateRuntime, name, options)
-        self._attrs = _backend.GetRuntimeAttrs(self)
-
-    def __contains__(self, name):
-        return name in self._attrs
-
-    def __getitem__(self, name):
-        self._attrs = _backend.GetRuntimeAttrs(self)
-        return self._attrs[name]
-
-    def __eq__(self, other):
-        return str(other) == str(self) and dict(other._attrs) == dict(self._attrs)
-
-    @staticmethod
-    def list_registered():
-        """Returns a list of possible runtimes"""
-        return list(_backend.ListRuntimes())
-
-    @staticmethod
-    def list_registered_options(runtime):
-        """Returns the dict of available option names and types"""
-        return dict(_backend.ListRuntimeOptions(str(runtime)))
diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py
deleted file mode 100644
index 84e4ecbaecfb..000000000000
--- a/python/tvm/relay/backend/te_compiler.py
+++ /dev/null
@@ -1,437 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=len-as-condition,no-else-return,invalid-name
-"""TE compiler engine (replacing legacy compile_engine)."""
-from __future__ import absolute_import
-
-import logging
-
-import numpy as np
-import tvm
-from tvm import autotvm, te
-from tvm.auto_scheduler import is_auto_scheduler_enabled
-from tvm.meta_schedule import is_meta_schedule_enabled
-from tvm.runtime import Object
-from tvm.support import libinfo
-from tvm.target import Target
-
-from .. import function as _function
-from .. import ty as _ty
-from ..backend.utils import mangle_module_name
-from . import _backend
-
-logger = logging.getLogger("te_compiler")
-autotvm_logger = logging.getLogger("autotvm")
-
-_first_warning = True
-
-
-@tvm._ffi.register_object("relay.LoweredOutput")
-class LoweredOutput(Object):
-    """Lowered output"""
-
-    def __init__(self, outputs, implement):
-        self.__init_handle_by_constructor__(_backend._make_LoweredOutput, outputs, implement)
-
-
-@tvm._ffi.register_object("relay.CCacheKey")
-class CCacheKey(Object):
-    """Key in the TE Compiler.
-
-    Parameters
-    ----------
-    source_func : tvm.relay.Function
-        The source function.
-
-    target : tvm.Target
-        The target we want to run the function on.
-    """
-
-    def __init__(self, source_func, target):
-        self.__init_handle_by_constructor__(_backend._make_CCacheKey, source_func, target)
-
-
-@tvm._ffi.register_object("relay.CCacheValue")
-class CCacheValue(Object):
-    """Value in the TE Compiler, including usage statistics."""
-
-
-def _get_cache_key(source_func, target):
-    if isinstance(source_func, _function.Function):
-        if isinstance(target, str):
-            target = Target(target)
-            if not target:
-                raise ValueError("Need target when source_func is a Function")
-        return CCacheKey(source_func, target)
-    if not isinstance(source_func, CCacheKey):
-        raise TypeError("Expect source_func to be CCacheKey")
-    return source_func
-
-
-def get_valid_implementations(op, attrs, inputs, out_type, target):
-    """Get all valid implementations from the op strategy.
-
-    Note that this function doesn't support op with symbolic input shapes.
-
-    Parameters
-    ----------
-    op : tvm.ir.Op
-        Relay operator.
-
-    attrs : object
-        The op attribute.
-
-    inputs : List[tvm.te.Tensor]
-        Input tensors to the op.
-
-    out_type : relay.Type
-        The output type.
-
-    target : tvm.target.Target
-        The target to compile the op.
-
-    Returns
-    -------
-    ret : List[relay.op.OpImplementation]
-        The list of all valid op implementations.
-    """
-    fstrategy = op.get_attr("FTVMStrategy")
-    assert fstrategy is not None, (
-        f"{op.name} doesn't have an FTVMStrategy registered. You can register "
-        f"one in python with `tvm.relay.op.register_strategy`."
-    )
-    with target:
-        strategy = fstrategy(attrs, inputs, out_type, target)
-    analyzer = tvm.arith.Analyzer()
-    ret = []
-    for spec in strategy.specializations:
-        if spec.condition:
-            # check if all the clauses in the specialized condition are true
-            flag = True
-            for clause in spec.condition.clauses:
-                clause = analyzer.canonical_simplify(clause)
-                if isinstance(clause, tvm.tir.IntImm) and clause.value:
-                    continue
-                flag = False
-                break
-            if flag:
-                for impl in spec.implementations:
-                    ret.append(impl)
-        else:
-            for impl in spec.implementations:
-                ret.append(impl)
-    return ret
-
-
-def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True):
-    """Select the best implementation from the op strategy.
-
-    If use_autotvm is True, it'll first try to find the best implementation
-    based on AutoTVM profile results. If no AutoTVM profile result is found,
-    it'll choose the implementation with highest plevel.
-
-    If use_autotvm is False, it'll directly choose the implementation with
-    highest plevel.
-
-    Note that this function doesn't support op with symbolic input shapes.
-
-    Parameters
-    ----------
-    op : tvm.ir.Op
-        Relay operator.
-
-    attrs : object
-        The op attribute.
-
-    inputs : List[tvm.te.Tensor]
-        Input tensors to the op.
-
-    out_type : relay.Type
-        The output type.
-
-    target : tvm.target.Target
-        The target to compile the op.
-
-    use_autotvm : bool
-        Whether query AutoTVM to pick the best.
-
-    Returns
-    -------
-    ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
-        The best op implementation and the corresponding output tensors.
-    """
-    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
-    if len(all_impls) == 0:
-        raise RuntimeError(f"No valid {op} implementations for {target}")
-    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)
-
-    # Disable autotvm if auto_scheduler is enabled.
-    # (i.e., always return the implementation with the highest priority for auto-scheduler).
-    if is_auto_scheduler_enabled() or is_meta_schedule_enabled():
-        use_autotvm = False
-
-    # If not use autotvm, always return the implementation with the highest priority
-    if not use_autotvm:
-        logger.info(
-            "Using %s for %s based on highest priority (%d)",
-            best_plevel_impl.name,
-            op.name,
-            best_plevel_impl.plevel,
-        )
-        outs = best_plevel_impl.compute(attrs, inputs, out_type)
-        return best_plevel_impl, outs
-
-    # Otherwise, try autotvm templates
-    outputs = {}
-    workloads = {}
-    best_autotvm_impl = None
-    best_cfg = None
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    old_silent = autotvm.GLOBAL_SCOPE.silent
-    autotvm.GLOBAL_SCOPE.silent = True
-    for impl in all_impls:
-        outs = impl.compute(attrs, inputs, out_type)
-        outputs[impl] = outs
-        workload = autotvm.task.get_workload(outs)
-        workloads[impl] = workload
-        if workload is None:
-            # Not an AutoTVM tunable implementation
-            continue
-        cfg = dispatch_ctx.query(target, workload)
-        if cfg.is_fallback:
-            # Skip fallback config
-            continue
-        logger.info("Implementation %s for %s has cost %.2e", impl.name, op.name, cfg.cost)
-        if best_cfg is None or best_cfg.cost > cfg.cost:
-            best_autotvm_impl = impl
-            best_cfg = cfg
-    autotvm.GLOBAL_SCOPE.silent = old_silent
-
-    if best_autotvm_impl:
-        # The best autotvm implementation definitely doesn't use fallback config
-        logger.info(
-            "Using %s for %s based on lowest cost (%.2e)",
-            best_autotvm_impl.name,
-            op.name,
-            best_cfg.cost,
-        )
-        return best_autotvm_impl, outputs[best_autotvm_impl]
-
-    # Use the implementation with highest plevel
-    if workloads[best_plevel_impl] is not None:
-        msg = (
-            "Cannot find tuning records for:\n    target=%s\n    key=%s\n"
-            "TVM will apply a default schedule which may negatively impact performance."
-            % (target, workloads[best_plevel_impl])
-        )
-        if (
-            not autotvm.env.GLOBAL_SCOPE.silent
-            and msg not in autotvm.task.DispatchContext.warning_messages
-        ):
-            autotvm.task.DispatchContext.warning_messages.add(msg)
-            global _first_warning
-            if _first_warning:
-                _first_warning = False
-                info_msg = (
-                    "One or more operators have not been tuned. Please tune your model "
-                    "for better performance. Use DEBUG logging level to see more details."
-                )
-                autotvm_logger.warning(info_msg)
-            autotvm_logger.debug(msg)
-
-    logger.info(
-        "Using %s for %s based on highest priority (%s)",
-        best_plevel_impl.name,
-        op.name,
-        best_plevel_impl.plevel,
-    )
-    return best_plevel_impl, outputs[best_plevel_impl]
-
-
-def get_shape(shape):
-    """Convert the shape to correct dtype and vars."""
-    ret = []
-    for dim in shape:
-        if isinstance(dim, tvm.tir.IntImm):
-            if libinfo()["INDEX_DEFAULT_I64"] == "ON":
-                ret.append(dim)
-            else:
-                val = int(dim)
-                assert val <= np.iinfo(np.int32).max
-                ret.append(tvm.tir.IntImm("int32", val))
-        elif isinstance(dim, tvm.tir.Any):
-            ret.append(te.size_var("any_dim", "int32"))
-        else:
-            ret.append(dim)
-    return ret
-
-
-@tvm._ffi.register_func("relay.backend.lower_call")
-def lower_call(call, inputs, target, otype=None):
-    """Lower the call expression to op implementation and tensor outputs."""
-    assert isinstance(call.op, tvm.ir.Op)
-    op = call.op
-
-    if otype is not None:
-        ret_type = otype
-    else:
-        # Prepare the call_node->checked_type(). For the call node inputs, we ensure that
-        # the shape is Int32. Following code ensures the same for the output as well.
-        # TODO(@icemelon9): Support recursive tuple
-        ret_type = call.checked_type
-        if isinstance(ret_type, _ty.TensorType):
-            ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype)
-        elif isinstance(ret_type, _ty.TupleType):
-            new_fields = []
-            for field in ret_type.fields:
-                if isinstance(field, _ty.TensorType):
-                    new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype))
-                else:
-                    new_fields.append(field)
-            ret_type = _ty.TupleType(new_fields)
-
-    is_dyn = _ty.is_dynamic(call.checked_type)
-    for arg in call.args:
-        is_dyn = is_dyn or _ty.is_dynamic(arg.checked_type)
-
-    # check if in the AutoTVM tracing mode, and disable if op is not in wanted list
-    env = autotvm.task.TaskExtractEnv.current
-    reenable_tracing = False
-    if env is not None and env.tracing:
-        if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops:
-            env.tracing = False
-            reenable_tracing = True
-
-    if not is_dyn:
-        best_impl, outputs = select_implementation(op, call.attrs, inputs, ret_type, target)
-    else:
-        # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes.
-        best_impl, outputs = select_implementation(
-            op, call.attrs, inputs, ret_type, target, use_autotvm=False
-        )
-
-    # re-enable AutoTVM tracing
-    if reenable_tracing:
-        env.tracing = True
-    return LoweredOutput(outputs, best_impl)
-
-
-@tvm._ffi.register_object("relay.TECompiler")
-class TECompiler(Object):
-    """TECompiler to get lowered code."""
-
-    def __init__(self):
-        raise RuntimeError("Cannot construct a TECompiler")
-
-    def lower(self, source_func, target=None, mod_name="default"):
-        """Lower a source_func to a CachedFunc.
-
-        Parameters
-        ----------
-        source_func : Union[tvm.relay.Function, CCacheKey]
-            The source relay function.
-
-        target : tvm.Target
-            The target platform.
-
-        Returns
-        -------
-        cached_func: CachedFunc
-            The result of lowering.
-        """
-        # pylint: disable=broad-except, import-outside-toplevel
-        try:
-            mod_name = mangle_module_name(mod_name)
-            key = _get_cache_key(source_func, target)
-            return _backend._TECompilerLower(self, key, mod_name)
-        except Exception:
-            import traceback
-
-            msg = traceback.format_exc()
-            msg += "Error during compile func\n"
-            msg += "--------------------------\n"
-            msg += source_func.astext(show_meta_data=False)
-            msg += "--------------------------\n"
-            raise RuntimeError(msg)
-
-    def jit(self, source_func, target=None):
-        """JIT a source_func to a tvm.runtime.PackedFunc.
-
-        Parameters
-        ----------
-        source_func : Union[tvm.relay.Function, CCacheKey]
-            The source relay function.
-
-        target : tvm.Target
-            The target platform.
-
-        Returns
-        -------
-        jited_func: tvm.runtime.PackedFunc
-            The result of jited function.
-        """
-        key = _get_cache_key(source_func, target)
-        return _backend._TECompilerJIT(self, key)
-
-    def clear(self):
-        """clear the existing cached functions"""
-        _backend._TECompilerClear(self)
-
-    def items(self):
-        """List items in the cache.
-        Returns
-        -------
-        item_list : List[Tuple[CCacheKey, CCacheValue]]
-            The list of items.
-        """
-        res = _backend._TECompilerListItems(self)
-        assert len(res) % 2 == 0
-        return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)]
-
-
-def get():
-    """Get the global TE Compiler.
-
-    Returns
-    -------
-    engine : tvm.relay.backend.TECompiler
-        The TE Compiler.
-    """
-    return _backend._TECompilerGlobal()
-
-
-def lower_to_primfunc(relay_func, target):
-    """Lower Relay Function to TIR PrimFunc.
-
-    Parameters
-    ----------
-    relay_func: relay.Function
-        The source primitive function, created by FuseOps.
-
-    target : Target
-        The compilation target.
-
-    Returns
-    -------
-    prim_func : tir.PrimFunc
-        The created prim func.
-    """
-    f = tvm._ffi.get_global_func("relay.backend.LowerToPrimFunc")
-    assert f is not None, "relay.backend.LowerToPrimFunc does not exist. "
-
-    with target:
-        return f(relay_func, target)
diff --git a/python/tvm/relay/backend/utils.py b/python/tvm/relay/backend/utils.py
deleted file mode 100644
index 7289dbbc4af4..000000000000
--- a/python/tvm/relay/backend/utils.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility backend functions."""
-from enum import Enum
-
-
-class CallType(Enum):
-    Packed = 0
-    CPacked = 1
-    Unpacked = 2
-
-
-def _is_valid_modname(mod_name):
-    """Determine if mod_name is a valid string to use inside function names"""
-    if mod_name:
-        try:
-            mod_name.encode("ascii")
-            return True
-        except UnicodeEncodeError:
-            return False
-
-    return True
-
-
-def mangle_module_name(mod_name):
-    if not _is_valid_modname(mod_name):
-        raise ValueError(mod_name + " contains invalid characters")
-    if mod_name:
-        return "tvmgen_" + mod_name
-    return "tvmgen"
diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py
deleted file mode 100644
index bc11d43cb0ca..000000000000
--- a/python/tvm/relay/backend/vm.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, invalid-name, redefined-builtin
-"""
-The Relay Virtual Machine.
-
-Implements a Python interface to compiling and executing on the Relay VM.
-"""
-import numpy as np
-
-import tvm.runtime.ndarray as _nd
-import tvm.runtime.vm as vm_rt
-from tvm import autotvm
-from tvm.relay import expr as _expr
-from tvm.relay.backend.interpreter import Executor
-from tvm.target import Target
-from . import _vm
-
-
-def compile(mod, target=None, target_host=None, params=None):
-    """Compile the module to VM executable. A helper function for VMCompiler.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The Relay module to build.
-
-    target : any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-
-    target_host : None, or any target-like object, see Target.canon_target
-        Host compilation target, if target is device.
-        When TVM compiles device specific program such as CUDA,
-        we also need host(CPU) side code to interact with the driver
-        to setup the dimensions and parameters correctly.
-        target_host is used to specify the host side codegen target.
-        By default, llvm is used if it is enabled,
-        otherwise a stackvm intepreter is used.
-
-    params : dict of str to NDArray
-        Input parameters to the graph that do not change
-        during inference time. Used for constant folding.
-
-    Returns
-    -------
-    exec : tvm.runtime.vm.Executable
-        The VM executable that contains both library code and bytecode.
-    """
-    compiler = VMCompiler()
-    if params:
-        compiler.set_params(params)
-    compiler.lower(mod, target, target_host)
-    compiler.codegen()
-    return compiler.get_exec()
-
-
-class VMCompiler(object):
-    """Compiler that compiles Relay module to VM executable."""
-
-    def __init__(self):
-        self.mod = _vm._VMCompiler()
-        self._lower = self.mod["lower"]
-        self._codegen = self.mod["codegen"]
-        self._get_exec = self.mod["get_executable"]
-        self._set_params_func = self.mod["set_params"]
-        self._get_params_func = self.mod["get_params"]
-        self._optimize = self.mod["optimize"]
-
-    def set_params(self, params):
-        """Set constant parameters for the model.
-
-        Parameters
-        ----------
-        params : dict of str to NDArray
-            Input parameters to the graph that do not change
-            during inference time. Used for constant folding.
-        """
-        inputs = {}
-        for name, param in params.items():
-            if isinstance(param, np.ndarray):
-                param = _nd.array(param)
-            inputs[name] = _expr.const(param)
-        self._set_params_func(inputs)
-
-    def get_params(self):
-        """Return the updated weights."""
-        params = self._get_params_func()
-        ret = {}
-        for key, value in params.items():
-            ret[key] = value.data
-        return ret
-
-    def lower(self, mod, target=None, target_host=None):
-        """Lower the module to VM bytecode.
-
-        Parameters
-        ----------
-        mod : tvm.IRModule
-            The Relay module to build.
-
-        target : any multi-target like object, see Target.canon_multi_target
-            For homogeneous compilation, the unique build target.
-            For heterogeneous compilation, a dictionary or list of possible build targets.
-
-        target_host : any target-like object, see Target.canon_target
-            Host compilation target, if target is device.
-        """
-        raw_targets = Target.canon_multi_target_and_host(target, target_host)
-        tophub_context = self._tophub_context(raw_targets)
-        with tophub_context:
-            self._lower(mod, raw_targets)
-
-    def codegen(self):
-        """Generate the kernel library."""
-        self._codegen()
-
-    def optimize(self, mod, target=None, target_host=None, params=None):
-        """Helper method that optimizes a Relay module via VM.
-
-        Parameters
-        ----------
-        mod : tvm.IRModule
-
-        target : any multi-target like object, see Target.canon_multi_target
-            For homogeneous compilation, the unique build target.
-            For heterogeneous compilation, a dictionary or list of possible build targets.
-
-        target_host : any target-like object, see Target.canon_target
-            Host compilation target, if target is device.
-
-        params : dict of str to NDArray
-            Input parameters to the graph that do not change
-            during inference time. Used for constant folding.
-
-        Returns
-        -------
-        mod : tvm.IRModule
-            The optimized relay module.
-
-        params : dict
-            The parameters of the final module.
-        """
-        raw_targets = Target.canon_multi_target_and_host(target, target_host)
-        if params:
-            self.set_params(params)
-        return self._optimize(mod, raw_targets), self.get_params()
-
-    def get_exec(self):
-        """Get the VM executable.
-
-        Returns
-        -------
-        exec : tvm.runtime.vm.Executable
-            The VM executable that contains both library code and bytecode.
-        """
-        return vm_rt.Executable(self._get_exec())
-
-    def _tophub_context(self, raw_targets):
-        """Get the autotvm context."""
-        # If current dispatch context is fallback context (the default root context),
-        # then load pre-tuned parameters from TopHub
-        if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
-            tophub_context = autotvm.tophub.context(raw_targets)
-        else:
-            tophub_context = autotvm.utils.EmptyContext()
-        return tophub_context
-
-
-class VMExecutor(Executor):
-    """
-    An implementation of the executor interface for
-    the Relay VM.
-
-    Useful interface for experimentation and debugging
-    the VM can also be used directly from the API.
-    supported by `tvm.runtime.vm`.
-
-    Parameters
-    ----------
-    mod : :py:class:`~tvm.IRModule`
-        The module to support the execution.
-
-    device : :py:class:`~tvm.runtime.Device`
-        The runtime device to run the code on.
-
-    target : any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-    """
-
-    def __init__(self, mod, device, target):
-        if mod is None:
-            raise RuntimeError("Must provide module to get VM executor.")
-        self.mod = mod
-        self.device = device
-        self.target = target
-        self.executable = None
-        self.vm = None
-
-    def _make_executor(self, expr=None):
-        if expr:
-            self.mod["main"] = expr
-
-        self.executable = compile(self.mod, self.target)
-        self.vm = vm_rt.VirtualMachine(self.executable, self.device)
-
-        def _vm_wrapper(*args, **kwargs):
-            args = self._convert_args(self.mod["main"], args, kwargs)
-            return self.vm.run(*args)
-
-        return _vm_wrapper
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
deleted file mode 100644
index 460746f94f1f..000000000000
--- a/python/tvm/relay/base.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, unused-import
-"""The base node types for the Relay language."""
-import os
-
-import tvm._ffi
-from tvm.ir import Node as RelayNode
-from tvm.ir import SourceName, Span, SequentialSpan
-from tvm.runtime import Object
-
-from . import _ffi_api
-
-__STD_PATH__ = os.path.join(os.path.dirname(os.path.realpath(__file__)), "std")
-
-
-def pretty_print(obj: Object) -> None:
-    """Pretty print the object."""
-    return _ffi_api.PrettyPrint(obj)  # type: ignore # pylint: disable=no-member
-
-
-def astext(obj: Object, show_meta_data=True, annotate=None):
-    """Get the text format of the expression.
-
-    Parameters
-    ----------
-    obj : Object
-        The object to be printed.
-    show_meta_data : bool
-        Whether to include meta data section in the text
-        if there is meta data.
-    annotate: Optional[Object->str]
-        Optionally annotate function to provide additional
-        information in the comment block.
-
-    Returns
-    -------
-    text : str
-        The text format of the expression.
-
-    Notes
-    -----
-    The meta data section is necessary to fully parse the text format.
-    However, it can contain dumps that are big (e.g constant weights),
-    so it can be helpful to skip printing the meta data section.
-    """
-    return _ffi_api.AsText(obj, show_meta_data, annotate)  # type: ignore # pylint: disable=no-member
-
-
-@tvm._ffi.register_func("tvm.relay.std_path")
-def _std_path():
-    return __STD_PATH__
-
-
-@tvm._ffi.register_object("relay.Id")
-class Id(Object):
-    """Unique identifier(name) used in Var.
-    Guaranteed to be stable across all passes.
-    """
-
-    def __init__(self):
-        raise RuntimeError("Cannot directly construct Id")
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
deleted file mode 100644
index 40a91cc75a00..000000000000
--- a/python/tvm/relay/build_module.py
+++ /dev/null
@@ -1,686 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Construct the necessary state for the TVM graph executor
-from a Relay expression.
-"""
-import warnings
-
-import numpy as np
-from tvm.ir import IRModule
-from tvm.target import Target
-
-from .. import autotvm
-from .. import nd as _nd
-from .. import register_func
-from ..contrib import graph_executor as _graph_executor
-from ..contrib import utils as contrib_utils
-from ..runtime import load_module
-from ..runtime.executor import aot_executor as _aot_executor
-from ..target import Target
-from . import _build_module
-from . import expr as _expr
-from . import function as _function
-from . import ty as _ty
-from .backend import Executor, Runtime
-from .backend import executor_factory as _executor_factory
-from .backend import interpreter as _interpreter
-from .backend.utils import mangle_module_name
-from .backend.vm import VMExecutor
-from .transform import InferType
-
-
-def _convert_param_map(params):
-    inputs = {}
-    for name, param in params.items():
-        if isinstance(param, np.ndarray):
-            param = _nd.array(param)
-        inputs[name] = _expr.const(param)
-    return inputs
-
-
-class BuildModule(object):
-    """Build an IR module to run on TVM graph executor. This class is used
-    to expose the `RelayBuildModule` APIs implemented in C++.
-    """
-
-    def __init__(self):
-        self.mod = _build_module._BuildModule()
-        self._get_graph_json = self.mod["get_graph_json"]
-        self._get_module = self.mod["get_module"]
-        self._build = self.mod["build"]
-        self._optimize = self.mod["optimize"]
-        self._set_params_func = self.mod["set_params"]
-        self._get_params_func = self.mod["get_params"]
-        self._get_function_metadata = self.mod["get_function_metadata"]
-        self._get_executor_codegen_metadata = self.mod["get_executor_codegen_metadata"]
-        self._get_devices = self.mod["get_devices"]
-        self._get_irmodule = self.mod["get_irmodule"]
-
-    def build(
-        self,
-        mod,
-        target=None,
-        target_host=None,
-        executor=Executor("graph"),
-        runtime=Runtime("cpp"),
-        workspace_memory_pools=None,
-        constant_memory_pools=None,
-        params=None,
-        mod_name=None,
-    ):
-        """
-        Parameters
-        ----------
-        mod : :py:class:`~tvm.IRModule`
-            The IRModule to build.
-
-        target : any multi-target like object, see Target.canon_multi_target
-            For homogeneous compilation, the unique build target.
-            For heterogeneous compilation, a dictionary or list of possible build targets.
-
-        target_host : None, or any target-like object, see Target.canon_target
-            Host compilation target, if target is device.
-            When TVM compiles device specific program such as CUDA,
-            we also need host(CPU) side code to interact with the driver
-            to setup the dimensions and parameters correctly.
-            target_host is used to specify the host side codegen target.
-            By default, llvm is used if it is enabled,
-            otherwise a stackvm interpreter is used.
-
-        executor : Optional[Executor]
-            The executor configuration with which to build the model.
-            Defaults to "graph" if no executor specified.
-
-        runtime : Optional[Runtime]
-            Runtime configuration to use when building the model.
-            Defaults to "cpp" if no runtime specified.
-
-        workspace_memory_pools : Optional[WorkspaceMemoryPools]
-            The object that contains an Array of WorkspacePoolInfo objects
-            that hold properties of read-write workspace pools that could be
-            used by the inference.
-
-        constant_memory_pools : Optional[ConstantMemoryPools]
-            The object that contains an Array of ConstantPoolInfo objects
-            that hold properties of read-only memory pools that could be
-            used by the inference.
-
-        params : dict of str to NDArray
-            Input parameters to the graph that do not change
-            during inference time. Used for constant folding.
-
-        mod_name: Optional[str]
-            The module name we will build
-
-        Returns
-        -------
-        graph_json : str
-            The json string that can be accepted by graph executor.
-
-        mod : tvm.Module
-            The module containing necessary libraries.
-
-        params : dict
-            The parameters of the final graph.
-        """
-        # pylint: disable=import-outside-toplevel
-        from tvm.auto_scheduler import is_auto_scheduler_enabled
-        from tvm.meta_schedule import is_meta_schedule_enabled
-
-        # pylint: enable=import-outside-toplevel
-        # Setup the params.
-        if params:
-            self._set_params(params)
-
-        # Build the IR module. If auto_scheduler is not enabled,
-        # then use the TOPI-defined schedule.
-
-        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
-        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
-        autotvm.GLOBAL_SCOPE.silent = (
-            is_auto_scheduler_enabled() or is_meta_schedule_enabled() or old_autotvm_silent
-        )
-
-        mod_name = mangle_module_name(mod_name)
-
-        self._build(
-            mod,
-            target,
-            target_host,
-            executor,
-            runtime,
-            workspace_memory_pools,
-            constant_memory_pools,
-            mod_name,
-        )
-        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent
-
-        # Get artifacts
-        mod = self.get_module()
-        params = self.get_params()
-        executor_config = self.get_graph_json() if executor.name == "graph" else None
-
-        return executor_config, mod, params
-
-    def optimize(self, mod, target=None, target_host=None, params=None):
-        """
-        Parameters
-        ----------
-        mod : :py:class:`~tvm.IRModule`
-            The IR module to build.
-
-        target : any multi-target like object, see Target.canon_multi_target.
-            For homogeneous compilation, the unique build target.
-            For heterogeneous compilation, a dictionary or list of possible build targets.
-
-        target_host : None, or any target-like object, see Target.canon_target
-            Host compilation target, if target is device.
-
-        params : dict of str to NDArray
-            Input parameters to the graph that do not change
-            during inference time. Used for constant folding.
-
-        Returns
-        -------
-        mod : :py:class:`~tvm.IRModule`
-            The optimized relay module.
-
-        params : dict
-            The parameters of the final graph.
-        """
-        raw_targets = Target.canon_multi_target_and_host(target, target_host)
-
-        # Setup the params.
-        if params:
-            self._set_params(params)
-        mod = self._optimize(mod, raw_targets)
-        # Get artifacts
-        params = self.get_params()
-
-        return mod, params
-
-    def _set_params(self, params):
-        self._set_params_func(_convert_param_map(params))
-
-    def get_graph_json(self):
-        """Return the json file of the built program."""
-        return self._get_graph_json()
-
-    def get_module(self):
-        """Return the built module."""
-        return self._get_module()
-
-    def get_function_metadata(self):
-        """Return the compiled function metadata.
-        Currently, the metadata contains workspace size required by
-        each PrimFunc"""
-        return self._get_function_metadata()
-
-    def get_executor_codegen_metadata(self):
-        """Return the metadata produced after executor
-        codegen
-        """
-        return self._get_executor_codegen_metadata()
-
-    def get_devices(self):
-        """Returns a list of devices configured in this module"""
-        return self._get_devices()
-
-    def get_params(self):
-        """Return the updated weights."""
-        params = self._get_params_func()
-        ret = {}
-        for key, value in params.items():
-            ret[key] = value.data
-        return ret
-
-    def get_irmodule(self):
-        """Returns the TargetIRModule's post-lowering"""
-        return self._get_irmodule()
-
-
-@register_func("tvm.relay.module_export_library")
-def _module_export(module, file_name):  # fcompile, addons, kwargs?
-    return module.export_library(file_name)
-
-
-@register_func("tvm.relay.build")
-def _build_module_no_factory_impl(mod, target, target_host, params, mod_name):
-    return build(
-        mod, target=target, target_host=target_host, params=params, mod_name=mod_name
-    ).module
-
-
-def _build_module_no_factory(mod, target=None, target_host=None, params=None, mod_name="default"):
-    """A wrapper around build which discards the Python GraphFactoryRuntime.
-    This wrapper is suitable to be used from other programming languages as
-    the runtime::Module can be freely passed between language boundaries.
-    """
-    return _build_module_no_factory_impl(mod, target, target_host, params, mod_name)
-
-
-def build(
-    ir_mod,
-    target=None,
-    target_host=None,
-    executor=Executor("graph"),
-    runtime=Runtime("cpp"),
-    workspace_memory_pools=None,
-    constant_memory_pools=None,
-    params=None,
-    mod_name="default",
-):
-    # fmt: off
-    # pylint: disable=line-too-long
-    """Helper function that builds a Relay function to run on TVM graph executor.
-
-    Parameters
-    ----------
-    ir_mod : :py:class:`~tvm.IRModule`
-        The IR module to build. Using relay.Function is deprecated.
-
-    target : None, or any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-        Defaults to the current target in the environment if None.
-
-    target_host : None, or any target like object, see Target.canon_target
-        Host compilation target, if target is device.
-
-    executor : Optional[Executor]
-        The executor configuration with which to build the model.
-        Defaults to "graph" if no executor specified.
-
-    runtime : Optional[Runtime]
-        Runtime configuration to use when building the model.
-        Defaults to "cpp" if no runtime specified.
-
-    workspace_memory_pools : Optional[WorkspaceMemoryPools]
-        The object that contains an Array of WorkspacePoolInfo objects
-        that hold properties of read-write workspace pools that could be
-        used by the inference.
-
-    constant_memory_pools : Optional[ConstantMemoryPools]
-        The object that contains an Array of ConstantPoolInfo objects
-        that hold properties of read-only pools that could be
-        used by the inference.
-
-    params : dict of str to NDArray
-        Input parameters to the graph that do not change
-        during inference time. Used for constant folding.
-
-    mod_name: Optional[str]
-        The module name we will build
-
-    Returns
-    -------
-    factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule
-            The runtime factory for the TVM graph executor.
-    """
-    # pylint: enable=line-too-long
-    # fmt: on
-
-    if not isinstance(ir_mod, (IRModule, _function.Function)):
-        raise ValueError("Type of input parameter mod must be tvm.IRModule")
-
-    if isinstance(ir_mod, _function.Function):
-        if params:
-            ir_mod = bind_params_by_name(ir_mod, params)
-        ir_mod = IRModule.from_expr(ir_mod)
-        warnings.warn(
-            "Please use input parameter mod (tvm.IRModule) "
-            "instead of deprecated parameter mod (tvm.relay.function.Function)",
-            DeprecationWarning,
-        )
-
-    raw_targets = Target.canon_multi_target_and_host(Target.target_or_current(target), target_host)
-    assert len(raw_targets) > 0
-    target_host = raw_targets[0].host
-
-    # If current dispatch context is fallback context (the default root context),
-    # then load pre-tuned parameters from TopHub
-    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
-        tophub_context = autotvm.tophub.context(list(raw_targets))
-    else:
-        tophub_context = autotvm.utils.EmptyContext()
-
-    with tophub_context:
-        bld_mod = BuildModule()
-        graph_json, runtime_mod, params = bld_mod.build(
-            mod=ir_mod,
-            target=raw_targets,
-            params=params,
-            executor=executor,
-            runtime=runtime,
-            workspace_memory_pools=workspace_memory_pools,
-            constant_memory_pools=constant_memory_pools,
-            mod_name=mod_name,
-        )
-        func_metadata = bld_mod.get_function_metadata()
-        devices = bld_mod.get_devices()
-        lowered_ir_mods = bld_mod.get_irmodule()
-        executor_codegen_metadata = bld_mod.get_executor_codegen_metadata()
-
-        if executor.name == "aot":
-            executor_factory = _executor_factory.AOTExecutorFactoryModule(
-                ir_mod,
-                lowered_ir_mods,
-                raw_targets,
-                executor,
-                runtime,
-                runtime_mod,
-                mod_name,
-                params,
-                func_metadata,
-                executor_codegen_metadata,
-                devices,
-            )
-        elif executor.name == "graph":
-            executor_factory = _executor_factory.GraphExecutorFactoryModule(
-                ir_mod,
-                raw_targets,
-                executor,
-                graph_json,
-                runtime_mod,
-                mod_name,
-                params,
-                func_metadata,
-            )
-        else:
-            assert False, "Executor " + executor + " not supported"
-
-        return executor_factory
-
-
-def optimize(mod, target=None, params=None):
-    """Helper function that optimizes a Relay module.
-
-    Parameters
-    ----------
-    mod : :py:class:`~tvm.IRModule`
-        The module to build. Using relay.Function is deprecated.
-
-    target : None, or any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-        Defaults to the current target in the environment if None.
-
-    params : dict of str to NDArray
-        Input parameters to the graph that do not change
-        during inference time. Used for constant folding.
-
-    Returns
-    -------
-    mod : :py:class:`~tvm.IRModule`
-        The optimized relay module.
-
-    params : dict
-        The parameters of the final graph.
-    """
-    if not isinstance(mod, (IRModule, _function.Function)):
-        raise ValueError("Type of input parameter mod must be tvm.IRModule")
-
-    if isinstance(mod, _function.Function):
-        if params:
-            mod = bind_params_by_name(mod, params)
-        mod = IRModule.from_expr(mod)
-        warnings.warn(
-            "Please use input parameter mod (tvm.IRModule) "
-            "instead of deprecated parameter func (tvm.relay.function.Function)",
-            DeprecationWarning,
-        )
-
-    raw_targets = Target.canon_multi_target_and_host(Target.target_or_current(target))
-
-    # If current dispatch context is fallback context (the default root context),
-    # then load pre-tuned parameters from TopHub
-    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
-        tophub_context = autotvm.tophub.context(raw_targets)
-    else:
-        tophub_context = autotvm.utils.EmptyContext()
-
-    with tophub_context:
-        bld_mod = BuildModule()
-        mod, params = bld_mod.optimize(mod, target=raw_targets, params=params)
-    return mod, params
-
-
-def bind_params_by_name(func, params):
-    """Bind params to function by name.
-    This could be useful when assembling custom Relay optimization
-    passes that involve constant folding.
-
-    Parameters
-    ----------
-    func : relay.Function
-        The function to bind parameters to.
-
-    params : dict of str to NDArray
-        Input parameters to the graph that do not change
-        during inference time. Used for constant folding.
-
-    Returns
-    -------
-    func : relay.Function
-        The function with parameters bound
-    """
-    inputs = _convert_param_map(params)
-    return _build_module.BindParamsByName(func, inputs)
-
-
-class GraphExecutor(_interpreter.Executor):
-    """Wrapper around Executor interface.
-
-    This executor is used for debug and testing purposes.
-
-    Parameters
-    ----------
-    mod : :py:class:`~tvm.IRModule`
-        The module to support the execution.
-
-    device : :py:class:`Device`
-        The runtime device to run the code on.
-
-    target : any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-    """
-
-    def __init__(self, mod, device, target):
-        assert mod is not None
-        self.mod = mod
-        self.device = device
-        self.target = target
-
-    def _make_executor(self, expr=None):
-        if expr:
-            self.mod["main"] = expr
-        self.mod = InferType()(self.mod)
-        ret_type = self.mod["main"].checked_type.ret_type
-        if _ty.is_dynamic(ret_type):
-            raise ValueError(
-                "Graph Executor only supports static graphs, got output type", ret_type
-            )
-        mod = build(self.mod, target=self.target)
-        gmodule = _graph_executor.GraphModule(mod["default"](self.device))
-
-        def _unflatten(flat_iter, cur_type):
-            if isinstance(cur_type, _ty.TensorType):
-                return next(flat_iter)
-            if isinstance(cur_type, _ty.TupleType):
-                fields = []
-                for field_type in cur_type.fields:
-                    field = _unflatten(flat_iter, field_type)
-                    fields.append(field)
-                return fields
-            raise ValueError("Return type", ret_type, "contains unsupported type", cur_type)
-
-        def _graph_wrapper(*args, **kwargs):
-            args = self._convert_args(self.mod["main"], args, kwargs)
-            # Create map of inputs.
-            for i, arg in enumerate(args):
-                gmodule.set_input(i, arg)
-            # Run the module, and fetch the output.
-            gmodule.run()
-            flattened = []
-            for i in range(gmodule.get_num_outputs()):
-                flattened.append(gmodule.get_output(i).copyto(_nd.cpu(0)))
-            unflattened = _unflatten(iter(flattened), ret_type)
-            return unflattened
-
-        return _graph_wrapper
-
-
-class AotExecutor(_interpreter.Executor):
-    """Implements the Executor interface for AOT.
-
-    Parameters
-    ----------
-    mod : :py:class:`~tvm.IRModule`
-        The module to support the execution.
-
-    device : :py:class:`Device`
-        The runtime device to run the code on.
-
-    target : any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-    """
-
-    def __init__(self, mod, device, target):
-        assert mod is not None
-        self.mod = mod
-        self.device = device
-        self.target = target
-
-    def _make_executor(self, expr=None):
-        if expr:
-            self.mod["main"] = expr
-        self.mod = InferType()(self.mod)
-        ret_type = self.mod["main"].checked_type.ret_type
-        if _ty.is_dynamic(ret_type):
-            raise ValueError("AOT Executor only supports static graphs, got output type", ret_type)
-        mod = build(self.mod, target=self.target, executor=Executor("aot"))
-
-        # NOTE: Given AOT requires use of the "c" backend, must export/import to compile the
-        # generated code.
-        temp_so_dir = contrib_utils.TempDirectory()
-        temp_so = temp_so_dir / "temp.so"
-        mod.export_library(temp_so, cc="gcc", options=["-std=c11"])
-
-        mod = load_module(temp_so)
-        aot_mod = mod["default"](self.device)
-        gmodule = _aot_executor.AotModule(aot_mod)
-
-        def _unflatten(flat_iter, cur_type):
-            if isinstance(cur_type, _ty.TensorType):
-                return next(flat_iter)
-            if isinstance(cur_type, _ty.TupleType):
-                fields = []
-                for field_type in cur_type.fields:
-                    field = _unflatten(flat_iter, field_type)
-                    fields.append(field)
-                return fields
-            raise ValueError("Return type", ret_type, "contains unsupported type", cur_type)
-
-        def _aot_wrapper(*args, **kwargs):
-            args = self._convert_args(self.mod["main"], args, kwargs)
-            # Create map of inputs.
-            for i, arg in enumerate(args):
-                gmodule.set_input(i, arg)
-            # Run the module, and fetch the output.
-            gmodule.run()
-            flattened = []
-            for i in range(gmodule.get_num_outputs()):
-                flattened.append(gmodule.get_output(i).copyto(_nd.cpu(0)))
-            unflattened = _unflatten(iter(flattened), ret_type)
-            return unflattened
-
-        return _aot_wrapper
-
-
-# TODO(mbs): Collapse the create_executor/evaluate phases together since a) most callers don't
-# reuse the executor for multiple expressions and b) any preparation necessary for the expression
-# evaluation needs to (currently) be done along with preparation for the module.
-def create_executor(kind="debug", mod=None, device=None, target="llvm", params=None):
-    """Factory function to create an executor.
-
-    Example
-    -------
-    .. code-block:: python
-
-        import tvm.relay
-        import numpy as np
-
-        x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
-        expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
-        tvm.relay.create_executor(
-            kind="vm", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr))
-        ).evaluate()(np.array([2], dtype="float32"))
-        # returns `array([3.], dtype=float32)`
-
-    Parameters
-    ----------
-    kind : str
-        The type of executor. Avaliable options are `debug` for the interpreter, `graph` for the
-        graph executor, `aot` for the aot executor, and `vm` for the virtual machine.
-
-    mod : :py:class:`~tvm.IRModule`
-        The Relay module containing collection of functions
-
-    device : :py:class:`Device`
-        The device to execute the code.
-
-    target : any multi-target like object, see Target.canon_multi_target
-        For homogeneous compilation, the unique build target.
-        For heterogeneous compilation, a dictionary or list of possible build targets.
-        CAUTION: Though this API allows multiple targets, it does not allow multiple devices, so
-        heterogenous compilation is not yet supported.
-
-    params : dict of str to NDArray
-         Input parameters to the graph that do not change
-         during inference time.
-
-    Returns
-    -------
-    executor : :py:class:`~tvm.relay.backend.interpreter.Executor`
-    """
-    raw_targets = Target.canon_multi_target(target)
-    if mod is None:
-        mod = IRModule()
-    if device is not None:
-        assert device.device_type == raw_targets[0].get_target_device_type()
-    else:
-        # Derive the default device from the first target.
-        device = _nd.device(raw_targets[0].get_target_device_type(), 0)
-
-    if params is not None:
-        mod = IRModule.from_expr(bind_params_by_name(mod["main"], params))
-
-    assert "executor" not in raw_targets[0].attrs or raw_targets[0].attrs["executor"] == kind
-
-    if kind == "debug":
-        assert len(raw_targets) == 1, "The interpreter currently only supports a single target"
-        return _interpreter.Interpreter(mod, device, raw_targets[0])
-    if kind == "graph":
-        return GraphExecutor(mod, device, raw_targets)
-    if kind == "vm":
-        return VMExecutor(mod, device, raw_targets)
-    if kind == "aot":
-        return AotExecutor(mod, device, raw_targets)
-    raise RuntimeError(f"unknown execution strategy: {kind}")
diff --git a/python/tvm/relay/collage/__init__.py b/python/tvm/relay/collage/__init__.py
deleted file mode 100644
index b3b485ead40b..000000000000
--- a/python/tvm/relay/collage/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""relay.collage exports"""
-from .collage import (
-    MEASURE_NUMBER,
-    MEASURE_REPEAT,
-    WARMUP_MIN_REPEAT_MS,
-    CostEstimator,
-    MockCostEstimator,
-    CustomCostEstimator,
-)
diff --git a/python/tvm/relay/collage/_ffi_api.py b/python/tvm/relay/collage/_ffi_api.py
deleted file mode 100644
index bb5be46c7af3..000000000000
--- a/python/tvm/relay/collage/_ffi_api.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for the Collage partitioner."""
-import tvm._ffi
-
-
-tvm._ffi._init_api("relay.collage", __name__)
diff --git a/python/tvm/relay/collage/collage.py b/python/tvm/relay/collage/collage.py
deleted file mode 100644
index cfc527c2b977..000000000000
--- a/python/tvm/relay/collage/collage.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Mostly helper methods which interface the main C++ Collage implementation with Python.
-   See relay.transform.CollagePartition for the main Collage entrypoint."""
-
-import logging
-import os
-import math
-import tempfile
-
-import numpy as np
-
-import tvm
-from tvm._ffi.registry import register_func, register_object
-from tvm.runtime import Object
-from . import _ffi_api
-
-# Parameters to use when estimating latency (of both partitions and overall models).
-MEASURE_NUMBER = 20
-MEASURE_REPEAT = 5
-WARMUP_MIN_REPEAT_MS = 250
-
-
-@register_object("relay.collage.CostEstimator")
-class CostEstimator(Object):
-    """CostEstimator class"""
-
-    def __init__(self):
-        self.__init_handle_by_constructor__(_ffi_api.CostEstimator)
-
-
-@register_object("relay.collage.MockCostEstimator")
-class MockCostEstimator(Object):
-    """MockEstimator class"""
-
-    def __init__(self, target_costs, max_estimates=0):
-        self.__init_handle_by_constructor__(_ffi_api.MockCostEstimator, target_costs, max_estimates)
-
-
-@register_object("relay.collage.CustomCostEstimator")
-class CustomCostEstimator(Object):
-    """CustomEstimator class"""
-
-    def __init__(self, py_fn_estimator="tvm.relay.collage.estimate_seconds_custom"):
-        self.__init_handle_by_constructor__(_ffi_api.CustomCostEstimator, py_fn_estimator)
-
-
-def arg_for(arg_type, device):
-    """Returns a test argument of Relay arg_type on device"""
-    assert isinstance(arg_type, tvm.ir.TensorType)
-    return tvm.nd.array(
-        np.random.uniform(-1.0, 1.0, size=arg_type.concrete_shape).astype(arg_type.dtype),
-        device=device,
-    )
-
-
-def vm_estimate_seconds(device, the_vm, func_name, args):
-    """Returns the estimated latency, in seconds, of running func_name with args on the_vm."""
-    # Warmup
-    the_vm.benchmark(
-        device, repeat=1, number=1, min_repeat_ms=WARMUP_MIN_REPEAT_MS, func_name=func_name, **args
-    )
-    # One more time, with feeling
-    return the_vm.benchmark(
-        device,
-        repeat=MEASURE_REPEAT,
-        number=MEASURE_NUMBER,
-        min_repeat_ms=0,
-        func_name=func_name,
-        **args,
-    )
-
-
-@register_func("tvm.relay.collage.estimate_seconds")
-def estimate_seconds(mod, target):
-    """Returns the mean execution time of "main" in mod on target with params. The module
-    may contain "Primitive" functions, possibly with "Compiler" attributes."""
-    device = tvm.device(target.get_target_device_type())
-
-    try:
-        # Build the module.
-        logging.info("Compiling module to estimate")
-        exe = tvm.relay.vm.compile(mod, target)
-    except RuntimeError as err:
-        # A build failure indicates the partition is not supported.
-        # eg trying to build an nn.batch_norm on GPU, which has no schedule since we assume it
-        # is only ever used with a tuple projection which is rewritten away.
-        logging.info("Assigning module infinite cost since unable to build: %s", err)
-        return math.inf
-
-    # Finalize compilation
-    tmp_dir = tempfile.mkdtemp()
-    code, lib = exe.save()
-    lib_path = os.path.join(tmp_dir, "library.so")
-    # TODO(mbs): Avoid nvcc dependency?
-    lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc")
-    lib = tvm.runtime.load_module(lib_path)
-    exe = tvm.runtime.vm.Executable.load_exec(code, lib)
-
-    # Benchmark the module.
-    the_vm = tvm.runtime.vm.VirtualMachine(exe, device)
-    func_name = "main"
-    main_args = {v.name_hint: arg_for(v.checked_type, device) for v in mod[func_name].params}
-    logging.info("Benchmarking module to estimate")
-    profile = vm_estimate_seconds(device, the_vm, func_name, main_args)
-    logging.info("profile: %s", profile)
-    return profile.median  # seconds
-
-
-def make_labelled_dfpattern_partition_rule_wrapper(compiler, pattern_tuple):
-    """Returns a DFPatternPartitionRule representing one (label, pattern, predicate) entry from
-    the pattern table for external codegen compiler"""
-    if len(pattern_tuple) == 2:
-        rule_name, dataflow_pattern = pattern_tuple
-        return _ffi_api.MakeLabelledDFPatternPartitionRule(compiler, rule_name, dataflow_pattern)
-    else:
-        rule_name, dataflow_pattern, predicate = pattern_tuple
-        return _ffi_api.MakeLabelledDFPatternPartitionRuleWithPredicate(
-            compiler, rule_name, dataflow_pattern, predicate
-        )
-
-
-@register_func("tvm.relay.collage.make_byoc_partition_rule")
-def make_byoc_partition_rule(compiler):
-    """Returns the PartitionRule for external codegen compiler"""
-    pattern_table = tvm.relay.op.contrib.get_pattern_table(compiler)
-    assert (
-        pattern_table is not None
-    ), f"No pattern table entry was found for BYOC compiler {compiler}"
-    logging.info(
-        "Converting %s rules for %s for use in pattern style BYOC lowering/codegen",
-        len(pattern_table),
-        compiler,
-    )
-    sub_rules = [
-        make_labelled_dfpattern_partition_rule_wrapper(compiler, pattern_tuple)
-        for pattern_tuple in pattern_table
-    ]
-    return _ffi_api.MakePatternBYOCPartitionRule(compiler, sub_rules)
diff --git a/python/tvm/relay/data_dep_optimization/__init__.py b/python/tvm/relay/data_dep_optimization/__init__.py
deleted file mode 100644
index 5f429917b8a6..000000000000
--- a/python/tvm/relay/data_dep_optimization/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Optimizations involves changing of paramters"""
-
-from . import bsr_dense
-from . import simplify_fc_transpose
-from . import bsr_conv2d
diff --git a/python/tvm/relay/data_dep_optimization/bsr_conv2d.py b/python/tvm/relay/data_dep_optimization/bsr_conv2d.py
deleted file mode 100644
index 20e01da1493e..000000000000
--- a/python/tvm/relay/data_dep_optimization/bsr_conv2d.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Automatic convert model from dense to block sparse"""
-
-from tvm import relay
-from tvm.relay.analysis.sparse_conv2d import process_params
-
-from .utils import _run_opt_pass
-
-
-def convert(func, params, blocksize, sparsity_threshold, layout="NHWC", kernel_size=1):
-    """Convert a conv2d func and according parameters to block sparse
-
-    Parameters
-    ----------
-    func : relay.Expr
-        Expr will be optimized to sparse operation
-    params : Dict[Srting, tvm.nd.array]
-        Parameters of the Expr
-    blocksize : Tuple(int, int)
-        Blocksize for BSR matrix
-    sparsity_threshold : float
-        Minimal sparsity requirement for converting.
-        If weight sparsity is lower than this threshold,
-        the dense operation will be kept.
-    layout : str
-        layout of network
-
-    Returns
-    -------
-    new_func: relay.Expr
-        Mutated Expr with sparse operations
-
-    params: Dict[Srting, tvm.nd.array]
-        New params with BSR matrix for mutated Expr
-    """
-    weight_info = process_params(func, params, blocksize, sparsity_threshold, layout, kernel_size)
-    new_func = _run_opt_pass(
-        func,
-        relay.transform.Conv2dToSparse(
-            weight_info.weight_name, weight_info.weight_shape, layout, kernel_size
-        ),
-    )
-
-    return new_func, params
-
-
-def convert2(func, params, blocksize, sparsity_threshold, layout, kernel_size):
-    """Convert a freezed conv2d func to block sparse
-
-    Parameters
-    ----------
-    func : relay.Expr
-        Expr will be optimized to sparse operation, with params freezed
-    params : Dict[Srting, tvm.nd.array]
-        Parameters of the Expr (not used in this pass)
-    blocksize : Tuple(int, int)
-        Blocksize for BSR matrix
-    sparsity_threshold : float
-        Minimal sparsity requirement for converting.
-        If weight sparsity is lower than this threshold,
-        the dense operation will be kept.
-    layout : str
-        layout of network
-    kernel_size : int
-        kernel size of the conv2d, for filtering
-
-    Returns
-    -------
-    new_func: relay.Expr
-        Mutated Expr with sparse operations
-
-    params: Dict[Srting, tvm.nd.array]
-        New params with BSR matrix for mutated Expr (not modified)
-    """
-    new_func = _run_opt_pass(
-        func, relay.transform.Conv2dToSparse2(layout, kernel_size, blocksize, sparsity_threshold)
-    )
-    return new_func, params
diff --git a/python/tvm/relay/data_dep_optimization/bsr_dense.py b/python/tvm/relay/data_dep_optimization/bsr_dense.py
deleted file mode 100644
index 5f5875eeff68..000000000000
--- a/python/tvm/relay/data_dep_optimization/bsr_dense.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Automatic convert model from dense to block sparse"""
-
-from tvm import relay
-from tvm.relay.analysis.sparse_dense import process_params
-
-from .utils import _run_opt_pass
-
-
-def convert(func, params, blocksize, sparsity_threshold):
-    """Convert a dense func and according parameters to block sparse
-
-    Parameters
-    ----------
-    func : relay.Expr
-        Expr will be optimized to sparse operation
-    params : Dict[Srting, tvm.nd.array]
-        Parameters of the Expr
-    blocksize : Tuple(int, int)
-        Blocksize for BSR matrix
-    sparsity_threshold : float
-        Minimal sparsity requirement for converting.
-        If weight sparsity is lower than this threshold,
-        the dense operation will be kept.
-
-    Returns
-    -------
-    new_func: relay.Expr
-        Mutated Expr with sparse operations
-
-    params: Dict[Srting, tvm.nd.array]
-        New params with BSR matrix for mutated Expr
-    """
-    weight_info = process_params(func, params, blocksize, sparsity_threshold)
-    new_func = _run_opt_pass(
-        func, relay.transform.DenseToSparse(weight_info.weight_name, weight_info.weight_shape)
-    )
-    return new_func, params
diff --git a/python/tvm/relay/data_dep_optimization/simplify_fc_transpose.py b/python/tvm/relay/data_dep_optimization/simplify_fc_transpose.py
deleted file mode 100644
index eeb474efa136..000000000000
--- a/python/tvm/relay/data_dep_optimization/simplify_fc_transpose.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Automatic optimize fc tranpose"""
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.relay.analysis import search_fc_transpose
-
-from .utils import _run_opt_pass
-
-
-def convert(func, params):
-    """convert all ```y = nn.dense(x, transpose(w, [1, 0]))``` to
-        ```y = nn.dense(x, wt)```
-
-    Parameters
-    ----------
-    func : relay.Expr
-        Expr will be optimized
-    params : Dict[String, tvm.nd.array]
-        Parameters of Expr
-
-    Returns
-    -------
-    new_func : relay.Expr
-        Mutated Expr from ```y = nn.dense(x, transpose(w, [1, 0]))``` to
-        ```y = nn.dense(x, wt)```
-    params: Dict[String, tvm.nd.array]
-        Parameters of mutated Expr, with weights pre-transposed
-    """
-    weight_info = search_fc_transpose(func)
-    for item in weight_info:
-        name = str(item)
-        w_np = params[name].numpy()
-        new_w = np.transpose(w_np, axes=[1, 0])
-        params[name + ".T"] = tvm.nd.array(new_w)
-        del params[name]
-    new_func = _run_opt_pass(
-        func,
-        relay.transform.SimplifyFCTranspose(
-            weight_info,
-        ),
-    )
-    return new_func, params
diff --git a/python/tvm/relay/data_dep_optimization/utils.py b/python/tvm/relay/data_dep_optimization/utils.py
deleted file mode 100644
index 2b58fdc0cf35..000000000000
--- a/python/tvm/relay/data_dep_optimization/utils.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Utils functions for optimizations"""
-
-import tvm
-
-
-def _run_opt_pass(expr, opt_pass):
-    """Helper function to run pass
-
-    Parameters
-    ----------
-    expr : relay.Expr
-        Expr will be optimized
-    opt_pass : relay.Pass
-        Optimization pass
-
-    Returns
-    -------
-    ret: relay.Expr
-        Optimized Expr by running opt_pass
-    """
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = opt_pass(mod)
-    return mod["main"]
diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py
deleted file mode 100644
index 76a24c048cf9..000000000000
--- a/python/tvm/relay/dataflow_pattern/__init__.py
+++ /dev/null
@@ -1,956 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The Relay Pattern Language and tooling."""
-# pylint: disable=no-member
-from typing import Callable, Dict, List, Optional
-
-import tvm._ffi
-from tvm.relay.expr import RelayExpr as Expr
-
-from ... import _ffi as tvm_ffi
-from ... import ir as _ir
-from ...ir import make_node
-from ...ir.base import Node
-from ...runtime import Object
-from ..base import astext, pretty_print
-from ..op import get
-from . import _ffi as ffi
-
-
-def register_df_node(type_key=None):
-    """Register a Relay node type.
-
-    Parameters
-    ----------
-    type_key : str or cls
-        The type key of the node.
-    """
-    if not isinstance(type_key, str):
-        return tvm._ffi.register_object("relay.dataflow_pattern." + type_key.__name__)(type_key)
-    return tvm._ffi.register_object(type_key)
-
-
-class DFPattern(Node):
-    """Base class of all Patterns."""
-
-    def __str__(self):
-        return pretty_print(self)
-
-    def astext(self, show_meta_data=True, annotate=None):
-        """Get the text format of the expression.
-
-        Parameters
-        ----------
-        show_meta_data : bool
-            Whether to include meta data section in the text
-            if there is meta data.
-
-        annotate: Optional[Object->str]
-            Optionally annotate function to provide additional
-            information in the comment block.
-
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
-        Notes
-        -----
-        The meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big (e.g constant weights),
-        so it can be helpful to skip printing the meta data section.
-        """
-        return astext(self, show_meta_data, annotate)
-
-    def __call__(self, *args):
-        args = list(args)
-        if len(args) == 1 and args[0] is None:
-            args = None
-        return CallPattern(self, args)
-
-    def __or__(self, other):
-        return AltPattern(self, other)
-
-    def __add__(self, other):
-        return is_op("add")(self, other)
-
-    def __sub__(self, other):
-        return is_op("subtract")(self, other)
-
-    def __mul__(self, other):
-        return is_op("multiply")(self, other)
-
-    def __truediv__(self, other):
-        return is_op("divide")(self, other)
-
-    def has_attr(self, attrs: Dict[str, Object]):
-        """
-        Add an attribute constraint to this pattern
-
-        Parameters
-        ----------
-        attrs: Dict[str, Object]
-
-        Returns
-        -------
-        result: tvm.relay.dataflow_pattern.DFPattern
-            The resulting AttrPattern
-        """
-        attrs = make_node("DictAttrs", **attrs)
-        return AttrPattern(self, attrs)
-
-    def has_type(self, ttype: tvm.ir.type.Type):
-        """
-        Add a type constraint to this pattern
-
-        Parameters
-        ----------
-        ttype: tvm.ir.type.Type
-            The type to match
-
-        Returns
-        -------
-        result: tvm.relay.dataflow_pattern.DFPattern
-            The resulting TypePattern
-        """
-        return has_type(ttype, self)
-
-    def has_dtype(self, dtype: str):
-        """
-        Add a type constraint to this pattern
-
-        Parameters
-        ----------
-        dtype: str
-            The dtype to match
-
-        Returns
-        -------
-        result: tvm.relay.dataflow_pattern.DFPattern
-            The resulting DataTypePattern
-        """
-        return has_dtype(dtype, self)
-
-    def has_shape(self, shape: List[tvm.ir.PrimExpr]):
-        """
-        Add a type constraint to this pattern
-
-        Parameters
-        ----------
-        shape: List[tvm.ir.PrimExpr]
-            The shape to match
-
-        Returns
-        -------
-        result: tvm.relay.dataflow_pattern.DFPattern
-            The resulting ShapePattern
-        """
-        return has_shape(shape, self)
-
-    def match(self, expr: Expr) -> bool:
-        """
-        Match this pattern to an expression
-
-        Parameters
-        ----------
-        expr : tvm.relay.Expr
-            The expression to match.
-
-        Returns
-        -------
-        result: bool
-            Whether or not the expression matches the pattern
-        """
-        return match(self, expr)
-
-    def partition(
-        self,
-        expr: Expr,
-        attrs: Optional[Dict[str, Object]] = None,
-        check: Callable[[Expr], bool] = lambda x: True,
-    ) -> Expr:
-        """
-        Partition the expression into functions defined by this pattern
-
-        Parameters
-        ----------
-        expr : tvm.relay.Expr
-            The expression to match.
-        attrs : Optional[Dict[str, Object]]
-            A dictionary of Attribute name/values to add to the paritioned function
-        check : Callable[[Expr], bool]
-            A function to perform more complicated checks on the matched expression.
-            Returns true if partitioning should proceed, false otherwise.
-
-        Returns
-        -------
-        result : tvm.relay.Expr
-            The Expression with matched subgraphs replaced by function calls to that subgraph
-        """
-        return partition(self, expr, attrs, check)
-
-    def dominates(self, parent: "DFPattern", path: "DFPattern" = None):
-        """
-        Create a dominator for this pattern.
-
-        Parameters
-        ----------
-        parent: tvm.relay.dataflow_pattern.DFPattern
-            The parent pattern this pattern dominates.
-        path: tvm.relay.dataflow_pattern.DFPattern
-            The fuzzy path pattern.
-
-        Returns
-        -------
-        result: tvm.relay.dataflow_pattern.DFPattern
-            The resulting DominatorPattern.
-        """
-        if path is None:
-            path = wildcard()
-        return DominatorPattern(parent, path, self)
-
-    def optional(self, option_constructor: Callable[["DFPattern"], "DFPattern"]):
-        """
-        Create a optional user of this pattern.
-
-        Parameters
-        ----------
-        option_constructor: function
-            A function that takes a single Pattern parameter and returns
-            a constructed pattern matching the option
-
-        Returns
-        -------
-        result: tvm.relay.dataflow_pattern.DFPattern
-            The resulting Pattern
-        """
-        return self | option_constructor(self)
-
-
-def is_var(name: str = "") -> "DFPattern":
-    """
-    Syntatic sugar for creating an optionally named VarPattern.
-
-    Parameters
-    ----------
-    name: str
-        The name of the input pattern to match.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return VarPattern(name)
-
-
-def is_constant() -> "DFPattern":
-    """
-    Syntatic sugar for creating a ConstantPattern.
-
-    Parameters
-    ----------
-    name: str
-        The name of the input pattern to match.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return ConstantPattern()
-
-
-def is_expr(expr: Expr) -> "DFPattern":
-    """
-    Syntatic sugar for creating an ExprPattern.
-
-    Parameters
-    ----------
-    expr: Expr
-        The Relay expression to match.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return ExprPattern(expr)
-
-
-def is_op(op_name: str) -> "DFPattern":
-    """
-    Syntatic sugar for creating an operator ExprPattern.
-
-    Parameters
-    ----------
-    op_name: String
-        The name of the relay op
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting ExprPattern
-    """
-    op = get(op_name)
-    return ExprPattern(op)
-
-
-def is_tuple(fields: tvm.ir.container.Array) -> "DFPattern":
-    """
-    Syntatic sugar for creating an ExprPattern.
-
-    Parameters
-    ----------
-    fields : Array[tvm.relay.dataflow_pattern.DFPattern]
-        The fields in the tuple.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return TuplePattern(fields)
-
-
-def is_tuple_get_item(tuple_value: "DFPattern", index: Optional[int] = None) -> "DFPattern":
-    """
-    Syntatic sugar for creating an ExprPattern.
-
-    Parameters
-    ----------
-    tuple_value: tvm.relay.dataflow_pattern.DFPattern
-        The input tuple expression.
-
-    index: Optional[int]
-        The index to match; Default (None) to match a TupleGetItem with any index.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return TupleGetItemPattern(tuple_value, index)
-
-
-def is_if(cond, true_branch, false_branch):
-    """
-    Syntatic sugar for creating an IfPattern.
-
-    Parameters
-    ----------
-    cond: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the condition of If.
-
-    true_branch: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the true branch of If.
-
-    false_branch: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the false branch of If.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return IfPattern(cond, true_branch, false_branch)
-
-
-def is_let(var, value, body):
-    """
-    Syntatic sugar for creating a LetPattern.
-
-    Parameters
-    ----------
-    var: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the variable of Let.
-
-    value: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the value of Let.
-
-    body: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the body where the binding is in effect.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return LetPattern(var, value, body)
-
-
-def wildcard() -> "DFPattern":
-    """
-    Syntatic sugar for creating a WildcardPattern.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting pattern.
-    """
-    return WildcardPattern()
-
-
-def has_type(ttype: tvm.ir.type.Type, pattern: "DFPattern" = None) -> "DFPattern":
-    """
-    Syntatic sugar for creating a TypePattern
-
-    Parameters
-    ----------
-    ttype: tvm.ir.type.Type
-        The type to match
-
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The pattern that needs type annotation
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting TypePattern
-    """
-    if pattern is None:
-        pattern = wildcard()
-    return TypePattern(pattern, ttype)
-
-
-def has_dtype(dtype: str, pattern: "DFPattern" = None) -> "DFPattern":
-    """
-    Syntatic sugar for creating a DataTypePattern
-
-    Parameters
-    ----------
-    dtype: str
-        The dtype to match
-
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The pattern that needs type annotation
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting DataTypePattern
-    """
-    if pattern is None:
-        pattern = wildcard()
-    return DataTypePattern(pattern, dtype)
-
-
-def has_shape(shape: List[tvm.ir.PrimExpr], pattern: "DFPattern" = None) -> "DFPattern":
-    """
-    Syntatic sugar for creating a ShapePattern
-
-    Parameters
-    ----------
-    shape: List[tvm.ir.PrimExpr]
-        The shape to match
-
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The pattern that needs type annotation
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting ShapePattern
-    """
-    if pattern is None:
-        pattern = wildcard()
-    return ShapePattern(pattern, shape)
-
-
-def has_attr(attrs, pattern=None) -> "DFPattern":
-    """
-    Syntatic sugar for creating an AttrPattern
-
-    Parameters
-    ----------
-    attrs: Dict[str, Object]
-        The attributes to match
-
-    pattern: Optional[tvm.relay.dataflow_pattern.DFPattern]
-        The input pattern.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting AttrPattern
-    """
-    if pattern is None:
-        pattern = wildcard()
-    return pattern.has_attr(attrs)
-
-
-def dominates(parent: "DFPattern", path: "DFPattern", child: "DFPattern") -> "DFPattern":
-    """
-    Syntatic sugar for creating an Dominator pattern
-
-    Parameters
-    ----------
-    parent: tvm.relay.dataflow_pattern.DFPattern
-        The parent pattern.
-    path: tvm.relay.dataflow_pattern.DFPattern
-        The fuzzy path pattern.
-    child: tvm.relay.dataflow_pattern.DFPattern
-        The child pattern.
-
-    Returns
-    -------
-    result: tvm.relay.dataflow_pattern.DFPattern
-        The resulting DominatorPattern.
-    """
-    return DominatorPattern(parent, path, child)
-
-
-def match(pattern: "DFPattern", expr: Expr) -> bool:
-    """
-    Match a pattern to an expression
-
-    Parameters
-    ----------
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The input pattern.
-    expr : tvm.relay.Expr
-        The expression to match.
-    """
-    return ffi.match(pattern, expr)
-
-
-@register_df_node
-class ExprPattern(DFPattern):
-    """A pattern which matches a constant expression.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The expression to match.
-    """
-
-    def __init__(self, expr: Expr):
-        self.__init_handle_by_constructor__(ffi.ExprPattern, expr)
-
-
-@register_df_node
-class VarPattern(DFPattern):
-    """A local variable in Relay.
-
-    Local variable can be used to declare input
-    arguments to a function, or intermediate variables.
-
-    Parameters
-    ----------
-    name_hint: str
-        The name of the variable. Optional, if not provided,
-        the pattern will match any VarNode.
-
-    type_annotation: tvm.ir.type.Type, optional
-        The type annotation on the variable.
-    """
-
-    def __init__(self, name_hint: str = ""):
-        self.__init_handle_by_constructor__(ffi.VarPattern, name_hint)
-
-
-@register_df_node
-class ConstantPattern(DFPattern):
-    """A pattern matching a Relay Constant."""
-
-    def __init__(self):
-        self.__init_handle_by_constructor__(ffi.ConstantPattern)
-
-
-@register_df_node
-class CallPattern(DFPattern):
-    """A pattern matching a function call node in Relay.
-
-    Parameters
-    ----------
-    op: relay.dataflow_pattern.DFPattern
-        The operation to be called.
-
-    args: List[relay.dataflow_pattern.DFPattern]
-        The arguments to the call or None to match any arguments.
-
-    """
-
-    def __init__(
-        self,
-        op: "DFPattern",
-        args: List["DFPattern"],
-    ):
-        self.__init_handle_by_constructor__(ffi.CallPattern, op, args)
-
-
-@register_df_node
-class FunctionPattern(DFPattern):
-    """A pattern matching a function node in Relay.
-
-    Parameters
-    ----------
-    params: List[relay.dataflow_pattern.DFPattern]
-        The parameters to the Function or None to match any parameters.
-
-    body: relay.dataflow_pattern.DFPattern
-        The body fo the Function
-
-    """
-
-    def __init__(
-        self,
-        params: List["DFPattern"],
-        body: "DFPattern",
-    ):
-        self.__init_handle_by_constructor__(ffi.FunctionPattern, params, body)
-
-
-@register_df_node
-class IfPattern(DFPattern):
-    """A patern matching a Relay If.
-
-    Parameters
-    ----------
-    cond: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the condition of If.
-
-    true_branch: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the true branch of If.
-
-    false_branch: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the false branch of If.
-    """
-
-    def __init__(self, cond: "DFPattern", true_branch: "DFPattern", false_branch: "DFPattern"):
-        self.__init_handle_by_constructor__(ffi.IfPattern, cond, true_branch, false_branch)
-
-
-@register_df_node
-class LetPattern(DFPattern):
-    """A patern matching a Relay Let.
-
-    Parameters
-    ----------
-    var: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the variable of Let.
-
-    value: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the value of Let.
-
-    body: tvm.relay.dataflow_pattern.DFPattern
-        The pattern describing the body where the binding is in effect.
-
-    """
-
-    def __init__(self, var: "DFPattern", value: "DFPattern", body: "DFPattern"):
-        self.__init_handle_by_constructor__(ffi.LetPattern, var, value, body)
-
-
-@register_df_node
-class TuplePattern(DFPattern):
-    """A patern matching a Relay Tuple.
-
-    Parameters
-    ----------
-    fields : Array[tvm.relay.dataflow_pattern.DFPattern]
-        The fields in the tuple.
-    """
-
-    def __init__(self, fields: tvm.ir.container.Array):
-        self.__init_handle_by_constructor__(ffi.TuplePattern, fields)
-
-    def __getitem__(self, index: int):
-        if index >= len(self):
-            raise IndexError("TuplePattern index out of range")
-        return self.fields[index]
-
-    def __len__(self):
-        return len(self.fields)
-
-    def astype(self, _):
-        raise TypeError("astype cannot be used on TuplePattern")
-
-
-@register_df_node
-class TupleGetItemPattern(DFPattern):
-    """Get index-th item from a TuplePattern.
-
-    Parameters
-    ----------
-    tuple_value: tvm.relay.dataflow_pattern.DFPattern
-        The input tuple expression.
-
-    index: Optional[int]
-        The index to match; Default (None) to match a TupleGetItem with any index.
-    """
-
-    def __init__(self, tuple_value: "DFPattern", index: Optional[int] = None):
-        match_index = index if index is not None else -1
-        self.__init_handle_by_constructor__(ffi.TupleGetItemPattern, tuple_value, match_index)
-
-
-@register_df_node
-class AltPattern(DFPattern):
-    """Create a Pattern that can match one of two conditions
-
-    Parameters
-    ----------
-    left: tvm.relay.dataflow_pattern.DFPattern
-        One possible matching pattern.
-    right: tvm.relay.dataflow_pattern.DFPattern
-        One possible matching pattern.
-    """
-
-    def __init__(self, left: "DFPattern", right: "DFPattern"):
-        self.__init_handle_by_constructor__(ffi.AltPattern, left, right)
-
-
-@register_df_node
-class WildcardPattern(DFPattern):
-    """A pattern which matches anything."""
-
-    def __init__(self):
-        self.__init_handle_by_constructor__(ffi.WildcardPattern)
-
-    def redirect_to(
-        self,
-        pat: "DFPattern",
-    ):
-        """Redirect the WildcardPattern to another pattern
-
-        Parameters
-        ----------
-        pat: relay.dataflow_pattern.DFPattern
-            The pattern that wildcard is redirected to.
-        """
-        ffi.WildcardPattern_redirect_to(self, pat)
-
-
-@register_df_node
-class TypePattern(DFPattern):
-    """A pattern that matches another pattern with a certain type annotation.
-
-    Parameters
-    ----------
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The input pattern that needs type annotation.
-
-    ttype: tvm.ir.type.Type
-        The type to match.
-    """
-
-    def __init__(self, pattern: "DFPattern", ttype: tvm.ir.type.Type):
-        self.__init_handle_by_constructor__(ffi.TypePattern, pattern, ttype)
-
-
-@register_df_node
-class DataTypePattern(DFPattern):
-    """A pattern that matches another pattern with certain data type
-
-    Parameters
-    ----------
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The input pattern that needs type annotation.
-
-    dtype: str
-        The dtype to match.
-    """
-
-    def __init__(self, pattern: "DFPattern", dtype: str):
-        self.__init_handle_by_constructor__(ffi.DataTypePattern, pattern, dtype)
-
-
-@register_df_node
-class ShapePattern(DFPattern):
-    """A pattern that matches another pattern with a certain tensor shape
-
-    Parameters
-    ----------
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The input pattern that needs type annotation.
-
-    shape: List[tvm.ir.PrimExpr]
-        The shape to match.
-    """
-
-    def __init__(self, pattern: "DFPattern", shape: List[tvm.ir.PrimExpr]):
-        self.__init_handle_by_constructor__(ffi.ShapePattern, pattern, shape)
-
-
-@register_df_node
-class AttrPattern(DFPattern):
-    """Get match an expression with a certain attributes.
-    Currently only supports Op Attributes, not call Attributes.
-
-    Parameters
-    ----------
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The input pattern.
-
-    attrs: tvm.ir.attrs.Attrs
-        The attributes to match.
-    """
-
-    def __init__(self, pattern: "DFPattern", attrs: tvm.ir.attrs.Attrs):
-        self.__init_handle_by_constructor__(ffi.AttrPattern, pattern, attrs)
-
-
-@register_df_node
-class DominatorPattern(DFPattern):
-    """Match a domination graph.
-
-    Parameters
-    ----------
-    parent: tvm.relay.dataflow_pattern.DFPattern
-        The parent, i.e., the single node which produces something,
-        later aggregated by the child.
-    path: tvm.relay.dataflow_pattern.DFPattern
-        The fuzzy path pattern between parent and child,
-        typically matches elementwise ops.
-    child: tvm.relay.dataflow_pattern.DFPattern
-        The last node in the domination which is the end user
-        for all nodes in the path and the parent.
-    """
-
-    def __init__(self, parent: "DFPattern", path: "DFPattern", child: "DFPattern"):
-        self.__init_handle_by_constructor__(ffi.DominatorPattern, parent, path, child)
-
-
-class DFPatternCallback:
-    """A Callback for Pattern Rewriting.
-
-    When rewrite is called on this DFPatternCallback, the backend will find matches for the
-    pattern, call the callback function, and replace the matched expression with whatever
-    the callback returns.
-
-    Users are expect to inherit from this class and provide a "self.pattern" to match
-
-    Parameters
-    ----------
-    require_type: bool
-        Whether InferType is required to be run before the callback.
-    rewrite_once: bool
-        If True, run the callback only once.
-    """
-
-    def __init__(self, require_type=False, rewrite_once=False):
-        self.pattern = None
-        self.require_type = require_type
-        self.rewrite_once = rewrite_once
-
-    def rewrite(self, expr: Expr) -> Expr:
-        """
-        Rewrite expression with this callback
-
-        Parameters
-        ----------
-        expr : tvm.relay.Expr
-            The expression to rewrite.
-
-        Returns
-        -------
-        result : tvm.relay.Expr
-            The Expression with matched subgraphs rewritten by the callbacks.
-        """
-        return rewrite(self, expr)
-
-    def callback(self, pre: Expr, post: Expr, node_map: tvm.ir.container.Map) -> Expr:
-        """
-        Callback function to use when we found a match to the pattern
-
-        Parameters
-        ----------
-        pre : tvm.relay.Expr
-            The matching expression from the original graph.
-        post : tvm.relay.Expr
-            The matching expression with rewritten inputs
-        node_map : tvm.ir.container.Map[DFPattern, List[Expr]]
-            The map between patterns and matched expressions
-
-        Returns
-        -------
-        result : tvm.relay.Expr
-            The Expression with matched subgraph rewritten by the callback
-        """
-        raise NotImplementedError()
-
-
-class _DFPatternCallback(Object):
-    """C++ implemenation"""
-
-    def __init__(self, pattern, callback, require_type, rewrite_once):
-        self.__init_handle_by_constructor__(
-            ffi.DFPatternCallback, pattern, callback, require_type, rewrite_once
-        )
-
-
-def rewrite(callbacks, expr: Expr, mod: Optional[_ir.IRModule] = None) -> Expr:
-    """
-    Rewrite expression with the given callbacks.
-
-    Parameters
-    ----------
-    callbacks: tvm.relay.dataflow_pattern.DFPatternCallback
-        The input callback or list of callbacks.
-    expr : tvm.relay.Expr
-        The expression to rewrite.
-    mod : Optional[tvm.ir.IRModule]
-        The module that associates with the expression.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The Expression with matched subgraphs rewritten by the callbacks.
-    """
-    if mod is None:
-        mod = _ir.IRModule()
-    callbacks = [callbacks] if isinstance(callbacks, DFPatternCallback) else callbacks
-    tmp = []
-    for callback in callbacks:
-        assert callback.pattern is not None
-        tmp.append(
-            _DFPatternCallback(
-                callback.pattern, callback.callback, callback.require_type, callback.rewrite_once
-            )
-        )
-
-    return ffi.rewrite(tmp, expr, mod)
-
-
-def partition(
-    pattern: "DFPattern",
-    expr: Expr,
-    attrs: Optional[Dict[str, Object]] = None,
-    check: Callable[[Expr], bool] = lambda x: True,
-) -> Expr:
-    """
-    Parition the expression into a series of functions that match the pattern
-
-    Parameters
-    ----------
-    pattern: tvm.relay.dataflow_pattern.DFPattern
-        The pattern to match
-    expr : tvm.relay.Expr
-        The expression to split into functions
-    attrs : Optional[Dict[str, Object]]
-        A dict of attributes to apply to the partitioned function
-    check : Callable[[Expr], bool]
-        A function to perform more complicated checks on the matched expression.
-        Returns true if partitioning should proceed, false otherwise.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The Expression with matched subgraphs replaced by function calls to that subgraph
-    """
-    return ffi.partition(pattern, expr, attrs, check)
diff --git a/python/tvm/relay/dataflow_pattern/_ffi.py b/python/tvm/relay/dataflow_pattern/_ffi.py
deleted file mode 100644
index b0a702c1d2f5..000000000000
--- a/python/tvm/relay/dataflow_pattern/_ffi.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""DataFlow Pattern Language FFI bindings."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.dataflow_pattern", __name__)
diff --git a/python/tvm/relay/debug.py b/python/tvm/relay/debug.py
deleted file mode 100644
index b52bcdb14926..000000000000
--- a/python/tvm/relay/debug.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name, forgotten-debug-statement
-"""The Relay IR namespace containing the IR definition and compiler."""
-import tvm._ffi
-
-# pylint: disable=unused-argument, import-outside-toplevel
-def _debugger_init(expr, stack):
-    import pdb
-
-    pdb.set_trace()
-
-
-@tvm._ffi.register_func("relay.debug")
-def _debug(*args):
-    import pdb
-
-    pdb.set_trace()
-
-
-# pylint: disable=unused-argument
-@tvm._ffi.register_func("relay.debug_interp")
-def _debug_interp(*args):
-    _, _, _, ist = args
-    print("Relay Debugger")
-    print("  You can manipulate the expression under evaluation with the name `expr`.")
-    print("  You can manipulate the call stack with the name `stack`.")
-    print("--------------")
-    print("--------------")
-    _debugger_init(ist.current_expr, ist.stack)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
deleted file mode 100644
index 5239eaa8830b..000000000000
--- a/python/tvm/relay/expr.py
+++ /dev/null
@@ -1,754 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, invalid-name, unused-import
-"""The expression nodes of Relay."""
-from __future__ import absolute_import
-
-from numbers import Number as _Number
-
-import numpy as _np
-
-import tvm._ffi
-from tvm._ffi import base as _base
-from tvm.ir import GlobalVar, Node, RelayExpr
-from tvm.runtime import NDArray
-from tvm.runtime import ndarray as _nd
-
-from . import _ffi_api
-from . import ty as _ty
-from .base import RelayNode, astext, pretty_print
-
-# alias relay expr as Expr.
-Expr = RelayExpr
-
-# will be registered afterwards
-_op_make = None
-
-
-class ExprWithOp(RelayExpr):
-    """Basetype of all relay expressions that defines op overloading."""
-
-    def astype(self, dtype):
-        """Cast the content type of the current data to dtype.
-
-        Parameters
-        ----------
-        dtype : str
-            The target data type.
-
-        Note
-        ----
-        This function only works for TensorType Exprs.
-
-        Returns
-        -------
-        result : tvm.relay.Expr
-            The result expression.
-        """
-        return _ffi_api.cast(self, dtype)
-
-    def __str__(self):
-        return pretty_print(self)
-
-    def astext(self, show_meta_data=True, annotate=None):
-        """Get the text format of the expression.
-
-        Parameters
-        ----------
-        show_meta_data : bool
-            Whether to include meta data section in the text
-            if there is meta data.
-
-        annotate: Optional[Object->str]
-            Optionally annotate function to provide additional
-            information in the comment block.
-
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
-        Notes
-        -----
-        The meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big (e.g constant weights),
-        so it can be helpful to skip printing the meta data section.
-        """
-        return astext(self, show_meta_data, annotate)
-
-    def __neg__(self):
-        return _op_make.negative(self)
-
-    def __lt__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.less(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __gt__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.greater(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __ge__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.greater_equal(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __le__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.less_equal(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __add__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.add(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __radd__(self, other):
-        return self.__add__(other)
-
-    def __sub__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.subtract(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __rsub__(self, other):
-        if isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        raise TypeError(f"type {type(other)} not supported")
-
-    def __mul__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.multiply(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __rmul__(self, other):
-        return self.__mul__(other)
-
-    def __div__(self, other):
-        if isinstance(other, Expr):
-            return _op_make.divide(self, other)
-        elif isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        else:
-            raise TypeError(f"type {type(other)} not supported")
-
-    def __rdiv__(self, other):
-        if isinstance(other, _Number):
-            raise TypeError(f'convert "{str(other)}" with `const` first')
-        raise TypeError(f"type {type(other)} not supported")
-
-    def __truediv__(self, other):
-        return self.__div__(other)
-
-    def __rtruediv__(self, other):
-        return self.__rdiv__(other)
-
-    def __call__(self, *args):
-        """Call the variable (if it represents a function).
-
-        Parameters
-        ----------
-        args: List[relay.Expr]
-            The arguments to the call.
-
-        Returns
-        -------
-        call: Call
-            A call taking the variable as a function.
-        """
-        return Call(self, args)
-
-
-@tvm._ffi.register_object("relay.Constant")
-class Constant(ExprWithOp):
-    """A constant expression in Relay.
-
-    Parameters
-    ----------
-    data : tvm.nd.NDArray
-        The data content of the constant expression.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, data, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.Constant, data, span)
-
-
-@tvm._ffi.register_func("relay.ConstantWithFields")
-def ConstantWithFields(constant, data=None, virtual_device=None, span=None):
-    """
-    Returns constant with the given properties. A None property denotes 'no change'.
-    Returns constant if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.ConstantWithFields(constant, data, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.Tuple")
-class Tuple(ExprWithOp):
-    """Tuple expression that groups several fields together.
-
-    Parameters
-    ----------
-    fields : List[tvm.relay.Expr]
-        The fields in the tuple.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, fields, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.Tuple, fields, span)
-
-    def __getitem__(self, index):
-        if index >= len(self):
-            raise IndexError("Tuple index out of range")
-        return self.fields[index]
-
-    def __len__(self):
-        return len(self.fields)
-
-    def astype(self, _):
-        raise TypeError("astype cannot be used on tuple")
-
-
-@tvm._ffi.register_func("relay.TupleWithFields")
-def TupleWithFields(tup, fields=None, virtual_device=None, span=None):
-    """
-    Returns tuple with the given properties. A None property denotes 'no change'.
-    Returns tuple if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.TupleWithFields(tup, fields, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.Var")
-class Var(ExprWithOp):
-    """A local variable in Relay.
-
-    Local variable can be used to declare input
-    arguments to a function, or intermediate variables.
-
-    Parameters
-    ----------
-    name_hint: str
-        The name of the variable.
-        This name only acts as a hint, and is not used
-        for equality.
-
-    type_annotation: tvm.relay.Type, optional
-        The type annotation on the variable.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, name_hint, type_annotation=None, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.Var, name_hint, type_annotation, span)
-
-    @property
-    def name_hint(self):
-        """Get name hint of the current var."""
-        name = str(self.vid.name_hint)
-        return name
-
-
-@tvm._ffi.register_func("relay.VarWithFields")
-def VarWithFields(variable, vid=None, type_annotation=None, virtual_device=None, span=None):
-    """
-    Returns var with the given properties. A None property denotes 'no change'.
-    Returns var if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.VarWithFields(variable, vid, type_annotation, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.Call")
-class Call(ExprWithOp):
-    """Function call node in Relay.
-
-    Call node corresponds the operator application node
-    in computational graph terminology.
-
-    Parameters
-    ----------
-    op: tvm.ir.Op or any tvm.relay.Expr with function type.
-        The operation to be called.
-
-    args: List[tvm.relay.Expr]
-        The arguments to the call.
-
-    attrs: Optional[tvm.Attrs]
-        Attributes to the call, can be None
-
-    type_args: Optional[List[tvm.relay.Type]]
-        The additional type arguments, this is only
-        used in advanced usecase of template functions.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, op, args, attrs=None, type_args=None, span=None):
-        if not type_args:
-            type_args = []
-        self.__init_handle_by_constructor__(_ffi_api.Call, op, args, attrs, type_args, span)
-
-
-@tvm._ffi.register_func("relay.CallWithFields")
-def CallWithFields(
-    call, op=None, args=None, attrs=None, type_args=None, virtual_device=None, span=None
-):
-    """
-    Returns call with the given properties. A None property denotes 'no change'.
-    Returns call if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.CallWithFields(call, op, args, attrs, type_args, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.Let")
-class Let(ExprWithOp):
-    """Let variable binding expression.
-
-    Parameters
-    ----------
-    variable: tvm.relay.Var
-        The local variable to be bound.
-
-    value: tvm.relay.Expr
-        The value to be bound.
-
-    body: tvm.relay.Expr
-        The body of the let binding.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, variable, value, body, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.Let, variable, value, body, span)
-
-
-@tvm._ffi.register_func("relay.LetWithFields")
-def LetWithFields(let, variable=None, value=None, body=None, virtual_device=None, span=None):
-    """
-    Returns let with the given properties. A None property denotes 'no change'.
-    Returns let if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.LetWithFields(let, variable, value, body, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.If")
-class If(ExprWithOp):
-    """A conditional expression in Relay.
-
-    Parameters
-    ----------
-    cond: tvm.relay.Expr
-        The condition.
-
-    true_branch: tvm.relay.Expr
-        The expression evaluated when condition is true.
-
-    false_branch: tvm.relay.Expr
-        The expression evaluated when condition is false.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, cond, true_branch, false_branch, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.If, cond, true_branch, false_branch, span)
-
-
-@tvm._ffi.register_func("relay.IfWithFields")
-def IfWithFields(
-    if_expr, cond=None, true_branch=None, false_branch=None, virtual_device=None, span=None
-):
-    """
-    Returns if with the given properties. A None property denotes 'no change'.
-    Returns if if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.IfWithFields(if_expr, cond, true_branch, false_branch, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.TupleGetItem")
-class TupleGetItem(ExprWithOp):
-    """Get index-th item from a tuple.
-
-    Parameters
-    ----------
-    tuple_value: tvm.relay.Expr
-        The input tuple expression.
-
-    index: int
-        The index.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, tuple_value, index, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.TupleGetItem, tuple_value, index, span)
-
-
-@tvm._ffi.register_func("relay.TupleGetItemWithFields")
-def TupleGetItemWithFields(
-    tuple_get_item, tuple_value=None, index=None, virtual_device=None, span=None
-):
-    """
-    Returns tuple_get_item with the given properties. A None property denotes 'no change'.
-    Returns tuple_get_item if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.TupleGetItemWithFields(tuple_get_item, tuple_value, index, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.RefCreate")
-class RefCreate(ExprWithOp):
-    """Create a new reference from initial value.
-    Parameters
-    ----------
-    value: tvm.relay.Expr
-       The initial value.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, value, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.RefCreate, value, span)
-
-
-@tvm._ffi.register_func("relay.RefCreateWithFields")
-def RefCreateWithFields(ref_create, value=None, virtual_device=None, span=None):
-    """
-    Returns ref_create with the given properties. A None property denotes 'no change'.
-    Returns ref_create if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.RefCreateWithFields(ref_create, value, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.RefRead")
-class RefRead(ExprWithOp):
-    """Get the value inside the reference.
-    Parameters
-    ----------
-    ref: tvm.relay.Expr
-         The reference.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, ref, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.RefRead, ref, span)
-
-
-@tvm._ffi.register_func("relay.RefReadWithFields")
-def RefReadWithFields(ref_read, ref=None, virtual_device=None, span=None):
-    """
-    Returns ref_read with the given properties. A None property denotes 'no change'.
-    Returns ref_read if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.RefReadWithFields(ref_read, ref, virtual_device, span)
-
-
-@tvm._ffi.register_object("relay.RefWrite")
-class RefWrite(ExprWithOp):
-    """
-    Update the value inside the reference.
-    The whole expression will evaluate to an empty tuple.
-    Parameters
-    ----------
-    ref: tvm.relay.Expr
-        The reference.
-
-    value: tvm.relay.Expr
-        The new value.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, ref, value, span=None):
-        self.__init_handle_by_constructor__(_ffi_api.RefWrite, ref, value, span)
-
-
-@tvm._ffi.register_func("relay.RefWriteWithFields")
-def RefWriteWithFields(ref_write, ref=None, value=None, virtual_device=None, span=None):
-    """
-    Returns ref_write with the given properties. A None property denotes 'no change'.
-    Returns ref_write if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.RefWriteWithFields(ref_write, ref, value, virtual_device, span)
-
-
-class TempExpr(ExprWithOp):
-    """Baseclass of all TempExpr.
-
-    TempExprs are pass specific expression that can be
-    useful to define intermediate result in the
-    rewriting pass such as layout or type transformation.
-    """
-
-    def realize(self):
-        """Convert the expression to a normal(non-temp) Expr.
-
-        Returns
-        -------
-        The corresponding normal expression.
-        """
-        return _ffi_api.TempExprRealize(self)
-
-
-class TupleWrapper(object):
-    """TupleWrapper.
-
-    This class is a Python wrapper for a Relay tuple of known size.
-    It allows for accessing the fields of the Relay tuple as though
-    it were a Python tuple.
-
-    Parameters
-    ----------
-    tuple_value: tvm.relay.Expr
-        The input tuple
-
-    size: int
-        The size of the tuple.
-    """
-
-    def __init__(self, tuple_value, size):
-        self.tuple_value = tuple_value
-        self.size = size
-
-    def astuple(self):
-        """Returns the underlying Relay tuple if this wrapper is passed
-        as an argument to an FFI function."""
-        return self.tuple_value
-
-    def astext(self):
-        """Get the text format of the tuple expression.
-
-        Returns
-        -------
-        text : str
-            The text format of the tuple expression.
-        """
-        return self.tuple_value.astext()
-
-    def __getitem__(self, index):
-        if index >= len(self):
-            raise IndexError("Tuple index out of range")
-        return TupleGetItem(self.tuple_value, index, span=self.tuple_value.span)
-
-    def __len__(self):
-        return self.size
-
-    def __repr__(self):
-        return "TupleWrapper(" + self.tuple_value.__repr__() + ", " + str(self.size) + ")"
-
-    def astype(self, _):
-        raise TypeError("astype cannot be used on tuple")
-
-
-def var(name_hint, type_annotation=None, shape=None, dtype="float32", span=None):
-    """Create a new tvm.relay.Var.
-
-    This is a simple wrapper function that allows specify
-    shape and dtype directly.
-
-    Parameters
-    ----------
-    name_hint: str
-        The name of the variable.
-        This name only acts as a hint, and is not used
-        for equality.
-
-    type_annotation: Optional[tvm.relay.Type, str]
-        The type annotation on the variable.
-        When type_annotation is a str, we will create a scalar variable.
-
-    shape: Optional[List[tvm.Expr]]
-        The shape of the tensor type.
-
-    dtype: str, optional
-        The data type of the tensor.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-
-    Examples
-    --------
-    .. code-block:: python
-
-      # The following 4 lines are equivalent to each other
-      x = tvm.relay.Var("x", tvm.relay.TensorType([1, 2]))
-      x = tvm.relay.var("x", tvm.relay.TensorType([1, 2]))
-      x = tvm.relay.var("x", shape=[1, 2])
-      x = tvm.relay.var("x", shape=[1, 2], dtype="float32")
-
-      # The following 2 lines are equivalent to each other.
-      y = tvm.relay.var("x", "float32")
-      y = tvm.relay.var("x", shape=(), dtype="float32")
-    """
-    if type_annotation is not None and shape is not None:
-        raise ValueError("Can only specify either type_annotation or shape.")
-    if shape is not None:
-        type_annotation = _ty.TensorType(shape, dtype)
-    elif isinstance(type_annotation, str):
-        type_annotation = _ty.TensorType((), type_annotation)
-    return Var(name_hint, type_annotation, span)
-
-
-def const(value, dtype=None, span=None):
-    """Create a constant value.
-
-    Parameters
-    ----------
-    value: Union[bool, int, float, numpy.ndarray, tvm.nd.NDArray]
-        The constant value.
-
-    dtype: str, optional
-        The data type of the resulting constant.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-
-    Note
-    ----
-    When dtype is None, we use the following rule:
-
-    - int maps to "int32"
-    - float maps to "float32"
-    - bool maps to "bool"
-    - other using the same default rule as numpy.
-    """
-    if isinstance(value, (_base.numeric_types, (bool, list))):
-        value = _np.array(value, dtype=dtype)
-
-    if not dtype:
-        # when dtype is None: int maps to "int32", float maps to "float32"
-        dtype = {_np.dtype("int64"): _np.int32, _np.dtype("float64"): _np.float32}.get(
-            value.dtype, None
-        )
-
-    if isinstance(value, (_np.ndarray, _np.generic)):
-        if dtype is not None:
-            value = value.astype(dtype)
-        value = _nd.array(value)
-
-    if not isinstance(value, _nd.NDArray):
-        raise ValueError("value has to be scalar or NDArray")
-
-    return Constant(value, span)
-
-
-def bind(expr, binds):
-    """Bind an free variables in expr or function arguments.
-
-    We can bind parameters expr if it is a function.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression.
-
-    binds : Map[tvm.relay.Var, tvm.relay.Expr]
-        The specific bindings.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The expression or function after binding.
-    """
-    return _ffi_api.Bind(expr, binds)
-
-
-@tvm._ffi.register_object("relay.StorageInfo")
-class StorageInfo(Node):
-    """StorageInfo
-
-    The static storage information produced by memory planning.
-    Contains the storage ids where expressions are stored, the
-    type of the "virtual devices" the expressions are stored on,
-    and the sizes of each storage element."""
-
-    def __init__(self, sids, dev_types, sizes):
-        self.__init_handle_by_constructor__(_ffi_api.StorageInfo, sids, dev_types, sizes)
-
-    def __str__(self):
-        return pretty_print(self)
-
-    @property
-    def storage_ids(self):
-        return _ffi_api.StorageInfoStorageIds(self)
-
-    @property
-    def device_types(self):
-        return _ffi_api.StorageInfoDeviceTypes(self)
-
-    @property
-    def storage_sizes(self):
-        return _ffi_api.StorageInfoStorageSizes(self)
-
-    @property
-    def virtual_devices(self):
-        return _ffi_api.StorageInfoVirtualDevices(self)
-
-
-@tvm._ffi.register_object("relay.StaticMemoryPlan")
-class StaticMemoryPlan(Node):
-    """StaticMemoryPlan
-
-    The result of static memory planning."""
-
-    def __init__(self, expr_to_storage_info):
-        self.__init_handle_by_constructor__(_ffi_api.StaticMemoryPlan, expr_to_storage_info)
-
-    def __str__(self):
-        return pretty_print(self)
diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py
deleted file mode 100644
index 05e0feb0c354..000000000000
--- a/python/tvm/relay/expr_functor.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-"""The expression functor of Relay."""
-from tvm.ir import Op
-
-from .function import Function, FunctionWithFields
-from .expr import Call, Let, Var, GlobalVar
-from .expr import If, Tuple, TupleGetItem, Constant
-from .expr import RefCreate, RefRead, RefWrite
-from .adt import Constructor, Match, Clause
-
-
-class ExprFunctor:
-    """
-    An abstract visitor defined over Expr.
-
-    Defines the default dispatch over expressions, and
-    implements memoization.
-    """
-
-    def __init__(self):
-        self.memo_map = {}
-
-    # pylint: disable=no-else-return
-    def visit(self, expr):
-        """Apply the visitor to an expression."""
-        if expr in self.memo_map:
-            return self.memo_map[expr]
-
-        if isinstance(expr, Function):
-            res = self.visit_function(expr)
-        elif isinstance(expr, Call):
-            res = self.visit_call(expr)
-        elif isinstance(expr, Let):
-            res = self.visit_let(expr)
-        elif isinstance(expr, Var):
-            res = self.visit_var(expr)
-        elif isinstance(expr, GlobalVar):
-            res = self.visit_global_var(expr)
-        elif isinstance(expr, If):
-            res = self.visit_if(expr)
-        elif isinstance(expr, Tuple):
-            res = self.visit_tuple(expr)
-        elif isinstance(expr, TupleGetItem):
-            res = self.visit_tuple_getitem(expr)
-        elif isinstance(expr, Constant):
-            res = self.visit_constant(expr)
-        elif isinstance(expr, Op):
-            res = self.visit_op(expr)
-        elif isinstance(expr, RefCreate):
-            res = self.visit_ref_create(expr)
-        elif isinstance(expr, RefRead):
-            res = self.visit_ref_read(expr)
-        elif isinstance(expr, RefWrite):
-            res = self.visit_ref_write(expr)
-        elif isinstance(expr, Constructor):
-            res = self.visit_constructor(expr)
-        elif isinstance(expr, Match):
-            res = self.visit_match(expr)
-        else:
-            raise Exception(f"warning unhandled case: {type(expr)}")
-
-        self.memo_map[expr] = res
-
-        return res
-
-    def visit_function(self, _):
-        raise NotImplementedError()
-
-    def visit_let(self, _):
-        raise NotImplementedError()
-
-    def visit_call(self, _):
-        raise NotImplementedError()
-
-    def visit_var(self, _):
-        raise NotImplementedError()
-
-    def visit_type(self, typ):
-        return typ
-
-    def visit_if(self, _):
-        raise NotImplementedError()
-
-    def visit_tuple(self, _):
-        raise NotImplementedError()
-
-    def visit_tuple_getitem(self, _):
-        raise NotImplementedError()
-
-    def visit_global_var(self, _):
-        raise NotImplementedError()
-
-    def visit_op(self, _):
-        raise NotImplementedError()
-
-    def visit_constant(self, _):
-        raise NotImplementedError()
-
-    def visit_ref_create(self, _):
-        raise NotImplementedError()
-
-    def visit_ref_write(self, _):
-        raise NotImplementedError()
-
-    def visit_ref_read(self, _):
-        raise NotImplementedError()
-
-    def visit_constructor(self, _):
-        raise NotImplementedError()
-
-    def visit_match(self, _):
-        raise NotImplementedError()
-
-
-class ExprVisitor(ExprFunctor):
-    """
-    A visitor over Expr.
-
-    The default behavior recursively traverses the AST.
-    """
-
-    def visit_tuple(self, tup):
-        for x in tup.fields:
-            self.visit(x)
-
-    def visit_call(self, call):
-        self.visit(call.op)
-        for a in call.args:
-            self.visit(a)
-
-    def visit_var(self, var):
-        pass
-
-    def visit_let(self, let):
-        self.visit(let.var)
-        self.visit(let.value)
-        self.visit(let.body)
-
-    def visit_function(self, fn):
-        for x in fn.params:
-            self.visit(x)
-        self.visit(fn.body)
-
-    def visit_if(self, i):
-        self.visit(i.cond)
-        self.visit(i.true_branch)
-        self.visit(i.false_branch)
-
-    def visit_global_var(self, gv):
-        pass
-
-    def visit_constructor(self, c):
-        pass
-
-    def visit_op(self, op):
-        pass
-
-    def visit_constant(self, const):
-        pass
-
-    def visit_ref_create(self, r):
-        self.visit(r.value)
-
-    def visit_ref_read(self, r):
-        self.visit(r.ref)
-
-    def visit_ref_write(self, r):
-        self.visit(r.ref)
-        self.visit(r.value)
-
-    def visit_tuple_getitem(self, t):
-        self.visit(t.tuple_value)
-
-    def visit_match(self, m):
-        self.visit(m.data)
-        for c in m.clauses:
-            self.visit(c.rhs)
-
-
-class ExprMutator(ExprFunctor):
-    """
-    A functional visitor over Expr.
-
-    The default behavior recursively traverses the AST
-    and reconstructs the AST.
-    """
-
-    def visit_function(self, fn):
-        new_params = [self.visit(x) for x in fn.params]
-        new_body = self.visit(fn.body)
-        if new_params == list(fn.params) and new_body == fn.body:
-            return fn
-        return FunctionWithFields(fn, list(new_params), new_body)
-
-    def visit_let(self, let):
-        new_var = self.visit(let.var)
-        new_val = self.visit(let.value)
-        new_body = self.visit(let.body)
-        if new_var == let.var and new_val == let.value and new_body == let.body:
-            return let
-        return Let(new_var, new_val, new_body)
-
-    def visit_call(self, call):
-        new_fn = self.visit(call.op)
-        new_args = [self.visit(arg) for arg in call.args]
-        if new_fn == call.op and new_args == list(call.args):
-            return call
-        return Call(new_fn, new_args, call.attrs, call.type_args, call.span)
-
-    def visit_var(self, var):
-        return var
-
-    def visit_global_id(self, global_var):
-        return global_var
-
-    def visit_if(self, ite):
-        new_cond = self.visit(ite.cond)
-        new_true_branch = self.visit(ite.true_branch)
-        new_false_branch = self.visit(ite.false_branch)
-        if (
-            new_cond == ite.cond
-            and new_true_branch == ite.true_branch
-            and new_false_branch == ite.false_branch
-        ):
-            return ite
-        return If(new_cond, new_true_branch, new_false_branch)
-
-    def visit_tuple(self, tup):
-        new_fields = [self.visit(field) for field in tup.fields]
-        if new_fields == list(tup.fields):
-            return tup
-        return Tuple(new_fields, tup.span)
-
-    def visit_tuple_getitem(self, op):
-        new_tuple_value = self.visit(op.tuple_value)
-        if new_tuple_value == op.tuple_value:
-            return op
-        return TupleGetItem(new_tuple_value, op.index, span=op.span)
-
-    def visit_global_var(self, gvar):
-        return gvar
-
-    def visit_op(self, op):
-        return op
-
-    def visit_constant(self, const):
-        return const
-
-    def visit_constructor(self, con):
-        return con
-
-    def visit_match(self, m):
-        new_data = self.visit(m.data)
-        new_clauses = [Clause(c.lhs, self.visit(c.rhs)) for c in m.clauses]
-        if new_data == m.data and all(x.rhs == y.rhs for x, y in zip(new_clauses, m.clauses)):
-            return m
-        return Match(new_data, new_clauses, complete=m.complete)
-
-    def visit_ref_create(self, r):
-        new_value = self.visit(r.value)
-        if new_value == r.value:
-            return r
-        return RefCreate(new_value)
-
-    def visit_ref_write(self, r):
-        new_ref = self.visit(r.ref)
-        new_value = self.visit(r.value)
-        if new_ref == r.ref and new_value == r.value:
-            return r
-        return RefWrite(new_ref, new_value)
-
-    def visit_ref_read(self, r):
-        new_ref = self.visit(r.ref)
-        if new_ref == r.ref:
-            return r
-        return RefRead(new_ref)
diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py
deleted file mode 100644
index fbbd4f99212d..000000000000
--- a/python/tvm/relay/frontend/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Frontends for constructing Relay programs.
-
-Contains the model importers currently defined
-for Relay.
-"""
-from .mxnet import from_mxnet
-from .mxnet_qnn_op_utils import quantize_conv_bias_mkldnn_from_var
-from .keras import from_keras
-from .oneflow import from_oneflow
-from .onnx import from_onnx
-from .tflite import from_tflite
-from .coreml import from_coreml
-from .caffe2 import from_caffe2
-from .tensorflow import from_tensorflow
-from .darknet import from_darknet
-from .pytorch import from_pytorch
-from .caffe import from_caffe
-from .paddlepaddle import from_paddle
-from .change_datatype import ChangeDatatype
diff --git a/python/tvm/relay/frontend/caffe.py b/python/tvm/relay/frontend/caffe.py
deleted file mode 100644
index 708cc3f4f11f..000000000000
--- a/python/tvm/relay/frontend/caffe.py
+++ /dev/null
@@ -1,1001 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name, unused-argument, too-many-lines, import-outside-toplevel
-# pylint: disable=no-else-return, no-else-continue, use-list-literal
-"""Caffe frontend."""
-import numpy as np
-import tvm
-from tvm.ir import IRModule
-
-from ... import nd as _nd
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from .common import ExprTable
-from .common import infer_shape as _infer_shape
-
-__all__ = ["from_caffe"]
-
-
-class OperatorConverter(object):
-    """Operator Converted for converting Caffe ops to Relay ops"""
-
-    def __init__(self, init_layer_dict, predict_layer, exp_tab):
-        self.init_layer_dict = init_layer_dict
-        self.predict_layer = predict_layer
-        self.exp_tab = exp_tab
-        self.new_bn = {}
-        self.changed_layers = None
-
-        self.convert_map = {
-            "BatchNorm": self.convert_batch_norm,
-            "Concat": self.convert_concat,
-            "Convolution": self.convert_conv,
-            "Crop": self.convert_crop,
-            "Deconvolution": self.convert_deconv,
-            "Dropout": self.convert_dropout,
-            "Eltwise": self.convert_eltwise,
-            "Embed": self.convert_embed,
-            "Flatten": self.convert_flatten,
-            "InnerProduct": self.convert_innerproduct,
-            "Input": None,
-            "LRN": self.convert_lrn,
-            "Permute": self.convert_permute,
-            "Pooling": self.convert_pooling,
-            "Power": self.convert_power,
-            "PReLU": self.convert_prelu,
-            "ReLU": self.convert_relu,
-            "Reshape": self.convert_reshape,
-            "Scale": self.convert_scale,
-            "Sigmoid": self.convert_sigmoid,
-            "Slice": self.convert_slice,
-            "Softmax": self.convert_softmax,
-            "TanH": self.convert_tanh,
-            "Reduction": self.convert_reduction,
-        }
-
-    def convert_flatten(self, op):
-        """Convert Flatten layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-
-        flatten_params = op.flatten_param.axis
-        assert flatten_params == 1, "flatten axis should be 1"
-        out = _op.nn.batch_flatten(in_expr)
-
-        return out
-
-    def convert_eltwise(self, op):
-        """Convert Eltwise layer"""
-        inputs = op.bottom
-        assert len(inputs) >= 2, "input tensors length should be larger than 2"
-
-        # gethering initial 2 input expressions
-        lhs_expr = self.exp_tab.get_expr(inputs[0])
-        rhs_expr = self.exp_tab.get_expr(inputs[1])
-        lhs_shape = _infer_shape(lhs_expr)
-        rhs_shape = _infer_shape(rhs_expr)
-        assert lhs_shape == rhs_shape, "input tensors shape should be equal"
-
-        eltwise_params = op.eltwise_param
-        eltwise_type_dict = ["PROD", "SUM", "MAX"]
-        eltwise_type = eltwise_params.operation
-        coeff = list(eltwise_params.coeff)
-
-        if eltwise_type_dict[eltwise_type] == "PROD":
-            out = _op.multiply(lhs_expr, rhs_expr)
-            # for rest inputs
-            for i in range(len(inputs) - 2):
-                extra_expr = self.exp_tab.get_expr(inputs[i + 2])
-                assert _infer_shape(out) == _infer_shape(extra_expr)
-                out = _op.multiply(out, extra_expr)
-        elif eltwise_type_dict[eltwise_type] == "SUM":
-            if coeff:
-                left_coeff_expr = self.exp_tab.new_const(np.asarray(coeff[0], np.float32))
-                right_coeff_expr = self.exp_tab.new_const(np.asarray(coeff[1], np.float32))
-                lhs_expr_scale = _op.multiply(lhs_expr, left_coeff_expr)
-                rhs_expr_scale = _op.multiply(rhs_expr, right_coeff_expr)
-                out = _op.add(lhs_expr_scale, rhs_expr_scale)
-            else:
-                out = _op.add(lhs_expr, rhs_expr)
-            # for rest inputs
-            for i in range(len(inputs) - 2):
-                extra_expr = self.exp_tab.get_expr(inputs[i + 2])
-                assert _infer_shape(out) == _infer_shape(extra_expr)
-                if coeff:
-                    coeff_expr = self.exp_tab.new_const(np.asarray(coeff[i + 2], np.float32))
-                    extra_expr_scale = _op.multiply(extra_expr, coeff_expr)
-                    out = _op.add(out, extra_expr_scale)
-                else:
-                    out = _op.add(out, extra_expr)
-        elif eltwise_type_dict[eltwise_type] == "MAX":
-            out = _op.maximum(lhs_expr, rhs_expr)
-            # for rest inputs
-            for i in range(len(inputs) - 2):
-                extra_expr = self.exp_tab.get_expr(inputs[i + 2])
-                assert _infer_shape(out) == _infer_shape(extra_expr)
-                out = _op.maximum(out, extra_expr)
-        else:
-            raise tvm.error.OpNotImplemented(
-                f"eltwise_type {eltwise_type} is not supported for frontend Caffe."
-            )
-
-        return out
-
-    def _parse_conv_params(self, op):
-        """Parse the parameters of Convolution and Deconvolution layer"""
-        nonzone = lambda val, pos, dflt: val[pos] if pos < len(val) else dflt
-
-        conv_params = op.convolution_param
-
-        params = dict()
-        # parse kernel size
-        if conv_params.kernel_h > 0 or conv_params.kernel_w > 0:
-            params["kernel_size"] = (conv_params.kernel_h, conv_params.kernel_w)
-        else:
-            ksize_h = nonzone(conv_params.kernel_size, 0, 1)
-            ksize_w = nonzone(conv_params.kernel_size, 1, ksize_h)
-            params["kernel_size"] = (ksize_h, ksize_w)
-
-        # parse padding size
-        if conv_params.pad_h > 0 or conv_params.pad_w > 0:
-            params["padding"] = (conv_params.pad_h, conv_params.pad_w)
-        else:
-            pad_h = nonzone(conv_params.pad, 0, 0)
-            pad_w = nonzone(conv_params.pad, 1, pad_h)
-            params["padding"] = (pad_h, pad_w)
-
-        # parse stride size
-        if conv_params.stride_h > 0 or conv_params.stride_w > 0:
-            params["strides"] = (conv_params.stride_h, conv_params.stride_w)
-        else:
-            stride_h = nonzone(conv_params.stride, 0, 1)
-            stride_w = nonzone(conv_params.stride, 1, stride_h)
-            params["strides"] = (stride_h, stride_w)
-
-        # parse dilation size
-        if hasattr(conv_params, "dilation") and len(conv_params.dilation) > 0:
-            dilation = " ".join(str(d) for d in conv_params.dilation)
-            dilation = tuple(map(int, dilation.split(" ")))
-            params["dilation"] = dilation
-            if len(dilation) == 1:
-                params["dilation"] = (dilation[0], dilation[0])
-
-        params["kernel_layout"] = "OIHW"
-        params["data_layout"] = "NCHW"
-        params["groups"] = conv_params.group
-        params["channels"] = conv_params.num_output
-        return params
-
-    def convert_batch_norm(self, op):
-        """Convert BatchNorm layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        n, c, h, w = _infer_shape(in_expr)
-
-        if op.name in self.new_bn:
-            mean, var, eps, gamma, beta = self.new_bn[op.name]
-            mean_expr = self.exp_tab.new_const(mean, dtype="float32")
-            var_expr = self.exp_tab.new_const(var, dtype="float32")
-            gamma_expr = self.exp_tab.new_const(gamma, dtype="float32")
-            beta_expr = self.exp_tab.new_const(beta, dtype="float32")
-            out = _op.nn.batch_norm(
-                in_expr, gamma_expr, beta_expr, mean_expr, var_expr, epsilon=eps, scale=True
-            )
-
-        else:
-            weight_bias_blobs = self.init_layer_dict[op.name].blobs
-            mean = np.asarray(weight_bias_blobs[0].data, np.float32)
-            var = np.asarray(weight_bias_blobs[1].data, np.float32)
-            if len(weight_bias_blobs) == 2:
-                mean = np.repeat(mean, h * w).reshape((c, h, w))
-                mean = np.expand_dims(mean, 0).repeat(n, axis=0)
-                mean_expr = self.exp_tab.new_const(mean, dtype="float32")
-
-                var = np.repeat(var, h * w).reshape((c, h, w))
-                var = np.expand_dims(var, 0).repeat(n, axis=0)
-                var_expr = self.exp_tab.new_const(var, dtype="float32")
-
-                tmp_out = _op.multiply(in_expr, mean_expr)
-                out = _op.add(tmp_out, var_expr)
-
-                return out
-            else:
-                scale = np.asarray(weight_bias_blobs[2].data, np.float32)
-                if scale:
-                    scale = 1 / scale
-            mean_expr = self.exp_tab.new_const(mean * scale, dtype="float32")
-            var_expr = self.exp_tab.new_const(var * scale, dtype="float32")
-
-            # caffe bn layer not support scale
-            gamma_expr = self.exp_tab.new_const(
-                np.ones(mean.shape, dtype=np.float32), dtype="float32"
-            )
-            beta_expr = self.exp_tab.new_const(
-                np.zeros(mean.shape, dtype=np.float32), dtype="float32"
-            )
-
-            bn_params = op.batch_norm_param.eps
-            out = _op.nn.batch_norm(
-                in_expr, gamma_expr, beta_expr, mean_expr, var_expr, epsilon=bn_params, scale=False
-            )
-
-        return out[0]
-
-    def convert_scale(self, op):
-        """Convert Scale layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        weight_bias_blobs = self.init_layer_dict[op.name].blobs
-
-        params = dict()
-        params["bias"] = op.scale_param.bias_term
-        params["axis"] = op.scale_param.axis
-
-        gamma = np.asarray(weight_bias_blobs[0].data, np.float32)
-        gamma_expr = self.exp_tab.new_const(gamma, dtype="float32")
-        if params["bias"]:
-            beta = np.asarray(weight_bias_blobs[1].data, np.float32)
-            beta_expr = self.exp_tab.new_const(beta, dtype="float32")
-        else:
-            beta_expr = self.exp_tab.new_const(
-                np.zeros(gamma.shape, dtype=np.float32), dtype="float32"
-            )
-
-        _, c, _, _ = _infer_shape(in_expr)
-        gamma_expr = _op.reshape(gamma_expr, newshape=(1, c, 1, 1))
-        beta_expr = _op.reshape(beta_expr, newshape=(1, c, 1, 1))
-        out = _op.multiply(in_expr, gamma_expr)
-        out = _op.add(out, beta_expr)
-
-        return out
-
-    def convert_concat(self, op):
-        """Convert Concat layer"""
-        inputs = op.bottom
-        in_expr = (self.exp_tab.get_expr(inputs[i]) for i in range(len(inputs)))
-
-        c_params = dict()
-        c_params["axis"] = op.concat_param.axis
-        out = _op.concatenate(in_expr, axis=c_params["axis"])
-
-        return out
-
-    def convert_reshape(self, op):
-        """Convert Reshape layer"""
-        inputs = op.bottom
-        input_name = inputs[0]
-
-        reshape_param = op.reshape_param
-        dims = list(reshape_param.shape.dim)
-
-        in_expr = self.exp_tab.get_expr(input_name)
-        input_shape = list(_infer_shape(in_expr))
-
-        start_axis = int(reshape_param.axis)
-        if start_axis < 0:
-            start_axis = len(input_shape) + start_axis + 1
-        num_axes = int(reshape_param.num_axes)
-        end_axis = len(input_shape)
-        if num_axes != -1:
-            end_axis = start_axis + num_axes
-
-        left_shape = input_shape[:start_axis]
-        if end_axis == len(input_shape):
-            center_shape = input_shape[start_axis:]
-            right_shape = []
-        else:
-            center_shape = input_shape[start_axis:end_axis]
-            right_shape = input_shape[end_axis:]
-
-        for idx, dim in enumerate(dims):
-            if dim == 0:
-                dims[idx] = center_shape[idx]
-
-        tmp = np.random.rand(*center_shape)
-        tmp = np.reshape(tmp, dims)
-        center_shape = list(tmp.shape)
-
-        newshape = left_shape + center_shape + right_shape
-
-        out = _op.reshape(in_expr, newshape=newshape)
-        return out
-
-    def convert_softmax(self, op):
-        """Convert Softmax layer"""
-        inputs = op.bottom
-        assert len(inputs) == 1, "input tensors length should be 1"
-
-        input_name = inputs[0]
-        in_expr = self.exp_tab.get_expr(input_name)
-
-        softmax_param = op.softmax_param
-        parmas = {"axis": softmax_param.axis}
-
-        out = _op.nn.softmax(in_expr, **parmas)
-
-        return out
-
-    def convert_conv(self, op):
-        """Convert Convolution layer"""
-        params = self._parse_conv_params(op)
-        weight_bias_blobs = self.init_layer_dict[op.name].blobs
-        conv_params = op.convolution_param
-        inputs = op.bottom
-        # process weight and bias blobs
-        weight, bias = None, None
-        if len(weight_bias_blobs) > 1:
-            weight = weight_bias_blobs[0]
-            bias = weight_bias_blobs[1]
-        else:
-            weight = weight_bias_blobs[0]
-        if weight:
-            kh, kw = params["kernel_size"]
-            weight_shape = [conv_params.num_output, -1, kh, kw]
-            weight_value = np.asarray(weight.data, np.float32)
-            weight_value = np.reshape(weight_value, weight_shape)
-        else:
-            raise Exception(f"No weight value of layer {op.name} in caffemodel")
-
-        weight_expr = self.exp_tab.new_const(weight_value, dtype="float32")
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        out = _op.nn.conv2d(data=in_expr, weight=weight_expr, **params)
-        if bias:
-            bias_value = np.asarray(bias.data, np.float32)
-            bias_expr = self.exp_tab.new_const(bias_value, dtype="float32")
-            out = _op.nn.bias_add(out, bias_expr)
-        return out
-
-    def convert_pooling(self, op):
-        """Convert Pooling layer"""
-        inputs = op.bottom
-        input_name = inputs[0]
-
-        pool_params = op.pooling_param
-        pool_type_dict = ["MAX", "AVE", "STOCHASTIC"]
-
-        params = dict()
-        # parse pool type: 0: MAX, 1: AVE, 2: STOCHASTIC
-        pool_type = pool_params.pool
-        # parse kernel size
-        if pool_params.kernel_h > 0 or pool_params.kernel_w > 0:
-            params["pool_size"] = (pool_params.kernel_h, pool_params.kernel_w)
-        else:
-            params["pool_size"] = (pool_params.kernel_size, pool_params.kernel_size)
-
-        # parse padding size
-        if pool_params.pad_h > 0 or pool_params.pad_w > 0:
-            params["padding"] = (pool_params.pad_h, pool_params.pad_w)
-        else:
-            params["padding"] = (pool_params.pad, pool_params.pad)
-
-        # parse stride size
-        if pool_params.stride_h > 0 or pool_params.stride_w > 0:
-            params["strides"] = (pool_params.stride_h, pool_params.stride_w)
-        else:
-            params["strides"] = (pool_params.stride, pool_params.stride)
-
-        params["ceil_mode"] = True
-        if hasattr(pool_params, "round_mode"):
-            params["ceil_mode"] = pool_params.round_mode == "CEIL"
-
-        in_expr = self.exp_tab.get_expr(input_name)
-
-        if pool_type_dict[pool_type] == "MAX":
-            if pool_params.global_pooling:
-                out = _op.nn.global_max_pool2d(in_expr)
-            else:
-                if len(op.top) == 1:
-                    out = _op.nn.max_pool2d(in_expr, **params)
-                elif len(op.top) == 2:
-                    out1 = _op.nn.max_pool2d_with_argmax(in_expr, **params)
-                    out2 = _op.vision.max_pool2d_location(in_expr, **params)
-                    return _expr.Tuple((out1, out2))
-
-        elif pool_type_dict[pool_type] == "AVE":  # AVE
-            if pool_params.global_pooling:
-                out = _op.nn.global_avg_pool2d(in_expr)
-            else:
-                params["count_include_pad"] = True
-                out = _op.nn.avg_pool2d(in_expr, **params)
-        else:
-            raise tvm.error.OpNotImplemented(
-                f"Operator {pool_type_dict[pool_type]} pool is not supported for frontend Caffe."
-            )
-
-        return out
-
-    def convert_lrn(self, op):
-        """Convert LRN layer"""
-        inputs = op.bottom
-        input_name = inputs[0]
-
-        params = dict()
-        lrn_params = op.lrn_param
-        params["size"] = lrn_params.local_size
-        params["bias"] = lrn_params.k
-        params["alpha"] = lrn_params.alpha
-        params["beta"] = lrn_params.beta
-
-        in_expr = self.exp_tab.get_expr(input_name)
-        out = _op.nn.lrn(in_expr, **params)
-        return out
-
-    def convert_innerproduct(self, op):
-        """Convert InnerProduct layer"""
-        inputs = op.bottom
-        weight_bias_blobs = self.init_layer_dict[op.name].blobs
-        dense_params = op.inner_product_param
-
-        params = dict()
-        params["num_output"] = dense_params.num_output
-        params["bias"] = dense_params.bias_term
-        params["axis"] = dense_params.axis
-        if params["axis"] != 1:
-            raise Exception("Only support 2D InnerProduct")
-
-        # process weight and bias blobs
-        weight, bias = None, None
-        if params["bias"]:
-            weight = weight_bias_blobs[0]
-            bias = weight_bias_blobs[1]
-        else:
-            weight = weight_bias_blobs[0]
-
-        if weight:
-            weight_value = np.asarray(weight.data, np.float32)
-            weight_value = np.reshape(weight_value, (params["num_output"], -1))
-            weight_shape = weight_value.shape
-        else:
-            raise Exception(f"No weight value of layer {op.name} in caffemodel")
-
-        weight_expr = self.exp_tab.new_const(weight_value, dtype="float32")
-
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        in_reshape = _op.reshape(data=in_expr, newshape=(-1, weight_shape[-1]))
-
-        out = _op.nn.dense(data=in_reshape, weight=weight_expr)
-
-        if bias:
-            bias_value = np.asarray(bias.data, np.float32)
-            bias_expr = self.exp_tab.new_const(bias_value, dtype="float32")
-            out = _op.nn.bias_add(out, bias_expr, axis=params["axis"])
-        return out
-
-    def convert_dropout(self, op):
-        """Convert Dropout layer"""
-        inputs = op.bottom
-        input_name = inputs[0]
-
-        params = dict()
-        dropout_params = op.dropout_param
-
-        params["rate"] = dropout_params.dropout_ratio
-
-        in_expr = self.exp_tab.get_expr(input_name)
-        out = _op.nn.dropout(in_expr, **params)
-        return out
-
-    def convert_relu(self, op):
-        """Convert ReLU layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        negative_slope = op.relu_param.negative_slope
-        if negative_slope:
-            out = _op.nn.leaky_relu(in_expr, negative_slope)
-            return out
-
-        out = _op.nn.relu(in_expr)
-        return out
-
-    def convert_prelu(self, op):
-        """Convert PReLU layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-
-        alpha = self.init_layer_dict[op.name].blobs[0].data
-        alpha = np.asarray(alpha, np.float32)
-        alpha = self.exp_tab.new_const(alpha, dtype="float32")
-        axis = 1
-        out = _op.nn.prelu(in_expr, alpha, axis=axis)
-        return out
-
-    def convert_deconv(self, op):
-        """Convert Deconvolution layer"""
-        params = self._parse_conv_params(op)
-        weight_bias_blobs = self.init_layer_dict[op.name].blobs
-        conv_params = op.convolution_param
-        inputs = op.bottom
-
-        # process weight and bias blobs
-        weight, bias = None, None
-        if len(weight_bias_blobs) > 1:
-            weight = weight_bias_blobs[0]
-            bias = weight_bias_blobs[1]
-        else:
-            weight = weight_bias_blobs[0]
-        if weight:
-            kh, kw = params["kernel_size"]
-            weight_shape = [-1, conv_params.num_output, kh, kw]
-            if not weight.data:
-                if conv_params.weight_filler:
-                    _filler = conv_params.weight_filler.value
-                    weight_value = np.full(weight.shape.dim, _filler, np.float32)
-                else:
-                    raise tvm.error.OpAttributeInvalid("At least weight_filler must be given")
-            else:
-                weight_value = np.asarray(weight.data, np.float32)
-            weight_value = np.reshape(weight_value, weight_shape)
-
-            # weight shape is in relay's IOHW format rn, we need it to be OIHW
-            weight_value = np.transpose(weight_value, [1, 0, 2, 3])
-        else:
-            raise tvm.error.OpAttributeRequired(f"No weight value of layer {op.name} in caffemodel")
-
-        weight_expr = self.exp_tab.new_const(weight_value, dtype="float32")
-        in_expr = self.exp_tab.get_expr(inputs[0])
-
-        groups = params["groups"]
-        channels = params["channels"]
-
-        if bias:
-            bias_value = np.asarray(bias.data, np.float32)
-            bias_expr = self.exp_tab.new_const(bias_value, dtype="float32")
-
-        if groups > channels:
-            raise tvm.error.OpAttributeInvalid(
-                "Groups cannot be larger than the number of input channels"
-            )
-
-        if groups == channels:
-            inputs_expr = _op.split(in_expr, groups, axis=1)
-            # changing split axis to 0, according to PR #9336
-            weights_expr = _op.split(weight_expr, groups, axis=0)
-            # Preventing to create Concat layer with too many tensors(> 16)
-            q = groups >> 4
-            r = groups % 16
-
-            params["groups"] = 1
-            params["channels"] = 1
-            out = []
-            for lc in range(q):
-                _outputs = []
-                _inputs = [inputs_expr[i] for i in range(lc << 4, (lc << 4) + 16)]
-                _weights = [weights_expr[i] for i in range(lc << 4, (lc << 4) + 16)]
-                for (i, w) in zip(_inputs, _weights):
-                    _out = _op.nn.conv2d_transpose(data=i, weight=w, **params)
-                    if bias:
-                        _out = _op.nn.bias_add(_out, bias_expr)
-                    _outputs.append(_out)
-                out.append(_op.concatenate(_outputs, axis=1))
-            if r != 0:
-                _outputs = []
-                _inputs = [inputs_expr[i] for i in range(groups - r, groups)]
-                _weights = [weights_expr[i] for i in range(groups - r, groups)]
-                for (i, w) in zip(_inputs, _weights):
-                    _out = _op.nn.conv2d_transpose(data=i, weight=w, **params)
-                    if bias:
-                        _out = _op.nn.bias_add(_out, bias_expr)
-                    _outputs.append(_out)
-                out.append(_op.concatenate(_outputs, axis=1))
-            out = _op.concatenate(out, axis=1)
-        elif groups == 1:
-            out = _op.nn.conv2d_transpose(data=in_expr, weight=weight_expr, **params)
-            if bias:
-                out = _op.nn.bias_add(out, bias_expr)
-        else:
-            raise tvm.error.OpAttributeInvalid("Unable to handle.")
-        return out
-
-    def convert_slice(self, op):
-        """Convert Slice layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-
-        output_num = len(op.top)
-
-        slice_params = op.slice_param
-        axis = int(slice_params.axis)
-        indices_or_sections = list([int(s) for s in slice_params.slice_point])
-        if len(indices_or_sections) == 0:
-            indices_or_sections = output_num
-        else:
-            indices_or_sections = sorted(indices_or_sections)
-
-        out = _op.split(in_expr, indices_or_sections=indices_or_sections, axis=axis)
-        return out
-
-    def convert_sigmoid(self, op):
-        """Convert Sigmoid layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        out = _op.sigmoid(in_expr)
-        return out
-
-    def convert_tanh(self, op):
-        """Convert TanH layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        out = _op.tanh(in_expr)
-        return out
-
-    def convert_reduction(self, op):
-        """Convert Reduction layer"""
-        reduction_dic = ["NOP", "SUM", "ASUM", "SUMSQ", "MEAN"]
-
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        method = op.reduction_param.operation
-        axis = op.reduction_param.axis
-        coeff = op.reduction_param.coeff
-        coeff_expr = self.exp_tab.new_const(np.asarray(coeff, np.float32))
-        num_axes = len(_infer_shape(in_expr))
-
-        # Currently, only reduction along ALL "tail" axes is supported in Caffe;
-        # reduction of axis M through N, where N < num_axes - 1, is unsupported.
-        if 0 < axis < (num_axes - 1):
-            for _axis in reversed(range(axis + 1, num_axes)):
-                in_expr = _op.sum(in_expr, axis=_axis)
-            in_expr = _op.squeeze(in_expr)
-
-        if reduction_dic[method] == "SUM":
-            out = _op.sum(in_expr, axis=axis)
-        elif reduction_dic[method] == "MEAN":
-            out = _op.mean(in_expr, axis=axis)
-        elif reduction_dic[method] == "ASUM":
-            in_expr = _op.abs(in_expr)
-            out = _op.sum(in_expr, axis=axis)
-        elif reduction_dic[method] == "SUMSQ":
-            in_expr = _op.multiply(in_expr, in_expr)
-            out = _op.sum(in_expr, axis=axis)
-        else:
-            raise tvm.error.OpAttributeInvalid(
-                f"reduction method:{method} is invalid in Caffe frontend."
-            )
-
-        if float(coeff) != 1.0:
-            out = _op.multiply(out, coeff_expr)
-        return out
-
-    def convert_crop(self, op):
-        """Convert Crop layer"""
-        inputs = op.bottom
-        assert len(inputs) == 2, "Need two inputs of Crop layer"
-        in_expr_a = self.exp_tab.get_expr(inputs[0])
-        in_expr_b = self.exp_tab.get_expr(inputs[1])
-
-        # parse crop params
-        crop_params = op.crop_param
-        axis = int(getattr(crop_params, "axis", 2))
-        offset = list(getattr(crop_params, "offset", 0))
-
-        # expand offset to (offset1, offset2, ...)
-        in_a_shape = _infer_shape(in_expr_a)
-        num_to_crop = len(in_a_shape) - axis
-        if not offset:
-            offset = [0] * num_to_crop
-        if len(offset) == 1:
-            offset = offset * num_to_crop
-        elif len(offset) != num_to_crop:
-            raise tvm.error.OpAttributeInvalid("No matching the number between axis and offset!")
-
-        slice_end = in_a_shape
-        slice_start = [0] * len(in_a_shape)
-        for i in range(num_to_crop):
-            slice_start[i + axis] = offset[i]
-
-        to_crop_axis = list(range(len(in_a_shape)))
-        to_crop_axis = to_crop_axis[axis:]
-
-        # secondly, crop in_expr_a by in_expr_b
-        in_expr_a_stride = _op.strided_slice(in_expr_a, slice_start, slice_end)
-        out = _op.slice_like(in_expr_a_stride, in_expr_b, axes=to_crop_axis)
-        return out
-
-    def convert_permute(self, op):
-        """Convert Permute layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-
-        # parse permute params
-        permute_param = op.permute_param
-        axes = list(getattr(permute_param, "order", 0))
-        out = _op.transpose(in_expr, axes)
-        return out
-
-    def convert_embed(self, op):
-        """Convert Embed layer"""
-        inputs = op.bottom
-        embed_param = op.embed_param
-        num_output = embed_param.num_output
-        input_dim = embed_param.input_dim
-        bias_term = embed_param.bias_term
-        weight_bias_blobs = self.init_layer_dict[op.name].blobs
-        weight, bias = None, None
-        if bias_term:
-            weight = weight_bias_blobs[0]
-            bias = weight_bias_blobs[1]
-            assert weight and bias
-        else:
-            weight = weight_bias_blobs[0]
-            assert weight
-        weight_value = np.asarray(weight.data, np.float32)
-        weight_value = np.reshape(weight_value, [input_dim, num_output])
-        weight_expr = self.exp_tab.new_const(weight_value, dtype="float32")
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        input_shape = _infer_shape(in_expr)
-        input_count = 1
-        for dim in input_shape:
-            input_count *= dim
-
-        index = _op.cast(in_expr, "int32")
-        out = _op.take(weight_expr, index, axis=0)
-
-        if bias_term:
-            bias_value = np.asarray(bias.data, np.float32)
-            bias_expr = self.exp_tab.new_const(bias_value, dtype="float32")
-            out = _op.reshape(out, [input_count, num_output])
-            out = _op.add(out, bias_expr)
-
-        out_shape = list(input_shape)
-        out_shape.append(num_output)
-        out = _op.reshape(out, out_shape)
-
-        return out
-
-    def convert_power(self, op):
-        """Convert Power layer"""
-        inputs = op.bottom
-        in_expr = self.exp_tab.get_expr(inputs[0])
-        power = _expr.const(op.power_param.power)
-        scale = _expr.const(op.power_param.scale)
-        shift = _expr.const(op.power_param.shift)
-
-        out = _op.multiply(in_expr, scale)
-        out = _op.add(out, shift)
-        out = _op.power(out, power)
-        return out
-
-    def check_unsupported_ops(self):
-        """Check unsupported Caffe ops in our converter."""
-        unsupported_ops_set = set()
-
-        include_layer = dict()
-        for pl in self.predict_layer:
-            if pl.type not in include_layer:
-                include_layer[pl.type] = 1
-            else:
-                include_layer[pl.type] = include_layer[pl.type] + 1
-
-        for pl in self.predict_layer:
-            op_name = pl.type
-            if op_name not in self.convert_map:
-                unsupported_ops_set.add(op_name)
-
-        if unsupported_ops_set:
-            msg = "The following operators are not supported in frontend " "Caffe: {}"
-            ops = str(list(unsupported_ops_set)).strip("[,]")
-            raise tvm.error.OpNotImplemented(msg.format(ops))
-
-    def fuse_op(self, layers):
-        """Fusing the BatchNorm and Scale layer"""
-        bn, scale = layers["bn"], layers["scale"]
-
-        # bn params
-        bn_weight_bias_blobs = self.init_layer_dict[bn.name].blobs
-        bn_scale = np.asarray(bn_weight_bias_blobs[2].data, np.float32)
-        if bn_scale:
-            bn_scale = 1 / bn_scale
-        bn_mean = np.asarray(bn_weight_bias_blobs[0].data, np.float32) * bn_scale
-        bn_var = np.asarray(bn_weight_bias_blobs[1].data, np.float32) * bn_scale
-        bn_eps = bn.batch_norm_param.eps
-
-        # scale params
-        scale_weight_bias_blobs = self.init_layer_dict[scale.name].blobs
-        scale_gamma = np.asarray(scale_weight_bias_blobs[0].data, np.float32)
-        scale_bias = scale.scale_param.bias_term
-        if scale_bias:
-            scale_beta = np.asarray(scale_weight_bias_blobs[1].data, np.float32)
-        else:
-            scale_beta = np.zeros(scale_gamma.shape, dtype=np.float32)
-
-        # new params
-        self.new_bn[bn.name] = [bn_mean, bn_var, bn_eps, scale_gamma, scale_beta]
-        return bn
-
-    def op_fuse(self):
-        """fuse bn and scale"""
-        new_layers = []
-        temp_layers = {}
-        changed_layers = {}
-
-        for index, pl in enumerate(self.predict_layer):
-            op_type = pl.type
-            if op_type == "Input":
-                new_layers.append(pl)
-                continue
-            elif op_type == "BatchNorm":
-                if (index != len(self.predict_layer) - 1) and (
-                    self.predict_layer[index + 1].type == "Scale"
-                ):
-                    temp_layers["bn"] = pl
-                    continue
-                else:
-                    new_layers.append(pl)
-                    temp_layers.clear()
-            elif op_type == "Scale":
-                if self.predict_layer[index - 1].type == "BatchNorm":
-                    temp_layers["scale"] = pl
-                else:
-                    new_layers.append(pl)
-                    temp_layers.clear()
-            else:
-                temp_layers.clear()
-
-            if len(temp_layers) == 2:
-                layer = self.fuse_op(temp_layers)
-                new_layers.append(layer)
-                changed_layers[temp_layers["scale"].name] = temp_layers["bn"].name
-
-            for idx, plt in enumerate(pl.bottom):
-                if plt in changed_layers:
-                    pl.bottom[idx] = changed_layers[plt]
-
-            if op_type not in ["BatchNorm", "Scale"]:
-                new_layers.append(pl)
-
-        self.predict_layer = new_layers
-        self.changed_layers = changed_layers
-
-    def convert_op_to_relay(self):
-        """Convert Caffe ops to relay ops"""
-        for pl in self.predict_layer:
-            op_type = pl.type
-            if op_type == "Input":
-                continue
-            output_tensors = pl.top
-
-            ret = self.convert_map[op_type](pl)
-
-            if len(output_tensors) == 1:
-                self.exp_tab.set_expr(output_tensors[0], ret)
-            else:
-                for idx, output_tensor in enumerate(output_tensors):
-                    self.exp_tab.set_expr(output_tensor, ret[idx])
-
-
-def _rebuild_layers(predict_layer):
-    """Rebuild caffe layer. If the caffe net include in-place layers, repalce its top
-    with its name and update the bottom of other layer that is related to it.
-    """
-    # dict of input name that will be changed to new name
-    changed_top_dict = dict()
-
-    for pl in predict_layer:
-        if pl.type == "Input":
-            continue
-        # if current layer has single input and output and input equals to output
-        # it means that the layer does "in-place"
-        if len(pl.top) == 1 and len(pl.bottom) == 1:
-            if pl.top[0] == pl.bottom[0]:
-                # change current layer's input firstly
-                if pl.bottom[0] in changed_top_dict:
-                    pl.bottom[0] = changed_top_dict[pl.bottom[0]]
-                # update "change" dict
-                changed_top_dict[pl.top[0]] = pl.name
-                # change current layer's output to its name
-                pl.top[0] = pl.name
-            else:
-                if pl.bottom[0] in changed_top_dict:
-                    pl.bottom[0] = changed_top_dict[pl.bottom[0]]
-        # if the layer does not
-        else:
-            for index, plt in enumerate(pl.bottom):
-                if plt in changed_top_dict:
-                    pl.bottom[index] = changed_top_dict[plt]
-
-
-def _get_inputs_outputs(predict_layer):
-    """Obtain Caffe model's inputs and outpus"""
-    # model inputs / outputs
-    model_inputs = list()
-    model_outputs = list()
-
-    # The bottoms of every layer can not be as outputs
-    not_outputs = set()
-    for pl in predict_layer:
-        if pl.type == "Input":
-            assert len(pl.top) == 1, "The number of Input layer's output is more than 1."
-            model_inputs.append(pl.top[0])
-        for i in pl.bottom:
-            not_outputs.add(i)
-
-    for pl in predict_layer:
-        if len(pl.bottom) > 0:
-            for t in pl.top:
-                if t not in not_outputs:
-                    model_outputs.append(t)
-    return model_inputs, model_outputs
-
-
-def from_caffe(init_net, predict_net, shape_dict, dtype_dict):
-    """Convert from caffe model into compatible relay Function.
-
-    Parameters
-    ----------
-    init_net : caffe_pb2.NetParameter
-        caffemodel
-    predict_net : caffe_pb2.NetParameter
-        caffe prototxt
-    shape_dict : dict of str to int list/tuple
-        Input shapes of the model.
-    dtype_dict : dict of str to str
-        Input types of the model.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation.
-
-    params : dict of str to tvm.NDArray
-        The parameter dict to be used by relay
-    """
-    old_caffe = False
-    if len(predict_net.input) != 0:  # old caffe version
-        old_caffe = True
-        model_inputs = list(predict_net.input)
-
-    predict_layer = predict_net.layer
-
-    # replace layer's top with its name and update other layers'bottoms
-    _rebuild_layers(predict_layer)
-    # obtain inputs and outputs of Net
-    if old_caffe:
-        _, model_outputs = _get_inputs_outputs(predict_layer)
-    else:
-        model_inputs, model_outputs = _get_inputs_outputs(predict_layer)
-
-    exp_tab = ExprTable()
-    for in_name in model_inputs:
-        shape = shape_dict[in_name] if in_name in shape_dict else None
-        dtype = dtype_dict[in_name] if in_name in dtype_dict else "float32"
-        exp_tab.set_expr(in_name, _expr.var(in_name, shape=shape, dtype=dtype))
-    if list(init_net.layer):
-        init_layer = init_net.layer
-    else:
-        init_layer = init_net.layers
-    init_layer_dict = {il.name: il for il in init_layer}
-    # op code in model
-    op_converter = OperatorConverter(init_layer_dict, predict_layer, exp_tab)
-    op_converter.check_unsupported_ops()
-    op_converter.op_fuse()
-    op_converter.convert_op_to_relay()
-
-    # params and outputs
-    params = {k: _nd.array(np.array(v)) for k, v in exp_tab.params.items()}
-    outputs = list()
-    for n in model_outputs:
-        if n in op_converter.changed_layers:
-            n = op_converter.changed_layers[n]
-        outputs.append(exp_tab.get_expr(n))
-    outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-    func = _function.Function(analysis.free_vars(outputs), outputs)
-    mod = IRModule.from_expr(func)
-
-    return mod, params
diff --git a/python/tvm/relay/frontend/caffe2.py b/python/tvm/relay/frontend/caffe2.py
deleted file mode 100644
index e59aad255a80..000000000000
--- a/python/tvm/relay/frontend/caffe2.py
+++ /dev/null
@@ -1,604 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, invalid-name, line-too-long, unused-argument
-"""Caffe2 frontend"""
-import tvm
-from tvm.ir import IRModule
-
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from ... import nd as _nd
-from .common import AttrCvt, Renamer
-from .common import get_relay_op, new_var, infer_channels
-
-__all__ = ["from_caffe2"]
-
-
-def dimension_picker(prefix, surfix=""):
-    def _impl(attr):
-        kernel = attr["kernel_shape"]
-        if len(kernel) == 2:
-            return prefix + "2d" + surfix
-        raise tvm.error.OpAttributeUnImplemented(
-            f"Non-2D kernels are not supported for operator {prefix}2d"
-        )
-
-    return _impl
-
-
-def revert_caffe2_pad(pads):
-    """Caffe2 requires two times the normal padding."""
-    if len(pads) == 4:
-        pads = pads[:2]
-    elif len(pads) == 2:
-        pass
-    else:
-        raise tvm.error.OpAttributeInvalid("Number of pads must equal 2 or 4.")
-    return pads
-
-
-def dimension_constraint():
-    def _dim_check(args):
-        if len(args["kernel_shape"]) == 2:
-            return True
-        return False
-
-    return _dim_check, "Only 2d kernel supported."
-
-
-def _clean_up_pool_args(args):
-    """A helper function to clean up common arguments in conv and pooling ops."""
-    assert isinstance(args, dict)
-
-    if "stride_h" in args and "stride_w" in args:
-        assert "stride" not in args and "strides" not in args
-        args["strides"] = [args["stride_h"], args["stride_w"]]
-        args.pop("stride_h")
-        args.pop("stride_w")
-    elif "stride" in args:
-        args["strides"] = [args["stride"], args["stride"]]
-        args.pop("stride")
-
-    # rename 'kernel', 'kernels', to 'kernel_shape'
-    if "kernel_h" in args and "kernel_w" in args:
-        assert "kernel" not in args and "kernels" not in args
-        args["kernel_shape"] = [args["kernel_h"], args["kernel_w"]]
-        args.pop("kernel_h")
-        args.pop("kernel_w")
-    elif "kernel" in args:
-        args["kernel_shape"] = [args["kernel"], args["kernel"]]
-        args.pop("kernel")
-    elif "kernels" in args:
-        args["kernel_shape"] = args["kernels"]
-        args.pop("kernels")
-
-    if "pad_t" in args and "pad_l" in args and "pad_b" in args and "pad_r" in args:
-        assert "pad" not in args and "pads" not in args
-        args["pads"] = [args["pad_t"], args["pad_l"], args["pad_b"], args["pad_r"]]
-        for pad in ["pad_t", "pad_l", "pad_b", "pad_r"]:
-            args.pop(pad)
-    elif "pad" in args:
-        args["pads"] = [args["pad"], args["pad"]]
-        args.pop("pad")
-
-    if "dilation_h" in args and "dilation_w" in args:
-        assert "dilation" not in args and "dilations" not in args
-        args["dilations"] = [args["dilation_h"], args["dilation_w"]]
-        args.pop("dilation_h")
-        args.pop("dilation_w")
-    elif "dilation" in args:
-        args["dilations"] = [args["dilation"], args["dilation"]]
-        args.pop("dilation")
-
-    return args
-
-
-class Caffe2OpConverter(object):
-    """A helper class for holding Caffe2 op converters."""
-
-    @classmethod
-    def get_converter(cls):
-        """Get converter.
-
-        :return: converter, which should be `_impl`.
-        """
-
-        if hasattr(cls, "_impl"):
-            return getattr(cls, "_impl")
-        raise tvm.error.OpNotImplemented(
-            f"Operator {cls.__name__} is not supported in frontend Caffe2."
-        )
-
-
-_caffe2_internal_args = [
-    # nnpack args
-    "algo",
-    "convolution_transform_strategy",
-    "float16_compute",
-    "shared_buffer",
-    # training args
-    "init_params",
-    "cudnn_exhaustive_search",
-    "exhaustive_search",
-    # training args
-    "adj",
-    "hwgq",
-    # args that we don't care
-    "legacy_pad",
-]
-
-
-class Elemwise(Caffe2OpConverter):
-    """A helper class for elemwise op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        assert len(inputs) == 2, f"Math op take 2 inputs, {len(inputs)} given"
-        op_name = cls.name
-        conv_ops = ["conv2d", "conv2d_transpose"]
-        if args.get("broadcast", 0) and any(x in str(inputs[0]) for x in conv_ops):
-            # TODO(zhreshold): remove hard coded infershape
-            axis = int(args.get("axis", 0))
-            inputs[1] = _op.expand_dims(inputs[1], axis=axis, num_newaxis=2)
-        return get_relay_op(op_name)(*inputs)
-
-
-class Add(Elemwise):
-    """Operator converter for Add."""
-
-    name = "add"
-
-
-class Mul(Elemwise):
-    """Operator converter for Mul."""
-
-    name = "multiply"
-
-
-class Pool(Caffe2OpConverter):
-    """A helper class for pool op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        _clean_up_pool_args(args)
-        if "global_pooling" in args and args["global_pooling"] == 1:
-            op_name = dimension_picker("global_" + cls.name)
-            return get_relay_op(op_name(args))(*inputs)
-
-        return AttrCvt(
-            op_name=dimension_picker(cls.name),
-            transforms={
-                "kernel_shape": "pool_size",
-                "pads": ("padding", (0, 0), revert_caffe2_pad),
-                "strides": "strides",
-            },
-            ignores=["dilations", "order", "legacy_pad", "global_pooling"],
-            extras={"ceil_mode": False},
-            custom_check=dimension_constraint(),
-        )(inputs, args, params)
-
-
-class AveragePool(Pool):
-    name = "avg_pool"
-
-
-class MaxPool(Pool):
-    name = "max_pool"
-
-
-class Conv(Caffe2OpConverter):
-    """Operator converter for Conv."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        # get number of channels
-        channels = infer_channels(inputs[1])
-        args["channels"] = channels
-        _clean_up_pool_args(args)
-        out = AttrCvt(
-            op_name=dimension_picker("conv"),
-            transforms={
-                "group": ("groups", 1),
-                "kernel_shape": "kernel_size",
-                "pads": ("padding", (0, 0), revert_caffe2_pad),
-                "strides": "strides",
-                "dilations": ("dilation", (1, 1)),
-                "order": (
-                    "data_layout",
-                    ("NCHW"),
-                    lambda x: x if isinstance(x, str) else x.decode("UTF-8"),
-                ),
-            },
-            excludes=[],
-            ignores=_caffe2_internal_args,
-            custom_check=dimension_constraint(),
-        )(inputs[:2], args, params)
-        use_bias = len(inputs) == 3
-        if use_bias:
-            out = _op.nn.bias_add(out, inputs[2])
-        return out
-
-
-class ConvTranspose(Caffe2OpConverter):
-    """Operator converter for ConvTranspose."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        # get number of channels
-        channels = infer_channels(inputs[1], True)
-        args["channels"] = channels
-        _clean_up_pool_args(args)
-        out = AttrCvt(
-            op_name=dimension_picker("conv", "_transpose"),
-            transforms={
-                "kernel_shape": "kernel_size",
-                "pads": ("padding", (0, 0), revert_caffe2_pad),
-                "dilations": ("dilation", (1, 1)),
-                "order": (
-                    "data_layout",
-                    ("NCHW"),
-                    lambda x: x if isinstance(x, str) else x.decode("UTF-8"),
-                ),
-            },
-            excludes=[],
-            ignores=_caffe2_internal_args,
-            custom_check=dimension_constraint(),
-        )(inputs[:2], args, params)
-        use_bias = len(inputs) == 3
-        if use_bias:
-            out = _op.nn.bias_add(out, inputs[2])
-        return out
-
-
-class Concat(Caffe2OpConverter):
-    """Operator converter for Concat."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        def _get_axis_from_order_str(order):
-            order = order if isinstance(order, str) else order.decode("UTF-8")
-            if order == "NCHW":
-                return 1
-            if order == "NHWC":
-                return 3
-            raise tvm.error.OpAttributeUnImplemented(
-                f"Order {order} is not supported in operator Concat."
-            )
-
-        return AttrCvt(
-            op_name="concatenate",
-            transforms={"order": ("axis", (1), _get_axis_from_order_str)},
-            excludes=["add_axis"],
-        )((inputs,), args, params)
-
-
-class NormalizePlanarYUV(Caffe2OpConverter):
-    """Operator converter for NormalizePlanarYUV.
-    caffe2 definition: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/norm_planar_yuv_op.cc
-    """
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        assert len(inputs) == 3
-        mean = _op.expand_dims(inputs[1], axis=2, num_newaxis=2)
-        std = _op.expand_dims(inputs[2], axis=2, num_newaxis=2)
-
-        return _op.divide(_op.subtract(inputs[0], mean), std)
-
-
-class ResizeNearest(Caffe2OpConverter):
-    """Operator converter for Upsample (nearest mode)."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        width_scale = args["width_scale"] if "width_scale" in args else 1
-        height_scale = args["height_scale"] if "height_scale" in args else 1
-        assert width_scale == height_scale
-
-        return _op.nn.upsampling(
-            inputs[0], scale_h=int(width_scale), scale_w=int(width_scale), method="NEAREST_NEIGHBOR"
-        )
-
-
-class Sum(Caffe2OpConverter):
-    """Operator converter for Sum."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        # Sum Operator
-        for in_index in range(len(inputs) - 1):
-            inputs[in_index + 1] = _op.add(inputs[in_index], inputs[in_index + 1])
-
-        return inputs[len(inputs) - 1]
-
-
-class Softmax(Caffe2OpConverter):
-    """Operator converter for Softmax."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        # set default value when axis is not set in the model
-        if "axis" not in args:
-            args["axis"] = 1
-        return AttrCvt("softmax", transforms={"axis": ("axis", args["axis"])})(inputs, args, params)
-
-
-class FC(Caffe2OpConverter):
-    """Operator converter for FC."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        inputs[0] = _op.nn.batch_flatten(inputs[0])
-        units = infer_channels(inputs[1])
-        res = _op.nn.dense(inputs[0], inputs[1], units=units)
-        use_bias = len(inputs) == 3
-        if use_bias:
-            res = _op.nn.bias_add(res, inputs[2])
-        return res
-
-
-class SpatialBN(Caffe2OpConverter):
-    """Operator converter for SpatialBN."""
-
-    @classmethod
-    def _impl(cls, inputs, args, params):
-        return AttrCvt(
-            op_name="batch_norm",
-            disables=["momentum"],
-            ignores=["order", "spatial", "is_test", "consumed_inputs", "num_batches"],
-        )(inputs, args, params)
-
-
-# compatible operators that do NOT require any conversion.
-_identity_list = []
-
-# _convert_map defines maps of name to converter functor(callable)
-# for 1 to 1 mapping, use Renamer if nothing but name is different
-# use AttrCvt if attributes need to be converted
-# for 1 to N mapping(composed), use custom callable functions
-# for N to 1 mapping, currently not supported(?)
-
-# Minimal set of ops for squeezenet and resnet50
-def _get_convert_map():
-    return {
-        # caffe2 common operators
-        "Add": Add.get_converter(),
-        "Sum": Sum.get_converter(),
-        "Mul": Mul.get_converter(),
-        "Softmax": Softmax.get_converter(),
-        # nn
-        "AveragePool": AveragePool.get_converter(),
-        "MaxPool": MaxPool.get_converter(),
-        "Conv": Conv.get_converter(),
-        "ConvTranspose": ConvTranspose.get_converter(),
-        "Concat": Concat.get_converter(),
-        "FC": FC.get_converter(),
-        "SpatialBN": SpatialBN.get_converter(),
-        "ResizeNearest": ResizeNearest.get_converter(),
-        "Relu": AttrCvt("relu", {}, ignores=["order"]),
-        "Sigmoid": Renamer("sigmoid"),
-        "Dropout": AttrCvt("dropout", {"ratio": "rate"}, ignores=["is_test"]),
-        # c2 image preprocessing ops
-        "NormalizePlanarYUV": NormalizePlanarYUV.get_converter(),
-    }
-
-
-class Caffe2NetDef(object):
-    """A helper class for handling Relay expression copying from pb2.GraphProto.
-    Definition: https://github.com/pytorch/pytorch/blob/master/caffe2/proto/caffe2.proto
-    """
-
-    def __init__(self, shape, dtype):
-        self._nodes = {}
-        self._params = {}
-        self._visited_nodes = set()
-        self._ops = {}
-        self._shape = shape
-        self._dtype = dtype
-        self._mod = IRModule({})
-
-    def from_caffe2(self, init_net, predict_net):
-        """Construct Relay expression from caffe2 graph.
-
-        Parameters
-        ----------
-        init_net : protobuf object
-        predict_net : protobuf object
-
-        Returns
-        -------
-        mod : tvm.IRModule
-            The module that optimizations will be performed on.
-
-        params : dict
-            A dict of name: tvm.nd.array pairs, used as pretrained weights
-        """
-        # pylint: disable=import-outside-toplevel
-        from caffe2.python import workspace
-
-        workspace.RunNetOnce(init_net)
-
-        # Input
-        input_name = predict_net.op[0].input[0]
-
-        # Params
-        self._params = {}
-        used_blobs = set()
-        for c2_op in predict_net.op:
-            for i in c2_op.input:
-                used_blobs.add(i)
-        for blob in workspace.Blobs():
-            if blob in used_blobs and blob != input_name:
-                self._params[blob] = _nd.array(workspace.FetchBlob(blob))
-
-        # Variables
-        self._nodes = {}
-        for blob in predict_net.external_input:
-            if blob in self._params:
-                self._nodes[blob] = new_var(
-                    blob, shape=self._params[blob].shape, dtype=self._params[blob].dtype
-                )
-            else:
-                shape = self._shape[blob] if blob in self._shape else ()
-                if isinstance(self._dtype, dict) and blob in self._dtype:
-                    dtype = str(self._dtype[blob])
-                elif isinstance(self._dtype, str):
-                    dtype = self._dtype
-                else:
-                    dtype = "float32"
-                self._nodes[blob] = new_var(blob, shape=shape, dtype=dtype)
-
-        # Ops
-        for c2_op in predict_net.op:
-            for blob in c2_op.output:
-                self._ops[blob] = c2_op
-
-        for c2_op in predict_net.op:
-            self._process_op(c2_op)
-
-        # Outputs
-        out = []
-        for blob in predict_net.external_output:
-            out.append(self._nodes[blob])
-
-        if len(out) > 1:
-            outputs = _expr.Tuple(out)
-        else:
-            outputs = out[0]
-
-        func = _function.Function(analysis.free_vars(outputs), outputs)
-        self._mod["main"] = func
-
-        return self._mod, self._params
-
-    def _get_node(self, blob):
-        """Get the Symbol of blob and detect cyclic dependency in the graph."""
-        if blob in self._nodes:
-            return self._nodes[blob]
-
-        assert blob not in self._visited_nodes, f"Cyclic dependency in the graph (in {blob})"
-        self._visited_nodes.add(blob)
-
-        self._process_op(self._ops[blob])
-        return self._nodes[blob]
-
-    def _process_op(self, c2_op):
-        op_type = c2_op.type
-        args = self._parse_arg(c2_op.arg)
-        inputs = [self._get_node(i) for i in c2_op.input]
-        tvm_op = self._convert_operator(op_type, inputs, args)
-
-        if not isinstance(tvm_op, _expr.TupleWrapper):
-            self._nodes[c2_op.output[0]] = tvm_op
-        else:
-            for k, i in zip(list(c2_op.output), range(len(tvm_op))):
-                self._nodes[k] = tvm_op[i]
-
-    def _parse_arg(self, arg):
-        """Convert a list of Argument to a dict, with names as keys."""
-        args = {}
-        for a in arg:
-            for f in ["f", "i", "s"]:
-                if a.HasField(f):
-                    args[a.name] = getattr(a, f)
-            for f in ["floats", "ints", "strings"]:
-                if list(getattr(a, f)):
-                    assert a.name not in args, "Only one type of attr is allowed"
-                    args[a.name] = tuple(getattr(a, f))
-            for f in ["n"]:
-                if a.HasField(f):
-                    raise NotImplementedError(f"Field {f} is not supported in relay.")
-            for f in ["nets"]:
-                if list(getattr(a, f)):
-                    raise NotImplementedError(f"Field {f} is not supported in relay.")
-            if a.name not in args:
-                raise ValueError(f"Cannot parse attribute: \n{a}\n.")
-        return args
-
-    def _convert_operator(self, op_type, inputs, args, identity_list=None, convert_map=None):
-        """Convert from Caffe2 operator to Relay operator.
-        The converter must specify conversions explicitly for incompatible name, and
-        apply handlers to operator attributes.
-
-        Parameters
-        ----------
-        op_type : str
-            Operator name, such as Convolution, FullyConnected
-        inputs : list of tvm.relay.function.Function
-            List of input inputs.
-        args : dict
-            Dict of operator attributes
-        identity_list : list
-            List of operators that don't require conversion
-        convert_map : dict
-            Dict of name : callable, where name is the op's name that
-            require conversion to relay, callable are functions which
-            take args and return (new_op_type, new_args)
-
-        Returns
-        -------
-        func : tvm.relay.function.Function
-            Converted relay function
-        """
-        identity_list = identity_list if identity_list else _identity_list
-        convert_map = convert_map if convert_map else _get_convert_map()
-        if op_type in identity_list:
-            func = get_relay_op(op_type)(*inputs, **args)
-        elif op_type in convert_map:
-            # Add a sanitizing step to convert all byte strings in args to strings
-            func = convert_map[op_type](inputs, args, self._params)
-        else:
-            raise tvm.error.OpNotImplemented(
-                f"Operator {op_type} is not supported in frontend Caffe2."
-            )
-        return func
-
-
-def from_caffe2(init_net, predict_net, shape=None, dtype="float32"):
-    """Load caffe2 graph which contains init_net and predict_net into Relay Function.
-
-    Parameters
-    ----------
-    init_net : protobuf object
-        Caffe2 NetDef containing the weights
-
-    predict_net : protobuf object
-        Caffe2 NetDef containing the graph
-
-    shape : dict of str to tuple
-        The input shape to the graph
-
-    dtype : str or dict of str to str
-        The input types to the graph
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The module that optimizations will be performed on.
-
-    params : dict of str to tvm.nd.NDArray
-        Dict of converted parameters stored in tvm.nd.NDArray format
-    """
-
-    caffe2 = Caffe2NetDef(shape, dtype)
-    return caffe2.from_caffe2(init_net, predict_net)
diff --git a/python/tvm/relay/frontend/change_datatype.py b/python/tvm/relay/frontend/change_datatype.py
deleted file mode 100644
index 1873b3461e3e..000000000000
--- a/python/tvm/relay/frontend/change_datatype.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""Change Datatype Pass"""
-from ..function import Function
-from ..expr_functor import ExprMutator
-from ..transform.transform import function_pass
-from ..expr import var, bind
-
-
-@function_pass(opt_level=0)
-class ChangeDatatype(ExprMutator):
-    """Mutator for changing the datatype of Relay programs.
-
-    This pass should be useful for users of the Bring Your Own Datatypes
-    framework.
-    TODO(@gussmith23 @hypercubestart) Add link to documentation when it exists
-
-    Example:
-
-    .. code-block:: python
-
-        from tvm.relay.testing.inception_v3 import get_workload
-        mod, params = get_workload()
-
-        def change_dtype(mod, params, src, dst):
-            mod = ChangeDatatype(src, dst)(mod)
-            params = dict((p, tvm.nd.array(params[p].numpy().astype(dst))) for p in params)
-            return mod, params
-
-        mod, params = change_dtype(mod, params, "float32", "custom[posites2]32")
-
-    Parameters
-    ----------
-    src : String
-        The source datatype name, e.g. "float" or "posites2" (but not "float32"
-        or "custom[posites2]32").
-    dst : String
-        The destination datatype name, in the same format.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        Module where all nodes of dtype `src` have been changed to have dtype
-        `dst`.
-    """
-
-    def __init__(self, src, dst):
-        self.src = src
-        self.dst = dst
-        super().__init__()
-
-    def transform_function(self, func, mod, ctx):
-        return self.visit(func)
-
-    def visit_constant(self, const):
-        if const.data.dtype == self.src:
-            return const.astype(self.dst)
-        return const
-
-    def visit_function(self, fn):
-        new_params = []
-        binds = {}
-
-        for param in fn.params:
-            # Get the parameter's type annotation.
-            var_type = param.type_annotation
-
-            # See if we want to replace dtype.
-            if var_type.dtype == self.src:
-                dtype = self.dst
-            else:
-                dtype = var_type.dtype
-
-            # Generate new variable.
-            new_param = var(param.name_hint, shape=var_type.shape, dtype=dtype)
-
-            new_params.append(new_param)
-            binds[param] = new_param
-
-        new_body = self.visit(fn.body)
-        # Rewrite the body to use new parameters.
-        new_body = bind(new_body, binds)
-
-        # Construct the updated function and return.
-        return Function(
-            new_params,
-            new_body,
-            # You could change the return type, if you use None it will re-infer.
-            None,
-            type_params=fn.type_params,
-            attrs=fn.attrs,
-        )
diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
deleted file mode 100644
index 0433d3b52ebf..000000000000
--- a/python/tvm/relay/frontend/common.py
+++ /dev/null
@@ -1,1217 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=broad-except
-"""Common utilities"""
-from __future__ import absolute_import as _abs
-import logging
-import numpy as np
-
-import tvm
-from tvm.ir import IRModule
-from tvm.topi.utils import get_const_tuple
-
-from ..expr_functor import ExprMutator
-from .. import expr as _expr
-from .. import function as _function
-from .. import transform as _transform
-from .. import op as _op
-from .. import ty as _ty
-from .. import analysis
-
-
-class DuplicateFilter:
-    """A log filter that only prints the same message once."""
-
-    def __init__(self):
-        self.msgs = set()
-
-    def filter(self, record):
-        self.msgs.add(record.msg)
-        return record.msg not in self.msgs
-
-
-# pylint: disable=invalid-name
-logger = logging.getLogger("Frontend")
-logger.addFilter(DuplicateFilter())
-# Uncomment below line to print all debug msgs
-# logger.setLevel(logging.DEBUG)
-
-
-class RequiredAttr(object):
-    """Dummpy class to represent required attr"""
-
-
-class StrAttrsDict(object):
-    """Helper class to parse attrs stored as Dict[str, str].
-
-    Parameters
-    ----------
-    attrs : Dict[str, str]
-        The attributes to be used.
-    """
-
-    def __init__(self, attrs):
-        self.attrs = attrs
-
-    def has_attr(self, key):
-        """Checks if a attribute is present in the map.
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        Returns
-        -------
-        bool : True if the key is present in the attributes else false.
-        """
-        return key in self.attrs
-
-    def get_float(self, key, default=RequiredAttr()):
-        """Get float attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            return float(self.attrs[key])
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_int(self, key, default=RequiredAttr()):
-        """Get int attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            val = self.attrs[key]
-            if val == "None":
-                return None
-            return int(val)
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_str(self, key, default=RequiredAttr()):
-        """Get str attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            return self.attrs[key]
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_int_tuple(self, key, default=RequiredAttr()):
-        """Get int tuple attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            tshape = self.attrs[key]
-            return tuple(
-                int(x) if x.strip("- ").isdigit() else None
-                for x in tshape.strip("()[]").split(",")
-                if x
-            )
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_float_tuple(self, key, default=RequiredAttr()):
-        """Get float tuple attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-
-        if key in self.attrs:
-            tshape = self.attrs[key]
-            return tuple(float(x.strip()) for x in tshape.strip("()[]").split(","))
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_tuple_tuple_int(self, key, default=RequiredAttr()):
-        """Get int list attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            value = self.attrs[key]
-            seq = []
-            for tup in value.strip("()").split("),"):
-                tup = tup.strip("[]()")
-                els = [int(x.strip("( ")) for x in tup.split(",")]
-                seq.append(tuple(els))
-
-            return tuple(seq)
-
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_int_list(self, key, default=RequiredAttr()):
-        """Get int list attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            tshape = self.attrs[key]
-            return tuple(int(x.strip()) for x in tshape.strip("[]()").split(","))
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-    def get_bool(self, key, default=RequiredAttr()):
-        """Get bool tuple attribute
-
-        Parameters
-        ----------
-        key : str
-            The attribute key
-
-        default : float
-            The default value.
-
-        Returns
-        -------
-        value : The result
-        """
-        if key in self.attrs:
-            val = self.attrs[key]
-            return val.strip().lower() in ["true", "1", "t", "y", "yes"]
-        if isinstance(default, RequiredAttr):
-            raise AttributeError(f"Required attribute {key} not found.")
-        return default
-
-
-def get_relay_op(op_name):
-    """Get the callable function from Relay based on operator name.
-    Parameters
-    ----------
-    op_name : str
-        The Relay operator name.
-    """
-    if "." in op_name:
-        # explicit hierarchical modules
-        op = _op
-        try:
-            for opn in op_name.split("."):
-                op = getattr(op, opn)
-        except AttributeError:
-            op = None
-    else:
-        # try search op in various modules
-        for candidate in (_op, _op.nn, _op.image, _op.vision, _op.contrib):
-            op = getattr(candidate, op_name, None)
-            if op is not None:
-                break
-    if not op:
-        raise tvm.error.OpNotImplemented(f"Unable to map op_name {op_name} to relay")
-    return op
-
-
-class ExprTable(object):
-    """Table storing Relay expressions by names."""
-
-    def __init__(self):
-        self.exprs = {}
-        self.params = {}
-        self.const_ctr = 1
-        self.in_padding = False
-
-    def new_const(self, value, shape=None, dtype="float32", source_name=None):
-        """Construct a new var expr and add to exprs dictionary"""
-        name = f"_param_{self.const_ctr}"
-        if hasattr(value, "shape"):
-            shape = value.shape
-        self.const_ctr += 1
-        self.params[name] = value
-        self.exprs[name] = _expr.var(name_hint=name, shape=shape, dtype=dtype)
-        if source_name:
-            self.exprs[name] = set_span(self.exprs[name], source_name)
-        return self.exprs[name]
-
-    def get_expr(self, name):
-        return self.exprs[name]
-
-    def set_expr(self, name, expr, force_override=False):
-        assert isinstance(expr, _expr.Expr)
-        # if name exists, we should override the value
-        # otherwise, we can not get like x = func(x) work.
-        # One example is CoreML preprocess, which will override
-        # the same name of input.
-        # However, according to git log, Find keras frontend depends
-        # on this property, so we add one force_override to control it.
-        if name not in self.exprs or force_override:
-            self.exprs[name] = expr
-
-    def has_expr(self, name):
-        return name in self.exprs
-
-    def set_padding(self, paddings):
-        self.paddings = paddings
-        self.in_padding = True
-
-    def clear_padding(self):
-        self.in_padding = False
-
-
-class AttrCvt(object):
-    """Common attribute converter. An AttrConverter instance is a callable:
-    ```
-    attr_converter = AttrConverter(op_name, transforms={'a':'b', 'c':('d', 1)})
-    new_op_name, new_attr = attr_converter(attrs)
-    ```
-
-    Parameters
-    ----------
-    op_name : str or callable
-        If set as str, returned operator name is the str.
-        If set as callable, returned operator is the str returned by calling:
-        `op_name = func(attr)`
-
-    transforms : dict of `new_name, or (new_name, default_value, transform function)`
-        If only a new_name is provided, it's like renaming the attribute name.
-        If default_value if provided, then the attribute is considered as optional.
-        If transform function is provided, the original attribute value is handled
-        by transform function.
-
-    excludes : list
-        A list of excluded attributes that should `NOT` appear.
-        Raise NotImplementedError if occurred.
-
-    disables : list
-        A list of attributes that is disabled in relay. Log warnings.
-
-    ignores : list
-        A list of attributes that is ignored in relay. Debug level logging.
-
-    extras : dict
-        A series of additional attributes should be added anyway to the returned
-        attribute dict.
-
-    custom_check : callable
-        A custom function takes attribute, and return True/False.
-        Raise RuntimeError if not bool(True) returned.
-    """
-
-    def __init__(
-        self,
-        op_name,
-        transforms=None,
-        excludes=None,
-        disables=None,
-        ignores=None,
-        extras=None,
-        custom_check=None,
-    ):
-        self._op_name = op_name
-        self._transforms = transforms if transforms else {}
-        self._excludes = excludes if excludes else []
-        self._disables = disables if disables else []
-        self._ignores = ignores if ignores else []
-        self._extras = extras if extras else {}
-        self._custom_check = custom_check
-
-    def __call__(self, inputs, attrs, *args):
-        self._ignores.append("_output_shapes")
-        self._ignores.append("_input_shapes")
-        self._ignores.append("T")
-        self._ignores.append("use_cudnn_on_gpu")
-        self._ignores.append("_node_name")
-        self._ignores.append("is_training")
-        self._ignores.append("_target_layout")
-
-        # apply custom check
-        if self._custom_check:
-            func, msg = self._custom_check
-            if not func(attrs):
-                raise RuntimeError(f"Check failed: {msg}")
-        # get new op_name
-        if isinstance(self._op_name, str):
-            op_name = self._op_name
-        else:
-            assert callable(self._op_name), "op_name can either be string or callable"
-            op_name = self._op_name(attrs)
-
-        # ignore 'tvm_custom' always
-        self._ignores.append("tvm_custom")
-
-        # convert attributes
-        new_attrs = {}
-        for k in attrs.keys():
-            if k in self._excludes:
-                raise NotImplementedError(
-                    "Attribute %s in operator %s is not" + " supported.", k, op_name
-                )
-            if k in self._disables:
-                logger.debug("Attribute %s is disabled in relay.sym.%s", k, op_name)
-            elif k in self._ignores:
-                if k != "tvm_custom":
-                    logger.debug("Attribute %s is ignored in relay.sym.%s", k, op_name)
-            elif k in self._transforms:
-                new_name, defaults, transform = self._parse_default(self._transforms[k])
-                if defaults is None:
-                    new_attr = self._required_attr(attrs, k)
-                else:
-                    new_attr = attrs.get(k, None)
-                if new_attr is None:
-                    new_attrs[new_name] = defaults
-                else:
-                    new_attrs[new_name] = transform(new_attr)
-            else:
-                # copy
-                new_attrs[k] = attrs[k]
-        # add extras
-        new_attrs.update(self._extras)
-        return get_relay_op(op_name)(*inputs, **new_attrs)
-
-    def _parse_default(self, target):
-        """Helper function to parse default values."""
-        if not isinstance(target, (list, tuple)):
-            k, v, t = target, None, lambda x: x
-        elif len(target) == 1:
-            k, v, t = target[0], None, lambda x: x
-        elif len(target) == 2:
-            k, v, t = target[0], target[1], lambda x: x
-        elif len(target) > 2:
-            k, v, t = target[0], target[1], target[2]
-        else:
-            k = None  # should raise
-        if not isinstance(k, str):
-            msg = f"{target} is not a valid target, (name, default) expected."
-            raise ValueError(msg)
-        return k, v, t
-
-    def _parse_bool(self, value):
-        """Helper function to parse default boolean values."""
-        if isinstance(value, str):
-            return value.strip().lower() in ["true", "1", "t", "y", "yes"]
-        return bool(value)
-
-    def _required_attr(self, attr, key):
-        """Wrapper for getting required attributes."""
-        assert isinstance(attr, dict)
-        if key not in attr:
-            raise AttributeError(f"Required attribute {key} not found.")
-        return attr[key]
-
-
-def get_name(node):
-    name = ""
-    if hasattr(node, "name_hint"):
-        name = node.name_hint
-    return name
-
-
-def infer_type(node, mod=None):
-    """A method to infer the type of an intermediate node in the relay graph."""
-    if isinstance(mod, IRModule):
-        mod["main"] = _function.Function(tvm.relay.analysis.free_vars(node), node)
-        mod = _transform.InferType()(mod)
-        entry = mod["main"]
-        ret = entry.body
-    else:
-        new_mod = IRModule.from_expr(node)
-        if mod is not None:
-            new_mod.update(mod)
-
-        new_mod = _transform.InferType()(new_mod)
-        entry = new_mod["main"]
-        ret = entry if isinstance(node, _function.Function) else entry.body
-
-    return ret
-
-
-def fold_constant(node, mod=None):
-    if mod is None:
-        mod = IRModule()
-    return _transform.FoldConstantExpr(node, mod)
-
-
-def infer_channels(inputs, transpose=False):
-    """A hack for getting 'channels' or 'units' since caffe2 does not provide
-    these attributes. We check the shape of weights provided to get the number.
-    """
-    out_type = infer_type(inputs)
-    out_shapes = [get_const_tuple(out_type.checked_type.shape)]
-    channels = out_shapes[0][0] if not transpose else out_shapes[0][1]
-    return channels
-
-
-def infer_shape(inputs, mod=None):
-    """A method to get the output type of an intermediate node in the graph."""
-    out_type = infer_type(inputs, mod=mod)
-    checked_type = out_type.checked_type
-    if hasattr(checked_type, "shape"):
-        # Regular operator that outputs tensors
-        return get_const_tuple(checked_type.shape)
-    # The return type is not a tensor, for example List
-    return checked_type
-
-
-def infer_value(input_val, params, mod=None):
-    """A hack for getting the value of an expression by evaluating a
-    portion of the relay graph. This is often needed for functions that
-    whose output shape depends on the value of a tensor.
-    """
-    # Check that all free variables have associated parameters.
-    assert all(
-        var.name_hint in params.keys() for var in analysis.free_vars(input_val)
-    ), "All inputs to infer must be available in params."
-    assert tvm.runtime.enabled("llvm"), "LLVM must be enabled to infer value."
-    try:
-        # TODO(kevinthesun): Use VM for all cases.
-        # pylint: disable=import-outside-toplevel
-        from tvm.contrib import graph_executor
-
-        func = _function.Function(analysis.free_vars(input_val), input_val)
-        with tvm.transform.PassContext(opt_level=0):
-            lib = tvm.relay.build(func, target="llvm", params=params)
-        dev = tvm.cpu(0)
-        m = graph_executor.GraphModule(lib["default"](dev))
-        m.run()
-        return m.get_output(0)
-    except Exception:
-        if isinstance(mod, IRModule):
-            mod["main"] = _function.Function(analysis.free_vars(input_val), input_val)
-        else:
-            mod = IRModule.from_expr(input_val)
-        inputs = []
-        for param in mod["main"].params:
-            inputs.append(params[param.name_hint])
-        result = tvm.relay.create_executor(
-            "debug", mod=mod, device=tvm.cpu(), target="llvm"
-        ).evaluate()(*inputs)
-        return result
-
-
-def infer_value_simulated(input_val, params):
-    """Extension to infer_value that can be used when some input
-    values are missing. This function creates dummy inputs with the same
-    shape and random values then calls infer_value. This is helpful when
-    implementing certain onnx operators where we need to evaluate the graph
-    to determine a static shape.
-    """
-    fake_params = []
-    # Add a fake copy of all missing params.
-    for free_param in analysis.free_vars(input_val):
-        if free_param.name_hint not in params:
-            fp_dtype = free_param.type_annotation.dtype
-            fp_shape = [s.value for s in free_param.type_annotation.shape]
-            fake_params.append(free_param)
-            params[free_param.name_hint] = tvm.nd.array(np.random.rand(*fp_shape).astype(fp_dtype))
-    # Now infer the value.
-    output_value = infer_value(input_val, params)
-    # Clean fake params out of param dictionary.
-    for fake_p in fake_params:
-        params.pop(fake_p.name_hint, None)
-    return output_value
-
-
-def try_infer_value(val, on_success=None, on_failure=None, parameters=None):
-    """Try running infer_value on the input val, and if successful, return the inferred value or
-    pass it to on_success callback if provided. Otherwise, run on_failure callback if it is
-    provided, or return the input val as output. In each case, the second return value
-    indicates whether infer_value has succeeded or not.
-    """
-    try:
-        params = parameters if parameters is not None else {}
-        ret = infer_value(val, params).numpy()
-        if on_success:
-            return on_success(ret), True
-        return ret, True
-    except Exception:
-        if on_failure:
-            return on_failure(), False
-        return val, False
-
-
-def shape_of(x, dtype="int64", start=None, end=None):
-    """Get shape of a tensor."""
-
-    ttype = infer_type(x).checked_type
-    if not _ty.is_dynamic(ttype):
-        shape = list(ttype.shape)
-        start = start or 0  # default to first
-        end = end or len(shape)  # default to last
-        shape_sliced = shape[start:end]
-        return _expr.const(shape_sliced, dtype)
-    return _op.shape_of(x, dtype)
-
-
-def new_var(name_hint, type_annotation=None, shape=None, dtype="float32"):
-    return _expr.var(name_hint, type_annotation, shape, dtype)
-
-
-class Renamer(object):
-    """A simply renamer for operators.
-
-    Parameters
-    ----------
-    new_name : str
-        The new name for the operator
-    """
-
-    def __init__(self, new_name):
-        self._new_name = new_name
-
-    def __call__(self, inputs, attrs, *args):
-        if "tvm_custom" in attrs:
-            attrs.pop("tvm_custom")
-        return get_relay_op(self._new_name)(*inputs, **attrs)
-
-
-def to_int_list(np_array):
-    """Convert a np array to a python int list.
-
-    Note: This function converts np.int32 to python's int.
-    If we don't do this conversion, numpy's automatic upcast will make
-    the shape / parameters be converted to int64 IntImm in relay and
-    cause problems in relay/TOPI.
-    """
-    return [int(x) for x in np_array]
-
-
-def unbind(data, axis=0):
-    """
-    Unbind was taken from Pytorch frontend. The operation removes a tensor dimension
-    and returns a tuple of all slices along a given dimension, with specified axis removed.
-    TODO (vvchernov): It needs such operation on relay side to reduce time consumption
-    on squeeze operation.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        Input tensor
-    axis : int
-        Axis along which tensor is split.
-    Returns
-    -------
-    result : List[relay.Expr]
-        The sequence of computed tensors
-    """
-    shape = infer_shape(data)
-    if axis >= len(shape):
-        msg = "Please check input dim, it shouldn't be greater than or equal to rank."
-        raise AttributeError(msg)
-
-    selections = shape[axis]
-    res_split = _op.split(data, selections, axis)
-    ret = []
-    for i in range(selections):
-        ret.append(_op.squeeze(res_split[i], axis=[axis]))
-    return _expr.TupleWrapper(_expr.Tuple(ret), selections)
-
-
-def rnn_cell(
-    input_seqs, hidden_state, w_inp, w_hid, b_inp=None, b_hid=None, backwards=False, act=_op.tanh
-):
-    """
-    Common implementation of RNN cell for all frontends of TVM
-
-    Parameters
-    ----------
-    input_seqs : List[relay.Expr]
-        The sequence of input tensors
-        Input tensor should be 2d while issue #8412 is not resolved
-        Shape = (batch, feature_size)
-    hidden_state : relay.Expr
-        Hidden state. shape = (batch_size, hidden_size)
-    w_inp, w_hid: relay.Expr
-        weight matrices. shape = (hidden_size, feature_size), (hidden_size, feature_size)
-    b_inp, b_hid : relay.Expr
-        bias matrices. The same order of internal parts as for weights. shape = (1 * hidden_size)
-    backwards : bool
-        Flag for reverse pass of RNN
-    act : relay.op
-        activation function. It is tanh by default.
-
-    Returns
-    -------
-    result : List[relay.Expr], relay.Expr, relay.Expr
-        The sequence of computed result, final hidden and cell state
-    """
-    outputs_list = []
-    for x_t in input_seqs if not backwards else reversed(input_seqs):
-        xwt = _op.nn.dense(x_t, w_inp)
-        hwt = _op.nn.dense(hidden_state, w_hid)
-        if b_inp is not None and b_hid is not None:
-            xwt += b_inp
-            hwt += b_hid
-        hidden_state = act(xwt + hwt)
-        outputs_list.append(hidden_state)  # [seq_num, (batch, hidden_size)]
-    return outputs_list, hidden_state
-
-
-def gru_cell(
-    input_seqs,
-    hidden_state,
-    w_inp,
-    w_hid,
-    b_inp=None,
-    b_hid=None,
-    rz_act=_op.sigmoid,
-    n_act=_op.tanh,
-    backwards=False,
-    linear_before_reset=True,
-    sequence_lens=None,
-):
-    """
-    Common implementation of GRU cell for all frontends of TVM
-    TODO(vvchernov): currently it is used by pytorch and ONNX. Extend for other frontends
-
-    Parameters
-    ----------
-    input_seqs : List[relay.Expr]
-        The sequence of input tensors
-        Input tensor should be 2d while issue #8412 is not resolved
-        Shape = (batch, feature_size)
-    hidden_state : relay.Expr
-        Hidden state. shape = (batch_size, hidden_size)
-    w_inp, w_hid : relay.Expr
-        weight matrices. wi shape = (3 * hidden_size, feature_size)
-        wh shape = (3 * hidden_size, hidden_size)
-        NOTE: wi = (w_ir|w_iz|w_in) for reset, update and new gates.
-        The order is important for correct GRU calculation!
-    b_inp, b_hid : relay.Expr
-        bias matrices. The same order of internal parts as for weights. shape = (3 * hidden_size)
-    r_act : relay.op
-        activation function for reset gate. it is sigmoid by default
-    z_act : relay.op
-        activation function for update gate. it is sigmoid by default
-    n_act : relay.op
-        activation function for new gate. it is tanh by default
-    backwards : bool
-        Flag for reverse pass of GRU
-    linear_before_reset : bool
-        Flag for applying the linear transformation before multiplying by the output of the reset
-        gate.
-    sequence_lens : relay.op
-        Tensor specifying lengths of the sequences in a batch.
-        Shape = (batch_size)
-    Returns
-    -------
-    result : List[relay.Expr], relay.Expr, relay.Expr
-        The sequence of computed result, final hidden and cell state
-    """
-
-    outputs_list = []
-
-    seq_len = len(input_seqs)
-    input_dtype = infer_type(input_seqs[0]).checked_type.dtype
-
-    if sequence_lens is not None:
-        shape = infer_shape(sequence_lens)
-        dtype = infer_type(sequence_lens).checked_type.dtype
-
-        arange = _op.arange(_op.const(0), _op.const(seq_len), dtype=dtype)
-        arange = _op.expand_dims(arange, 1)
-        sequence_lens = _op.broadcast_to(sequence_lens, [seq_len, shape[0]])
-
-        # cast to data dtype
-        mask = _op.less(arange, sequence_lens)
-        mask = _op.cast(mask, dtype=input_dtype)
-        mask = _op.expand_dims(mask, 2)
-        mask_seqs = unbind(mask)
-
-        res_mask = _op.greater_equal(arange, sequence_lens)
-        res_mask = _op.cast(res_mask, dtype=input_dtype)
-        res_mask = _op.expand_dims(res_mask, 2)
-        res_mask_seqs = unbind(res_mask)
-
-        if backwards:
-            # need a mask to keep intial_h_B correct
-            initial_h = hidden_state
-            initial_h_mask = _op.equal(arange, sequence_lens)
-            initial_h_mask = _op.cast(initial_h_mask, dtype=input_dtype)
-            initial_h_mask = _op.expand_dims(initial_h_mask, 2)
-            initial_h_mask_seqs = unbind(initial_h_mask)
-
-    output = _op.zeros(infer_shape(hidden_state), input_dtype)
-    for i in range(seq_len) if not backwards else reversed(range(seq_len)):
-        x_t = input_seqs[i]
-        xwt = _op.nn.dense(x_t, w_inp)
-        if linear_before_reset:
-            hwt = _op.nn.dense(hidden_state, w_hid)
-            if b_inp is not None and b_hid is not None:
-                xwt += b_inp
-                hwt += b_hid
-            i_r, i_z, i_n = _op.split(xwt, 3, axis=-1)
-            h_r, h_z, h_n = _op.split(hwt, 3, axis=-1)
-            r_gate = rz_act(i_r + h_r)
-            z_gate = rz_act(i_z + h_z)
-            n_gate = n_act(i_n + r_gate * h_n)
-        else:
-            i_r, i_z, i_n = _op.split(xwt, 3, axis=1)
-            w_hr, w_hz, w_hn = _op.split(w_hid, 3, axis=0)
-            r_gate = i_r + _op.nn.dense(hidden_state, w_hr)
-            z_gate = i_z + _op.nn.dense(hidden_state, w_hz)
-            if b_inp is not None and b_hid is not None:
-                b_ir, b_iz, b_in = _op.split(b_inp, 3, axis=-1)
-                b_hr, b_hz, b_hn = _op.split(b_hid, 3, axis=-1)
-                r_gate += b_ir + b_hr
-                r_gate = rz_act(r_gate)
-                z_gate += b_iz + b_hz
-                i_n += b_in
-                h_n = _op.nn.dense((r_gate * hidden_state), w_hn) + b_hn
-            else:
-                r_gate = rz_act(r_gate)
-                h_n = _op.nn.dense((r_gate * hidden_state), w_hn)
-            z_gate = rz_act(z_gate)
-            n_gate = n_act(i_n + h_n)
-
-        hidden_state = (hidden_state - n_gate) * z_gate + n_gate
-
-        if sequence_lens is not None:
-            hidden_state = hidden_state * mask_seqs[i]
-
-        outputs_list.append(hidden_state)  # [seq_num, (batch, hidden_size)]
-
-        if sequence_lens is not None:
-            output = output * res_mask_seqs[i] + hidden_state
-        else:
-            output = hidden_state
-
-        # make sure initial_h_B correct
-        if backwards and sequence_lens is not None:
-            hidden_state = hidden_state + initial_h * initial_h_mask_seqs[i]
-
-    return outputs_list, output
-
-
-def lstm_cell(
-    input_seqs,
-    hidden_state,
-    cell_state,
-    w_inp,
-    w_hid,
-    b_inp=None,
-    b_hid=None,
-    proj=None,
-    p_i=None,
-    p_f=None,
-    p_o=None,
-    f_act=_op.sigmoid,
-    g_act=_op.tanh,
-    h_act=_op.tanh,
-    backwards=False,
-):
-    """
-    Common implementation of LSTM cell for all frontends of TVM
-    TODO (vvchernov): currently it is used by onnx and pytorch. Extend for other frontends
-
-    Parameters
-    ----------
-    input_seqs : List[relay.Expr]
-        The sequence of input tensors
-        Input tensor should be 2d while issue #8412 is not resolved
-        Shape = (batch, feature_size)
-    hidden_state : relay.Expr
-        Hidden state. shape = (batch, hidden_size)
-    cell_state : relay.Expr
-        Cell state. shape = (batch, hidden_size)
-    w_inp, w_hid : relay.Expr
-        weight matrices. wi shape = (4 * hidden_size, feature_size)
-        wh shape = (4 * hidden_size, hidden_size or proj_size)
-        NOTE: wi = (w_ii|w_if|w_ig|w_io) for input, forget, cell and output gates.
-        The order is important for correct LSTM calculation!
-    b_inp, b_hid : relay.Expr
-        bias matrices. The same order of internal parts as for weights. shape = (4 * hidden_size)
-    proj : relay.Expr
-        projection matrix. shape = (proj_size, hidden_size)
-    p_i, p_f, p_o : relay.Expr
-        peephole LSTM matrices. shape = (batch, hidden_size)
-    f_act, g_act, h_act : relay.op
-        activation functions
-    backwards : bool
-        Flag for reverse pass of LSTM
-
-    Returns
-    -------
-    result : List[relay.Expr], relay.Expr, relay.Expr
-        The sequence of computed result, final hidden and cell state
-    """
-
-    outputs_list = []
-    for x_t in input_seqs if not backwards else reversed(input_seqs):
-        # x_t shape = (batch, feature size), step shape = (batch, feature size + hidden_size)
-        step = _op.concatenate([x_t, hidden_state], axis=1)
-        cat_w = _op.concatenate([w_inp, w_hid], axis=1)
-        # Instead of nn.dense(x_t, w_inp) + nn.dense(hidden_state, w_hid)
-        # nn.dense(step, cat_w) is used
-        # gates shape = (batch, 4 * hidden_size)
-        gates = _op.nn.dense(step, cat_w)
-        # Add biases
-        if b_inp is not None:
-            gates += b_inp
-        if b_hid is not None:
-            gates += b_hid
-        # any gate shape = (batch, hidden_size)
-        inp_gate, fgt_gate, cell_gate, otp_gate = _op.split(gates, 4, axis=-1)
-
-        if p_i is not None and p_f is not None:
-            inp_gate = f_act(inp_gate + p_i * cell_state)
-            fgt_gate = f_act(fgt_gate + p_f * cell_state)
-        else:
-            inp_gate = f_act(inp_gate)
-            fgt_gate = f_act(fgt_gate)
-
-        cell_gate = g_act(cell_gate)
-        cell_state = fgt_gate * cell_state + inp_gate * cell_gate
-        if p_o is not None:
-            otp_gate = f_act(otp_gate + p_o * cell_state)
-        else:
-            otp_gate = f_act(otp_gate)
-
-        hidden_state = otp_gate * h_act(cell_state)
-
-        if proj is not None:
-            hidden_state = _op.nn.dense(hidden_state, proj)
-
-        outputs_list.append(hidden_state)  # [seq_num, (batch, hidden_size)]
-
-    return outputs_list, hidden_state, cell_state
-
-
-def autopad(
-    data,
-    strides,
-    kernel_shape,
-    dilations=(1, 1),
-    pad_type="constant",
-    deconv=False,
-    mode="SAME_UPPER",
-    pad_value=0.0,
-):
-    """
-    Perform autopadding with dynamic input shapes
-    """
-    # get attributes as constants
-    strides = _op.const(np.array(strides), dtype="int64")
-    dilated_kernel_shape = _op.const(
-        np.array(
-            [(kernel - 1) * dilation + 1 for kernel, dilation in zip(kernel_shape, dilations)]
-        ),
-        dtype="int64",
-    )
-    # get input shape
-    ndim = len(infer_shape(data))
-    shape = _op.strided_slice(shape_of(data, dtype="int64"), [2], [ndim])
-
-    # set up integer constants
-    zero = _op.const(0, dtype="int64")
-    one = _op.const(1, dtype="int64")
-    two = _op.const(2, dtype="int64")
-
-    # Calculate total padding
-    mod = _op.mod(shape, strides)
-
-    left = _op.maximum(dilated_kernel_shape - strides, zero)
-    right = _op.maximum(dilated_kernel_shape - mod, zero)
-
-    total_pad = _op.where(_op.equal(mod, zero), left, right)
-    if deconv:
-        total_pad = _op.const(np.array(kernel_shape), dtype="int64") - one - total_pad
-
-    # split total padding into before and after
-    pad_before = _op.floor_divide(total_pad, two)
-    pad_after = total_pad - pad_before
-
-    # combine
-    if "LOWER" in mode:
-        pad = _op.concatenate(
-            [_op.reshape(pad_after, [-1, 1]), _op.reshape(pad_before, [-1, 1])], axis=1
-        )
-    else:
-        pad = _op.concatenate(
-            [_op.reshape(pad_before, [-1, 1]), _op.reshape(pad_after, [-1, 1])], axis=1
-        )
-
-    # pad N and C with zeros
-    pad = _op.concatenate([_op.const(np.zeros([2, 2], dtype="int64"), dtype="int64"), pad], axis=0)
-
-    if isinstance(pad_value, (float, int)):
-        pad_value = _op.const(pad_value)
-
-    return _op.nn.pad(data, fold_constant(pad), pad_value, pad_type)
-
-
-def ensure_scalar_shape(x):
-    """
-    Assume that `x` is a tensor with one element (regardless of tensor rank).
-    Return a version of that tensor with rank 0.
-    """
-    x_shape = infer_shape(x)
-    x_rank = len(x_shape)
-
-    if x_rank == 0:
-        return x
-
-    num_elem = np.prod(x_shape)
-    assert num_elem == 1, f"Cannot squeeze tensor shape {x_shape} to scalar form."
-
-    return _op.squeeze(x)
-
-
-def try_resolve_var_to_const(x, graph_params):
-    """
-    Try to resolve the value of tensor `x` to a specific value.
-    If successful, return a Const op with that value.
-    If unsuccessful, simply return `x`.
-    """
-    if isinstance(x, _expr.Var) and x.name_hint in graph_params:
-        value = graph_params[x.name_hint].numpy()
-        dtype = infer_type(x).checked_type.dtype
-        return _op.const(value, dtype)
-
-    return x
-
-
-class _SpanFiller(ExprMutator):
-    """SpanFiller"""
-
-    def __init__(self, span):
-        ExprMutator.__init__(self)
-        if isinstance(span, tvm.relay.Span):
-            self._span = span
-        elif isinstance(span, str):
-            self._span = tvm.relay.Span(tvm.relay.SourceName(span), 0, 0, 0, 0)
-        elif isinstance(span, bytes):
-            self._span = tvm.relay.Span(tvm.relay.SourceName(span.decode("utf-8")), 0, 0, 0, 0)
-        else:
-            assert False, f"unsupported span type: {type(span)}"
-
-    def visit(self, expr):
-        if hasattr(expr, "span") and expr.span:
-            return expr
-
-        return super().visit(expr)
-
-    def visit_function(self, fn):
-        new_params = [self.visit(x) for x in fn.params]
-        new_body = self.visit(fn.body)
-        return _function.FunctionWithFields(
-            fn, list(new_params), new_body, fn.ret_type, fn.type_params, fn.attrs, None, self._span
-        )
-
-    def visit_let(self, let):
-        new_variable = self.visit(let.var)
-        new_value = self.visit(let.value)
-        new_body = self.visit(let.body)
-        return _expr.LetWithFields(let, new_variable, new_value, new_body, None, self._span)
-
-    def visit_call(self, call):
-        new_args = [self.visit(arg) for arg in call.args]
-        # call.op might be RelayExpr or Op type
-        # ExprMutator will return directly if subject belongs to Op type
-        new_op = self.visit(call.op)
-        return _expr.CallWithFields(
-            call, new_op, new_args, call.attrs, call.type_args, None, self._span
-        )
-
-    def visit_var(self, var):
-        return _expr.VarWithFields(var, var.vid, var.type_annotation, None, self._span)
-
-    def visit_if(self, ite):
-        return _expr.IfWithFields(
-            ite,
-            self.visit(ite.cond),
-            self.visit(ite.true_branch),
-            self.visit(ite.false_branch),
-            None,
-            self._span,
-        )
-
-    def visit_tuple(self, tup):
-        return _expr.TupleWithFields(
-            tup, [self.visit(field) for field in tup.fields], None, self._span
-        )
-
-    def visit_tuple_getitem(self, op):
-        return _expr.TupleGetItemWithFields(
-            op, self.visit(op.tuple_value), op.index, None, self._span
-        )
-
-    def visit_constant(self, const):
-        return _expr.ConstantWithFields(const, const.data, None, self._span)
-
-    # TODO: Frontend model translation could not use following relay expressions so far,
-    #       enable them when new models/impls leverage these kinds of relay expressions.
-    def visit_ref_create(self, _):
-        raise NotImplementedError()
-
-    def visit_ref_write(self, _):
-        raise NotImplementedError()
-
-    def visit_ref_read(self, _):
-        raise NotImplementedError()
-
-    def visit_match(self, _):
-        raise NotImplementedError()
-
-    def fill(self, sym):
-        """Fill span to sym when it is an expr, or return it without change
-
-        Parameters
-        ----------
-        sym :
-            A symbol which is generated from the conversion of a frontend operator.
-
-        Returns
-        -------
-        sym:
-            A expr with span-filled or the original sym.
-        """
-        if isinstance(sym, _expr.TupleWrapper):
-            return _expr.TupleWrapper(self.visit(sym.tuple_value), sym.size)
-        elif isinstance(sym, _expr.RelayExpr):
-            return self.visit(sym)
-        elif isinstance(sym, list):
-            assert all(
-                isinstance(expr, _expr.RelayExpr) for expr in sym
-            ), f"unexpected relay expressions in {sym}"
-            return [self.visit(expr) for expr in sym]
-        elif isinstance(sym, tuple):
-            # some op conversion may return dummy elements
-            # e.g. op in frontend/pytorch.py: min_max_common
-            assert all(
-                isinstance(expr, (_expr.RelayExpr, type(None))) for expr in sym
-            ), f"unexpected relay expressions in {sym}"
-            return tuple(self.visit(expr) if expr else None for expr in sym)
-        elif isinstance(sym, (float, int)):
-            return sym
-        elif isinstance(sym, np.ndarray):
-            return sym
-        elif not sym:
-            # some op conversion may return None
-            # e.g. op in frontend/pytorch.py: prim::device
-            return sym
-
-        raise RuntimeError(f"unsupported type {type(sym)}")
-
-
-def set_span(sym, span):
-    """
-    Recursively tag the span to the symbol. Stop when it encounters a span-tagged expr. Disabled
-    when setting the "relay.frontend.fill_span" as False to the config of PassContext
-
-    Parameters
-    ----------
-    sym :
-        A symbol is generated from the conversion of a frontend operator. Raise an error when the
-        type of the symbol is not supported.
-
-    span : String, Span, or bytes
-        The source information of the corresponding symbol.
-
-    Returns
-    -------
-    result :
-        The symbol tagged with span.
-
-    Examples
-    --------
-    .. code-block:: python
-
-      x = set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var")
-      w = relay.const(np.ones([64, 64, 3, 3]), dtype="int64")
-      y = set_span(
-          relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1)), "conv2d"
-      )
-      print(relay.Function([x], y))
-
-      #fn (%x: Tensor[(1, 64, 56, 56), float32] /* span=x_var:0:0 */) {
-      #  nn.conv2d(%x, meta[relay.Constant][0] /* span=conv2d:0:0 */, ...) /* span=conv2d:0:0 */
-      #}
-    """
-
-    if tvm.transform.PassContext.current().config.get("relay.frontend.fill_span", True):
-        return _SpanFiller(span).fill(sym)
-    return sym
diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py
deleted file mode 100644
index 9c525182a08c..000000000000
--- a/python/tvm/relay/frontend/coreml.py
+++ /dev/null
@@ -1,606 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, unused-argument, unused-variable, no-else-return
-# pylint: disable=inconsistent-return-statements, import-outside-toplevel
-"""CoreML frontend."""
-import math
-import numpy as np
-import tvm
-from tvm.ir import IRModule
-
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from ... import nd as _nd
-from ..._ffi import base as _base
-from .common import ExprTable
-from .common import infer_shape as _infer_shape
-
-__all__ = ["from_coreml"]
-
-
-def _NeuralNetworkImageScaler(op, inexpr, etab):
-    # TODO: we need to support more colorspace, such as rgb.
-    # this changes the symbol
-    biases = np.array([op.blueBias, op.greenBias, op.redBias]).reshape([3, 1, 1])
-    bias = etab.new_const(biases)
-    ret = _op.multiply(inexpr, _expr.const(op.channelScale, dtype="float32"))
-    ret = _op.add(ret, bias)
-    return ret
-
-
-def _NeuralNetworkMeanImage(op, inexpr, etab):
-    # this changes the symbol
-    ret = _op.subtract(inexpr, _expr.const(op.meanImage, dtype="float32"))
-    return ret
-
-
-def _ConvolutionLayerParams(op, inexpr, etab):
-    """Convolution layer params."""
-    if op.isDeconvolution:
-        weights = etab.new_const(
-            np.array(list(op.weights.floatValue)).reshape(
-                tuple([op.kernelChannels, op.outputChannels] + list(op.kernelSize))
-            )
-        )
-    else:
-        weights = etab.new_const(
-            np.array(list(op.weights.floatValue)).reshape(
-                tuple([op.outputChannels, op.kernelChannels] + list(op.kernelSize))
-            )
-        )
-    dilation = list(op.dilationFactor)
-    if not dilation:
-        dilation = [1, 1]
-    N, C, H, W = _infer_shape(inexpr)
-    params = {
-        "channels": op.outputChannels,
-        "kernel_size": list(op.kernelSize),
-        "strides": list(op.stride),
-        "dilation": dilation,
-        "groups": op.nGroups,
-    }
-
-    if op.WhichOneof("ConvolutionPaddingType") == "valid":
-        valid = op.valid
-        if valid.paddingAmounts.borderAmounts:
-            assert len(valid.paddingAmounts.borderAmounts) == 2
-            pad_t = valid.paddingAmounts.borderAmounts[0].startEdgeSize
-            pad_l = valid.paddingAmounts.borderAmounts[1].startEdgeSize
-            pad_b = valid.paddingAmounts.borderAmounts[0].endEdgeSize
-            pad_r = valid.paddingAmounts.borderAmounts[1].endEdgeSize
-            if not all(v == 0 for v in (pad_t, pad_l, pad_b, pad_r)):
-                params["padding"] = (pad_t, pad_l, pad_b, pad_r)
-    elif op.WhichOneof("ConvolutionPaddingType") == "same":
-        assert op.same.asymmetryMode == 0, (
-            "Only support BOTTOM_RIGHT_HEAVY mode, " "which is used by tf/caffe and so on"
-        )
-        kernel = params["kernel_size"]
-        strides = params["strides"]
-        pad_t, pad_b = get_pad_value(H, kernel[0], strides[0])
-        pad_l, pad_r = get_pad_value(W, kernel[1], strides[1])
-        params["padding"] = (pad_t, pad_l, pad_b, pad_r)
-    else:
-        raise NotImplementedError("Valid/Same convolution padding implemented")
-
-    if op.isDeconvolution:
-        ret = _op.nn.conv2d_transpose(data=inexpr, weight=weights, **params)
-    else:
-        ret = _op.nn.conv2d(data=inexpr, weight=weights, **params)
-    if op.hasBias:
-        biases = etab.new_const(list(op.bias.floatValue))
-        ret = _op.nn.bias_add(ret, biases)
-
-    return ret
-
-
-def _BatchnormLayerParams(op, inexpr, etab):
-    """Get layer of batchnorm parameter"""
-    # this changes the symbol
-    if op.instanceNormalization:
-        raise tvm.error.OpNotImplemented(
-            'Operator "instance normalization" is not supported in frontend CoreML.'
-        )
-    params = {
-        "gamma": etab.new_const(list(op.gamma.floatValue)),
-        "beta": etab.new_const(list(op.beta.floatValue)),
-        "moving_mean": etab.new_const(list(op.mean.floatValue)),
-        "moving_var": etab.new_const(list(op.variance.floatValue)),
-        "epsilon": op.epsilon,
-    }
-    result, moving_mean, moving_var = _op.nn.batch_norm(data=inexpr, **params)
-    return result
-
-
-def _ActivationParams(op, inexpr, etab):
-    """Get activation parameters"""
-    whichActivation = op.WhichOneof("NonlinearityType")
-    par = getattr(op, whichActivation)
-    if whichActivation == "linear":
-        alpha = _expr.const(par.alpha, dtype="float32")
-        beta = _expr.const(par.beta, dtype="float32")
-        return _op.add(_op.multiply(inexpr, alpha), beta)
-    if whichActivation == "ReLU":
-        return _op.nn.relu(inexpr)
-    if whichActivation == "leakyReLU":
-        return _op.nn.leaky_relu(inexpr, alpha=par.alpha)
-    elif whichActivation == "thresholdedReLU":
-        alpha_tensor = _op.full_like(inexpr, fill_value=_expr.const(par.alpha, dtype="float32"))
-        return _op.multiply(inexpr, _op.greater(inexpr, alpha_tensor).as_type("float32"))
-    if whichActivation == "PReLU":
-        return _op.nn.prelu(inexpr, alpha=_expr.const(par.alpha, dtype="float32"))
-    if whichActivation == "tanh":
-        return _op.tanh(inexpr)
-    if whichActivation == "scaledTanh":
-        alpha = _expr.const(par.alpha, dtype="float32")
-        beta = _expr.const(par.beta, dtype="float32")
-        return _op.multiply(_op.tanh(_op.multiply(inexpr, beta)), alpha)
-    if whichActivation == "sigmoid":
-        return _op.sigmoid(inexpr)
-    if whichActivation == "sigmoidHard":
-        alpha = _expr.const(par.alpha, dtype="float32")
-        beta = _expr.const(par.beta, dtype="float32")
-        transformX = (alpha * inexpr) + beta
-        return _op.clip(transformX, a_min=0.0, a_max=1.0)
-    if whichActivation == "ELU":
-        return _op.multiply(
-            _op.add(_op.exp(inexpr), _expr.const(-1, dtype="float32")),
-            _expr.const(par.alpha, dtype="float32"),
-        )
-    if whichActivation == "softsign":
-        return inexpr / (
-            _expr.const(1, dtype="float32")
-            + (op.nn.relu(inexpr) + _op.nn.relu(_op.negative(inexpr)))
-        )
-    if whichActivation == "softplus":
-        return _op.log(_op.add(_op.exp(inexpr), _expr.const(1, dtype="float32")))
-    if whichActivation == "parametricSoftplus":
-        alpha = list(par.alpha.floatValue)
-        beta = list(par.alpha.floatValue)
-        if len(alpha) == 1:
-            return _op.multiply(
-                _op.log(_op.add(_op.exp(inexpr), _expr.const(beta[0], dtype="float32"))),
-                _expr.const(alpha[0], dtype="float32"),
-            )
-        alpha = np.array(alpha).reshape((len(alpha), 1, 1))
-        beta = np.array(beta).reshape((len(beta), 1, 1))
-        alpha_expr = etab.new_const(alpha)
-        beta_expr = etab.new_const(beta)
-        return _op.multiply(_op.log(_op.add(_op.exp(inexpr), beta_expr)), alpha_expr)
-    raise tvm.error.OpNotImplemented(
-        f"Operator {whichActivation} is not supported in frontend CoreML."
-    )
-
-
-def _ScaleLayerParams(op, inexpr, etab):
-    """Scale layer params."""
-    scale = etab.new_const(
-        np.array(list(op.scale.floatValue)).reshape(tuple(list(op.shapeScale) + [1, 1]))
-    )
-    ret = _op.multiply(inexpr, scale)
-    if op.hasBias:
-        bias = etab.new_const(
-            np.array(list(op.bias.floatValue)).reshape(tuple(list(op.shapeBias) + [1, 1]))
-        )
-        ret = _op.add(ret, bias)
-    return ret
-
-
-def _PoolingLayerParams(op, inexpr, etab):
-    """get pooling parameters"""
-    if op.globalPooling:
-        if op.type == 0:
-            return _op.nn.global_max_pool2d(inexpr)
-        if op.type == 1:
-            return _op.nn.global_avg_pool2d(inexpr)
-        raise tvm.error.OpNotImplemented(
-            "Only Max and Average Pooling are supported in frontend CoreML."
-        )
-
-    params = {"pool_size": list(op.kernelSize), "strides": list(op.stride)}
-
-    if op.WhichOneof("PoolingPaddingType") == "valid":
-        valid = op.valid
-        if valid.paddingAmounts.borderAmounts:
-            assert len(valid.paddingAmounts.borderAmounts) == 2
-            pad_t = valid.paddingAmounts.borderAmounts[0].startEdgeSize
-            pad_l = valid.paddingAmounts.borderAmounts[1].startEdgeSize
-            pad_b = valid.paddingAmounts.borderAmounts[0].endEdgeSize
-            pad_r = valid.paddingAmounts.borderAmounts[1].endEdgeSize
-            if not all(v == 0 for v in (pad_t, pad_l, pad_b, pad_r)):
-                params["padding"] = [pad_t, pad_l, pad_b, pad_r]
-    elif op.WhichOneof("PoolingPaddingType") == "includeLastPixel":
-        # I don't know if this is correct
-        valid = op.includeLastPixel
-        padding = list(valid.paddingAmounts)
-        params["padding"] = padding
-        params["ceil_mode"] = True
-    else:
-        op_name = op.WhichOneof("PoolingPaddingType")
-        msg = f"PoolingPaddingType {op_name} is not supported in operator Pooling."
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-    if op.type == 0:
-        return _op.nn.max_pool2d(inexpr, **params)
-    if op.type == 1:
-        return _op.nn.avg_pool2d(inexpr, **params)
-    raise tvm.error.OpNotImplemented("Only Max and Average Pooling are supported in CoreML.")
-
-
-def _SoftmaxLayerParams(op, inexpr, etab):
-    return _op.nn.softmax(_op.nn.batch_flatten(inexpr))
-
-
-def _InnerProductLayerParams(op, inexpr, etab):
-    weights = etab.new_const(
-        np.array(op.weights.floatValue).reshape((op.outputChannels, op.inputChannels))
-    )
-    out = _op.nn.dense(data=inexpr, weight=weights, units=op.outputChannels)
-    if op.hasBias:
-        bias = etab.new_const(np.array(op.bias.floatValue))
-        out = _op.nn.bias_add(out, bias)
-    return out
-
-
-def _AddLayerParams(op, inexpr, etab):
-    if not isinstance(inexpr, list):
-        inexpr = [inexpr]
-    ret = inexpr[0]
-    for i in range(1, len(inexpr)):
-        ret = _op.add(ret, inexpr[i])
-    if op.alpha > 0:
-        ret = _op.add(ret, _expr.const(op.alpha, dtype="float32"))
-    return ret
-
-
-def _MultiplyLayerParams(op, inexpr, etab):
-    if not isinstance(inexpr, list):
-        inexpr = [inexpr]
-    ret = inexpr[0]
-    for i in range(1, len(inexpr)):
-        ret = _op.multiply(ret, inexpr[i])
-    if op.alpha != 1:
-        ret = _op.multiply(ret, _expr.const(op.alpha, dtype="float32"))
-    return ret
-
-
-def _ConcatLayerParams(op, inexpr, etab):
-    if not isinstance(inexpr, list):
-        inexpr = [inexpr]
-    if op.sequenceConcat:
-        raise tvm.error.OpNotImplemented(
-            "Operator Sequence Concat is not supported in frontend CoreML."
-        )
-    ret = _op.concatenate(inexpr, axis=1)
-    return ret
-
-
-def _FlattenLayerParams(op, inexpr, etab):
-    if op.mode == 1:
-        inexpr = _op.transpose(_op.reshape(inexpr, newshape=(0, 0, -1)), axes=(0, 2, 1))
-    return _op.nn.batch_flatten(inexpr)
-
-
-def _PaddingLayerParams(op, inexpr, etab):
-    """Padding layer params."""
-    if op.WhichOneof("PaddingType") == "constant":
-        constant = op.constant
-        if constant.value != 0:
-            raise tvm.error.OpAttributeUnImplemented(
-                f"{constant.value} is not supported in operator Padding."
-            )
-        pad_t = op.paddingAmounts.borderAmounts[0].startEdgeSize
-        pad_l = op.paddingAmounts.borderAmounts[1].startEdgeSize
-        pad_b = op.paddingAmounts.borderAmounts[0].endEdgeSize
-        pad_r = op.paddingAmounts.borderAmounts[1].endEdgeSize
-        return _op.nn.pad(data=inexpr, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
-    raise tvm.error.OpNotImplemented("Non-constant padding is not supported in frontend CoreML.")
-
-
-def _PermuteLayerParams(op, inexpr, etab):
-    axes = tuple(op.axis)
-    return _op.transpose(inexpr, axes=axes)
-
-
-def _UpsampleLayerParams(op, inexpr, etab):
-    if op.scalingFactor[0] != op.scalingFactor[1]:
-        raise tvm.error.OpAttributeUnimplemented("Upsample height and width must be equal.")
-    interpolationMode = "nearest_neighbor" if op.mode == 0 else "bilinear"
-    return _op.nn.upsampling(
-        inexpr, scale_h=op.scalingFactor[0], scale_w=op.scalingFactor[1], method=interpolationMode
-    )
-
-
-def _L2NormalizeLayerParams(op, inexpr, etab):
-    return _op.nn.l2_normalize(inexpr, eps=op.epsilon, axis=[1])
-
-
-def _LRNLayerParams(op, inexpr, etab):
-    par = {}
-    par["size"] = op.localSize
-    par["bias"] = op.k
-    par["alpha"] = op.alpha
-    par["beta"] = op.beta
-    par["axis"] = 1  # default layout is nchw
-    return _op.nn.lrn(data=inexpr, **par)
-
-
-def _AverageLayerParams(op, inexpr, etab):
-    if not isinstance(inexpr, list) or len(inexpr) < 2:
-        raise ValueError("Expect minimum 2 inputs")
-    count = len(inexpr)
-    _sum = inexpr[0]
-    for i in range(1, count):
-        _sum = _op.add(_sum, inexpr[i])
-    return _sum / _expr.const(count, dtype="float32")
-
-
-def _MaxLayerParams(op, inexpr, etab):
-    if not isinstance(inexpr, list) or len(inexpr) < 2:
-        raise ValueError("Expect minimum 2 inputs")
-    _max = inexpr[0]
-    for i in range(1, len(inexpr)):
-        _max = _op.maximum(_max, inexpr[i])
-    return _max
-
-
-def _MinLayerParams(op, inexpr, etab):
-    if not isinstance(inexpr, list) or len(inexpr) < 2:
-        raise ValueError("Expect minimum 2 inputs")
-    _min = inexpr[0]
-    for i in range(1, len(inexpr)):
-        _min = _op.minimum(_min, inexpr[i])
-    return _min
-
-
-def _UnaryFunctionLayerParams(op, inexpr, etab):
-    op_type = op.type
-    if op_type == op.SQRT:
-        return _op.sqrt(inexpr)
-    elif op_type == op.RSQRT:
-        epsilon = _expr.const(op.epsilon)
-        return _op.rsqrt(inexpr + epsilon)
-    elif op_type == op.INVERSE:
-        epsilon = _expr.const(op.epsilon)
-        return _expr.const(1.0) / (inexpr + epsilon)
-    elif op_type == op.POWER:
-        alpha = _expr.const(op.alpha)
-        return _op.power(inexpr, alpha)
-    elif op_type == op.EXP:
-        return _op.exp(inexpr)
-    elif op_type == op.LOG:
-        return _op.log(inexpr)
-    elif op_type == op.ABS:
-        return _op.abs(inexpr)
-    elif op_type == op.THRESHOLD:
-        alpha = _expr.const(op.alpha)
-        return _op.maximum(inexpr, alpha)
-    else:
-        msg = f"Unary Op type value {op_type} is not supported in frontend CoreML."
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-
-def _ReduceLayerParams(op, inexpr, etab):
-    axis = op.axis
-    if axis == op.CHW:
-        axis = [-3, -2, -1]
-    elif axis == op.HW:
-        axis = [-2, -1]
-    elif axis == op.C:
-        axis = -3
-    elif axis == op.H:
-        axis = -2
-    elif axis == op.W:
-        axis = -1
-    else:
-        msg = f"Reduce axis value {axis} is not supported in frontend CoreML."
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-    mode = op.mode
-    if mode == op.SUM:
-        return _op.sum(inexpr, axis=axis, keepdims=True)
-    elif mode == op.AVG:
-        return _op.mean(inexpr, axis=axis, keepdims=True)
-    elif mode == op.PROD:
-        return _op.prod(inexpr, axis=axis, keepdims=True)
-    elif mode == op.MIN:
-        return _op.min(inexpr, axis=axis, keepdims=True)
-    elif mode == op.MAX:
-        return _op.max(inexpr, axis=axis, keepdims=True)
-    elif mode == op.ARGMAX:
-        return _op.argmax(inexpr, axis=axis, keepdims=True)
-    else:
-        msg = f"Reduce mode value {mode} is not supported in frontend CoreML."
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-
-def _ReshapeLayerParams(op, inexpr, etab):
-    return _op.reshape(inexpr, op.targetShape)
-
-
-def _SplitLayerParams(op, inexpr, etab):
-    return _op.split(inexpr, op.nOutputs, axis=-3)
-
-
-_convert_map = {
-    "NeuralNetworkMeanImage": _NeuralNetworkMeanImage,
-    "NeuralNetworkImageScaler": _NeuralNetworkImageScaler,
-    "ConvolutionLayerParams": _ConvolutionLayerParams,
-    "BatchnormLayerParams": _BatchnormLayerParams,
-    "ActivationParams": _ActivationParams,
-    "ScaleLayerParams": _ScaleLayerParams,
-    "PoolingLayerParams": _PoolingLayerParams,
-    "SoftmaxLayerParams": _SoftmaxLayerParams,
-    "InnerProductLayerParams": _InnerProductLayerParams,
-    "AddLayerParams": _AddLayerParams,
-    "MultiplyLayerParams": _MultiplyLayerParams,
-    "FlattenLayerParams": _FlattenLayerParams,
-    "ConcatLayerParams": _ConcatLayerParams,
-    "PaddingLayerParams": _PaddingLayerParams,
-    "PermuteLayerParams": _PermuteLayerParams,
-    "UpsampleLayerParams": _UpsampleLayerParams,
-    "L2NormalizeLayerParams": _L2NormalizeLayerParams,
-    "LRNLayerParams": _LRNLayerParams,
-    "AverageLayerParams": _AverageLayerParams,
-    "MaxLayerParams": _MaxLayerParams,
-    "MinLayerParams": _MinLayerParams,
-    "UnaryFunctionLayerParams": _UnaryFunctionLayerParams,
-    "ReduceLayerParams": _ReduceLayerParams,
-    "ReshapeLayerParams": _ReshapeLayerParams,
-    "SplitLayerParams": _SplitLayerParams,
-}
-
-# SAME padding: https://www.tensorflow.org/api_guides/python/nn
-def get_pad_value(data, kernel, stride):
-    """Get the pad tuple of value for SAME padding
-
-    Parameters
-    ----------
-    data:
-        1D input data
-
-    kernel:
-        1D input kernel
-
-    stride:
-        1D input stride
-
-    Returns
-    -------
-        pad tuple of value
-    """
-
-    out = int(math.ceil(float(data) / float(stride)))
-    pad = max(0, (out - 1) * stride + kernel - data)
-    pad_before = pad // 2
-    pad_after = pad - pad_before
-    return pad_before, pad_after
-
-
-def coreml_op_to_relay(op, inname, outnames, etab):
-    """Convert coreml layer to a Relay expression and update the expression table.
-
-    Parameters
-    ----------
-    op: a coreml protobuf bit
-
-    inname : str or list of str
-        Name of the input Relay expression.
-
-    outnames : str or list of str
-        Name of the output Relay expression.
-
-    etab : relay.frontend.common.ExprTable
-        The global expression table to be updated.
-    """
-    classname = type(op).__name__
-    if classname not in _convert_map:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {classname} is not supported in frontend CoreML."
-        )
-    if isinstance(inname, _base.string_types):
-        insym = etab.get_expr(inname)
-    else:
-        insym = [etab.get_expr(i) for i in inname]
-    outs = _convert_map[classname](op, insym, etab)
-
-    if outnames:
-        if isinstance(outnames, _base.string_types) or len(outnames) == 1:
-            outname = outnames if isinstance(outnames, _base.string_types) else outnames[0]
-            etab.set_expr(outname, outs, force_override=True)
-        else:
-            # the number of outputs from model op and tvm relay must be same
-            assert len(outnames) == len(outs)
-            for outname, out in zip(outnames, outs):
-                etab.set_expr(outname, out, force_override=True)
-
-
-def from_coreml(model, shape=None):
-    """Convert from coreml model into Relay Function.
-
-    Parameters
-    ----------
-    model:
-        coremltools.models.MLModel of a NeuralNetworkClassifier
-
-    shape : dict of str to int list/tuple, optional
-        The input shapes
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation.
-
-    params : dict of str to tvm.nd.NDArray
-        The parameter dict to be used by Relay.
-    """
-    try:
-        import coremltools as cm
-    except ImportError:
-        raise ImportError("The coremltools package must be installed")
-
-    assert isinstance(model, cm.models.MLModel)
-    spec = model.get_spec()
-    modeltype = spec.WhichOneof("Type")
-    assert modeltype in ["neuralNetworkClassifier", "neuralNetwork", "neuralNetworkRegressor"]
-    cc = getattr(spec, modeltype)
-
-    etab = ExprTable()
-    for i in spec.description.input:
-        input_shape = list(shape[i.name]) if shape is not None and i.name in shape else None
-        etab.set_expr(i.name, _expr.var(i.name, shape=input_shape))
-
-    for pp in cc.preprocessing:
-        whichpp = pp.WhichOneof("preprocessor")
-        ppmethod = getattr(pp, whichpp)
-        if whichpp == "scaler":
-            # Be careful we maybe only preprocess one input when we have multi inputs
-            # which is stored in pp.featureName. See unit testing verify_image_scaler
-            # in test_forward.py for CoreML.
-            for i in spec.description.input:
-                # we have multi inputs
-                if len(spec.description.input) > 1:
-                    assert pp.featureName != ""
-                    if i.name == pp.featureName:
-                        coreml_op_to_relay(ppmethod, i.name, i.name, etab)
-                else:
-                    assert pp.featureName == ""
-                    coreml_op_to_relay(ppmethod, i.name, i.name, etab)
-        else:
-            coreml_op_to_relay(ppmethod, pp.featureName, pp.featureName, etab)
-
-    for l in cc.layers:
-        layertype = l.WhichOneof("layer")
-        layerop = getattr(l, layertype)
-        if len(l.input) == 1:
-            coreml_op_to_relay(layerop, l.input[0], l.output, etab)
-        else:
-            coreml_op_to_relay(layerop, list(l.input), l.output, etab)
-
-    outexpr = [
-        etab.get_expr(o.name) if o.name in etab.exprs else _expr.var(o.name)
-        for o in spec.description.output
-    ]
-
-    # check there are multiple outputs in the model and all are there in etab
-    multi_out = all([bool(o.name in etab.exprs) for o in spec.description.output])
-    outexpr = _expr.Tuple(outexpr) if multi_out else outexpr[0]
-
-    func = _function.Function(analysis.free_vars(outexpr), outexpr)
-    params = {k: _nd.array(np.array(v, dtype=np.float32)) for k, v in etab.params.items()}
-    return IRModule.from_expr(func), params
diff --git a/python/tvm/relay/frontend/darknet.py b/python/tvm/relay/frontend/darknet.py
deleted file mode 100644
index aff3df3b91c5..000000000000
--- a/python/tvm/relay/frontend/darknet.py
+++ /dev/null
@@ -1,886 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""
-DarkNet symbol frontend for Relay.
-"""
-
-from enum import Enum
-import numpy as np
-import tvm
-from tvm.ir import IRModule
-
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .common import get_relay_op, new_var
-
-__all__ = ["from_darknet"]
-
-
-def _darknet_not_support(attr, op="relay"):
-    """Raise error if any operation is not supported."""
-    raise NotImplementedError(f"{attr} is not supported in {op}.")
-
-
-def _get_params_prefix(opname, layer_num):
-    """Makes the params prefix name from opname and layer number."""
-    return str(opname).replace(".", "_") + str(layer_num)
-
-
-def _get_params_name(prefix, item):
-    """Makes the params name for the k,v pair."""
-    return prefix + "_" + item
-
-
-def _get_param_var(params, prefix, item):
-    name = _get_params_name(prefix, item)
-    if name not in params:
-        raise AttributeError(f"{name} not found in params dict.")
-    return new_var(name, shape=params[name].shape, dtype=params[name].dtype)
-
-
-def _darknet_maxpooling(inputs, params, attrs, prefix):
-    """Process the max pool 2d operation."""
-    new_attrs = {}
-    kernel = attrs.get("kernel")
-    strides = attrs.get("stride", 1)
-    pads = attrs.get("pad", 1)
-    new_attrs["pool_size"] = (kernel, kernel)
-    new_attrs["strides"] = (strides, strides)
-    new_attrs["padding"] = (pads, pads)
-    extra_pad_size = attrs.get("extra_pad_size", 0)
-    if extra_pad_size:
-        pad_width = ((0, 0), (0, 0), (0, extra_pad_size), (0, extra_pad_size))
-        inputs = [
-            get_relay_op("pad")(*inputs, pad_width=pad_width, pad_value=np.finfo(np.float32).min)
-        ]
-    return get_relay_op("max_pool2d")(*inputs, **new_attrs)
-
-
-def _darknet_avgpooling(inputs, params, attrs, prefix):
-    """Process the average pool 2d operation."""
-    new_attrs = {}
-    kernel = attrs.get("kernel")
-    strides = attrs.get("stride", 1)
-    pads = attrs.get("pad", 0)
-
-    new_attrs["pool_size"] = (kernel, kernel)
-    new_attrs["strides"] = (strides, strides)
-    new_attrs["padding"] = (pads, pads)
-    return get_relay_op("avg_pool2d")(*inputs, **new_attrs)
-
-
-def _darknet_conv2d(inputs, params, attrs, prefix):
-    """Process the convolution 2d operation."""
-    new_attrs = {}
-    kernel = attrs.get("kernel")
-    strides = attrs.get("stride", 1)
-    pads = attrs.get("pad", 0)
-
-    new_attrs["channels"] = attrs.get("num_filter")
-    new_attrs["kernel_size"] = (kernel, kernel)
-    new_attrs["strides"] = (strides, strides)
-    new_attrs["padding"] = (pads, pads)
-    new_attrs["dilation"] = attrs.get("dilate", (1, 1))
-    new_attrs["groups"] = attrs.get("num_group", 1)
-
-    weight = _get_param_var(params, prefix, "weight")
-    out = get_relay_op("conv2d")(*inputs, weight=weight, **new_attrs)
-
-    use_bias = not attrs.get("use_batchNorm", False)
-    if use_bias:
-        new_attrs = {}
-        new_attrs["axis"] = 1
-        bias = _get_param_var(params, prefix, "bias")
-        out = get_relay_op("bias_add")(out, bias=bias, **new_attrs)
-    else:
-        new_attrs = {}
-        new_attrs["epsilon"] = 0.000001
-        gamma = _get_param_var(params, prefix, "gamma")
-        beta = _get_param_var(params, prefix, "beta")
-        moving_mean = _get_param_var(params, prefix, "moving_mean")
-        moving_var = _get_param_var(params, prefix, "moving_var")
-        out = get_relay_op("batch_norm")(out, gamma, beta, moving_mean, moving_var, **new_attrs)
-
-    if "activation" in attrs:
-        new_attrs = {}
-        new_attrs["activation"] = attrs["activation"]
-        new_attrs["slope"] = 0.1
-        out = _darknet_activations(out, None, new_attrs)
-    return out
-
-
-def _darknet_shortcut(inputs, params, attrs, prefix):
-    """Process the shortcut operation."""
-    input_0 = inputs[0]
-    input_1 = inputs[1]
-
-    input_0_channel = int(attrs["out_channel"])
-    input_1_channel = int(attrs["add_out_channel"])
-    input_0_size = int(attrs["out_size"])
-    input_1_size = int(attrs["add_out_size"])
-
-    if input_0_size > input_1_size:
-        scale = int(input_0_size / input_1_size)
-        input_1 = get_relay_op("upsampling")(input_1, scale_h=scale, scale_w=scale)
-
-    elif input_0_size < input_1_size:
-        stride = int(input_1_size / input_0_size)
-        input_1 = get_relay_op("avg_pool2d")(
-            input_1, pool_size=(1, 1), strides=(stride, stride), padding=(0, 0)
-        )
-
-    if input_0_channel != input_1_channel:
-        pad_channel = input_0_channel - input_1_channel
-        input_1 = get_relay_op("pad")(
-            input_1, pad_width=((0, 0), (0, pad_channel), (0, 0), (0, 0)), pad_value=0.0
-        )
-    sym = input_0 + input_1
-    if "activation" in attrs:
-        new_attrs = {}
-        new_attrs["activation"] = attrs["activation"]
-        sym = _darknet_activations(sym, None, new_attrs)
-    return sym
-
-
-def _darknet_dense(inputs, params, attrs, prefix):
-    """Process the dense operation."""
-    new_attrs = {}
-    new_attrs["units"] = attrs.get("num_hidden")
-    data = inputs[0]
-
-    if attrs.get("use_flatten", False) is True:
-        data = get_relay_op("batch_flatten")(data)
-
-    weight = _get_param_var(params, prefix, "weight")
-    data = get_relay_op("dense")(data, weight, **new_attrs)
-
-    use_bias = attrs.get("use_bias", False)
-    if use_bias:
-        bias = _get_param_var(params, prefix, "bias")
-        data = get_relay_op("bias_add")(data, bias, axis=1)
-
-    if "use_batchNorm" in attrs:
-        new_attrs = {}
-        new_attrs["epsilon"] = 0.000001
-        gamma = _get_param_var(params, prefix, "gamma")
-        beta = _get_param_var(params, prefix, "beta")
-        moving_mean = _get_param_var(params, prefix, "moving_mean")
-        moving_var = _get_param_var(params, prefix, "moving_var")
-        data = get_relay_op("batch_norm")(data, gamma, beta, moving_mean, moving_var, **new_attrs)
-    if "activation" in attrs:
-        new_attrs = {}
-        new_attrs["activation"] = attrs["activation"]
-        data = _darknet_activations(data, None, new_attrs)
-    return data
-
-
-def _darknet_dropout(inputs, params, attrs, prefix):
-    """Process the dropout operation, its a blank operation."""
-    new_attrs = {}
-    new_attrs["rate"] = attrs.get("p", 0.5)
-    return get_relay_op("dropout")(*inputs, **new_attrs)
-
-
-def _darknet_reshape(inputs, params, attrs, prefix):
-    """Process the reshape operation."""
-    new_attrs = {}
-    new_attrs["shape"] = attrs.get("shape")
-    return get_relay_op("reshape")(*inputs, **new_attrs)
-
-
-def _darknet_upsampling(inputs, params, attrs, prefix):
-    """Process the upsampling operation."""
-    new_attrs = {}
-    new_attrs["scale_h"] = attrs.get("scale", 1)
-    new_attrs["scale_w"] = attrs.get("scale", 1)
-    return get_relay_op("upsampling")(*inputs, **new_attrs)
-
-
-def _darknet_l2normalize(inputs, params, attrs, prefix):
-    """Process the l2 normalization operation."""
-    new_attrs = {}
-    new_attrs["eps"] = attrs.get("eps", 0.0)
-    new_attrs["axis"] = [attrs.get("axis", 1)]
-    return get_relay_op("l2_normalize")(*inputs, **new_attrs)
-
-
-def _darknet_softmax_output(inputs, params, attrs, prefix):
-    """Process the softmax operation."""
-    temperature = attrs.get("temperature", 1)
-    data = inputs[0]
-    if temperature != 1:
-        data = data / _expr.const(float(temperature))
-
-    if attrs.get("use_flatten", False) is True:
-        data = get_relay_op("batch_flatten")(data)
-
-    new_attrs = {}
-    if attrs.get("multi_output", False):
-        new_attrs["axis"] = 1
-    return get_relay_op("softmax")(data, **new_attrs)
-
-
-def _darknet_route(inputs, params, attrs, prefix):
-    """Process the route operation, which is equivalent to concat."""
-    new_attrs = {"axis": attrs.get("dim", 1)}
-    return get_relay_op("concatenate")((inputs[0], inputs[1]), **new_attrs)
-
-
-def _darknet_reorg(inputs, params, attrs, prefix):
-    """Process the reorg operation."""
-    new_attrs = {}
-    if "stride" in attrs:
-        new_attrs = {"stride": attrs.get("stride", 1)}
-    return get_relay_op("yolo_reorg")(*inputs, **new_attrs)
-
-
-def _darknet_region(inputs, params, attrs, prefix):
-    """Process the region operation."""
-    num = attrs.get("n", 1)
-    classes = attrs.get("classes", 1)
-    coords = attrs.get("coords", 0)
-    background = attrs.get("background", 0)
-    softmax = attrs.get("softmax", True)
-    input_shape = attrs.get("shape")
-
-    split_size = classes + coords + 1
-    intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3])
-    data_block = get_relay_op("reshape")(inputs[0], newshape=intermediate_shape)
-    split_indices = (2, 4, 5)
-    split_res = get_relay_op("split")(data_block, indices_or_sections=split_indices, axis=2)
-    split_res0 = get_relay_op("sigmoid")(split_res[0])
-    split_res2 = split_res[2] if background else get_relay_op("sigmoid")(split_res[2])
-    split_res3 = get_relay_op("softmax")(split_res[3], axis=2) if softmax else split_res[3]
-    out = get_relay_op("concatenate")((split_res0, split_res[1], split_res2, split_res3), axis=2)
-    return get_relay_op("reshape")(out, newshape=input_shape)
-
-
-def _darknet_yolo(inputs, params, attrs, prefix):
-    """Process the yolo operation."""
-    num = attrs.get("n", 1)
-    classes = attrs.get("classes", 1)
-    input_shape = attrs.get("shape")
-    split_size = classes + 5
-    intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3])
-    data_block = get_relay_op("reshape")(inputs[0], newshape=intermediate_shape)
-    split_indices = (2, 4)
-    split_res = get_relay_op("split")(data_block, indices_or_sections=split_indices, axis=2)
-    split_res0 = get_relay_op("sigmoid")(split_res[0])
-    split_res2 = get_relay_op("sigmoid")(split_res[2])
-    out = get_relay_op("concatenate")((split_res0, split_res[1], split_res2), axis=2)
-    return get_relay_op("reshape")(out, newshape=input_shape)
-
-
-class ACTIVATION(object):
-    """Darknet ACTIVATION Class constant."""
-
-    LOGISTIC = 0
-    RELU = 1
-    RELIE = 2
-    LINEAR = 3
-    RAMP = 4
-    TANH = 5
-    PLSE = 6
-    LEAKY = 7
-    ELU = 8
-    LOGGY = 9
-    STAIR = 10
-    HARDTAN = 11
-    LHTAN = 12
-
-
-def _darknet_activations(inputs, params, attrs):
-    """Process the activation function."""
-    act = attrs.get("activation")
-    data = inputs[0] if isinstance(inputs, _expr.TupleWrapper) else inputs
-
-    def _const(val):
-        return _expr.const(val)
-
-    def _relu(data):
-        return get_relay_op("relu")(data)
-
-    def _exp(data):
-        return get_relay_op("exp")(data)
-
-    def _tanh(data):
-        return get_relay_op("tanh")(data)
-
-    def _sigmoid(data):
-        return get_relay_op("sigmoid")(data)
-
-    def _elu(data):
-        alpha = _const(-1.0)
-        return alpha * _relu(_const(1.0) - _exp(data)) + _relu(data)
-
-    def _leaky_relu(data, slope):
-        new_attrs = {}
-        new_attrs["alpha"] = slope
-        return get_relay_op("leaky_relu")(data, **new_attrs)
-
-    if ACTIVATION.LOGISTIC == act:
-        data = _sigmoid(data)
-    elif ACTIVATION.RELU == act:
-        data = _relu(data)
-    elif ACTIVATION.TANH == act:
-        data = _tanh(data)
-    elif ACTIVATION.LINEAR == act:
-        return data
-    elif ACTIVATION.LEAKY == act:
-        data = _leaky_relu(data, attrs.get("slope", 0.1))
-    elif ACTIVATION.ELU == act:
-        data = _elu(data)
-    else:
-        _darknet_not_support("act: " + attrs)
-    return data
-
-
-class LAYERTYPE(Enum):
-    """Darknet LAYERTYPE Class constant."""
-
-    CONVOLUTIONAL = 0
-    DECONVOLUTIONAL = 1
-    CONNECTED = 2
-    MAXPOOL = 3
-    SOFTMAX = 4
-    DETECTION = 5
-    DROPOUT = 6
-    CROP = 7
-    ROUTE = 8
-    COST = 9
-    NORMALIZATION = 10
-    AVGPOOL = 11
-    LOCAL = 12
-    SHORTCUT = 13
-    ACTIVE = 14
-    RNN = 15
-    GRU = 16
-    LSTM = 17
-    CRNN = 18
-    BATCHNORM = 19
-    NETWORK = 20
-    XNOR = 21
-    REGION = 22
-    YOLO = 23
-    REORG = 24
-    UPSAMPLE = 25
-    LOGXENT = 26
-    L2NORM = 27
-    BLANK = 28
-
-
-_DARKNET_CONVERT_MAP = {
-    LAYERTYPE.CONVOLUTIONAL: _darknet_conv2d,
-    LAYERTYPE.CONNECTED: _darknet_dense,
-    LAYERTYPE.MAXPOOL: _darknet_maxpooling,
-    LAYERTYPE.SOFTMAX: _darknet_softmax_output,
-    LAYERTYPE.DROPOUT: _darknet_dropout,
-    LAYERTYPE.AVGPOOL: _darknet_avgpooling,
-    LAYERTYPE.ROUTE: _darknet_route,
-    LAYERTYPE.REORG: _darknet_reorg,
-    LAYERTYPE.REGION: _darknet_region,
-    LAYERTYPE.SHORTCUT: _darknet_shortcut,
-    LAYERTYPE.UPSAMPLE: _darknet_upsampling,
-    LAYERTYPE.L2NORM: _darknet_l2normalize,
-    LAYERTYPE.YOLO: _darknet_yolo,
-    LAYERTYPE.DECONVOLUTIONAL: _darknet_not_support,
-    LAYERTYPE.BATCHNORM: _darknet_not_support,
-    LAYERTYPE.DETECTION: _darknet_not_support,
-    LAYERTYPE.CROP: _darknet_not_support,
-    LAYERTYPE.COST: _darknet_not_support,
-    LAYERTYPE.NORMALIZATION: _darknet_not_support,
-    LAYERTYPE.LOCAL: _darknet_not_support,
-    LAYERTYPE.ACTIVE: _darknet_not_support,
-    LAYERTYPE.RNN: _darknet_not_support,
-    LAYERTYPE.GRU: _darknet_not_support,
-    LAYERTYPE.LSTM: _darknet_not_support,
-    LAYERTYPE.CRNN: _darknet_not_support,
-    LAYERTYPE.NETWORK: _darknet_not_support,
-    LAYERTYPE.XNOR: _darknet_not_support,
-    LAYERTYPE.BLANK: _darknet_not_support,
-}
-
-
-def _darknet_convert_symbol(op_name, inputs, params, attrs, params_prefix):
-    """Convert from darknet op to relay op.
-    Parameters
-    ----------
-    op_name : str
-        Operator name, such as Convolution, Connected, etc
-    inputs : list of relay.Function
-        List of input symbols.
-    attrs : dict
-        Dict of operator attributes
-    params_prefix: str
-        Params name for this operation
-
-    Returns
-    -------
-    out_name : converted out name of operation
-    sym : tvm.relay.Function
-        Converted relay function
-    """
-
-    if op_name in _DARKNET_CONVERT_MAP:
-        sym = _DARKNET_CONVERT_MAP[op_name](inputs, params, attrs, params_prefix)
-    else:
-        _darknet_not_support("Operator type " + str(op_name))
-    return sym
-
-
-def _as_list(arr):
-    """Force being a list, ignore if already is."""
-    if isinstance(arr, list):
-        return arr
-    return [arr]
-
-
-class GraphProto(object):
-    """A helper class for handling relay functions from darknet model."""
-
-    def __init__(self, net, shape, dtype="float32"):
-        self._net = net
-        self._shape = shape
-        self._dtype = dtype
-        self._sym_array = {}
-        self._tvmparams = {}
-        self._outs = []
-        self._state_ctr = {}
-        self._state_ctr["rnn"] = 0
-        self._state_ctr["crnn"] = 0
-        self._state_ctr["lstm"] = 0
-        self._state_ctr["cell_state"] = 0
-        self._state_ctr["gru"] = 0
-
-    def _read_memory_buffer(self, shape, data, dtype=None):
-        if dtype is None:
-            dtype = self._dtype
-        length = 1
-        for x in shape:
-            length *= x
-        data_np = np.zeros(length, dtype=dtype)
-        for i in range(length):
-            data_np[i] = data[i]
-        return data_np.reshape(shape)
-
-    def _get_convolution_weights(self, layer, opname):
-        """Get the convolution layer weights and biases."""
-        if layer.nweights == 0:
-            return None
-
-        if (layer.n * layer.c // layer.groups * layer.size * layer.size) != layer.nweights:
-            raise RuntimeError("layer weights size not matching with n c h w")
-
-        params = {}
-        shape = (layer.n, layer.c // layer.groups, layer.size, layer.size)
-        weights = self._read_memory_buffer(shape, layer.weights)
-
-        biases = self._read_memory_buffer((layer.n,), layer.biases)
-
-        k = _get_params_name(opname, "weight")
-        params[k] = tvm.nd.array(weights)
-
-        if layer.batch_normalize == 1 and layer.dontloadscales != 1:
-            params.update(self._get_batchnorm_weights(layer, opname, layer.n))
-            k = _get_params_name(opname, "beta")
-            params[k] = tvm.nd.array(biases)
-        else:
-            k = _get_params_name(opname, "bias")
-            params[k] = tvm.nd.array(biases)
-        return params
-
-    def _get_connected_weights(self, layer, opname):
-        """Parse the weights and biases for fully connected or dense layer."""
-        size = layer.outputs * layer.inputs
-        if size == 0:
-            return None
-
-        weights = self._read_memory_buffer((layer.outputs, layer.inputs), layer.weights)
-        biases = self._read_memory_buffer((layer.outputs,), layer.biases)
-
-        params = {}
-        k = _get_params_name(opname, "weight")
-        params[k] = tvm.nd.array(weights)
-
-        if layer.batch_normalize == 1 and layer.dontloadscales != 1:
-            params.update(self._get_batchnorm_weights(layer, opname, layer.outputs))
-            k = _get_params_name(opname, "beta")
-            params[k] = tvm.nd.array(biases)
-        else:
-            k = _get_params_name(opname, "bias")
-            params[k] = tvm.nd.array(biases)
-        return params
-
-    def _get_region_weights(self, layer, opname):
-        """Parse the biases for region layer."""
-        biases = self._read_memory_buffer((layer.n * 2,), layer.biases)
-        attributes = np.array(
-            [
-                layer.n,
-                layer.out_c,
-                layer.out_h,
-                layer.out_w,
-                layer.classes,
-                layer.coords,
-                layer.background,
-            ],
-            dtype=np.int32,
-        )
-        params = {}
-        k = _get_params_name(opname, "bias")
-        params[k] = tvm.nd.array(biases)
-        k = _get_params_name(opname, "attr")
-        params[k] = tvm.nd.array(attributes)
-        return params
-
-    def _get_yolo_weights(self, layer, opname):
-        """Parse the biases and mask for yolo layer."""
-        biases = self._read_memory_buffer((layer.total * 2,), layer.biases)
-        mask = self._read_memory_buffer((layer.n,), layer.mask, dtype="int32")
-        attributes = np.array(
-            [layer.n, layer.out_c, layer.out_h, layer.out_w, layer.classes, layer.total],
-            dtype=np.int32,
-        )
-        params = {}
-        k = _get_params_name(opname, "bias")
-        params[k] = tvm.nd.array(biases)
-        k = _get_params_name(opname, "mask")
-        params[k] = tvm.nd.array(mask)
-        k = _get_params_name(opname, "attr")
-        params[k] = tvm.nd.array(attributes)
-        return params
-
-    def _get_batchnorm_weights(self, layer, opname, size):
-        """Parse the weights for batchnorm, which includes, scales, moving mean
-        and moving variances."""
-        scales = self._read_memory_buffer((size,), layer.scales)
-        rolling_mean = self._read_memory_buffer((size,), layer.rolling_mean)
-        rolling_variance = self._read_memory_buffer((size,), layer.rolling_variance)
-
-        params = {}
-        k = _get_params_name(opname, "moving_mean")
-        params[k] = tvm.nd.array(rolling_mean)
-        k = _get_params_name(opname, "moving_var")
-        params[k] = tvm.nd.array(rolling_variance)
-        k = _get_params_name(opname, "gamma")
-        params[k] = tvm.nd.array(scales)
-        return params
-
-    def _get_darknet_attrs(self, layer, layer_num):
-        """Parse attributes of each layer and return."""
-        attr = {}
-        use_flatten = True
-        layer_type = LAYERTYPE(layer.type)
-        if LAYERTYPE.CONVOLUTIONAL == layer_type:
-            attr.update({"pad": layer.pad})
-            attr.update({"num_group": layer.groups})
-            attr.update({"num_filter": layer.n})
-            attr.update({"stride": layer.stride})
-            attr.update({"kernel": layer.size})
-            attr.update({"activation": (layer.activation)})
-
-            if layer.nbiases == 0:
-                attr.update({"use_bias": False})
-            else:
-                attr.update({"use_bias": True})
-
-            if layer.batch_normalize == 1 and layer.dontloadscales != 1:
-                attr.update({"use_batchNorm": True})
-                attr.update({"use_scales": True})
-
-        elif LAYERTYPE.CONNECTED == layer_type:
-            attr.update({"num_hidden": layer.outputs})
-            attr.update({"activation": (layer.activation)})
-            if layer_num != 0:
-                layer_prev = self._net.layers[layer_num - 1]
-                if (
-                    layer_prev.out_h == layer.h
-                    and layer_prev.out_w == layer.w
-                    and layer_prev.out_c == layer.c
-                ):
-                    use_flatten = False
-            attr.update({"use_flatten": use_flatten})
-            attr.update({"use_bias": True})
-            if layer.batch_normalize == 1 and layer.dontloadscales != 1:
-                attr.update({"use_batchNorm": True})
-                attr.update({"use_scales": True})
-                attr.update({"use_bias": False})
-
-        elif LAYERTYPE.MAXPOOL == layer_type:
-            attr.update({"pad": layer.pad})
-            attr.update({"stride": layer.stride})
-            attr.update({"kernel": layer.size})
-            max_output = (layer.w - layer.size + 2 * layer.pad) / float(layer.stride) + 1
-            if max_output < layer.out_w:
-                extra_pad = (layer.out_w - max_output) * layer.stride
-                attr.update({"extra_pad_size": int(extra_pad)})
-        elif LAYERTYPE.AVGPOOL == layer_type:
-            attr.update({"pad": layer.pad})
-            if layer.stride == 0:
-                attr.update({"stride": 1})
-            else:
-                attr.update({"stride": layer.stride})
-            if layer.size == 0 and layer.h == layer.w:
-                attr.update({"kernel": layer.h})
-            else:
-                attr.update({"kernel": layer.size})
-
-        elif LAYERTYPE.DROPOUT == layer_type:
-            attr.update({"p": layer.probability})
-
-        elif LAYERTYPE.SOFTMAX == layer_type:
-            attr.update({"axis": 1})
-            attr.update({"use_flatten": True})
-            if layer.temperature:
-                attr.update({"temperature": str(layer.temperature)})
-
-        elif LAYERTYPE.SHORTCUT == layer_type:
-            add_layer = self._net.layers[layer.index]
-            attr.update({"activation": layer.activation})
-            attr.update({"out_channel": layer.out_c})
-            attr.update({"out_size": layer.out_h})
-            attr.update({"add_out_channel": add_layer.out_c})
-            attr.update({"add_out_size": add_layer.out_h})
-
-        elif LAYERTYPE.ROUTE == layer_type:
-            pass
-
-        elif LAYERTYPE.COST == layer_type:
-            pass
-
-        elif LAYERTYPE.REORG == layer_type:
-            attr.update({"stride": layer.stride})
-
-        elif LAYERTYPE.REGION == layer_type:
-            attr.update({"n": layer.n})
-            attr.update({"classes": layer.classes})
-            attr.update({"coords": layer.coords})
-            attr.update({"background": layer.background})
-            attr.update({"softmax": layer.softmax})
-            attr.update({"shape": (-1, layer.c, layer.h, layer.w)})
-
-        elif LAYERTYPE.YOLO == layer_type:
-            attr.update({"n": layer.n})
-            attr.update({"classes": layer.classes})
-            attr.update({"shape": (-1, layer.c, layer.h, layer.w)})
-
-        elif LAYERTYPE.UPSAMPLE == layer_type:
-            attr.update({"scale": layer.stride})
-
-        elif LAYERTYPE.L2NORM == layer_type:
-            pass
-
-        else:
-            err = f"Darknet layer type {layer_type} is not supported in relay."
-            raise NotImplementedError(err)
-
-        return attr
-
-    def _get_darknet_params(self, layer, opname):
-        """To parse and get the darknet params."""
-        layer_type = LAYERTYPE(layer.type)
-        params = None
-        if LAYERTYPE.CONVOLUTIONAL == layer_type:
-            params = self._get_convolution_weights(layer, opname)
-        elif LAYERTYPE.CONNECTED == layer_type:
-            params = self._get_connected_weights(layer, opname)
-        elif LAYERTYPE.REGION == layer_type:
-            params = self._get_region_weights(layer, opname)
-        elif LAYERTYPE.YOLO == layer_type:
-            params = self._get_yolo_weights(layer, opname)
-        return params
-
-    def _preproc_layer(self, layer, layer_num):
-        """To preprocess each darknet layer, some layer doesnt need processing."""
-        if layer_num == 0:
-            name = "data"
-            sym = new_var(name, shape=self._shape, dtype=self._dtype)
-        else:
-            sym = self._sym_array[layer_num - 1]
-        skip_layer = False
-        layer_type = LAYERTYPE(layer.type)
-        if LAYERTYPE.ROUTE == layer_type:
-            sym = []
-            for j in range(layer.n):
-                sym.append(self._sym_array[layer.input_layers[j]])
-            if layer.n == 1:
-                skip_layer = True
-
-        elif LAYERTYPE.COST == layer_type:
-            skip_layer = True
-
-        elif LAYERTYPE.SHORTCUT == layer_type:
-            sym = [sym, self._sym_array[layer.index]]
-
-        elif LAYERTYPE.BLANK == layer_type:
-            skip_layer = True
-
-        if skip_layer is True:
-            self._sym_array[layer_num] = sym
-
-        return skip_layer, sym
-
-    def _get_opname(self, layer):
-        """Returs the layer name."""
-        return LAYERTYPE(layer.type)
-
-    def _new_rnn_state_var(self, state=None, name="rnn"):
-        """Returs a symbol for state"""
-        sym_name = name + f"{self._state_ctr[name]}_state"
-        self._state_ctr[name] += 1
-        return new_var(sym_name, shape=state.shape, dtype=str(state.dtype))
-
-    def _get_rnn_state_buffer(self, layer, name):
-        """Get the state buffer for rnn."""
-        buffer = np.zeros((1, layer.outputs), self._dtype)
-        return self._new_rnn_state_var(buffer, name)
-
-    def _get_darknet_rnn_attrs(self, layer, name, sym):
-        """Get the rnn converted symbol from attributes."""
-        attr = self._get_darknet_attrs(layer, 0)
-        op_name = self._get_opname(layer)
-        prefix = _get_params_prefix(op_name, name)
-        params = self._get_darknet_params(layer, prefix)
-        sym = _darknet_convert_symbol(op_name, _as_list(sym), params, attr, prefix)
-        if params:
-            self._tvmparams.update(params)
-        return sym
-
-    def _handle_darknet_rnn_layers(self, layer_num, sym):
-        """Parse attributes and handle the rnn layers."""
-        attr = {}
-        layer = self._net.layers[layer_num]
-        processed = False
-
-        layer_type = LAYERTYPE(layer.type)
-        if LAYERTYPE.RNN == layer_type:
-            attr.update({"n": layer.n})
-            attr.update({"batch": layer.batch})
-            attr.update({"num_hidden": str(layer.outputs)})
-            state = self._get_rnn_state_buffer(layer, "rnn")
-            for _ in range(layer.steps):
-                input_layer = layer.input_layer
-                prefix = "_input_" + str(layer_num)
-                sym = self._get_darknet_rnn_attrs(input_layer, prefix, sym)
-
-                self_layer = layer.self_layer
-                prefix = "_self_" + str(layer_num)
-                state = self._get_darknet_rnn_attrs(self_layer, prefix, state)
-
-                state = sym + state
-                self._outs.append(state)
-
-                output_layer = layer.output_layer
-                prefix = "_output_" + str(layer_num)
-                sym = self._get_darknet_rnn_attrs(output_layer, prefix, state)
-
-            self._sym_array[layer_num] = sym
-            processed = True
-        return processed, sym
-
-    def _make_outlist(self, sym, op_name, layer, layer_num):
-        layer_type = LAYERTYPE(layer.type)
-        if layer_type == LAYERTYPE.REGION:
-            # Add attributes
-            k = _get_params_name(op_name, "attr")
-            dshape = self._tvmparams[k].shape
-            dtype = self._tvmparams[k].dtype
-            self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype))
-
-            # Add bias
-            k = _get_params_name(op_name, "bias")
-            dshape = self._tvmparams[k].shape
-            dtype = self._tvmparams[k].dtype
-            self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype))
-            if layer_num != self._net.n - 1:
-                self._outs.insert(0, sym)
-
-        elif layer_type == LAYERTYPE.YOLO:
-            # Add attributes
-            k = _get_params_name(op_name, "attr")
-            dshape = self._tvmparams[k].shape
-            dtype = self._tvmparams[k].dtype
-            self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype))
-
-            # Add bias
-            k = _get_params_name(op_name, "bias")
-            dshape = self._tvmparams[k].shape
-            dtype = self._tvmparams[k].dtype
-            self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype))
-
-            # Add mask
-            k = _get_params_name(op_name, "mask")
-            dshape = self._tvmparams[k].shape
-            dtype = self._tvmparams[k].dtype
-            self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype))
-
-            if layer_num != self._net.n - 1:
-                self._outs.insert(0, sym)
-
-    def from_darknet(self):
-        """To convert the darknet symbol to relay functions."""
-        for i in range(self._net.n):
-            layer = self._net.layers[i]
-            need_skip, sym = self._preproc_layer(layer, i)
-            if need_skip:
-                continue
-
-            processed, sym = self._handle_darknet_rnn_layers(i, sym)
-            if processed:
-                continue
-
-            attr = self._get_darknet_attrs(layer, i)
-            op_name = self._get_opname(layer)
-            prefix = _get_params_prefix(op_name, i)
-            params = self._get_darknet_params(self._net.layers[i], prefix)
-            sym = _darknet_convert_symbol(op_name, _as_list(sym), params, attr, prefix)
-
-            if params:
-                self._tvmparams.update(params)
-            self._sym_array[i] = sym
-            self._make_outlist(sym, prefix, layer, i)
-
-        outputs = _as_list(sym) + self._outs
-        outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-        sym = _function.Function(analysis.free_vars(outputs), outputs)
-        return IRModule.from_expr(sym), self._tvmparams
-
-
-def from_darknet(net, shape=None, dtype="float32"):
-    """Convert from Darknet's model into compatible relay Function.
-
-    Parameters
-    ----------
-    net : Darknet net parameter
-        Darknet net structure.
-    shape : dict of str to tuple, optional
-        The input shape to the graph
-    dtype : str or dict of str to str
-        The input types to the graph
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation.
-
-    params : dict of str to tvm.nd.NDArray
-        The parameter dict to be used by relay
-    """
-
-    return GraphProto(net, shape, dtype).from_darknet()
diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
deleted file mode 100644
index d53647cc684c..000000000000
--- a/python/tvm/relay/frontend/keras.py
+++ /dev/null
@@ -1,1601 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, import-outside-toplevel
-"""Keras frontend."""
-import dis
-import sys
-import numpy as np
-import tvm
-from tvm.ir import IRModule, TensorType, TupleType
-
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from ... import nd as _nd
-from .common import ExprTable, new_var
-
-__all__ = ["from_keras"]
-
-
-def _check_data_format(keras_layer):
-    if hasattr(keras_layer, ("data_format")):
-        if keras_layer.data_format != "channels_last":
-            raise ValueError("Keras frontend currently supports data_format = channels_last only.")
-
-
-def _get_pad_pair(input1d, kernel1d, stride1d):
-    out1d = (input1d + stride1d - 1) // stride1d
-    pad = np.maximum((out1d - 1) * stride1d + kernel1d - input1d, 0)
-    pad_before = pad // 2
-    pad_after = pad - pad_before
-    return [pad_before, pad_after]
-
-
-def _get_elu(inexpr, alpha):
-    """A helper method for elu."""
-    return _op.negative(alpha) * _op.nn.relu(
-        _expr.const(1.0, dtype="float32") - _op.exp(inexpr)
-    ) + _op.nn.relu(inexpr)
-
-
-def _as_list(arr):
-    """Force being a list, ignore if already is."""
-    if isinstance(arr, list):
-        return arr
-    return [arr]
-
-
-def _convert_recurrent_activation(inexpr, keras_layer):
-    act_type = keras_layer.recurrent_activation.__name__
-    return _convert_activation(inexpr, act_type, None, None, None)
-
-
-def _convert_activation(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    if isinstance(keras_layer, str):
-        act_type = keras_layer
-    else:
-        if sys.version_info.major < 3:
-            act_type = keras_layer.activation.func_name
-        else:
-            act_type = keras_layer.activation.__name__
-    if act_type == "linear":
-        if isinstance(keras_layer, str):
-            return inexpr
-        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1.0
-        beta = keras_layer.beta if hasattr(keras_layer, "beta") else 0.0
-        alpha = _expr.const(alpha, dtype="float32")
-        beta = _expr.const(beta, dtype="float32")
-        return _op.add(_op.multiply(inexpr, alpha), beta)
-    if act_type == "softmax":
-        axis = 1 if data_layout == "NCHW" else -1
-        return _op.nn.softmax(inexpr, axis)
-    if act_type == "sigmoid":
-        return _op.sigmoid(inexpr)
-    if act_type == "tanh":
-        return _op.tanh(inexpr)
-    if act_type == "relu":
-        return _op.nn.relu(inexpr)
-    if act_type == "softplus":
-        return _op.log(_op.add(_op.exp(inexpr), _expr.const(1.0, dtype="float32")))
-    if act_type == "elu":
-        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1.0
-        alpha = _expr.const(alpha, dtype="float32")
-        return _get_elu(inexpr, alpha)
-    if act_type == "selu":
-        # Alpha, Gamma values obtained from https://arxiv.org/abs/1706.02515
-        alpha = (
-            keras_layer.alpha
-            if hasattr(keras_layer, "alpha")
-            else 1.6732632423543772848170429916717
-        )
-        gamma = (
-            keras_layer.gamma
-            if hasattr(keras_layer, "gamma")
-            else 1.0507009873554804934193349852946
-        )
-        alpha = _expr.const(alpha, dtype="float32")
-        gamma = _expr.const(gamma, dtype="float32")
-        return gamma * _get_elu(inexpr, alpha)
-    if act_type == "relu6":
-        return _op.clip(inexpr, a_min=0.0, a_max=6.0)
-    if act_type == "softsign":
-        return inexpr / (_expr.const(1.0, dtype="float32") + _op.abs(inexpr))
-    if act_type == "hard_sigmoid":
-        x = (_expr.const(0.2, dtype="float32") * inexpr) + _expr.const(0.5, dtype="float32")
-        return _op.clip(x, a_min=0.0, a_max=1.0)
-    if act_type == "swish":
-        return inexpr * _op.sigmoid(inexpr)
-
-    raise tvm.error.OpNotImplemented(f"Operator {act_type} is not supported in frontend Keras.")
-
-
-def _convert_advanced_activation(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    act_type = type(keras_layer).__name__
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    if act_type == "Softmax":
-        axis = keras_layer.axis
-        dims = len(input_shape) if input_shape else 0
-        if isinstance(axis, list):
-            raise tvm.error.OpAttributeUnImplemented(f"Softmax with axes {axis} is not supported.")
-        if data_layout == "NCHW":
-            if dims == 0:
-                axis = 0
-            elif axis == -1:
-                axis = 1
-            else:
-                axis = axis + 1 if axis < dims - 1 else 1
-        return _op.nn.softmax(inexpr, axis=axis)
-    if act_type == "ReLU":
-        if np.isnan(keras_layer.threshold).any():
-            raise tvm.error.OpAttributeInvalid("The threshold value of a ReLU cannot be None.")
-        threshold = _expr.const(keras_layer.threshold, dtype="float32")
-        if keras_layer.max_value and float(keras_layer.threshold) == 0:
-            # f(x) = max_value, for x >= max_value
-            # f(x) = x,         for threshold <= x < max_value
-            return _op.clip(inexpr, a_min=0.0, a_max=float(keras_layer.max_value))
-        if keras_layer.max_value and _op.greater(threshold, inexpr).astype("float32"):
-            # f(x) = negative_slope * (inexpr - threshold)
-            negative_slope = _expr.const(keras_layer.negative_slope, dtype="float32")
-            return _op.multiply(negative_slope, _op.subtract(inexpr, threshold))
-        return _op.nn.relu(inexpr)
-    if act_type == "LeakyReLU":
-        if np.isnan(keras_layer.alpha).any():
-            raise tvm.error.OpAttributeInvalid("The alpha value of a LeakyReLU cannot be None.")
-        return _op.nn.leaky_relu(inexpr, alpha=float(keras_layer.alpha))
-    if act_type == "ELU":
-        if np.isnan(keras_layer.alpha).any():
-            raise tvm.error.OpAttributeInvalid("The alpha value of a ELU cannot be None.")
-        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1.0
-        alpha = _expr.const(alpha, dtype="float32")
-        return _get_elu(inexpr, alpha)
-    if act_type == "PReLU":
-        assert hasattr(keras_layer, "alpha"), "alpha required for PReLU."
-        _check_data_format(keras_layer)
-        size = len(keras_layer.alpha.shape)
-        if data_layout == "NCHW":
-            alpha = etab.new_const(keras_layer.get_weights()[0].transpose(np.roll(range(size), 1)))
-        else:
-            alpha = etab.new_const(keras_layer.get_weights()[0])
-        return _op.negative(alpha) * _op.nn.relu(_op.negative(inexpr)) + _op.nn.relu(inexpr)
-    if act_type == "ThresholdedReLU":
-        theta = keras_layer.theta if hasattr(keras_layer, "theta") else 1.0
-        return _op.multiply(
-            inexpr, _op.greater(inexpr, _expr.const(theta, dtype="float32")).astype("float32")
-        )
-
-    raise tvm.error.OpNotImplemented(f"Operator {act_type} is not supported in frontend Keras.")
-
-
-def _convert_merge(
-    inexpr, keras_layer, _, input_shape=None, data_layout=None
-):  # pylint: disable=unused-argument
-    merge_type = type(keras_layer).__name__
-    ret = inexpr[0]
-    if merge_type == "Dot":
-        axes = keras_layer.axes
-        if isinstance(keras_layer.axes, int):
-            axes = [keras_layer.axes, keras_layer.axes]
-        if isinstance(axes, list):
-            if len(axes) != 2:
-                raise tvm.error.OpAttributeUnImplemented(
-                    f"Dot with axes {keras_layer.axes} is not supported."
-                )
-            for i, axis in enumerate(axes):
-                if axis not in [1, 2]:
-                    raise tvm.error.OpAttributeUnImplemented(
-                        f"Dot with axes {keras_layer.axes} is not supported."
-                    )
-                if axes[i] == 2:
-                    inexpr[i] = _op.transpose(inexpr[i], axes=[0, 2, 1])
-        else:
-            raise tvm.error.OpAttributeUnImplemented(
-                f"Dot with axes {keras_layer.axes} is not supported."
-            )
-        ret_dot = _op.nn.batch_matmul(inexpr[0], inexpr[1])
-        ret = _op.transpose(ret_dot, axes=[0, 2, 1])
-    elif merge_type == "Subtract":
-        assert len(inexpr) == 2, "Subtract merge takes 2 inputs."
-        ret = _op.subtract(ret, inexpr[1])
-    elif merge_type in ["Add", "Multiply", "Minimum", "Maximum"]:
-        op_map = {
-            "Add": _op.add,
-            "Multiply": _op.multiply,
-            "Minimum": _op.minimum,
-            "Maximum": _op.maximum,
-        }
-        for i in range(1, len(inexpr)):
-            ret = op_map[merge_type](ret, inexpr[i])
-    elif merge_type == "Average":
-        for i in range(1, len(inexpr)):
-            ret = _op.add(ret, inexpr[i])
-        ret = ret / _expr.const(len(inexpr), dtype="float32")
-    else:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {merge_type} is not supported in frontend Keras."
-        )
-    return ret
-
-
-def _convert_permute(
-    inexpr, keras_layer, _, input_shape=None, data_layout=None
-):  # pylint: disable=unused-argument
-    return _op.transpose(inexpr, axes=(0,) + keras_layer.dims)
-
-
-def _convert_embedding(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    indices = inexpr
-    weightList = keras_layer.get_weights()
-    weight = etab.new_const(weightList[0])
-    out = _op.take(weight, indices.astype("int32"), axis=0)
-
-    return out
-
-
-def _convert_dense(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    weightList = keras_layer.get_weights()
-    weight = etab.new_const(weightList[0].transpose([1, 0]))
-    params = {"weight": weight, "units": weightList[0].shape[1]}
-    units = list(weightList[0].shape)[1]
-    assert units > 0, "The value of units must be a positive integer"
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-    input_dim = len(input_shape)
-    # In case of RNN dense, input shape will be (1, 1, n)
-    if input_dim > 2:
-        input_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0])
-        # Keras has no limitations on the shape of the input tensor. But our
-        # dense op expects 2D input. All inputs with number of dimensions > 2
-        # are reshaped all "batch" axes into one.
-        # For example: (N, d1, d2, d3) -> (N * d1 * d2, d3)
-        new_batch_size = np.prod(input_shape[:-1])
-        inexpr = _op.reshape(inexpr, newshape=(new_batch_size, input_shape[-1]))
-    out = _op.nn.dense(data=inexpr, **params)
-    if keras_layer.use_bias:
-        bias = etab.new_const(weightList[1])
-        out = _op.nn.bias_add(out, bias)
-    # defuse activation
-    if sys.version_info.major < 3:
-        act_type = keras_layer.activation.func_name
-    else:
-        act_type = keras_layer.activation.__name__
-    if act_type != "linear":
-        out = _convert_activation(out, act_type, etab, data_layout)
-    if input_dim > 2:
-        out_shape = (*input_shape[:-1], units)
-        out = _op.reshape(out, newshape=out_shape)
-    return out
-
-
-def _convert_convolution1d(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    is_deconv = type(keras_layer).__name__ == "Conv1DTranspose"
-
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-    _check_data_format(keras_layer)
-    weightList = keras_layer.get_weights()
-    weight = weightList[0]
-
-    if data_layout == "NWC":
-        kernel_layout = "WIO"
-        if is_deconv:
-            kernel_layout = "WOI"
-    else:
-        kernel_layout = "OIW"
-        if is_deconv:
-            kernel_layout = "IOW"
-        msg = (
-            f"Kernel layout with {kernel_layout} is not supported for operator Convolution1D "
-            f"in frontend Keras."
-        )
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-    if is_deconv:
-        if kernel_layout == "IOW":
-            weight = weight.transpose([2, 1, 0])
-        kernel_w, n_filters, _ = weight.shape
-    else:
-        kernel_w, _, n_filters = weight.shape
-
-    dilation_rate = keras_layer.dilation_rate
-    if isinstance(dilation_rate, (list, tuple)):
-        dilation = [dilation_rate[0]]
-    else:
-        dilation = [dilation_rate]
-
-    dilated_kernel_w = (kernel_w - 1) * dilation[0] + 1
-    stride_w = keras_layer.strides[0]
-    params = {
-        "weight": etab.new_const(weight),
-        "kernel_size": [kernel_w],
-        "strides": [stride_w],
-        "dilation": dilation,
-        "padding": [0],
-        "data_layout": data_layout,
-        "kernel_layout": kernel_layout,
-    }
-    params["channels"] = n_filters
-
-    if keras_layer.padding == "valid":
-        pass
-    # calculate the padding values
-    elif keras_layer.padding == "same":
-        in_w = input_shape[1]
-        pad_w = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
-        params["padding"] = [pad_w[0], pad_w[1]]
-    else:
-        msg = (
-            f"Padding with {keras_layer.padding} is not supported for operator Convolution3D "
-            f"in frontend Keras."
-        )
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-    if is_deconv:
-        out = _op.nn.conv1d_transpose(data=inexpr, **params)
-    else:
-        out = _op.nn.conv1d(data=inexpr, **params)
-
-    channel_axis = -1 if data_layout == "NWC" else 1
-    if keras_layer.use_bias:
-        bias = etab.new_const(weightList[1])
-        out = _op.nn.bias_add(out, bias, channel_axis)
-
-    # defuse activation
-    if sys.version_info.major < 3:
-        act_type = keras_layer.activation.func_name
-    else:
-        act_type = keras_layer.activation.__name__
-    if act_type != "linear":
-        out = _convert_activation(out, act_type, etab, data_layout)
-
-    return out
-
-
-def _convert_convolution(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    _check_data_format(keras_layer)
-    is_deconv = type(keras_layer).__name__ == "Conv2DTranspose"
-    is_depthconv = type(keras_layer).__name__ == "DepthwiseConv2D"
-    weightList = keras_layer.get_weights()
-    weight = weightList[0]
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    if data_layout == "NHWC":
-        if is_depthconv:
-            kernel_layout = "HWOI"
-        elif is_deconv:
-            kernel_layout = "HWOI"
-        else:
-            kernel_layout = "HWIO"
-    else:
-        if is_deconv:
-            kernel_layout = "IOHW"
-        else:
-            kernel_layout = "OIHW"
-
-    if is_deconv:
-        kernel_h, kernel_w, n_filters, in_channels = weight.shape
-        if kernel_layout == "IOHW":
-            weight = weight.transpose([3, 2, 0, 1])
-    elif is_depthconv:
-        kernel_h, kernel_w, in_channels, depth_mult = weight.shape
-        if kernel_layout == "OIHW":
-            weight = weight.transpose([2, 3, 0, 1])
-    elif data_layout == "NCHW":
-        kernel_h, kernel_w, in_channels, n_filters = weight.shape
-        weight = weight.transpose([3, 2, 0, 1])
-    else:
-        kernel_h, kernel_w, in_channels, n_filters = weight.shape
-    if isinstance(keras_layer.dilation_rate, (list, tuple)):
-        dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]]
-    else:
-        dilation = [keras_layer.dilation_rate, keras_layer.dilation_rate]
-    dilated_kernel_h = (kernel_h - 1) * dilation[0] + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation[1] + 1
-    stride_h, stride_w = keras_layer.strides
-    params = {
-        "weight": etab.new_const(weight),
-        "kernel_size": [kernel_h, kernel_w],
-        "strides": [stride_h, stride_w],
-        "dilation": dilation,
-        "padding": [0, 0],
-        "data_layout": data_layout,
-        "kernel_layout": kernel_layout,
-    }
-    if is_depthconv:
-        params["channels"] = in_channels * depth_mult
-        params["groups"] = in_channels
-    else:
-        params["channels"] = n_filters
-    if is_deconv and keras_layer.output_padding:
-        params["output_padding"] = keras_layer.output_padding
-    if keras_layer.padding == "valid":
-        pass
-    # we insert a separate pad operator
-    elif keras_layer.padding == "same":
-        in_h = input_shape[1]
-        in_w = input_shape[2]
-        pad_t, pad_b = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
-        pad_l, pad_r = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
-        params["padding"] = (pad_t, pad_l, pad_b, pad_r)
-    else:
-        msg = (
-            f"Padding with {keras_layer.padding} is not supported for operator Convolution "
-            f"in frontend Keras."
-        )
-        raise tvm.error.OpAttributeUnImplemented(msg)
-    if is_deconv:
-        out = _op.nn.conv2d_transpose(data=inexpr, **params)
-    else:
-        out = _op.nn.conv2d(data=inexpr, **params)
-
-    if keras_layer.use_bias:
-        bias = etab.new_const(weightList[1])
-        if data_layout == "NCHW":
-            out = _op.nn.bias_add(out, bias)
-        else:
-            out = _op.nn.bias_add(out, bias, axis=-1)
-    # defuse activation
-    if sys.version_info.major < 3:
-        act_type = keras_layer.activation.func_name
-    else:
-        act_type = keras_layer.activation.__name__
-    if act_type != "linear":
-        out = _convert_activation(out, act_type, etab, data_layout)
-
-    return out
-
-
-def _convert_convolution3d(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    _check_data_format(keras_layer)
-    is_deconv = type(keras_layer).__name__ == "Conv3DTranspose"
-    weightList = keras_layer.get_weights()
-    weight = weightList[0]
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    if data_layout == "NDHWC":
-        kernel_layout = "DHWIO"
-        if is_deconv:
-            kernel_layout = "DHWOI"
-    else:
-        kernel_layout = "OIDHW"
-        if is_deconv:
-            kernel_layout = "IODHW"
-        msg = (
-            f"Kernel layout with {kernel_layout} is not supported for operator Convolution3D "
-            f"in frontend Keras."
-        )
-        raise tvm.error.OpAttributeUnImplemented(msg)
-
-    if is_deconv:
-        kernel_d, kernel_h, kernel_w, n_filters, _ = weight.shape
-        if kernel_layout == "IODHW":
-            weight = weight.transpose([4, 3, 0, 1, 2])
-    else:
-        kernel_d, kernel_h, kernel_w, _, n_filters = weight.shape
-
-    dilation_rate = keras_layer.dilation_rate
-    if isinstance(dilation_rate, (list, tuple)):
-        dilation = [dilation_rate[0], dilation_rate[1], dilation_rate[2]]
-    else:
-        dilation = [dilation_rate, dilation_rate, dilation_rate]
-
-    dilated_kernel_d = (kernel_d - 1) * dilation[0] + 1
-    dilated_kernel_h = (kernel_h - 1) * dilation[1] + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation[2] + 1
-    stride_d, stride_h, stride_w = keras_layer.strides
-    params = {
-        "weight": etab.new_const(weight),
-        "kernel_size": [kernel_d, kernel_h, kernel_w],
-        "strides": [stride_d, stride_h, stride_w],
-        "dilation": dilation,
-        "padding": [0, 0, 0],
-        "data_layout": data_layout,
-        "kernel_layout": kernel_layout,
-    }
-    params["channels"] = n_filters
-    if is_deconv and keras_layer.output_padding:
-        params["output_padding"] = keras_layer.output_padding
-
-    if keras_layer.padding == "valid":
-        pass
-    # calculate the padding values
-    elif keras_layer.padding == "same":
-        in_d = input_shape[1]
-        in_h = input_shape[2]
-        in_w = input_shape[3]
-        pad_d = _get_pad_pair(in_d, dilated_kernel_d, stride_d)
-        pad_h = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
-        pad_w = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
-        params["padding"] = [pad_d[0], pad_h[0], pad_w[0], pad_d[1], pad_h[1], pad_w[1]]
-    else:
-        msg = (
-            f"Padding with {keras_layer.padding} is not supported for operator Convolution3D "
-            f"in frontend Keras."
-        )
-        raise tvm.error.OpAttributeUnImplemented(msg)
-    if is_deconv:
-        out = _op.nn.conv3d_transpose(data=inexpr, **params)
-    else:
-        out = _op.nn.conv3d(data=inexpr, **params)
-
-    channel_axis = -1 if data_layout == "NDHWC" else 1
-    if keras_layer.use_bias:
-        bias = etab.new_const(weightList[1])
-        out = _op.nn.bias_add(out, bias, channel_axis)
-
-    # defuse activation
-    if sys.version_info.major < 3:
-        act_type = keras_layer.activation.func_name
-    else:
-        act_type = keras_layer.activation.__name__
-    if act_type != "linear":
-        out = _convert_activation(out, act_type, etab, None)
-
-    return out
-
-
-def _convert_separable_convolution(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    _check_data_format(keras_layer)
-
-    if data_layout == "NHWC":
-        kernel_layout = "HWOI"
-    else:
-        kernel_layout = "OIHW"
-
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    weightList = keras_layer.get_weights()
-    # depthwise conv
-    kernel_h, kernel_w, in_channels, depth_mult = weightList[0].shape
-    stride_h, stride_w = keras_layer.strides
-    if kernel_layout == "OIHW":
-        weight0 = weightList[0].transpose([2, 3, 0, 1])
-    else:
-        weight0 = weightList[0]
-    if isinstance(keras_layer.dilation_rate, (list, tuple)):
-        dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]]
-    else:
-        dilation = [keras_layer.dilation_rate, keras_layer.dilation_rate]
-    params0 = {
-        "weight": etab.new_const(weight0),
-        "channels": in_channels * depth_mult,
-        "groups": in_channels,
-        "kernel_size": [kernel_h, kernel_w],
-        "strides": [stride_h, stride_w],
-        "dilation": dilation,
-        "padding": [0, 0],
-        "data_layout": data_layout,
-        "kernel_layout": kernel_layout,
-    }
-    if keras_layer.padding == "valid":
-        pass
-    # we insert a separate pad operator
-    elif keras_layer.padding == "same":
-        in_h = input_shape[1]
-        in_w = input_shape[2]
-        pad_t, pad_b = _get_pad_pair(in_h, kernel_h, stride_h)
-        pad_l, pad_r = _get_pad_pair(in_w, kernel_w, stride_w)
-        params0["padding"] = (pad_t, pad_l, pad_b, pad_r)
-    else:
-        msg = (
-            f"Padding with {keras_layer.padding} is not supported for operator Separable "
-            f"Convolution in frontend Keras."
-        )
-        raise tvm.error.OpAttributeUnImplemented(msg)
-    depthconv = _op.nn.conv2d(data=inexpr, **params0)
-    # pointwise conv
-    if kernel_layout == "OIHW":
-        weight1 = weightList[1].transpose([3, 2, 0, 1])
-    else:
-        weight1 = weightList[1]
-        kernel_layout = "HWIO"
-    params1 = {
-        "weight": etab.new_const(weight1),
-        "channels": weightList[1].shape[3],
-        "groups": 1,
-        "kernel_size": [1, 1],
-        "strides": [1, 1],
-        "dilation": [1, 1],
-        "data_layout": data_layout,
-        "kernel_layout": kernel_layout,
-    }
-    out = _op.nn.conv2d(data=depthconv, **params1)
-    if keras_layer.use_bias:
-        bias = etab.new_const(weightList[2])
-        if data_layout == "NCHW":
-            out = _op.nn.bias_add(out, bias)
-        else:
-            out = _op.nn.bias_add(out, bias, axis=-1)
-    # defuse activation
-    if sys.version_info.major < 3:
-        act_type = keras_layer.activation.func_name
-    else:
-        act_type = keras_layer.activation.__name__
-    if act_type != "linear":
-        out = _convert_activation(out, act_type, etab, data_layout)
-    return out
-
-
-def _convert_flatten(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-
-    # NCHW -> NHWC so that dense can be correctly converted
-    if data_layout == "NCHW":
-        inexpr = _op.transpose(inexpr, axes=[0, 2, 3, 1])
-    return _op.nn.batch_flatten(inexpr)
-
-
-def _convert_pooling(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-
-    pool_type = type(keras_layer).__name__
-    # global pool in keras = global pool + flatten in relay
-    global_pool_params = {"layout": data_layout}
-
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    if pool_type == "GlobalMaxPooling2D":
-        return _convert_flatten(
-            _op.nn.global_max_pool2d(inexpr, **global_pool_params), keras_layer, etab, data_layout
-        )
-    if pool_type == "GlobalAveragePooling2D":
-        global_avg_pool2d = _op.nn.global_avg_pool2d(inexpr, **global_pool_params)
-        keep_dims = len(keras_layer.input.shape) == len(keras_layer.output.shape)
-        if keep_dims:
-            return global_avg_pool2d
-        return _convert_flatten(global_avg_pool2d, keras_layer, etab, data_layout)
-    pool_h, pool_w = keras_layer.pool_size
-    stride_h, stride_w = keras_layer.strides
-    params = {
-        "pool_size": [pool_h, pool_w],
-        "strides": [stride_h, stride_w],
-        "padding": [0, 0],
-        "layout": data_layout,
-    }
-    if keras_layer.padding == "valid":
-        pass
-    elif keras_layer.padding == "same":
-        in_h = input_shape[1]
-        in_w = input_shape[2]
-        pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h)
-        pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w)
-        params["padding"] = [pad_t, pad_l, pad_b, pad_r]
-    else:
-        raise tvm.error.OpAttributeUnImplemented(
-            f"Padding with {keras_layer.padding} is not supported in operator Pooling."
-        )
-    if pool_type == "MaxPooling2D":
-        return _op.nn.max_pool2d(inexpr, **params)
-    if pool_type == "AveragePooling2D":
-        params["count_include_pad"] = False
-        return _op.nn.avg_pool2d(inexpr, **params)
-    raise tvm.error.OpNotImplemented(f"Operator {keras_layer} is not supported for frontend Keras.")
-
-
-def _convert_pooling3d(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    pool_type = type(keras_layer).__name__
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    if pool_type not in ["MaxPooling3D", "AveragePooling3D"]:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {keras_layer} is not supported for frontend Keras."
-        )
-
-    pool_d1, pool_d2, pool_d3 = keras_layer.pool_size
-    stride_d1, stride_d2, stride_d3 = keras_layer.strides
-    params = {
-        "pool_size": [pool_d1, pool_d2, pool_d3],
-        "strides": [stride_d1, stride_d2, stride_d3],
-        "padding": [0, 0, 0],
-        "layout": data_layout,
-    }
-
-    if keras_layer.padding == "valid":
-        pass
-    elif keras_layer.padding == "same":
-        in_d1 = input_shape[1]
-        in_d2 = input_shape[2]
-        in_d3 = input_shape[3]
-        pad_d1 = _get_pad_pair(in_d1, pool_d1, stride_d1)
-        pad_d2 = _get_pad_pair(in_d2, pool_d2, stride_d2)
-        pad_d3 = _get_pad_pair(in_d3, pool_d3, stride_d3)
-        params["padding"] = [pad_d1[0], pad_d2[0], pad_d3[0], pad_d1[1], pad_d2[1], pad_d3[1]]
-    else:
-        raise tvm.error.OpAttributeUnImplemented(
-            f"Padding with {keras_layer.padding} is not supported in operator Pooling3D."
-        )
-
-    out = _op.transpose(inexpr, axes=(0, 4, 1, 2, 3))
-    params["layout"] = "NCDHW"
-    if pool_type == "MaxPooling3D":
-        out = _op.nn.max_pool3d(out, **params)
-    elif pool_type == "AveragePooling3D":
-        out = _op.nn.avg_pool3d(out, **params)
-
-    return _op.transpose(out, axes=(0, 2, 3, 4, 1))
-
-
-def _convert_global_pooling3d(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    pool_type = type(keras_layer).__name__
-
-    global_pool_params = {"layout": data_layout}
-    if pool_type == "GlobalMaxPooling3D":
-        out = _op.nn.global_max_pool3d(inexpr, **global_pool_params)
-    elif pool_type == "GlobalAveragePooling3D":
-        out = _op.nn.global_avg_pool3d(inexpr, **global_pool_params)
-    else:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {keras_layer} is not supported for frontend Keras."
-        )
-
-    return _convert_flatten(out, keras_layer, etab, input_shape, data_layout)
-
-
-def _convert_upsample(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    upsample_type = type(keras_layer).__name__
-    params = {}
-    if upsample_type == "UpSampling1D":
-        h = keras_layer.size
-        params["scale_h"] = h
-    elif upsample_type == "UpSampling2D":
-        h, w = keras_layer.size
-        params["scale_h"] = h
-        params["scale_w"] = w
-
-        if hasattr(keras_layer, "interpolation"):
-            interpolation = keras_layer.interpolation
-            if interpolation == "nearest":
-                params["method"] = "nearest_neighbor"
-            else:
-                params["method"] = "bilinear"
-    else:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {upsample_type} is not supported for frontend Keras."
-        )
-    params["layout"] = data_layout
-    out = _op.nn.upsampling(inexpr, **params)
-    return out
-
-
-def _convert_upsample3d(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-
-    params = {}
-    d, h, w = keras_layer.size
-    params["scale_d"] = d
-    params["scale_h"] = h
-    params["scale_w"] = w
-    params["layout"] = data_layout
-    params["coordinate_transformation_mode"] = "asymmetric"
-    out = _op.nn.upsampling3d(inexpr, **params)
-    return out
-
-
-def _convert_cropping(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    crop_type = type(keras_layer).__name__
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-    if crop_type == "Cropping2D":
-        (_, in_h, in_w, _) = input_shape
-        ((crop_t, crop_b), (crop_l, crop_r)) = keras_layer.cropping
-    else:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {crop_type} is not supported for frontend Keras."
-        )
-    int32_max = np.iinfo(np.int32).max
-    if data_layout == "NHWC":
-        begin = [0, crop_t, crop_l, 0]
-        end = [int32_max, in_h - crop_b, in_w - crop_r, int32_max]
-    else:
-        begin = [0, 0, crop_t, crop_l]
-        end = [int32_max, int32_max, in_h - crop_b, in_w - crop_r]
-    return _op.strided_slice(
-        inexpr,
-        begin=begin,
-        end=end,
-    )
-
-
-def _convert_batchnorm(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-    if data_layout == "NCHW" or len(input_shape) < 4:
-        axis = 1
-    else:
-        axis = 3
-
-    params = {"scale": False, "center": False, "epsilon": keras_layer.epsilon, "axis": axis}
-    idx = 0
-    if keras_layer.scale:
-        params["scale"] = True
-        gamma = keras_layer.get_weights()[idx]
-        params["gamma"] = etab.new_const(gamma)
-        idx += 1
-    if keras_layer.center:
-        params["center"] = True
-        beta = keras_layer.get_weights()[idx]
-        params["beta"] = etab.new_const(beta)
-        idx += 1
-    moving_mean = keras_layer.get_weights()[idx]
-    moving_var = keras_layer.get_weights()[idx + 1]
-    params["moving_mean"] = etab.new_const(moving_mean)
-    params["moving_var"] = etab.new_const(moving_var)
-    # in case beta or gamma is not defined
-    params["beta"] = (
-        etab.new_const(np.zeros(moving_mean.shape)) if "beta" not in params else params["beta"]
-    )
-    params["gamma"] = (
-        etab.new_const(np.ones(moving_mean.shape)) if "gamma" not in params else params["gamma"]
-    )
-    result, moving_mean, moving_var = _op.nn.batch_norm(inexpr, **params)
-    return result
-
-
-def _convert_padding(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-
-    padding_type = type(keras_layer).__name__
-    padding = keras_layer.padding
-    top = left = bottom = right = 0
-    if padding_type == "ZeroPadding2D":
-        if isinstance(padding, int):
-            top = left = bottom = right = padding
-        elif isinstance(padding, tuple):
-            if isinstance(padding[0], int):
-                top, left = padding
-                bottom, right = padding
-            elif isinstance(padding[0], tuple):
-                top, bottom = padding[0]
-                left, right = padding[1]
-            else:
-                msg = (
-                    f'Value {str(padding)} in attribute "padding" of operator Padding is '
-                    f"not valid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-        else:
-            msg = f'Value {str(padding)} in attribute "padding" of operator Padding is not valid.'
-            raise tvm.error.OpAttributeInvalid(msg)
-    else:
-        msg = f"Operator {padding_type} is not supported in frontend Keras."
-        raise tvm.error.OpNotImplemented(msg)
-    if data_layout == "NCHW":
-        return _op.nn.pad(data=inexpr, pad_width=((0, 0), (0, 0), (top, bottom), (left, right)))
-    return _op.nn.pad(data=inexpr, pad_width=((0, 0), (top, bottom), (left, right), (0, 0)))
-
-
-def _convert_padding3d(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-
-    padding = keras_layer.padding
-
-    d_pad = h_pad = w_pad = [0, 0]
-
-    # padding can be 'int' or 'tuple of 3 ints' or 'tuple of 3 tuples of 2 ints' or 'tuple
-    # of 3 tuples of 2 ints different values'. In all these scenarios keras will send 3
-    # tuples of 2 ints.
-    if isinstance(padding, tuple) and isinstance(padding[0], tuple):
-        d_pad = padding[0]
-        h_pad = padding[1]
-        w_pad = padding[2]
-    else:
-        msg = f'Value {str(padding)} in attribute "padding" of operator ZeroPadding3D is not valid.'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    if data_layout == "NCDHW":
-        out = _op.nn.pad(
-            data=inexpr,
-            pad_width=(
-                (0, 0),
-                (0, 0),
-                (d_pad[0], d_pad[1]),
-                (h_pad[0], h_pad[1]),
-                (w_pad[0], w_pad[1]),
-            ),
-        )
-    else:
-        out = _op.nn.pad(
-            data=inexpr,
-            pad_width=(
-                (0, 0),
-                (d_pad[0], d_pad[1]),
-                (h_pad[0], h_pad[1]),
-                (w_pad[0], w_pad[1]),
-                (0, 0),
-            ),
-        )
-    return out
-
-
-def _convert_concat(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    axis = keras_layer.axis
-    dims = len(input_shape[0])
-    if data_layout == "NCHW":  # need_transpose
-        if axis == -1:
-            axis = 1
-        else:
-            axis = axis + 1 if axis < (dims - 1) else 1
-    return _op.concatenate(_as_list(inexpr), axis=axis)
-
-
-def _convert_reshape(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    inshape = input_shape  # includes batch
-    tshape = keras_layer.target_shape  # no batch
-    shape = (-1,) + tshape
-
-    if data_layout == "NCHW" and (len(inshape) > 3 or len(tshape) > 2):
-        # Perform reshape in original NHWC format.
-        inexpr = _op.transpose(inexpr, [0] + list(range(2, len(inshape))) + [1])
-        inexpr = _op.reshape(inexpr, newshape=shape)
-        return _op.transpose(inexpr, axes=[0, -1] + list(range(1, len(shape) - 1)))
-
-    return _op.reshape(inexpr, newshape=shape)
-
-
-def _convert_lstm(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-    if not isinstance(inexpr, list):
-        buf = np.zeros((1, keras_layer.units), "float32")
-        c_op = etab.new_const(buf)
-        h_op = etab.new_const(buf)
-        inexpr = [inexpr, h_op, c_op]
-    in_data = inexpr[0]
-    next_h = inexpr[1]
-    next_c = inexpr[2]
-    weightList = keras_layer.get_weights()
-    in_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0])
-    kernel_weight = etab.new_const(weightList[0].transpose([1, 0]))
-    recurrent_weight = etab.new_const(weightList[1].transpose([1, 0]))
-    if keras_layer.use_bias:
-        in_bias = etab.new_const(weightList[2])
-    if keras_layer.go_backwards:
-        in_data = _op.reverse(in_data, axis=1)
-    units = list(weightList[0].shape)[1]
-    assert units > 0, "The value of units must be a positive integer"
-    time_steps = in_shape[1]
-    in_data = _op.squeeze(in_data, axis=[0])
-    in_data = _op.split(in_data, indices_or_sections=time_steps, axis=0)
-    # loop for the number of time_steps
-    out_list = []  # store h outputs in case return_sequences is True
-    for data in in_data:
-        ixh1 = _op.nn.dense(data, kernel_weight, units=units)
-        ixh2 = _op.nn.dense(next_h, recurrent_weight, units=units)
-        if keras_layer.use_bias:
-            ixh2 = _op.nn.bias_add(ixh2, bias=in_bias)
-        gate = ixh1 + ixh2
-        gates = _op.split(gate, indices_or_sections=4, axis=1)
-        in_gate = _convert_recurrent_activation(gates[0], keras_layer)
-        in_transform = _convert_recurrent_activation(gates[1], keras_layer)
-        next_c = in_transform * next_c + in_gate * _convert_activation(
-            gates[2], keras_layer, etab, data_layout
-        )
-        out_gate = _convert_recurrent_activation(gates[3], keras_layer)
-        next_h = out_gate * _convert_activation(next_c, keras_layer, etab, data_layout)
-        if keras_layer.return_sequences:
-            out_list.append(_op.expand_dims(next_h, axis=1))
-    out = _op.concatenate(out_list, axis=1) if keras_layer.return_sequences else next_h
-    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
-    out = _op.reshape(out, newshape=out_shape)
-    return [out, next_h, next_c]
-
-
-def _convert_simple_rnn(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    if not isinstance(inexpr, list):
-        buf = np.zeros((1, keras_layer.units), "float32")
-        prev_op = etab.new_const(buf)
-        inexpr = [inexpr, prev_op]
-    in_data = inexpr[0]
-    prev_op = inexpr[1]
-    prev_op = _op.nn.batch_flatten(prev_op)
-    weightList = keras_layer.get_weights()
-    kernel_weight = etab.new_const(weightList[0].transpose([1, 0]))
-    recurrent_weight = etab.new_const(weightList[1].transpose([1, 0]))
-    units = list(weightList[0].shape)[1]
-    assert units > 0, "The value of units must be a positive integer"
-    if keras_layer.use_bias:
-        in_bias = etab.new_const(weightList[2])
-    assert len(in_data.type_annotation.shape) == 3
-    timeDim = in_data.type_annotation.shape[1].value
-    if keras_layer.go_backwards:
-        in_data = _op.reverse(in_data, axis=1)
-    in_data_split = _op.split(in_data, indices_or_sections=timeDim, axis=1)
-    for i in range(len(in_data_split)):
-        in_data_split_i = _op.nn.batch_flatten(in_data_split[i])
-        ixh = _op.nn.dense(in_data_split_i, kernel_weight, units=units)
-        if keras_layer.use_bias:
-            ixh = _op.nn.bias_add(ixh, bias=in_bias)
-        ixh2 = _op.nn.dense(prev_op, recurrent_weight, units=units)
-        output = ixh + ixh2
-        output = _convert_activation(output, keras_layer, etab, data_layout)
-        prev_op = output
-    return [output, output]
-
-
-def _convert_gru(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    _check_data_format(keras_layer)
-    if not isinstance(inexpr, list):
-        buf = np.zeros((1, keras_layer.units), "float32")
-        h_tm1 = etab.new_const(buf)
-        inexpr = [inexpr, h_tm1]
-    in_data = inexpr[0]
-    h_tm1_op = inexpr[1]
-    weightList = keras_layer.get_weights()
-    kernel_weight = etab.new_const(weightList[0].transpose([1, 0]))
-    recurrent_weight = etab.new_const(weightList[1].transpose([1, 0]))
-    if keras_layer.use_bias:
-        in_bias = etab.new_const(weightList[2])
-    if keras_layer.go_backwards:
-        in_data = _op.reverse(in_data, axis=1)
-    units = list(weightList[0].shape)[1]
-    assert units > 0, "The value of units must be a positive integer"
-    in_data = _op.nn.batch_flatten(in_data)
-    matrix_x = _op.nn.dense(in_data, kernel_weight, units=units)
-    if keras_layer.use_bias:
-        matrix_x = _op.nn.bias_add(matrix_x, in_bias)
-    # inputs projected by all gate matrices at once
-    split_indices = [keras_layer.units, 2 * keras_layer.units]
-    gates = _op.split(matrix_x, indices_or_sections=split_indices, axis=1)
-    x_z = gates[0]
-    x_r = gates[1]
-    x_h = gates[2]
-    # hidden state projected separately for update/reset and new
-    units = 2 * keras_layer.units
-    split_indices = [units]
-    rec_weights = _op.split(recurrent_weight, indices_or_sections=split_indices, axis=0)
-    h_tm1_op = _op.nn.batch_flatten(h_tm1_op)
-    matrix_inner = _op.nn.dense(h_tm1_op, rec_weights[0], units=units)
-    split_indices = [keras_layer.units]
-    recurrent = _op.split(matrix_inner, indices_or_sections=split_indices, axis=1)
-    recurrent_z = recurrent[0]
-    recurrent_r = recurrent[1]
-    rec_act_z = _convert_recurrent_activation(x_z + recurrent_z, keras_layer)
-    rec_act_r = _convert_recurrent_activation(x_r + recurrent_r, keras_layer)
-    units = keras_layer.units
-    recurrent_h = _op.nn.dense(rec_act_r * h_tm1_op, rec_weights[1], units=units)
-    act_hh = _convert_activation(x_h + recurrent_h, keras_layer, etab, data_layout)
-    # previous and candidate state mixed by update gate
-    output = rec_act_z * h_tm1_op + (_expr.const(1.0, dtype="float32") - rec_act_z) * act_hh
-    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
-    output = _op.reshape(output, newshape=out_shape)
-    return [output, output]
-
-
-def _convert_repeat_vector(
-    inexpr, keras_layer, etab, data_layout, input_shape=None
-):  # pylint: disable=unused-argument
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-    input_shape = list(input_shape)
-    repeats = keras_layer.n
-    out_shape = [-1, repeats] + input_shape[1:]
-    out = _op.repeat(inexpr, repeats=repeats, axis=0)
-    out = _op.reshape(out, out_shape)
-    return out
-
-
-def _convert_l2_normalize(inexpr, keras_layer, data_layout):
-    l2_normalize_is_loaded = False
-    param_list = []
-    for i in dis.get_instructions(keras_layer.function):
-        if i.opname in ["LOAD_GLOBAL", "LOAD_DEREF"]:
-            continue
-        if i.opname in ["LOAD_ATTR", "LOAD_METHOD"]:
-            if i.argval == "l2_normalize":
-                assert not l2_normalize_is_loaded, "l2_normalize was already LOADED"
-                l2_normalize_is_loaded = True
-        elif i.opname in ["LOAD_CONST", "LOAD_FAST"] and l2_normalize_is_loaded:
-            param_list.append(i.argval)
-        elif i.opname == "BUILD_LIST":
-            sz = i.argval
-            assert len(param_list) >= sz
-            new_list = param_list[-sz:]
-            param_list = param_list[:-sz]
-            param_list.append(new_list)
-        elif i.opname in ["CALL_FUNCTION_KW", "CALL_METHOD"]:
-            break
-
-    axis = None
-    is_param_list_parsed = False
-    if l2_normalize_is_loaded and len(param_list) > 0:
-        # last param_list item is tuple of strings means that
-        # lambda uses named parameters when calling l2_normalize
-        if (
-            isinstance(param_list[-1], tuple)
-            and len(param_list[-1]) > 0
-            and isinstance(param_list[-1][0], str)
-        ):
-            param_names = param_list[-1]
-            if len(param_names) == 1 and param_names[0] == "x":
-                # lambda v: K.l2_normalize(x=v)
-                axis = None
-                is_param_list_parsed = True
-            elif len(param_names) == 1 and param_names[0] == "axis" and len(param_list) == 3:
-                # lambda x: K.l2_normalize(x, axis=(2,3))
-                axis = param_list[1]
-                is_param_list_parsed = True
-            elif len(param_names) == 2 and len(param_list) == 3:
-                # lambda x: K.l2_normalize(x=x, axis=(2,3))
-                # lambda x: K.l2_normalize(axis=(2,3), x=x)
-                axis = param_list[param_names.index("axis")]
-                is_param_list_parsed = True
-        else:
-            # lambda x: K.l2_normalize(x)
-            if len(param_list) == 1:
-                axis = None
-                is_param_list_parsed = True
-            # lambda x: K.l2_normalize(x, (2,3))
-            elif len(param_list) == 2:
-                axis = param_list[1]
-                is_param_list_parsed = True
-
-    def is_int_or_tuple_of_ints(v):
-        if isinstance(v, list) and len(v) > 0:
-            for i in v:
-                if not isinstance(i, int):
-                    return False
-            return True
-        if isinstance(v, tuple) and len(v) > 0:
-            return isinstance(v[0], int)
-        return isinstance(v, int)
-
-    assert is_param_list_parsed and (
-        axis is None or is_int_or_tuple_of_ints(axis)
-    ), "Can not parse l2_normalize lambda function found in Lambda layer"
-    if isinstance(axis, int):
-        axis = [axis]
-
-    if data_layout == "NCHW":
-        dims = len(keras_layer.input_shape)
-
-        def fix_axis_for_nchw(axis):
-            if axis == 0:
-                return 0
-            if axis in [(dims - 1), -1]:
-                return 1
-            return axis + 1
-
-        axis = [fix_axis_for_nchw(x) for x in axis]
-    return _op.nn.l2_normalize(inexpr, eps=1e-12, axis=axis)
-
-
-def _convert_lambda(inexpr, keras_layer, _, data_layout):
-    fcode = keras_layer.function.__code__
-    # Convert l2_normalize
-    if (
-        fcode.co_name == "<lambda>"
-        and len(fcode.co_names) > 0
-        and fcode.co_names[-1] == "l2_normalize"
-    ):
-        return _convert_l2_normalize(inexpr, keras_layer, data_layout)
-    raise tvm.error.OpNotImplemented(
-        f"Function {fcode.co_names} used in Lambda layer is not supported in frontend Keras."
-    )
-
-
-def _convert_time_distributed(inexpr, keras_layer, etab, data_layout, input_shape=None):
-    # TimeDistributed: split input tensor along the second dimension (assumed to be time),
-    # apply inner layer to each split individually,
-    # and then combine the results
-    if input_shape is None:
-        input_shape = keras_layer.input_shape
-
-    assert len(input_shape) >= 2, "Input to TimeDistributed must have at least two dimensions"
-
-    inner_layer = keras_layer.layer
-    inner_input_shape = [d for (i, d) in enumerate(input_shape) if i != 1]
-
-    # for NDHWC, inner data layout will drop the D
-    inner_data_layout = data_layout
-    if data_layout == "NDHWC":
-        inner_data_layout = "NHWC"
-
-    # some code duplication from keras_op_to_relay
-    # but it's useful to avoid cluttering the etab
-    inner_layer_op_name = type(keras_layer.layer).__name__
-    if inner_layer_op_name not in _convert_map:
-        raise tvm.error.OpNotImplemented(
-            f"The inner layer for TimeDistributed {inner_layer_op_name} is not supported for"
-            f" frontend Keras."
-        )
-
-    conversion_func = lambda expr: _convert_map[inner_layer_op_name](
-        expr, inner_layer, etab, inner_data_layout, input_shape=inner_input_shape
-    )
-
-    split_dim = input_shape[1]
-    split_input = _op.split(inexpr, split_dim, 1)
-
-    split_shape = list(input_shape)
-    if split_shape[0] is None:
-        split_shape[0] = 1
-    split_shape[1] = 1
-
-    split_var = new_var(
-        "time_distributed_split",
-        type_annotation=TupleType(
-            [TensorType(split_shape, dtype="float32") for i in range(split_dim)]
-        ),
-    )
-
-    # For each split, squeeze away the second dimension,
-    # apply the inner layer.
-    # Afterwards, combine the transformed splits back along
-    # the second dimension using stack
-    splits = [
-        conversion_func(_op.squeeze(_expr.TupleGetItem(split_var, i), axis=[1]))
-        for i in range(split_dim)
-    ]
-
-    return _expr.Let(split_var, split_input.astuple(), _op.stack(splits, axis=1))
-
-
-def _default_skip(inexpr, keras_layer, etab, data_layout):  # pylint: disable=unused-argument
-    """Layers that can be skipped because they are train time only."""
-    return inexpr
-
-
-_convert_map = {
-    "Dense": _convert_dense,
-    "Activation": _convert_activation,
-    "Softmax": _convert_advanced_activation,
-    "ReLU": _convert_advanced_activation,
-    "LeakyReLU": _convert_advanced_activation,
-    "PReLU": _convert_advanced_activation,
-    "ELU": _convert_advanced_activation,
-    "ThresholdedReLU": _convert_advanced_activation,
-    "AveragePooling2D": _convert_pooling,
-    "MaxPooling2D": _convert_pooling,
-    "GlobalAveragePooling2D": _convert_pooling,
-    "GlobalMaxPooling2D": _convert_pooling,
-    "Conv2D": _convert_convolution,
-    "Conv2DTranspose": _convert_convolution,
-    "DepthwiseConv2D": _convert_convolution,
-    "SeparableConv2D": _convert_separable_convolution,
-    "Flatten": _convert_flatten,
-    "Reshape": _convert_reshape,
-    "Concatenate": _convert_concat,
-    "BatchNormalization": _convert_batchnorm,
-    # Specific tf.Keras terminology for batch normalization
-    "BatchNormalizationV1": _convert_batchnorm,
-    "Add": _convert_merge,
-    "Subtract": _convert_merge,
-    "Multiply": _convert_merge,
-    "ZeroPadding2D": _convert_padding,
-    "UpSampling2D": _convert_upsample,
-    "Cropping2D": _convert_cropping,
-    # 'ZeroPadding1D'          : _convert_padding,
-    # 'AveragePooling1D'       : _convert_pooling,
-    # 'MaxPooling1D'           : _convert_pooling,
-    # 'GlobalAveragePooling1D' : _convert_pooling,
-    # 'GlobalMaxPooling1D'     : _convert_pooling,
-    # 'Cropping1D'             : _convert_cropping,
-    # 'UpSampling1D'           : _convert_upsample,
-    "Conv1D": _convert_convolution1d,
-    # "Conv1DTranspose": _convert_convolution1d,
-    "Conv3D": _convert_convolution3d,
-    "Conv3DTranspose": _convert_convolution3d,
-    # 'SeparableConv3D'        : _convert_convolution3d,
-    "MaxPooling3D": _convert_pooling3d,
-    "AveragePooling3D": _convert_pooling3d,
-    "GlobalMaxPooling3D": _convert_global_pooling3d,
-    "GlobalAveragePooling3D": _convert_global_pooling3d,
-    "UpSampling3D": _convert_upsample3d,
-    "ZeroPadding3D": _convert_padding3d,
-    "SimpleRNN": _convert_simple_rnn,
-    "LSTM": _convert_lstm,
-    "GRU": _convert_gru,
-    # 'Bidirectional'          : _convert_bidirectional,
-    "TimeDistributed": _convert_time_distributed,
-    "Average": _convert_merge,
-    "Minimum": _convert_merge,
-    "Maximum": _convert_merge,
-    "Dot": _convert_merge,
-    "Permute": _convert_permute,
-    "Embedding": _convert_embedding,
-    "RepeatVector": _convert_repeat_vector,
-    "Lambda": _convert_lambda,
-    "InputLayer": _default_skip,
-    "Dropout": _default_skip,
-    "AlphaDropout": _default_skip,
-    "SpatialDropout2D": _default_skip,
-    "SpatialDropout1D": _default_skip,
-    "GaussianDropout": _default_skip,
-    "GaussianNoise": _default_skip,
-}
-
-
-def _check_unsupported_layers(model):
-    missing_ops = set()
-    for layer in model.layers:
-        op_name = type(layer).__name__
-        if op_name not in _convert_map:
-            missing_ops.add(op_name)
-
-    if missing_ops:
-        raise NotImplementedError(f"The following operators are not implemented: {missing_ops}")
-
-
-def keras_op_to_relay(inexpr, keras_layer, outname, etab, data_layout):
-    """Convert a Keras layer to a Relay expression and update the expression table.
-
-    Parameters
-    ----------
-    inexpr : relay.expr.Expr or a list of it
-        The input Relay expression(s).
-
-    keras_layer : keras.layers
-        The Keras layer to be converted.
-
-    outname : str
-        Name of the output Relay expression.
-
-    etab : relay.frontend.common.ExprTable
-        The global expression table to be updated.
-
-    data_layout : str
-        The input data layout
-    """
-    op_name = type(keras_layer).__name__
-    if op_name not in _convert_map:
-        raise tvm.error.OpNotImplemented(f"Operator {op_name} is not supported for frontend Keras.")
-    outs = _convert_map[op_name](inexpr, keras_layer, etab, data_layout)
-    outs = _as_list(outs)
-    for t_idx, out in enumerate(outs):
-        name = outname + ":" + str(t_idx)
-        etab.set_expr(name, out)
-    return outs
-
-
-def from_keras(model, shape=None, layout="NCHW"):
-    """Convert keras model to relay Function.
-
-    Parameters
-    ----------
-    model : keras.engine.training.Model or tensorflow.keras.models.Model
-        The keras model to be converted.
-
-    shape: dict of str to int list/tuple
-        Input shapes of the model, optional
-
-    layout: str
-        One of 'NWC', 'NCHW', 'NHWC', 'NDHWC' indicates how data should
-        be arranged in the output model. Default layout is 'NCHW' as it
-        in general performs better across TVM.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation.
-
-    params : dict of str to tvm.nd.NDArray
-        The parameter dict to be used by Relay.
-    """
-
-    def _check_model_is_tf_keras():
-        return type(model).__module__.startswith("tensorflow.python.keras")
-
-    def _convert_input_layer(keras_layer):
-        input_name = keras_layer.name
-        input_shape = shape[input_name] if shape is not None and input_name in shape else None
-        if input_shape and len(input_shape) > 1 and any(dim <= 0 for dim in input_shape[1:]):
-            msg = (
-                "Expected input's non-batch dimensions to have positive length, "
-                f"but the input has a shape of {input_shape}"
-            )
-            raise ValueError(msg)
-        etab.set_expr(input_name, new_var(input_name, shape=input_shape))
-
-    def _convert_layer(keras_layer, etab, scope=""):
-        inbound_nodes = (
-            keras_layer.inbound_nodes
-            if hasattr(keras_layer, "inbound_nodes")
-            else keras_layer._inbound_nodes
-            if hasattr(keras_layer, "_inbound_nodes")
-            else None
-        )
-        if inbound_nodes is None:
-            raise TypeError(f"Unknown layer type or unsupported Keras version : {keras_layer}")
-        outs = []
-        for node_idx, node in enumerate(inbound_nodes):
-            # If some nodes in imported model are not relevant to the current model,
-            # skip such layers.
-            # - In Keras, model._network_nodes contains keys of all nodes relevant to the
-            #   current model;
-            # - In tf.Keras, this is already done as part of tensorflow.keras.network.get_config
-            if not is_tf_keras:
-                if (
-                    hasattr(model, "_node_key")
-                    and not model._node_key(keras_layer, node_idx) in model._network_nodes
-                ):
-                    continue
-            inexpr = []
-            # Since Keras allows creating multiple layers from the same name instance,
-            # we append node index to the expr name to make it unique.
-            # The one exception is InputLayer. Changing input variable names after conversion
-            # would confuse users, so we should keep them as far as possible. Fortunately,
-            # they are named uniquely to input_1, input_2, input_3... by default.
-            # node_indices attribute removed in tensorflow 2.3, however iterate_inbound() can
-            # be used
-            if hasattr(node, "node_indices"):
-                zip_node = zip(
-                    _as_list(node.inbound_layers),
-                    _as_list(node.node_indices),
-                    _as_list(node.tensor_indices),
-                    _as_list(node.input_tensors),
-                )
-                node_attributes = zip_node
-            else:
-                node_attributes = node.iterate_inbound()
-
-            for inbound_layer, n_idx, t_idx, _ in node_attributes:
-                if isinstance(inbound_layer, input_layer_class):
-                    expr_name = inbound_layer.name
-                    _convert_input_layer(inbound_layer)
-                else:
-                    expr_name = scope + inbound_layer.name + ":" + str(n_idx) + ":" + str(t_idx)
-                expr = etab.get_expr(expr_name)
-                inexpr.append(expr)
-
-            # Handle nested layers
-            if hasattr(keras_layer, "layers"):
-                input_index = 0
-                for layer in keras_layer.layers:
-                    if isinstance(layer, input_layer_class):
-                        # Replace input layer with inbound node
-                        etab.set_expr(layer.name, inexpr[input_index])
-                        input_index += 1
-                    else:
-                        # Convert child layer. Prepend scope with parent layer name.
-                        layer_outs = _convert_layer(layer, etab, keras_layer.name + "_" + scope)
-
-                # Get output of last child layer and mark as output of parent.
-                outname = keras_layer.name + ":" + str(node_idx)
-                for t_idx, out in enumerate(layer_outs):
-                    name = outname + ":" + str(t_idx)
-                    etab.set_expr(name, out)
-                outs.extend(layer_outs)
-            else:
-                if len(inexpr) == 1:
-                    inexpr = inexpr[0]
-                outs.extend(
-                    keras_op_to_relay(
-                        inexpr,
-                        keras_layer,
-                        scope + keras_layer.name + ":" + str(node_idx),
-                        etab,
-                        layout,
-                    )
-                )
-        return outs
-
-    is_tf_keras = _check_model_is_tf_keras()
-
-    if not is_tf_keras:
-        # Importing from Keras
-        try:
-            import keras
-        except ImportError:
-            raise ImportError("Keras must be installed")
-        if keras.backend.backend() != "tensorflow":
-            raise ValueError("Keras frontend currently supports tensorflow backend only.")
-        if keras.backend.image_data_format() != "channels_last":
-            raise ValueError("Keras frontend currently supports data_format = channels_last only.")
-        try:
-            import keras.engine as E
-        except ImportError:
-            try:
-                import keras.src.engine as E
-            except ImportError:
-                raise ImportError("Cannot find Keras's engine")
-        expected_model_class = E.training.Model
-        if hasattr(E, "InputLayer"):
-            input_layer_class = E.InputLayer
-        else:
-            # TFlite >=2.6
-            input_layer_class = E.input_layer.InputLayer
-    else:
-        # Importing from Tensorflow Keras (tf.keras)
-        try:
-            from tensorflow import keras as tf_keras
-        except ImportError:
-            raise ImportError("Tensorflow must be installed")
-        expected_model_class = tf_keras.models.Model
-        input_layer_class = tf_keras.layers.InputLayer
-
-    assert isinstance(model, expected_model_class)
-
-    etab = ExprTable()
-    # Set global data format.
-    assert layout in [
-        "NWC",
-        "NCHW",
-        "NHWC",
-        "NDHWC",
-    ], "Layout must be one of 'NWC', 'NCHW', NHWC or NDHWC"
-    for keras_layer in model.layers:
-        if isinstance(keras_layer, input_layer_class):
-            _convert_input_layer(keras_layer)
-        else:
-            _convert_layer(keras_layer, etab)
-
-    # model._output_coordinates contains out_node(oc[0]), node_index(oc[1]) and tensor_index(oc[2])
-    # Get all output nodes in etab using the name made from above values.
-    # The out exprs were added to etab in keras_op_to_relay using this name.
-    outexpr = [
-        etab.get_expr(oc[0].name + ":" + str(oc[1]) + ":" + str(oc[2]))
-        for oc in model._output_coordinates
-    ]
-    outexpr = outexpr[0] if len(outexpr) == 1 else _expr.Tuple(outexpr)
-    func = _function.Function(analysis.free_vars(outexpr), outexpr)
-    params = {k: _nd.array(np.array(v, dtype=np.float32)) for k, v in etab.params.items()}
-    return IRModule.from_expr(func), params
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
deleted file mode 100644
index c3e14c9b72f8..000000000000
--- a/python/tvm/relay/frontend/mxnet.py
+++ /dev/null
@@ -1,2963 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, no-else-return, too-many-lines
-# pylint: disable=use-list-literal
-"""MXNet symbol frontend."""
-import json
-import math
-
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.ir import IRModule
-from tvm.topi.utils import get_const_tuple
-
-from ... import nd as _nd
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from .. import scope_builder as _scope_builder
-from .common import StrAttrsDict
-from .common import get_name as _get_name
-from .common import infer_shape as _infer_shape
-from .common import infer_type as _infer_type
-from .common import infer_value as _infer_value
-from .mxnet_qnn_op_utils import (
-    dequantize_mxnet_min_max,
-    get_conv_mkldnn_requantized_scale_outDtype,
-    get_mkldnn_int8_scale,
-    get_mkldnn_requantize_scale_outDtype,
-    get_mkldnn_uint8_scale,
-    quantize_conv_bias_mkldnn_from_var,
-    quantize_conv_weights_bias_channel_mkldnn_from_var,
-    quantize_mxnet_min_max,
-)
-from .nnvm_common import (
-    _arg_reduce,
-    _binop_scalar,
-    _cast,
-    _clip,
-    _elemwise_sum,
-    _init_op,
-    _rbinop_scalar,
-    _reduce,
-    _rename,
-    _reshape,
-    _softmax_op,
-    _transpose,
-    _upsampling,
-    _warn_not_used,
-)
-
-__all__ = ["from_mxnet"]
-
-_activation_map = {"sigmoid": _op.sigmoid, "tanh": _op.tanh, "relu": _op.nn.relu}
-
-
-def _mx_fully_connected(inputs, attrs):
-    import mxnet as mx  # pylint: disable=import-outside-toplevel
-
-    units = attrs.get_int("num_hidden")
-    use_bias = not attrs.get_bool("no_bias", False)
-    try:
-        _ = mx.sym.FullyConnected(mx.sym.var("x"), num_hidden=1, flatten=True)
-        has_flatten = True
-    except mx.base.MXNetError:
-        # no flatten attribute in old mxnet
-        has_flatten = False
-    use_flatten = attrs.get_bool("flatten", True)
-    if has_flatten and use_flatten:
-        inputs[0] = _op.nn.batch_flatten(inputs[0])
-    data_shape = _infer_type(inputs[0]).checked_type.shape
-    if len(data_shape) > 2:
-        inputs[0] = _op.reverse_reshape(inputs[0], [-1, 0])
-    res = _op.nn.dense(inputs[0], inputs[1], units=units)
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=-1)
-    if len(data_shape) > 2:
-        new_shape = data_shape[:-1]
-        new_shape.append(units)
-        res = _op.reshape(res, new_shape)
-    return res
-
-
-def _get_channel_axis(layout, op_name):
-    if layout in ["NCHW", "NCDHW"]:
-        return 1
-    if layout == "NHWC":
-        return 3
-    if layout == "NDHWC":
-        return 4
-    raise tvm.error.OpAttributeInvalid(
-        f'Value {padding} in attribute "layout" of operator {op_name} is not valid.'
-    )
-
-
-def _mx_activations(inputs, attrs):
-    act_type = attrs.get_str("act_type")
-    assert len(inputs) == 1
-    if act_type == "softrelu":
-
-        def _stable_softrelu(x):
-            # log(1 + exp(-abs(x))) + relu(x)
-            one = _expr.const(1, dtype="float32")
-            exp_neg_abs_x = _op.exp(_op.negative(_op.abs(x)))
-            return _op.add(_op.log(_op.add(one, exp_neg_abs_x)), _op.nn.relu(x))
-
-        return _stable_softrelu(inputs[0])
-    if act_type not in _activation_map:
-        raise tvm.error.OpNotImplemented(
-            f"Operator {act_type} is not supported for frontend MXNet."
-        )
-    return _activation_map[act_type](inputs[0])
-
-
-def _mx_compare(new_op, wrapper):
-    def impl(inputs, attrs):
-        expr = _infer_type(inputs[0])
-        dtype = expr.checked_type.dtype
-        return wrapper(new_op)(inputs, attrs).astype(dtype)
-
-    return impl
-
-
-def _mx_unravel_index(inputs, attrs):
-    assert len(inputs) == 1
-    shape = attrs.get_int_tuple("shape")
-    shape_expr = _expr.const(list(shape))
-    return _op.unravel_index(inputs[0], shape_expr)
-
-
-def _mx_swap_axis(inputs, attrs):
-    assert len(inputs) == 1
-    dim1 = attrs.get_int("dim1")
-    dim2 = attrs.get_int("dim2")
-    shape = _infer_type(inputs[0]).checked_type.shape
-    axes = list(range(len(shape)))
-    axes[dim1] = dim2
-    axes[dim2] = dim1
-    return _op.transpose(inputs[0], axes=axes)
-
-
-def _mx_zeros(inputs, attrs):
-    assert len(inputs) == 0
-    shape = attrs.get_int_tuple("shape")
-    dtype = attrs.get_str("dtype", "float32")
-    if 0 in shape:
-        return None
-    return _op.zeros(shape=shape, dtype=dtype)
-
-
-def _mx_conv(inputs, attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    if len(kernel_size) == 3:
-        return _mx_conv3d(inputs, attrs)
-    elif len(kernel_size) == 2:
-        return _mx_conv2d(inputs, attrs)
-    elif len(kernel_size) == 1:
-        return _mx_conv1d(inputs, attrs)
-    else:
-        raise tvm.error.OpAttributeInvalid(
-            "1D, 2D or 3D kernels only are supported for operator Convolution"
-        )
-
-
-def _mx_conv1d(inputs, attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    if len(kernel_size) != 1:
-        raise tvm.error.OpAttributeInvalid(
-            "Non 1D or 2D kernels are not supported for operator Convolution"
-        )
-    data_layout = attrs.get_str("layout", "NCW")
-    # MXNet Conv1D only supports ‘NCW’ layout for now.
-    if data_layout != "NCW":
-        raise tvm.error.OpAttributeInvalid('Only "NCW" data layout is supported for 1D Convolution')
-    data_layout = "NCHW"
-    channel_axis = 1
-    kernel_layout = "OIHW"
-
-    new_attrs = {}
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["kernel_size"] = (1,) + kernel_size
-    new_attrs["strides"] = (1,) + attrs.get_int_tuple("stride", (1,))
-    new_attrs["padding"] = (0,) + attrs.get_int_tuple("pad", (0,))
-    new_attrs["dilation"] = (1,) + attrs.get_int_tuple("dilate", (1,))
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    new_attrs["data_layout"] = data_layout
-    new_attrs["kernel_layout"] = kernel_layout
-    use_bias = not attrs.get_bool("no_bias", False)
-    data = _op.expand_dims(inputs[0], axis=2)
-    kernel = _op.expand_dims(inputs[1], axis=2)
-    res = _op.nn.conv2d(data, kernel, **new_attrs)
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
-    res = _op.squeeze(res, axis=[2])
-    return res
-
-
-def _get_mx_conv2d_attrs(attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    data_layout = attrs.get_str("layout", "NCHW")
-    if "kernel_layout" in attrs.attrs:
-        kernel_layout = attrs.get_str("kernel_layout")
-    else:
-        kernel_layout = "HWIO" if data_layout == "NHWC" else "OIHW"
-    new_attrs = {}
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["kernel_size"] = kernel_size
-    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
-    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
-    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1))
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    new_attrs["data_layout"] = data_layout
-    new_attrs["kernel_layout"] = kernel_layout
-    return new_attrs
-
-
-def _mx_conv2d(inputs, attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    data_layout = attrs.get_str("layout", "NCHW")
-    if len(kernel_size) != 2:
-        raise tvm.error.OpAttributeInvalid("Only 2D kernels are supported for operator Convolution")
-
-    new_attrs = _get_mx_conv2d_attrs(attrs)
-    channel_axis = _get_channel_axis(data_layout, "conv2d")
-    use_bias = not attrs.get_bool("no_bias", False)
-    res = _op.nn.conv2d(inputs[0], inputs[1], **new_attrs)
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
-    return res
-
-
-def _get_mx_conv3d_attrs(attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    data_layout = attrs.get_str("layout", "NCDHW")
-    if "kernel_layout" in attrs.attrs:
-        kernel_layout = attrs.get_str("kernel_layout")
-    else:
-        kernel_layout = "DHWIO" if data_layout == "NDHWC" else "OIDHW"
-    new_attrs = {}
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["kernel_size"] = kernel_size
-    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1, 1))
-    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0, 0))
-    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1, 1))
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    new_attrs["data_layout"] = data_layout
-    new_attrs["kernel_layout"] = kernel_layout
-    return new_attrs
-
-
-def _mx_conv3d(inputs, attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    data_layout = attrs.get_str("layout", "NCDHW")
-    if len(kernel_size) != 3:
-        raise tvm.error.OpAttributeInvalid("Only 3D kernels are supported for operator Convolution")
-
-    new_attrs = _get_mx_conv3d_attrs(attrs)
-    channel_axis = _get_channel_axis(data_layout, "conv3d")
-    use_bias = not attrs.get_bool("no_bias", False)
-    res = _op.nn.conv3d(inputs[0], inputs[1], **new_attrs)
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
-    return res
-
-
-def _mx_conv_transpose(inputs, attrs):
-    kernel_size = attrs.get_int_tuple("kernel")
-    if len(kernel_size) == 3:
-        return _mx_conv3d_transpose(inputs, attrs)
-    elif len(kernel_size) == 2:
-        return _mx_conv2d_transpose(inputs, attrs)
-    elif len(kernel_size) == 1:
-        return _mx_conv1d_transpose(inputs, attrs)
-    else:
-        raise tvm.error.OpAttributeInvalid(
-            "1D, 2D or 3D kernels only are supported for operator Convolution"
-        )
-
-
-def _mx_conv1d_transpose(inputs, attrs):
-    if "target_shape" in attrs.attrs:
-        raise tvm.error.OpAttributeUnImplemented(
-            'Attribute "target_shape" is not supported for operator Conv2D-transpose.'
-        )
-    data_layout = attrs.get_str("layout", "NCW")
-    if data_layout != "NCW":
-        raise tvm.error.OpAttributeInvalid('Only "NCW" data layout is supported for 1D Convolution')
-    channel_axis = 1
-    kernel_layout = "IOW"
-    new_attrs = {}
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["kernel_size"] = attrs.get_int_tuple("kernel")
-    new_attrs["strides"] = attrs.get_int_tuple("stride", (1,))
-    new_attrs["output_padding"] = attrs.get_int_tuple("adj", (0,))
-    new_attrs["padding"] = attrs.get_int_tuple("pad", (0,))
-    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1,))
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    new_attrs["data_layout"] = data_layout
-    new_attrs["kernel_layout"] = kernel_layout
-    use_bias = not attrs.get_bool("no_bias", True)
-    res = _op.nn.conv1d_transpose(inputs[0], inputs[1], **new_attrs)
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
-    return res
-
-
-def _mx_conv2d_transpose(inputs, attrs):
-    if "target_shape" in attrs.attrs:
-        raise tvm.error.OpAttributeUnImplemented(
-            'Attribute "target_shape" is not supported for operator Conv2D-transpose.'
-        )
-    kernel_size = attrs.get_int_tuple("kernel")
-    if len(kernel_size) != 2:
-        raise tvm.error.OpAttributeInvalid(
-            "Non-2D kernels are not supported for operator Conv2D-transpose."
-        )
-    data_layout = attrs.get_str("layout", "NCHW")
-    channel_axis = _get_channel_axis(data_layout, "conv2d_transpose")
-
-    if "kernel_layout" in attrs.attrs:
-        kernel_layout = attrs.get_str("kernel_layout")
-    else:
-        kernel_layout = "HWIO" if data_layout == "NHWC" else "IOHW"
-
-    new_attrs = {}
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["kernel_size"] = kernel_size
-    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
-    new_attrs["output_padding"] = attrs.get_int_tuple("adj", (0, 0))
-    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
-    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1))
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    new_attrs["data_layout"] = data_layout
-    new_attrs["kernel_layout"] = kernel_layout
-    use_bias = not attrs.get_bool("no_bias", True)
-    res = _op.nn.conv2d_transpose(inputs[0], inputs[1], **new_attrs)
-
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
-    return res
-
-
-def _mx_conv3d_transpose(inputs, attrs):
-    if "target_shape" in attrs.attrs:
-        raise tvm.error.OpAttributeUnImplemented(
-            'Attribute "target_shape" is not supported for operator Conv3D-transpose.'
-        )
-    kernel_size = attrs.get_int_tuple("kernel")
-    if len(kernel_size) != 3:
-        raise tvm.error.OpAttributeInvalid(
-            "Non-3D kernels are not supported for operator Conv3D-transpose."
-        )
-    data_layout = attrs.get_str("layout", "NCDHW")
-    channel_axis = _get_channel_axis(data_layout, "conv3d_transpose")
-
-    if "kernel_layout" in attrs.attrs:
-        kernel_layout = attrs.get_str("kernel_layout")
-    else:
-        kernel_layout = "DHWIO" if data_layout == "NDHWC" else "OIDHW"
-
-    new_attrs = {}
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["kernel_size"] = kernel_size
-    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1, 1))
-    new_attrs["output_padding"] = attrs.get_int_tuple("adj", (0, 0, 0))
-    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0, 0))
-    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1, 1))
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    new_attrs["data_layout"] = data_layout
-    new_attrs["kernel_layout"] = kernel_layout
-    use_bias = not attrs.get_bool("no_bias", True)
-    res = _op.nn.conv3d_transpose(inputs[0], inputs[1], **new_attrs)
-
-    if use_bias:
-        assert len(inputs) == 3
-        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
-    return res
-
-
-def _mx_pooling(inputs, attrs):
-    global_pool = attrs.get_bool("global_pool", False)
-    pool_type = attrs.get_str("pool_type")
-
-    def _pool2d(new_op, is_avg):
-        kernel_size = attrs.get_int_tuple("kernel")
-        if len(kernel_size) != 2:
-            raise tvm.error.OpAttributeInvalid("Only 2D kernels are supported for operator Pool2D.")
-        new_attrs = {}
-        new_attrs["pool_size"] = kernel_size
-        new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
-        new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
-        new_attrs["ceil_mode"] = attrs.get_str("pooling_convention", "valid") == "full"
-        if is_avg:
-            new_attrs["count_include_pad"] = attrs.get_bool("count_include_pad", True)
-        return new_op(inputs[0], **new_attrs)
-
-    def _pool3d(new_op, is_avg):
-        kernel_size = attrs.get_int_tuple("kernel")
-        if len(kernel_size) != 3:
-            raise tvm.error.OpAttributeInvalid("Only 3D kernels are supported for operator Pool3D.")
-        new_attrs = {}
-        new_attrs["pool_size"] = kernel_size
-        new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1, 1))
-        new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0, 0))
-        new_attrs["ceil_mode"] = attrs.get_str("pooling_convention", "valid") == "full"
-        if is_avg:
-            new_attrs["count_include_pad"] = attrs.get_bool("count_include_pad", True)
-        return new_op(inputs[0], **new_attrs)
-
-    # 3D pooling
-    if len(_infer_shape(inputs[0])) == 5:
-        if pool_type == "max":
-            if global_pool:
-                return _op.nn.global_max_pool3d(inputs[0])
-            return _pool3d(_op.nn.max_pool3d, False)
-        if pool_type == "avg":
-            if global_pool:
-                return _op.nn.global_avg_pool3d(inputs[0])
-            return _pool3d(_op.nn.avg_pool3d, True)
-        raise tvm.error.OpNotImplemented(
-            f"Operator {pool_type.capitalize()} Pooling is not supported for frontend MXNet."
-        )
-    # 2D Pooling
-    if pool_type == "max":
-        if global_pool:
-            return _op.nn.global_max_pool2d(inputs[0])
-        return _pool2d(_op.nn.max_pool2d, False)
-    if pool_type == "avg":
-        if global_pool:
-            return _op.nn.global_avg_pool2d(inputs[0])
-        return _pool2d(_op.nn.avg_pool2d, True)
-    raise tvm.error.OpNotImplemented(
-        f"Operator {pool_type.capitalize()} Pooling is not supported for frontend MXNet."
-    )
-
-
-def _mx_adaptive_avg_pooling(inputs, attrs):
-    output_size = attrs.get_int_tuple("output_size", [])
-    return _op.nn.adaptive_avg_pool2d(inputs[0], output_size)
-
-
-def _mx_dropout(inputs, attrs):
-    rate = attrs.get_float("p", 0.5)
-    return _op.nn.dropout(inputs[0], rate=rate)
-
-
-def _mx_BlockGrad(inputs, attrs):  # pylint: disable=unused-argument
-    return inputs
-
-
-def _mx_batch_norm(inputs, attrs):
-    if attrs.get_bool("output_mean_var", False):
-        raise tvm.error.OpAttributeUnImplemented(
-            'Attribute "output_mean_var" is not supported for operator Batch Norm.'
-        )
-    if attrs.get_bool("use_global_stats", False):
-        _warn_not_used("use_global_stats", "batch_norm")
-    new_attrs = {}
-    new_attrs["axis"] = attrs.get_int("axis", 1)
-    new_attrs["epsilon"] = attrs.get_float("eps", 0.001)
-    new_attrs["center"] = True
-    new_attrs["scale"] = not attrs.get_bool("fix_gamma", True)
-    return _op.nn.batch_norm(*inputs, **new_attrs)
-
-
-def _mx_instance_norm(inputs, attrs):
-    assert len(inputs) == 3
-    new_attrs = {}
-    new_attrs["axis"] = attrs.get_int("axis", 1)
-    new_attrs["epsilon"] = attrs.get_float("eps", 1e-5)
-    return _op.nn.instance_norm(*inputs, **new_attrs)
-
-
-def _mx_layer_norm(inputs, attrs):
-    assert len(inputs) == 3
-    if attrs.get_bool("output_mean_var", False):
-        raise tvm.error.OpAttributeUnimplemented(
-            'Attribute "output_mean_var" is not supported for operator Layer Norm.'
-        )
-    new_attrs = {}
-    new_attrs["axis"] = attrs.get_int("axis", -1)
-    new_attrs["epsilon"] = attrs.get_float("eps", 1e-5)
-    return _op.nn.layer_norm(*inputs, **new_attrs)
-
-
-def _mx_group_norm(inputs, attrs):
-    assert len(inputs) == 3
-    if attrs.get_bool("output_mean_var", False):
-        raise tvm.error.OpAttributeUnimplemented(
-            'Attribute "output_mean_var" is not supported for operator Group Norm.'
-        )
-    new_attrs = {}
-    new_attrs["axis"] = 1
-    new_attrs["num_groups"] = attrs.get_int("num_groups", 1)
-    new_attrs["epsilon"] = attrs.get_float("eps", 1e-5)
-    return _op.nn.group_norm(*inputs, **new_attrs)
-
-
-def _mx_slice(inputs, attrs):
-    new_attrs = {}
-    begin = list(attrs.get_int_tuple("begin", None))
-    end = list(attrs.get_int_tuple("end", None))
-    stride = attrs.get_int_tuple("step", None)
-    input_shape = _infer_type(inputs[0]).checked_type.shape
-    if begin is None:
-        raise tvm.error.OpAttributeRequired('Attribute "begin" not found in operator Slice.')
-    if end is None:
-        raise tvm.error.OpAttributeRequired('Attribute "end" not found in operator Slice.')
-    begin = (x if x is not None else 0 for x in begin)
-    for i, ed in enumerate(end):
-        if ed is None:
-            end[i] = input_shape[i]
-    new_attrs = {"begin": list(begin), "end": list(end)}
-    if stride is not None:
-        stride = (x if x is not None else 1 for x in stride)
-        new_attrs["strides"] = list(stride)
-    return _op.strided_slice(inputs[0], **new_attrs)
-
-
-def _mx_slice_like(inputs, attrs):
-    assert len(inputs) == 2
-    new_attrs = {}
-    new_attrs["axes"] = attrs.get_int_tuple("axes", None)
-    return _op.slice_like(*inputs, **new_attrs)
-
-
-def _mx_slice_axis(inputs, attrs):
-    assert len(inputs) == 1
-    expr = _infer_type(inputs[0])
-    shape = expr.checked_type.shape
-    axis = attrs.get_int("axis")
-    ax_beg = attrs.get_int("begin")
-    ax_end = attrs.get_str("end")
-    if axis < 0:
-        axis += len(shape)
-    assert 0 <= axis < len(shape)
-    if ax_end == "None":
-        ax_end = int(shape[axis])
-    else:
-        ax_end = int(ax_end)
-    if ax_beg < 0:
-        ax_beg += int(shape[axis])
-    if ax_end < 0:
-        ax_end += int(shape[axis])
-    assert 0 <= ax_beg < int(shape[axis])
-    assert ax_beg < ax_end <= int(shape[axis])
-    begin = []
-    end = []
-    for i, dim in enumerate(shape):
-        if i != axis:
-            begin.append(0)
-            end.append(dim)
-        else:
-            begin.append(ax_beg)
-            end.append(ax_end)
-    return _op.strided_slice(inputs[0], begin, end)
-
-
-def _mx_crop_like(inputs, attrs):
-    if len(inputs) < 2:
-        raise tvm.error.OpAttributeUnimplemented(
-            "Only support crop_like pattern for operator Crop."
-        )
-    if attrs.get_bool("center_crop", False):
-        raise tvm.error.OpAttributeUnimplemented("Center crop is not supported in operator Crop.")
-    if attrs.get_int_tuple("h_w", (0, 0)) != (0, 0):
-        raise tvm.error.OpAttributeUnimplemented("Doesn't support h_w in operator Crop.")
-    offset = attrs.get_int_tuple("offset", (0, 0))
-    new_attrs = {}
-    if offset == (0, 0):
-        new_attrs["axes"] = (2, 3)
-        return _op.slice_like(*inputs, **new_attrs)
-    expr = _infer_type(inputs[1])
-    like_shape = expr.checked_type.shape
-    new_attrs["begin"] = [0, 0, offset[0], offset[1]]
-    new_attrs["end"] = [
-        like_shape[0],
-        like_shape[1],
-        offset[0] + like_shape[2],
-        offset[1] + like_shape[3],
-    ]
-    return _op.strided_slice(inputs[0], **new_attrs)
-
-
-def _mx_split(inputs, attrs):
-    axis = attrs.get_int("axis", 1)
-    new_attrs = {}
-    new_attrs["indices_or_sections"] = attrs.get_int("num_outputs")
-    new_attrs["axis"] = axis
-    res = _op.split(inputs[0], **new_attrs)
-    if attrs.get_bool("squeeze_axis", False):
-        return tuple([_op.squeeze(x, axis=[axis]) for x in res])
-    return res
-
-
-def _mx_softmax_activation(inputs, attrs):
-    mode = attrs.get_str("mode", "instance")
-    axis = 0 if mode == "instance" else 1
-    return _op.nn.softmax(inputs[0], axis=axis)
-
-
-def _mx_softmax_output(inputs, attrs):
-    if attrs.get_bool("multi_output", False):
-        return _op.nn.softmax(inputs[0], axis=1)
-    return _op.nn.softmax(inputs[0])
-
-
-def _mx_linear_regression_output(inputs, _):
-    return inputs[0]
-
-
-def _mx_logistic_regression_output(inputs, _):
-    return _op.sigmoid(inputs[0])
-
-
-def _mx_concat(inputs, attrs):
-    axis = attrs.get_int("dim", 1)
-    return _op.concatenate(tuple(inputs), axis=axis)
-
-
-def _mx_stack(inputs, attrs):
-    axis = attrs.get_int("axis", 0)
-    return _op.stack(tuple(inputs), axis=axis)
-
-
-def _mx_expand_dims(inputs, attrs):
-    axis = attrs.get_int("axis")
-    return _op.expand_dims(inputs[0], axis=axis)
-
-
-def _mx_pad(inputs, attrs):
-    pad_mode = attrs.get_str("mode", None)
-    if pad_mode is None:
-        raise tvm.error.OpAttributeRequired('Attribute "mode" not found in operator pad.')
-    if pad_mode not in ["constant", "edge", "reflect"]:
-        raise tvm.error.OpAttributeInvalid("Value " + mode + ' in attribute "mode" is not valid')
-    pad_width = attrs.get_int_tuple("pad_width", None)
-    if pad_width is None:
-        raise tvm.error.OpAttributeRequired('Attribute "pad_width" not found in operator pad.')
-    if None in pad_width:
-        raise tvm.error.OpAttributeInvalid(
-            'Value None in attribute "pad_width" of operator Slice is not valid.'
-        )
-    constant_value = attrs.get_float("constant_value", 0.0)
-    padding = tuple(tuple((b, a)) for b, a in zip(pad_width[::2], pad_width[1::2]))
-    return _op.nn.pad(
-        data=inputs[0], pad_width=padding, pad_value=constant_value, pad_mode=pad_mode
-    )
-
-
-def _mx_leaky_relu(inputs, attrs):
-    act_type = attrs.get_str("act_type", "leaky")
-    if act_type == "leaky":
-        return _op.nn.leaky_relu(inputs[0], alpha=attrs.get_float("slope", 0.25))
-    if act_type == "prelu":
-        assert len(inputs) == 2
-        return _op.nn.prelu(*inputs)
-    if act_type == "elu":
-        # -slope * relu(1-exp(x)) + relu(x)
-        slope = attrs.get_float("slope", 0.25)
-        one = _expr.const(1, dtype="float32")
-        x = inputs[0]
-        mslope = _op.nn.relu(_op.subtract(one, _op.exp(x)))
-        mslope = _op.multiply(mslope, _expr.const(-slope, dtype="float32"))
-        return _op.add(mslope, _op.nn.relu(x))
-    if act_type == "rrelu":
-        # NOTE this is only converted for inference.
-        lower_bound = attrs.get_float("lower_bound")
-        upper_bound = attrs.get_float("upper_bound")
-        alpha = (lower_bound + upper_bound) / 2.0
-        return _op.nn.leaky_relu(inputs[0], alpha=alpha)
-    if act_type == "gelu":
-        # 0.5 * x * (1 + erf(x / sqrt(2)))
-        sqrt2 = _expr.const(math.sqrt(2), dtype="float32")
-        erf = _op.erf(_op.divide(inputs[0], sqrt2))
-        one = _expr.const(1, dtype="float32")
-        erf_plus_one = _op.add(one, erf)
-        half = _expr.const(0.5, dtype="float32")
-        half_x = _op.multiply(inputs[0], half)
-        return _op.multiply(half_x, erf_plus_one)
-    raise tvm.error.OpNotImplemented(f"Operator {act_type} is not supported for frontend MXNet.")
-
-
-def _mx_make_power(power):
-    def _impl(inputs, _):  # Note: no attrs
-        assert len(inputs) == 1
-        scalar = _expr.const(power, dtype=None)
-        # Note: int maps to "int32", float maps to "float32"
-        return _op.power(inputs[0], scalar)
-
-    return _impl
-
-
-def _mx_make_exponent(base):
-    # exp(b, x) = e^b * e^x
-    def _impl(inputs, _):  # Note: no attrs
-        assert len(inputs) == 1
-        scalar = _op.exp(_expr.const(base, dtype="float32"))
-        return _op.multiply(inputs[0], scalar)
-
-    return _impl
-
-
-def _mx_make_logarithm(base):
-    # log(b, x) = log(x) / log(b)
-    def _impl(inputs, _):  # Note: no attrs
-        assert len(inputs) == 1
-        scalar = _op.log(_expr.const(base, dtype="float32"))
-        return _op.divide(inputs[0], scalar)
-
-    return _impl
-
-
-def _mx_expm1():
-    # exp_minus_1 x = exp(x) - 1
-    def _impl(inputs, _):  # Note: no attrs
-        assert len(inputs) == 1
-        one = _expr.const(1, dtype="float32")
-        return _op.log(_op.subtract(inputs[0], one))
-
-    return _impl
-
-
-def _mx_log1p():
-    # 1_plus_log x = log(x + 1)
-    def _impl(inputs, _):  # Note: no attrs
-        assert len(inputs) == 1
-        one = _expr.const(1, dtype="float32")
-        return _op.log(_op.add(inputs[0], one))
-
-    return _impl
-
-
-def _mx_lrn(inputs, attrs):
-    new_attrs = {}
-    new_attrs["alpha"] = attrs.get_float("alpha", 0.0001)
-    new_attrs["beta"] = attrs.get_float("beta", 0.75)
-    new_attrs["bias"] = attrs.get_float("knorm", 2)
-    # NCHW format and normalization along channel axis
-    new_attrs["axis"] = 1
-    new_attrs["size"] = attrs.get_int("nsize")
-    assert len(inputs) == 1
-    return _op.nn.lrn(inputs[0], **new_attrs)
-
-
-def _mx_multibox_prior(inputs, attrs):
-    new_attrs = {}
-    new_attrs["sizes"] = attrs.get_float_tuple("sizes", (1.0,))
-    new_attrs["steps"] = attrs.get_float_tuple("steps", (-1.0, -1.0))
-    new_attrs["offsets"] = attrs.get_float_tuple("offsets", (0.5, 0.5))
-    new_attrs["ratios"] = attrs.get_float_tuple("ratios", (1.0,))
-    new_attrs["clip"] = attrs.get_bool("clip", False)
-    return _op.vision.multibox_prior(inputs[0], **new_attrs)
-
-
-def _mx_multibox_detection(inputs, attrs):
-    new_attrs0 = {}
-    new_attrs0["clip"] = attrs.get_bool("clip", True)
-    new_attrs0["threshold"] = attrs.get_float("threshold", 0.01)
-    new_attrs0["variances"] = attrs.get_float_tuple("variances", (0.1, 0.1, 0.2, 0.2))
-
-    new_attrs1 = {}
-    new_attrs1["return_indices"] = False
-    new_attrs1["iou_threshold"] = attrs.get_float("nms_threshold", 0.5)
-    new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False)
-    new_attrs1["top_k"] = attrs.get_int("nms_topk", -1)
-
-    ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1], inputs[2], **new_attrs0)
-    return _op.vision.non_max_suppression(ret[0], ret[1], ret[1], **new_attrs1)
-
-
-def _mx_dot(inputs, attrs):
-    assert len(inputs) == 2
-
-    a = inputs[0]
-    b = inputs[1]
-
-    rank_a = len(_infer_type(a).checked_type.shape)
-    rank_b = len(_infer_type(b).checked_type.shape)
-
-    if rank_a < 1 or rank_b < 1:
-        raise tvm.error.OpAttributeInvalid("Unsupported shape of input tensors.")
-
-    transpose_a = attrs.get_bool("transpose_a", False)
-    transpose_b = attrs.get_bool("transpose_b", False)
-
-    if transpose_a is True:
-        msg = f'Value {transpose_a} in attribute "transpose_a" of operator dot is not valid.'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    # When performing dot product we need to properly handle shape of result -> out_shape
-    if rank_a == 1:
-        out_shape = list()
-        a = _op.expand_dims(a, axis=0)
-    else:
-        shape_a = list(_infer_type(a).checked_type.shape)
-        out_shape = shape_a[:-1]
-        a = _op.reshape(a, newshape=(-1, shape_a[-1]))
-
-    if rank_b == 1:
-        if not out_shape:
-            out_shape = [1]
-        b = _op.expand_dims(b, axis=1)
-    else:
-        # Transpose matrix b if needed
-        if transpose_b:
-            trans_axes = list(range(rank_b))
-            trans_axes = trans_axes[-1:] + trans_axes[:-1]
-            b = _op.transpose(b, axes=trans_axes)
-
-        shape_b = list(_infer_type(b).checked_type.shape)
-        out_shape += shape_b[1:]
-        b = _op.reshape(b, newshape=(shape_b[0], -1))
-
-    out = _op.reshape(_op.nn.matmul(a, b), newshape=out_shape)
-
-    return out
-
-
-def _mx_batch_dot(inputs, attrs):
-    assert len(inputs) == 2
-    a, b = inputs
-    a_shape = _infer_type(a).checked_type.shape
-    batch_shapes = None
-    if len(a_shape) > 3:
-        batch_shapes = a_shape[:-2]
-        a = _op.reverse_reshape(a, newshape=(-1, 0, 0))
-    b_shape = _infer_type(b).checked_type.shape
-    if len(b_shape) > 3:
-        if batch_shapes is None:
-            batch_shapes = b_shape[:-2]
-        b = _op.reverse_reshape(b, newshape=(-1, 0, 0))
-    transpose_a = attrs.get_bool("transpose_a", False)
-    transpose_b = attrs.get_bool("transpose_b", False)
-    if transpose_a is True:
-        msg = f'Value {transpose_a} in attribute "transpose_a" of operator batch_dot is not valid.'
-        raise tvm.error.OpAttributeInvalid(msg)
-    if transpose_b is False:
-        b = _op.transpose(b, axes=[0, 2, 1])
-    out = _op.nn.batch_matmul(a, b)
-    if batch_shapes is not None:
-        out = _op.reverse_reshape(out, newshape=tuple(batch_shapes) + (0, 0))
-    return out
-
-
-def _mx_arange(inputs, attrs):
-    assert len(inputs) == 0
-    if attrs.get_int("repeat", 1) != 1:
-        raise tvm.error.OpAttributeUnimplemented(
-            'Attribute "repeat" is not supported in operator arange.'
-        )
-    dtype = attrs.get_str("dtype", "float32")
-    stop = attrs.get_str("stop", "None")
-    if stop == "None":
-        stop = None
-    else:
-        stop = _expr.const(float(stop), dtype=dtype)
-    new_attrs = {}
-    new_attrs["start"] = _expr.const(attrs.get_float("start", 0.0), dtype=dtype)
-    new_attrs["stop"] = stop
-    new_attrs["step"] = _expr.const(attrs.get_float("step", 1.0), dtype=dtype)
-    new_attrs["dtype"] = dtype
-    return _op.arange(**new_attrs)
-
-
-# pylint: disable=unused-argument
-def _mx_make_loss(inputs, attrs):
-    # while doing inference make_loss does not have any effect
-    # and it should be mapped to identity
-    return inputs[0]
-
-
-def _mx_contrib_arange_like(inputs, attrs):
-    assert len(inputs) == 1
-    if attrs.get_int("repeat", 1) != 1:
-        raise tvm.error.OpAttributeUnimplemented(
-            'Attribute "repeat" is not supported in operator arange_like.'
-        )
-    ty = _infer_type(inputs[0]).checked_type
-    assert ty
-    shape, dtype = get_const_tuple(ty.shape), ty.dtype
-    axis = attrs.get_int("axis", None)
-    if axis is None:
-        n_elems = 1
-        for dim in shape:
-            if not isinstance(dim, int):
-                raise tvm.error.OpError("Don't support arange_like with symbolic input shape.")
-            n_elems *= dim
-    else:
-        axis = axis + len(shape) if axis < 0 else axis
-        assert 0 <= axis < len(shape)
-        n_elems = shape[axis]
-        if not isinstance(n_elems, int):
-            raise tvm.error.OpError("Don't support arange_like with symbolic input shape.")
-        shape = (n_elems,)
-    start = attrs.get_float("start", 0.0)
-    step = attrs.get_float("step", 1.0)
-    stop = start + step * n_elems
-    new_attrs = {}
-    new_attrs["start"] = _expr.const(start, dtype=dtype)
-    new_attrs["stop"] = _expr.const(stop, dtype=dtype)
-    new_attrs["step"] = _expr.const(step, dtype=dtype)
-    new_attrs["dtype"] = dtype
-    ret = _op.arange(**new_attrs)
-    if len(shape) > 1:
-        ret = _op.reshape(ret, shape)
-    return ret
-
-
-def _mx_repeat(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["repeats"] = attrs.get_int("repeats")
-    axis = attrs.get_int("axis", None)
-    if axis is None:
-        inputs[0] = _op.nn.batch_flatten(inputs[0])
-        new_attrs["axis"] = 0
-    else:
-        new_attrs["axis"] = axis
-    return _op.repeat(inputs[0], **new_attrs)
-
-
-def _mx_tile(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["reps"] = attrs.get_int_tuple("reps")
-    return _op.tile(inputs[0], **new_attrs)
-
-
-def _mx_take(inputs, attrs):
-    assert len(inputs) == 2
-    mode = attrs.get_str("mode", "clip")
-    if mode == "raise":
-        raise tvm.error.OpAttributeUnimplemented("take with raise mode is not supported yet")
-    axis = attrs.get_int("axis", 0)
-    return _op.take(inputs[0], inputs[1].astype("int32"), axis=axis, mode=mode)
-
-
-def _mx_gather_nd(inputs, attrs):
-    assert len(inputs) == 2
-    assert len(_infer_shape(inputs[1])) > 1, "index tensor to have at least 2 dimensions"
-    return _op.gather_nd(inputs[0], inputs[1])
-
-
-def _mx_reverse(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["axis"] = attrs.get_int("axis")
-    return _op.reverse(inputs[0], **new_attrs)
-
-
-def _mx_sequence_reverse(inputs, attrs):
-    new_attrs = {}
-    use_seq_lengths = attrs.get_bool("use_sequence_length")
-    if not use_seq_lengths:
-        assert len(inputs) == 1
-        new_attrs["axis"] = attrs.get_int("axis")
-        return _op.reverse(inputs[0], **new_attrs)
-
-    assert len(inputs) == 2
-    new_attrs["seq_axis"] = attrs.get_int("axis")
-    # MXNet assumes batch_axis as 1.
-    new_attrs["batch_axis"] = 1
-    return _op.reverse_sequence(inputs[0], inputs[1], **new_attrs)
-
-
-def _mx_roi_align(inputs, attrs):
-    new_attrs = {}
-    new_attrs["pooled_size"] = attrs.get_int_tuple("pooled_size")
-    new_attrs["spatial_scale"] = attrs.get_float("spatial_scale")
-    new_attrs["sample_ratio"] = attrs.get_int("sample_ratio", -1)
-    new_attrs["layout"] = "NCHW"
-    return _op.vision.roi_align(inputs[0], inputs[1], **new_attrs)
-
-
-def _mx_resize(inputs, attrs):
-    scale_height = attrs.get_float("scale_height", None)
-    scale_width = attrs.get_float("scale_width", None)
-    height = attrs.get_int("height", 1)
-    width = attrs.get_int("width", 1)
-    expr = _infer_type(inputs[0])
-    shape = expr.checked_type.shape
-    if scale_height is not None:
-        height = (scale_height * shape[2]).astype("int32")
-    if scale_width is not None:
-        width = (scale_width * shape[3]).astype("int32")
-    size = (height, width)
-    return _op.image.resize2d(inputs[0], size, coordinate_transformation_mode="align_corners")
-
-
-def _mx_amp_multicast(inputs, attrs):
-    cast_narrow = attrs.get_bool("cast_narrow", False)
-    dtypes = [_infer_type(x).checked_type.dtype for x in inputs]
-    supported_dtypes = ["float16", "float32"]
-    assert all(
-        [x in supported_dtypes for x in dtypes]
-    ), "amp_multicast support is limited to float16 and float32 inputs only."
-    has_float16 = any(x == "float16" for x in dtypes)
-    has_float32 = any(x == "float32" for x in dtypes)
-    dtype = dtypes[0]
-    if cast_narrow and has_float16:
-        dtype = "float16"
-    if not cast_narrow and has_float32:
-        dtype = "float32"
-    return [_op.cast(x, dtype) for x in inputs]
-
-
-def _mx_grid_generator(inputs, attrs):
-    transform_type = attrs.get_str("transform_type")
-    if transform_type == "affine":
-        target_shape = attrs.get_int_tuple("target_shape")
-        return _op.image.affine_grid(_op.reshape(inputs[0], (0, 2, 3)), target_shape)
-    if transform_type == "warp":
-        checked_type = _infer_type(inputs[0]).checked_type
-        batch, _, height, width = get_const_tuple(checked_type.shape)
-        dtype = checked_type.dtype
-        identity_affine = relay.const(np.array([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]], dtype=dtype))
-        identity_affine = _op.broadcast_to(identity_affine, (batch, 2, 3))
-        normalizer = (2.0 / np.array([width - 1, height - 1])).reshape(1, -1, 1, 1).astype(dtype)
-        normalized_flow = inputs[0] * relay.const(normalizer)
-        grid = _op.image.affine_grid(identity_affine, (height, width))
-        return grid + normalized_flow
-    raise ValueError("unknown transform type" + transform_type)
-
-
-def _mx_bilinear_sampler(inputs, attrs):
-    return _op.image.grid_sample(inputs[0], inputs[1], "bilinear", "NCHW")
-
-
-def _mx_roi_pooling(inputs, attrs):
-    new_attrs = {}
-    new_attrs["pooled_size"] = attrs.get_int_tuple("pooled_size")
-    new_attrs["spatial_scale"] = attrs.get_float("spatial_scale")
-    new_attrs["layout"] = "NCHW"
-    return _op.vision.roi_pool(inputs[0], inputs[1], **new_attrs)
-
-
-def _mx_proposal(inputs, attrs):
-    new_attrs = {}
-    new_attrs["scales"] = attrs.get_float_tuple("scales", (4.0, 8.0, 16.0, 32.0))
-    new_attrs["ratios"] = attrs.get_float_tuple("ratios", (0.5, 1.0, 2.0))
-    new_attrs["feature_stride"] = attrs.get_int("feature_stride", 16)
-    new_attrs["threshold"] = attrs.get_float("threshold", 0.7)
-    new_attrs["rpn_pre_nms_top_n"] = attrs.get_int("rpn_pre_nms_top_n", 6000)
-    new_attrs["rpn_post_nms_top_n"] = attrs.get_int("rpn_post_nms_top_n", 300)
-    new_attrs["rpn_min_size"] = attrs.get_int("rpn_min_size", 16)
-    new_attrs["iou_loss"] = attrs.get_bool("iou_loss", False)
-    assert not attrs.get_bool("output_score", False), "proposal doesn't support output score"
-    return _op.vision.proposal(inputs[0], inputs[1], inputs[2], **new_attrs)
-
-
-def _mx_box_nms(inputs, attrs):
-    force_suppress = attrs.get_bool("force_suppress", False)
-    iou_thresh = attrs.get_float("overlap_thresh", 0.5)
-    top_k = attrs.get_int("topk", -1)
-    valid_thresh = attrs.get_float("valid_thresh", 0)
-    coord_start = attrs.get_int("coord_start", 2)
-    score_index = attrs.get_int("score_index", 1)
-    id_index = attrs.get_int("id_index", -1)
-    in_format = attrs.get_str("in_format", "corner")
-    out_format = attrs.get_str("out_format", "corner")
-    if in_format != "corner":
-        raise tvm.error.OpAttributeInvalid(
-            'Value of attribute "in_format" must equal "corner" for operator box_nms.'
-        )
-    if out_format != "corner":
-        raise tvm.error.OpAttributeInvalid(
-            'Value of attribute "out_format" must equal "corner" for operator box_nms.'
-        )
-
-    ret = _op.vision.get_valid_counts(
-        inputs[0], score_threshold=valid_thresh, id_index=id_index, score_index=score_index
-    )
-    nms_out = _op.vision.non_max_suppression(
-        ret[1],
-        ret[0],
-        ret[2],
-        iou_threshold=iou_thresh,
-        force_suppress=force_suppress,
-        top_k=top_k,
-        coord_start=coord_start,
-        score_index=score_index,
-        id_index=id_index,
-        return_indices=False,
-        invalid_to_bottom=True,
-    )
-    return nms_out
-
-
-def _mx_box_decode(inputs, attrs):
-    std0 = relay.const(attrs.get_float("std0", 1), "float32")
-    std1 = relay.const(attrs.get_float("std1", 1), "float32")
-    std2 = relay.const(attrs.get_float("std2", 1), "float32")
-    std3 = relay.const(attrs.get_float("std3", 1), "float32")
-    clip = attrs.get_float("clip", -1)
-    in_format = attrs.get_str("format", "corner")
-
-    anchors = inputs[1]  # (1, N, 4) encoded in corner or center
-    a = _op.split(anchors, indices_or_sections=4, axis=-1)
-    # Convert to format "center".
-    if in_format == "corner":
-        a_width = a[2] - a[0]
-        a_height = a[3] - a[1]
-        a_x = a[0] + a_width * relay.const(0.5, "float32")
-        a_y = a[1] + a_height * relay.const(0.5, "float32")
-    else:
-        a_x, a_y, a_width, a_height = a
-    data = inputs[0]  # (B, N, 4) predicted bbox offset
-    p = _op.split(data, indices_or_sections=4, axis=-1)
-    ox = p[0] * std0 * a_width + a_x
-    oy = p[1] * std1 * a_height + a_y
-    dw = p[2] * std2
-    dh = p[3] * std3
-    if clip > 0:
-        clip = relay.const(clip, "float32")
-        dw = _op.minimum(dw, clip)
-        dh = _op.minimum(dh, clip)
-    dw = _op.exp(dw)
-    dh = _op.exp(dh)
-    ow = dw * a_width * relay.const(0.5, "float32")
-    oh = dh * a_height * relay.const(0.5, "float32")
-    out = _op.concatenate([ox - ow, oy - oh, ox + ow, oy + oh], axis=-1)
-    return out
-
-
-def _mx_l2_normalize(inputs, attrs):
-    new_attrs = {}
-    mode = attrs.get_str("mode", "instance")
-    if mode == "channel":
-        new_attrs["axis"] = [1]
-    elif mode == "instance":
-        ndim = len(_infer_type(inputs[0]).checked_type.shape)
-        new_attrs["axis"] = list(range(1, ndim))
-    elif mode == "spatial":
-        ndim = len(_infer_type(inputs[0]).checked_type.shape)
-        new_attrs["axis"] = list(range(2, ndim))
-    else:
-        raise tvm.error.OpAttributeInvalid(
-            f'Mode "{mode}" is not supported for operator l2_normalize.'
-        )
-    new_attrs["eps"] = attrs.get_float("eps", 1e-10)
-    return _op.nn.l2_normalize(inputs[0], **new_attrs)
-
-
-def _mx_softsign(inputs, attrs):
-    return inputs[0] / (_expr.const(1.0) + _op.abs(inputs[0]))
-
-
-def _mx_softmin(inputs, attrs):
-    axis = attrs.get_int("axis", -1)
-    return _op.nn.softmax(_op.negative(inputs[0]), axis)
-
-
-def _mx_hard_sigmoid(inputs, attrs):
-    x = (_expr.const(0.2) * inputs[0]) + _expr.const(0.5)
-    return _op.clip(x, a_min=0.0, a_max=1.0)
-
-
-def _mx_reciprocal(inputs, attrs):
-    return _expr.const(1.0) / inputs[0]
-
-
-def _mx_shape_array(inputs, attrs):
-    assert len(inputs) == 1
-    if attrs.get_int("lhs_begin", None) is not None:
-        raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support lhs_begin")
-    if attrs.get_int("lhs_end", None) is not None:
-        raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support lhs_end")
-    if attrs.get_int("rhs_begin", None) is not None:
-        raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support rhs_begin")
-    if attrs.get_int("rhs_end", None) is not None:
-        raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support rhs_end")
-    return _op.shape_of(inputs[0], dtype="int64")
-
-
-def _mx_full(inputs, attrs):
-    assert len(inputs) == 0
-    val = attrs.get_float("value")
-    shape = attrs.get_int_tuple("shape")
-    dtype = attrs.get_str("dtype", "float32")
-    return _op.full(_expr.const(val, dtype), shape, dtype)
-
-
-def _mx_squeeze(inputs, attrs):
-    assert len(inputs) == 1
-    axis = attrs.get_int_tuple("axis", None)
-    return _op.squeeze(inputs[0], axis)
-
-
-def _mx_broadcast_axis(inputs, attrs):
-    assert len(inputs) == 1
-    axis = attrs.get_int_tuple("axis", [])
-    size = attrs.get_int_tuple("size", [])
-    assert len(axis) == len(size)
-    if len(axis) == 0:
-        return inputs[0]
-    expr = _infer_type(inputs[0])
-    src_shape = expr.checked_type.shape
-    tgt_shape = []
-    for i, dim in enumerate(src_shape):
-        if i not in axis:
-            tgt_shape.append(dim)
-        else:
-            assert int(dim) == 1
-            idx = axis.index(i)
-            tgt_shape.append(size[idx])
-    return _op.broadcast_to(inputs[0], tgt_shape)
-
-
-def _mx_embedding(inputs, _):
-    assert len(inputs) == 2
-    indices, weight = inputs
-    return _op.take(weight, indices.astype("int32"), axis=0)
-
-
-def _mx_smooth_l1(inputs, attrs):
-    scalar = attrs.get_float("scalar", 1.0)
-    scalar_sq = scalar * scalar
-    mask = _op.less(inputs[0], _expr.const(1.0 / scalar_sq, dtype="float32"))
-    return _op.where(
-        mask,
-        _expr.const(scalar_sq / 2.0, dtype="float32") * inputs[0] * inputs[0],
-        _op.abs(inputs[0]) - _expr.const(0.5 / scalar_sq),
-    )
-
-
-def _mx_deformable_convolution(inputs, attrs):
-    new_attrs = {}
-    new_attrs["kernel_size"] = attrs.get_int_tuple("kernel")
-    new_attrs["strides"] = attrs.get_int_tuple("stride")
-    new_attrs["padding"] = attrs.get_int_tuple("pad")
-    new_attrs["dilation"] = attrs.get_int_tuple("dilate")
-    new_attrs["channels"] = attrs.get_int("num_filter")
-    new_attrs["deformable_groups"] = attrs.get_int("num_deformable_group", 1)
-    new_attrs["groups"] = attrs.get_int("num_group", 1)
-    assert attrs.get_str("layout", "NCHW") == "NCHW", "Deformable conv2d only supports NCHW layout"
-    use_bias = not attrs.get_bool("no_bias", False)
-    res = _op.nn.deformable_conv2d(inputs[0], inputs[1], inputs[2], **new_attrs)
-    if use_bias:
-        assert len(inputs) == 4
-        res = _op.nn.bias_add(res, inputs[3])
-    return res
-
-
-def _mx_argsort(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["axis"] = attrs.get_int("axis", -1)
-    new_attrs["is_ascend"] = attrs.get_bool("is_ascend", True)
-    new_attrs["dtype"] = attrs.get_str("dtype", "float32")
-    return _op.argsort(inputs[0], **new_attrs)
-
-
-def _mx_topk(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["k"] = attrs.get_int("k", 1)
-    new_attrs["axis"] = attrs.get_int("axis", -1)
-    new_attrs["is_ascend"] = attrs.get_bool("is_ascend", False)
-    ret_type = attrs.get_str("ret_typ", "indices")
-    if ret_type == "mask":
-        raise tvm.error.OpAttributeUnimplemented(
-            "Attribute ret_type=mask is not supported in topk operator"
-        )
-    new_attrs["ret_type"] = "values" if ret_type == "value" else ret_type
-    new_attrs["dtype"] = attrs.get_str("dtype", "float32")
-    return _op.topk(inputs[0], **new_attrs)
-
-
-def _mx_sequence_mask(inputs, attrs):
-    assert len(inputs) == 1 or len(inputs) == 2
-    new_attrs = {}
-    use_sequence_length = attrs.get_bool("use_sequence_length", False)
-    new_attrs["mask_value"] = attrs.get_float("value", 0.0)
-    new_attrs["axis"] = attrs.get_int("axis", 0)
-    if use_sequence_length:
-        return _op.sequence_mask(*inputs, **new_attrs)
-    else:
-        return inputs[0]
-
-
-def _mx_contrib_div_sqrt_dim(inputs, _):
-    assert len(inputs) == 1
-    ndim = len(_infer_type(inputs[0]).checked_type.shape)
-    dim = _op.take(_op.shape_of(inputs[0]), _expr.const(ndim - 1, dtype="int32"))
-    dtype = _infer_type(inputs[0]).checked_type.dtype
-    sqrt_dim = _op.sqrt(dim.astype(dtype))
-    out = inputs[0] / sqrt_dim
-    return out
-
-
-def _mx_rnn_param_concat(inputs, _):
-    # We don't need to concatenate RNN params because we will unravel the RNN op
-    return [inputs]
-
-
-def _mx_rnn_layer(inputs, attrs):
-    def _rnn_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias, activation):
-        i2h = _op.nn.bias_add(_op.nn.dense(data, i2h_weight), i2h_bias, axis=-1)
-        h2h = _op.nn.bias_add(_op.nn.dense(states[0], h2h_weight), h2h_bias, axis=-1)
-        out = _activation_map[activation](i2h + h2h)
-        return out, [out]
-
-    def _gru_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias):
-        expr = _infer_type(data)
-        dtype = expr.checked_type.dtype
-        i2h = _op.nn.bias_add(_op.nn.dense(data, i2h_weight), i2h_bias, axis=-1)
-        h2h = _op.nn.bias_add(_op.nn.dense(states[0], h2h_weight), h2h_bias, axis=-1)
-        i2h_r, i2h_z, i2h = _op.split(i2h, indices_or_sections=3, axis=1)
-        h2h_r, h2h_z, h2h = _op.split(h2h, indices_or_sections=3, axis=1)
-        reset_gate = _activation_map["sigmoid"](i2h_r + h2h_r)
-        update_gate = _activation_map["sigmoid"](i2h_z + h2h_z)
-        next_h_tmp = _activation_map["tanh"](reset_gate * h2h + i2h)
-        next_h = (_expr.const(1, dtype) - update_gate) * next_h_tmp + update_gate * states[0]
-        return next_h, [next_h]
-
-    def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias):
-        i2h = _op.nn.bias_add(_op.nn.dense(data, i2h_weight), i2h_bias, axis=-1)
-        h2h = _op.nn.bias_add(_op.nn.dense(states[0], h2h_weight), h2h_bias, axis=-1)
-        gates = i2h + h2h
-        slice_gates = _op.split(gates, indices_or_sections=4, axis=1)
-        in_gate = _activation_map["sigmoid"](slice_gates[0])
-        forget_gate = _activation_map["sigmoid"](slice_gates[1])
-        in_transform = _activation_map["tanh"](slice_gates[2])
-        out_gate = _activation_map["sigmoid"](slice_gates[3])
-        next_c = forget_gate * states[1] + in_gate * in_transform
-        next_h = out_gate * _activation_map["tanh"](next_c)
-        return next_h, [next_h, next_c]
-
-    num_layers = attrs.get_int("num_layers", 1)
-    mode = attrs.get_str("mode")
-    output_states = attrs.get_bool("state_outputs", False)
-    if mode.startswith("rnn"):
-        mode, activation = mode.split("_")
-    assert mode in ["rnn", "gru", "lstm"]
-    bidirectional = attrs.get_bool("bidirectional", False)
-    direct = 2 if bidirectional else 1
-    layout = attrs.get_str("layout", "TNC")
-    if layout != "TNC":
-        raise tvm.error.OpAttributeUnimplemented(
-            "RNN with layout other than TNC is not supported yet"
-        )
-    num_states = 2 if mode == "lstm" else 1
-    assert len(inputs) == num_states + 2
-
-    seq_data = inputs[0]
-    concat_weight = inputs[1]
-    init_states = inputs[2:]
-    expr = _infer_type(seq_data)
-    data_shape = expr.checked_type.shape
-    seq_len = int(data_shape[0])
-    assert len(concat_weight) == num_layers * 4 * direct
-
-    for idx, state in enumerate(init_states[:]):
-        if isinstance(state, dict):
-            node = state
-            attrs = StrAttrsDict(node.get("attrs", {}))
-            op_name = node["op"]
-            # by default, RNN layer uses zeros to initialize states
-            assert op_name == "_zeros"
-            shape = attrs.get_int_tuple("shape")
-            dtype = attrs.get_str("dtype", "float32")
-            init_layout = attrs.get_str("__layout__")
-            new_shape = list(shape)
-            for i, dim in enumerate(shape):
-                if dim == 0:
-                    axis = layout.find(init_layout[i])
-                    assert axis >= 0
-                    new_shape[i] = int(data_shape[axis])
-            init_states[idx] = _op.zeros(new_shape, dtype)
-
-    weights = []
-    bias = []
-    states = []
-    back_weights = []
-    back_bias = []
-    back_states = []
-    for i in range(num_layers):
-        weights.append(
-            [concat_weight[i * 2 * direct].args[0], concat_weight[i * 2 * direct + 1].args[0]]
-        )
-        bias.append(
-            [
-                concat_weight[(num_layers + i) * 2 * direct].args[0],
-                concat_weight[(num_layers + i) * 2 * direct + 1].args[0],
-            ]
-        )
-        s = []
-        for state in init_states:
-            s.append(_op.take(state, _expr.const(i * direct, "int32"), axis=0))
-        states.append(s)
-        if bidirectional:
-            back_weights.append(
-                [
-                    concat_weight[i * 2 * direct + 2].args[0],
-                    concat_weight[i * 2 * direct + 3].args[0],
-                ]
-            )
-            back_bias.append(
-                [
-                    concat_weight[(num_layers + i) * 2 * direct + 2].args[0],
-                    concat_weight[(num_layers + i) * 2 * direct + 3].args[0],
-                ]
-            )
-            s = []
-            for state in init_states:
-                s.append(_op.take(state, _expr.const(i * direct + 1, "int32"), axis=0))
-            back_states.append(s)
-
-    xs = [_op.take(seq_data, _expr.const(t, "int32"), axis=0) for t in range(seq_len)]
-    for l in range(num_layers):
-        outputs = []
-        back_outputs = []
-        for x in xs:
-            if mode == "rnn":
-                out, new_states = _rnn_cell(x, states[l], *weights[l], *bias[l], activation)
-            elif mode == "gru":
-                out, new_states = _gru_cell(x, states[l], *weights[l], *bias[l])
-            else:  # mode == "lstm"
-                out, new_states = _lstm_cell(x, states[l], *weights[l], *bias[l])
-            states[l] = new_states
-            outputs.append(out)
-        if bidirectional:
-            for x in reversed(xs):
-                if mode == "rnn":
-                    out, new_states = _rnn_cell(
-                        x, back_states[l], *back_weights[l], *back_bias[l], activation
-                    )
-                elif mode == "gru":
-                    out, new_states = _gru_cell(x, back_states[l], *back_weights[l], *back_bias[l])
-                else:  # mode == "lstm"
-                    out, new_states = _lstm_cell(x, back_states[l], *back_weights[l], *back_bias[l])
-                back_states[l] = new_states
-                back_outputs.append(out)
-            back_outputs.reverse()
-            concat_outputs = []
-            for t, out in enumerate(outputs):
-                new_out = _op.concatenate([out, back_outputs[t]], axis=-1)
-                concat_outputs.append(new_out)
-            outputs = concat_outputs
-        xs = outputs
-
-    ret = [_op.stack(outputs, axis=0)]
-    if output_states:
-        for i in range(num_states):
-            inputs = []
-            for l, s in enumerate(states):
-                inputs.append(s[i])
-                if bidirectional:
-                    inputs.append(back_states[l][i])
-            ret.append(_op.stack(inputs, axis=0))
-    return ret
-
-
-def _mx_one_hot(inputs, attrs):
-    indices = inputs[0].astype("int32")
-    depth = attrs.get_int("depth", 0)
-    dtype = attrs.get_str("dtype", "int32")
-    on_value = tvm.relay.const(attrs.get_float("on_value", 1.0), dtype)
-    off_value = tvm.relay.const(attrs.get_float("off_value", 0.0), dtype)
-    return _op.one_hot(indices, on_value, off_value, depth, -1, dtype)
-
-
-def _mx_depth_to_space(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["block_size"] = attrs.get_int("block_size")
-    return _op.nn.depth_to_space(*inputs, **new_attrs)
-
-
-def _mx_space_to_depth(inputs, attrs):
-    assert len(inputs) == 1
-    new_attrs = {}
-    new_attrs["block_size"] = attrs.get_int("block_size")
-    return _op.nn.space_to_depth(*inputs, **new_attrs)
-
-
-def _mx_correlation(inputs, attrs):
-    assert len(inputs) == 2
-    new_attrs = {}
-    new_attrs["kernel_size"] = attrs.get_int("kernel_size", 1)
-    new_attrs["max_displacement"] = attrs.get_int("max_displacement", 1)
-    new_attrs["stride1"] = attrs.get_int("stride1", 1)
-    new_attrs["stride2"] = attrs.get_int("stride2", 1)
-    new_attrs["padding"] = attrs.get_int("pad_size", 0)
-    new_attrs["is_multiply"] = attrs.get_bool("is_multiply", True)
-    new_attrs["layout"] = "NCHW"
-    return _op.nn.correlation(*inputs, **new_attrs)
-
-
-def _mx_contrib_fifo_buffer(inputs, attrs):
-    new_attrs = {}
-    new_attrs["axis"] = attrs.get_int("axis")
-    return _op.nn.fifo_buffer(*inputs, **new_attrs)
-
-
-def _mx_contrib_interleaved_matmul_selfatt_qk(inputs, attrs):
-    """
-    tmp = mx.nd.reshape(queries_keys_values, shape=(0, 0, num_heads, 3, -1))
-    q_proj = mx.nd.transpose(tmp[:,:,:,0,:], axes=(1, 2, 0, 3))
-    q_proj = mx.nd.reshape(q_proj, shape=(-1, 0, 0), reverse=True)
-    q_proj = mx.nd.contrib.div_sqrt_dim(q_proj)
-    k_proj = mx.nd.transpose(tmp[:,:,:,1,:], axes=(1, 2, 0, 3))
-    k_proj = mx.nd.reshape(k_proj, shape=(-1, 0, 0), reverse=True)
-    output = mx.nd.batch_dot(q_proj, k_proj, transpose_b=True)
-    """
-    assert len(inputs) == 1
-    qkv = inputs[0]
-    num_heads = attrs.get_int("heads")
-    qkv = _op.reshape(qkv, newshape=(0, 0, num_heads, 3, -1))
-    q_proj = _op.take(qkv, _expr.const(0, "int32"), axis=3)
-    q_proj = _op.transpose(q_proj, axes=[1, 2, 0, 3])
-    q_proj = _op.reverse_reshape(q_proj, newshape=(-1, 0, 0))
-    q_proj = _mx_contrib_div_sqrt_dim([q_proj], None)
-    k_proj = _op.take(qkv, _expr.const(1, "int32"), axis=3)
-    k_proj = _op.transpose(k_proj, axes=[1, 2, 0, 3])
-    k_proj = _op.reverse_reshape(k_proj, newshape=(-1, 0, 0))
-    ret = _op.nn.batch_matmul(q_proj, k_proj)
-    return ret
-
-
-def _mx_contrib_interleaved_matmul_selfatt_valatt(inputs, attrs):
-    """
-    tmp = mx.nd.reshape(queries_keys_values, shape=(0, 0, num_heads, 3, -1))
-    v_proj = mx.nd.transpose(tmp[:,:,:,2,:], axes=(1, 2, 0, 3))
-    v_proj = mx.nd.reshape(v_proj, shape=(-1, 0, 0), reverse=True)
-    output = mx.nd.batch_dot(attention, v_proj)
-    output = mx.nd.reshape(output, shape=(-1, num_heads, 0, 0), reverse=True)
-    output = mx.nd.transpose(output, axes=(2, 0, 1, 3))
-    output = mx.nd.reshape(output, shape=(0, 0, -1))
-    """
-    assert len(inputs) == 2
-    qkv, att = inputs
-    num_heads = attrs.get_int("heads")
-    qkv = _op.reshape(qkv, newshape=(0, 0, num_heads, 3, -1))
-    v_proj = _op.take(qkv, _expr.const(2, "int32"), axis=3)
-    v_proj = _op.transpose(v_proj, axes=(1, 2, 0, 3))
-    v_proj = _op.reverse_reshape(v_proj, newshape=(-1, 0, 0))
-    v_proj = _op.transpose(v_proj, axes=[0, 2, 1])
-    out = _op.nn.batch_matmul(att, v_proj)
-    out = _op.reverse_reshape(out, newshape=(-1, num_heads, 0, 0))
-    out = _op.transpose(out, axes=(2, 0, 1, 3))
-    out = _op.reshape(out, newshape=(0, 0, -1))
-    return out
-
-
-def _mx_cond(inputs, attrs, subgraphs):
-    assert len(subgraphs) == 3
-    cond_input_locs = json.loads(attrs.get_str("cond_input_locs"))
-    then_input_locs = json.loads(attrs.get_str("then_input_locs"))
-    else_input_locs = json.loads(attrs.get_str("else_input_locs"))
-    num_outputs = attrs.get_int("num_outputs")
-
-    input_args = []
-    for i, arg in enumerate(inputs):
-        var = _expr.var(f"arg{i}", _infer_type(arg).checked_type)
-        input_args.append(var)
-    cond_args = [input_args[i] for i in cond_input_locs]
-    then_args = [input_args[i] for i in then_input_locs]
-    else_args = [input_args[i] for i in else_input_locs]
-
-    cond_arg_shapes = [arg.type_annotation.shape for arg in cond_args]
-    cond_arg_dtype_info = [arg.type_annotation.dtype for arg in cond_args]
-    cond_func = _from_mxnet_impl(subgraphs[0], cond_arg_shapes, cond_arg_dtype_info)
-    cond = _expr.Call(cond_func, cond_args).astype("bool")
-    cond_shape = get_const_tuple(_infer_type(cond).checked_type.shape)
-    if len(cond_shape) > 0:
-        assert len(cond_shape) == 1 and cond_shape[0] == 1, "Condition is not scalar"
-        cond = _op.take(cond, _expr.const(1, "int"))
-
-    sb = _scope_builder.ScopeBuilder()
-    with sb.if_scope(cond):
-        then_arg_shapes = [arg.type_annotation.shape for arg in then_args]
-        then_arg_dtype_info = [arg.type_annotation.dtype for arg in then_args]
-        then_func = _from_mxnet_impl(subgraphs[1], then_arg_shapes, then_arg_dtype_info)
-        sb.ret(_expr.Call(then_func, then_args))
-    with sb.else_scope():
-        else_arg_shapes = [arg.type_annotation.shape for arg in else_args]
-        else_arg_dtype_info = [arg.type_annotation.dtype for arg in else_args]
-        else_func = _from_mxnet_impl(subgraphs[2], else_arg_shapes, else_arg_dtype_info)
-        sb.ret(_expr.Call(else_func, else_args))
-    func = _function.Function(input_args, sb.get())
-    ret = _expr.Call(func, inputs)
-    if num_outputs > 1:
-        ret = _expr.TupleWrapper(ret, num_outputs)
-    return ret
-
-
-def _qnn_contrib_concat(inputs, attrs):
-    axis = attrs.get_int("dim", 1)
-    num_args = attrs.get_int("num_args", -1)
-    assert num_args > 0
-
-    input_exprs = inputs[0:num_args]
-
-    min_start_idx = num_args
-    max_start_idx = num_args + 1
-
-    mins = list()
-    maxs = list()
-
-    for i in range(min_start_idx, len(inputs), 2):
-        mins.append(inputs[i])
-
-    for i in range(max_start_idx, len(inputs), 2):
-        maxs.append(inputs[i])
-
-    # Check if all the input tensors have same qnn params.
-    if len(set(mins)) == 1 and len(set(maxs)) == 1:
-        output_min = mins[0]
-        output_max = maxs[0]
-        concat = _op.concatenate(tuple(input_exprs), axis=axis)
-        return concat, output_min, output_max
-    else:
-        # Get all dtypes. Find input and output scales, call concatenate.
-        dtypes = [_infer_type(x).checked_type.dtype for x in input_exprs]
-        assert all(
-            [x == "uint8" for x in dtypes]
-        ), "Current support is limited to uint8 inputs only."
-        new_min = min(mins)
-        new_max = max(maxs)
-        assert new_min == 0
-
-        output_scale = get_mkldnn_uint8_scale(new_min, new_max)
-        min_max = zip(mins, maxs)
-        input_scales = [get_mkldnn_uint8_scale(x, y) for (x, y) in min_max]
-        input_zeros = [0] * len(input_scales)
-        output_zero = 0
-
-        input_scales_expr = [relay.const(x, "float32") for x in input_scales]
-        input_zeros_expr = [relay.const(x, "int32") for x in input_zeros]
-
-        output_scale_expr = relay.const(output_scale, "float32")
-        output_zero_expr = relay.const(output_zero, "int32")
-
-        res = relay.qnn.op.concatenate(
-            input_exprs,
-            input_scales_expr,
-            input_zeros_expr,
-            output_scale_expr,
-            output_zero_expr,
-            axis=axis,
-        )
-        return res, new_min, new_max
-
-
-def _qnn_quantize(inputs, attrs):
-    out_dtype = "int8"
-    out_type = attrs.get_str("out_type")
-    if out_type == "auto":
-        if attrs.has_attr("min_calib_range") and attrs.has_attr("max_calib_range"):
-            if attrs.get_float("min_calib_range") >= 0:
-                out_dtype = "uint8"
-            else:
-                out_dtype = "int8"
-    else:
-        out_dtype = out_type
-    if out_dtype not in {"int8", "uint8"}:
-        raise ValueError(f"Unsupported out_dtype: {out_dtype}")
-    min_calib_range = attrs.get_float("min_calib_range", 0.0)
-    max_calib_range = attrs.get_float("max_calib_range", 0.0)
-    quantized_output, _, _ = quantize_mxnet_min_max(
-        inputs[0], min_range=min_calib_range, max_range=max_calib_range, out_dtype=out_dtype
-    )
-    return quantized_output, min_calib_range, max_calib_range
-
-
-def _qnn_contrib_quantized_fifo_buffer(inputs, attrs, params):
-    data = inputs[0]
-    buffer = inputs[1]
-    min_calib_range = inputs[2]
-    max_calib_range = inputs[3]
-    data_dtype = _infer_type(data).checked_type.dtype
-    buffer_shape = _infer_shape(buffer)
-    buffer_name = _get_name(buffer)
-    params[buffer_name] = _nd.array(np.zeros(buffer_shape).astype(data_dtype))
-    new_buffer = relay.var(buffer_name, relay.TensorType(buffer_shape, data_dtype))
-    inputs[1] = new_buffer
-    res = _op.nn.fifo_buffer(data=data, buffer=new_buffer, axis=attrs.get_int("axis"))
-    return res, min_calib_range, max_calib_range
-
-
-def _get_subgraph_op(subgraphs, op_name):
-    assert len(subgraphs) == 1, f"Subgraph should have 1 node but has {len(subgraphs)}"
-    subgraph = subgraphs[0]
-    nodes = subgraph["nodes"]
-    assert nodes is not None
-    for node in nodes:
-        if node["op"] == op_name:
-            return node
-    raise ValueError(f"Op {op_name} was not found in the subgraph")
-
-
-def _qnn_conv(inputs, attrs, subgraphs, params):
-    def _has_fused_activation(_attrs, _supported_activations):
-        has_fused_activation = False
-        if attrs.get_bool("with_act", False) or attrs.get_bool("with_postsum_act", False):
-            subgraph_activation_attrs = _get_subgraph_op(subgraphs, "Activation")["attrs"]
-            act_type = subgraph_activation_attrs["act_type"]
-            if act_type not in _supported_activations:
-                raise ValueError(f"Fused activation {act_type} is not supported at this time")
-            has_fused_activation = True
-        return has_fused_activation
-
-    def _get_data_scale_and_zp(_data, _inputs, _data_min_idx, _data_max_idx):
-        """Finds the Qnn params for the data expr."""
-        data_min = _inputs[_data_min_idx]
-        data_max = _inputs[_data_max_idx]
-        assert data_min <= data_max
-        data_dtype = _infer_type(_data).checked_type.dtype
-        assert data_dtype in {"int8", "uint8"}
-        if data_min < 0.0:
-            assert (
-                data_dtype == "int8"
-            ), "Expect int8 when data_min < 0.0, consider quantize model with int8."
-        _data_scale = (
-            get_mkldnn_uint8_scale(data_min, data_max)
-            if data_dtype == "uint8"
-            else get_mkldnn_int8_scale(data_min, data_max)
-        )
-        _data_zero_point = 0
-        return _data_scale, _data_zero_point
-
-    def _get_bn_alpha_coeff(_bn_gamma_idx, _bn_beta_idx, _bn_running_mean_idx, _bn_running_var_idx):
-        """Extract the BN coeff. These will be use later for BN folding into convolution."""
-        # Extract relevant attrs from bn.
-        bn_attrs = _get_subgraph_op(subgraphs, "BatchNorm")["attrs"]
-        bn_epsilon_param = float(bn_attrs["eps"])
-        bn_scale_param = bn_attrs["fix_gamma"] == "False"
-        bn_center_param = True
-
-        # Extract the relevant relay expressions.
-        bn_running_var = inputs[_bn_running_var_idx]
-        bn_gamma = inputs[_bn_gamma_idx]
-        bn_beta = inputs[_bn_beta_idx]
-        bn_running_mean = inputs[_bn_running_mean_idx]
-
-        # Get coefficient to multiply to weights.
-        bn_epsilon = relay.const(bn_epsilon_param, "float32")
-        denominator = relay.sqrt(relay.add(bn_running_var, bn_epsilon))
-        _bn_scale = relay.divide(relay.const(1.0, "float32"), denominator)
-        if bn_scale_param:
-            _bn_scale = relay.multiply(bn_gamma, _bn_scale)
-
-        # Get the shift.
-        _bn_shift = relay.negative(relay.multiply(bn_running_mean, _bn_scale))
-        if bn_center_param:
-            _bn_shift = relay.add(bn_beta, _bn_shift)
-
-        return _bn_scale, _bn_shift
-
-    def _fold_bn(_bn_scale, _bn_shift, _has_bias, _has_bn):
-        """Fold BN into kernel and bias. Get new kernel and bias."""
-        _kernel = inputs[1]
-        if _bn_scale:
-            assert attrs.get_bool("with_bn", False)
-            # Weights are on OIHW, and _bn_scale is in O.
-            exp_bn_scale = relay.expand_dims(_bn_scale, axis=1, num_newaxis=3)
-            _kernel = relay.multiply(exp_bn_scale, _kernel)
-
-        _bias = None
-        if _has_bias:
-            _bias = inputs[2]
-            if _has_bn:
-                assert _bn_shift is not None
-                assert _bn_scale is not None
-                _bias = relay.add(relay.multiply(_bn_scale, _bias), _bn_shift)
-        elif _has_bn:
-            assert _bn_shift is not None
-            assert _bn_scale is not None
-            _bias = _bn_shift
-        return _kernel, _bias
-
-    def _get_quantized_kernel(_kernel, _bias, _data_scale):
-        # For quantizing, we need min/max of kernel. So, we have to pre compute this expr.
-        np_kernel = _infer_value(_kernel, params).numpy()
-        kernel_channel_min = np.amin(np_kernel, axis=(1, 2, 3))
-        kernel_channel_max = np.amax(np_kernel, axis=(1, 2, 3))
-
-        np_bias = None
-        if _bias is not None:
-            np_bias = _infer_value(_bias, params).numpy()
-        return quantize_conv_weights_bias_channel_mkldnn_from_var(
-            _kernel, np_bias, kernel_channel_min, kernel_channel_max, _data_scale
-        )
-
-    def _get_qnn_conv2d(
-        _data,
-        _kernel,
-        _data_zero_point,
-        _kernel_zero_point,
-        _data_scale,
-        _kernel_vector_scale,
-        _conv2d_attrs,
-    ):
-        return relay.qnn.op.conv2d(
-            _data,
-            _kernel,
-            input_zero_point=relay.const(_data_zero_point, "int32"),
-            kernel_zero_point=relay.const(_kernel_zero_point, "int32"),
-            input_scale=relay.const(_data_scale, "float32"),
-            kernel_scale=relay.const(_kernel_vector_scale),
-            channels=_conv2d_attrs["channels"],
-            groups=_conv2d_attrs["groups"],
-            kernel_size=_conv2d_attrs["kernel_size"],
-            strides=_conv2d_attrs["strides"],
-            dilation=_conv2d_attrs["dilation"],
-            padding=_conv2d_attrs["padding"],
-            data_layout=_conv2d_attrs["data_layout"],
-            kernel_layout=_conv2d_attrs["kernel_layout"],
-        )
-
-    def _get_requantized_op(_res, _input_scale, _output_scale, _out_dtype):
-        # Requantize to get the output back
-        return relay.qnn.op.requantize(
-            _res,
-            input_scale=relay.const(_input_scale),
-            input_zero_point=relay.const(0, "int32"),
-            output_scale=relay.const(_output_scale, "float32"),
-            output_zero_point=relay.const(0, "int32"),
-            axis=1,
-            out_dtype=_out_dtype,
-        )
-
-    def _get_sum(_res, _output_scale, out_dtype):
-        """Handles sum of the second quantized tensor."""
-        # This is done in following steps
-        #   1) rhs is the add's second operand. First rhs will be requantized to output scale with
-        #   dtype int32. The int32 dtype is to keep precision high before adding.
-        #   2) Call normal add
-        #   3) Depending on final out_dtype, clip and cast (basically requantize).
-
-        _output_scale = relay.const(_output_scale, "float32")
-        data_sum = inputs[-5]
-        data_sum_min = inputs[-2]
-        data_sum_max = inputs[-1]
-
-        data_sum_dtype = _infer_type(data_sum).checked_type.dtype
-        data_sum_scale = (
-            get_mkldnn_uint8_scale(data_sum_min, data_sum_max)
-            if data_sum_dtype == "uint8"
-            else get_mkldnn_int8_scale(data_sum_min, data_sum_max)
-        )
-        data_sum_scale = relay.const(data_sum_scale, "float32")
-        zero_point = relay.const(0, "int32")
-
-        # Save one requantize if the previous expr already has a requantize node. This also improves
-        # little bit with accuracy.
-        if isinstance(data_sum, _expr.Call) and data_sum.op.name == "qnn.requantize":
-            prev_input, prev_scale, prev_zero_point = data_sum.args[0:3]
-            prev_axis = data_sum.attrs.axis
-            data_sum = relay.qnn.op.requantize(
-                prev_input,
-                input_scale=prev_scale,
-                input_zero_point=prev_zero_point,
-                output_scale=_output_scale,
-                output_zero_point=zero_point,
-                axis=prev_axis,
-                out_dtype="int32",
-            )
-        else:
-            data_sum = relay.qnn.op.requantize(
-                data_sum,
-                input_scale=data_sum_scale,
-                input_zero_point=zero_point,
-                output_scale=_output_scale,
-                output_zero_point=zero_point,
-                out_dtype="int32",
-            )
-
-        # 2) Add two int32 tensors.
-        _res = relay.add(_res, data_sum)
-
-        # 3) Clip/cast to change the out dtype.
-        _res = relay.clip(
-            _res,
-            a_min=float(tvm.tir.op.min_value(out_dtype).value),
-            a_max=float(tvm.tir.op.max_value(out_dtype).value),
-        )
-        _res = relay.cast(_res, out_dtype)
-        return _res
-
-    def _parse():
-        assert len(subgraphs) == 1
-        subgraph_conv_attrs = StrAttrsDict(_get_subgraph_op(subgraphs, "Convolution")["attrs"])
-
-        is_quantized = attrs.get_bool("quantized", False)
-        if is_quantized:
-            # The MKLDNN has a quantized convolution subgraph. There are many different arguments
-            # that are taken into account to parse the subgraph.
-            #   * no_bias
-            #   * with_sum
-            #   * with_bn
-            #   * with_postsum_relu
-            #   * with_act
-            #
-            # Note - Relu/clip handling is not required because output min/max take care of that.
-            #
-            # The parsing can be broken down into following steps
-            #   1) Get the input data scale and zero points.
-            #   2) Extract BN params.
-            #   3) Fold the BN params into kernel and bias.
-            #   4) Quantize the kernel.
-            #   4) Call QNN conv2d op.
-            #   5) Quantize bias and call bias_add.
-            #   6) Handle sum of quantized tensors if needed. Or just Requantize.
-
-            has_bias = not subgraph_conv_attrs.get_bool("no_bias", False)
-            has_sum = attrs.get_bool("with_sum", False)
-            has_bn = attrs.get_bool("with_bn", False)
-
-            ###############################################
-            #   1) Get the input data scale and zero point.
-            ###############################################
-            # Last 2 indexes are data min and max. If the conv has a sum, then last 2 indexes are
-            # for the second tensor. So, the data min max indexes are last 3 and 4
-            data_min_idx = -2
-            data_max_idx = -1
-            if has_sum:
-                data_min_idx = -4
-                data_max_idx = -3
-
-            data = inputs[0]
-            data_scale, data_zero_point = _get_data_scale_and_zp(
-                data, inputs, data_min_idx, data_max_idx
-            )
-
-            #############################
-            #   2) Extract the BN params.
-            #############################
-            # Find the indexes to look at for BN.
-            bn_scale = bn_shift = None
-            if has_bn:
-                if has_bias:
-                    bn_start_idx = 3
-                else:
-                    bn_start_idx = 2
-
-                bn_gamma_idx = bn_start_idx
-                bn_beta_idx = bn_start_idx + 1
-                bn_running_mean_idx = bn_start_idx + 2
-                bn_running_var_idx = bn_start_idx + 3
-
-                bn_scale, bn_shift = _get_bn_alpha_coeff(
-                    bn_gamma_idx, bn_beta_idx, bn_running_mean_idx, bn_running_var_idx
-                )
-
-            ########################################
-            #   3) Fold the BN into kernel and bias.
-            ########################################
-            kernel, bias = _fold_bn(bn_scale, bn_shift, has_bias, has_bn)
-
-            #######################################################################
-            #   4) Fold BN params into kernel. Get quantized kernel and QNN params.
-            #######################################################################
-            kernel, kernel_vector_scale, kernel_zero_point = _get_quantized_kernel(
-                kernel, bias, data_scale
-            )
-
-            ##########################
-            #   5) Call QNN conv2d op.
-            ##########################
-            conv2d_attrs = _get_mx_conv2d_attrs(subgraph_conv_attrs)
-            res = _get_qnn_conv2d(
-                data,
-                kernel,
-                data_zero_point,
-                kernel_zero_point,
-                data_scale,
-                kernel_vector_scale,
-                conv2d_attrs,
-            )
-
-            ###############################################
-            #   6) Fold BN params into bias. Call bias_add.
-            ###############################################
-            if has_bias or has_bn:
-                bias_scale = data_scale * kernel_vector_scale
-                int32_bias = quantize_conv_bias_mkldnn_from_var(bias, bias_scale)
-                res = _op.nn.bias_add(res, int32_bias, axis=1)
-
-            #####################################################################
-            #   7) Handle sum of quantized tensors if needed. Or just Requantize.
-            #####################################################################
-            min_output_range = attrs.get_float("min_calib_range")
-            max_output_range = attrs.get_float("max_calib_range")
-            output_scale, out_dtype = get_conv_mkldnn_requantized_scale_outDtype(
-                min_output_range, max_output_range
-            )
-
-            # QNN conv2d output scale is product of data_scale and kernel_vector_scale
-            input_scale = data_scale * kernel_vector_scale
-            if attrs.get_bool("with_sum", False):
-                # There is a second tensor that has to be added to the QNN conv2d output. Therefore,
-                # the QNN conv2d is first requantized to output scale with int32 precision. The
-                # second tensor will also be requantized to output scale with int32 precision,
-                # followed by an add operator.
-                res = _get_requantized_op(res, input_scale, output_scale, "int32")
-                res = _get_sum(res, output_scale, out_dtype)
-            else:
-                # Get the requantized conv output
-                res = _get_requantized_op(res, input_scale, output_scale, out_dtype)
-
-            return res, min_output_range, max_output_range
-        else:
-            res = _mx_conv(inputs, subgraph_conv_attrs)
-            has_fused_relu = _has_fused_activation(attrs, ["relu"])
-            if has_fused_relu:
-                res = _op.nn.relu(res)
-            return res
-
-    return _parse()
-
-
-def _qnn_flatten(inputs, attrs):
-    # pylint: disable=unused-argument
-    data = inputs[0]
-    output_min = inputs[1]
-    output_max = inputs[2]
-    output = _op.nn.batch_flatten(data)
-    return output, output_min, output_max
-
-
-def _qnn_dequantize(inputs, attrs):
-    # pylint: disable=unused-argument
-    data = inputs[0]
-    input_min = inputs[1]
-    input_max = inputs[2]
-    in_dtype = _infer_type(data).checked_type.dtype
-    result = dequantize_mxnet_min_max(data, input_min, input_max, in_dtype)
-    return result
-
-
-def _qnn_activation(inputs, attrs):
-    act_type = attrs.get_str("act_type")
-    assert len(inputs) == 3
-    assert act_type == "relu", "Currently only relu is supported"
-    data = inputs[0]
-    range_min = inputs[1]
-    range_max = inputs[2]
-    res = _op.nn.relu(data)
-    return res, range_min, range_max
-
-
-def _qnn_pooling(inputs, attrs):
-    input_min = inputs[1]
-    input_max = inputs[2]
-    data = inputs[0]
-    data_dtype = _infer_type(data).checked_type.dtype
-    pool_type = attrs.get_str("pool_type")
-    if data_dtype in ("int8", "uint8") and pool_type != "max":
-        data = _op.cast(data, "int32")
-    res = _mx_pooling([data, input_min, input_max], attrs)
-    if data_dtype in ("int8", "uint8") and pool_type != "max":
-        res = _op.cast(res, data_dtype)
-    return res, input_min, input_max
-
-
-def _qnn_batch_norm(inputs, attrs):
-    # Perform batch norm in FP32
-    data = inputs[0]
-
-    # Dequantize the data.
-    data_min_idx, data_max_idx = (-2, -1)
-    data_min, data_max = inputs[data_min_idx], inputs[data_max_idx]
-    data_dtype = _infer_type(data).checked_type.dtype
-    data_scale = (
-        get_mkldnn_uint8_scale(data_min, data_max)
-        if data_dtype == "uint8"
-        else get_mkldnn_int8_scale(data_min, data_max)
-    )
-    data_zp = 0
-    data = relay.qnn.op.dequantize(
-        data, relay.const(data_scale, "float32"), relay.const(data_zp, "int32")
-    )
-
-    # Run BN. The last 4 inputs are same as before.
-    new_inputs = [data, *inputs[1:5]]
-    res = _mx_batch_norm(new_inputs, attrs)
-
-    # Quantize the result
-    min_output_range = attrs.get_float("min_calib_range")
-    max_output_range = attrs.get_float("max_calib_range")
-    output_scale, out_dtype = get_conv_mkldnn_requantized_scale_outDtype(
-        min_output_range, max_output_range
-    )
-    res = relay.qnn.op.quantize(
-        res[0], relay.const(output_scale, "float32"), relay.const(0, "int32"), out_dtype=out_dtype
-    )
-    return res, min_output_range, max_output_range
-
-
-def _qnn_fully_connected(inputs, attrs, subgraphs, params):
-    def _get_input_scale_zp(_data_dtype, _inputs, _has_bias):
-        data_min_idx, data_max_idx = (3, 4) if _has_bias else (2, 3)
-        data_min, data_max = _inputs[data_min_idx], _inputs[data_max_idx]
-        _data_scale = (
-            get_mkldnn_uint8_scale(data_min, data_max)
-            if _data_dtype == "uint8"
-            else get_mkldnn_int8_scale(data_min, data_max)
-        )
-        _data_zp = 0
-        return _data_scale, _data_zp
-
-    def _get_kernel_scale_zp_tensor_quantized(_kernel, _inputs, _has_bias):
-        kernel_dtype = _infer_type(_kernel).checked_type.dtype
-
-        if kernel_dtype != "int8":
-            raise tvm.error.OpNotImplemented(
-                "Tensor wise quantized expects weights in int8 data type"
-            )
-
-        if isinstance(_kernel, tvm.relay.Call) and _kernel.op.name == "qnn.quantize":
-            _kernel_scale = _kernel.args[1].data.numpy()
-            _kernel_zp = _kernel.args[2].data.numpy()
-            return _kernel_scale, _kernel_zp
-
-        kernel_min_idx, kernel_max_idx = (5, 6) if _has_bias else (4, 5)
-        kernel_min_name = _get_name(_inputs[kernel_min_idx])
-        kernel_min = params[kernel_min_name].numpy()[0]
-        kernel_max_name = _get_name(_inputs[kernel_max_idx])
-        kernel_max = params[kernel_max_name].numpy()[0]
-        _kernel_scale = (
-            get_mkldnn_uint8_scale(kernel_min, kernel_max)
-            if kernel_dtype == "uint8"
-            else get_mkldnn_int8_scale(kernel_min, kernel_max)
-        )
-        _kernel_zp = 0
-        return _kernel_scale, _kernel_zp
-
-    def _get_kernel_scale_zp_channel_quantized(_kernel, _bias, _data_scale):
-        kernel_dtype = _infer_type(_kernel).checked_type.dtype
-        if kernel_dtype != "float32":
-            raise tvm.error.OpNotImplemented(
-                "Channel wise quantized expects weights in float32 data type"
-            )
-
-        # Get the FP32 values, calculate min/max and then channel quantize them
-        np_kernel = _infer_value(_kernel, params).numpy()
-        kernel_channel_min = np.amin(np_kernel, axis=(1,))
-        kernel_channel_max = np.amax(np_kernel, axis=(1,))
-
-        np_bias = None
-        if _bias is not None:
-            np_bias = _infer_value(_bias, params).numpy()
-        return quantize_conv_weights_bias_channel_mkldnn_from_var(
-            _kernel, np_bias, kernel_channel_min, kernel_channel_max, _data_scale
-        )
-
-    def _get_bias_requantize_scale(_inputs, _data_scale, _kernel_scale):
-        _bias = _inputs[2]
-        if isinstance(_bias, tvm.relay.Call) and _bias.op.name == "qnn.quantize":
-            _bias_scale = _bias.args[1].data.numpy()
-            _bias_requantize_scale = _bias_scale / (_data_scale * _kernel_scale)
-            _bias_requantize_scale = _expr.const(_bias_requantize_scale, dtype="float32")
-            return _bias_requantize_scale
-
-        bias_min_name = _get_name(_inputs[7])
-        bias_min = params[bias_min_name].numpy()[0]
-        bias_max_name = _get_name(_inputs[8])
-        bias_max = params[bias_max_name].numpy()[0]
-        bias_scale = get_mkldnn_int8_scale(bias_min, bias_max)
-        _bias_requantize_scale = bias_scale / (_data_scale * _kernel_scale)
-        _bias_requantize_scale = _expr.const(_bias_requantize_scale, dtype="float32")
-        return _bias_requantize_scale
-
-    is_quantized = attrs.get_bool("quantized", False)
-    with_relu = attrs.get_bool("with_relu", False)
-    subgraph_dense_attrs = StrAttrsDict(_get_subgraph_op(subgraphs, "FullyConnected")["attrs"])
-    if not is_quantized:
-        res = _mx_fully_connected(inputs, subgraph_dense_attrs)
-        if with_relu:
-            res = _op.nn.relu(res)
-        return res
-    else:
-        has_bias = not subgraph_dense_attrs.get_bool("no_bias", False)
-        units = subgraph_dense_attrs.get_int("num_hidden")
-        is_flatten = subgraph_dense_attrs.get_bool("flatten", True)
-        enable_float_output = attrs.get_bool("enable_float_output", False)
-        is_channel_quantized = attrs.get_bool("channel_wise_quantize", False)
-
-        ########################
-        # Get data, kernel, bias
-        ########################
-        data, kernel = inputs[0], inputs[1]
-        bias = None
-        if has_bias:
-            bias = inputs[2]
-
-        ##############################
-        # Handle for shape of data > 2
-        ##############################
-        if is_flatten:
-            data = _op.nn.batch_flatten(data)
-        data_shape = _infer_type(data).checked_type.shape
-        if len(data_shape) > 2:
-            data = _op.reverse_reshape(data, [-1, 0])
-
-        ###############################
-        # Get data scale and zero point
-        ###############################
-        data_dtype = _infer_type(data).checked_type.dtype
-        data_scale, data_zp = _get_input_scale_zp(data_dtype, inputs, has_bias)
-
-        #################################
-        # Get weight scale and zero point
-        #################################
-        if is_channel_quantized:
-            kernel, kernel_scale, kernel_zp = _get_kernel_scale_zp_channel_quantized(
-                kernel, bias, data_scale
-            )
-        else:
-            kernel_scale, kernel_zp = _get_kernel_scale_zp_tensor_quantized(
-                kernel, inputs, has_bias
-            )
-
-        ################
-        # Call QNN dense
-        ################
-        res = relay.qnn.op.dense(
-            data,
-            kernel,
-            input_zero_point=relay.const(data_zp, "int32"),
-            kernel_zero_point=relay.const(kernel_zp, "int32"),
-            input_scale=relay.const(data_scale, "float32"),
-            kernel_scale=relay.const(kernel_scale, "float32"),
-            units=units,
-        )
-
-        #################
-        # Handle bias add
-        #################
-        if has_bias:
-            if is_channel_quantized:
-                bias_scale = data_scale * kernel_scale
-                int32_bias = quantize_conv_bias_mkldnn_from_var(bias, bias_scale)
-                res = _op.nn.bias_add(res, int32_bias, axis=-1)
-            else:
-                bias_data = inputs[2]
-                bias_requantize_scale = _get_bias_requantize_scale(inputs, data_scale, kernel_scale)
-                multiplied_bias = _op.multiply(
-                    _op.cast(bias_data, "float32"), bias_requantize_scale
-                )
-                rounded_bias = _op.round(multiplied_bias)
-                clipped_bias = _op.clip(
-                    rounded_bias,
-                    a_min=tvm.tir.op.min_value("int32").value,
-                    a_max=tvm.tir.op.max_value("int32").value,
-                )
-                requantized_bias = _op.cast(clipped_bias, "int32")
-                res = _op.nn.bias_add(res, requantized_bias, axis=-1)
-
-        ##############################################
-        # Dequantize if float32 output else Requantize
-        ##############################################
-        if enable_float_output:
-            output_scale = np.float32(data_scale * kernel_scale)
-            res = relay.qnn.op.dequantize(
-                res, relay.const(output_scale), input_zero_point=relay.const(0, "int32"), axis=1
-            )
-            if with_relu:
-                res = _op.nn.relu(res)
-        else:
-
-            if is_channel_quantized:
-                raise tvm.error.OpNotImplemented(
-                    "Channel wise quantized dense with non float output is not supported yet"
-                )
-            out_dtype = "uint8" if attrs.get_bool("with_relu", False) else "int8"
-            input_scale = np.float32(data_scale * kernel_scale)
-            min_output_range = attrs.get_float("min_calib_range")
-            max_output_range = attrs.get_float("max_calib_range")
-            output_scale = get_mkldnn_requantize_scale_outDtype(
-                min_output_range, max_output_range, out_dtype
-            )
-            res = relay.qnn.op.requantize(
-                res,
-                input_scale=relay.const(input_scale, "float32"),
-                input_zero_point=relay.const(0, "int32"),
-                output_scale=relay.const(output_scale, "float32"),
-                output_zero_point=relay.const(0, "int32"),
-                out_dtype=out_dtype,
-            )
-            if with_relu:
-                res = _op.nn.relu(res)
-
-        ##############################
-        # Handle for shape of data > 2
-        ##############################
-        if len(data_shape) > 2:
-            new_shape = data_shape[:-1]
-            new_shape.append(units)
-            res = _op.reshape(res, new_shape)
-
-        if enable_float_output:
-            return res
-        return res, min_output_range, max_output_range
-
-
-def _mx_broadcast_to(inputs, attrs):
-    data = inputs[0]
-    tgt_shape = attrs.get_int_tuple("shape", [])
-
-    return _op.broadcast_to(data, tgt_shape)
-
-
-def _mx_broadcast_like(inputs, attrs):
-    assert len(inputs) == 2
-    for axes in ["lhs_axes", "rhs_axes"]:
-        if axes in attrs.attrs:
-            raise tvm.error.OpAttributeUnImplemented(
-                f'Attribute "{axes}" is not supported for operator broadcast_like.'
-            )
-    return _op.broadcast_to_like(*inputs)
-
-
-def _mx_logical_not(inputs, input_types):
-    data = inputs[0]
-    dtype = _infer_type(data).checked_type.dtype
-    data = _op.cast(data, "bool") if dtype != "bool" else data
-
-    return _op.cast(_op.logical_not(data), dtype)
-
-
-def _mx_broadcast_logical(logical_op):
-    def impl(inputs, input_types):
-        lhs_type = _infer_type(inputs[0]).checked_type.dtype
-        rhs_type = _infer_type(inputs[1]).checked_type.dtype
-        lhs = _op.cast(inputs[0], "bool") if lhs_type != "bool" else inputs[0]
-        rhs = _op.cast(inputs[1], "bool") if rhs_type != "bool" else inputs[1]
-
-        return _op.cast(logical_op(lhs, rhs), lhs_type)
-
-    return impl
-
-
-def _mx_npi_transpose(inputs, attrs):
-    axes = attrs.get_int_tuple("axes", None)
-    # translate default case
-    axes = None if len(axes) == 0 or axes[0] is None else axes
-    return _op.transpose(inputs[0], axes=axes)
-
-
-def _mx_npi_pad(inputs, attrs):
-    pad_mode = attrs.get_str("mode", None)
-    if pad_mode is None:
-        raise tvm.error.OpAttributeRequired('Attribute "mode" not found in operator pad.')
-    if pad_mode not in ["constant", "edge", "reflect"]:
-        raise tvm.error.OpAttributeInvalid("Value " + mode + ' in attribute "mode" is not valid')
-    if "pad_width" not in attrs.attrs:
-        raise tvm.error.OpAttributeRequired('Attribute "pad_width" not found in operator pad.')
-    # Begin to parse tuple of tuple, we cannot use get_int_tuple here because it's a tuple of tuple.
-    pad_width = attrs.attrs["pad_width"]
-    pad_width = pad_width.replace("(", "[")
-    pad_width = pad_width.replace(")", "]")
-    pad_width = json.loads(pad_width)
-    constant_values = attrs.get_float("constant_values", 0.0)
-    return _op.nn.pad(
-        data=inputs[0], pad_width=pad_width, pad_value=constant_values, pad_mode=pad_mode
-    )
-
-
-def _mx_npi_concatenate(inputs, attrs):
-    axis = attrs.get_str("axis", "0")
-    if axis == "None":
-        return _op.reshape(_op.concatenate(tuple(inputs), axis=0), (-1,))
-    else:
-        return _op.concatenate(tuple(inputs), axis=int(axis))
-
-
-def _mx_npi_stack(inputs, attrs):
-    axis = attrs.get_str("axis", "0")
-    if axis == "None":
-        return _op.reshape(_op.stack(tuple(inputs), axis=0), (-1,))
-    else:
-        return _op.stack(tuple(inputs), axis=int(axis))
-
-
-def _mx_npx_reshape(inputs, attrs):
-    shape = attrs.get_int_tuple("newshape")
-    reverse = attrs.get_bool("reverse", False)
-    shape_list = list(shape)
-    old_shape = get_const_tuple(_infer_type(inputs[0]).checked_type.shape)
-    new_shape = []
-    if reverse:
-        old_shape = old_shape[::-1]
-        shape_list = shape_list[::-1]
-    ptr = 0
-    unknown_axis = None
-    src_ptr = 0
-    while src_ptr < len(shape_list):
-        ele = shape_list[src_ptr]
-        src_ptr += 1
-        if ele > 0:
-            new_shape.append(ele)
-            ptr += 1
-        elif ele == -1:
-            new_shape.append(-1)
-            if unknown_axis is not None:
-                raise tvm.error.OpAttributeInvalid("Can only have one -1 in the input shape.")
-            unknown_axis = len(new_shape)
-            ptr += 1
-        elif ele == -2:
-            new_shape.append(old_shape[ptr])
-            ptr += 1
-        elif ele == -3:
-            if old_shape[ptr] != 1:
-                raise tvm.error.OpAttributeInvalid(
-                    f"Dimension of the original shape "
-                    f"that corresponds to -3 must be 1. Received"
-                    f" {old_shape[ptr]}"
-                )
-            ptr += 1
-        elif ele == -4:
-            new_shape += old_shape[ptr:]
-            break
-        elif ele == -5:
-            new_shape.append(old_shape[ptr] * old_shape[ptr + 1])
-            ptr += 2
-        elif ele == -6:
-            # Split axis
-            lhs = shape_list[src_ptr]
-            rhs = shape_list[src_ptr + 1]
-            src_ptr += 2
-            if lhs == -1 and rhs == -1:
-                raise tvm.error.OpAttributeInvalid("The lhs and rhs can not both be -1.")
-            if lhs == -1:
-                if old_shape[ptr] % rhs != 0:
-                    raise tvm.error.OpAttributeInvalid(
-                        "When splitting the axis, "
-                        "the dimension of the split axis must "
-                        "be divisible by the splitted values."
-                    )
-                lhs = old_shape[ptr] // rhs
-            if rhs == -1:
-                if old_shape[ptr] % lhs != 0:
-                    raise tvm.error.OpAttributeInvalid(
-                        "When splitting the axis, "
-                        "the dimension of the split axis must "
-                        "be divisible by the splitted values."
-                    )
-                rhs = old_shape[ptr] // lhs
-            new_shape.append(lhs)
-            new_shape.append(rhs)
-            ptr += 1
-        else:
-            raise tvm.error.OpAttributeInvalid(f"Shape dimension {ele} is not supported")
-    if reverse:
-        new_shape = new_shape[::-1]
-    return _op.reshape(inputs[0], newshape=new_shape)
-
-
-def _mx_split_v2(inputs, attrs):
-    axis = attrs.get_int("axis")
-    indices = list(attrs.get_int_tuple("indices", []))
-    # remove the prefix '0'
-    if len(indices) != 0 and indices[0] == 0:
-        indices.remove(0)
-    sections = attrs.get_int("sections", 0)
-    indices_or_sections = list(indices) if len(indices) != 0 else sections
-    res = _op.split(inputs[0], indices_or_sections=indices_or_sections, axis=axis)
-    if attrs.get_bool("squeeze_axis", False):
-        res = tuple([_op.squeeze(x, axis=[axis]) for x in res])
-    return res
-
-
-def _mx_npi_where_rscalar(inputs, attrs):
-    cond, dat = inputs
-    scalar = attrs.get_float("scalar")
-    cond_shape = get_const_tuple(_infer_type(cond).checked_type.shape)
-    dat_shape = get_const_tuple(_infer_type(dat).checked_type.shape)
-    dtype = _infer_type(dat).checked_type.dtype
-    # Check for broadcasting
-    out_shape = np.broadcast(np.empty(cond_shape), np.empty(dat_shape)).shape
-    if out_shape != cond_shape:
-        cond = _op.broadcast_to(cond, out_shape)
-    if out_shape != dat_shape:
-        dat = _op.broadcast_to(dat, out_shape)
-    scalar = _expr.const(scalar, dtype=dtype)
-    ones = _op.ones_like(dat)
-    scalar = _op.multiply(ones, scalar)
-    return _op.where(cond, dat, scalar)
-
-
-# Note: due to attribute conversion constraint
-# ops in the identity set must be attribute free
-_identity_list = [
-    "abs",
-    "log",
-    "exp",
-    "erf",
-    "sqrt",
-    "floor",
-    "ceil",
-    "round",
-    "trunc",
-    "sign",
-    "sigmoid",
-    "negative",
-    "reshape_like",
-    "zeros_like",
-    "ones_like",
-    "cos",
-    "cosh",
-    "sin",
-    "sinh",
-    "tan",
-    "tanh",
-    "where",
-]
-
-_convert_map = {
-    "_copy": _rename(_op.copy),
-    "relu": _rename(_op.nn.relu),
-    "broadcast_add": _rename(_op.add),
-    "broadcast_plus": _rename(_op.add),
-    "broadcast_sub": _rename(_op.subtract),
-    "broadcast_minus": _rename(_op.subtract),
-    "broadcast_mul": _rename(_op.multiply),
-    "broadcast_div": _rename(_op.divide),
-    "broadcast_mod": _rename(_op.mod),
-    "broadcast_maximum": _rename(_op.maximum),
-    "broadcast_minimum": _rename(_op.minimum),
-    "broadcast_power": _rename(_op.power),
-    "arccos": _rename(_op.acos),
-    "arcsin": _rename(_op.asin),
-    "arctan": _rename(_op.atan),
-    "arccosh": _rename(_op.acosh),
-    "arcsinh": _rename(_op.asinh),
-    "arctanh": _rename(_op.atanh),
-    "broadcast_equal": _mx_compare(_op.equal, _rename),
-    "broadcast_not_equal": _mx_compare(_op.not_equal, _rename),
-    "broadcast_greater": _mx_compare(_op.greater, _rename),
-    "broadcast_greater_equal": _mx_compare(_op.greater_equal, _rename),
-    "broadcast_lesser": _mx_compare(_op.less, _rename),
-    "broadcast_lesser_equal": _mx_compare(_op.less_equal, _rename),
-    "broadcast_logical_or": _mx_broadcast_logical(_op.logical_or),
-    "broadcast_logical_and": _mx_broadcast_logical(_op.logical_and),
-    "broadcast_logical_xor": _mx_broadcast_logical(_op.logical_xor),
-    "broadcast_to": _mx_broadcast_to,
-    "broadcast_like": _mx_broadcast_like,
-    "logical_not": _mx_logical_not,
-    "_equal": _mx_compare(_op.equal, _rename),
-    "_not_equal": _mx_compare(_op.not_equal, _rename),
-    "_greater": _mx_compare(_op.greater, _rename),
-    "_greater_equal": _mx_compare(_op.greater_equal, _rename),
-    "_lesser": _mx_compare(_op.less, _rename),
-    "_lesser_equal": _mx_compare(_op.less_equal, _rename),
-    "elemwise_add": _rename(_op.add),
-    "elemwise_sub": _rename(_op.subtract),
-    "elemwise_mul": _rename(_op.multiply),
-    "elemwise_div": _rename(_op.divide),
-    "_maximum": _rename(_op.maximum),
-    "_minimum": _rename(_op.minimum),
-    "flatten": _rename(_op.nn.batch_flatten),
-    "Flatten": _rename(_op.nn.batch_flatten),
-    # scalar power
-    "square": _mx_make_power(2),
-    "rsqrt": _mx_make_power(-1 / 2),
-    "cbrt": _mx_make_power(1 / 3),
-    "rcbrt": _mx_make_power(-1 / 3),
-    "__pow_scalar__": _binop_scalar(_op.power),
-    "_power_scalar": _binop_scalar(_op.power),
-    "__rsub_scalar__": _rbinop_scalar(_op.subtract),
-    "_rminus_scalar": _rbinop_scalar(_op.subtract),
-    "__rdiv_scalar__": _rbinop_scalar(_op.divide),
-    "_rdiv_scalar": _rbinop_scalar(_op.divide),
-    "__rpow_scalar__": _rbinop_scalar(_op.power),
-    # scalar op
-    "__add_scalar__": _binop_scalar(_op.add),
-    "_plus_scalar": _binop_scalar(_op.add),
-    "__sub_scalar__": _binop_scalar(_op.subtract),
-    "_minus_scalar": _binop_scalar(_op.subtract),
-    "__mul_scalar__": _binop_scalar(_op.multiply),
-    "_mul_scalar": _binop_scalar(_op.multiply),
-    "__div_scalar__": _binop_scalar(_op.divide),
-    "_div_scalar": _binop_scalar(_op.divide),
-    "log2": _mx_make_logarithm(2),
-    "log10": _mx_make_logarithm(10),
-    "log1p": _mx_log1p,
-    "expm1": _mx_expm1,
-    "_equal_scalar": _mx_compare(_op.equal, _binop_scalar),
-    "_not_equal_scalar": _mx_compare(_op.not_equal, _binop_scalar),
-    "_greater_scalar": _mx_compare(_op.greater, _binop_scalar),
-    "_greater_equal_scalar": _mx_compare(_op.greater_equal, _binop_scalar),
-    "_lesser_scalar": _mx_compare(_op.less, _binop_scalar),
-    "_lesser_equal_scalar": _mx_compare(_op.less_equal, _binop_scalar),
-    "_maximum_scalar": _binop_scalar(_op.maximum),
-    "_minimum_scalar": _binop_scalar(_op.minimum),
-    # reduction ops
-    "mean": _reduce(_op.mean),
-    "max": _reduce(_op.max),
-    "min": _reduce(_op.min),
-    "sum": _reduce(_op.sum),
-    "max_axis": _reduce(_op.max),
-    "min_axis": _reduce(_op.min),
-    "sum_axis": _reduce(_op.sum),
-    "argmax": _arg_reduce(_op.argmax),
-    "argmin": _arg_reduce(_op.argmin),
-    # init ops
-    "_ones": _init_op(_op.ones),
-    # softmax
-    "softmax": _softmax_op(_op.nn.softmax),
-    "log_softmax": _softmax_op(_op.nn.log_softmax),
-    "Softmax": _softmax_op(_op.nn.softmax),
-    "softsign": _mx_softsign,
-    "softmin": _mx_softmin,
-    "hard_sigmoid": _mx_hard_sigmoid,
-    "reciprocal": _mx_reciprocal,
-    # per op specialization
-    "Reshape": _reshape,
-    "reshape": _reshape,
-    "Cast": _cast,
-    "amp_cast": _cast,
-    "amp_multicast": _mx_amp_multicast,
-    "clip": _clip,
-    "transpose": _transpose,
-    "UpSampling": _upsampling,
-    "add_n": _elemwise_sum,
-    # MXNet specific implementations
-    "_zeros": _mx_zeros,
-    "FullyConnected": _mx_fully_connected,
-    "Activation": _mx_activations,
-    "Convolution": _mx_conv,
-    "Convolution_v1": _mx_conv2d,
-    "Deconvolution": _mx_conv_transpose,
-    "Pooling": _mx_pooling,
-    "Pooling_v1": _mx_pooling,
-    "Dropout": _mx_dropout,
-    "BatchNorm": _mx_batch_norm,
-    "BatchNorm_v1": _mx_batch_norm,
-    "_contrib_SyncBatchNorm": _mx_batch_norm,
-    "InstanceNorm": _mx_instance_norm,
-    "LayerNorm": _mx_layer_norm,
-    "GroupNorm": _mx_group_norm,
-    "LRN": _mx_lrn,
-    "L2Normalization": _mx_l2_normalize,
-    "slice": _mx_slice,
-    "slice_like": _mx_slice_like,
-    "slice_axis": _mx_slice_axis,
-    "SliceChannel": _mx_split,
-    "split": _mx_split,
-    "_split_v2": _mx_split_v2,
-    "SwapAxis": _mx_swap_axis,
-    "expand_dims": _mx_expand_dims,
-    "Concat": _mx_concat,
-    "concat": _mx_concat,
-    "stack": _mx_stack,
-    "dot": _mx_dot,
-    "batch_dot": _mx_batch_dot,
-    "LeakyReLU": _mx_leaky_relu,
-    "_arange": _mx_arange,
-    "_full": _mx_full,
-    "repeat": _mx_repeat,
-    "tile": _mx_tile,
-    "pad": _mx_pad,
-    "Pad": _mx_pad,
-    "take": _mx_take,
-    "gather_nd": _mx_gather_nd,
-    "reverse": _mx_reverse,
-    "SequenceReverse": _mx_sequence_reverse,
-    "squeeze": _mx_squeeze,
-    "broadcast_axis": _mx_broadcast_axis,
-    "broadcast_axes": _mx_broadcast_axis,
-    "BlockGrad": _mx_BlockGrad,
-    "shape_array": _mx_shape_array,
-    "Embedding": _mx_embedding,
-    "argsort": _mx_argsort,
-    "topk": _mx_topk,
-    "_unravel_index": _mx_unravel_index,
-    "SequenceMask": _mx_sequence_mask,
-    "SoftmaxOutput": _mx_softmax_output,
-    "SoftmaxActivation": _mx_softmax_activation,
-    "LinearRegressionOutput": _mx_linear_regression_output,
-    "LogisticRegressionOutput": _mx_logistic_regression_output,
-    "smooth_l1": _mx_smooth_l1,
-    "make_loss": _mx_make_loss,
-    "_contrib_div_sqrt_dim": _mx_contrib_div_sqrt_dim,
-    "_contrib_arange_like": _mx_contrib_arange_like,
-    "one_hot": _mx_one_hot,
-    "depth_to_space": _mx_depth_to_space,
-    "space_to_depth": _mx_space_to_depth,
-    "Correlation": _mx_correlation,
-    # vision
-    "_contrib_BilinearResize2D": _mx_resize,
-    "_contrib_MultiBoxPrior": _mx_multibox_prior,
-    "_contrib_MultiBoxDetection": _mx_multibox_detection,
-    "_contrib_ROIAlign": _mx_roi_align,
-    "ROIPooling": _mx_roi_pooling,
-    "_contrib_Proposal": _mx_proposal,
-    "_contrib_MultiProposal": _mx_proposal,
-    "_contrib_box_nms": _mx_box_nms,
-    "_contrib_box_decode": _mx_box_decode,
-    "_contrib_DeformableConvolution": _mx_deformable_convolution,
-    "_contrib_AdaptiveAvgPooling2D": _mx_adaptive_avg_pooling,
-    "GridGenerator": _mx_grid_generator,
-    "BilinearSampler": _mx_bilinear_sampler,
-    # NLP
-    "RNN": _mx_rnn_layer,
-    "_rnn_param_concat": _mx_rnn_param_concat,
-    "_contrib_interleaved_matmul_selfatt_qk": _mx_contrib_interleaved_matmul_selfatt_qk,
-    "_contrib_interleaved_matmul_selfatt_valatt": _mx_contrib_interleaved_matmul_selfatt_valatt,
-    # control flow
-    "_cond": _mx_cond,
-    # Depricated:
-    "Crop": _mx_crop_like,
-    # List of missing operators that are present in NNVMv1
-    # TODO(tvm-tvm): support all operators.
-    #
-    # "contrib_fifo_buffer": _mx_contrib_fifo_buffer,
-    "ring_buffer": _mx_contrib_fifo_buffer,
-    # Qnn ops
-    "_contrib_quantize_v2": _qnn_quantize,
-    "_contrib_quantized_concat": _qnn_contrib_concat,
-    # "_contrib_quantized_fifo_buffer": _qnn_contrib_quantized_fifo_buffer,
-    "_contrib_quantized_ring_buffer": _qnn_contrib_quantized_fifo_buffer,
-    "_sg_mkldnn_conv": _qnn_conv,
-    "_contrib_quantized_flatten": _qnn_flatten,
-    "_contrib_dequantize": _qnn_dequantize,
-    "_contrib_quantized_act": _qnn_activation,
-    "_contrib_quantized_pooling": _qnn_pooling,
-    "_contrib_quantized_batch_norm": _qnn_batch_norm,
-    "_sg_mkldnn_fully_connected": _qnn_fully_connected,
-    # numpy
-    "_np_transpose": _mx_npi_transpose,
-    "_npi_transpose": _mx_npi_transpose,
-    "_npi_pad": _mx_npi_pad,
-    "_npi_concatenate": _mx_npi_concatenate,
-    "_npx_reshape": _mx_npx_reshape,
-    "_np_copy": _rename(_op.copy),
-    "_npi_copy": _rename(_op.copy),
-    "_npi_power": _rename(_op.power),
-    "_npi_power_scalar": _binop_scalar(_op.power),
-    "_npi_multiply": _rename(_op.multiply),
-    "_npi_multiply_scalar": _binop_scalar(_op.multiply),
-    "_npi_add": _rename(_op.add),
-    "_npi_add_scalar": _binop_scalar(_op.add),
-    "_npi_subtract": _rename(_op.subtract),
-    "_npi_subtract_scalar": _binop_scalar(_op.subtract),
-    "_npi_where_rscalar": _mx_npi_where_rscalar,
-    "_npi_less": _rename(_op.less),
-    "_npi_less_equal": _mx_compare(_op.less_equal, _rename),
-    "_npi_tanh": _rename(_op.tanh),
-    "_npi_true_divide_scalar": _binop_scalar(_op.divide),
-    "_npi_stack": _mx_npi_stack,
-}
-
-# set identity list
-_convert_map.update({k: _rename(k) for k in _identity_list})
-
-_control_flow_ops = ["_cond", "_foreach", "_while_loop"]
-_qnn_subgraph_ops = ["_sg_mkldnn_conv", "_sg_mkldnn_fully_connected"]
-_subgraph_ops = _control_flow_ops + _qnn_subgraph_ops
-_params_ops = ["_contrib_quantized_ring_buffer"]
-
-
-def _get_op_params(children, attrs, op_name, node, params):
-    op_params = [children, attrs]
-    if op_name in _subgraph_ops:
-        subgraphs = node["subgraphs"]
-        op_params.append(subgraphs)
-        if op_name in _qnn_subgraph_ops:
-            op_params.append(params)
-    if op_name in _params_ops:
-        op_params.append(params)
-    return op_params
-
-
-def _from_mxnet_impl(symbol, shape_dict, dtype_info, params=None, mod=None):
-    # pylint: disable=unused-argument
-    """Convert mxnet symbol to compatible relay Function.
-
-    Reconstruct a relay Function by traversing the mxnet symbol.
-
-    Parameters
-    ----------
-    symbol : mxnet.sym.Symbol
-        Incompatible symbol from mxnet.
-        The op_name and attrs inside are not always compatible.
-
-    shape_dict : dict
-        Known parameter shapes
-
-    dtype_info : dict or str.
-        Known parameter dtypes
-
-    mod : tvm.IRModule
-        The module that contains global information. It will be used for
-        converting ops that need global information, e.g. control-flow ops.
-
-    Returns:
-    -------
-    func : tvm.relay.Function
-        Converted relay Function
-    """
-    assert symbol is not None
-    if isinstance(symbol, dict):
-        jgraph = symbol
-    else:
-        jgraph = json.loads(symbol.tojson())
-    jnodes = jgraph["nodes"]
-    node_map = {}
-    shape_idx = 0
-
-    # Check if there have any unsupported ops
-    unsupported = {}
-    for node in jnodes:
-        op_name = node["op"]
-        if op_name != "null" and op_name not in _convert_map:
-            if op_name not in unsupported:
-                unsupported[op_name] = 0
-            unsupported[op_name] += 1
-
-    if unsupported:
-        msg = "\n".join([f"{op_name}: {cnt}" for op_name, cnt in unsupported.items()])
-        raise tvm.error.OpNotImplemented(
-            f"One or more operators are not supported in frontend MXNet:\n{msg}"
-        )
-
-    for nid, node in enumerate(jnodes):
-        children = [node_map[e[0]][e[1]] for e in node["inputs"]]
-        attrs = StrAttrsDict(node.get("attrs", {}))
-        node_name = node["name"]
-        op_name = node["op"]
-        if op_name == "null":
-            if isinstance(shape_dict, dict):
-                shape = shape_dict[node_name] if node_name in shape_dict else None
-            elif isinstance(shape_dict, (list, tuple)):
-                shape = shape_dict[shape_idx]
-            else:
-                raise ValueError("Unknown type of shape_dict: %s" + type(shape_dict))
-            if isinstance(dtype_info, dict):
-                dtype = dtype_info[node_name] if node_name in dtype_info else "float32"
-            elif isinstance(dtype_info, (list, tuple)):
-                dtype = dtype_info[shape_idx]
-            else:
-                dtype = dtype_info
-            if isinstance(shape_dict, (list, tuple)):
-                shape_idx += 1
-            node_map[nid] = [_expr.var(node_name, shape=shape, dtype=dtype)]
-        else:
-            assert op_name in _convert_map
-            op_params = _get_op_params(children, attrs, op_name, node, params)
-            res = _convert_map[op_name](*op_params)
-            if res is None:
-                # defer conversion, used in RNN state initialization
-                res = [node]
-            elif isinstance(res, (_expr.TupleWrapper, tuple, list)):
-                pass
-            elif isinstance(res, _expr.Expr):
-                res = [res]
-            else:
-                raise RuntimeError(f"unexpected type {type(res)}")
-            node_map[nid] = res
-    outputs = [node_map[e[0]][e[1]] for e in jgraph["heads"]]
-    outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-    func = _function.Function(analysis.free_vars(outputs), outputs)
-    return func
-
-
-def _update_shape_dtype(shape, dtype, params):
-    """Update shape dtype given params information"""
-    shape = {} if shape is None else shape
-    if not params:
-        return shape, dtype
-    shape = shape.copy()
-    shape.update({k: v.shape for k, v in params.items()})
-    if isinstance(dtype, str):
-        for k, v in params.items():
-            if v.dtype != dtype:
-                raise ValueError(f"{k}: dtype not expected {dtype} vs {v.dtype}")
-    else:
-        dtype = dtype.copy()
-        dtype.update({k: str(v.dtype) for k, v in params.items()})
-    return shape, dtype
-
-
-def from_mxnet(symbol, shape=None, dtype="float32", arg_params=None, aux_params=None):
-    """Convert from MXNet"s model into compatible relay Function.
-
-    Parameters
-    ----------
-    symbol : mxnet.Symbol or mxnet.gluon.HybridBlock
-        MXNet symbol.
-
-    shape : dict of str to tuple, optional
-        The input shape to the graph
-
-    dtype : str or dict of str to str
-        The input types to the graph
-
-    arg_params : dict of str to mx.NDArray
-        The argument parameters in mxnet
-
-    aux_params : dict of str to mx.NDArray
-        The auxiliary parameters in mxnet
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation
-
-    params : dict of str to tvm.nd.NDArray
-        The parameter dict to be used by nnvm
-    """
-    try:
-        import mxnet as mx  # pylint: disable=import-outside-toplevel
-    except ImportError as e:
-        raise ImportError(f"{e}. MXNet is required to parse symbols.")
-
-    mod = IRModule()
-    if isinstance(symbol, mx.sym.Symbol):
-        params = {}
-        arg_params = arg_params if arg_params else {}
-        aux_params = aux_params if aux_params else {}
-        for k, v in arg_params.items():
-            params[k] = _nd.array(v.asnumpy())
-        for k, v in aux_params.items():
-            params[k] = _nd.array(v.asnumpy())
-        shape, dtype = _update_shape_dtype(shape, dtype, params)
-        func = _from_mxnet_impl(symbol, shape, dtype, params, mod)
-    elif isinstance(symbol, mx.gluon.HybridBlock):
-        if arg_params is not None or aux_params is not None:
-            raise ValueError("arg_params and aux_params ae not used when importing HybridBlock")
-        params = {}
-        for k, v in symbol.collect_params().items():
-            params[k] = _nd.array(v.data().asnumpy())
-        inputs = []
-        for name in shape:
-            inputs.append(mx.sym.Variable(name))
-        sym = symbol(*inputs)
-        if isinstance(sym, (list, tuple)):
-            sym = mx.sym.Group(sym)
-        shape, dtype = _update_shape_dtype(shape, dtype, params)
-        func = _from_mxnet_impl(sym, shape, dtype, params, mod)
-    elif isinstance(symbol, mx.gluon.Block):
-        raise NotImplementedError("Only Hybrid Blocks are supported now.")
-    else:
-        msg = f"mxnet.Symbol or gluon.HybridBlock expected, got {type(symbol)}"
-        raise ValueError(msg)
-    mod["main"] = func
-    return mod, params
diff --git a/python/tvm/relay/frontend/mxnet_qnn_op_utils.py b/python/tvm/relay/frontend/mxnet_qnn_op_utils.py
deleted file mode 100644
index 1b2cf2ef4c83..000000000000
--- a/python/tvm/relay/frontend/mxnet_qnn_op_utils.py
+++ /dev/null
@@ -1,445 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, no-else-return
-"""MXNet qnn dialect helper methods for MXNet specific implementations of more
-   generic qnn supported ops.
-"""
-
-import numpy as np
-from tvm import relay
-from tvm.relay.qnn.op.qnn import quantize, dequantize
-
-# The below values are taken from -
-# https://github.com/apache/incubator-mxnet/blob/master/src/operator/quantization/quantization_utils.h#L38-L39
-zero_centered_uint8_quantized_range = np.float32(255.5)
-zero_centered_int8_quantized_range = np.float32(127.5)
-
-
-def _get_mkldnn_scale(data_min, data_max, quantized_range):
-    """Computes the scale as per MKLDNN specification mentioned here -
-    https://intel.github.io/mkl-dnn/ex_int8_simplenet.html
-
-    Parameters
-    ----------
-    data_min : float32
-        A number representing the lower end of the tensor to be quantized.
-    data_max : float32
-        A number representing the upper end of the tensor to be quantized.
-    quantized_range : float32
-        255 for uint8 and 127 for int8. This is the data type range.
-
-    Returns
-    -------
-    scale : A floating point number which acts as the scale for quantization.
-    """
-    real_range = np.max([np.abs(np.float32(data_min)), np.abs(np.float32(data_max))])
-    scale = np.divide(quantized_range, real_range)
-    scale_inverse = np.divide(1.0, scale)
-    return scale_inverse
-
-
-def _quantize_scale_with_zero_centered(data, scale, zero_point, out_dtype):
-    quantized_output = quantize(
-        data, relay.const(scale, "float32"), relay.const(zero_point, "int32"), out_dtype=out_dtype
-    )
-    return quantized_output, scale, zero_point
-
-
-def _quantize_with_zero_centered(data, data_min, data_max, quantized_range, out_dtype):
-    """Quantizes the given data tensor by calculating the scale
-    using the MKLDNN formula `quantized_range / max(abs(data_min, data_max))`.
-    Where quantized_range is 255 for uint8 and 127 for int8. The `data_min`
-    and `data_max` are the min and max to use for the `data` tensor elements.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    data_min : float
-        The minimum to use data elements.
-    data_max : float
-        The maximum to use for data elements.
-    quantized_range : float
-        255 for uint8 and 127 for int8. This is the data type range.
-    out_dtype : str
-        The output data type. Can be int8 or uint8
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    scale = _get_mkldnn_scale(data_min, data_max, quantized_range)
-    zero_point = 0
-    return _quantize_scale_with_zero_centered(data, scale, zero_point, out_dtype)
-
-
-def _quantize_mkldnn_min_max_uint8(data, data_min, data_max):
-    """Quantizes the given `data` in float32 and the given
-    min and max ranges and the output data type is `uint8`.
-    The method of quantizing is described here - https://tinyurl.com/y5k6fz5w.
-    We use our default quantize implementation from src/relay/qnn/op/quantize.cc:72
-    but compute the `scale` and `zero_point` to fit our equation.
-    Unlike in TFLite where we get the scale and zero_point from the model, MKLDNN
-    stores the min and max from which we calculate the scale and zero_point.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    imin_range : float
-        The minimum to use data elements.
-    imax_range : float
-        The maximum to use for data elements.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _quantize_with_zero_centered(
-        data, data_min, data_max, zero_centered_uint8_quantized_range, "uint8"
-    )
-
-
-def _quantize_mkldnn_min_max_int8(data, data_min, data_max):
-    """Quantizes the given `data` in float32 and the given
-    min and max ranges and the output data type is `int8`.
-    The method of quantizing is described here - https://tinyurl.com/y5k6fz5w.
-    We use our default quantize implementation from src/relay/qnn/op/quantize.cc:72
-    but compute the `scale` and `zero_point` to fit our equation.
-    Unlike in TFLite where we get the scale and zero_point from the model, MKLDNN
-    stores the min and max from which we calculate the scale and zero_point.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    data_min : float
-        The minimum to use data elements.
-    data_max : float
-        The maximum to use for data elements.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _quantize_with_zero_centered(
-        data, data_min, data_max, zero_centered_int8_quantized_range, "int8"
-    )
-
-
-def get_mkldnn_int8_scale(range_min, range_max):
-    """Computes the quantization scale using MKLDNN specifications
-    with the given range. The output datatype of tensor to be quantized should be
-    int8.
-
-    Parameters
-    ----------
-    range_min : float32
-        A number representing the lower end of the tensor to be quantized.
-    range_max : float32
-        A number representing the upper end of the tensor to be quantized.
-
-    Returns
-    -------
-    scale : A float32 number which acts as the scale for quantization.
-    """
-
-    scale = _get_mkldnn_scale(range_min, range_max, zero_centered_int8_quantized_range)
-    return np.float32(scale)
-
-
-def get_mkldnn_uint8_scale(range_min, range_max):
-    """Computes the quantization scale using MKLDNN specifications
-    with the given range. The output datatype of tensor to be quantized should be
-    uint8.
-
-    Parameters
-    ----------
-    range_min : float32
-        A number representing the lower end of the tensor to be quantized.
-    range_max : float32
-        A number representing the upper end of the tensor to be quantized.
-
-    Returns
-    -------
-    scale : A float32 number which acts as the scale for quantization.
-    """
-
-    scale = _get_mkldnn_scale(range_min, range_max, zero_centered_uint8_quantized_range)
-    return np.float32(scale)
-
-
-def quantize_conv_weights_bias_channel_mkldnn_from_var(
-    weights_var, bias, min_vector_range, max_vector_range, data_scale
-):
-    """Helper method to quantize the convolution kernel in prequantized model
-    in MXNet with MKLDNN. The kernel is always quantized to int8 output datatype.
-    The inputs are the raw weights which are floating point numbers. The min and
-    max ranges are used from the weight itself. The name supplied is used to create
-    a tvm.relay.var with the given name.
-
-    Parameters
-    ----------
-    weights_var : tvm.relay.var
-        The float32 representation of the weights.
-    bias : np.array
-        The float32 np array for bias.
-    min_vector_range : array of float32
-        A number representing the minimum of the weights per channel.
-    max_vector_range : array of float32
-        A number representing the maximum of the weights per channel.
-    data_scale : float
-        The data scale value.
-
-    Returns
-    -------
-    result : tvm.relay.expr
-           The quantized representation of the weights.
-    """
-
-    quantized_range = zero_centered_int8_quantized_range
-    real_vector_range = np.maximum(np.absolute(min_vector_range), np.absolute(max_vector_range))
-    # If real_vector_range is 0, then to avoid division by 0 in scaling,
-    # make real_vector INT32_max
-    vector_scale = np.where(
-        real_vector_range == 0,
-        1.0 / float(np.iinfo(np.int32).max),
-        np.divide(real_vector_range, quantized_range),
-    )
-
-    # Handle bias impact on scales as done by MxNet-MKLDNN.
-    if bias is not None:
-        common = 2.0 * bias.astype("float32") * (1 / data_scale)
-        vector_scale_min = np.where(
-            bias > 0, common / float(np.iinfo(np.int32).max), common / float(np.iinfo(np.int32).min)
-        )
-        vector_scale = np.maximum(vector_scale, vector_scale_min)
-
-    zero_point = 0
-    quantized_output = quantize(
-        weights_var,
-        relay.const(vector_scale),
-        relay.const(zero_point, "int32"),
-        axis=0,
-        out_dtype="int8",
-    )
-    return quantized_output, vector_scale, zero_point
-
-
-def get_mkldnn_requantize_scale_outDtype(min_output_range, max_output_range, out_dtype):
-    """Get the MKLDNN requantized scale."""
-    quantized_out_range = (
-        zero_centered_int8_quantized_range
-        if out_dtype == "int8"
-        else zero_centered_uint8_quantized_range
-    )
-    out_range = np.max([np.abs(np.float32(min_output_range)), np.abs(np.float32(max_output_range))])
-    output_scale = quantized_out_range / out_range
-    requantize_scale = np.float32(1 / output_scale)
-    return requantize_scale
-
-
-def get_conv_mkldnn_requantized_scale_outDtype(min_output_range, max_output_range):
-    out_dtype = "uint8" if min_output_range >= 0.0 else "int8"
-    requantize_scale = get_mkldnn_requantize_scale_outDtype(
-        min_output_range, max_output_range, out_dtype
-    )
-    return requantize_scale, out_dtype
-
-
-def quantize_conv_bias_mkldnn_from_var(bias_var, bias_scale):
-    """Quantized conv2d bias"""
-    zero_point = 0
-    quantized_bias = quantize(
-        data=bias_var,
-        output_scale=relay.const(bias_scale),
-        output_zero_point=relay.const(zero_point, "int32"),
-        axis=0,
-        out_dtype="int32",
-    )
-
-    return quantized_bias
-
-
-def quantize_mxnet_min_max(data, min_range, max_range, out_dtype="int8"):
-    """Quantizes the given `data` in float32 and the given
-    min and max ranges and the output data type.
-    Only `int8` and `uint8` is supported as output data types.
-    The input data type is expected to be `float32`.
-    Mxnet has two different flavors for quantization 1) Default 2)MKLDNN.
-    To get the second one Mxnet must be built with MKLDNN during compile time.
-    Users can choose either of the implementation for TVM runtime.
-    The main difference between the two implementation is that MKLDNN is centered
-    around 0 and the default implementation for uint8 is not.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    min_range : float
-        The minimum to use data elements.
-    max_range : float
-        The maximum to use for data elements.
-    out_dtype: str, optional
-        The output data type, can be 'int8' or 'uint8'
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    if out_dtype == "uint8":
-        return _quantize_mkldnn_min_max_uint8(data, min_range, max_range)
-    elif out_dtype == "int8":
-        return _quantize_mkldnn_min_max_int8(data, min_range, max_range)
-    else:
-        raise ValueError("Expected out_dtype to be int8 or uint8 but was  {out_dtype}")
-
-
-def _dequantize_zero_centered(data, data_min, data_max, quantized_range):
-    """Dequantizes the given data tensor by calculating the scale
-    using the MKLDNN formula `max(abs(data_min, data_max))/quantized_range`.
-    Where quantized_range is 255 for uint8 and 127 for int8. The `data_min`
-    and `data_max` are the min and max to use for the `data` tensor elements.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type {int8 or uint8}.
-    data_min : float
-        The minimum to use data elements.
-    data_max : float
-        The maximum to use for data elements.
-    quantized_range : float
-        255 for uint8 and 127 for int8. This is the data type range.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    real_range = np.max([np.abs(np.float32(data_min)), np.abs(np.float32(data_max))])
-    scale = relay.const(np.divide(real_range, quantized_range), "float32")
-    zero_point = relay.const(0, "int32")
-    return dequantize(data, scale, zero_point)
-
-
-def _dequantize_mkldnn_min_max_int8(data, imin_range, imax_range):
-    """Dequantizes the given `data` in {int8 or uint8} and the given
-    min and max ranges and the output data type is `float32`.
-    The method of dequantizing is described here - https://tinyurl.com/y5k6fz5w.
-    We use our default quantize implementation from src/relay/qnn/op/dequantize.cc:67
-    but compute the `scale` and `zero_point` to fit our equation.
-    Unlike in TFLite where we get the scale and zero_point from the model, MKLDNN
-    stores the min and max from which we calculate the scale and zero_point.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    imin_range : float
-        The minimum to use data elements.
-    imax_range : float
-        The maximum to use for data elements.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _dequantize_zero_centered(
-        data,
-        data_min=imin_range,
-        data_max=imax_range,
-        quantized_range=zero_centered_int8_quantized_range,
-    )
-
-
-def _dequantize_mkldnn_min_max_uint8(data, imin_range, imax_range):
-    """Dequantizes the given `data` in {int8 or uint8} and the given
-    min and max ranges and the output data type is `float32`.
-    The method of dequantize is described here - https://tinyurl.com/y5k6fz5w.
-    We use our default quantize implementation from src/relay/qnn/op/dequantize.cc:67
-    but compute the `scale` and `zero_point` to fit our equation.
-    Unlike in TFLite where we get the scale and zero_point from the model, MKLDNN
-    stores the min and max from which we calculate the scale and zero_point.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    imin_range : float
-        The minimum to use data elements.
-    imax_range : float
-        The maximum to use for data elements.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _dequantize_zero_centered(
-        data,
-        data_min=imin_range,
-        data_max=imax_range,
-        quantized_range=zero_centered_uint8_quantized_range,
-    )
-
-
-def dequantize_mxnet_min_max(data, min_range, max_range, in_dtype="int8"):
-    """Dequantizes the given `data` in {int8 or uint8} and the given
-    min and max ranges. The output data type is float32.
-    Only `float32` is supported as output data types.
-    The input data type is expected to be {int8 or uint8}.
-    Mxnet has two different flavors for dequantization 1) Default 2)MKLDNN.
-    To get the second one Mxnet must be built with MKLDNN during compile time.
-    Users can choose either of the implementation for TVM runtime.
-    The main difference between the two implementation is that MKLDNN is centered
-    around 0 and the default implementation for uint8 is not.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-    min_range : float
-        The minimum to use data elements for the output.
-    max_range : float
-        The maximum to use for data elements for the output.
-    in_dtype: str, optional
-        The input data type, can be 'int8' or 'uint8'
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    if in_dtype == "uint8":
-        return _dequantize_mkldnn_min_max_uint8(data, min_range, max_range)
-    elif in_dtype == "int8":
-        return _dequantize_mkldnn_min_max_int8(data, min_range, max_range)
-    else:
-        raise ValueError(f"Expected out_dtype to be int8 or uint8 but was  {in_dtype}")
diff --git a/python/tvm/relay/frontend/nnvm_common.py b/python/tvm/relay/frontend/nnvm_common.py
deleted file mode 100644
index 4a611e0537cd..000000000000
--- a/python/tvm/relay/frontend/nnvm_common.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, superfluous-parens
-"""Utility functions common to NNVM and MxNet conversion."""
-import warnings
-from ... import error
-from ...tir.op import min_value
-from .. import expr as _expr
-from .. import op as _op
-from .common import get_relay_op
-from .common import infer_type as _infer_type
-from .common import infer_shape as _infer_shape
-
-
-def _warn_not_used(attr, op="nnvm"):
-    err = f"{attr} is ignored in {op}."
-    warnings.warn(err)
-
-
-def _rename(new_op):
-    if isinstance(new_op, str):
-        new_op = get_relay_op(new_op)
-    # attrs are ignored.
-    def impl(inputs, _, _dtype="float32"):
-        return new_op(*inputs)
-
-    return impl
-
-
-def _reshape(inputs, attrs):
-    shape = attrs.get_int_tuple("shape")
-    reverse = attrs.get_bool("reverse", False)
-    if reverse:
-        return _op.reverse_reshape(inputs[0], newshape=shape)
-    return _op.reshape(inputs[0], newshape=shape)
-
-
-def _init_op(new_op):
-    """Init ops like zeros/ones"""
-
-    def _impl(inputs, attrs):
-        assert len(inputs) == 0
-        shape = attrs.get_int_tuple("shape")
-        dtype = attrs.get_str("dtype", "float32")
-        return new_op(shape=shape, dtype=dtype)
-
-    return _impl
-
-
-def _softmax_op(new_op):
-    """softmax/log_softmax"""
-
-    def _impl(inputs, attrs, _dtype="float32"):
-        axis = attrs.get_int("axis", -1)
-        use_length = attrs.get_bool("use_length", False)
-        if use_length:
-            # The second arg is valid_length. We can use sequence mask to mask the input before
-            # computing softmax
-            assert len(inputs) == 2
-
-            data = inputs[0]
-            length = inputs[1]
-            data_shape = _infer_shape(data)
-            data_dtype = _infer_type(data).checked_type.dtype
-            length_shape = _infer_shape(length)
-
-            if axis < 0:
-                axis = len(data_shape) + axis
-
-            data_ndims = len(data_shape)
-            length_ndims = len(length_shape)
-
-            # Sequence_mask supports axis = 0 and 1 and requires data to be in specific format.
-            if axis == data_ndims - 1 and data_ndims > 2 and length_ndims == 2:
-                new_batch_size = 1
-                for dim in range(length_ndims):
-                    assert data_shape[dim] == length_shape[dim]
-                    new_batch_size *= data_shape[dim]
-
-                # Reshape the data and length to satisfy sequence mask
-                data = _op.reshape(data, newshape=(new_batch_size, -1))
-                length = _op.reshape(length, newshape=(new_batch_size))
-
-                # Input data is now 2D, we can set the axis = 1
-                axis = 1
-            elif data_ndims > 2:
-                raise error.OpNotImplemented(
-                    "Operator softmax with use_length=True is supported only for axis -1"
-                )
-
-            res = _op.sequence_mask(
-                data=data,
-                valid_length=length,
-                mask_value=float(min_value(data_dtype).value),
-                axis=axis,
-            )
-
-            # Apply softmax
-            res = new_op(res, axis=axis)
-
-            # Reshape back to input data shape
-            if len(data_shape) > 2:
-                return _op.reshape(res, newshape=data_shape)
-            return res
-        return new_op(inputs[0], axis=axis)
-
-    return _impl
-
-
-def _reduce(new_op):
-    """Reduction ops like sum/min/max"""
-
-    def _impl(inputs, attrs, _dtype="float32"):
-        assert len(inputs) == 1
-        axis = attrs.get_int_tuple("axis", [])
-        keepdims = attrs.get_bool("keepdims", False)
-        exclude = attrs.get_bool("exclude", False)
-        # use None for reduce over all axis.
-        axis = None if len(axis) == 0 else axis
-        return new_op(inputs[0], axis=axis, keepdims=keepdims, exclude=exclude)
-
-    return _impl
-
-
-def _arg_reduce(new_op):
-    """Arg Reduction ops like argmin/argmax"""
-
-    def _impl(inputs, attrs):
-        assert len(inputs) == 1
-        axis = attrs.get_int("axis", None)
-        keepdims = attrs.get_bool("keepdims", False)
-        res = new_op(inputs[0], axis=[axis], keepdims=keepdims)
-        # cast to dtype.
-        res = res.astype("float32")
-        return res
-
-    return _impl
-
-
-def _cast(inputs, attrs):
-    """Type cast"""
-    dtype = attrs.get_str("dtype")
-    return inputs[0].astype(dtype=dtype)
-
-
-def _clip(inputs, attrs):
-    a_min = attrs.get_float("a_min")
-    a_max = attrs.get_float("a_max")
-    return _op.clip(inputs[0], a_min=a_min, a_max=a_max)
-
-
-def _transpose(inputs, attrs):
-    axes = attrs.get_int_tuple("axes", None)
-    # translate default case
-    axes = None if len(axes) == 0 else axes
-    return _op.transpose(inputs[0], axes=axes)
-
-
-def _upsampling(inputs, attrs):
-    scale = attrs.get_int("scale")
-    return _op.nn.upsampling(inputs[0], scale_h=scale, scale_w=scale)
-
-
-def _elemwise_sum(inputs, _, _dtype="float32"):
-    assert len(inputs) > 0
-    res = inputs[0]
-    for x in inputs[1:]:
-        res = _op.add(res, x)
-    return res
-
-
-def _binop_scalar(new_op):
-    def _impl(inputs, attrs, odtype=None):
-        assert len(inputs) == 1
-        scalar = attrs.get_float("scalar")
-        if odtype is None:
-            odtype = _infer_type(inputs[0]).checked_type.dtype
-        scalar = _expr.const(scalar, dtype=odtype)
-        return new_op(inputs[0], scalar)
-
-    return _impl
-
-
-def _rbinop_scalar(new_op):
-    def _impl(inputs, attrs, odtype=None):
-        assert len(inputs) == 1
-        scalar = attrs.get_float("scalar")
-        if odtype is None:
-            odtype = _infer_type(inputs[0]).checked_type.dtype
-        scalar = _expr.const(scalar, dtype=odtype)
-        return new_op(scalar, inputs[0])
-
-    return _impl
-
-
-def _compare(new_op):
-    """Compare ops like greater/less"""
-
-    def _impl(inputs, _, odtype="float32"):
-        assert len(inputs) == 2
-        return new_op(inputs[0], inputs[1]).astype(odtype)
-
-    return _impl
diff --git a/python/tvm/relay/frontend/oneflow.py b/python/tvm/relay/frontend/oneflow.py
deleted file mode 100644
index 369bec445fb6..000000000000
--- a/python/tvm/relay/frontend/oneflow.py
+++ /dev/null
@@ -1,1949 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines
-# pylint: disable=import-outside-toplevel, used-before-assignment, use-implicit-booleaness-not-comparison
-"""OneFlow: OneFlow is a performance-centered and open-source deep learning framework."""
-
-import os
-import re
-import copy
-from collections import OrderedDict
-
-import numpy as np
-import tvm
-from tvm.ir import IRModule
-from tvm.topi.utils import get_const_tuple
-
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from .. import ty as _ty
-from .common import AttrCvt, Renamer, fold_constant, get_relay_op, infer_shape, infer_type, new_var
-
-__all__ = ["from_oneflow"]
-
-FLOW_2_STR_DTYPE = {
-    2: "float32",
-    3: "float64",
-    6: "int64",
-    5: "int32",
-    4: "int8",
-    7: "uint8",
-    9: "float16",
-}
-
-
-def is_input_op(node):
-    """Return true when the node is the input of the graph."""
-    return node.WhichOneof("op_type") == "input_conf"
-
-
-def is_user_op(node):
-    """Return true when the node is the intermediate variables of graph."""
-    return node.WhichOneof("op_type") == "user_conf"
-
-
-def is_output_op(node):
-    """Return true when the node is the output of the graph."""
-    return node.WhichOneof("op_type") == "output_conf"
-
-
-def is_param_op(node):
-    """Return true when the node is the intermediate variables of model(saved)."""
-    return node.WhichOneof("op_type") == "variable_conf"
-
-
-def get_node_info(node):
-    """
-    Get basic information about nodes: shape, data_type
-    """
-    # list->tuple
-    shape = tuple(node.input_conf.blob_conf.shape.dim)
-    # get data type
-    dtype = node.input_conf.blob_conf.data_type
-    if dtype in list(FLOW_2_STR_DTYPE.keys()):
-        data_type = FLOW_2_STR_DTYPE[dtype]
-    else:
-        raise IndexError(f"Please check the data type of your node: {node.name}")
-
-    return shape, data_type
-
-
-def _dtype_shape_promotion(inputs):
-    """Promote data type and shape for list of tensors."""
-
-    dtype_order = ["bool", "int8", "int16", "int32", "int64", "float32", "float64"]
-    ranks = [len(infer_shape(x)) for x in inputs]
-    if set(ranks) == set([1, 0]):
-        for i, r in enumerate(ranks):
-            if r == 0:
-                inputs[i] = _op.expand_dims(inputs[i], axis=0)
-
-    dtypes = set(dtype_order.index(infer_type(x).checked_type.dtype) for x in inputs)
-    if len(dtypes) == 1:
-        return inputs
-    max_dtype = dtype_order[max(dtypes)]
-    for i, input_op in enumerate(inputs):
-        if infer_type(input_op).checked_type.dtype != max_dtype:
-            inputs[i] = input_op.astype(max_dtype)
-    return inputs
-
-
-def parse_attr(attr):
-    """Parse attribute of user op in oneflow."""
-    attrs = {}
-    for a in attr:
-        attr_str = str(attr[a])
-
-        if attr_str[0:7] == "at_list":
-            attr_str_ = attr_str.split(" ")[0]
-
-            if attr_str_ == "at_list_float":
-                attrs[a] = tuple(attr[a].at_list_float.val)
-            elif attr_str_ == "at_list_int32":
-                attrs[a] = tuple(attr[a].at_list_int32.val)
-            elif attr_str_ == "at_list_int64":
-                attrs[a] = tuple(attr[a].at_list_int64.val)
-
-        elif attr_str.split(":")[0] == "at_string":
-            attrs[a] = attr[a].at_string
-
-        elif attr_str.split(" ")[0] == "at_shape":
-            attrs[a] = tuple(list(attr[a].at_shape.dim))
-
-        else:
-            attr_str_ = attr_str.split(":")[0]
-            if attr_str_ == "at_bool":
-                attrs[a] = attr[a].at_bool
-            elif attr_str_ == "at_double":
-                attrs[a] = attr[a].at_double
-            elif attr_str_ == "at_float":
-                attrs[a] = attr[a].at_float
-            elif attr_str_ == "at_int32":
-                attrs[a] = attr[a].at_int32
-            elif attr_str_ == "at_int64":
-                attrs[a] = attr[a].at_int64
-
-    return attrs
-
-
-def shape_of(x, dtype="int64"):
-    ttype = infer_type(x).checked_type
-    if not _ty.is_dynamic(ttype):
-        shape = list(ttype.shape)
-        return _expr.const(shape, dtype)
-
-    return _op.shape_of(x, dtype)
-
-
-def dimension_constraint():
-    def _dim_check(attrs):
-        if len(attrs["kernel_size"]) in [1, 2, 3]:
-            return True
-        return False
-
-    return _dim_check, "Only 1d, 2d and 3d kernel supported."
-
-
-class OneFlowOpConverter(object):
-    """A helper class for holding oneflow op converters."""
-
-    @classmethod
-    def get_converter(cls):
-        """
-        Get converter matches given opset.
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        converter, which should be `_impl_vx`.
-        """
-        version = 1
-        if hasattr(cls, f"_impl_v{version}"):
-            return getattr(cls, f"_impl_v{version}")
-        raise NotImplementedError(f"version {version} of {cls.__name__} not implemented")
-
-
-class Pool(OneFlowOpConverter):
-    """A helper class for pool op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        data = inputs[0]
-        attrs.pop("data_format")
-        out = AttrCvt(
-            op_name=cls.name,
-            transforms={
-                "kernel_size": "pool_size",
-                "stride": "strides",
-                "dilations": ("dilation", 1),
-            },
-            ignores=["return_indices", "divisor_override"],
-            custom_check=dimension_constraint(),
-        )([data], attrs, params)
-
-        return out
-
-
-class AdaptiveAvgPool2d(OneFlowOpConverter):
-    """Operator converter for AdaptiveAvgPool2d"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.nn.adaptive_avg_pool2d(inputs[0], output_size=attrs["output_size"])
-
-
-class AdaptiveMaxPool2d(OneFlowOpConverter):
-    """Operator converter for AdaptiveMaxPool2d"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.nn.adaptive_max_pool2d(inputs[0], output_size=attrs["output_size"])
-
-
-class GlobalAveragePool(OneFlowOpConverter):
-    """Operator converter for GlobalAveragePool"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        rank = len(infer_shape(inputs[0]))
-        if rank == 3:
-            return _op.nn.global_avg_pool1d(inputs[0])
-        if rank == 4:
-            return _op.nn.global_avg_pool2d(inputs[0])
-        if rank == 5:
-            return _op.nn.global_avg_pool3d(inputs[0])
-        raise NotImplementedError(
-            "Global average pooling is only implemented for 1D, 2D, and 3D kernels, got %dD."
-            % (rank - 2)
-        )
-
-
-class GlobalMaxPool(OneFlowOpConverter):
-    """Operator converter for GlobalMaxPool"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        rank = len(infer_shape(inputs[0]))
-        if rank == 3:
-            return _op.nn.global_max_pool1d(inputs[0])
-        if rank == 4:
-            return _op.nn.global_max_pool2d(inputs[0])
-        if rank == 5:
-            return _op.nn.global_max_pool3d(inputs[0])
-        raise NotImplementedError(
-            "Global max pooling is only implemented for 1D, 2D, and 3D kernels, got %dD."
-            % (rank - 2)
-        )
-
-
-class Conv(OneFlowOpConverter):
-    """A helper class for conv op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        # The kernel is imported from model_dir_path, without the ".weight" logo, etc.
-        # The data is obtained through the graph, its op contains "_input."
-        in_names = ["_input."]
-        kernel_names = [".weight"]
-        for i in inputs:
-            IN_NAMES = any(x in str(i) for x in in_names)
-            KERNEL_NAMES = any(x in str(i) for x in kernel_names)
-            if IN_NAMES:
-                data = i
-            elif KERNEL_NAMES:
-                kernel = i
-            else:
-                data = i
-
-        # Use shape of input to determine convolution type.
-        kernel_type = infer_type(kernel)
-        kernel_shapes = [get_const_tuple(kernel_type.checked_type.shape)]
-
-        if "kernel_size" not in attrs:
-            attrs["kernel_size"] = kernel_shapes[0][2:]
-        if "dilation_rate" in attrs:
-            attrs["dilation"] = list(attrs["dilation_rate"])
-            attrs.pop("dilation_rate")
-
-        pad_v = attrs.get("padding_before", [0, 0])
-        attrs["padding"] = [pad_v[0], pad_v[1], pad_v[0], pad_v[1]]
-
-        group_conv1d = False
-        if cls.name == "conv1d" and attrs.get("groups") != 1:
-            group_conv1d = True
-            # Expand input from NCW to NCHW
-            data = _op.expand_dims(data, axis=2)
-            # Expand kernel from OIW to OIHW
-            kernel = _op.expand_dims(kernel, axis=2)
-            # Add new value to kernel_shape, strices, dilation, pads, if needed
-            attrs["kernel_size"] = [1] + list(attrs["kernel_size"])
-            if "strides" in attrs:
-                attrs["strides"] = [1] + list(attrs["strides"])
-            if "dilations" in attrs:
-                attrs["dilation"] = [1] + list(attrs["dilations"])
-
-        out = AttrCvt(
-            op_name=cls.name,
-            transforms={"group": ("groups", 1)},
-            ignores=["data_format", "filters", "padding_after", "padding_before"],
-            custom_check=dimension_constraint(),
-        )([data, kernel], attrs, params)
-
-        # If this was a group_conv1d, squish output back to NCW.
-        if group_conv1d:
-            out = _op.squeeze(out, axis=[2])
-
-        return out
-
-
-class ConvTranspose(OneFlowOpConverter):
-    """Operator converter for ConvTranspose."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        in_names = ["_input."]
-        kernel_names = [".weight"]
-        for i in inputs:
-            IN_NAMES = any(x in str(i) for x in in_names)
-            KERNEL_NAMES = any(x in str(i) for x in kernel_names)
-            if IN_NAMES:
-                data = i
-            elif KERNEL_NAMES:
-                kernel = i
-            else:
-                data = i
-
-        # get number of channels
-        attrs["channels"] = attrs.get("filters", 1)
-        attrs["groups"] = attrs.get("group", 1)
-
-        kernel_type = infer_type(kernel)
-        kernel_shapes = [get_const_tuple(kernel_type.checked_type.shape)]
-
-        if "kernel_size" not in attrs:
-            attrs["kernel_size"] = kernel_shapes[0][2:]
-
-        if "dilation_rate" in attrs:
-            attrs["dilation"] = list(attrs["dilation_rate"])
-            attrs.pop("dilation_rate")
-
-        pad_v = attrs.get("padding_before", [0, 0])
-        attrs["padding"] = [pad_v[0], pad_v[1], pad_v[0], pad_v[1]]
-
-        out = AttrCvt(
-            op_name=cls.name,
-            transforms={"group": ("groups", 1)},
-            disables=["filters", "data_format", "padding_before"],
-            custom_check=dimension_constraint(),
-        )([data, kernel], attrs, params)
-
-        return out
-
-
-class Upsample(OneFlowOpConverter):
-    """A helper class for upsample op converters"""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        data = inputs[0]
-        input_shape = infer_shape(data)
-        dims = len(input_shape)
-
-        width_scale = attrs.get("width_scale", 1.0)
-        height_scale = attrs.get("height_scale", 1.0)
-        align_corners = attrs.get("align_corners", False)
-
-        if "nearest" in cls.name:
-            method = "nearest_neighbor"
-        elif "trilinear" in cls.name:
-            method = "trilinear"
-        elif "bilinear" in cls.name:
-            method = "bilinear"
-
-        # in 3d case, we use the purely static op
-        if dims == 5:
-            if isinstance(scales, _expr.Expr):
-                scale_h = _op.take(scales, _op.const(3))
-                scale_w = _op.take(scales, _op.const(4))
-                scale_d = _op.take(scales, _op.const(1))
-            else:
-                assert len(scales) == 5
-                scale_h = scales[-2]
-                scale_w = scales[-1]
-                scale_d = scales[-3]
-
-            layout = "NCDHW"
-            out = _op.nn.upsampling3d(
-                data,
-                scale_d,
-                scale_h,
-                scale_w,
-                layout=layout,
-                method=method,
-                coordinate_transformation_mode="asymmetric",
-            )
-        # in 2d case, use dynamic op
-        else:
-            if isinstance(height_scale, _expr.Expr):
-                height_scale = _op.take(height_scale, _op.const(3))
-                width_scale = _op.take(width_scale, _op.const(4))
-            layout = "NCHW"
-
-            out = _op.nn.upsampling(
-                inputs[0],
-                height_scale,
-                width_scale,
-                layout=layout,
-                method=method,
-                align_corners=align_corners,
-            )
-        return out
-
-
-class UpsampleNearest(Upsample):
-    """Operator converter for Upsample Nearest"""
-
-    name = "upsample_nearest"
-
-
-class UpsampleBiLinear(Upsample):
-    """Operator converter for Upsample Bilinear"""
-
-    name = "upsample_bilinear"
-
-
-class Conv2d(Conv):
-    """Operator converter for Conv2d"""
-
-    name = "conv2d"
-
-
-class ConvTranspose2d(ConvTranspose):
-    """Operator converter for ConvTranspose2d"""
-
-    name = "conv2d_transpose"
-
-
-class BatchNorm(OneFlowOpConverter):
-    """Operator converter for BatchNorm"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        # sort the inputs
-        sorted_inputs = copy.deepcopy(inputs)
-        for i in inputs:
-            IN_NAMES = "_input." in str(i)
-            if IN_NAMES:
-                sorted_inputs[0] = i
-            elif "weight" in str(i) and not IN_NAMES:
-                sorted_inputs[1] = i
-            elif "bias" in str(i) and not IN_NAMES:
-                sorted_inputs[2] = i
-            elif "mean" in str(i) and not IN_NAMES:
-                sorted_inputs[3] = i
-            elif "var" in str(i) and not IN_NAMES:
-                sorted_inputs[4] = i
-
-        if "data_format" in attrs:
-            if attrs["data_format"] == "channel_first":
-                attrs["axis"] = 1
-
-        out = AttrCvt(op_name="batch_norm", ignores=["training"], disables=["momentum"])(
-            sorted_inputs, attrs, params
-        )
-        return out[0]
-
-
-class Flatten(OneFlowOpConverter):
-    """Operator converter for Flatten"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        x = inputs[0]
-        input_shape = list(infer_shape(x))
-
-        start = attrs["start_dim"]
-        end = attrs["end_dim"]
-        ndim = len(input_shape)
-        if end < 0:
-            end += ndim
-        new_shape = [0] * start
-
-        new_shape.append(-1)
-        squeeze_axes = []
-        for i in range(start + 1, end + 1):
-            new_shape.append(1)
-            squeeze_axes.append(i)
-        for _ in range(end + 1, ndim):
-            new_shape.append(0)
-        out = _op.reshape(x, new_shape)
-        if squeeze_axes:
-            out = _op.squeeze(out, axis=squeeze_axes)
-        return out
-
-
-class MatMul(OneFlowOpConverter):
-    """Operator converter for MatMul"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 2, f"MatMul op take 2 inputs, {len(inputs)} given"
-
-        dtype = infer_type(inputs[0]).checked_type.dtype
-        # Y = alpha * A * B
-        alpha = float(attrs.get("alpha", 1.0))
-        transA = bool(attrs.get("transpose_a", False))
-        transB = bool(attrs.get("transpose_b", False))
-
-        a_shape = infer_shape(inputs[0])
-        b_shape = infer_shape(inputs[1])
-        if (
-            (transA and transB and a_shape[-2] != b_shape[-1])
-            or (transA and not transB and a_shape[-2] != b_shape[-2])
-            or (transB and not transA and a_shape[-1] != b_shape[-1])
-            or (not transB and not transA and a_shape[-1] != b_shape[-2])
-        ):
-            matmul_a = inputs[1]
-            matmul_b = inputs[0]
-        else:
-            matmul_a = inputs[0]
-            matmul_b = inputs[1]
-
-        if transA:
-            perm = list(range(len(a_shape)))
-            perm[-2] = len(a_shape) - 1
-            perm[-1] = len(a_shape) - 2
-            matmul_a = _op.transpose(matmul_a, axes=perm)
-        if transB:
-            perm = list(range(len(b_shape)))
-            perm[-2] = len(b_shape) - 1
-            perm[-1] = len(b_shape) - 2
-            matmul_b = _op.transpose(matmul_b, axes=perm)
-
-        # This implemention almost keeps same with ONNX
-        # Need to check input shape as batch matmul must be supported.
-        a_shape = shape_of(matmul_a, dtype="int32")
-        a_rank = infer_shape(a_shape)[0]
-        b_shape = shape_of(matmul_b, dtype="int32")
-        b_rank = infer_shape(b_shape)[0]
-        # When performing a batch matmul, we need to properly handle N-dim shapes.
-        if a_rank > 2 or b_rank > 2:
-
-            def flatten_to_nd(x, x_shape, nd=3):
-                ndims = infer_shape(x_shape)[0]
-                if ndims == nd:
-                    return x
-                newshape = _op.concatenate(
-                    [
-                        _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype),
-                        _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]),
-                    ],
-                    0,
-                )
-                out = _op.reshape(x, fold_constant(newshape))
-                return out
-
-            b_type = infer_type(matmul_b)
-            # Convert to dense if the second matrix is 2d and non-dynamic
-            if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type):
-                a = flatten_to_nd(matmul_a, a_shape, 2)
-                b = _op.transpose(matmul_b)
-                output = _op.nn.dense(a, b)
-            else:
-                # Convert a and b into 3 dimensional tensors.
-                a = flatten_to_nd(matmul_a, a_shape, 3)
-                b = flatten_to_nd(matmul_b, b_shape, 3)
-                # Transpose matrix dimensions of b.
-                b = _op.transpose(b, [0, 2, 1])
-                # Perform a batch matmul.
-                output = _op.nn.batch_matmul(a, b)
-            # Determine the output batch dimension.
-            if a_rank > b_rank:
-                out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2])
-            elif a_rank < b_rank:
-                out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2])
-            # If its unclear how broadcasting should be applied, the output
-            # shape is determined by choosing the maximum value from each input.
-            else:
-                out_batch = _op.concatenate(
-                    [
-                        _op.maximum(
-                            _op.strided_slice(a_shape, [i], [i + 1]),
-                            _op.strided_slice(b_shape, [i], [i + 1]),
-                        )
-                        for i in range(a_rank - 2)
-                    ],
-                    0,
-                )
-            # Reshape output to original dimensions.
-            final_shape = _op.concatenate(
-                [
-                    out_batch,
-                    _op.strided_slice(
-                        a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1]
-                    ),
-                    _op.strided_slice(
-                        b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]]
-                    ),
-                ],
-                0,
-            )
-            out = _op.reshape(output, fold_constant(final_shape))
-        else:
-            if b_rank == 1:
-                matmul_b = _op.expand_dims(matmul_b, 1, 1)
-            # Otherwise a simple dense op will get the job done.
-            input_1_t = _op.transpose(matmul_b, axes=(1, 0))
-            out = _op.nn.dense(matmul_a, input_1_t)
-            if b_rank == 1:
-                out = _op.squeeze(out, axis=[-1])
-        if not np.isclose(alpha, 1.0):
-            out = out * _expr.const(alpha, dtype=dtype)
-        return out
-
-
-class Reduce(OneFlowOpConverter):
-    """Operator converter for reduce ops"""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        attr = {"axis": attrs.get("axis", 0), "keepdims": attrs.get("keepdims", True)}
-        return AttrCvt(cls.name)(inputs, attr)
-
-
-class ReduceMax(Reduce):
-    """Operator converter for ReduceMax"""
-
-    name = "max"
-
-
-class ReduceMin(Reduce):
-    """Operator converter for ReduceMin"""
-
-    name = "min"
-
-
-class ReduceSum(Reduce):
-    """Operator converter for ReduceSum"""
-
-    name = "sum"
-
-
-class ReduceMean(Reduce):
-    """Operator converter for ReduceMean"""
-
-    name = "mean"
-
-
-class Square(OneFlowOpConverter):
-    """Operator converter for square"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 1, f"Square op {cls.name} take 1 inputs, {len(inputs)} given"
-        return _op.multiply(inputs[0], inputs[0])
-
-
-class Add(OneFlowOpConverter):
-    """Operator converter for Add"""
-
-    name = "add"
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 2, f"Math op {cls.name} take 2 inputs, {len(inputs)} given"
-        axis = int(attrs.get("axis", 0))
-
-        true_names = ["weight", "bias"]
-        false_names = ["_input."]
-
-        for i in inputs:
-            T_NAMES = any(x in str(i) for x in true_names)
-            F_NAMES = any(x in str(i) for x in false_names)
-            if T_NAMES and not F_NAMES:
-                add_b = i
-            else:
-                add_a = i
-
-        # fix the shape
-        add_shape = infer_shape(add_a)
-        if len(add_shape) > 2:
-            add_b = _op.expand_dims(add_b, axis=axis, num_newaxis=len(add_shape) - 2)
-        add_b_shape = list(infer_shape(add_b))
-        add_b_shape.insert(0, add_shape[0])
-
-        add_b = _op.reshape(add_b, tuple(add_b_shape))
-        out = get_relay_op(cls.name)(add_a, add_b)
-
-        return out
-
-
-class Expand(OneFlowOpConverter):
-    """Operator converter for Expand"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        data_in = inputs[0]
-        shape = list(infer_shape(data_in))
-
-        ndims = len(shape)
-        sizes = attrs["logical_expand_shape"]
-        out = data_in
-        out_dims = len(sizes)
-        if ndims < out_dims:
-            num_newaxis = out_dims - ndims
-            out = _op.expand_dims(out, axis=0, num_newaxis=num_newaxis)
-            shape = [1] * num_newaxis + shape
-
-        for i in range(out_dims):
-            if sizes[i] != -1 and shape[i] == 1:
-                out = _op.repeat(out, sizes[i], axis=i)
-
-        return out
-
-
-class Transpose(OneFlowOpConverter):
-    """Operator converter for transpose."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        perm = attrs["perm"]
-        return _op.transpose(inputs[0], axes=perm)
-
-
-class ExpandDim(OneFlowOpConverter):
-    """Operator converter for ExpandDim"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.expand_dims(inputs[0], axis=attrs.get("axis", 0))
-
-
-class BroadcastMath(OneFlowOpConverter):
-    """Operator converter for broadcast math ops"""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 2, f"Math op {cls.name} take 2 inputs, {len(inputs)} given"
-        beta_names = ["weight", "bias", "mean", "var", "Constant"]
-
-        for i in inputs:
-            T_NAMES = any([x in str(i) for x in beta_names])
-            if T_NAMES and "_input." not in str(i):
-                input_b = i
-            else:
-                input_a = i
-
-        if cls.name == "divide":
-            length = []
-            for i in inputs:
-                length.append(len(str(i)))
-            for i in inputs:
-                if len(str(i)) == max(length):
-                    input_a = i
-                else:
-                    input_b = i
-        if cls.name == "subtract":
-            length = []
-            for i in inputs:
-                length.append(len(str(i)))
-            for i in inputs:
-                if len(str(i)) == max(length):
-                    input_b = i
-                else:
-                    input_a = i
-        try:
-            return get_relay_op(cls.name)(input_a, input_b)
-        except UnboundLocalError:
-            return get_relay_op(cls.name)(*inputs)
-
-
-class BroadcastMul(BroadcastMath):
-    """Operator converter for Mul broadcast"""
-
-    name = "multiply"
-
-
-class BroadcastAdd(BroadcastMath):
-    """Operator converter for Add broadcast"""
-
-    name = "add"
-
-
-class BroadcastSub(BroadcastMath):
-    """Operator converter for Sub broadcast"""
-
-    name = "subtract"
-
-
-class BroadcastDiv(BroadcastMath):
-    """Operator converter for Div broadcast"""
-
-    name = "divide"
-
-
-class LogicalGreater(OneFlowOpConverter):
-    """Operator converter for greater"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        res = None
-        if attrs.get("has_int_operand", True):
-            value = attrs.get("int_operand", 0.0)
-            res = _op.greater(inputs[0], _op.full_like(inputs[0], fill_value=_expr.const(value)))
-        elif attrs.get("has_float_operand", True):
-            value = float(attrs.get("float_operand", 0.0))
-            res = _op.greater(
-                inputs[0], _op.full_like(inputs[0], fill_value=_expr.const(value)).astype("float32")
-            )
-        else:
-            raise AttributeError(
-                "please check if has_int_operand or has_float_operand in your attrs"
-            )
-        return res
-
-
-class Log1p(OneFlowOpConverter):
-    """Operator converter for Log1p"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.log(inputs[0] + _expr.const(1.0))
-
-
-class Pow(OneFlowOpConverter):
-    """Operator converter for Power"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        inputs = _dtype_shape_promotion(inputs)
-        return get_relay_op(cls.name)(inputs[0], inputs[1])
-
-
-class Expm1(OneFlowOpConverter):
-    """Operator converter for Expm1"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.exp(inputs[0]) - _expr.const(1.0)
-
-
-class Unary(OneFlowOpConverter):
-    """A helper class for unary op converters"""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 1, f"Unary math op {cls.name} takes 1 input, {len(inputs)} given"
-        return get_relay_op(cls.name)(*inputs)
-
-
-class Absolute(Unary):
-    """Operator converter for Absolute."""
-
-    name = "abs"
-
-
-class AddN(OneFlowOpConverter):
-    """Operator converter for Add_n"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) > 0, "add_n take >=1 inputs, but 0 given."
-
-        res = inputs[0]
-        for each in inputs[1:]:
-            res = _op.add(res, each)
-        return res
-
-
-class ScalarAdd(OneFlowOpConverter):
-    """Operator convert for Add_scalar"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 1, f"add_scalar take == 1 inputs, but {len(inputs)} given."
-
-        if attrs.get("has_int_operand", True):
-            res = inputs[0] + _expr.const(attrs["int_operand"])
-        elif attrs.get("has_float_operand", True):
-            res = inputs[0] + _expr.const(attrs["float_operand"])
-        else:
-            raise AttributeError(
-                "please check if has_int_operand or has_float_operand in your attrs"
-            )
-
-        return res
-
-
-class ScalarMul(OneFlowOpConverter):
-    """Operator convert for Mul_scalar"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 1, f"mul_scalar take == 1 inputs, but {len(inputs)} given."
-
-        if attrs.get("has_int_operand", True):
-            res = inputs[0] * _expr.const(attrs["int_operand"], dtype="float32")
-        elif attrs.get("has_float_operand", True):
-            res = inputs[0] * _expr.const(attrs["float_operand"])
-        else:
-            raise AttributeError(
-                "please check if has_int_operand or has_float_operand in your attrs"
-            )
-
-        return res
-
-
-class ScalarDiv(OneFlowOpConverter):
-    """Operator convert for Div_scalar"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 1, f"div_scalar take == 1 inputs, but {len(inputs)} given."
-
-        if attrs.get("has_int_operand", True):
-            res = inputs[0] / _expr.const(attrs["int_operand"], dtype="float32")
-        elif attrs.get("has_float_operand", True):
-            res = inputs[0] / _expr.const(attrs["float_operand"])
-        else:
-            raise AttributeError(
-                "please check if has_int_operand or has_float_operand in your attrs"
-            )
-
-        return res
-
-
-class ScalarPow(OneFlowOpConverter):
-    """Operator convert for Pow_scalar"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        if attrs.get("has_int_operand", True):
-            coeff = _expr.const(attrs["int_operand"])
-        elif attrs.get("has_float_operand", True):
-            coeff = _expr.const(attrs["float_operand"])
-        return _op.power(inputs[0], coeff)
-
-
-class MaxPool2d(Pool):
-    """Operator converter for MaxPool"""
-
-    name = "max_pool2d"
-
-
-class AveragePool2d(Pool):
-    """Operator converter for AveragePool."""
-
-    name = "avg_pool2d"
-
-
-class Affine(OneFlowOpConverter):
-    """Operator converter for Affine transformation."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        alpha = _expr.const(attrs.get("alpha", 1.0))
-        beta = _expr.const(attrs.get("beta", 0.0))
-        return (alpha * inputs[0]) + beta
-
-
-class Reshape(OneFlowOpConverter):
-    """Operator converter for Reshape."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.reshape(inputs[0], attrs["shape"])
-
-
-class Softmax(OneFlowOpConverter):
-    """Operator converter for Softmax."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        axis = attrs.get("axis", -1)
-        data = inputs[0]
-        if isinstance(axis, str):
-            axis = int(axis)
-
-        return _op.nn.softmax(data, axis=axis)
-
-
-class LogSoftmax(OneFlowOpConverter):
-    """Operator converter for LogSoftmax."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        axis = attrs.get("axis", 1)
-        ndim = len(infer_shape(inputs[0]))
-        if axis < 0:
-            axis += ndim
-        axes = list(range(axis, ndim))
-        x = inputs[0]
-        m = _op.max(x, axes, keepdims=True)
-        e = _op.exp(x - m)
-        s = _op.sum(e, axes, keepdims=True)
-        return x - m - _op.log(s)
-
-
-class Dropout(OneFlowOpConverter):
-    """Operator converter for Dropout."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        out = AttrCvt("dropout", {"ratio": "rate"}, ignores=["is_test"])
-        return out
-
-
-class Threshold(OneFlowOpConverter):
-    """Operator converter for Threshold."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        threshold = float(attrs.get("threshold_val", 1.0))
-        threshold_tensor = _op.full_like(inputs[0], fill_value=_expr.const(threshold))
-        value = float(attrs.get("value"))
-        value_tensor = _op.full_like(inputs[0], fill_value=_expr.const(value))
-        mask = _op.greater(inputs[0], threshold_tensor)
-        return _op.where(mask, inputs[0], value_tensor)
-
-
-class Elu(OneFlowOpConverter):
-    """Operator converter for Elu"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        alpha = float(attrs.get("alpha", 1.0))
-        return _expr.const(-alpha) * _op.nn.relu(
-            _expr.const(1.0) - _op.exp(inputs[0])
-        ) + _op.nn.relu(inputs[0])
-
-
-class PReLU(OneFlowOpConverter):
-    """Operator converter for PReLU"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        assert len(inputs) == 2, f"PReLU need 2 inputs, but {len(inputs)} given"
-        for i in inputs:
-            if "_input." in str(i):
-                prelu_a = i
-            else:
-                prelu_b = i
-
-        input_shape = shape_of(prelu_a)
-        alpha = _op.broadcast_to_like(prelu_b, prelu_a)
-        alpha = _op.reshape(alpha, [-1])
-
-        output = _op.nn.prelu(_op.reshape(prelu_a, [-1]), alpha, axis=0)
-        out = _op.reshape(output, input_shape)
-        return out
-
-
-class Selu(OneFlowOpConverter):
-    """Operator converter for Selu"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        alpha = float(attrs.get("alpha", 1.67326319217681884765625))
-        gamma = float(attrs.get("gamma", 1.05070102214813232421875))
-        return _expr.const(gamma) * (
-            _expr.const(-alpha) * _op.nn.relu(_expr.const(1.0) - _op.exp(inputs[0]))
-            + _op.nn.relu(inputs[0])
-        )
-
-
-class Silu(OneFlowOpConverter):
-    """Operator converter for Silu"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        a = inputs[0]
-        b = _op.sigmoid(inputs[0])
-        return _op.multiply(a, b)
-
-
-class Gelu(OneFlowOpConverter):
-    """Operator converter for Gelu"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        data = inputs[0]
-        return data * (
-            _expr.const(0.5) + _op.erf(data * _expr.const(0.5**0.5)) * _expr.const(0.5)
-        )
-
-
-class HardTanh(OneFlowOpConverter):
-    """Operator converter for HardTanh"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        tanh_min = attrs.get("min_val", 0.0)
-        tanh_max = attrs.get("max_val", 0.0)
-        return _op.tensor.clip(inputs[0], tanh_min, tanh_max)
-
-
-class Softplus(OneFlowOpConverter):
-    """Operator converter for Softplus"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        data = inputs[0]
-        data_dtype = infer_type(data).checked_type.dtype
-        beta = _expr.const(float(attrs.get("beta", 1.0)))
-        threshold = float(attrs.get("threshold", 20.0))
-        threshold_ = _op.full_like(data, fill_value=_expr.const(threshold))
-        softplus_value = _op.log(_op.exp(data * beta) + _expr.const(1.0, dtype=data_dtype)) / beta
-        return _op.where(_op.greater(data * beta, threshold_), data, softplus_value)
-
-
-class Softsign(OneFlowOpConverter):
-    """Operator converter for Softsign"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return inputs[0] / (_expr.const(1.0) + Absolute.get_converter()(inputs, attrs, params))
-
-
-class Variance(OneFlowOpConverter):
-    """Operator converter for Variance"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        axis = attrs["dim"]
-        keepdims = attrs["keepdim"]
-        unbiased = bool(attrs["unbiased"])
-        return _op.reduce.variance(inputs[0], axis=axis, keepdims=keepdims, unbiased=unbiased)
-
-
-class Concat(OneFlowOpConverter):
-    """Operator converter for Concat"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        attrs.pop("max_dim_size")
-        inputs = _dtype_shape_promotion(inputs)
-        return _op.concatenate(inputs, axis=attrs["axis"])
-
-
-class Clip(OneFlowOpConverter):
-    """Operator converter for Clip"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        attr = {}
-        dtype = infer_type(inputs[0])
-
-        if "float" in str(dtype):
-            attr["a_min"] = attrs["floating_min"]
-            attr["a_max"] = attrs["floating_max"]
-        elif "int" in str(dtype):
-            attr["a_min"] = attrs["integral_min"]
-            attr["a_max"] = attrs["integral_max"]
-        else:
-            attr["a_min"] = -np.inf
-            attr["a_max"] = np.inf
-
-        out = AttrCvt("clip")(inputs, attr, params)
-        return out
-
-
-class Slice(OneFlowOpConverter):
-    """Operator converter for Slice"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        starts = list(attrs["start"])
-        ends = list(attrs["stop"])
-        steps = list(attrs["step"])
-        return _op.strided_slice(inputs[0], starts, ends, steps)
-
-
-class Split(OneFlowOpConverter):
-    """Operator converter for Split"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        splits = attrs.get("split", None)
-        if splits is not None:
-            indices = []
-            attrs["indices_or_sections"] = []
-            index = 0
-            for i in splits[:-1]:
-                index += i
-                indices.append(index)
-        output = _op.split(inputs[0], indices, attrs.get("axis", 0))
-        # If the output of split is a single value, unpack if from the TupleWrapper
-        if len(output) == 1:
-            output = output[0]
-        return output
-
-
-class Scatter(OneFlowOpConverter):
-    """Operator converter for Scatter"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        axis = attrs.get("axis", 0)
-        return _op.scatter_elements(inputs[0], inputs[1], inputs[2], axis)
-
-
-class Unsqueeze(OneFlowOpConverter):
-    """Operator converter for Unsqueeze"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        axes = sorted(attrs["axes"])
-        for axis in axes:
-            inputs[0] = _op.expand_dims(inputs[0], axis=axis, num_newaxis=1)
-        return inputs[0]
-
-
-class Sign(OneFlowOpConverter):
-    """Operator converter for Sign"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.sign(inputs[0])
-
-
-class Reciprocal(OneFlowOpConverter):
-    """Operator converter for Reciprocal"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        dtype = infer_type(inputs[0]).checked_type.dtype
-        return _expr.const(1.0, dtype=dtype) / inputs[0]
-
-
-class Erf(OneFlowOpConverter):
-    """Operator converter for Erf"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _op.erf(inputs[0])
-
-
-class Erfc(OneFlowOpConverter):
-    """Operator converter for Erfs"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        return _expr.const(1.0) - _op.erf(inputs[0])
-
-
-class HardSigmoid(OneFlowOpConverter):
-    """Operator converter for HardSigmoid"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        alpha = attrs.get("alpha", 0.2)
-        beta = attrs.get("beta", 0.5)
-        transformX = (inputs[0] * _expr.const(alpha)) + _expr.const(beta)
-        attr = {"a_min": 0, "a_max": 1}
-        return AttrCvt("clip")([transformX], attr)
-
-
-class OneHot(OneFlowOpConverter):
-    """Operator converter for OneHot"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        # Extract relay one_hot inputs.
-        indices, depth, values = inputs
-        ndim = len(infer_shape(indices))
-        # Split onnx on off values into two separate expressions.
-        off_value, on_value = _op.take(values, _op.const(0)), _op.take(values, _op.const(1))
-        # Extract the datatype of the output from on_value.
-        dtype = infer_type(on_value).checked_type.dtype
-        ind_dtype = infer_type(indices).checked_type.dtype
-        # Normalize the indices to a positive range
-        indices = _op.where(
-            indices < _op.const(0, ind_dtype), indices + _op.cast(depth, ind_dtype), indices
-        )
-        # set default value when axis is not set in the model
-        axis = attrs.get("axis", -1)
-        if axis < 0:
-            axis += ndim + 1
-
-        return _op.one_hot(indices, on_value, off_value, depth, axis, dtype=dtype)
-
-
-class Where(OneFlowOpConverter):
-    """Operator converter for Where"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        condition_rank = len(infer_shape(inputs[0]))
-        x_rank = len(infer_shape(inputs[1]))
-        y_rank = len(infer_shape(inputs[2]))
-        ranks = [condition_rank, x_rank, y_rank]
-
-        # If one rank is longer than others, then we can broadcast
-        # to that shape.
-        max_rank = max(ranks)
-        max_rank_idxs = [i for i, x in enumerate(ranks) if x == max_rank]
-        broadcast_shape = shape_of(inputs[max_rank_idxs[0]])
-        # If two or more inputs have the same rank, compute the broadcast
-        # shape by taking the maximum value of each dimensions.
-        if len(max_rank_idxs) > 1:
-            for idx in max_rank_idxs:
-                broadcast_shape = _op.maximum(broadcast_shape, shape_of(inputs[idx]))
-
-        broadcast_shape = fold_constant(broadcast_shape)
-
-        condition = _op.broadcast_to(inputs[0], broadcast_shape)
-        x = _op.broadcast_to(inputs[1], broadcast_shape)
-        y = _op.broadcast_to(inputs[2], broadcast_shape)
-        return _op.where(condition, x, y)
-
-
-class Constant(OneFlowOpConverter):
-    """Operator converter for Constant"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        is_float = attrs.get("is_floating_value", True)
-        shape = attrs.get("shape", (1,))
-        if is_float:
-            dtype = "float32"
-            value = attrs.pop("floating_value")
-        else:
-            dtype = "int8"
-            value = attrs.pop("integer_value")
-        np_array = np.zeros(shape)
-        np_array.fill(value)
-        value = _expr.const(np_array, dtype)
-        return value
-
-
-class Range(OneFlowOpConverter):
-    """Operator converter for Range"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        if len(inputs) != 0:
-            raise ValueError(f"Expect no inputs but get {len(inputs)}")
-        start = attrs.get("start", 0.0)
-        limit = attrs.get("limit", 1.0)
-        delta = attrs.get("delta", 1.0)
-        return _op.arange(
-            _expr.const(start, dtype="float32"),
-            _expr.const(limit, dtype="float32"),
-            _expr.const(delta, dtype="float32"),
-        )
-
-
-class Cast(OneFlowOpConverter):
-    """Operator converter for Cast"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attrs, params):
-        attrs["dtype"] = infer_type(inputs[0]).checked_type.dtype
-        return AttrCvt(op_name="cast")(inputs, attrs)
-
-
-def get_convert_map():
-    # supported oneflow2relay op
-    return {
-        # defs/math
-        "bias_add": Add.get_converter(),
-        "scalar_add": ScalarAdd.get_converter(),
-        "scalar_mul": ScalarMul.get_converter(),
-        "scalar_div": ScalarDiv.get_converter(),
-        "scalar_pow": ScalarPow.get_converter(),
-        "reduce_sum": ReduceSum.get_converter(),
-        "reduce_max": ReduceMax.get_converter(),
-        "reduce_min": ReduceMin.get_converter(),
-        "reduce_mean": ReduceMean.get_converter(),
-        "broadcast_add": BroadcastAdd.get_converter(),
-        "broadcast_mul": BroadcastMul.get_converter(),
-        "broadcast_sub": BroadcastSub.get_converter(),
-        "broadcast_div": BroadcastDiv.get_converter(),
-        "scalar_logical_greater": LogicalGreater.get_converter(),
-        "log": Renamer("log"),
-        "log1p": Log1p.get_converter(),
-        "acos": Renamer("acos"),
-        "acosh": Renamer("acosh"),
-        "asin": Renamer("asin"),
-        "asinh": Renamer("asinh"),
-        "atan": Renamer("atan"),
-        "atanh": Renamer("atanh"),
-        "cos": Renamer("cos"),
-        "cosh": Renamer("cosh"),
-        "sin": Renamer("sin"),
-        "sinh": Renamer("sinh"),
-        "tan": Renamer("tan"),
-        "tanh": Renamer("tanh"),
-        "pow": Pow.get_converter(),
-        "exp": Renamer("exp"),
-        "expm1": Expm1.get_converter(),
-        "floor": Renamer("floor"),
-        "ceil": Renamer("ceil"),
-        "round": Renamer("round"),
-        "add_n": AddN.get_converter(),
-        "sqrt": Renamer("sqrt"),
-        "rsqrt": Renamer("rsqrt"),
-        "square": Square.get_converter(),
-        "sign": Sign.get_converter(),
-        "erf": Erf.get_converter(),
-        "erfc": Erfc.get_converter(),
-        "reciprocal": Reciprocal.get_converter(),
-        # defs/activation
-        "softmax": Softmax.get_converter(),
-        "softsign": Softsign.get_converter(),
-        "hardtanh": HardTanh.get_converter(),
-        "relu": Renamer("relu"),
-        "leaky_relu": Renamer("leaky_relu"),
-        "prelu": PReLU.get_converter(),
-        "threshold": Threshold.get_converter(),
-        "selu": Selu.get_converter(),
-        "silu": Silu.get_converter(),
-        "gelu": Gelu.get_converter(),
-        # defs/nn
-        "conv2d": Conv2d.get_converter(),
-        "deconv2d": ConvTranspose2d.get_converter(),
-        "max_pool_2d": MaxPool2d.get_converter(),
-        "avg_pool_2d": AveragePool2d.get_converter(),
-        "maxpool_2d": MaxPool2d.get_converter(),  # Maintained for oneflow versions <= "0.7.0"
-        "avgpool_2d": AveragePool2d.get_converter(),  # Maintained for oneflow versions <= "0.7.0"
-        "adaptive_avg_pool2d": AdaptiveAvgPool2d.get_converter(),
-        "adaptive_max_pool2d": AdaptiveMaxPool2d.get_converter(),
-        "dropout": Dropout.get_converter(),
-        "normalization": BatchNorm.get_converter(),
-        "upsample_nearest_2d": UpsampleNearest.get_converter(),
-        "upsample_bilinear_2d": UpsampleBiLinear.get_converter(),
-        # defs/tensor
-        "matmul": MatMul.get_converter(),
-        "batch_matmul": MatMul.get_converter(),
-        "broadcast_matmul": MatMul.get_converter(),
-        "concat": Concat.get_converter(),
-        "clip_by_scalar": Clip.get_converter(),
-        "slice": Slice.get_converter(),
-        "expand": Expand.get_converter(),
-        "transpose": Transpose.get_converter(),
-        "expand_dims": ExpandDim.get_converter(),
-        "range": Range.get_converter(),
-        "cast": Cast.get_converter(),
-        # defs/others
-        "reshape": Reshape.get_converter(),
-        "constant": Constant.get_converter(),
-        "where": Where.get_converter(),
-        "flatten": Flatten.get_converter(),
-        "sigmoid": Renamer("sigmoid"),
-        "sigmoid_v2": Renamer("sigmoid"),
-        "hardsigmoid": HardSigmoid.get_converter(),
-        "softplus": Softplus.get_converter(),
-        "squeeze": AttrCvt("squeeze", {"axes": "axis"}),
-        "unsqueeze": Unsqueeze.get_converter(),
-        "identity": Renamer("copy"),
-        "var": Variance.get_converter(),
-    }
-
-
-class oneflow_input(object):
-    """
-    Dual purpose list or dictionary access object
-    """
-
-    def __init__(self):
-        self.input_keys = []
-        self.input_dict = {}
-        self.n = 0
-
-    def __getitem__(self, item):
-        if isinstance(item, int):
-            if item > (len(self.input_keys) - 1):
-                return None
-            return self.input_dict[self.input_keys[item]]
-        if isinstance(item, str):
-            if item not in self.input_keys:
-                return None
-            return self.input_dict[item]
-        if isinstance(item, slice):
-            keys = self.input_keys[item]
-            return [self.input_dict[key] for key in keys]
-
-        raise ValueError("Only integer, string, and slice accesses allowed.")
-
-    def __setitem__(self, item, value):
-        if isinstance(item, int):
-            self.input_dict[self.input_keys[item]] = value
-        elif isinstance(item, str):
-            self.input_keys.append(item)
-            self.input_dict[item] = value
-        else:
-            raise ValueError("Only integer and string indexed writes allowed.")
-
-    def keys(self):
-        return self.input_keys
-
-    def __len__(self):
-        return len(self.input_keys)
-
-    def __iter__(self):
-        self.n = 0
-        return self
-
-    def __next__(self):
-        if self.n < len(self.input_keys):
-            output = self.input_dict[self.input_keys[self.n]]
-            self.n += 1
-            return output
-
-        raise StopIteration
-
-
-def deal_with_input_convert(
-    node_input, node_input_shape, node_input_dtype, node_path, _nodes, _input_path_2_name
-):
-    """deal with input convert in oneflow."""
-    if node_input not in _nodes:
-        if (
-            node_path not in _input_path_2_name
-            or "_input." in node_input
-            or "FreeEagerTensor" in node_input
-        ):
-            _nodes[node_input] = new_var(node_input, shape=node_input_shape, dtype=node_input_dtype)
-        else:
-            names = _input_path_2_name[node_path]
-            node_replace = None
-            for k in names:
-                if k in _nodes:
-                    node_replace = k
-            if node_replace is not None:
-                op_replace = copy.deepcopy(_nodes[node_replace])
-                _nodes[node_input] = op_replace
-            else:
-                print(f"{node_input} will not be in _nodes")
-
-
-def deal_parameter_convert(
-    node_input_paths, model_dir_path, _input_path_2_name, _model_array, _params, _nodes
-):
-    """deal with parameter(weight) convert in oneflow."""
-    for node_input_path in node_input_paths:
-        node_path = os.path.join(model_dir_path, node_input_path.replace("m.", "", 1))
-        node_input_name = node_input_path.split("/")[0]
-        _input_path_2_name[node_path] = node_input_name
-        for param_name in _model_array:
-            node_p = _model_array[param_name]
-            if node_path == node_p["path"]:
-                node_array = node_p["params"]
-                _params[node_input_name] = node_array
-                _nodes[node_input_name] = new_var(
-                    node_input_name, shape=node_array.shape, dtype=str(node_array.dtype)
-                )
-                break
-
-
-class OneflowGraph(object):
-    """
-    A helper class for handling Relay expression
-
-    Parameters
-    ----------
-    shape : dict of str to tuple, optional
-        The input shape to the graph
-    dtype : dict of str to str
-        The input types to the graph
-
-    node name:
-    1. param: m.layer4.1.bn1.weight / ...
-    2. buffer: m.layer4.1.bn1.running_mean / ...
-    3. node inputs: m.layer4.1.bn1_input.0
-    4. node outputs: m.layer4.1.bn1_output.0
-    """
-
-    def __init__(self, shape, dtype, nodes, model_dir_path):
-        self._nodes = {}
-        self._params = {}
-        self._inputs = {}
-        self._num_input = 0
-        self._num_param = 0
-        self._input_names = []
-        self._model_array = {}
-        self._input_path_2_name = {}
-        self._output_path_2_name = {}
-        self._init_variable_node = []
-        self._shape = shape
-        self._dtype = dtype
-        self._identity_list = []
-        self._sort_inputs = {}
-
-        import oneflow
-
-        model = oneflow.load(model_dir_path)
-        # model_array: keys: layer_name, values: dict('path', 'params')
-        for layer_name in model:
-            layer = model[layer_name]
-            layer_node = {}
-            layer_node["path"] = os.path.join(model_dir_path, layer_name, "out")  # get path
-            if "System-Train" in layer_name:
-                continue
-            node_name = "m." + layer_name
-            shape = self._shape[node_name]
-            dtype = self._dtype[node_name]
-            array = layer.detach().cpu().numpy()
-            layer_node["params"] = array.reshape(shape)
-            self._model_array[layer_name] = layer_node
-
-        for node_name in nodes:
-            node = nodes[node_name]
-            if is_user_op(node):
-                for input_name in node.user_conf.input:
-                    node_input_paths = getattr(node.user_conf.input[input_name], "s")
-                    deal_parameter_convert(
-                        node_input_paths,
-                        model_dir_path,
-                        self._input_path_2_name,
-                        self._model_array,
-                        self._params,
-                        self._nodes,
-                    )
-                for output_name in node.user_conf.output:
-                    node_output_paths = getattr(node.user_conf.output[output_name], "s")
-                    for node_output_path in node_output_paths:
-                        node_path = os.path.join(model_dir_path, node_output_path.replace("m.", ""))
-                        node_output_name = node_output_path.split("/")[0]
-                        self._output_path_2_name[node_path] = node_output_name
-            elif is_output_op(node):
-                node_output_path = getattr(node.output_conf, "in")
-                output_path = os.path.join(
-                    model_dir_path, getattr(node.output_conf, "in").replace("m.", "")
-                )
-                self._output_path_2_name[output_path] = node_name
-            elif is_param_op(node):
-                if "FreeEagerTensor" in node.name:
-                    shape = tuple(node.variable_conf.shape.dim)
-                    dtype = FLOW_2_STR_DTYPE[node.variable_conf.data_type]
-                    self._shape[node.name] = shape
-                    self._dtype[node.name] = dtype
-                    self._init_variable_node.append(node.name)
-        if self._init_variable_node != []:
-            print(f"{self._init_variable_node} should be defined by user")
-
-    def _parse_input(self, node, model_dir_path):
-        input_user_conf_list = []
-        for input_name in node.user_conf.input:
-            input_user_conf_list.append(input_name)
-        input_user_conf_list.sort()
-        for input_name in input_user_conf_list:
-            node_input_paths = getattr(node.user_conf.input[input_name], "s")
-            for i in node_input_paths:
-                node_input = i.split("/")[0]
-                node_input_shape = self._shape[node_input]
-                node_input_dtype = self._dtype[node_input]
-                node_path = os.path.join(model_dir_path, i.replace("m.", ""))
-                deal_with_input_convert(
-                    node_input,
-                    node_input_shape,
-                    node_input_dtype,
-                    node_path,
-                    self._nodes,
-                    self._input_path_2_name,
-                )
-
-    def _parse_output(self, op_name, outputs, cnt_init=0):
-        """
-        o: m.classifier.1_output.xxx
-        new_o: m.classifier.1-conv2d_0
-        "_"+new_o_xxx is in self._shape
-        """
-        for o in outputs:
-            if "_output." not in o:
-                new_o = o.replace("-" + op_name, "_output")
-                new_o = new_o.replace("-" + new_o.split("-")[-1], ".0")
-                for k in self._shape.keys():
-                    if new_o in k:
-                        self._shape[o] = self._shape[k]
-                        self._dtype[o] = self._dtype[k]
-                        break
-            elif len(outputs) > 1:
-                outputs.remove(o)
-        if op_name.lower() == "dropout":
-            if len(outputs) == 1:
-                return outputs
-            outputs = outputs[:-1]
-        elif op_name.lower() == "constant":
-            outputs = [self._init_variable_node[cnt_init]]
-
-        if len(outputs) > 1:
-            outputs = list(set(outputs))
-
-        return outputs
-
-    def from_oneflow(self, nodes, model_dir_path):
-        """
-        Implementation of convert the OneFlow model into an equivalent Relay Function.
-        """
-        # step 1: find out if unsupported ops are used
-        convert_map = get_convert_map()
-        unsupported_ops = set()
-        for node_name in nodes:
-            node = nodes[node_name]
-            if is_user_op(node):
-                # op names, not the layer names
-                op_name = node.user_conf.op_type_name
-                if (
-                    op_name not in convert_map
-                    and "constant" not in op_name
-                    and op_name not in self._identity_list
-                ):
-                    unsupported_ops.add(op_name)
-        # find out the unsupported op
-        if unsupported_ops:
-            msg = "The following operators are not supported for frontend OneFlow: "
-            msg += ", ".join(unsupported_ops)
-            raise tvm.error.OpNotImplemented(msg)
-
-        # step 2: convert op
-        for node_name in nodes:
-            node = nodes[node_name]
-            if is_user_op(node):
-                # If there is a user-defined node, skip the following steps
-                if node_name in self._inputs:
-                    continue
-
-                op_name = node.user_conf.op_type_name
-                op_attr = parse_attr(node.user_conf.attr)
-
-                self._parse_input(node, model_dir_path=model_dir_path)
-
-                node_inputs = oneflow_input()
-                input_user_conf_list = []
-                for input_name in node.user_conf.input:
-                    input_user_conf_list.append(input_name)
-                input_user_conf_list.sort()
-                for input_name in input_user_conf_list:
-                    node_input_paths = getattr(node.user_conf.input[input_name], "s")
-                    for i in node_input_paths:
-                        node_input = i.split("/")[0]
-                        node_inputs[node_input] = self._nodes[node_input]
-
-                node_outputs = []
-                for output_name in node.user_conf.output:
-                    node_output_paths = getattr(node.user_conf.output[output_name], "s")
-                    for i in node_output_paths:
-                        node_output_path = os.path.join(model_dir_path, i.replace("m.", ""))
-                        if node_output_path in self._input_path_2_name:
-                            node_outputs.append(self._input_path_2_name[node_output_path])
-                        elif node_output_path in self._output_path_2_name:
-                            node_outputs.append(self._output_path_2_name[node_output_path])
-                node_outputs = self._parse_output(op_name, node_outputs)
-
-                # convert
-                op = self._convert_operator(op_name, node_inputs, op_attr)
-
-                if not isinstance(op, _expr.TupleWrapper):
-                    outputs_num = 1
-                else:
-                    outputs_num = len(op)
-
-                assert (
-                    len(node_outputs) == outputs_num
-                ), f"Number of output mismatch {len(node_outputs)} vs {outputs_num} in {op_name}."
-                if outputs_num == 1:
-                    op = fold_constant(op)
-                else:
-                    op = _expr.TupleWrapper(fold_constant(op.astuple()), len(op))
-
-                op_temp = []
-                op_temp.append(op)
-                for i, _ in enumerate(node_outputs):
-                    if isinstance(node_outputs[i], list):
-                        for k in node_outputs[i]:
-                            self._nodes[k] = op_temp[i]
-                    else:
-                        self._nodes[node_outputs[i]] = op_temp[i]
-
-        # step 3: get the outputs
-        outputs = []
-        for node_name, node in nodes.items():
-            if is_output_op(node):
-                node_name_v2 = getattr(node.output_conf, "in").split("/")[0]
-                if node_name in self._nodes:
-                    outputs.append(self._nodes[node_name])
-                elif node_name_v2 in self._nodes:
-                    outputs.append(self._nodes[node_name_v2])
-        outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-
-        # step 4: get the relay IR
-        free_vars = analysis.free_vars(outputs)
-
-        nodes = {v: k for k, v in self._nodes.items()}
-        free_vars = [nodes[var] for var in free_vars]
-        free_vars_inputs = []
-        free_vars_parameters = []
-        for x in free_vars:
-            if "_input.0" in x:
-                free_vars_inputs.append(x)
-            else:
-                free_vars_parameters.append(x)
-        free_vars = free_vars_inputs + free_vars_parameters
-
-        # step 5: make sure the '_input.0' is the first in self._inputs
-        for free_var in free_vars:
-            if free_var not in self._inputs:
-                self._inputs[free_var] = self._nodes[free_var]
-
-        input_names = list(self._inputs.keys())
-        for input_name in input_names:
-            if input_name in self._inputs:
-                self._sort_inputs[input_name] = self._inputs[input_name]
-            else:
-                raise IndexError(f"{input_name} is not in self._inputs")
-
-        # step 6: create a function from our output expression and all input variables.
-        func = _function.Function([v for _, v in self._sort_inputs.items()], outputs)
-
-        return IRModule.from_expr(func), self._params
-
-    def _convert_operator(self, op_name, node_inputs, op_attr):
-        """
-        Parameters
-        ----------
-        op_name : str
-            Operator name, such as conv2d and relu
-        node_inputs : list of tvm.relay.function.Function
-            List of inputs.
-        op_attr : dict
-            Dict of operator attributes
-
-        Returns
-        -------
-        sym : tvm.relay.function.Function
-            Converted relay function
-        """
-        convert_map = get_convert_map()
-        if op_name in self._identity_list:
-            sym = get_relay_op(op_name)(*node_inputs, **op_attr)
-        elif op_name in convert_map:
-            sym = convert_map[op_name](node_inputs, op_attr, self._params)
-        else:
-            raise NotImplementedError(f"Operator {op_name} not implemented.")
-
-        return sym
-
-
-def from_oneflow(graph, model_dir_path):
-    """Convert a OneFlow model into an equivalent Relay Function.
-
-    At present, there are two ways to run models in deep learning framework
-    Dynamic Graph and Static Graph, which are also called Eager Mode and Graph
-    Mode in OneFlow.
-
-    In general, dynamic graphs are easier to use and static graphs have better performance.
-    OneFlow offers nn.Graph, so that users can use the eager-like programming style to build
-    static graphs and train the models.
-
-    We utilize the intermediate representation of nn.Graph to convert the OneFlow model to Relay.
-
-    Parameters
-    ----------
-    nodes : dict, keys: node.name, value: node
-        contain the graph
-    model_dir_path: str
-        The path of weight
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The returned relay module
-    params : dict
-        A dict of name: tvm.nd.array pairs, used as pretrained weights
-    """
-    try:
-        import oneflow as flow
-    except ImportError:
-        raise ImportError("please check that OneFlow is installed")
-
-    # get info of nodes
-    shape = {}
-    dtype = {}
-    graph_str = repr(graph)
-    size_where = 2
-    if "cuda" in graph_str:
-        size_where = 3
-
-    p_size = re.compile(r"size=\(.*?\)", re.S)
-    p_type = re.compile(r"dtype=.*?\)", re.S)
-    types = ["INPUT", "PARAMETER", "BUFFER", "OUTPUT"]
-    for t in types:
-        data = re.finditer(t + ":.*", graph_str)
-        for i in data:
-            attrs = i.group().split(":")
-            size_str = re.findall(p_size, attrs[size_where])
-            type_str = re.findall(p_type, attrs[size_where])
-            assert size_str != [], "size should not be None, please check your repr(graph)"
-
-            size_attr = size_str[0].replace("size=", "")
-            if size_attr[-2] == ",":
-                size_attr = size_attr.replace(",", "")
-            if size_attr == "()":
-                data_size = ()
-            else:
-                data_size = tuple(map(int, size_attr[1:-1].split(", ")))
-            node_name = attrs[1]
-            shape[node_name] = data_size
-            dtype[node_name] = "float32"
-
-            if type_str != []:
-                type_attr = type_str[0].replace("dtype=", "").replace(")", "")
-                if type_attr[-1] == ",":
-                    type_attr = type_attr.replace(",", "")
-                dtype[node_name] = type_attr.replace("oneflow.", "")
-
-    # get graph proto, if you don't _compile the graph, the _graph_proto will be None
-    graph_input = re.search(r"INPUT:.*", graph_str).group().split(":")
-    shape_input = tuple(
-        map(
-            int,
-            re.findall(p_size, graph_input[size_where])[0].replace("size=", "")[1:-1].split(", "),
-        )
-    )
-    if not graph._is_compiled:
-        graph._compile(flow.rand(shape_input))
-    graph_proto = graph._graph_proto
-
-    # get all nodes
-    nodes = OrderedDict()
-    for op in graph_proto.net.op:
-        nodes[op.name] = op
-
-    g = OneflowGraph(shape, dtype, nodes, model_dir_path)
-
-    # Use the graph proto as a scope so that ops can access other nodes if needed.
-    mod, params = g.from_oneflow(nodes=nodes, model_dir_path=model_dir_path)
-
-    return mod, params
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
deleted file mode 100644
index 8da8a5b11262..000000000000
--- a/python/tvm/relay/frontend/onnx.py
+++ /dev/null
@@ -1,7273 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines
-# pylint: disable=import-outside-toplevel
-"""ONNX: Open Neural Network Exchange frontend for Relay."""
-import copy
-import math
-import warnings
-from typing import Optional
-
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.ir import IRModule
-from tvm.topi.utils import get_const_tuple
-
-from ... import nd as _nd
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import loops as _loops
-from .. import op as _op
-from .. import qnn as _qnn
-from .. import random as _random
-from .. import ty as _ty
-from .. import vision as _vision
-from .common import (
-    AttrCvt,
-    Renamer,
-    autopad,
-    ensure_scalar_shape,
-    fold_constant,
-    get_name,
-    get_relay_op,
-    gru_cell,
-    infer_channels,
-    infer_shape,
-    infer_type,
-    infer_value,
-    lstm_cell,
-    new_var,
-    rnn_cell,
-    shape_of,
-    try_resolve_var_to_const,
-    unbind,
-    set_span,
-)
-
-__all__ = ["from_onnx"]
-
-# The default configurations of Relay ONNX frontend.
-ONNX_DEFAULT_CONFIGS = {
-    # By default, TVM converts qualified onnx `matmul` to `transpose(weight) + nn.batch_matmul_NT`.
-    # Change this flag to False to directly convert to `nn.batch_matmul`.
-    # Note that `nn.batch_matmul` with format other than NT is in experimental, it may have some
-    # performance issues.
-    "use_nt_batch_matmul": True
-}
-
-
-class onnx_input(list):
-    """A helper extension to list that returns None for out of bound indices."""
-
-    def __getitem__(self, item):
-        if isinstance(item, slice):
-            if item.stop is None:
-                stop = len(self)
-            else:
-                stop = item.stop
-            indices = list(range(stop)[item])
-            return [self[i] for i in indices]
-        if isinstance(item, int):
-            return list(self)[item] if item < len(self) else None
-        raise TypeError(f"list indices must be integers or slices, not {type(item).__name__}")
-
-
-def get_numpy(tensor_proto):
-    """Grab data in TensorProto and convert to numpy array."""
-    try:
-        from onnx.numpy_helper import to_array
-    except ImportError as e:
-        raise ImportError(f"Unable to import onnx which is required {e}")
-    return to_array(tensor_proto)
-
-
-def get_type(elem_type):
-    """Converts onnx integer datatype to numpy datatype"""
-    # If a string was passed instead of a tensor type, it does not need
-    # conversion and can be returned.
-    if isinstance(elem_type, str):
-        return elem_type
-
-    try:
-        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-    except ImportError as e:
-        raise ImportError(f"Unable to import onnx which is required {e}")
-
-    try:
-        from onnx import TensorProto
-    except ImportError as e:
-        raise ImportError(f"Unable to import TensorProto from onnx {e}")
-
-    # Onnx mapping converts bfloat16 to float16 because
-    # numpy does not have a bfloat16 data type. However,
-    # tvm has one, so we force the return type to be bfloat16
-    if elem_type == int(TensorProto.BFLOAT16):
-        return "bfloat16"
-    return str(TENSOR_TYPE_TO_NP_TYPE[elem_type])
-
-
-def get_info(info_proto):
-    """Extract the shape from a ValueInfoProto."""
-    shape = []
-    shape_name = []
-    for dim in info_proto.type.tensor_type.shape.dim:
-        name = dim.dim_param
-        value = dim.dim_value
-        if value is None or value == 0:
-            value = _ty.Any()
-            shape_name.append(name)
-        else:
-            shape_name.append(value)
-        shape.append(value)
-
-    name = info_proto.name
-    if info_proto.type.tensor_type.elem_type:
-        dtype = get_type(info_proto.type.tensor_type.elem_type)
-    else:
-        dtype = None
-    return name, shape, dtype, shape_name
-
-
-def dimension_picker(prefix, suffix=""):
-    """Check that dimensions are supported."""
-
-    def _impl(attr):
-        kernel = attr["kernel_shape"]
-        if len(kernel) == 1:
-            return prefix + "1d" + suffix
-        if len(kernel) == 2:
-            return prefix + "2d" + suffix
-        if len(kernel) == 3:
-            return prefix + "3d" + suffix
-        op_name = prefix + "1d/2d/3d"
-        msg = f"Only 1D, 2D, and 3D kernels are supported for operator {op_name}."
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    return _impl
-
-
-def revert_caffe2_pad(pads):
-    """Caffe2 requires two times the normal padding."""
-    if len(pads) == 4:
-        pads = pads[:2]
-    elif len(pads) == 2:
-        pass
-    else:
-        raise tvm.error.OpAttributeInvalid("Number of pads must be either 2 or 4.")
-    return pads
-
-
-def get_pad_pair(input1d, kernel1d, stride1d, mode):
-    """infer pad size"""
-    if input1d % stride1d == 0:
-        pad = max(kernel1d - stride1d, 0)
-    else:
-        pad = max(kernel1d - (input1d % stride1d), 0)
-    pad_before = pad // 2
-    pad_after = pad - pad_before
-    if "LOWER" in mode:
-        return [pad_after, pad_before]
-    return [pad_before, pad_after]
-
-
-def onnx_default_layout(dims, op_name):
-    if dims == 1:
-        return "NCW"
-    if dims == 2:
-        return "NCHW"
-    if dims == 3:
-        return "NCDHW"
-
-    msg = f"Only 1D, 2D and 3D layouts are currently supported for operator {op_name}."
-    raise tvm.error.OpAttributeInvalid(msg)
-
-
-def onnx_storage_order2layout(storage_order, dims, op_name):
-    """converter of onnx storage order parameter to tvm storage order format"""
-    if storage_order not in (0, 1):
-        raise tvm.error.OpAttributeInvalid("Mode of storage_order must be either 0 or 1")
-
-    if dims == 1:
-        return "NCW" if storage_order == 0 else "NWC"
-    if dims == 2:
-        return "NCHW" if storage_order == 0 else "NHWC"
-    if dims == 3:
-        return "NCDHW" if storage_order == 0 else "NDHWC"
-
-    msg = f"Only 1D, 2D and 3D layouts are currently supported for operator {op_name}."
-    raise tvm.error.OpAttributeInvalid(msg)
-
-
-def dimension_constraint():
-    def _dim_check(attrs):
-        if len(attrs["kernel_shape"]) in [1, 2, 3]:
-            return True
-        return False
-
-    return _dim_check, "Only 1d, 2d and 3d kernel supported."
-
-
-def get_scalar(x, params, dtype="float32"):
-    """Helper to get a scalar value for Quantized operators."""
-    if isinstance(x, _expr.Var) and x.name_hint in params:
-        return _op.const(params[x.name_hint].numpy(), dtype)
-    rank = len(infer_shape(x))
-    assert rank <= 1, "scale and zero_point input must be scalars"
-    if rank == 1:
-        x = _op.squeeze(x, [0])
-    return _op.cast(x, dtype)
-
-
-def get_scalar_or_1d_tensor(x, params, dtype="float32"):
-    """Helper to get a scalar value or 1D tensor for Quantized operators."""
-    if isinstance(x, _expr.Var) and x.name_hint in params:
-        return _op.const(params[x.name_hint].numpy(), dtype)
-    rank = len(infer_shape(x))
-    assert rank <= 1, "scale and zero_point input must be scalars or 1D tensors"
-    return _op.cast(x, dtype)
-
-
-def flatten_to_nd(x, x_shape, nd=3):
-    """Flatten input tensor to nd rank"""
-    ndims = infer_shape(x_shape)[0]
-    if ndims == nd:
-        return x
-    newshape = _op.concatenate(
-        [
-            _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype),
-            _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]),
-        ],
-        0,
-    )
-    out = _op.reshape(x, fold_constant(newshape))
-    return out
-
-
-def matmul_out_dtype(inputs, out_dtype):
-    """Common function to handle MatMul and MatMulInteger16"""
-    a_shape = shape_of(inputs[0])
-    a_rank = infer_shape(a_shape)[0]
-    b_shape = shape_of(inputs[1])
-    b_rank = infer_shape(b_shape)[0]
-    if a_rank > 2 or b_rank > 2:
-        # Determine the output batch dimension.
-        new_a_shape = a_shape
-        new_b_shape = b_shape
-        if a_rank > b_rank:
-            rank_diff = a_rank - b_rank
-            new_b_shape = _op.concatenate(
-                [
-                    _expr.const([1] * rank_diff, dtype=infer_type(b_shape).checked_type.dtype),
-                    b_shape,
-                ],
-                0,
-            )
-        elif a_rank < b_rank:
-            rank_diff = b_rank - a_rank
-            new_a_shape = _op.concatenate(
-                [
-                    _expr.const([1] * rank_diff, dtype=infer_type(a_shape).checked_type.dtype),
-                    a_shape,
-                ],
-                0,
-            )
-        else:
-            pass
-
-        out_batch = _op.concatenate(
-            [
-                _op.maximum(
-                    _op.strided_slice(new_b_shape, [i], [i + 1]),
-                    _op.strided_slice(new_a_shape, [i], [i + 1]),
-                )
-                for i in range(max(a_rank, b_rank) - 2)
-            ],
-            0,
-        )
-
-        b_type = infer_type(inputs[1])
-        # Convert to dense if the second matrix is 2d and non-dynamic
-        if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type):
-            a = flatten_to_nd(inputs[0], a_shape, 2)
-            b = _op.transpose(inputs[1])
-            output = _op.nn.dense(a, b, out_dtype=out_dtype)
-        elif a_rank == 1 or b_rank == 1:
-            a, b = inputs
-            _a_shape = tuple(a_shape.data.numpy())
-            _b_shape = tuple(b_shape.data.numpy())
-            if a_rank == 1:
-                axis = -2
-                a = _op.expand_dims(a, axis=0)
-                batches = _b_shape[:-2]
-                a = _op.broadcast_to(a, (*batches, 1, _a_shape[0]))
-            else:
-                axis = -1
-                b = _op.expand_dims(b, axis=-1)
-                batches = _a_shape[:-2]
-                b = _op.broadcast_to(b, (*batches, _b_shape[0], 1))
-            return _op.squeeze(_op.nn.batch_matmul(a, b, transpose_b=False), axis=axis)
-        else:
-            a = inputs[0]
-            b = inputs[1]
-            # broadcast a and b
-            a_broadcasted_shape = fold_constant(
-                _op.concatenate([out_batch, _op.strided_slice(a_shape, [a_rank - 2], [a_rank])], 0)
-            )
-            b_broadcasted_shape = fold_constant(
-                _op.concatenate([out_batch, _op.strided_slice(b_shape, [b_rank - 2], [b_rank])], 0)
-            )
-            if not tvm.ir.structural_equal(a_shape, a_broadcasted_shape):
-                a = _op.transform.broadcast_to(a, a_broadcasted_shape)
-            if not tvm.ir.structural_equal(b_shape, b_broadcasted_shape):
-                b = _op.transform.broadcast_to(b, b_broadcasted_shape)
-            # Convert a and b into 3 dimensional tensors.
-            a = flatten_to_nd(a, shape_of(a), 3)
-            b = flatten_to_nd(b, shape_of(b), 3)
-            if ONNX_DEFAULT_CONFIGS["use_nt_batch_matmul"]:
-                # Transpose matrix dimensions of b.
-                bt = _op.transpose(b, [0, 2, 1])
-                # Perform a NT batch matmul.
-                output = _op.nn.batch_matmul(a, bt, out_dtype=out_dtype)
-            else:
-                # Perform a NN batch matmul.
-                output = _op.nn.batch_matmul(a, b, out_dtype=out_dtype, transpose_b=False)
-        # Reshape output to original dimensions.
-        final_shape = _op.concatenate(
-            [
-                out_batch,
-                _op.strided_slice(
-                    a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1]
-                ),
-                _op.strided_slice(
-                    b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]]
-                ),
-            ],
-            0,
-        )
-        return _op.reshape(output, fold_constant(final_shape))
-
-    if a_rank == 1 or b_rank == 1:
-        axis = []
-        if a_rank == 1:
-            lhs = _op.expand_dims(inputs[0], axis=0)
-            axis.append(0)
-        else:
-            lhs = inputs[0]
-        if b_rank == 1:
-            rhs = _op.expand_dims(inputs[1], axis=1)
-            axis.append(-1)
-        else:
-            rhs = inputs[1]
-        return _op.squeeze(_op.nn.matmul(lhs, rhs), axis=axis)
-
-    # Otherwise a simple dense op will get the job done.
-    input_1_t = _op.transpose(inputs[1], axes=(1, 0))
-    return _op.nn.dense(inputs[0], input_1_t, out_dtype=out_dtype)
-
-
-def qmatmul(
-    a,
-    b,
-    a_zp_scalar,
-    b_zp_scalar,
-    a_scale_scalar,
-    b_scale_scalar,
-    transform_num_hidden_units,
-    matmul_result_dtype,
-):
-    """
-    Helper function to handle QLinearMatMul
-    It is very close to 'matmul_out_dtype' but separated due to
-    differences in signatures of dense, matmul, batch_matmul of nn and qnn.
-    They requre scaling and zero point arguments
-    """
-    a_shape = shape_of(a)
-    a_rank = infer_shape(a_shape)[0]
-    b_shape = shape_of(b)
-    b_rank = infer_shape(b_shape)[0]
-    if a_rank > 2 or b_rank > 2:
-        # Determine the output batch dimension.
-        new_a_shape = a_shape
-        new_b_shape = b_shape
-        if a_rank > b_rank:
-            rank_diff = a_rank - b_rank
-            new_b_shape = _op.concatenate(
-                [
-                    _expr.const([1] * rank_diff, dtype=infer_type(b_shape).checked_type.dtype),
-                    b_shape,
-                ],
-                0,
-            )
-        elif a_rank < b_rank:
-            rank_diff = b_rank - a_rank
-            new_a_shape = _op.concatenate(
-                [
-                    _expr.const([1] * rank_diff, dtype=infer_type(a_shape).checked_type.dtype),
-                    a_shape,
-                ],
-                0,
-            )
-        else:
-            pass
-
-        out_batch = _op.concatenate(
-            [
-                _op.maximum(
-                    _op.strided_slice(new_b_shape, [i], [i + 1]),
-                    _op.strided_slice(new_a_shape, [i], [i + 1]),
-                )
-                for i in range(max(a_rank, b_rank) - 2)
-            ],
-            0,
-        )
-
-        b_type = infer_type(b)
-        # Convert to dense if the second matrix is 2d and non-dynamic
-        if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type):
-            a = flatten_to_nd(a, a_shape, 2)
-            b = _op.transpose(b)
-            output = _qnn.op.dense(
-                a,
-                b,
-                a_zp_scalar,
-                b_zp_scalar,
-                a_scale_scalar,
-                b_scale_scalar,
-                transform_num_hidden_units,
-                matmul_result_dtype,
-            )
-        else:
-            # broadcast a and b
-            a_broadcasted_shape = fold_constant(
-                _op.concatenate([out_batch, _op.strided_slice(a_shape, [a_rank - 2], [a_rank])], 0)
-            )
-            b_broadcasted_shape = fold_constant(
-                _op.concatenate([out_batch, _op.strided_slice(b_shape, [b_rank - 2], [b_rank])], 0)
-            )
-            if not tvm.ir.structural_equal(a_shape, a_broadcasted_shape):
-                a = _op.transform.broadcast_to(a, a_broadcasted_shape)
-            if not tvm.ir.structural_equal(b_shape, b_broadcasted_shape):
-                b = _op.transform.broadcast_to(b, b_broadcasted_shape)
-            # Convert a and b into 3 dimensional tensors.
-            a = flatten_to_nd(a, shape_of(a), 3)
-            b = flatten_to_nd(b, shape_of(b), 3)
-            # Transpose matrix dimensions of b.
-            bt = _op.transpose(b, [0, 2, 1])
-            # Perform a NT batch matmul.
-            output = _qnn.op.batch_matmul(
-                a, bt, a_zp_scalar, b_zp_scalar, a_scale_scalar, b_scale_scalar, matmul_result_dtype
-            )
-        # Reshape output to original dimensions.
-        final_shape = _op.concatenate(
-            [
-                out_batch,
-                _op.strided_slice(a_shape, [a_rank - 2], [a_rank - 1]),
-                _op.strided_slice(b_shape, [b_rank - 1], [b_rank]),
-            ],
-            0,
-        )
-        return _op.reshape(output, fold_constant(final_shape))
-
-    if a_rank == 1:
-        # TODO(vvchernov): There should be qnn.matmul but it is not implemented
-        # return _op.squeeze(_qnn.op.matmul(_op.expand_dims(a, axis=0),
-        #                                   b,
-        #                                   a_zp_scalar,
-        #                                   b_zp_scalar,
-        #                                   a_scale_scalar,
-        #                                   b_scale_scalar,
-        #                                   transform_num_hidden_units,
-        #                                   matmul_result_dtype,
-        #                                  ),
-        #                    axis=[0]
-        #                   )
-        return _op.squeeze(
-            _qnn.op.dense(
-                _op.expand_dims(a, axis=0),
-                _op.transpose(b),
-                a_zp_scalar,
-                b_zp_scalar,
-                a_scale_scalar,
-                b_scale_scalar,
-                transform_num_hidden_units,
-                matmul_result_dtype,
-            ),
-            axis=[0],
-        )
-
-    # Otherwise a simple dense op will get the job done.
-    return _qnn.op.dense(
-        a,
-        _op.transpose(b),
-        a_zp_scalar,
-        b_zp_scalar,
-        a_scale_scalar,
-        b_scale_scalar,
-        transform_num_hidden_units,
-        matmul_result_dtype,
-    )
-
-
-def layer_norm(x, eps, gamma, beta):
-    """A common function to handle layer norm.
-
-    Use LayerNormalization for the actual onnx op.
-    """
-    eps_dtype = infer_type(x).checked_type.dtype
-    u, s = _op.mean_variance(x, axis=-1, keepdims=True)
-    output = _op.divide(_op.subtract(x, u), _op.sqrt(_op.add(s, _op.const(eps, dtype=eps_dtype))))
-    output = _op.multiply(output, gamma)
-    if beta is not None:
-        output = _op.add(output, beta)
-
-    return output
-
-
-def get_source_name(node, type_dict):
-    """A helper function to get source information of onnx nodes."""
-    if node.name:
-        return node.name
-    else:
-        op_idx = 0
-        if node.op_type in type_dict:
-            op_idx = type_dict[node.op_type] + 1
-        type_dict[node.op_type] = op_idx
-        # rewrite name property in case any revisiting occurs to current node
-        node.name = f"{node.op_type}_{op_idx}"
-        return node.name
-
-
-def get_source_name_from_parameter(expr, name_sep="."):
-    """A helper function to get source information of graph node from parameter."""
-    if expr.span:
-        source_name = expr.span.source_name.name
-        # discard variable/parameter name to get span of op node
-        # e.g. conv2d.w -> conv2d
-        if isinstance(expr, _expr.Var):
-            postfix = f"{name_sep}{expr.name_hint}"
-            source_name = source_name[: -len(postfix)]
-        return source_name
-    return None
-
-
-def make_parameter_span(source_name_list, name_sep="."):
-    return name_sep.join(source_name_list)
-
-
-class OnnxOpConverter(object):
-    """A helper class for holding onnx op converters."""
-
-    @classmethod
-    def get_converter(cls, opset):
-        """Get converter matches given opset.
-
-        Parameters
-        ----------
-        opset: int
-            opset from model.
-
-        Returns
-        -------
-        converter, which should be `_impl_vx`. Number x is the biggest
-            number smaller than or equal to opset belongs to all support versions.
-        """
-        versions = [int(d.replace("_impl_v", "")) for d in dir(cls) if "_impl_v" in d]
-        versions = sorted(versions + [opset])
-        version = versions[max([i for i, v in enumerate(versions) if v == opset]) - 1]
-        if hasattr(cls, f"_impl_v{version}"):
-            return getattr(cls, f"_impl_v{version}")
-        raise NotImplementedError(f"opset version {version} of {cls.__name__} not implemented")
-
-
-class Unary(OnnxOpConverter):
-    """A helper class for unary op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 1, f"Unary math op {cls.name} takes 1 input, {len(inputs)} given"
-        op_name = cls.name
-        return get_relay_op(op_name)(*inputs)
-
-
-class Elemwise(OnnxOpConverter):
-    """A helper class for elemwise op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 2, f"Math op {cls.name} take 2 inputs, {len(inputs)} given"
-        op_name = cls.name
-        conv_ops = ["conv2d", "conv2d_transpose"]
-        if attr.get("broadcast", 0) and any(x in str(inputs[0]) for x in conv_ops):
-            # TODO(zhreshold): remove hard coded infershape
-            axis = int(attr.get("axis", 0))
-            inputs[1] = _op.expand_dims(inputs[1], axis=axis, num_newaxis=2)
-        return get_relay_op(op_name)(*inputs)
-
-
-class Pool(OnnxOpConverter):
-    """A helper class for pool op converters."""
-
-    name = ""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        data = inputs[0]
-        input_shape = infer_shape(data)
-        ndim = len(input_shape)
-
-        attr_cvt, data = cls._run_calculation(inputs, attr, params)
-        out = attr_cvt([data], attr, params)
-
-        if ndim - len(attr["kernel_shape"]) == 1:
-            out = _op.squeeze(out, axis=[0])
-        return out
-
-    @classmethod
-    def _run_calculation(cls, inputs, attr, params):
-        """Helper method to return the processed input data and AttrCvt object"""
-
-        data = inputs[0]
-        input_shape = infer_shape(data)
-        input_dtype = infer_type(data).checked_type.dtype
-        ndim = len(input_shape)
-        if "auto_pad" in attr:
-            attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                if cls.name == "avg_pool":
-                    pad_tuple = []
-                    for axis in range(len(input_shape) - 2):
-                        axis_shape = input_shape[2 + axis]
-                        stride = attr.get("strides", [1] * ndim)[axis]
-                        kernel = attr["kernel_shape"][axis]
-                        pad = get_pad_pair(axis_shape, kernel, stride, attr["auto_pad"])
-                        pad_tuple.append(pad)
-                    pad_tuple = tuple([val for pair in zip(*pad_tuple) for val in pair])
-                    attr["pads"] = pad_tuple
-                else:
-                    # Warning: Pool does not yet support dynamic shapes,
-                    # one will need to run dynamic_to_static on this model after import
-                    if "int" in input_dtype:
-                        pad_val = np.iinfo(np.dtype(input_dtype)).min
-                    else:
-                        pad_val = np.finfo(np.dtype(input_dtype)).min
-                    data = autopad(
-                        data,
-                        attr.get("strides", [1] * (ndim - 2)),
-                        attr["kernel_shape"],
-                        [1] * ndim,
-                        pad_value=pad_val,
-                        mode=attr["auto_pad"],
-                    )
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = tuple([0 for i in range(ndim - 2)])
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator {cls.name} '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            attr.pop("auto_pad")
-
-        if "storage_order" in attr:
-            attr["layout"] = onnx_storage_order2layout(
-                attr["storage_order"], dims=(len(input_shape) - 2), op_name=cls.name
-            )
-        else:
-            if ndim - len(attr["kernel_shape"]) == 1:
-                data = _op.expand_dims(data, axis=0)
-                input_shape = [1] + list(input_shape)
-
-            attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2), op_name=cls.name)
-
-        return (
-            AttrCvt(
-                op_name=dimension_picker(cls.name),
-                transforms={
-                    "kernel_shape": "pool_size",
-                    "pads": ("padding", 0),
-                    "dilations": ("dilation", 1),
-                },
-                ignores=["storage_order"],
-                custom_check=dimension_constraint(),
-            ),
-            data,
-        )
-
-
-class Absolute(Unary):
-    """Operator converter for Absolute."""
-
-    name = "abs"
-
-
-class Add(Elemwise):
-    """Operator converter for Add."""
-
-    name = "add"
-
-
-class AveragePool(Pool):
-    """Operator converter for AveragePool."""
-
-    name = "avg_pool"
-
-
-class QLinearAveragePool(Pool):
-    """Operator converter for QLinearAveragePool from Microsoft onnxruntime contrib opset."""
-
-    name = "avg_pool"
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        x_scale = get_scalar(inputs[1], params)
-        x_zero_point = get_scalar(inputs[2], params, dtype="int32")
-        y_scale = fold_constant(get_scalar(inputs[3], params))
-        y_zero_point = get_scalar(inputs[4], params, dtype="int32")
-
-        attr_cvt, data = cls._run_calculation(inputs, attr, params)
-
-        input_dtype = infer_type(data).checked_type.dtype
-        # Onnxruntime doesn't actually do this op in integer, they dequantize to fp32
-        # and then requantize afer (according to documentation below)
-        # https://github.com/microsoft/onnxruntime/blob/master/docs/ContribOperators.md#com.microsoft.QLinearAveragePool
-        float_node = _qnn.op.dequantize(data, x_scale, x_zero_point)
-        out = attr_cvt([float_node], attr, params)
-        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=input_dtype)
-
-
-class BatchNorm(OnnxOpConverter):
-    """Operator converter for BatchNorm."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # TODO(zhreshold): 'spatial' is not properly handled here.
-        # TODO(vvchernov): 'training_mode' (onnx tag) is not correctly handled, ignore for now
-        out = AttrCvt(
-            op_name="batch_norm",
-            ignores=["spatial", "is_test", "consumed_inputs", "momentum", "training_mode"],
-        )(inputs, attr, params)
-        # We only support test mode, so we return data, moving_mean, moving_var,
-        # and then moving_mean and moving_var again as placeholders for
-        # the expected "saved_mean", "saved_var".
-        return _expr.TupleWrapper(_expr.Tuple((*out, out[1], out[2])), 5)
-
-
-class InstanceNorm(OnnxOpConverter):
-    """Operator converter for BatchNorm."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return AttrCvt(op_name="instance_norm")(inputs, attr, params)
-
-
-class Conv(OnnxOpConverter):
-    """Operator converter for Conv."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # Use shape of input to determine convolution type.
-        data = inputs[0]
-        kernel = inputs[1]
-        input_shape = infer_shape(data)
-        ndim = len(input_shape)
-
-        kernel_type = infer_type(inputs[1])
-        kernel_shapes = [get_const_tuple(kernel_type.checked_type.shape)]
-
-        if "kernel_shape" not in attr:
-            attr["kernel_shape"] = kernel_shapes[0][2:]
-
-        if "auto_pad" in attr:
-            attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                # Warning: Convolution does not yet support dynamic shapes,
-                # one will need to run dynamic_to_static on this model after import
-                data = autopad(
-                    data,
-                    attr.get("strides", [1] * (ndim - 2)),
-                    attr["kernel_shape"],
-                    attr.get("dilations", [1] * (ndim - 2)),
-                    mode=attr["auto_pad"],
-                )
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = [0 for i in range(ndim - 2)]
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator Conv '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            attr.pop("auto_pad")
-
-        attr["channels"] = kernel_shapes[0][0]
-        out = AttrCvt(
-            op_name=dimension_picker("conv"),
-            transforms={
-                "kernel_shape": "kernel_size",
-                "dilations": ("dilation", 1),
-                "pads": ("padding", 0),
-                "group": ("groups", 1),
-            },
-            custom_check=dimension_constraint(),
-        )([data, kernel], attr, params)
-
-        use_bias = len(inputs) == 3
-        if use_bias:
-            out = _op.nn.bias_add(out, inputs[2])
-        return out
-
-
-def is_ort_version_greater_than(ver):
-    import onnxruntime as ort
-
-    v11, v12, v13 = tuple(int(v) for v in ort.__version__.split("."))
-    v21, v22, v23 = tuple(int(v) for v in ver.split("."))
-
-    return (v11 > v21) or (v11 == v21 and v12 > v22) or ((v11, v12) == (v21, v22) and v13 > v23)
-
-
-class ConvTranspose(OnnxOpConverter):
-    """Operator converter for ConvTranspose."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # get number of channels
-        out_type = infer_type(inputs[1])
-        kernel_shape = [get_const_tuple(out_type.checked_type.shape)]
-        out_channels = kernel_shape[0][1] * attr.get("group", 1)
-        attr["channels"] = out_channels
-        groups = attr.get("group", 1)
-
-        if "kernel_shape" not in attr:
-            attr["kernel_shape"] = kernel_shape[0][2:]
-
-        attr["groups"] = groups
-        # infer pads for auto_pad
-        data = inputs[0]
-        input_shape = infer_shape(data)
-        ndim = len(input_shape)
-        if "auto_pad" in attr or "output_shape" in attr:
-            if "auto_pad" in attr:
-                attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if "output_shape" in attr or attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                # Warning: Convolution does not yet support dynamic shapes,
-                # one will need to run dynamic_to_static on this model after import
-                kernel_shape = attr["kernel_shape"]
-                kndim = len(kernel_shape)
-                dilations = attr.get("dilations", [1] * kndim)
-                output_padding = attr.get("output_padding", [0] * kndim)
-                strides = attr["strides"]
-                total_pad = [0] * kndim
-                # https://github.com/onnx/onnx/blob/main/docs/Operators.md#ConvTranspose
-                if "output_shape" in attr:
-                    for i in range(kndim):
-                        total_pad[i] = (
-                            strides[i] * (input_shape[ndim - kndim + i] - 1)
-                            + output_padding[i]
-                            + ((kernel_shape[i] - 1) * dilations[i] + 1)
-                            - attr["output_shape"][i]
-                        )
-                    left = [p // 2 for p in total_pad]
-                    right = [total_pad[i] - left[i] for i in range(kndim)]
-                    if "output_shape" in attr and "auto_pad" not in attr:
-                        pad = right + left
-                    elif "LOWER" in attr["auto_pad"]:
-                        pad = left + right
-                    else:
-                        pad = right + left
-                    attr["pads"] = pad
-                else:
-                    data = autopad(
-                        data,
-                        attr.get("strides", [1] * (ndim - 2)),
-                        attr["kernel_shape"],
-                        attr.get("dilations", [1] * (ndim - 2)),
-                        deconv=True,
-                        mode=attr["auto_pad"],
-                    )
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = tuple([0 for i in range(ndim - 2)])
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator Conv '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            if "auto_pad" in attr:
-                attr.pop("auto_pad")
-
-        out = AttrCvt(
-            op_name=dimension_picker("conv", "_transpose"),
-            transforms={
-                "kernel_shape": "kernel_size",
-                "dilations": ("dilation", 1),
-                "pads": ("padding", 0),
-                "group": ("groups", 1),
-            },
-            disables=["output_shape"],
-            custom_check=dimension_constraint(),
-        )([data, inputs[1]], attr, params)
-        use_bias = len(inputs) == 3
-        if use_bias:
-            out = _op.nn.bias_add(out, inputs[2])
-        return out
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # get number of channels
-        out_type = infer_type(inputs[1])
-        kernel_shape = [get_const_tuple(out_type.checked_type.shape)]
-        out_channels = kernel_shape[0][1] * attr.get("group", 1)
-        attr["channels"] = out_channels
-        groups = attr.get("group", 1)
-
-        if "kernel_shape" not in attr:
-            attr["kernel_shape"] = kernel_shape[0][2:]
-
-        attr["groups"] = groups
-        # infer pads for auto_pad
-        data = inputs[0]
-        input_shape = infer_shape(data)
-        ndim = len(input_shape)
-        num_spatial_dims = ndim - 2
-        if "auto_pad" in attr or "output_shape" in attr:
-            if "auto_pad" in attr:
-                attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if "output_shape" in attr or attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                # Warning: Convolution does not yet support dynamic shapes,
-                # one will need to run dynamic_to_static on this model after import
-                kernel_shape = attr["kernel_shape"]
-                kndim = len(kernel_shape)
-                dilations = attr.get("dilations", [1] * kndim)
-                output_padding = attr.get("output_padding", [0] * kndim)
-                # this is meant to handle the field 'strides' being optional for opsets 11+
-                strides = attr.get("strides", [1] * num_spatial_dims)
-                total_pad = [0] * kndim
-                # https://github.com/onnx/onnx/blob/main/docs/Operators.md#ConvTranspose
-                if "output_shape" in attr:
-                    for i in range(kndim):
-                        total_pad[i] = (
-                            strides[i] * (input_shape[ndim - kndim + i] - 1)
-                            + output_padding[i]
-                            + ((kernel_shape[i] - 1) * dilations[i] + 1)
-                            - attr["output_shape"][i]
-                        )
-                else:
-                    for i in range(kndim):
-                        total_pad[i] = (
-                            output_padding[i]
-                            + ((kernel_shape[i] - 1) * dilations[i] + 1)
-                            - strides[i]
-                        )
-                left = [p // 2 for p in total_pad]
-                right = [total_pad[i] - left[i] for i in range(kndim)]
-
-                if "output_shape" in attr and "auto_pad" not in attr:
-                    pad = right + left
-                elif ("LOWER" in attr["auto_pad"] and is_ort_version_greater_than("1.12.1")) or (
-                    ("UPPER" in attr["auto_pad"] and not is_ort_version_greater_than("1.12.1"))
-                ):
-                    pad = right + left
-                else:
-                    pad = left + right
-                attr["pads"] = pad
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = tuple([0 for i in range(ndim - 2)])
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator Conv '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            if "auto_pad" in attr:
-                attr.pop("auto_pad")
-
-        out = AttrCvt(
-            op_name=dimension_picker("conv", "_transpose"),
-            transforms={
-                "kernel_shape": "kernel_size",
-                "dilations": ("dilation", 1),
-                "pads": ("padding", 0),
-                "group": ("groups", 1),
-            },
-            disables=["output_shape"],
-            custom_check=dimension_constraint(),
-        )([data, inputs[1]], attr, params)
-        use_bias = len(inputs) == 3
-        if use_bias:
-            out = _op.nn.bias_add(out, inputs[2])
-        return out
-
-
-class GlobalAveragePool(OnnxOpConverter):
-    """Operator converter for GlobalAveragePool"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        rank = len(infer_shape(inputs[0]))
-        if rank == 3:
-            return _op.nn.global_avg_pool1d(inputs[0])
-        if rank == 4:
-            return _op.nn.global_avg_pool2d(inputs[0])
-        if rank == 5:
-            return _op.nn.global_avg_pool3d(inputs[0])
-        raise NotImplementedError(
-            "Global average pooling is only implemented for 1D, 2D, and 3D kernels, got %dD."
-            % (rank - 2)
-        )
-
-
-class QLinearGlobalAveragePool(OnnxOpConverter):
-    "Operator converter for QLinearGlobalAveragePool from Microsoft onnxruntime contrib opset."
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        rank = len(infer_shape(inputs[0]))
-
-        x_scale = get_scalar(inputs[1], params)
-        x_zero_point = get_scalar(inputs[2], params, dtype="int32")
-        y_scale = fold_constant(get_scalar(inputs[3], params))
-        y_zero_point = get_scalar(inputs[4], params, dtype="int32")
-
-        input_dtype = infer_type(inputs[0]).checked_type.dtype
-
-        # Onnxruntime documentation does not mention that this global avg_pool should follow the
-        # sequence dequantize -> float op -> quantize, but that is how QLinearAveragePool is done.
-        #
-        # This op also follows the same pattern since qnn op is not available right now.
-        # TODO: Generate QNN op to perform quantized operation instead of dequant -> op -> quant
-        x = _qnn.op.dequantize(inputs[0], x_scale, x_zero_point)
-        if rank == 3:
-            out = _op.nn.global_avg_pool1d(x)
-        elif rank == 4:
-            out = _op.nn.global_avg_pool2d(x)
-        elif rank == 5:
-            out = _op.nn.global_avg_pool3d(x)
-        else:
-            raise NotImplementedError(
-                "Global average pooling is only implemented for 1D, 2D, and 3D kernels, got %dD."
-                % (rank - 2)
-            )
-        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=input_dtype)
-
-
-class GlobalMaxPool(OnnxOpConverter):
-    """Operator converter for GlobalMaxPool"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        rank = len(infer_shape(inputs[0]))
-        if rank == 3:
-            return _op.nn.global_max_pool1d(inputs[0])
-        if rank == 4:
-            return _op.nn.global_max_pool2d(inputs[0])
-        if rank == 5:
-            return _op.nn.global_max_pool3d(inputs[0])
-        raise NotImplementedError(
-            "Global max pooling is only implemented for 1D, 2D, and 3D kernels, got %dD."
-            % (rank - 2)
-        )
-
-
-class Div(Elemwise):
-    """Operator converter for Divide."""
-
-    name = "divide"
-
-
-class Elu(OnnxOpConverter):
-    """Operator converter for Elu."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = float(attr.get("alpha", 1.0))
-        return _expr.const(-alpha) * _op.nn.relu(
-            _expr.const(1.0) - _op.exp(inputs[0])
-        ) + _op.nn.relu(inputs[0])
-
-
-class Gelu(OnnxOpConverter):
-    """Operator converter for Gelu from Microsoft onnxruntime contrib opset.
-
-    gelu(x) = 0.5x(1 + erf(x/sqrt(2)))
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        x = inputs[0]
-
-        # Declare consts
-        const_dtype = infer_type(x).checked_type.dtype
-        half = _expr.const(0.5, dtype=const_dtype)
-        one = _expr.const(1.0, dtype=const_dtype)
-        sqrt2 = _expr.const(math.sqrt(2), dtype=const_dtype)
-
-        # Compute gelu
-        term1 = _op.multiply(half, x)
-        erf = _op.erf(_op.divide(x, sqrt2))
-        term2 = _op.add(one, erf)
-        return _op.multiply(term1, term2)
-
-
-class FastGelu(OnnxOpConverter):
-    """Operator converter for FastGelu from Microsoft onnxruntime contrib opset.
-
-    fast_gelu(x) = 0.5x(1 + tanh(sqrt(2/pi)(x + 0.044715x^3)))
-                 = 0.5x(1 + tanh((sqrt(2/pi)x + 0.044715(sqrt(2/pi)x^3)))
-                 = 0.5x(1 + tanh(c1 * x + c2 * x^3)))
-    , where
-        c1 = sqrt(2/pi)
-        c2 = 0.044715 * sqrt(2/pi)
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        x = inputs[0]
-        if inputs[1]:
-            bias = inputs[1]
-            bias_shape = infer_shape(bias)
-            assert len(bias_shape) == 1, "bias term must be a 1D tensor"
-            x += bias
-
-        # Declare consts
-        const_dtype = infer_type(x).checked_type.dtype
-        half = _expr.const(0.5, dtype=const_dtype)
-        one = _expr.const(1.0, dtype=const_dtype)
-        const1 = _expr.const(math.sqrt(2 / math.pi), dtype=const_dtype)
-        const2 = _expr.const(0.044715 * math.sqrt(2 / math.pi), dtype=const_dtype)
-
-        # Compute FastGelu
-        term1 = _op.multiply(half, x)
-        term2 = _op.multiply(const1, x)
-        term3 = _op.multiply(const2, _op.power(x, _expr.const(3, const_dtype)))
-        tanh = _op.tanh(_op.add(term2, term3))
-        return _op.multiply(term1, _op.add(one, tanh))
-
-
-class BiasGelu(OnnxOpConverter):
-    """Operator converter for BiasGelu from Microsoft onnxruntime contrib opset.
-
-    bias_gelu(x, b) = 0.5(x + b)(1 + erf((x + b)/sqrt(2)))
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        x = inputs[0]
-        b = inputs[1]
-
-        b_shape = infer_shape(b)
-        assert len(b_shape) == 1, "BiasGelu bias term must be a 1D tensor"
-
-        inp = _op.add(x, b)
-        return Gelu._impl_v1([inp], attr, params)
-
-
-class Mish(OnnxOpConverter):
-    """Operator converter for Mish from Microsoft onnxruntime contrib opset.
-
-    mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x}))
-    """
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        x = inputs[0]
-        # Declare const
-        const_dtype = infer_type(x).checked_type.dtype
-        one = _expr.const(1.0, dtype=const_dtype)
-
-        # Compute Mish
-        term1 = _op.log(one + _op.exp(x))
-        return _op.multiply(x, _op.tanh(term1))
-
-
-class LayerNormalization(OnnxOpConverter):
-    """Operator converter for LayerNormalization from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v17(cls, inputs, attr, params):
-        x = inputs[0]
-        gamma = inputs[1]
-        beta = inputs[2]
-        axis = attr.get("axis", -1)
-        eps = attr.get("epsilon", 1e-5)
-        # according to the onnx doc, given the int axis (default -1)
-        # to compute the mean and inv_stdev which are of dim [d[0], ..., d[axis-1], 1, ..., 1]
-        # the actual computation is over (axis, ..., rank(x) - 1) axes
-        # see https://github.com/onnx/onnx/blob/main/docs/Changelog.md#layernormalization-17
-        rank = len(infer_shape(x))
-        axis = tuple(range(axis, rank)) if axis >= 0 else tuple(range(rank + axis, rank))
-        dtype = infer_type(x).checked_type.dtype
-        mean = _op.mean(x, axis, keepdims=True)
-        var = _op.variance(x, axis, keepdims=True, with_mean=mean)
-        inv_stdev = _op.divide(
-            _op.const(1, dtype=dtype), _op.sqrt(_op.add(var, _op.const(eps, dtype=dtype)))
-        )
-        x_norm = _op.multiply(_op.subtract(x, mean), inv_stdev)
-        ln = _op.multiply(x_norm, gamma)
-        if beta is not None:
-            ln = _op.add(ln, beta)
-
-        return _expr.TupleWrapper(_expr.Tuple([ln, mean, inv_stdev]), 3)
-
-
-class EmbedLayerNormalization(OnnxOpConverter):
-    """Operator converter for EmbedLayerNormalization from Microsoft onnxruntime contrib opset.
-
-    This layer embeds the input tokens, sums them, and applies layer normalization.
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        input_ids = inputs[0]
-        segment_ids = inputs[1]
-        word_emb = inputs[2]
-        pos_emb = inputs[3]
-        segment_emb = inputs[4]
-        gamma = inputs[5]
-        beta = inputs[6]
-
-        mask = inputs[7]
-        pos_ids = inputs[8]
-
-        eps = attr.get("epsilon", 1e-12)
-
-        (batch_size, seq_len) = infer_shape(input_ids)
-
-        if segment_ids:
-            assert segment_emb
-
-        if pos_ids is None:
-            pos_ids = _op.const([list(range(seq_len))] * batch_size, dtype="int32")
-
-        word_vec = _op.take(word_emb, input_ids, axis=0)
-        segment_vec = _op.take(segment_emb, segment_ids, axis=0)
-        pos_vec = _op.take(pos_emb, pos_ids, axis=0)
-
-        vec_sum = _op.add(word_vec, pos_vec)
-        if segment_ids:
-            vec_sum = _op.add(vec_sum, segment_vec)
-
-        ln = layer_norm(vec_sum, eps, gamma, beta)
-
-        mask_index = _op.const(np.zeros((batch_size,), dtype="int32"))
-        if mask:
-            # calculate number of words per sentence
-            mask_index = _op.sum(mask, axis=1)
-
-        # TODO(@anwang2009): onnxruntime v1.10.0 requires a third output of vec_sum
-        return _expr.TupleWrapper(_expr.Tuple([ln, mask_index]), 2)
-
-
-class SkipLayerNormalization(OnnxOpConverter):
-    """Operator converter for SkipLayerNormalization from Microsoft onnxruntime contrib opset.
-
-    This layer sums the two input tensors (along with optional bias), and applies layer
-    normalization.
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        data = inputs[0]
-        skip = inputs[1]
-        gamma = inputs[2]
-        beta = inputs[3]
-        bias = inputs[4]
-
-        assert (
-            beta is not None and bias is not None
-        ), "SkipLayerNormalization import currently only supports required beta and bias"
-
-        eps = attr.get("epsilon", 1e-12)
-
-        x = _op.add(data, skip)
-        if bias is not None:
-            x = _op.add(x, bias)
-
-        output = layer_norm(x, eps, gamma, beta)
-
-        # onnxruntime doesn't compute the other outputs, despite the documentation
-        placeholder = _op.const(0, dtype="float32")
-
-        return _expr.TupleWrapper(_expr.Tuple([output, placeholder, placeholder]), 3)
-
-
-class OrtAttentionBase:
-    """
-    Base class for Attention and QAttention from Microsoft onnxruntime contrib opset.
-    """
-
-    @classmethod
-    def _check_input_embeddings(cls, input_emb, valid_types, **kwargs):
-        assert infer_type(input_emb).checked_type.dtype in valid_types
-        assert (
-            len(infer_shape(input_emb)) == 3
-        ), "Input should be 3D tensor with shape (batch_size, sequence_length, input_hidden_size)"
-        (batch_size, seq_len, input_hidden) = infer_shape(input_emb)
-        assert input_hidden > 0, (
-            "The weight tensor has (input_hidden_size, 3 * output_hidden_size) shape, so it doesn't"
-            f" make sense to have ({input_hidden}, 3 * output_hidden_size) weight tensor."
-        )
-        assert seq_len > 0, (
-            "The output tensor has (batch_size, sequence_length, hidden_size) shape,"
-            f" so it doesn't make sense to have (batch_size, {seq_len}, hidden_size) output."
-        )
-
-        return batch_size, seq_len, input_hidden
-
-    @classmethod
-    def _check_weights(cls, weight, valid_types, **kwargs):
-        assert infer_type(weight).checked_type.dtype in valid_types
-        assert len(infer_shape(weight)) == 2, (
-            "Weight should be 2D input tensor with shape (input_hidden_size, 3 * hidden_size), "
-            "hidden_size = num_heads * head_size"
-        )
-        (input_hidden_weight, out_hidden_x3) = infer_shape(weight)
-        assert kwargs["input_hidden"] == input_hidden_weight
-        assert out_hidden_x3 % 3 == 0, "output hidden shape should be divisible by 3: W_Q, W_K, W_V"
-        out_hidden = out_hidden_x3 // 3
-        assert (
-            out_hidden % kwargs["num_heads"] == 0
-        ), "output hidden size should be divisible by number of attention heads"
-        head_size = out_hidden // kwargs["num_heads"]
-
-        return out_hidden_x3, out_hidden, head_size
-
-    @classmethod
-    def _check_bias(cls, bias, valid_types, **kwargs):
-        assert infer_type(bias).checked_type.dtype in valid_types
-        assert (
-            len(infer_shape(bias)) == 1
-        ), "Bias should be 1D input tensor with shape (3 * hidden_size)"
-        (out_hidden_x3_bias,) = infer_shape(bias)
-        assert kwargs["out_hidden_x3"] == out_hidden_x3_bias
-
-    @classmethod
-    def _check_mask_index(cls, mask_index, valid_types, **kwargs):
-        assert infer_type(mask_index).checked_type.dtype in valid_types
-        mask_index_shape = infer_shape(mask_index)
-        assert (
-            len(mask_index_shape) == 2
-            and mask_index_shape[0] == kwargs["batch_size"]
-            and mask_index_shape[1] >= kwargs["seq_len"]
-        ), "currently only support (batch_size, past_sequence_len + sequence_length) mask index"
-
-        return mask_index_shape[1]
-
-    @classmethod
-    def _check_past(cls, past, valid_types, **kwargs):
-        assert infer_type(past).checked_type.dtype in valid_types
-        past_shape = infer_shape(past)
-        assert len(past_shape) == 5, "past should be 5D tensor"
-        assert (
-            past_shape[0] == 2
-            and past_shape[1] == kwargs["batch_size"]
-            and past_shape[2] == kwargs["num_heads"]
-            and past_shape[3] + kwargs["seq_len"] == kwargs["total_seq_len"]
-            and past_shape[4] == kwargs["head_size"]
-        )
-        past_seq_len = past_shape[3]
-        return past_seq_len
-
-    @classmethod
-    def _split_into_heads(cls, tensor, batch_size, seq_len, num_heads, head_size):
-        """
-        In the implementation of Multi-head attention we just split queries, keys, and values
-        we compute for a single-head attention into several parts:
-        (batch_size, num_heads, seq_len, head_size)
-        """
-        tensor = _op.reshape(tensor, (batch_size, seq_len, num_heads, head_size))
-
-        # (batch_size, num_heads, seq_len, head_size)
-        tensor = _op.transpose(tensor, axes=[0, 2, 1, 3])
-
-        return tensor
-
-    @classmethod
-    def _merge_first_dimensions(cls, tensor):
-        """
-        nn.batch_matmul is expecting 3D tensor:
-        (batch_size * num_heads, past_seq_len + seq_len, head_size)
-        """
-        return _op.reverse_reshape(tensor, (-1, 0, 0))
-
-    @classmethod
-    def _create_unidirectional_mask(cls, left_value, right_value, past_seq_len, seq_len, dtype):
-        """
-        [lhs rhs rhs ... rhs rhs]
-        [lhs lhs rhs ... rhs rhs]
-        [lhs lhs lhs ... rhs rhs]
-        .........................
-        [lhs lhs lhs ... lhs rhs]
-        [lhs lhs lhs ... lhs lhs]
-        """
-        numpy_unidirectional_mask = np.array(
-            [
-                np.concatenate(
-                    [
-                        np.full(past_seq_len + s_i + 1, left_value),
-                        np.full(seq_len - s_i - 1, right_value),
-                    ]
-                )
-                for s_i in range(seq_len)
-            ]
-        )
-        unidirectional_mask = _op.const(numpy_unidirectional_mask, dtype=dtype)
-        unidirectional_mask = _op.expand_dims(unidirectional_mask, 0, num_newaxis=2)
-
-        return unidirectional_mask
-
-    @classmethod
-    def _compute_attention(cls, Q, K, V, mask_index, **kwargs):
-        # Compute Attention scores
-        att_scores = _op.nn.batch_matmul(Q, K, transpose_a=False, transpose_b=True)
-        score_dtype = infer_type(att_scores).checked_type.dtype
-        att_scores = _op.divide(
-            att_scores,
-            _op.const(
-                np.sqrt(kwargs["head_size"]), dtype=infer_type(att_scores).checked_type.dtype
-            ),
-        )
-        att_scores = _op.reshape(
-            att_scores,
-            (
-                kwargs["batch_size"],
-                kwargs["num_heads"],
-                kwargs["seq_len"],
-                kwargs["past_seq_len"] + kwargs["seq_len"],
-            ),
-        )
-
-        # Build the attention mask
-        att_mask = _op.cast(mask_index, score_dtype)
-        # Attention mask has value 0 or 1. Here we convert 0 to -10000, and 1 to 0.
-        att_mask = _op.subtract(_op.const(1, dtype=score_dtype), att_mask)
-        att_mask = _op.multiply(att_mask, _op.const(-10000, dtype=score_dtype))
-        # Expand for att_scores broadcast
-        # (batch_size, past_seq_len + seq_len) -> (batch_size, 1, seq_len, past_seq_len + seq_len)
-        att_mask = _op.expand_dims(att_mask, 1, num_newaxis=2)
-        att_mask = _op.concatenate([att_mask] * kwargs["seq_len"], axis=2)
-
-        if kwargs["unidirectional"]:
-            att_mask = _op.add(
-                att_mask,
-                cls._create_unidirectional_mask(
-                    0, -10000, kwargs["past_seq_len"], kwargs["seq_len"], score_dtype
-                ),
-            )
-
-        # Apply the mask
-        att_scores = _op.add(att_scores, att_mask)
-        # TODO(agladyshev):
-        #   Comment from ORT source code (onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h):
-        #   "Fix unidirectional mask to be parity with huggingface implementation"
-        if kwargs["unidirectional"]:
-            att_scores = _op.multiply(
-                att_scores,
-                cls._create_unidirectional_mask(
-                    1, 0, kwargs["past_seq_len"], kwargs["seq_len"], score_dtype
-                ),
-            )
-            att_scores = _op.add(
-                att_scores,
-                _op.multiply(
-                    att_mask,
-                    cls._create_unidirectional_mask(
-                        0, 1, kwargs["past_seq_len"], kwargs["seq_len"], score_dtype
-                    ),
-                ),
-            )
-
-        # Compute Softmax
-        att_scores = _op.reshape(
-            att_scores,
-            (
-                kwargs["batch_size"] * kwargs["num_heads"],
-                kwargs["seq_len"],
-                kwargs["past_seq_len"] + kwargs["seq_len"],
-            ),
-        )
-        att_probs = _op.nn.softmax(att_scores, axis=-1)
-
-        # Compute output
-        output = _op.nn.batch_matmul(att_probs, V, transpose_a=False, transpose_b=False)
-        output = _op.reverse_reshape(output, (-1, kwargs["num_heads"], 0, 0))
-        output = _op.transpose(output, axes=[0, 2, 1, 3])
-        output = _op.reshape(output, (0, 0, kwargs["out_hidden"]))
-
-        return output
-
-
-class Attention(OrtAttentionBase, OnnxOpConverter):
-    """Operator converter for Attention from Microsoft onnxruntime contrib opset.
-
-    This is the self-attention mechanism used in transformer models.
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # ************************* Read attrs *************************
-        num_heads = attr["num_heads"]
-        unidirectional = attr["unidirectional"]
-
-        assert (
-            "past_present_share_buffer" not in attr
-        ), "share past and present buffers are not currently supported"
-        assert (
-            "qkv_hidden_sizes" not in attr
-        ), "different hidden sizes for Q, K, V are not currently supported"
-
-        # ************************* Read inputs *************************
-        # (batch, seq, in_hidden)
-        input_emb = inputs[0]
-
-        # TODO(agladyshev):
-        #   ORT documentation says:
-        #       The weights for input projection of Q, K and V are merged.
-        #       The data is stacked on the second dimension.
-        #       Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size).
-        #       Here hidden_size is the hidden dimension of Q and K, and v_hidden_size is that of V.
-        #   However, in our case, we consider that hidden_size == v_hidden_size.
-        #   Therefore, weight has the following shape:
-        #       (in_hidden, 3 * out_hidden), where out_hidden = num_heads * head_size
-        weight = inputs[1]
-
-        # (3 * out_hidden,)
-        bias = inputs[2]
-
-        # 1. (    batch,              1,        max_seq, max_seq)
-        # 2. (    batch, past_seq + seq,)
-        # 3. (    batch,            seq, past_seq + seq,)
-        # 4. (    batch,)
-        # 5. (2 * batch,)
-        # TODO: For now, we only support case 2.
-        mask_index = inputs[3]
-
-        # (2, batch, num_heads, past_seq, head_size)
-        past = inputs[4]
-
-        # (batch, num_heads, seq, seq)
-        extra_add = inputs[5]
-        assert extra_add is None, "extra add to QxK not currently supported"
-
-        # When past_present_share_buffer is used,
-        # it is required to specify past_sequence_length (could be 0)
-        past_seq_len = inputs[6]
-        assert past_seq_len is None, "past sequence length not currently supported"
-
-        # ************************* Parse inputs *************************
-        t = ["float32", "float16"]
-        m = ["int32"]
-
-        # input
-        batch_size, seq_len, input_hidden = cls._check_input_embeddings(input_emb, t)
-
-        # weight
-        out_hidden_x3, out_hidden, head_size = cls._check_weights(
-            weight, t, num_heads=num_heads, input_hidden=input_hidden
-        )
-
-        # bias
-        cls._check_bias(bias, t, out_hidden_x3=out_hidden_x3)
-
-        # mask_index
-        assert (
-            mask_index is not None
-        ), "Attention import currently only supports required mask_index"
-        total_seq_len = cls._check_mask_index(mask_index, m, batch_size=batch_size, seq_len=seq_len)
-
-        # past
-        if past_seq_len is None:
-            past_seq_len = 0
-        if past is not None:
-            past_seq_len = cls._check_past(
-                past,
-                t,
-                batch_size=batch_size,
-                num_heads=num_heads,
-                seq_len=seq_len,
-                total_seq_len=total_seq_len,
-                head_size=head_size,
-            )
-
-        # split weight and biases and do the matmuls
-        w_Q, w_K, w_V = _op.split(weight, 3, axis=1)
-        b_Q, b_K, b_V = _op.split(bias, 3, axis=0)
-        # need to merge batch dimensions since TVM matmul is 2D
-        input_emb = _op.reverse_reshape(input_emb, (-1, 0))
-        Q = _op.add(_op.nn.matmul(input_emb, w_Q), b_Q)
-        K = _op.add(_op.nn.matmul(input_emb, w_K), b_K)
-        V = _op.add(_op.nn.matmul(input_emb, w_V), b_V)
-
-        Q = cls._split_into_heads(Q, batch_size, seq_len, num_heads, head_size)
-        K = cls._split_into_heads(K, batch_size, seq_len, num_heads, head_size)
-        V = cls._split_into_heads(V, batch_size, seq_len, num_heads, head_size)
-
-        # Concatenate (past_K, past_V) with (K, V) by sequence axis:
-        # (batch_size, num_heads, past_sequence_length + sequence_length, head_size)
-        if past is not None and past_seq_len > 0:
-            K_past, V_past = _op.split(past, 2, axis=0)
-            K = _op.concatenate([_op.squeeze(K_past, axis=[0]), K], axis=2)
-            V = _op.concatenate([_op.squeeze(V_past, axis=[0]), V], axis=2)
-
-        # Prepare present state for Key and Value with shape
-        # (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size)
-        present = _op.stack([K, V], axis=0)
-
-        Q = cls._merge_first_dimensions(Q)
-        K = cls._merge_first_dimensions(K)
-        V = cls._merge_first_dimensions(V)
-
-        # Compute Attention output
-        output = cls._compute_attention(
-            Q,
-            K,
-            V,
-            mask_index,
-            unidirectional=unidirectional,
-            batch_size=batch_size,
-            out_hidden=out_hidden,
-            num_heads=num_heads,
-            head_size=head_size,
-            seq_len=seq_len,
-            past_seq_len=past_seq_len,
-        )
-
-        return _expr.TupleWrapper(_expr.Tuple([output, present]), 2)
-
-
-class QAttention(OrtAttentionBase, OnnxOpConverter):
-    """Operator converter for QAttention from Microsoft onnxruntime contrib opset.
-
-    This is the self-attention mechanism used in transformer models.
-    """
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # ************************* Read attrs *************************
-        num_heads = attr["num_heads"]
-        unidirectional = attr["unidirectional"]
-
-        # ************************* Read inputs *************************
-        # (batch, seq, in_hidden)
-        input_emb = inputs[0]
-
-        # (in_hidden, 3 * out_hidden), where out_hidden = num_heads * head_size
-        weight = inputs[1]
-
-        # (3 * out_hidden,)
-        bias = inputs[2]
-
-        # Scale of quantized input tensor.
-        # Scalar, which means a per-tensor/layer quantization
-        input_scale = inputs[3]
-
-        # Scale of quantized weight tensor.
-        # Scalar or a 1D tensor, which means a per-tensor/per-column quantization.
-        # Its size should be 3 * out_hidden if it is per-column quantization
-        weight_scale = inputs[4]
-
-        # TODO(agladyshev):
-        #  ORT documentation says that shape is (batch,),
-        #  but in ORT source code we have following comment:
-        #       1. (batch_size)
-        #       2. (2 * batch_size)
-        #       3. (batch_size, 1)
-        #       4. (1, 1)
-        #       5. (batch_size, past_sequence_length + sequence_length)
-        #  In practice, for GPT-2 there shape is (batch, past_seq_length + seq_length).
-        #  Currently only (batch, past_seq_length + seq_length) shape is supported.
-        mask_index = inputs[5]
-
-        # Zero point of quantized input tensor.
-        # Scalar, which means a per-tensor/layer quantization
-        input_zero_point = inputs[6]
-
-        # Zero point of quantized weight tensor.
-        # Scalar or a 1D tensor, which means a per-tensor/per-column quantization.
-        # Its size should be 3 * out_hidden if it is per-column quantization
-        weight_zero_point = inputs[7]
-
-        # (2, batch, num_heads, past_seq, head_size)
-        past = inputs[8]
-
-        # ************************* Parse inputs *************************
-        t1 = ["int8", "uint8"]
-        t2 = ["int8", "uint8"]
-        t3 = ["float32", "float16"]
-        t4 = ["int32"]
-
-        # input
-        batch_size, seq_len, input_hidden = cls._check_input_embeddings(input_emb, t1)
-
-        # weight
-        out_hidden_x3, out_hidden, head_size = cls._check_weights(
-            weight, t2, num_heads=num_heads, input_hidden=input_hidden
-        )
-
-        # bias
-        cls._check_bias(bias, t3, out_hidden_x3=out_hidden_x3)
-
-        # input_scale
-        assert infer_type(input_scale).checked_type.dtype in t3
-        input_scale = get_scalar(
-            input_scale, params, dtype=infer_type(input_scale).checked_type.dtype
-        )
-
-        # weight_scale
-        assert infer_type(weight_scale).checked_type.dtype in t3
-        # TODO(agladyshev): now QNN Batch Matmul only supports scalar types for scale and zero_point
-        weight_scale = get_scalar(
-            weight_scale, params, dtype=infer_type(weight_scale).checked_type.dtype
-        )
-
-        # mask_index
-        assert (
-            mask_index is not None
-        ), "Attention import currently only supports required mask_index"
-        total_seq_len = cls._check_mask_index(
-            mask_index, t4, batch_size=batch_size, seq_len=seq_len
-        )
-
-        # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel)
-        zero_point_zero = _expr.const(0, "int32")
-
-        # input_zero_point
-        if input_zero_point is None:
-            input_zero_point = zero_point_zero
-        else:
-            assert infer_type(input_zero_point).checked_type.dtype in t1
-            # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel)
-            input_zero_point = get_scalar(input_zero_point, params, dtype="int32")
-
-        # weight_zero_point
-        if weight_zero_point is None:
-            weight_zero_point = zero_point_zero
-        else:
-            assert infer_type(weight_zero_point).checked_type.dtype in t2
-            # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel)
-            weight_zero_point = get_scalar(weight_zero_point, params, dtype="int32")
-
-        # past (2, batch_size, num_heads, past_sequence_length, head_size)
-        past_seq_len = 0
-        if past is not None:
-            past_seq_len = cls._check_past(
-                past,
-                t3,
-                batch_size=batch_size,
-                num_heads=num_heads,
-                seq_len=seq_len,
-                total_seq_len=total_seq_len,
-                head_size=head_size,
-            )
-
-        # ************************* Create Relay *************************
-        # Add batch dimension for QNN Batch Matmul
-        weight = _op.expand_dims(weight, 0, num_newaxis=1)
-        weight = _op.concatenate([weight] * batch_size, axis=0)
-
-        # Split weight and biases and do the Matmul
-        w_Q, w_K, w_V = _op.split(weight, 3, axis=-1)
-        b_Q, b_K, b_V = _op.split(bias, 3, axis=-1)
-
-        def qmatmul_dequantize_bias(
-            lhs, rhs, lhs_scale, rhs_scale, lhs_zero_point, rhs_zero_point, bias
-        ):
-            rhs_transposed = _op.transpose(rhs, axes=[0, 2, 1])  # QNN Batch Matmul do: X * Y^T
-            result = _qnn.op.batch_matmul(
-                lhs, rhs_transposed, lhs_zero_point, rhs_zero_point, lhs_scale, rhs_scale
-            )
-            # In our case zero point and scale are scalar, therefore 'axis' doesn't matter
-            result = _qnn.op.dequantize(result, _op.multiply(lhs_scale, rhs_scale), zero_point_zero)
-            result = _op.add(result, bias)
-            return result
-
-        Q = qmatmul_dequantize_bias(
-            input_emb, w_Q, input_scale, weight_scale, input_zero_point, weight_zero_point, b_Q
-        )
-        K = qmatmul_dequantize_bias(
-            input_emb, w_K, input_scale, weight_scale, input_zero_point, weight_zero_point, b_K
-        )
-        V = qmatmul_dequantize_bias(
-            input_emb, w_V, input_scale, weight_scale, input_zero_point, weight_zero_point, b_V
-        )
-
-        Q = cls._split_into_heads(Q, batch_size, seq_len, num_heads, head_size)
-        K = cls._split_into_heads(K, batch_size, seq_len, num_heads, head_size)
-        V = cls._split_into_heads(V, batch_size, seq_len, num_heads, head_size)
-
-        # Concatenate (past_K, past_V) with (K, V) by sequence axis:
-        # (batch_size, num_heads, past_sequence_length + sequence_length, head_size)
-        if past is not None and past_seq_len > 0:
-            K_past, V_past = _op.split(past, 2, axis=0)
-            K = _op.concatenate([_op.squeeze(K_past, axis=[0]), K], axis=2)
-            V = _op.concatenate([_op.squeeze(V_past, axis=[0]), V], axis=2)
-
-        # Prepare present state for Key and Value with shape
-        # (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size)
-        present = _op.stack([K, V], axis=0)
-
-        Q = cls._merge_first_dimensions(Q)
-        K = cls._merge_first_dimensions(K)
-        V = cls._merge_first_dimensions(V)
-
-        # Compute Attention output
-        output = cls._compute_attention(
-            Q,
-            K,
-            V,
-            mask_index,
-            unidirectional=unidirectional,
-            batch_size=batch_size,
-            out_hidden=out_hidden,
-            num_heads=num_heads,
-            head_size=head_size,
-            seq_len=seq_len,
-            past_seq_len=past_seq_len,
-        )
-
-        return _expr.TupleWrapper(_expr.Tuple([output, present]), 2)
-
-
-class Gemm(OnnxOpConverter):
-    """Operator converter for Gemm."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert (
-            len(inputs) == 3 or len(inputs) == 2
-        ), f"Gemm op take 2 or 3 inputs, {len(inputs)} given"
-        input0_state = infer_type(inputs[0])
-        dtype = input0_state.checked_type.dtype
-        # Y = alpha * A * B + beta * C
-        alpha = float(attr.get("alpha", 1.0))
-        beta = float(attr.get("beta", 1.0))
-        transA = int(attr.get("transA", 0))
-        transB = int(attr.get("transB", 0))
-        # get number of channels
-        channels = infer_channels(inputs[1], not transB)
-        if transA:
-            inputs[0] = _op.transpose(inputs[0], axes=(1, 0))
-        if not transB:
-            inputs[1] = _op.transpose(inputs[1], axes=(1, 0))
-        if len(input0_state.checked_type.shape) != 2:
-            inputs[0] = _op.nn.batch_flatten(inputs[0])
-        if alpha != 1.0:
-            inputs[0] *= _expr.const(alpha, dtype=dtype)
-        out = _op.nn.dense(inputs[0], inputs[1], units=channels)
-        if len(inputs) == 3:
-            if beta != 1.0:
-                out += _expr.const(float(beta), dtype=dtype) * inputs[2]
-            else:
-                out += inputs[2]
-        return out
-
-
-class MatMul(OnnxOpConverter):
-    """Operator converter for MatMul."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 2, f"MatMul op take 2 inputs, {len(inputs)} given"
-        # Need to check input shape as batch matmul must be supported.
-        return matmul_out_dtype(inputs, out_dtype=infer_type(inputs[0]).checked_type.dtype)
-
-
-class MatMulInteger16(OnnxOpConverter):
-    """Operator converter for MatMulInteger16 from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        assert len(inputs) == 2, f"MatMulInteger16 op take 2 inputs, {len(inputs)} given"
-        a_dtype = infer_type(inputs[0]).checked_type.dtype
-        b_dtype = infer_type(inputs[1]).checked_type.dtype
-        # Check input data types
-        assert a_dtype in ("int16", "uint16"), "MatMulInteger16: invalid dtype for first input"
-        assert b_dtype in ("int16", "uint16"), "MatMulInteger16: invalid dtype for second input"
-        out_dtype = "int32"
-        if a_dtype == "uint16" and b_dtype == "uint16":
-            out_dtype = "uint32"
-        return matmul_out_dtype(inputs, out_dtype)
-
-
-class Mod(OnnxOpConverter):
-    """Operator converter for Mod."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 2, f"Mod op take 2 inputs, {len(inputs)} given"
-
-        # Note: attr['fmod'] determines whether the operator should behave like np.fmod or np.mod.
-        # attr['fmod'] == 0 will behave as np.mod and attr['fmod'] == 1 will force fmod treatment.
-        # The relay equivalent of np.fmod is relay.mod and np.mod is relay.floor_mod
-        if attr.get("fmod", 0) == 0:
-            op_name = "floor_mod"
-        else:
-            op_name = "mod"
-
-        return AttrCvt(op_name)(inputs, {}, params)
-
-
-class MaxPool(Pool):
-    """Operator converter for MaxPool"""
-
-    name = "max_pool"
-
-
-class MaxUnpool(OnnxOpConverter):
-    """Operator converter for MaxUnpool"""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Unpack inputs and attributes
-        data = inputs[0]
-        data_type = infer_type(data).checked_type.dtype
-        indices = inputs[1]
-        output_shape = inputs[2]
-        kernel_shape = attr.get("kernel_shape")
-        pads = attr.get("pads", None)
-        strides = attr.get("strides", [1] * len(kernel_shape))
-
-        # Compute the proper output shape before padding.
-        multiplier = _op.concatenate(
-            [_expr.const([1, 1], dtype="int64"), _expr.const(list(strides), dtype="int64")], axis=0
-        )
-        total_output_shape = multiplier * shape_of(data, dtype="int64")
-        # Add extra dimensions from kernel size and stride mismatch
-        total_output_shape += _op.concatenate(
-            [_expr.const([0, 0], "int64"), _expr.const(list(kernel_shape), "int64")], axis=0
-        ) - _op.concatenate(
-            [_expr.const([0, 0], "int64"), _expr.const(list(strides), "int64")], axis=0
-        )
-
-        # Compute padding amount if output shape is specified.
-        if output_shape is not None:
-            total_output_shape = output_shape
-
-        elif pads is not None:
-            # Get pads in the proper format for relay.
-            pads = _op.concatenate(
-                [_expr.const([0, 0, 0, 0], "int64"), _expr.const(list(pads), "int64")], axis=0
-            )
-            pads = _op.reshape(pads, [-1, 2])
-            # Compute the total padding per axis.
-            total_pad = _op.sum(pads, axis=-1)
-            # Reversing maxpool means that padding actually makes our output smaller.
-            total_output_shape = total_output_shape - total_pad
-
-        # Create a tensor of zeros then scatter our data through it.
-        zeros_tensor = _op.zeros(total_output_shape, data_type)
-        # We need to flatten all our tensors before scattering.
-        flat_tensor = _op.scatter_elements(
-            _op.reshape(zeros_tensor, [-1]),
-            _op.reshape(indices, [-1]),
-            _op.reshape(data, [-1]),
-            axis=0,
-        )
-        # Now reshape back to prepadded shape.
-        output_tensor = _op.reshape(flat_tensor, total_output_shape)
-
-        return output_tensor
-
-
-class LpPool(OnnxOpConverter):
-    """A helper class for lppool op converters."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        dtype = infer_type(inputs[0]).checked_type.dtype
-        data = inputs[0]
-        input_shape = infer_shape(data)
-        ndim = len(input_shape)
-        num_spatial_dims = ndim - 2
-        if "auto_pad" in attr:
-            attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                # Warning: LpPool does not yet support dynamic shapes,
-                # one will need to run dynamic_to_static on this model after import
-                data = autopad(
-                    data,
-                    # this is meant to handle the field 'strides' being optional for opsets 11+
-                    attr.get("strides", [1] * num_spatial_dims),
-                    attr["kernel_shape"],
-                    [1] * ndim,
-                    mode=attr["auto_pad"],
-                )
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = tuple([0 for i in range(ndim - 2)])
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator LpPool '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            attr.pop("auto_pad")
-
-        if "storage_order" in attr:
-            attr["layout"] = onnx_storage_order2layout(
-                attr["storage_order"], dims=(len(input_shape) - 2), op_name="LpPool"
-            )
-        else:
-            attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2), op_name="LpPool")
-
-        p_value = attr.get("p", 2)
-        p = _expr.const(p_value, dtype)
-        reci_p = _expr.const(1.0 / p_value, dtype)
-        data = _op.power(data, p)
-
-        out = AttrCvt(
-            op_name=dimension_picker("avg_pool"),
-            transforms={"kernel_shape": "pool_size", "pads": ("padding", 0)},
-            extras={"count_include_pad": True},
-            ignores=["p"],
-            custom_check=dimension_constraint(),
-        )([data], attr, params)
-        kernels = attr["kernel_shape"]
-        out = _op.abs(out) * _expr.const(np.prod(kernels).astype(dtype))
-        return _op.power(out, reci_p)
-
-
-class GlobalLpPool(OnnxOpConverter):
-    """Operator converter for GlobalLpPool."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # TODO: GlobalLpPool does not yet support dynamic shapes
-        in_shape = infer_shape(inputs[0])
-        attr["kernel_shape"] = in_shape[2:]
-
-        return LpPool._impl_v1(inputs, attr, params)
-
-
-class Mul(Elemwise):
-    """Operator converter for Multiply."""
-
-    name = "multiply"
-
-
-class Pad(OnnxOpConverter):
-    """Operator converter for Pad."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        pad_width = []
-        pads = attr.pop("paddings")
-        dims = int(len(pads) / 2)
-        for i in range(dims):
-            pad_width.append((pads[i], pads[i + dims]))
-        attr["pad_width"] = pad_width
-        pad_mode = attr.get("mode", b"constant").decode("utf-8")
-        if pad_mode in ["constant", "edge", "reflect"]:
-            attr["pad_mode"] = pad_mode
-            attr.pop("mode", None)
-        else:
-            raise tvm.error.OpAttributeInvalid(
-                "Value " + pad_mode + ' in attribute "mode" is invalid for operator Pad.'
-            )
-
-        return AttrCvt(_op.nn.pad, transforms={"value": "pad_value"})(inputs, attr, params)
-
-    @classmethod
-    def _impl_v2(cls, inputs, attr, params):
-        pad_width = []
-        pads = attr.pop("pads")
-        dims = int(len(pads) / 2)
-        for i in range(dims):
-            pad_width.append((pads[i], pads[i + dims]))
-        attr["pad_width"] = pad_width
-        pad_mode = attr.get("mode", b"constant").decode("utf-8")
-        if pad_mode in ["constant", "edge", "reflect"]:
-            attr["pad_mode"] = pad_mode
-            attr.pop("mode", None)
-        else:
-            raise tvm.error.OpAttributeInvalid(
-                "Value " + pad_mode + ' in attribute "mode" is invalid for operator Pad.'
-            )
-
-        return AttrCvt("pad", transforms={"value": "pad_value"})(inputs, attr, params)
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        pads = inputs[1]
-        if len(inputs) == 3 and inputs[2] is not None:
-            value = fold_constant(_op.take(inputs[2], _op.const(0)))
-        else:
-            value = 0.0
-
-        pad_width_expr = fold_constant(_op.transpose(_op.reshape(pads, (2, -1))))
-        pad_mode = attr.get("mode", b"constant").decode("utf-8")
-        if not pad_mode in ["constant", "edge", "reflect"]:
-            raise tvm.error.OpAttributeInvalid(
-                "Value " + pad_mode + ' in attribute "mode" is invalid for operator Pad.'
-            )
-
-        return _op.nn.pad(inputs[0], pad_width_expr, value, pad_mode=pad_mode)
-
-
-class ParametricSoftPlus(OnnxOpConverter):
-    """Operator converter for ParametricSoftPlus."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = _expr.const(float(attr.get("alpha", 1.0)))
-        beta = _expr.const(float(attr.get("beta", 1.0)))
-        return _op.log(_op.exp(beta * inputs[0]) + _expr.const(1.0)) * alpha
-
-
-class Pow(OnnxOpConverter):
-    """Operator converter for Pow."""
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        x = inputs[0]
-        y = inputs[1]
-
-        x_type = infer_type(x).checked_type.dtype
-        output_type = x_type
-        y_type = infer_type(y).checked_type.dtype
-
-        if not x_type.startswith("float"):
-            x_type = "float32"
-            x = _op.cast(x, x_type)
-
-        if x_type != y_type:
-            y = _op.cast(y, x_type)
-
-        # TODO: come up with good default integer pow() func for common backends
-        result = _op.power(x, y)
-        if x_type != output_type:
-            return _op.cast(result, output_type)
-        return result
-
-
-class Prelu(OnnxOpConverter):
-    """Operator converter for Prelu."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 2, f"Prelu need 2 inputs, {len(inputs)} given"
-        input_shape = shape_of(inputs[0])
-        alpha = _op.broadcast_to_like(inputs[1], inputs[0])
-        alpha = _op.reshape(alpha, [-1])
-        output = _op.nn.prelu(_op.reshape(inputs[0], [-1]), alpha, axis=0)
-        return _op.reshape(output, input_shape)
-
-
-class Reciprocal(OnnxOpConverter):
-    """Operator converter for Reciprocal."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        dtype = infer_type(inputs[0]).checked_type.dtype
-        return _expr.const(1.0, dtype=dtype) / inputs[0]
-
-
-class Flatten(OnnxOpConverter):
-    """Operator converter for Flatten."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        axis = attr.get("axis", 1)
-        ishape = shape_of(inputs[0])
-        ndim = infer_shape(ishape)[0]
-        if axis < 0:
-            axis = axis + ndim
-
-        if axis == 1:
-            out = _op.nn.batch_flatten(inputs[0])
-        else:
-            pre_shape = _op.prod(_op.strided_slice(ishape, [0], [axis], [1]), keepdims=True)
-            post_shape = _op.prod(_op.strided_slice(ishape, [axis], [ndim], [1]), keepdims=True)
-            newshape = fold_constant(_op.concatenate([pre_shape, post_shape], axis=0))
-            out = _op.reshape(inputs[0], newshape)
-        return out
-
-
-class Reshape(OnnxOpConverter):
-    """Operator converter for Reshape."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return _op.reshape(inputs[0], attr["shape"])
-
-    @classmethod
-    def _impl_v5(cls, inputs, attr, params):
-        allowzero = attr.get("allowzero", False)
-        if get_name(inputs[1]) in params:
-            shape = tuple(params[inputs[1].name_hint].numpy().astype("int32"))
-            out = _op.reshape(inputs[0], shape, allowzero=allowzero)
-        else:
-            out = _op.reshape(*inputs, allowzero=allowzero)
-        return out
-
-
-class DepthToSpace(OnnxOpConverter):
-    """Operator converter for DepthToSpace."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        block_size = int(attr["blocksize"])
-        mode = attr.get("mode", b"DCR").decode("utf-8")
-        return _op.nn.depth_to_space(inputs[0], block_size, mode=mode)
-
-
-class SpaceToDepth(OnnxOpConverter):
-    """Operator converter for SpaceToDepth."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-
-        block_size = int(attr["blocksize"])
-        return _op.nn.space_to_depth(inputs[0], block_size)
-
-
-class Concat(OnnxOpConverter):
-    """Operator converter for Concat."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, args, params):
-        return AttrCvt(op_name="concatenate")((inputs,), args)
-
-
-class Scale(OnnxOpConverter):
-    """Operator converter for Scale."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        scale = float(attr.get("scale", 1.0))
-        return inputs[0] * _expr.const(scale)
-
-
-class Selu(OnnxOpConverter):
-    """Operator converter for Selu."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = float(attr.get("alpha", 1.67326319217681884765625))
-        gamma = float(attr.get("gamma", 1.05070102214813232421875))
-        return _expr.const(gamma) * (
-            _expr.const(-alpha) * _op.nn.relu(_expr.const(1.0) - _op.exp(inputs[0]))
-            + _op.nn.relu(inputs[0])
-        )
-
-
-class ScaledTanh(OnnxOpConverter):
-    """Operator converter for ScaledTanh."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = float(attr.get("alpha", 1.0))
-        beta = float(attr.get("beta", 1.0))
-        return _op.tanh(_expr.const(beta) * inputs[0]) * _expr.const(alpha)
-
-
-class Shrink(OnnxOpConverter):
-    """Operator converter for Shrink."""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        x = inputs[0]
-        dtype = infer_type(x).checked_type.dtype
-        lambd = _op.const(attr.get("lambd", 0.5), dtype=dtype)
-        bias = _op.const(attr.get("bias", 0.0), dtype=dtype)
-
-        zeros = _op.zeros_like(x)
-        return _op.where(x < -lambd, x + bias, zeros) + _op.where(x > lambd, x - bias, zeros)
-
-
-class Softsign(OnnxOpConverter):
-    """Operator converter for Softsign."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return inputs[0] / (_expr.const(1.0) + Absolute.get_converter(1)(inputs, attr, params))
-
-
-class Sub(Elemwise):
-    """Operator converter for Subtract."""
-
-    name = "subtract"
-
-
-class Sum(OnnxOpConverter):
-    """Operator converter for Sum."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # Onnx Sum Operator
-        for in_index in range(len(inputs) - 1):
-            inputs[in_index + 1] = _op.add(inputs[in_index], inputs[in_index + 1])
-
-        return inputs[len(inputs) - 1]
-
-
-class Optional_(OnnxOpConverter):
-    """Operator converter for Optional based on sequence construction op."""
-
-    @classmethod
-    def _impl_v15(cls, inputs, attr, params):
-        return SequenceConstruct._impl_v11(inputs, attr, params)
-
-
-class OptionalHasElement(OnnxOpConverter):
-    """Operator converter for OptionalHasElement."""
-
-    @classmethod
-    def _impl_v15(cls, inputs, attr, params):
-        shape = infer_shape(inputs[0])
-        return _op.const(True) if shape else _op.const(False)
-
-
-class OptionalGetElement(OnnxOpConverter):
-    """Operator converter for OptionalGetElement based on sequence construction op."""
-
-    @classmethod
-    def _impl_v15(cls, inputs, attr, params):
-        opt_as_seq = Optional_._impl_v15(inputs, attr, params)
-        return _expr.TupleGetItem(opt_as_seq, 0)
-
-
-class Affine(OnnxOpConverter):
-    """Operator converter for Affine transformation."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = _expr.const(attr.get("alpha", 1.0))
-        beta = _expr.const(attr.get("beta", 0.0))
-        return (alpha * inputs[0]) + beta
-
-
-class ThresholdedRelu(OnnxOpConverter):
-    """Operator converter for ThresholdedRelu."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = float(attr.get("alpha", 1.0))
-        alpha_tensor = _op.full_like(inputs[0], fill_value=_expr.const(alpha))
-        mask = _op.greater(inputs[0], alpha_tensor).astype("float32")
-        return inputs[0] * mask
-
-
-def _broadcast_constraint():
-    def _broadcast_check(attrs):
-        if attrs.get("axis", None):
-            return False
-        return True
-
-    return _broadcast_check, "Specifying broadcast axis not allowed."
-
-
-def _fully_connected(opset):
-    def _impl(inputs, attr, params):
-        # get number of channels
-        channels = infer_channels(inputs[1], params)
-        attr["units"] = channels
-        return AttrCvt("dense", ignores=["axis", "axis_w"])(inputs, attr)
-
-    return _impl
-
-
-class Upsample(OnnxOpConverter):
-    """Operator converter for Upsample (nearest mode)."""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        scales = attr.get("scales")
-
-        input_shape = infer_shape(inputs[0])
-        dims = len(input_shape)
-
-        if not scales:
-            # Here we are going to higher OPSET version.
-            assert len(inputs) == 2, f"Upsample op takes 2 inputs, {len(inputs)} given"
-
-            if get_name(inputs[1]) in params:
-                scales = params[inputs[1].name_hint].numpy()
-            else:
-                scales = inputs[1]
-        if isinstance(scales, _expr.Constant):
-            scales = list(scales.data.numpy())
-        if not isinstance(scales, _expr.Expr):
-            assert scales[0] == 1.0 and scales[1] == 1.0
-
-        mode = attr.get("mode", b"nearest")
-        if mode == b"nearest":
-            method = "nearest_neighbor"
-        elif mode == b"linear":
-            method = "trilinear" if dims == 5 else "bilinear"
-        else:
-            raise tvm.error.OpAttributeInvalid(
-                f'Value {mode} in attribute "mode" of operator Upsample is not valid.'
-            )
-
-        # in 3d case, we use the purely static op
-        if dims == 5:
-            if isinstance(scales, _expr.Expr):
-                scale_h = _op.take(scales, _op.const(3))
-                scale_w = _op.take(scales, _op.const(4))
-                scale_d = _op.take(scales, _op.const(1))
-            else:
-                assert len(scales) == 5
-                scale_h = scales[-2]
-                scale_w = scales[-1]
-                scale_d = scales[-3]
-
-            layout = "NCDHW"
-            out = _op.nn.upsampling3d(
-                inputs[0],
-                scale_d,
-                scale_h,
-                scale_w,
-                layout=layout,
-                method=method,
-                coordinate_transformation_mode="asymmetric",
-            )
-        # in 2d case, use dynamic op
-        else:
-            if isinstance(scales, _expr.Expr):
-                scale_h = _op.take(scales, _op.const(3))
-                scale_w = _op.take(scales, _op.const(4))
-            else:
-                assert len(scales) == 4
-                scale_h = scales[-2]
-                scale_w = scales[-1]
-            layout = "NCHW"
-
-            out = _op.nn.upsampling(
-                inputs[0], scale_h, scale_w, layout=layout, method=method, align_corners=False
-            )
-        return out
-
-
-class Shape(OnnxOpConverter):
-    """Operator converter for Shape."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return shape_of(inputs[0], "int64")
-
-    @classmethod
-    def _impl_v15(cls, inputs, attr, params):
-        start = attr.get("start")
-        end = attr.get("end")
-        return shape_of(inputs[0], dtype="int64", start=start, end=end)
-
-
-class CumSum(OnnxOpConverter):
-    """Operator converter for CumSum."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        data = inputs[0]
-        dim = inputs[1]
-
-        if dim is not None:
-            dim = int(infer_value(dim, params).numpy())
-
-        exclusive = attr.get("exclusive", 0)
-        reverse = attr.get("reverse", 0)
-
-        if reverse != 0:
-            out = _op.reverse(data, axis=dim)
-            out = _op.cumsum(out, axis=dim, exclusive=exclusive)
-            return _op.reverse(out, axis=dim)
-
-        return _op.cumsum(data, axis=dim, exclusive=exclusive)
-
-
-class Cast(OnnxOpConverter):
-    """Operator converter for Cast."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return AttrCvt(op_name="cast", transforms={"to": "dtype"})(inputs, attr)
-
-    @classmethod
-    def _impl_v6(cls, inputs, attr, params):
-        try:
-            from onnx import TensorProto
-        except ImportError as e:
-            raise ImportError(f"Unable to import TensorProto from onnx {e}")
-
-        # If onnx mapping is used, bfloat16 gets converted to float16
-        # which is not the desired behavior
-        if attr["to"] == int(TensorProto.BFLOAT16):
-            attr["to"] = "bfloat16"
-        else:
-            try:
-                from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-
-                attr["to"] = str(TENSOR_TYPE_TO_NP_TYPE[attr["to"]])
-            except ImportError as e:
-                raise ImportError(f"Unable to import onnx.mapping which is required {e}")
-
-        return AttrCvt(op_name="cast", transforms={"to": "dtype"})(inputs, attr)
-
-
-class CastLike(OnnxOpConverter):
-    """Operator converter for CastLike."""
-
-    @classmethod
-    def _impl_v15(cls, inputs, attr, params):
-        return AttrCvt(op_name="cast_like")(inputs, attr)
-
-
-class Unsqueeze(OnnxOpConverter):
-    """Operator converter for Unsqueeze."""
-
-    @classmethod
-    def run_calculation(cls, tensor, axes):
-        axes = sorted(axes)
-        for axis in axes:
-            if axis < 0 and isinstance(tensor, _expr.Var):
-                axis = len(tensor.type_annotation.concrete_shape) + len(axes) + axis
-            tensor = _op.expand_dims(tensor, axis=axis, num_newaxis=1)
-        return tensor
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return cls.run_calculation(inputs[0], attr["axes"])
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        if isinstance(inputs[1], _expr.Constant):
-            constant_axes = list(inputs[1].data.numpy())
-            constant_axes = list(map(int, constant_axes))
-            return cls.run_calculation(inputs[0], constant_axes)
-
-        rank_input = len(infer_type(inputs[0]).checked_type.shape)
-        num_new_axis = int(infer_type(inputs[1]).checked_type.shape[0])
-        axes = relay.sort(inputs[1])
-        axes = relay.split(axes, num_new_axis).astuple()
-        rank_output = rank_input + num_new_axis
-        result = inputs[0]
-
-        # TODO (AndrewZhaoLuo): investigate performance issues with consecutive
-        # dynamic expand_dims on non-llvm targets.
-        for i in range(num_new_axis):
-            axis = relay.TupleGetItem(axes, i)
-            # Unpack scalar
-            axis = relay.reshape(axis, [])
-            axis = relay.where(
-                axis >= relay.const(0, "int64"), axis, axis + relay.const(rank_output, "int64")
-            )
-            result = _op.expand_dims(result, axis)
-        return result
-
-
-class Squeeze(OnnxOpConverter):
-    """Operator converter for Squeeze."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        axis = attr.get("axes", None)
-        return _op.squeeze(inputs[0], axis)
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        ishape = infer_shape(inputs[0])
-        axis = inputs[1]
-
-        if axis is None:
-            # If axes is not provided, all the single dimensions will be removed from the shape.
-            if not ishape:  # scalar
-                return inputs[0]
-
-            axis = [i for i in range(len(ishape)) if ishape[i] == 1]
-            axis = _op.const(axis)
-
-        dtype = infer_type(axis).checked_type.dtype
-
-        if isinstance(axis, _expr.Constant):
-            constant_axes = list(axis.data.numpy())
-            constant_axes = list(map(int, constant_axes))
-            return _op.squeeze(inputs[0], constant_axes)
-
-        rank = _op.shape_of(_op.shape_of(inputs[0], dtype), dtype)
-        axis = _op.where(axis < _op.const(0, dtype), axis + rank, axis)
-        return _op.squeeze(inputs[0], fold_constant(axis))
-
-
-class Split(OnnxOpConverter):
-    """Operator converter for Split."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        splits = attr.get("split", None)
-        if splits is not None and len(splits) > 1:
-            indices = []
-            index = 0
-            for i in splits[:-1]:
-                index += i
-                indices.append(index)
-        # When splits isnt specified divide evenly over axis.
-        else:
-            indices = attr["tvm_custom"]["num_outputs"]
-        output = _op.split(inputs[0], indices, attr.get("axis", 0))
-        # If the output of split is a single value, unpack if from the TupleWrapper
-        if len(output) == 1:
-            output = output[0]
-        return output
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        splits = inputs[1]
-        splits_rank = None
-        if splits is not None:
-            splits_rank = len(infer_shape(splits))
-        if splits is not None and splits_rank > 0:
-            if isinstance(splits, _expr.Constant):
-                splits = splits.data.asnumpy()
-                indices = []
-                index = 0
-                for i in splits[:-1]:
-                    index += i
-                    indices.append(index)
-            else:
-                raise ValueError("Dynamic Split not yet supported")
-        # When splits isnt specified divide evenly over axis.
-        else:
-            indices = attr["tvm_custom"]["num_outputs"]
-        output = _op.split(inputs[0], indices, attr.get("axis", 0))
-        # If the output of split is a single value, unpack if from the TupleWrapper
-        if len(output) == 1:
-            output = output[0]
-        return output
-
-
-class Slice(OnnxOpConverter):
-    """Operator converter for Slice."""
-
-    @classmethod
-    def _common(cls, starts, ends, axes):
-        N = max(axes) + 1
-        new_axes = list(range(N))
-        new_starts = [0] * N
-        new_ends = [np.iinfo(np.int32).max] * N
-        for i, axis in enumerate(axes):
-            new_starts[axis] = starts[i]
-            new_ends[axis] = ends[i]
-        return new_starts, new_ends, new_axes
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if isinstance(attr["starts"], int):
-            attr["starts"] = (attr["starts"],)
-            attr["ends"] = (attr["ends"],)
-
-        try:
-            # Update the starts and ends according to axes if required.
-            if isinstance(attr["axes"], int):
-                attr["axes"] = (attr["axes"],)
-            new_starts, new_ends, new_axes = cls._common(attr["starts"], attr["ends"], attr["axes"])
-            attr["axes"] = new_axes
-            attr["starts"] = new_starts
-            attr["ends"] = new_ends
-        except KeyError:
-            pass
-        begin = list(attr["starts"])
-        end = list(attr["ends"])
-
-        return _op.strided_slice(inputs[0], begin=begin, end=end)
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        starts = inputs[1]
-        ends = inputs[2]
-        axes = inputs[3]
-        steps = inputs[4]
-
-        ishape = infer_shape(inputs[0])
-        data_rank = len(ishape)
-
-        if axes is not None:
-            # Normalize for negative axes
-            axes_dtype = infer_type(axes).checked_type.dtype
-            axes = fold_constant(
-                _op.where(
-                    axes < _op.const(0, axes_dtype), axes + _op.const(data_rank, axes_dtype), axes
-                )
-            )
-
-        def has_static_axes():
-            return (
-                isinstance(axes, _expr.Constant)
-                and isinstance(starts, _expr.Constant)
-                and isinstance(ends, _expr.Constant)
-                and (steps is None or isinstance(steps, _expr.Constant))
-            )
-
-        if axes is not None and has_static_axes():
-            axes_np = axes.data.numpy().astype("int64")
-            begin_np = starts.data.numpy().astype("int64")
-            end_np = ends.data.numpy().astype("int64")
-            if steps is None:
-                strides_np = np.ones_like(begin_np).astype("int64")
-            else:
-                strides_np = steps.data.numpy().astype("int64")
-            if all([isinstance(ishape[i], int) for i in axes_np]):
-                return _op.strided_slice(
-                    inputs[0], list(begin_np), list(end_np), list(strides_np), axes=list(axes_np)
-                )
-
-        # Update the starts and ends according to axes if required.
-        if axes is not None:
-            data_shape = shape_of(inputs[0], dtype=infer_type(ends).checked_type.dtype)
-            starts = _op.scatter_elements(
-                _op.const([0] * data_rank, dtype=infer_type(starts).checked_type.dtype),
-                axes,
-                starts,
-                axis=0,
-            )
-            ends = _op.scatter_elements(data_shape, axes, ends, axis=0)
-            if steps is not None:
-                steps = _op.scatter_elements(
-                    _op.const([1] * data_rank, dtype=infer_type(steps).checked_type.dtype),
-                    axes,
-                    steps,
-                    axis=0,
-                )
-
-        if steps is None:
-            steps = _op.const([1] * data_rank, dtype=infer_type(starts).checked_type.dtype)
-
-        return _op.strided_slice(
-            inputs[0], fold_constant(starts), fold_constant(ends), fold_constant(steps)
-        )
-
-
-def normalize_gather_indices(data, indices, axis):
-    """Make sure gather indices aren't negative"""
-    ind_dtype = infer_type(indices).checked_type.dtype
-    # Normalize the indices to a positive range
-    s = _op.take(_op.shape_of(data, dtype=ind_dtype), _op.const(axis, dtype="int64"))
-    cond = fold_constant(indices < _op.const(0, ind_dtype))
-    if isinstance(cond, _expr.Constant):
-        val = cond.data.numpy()
-        if val.size == 1:
-            cond = val.item()
-            if cond:
-                indices = indices + s
-            return indices
-    indices = _op.where(cond, indices + s, indices)
-    return indices
-
-
-class Gather(OnnxOpConverter):
-    """Operator converter for Gather."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        axis = attr.get("axis", 0)
-        data = inputs[0]
-        indices = inputs[1]
-        indices = normalize_gather_indices(data, indices, axis)
-        return _op.take(data, indices, axis)
-
-
-class GatherElements(OnnxOpConverter):
-    """Operator converter for GatherElements."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        data = inputs[0]
-        indices = inputs[1]
-        axis = attr.get("axis", 0)
-        indices = normalize_gather_indices(data, indices, axis)
-        return _op.gather(data, axis, indices)
-
-
-class GatherND(OnnxOpConverter):
-    """Operator converter for GatherND."""
-
-    @classmethod
-    def _impl_common(cls, data, indices, batch_dims=0):
-        indices_dims = len(infer_shape(indices))
-        indices_shape = infer_shape(indices)
-        indices = _op.transpose(indices, axes=[-1] + list(range(indices_dims - 1)))
-        index_rank = indices_shape[-1]
-        return _op.gather_nd(data, indices, batch_dims=batch_dims, index_rank=index_rank)
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return cls._impl_common(inputs[0], inputs[1])
-
-    @classmethod
-    def _impl_v12(cls, inputs, attr, params):
-        batch_dims = attr.get("batch_dims", 0)
-        return cls._impl_common(inputs[0], inputs[1], batch_dims)
-
-
-class Compress(OnnxOpConverter):
-    """Operator converter for compress"""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        input_tensor, condition_tensor = inputs
-
-        axis = attr.get("axis", None)
-
-        # Change one hot tensor to indices e.g. [0, 1, 1, 0, 1] -> [1, 2, 4]
-        condition_tensor = _op.reshape(_op.argwhere(condition_tensor), (-1,))
-
-        if axis is not None:
-            return _op.take(input_tensor, condition_tensor, axis=axis)
-
-        # if axis is None, flatten input tensor before selection
-        input_tensor = _op.reshape(input_tensor, (-1,))
-        return _op.take(input_tensor, condition_tensor, axis=0)
-
-
-class Scatter(OnnxOpConverter):
-    """Operator converter for Scatter."""
-
-    @classmethod
-    def _args_check(cls, inputs, attr):
-        assert (
-            len(inputs) == 3
-        ), f"Scatter takes 3 inputs (data, indices, updates), {len(inputs)} given"
-        assert infer_type(inputs[1]).checked_type.dtype in ["int32", "int64"]
-
-        data_rank = len(infer_shape(inputs[0]))
-        assert data_rank > 0, "Data rank higher than 0 is expected"
-        indices_shape = infer_shape(inputs[1])
-        indices_rank = len(indices_shape)
-        assert indices_rank == data_rank, "Indices rank is not the same as data one"
-        updates_shape = infer_shape(inputs[2])
-        updates_rank = len(updates_shape)
-        assert updates_rank == data_rank, "Updates rank is not the same as data one"
-
-        for i in range(data_rank):
-            assert (
-                indices_shape[i] == updates_shape[i]
-            ), "Indices dimension size should be the same as updates one"
-
-        axis = attr.get("axis", 0)
-        assert -data_rank <= axis < data_rank, "Axis is out of bounds"
-
-        return axis
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        axis = cls._args_check(inputs, attr)
-        return _op.scatter_elements(inputs[0], inputs[1], inputs[2], axis)
-
-
-class ScatterElements(OnnxOpConverter):
-    """Operator converter for ScatterElements."""
-
-    @classmethod
-    def _args_check(cls, inputs, attr, red_valids=None):
-        ret = []
-        assert (
-            len(inputs) == 3
-        ), f"ScatterElements takes 3 inputs (data, indices, updates), {len(inputs)} given"
-        assert infer_type(inputs[1]).checked_type.dtype in ["int32", "int64"]
-
-        axis = attr.get("axis", 0)
-        rank = len(infer_shape(inputs[0]))
-        assert rank > 0, "Data rank higher than 0 is expected"
-        assert -rank <= axis < rank, "Axis is out of bounds"
-        ret.append(axis)
-
-        if red_valids:
-            reduction = attr.get("reduction", None)
-            if reduction is None:
-                reduction = b"update"
-            reduction = reduction.decode("utf-8")
-            assert (
-                reduction in red_valids
-            ), f"Only {red_valids} modes are supported, but {reduction} is gotten"
-            ret.append(reduction)
-
-        return ret
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        axis = cls._args_check(inputs, attr)[0]
-
-        return _op.scatter_elements(inputs[0], inputs[1], inputs[2], axis, "update")
-
-    @classmethod
-    def _impl_v16(cls, inputs, attr, params):
-        axis, reduction = cls._args_check(inputs, attr, ["update", "add", "mul"])
-
-        return _op.scatter_elements(inputs[0], inputs[1], inputs[2], axis, reduction)
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        axis, reduction = cls._args_check(inputs, attr, ["update", "add", "mul", "min", "max"])
-
-        return _op.scatter_elements(inputs[0], inputs[1], inputs[2], axis, reduction)
-
-
-class ScatterND(OnnxOpConverter):
-    """Operator converter for ScatterND."""
-
-    @classmethod
-    def _inputs_check(cls, inputs):
-        assert (
-            len(inputs) == 3
-        ), f"ScatterND takes 3 inputs (data, indices, updates), {len(inputs)} given"
-        assert infer_type(inputs[1]).checked_type.dtype == "int64"
-
-        data_rank = len(infer_shape(inputs[0]))
-        assert data_rank > 0, "Data rank higher than 0 is expected"
-        indices_rank = len(infer_shape(inputs[1]))
-        assert indices_rank > 0, "Indices rank higher than 0 is expected"
-        updates_rank = len(infer_shape(inputs[2]))
-        assert (
-            updates_rank == data_rank + indices_rank - infer_shape(inputs[1])[-1] - 1
-        ), "Updates rank should be equal to data_rank + indices_rank - indices_shape[-1] - 1"
-
-    @classmethod
-    def _reduction_check(cls, attr, red_valids=None):
-        reduction = attr.get("reduction", None)
-        if reduction is None:
-            reduction = b"update"
-        reduction = reduction.decode("utf-8")
-        if red_valids is None:
-            red_valids = ["update"]
-        assert (
-            reduction in red_valids
-        ), f"Only {red_valids} reductions are supported, but {reduction} is gotten"
-
-        return reduction
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        cls._inputs_check(inputs)
-        indices_dim = len(infer_shape(inputs[1]))
-        axes = list(range(indices_dim))
-        return _op.scatter_nd(inputs[0], _op.transpose(inputs[1], axes[-1:] + axes[:-1]), inputs[2])
-
-    @classmethod
-    def _impl_v16(cls, inputs, attr, params):
-        cls._inputs_check(inputs)
-        reduction = cls._reduction_check(attr, ["update", "add", "mul"])
-
-        indices_dim = len(infer_shape(inputs[1]))
-        axes = list(range(indices_dim))
-        return _op.scatter_nd(
-            inputs[0], _op.transpose(inputs[1], axes[-1:] + axes[:-1]), inputs[2], reduction
-        )
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        cls._inputs_check(inputs)
-        reduction = cls._reduction_check(attr, ["update", "add", "mul", "min", "max"])
-
-        indices_dim = len(infer_shape(inputs[1]))
-        axes = list(range(indices_dim))
-        return _op.scatter_nd(
-            inputs[0], _op.transpose(inputs[1], axes[-1:] + axes[:-1]), inputs[2], reduction
-        )
-
-
-class EyeLike(OnnxOpConverter):
-    """Operator converter for EyeLike."""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        dtype = attr.get("dtype", None)
-        if dtype is None:
-            in_checked_type = infer_type(inputs[0]).checked_type
-            in_dtype = in_checked_type.dtype
-            dtype = in_dtype
-        else:
-            dtype = get_type(dtype)
-
-        node_source_name = get_source_name_from_parameter(inputs[0])
-        # since there exists multi-comsumer for the same expression
-        # invoke set_span here to prevent expr-rewritten in span-filling stage
-        in_shape = set_span(_op.shape_of(inputs[0]), node_source_name)
-        zeros = _op.zeros(in_shape, dtype)
-
-        dim = set_span(_op.take(in_shape, _op.const(0)), node_source_name)
-
-        indices = _op.arange(_op.const(0), dim, dtype="int32")
-        ones = _op.full(_op.const(1), _op.reshape(dim, (1,)), dtype=dtype)
-        k = _op.const(attr.get("k", 0), dtype="int32")
-        return _op.scatter_nd(zeros, _op.stack([indices, indices + k], axis=0), ones, "update")
-
-
-class LRN(OnnxOpConverter):
-    """Operator converter for Local Response Normalization."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        """LRN support only NCHW format
-        https://github.com/onnx/onnx/blob/main/docs/Operators.md#LRN
-        """
-        axis = 1
-        alpha = attr.get("alpha", 0.0001)
-        beta = attr.get("beta", 0.75)
-        bias = attr.get("bias", 1.0)
-        nsize = attr.get("size")
-        attr = {"size": nsize, "axis": axis, "alpha": alpha, "beta": beta, "bias": bias}
-        return AttrCvt("lrn")(inputs, attr)
-
-
-class Maximum(OnnxOpConverter):
-    """Operator converter for Maximum."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if len(inputs) == 1:
-            return inputs[0]
-        _max = inputs[0]
-        for i in range(1, len(inputs)):
-            _max = AttrCvt("maximum")([_max, inputs[i]], {})
-        return _max
-
-
-class Minimum(OnnxOpConverter):
-    """Operator converter for Minimum."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if len(inputs) == 1:
-            return inputs[0]
-        _min = inputs[0]
-        for i in range(1, len(inputs)):
-            _min = AttrCvt("minimum")([_min, inputs[i]], {})
-        return _min
-
-
-class Mean(OnnxOpConverter):
-    """Operator converter for Mean."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if len(inputs) == 1:
-            return inputs[0]
-        # avoid overflow
-        concat = _op.concatenate([_op.expand_dims(x, axis=0) for x in inputs], axis=0)
-        return _op.mean(concat, axis=0, keepdims=False)
-
-
-class MeanVarianceNormalization(OnnxOpConverter):
-    """Operator converter for MeanVarianceNormalization."""
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        axis = attr.get("axes", (0, 2, 3))
-        data_mean = _op.mean(inputs[0], axis=axis, keepdims=True)
-        data_mean_squared = _op.power(data_mean, _expr.const(2, "float32"))
-        data_squared = _op.power(inputs[0], _expr.const(2, "float32"))
-        data_squared_mean = _op.mean(data_squared, axis=axis, keepdims=True)
-        return (inputs[0] - data_mean) / _op.sqrt(data_squared_mean - data_mean_squared)
-
-
-class HardSigmoid(OnnxOpConverter):
-    """Operator converter for HardSigmoid."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = attr.get("alpha", 0.2)
-        beta = attr.get("beta", 0.5)
-        transformX = (inputs[0] * _expr.const(alpha)) + _expr.const(beta)
-        attr = {"a_min": 0, "a_max": 1}
-        return AttrCvt("clip")([transformX], attr)
-
-
-class HardSwish(OnnxOpConverter):
-    """Operator converter for HardSwish."""
-
-    @classmethod
-    def _impl_v14(cls, inputs, attr, params):
-        alpha = attr.get("alpha", 1 / 6)
-        beta = attr.get("beta", 0.5)
-        transformX = inputs[0] * _expr.const(alpha) + _expr.const(beta)
-        attr = {"a_min": 0, "a_max": 1}
-        return inputs[0] * AttrCvt("clip")([transformX], attr)
-
-
-class Reduce(OnnxOpConverter):
-    """Operator converter for reduce ops."""
-
-    name = ""
-
-    @classmethod
-    def run_calculation(cls, inputs, axis, keepdims):
-        attr = {"axis": axis, "keepdims": keepdims}
-        return AttrCvt(cls.name)(inputs, attr)
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        if "axes" in attr:
-            axis = attr.get("axes", 0)
-        else:
-            axis_len = len(infer_shape(inputs[0]))
-            axis = list(range(axis_len))
-
-        return cls.run_calculation(inputs, axis, attr.get("keepdims", True))
-
-    @classmethod
-    def _impl_v12(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        if len(inputs) == 2:
-            if isinstance(inputs[1], _expr.Constant):
-                # Get axis and unpack scalar
-                constant_axis = int(inputs[1].data.numpy()[0])
-                return cls.run_calculation([inputs[0]], constant_axis, attr.get("keepdims", True))
-
-            raise ValueError("Dynamic Reduce is not supported yet!")
-
-        return cls._impl_v1(inputs, attr, params)
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        noop_with_empty_axes = attr.get("noop_with_empty_axes", 0)
-        num_axis = int(infer_type(inputs[1]).checked_type.shape[0]) if inputs[1] is not None else 0
-
-        if noop_with_empty_axes and num_axis == 0:
-            return inputs[0]
-
-        if len(inputs) == 2:
-            if isinstance(inputs[1], _expr.Constant):
-                # Get axis and unpack scalar
-                constant_axis = int(inputs[1].data.numpy()[0])
-                return cls.run_calculation([inputs[0]], constant_axis, attr.get("keepdims", True))
-
-            if num_axis > 0:
-                raise ValueError("Dynamic Reduce is not supported yet!")
-
-            axis_len = len(infer_shape(inputs[0]))
-            axis = list(range(axis_len))
-            return cls.run_calculation([inputs[0]], axis, attr.get("keepdims", True))
-
-        return cls._impl_v1(inputs, attr, params)
-
-
-class ReduceMax(Reduce):
-    """Operator converter for ReduceMax."""
-
-    name = "max"
-
-
-class ReduceMin(Reduce):
-    """Operator converter for ReduceMin."""
-
-    name = "min"
-
-
-class ReduceSum(Reduce):
-    """Operator converter for ReduceSum."""
-
-    name = "sum"
-
-
-class ReduceMean(Reduce):
-    """Operator converter for ReduceMean."""
-
-    name = "mean"
-
-
-class ReduceProd(Reduce):
-    """Operator converter for ReduceProd."""
-
-    name = "prod"
-
-
-class ReduceLogSumExp(Reduce):
-    """Operator converter for ReduceLogSumExp."""
-
-    name = "logsumexp"
-
-
-class ReduceSumSquare(OnnxOpConverter):
-    """Operator converter for ReduceSumSquare."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        if "axes" in attr:
-            axis = attr.get("axes", 0)
-        else:
-            axis_len = len(infer_shape(inputs[0]))
-            axis = list(range(axis_len))
-        attr = {"axis": axis, "keepdims": attr.get("keepdims", True)}
-        inputs[0] = inputs[0] * inputs[0]
-
-        return AttrCvt("sum")(inputs, attr)
-
-
-class ReduceL1(OnnxOpConverter):
-    """Operator converter for ReduceL1."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        if "axes" in attr:
-            axis = attr.get("axes", 0)
-        else:
-            axis_len = len(infer_shape(inputs[0]))
-            axis = list(range(axis_len))
-        attr = {"axis": axis, "keepdims": attr.get("keepdims", True)}
-        inputs[0] = _op.abs(inputs[0])
-
-        return AttrCvt("sum")(inputs, attr)
-
-
-class ReduceL2(OnnxOpConverter):
-    """Operator converter for ReduceL2."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        if "axes" in attr:
-            axis = attr.get("axes", 0)
-        else:
-            axis_len = len(infer_shape(inputs[0]))
-            axis = list(range(axis_len))
-        attr = {"axis": axis, "keepdims": attr.get("keepdims", True)}
-        inputs[0] = inputs[0] * inputs[0]
-        out = AttrCvt("sum")(inputs, attr)
-
-        return _op.sqrt(out)
-
-
-class ReduceLogSum(OnnxOpConverter):
-    """Operator converter for ReduceLogSum."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if not infer_shape(inputs[0]):  # promote scalar to 1-D tensor
-            inputs[0] = _op.expand_dims(inputs[0], axis=0)
-
-        if "axes" in attr:
-            axis = attr.get("axes", 0)
-        else:
-            axis_len = len(infer_shape(inputs[0]))
-            axis = list(range(axis_len))
-        attr = {"axis": axis, "keepdims": attr.get("keepdims", True)}
-        out = AttrCvt("sum")(inputs, attr)
-
-        return _op.log(out)
-
-
-class ArgMax(OnnxOpConverter):
-    """Operator converter for ArgMax."""
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        axis = attr.get("axis", 0)
-        keepdims = attr.get("keepdims", True)
-        select_last_index = attr.get("select_last_index", False)
-        attr = {"axis": axis, "keepdims": keepdims, "select_last_index": select_last_index}
-        return _op.cast(AttrCvt("argmax")(inputs, attr), "int64")
-
-
-class ArgMin(OnnxOpConverter):
-    """Operator converter for ArgMin."""
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        axis = attr.get("axis", 0)
-        keepdims = attr.get("keepdims", True)
-        select_last_index = attr.get("select_last_index", False)
-        attr = {"axis": axis, "keepdims": keepdims, "select_last_index": select_last_index}
-        return _op.cast(AttrCvt("argmin")(inputs, attr), "int64")
-
-
-class Softmax(OnnxOpConverter):
-    """Operator converter for Softmax."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        axis = attr.get("axis", 1)
-        in_shape = infer_shape(inputs[0])
-        ndim = len(in_shape)
-        if axis < 0:
-            axis += ndim
-        if axis == 0:
-            reshape_shape = [-1]
-        elif axis == ndim - 1:
-            return _op.nn.softmax(inputs[0], axis=axis)
-        else:
-            axis_val = [in_shape[i] for i in range(axis)]
-            reshape_shape = [np.prod(axis_val)] + [-1]
-        data_reshape = _op.reshape(inputs[0], newshape=reshape_shape)
-        out = _op.nn.softmax(data_reshape, axis=-1)
-        out = _op.reshape(out, newshape=in_shape)
-        return out
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, _):
-        axis = attr.get("axis", -1)
-        ndim = len(infer_shape(inputs[0]))
-        if axis < 0:
-            axis += ndim
-        return _op.nn.softmax(inputs[0], axis=axis)
-
-
-class LogSoftmax(OnnxOpConverter):
-    """Operator converter for Softmax."""
-
-    @classmethod
-    def run_calculation(cls, inputs, attr, params, opset):
-        """Run the calculation for Log Softmax calculation."""
-        res = Softmax.get_converter(opset)(inputs, attr, params)
-        return _op.log(res)
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return cls.run_calculation(inputs, attr, params, opset=1)
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        return cls.run_calculation(inputs, attr, params, opset=13)
-
-
-class Hardmax(OnnxOpConverter):
-    """Operator converter for Hardmax."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        axis = attr.get("axis", 1)
-        ndim = len(infer_shape(inputs[0]))
-        if axis < 0:
-            axis += ndim
-        dtype = infer_type(inputs[0]).checked_type.dtype
-
-        if axis == 0:
-            pre = _op.const([1], "int64")
-        else:
-            pre = _op.prod(
-                _op.strided_slice(shape_of(inputs[0]), [0], [axis], [1]), axis=0, keepdims=True
-            )
-        post = _op.prod(
-            _op.strided_slice(shape_of(inputs[0]), [axis], [2147483647], [1]), axis=0, keepdims=True
-        )
-        newshape = _op.concatenate([pre, post], axis=0)
-        x = _op.reshape(inputs[0], fold_constant(newshape))
-        argmax = _op.argmax(x, axis=1)
-        onehot = _op.one_hot(
-            argmax,
-            _op.const(1.0, dtype),
-            _op.const(0.0, dtype),
-            fold_constant(_op.take(shape_of(x), _op.const([1], "int64"))),
-            1,
-            dtype,
-        )
-        return _op.reshape(onehot, shape_of(inputs[0]))
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params) -> relay.Expr:
-        inferred_type = infer_type(inputs[0])
-        dtype = inferred_type.checked_type.dtype
-        ndim = len(inferred_type.checked_type.shape)
-        axis = attr.get("axis", -1) % ndim
-
-        argmax = _op.argmax(inputs[0], axis=axis)
-        return _op.one_hot(
-            argmax,
-            _op.const(1.0, dtype),
-            _op.const(0.0, dtype),
-            fold_constant(_op.take(shape_of(inputs[0]), _op.const([axis], "int64"))),
-            axis,
-            dtype,
-        )
-
-
-class OneHot(OnnxOpConverter):
-    """Operator converter for OneHot."""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        # Extract relay one_hot inputs.
-        indices, depth, values = inputs
-        ndim = len(infer_shape(indices))
-        # Split onnx on off values into two separate expressions.
-        off_value, on_value = _op.take(values, _op.const(0)), _op.take(values, _op.const(1))
-        # Extract the datatype of the output from on_value.
-        dtype = infer_type(on_value).checked_type.dtype
-        ind_dtype = infer_type(indices).checked_type.dtype
-        # Normalize the indices to a positive range
-        indices = _op.where(
-            indices < _op.const(0, ind_dtype), indices + _op.cast(depth, ind_dtype), indices
-        )
-        # set default value when axis is not set in the model
-        if "axis" not in attr:
-            attr["axis"] = -1
-        axis = attr["axis"]
-        if axis < 0:
-            axis += ndim + 1
-
-        return _op.one_hot(indices, on_value, off_value, depth, axis, dtype=dtype)
-
-
-class ConstantOfShape(OnnxOpConverter):
-    """Operator converter for ConstantOfShape."""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        if "value" in attr:
-            np_value = get_numpy(attr.pop("value"))[0]
-            value = _expr.const(np_value)
-            dtype = np_value.dtype.name
-        else:
-            value = _expr.const(0)
-            dtype = "float32"
-        output = _op.full(value, inputs[0], dtype=dtype)
-        return output
-
-
-class Constant(OnnxOpConverter):
-    """Operator converter for ConstantOfShape."""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        if "value" not in attr:
-            raise tvm.error.OpAttributeRequired("no value in Constant")
-        value = attr.pop("value")
-        # Constants may rarely have string types. These are likely exported
-        # from other frameworks and not actually used in TVM. We'll just use
-        # a zero valued constant for compatibility.
-        if isinstance(value, bytes):
-            np_value = np.asarray([0]).astype("int64")
-        else:
-            np_value = get_numpy(value)
-        dtype = np_value.dtype.name
-        value = _expr.const(np_value, dtype)
-        return value
-
-
-class Sign(OnnxOpConverter):
-    """Operator converter for Sign."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return _op.sign(inputs[0])
-
-
-class Equal(Elemwise):
-    """Operator converter for Equal."""
-
-    name = "equal"
-
-
-class Not(Elemwise):
-    """Operator converter for Not."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return _op.logical_not(inputs[0])
-
-
-class And(Elemwise):
-    """Operator converter for And."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return _op.logical_and(inputs[0], inputs[1])
-
-
-class Tile(Elemwise):
-    """Operator converter for Tile"""
-
-    @classmethod
-    def _impl_v6(cls, inputs, attr, params):
-        return _op.tile(inputs[0], inputs[1])
-
-
-class Erf(OnnxOpConverter):
-    """Operator converter for Erf"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return _op.erf(inputs[0])
-
-
-class Where(OnnxOpConverter):
-    """Operator converter for Where"""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        return _op.where(*inputs)
-
-
-class Or(Elemwise):
-    """Operator converter for Or."""
-
-    @classmethod
-    def _impl_v7(cls, inputs, attr, params):
-        return _op.logical_or(inputs[0], inputs[1])
-
-
-class Expand(OnnxOpConverter):
-    """Operator converter for Expand."""
-
-    @classmethod
-    def _impl_v8(cls, inputs, attr, params):
-        dtype = infer_type(inputs[1]).checked_type.dtype
-        in_shape = shape_of(inputs[0], dtype=dtype)
-        shape = inputs[1]
-
-        # Currently 'op.broadcast_to' expect the rank of the given 'shape'
-        # (the 2nd input) is always higher than that of the given 'input' (the 1st input)
-        # However, ONNX Expand supports multi-directional broadcasting, which allows
-        # above pattern and also some extent of 'shape' can be smaller than the corresponding
-        # extent of 'input'. In this case, the extent of 'shape' must be 1.
-        # https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md
-        # In above cases, we cannot directorly apply 'op.broadcast_to' instead of 'expand'
-        # so, here we solved this problem by expanding the given 'shape' itself.
-        def expand_shape(in_shape, shape):
-            """A function expands the shape when the rank is lower than that of the given
-            intput. Also it replaces the extent of the shape with the corresponding extent
-            of the intput when it is 1.
-            """
-            in_dims = infer_shape(in_shape)[0]
-            new_dims = infer_shape(shape)[0]
-
-            if in_dims < new_dims:
-                in_shape = _op.concatenate(
-                    [_expr.const([1] * (new_dims - in_dims), dtype=dtype), in_shape], axis=0
-                )
-            elif new_dims < in_dims:
-                shape = _op.concatenate(
-                    [_expr.const([1] * (in_dims - new_dims), dtype=dtype), shape], axis=0
-                )
-            new_shape = _op.maximum(in_shape, shape)
-            return new_shape
-
-        shape = fold_constant(expand_shape(in_shape, shape))
-        return _op.broadcast_to(inputs[0], shape=shape)
-
-
-class RNN(OnnxOpConverter):
-    """Operator converter for RNNs such as RNN, LSTM and GRU."""
-
-    @classmethod
-    def _activation_helper(cls, activation, alpha, beta):
-        convert_map = _get_convert_map(1)
-        attrs = {}
-        if alpha is not None:
-            attrs["alpha"] = alpha
-        if beta is not None:
-            attrs["beta"] = beta
-        return lambda x: convert_map[activation.decode("utf-8")]([x], attrs, {})
-
-    @classmethod
-    def _activation_needs_alpha(cls, activation):
-        needs_alpha = ["Affine", "LeakyRelu", "ThresholdedRelu", "ScaledTanh", "HardSigmoid", "Elu"]
-        return activation.decode("utf-8") in needs_alpha
-
-    @classmethod
-    def _activation_needs_beta(cls, activation):
-        needs_beta = ["Affine", "ScaledTanh", "HardSigmoid"]
-        return activation.decode("utf-8") in needs_beta
-
-    @classmethod
-    def bidir_rnn_cell(cls, input_seqs, weight_dicts, acts):
-        """
-        Bidirectional RNN cell
-        """
-        seq_len = len(input_seqs)
-        forward_outputs, fw_H_t = rnn_cell(input_seqs, **weight_dicts[0], act=acts[0])
-
-        reverse_outputs, rev_H_t = rnn_cell(
-            input_seqs, **weight_dicts[1], act=acts[1], backwards=True
-        )
-
-        final_outputs = []
-        for i in range(seq_len):
-            final_outputs.append(
-                _op.stack([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=0)
-            )
-
-        return (_op.stack(final_outputs, axis=0), _op.stack([fw_H_t, rev_H_t], axis=0))
-
-    @classmethod
-    def _default_activations(cls, num_directions):
-        return [_op.tanh] * num_directions
-
-    @classmethod
-    def _get_activations(cls, attr, multiplier, num_directions, rnn_type):
-        """
-        Activation functions
-        """
-        if "activations" in attr:
-            activations = attr["activations"]
-            if len(activations) != multiplier * num_directions:
-                raise NotImplementedError(
-                    f"{rnn_type} assumes {multiplier} * num_directions activation functions "
-                    f"are provided"
-                )
-            alpha_loc = 0
-            alphas = attr.get("activation_alpha", [])
-            if isinstance(alphas, float):
-                alphas = [alphas]
-            beta_loc = 0
-            betas = attr.get("activation_beta", [])
-            if isinstance(betas, float):
-                betas = [betas]
-            acts = []
-            for i in range(multiplier * num_directions):
-                alpha = None
-                beta = None
-                activation = activations[i]
-                if cls._activation_needs_alpha(activation) and len(alphas) > alpha_loc:
-                    alpha = alphas[alpha_loc]
-                    alpha_loc += 1
-                if cls._activation_needs_beta(activation) and len(betas) > beta_loc:
-                    beta = betas[beta_loc]
-                    beta_loc += 1
-                acts.append(cls._activation_helper(activation, alpha, beta))
-        else:
-            acts = cls._default_activations(num_directions)
-        return acts
-
-    @classmethod
-    def _inputs_helper(cls, inputs, layout):
-        """
-        Process inputs
-        """
-        # Unpack inputs, note that if optional and not provided then value will be None.
-        X = inputs[0]
-        Wp = inputs[1]
-        Rp = inputs[2]
-        Bp = inputs[3]
-        sequence_lens = inputs[4]
-        Hp_0 = inputs[5]
-
-        num_directions = infer_shape(Wp)[0]
-
-        if num_directions not in [1, 2]:
-            raise ValueError("num_directions must be either 1 or 2!")
-
-        if layout == 1:
-            X = _op.transpose(X, axes=(1, 0))
-
-        # Initialize state if not provided.
-        if Hp_0 is None:
-            W_dtype = infer_type(Wp).checked_type.dtype
-            X_shape = infer_shape(X)
-            hidden_size = infer_shape(Rp)[-1]
-            batch_size = X_shape[1]
-            Hp_0 = _op.zeros((num_directions, batch_size, hidden_size), W_dtype)
-        elif layout == 1:
-            Hp_0 = _op.transpose(Hp_0, axes=(1, 0))
-
-        # TODO (vvchernov): It can be replaced by _op.split if issue #8412 is resolved
-        X_steps = unbind(X, axis=0)
-
-        H_ts = _op.split(Hp_0, num_directions)
-        Ws = _op.split(Wp, num_directions)
-        Rs = _op.split(Rp, num_directions)
-
-        Bs = None
-        if Bp is not None:
-            Bs = _op.split(Bp, num_directions)
-        return X_steps, H_ts, Ws, Rs, Bs, num_directions, sequence_lens
-
-    @classmethod
-    def _impl_common(cls, inputs, attr, layout):
-        X_steps, H_ts, Ws, Rs, Bs, num_directions, _ = cls._inputs_helper(inputs, layout)
-        acts = cls._get_activations(attr, 1, num_directions, "RNN")
-
-        weights_dicts = []
-        for i in range(num_directions):
-            weights_dict = {}
-
-            weights_dict["hidden_state"] = _op.squeeze(H_ts[i], axis=[0])
-
-            weights_dict["w_inp"] = _op.squeeze(Ws[i], axis=[0])
-            weights_dict["w_hid"] = _op.squeeze(Rs[i], axis=[0])
-            if Bs is not None:
-                Bi, Bh = _op.split(Bs[i], 2, -1)
-                weights_dict["b_inp"] = _op.squeeze(Bi, axis=[0])
-                weights_dict["b_hid"] = _op.squeeze(Bh, axis=[0])
-            weights_dicts.append(weights_dict)
-
-        if num_directions == 2:
-            output, H = RNN.bidir_rnn_cell(
-                input_seqs=X_steps, weight_dicts=weights_dicts, acts=acts
-            )
-        else:
-            # outputs shape = [seqs_num, (batch_size, hidden_size)]
-            outputs, H = rnn_cell(input_seqs=X_steps, **weights_dicts[0], act=acts[0])
-
-            # output shape = (seqs_num, num_directions, batch_size, hidden_size)
-            output = _op.expand_dims(_op.stack(outputs, axis=0), axis=1)
-            H = _op.expand_dims(H, axis=0)
-
-        if layout == 1:
-            output = _op.transpose(output, axes=(1, 0))
-            H = _op.transpose(H, axes=(1, 0))
-        return _expr.TupleWrapper(_expr.Tuple((output, H)), 2)
-
-    @classmethod
-    def _impl_v7(cls, inputs, attr, params):
-        return cls._impl_common(inputs, attr, 0)
-
-    @classmethod
-    def _impl_v14(cls, inputs, attr, params):
-        layout = attr.get("layout", 0)
-        return cls._impl_common(inputs, attr, layout)
-
-
-class LSTM(RNN):
-    """Operator converter for LSTM"""
-
-    @classmethod
-    def bidir_lstm_cell(cls, input_seqs, weight_dicts, acts):
-        """
-        Bidirectional LSTM cell
-        """
-        seq_len = len(input_seqs)
-        forward_outputs, fw_H_t, fw_C_t = lstm_cell(
-            input_seqs, **weight_dicts[0], f_act=acts[0], g_act=acts[1], h_act=acts[2]
-        )
-
-        reverse_outputs, rev_H_t, rev_C_t = lstm_cell(
-            input_seqs,
-            **weight_dicts[1],
-            f_act=acts[3],
-            g_act=acts[4],
-            h_act=acts[5],
-            backwards=True,
-        )
-
-        final_outputs = []
-        for i in range(seq_len):
-            final_outputs.append(
-                _op.stack([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=0)
-            )
-
-        return (
-            _op.stack(final_outputs, axis=0),
-            _op.stack([fw_H_t, rev_H_t], axis=0),
-            _op.stack([fw_C_t, rev_C_t], axis=0),
-        )
-
-    @classmethod
-    def _default_activations(cls, num_directions):
-        return [_op.sigmoid, _op.tanh, _op.tanh] * num_directions
-
-    @classmethod
-    def _impl_common(cls, inputs, attr, layout):
-        X_steps, H_ts, Ws, Rs, Bs, num_directions, _ = cls._inputs_helper(inputs, layout)
-        acts = cls._get_activations(attr, 3, num_directions, "LSTM")
-
-        # cell state
-        Cp_0 = inputs[6]
-        if Cp_0 is None:
-            C_ts = _expr.TupleWrapper(
-                _expr.Tuple([_op.zeros_like(H_ts[i]) for i in range(num_directions)]),
-                num_directions,
-            )
-        else:
-            if layout == 1:
-                Cp_0 = _op.transpose(Cp_0, axes=(1, 0))
-            C_ts = _op.split(Cp_0, num_directions)
-
-        # peepholes
-        Pp = inputs[7]
-        if Pp is not None:
-            p_i, p_o, p_f = _op.split(Pp, 3, axis=1)
-
-            p_is = _op.split(p_i, num_directions)
-            p_fs = _op.split(p_f, num_directions)
-            p_os = _op.split(p_o, num_directions)
-
-        weights_dicts = []
-        for i in range(num_directions):
-            weights_dict = {}
-
-            weights_dict["hidden_state"] = _op.squeeze(H_ts[i], axis=[0])
-            weights_dict["cell_state"] = _op.squeeze(C_ts[i], axis=[0])
-
-            # Weights permutation: onnx format i-o-f-c, lstm cell format i-f-c-o
-            mati, mato, matf, matc = _op.split(_op.squeeze(Ws[i], axis=[0]), 4)
-            weights_dict["w_inp"] = _op.concatenate([mati, matf, matc, mato], axis=0)
-            mati, mato, matf, matc = _op.split(_op.squeeze(Rs[i], axis=[0]), 4)
-            weights_dict["w_hid"] = _op.concatenate([mati, matf, matc, mato], axis=0)
-            if Bs is not None:
-                Bi, Bh = _op.split(Bs[i], 2, -1)
-                mati, mato, matf, matc = _op.split(_op.squeeze(Bi, axis=[0]), 4)
-                weights_dict["b_inp"] = _op.concatenate([mati, matf, matc, mato], axis=0)
-                mati, mato, matf, matc = _op.split(_op.squeeze(Bh, axis=[0]), 4)
-                weights_dict["b_hid"] = _op.concatenate([mati, matf, matc, mato], axis=0)
-            if Pp is not None:
-                weights_dict["p_i"] = _op.squeeze(p_is[i], axis=[0])
-                weights_dict["p_f"] = _op.squeeze(p_fs[i], axis=[0])
-                weights_dict["p_o"] = _op.squeeze(p_os[i], axis=[0])
-            weights_dicts.append(weights_dict)
-
-        if num_directions == 2:
-            output, H, C = LSTM.bidir_lstm_cell(
-                input_seqs=X_steps, weight_dicts=weights_dicts, acts=acts
-            )
-        else:
-            # outputs shape = [seqs_num, (batch_size, hidden_size)]
-            outputs, H, C = lstm_cell(
-                input_seqs=X_steps, **weights_dicts[0], f_act=acts[0], g_act=acts[1], h_act=acts[2]
-            )
-
-            # output shape = (seqs_num, num_directions, batch_size, hidden_size)
-            output = _op.expand_dims(_op.stack(outputs, axis=0), axis=1)
-            H = _op.expand_dims(H, axis=0)
-            C = _op.expand_dims(C, axis=0)
-
-        if layout == 1:
-            output = _op.transpose(output, axes=(1, 0))
-            H = _op.transpose(H, axes=(1, 0))
-            C = _op.transpose(C, axes=(1, 0))
-        return _expr.TupleWrapper(_expr.Tuple((output, H, C)), 3)
-
-
-class GRU(RNN):
-    """Operator convert for GRU"""
-
-    @classmethod
-    def bidir_gru_cell(cls, input_seqs, weight_dicts, acts, sequence_lens=None):
-        """
-        Bidirectional GRU cell
-        """
-        seq_len = len(input_seqs)
-        forward_outputs, fw_H_t = gru_cell(
-            input_seqs,
-            **weight_dicts[0],
-            rz_act=acts[0],
-            n_act=acts[1],
-            sequence_lens=sequence_lens,
-        )
-
-        reverse_outputs, rev_H_t = gru_cell(
-            input_seqs,
-            **weight_dicts[1],
-            rz_act=acts[2],
-            n_act=acts[3],
-            backwards=True,
-            sequence_lens=sequence_lens,
-        )
-
-        final_outputs = []
-        for i in range(seq_len):
-            final_outputs.append(
-                _op.stack([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=0)
-            )
-
-        return (_op.stack(final_outputs, axis=0), _op.stack([fw_H_t, rev_H_t], axis=0))
-
-    @classmethod
-    def _default_activations(cls, num_directions):
-        return [_op.sigmoid, _op.tanh] * num_directions
-
-    @classmethod
-    def _impl_common(cls, inputs, attr, layout):
-        X_steps, H_ts, Ws, Rs, Bs, num_directions, sequence_lens = cls._inputs_helper(
-            inputs, layout
-        )
-        acts = cls._get_activations(attr, 2, num_directions, "GRU")
-        linear_before_reset = attr.get("linear_before_reset", 0)
-
-        weights_dicts = []
-        for i in range(num_directions):
-            weights_dict = {}
-
-            weights_dict["hidden_state"] = _op.squeeze(H_ts[i], axis=[0])
-            weights_dict["linear_before_reset"] = linear_before_reset
-
-            # Weights permutation: onnx format i-o-f-c, lstm cell format i-f-c-o
-            matz, matr, matn = _op.split(_op.squeeze(Ws[i], axis=[0]), 3)
-            weights_dict["w_inp"] = _op.concatenate([matr, matz, matn], axis=0)
-            matz, matr, matn = _op.split(_op.squeeze(Rs[i], axis=[0]), 3)
-            weights_dict["w_hid"] = _op.concatenate([matr, matz, matn], axis=0)
-            if Bs is not None:
-                Bi, Bh = _op.split(Bs[i], 2, -1)
-                matz, matr, matn = _op.split(_op.squeeze(Bi, axis=[0]), 3)
-                weights_dict["b_inp"] = _op.concatenate([matr, matz, matn], axis=0)
-                matz, matr, matn = _op.split(_op.squeeze(Bh, axis=[0]), 3)
-                weights_dict["b_hid"] = _op.concatenate([matr, matz, matn], axis=0)
-            weights_dicts.append(weights_dict)
-
-        if num_directions == 2:
-            output, H = GRU.bidir_gru_cell(
-                input_seqs=X_steps,
-                weight_dicts=weights_dicts,
-                acts=acts,
-                sequence_lens=sequence_lens,
-            )
-        else:
-            # outputs shape = [seqs_num, (batch_size, hidden_size)]
-            outputs, H = gru_cell(
-                input_seqs=X_steps,
-                **weights_dicts[0],
-                rz_act=acts[0],
-                n_act=acts[1],
-                sequence_lens=sequence_lens,
-            )
-
-            # output shape = (seqs_num, num_directions, batch_size, hidden_size)
-            output = _op.expand_dims(_op.stack(outputs, axis=0), axis=1)
-            H = _op.expand_dims(H, axis=0)
-
-        if layout == 1:
-            output = _op.transpose(output, axes=(1, 0))
-            H = _op.transpose(H, axes=(1, 0))
-        return _expr.TupleWrapper(_expr.Tuple((output, H)), 2)
-
-
-class Resize(OnnxOpConverter):
-    """Operator converter for Resize"""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        mode = attr.get("mode", b"nearest").decode("ascii")
-        if mode == "nearest":
-            method = "nearest_neighbor"
-        elif mode == "linear":
-            method = "linear"
-        elif mode == "cubic":
-            method = "cubic"
-        else:
-            raise tvm.error.OpAttributeInvalid(
-                f'Value {mode} in attribute "mode" of operator Resize is not valid.'
-            )
-
-        scale = inputs[1]
-        size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
-        ndims = len(infer_shape(inputs[0]))
-        out = None
-        if ndims == 3:
-            out_size = fold_constant(_op.strided_slice(size, [2], [3]))
-            out = _op.image.resize1d(inputs[0], out_size, None, "NCW", method, "asymmetric")
-        elif ndims == 4:
-            out_size = fold_constant(_op.strided_slice(size, [2], [4]))
-            out = _op.image.resize2d(inputs[0], out_size, None, "NCHW", method, "asymmetric")
-        elif ndims == 5:
-            out_size = fold_constant(_op.strided_slice(size, [2], [5]))
-            out = _op.image.resize3d(inputs[0], out_size, None, "NCDHW", method, "asymmetric")
-        else:
-            raise NotImplementedError("Resize only supports 3, 4, or 5 dims")
-        return out
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        scale = inputs[2]
-        scale_shape = infer_shape(scale)
-        if len(inputs) == 4:
-            assert (
-                len(scale_shape) == 0 or scale_shape[0] == 0
-            ), "One of scale or size should be passed, not both."
-            size = inputs[3]
-        else:
-            assert len(scale_shape) != 0, "One of scale or size should be passed."
-            size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
-        return cls.v11_13_common(inputs, size, attr, params)
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        scale = inputs[2]
-        size = inputs[3]
-
-        # Some versions of onnx exporters produce an opset 13 model with the opset 11
-        # resize op, handle that edge case
-        if scale is not None and size is not None:
-            return cls._impl_v11(inputs, attr, params)
-
-        if size is not None:
-            assert scale is None, "One of scale or size should be passed, not both."
-        else:
-            scale_type = infer_type(scale)
-            scale_shape = scale_type.checked_type.shape
-            scale_dtype = scale_type.checked_type.dtype
-            assert len(scale_shape) != 0, "One of scale or size should be passed."
-            size = _op.cast(shape_of(inputs[0]), scale_dtype) * scale
-
-        return cls.v11_13_common(inputs, size, attr, params)
-
-    @classmethod
-    def v11_13_common(cls, inputs, size, attr, params):
-        """
-        Resize v11 and Resize v13 are identical except in how
-        they handle the passing of scale and size. This utility
-        provides the implementation for both
-        """
-        roi = inputs[1]
-        if roi is not None and infer_shape(roi)[0] == 0:
-            roi = None
-        ndims = len(infer_shape(inputs[0]))
-        mode = attr.get("mode", b"nearest").decode("ascii")
-        if mode == "nearest":
-            method = "nearest_neighbor"
-        elif mode == "linear":
-            method = "linear"
-        elif mode == "cubic":
-            method = "cubic"
-        else:
-            raise tvm.error.OpAttributeInvalid(
-                f'Value {mode} in attribute "mode" of operator Resize is not valid.'
-            )
-
-        coord_trans = attr.get("coordinate_transformation_mode", b"half_pixel").decode("ascii")
-        nearest_mode = attr.get("nearest_mode", b"round_prefer_floor").decode("ascii")
-        alpha = attr.get("cubic_coeff_a", -0.75)
-        exclude = attr.get("exclude_outside", 0)
-        extrapolation_value = attr.get("extrapolation_value", 0.0)
-
-        if roi is not None:
-            roi = fold_constant(
-                _op.concatenate(
-                    [
-                        _op.strided_slice(roi, [2], [ndims]),
-                        _op.strided_slice(roi, [ndims + 2], [2 * ndims]),
-                    ],
-                    axis=0,
-                )
-            )
-
-        out_size = fold_constant(_op.strided_slice(size, [2], [ndims]))
-
-        out = None
-        if ndims == 3:
-            out = _op.image.resize1d(
-                inputs[0],
-                out_size,
-                roi,
-                "NCW",
-                method,
-                coord_trans,
-                nearest_mode,
-                alpha,
-                exclude,
-                extrapolation_value,
-            )
-        elif ndims == 4:
-            out = _op.image.resize2d(
-                inputs[0],
-                out_size,
-                roi,
-                "NCHW",
-                method,
-                coord_trans,
-                nearest_mode,
-                alpha,
-                exclude,
-                extrapolation_value,
-            )
-        elif ndims == 5:
-            out = _op.image.resize3d(
-                inputs[0],
-                out_size,
-                roi,
-                "NCDHW",
-                method,
-                coord_trans,
-                nearest_mode,
-                alpha,
-                exclude,
-                extrapolation_value,
-            )
-        else:
-            raise NotImplementedError("Resize only supports 3, 4, or 5 dims")
-
-        return out
-
-
-class NonZero(OnnxOpConverter):
-    """Operator converter for NonZero"""
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        if len(inputs) > 1:
-            raise ValueError("Expect 1 input only")
-
-        output = AttrCvt(op_name="argwhere")(inputs, attr, params)
-        # ONNX NonZero always outputs int64
-        output = _op.cast(output, "int64")
-        return _op.transpose(output, axes=(1, 0))
-
-
-class ReverseSequence(OnnxOpConverter):
-    """Operator converter for ReverseSequence"""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-
-        return _op.reverse_sequence(inputs[0], inputs[1], attr["time_axis"], attr["batch_axis"])
-
-
-class TopK(OnnxOpConverter):
-    """Operator converter for TopK"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if len(inputs) != 2:
-            raise ValueError("Expect 2 input only")
-        axis = attr.get("axis", -1)
-        largest = attr.get("largest", 1)
-
-        if largest == 0:
-            # TODO(mbrookhart): optimize this by adding a smallest attribute to topi if this
-            # ever becomes a bottleneck
-            ndim = len(infer_shape(inputs[0]))
-            if axis < 0:
-                axis += ndim
-            sort = _op.sort(inputs[0], axis=axis)
-            argsort = _op.argsort(inputs[0], axis=axis, dtype="int64")
-            begin = [0] * ndim
-            stride = [1] * ndim
-            end = _op.concatenate(
-                [
-                    _op.const([np.iinfo(np.int64).max] * axis, dtype="int64"),
-                    inputs[1],
-                    _op.const([np.iinfo(np.int64).max] * (ndim - axis - 1), dtype="int64"),
-                ],
-                axis=0,
-            )
-            return _expr.TupleWrapper(
-                _expr.Tuple(
-                    [
-                        _op.strided_slice(sort, begin, end, stride),
-                        _op.strided_slice(argsort, begin, end, stride),
-                    ]
-                ),
-                2,
-            )
-
-        return _op.topk(inputs[0], inputs[1], axis=axis, dtype="int64")
-
-
-class Range(OnnxOpConverter):
-    """Operator converter for Range"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if len(inputs) != 3:
-            raise ValueError("Expect 3 input only")
-
-        return _op.arange(
-            inputs[0], inputs[1], inputs[2], dtype=infer_type(inputs[0]).checked_type.dtype
-        )
-
-
-class IsInf(OnnxOpConverter):
-    """Operator converter for IsInf"""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        detect_negative = attr.get("detect_negative", 1)
-        detect_positive = attr.get("detect_positive", 1)
-        dtype = infer_type(inputs[0]).checked_type.dtype
-        isinf = _op.isinf(inputs[0])
-        if not detect_negative:
-            isinf = isinf * (inputs[0] > _op.const(0, dtype))
-        if not detect_positive:
-            isinf = isinf * (inputs[0] < _op.const(0, dtype))
-        return isinf
-
-
-class Celu(OnnxOpConverter):
-    """Operator convereter for celu"""
-
-    @classmethod
-    def _impl_v12(cls, inputs, attr, params):
-        x = inputs[0]
-        dtype = infer_type(x).checked_type.dtype
-        alpha = _op.const(attr.get("alpha", 1.0), dtype)
-        zero = _op.const(0, dtype)
-        one = _op.const(1, dtype)
-        out = _op.maximum(zero, x) + _op.minimum(zero, alpha * (_op.exp(x / alpha) - one))
-        return out
-
-
-class MaxRoiPool(OnnxOpConverter):
-    """Operator converter for MaxRoiPool."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 2, f"MMaxRoiPool op take 2 inputs, {len(inputs)} given"
-
-        data = inputs[0]
-        rois = inputs[1]
-        pooled_shape = attr.get("pooled_shape")
-        spatial_scale = attr.get("spatial_scale", 1.0)
-
-        return _vision.roi_pool(data, rois, pooled_shape, spatial_scale)
-
-
-class RoiAlign(OnnxOpConverter):
-    """Operator converter for RoiAlign."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if len(inputs) != 3:
-            raise ValueError("Expect 3 inputs only")
-        x = inputs[0]
-        rois = inputs[1]
-        batch_indices = inputs[2]
-        mode = attr.get("mode", b"avg")
-        if mode not in (b"avg", b"max"):
-            raise NotImplementedError("RoiAlign in Relay only uses avg and max modes")
-        output_height = attr.get("output_height", 1)
-        output_width = attr.get("output_width", 1)
-
-        sampling_ratio = attr.get("sampling_ratio", 0)
-        spatial_scale = attr.get("spatial_scale", 1.0)
-
-        batch_indices = _op.expand_dims(batch_indices, axis=1, num_newaxis=1)
-        batch_indices = _op.cast(batch_indices, infer_type(rois).checked_type.dtype)
-        rois = _op.concatenate([batch_indices, rois], 1)
-
-        return _vision.roi_align(
-            x, rois, [output_height, output_width], spatial_scale, sampling_ratio, mode=mode
-        )
-
-
-class Clip(OnnxOpConverter):
-    """Operator converter for Clip."""
-
-    @staticmethod
-    def convert_attributes(inputs, attr, params):
-        convert = AttrCvt("clip", transforms={"min": "a_min", "max": "a_max"})
-        return convert(inputs, attr, params)
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if "min" not in attr:
-            attr["min"] = -np.inf
-        if "max" not in attr:
-            attr["max"] = np.inf
-        return Clip.convert_attributes(inputs, attr, params)
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        if len(inputs) == 3 and isinstance(inputs[2], _expr.Constant):
-            attr["max"] = inputs[2].data.numpy().item()
-            inputs = inputs[0:2]
-        if len(inputs) >= 2 and isinstance(inputs[1], _expr.Constant):
-            attr["min"] = inputs[1].data.numpy().item()
-            inputs = inputs[0:1]
-        if "min" in attr and "max" in attr:
-            return Clip.convert_attributes(inputs, attr, params)
-
-        assert len(inputs) <= 3, "Clip-11 takes up to 3 inputs, input, min, max"
-        result = inputs[0]
-        for i, op in enumerate([_op.tensor.maximum, _op.tensor.minimum]):
-            if i < len(inputs) - 1:
-                if inputs[i + 1] is not None:
-                    result = op(result, inputs[i + 1])
-        return result
-
-
-class Softplus(OnnxOpConverter):
-    """Operator converter for Softplus."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        data = inputs[0]
-        data_dtype = infer_type(data).checked_type.dtype
-        data = _op.exp(data) + _expr.const(1, dtype=data_dtype)
-        return _op.log(data)
-
-
-class Loop(OnnxOpConverter):
-    """Operator converter for Loop"""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        max_loop_count = inputs[0]
-        cond = inputs[1]
-        loop_deps = inputs[2:]
-        num_deps = len(loop_deps)
-        # Create a copy of the body function to prevent the original
-        # from being modified.
-        body = copy.copy(attr["body"])
-        iter_dtype = infer_type(max_loop_count).checked_type.dtype
-
-        # Determine what condition mode we're in.
-        assert cond is not None or max_loop_count is not None
-        is_for_loop = max_loop_count is not None and cond is None
-        is_condition_for_loop = cond is not None and max_loop_count is not None
-
-        # Loop inputs will be packed as
-        # [iter_count, max_count, condition, loop_deps, scan_outputs]
-        def cond_fn(*loop_inputs):
-            i = loop_inputs[0]
-            max_count = loop_inputs[1]
-            w = loop_inputs[2]
-
-            if cond is not None:
-                out_while = _op.equal(w, _expr.const(True, "bool"))
-            if max_loop_count is not None:
-                out_loop = _op.less(i, max_count)
-
-            if is_condition_for_loop:
-                return _op.logical_and(out_while, out_loop)
-            if is_for_loop:
-                return out_loop
-            return out_while
-
-        # Get the current graph proto and create a clone for the subgraph
-        graph_scope = GraphProto.current
-        subgraph_scope = GraphProto(
-            graph_scope._shape,
-            graph_scope._dtype,
-            graph_scope._freeze_params,
-            graph_scope._op_type_dict,
-        )
-        # Load nodes from outer graph into inner graph.
-        subgraph_scope._nodes = graph_scope._nodes.copy()
-
-        # Create a list of variables for each value updated in the loop.
-        def get_var(name, val, scan=False):
-            checked_type = infer_type(val)
-            if hasattr(checked_type, "type_annotation"):
-                checked_type = checked_type.type_annotation
-            if hasattr(checked_type, "checked_type"):
-                checked_type = checked_type.checked_type
-            shape = get_const_tuple(checked_type.shape)
-            actual_shape = []
-            for dim in shape:
-                if isinstance(dim, int) and dim == 0:
-                    actual_shape.append(_ty.Any())
-                else:
-                    actual_shape.append(dim)
-            if scan:
-                return _expr.var(name, shape=[_ty.Any()] + actual_shape, dtype=checked_type.dtype)
-
-            return _expr.var(name, shape=actual_shape, dtype=checked_type.dtype)
-
-        loop_vars = [
-            _expr.var(body.input[0].name, shape=(), dtype=iter_dtype),  # iteration count
-            _expr.var("max_count", shape=(), dtype=iter_dtype),  # iteration count
-            get_var(body.input[1].name, cond),  # exit condition
-        ]
-        loop_vars += [get_var(body.input[i + 2].name, v) for i, v in enumerate(loop_deps)]
-        loop_var_names = [v.name_hint for v in loop_vars]
-        # get span information of loop body
-        body_source_name = get_source_name(body, subgraph_scope._op_type_dict)
-        # set span to inputs of loop body
-        for i, v in enumerate(loop_vars):
-            loop_vars[i] = set_span(v, make_parameter_span([v.name_hint, body_source_name]))
-
-        num_scan_outputs = len(body.output) - (1 + num_deps)
-
-        # Construct variables and initial empty tensors for any scan outputs.
-        # To do this, we'll figure out the output shapes of the body subgraph by importing
-        # it and doing type inference.
-        scan_output_vars = []
-        scan_output_init = []
-        if num_scan_outputs > 0:
-            with subgraph_scope:
-                loop_outputs = subgraph_scope.from_onnx(
-                    body, graph_scope.opset, get_output_expr=True
-                )
-            loop_outputs = _expr.TupleWrapper(loop_outputs, len(body.output))
-
-        for i in range(num_scan_outputs):
-            name, _, _, _ = get_info(body.output[i + 1 + num_deps])
-            output_node = infer_type(loop_outputs[i + 1 + num_deps])
-            shape = get_const_tuple(output_node.checked_type.shape)
-            dtype = output_node.checked_type.dtype
-            scan_output_vars.append(
-                _expr.var(name, shape=([_ty.Any()] * (len(shape) + 1)), dtype=dtype)
-            )
-            scan_output_init.append(
-                _op.reshape(_expr.const(np.array([]).astype(dtype)), [0] + [1] * len(shape))
-            )
-
-        # Now we can remove loop iter variables from our inner loop's inputs.
-        # This is kind of a hack since we have graph inputs that we don't
-        # want to treat as actual inputs.
-        while len(body.input) != 0:
-            body.input.pop(0)
-
-        # Define the loop body, in this function we need to unpack loop inputs,
-        # convert the loop subgraph, and pack outputs for the next iteration.
-        def body_fn(*loop_inputs):
-            # Unpack inputs
-            loop_count = loop_inputs[0]
-            max_count = loop_inputs[1]
-            cond = loop_inputs[2]
-            current_vars = list(loop_inputs[3 : (3 + num_deps)])
-            scan_outputs = loop_inputs[(3 + num_deps) :]
-
-            # Prepare body inputs by adding them to node dictionary.
-            new_inputs = [loop_count, max_count, cond] + current_vars
-            for i, inp in enumerate(new_inputs):
-                subgraph_scope._nodes[loop_var_names[i]] = inp
-
-            # Get the output of the current loop using the updated inputs.
-            with subgraph_scope:
-                loop_outputs = subgraph_scope.from_onnx(
-                    body, graph_scope.opset, get_output_expr=True
-                )
-            # Unpack the body outputs and prepare variables for next iteration.
-            new_cond = loop_outputs[0]
-            new_loop_vars = [loop_outputs[i] for i in range(1, 1 + num_deps)]
-            new_scan_outputs = [loop_outputs[i] for i in range(1 + num_deps, len(loop_outputs))]
-
-            # Add new scan outputs to tracking
-            combined_scan_outputs = []
-            for i, scan in enumerate(scan_outputs):
-                rank = len(infer_shape(scan)) - 1
-                new_scan = new_scan_outputs[i]
-                expand_scan = _op.expand_dims(new_scan, axis=0)
-                # For non scalar outputs we need to broadcast the initial value.
-                if rank > 0:
-                    new_scan_shape = shape_of(new_scan, dtype=iter_dtype)
-                    scan_broadcast = _op.concatenate(
-                        [_op.reshape(loop_count, [1]), new_scan_shape], axis=0
-                    )
-                    scan = _op.broadcast_to(scan, scan_broadcast)
-                combined_scan = _op.concatenate([scan, expand_scan], axis=0)
-                combined_scan_outputs.append(combined_scan)
-
-            # Increment counter.
-            if max_loop_count is not None:
-                incr = _expr.const(1, dtype=iter_dtype)
-                loop_count = loop_count + incr
-
-            # Pack loop outputs for next iteration
-            # [iter_count, cond, loop_deps, loop_scans]
-            return [loop_count, max_count, new_cond] + new_loop_vars + combined_scan_outputs
-
-        # Create the loop function.
-        loop = fold_constant(_loops.while_loop(cond_fn, loop_vars + scan_output_vars, body_fn))
-
-        # Now need to run initial values through the graph.
-        init_count = _expr.const(0, dtype=iter_dtype)
-        loop_vals = loop(init_count, max_loop_count, cond, *loop_deps, *scan_output_init)
-
-        # Extract final iteration outputs.
-        if num_deps + num_scan_outputs == 1:
-            outputs = _expr.TupleGetItem(loop_vals, 3)
-        else:
-            outputs = _expr.TupleWrapper(
-                _expr.Tuple(
-                    [
-                        _expr.TupleGetItem(loop_vals, i + 3)
-                        for i in range(num_deps + num_scan_outputs)
-                    ]
-                ),
-                num_deps + num_scan_outputs,
-            )
-
-        # Update outer graph with constants found in the subgraph.
-        free_vars = analysis.free_vars(loop)
-        graph_scope._params.update(subgraph_scope._params)
-        graph_scope._nodes.update(subgraph_scope._nodes)
-        for var in free_vars:
-            graph_scope._nodes.update({var.name_hint: var})
-        return outputs
-
-
-class If(OnnxOpConverter):
-    """Operator converter for If"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        cond = inputs[0]
-        # Convert array to bool if needed.
-        if len(infer_shape(cond)) > 0:
-            cond = _op.take(cond, _expr.const(0, dtype="int64"))
-        then_branch = attr.get("then_branch", None)
-        else_branch = attr.get("else_branch", None)
-        assert then_branch is not None and else_branch is not None
-
-        # Create graph converters for both branches.
-        graph_scope = GraphProto.current
-        then_graph = GraphProto(
-            graph_scope._shape,
-            graph_scope._dtype,
-            graph_scope._freeze_params,
-            graph_scope._op_type_dict,
-        )
-        then_graph._nodes = graph_scope._nodes.copy()
-        else_graph = GraphProto(
-            graph_scope._shape,
-            graph_scope._dtype,
-            graph_scope._freeze_params,
-            graph_scope._op_type_dict,
-        )
-        else_graph._nodes = graph_scope._nodes.copy()
-
-        # Convert each branch to a relay expression.
-        with then_graph:
-            then_expr = then_graph.from_onnx(then_branch, graph_scope.opset, get_output_expr=True)
-        with else_graph:
-            else_expr = else_graph.from_onnx(else_branch, graph_scope.opset, get_output_expr=True)
-
-        # Add constants from both branches to parent graph.
-        graph_scope._params.update(then_graph._params)
-        graph_scope._nodes.update(then_graph._nodes)
-        graph_scope._params.update(else_graph._params)
-        graph_scope._nodes.update(else_graph._nodes)
-
-        then_free_vars = analysis.free_vars(then_expr)
-        for var in then_free_vars:
-            graph_scope._nodes.update({var.name_hint: var})
-            if var.name_hint in graph_scope._inputs:
-                graph_scope._inputs.update({var.name_hint: var})
-        else_free_vars = analysis.free_vars(else_expr)
-        for var in else_free_vars:
-            graph_scope._nodes.update({var.name_hint: var})
-            if var.name_hint in graph_scope._inputs:
-                graph_scope._inputs.update({var.name_hint: var})
-
-        # Sometimes pytorch to onnx will insert silly if statements that produce dynamic ranks.
-        # Often these dont contribute anything. If we see a dynamic rank output, try to unify
-        # them so we can continue without breaking.
-        if not isinstance(then_expr, _expr.Tuple) and not isinstance(else_expr, _expr.Tuple):
-            then_shape = infer_shape(then_expr)
-            else_shape = infer_shape(else_expr)
-            if len(then_shape) != len(else_shape):
-                warning_msg = (
-                    "If statement produced outputs with different rank. "
-                    "Attempting to unify ranks but this may produce incorrect results."
-                )
-                warnings.warn(warning_msg)
-                # Skip constant If node to avoid irrational broadcast
-                if isinstance(inputs[0], tvm.relay.expr.Constant):
-                    predicate = inputs[0].data.asnumpy()[0]
-                    node_name = attr["tvm_custom"]["name"]
-                    warn_msg_begin = f"Predicate of If node {node_name} is always "
-                    if predicate == np.bool_(True):
-                        warnings.warn(
-                            warn_msg_begin
-                            + "true so only then branch would be executed. Removing else branch. "
-                        )
-                        else_expr = then_expr
-                    elif predicate == np.bool_(False):
-                        warnings.warn(
-                            warn_msg_begin
-                            + "false so only else branch would be executed. Removing then branch. "
-                        )
-                        then_expr = else_expr
-                if len(then_shape) < len(else_shape):
-                    then_expr = _op.broadcast_to_like(then_expr, else_expr)
-                else:
-                    else_expr = _op.broadcast_to_like(else_expr, then_expr)
-
-        # Now we can construct the relay if statement and return.
-        ret = _expr.If(cond, then_expr, else_expr)
-        if len(then_branch.output) > 1:
-            ret = _expr.TupleWrapper(ret, len(then_branch.output))
-        return ret
-
-
-class Scan(OnnxOpConverter):
-    """Operator converter for Scan"""
-
-    @classmethod
-    def _impl_v8(cls, inputs, attr, params):
-        new_inputs = inputs[1:]
-        batch_num = infer_shape(inputs[1])[0]
-        out = []
-        for i in range(batch_num):
-            v9_inputs = [
-                _op.take(new_inputs[j], _expr.const(i), axis=0) for j in range(len(new_inputs))
-            ]
-            results = cls._impl_v9(v9_inputs, attr, params)
-            results = [_op.expand_dims(results[j], axis=0) for j in range(len(results))]
-            if i == 0:
-                out = results
-            else:
-                out = [_op.concatenate([out[j], results[j]], axis=0) for j in range(len(results))]
-
-        out = _expr.TupleWrapper(_expr.Tuple(out), len(out))
-        return out
-
-    @classmethod
-    def _impl_v9(cls, inputs, attr, params):
-        body = attr.get("body")
-        num_scan_inputs = attr.get("num_scan_inputs")
-        num_all_inputs = len(inputs)
-        num_state_inputs = len(body.input) - num_scan_inputs
-        num_state_outputs = num_state_inputs
-        num_all_outputs = len(body.output)
-        num_scan_outputs = num_all_outputs - num_state_outputs
-        scan_input_axes = attr.get("scan_input_axes", [0] * num_scan_inputs)
-        scan_input_directions = attr.get("scan_input_directions", [0] * num_scan_inputs)
-        scan_output_axes = list(attr.get("scan_output_axes", [0] * num_scan_outputs))
-        scan_output_directions = attr.get("scan_output_directions", [0] * num_scan_outputs)
-        # loop count are the same for all scan inputs, so get loop count by first input scan
-        # strided_slice not support dynamic axes, so assume input shape are static
-        max_loop_count = infer_shape(inputs[num_state_inputs])[scan_input_axes[0]]
-
-        # Create a copy of the body function to prevent the original
-        # from being modified.
-        body = copy.copy(attr["body"])
-
-        # Loop inputs will be packed as
-        # [iter_count, loop_deps, scan_outputs]
-        def cond_fn(*loop_inputs):
-            i = loop_inputs[0]
-            return _op.less(i, relay.const(max_loop_count, "int32"))
-
-        # Get the current graph proto and create a clone for the subgraph
-        graph_scope = GraphProto.current
-        subgraph_scope = GraphProto(
-            graph_scope._shape,
-            graph_scope._dtype,
-            graph_scope._freeze_params,
-            graph_scope._op_type_dict,
-        )
-        # Load nodes from outer graph into inner graph.
-        subgraph_scope._nodes = graph_scope._nodes.copy()
-
-        # Create a list of variables for each value updated in the loop.
-        def get_var(name, val, scan=False):
-            checked_type = infer_type(val)
-            if hasattr(checked_type, "type_annotation"):
-                checked_type = checked_type.type_annotation
-            if hasattr(checked_type, "checked_type"):
-                checked_type = checked_type.checked_type
-            shape = get_const_tuple(checked_type.shape)
-            actual_shape = []
-            for dim in shape:
-                if isinstance(dim, int) and dim == 0:
-                    actual_shape.append(_ty.Any())
-                else:
-                    actual_shape.append(dim)
-            if scan:
-                return _expr.var(name, shape=[_ty.Any()] + actual_shape, dtype=checked_type.dtype)
-
-            return _expr.var(name, shape=actual_shape, dtype=checked_type.dtype)
-
-        # Construct variables and initial empty tensors for any scan outputs.
-        # To do this, we'll figure out the output shapes of the body subgraph by importing
-        # it and doing type inference.
-        scan_output_vars = []
-        scan_output_init = []
-        if num_scan_outputs > 0:
-            with subgraph_scope:
-                loop_outputs = subgraph_scope.from_onnx(
-                    body, graph_scope.opset, get_output_expr=True
-                )
-            loop_outputs = _expr.TupleWrapper(loop_outputs, len(body.output))
-
-        for i in range(num_scan_outputs):
-            name, _, _, _ = get_info(body.output[i + num_state_outputs])
-            output_node = infer_type(loop_outputs[i + num_state_outputs])
-            shape = list(get_const_tuple(output_node.checked_type.shape))
-            if scan_output_axes[i] < 0:
-                scan_output_axes[i] = len(shape) + scan_output_axes[i] + 1
-            shape.insert(scan_output_axes[i], max_loop_count)
-            dtype = output_node.checked_type.dtype
-            scan_output_vars.append(_expr.var(name, shape=shape, dtype=dtype))
-            scan_output_init.append(_op.zeros(shape, dtype))
-
-        # loop vars = [iter_count, scan_state, scan_out]
-        loop_vars = [_expr.var("iter", shape=(), dtype="int32")]  # iteration count
-        loop_vars += [
-            get_var(body.input[i].name, v) for i, v in enumerate(inputs) if i < num_state_inputs
-        ]
-        # get span information of scan body
-        body_source_name = get_source_name(body, subgraph_scope._op_type_dict)
-        # set span to inputs of scan body
-        for i, v in enumerate(loop_vars):
-            loop_vars[i] = set_span(v, make_parameter_span([v.name_hint, body_source_name]))
-
-        loop_vars += scan_output_vars
-        body_input_var_names = ["iter"] + [body.input[i].name for i in range(len(body.input))]
-
-        # # Now we can remove loop iter variables from our inner loop's inputs.
-        # # This is kind of a hack since we have graph inputs that we don't
-        # # want to treat as actual inputs.
-        while len(body.input) != 0:
-            body.input.pop(0)
-
-        # Define the loop body, in this function we need to unpack loop inputs,
-        # convert the loop subgraph, and pack outputs for the next iteration.
-        def body_fn(*loop_inputs):
-            # Unpack inputs
-            loop_count = loop_inputs[0]
-            state_vars = list(loop_inputs[1 : 1 + num_state_inputs])
-            scan_vars = list(loop_inputs[1 + num_state_inputs :])
-            # body take scan graph scan inputs as original input
-            input_scan_exprs = []
-            for i in range(num_state_inputs, num_all_inputs):
-                if scan_input_directions[i - num_state_inputs] != 0:
-                    input_scan_exprs.append(
-                        relay.take(
-                            inputs[i],
-                            relay.const(max_loop_count - 1, "int32") - loop_count,
-                            axis=scan_input_axes[i - num_state_inputs],
-                        )
-                    )
-                else:
-                    input_scan_exprs.append(
-                        relay.take(
-                            inputs[i], loop_count, axis=scan_input_axes[i - num_state_inputs]
-                        )
-                    )
-
-            # Prepare body inputs by adding them to node dictionary.
-            body_inputs = [loop_count] + state_vars + input_scan_exprs
-            for i, inp in enumerate(body_inputs):
-                subgraph_scope._nodes[body_input_var_names[i]] = inp
-
-            # Get the output of the current loop using the updated inputs.
-            with subgraph_scope:
-                loop_outputs = subgraph_scope.from_onnx(
-                    body, graph_scope.opset, get_output_expr=True
-                )
-            # Unpack the body outputs and prepare variables for next iteration.
-            new_state_vars = [loop_outputs[i] for i in range(num_state_outputs)]
-            new_scan_vars = [loop_outputs[i] for i in range(num_state_outputs, num_all_outputs)]
-
-            # Add new scan outputs to tracking
-            combined_scan_outputs = []
-            for i in range(num_scan_outputs):
-                if scan_output_directions[i] == 0:
-                    # append new scan output
-                    combined_scan = _op.concatenate(
-                        [scan_vars[i], _op.expand_dims(new_scan_vars[i], axis=scan_output_axes[i])],
-                        axis=scan_output_axes[i],
-                    )
-                    # pop head scan output
-                    combined_scan = _op.strided_slice(
-                        combined_scan,
-                        begin=[1],
-                        end=[max_loop_count + 1],
-                        strides=[1],
-                        axes=[scan_output_axes[i]],
-                    )
-                else:
-                    # prepend new scan output
-                    combined_scan = _op.concatenate(
-                        [_op.expand_dims(new_scan_vars[i], axis=scan_output_axes[i]), scan_vars[i]],
-                        axis=scan_output_axes[i],
-                    )
-                    # pop tail scan output
-                    combined_scan = _op.strided_slice(
-                        combined_scan,
-                        begin=[0],
-                        end=[max_loop_count],
-                        strides=[1],
-                        axes=[scan_output_axes[i]],
-                    )
-                combined_scan_outputs.append(combined_scan)
-
-            incr = _expr.const(1, dtype="int32")
-            loop_count = loop_count + incr
-
-            # Pack loop outputs for next iteration
-            # [iter_count, state_var, scan_var]
-            return [loop_count] + new_state_vars + combined_scan_outputs
-
-        # Create the loop function.
-        loop = fold_constant(_loops.while_loop(cond_fn, loop_vars, body_fn))
-
-        # Now need to run initial values through the graph.
-        init_count = _expr.const(0, dtype="int32")
-
-        input_states = [inputs[i] for i in range(num_state_inputs)]
-        loop_vals = loop(init_count, *input_states, *scan_output_init)
-
-        outputs = _expr.TupleWrapper(
-            _expr.Tuple([_expr.TupleGetItem(loop_vals, i + 1) for i in range(num_all_outputs)]),
-            num_all_outputs,
-        )
-
-        # Update outer graph with constants found in the subgraph.
-        free_vars = analysis.free_vars(loop)
-        graph_scope._params.update(subgraph_scope._params)
-        graph_scope._nodes.update(subgraph_scope._nodes)
-        for var in free_vars:
-            graph_scope._nodes.update({var.name_hint: var})
-        return outputs
-
-
-class LinearRegressor(OnnxOpConverter):
-    """Operator converter for LinearRegressor."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        data = inputs[0]
-        coefficients = attr.get("coefficients", 0)
-        data_shape = infer_shape(data)
-        targets = attr.get("targets", 1)
-        coefficients = _expr.const(list(coefficients), dtype="float32")
-        coefficients_shape = infer_shape(coefficients)
-
-        coefficients = _op.reshape(coefficients, (targets, coefficients_shape[0] // targets))
-        if coefficients_shape[0] // targets < data_shape[-1]:
-            data = _op.split(data, [coefficients_shape[0] // targets], -1)[0]
-
-        mm_out = _op.nn.dense(data, coefficients)
-
-        if "intercepts" in attr:
-            intercepts = attr.get("intercepts", 0)
-            intercepts = _expr.const(list(intercepts), dtype="float32")
-
-            if targets == 1:
-                return _op.nn.bias_add(mm_out, intercepts, axis=-1)
-            return get_relay_op("add")(mm_out, intercepts)
-
-        return mm_out
-
-
-class DFT(OnnxOpConverter):
-    """Operator converter for discrete Fourier transform (DFT)."""
-
-    @classmethod
-    def _impl_v17(cls, inputs, attr, params):
-        # ************************* Read attrs *************************
-        axis = attr.get("axis", 1)
-        inverse = attr.get("inverse", 0)
-        onesided = attr.get("onesided", 0)
-
-        # ************************* Read inputs ************************
-        input_tensor = inputs[0]
-        dft_length = inputs[1]
-
-        # ************************* Parse inputs ***********************
-        t1 = ["float16", "float32", "float64"]
-        t2 = ["int32", "int64"]
-
-        # input
-        assert infer_type(input_tensor).checked_type.dtype in t1
-        input_shape = infer_shape(input_tensor)
-        assert len(input_shape) >= 3
-        if axis < 0:
-            axis = len(input_shape) + axis
-        assert 1 <= axis <= len(input_shape) - 1, "axis is out of bounds"
-
-        # dft_length
-        if dft_length is None:
-            dft_length = input_shape[axis]
-        else:
-            dft_length_dtype = infer_type(dft_length).checked_type.dtype
-            assert dft_length_dtype in t2
-            dft_length = int(infer_value(dft_length, params).numpy())
-
-        # ************************
-        input_tensor = cls._maybe_crop_or_pad(input_tensor, axis, dft_length)
-
-        swap_axis = -1
-        re_input_tensor, im_input_tensor = cls._split_real_and_imag_parts(input_tensor)
-
-        re_input_tensor = cls._swap_axes(re_input_tensor, axis, swap_axis)
-        im_input_tensor = cls._swap_axes(im_input_tensor, axis, swap_axis)
-
-        re_input_tensor, im_input_tensor = _op.dft(re_input_tensor, im_input_tensor, inverse)
-
-        re_input_tensor = cls._swap_axes(re_input_tensor, axis, swap_axis)
-        im_input_tensor = cls._swap_axes(im_input_tensor, axis, swap_axis)
-
-        if onesided:
-            re_input_tensor = cls._crop_onesided(re_input_tensor, axis)
-            im_input_tensor = cls._crop_onesided(im_input_tensor, axis)
-
-        return cls._merge_real_and_imag_parts(re_input_tensor, im_input_tensor)
-
-    @classmethod
-    def _crop_axis(cls, tensor, axis, new_dim):
-        shape = infer_shape(tensor)
-        slices = [slice(0, a, 1) for a in shape]
-        slices[axis] = slice(0, new_dim, 1)
-        return _op.strided_slice(
-            tensor,
-            begin=[s.start for s in slices],
-            end=[s.stop for s in slices],
-            strides=[s.step for s in slices],
-            axes=list(range(len(shape))),
-        )
-
-    @classmethod
-    def _maybe_crop_or_pad(cls, input_tensor, axis, n_fft):
-        shape = infer_shape(input_tensor)
-        if shape[axis] != n_fft:
-            if shape[axis] > n_fft:
-                return cls._crop_axis(input_tensor, axis, n_fft)
-            else:
-                pad_width = [(0, 0)] * len(shape)
-                pad_width[axis] = (0, n_fft - shape[axis])
-                return _op.nn.pad(input_tensor, pad_width)
-        return input_tensor
-
-    @classmethod
-    def _swap_axes(cls, tensor, axis1, axis2):
-        permutation = list(range(len(infer_shape(tensor))))
-        permutation[axis1] = axis2
-        permutation[axis2] = axis1
-        return _op.transpose(tensor, permutation)
-
-    @classmethod
-    def _split_real_and_imag_parts(cls, tensor):
-        shape = infer_shape(tensor)
-        dtype = infer_type(tensor).checked_type.dtype
-        if shape[-1] == 1:
-            re = tensor
-            im = _op.const(np.zeros(shape), dtype=dtype)
-        else:
-            re, im = _op.split(tensor, 2, -1)
-
-        return _op.squeeze(re, -1), _op.squeeze(im, -1)
-
-    @classmethod
-    def _merge_real_and_imag_parts(cls, re, im):
-        re = _op.expand_dims(re, axis=-1)
-        im = _op.expand_dims(im, axis=-1)
-        return _op.concatenate([re, im], axis=-1)
-
-    @classmethod
-    def _crop_onesided(cls, tensor, axis):
-        shape = infer_shape(tensor)
-        return cls._crop_axis(tensor, axis, shape[axis] // 2 + 1)
-
-
-class NonMaxSuppression(OnnxOpConverter):
-    """Operator converter for NonMaxSuppression."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        # Get parameter values
-        boxes = inputs[0]
-        scores = inputs[1]
-        max_output_boxes_per_class = inputs[2]
-        iou_threshold = inputs[3]
-        score_threshold = inputs[4]
-
-        boxes_dtype = infer_type(boxes).checked_type.dtype
-
-        if attr.get("center_point_box", 0) != 0:
-            xc, yc, w, h = _op.split(boxes, 4, axis=2)
-            half_w = w / _expr.const(2.0, boxes_dtype)
-            half_h = h / _expr.const(2.0, boxes_dtype)
-            x1 = xc - half_w
-            x2 = xc + half_w
-            y1 = yc - half_h
-            y2 = yc + half_h
-            boxes = _op.concatenate([y1, x1, y2, x2], axis=2)
-
-        if iou_threshold is None:
-            iou_threshold = _expr.const(0.0, dtype="float32")
-        if score_threshold is None:
-            score_threshold = _expr.const(0.0, dtype="float32")
-
-        def conditionally_squeeze_scalar(x):
-            rank = len(infer_shape(x))
-            assert rank <= 1, "nms thresholds must be scalars"
-            if rank == 1:
-                return _op.squeeze(x, [0])
-            return x
-
-        max_output_boxes_per_class = conditionally_squeeze_scalar(max_output_boxes_per_class)
-        iou_threshold = conditionally_squeeze_scalar(iou_threshold)
-        score_threshold = conditionally_squeeze_scalar(score_threshold)
-
-        nms_out = _op.vision.all_class_non_max_suppression(
-            boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold
-        )
-
-        return _op.strided_slice(nms_out[0], _op.const([0], dtype="int64"), nms_out[1])
-
-
-class ATen(OnnxOpConverter):
-    """Operator converter for Pytorch ATen ops."""
-
-    @classmethod
-    def _op_dispatch(cls, operator, inputs, attr, params):
-        op_map = {
-            "size": cls._size,
-            "arange": cls._arange,
-            "index_put": cls._index_put,
-            "reshape": cls._reshape,
-            "embedding_bag": cls._embedding_bag,
-        }
-        assert operator in op_map, f"Operator {operator} is not supported."
-        return op_map[operator](inputs, attr, params)
-
-    @classmethod
-    def _size(cls, inputs, attr, params):
-        return _op.take(
-            _op.shape_of(inputs[0], dtype="int64"),
-            _expr.const(-1, dtype="int64"),
-            axis=0,
-            mode="wrap",
-        )
-
-    @classmethod
-    def _arange(cls, inputs, attr, params):
-        return _op.arange(inputs[0], inputs[1], inputs[2], dtype="int64")
-
-    @classmethod
-    def _check_index(cls, indices, values):
-        def unfolding_indices(indices, values):
-            n = len(indices)
-            flatten_indices = []
-            slices_size = []
-            for index in indices:
-                flatten_indices.append(_op.reshape(index, _op.const([-1])))
-                slices_size.append(infer_shape(flatten_indices[-1])[0])
-            repeat_size = [1]
-            tile_size = [1]
-            for i in range(1, n):
-                repeat_size.append(slices_size[-i] * repeat_size[-1])
-                tile_size.append(slices_size[i - 1] * tile_size[-1])
-            repeat_size.reverse()
-            unflod_slices = []
-            for i in range(n):
-                unflod_slices.append(
-                    fold_constant(
-                        _op.repeat(_op.tile(flatten_indices[i], (tile_size[i],)), repeat_size[i], 0)
-                    )
-                )
-            return unflod_slices, _op.reshape(values, _op.const([-1]))
-
-        values_shape = infer_shape(values)
-        if len(values_shape) != 1:
-            return unfolding_indices(indices, values)
-        return indices, values
-
-    @classmethod
-    def _index_put(cls, inputs, attr, params):
-        in_tensor = inputs[0]
-        indices, values = cls._check_index(inputs[1 : len(inputs) - 2], inputs[len(inputs) - 2])
-        accumulate = inputs[len(inputs) - 1].data.asnumpy() != 0
-        if not accumulate:
-            mode = "update"
-        else:
-            mode = "add"
-        index_tensor = _op.stack(indices, axis=0)
-        return _op.scatter_nd(in_tensor, index_tensor, values, mode)
-
-    @classmethod
-    def _reshape(cls, inputs, attr, params):
-        return _op.reshape(inputs[0], inputs[1])
-
-    @classmethod
-    def _embedding_bag(cls, inputs, attr, params):
-        mode_map = {0: _op.sum, 1: _op.mean, 2: _op.max}
-
-        mode = attr.get("mode", 1)
-        reduction_fn = mode_map[mode]
-        weights, indices, offsets = inputs[0], inputs[1], inputs[2]
-        offsets_shape = _op.shape_of(offsets, dtype="int64")
-        indices_shape = _op.stack(
-            [
-                _op.take(offsets_shape, _expr.const(0, dtype="int64")),
-                _expr.const(-1, dtype="int64"),
-            ],
-            axis=0,
-        )
-        indices = _op.reshape(indices, indices_shape)
-        embedding = _op.take(weights, indices.astype("int64"), axis=0)
-        rembedding = reduction_fn(embedding, axis=1)
-        # EmbeddingBag has 4 outputs for some reason despite only one ever being used.
-        # Fill the rest with 0s.
-        unused_output = _expr.const(0, dtype="float32")
-        return _expr.TupleWrapper(
-            _expr.Tuple((rembedding, unused_output, unused_output, unused_output)), 4
-        )
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        operator = attr.get("operator", None).decode("utf-8")
-        assert operator, "ATen Operator not found"
-        return cls._op_dispatch(operator, inputs, attr, params)
-
-
-class QuantizeLinear(OnnxOpConverter):
-    """Operator converter for QuantizeLinear."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        data, scale, zp = inputs
-        out_dtype = infer_type(zp).checked_type.dtype
-        return _qnn.op.quantize(data, scale, _op.cast(zp, "int32"), 0, out_dtype)
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        data, scale, zp = inputs
-        out_dtype = infer_type(zp).checked_type.dtype
-        axis = attr.get("axis", 1)
-        if len(infer_shape(data)) < 2:
-            axis = 0
-        return _qnn.op.quantize(data, scale, _op.cast(zp, "int32"), axis, out_dtype)
-
-
-class DequantizeLinear(OnnxOpConverter):
-    """Operator converter for QuantizeLinear."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        data, scale, zp = inputs
-        return _qnn.op.dequantize(data, scale, _op.cast(zp, "int32"), 0)
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        data, scale, zp = inputs
-        axis = attr.get("axis", 1)
-        if len(infer_shape(data)) <= 1:
-            axis = 0
-        return _qnn.op.dequantize(data, scale, _op.cast(zp, "int32"), axis)
-
-
-class DynamicQuantizeLinear(OnnxOpConverter):
-    """Operator converter for QuantizeLinear."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        """This op is deprecated an only supports uint8"""
-        data = inputs[0]
-        data_dtype = infer_type(data).checked_type.dtype
-        zero = _op.const(0, dtype=data_dtype)
-        maximum = _op.maximum(zero, _op.max(data))
-        minimum = _op.minimum(zero, _op.min(data))
-        scale = (maximum - minimum) / _op.const(255, dtype=data_dtype)
-        zp = zero - _op.min(data) / scale
-        zp = _op.cast(_op.round(_op.clip(zp, 0, 255)), "uint8")
-        return _expr.TupleWrapper(
-            _expr.Tuple(
-                [_qnn.op.quantize(data, scale, _op.cast(zp, "int32"), 0, "uint8"), scale, zp]
-            ),
-            size=3,
-        )
-
-
-class QLinearConv(OnnxOpConverter):
-    """Operator converter for QLinearConv."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        data = inputs[0]
-        x_scale = get_scalar(inputs[1], params)
-        x_zero_point = get_scalar(inputs[2], params, "int32")
-        weight = inputs[3]
-        w_scale = get_scalar_or_1d_tensor(inputs[4], params)
-        w_zero_point = get_scalar_or_1d_tensor(inputs[5], params, "int32")
-        y_scale = fold_constant(get_scalar(inputs[6], params))
-        y_zero_point = get_scalar(inputs[7], params, "int32")
-
-        # Check shapes for per channel quantization
-        w_scale_shape = infer_shape(w_scale)
-        w_zero_point_shape = infer_shape(w_zero_point)
-        if len(w_scale_shape) == 1 or len(w_zero_point_shape) == 1:
-            m = infer_shape(weight)[0]
-            if m != w_scale_shape[0] or m != w_zero_point_shape[0]:
-                raise tvm.error.OpAttributeInvalid(
-                    "The number of elements should be equal to the number of output channels"
-                )
-
-        input_shape = infer_shape(data)
-
-        ndim = len(input_shape)
-        kernel_type = infer_type(weight)
-        kernel_shapes = [get_const_tuple(kernel_type.checked_type.shape)]
-        if "kernel_shape" not in attr:
-            attr["kernel_shape"] = kernel_shapes[0][2:]
-
-        if "auto_pad" in attr:
-            attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                # Warning: Convolution does not yet support dynamic shapes,
-                # one will need to run dynamic_to_static on this model after import
-                zp = fold_constant(x_zero_point)
-                assert isinstance(zp, relay.Constant), "Zero point expected to be a constant"
-                data = autopad(
-                    data,
-                    attr.get("strides", [1] * (ndim - 2)),
-                    attr["kernel_shape"],
-                    attr.get("dilations", [1] * (ndim - 2)),
-                    pad_value=zp.data,
-                    mode=attr["auto_pad"],
-                )
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = tuple([0 for i in range(ndim - 2)])
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator Conv '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            attr.pop("auto_pad")
-
-        out_channels = kernel_shapes[0][0]
-        dilation = attr.get("dilations", [1] * (ndim - 2))
-        strides = attr.get("strides", [1] * (ndim - 2))
-        padding = attr["pads"] if "pads" in attr else 0
-        groups = attr["group"] if "group" in attr else 1
-
-        if ndim != 4:
-            raise tvm.error.OpAttributeInvalid(
-                "Only 2D kernels are supported for operator QLinearConv."
-            )
-
-        out = _qnn.op.conv2d(
-            data,
-            weight,
-            x_zero_point,
-            w_zero_point,
-            x_scale,
-            w_scale,
-            kernel_size=attr["kernel_shape"],
-            channels=out_channels,
-            strides=strides,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-        )
-        use_bias = len(inputs) == 9
-        if use_bias:
-            out = _op.nn.bias_add(out, inputs[8])
-
-        out_dtype = infer_type(inputs[7]).checked_type.dtype
-        requantize_scale = _op.multiply(x_scale, w_scale)
-
-        # requantize requires y_scale to be constant,
-        # if y_scale is not constant, doing dequantize -> quantize
-        if isinstance(y_scale, _expr.Constant):
-            out = _qnn.op.requantize(
-                out,
-                requantize_scale,
-                _op.const(0, dtype="int32"),
-                y_scale,
-                y_zero_point,
-                out_dtype=out_dtype,
-                axis=1,
-            )
-        else:
-            out = _qnn.op.dequantize(out, requantize_scale, _op.const(0, dtype="int32"), axis=1)
-            out = _qnn.op.quantize(out, y_scale, y_zero_point, axis=1, out_dtype=out_dtype)
-        return out
-
-
-class QGemm(OnnxOpConverter):
-    """Operator converter for QGemm."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.QGemm
-
-        a = inputs[0]
-        a_scale = get_scalar(inputs[1], params)
-        a_zp = get_scalar(inputs[2], params, "int32")
-
-        b = inputs[3]
-        # must be a scalar or 1D tensor which means a per-tensor or per-column quantization
-        # If 1-D tensor, number of elements should be equal to columns elements of input B
-        b_scale = get_scalar_or_1d_tensor(inputs[4], params)
-        b_zp = get_scalar_or_1d_tensor(inputs[5], params, "int32")
-
-        # note that if optional and not provided then value will be None.
-        C = inputs[6]
-        # must be null or a scalar or 1D tensor of size 1
-        y_scale = inputs[7]
-        # must be null or a scalar or 1D tensor of size 1
-        y_zp = get_scalar(inputs[8], params, "int32")
-
-        assert len(infer_shape(a)) == 2
-        assert len(infer_shape(b)) == 2
-        # zero point and scale of input b should have same shape size
-        assert infer_shape(b_scale) == infer_shape(b_zp)
-
-        alpha = float(attr.get("alpha", 1.0))
-        transA = int(attr.get("transA", 0))
-        transB = int(attr.get("transB", 0))
-
-        # get number of channels
-        channels = infer_channels(b, not transB)
-        a_dtype = infer_type(a).checked_type.dtype
-
-        if transA:
-            a = _op.transpose(a, axes=(1, 0))
-        if not transB:
-            b = _op.transpose(b, axes=(1, 0))
-
-        result = _qnn.op.dense(a, b, a_zp, b_zp, a_scale, b_scale, channels)
-
-        if C:
-            result = _op.add(result, C)
-
-        requantize_scale = _op.multiply(a_scale, b_scale)
-        if alpha != 1.0:
-            requantize_scale *= _expr.const(alpha, dtype="float32")
-        requantize_zp = _op.const(0, dtype="int32")
-
-        if y_scale:
-            # requantize requires y_scale to be constant,
-            # if y_scale is not constant, doing dequantize -> quantize
-            if isinstance(y_scale, _expr.Constant):
-                y = _qnn.op.requantize(
-                    result,
-                    requantize_scale,
-                    requantize_zp,
-                    y_scale,
-                    y_zp,
-                    axis=-1,
-                    rounding="TONEAREST",
-                    out_dtype=a_dtype,
-                )
-            else:
-                result_deq = _qnn.op.dequantize(result, requantize_scale, requantize_zp, axis=0)
-
-                y = _qnn.op.quantize(result_deq, y_scale, y_zp, axis=0, out_dtype=a_dtype)
-        else:
-            y = _op.multiply(_op.cast(result, "float32"), requantize_scale)
-
-        return y
-
-
-class QLinearAdd(OnnxOpConverter):
-    """Operator converter for QLinearAdd from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        a = inputs[0]
-        a_scale = get_scalar(inputs[1], params)
-        a_zero_point = get_scalar(inputs[2], params, "int32")
-        b = inputs[3]
-        b_scale = get_scalar(inputs[4], params)
-        b_zero_point = get_scalar(inputs[5], params, "int32")
-        c_scale = get_scalar(inputs[6], params)
-        c_zero_point = get_scalar(inputs[7], params, "int32")
-
-        dtype = infer_type(a).checked_type.dtype
-
-        ## Onnxruntime doesn't actually do this op in integer, they dequantize to fp32
-        ## and then requantize afer
-        ## https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qladd.cpp
-        a = _qnn.op.dequantize(
-            inputs[0], a_scale, a_zero_point
-        )  # , c_scale, c_zero_point, out_dtype = dtype)
-        b = _qnn.op.dequantize(
-            inputs[3], b_scale, b_zero_point
-        )  # , c_scale, c_zero_point, out_dtype = dtype)
-        out = _op.add(a, b)
-        return _qnn.op.quantize(out, c_scale, c_zero_point, out_dtype=dtype)
-
-
-class QLinearMatMul(OnnxOpConverter):
-    """
-    Operator converter for QLinearMatMul from Microsoft onnxruntime contrib opset.
-
-    Limitations:
-    - Not guaranteed to meet the integer-overflow behavior stipulated in the
-      ONNX documentation for this operator.
-
-    The QLinearMatMul converter is re-used for MatMulInteger and is adapted for
-    the latter with the optional `expected_out_dtypes` argument.
-    """
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=None):
-        if expected_out_dtypes is None:
-            # The default QLinearMatMul converter is expected to have one of
-            # these output dtypes.
-            expected_out_dtypes = ["int8", "uint8"]
-
-        # Some of the ops used below take scalar-like inputs, and may require either
-        # of the following:
-        #
-        # - the input is Const node (not merely an expression that *could* be reduced
-        #   to a single Const at graph-compilation time)
-        #
-        # - the input has a specific dtype
-        #
-        # This function attempts to present 'x' in a form that meets both of those
-        # requirements.
-        def try_resolve_to_const(x, dtype_override=None):
-            x2 = try_resolve_var_to_const(x, params)
-            num_elem = np.prod(infer_shape(x))
-            if num_elem == 1:
-                x2 = ensure_scalar_shape(x2)
-            x_dtype = infer_type(x).checked_type.dtype
-            if (dtype_override is not None) and (dtype_override != x_dtype):
-                x2 = _op.cast(x2, dtype_override)
-            x3 = fold_constant(x2)
-            return x3
-
-        # Unpack the inputs and obtain some type info...
-        a, a_scale, a_zp, b, b_scale, b_zp, y_scale, y_zp = inputs
-
-        a_type = infer_type(a).checked_type  # 'T1' in ONNX doc for this op
-        a_scale_type = infer_type(a_scale).checked_type
-        a_zp_type = infer_type(a_zp).checked_type
-
-        b_type = infer_type(b).checked_type  # 'T2' in ONNX doc for this op
-        b_scale_type = infer_type(b_scale).checked_type
-        b_zp_type = infer_type(b_zp).checked_type
-
-        y_scale_type = infer_type(y_scale).checked_type
-        y_zp_type = infer_type(y_zp).checked_type  # 'T3' in ONNX doc for this op
-
-        # Verify type assumptions, based on the ONNX doc for this op...
-        assert a_type.dtype in ["int8", "uint8"]
-        assert a_scale_type.dtype == "float32"
-        assert a_zp_type.dtype == a_type.dtype
-
-        assert b_type.dtype in ["int8", "uint8"]
-        assert b_scale_type.dtype == "float32"
-        assert b_zp_type.dtype == b_type.dtype
-
-        assert y_scale_type.dtype == "float32"
-        assert y_zp_type.dtype in expected_out_dtypes
-
-        # _qnn.op.dense requires the zero-point values to have dtype int32.
-        a_scale_scalar = try_resolve_to_const(a_scale)
-        a_zp_scalar = try_resolve_to_const(a_zp, "int32")
-
-        b_scale_scalar = try_resolve_to_const(b_scale)
-        b_zp_scalar = try_resolve_to_const(b_zp, "int32")
-
-        y_scale_scalar = try_resolve_to_const(y_scale)
-        y_zp_scalar = try_resolve_to_const(y_zp, "int32")
-
-        # TODO: Confirm that we're using 'num_hidden_units' correctly / as intended with
-        # the '_qnn.op.dense' instance below.
-        num_hidden_units = infer_shape(b)[-1]
-
-        # - Specify the matmul result dtype as int32, so that hopefully the matmul will use
-        #   a 32-bit accumulator as seems to be required by the ONNX op's documentation.
-        #
-        # TL;DR:
-        # The ONNX documentation for this op is clear about acceptable overflow
-        # behavior during the matmul operation:
-        #   - The scalar multiplication ops MAY NOT overflow.
-        #   - The scalar addition ops, which sum the results of the scalar multiplication,
-        #     MAY overflow, but if they do so, it must behave as one would expect during
-        #     32-bit integer-addition overflow.
-        # As of this writing, Relay's qnn.op.dense operator doesn't expose a way for us to
-        # express these constraints.
-        #
-        # TODO: Extend TVM / Relay / TIR / etc. to allow this kind of constraint to be
-        # expressed in a Relay graph. And then update this importer and various TVM
-        # backends accordingly.
-        matmul_result_dtype = "int32"
-        # TODO(vvchernov): possibly it is better to use unsigned type for result
-        # if input types are unsigned:
-        # if a_type.dtype == "uint8" and b_type.dtype == "uint8":
-        #     matmul_result_dtype = "uint32"
-
-        matmul_result = qmatmul(
-            a,
-            b,
-            a_zp_scalar,
-            b_zp_scalar,
-            a_scale_scalar,
-            b_scale_scalar,
-            num_hidden_units,
-            matmul_result_dtype,
-        )
-
-        # This information might only be found in the C++ code-comments for the
-        # dense.matmul op, but the quantized tensor returned by _qnn.op.dense
-        # has scale==(a_scale_scalar * b_scale_scalar), and zero_point==0.
-        #
-        # 'matmul_result_zp_scalar' has type 'int32' to satisfy input requirements
-        # of the [de/re]quantize ops below.
-        matmul_result_scale_scalar = fold_constant(_op.multiply(a_scale_scalar, b_scale_scalar))
-        matmul_result_zp_scalar = _op.const(0, dtype="int32")
-
-        if "int32" in expected_out_dtypes:
-            # This is the adaptation of the QLinearMatMul converter for MatMulInteger,
-            # in the MatMulInteger case we skip the unnecessary requantization step.
-            return matmul_result
-
-        # requantize requires y_scale to be constant,
-        # if y_scale is not constant, doing dequantize -> quantize
-        if isinstance(y_scale_scalar, _expr.Constant):
-            y = _qnn.op.requantize(
-                matmul_result,
-                matmul_result_scale_scalar,
-                matmul_result_zp_scalar,
-                y_scale_scalar,
-                y_zp_scalar,
-                axis=-1,
-                rounding="TONEAREST",
-                out_dtype=y_zp_type.dtype,
-            )
-        else:
-            matmul_result_deq = _qnn.op.dequantize(
-                matmul_result, matmul_result_scale_scalar, matmul_result_zp_scalar, axis=0
-            )
-
-            y = _qnn.op.quantize(
-                matmul_result_deq, y_scale_scalar, y_zp_scalar, axis=0, out_dtype=y_zp_type.dtype
-            )
-
-        return y
-
-
-class MatMulInteger(OnnxOpConverter):
-    """Operator converter for MatMulInteger."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        a = inputs[0]
-        b = inputs[1]
-
-        a_dtype = infer_type(a).checked_type.dtype
-        b_dtype = infer_type(b).checked_type.dtype
-
-        assert a_dtype in ("int8", "uint8"), "MatMulInteger: invalid dtype for first input"
-        assert b_dtype in ("int8", "uint8"), "MatMulInteger: invalid dtype for second input"
-
-        assert a_dtype == b_dtype, "MatMulInteger: input dtypes must match"
-
-        a_scale = _op.const(1.0, dtype="float32")
-        b_scale = _op.const(1.0, dtype="float32")
-        out_scale = _op.const(1.0, dtype="float32")
-
-        a_zero_point = _op.const(0.0, dtype=a_dtype)
-        b_zero_point = _op.const(0.0, dtype=b_dtype)
-        out_zero_point = _op.const(0.0, dtype="int32")
-
-        if len(inputs) == 4:
-            a_zero_point = inputs[2]
-            b_zero_point = inputs[3]
-
-            a_zp_dtype = infer_type(a_zero_point).checked_type.dtype
-            b_zp_dtype = infer_type(b_zero_point).checked_type.dtype
-            assert (
-                a_zp_dtype == a_dtype and b_zp_dtype == b_dtype
-            ), "MatMulInteger: input dtype doesn't match zero point dtype"
-        elif len(inputs) != 2:
-            raise AssertionError(f"MatMulInteger op takes 2 or 4 inputs, {len(inputs)} given")
-
-        inputs = [a, a_scale, a_zero_point, b, b_scale, b_zero_point, out_scale, out_zero_point]
-
-        return QLinearMatMul.get_converter(10)(inputs, attr, params, expected_out_dtypes=["int32"])
-
-
-class QLinearMul(OnnxOpConverter):
-    """Operator converter for QLinearMul from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        a = inputs[0]
-        a_scale = get_scalar(inputs[1], params)
-        a_zero_point = get_scalar(inputs[2], params, "int32")
-        b = inputs[3]
-        b_scale = get_scalar(inputs[4], params)
-        b_zero_point = get_scalar(inputs[5], params, "int32")
-        y_scale = fold_constant(get_scalar(inputs[6], params))
-        y_zero_point = get_scalar(inputs[7], params, "int32")
-
-        dtype = infer_type(a).checked_type.dtype
-
-        ## Onnxruntime doesn't actually do this op in integer, they dequantize to fp32
-        ## and then requantize afer
-        ## https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qlmul.cpp
-        a = _qnn.op.dequantize(inputs[0], a_scale, a_zero_point)
-        b = _qnn.op.dequantize(inputs[3], b_scale, b_zero_point)
-        out = _op.multiply(a, b)
-        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=dtype)
-
-
-class QLinearLeakyRelu(OnnxOpConverter):
-    """Operator converter for QLinearLeakyRelu from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-
-        a_scale = get_scalar(inputs[1], params)
-        a_zero_point = get_scalar(inputs[2], params, "int32")
-        y_scale = fold_constant(get_scalar(inputs[3], params))
-        y_zero_point = get_scalar(inputs[4], params, "int32")
-        alpha = float(attr.get("alpha", 1.0))
-
-        dtype = infer_type(inputs[0]).checked_type.dtype
-
-        # Onnxruntime doesn't actually do this op in integer, they dequantize to fp32
-        # and then requantize afer (according to documentation below)
-        # https://github.com/microsoft/onnxruntime/blob/master/docs/ContribOperators.md#com.microsoft.QLinearLeakyRelu
-        a = _qnn.op.dequantize(inputs[0], a_scale, a_zero_point)
-        out = _op.nn.leaky_relu(a, alpha)
-        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=dtype)
-
-
-class QLinearSigmoid(OnnxOpConverter):
-    """Operator converter for QLinearSigmoid from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        x = inputs[0]
-        x_scale = get_scalar(inputs[1], params)
-        x_zero_point = get_scalar(inputs[2], params, "int32")
-        y_scale = fold_constant(get_scalar(inputs[3], params))
-        y_zero_point = get_scalar(inputs[4], params, "int32")
-
-        dtype = infer_type(x).checked_type.dtype
-
-        ## Apparently, onnxruntime doesn't do this op in integer, they dequantize to fp32
-        ## and then requantize after:
-        ## https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/
-        ## providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp#L245
-        x = _qnn.op.dequantize(x, x_scale, x_zero_point)
-        out = _op.sigmoid(x)
-        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=dtype)
-
-
-class QLinearSoftmax(OnnxOpConverter):
-    """Operator converter for QLinearSoftmax from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        axis = attr["axis"]
-
-        x = inputs[0]
-        x_scale = get_scalar(inputs[1], params)
-        x_zero_point = get_scalar(inputs[2], params, "int32")
-        y_scale = fold_constant(get_scalar(inputs[3], params))
-        y_zero_point = get_scalar(inputs[4], params, "int32")
-
-        dtype = infer_type(x).checked_type.dtype
-
-        x = _qnn.op.dequantize(x, x_scale, x_zero_point)
-        out = _op.nn.softmax(x, axis)
-        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=dtype)
-
-
-class QLinearConcat(OnnxOpConverter):
-    """Operator converter for QLinearConcat from Microsoft onnxruntime contrib opset."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        # which axis to concat on
-        axis = attr["axis"]
-
-        y_scale = fold_constant(get_scalar(inputs[0], params))
-        y_zero_point = get_scalar(inputs[1], params, "int32")
-
-        # input tensors, scales, zero_points
-        assert (
-            len(inputs) % 3 == 2
-        ), "Additional input count must be a multiple of 3 -- tensor/scale/zero_point tuples"
-        tensors = []
-        scales = []
-        zero_points = []
-        for i in range(2, len(inputs), 3):
-            tensors.append(inputs[i])
-            scales.append(get_scalar(inputs[i + 1], params))
-            zero_points.append(get_scalar(inputs[i + 2], params, "int32"))
-
-        return _qnn.op.concatenate(tensors, scales, zero_points, y_scale, y_zero_point, axis)
-
-
-class ConvInteger(OnnxOpConverter):
-    """Operator converter for ConvInteger."""
-
-    @classmethod
-    def _impl_v10(cls, inputs, attr, params):
-        data = inputs[0]
-        weight = inputs[1]
-        data_zp = inputs[2]
-        weight_zp = inputs[3]
-        if data_zp is None:
-            data_zp = _expr.const(0, "int32")
-        if weight_zp is None:
-            weight_zp = _expr.const(0, "int32")
-
-        input_type = infer_type(data)
-        input_shape = get_const_tuple(input_type.checked_type.shape)
-
-        ndim = len(input_shape)
-        kernel_type = infer_type(weight)
-        kernel_shape = get_const_tuple(kernel_type.checked_type.shape)
-        if "kernel_shape" not in attr:
-            attr["kernel_shape"] = kernel_shape[2:]
-
-        if "auto_pad" in attr:
-            attr["auto_pad"] = attr["auto_pad"].decode("utf-8")
-            if attr["auto_pad"] in ("SAME_UPPER", "SAME_LOWER"):
-                # Warning: Convolution does not yet support dynamic shapes,
-                # one will need to run dynamic_to_static on this model after import
-                data = autopad(
-                    data,
-                    attr.get("strides", [1] * (ndim - 2)),
-                    attr["kernel_shape"],
-                    attr.get("dilations", [1] * (ndim - 2)),
-                    pad_value=data_zp,
-                    mode=attr["auto_pad"],
-                )
-            elif attr["auto_pad"] == "VALID":
-                attr["pads"] = tuple([0 for i in range(ndim - 2)])
-            elif attr["auto_pad"] == "NOTSET":
-                pass
-            else:
-                msg = (
-                    f'Value {attr["auto_pad"]} in attribute "auto_pad" of operator Conv '
-                    f"is invalid."
-                )
-                raise tvm.error.OpAttributeInvalid(msg)
-            attr.pop("auto_pad")
-
-        out_channels = kernel_shape[0]
-        dilation = attr.get("dilations", [1] * (ndim - 2))
-        strides = attr.get("strides", [1] * (ndim - 2))
-        padding = attr["pads"] if "pads" in attr else 0
-        groups = attr["group"] if "group" in attr else 1
-
-        if ndim != 4:
-            raise tvm.error.OpAttributeInvalid(
-                "Only 2D kernels are supported for operator ConvInteger."
-            )
-
-        return _qnn.op.conv2d(
-            data,
-            weight,
-            _op.cast(data_zp, "int32"),
-            _op.cast(weight_zp, "int32"),
-            _expr.const(1.0, "float32"),
-            _expr.const(1.0, "float32"),
-            kernel_size=attr["kernel_shape"],
-            channels=out_channels,
-            strides=strides,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-        )
-
-
-class BitwiseBase(OnnxOpConverter):
-    """Base class of operator converter for Bitwise operations"""
-
-    name = ""
-
-    @classmethod
-    def check_inputs(cls, inputs, num=2, use_int=True):
-        assert len(inputs) == num, f"{cls.name} takes {num} inputs, {len(inputs)} given"
-
-        valid_types = ["uint8", "uint16", "uint32", "uint64"]
-        if use_int:
-            valid_types += ["int8", "int16", "int32", "int64"]
-        for i in range(num):
-            in_dtype = infer_type(inputs[i]).checked_type.dtype
-            assert in_dtype in valid_types, f"Wrong dtype of the {i}-th input: {in_dtype}"
-
-
-class BitShift(BitwiseBase):
-    """Operator converter for BitShift"""
-
-    name = "BitShift"
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        cls.check_inputs(inputs, use_int=False)
-
-        direction = attr.get("direction", "LEFT").decode("ascii")
-        if direction == "LEFT":
-            out = _op.left_shift(*inputs)
-        elif direction == "RIGHT":
-            out = _op.right_shift(*inputs)
-        else:
-            raise ValueError("Unsupported Shift Direction: " + direction)
-        return out
-
-
-class BitwiseAnd(BitwiseBase):
-    """Operator converter for BitwiseAnd"""
-
-    name = "BitwiseAnd"
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        cls.check_inputs(inputs)
-
-        return _op.bitwise_and(*inputs)
-
-
-class BitwiseNot(BitwiseBase):
-    """Operator converter for BitwiseNot"""
-
-    name = "BitwiseNot"
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        cls.check_inputs(inputs, num=1)
-
-        return _op.bitwise_not(*inputs)
-
-
-class BitwiseOr(BitwiseBase):
-    """Operator converter for BitwiseOr"""
-
-    name = "BitwiseOr"
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        cls.check_inputs(inputs)
-
-        return _op.bitwise_or(*inputs)
-
-
-class BitwiseXor(BitwiseBase):
-    """Operator converter for BitwiseXor"""
-
-    name = "BitwiseXor"
-
-    @classmethod
-    def _impl_v18(cls, inputs, attr, params):
-        cls.check_inputs(inputs)
-
-        return _op.bitwise_xor(*inputs)
-
-
-class Unique(OnnxOpConverter):
-    """Operator converter for unique"""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        if len(inputs) != 1:
-            raise ValueError("Unique expects 1 input")
-
-        data = inputs[0]
-        axis = attr.get("axis", None)
-        if axis is None:  # If axis is None, flatten the input before calling unique
-            data = _op.reshape(data, _op.const([-1]))
-        else:
-            data_shape = infer_shape(data)
-            if len(data_shape) != 1:
-                raise ValueError("TVM only supports 1D Unique operator.")
-        is_sorted = attr.get("sorted", 1)  # sorted is 0 or 1, 1 by default
-
-        # ONNX documentation lists return_counts as optional but there is no input to specify
-        # whether it is returned. Therefore we'll just always return it.
-        unique = _op.unique(data, is_sorted=(is_sorted == 1), return_counts=True)
-        num_unique = unique[3]
-
-        trim_unique_lambda = lambda input: _op.strided_slice(input, _op.const([0]), num_unique)
-
-        unique_vals = trim_unique_lambda(unique[0])
-        indices = _op.cast(trim_unique_lambda(unique[1]), "int64")  # ONNX always returns int64
-        inverse_indices = _op.cast(unique[2], "int64")  # ONNX always returns int64
-        counts = _op.cast(trim_unique_lambda(unique[4]), "int64")  # ONNX always returns int64
-        # ONNX unique returns unique, indices, inverse_indices, (optional) counts
-        return _expr.TupleWrapper(_expr.Tuple([unique_vals, indices, inverse_indices, counts]), 4)
-
-
-class Einsum(OnnxOpConverter):
-    """Operator converter for Einsum"""
-
-    @classmethod
-    def _impl_v12(cls, inputs, attr, params):
-        equation = attr["equation"].decode("utf-8")
-        return _op.einsum(inputs, equation)
-
-
-class Trilu(OnnxOpConverter):
-    """Operator converter for Trilu"""
-
-    @classmethod
-    def _impl_v14(cls, inputs, attr, params):
-        upper = attr.get("upper", True)
-        if len(inputs) == 2:
-            data, k = inputs
-        else:
-            data = inputs[0]
-            k = 0
-        return _op.trilu(data, k, upper)
-
-
-class GridSample(OnnxOpConverter):
-    """Operator converter for GridSample"""
-
-    @classmethod
-    def _impl_v16(cls, inputs, attr, params):
-        grid = inputs[1]
-        # onnx grid is of shape (N, H, W, 2) which should be transposed to (N, 2, H, W) for relay
-        grid = _op.transform.transpose(grid, axes=(0, 3, 1, 2))
-        method: str = attr.get("mode", b"bilinear").decode("utf-8")
-        padding_mode: str = attr.get("padding_mode", b"zeros").decode("utf-8")
-        # onnx default is 0 which should be changed to False in relay
-        align_corners = attr.get("align_corners", 0) != 0
-        return _op.image.grid_sample(
-            inputs[0], grid, method, padding_mode=padding_mode, align_corners=align_corners
-        )
-
-
-class Bernoulli(OnnxOpConverter):
-    """Operator converter for Bernoulli"""
-
-    @classmethod
-    def _impl_v15(cls, inputs, attr, params):
-        in_dtype = infer_type(inputs[0]).checked_type.dtype
-        assert in_dtype in ["float32", "float64"], "Only float input tensor is currently supported."
-        # The data type for the elements of the output tensor.
-        # if not specified, we will use the data type of the input tensor
-        out_dtype = attr.get("dtype", None)
-        if out_dtype is None:
-            out_dtype = in_dtype
-        else:
-            out_dtype = get_type(out_dtype)
-
-        seed = attr.get("seed", None)
-        if seed is None:
-            seed = np.random.randint(1e6)
-        else:
-            seed = int(seed)
-
-        key = _random.threefry_key(seed)
-        inter_outputs = _op.random.uniform(key, infer_shape(inputs[0]), in_dtype)
-        _, uniform_nums = _expr.TupleWrapper(inter_outputs, 2)
-        return _op.cast(_op.less(uniform_nums, inputs[0]), out_dtype)
-
-
-class RandomNormal(OnnxOpConverter):
-    """Operator converter for random_normal"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        dtype = get_type(attr.get("dtype", 1))
-        mean = attr.get("mean", 0.0)
-        scale = attr.get("scale", 1.0)
-        seed = attr.get("seed", None)
-        shape = attr["shape"]
-
-        assert dtype in [
-            "float32",
-            "float64",
-        ], "Only float random value generation is currently supported."
-
-        if seed is None:
-            seed = np.random.randint(1e6)
-        else:
-            seed = int(seed)
-        key = _random.threefry_key(seed)
-        output = _op.random.normal(key, shape, dtype=dtype, mean=mean, scale=scale)
-        _, vals = _expr.TupleWrapper(output, 2)
-        return vals
-
-
-class RandomNormalLike(OnnxOpConverter):
-    """Operator converter for random_normal_like"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        dtype = attr.get("dtype", None)
-        scale = attr.get("scale", 1.0)
-        mean = attr.get("mean", 0.0)
-        seed = attr.get("seed", None)
-        shape = infer_shape(inputs[0])
-        if dtype is None:
-            dtype = infer_type(inputs[0]).checked_type.dtype
-        else:
-            dtype = get_type(dtype)
-
-        assert dtype in [
-            "float32",
-            "float64",
-        ], "Only float random value generation is currently supported."
-
-        if seed is None:
-            seed = np.random.randint(1e6)
-        else:
-            seed = int(seed)
-        key = _random.threefry_key(seed)
-        output = _op.random.normal(key, shape, dtype=dtype, mean=mean, scale=scale)
-        _, vals = _expr.TupleWrapper(output, 2)
-        return vals
-
-
-class RandomUniform(OnnxOpConverter):
-    """Operator converter for random_uniform"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        dtype = get_type(attr.get("dtype", 1))
-        high = attr.get("high", 1.0)
-        low = attr.get("low", 0.0)
-        seed = attr.get("seed", None)
-        shape = attr["shape"]
-
-        assert dtype in [
-            "float32",
-            "float64",
-        ], "Only float random value generation is currently supported."
-
-        if seed is None:
-            seed = np.random.randint(1e6)
-        else:
-            seed = int(seed)
-        key = _random.threefry_key(seed)
-        output = _op.random.uniform(key, shape, dtype=dtype, low=low, high=high)
-        _, vals = _expr.TupleWrapper(output, 2)
-        return vals
-
-
-class RandomUniformLike(OnnxOpConverter):
-    """Operator converter for random_uniform_like"""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        dtype = attr.get("dtype", None)
-        high = attr.get("high", 1.0)
-        low = attr.get("low", 0.0)
-        seed = attr.get("seed", None)
-        shape = infer_shape(inputs[0])
-        if dtype is None:
-            dtype = infer_type(inputs[0]).checked_type.dtype
-        else:
-            dtype = get_type(dtype)
-
-        assert dtype in [
-            "float32",
-            "float64",
-        ], "Only float random value generation is currently supported."
-
-        if seed is None:
-            seed = np.random.randint(1e6)
-        else:
-            seed = int(seed)
-        key = _random.threefry_key(seed)
-        output = _op.random.uniform(key, shape, dtype=dtype, low=low, high=high)
-        _, vals = _expr.TupleWrapper(output, 2)
-        return vals
-
-
-class Multinomial(OnnxOpConverter):
-    """Operator converter for multinomial"""
-
-    @classmethod
-    def _impl_v7(cls, inputs, attr, params):
-        dtype = attr.get("dtype", "int32")
-        sample_size = attr.get("sample_size", 1)
-        seed = attr.get("seed", None)
-        if seed is None:
-            seed = np.random.randint(1e6)
-        key = _op.random.threefry_key(seed)
-        output = _op.random.multinomial(key, inputs[0], sample_size)
-        _, indices = _expr.TupleWrapper(output, 2)
-        return _op.cast(indices, get_type(dtype))
-
-
-class NegativeLogLikelihoodLoss(OnnxOpConverter):
-    """Operator converter for NegativeLogLikehoodLoss"""
-
-    VALID_REDUCTIONS = {"mean", "sum", "none"}
-
-    @classmethod
-    def run_calculation(
-        cls: "NegativeLogLikelihoodLoss",
-        input_tensor: relay.Expr,
-        target_tensor: relay.Expr,
-        weight_tensor: Optional[relay.Expr],
-        ignore_index: int,
-    ):
-        """Run calculation for NegativeLogLikelihood, returning output tensor and
-        weight tensor used for mean-style reductions.
-        """
-        # Convert negative indices --> positive indices for gather ops, note we have to
-        # use the original target tensor to interact with ignore_index to have proper behavior.
-        normalized_target_tensor = normalize_gather_indices(input_tensor, target_tensor, 1)
-
-        if weight_tensor is None:
-            channels = infer_shape(input_tensor)[1]
-            weight_tensor = relay.ones(
-                [channels], dtype=infer_type(input_tensor).checked_type.dtype
-            )
-
-        loss = -relay.gather(
-            input_tensor, axis=1, indices=relay.expand_dims(normalized_target_tensor, 1)
-        )
-        loss = relay.squeeze(loss, axis=[1])
-
-        expanded_normalized_target_tensor = relay.expand_dims(normalized_target_tensor, 0)
-        expanded_normalized_target_tensor = relay.nn.batch_flatten(
-            expanded_normalized_target_tensor
-        )
-        flattened_weights = relay.gather_nd(weight_tensor, expanded_normalized_target_tensor)
-        select_weights = relay.reshape_like(flattened_weights, loss)
-        loss *= select_weights
-
-        if ignore_index is not None:
-            # "Ignore" values whose target is the ignore_index
-            mask_tensor = relay.equal(
-                target_tensor, relay.const(ignore_index, dtype=target_tensor.type_annotation.dtype)
-            )
-            mask_tensor = relay.const(1, dtype="int8") - relay.cast(mask_tensor, "int8")
-            loss = relay.where(
-                mask_tensor, loss, relay.const(0, infer_type(loss).checked_type.dtype)
-            )
-
-            # This is not explained super clearly in the onnx spec, but masked values don't
-            # contribute toward the final value in reduction
-            select_weights *= relay.cast_like(mask_tensor, select_weights)
-
-        weight_total = relay.sum(select_weights)
-        return loss, weight_total
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        ignore_index = attr.get("ignore_index", None)
-        reduction = attr.get("reduction", b"mean").decode("utf-8")
-
-        if reduction not in cls.VALID_REDUCTIONS:
-            raise ValueError(
-                f"Unknown reduction type {reduction}, choices are {cls.VALID_REDUCTIONS}"
-            )
-
-        input_tensor, target_tensor = inputs[0], inputs[1]
-        if len(inputs) == 3:
-            weight_tensor = inputs[2]
-        else:
-            weight_tensor = None
-
-        loss, weight_total = cls.run_calculation(
-            input_tensor, target_tensor, weight_tensor=weight_tensor, ignore_index=ignore_index
-        )
-        if reduction == "mean":
-            return relay.sum(loss) / weight_total
-        if reduction == "sum":
-            return relay.sum(loss)
-        # Case reduction == 'none'
-        return loss
-
-
-class SoftmaxCrossEntropyLoss(OnnxOpConverter):
-    """Operator converter for SCE_loss"""
-
-    @classmethod
-    def _impl_v13(cls, inputs, attr, params):
-        ignore_index = attr.get("ignore_index", None)
-        reduction = attr.get("reduction", b"mean").decode("utf-8")
-        input_tensor, target_tensor = inputs[0], inputs[1]
-        if len(inputs) == 3:
-            weight_tensor = inputs[2]
-        else:
-            weight_tensor = None
-
-        get_log_prob = attr["tvm_custom"]["num_outputs"] == 2
-        log_softmax_attr = {"axis": 1}
-        log_softmax_tensor = LogSoftmax.get_converter(13)([input_tensor], log_softmax_attr, None)
-
-        loss, weight_total = NegativeLogLikelihoodLoss.run_calculation(
-            log_softmax_tensor, target_tensor, weight_tensor, ignore_index=ignore_index
-        )
-
-        if reduction == "mean":
-            loss = relay.sum(loss) / weight_total
-        elif reduction == "sum":
-            loss = relay.sum(loss)
-
-        if get_log_prob:
-            return relay.TupleWrapper(relay.Tuple((loss, log_softmax_tensor)), 2)
-        return loss
-
-
-class Adagrad(OnnxOpConverter):
-    """Operator converter for adagrad op."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        decay_factor = attr.get("decay_factor", 0.0)
-        epsilon = attr.get("epsilon", 0.0)
-        norm_coefficient = attr.get("norm_coefficient", 0.0)
-
-        R = inputs[0]
-        T = inputs[1]
-
-        # convert attributes to constants, proper types
-        dtype_inputs = infer_type(inputs[3]).checked_type.dtype
-        decay_factor = relay.const(decay_factor, dtype=dtype_inputs)
-        epsilon = relay.const(epsilon, dtype=dtype_inputs)
-        norm_coefficient = relay.const(norm_coefficient, dtype=dtype_inputs)
-        T = relay.cast_like(T, inputs[3])
-
-        assert (
-            len(inputs) - 2
-        ) % 3 == 0, f"Expect triplets for remaining inputs, found {len(inputs) - 2}"
-
-        # Remaining inputs are:
-        # [x_1, x_2 ..., x_1_gradient, x_2_gradient, ... x_1_sq_g, x_2_sq_g...]
-        num_input_tensors = (len(inputs) - 2) // 3
-        output_tensors = []
-        output_accumulated_squared_gradients = []
-        for i in range(num_input_tensors):
-            x = inputs[i + 2]
-            gradient = inputs[i + 2 + num_input_tensors]
-            accumulated_squared_gradient = inputs[i + 2 + 2 * num_input_tensors]
-
-            r = R / (relay.const(1.0, dtype=dtype_inputs) + T * decay_factor)
-            g_regularized = norm_coefficient * x + gradient
-            new_accumulated_squared_gradient = (
-                accumulated_squared_gradient + g_regularized * g_regularized
-            )
-            h_adaptive = relay.sqrt(new_accumulated_squared_gradient) + epsilon
-
-            x_new = x - r * g_regularized / h_adaptive
-
-            output_tensors.append(x_new)
-            output_accumulated_squared_gradients.append(new_accumulated_squared_gradient)
-
-        # append lists together, momentums come after result tensors
-        result = output_tensors + output_accumulated_squared_gradients
-        return _expr.TupleWrapper(_expr.Tuple(result), len(result))
-
-
-class Adam(OnnxOpConverter):
-    """Operator converter for Adam op."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = attr.get("alpha", 0.9)
-        beta = attr.get("beta", 0.999)
-
-        # Note in the docs epsilon default is 0.0 but in the tests it is set to 1e-2:
-        # https://git.io/Ju5C4
-        epsilon = attr.get("epsilon", 1e-2)
-        norm_coefficient = attr.get("norm_coefficient", 0.0)
-        norm_coefficient_post = attr.get("norm_coefficient_post", 0.0)
-
-        R = inputs[0]
-        T = inputs[1]
-
-        assert (
-            len(inputs) - 2
-        ) % 4 == 0, f"Expect 4-lets for remaining inputs, found {len(inputs) - 2}"
-
-        # convert attributes to constants, proper types
-        dtype_inputs = infer_type(inputs[3]).checked_type.dtype
-        inverse_alpha = relay.const(1 - alpha, dtype=dtype_inputs)
-        alpha = relay.const(alpha, dtype=dtype_inputs)
-        inverse_beta = relay.const(1 - beta, dtype=dtype_inputs)
-        beta = relay.const(beta, dtype=dtype_inputs)
-        epsilon = relay.const(epsilon, dtype=dtype_inputs)
-        norm_coefficient = relay.const(norm_coefficient, dtype=dtype_inputs)
-        norm_coefficient_post = relay.const(norm_coefficient_post, dtype=dtype_inputs)
-        one = relay.const(1, dtype=dtype_inputs)
-        T = relay.cast_like(T, inputs[3])
-
-        # Remaining inputs are:
-        # [x_1, x_2 ..., x_1_grad, x_2_grad, ... x_1_g_accum, x_2_g_accum..., x_1_g_sq_accum, ...]
-        num_input_tensors = (len(inputs) - 2) // 4
-        output_tensors = []
-        output_accumulated_gradients = []
-        output_accumulated_squared_gradients = []
-        for i in range(num_input_tensors):
-            x = inputs[i + 2]
-            g = inputs[i + 2 + num_input_tensors]
-            v = inputs[i + 2 + 2 * num_input_tensors]
-            h = inputs[i + 2 + 3 * num_input_tensors]
-
-            g_regularized = norm_coefficient * x + g
-            v_new = alpha * v + inverse_alpha * g_regularized
-            h_new = beta * h + inverse_beta * g_regularized * g_regularized
-            h_sqrt = relay.sqrt(h_new) + epsilon
-
-            true_branch = R * relay.sqrt(one - relay.power(beta, T)) / (one - relay.power(alpha, T))
-            R_adjusted = relay.If(T > relay.const(0, dtype=dtype_inputs), true_branch, R)
-
-            x_new = x - R_adjusted * (v_new / h_sqrt)
-            x_result = (one - norm_coefficient_post) * x_new
-
-            output_tensors.append(x_result)
-            output_accumulated_gradients.append(v_new)
-            output_accumulated_squared_gradients.append(h_new)
-
-        # append lists together to get final result
-        result = (
-            output_tensors + output_accumulated_gradients + output_accumulated_squared_gradients
-        )
-        return _expr.TupleWrapper(_expr.Tuple(result), len(result))
-
-
-class Momentum(OnnxOpConverter):
-    """Operator converter for Momentum op."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        alpha = attr["alpha"]
-        beta = attr["beta"]
-        mode = attr["mode"].decode("utf-8")
-        norm_coefficient = attr["norm_coefficient"]
-
-        assert mode in ["nesterov", "standard"], f"Unknown momentum mode {mode}"
-        R = inputs[0]
-        T = inputs[1]
-
-        assert (
-            len(inputs) - 2
-        ) % 3 == 0, f"Expect triplets for remaining inputs, found {len(inputs) - 2}"
-        # Remaining inputs are:
-        # [x_1, x_2 ..., x_1_gradient, x_2_gradient, ... x_1_momentum, x_2_momentum...]
-        num_input_tensors = (len(inputs) - 2) // 3
-
-        # convert attributes to constants
-        dtype_inputs = infer_type(inputs[3]).checked_type.dtype
-        alpha = relay.const(alpha, dtype=dtype_inputs)
-        beta = relay.const(beta, dtype=dtype_inputs)
-        norm_coefficient = relay.const(norm_coefficient, dtype=dtype_inputs)
-        default_beta = relay.const(1.0, dtype=dtype_inputs)
-
-        # Calculate updated values for every input
-        output_tensors = []
-        output_momentums = []
-        for i in range(num_input_tensors):
-            x = inputs[i + 2]
-            gradient = inputs[i + 2 + num_input_tensors]
-            momentum = inputs[i + 2 + 2 * num_input_tensors]
-            g_regularized = norm_coefficient * x + gradient
-            beta_adjusted = relay.If(T > relay.const(0, dtype="int64"), beta, default_beta)
-            new_momentum = alpha * momentum + beta_adjusted * g_regularized
-
-            if mode == "standard":
-                x_output = x - R * new_momentum
-            else:
-                # mode == 'nesterov'
-                x_output = x - R * (g_regularized + alpha * new_momentum)
-
-            output_tensors.append(x_output)
-            output_momentums.append(new_momentum)
-
-        # append lists together, momentums come after result tensors
-        result = output_tensors + output_momentums
-        return _expr.TupleWrapper(_expr.Tuple(result), len(result))
-
-
-class Round(OnnxOpConverter):
-    """Operator converter for round op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Onnx round uses Banker's rounding which rounds .5 to the nearest even integer
-
-        x = inputs[0]
-        dtype = infer_type(x).checked_type.dtype
-        half = _expr.const(0.5, dtype=dtype)
-        one = _expr.const(1, dtype=dtype)
-        two = _expr.const(2, dtype=dtype)
-
-        rounded = _op.ceil(x - half)
-        bankers_mask = one - (_op.ceil(x + half) - _op.floor(x + half))
-        non_even = _op.abs(_op.mod(rounded, two))
-        return rounded + (bankers_mask * non_even)
-
-
-class SequenceConstruct(OnnxOpConverter):
-    """Operator converter for sequence construction op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Construct a tuple from input tensors.
-        return _expr.Tuple(inputs)
-
-
-class SequenceEmpty(OnnxOpConverter):
-    """Operator converter for sequence empty op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Construct an empty tuple.
-        return _expr.Tuple([])
-
-
-class SequenceErase(OnnxOpConverter):
-    """Operator converter for sequence erase op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Erase tensor from sequence on specified position
-        input_sequence = inputs[0]
-
-        if len(inputs) == 2:
-            position = inputs[1]
-            # Non constant position is not supported.
-            if isinstance(position, _expr.Constant):
-                position = position.data.numpy()
-            elif position.name_hint in params:
-                position = params[position.name_hint].numpy()
-            else:
-                raise NotImplementedError("Position must be a constant.")
-        else:
-            position = -1
-
-        seq_len = len(input_sequence)
-        assert -seq_len <= position < seq_len, "Position is out of bounds"
-
-        if position < 0:
-            position = seq_len + position
-        # Convert sequence to a list, insert tensors before erased, and repackage as Tuple.
-        tensor_list = [input_sequence[i] for i in range(seq_len) if i != position]
-        # Create new tuple and return.
-        return _expr.Tuple(tensor_list)
-
-
-class SequenceInsert(OnnxOpConverter):
-    """Operator converter for sequence insert op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Insert a new tensor into a tuple of tensors.
-        input_sequence = inputs[0]
-        new_tensor = inputs[1]
-
-        if len(inputs) == 3:
-            position = inputs[2]
-            # Non constant position is not supported.
-            if isinstance(position, _expr.Constant):
-                position = position.data.numpy()
-            elif position.name_hint in params:
-                position = params[position.name_hint].numpy()
-            else:
-                raise NotImplementedError("Position must be a constant.")
-        else:
-            position = -1
-
-        if position < 0:
-            position = len(input_sequence) + position + 1
-        # Convert sequence to a list, insert new tensor, and repackage as Tuple.
-        tensor_list = [input_sequence[i] for i in range(len(input_sequence))]
-        # Insert new tensor.
-        tensor_list.insert(position, new_tensor)
-        # Create new tuple and return.
-        return _expr.Tuple(tensor_list)
-
-
-class SequenceLength(OnnxOpConverter):
-    """Operator converter for sequence length op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        # Get length of input sequence
-        return _expr.const(len(inputs[0]), dtype="int64")
-
-
-class ConcatFromSequence(OnnxOpConverter):
-    """Operator converter for sequence concatenation op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        axis = attr.get("axis", 0)
-        new_axis = attr.get("new_axis", 0)
-
-        # If a new axis should be created, just stack input tensors.
-        if new_axis == 1:
-            return _op.stack(inputs[0], axis=axis)
-
-        return _op.concatenate(inputs[0], axis=axis)
-
-
-class SplitToSequence(OnnxOpConverter):
-    """Operator converter for split to sequence op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        axis = attr.get("axis", 0)
-        keepdims = attr.get("keepdims", 1)
-
-        input_tensor = inputs[0]
-        input_shape = infer_shape(input_tensor)
-        split = inputs[1]
-
-        # If split is not provided, we split all values along axis.
-        if split is None:
-            output = _op.split(input_tensor, input_shape[axis], axis=axis)
-            # If keepdims is 0, then we need to squeeze off the axis.
-            if not keepdims:
-                output = [_op.squeeze(tensor_slice, axis=[axis]) for tensor_slice in output]
-            return _expr.Tuple(list(output))
-
-        # Otherwise, split based on provided split value.
-        else:
-            # For now we only support constant valued split.
-            assert isinstance(
-                split, _expr.Constant
-            ), "Only constant split supported for SplitToSequence"
-            split = split.data.numpy()
-            if len(split.shape) == 1 and split.shape[0] > 1:
-                # If split is a 1D tensor, it must be converted to indices for relay compatibility.
-                split = np.cumsum(split)
-                # Remove final invalid index.
-                split = split[:-1]
-            else:
-                # Otherwise get split as an integer.
-                split = int(split)
-
-            output = _op.split(input_tensor, split, axis=axis)
-
-            # If keepdims is set to 0 remove split axis. Note that this is
-            # an inconsistency with the onnx spec but is needed for pytorch compatibility.
-            if not keepdims:
-                output = [_op.squeeze(tensor_slice, axis=[axis]) for tensor_slice in output]
-            return _expr.Tuple(list(output))
-
-
-class SequenceAt(OnnxOpConverter):
-    """Operator converter for sequence at op."""
-
-    @classmethod
-    def _impl_v11(cls, inputs, attr, params):
-        input_sequence = inputs[0]
-        position = inputs[1]
-        assert isinstance(
-            position, _expr.Constant
-        ), "Only constant position supported for SequenceAt"
-        # Convert position to integer.
-        position = int(position.data.numpy())
-        return input_sequence[position]
-
-
-# compatible operators that do NOT require any conversion.
-_identity_list = []
-
-
-# _convert_map defines maps of name to converter functor(callable)
-# for 1 to 1 mapping, use Renamer if nothing but name is different
-# use AttrCvt if attributes need to be converted
-# for 1 to N mapping(composed), use custom callable functions
-# for N to 1 mapping, currently not supported(?)
-def _get_convert_map(opset):
-    return {
-        # defs/experimental
-        "Identity": Renamer("copy"),
-        "Optional": Optional_.get_converter(opset),
-        "OptionalHasElement": OptionalHasElement.get_converter(opset),
-        "OptionalGetElement": OptionalGetElement.get_converter(opset),
-        "Affine": Affine.get_converter(opset),
-        # Bitwise operators
-        "BitShift": BitShift.get_converter(opset),
-        "BitwiseAnd": BitwiseAnd.get_converter(opset),
-        "BitwiseNot": BitwiseNot.get_converter(opset),
-        "BitwiseOr": BitwiseOr.get_converter(opset),
-        "BitwiseXor": BitwiseXor.get_converter(opset),
-        "ThresholdedRelu": ThresholdedRelu.get_converter(opset),
-        "ScaledTanh": ScaledTanh.get_converter(opset),
-        "ParametricSoftplus": ParametricSoftPlus.get_converter(opset),
-        "Constant": Constant.get_converter(opset),
-        "ConstantOfShape": ConstantOfShape.get_converter(opset),
-        # 'GivenTensorFill'
-        "FC": AttrCvt("dense", ignores=["axis", "axis_w"]),
-        "Scale": Scale.get_converter(opset),
-        # 'GRUUnit'
-        # 'ATen'
-        # 'ImageScaler'
-        "MeanVarianceNormalization": MeanVarianceNormalization.get_converter(opset),
-        # 'Crop'
-        # 'Embedding'
-        "Upsample": Upsample.get_converter(opset),
-        "SpatialBN": BatchNorm.get_converter(opset),
-        # defs/generator
-        # defs/logical
-        # defs/math
-        "Add": Add.get_converter(opset),
-        "Sub": Sub.get_converter(opset),
-        "Mul": Mul.get_converter(opset),
-        "Div": Div.get_converter(opset),
-        "Neg": Renamer("negative"),
-        "Abs": Absolute.get_converter(opset),
-        "Reciprocal": Reciprocal.get_converter(opset),
-        "Floor": Renamer("floor"),
-        "Ceil": Renamer("ceil"),
-        "Round": Round.get_converter(opset),
-        "IsInf": IsInf.get_converter(opset),
-        "IsNaN": Renamer("isnan"),
-        "Sqrt": Renamer("sqrt"),
-        "Relu": Renamer("relu"),
-        "Celu": Celu.get_converter(opset),
-        "LeakyRelu": Renamer("leaky_relu"),
-        "Selu": Selu.get_converter(opset),
-        "Elu": Elu.get_converter(opset),
-        "Gelu": Gelu.get_converter(opset),
-        "FastGelu": FastGelu.get_converter(opset),
-        "BiasGelu": BiasGelu.get_converter(opset),
-        "Mish": Mish.get_converter(opset),
-        "LayerNormalization": LayerNormalization.get_converter(opset),
-        # TODO: We need a better way to handle different domains, in case
-        # of name collisions. EmbedLayerNormalization, SkipLayerNormalization, and Attention
-        # are in the `com.microsoft` domain.
-        "EmbedLayerNormalization": EmbedLayerNormalization.get_converter(opset),
-        "SkipLayerNormalization": SkipLayerNormalization.get_converter(opset),
-        "Attention": Attention.get_converter(opset),
-        "QAttention": QAttention.get_converter(opset),
-        "Exp": Renamer("exp"),
-        "Greater": Renamer("greater"),
-        "GreaterOrEqual": Renamer("greater_equal"),
-        "Less": Renamer("less"),
-        "LessOrEqual": Renamer("less_equal"),
-        "Log": Renamer("log"),
-        "Acos": Renamer("acos"),
-        "Acosh": Renamer("acosh"),
-        "Asin": Renamer("asin"),
-        "Asinh": Renamer("asinh"),
-        "Atan": Renamer("atan"),
-        "Atanh": Renamer("atanh"),
-        "Cos": Renamer("cos"),
-        "Cosh": Renamer("cosh"),
-        "Sin": Renamer("sin"),
-        "Sinh": Renamer("sinh"),
-        "Tan": Renamer("tan"),
-        "Tanh": Renamer("tanh"),
-        "Pow": Pow.get_converter(opset),
-        "PRelu": Prelu.get_converter(opset),
-        "Sigmoid": Renamer("sigmoid"),
-        "HardSigmoid": HardSigmoid.get_converter(opset),
-        "HardSwish": HardSwish.get_converter(opset),
-        "Max": Maximum.get_converter(opset),
-        "Min": Minimum.get_converter(opset),
-        "Sum": Sum.get_converter(opset),
-        "Mean": Mean.get_converter(opset),
-        "Clip": Clip.get_converter(opset),
-        "Softplus": Softplus.get_converter(opset),
-        # softmax default axis is different in onnx
-        "Softmax": Softmax.get_converter(opset),
-        "LogSoftmax": LogSoftmax.get_converter(opset),
-        "OneHot": OneHot.get_converter(opset),
-        "Hardmax": Hardmax.get_converter(opset),
-        "Shrink": Shrink.get_converter(opset),
-        "Softsign": Softsign.get_converter(opset),
-        "Gemm": Gemm.get_converter(opset),
-        "MatMul": MatMul.get_converter(opset),
-        "MatMulInteger": MatMulInteger.get_converter(opset),
-        "MatMulInteger16": MatMulInteger16.get_converter(opset),
-        "Mod": Mod.get_converter(opset),
-        "Xor": Renamer("logical_xor"),
-        # defs/nn
-        "AveragePool": AveragePool.get_converter(opset),
-        "LpPool": LpPool.get_converter(opset),
-        "GlobalLpPool": GlobalLpPool.get_converter(opset),
-        "MaxPool": MaxPool.get_converter(opset),
-        "MaxUnpool": MaxUnpool.get_converter(opset),
-        "Conv": Conv.get_converter(opset),
-        "ConvTranspose": ConvTranspose.get_converter(opset),
-        "GlobalAveragePool": GlobalAveragePool.get_converter(opset),
-        "GlobalMaxPool": GlobalMaxPool.get_converter(opset),
-        "BatchNormalization": BatchNorm.get_converter(opset),
-        "InstanceNormalization": InstanceNorm.get_converter(opset),
-        # 'LpNormalization'
-        "Dropout": AttrCvt("dropout", {"ratio": "rate"}, ignores=["is_test"]),
-        "Flatten": Flatten.get_converter(opset),
-        "LRN": LRN.get_converter(opset),
-        # Recurrent Layers
-        "RNN": RNN.get_converter(opset),
-        "LSTM": LSTM.get_converter(opset),
-        "GRU": GRU.get_converter(opset),
-        # defs/vision
-        "MaxRoiPool": MaxRoiPool.get_converter(opset),
-        "RoiAlign": RoiAlign.get_converter(opset),
-        "NonMaxSuppression": NonMaxSuppression.get_converter(opset),
-        # defs/reduction
-        "ReduceMax": ReduceMax.get_converter(opset),
-        "ReduceMin": ReduceMin.get_converter(opset),
-        "ReduceSum": ReduceSum.get_converter(opset),
-        "ReduceMean": ReduceMean.get_converter(opset),
-        "ReduceProd": ReduceProd.get_converter(opset),
-        "ReduceLogSumExp": ReduceLogSumExp.get_converter(opset),
-        "ReduceLogSum": ReduceLogSum.get_converter(opset),
-        "ReduceSumSquare": ReduceSumSquare.get_converter(opset),
-        "ReduceL1": ReduceL1.get_converter(opset),
-        "ReduceL2": ReduceL2.get_converter(opset),
-        # defs/sorting
-        "ArgMax": ArgMax.get_converter(opset),
-        "ArgMin": ArgMin.get_converter(opset),
-        "TopK": TopK.get_converter(opset),
-        # defs/tensor
-        "Cast": Cast.get_converter(opset),
-        "CastLike": CastLike.get_converter(opset),
-        "Reshape": Reshape.get_converter(opset),
-        "Expand": Expand.get_converter(opset),
-        "Concat": Concat.get_converter(opset),
-        "Split": Split.get_converter(opset),
-        "Slice": Slice.get_converter(opset),
-        "Transpose": AttrCvt("transpose", {"perm": "axes"}),
-        "DepthToSpace": DepthToSpace.get_converter(opset),
-        "SpaceToDepth": SpaceToDepth.get_converter(opset),
-        "Gather": Gather.get_converter(opset),
-        "GatherElements": GatherElements.get_converter(opset),
-        "GatherND": GatherND.get_converter(opset),
-        "Compress": Compress.get_converter(opset),
-        "Size": AttrCvt("ndarray_size", extras={"dtype": "int64"}),
-        "Scatter": Scatter.get_converter(opset),
-        "ScatterElements": ScatterElements.get_converter(opset),
-        "ScatterND": ScatterND.get_converter(opset),
-        "EyeLike": EyeLike.get_converter(opset),
-        "Squeeze": Squeeze.get_converter(opset),
-        "Unsqueeze": Unsqueeze.get_converter(opset),
-        "Pad": Pad.get_converter(opset),
-        "Shape": Shape.get_converter(opset),
-        "Sign": Sign.get_converter(opset),
-        "Equal": Equal.get_converter(opset),
-        "Not": Not.get_converter(opset),
-        "And": And.get_converter(opset),
-        "Tile": Tile.get_converter(opset),
-        "Erf": Erf.get_converter(opset),
-        "Where": Where.get_converter(opset),
-        "Or": Or.get_converter(opset),
-        "Resize": Resize.get_converter(opset),
-        "NonZero": NonZero.get_converter(opset),
-        "Range": Range.get_converter(opset),
-        "CumSum": CumSum.get_converter(opset),
-        "Unique": Unique.get_converter(opset),
-        "Einsum": Einsum.get_converter(opset),
-        "Trilu": Trilu.get_converter(opset),
-        "GridSample": GridSample.get_converter(opset),
-        # defs/control_flow
-        "Loop": Loop.get_converter(opset),
-        "If": If.get_converter(opset),
-        # Torch ATen Dispatcher.
-        "ATen": ATen.get_converter(opset),
-        # Quantization
-        "QuantizeLinear": QuantizeLinear.get_converter(opset),
-        "DequantizeLinear": DequantizeLinear.get_converter(opset),
-        "DynamicQuantizeLinear": DynamicQuantizeLinear.get_converter(opset),
-        "ReverseSequence": ReverseSequence.get_converter(opset),
-        "QGemm": QGemm.get_converter(opset),
-        "QLinearConv": QLinearConv.get_converter(opset),
-        "QLinearConcat": QLinearConcat.get_converter(opset),
-        "QLinearAdd": QLinearAdd.get_converter(opset),
-        "QLinearMatMul": QLinearMatMul.get_converter(opset),
-        "QLinearMul": QLinearMul.get_converter(opset),
-        "QLinearSigmoid": QLinearSigmoid.get_converter(opset),
-        "QLinearSoftmax": QLinearSoftmax.get_converter(opset),
-        "ConvInteger": ConvInteger.get_converter(opset),
-        "QLinearAveragePool": QLinearAveragePool.get_converter(opset),
-        "QLinearGlobalAveragePool": QLinearGlobalAveragePool.get_converter(opset),
-        "QLinearLeakyRelu": QLinearLeakyRelu.get_converter(opset),
-        # Random number generation.
-        "Bernoulli": Bernoulli.get_converter(opset),
-        "RandomNormal": RandomNormal.get_converter(opset),
-        "RandomNormalLike": RandomNormalLike.get_converter(opset),
-        "RandomUniform": RandomUniform.get_converter(opset),
-        "RandomUniformLike": RandomUniformLike.get_converter(opset),
-        "Multinomial": Multinomial.get_converter(opset),
-        # Loss functions / training
-        "NegativeLogLikelihoodLoss": NegativeLogLikelihoodLoss.get_converter(opset),
-        "SoftmaxCrossEntropyLoss": SoftmaxCrossEntropyLoss.get_converter(opset),
-        "Adagrad": Adagrad.get_converter(opset),
-        "Adam": Adam.get_converter(opset),
-        "Momentum": Momentum.get_converter(opset),
-        "Scan": Scan.get_converter(opset),
-        # ML
-        "LinearRegressor": LinearRegressor.get_converter(opset),
-        "DFT": DFT.get_converter(opset),
-        # Sequence operators
-        "SequenceConstruct": SequenceConstruct.get_converter(opset),
-        "SequenceEmpty": SequenceEmpty.get_converter(opset),
-        "SequenceErase": SequenceErase.get_converter(opset),
-        "SequenceInsert": SequenceInsert.get_converter(opset),
-        "SequenceLength": SequenceLength.get_converter(opset),
-        "ConcatFromSequence": ConcatFromSequence.get_converter(opset),
-        "SplitToSequence": SplitToSequence.get_converter(opset),
-        "SequenceAt": SequenceAt.get_converter(opset),
-    }
-
-
-class GraphProto:
-    """A helper class for handling Relay expression copying from pb2.GraphProto.
-    Definition: https://github.com/onnx/onnx/blob/main/onnx/onnx.proto
-
-        Parameters
-    ----------
-    shape : dict of str to tuple, optional
-        The input shape to the graph
-
-    dtype : str or dict of str to str
-        The input types to the graph
-
-    freeze_params: bool
-        If this parameter is true, the importer will take any provided
-        onnx input values (weights, shapes, etc) and embed them into the relay model
-        as Constants instead of variables. This allows more aggressive optimizations
-        at compile time and helps in making models static if certain inputs represent
-        attributes relay would traditionally consider compile-time constants.
-
-    op_type_dict: Dict[str, int]
-        Dictionary for span filling usage. If the name property of op was not set
-        op_type_dict will provide an alternative by combining literal op type with
-        its presenting order
-
-    """
-
-    current = None
-
-    def __init__(self, shape, dtype, freeze_params=False, op_type_dict=None):
-        self._nodes = {}
-        self._params = {}
-        self._inputs = {}
-        self._renames = {}
-        self._num_input = 0
-        self._num_param = 0
-        self._shape = shape.copy() if shape else {}
-        self._input_names = []
-        self._dtype = dtype
-        self.opset = None
-        self._freeze_params = freeze_params
-        self._op_type_dict = op_type_dict
-
-    def __enter__(self):
-        self._old_manager = GraphProto.current
-        GraphProto.current = self
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        GraphProto.current = self._old_manager
-
-    def freeze(self, func, params):
-        bind_map = {}
-        for name in params.keys():
-            if name in self._nodes.keys():
-                bind_map[self._nodes[name]] = _expr.const(params[name])
-        body = _expr.bind(func.body, bind_map)
-        fn = _function.Function(analysis.free_vars(body), body)
-        return fn, {}
-
-    def from_onnx(self, graph, opset, get_output_expr=False):
-        """Construct Relay expression from ONNX graph.
-
-        Onnx graph is a python protobuf object.
-        The companion parameters will be handled automatically.
-        However, the input names from onnx graph is vague, mixing inputs and
-        network weights/bias such as "1", "2"...
-        For convenience, we rename the `real` input names to "input_0",
-        "input_1"... And renaming parameters to "param_0", "param_1"...
-
-        Parameters
-        ----------
-        graph : onnx protobuf object
-            The loaded onnx graph
-
-        opset : opset version
-
-        get_output_expr: bool
-            If set to true, this conversion will return each output expression rather
-            than a packaged module. This can be useful when converting subgraphs to
-            relay.
-
-        Returns
-        -------
-        mod : tvm.IRModule
-            The returned relay module
-
-        params : dict
-            A dict of name: tvm.nd.array pairs, used as pretrained weights
-        """
-        self.opset = opset
-        self._parse_graph_initializers(graph)
-        self._parse_graph_input(graph)
-        self._check_user_inputs_in_outermost_graph_scope()
-        self._check_for_unsupported_ops(graph)
-        self._construct_nodes(graph)
-
-        # now return the outputs
-        outputs = [self._nodes[self._parse_value_proto(i)] for i in graph.output]
-        outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-        # If requested, directly return the converted expressions.
-        if get_output_expr:
-            return outputs
-        ## Maintain the order of inputs and parameters from the ONNX graph, but only include
-        ## those parameters that are needed to execute the relay graph
-        free_vars = analysis.free_vars(outputs)
-        nodes = {v: k for k, v in self._nodes.items()}
-        free_vars = [nodes[var] for var in free_vars]
-        for i_name in self._params:
-            if i_name in free_vars and i_name not in self._inputs:
-                self._inputs[i_name] = self._nodes[i_name]
-        # Create a function from our output expression and all input variables.
-        func = _function.Function([v for k, v in self._inputs.items()], outputs)
-        return IRModule.from_expr(func), self._params
-
-    def _parse_graph_initializers(self, graph):
-        """Parse network inputs to relay, aka parameters."""
-        for init_tensor in graph.initializer:
-            if not init_tensor.name.strip():
-                raise ValueError("Tensor's name is required.")
-            array = self._parse_array(init_tensor)
-            if self._freeze_params:
-                self._nodes[init_tensor.name] = _expr.const(array)
-            else:
-                self._params[init_tensor.name] = array
-                self._nodes[init_tensor.name] = new_var(
-                    init_tensor.name,
-                    shape=self._params[init_tensor.name].shape,
-                    dtype=self._params[init_tensor.name].dtype,
-                )
-
-    def _parse_graph_input(self, graph):
-        for i in graph.input:
-            # from onnx v0.2, GraphProto.input has type ValueInfoProto,
-            #  and the name is 'i.name'
-            i_name, i_shape, d_type, i_shape_name = get_info(i)
-            if i_name in self._params:
-                # i is a param instead of input
-                self._num_param += 1
-                self._nodes[i_name] = new_var(
-                    i_name, shape=self._params[i_name].shape, dtype=self._params[i_name].dtype
-                )
-            elif i_name in self._nodes:
-                continue
-            else:
-                self._num_input += 1
-                self._input_names.append(i_name)
-                if i_name in self._shape:
-                    i_shape = self._shape[i_name]
-                else:
-                    if "?" in str(i_shape):
-                        warning_msg = (
-                            "Input %s has unknown dimension shapes: %s. "
-                            "Specifying static values may improve performance"
-                            % (i_name, str(i_shape_name))
-                        )
-                        warnings.warn(warning_msg)
-                if isinstance(self._dtype, dict):
-                    dtype = self._dtype[i_name] if i_name in self._dtype else d_type
-                else:
-                    dtype = d_type
-                self._nodes[i_name] = new_var(i_name, shape=i_shape, dtype=dtype)
-            self._inputs[i_name] = self._nodes[i_name]
-
-    def _check_user_inputs_in_outermost_graph_scope(self):
-        """Only check user inputs in the outer-most graph scope."""
-        if self._old_manager is None:
-            assert all(
-                [name in self._input_names for name in self._shape.keys()]
-            ), "User specified the shape for inputs that weren't found in the graph: " + str(
-                self._shape
-            )
-
-    def _check_for_unsupported_ops(self, graph):
-        convert_map = _get_convert_map(self.opset)
-        unsupported_ops = set()
-        for node in graph.node:
-            op_name = node.op_type
-            if (
-                op_name not in convert_map
-                and op_name != "Constant"
-                and op_name not in _identity_list
-            ):
-                unsupported_ops.add(op_name)
-        if unsupported_ops:
-            msg = "The following operators are not supported for frontend ONNX: "
-            msg += ", ".join(unsupported_ops)
-            raise tvm.error.OpNotImplemented(msg)
-
-    def _construct_nodes(self, graph):
-        """Nodes are stored as directed acyclic graph."""
-        for node in graph.node:
-            op_name = node.op_type
-            attr = self._parse_attr(node.attribute)
-            # Fill in span of inputs
-            node_source_name = get_source_name(node, self._op_type_dict)
-            self._set_parameter_span(node, node_source_name)
-            # Create and populate input list.
-            inputs = onnx_input()
-            for i in node.input:
-                if i != "":
-                    inputs.append(self._nodes[self._renames.get(i, i)])
-                else:
-                    inputs.append(None)
-            i_name = self._parse_value_proto(node)
-            node_output = self._fix_outputs(op_name, node.output)
-            attr["tvm_custom"] = {}
-            attr["tvm_custom"]["name"] = i_name
-            attr["tvm_custom"]["num_outputs"] = len(node_output)
-
-            op = self._convert_operator(op_name, inputs, attr, self.opset)
-            if not isinstance(op, _expr.TupleWrapper):
-                outputs_num = 1
-            else:
-                outputs_num = len(op)
-
-            if outputs_num == 1:
-                op = fold_constant(op)
-            else:
-                op = _expr.TupleWrapper(fold_constant(op.astuple()), len(op))
-
-            op = set_span(op, node_source_name)
-
-            if outputs_num > 1:
-                # ONNX supports optional outputs for some nodes.
-                # This block searches for missing outputs in the ONNX graph
-                # and removes any unneeded ops
-                valid_outputs = [False] * outputs_num
-                for i, output in enumerate(node_output):
-                    if output != "":
-                        valid_outputs[i] = True
-                # If we have outputs ONNX isn't expecting, we need to drop them
-                if not all(valid_outputs):
-                    tup = op.astuple()
-                    # TupleWrapper can also wrap ops with TupleType outputs
-                    if isinstance(tup, _expr.Tuple):
-                        # For tuples, we extract the fields instead of using GetTupleItem
-                        outputs = [tup.fields[i] for i, valid in enumerate(valid_outputs) if valid]
-                    else:
-                        # For call nodes, we need to GetTupleItem
-                        outputs = [op[i] for i, valid in enumerate(valid_outputs) if valid]
-                    # Create the new op with valid outputs
-                    if len(outputs) == 1:
-                        op = outputs[0]
-                    elif len(outputs) != outputs_num:
-                        op = _expr.TupleWrapper(_expr.Tuple(outputs), len(outputs))
-                    # Drop invalid outputs for the onnx node
-                    outputs_num = len(outputs)
-                    node_output = [output for output in node_output if output != ""]
-            assert (
-                len(node_output) == outputs_num
-            ), f"Number of output mismatch {len(node_output)} vs {outputs_num} in {op_name}."
-
-            if outputs_num == 1:
-                self._nodes[node_output[0]] = op
-            else:
-                for k, i in zip(list(node_output), range(len(node_output))):
-                    self._nodes[k] = op[i]
-
-    def _set_parameter_span(self, node, node_source_name):
-        for i in node.input:
-            if i != "":
-                name = self._renames.get(i, i)
-                expr = self._nodes.get(name)
-                # relay.Var -> inputs / params
-                # relay.Constant -> freezed params / built-in constants
-                if isinstance(expr, (relay.Var, relay.Constant)):
-                    expr_with_span = set_span(expr, make_parameter_span([node_source_name, name]))
-                    self._nodes[name] = expr_with_span
-                    if name in self._inputs:
-                        self._inputs[name] = expr_with_span
-
-    def _parse_value_proto(self, value_proto):
-        """Parse ValueProto or raw str."""
-        try:
-            name = value_proto.name
-        except AttributeError:
-            name = value_proto
-        return name
-
-    def _parse_array(self, tensor_proto):
-        np_array = get_numpy(tensor_proto).reshape(tuple(tensor_proto.dims))
-        return _nd.array(np_array)
-
-    def _parse_attr(self, attr_proto):
-        """Convert a list of AttributeProto to a dict, with names as keys."""
-        attrs = {}
-        for a in attr_proto:
-            for f in ["f", "i", "s", "g"]:
-                if a.HasField(f):
-                    attrs[a.name] = getattr(a, f)
-            for f in ["floats", "ints", "strings"]:
-                if list(getattr(a, f)):
-                    assert a.name not in attrs, "Only one type of attr is allowed"
-                    attrs[a.name] = tuple(getattr(a, f))
-            for f in ["t"]:
-                if a.HasField(f):
-                    attrs[a.name] = getattr(a, f)
-            for f in ["tensors"]:
-                if list(getattr(a, f)):
-                    assert a.name not in attrs, "Only one type of attr is allowed"
-                    attrs[a.name] = tuple(getattr(a, f))
-            for f in ["graphs"]:
-                if list(getattr(a, f)):
-                    raise NotImplementedError(f"Field {f} is not supported in relay.")
-            if a.name not in attrs:
-                raise ValueError(f"Cannot parse attribute: \n{a}\n.")
-        return attrs
-
-    def _convert_operator(self, op_name, inputs, attrs, opset):
-        """Convert ONNX operator into a Relay operator.
-        The converter must specify conversions explicitly for incompatible name, and
-        apply handlers to operator attributes.
-
-        Parameters
-        ----------
-        op_name : str
-            Operator name, such as Convolution, FullyConnected
-        inputs : list of tvm.relay.function.Function
-            List of inputs.
-        attrs : dict
-            Dict of operator attributes
-        opset : int
-            Opset version
-
-        Returns
-        -------
-        sym : tvm.relay.function.Function
-            Converted relay function
-        """
-        convert_map = _get_convert_map(opset)
-        if op_name in _identity_list:
-            sym = get_relay_op(op_name)(*inputs, **attrs)
-        elif op_name in convert_map:
-            sym = convert_map[op_name](inputs, attrs, self._params)
-        else:
-            raise NotImplementedError(f"Operator {op_name} not implemented.")
-        return sym
-
-    def _fix_outputs(self, op_name, outputs):
-        """A hack to handle dropout or similar operator that have more than one out
-        in ONNX.
-        """
-        if op_name == "Dropout":
-            if len(outputs) == 1:
-                return outputs
-            # TODO(zhreshold): support dropout mask?
-            outputs = outputs[:-1]
-        return outputs
-
-
-def export_model(location, graph):
-    """Convert the graph to an onnx model and export it to the location."""
-    import datetime
-    import os
-
-    from onnx import save, helper
-
-    if not os.path.exists(location):
-        os.makedirs(location)
-    time_stamp = datetime.datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
-    model = helper.make_model(graph)
-    save(model, os.path.join(location, f"tvm_exported_model_{time_stamp}.onnx"))
-
-
-def from_onnx(
-    model,
-    shape=None,
-    dtype="float32",
-    opset=None,
-    freeze_params=True,
-    convert_config=None,
-    export_node_renamed_model_path=None,
-):
-    """Convert a ONNX model into an equivalent Relay Function.
-
-    ONNX graphs are represented as Python Protobuf objects.
-    The companion parameters will be handled automatically.
-    However, the input names from onnx graph is vague, mixing inputs and
-    network weights/bias such as "1", "2"...
-    For convenience, we rename the `real` input names to "input_0",
-    "input_1"... And renaming parameters to "param_0", "param_1"...
-
-    By default, ONNX defines models in terms of dynamic shapes. The ONNX importer
-    retains that dynamism upon import, and the compiler attempts to convert the
-    model into a static shapes at compile time. If this fails, there may still
-    be dynamic operations in the model. Not all TVM kernels currently support
-    dynamic shapes, please file an issue on discuss.tvm.apache.org
-    if you hit an error with dynamic kernels.
-
-    Parameters
-    ----------
-    model : protobuf object
-        ONNX ModelProto after ONNX v1.1.0
-
-    shape : dict of str to tuple, optional
-        The input shape to the graph
-
-    dtype : str or dict of str to str
-        The input types to the graph
-
-    opset : int, optional
-        Override to autodetected opset.
-        This can be helpful for some testing.
-
-    freeze_params: bool
-        If this parameter is true, the importer will take any provided
-        onnx input values (weights, shapes, etc) and embed them into the relay model
-        as Constants instead of variables. This allows more aggressive optimizations
-        at compile time and helps in making models static if certain inputs represent
-        attributes relay would traditionally consider compile-time constants.
-
-    convert_config : Optional[Dict[str, Any]]
-        Default config:
-            use_nt_batch_matmul : bool = True
-                True to convert qualified onnx `matmul` to `nn.batch_matmul` strict to NT format
-                (transpose_a=False, transpose_b=True).
-
-    export_node_renamed_model_path : str, optional
-        Export the node renamed onnx model to the path.
-        Some models do not contain names in their nodes. During the conversion, if names of nodes
-        are empty, new names will be assigned based on their op types. The exported model can be the
-        reference to spans.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation
-
-    params : dict of str to tvm.nd.NDArray
-        The parameter dict to be used by relay
-    """
-    global ONNX_DEFAULT_CONFIGS
-    if convert_config is not None:
-        ONNX_DEFAULT_CONFIGS.update(convert_config)
-
-    try:
-        import onnx
-
-        if hasattr(onnx.checker, "check_model"):
-            # try use onnx's own model checker before converting any model
-            try:
-                onnx.checker.check_model(model)
-            except Exception as e:  # pylint: disable=c-extension-no-member, broad-except
-                # the checker is a bit violent about errors, so simply print warnings here
-                warnings.warn(str(e))
-    except ImportError:
-        pass
-    g = GraphProto(shape, dtype, freeze_params, op_type_dict={})
-    graph = model.graph
-
-    try:
-        opset_in_model = 1
-        if model.opset_import:
-            # TODO: for now we only really support ai.onnx op set
-            # TODO: handle other namespaces well see https://github.com/apache/tvm/issues/10950
-            for opset_identifier in model.opset_import:
-                # As per https://github.com/onnx/onnx/blob/main/docs/IR.md
-                # All operator sets except the default one must specify the operator version
-                if str(opset_identifier.domain) in ["ai.onnx", ""]:
-                    opset_in_model = opset_identifier.version
-                    break
-    except AttributeError:
-        opset_in_model = 1
-
-    if opset is None:
-        opset = opset_in_model
-    elif opset < opset_in_model:
-        warnings.warn(
-            ""
-            f"You are overwritting original opset ver = {opset_in_model} by lower ver = {opset}. "
-            f"That might cause model conversion errors."
-        )
-
-    # Use the graph proto as a scope so that ops can access other nodes if needed.
-    with g:
-        mod, params = g.from_onnx(graph, opset)
-
-    if export_node_renamed_model_path:
-        export_model(export_node_renamed_model_path, graph)
-
-    if freeze_params:
-        mod = relay.transform.DynamicToStatic()(mod)
-
-    return mod, params
diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py
deleted file mode 100755
index e912c932233a..000000000000
--- a/python/tvm/relay/frontend/paddlepaddle.py
+++ /dev/null
@@ -1,3165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines
-# pylint: disable=import-outside-toplevel, broad-exception-raised, use-list-literal, superfluous-parens
-"""Paddle: PArallel Distributed Deep LEarning."""
-
-import warnings
-import numpy as np
-
-import tvm
-from tvm.ir import IRModule
-
-from ... import nd as _nd
-from .. import analysis
-from .. import ty as _ty
-from .. import expr as _expr
-from .. import function as _function
-from .. import ty as _ty
-from .. import op as _op
-from .. import qnn as _qnn
-from .common import (
-    autopad,
-    fold_constant,
-    get_relay_op,
-    infer_shape,
-    infer_type,
-    infer_value,
-    shape_of,
-    try_infer_value,
-    new_var,
-)
-
-__all__ = ["from_paddle"]
-
-
-def _dtype_shape_promotion(inputs):
-    """Promote data type and shape for list of tensors."""
-
-    dtype_order = ["bool", "int8", "int16", "int32", "int64", "float32", "float64"]
-
-    ranks = [len(infer_shape(x)) for x in inputs]
-    if set(ranks) == set([1, 0]):
-        for i, r in enumerate(ranks):
-            if r == 0:
-                inputs[i] = _op.expand_dims(inputs[i], axis=0)
-
-    dtypes = set(dtype_order.index(infer_type(x).checked_type.dtype) for x in inputs)
-    if len(dtypes) == 1:
-        return inputs
-    max_dtype = dtype_order[max(dtypes)]
-    for i, input_op in enumerate(inputs):
-        if infer_type(input_op).checked_type.dtype != max_dtype:
-            inputs[i] = input_op.astype(max_dtype)
-    return inputs
-
-
-def _convert_dtype_value(val):
-    """Converts a Paddle type id to a string."""
-
-    convert_dtype_map = {
-        21: "int8",
-        20: "uint8",
-        6: "float64",
-        5: "float32",
-        4: "float16",
-        3: "int64",
-        2: "int32",
-        1: "int16",
-        0: "bool",
-    }
-    if val not in convert_dtype_map:
-        msg = f"Paddle data type value {val} is not handled yet."
-        raise NotImplementedError(msg)
-    return convert_dtype_map[val]
-
-
-def convert_unary_op(g, op, block):
-    """Operator converter for all the unary operators."""
-
-    # op_map stores mapping relationship between paddlepaddle and relay
-    op_map = {"isinf_v2": _op.isinf, "isfinite_v2": _op.isfinite, "isnan_v2": _op.isnan}
-    if op.type in op_map:
-        unary_func = op_map[op.type]
-    else:
-        # while paddle operator's name is same with relay
-        unary_func = get_relay_op(op.type)
-    out = unary_func(g.get_node(op.input("X")[0]))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_binary_logical_op(g, op, block):
-    """Operator converter for logical op."""
-
-    ipt0 = g.get_node(op.input("X")[0])
-    ipt1 = g.get_node(op.input("Y")[0])
-    op_func = get_relay_op(op.type)
-    out = op_func(ipt0, ipt1)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_addmm(g, op, block):
-    """Operator converter for addmm."""
-
-    input_x = g.get_node(op.input("Input")[0])
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Y")[0])
-
-    alpha = op.attr("Alpha")
-    beta = op.attr("Beta")
-    dtype = block.var(op.output("Out")[0]).dtype
-    dtype = _convert_dtype_value(dtype)
-
-    if not isinstance(alpha, _expr.Expr) and alpha != 1:
-        alpha = _expr.const(alpha, dtype)
-        x *= alpha
-
-    if not isinstance(beta, _expr.Expr) and beta != 1:
-        beta = _expr.const(beta, dtype)
-        input_x *= beta
-
-    transposed_y = _op.transpose(y, axes=[1, 0])
-    dense_out = _op.nn.dense(x, transposed_y)
-    out = dense_out + input_x
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_arg_max_min(g, op, block):
-    """Operator converter for arg_max and arg_min."""
-
-    axis = op.attr("axis")
-    keepdims = op.attr("keepdims")
-    flatten = op.attr("flatten")
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-
-    func = _op.argmax if op.type == "arg_max" else _op.argmin
-    x = g.get_node(op.input("X")[0])
-    if axis is None or flatten:
-        x = _op.reshape(x, [-1])
-        out = func(x, axis=None, keepdims=True)
-    else:
-        out = func(x, axis=axis, keepdims=keepdims)
-    if dtype != infer_type(out).checked_type.dtype:
-        out = _op.cast(out, dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_argsort(g, op, block):
-    """Operator converter for argsort."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    descending = op.attr("descending")
-
-    out_indices = _op.argsort(x, axis, not descending, dtype="int64")
-    out = _op.gather(x, axis, out_indices)
-    g.add_node(op.output("Out")[0], out)
-    g.add_node(op.output("Indices")[0], out_indices)
-
-
-def convert_assign(g, op, block):
-    """Operator converter for assign."""
-
-    out = g.get_node(op.input("X")[0])
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_assign_value(g, op, block):
-    """Operator converter for assign_value."""
-
-    keys = ["bool_values", "fp32_values", "int32_values", "int64_values"]
-    dtypes = ["bool", "float32", "int32", "int64"]
-    for i, key in enumerate(keys):
-        dtype = dtypes[i]
-        value = np.array(op.attr(key)).astype(dtype)
-        if value is not None and value.size >= 1:
-            break
-    shape = op.attr("shape")
-    value = value.reshape(shape)
-    out = _op.const(value, dtype=dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_batch_norm(g, op, block):
-    """Operator converter for batch_norm."""
-
-    ipt_name = op.input("X")[0]
-    scale_name = op.input("Scale")[0]
-    bias_name = op.input("Bias")[0]
-    mean_name = op.input("Mean")[0]
-    variance_name = op.input("Variance")[0]
-    epsilon = op.attr("epsilon")
-    data_layout = op.attr("data_layout")
-
-    if data_layout == "NCHW":
-        axis = 1
-    elif data_layout == "NHWC":
-        axis = 3
-    else:
-        msg = f'Value {data_layout} in attribute "batch_norm" of operator Conv is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    out = _op.nn.batch_norm(
-        g.get_node(ipt_name),  # data
-        g.get_node(scale_name),  # gamma
-        g.get_node(bias_name),  # beta
-        g.get_node(mean_name),  # moving_mean
-        g.get_node(variance_name),  # moving_var
-        axis=axis,
-        epsilon=epsilon,
-    )
-    g.add_node(op.output("Y")[0], out[0])
-
-
-def convert_bmm(g, op, block):
-    """Operator converter for bmm."""
-
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Y")[0])
-    y = _op.transpose(y, [0, 2, 1])
-    out = _op.nn.batch_matmul(x, y)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_brelu(g, op, block):
-    """Operator converter for brelu."""
-
-    x = g.get_node(op.input("X")[0])
-    t_max = op.attr("t_max")
-    t_min = op.attr("t_min")
-    out = _op.tensor.clip(x, t_min, t_max)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_cast(g, op, block):
-    """Operator converter for cast."""
-
-    dtype = op.attr("out_dtype")
-    dtype = _convert_dtype_value(dtype)
-    x = g.get_node(op.input("X")[0])
-    out = _op.cast(x, dtype=dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_clip(g, op, block):
-    """Operator converter for clip."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    # if the min/max value is a tensor
-    min_max_is_tensor = False
-    if op.input("Min"):
-        min_value = g.get_node(op.input("Min")[0])
-        min_value, infered = try_infer_value(min_value, g.get_params())
-        if infered:
-            min_value = min_value.tolist()[0]
-        if isinstance(min_value, _expr.Expr):
-            min_max_is_tensor = True
-    else:
-        min_value = op.attr("min")
-
-    if op.input("Max"):
-        max_value = g.get_node(op.input("Max")[0])
-        max_value, infered = try_infer_value(max_value, g.get_params())
-        if infered:
-            max_value = max_value.tolist()[0]
-        if isinstance(max_value, _expr.Expr):
-            min_max_is_tensor = True
-    else:
-        max_value = op.attr("max")
-
-    if min_max_is_tensor:
-        if not isinstance(min_value, _expr.Expr):
-            min_value = _op.const(min_value, dtype)
-        if not isinstance(max_value, _expr.Expr):
-            max_value = _op.const(max_value, dtype)
-        out = _op.maximum(x, min_value)
-        out = _op.minimum(out, max_value)
-    else:
-        out = _op.clip(x, min_value, max_value)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_concat(g, op, block):
-    """Operator converter for concat."""
-
-    inputs = [g.get_node(op.input("X")[i]) for i in range(len(op.input("X")))]
-    axis = op.attr("axis")
-    inputs = _dtype_shape_promotion(inputs)
-    out = _op.concatenate(inputs, axis=axis)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_conv2d(g, op, block):
-    """Operator converter for conv2d."""
-
-    dilations = op.attr("dilations")
-    groups = op.attr("groups")
-    paddings = op.attr("paddings")
-    padding_algorithm = op.attr("padding_algorithm")
-    strides = op.attr("strides")
-
-    kernel = g.get_node(op.input("Filter")[0])
-    input_x = g.get_node(op.input("Input")[0])
-    data_layout = op.attr("data_format")
-    kernel_layout = "OIHW" if data_layout == "NCHW" else "HWIO"
-    out_channels, _, k_h, k_w = infer_shape(kernel)
-    if padding_algorithm == "VALID":
-        paddings = [0, 0]
-    elif padding_algorithm == "SAME":
-        # Handle history issue of PaddlePaddle
-        # while padding_algorithm == "SAME"
-        # dilations will be set to [1, 1]
-        dilations = [1, 1]
-        input_x = autopad(input_x, strides, [k_h, k_w], dilations)
-        paddings = [0, 0]
-    elif padding_algorithm == "EXPLICIT":
-        if len(paddings) == 2:
-            paddings = [paddings[0], paddings[1], paddings[0], paddings[1]]
-        elif len(paddings) == 4:
-            paddings = [paddings[0], paddings[2], paddings[1], paddings[3]]
-    else:
-        msg = f'Value {padding_algorithm} in attribute "padding" of operator Conv is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    is_quantized = op.has_attr("quantization_type")
-    # PaddlePaddle wieght layout is "OIHW", tvm need "HWIO" when op data_format is "NHWC".
-    # There are two situations when converting the data format of weights:
-    # 1 Conv_2d is not a quantified OP, its weight information is the weights themselves.
-    #   We directly convert the weight information when processing conv_2d.
-    # 2 Conv_2d is a quantified OP, and its weight information is the output of
-    #   the quantize_linear operator. Therefore, the weight information needs to be
-    #   transformed when processing the quantize_linear operator.
-    if (not is_quantized) and (data_layout == "NHWC"):
-        kernel_data = g.get_params(op.input("Filter")[0])
-        kernel_data = kernel_data.asnumpy()
-        kernel_data = kernel_data.transpose((2, 3, 1, 0))
-        kernel_data = _nd.array(kernel_data)
-        g.modify_node(op.input("Filter")[0], kernel_data)
-        kernel = g.get_node(op.input("Filter")[0])
-
-    out = _op.nn.conv2d(
-        input_x,
-        kernel,
-        strides=strides,
-        padding=paddings,
-        dilation=dilations,
-        groups=groups,
-        channels=out_channels,
-        kernel_size=[k_h, k_w],
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-    g.add_node(op.output("Output")[0], out)
-
-
-def convert_conv2d_transpose(g, op, block):
-    """Operator converter for conv2d_transpose."""
-
-    dilations = op.attr("dilations")
-    groups = op.attr("groups")
-    paddings = op.attr("paddings")
-    padding_algorithm = op.attr("padding_algorithm")
-    strides = op.attr("strides")
-    output_padding = op.attr("output_padding") if op.attr("output_padding") else [0, 0]
-
-    kernel = g.get_node(op.input("Filter")[0])
-    input_x = g.get_node(op.input("Input")[0])
-    _, out_channels, k_h, k_w = infer_shape(kernel)
-    k_size = [k_h, k_w]
-    if padding_algorithm == "VALID":
-        paddings = [0, 0]
-    elif padding_algorithm == "SAME":
-        # SAME padding of conv2d_transpose is not same with conv2d
-        # We cannot use auto_pad here, only static shape is supported now
-        dilations = [1, 1]
-        input_shape = shape_of(input_x)
-        h_w = _op.strided_slice(input_shape, [2], [4])
-        try:
-            h_w = infer_value(h_w, g.get_params()).numpy().tolist()
-        except Exception as e:
-            msg = "The SAME padding algorithm of conv2d_transpose not support dynamic shape"
-            raise tvm.error.OpAttributeInvalid(msg) from e
-        paddings = []
-        for i in range(2):
-            if strides[i] == 1 or h_w[i] % strides[i] == 0:
-                pad = max(k_size[i] - strides[i], 0)
-            else:
-                pad = max(k_size[i] - (h_w[i] % strides[i]), 0)
-            pad_before = pad // 2
-            pad_after = pad - pad_before
-            paddings.insert(-1, pad_before)
-            paddings.append(pad_after)
-    elif padding_algorithm == "EXPLICIT":
-        if len(paddings) == 2:
-            paddings = [paddings[0], paddings[1], paddings[0], paddings[1]]
-        elif len(paddings) == 4:
-            paddings = [paddings[0], paddings[2], paddings[1], paddings[3]]
-    else:
-        msg = f'Value {padding_algorithm} in attribute "padding" of operator Conv is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    out = _op.nn.conv2d_transpose(
-        input_x,
-        kernel,
-        strides=strides,
-        padding=paddings,
-        dilation=dilations,
-        groups=groups,
-        channels=out_channels * groups,
-        kernel_size=k_size,
-        output_padding=output_padding,
-    )
-    g.add_node(op.output("Output")[0], out)
-
-
-def convert_conv3d(g, op, block):
-    """Operator converter for conv3d."""
-
-    dilations = op.attr("dilations")
-    groups = op.attr("groups")
-    paddings = op.attr("paddings")
-    padding_algorithm = op.attr("padding_algorithm")
-    strides = op.attr("strides")
-
-    kernel = g.get_node(op.input("Filter")[0])
-    input_x = g.get_node(op.input("Input")[0])
-    data_layout = op.attr("data_format")
-    out_channels, _, k_d, k_h, k_w = infer_shape(kernel)
-    if padding_algorithm == "VALID":
-        paddings = [0, 0, 0]
-    elif padding_algorithm == "SAME":
-        dilations = [1, 1, 1]
-        input_x = autopad(input_x, strides, [k_d, k_h, k_w], dilations)
-        paddings = [0, 0, 0]
-    elif padding_algorithm == "EXPLICIT":
-        if len(paddings) == 3:
-            paddings = [
-                paddings[0],
-                paddings[1],
-                paddings[2],
-                paddings[0],
-                paddings[1],
-                paddings[2],
-            ]
-        elif len(paddings) == 6:
-            paddings = [
-                paddings[0],
-                paddings[3],
-                paddings[1],
-                paddings[4],
-                paddings[2],
-                paddings[5],
-            ]
-    else:
-        msg = f'Value {padding_algorithm} in attribute "padding" of operator Conv is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    out = _op.nn.conv3d(
-        input_x,
-        kernel,
-        strides=strides,
-        padding=paddings,
-        dilation=dilations,
-        groups=groups,
-        channels=out_channels,
-        kernel_size=[k_d, k_h, k_w],
-        data_layout=data_layout,
-    )
-    g.add_node(op.output("Output")[0], out)
-
-
-def convert_dist(g, op, block):
-    """Operator converter for dist."""
-
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Y")[0])
-    z = _op.abs(_op.subtract(x, y))
-    dtype = infer_type(x).checked_type.dtype
-    p = op.attr("p")
-    if p == np.inf:
-        out = _op.reduce.max(z)
-    elif p == -np.inf:
-        out = _op.reduce.min(z)
-    elif p == 0.0:
-        out = _op.reduce.sum(_op.sign(z))
-    else:
-        inv_p = _expr.const(1.0 / p, dtype=dtype)
-        p = _expr.const(p, dtype=dtype)
-        power_z = _op.power(z, p)
-        sum_pow = _op.reduce.sum(power_z)
-        out = _op.power(sum_pow, inv_p)
-    out = _op.full(out, shape=(1))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_cumsum(g, op, block):
-    """Operator converter for cumsum."""
-
-    axis = op.attr("axis")
-    exclusive = op.attr("exclusive")
-    flatten = op.attr("flatten")
-    reverse = op.attr("reverse")
-
-    x = g.get_node(op.input("X")[0])
-    if axis is None or flatten:
-        x = _op.reshape(x, [-1])
-    if reverse:
-        x = _op.reverse(x, axis=axis)
-        out = _op.cumsum(x, axis=axis, exclusive=exclusive)
-        out = _op.reverse(out, axis=axis)
-    else:
-        out = _op.cumsum(x, axis=axis, exclusive=exclusive)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_dropout(g, op, block):
-    """Operator converter for dropout."""
-
-    x = g.get_node(op.input("X")[0])
-    dropout_prob = op.attr("dropout_prob")
-    dropout_implementation = op.attr("dropout_implementation")
-    if dropout_implementation == "downgrade_in_infer":
-        out = _op.nn.dropout(x, dropout_prob) * _expr.const(1 - dropout_prob, dtype="float32")
-    else:
-        out = _op.nn.dropout(x, dropout_prob)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_dot(g, op, block):
-    """Operator converter for dot."""
-
-    # x, y should be 1D or 2D tensor
-    # when it's 2D tensor, the first dimension means batch dimension
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Y")[0])
-
-    out = _op.sum(_op.multiply(x, y), axis=[-1], keepdims=True)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_elementwise_op(g, op, block):
-    """Operator converter for all the elementwise operators."""
-
-    op_map = {
-        "elementwise_div": "divide",
-        "elementwise_add": "add",
-        "elementwise_mul": "multiply",
-        "elementwise_sub": "subtract",
-        "elementwise_mod": "mod",
-        "elementwise_max": "maximum",
-        "elementwise_min": "minimum",
-        "elementwise_pow": "power",
-        "elementwise_floordiv": "floor_divide",
-        "equal": "equal",
-        "greater_equal": "greater_equal",
-        "greater_than": "greater",
-        "less_equal": "less_equal",
-        "less_than": "less",
-        "not_equal": "not_equal",
-    }
-    op_func = op_map[op.type]
-    ipt0 = g.get_node(op.input("X")[0])
-    ipt1 = g.get_node(op.input("Y")[0])
-    ipt0_shape = infer_shape(ipt0)
-    ipt1_shape = infer_shape(ipt1)
-    axis = op.attr("axis")
-    if len(ipt0_shape) != len(ipt1_shape):
-        if axis < 0:
-            axis = axis + len(ipt0_shape)
-        if axis != len(ipt0_shape) - 1:
-            ipt1 = _op.expand_dims(ipt1, axis=axis, num_newaxis=(len(ipt0_shape) - axis - 1))
-    op_func = get_relay_op(op_func)
-    out = op_func(ipt0, ipt1)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_linspace(g, op, block):
-    """Operator converter for linspace."""
-
-    start = g.get_node(op.input("Start")[0])
-    stop = g.get_node(op.input("Stop")[0])
-    num = g.get_node(op.input("Num")[0])
-    dtype = _convert_dtype_value(op.attr("dtype"))
-
-    start = _op.cast(start, dtype)
-    stop = _op.cast(stop, dtype)
-    num = _op.cast(num, dtype)
-
-    if dtype in ["int32", "float32"]:
-        tmp_dtype = "float32"
-    else:
-        tmp_dtype = "float64"
-    start = _op.cast(start, tmp_dtype)
-    stop = _op.cast(stop, tmp_dtype)
-    num = _op.cast(num, tmp_dtype)
-    const_one = _expr.const(1, tmp_dtype)
-    const_zero = _expr.const(0, tmp_dtype)
-    seg_num = _op.where(num > const_one, num - const_one, num - const_zero)
-    seg_len = _op.subtract(stop, start)
-    step_len = _op.divide(seg_len, seg_num)
-    step_cnt = _op.argwhere(_op.ones(num, dtype=tmp_dtype))
-    step_cnt = _op.cast(step_cnt, dtype=tmp_dtype)
-    out = _op.multiply(step_len, step_cnt)
-    out = _op.add(start, out)
-    out = _op.squeeze(out, axis=[1])
-    out = _op.cast(out, dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_elu(g, op, block):
-    """Operator converter for elu."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    alpha = op.attr("alpha")
-    alpha = _expr.const(-1.0 * alpha, dtype=dtype)
-    out = alpha * _op.nn.relu(_expr.const(1, dtype=dtype) - _op.exp(x)) + _op.nn.relu(x)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_expand(g, op, block):
-    """Operator converter for expand."""
-
-    x = g.get_node(op.input("X")[0])
-    if op.input("Shape"):
-        sizes = g.get_node(op.input("Shape")[0])
-    else:
-        sizes = op.attr("shape")
-
-    if isinstance(sizes, _expr.Expr):
-        sizes = try_infer_value(sizes, parameters=g.get_params())[0]
-
-    if isinstance(sizes, np.ndarray):
-        sizes = sizes.tolist()
-
-    out = _op.broadcast_to(x, sizes)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_expand_as(g, op, block):
-    """Operator converter for expand_as."""
-
-    x = g.get_node(op.input("X")[0])
-    target_shape = op.attr("target_shape")
-    out = _op.broadcast_to(x, target_shape)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_eye(g, op, block):
-    """Operator converter for eye."""
-
-    num_rows = op.attr("num_rows")
-    num_columns = op.attr("num_columns")
-    if num_columns == -1:
-        num_columns = num_rows
-    one_nums = min(num_rows, num_columns)
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-
-    zeros = _op.zeros((num_rows, num_columns), dtype)
-    if one_nums == 0:
-        out = zeros
-    else:
-        ones = _op.ones(one_nums, dtype)
-        indices = _op.arange(
-            _expr.const(0, dtype="int32"), _expr.const(one_nums, dtype="int32"), dtype="int32"
-        )
-        out = _op.scatter_nd(zeros, _op.stack([indices, indices], axis=0), ones, "update")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_feed(g, op, block):
-    """Converter for model input node."""
-
-    if block is not None:
-        ipt_name = op.output("Out")[0]
-        ipt_shape = block.var(ipt_name).shape
-        ipt_dtype = block.var(ipt_name).dtype
-        ipt_dtype = str(ipt_dtype).strip().split(".")[1]
-    else:
-        ipt_shape = op.shape
-        ipt_dtype = str(op.dtype).strip().split(".")[1]
-        ipt_name = op.name
-    if g.shape_dict is not None:
-        ipt_shape = g.shape_dict[ipt_name]
-
-    if isinstance(ipt_shape, tuple):
-        ipt_shape = list(ipt_shape)
-    for i, s in enumerate(ipt_shape):
-        if s < 0:
-            ipt_shape[i] = _ty.Any()
-    out = new_var(ipt_name, shape=ipt_shape, dtype=ipt_dtype)
-    g.add_node(ipt_name, out)
-
-
-def convert_fill_any_like(g, op, block):
-    """Operator converter for fill_any_like."""
-
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-    x = g.get_node(op.input("X")[0])
-    value = _expr.const(op.attr("value"), dtype=dtype)
-    out = _op.transform.full_like(x, value).astype(dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_fill_constant(g, op, block):
-    """Operator converter for fill_constant."""
-
-    value = op.attr("value")
-    shape = block.var(op.output("Out")[0]).shape
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-    value = _expr.const(value).astype(dtype)
-    if "ValueTensor" in op.input_names and op.input("ValueTensor"):
-        shape = g.get_node(op.input("ValueTensor")[0])
-    if "ShapeTensor" in op.input_names and op.input("ShapeTensor"):
-        shape = g.get_node(op.input("ShapeTensor")[0])
-
-    if isinstance(shape, _expr.Expr):
-        shape = try_infer_value(shape, parameters=g.get_params())[0]
-
-    if isinstance(shape, np.ndarray):
-        shape = shape.tolist()
-
-    out = _op.full(value, shape=shape, dtype=dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_fill_constant_batch_size_like(g, op, block):
-    """Operator converter for fill_constant_batch_size_like."""
-
-    x = g.get_node(op.input("Input")[0])
-    value = op.attr("value")
-    shape = op.attr("shape")
-    input_dim_idx = op.attr("input_dim_idx")
-    output_dim_idx = op.attr("output_dim_idx")
-    dtype = op.attr("dtype")
-
-    dtype = _convert_dtype_value(dtype)
-    input_shape = shape_of(x)
-    batch = _op.strided_slice(input_shape, begin=[input_dim_idx], end=[input_dim_idx + 1]).astype(
-        "int32"
-    )
-    shape_before = shape[:output_dim_idx]
-    shape_before = _expr.const(shape_before, dtype="int32")
-    shape_after = shape[output_dim_idx + 1 :]
-    shape_after = _expr.const(shape_after, dtype="int32")
-
-    out_shape = _op.concatenate([shape_before, batch, shape_after], axis=0)
-    out_shape, infered = try_infer_value(out_shape, g.get_params())
-    if infered:
-        out_shape = out_shape.tolist()
-    constant = _expr.const(value, dtype=dtype).astype(dtype)
-    out = _op.full(constant, out_shape, dtype=dtype)
-
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_fill_zeros_like(g, op, block):
-    """Operator converter for fill_zeros_like."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-    value = _expr.const(0, dtype=dtype)
-    out = _op.transform.full_like(x, value).astype(dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_flatten(g, op, block):
-    """Operator converter for flatten."""
-
-    x = g.get_node(op.input("X")[0])
-    input_shape = list(infer_shape(x))
-
-    start = op.attr("start_axis")
-    end = op.attr("stop_axis")
-    ndim = len(input_shape)
-    if end < 0:
-        end += ndim
-    new_shape = [0] * start
-
-    new_shape.append(-1)
-    squeeze_axes = []
-    for i in range(start + 1, end + 1):
-        new_shape.append(1)
-        squeeze_axes.append(i)
-    for _ in range(end + 1, ndim):
-        new_shape.append(0)
-    out = _op.reshape(x, new_shape)
-    if squeeze_axes:
-        out = _op.squeeze(out, axis=squeeze_axes)
-
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_flip(g, op, block):
-    """Operator converter for flip."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-
-    for i, ax in enumerate(axis):
-        if i == 0:
-            out = _op.reverse(x, ax)
-        else:
-            out = _op.reverse(out, ax)
-
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_gather(g, op, block):
-    """Operator converter for gather."""
-
-    x = g.get_node(op.input("X")[0])
-    index = g.get_node(op.input("Index")[0])
-    axis = op.attr("axis")
-    out = _op.take(x, index, axis)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_gather_nd(g, op, block):
-    """Operator converter for gather_nd."""
-
-    x = g.get_node(op.input("X")[0])
-    index = g.get_node(op.input("Index")[0])
-    shape = infer_shape(index)
-    perm = list(range(0, len(shape) - 1))
-    perm.insert(0, len(shape) - 1)
-    index = _op.transpose(index, axes=perm)
-    out = _op.gather_nd(x, index, 0, shape[-1])
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_gaussian_random(g, op, block):
-    """Operator converter for convert_gaussian_random."""
-
-    mean = op.attr("mean")
-    std = op.attr("std")
-    shape = op.attr("shape")
-    seed = op.attr("seed")
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-    out = _op.random.normal(key=seed, shape=shape, dtype=dtype, mean=mean, scale=std)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_gelu(g, op, block):
-    """Operator converter for gelu."""
-
-    x = g.get_node(op.input("X")[0])
-    out = x * (
-        _expr.const(0.5, dtype="float32")
-        + _op.erf(x * _expr.const(0.5**0.5, dtype="float32")) * _expr.const(0.5, dtype="float32")
-    )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_grid_sampler(g, op, block):
-    """Operator converter for grid_sampler."""
-
-    x = g.get_node(op.input("X")[0])
-    data_shape = infer_shape(x)
-    grid = g.get_node(op.input("Grid")[0])
-    mode = op.attr("mode")
-    padding_mode = op.attr("padding_mode")
-    align_corners = op.attr("align_corners")
-
-    if len(data_shape) == 4:
-        layout = "NCHW"
-        axes = [0, 3, 1, 2]
-        grid = _op.transform.transpose(grid, axes)
-    elif len(data_shape) == 5:
-        layout = "NCDHW"
-        axes = [0, 4, 1, 2, 3]
-        grid = _op.transform.transpose(grid, axes)
-    else:
-        msg = "only 4D and 5D are supported."
-        raise ValueError(msg)
-
-    out = _op.image.grid_sample(x, grid, mode, layout, padding_mode, align_corners)
-    g.add_node(op.output("Output")[0], out)
-
-
-def convert_group_norm(g, op, block):
-    """Operator converter for group_norm."""
-
-    x = g.get_node(op.input("X")[0])
-    num_groups = op.attr("groups")
-    epsilon = op.attr("epsilon")
-    gamma = g.get_node(op.input("Scale")[0])
-    beta = g.get_node(op.input("Bias")[0])
-    out = _op.nn.group_norm(
-        x,
-        gamma=gamma,
-        beta=beta,
-        num_groups=num_groups,
-        axis=1,
-        epsilon=epsilon,
-        center=True,
-        scale=True,
-    )
-    g.add_node(op.output("Y")[0], out)
-
-
-def convert_hard_shrink(g, op, block):
-    """Operator converter for hard_shrink."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    threshold = op.attr("threshold")
-    threshold = _op.const(threshold, dtype)
-    out = _op.logical_or(x < _op.const(-1.0, dtype) * threshold, x > threshold)
-    out = _op.cast(out, dtype) * x
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_hard_sigmoid(g, op, block):
-    """Operator converter for hard_sigmoid."""
-
-    slope = op.attr("slope")
-    offset = op.attr("offset")
-    x = g.get_node(op.input("X")[0])
-    out = x * _expr.const(slope) + _expr.const(offset)
-    out = _op.clip(out, 0, 1)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_hard_swish(g, op, block):
-    """Operator converter for hard_swish."""
-
-    offset = op.attr("offset")
-    scale = op.attr("scale")
-    threshold = op.attr("threshold")
-    assert np.isclose(offset, 3.0), "Only support offset==3.0 for PaddlePaddle's hard_swish"
-    assert np.isclose(scale, 6.0), "Only support scale==6.0 for PaddlePaddle's hard_swish"
-    assert np.isclose(threshold, 6.0), "Only support threshold==6.0 for PaddlePaddle's hard_swish"
-    x = g.get_node(op.input("X")[0])
-    out = _op.clip(x, -1 * offset, offset)
-    out = out / _expr.const(threshold) + _expr.const(0.5)
-    out = x * out
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_interpolate(g, op, block):
-    """Operator converter for interpolate."""
-
-    def get_interpolate_mode(op):
-        """Get parameters for interpolation methods."""
-
-        interp_method = op.attr("interp_method")
-        align_corners = op.attr("align_corners")
-        align_mode = op.attr("align_mode")
-
-        rounding_method = ""
-        if interp_method == "nearest":
-            interp_method = "nearest_neighbor"
-            coordinate_transformation_mode = "asymmetric"
-            rounding_method = "floor"
-        elif interp_method == "bilinear":
-            interp_method = "linear"
-            if not align_corners and align_mode == 0:
-                coordinate_transformation_mode = "half_pixel"
-            else:
-                if align_corners:
-                    coordinate_transformation_mode = "align_corners"
-                else:
-                    coordinate_transformation_mode = "asymmetric"
-        elif interp_method == "bicubic":
-            interp_method = "cubic"
-            if align_corners:
-                coordinate_transformation_mode = "align_corners"
-            else:
-                coordinate_transformation_mode = "half_pixel"
-        else:
-            msg = f"interp_method {interp_method} is not supported for PaddlePaddle's interpolate"
-            raise tvm.error.OpAttributeInvalid(msg)
-        return rounding_method, interp_method, coordinate_transformation_mode
-
-    layout = op.attr("data_layout")
-    out_h = op.attr("out_h")
-    out_w = op.attr("out_w")
-    scale = op.attr("scale")
-    if not isinstance(scale, (list, tuple)):
-        scale = [scale, scale]
-
-    x = g.get_node(op.input("X")[0])
-    x_shape = infer_shape(x)
-    assert len(x_shape) == 4, "Only 4D input tensor is supported for PaddlePaddle's interpolate"
-    input_out_size = op.input("OutSize")
-    input_size_tensor = op.input("SizeTensor")
-    input_scale = op.input("Scale")
-    rounding_method, interp_method, coordinate_transformation_mode = get_interpolate_mode(op)
-
-    if input_size_tensor:
-        # if out_size is a list of tensor
-        out_size = list()
-        for name in input_size_tensor:
-            size = g.get_node(name)
-            if len(infer_shape(size)) == 0:
-                size = _op.reshape(size, [-1])
-            out_size.append(size)
-        out_size = _op.concatenate(out_size, axis=0)
-        out_size, infered = try_infer_value(out_size, parameters=g.get_params())
-        if infered:
-            out_size = out_size.tolist()
-    elif input_scale:
-        # if out_size is not defined, but scale is defined
-        input_scale = g.get_node(input_scale[0])
-        input_shape = shape_of(x).astype("float32")
-        if layout.startswith("NC"):
-            out_size = _op.strided_slice(input_shape, begin=[2], end=[4]) * input_scale
-        else:
-            out_size = _op.strided_slice(input_shape, begin=[1], end=[3]) * input_scale
-        out_size = out_size.astype("int32")
-        out_size, infered = try_infer_value(out_size, parameters=g.get_params())
-        if infered:
-            out_size = out_size.tolist()
-    elif scale and scale[0] > 0 and scale[1] > 0:
-        # use attribute scale
-        input_shape = shape_of(x).astype("float32")
-        input_scale = _expr.const(np.array([scale[0], scale[1]]).astype("float32"))
-        if layout.startswith("NC"):
-            out_size = _op.strided_slice(input_shape, begin=[2], end=[4]) * input_scale
-        else:
-            out_size = _op.strided_slice(input_shape, begin=[1], end=[3]) * input_scale
-        out_size = out_size.astype("int32")
-        out_size, infered = try_infer_value(out_size, parameters=g.get_params())
-        if infered:
-            out_size = out_size.tolist()
-    elif input_out_size:
-        # if out_size is a tensor
-        out_size = g.get_node(input_out_size[0])
-        out_size, infered = try_infer_value(out_size, parameters=g.get_params())
-        if infered:
-            out_size = out_size.tolist()
-    else:
-        # if out_size is a constant value
-        out_size = [out_h, out_w]
-
-    out = _op.image.resize2d(
-        x,
-        size=out_size,
-        layout=layout,
-        method=interp_method,
-        coordinate_transformation_mode=coordinate_transformation_mode,
-        rounding_method=rounding_method,
-        cubic_alpha=-0.75,
-    )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_index_select(g, op, block):
-    """Operator converter for index_select."""
-
-    x = g.get_node(op.input("X")[0])
-    index = g.get_node(op.input("Index")[0])
-    axis = op.attr("dim")
-    out = _op.transform.take(x, index, axis, mode="wrap")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_instance_norm(g, op, block):
-    """Operator converter for instance_norm."""
-
-    x = g.get_node(op.input("X")[0])
-    gamma = g.get_node(op.input("Scale")[0])
-    beta = g.get_node(op.input("Bias")[0])
-    epsilon = op.attr("epsilon")
-
-    scale = center = True
-    out = _op.nn.instance_norm(x, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale)
-    g.add_node(op.output("Y")[0], out)
-
-
-def convert_layer_norm(g, op, block):
-    """Operator converter for layer_norm."""
-
-    begin_norm_axis = op.attr("begin_norm_axis")
-    epsilon = op.attr("epsilon")
-    x = g.get_node(op.input("X")[0])
-    bias_input = op.input("Bias")
-    scale_input = op.input("Scale")
-
-    x_shape = infer_shape(x)
-    assert begin_norm_axis in (
-        len(x_shape) - 1,
-        -1,
-    ), "Support only normalization over last one dimension."
-
-    if bias_input:
-        bias = g.get_node(bias_input[0])
-    else:
-        bias = _expr.const(np.zeros(x_shape[begin_norm_axis]))
-
-    if scale_input:
-        scale = g.get_node(scale_input[0])
-    else:
-        scale = _expr.const(np.ones(x_shape[begin_norm_axis]))
-
-    out = _op.nn.layer_norm(
-        x, gamma=scale, beta=bias, axis=begin_norm_axis, epsilon=epsilon, center=True, scale=True
-    )
-    g.add_node(op.output("Y")[0], out)
-
-
-def convert_leaky_relu(g, op, block):
-    """Operator converter for leaky_relu."""
-
-    alpha = op.attr("alpha")
-    x = g.get_node(op.input("X")[0])
-    out = _op.nn.leaky_relu(x, alpha=alpha)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_log1p(g, op, block):
-    """Operator converter for log1p."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    one = _expr.const(1, dtype=dtype)
-    out = _op.log(x + one)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_logical_not(g, op, block):
-    """Operator converter for logical_not op."""
-
-    ipt0 = g.get_node(op.input("X")[0])
-    op_func = get_relay_op(op.type)
-    out = op_func(ipt0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_logsigmoid(g, op, block):
-    """Operator converter for logsigmoid."""
-
-    x = g.get_node(op.input("X")[0])
-    out = _op.log(_op.tensor.sigmoid(x))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_logsoftmax(g, op, block):
-    """Operator converter for logsoftmax."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    ndim = len(infer_shape(x))
-    if axis < 0:
-        axis += ndim
-    m = _op.max(x, [axis], keepdims=True)
-    e = _op.exp(x - m)
-    s = _op.sum(e, [axis], keepdims=True)
-    out = x - m - _op.log(s)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_logsumexp(g, op, block):
-    """Operator converter for logsumexp."""
-
-    input_x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    if op.attr("reduce_all"):
-        axis = None
-    keepdims = op.attr("keepdim")
-    out = get_relay_op("logsumexp")(input_x, axis=axis, keepdims=keepdims)
-    if not axis and not keepdims:
-        out = _op.expand_dims(out, axis=0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_lookup_table(g, op, block):
-    """Operator converter for lookup_table_v2."""
-
-    indices = g.get_node(op.input("Ids")[0])
-    padding_idx = op.attr("padding_idx")
-    weights = g.get_node(op.input("W")[0])
-    if padding_idx != -1:
-        if op.input("W")[0] in g.get_params():
-            weights = g.get_params(op.input("W")[0])
-            weights[padding_idx] = 0.0
-            weights = _expr.const(weights)
-        else:
-            shape, infered = try_infer_value(shape_of(weights), g.get_params())
-            if infered:
-                shape = shape.tolist()
-            assert not isinstance(
-                shape, _expr.Expr
-            ), "Shape of weight has to be fixed for PaddlePaddle's lookup_table"
-            filters = np.ones(shape).astype(infer_type(weights).checked_type.dtype)
-            filters[padding_idx] = 0.0
-            filters = _expr.const(filters)
-            weights = weights * filters
-    out = _op.take(weights, indices.astype("int32"), axis=0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_matmul(g, op, block):
-    """Operator converter for matmul."""
-
-    inputs = [g.get_node(op.input("X")[0]), g.get_node(op.input("Y")[0])]
-    a_shape = infer_shape(inputs[0])
-    b_shape = infer_shape(inputs[1])
-    if op.has_attr("trans_x"):
-        # for matmul_v2
-        trans_x = op.attr("trans_x")
-        trans_y = op.attr("trans_y")
-    else:
-        # for matmul
-        trans_x = op.attr("transpose_X")
-        trans_y = op.attr("transpose_Y")
-    if trans_x:
-        perm = list(range(len(a_shape)))
-        perm[-2] = len(a_shape) - 1
-        perm[-1] = len(a_shape) - 2
-        inputs[0] = _op.transpose(inputs[0], axes=perm)
-    if trans_y:
-        perm = list(range(len(b_shape)))
-        perm[-2] = len(b_shape) - 1
-        perm[-1] = len(b_shape) - 2
-        inputs[1] = _op.transpose(inputs[1], axes=perm)
-
-    # This implemention almost keeps same with ONNX
-    # Need to check input shape as batch matmul must be supported.
-    a_rank = len(a_shape)
-    b_rank = len(b_shape)
-    # When performing a batch matmul, we need to properly handle N-dim shapes.
-    if a_rank > 2 or b_rank > 2:
-        a_shape = shape_of(inputs[0], dtype="int32")
-        b_shape = shape_of(inputs[1], dtype="int32")
-
-        def flatten_to_nd(x, x_shape, nd=3):
-            ndims = infer_shape(x_shape)[0]
-            if ndims == nd:
-                return x
-            newshape = _op.concatenate(
-                [
-                    _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype),
-                    _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]),
-                ],
-                0,
-            )
-            out = _op.reshape(x, fold_constant(newshape))
-            return out
-
-        b_type = infer_type(inputs[1])
-        # Convert to dense if the second matrix is 2d and non-dynamic
-        if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type):
-            a = flatten_to_nd(inputs[0], a_shape, 2)
-            b = _op.transpose(inputs[1])
-            output = _op.nn.dense(a, b)
-        else:
-            # Convert a and b into 3 dimensional tensors.
-            a = flatten_to_nd(inputs[0], a_shape, 3)
-            b = flatten_to_nd(inputs[1], b_shape, 3)
-            # Transpose matrix dimensions of b.
-            b = _op.transpose(b, [0, 2, 1])
-            # Perform a batch matmul.
-            output = _op.nn.batch_matmul(a, b)
-        # Determine the output batch dimension.
-        if a_rank > b_rank:
-            out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2])
-        elif a_rank < b_rank:
-            out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2])
-        # If its unclear how broadcasting should be applied, the output
-        # shape is determined by choosing the maximum value from each input.
-        else:
-            out_batch = _op.concatenate(
-                [
-                    _op.maximum(
-                        _op.strided_slice(a_shape, [i], [i + 1]),
-                        _op.strided_slice(b_shape, [i], [i + 1]),
-                    )
-                    for i in range(a_rank - 2)
-                ],
-                0,
-            )
-        # Reshape output to original dimensions.
-        final_shape = _op.concatenate(
-            [
-                out_batch,
-                _op.strided_slice(
-                    a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1]
-                ),
-                _op.strided_slice(
-                    b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]]
-                ),
-            ],
-            0,
-        )
-        out = _op.reshape(output, fold_constant(final_shape))
-    else:
-        if b_rank == 1:
-            inputs[1] = _op.expand_dims(inputs[1], 1, 1)
-        # Otherwise a simple dense op will get the job done.
-        input_1_t = _op.transpose(inputs[1], axes=(1, 0))
-        out = _op.nn.dense(inputs[0], input_1_t)
-        if b_rank == 1:
-            out = _op.squeeze(out, axis=[-1])
-    if op.has_attr("alpha"):
-        alpha = op.attr("alpha")
-        if not np.isclose(alpha, 1.0):
-            out = out * _expr.const(alpha).astype("float32")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_meshgrid(g, op, block):
-    """Operator converter for meshgrid."""
-
-    inputs = op.input("X")
-    x = [g.get_node(i) for i in inputs]
-    outs = _op.meshgrid(x, indexing="ij")
-    for i, out in enumerate(outs):
-        g.add_node(op.output("Out")[i], out)
-
-
-def convert_mish(g, op, block):
-    """Operator converter for mish."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    exp = _op.exp(x)
-    add = _op.add(exp, _expr.const(1.0, dtype))
-    log = _op.log(add)
-    tanh = _op.tanh(log)
-    out = _op.multiply(x, tanh)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_mul(g, op, block):
-    """Operator converter for mul."""
-
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Y")[0])
-    x_num_col_dims = op.attr("x_num_col_dims")
-    y_num_col_dims = op.attr("y_num_col_dims")
-    x_shape = shape_of(x, dtype="int32")
-    y_shape = shape_of(y, dtype="int32")
-    x_dim = infer_shape(x_shape)[0]
-    y_dim = infer_shape(y_shape)[0]
-    if x_num_col_dims < 0:
-        x_num_col_dims += x_dim
-    if y_num_col_dims < 0:
-        y_num_col_dims += y_dim
-    if x_num_col_dims == 1:
-        x = _op.nn.batch_flatten(x)
-    else:
-        pre_shape = _op.prod(_op.strided_slice(x_shape, [0], [x_num_col_dims], [1]), keepdims=True)
-        post_shape = _op.prod(
-            _op.strided_slice(x_shape, [x_num_col_dims], [x_dim], [1]), keepdims=True
-        )
-        new_shape = _op.concatenate([pre_shape, post_shape], axis=0)
-        new_shape = fold_constant(new_shape)
-        x = _op.reshape(x, new_shape)
-    if y_num_col_dims == 1:
-        y = _op.nn.batch_flatten(y)
-    else:
-        pre_shape = _op.prod(_op.strided_slice(y_shape, [0], [y_num_col_dims], [1]), keepdims=True)
-        post_shape = _op.prod(
-            _op.strided_slice(y_shape, [y_num_col_dims], [y_dim], [1]), keepdims=True
-        )
-        new_shape = _op.concatenate([pre_shape, post_shape], axis=0)
-        new_shape = fold_constant(new_shape)
-        y = _op.reshape(y, new_shape)
-    y = _op.transpose(y)
-    out = _op.nn.dense(x, y)
-    out_pre_shape = _op.strided_slice(x_shape, [0], [x_num_col_dims], [1])
-    out_post_shape = _op.strided_slice(y_shape, [y_num_col_dims], [y_dim], [1])
-    out_shape = _op.concatenate([out_pre_shape, out_post_shape], axis=0)
-    out_shape = fold_constant(out_shape)
-    out = _op.reshape(out, out_shape)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_mv(g, op, block):
-    """Operator converter for mv."""
-
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Vec")[0])
-    y = _op.expand_dims(y, axis=-1)
-    y = _op.transpose(y)
-    out = _op.nn.dense(x, y)
-    out = _op.squeeze(out, axis=[-1])
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_norm(g, op, block):
-    """Operator converter for norm."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    axis_l = [axis]
-    epsilon = op.attr("epsilon")
-    out = _op.nn.l2_normalize(x, epsilon, axis_l)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_one_hot_v2(g, op, block):
-    """Operator converter for one_hot_v2."""
-
-    x = g.get_node(op.input("X")[0])
-    depth = op.attr("depth")
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-    ndim = len(infer_shape(x))
-    on_value = _op.const(1)
-    off_value = _op.const(0)
-    axis = ndim
-    out = _op.one_hot(x, on_value, off_value, depth, axis, dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_p_norm(g, op, blcok):
-    """Operator converter for p_norm."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    p = op.attr("porder")
-    keepdim = op.attr("keepdim")
-    p_node = _expr.const(p, dtype="float32")
-    abs_node = _op.abs(x)
-    pow_node = _op.power(abs_node, p_node)
-    reduce_sum = _op.sum(pow_node, axis=[axis], keepdims=keepdim)
-    p_node1 = _expr.const(1.0 / p, dtype="float32")
-    out = _op.power(reduce_sum, p_node1)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_padding(g, op, block):
-    """Operator converter for padding."""
-
-    input_x = g.get_node(op.input("X")[0])
-    input_padding = op.input("Paddings")
-    if input_padding:
-        padding = g.get_node(input_padding[0])
-        padding = infer_value(padding, g.get_params()).numpy().tolist()
-    else:
-        padding = op.attr("paddings")
-    padding = op.attr("paddings")
-    value = op.attr("value")
-    data_format = op.attr("data_format")
-    mode = op.attr("mode")
-    assert mode != "circular", "Don't support mod='circular' for PaddlePaddle's padding"
-    if mode == "replicate":
-        mode = "edge"
-
-    pad_len = len(padding)
-    new_paddings = [0] * (pad_len + 4)
-    for i in range(0, pad_len, 2):
-        index = -1 - i
-        if data_format[:2] != "NC":
-            index = -3 - i
-        new_paddings[index] = padding[i + 1]
-        new_paddings[index - 1] = padding[i]
-
-    new_paddings = [new_paddings[i : i + 2] for i in range(0, len(new_paddings), 2)]
-
-    out = _op.nn.pad(input_x, new_paddings, pad_value=value, pad_mode=mode)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_pixel_shuffle(g, op, block):
-    """Operator converter for pixel_shuffle."""
-
-    x = g.get_node(op.input("X")[0])
-    upscale_factor = op.attr("upscale_factor")
-    data_format = op.attr("data_format")
-    out = _op.nn.depth_to_space(x, block_size=upscale_factor, layout=data_format, mode="CRD")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_pool2d(g, op, block):
-    """Operator converter for pool2d."""
-
-    adaptive = op.attr("adaptive")
-    ceil_mode = op.attr("ceil_mode")
-    global_pooling = op.attr("global_pooling")
-    ksize = op.attr("ksize")
-    paddings = op.attr("paddings")
-    padding_algorithm = op.attr("padding_algorithm")
-    pooling_type = op.attr("pooling_type")
-    data_format = op.attr("data_format")
-
-    if global_pooling:
-        adaptive = True
-        ksize = [1, 1]
-
-    input_x = g.get_node(op.input("X")[0])
-    _, _, in_h, in_w = infer_shape(input_x)
-
-    op_map = {"avg": "avg_pool2d", "max": "max_pool2d"}
-
-    strides = op.attr("strides")
-    if isinstance(strides, int):
-        strides = [strides, strides]
-    if isinstance(ksize, int):
-        ksize = [ksize, ksize]
-    if isinstance(paddings, int):
-        paddings = [paddings] * 2
-
-    if padding_algorithm == "VALID":
-        paddings = [0, 0]
-    elif padding_algorithm == "SAME":
-        input_x = autopad(input_x, strides, ksize)
-        paddings = [0, 0]
-    elif padding_algorithm == "EXPLICIT":
-        if len(paddings) == 2:
-            paddings = [paddings[0], paddings[1], paddings[0], paddings[1]]
-        elif len(paddings) == 4:
-            paddings = [paddings[0], paddings[2], paddings[1], paddings[3]]
-    else:
-        msg = f'Value {padding_algorithm} in attribute "padding" of operator Pool2d is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    # handle with special case
-    # while kernel size less than input size
-    # shrink kernel size to input size
-    if (
-        not isinstance(in_h, _op.Expr)
-        and padding_algorithm == "EXPLICIT"
-        and in_h + paddings[0] + paddings[2] < ksize[0]
-    ):
-        ksize[0] = in_h
-    if (
-        not isinstance(in_w, _op.Expr)
-        and padding_algorithm == "EXPLICIT"
-        and in_w + paddings[1] + paddings[3] < ksize[1]
-    ):
-        ksize[1] = in_w
-
-    if not adaptive:
-        if pooling_type == "avg":
-            exclusive = op.attr("exclusive")
-            out = _op.nn.avg_pool2d(
-                input_x,
-                pool_size=ksize,
-                strides=strides,
-                padding=paddings,
-                ceil_mode=ceil_mode,
-                count_include_pad=not exclusive,
-                layout=data_format,
-            )
-        else:
-            out = getattr(_op.nn, op_map[pooling_type])(
-                input_x,
-                pool_size=ksize,
-                strides=strides,
-                padding=paddings,
-                ceil_mode=ceil_mode,
-                layout=data_format,
-            )
-    else:
-        out = getattr(_op.nn, "adaptive_" + op_map[pooling_type])(
-            input_x, output_size=ksize, layout=data_format
-        )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_pool3d(g, op, block):
-    """Operator converter for pool3d."""
-
-    adaptive = op.attr("adaptive")
-    ceil_mode = op.attr("ceil_mode")
-    global_pooling = op.attr("global_pooling")
-    ksize = op.attr("ksize")
-    paddings = op.attr("paddings")
-    padding_algorithm = op.attr("padding_algorithm")
-    pooling_type = op.attr("pooling_type")
-    data_format = op.attr("data_format")
-
-    if global_pooling:
-        adaptive = True
-        ksize = [1, 1, 1]
-
-    input_x = g.get_node(op.input("X")[0])
-    _, _, _, in_h, in_w = infer_shape(input_x)
-
-    op_map = {
-        "avg": "avg_pool3d",
-        "max": "max_pool3d",
-    }
-
-    strides = op.attr("strides")
-    if isinstance(strides, int):
-        strides = [strides, strides]
-    if isinstance(ksize, int):
-        ksize = [ksize, ksize, ksize]
-    if isinstance(paddings, int):
-        paddings = [paddings] * 3
-
-    if padding_algorithm == "VALID":
-        paddings = [0, 0, 0]
-    elif padding_algorithm == "SAME":
-        input_x = autopad(input_x, strides, ksize)
-        paddings = [0, 0, 0]
-    elif padding_algorithm == "EXPLICIT":
-        if len(paddings) == 3:
-            paddings = [
-                paddings[0],
-                paddings[1],
-                paddings[2],
-                paddings[0],
-                paddings[1],
-                paddings[2],
-            ]
-        elif len(paddings) == 6:
-            paddings = [
-                paddings[0],
-                paddings[3],
-                paddings[1],
-                paddings[4],
-                paddings[2],
-                paddings[5],
-            ]
-    else:
-        msg = 'Value {} in attribute "padding" of operator Pool3d is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg.format(padding_algorithm))
-
-    # handle with special case
-    # while kernel size more than input size
-    # shrink kernel size to input size
-    if (
-        not isinstance(in_h, _op.Expr)
-        and padding_algorithm == "EXPLICIT"
-        and in_h + paddings[0] + paddings[2] < ksize[0]
-    ):
-        ksize[0] = in_h
-    if (
-        not isinstance(in_w, _op.Expr)
-        and padding_algorithm == "EXPLICIT"
-        and in_w + paddings[1] + paddings[3] < ksize[1]
-    ):
-        ksize[1] = in_w
-
-    if not adaptive:
-        if pooling_type == "avg":
-            exclusive = op.attr("exclusive")
-            out = _op.nn.avg_pool3d(
-                input_x,
-                pool_size=ksize,
-                strides=strides,
-                padding=paddings,
-                ceil_mode=ceil_mode,
-                count_include_pad=not exclusive,
-                layout=data_format,
-            )
-        else:
-            out = getattr(_op.nn, op_map[pooling_type])(
-                input_x, pool_size=ksize, strides=strides, padding=paddings, ceil_mode=ceil_mode
-            )
-    else:
-        out = getattr(_op.nn, "adaptive_" + op_map[pooling_type])(
-            input_x, output_size=ksize, layout=data_format
-        )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_pow(g, op, block):
-    """Operator converter for pow."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = block.var(op.output("Out")[0]).dtype
-    dtype = _convert_dtype_value(dtype)
-    factor = op.attr("factor")
-    factor = _expr.const(factor, dtype=dtype)
-    out = _op.power(x, factor)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_prelu(g, op, block):
-    """Operator converter for prelu."""
-
-    x = g.get_node(op.input("X")[0])
-    alpha = g.get_node(op.input("Alpha")[0])
-    ndims = len(infer_shape(x))
-    axis = 0 if ndims <= 1 else 1
-    mode = op.attr("mode")
-    if mode == "all":
-        if ndims == 1:
-            shape = _op.strided_slice(shape_of(x), [0], [1])
-        else:
-            shape = _op.strided_slice(shape_of(x), [1], [2])
-        alpha = _op.broadcast_to(alpha, fold_constant(shape))
-    out = _op.nn.prelu(x, alpha, axis)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_range(g, op, block):
-    """Operator converter for range."""
-
-    start = g.get_node(op.input("Start")[0])
-    stop = g.get_node(op.input("End")[0])
-    step = g.get_node(op.input("Step")[0])
-    dtype = infer_type(start).checked_type.dtype
-
-    params = []
-    for param in (start, stop, step):
-        param, infered = try_infer_value(param, g.get_params())
-        if infered:
-            param = param.tolist()
-        if isinstance(param, list):
-            param = param[0]
-        if isinstance(param, _expr.Expr):
-            param = _op.squeeze(param)
-        else:
-            param = _op.const(param, dtype=dtype)
-        params.append(param)
-
-    out = _op.transform.arange(params[0], params[1], params[2], dtype=dtype)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_reciprocal(g, op, block):
-    """Operator converter for reciprocal."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    out = _expr.const(1.0, dtype) / x
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_reduce(g, op, block):
-    """Operator converter for series of reduce operators."""
-
-    op_map = {
-        "reduce_all": "all",
-        "reduce_any": "any",
-        "reduce_max": "max",
-        "reduce_min": "min",
-        "reduce_prod": "prod",
-        "reduce_sum": "sum",
-        "reduce_mean": "mean",
-    }
-    op_name = op_map[op.type]
-    input_x = g.get_node(op.input("X")[0])
-    axis = op.attr("dim")
-    if op.attr("reduce_all"):
-        axis = None
-    keepdims = op.attr("keep_dim")
-    out = get_relay_op(op_name)(input_x, axis=axis, keepdims=keepdims)
-    if not axis and not keepdims:
-        # use `expand_dims` to solve the following situation
-        # for TVM, the shape of `out` will be (, )
-        # for Paddle, the shape of `out` will be [1]
-        out = _op.expand_dims(out, axis=0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_relu6(g, op, block):
-    """Operator converter for relu6."""
-
-    x = g.get_node(op.input("X")[0])
-    out = _op.clip(x, 0.0, 6.0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_reshape(g, op, block):
-    """Operator converter for reshape."""
-
-    input_shape = op.input("Shape")
-    input_shape_tensor = op.input("ShapeTensor")
-    data = g.get_node(op.input("X")[0])
-    if input_shape:
-        new_shape = g.get_node(input_shape[0])
-    elif input_shape_tensor:
-        new_shape = []
-        for shape_name in input_shape_tensor:
-            shape = g.get_node(shape_name)
-            if len(infer_shape(shape)) == 0:
-                shape = _op.reshape(shape, [-1])
-            new_shape.append(shape)
-        new_shape = _op.concatenate(new_shape, axis=0)
-        new_shape, infered = try_infer_value(new_shape, parameters=g.get_params())
-        if infered:
-            new_shape = new_shape.tolist()
-    else:
-        new_shape = op.attr("shape")
-    out = _op.reshape(data, new_shape)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_roi_align(g, op, block):
-    """Operator converter for roi_align."""
-
-    rois = g.get_node(op.input("ROIs")[0])
-    spatial_scale = op.attr("spatial_scale")
-    if op.attr("aligned"):
-        offset = _expr.const(0.5, dtype="float32")
-        roi_offset = _op.divide(offset, _expr.const(spatial_scale, dtype="float32"))
-        rois = _op.subtract(rois, roi_offset)
-    num_rois = infer_shape(rois)[0]
-    zero_node = _expr.const(0, dtype="int32")
-    batch_index = _op.full(zero_node, [num_rois, 1], dtype="float32")
-    rois = _op.concatenate([batch_index, rois], axis=1)
-    out = _op.vision.roi_align(
-        g.get_node(op.input("X")[0]),
-        rois,
-        pooled_size=[op.attr("pooled_height"), op.attr("pooled_width")],
-        spatial_scale=spatial_scale,
-        sample_ratio=op.attr("sampling_ratio"),
-        mode="avg",
-    )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_dequantize_linear(g, op, block):
-    """Operator converter for dequantize_linear."""
-
-    data_node_name = op.input("X")[0]
-    data_node = g.get_node(data_node_name)
-
-    # paddle_scale = tvm_scale * 127
-    paddle_quantize_scale = g.get_params(op.input("Scale")[0]).asnumpy()
-    tvm_quantize_scale = paddle_quantize_scale / 127.0
-
-    tvm_quantize_zp = g.get_params(op.input("ZeroPoint")[0]).asnumpy()
-
-    tvm_quantize_axis = op.attr("quant_axis")
-    if tvm_quantize_axis == -1:
-        tvm_quantize_axis = 0
-
-    if len(infer_shape(data_node)) < 2:
-        tvm_quantize_axis = 0
-
-    out = _qnn.op.dequantize(
-        data=data_node,
-        input_scale=_op.const(tvm_quantize_scale, "float32"),
-        input_zero_point=_op.const(tvm_quantize_zp, "int32"),
-        axis=tvm_quantize_axis,
-    )
-    g.add_node(op.output("Y")[0], out)
-
-
-def convert_quantize_linear(g, op, block):
-    """Operator converter for dequantize_linear."""
-
-    data_node_name = op.input("X")[0]
-    data_node = g.get_node(data_node_name)
-
-    # paddle_scale = tvm_scale * 127
-    paddle_quantize_scale = g.get_params(op.input("Scale")[0]).asnumpy()
-    tvm_quantize_scale = paddle_quantize_scale / 127.0
-
-    tvm_quantize_zp = g.get_params(op.input("ZeroPoint")[0]).asnumpy()
-    tvm_quantize_axis = op.attr("quant_axis")
-
-    if tvm_quantize_axis == -1:
-        tvm_quantize_axis = 0
-
-    out = _qnn.op.quantize(
-        data=data_node,
-        output_scale=_op.const(tvm_quantize_scale, "float32"),
-        output_zero_point=_op.const(tvm_quantize_zp, "int32"),
-        axis=tvm_quantize_axis,
-    )
-    g.add_node(op.output("Y")[0], out)
-
-
-def convert_rnn(g, op, block):
-    """Operator converter for rnn."""
-
-    def generate_lstm(
-        input_seqs,
-        hidden_state,
-        cell_state,
-        w_inp,
-        w_hid,
-        b_inp,
-        b_hid,
-        f_act,
-        g_act,
-        h_act,
-        backwards=False,
-    ):
-        """Implementation of LSTM cell for paddlepaddle of TVM"""
-
-        h_list = []
-        seq_length = len(input_seqs)
-        for i in range(seq_length):
-            step = input_seqs[i] if not backwards else input_seqs[seq_length - (i + 1)]
-            step = _op.squeeze(step, axis=[0])
-            gates = _op.nn.dense(step, w_inp) + _op.nn.dense(hidden_state, w_hid)
-            if b_inp is not None:
-                gates += b_inp
-            if b_hid is not None:
-                gates += b_hid
-            i, f, c, o = _op.split(gates, 4, axis=-1)
-
-            i = f_act(i)
-            f = f_act(f)
-
-            c = g_act(c)
-            C = f * cell_state + i * c
-
-            o = f_act(o)
-
-            H = o * h_act(C)
-
-            hidden_state = H
-            cell_state = C
-            h_list.append(_op.expand_dims(H, axis=0))
-
-        if backwards:
-            h_list = h_list[::-1]
-
-        # Concatenate outputs and add back in direction axis.
-        concatenated = _op.concatenate(h_list, 0)
-        output = _op.expand_dims(concatenated, axis=1)
-        hidden_state = _op.expand_dims(hidden_state, axis=0)
-        cell_state = _op.expand_dims(cell_state, axis=0)
-
-        return output, hidden_state, cell_state
-
-    def generate_gru(
-        input_seqs, hidden_state, w_inp, w_hid, b_inp, b_hid, rz_act, n_act, backwards=False
-    ):
-        """Implementation of GRU cell for paddlepaddle of TVM"""
-
-        h_list = []
-        seq_length = len(input_seqs)
-        for i in range(seq_length):
-            step = input_seqs[i] if not backwards else input_seqs[seq_length - (i + 1)]
-            step = _op.squeeze(step, axis=[0])
-            xwt = _op.nn.dense(step, w_inp)
-            hwt = _op.nn.dense(hidden_state, w_hid)
-            if b_inp is not None:
-                xwt += b_inp
-            if b_hid is not None:
-                hwt += b_hid
-            i_r, i_z, i_n = _op.split(xwt, 3, axis=-1)
-            h_r, h_z, h_n = _op.split(hwt, 3, axis=-1)
-
-            r_gate = rz_act(i_r + h_r)
-            z_gate = rz_act(i_z + h_z)
-            n_gate = n_act(i_n + r_gate * h_n)
-
-            hidden_state = (hidden_state - n_gate) * z_gate + n_gate
-            h_list.append(_op.expand_dims(hidden_state, axis=0))
-
-        if backwards:
-            h_list = h_list[::-1]
-
-        # Concatenate outputs and add back in direction axis.
-        concatenated = _op.concatenate(h_list, 0)
-        output = _op.expand_dims(concatenated, axis=1)
-        hidden_state = _op.expand_dims(hidden_state, axis=0)
-
-        return output, hidden_state
-
-    def generate_simplernn(
-        input_seqs, hidden_state, w_inp, w_hid, b_inp, b_hid, n_act, backwards=False
-    ):
-        """Implementation of SimpleRNN cell for paddlepaddle of TVM"""
-
-        h_list = []
-        seq_length = len(input_seqs)
-        for i in range(seq_length):
-            step = input_seqs[i] if not backwards else input_seqs[seq_length - (i + 1)]
-            step = _op.squeeze(step, axis=[0])
-            xwt = _op.nn.dense(step, w_inp)
-            hwt = _op.nn.dense(hidden_state, w_hid)
-            if b_inp is not None:
-                xwt += b_inp
-            if b_hid is not None:
-                hwt += b_hid
-
-            n_gate = n_act(xwt + hwt)
-
-            hidden_state = n_gate
-            h_list.append(_op.expand_dims(hidden_state, axis=0))
-
-        if backwards:
-            h_list = h_list[::-1]
-
-        # Concatenate outputs and add back in direction axis.
-        concatenated = _op.concatenate(h_list, 0)
-        output = _op.expand_dims(concatenated, axis=1)
-        hidden_state = _op.expand_dims(hidden_state, axis=0)
-
-        return output, hidden_state
-
-    def make_param_inputs(g, node, layer, hidden_size, num_layers):
-        """Param for weight and bias."""
-
-        bidirect_len = 4 if node.attr("is_bidirec") else 2
-        all_layer_param_len = len(node.input("WeightList"))
-        weight_list = node.input("WeightList")[: all_layer_param_len // 2]
-        bias_list = node.input("WeightList")[all_layer_param_len // 2 :]
-
-        layer_weight_list = weight_list[layer * bidirect_len : layer * bidirect_len + bidirect_len]
-        layer_bias_list = bias_list[layer * bidirect_len : layer * bidirect_len + bidirect_len]
-        param_list = layer_weight_list + layer_bias_list
-        param_list_len = len(param_list)
-
-        input_weights = param_list[0 : param_list_len // 2 : 2]
-        hidden_weights = param_list[1 : param_list_len // 2 : 2]
-
-        input_bias = param_list[param_list_len // 2 : param_list_len : 2]
-        hidden_bias = param_list[param_list_len // 2 + 1 : param_list_len : 2]
-
-        return input_weights, hidden_weights, input_bias, hidden_bias
-
-    def make_init_param_inputs(g, node, layer):
-        """Init param for inputs."""
-
-        mode = node.attr("mode")
-        if mode == "LSTM":
-            all_init_h, all_init_c = node.input("PreState")
-            bidirect_len = 2 if node.attr("is_bidirec") else 1
-            init_h = _op.strided_slice(
-                g.get_node(all_init_h),
-                [layer * bidirect_len],
-                [layer * bidirect_len + bidirect_len],
-                axes=[0],
-            )
-            init_c = _op.strided_slice(
-                g.get_node(all_init_c),
-                [layer * bidirect_len],
-                [layer * bidirect_len + bidirect_len],
-                axes=[0],
-            )
-            return init_h, init_c
-        all_init_h = node.input("PreState")[0]
-        bidirect_len = 2 if node.attr("is_bidirec") else 1
-        init_h = _op.strided_slice(
-            g.get_node(all_init_h),
-            [layer * bidirect_len],
-            [layer * bidirect_len + bidirect_len],
-            axes=[0],
-        )
-        return init_h
-
-    hidden_size = op.attr("hidden_size")
-    num_layers = op.attr("num_layers")
-    is_bidirec = op.attr("is_bidirec")
-    mode = op.attr("mode")
-
-    input_x = g.get_node(op.input("Input")[0])
-
-    num_directions = 1
-    if is_bidirec:
-        num_directions = 2
-
-    x_shape = infer_shape(input_x)
-    time_steps = x_shape[0]
-    x_steps = _op.split(input_x, indices_or_sections=time_steps, axis=0)
-    for layer in range(num_layers):
-        input_weights, hidden_weights, input_bias, hidden_bias = make_param_inputs(
-            g, op, layer, hidden_size, num_layers
-        )
-        if mode == "LSTM":
-            init_h, init_c = make_init_param_inputs(g, op, layer)
-            init_hs = _op.split(init_h, num_directions)
-            init_cs = _op.split(init_c, num_directions)
-            result_output = []
-            result_H = []
-            result_C = []
-            for i in range(num_directions):
-                H_t = _op.squeeze(init_hs[i], axis=[0])
-                C_t = _op.squeeze(init_cs[i], axis=[0])
-                W = g.get_node(input_weights[i])
-                R = g.get_node(hidden_weights[i])
-                WB = g.get_node(input_bias[i])
-                RB = g.get_node(hidden_bias[i])
-                output, H, C = generate_lstm(
-                    input_seqs=x_steps,
-                    hidden_state=H_t,
-                    cell_state=C_t,
-                    w_inp=W,
-                    w_hid=R,
-                    b_inp=WB,
-                    b_hid=RB,
-                    f_act=_op.sigmoid,
-                    g_act=_op.tanh,
-                    h_act=_op.tanh,
-                    backwards=i == 1,
-                )
-                result_output.append(output)
-                result_H.append(H)
-                result_C.append(C)
-            output = _op.concatenate(result_output, axis=1)
-            H = _op.concatenate(result_H, axis=0)
-            C = _op.concatenate(result_C, axis=0)
-        elif mode == "GRU":
-            init_h = make_init_param_inputs(g, op, layer)
-            init_hs = _op.split(init_h, num_directions)
-            result_output = []
-            result_H = []
-            for i in range(num_directions):
-                H_t = _op.squeeze(init_hs[i], axis=[0])
-                W = g.get_node(input_weights[i])
-                R = g.get_node(hidden_weights[i])
-                WB = g.get_node(input_bias[i])
-                RB = g.get_node(hidden_bias[i])
-                output, H = generate_gru(
-                    input_seqs=x_steps,
-                    hidden_state=H_t,
-                    w_inp=W,
-                    w_hid=R,
-                    b_inp=WB,
-                    b_hid=RB,
-                    rz_act=_op.sigmoid,
-                    n_act=_op.tanh,
-                    backwards=i == 1,
-                )
-                result_output.append(output)
-                result_H.append(H)
-            output = _op.concatenate(result_output, axis=1)
-            H = _op.concatenate(result_H, axis=0)
-        elif mode == "RNN_TANH":
-            init_h = make_init_param_inputs(g, op, layer)
-            init_hs = _op.split(init_h, num_directions)
-            result_output = []
-            result_H = []
-            for i in range(num_directions):
-                H_t = _op.squeeze(init_hs[i], axis=[0])
-                W = g.get_node(input_weights[i])
-                R = g.get_node(hidden_weights[i])
-                WB = g.get_node(input_bias[i])
-                RB = g.get_node(hidden_bias[i])
-                output, H = generate_simplernn(
-                    input_seqs=x_steps,
-                    hidden_state=H_t,
-                    w_inp=W,
-                    w_hid=R,
-                    b_inp=WB,
-                    b_hid=RB,
-                    n_act=_op.tanh,
-                    backwards=i == 1,
-                )
-                result_output.append(output)
-                result_H.append(H)
-            output = _op.concatenate(result_output, axis=1)
-            H = _op.concatenate(result_H, axis=0)
-
-        output = _op.transpose(output, axes=[0, 2, 1, 3])
-        output = _op.reshape(output, newshape=(0, 0, -1))
-        x_steps = _op.split(output, indices_or_sections=time_steps, axis=0)
-
-    g.add_node(op.output("Out")[0], output)
-
-
-def convert_scale(g, op, block):
-    """Operator converter for scale."""
-
-    scale = op.attr("scale")
-    bias = op.attr("bias")
-    bias_after_scale = op.attr("bias_after_scale")
-    x = g.get_node(op.input("X")[0])
-    if np.isclose(scale, 1.0) and np.isclose(bias, 0.0):
-        out = x
-    else:
-        if np.isclose(bias, 0.0):
-            out = x * _expr.const(np.array(scale).astype("float32"))
-        elif np.isclose(scale, 1.0):
-            out = x + _expr.const(np.array(bias).astype("float32"))
-        else:
-            if bias_after_scale:
-                out = x * _expr.const(np.array(scale).astype("float32")) + _expr.const(
-                    np.array(bias).astype("float32")
-                )
-            else:
-                out = (x + _expr.const(np.array(bias).astype("float32"))) * _expr.const(
-                    np.array(scale).astype("float32")
-                )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_scatter(g, op, block):
-    """Operator converter for scatter."""
-
-    x = g.get_node(op.input("X")[0])
-    index = g.get_node(op.input("Ids")[0])
-    updates = g.get_node(op.input("Updates")[0])
-    overwrite = op.attr("overwrite")
-
-    shape = infer_shape(updates)
-    ndims = len(shape)
-    index = _op.expand_dims(index, axis=-1, num_newaxis=ndims - 1)
-    index = _op.transform.broadcast_to(index, shape)
-
-    if overwrite:
-        out = _op.scatter_elements(x, index, updates, axis=0)
-    else:
-        out = _op.scatter_elements(_op.zeros_like(x), index, updates, axis=0, reduction="add")
-        out += _op.scatter_elements(x, index, _op.zeros_like(updates), axis=0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_scatter_nd_add(g, op, block):
-    """Operator converter for scatter_nd_add."""
-
-    x = g.get_node(op.input("X")[0])
-    index = g.get_node(op.input("Index")[0])
-    updates = g.get_node(op.input("Updates")[0])
-    indices_dim = len(infer_shape(index))
-    axes = list(range(indices_dim))
-    index = _op.transpose(index, axes[-1:] + axes[:-1])
-    out = _op.scatter_nd(x, index, updates, mode="add")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_selu(g, op, block):
-    """Operator converter for selu."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    alpha = _op.const(op.attr("alpha"), dtype)
-    scale = _op.const(op.attr("scale"), dtype)
-    out = (
-        _expr.const(-1.0, dtype=dtype)
-        * alpha
-        * _op.nn.relu(_expr.const(1.0, dtype=dtype) - _op.exp(x))
-    )
-    out = scale * (out + _op.nn.relu(x))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_set_value(g, op, block):
-    """Operator converter for set_value."""
-
-    x = g.get_node(op.input("Input")[0])
-    if op.input("StartsTensorList"):
-        starts = g.get_node(op.input("StartsTensorList")[0])
-    else:
-        starts = op.attr("starts")[0]
-
-    if op.input("EndsTensorList"):
-        ends = g.get_node(op.input("EndsTensorList")[0])
-    else:
-        ends = op.attr("ends")[0]
-
-    axes = op.attr("axes")
-    assert len(axes) == 1, "Only support one axes now."
-    axes = axes[0]
-
-    input_shape = infer_shape(x)
-    ends = min(ends, input_shape[axes])
-
-    if op.input("StepsTensorList"):
-        steps = g.get_node(op.input("StepsTensorList")[0])
-    else:
-        steps = op.attr("steps")[0]
-
-    if op.input("ValueTensor"):
-        value = g.get_node(op.input("ValueTensor")[0])
-    else:
-        input_dtype = infer_type(x).checked_type.dtype
-        if input_dtype == "float64":
-            value = _expr.const(op.attr("fp64_values"), dtype="float64")
-        elif input_dtype == "float32":
-            value = _expr.const(op.attr("fp32_values"), dtype="float32")
-        elif input_dtype == "int32":
-            value = _expr.const(op.attr("int32_values"), dtype="int32")
-        elif input_dtype == "int64":
-            value = _expr.const(op.attr("int64_values"), dtype="int64")
-        else:
-            raise tvm.error.OpNotImplemented(
-                "dtype {} is not supported for set_value".format(input_dtype)
-            )
-
-    sliced_data = _op.strided_slice(x, begin=[starts], end=[ends], strides=[steps], axes=[axes])
-    sliced_shape = infer_shape(sliced_data)
-
-    if infer_shape(value) != sliced_shape:
-        expand_value = _op.broadcast_to(value, sliced_shape)
-    else:
-        expand_value = value
-
-    if starts < 0:
-        starts = starts + input_shape[axes]
-    if ends < 0:
-        ends = ends + input_shape[axes]
-
-    indices = _op.arange(
-        start=_expr.const(starts, dtype="int32"),
-        stop=_expr.const(ends, dtype="int32"),
-        step=_expr.const(steps, dtype="int32"),
-        dtype="int32",
-    )
-    indices = _op.expand_dims(indices, axis=0)
-    out = _op.scatter_nd(x, indices, expand_value, "update")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_shape(g, op, block):
-    """Operator converter for shape."""
-
-    x = g.get_node(op.input("Input")[0])
-    out = shape_of(x, dtype="int32")
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_silu(g, op, block):
-    """Operator converter for silu."""
-
-    x = g.get_node(op.input("X")[0])
-    out = _op.multiply(x, _op.sigmoid(x))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_size(g, op, block):
-    """Operator converter for size."""
-
-    input_x = g.get_node(op.input("Input")[0])
-    out = _op.ndarray_size(input_x, dtype="int64")
-    out = _op.expand_dims(out, axis=0)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_slice(g, op, block):
-    """Operator converter for slice."""
-
-    data = g.get_node(op.input("Input")[0])
-    dims = len(infer_shape(data))
-
-    axes = op.attr("axes")
-    indices = _expr.const(axes, dtype="int64")
-
-    decrease_axis = op.attr("decrease_axis")
-    if isinstance(decrease_axis, int):
-        decrease_axis = [decrease_axis]
-
-    if op.input("StartsTensor"):
-        starts = g.get_node(op.input("StartsTensor")[0])
-        starts, infered = try_infer_value(starts, g.get_params())
-        if infered:
-            starts = starts.tolist()
-    elif op.input("StartsTensorList"):
-        starts = []
-        for start_index in op.input("StartsTensorList"):
-            start_index = g.get_node(start_index).astype("int64")
-            starts.append(start_index)
-        starts = _op.concatenate(starts, axis=0)
-        starts, infered = try_infer_value(starts, g.get_params())
-        if infered:
-            starts = starts.tolist()
-    else:
-        starts = op.attr("starts")
-
-    if len(axes) < dims:
-        if isinstance(starts, _expr.Expr):
-            starts = _op.scatter_elements(
-                _op.const([0] * dims, dtype=infer_type(starts).checked_type.dtype),
-                indices,
-                starts,
-                axis=0,
-            )
-        else:
-            base = [0] * dims
-            for i, axis in enumerate(axes):
-                base[axis] = starts[i]
-            starts = base
-
-    if op.input("EndsTensor"):
-        ends = g.get_node(op.input("EndsTensor")[0])
-        ends, infered = try_infer_value(ends, g.get_params())
-        if infered:
-            ends = ends.tolist()
-    elif op.input("EndsTensorList"):
-        ends = []
-        for end_index in op.input("EndsTensorList"):
-            end_index = g.get_node(end_index).astype("int64")
-            ends.append(end_index)
-        ends = _op.concatenate(ends, axis=0)
-        ends, infered = try_infer_value(ends, g.get_params())
-        if infered:
-            ends = ends.tolist()
-    else:
-        ends = op.attr("ends")
-
-    if len(axes) < dims:
-        if isinstance(ends, _expr.Expr):
-            ends = _op.scatter_elements(
-                _expr.const(
-                    np.array([np.iinfo(np.int32).max] * dims),
-                    dtype=infer_type(ends).checked_type.dtype,
-                ),
-                indices,
-                ends,
-                axis=0,
-            )
-        else:
-            base = [np.iinfo(np.int32).max] * dims
-            for i, axis in enumerate(axes):
-                base[axis] = ends[i]
-            ends = base
-
-    strides = None
-    if "StridesTensor" in op.input_names and op.input("StridesTensor"):
-        strides = g.get_node(op.input("StridesTensor")[0])
-        strides, infered = try_infer_value(strides, g.get_params())
-        if infered:
-            strides = strides.tolist()
-    elif "StridesTensorList" in op.input_names and op.input("StridesTensorList"):
-        strides = []
-        for strides_index in op.input("StridesTensorList"):
-            strides_index = g.get_node(strides_index).astype("int64")
-            strides.append(strides_index)
-        strides = _op.concatenate(strides, axis=0)
-        strides, infered = try_infer_value(strides, g.get_params())
-        if infered:
-            strides = strides.tolist()
-    elif op.has_attr("strides"):
-        strides = op.attr("strides")
-
-    if len(axes) < dims:
-        if isinstance(strides, _expr.Expr):
-            strides = _op.scatter_elements(
-                _expr.const(np.array([1] * dims), dtype=infer_type(strides).checked_type.dtype),
-                indices,
-                strides,
-                axis=0,
-            )
-        elif strides:
-            base = [1] * dims
-            for i, axis in enumerate(axes):
-                base[axis] = strides[i]
-            strides = base
-    if not strides:
-        strides = _op.const([1] * dims, dtype="int64")
-
-    out = _op.strided_slice(data, begin=starts, end=ends, strides=strides)
-    out_shape = infer_shape(out)
-    if decrease_axis and len(out_shape) > 1:
-        out = _op.squeeze(out, axis=decrease_axis)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_softmax(g, op, block):
-    """Operator converter for softmax."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    input_shape = block.var(op.input("X")[0]).shape
-    if axis < 0:
-        axis = len(input_shape) + axis
-    m = _op.max(x, axis, keepdims=True)
-    e = _op.exp(x - m)
-    out = e / _op.sum(e, axis, keepdims=True)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_softmax_with_cross_entropy(g, op, block):
-    """Operator converter for softmax_with_cross_entropy."""
-
-    logits = g.get_node(op.input("Logits")[0])
-    labels = g.get_node(op.input("Label")[0])
-    ignore_index = op.attr("ignore_index")
-    axis = op.attr("axis")
-    if axis < 0:
-        axis = len(infer_shape(logits)) + axis
-
-    softmax = _op.nn.softmax(logits, axis=axis)
-
-    g.add_node(op.output("Softmax")[0], softmax)
-
-    softmax = _op.log(softmax)
-    soft_label = op.attr("soft_label")
-    if soft_label:
-        loss = _op.sum(-labels * softmax, axis=axis)
-    else:
-        labels_one = _op.one_hot(
-            labels,
-            on_value=_expr.const(1.0, dtype="float32"),
-            off_value=_expr.const(0.0, dtype="float32"),
-            depth=infer_shape(logits)[axis],
-            axis=axis + 1,
-            dtype="float32",
-        )
-        labels_one = _op.squeeze(labels_one, axis=axis)
-        loss = _op.sum(-labels_one * softmax, axis=axis)
-    loss = _op.expand_dims(loss, axis=axis)
-    if ignore_index != -100:  # noly when soft_label is False
-        assert not soft_label, "soft_label and ignore_index cannot be set at the same time."
-        ignore_mask = _op.not_equal(labels, _expr.const(ignore_index, dtype="int64"))
-        ignore_mask = _op.cast(ignore_mask, "float32")
-        loss = _op.multiply(loss, ignore_mask)
-
-    g.add_node(op.output("Loss")[0], loss)
-
-
-def convert_softplus(g, op, block):
-    """Operator converter for softplus."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    beta = op.attr("beta")
-    beta = _expr.const(beta, dtype=dtype)
-    threshold = op.attr("threshold")
-
-    if threshold is None:
-        threshold = _expr.const(20.0, dtype=dtype)
-    threshold = _expr.const(threshold, dtype=dtype)
-    out_softplus = _op.log(_op.exp(x * beta) + _expr.const(1.0, dtype=dtype)) / beta
-    out = _op.where(_op.greater(x * beta, threshold), x, out_softplus)
-
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_softsign(g, op, block):
-    """Operator converter for softsign."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    out = x / (_op.const(1.0, dtype) + _op.abs(x))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_softshrink(g, op, block):
-    """Operator converter for softshrink."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    threshold = _expr.const(op.attr("lambda"), dtype=dtype)
-    zeros = _op.zeros_like(x)
-    out = _op.where(x < -threshold, x + threshold, zeros) + _op.where(
-        x > threshold, x - threshold, zeros
-    )
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_split(g, op, block):
-    """Operator converter for split."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.input("AxisTensor")
-    if axis:
-        axis = g.get_node(axis[0])
-        axis, infered = try_infer_value(axis, g.get_params())
-        if infered:
-            axis = axis.tolist()[0]
-    else:
-        axis = op.attr("axis")
-
-    sections = op.input("SectionsTensorList")
-    if sections:
-        tmp_section = []
-        for i in sections:
-            i = g.get_node(i)
-            i, infered = try_infer_value(i, g.get_params())
-            if infered:
-                i = i.tolist()
-            else:
-                raise ValueError("Dynamic Split not yet supported.")
-            tmp_section.extend(i)
-        sections = tmp_section
-    else:
-        sections = op.attr("sections")
-    if sections:
-        indices = []
-        split_index = 0
-        for i in sections[:-1]:
-            if i == -1:
-                input_shape = infer_shape(x)[axis]
-                i = input_shape - np.sum(sections) - 1
-            split_index += i
-            indices.append(split_index)
-    else:
-        indices = op.attr("num")
-
-    out = _op.split(x, indices, axis)
-    for i, out_i in enumerate(out):
-        g.add_node(op.output("Out")[i], out_i)
-
-
-def convert_stack(g, op, blcok):
-    """Operator converter for stack."""
-
-    x = op.input("X")
-    all_inputs = []
-    for inp in x:
-        all_inputs.append(g.get_node(inp))
-    axis = op.attr("axis")
-    out = _op.stack(all_inputs, axis)
-    g.add_node(op.output("Y")[0], out)
-
-
-def convert_square(g, op, block):
-    """Operator converter for square."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = block.var(op.output("Out")[0]).dtype
-    dtype = _convert_dtype_value(dtype)
-    out = _op.power(x, _expr.const(2, dtype))
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_squeeze(g, op, block):
-    """Operator converter for squeeze2."""
-
-    x = g.get_node(op.input("X")[0])
-    axes = op.attr("axes")
-    if not axes:
-        axes = None
-    x = _op.squeeze(x, axis=axes)
-    g.add_node(op.output("Out")[0], x)
-
-
-def convert_swish(g, op, block):
-    """Operator converter for swish."""
-
-    x = g.get_node(op.input("X")[0])
-    beta = op.attr("beta")
-    assert beta == 1.0, "Only support beta==1.0 for PaddlePaddle's swish"
-    out = x * _op.tensor.sigmoid(x)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_take_along_axis(g, op, block):
-    """Operator converter for take_along_axis."""
-
-    x = g.get_node(op.input("Input")[0])
-    idx = g.get_node(op.input("Index")[0])
-    axis = op.attr("Axis")
-    out = _op.gather(x, axis, idx)
-    g.add_node(op.output("Result")[0], out)
-
-
-def convert_tanhshrink(g, op, block):
-    """Operator converter for tanhshrink."""
-
-    x = g.get_node(op.input("X")[0])
-    out = x - _op.tanh(x)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_thresholded_relu(g, op, block):
-    """Operator converter for thresholded_relu."""
-
-    x = g.get_node(op.input("X")[0])
-    dtype = infer_type(x).checked_type.dtype
-    threshold = op.attr("threshold")
-    threshold = _expr.const(threshold, dtype)
-    zero = _expr.const(0, dtype=dtype)
-    out = tvm.relay.where(x > threshold, x, zero)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_tile(g, op, block):
-    """Operator converter for tile."""
-
-    x = g.get_node(op.input("X")[0])
-    if op.input("RepeatTimes"):
-        reps = g.get_node(op.input("RepeatTimes")[0])
-        reps, infered = try_infer_value(reps, g.get_params())
-        if infered:
-            reps = reps.tolist()
-    elif op.input("repeat_times_tensor"):
-        reps = []
-        for rep_value in op.input("repeat_times_tensor"):
-            rep_value = g.get_node(rep_value).astype("int32")
-            reps.append(rep_value)
-        reps = _op.concatenate(reps, axis=0)
-        reps, infered = try_infer_value(reps, g.get_params())
-        if infered:
-            reps = reps.tolist()
-    else:
-        reps = op.attr("repeat_times")
-        infered = True
-
-    if not infered:
-        msg = f'Value {reps} in attribute "repeat_times" of operator Tile is not "valid."'
-        raise tvm.error.OpAttributeInvalid(msg)
-
-    op_func = get_relay_op(op.type)
-    out = op_func(x, reps=reps)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_topk(g, op, block):
-    """Operator converter for topk."""
-
-    data = g.get_node(op.input("X")[0])
-    if op.input("K"):
-        k = g.get_node(op.input("K")[0])
-    else:
-        k = op.attr("k")
-
-    largest = True
-    axis = -1
-    if op.has_attr("axis"):
-        axis = op.attr("axis")
-    if op.has_attr("largest"):
-        largest = op.attr("largest")
-    is_ascend = not largest
-
-    value_names = op.output("Out")
-    indice_names = op.output("Indices")
-
-    out = None
-    indice = None
-    if value_names and indice_names:
-        out, indice = _op.topk(data=data, k=k, axis=axis, ret_type="both", is_ascend=is_ascend)
-    elif value_names:
-        out = _op.topk(data=data, k=k, axis=axis, ret_type="values", is_ascend=is_ascend)
-    elif indice_names:
-        indice = _op.topk(data=data, k=k, axis=axis, ret_type="indices", is_ascend=is_ascend)
-
-    if out is not None:
-        g.add_node(value_names[0], out)
-    if indice is not None:
-        g.add_node(indice_names[0], indice)
-
-
-def convert_transpose(g, op, block):
-    """Operator converter for transpose."""
-
-    perm = op.attr("axis")
-    out = _op.transpose(g.get_node(op.input("X")[0]), axes=perm)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_unique(g, op, block):
-    """Operator converter for unique."""
-
-    x = g.get_node(op.input("X")[0])
-    return_index = op.attr("return_index")
-    return_inverse = op.attr("return_inverse")
-    return_counts = op.attr("return_counts")
-    axis = op.attr("axis")
-    dtype = op.attr("dtype")
-    dtype = _convert_dtype_value(dtype)
-
-    if len(axis) == 0:
-        x = _op.reshape(x, [-1])
-
-    if return_counts:
-        unique, indices, inverse_indices, _, counts = _op.unique(
-            x, is_sorted=True, return_counts=True
-        )
-    else:
-        unique, indices, inverse_indices, _ = _op.unique(x, is_sorted=True, return_counts=False)
-
-    out = unique
-    if dtype != infer_type(out).checked_type.dtype:
-        out = _op.cast(out, dtype)
-    g.add_node(op.output("Out")[0], unique)
-
-    if return_index:
-        g.add_node(op.output("Indices")[0], indices)
-    if return_inverse:
-        g.add_node(op.output("Index")[0], inverse_indices)
-    if return_counts:
-        g.add_node(op.output("Counts")[0], counts)
-
-
-def convert_unsqueeze(g, op, block):
-    """Operator converter for unsqueeze."""
-
-    x = g.get_node(op.input("X")[0])
-    axes = sorted(op.attr("axes"))
-    for axis in axes:
-        x = _op.expand_dims(x, axis=axis, num_newaxis=1)
-    g.add_node(op.output("Out")[0], x)
-
-
-def convert_unstack(g, op, block):
-    """Operator converter for unstack."""
-
-    x = g.get_node(op.input("X")[0])
-    axis = op.attr("axis")
-    indices_or_sections = len(op.output("Y"))
-    outs = _op.split(x, indices_or_sections=indices_or_sections, axis=axis)
-    for i, out in enumerate(outs):
-        out = _op.squeeze(out, axis=axis)
-        g.add_node(op.output("Y")[i], out)
-
-
-def convert_where(g, op, block):
-    """Operator converter for where."""
-
-    condition = g.get_node(op.input("Condition")[0])
-    x = g.get_node(op.input("X")[0])
-    y = g.get_node(op.input("Y")[0])
-    out = _op.where(condition, x, y)
-    g.add_node(op.output("Out")[0], out)
-
-
-def convert_where_index(g, op, block):
-    """Operator converter for where_index."""
-
-    condition = g.get_node(op.input("Condition")[0])
-    out = _op.argwhere(condition)
-    g.add_node(op.output("Out")[0], out)
-
-
-_convert_map = {
-    "abs": convert_unary_op,
-    "acos": convert_unary_op,
-    "addmm": convert_addmm,
-    "arg_max": convert_arg_max_min,
-    "arg_min": convert_arg_max_min,
-    "argsort": convert_argsort,
-    "asin": convert_unary_op,
-    "assign": convert_assign,
-    "assign_value": convert_assign_value,
-    "atan": convert_unary_op,
-    "batch_norm": convert_batch_norm,
-    "bicubic_interp_v2": convert_interpolate,
-    "bilinear_interp_v2": convert_interpolate,
-    "bmm": convert_bmm,
-    "brelu": convert_brelu,
-    "cast": convert_cast,
-    "ceil": convert_unary_op,
-    "clip": convert_clip,
-    "concat": convert_concat,
-    "conv2d": convert_conv2d,
-    "conv2d_transpose": convert_conv2d_transpose,
-    "conv3d": convert_conv3d,
-    "cos": convert_unary_op,
-    "cosh": convert_unary_op,
-    "cumsum": convert_cumsum,
-    "depthwise_conv2d": convert_conv2d,
-    "depthwise_conv2d_transpose": convert_conv2d_transpose,
-    "dist": convert_dist,
-    "dot": convert_dot,
-    "dropout": convert_dropout,
-    "elementwise_add": convert_elementwise_op,
-    "elementwise_div": convert_elementwise_op,
-    "elementwise_floordiv": convert_elementwise_op,
-    "elementwise_max": convert_elementwise_op,
-    "elementwise_min": convert_elementwise_op,
-    "elementwise_mod": convert_elementwise_op,
-    "elementwise_mul": convert_elementwise_op,
-    "elementwise_pow": convert_elementwise_op,
-    "elementwise_prod": convert_elementwise_op,
-    "elementwise_sub": convert_elementwise_op,
-    "elu": convert_elu,
-    "equal": convert_elementwise_op,
-    "erf": convert_unary_op,
-    "exp": convert_unary_op,
-    "expand_v2": convert_expand,
-    "expand_as_v2": convert_expand_as,
-    "eye": convert_eye,
-    "feed": convert_feed,
-    "fill_any_like": convert_fill_any_like,
-    "fill_constant": convert_fill_constant,
-    "fill_constant_batch_size_like": convert_fill_constant_batch_size_like,
-    "fill_zeros_like": convert_fill_zeros_like,
-    "flatten_contiguous_range": convert_flatten,
-    "floor": convert_unary_op,
-    "floor_mod": convert_elementwise_op,
-    "flip": convert_flip,
-    "gather": convert_gather,
-    "gather_nd": convert_gather_nd,
-    "gaussian_random": convert_gaussian_random,
-    "gelu": convert_gelu,
-    "greater_equal": convert_elementwise_op,
-    "greater_than": convert_elementwise_op,
-    "grid_sampler": convert_grid_sampler,
-    "group_norm": convert_group_norm,
-    "hard_shrink": convert_hard_shrink,
-    "hard_sigmoid": convert_hard_sigmoid,
-    "hard_swish": convert_hard_swish,
-    "index_select": convert_index_select,
-    "instance_norm": convert_instance_norm,
-    "isfinite_v2": convert_unary_op,
-    "isinf_v2": convert_unary_op,
-    "isnan_v2": convert_unary_op,
-    "layer_norm": convert_layer_norm,
-    "leaky_relu": convert_leaky_relu,
-    "less_equal": convert_elementwise_op,
-    "less_than": convert_elementwise_op,
-    "linspace": convert_linspace,
-    "log": convert_unary_op,
-    "log2": convert_unary_op,
-    "log10": convert_unary_op,
-    "log1p": convert_log1p,
-    "logical_and": convert_binary_logical_op,
-    "logical_not": convert_logical_not,
-    "logical_or": convert_binary_logical_op,
-    "logical_xor": convert_binary_logical_op,
-    "logsigmoid": convert_logsigmoid,
-    "log_softmax": convert_logsoftmax,
-    "logsumexp": convert_logsumexp,
-    "lookup_table_v2": convert_lookup_table,
-    "matmul": convert_matmul,
-    "matmul_v2": convert_matmul,
-    "meshgrid": convert_meshgrid,
-    "mish": convert_mish,
-    "mul": convert_mul,
-    "mv": convert_mv,
-    "nearest_interp_v2": convert_interpolate,
-    "norm": convert_norm,
-    "not_equal": convert_elementwise_op,
-    "one_hot_v2": convert_one_hot_v2,
-    "p_norm": convert_p_norm,
-    "pad1d": convert_padding,
-    "pad2d": convert_padding,
-    "pad3d": convert_padding,
-    "pixel_shuffle": convert_pixel_shuffle,
-    "pool2d": convert_pool2d,
-    "pool3d": convert_pool3d,
-    "pow": convert_pow,
-    "prelu": convert_prelu,
-    "range": convert_range,
-    "relu": convert_unary_op,
-    "relu6": convert_relu6,
-    "reshape2": convert_reshape,
-    "round": convert_unary_op,
-    "roi_align": convert_roi_align,
-    "reciprocal": convert_reciprocal,
-    "reduce_all": convert_reduce,
-    "reduce_any": convert_reduce,
-    "reduce_max": convert_reduce,
-    "reduce_min": convert_reduce,
-    "reduce_prod": convert_reduce,
-    "reduce_sum": convert_reduce,
-    "reduce_mean": convert_reduce,
-    "rnn": convert_rnn,
-    "rsqrt": convert_unary_op,
-    "scale": convert_scale,
-    "scatter": convert_scatter,
-    "scatter_nd_add": convert_scatter_nd_add,
-    "selu": convert_selu,
-    "set_value": convert_set_value,
-    "shape": convert_shape,
-    "sigmoid": convert_unary_op,
-    "sign": convert_unary_op,
-    "silu": convert_silu,
-    "sin": convert_unary_op,
-    "sinh": convert_unary_op,
-    "size": convert_size,
-    "slice": convert_slice,
-    "softmax": convert_softmax,
-    "softmax_with_cross_entropy": convert_softmax_with_cross_entropy,
-    "softplus": convert_softplus,
-    "softsign": convert_softsign,
-    "softshrink": convert_softshrink,
-    "split": convert_split,
-    "stack": convert_stack,
-    "strided_slice": convert_slice,
-    "sqrt": convert_unary_op,
-    "square": convert_square,
-    "squeeze2": convert_squeeze,
-    "swish": convert_swish,
-    "take_along_axis": convert_take_along_axis,
-    "tan": convert_unary_op,
-    "tanh": convert_unary_op,
-    "tanh_shrink": convert_tanhshrink,
-    "top_k": convert_topk,
-    "thresholded_relu": convert_thresholded_relu,
-    "tile": convert_tile,
-    "top_k_v2": convert_topk,
-    "transpose2": convert_transpose,
-    "unique": convert_unique,
-    "unsqueeze2": convert_unsqueeze,
-    "unstack": convert_unstack,
-    "where": convert_where,
-    "where_index": convert_where_index,
-    # Quantized
-    "dequantize_linear": convert_dequantize_linear,
-    "quantize_linear": convert_quantize_linear,
-}
-
-
-class GraphProto:
-    """A helper class for handling relay functions from PaddlePaddle model."""
-
-    def __init__(self):
-        self.nodes = {}
-        self.params = {}
-        self.shape_dict = None
-
-    def get_node(self, name):
-        """get node from graph"""
-
-        assert name in self.nodes
-        return self.nodes[name]
-
-    def add_node(self, name, node):
-        """add a node to graph"""
-
-        self.nodes[name] = fold_constant(node)
-
-    def modify_node(self, name, params):
-        """modify node from graph"""
-
-        self.params[name] = params
-        self.nodes[name] = new_var(name, shape=params.shape, dtype=params.dtype)
-
-    def get_params(self, name=None):
-        """Get params from graph."""
-
-        if name is None:
-            return self.params
-        assert name in self.params, f"The name({name}) is not in params"
-        return self.params[name]
-
-    def extract_parameters(self, program, scope=None):
-        """Extract all the weights from PaddlePaddle program."""
-
-        self.params = {}
-        variables = program.global_block().vars
-        for name in variables:
-            if name.endswith("feed") or name.endswith("fetch"):
-                continue
-            # This judgment will cause the PaddleInference model
-            # exported by PaddleSlim to skip some operators
-            # that need to be read in NHWC format.
-            var = program.global_block().var(name)
-            if not var.persistable:
-                continue
-            if isinstance(scope, dict):
-                self.params[name] = _nd.array(scope[name])
-            else:
-                self.params[name] = _nd.array(np.array(scope.var(name).get_tensor()))
-            shape = self.params[name].shape
-            dtype = self.params[name].dtype
-            self.nodes[name] = new_var(name, shape=shape, dtype=dtype)
-
-    def check_input_shape(self, op, block):
-        """Check the shape information of model's inputs, fixed shape is recommended."""
-
-        ipt_name = op.input(op.input_names[0])
-        ipt_shape = block.var(ipt_name).shape
-        for i in ipt_shape:
-            if i < 0:
-                warning_msg = (
-                    f"Input {ipt_name}(shape={ipt_shape}) has unkown dimension shapes. "
-                    f"Specifying static values may improve performance"
-                )
-                warnings.warn(warning_msg)
-
-    def check_unsupported_ops(self, program):
-        """Check whether all the operators are supported."""
-
-        unsupported_ops = set()
-        for block in program.blocks:
-            for op in block.ops:
-                if op.type == "fetch":
-                    continue
-                if op.type not in _convert_map:
-                    unsupported_ops.add(op.type)
-        if len(unsupported_ops) > 0:
-            msg = "The following operators are not supported for frontend Paddle: "
-            msg += ", ".join(unsupported_ops)
-            raise tvm.error.OpNotImplemented(msg)
-
-    def ops_to_relay(self, program, input_specs=None):
-        """Convert PaddlePaddle operators to TVM relay functions."""
-
-        if input_specs is not None:
-            for input_spec in input_specs:
-                convert_feed(self, input_spec, None)
-        for block in program.blocks:
-            for op in block.ops:
-                if op.type == "fetch":
-                    continue
-                convert_func = _convert_map[op.type]
-                convert_func(self, op, block)
-
-    def from_program(self, program, shape_dict, scope):
-        """Construct the TVM relay expression from PaddlePaddle program."""
-
-        self.shape_dict = shape_dict
-        if scope is None:
-            import paddle
-
-            scope = paddle.static.global_scope()
-        self.check_unsupported_ops(program)
-        self.extract_parameters(program, scope)
-        self.ops_to_relay(program)
-
-        output_names = list()
-        for block in program.blocks:
-            for op in block.ops:
-                if op.type == "fetch":
-                    output_names.append(op.input("X")[0])
-        outputs = [self.nodes[name] for name in output_names]
-        outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-
-        free_vars = analysis.free_vars(outputs)
-        func = _function.Function(free_vars, outputs)
-        mod = IRModule.from_expr(func)
-        return mod, self.params
-
-    def from_translated_layer(self, layer, shape_dict):
-        """Construct the TVM relay expression from PaddlePaddle TranslatedLayer."""
-
-        self.shape_dict = shape_dict
-        program = layer.program()
-        parameters = dict()
-        for param in layer.parameters() + layer.buffers():
-            parameters[param.name] = np.array(param.value().get_tensor())
-        self.check_unsupported_ops(program)
-        self.extract_parameters(program, parameters)
-
-        input_specs = layer._input_spec()
-        self.ops_to_relay(program, input_specs)
-
-        output_names = [x.name for x in layer._output_spec()]
-
-        outputs = [self.nodes[name] for name in output_names]
-        outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-
-        free_vars = analysis.free_vars(outputs)
-        func = _function.Function(free_vars, outputs)
-        mod = IRModule.from_expr(func)
-        # remove unused parameters
-        final_params = dict()
-        for var in free_vars:
-            if var.name_hint in self.params:
-                final_params[var.name_hint] = self.params[var.name_hint]
-        self.params = final_params
-        return mod, self.params
-
-
-def from_paddle(program_or_layer, shape_dict=None, scope=None):
-    """Convert a PaddlePaddle model into an equivalent Relay Function.
-    PaddlePaddle Program/TranslatedLayer represent the computation graph of PaddlePaddle model,
-    and PaddlePaddle scope stores all the weights of PaddlePaddle model.
-
-    Parameters
-    ----------
-    program_or_layer : object of `paddle.static.Program` or `paddle.jit.TranslatedLayer`
-        Loaded model by `paddle.static.load_inference_model` or `paddle.jit.load`
-
-    shape_dict : dict of str to tuple/list, optional
-        The input shape of model
-
-    scope : object of `paddle.static.Scope`, optional
-        The scope that saves all the weights of model, use `paddle.static.global_scope` by default
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation
-
-    params : dict of str to tvm.nd.NDArray
-    """
-
-    import paddle
-
-    # disable system signal capturing in paddle framework
-    # the signal capturing may cause conflict while running autotvm with paddle frontend
-    paddle.disable_signal_handler()
-
-    g = GraphProto()
-    if isinstance(program_or_layer, paddle.jit.TranslatedLayer):
-        # model is loaded by `paddle.jit.load`
-        mod, params = g.from_translated_layer(program_or_layer, shape_dict)
-    elif isinstance(program_or_layer, paddle.static.Program):
-        # model is loaded by `paddle.static.load_inference_model`
-        mod, params = g.from_program(program_or_layer, shape_dict, scope)
-    else:
-        raise Exception("Only PaddlePaddle's Program and TranslatedLayer are supported.")
-    return mod, params
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
deleted file mode 100644
index 0d93ff987c6e..000000000000
--- a/python/tvm/relay/frontend/pytorch.py
+++ /dev/null
@@ -1,5474 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, too-many-lines, len-as-condition, no-else-return, unused-variable, too-many-nested-blocks
-# pylint: disable=consider-iterating-dictionary, invalid-name, unused-argument, unused-variable, broad-except
-# pylint: disable=import-outside-toplevel, simplifiable-if-expression, cell-var-from-loop, unnecessary-lambda
-# pylint: disable=missing-function-docstring, redefined-builtin, use-implicit-booleaness-not-comparison
-"""PT: PyTorch frontend."""
-import functools
-import itertools
-from abc import ABC
-from typing import Dict
-import math
-import re
-import sys
-
-import numpy as np
-import tvm
-from tvm.ir import IRModule
-from tvm.topi.utils import get_const_tuple
-
-from .. import analysis as _analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from .. import qnn, transform
-from ..expr_functor import ExprMutator
-from ..loops import while_loop
-from ..prelude import Prelude, StaticTensorArrayOps
-from ..ty import Any, TensorType, TupleType
-from . import qnn_torch
-from .common import AttrCvt, get_relay_op, gru_cell, logger, rnn_cell
-from .common import infer_shape as _infer_shape
-from .common import infer_value as _infer_value
-from .common import infer_value_simulated as _infer_value_simulated
-from .common import lstm_cell, try_infer_value, unbind, fold_constant
-from .common import set_span
-from .pytorch_utils import is_version_greater_than, getattr_attr_name
-
-__all__ = ["from_pytorch"]
-
-# This returns a "subgraph" which puts variables whenever
-# the type is known. It also records things to map the input
-# nodes to the extracted graph's nodes.
-# As Python objects are not round-trippable through C++, and
-# our type annotations only live in Python, we need to map
-# the nodes we get in visiting to the nodes
-# we used to construct the graph (they are the same in C++,
-# match each other in dictionary lookups, but are not the same
-# in Python) by using the hint dictionary filled as
-# {node: node for node in nodes} to get the type annotations.
-# https://discuss.tvm.apache.org/t/round-tripping-objects-through-the-ffi/8440
-class _TypeFinder(ExprMutator):
-    def __init__(self, types):
-        super().__init__()
-        self.counter = 0
-        self.vars = {}
-        self.types = types
-        self.leave = set()  # some variables are not inputs
-
-    def visit_let(self, let):
-        self.leave.add(let.var)
-        return super().visit_let(let)
-
-    def visit_function(self, fn):
-        self.leave.update(fn.params)
-        return super().visit_function(fn)
-
-    def visit(self, expr):
-        if expr in self.leave:
-            return super().visit(expr)
-        if expr in self.vars:
-            return self.vars[expr]
-        if isinstance(expr, tvm.relay.Var):
-            self.vars[expr] = expr
-            return expr
-        if expr in self.types:
-            ty = self.types[expr]
-            v = tvm.relay.var(f"_{self.counter}", type_annotation=ty)
-            self.counter += 1
-            self.vars[expr] = v
-            return v
-        v = super().visit(expr)
-        return v
-
-
-def _should_construct_dynamic_list(list_construct_node):
-    # if this list is element-accessed or modified at runtime, generate List ADT
-    def inplace_add_to_add(op_name):
-        if op_name == "aten::add_":
-            return "aten::add"
-        else:
-            return op_name
-
-    uses = _get_uses(list_construct_node)
-
-    for loop_use in filter(lambda use: use.user.kind() == "prim::Loop", uses):
-        block_input_index = loop_use.offset - 1
-        block = list(loop_use.user.blocks())[0]
-        list_loop_var = list(block.inputs())[block_input_index]
-        uses += _get_uses(list_loop_var.node())
-
-    op_names = map(inplace_add_to_add, set(use.user.kind() for use in uses))
-
-    list_ops = set(["aten::add", "aten::__getitem__"])
-    intersect = list_ops.intersection(op_names)
-
-    if len(intersect) > 0 and intersect != set(["aten::add"]):
-        return True
-
-    # if add op outputs list, it is dynamic so we need to construct List ADT
-    for use in filter(lambda use: use.user.kind() in ["aten::add", "aten::add_"], uses):
-        output_type = _get_node_type(use.user)
-        if output_type == "ListType":
-            return True
-
-    return False
-
-
-def _is_int_seq(seq):
-    # TODO (t-vi): handle non-int constants? (like numpy.intXX)
-    return len(seq) > 0 and all([isinstance(i, int) for i in seq])
-
-
-# operator implementation
-class PyTorchOpConverter:
-    """A helper class for holding PyTorch op converters."""
-
-    def __init__(
-        self, prelude, default_dtype, use_parser_friendly_name=False, preserve_pytorch_scopes=False
-    ):
-        self.prelude = prelude
-        self.default_dtype = default_dtype
-        self.create_convert_map()
-        self.types = {}  # map from nodes to (Relay) type annotations
-        self.source_map = {}  # map from graph node to its source name
-        self.op_type_dict = {}  # map from op type to its presenting order
-        self.current_op = []  # stack for recording current processing op
-        self.use_parser_friendly_name = use_parser_friendly_name
-        self.preserve_pytorch_scopes = preserve_pytorch_scopes
-
-    # this incrementally infers the type, see the comments on the type visitor
-    # above.
-    def infer_type(self, node, mod=None):
-        """An incremental method to infer the type of a node in the relay graph."""
-
-        if node in self.types:
-            return self.types[node]
-        if isinstance(node, tvm.relay.Var):
-            return node.type_annotation
-
-        tf = _TypeFinder(types=self.types)
-        new_node = tf.visit(node)
-        fn = _function.Function(list(tf.vars.values()), new_node)
-        new_mod = IRModule({"main": fn})
-        if mod is not None:
-            new_mod.update(mod)
-        new_mod = transform.RemoveUnusedFunctions()(new_mod)
-        new_mod = transform.InferType()(new_mod)
-        entry = new_mod["main"]
-        ty = entry.body.checked_type
-        self.types[node] = ty
-        return self.types[node]
-
-    def infer_type_with_prelude(self, val):
-        body = self.infer_type(val, self.prelude.mod)
-        return body
-
-    # list ADT utilities
-    def convert_to_list_adt(self, py_lst):
-        elem_tys = [self.infer_type_with_prelude(elem) for elem in py_lst]
-        msg = "List elements should have identical types"
-        assert all(map(lambda ty: ty == elem_tys[0], elem_tys)), msg
-
-        # get_type returns type_name, ctor1, ..., ctorN
-        # 1 is nil
-        _, cons, nil = self.prelude.mod.get_type("List")
-        adt_lst = nil()
-        for elem in reversed(py_lst):
-            adt_lst = cons(elem, adt_lst)
-        return adt_lst
-
-    def map_tensor_array_constructor(self, adt_lst, shape):
-        static_tensor_array_ops = StaticTensorArrayOps(self.prelude, "float32", shape)
-        static_tensor_array_ops.register()
-        tensor_create = self.prelude.get_tensor_ctor_static("tensor_constructor", "float32", shape)
-        return self.prelude.map(tensor_create, adt_lst)
-
-    def convert_to_tensor_array(self, adt_lst):
-        _, cons, nil = self.prelude.mod.get_type("List")
-        if self.prelude.length(adt_lst) == 0:
-            return nil()
-
-        checked_type = self.infer_type_with_prelude(self.prelude.hd(adt_lst))
-        shape = checked_type.shape
-        tensor_array = self.map_tensor_array_constructor(adt_lst, shape)
-        return tensor_array, tuple(shape)
-
-    def infer_shape(self, inputs, mod=None):
-        """A method to get the output type of an intermediate node in the graph."""
-        typ = self.infer_type(inputs, mod=mod)
-        if hasattr(typ, "shape"):
-            # Regular operator that outputs tensors
-            return get_const_tuple(typ.shape)
-        # The return type is not a tensor, for example List
-        return typ
-
-    def infer_shape_with_prelude(self, inputs):
-        return self.infer_shape(inputs, mod=self.prelude.mod)
-
-    def is_empty_shape(self, shape):
-        rank = len(shape)
-        if rank:
-            is_empty = False
-            for i in range(rank):
-                if shape[i] == 0:
-                    is_empty = True
-                    break
-            return is_empty
-        else:
-            return True
-
-    def record_output_type(self, output):
-        if isinstance(output, tuple):
-            cleaned_output = [o for o in output if o is not None]
-            types = self.infer_type_with_prelude(_expr.Tuple(cleaned_output))
-            for o, t in zip(cleaned_output, types.fields):
-                self.types[o] = t
-        elif isinstance(output, _expr.Expr):
-            self.infer_type_with_prelude(output)
-        # it can also happen that the type is int or so
-
-    def pytorch_promote_types(self, inputs, dtypes):
-        """This promotes TVM inputs with TVM dtypes passed like PyTorch would"""
-        actual_dtypes = []
-        for i, inp in enumerate(inputs):
-            if isinstance(inp, _expr.Expr):
-                idt = self.infer_type(inp).dtype
-                actual_dtypes.append(idt)
-            else:
-                actual_dtypes.append(dtypes[i])
-        dtypes = actual_dtypes
-        tensor_dtypes = [dt for inp, dt in zip(inputs, dtypes) if not np.isscalar(inp)]
-        non_tensor_inputs = [inp for inp in inputs if np.isscalar(inp)]
-        result_type = _pytorch_result_type(tensor_dtypes, non_tensor_inputs)
-        results = []
-        for inp, dt in zip(inputs, dtypes):
-            if np.isscalar(inp):
-                results.append(_expr.const(inp, dtype=result_type))
-            elif dt == result_type:
-                results.append(inp)
-            else:
-                results.append(_op.cast(inp, result_type))
-        return results
-
-    def is_quantized_tensor(self, data):
-        # If a quantized Torch module is saved and loaded back, dtype will be dropped
-        # Since dtypes from Torch tensors are not reliable in such cases, we use
-        # Relay's type inference result to decide if an input tensor is quantized
-        ty = self.infer_type_with_prelude(data)
-        return ty.dtype == "uint8"
-
-    # Operator implementations
-    def make_elemwise(self, name):
-        def elemwise(inputs, input_types):
-            if name == "divide":
-                # https://pytorch.org/docs/stable/generated/torch.div.html#torch.div
-                # None - default behavior. Performs no rounding and, if both input and
-                # other are integer types, promotes the inputs to the default scalar type.
-                if all(["int" in input_type for input_type in input_types[:2]]):
-                    input_types[:2] = ["float32"] * 2
-                    cast_inputs = []
-                    for inp in inputs[:2]:
-                        if np.isscalar(inp):
-                            cast_inputs.append(_expr.const(inp, dtype="float32"))
-                        else:
-                            cast_inputs.append(_op.cast(inp, "float32"))
-                    inputs[:2] = cast_inputs
-
-            data0, data1 = self.pytorch_promote_types(inputs[:2], input_types[:2])
-            return get_relay_op(name)(data0, data1)
-
-        return elemwise
-
-    def min_max_common(self, name_elemwise, name_reduce, inputs, input_types):
-        if len(inputs) == 1:
-            data = self.pytorch_promote_types(inputs[:1], input_types[:1])
-            return get_relay_op(name_reduce)(data[0])
-        elif len(inputs) >= 2 and isinstance(inputs[1], (list, int)):
-            data = self.pytorch_promote_types(inputs[:1], input_types[:1])
-            dim = inputs[1]
-            keepdims = inputs[2] if len(inputs) > 2 else False
-            # also return dummy indices
-            return get_relay_op(name_reduce)(data[0], axis=dim, keepdims=keepdims), None
-        else:
-            data0, data1 = self.pytorch_promote_types(inputs[:2], input_types[:2])
-            return get_relay_op(name_elemwise)(data0, data1)
-
-    def max(self, inputs, input_types):
-        return self.min_max_common("maximum", "max", inputs, input_types)
-
-    def min(self, inputs, input_types):
-        return self.min_max_common("minimum", "min", inputs, input_types)
-
-    def maximum(self, inputs, input_types):
-        data0, data1 = self.pytorch_promote_types(inputs[:2], input_types[:2])
-        return _op.maximum(data0, data1)
-
-    def minimum(self, inputs, input_types):
-        data0, data1 = self.pytorch_promote_types(inputs[:2], input_types[:2])
-        return _op.minimum(data0, data1)
-
-    def make_unary(self, name):
-        def unary(inputs, input_types):
-            # this is just to ensure tensor input
-            (data,) = self.pytorch_promote_types(inputs[:1], input_types[:1])
-            return get_relay_op(name)(data)
-
-        return unary
-
-    def log1p(self, inputs, input_types):
-        # 1_plus_log x = log(x + 1)
-        (dtype,) = input_types
-        one = _expr.const(1, dtype=dtype)
-        return _op.log(inputs[0] + one)
-
-    def square(self, inputs, input_types):
-        (dtype,) = input_types
-        return _op.power(inputs[0], _expr.const(2, dtype))
-
-    def lerp(self, inputs, input_types):
-        if len(inputs) != 3:
-            msg = f"Wrong number of arguments ({len(inputs)}) to parse."
-            raise AssertionError(msg)
-
-        start = inputs[0]
-        end = inputs[1]
-        weight = inputs[2]
-        return start + weight * (end - start)
-
-    def arange(self, inputs, input_types):
-        def _get_value(val, dtype):
-            # dtype is a tvm dtype
-            if isinstance(val, _expr.Expr):
-                # since "arange" op will fill expr into its attribute
-                # invoke set_span here to prevent expr-rewritten occurrs in span-filling stage
-                source_name = self.source_map[self.current_op[-1]]
-                inp = set_span(_op.cast(val, dtype), source_name)
-                ret, _ = try_infer_value(inp, lambda ret: _expr.const(ret, dtype))
-            else:
-                ret = _create_typed_const(val, dtype)
-            return ret
-
-        def _get_type(val, inp_type):
-            if isinstance(val, _expr.Expr):
-                dtype = str(self.infer_type(val))
-                return dtype
-            return inp_type
-
-        # PyTorch arange uses the following type semantics:
-        # - if a dtype is given, start, stop, step are converted to that dtype
-        # - if no dtype is given and all args are integral, dtype is int64
-        # - if no dtype is given and there is a float arg, dtype is float32
-        if len(inputs) in {5, 6, 7}:
-            # inputs look like [_,_,_,dtype,layout,device,requires_grad]
-            # therefore dtype_idx is always the length of inputs minus 4
-            dtype_idx = len(inputs) - 4
-            types = [_get_type(inputs[i], input_types[i]) for i in range(dtype_idx)]
-            if inputs[dtype_idx] is not None:
-                dtype = _convert_dtype_value(inputs[dtype_idx])
-            elif any([t.startswith("float") for t in types]):
-                dtype = "float32"
-            else:
-                dtype = "int64"
-
-            # - if len(inputs) == 5, inputs = [stop, dtype, ...]
-            # - if len(inputs) == 6, inputs = [start, stop, dtype, ...]
-            # - if len(inputs) == 7, inputs = [start, stop, step, dtype, ...]
-            start = _get_value(inputs[0], dtype) if len(inputs) > 5 else _expr.const(0, dtype)
-            stop = _get_value(inputs[1 if len(inputs) > 5 else 0], dtype)
-            step = _get_value(inputs[2], dtype) if len(inputs) > 6 else _expr.const(1, dtype)
-        else:
-            msg = f"Unknown number of arguments ({len(inputs)}) to parse."
-            raise AssertionError(msg)
-
-        return _op.transform.arange(start=start, stop=stop, step=step, dtype=dtype)
-
-    def squeeze(self, inputs, input_types):
-        data = inputs[0]
-        if len(inputs) == 1:
-            axis = None
-        else:
-            # TODO (t-vi): why is the cast to int needed? similarly elsewhere
-            inputs = [inputs[1]] if not isinstance(inputs[1], list) else inputs[1]
-            axis = [int(v) for v in inputs]
-
-        return _op.transform.squeeze(data, axis)
-
-    def unsqueeze(self, inputs, input_types):
-        data = inputs[0]
-        axis = inputs[1]
-
-        return _op.transform.expand_dims(data, int(axis), 1)
-
-    def concatenate(self, inputs, input_types):
-        def tensor_array_concat(lst, axis):
-            assert axis == 0, "Tensor array concat supported only for axis 0"
-            tensor_array, shape = self.convert_to_tensor_array(lst)
-            concat_shape = (Any(),) + shape[1:]
-            concat = self.prelude.get_global_var_static("tensor_array_concat", "float32", shape)
-            concatenated = concat(tensor_array)
-
-            static_tensor_array_ops = StaticTensorArrayOps(self.prelude, "float32", concat_shape)
-            static_tensor_array_ops.register()
-            get_tensor = self.prelude.get_global_var_static(
-                "tensor_get_data", "float32", concat_shape
-            )
-            return get_tensor(concatenated)
-
-        data = inputs[0]
-        axis = inputs[1]
-
-        if not isinstance(data, list):
-            return tensor_array_concat(data, axis)
-
-        if isinstance(data, _expr.Expr):
-            data = [data]
-
-        return _op.tensor.concatenate(data, int(axis))
-
-    def slice(self, inputs, input_types):
-        axis_dtype = "int64"
-        index_size_limit = sys.maxsize
-        data = inputs[0]
-        dshape = self.infer_shape(data)
-        ndim = len(dshape)
-        dim = int(inputs[1])
-        stride = inputs[4]
-
-        target_begin, is_begin_const = try_infer_value(
-            inputs[2], lambda ret: ret.astype(int).item(0)
-        )
-        target_end, is_end_const = try_infer_value(inputs[3], lambda ret: ret.astype(int).item(0))
-
-        # A fast path when slicing is nop.
-        if (
-            isinstance(target_begin, int)
-            and isinstance(target_end, int)
-            and target_begin == 0
-            and target_end >= index_size_limit
-            and stride == 1
-        ):
-            return data
-
-        if target_begin is None and target_end is None:
-            return data
-
-        # Process begin
-        begin = [0] * ndim
-
-        if target_begin is not None:
-            begin[dim] = target_begin
-
-        if target_begin is not None and not isinstance(begin[dim], int):
-            tmp = []
-            for b in begin:
-                if isinstance(b, int):
-                    tmp.append(_op.expand_dims(_expr.const(b, axis_dtype), axis=0))
-                else:
-                    tmp.append(_op.cast(_op.expand_dims(b, axis=0), axis_dtype))
-            begin = _op.concatenate(tmp, axis=0)
-            btype = self.infer_type(begin).dtype
-            if str(btype) != axis_dtype:
-                begin = _op.cast(begin, axis_dtype)
-
-        # Process end
-        if isinstance(target_end, int) and target_end >= index_size_limit:
-            target_end = dshape[dim]
-
-        if any([isinstance(d, tvm.tir.Any) for d in dshape]):
-            end = _op.shape_of(data)
-        else:
-            end = dshape
-
-        if isinstance(target_end, int):
-            if isinstance(end, list):
-                end[dim] = target_end
-            else:
-                all_static = True
-                for i, shape_dim in enumerate(dshape):
-                    if i != dim and isinstance(shape_dim, tvm.tir.Any):
-                        all_static = False
-
-                if all_static:
-                    end = list(get_const_tuple(dshape))
-                    end[dim] = target_end
-                else:
-                    target_end = _expr.const(target_end)
-                    end = _op.scatter_elements(
-                        end,
-                        _op.expand_dims(_expr.const(dim), axis=0),
-                        _op.expand_dims(target_end, axis=0),
-                        axis=0,
-                    )
-        else:
-            end = _op.cast(_op.shape_of(data), axis_dtype)
-            if target_end is not None and not isinstance(target_end, tvm.tir.Any):
-                ttype = self.infer_type(target_end).dtype
-                if str(ttype) != axis_dtype:
-                    target_end = _op.cast(target_end, axis_dtype)
-                end = _op.scatter_elements(
-                    end,
-                    _op.expand_dims(_expr.const(dim), axis=0),
-                    _op.expand_dims(target_end, axis=0),
-                    axis=0,
-                )
-
-        if not isinstance(end, list):
-            etype = self.infer_type(end).dtype
-            if str(etype) != axis_dtype:
-                end = _op.cast(end, axis_dtype)
-
-        strides = [1] * ndim
-        strides[dim] = stride
-
-        return _op.transform.strided_slice(
-            data, begin=begin, end=end, strides=strides, slice_mode="end"
-        )
-
-    def narrow(self, inputs, input_types):
-        # Inputs are:
-        # 0 - the tensor to narrow
-        # 1 - the dimension along which to narrow
-        # 2 - the starting dimension
-        # 3 - the distance to the ending dimension
-        # Lets find the ending dimension
-        end = self.add(inputs[2:4], input_types[2:4])
-        stride = 1
-        slice_input = inputs[:3] + [end, stride]
-        slice_types = input_types + ["int32"]
-        return self.slice(slice_input, slice_types)
-
-    def split(self, inputs, input_types):
-        data = inputs[0]
-        split_size = int(inputs[1])
-        dim = int(inputs[2])
-
-        split_index = split_size
-        indices = []
-        while split_index < self.infer_shape(data)[dim]:
-            indices.append(split_index)
-            split_index += split_size
-
-        return _op.split(data, indices, dim)
-
-    def split_with_sizes(self, inputs, input_types):
-        data = inputs[0]
-        sections = inputs[1]
-        dim = int(inputs[2])
-
-        if len(sections) == 1:
-            # a special case used in torchvision detection models
-            return _expr.TupleWrapper(_expr.Tuple([data]), 1)
-
-        split_index = 0
-        indices = []
-        for i in range(len(sections) - 1):
-            index, _ = try_infer_value(sections[i], lambda ret: int(ret))
-            split_index += index
-            indices.append(split_index)
-
-        return _op.split(data, indices, dim)
-
-    def tensor_split(self, inputs, input_types):
-        # Reference: https://pytorch.org/docs/stable/generated/torch.tensor_split.html
-        import torch
-
-        if not isinstance(inputs[1], (int, list, tuple, torch.Tensor)):
-            msg = (
-                f"indices_or_sections type {type(inputs[1])} could not be parsed in "
-                f"tensor_split op"
-            )
-            raise AssertionError(msg)
-
-        if isinstance(inputs[1], torch.Tensor) and len(inputs[1].shape) not in [0, 1]:
-            msg = "indices_or_sections must be a zero-dimensional or one-dimensional long tensor"
-            raise AssertionError(msg)
-
-        if isinstance(inputs[1], int) or (
-            isinstance(inputs[1], torch.Tensor) and list(inputs[1].shape) == []
-        ):
-            data = inputs[0]
-            n = int(inputs[1])
-            dim = int(inputs[2])
-
-            split_size = int(self.infer_shape(data)[dim] / n)
-            split_rest = int(self.infer_shape(data)[dim] % n)
-
-            indices = []
-            split_index = split_size
-            if split_rest == 0:
-                for i in range(n - 1):
-                    indices.append(split_index)
-                    split_index += split_size
-            else:
-                for i in range(split_rest):
-                    indices.append(split_index + 1)
-                    split_index = (i + 1) * (split_index + 1)
-                for i in range(n - split_rest - 1):
-                    split_index += split_size
-                    indices.append(split_index)
-
-            return _op.split(data, indices, dim)
-        else:
-            data = inputs[0]
-            sections = inputs[1]
-            dim = int(inputs[2])
-
-            if isinstance(sections, tuple):
-                sections = list(sections)
-            elif isinstance(sections, torch.Tensor):
-                sections = sections.cpu().numpy().tolist()
-
-            return _op.split(data, sections, dim)
-
-    def select(self, inputs, input_types):
-        data = inputs[0]
-        dim = int(inputs[1])
-        index = _wrap_const(inputs[2])
-        return _op.transform.take(data, index, axis=dim, mode="wrap")
-
-    def take(self, inputs, input_types):
-        data = inputs[0]
-        indices = _op.cast(inputs[1], "int32")
-
-        return _op.transform.take(data, indices=indices, mode="wrap")
-
-    def topk(self, inputs, input_types):
-        data = inputs[0]
-        axis = int(inputs[2])
-        is_ascend = not bool(inputs[3])
-        sort = bool(inputs[4])
-
-        if isinstance(inputs[1], _expr.Expr):
-            k, _ = try_infer_value(inputs[1], lambda ret: ret.tolist())
-        else:
-            k = inputs[1]
-
-        if not sort:
-            msg = "Currently supports only sorted output for topk operator."
-            raise AssertionError(msg)
-
-        outs = _op.topk(data, k=k, axis=axis, is_ascend=is_ascend, ret_type="both", dtype="int64")
-
-        return outs[0], outs[1]
-
-    def reciprocal(self, inputs, input_types):
-        data = inputs[0]
-        return _expr.const(1.0, dtype=input_types[0]) / data
-
-    def repeat(self, inputs, input_types):
-        data = inputs[0]
-        reps = []
-        for r in inputs[1]:
-            if isinstance(r, int):
-                reps.append(r)
-            else:
-                reps.append(int(_infer_value(r, {}).numpy()))
-
-        return _op.transform.tile(data, reps=reps)
-
-    def repeat_interleave(self, inputs, input_types):
-        data = inputs[0]
-        if isinstance(inputs[1], int):
-            repeats = inputs[1]
-            axis = inputs[2]
-        elif isinstance(inputs[1], _expr.Expr):
-            if isinstance(inputs[1], _expr.Constant):
-                repeats = int(inputs[1].data.numpy())
-            else:
-                repeats, _ = try_infer_value(inputs[1], lambda ret: ret.tolist())
-            axis = inputs[2]
-        else:
-            msg = "Only repeat with one value as repeat is currently supported."
-            raise AssertionError(msg)
-        if axis is None:  # Flatten the data if no axis is given from torch
-            data = _op.transform.reshape(data, [-1])
-            axis = 0
-        return _op.transform.repeat(data, repeats=repeats, axis=axis)
-
-    def addcdiv(self, inputs, input_types):
-        data, t1, t2, c = self.pytorch_promote_types(inputs[:4], input_types[:4])
-        return data + (c * (t1 / t2))
-
-    def addcmul(self, inputs, input_types):
-        data, t1, t2, c = self.pytorch_promote_types(inputs[:4], input_types[:4])
-        return data + (c * (t1 * t2))
-
-    def where(self, inputs, input_types):
-        if len(inputs) == 1:
-            return self.nonzero([inputs[0], True], input_types)
-
-        cond = inputs[0]
-        x, y = self.pytorch_promote_types(inputs[1:3], input_types[1:3])
-        return _op.where(cond, x, y)
-
-    def full_impl(self, data, fill_value, dtype):
-        size = []
-        need_reshape = False
-        new_shape = []
-        for dim in data:
-            if isinstance(dim, _expr.Expr):
-                if isinstance(dim, _expr.Constant):
-                    dim = int(dim.data.numpy())
-                    if isinstance(size, list):
-                        size.append(dim)
-                    new_shape.append(dim)
-                else:
-                    dim, success = try_infer_value(dim, lambda ret: int(ret), lambda: 0)
-                    new_shape.append(dim)
-
-                    if success:
-                        if isinstance(size, list):
-                            size.append(dim)
-                    else:
-                        size = None
-                        need_reshape = True
-            else:
-                if isinstance(size, list):
-                    size.append(dim)
-                new_shape.append(dim)
-
-        if size is None:
-            tmp = []
-            for dim in data:
-                tmp.append(_op.cast(_op.expand_dims(dim, axis=0), "int64"))
-            size = _op.concatenate(tmp, axis=0)
-
-        if not isinstance(fill_value, _expr.Constant):
-            if isinstance(fill_value, _expr.Expr):
-                fill_value = _infer_value(fill_value, {})
-            fill_value = _expr.const(fill_value, dtype=dtype)
-        out = _op.full(fill_value, size, dtype=dtype)
-        if need_reshape:
-            out = _op.reshape(out, new_shape)
-        return out
-
-    def ones(self, inputs, input_types):
-        data = inputs[0]
-
-        import torch
-
-        if not isinstance(data, (_expr.Expr, list, torch.Tensor, np.ndarray)):
-            msg = f"Data type {type(data)} could not be parsed in ones op"
-            raise AssertionError(msg)
-
-        if inputs[1] is not None:
-            dtype = _convert_dtype_value(inputs[1])
-        else:
-            dtype = self.default_dtype
-        return self.full_impl(data, 1, dtype)
-
-    def ones_like(self, inputs, input_types):
-        data = inputs[0]
-        out = _op.ones_like(data)
-
-        # If the input and the output datatype is different, do a cast
-        if inputs[1] is not None:
-            dtype = _convert_dtype_value(inputs[1])
-        else:
-            dtype = self.default_dtype
-        if input_types[0] != dtype:
-            out = _op.cast(out, dtype)
-
-        return out
-
-    def new_ones(self, inputs, input_types):
-        size = inputs[1]
-
-        import torch
-
-        if not isinstance(size, (_expr.Expr, list, tuple, torch.Size, np.ndarray)):
-            msg = f"Data type {type(size)} could not be parsed in ones op"
-            raise AssertionError(msg)
-
-        if inputs[2] is not None:
-            dtype = _convert_dtype_value(inputs[2])
-        else:
-            dtype = input_types[0]
-        return self.full_impl(size, 1, dtype)
-
-    def zeros(self, inputs, input_types):
-        data = inputs[0]
-
-        import torch
-
-        if not isinstance(data, (_expr.Expr, list, torch.Tensor, np.ndarray)):
-            msg = f"Data type {type(data)} could not be parsed in zeros op"
-            raise AssertionError(msg)
-
-        if inputs[1] is not None:
-            dtype = _convert_dtype_value(inputs[1])
-        else:
-            dtype = self.default_dtype
-        return self.full_impl(data, 0, dtype)
-
-    def zero_(self, inputs, input_types):
-        data = inputs[0]
-        return self.full_impl(self.infer_shape(data), 0, input_types[0])
-
-    def zeros_like(self, inputs, input_types):
-        data = inputs[0]
-        out = _op.zeros_like(data)
-
-        # If the input and the output datatype is different, do a cast
-        if inputs[1] is not None:
-            dtype = _convert_dtype_value(inputs[1])
-        else:
-            dtype = self.default_dtype
-        if input_types[0] not in dtype:
-            out = _op.cast(out, dtype)
-
-        return out
-
-    def new_zeros(self, inputs, input_types):
-        data = inputs[1]
-
-        import torch
-
-        if not isinstance(data, (_expr.Expr, list, tuple, torch.Size)):
-            msg = f"Data type {type(data)} could not be parsed in new_zeros op"
-            raise AssertionError(msg)
-
-        if inputs[2] is not None:
-            dtype = _convert_dtype_value(inputs[2])
-        else:
-            # if dtype is None, use the dtype of the input tensor
-            dtype = self.infer_type(inputs[0]).dtype
-        return self.full_impl(data, 0, dtype)
-
-    def full(self, inputs, input_types):
-        data = inputs[0]
-        fill_value = inputs[1]
-
-        import torch
-
-        if not isinstance(data, (_expr.Expr, list, torch.Tensor, np.ndarray)):
-            msg = f"Data type {type(data)} could not be parsed in full op"
-            raise AssertionError(msg)
-
-        if inputs[2] is not None:  # dtype given
-            dtype = _convert_dtype_value(inputs[2])
-        else:
-            # if dtype is None, torch uses a global default set by torch.set_default_tensor_type()
-            dtype = self.default_dtype
-
-        return self.full_impl(data, fill_value, dtype)
-
-    def full_like(self, inputs, input_types):
-        data = inputs[0]
-        fill_value = inputs[1]
-
-        out = _op.full_like(data, _expr.const(fill_value))
-
-        # If the input and the output datatype is different, do a cast
-        if inputs[2] is not None:  # dtype given
-            dtype = _convert_dtype_value(inputs[2])
-        else:
-            # if dtype is None, torch uses a global default set by torch.set_default_tensor_type()
-            dtype = self.default_dtype
-        if input_types[0] not in dtype:
-            out = _op.cast(out, dtype)
-
-        return out
-
-    def new_full(self, inputs, input_types):
-        data = inputs[1]
-        fill_value = inputs[2]
-        import torch
-
-        if not isinstance(data, (_expr.Expr, list, tuple, torch.Size)):
-            msg = f"Data type {type(data)} could not be parsed in full op"
-            raise AssertionError(msg)
-
-        if inputs[3] is not None:  # dtype given
-            dtype = _convert_dtype_value(inputs[3])
-        else:
-            # if dtype is None, use the dtype of the input tensor
-            dtype = self.infer_type(inputs[0]).dtype
-
-        return self.full_impl(data, fill_value, dtype)
-
-    def fill_(self, inputs, input_types):
-        data = inputs[0]
-        fill_value = inputs[1]
-        if not isinstance(fill_value, (bool, int, float, complex)):
-            fill_value = fold_constant(fill_value)
-        return self.full_impl(self.infer_shape(data), fill_value, input_types[0])
-
-    def linspace(self, inputs, input_types):
-        start = inputs[0]
-        stop = inputs[1]
-        step = inputs[2]
-
-        # Find the spacing between values as step
-        if step != 1:
-            step = (stop - start) / (step - 1)
-            stop = stop + (step / 2)
-        else:
-            stop = start + step
-
-        if inputs[3] is None:
-            import torch
-
-            dtype = _convert_data_type(str(torch.get_default_dtype()))
-        else:
-            dtype = _convert_dtype_value(inputs[3])
-
-        start = _create_typed_const(start, dtype)
-        stop = _create_typed_const(stop, dtype)
-        step = _create_typed_const(step, dtype)
-
-        return _op.transform.arange(start=start, stop=stop, step=step, dtype=dtype)
-
-    def relu(self, inputs, input_types):
-        data = inputs[0]
-        if self.is_quantized_tensor(data):
-            assert len(inputs) == 3, "Input quant param not found in op inputs"
-            input_zero_point = _expr.const(inputs[2], dtype="int32")
-            return qnn_torch.quantized_relu(data, input_zero_point)
-        return _op.nn.relu(data)
-
-    def relu6(self, inputs, input_types):
-        data = inputs[0]
-        return _op.tensor.clip(data, 0.0, 6.0)
-
-    def prelu(self, inputs, input_types):
-        # Reference: https://pytorch.org/docs/stable/generated/torch.nn.PReLU.html#torch.nn.PReLU
-        data = inputs[0]
-        dim = self.get_dims(data)
-        ndims = len(dim)
-        axis = 0 if ndims == 1 else 1
-        alpha = _op.broadcast_to(inputs[1], (dim[axis]))
-        return _op.nn.prelu(data, alpha, axis)
-
-    def leaky_relu(self, inputs, input_types):
-        data = inputs[0]
-        alpha = float(inputs[1])
-        return _op.nn.leaky_relu(data, alpha)
-
-    def elu(self, inputs, input_types):
-        data = inputs[0]
-        dtype = input_types[0]
-        alpha = _expr.const(-float(inputs[1]), dtype=dtype)
-        return alpha * _op.nn.relu(_expr.const(1, dtype=dtype) - _op.exp(data)) + _op.nn.relu(data)
-
-    def celu(self, inputs, input_types):
-        data = inputs[0]
-        dtype = input_types[0]
-        alpha = _expr.const(float(inputs[1]), dtype=dtype)
-        zero = _op.const(0, dtype)
-        return alpha * _op.minimum(
-            zero, _op.exp(data / alpha) - _expr.const(1, dtype=dtype)
-        ) + _op.nn.relu(data)
-
-    def gelu(self, inputs, input_types):
-        data = inputs[0]
-        dtype = input_types[0]
-        # gelu is data  * normcdf(data)
-        # normcdf expressed as erf because we don't currently have that intrinsic
-        # note that there is also a fastgelu variant approximating normcdf
-        # with tanh and third order polynomials, but this is "true" gelu
-        return data * (
-            _expr.const(0.5, dtype=dtype)
-            + _op.erf(data * _expr.const(0.5**0.5, dtype=dtype)) * _expr.const(0.5, dtype=dtype)
-        )
-
-    def selu(self, inputs, input_types):
-        data = inputs[0]
-        # https://pytorch.org/docs/stable/nn.html#selu
-        dtype = input_types[0]
-        alpha = _expr.const(-1.6732632423543772848170429916717, dtype=dtype)
-        gamma = _expr.const(1.0507009873554804934193349852946, dtype=dtype)
-        return gamma * (
-            alpha * _op.nn.relu(_expr.const(1.0, dtype=dtype) - _op.exp(data)) + _op.nn.relu(data)
-        )
-
-    def silu(self, inputs, input_types):
-        data = inputs[0]
-        return data * _op.tensor.sigmoid(data)
-
-    def glu(self, inputs, input_types):
-        """
-        Applies the gated linear unit function GLU(a,b)= a * sigmoid(b)
-        where a is the first half of the input matrices and b is the second half.
-        Link: https://pytorch.org/docs/stable/generated/torch.nn.GLU.html
-        """
-        data = inputs[0]
-        dim = inputs[1]
-        relay_tup = _op.transform.split(data, 2, dim)
-        return relay_tup[0] * _op.tensor.sigmoid(relay_tup[1])
-
-    def log_sigmoid(self, inputs, input_types):
-        data = inputs[0]
-        mn = _op.minimum(_op.const(0, dtype=input_types[0]), data)
-        z = _op.exp(-_op.abs(data))
-        return mn - self.log1p([z], input_types)
-
-    def cross_entropy_loss_with_logits(self, inputs, input_types):
-        input = inputs[0]
-        target = inputs[1]
-        weights = inputs[2]
-        reduction = inputs[3]
-        ignore_index = inputs[4]
-        label_smoothing = inputs[5]
-        input_shape = self.infer_shape(input)
-        target_shape = self.infer_shape(target)
-        if input_shape != target_shape:
-            if reduction == 0:
-                reduction = "none"
-            elif reduction == 1:
-                reduction = "mean"
-            else:
-                reduction = "sum"
-            num_class = self.infer_shape(input)[1]
-            if weights is None:
-                weights = _op.full(_expr.const(1), (num_class,), dtype=input_types[0])
-            return _op.nn.nll_loss(
-                _op.nn.log_softmax(input), target, weights, reduction, ignore_index
-            )
-        assert reduction == 1, "reduction not supported in cross_entropy_loss"
-        assert ignore_index == -100, "ignore_index not supported in cross_entropy_loss"
-        assert label_smoothing == 0.0, "label_smoothing not supported in cross_entropy_loss"
-        assert weights is None, "weight not supported in cross_entropy_loss"
-        return _op.nn.cross_entropy_with_logits(_op.nn.log_softmax(input), target)
-
-    def l1_loss(self, inputs, input_types):
-        assert len(inputs) == 3
-        [predictions, targets, reduction] = inputs
-        delta = _op.abs(_op.subtract(predictions, targets))
-        if reduction == 0:
-            # reduction = "none"
-            return delta
-        elif reduction == 1:
-            # reduction = "mean"
-            return _op.mean(delta)
-        else:
-            # reduction = "sum"
-            return _op.sum(delta)
-
-    def mse_loss(self, inputs, input_types):
-        assert len(inputs) == 3
-        [predictions, targets, reduction] = inputs
-        delta = _op.subtract(predictions, targets)
-        delta = _op.power(delta, _expr.const(2, input_types[0]))
-        if reduction == 0:
-            # reduction = "none"
-            return delta
-        elif reduction == 1:
-            # reduction = "mean"
-            return _op.mean(delta)
-        else:
-            # reduction = "sum"
-            return _op.sum(delta)
-
-    def hard_sigmoid(self, inputs, input_types):
-        def _relu6(x):
-            return _op.tensor.clip(x, 0.0, 6.0)
-
-        def func(x):
-            return _relu6(x + _expr.const(3.0)) / _expr.const(6.0)
-
-        if self.is_quantized_tensor(inputs[0]):
-            input_scale = _expr.const(inputs[1])
-            input_zero_point = _expr.const(inputs[2])
-            # PyTorch seems to use the following output qparams, but accuracy
-            # is broken if we use this.
-            # TODO(masahi): Revisit this parameter choice
-            #
-            # Taken from src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
-            # output_scale = _expr.const(0.00390625)  # 1.0 / 2^8
-            # output_zero_point = _expr.const(-128)
-            output_scale = input_scale
-            output_zero_point = input_zero_point
-
-            data = qnn.op.dequantize(inputs[0], input_scale, input_zero_point, axis=1)
-            out = func(data)
-            return qnn.op.quantize(out, output_scale, output_zero_point, out_dtype="uint8")
-
-        return func(inputs[0])
-
-    def hard_swish(self, inputs, input_types):
-        data = inputs[0]
-        return data * self.hard_sigmoid(inputs, input_types)
-
-    def adaptive_avg_pool(self, op, inputs, input_types):
-        data = inputs[0]
-        output_size = inputs[1]
-        for i, item in enumerate(output_size):
-            if isinstance(item, tvm.relay.expr.Constant):
-                # convert Constant to int
-                output_size[i] = item.data.numpy()[()]
-
-        def func(x):
-            return op(x, output_size=output_size)
-
-        if self.is_quantized_tensor(data):
-            return qnn_torch.apply_with_upcast(data, func)
-
-        return func(data)
-
-    def adaptive_max_pool(self, op, inputs, input_types):
-        data = inputs[0]
-        output_size = inputs[1]
-        for i, item in enumerate(output_size):
-            if isinstance(item, tvm.relay.expr.Constant):
-                # convert Constant to int
-                output_size[i] = item.data.numpy()[()]
-        # returns dummy indices too
-        return op(data, output_size=output_size), None
-
-    @staticmethod
-    def convert_const_list(data):
-        if isinstance(data, list):
-            for i, _ in enumerate(data):
-                if isinstance(data[i], _expr.Expr):
-                    data[i] = int(_infer_value_simulated(data[i], {}).numpy())
-        return data
-
-    def maxpool_2d(self, inputs, input_types):
-        data = inputs[0]
-
-        pool_size = self.convert_const_list(inputs[1])
-        strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size)
-        padding = inputs[3]
-        dilation = inputs[4]
-        ceil_mode = int(inputs[5])
-
-        return _op.nn.max_pool2d(
-            data,
-            pool_size=pool_size,
-            strides=strides,
-            dilation=dilation,
-            padding=padding,
-            layout="NCHW",
-            ceil_mode=ceil_mode,
-        )
-
-    def maxpool_2d_with_indices(self, inputs, input_types):
-        # returns dummy indices too
-        return self.maxpool_2d(inputs, input_types), None
-
-    def maxpool_1d(self, inputs, input_types):
-        data = inputs[0]
-
-        pool_size = inputs[1]
-        strides = inputs[2] if inputs[2] else pool_size
-        padding = inputs[3]
-        dilation = inputs[4]
-        ceil_mode = int(inputs[5])
-
-        return _op.nn.max_pool1d(
-            data,
-            pool_size=pool_size,
-            strides=strides,
-            dilation=dilation,
-            padding=padding,
-            layout="NCW",
-            ceil_mode=ceil_mode,
-        )
-
-    def maxpool_3d(self, inputs, input_types):
-        data = inputs[0]
-
-        need_squeeze = False
-        if len(self.get_dims(data)) == 4:
-            need_squeeze = True
-            data = _op.expand_dims(data, 0)
-        pool_size = inputs[1]
-        strides = inputs[2] if inputs[2] else pool_size
-        padding = inputs[3]
-        dilation = inputs[4]
-        ceil_mode = int(inputs[5])
-
-        res = _op.nn.max_pool3d(
-            data,
-            pool_size=pool_size,
-            strides=strides,
-            dilation=dilation,
-            padding=padding,
-            ceil_mode=ceil_mode,
-        )
-        return res if not need_squeeze else _op.squeeze(res, [0])
-
-    def hardtanh(self, inputs, input_types):
-        a = inputs[0]
-        tanh_min = float(inputs[1])
-        tanh_max = float(inputs[2])
-        return _op.tensor.clip(a, tanh_min, tanh_max)
-
-    def convolution(self, inputs, input_types):
-        # Use transpose or normal
-        use_transpose = True if inputs[6] == 1 else False
-
-        data = inputs[0]
-        weight = inputs[1]
-        bias = inputs[2]
-        strides = tuple(inputs[3])
-        padding = tuple(inputs[4])
-        dilation = tuple(inputs[5])
-
-        if isinstance(weight, _expr.Expr):
-            inferred_shape = self.infer_shape(weight)
-            weight_shape = []
-            for infer in inferred_shape:
-                weight_shape.append(infer)
-        else:
-            msg = f"Data type {type(weight)} could not be parsed in conv op"
-            raise AssertionError(msg)
-
-        groups = int(inputs[8])
-
-        if use_transpose:
-            channels = weight_shape[1] * groups
-            in_channels = weight_shape[0]
-        else:
-            channels = weight_shape[0]
-            in_channels = weight_shape[1]
-
-        # Check if this is depth wise convolution
-        # We need to reshape weight so that Relay could recognize this is depth wise
-        # weight_shape[1] is always in_channels // groups
-        # For depthwise, in_channels == groups, so weight_shape[1] == 1
-        # If groups > 1 but weight_shape[1] != 1, this is group convolution
-        if groups > 1 and in_channels == 1:
-            channel_multiplier = channels // groups
-            new_weight_shape = (groups, channel_multiplier) + tuple(weight_shape[2:])
-            weight = _op.transform.reshape(weight, new_weight_shape)
-
-        kernel_size = weight_shape[2:]
-        use_bias = isinstance(bias, _expr.Expr)
-
-        # We are trying to invoke various relay operations through a single conv_op variable.
-        # However the function signatures for some operations have additional attributes so we
-        # pass these in along with the standard ones.
-        additional_arguments = dict()
-
-        if use_transpose:
-            if len(kernel_size) == 3:
-                conv_op = _op.nn.conv3d_transpose
-            elif len(kernel_size) == 2:
-                conv_op = _op.nn.conv2d_transpose
-            else:
-                conv_op = _op.nn.conv1d_transpose
-            output_padding = tuple(inputs[7])
-            additional_arguments["output_padding"] = output_padding
-
-        else:
-            if len(kernel_size) == 3:
-                conv_op = _op.nn.conv3d
-            elif len(kernel_size) == 2:
-                conv_op = _op.nn.conv2d
-            else:
-                conv_op = _op.nn.conv1d
-
-        if len(kernel_size) == 3:
-            data_layout = "NCDHW"
-            kernel_layout = "OIDHW"
-            if use_transpose:
-                # Transposed convolutions have IODHW layout.
-                kernel_layout = "IODHW"
-        elif len(kernel_size) == 2:
-            data_layout = "NCHW"
-            kernel_layout = "OIHW"
-            if use_transpose:
-                # Transposed convolutions have IOHW layout.
-                kernel_layout = "IOHW"
-        else:
-            data_layout = "NCW"
-            kernel_layout = "OIW"
-            if use_transpose:
-                # Transposed convolutions have IOW layout.
-                kernel_layout = "IOW"
-
-        # Conv1d does not currently support grouped convolution so we convert it to conv2d
-        is_grouped_conv1d = False
-        if groups > 1 and len(kernel_size) == 1 and not use_transpose:
-            is_grouped_conv1d = True
-            conv_op = _op.nn.conv2d
-            kernel_size = [1] + kernel_size
-            strides = (1,) + strides
-            padding = (0,) + padding
-            dilation = (1,) + dilation
-            data = _op.expand_dims(data, axis=2)
-            weight = _op.expand_dims(weight, axis=2)
-            data_layout = "NCHW"
-            kernel_layout = "OIHW"
-
-        conv_out = conv_op(
-            data,
-            weight,
-            strides=strides,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            channels=channels,
-            kernel_size=kernel_size,
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_layout="",
-            out_dtype="",
-            **additional_arguments,
-        )
-        if use_bias:
-            res = _op.nn.bias_add(conv_out, bias)
-        else:
-            res = conv_out
-        if is_grouped_conv1d:
-            # Because we conducted grouped conv1d convolution through conv2d we must
-            # squeeze the output to get the correct result.
-            res = _op.squeeze(res, axis=[2])
-        return res
-
-    def softmax(self, inputs, input_types):
-        data = inputs[0]
-        axis = inputs[1]
-        if isinstance(axis, str):
-            axis = int(axis)
-
-        return _op.nn.softmax(data, axis=axis)
-
-    def threshold(self, inputs, input_types):
-        data = inputs[0]
-        threshold_f = float(inputs[1])
-        threshold_ = _op.full_like(inputs[0], fill_value=_expr.const(threshold_f))
-        value_f = float(inputs[2])
-        value = _op.full_like(inputs[0], fill_value=_expr.const(value_f))
-        return _op.where(_op.greater(data, threshold_), data, value)
-
-    def contiguous(self, inputs, input_types):
-        return inputs[0]
-
-    def batch_norm(self, inputs, input_types):
-        data = inputs[0]
-        data_type = input_types[0]
-
-        channels = self.infer_shape(data)
-
-        scale = isinstance(inputs[1], _expr.Expr)
-        if scale:
-            gamma = inputs[1]
-        else:
-            gamma = _create_typed_const(np.ones([int(channels[1])]), data_type)
-
-        center = isinstance(inputs[2], _expr.Expr)
-        if center:
-            beta = inputs[2]
-        else:
-            beta = _create_typed_const(np.zeros([int(channels[1])]), data_type)
-
-        moving_mean = inputs[3]
-        moving_var = inputs[4]
-        epsilon = float(inputs[7])
-
-        return _op.nn.batch_norm(
-            data,
-            gamma,
-            beta,
-            moving_mean,
-            moving_var,
-            axis=1,
-            epsilon=epsilon,
-            center=center,
-            scale=scale,
-        )[0]
-
-    def instance_norm(self, inputs, input_types):
-        data = inputs[0]
-        data_type = input_types[0]
-        channels = self.infer_shape(data)
-        running_mean = inputs[3]
-        running_var = inputs[4]
-        use_input_stats = inputs[5]
-
-        if isinstance(inputs[1], _expr.Expr) and isinstance(inputs[2], _expr.Expr):
-            scale = center = True
-            weight = inputs[1]
-            beta = inputs[2]
-            gamma = weight
-        else:
-            scale = center = False
-
-        if not scale:
-            gamma = _create_typed_const(np.ones([int(channels[1])]), data_type)
-
-        if not center:
-            beta = _create_typed_const(np.zeros([int(channels[1])]), data_type)
-
-        epsilon = float(inputs[7])
-
-        if not use_input_stats:
-            return _op.nn.batch_norm(
-                data,
-                gamma,
-                beta,
-                running_mean,
-                running_var,
-                axis=1,
-                epsilon=epsilon,
-                center=center,
-                scale=scale,
-            )[0]
-
-        return _op.nn.instance_norm(
-            data, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale
-        )
-
-    def get_dims(self, data):
-        import torch
-
-        if isinstance(data, _expr.Expr):
-            dims = self.infer_shape(data)
-        elif isinstance(data, list):
-            dims = data
-        elif isinstance(data, (torch.Tensor, np.ndarray)):
-            dims = data.shape
-        else:
-            msg = f"Data type {type(data)} could not be parsed"
-            raise AssertionError(msg)
-        return dims
-
-    def layer_norm(self, inputs, input_types):
-        data = inputs[0]
-        ndims = len(self.get_dims(inputs[1]))
-        assert ndims == 1, "Support only normalization over last one dimension."
-
-        return _op.nn.layer_norm(
-            data,
-            gamma=inputs[2],
-            beta=inputs[3],
-            axis=-1,
-            epsilon=float(inputs[4]),
-            center=True,
-            scale=True,
-        )
-
-    def group_norm(self, inputs, input_types):
-        data = inputs[0]
-        gamma = inputs[2]
-        beta = inputs[3]
-        num_groups = inputs[1]
-        epsilon = float(inputs[4])
-
-        return _op.nn.group_norm(
-            data,
-            gamma=gamma,
-            beta=beta,
-            num_groups=num_groups,
-            axis=1,
-            epsilon=epsilon,
-            center=True,
-            scale=True,
-        )
-
-    def transpose(self, inputs, input_types):
-        data = inputs[0]
-
-        import torch
-
-        if isinstance(data, _expr.Expr):
-            ndims = len(self.infer_shape_with_prelude(data))
-        elif isinstance(data, list):
-            ndims = data
-        elif isinstance(data, (torch.Tensor, np.ndarray)):
-            ndims = data.shape
-        else:
-            msg = f"Data type {type(data)} could not be parsed in transpose op"
-            raise AssertionError(msg)
-
-        if isinstance(data, tvm.runtime.NDArray):
-            ndims = len(data.shape)
-        axes = list(range(ndims))
-
-        num_inputs = len(inputs)
-
-        if num_inputs == 1:
-            if ndims >= 2:
-                axes[-1] = ndims - 2
-                axes[-2] = ndims - 1
-            if not isinstance(data, _expr.Expr):
-                data = _expr.const(data)
-
-        elif num_inputs == 3:
-            parse = lambda i: ndims * (i < 0) + i
-            src, dst = [parse(int(inputs[i])) for i in [1, 2]]
-            axes[src] = dst
-            axes[dst] = src
-        else:
-            axes = inputs[1]
-        return _op.transform.transpose(data, axes)
-
-    def numpy_T(self, inputs, input_types):
-        data = inputs[0]
-        shape = self.infer_shape(data)
-        if len(shape) != 2:
-            logger.warning(
-                "The use of Tensor.T on tensors of dimensions != 2 is deprecated"
-                "and will be removed in a future release of PyTorch."
-            )
-        return _op.transform.transpose(data)
-
-    def flatten(self, inputs, input_types):
-        data = inputs[0]
-        start = int(inputs[1])
-        end = int(inputs[2])
-        dshape = get_const_tuple(self.infer_shape_with_prelude(data))
-        ndim = len(dshape)
-        if start < 0:
-            start += ndim
-        if end < 0:
-            end += ndim
-        assert start <= end, "start dim cannot come after end dim"
-        new_shape = [0] * start
-
-        new_shape.append(-1)
-        squeeze_axes = []
-        for i in range(start + 1, end + 1):
-            new_shape.append(1)
-            squeeze_axes.append(i)
-        for _ in range(end + 1, ndim):
-            new_shape.append(0)
-        out = _op.reshape(data, new_shape)
-        if squeeze_axes:
-            out = _op.squeeze(out, axis=squeeze_axes)
-        return out
-
-    def unflatten(self, inputs, input_types):
-        data = inputs[0]
-        dim = int(inputs[1])
-        unflattened_size = tuple(inputs[2])
-        dshape = get_const_tuple(self.infer_shape_with_prelude(data))
-
-        dim = dim if dim >= 0 else len(dshape) + dim
-        assert len(dshape) > dim >= 0
-
-        new_unflattened_size = []
-        for s in unflattened_size:
-            if isinstance(s, _expr.Constant):
-                s = s.data.numpy().item()
-            new_unflattened_size.append(s)
-
-        assert new_unflattened_size.count(-1) <= 1
-
-        mult = np.multiply.reduce(new_unflattened_size)
-        if mult < 0:
-            assert dshape[dim] % mult == 0
-        else:
-            assert dshape[dim] == mult
-
-        new_shape = dshape[:dim] + tuple(new_unflattened_size) + dshape[dim + 1 :]
-        out = _op.reshape(data, new_shape)
-        return out
-
-    def addmm(self, inputs, input_types):
-        input_mat = inputs[0]
-        mat1 = inputs[1]
-        mat2 = inputs[2]
-        beta = inputs[3]
-        alpha = inputs[4]
-        data_type = input_types[1]
-
-        transposed_mat2 = _op.transform.transpose(mat2, axes=[1, 0])
-        units = self.infer_shape(transposed_mat2)[0]
-        dense_out = _op.nn.dense(mat1, transposed_mat2, units=units)
-
-        if not isinstance(alpha, _expr.Expr) and alpha != 1:
-            alpha = _create_typed_const(alpha, data_type)
-            dense_out *= alpha
-
-        if not isinstance(beta, _expr.Expr) and beta != 1:
-            beta = _create_typed_const(beta, data_type)
-            input_mat *= beta
-
-        return dense_out + input_mat
-
-    def size(self, inputs, input_types):
-        shape = self.infer_shape_with_prelude(inputs[0])
-        axis = None
-        if len(inputs) > 1:
-            axis = int(inputs[1])
-
-        if any(map(lambda s: isinstance(s, tvm.tir.expr.Any), shape)):
-            if axis is None or isinstance(shape[axis], tvm.tir.expr.Any):
-                shape_dynamic = _op.shape_of(inputs[0], dtype="int32")
-                if axis is not None:
-                    return _op.take(shape_dynamic, _expr.const(axis), 0)
-                return shape_dynamic
-
-        if axis is not None:
-            return _expr.const(shape[axis])
-        return _expr.const(shape)
-
-    def numtotensor(self, inputs, input_types):
-        val = inputs[0]
-        dtype = input_types[0]
-
-        if isinstance(val, _expr.Expr):
-            return val
-
-        if isinstance(val, tvm.tir.IntImm):
-            val = val.__int__()
-            dtype = int
-
-        arr = val * np.ones([]).astype(dtype)
-        return arr
-
-    def tensortonum(self, inputs, input_types):
-        return inputs[0]
-
-    def view(self, inputs, input_types):
-        data = inputs[0]
-
-        if len(inputs) == 3:
-            shape_inp = [inputs[1], self.infer_shape(inputs[2])[0]]
-        else:
-            if isinstance(inputs[1], list):
-                shape_inp = inputs[1]
-            else:
-                shape_inp = self.infer_shape(inputs[1])
-        new_shape = shape_inp
-        for i, shape in enumerate(shape_inp):
-            if isinstance(shape, _expr.Expr):
-                val = _infer_value_simulated(shape, {})
-                new_shape[i] = val.numpy().item(0)
-
-        return _op.transform.reshape(data, new_shape)
-
-    def view_as(self, inputs, input_types):
-        data = inputs[0]
-        tensors = inputs[1]
-
-        if not isinstance(tensors, (_expr.Call, _expr.Constant, _expr.Var)):
-            msg = f"Data type {type(tensors)} could not be parsed in view_as op"
-            raise AssertionError(msg)
-
-        shape = self.infer_shape(tensors)
-
-        return _op.transform.reshape(data, shape)
-
-    def reshape(self, inputs, input_types):
-        data = inputs[0]
-        new_shape = inputs[1]
-
-        tmp_shape = []
-        is_dyn = False
-        for s in new_shape:
-            if isinstance(s, _expr.Constant):
-                tmp_shape.append(int(s.data.numpy()))
-            elif isinstance(s, _expr.Expr):
-                dim, success = try_infer_value(s, lambda ret: int(ret))
-                tmp_shape.append(dim)
-
-                if not success:
-                    is_dyn = True
-            else:
-                tmp_shape.append(s)
-
-        if is_dyn:
-            new_shape = []
-            for i, s in enumerate(tmp_shape):
-                if not isinstance(s, _expr.Expr):
-                    s = _expr.const(s, "int64")
-                else:
-                    s = _op.cast(s, "int64")
-                new_shape.append(_op.expand_dims(s, axis=0))
-            new_shape = _op.concatenate(new_shape, axis=0)
-        else:
-            new_shape = tmp_shape
-        return _op.transform.reshape(data, new_shape)
-
-    def reshape_as(self, inputs, input_types):
-        data = inputs[0]
-        new_shape = self.infer_shape(inputs[1])
-        return _op.transform.reshape(data, new_shape)
-
-    def pixel_shuffle(self, inputs, input_types):
-        data = inputs[0]
-        upscale_factor = inputs[1]
-        upscale_squared = upscale_factor * upscale_factor
-        b, c, h, w = self.infer_shape(data)
-        assert (
-            c % upscale_squared == 0
-        ), "input channel should be divisible by square of upscale_factor"
-
-        ndims = len(self.infer_shape_with_prelude(data))
-        axes = list(range(ndims))
-        num_inputs = len(inputs)
-        oc = c // upscale_squared
-        oh = h * upscale_factor
-        ow = w * upscale_factor
-
-        new_shape = [b, oc, upscale_factor, upscale_factor, h, w]
-        out_shape = [b, oc, oh, ow]
-
-        data = _op.transform.reshape(data, new_shape)
-        # The data will be transposed to
-        # [b, oc, h, upscale_factor, w, upscale_factor]
-        # for further reshape
-        axes = [0, 1, 4, 2, 5, 3]
-        data = _op.transform.transpose(data, axes)
-        return _op.transform.reshape(data, out_shape)
-
-    def clone(self, inputs, input_types):
-        data = inputs[0]
-        return _op.tensor.copy(data)
-
-    def log_softmax(self, inputs, input_types):
-        data = inputs[0]
-        axis = int(inputs[1])
-        return _op.nn.log_softmax(data, axis)
-
-    def sigmoid(self, inputs, input_types):
-        data = inputs[0]
-
-        def func(x):
-            return _op.tensor.sigmoid(x)
-
-        if self.is_quantized_tensor(data):
-            assert len(inputs) == 5, "Input/Ouput quant param not found in op inputs"
-            return qnn_torch.quantized_sigmoid(inputs)
-
-        return func(data)
-
-    def softplus(self, inputs, input_types):
-        dtype = input_types[0]
-        beta = _expr.const(float(inputs[1]), dtype=dtype)
-        threshold = int(inputs[2]) if inputs[2] else 20
-        threshold_ = _op.full_like(inputs[0], fill_value=_expr.const(threshold))
-        softplus_value = _op.log(_op.exp(inputs[0] * beta) + _expr.const(1.0, dtype=dtype)) / beta
-        return _op.where(_op.greater(inputs[0] * beta, threshold_), inputs[0], softplus_value)
-
-    def make_avg_pool(self, dim):
-        def avg_pool(inputs, input_types):
-            data = inputs[0]
-
-            pool_size = self.convert_const_list(inputs[1])
-            strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size)
-            padding = inputs[3]
-            ceil_mode = int(inputs[4])
-            count_include_pad = int(inputs[5])
-
-            def func(x):
-                if dim == 1:
-                    return _op.nn.avg_pool1d(
-                        x,
-                        pool_size=pool_size,
-                        strides=strides,
-                        padding=padding,
-                        dilation=(1,),
-                        ceil_mode=ceil_mode,
-                        count_include_pad=count_include_pad,
-                    )
-                elif dim == 2:
-                    return _op.nn.avg_pool2d(
-                        x,
-                        pool_size=pool_size,
-                        strides=strides,
-                        padding=padding,
-                        dilation=(1, 1),
-                        ceil_mode=ceil_mode,
-                        count_include_pad=count_include_pad,
-                    )
-                elif dim == 3:
-                    return _op.nn.avg_pool3d(
-                        x,
-                        pool_size=pool_size,
-                        strides=strides,
-                        padding=padding,
-                        dilation=(1, 1, 1),
-                        ceil_mode=ceil_mode,
-                        count_include_pad=count_include_pad,
-                    )
-                else:
-                    msg = "Average Pooling dimension should be between 1 and 3"
-                    raise RuntimeError(msg)
-
-            if self.is_quantized_tensor(data):
-                return qnn_torch.apply_with_upcast(data, func)
-
-            return func(data)
-
-        return avg_pool
-
-    def linear(self, inputs, input_types):
-        # https://pytorch.org/docs/stable/nn.functional.html#linear
-        # 0 - input
-        # 1 - weight
-        bias = inputs[2]
-        a_shape = self.infer_shape_with_prelude(inputs[0])
-        b_shape = self.infer_shape_with_prelude(inputs[1])
-        if len(a_shape) == 2 and len(b_shape) == 2:
-            mm_out = _op.nn.dense(inputs[0], inputs[1])
-        elif len(b_shape) == 1:
-            mm_out = self.matmul([inputs[0], inputs[1]], input_types[:2])
-        else:
-            mm_out = self.matmul(
-                [inputs[0], _op.transpose(inputs[1], axes=(1, 0))], input_types[:2]
-            )
-        if isinstance(bias, _expr.Expr):
-            bias_ndims = len(self.infer_shape_with_prelude(bias))
-            if bias_ndims == 1:
-                return _op.nn.bias_add(mm_out, bias, axis=-1)
-            mm_dtype = self.infer_type_with_prelude(mm_out).dtype
-            return self.add([mm_out, bias], [mm_dtype, input_types[2]])
-        return mm_out
-
-    def dropout(self, inputs, input_types):
-        data = inputs[0]
-        rate = float(inputs[1])
-
-        return _op.nn.dropout(data, rate)
-
-    def make_reduce(self, name):
-        def reduce(inputs, input_types):
-            data = inputs[0]
-            axis = None
-            keepdims = False
-
-            if len(inputs) > 2:  # default, torch have only data, axis=None, keepdims=False
-                if isinstance(inputs[1], int):
-                    axis = int(inputs[1])
-                elif _is_int_seq(inputs[1]):
-                    axis = inputs[1]
-                else:
-                    axis = list(self.infer_shape(inputs[1]))
-                keepdims = bool(inputs[2])
-
-            return get_relay_op(name)(data, axis=axis, keepdims=keepdims)
-
-        return reduce
-
-    def norm(self, inputs, input_types):
-        data = inputs[0]
-        dtype = input_types[0]
-        axis = None
-        keepdims = False
-        if len(inputs) > 3:
-            axis = inputs[2]
-            keepdims = bool(inputs[3])
-
-        order = inputs[1]
-        if order == np.inf:
-            return _op.reduce.max(_op.abs(data), axis=axis, keepdims=keepdims)
-        elif order == -np.inf:
-            return _op.reduce.min(_op.abs(data), axis=axis, keepdims=keepdims)
-        else:
-            reci_order = _expr.const(1.0 / order, dtype=dtype)
-            order = _expr.const(order)
-            return _op.power(
-                _op.reduce.sum(_op.power(_op.abs(data), order), axis=axis, keepdims=keepdims),
-                reci_order,
-            )
-
-    def frobenius_norm(self, inputs, input_types):
-        data = inputs[0]
-        axis = None
-        keepdims = False
-        if len(inputs) > 2:
-            axis = inputs[1] if len(inputs[1]) > 0 else None
-            keepdims = bool(inputs[2])
-
-        return _op.sqrt(_op.reduce.sum((data * data), axis=axis, keepdims=keepdims))
-
-    def std(self, inputs, input_types):
-        data = inputs[0]
-        if len(inputs) == 2:
-            axis = None
-            keepdims = False
-            unbiased = bool(inputs[1])
-        else:
-            axis = inputs[1]
-            keepdims = bool(inputs[3])
-            unbiased = bool(inputs[2])
-
-        return _op.reduce.std(data, axis=axis, keepdims=keepdims, unbiased=unbiased)
-
-    def variance(self, inputs, input_types):
-        data = inputs[0]
-        if len(inputs) == 2:
-            axis = None
-            keepdims = False
-            unbiased = bool(inputs[1])
-        else:
-            axis = inputs[1]
-            keepdims = bool(inputs[3])
-            unbiased = bool(inputs[2])
-
-        return _op.reduce.variance(data, axis=axis, keepdims=keepdims, unbiased=unbiased)
-
-    def mean(self, inputs, input_types):
-        data = inputs[0]
-
-        if inputs[1]:
-            axis = inputs[1]
-        else:
-            axis = None
-
-        if len(inputs) > 2 and inputs[2]:
-            keepdims = int(inputs[2])
-        else:
-            keepdims = False
-        if len(inputs) > 3 and inputs[3]:
-            exclude = int(inputs[3])
-        else:
-            exclude = False
-
-        def func(x):
-            return _op.mean(x, axis, keepdims, exclude)
-
-        if self.is_quantized_tensor(data):
-            assert len(inputs) == 6, "Input quant param not found in op inputs"
-            input_scale = _expr.const(inputs[4])
-            input_zero_point = _expr.const(inputs[5])
-            # refer to aten/src/ATen/native/quantized/cpu/qreduction.cpp
-            return qnn_torch.apply_with_fp32_fallback(data, input_scale, input_zero_point, func)
-
-        return func(data)
-
-    def var_mean(self, inputs, input_types):
-        data = inputs[0]
-        if len(inputs) == 2:
-            axis = None
-            keepdims = False
-            unbiased = bool(inputs[1])
-        else:
-            axis = inputs[1]
-            keepdims = bool(inputs[3])
-            unbiased = bool(inputs[2])
-
-        m, v = _op.reduce.mean_variance(data, axis, keepdims, False, unbiased)
-        return v, m
-
-    def chunk(self, inputs, input_types):
-        data = inputs[0]
-
-        num_chunks = int(inputs[1])
-        axis = int(inputs[2])
-
-        if isinstance(data, _expr.Expr):
-            inferred_shape = self.infer_shape_with_prelude(data)
-
-        shape = []
-        for infer in inferred_shape:
-            shape.append(infer)
-
-        dim = int(shape[axis])
-
-        if dim % num_chunks:
-            unif_size = int(dim / (num_chunks - 1))
-        else:
-            unif_size = int(dim / num_chunks)
-
-        indeces = []
-        for i in range(unif_size, dim, unif_size):
-            indeces.append(i)
-
-        return _op.split(data, indeces, axis)
-
-    def baddbmm(self, inputs, _):
-        input = inputs[0]
-        batch1, batch2 = inputs[1:3]
-        beta = _expr.const(float(inputs[3]))
-        alpha = _expr.const(float(inputs[4]))
-        return beta * input + alpha * _op.nn.batch_matmul(batch1, batch2, transpose_b=False)
-
-    def matmul(self, inputs, input_types):
-        assert len(inputs) == 2, "Two tensors to be multiplied are expected."
-
-        a = inputs[0]
-        b = inputs[1]
-
-        # Need to check input shape as batch matmul must be supported.
-        a_shape = self.infer_shape_with_prelude(a)
-        b_shape = self.infer_shape_with_prelude(b)
-
-        a_ndims = len(a_shape)
-        b_ndims = len(b_shape)
-
-        # Check if both tensors are at least 1D.
-        if a_ndims == 0 or b_ndims == 0:
-            msg = "Both arguments to matmul must be at least 1D."
-            raise AssertionError(msg)
-
-        # Check if tensors can be multiplied.
-        b_mulaxis = b_shape[-2] if b_ndims > 1 else b_shape[0]
-        if a_shape[-1] != b_mulaxis:
-            msg = "Tensors being multiplied do not have compatible shapes."
-            raise AssertionError(msg)
-
-        # If 1D, remember axis that should be deleted at the end
-        squeeze_dims = []
-        if a_ndims == 1:
-            a = _op.expand_dims(a, axis=0)
-            squeeze_dims += [-2]
-            a_ndims = 2
-            a_shape = (1,) + a_shape
-
-        if b_ndims == 1:
-            b = _op.expand_dims(b, axis=1)
-            squeeze_dims += [-1]
-            b_ndims = 2
-            b_shape = b_shape + (1,)
-
-        # Compute result
-        if a_ndims == 2 and b_ndims == 2:
-            # Result is obtained using matmul
-            out = _op.nn.dense(a, _op.transpose(b))
-        else:
-            # Result is obtained using batch_matmul
-            batch_shape = [1] * (max(a_ndims, b_ndims) - 2)
-
-            for i, j in enumerate(reversed(a_shape[:-2])):
-                batch_shape[i] = j
-
-            for i, j in enumerate(reversed(b_shape[:-2])):
-                # Need to check if axis can be broadcasted
-                if batch_shape[i] == 1 or j == 1 or batch_shape[i] == j:
-                    batch_shape[i] = max(batch_shape[i], j)
-                else:
-                    msg = "Batch dimensions are not broadcastable."
-                    raise AssertionError(msg)
-
-            batch_shape = batch_shape[::-1]
-
-            a = _op.broadcast_to(a, batch_shape + list(a_shape[-2:]))
-            b = _op.broadcast_to(b, batch_shape + list(b_shape[-2:]))
-
-            out = _op.nn.batch_matmul(
-                _op.reshape(a, [-1, *a_shape[-2:]]),
-                _op.reshape(b, [-1, *b_shape[-2:]]),
-                transpose_b=False,
-            )
-
-            out_shape = batch_shape + [a_shape[-2]] + [b_shape[-1]]
-            out = _op.reshape(out, out_shape)
-
-        return _op.squeeze(out, axis=squeeze_dims)
-
-    def expand(self, inputs, input_types):
-        data_in = inputs[0]
-        shape = list(self.infer_shape(data_in))
-
-        ndims = len(shape)
-        sizes = inputs[1]
-        out = data_in
-
-        out_dims = len(sizes)
-        if ndims < out_dims:
-            num_newaxis = out_dims - ndims
-            out = _op.expand_dims(out, axis=0, num_newaxis=num_newaxis)
-            shape = [1] * num_newaxis + shape
-
-        for i in range(out_dims):
-            if sizes[i] != -1 and shape[i] == 1:
-                if not isinstance(sizes[i], int):
-                    sizes[i] = int(_infer_value(sizes[i], {}).numpy())
-                out = _op.repeat(out, sizes[i], axis=i)
-
-        return out
-
-    def int(self, inputs, input_types):
-        if isinstance(inputs[0], _expr.Expr):
-            return inputs[0]
-        return int(inputs[0])
-
-    def identity(self, inputs, input_types):
-        return inputs[0]
-
-    def none(self, inputs, input_types):
-        return None
-
-    def pad_common(self, mode, pad_value, inputs, input_types):
-        data = inputs[0]
-        if isinstance(inputs[1], list):
-            pad_list = inputs[1]
-        else:
-            pad_list = list(self.infer_shape(inputs[1]))
-
-        # initialize paddings based on input len
-        pad_len = len(self.infer_shape(data)) * 2
-        paddings = [0] * pad_len
-
-        if len(pad_list) >= 2:
-            paddings[-1] = pad_list[1]
-            paddings[-2] = pad_list[0]
-        if len(pad_list) >= 4:
-            paddings[-3] = pad_list[3]
-            paddings[-4] = pad_list[2]
-        if len(pad_list) >= 6:
-            paddings[-5] = pad_list[5]
-            paddings[-6] = pad_list[4]
-
-        # group into tuple of 2 ints
-        paddings = [paddings[i : i + 2] for i in range(0, len(paddings), 2)]
-
-        const_paddings = []
-        non_zero_found = False
-        for pad in paddings:
-            const_paddings.append([])
-            for p in pad:
-                if isinstance(p, _expr.Expr):
-                    p = int(_infer_value(p, {}).numpy())
-                elif not isinstance(p, int):
-                    raise NotImplementedError("pad width should be int/expr")
-                const_paddings[-1].append(p)
-                if p != 0:
-                    non_zero_found = True
-
-        if not non_zero_found:
-            return data
-        elif mode == "constant":
-            return _op.nn.pad(data, const_paddings, pad_value=pad_value, pad_mode=mode)
-        else:
-            return _op.nn.pad(data, const_paddings, pad_mode=mode)
-
-    def pad(self, inputs, input_types):
-        # mode: Optional default "constant"
-        if len(inputs) > 2 and inputs[2] is not None:
-            mode = inputs[2]
-        else:
-            mode = "constant"
-
-        # pad_value: Optional default 0
-        if len(inputs) == 4 and inputs[3] is not None:
-            pad_value = inputs[3]
-        else:
-            pad_value = 0
-
-        # replicate is edge in TVM's padding mode
-        if mode == "replicate":
-            mode = "edge"
-        elif mode == "circular":
-            raise ValueError("circular mode for torch.nn.functional.pad are not supported in TVM")
-        return self.pad_common(mode, pad_value, inputs, input_types)
-
-    def constant_pad_nd(self, inputs, input_types):
-        return self.pad_common("constant", _expr.const(inputs[2]), inputs, input_types)
-
-    def reflection_pad1d(self, inputs, input_types):
-        return self.pad_common("reflect", 0, inputs, input_types)
-
-    def reflection_pad2d(self, inputs, input_types):
-        return self.pad_common("reflect", 0, inputs, input_types)
-
-    def replication_pad1d(self, inputs, input_types):
-        return self.pad_common("edge", 0, inputs, input_types)
-
-    def replication_pad2d(self, inputs, input_types):
-        return self.pad_common("edge", 0, inputs, input_types)
-
-    def replication_pad3d(self, inputs, input_types):
-        return self.pad_common("edge", 0, inputs, input_types)
-
-    def clamp_common(self, data, min=None, max=None):
-        def get_v(v, default_v):
-            if isinstance(v, _expr.Constant):
-                return float(v.data.numpy())
-            if isinstance(v, _expr.Expr):
-                infer_v, success = try_infer_value(v, lambda ret: float(ret))
-                if success:
-                    return infer_v
-            if v is not None:
-                return v
-            return default_v
-
-        dtype = self.infer_type(data).dtype
-
-        type_info = np.finfo(dtype) if "float" in dtype else np.iinfo(dtype)
-
-        # TODO(masahi): Properly handle inf in a one-way clamp case.
-        if min is not None and max is not None:
-            amin = get_v(min, type_info.min)
-            amax = get_v(max, type_info.max)
-        elif min is not None:
-            amin = get_v(min, type_info.min)
-            amax = type_info.max
-        else:
-            amin = type_info.min
-            amax = get_v(max, type_info.max)
-
-        return _op.clip(data, amin, amax)
-
-    def clamp(self, inputs, _):
-        return self.clamp_common(inputs[0], min=inputs[1], max=inputs[2])
-
-    def clamp_min(self, inputs, input_types):
-        return self.clamp_common(inputs[0], min=inputs[1])
-
-    def clamp_max(self, inputs, input_types):
-        return self.clamp_common(inputs[0], max=inputs[1])
-
-    def to(self, inputs, input_types):
-        data = inputs[0]
-        dtype = inputs[1] if inputs[1] is not None and not isinstance(inputs[1], str) else inputs[2]
-        # special handling for aten::to(data, 6, _, _, _) case
-        # 6 means dtype = float
-        # this happens when converting upsampling with scale factor
-        cast_map = {5: "float16", 6: "float32", 7: "float64", 3: "int32", 4: "int64"}
-
-        cast_func = {5: float, 6: float, 7: float, 3: int, 4: int}
-
-        ret = data
-        if isinstance(data, _expr.Expr):
-            actual_dtype = str(self.infer_type(data).dtype)
-            if dtype in cast_map and cast_map[dtype] != actual_dtype:
-                ret = _op.cast(data, cast_map[dtype])
-        elif dtype in cast_map:
-            ret = cast_func[dtype](data)
-
-        return ret
-
-    def get_upsample_out_size(self, inputs, method):
-        # This assumes a static shape
-        out_size = []
-        if inputs[1] is not None:
-            for size in inputs[1]:
-                if not isinstance(size, int):
-                    out_size.append(int(_infer_value(size, {}).numpy()))
-                else:
-                    out_size.append(size)
-        else:
-            scale_index = 3 if method != "nearest_neighbor" else 2
-            scales = inputs[scale_index]
-            assert scales is not None, "neither out size nor scale provided"
-            assert isinstance(scales, list)
-            ishape = self.infer_shape(inputs[0])
-            for i, scale in enumerate(scales):
-                out_size.append(int(math.floor(float(ishape[2 + i]) * scale)))
-
-        return out_size
-
-    def make_upsample(self, method):
-        def upsample(inputs, input_types):
-            data = inputs[0]
-            out_size = self.get_upsample_out_size(inputs, method)
-
-            if len(inputs) > 2 and method != "nearest_neighbor":
-                align_corners = inputs[2]
-            else:
-                align_corners = False
-
-            if method == "nearest_neighbor":
-                coord_trans = "asymmetric"
-            elif align_corners:
-                coord_trans = "align_corners"
-            else:
-                coord_trans = "half_pixel"
-
-            def func(x):
-                return _op.image.resize2d(
-                    x, out_size, None, "NCHW", method, coord_trans, cubic_alpha=-0.75
-                )
-
-            if self.is_quantized_tensor(data):
-                # input qparams are manually appended by us
-                assert isinstance(inputs[-2], float)
-                assert isinstance(inputs[-1], int)
-                input_scale = _expr.const(inputs[-2])
-                input_zero_point = _expr.const(inputs[-1])
-                # currently piggy backs to fp32, it gets identical output as torch
-                return qnn_torch.apply_with_fp32_fallback(data, input_scale, input_zero_point, func)
-
-            return func(data)
-
-        return upsample
-
-    def make_upsample3d(self, method):
-        def upsample3d(inputs, input_types):
-            data = inputs[0]
-            out_size = self.get_upsample_out_size(inputs, method)
-
-            if len(inputs) > 2 and method == "linear":
-                align_corners = inputs[2]
-            else:
-                align_corners = False
-
-            if method == "nearest_neighbor":
-                coord_trans = "asymmetric"
-            elif align_corners:
-                coord_trans = "align_corners"
-            else:
-                coord_trans = "half_pixel"
-
-            return _op.image.resize3d(data, out_size, None, "NCDHW", method, coord_trans)
-
-        return upsample3d
-
-    def expand_as(self, inputs, input_types):
-        target = inputs[1]
-        t0 = self.infer_type(inputs[0]).dtype
-        t1 = self.infer_type(inputs[1]).dtype
-        if str(t0) != str(t1):
-            target = _op.cast(target, t0)
-        return _op.broadcast_to_like(inputs[0], target)
-
-    def broadcast_tensors(self, inputs, input_types):
-        tensor_list = inputs[0]
-        import torch
-
-        infer_shape_value = [self.infer_shape(t) for t in tensor_list]
-        # "torch.broadcast_shapes" is available after PyTorch 1.8.0
-        if hasattr(torch, "broadcast_shapes"):
-            res_shape = list(torch.broadcast_shapes(*infer_shape_value))
-        else:
-            res_shape = list(torch.broadcast_tensors(*map(torch.empty, infer_shape_value))[0].shape)
-        return [_op.broadcast_to(tensor, res_shape) for tensor in tensor_list]
-
-    def broadcast_to(self, inputs, input_types):
-        tensor = inputs[0]
-        new_shape = inputs[1]
-        import torch
-
-        if not isinstance(new_shape, (list, tuple, torch.Size)):
-            msg = f"Data type {type(new_shape)} could not be parsed in broadcast_to op"
-            raise AssertionError(msg)
-
-        for i, dim in enumerate(new_shape):
-            if not isinstance(dim, int):
-                new_shape[i] = int(_infer_value(dim, {}).numpy())
-
-        return _op.broadcast_to(tensor, new_shape)
-
-    def Bool(self, inputs, input_types):
-        assert len(inputs) == 1
-        return inputs[0]
-
-    def Float(self, inputs, input_types):
-        assert len(inputs) == 1
-        return _op.cast(inputs[0], "float32")
-
-    def bitwise_not(self, inputs, input_types):
-        data = inputs[0]
-        # The input tensor must be of integral or Boolean types.
-        # For bool tensors, it computes the logical NOT
-        if input_types[0] == "bool":
-            out = _op.logical_not(_op.cast(data, "bool"))
-        else:
-            out = _op.bitwise_not(_op.cast(data, "int"))
-
-        return out
-
-    def bitwise_xor(self, inputs, input_types):
-        lhs = inputs[0]
-        rhs = inputs[1]
-        lhs = _op.cast(lhs, "bool") if input_types[0] == "bool" else _op.cast(lhs, "int")
-        rhs = _op.cast(rhs, "bool") if input_types[1] == "bool" else _op.cast(rhs, "int")
-
-        return _op.bitwise_xor(lhs, rhs)
-
-    def bitwise_and(self, inputs, input_types):
-        lhs = inputs[0]
-        rhs = inputs[1]
-        lhs = _op.cast(lhs, "bool") if input_types[0] == "bool" else _op.cast(lhs, "int")
-        rhs = _op.cast(rhs, "bool") if input_types[1] == "bool" else _op.cast(rhs, "int")
-
-        return _op.bitwise_and(lhs, rhs)
-
-    def logical_not(self, inputs, input_types):
-        data = _wrap_const(inputs[0])
-        return _op.logical_not(_op.cast(data, "bool"))
-
-    def logical_xor(self, inputs, input_types):
-        lhs = _op.cast(inputs[0], "bool")
-        rhs = _op.cast(inputs[1], "bool")
-
-        return _op.logical_xor(lhs, rhs)
-
-    def list_getitem(self, inputs, input_types):
-        return self.prelude.nth(inputs[0], _wrap_const(inputs[1]))
-
-    def list_len(self, inputs, input_types):
-        return self.prelude.length(inputs[0])
-
-    def type_as(self, inputs, input_types):
-        assert len(inputs) == 2
-        assert len(input_types) == 2
-        return _op.cast(inputs[0], input_types[1])
-
-    def gather(self, inputs, input_types):
-        data = inputs[0]
-        axis = inputs[1]
-        indices = inputs[2]
-
-        return _op.gather(data, axis, indices)
-
-    def add(self, inputs, input_types):
-        # add_ is overloaded for tensor add and list concat
-        if input_types[0] == "ListType":
-            return self.prelude.concat(inputs[0], inputs[1])
-        return self.make_elemwise("add")(inputs, input_types)
-
-    def tensor_array_stack(self, inputs, input_types):
-        dim = inputs[1]
-        assert dim == 0, "stacking on a dynamic tensor list only supported on a first axis"
-        tensor_array, shape = self.convert_to_tensor_array(inputs[0])
-
-        stacked_shape = (Any(),) + shape
-        stack = self.prelude.get_global_var_static("tensor_array_stack", "float32", shape)
-        stacked = stack(tensor_array)
-
-        static_tensor_array_ops = StaticTensorArrayOps(self.prelude, "float32", stacked_shape)
-        static_tensor_array_ops.register()
-        get_tensor = self.prelude.get_global_var_static("tensor_get_data", "float32", stacked_shape)
-        return get_tensor(stacked)
-
-    def stack(self, inputs, input_types):
-        if isinstance(inputs[0], list):
-            # a static python list of tensors
-            dim = inputs[1]
-            return _op.stack(inputs[0], dim)
-        else:
-            # List ADT case
-            assert isinstance(inputs[0], _expr.Expr)
-            ty = self.infer_type_with_prelude(inputs[0])
-            list_ty = self.prelude.mod.get_global_type_var("List")
-            msg = "The input list is expected to be List ADT"
-            assert isinstance(ty, tvm.ir.TypeCall) and ty.func == list_ty, msg
-            return self.tensor_array_stack(inputs, input_types)
-
-    def sub(self, inputs, input_types):
-        if len(inputs) == 3:
-            data0, data1, alpha = self.pytorch_promote_types(inputs, input_types)
-            return get_relay_op("subtract")(data0, alpha * data1)
-        else:
-            data0, data1 = self.pytorch_promote_types(inputs, input_types)
-            return get_relay_op("subtract")(data0, data1)
-
-    def rsub(self, inputs, input_types):
-        data0, data1, alpha = self.pytorch_promote_types(inputs, input_types)
-
-        # note: rsub means data0 and data1 swap places
-        return get_relay_op("subtract")(data1, alpha * data0)
-
-    def embedding(self, inputs, input_types):
-        weight = inputs[0]
-        indices = inputs[1]
-
-        return _op.take(weight, indices.astype("int32"), axis=0)
-
-    def one_hot(self, inputs, input_types):
-        indices = inputs[0].astype("int32")
-        num_classes = inputs[1]
-        if num_classes == -1:
-            msg = "Inferring the number of classes is not yet supported."
-            raise NotImplementedError(msg)
-
-        dtype = "int32"
-        on_value = tvm.relay.const(1.0, dtype)
-        off_value = tvm.relay.const(0.0, dtype)
-
-        return _op.one_hot(indices, on_value, off_value, num_classes, -1, dtype)
-
-    def index(self, inputs, input_types):
-        data = inputs[0]
-        data_shape = self.infer_type(data).shape
-
-        axes_adv_idx = [i for i, v in enumerate(inputs[1]) if v is not None]
-        axes_rest = [i for i in range(len(data_shape)) if i not in axes_adv_idx]
-
-        # check if the adv_index axes are consecutive
-        # if consecutive, result must be transposed again at the end
-        consecutive = True
-        for curr, nxt in zip(axes_adv_idx[:-1], axes_adv_idx[1:]):
-            if nxt - curr != 1:
-                consecutive = False
-                break
-
-        indices_list = []
-        axes_order = axes_adv_idx + axes_rest
-
-        for i in axes_adv_idx:
-            inp = inputs[1][i]
-            if self.infer_type(inp).dtype == "bool":
-                # adv_index does not support a mask as the index tensor (it will treat 0/1 as
-                # an index rather than a flag).
-                # So we use argwhere to turn the mask into indices, which will also take care
-                # of the dynamism in the indexing by mask.
-                indices_list.append(_op.squeeze(_op.transform.argwhere(inp), axis=[1]))
-            else:
-                indices_list.append(inp)
-
-        data_after_adv_index = _op.adv_index([_op.transpose(data, axes=axes_order)] + indices_list)
-
-        if consecutive:
-            num_dims = len(self.infer_type(data_after_adv_index).shape)
-            num_new_dims = num_dims - len(axes_rest)
-
-            axes_final_order = list(range(num_dims))
-            axes_final_order = (
-                axes_final_order[num_new_dims : num_new_dims + axes_adv_idx[0]]
-                + axes_final_order[:num_new_dims]
-                + axes_final_order[num_new_dims + axes_adv_idx[0] :]
-            )
-
-            return _op.transpose(data_after_adv_index, axes=axes_final_order)
-        else:
-            return data_after_adv_index
-
-    def meshgrid(self, inputs, input_types):
-        data = inputs[0]
-        return _op.meshgrid(data, indexing="ij")
-
-    def nms(self, inputs, input_types):
-        boxes = inputs[0]
-        scores = inputs[1]
-        iou_threshold = inputs[2]
-
-        # TVM NMS assumes score > 0
-        # - since there exists multi-comsumers for "scores", "num_boxes"
-        # - invoke set_span here to prevent expr-rewritten occurrs in span-filling stage
-        source_name = self.source_map[self.current_op[-1]]
-        scores = set_span(scores - _op.min(scores) + _op.const(1.0), source_name)
-
-        num_boxes = set_span(_op.shape_of(scores), source_name)
-        # PyTorch NMS doesn't have score_threshold, so no need to run get_valid_count
-        # - since "arange" op will fill expr into its attribute
-        # - invoke set_span here to prevent expr-rewritten occurrs in span-filling stage
-        indices = _op.transform.arange(set_span(_op.squeeze(num_boxes), source_name), dtype="int32")
-        indices = _op.expand_dims(indices, 0, 1)
-
-        # Generate data with shape (1, num_anchors, 5)
-        scores = AttrCvt(op_name="expand_dims", extras={"axis": -1, "num_newaxis": 1})([scores], {})
-        data = _op.concatenate([scores, boxes], -1)
-        data = _op.expand_dims(data, 0, 1)
-
-        # Perform Non-Maximum Suppression,
-        # PyTorch NMS doesn't have parameter top_k and max_output_size
-        score_index = 0
-        top_k = max_out_size = -1
-        nms_ret = get_relay_op("non_max_suppression")(
-            data=data,
-            valid_count=num_boxes,
-            indices=indices,
-            max_output_size=max_out_size,
-            iou_threshold=iou_threshold,
-            force_suppress=True,
-            top_k=top_k,
-            coord_start=1,
-            score_index=score_index,
-            id_index=-1,
-            return_indices=True,
-            invalid_to_bottom=False,
-        )
-
-        # squeeze the two outputs of nms for strided_slice
-        size = get_relay_op("squeeze")(nms_ret[1], axis=[1])
-        data_slice = get_relay_op("squeeze")(nms_ret[0], axis=[0])
-
-        # strided slice to get the dynamic result
-        ret = get_relay_op("strided_slice")(
-            data_slice, begin=_expr.const([0]), end=size, slice_mode="size"
-        )
-        # in torchvision, indices from nms are int64
-        return _op.cast(ret, "int64")
-
-    def logsumexp(self, inputs, input_types):
-        data = self.pytorch_promote_types(inputs[:1], input_types[:1])
-        dim_list = inputs[1]
-        keepdim = inputs[2] if len(inputs) > 2 else False
-        # dim is output of prim::ListConstruct, even if it is int in python code
-        assert isinstance(dim_list, list), "dim is expected to be a list"
-        return _op.logsumexp(data[0], axis=dim_list, keepdims=keepdim)
-
-    def roi_align(self, inputs, input_types):
-        data = inputs[0]
-        boxes = inputs[1]
-
-        output_size = (inputs[3], inputs[4])
-        spatial_scale = inputs[2]
-        sample_ratio = inputs[5]
-        aligned = False if len(inputs) < 7 else inputs[6]
-
-        if aligned:
-            boxes -= _expr.const(0.5 / spatial_scale)
-
-        return _op.vision.roi_align(data, boxes, output_size, spatial_scale, sample_ratio)
-
-    def deform_conv2d(self, inputs, input_types):
-        data = inputs[0]
-        weight = inputs[1]
-        offset = inputs[2]
-
-        if len(inputs) > 12:
-            strides_offset = 5
-            bias = inputs[4]
-            logger.warning("mask argument in deformable conv2d is not supported and ignored")
-        else:
-            strides_offset = 4
-            bias = inputs[3]
-
-        strides = (inputs[strides_offset], inputs[strides_offset + 1])
-        padding = (inputs[strides_offset + 2], inputs[strides_offset + 3])
-        dilation = (inputs[strides_offset + 4], inputs[strides_offset + 5])
-        groups = inputs[strides_offset + 6]
-        deformable_groups = inputs[strides_offset + 7]
-        weight_shape = self.infer_shape(weight)
-        output_channels = weight_shape[0]
-        kernel_size = (weight_shape[2], weight_shape[3])
-
-        conv_out = _op.nn.deformable_conv2d(
-            data,
-            offset,
-            weight,
-            strides,
-            padding,
-            dilation,
-            deformable_groups,
-            groups,
-            output_channels,
-            kernel_size,
-        )
-
-        return _op.nn.bias_add(conv_out, bias)
-
-    def stft(self, inputs, input_types):
-        data = inputs[0]
-        n_fft = inputs[1]
-        hop_length = inputs[2]
-        win_length = inputs[3]
-        window = inputs[4]
-        normalized = inputs[5]
-        onesided = inputs[6]
-
-        return _op.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
-
-    def unbind(self, inputs, input_types):
-        data = inputs[0]
-        axis = int(inputs[1])
-        return unbind(data, axis)
-
-    def shape_as_tensor(self, inputs, input_types):
-        is_symbolic_shape = False
-        input_shape = self.infer_shape(inputs[0], self.prelude.mod)
-        for axis in input_shape:
-            if not isinstance(axis, (int, tvm.tir.IntImm)):
-                is_symbolic_shape = True
-                break
-
-        if is_symbolic_shape:
-            ret = _op.shape_of(inputs[0], dtype="int64")
-        else:
-            ret = _expr.const(np.array(input_shape), dtype="int64")
-
-        return ret
-
-    def logical_and(self, inputs, input_types):
-        lhs = _op.cast(inputs[0], "bool")
-        rhs = _op.cast(inputs[1], "bool")
-
-        return _op.logical_and(lhs, rhs)
-
-    def logical_or(self, inputs, input_types):
-        lhs = _op.cast(inputs[0], "bool")
-        rhs = _op.cast(inputs[1], "bool")
-
-        return _op.logical_or(lhs, rhs)
-
-    def nonzero(self, inputs, input_types, is_numpy_style=False):
-        data = inputs[0]
-        ret = _op.transform.argwhere(data)
-        if is_numpy_style or (len(inputs) > 1 and inputs[1]):
-            return unbind(ret, 1)
-        return ret
-
-    def nonzero_numpy(self, inputs, input_types):
-        return self.nonzero(inputs, input_types, is_numpy_style=True)
-
-    def scatter(self, inputs, input_types):
-        assert len(inputs) == 4 or len(inputs) == 5, (
-            f"scatter takes 4 or 5 inputs: data, dim, index, src, reduce (optional), "
-            f"but {len(inputs)} given"
-        )
-        data = inputs[0]
-        axis = int(inputs[1])
-        index = inputs[2]
-        src = inputs[3]
-        if len(inputs) == 5:
-            reduce = inputs[4]
-        else:
-            reduce = "update"
-
-        data_shape = self.infer_shape(data)
-        data_rank = len(data_shape)
-        index_shape = self.infer_shape(index)
-        index_rank = len(index_shape)
-        # When index is empty, the operation returns data unchanged
-        if self.is_empty_shape(index_shape):
-            return data
-
-        if np.isscalar(src):
-            assert self.infer_type(src).dtype == "float", "Scalar source can be float only"
-            src = _op.broadcast_to_like(src, data_shape)
-            src_shape = data_shape
-        else:
-            src_shape = self.infer_shape(src)
-        src_rank = len(src_shape)
-        assert data_rank == index_rank, "Index rank is not the same as data rank"
-        assert data_rank == src_rank, "Src rank is not the same as data rank"
-
-        assert 0 <= axis < data_rank, "Dim is out of bounds"
-
-        for i in range(data_rank):
-            index_dim = index_shape[i]
-            src_dim = src_shape[i]
-            data_dim = data_shape[i]
-            # Skip check for dynamic dimensions
-            if not any([isinstance(index_dim, tvm.tir.Any), isinstance(src_dim, tvm.tir.Any)]):
-                assert index_dim <= src_dim, "Index dim size should be less than src one"
-            if i != axis and not any(
-                [isinstance(index_dim, tvm.tir.Any), isinstance(data_dim, tvm.tir.Any)]
-            ):
-                assert index_dim <= data_dim, "Index dim size should be less than data one"
-
-        if reduce is None:
-            reduce = "update"
-        elif reduce == "multiply":
-            reduce = "mul"
-        assert reduce in [
-            "update",
-            "add",
-            "mul",
-        ], 'reduce arg is expected from "add", "multiply" or None'
-
-        return _op.scatter_elements(data, index, src, axis, reduce)
-
-    def index_put(self, inputs, input_types):
-        in_tensor = inputs[0]
-        indices = inputs[1]
-        values = inputs[2]
-        accumulate = inputs[3]
-        if not accumulate:
-            mode = "update"
-        else:
-            mode = "add"
-        # Combine array of index tensors into one index tensor with shape (N,_)
-        index_tensor = _op.stack(indices, axis=0)
-        return _op.scatter_nd(in_tensor, index_tensor, values, mode)
-
-    def scalar_tensor(self, inputs, input_types):
-        data = inputs[0]
-        cast_map = {6: "float32", 7: "float64", 3: "int32", 4: "int64"}
-        type_key = inputs[1]
-        if isinstance(data, _expr.Constant):
-            data = data.data.numpy().tolist()
-        return _expr.const(data, cast_map[type_key])
-
-    def interpolate(self, inputs, input_types):
-        if isinstance(inputs[1], _expr.Expr):
-            out_size = inputs[1]
-        elif isinstance(inputs[1], list):
-            out_size = []
-            for i in [0, 1]:
-                size, _ = try_infer_value(
-                    inputs[1][i],
-                    lambda ret: ret.astype(int),
-                    lambda: _op.expand_dims(inputs[1][i], axis=0),
-                )
-                out_size.append(size)
-            out_size = _op.concatenate(out_size, axis=0)
-
-        data = inputs[0]
-        align_corners = inputs[4]
-        method = inputs[3]
-        if method.startswith("nearest"):
-            method = "nearest_neighbor"
-        elif method[0:2] == "bi":
-            method = method[2:]
-
-        if method == "nearest_neighbor":
-            coord_trans = "asymmetric"
-        elif align_corners:
-            coord_trans = "align_corners"
-        else:
-            coord_trans = "half_pixel"
-
-        return _op.image.resize2d(
-            data, out_size, None, "NCHW", method, coord_trans, cubic_alpha=-0.75
-        )
-
-    def numel(self, inputs, input_types):
-        return _op.ndarray_size(inputs[0])
-
-    def empty(self, inputs, input_types):
-        shape = []
-        for s in inputs[0]:
-            if isinstance(s, _expr.Constant):
-                shape.append(s.data.numpy().item())
-            else:
-                assert isinstance(s, int)
-                shape.append(s)
-
-        return _op.zeros(shape, _convert_dtype_value(inputs[1]))
-
-    def empty_like(self, inputs, input_types):
-        shape = self.infer_shape(inputs[0])
-        if inputs[1] is not None:
-            dtype = _convert_dtype_value(inputs[1])
-        else:
-            dtype = input_types[0]
-        return _op.zeros(shape, dtype)
-
-    def new_empty(self, inputs, input_types):
-        size = inputs[1]
-
-        import torch
-
-        if not isinstance(size, (_expr.Expr, list, tuple, torch.Size, np.ndarray)):
-            msg = f"Data type {type(size)} could not be parsed in empty op"
-            raise AssertionError(msg)
-
-        if inputs[2] is not None:
-            dtype = _convert_dtype_value(inputs[2])
-        else:
-            dtype = input_types[0]
-        return _op.zeros(size, dtype)
-
-    def randn(self, inputs, input_types):
-        import time  # use current time as seed
-
-        shape = inputs[0]
-        output = _op.random.normal(_op.random.threefry_key(int(time.time())), shape)
-        _, values = _expr.TupleWrapper(output, 2)
-        return values
-
-    def bincount(self, inputs, input_types):
-        data = inputs[0]
-        weights = inputs[1]
-        input_type = self.infer_type(data).dtype
-        if input_type == "int64":
-            logger.warning(
-                "Casting an int64 input to int32, since we do not have int64 atomic add"
-                "needed for bincount yet."
-            )
-            data = _op.cast(data, "int32")
-        maximum = _op.max(data)
-        dim = maximum + _expr.const(1, dtype="int32")
-        if weights:
-            weight_type = self.infer_type(weights)
-            out_dtype = weight_type.dtype
-            updates = weights
-        else:
-            out_dtype = "int32"
-            updates = _op.ones_like(data)
-
-        counts = _op.zeros(_op.reshape(dim, [1]), out_dtype)
-        out = _op.scatter_elements(counts, data, updates, axis=0, reduction="add")
-        if input_type == "int32":
-            # Torch always outputs int64 results for bincount
-            return _op.cast(out, "int64")
-        return out
-
-    def scatter_add(self, inputs, input_types):
-        assert (
-            len(inputs) == 4
-        ), f"scatter_add takes 4 inputs (data, dim, index, src), but {len(inputs)} given"
-        data = inputs[0]
-        axis = inputs[1]
-        index = inputs[2]
-        src = inputs[3]
-
-        data_shape = self.infer_shape(inputs[0])
-        data_rank = len(data_shape)
-        index_shape = self.infer_shape(inputs[2])
-        index_rank = len(index_shape)
-        # When index is empty, the operation returns data unchanged
-        if self.is_empty_shape(index_shape):
-            return data
-        src_shape = self.infer_shape(inputs[3])
-        src_rank = len(src_shape)
-        assert data_rank == index_rank, "Index rank is not the same as data rank"
-        assert data_rank == src_rank, "Src rank is not the same as data rank"
-
-        assert 0 <= axis < data_rank, "Dim is out of bounds"
-
-        for i in range(data_rank):
-            assert index_shape[i] <= src_shape[i], "Index dim size should be less than src one"
-            if i != axis:
-                assert (
-                    index_shape[i] <= data_shape[i]
-                ), "Index dim size should be less than data one"
-
-        return _op.scatter_elements(data, index, src, axis=axis, reduction="add")
-
-    def scatter_reduce(self, inputs, input_types):
-        assert len(inputs) == 5 or len(inputs) == 6, (
-            f"scatter_reduce takes 5 or 6 inputs (data, dim, index, src, reduce, include_self), "
-            f"but {len(inputs)} given"
-        )
-        data = inputs[0]
-        dim = inputs[1]
-        index = inputs[2]
-        src = inputs[3]
-        reduce = inputs[4]
-        if len(inputs) == 6:
-            include_self = inputs[5]
-            # TODO(vvchernov): support include_self == False
-            assert include_self, "include_self=False has not been suppoted for scatter_reduce yet"
-
-        data_shape = self.infer_shape(inputs[0])
-        data_rank = len(data_shape)
-        index_shape = self.infer_shape(inputs[2])
-        index_rank = len(index_shape)
-        src_shape = self.infer_shape(inputs[3])
-        src_rank = len(src_shape)
-        assert data_rank == index_rank, "Index rank is not the same as data rank"
-        assert data_rank == src_rank, "Src rank is not the same as data rank"
-
-        assert 0 <= dim < data_rank, "Dim is out of bounds"
-
-        for i in range(data_rank):
-            assert index_shape[i] <= src_shape[i], "Index dim size should be less than src one"
-            if i != dim:
-                assert (
-                    index_shape[i] <= data_shape[i]
-                ), "Index dim size should be less than data one"
-
-        red_valids = ["sum", "prod", "mean", "amax", "amin"]
-        assert (
-            reduce in red_valids
-        ), f"Only {red_valids} modes are supported, but {reduce} is gotten"
-        if reduce == "sum":
-            reduce = "add"
-        elif reduce == "prod":
-            reduce = "mul"
-        elif reduce == "amin":
-            reduce = "min"
-        elif reduce == "amax":
-            reduce = "max"
-
-        return _op.scatter_elements(data, index, src, axis=dim, reduction=reduce)
-
-    def cumsum(self, inputs, input_types):
-        data = inputs[0]
-        dim = inputs[1]
-        dtype = inputs[2]
-
-        if inputs[2] is not None:
-            dtype = _convert_dtype_value(inputs[2])
-
-        return _op.cumsum(data, axis=dim, dtype=dtype)
-
-    def masked_fill(self, inputs, input_types):
-        mask = inputs[1]
-        value = _op.cast(_wrap_const(inputs[2]), input_types[0])
-        return _op.where(mask, value, inputs[0])
-
-    def masked_select(self, inputs, input_types):
-        mask = inputs[1]
-        indices = self.nonzero([mask], input_types, is_numpy_style=True)
-        return _op.adv_index([inputs[0]] + [indices[i] for i in range(indices.size)])
-
-    def sort(self, inputs, input_types):
-        data = inputs[0]
-        dim = inputs[1]
-        is_descending = inputs[2]
-        # pytorch sort returns both sorted indices and values
-        indices = _op.argsort(data, dim, not is_descending)
-        return _op.gather(data, dim, indices), indices
-
-    def argsort(self, inputs, input_types):
-        data = inputs[0]
-        dim = inputs[1]
-        is_descending = inputs[2]
-        return _op.argsort(data, dim, not is_descending)
-
-    def is_floating_point(self, inputs, input_types):
-        assert len(inputs) == 1
-
-        if isinstance(inputs[0], _expr.Expr):
-            input_type = self.infer_type(inputs[0]).dtype
-        else:
-            input_type = input_types[0]
-
-        is_float = input_type in ["float32", "float64", "float16", "bfloat16"]
-        return _expr.const(is_float)
-
-    def unique(self, inputs, input_types):
-        assert len(inputs) == 4
-        [data, is_sorted, return_inverse, return_counts] = inputs
-        if not is_sorted:
-            logger.warning("TVM always assumes sorted=True for torch.unique")
-            is_sorted = True
-        if return_counts:
-            [unique, indices, inverse_indices, num_uniq, counts] = _op.unique(
-                data, is_sorted=is_sorted, return_counts=True
-            )
-            unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
-            counts_sliced = _op.strided_slice(counts, begin=[0], end=num_uniq, slice_mode="size")
-            return (unique_sliced, inverse_indices, counts_sliced)
-        else:
-            [unique, indices, inverse_indices, num_uniq] = _op.unique(
-                data, is_sorted=is_sorted, return_counts=False
-            )
-            unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
-            return (unique_sliced, inverse_indices)
-
-    def nll_loss(self, inputs, input_types):
-        assert len(inputs) == 5
-        [predictions, targets, weights, reduction, ignore_index] = inputs
-        num_class = self.infer_shape(predictions)[1]
-        if reduction == 0:
-            reduction = "none"
-        elif reduction == 1:
-            reduction = "mean"
-        else:
-            reduction = "sum"
-        if weights is None:
-            weights = _op.full(_expr.const(1), (num_class,), dtype=input_types[0])
-        return _op.nn.nll_loss(predictions, targets, weights, reduction, ignore_index)
-
-    def flip(self, inputs, input_types):
-        data = inputs[0]
-        axis = inputs[1]
-        out = data
-        for ax in axis:
-            out = _op.reverse(out, ax)
-        return out
-
-    def bidir_rnn_cell(self, input_seqs, weights_dicts, act=_op.tanh):
-        """
-        Bidirectional RNN cell
-        """
-        seq_len = len(input_seqs)
-        forward_outputs, fw_H_t = rnn_cell(input_seqs, **weights_dicts[0], backwards=False, act=act)
-
-        reverse_outputs, rev_H_t = rnn_cell(input_seqs, **weights_dicts[1], backwards=True, act=act)
-
-        final_outputs = []
-        for i in range(seq_len):
-            final_outputs.append(
-                _op.concatenate([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=-1)
-            )
-
-        return final_outputs, _op.stack([fw_H_t, rev_H_t], axis=0)
-
-    def rnn_layers(self, input_data, layer_weights_dicts, bidirectional, act, dropout_p=0.0):
-        """
-        Methods iterates layers for Stacked RNN
-        """
-        layers_num = len(layer_weights_dicts)
-        # split input sequence to samples set
-        input_seqs = unbind(input_data, 0)  # [seq_num, (batch, feature_size)]
-        output_hiddens = []
-        for i in range(layers_num):
-            weights_dicts = layer_weights_dicts[i]
-            # input_seqs shape = [seq_num, (batch, feature_size)] or
-            # [seq_num, (batch, 2*feature_size)] for bidirectional
-            if bidirectional:
-                input_seqs, H_t = self.bidir_rnn_cell(input_seqs, weights_dicts, act=act)
-            else:
-                input_seqs, H_t = rnn_cell(input_seqs, **weights_dicts[0], act=act)
-
-            output_hiddens.append(H_t)
-
-            # TODO (yuanfz98): in pytorch implementation train is also checked
-            # see https://github.com/pytorch/pytorch/blob/70c8daf43946b53af6493d058899ef952d27d339
-            # /aten/src/ATen/native/RNN.cpp#L1054
-            if dropout_p != 0 and i < layers_num - 1:
-                # for input in input_seqs:
-                #     input = _op.dropout(input, dropout_p)
-                raise NotImplementedError("Dropout for GRU has not been supported yet!")
-        output_hiddens = (
-            _op.concatenate(output_hiddens, 0) if bidirectional else _op.stack(output_hiddens, 0)
-        )
-        return _op.stack(input_seqs, 0), output_hiddens
-
-    def rnn(self, inputs, input_types, nonlinearity):
-        """
-        Description of RNN in pytorch:
-        https://pytorch.org/docs/stable/generated/torch.nn.RNN.html#torch.nn.RNN
-        Description of inputs:
-        https://github.com/pytorch/pytorch/blob/736fb7d22cc948b739db2c35aeb5ad4d19aea4f4/torch/overrides.py#L937
-        """
-        # TODO (yuanfz98): support dropout
-        assert len(inputs) == 9, "Input of size 9 is expected"
-        # Unpack inputs, note that if optional and not provided then value will be None.
-        _X = inputs[0]
-        # _X shape (seq_num, batch, feature_size) or (batch, seq_num, feature_size)
-
-        hidden_state = inputs[1]
-        # Hidden state shape (hidden_layers_num, batch, hidden_size)
-
-        _weights = inputs[2]
-        # Wi layer[0] shape (hidden_size, feature_size)
-        # Wh layer[0] shape (hidden_size, hidden_size)
-        # Bi layer[0] shape (hidden_size)
-        # Bh layer[0] shape (hidden_size)
-
-        # Wi layer[>0] shape (hidden_size, hidden_size * num_directions)
-        # Wh layer[>0] shape (hidden_size, hidden_size)
-        # Bi layer[>0] shape (hidden_size)
-        # Bh layer[>0] shape (hidden_size)
-
-        # Scalar inputs
-        has_biases = inputs[3]
-        num_layers = inputs[4]
-        dropout_p = inputs[5]  # dropout probability, if 0.0 it means there is no dropout
-        # train = inputs[6]
-        bidirectional = inputs[7]
-        batch_first = inputs[8]
-
-        num_directions = 1
-        if bidirectional:
-            num_directions = 2
-
-        rsd = len(_weights) % num_layers
-        assert rsd == 0, "The number of weights must be a multiple of the number of layers!"
-        rsd = (len(_weights) / num_layers) % num_directions
-        assert (
-            rsd == 0
-        ), "The number of weights in layer must be a multiple of the number of directions!"
-
-        weights_num = int(len(_weights) / num_layers / num_directions)
-        if has_biases:
-            assert weights_num == 4, "The weights number in layer is expected equal to 4"
-        else:
-            assert weights_num == 2, "The weights number in layer is expected equal to 2"
-        if nonlinearity == "tanh":
-            act = _op.tanh
-        elif nonlinearity == "relu":
-            act = _op.nn.relu
-        assert act, "The nonlinearity is unknown"
-        X = (
-            _op.transpose(_X, (1, 0, 2)) if batch_first else _X
-        )  # always (seq_num, batch, feature_size)
-        # TODO (yuanfz98): Which data type should be used? from input or weights?
-        # Instead of it _infer_type(X).checked_type.dtype can be used
-        X_dtype = input_types[0]
-        X_shape = _infer_shape(X)  # (seq_num, batch, feature_size)
-
-        hidden_size = int(_infer_shape(_weights[0])[0])
-        batch_size = X_shape[1]
-
-        # Initialize hidden states if not provided.
-        layers_h = []
-        hidden_layers_num = num_directions * num_layers
-        if hidden_state is None:
-            h_0 = _op.zeros((batch_size, hidden_size), X_dtype)
-            for i in range(hidden_layers_num):
-                layers_h.append(h_0)
-        else:
-            layers_h = unbind(hidden_state, 0)
-
-        layer_weights_dicts = []
-        k = 0  # layer counter
-        if has_biases:
-            names = ["hidden_state", "w_inp", "w_hid", "b_inp", "b_hid"]
-            if bidirectional:
-                rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of RNN weights"
-                for i in range(0, len(_weights), 2 * weights_num):
-                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 4]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    j = i + weights_num
-                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 4]]
-                    rev_weights_dict = dict(zip(names, rev_tensors))
-                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
-                    k += 1
-            else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of GRU weights"
-                for i in range(0, len(_weights), weights_num):
-                    fw_tensors = [layers_h[k], *_weights[i : i + 4]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    layer_weights_dicts.append([fw_weights_dict])
-                    k += 1
-        else:
-            names = ["hidden_state", "w_inp", "w_hid"]
-            if bidirectional:
-                rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of RNN weights"
-                for i in range(0, len(_weights), 2 * weights_num):
-                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 2]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    j = i + weights_num
-                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 2]]
-                    rev_weights_dict = dict(zip(names, rev_tensors))
-                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
-                    k += 1
-            else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of RNN weights"
-                for i in range(0, len(_weights), weights_num):
-                    fw_tensors = [layers_h[k], *_weights[i : i + 2]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    layer_weights_dicts.append([fw_weights_dict])
-                    k += 1
-        assert (
-            len(layer_weights_dicts) == num_layers and k == num_layers
-        ), "For stacked RNN number of weights sets should be the same as number of layers!"
-        output, out_hidden_state = self.rnn_layers(
-            X, layer_weights_dicts, bidirectional, act, dropout_p=dropout_p
-        )
-
-        # output shape = (seq_num, batch, hidden_size) or
-        # (seq_num, batch, 2*feature_size) for bidirectional
-        if batch_first:
-            output = _op.transpose(output, (1, 0, 2))
-
-        return (output, out_hidden_state)
-
-    def bidir_gru_cell(self, input_seqs, weights_dicts):
-        """
-        Bidirectional GRU cell
-        """
-        seq_len = len(input_seqs)
-        forward_outputs, fw_H_t = gru_cell(input_seqs, **weights_dicts[0])
-
-        reverse_outputs, rev_H_t = gru_cell(input_seqs, **weights_dicts[1], backwards=True)
-
-        final_outputs = []
-        for i in range(seq_len):
-            final_outputs.append(
-                _op.concatenate([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=-1)
-            )
-
-        return final_outputs, _op.stack([fw_H_t, rev_H_t], axis=0)
-
-    def gru_layers(self, input_data, layer_weights_dicts, bidirectional, dropout_p=0.0):
-        """
-        Methods iterates layers for Stacked GRU
-        """
-        layers_num = len(layer_weights_dicts)
-        # split input sequence to samples set
-        input_seqs = unbind(input_data, 0)  # [seq_num, (batch, feature_size)]
-        output_hiddens = []
-        for i in range(layers_num):
-            weights_dicts = layer_weights_dicts[i]
-            # input_seqs shape = [seq_num, (batch, feature_size)] or
-            # [seq_num, (batch, 2*feature_size)] for bidirectional
-            if bidirectional:
-                input_seqs, H_t = self.bidir_gru_cell(input_seqs, weights_dicts)
-            else:
-                input_seqs, H_t = gru_cell(input_seqs, **weights_dicts[0])
-
-            output_hiddens.append(H_t)
-
-            # TODO (vvchernov): in pytorch implementation train is also checked
-            # see https://github.com/pytorch/pytorch/blob/70c8daf43946b53af6493d058899ef952d27d339
-            # /aten/src/ATen/native/RNN.cpp#L1054
-            if dropout_p != 0 and i < layers_num - 1:
-                # for input in input_seqs:
-                #     input = _op.dropout(input, dropout_p)
-                raise NotImplementedError("Dropout for GRU has not been supported yet!")
-
-        return _op.stack(input_seqs, 0), _op.stack(output_hiddens, 0)
-
-    def gru(self, inputs, input_types):
-        """
-        Description of GRU in pytorch:
-        https://pytorch.org/docs/stable/generated/torch.nn.GRU.html?highlight=gru#torch.nn.GRU
-        """
-        # TODO (vvchernov): support dropout
-        assert len(inputs) == 9, "Input of size 9 is expected"
-        # Unpack inputs, note that if optional and not provided then value will be None.
-        _X = inputs[0]
-        # _X shape (seq_num, batch, feature_size) or (batch, seq_num, feature_size)
-
-        hidden_state = inputs[1]
-        # Hidden state shape (hidden_layers_num, batch, hidden_size)
-
-        _weights = inputs[2]
-        # Wi layer[0] shape (3 * hidden_size, feature_size)
-        # Wh layer[0] shape (3 * hidden_size, hidden_size)
-        # Bi layer[0] shape (3 * hidden_size)
-        # Bh layer[0] shape (3 * hidden_size)
-
-        # Wi layer[>0] shape (3 * hidden_size, hidden_size * num_directions)
-        # Wh layer[>0] shape (3 * hidden_size, hidden_size)
-        # Bi layer[>0] shape (3 * hidden_size)
-        # Bh layer[>0] shape (3 * hidden_size)
-
-        # Scalar inputs
-        has_biases = inputs[3]
-        num_layers = inputs[4]
-        dropout_p = inputs[5]  # dropout probability, if 0.0 it means there is no dropout
-        # train = inputs[6]
-        bidirectional = inputs[7]
-        batch_first = inputs[8]
-
-        num_directions = 1
-        if bidirectional:
-            num_directions = 2
-
-        rsd = len(_weights) % num_layers
-        assert rsd == 0, "The number of weights must be a multiple of the number of layers!"
-        rsd = (len(_weights) / num_layers) % num_directions
-        assert (
-            rsd == 0
-        ), "The number of weights in layer must be a multiple of the number of directions!"
-
-        weights_num = int(len(_weights) / num_layers / num_directions)
-        if has_biases:
-            assert weights_num == 4, "The weights number in layer is expected equal to 4"
-        else:
-            assert weights_num == 2, "The weights number in layer is expected equal to 2"
-
-        X = _op.transpose(_X, (1, 0, 2)) if batch_first else _X
-        # TODO (vvchernov): Which data type should be used? from input or weights?
-        # Instead of it _infer_type(X).checked_type.dtype can be used
-        X_dtype = input_types[0]
-        X_shape = _infer_shape(X)  # (seq_num, batch, feature_size)
-
-        hidden_size = int(_infer_shape(_weights[0])[0] / 3)
-        batch_size = X_shape[1]
-
-        # Initialize hidden states if not provided.
-        layers_h = []
-        hidden_layers_num = num_directions * num_layers
-        if hidden_state is None:
-            h_0 = _op.zeros((batch_size, hidden_size), X_dtype)
-            for i in range(hidden_layers_num):
-                layers_h.append(h_0)
-        else:
-            layers_h = unbind(hidden_state, 0)
-
-        layer_weights_dicts = []
-        k = 0  # layer counter
-        if has_biases:
-            names = ["hidden_state", "w_inp", "w_hid", "b_inp", "b_hid"]
-            if bidirectional:
-                rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of GRU weights"
-                for i in range(0, len(_weights), 2 * weights_num):
-                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 4]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    j = i + weights_num
-                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 4]]
-                    rev_weights_dict = dict(zip(names, rev_tensors))
-                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
-                    k += 1
-            else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of GRU weights"
-                for i in range(0, len(_weights), weights_num):
-                    fw_tensors = [layers_h[k], *_weights[i : i + 4]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    layer_weights_dicts.append([fw_weights_dict])
-                    k += 1
-        else:
-            names = ["hidden_state", "w_inp", "w_hid"]
-            if bidirectional:
-                rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of GRU weights"
-                for i in range(0, len(_weights), 2 * weights_num):
-                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 2]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    j = i + weights_num
-                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 2]]
-                    rev_weights_dict = dict(zip(names, rev_tensors))
-                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
-                    k += 1
-            else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of GRU weights"
-                for i in range(0, len(_weights), weights_num):
-                    fw_tensors = [layers_h[k], *_weights[i : i + 2]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    layer_weights_dicts.append([fw_weights_dict])
-                    k += 1
-        assert (
-            len(layer_weights_dicts) == num_layers and k == num_layers
-        ), "For stacked GRU number of weights sets should be the same as number of layers!"
-
-        output, out_hidden_state = self.gru_layers(
-            X, layer_weights_dicts, bidirectional, dropout_p=dropout_p
-        )
-
-        # output shape = (seq_num, batch, hidden_size) or
-        # (seq_num, batch, 2*feature_size) for bidirectional
-        if batch_first:
-            output = _op.transpose(output, (1, 0, 2))
-
-        return (output, out_hidden_state)
-
-    def bidir_lstm_cell(self, input_seqs, weights_dicts):
-        """
-        Bidirectional LSTM cell
-        """
-        seq_len = len(input_seqs)
-        forward_outputs, fw_H_t, fw_C_t = lstm_cell(input_seqs, **weights_dicts[0])
-
-        reverse_outputs, rev_H_t, rev_C_t = lstm_cell(
-            input_seqs, **weights_dicts[1], backwards=True
-        )
-
-        final_outputs = []
-        for i in range(seq_len):
-            final_outputs.append(
-                _op.concatenate([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=-1)
-            )
-
-        return final_outputs, (fw_H_t, fw_C_t), (rev_H_t, rev_C_t)
-
-    def lstm_layers(self, input_data, layer_weights_dicts, bidirectional, dtype, dropout_p=0.0):
-        """
-        Methods iterates layers for Stacked LSTM
-        """
-        layers_num = len(layer_weights_dicts)
-        # split input sequence to samples set
-        input_seqs = unbind(input_data, 0)  # [seq_num, (batch, feature_size)]
-        output_hiddens = []
-        for i in range(layers_num):
-            weights_dicts = layer_weights_dicts[i]
-            # input_seqs shape = [seq_num, (batch, feature_size)] or
-            # [seq_num, (batch, 2*feature_size)] for bidirectional
-            if bidirectional:
-                input_seqs, H_t, C_t = self.bidir_lstm_cell(input_seqs, weights_dicts)
-            else:
-                input_seqs, H_t, C_t = lstm_cell(input_seqs, **weights_dicts[0])
-
-            output_hiddens.append((H_t, C_t))
-
-            # TODO (vvchernov): in pytorch implementation train is also checked
-            # see https://github.com/pytorch/pytorch/blob/70c8daf43946b53af6493d058899ef952d27d339
-            # /aten/src/ATen/native/RNN.cpp#L1054
-            if dropout_p != 0 and i < layers_num - 1:
-                # for input in input_seqs:
-                #     input = _op.dropout(input, dropout_p)
-                raise NotImplementedError("Dropout for LSTM has not been supported yet!")
-        final_hiddens = []
-        if bidirectional:
-            for output_hidden in output_hiddens:
-                final_hiddens.append(output_hidden[0])
-                final_hiddens.append(output_hidden[1])
-        else:
-            final_hiddens = output_hiddens
-
-        return _op.stack(input_seqs, 0), final_hiddens
-
-    def lstm(self, inputs, input_types):
-        """
-        Description of LSTM in pytorch:https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
-        Native implementation for torch version less than 1.8.0 (projection is unsupported):
-        https://github.com/pytorch/pytorch/blob/70c8daf43946b53af6493d058899ef952d27d339/aten/ \
-        src/ATen/native/RNN.cpp#L1396
-        Native implementation for torch version from 1.8.0 and higher (projection is supported):
-        https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/RNN.cpp#L1483
-        """
-        # TODO (vvchernov): support dropout
-        assert len(inputs) == 9, "Input of size 9 is expected"
-        # Unpack inputs, note that if optional and not provided then value will be None.
-        _X = inputs[0]
-        # _X shape (seq_num, batch, feature_size) or (batch, seq_num, feature_size)
-
-        hidden_states = inputs[1]
-        assert len(hidden_states) == 2, "lstm expects two hidden states"
-        h_0 = hidden_states[0]
-        c_0 = hidden_states[1]
-        # H0 shape (hidden_layers_num, batch, proj_size) if projection
-        # else (hidden_layers_num, batch, hidden_size)
-        # C0 shape (hidden_layers_num, batch, hidden_size)
-
-        _weights = inputs[2]
-        # If no projection
-        # Wi layer[0] shape (4 * hidden_size, feature_size)
-        # Wh layer[0] shape (4 * hidden_size, hidden_size)
-        # Bi layer[0] shape (4 * hidden_size)
-        # Bh layer[0] shape (4 * hidden_size)
-
-        # Wi layer[>0] shape (4 * hidden_size, hidden_size * num_directions)
-        # Wh layer[>0] shape (4 * hidden_size, hidden_size)
-        # Bi layer[>0] shape (4 * hidden_size)
-        # Bh layer[>0] shape (4 * hidden_size)
-
-        # If projection
-        # Wi layer[0] shape (4 * hidden_size, feature_size)
-        # Wh layer[0] shape (4 * hidden_size, proj_size)
-        # Bi layer[0] shape (4 * hidden_size)
-        # Bh layer[0] shape (4 * hidden_size)
-        # P  layer[0] shape (proj_size, hidden_size)
-
-        # Wi layer[>0] shape (4 * hidden_size, proj_size * num_directions)
-        # Wh layer[>0] shape (4 * hidden_size, proj_size)
-        # Bi layer[>0] shape (4 * hidden_size)
-        # Bh layer[>0] shape (4 * hidden_size)
-        # P  layer[>0] shape (proj_size, hidden_size)
-
-        # Scalar inputs
-        has_biases = inputs[3]
-        num_layers = inputs[4]
-        dropout_p = inputs[5]  # dropout probability, if 0.0 it means there is no dropout
-        # train = inputs[6]
-        bidirectional = inputs[7]
-        batch_first = inputs[8]
-
-        num_directions = 1
-        if bidirectional:
-            num_directions = 2
-
-        rsd = len(_weights) % num_layers
-        assert rsd == 0, "The number of weights must be a multiple of the number of layers!"
-        rsd = (len(_weights) / num_layers) % num_directions
-        assert (
-            rsd == 0
-        ), "The number of weights in layer must be a multiple of the number of directions!"
-        has_proj = False
-        proj_size = 0
-        weights_num = int(len(_weights) / num_layers / num_directions)
-        if has_biases:
-            if weights_num == 5:
-                has_proj = True
-                proj_size = _infer_shape(_weights[4])[0]
-            else:
-                assert weights_num == 4, "The weights number in layer is expected equal to 4"
-        else:
-            if weights_num == 3:
-                has_proj = True
-                proj_size = _infer_shape(_weights[2])[0]
-            else:
-                assert weights_num == 2, "The weights number in layer is expected equal to 2"
-
-        X = _op.transpose(_X, (1, 0, 2)) if batch_first else _X
-        # TODO (vvchernov): Which data type should be used? from input or weights?
-        # Instead of it _infer_type(X).checked_type.dtype can be used
-        X_dtype = input_types[0]
-        X_shape = _infer_shape(X)  # (seq_num, batch, feature_size)
-
-        hidden_size = _infer_shape(_weights[0])[0] / 4
-        batch_size = X_shape[1]
-
-        # Initialize hidden states if not provided.
-        layers_h = []
-        layers_c = []
-        hidden_layers_num = num_directions * num_layers
-        if h_0 is None:
-            if has_proj:
-                h_0 = _op.zeros((batch_size, proj_size), X_dtype)
-            else:
-                h_0 = _op.zeros((batch_size, hidden_size), X_dtype)
-            for i in range(hidden_layers_num):
-                layers_h.append(h_0)
-        else:
-            layers_h = unbind(h_0, 0)
-        if c_0 is None:
-            c_0 = _op.zeros((batch_size, hidden_size), X_dtype)
-            for i in range(hidden_layers_num):
-                layers_c.append(c_0)
-        else:
-            layers_c = unbind(c_0, 0)
-
-        layer_weights_dicts = []
-        k = 0  # layer counter
-        if has_biases:
-            names = ["hidden_state", "cell_state", "w_inp", "w_hid", "b_inp", "b_hid"]
-            if bidirectional:
-                rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of LSTM weights"
-                for i in range(0, len(_weights), 2 * weights_num):
-                    fw_tensors = [layers_h[2 * k], layers_c[2 * k], *_weights[i : i + 4]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    if has_proj:
-                        fw_weights_dict["proj"] = _weights[i + 4]
-                    j = i + weights_num
-                    rev_tensors = [layers_h[2 * k + 1], layers_c[2 * k + 1], *_weights[j : j + 4]]
-                    rev_weights_dict = dict(zip(names, rev_tensors))
-                    if has_proj:
-                        rev_weights_dict["proj"] = _weights[j + 4]
-                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
-                    k += 1
-            else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of LSTM weights"
-                for i in range(0, len(_weights), weights_num):
-                    fw_tensors = [layers_h[k], layers_c[k], *_weights[i : i + 4]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    if has_proj:
-                        fw_weights_dict["proj"] = _weights[i + 4]
-                    layer_weights_dicts.append([fw_weights_dict])
-                    k += 1
-        else:
-            names = ["hidden_state", "cell_state", "w_inp", "w_hid"]
-            if bidirectional:
-                rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of LSTM weights"
-                for i in range(0, len(_weights), 2 * weights_num):
-                    fw_tensors = [layers_h[2 * k], layers_c[2 * k], *_weights[i : i + 2]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    if has_proj:
-                        fw_weights_dict["proj"] = _weights[i + 2]
-                    j = i + weights_num
-                    rev_tensors = [layers_h[2 * k + 1], layers_c[2 * k + 1], *_weights[j : j + 2]]
-                    rev_weights_dict = dict(zip(names, rev_tensors))
-                    if has_proj:
-                        rev_weights_dict["proj"] = _weights[j + 2]
-                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
-                    k += 1
-            else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of LSTM weights"
-                for i in range(0, len(_weights), weights_num):
-                    fw_tensors = [layers_h[k], layers_c[k], *_weights[i : i + 2]]
-                    fw_weights_dict = dict(zip(names, fw_tensors))
-                    if has_proj:
-                        fw_weights_dict["proj"] = _weights[i + 2]
-                    layer_weights_dicts.append([fw_weights_dict])
-                    k += 1
-        assert (
-            len(layer_weights_dicts) == num_layers and k == num_layers
-        ), "For stacked LSTM number of weights sets should be the same as number of layers!"
-
-        outputs = self.lstm_layers(
-            X, layer_weights_dicts, bidirectional, dtype=X_dtype, dropout_p=dropout_p
-        )
-
-        # output shape = (seq_num, batch, hidden_size) or
-        # (seq_num, batch, 2*feature_size) for bidirectional
-        output = outputs[0]
-
-        hy = []
-        cy = []
-        for hidden in outputs[1]:
-            hy.append(hidden[0])
-            cy.append(hidden[1])
-
-        if batch_first:
-            output = _op.transpose(output, (1, 0, 2))
-
-        return (output, _op.stack(hy, 0), _op.stack(cy, 0))
-
-    def all_any_common(self, op, inputs, input_types):
-        if len(inputs) >= 2:
-            dim = inputs[1]
-        else:
-            dim = None
-        if len(inputs) >= 3:
-            keepdim = inputs[2]
-        else:
-            keepdim = False
-        if self.infer_type(inputs[0]).dtype != "bool":
-            # The input dtype can be uint8.
-            inp = _op.cast(inputs[0], "bool")
-        else:
-            inp = inputs[0]
-        return op(inp, axis=dim, keepdims=keepdim)
-
-    def searchsorted_common(
-        self, sorted_sequence, values, out_int32, right, side=None, out=None, sorter=None
-    ):
-        assert side is None and out is None and sorter is None, "unsupported parameters"
-        dtype = "int32" if out_int32 else "int64"
-        values_shape = _infer_shape(values)
-
-        if len(values_shape) == 0:
-            values = _op.expand_dims(values, 0)
-
-        out = _op.searchsorted(sorted_sequence, values, right=right, dtype=dtype)
-
-        if len(values_shape) == 0:
-            return _op.squeeze(out)
-
-        return out
-
-    def searchsorted(self, inputs, input_types):
-        return self.searchsorted_common(*inputs)
-
-    def bucketize(self, inputs, input_types):
-        return self.searchsorted_common(inputs[1], inputs[0], inputs[2], inputs[3])
-
-    def roll(self, inputs, input_types):
-        def slide_axes(inp, shape, ax):
-            axes = list(range(len(shape)))
-            axes = axes[:ax] + [-1] + axes[ax:-1]
-            return _op.transpose(inp, axes)
-
-        x = inputs[0]
-        shifts = inputs[1]
-        dims = inputs[2]
-        shape = self.infer_shape(x)
-        start = _expr.const(0, "int64")
-        step = _expr.const(1, "int64")
-
-        out = x
-        for i, dim in enumerate(dims):
-            roll_dim = _expr.const(shape[dim], "int64")
-            indices_1d = _op.mod(
-                _op.transform.arange(start, roll_dim, step, "int64")
-                - _expr.const(shifts[i], "int64")
-                + roll_dim,
-                roll_dim,
-            )
-            # First fill in the last axis with roll indices, and then do transpose to
-            # bring the roll indices into the desired axis.
-            indices = slide_axes(
-                _op.tile(indices_1d, shape[:dim] + shape[dim + 1 :] + (1,)), shape, dim
-            )
-            out = _op.gather(out, dim, indices)
-
-        return out
-
-    def einsum(self, inputs, input_types):
-        equation = inputs[0]
-        data = inputs[1]
-        return _op.einsum(data, equation)
-
-    def dot(self, inputs, _):
-        lhs, rhs = inputs
-        return _op.sum(_op.multiply(lhs, rhs))
-
-    def mv(self, inputs, _):
-        lhs, rhs = inputs
-
-        # Convert the 1D matrix (vector) into a 2D matrix with the extra
-        # dimension=1
-        rhs_matrix = _op.transform.expand_dims(rhs, 0)
-
-        # Run multiplication
-        dense_result = _op.nn.dense(lhs, rhs_matrix, units=None)
-
-        # Chop off the extra result dimension
-        return _op.transform.squeeze(dense_result)
-
-    def grid_sampler(self, inputs, input_types):
-        interpolate_mode = inputs[2]
-        padding_mode = inputs[3]
-        align_corners = inputs[4]
-        data_shape = self.infer_shape_with_prelude(inputs[0])
-
-        if len(data_shape) == 4:
-            layout = "NCHW"
-            axes = [0, 3, 1, 2]
-            grid = _op.transform.transpose(inputs[1], axes)
-        elif len(data_shape) == 5:
-            layout = "NCDHW"
-            axes = [0, 4, 1, 2, 3]
-            grid = _op.transform.transpose(inputs[1], axes)
-        else:
-            msg = "only 4D and 5D are supported."
-            raise ValueError(msg)
-
-        if interpolate_mode == 0:
-            interpolate_str = "bilinear"
-        elif interpolate_mode == 1:
-            interpolate_str = "nearest"
-        elif interpolate_mode == 2:
-            interpolate_str = "bicubic"
-        else:
-            msg = f"interpolation method {interpolate_mode} is not supported"
-            raise ValueError(msg)
-
-        if padding_mode == 0:
-            padding_mode_str = "zeros"
-        elif padding_mode == 1:
-            padding_mode_str = "border"
-        elif padding_mode == 2:
-            padding_mode_str = "reflection"
-        else:
-            msg = f"padding_mode {padding_mode} is not supported"
-            raise ValueError(msg)
-
-        return _op.image.grid_sample(
-            inputs[0], grid, interpolate_str, layout, padding_mode_str, align_corners
-        )
-
-    def trilu(self, inputs, input_types, mode):
-        data = inputs[0]
-        k = inputs[1] if inputs[1] else 0
-        upper = True if mode == "triu" else False
-        return _op.trilu(data, k, upper)
-
-    def multinomial(self, inputs, input_types):
-        probs = inputs[0]
-        num_samples = inputs[1]
-        replacement = inputs[2] if inputs[2] else True
-        assert not (
-            replacement is False and num_samples > 1
-        ), "Multinomial without replacement is not yet supported."
-        # Ideally this seed would be generated by a previous threefry operation.
-        # Eventually we might want to add a global store for random keys.
-        seed = np.random.randint(1e6)
-        key = _op.random.threefry_key(seed)
-        output = _op.random.multinomial(key, probs, num_samples)
-        _, indices = _expr.TupleWrapper(output, 2)
-        return indices
-
-    def weight_norm(self, inputs, input_types):
-        weight_v, weight_g = inputs[0], inputs[1]
-        dim = inputs[2]
-        dtype = input_types[0]
-        order = 2.0
-        reci_order = _expr.const(1.0 / order, dtype=dtype)
-        order = _expr.const(order)
-
-        norm_v = _op.power(
-            _op.reduce.sum(_op.power(_op.abs(weight_v), order), axis=dim, exclude=2, keepdims=True),
-            reci_order,
-        )
-        return weight_g * (weight_v / norm_v)
-
-    def inplace_copy(self, inputs, input_types):
-        source = inputs[0]
-        values = inputs[1]
-        accumulate = inputs[2]
-        if not accumulate:
-            mode = "update"
-        else:
-            mode = "add"
-
-        # Track slice and select calls
-        slice_and_select_calls = []
-        while True:
-            if isinstance(source, _expr.Call) and source.op.name in [
-                "strided_slice",
-                "take",
-            ]:
-                slice_and_select_calls.append(source)
-                source = source.args[0]
-            else:
-                break
-        slice_and_select_calls = slice_and_select_calls[::-1]
-        source_shape = _infer_shape(source)
-
-        # Create index map
-        index_map = {}
-        squeezed_axes = []
-        for call in slice_and_select_calls:
-            if call.op.name == "strided_slice":
-                axes = call.attrs.axes
-                if axes is None:
-                    axes = list(range(len(source_shape)))
-                begins = call.attrs.begin
-                ends = call.attrs.end
-                for axis, begin, end in zip(axes, begins, ends):
-                    num_squeezed_axis = len([v for v in squeezed_axes if v <= axis])
-                    axis += num_squeezed_axis
-                    # Set range
-                    if begin < 0:
-                        begin = source_shape[axis] + begin
-                    if end < 0:
-                        end = source_shape[axis] + end
-                    if begin == 0 and end == source_shape[axis]:
-                        continue
-                    index_map[axis] = (begin.value, end.value)
-            elif call.op.name == "take":
-                num_squeezed_axis = len([v for v in squeezed_axes if v <= axis])
-                axis = call.attrs.axis.value + num_squeezed_axis
-                idx = call.args[1]
-                assert isinstance(idx, _expr.Constant)
-                idx = idx.data.numpy().item()
-                if idx < 0:
-                    idx = source_shape[axis] + idx
-                index_map[axis] = (idx, idx + 1)
-                values = _op.expand_dims(values, axis)
-                squeezed_axes.append(axis)
-            else:
-                pass
-        last_index_dim = np.max(list(index_map)).item()
-        for axis in range(last_index_dim + 1):
-            if axis not in index_map:
-                index_map[axis] = 0, source_shape[axis]
-
-        # Create indices
-        nelem = 1
-        for (begin, end) in index_map.values():
-            nelem *= end - begin
-        chunk_sizes = [nelem]
-        for i in range(1, last_index_dim + 1):
-            begin, end = index_map[i - 1]
-            chunk_sizes.append(chunk_sizes[-1] // (end - begin))
-        indices = []
-        for axis in range(last_index_dim + 1):
-            chunk_size = chunk_sizes[axis]
-            repeat = nelem // chunk_size
-            begin, end = index_map[axis]
-            step_size = chunk_size // (end - begin)
-            chunk = np.repeat(np.arange(begin, end), step_size)
-            chunk = np.concatenate([chunk] * repeat)
-            indices.append(chunk)
-        indices = np.stack(indices, axis=0).astype(np.int64)
-        new_shape = [indices.shape[0]] + [
-            index_map[i][1] - index_map[i][0] for i in range(last_index_dim + 1)
-        ]
-        indices = np.resize(indices, new_shape)
-        indices = _expr.const(indices)
-
-        # Return
-        return _op.scatter_nd(source, indices, values, mode)
-
-    def linalg_vector_norm(self, inputs, input_types):
-        data = inputs[0]
-        dtype = input_types[0]
-        ord = inputs[1]
-        dim = inputs[2]
-        keepdim = inputs[3]
-
-        assert dtype == "float32" or dtype == "float64"
-
-        if ord == 0:
-            return _op.reduce.sum(
-                _op.cast(_op.not_equal(data, _expr.const(0, dtype=dtype)), dtype=dtype),
-                axis=dim,
-                keepdims=keepdim,
-            )
-        elif ord == np.inf:
-            return _op.reduce.max(_op.abs(data), axis=dim, keepdims=keepdim)
-        elif ord == -np.inf:
-            return _op.reduce.min(_op.abs(data), axis=dim, keepdims=keepdim)
-        reci_ord = _expr.const(1.0 / ord, dtype=dtype)
-        ord = _expr.const(ord, dtype=dtype)
-        return _op.power(
-            _op.reduce.sum(_op.power(_op.abs(data), ord), axis=dim, keepdims=keepdim),
-            reci_ord,
-        )
-
-    def scaled_dot_product_attention(self, inputs, input_types):
-        query = inputs[0]
-        key = inputs[1]
-        value = inputs[2]
-        attn_mask = inputs[3]
-        dropout_p = inputs[4]
-        is_causal = inputs[5]
-
-        # Explicit scale can be used from torch>=2.1.0
-        if len(inputs) == 7:
-            scale = inputs[6]
-        else:
-            scale = None
-
-        assert (
-            input_types[0] == input_types[1] == input_types[2]
-        ), "Expected query, key, and value to have the same dtype"
-
-        dtype = input_types[0]
-        assert dtype == "float32" or dtype == "float64", "Data type can be float32 or float64"
-
-        query_shape = self.infer_shape_with_prelude(query)
-        key_shape = self.infer_shape_with_prelude(key)
-        value_shape = self.infer_shape_with_prelude(value)
-        assert 3 <= len(query_shape) <= 4, "Only 3D or 4D query supported"
-        assert 3 <= len(key_shape) <= 4, "Only 3D or 4D key supported"
-        assert 3 <= len(value_shape) <= 4, "Only 3D or 4D value supported"
-
-        assert dropout_p == 0.0, "Only dropout_p==0.0 supported"
-
-        L, S = query_shape[-2], key_shape[-2]
-
-        if scale is None:
-            scale_factor = _expr.const(1 / math.sqrt(query_shape[-1]), dtype=dtype)
-        else:
-            scale_factor = _expr.const(scale, dtype=dtype)
-
-        attn_bias = _op.full(_expr.const(0.0, dtype=dtype), (L, S))
-
-        if is_causal:
-            assert attn_mask is None, "Explicit attn_mask shouldn't be set when is_causal=True"
-            temp_mask = _op.full(_expr.const(True), [L, S], dtype="bool")
-            temp_mask = _op.trilu(temp_mask, 0, upper=False)
-            temp_mask = _op.cast(temp_mask, dtype="bool")
-            temp_mask = _op.logical_not(temp_mask)
-            fill_value = _op.cast(_expr.const(float("-inf")), dtype=dtype)
-            attn_bias = _op.where(temp_mask, fill_value, attn_bias)
-            attn_bias = _op.cast(attn_bias, dtype)
-
-        if attn_mask is not None:
-            if input_types[3] == "bool":
-                attn_mask = _op.logical_not(attn_mask)
-                fill_value = _op.cast(_expr.const(float("-inf")), dtype=dtype)
-                attn_bias = _op.where(attn_mask, fill_value, attn_bias)
-            else:
-                attn_bias = _op.add(attn_bias, attn_mask)
-
-        if len(query_shape) < len(key_shape):
-            batch_size = key_shape[0]
-        else:
-            batch_size = query_shape[0]
-        if len(query_shape) == 4 and len(key_shape) == 4:
-            query = _op.reshape(query, newshape=[-3, -2])
-            key = _op.reshape(key, newshape=[-3, -2])
-        if len(query_shape) == 3 and len(key_shape) == 4:
-            query = _op.broadcast_to(query, shape=(batch_size,) + query_shape)
-            query = _op.reshape(query, newshape=[-3, -2])
-            key = _op.reshape(key, newshape=[-3, -2])
-        if len(query_shape) == 4 and len(key_shape) == 3:
-            query = _op.reshape(query, newshape=[-3, -2])
-            key = _op.broadcast_to(key, shape=(batch_size,) + key_shape)
-            key = _op.reshape(key, newshape=[-3, -2])
-        attn_weight = _op.nn.batch_matmul(query, key)
-        if len(query_shape) == 4 or len(key_shape) == 4:
-            attn_weight = _op.reshape(attn_weight, newshape=[-4, batch_size, -1, -2])
-        attn_weight = _op.squeeze(attn_weight, axis=[])
-
-        attn_weight = _op.multiply(attn_weight, scale_factor)
-        attn_weight = _op.add(attn_weight, attn_bias)
-        attn_weight = _op.nn.softmax(attn_weight)
-        attn_weight = _op.nn.dropout(attn_weight, rate=dropout_p)
-
-        aw_shape = self.infer_shape_with_prelude(attn_weight)
-        if len(aw_shape) < len(value_shape):
-            batch_size = value_shape[0]
-        else:
-            batch_size = aw_shape[0]
-        if len(aw_shape) == 4 and len(value_shape) == 4:
-            attn_weight = _op.reshape(attn_weight, newshape=[-3, -2])
-            value = _op.reshape(value, newshape=[-3, -2])
-        if len(aw_shape) == 3 and len(value_shape) == 4:
-            attn_weight = _op.broadcast_to(attn_weight, shape=(batch_size,) + aw_shape)
-            attn_weight = _op.reshape(attn_weight, newshape=[-3, -2])
-            value = _op.reshape(value, newshape=[-3, -2])
-        if len(aw_shape) == 4 and len(value_shape) == 3:
-            attn_weight = _op.reshape(attn_weight, newshape=[-3, -2])
-            value = _op.broadcast_to(value, shape=(batch_size,) + value_shape)
-            value = _op.reshape(value, newshape=[-3, -2])
-        attn_weight = _op.nn.batch_matmul(attn_weight, value, transpose_b=False)
-        if len(aw_shape) == 4 or len(value_shape) == 4:
-            attn_weight = _op.reshape(attn_weight, newshape=[-4, batch_size, -1, -2])
-        return attn_weight
-
-    def tile(self, inputs, input_types):
-        data = inputs[0]
-        reps = []
-        for r in inputs[1]:
-            if isinstance(r, int):
-                reps.append(r)
-            else:
-                reps.append(int(_infer_value(r, {}).numpy()))
-        return _op.tile(data, reps)
-
-    # Operator mappings
-    def create_convert_map(self):
-        self.convert_map = {
-            "aten::is_floating_point": self.is_floating_point,
-            "aten::pixel_shuffle": self.pixel_shuffle,
-            "aten::device": self.none,
-            "prim::device": self.none,
-            "aten::sub": self.sub,
-            "aten::max": self.max,
-            "aten::min": self.min,
-            "aten::maximum": self.maximum,
-            "aten::minimum": self.minimum,
-            "aten::amax": self.max,
-            "aten::amin": self.min,
-            "aten::stft": self.stft,
-            "aten::mul": self.make_elemwise("multiply"),
-            "aten::pow": self.make_elemwise("power"),
-            "aten::lerp": self.lerp,
-            "aten::arange": self.arange,
-            "aten::meshgrid": self.meshgrid,
-            "aten::div": self.make_elemwise("divide"),
-            "aten::floor_divide": self.make_elemwise("floor_divide"),
-            "aten::true_divide": self.make_elemwise("divide"),
-            "aten::fmod": self.make_elemwise("trunc_mod"),
-            "aten::remainder": self.make_elemwise("floor_mod"),
-            "aten::addcdiv": self.addcdiv,
-            "aten::addcmul": self.addcmul,
-            "aten::ones": self.ones,
-            "aten::ones_like": self.ones_like,
-            "aten::zeros": self.zeros,
-            "aten::zero_": self.zero_,
-            "aten::zeros_like": self.zeros_like,
-            "aten::new_zeros": self.new_zeros,
-            "aten::new_ones": self.new_ones,
-            "aten::full": self.full,
-            "aten::full_like": self.full_like,
-            "aten::new_full": self.new_full,
-            "aten::fill_": self.fill_,
-            "aten::linspace": self.linspace,
-            "aten::reciprocal": self.reciprocal,
-            "aten::repeat": self.repeat,
-            "aten::repeat_interleave": self.repeat_interleave,
-            "aten::to": self.to,
-            "aten::squeeze": self.squeeze,
-            "aten::unsqueeze": self.unsqueeze,
-            "aten::cat": self.concatenate,
-            "aten::concat": self.concatenate,
-            "aten::slice": self.slice,
-            "aten::narrow": self.narrow,
-            "aten::split": self.split,
-            "aten::tensor_split": self.tensor_split,
-            "aten::split_with_sizes": self.split_with_sizes,
-            "aten::select": self.select,
-            "aten::take": self.take,
-            "aten::where": self.where,
-            "aten::topk": self.topk,
-            "aten::relu": self.relu,
-            "aten::relu6": self.relu6,
-            "aten::prelu": self.prelu,
-            "aten::leaky_relu": self.leaky_relu,
-            "aten::elu": self.elu,
-            "aten::celu": self.celu,
-            "aten::gelu": self.gelu,
-            "aten::selu": self.selu,
-            "aten::silu": self.silu,
-            "aten::glu": self.glu,
-            "aten::log_sigmoid": self.log_sigmoid,
-            "aten::adaptive_avg_pool1d": functools.partial(
-                self.adaptive_avg_pool, _op.nn.adaptive_avg_pool1d
-            ),
-            "aten::adaptive_avg_pool2d": functools.partial(
-                self.adaptive_avg_pool, _op.nn.adaptive_avg_pool2d
-            ),
-            "aten::adaptive_avg_pool3d": functools.partial(
-                self.adaptive_avg_pool, _op.nn.adaptive_avg_pool3d
-            ),
-            "aten::adaptive_max_pool1d": functools.partial(
-                self.adaptive_max_pool, _op.nn.adaptive_max_pool1d
-            ),
-            "aten::adaptive_max_pool2d": functools.partial(
-                self.adaptive_max_pool, _op.nn.adaptive_max_pool2d
-            ),
-            "aten::adaptive_max_pool3d": functools.partial(
-                self.adaptive_max_pool, _op.nn.adaptive_max_pool3d
-            ),
-            "aten::max_pool2d": self.maxpool_2d,
-            "aten::max_pool2d_with_indices": self.maxpool_2d_with_indices,
-            "aten::max_pool1d": self.maxpool_1d,
-            "aten::max_pool3d": self.maxpool_3d,
-            "aten::hardtanh": self.hardtanh,
-            "aten::_convolution": self.convolution,
-            "aten::softmax": self.softmax,
-            "aten::threshold": self.threshold,
-            "aten::contiguous": self.contiguous,
-            "aten::batch_norm": self.batch_norm,
-            "aten::instance_norm": self.instance_norm,
-            "aten::layer_norm": self.layer_norm,
-            "aten::group_norm": self.group_norm,
-            "aten::transpose": self.transpose,
-            "aten::t": self.transpose,
-            "aten::numpy_T": self.numpy_T,
-            "aten::flatten": self.flatten,
-            "aten::unflatten": self.unflatten,
-            "aten::addmm": self.addmm,
-            "aten::size": self.size,
-            "aten::view": self.view,
-            "aten::view_as": self.view_as,
-            "aten::reshape": self.reshape,
-            "aten::reshape_as": self.reshape_as,
-            "aten::clone": self.clone,
-            "aten::log_softmax": self.log_softmax,
-            "aten::sigmoid": self.sigmoid,
-            "aten::softplus": self.softplus,
-            "aten::avg_pool1d": self.make_avg_pool(1),
-            "aten::avg_pool2d": self.make_avg_pool(2),
-            "aten::avg_pool3d": self.make_avg_pool(3),
-            "aten::linear": self.linear,
-            "aten::dropout": self.dropout,
-            "aten::feature_dropout": self.dropout,
-            "aten::alpha_dropout": self.dropout,
-            "aten::mean": self.mean,
-            "aten::chunk": self.chunk,
-            "aten::unsafe_chunk": self.chunk,
-            "aten::matmul": self.matmul,
-            "aten::bmm": self.matmul,
-            "aten::baddbmm": self.baddbmm,
-            "aten::expand": self.expand,
-            "aten::Int": self.int,
-            "prim::NumToTensor": self.numtotensor,
-            "prim::ImplicitTensorToNum": self.tensortonum,
-            "aten::ScalarImplicit": self.tensortonum,
-            "aten::pad": self.pad,
-            "aten::constant_pad_nd": self.constant_pad_nd,
-            "aten::reflection_pad1d": self.reflection_pad1d,
-            "aten::reflection_pad2d": self.reflection_pad2d,
-            "aten::replication_pad1d": self.replication_pad1d,
-            "aten::replication_pad2d": self.replication_pad2d,
-            "aten::replication_pad3d": self.replication_pad3d,
-            "aten::permute": self.transpose,
-            "aten::sum": self.make_reduce("sum"),
-            "aten::prod": self.make_reduce("prod"),
-            "aten::argmin": self.make_reduce("argmin"),
-            "aten::argmax": self.make_reduce("argmax"),
-            "aten::norm": self.norm,
-            "aten::frobenius_norm": self.frobenius_norm,
-            "aten::std": self.std,
-            "aten::var": self.variance,
-            "aten::var_mean": self.var_mean,
-            "aten::abs": self.make_unary("abs"),
-            "aten::neg": self.make_unary("negative"),
-            "aten::cos": self.make_unary("cos"),
-            "aten::cosh": self.make_unary("cosh"),
-            "aten::sin": self.make_unary("sin"),
-            "aten::sinh": self.make_unary("sinh"),
-            "aten::tan": self.make_unary("tan"),
-            "aten::tanh": self.make_unary("tanh"),
-            "aten::acos": self.make_unary("acos"),
-            "aten::asin": self.make_unary("asin"),
-            "aten::atan": self.make_unary("atan"),
-            "aten::log": self.make_unary("log"),
-            "aten::log2": self.make_unary("log2"),
-            "aten::log10": self.make_unary("log10"),
-            "aten::log1p": self.log1p,
-            "aten::exp": self.make_unary("exp"),
-            "aten::erf": self.make_unary("erf"),
-            "aten::trunc": self.make_unary("trunc"),
-            "aten::sign": self.make_unary("sign"),
-            "aten::sqrt": self.make_unary("sqrt"),
-            "aten::rsqrt": self.make_unary("rsqrt"),
-            "aten::square": self.square,
-            "aten::tril": functools.partial(self.trilu, mode="tril"),
-            "aten::triu": functools.partial(self.trilu, mode="triu"),
-            "aten::ceil": self.make_unary("ceil"),
-            "aten::floor": self.make_unary("floor"),
-            "aten::round": self.make_unary("round"),
-            "aten::isfinite": self.make_unary("isfinite"),
-            "aten::isinf": self.make_unary("isinf"),
-            "aten::isnan": self.make_unary("isnan"),
-            "aten::clamp": self.clamp,
-            "aten::clamp_min": self.clamp_min,
-            "aten::clamp_max": self.clamp_max,
-            "aten::detach": self.identity,
-            "aten::upsample_bilinear2d": self.make_upsample("linear"),
-            "aten::upsample_bicubic2d": self.make_upsample("cubic"),
-            "aten::upsample_nearest2d": self.make_upsample("nearest_neighbor"),
-            "aten::upsample_trilinear3d": self.make_upsample3d("linear"),
-            "aten::upsample_nearest3d": self.make_upsample3d("nearest_neighbor"),
-            "aten::expand_as": self.expand_as,
-            "aten::broadcast_tensors": self.broadcast_tensors,
-            "aten::broadcast_to": self.broadcast_to,
-            "aten::lt": self.make_elemwise("less"),
-            "aten::gt": self.make_elemwise("greater"),
-            "aten::le": self.make_elemwise("less_equal"),
-            "aten::ge": self.make_elemwise("greater_equal"),
-            "aten::ne": self.make_elemwise("not_equal"),
-            "aten::eq": self.make_elemwise("equal"),
-            "aten::logical_not": self.logical_not,
-            "aten::logical_xor": self.logical_xor,
-            "aten::bitwise_not": self.bitwise_not,
-            "aten::bitwise_xor": self.bitwise_xor,
-            "aten::bitwise_and": self.bitwise_and,
-            "aten::Bool": self.Bool,
-            "aten::Float": self.Float,
-            "aten::rsub": self.rsub,
-            "aten::embedding": self.embedding,
-            "aten::one_hot": self.one_hot,
-            "aten::mm": self.matmul,
-            "aten::add": self.add,
-            "aten::stack": self.stack,
-            "aten::__getitem__": self.list_getitem,
-            "aten::len": self.list_len,
-            "aten::type_as": self.type_as,
-            "aten::gather": self.gather,
-            "aten::index_select": self.select,
-            "aten::index": self.index,
-            "torchvision::nms": self.nms,
-            "aten::logsumexp": self.logsumexp,
-            "torchvision::roi_align": self.roi_align,
-            "torchvision::deform_conv2d": self.deform_conv2d,
-            "aten::unbind": self.unbind,
-            "aten::__and__": self.logical_and,
-            "aten::logical_and": self.logical_and,
-            "aten::logical_or": self.logical_or,
-            "aten::_shape_as_tensor": self.shape_as_tensor,
-            "aten::nonzero": self.nonzero,
-            "aten::nonzero_numpy": self.nonzero_numpy,
-            "aten::scatter": self.scatter,
-            "aten::scatter_add": self.scatter_add,
-            "aten::scatter_reduce": self.scatter_reduce,
-            "aten::index_put": self.index_put,
-            "aten::scalar_tensor": self.scalar_tensor,
-            "aten::__interpolate": self.interpolate,
-            "aten::IntImplicit": self.identity,
-            "aten::tensor": self.identity,  # used for example in tensor(1.0)
-            "aten::numel": self.numel,
-            "aten::empty": self.empty,
-            "aten::empty_like": self.empty_like,
-            "aten::new_empty": self.new_empty,
-            "aten::randn": self.randn,
-            "aten::bincount": self.bincount,
-            "aten::__not__": self.logical_not,
-            "aten::hardswish": self.hard_swish,
-            "aten::hardsigmoid": self.hard_sigmoid,
-            "aten::cumsum": self.cumsum,
-            "aten::masked_fill": self.masked_fill,
-            "aten::masked_select": self.masked_select,
-            "aten::argsort": self.argsort,
-            "aten::sort": self.sort,
-            "aten::_unique2": self.unique,
-            "aten::nll_loss": self.nll_loss,
-            "aten::nll_loss2d": self.nll_loss,
-            "aten::nll_loss_nd": self.nll_loss,
-            "aten::cross_entropy_loss": self.cross_entropy_loss_with_logits,
-            "aten::l1_loss": self.l1_loss,
-            "aten::mse_loss": self.mse_loss,
-            "aten::flip": self.flip,
-            "aten::rnn_tanh": functools.partial(self.rnn, nonlinearity="tanh"),
-            "aten::rnn_relu": functools.partial(self.rnn, nonlinearity="relu"),
-            "aten::gru": self.gru,
-            "aten::lstm": self.lstm,
-            "aten::all": functools.partial(self.all_any_common, _op.all),
-            "aten::any": functools.partial(self.all_any_common, _op.any),
-            "aten::searchsorted": self.searchsorted,
-            "aten::bucketize": self.bucketize,
-            "aten::roll": self.roll,
-            "aten::einsum": self.einsum,
-            "aten::dot": self.dot,
-            "aten::mv": self.mv,
-            "aten::grid_sampler": self.grid_sampler,
-            "aten::__ior__": self.make_elemwise("bitwise_or"),
-            "aten::__iand__": self.make_elemwise("bitwise_and"),
-            "aten::__ixor__": self.make_elemwise("bitwise_xor"),
-            "aten::__lshift__": self.make_elemwise("left_shift"),
-            "aten::__rshift__": self.make_elemwise("right_shift"),
-            "aten::multinomial": self.multinomial,
-            "aten::_weight_norm": self.weight_norm,
-            "aten::copy_": self.inplace_copy,
-            "aten::swapaxes": self.transpose,
-            "aten::linalg_vector_norm": self.linalg_vector_norm,
-            "aten::scaled_dot_product_attention": self.scaled_dot_product_attention,
-            "aten::tile": self.tile,
-        }
-
-    def update_convert_map(self, custom_map):
-        self.convert_map.update(custom_map)
-
-    def report_missing_conversion(self, op_names):
-        """Check if all ops in an input graph are supported by TVM"""
-        known_ops = [
-            "prim::Constant",
-            "prim::GetAttr",
-            "prim::ListConstruct",
-            "prim::ListUnpack",
-            "prim::TupleConstruct",
-            "prim::TupleUnpack",
-            "prim::RaiseException",
-            "prim::If",
-            "prim::Loop",
-        ]
-        known_ops += list(self.convert_map.keys())
-        known_ops += list(qnn_torch.convert_map.keys())
-
-        missing = []
-
-        for op_name in op_names:
-            # Also take care of in-place variant ops like aten::relu_
-            if op_name not in known_ops and not (
-                op_name.endswith("_") and op_name[:-1] in known_ops
-            ):
-                missing.append(op_name)
-
-        if missing:
-            msg = f"The following operators are not implemented: {missing}"
-            raise NotImplementedError(msg)
-
-    def convert_block(self, block, outputs):
-        """Translate Torch "Block", used for prim::If and prim::Loop"""
-        ops = _get_operator_nodes(
-            block.nodes(),
-            self.source_map,
-            self.op_type_dict,
-            self.use_parser_friendly_name,
-            self.preserve_pytorch_scopes,
-        )
-        ret_names = _get_input_names(block.returnNode())
-        return self.convert_operators(ops, outputs, ret_names)
-
-    def convert_if(self, if_node, outputs):
-        """Translate Torch prim::If to Relay If"""
-        cond = outputs[if_node.inputsAt(0).debugName()]
-        blocks = list(if_node.blocks())
-        true_branch = self.convert_block(blocks[0], outputs)
-        false_branch = self.convert_block(blocks[1], outputs)
-        assert len(true_branch) == 1 and len(false_branch) == 1
-        return _expr.If(cond, true_branch[0], false_branch[0])
-
-    def convert_loop(self, loop_node, outputs):
-        """Translate Torch prim::Loop to Relay while_loop"""
-
-        def get_input(index):
-            ivalue = loop_node.inputsAt(index)
-            inode = ivalue.node()
-            if inode.kind() == "prim::Constant":
-                return _expr.const(_get_constant(inode))
-            var_name = ivalue.debugName()
-            assert var_name in outputs
-            return _wrap_const(outputs[var_name])
-
-        # Refer to the spec for prim::Loop below
-        # https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/OVERVIEW.md#loops
-        # The first input: %max_trip_count
-        # The second input: %initial_condition
-        # The rest of input: loop variables
-        max_loop_count = get_input(0)
-        init_cond = get_input(1)
-        num_loop_var = len(list(loop_node.inputs())) - 2
-        init_vals = [get_input(i + 2) for i in range(num_loop_var)]
-
-        # while loop has always max_loop_count being int64 max
-        # max_loop_count.data (tvm.runtime.NDArray) is -1, so _get_constant again
-        is_while_loop = (
-            isinstance(max_loop_count, _expr.Constant)
-            and _get_constant(loop_node.inputsAt(0).node()) == sys.maxsize
-        )
-
-        if is_while_loop:
-            loop_iter_dtype = "bool"
-            # while loop with non input dependent condition such as while i < 10:
-            # init_cond is int, need to cast to bool to type check
-            if isinstance(init_cond, _expr.Constant):
-                init_cond = _op.cast(init_cond, "bool")
-            init_loop_iter_val = init_cond
-        else:
-            loop_iter_dtype = "int32"
-            # always count from 0
-            init_loop_iter_val = _expr.const(0, dtype="int32")
-
-        body_block = list(loop_node.blocks())[0]
-        block_input_names = _get_input_names(body_block)
-        num_block_inputs = len(block_input_names)
-        name_val_pairs = list(zip(block_input_names, [init_loop_iter_val] + init_vals))
-        outputs.update(name_val_pairs)
-
-        def get_var(name, val):
-            if val:
-                checked_type = self.infer_type_with_prelude(val)
-                if hasattr(checked_type, "shape"):
-                    shape = get_const_tuple(checked_type.shape)
-                    actual_shape = []
-                    for dim in shape:
-                        if isinstance(dim, int) and dim == 0:
-                            actual_shape.append(Any())
-                        else:
-                            actual_shape.append(dim)
-                    expr = _expr.var(name, shape=actual_shape, dtype=checked_type.dtype)
-                else:
-                    expr = _expr.var(name, type_annotation=checked_type)
-                return set_span(expr, val.span) if val.span else expr
-            return _expr.var(name)
-
-        source_name = self.source_map[loop_node]
-        loop_iter_var = set_span(
-            _expr.var(block_input_names[0], shape=(), dtype=loop_iter_dtype), span=source_name
-        )
-        loop_vars = set_span(
-            [get_var(name, val) for name, val in name_val_pairs[1:]], span=source_name
-        )
-
-        # Add non constant free variables to loop variables to prevent code blow up
-        # Without this, if there are two for loops in a row, which often happens
-        # if the outer loop is unrolled, the computation corresponding to the first for loop
-        # is inlined inside loop body, turning O(N) + O(N) computation into O(N^2).
-        # This issue was found when converting from Stacked LSTM test. Torch does not add the
-        # outputof the eariler loop into loop variables of the next loop.
-        # So the variable corresponding to the first loop output appears free in the second
-        # loop body.
-        free_vars = [
-            var
-            for var in _get_free_vars_from_block(body_block)
-            if var in outputs
-            and not isinstance(outputs[var], (_expr.Constant, int, float, str))
-            and outputs[var]
-        ]
-
-        prev_outputs = {}
-        for name in free_vars:
-            prev_output = outputs[name]
-            new_loop_var = get_var(name, prev_output)
-            prev_outputs[name] = prev_output
-            outputs[name] = set_span(new_loop_var, source_name)
-            loop_vars.append(new_loop_var)
-            init_vals.append(prev_output)
-
-        def cond(*current_vals):
-            i = current_vals[0]
-
-            if is_while_loop:
-                return _op.equal(i, _expr.const(True, "bool"))
-
-            return _op.less(i, max_loop_count)
-
-        def body(*current_vals):
-            # Update loop variables using the prev iteration outputs
-            assert len(current_vals) == num_block_inputs + len(free_vars)
-
-            for (i, val) in enumerate(current_vals):
-                if i < num_block_inputs:
-                    outputs[block_input_names[i]] = val
-                else:
-                    outputs[free_vars[i - num_block_inputs]] = val
-
-            block_outputs = self.convert_block(body_block, outputs)
-            block_outputs += [outputs[name] for name in free_vars]
-
-            if not is_while_loop:
-                # iter var increment implicit in torch, so do it manually
-                # for while loop, block_outputs[0] is already a boolean,
-                # the result of termination check
-                incr = _expr.const(1, dtype="int32")
-                block_outputs[0] = current_vals[0] + incr
-
-            return block_outputs
-
-        loop = while_loop(cond, [loop_iter_var] + loop_vars, body)
-        loop_val = loop(init_loop_iter_val, *init_vals)
-
-        # restore original output values for free vars
-        outputs.update(prev_outputs)
-
-        # The first element is a loop counter or boolean condition, ignore it
-        return [_expr.TupleGetItem(loop_val, i + 1) for i in range(num_loop_var)]
-
-    def convert_operators(self, operators, outputs, ret_names):
-        """Convert each Torch IR operators to Relay equivalent"""
-        for node_name, op_node in operators:
-            operator = op_node.kind()
-            inputs = _get_op_inputs(op_node, outputs)
-            # we need to record what current operator is to provide correct source name
-            # for operators needed to be taken care with (e.g. nms / arange ...)
-            self.current_op.append(op_node)
-
-            if operator == "prim::Constant":
-                outputs[node_name] = _get_constant(op_node)
-            elif operator == "prim::ListConstruct" and _should_construct_dynamic_list(op_node):
-                outputs[node_name] = set_span(
-                    self.convert_to_list_adt(inputs), self.source_map[op_node]
-                )
-            elif operator == "prim::ListConstruct":
-                # This assumes that no more elements will be appended to this list
-                # In this case, we keep the Python list
-                outputs[node_name] = inputs
-            elif operator == "prim::TupleConstruct":
-
-                def _handel_nested_input(inputs):
-                    inputs_list = []
-                    for i, _ in enumerate(inputs):
-                        if isinstance(inputs[i], list):
-                            inputs_list.append(_handel_nested_input(inputs[i]))
-                        else:
-                            assert isinstance(inputs[i], _expr.Expr)
-                            inputs_list.append(inputs[i])
-                    return _expr.Tuple(inputs_list)
-
-                outputs[node_name] = set_span(
-                    _handel_nested_input(inputs), self.source_map[op_node]
-                )
-            elif operator in ["prim::ListUnpack", "prim::TupleUnpack"]:
-                assert len(inputs) == 1
-                if isinstance(inputs[0], (list, _expr.TupleWrapper)):
-                    unpacked = inputs[0]
-                else:
-                    unpacked = _unpack_tuple(inputs[0])
-                outputs.update(
-                    zip(_get_output_names(op_node), set_span(unpacked, self.source_map[op_node]))
-                )
-            elif operator == "prim::prim::RaiseException":
-                logger.warning("raising exceptions is ignored")
-                outputs[node_name] = None
-            elif operator == "prim::If":
-                if_out = self.convert_if(op_node, outputs)
-                outputs[node_name] = set_span(if_out, self.source_map[op_node])
-            elif operator == "prim::Loop":
-                loop_out = self.convert_loop(op_node, outputs)
-                unpacked_names = _get_output_names(op_node)
-                assert len(loop_out) == len(unpacked_names)
-                outputs.update(zip(unpacked_names, set_span(loop_out, self.source_map[op_node])))
-            else:
-                if operator not in self.convert_map:
-                    # At this point, the only possible ops that are not in convert_map are
-                    # in-place variant of ops like aten::relu_
-                    assert operator.endswith("_")
-                    logger.warning(
-                        "An in-place op %s found, the result will not be correct "
-                        "if the model depends on side-effects by this op.",
-                        operator,
-                    )
-                    relay_op = self.convert_map[operator[:-1]]
-                else:
-                    relay_op = self.convert_map[operator]
-
-                self._set_parameter_source_name(op_node, outputs)
-                relay_out = relay_op(
-                    # since the elements in "outputs" may change due to span-filling process
-                    # we have to call "_get_op_inputs" again rather than use "inputs" directly
-                    _get_op_inputs(op_node, outputs),
-                    _get_input_types(op_node, outputs, default_dtype=self.default_dtype),
-                )
-                relay_out = set_span(relay_out, self.source_map[op_node])
-                self.record_output_type(relay_out)
-
-                if isinstance(relay_out, tuple):
-                    # This is for torch operators that return multiple outputs
-                    # See _adaptive_max_2d above for example
-                    out_names = _get_output_names(op_node)
-                    outputs.update(zip(out_names, relay_out))
-                else:
-                    assert op_node.outputsSize() == 1
-                    outputs[node_name] = relay_out
-
-            self.current_op.pop()
-
-        # TODO(@haoyang9804): outputs[ret_name] could be None and cause some issue
-        # revealed by https://github.com/apache/tvm/issues/15004
-        # Now only adaptive_max_pool1d is considered. Maybe other ops could also
-        # trigger this problem.
-        return [
-            _wrap_const(outputs[ret_name])
-            for ret_name in ret_names
-            if ret_name != "aten::adaptive_max_pool1d_0_1"
-        ]
-
-    def _set_parameter_source_name(self, op_node, outputs):
-        """A helper function to rewrite source_name of parameter."""
-        for name in _get_input_names(op_node):
-            expr = outputs[name]
-            if isinstance(expr, (_expr.Var, _expr.Constant)):
-                name_sep = "_" if self.use_parser_friendly_name else "."
-                source_name = [self.source_map[op_node]]
-                if isinstance(expr, _expr.Var):
-                    # variable name should have contained node source name
-                    # for op with attributes in convert_params stage
-                    # e.g. "aten::batch_norm_5.running_mean"
-                    if expr.name_hint.startswith(source_name[0]):
-                        source_name[0] = expr.name_hint
-                    else:
-                        source_name.append(expr.name_hint)
-                new_expr = set_span(expr, name_sep.join(source_name))
-                outputs[name] = new_expr
-
-
-def _pytorch_result_type(dtypes, non_tensor_inputs):
-    """This promotes TVM dtypes like PyTorch would"""
-    import torch
-
-    dtype_map = {
-        "float64": torch.float64,
-        "float32": torch.float32,
-        "float16": torch.float16,
-        "bfloat16": torch.bfloat16,
-        "int64": torch.int64,
-        "int32": torch.int32,
-        "int16": torch.int16,
-        "int8": torch.int8,
-        "uint8": torch.uint8,
-        "bool": torch.bool,
-    }
-    if len(dtypes) > 0:
-        result_type = dtypes[0]
-        for dt in dtypes[1:]:
-            if dt != result_type:  # we don't want to work with same types as we
-                # don't do quantized here (which cannot be promoted?)
-                result_type = _convert_data_type(
-                    str(
-                        torch.result_type(
-                            torch.zeros((), dtype=dtype_map[result_type]),
-                            torch.zeros((), dtype=dtype_map[dt]),
-                        )
-                    )
-                )
-    else:
-        result_type = "bool"  # this is the smallest type...
-    for inp in non_tensor_inputs:
-        result_type = _convert_data_type(
-            str(torch.result_type(torch.zeros((), dtype=dtype_map[result_type]), inp))
-        )
-    return result_type
-
-
-# Helper functions for operator implementation
-def _convert_dtype_value(val):
-    """converts a PyTorch the PyTorch numeric type id to a torch scalar type."""
-    convert_torch_dtype_map = {
-        11: "torch.bool",
-        7: "torch.float64",
-        6: "torch.float32",
-        5: "torch.float16",
-        4: "torch.int64",
-        3: "torch.int32",
-        2: "torch.int16",
-        1: "torch.int8",
-        0: "torch.uint8",
-        None: "torch.int64",
-    }  # Default is torch.int64
-    if val in convert_torch_dtype_map:
-        return _convert_data_type(convert_torch_dtype_map[val])
-    else:
-        msg = f"Torch data type value {val} is not handled yet."
-        raise NotImplementedError(msg)
-
-
-def _convert_data_type(input_type, default_dtype=None):
-    """converts the PyTorch scalar type input_type to a TVM dtype.
-    optionally, default_dtype can be a TVM dtype that is used
-    if input_type is None (but not when it is unknown)"""
-    if input_type is None and default_dtype is not None:
-        return default_dtype
-
-    input_type = input_type.lower()
-    if input_type in ["double", "float64", "torch.float64"]:
-        return "float64"
-    elif input_type in ["float", "float32", "torch.float32"]:
-        return "float32"
-    elif input_type in ["half", "float16", "torch.float16"]:
-        return "float16"
-    elif input_type in ["long", "int64", "torch.int64"]:
-        return "int64"
-    elif input_type in ["int", "int32", "torch.int32"]:
-        return "int32"
-    elif input_type in ["short", "int16", "torch.int16"]:
-        return "int16"
-    elif input_type in ["char", "int8", "torch.int8"]:
-        return "int8"
-    elif input_type in ["byte", "uint8", "torch.uint8"]:
-        return "uint8"
-    elif input_type in ["quint8", "torch.quint8"]:
-        return "quint8"
-    elif input_type in ["qint8", "torch.qint8"]:
-        return "qint8"
-    elif input_type in ["qint32", "torch.qint32"]:
-        return "qint32"
-    elif input_type in ["bool", "torch.bool"]:
-        return "bool"
-    elif input_type in ["str"]:
-        return "str"
-    else:
-        raise NotImplementedError(f"input_type {input_type} is not handled yet")
-    return "float32"  # Never reached
-
-
-def _create_typed_const(data, dtype):
-    """create a (scalar) constant of given value and dtype.
-    dtype should be a TVM dtype"""
-
-    if dtype == "float64":
-        typed_data = _expr.const(np.asarray(data, dtype="float64"), dtype=dtype)
-    elif dtype == "float32":
-        typed_data = _expr.const(np.float32(data), dtype=dtype)
-    elif dtype == "float16":
-        typed_data = _expr.const(np.float16(data), dtype=dtype)
-    elif dtype == "int64":
-        typed_data = _expr.const(np.int64(data), dtype=dtype)
-    elif dtype == "int32":
-        typed_data = _expr.const(np.int32(data), dtype=dtype)
-    elif dtype == "int16":
-        typed_data = _expr.const(np.int16(data), dtype=dtype)
-    elif dtype == "int8":
-        typed_data = _expr.const(np.int8(data), dtype=dtype)
-    elif dtype == "uint8":
-        typed_data = _expr.const(np.uint8(data), dtype=dtype)
-    else:
-        raise NotImplementedError(f"input_type {dtype} is not handled yet")
-    return typed_data
-
-
-def _wrap_const(c):
-    if not isinstance(c, (_expr.Expr, list, tvm.tir.expr.Any)):
-        return _expr.const(c)
-    return c
-
-
-def _run_jit_passes(graph, enable_lower_all_tuples=True):
-    """The inline pass is necessary to unwrap prim::CallMethod"""
-    # pylint: disable=c-extension-no-member
-    import torch
-
-    if is_version_greater_than("1.5.1"):
-        # This is required for torchvision detection models from 1.6 above
-        # It is the same as _jit_pass_inline, except that it has some special
-        # case behaviors for some ops such as aten::__interpolate()
-        torch._C._jit_pass_onnx_function_substitution(graph)
-    else:
-        torch._C._jit_pass_inline(graph)
-
-    if enable_lower_all_tuples:
-        torch._C._jit_pass_lower_all_tuples(graph)
-
-
-def _redirect_inplace_output(graph):
-    """
-    This pass redirects the output node of the in-place op i.e. aten::copy_.
-    Before:
-      %1: ...
-      %2: ...
-      %3: Float(requires_grad=0, device=cpu) = aten::copy_(%input, %1, %2)
-      return (%input)
-    After:
-      %1: ...
-      %2: ...
-      %3: Float(requires_grad=0, device=cpu) = aten::copy_(%input, %1, %2)
-      return (%3)
-    """
-    for node in graph.nodes():
-        if node.kind() == "aten::copy_":
-            node_inputs = list(node.inputs())
-            src_node = node_inputs[0].node()
-            slice_and_select_nodes = []
-            while True:
-                if src_node.kind() in ["aten::slice", "aten::select", "aten::unsqueeze"]:
-                    src_node = list(src_node.inputs())[0].node()
-                    slice_and_select_nodes.append(src_node)
-                else:
-                    break
-            if src_node.kind() == "prim::Param":
-                # First one is "self"
-                src_value = list(src_node.outputs())[1]
-            else:
-                src_value = src_node.output()
-            src_value.replaceAllUsesAfterNodeWith(node, node.output())
-
-
-def _get_tensor_and_var(torch_tensor, name):
-    tensor = tvm.nd.array(torch_tensor.cpu().numpy())
-    var = _expr.var(name, shape=tensor.shape, dtype=tensor.dtype)
-    return tensor, var
-
-
-def _get_output_name(node):
-    assert node.outputsSize() == 1
-    return node.output().debugName()
-
-
-def _get_output_names(node):
-    return [output.debugName() for output in node.outputs()]
-
-
-def _get_input_names(node_or_graph):
-    return [inp.debugName() for inp in node_or_graph.inputs()]
-
-
-def _get_op_inputs(op_node, outputs):
-    return [outputs[name] for name in _get_input_names(op_node)]
-
-
-def _get_node_type(node):
-    assert node.outputsSize() == 1
-    return node.output().type().kind()
-
-
-def _get_uses(node):
-    uses = []
-    for output in node.outputs():
-        uses += output.uses()
-    return uses
-
-
-def _get_users(node):
-    return [use.user for use in _get_uses(node)]
-
-
-def _getattr_full_name(getattrs, sep="."):
-    return sep.join([getattr_attr_name(node) for node in getattrs])
-
-
-def _get_pytorch_value_type(typ, default_dtype="float32"):
-    kind = typ.kind()
-    if kind == "TensorType":
-        if typ.scalarType() is None:
-            # Tensor's type can be unknown if we use torch.jit.script(...)
-            # Defaults can be passed in, if not it is float32
-            logger.warning("Untyped Tensor found, assume it is %s", default_dtype)
-            return default_dtype
-        else:
-            return _convert_data_type(typ.scalarType())
-
-    elif kind == "ListType":
-        return "ListType"
-    elif kind in ["IntType", "FloatType", "BoolType", "StringType", "OptionalType"]:
-        pt_dtype = str(typ).lower()
-        dtype = pt_dtype if kind == "OptionalType" else _convert_data_type(pt_dtype)
-        return dtype
-    else:
-        return "UnsupportedType"
-
-
-def _get_input_types(op_node, outputs, default_dtype="float32"):
-    """Returns a TVM dtype for each input nodes derived from the torch type"""
-    in_types = []
-    for inp in op_node.inputs():
-        if inp.node().kind() == "prim::GetAttr":
-            # GetAttr nodes always return None when we call scalarType() on it
-            name = inp.debugName()
-            assert name in outputs
-            if isinstance(outputs[name], _expr.Var):
-                in_types.append(outputs[name].type_annotation.dtype)
-            else:
-                # For quantized modules with parameters, here we would get
-                # "prim::GetAttr[name="_packed_params"]". Since the dtype corresponding to
-                # _packed_params is not needed by quantized ops, we return an arbitrary type.
-                in_types.append(default_dtype)
-        else:
-            in_types.append(_get_pytorch_value_type(inp.type(), default_dtype=default_dtype))
-
-    return in_types
-
-
-def _get_constant(node):
-    """Retrieve a constant associated with this prim::Constant node"""
-    attribute_names = node.attributeNames()
-    num_attributes = len(attribute_names)
-
-    if num_attributes == 1:
-        attr_name = attribute_names[0]
-        ty = node.output().type().kind()
-
-        if ty == "IntType":
-            return node.i(attr_name)
-        elif ty == "BoolType":
-            return bool(node.i(attr_name))
-        elif ty in ["FloatType", "LongType"]:
-            return node.f(attr_name)
-        elif ty in ["TensorType", "CompleteTensorType"]:
-            tensor = node.t(attr_name)
-            if tensor.is_cuda:
-                tensor = tensor.cpu()
-            if len(tensor.shape) == 0:  # tensor(0.1)
-                # TODO(t-vi): When is this needed?
-                return tensor.item()
-            return _wrap_const(tensor.numpy())
-        elif ty in ["DeviceObjType", "StringType"]:
-            return node.s(attr_name)
-        elif ty == "FunctionType":
-            return None
-        else:
-            raise NotImplementedError(f"Unsupported type: {ty}")
-    else:
-        assert num_attributes == 0
-        return None
-
-
-class NodeNamer(ABC):
-    """Name each node and output edge in the relay graph"""
-
-    def __init__(self, op_counter_dict: Dict[str, int]):
-        self.op_counter_dict = op_counter_dict
-
-    def increment_counter(self, identifier: str) -> int:
-        op_idx = 0
-        if identifier in self.op_counter_dict:
-            op_idx = self.op_counter_dict[identifier] + 1
-        self.op_counter_dict[identifier] = op_idx
-        return op_idx
-
-    def get_node_source_name(self, node) -> str:
-        raise NotImplementedError()
-
-    def get_node_output_name(self, node_src_name: str, index: int) -> str:
-        raise NotImplementedError()
-
-
-class DefaultNodeKindNamer(NodeNamer):
-    """
-    Namer that uses a default naming based on the "type"/kind of node
-    # e.g. node.kind(): aten::adaptive_max_pool2d
-    # node_src_name -> aten::adaptive_max_pool2d_x
-    # output_1 -> aten::adaptive_max_pool2d_x_0
-    # output_2 -> aten::adaptive_max_pool2d_x_1
-    """
-
-    def get_node_source_name(self, node) -> str:
-        op_idx = self.increment_counter(node.kind())
-        return "_".join([node.kind(), str(op_idx)])
-
-    def get_node_output_name(self, node_src_name: str, index: int) -> str:
-        return "_".join([node_src_name, str(index)])
-
-
-class PytorchScopePreservingNamer(NodeNamer):
-    """
-    Namer that uses the Pytorch scope to name nodes.
-    eg. node could be called "bert.encoder.layer.11.output.dense"
-    """
-
-    def get_node_source_name(self, node) -> str:
-        # This works per the scope naming in Pytorch 2.0 and beyond.
-        scope_name_parts = node.scopeName().split("/")
-        imp_parts = [part.split("::")[-1] for part in scope_name_parts]
-        node_src_name = ".".join([part for part in imp_parts if part])
-        return node_src_name
-
-    def get_node_output_name(self, node_src_name: str, index: int) -> str:
-        op_idx = self.increment_counter(node_src_name)
-        return "_".join([node_src_name, str(op_idx), str(index)])
-
-
-def _rename_outputs(
-    node, source_map, op_type_dict, use_parser_friendly_name, preserve_pytorch_scopes
-):
-    """Rewrite debug name of node outputs with its operator type"""
-    namer = (
-        PytorchScopePreservingNamer(op_type_dict)
-        if preserve_pytorch_scopes
-        else DefaultNodeKindNamer(op_type_dict)
-    )
-    # get source name of operator and rename all of its outputs
-    if node.kind() != "prim::GetAttr":
-        node_src_name = namer.get_node_source_name(node)
-        for index, output in enumerate(node.outputs()):
-            name = namer.get_node_output_name(node_src_name, index)
-            output.setDebugName(name)
-        # update source map
-        # if use_parser_friendly_name is True: e.g. prim::Constant_0 -> prim__Constant_0
-        if use_parser_friendly_name:
-            node_src_name = re.sub(r":|\.", "_", node_src_name)
-        source_map[node] = node_src_name
-
-
-def _debug_rename(graph, use_parser_friendly_name, preserve_pytorch_scopes):
-    """Returns map between node and source name"""
-    source_map, op_type_dict = {}, {}
-    prim_with_blocks = ["prim::If", "prim::Loop"]
-
-    def _traverse_graph(nodes):
-        for node in nodes:
-            if node.outputsSize() == 0:
-                continue
-            if node.kind() in prim_with_blocks:
-                for block in node.blocks():
-                    _traverse_graph(block.nodes())
-            _rename_outputs(
-                node, source_map, op_type_dict, use_parser_friendly_name, preserve_pytorch_scopes
-            )
-
-    _traverse_graph(graph.nodes())
-    return source_map
-
-
-def _get_operator_nodes(
-    nodes,
-    source_map=None,
-    op_type_dict=None,
-    use_parser_friendly_name=False,
-    preserve_pytorch_scopes=False,
-):
-    """Returns torch IR nodes that need conversion to Relay"""
-    ops, should_rename_graph = [], all([source_map, op_type_dict]) is not None
-
-    # Traverse nodes and add to graph
-    for node in nodes:
-        if node.outputsSize() == 0:
-            continue
-
-        if should_rename_graph:
-            _rename_outputs(
-                node, source_map, op_type_dict, use_parser_friendly_name, preserve_pytorch_scopes
-            )
-
-        if node.outputsSize() > 1:
-            node_name = "_".join(_get_output_names(node))
-        else:
-            node_name = _get_output_name(node)
-
-        if node.kind() != "prim::GetAttr":
-            ops.append((node_name, node))
-
-    return ops
-
-
-def _get_relay_input_vars(graph, input_infos, prelude, is_module=True, default_dtype="float32"):
-    """
-    Return Relay vars from input shapes and create entries based on
-    expected graph inputs - to allow translation
-    """
-
-    graph_inputs = list(graph.inputs())
-    if is_module:
-        # a module has "self" as first input, which we do not need/want
-        graph_inputs = graph_inputs[1:]
-
-    if not isinstance(input_infos, list):
-        msg = "Graph inputs input_infos should be a list"
-        raise RuntimeError(msg)
-
-    if len(graph_inputs) != len(input_infos):
-        msg = f"PyTorch has {len(graph_inputs)} inputs and input_infos lists {len(input_infos)}."
-        raise RuntimeError(msg)
-
-    def get_relay_ty(ishape, itype, pt_type):
-        if pt_type.kind() == "TensorType":
-            if not (_is_int_seq(ishape) or len(ishape) == 0):
-                msg = "Shape for Tensors must be lists of ints"
-                raise RuntimeError(msg)
-            if (pt_type.dim() is not None and pt_type.dim() != len(ishape)) or (
-                pt_type.sizes() is not None
-                and any([s1 != s2 for s1, s2 in zip(pt_type.sizes(), ishape)])
-            ):
-                msg = "Shapes of input list and information in the graph do not match"
-                raise RuntimeError(msg)
-            if len(ishape) > 1 and any(dim <= 0 for dim in ishape[1:]):
-                msg = (
-                    "Expected input's non-batch dimensions to have positive length, "
-                    f"but input has a shape of {pt_type.sizes()}"
-                )
-                raise RuntimeError(msg)
-            pt_dtype = pt_type.scalarType()
-            if not pt_dtype and itype:
-                pt_dtype = itype
-            dtype = _convert_data_type(pt_dtype, default_dtype=default_dtype)
-            return TensorType(ishape, dtype)
-        elif pt_type.kind() == "TupleType":
-            if not isinstance(ishape, tuple):
-                msg = "Shapes for tuples must be tuples"
-                raise RuntimeError(msg)
-            return TupleType(
-                [get_relay_ty(elem, itype, pt_t) for elem, pt_t in zip(ishape, pt_type.elements())]
-            )
-        elif pt_type.kind() == "ListType":
-            if not isinstance(ishape, list):
-                msg = "Shapes for lists must be lists"
-                raise RuntimeError(msg)
-            pt_elemtype = pt_type.getElementType()
-            elem_tys = [get_relay_ty(s, itype, pt_elemtype) for s in ishape]
-            if len(elem_tys) > 0 and not all(map(lambda ty: ty == elem_tys[0], elem_tys)):
-                msg = "List elements need have identical types"
-                raise RuntimeError(msg)
-            rlist, _, _ = prelude.mod.get_type("List")
-            return rlist(elem_tys[0])
-        elif pt_type.kind() == "OptionalType":
-            # we do not support None yet, so we fill in the type
-            return get_relay_ty(ishape, itype, pt_type.getElementType())
-        # TODO: scalar inputs
-        raise NotImplementedError("unsupported input type")
-
-    input_vars = {}
-
-    new_input_infos = []
-    for num, inp in enumerate(input_infos):
-        if not isinstance(inp, tuple):
-            msg = f"Graph input {num} is not a tuple"
-            raise RuntimeError(msg)
-        if len(inp) != 2 or not isinstance(inp[0], str):
-            msg = (
-                f"Graph input {inp} is not valid,"
-                f" expected ('name', shape) or ('name', (shape, dtype))"
-            )
-            raise RuntimeError(msg)
-        if not isinstance(inp[1], tuple) or len(inp[1]) == 0 or not isinstance(inp[1][-1], str):
-            new_input_infos.append((inp[0], (inp[1], default_dtype)))
-        else:
-            new_input_infos.append(inp)
-
-    input_types = [
-        (name, get_relay_ty(info[0], info[1], gi.type()))
-        for (name, info), gi in zip(new_input_infos, graph_inputs)
-    ]
-
-    ir_inputs = [i.debugName() for i in graph_inputs]
-    for ir_input, (name, itype) in zip(ir_inputs, input_types):
-        inp = _expr.var(name, type_annotation=itype)
-        # Translate from graph input to user input name
-        input_vars[ir_input] = inp
-
-    return input_vars
-
-
-def _unpack_tuple(tup):
-    def unpack(tup, num_fields):
-        return [_expr.TupleGetItem(tup, i) for i in range(num_fields)]
-
-    if isinstance(tup, _expr.Tuple):
-        return unpack(tup, len(tup.fields))
-    elif isinstance(tup.type_annotation, TupleType):
-        return unpack(tup, len(tup.type_annotation.fields))
-    # shouldn't happen
-    assert False
-
-
-def _get_free_vars_from_block(block):
-    block_inp_names = _get_input_names(block)
-    bound_names = block_inp_names
-    free_vars = set()
-
-    for node in block.nodes():
-        inp_names = _get_input_names(node)
-        list_diff = [name for name in inp_names if name not in bound_names]
-        free_vars.update(list_diff)
-        bound_names += _get_output_names(node)
-
-    return free_vars
-
-
-def get_use_chains(root_node, terminate=lambda _: False):
-    """
-    Track a chain of users of this node forward, returning a list of chains
-    See get_attr_chains below for its usage
-    """
-
-    def concat_lists(lists):
-        return itertools.chain.from_iterable(lists)
-
-    def inner(current, accum):
-        users = _get_users(current)
-
-        if not users or terminate(users):
-            return [accum]
-
-        return concat_lists([inner(nxt, accum + [nxt]) for nxt in users])
-
-    return inner(root_node, [root_node])
-
-
-def get_attr_chains(root_getattr_node):
-    """Returns chains of attribute access starting from root_getattr_node
-
-    For example, given attribute "block", as in "self.block" when "self" points
-    to the top level torch.nn.Module, it returns lists of attribute "chains",
-    e.g. ['block', '2'], ['block', '1'], ['block', '0', '_packed_params']
-
-    These sets of attributes form full attribute accessors. For example,
-    "self.block.1", "self.block.2" will return the second and third submodule,
-    and "self.block.0._packed_params" will return the parameters of the first
-    submodule.
-    """
-
-    def terminate(users):
-        next_attrs = [user for user in users if user.kind() == "prim::GetAttr"]
-        return len(next_attrs) == 0
-
-    return get_use_chains(root_getattr_node, terminate)
-
-
-def convert_params(graph, state_dict, source_map, use_parser_friendly_name=False):
-    """
-    Return Relay vars and TVM NDArrays for input parameters
-    A chain of prim::GetAttr nodes is processed one at a time
-    """
-    getattr_nodes = graph.findAllNodes("prim::GetAttr", recurse=True)
-    params = {}
-    param_tensors = {}
-    packed_param_map = {}
-    param_debug_name_map = {}
-    vars_by_name = {}
-    seen = set()
-    attr_name_sep = "_" if use_parser_friendly_name else "."
-
-    for node in getattr_nodes:
-        if _get_output_name(node) in seen:
-            continue
-
-        for getattrs in get_attr_chains(node):
-            seen.update(map(_get_output_name, getattrs))
-
-            full_attr = _getattr_full_name(getattrs, attr_name_sep)
-            full_attr_node_name = _get_output_name(getattrs[-1])
-
-            # check if the node is a torch.nn.ParameterList, and if so, include the index in
-            # the attribute name as well
-            # e.g. "weights.1"
-            if re.search(attr_name_sep + r"\d+$", full_attr):
-                attr_name = full_attr.split(attr_name_sep)[-2:]
-            else:
-                attr_name = [full_attr.split(attr_name_sep)[-1]]
-
-            # set variable name by concatenating first consumer's name with attribute name
-            # e.g. "aten::batch_norm_5.running_mean"
-            var_name = attr_name_sep.join([source_map[_get_users(getattrs[-1])[0]]] + attr_name)
-
-            if full_attr.endswith("_packed_params"):  # for quantized models
-                packed_param_map[full_attr_node_name] = full_attr
-            elif full_attr in state_dict:
-                if var_name in vars_by_name:
-                    var = vars_by_name[var_name]
-                else:
-                    torch_tensor = state_dict[full_attr]
-                    tensor, var = _get_tensor_and_var(torch_tensor, var_name)
-                    param_tensors[var_name] = tensor
-                    # for quantized parameters to be correctly located
-                    param_debug_name_map[full_attr_node_name] = var_name
-                    vars_by_name[var_name] = var
-                params[full_attr_node_name] = var
-
-    return params, param_tensors, packed_param_map, param_debug_name_map
-
-
-def get_all_op_names(graph):
-    """Return all operator names in the input graph"""
-    nodes = list(graph.nodes())
-    prim_with_blocks = ["prim::If", "prim::Loop"]
-    for prim in prim_with_blocks:
-        prim_nodes = graph.findAllNodes(prim, recurse=True)
-        for prim_node in prim_nodes:
-            for block in prim_node.blocks():
-                nodes += block.nodes()
-    return set(node.kind() for node in nodes)
-
-
-def export_c_graph(location, graph):
-    """Convert the graph to an onnx model and export it to the location."""
-    import datetime
-    import os
-
-    if not os.path.exists(location):
-        os.makedirs(location)
-    time_stamp = datetime.datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
-    fname = os.path.join(location, f"tvm_exported_c_graph_{time_stamp}.txt")
-    with open(f"{fname}", "w") as f:
-        f.write(str(graph))
-
-
-def from_pytorch(
-    script_module,
-    input_infos,
-    custom_convert_map=None,
-    default_dtype="float32",
-    use_parser_friendly_name=False,
-    keep_quantized_weight=False,
-    export_renamed_c_graph_path=None,
-    preserve_pytorch_scopes=False,
-):
-    """Load PyTorch model in the form of a scripted PyTorch model and convert into relay.
-    The companion parameters will be handled automatically.
-
-    Parameters
-    ----------
-    script_module : TopLevelTracedModule object
-        TorchScripted PyTorch graph
-        Note: We currently only support traces (ie: torch.jit.trace(model, input))
-
-    input_infos : List of tuples
-        Can be (input name, input shape) or (input name, (input shape, input types))
-        Graph level input shape and type list
-        The same input names need to be used for deployment, so choose easy to
-        remember names (such as: input0, input1)
-        e.g.
-        [('input0', (1, 2)), ('input1', (3, 4))]
-        or
-        [('input0', ((1, 2), 'int')), ('input1', ((3, 4), 'float'))]
-
-    custom_convert_map : Dictionary of str to Relay op
-        A custom op conversion map in the same format as _convert_map above
-
-    default_type : str
-        The default dtype to use when type information is not provided by PyTorch.
-
-    use_parser_friendly_name : bool
-        When True, replace '.' with `_' in a original parameter name.
-        The Relay text parser treats a variable name followed by a period as a tuple element access,
-        so a variable name like "dense.weight" cannot be parsed correctly.
-        Use this option when you want to run the AnnotateSpans pass on the imported module.
-
-    keep_quantized_weight : bool
-        Return quantized weights and bias, rather than float ones. PyTorch stores quantized weights
-        in a custom format, so we cannot directly access 8 bit weights as Numpy arrays. We use
-        a PyTorch function to unpack quantized weights into float32 arrays and quantization
-        parameters. By default, we return float32 weights and rely on the QNN lowering and the
-        Relay constant folding pass to quantize weights at compile time. In BYOC use cases, however,
-        we cannot apply the constant folding pass on a QNN graph. If keep_quantized_weight is True,
-        we quantize weights in the frontend using a function that is equivalent to
-        qnn.op.quantize(...) operating on Numpy arrays.
-
-    export_renamed_c_graph_path : str, optional
-        Export the renamed torch._C.Graph to the path.
-        During the conversion, variable names in torch._C.Graph will be assigned based on their op
-        types. The exported text file can be the reference to spans.
-
-    preserve_pytorch_scopes : bool
-        When naming the nodes in the Relay graph, use the "scope name" from the Pytorch model.
-        If false, a default namer is used that does not preserve the Pytorch scope names.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The module that optimizations will be performed on.
-
-    params : dict of str to tvm.runtime.NDArray
-        Dict of converted parameters stored in tvm.runtime.ndarray format
-    """
-    import torch
-
-    mod = tvm.IRModule()
-    prelude = Prelude(mod)
-    enable_lower_all_tuples = True
-
-    converter = PyTorchOpConverter(
-        prelude, default_dtype, use_parser_friendly_name, preserve_pytorch_scopes
-    )
-
-    graph = script_module.graph.copy()
-
-    # Check if lower_all_tuples pass can be enabled
-    graph_inputs = list(graph.inputs())
-    for inp in graph_inputs:
-        if inp.type().kind() == "TupleType" or inp.type().kind() == "ListType":
-            enable_lower_all_tuples = False
-            break
-
-    _run_jit_passes(graph, enable_lower_all_tuples)
-    _redirect_inplace_output(graph)
-
-    if custom_convert_map:
-        converter.update_convert_map(custom_convert_map)
-
-    op_names = get_all_op_names(graph)
-    converter.report_missing_conversion(op_names)
-
-    is_module = isinstance(script_module, torch.jit.ScriptModule)
-    params = script_module.state_dict() if is_module else {}
-    outputs = _get_relay_input_vars(
-        graph, input_infos, prelude, default_dtype=default_dtype, is_module=is_module
-    )
-
-    if use_parser_friendly_name:
-        new_names = [key.replace(".", "_") for key in params.keys()]
-        params = dict(zip(new_names, params.values()))
-
-    # rename _C.Graph here for constructing meaningful source name of graph nodes
-    # by doing so, we could Use source_map as the reference to rename model parameters
-    source_map = _debug_rename(graph, use_parser_friendly_name, preserve_pytorch_scopes)
-    param_vars, tensors, packed_param_map, param_debug_name_map = convert_params(
-        graph, params, source_map, use_parser_friendly_name
-    )
-
-    tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()}
-
-    outputs.update(param_vars)
-
-    # For quantized models
-    quantized_ops = set(["aten::quantize_per_tensor", "quantized::linear_dynamic"])
-    if len(quantized_ops.intersection(set(op_names))) > 0:
-        weight_quant_params = qnn_torch.get_weight_quant_params(
-            script_module, packed_param_map.values()
-        )
-        qnn_torch.inline_input_quant_params_for_fx(graph, tensors, param_debug_name_map)
-        input_scales_for_bias = qnn_torch.add_input_quant_params_to_op_inputs(graph)
-        qnn_torch.add_quant_params_to_outputs(
-            outputs,
-            packed_param_map,
-            weight_quant_params,
-            input_scales_for_bias,
-            keep_quantized_weight,
-        )
-        qnn_torch.add_quant_params(tvm_params, weight_quant_params)
-        converter.update_convert_map(qnn_torch.convert_map)
-
-    operator_nodes = _get_operator_nodes(
-        graph.nodes(),
-        converter.source_map,
-        converter.op_type_dict,
-        use_parser_friendly_name,
-        preserve_pytorch_scopes,
-    )
-    ret_name = _get_input_names(graph.return_node())
-    outputs = converter.convert_operators(operator_nodes, outputs, ret_name)
-
-    # ListConstruct kept original python list. Convert to tuple.
-    outputs = [_expr.Tuple(output) if isinstance(output, list) else output for output in outputs]
-
-    if len(outputs) > 1:
-        ret = _expr.Tuple(outputs)
-    else:
-        ret = outputs[0]
-
-    # Separate data inputs and parameters to make sure data inputs come first.
-    func_args = []
-    data_inputs = []
-    for arg in _analysis.free_vars(ret):
-        if arg.name_hint not in tvm_params.keys():
-            data_inputs.append(arg)
-        else:
-            func_args.append(arg)
-
-    # Ensures the order of data_input is the same as the order of inputs specified in input_info.
-    order_input_infos = {
-        input_info[0]: len(input_infos) - idx for idx, input_info in enumerate(input_infos)
-    }
-    data_inputs = sorted(
-        data_inputs,
-        key=lambda data_input: order_input_infos[data_input.name_hint]
-        if data_input.name_hint in order_input_infos
-        else -1,
-        reverse=True,
-    )
-
-    func_args = data_inputs + func_args
-
-    mod["main"] = tvm.relay.Function(func_args, ret)
-
-    if export_renamed_c_graph_path:
-        export_c_graph(export_renamed_c_graph_path, graph)
-
-    return transform.RemoveUnusedFunctions()(mod), tvm_params
diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py
deleted file mode 100644
index 8686be4b1ea9..000000000000
--- a/python/tvm/relay/frontend/pytorch_utils.py
+++ /dev/null
@@ -1,420 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-outside-toplevel, unused-argument, invalid-name
-""" Common utilities used by PyTorch frontend """
-from .. import expr
-from .. import op
-from ..dataflow_pattern import (
-    wildcard,
-    is_constant,
-    is_op,
-    rewrite,
-    is_tuple,
-    is_tuple_get_item,
-    is_if,
-    DFPatternCallback,
-)
-
-
-def is_version_greater_than(ver):
-    """
-    Returns True if the local PyTorch version is greater
-    than the one given as an argument.
-    """
-    import torch
-    from packaging.version import parse
-
-    torch_ver = torch.__version__
-    # PT version numbers can include +cu[cuda version code]
-    # and we don't want to include that in the comparison
-    if "+cu" in torch_ver:
-        torch_ver = torch_ver.split("+cu")[0]
-
-    return parse(torch_ver) > parse(ver)
-
-
-def getattr_attr_name(node):
-    attribute_names = node.attributeNames()
-    assert len(attribute_names) == 1
-    return node.s(attribute_names[0])
-
-
-def dyn_strided_slice_pattern(inp, end):
-    """A pattern to detect dynamic strided slice op."""
-    zero = is_constant()
-    cast_like = is_op("cast_like")(zero, is_constant())
-    less = is_op("less")(is_constant(), cast_like)
-    shape_of = is_op("shape_of")(inp)
-    cast_like = is_op("cast_like")(shape_of, is_constant())
-    add = is_op("add")(is_constant(), cast_like)
-    where = is_op("where")(less, add, is_constant())
-
-    return is_op("dyn.strided_slice")(inp, where, end, is_constant())
-
-
-def batched_nms_pattern(boxes, scores, idxs, iou_threshold, num_boxes, indices):
-    """A pattern to detect batched_nms function in torchvision
-
-    The inputs to this function, boxes, scores, idxs, iou_threshold are wildcard
-    patterns which can be used later in the rewriting to extract matched Relay fragments.
-
-    We want to detect the following PyTorch code snippet:
-
-    def batched_nms(boxes, scores, idxs, iou_threshold):
-        max_coordinate = boxes.max()
-        offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
-        boxes_for_nms = boxes + offsets[:, None]
-        keep = nms(boxes_for_nms, scores, iou_threshold)
-        return keep
-
-    Here is how PyTorch frontend lowers above PyTorch code. For simplicity, Relay ops for
-    dealing with dynamic strided_slice are omitted. %num_boxes, %indices are complex
-    expressions, but since we can use the wildcard part for them, we do not need to construct
-    their patterns.
-
-    %2 = expand_dims(%scores, axis=-1);
-    %3 = cast(%idxs, dtype="float32");
-    %4 = max(%boxes);
-    %5 = add(%4, 1f);
-    %6 = multiply(%3, %5);
-    %7 = strided_slice(%6, begin=[0], end=[4507], strides=[1]);
-    %8 = expand_dims(%7, axis=1);
-    %9 = add(%boxes, %8);
-    %10 = (%2, %9);
-    %11 = concatenate(%10, axis=-1);
-    %12 = expand_dims(%11, axis=0);
-    ...
-    ...
-    %17 = vision.non_max_suppression(%12, %num_boxes, %indices, -1, 0.7f, ...);
-
-    """
-    one = is_constant()
-
-    # Equivalent PyTorch code from above snippet
-    # offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
-    cast = is_op("cast")(idxs)
-    mx = is_op("max")(boxes)
-    add = is_op("add")(mx, one)
-    mul = is_op("multiply")(cast, add)
-
-    shape_of = is_op("shape_of")(mul)
-    cast = is_op("cast")(shape_of)
-
-    # Add offsets to the boxes
-    expand_dims = is_op("expand_dims")(mul)
-    add = is_op("add")(boxes, expand_dims)
-
-    # The rest of patterns correspond to the PyTorch frontend conversion
-    # function for torchvision::nms
-    score_expand_dims = is_op("expand_dims")(scores)
-    tup = is_tuple([score_expand_dims, add])
-    concat = is_op("concatenate")(tup)
-    data = is_op("expand_dims")(concat)
-
-    return is_op("vision.non_max_suppression")(
-        data, num_boxes, indices, is_constant(), iou_threshold
-    )
-
-
-def topk_after_batch_nms_pattern(cond, true_branch, data, valid_count, indices, iou_threshold):
-    """
-    Detect the following pattern used in torchvision detection models.
-
-    def batched_nms(...):
-        if boxes.numel() == 0:
-            return torch.empty((0,), dtype=torch.int64, device=boxes.device)
-        else:
-            ...
-            return nms(boxes_for_nms, scores, iou_threshold)
-
-    keep = batched_nms(boxes, scores, lvl, self.nms_thresh)
-    keep = keep[:post_nms_top_k] # keep only topk scoring predictions
-
-    An equivalent Relay subgraph:
-
-    %1184 = if (%1117) {
-      ...
-    } else {
-      ...
-      %1172 = vision.non_max_suppression(%1167, %1168, %1171, -1, 0.7f, ...);
-      ...
-      %1183 = dyn.strided_slice(%1174, %1180, %1182, ...);
-      cast(%1183, dtype="int64")
-    };
-    %1185 = strided_slice(%1184, begin=[0], end=[1000], strides=[1]);
-
-    """
-    nms = is_op("vision.non_max_suppression")(
-        data, valid_count, indices, is_constant(), iou_threshold
-    )
-    indices = is_op("squeeze")(is_tuple_get_item(nms, 0))
-    size = is_op("squeeze")(is_tuple_get_item(nms, 1))
-    dyn_strided_slice = dyn_strided_slice_pattern(indices, size)
-    cast_i64 = is_op("cast")(dyn_strided_slice)
-
-    batched_nms_result = is_if(cond, true_branch, cast_i64)
-
-    return is_op("strided_slice")(batched_nms_result)
-
-
-class MulticlassNMSRewrite(DFPatternCallback):
-    """A callback to rewrite nms and restore batched nms."""
-
-    def __init__(self):
-        super().__init__()
-        # exprs to extract
-        self.boxes = wildcard()
-        self.scores = wildcard()
-        self.idxs = wildcard()
-        self.iou_threshold = wildcard()
-        self.num_boxes = wildcard()
-        self.indices = wildcard()
-
-        self.pattern = batched_nms_pattern(
-            self.boxes,
-            self.scores,
-            self.idxs,
-            self.iou_threshold,
-            self.num_boxes,
-            self.indices,
-        )
-
-    def convert_batched_nms(self, boxes, scores, idxs, iou_thres, num_boxes, indices):
-        """Restore class-aware NMS using extracted class indices"""
-        scores = op.expand_dims(scores, axis=-1, num_newaxis=1)
-        idxs = op.expand_dims(idxs, axis=-1, num_newaxis=1)
-        idxs = op.cast(idxs, "float32")
-        data = op.concatenate([idxs, scores, boxes], -1)
-        data = op.expand_dims(data, 0, 1)
-
-        top_k = max_out_size = -1
-        out = op.vision.non_max_suppression(
-            data=data,
-            valid_count=num_boxes,
-            indices=indices,
-            max_output_size=max_out_size,
-            iou_threshold=iou_thres,
-            force_suppress=False,
-            top_k=top_k,
-            coord_start=2,
-            score_index=1,
-            id_index=0,
-            return_indices=True,
-            invalid_to_bottom=False,
-        )
-        return out.tuple_value
-
-    def callback(self, pre, post, node_map):
-        boxes = node_map[self.boxes][0]
-        scores = node_map[self.scores][0]
-        idxs = node_map[self.idxs][0]
-        iou_thres = node_map[self.iou_threshold][0]
-        num_boxes = node_map[self.num_boxes][0]
-        indices = node_map[self.indices][0]
-        return self.convert_batched_nms(boxes, scores, idxs, iou_thres, num_boxes, indices)
-
-
-class PostNMSTopKRewrite(DFPatternCallback):
-    """A callback to rewrite nms to exploit max_out_size parameter."""
-
-    def __init__(self):
-        super().__init__()
-        self.cond = wildcard()
-        self.true_branch = wildcard()
-        self.data = wildcard()
-        self.valid_count = wildcard()
-        self.indices = wildcard()
-        self.iou_threshold = wildcard()
-
-        self.pattern = topk_after_batch_nms_pattern(
-            self.cond,
-            self.true_branch,
-            self.data,
-            self.valid_count,
-            self.indices,
-            self.iou_threshold,
-        )
-
-    def rewrite_batch_nms_with_max_out_size(
-        self, cond, true_branch, data, valid_count, indices, iou_threshold, post_nms_topk
-    ):
-        """Use the detected post NMS topk parameter in NMS op."""
-        nms_ret = op.vision.non_max_suppression(
-            data=data,
-            valid_count=valid_count,
-            indices=indices,
-            max_output_size=post_nms_topk,
-            iou_threshold=iou_threshold,
-            force_suppress=False,
-            top_k=-1,
-            coord_start=2,
-            score_index=1,
-            id_index=0,
-            return_indices=True,
-            invalid_to_bottom=False,
-        )
-
-        size = op.squeeze(nms_ret[1], axis=[1])
-        data_slice = op.squeeze(nms_ret[0], axis=[0])
-
-        ret = op.strided_slice(data_slice, begin=expr.const([0]), end=size, slice_mode="size")
-
-        nms_result = op.cast(ret, "int64")
-
-        return expr.If(cond, true_branch, nms_result)
-
-    def callback(self, pre, post, node_map):
-        post_nms_topk = post.attrs.end[0].value
-        return self.rewrite_batch_nms_with_max_out_size(
-            node_map[self.cond][0],
-            node_map[self.true_branch][0],
-            node_map[self.data][0],
-            node_map[self.valid_count][0],
-            node_map[self.indices][0],
-            node_map[self.iou_threshold][0],
-            post_nms_topk,
-        )
-
-
-def scatter_roi_align_result_pattern(levels, roi_align_results, num_scales):
-    """Detect the Relay subgraph corresponding to the following PyTorch code
-
-    first_result = roi_align_results[0]
-    dtype, device = first_result.dtype, first_result.device
-    res = torch.zeros((levels.size(0), first_result.size(1),
-                       first_result.size(2), first_result.size(3)),
-                      dtype=dtype, device=device)
-    for level in range(len(roi_align_results)):
-        index = torch.where(levels == level)[0].view(-1, 1, 1, 1)
-        index = index.expand(index.size(0),
-                             roi_align_results[level].size(1),
-                             roi_align_results[level].size(2),
-                             roi_align_results[level].size(3))
-        res = res.scatter(0, index, roi_align_results[level])
-    return res
-    """
-
-    def do_where(levels, _):
-        idx_in_level = is_op("argwhere")(is_op("equal")(levels, is_constant()))
-        idx_in_level = is_op("split")(idx_in_level)
-        idx_in_level = is_tuple_get_item(idx_in_level, 0)
-        idx_in_level = is_op("squeeze")(idx_in_level)
-        idx_in_level = is_tuple_get_item(is_tuple([idx_in_level]), 0)
-        return idx_in_level
-
-    scatter_res = wildcard()
-
-    for i in range(num_scales):
-        # index = torch.where(levels == level)[0].view(-1, 1, 1, 1)
-        scatter_indices = do_where(levels, i)
-        scatter_indices = is_op("reshape")(scatter_indices)
-
-        # index = index.expand(index.size(0),
-        #                      unmerged_results[level].size(1),
-        #                      unmerged_results[level].size(2),
-        #                      unmerged_results[level].size(3))
-        scatter_indices = is_op("repeat")(scatter_indices)
-        scatter_indices = is_op("repeat")(scatter_indices)
-        scatter_indices = is_op("repeat")(scatter_indices)
-
-        scatter_res = is_op("scatter_elements")(scatter_res, scatter_indices, roi_align_results[i])
-
-    return is_op("reshape")(scatter_res)
-
-
-class ScatterRewrite(DFPatternCallback):
-    """A callback to rewrite repeated scatters with a batched gather."""
-
-    def __init__(self, num_scales):
-        super().__init__()
-        self.num_scales = num_scales
-        self.levels = wildcard()
-        self.roi_align_results = []
-        for _ in range(num_scales):
-            self.roi_align_results.append(wildcard())
-
-        self.pattern = scatter_roi_align_result_pattern(
-            self.levels, self.roi_align_results, num_scales
-        )
-
-    def convert_scatter_to_gather(self, levels, roi_align_results):
-        """Replace the detected scatter loop with the following PyTorch code
-
-        indices_per_level = []
-        for level in range(num_scales):
-            idx_in_level = torch.where(levels == level)[0]
-            indices_per_leve.append(idx_in_level)
-
-        stacked_features = torch.cat(roi_align_results, dim=0)
-        stacked_indices = torch.cat(indices_per_level, dim=0)
-        argsort_indices = torch.argort(stacked_indices)
-        return stacked_features[argsort_indices, :]
-        """
-
-        # Collect inidices and concat them
-        indices_per_level = []
-        for i in range(self.num_scales):
-            equal = op.equal(levels, expr.const(i, dtype="int64"))
-            argwhere = op.argwhere(equal)
-            split = op.split(argwhere, indices_or_sections=1, axis=1)
-            squeeze = op.squeeze(split[0], axis=[1])
-            indices = op.cast(squeeze, dtype="int64")
-            indices_per_level.append(indices)
-
-        indices_concat = op.concatenate(indices_per_level, 0)
-
-        # Concat roi align results per level, and argsort indices
-        # To prepare for a batched gather
-        roi_align_results_concat = op.concatenate(roi_align_results, 0)
-        argsort_indices = op.cast(op.argsort(indices_concat), dtype="int64")
-
-        # Permute rows by argsorted indices
-        permuted = op.take(roi_align_results_concat, argsort_indices, axis=0)
-
-        return op.reshape(permuted, [0, -1, 1, 1])
-
-    def callback(self, pre, post, node_map):
-        levels = node_map[self.levels][0]
-        roi_align_results = [node_map[feat][0] for feat in self.roi_align_results]
-        return self.convert_scatter_to_gather(levels, roi_align_results)
-
-
-def rewrite_nms_to_batched_nms(mod):
-    """Rewrite the input graph to replace non maximum surpression
-    in torchvision that does not take class id into account with the one
-    that avoids IOU tests between different classes.
-    """
-    mod["main"] = rewrite(MulticlassNMSRewrite(), mod["main"])
-    return mod
-
-
-def rewrite_batched_nms_with_max_out_size(mod):
-    """Rewrite the input graph to detect slicing after batched nms and
-    use the slicing size as the parameter max_out_size in NMS.
-    """
-    mod["main"] = rewrite(PostNMSTopKRewrite(), mod["main"])
-    return mod
-
-
-def rewrite_scatter_to_gather(mod, num_scales):
-    """Rewrite the input graph to replace a repeated scatter loop with
-    a batched gather. The scatter loop is used in torchvision MultiScaleRoIAlign
-    to merge roi_align results for all scales. The scatter is used to emulate
-    inplace updates.
-    """
-    mod["main"] = rewrite(ScatterRewrite(num_scales), mod["main"])
-    return mod
diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py
deleted file mode 100644
index a6d536eaccf7..000000000000
--- a/python/tvm/relay/frontend/qnn_torch.py
+++ /dev/null
@@ -1,1202 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, import-outside-toplevel
-""" Functions to convert quantized torch models to QNN """
-
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.relay import expr as _expr
-from tvm.relay import op as _op
-from tvm.relay.frontend.common import infer_shape
-
-from .common import logger
-from .pytorch_utils import is_version_greater_than
-
-
-class QNNParam(object):
-    """A placeholder for weight quantization parameters"""
-
-    def __init__(self, weight, bias, scale, zero_point):
-        self.weight = weight
-        self.bias = None if bias is None else bias.detach().numpy()
-        self.scale = _expr.const(scale)
-        self.zero_point = _expr.const(zero_point, dtype="int32")
-
-
-class ConvPackedParam(QNNParam):
-    """A placeholder for quantized conv2d op attributes
-    As of PyTorch 1.6, attributes of quantized conv2d ops, like
-    stride, padding etc are stored in ConvPackedParams objects,
-    together with weights and quantization parameters
-    """
-
-    def __init__(
-        self, weight_np, bias, scale, zero_point, stride, padding, dilation, groups, output_padding
-    ):
-        super().__init__(weight_np, bias, scale, zero_point)
-        self.stride = stride
-        self.padding = padding
-        self.dilation = dilation
-        self.groups = groups
-        # Used only for conv_transpose2d
-        self.output_padding = output_padding
-
-
-def _get_quant_params(qweight):
-    import torch
-
-    weight_np = qweight.dequantize().numpy()
-
-    if qweight.qscheme() == torch.per_tensor_affine:
-        return weight_np, qweight.q_scale(), int(qweight.q_zero_point())
-
-    scales = qweight.q_per_channel_scales().numpy()
-    zero_points = qweight.q_per_channel_zero_points().numpy()
-    # This is an assumption posed by QNN
-    msg = "The values of zero points should be all zero for per channel"
-    assert np.all(zero_points == 0), msg
-    return weight_np, scales, 0
-
-
-def make_qnn_param(qweight, bias):
-    weight_np, scale, zero_point = _get_quant_params(qweight)
-    return QNNParam(weight_np, bias, scale, zero_point)
-
-
-def make_conv_packed_param(qweight, bias, packed_params):
-    weight_np, scale, zero_point = _get_quant_params(qweight)
-    stride = packed_params.stride()
-    padding = packed_params.padding()
-    dilation = packed_params.dilation()
-    groups = packed_params.groups()
-    output_padding = packed_params.output_padding()
-    return ConvPackedParam(
-        weight_np, bias, scale, zero_point, stride, padding, dilation, groups, output_padding
-    )
-
-
-def get_weight_quant_params(script_module, packed_param_names):
-    """Retrieve and unpack weight parameters from quantized modules"""
-    import torch
-
-    param_name = "_packed_params"
-    quant_params = {}
-
-    def filter_func(named_module):
-        m = named_module[1]
-        return isinstance(m, torch.jit.RecursiveScriptModule) and (
-            ("Conv" in m.original_name) or (m.original_name == "LinearPackedParams")
-        )
-
-    for name, m in filter(filter_func, script_module.named_modules()):
-        key = name + "." + param_name
-        state_dict = m.state_dict()
-
-        if key not in packed_param_names:
-            continue
-
-        if len(state_dict) == 0 and not hasattr(m, param_name):
-            # for v1.6 and above
-            # This case seems to happen if a model is serialized
-            # and loaded back
-            # This module can be safely ignored
-            continue
-
-        if len(state_dict) == 0 and hasattr(m, param_name):
-            # for v1.6 and above
-            packed_params = m._packed_params
-        else:
-            assert len(state_dict) == 1
-            packed_params = list(state_dict.values())[0]
-
-        if "Conv" in m.original_name and len(state_dict) == 0:
-            qweight, bias = torch.ops.quantized.conv2d_unpack(packed_params)
-            quant_params[key] = make_conv_packed_param(qweight, bias, packed_params)
-        elif "Conv" in m.original_name:
-            qweight, bias = torch.ops.quantized.conv2d_unpack(packed_params)
-            quant_params[key] = make_qnn_param(qweight, bias)
-        elif m.original_name == "LinearPackedParams":
-            qweight, bias = torch.ops.quantized.linear_unpack(packed_params)
-            quant_params[key] = make_qnn_param(qweight, bias)
-
-    return quant_params
-
-
-def quantize_numpy(weight, scale, zero_point, out_dtype_np):
-    iinfo = np.iinfo(out_dtype_np)
-    clip_min = iinfo.min
-    clip_max = iinfo.max
-    if len(scale.shape) > 0:
-        scale = np.reshape(scale, [weight.shape[0]] + [1] * (len(weight.shape) - 1))
-    transformed = zero_point + weight / scale
-    return np.clip(np.round(transformed), clip_min, clip_max).astype(out_dtype_np)
-
-
-def add_quant_params_to_outputs(
-    outputs, packed_param_map, quant_params, input_scales_for_bias, keep_quantized_weight=False
-):
-    """
-    Add quant params to outputs so that they can be referenced by other
-    ops later. Weights are quantized here.
-    """
-    for node_name, packed_param_name in packed_param_map.items():
-        qparam = quant_params[packed_param_name]
-        weight_scale = _get_numpy(qparam.scale)
-        param_prefix = packed_param_name[: -len("._packed_params")]
-
-        if keep_quantized_weight:
-            qparam.weight_var = _expr.var(
-                param_prefix + "_weight", shape=qparam.weight.shape, dtype="int8"
-            )
-            qparam.weight = quantize_numpy(
-                qparam.weight, weight_scale, _get_numpy(qparam.zero_point), np.int8
-            )
-            qweight = qparam.weight_var
-        else:
-            qparam.weight_var = _expr.var(
-                param_prefix + "_weight", shape=qparam.weight.shape, dtype="float32"
-            )
-            qweight = relay.qnn.op.quantize(
-                qparam.weight_var, qparam.scale, qparam.zero_point, out_dtype="int8", axis=0
-            )
-
-        if qparam.bias is not None:
-            float_bias_var = _expr.var(
-                param_prefix + "_bias", shape=qparam.bias.shape, dtype="float32"
-            )
-            if node_name not in input_scales_for_bias:
-                # This case is for dynamic quantization, where the input activation scale is
-                # unknown until runtime.
-                qparam.bias_var = float_bias_var
-                qbias = qparam.bias_var
-            elif keep_quantized_weight:
-                qparam.bias_var = _expr.var(
-                    param_prefix + "_bias", shape=qparam.bias.shape, dtype="int32"
-                )
-                qparam.bias = quantize_numpy(
-                    qparam.bias, input_scales_for_bias[node_name] * weight_scale, 0, np.int32
-                )
-                qbias = qparam.bias_var
-            else:
-                qparam.bias_var = float_bias_var
-                qbias = relay.qnn.op.quantize(
-                    qparam.bias_var,
-                    _expr.const(input_scales_for_bias[node_name] * weight_scale),
-                    _expr.const(0, "int32"),
-                    out_dtype="int32",
-                    axis=0,
-                )
-        else:
-            qbias = None
-
-        quant_params[packed_param_name] = qparam
-
-        params = [qweight, qparam.scale, qparam.zero_point, qbias]
-
-        if isinstance(quant_params[packed_param_name], ConvPackedParam):
-            params += [
-                qparam.stride,
-                qparam.padding,
-                qparam.dilation,
-                qparam.groups,
-                qparam.output_padding,
-            ]
-
-        outputs[node_name] = params
-
-
-def _get_quant_param_for_input(input_value):
-    """
-    We want to know the input scale and zp of this input_value, since
-    input quant params are not explicitly passed around in torch (they
-    are embedded in a QTensor data structure, not visible statically).
-    We know that it is quantized using output scale and zp
-    of some previous quantized op. The purpose of this function
-    is to find that pair of parameters.
-    """
-    # Indices for output scale and zp
-    # For example, in quantized::conv2d(%input, %1, %2, %3, %4, %5, %6, %7),
-    # 6th and 7th arg are output scale and zp respectively.
-
-    # PyTorch 1.6 changed qconv API
-    if is_version_greater_than("1.5.1"):
-        qconv_indices = (2, 3)
-    else:
-        qconv_indices = (6, 7)
-
-    output_quant_param_indices = {
-        "aten::quantize_per_tensor": (1, 2),
-        "quantized::conv2d": qconv_indices,
-        "quantized::conv2d_relu": qconv_indices,
-        "quantized::linear": (2, 3),
-        "quantized::linear_relu": (2, 3),
-        "quantized::add_relu": (2, 3),
-        "quantized::add": (2, 3),
-        "quantized::mul_relu": (2, 3),
-        "quantized::mul": (2, 3),
-        "quantized::cat": (2, 3),
-        "quantized::mul_scalar": (2, 3),
-        "quantized::add_scalar": (2, 3),
-        "quantized::hardswish": (1, 2),
-        "quantized::conv_transpose2d": qconv_indices,
-        "quantized::leaky_relu": (3, 4),
-        "aten::sigmoid": (1, 2),
-    }
-
-    def dfs(current_node):
-        # trace back to find the producer of this input value
-        current_op = current_node.kind()
-        if current_op in output_quant_param_indices:
-            indices = output_quant_param_indices[current_op]
-            scale = current_node.inputsAt(indices[0])
-            zp = current_node.inputsAt(indices[1])
-            return scale, zp
-
-        # Trace back eariler nodes, dfs order
-        # Assume quantized tensor comes earlier in the args
-        for arg in current_node.inputs():
-            return dfs(arg.node())
-
-        # If input_value is not quantized, we reach here.
-        return None, None
-
-    return dfs(input_value.node())
-
-
-def _get_add_scalar_output_quant_param(input_scale, input_zero_point, scalar):
-    """
-    Determine the output scale and zp of quantized::add_scalar op
-    This is used for mobilenet v3
-    Refer to aten/src/ATen/native/quantized/cpu/qadd.cpp
-    The names of variables are the same as torch impl
-    """
-    q_min = 0
-    q_max = 255
-    s = input_scale
-    z = input_zero_point
-    c = scalar
-    c_q = round(c / s)
-
-    if q_min > z - c_q:
-        s_prime = (float(q_max) - (z - c_q)) / (float(q_max) - q_min) * s
-        z_prime = q_min
-    elif q_max < z - c_q:
-        s_prime = (float(z - c_q) - q_min) / (float(q_max) - q_min) * s
-        z_prime = q_max
-    else:
-        s_prime = s
-        z_prime = z - c_q
-
-    return s_prime, z_prime
-
-
-def _get_mul_scalar_output_quant_param(input_scale, input_zero_point, scalar):
-    """
-    Determine the output scale and zp of quantized::mul_scalar op
-    This is used for mobilenet v3
-    Refer to aten/src/ATen/native/quantized/cpu/qmul.cpp
-    The names of variables are the same as torch impl
-    """
-    q_min = 0
-    q_max = 255
-    self_scale = input_scale
-    self_zero_point = input_zero_point
-    other_val = scalar
-
-    if other_val > 0.0:
-        s_prime = other_val * self_scale
-        z_prime = self_zero_point
-    elif other_val == 0.0:
-        s_prime = 1.0
-        z_prime = 0
-    else:
-        s_prime = abs(other_val) * self_scale
-        z_prime = q_max - (self_zero_point - q_min)
-
-    return s_prime, z_prime
-
-
-def _add_output_quant_params_to_scalar_op(node, graph, input_scale, input_zero_point, scalar):
-    """
-    The output scale and zp of {add,mul}_scalar op are not explicit in the IR
-    They are required for _get_quant_param_for_input above to work correctly
-    So calculate these params using the same way torch does, and make new
-    constant nodes in the input IR. Also add these params to the inputs of
-    scalar op.
-
-    For example,
-       %6 : float = prim::Constant[value=3.]()
-       %input : QUInt8(1, 3, 224, 224) = quantized::add_scalar(%x.1, %6)
-    becomes
-       %6 : float = prim::Constant[value=3.]()
-       %7 : float = prim::Constant[value=0.015686161816120148]()
-       %8 : int = prim::Constant[value=0]()
-       %input : UInt8(1, 3, 224, 224) = quantized::add_scalar(%x.1, %6, %7, %8)
-
-    %7 and %8 are newly created output scale and zp constant nodes
-    """
-    # pylint: disable=c-extension-no-member
-    import torch
-
-    operator = node.kind()
-
-    if operator == "quantized::mul_scalar":
-        out_scale, out_zero_point = _get_mul_scalar_output_quant_param(
-            input_scale, input_zero_point, scalar
-        )
-    elif operator == "quantized::add_scalar":
-        out_scale, out_zero_point = _get_add_scalar_output_quant_param(
-            input_scale, input_zero_point, scalar
-        )
-    else:
-        raise NotImplementedError(f"unsupported scalar op: {operator}")
-
-    # create new constant nodes and add them to graph
-    out_scale_node = graph.create("prim::Constant")
-    out_zero_point_node = graph.create("prim::Constant")
-    out_scale_node.insertBefore(node)
-    out_zero_point_node.insertBefore(node)
-    out_scale_node.f_("value", out_scale)
-    out_zero_point_node.i_("value", out_zero_point)
-    out_scale_node.output().setType(torch._C.FloatType.get())
-    out_zero_point_node.output().setType(torch._C.IntType.get())
-    node.addInput(out_scale_node.output())
-    node.addInput(out_zero_point_node.output())
-
-
-def _add_output_quant_params_to_sigmoid_op(node, graph):
-    """
-    Refer to aten/src/ATen/native/quantized/cpu/qsigmoid.cpp,
-    the output scale and zp of sigmoid op are two fixed numbers.
-    So we need to make two new constant nodes in the input IR and
-    add these params to the inputs of sigmoid op.
-    """
-    # pylint: disable=c-extension-no-member
-    import torch
-
-    # suppose scale_type is uint8
-    out_scale = 1.0 / 256
-    out_zero_point = 0
-
-    # create new constant nodes and add them to graph
-    out_scale_node = graph.create("prim::Constant")
-    out_zero_point_node = graph.create("prim::Constant")
-    out_scale_node.insertBefore(node)
-    out_zero_point_node.insertBefore(node)
-    out_scale_node.f_("value", out_scale)
-    out_zero_point_node.i_("value", out_zero_point)
-    out_scale_node.output().setType(torch._C.FloatType.get())
-    out_zero_point_node.output().setType(torch._C.IntType.get())
-    node.addInput(out_scale_node.output())
-    node.addInput(out_zero_point_node.output())
-
-
-def add_input_quant_params_to_op_inputs(graph):
-    """
-    In Torch, input quant params are not explicitly passed around
-    Instead, they are stored in QTensor data structure, and retrieved
-    at runtime by each quantized ops.
-    However, they need to be known statically for QNN translation.
-    To workaround and simplify the translation of inputs, we manually add
-    input quant params to inputs of Torch quantized operators listed below.
-    See _quantized_conv2d() below for example of why this is helpful.
-
-    For example,
-      %input : QUInt8(1, 512, 7, 7) = quantized::add(%x.8, %x.9, %434, %435)
-    becomes
-      %395 : float = prim::Constant[value=0.036212071776390076]()
-      %396 : int = prim::Constant[value=0]()
-      %430 : float = prim::Constant[value=0.16080744564533234]()
-      %431 : int = prim::Constant[value=42]()
-      %input : QUInt8(1, 512, 7, 7) = quantized::add(%x.8, %x.9, %434, %435,
-                                                     %430, %431, %395, %396)
-
-    %434, %435 are output scale and zp of quantized::add op
-    %430, %431, %395, %396 are two pairs of input (scale, zp) for two tensors
-    added by this function
-    """
-    # How many quantized tensors each op takes as inputs?
-    # A pair of (scale, zp) for each input quantized tensor will be added
-    # to the input nodes
-    num_quantized_inputs = {
-        "quantized::conv2d": 1,
-        "quantized::conv2d_relu": 1,
-        "quantized::linear": 1,
-        "quantized::linear_relu": 1,
-        "quantized::add_relu": 2,
-        "quantized::add": 2,
-        "quantized::mul_relu": 2,
-        "quantized::mul": 2,
-        "aten::dequantize": 1,
-        "aten::mean": 1,
-        "aten::sigmoid": 1,
-        "aten::upsample_nearest2d": 1,
-        "aten::upsample_bilinear2d": 1,
-        "aten::relu_": 1,
-        "aten::relu": 1,
-        "quantized::add_scalar": 1,
-        "quantized::mul_scalar": 1,
-        "quantized::relu6": 1,
-        "quantized::hardswish": 1,
-        "aten::hardsigmoid": 1,
-        "quantized::conv_transpose2d": 1,
-        "quantized::leaky_relu": 1,
-    }
-
-    need_input_quant_param = set(num_quantized_inputs.keys())
-    need_input_quant_param.add("quantized::cat")
-
-    input_scales_for_bias = {}
-
-    for node in graph.nodes():
-        operator = node.kind()
-        if operator not in need_input_quant_param:
-            continue
-
-        input_scales = []
-        input_zero_points = []
-
-        if operator == "quantized::cat":
-            # the number of inputs to concat is not constant
-            # so handle it separately
-            inputs = node.inputsAt(0).node().inputs()
-            for inp in inputs:
-                scale, zp = _get_quant_param_for_input(inp)
-                input_scales.append(scale)
-                input_zero_points.append(zp)
-        else:
-            for i in range(num_quantized_inputs[operator]):
-                scale, zp = _get_quant_param_for_input(node.inputsAt(i))
-                if scale is not None and zp is not None:
-                    input_scales.append(scale)
-                    input_zero_points.append(zp)
-
-        if operator in ["quantized::add_scalar", "quantized::mul_scalar"]:
-            scalar = node.inputsAt(1).node().f("value")
-            inp_scale = input_scales[0].node().f("value")
-            inp_zero_point = input_zero_points[0].node().i("value")
-
-            # see the comments in this function above
-            _add_output_quant_params_to_scalar_op(node, graph, inp_scale, inp_zero_point, scalar)
-
-        if operator == "aten::sigmoid":
-            _add_output_quant_params_to_sigmoid_op(node, graph)
-
-        for scale, zp in zip(input_scales, input_zero_points):
-            node.addInput(scale)
-            node.addInput(zp)
-
-        if "quantized::conv" in operator or "quantized::linear" in operator:
-            # This is required for quantizing the bias
-            assert len(input_scales) == 1, "One quantized parameter expected for qconv or qlinear."
-            input_scales_for_bias[node.inputsAt(1).debugName()] = input_scales[0].node().f("value")
-
-    return input_scales_for_bias
-
-
-def add_quant_params(params, quant_params):
-    """Add quant parameters to TVM param map"""
-    for qparam in quant_params.values():
-        params[qparam.weight_var.name_hint] = tvm.nd.array(qparam.weight)
-        if qparam.bias is not None:
-            params[qparam.bias_var.name_hint] = tvm.nd.array(qparam.bias)
-
-
-def inline_input_quant_params_for_fx(graph, params, param_debug_name_map):
-    """
-    Canonicalize input scale and zero point access for FX-quantized graphs.
-    We expect input qparams to aten::quantize_per_tensor to be prim::Constant, but that's
-    not the case for FX-based quantized models as shown below.
-    We replace prim::GetAttr with prim::Constant so that FX-based quantized models can be
-    converted in the same way as eager-mode based quantized models.
-
-    Before:
-    %pan_input_zero_point_1 : Tensor = prim::GetAttr[name="pan_input_zero_point_1"](%backbone)
-    %pan_input_scale_1 : Tensor = prim::GetAttr[name="pan_input_scale_1"](%backbone)
-    ...
-    %quantize_per_tensor_2 ... = aten::quantize_per_tensor(...,
-                                       %pan_input_scale_1, %pan_input_zero_point_1, ...)
-
-    After:
-    %2402 : int = prim::Constant[value=0]()
-    %2403 : float = prim::Constant[value=1.]()
-    %quantize_per_tensor_2 ...  = aten::quantize_per_tensor(..., %2403, %2402, ...)
-    """
-    # pylint: disable=c-extension-no-member
-    import torch
-
-    for node in graph.findAllNodes("prim::GetAttr", recurse=True):
-        out_name = node.output().debugName()
-
-        if "_scale" in out_name or "_zero_point" in out_name:
-            full_attr = param_debug_name_map[out_name]
-            assert full_attr in params, f"{full_attr} not found in param dict."
-            param_np = params[full_attr].numpy()
-            new_const_node = graph.create("prim::Constant")
-            new_const_node.insertBefore(node)
-
-            if "_scale" in out_name:
-                new_const_node.f_("value", param_np)
-                new_const_node.output().setType(torch._C.FloatType.get())
-            else:
-                new_const_node.i_("value", param_np.item())
-                new_const_node.output().setType(torch._C.IntType.get())
-
-            node.replaceAllUsesWith(new_const_node)
-
-
-def apply_with_upcast(data, func):
-    inp = _op.cast(data, dtype="int32")
-    out = func(inp)
-    return _op.cast(out, "uint8")
-
-
-def apply_with_fp32_fallback(data, input_scale, input_zero_point, func_fp32):
-    dequantized = relay.qnn.op.dequantize(data, input_scale, input_zero_point)
-    out = func_fp32(dequantized)
-    return relay.qnn.op.quantize(out, input_scale, input_zero_point, out_dtype="uint8", axis=1)
-
-
-def quantized_relu(data, input_zero_point):
-    # refer to aten/src/ATen/native/quantized/cpu/qrelu.cpp
-    zp = _op.cast(input_zero_point, dtype="uint8")
-    return _op.tensor.maximum(data, zp)
-
-
-def quantized_sigmoid(inputs):
-    data = inputs[0]
-    output_scale = _expr.const(inputs[1])
-    output_zero_point = _expr.const(inputs[2])
-    input_scale = _expr.const(inputs[3])
-    input_zero_point = _expr.const(inputs[4])
-    return relay.qnn.op.sigmoid(
-        data, input_scale, input_zero_point, output_scale, output_zero_point
-    )
-
-
-def _quantize_per_tensor():
-    def _impl(inputs, _):
-        dim = len(infer_shape(inputs[0]))
-        if dim > 1:
-            axis = 1
-        else:
-            axis = 0
-
-        return relay.qnn.op.quantize(
-            inputs[0], _expr.const(inputs[1]), _expr.const(inputs[2]), out_dtype="uint8", axis=axis
-        )
-
-    return _impl
-
-
-def _dequantize():
-    def _impl(inputs, _):
-        assert len(inputs) == 3, "Input quant params not found in op inputs"
-        inp_scale = _expr.const(inputs[1])
-        inp_zero_point = _expr.const(inputs[2])
-        return relay.qnn.op.dequantize(inputs[0], inp_scale, inp_zero_point)
-
-    return _impl
-
-
-def _get_numpy(relay_const_scalar):
-    return relay_const_scalar.data.numpy()
-
-
-def _get_scalar(relay_const_scalar):
-    return _get_numpy(relay_const_scalar).item(0)
-
-
-def _do_bias_and_requantize(
-    output, bias, input_scale, weight_scale, output_scale, output_zero_point, with_relu
-):
-    """Output processing for conv and linear"""
-    # this is a vector for per channel case
-    requant_input_scale = _expr.const(_get_numpy(input_scale) * _get_numpy(weight_scale))
-    # Torch does bias add and requanize scale in fp32
-    # refer to third_party/fbgemm/include/fbgemm/OutputProcessing-inl.h
-    # Instead, we do bias add in int32 and use qnn requantize, which needs
-    # integer input.
-    # We observed no loss in accuracy in doing this way, and it is better
-    # for tvm because bias quantization can be done at compile time
-    # Instead, the torch way requires rounding of activation at runtime
-
-    if bias is not None:
-        requantize_input = _op.nn.bias_add(output, bias)
-    else:
-        requantize_input = output
-
-    requantized = relay.qnn.op.requantize(
-        requantize_input,
-        requant_input_scale,
-        relay.const(0, "int32"),
-        output_scale,
-        output_zero_point,
-        out_dtype="int32",
-        axis=1,
-    )
-    clip_min = 0
-    if with_relu:
-        clip_min = _get_scalar(output_zero_point)
-
-    clip = _op.tensor.clip(requantized, clip_min, 255.0)
-    return _op.cast(clip, dtype="uint8")
-
-
-def _quantized_conv2d(with_relu=False):
-    def _impl(inputs, _):
-        # refer to src/ATen/native/quantized/cpu/qconv.cpp
-        # inputs[0]: input tensor
-        # inputs[1]: (weight, scale, zero_point, bias)
-        # inputs[2-5]: stride, padding, dilation, groups
-        # inputs[6]: output_scale
-        # inputs[7]: output_zero_point
-        # inputs[8]: input_scale (added manually by frontend)
-        # inputs[9]: input_zero_point (added manually by frontend)
-        conv_params = inputs[1]
-        weight = conv_params[0]
-        weight_scale = conv_params[1]
-        weight_zero_point = conv_params[2]
-        bias = conv_params[3]
-
-        if len(conv_params) > 4:
-            # Torch 1.6 or newer case
-            strides = conv_params[4]
-            padding = conv_params[5]
-            dilation = conv_params[6]
-            groups = conv_params[7]
-
-            output_scale = _expr.const(inputs[2])
-            output_zero_point = _expr.const(inputs[3])
-
-            assert len(inputs) == 6, "Input quant params not found in op inputs"
-
-            # These are manually added by add_input_quant_params_to_op_inputs above
-            # In torch, they are retrieved from QTensor data structure at runtime
-            input_scale = _expr.const(inputs[4])
-            input_zero_point = _expr.const(inputs[5])
-        else:
-            strides = inputs[2]
-            padding = inputs[3]
-            dilation = inputs[4]
-            groups = inputs[5]
-            output_scale = _expr.const(inputs[6])
-            output_zero_point = _expr.const(inputs[7])
-
-            assert len(inputs) == 10, "Input quant params not found in op inputs"
-
-            input_scale = _expr.const(inputs[8])
-            input_zero_point = _expr.const(inputs[9])
-
-        weight_shape = infer_shape(weight)
-        kernel_size = (weight_shape[2], weight_shape[3])
-        out_channels = weight_shape[0]
-
-        if padding[0] != 0 or padding[1] != 0:
-            pad_val = _get_scalar(input_zero_point)
-            inp = _op.nn.pad(
-                inputs[0],
-                pad_width=((0, 0), (0, 0), (padding[0], padding[0]), (padding[1], padding[1])),
-                pad_value=float(pad_val),
-            )
-        else:
-            inp = inputs[0]
-
-        # padding is (0, 0) because we did explicit pad op with
-        # pad value being zero point above
-        conv_out = relay.qnn.op.conv2d(
-            inp,
-            weight,
-            input_zero_point,
-            weight_zero_point,
-            input_scale,
-            weight_scale,
-            kernel_size=kernel_size,
-            dilation=dilation,
-            strides=strides,
-            padding=(0, 0),
-            groups=groups,
-            channels=out_channels,
-        )
-
-        return _do_bias_and_requantize(
-            conv_out, bias, input_scale, weight_scale, output_scale, output_zero_point, with_relu
-        )
-
-    return _impl
-
-
-def _linear(with_relu=False):
-    # similar to conv
-    def _impl(inputs, _):
-        weight = inputs[1][0]
-        weight_scale = inputs[1][1]
-        weight_zero_point = inputs[1][2]
-        output_scale = _expr.const(inputs[2])
-        output_zero_point = _expr.const(inputs[3])
-        assert len(inputs) == 6, "Input quant params not found in op inputs"
-        # Manually added by add_input_quant_params_to_op_inputs above
-        input_scale = _expr.const(inputs[4])
-        input_zero_point = _expr.const(inputs[5])
-
-        weight_shape = infer_shape(weight)
-        dense = relay.qnn.op.dense(
-            inputs[0],
-            weight,
-            input_zero_point,
-            weight_zero_point,
-            input_scale,
-            weight_scale,
-            units=weight_shape[0],
-        )
-        bias_var = inputs[1][3]
-
-        return _do_bias_and_requantize(
-            dense, bias_var, input_scale, weight_scale, output_scale, output_zero_point, with_relu
-        )
-
-    return _impl
-
-
-def _binop(relay_op, with_relu=False, fp32_piggy_back=False):
-    def qnn_impl(
-        lhs,
-        rhs,
-        input_scale_lhs,
-        input_zero_point_lhs,
-        input_scale_rhs,
-        input_zero_point_rhs,
-        output_scale,
-        output_zero_point,
-    ):
-        qnn_out = relay_op(
-            lhs,
-            rhs,
-            input_scale_lhs,
-            input_zero_point_lhs,
-            input_scale_rhs,
-            input_zero_point_rhs,
-            output_scale,
-            output_zero_point,
-        )
-        if with_relu:
-            clip_min = _get_scalar(output_zero_point)
-            return _op.tensor.clip(qnn_out, clip_min, 255)
-        return qnn_out
-
-    # refer to aten/src/ATen/native/quantized/cpu/{qadd, qmul}.cpp
-    # they piggy backs to fp32 math by dequantize -> fp32 math -> quantize
-    def torch_impl(
-        lhs,
-        rhs,
-        input_scale_lhs,
-        input_zero_point_lhs,
-        input_scale_rhs,
-        input_zero_point_rhs,
-        output_scale,
-        output_zero_point,
-    ):
-        if isinstance(lhs, _expr.Call) and lhs.op.name == "qnn.quantize":
-            lhs = lhs.args[0]
-        else:
-            lhs = relay.qnn.op.dequantize(lhs, input_scale_lhs, input_zero_point_lhs)
-
-        if isinstance(rhs, _expr.Call) and rhs.op.name == "qnn.quantize":
-            rhs = rhs.args[0]
-        else:
-            rhs = relay.qnn.op.dequantize(rhs, input_scale_rhs, input_zero_point_rhs)
-        fp32_out = relay_op(lhs, rhs)
-
-        if with_relu:
-            fp32_out = _op.nn.relu(fp32_out)
-
-        return relay.qnn.op.quantize(
-            fp32_out, output_scale, output_zero_point, axis=-1, out_dtype="uint8"
-        )
-
-    def _impl(inputs, _):
-        lhs = inputs[0]
-        rhs = inputs[1]
-        output_scale = _expr.const(inputs[2])
-        output_zero_point = _expr.const(inputs[3])
-        assert len(inputs) == 8, "Input quant params not found in op inputs"
-        # Manually added by add_input_quant_params_to_op_inputs above
-        input_scale_lhs = _expr.const(inputs[4])
-        input_zero_point_lhs = _expr.const(inputs[5])
-        input_scale_rhs = _expr.const(inputs[6])
-        input_zero_point_rhs = _expr.const(inputs[7])
-
-        if fp32_piggy_back:
-            logger.info("Piggy backing to FP32 op (PyTorch way)")
-            return torch_impl(
-                lhs,
-                rhs,
-                input_scale_lhs,
-                input_zero_point_lhs,
-                input_scale_rhs,
-                input_zero_point_rhs,
-                output_scale,
-                output_zero_point,
-            )
-
-        return qnn_impl(
-            lhs,
-            rhs,
-            input_scale_lhs,
-            input_zero_point_lhs,
-            input_scale_rhs,
-            input_zero_point_rhs,
-            output_scale,
-            output_zero_point,
-        )
-
-    return _impl
-
-
-def _cat(fp32_piggy_back=False):
-    # refer to aten/src/ATen/native/quantized/cpu/qconcat.cpp
-    # for concat they also piggy backs to fp32(!)
-    # dequantize -> fp32 math -> quantize
-    def torch_impl(inputs, input_scales, input_zero_points, output_scale, output_zero_point, axis):
-        dequantized = []
-        for inp, inp_scale, inp_zp in zip(inputs, input_scales, input_zero_points):
-            dequantized.append(relay.qnn.op.dequantize(inp, inp_scale, inp_zp))
-
-        concat = _op.tensor.concatenate(dequantized, axis=axis)
-        return relay.qnn.op.quantize(
-            concat, output_scale, output_zero_point, axis=axis, out_dtype="uint8"
-        )
-
-    def _impl(inputs, _):
-        axis = inputs[1]
-        output_scale = _expr.const(inputs[2])
-        output_zero_point = _expr.const(inputs[3])
-        num_inputs = (len(inputs) - 4) // 2
-
-        input_scales = []
-        input_zero_points = []
-
-        for i in range(0, num_inputs):
-            input_scales.append(_expr.const(inputs[4 + i * 2]))
-            input_zero_points.append(_expr.const(inputs[4 + i * 2 + 1]))
-
-        if fp32_piggy_back:
-            return torch_impl(
-                inputs[0], input_scales, input_zero_points, output_scale, output_zero_point, axis
-            )
-
-        return relay.qnn.op.concatenate(
-            inputs[0], input_scales, input_zero_points, output_scale, output_zero_point, axis
-        )
-
-    return _impl
-
-
-def _add_scalar():
-    # this is used for mobilenet v3
-    def _impl(inputs, _):
-        # refer to aten/src/ATen/native/quantized/cpu/qadd.cpp
-        assert len(inputs) == 6, "Input quant params not found in op inputs"
-        s = inputs[4]
-        z = inputs[5]
-        c = inputs[1]
-        c_q = round(c / s)
-        q_min = 0
-        q_max = 255
-
-        # math for calculating output scale and zp are already done
-        # during _add_output_quant_params_to_scalar_op above
-        out_scale = _expr.const(inputs[2])
-        out_zp = _expr.const(inputs[3])
-
-        if q_min > z - c_q or q_max < z - c_q:
-            # TODO(masahi): Replace this with integer only compute
-            dequant = relay.qnn.op.dequantize(inputs[0], _expr.const(s), _expr.const(z))
-            dequantized_add = _op.tensor.add(dequant, _expr.const(c_q * s))
-            return relay.qnn.op.quantize(
-                dequantized_add, out_scale, out_zp, axis=1, out_dtype="uint8"
-            )
-        # only scale change
-        return inputs[0]
-
-    return _impl
-
-
-def quantize_scalar(data, scale, zero_point):
-    # used to quantize 6., in mobilenet v3
-    transformed = zero_point + data / scale
-    return max(0, min(round(transformed), 255))
-
-
-def _relu6():
-    # refer to src/ATen/native/quantized/cpu/qrelu.cpp
-    def _impl(inputs, _):
-        assert len(inputs) == 4, "Input quant params not found in op inputs"
-        input_scale = inputs[2]
-        input_zero_point = inputs[3]
-        six = quantize_scalar(6.0, input_scale, input_zero_point)
-        return _op.tensor.clip(inputs[0], input_zero_point, six)
-
-    return _impl
-
-
-def _leaky_relu(fp32_piggy_back=False):
-    # refer to src/ATen/native/quantized/cpu/qrelu.cpp
-    def _impl_fp32(inputs, _):
-        alpha = inputs[1]
-        output_scale = _expr.const(inputs[3])
-        output_zero_point = _expr.const(inputs[4])
-        input_scale = _expr.const(inputs[5])
-        input_zero_point = _expr.const(inputs[6])
-        dequant = relay.qnn.op.dequantize(inputs[0], input_scale, input_zero_point)
-        dequantized = _op.nn.leaky_relu(dequant, alpha)
-        return relay.qnn.op.quantize(
-            dequantized, output_scale, output_zero_point, out_dtype="uint8"
-        )
-
-    def _impl_int8(inputs, _):
-        alpha = inputs[1]
-        output_scale = _expr.const(inputs[3])
-        output_zero_point = _expr.const(inputs[4])
-        input_scale = _expr.const(inputs[5])
-        input_zero_point = _expr.const(inputs[6])
-        return relay.qnn.op.leaky_relu(
-            inputs[0], alpha, input_scale, input_zero_point, output_scale, output_zero_point
-        )
-
-    def _impl(inputs, _):
-        assert len(inputs) == 7, "Input quant params not found in op inputs"
-        if fp32_piggy_back:
-            return _impl_fp32(inputs, _)
-        return _impl_int8(inputs, _)
-
-    return _impl
-
-
-def _mul_scalar():
-    # this is used for mobilenet v3
-    def _impl(inputs, _):
-        # refer to aten/src/ATen/native/quantized/cpu/qmul.cpp
-        # math for calculating output scale and zp are already done
-        # during _add_output_quant_params_to_scalar_op above
-        assert len(inputs) == 6, "Input quant params not found in op inputs"
-        other_val = inputs[1]  # scalar
-
-        if other_val > 0.0:
-            # only scale change
-            return inputs[0]
-        if other_val == 0.0:
-            shape = infer_shape(inputs[0])
-            return _op.full(_expr.const(0), shape, dtype="uint8")
-
-        # negative scale case
-        q_min = 0
-        q_max = 255
-        bias = _expr.const(q_max + q_min, dtype="int8")
-        int8 = bias - _op.cast(inputs[0], "int8")
-        return _op.cast(int8, "uint8")
-
-    return _impl
-
-
-def _hswish(fp32_piggy_back=False):
-    def _impl_fp32(inputs):
-        # refer to src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
-        # They fallback to fp32
-        def relu6(x):
-            return _op.tensor.clip(x, 0.0, 6.0)
-
-        def hardsigmoid(x):
-            dtype = "float32"
-            return relu6(x + _expr.const(3.0, dtype=dtype)) / _expr.const(6.0, dtype=dtype)
-
-        output_scale = _expr.const(inputs[1])
-        output_zero_point = _expr.const(inputs[2])
-        input_scale = _expr.const(inputs[3])
-        input_zero_point = _expr.const(inputs[4])
-
-        dequant = relay.qnn.op.dequantize(inputs[0], input_scale, input_zero_point, axis=1)
-        dequantized_hswish = dequant * hardsigmoid(dequant)
-        return relay.qnn.op.quantize(
-            dequantized_hswish, output_scale, output_zero_point, out_dtype="uint8"
-        )
-
-    def _impl_int8(inputs):
-        output_scale = _expr.const(inputs[1])
-        output_zero_point = _expr.const(inputs[2])
-        input_scale = _expr.const(inputs[3])
-        input_zero_point = _expr.const(inputs[4])
-        return relay.qnn.op.hardswish(
-            inputs[0], input_scale, input_zero_point, output_scale, output_zero_point
-        )
-
-    def _impl(inputs, _):
-        assert len(inputs) == 5, "Input quant params not found in op inputs"
-        if fp32_piggy_back:
-            return _impl_fp32(inputs)
-        return _impl_int8(inputs)
-
-    return _impl
-
-
-def _linear_dynamic():
-    def _calculate_qparam(inp):
-        # reference ATen/native/quantized/cpu/qlinear_dynamic.cpp
-        # ChooseQuantizationParams function
-        mn = _op.min(inp)
-        mx = _op.max(inp)
-
-        # Ensure that the interval contains 0
-        mn = _op.minimum(mn, _op.const(0.0, dtype="float32"))
-        mx = _op.maximum(mx, _op.const(0.0, dtype="float32"))
-
-        qmax = 255
-
-        # reduce_range became True in v1.6
-        if is_version_greater_than("1.5.1"):
-            qmax = 127
-
-        scale = (mx - mn) / _expr.const(qmax, dtype="float32")
-
-        zero_point_from_min = -(mn / scale)
-        zero_point = _op.cast(_op.round(_op.clip(zero_point_from_min, 0.0, qmax)), "int32")
-
-        return scale, zero_point
-
-    def _impl(inputs, _):
-        weight = inputs[1][0]
-        weight_scale = inputs[1][1]
-        weight_zero_point = inputs[1][2]
-
-        inp = inputs[0]
-
-        input_scale, input_zero_point = _calculate_qparam(inp)
-        qinp = relay.qnn.op.quantize(inp, input_scale, input_zero_point, out_dtype="uint8")
-
-        data_shape = infer_shape(inp)
-
-        if len(data_shape) > 2:
-            qinp = _op.reverse_reshape(qinp, [-1, 0])
-
-        weight_shape = infer_shape(weight)
-        units = weight_shape[0]
-        dense = relay.qnn.op.dense(
-            qinp,
-            weight,
-            input_zero_point,
-            weight_zero_point,
-            input_scale,
-            weight_scale,
-            units=units,
-        )
-        bias_var = inputs[1][3]
-
-        dequant_scale = input_scale * weight_scale
-        dense_out = relay.qnn.op.dequantize(
-            dense, dequant_scale, input_zero_point=relay.const(0, "int32"), axis=1
-        )
-
-        if len(data_shape) > 2:
-            new_shape = list(data_shape[:-1])
-            new_shape.append(units)
-            dense_out = _op.reshape(dense_out, new_shape)
-
-        if bias_var is not None:
-            return dense_out + bias_var
-
-        return dense_out
-
-    return _impl
-
-
-def _quantized_conv_transpose2d(with_relu=False):
-    def _impl(inputs, _):
-        # Refer to aten/src/ATen/native/quantized/cpu/qconv.cpp
-        # Supported in Torch 1.7 or newer
-        conv_params = inputs[1]
-        weight = conv_params[0]
-        weight_scale = conv_params[1]
-        weight_zero_point = conv_params[2]
-        bias = conv_params[3]
-
-        strides = conv_params[4]
-        padding = conv_params[5]
-        dilation = conv_params[6]
-        groups = conv_params[7]
-        output_padding = conv_params[8]
-
-        output_scale = _expr.const(inputs[2])
-        output_zero_point = _expr.const(inputs[3])
-
-        assert len(inputs) == 6, "Input quant params not found in op inputs"
-
-        # These are manually added by add_input_quant_params_to_op_inputs above
-        # In torch, they are retrieved from QTensor data structure at runtime
-        input_scale = _expr.const(inputs[4])
-        input_zero_point = _expr.const(inputs[5])
-
-        weight_shape = list(infer_shape(weight))
-
-        kernel_size = (weight_shape[2], weight_shape[3])
-        out_channels = weight_shape[1]
-
-        conv_out = relay.qnn.op.conv2d_transpose(
-            inputs[0],
-            weight,
-            input_zero_point,
-            weight_zero_point,
-            input_scale,
-            weight_scale,
-            kernel_size=kernel_size,
-            dilation=dilation,
-            strides=strides,
-            padding=padding,
-            groups=groups,
-            channels=out_channels,
-            output_padding=output_padding,
-            out_dtype="int32",
-            kernel_layout="IOHW",
-        )
-
-        return _do_bias_and_requantize(
-            conv_out, bias, input_scale, weight_scale, output_scale, output_zero_point, with_relu
-        )
-
-    return _impl
-
-
-convert_map = {
-    "aten::quantize_per_tensor": _quantize_per_tensor(),
-    "quantized::conv2d_relu": _quantized_conv2d(with_relu=True),
-    "aten::dequantize": _dequantize(),
-    "quantized::conv2d": _quantized_conv2d(),
-    "quantized::add_relu": _binop(relay.qnn.op.add, with_relu=True),
-    "quantized::add": _binop(relay.qnn.op.add),
-    "quantized::mul_relu": _binop(relay.qnn.op.mul, with_relu=True),
-    "quantized::mul": _binop(relay.qnn.op.mul),
-    "quantized::linear": _linear(),
-    "quantized::linear_relu": _linear(with_relu=True),
-    "quantized::cat": _cat(),
-    "quantized::add_scalar": _add_scalar(),
-    "quantized::mul_scalar": _mul_scalar(),
-    "quantized::relu6": _relu6(),
-    "quantized::leaky_relu": _leaky_relu(),
-    "quantized::linear_dynamic": _linear_dynamic(),
-    "quantized::hardswish": _hswish(fp32_piggy_back=False),
-    "quantized::conv_transpose2d": _quantized_conv_transpose2d(),
-}
diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
deleted file mode 100644
index e12542f8e276..000000000000
--- a/python/tvm/relay/frontend/tensorflow.py
+++ /dev/null
@@ -1,1264 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition, broad-except
-# pylint: disable=import-outside-toplevel, redefined-builtin
-"""TF: Tensorflow frontend."""
-import warnings
-from collections import defaultdict
-
-# Numpy support
-import numpy as np
-import tvm
-
-from tvm.ir import IRModule
-from tvm.relay.prelude import Prelude
-from tvm.relay.transform import InferType
-
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from ..ty import Any
-from ..expr_functor import ExprMutator, ExprVisitor
-from .common import get_relay_op
-from .common import infer_type as _infer_type
-from .common import infer_shape as _infer_shape
-from .common import infer_value as _infer_value
-from .common import set_span
-
-from .tensorflow_ops import _convert_map
-from .tensorflow_ops import _need_prelude_for_shape_inference
-from .tensorflow_ops import _get_more_static_shape
-
-__all__ = ["from_tensorflow"]
-
-# The default configurations of Relay TensorFlow frontend.
-TF_DEFAULT_CONFIGS = {
-    # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces
-    # unnecessary overhead in weight transpose. Change this flag to False to directly convert to
-    # `nn.matmul` to get rid of the overhead.
-    # However, please note that `nn.matmul` is in experimental so it may have some performance
-    # issues.
-    "use_dense": True,
-    # By default, TVM converts `tf.batch_matmul` to `transpose(weight) + nn.batch_matmul_NT`.
-    # Change this flag to False to directly convert to `nn.batch_matmul`.
-    # Note that `nn.batch_matmul` with format other than NT is in experimental, it may have some
-    # performance issues.
-    "use_nt_batch_matmul": True,
-}
-
-# compatible operators that do NOT require any conversion.
-_identity_list = []
-
-# Operators that get pruned away when the complete graph is frozen.
-# These operators are not needed for inference.
-_freezed_graph_pruned_op_list = [
-    "ReadVariableOp",
-    "ResourceGather",
-    "Variable",
-    "VariableV2",
-    "VarHandleOp",
-    "Assign",
-    "AssignVariableOp",
-]
-
-# An internal list to contain all the control flow primitives used in Tensorflow
-# 1.x.
-_control_flow_nodes = ["Merge", "Switch", "NextIteration", "Exit", "Enter", "LoopCond"]
-
-# A map to record tensor array write ops and input ta/tensor indices
-# Value is (index of tensor array, index of written node)
-_tensor_array_write_ops = {
-    "TensorArrayWrite": (3, 2),
-    "TensorArrayScatter": (0, 2),
-    "TensorArraySplit": (0, 1),
-}
-
-
-def is_tensor_array_constuctor(tf_node):
-    """Check whether is tensor array constructor node."""
-    is_ta = False
-    ta_start = "TensorArrayV"
-    if tf_node.op.startswith(ta_start):
-        is_ta = tf_node.op[len(ta_start)].isnumeric()
-    return is_ta
-
-
-def find_parent_loop_name(node_name, while_loop_name_set):
-    """Find name of direct parent while loop."""
-    ploop_name = ""
-    name_prefix = node_name.rsplit("/", 1)[0]
-    if name_prefix.startswith("^"):
-        name_prefix = name_prefix[1:]
-    for lname in while_loop_name_set:
-        if name_prefix.startswith(lname) and len(ploop_name) < len(lname):
-            ploop_name = lname
-
-    if len(ploop_name) == 0:
-        ploop_name = name_prefix
-
-    return ploop_name
-
-
-def _in_while_loop(control_flow_node_map, op_name):
-    """
-    Check if a given control flow operator is part of a while loop execution
-    frame. This is based on the fact that there is only one occurrence of
-    `LoopCond` for a loop execution frame and it is only presented in the loop
-    construct.
-
-    Parameters
-    ----------
-    control_flow_node_map : Dict[str, Set[str]]
-        A dictionary contains the unique control flow execution frame name to
-        a set of primitive operators mapping.
-
-    op_name : str
-        The name of a control flow primitive.
-
-    Returns
-    -------
-    ret : bool
-        Return true if the operator is in a while loop execution frame,
-    otherwise, return false.
-    """
-    return op_name in control_flow_node_map and "LoopCond" in control_flow_node_map[op_name]
-
-
-class RewriteSubgraph(ExprMutator):
-    """
-    A helper class to rewrite expr in while loop function to variable.
-
-    Parameters
-    ----------
-    rewrite_map : Dict[expr, expr]
-        A dictionary contains a set of expr to var mapping.
-    """
-
-    def __init__(self, rewrite_map):
-        ExprMutator.__init__(self)
-        self.rewrite_map = rewrite_map
-
-    def visit(self, expr):
-        if expr in self.rewrite_map:
-            return self.rewrite_map[expr]
-        return super().visit(expr)
-
-
-def rewrite_subgraph(expr, rewrites):
-    """Rewrite loop body."""
-    return RewriteSubgraph(rewrites).visit(expr)
-
-
-class Branch:
-    """A class contains the components that are used to build up a Relay if
-    node.
-
-    Parameters
-    ----------
-    cond : tvm.relay.Expr
-        The condition of a if node.
-
-    true_branch : tvm.relay.Expr
-        The body of the true branch of a if expression.
-
-    false_branch: tvm.relay.Expr
-        The body of the false branch of a if expression.
-
-    _if : tvm.relay.Expr
-        An internal variable indicates where an if expression is already created
-        for a matched TF condition construct.
-
-    Examples
-    --------
-    The following is a cond statement written in TensorFlow:
-
-    .. code-block:: python
-
-        def vanilla_cond():
-            i = tf.constant(1)
-            j = tf.constant(4)
-
-             def f1():
-                return tf.multiply(1, 17)
-
-             def f2():
-                return tf.add(4, 23)
-            r = tf.cond(tf.less(i, j), f1, f2)
-
-    This condition statement should be converted into Relay in the following
-    form:
-
-    .. code-block:: python
-
-        fn (%Const: Tensor[(1,), int32],
-            %Const_1: Tensor[(1,), int32],
-            %cond/Mul/x: Tensor[(1,), int32],
-            %cond/Mul/y: Tensor[(1,), int32],
-            %cond/Add/x: Tensor[(1,), int32],
-            %cond/Add/y: Tensor[(1,), int32]) {
-          %0 = less(%Const, %Const_1) # ty=Tensor[(1,), bool]
-          %1 = min(%0)
-          if (%1) {
-            %2 = multiply(%cond/Mul/x, %cond/Mul/y)
-            %2
-          }  else {
-            %3 = add(%cond/Add/x, %cond/Add/y)
-            %3
-          }
-        }
-    """
-
-    def __init__(self):
-        self._if = None
-        self.cond = None
-        self.true_branch = None
-        self.false_branch = None
-
-    def _if_node(self):
-        """An internal API to create a relay if node from the matched TF
-        condition construct.
-        """
-        # `cond`  returns a tensor that contains boolean values. We add a `min`
-        # operator to checks if there is any false value. If so, this condition
-        # doesn't not hold.
-        cond = tvm.relay.op.min(self.cond)
-        return tvm.relay.If(cond, self.true_branch, self.false_branch)
-
-    def if_node(self):
-        """Create an tvm.relay.If node if it hasn't been created yet."""
-        if self._if is None:
-            self._if = self._if_node()
-        return self._if
-
-
-class VarChecker(ExprVisitor):
-    """Check whether a Variable is used in loop body.
-
-    Parameters
-    ----------
-    var : relay.expr.Var
-        Relay Variable to be checked.
-    """
-
-    def __init__(self, var):
-        ExprVisitor.__init__(self)
-        self._var = var
-        self.used = False
-
-    def visit(self, expr):
-        if self._var == expr:
-            self.used = True
-        super().visit(expr)
-
-
-class Loop:
-    """
-    A class contains the components that are used to build up a Relay
-    recursive call.
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        Module for current parsed IR.
-
-    loop_name : str
-        Name prefix of while loop in TensorFlow graph.
-
-    lvar2expr : dict from str to dict from Relay.expr.Var to Relay.expr
-        A dictionary recording all loop vars and corresponding
-        relay expression.
-
-    Examples
-    --------
-    The following is a vanilla loop from TensorFlow:
-    .. code-block:: python
-        i = tf.constant(0)
-        c = lambda i: tf.less(i, 10)
-        b = lambda i: tf.add(i, 1)
-        r = tf.while_loop(c, b, [i])
-    It will be converted to the following recursive call in Relay:
-    .. code-block:: python
-        fn (%while/Less/y: Tensor[(1,), int32],
-            %while/Add/y: Tensor[(1,), int32],
-            %Const: Tensor[(1,), int32]) {
-          %0 = fn(%loop_var0: Tensor[(1,), int32]) {
-            %1 = less(%loop_var0, %while/Less/y)
-            %2 = min(%1)
-            if (%2) {
-              %3 = add(%loop_var0, %while/Add/y)
-              free_var %while_loop
-              %4 = %while_loop(%3)
-              %4
-            }    else {
-              %5 = (%loop_var0,)
-              %5
-            }
-          }
-          let %while_loop1 = %0
-          %6 = %while_loop1(%Const)
-          %6
-        }
-    """
-
-    def __init__(self, mod, loop_name, lvar2expr):
-        self.cond = None
-        self.body = []
-        self._loop = None
-        self._mod = mod
-        self._loop_name = loop_name
-        self._lvar2expr = lvar2expr
-        self.loop_vars = []
-
-        self.aligned = False
-
-    def _while_loop(self):
-        """An internal API to create a Relay recursive call for a matched TF
-        `while_loop` construct.
-        """
-        bind_map = {}
-        wl = set_span(tvm.relay.var("while_loop"), self._loop_name)
-        sb = tvm.relay.scope_builder.ScopeBuilder()
-
-        lv_list = []
-        expr_list = []
-        extra_vars = []
-
-        for i, lv in enumerate(self.loop_vars):
-            if self._loop_name not in self._lvar2expr:
-                self._lvar2expr[self._loop_name] = {}
-
-            # Handle the case when loop var is not properly lifted.
-            # This can happen when loop var node name is set accidentally
-            # beginning with loop name.
-            if lv not in self._lvar2expr[self._loop_name]:
-                var_name = f"{self._loop_name}_loop_var_{i}"
-                var_type = _infer_type(lv, self._mod).checked_type
-                loop_var = set_span(tvm.relay.var(var_name, type_annotation=var_type), var_name)
-                self._lvar2expr[self._loop_name][loop_var] = lv
-                bind_map[lv] = loop_var
-                self.loop_vars[i] = loop_var
-                lv = loop_var
-
-            lv_list.append(lv)
-            expr_list.append(self._lvar2expr[self._loop_name][lv])
-
-        if bind_map:
-            self.cond = rewrite_subgraph(self.cond, bind_map)
-            self.body = [rewrite_subgraph(b, bind_map) for b in self.body]
-
-        cond = set_span(tvm.relay.op.min(self.cond), self.cond.span)
-
-        for lv, exp in self._lvar2expr[self._loop_name].items():
-            if lv not in self.loop_vars:
-                var_checker = VarChecker(lv)
-                for bd in self.body + [cond]:
-                    var_checker.visit(bd)
-                    if var_checker.used:
-                        lv_list.append(lv)
-                        expr_list.append(exp)
-                        extra_vars.append(lv)
-                        break
-
-        with sb.if_scope(cond):
-            sb.ret(wl(*list(self.body + extra_vars)))
-        with sb.else_scope():
-            sb.ret(tvm.relay.Tuple(lv_list))
-
-        loop_fn = tvm.relay.Function(lv_list, sb.get())
-        sb = tvm.relay.scope_builder.ScopeBuilder()
-        sb.let(wl, loop_fn)
-        loop_ret = wl(*expr_list)
-
-        sb.ret(loop_ret)
-        ret = sb.get()
-        return ret
-
-    def while_loop(self):
-        """Instantiate a while loop if it has not been created yet."""
-        if self._loop is None:
-            self._loop = self._while_loop()
-            return self._loop
-        return self._loop
-
-
-class GraphProto(object):
-    """A helper class for handling relay graph copying from Tensorflow GraphDef.
-    Definition:
-        https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/graph.proto
-    """
-
-    def __init__(self):
-        self._nodes = {}
-        self._tf_node_map = {}
-        self._params = {}
-        self._input_shapes = {}
-        self._output_shapes = {}
-        self._num_rnn_layer = False
-        self._input_shapes = {}
-        self._loops = {}
-        self._branches = {}
-        self._mod = IRModule({})
-        self._prelude = Prelude(self._mod)
-        self._control_flow_node_map = defaultdict(set)
-        self._loop_body_order = {}
-        self._loop_var_order = {}
-        self._lvar2expr = {}
-        self._lname_map = {}
-        self._sorted_cf_node_names = []
-        self._while_loop_name_set = set()
-        self._main_graph_proto = self
-        self._tensor_array_shapes = {}
-        self._tensor_array_shape_nodes = {}
-
-    def _get_relay_func(self, graph, layout="NHWC", shape=None, outputs=None):
-        """Construct relay nodes from tensorflow graph definition - GraphDef.
-
-        Follow the tensorflow graph definition to parse and convert it to Relay.
-        Some of the assumptions listed below.
-
-            -> All Placeholders are considered as graph input.
-            -> All Const nodes are params.
-            -> Last node is assumed as graph output.
-            -> _output_shapes : Graph should be frozen with add_shapes=True.
-                                Or user can pass input shape dictionary optionally.
-            -> DecodeJpeg, ResizeBilinear: These are dummy operators.
-                                           Hence user should handle preprocessing outside.
-            -> CheckNumerics: No implementation as of now for this.
-                              Just copies input to output.
-
-        Parameters
-        ----------
-        graph : tensorflow graph definition object
-            The loaded tensorflow GraphDef
-
-        layout : target layout to be used (Optional)
-            NCHW only supported now to enable NHWC models on GPU.
-
-        shape : Dictionary of input dimensions (Optional)
-            Graph level input shape dictionary.
-
-        outputs : List of output tensor names (Optional)
-            if not specified then the last node is assumed as graph output.
-
-        Returns
-        -------
-        mod : tvm.IRModule
-            The module that optimizations will be performed on.
-
-        params : dict
-            A dict of name: tvm.nd.array pairs, used as pretrained weights
-        """
-        try:
-            from tensorflow.python.framework import tensor_util
-        except ImportError as e:
-            raise ImportError(f"Unable to import tensorflow which is required {e}")
-
-        missing_operators = self._parse_import_prerequisites(graph)
-        control_flow_nodes = []
-        ta_write_nodes = []
-        ta_gather_nodes = []
-        ta_construct_nodes = []
-        self._in_shape = shape
-        self._layout = layout
-        self._graph = graph
-
-        if missing_operators:
-            freezed_ops = [op for op in missing_operators if op in _freezed_graph_pruned_op_list]
-            if freezed_ops:
-                raise Exception(
-                    f"Graph is not frozen. Provide a frozen graph. "
-                    f"Found operators {freezed_ops}"
-                )
-
-            raise NotImplementedError(
-                f"The following operators are not implemented: {missing_operators}"
-            )
-
-        for node in graph.node:
-            node_name_prefix = node.name.rsplit("/", 1)[0]
-            self._control_flow_node_map[node_name_prefix].add(node.op)
-            self._tf_node_map[node.name] = node
-
-            # Parse output_shapes attribute
-            parsed_attr = self._parse_attr(node.attr)
-            if "_output_shapes" in parsed_attr:
-                self._output_shapes[node.name] = [
-                    tensor_util.TensorShapeProtoToList(tshape)
-                    for tshape in parsed_attr["_output_shapes"]
-                ]
-            else:
-                self._output_shapes[node.name] = [None]
-
-            # Parse placeholder and const here since input shape info is required.
-            if node.op == "Placeholder" or node.op == "PlaceholderWithDefault":
-                # Give priority to user argument.
-                if shape and node.name in shape:
-                    self._input_shapes[node.name] = list(shape[node.name])
-                else:
-                    self._input_shapes[node.name] = tensor_util.TensorShapeProtoToList(
-                        node.attr["shape"].shape
-                    )
-                    for idx, dim in enumerate(self._input_shapes[node.name]):
-                        if dim < 0:
-                            self._input_shapes[node.name][idx] = Any()
-
-                self._output_shapes[node.name] = [self._input_shapes[node.name]]
-                attr = self._parse_attr(node.attr)
-                self._nodes[node.name] = [
-                    set_span(
-                        _expr.var(
-                            node.name, shape=self._input_shapes[node.name], dtype=attr["dtype"].name
-                        ),
-                        node.name,
-                    )
-                ]
-
-                # Ignore user's input shape for Non placeholder
-            elif node.op == "Const":
-                tensor_value = node.attr["value"].tensor
-                self._input_shapes[node.name] = tensor_util.TensorShapeProtoToList(
-                    tensor_value.tensor_shape
-                )
-                self._output_shapes[node.name] = [self._input_shapes[node.name]]
-                if shape and node.name in shape:
-                    warnings.warn(
-                        f"Ignore the passed shape. Shape in graphdef "
-                        f"will be used for operator {node.name}."
-                    )
-                for key, value in node.attr.items():
-                    self._parse_param(key, value, node.name, self._in_shape)
-            elif node.op in _control_flow_nodes:
-                # We assume that the direct parent node of Exit is a while loop block
-                if node.op == "Exit":
-                    self._while_loop_name_set.add(node_name_prefix)
-                control_flow_nodes.append(node)
-            elif node.op.startswith("TensorArray"):
-                if is_tensor_array_constuctor(node):
-                    ta_construct_nodes.append(node)
-                else:
-                    for ta_write_name, idx in _tensor_array_write_ops.items():
-                        if node.op.startswith(ta_write_name):
-                            ta_write_nodes.append((node, idx))
-                            break
-                    if node.op.startswith("TensorArrayGather"):
-                        ta_gather_nodes.append(node)
-
-        # Use tensor array gather to infer static tensor array shape
-        for gather_node in ta_gather_nodes:
-            input_ta_name = gather_node.input[0]
-            input_ta_node = self._tf_node_map[input_ta_name]
-            if is_tensor_array_constuctor(input_ta_node):
-                gather_attr = self._parse_attr(gather_node.attr)
-                if "element_shape" not in gather_attr:
-                    continue
-                raw_elem_shape = tensor_util.TensorShapeProtoToList(gather_attr["element_shape"])
-                elem_shape = []
-                for dim in raw_elem_shape:
-                    if dim < 0:
-                        elem_shape.append(Any())
-                    else:
-                        elem_shape.append(int(dim))
-                self._tensor_array_shapes[input_ta_node.name] = elem_shape
-
-        # Fetch node contains static tensor array shape
-        for item in ta_write_nodes:
-            wnode = item[0]
-            ta_idx, inode_idx = item[1]
-
-            stack = [self._tf_node_map[wnode.input[ta_idx].split(":")[0]]]
-            while stack:
-                cnode = stack.pop(0)
-                if not cnode.op.startswith("TensorArray"):
-                    for iname in cnode.input:
-                        stack.append(self._tf_node_map[iname.split(":")[0]])
-                elif cnode.name != wnode.name:
-                    if is_tensor_array_constuctor(cnode):
-                        inode = self._tf_node_map[wnode.input[inode_idx].split(":")[0]]
-                        tn = wnode.input[inode_idx].split(":")
-                        output_index = int(tn[1]) if len(tn) > 1 else 0
-                        self._tensor_array_shape_nodes[cnode.name] = (inode, wnode.op, output_index)
-                    break
-
-        # First, parse all control flow nodes.
-        # Convert tf.cond to Branch and tf.while_loop to Loop.
-        sorted_cf_nodes = []
-        exit_pos_map = {}
-        ordered_prefix = []
-        # Sort control flow nodes to move all Exit nodes to the end
-        # of corresponding while_loop block.
-        for node in control_flow_nodes:
-            loop_name = find_parent_loop_name(node.name, self._while_loop_name_set)
-            if node.op == "Exit":
-                if loop_name not in exit_pos_map:
-                    ordered_prefix.append(loop_name)
-                    exit_pos_map[loop_name] = len(sorted_cf_nodes)
-                sorted_cf_nodes.append(node)
-            elif loop_name in self._while_loop_name_set:
-                if loop_name not in exit_pos_map:
-                    sorted_cf_nodes.append(node)
-                else:
-                    sorted_cf_nodes.insert(exit_pos_map[loop_name], node)
-                    for j in range(ordered_prefix.index(loop_name), len(ordered_prefix)):
-                        exit_pos_map[ordered_prefix[j]] += 1
-            else:
-                sorted_cf_nodes.append(node)
-
-        for node in sorted_cf_nodes:
-            self._sorted_cf_node_names.append(node.name)
-
-        for node in sorted_cf_nodes:
-            self._backtrack_construct(node.name)
-
-        # Second, parse other nodes to re-create TF graph using Relay operators.
-        for node in graph.node:
-            self._backtrack_construct(node.name)
-
-        out = []
-        if outputs is None:
-            last_node = graph.node[-1]
-            op = self._nodes[last_node.name.split(":")[0]]
-            if last_node.op == "Exit":
-                out = [op[0].tuple_value]
-            else:
-                out = op
-        else:
-            for out_name in outputs:
-                if ":" in out_name:
-                    out_name, out_num = out_name.split(":")
-                    out_num = int(out_num)
-                    out.append(self._nodes[out_name][out_num])
-                else:
-                    out.append(self._nodes[out_name][0])
-
-        if isinstance(out, _expr.TupleWrapper):
-            out = out.tuple_value
-        else:
-            out = out[0] if len(out) == 1 else _expr.Tuple(out)
-        fvars = analysis.free_vars(out)
-        func = _function.Function(fvars, out)
-        final_params = {}
-        for fv in fvars:
-            if fv.name_hint in self._params:
-                final_params[fv.name_hint] = self._params[fv.name_hint]
-        self._params = final_params
-        return func
-
-    def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
-        """Wrapper to _get_relay_func which converts Tensorflow graph to Relay function
-        which is used as main function for the Relay module
-        """
-        func = self._get_relay_func(graph, layout=layout, shape=shape, outputs=outputs)
-        self._mod["main"] = func
-        return self._mod, self._params
-
-    def _parse_import_prerequisites(self, graph):
-        """Calculate the named preconditions from TensorFlow `graph`.
-        Return prerequisites for parsing:
-        a. Set of operator names which don't have their mapping in TVM, i.e.
-            which are not supported
-        """
-        missing_operators = set()
-        from tensorflow.python.framework import op_def_registry
-
-        for node in graph.node:
-            getOpDef = (
-                op_def_registry._registered_ops.get
-                if hasattr(op_def_registry, "_registered_ops")
-                else op_def_registry.get
-            )
-            op_def = getOpDef(node.op)
-            if node.op == "Placeholder" or node.op == "PlaceholderWithDefault":
-                pass
-            elif node.op == "Const":
-                pass
-            elif node.op in ["PartitionedCall", "StatefulPartitionedCall"]:
-                pass
-            else:
-                if any([node.op in t for t in [_identity_list, _convert_map, _control_flow_nodes]]):
-                    pass
-                elif op_def is not None and op_def.is_stateful:
-                    missing_operators.add(node.op)
-                else:
-                    missing_operators.add(node.op)
-
-        return missing_operators
-
-    def _parse_param(self, key, value, name, shape):
-        try:
-            from tensorflow.python.framework import tensor_util
-        except ImportError as e:
-            raise ImportError(f"Unable to import tensorflow which is required {e}")
-
-        if key == "value":
-            np_array = tensor_util.MakeNdarray(value.tensor)
-
-            if np_array.dtype == np.dtype(object):
-                # Object types are generally tensorflow DT_STRING (DecodeJpeg op).
-                # Just leave it as placeholder.
-                if shape and name in shape:
-                    var_shape = shape[name]
-                else:
-                    var_shape = tensor_util.TensorShapeProtoToList(value.tensor.tensor_shape)
-                self._nodes[name] = [
-                    set_span(_expr.var(name, shape=var_shape, dtype="uint8"), span=name)
-                ]
-                return
-
-            array_ndim = len(np_array.shape)
-            if array_ndim == 0:
-                self._nodes[name] = [set_span(tvm.relay.const(np_array, np_array.dtype), name)]
-            else:
-                self._params[name] = tvm.nd.array(np_array)
-                self._nodes[name] = [
-                    set_span(
-                        _expr.var(
-                            name, shape=self._params[name].shape, dtype=self._params[name].dtype
-                        ),
-                        name,
-                    )
-                ]
-        else:
-            if key not in ("dtype", "_output_shapes", "_class"):
-                raise NotImplementedError(f"Other attributes for a Const(param) Node {key} ? .")
-
-    def _get_attr(self, buf):
-        """Returns the value of the attr of this buf with the given `name`.
-
-        Args:
-          buf: attrvalue protobuf.
-
-        Returns:
-          The value of the attr, as a Python object.
-
-        Raises:
-          ValueError: If this op does not have an attr with the given `name`.
-        """
-        fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
-
-        x = buf
-
-        ret = []
-
-        try:
-            from tensorflow.python.framework import dtypes
-        except ImportError as e:
-            raise ImportError(f"Unable to import tensorflow which is required {e}")
-
-        # Treat an empty oneof value as an empty list.
-        if not x.WhichOneof("value"):
-            return ret
-        if x.HasField("list"):
-            for f in fields:
-                if getattr(x.list, f):
-                    if f == "type":
-                        ret += [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
-                    else:
-                        ret += list(getattr(x.list, f))
-        else:
-            for f in fields:
-                if x.HasField(f):
-                    if f == "type":
-                        ret = dtypes.as_dtype(getattr(x, f))
-                    else:
-                        ret = getattr(x, f)
-        return ret
-
-    def _parse_attr(self, attr_proto):
-        """Convert a list of AttributeProto to a dict, with names as keys."""
-        attrs = {}
-        for key, value in attr_proto.items():
-            attrs[key] = self._get_attr(value)
-
-        return attrs
-
-    def _convert_control_flow_operator(self, node, inputs, attrs, control_flow_node_map):
-        """
-        Convert the Relay control flow primitive into corresponding component
-        of a Relay control flow construct, i.e. `tf.cond` and `tf.while_loop`
-        are converted in Relay `If` and recusrive call, respectively.
-
-        Parameters
-        ----------
-        node: TensorFlow graph node object.
-            A TensorFlow graph node object.
-
-        inputs : List[tvm.relay.Expr]
-            List of input symbols.
-
-        attrs : Dict[tvm.Attrs]
-            Dict of operator attributes.
-
-        control_flow_node_map : Dict[str, Set[str]]
-            A dictionary contains the execution frame name to primitives
-            mapping.
-
-        Returns
-        -------
-        op : tvm.relay.Expr
-            Converted relay expression.
-        """
-        node_name_prefix = node.name.rsplit("/", 1)[0]
-        plname = find_parent_loop_name(node.name, self._while_loop_name_set)
-        if node.op == "Merge":
-            if _in_while_loop(self._control_flow_node_map, node_name_prefix):
-                op = self._licm_construct(plname, node.input[0])
-                if node_name_prefix not in self._loops:
-                    self._loops[node_name_prefix] = Loop(self._mod, plname, self._lvar2expr)
-            else:
-                if node_name_prefix not in self._branches:
-                    switch_prefix = node_name_prefix + "/Switch"
-                    merge_idx = self._sorted_cf_node_names.index(node.name)
-                    for i in range(merge_idx - 1, -1, -1):
-                        cf_name = self._sorted_cf_node_names[i]
-                        if cf_name.startswith(switch_prefix):
-                            self._backtrack_construct(cf_name)
-                            break
-
-                branch = self._branches[node_name_prefix]
-                false_br = self._licm_construct(plname, node.input[0])
-                true_br = self._licm_construct(plname, node.input[1])
-                branch.true_branch = true_br
-                branch.false_branch = false_br
-                op = branch.if_node()
-                if node_name_prefix not in self._while_loop_name_set:
-                    try:
-                        cond_val = np.all(
-                            _infer_value(branch.cond, self._params, self._mod).numpy()
-                        )
-                        if cond_val:
-                            op = branch.true_branch
-                        else:
-                            op = branch.false_branch
-                    except Exception:
-                        op = branch.if_node()
-        elif node.op == "Exit":
-            loop = self._loops[node_name_prefix]
-
-            # Check whether the order of loop variables aligns
-            # with loop body. If not, create new loop variable list
-            # with correct order.
-            if not loop.aligned:
-                loop_vars = []
-                for i in self._loop_body_order[node_name_prefix]:
-                    for j, k in enumerate(self._loop_var_order[node_name_prefix]):
-                        if k == i:
-                            loop_vars.append(loop.loop_vars[j])
-                loop.loop_vars = loop_vars
-                loop.aligned = True
-            exit_name = node.name.split("/")[-1]
-            if "_" in exit_name:
-                exit_number = int(exit_name[5:])
-            else:
-                exit_number = 0
-            expr = loop.while_loop()
-            body_pos = exit_number
-            for i, j in enumerate(self._loop_body_order[node_name_prefix]):
-                if exit_number == j:
-                    body_pos = i
-                    break
-            op = _expr.TupleGetItem(expr, body_pos)
-        elif node.op == "Enter":
-            op = self._licm_construct(plname, node.input[0])
-        elif node.op == "LoopCond":
-            op = self._licm_construct(plname, node.input[0])
-            self._loops[node_name_prefix].cond = op
-        elif node.op == "Switch":
-            op = self._licm_construct(plname, node.input[0])
-            cond = self._licm_construct(plname, node.input[1])
-            if _in_while_loop(self._control_flow_node_map, node_name_prefix):
-                if node_name_prefix not in self._loop_var_order:
-                    self._loop_var_order[node_name_prefix] = []
-                if node.name.endswith("Switch"):
-                    self._loop_var_order[node_name_prefix].append(0)
-                else:
-                    self._loop_var_order[node_name_prefix].append(
-                        int(node.name.split("Switch_")[-1])
-                    )
-                self._loops[node_name_prefix].loop_vars.append(op)
-            else:
-                if node_name_prefix not in self._branches:
-                    self._branches[node_name_prefix] = Branch()
-                self._branches[node_name_prefix].cond = cond
-        elif node.op == "NextIteration":
-            if node_name_prefix not in self._loop_body_order:
-                self._loop_body_order[node_name_prefix] = []
-            if node.name.endswith("NextIteration"):
-                self._loop_body_order[node_name_prefix].append(0)
-            else:
-                self._loop_body_order[node_name_prefix].append(
-                    int(node.name.split("NextIteration_")[-1])
-                )
-            op = self._licm_construct(plname, node.input[0])
-            self._loops[node_name_prefix].body.append(op)
-        else:
-            raise Exception(f"Cannot identify control flow operator: {node.op}")
-
-        return op
-
-    def _partition_call_operator(self, inputs, attr):
-        """
-        Convert the Relay Partition call ops into Relay Function calls and
-        function definitions from Tensorflow graph library attribute to Relay global
-        functions
-
-        Parameters
-        ----------
-        node: TensorFlow graph node object.
-            A TensorFlow graph node object.
-
-        inputs : List[tvm.relay.Expr]
-            List of input symbols.
-
-        attrs : Dict[tvm.Attrs]
-            Dict of operator attributes.
-
-        Returns
-        -------
-        op : tvm.relay.Expr
-            Converted relay expression.
-        """
-
-        try:
-            from tensorflow.python.framework import function_def_to_graph
-        except ImportError as e:
-            raise ImportError(f"Unable to import tensorflow which is required {e}")
-
-        main_graph_proto = self._main_graph_proto
-        outer_graph_def = main_graph_proto._graph
-
-        node_func_name = attr.get("f").name
-        func = next(
-            (f for f in outer_graph_def.library.function if f.signature.name == node_func_name),
-            None,
-        )
-        if func:
-            devices = set(node.device for node in func.node_def)
-            if len(devices) > 1:
-                raise Exception(
-                    "Found inconsistent Device assignment in the "
-                    "Stateful Partitioned SubGraph. Rejecting "
-                    "the subgraph "
-                )
-            # Convert function definition to graph
-            func_input_shapes = func.attr["_input_shapes"].list.shape
-            subgraph, _ = function_def_to_graph.function_def_to_graph_def(func, func_input_shapes)
-
-            # Computing subgraph's input shape dictionary
-            subgraph_shape_dict, input_expr_dict = {}, {}
-            for f_arg, input in zip(func.signature.input_arg, inputs):
-                input_expr_dict[f_arg.name] = input
-                subgraph_shape_dict[f_arg.name] = _infer_shape(input, main_graph_proto._mod)
-
-            func_name = f"func_{func.signature.name}"
-            try:
-                global_func = main_graph_proto._mod[func_name]
-                sub_func = global_func
-                sub_params = main_graph_proto._params
-            except ValueError:
-                # Construct relay nodes from the subgraph
-                g1 = SubGraphProto(main_graph_proto)
-                sub_func, sub_params = g1.from_tensorflow(subgraph, shape=subgraph_shape_dict)
-                main_graph_proto._params.update(sub_params)
-                func_expr = _function.Function(sub_func.params, sub_func.body)
-                global_func = tvm.relay.GlobalVar(func_name)
-                main_graph_proto._mod[global_func] = func_expr
-                main_graph_proto._mod = InferType()(main_graph_proto._mod)
-
-            param_exprs = []
-            for param_expr in sub_func.params:
-                # sub_params is subset of sub_func.params
-                param_name = param_expr.vid.name_hint
-                if param_name in input_expr_dict.keys():
-                    param_exprs.append(input_expr_dict[param_name])
-                elif param_name in sub_params.keys():
-                    param_exprs.append(param_expr)
-                else:
-                    raise Exception(f"Input parameter {param_name} not found")
-
-            sb = tvm.relay.scope_builder.ScopeBuilder()
-            loop_ret = global_func(*param_exprs)
-            sb.ret(loop_ret)
-            ret = sb.get()
-        else:
-            raise Exception(f"Function not found - {node_func_name}")
-        return ret
-
-    def _convert_operator(
-        self, op_name, node_name, inputs, attrs, identity_list=None, convert_map=None
-    ):
-        """Convert from Tensorflow operator to relay operator.
-        The converter must specify conversions explicitly for incompatible name, and
-        apply handlers to operator attributes.
-
-        Parameters
-        ----------
-        op_name : str
-            Operator name, such as Conv2D, AvgPool
-        node_name : str
-            Node name, predefined by user or default setting of TF
-        inputs : list of relay.op
-            List of input symbols.
-        attrs : dict
-            Dict of operator attributes
-        identity_list : list
-            List of operators that don't require conversion
-        convert_map : dict
-            Dict of name : callable, where name is the op's name that
-            require conversion to relay, callable are functions which
-            take attrs and return (new_op_name, new_attrs)
-
-        Returns
-        -------
-        sym : relay.op
-            Converted relay operator
-        """
-        identity_list = identity_list if identity_list else _identity_list
-        convert_map = convert_map if convert_map else _convert_map
-        if op_name in identity_list:
-            sym = get_relay_op(op_name)(*inputs, **attrs)
-        elif op_name in convert_map:
-            if _need_prelude_for_shape_inference(op_name):
-                sym = convert_map[op_name](inputs, attrs, self._params, self._prelude)
-            else:
-                sym = convert_map[op_name](inputs, attrs, self._params, self._mod)
-        elif op_name in ["PartitionedCall", "StatefulPartitionedCall"]:
-            sym = self._partition_call_operator(inputs, attrs)
-        else:
-            raise NotImplementedError(f"Operator {op_name} not implemented.")
-
-        sym = set_span(sym, node_name)
-
-        return sym
-
-    def _licm_construct(self, loop_name, node_name):
-        """Construct a node by considering whether it is
-        loop invariant with the given while loop. If yes, we
-        generate a loop Variable. Otherwise, return regular
-        converted relay expression.
-
-        Parameters
-        ----------
-        loop_name : str
-            TensorFlow while loop name to be checked.
-
-        node_name : str
-            TensorFlow node name.
-
-        Returns
-        -------
-        out : relay.Expr or relay.Var
-            Converted relay expression or loop var.
-        """
-        actual_expr = self._backtrack_construct(node_name)
-        tn = node_name.split(":")
-        node_name = tn[0].split("^")[-1]
-        cloop_name = find_parent_loop_name(node_name, self._while_loop_name_set)
-
-        if loop_name in self._while_loop_name_set and not cloop_name.startswith(loop_name):
-            if loop_name not in self._lvar2expr:
-                self._lvar2expr[loop_name] = {}
-            if loop_name not in self._lname_map:
-                self._lname_map[loop_name] = {}
-
-            if node_name not in self._lname_map[loop_name]:
-                var_name = f"{node_name}_loop_var"
-                var_type = _infer_type(actual_expr, self._mod).checked_type
-                loop_var = set_span(tvm.relay.var(var_name, type_annotation=var_type), var_name)
-                try:
-                    extra_param = _infer_value(actual_expr, self._params, self._mod)
-                    self._params[var_name] = extra_param
-                except Exception:
-                    pass
-                self._lvar2expr[loop_name][loop_var] = actual_expr
-                self._lname_map[loop_name][node_name] = loop_var
-                ret = loop_var
-            else:
-                ret = self._lname_map[loop_name][node_name]
-        else:
-            ret = actual_expr
-
-        return ret
-
-    def _backtrack_construct(self, node_name):
-        """Convert a specific tensorflow node to relay expression.
-
-        If any of its ancestor node is not converted yet, backtrack as
-        far as input node and covert all nodes on the path.
-
-        This is required when parsing control flow nodes, since the parsing
-        order may not follow the original graph def.
-
-        Parameters
-        ----------
-        node_name : str
-            TensorFlow node name.
-
-        Returns
-        -------
-        op : relay.Expr
-            Converted relay expression
-        """
-        try:
-            from tensorflow.python.framework import tensor_util
-        except ImportError as e:
-            raise ImportError(f"Unable to import tensorflow which is required {e}")
-
-        input_op_name = node_name.split(":")[0].split("^")[-1]
-        if input_op_name not in self._nodes:
-            node = self._tf_node_map[input_op_name]
-            attr = self._parse_attr(node.attr)
-
-            if node.op in _control_flow_nodes:
-                attr = self._parse_attr(node.attr)
-                op = self._convert_control_flow_operator(
-                    node, [], attr, self._control_flow_node_map
-                )
-            else:
-                attr["_output_shapes"] = self._output_shapes[input_op_name]
-                attr["_node_name"] = node.name
-                attr["_target_layout"] = self._layout
-
-                inputs = [self._backtrack_construct(iname) for iname in node.input]
-
-                plname = find_parent_loop_name(node_name, self._while_loop_name_set)
-
-                # For TensorArrayV3 op, we need to infer shape first
-                if is_tensor_array_constuctor(node):
-                    raw_elem_shape = tensor_util.TensorShapeProtoToList(attr["element_shape"])
-                    elem_shape = []
-                    for dim in raw_elem_shape:
-                        if dim < 0:
-                            elem_shape.append(Any())
-                        else:
-                            elem_shape.append(dim)
-
-                    if elem_shape:
-                        attr["shape"] = elem_shape
-                    if attr["identical_element_shapes"] or elem_shape:
-                        shape_node, wnode_op, output_index = self._tensor_array_shape_nodes[
-                            node.name
-                        ]
-                        name = shape_node.name
-                        if output_index > 0:
-                            name += ":" + str(output_index)
-                        converted = self._backtrack_construct(name)
-                        shape = _infer_shape(converted, self._mod)
-                        if wnode_op.startswith("TensorArraySplit"):
-                            shape = (Any(),) + shape[1:]
-                        elif wnode_op.startswith("TensorArrayScatter"):
-                            shape = shape[1:]
-
-                        if node.name in self._tensor_array_shapes:
-                            preset_shape = self._tensor_array_shapes[node.name]
-                            shape = _get_more_static_shape(shape, preset_shape)
-
-                        if "shape" in attr:
-                            attr["shape"] = _get_more_static_shape(shape, attr["shape"])
-                        else:
-                            attr["shape"] = shape
-
-                # LICM
-                if plname in self._while_loop_name_set:
-                    for i, iname in enumerate(node.input):
-                        actual_input = self._licm_construct(plname, iname)
-                        inputs[i] = actual_input
-
-                op = self._convert_operator(node.op, node.name, inputs, attr)
-            if isinstance(op, np.ndarray):
-                self._params[node.name] = tvm.nd.array(op)
-                op = [
-                    set_span(
-                        _expr.var(
-                            node.name,
-                            shape=self._params[node.name].shape,
-                            dtype=self._params[node.name].dtype,
-                        ),
-                        node.name,
-                    )
-                ]
-
-            elif isinstance(op, (_expr.Expr, _expr.TupleGetItem)):
-                op = [op]
-
-            self._nodes[input_op_name] = op
-
-        out = self._nodes[input_op_name]
-
-        if isinstance(out, _expr.TupleWrapper):
-            tn = node_name.split(":")
-            tensor_slot = int(tn[1]) if len(tn) > 1 else 0
-            return out[tensor_slot]
-        return out[0]
-
-
-class SubGraphProto(GraphProto):
-    """A helper class for handling relay subgraph copying from Tensorflow GraphDef."""
-
-    def __init__(self, main_graph_proto):
-        super().__init__()
-        self._main_graph_proto = main_graph_proto  # holds main graph proto object
-
-    def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
-        """Wrapper to _get_relay_func which converts Tensorflow graph to Relay function.
-        Return Relay function and params
-        """
-        func = self._get_relay_func(graph, layout=layout, shape=shape, outputs=outputs)
-        return func, self._params
-
-
-def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, convert_config=None):
-    """Load tensorflow graph which is a python tensorflow graph object into relay.
-    The companion parameters will be handled automatically.
-
-    Parameters
-    ----------
-    graph : GraphDef object
-        Tensorflow GraphDef
-
-    layout : target layout to be used (Optional)
-        NCHW only supported now to enable NHWC models on GPU.
-
-    shape : Dictionary of input dimensions (Optional)
-        Graph level input shape dictionary.
-
-    outputs : List of output tensor names (Optional)
-        if not specified then the last node is assumed as graph output.
-
-    convert_config : Optional[Dict[str, Any]]
-        Default config:
-            use_dense : bool = True
-                Ture to convert `tf.matmul` to `nn.dense`, else to `nn.matmul`.
-                The `nn.dense` op requires the data tensor to be non-transposed and weight tensor
-                to be transposed, may insert extra `transpose` to the original graph.
-            use_nt_batch_matmul : bool = True
-                True to convert `tf.batch_matmul` to `nn.batch_matmul` strict to NT format
-                (transpose_a=False, transpose_b=True).
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The module that optimizations will be performed on.
-
-    params : dict of str to tvm.nd.NDArray
-        Dict of converted parameters stored in tvm.nd.NDArray format
-    """
-    global TF_DEFAULT_CONFIGS
-    if convert_config is not None:
-        TF_DEFAULT_CONFIGS.update(convert_config)
-
-    g = GraphProto()
-    mod, params = g.from_tensorflow(graph, layout, shape, outputs)
-    return mod, params
diff --git a/python/tvm/relay/frontend/tensorflow2.py b/python/tvm/relay/frontend/tensorflow2.py
deleted file mode 100644
index e6ad1f7805af..000000000000
--- a/python/tvm/relay/frontend/tensorflow2.py
+++ /dev/null
@@ -1,848 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, too-many-lines, len-as-condition
-# pylint: disable=broad-except, too-many-nested-blocks, not-context-manager, broad-exception-raised
-"""Tensorflow2.x graph to relay converter.
-
-If model is constructed using tf2.x API, then use this converter:
-    from tvm.relay.frontend.tensorflow2 import from_tensorflow
-Otherwise use the tf1.x converter:
-    from tvm.relay.frontend.tensorflow import from_tensorflow
-
-"""
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.framework import function_def_to_graph, tensor_util, dtypes
-
-import tvm
-from tvm.relay.transform import InferType
-from tvm.relay.prelude import Prelude
-from tvm.ir import IRModule
-from .. import expr as _expr
-from .. import analysis
-from .. import function as _function
-from ..loops import while_loop as _while_loop
-from .common import infer_type as _infer_type
-
-from .tensorflow_ops import _convert_map as _convert_map_common
-from .tensorflow_ops import _get_more_static_shape_rank
-from .tensorflow2_ops import _convert_map as _convert_map_tf2
-from .tensorflow2_ops import _need_prelude_for_shape_inference
-
-from ..ty import Any
-
-__all__ = ["from_tensorflow"]
-
-# A map to record tensor list write ops and input tl/tensor indices
-# Value is (index of tensor list, index of written node)
-_tensor_list_write_ops = {"TensorListSetItem": (0, 2)}
-
-
-def _infer_type_with_prelude(val, prelude):
-    body = _infer_type(val, prelude.mod)
-    return body.checked_type
-
-
-def set_span(sym, node_name):
-    """set span of symbol"""
-
-    span = tvm.relay.Span(tvm.relay.SourceName(node_name), 0, 0, 0, 0)
-    if isinstance(sym, _expr.Call):
-        sym = _expr.Call(sym.op, sym.args, sym.attrs, sym.type_args, span)
-    elif isinstance(sym, _expr.TupleWrapper):
-        tuple_value = sym.tuple_value
-        if isinstance(tuple_value, _expr.Call):
-            tuple_value = _expr.Call(
-                tuple_value.op, tuple_value.args, tuple_value.attrs, tuple_value.type_args, span
-            )
-            sym = _expr.TupleWrapper(tuple_value, sym.size)
-    return sym
-
-
-def is_tensor_list_constuctor(tf_node):
-    """Check whether is tensor list constructor node."""
-    return tf_node.op == "TensorListReserve"
-
-
-def convert_const_node(node, shape):
-    """convert tf const node into relay const or var"""
-
-    # get the value of the constant
-    tensor_value = node.attr["value"].tensor
-    np_array = tensor_util.MakeNdarray(tensor_value)
-
-    if np_array.dtype == np.dtype(object):
-        if shape and node.name in shape:
-            var_shape = shape[node.name]
-        else:
-            var_shape = tensor_util.TensorShapeProtoToList(tensor_value.tensor_shape)
-        param = None
-        sym = [_expr.var(node.name, shape=var_shape, dtype="uint8")]
-        return sym, param
-
-    if len(np_array.shape) == 0:
-        param = None
-        sym = [tvm.relay.const(np_array, np_array.dtype)]
-    else:
-        param = tvm.nd.array(np_array)
-        sym = [_expr.var(node.name, shape=param.shape, dtype=param.dtype)]
-
-    return sym, param
-
-
-def get_attr(buf):
-    """convert value of a node attribute. node attribute is part of a node in a graph.
-
-    Parameters
-    ----------
-    buf: attrvalue protobuf.  <class 'tensorflow.core.framework.attr_value_pb2.AttrValue'>
-
-    Returns
-    -------
-    The value of the attr, as a Python object.
-
-    Raises:
-    -------
-    ValueError: If this op does not have an attr with the given `name`.
-    """
-
-    fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
-
-    ret = []
-
-    if not buf.WhichOneof("value"):
-        return ret
-
-    if buf.HasField("list"):
-        for f in fields:
-            if getattr(buf.list, f):
-                if f == "type":
-                    ret += [dtypes.as_dtype(x) for x in list(getattr(buf.list, f))]
-                else:
-                    ret += list(getattr(buf.list, f))
-    else:
-        for f in fields:
-            if buf.HasField(f):
-                if f == "type":
-                    ret = dtypes.as_dtype(getattr(buf, f))
-                else:
-                    ret = getattr(buf, f)
-    return ret
-
-
-def parse_attr(attr_proto):
-    """Convert node attributes (a serialized map of key-value pairs) in a node to a dict
-
-    Parameters
-    ----------
-    attr_proto: <class 'google.protobuf.pyext._message.MessageMapContainer'>
-
-    Returns
-    -------
-    Dict {string: python object}
-
-    """
-    attrs = {}
-    for key, value in attr_proto.items():
-        attrs[key] = get_attr(value)
-
-    return attrs
-
-
-def convert_placeholder(shape, node, in_type=None):
-    """convert tf placeholder into relay var.
-
-    Example
-    --------
-    a tf placeholder with name "x" is converted to [Var(x, ty=TensorType([], float32))]
-    """
-
-    if shape and node.name in shape:
-        input_shape = list(shape[node.name])
-    else:
-        input_shape = tensor_util.TensorShapeProtoToList(node.attr["shape"].shape)
-        for idx, dim in enumerate(input_shape):
-            if dim < 0:
-                input_shape[idx] = Any()
-    attr = parse_attr(node.attr)
-    if in_type is not None:
-        sym = [_expr.var(node.name, type_annotation=in_type)]
-    else:
-        sym = [_expr.var(node.name, shape=input_shape, dtype=attr["dtype"].name)]
-    return input_shape, sym
-
-
-class RelayModule:
-    """states related to the entire relay module (multiple functions)
-    after converted from tf graphdef"""
-
-    def __init__(self):
-        self.mod = IRModule({})
-        self.params = {}
-        self.prelude = Prelude(self.mod)
-
-
-class GraphProto:
-    """Capturing states when converting a tf graph to a single relay function."""
-
-    def __init__(self, module):
-        self._module = module
-        self._prelude = self._module.prelude
-        self._params = {}
-        self._nodes = {}
-        self._input_shapes = {}
-        self._output_shapes = {}
-        self._tf_node_map = {}
-        self._gdef_lib = {}
-        self._tensor_list_shapes = {}
-        self._tensor_list_shape_nodes = {}
-        self._sub_map = {}
-        self._sub_input_idx_map = {}
-
-    def from_tensorflow(
-        self, graph, layout="NHWC", shape=None, outputs=None, input_types=None, gdef_lib=None
-    ):
-        """Wrapper to _get_relay_func which converts Tensorflow graph to Relay function
-        which is used as main function for the Relay module
-        """
-        if input_types is None:
-            input_types = {}
-
-        if gdef_lib is None:
-            gdef_lib = {}
-
-        self._gdef_lib = gdef_lib
-        func = self._get_relay_func(
-            graph, layout=layout, shape=shape, outputs=outputs, input_types=input_types
-        )
-        return func, self._params
-
-    def _analysis_tensor_list_op(
-        self,
-        graph,
-        node,
-        tl_write_nodes,
-        tl_stack_nodes,
-        tl_construct_nodes,
-        sub_func_name="",
-        root_node="",
-    ):
-        if sub_func_name and sub_func_name not in self._sub_input_idx_map:
-            self._sub_input_idx_map[sub_func_name] = {}
-
-        if node.op == "Placeholder":
-            # record placeholder node in sub functions
-            self._sub_map[sub_func_name] = node
-            self._sub_input_idx_map[sub_func_name][node.name] = len(
-                self._sub_input_idx_map[sub_func_name]
-            )
-
-        if node.op.startswith("TensorList"):
-            if is_tensor_list_constuctor(node):
-                tl_construct_nodes.append(node)
-            else:
-                for tl_write_name, idx in _tensor_list_write_ops.items():
-                    if node.op.startswith(tl_write_name):
-                        tl_write_nodes.append((node, idx, sub_func_name, root_node))
-                if node.op.startswith("TensorListStack"):
-                    tl_stack_nodes.append(node)
-        elif node.op.startswith("StatelessWhile"):
-            root_node = node.name
-            cond_fn_name, body_fn_name = [
-                parse_attr(node.attr).get(x).name for x in ["cond", "body"]
-            ]
-            for fn_name in [cond_fn_name, body_fn_name]:
-                subfunction = self._gdef_lib[fn_name]
-                sub_func_name = fn_name
-                for sub_node in subfunction.node:
-                    # bypass const node
-                    if sub_node.op == "Const":
-                        continue
-                    self._tf_node_map[sub_node.name] = sub_node
-                    self._analysis_tensor_list_op(
-                        subfunction,
-                        sub_node,
-                        tl_write_nodes,
-                        tl_stack_nodes,
-                        tl_construct_nodes,
-                        sub_func_name=sub_func_name,
-                        root_node=root_node,
-                    )
-
-    def _infer_static_shape_stack_node(self, tl_stack_nodes):
-        for stack_node in tl_stack_nodes:
-            if len(stack_node.input) < 2:
-                # Stack node does not have shape
-                continue
-            input_shape_name = stack_node.input[1].split(":")[0]
-            input_shape_node = self._tf_node_map[input_shape_name]
-            stack = [self._tf_node_map[stack_node.input[0].split(":")[0]]]
-            in_idx = -1
-            while stack:
-                cnode = stack.pop(0)
-                if not cnode.op.startswith("TensorList"):
-                    if in_idx and cnode.op.startswith("StatelessWhile"):
-                        stack.append(self._tf_node_map[cnode.input[in_idx].split(":")[0]])
-                    else:
-                        for iname in cnode.input:
-                            if self._tf_node_map[iname.split(":")[0]].op.startswith(
-                                "StatelessWhile"
-                            ):
-                                # identify input index based on output index
-                                if iname.split(":")[1]:
-                                    in_idx = int(iname.split(":")[1])
-                            stack.append(self._tf_node_map[iname.split(":")[0]])
-                # identify the corresponding constructor node and add shape to _tensor_list_shapes
-                elif cnode.name != stack_node.name:
-                    if is_tensor_list_constuctor(cnode):
-                        shape_attr = parse_attr(input_shape_node.attr)
-                        if "value" not in shape_attr:
-                            continue
-                        raw_elem_shape = tensor_util.MakeNdarray(shape_attr["value"])
-                        elem_shape = []
-                        for dim in raw_elem_shape:
-                            if dim < 0:
-                                elem_shape.append(Any())
-                            else:
-                                elem_shape.append(int(dim))
-                        self._tensor_list_shapes[cnode.name] = elem_shape
-                    break
-
-    def _infer_static_shape_write_node(self, tl_write_nodes):
-        for item in tl_write_nodes:
-            wnode = item[0]
-            ta_idx, inode_idx = item[1]
-            sub_func_name = item[2]
-            root_name = item[3]
-            stack = [self._tf_node_map[wnode.input[ta_idx].split(":")[0]]]
-            while stack:
-                cnode = stack.pop(0)
-
-                if not cnode.op.startswith("TensorList"):
-                    if cnode.op == "Placeholder" and sub_func_name:
-                        # need to map subfunction
-                        input_idx = self._sub_input_idx_map[sub_func_name][cnode.name]
-                        stack.append(
-                            self._tf_node_map[
-                                self._tf_node_map[root_name].input[input_idx].split(":")[0]
-                            ]
-                        )
-                    else:
-                        for iname in cnode.input:
-                            stack.append(self._tf_node_map[iname.split(":")[0]])
-                # identify the corresponding constructor node and add it to _tensor_list_shape_nodes
-                elif cnode.name != wnode.name:
-                    if is_tensor_list_constuctor(cnode):
-                        inode = self._tf_node_map[wnode.input[inode_idx].split(":")[0]]
-                        tn = wnode.input[inode_idx].split(":")
-                        output_index = int(tn[1]) if len(tn) > 1 else 0
-                        self._tensor_list_shape_nodes[cnode.name] = (inode, wnode.op, output_index)
-                    break
-
-    def _get_relay_func(self, graph, layout="NHWC", shape=None, outputs=None, input_types=None):
-        if input_types is None:
-            input_types = {}
-        tl_write_nodes = []
-        tl_stack_nodes = []
-        tl_construct_nodes = []
-        self._layout = layout
-        for node in graph.node:
-            name = node.name
-            self._tf_node_map[name] = node
-            if node.op == "Placeholder":
-                in_type = None
-                if node.name in input_types:
-                    in_type = input_types[node.name]
-                self._input_shapes[name], self._nodes[name] = convert_placeholder(
-                    shape, node, in_type
-                )
-            elif node.op == "Const":
-                sym, param = convert_const_node(node, shape)
-                self._nodes[node.name] = sym
-                if param:
-                    self._params[node.name] = param
-            # recursivly iterate tensorlist op if seen while loop
-            else:
-                self._analysis_tensor_list_op(
-                    graph, node, tl_write_nodes, tl_stack_nodes, tl_construct_nodes
-                )
-
-        # Use tensor list stack to infer static tensor list shape
-        self._infer_static_shape_stack_node(tl_stack_nodes)
-
-        # Fetch node contains static tensor list shape
-        self._infer_static_shape_write_node(tl_write_nodes)
-
-        for node in graph.node:
-            self._backtrack_construct(graph, node.name)
-
-        return self._func(graph, outputs)
-
-    def _func(self, graph, outputs):
-        out = []
-        if outputs is None:
-            last_node = graph.node[-1]
-            op = self._nodes[last_node.name.split(":")[0]]
-            if last_node.op == "Exit":
-                out = [op[0].tuple_value]
-            else:
-                out = op
-        else:
-            for out_name in outputs:
-                if ":" in out_name:
-                    out_name = out_name.split(":")
-                    out_name, out_num = out_name[0], out_name[-1]
-                    out_num = int(out_num)
-                    out.append(self._nodes[out_name][out_num])
-                else:
-                    out.append(self._nodes[out_name][0])
-
-        if isinstance(out, _expr.TupleWrapper):
-            out = out.astuple()
-        else:
-            out = out[0] if len(out) == 1 else _expr.Tuple(out)
-
-        fvars = analysis.free_vars(out)
-        func = _function.Function(fvars, out)
-        final_params = {}
-        for fv in fvars:
-            if fv.name_hint in self._params:
-                final_params[fv.name_hint] = self._params[fv.name_hint]
-        self._params = final_params
-        return func
-
-    def _convert_operator(self, graph, op_name, node_name, inputs, attrs):
-        """Convert from Tensorflow operator to relay operator.
-        The converter must specify conversions explicitly for incompatible name, and
-        apply handlers to operator attributes.
-
-        Parameters
-        ----------
-        graph: <class 'tensorflow.core.framework.graph_pb2.GraphDef'>
-            TF2 frozen graph def
-        op_name : str
-            Operator name, such as Conv2D, AvgPool
-        node_name: str
-             Name of the node in TF2 graph, such as Identity:0
-        inputs : list of relay.op
-            List of input symbols.
-        attrs : dict
-            Dict of operator attributes
-
-        Returns
-        -------
-        sym : relay.op
-            Converted relay operator
-        """
-        if op_name in ["PartitionedCall", "StatefulPartitionedCall"]:
-            sym = _partition_call_operator(
-                self._module, graph, inputs, attrs, self._prelude, gdef_lib=self._gdef_lib
-            )
-        elif op_name in ["StatelessIf", "If"]:
-            sym = _convert_if(
-                self._module, graph, inputs, attrs, self._prelude, gdef_lib=self._gdef_lib
-            )
-        elif op_name in ["StatelessWhile", "While"]:
-            sym = _convert_loop(
-                self._module,
-                graph,
-                inputs,
-                attrs,
-                node_name,
-                self._tf_node_map,
-                self._prelude,
-                gdef_lib=self._gdef_lib,
-            )
-        elif op_name in _convert_map_common:
-            # assert op are exclusive
-            assert not set(_convert_map_common.keys()) & set(_convert_map_tf2.keys())
-            if _need_prelude_for_shape_inference(op_name):
-                sym = _convert_map_common[op_name](inputs, attrs, self._params, self._prelude)
-            else:
-                sym = _convert_map_common[op_name](inputs, attrs, self._params, self._module.mod)
-        elif op_name in _convert_map_tf2:
-            if _need_prelude_for_shape_inference(op_name):
-                sym = _convert_map_tf2[op_name](inputs, attrs, self._params, self._prelude)
-            else:
-                sym = _convert_map_tf2[op_name](inputs, attrs, self._params, self._module.mod)
-        else:
-            raise NotImplementedError(f"Operator {op_name} not implemented.")
-
-        sym = set_span(sym, node_name)
-        return sym
-
-    def _parse_element_shape(self, elem_shape, shape_attr):
-        if "value" in shape_attr:
-            raw_elem_shape = tensor_util.MakeNdarray(shape_attr["value"])
-
-            if raw_elem_shape.size == 1 and raw_elem_shape == -1:
-                elem_shape.append(Any())
-            else:
-                for dim in raw_elem_shape:
-                    if dim < 0:
-                        elem_shape.append(Any())
-                    else:
-                        elem_shape.append(dim)
-
-    def _backtrack_construct(self, graph, node_name):
-        """Convert a specific tensorflow node to relay expression.
-
-        If any of its ancestor node is not converted yet, backtrack as
-        far as input node and covert all nodes on the path. resurion is used here.
-
-        This is required when parsing control flow nodes, since the parsing
-        order may not follow the original graph def.
-
-        to discover input node, current tf node's input is iterated:
-
-        tensorflow/core/framework/node_def.proto
-            message NodeDef {
-                repeated string input = 3;
-            }
-
-        a node has many inputs (other nodes). each input has the following format:
-            data input is "node:src_output".  node is the string name.
-            control input is "^node".
-
-        Parameters
-        ----------
-        graph : <class 'tensorflow.core.framework.graph_pb2.GraphDef'>
-            TF2 frozen graph def
-
-        node_name : str
-            node name
-
-        Returns
-        -------
-        op : relay.Expr
-            Converted relay expression.
-
-        Examples
-        --------
-        tf expression "x+1" is converted to relay expression:
-            CallNode(Op(add), [Var(x, ty=TensorType([], float32)), Constant(1.0)], (nullptr), [])
-
-        """
-        input_op_name = node_name.split(":")[0].split("^")[-1]
-
-        if input_op_name not in self._nodes:
-            node = self._tf_node_map[input_op_name]
-            attr = parse_attr(node.attr)
-            if "_output_shapes" in attr:
-                self._output_shapes[node.name] = [
-                    tensor_util.TensorShapeProtoToList(tshape) for tshape in attr["_output_shapes"]
-                ]
-            else:
-                self._output_shapes[node.name] = [None]
-
-            attr["_output_shapes"] = self._output_shapes[input_op_name]
-            attr["_node_name"] = node.name
-            attr["_target_layout"] = self._layout
-            inputs = [self._backtrack_construct(graph, iname) for iname in node.input]
-
-            # infer shape for TensorList op
-            if is_tensor_list_constuctor(node):
-                input_shape_name = (
-                    node.input[1] if "TensorListFromTensor" in node.op else node.input[0]
-                )
-                input_shape_name = input_shape_name.split(":")[0]
-                input_shape_node = self._tf_node_map[input_shape_name]
-                shape_attr = parse_attr(input_shape_node.attr)
-                elem_shape = []
-
-                self._parse_element_shape(elem_shape, shape_attr)
-
-                if elem_shape:
-                    attr["shape"] = elem_shape
-                if (
-                    "identical_element_shapes" in attr and attr["identical_element_shapes"]
-                ) or elem_shape:
-                    shape = elem_shape
-                    if node.name in self._tensor_list_shapes:
-                        preset_shape = self._tensor_list_shapes[node.name]
-                        shape = _get_more_static_shape_rank(shape, preset_shape)
-                    attr["shape"] = shape
-
-            op = self._convert_operator(graph, node.op, node.name, inputs, attr)
-            if isinstance(op, np.ndarray):
-                self._params[node.name] = tvm.nd.array(op)
-                op = [
-                    _expr.var(
-                        node.name,
-                        shape=self._params[node.name].shape,
-                        dtype=self._params[node.name].dtype,
-                    )
-                ]
-            elif isinstance(op, (_expr.Expr, _expr.TupleGetItem)):
-                op = [op]
-            self._nodes[input_op_name] = op
-
-        out = self._nodes[input_op_name]
-        if isinstance(out, _expr.TupleWrapper):
-            tn = node_name.split(":")
-            tensor_slot = int(tn[1]) if len(tn) > 1 else 0
-            return out[tensor_slot]
-
-        return out[0]
-
-
-def _partition_call_operator(module, graph, inputs, attr, prelude, gdef_lib):
-    """convert tf PartitionedCall node to a relay function call"""
-    node_func_name = attr.get("f").name
-    return _convert_function(
-        module, graph, inputs, attr, node_func_name, prelude, gdef_lib=gdef_lib
-    )
-
-
-def _convert_if(module, graph, inputs, attr, prelude, gdef_lib):
-    """Convert tf If/StatelessIf to Relay If"""
-    cond_expr = inputs[0]
-    branch_names = [attr.get(x).name for x in ["then_branch", "else_branch"]]
-    then_fn, else_fn = [
-        _convert_function(module, graph, inputs[1:], attr, name, prelude, gdef_lib=gdef_lib)
-        for name in branch_names
-    ]
-    out = _expr.If(cond_expr, then_fn, else_fn)
-    return out
-
-
-def _convert_loop(module, graph, inputs, attr, node_name, nodes, prelude, gdef_lib):
-    """convert tf while_loop to Relay loop"""
-    input_size = len(inputs)
-    cond_fn_name, body_fn_name = [attr.get(x).name for x in ["cond", "body"]]
-
-    def convert_vars(loop_inputs, input_signature):
-        """convert inputs to relay vars to be used as loop variables
-        Loop inputs are packed as:
-            [iteration_number, max_iterations, loop_variables...]
-        """
-        new_vars = []
-        for i, v in enumerate(loop_inputs):
-            if isinstance(v, _expr.Constant):
-                vtype = _infer_type(v).checked_type.dtype
-                new_vars.append(_expr.var(input_signature[i].name, shape=(), dtype=vtype))
-            else:
-                vtype = _infer_type_with_prelude(v, prelude)
-                new_vars.append(_expr.var(input_signature[i].name, type_annotation=vtype))
-        return new_vars
-
-    while_func = next(
-        (f for f in graph.library.function if f.signature.name == attr["body"].name), None
-    )
-    loop_inputs = convert_vars(inputs, while_func.signature.input_arg)
-
-    def cond_fn(*loop_inputs):
-        return _convert_function(
-            module, graph, loop_inputs, attr, cond_fn_name, prelude, gdef_lib=gdef_lib
-        )
-
-    # Define the loop body, in this function we need to unpack loop inputs,
-    # convert the loop subgraph, and pack outputs for the next iteration.
-    def body_fn(*loop_inputs):
-        # Increment loop iteration counter
-        loop_count = loop_inputs[0] + _expr.const(1, dtype="int32")
-        max_count = loop_inputs[1]
-        fn = _convert_function(
-            module, graph, loop_inputs, attr, body_fn_name, prelude, gdef_lib=gdef_lib
-        )
-
-        # Repack loop variables
-        out = [loop_count, max_count] + [_expr.TupleGetItem(fn, i) for i in range(2, input_size)]
-        return out
-
-    loop = _while_loop(cond_fn, loop_inputs, body_fn)
-    outputs = loop(*inputs)
-    outputs = _expr.TupleWrapper(
-        _expr.Tuple([_expr.TupleGetItem(outputs, i) for i in range(input_size)]), input_size
-    )
-    return outputs
-
-
-def _convert_function(
-    module, graph, inputs, attr, node_func_name, prelude, gdef_lib, in_shapes=None
-):
-    """Convert given tf node to a relay function call
-
-    Parameters
-    ----------
-    module : IRModule
-        where converted function is stored
-
-    graph: <class 'tensorflow.core.framework.graph_pb2.GraphDef'>
-        top level tf graphdef
-
-    inputs : List[tvm.relay.Expr]
-        List of input symbols. Parameters for the function.
-
-    attrs : Dict[tvm.Attrs]
-        Dict of operator attributes.
-
-    node_func_name : str
-        Name of tf2 node to be converted
-
-    Returns
-    -------
-    op : tvm.relay.Expr
-        <class 'tvm.relay.expr.Call'>
-
-    Examples
-    --------
-    a tf function "x+1", is implemented as a subgraph in the library section of the graph.
-    this subgraph is converted to a relay function such as
-        fn (%x: float32) {
-        add(%x, 1f) /* Identity */
-        }
-
-    the subgraph has a function name such as __inference_add_95
-    the tf function call operator is returned as relay expression, such as:
-        free_var %x: float32;
-        @func___inference_add_95(%x)
-
-    """
-    func = next((f for f in graph.library.function if f.signature.name == node_func_name), None)
-    if func is None:
-        raise Exception(f"Function not found - {node_func_name}")
-    devices = set(node.device for node in func.node_def)
-    if len(devices) > 1:
-        raise Exception(
-            f"node_def in function {node_func_name} contains > 1 types of devices {devices}"
-        )
-
-    subgraph = gdef_lib[node_func_name]
-    # preserve library functions in subgraphs to make them available to nested functions
-    for fn in graph.library.function:
-        subgraph.library.function.add().CopyFrom(fn)
-
-    # Computing subgraph's input shape and type dictionaries
-    input_expr_dict = {}
-    input_types = {}
-    for f_arg, input_ in zip(func.signature.input_arg, inputs):
-        input_expr_dict[f_arg.name] = input_
-        input_types[f_arg.name] = _infer_type_with_prelude(input_, prelude)
-
-    func_name = f"func_{func.signature.name}"
-    try:
-        global_func = module.mod[func_name]
-        sub_func = global_func
-        sub_params = module.params
-    except ValueError:
-        # Construct relay nodes from the subgraph
-        g1 = GraphProto(module)
-        output_sig = [func.ret[f.name] for f in func.signature.output_arg]
-        # TODO: unify prelude and main IRModules
-        sub_func, sub_params = g1.from_tensorflow(
-            subgraph, outputs=output_sig, input_types=input_types, gdef_lib=gdef_lib
-        )
-        module.params.update(sub_params)
-        func_expr = _function.Function(sub_func.params, sub_func.body)
-        global_func = tvm.relay.GlobalVar(func_name)
-        module.mod[global_func] = func_expr
-        module.mod = InferType()(module.mod)
-        prelude.mod = module.mod
-
-    param_exprs = []
-    for param_expr in sub_func.params:
-        # sub_params is subset of sub_func.params
-        param_name = param_expr.vid.name_hint
-        if param_name in input_expr_dict.keys():
-            param_exprs.append(input_expr_dict[param_name])
-        elif param_name in sub_params.keys():
-            param_exprs.append(param_expr)
-        else:
-            raise Exception(f"Input parameter {param_name} not found")
-
-    sb = tvm.relay.scope_builder.ScopeBuilder()
-    loop_ret = global_func(*param_exprs)
-    sb.ret(loop_ret)
-    ret = sb.get()
-    return ret
-
-
-def from_tensorflow(graph_def, layout="NHWC", shape=None, outputs=None):
-    """convert tensorflow2.x graph into relay function.
-
-    Parameters
-    ----------
-    graph_def : must be frozen graph (no variables allowed).
-        Placeholders are assumed to be inputs to the graph.
-
-        tensorflow/core/framework/graph.proto
-            message GraphDef {
-              repeated NodeDef node = 1;
-              FunctionDefLibrary library = 2;
-            }
-        tensorflow/core/framework/function.proto
-            message FunctionDef {
-              repeated NodeDef node_def = 3;
-            }
-
-    layout : str
-        The layout for the model.
-
-    shape : List[str, List[int]]
-        Input to the model. It is a key and shape vector mapping. Applies to placeholders.
-
-    outputs : List[str]
-        The list of output nodes. The last node is treated as the output if not
-        specified.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The module that optimizations will be performed on.
-
-    params : dict of str to tvm.nd.NDArray
-        Dict of converted parameters stored in tvm.nd.NDArray format.
-
-    Examples
-    --------
-    "x+1" tf module where x has a shape of (2,2) is converted as follows:
-
-    mod : tvm.IRModule
-        def @func___inference_add_95(%x: Tensor[(2, 2), float32], %add/y: Tensor[(2, 2), float32])
-        -> Tensor[(2, 2), float32] {
-        add(%x, %add/y) /* Identity */ /* ty=Tensor[(2, 2), float32] */
-        }
-
-        def @main(%x1: Tensor[(2, 2), float32], %add/y1: Tensor[(2, 2), float32]) {
-        @func___inference_add_95(%x1, %add/y1) /* Identity */
-        }
-
-    params : dict of str to tvm.nd.NDArray
-        {'add/y': <tvm.nd.NDArray shape=(2, 2), cpu(0)>
-
-    """
-
-    with tf.Graph().as_default():
-        tf.import_graph_def(graph_def, name="")
-        # Subgraph graph_defs are cached here to avoid a TF error when parsing after prelude init
-        graph_def_library = {}
-        for func in graph_def.library.function:
-            inshape = func.attr["_input_shapes"].list.shape
-            (
-                graph_def_library[func.signature.name],
-                _,
-            ) = function_def_to_graph.function_def_to_graph_def(func, inshape)
-        module = RelayModule()
-        g = GraphProto(module)
-        func, params = g.from_tensorflow(
-            graph_def, layout, shape, outputs, gdef_lib=graph_def_library
-        )
-        module.mod["main"] = func
-        module.params.update(params)
-        return module.mod, module.params
diff --git a/python/tvm/relay/frontend/tensorflow2_ops.py b/python/tvm/relay/frontend/tensorflow2_ops.py
deleted file mode 100644
index 41af74add587..000000000000
--- a/python/tvm/relay/frontend/tensorflow2_ops.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, too-many-lines, len-as-condition, broad-except
-"""Tensorflow2.x to relay converter ops and helper"""
-import tvm
-from tvm.relay.prelude import StaticTensorArrayOps, get_tensor_array_shape
-
-from .. import op as _op
-from ..ty import Any
-from .common import infer_value as _infer_value
-from .common import infer_type as _infer_type
-from .tensorflow_ops import _get_more_static_shape_rank
-
-
-def _infer_type_with_prelude(val, prelude):
-    body = _infer_type(val, prelude.mod)
-    return body.checked_type
-
-
-def _need_prelude_for_shape_inference(op):
-    return "TensorList" in op or "TensorArray" in op
-
-
-def _tensorlist_reserve():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("element_dtype").name
-        elem_shape = _infer_value(inputs[0], params, prelude.mod)
-        elem_shape = tuple(elem_shape.numpy().astype("int32").flatten())
-
-        if elem_shape or "shape" in attr:
-            shape = attr["shape"] if "shape" in attr else elem_shape
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, shape)
-            static_tensor_array_ops.register()
-            tensor_array_constructor = static_tensor_array_ops.get_global_var("tensor_array")
-            tensor_array = tensor_array_constructor(inputs[1])
-        else:
-            tensor_array_constructor = prelude.get_global_var("tensor_array", dtype_str)
-            tensor_array = tensor_array_constructor(inputs[1])
-        return tensor_array
-
-    return _impl
-
-
-def _tensorlist_set_item():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("element_dtype").name
-        input_ta = inputs[0]
-        input_ta_shape = get_tensor_array_shape(input_ta, dtype_str, prelude)
-        input_t_shape = _infer_type_with_prelude(inputs[2], prelude).shape
-        input_rank = len(input_t_shape)
-
-        if input_ta_shape is None:
-            tensor_name = f"tensor{input_rank}"
-            tensor_func = prelude.get_tensor_ctor(tensor_name, dtype_str)
-            v = tensor_func(inputs[2])
-            write_func = prelude.get_global_var("tensor_array_write", dtype_str)
-            out = write_func(input_ta, inputs[1], v)
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_ta_shape)
-            static_tensor_array_ops.register()
-            tensor_func = static_tensor_array_ops.get_ctor("tensor_constructor")
-            v = tensor_func(inputs[2])
-            # Write tensor with more static shape
-            # convert shape with -1 to any()
-            input_ta_shape_a = []
-            for dim in input_ta_shape:
-                if isinstance(dim, (int, tvm.tir.expr.IntImm)):
-                    if dim < 0:
-                        input_ta_shape_a.append(Any())
-                    else:
-                        input_ta_shape_a.append(dim)
-                else:
-                    input_ta_shape_a.append(dim)
-            actual_shape = _get_more_static_shape_rank(input_t_shape, input_ta_shape_a)
-            if actual_shape != input_ta_shape_a:
-                new_shape = []
-                num_any_dim = 0
-                for dim in actual_shape:
-                    if not isinstance(dim, int):
-                        num_any_dim += 1
-                    new_shape.append(dim if isinstance(dim, int) else -1)
-                if num_any_dim <= 1:
-                    v = tensor_func(_op.reshape(inputs[2], new_shape))
-            write_func = prelude.get_global_var_static(
-                "tensor_array_write", dtype_str, input_ta_shape_a
-            )
-            out = write_func(input_ta, inputs[1], v)
-        return out
-
-    return _impl
-
-
-def _tensorlist_get_item():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr["element_dtype"].name
-        input_shape = get_tensor_array_shape(inputs[0], dtype_str, prelude)
-
-        if input_shape is None:
-            read_func = prelude.get_global_var("tensor_array_read", dtype_str)
-            out = read_func(inputs[0], _op.take(inputs[1], tvm.relay.const(0)))
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_shape)
-            static_tensor_array_ops.register()
-            read_func = static_tensor_array_ops.get_global_var("tensor_array_read")
-            out_tensor = read_func(inputs[0], _op.take(inputs[1], tvm.relay.const(0)))
-            get_data_func = static_tensor_array_ops.get_global_var("tensor_get_data")
-            out = get_data_func(out_tensor)
-        return out
-
-    return _impl
-
-
-def _tensorlist_stack():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr["element_dtype"].name
-        input_ta_shape = get_tensor_array_shape(inputs[0], dtype_str, prelude)
-
-        if input_ta_shape is None:
-            stack_func = prelude.get_global_var("tensor_array_stack", dtype_str)
-            out = stack_func(inputs[0])
-        else:
-            if "num_elements" in attr:
-                num_elements = attr["num_elements"]
-            static_tensor_array_ops = StaticTensorArrayOps(
-                prelude, dtype_str, input_ta_shape, num_elements
-            )
-            static_tensor_array_ops.register()
-            stack_func = prelude.get_global_var_static(
-                "tensor_array_stack", dtype_str, input_ta_shape, num_elements
-            )
-            out_tensor = stack_func(inputs[0])
-            out_shape = (
-                (num_elements,) + input_ta_shape
-                if num_elements and num_elements == 1
-                else (Any(),) + input_ta_shape
-            )
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, out_shape)
-            static_tensor_array_ops.register()
-            get_data_func = prelude.get_global_var_static("tensor_get_data", dtype_str, out_shape)
-            out = get_data_func(out_tensor)
-
-        return out
-
-    return _impl
-
-
-def _tensorlist_from_tensor():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr["element_dtype"].name
-        input_ta_shape = _infer_type_with_prelude(inputs[0], prelude).shape
-
-        if input_ta_shape is None:
-            unstack_func = prelude.get_global_var("tensor_array_unstack", dtype_str)
-            out = unstack_func(inputs[0])
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_ta_shape)
-            static_tensor_array_ops.register()
-            unstack_func = prelude.get_global_var_static(
-                "tensor_array_unstack", dtype_str, input_ta_shape
-            )
-            out = unstack_func(inputs[0])
-        return out
-
-    return _impl
-
-
-_convert_map = {
-    "TensorListFromTensor": _tensorlist_from_tensor(),
-    "TensorListGetItem": _tensorlist_get_item(),
-    "TensorListReserve": _tensorlist_reserve(),
-    "TensorListSetItem": _tensorlist_set_item(),
-    "TensorListStack": _tensorlist_stack(),
-}
diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py
deleted file mode 100644
index 14171afb3d44..000000000000
--- a/python/tvm/relay/frontend/tensorflow_ops.py
+++ /dev/null
@@ -1,3176 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition, broad-except
-# pylint: disable=import-outside-toplevel, redefined-builtin
-"""TF: Tensorflow frontend."""
-import warnings
-from collections import deque
-
-# Numpy support
-import numpy as np
-import tvm
-
-from tvm.relay.prelude import StaticTensorArrayOps, get_tensor_array_shape
-from tvm.topi.utils import get_const_tuple
-
-from .. import expr as _expr
-from .. import op as _op
-from ..ty import Any
-from .common import AttrCvt, get_relay_op
-from .common import infer_type as _infer_type
-from .common import infer_shape as _infer_shape
-from .common import infer_channels as _infer_channels
-from .common import infer_value as _infer_value
-
-
-def check_symbolic_shape(shape):
-    return not all([isinstance(dim, (int, tvm.tir.IntImm)) for dim in shape])
-
-
-def list_shape_of(tensor, ndim):
-    shape_tensor = _op.shape_of(tensor)
-    return [
-        _op.strided_slice(shape_tensor, begin=[i], end=[i + 1], strides=[1]) for i in range(ndim)
-    ]
-
-
-def _get_pad_pair(input1d, kernel1d, stride1d):
-    if isinstance(input1d, tvm.tir.Any) and stride1d != 1:
-        raise tvm.error.OpAttributeUnImplemented(
-            "SAME padding is not supported in combination with dynamic height or width when stride"
-            " is not 1."
-        )
-    if stride1d == 1 or input1d % stride1d == 0:
-        pad = max(kernel1d - stride1d, 0)
-    else:
-        pad = max(kernel1d - (input1d % stride1d), 0)
-
-    pad_before = pad // 2
-    pad_after = pad - pad_before
-
-    return [pad_before, pad_after]
-
-
-def _math_name_picker(surfix):
-    def _impl(attr):
-        return "broadcast_" + surfix
-
-    return _impl
-
-
-def _dimension_picker(prefix, surfix=""):
-    def _impl(attr):
-        kernel = attr["kernel_shape"]
-        if len(kernel) == 2:
-            return prefix + "2d" + surfix
-        if len(kernel) == 3:
-            return prefix + "3d" + surfix
-        raise tvm.error.OpAttributeInvalid(
-            f"Only 2D or 3D kernels are supported for operator {prefix}2d or 3d"
-        )
-
-    return _impl
-
-
-def _dimension_constraint():
-    def _dim_check(attrs):
-        if len(attrs["kernel_shape"]) in (2, 3):
-            return True
-        return False
-
-    return _dim_check, "Only 2d or 3d kernel supported."
-
-
-def _get_param(params, input_node):
-    if isinstance(input_node, _expr.Constant):
-        return np.atleast_1d(input_node.data.numpy())
-    return params[input_node.name_hint].numpy()
-
-
-def _get_num_param(params, input_node):
-    return _get_param(params, input_node).item()
-
-
-def _get_list_param(params, input_node, mod):
-    try:
-        return _get_param(params, input_node).tolist()
-    except (IndexError, KeyError, AttributeError):
-        return _infer_value(input_node, params, mod).numpy().tolist()
-
-
-def _get_tuple_param(params, input_node):
-    return tuple(_get_param(params, input_node))
-
-
-def _need_prelude_for_shape_inference(op):
-    return "TensorArray" in op
-
-
-def _get_more_static_shape(shape0, shape1):
-    """Compare two shapes with the same rank,
-    and return the one with fewer symbolic dimension.
-    """
-    assert len(shape0) == len(shape1)
-    num_sym_dim0 = 0
-    num_sym_dim1 = 0
-    for dim0, dim1 in zip(list(shape0), list(shape1)):
-        if not isinstance(dim0, int):
-            num_sym_dim0 += 1
-        if not isinstance(dim1, int):
-            num_sym_dim1 += 1
-
-    if num_sym_dim0 < num_sym_dim1:
-        return shape0
-    return shape1
-
-
-def _get_more_static_shape_rank(shape0, shape1):
-    """Compare two shapes with different rank,
-    and return the one with fewer symbolic dimension.
-    """
-    num_sym_dim0 = sum([not isinstance(dim, (int, tvm.tir.expr.IntImm)) for dim in list(shape0)])
-    num_sym_dim1 = sum([not isinstance(dim, (int, tvm.tir.expr.IntImm)) for dim in list(shape1)])
-
-    if num_sym_dim0 < num_sym_dim1:
-        return shape0
-    return shape1
-
-
-def _rsqrt():
-    def _impl(inputs, attr, params, mod):
-        inputs.append(tvm.relay.const(-0.5, attr["T"].name))
-        return AttrCvt(op_name="power")(inputs, attr)
-
-    return _impl
-
-
-def _argx(func, func_name):
-    """A common wrapper for argmin and argmax operations"""
-
-    def _impl(inputs, attr, params, mod):
-        try:
-            # In Tensorflow, `axis` argument is a Tensor, not attribute. We
-            # support the case where it inputs from a scalar constant.
-            axis_input_value = [_get_num_param(params, inputs[1])]
-        except (IndexError, KeyError):
-            raise TypeError(f"Unsupported argument for `{func_name}` : `axis` should be a constant")
-        out = func(inputs[0], axis=axis_input_value, keepdims=False)
-        dtype = attr["output_type"].name
-        if dtype != "int32":
-            out = _op.cast(out, dtype=dtype)
-        return out
-
-    return _impl
-
-
-def _elemwise(name):
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 2, f"{name} take 2 inputs, {len(inputs)} given"
-        return get_relay_op(name)(*inputs)
-
-    return _impl
-
-
-def _pool3d(name):
-    def _impl(inputs, attr, params, mod):
-        attr["data_format"] = attr["data_format"].decode("utf-8")
-        flip_layout = False
-
-        input_shape = _infer_shape(inputs[0], mod)
-
-        if attr["data_format"] == "NDHWC":
-            attr["kernel_shape"] = (attr["ksize"][1], attr["ksize"][2], attr["ksize"][3])
-            attr["strides"] = (attr["strides"][1], attr["strides"][2], attr["strides"][3])
-        elif attr["data_format"] == "NCDHW":
-            attr["kernel_shape"] = (attr["ksize"][2], attr["ksize"][3], attr["ksize"][4])
-            attr["strides"] = (attr["strides"][2], attr["strides"][3], attr["strides"][4])
-        else:
-            msg = (
-                f'Value {attr["data_format"]} of attribute "data_format" of operator Pooling '
-                f"is not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-        if attr["data_format"] == "NDHWC":
-            input_shape = [_infer_shape(inputs[0], mod)[i] for i in (0, 4, 1, 2, 3)]
-            inputs[0] = _op.transpose(inputs[0], axes=(0, 4, 1, 2, 3))
-            attr["data_format"] = "NCDHW"
-            flip_layout = True
-
-        attr["padding"] = attr["padding"].decode("utf-8")
-
-        if attr["padding"] == "VALID":
-            attr["padding"] = [0, 0, 0, 0, 0, 0]
-        elif attr["padding"] == "SAME":
-            stride_d, stride_h, stride_w = attr["strides"]
-            kernel_d, kernel_h, kernel_w = attr["kernel_shape"]
-            if attr["data_format"] == "NDHWC":
-                in_d = input_shape[1]
-                in_h = input_shape[2]
-                in_w = input_shape[3]
-            else:
-                in_d = input_shape[2]
-                in_h = input_shape[3]
-                in_w = input_shape[4]
-            pad_d = _get_pad_pair(in_d, kernel_d, stride_d)
-            pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
-
-            attr["padding"] = [pad_d[0], pad_v[0], pad_h[0], pad_d[1], pad_v[1], pad_h[1]]
-        else:
-            msg = (
-                f'Value {attr["padding"]} in attribute "padding" of operator Pooling is '
-                f"not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        if name == "avg_pool":
-            attr["count_include_pad"] = False
-        attr["ceil_mode"] = False
-        out = AttrCvt(
-            op_name=name,
-            transforms={"kernel_shape": "pool_size", "data_format": "layout"},
-            ignores=["ksize"],
-        )(inputs, attr)
-        if flip_layout:
-            out = _op.transpose(out, axes=(0, 2, 3, 4, 1))
-        return out
-
-    return _impl
-
-
-def _pooling(name):
-    def _impl(inputs, attr, params, mod):
-
-        attr["data_format"] = attr["data_format"].decode("utf-8")
-        flip_layout = False
-
-        input_shape = _infer_shape(inputs[0], mod)
-
-        if attr["data_format"] == "NHWC":
-            attr["kernel_shape"] = (attr["ksize"][1], attr["ksize"][2])
-            attr["strides"] = (attr["strides"][1], attr["strides"][2])
-        elif attr["data_format"] == "NCHW":
-            attr["kernel_shape"] = (attr["ksize"][2], attr["ksize"][3])
-            attr["strides"] = (attr["strides"][2], attr["strides"][3])
-        else:
-            msg = (
-                f'Value {attr["data_format"]} of attribute "data_format" of operator Pooling '
-                f"is not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        if attr["_target_layout"] == "NCHW" and attr["data_format"] == "NHWC":
-            tmp_shape = _infer_shape(inputs[0], mod)
-            input_shape = [tmp_shape[ii] for ii in (0, 3, 1, 2)]
-            inputs[0] = _op.transpose(inputs[0], axes=(0, 3, 1, 2))
-            attr["data_format"] = "NCHW"
-            flip_layout = True
-
-        # Fix padding
-        attr["padding"] = attr["padding"].decode("utf-8")
-
-        if attr["padding"] == "VALID":
-            attr["padding"] = [0, 0]
-        elif attr["padding"] == "SAME":
-            stride_h, stride_w = attr["strides"]
-            kernel_h, kernel_w = attr["kernel_shape"]
-            if attr["data_format"] == "NHWC":
-                in_h = input_shape[1]
-                in_w = input_shape[2]
-            else:
-                in_h = input_shape[2]
-                in_w = input_shape[3]
-
-            pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
-
-            attr["padding"] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]]
-        elif attr["padding"] == "EXPLICIT":
-            paddings = attr["explicit_paddings"]
-            assert len(paddings) == 8
-            if flip_layout or attr["data_format"] == "NHWC":
-                attr["padding"] = [paddings[2], paddings[4], paddings[3], paddings[5]]
-            else:
-                attr["padding"] = [paddings[4], paddings[6], paddings[5], paddings[7]]
-        else:
-            msg = (
-                f'Value {attr["padding"]} in attribute "padding" of operator Pooling is '
-                f"not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        if name == "avg_pool":
-            attr["count_include_pad"] = False
-
-        out = AttrCvt(
-            op_name=_dimension_picker(name),
-            transforms={"kernel_shape": "pool_size", "data_format": "layout"},
-            ignores=["ksize", "explicit_paddings"],
-            extras={"ceil_mode": False},
-            custom_check=_dimension_constraint(),
-        )(inputs, attr)
-
-        if flip_layout:
-            out = _op.transpose(out, axes=(0, 2, 3, 1))
-
-        return out
-
-    return _impl
-
-
-def _conv(opname):
-    def _impl(inputs, attr, params, mod):
-        attr["data_format"] = attr["data_format"].decode("utf-8")
-        flip_layout = False
-
-        if opname == "conv_transpose" and attr["data_format"] == "NHWC":
-            # transform to NCHW for TVM backend compatible and set 'flip_layout'
-            # to have output flip back to NHWC
-            inputs[2] = _op.transpose(inputs[2], axes=(0, 3, 1, 2))
-            attr["strides"][1], attr["strides"][2], attr["strides"][3] = (
-                attr["strides"][3],
-                attr["strides"][1],
-                attr["strides"][2],
-            )
-            attr["data_format"] = "NCHW"
-
-            # Check whether output shapes attribute is set and not None
-            if (
-                opname == "conv_transpose"
-                and len(attr["_output_shapes"]) > 0
-                and attr["_output_shapes"][0]
-            ):
-                tmp_shape = attr["_output_shapes"][0]
-                tmp_shape = [tmp_shape[ii] for ii in (0, 3, 1, 2)]
-                attr["_output_shapes"][0] = tmp_shape
-
-            flip_layout = True
-
-        inputs_data = inputs[0] if opname != "conv_transpose" else inputs[2]
-
-        # NCHW Layout require weights transpose
-        weights_shape = _infer_shape(inputs[1], mod)
-        if attr["data_format"] == "NCHW":
-            tmp_shape = weights_shape
-            if opname in ["conv", "conv_transpose"]:
-                tmp_shape = [tmp_shape[ii] for ii in (3, 2, 0, 1)]
-                inputs[1] = _op.transpose(inputs[1], axes=(3, 2, 0, 1))
-            else:
-                tmp_shape = [tmp_shape[ii] for ii in (2, 3, 0, 1)]
-                inputs[1] = _op.transpose(inputs[1], axes=(2, 3, 0, 1))
-            weights_shape = tmp_shape
-
-        input_shape = _infer_shape(inputs_data, mod)
-        if attr["_target_layout"] == "NCHW" and attr["data_format"] == "NHWC":
-            input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
-            inputs_data = _op.transpose(inputs_data, axes=(0, 3, 1, 2))
-            if opname in ["conv", "conv_transpose"]:
-                weights_shape = [weights_shape[ii] for ii in (3, 2, 0, 1)]
-                inputs[1] = _op.transpose(inputs[1], axes=(3, 2, 0, 1))
-            else:
-                weights_shape = [weights_shape[ii] for ii in (2, 3, 0, 1)]
-                inputs[1] = _op.transpose(inputs[1], axes=(2, 3, 0, 1))
-
-            attr["data_format"] = "NCHW"
-            attr["strides"] = [attr["strides"][ii] for ii in (0, 3, 1, 2)]
-            flip_layout = True
-
-        if attr["data_format"] == "NHWC":
-            in_channels = input_shape[3]
-            kernel_h, kernel_w, _, depth_mult = weights_shape
-            attr["kernel_shape"] = (weights_shape[0], weights_shape[1])
-            if opname == "conv":
-                attr["channels"] = weights_shape[3]
-            elif opname == "conv_transpose":
-                attr["channels"] = weights_shape[2]
-            else:
-                attr["channels"] = input_shape[3] * depth_mult
-
-            if "dilations" in attr:
-                attr["dilations"] = (attr["dilations"][1], attr["dilations"][2])
-            attr["strides"] = (attr["strides"][1], attr["strides"][2])
-        elif attr["data_format"] == "NCHW":
-            in_channels = input_shape[1]
-            _, depth_mult, kernel_h, kernel_w = weights_shape
-            attr["kernel_shape"] = (weights_shape[2], weights_shape[3])
-            if opname == "conv":
-                attr["channels"] = weights_shape[0]
-            elif opname == "conv_transpose":
-                attr["channels"] = weights_shape[1]
-            else:
-                attr["channels"] = input_shape[1] * depth_mult
-                if attr["channels"] < 0:
-                    attr["channels"] *= -1
-
-            if "dilations" in attr:
-                attr["dilations"] = (attr["dilations"][2], attr["dilations"][3])
-            attr["strides"] = (attr["strides"][2], attr["strides"][3])
-        else:
-            msg = (
-                f'Value {attr["data_format"]} in attribute "data_format" of operator Conv is '
-                f"not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        if opname == "depthwise":
-            attr["groups"] = in_channels
-
-        # Fix padding
-        attr["padding"] = attr["padding"].decode("utf-8")
-
-        if attr["padding"] == "VALID":
-            attr["padding"] = [0, 0]
-        elif attr["padding"] == "SAME":
-            stride_h, stride_w = attr["strides"]
-            kernel_h, kernel_w = attr["kernel_shape"]
-
-            pdata_shape = input_shape
-            # Check whether output shapes attribute is set and not None
-            if (
-                opname == "conv_transpose"
-                and len(attr["_output_shapes"]) > 0
-                and attr["_output_shapes"][0]
-            ):
-                pdata_shape = attr["_output_shapes"][0]
-
-            if attr["data_format"] == "NHWC":
-                in_h = pdata_shape[1]
-                in_w = pdata_shape[2]
-            else:
-                in_h = pdata_shape[2]
-                in_w = pdata_shape[3]
-
-            dilation_h = attr["dilations"][0]
-            dilation_w = attr["dilations"][1]
-            dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-            dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-            pad_v = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
-
-            attr["padding"] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]]
-        elif attr["padding"] == "EXPLICIT":
-            paddings = attr["explicit_paddings"]
-            assert len(paddings) == 8
-            if flip_layout or attr["data_format"] == "NHWC":
-                attr["padding"] = [paddings[2], paddings[4], paddings[3], paddings[5]]
-            else:
-                attr["padding"] = [paddings[4], paddings[6], paddings[5], paddings[7]]
-        else:
-            msg = (
-                f'Value {attr["padding"]} in attribute "padding" of operator Conv is not ' f"valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        if "kernel_layout" not in attr:
-            if opname == "conv":
-                attr["kernel_layout"] = "HWIO" if attr["data_format"] == "NHWC" else "OIHW"
-            elif opname == "conv_transpose":
-                # conv_transpose has weights be IOHW, because the attr["data_format"] always be NCHW
-                attr["kernel_layout"] = "IOHW"
-            else:
-                attr["kernel_layout"] = "HWOI" if attr["data_format"] == "NHWC" else "OIHW"
-
-        # Ignore the new attributes from TF2.0, for now.
-        out = AttrCvt(
-            op_name=_dimension_picker(
-                "conv", surfix="_transpose" if opname == "conv_transpose" else ""
-            ),
-            ignores=["explicit_paddings"],
-            transforms={
-                "kernel_shape": "kernel_size",
-                "data_format": "data_layout",
-                "dilations": ("dilation", (0, 0)),
-                "group": ("groups", 1),
-            },
-            custom_check=_dimension_constraint(),
-        )([inputs_data, inputs[1]], attr)
-
-        if flip_layout:
-            out = _op.transpose(out, axes=(0, 2, 3, 1))
-
-        return out
-
-    return _impl
-
-
-# Dilation2d
-def _dilation2d():
-    def _impl(inputs, attr, params, mod):
-        if "data_format" not in attr:
-            attr["data_format"] = "NHWC"
-
-        input_shape = _infer_shape(inputs[0], mod)
-        weights_shape = _infer_shape(inputs[1], mod)
-
-        if attr["_target_layout"] == "NCHW" and attr["data_format"] == "NHWC":
-            input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
-            inputs[0] = _op.transpose(inputs[0], axes=(0, 3, 1, 2))
-            weights_shape = [weights_shape[ii] for ii in (2, 0, 1)]
-            inputs[1] = _op.transpose(inputs[1], axes=(2, 0, 1))
-            attr["data_format"] = "NCHW"
-
-        if attr["data_format"] in ["NHWC", "NCHW"]:
-            if "rates" in attr:
-                attr["dilations"] = attr["rates"]
-            if "dilations" in attr:
-                attr["dilations"] = (attr["dilations"][1], attr["dilations"][2])
-            attr["strides"] = (attr["strides"][1], attr["strides"][2])
-        else:
-            msg = (
-                f'Value {attr["data_format"]} in attribute "data_format" of operator Dilation2D is '
-                f"not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        attr["padding"] = attr["padding"].decode("utf-8")
-        if attr["padding"] == "VALID":
-            attr["padding"] = [0, 0]
-        elif attr["padding"] == "SAME":
-            stride_h, stride_w = attr["strides"]
-            if attr["data_format"] == "NHWC":
-                kernel_h, kernel_w = weights_shape[0], weights_shape[1]
-            else:
-                kernel_h, kernel_w = weights_shape[1], weights_shape[2]
-            if attr["data_format"] == "NHWC":
-                in_h = input_shape[1]
-                in_w = input_shape[2]
-            else:
-                in_h = input_shape[2]
-                in_w = input_shape[3]
-
-            dilation_h = attr["dilations"][0]
-            dilation_w = attr["dilations"][1]
-            dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-            dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-            pad_v = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
-
-            if attr["data_format"] == "NHWC":
-                inputs[0] = _op.nn.pad(
-                    data=inputs[0],
-                    pad_width=((0, 0), (pad_v[0], pad_v[1]), (pad_h[0], pad_h[1]), (0, 0)),
-                )
-            else:
-                inputs[0] = _op.nn.pad(
-                    data=inputs[0],
-                    pad_width=((0, 0), (0, 0), (pad_v[0], pad_v[1]), (pad_h[0], pad_h[1])),
-                )
-
-            attr["padding"] = [0, 0]
-
-        else:
-            msg = (
-                f'Value {attr["padding"]} in attribute "padding" of operator Dilation2d is not '
-                f"valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        attr["kernel_layout"] = "HWI" if attr["data_format"] == "NHWC" else "IHW"
-        out = AttrCvt(
-            op_name="dilation2d",
-            ignores=["explicit_paddings", "rates"],
-            transforms={"data_format": "data_layout"},
-        )([inputs[0], inputs[1]], attr)
-        if attr["_target_layout"] == "NCHW":
-            out = _op.transpose(out, axes=(0, 2, 3, 1))
-        return out
-
-    return _impl
-
-
-def _conv3d(opname):
-    def _impl(inputs, attr, params, mod):
-        attr["data_format"] = attr["data_format"].decode("utf-8")
-        flip_layout = False
-
-        inputs_data = inputs[0] if opname != "conv_transpose" else inputs[2]
-
-        # NCDHW Layout require weights transpose
-        weights_shape = _infer_shape(inputs[1], mod)
-        if attr["data_format"] == "NCDHW":
-            tmp_shape = weights_shape
-            tmp_shape = [tmp_shape[ii] for ii in (4, 3, 0, 1, 2)]
-            inputs[1] = _op.transpose(inputs[1], axes=(4, 3, 0, 1, 2))
-            weights_shape = tmp_shape
-
-        input_shape = _infer_shape(inputs_data, mod)
-
-        if attr["_target_layout"] == "NCDHW" and attr["data_format"] == "NDHWC":
-            input_shape = [input_shape[ii] for ii in (0, 4, 1, 2, 3)]
-            inputs_data = _op.transpose(inputs_data, axes=(0, 4, 1, 2, 3))
-            weights_shape = [weights_shape[ii] for ii in (4, 3, 0, 1, 2)]
-            inputs[1] = _op.transpose(inputs[1], axes=(4, 3, 0, 1, 2))
-
-            attr["data_format"] = "NCDHW"
-            attr["strides"] = [attr["strides"][ii] for ii in (0, 4, 1, 2, 3)]
-            flip_layout = True
-
-        if attr["data_format"] == "NDHWC":
-            kernel_d, kernel_h, kernel_w, _, _ = weights_shape
-            attr["kernel_shape"] = (kernel_d, kernel_h, kernel_w)
-            if opname == "conv":
-                attr["channels"] = weights_shape[4]
-            elif opname == "conv_transpose":
-                attr["channels"] = weights_shape[3]
-
-            if "dilations" in attr:
-                attr["dilations"] = (
-                    attr["dilations"][1],
-                    attr["dilations"][2],
-                    attr["dilations"][3],
-                )
-            attr["strides"] = (attr["strides"][1], attr["strides"][2], attr["strides"][3])
-        elif attr["data_format"] == "NCDHW":
-            _, _, kernel_d, kernel_h, kernel_w = weights_shape
-            attr["kernel_shape"] = (kernel_d, kernel_h, kernel_w)
-            if opname == "conv":
-                attr["channels"] = weights_shape[0]
-            elif opname == "conv_transpose":
-                attr["channels"] = weights_shape[1]
-
-            if "dilations" in attr:
-                attr["dilations"] = (
-                    attr["dilations"][2],
-                    attr["dilations"][3],
-                    attr["dilations"][4],
-                )
-            attr["strides"] = (attr["strides"][2], attr["strides"][3], attr["strides"][4])
-        else:
-            msg = (
-                f'Value {attr["data_format"]} in attribute "data_format" of operator Conv is '
-                f"not valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        # Fix padding
-        attr["padding"] = attr["padding"].decode("utf-8")
-
-        if attr["padding"] == "VALID":
-            attr["padding"] = [0, 0, 0]
-        elif attr["padding"] == "SAME":
-            stride_d, stride_h, stride_w = attr["strides"]
-            kernel_d, kernel_h, kernel_w = attr["kernel_shape"]
-
-            pdata_shape = input_shape
-            if opname == "conv_transpose" and len(attr["_output_shapes"]) > 0:
-                pdata_shape = attr["_output_shapes"][0]
-
-            if attr["data_format"] == "NDHWC":
-                in_d = pdata_shape[1]
-                in_h = pdata_shape[2]
-                in_w = pdata_shape[3]
-            else:
-                in_d = pdata_shape[2]
-                in_h = pdata_shape[3]
-                in_w = pdata_shape[4]
-
-            dilation_d = attr["dilations"][0]
-            dilation_h = attr["dilations"][1]
-            dilation_w = attr["dilations"][2]
-            dilated_kernel_d = (kernel_d - 1) * dilation_d + 1
-            dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-            dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-            pad_d = _get_pad_pair(in_d, dilated_kernel_d, stride_d)
-            pad_v = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
-
-            attr["padding"] = [pad_d[0], pad_v[0], pad_h[0], pad_d[1], pad_v[1], pad_h[1]]
-        elif attr["padding"] == "EXPLICIT":
-            paddings = attr["explicit_paddings"]
-            assert len(paddings) == 10
-            if flip_layout or attr["data_format"] == "NDHWC":
-                attr["padding"] = [
-                    paddings[2],
-                    paddings[4],
-                    paddings[6],
-                    paddings[3],
-                    paddings[5],
-                    paddings[7],
-                ]
-            else:
-                attr["padding"] = [
-                    paddings[4],
-                    paddings[6],
-                    paddings[8],
-                    paddings[5],
-                    paddings[7],
-                    paddings[9],
-                ]
-        else:
-            msg = (
-                f'Value {attr["padding"]} in attribute "padding" of operator Conv is not ' f"valid."
-            )
-            raise tvm.error.OpAttributeInvalid(msg)
-
-        if "kernel_layout" not in attr:
-            if opname == "conv":
-                attr["kernel_layout"] = "DHWIO" if attr["data_format"] == "NDHWC" else "OIDHW"
-            elif opname == "conv_transpose":
-                attr["kernel_layout"] = "DHWOI" if attr["data_format"] == "NDHWC" else "IODHW"
-
-        use_bias = len(inputs) == (3 if opname != "conv_transpose" else 4)
-        channel_axis = 1 if attr["data_format"] == "NCDHW" else 4
-
-        # Ignore the new attributes from TF2.0, for now.
-        out = AttrCvt(
-            op_name=_dimension_picker(
-                "conv", surfix="_transpose" if opname == "conv_transpose" else ""
-            ),
-            ignores=["explicit_paddings", "Tshape"],
-            transforms={
-                "kernel_shape": "kernel_size",
-                "data_format": "data_layout",
-                "dilations": ("dilation", (0, 0)),
-                "group": ("groups", 1),
-            },
-            custom_check=_dimension_constraint(),
-        )([inputs_data, inputs[1]], attr)
-
-        if use_bias:
-            out = _op.nn.bias_add(
-                out, inputs[2] if opname != "conv_transpose" else inputs[3], axis=channel_axis
-            )
-
-        if flip_layout:
-            out = _op.transpose(out, axes=(0, 2, 3, 4, 1))
-
-        return out
-
-    return _impl
-
-
-def _nms(return_scores=False):
-    def _impl(inputs, attr, params, mod):
-        # Get parameter values
-        try:
-            max_output_size = int(np.atleast_1d(inputs[2].data.numpy().astype("int64"))[0])
-        except Exception:
-            try:
-                max_output_size = (
-                    _infer_value(inputs[2], params, mod).numpy().astype("int64").tolist()[0]
-                )
-            except Exception:
-                max_output_size = inputs[2]
-        iou_threshold = np.atleast_1d(inputs[3].data.numpy())[0]
-        # score_threshold was introduced from V3
-        score_threshold = np.atleast_1d(inputs[4].data.numpy())[0] if len(inputs) > 4 else 0.0
-        pad_output = "pad_to_max_output_size"
-
-        # Generate data with shape (1, num_anchors, 5)
-        scores = AttrCvt(
-            op_name="expand_dims",
-            ignores=["T_threshold", pad_output],
-            extras={"axis": -1, "num_newaxis": 1},
-        )([inputs[1]], attr)
-        data = get_relay_op("concatenate")([scores, inputs[0]], -1)
-        data = get_relay_op("expand_dims")(data, 0, 1)
-
-        # reason why using get_valid_counts is for inference performance
-        ct, data, indices = get_relay_op("get_valid_counts")(
-            data, score_threshold=score_threshold, id_index=-1, score_index=0
-        )
-        # TensorFlow NMS doesn't have parameter top_k
-        top_k = -1
-        # TF doesn't have class id for nms input
-        score_index = 0
-        nms_ret = get_relay_op("non_max_suppression")(
-            data=data,
-            valid_count=ct,
-            indices=indices,
-            max_output_size=max_output_size,
-            iou_threshold=iou_threshold,
-            force_suppress=True,
-            top_k=top_k,
-            coord_start=1,
-            score_index=score_index,
-            id_index=-1,
-            return_indices=True,
-            invalid_to_bottom=False,
-        )
-
-        if pad_output in attr and attr[pad_output]:
-            return nms_ret
-        # squeeze it, TF NMS is not batched
-        size = get_relay_op("squeeze")(nms_ret[1], axis=[1])
-        data_slice = get_relay_op("squeeze")(nms_ret[0], axis=[0])
-
-        # slice to get the dynamic result
-        ret = get_relay_op("strided_slice")(
-            data_slice, begin=_expr.const([0]), end=size, slice_mode="size"
-        )
-
-        # NonMaxSuppressionV5 returns scores. pad_output is always False for NMSv5.
-        if return_scores:
-            if "soft_nms_sigma" in attr and attr["soft_nms_sigma"] != 0.0:
-                raise tvm.error.OpAttributeUnImplemented(
-                    "soft_nms_sigma for NonMaxSuppressionV5 is not supported"
-                )
-            ret_scores = _op.take(inputs[1], ret, axis=0)
-            return _expr.TupleWrapper(_expr.Tuple([ret, ret_scores, size]), 3)
-
-        return ret
-
-    return _impl
-
-
-def convert_combined_nms_with_all_class_nms(
-    batch_size,
-    max_output_boxes_per_batch,
-    num_class,
-    boxes,
-    scores,
-    max_output_boxes_per_class,
-    iou_threshold,
-    score_threshold,
-    max_total_size,
-    clip_boxes,
-):
-    """Converts TF combined_nms using Relay all_class_max_suppression op"""
-    (selected_indices, selected_scores, num_detections) = _op.vision.all_class_non_max_suppression(
-        boxes,
-        scores,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        output_format="tensorflow",
-    )
-    box_range = _op.arange(
-        _op.const(0, dtype="int64"), _op.const(max_total_size, dtype="int64"), dtype="int64"
-    )
-    assert isinstance(batch_size, int), "dynamic batch size not supported yet."
-    tile_batch_reps = _op.const([batch_size, 1])
-    box_range_2d = _op.tile(box_range, tile_batch_reps)
-    valid_mask = _op.cast(
-        _op.less(box_range_2d, _op.expand_dims(num_detections, axis=1)), "float32"
-    )
-
-    def select_topk(do_zero_pad):
-        def true_branch():
-            arange = _op.arange(
-                _op.const(0, dtype="int64"),
-                _op.const(max_output_boxes_per_batch, dtype="int64"),
-                dtype="int64",
-            )
-            pad = _op.full(
-                _op.const(0, dtype="int64"), (max_total_size - max_output_boxes_per_batch,)
-            )
-            topk_indices = _op.tile(_op.concatenate([arange, pad], 0), tile_batch_reps)
-            nmsed_scores = _op.gather(selected_scores, 1, topk_indices)
-            nmsed_scores = nmsed_scores * valid_mask
-            return nmsed_scores, topk_indices
-
-        def false_branch():
-            if isinstance(max_output_boxes_per_class, int):
-                # Do topk on smaller input if possible
-                slice_mx = _op.const([max_output_boxes_per_class * num_class], dtype="int64")
-                selected_scores_slice = _op.strided_slice(
-                    selected_scores, begin=_op.const([0], dtype="int64"), end=slice_mx, axes=[1]
-                )
-            else:
-                selected_scores_slice = selected_scores
-            return _op.topk(selected_scores_slice, k=max_total_size, axis=1, ret_type="both")
-
-        # TODO(masahi): support dynamic num_boxes
-        # return _expr.If(do_zero_pad, true_branch(), false_branch())
-        return true_branch() if do_zero_pad else false_branch()
-
-    assert isinstance(max_output_boxes_per_batch, int), "dynamic number of boxes not supported yet."
-    nmsed_scores, topk_indices = select_topk(max_output_boxes_per_batch < max_total_size)
-
-    indices = _op.take(selected_indices, topk_indices, axis=1, batch_dims=1)
-    nmsed_box_indices = _op.take(indices, _op.const(1), axis=2)
-    nmsed_classes = _op.take(indices, _op.const(0), axis=2)
-    nmsed_classes = _op.cast(nmsed_classes, "float32")
-    nmsed_boxes = _op.take(boxes, nmsed_box_indices, axis=1, batch_dims=1)
-    num_detections = _op.minimum(num_detections, _op.const(max_total_size, dtype="int64"))
-
-    if clip_boxes:
-        nmsed_boxes = _op.maximum(nmsed_boxes, _expr.const(0, dtype="float32"))
-        nmsed_boxes = _op.minimum(nmsed_boxes, _expr.const(1, dtype="float32"))
-
-    nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2)
-
-    return _expr.TupleWrapper(
-        _expr.Tuple([nmsed_boxes, nmsed_scores, nmsed_classes, num_detections]), 4
-    )
-
-
-def _combined_nms():
-    def _impl(inputs, attr, params, mod):
-        # Get parameter values
-        boxes = inputs[0]
-        scores = inputs[1]
-        try:
-            max_output_size = int(np.atleast_1d(inputs[2].data.numpy().astype("int64"))[0])
-        except Exception:
-            try:
-                max_output_size = (
-                    _infer_value(inputs[2], params, mod).numpy().astype("int64").tolist()[0]
-                )
-            except Exception:
-                max_output_size = inputs[2]
-        max_total_size = inputs[3]
-        iou_threshold = np.atleast_1d(inputs[4].data.numpy())[0]
-        score_threshold = np.atleast_1d(inputs[5].data.numpy())[0]
-        if attr["pad_per_class"]:
-            raise tvm.error.OpAttributeUnImplemented(
-                "pad_per_class for CombinedNonMaxSuppression is not supported"
-            )
-        boxes_shape = _infer_shape(inputs[0], mod)
-        scores_shape = _infer_shape(inputs[1], mod)
-        batch_size = boxes_shape[0]
-        num_anchors = boxes_shape[1]
-        q = boxes_shape[2]
-        num_classes = scores_shape[2]
-
-        assert isinstance(batch_size, int) and isinstance(
-            num_anchors, int
-        ), "Dynamic inputs not supported yet"
-
-        if q == 1:
-            boxes = _op.squeeze(boxes, axis=[2])
-            scores_trans = _op.transpose(scores, [0, 2, 1])
-            max_output_boxes_per_batch = num_anchors * num_classes
-            return convert_combined_nms_with_all_class_nms(
-                batch_size,
-                max_output_boxes_per_batch,
-                num_classes,
-                boxes,
-                scores_trans,
-                max_output_size,
-                iou_threshold,
-                score_threshold,
-                max_total_size.data.numpy().item(),
-                attr["clip_boxes"],
-            )
-
-        boxes = _op.reshape(boxes, newshape=[batch_size, num_anchors * num_classes, 4])
-        scores = _op.reshape(scores, newshape=[batch_size, num_anchors * num_classes, 1])
-
-        # In TF, class is specified by memory layout only.
-        ids = _op.arange(_op.const(num_classes, dtype="float32"))
-        ids = _op.broadcast_to(ids, (batch_size, num_anchors, num_classes))
-        ids = _op.reshape(ids, newshape=[batch_size, num_anchors * num_classes, 1])
-
-        data = _op.concatenate([ids, scores, boxes], -1)
-        ct, data, indices = _op.vision.get_valid_counts(
-            data, score_threshold=score_threshold, id_index=0, score_index=1
-        )
-        nms_ret = _op.vision.non_max_suppression(
-            data=data,
-            valid_count=ct,
-            indices=indices,
-            max_output_size=max_output_size,
-            iou_threshold=iou_threshold,
-            force_suppress=False,
-            top_k=-1,
-            coord_start=2,
-            score_index=1,
-            id_index=0,
-            return_indices=False,
-            invalid_to_bottom=True,
-        )
-        # Dynamic slice to max_total_size
-        neg_one = _expr.const([-1])
-        slice_end = _op.concatenate(
-            [neg_one, _op.expand_dims(max_total_size, axis=0), neg_one], axis=0
-        )
-        nms_ret = _op.strided_slice(
-            nms_ret, begin=[0, 0, 0], end=slice_end, strides=[1, 1, 1], slice_mode="size"
-        )
-
-        # Slice output into boxes, scores, classes
-        nmsed_boxes = _op.strided_slice(
-            nms_ret, begin=[0, 0, 2], end=[-1, -1, 4], slice_mode="size"
-        )
-        if attr["clip_boxes"]:
-            nmsed_boxes = _op.maximum(nmsed_boxes, _expr.const(0, dtype="float32"))
-            nmsed_boxes = _op.minimum(nmsed_boxes, _expr.const(1, dtype="float32"))
-        nmsed_scores = _op.strided_slice(
-            nms_ret, begin=[0, 0, 1], end=[-1, -1, 1], slice_mode="size"
-        )
-        nmsed_scores = _op.squeeze(nmsed_scores, axis=[2])
-        nmsed_classes = _op.strided_slice(
-            nms_ret, begin=[0, 0, 0], end=[-1, -1, 1], slice_mode="size"
-        )
-        nmsed_classes = _op.squeeze(nmsed_classes, axis=[2])
-        # Get number of valid boxes
-        nms_count = _op.sum(
-            _op.cast(_op.greater(nmsed_scores, _expr.const(0, dtype="float32")), "int32"), axis=1
-        )
-
-        # TVM uses -1 for invalid outputs while TF uses 0
-        box_range = _op.arange(_expr.const(0, dtype="int32"), max_total_size, dtype="int32")
-        shape = _op.strided_slice(_op.shape_of(nmsed_boxes), begin=[0], end=[2])
-        box_range = _op.broadcast_to(box_range, shape)
-        valid_mask = _op.cast(_op.less(box_range, _op.expand_dims(nms_count, axis=1)), "float32")
-        nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2)
-        # Could instead use mask for scores, classes if negative values are possible.
-        nmsed_scores = _op.maximum(nmsed_scores, _expr.const(0, dtype="float32"))
-        nmsed_classes = _op.maximum(nmsed_classes, _expr.const(0, dtype="float32"))
-
-        return _expr.TupleWrapper(
-            _expr.Tuple([nmsed_boxes, nmsed_scores, nmsed_classes, nms_count]), 4
-        )
-
-    return _impl
-
-
-def _decode_image():
-    def _impl(inputs, attr, params, mod):
-        # Image decode wrapper: Expecting user to feed decoded input to next layer drop this layer.
-        warnings.warn("DecodeJpeg: It's a pass through, please handle preprocessing before input")
-        return inputs[0]
-
-    return _impl
-
-
-def _unravel_index():
-    def _impl(inputs, attr, params, mod):
-        return _op.unravel_index(inputs[0], inputs[1])
-
-    return _impl
-
-
-def _crop_and_resize():
-    def _impl(inputs, attr, params, mod):
-        # input image is a 4-D tensor of shape [batch, image_height, image_width, depth]
-        # boxes is a 2-D tensor of shape [num_boxes, 4], 4 is for [y1, x1, y2, x2]
-        crop_size = _get_list_param(params, inputs[3], mod)
-
-        method = attr["method"].decode()
-        method = "nearest_neighbor" if method == "nearest" else method
-        if method not in ["bilinear", "nearest_neighbor"]:
-            raise tvm.error.OpAttributeUnImplemented(f"Method {method} is not supported")
-        layout = attr["layout"] if "layout" in attr else "NHWC"
-        extrapolation_value = attr["extrapolation_value"]
-
-        return get_relay_op("crop_and_resize")(
-            inputs[0], inputs[1], inputs[2], crop_size, layout, method, extrapolation_value
-        )
-
-    return _impl
-
-
-def _cast():
-    def _impl(inputs, attr, params, mod):
-        return inputs[0].astype(attr["DstT"].name)
-
-    return _impl
-
-
-def _expand_dims():
-    def _impl(inputs, attr, params, mod):
-        dim_input = inputs.pop(1)
-        axis = _get_num_param(params, dim_input)
-        return AttrCvt(
-            op_name="expand_dims",
-            ignores=["Tdim", "N"],
-            extras={"axis": int(axis), "num_newaxis": 1},
-        )(inputs, attr)
-
-    return _impl
-
-
-def _expm1():
-    # op description: https://www.tensorflow.org/api_docs/python/tf/math/expm1
-    def _impl(inputs, attr, params, mod):
-        exp_out = get_relay_op("exp")(inputs[0])
-        return exp_out - tvm.relay.const(1.0)
-
-    return _impl
-
-
-def _resize(method):
-    def _impl(inputs, attr, params, mod):
-        if attr["_output_shapes"][0] is not None:
-            size = attr["_output_shapes"][0][1:3]
-            # Important that the size is defined. If an axis is not, we need to infer what
-            # the shape should be.
-            if -1 in size:
-                size = _infer_value(inputs[1], params, mod).numpy().reshape([-1]).tolist()
-        else:
-            size = _infer_value(inputs[1], params, mod).numpy().reshape([-1]).tolist()
-
-        attr["size"] = size
-        inputs.pop(1)
-        # NHWC
-        attr["layout"] = "NHWC"
-        if attr.pop("align_corners") is True:
-            attr["coordinate_transformation_mode"] = "align_corners"
-        else:
-            attr["coordinate_transformation_mode"] = "asymmetric"
-
-        # Ignore the new attributes from TF2.0, for now.
-        return AttrCvt(
-            op_name="resize2d",
-            ignores=["Tdim", "half_pixel_centers"],
-            extras={"method": method, "roi": None},
-        )(inputs, attr)
-
-    return _impl
-
-
-def _check_numerics():
-    def _impl(inputs, attr, params, mod):
-        # Making a copy node assuming no need to verify
-        return AttrCvt(op_name="copy", ignores=["message"])(inputs, attr)
-
-    return _impl
-
-
-def _assert():
-    # ToDo: In general people want asserts to be gone from TensorFlow graphs
-    # when they are optimizing them, so converting it to a no-op is
-    # reasonable. However, it would be nice to have the option to keep them
-    # once Relay gets a Halt or Assert op.
-    return _no_op()
-
-
-def _no_op():
-    def _impl(inputs, attr, params, mod):
-        # ToDo: This should really be an op that returns nothing, which could
-        # be represented as an empty tuple. It turns out that TVM
-        # infrastructure doesn't like running functions that return None and
-        # also don't like running functions that return an empty tuple. So it
-        # doesn't work, but it should be made to work and then this could be
-        # improved. In the mean time, it is hard to imagine a case where it
-        # matters in any real way that a no-op is converted to a constant 0.
-        return tvm.relay.const(0)
-
-    return _impl
-
-
-def _matmul():
-    def _impl(inputs, attr, params, mod):
-        from .tensorflow import TF_DEFAULT_CONFIGS
-
-        channels = _infer_channels(inputs[1], not attr["transpose_b"])
-        if TF_DEFAULT_CONFIGS["use_dense"]:
-            if attr["transpose_a"]:
-                inputs[0] = _op.transpose(inputs[0], axes=(1, 0))
-            if not attr["transpose_b"]:
-                inputs[1] = _op.transpose(inputs[1], axes=(1, 0))
-            return AttrCvt(
-                op_name="dense",
-                extras={"units": channels},
-                ignores=["transpose_a", "transpose_b", "T"],
-            )(inputs, attr)
-        return AttrCvt(op_name="matmul", extras={"units": channels}, ignores=["T"])(inputs, attr)
-
-    return _impl
-
-
-def _batch_matmul():
-    def _impl(inputs, attr, params, mod):
-        from .tensorflow import TF_DEFAULT_CONFIGS
-
-        input_x = inputs[0]
-        input_y = inputs[1]
-        orig_shape_x = _infer_shape(input_x, mod)
-        orig_shape_y = _infer_shape(input_y, mod)
-        ndim = len(orig_shape_x)
-        ndim_y = len(orig_shape_y)
-
-        is_static = not check_symbolic_shape(orig_shape_x)
-
-        # reshape n-dimensional batch matmul into 3d
-        if ndim > 3:
-            outer_dims = [orig_shape_x[i] for i in range(0, len(orig_shape_x) - 2)]
-            if is_static:
-                num_outer_elts = np.prod(outer_dims)
-                new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1])
-                if ndim_y > 2:
-                    new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1])
-                elif ndim_y == 2:
-                    new_shape_y = (1, orig_shape_y[-2], orig_shape_y[-1])
-            else:  # handle dynamic shape (dyn.reshape op)
-                shape_of_x = list_shape_of(inputs[0], ndim)
-                shape_of_y = list_shape_of(inputs[1], ndim)
-                new_shape_x = [_op.const(1), shape_of_x[-2], shape_of_x[-1]]
-                new_shape_y = [_op.const(1), shape_of_y[-2], shape_of_y[-1]]
-                for i in range(ndim - 2):
-                    new_shape_x[0] *= shape_of_x[i]
-                    new_shape_y[0] *= shape_of_y[i]
-                new_shape_x = _op.concatenate(_op.Tuple(new_shape_x), axis=0)
-                new_shape_y = _op.concatenate(_op.Tuple(new_shape_y), axis=0)
-
-            input_x = _op.reshape(input_x, newshape=new_shape_x)
-            input_y = _op.reshape(input_y, newshape=new_shape_y)
-        elif ndim_y == 2:
-            input_y = _op.reshape(input_y, (1, orig_shape_y[-2], orig_shape_y[-1]))
-        adj_x = attr["adj_x"]
-        adj_y = attr["adj_y"]
-
-        if TF_DEFAULT_CONFIGS["use_nt_batch_matmul"]:
-            # Strictly convert all batch_matmul to NT format
-            input_x = _op.transpose(input_x, axes=[0, 2, 1]) if adj_x else input_x
-            input_y = _op.transpose(input_y, axes=[0, 2, 1]) if not adj_y else input_y
-            ret = get_relay_op("batch_matmul")(input_x, input_y)
-        else:
-            ret = get_relay_op("batch_matmul")(
-                input_x, input_y, transpose_a=adj_x, transpose_b=adj_y
-            )
-
-        # reshape result back to n-dimensional
-        if ndim > 3:
-            if is_static:
-                final_shape = list(orig_shape_x)
-                final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2]
-                final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1]
-            else:
-                # calculate the resulting shape = [shape[:-2], 0, 0]
-                final_shape = list(shape_of_x)
-                final_shape[-2] = shape_of_x[-1] if adj_x else shape_of_x[-2]
-                final_shape[-1] = shape_of_y[-2] if adj_y else shape_of_y[-1]
-                final_shape = _op.concatenate(_op.Tuple(final_shape), axis=0)
-
-            ret = _op.reshape(ret, newshape=final_shape)
-        return ret
-
-    return _impl
-
-
-def _sparse_tensor_dense_matmul():
-    def _impl(inputs, attr, params, mod):
-        # Loading this by default causes TVM to not be loadable from other languages.
-        # Sparse utility from scipy
-        from scipy.sparse import csr_matrix
-
-        assert len(inputs) == 4, "There should be 4 input tensors"
-
-        indices_tensor = _infer_value(inputs[0], params, mod).numpy()
-        values_tensor = _infer_value(inputs[1], params, mod).numpy()
-        dense_shape_tensor = _infer_value(inputs[2], params, mod).numpy()
-
-        data = inputs[3]
-
-        rows = [x[0] for x in indices_tensor]
-        cols = [x[1] for x in indices_tensor]
-
-        # Create scipy sparse Tensor(CSR)
-        weight_sp = csr_matrix(
-            (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist())
-        )
-
-        # As per tensorflow implementation, we have 4 possible input combination
-        # and the first input(A) is always sparse and second input(B) is always dense.
-        # Case 1: A , B , adjoint_a=False, adjoint_b=False  --> A * B
-        # Case 2: A , B , adjoint_a=True,   adjoint_b=False  --> A.T * B
-        # Case 3: A , B , adjoint_a=False, adjoint_b=True    --> A * B.T
-        # Case 4: A , B , adjoint_a=True,   adjoint_b=True    --> A.T * B.T
-        #
-        # Topi implementation for sparse_dense(matmul) has 2 possible input
-        # combination where first input(A) is always dense
-        # and second input(B) is always sparse.
-        # Case 1: A , B, sparse_lhs = False  --> A * B.T
-        # Case 2: A , B, sparse_lhs = True    --> B * A.T
-        #
-        # The mapping would be as below:
-        # TF Case 1: A , B , adjoint_a=False, adjoint_b=False
-        #           --> In TF: A * B   --> In Topi: A * B.T.T
-        #           --> sparse_dense(transpose(B), A, sparse_lhs=True)
-        #
-        # TF Case 2: A , B , adjoint_a=True, adjoint_b=False
-        #           --> In TF: A.T * B   --> In Topi: A.T * B.T.T
-        #           --> sparse_dense(transpose(B), transpose(A), sparse_lhs=True)
-        #
-        # TF Case 3: A , B , adjoint_a=False, adjoint_b=True
-        #           --> In TF: A * B.T   --> In Topi: A * B
-        #           --> sparse_dense(B, A, sparse_lhs=True)
-        #
-        # TF Case 4: A , B , adjoint_a=True, adjoint_b=True
-        #           --> In TF: A.T * B.T   --> In Topi: (B * A.T).T
-        #           --> transpose(sparse_dense(B, transpose(A), sparse_lhs=False))
-
-        # By default, in tensorflow the first input ,i.e., data is sparse
-        sparse_lhs = True
-
-        # TF Case 1:
-        if not attr.get("adjoint_a") and not attr.get("adjoint_b"):
-            data = _op.transpose(data)
-        # TF Case 2:
-        elif attr.get("adjoint_a") and not attr.get("adjoint_b"):
-            data = _op.transpose(data)
-            weight_sp = csr_matrix(weight_sp.transpose())
-        # TF Case 3:
-        elif not attr.get("adjoint_a") and attr.get("adjoint_b"):
-            pass
-        # TF Case 4:
-        # attr.get("adjoint_a") and attr.get("adjoint_b"):
-        else:
-            sparse_lhs = False
-            weight_sp = csr_matrix(weight_sp.transpose())
-
-        weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype)
-        weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype)
-        weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype)
-
-        ret = _op.nn.sparse_dense(data, [weight_data, weight_indices, weight_indptrs], sparse_lhs)
-
-        if not sparse_lhs:
-            # TF Case 4
-            ret = _op.transpose(ret)
-
-        return ret
-
-    return _impl
-
-
-def _sparse_fill_empty_rows():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 4, "There should be 4 input tensors"
-        sparse_indices = inputs[0]
-        sparse_values = inputs[1]
-        sparse_indices_num_cols = _infer_shape(sparse_indices, mod)[1]
-        first_column = _op.split(sparse_indices, sparse_indices_num_cols, axis=1)[0]
-        sorted_indices = _op.argsort(_op.squeeze(first_column))
-        sorted_sparse_indices = _op.take(sparse_indices, sorted_indices, axis=0)
-        sorted_sparse_values = _op.take(sparse_values, sorted_indices, axis=0)
-        new_sparse_indices, new_sparse_values, empty_row_indicator = _op.sparse_fill_empty_rows(
-            sorted_sparse_indices, sorted_sparse_values, inputs[2], inputs[3]
-        )
-
-        return _expr.TupleWrapper(
-            _expr.Tuple([new_sparse_indices, new_sparse_values, empty_row_indicator]), 3
-        )
-
-    return _impl
-
-
-def _sparse_reshape():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 3, "There should be 3 input tensors"
-        new_indices, new_shape = get_relay_op("sparse_reshape")(inputs[0], inputs[1], inputs[2])
-        return _expr.TupleWrapper(_expr.Tuple([new_indices, new_shape]), 2)
-
-    return _impl
-
-
-def _math_segment_sum():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 2, "There should be 2 input tensors"
-        return get_relay_op("segment_sum")(inputs[0], inputs[1])
-
-    return _impl
-
-
-def _sparse_segment_sum():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 3, "There should be 3 input tensors"
-        data = _op.take(inputs[0], inputs[1], axis=0)
-        return _op.segment_sum(data, inputs[2])
-
-    return _impl
-
-
-def _sparse_segment_sum_with_num_segments():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 4, "There should be 4 input tensors"
-        data = _op.take(inputs[0], inputs[1], axis=0)
-        num_segments = int(inputs[3].data.numpy().item())
-        return _op.segment_sum(data, inputs[2], num_segments)
-
-    return _impl
-
-
-def row_wise_divide(multi_dim_tensor, one_dim_vector):
-    """
-    This function enables row-wise division of multi_dim_tensor and one_dim_vector.
-    To achieve this, it is first tiled to the appropriate shape and then elemwise_division
-    """
-    multi_dim_tensor_offrow_shape = _op.strided_slice(
-        _op.shape_of(multi_dim_tensor, "int32"), [1], [-1], slice_mode="size"
-    )
-    one_dim_vector_tiled_shape = _op.concatenate(
-        [_op.reverse(multi_dim_tensor_offrow_shape, 0), _expr.const([1])], axis=0
-    )
-    one_dim_vector_tiled = _op.transpose(_op.tile(one_dim_vector, one_dim_vector_tiled_shape))
-    return _op.divide(multi_dim_tensor, one_dim_vector_tiled)
-
-
-def count_all_indices(segment_ids, counts_dtype, num_segments=None):
-    """
-    This snippet calculates the sqrt count of each index among all valid indices
-    Valid indices are from 0 to max of [segment ids, num_segments]
-    """
-
-    max_segments = _op.reshape(_op.max(segment_ids), -1) + _expr.const([1])
-    if num_segments:
-        max_segments = _op.maximum(max_segments, _expr.const([num_segments]))
-    max_ones = _op.maximum(max_segments, _op.shape_of(segment_ids))
-    counts = _op.segment_sum(
-        _op.ones(max_ones, counts_dtype), segment_ids, num_segments=num_segments
-    )
-    real_counts = _op.clip(counts, 1, 2147483647)  # Clip max doesn't work over int32
-    return real_counts
-
-
-def _sparse_segment_sum_sqrtn():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 3, "There should be 3 input tensors"
-        data = _op.take(inputs[0], inputs[1], axis=0)
-        real_counts = count_all_indices(inputs[2], attr["T"].name)
-        real_sqrt_counts = _op.sqrt(_op.cast_like(real_counts, data))
-
-        # Calculate regular segment sum
-        segment_sum = _op.segment_sum(data, inputs[2])
-
-        return row_wise_divide(segment_sum, real_sqrt_counts)
-
-    return _impl
-
-
-def _sparse_segment_sum_sqrtn_with_num_segments():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 4, "There should be 4 input tensors"
-        data = _op.take(inputs[0], inputs[1], axis=0)
-        num_segments = int(inputs[3].data.numpy().item())
-        real_counts = count_all_indices(inputs[2], attr["T"].name, num_segments=num_segments)
-        real_sqrt_counts = _op.sqrt(_op.cast_like(real_counts, data))
-
-        # Calculate regular segment sum
-        segment_sum = _op.segment_sum(data, inputs[2], num_segments=num_segments)
-
-        return row_wise_divide(segment_sum, real_sqrt_counts)
-
-    return _impl
-
-
-def _sparse_segment_mean():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 3, "There should be 3 input tensors"
-        data = _op.take(inputs[0], inputs[1], axis=0)
-        real_counts = count_all_indices(inputs[2], attr["T"].name)
-
-        # Calculate regular segment sum
-        segment_sum = _op.segment_sum(data, inputs[2])
-
-        return row_wise_divide(segment_sum, real_counts)
-
-    return _impl
-
-
-def _sparse_segment_mean_with_num_segments():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 4, "There should be 4 input tensors"
-        data = _op.take(inputs[0], inputs[1], axis=0)
-        num_segments = int(inputs[3].data.numpy().item())
-        real_counts = count_all_indices(inputs[2], attr["T"].name, num_segments=num_segments)
-
-        # Calculate regular segment sum
-        segment_sum = _op.segment_sum(data, inputs[2], num_segments=num_segments)
-
-        return row_wise_divide(segment_sum, real_counts)
-
-    return _impl
-
-
-def _sparse_tensor_dense_add():
-    def _impl(inputs, attr, params, mod):
-        # Sparse utility from scipy
-        from scipy.sparse import csr_matrix
-
-        assert (
-            len(inputs) == 4
-        ), "There should be 4 input tensors [sparse_indices, sparse_values, sparse_shape, dense]."
-
-        indices_tensor = _infer_value(inputs[0], params, mod).numpy()
-        values_tensor = _infer_value(inputs[1], params, mod).numpy()
-        dense_shape_tensor = _infer_value(inputs[2], params, mod).numpy()
-
-        data = inputs[3]
-
-        rows = [x[0] for x in indices_tensor]
-        cols = [x[1] for x in indices_tensor]
-
-        # Create scipy sparse Tensor(CSR)
-        weight_sp = csr_matrix(
-            (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist())
-        )
-
-        weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype)
-        weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype)
-        weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype)
-
-        ret = _op.nn.sparse_add(data, [weight_data, weight_indices, weight_indptrs])
-
-        return ret
-
-    return _impl
-
-
-def _identity():
-    def _impl(inputs, attr, params, mod):
-        return inputs[0]
-
-    return _impl
-
-
-def _identityn():
-    def _impl(inputs, attr, params, mod):
-        return inputs
-
-    return _impl
-
-
-def _concatV2():
-    def _impl(inputs, attr, params, mod):
-        pop_node = inputs.pop(len(inputs) - 1)
-        try:
-            axis = int(_get_num_param(params, pop_node))
-        except (IndexError, KeyError, AttributeError):
-            try:
-                axis = int(_infer_value(pop_node, params, mod).numpy())
-            except Exception:
-                axis = int(pop_node)
-        return AttrCvt(op_name="concatenate", ignores=["T", "N", "Tidx"], extras={"axis": axis})(
-            [inputs], attr
-        )
-
-    return _impl
-
-
-def _concat():
-    def _impl(inputs, attr, params, mod):
-        pop_node = inputs.pop(0)
-        axis = int(_get_num_param(params, pop_node))
-        return AttrCvt(op_name="concatenate", ignores=["N"], extras={"axis": axis})([inputs], attr)
-
-    return _impl
-
-
-def _pack():
-    def _impl(inputs, attr, params, mod):
-        axis = int(attr["axis"])
-        inputs_reshaped = [_op.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs]
-        return _op.concatenate(inputs_reshaped, axis)
-
-    return _impl
-
-
-def _tensor_array():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("dtype").name
-        assert not attr["dynamic_size"], "Dynamic size tensor array is " "not supported in TVM yet."
-
-        if "shape" in attr:
-            shape = attr["shape"]
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, shape)
-            static_tensor_array_ops.register()
-            tensor_array_constructor = static_tensor_array_ops.get_global_var("tensor_array")
-            tensor_array = tensor_array_constructor(inputs[0])
-        else:
-            tensor_array_constructor = prelude.get_global_var("tensor_array", dtype_str)
-            tensor_array = tensor_array_constructor(inputs[0])
-        return tensor_array
-
-    return _impl
-
-
-def _tensor_array_scatter():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("T").name
-        input_ta = inputs[0]
-        input_shape = get_tensor_array_shape(input_ta, dtype_str, prelude)
-        values_shape = _infer_shape(inputs[2], prelude.mod)
-        input_t_shape = values_shape[1:]
-        indices_shape = _infer_shape(inputs[1], prelude.mod)
-
-        if input_shape is None:
-            values_rank = len(values_shape)
-            unstack_name = f"tensor_array_unstack_tensor{values_rank}"
-            unstack_function = prelude.get_global_var(unstack_name, dtype_str)
-            values = unstack_function(inputs[2])
-            tensor_array_scatter_func = prelude.get_global_var("tensor_array_scatter", dtype_str)
-        else:
-            input_t_shape = _get_more_static_shape(input_t_shape, input_shape)
-            values_shape = (values_shape[0],) + input_t_shape
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_t_shape)
-            static_tensor_array_ops.register()
-            # Register static indices shape
-            if isinstance(indices_shape[0], int):
-                static_tensor_array_ops.define_tensor_array_scatter(indices_shape, True)
-            tensor_array_scatter_func = prelude.get_global_var_static(
-                "tensor_array_scatter", dtype_str, input_t_shape
-            )
-
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, values_shape)
-            static_tensor_array_ops.register()
-            unstack_function = prelude.get_global_var_static(
-                "tensor_array_unstack", dtype_str, values_shape
-            )
-            values = unstack_function(inputs[2])
-        ret = tensor_array_scatter_func(input_ta, inputs[1], values)
-        return ret
-
-    return _impl
-
-
-def _tensor_array_gather():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("dtype").name
-        input_shape = get_tensor_array_shape(inputs[2], dtype_str, prelude)
-        indices_shape = _infer_shape(inputs[1], prelude.mod)
-
-        if input_shape is None:
-            gather_func = prelude.get_var("tensor_array_gather", dtype_str)
-            out = gather_func(inputs[2], inputs[1])
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_shape)
-            static_tensor_array_ops.register()
-
-            if not isinstance(indices_shape[0], int):
-                gather_function = prelude.get_global_var_static(
-                    "tensor_array_gather", dtype_str, input_shape
-                )
-                out_tensor_t = gather_function(inputs[2], inputs[1])
-                out_shape = (indices_shape[0],) + input_shape
-                static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, out_shape)
-                static_tensor_array_ops.register()
-
-                # Output shape is (indices_shape[0],) + input_shape
-                get_data_func = prelude.get_global_var_static(
-                    "tensor_get_data", dtype_str, out_shape
-                )
-                out = get_data_func(out_tensor_t)
-            else:
-                # For fixed length indices, directly generate static shape output
-                read_func = prelude.get_global_var_static(
-                    "tensor_array_read", dtype_str, input_shape
-                )
-                get_data_func = prelude.get_global_var_static(
-                    "tensor_get_data", dtype_str, input_shape
-                )
-                tensor_list = []
-                for i in range(indices_shape[0]):
-                    index = _op.take(inputs[1], tvm.relay.const(i))
-                    out_tensor = get_data_func(read_func(inputs[2], index))
-                    tensor_list.append(_op.expand_dims(out_tensor, axis=0))
-
-                if indices_shape[0] > 1:
-                    out = _op.concatenate(tensor_list, axis=0)
-                else:
-                    out = tensor_list[0]
-
-        return out
-
-    return _impl
-
-
-def _tensor_array_size():
-    def _impl(inputs, attr, params, prelude):
-        return prelude.length(inputs[0])
-
-    return _impl
-
-
-def _tensor_array_write():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("T").name
-        input_ta = inputs[3]
-        input_ta_shape = get_tensor_array_shape(input_ta, dtype_str, prelude)
-        input_t_shape = _infer_shape(inputs[2], prelude.mod)
-        input_rank = len(input_t_shape)
-
-        if input_ta_shape is None:
-            tensor_name = f"tensor{input_rank}"
-            tensor_func = prelude.get_tensor_ctor(tensor_name, dtype_str)
-            v = tensor_func(inputs[2])
-            write_func = prelude.get_global_var("tensor_array_write", dtype_str)
-        else:
-            input_ta_rank = len(input_ta_shape)
-            assert (
-                input_ta_rank == input_rank
-            ), f"Shape rank mismatch: {input_ta_rank} vs {input_rank}"
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_ta_shape)
-            static_tensor_array_ops.register()
-            tensor_func = static_tensor_array_ops.get_ctor("tensor_constructor")
-            v = tensor_func(inputs[2])
-            # Write tensor with more static shape
-            actual_shape = _get_more_static_shape(input_t_shape, input_ta_shape)
-            if actual_shape != input_t_shape:
-                new_shape = []
-                num_any_dim = 0
-                for dim in actual_shape:
-                    if not isinstance(dim, int):
-                        num_any_dim += 1
-                    new_shape.append(dim if isinstance(dim, int) else -1)
-                if num_any_dim <= 1:
-                    v = tensor_func(_op.reshape(inputs[2], new_shape))
-
-            write_func = prelude.get_global_var_static(
-                "tensor_array_write", dtype_str, input_ta_shape
-            )
-
-        return write_func(input_ta, _op.take(inputs[1], tvm.relay.const(0)), v)
-
-    return _impl
-
-
-def _tensor_array_read():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr["dtype"].name
-        input_shape = get_tensor_array_shape(inputs[2], dtype_str, prelude)
-
-        if input_shape is None:
-            read_func = prelude.get_global_var("tensor_array_read", dtype_str)
-            out = read_func(inputs[2], _op.take(inputs[1], tvm.relay.const(0)))
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_shape)
-            static_tensor_array_ops.register()
-            read_func = static_tensor_array_ops.get_global_var("tensor_array_read")
-            out_tensor = read_func(inputs[2], _op.take(inputs[1], tvm.relay.const(0)))
-            get_data_func = static_tensor_array_ops.get_global_var("tensor_get_data")
-            out = get_data_func(out_tensor)
-
-        return out
-
-    return _impl
-
-
-def _tensor_array_split():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr.get("T").name
-        input_ta = inputs[0]
-        input_ta_shape = get_tensor_array_shape(input_ta, dtype_str, prelude)
-        lengths = _op.cast(inputs[2], "int32")
-        lengths_shape = _infer_shape(lengths, prelude.mod)
-        value_shape = _infer_shape(inputs[1], prelude.mod)
-        input_rank = len(value_shape)
-
-        if input_ta_shape is None:
-            tensor_name = f"tensor{input_rank}"
-            tensor_ctor = prelude.get_tensor_ctor(tensor_name, dtype_str)
-            v = tensor_ctor(inputs[1])
-            split_func = prelude.get_global_var("tensor_array_split", dtype_str)
-        else:
-            input_ta_rank = len(input_ta_shape)
-            assert (
-                input_ta_rank == input_rank
-            ), f"Shape rank mismatch: {input_ta_rank} vs {input_rank}"
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_ta_shape)
-            static_tensor_array_ops.register()
-
-            # Check static value/indices shape
-            if isinstance(value_shape[0], int) or isinstance(lengths_shape[0], int):
-                static_tensor_array_ops.define_tensor_array_split(value_shape, lengths_shape, True)
-
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, value_shape)
-            static_tensor_array_ops.register()
-            tensor_ctor = static_tensor_array_ops.get_ctor("tensor_constructor")
-            v = tensor_ctor(inputs[1])
-            split_func = prelude.get_global_var_static(
-                "tensor_array_split", dtype_str, input_ta_shape
-            )
-
-        return split_func(input_ta, v, lengths)
-
-    return _impl
-
-
-def _tensor_array_concat():
-    def _impl(inputs, attr, params, prelude):
-        dtype_str = attr["dtype"].name
-        input_shape = get_tensor_array_shape(inputs[1], dtype_str, prelude)
-
-        if input_shape is None:
-            concat_func = prelude.get_global_var("tensor_array_concat", dtype_str)
-            out = concat_func(inputs[1])
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, input_shape)
-            static_tensor_array_ops.register()
-            concat_func = prelude.get_global_var_static(
-                "tensor_array_concat", dtype_str, input_shape
-            )
-            out_tensor = concat_func(inputs[1])
-            out_shape = (Any(),) + input_shape[1:]
-            static_tensor_array_ops = StaticTensorArrayOps(prelude, dtype_str, out_shape)
-            static_tensor_array_ops.register()
-            get_data_func = prelude.get_global_var_static("tensor_get_data", dtype_str, out_shape)
-            out = get_data_func(out_tensor)
-
-        return out
-
-    return _impl
-
-
-def _tile():
-    def _impl(inputs, attr, params, mod):
-        reps_input = inputs.pop()
-        if isinstance(reps_input, _expr.Call):
-            np_reps = _infer_value(reps_input, params, mod).numpy()
-            reps = [np_reps.flatten()[i] for i in range(np_reps.flatten().shape[0])]
-        else:
-            reps = _get_list_param(params, reps_input, mod)
-        new_input = [inputs.pop(0)]
-
-        return AttrCvt(op_name="tile", extras={"reps": tuple(reps)}, ignores=["Tmultiples"])(
-            new_input, attr
-        )
-
-    return _impl
-
-
-def _slice():
-    def _impl(inputs, attr, params, mod):
-        try:
-            begin = _get_list_param(params, inputs[1], mod)
-        except Exception:
-            # Handle symbolic begin
-            begin = inputs[1]
-        try:
-            size = _get_list_param(params, inputs[2], mod)
-        except Exception:
-            # Handle symbolic size
-            size = inputs[2]
-
-        # Align begin and strides for dynamic shape.
-        data_dim = len(_infer_shape(inputs[0], mod))
-        strides = [1] * data_dim
-        if not isinstance(begin, (_expr.Call, _expr.Var)):
-            for _ in range(len(begin), data_dim):
-                begin.append(0)
-        elif not isinstance(size, (_expr.Call, _expr.Var)):
-            for _ in range(len(size), data_dim):
-                size.append(-1)
-        return _op.strided_slice(
-            inputs[0], begin=begin, end=size, strides=strides, slice_mode="size"
-        )
-
-    return _impl
-
-
-def _reshape():
-    def _impl(inputs, attr, params, mod):
-        pop_node = inputs.pop(1)
-
-        try:
-            shape_arg = _get_tuple_param(params, pop_node)
-        except AttributeError:
-            # Shape operator is already pruned, hence
-            # try to infer shape by precompute prune if possible.
-            try:
-                params_new = _infer_value(pop_node, params, mod)
-                shape_arg = tuple(params_new.numpy().astype("int32").flatten())
-            except Exception:
-                # Deal with symbolic shape case.
-                if isinstance(pop_node, _expr.Call) and "shape_of" in str(pop_node.op.name):
-                    # shape_of is the direct ancestor.
-                    return _op.reshape_like(inputs[0], pop_node.args[0])
-                shape_arg = pop_node
-
-        return AttrCvt(op_name="reshape", extras={"newshape": shape_arg}, ignores=["Tshape"])(
-            inputs, attr
-        )
-
-    return _impl
-
-
-def _depth_to_space():
-    def _impl(inputs, attr, params, mod):
-        block_size = int(attr["block_size"])
-        layout = attr["data_format"].decode("utf-8")
-        return _op.nn.depth_to_space(inputs[0], block_size, layout)
-
-    return _impl
-
-
-def _space_to_depth():
-    def _impl(inputs, attr, params, mod):
-        block_size = int(attr["block_size"])
-        layout = attr["data_format"].decode("utf-8")
-        return _op.nn.space_to_depth(inputs[0], block_size, layout)
-
-    return _impl
-
-
-def _sparse_to_dense():
-    def _impl(inputs, attr, params, mod):
-        sparse_indices = inputs[0]
-        output_shape = inputs[1]
-        sparse_values = inputs[2]
-        default_value = inputs[3]
-
-        return _op.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value)
-
-    return _impl
-
-
-def _bias_add():
-    def _impl(inputs, attr, params, mod):
-        # Must expand for proper broadcasting in NCHW.
-        if "data_format" in attr and attr["data_format"].decode("utf-8") == "NCHW":
-            bias = _op.reshape(inputs[1], newshape=(1, -1, 1, 1))
-        else:
-            bias = inputs[1]
-        return _op.add(inputs[0], bias)
-
-    return _impl
-
-
-def _broadcast_args():
-    def _impl(inputs, attr, params, mod):
-        if isinstance(inputs[0], _expr.Var):
-            s0 = params[inputs[0].name_hint]
-        else:
-            s0 = _infer_value(inputs[0], params, mod)
-        if isinstance(inputs[1], _expr.Var):
-            s1 = params[inputs[1].name_hint]
-        else:
-            s1 = _infer_value(inputs[1], params, mod)
-        s0 = list(s0.numpy().reshape([-1]))
-        s1 = list(s1.numpy().reshape([-1]))
-        s0_size, s1_size = len(s0), len(s1)
-
-        out = deque([])
-        for i in range(1, min(s0_size, s1_size) + 1):
-            if s0[s0_size - i] == s1[s1_size - i]:
-                out.appendleft(s0[s0_size - i])
-            elif s0[s0_size - i] == 1:
-                out.appendleft(s1[s1_size - i])
-            else:
-                assert (
-                    s1[s1_size - i] == 1
-                ), f"Incompatible broadcast type {s0[s0_size - i]} and {s1[s1_size - i]}"
-                out.appendleft(s0[s0_size - i])
-        if s0_size < s1_size:
-            for i in range(s0_size + 1, s1_size + 1):
-                out.appendleft(s1[s1_size - i])
-        if s1_size < s0_size:
-            for i in range(s1_size + 1, s0_size + 1):
-                out.appendleft(s0[s0_size - i])
-        return _expr.const(list(out), attr["T"].name)
-
-    return _impl
-
-
-def _broadcast_to():
-    def _impl(inputs, attr, params, mod):
-        if isinstance(inputs[1], _expr.Var):
-            shape = params[inputs[1].name_hint]
-        else:
-            shape = _infer_value(inputs[1], params, mod)
-        shape = list(shape.numpy().reshape([-1]))
-        return _op.broadcast_to(inputs[0], shape)
-
-    return _impl
-
-
-def _squeeze():
-    def _impl(inputs, attr, params, mod):
-        if len(attr["squeeze_dims"]) == 0:
-            attr["squeeze_dims"] = None
-        return AttrCvt(
-            op_name="squeeze", transforms={"squeeze_dims": "axis"}, ignores=["T", "_cloned"]
-        )(inputs, attr)
-
-    return _impl
-
-
-def _fused_batch_norm():
-    def _impl(inputs, attr, params, mod):
-        # Tensorflow: (data, gamma, beta, moving_mean, moving_variance)
-        # Relay:       (data, gamma, beta, moving_mean, moving_varience)
-        assert len(inputs) == 5
-        axis = 3
-        need_cast = False
-
-        if "data_format" in attr:
-            attr["data_format"] = attr["data_format"].decode("utf-8")
-            if attr["data_format"] == "NCHW":
-                axis = 1
-        if "U" in attr and attr["U"].name != attr["T"].name:
-            need_cast = True
-            inputs[0] = _op.cast(inputs[0], dtype=attr["U"].name)
-        # Check if mean and variance are empty
-        # If so, replace them with Mean and Variance Ops
-        # For run-time calculation
-        moving_mean_shape = [int(n) for n in inputs[3].type_annotation.shape]
-        moving_variance_shape = [int(n) for n in inputs[4].type_annotation.shape]
-        if moving_mean_shape[0] == 0 and moving_variance_shape[0] == 0:
-            inputs[3] = _op.mean(inputs[0], axis=axis, keepdims=False, exclude=True)
-            inputs[4] = _op.variance(inputs[0], axis=axis, keepdims=False, exclude=True)
-        out = AttrCvt(
-            op_name="batch_norm",
-            transforms={"scale_after_normalization": "scale", "variance_epsilon": "epsilon"},
-            extras={"axis": axis},
-            ignores=["data_format", "U", "exponential_avg_factor"],
-            disables=["momentum"],
-        )(inputs, attr)
-
-        if need_cast:
-            out = _expr.TupleGetItem(out.astuple(), 0)
-            out = _op.cast(out, dtype=attr["T"].name)
-        return out
-
-    return _impl
-
-
-def _batch_norm():
-    def _impl(inputs, attr, params, mod):
-        # Rearrange inputs from
-        # (data, moving_mean, moving_variance, beta, gamma)
-        #     to
-        # (data, gamma, beta, moving_mean, moving_var)
-        new_inputs = [inputs[0], inputs[4], inputs[3], inputs[1], inputs[2]]
-
-        axis = 3
-        if "data_format" in attr:
-            attr["data_format"] = attr["data_format"].decode("utf-8")
-            if attr["data_format"] == "NCHW":
-                axis = 1
-
-        return AttrCvt(
-            op_name="batch_norm",
-            transforms={"scale_after_normalization": "scale", "variance_epsilon": "epsilon"},
-            extras={"axis": axis},
-            ignores=["data_format", "exponential_avg_factor"],
-            disables=["momentum"],
-        )(new_inputs, attr)
-
-    return _impl
-
-
-def _relu6():
-    def _impl(inputs, attr, params, mod):
-        return _op.clip(inputs[0], a_min=0, a_max=6)
-
-    return _impl
-
-
-def _shape():
-    def _impl(inputs, attr, params, mod):
-        is_symbolic_shape = False
-        input_shape = _infer_shape(inputs[0], mod)
-        for axis in input_shape:
-            if not isinstance(axis, (int, tvm.tir.IntImm)):
-                is_symbolic_shape = True
-                break
-
-        if is_symbolic_shape:
-            ret = _op.shape_of(inputs[0], dtype=attr["out_type"].name)
-        else:
-            ret = np.array(input_shape, dtype=attr["out_type"].name)
-        return ret
-
-    return _impl
-
-
-def _fill():
-    def _impl(inputs, attr, params, mod):
-        try:
-            output_shape = _infer_value(inputs[0], params, mod).numpy().tolist()
-        except Exception:
-            output_shape = inputs[0]
-
-        return _op.full(inputs[1], output_shape, attr["T"].name)
-
-    return _impl
-
-
-def _lrn():
-    def _impl(inputs, attr, params, mod):
-        attr_new = {}
-        depth_radius = attr.get("depth_radius", 5)
-        size = (depth_radius * 2) + 1
-        attr_new["axis"] = 3  # Fix axis, NHWC format
-        attr_new["size"] = size
-        attr_new["bias"] = attr.get("bias", 1)
-        attr_new["alpha"] = attr.get("alpha", 1) * size
-        attr_new["beta"] = attr.get("beta", 0.5)
-        return AttrCvt(op_name="lrn")(inputs, attr_new)
-
-    return _impl
-
-
-def _sum():
-    def _impl(inputs, attr, params, mod):
-        axis = _get_tuple_param(params, inputs[1])
-        return AttrCvt(
-            op_name="sum",
-            extras={"axis": axis},
-            transforms={"keep_dims": "keepdims"},
-            ignores=["name", "Tidx"],
-        )([inputs[0]], attr)
-
-    return _impl
-
-
-def _reduce(op):
-    def _impl(inputs, attr, params, mod):
-        axis = _get_list_param(params, inputs[1], mod)
-        axis = tuple(axis)
-        if not axis:
-            axis = None
-        return AttrCvt(
-            op_name=op,
-            extras={"axis": axis},
-            transforms={"keep_dims": "keepdims"},
-            ignores=["name", "Tidx"],
-        )([inputs[0]], attr)
-
-    return _impl
-
-
-def _euclidean_norm():
-    def _impl(inputs, attr, params, mod):
-        axis = tuple(_get_list_param(params, inputs[1], mod))
-        keep_dims = bool(attr.get("keep_dims", False))
-        return _op.sqrt(
-            _op.cast(_op.reduce.sum(_op.multiply(inputs[0], inputs[0]), axis, keep_dims), "float32")
-        )
-
-    return _impl
-
-
-def _square():
-    def _impl(inputs, attr, params, mod):
-        return _op.multiply(inputs[0], inputs[0])
-
-    return _impl
-
-
-def _gather():
-    "GatherV2, Gather"
-
-    def _impl(inputs, attr, params, mod):
-        if len(inputs) > 2:
-            axis = _get_num_param(params, inputs.pop(2))
-        else:
-            axis = 0
-        batch_dims = 0
-        if int(attr.get("batch_dims", 0)) != 0:
-            batch_dims = int(attr.get("batch_dims", 0))
-        new_input = inputs[0:2]
-        op_ = AttrCvt(
-            op_name="take",
-            extras={
-                "axis": tvm.tir.const(axis, "int32"),
-                "batch_dims": tvm.tir.const(batch_dims, "int32"),
-            },
-            ignores=["Tindices", "Tparams", "validate_indices", "Taxis", "_class"],
-        )(new_input, attr)
-        return op_
-
-    return _impl
-
-
-def _gather_nd():
-    """GatherNd"""
-
-    def _impl(inputs, attr, params, mod):
-        indices_dims = len(_infer_shape(inputs[1], mod))
-        indices = _op.transpose(inputs[1], axes=[-1] + list(range(indices_dims - 1)))
-        return AttrCvt(op_name="gather_nd", ignores=["Tindices", "Tparams", "Taxis", "_class"])(
-            [inputs[0], indices], attr
-        )
-
-    return _impl
-
-
-def _stridedSlice():
-    def _impl(inputs, attr, params, mod):
-        """Strided Slice.
-        Operator description: https://www.tensorflow.org/api_docs/python/tf/strided_slice
-        Tensorflow mask validation: https://github.com/tensorflow/tensorflow/blob/master/
-        tensorflow/core/util/strided_slice_op.cc#L147-L368
-        """
-        begin = _get_list_param(params, inputs[1], mod)
-        end = _get_list_param(params, inputs[2], mod)
-        stride = _get_list_param(params, inputs[3], mod)
-
-        begin_mask = int(attr.get("begin_mask", 0))
-        end_mask = int(attr.get("end_mask", 0))
-        ellipsis_mask = int(attr.get("ellipsis_mask", 0))
-        new_axis_mask = int(attr.get("new_axis_mask", 0))
-        shrink_axis_mask = int(attr.get("shrink_axis_mask", 0))
-        in_type = _infer_type(inputs[0], mod)
-        data_shape = get_const_tuple(in_type.checked_type.shape)
-        data_dim = len(data_shape)
-        stride_dim = len(stride)
-        if data_dim == 0 and isinstance(inputs[0], _expr.Constant):
-            new_data = inputs[0].data.numpy().reshape(1)
-            return _expr.const(new_data, inputs[0].data.dtype)
-
-        # This is a special routine to handle strided_slice after shape_of.
-        # We need this since in some cases we want to do strided_slice on
-        # a partial symbolic shape, such as (1, ?), and get a static shape
-        # (1,). Directly slice on shape_of will result in fully dynamic shape.
-        # TODO(kevinthesun): Can we generalize this process with partial eval?
-        if isinstance(inputs[0], _expr.Call) and inputs[0].op == _op.get("shape_of"):
-            bg = begin[0]
-            ed = end[0]
-            st = stride[0]
-
-            if ed <= 0 < st:
-                ed += data_shape[0]
-
-            in_shape = _infer_shape(inputs[0].args[0], mod)
-            dtype = in_type.checked_type.dtype
-            out_data = []
-            idx = bg
-            while idx < ed:
-                if isinstance(in_shape[idx], int):
-                    out_data.append(in_shape[idx])
-                else:
-                    break
-                idx += st
-
-            # Only return when in_shape is fully static in the range from begin to end.
-            if idx >= ed:
-                ret = _expr.const(out_data, dtype)
-                if shrink_axis_mask:
-                    ret = _op.squeeze(ret)
-
-                return ret
-
-        def _transform_mask(stride_dim, ellipsis_mask):
-            """Handle mask inputs to create new begin, end, stride and output shape"""
-            m_begin = [0] * data_dim
-            m_end = [0] * data_dim
-            m_stride = [0] * data_dim
-            fshape_indices = []
-            # Count new axis after ellipsis_mask, consider while applying ellipsis_mask.
-            ellipsis_seen = False
-            new_axes_after_ellipsis = 0
-            for i in range(stride_dim):
-                mask = 1 << i
-                if ellipsis_seen and (mask & new_axis_mask) != 0:
-                    new_axes_after_ellipsis += 1
-                if (mask & ellipsis_mask) != 0:
-                    ellipsis_seen = True
-            if not ellipsis_seen:
-                # Used later for extending the stride attributes in the below loop.
-                ellipsis_mask |= 1 << stride_dim
-                stride_dim += 1
-            final_index = 0
-            for index in range(stride_dim):
-                mask = 1 << index
-                if mask & ellipsis_mask:
-                    # Identify the end index for applying ellipsis_mask
-                    to_index = min(
-                        ((data_dim - (stride_dim - index)) + 1 + new_axes_after_ellipsis), data_dim
-                    )
-                    for i in range(final_index, to_index):
-                        m_begin[final_index] = 0
-                        m_end[final_index] = data_shape[final_index]
-                        m_stride[final_index] = 1
-                        fshape_indices.append(final_index)
-                        final_index += 1
-                elif mask & new_axis_mask:
-                    fshape_indices.append(-1)
-                elif not mask & new_axis_mask:
-                    if final_index == len(m_begin):
-                        break
-                    if mask & begin_mask:
-                        m_begin[final_index] = -1 if stride[index] < 0 else 0
-                    elif begin[index]:
-                        m_begin[final_index] = begin[index]
-                    if mask & end_mask:
-                        m_end[final_index] = (
-                            -(data_shape[final_index] + 1)
-                            if stride[index] < 0
-                            else data_shape[final_index]
-                        )
-                    elif end[index]:
-                        m_end[final_index] = end[index]
-                    m_stride[final_index] = stride[index]
-                    if mask & shrink_axis_mask:
-                        # Tensorflow make axis with shrink_axis_mask as dimension 1
-                        m_begin[final_index] = (
-                            data_shape[final_index] + begin[index]
-                            if begin[index] < 0
-                            else begin[index]
-                        )
-                        m_end[final_index] = m_begin[final_index] + 1
-                        m_stride[final_index] = 1
-                        fshape_indices.append(-2)
-                    else:
-                        fshape_indices.append(final_index)
-
-                    final_index += 1
-            return m_begin, m_end, m_stride, fshape_indices
-
-        fshape_indices = None
-        if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask:
-            begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask)
-        out = _op.strided_slice(inputs[0], begin=begin, end=end, strides=stride)
-        out_shape = _infer_shape(out, mod=mod)
-        if not fshape_indices:
-            fshape_indices = range(len(out_shape))
-
-        # Create final output shape.
-        final_output = []
-        for gather_index in fshape_indices:
-            if gather_index == -1:
-                final_output.append(1)
-            elif gather_index == -2:
-                pass
-            else:
-                final_output.append(out_shape[gather_index])
-
-        if not final_output:
-            if not shrink_axis_mask:
-                ret = out
-            else:
-                final_shape = []
-                for dim in out_shape:
-                    if dim != 1:
-                        final_shape.append(dim)
-                if len(final_shape) == 0:
-                    ret = _op.squeeze(out)
-                else:
-                    # We need reshape to handle dynamic shape.
-                    ret = _op.reshape(out, newshape=tuple(final_shape))
-        else:
-            ret = _op.reshape(out, newshape=tuple(final_output))
-        return ret
-
-    return _impl
-
-
-def _pad(name):
-    def _impl(inputs, attr, params, mod):
-        try:
-            padlist = _get_param(params, inputs[1])
-        except (IndexError, KeyError, AttributeError):
-            try:
-                padlist = _infer_value(inputs[1], params, mod).numpy().tolist()
-            except Exception:
-                padlist = inputs[1]
-
-        if isinstance(padlist, _expr.Expr):
-            paddings = padlist
-        else:
-            paddings = tuple(tuple(l) for l in padlist)
-        attr["pad_width"] = paddings
-        attr["pad_value"] = 0
-        new_inputs = [inputs[0]]
-        if name == "PadV2":
-            try:
-                attr["pad_value"] = _get_num_param(params, inputs[2])
-            except (IndexError, KeyError, AttributeError):
-                attr["pad_value"] = inputs[2]
-        return AttrCvt(op_name="pad", ignores=["Tpaddings"])(new_inputs, attr)
-
-    return _impl
-
-
-def _mirror_pad():
-    def _impl(inputs, attr, params, mod):
-        padlist = _get_param(params, inputs[1])
-        paddings = tuple(tuple(l) for l in padlist)
-        attr["pad_width"] = paddings
-        mode = attr["mode"].decode("utf-8")
-        attr["mode"] = mode
-        new_inputs = [inputs[0]]
-        return AttrCvt(op_name="mirror_pad", ignores=["Tpaddings"])(new_inputs, attr)
-
-    return _impl
-
-
-def _transpose():
-    def _impl(inputs, attr, params, mod):
-        # If perm is not specified, axes is left empty,
-        # otherwise its value is get from params
-        axes = _get_list_param(params, inputs[1], mod)
-        return _op.transpose(inputs[0], axes=axes)
-
-    return _impl
-
-
-def _where():
-    def _impl(inputs, attr, params, mod):
-        if len(inputs) == 1:
-            return AttrCvt(op_name="argwhere")(inputs, attr)
-        cond_shape = _infer_shape(inputs[0], mod)
-        x_shape = _infer_shape(inputs[1], mod)
-        # Due to difference in broadcast behavior between Select and SelectV2,
-        # we adjust condition dimension with expand_dim and then broadcast.
-        if len(cond_shape) == 1 and cond_shape[0] == x_shape[0]:
-            for _ in range(len(x_shape) - 1):
-                inputs[0] = _op.expand_dims(inputs[0], axis=-1)
-            broadcast_cond = _op.broadcast_to(inputs[0], x_shape)
-            inputs[0] = _op.cast(broadcast_cond, "bool")
-        return AttrCvt(op_name="where")(inputs, attr)
-
-    return _impl
-
-
-def _where_v2():
-    def _impl(inputs, attr, params, mod):
-        if len(inputs) == 1:
-            return AttrCvt(op_name="argwhere")(inputs, attr)
-        return AttrCvt(op_name="where")(inputs, attr)
-
-    return _impl
-
-
-def _clip_by_value():
-    def _impl(inputs, attr, params, mod):
-        a_min = _get_num_param(params, inputs[1])
-        a_max = _get_num_param(params, inputs[2])
-        return _op.clip(inputs[0], a_min=a_min, a_max=a_max)
-
-    return _impl
-
-
-def _reverse_v2():
-    def _impl(inputs, attr, params, mod):
-        axis = _get_num_param(params, inputs[1])
-        return AttrCvt(op_name="reverse", ignores=["Tidx"], extras={"axis": int(axis)})(
-            [inputs[0]], attr
-        )
-
-    return _impl
-
-
-def _rank():
-    def _impl(inputs, attr, params, mod):
-        input_shape = _infer_shape(inputs[0], mod)
-
-        name = attr["_node_name"]
-        params[name] = tvm.nd.array(np.array([len(input_shape)]).astype("int32"))
-        return [_expr.var(name, shape=params[name].shape, dtype="int32")]
-
-    return _impl
-
-
-def _range():
-    def _impl(inputs, attr, params, mod):
-        try:
-            start = _get_param(params, inputs[0])[0]
-        except (IndexError, KeyError, AttributeError):
-            try:
-                start = _infer_value(inputs[1], params, mod).numpy().tolist()
-                start = start if not isinstance(start, list) else start[0]
-            except Exception:
-                # Symbolic start
-                start = inputs[0]
-
-        try:
-            limit = (
-                _get_param(params, inputs[1])[0]
-                if hasattr(inputs[1], "name_hint") or isinstance(inputs[1], _expr.Constant)
-                else params.pop("Rank").numpy()[0]
-            )
-        except (IndexError, KeyError, AttributeError):
-            try:
-                limit = _infer_value(inputs[1], params, mod).numpy().tolist()
-                limit = limit if not isinstance(limit, list) else limit[0]
-            except Exception:
-                limit = inputs[1]
-
-        try:
-            delta = _get_param(params, inputs[2])[0]
-        except (IndexError, KeyError, AttributeError):
-            try:
-                delta = _infer_value(inputs[2], params, mod).numpy().tolist()
-                delta = delta if not isinstance(delta, list) else delta[0]
-            except Exception:
-                # Symbolic delta
-                delta = inputs[2]
-
-        # if all attributes are constant, evalute the range function and return relay.const
-        dtype = attr["Tidx"].name if "Tidx" in attr else str(start.dtype)
-        if all(
-            [
-                isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)),
-                isinstance(limit, (np.int32, np.int64, int, np.float32, np.float64, float)),
-                isinstance(delta, (np.int32, np.int64, int, np.float32, np.float64, float)),
-            ]
-        ):
-            return tvm.relay.const(list(range(int(start), int(limit), int(delta))), dtype=dtype)
-
-        if isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)):
-            start = _expr.const(start, dtype=dtype)
-        if isinstance(limit, (np.int32, np.int64, int, np.float32, np.float64, float)):
-            limit = _expr.const(limit, dtype=dtype)
-        if isinstance(delta, (np.int32, np.int64, int, np.float32, np.float64, float)):
-            delta = _expr.const(delta, dtype=dtype)
-
-        return AttrCvt(
-            op_name="arange",
-            ignores=["Tidx", "_class"],
-            extras={"start": start, "stop": limit, "step": delta, "dtype": dtype},
-        )([], attr)
-
-    return _impl
-
-
-def _einsum():
-    def _impl(inputs, attr, params, mod):
-        einsum_attr = dict(attr)
-        einsum_attr["equation"] = einsum_attr["equation"].decode("utf-8")
-        return AttrCvt(op_name="einsum", ignores=["N"])([inputs], einsum_attr)
-
-    return _impl
-
-
-def _elu():
-    def _impl(inputs, attr, params, mod):
-        dtype = attr["T"].name
-        alpha = tvm.relay.const(-1.0, dtype)
-        return alpha * _op.nn.relu(tvm.relay.const(1, dtype) - _op.exp(inputs[0])) + _op.nn.relu(
-            inputs[0]
-        )
-
-    return _impl
-
-
-def _selu():
-    def _impl(inputs, attr, params, mod):
-        dtype = attr["T"].name
-        alpha = tvm.relay.const(-1.6732632423543772848170429916717, dtype)
-        gamma = tvm.relay.const(1.0507009873554804934193349852946, dtype)
-        return gamma * (
-            alpha * _op.nn.relu(tvm.relay.const(1, dtype) - _op.exp(inputs[0]))
-            + _op.nn.relu(inputs[0])
-        )
-
-    return _impl
-
-
-def _mean():
-    def _impl(inputs, attr, params, mod):
-        axis = _get_tuple_param(params, inputs[1])
-        return AttrCvt(
-            op_name="mean",
-            ignores=["Tdim", "Tidx"],
-            transforms={"keep_dims": "keepdims"},
-            extras={"axis": axis},
-        )([inputs[0]], attr)
-
-    return _impl
-
-
-def _broadcast(name):
-    def _impl(inputs, attr, params, mod):
-        return AttrCvt(op_name=name, ignores=["name", "incompatible_shape_error", "Tidx"])(
-            inputs, attr
-        )
-
-    return _impl
-
-
-def _split(has_size_vector):
-    # TF documentation https://www.tensorflow.org/api_docs/python/tf/split
-    def _impl(inputs, attr, params, mod):
-        try:
-            # order and number of inputs are different:
-            # if has_size_vector:
-            #     https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/split-v
-            # else:
-            #     https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/split
-
-            # in addition, `axis` and `num_or_size_splits` can be tensors in TensorFlow,
-            # we can only support constants
-            if has_size_vector:
-                input_node_index = 0
-                input_axis_index = 2
-                size_splits = _get_param(params, inputs[1])
-                section_beginnings = np.cumsum(size_splits)[:-1]
-                indices_or_sections = tuple(section_beginnings)
-            else:
-                input_node_index = 1
-                input_axis_index = 0
-                indices_or_sections = attr["num_split"]
-            input_node = inputs[input_node_index]
-            axis_input_value = _get_num_param(params, inputs[input_axis_index])
-        except (IndexError, KeyError, AttributeError):
-            raise TypeError(
-                "Unsupported argument for split: `axis` and `num_or_size_splits` "
-                "should be constants"
-            )
-        return _op.split(
-            input_node, indices_or_sections=indices_or_sections, axis=int(axis_input_value)
-        )
-
-    return _impl
-
-
-def _unpack():
-    def _impl(inputs, attr, params, mod):
-        input_node = inputs[0]
-        axis = attr["axis"]
-        input_shape = _infer_shape(input_node, mod)
-        axis_length = input_shape[axis]
-        if axis_length < 0:
-            raise TypeError("Unstack with unknown axis length")
-        splitted = _op.split(input_node, indices_or_sections=axis_length, axis=axis)
-        axis = [axis]
-        return _expr.TupleWrapper(
-            _expr.Tuple([_op.squeeze(split_item, axis=axis) for split_item in splitted]),
-            len(splitted),
-        )
-
-    return _impl
-
-
-def _softmax():
-    def _impl(inputs, attr, params, mod):
-        return AttrCvt(op_name="softmax", transforms={"axis": ("axis", 1)})([inputs[0]], attr)
-
-    return _impl
-
-
-def _softsign():
-    # op description: https://www.tensorflow.org/api_docs/python/tf/math/softsign
-    def _impl(inputs, attr, params, mod):
-        abs_out = get_relay_op("abs")(inputs[0])
-        add_out = abs_out + tvm.relay.const(1, attr["T"].name)
-        return inputs[0] / add_out
-
-    return _impl
-
-
-def _softplus():
-    # op description: https://www.tensorflow.org/api_docs/python/tf/math/softplus
-    def _impl(inputs, attr, params, mod):
-        exp_out = AttrCvt("exp")(inputs, attr)
-        inputs.append(tvm.relay.const(1, attr["T"].name))
-        rh = tvm.relay.const(1, attr["T"].name)
-        add_out = get_relay_op("add")(exp_out, rh)
-        return get_relay_op("log")(add_out)
-
-    return _impl
-
-
-def _topk():
-    def _impl(inputs, attr, params, mod):
-        k_input = inputs.pop(1)
-        try:
-            k = int(_get_num_param(params, k_input))
-        except (IndexError, KeyError, AttributeError):
-            try:
-                k = int(_infer_value(k_input, params, mod).numpy().tolist())
-            except Exception:
-                k = k_input
-        if isinstance(k, int):
-            if k < 1:
-                raise tvm.error.OpAttributeInvalid(
-                    "Attribute k must be positive in operator TopKV2"
-                )
-            k = _expr.const(k)
-        if attr["sorted"] is False:
-            raise tvm.error.OpAttributeUnImplemented(
-                "Attribute sorted=False is not supported in operator TopKV2"
-            )
-        return AttrCvt(
-            op_name="topk",
-            ignores=["sorted"],
-            extras={"k": k, "is_ascend": False, "dtype": "int32"},
-        )([inputs[0]], attr)
-
-    return _impl
-
-
-def _floordiv():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 2
-        return AttrCvt("floor_divide")(inputs, attr)
-
-    return _impl
-
-
-def _floormod():
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 2
-        return AttrCvt("floor_mod")(inputs, attr)
-
-    return _impl
-
-
-def _logical(name):
-    def _impl(inputs, attr, params, mod):
-        return AttrCvt(op_name=name)(inputs, attr)
-
-    return _impl
-
-
-def _space_to_batch_nd():
-    def _impl(inputs, attr, params, mod):
-        block_shape = _get_list_param(params, inputs[1], mod)
-
-        paddings = _get_list_param(params, inputs[2], mod)
-        paddings = np.squeeze(paddings)
-        if len(paddings.shape) == 1:
-            paddings = np.expand_dims(paddings, axis=0)
-        paddings = paddings.tolist()
-
-        attr["block_shape"] = block_shape
-        attr["paddings"] = paddings
-        out = AttrCvt("space_to_batch_nd", ignores=["Tblock_shape", "Tpaddings"])([inputs[0]], attr)
-
-        return out
-
-    return _impl
-
-
-def _batch_to_space_nd():
-    def _impl(inputs, attr, params, mod):
-        block_shape = _get_list_param(params, inputs[1], mod)
-
-        crops = _get_list_param(params, inputs[2], mod)
-        crops = np.squeeze(crops)
-        if len(crops.shape) == 1:
-            crops = np.expand_dims(crops, axis=0)
-        crops = crops.tolist()
-
-        attr["block_shape"] = block_shape
-        attr["crops"] = crops
-        out = AttrCvt("batch_to_space_nd", ignores=["Tblock_shape", "Tcrops"])([inputs[0]], attr)
-
-        return out
-
-    return _impl
-
-
-def _atan2():
-    def _impl(inputs, attr, params, mod):
-        divide = _elemwise("divide")(inputs, attr, params, mod)
-        return get_relay_op("atan")(divide)
-
-    return _impl
-
-
-def _prod():
-    def _impl(inputs, attr, params, mod):
-        axis = _get_num_param(params, inputs[1])
-        keepdims = attr["keep_dims"]
-        return _op.prod(inputs[0], int(axis), keepdims=keepdims)
-
-    return _impl
-
-
-def _log1p():
-    # op description: https://www.tensorflow.org/api_docs/python/tf/math/log1p
-    def _impl(inputs, attr, params, mod):
-        one = tvm.relay.const(1, attr["T"].name)
-        add_out = get_relay_op("add")(inputs[0], one)
-        return get_relay_op("log")(add_out)
-
-    return _impl
-
-
-def _one_hot():
-    def _impl(inputs, attr, params, mod):
-        depth = int(_get_num_param(params, inputs[1]))
-        dtype = attr["T"].name
-
-        on_value = _get_num_param(params, inputs[2])
-        off_value = _get_num_param(params, inputs[3])
-        new_inputs = [
-            inputs[0],
-            tvm.relay.const(on_value, dtype),
-            tvm.relay.const(off_value, dtype),
-        ]
-        return AttrCvt("one_hot", ignores=["TI"], extras={"depth": depth, "dtype": dtype})(
-            new_inputs, attr
-        )
-
-    return _impl
-
-
-def _squared_difference():
-    def _impl(inputs, attr, params, mod):
-        difference = _op.subtract(inputs[0], inputs[1])
-        return _op.multiply(difference, difference)
-
-    return _impl
-
-
-def _size():
-    def _impl(inputs, attr, params, mod):
-        new_attr = attr
-        new_attr["out_type"] = attr["out_type"].name
-        return AttrCvt("ndarray_size", transforms={"out_type": "dtype"})(inputs, new_attr)
-
-    return _impl
-
-
-def _add_n():
-    def _impl(inputs, attr, params, mod):
-        if not isinstance(inputs, tuple):
-            inputs = list(inputs)
-        assert len(inputs) > 0, "add_n take >=1 inputs, but 0 given."
-        _res = inputs[0]
-        for each in inputs[1:]:
-            _res = _op.add(_res, each)
-        return _res
-
-    return _impl
-
-
-def _LSTMBlockCell():
-    def _impl(inputs, attr, params, mod):
-        """LSTM Block cell.
-        Calculations and return values are described in:
-        https://github.com/tensorflow/tensorflow/blob/
-        r1.8/tensorflow/contrib/rnn/python/ops/lstm_ops.py#L41-L114
-
-        Parameters
-        ----------
-        inputs : relay.Expr
-            Input data
-        in_state_c: list of relay.Expr
-            Cell state input values for all the layers
-        in_state_h: list of relay.Expr
-            Hidden state input values for all the layers
-        attrs : dict
-            Dict of operator attributes
-        params : dict
-            List of pretrained weights and bias
-
-        Returns
-        -------
-        relay.Expr.TupleWapper
-            [i, cs, f, o, ci, co, h]
-        """
-        in_data = inputs[0]
-        in_state_c = inputs[1]
-        in_state_h = inputs[2]
-        in_weight = inputs[3]
-        in_bias = inputs[7]
-        forget_bias = attr.pop("forget_bias")
-        input_shape = _infer_shape(inputs[0], mod)
-        weight_shape = _infer_shape(inputs[3], mod)
-        batch_size, input_size = input_shape[0], input_shape[1]
-        num_hidden_layers = weight_shape[1]
-
-        in_data = _op.reshape(in_data, newshape=(batch_size, input_size))
-        ixh = _op.concatenate([in_data, in_state_h], axis=1)
-        in_weight = _op.transpose(in_weight, axes=None)
-        gates = _op.nn.dense(ixh, in_weight, units=num_hidden_layers)
-        gates_bias = _op.add(gates, in_bias)
-        gate_list = _op.split(gates_bias, indices_or_sections=4, axis=1)
-        in_gate = _op.sigmoid(gate_list[0])
-        in_transform = _op.tanh(gate_list[1])
-        forget_gate = _op.add(gate_list[2], tvm.relay.const(forget_bias, attr["T"].name))
-        forget_gate = _op.sigmoid(forget_gate)
-        out_gate = _op.sigmoid(gate_list[3])
-        next_c = _op.add(_op.multiply(forget_gate, in_state_c), _op.multiply(in_gate, in_transform))
-        co = _op.tanh(next_c)
-        next_h = out_gate * co
-
-        return tvm.relay.TupleWrapper(
-            tvm.relay.Tuple([in_gate, next_c, forget_gate, out_gate, in_transform, co, next_h]), 7
-        )
-
-    return _impl
-
-
-def _unique(return_counts=True):
-    def _impl(inputs, attr, params, mod):
-        assert len(inputs) == 1
-        data = inputs[0]
-        if return_counts:
-            [unique, _, inverse_indices, num_uniq, counts] = _op.unique(
-                data, is_sorted=False, return_counts=True
-            )
-            unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
-            counts_sliced = _op.strided_slice(counts, begin=[0], end=num_uniq, slice_mode="size")
-            return _expr.TupleWrapper(
-                _expr.Tuple([unique_sliced, inverse_indices, counts_sliced]), 3
-            )
-        [unique, _, inverse_indices, num_uniq] = _op.unique(
-            data, is_sorted=False, return_counts=False
-        )
-        unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
-        return _expr.TupleWrapper(_expr.Tuple([unique_sliced, inverse_indices]), 2)
-
-    return _impl
-
-
-def _bincount():
-    def _impl(inputs, attr, params, mod):
-        input = inputs[0]  # arr: int32 Tensor
-        size = inputs[1]  # size: non-negative int scalar Tensor
-        # weights: int32, int64, float32, or float64 Tensor with the same shape as arr
-        # or a length-0 Tensor, in which case it acts as all weights equal to 1.
-        weights = inputs[2]
-        # Returns: Output: 1D Tensor with length equal to size
-        # The counts or summed weights for each value in the range [0, size).
-
-        input_shape = _infer_shape(input, mod)
-        if len(input_shape) > 1:
-            input = _op.reshape(input, [-1])
-
-        is_weights_zero_tensor = True
-        if weights:
-            weights_shape = _infer_shape(weights, mod)
-            is_weights_zero_tensor = weights_shape == (0,)
-            if len(weights_shape) > 1:
-                weights = _op.reshape(weights, [-1])
-
-        # Output should have the same dtype as weights.
-        if is_weights_zero_tensor:
-            # if weights are length-0 Tensor - output dtype is float32
-            out_dtype = "float32"
-            updates = _op.cast(_op.ones_like(input), out_dtype)
-        else:
-            out_dtype = _infer_type(weights, mod).checked_type.dtype
-            updates = weights
-
-        counts_shape = _op.reshape(size, [1])
-        counts = _op.zeros(counts_shape, out_dtype)
-        out = _op.scatter_elements(counts, input, updates, axis=0, reduction="add")
-        return out
-
-    return _impl
-
-
-def _dense_bincount():
-    def _impl(inputs, attr, params, mod):
-        input = inputs[0]  # input: int32, int64. 1D or 2D int Tensor
-        size = inputs[1]  # size: non-negative int scalar Tensor
-        # weights: int32, int64, float32, or float64 Tensor with the same shape as input
-        # or a length-0 Tensor, in which case it acts as all weights equal to 1.
-        weights = inputs[2]
-        # Returns: Output: 1D Tensor with length equal to size
-        # or 2D Tensor with [batch_size, size].
-        # The counts or summed weights for each value in the range [0, size).
-
-        input_dtype = _infer_type(input, mod).checked_type.dtype
-        input_shape = _infer_shape(input, mod)
-        is_2d_input = len(input_shape) == 2
-
-        if input_dtype == "int64":
-            warnings.warn(
-                "Casting an int64 input to int32, since we do not have int64 atomic add"
-                "needed for bincount yet."
-            )
-            input = _op.cast(input, "int32")
-
-        is_weights_zero_tensor = True
-        if weights:
-            weights_shape = _infer_shape(weights, mod)
-            is_weights_zero_tensor = weights_shape == (0,)
-
-        # Output should have the same dtype as weights.
-        if is_weights_zero_tensor:
-            # if weights are length-0 Tensor - output dtype is float32
-            out_dtype = "float32"
-            updates = _op.cast(_op.ones_like(input), out_dtype)
-        else:
-            out_dtype = _infer_type(weights, mod).checked_type.dtype
-            updates = weights
-
-        if is_2d_input:
-            batch_arr = _op.take(_op.shape_of(input), _expr.const([0]))
-            size_arr = _op.reshape(size, [1])
-            counts_shape = _op.concatenate([batch_arr, size_arr], axis=0)
-            counts = _op.zeros(counts_shape, out_dtype)
-            out = _op.scatter_elements(counts, input, updates, axis=1, reduction="add")
-        else:
-            counts_shape = _op.reshape(size, [1])
-            counts = _op.zeros(counts_shape, out_dtype)
-            out = _op.scatter_elements(counts, input, updates, axis=0, reduction="add")
-
-        if attr["binary_output"]:
-            out = _op.cast(_op.cast(out, "bool"), out_dtype)
-        return out
-
-    return _impl
-
-
-# _convert_map defines maps of name to converter functor(callable)
-# for 1 to 1 mapping, use Renamer if nothing but name is different
-# use AttrCvt if attributes need to be converted
-# for 1 to N mapping(composed), use custom callable functions
-# for N to 1 mapping, currently not supported(?)
-_convert_map = {
-    "Abs": AttrCvt("abs"),
-    "Acos": AttrCvt("acos"),
-    "Acosh": AttrCvt("acosh"),
-    "Add": _elemwise("add"),
-    "AddN": _add_n(),
-    "AddV2": _elemwise("add"),
-    "All": _reduce("all"),
-    "Any": _reduce("any"),
-    "ArgMax": _argx(_op.argmax, "argmax"),
-    "ArgMin": _argx(_op.argmin, "argmin"),
-    "Asin": AttrCvt("asin"),
-    "Asinh": AttrCvt("asinh"),
-    "Assert": _assert(),
-    "Atan": AttrCvt("atan"),
-    "Atanh": AttrCvt("atanh"),
-    "Atan2": _atan2(),
-    "AvgPool": _pooling("avg_pool"),
-    "AvgPool3D": _pool3d("avg_pool3d"),
-    "BatchMatMul": _batch_matmul(),
-    "BatchMatMulV2": _batch_matmul(),
-    "BatchNormWithGlobalNormalization": _batch_norm(),
-    "BatchToSpaceND": _batch_to_space_nd(),
-    "BiasAdd": _bias_add(),
-    "Bincount": _bincount(),
-    "BroadcastTo": _broadcast_to(),
-    "BroadcastArgs": _broadcast_args(),
-    "Cast": _cast(),
-    "Ceil": AttrCvt("ceil"),
-    "CheckNumerics": _check_numerics(),
-    "ClipByValue": _clip_by_value(),
-    "Concat": _concat(),
-    "ConcatV2": _concatV2(),
-    "Conv2D": _conv("conv"),
-    "Conv2DBackpropInput": _conv("conv_transpose"),
-    "Conv3D": _conv3d("conv"),
-    "Conv3DBackpropInputV2": _conv3d("conv_transpose"),
-    "Cos": AttrCvt("cos"),
-    "Cosh": AttrCvt("cosh"),
-    "CropAndResize": _crop_and_resize(),
-    "DecodeJpeg": _decode_image(),
-    "DenseBincount": _dense_bincount(),
-    "DepthToSpace": _depth_to_space(),
-    "DepthwiseConv2dNative": _conv("depthwise"),
-    "Dilation2D": _dilation2d(),
-    "Einsum": _einsum(),
-    "Elu": _elu(),
-    "Equal": _broadcast("equal"),
-    "Erf": AttrCvt("erf"),
-    "EuclideanNorm": _euclidean_norm(),
-    "Exp": AttrCvt("exp"),
-    "ExpandDims": _expand_dims(),
-    "Expm1": _expm1(),
-    "Fill": _fill(),
-    "Floor": AttrCvt("floor"),
-    "FloorDiv": _floordiv(),
-    "FloorMod": _floormod(),
-    "FusedBatchNorm": _fused_batch_norm(),
-    "FusedBatchNormV2": _fused_batch_norm(),
-    "FusedBatchNormV3": _fused_batch_norm(),
-    "Gather": _gather(),
-    "GatherNd": _gather_nd(),
-    "GatherV2": _gather(),
-    "Greater": _broadcast("greater"),
-    "GreaterEqual": _broadcast("greater_equal"),
-    "Identity": _identity(),
-    "IdentityN": _identityn(),
-    "InvertPermutation": AttrCvt("invert_permutation"),
-    "IsFinite": AttrCvt("isfinite"),
-    "IsInf": AttrCvt("isinf"),
-    "IsNan": AttrCvt("isnan"),
-    "LeakyRelu": AttrCvt("leaky_relu"),
-    "LeftShift": AttrCvt("left_shift"),
-    "Less": _broadcast("less"),
-    "LessEqual": _broadcast("less_equal"),
-    "Log": AttrCvt("log"),
-    "Log1p": _log1p(),
-    "LogicalAnd": _logical("logical_and"),
-    "LogicalNot": _logical("logical_not"),
-    "LogicalOr": _logical("logical_or"),
-    "LogSoftmax": AttrCvt("log_softmax"),
-    "LRN": _lrn(),
-    "LSTMBlockCell": _LSTMBlockCell(),
-    "MatMul": _matmul(),
-    "Max": _reduce("max"),
-    "Maximum": _elemwise("maximum"),
-    "MaxPool": _pooling("max_pool"),
-    "MaxPool3D": _pool3d("max_pool3d"),
-    "Mean": _mean(),
-    "Min": _reduce("min"),
-    "Minimum": _elemwise("minimum"),
-    "MirrorPad": _mirror_pad(),
-    "Mod": _elemwise("mod"),
-    "Mul": _elemwise("multiply"),
-    "Neg": AttrCvt("negative"),
-    "NonMaxSuppressionV2": _nms(),
-    "NonMaxSuppressionV3": _nms(),
-    "NonMaxSuppressionV4": _nms(),
-    "NonMaxSuppressionV5": _nms(True),
-    "CombinedNonMaxSuppression": _combined_nms(),
-    "NoOp": _no_op(),
-    "NotEqual": _broadcast("not_equal"),
-    "OneHot": _one_hot(),
-    "Pack": _pack(),
-    "Pad": _pad("Pad"),
-    "PadV2": _pad("PadV2"),
-    "Pow": _elemwise("power"),
-    "Prod": _prod(),
-    "Range": _range(),
-    "Rank": _rank(),
-    "RealDiv": _elemwise("divide"),
-    "Relu": AttrCvt("relu"),
-    "Relu6": _relu6(),
-    "Reshape": _reshape(),
-    "ResizeBicubic": _resize("cubic"),
-    "ResizeBilinear": _resize("linear"),
-    "ResizeNearestNeighbor": _resize("nearest_neighbor"),
-    "ReverseV2": _reverse_v2(),
-    "RightShift": AttrCvt("right_shift"),
-    "Rint": AttrCvt("round"),
-    "Round": AttrCvt("round"),
-    "Rsqrt": _rsqrt(),
-    "Select": _where(),
-    "SelectV2": _where_v2(),
-    "Selu": _selu(),
-    "Shape": _shape(),
-    "Sigmoid": AttrCvt("sigmoid"),
-    "Sign": AttrCvt("sign"),
-    "Sin": AttrCvt("sin"),
-    "Sinh": AttrCvt("sinh"),
-    "Size": _size(),
-    "Slice": _slice(),
-    "Softmax": _softmax(),
-    "Softplus": _softplus(),
-    "Softsign": _softsign(),
-    "SpaceToBatchND": _space_to_batch_nd(),
-    "SpaceToDepth": _space_to_depth(),
-    "SparseToDense": _sparse_to_dense(),
-    "SparseTensorDenseMatMul": _sparse_tensor_dense_matmul(),
-    "SparseFillEmptyRows": _sparse_fill_empty_rows(),
-    "SparseReshape": _sparse_reshape(),
-    "SegmentSum": _math_segment_sum(),
-    "SparseSegmentSum": _sparse_segment_sum(),
-    "SparseSegmentSumWithNumSegments": _sparse_segment_sum_with_num_segments(),
-    "SparseSegmentSqrtN": _sparse_segment_sum_sqrtn(),
-    "SparseSegmentSqrtNWithNumSegments": _sparse_segment_sum_sqrtn_with_num_segments(),
-    "SparseSegmentMean": _sparse_segment_mean(),
-    "SparseSegmentMeanWithNumSegments": _sparse_segment_mean_with_num_segments(),
-    "SparseTensorDenseAdd": _sparse_tensor_dense_add(),
-    "Split": _split(False),
-    "SplitV": _split(True),
-    "Sqrt": AttrCvt("sqrt"),
-    "Square": _square(),
-    "SquaredDifference": _squared_difference(),
-    "Squeeze": _squeeze(),
-    "StopGradient": _identity(),
-    "StridedSlice": _stridedSlice(),
-    "Sub": _elemwise("subtract"),
-    "Sum": _sum(),
-    "Tan": AttrCvt("tan"),
-    "Tanh": AttrCvt("tanh"),
-    "TensorArrayConcatV3": _tensor_array_concat(),
-    "TensorArrayGatherV3": _tensor_array_gather(),
-    "TensorArrayReadV3": _tensor_array_read(),
-    "TensorArrayScatterV3": _tensor_array_scatter(),
-    "TensorArraySizeV3": _tensor_array_size(),
-    "TensorArraySplitV3": _tensor_array_split(),
-    "TensorArrayV3": _tensor_array(),
-    "TensorArrayWriteV3": _tensor_array_write(),
-    "Tile": _tile(),
-    "TopKV2": _topk(),
-    "Transpose": _transpose(),
-    "TruncateMod": _elemwise("mod"),
-    "Unique": _unique(False),
-    "UniqueWithCounts": _unique(True),
-    "Unpack": _unpack(),
-    "UnravelIndex": _unravel_index(),
-    "Where": _where_v2(),
-    "ZerosLike": AttrCvt("zeros_like"),
-}
diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py
deleted file mode 100644
index b1b10eb81f56..000000000000
--- a/python/tvm/relay/frontend/tensorflow_parser.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""TF: Tensorflow parser"""
-# pylint: disable=import-outside-toplevel, assignment-from-no-return
-
-import os
-from tvm.contrib import utils
-
-
-class TFParser(object):
-    """
-    A Wrapper to handle tensorflow models parsing, TensorFlow is needed
-
-    Parameters
-    ----------
-    model_dir : tensorflow frozen pb file or a directory that contains saved
-    model or checkpoints.
-
-    outputs : List of output tensor names (Optional)
-        Optional output node names. This will be protected for saved model
-        when we do remove training nodes.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        parser = TFParser(model_dir)
-        graphdef = parser.parse()
-    """
-
-    def __init__(self, model_dir, outputs=None):
-        from tensorflow.core.framework import graph_pb2
-
-        self._tmp_dir = utils.tempdir()
-        self._model_dir = model_dir
-        self._graph = graph_pb2.GraphDef()
-        self._outputs = outputs or []
-
-    def _set_graph(self, graph):
-        """Set Graph"""
-        self._graph = graph
-
-    def _get_graph(self):
-        """Get Graph"""
-        return self._graph
-
-    def _load_pb_file(self):
-        """Load single pb file"""
-        graph = self._get_graph()
-        with open(self._model_dir, "rb") as f:
-            graph.ParseFromString(f.read())
-        return graph
-
-    def _get_tag_set(self):
-        """Return the tag set of saved model, multiple metagraphs are not supported"""
-        try:
-            from tensorflow.contrib.saved_model.python.saved_model.reader import (
-                get_saved_model_tag_sets,
-            )
-        except ImportError:
-            try:
-                from tensorflow.python.tools.saved_model_utils import get_saved_model_tag_sets
-            except ImportError:
-                raise ImportError(
-                    "InputConfiguration: Unable to import get_saved_model_tag_sets which is "
-                    "required to get tag set from saved model."
-                )
-        tag_sets = get_saved_model_tag_sets(self._model_dir)
-        return tag_sets[0]
-
-    def _get_output_names(self):
-        """Return the concatenated output names"""
-        try:
-            import tensorflow.compat.v1 as tf
-        except ImportError:
-            raise ImportError(
-                "InputConfiguration: Unable to import tensorflow which is "
-                "required to restore from saved model."
-            )
-        tags = self._get_tag_set()
-        output_names = set()
-        with tf.Session() as sess:
-            meta_graph_def = tf.saved_model.loader.load(sess, tags, self._model_dir)
-            for sig_def in meta_graph_def.signature_def.values():
-                for output_tensor in sig_def.outputs.values():
-                    output_names.add(output_tensor.name.replace(":0", ""))
-        tf.reset_default_graph()
-        return ",".join(output_names)
-
-    def _load_saved_model(self):
-        """Load the tensorflow saved model."""
-        try:
-            from tensorflow.python.tools import freeze_graph
-            from tensorflow.python.framework import ops
-            from tensorflow.python.framework import graph_util
-            from tensorflow.core.framework import graph_pb2
-        except ImportError:
-            raise ImportError(
-                "InputConfiguration: Unable to import tensorflow which is "
-                "required to restore from saved model."
-            )
-
-        saved_model_dir = self._model_dir
-        output_graph_filename = self._tmp_dir.relpath("tf_frozen_model.pb")
-        input_saved_model_dir = saved_model_dir
-        output_node_names = self._get_output_names()
-
-        input_binary = False
-        input_saver_def_path = False
-        restore_op_name = None
-        filename_tensor_name = None
-        clear_devices = True
-        input_meta_graph = False
-        checkpoint_path = None
-        input_graph_filename = None
-        saved_model_tags = ",".join(self._get_tag_set())
-
-        freeze_graph.freeze_graph(
-            input_graph_filename,
-            input_saver_def_path,
-            input_binary,
-            checkpoint_path,
-            output_node_names,
-            restore_op_name,
-            filename_tensor_name,
-            output_graph_filename,
-            clear_devices,
-            "",
-            "",
-            "",
-            input_meta_graph,
-            input_saved_model_dir,
-            saved_model_tags,
-        )
-
-        with ops.Graph().as_default():  # pylint: disable=not-context-manager
-            output_graph_def = graph_pb2.GraphDef()
-            with open(output_graph_filename, "rb") as f:
-                output_graph_def.ParseFromString(f.read())
-            output_graph_def = graph_util.remove_training_nodes(
-                output_graph_def, protected_nodes=self._outputs
-            )
-            return output_graph_def
-
-    def _load_ckpt(self):
-        """TODO: Load checkpoint model."""
-        raise RuntimeError(
-            "InputConfiguration: Loading tf checkpoint model is " "not supported yet."
-        )
-
-    def parse(self):
-        """
-        Parse tensorflow models: checkpoints, saved models, and single frozen pb file.
-
-        Returns
-        -------
-        GraphDef of the passed model
-        """
-
-        graph = None
-
-        if os.path.isdir(self._model_dir):
-            ckpt = os.path.join(self._model_dir, "checkpoint")
-            if not os.path.isfile(ckpt):
-                if not os.path.isdir(os.path.join(self._model_dir, "variables")):
-                    raise RuntimeError("InputConfiguration: Invalid model path.")
-                graph = self._load_saved_model()
-            else:
-                graph = self._load_ckpt()
-        elif os.path.isfile(self._model_dir):
-            # Only .pb or .pbtxt is a valid suffix name.
-            if self._model_dir.endswith(".pb") or self._model_dir.endswith(".pbtxt"):
-                cur_dir = os.path.dirname(self._model_dir)
-            else:
-                raise RuntimeError("InputConfiguration: Invalid model format.")
-
-            # It is a saved model if `variables` directory is present at the
-            # same directory with the pb or pbtxt file.
-            if os.path.isdir(os.path.join(cur_dir, "variables")):
-                self._model_dir = cur_dir
-                graph = self._load_saved_model()
-            else:
-                graph = self._load_pb_file()
-        else:
-            raise RuntimeError("InputConfiguration: Unrecognized model " "file or path.")
-
-        self._set_graph(graph)
-        return graph
diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
deleted file mode 100644
index e939895adeae..000000000000
--- a/python/tvm/relay/frontend/tflite.py
+++ /dev/null
@@ -1,4230 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, too-many-lines
-# pylint: disable=import-outside-toplevel, use-list-literal
-"""Tensorflow lite frontend."""
-import itertools
-import math
-
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.ir import IRModule
-from tvm.runtime.name_transforms import sanitize_name
-
-from ... import nd as _nd
-from .. import analysis
-from .. import expr as _expr
-from .. import function as _function
-from .. import op as _op
-from .. import qnn as _qnn
-from .common import ExprTable
-from .common import fold_constant as _fold_constant
-from .common import infer_shape as _infer_shape
-from .common import infer_type as _infer_type
-from .common import lstm_cell, to_int_list, shape_of, try_infer_value
-from .common import set_span
-from .tflite_flexbuffer import FlexBufferDecoder
-
-__all__ = ["from_tflite"]
-
-
-class TensorWrapper(object):
-    """Tensor wrapper for TFLite Tensor"""
-
-    def __init__(self, tensor_idx, tensor, buffer, qnn_params=None):
-        self.tensor_idx = tensor_idx
-        self.tensor = tensor
-        self.buffer = buffer
-        self.qnn_params = qnn_params
-
-
-class OperatorConverter(object):
-    """Operator Converted for converting TFLite ops to Relay ops"""
-
-    def __init__(self, model, subgraph, exp_tab):
-
-        try:
-            from tflite.ActivationFunctionType import ActivationFunctionType
-            from tflite.BuiltinOperator import BuiltinOperator
-            from tflite.BuiltinOptions import BuiltinOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        self.model = model
-        self.subgraph = subgraph
-        self.exp_tab = exp_tab
-        self.builtin_op_code = build_str_map(BuiltinOperator())
-        self.activation_fn_type = build_str_map(ActivationFunctionType())
-        self.builtin_options = build_str_map(BuiltinOptions())
-        self.prefetched_nodes = {}
-        self.allow_custom_ops = False
-
-        # Add more operators
-        self.convert_map = {
-            "ABS": self.convert_abs,
-            "ADD": self.convert_add,
-            "ADD_N": self.convert_add_n,
-            "ARG_MAX": self.convert_arg_max,
-            "ARG_MIN": self.convert_arg_min,
-            "AVERAGE_POOL_2D": self.convert_average_pool2d,
-            "BATCH_TO_SPACE_ND": self.convert_batch_to_space_nd,
-            "BATCH_MATMUL": self.convert_batch_matmul,
-            "CAST": self.convert_cast,
-            "CEIL": self.convert_ceil,
-            "CONCATENATION": self.convert_concatenation,
-            "CONV_2D": self.convert_conv2d,
-            "COS": self.convert_cos,
-            "DENSIFY": self.convert_densify,
-            "DEPTH_TO_SPACE": self.convert_depth_to_space,
-            "DEPTHWISE_CONV_2D": self.convert_depthwise_conv2d,
-            "DEQUANTIZE": self.convert_dequantize,
-            "DETECTION_POSTPROCESS": self.convert_detection_postprocess,
-            "DIV": self.convert_div,
-            "ELU": self.convert_elu,
-            "EQUAL": self.convert_equal,
-            "EXP": self.convert_exp,
-            "EXPAND_DIMS": self.convert_expand_dims,
-            "FAKE_QUANT": self.convert_fake_quant,
-            "FILL": self.convert_fill,
-            "FLOOR_DIV": self.convert_floor_div,
-            "FLOOR_MOD": self.convert_floor_mod,
-            "FLOOR": self.convert_floor,
-            "FULLY_CONNECTED": self.convert_fully_connected,
-            "GATHER": self.convert_gather,
-            "GATHER_ND": self.convert_gather_nd,
-            "GREATER_EQUAL": self.convert_greater_equal,
-            "GREATER": self.convert_greater,
-            "GELU": self.convert_gelu,
-            "HARD_SWISH": self.convert_hard_swish,
-            "L2_NORMALIZATION": self.convert_l2_normalization,
-            "L2_POOL_2D": self.convert_l2_pool2d,
-            "LEAKY_RELU": self.convert_leaky_relu,
-            "LESS_EQUAL": self.convert_less_equal,
-            "LESS": self.convert_less,
-            "LOCAL_RESPONSE_NORMALIZATION": self.convert_lrn,
-            "LOG": self.convert_log,
-            "LOG_SOFTMAX": self.convert_log_softmax,
-            "LOGICAL_AND": self.convert_logical_and,
-            "LOGICAL_NOT": self.convert_logical_not,
-            "LOGICAL_OR": self.convert_logical_or,
-            "LOGISTIC": self.convert_logistic,
-            "MATRIX_DIAG": self.convert_matrix_diag,
-            "MATRIX_SET_DIAG": self.convert_matrix_set_diag,
-            "MAX_POOL_2D": self.convert_max_pool2d,
-            "MAXIMUM": self.convert_maximum,
-            "MEAN": self.convert_reduce_mean,
-            "MINIMUM": self.convert_minimum,
-            "MIRROR_PAD": self.convert_mirror_pad,
-            "MUL": self.convert_mul,
-            "NEG": self.convert_neg,
-            "NOT_EQUAL": self.convert_not_equal,
-            "ONE_HOT": self.convert_one_hot,
-            "PACK": self.convert_pack,
-            "PAD": self.convert_pad,
-            "PADV2": self.convert_pad,
-            "POW": self.convert_pow,
-            "PRELU": self.convert_prelu,
-            "RANGE": self.convert_range,
-            "QUANTIZE": self.convert_quantize,
-            "REDUCE_ANY": self.convert_reduce_any,
-            "REDUCE_MAX": self.convert_reduce_max,
-            "REDUCE_MIN": self.convert_reduce_min,
-            "REDUCE_PROD": self.convert_reduce_prod,
-            "RELU": self.convert_relu,
-            "RELU6": self.convert_relu6,
-            "RELU_N1_TO_1": self.convert_relu_n1_to_1,
-            "RESHAPE": self.convert_reshape,
-            "RESIZE_BILINEAR": self.convert_resize_bilinear,
-            "RESIZE_NEAREST_NEIGHBOR": self.convert_resize_nearest_neighbor,
-            "ROUND": self.convert_round,
-            "RSQRT": self.convert_rsqrt,
-            "REVERSE_SEQUENCE": self.convert_reverse_sequence,
-            "REVERSE_V2": self.convert_reverse_v2,
-            "SELECT": self.convert_select,
-            "SHAPE": self.convert_shape,
-            "SIN": self.convert_sin,
-            "SLICE": self.convert_slice,
-            "SOFTMAX": self.convert_softmax,
-            "SPACE_TO_BATCH_ND": self.convert_space_to_batch_nd,
-            "SPACE_TO_DEPTH": self.convert_space_to_depth,
-            "SPARSE_TO_DENSE": self.convert_sparse_to_dense,
-            "SPLIT": self.convert_split,
-            "SPLIT_V": self.convert_split_v,
-            "SQRT": self.convert_sqrt,
-            "SQUARE": self.convert_square,
-            "SQUARED_DIFFERENCE": self.convert_squared_difference,
-            "SQUEEZE": self.convert_squeeze,
-            "STRIDED_SLICE": self.convert_strided_slice,
-            "SUB": self.convert_sub,
-            "SUM": self.convert_reduce_sum,
-            "TAN": self.convert_tan,
-            "TANH": self.convert_tanh,
-            "TILE": self.convert_tile,
-            "TOPK_V2": self.convert_topk_v2,
-            "TRANSPOSE_CONV": self.convert_transpose_conv,
-            "TRANSPOSE": self.convert_transpose,
-            "UNPACK": self.convert_unpack,
-            "UNIDIRECTIONAL_SEQUENCE_LSTM": self.convert_unidirectional_sequence_lstm,
-            "WHERE": self.convert_select,
-            "ZEROS_LIKE": self.convert_zeros_like,
-            "NON_MAX_SUPPRESSION_V5": self.convert_nms_v5,
-        }
-
-    def check_unsupported_ops(self):
-        """Check unsupported TFLite ops in our converter."""
-        unsupported_ops_set = set()
-        dynamic_range_ops_set = set()
-        for op_idx in range(self.subgraph.OperatorsLength()):
-            op = self.subgraph.Operators(op_idx)
-            op_code_str = self.get_op_code_str(op)
-            if op_code_str not in self.convert_map:
-                unsupported_ops_set.add(op_code_str)
-                continue
-
-            # Trying to exclude "dynamic range quantization" optimized ops as not supported in TVM
-            qnn_in_cnt = len(
-                [_.qnn_params for _ in self.get_input_tensors(op)[0:1] if _.qnn_params is not None]
-            )
-            qnn_weight_cnt = len(
-                [_.qnn_params for _ in self.get_input_tensors(op)[1:] if _.qnn_params is not None]
-            )
-            qnn_out_cnt = len(
-                [_.qnn_params for _ in self.get_output_tensors(op) if _.qnn_params is not None]
-            )
-
-            if qnn_in_cnt == 0 and qnn_out_cnt == 0 and qnn_weight_cnt > 0:
-                dynamic_range_ops_set.add(op_code_str)
-
-        raise_msg = ""
-
-        if unsupported_ops_set:
-            ops = str(list(unsupported_ops_set)).strip("[,]")
-            raise_msg += f"The following operators are not supported in frontend TFLite: {ops}\n"
-
-        if dynamic_range_ops_set:
-            ops = str(list(dynamic_range_ops_set)).strip("[,]")
-            raise_msg += (
-                f"The following operators are likely to have dynamic range quantization: {ops}. "
-                f"If you are running an optimized graph, please turn off dynamic range "
-                f"quantization or use full integer quantization"
-            )
-
-        if len(raise_msg) > 0:
-            raise tvm.error.OpNotImplemented(raise_msg)
-
-    def unbind(self, data, axis=1):
-        """
-        This is a modified version compared to the one in common.py.
-        The onnx version takes a relay.Expr.Call, the tflite
-        version a TensorWrapper. Also this version by default splits
-        along axis 1 and not axis 0 as the onnx version.
-
-         Parameters
-         ----------
-         data : tvm.relay.frontend.tflite.TensorWrapper
-             Input tensor
-         axis : int
-             Axis along which tensor is split.
-         Returns
-         -------
-         result : List[relay.Expr]
-             The sequence of computed tensors
-        """
-        shape = to_int_list(self.get_tensor_shape(data))
-        if axis >= len(shape):
-            msg = "Please check input dim, it shouldn't be greater than or equal to rank."
-            raise AttributeError(msg)
-
-        selections = shape[axis]
-        shape.pop(axis)
-        timestep = 0  # Reshape to make time step as the first dim
-        shape.insert(timestep, selections)
-        res_split = _op.split(
-            _op.reshape(self.get_expr(data.tensor_idx), tuple(shape)), selections, timestep
-        )
-        ret = []
-        for i in range(selections):
-            ret.append(_op.squeeze(res_split[i], axis=[timestep]))
-        return _expr.TupleWrapper(_expr.Tuple(ret), selections)
-
-    def convert_op_to_relay(self):
-        """Convert TFLite ops to relay ops"""
-        for op_idx in range(self.subgraph.OperatorsLength()):
-            op = self.subgraph.Operators(op_idx)
-            op_code_str = self.get_op_code_str(op)
-            output_tensors = self.get_output_tensors(op)
-            try:
-                from tflite.Operator import Operator
-            except ImportError:
-                raise ImportError("The tflite package must be installed")
-
-            assert isinstance(op, Operator)
-            ret = self.convert_map[op_code_str](op)
-
-            # In case the Op can be prefetched, the output can be optimized out
-            if ret is None:
-                continue
-
-            output_names = ", ".join(
-                [get_tensor_name(self.subgraph, tensor.tensor_idx) for tensor in output_tensors]
-            )
-            ret = set_span(ret, f"{output_names}")
-
-            if len(output_tensors) == 1:
-                tensor_idx = output_tensors[0].tensor_idx
-                self.exp_tab.set_expr(get_tensor_name(self.subgraph, tensor_idx), ret)
-            else:
-                for idx, output_tensor in enumerate(output_tensors):
-                    self.exp_tab.set_expr(
-                        get_tensor_name(self.subgraph, output_tensor.tensor_idx), ret[idx]
-                    )
-
-    def get_op_code_str(self, op):
-        """Get TFLite ops string representation"""
-        try:
-            from tflite.BuiltinOperator import BuiltinOperator
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        op_code_list_idx = op.OpcodeIndex()
-
-        op_c = self.model.OperatorCodes(op_code_list_idx)
-        # In TFlite 2.4.x there was a change where the type of the field that contained
-        # the builtin code changed from int8 to int32 in the flat buffer representation.
-        # However, to retain support for old flat buffers that were created, they retained
-        # the original 8 bit field, but named it "deprecated_builtin_code" in TFLite 2.4.
-        # This means that the API function BuiltinCode() which originally returned the value
-        # of the 8 bit field would now look for the value in the new int32 field in the
-        # schema and DeprecatedBuiltinCode() will look at the old 8 bit field.
-        # In TFLite 2.4, if the opcode value is less than 127, it can be in either field
-        # (however, if it is only in the "builtin_code" field, the model is not backward
-        # compatible), so similarly to TFLite 2.4 reader, we'll pick the higher value of the
-        # two fields.
-        # Remember however that this value came into existence only after Tensorflow
-        # lite 2.4.x and hence encase it in a try -except block.
-        # Phew !
-        try:
-            opc = max(op_c.DeprecatedBuiltinCode(), op_c.BuiltinCode())
-        except AttributeError:
-            # In versions before 2.4 the int8 field that holds the builtin code is accessed
-            # by BuiltinCode() and DeprecatedBuiltinCode() doesn't exist
-            opc = op_c.BuiltinCode()
-
-        op_code_id = opc
-        try:
-            op_code_str = self.builtin_op_code[op_code_id]
-        except KeyError:
-            raise NotImplementedError(
-                "TFLite operator with code "
-                + str(op_code_id)
-                + " is not supported by this version of the fbs schema."
-            )
-        if op_code_id == BuiltinOperator.CUSTOM:
-            # Custom operator
-            custom_op_code_str = self.model.OperatorCodes(op_code_list_idx).CustomCode()
-
-            if self.allow_custom_ops:
-                return "CUSTOM"
-
-            if custom_op_code_str == b"TFLite_Detection_PostProcess":
-                return "DETECTION_POSTPROCESS"
-
-            raise NotImplementedError("Custom operators are currently not supported")
-        return op_code_str
-
-    def get_input_tensors(self, op):
-        operator_inputs = op.InputsAsNumpy()
-        return self.get_tensors(operator_inputs)
-
-    def get_output_tensors(self, op):
-        operator_outputs = op.OutputsAsNumpy()
-        return self.get_tensors(operator_outputs)
-
-    def get_tensors(self, tensors_idx_list):
-        """Get tensor wrapper list from given TFLite tensor index list"""
-        return_list = list()
-        for tensor_idx in tensors_idx_list:
-            if tensor_idx < 0:
-                return_list.append(TensorWrapper(tensor_idx, 0, 0))
-                continue
-
-            tensor = self.subgraph.Tensors(tensor_idx)
-            buffer_idx = tensor.Buffer()
-            buffer = self.model.Buffers(buffer_idx)
-
-            # Check if the tensors are quantized. Parse if yes.
-            qnn_params = None
-            tflite_qnn_params = tensor.Quantization()
-            if tflite_qnn_params is not None:
-                # TFLite supports both per-tensor and per-axis (aka channel) quantization.  For
-                # per-tensor quantization, scale and zero points are scalar values.  For per-axis
-                # quantization, scale and zero points for the weights are tensors (activations are
-                # per-tensor quantized). However, the TFLite quantization spec puts restrictions on
-                # zero points for per-axis quantization.  Specifically, the zero point is a tensor
-                # but all values are 0. More information can be found here -
-                # https://www.tensorflow.org/lite/performance/quantization_spec
-
-                tflite_scale = tflite_qnn_params.ScaleAsNumpy()
-                tflite_zero_point = tflite_qnn_params.ZeroPointAsNumpy()
-                is_qnn_params_valid = True
-
-                # Handle Per-axis and per-tensor cases
-                if isinstance(tflite_scale, np.ndarray):
-                    assert isinstance(tflite_zero_point, np.ndarray)
-
-                    # Tensor - Per-axis quantization
-                    if tflite_scale.size != 1 and tflite_zero_point.size != 1:
-                        scale = tflite_scale
-                        # Ensure that all zero points are zeros
-                        zero_point = tflite_zero_point
-                        if not np.all(zero_point == 0):
-                            raise tvm.error.OpAttributeInvalid(
-                                "TFLite per-axis quantization restricts all zero points to be"
-                                + " 0, but a non-zero value is observed"
-                            )
-                        zero_point = int(zero_point[0])
-
-                    # Scalar - Per-tensor quantization
-                    elif tflite_scale.size == 1 and tflite_zero_point.size == 1:
-                        scale = float(tflite_scale[0])
-                        zero_point = int(tflite_zero_point[0])
-
-                    else:
-                        raise NotImplementedError(
-                            f"Quantized type {type(tflite_scale)} (scale) and  "
-                            f"{type(tflite_zero_point)} (zero point) not supported"
-                        )
-                elif tflite_scale == 0 and tflite_zero_point == 0:
-                    # Handle corner case for ops like quantized reshape whose second operand (shape)
-                    # has zero scale and zero zero point. This is not used.
-                    is_qnn_params_valid = False
-                else:
-                    raise NotImplementedError(f"Quantized type {type(tflite_scale)} not supported")
-
-                # Check that the scale and zero points are valid.
-                if is_qnn_params_valid:
-                    qnn_params = dict()
-                    qnn_params["scale"] = relay.const(scale, "float32")
-                    qnn_params["zero_point"] = relay.const(zero_point, "int32")
-            return_list.append(TensorWrapper(tensor_idx, tensor, buffer, qnn_params))
-        return return_list
-
-    def get_tensor_type_as_numpy(self, tensor_wrapper):
-        """Returns np.dtype out of TensorType"""
-        assert isinstance(tensor_wrapper, TensorWrapper)
-
-        try:
-            from tflite.TensorType import TensorType
-
-            return {
-                TensorType.UINT8: np.uint8,
-                TensorType.INT8: np.int8,
-                TensorType.INT16: np.int16,
-                TensorType.FLOAT16: np.float16,
-                TensorType.FLOAT32: np.float32,
-                TensorType.INT32: np.int32,
-                TensorType.INT64: np.int64,
-                TensorType.BOOL: np.bool_,
-            }[tensor_wrapper.tensor.Type()]
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-        except KeyError:
-            raise NotImplementedError(
-                f"Tensor type '{tensor_wrapper.tensor.Type()}' currently not supported"
-            )
-
-    # pylint: disable=no-else-return
-    def get_tensor_value(self, tensor_wrapper, is_sparse=False):
-        """Get tensor buffer value from given tensor wrapper"""
-        assert isinstance(tensor_wrapper, TensorWrapper)
-
-        dtype = self.get_tensor_type_as_numpy(tensor_wrapper)
-        data = tensor_wrapper.buffer.DataAsNumpy()
-
-        if tensor_wrapper.tensor.ShapeLength() != 0:
-            shape = to_int_list(self.get_tensor_shape(tensor_wrapper))
-        else:
-            shape = []
-
-        if is_sparse:
-            return np.frombuffer(data, dtype=dtype)
-        else:
-            return np.frombuffer(data, dtype=dtype).reshape(shape)
-
-    def get_tensor_type_str(self, tensor_type):
-        """Get tensor type string representation when given TFLite tensor type"""
-        try:
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        if tensor_type == TensorType.INT8:
-            return "int8"
-        if tensor_type == TensorType.INT16:
-            return "int16"
-        if tensor_type == TensorType.UINT8:
-            return "uint8"
-        if tensor_type == TensorType.FLOAT16:
-            return "float16"
-        if tensor_type == TensorType.FLOAT32:
-            return "float32"
-        if tensor_type == TensorType.INT32:
-            return "int32"
-        if tensor_type == TensorType.INT64:
-            return "int64"
-        if tensor_type == TensorType.BOOL:
-            return "bool"
-        raise NotImplementedError(f"Tensor type {str(tensor_type)} is currently not supported")
-
-    def flatten_to_nd(self, x, x_shape, nd=3):
-        """Flatten input tensor to nd rank"""
-        ndims = _infer_shape(x_shape)[0]
-        if ndims == nd:
-            return x
-        newshape = _op.concatenate(
-            [
-                _expr.const([-1], dtype=_infer_type(x_shape).checked_type.dtype),
-                _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]),
-            ],
-            0,
-        )
-        out = _op.reshape(x, _fold_constant(newshape))
-        return out
-
-    def has_same_qnn_params(self, lhs_tensor, rhs_tensor):
-        lhs_scale = lhs_tensor.qnn_params["scale"]
-        rhs_scale = rhs_tensor.qnn_params["scale"]
-        lhs_zero_point = lhs_tensor.qnn_params["zero_point"]
-        rhs_zero_point = rhs_tensor.qnn_params["zero_point"]
-        # 0.1 + 0.2 != 0.3
-        return np.allclose(
-            lhs_scale.data.numpy(), rhs_scale.data.numpy(), rtol=1e-5, atol=1e-5
-        ) and np.allclose(
-            lhs_zero_point.data.numpy(), rhs_zero_point.data.numpy(), rtol=1e-5, atol=1e-5
-        )
-
-    def is_quantized(self, op):
-        """Check if an input tensor is quantized."""
-        input_tensors = self.get_input_tensors(op)
-        first_tensor = input_tensors[0]
-        return first_tensor.qnn_params is not None
-
-    def quantize(self, expr, tensor_to_quantize):
-        """Helper function to quantize a tensor with Relay"""
-        tensor_type = tensor_to_quantize.tensor.Type()
-        tensor_type_str = self.get_tensor_type_str(tensor_type)
-        quantized = _qnn.op.quantize(
-            data=expr,
-            output_scale=tensor_to_quantize.qnn_params["scale"],
-            output_zero_point=tensor_to_quantize.qnn_params["zero_point"],
-            out_dtype=tensor_type_str,
-        )
-        return quantized
-
-    def dequantize(self, expr, tensor):
-        """Helper function to dequantize a tensor with Relay"""
-        dequantized = _qnn.op.dequantize(
-            data=expr,
-            input_scale=tensor.qnn_params["scale"],
-            input_zero_point=tensor.qnn_params["zero_point"],
-        )
-        return dequantized
-
-    def convert_qnn_fused_activation_function(
-        self, expr, fused_activation_fn, scale, zero_point, dtype
-    ):
-        """Convert TFLite fused activation function. The expr is an input quantized tensor with
-        scale and zero point"""
-        try:
-            from tflite.ActivationFunctionType import ActivationFunctionType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        # Quantize a float value to an quantized integer value
-        quantize = lambda x: float(int(round(x / scale)) + zero_point)
-
-        # Get min/max of the output dtype. This will be used to ensure that clip a_min/a_max are not
-        # beyond the dtype range.
-        qmin = float(tvm.tir.op.min_value(dtype).value)
-        qmax = float(tvm.tir.op.max_value(dtype).value)
-
-        # The input expr is a quantized tensor with its scale and zero point. We calculate the
-        # suitable clip off points based on these scale and zero point.
-        if fused_activation_fn == ActivationFunctionType.NONE:
-            return expr
-        if fused_activation_fn == ActivationFunctionType.RELU6:
-            return _op.clip(expr, a_min=max(qmin, quantize(0)), a_max=min(qmax, quantize(6.0)))
-        if fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1:
-            return _op.clip(expr, a_min=max(qmin, quantize(-1.0)), a_max=min(qmax, quantize(1.0)))
-        if fused_activation_fn == ActivationFunctionType.RELU:
-            return _op.clip(expr, a_min=max(qmin, quantize(0.0)), a_max=qmax)
-
-        fused_activation_fn_str = self.activation_fn_type[fused_activation_fn]
-        raise tvm.error.OpNotImplemented(
-            f"Quantized activation {fused_activation_fn_str} is not supported yet."
-        )
-
-    def convert_conv2d(self, op):
-        """Convert TFLite conv2d"""
-        return self.convert_conv(op, "conv2d")
-
-    def convert_depthwise_conv2d(self, op):
-        """Convert TFLite depthwise conv2d"""
-        return self.convert_conv(op, "depthwise")
-
-    def convert_average_pool2d(self, op):
-        """Convert TFLite average pool2d"""
-        return self.convert_pool2d(op, "average")
-
-    def convert_max_pool2d(self, op):
-        """Convert TFLite max pool2d"""
-        return self.convert_pool2d(op, "max")
-
-    def convert_l2_pool2d(self, op):
-        """Convert TFLite l2 pool2d"""
-        return self.convert_pool2d(op, "l2")
-
-    def convert_reshape(self, op):
-        """Convert TFLite reshape"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.ReshapeOptions import ReshapeOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) in (1, 2), "input tensors should not be empty"
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "There should be only 1 output tensor"
-
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        if len(input_tensors) == 2:
-            shape_tensor = input_tensors[1]
-            if self.has_expr(shape_tensor.tensor_idx):
-                target_expr = self.get_expr(shape_tensor.tensor_idx)
-                target_value, success = try_infer_value(
-                    target_expr,
-                    parameters={k: _nd.array(np.array(v)) for k, v in self.exp_tab.params.items()},
-                )
-                if success:
-                    # convert to flattened list
-                    from itertools import chain
-
-                    try:
-                        target_shape = list(chain(*target_value))
-                    except TypeError:
-                        target_shape = list(chain(target_value))
-                else:
-                    target_shape = target_expr
-            else:
-                target_shape = self.get_tensor_value(shape_tensor)
-                # convert to flattened list
-                from itertools import chain
-
-                try:
-                    target_shape = list(chain(*target_shape))
-                except TypeError:
-                    target_shape = list(chain(target_shape))
-
-        else:
-            assert op.BuiltinOptionsType() == BuiltinOptions.ReshapeOptions
-            op_options = op.BuiltinOptions()
-            reshape_options = ReshapeOptions()
-            reshape_options.Init(op_options.Bytes, op_options.Pos)
-            target_shape = to_int_list(reshape_options.NewShapeAsNumpy())
-
-        in_expr = self.get_expr(input_tensor_idx)
-
-        # If the tensors are quantized, ensure that input/output qnn params are same.
-
-        input_tensor_type_str = self.get_tensor_type_str(input_tensor.tensor.Type())
-        if input_tensor.qnn_params and input_tensor_type_str == "int8":
-            # TFLite 2.x quantization spec requires qnn params to be same and dtype to be int8.
-            # For TFLite 1.x, dtype can be uint8 and qnn params can be different
-            output_tensor = output_tensors[0]
-            assert self.has_same_qnn_params(
-                input_tensor, output_tensor
-            ), "TFLite reshape requires input and output scale and zero points to be equal"
-
-        out = _op.reshape(in_expr, newshape=target_shape)
-        if input_tensor.qnn_params and input_tensor_type_str == "uint8":
-            output_tensor = output_tensors[0]
-            if not self.has_same_qnn_params(input_tensor, output_tensor):
-                output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-                out = _qnn.op.requantize(
-                    out,
-                    input_scale=input_tensor.qnn_params["scale"],
-                    input_zero_point=input_tensor.qnn_params["zero_point"],
-                    output_scale=output_tensor.qnn_params["scale"],
-                    output_zero_point=output_tensor.qnn_params["zero_point"],
-                    out_dtype=output_tensor_type_str,
-                )
-
-        return out
-
-    def _convert_resize(self, method, op):
-        """Generic method to Convert TFLite RESIZE operators"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.ResizeBilinearOptions import ResizeBilinearOptions
-
-            # ResizeNearestNeighborOptions was added in tflite v1.13
-            tflite_ver = 1120
-            if "ResizeNearestNeighborOptions" in dir(BuiltinOptions):
-                from tflite.ResizeNearestNeighborOptions import ResizeNearestNeighborOptions
-
-                tflite_ver = 1130
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        # images, 4-D Tensor with shape NHWC.
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        # size - 1-D int32 Tensor of 2 elements: new_height, new_width
-        target_size = tuple(self.get_tensor_value(input_tensors[1]))
-
-        # Options - align_corners (bool)
-        resize_options = None
-        align_corners = False
-        bilinear_method = method == "linear"
-        if bilinear_method:
-            assert op.BuiltinOptionsType() == BuiltinOptions.ResizeBilinearOptions
-            resize_options = ResizeBilinearOptions()
-        elif tflite_ver >= 1130:
-            assert op.BuiltinOptionsType() == BuiltinOptions.ResizeNearestNeighborOptions
-            resize_options = ResizeNearestNeighborOptions()
-
-        if resize_options is not None:
-            op_options = op.BuiltinOptions()
-            resize_options.Init(op_options.Bytes, op_options.Pos)
-            align_corners = resize_options.AlignCorners()
-            half_pixel_centers = resize_options.HalfPixelCenters()
-
-        # Use layout NHWC
-        coord_trans = "align_corners" if align_corners else "asymmetric"
-        coord_trans = "half_pixel" if half_pixel_centers else coord_trans
-
-        rounding_method = ""
-        if method == "nearest_neighbor":
-            if not align_corners and half_pixel_centers:
-                rounding_method = "round_prefer_ceil"
-
-        if bilinear_method and input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-        out = _op.image.resize2d(
-            in_expr, target_size, None, "NHWC", method, coord_trans, rounding_method
-        )
-        if bilinear_method and output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-        return out
-
-    def convert_resize_bilinear(self, op):
-        """Convert TFLite RESIZE_BILINEAR"""
-        return self._convert_resize("linear", op)
-
-    def convert_resize_nearest_neighbor(self, op):
-        """Convert TFLite RESIZE_NEAREST_NEIGHBOR"""
-        return self._convert_resize("nearest_neighbor", op)
-
-    def convert_l2_normalization(self, op):
-        """Convert TFLite L2_NORMALIZATION"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.L2NormOptions import L2NormOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.L2NormOptions
-        op_options = op.BuiltinOptions()
-        l2_norm_options = L2NormOptions()
-        l2_norm_options.Init(op_options.Bytes, op_options.Pos)
-        fused_activation_fn = l2_norm_options.FusedActivationFunction()
-
-        # TFLite supports normalization only over the last dim
-        input_tensor_rank = len(input_tensor.tensor.ShapeAsNumpy())
-
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented(
-                "TFLite quantized L2_NORMALIZATION operator is not supported yet."
-            )
-
-        # TFL uses only the default epsilon value
-        out = _op.nn.l2_normalize(in_expr, eps=1e-12, axis=[input_tensor_rank - 1])
-
-        # if we have fused activation fn
-        if output_tensor.qnn_params:
-            raise tvm.error.OpNotImplemented(
-                "TFLite quantized L2_NORMALIZATION operator is not supported yet."
-            )
-        out = self.convert_fused_activation_function(out, fused_activation_fn)
-
-        return out
-
-    def convert_lrn(self, op):
-        """Convert TFLite LOCAL_RESPONSE_NORMALIZATION"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.LocalResponseNormalizationOptions import LocalResponseNormalizationOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented("TFlite quantized LRN operator is not supported yet.")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.LocalResponseNormalizationOptions
-        op_options = op.BuiltinOptions()
-        lrn_options = LocalResponseNormalizationOptions()
-        lrn_options.Init(op_options.Bytes, op_options.Pos)
-        radius = lrn_options.Radius()
-        bias = lrn_options.Bias()
-        alpha = lrn_options.Alpha()
-        beta = lrn_options.Beta()
-        size = (radius * 2) + 1
-        alpha = alpha * size
-        axis = 3  # NHWC format
-        out = _op.nn.lrn(in_expr, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
-
-        return out
-
-    def convert_logistic(self, op):
-        """Convert TFLite LOGISTIC"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-        out = _op.sigmoid(in_expr)
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-
-        return out
-
-    def convert_softmax(self, op):
-        """Convert TFLite softmax"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        params = {"axis": -1}  # -1 is channel
-        in_expr = self.get_expr(input_tensor_idx)
-
-        # TODO - Naive softmax int8 implementation leads to bad accuracy. Currently, we can
-        # dequantize to FP32 and perform softmax on FP32. We can investigate an integer only softmax
-        # implementation in future.
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-
-        out = _op.nn.softmax(in_expr, **params)
-
-        # Go back to integer dataype if the original operator was quantized.
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-
-        return out
-
-    def convert_tanh(self, op):
-        """Convert TFLite TANH"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-        out = _op.tanh(in_expr)
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-        return out
-
-    def convert_range(self, op):
-        """Convert TFLite Range"""
-        try:
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 3, "input tensors length should be 3"
-
-        start, limit, delta = input_tensors[0], input_tensors[1], input_tensors[2]
-
-        expressions = [self.get_tensor_expr(t) for t in [start, limit, delta]]
-
-        # out type inference
-        if delta.tensor.Type() == TensorType.FLOAT32:
-            out_type = self.get_tensor_type_str(delta.tensor.Type())
-        else:
-            out_type = self.get_tensor_type_str(start.tensor.Type())
-
-        out = _op.arange(expressions[0], expressions[1], expressions[2], out_type)
-
-        return out
-
-    def convert_shape(self, op):
-        """Convert TFLite Shape"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.ShapeOptions import ShapeOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.ShapeOptions
-        op_options = op.BuiltinOptions()
-        shape_options = ShapeOptions()
-        shape_options.Init(op_options.Bytes, op_options.Pos)
-
-        out_type = self.get_tensor_type_str(shape_options.OutType())
-        out = shape_of(self.get_tensor_expr(input_tensors[0]), dtype=out_type)
-
-        return out
-
-    def convert_relu(self, op):
-        """Convert TFLite ReLU"""
-        try:
-            from tflite.ActivationFunctionType import ActivationFunctionType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            # Quantize a float value to an quantized integer value
-            scale_val = get_scalar_from_constant(input_tensor.qnn_params["scale"])
-            zero_point_val = get_scalar_from_constant(input_tensor.qnn_params["zero_point"])
-
-            output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-            out = self.convert_qnn_fused_activation_function(
-                expr=in_expr,
-                fused_activation_fn=ActivationFunctionType.RELU,
-                scale=scale_val,
-                zero_point=zero_point_val,
-                dtype=output_tensor_type_str,
-            )
-        else:
-            out = _op.nn.relu(in_expr)
-
-        if output_tensor.qnn_params:
-            output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-            out = _qnn.op.requantize(
-                out,
-                input_scale=input_tensor.qnn_params["scale"],
-                input_zero_point=input_tensor.qnn_params["zero_point"],
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-            )
-
-        return out
-
-    def convert_hard_swish(self, op):
-        """Convert TFLite Hard swish"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        def _relu6(data):
-            return _op.tensor.clip(data, 0.0, 6.0)
-
-        def _hard_swish(data):
-            return data * _relu6(data + relay.const(3.0)) / relay.const(6.0)
-
-        # Dequantize if the input is quantized.
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-
-        # Perform hardswish
-        out = _hard_swish(in_expr)
-
-        # Go back to integer dataype if the original operator was quantized.
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-
-        return out
-
-    def convert_relu6(self, op):
-        """Convert TFLite ReLU6"""
-        try:
-            from tflite.ActivationFunctionType import ActivationFunctionType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            # Quantize a float value to an quantized integer value
-            scale_val = get_scalar_from_constant(input_tensor.qnn_params["scale"])
-            zero_point_val = get_scalar_from_constant(input_tensor.qnn_params["zero_point"])
-
-            output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-            out = self.convert_qnn_fused_activation_function(
-                expr=in_expr,
-                fused_activation_fn=ActivationFunctionType.RELU6,
-                scale=scale_val,
-                zero_point=zero_point_val,
-                dtype=output_tensor_type_str,
-            )
-        else:
-            out = _op.clip(in_expr, a_min=0, a_max=6)
-
-        if output_tensor.qnn_params:
-            output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-            out = _qnn.op.requantize(
-                out,
-                input_scale=input_tensor.qnn_params["scale"],
-                input_zero_point=input_tensor.qnn_params["zero_point"],
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-            )
-
-        return out
-
-    def convert_leaky_relu(self, op):
-        """Convert TFLite LEAKY_RELU"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.LeakyReluOptions import LeakyReluOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.LeakyReluOptions
-        op_options = op.BuiltinOptions()
-        leaky_relu_options = LeakyReluOptions()
-        leaky_relu_options.Init(op_options.Bytes, op_options.Pos)
-        alpha_tensor = leaky_relu_options.Alpha()
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-        out = _op.nn.leaky_relu(in_expr, alpha_tensor)
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-
-        return out
-
-    def convert_relu_n1_to_1(self, op):
-        """Convert TFLite RELU_N1_TO_1"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            # Quantize a float value to an quantized integer value
-            scale_val = get_scalar_from_constant(input_tensor.qnn_params["scale"])
-            zero_point_val = get_scalar_from_constant(input_tensor.qnn_params["zero_point"])
-            quantize = lambda x: float(int(round(x / scale_val)) + zero_point_val)
-
-            # Get min/max of the input dtype. This will be used to ensure that
-            # clip a_min/a_max are not beyond the dtype range.
-            input_tensor_type_str = self.get_tensor_type_str(input_tensor.tensor.Type())
-            qmin = float(tvm.tir.op.min_value(input_tensor_type_str).value)
-            qmax = float(tvm.tir.op.max_value(input_tensor_type_str).value)
-
-            out = _op.clip(in_expr, a_min=max(qmin, quantize(-1.0)), a_max=min(qmax, quantize(1.0)))
-        else:
-            out = _op.clip(in_expr, a_min=-1, a_max=1)
-
-        if output_tensor.qnn_params:
-            output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-            out = _qnn.op.requantize(
-                out,
-                input_scale=input_tensor.qnn_params["scale"],
-                input_zero_point=input_tensor.qnn_params["zero_point"],
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-            )
-
-        return out
-
-    def convert_log_softmax(self, op):
-        """Convert TFLite LOG_SOFTMAX"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-        out = _op.nn.log_softmax(in_expr)
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-
-        return out
-
-    def convert_concatenation(self, op):
-        """Convert TFLite concatenation"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.ConcatenationOptions import ConcatenationOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) >= 1, "input tensors should greater than 1"
-        in_exprs = [self.get_tensor_expr(_) for _ in input_tensors]
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions
-        op_options = op.BuiltinOptions()
-        concatenation_options = ConcatenationOptions()
-        concatenation_options.Init(op_options.Bytes, op_options.Pos)
-        concatenation_axis = concatenation_options.Axis()
-        fused_activation_fn = concatenation_options.FusedActivationFunction()
-
-        if not input_tensors[0].qnn_params:
-            out = _op.concatenate(in_exprs, axis=concatenation_axis)
-        else:
-            input_scales = [input_tensor.qnn_params["scale"] for input_tensor in input_tensors]
-            input_zero_points = [
-                input_tensor.qnn_params["zero_point"] for input_tensor in input_tensors
-            ]
-            out = _qnn.op.concatenate(
-                in_exprs,
-                input_scales=input_scales,
-                input_zero_points=input_zero_points,
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                axis=concatenation_axis,
-            )
-
-        # Handle fused activations
-        if output_tensor.qnn_params:
-            scale_val = get_scalar_from_constant(output_tensor.qnn_params["scale"])
-            zero_point_val = get_scalar_from_constant(output_tensor.qnn_params["zero_point"])
-            output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-            out = self.convert_qnn_fused_activation_function(
-                expr=out,
-                fused_activation_fn=fused_activation_fn,
-                scale=scale_val,
-                zero_point=zero_point_val,
-                dtype=output_tensor_type_str,
-            )
-        else:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
-
-        return out
-
-    def _convert_unary_elemwise(self, relay_op, op):
-        """Generic method to convert TFLite unary elemwise functions"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        if input_tensor.qnn_params:
-            in_expr = self.dequantize(in_expr, input_tensor)
-        out = relay_op(in_expr)
-        if output_tensor.qnn_params:
-            out = self.quantize(out, output_tensor)
-        return out
-
-    def convert_abs(self, op):
-        """Convert TFLite ABS"""
-        return self._convert_unary_elemwise(_op.abs, op)
-
-    def convert_ceil(self, op):
-        """Convert TFLite CEIL"""
-        return self._convert_unary_elemwise(_op.ceil, op)
-
-    def convert_floor(self, op):
-        """Convert TFLite FLOOR"""
-        return self._convert_unary_elemwise(_op.floor, op)
-
-    def convert_round(self, op):
-        """Convert TFLite ROUND"""
-        return self._convert_unary_elemwise(_op.round, op)
-
-    def convert_exp(self, op):
-        """Convert TFLite EXP"""
-        return self._convert_unary_elemwise(_op.exp, op)
-
-    def convert_log(self, op):
-        """Convert TFLite LOG"""
-        return self._convert_unary_elemwise(_op.log, op)
-
-    def convert_sin(self, op):
-        """Convert TFLite SIN"""
-        return self._convert_unary_elemwise(_op.sin, op)
-
-    def convert_tan(self, op):
-        """Convert TFLite TAN"""
-        return self._convert_unary_elemwise(_op.tan, op)
-
-    def convert_cos(self, op):
-        """Convert TFLite COS"""
-        return self._convert_unary_elemwise(_op.cos, op)
-
-    def convert_sqrt(self, op):
-        """Convert TFLite SQRT"""
-        return self._convert_unary_elemwise(_op.sqrt, op)
-
-    def convert_rsqrt(self, op):
-        """Convert TFLite RSQRT"""
-        return self._convert_unary_elemwise(_op.rsqrt, op)
-
-    def convert_neg(self, op):
-        """Convert TFLite NEG"""
-        return self._convert_unary_elemwise(_op.negative, op)
-
-    def convert_elu(self, op):
-        """Convert TFLite ELU"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-        exp_type = self.get_tensor_type_str(input_tensor.tensor.Type())
-        out = relay.const(-1.0, exp_type) * _op.nn.relu(
-            relay.const(1.0, exp_type) - _op.exp(in_expr)
-        ) + _op.nn.relu(in_expr)
-
-        return out
-
-    def convert_gelu(self, op):
-        """Convert TFLite GELU"""
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented(
-                "The TFLite to Relay converter does not support quantized GELU operator yet."
-            )
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-        in_type = self.get_tensor_type_str(input_tensor.tensor.Type())
-
-        return in_expr * (
-            _expr.const(0.5, dtype=in_type)
-            + _op.erf(in_expr * _expr.const(0.5**0.5, dtype=in_type))
-            * _expr.const(0.5, dtype=in_type)
-        )
-
-    def convert_square(self, op):
-        """Convert TFLite SQUARE"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        exp_type = self.get_tensor_type_str(output_tensor.tensor.Type())
-        out = _op.power(in_expr, relay.const(2, exp_type))
-
-        return out
-
-    def _convert_elemwise(self, relay_op, op, ignore_qnn_params=False, comparison_op=False):
-        """Generic method to Convert TFLite elemwise"""
-        try:
-            from tflite.AddOptions import AddOptions
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.DivOptions import DivOptions
-            from tflite.MulOptions import MulOptions
-            from tflite.SubOptions import SubOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        lhs_tensor = input_tensors[0]
-        rhs_tensor = input_tensors[1]
-        lhs_expr = self.get_tensor_expr(lhs_tensor)
-        rhs_expr = self.get_tensor_expr(rhs_tensor)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        # TFLite format demands equal scale and zero_point tuple parameters for some operations
-        # to allow us to use non-quantized operation instead of quantized if ignore_qnn_params=True
-        if ignore_qnn_params and not comparison_op:
-            assert (
-                lhs_tensor.qnn_params
-                and self.has_same_qnn_params(lhs_tensor, output_tensor)
-                and self.has_same_qnn_params(rhs_tensor, output_tensor)
-            ), "All tensors should be quantized with the same (scale,zero-point) tuple parameters"
-
-        # If quantized, extracts qnn params and call QNN add operator.
-        if not ignore_qnn_params and lhs_tensor.qnn_params:
-            assert rhs_tensor.qnn_params, "Both tensors should be quantized."
-            assert output_tensor.qnn_params, "Output tensor should be quantized."
-            out = relay_op(
-                lhs=lhs_expr,
-                rhs=rhs_expr,
-                lhs_scale=lhs_tensor.qnn_params["scale"],
-                lhs_zero_point=lhs_tensor.qnn_params["zero_point"],
-                rhs_scale=rhs_tensor.qnn_params["scale"],
-                rhs_zero_point=rhs_tensor.qnn_params["zero_point"],
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-            )
-        else:
-            out = relay_op(lhs_expr, rhs_expr)
-
-        # Options (fused_activation_function)
-        options = None
-        if op.BuiltinOptionsType() == BuiltinOptions.AddOptions:
-            options = AddOptions()
-        elif op.BuiltinOptionsType() == BuiltinOptions.SubOptions:
-            options = SubOptions()
-        elif op.BuiltinOptionsType() == BuiltinOptions.MulOptions:
-            options = MulOptions()
-        elif op.BuiltinOptionsType() == BuiltinOptions.DivOptions:
-            options = DivOptions()
-
-        if options is not None:
-            op_options = op.BuiltinOptions()
-            options.Init(op_options.Bytes, op_options.Pos)
-            fused_activation_fn = options.FusedActivationFunction()
-
-            # Handle fused activations
-            if not ignore_qnn_params and output_tensor.qnn_params:
-                scale_val = get_scalar_from_constant(output_tensor.qnn_params["scale"])
-                zero_point_val = get_scalar_from_constant(output_tensor.qnn_params["zero_point"])
-                output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-                out = self.convert_qnn_fused_activation_function(
-                    expr=out,
-                    fused_activation_fn=fused_activation_fn,
-                    scale=scale_val,
-                    zero_point=zero_point_val,
-                    dtype=output_tensor_type_str,
-                )
-            else:
-                out = self.convert_fused_activation_function(out, fused_activation_fn)
-        return out
-
-    def convert_add(self, op):
-        """Convert TFLite ADD"""
-        # Check if the input tensor is quantized, call QNN op
-        if self.is_quantized(op):
-            return self._convert_elemwise(_qnn.op.add, op)
-        return self._convert_elemwise(_op.add, op)
-
-    def convert_add_n(self, op):
-        """Convert TFLite ADD_N"""
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-
-        input_tensors = self.get_input_tensors(op)
-        assert not input_tensors[0].qnn_params, "TFLite does not support quantized ADD_N."
-        lhs_expr = self.get_tensor_expr(input_tensors[0])
-        for rhs_tensor in input_tensors[1:]:
-            assert not rhs_tensor.qnn_params, "TFLite does not support quantized ADD_N"
-            rhs_expr = self.get_tensor_expr(rhs_tensor)
-            lhs_expr = _op.add(lhs_expr, rhs_expr)
-        return lhs_expr
-
-    def convert_sub(self, op):
-        """Convert TFLite SUB"""
-        # Check if the input tensor is quantized, call QNN op
-        if self.is_quantized(op):
-            return self._convert_elemwise(_qnn.op.subtract, op)
-        return self._convert_elemwise(_op.subtract, op)
-
-    def convert_mul(self, op):
-        """Convert TFLite MUL"""
-        # Check if the input tensor is quantized, call QNN op
-        if self.is_quantized(op):
-            return self._convert_elemwise(_qnn.op.mul, op)
-        return self._convert_elemwise(_op.multiply, op)
-
-    def convert_div(self, op):
-        """Convert TFLite DIV"""
-        # Check if the input tensor is quantized, call QNN op
-        return self._convert_elemwise(_op.divide, op, self.is_quantized(op))
-
-    def convert_pow(self, op):
-        """Convert TFLite POW"""
-        # Check if the input tensor is quantized, call QNN op
-        return self._convert_elemwise(_op.power, op)
-
-    def convert_maximum(self, op):
-        """Convert TFLite MAXIMUM"""
-        return self._convert_elemwise(_op.maximum, op, self.is_quantized(op))
-
-    def convert_minimum(self, op):
-        """Convert TFLite MINIMUM"""
-        return self._convert_elemwise(_op.minimum, op, self.is_quantized(op))
-
-    def convert_greater(self, op):
-        """Convert TFLite GREATER"""
-        return self._convert_elemwise(_op.greater, op, self.is_quantized(op), comparison_op=True)
-
-    def convert_squared_difference(self, op):
-        """Convert TFLite SQUARED DIFFERENCE"""
-        # Check if the input tensor is quantized, call QNN op
-        # (https://github.com/tensorflow/tflite-micro/blob/bc35c3ed9c7ab93b3a13b46fce936f854bcfce2c/tensorflow/lite/micro/kernels/squared_difference.cc#L157)  # pylint: disable=line-too-long
-        if self.is_quantized(op):
-            input_tensors = self.get_input_tensors(op)
-            output_tensors = self.get_output_tensors(op)
-            lhs_expr = self.get_tensor_expr(input_tensors[0])
-            rhs_expr = self.get_tensor_expr(input_tensors[1])
-            assert len(input_tensors) == 2, "input tensors length should be 2"
-            assert len(output_tensors) == 1, "output tensors length should be 1"
-            lhs_expr_f32 = self.dequantize(lhs_expr, input_tensors[0])
-            rhs_expr_f32 = self.dequantize(rhs_expr, input_tensors[1])
-            out_f32 = _op.subtract(lhs_expr_f32, rhs_expr_f32)
-            return self.quantize(out_f32 * out_f32, output_tensors[0])
-
-        difference = self._convert_elemwise(_op.subtract, op)
-        # _convert_elemwise has guaranteed only have one output tensor
-        exp_type = self.get_tensor_type_str(self.get_output_tensors(op)[0].tensor.Type())
-        out = _op.power(difference, relay.const(2, exp_type))
-        return out
-
-    def convert_greater_equal(self, op):
-        """Convert TFLite GREATER_EQUAL"""
-        return self._convert_elemwise(
-            _op.greater_equal, op, self.is_quantized(op), comparison_op=True
-        )
-
-    def convert_less(self, op):
-        """Convert TFLite LESS"""
-        return self._convert_elemwise(_op.less, op, self.is_quantized(op), comparison_op=True)
-
-    def convert_less_equal(self, op):
-        """Convert TFLite LESS_EQUAL"""
-        return self._convert_elemwise(_op.less_equal, op, self.is_quantized(op), comparison_op=True)
-
-    def convert_equal(self, op):
-        """Convert TFLite EQUAL"""
-        return self._convert_elemwise(_op.equal, op, self.is_quantized(op), comparison_op=True)
-
-    def convert_not_equal(self, op):
-        """Convert TFLite NOT_EQUAL"""
-        return self._convert_elemwise(_op.not_equal, op, self.is_quantized(op), comparison_op=True)
-
-    def _convert_logical_binary(self, relay_op, op):
-        """Generic method to convert logical binary ops"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        lhs_tensor = input_tensors[0]
-        lhs_expr = self.get_tensor_expr(lhs_tensor)
-        rhs_tensor = input_tensors[1]
-        rhs_expr = self.get_tensor_expr(rhs_tensor)
-        out = relay_op(lhs_expr, rhs_expr)
-
-        return out
-
-    def convert_logical_and(self, op):
-        """Convert tflite LOGICAL_AND"""
-        return self._convert_logical_binary(_op.logical_and, op)
-
-    def convert_logical_or(self, op):
-        """Convert tflite LOGICAL_OR"""
-        return self._convert_logical_binary(_op.logical_or, op)
-
-    def convert_logical_not(self, op):
-        """Convert tflite LOGICAL_NOT"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        data = self.get_expr(input_tensors[0].tensor_idx)
-        out = _op.logical_not(data)
-
-        return out
-
-    def convert_gather(self, op):
-        """Method to Convert TFLite GATHER operator"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.GatherOptions import GatherOptions
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        data = self.get_tensor_expr(input_tensors[0])
-        indices = input_tensors[1]
-        indices_type = indices.tensor.Type()
-        assert indices_type in (TensorType.INT32, TensorType.INT64)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.GatherOptions
-        op_options = op.BuiltinOptions()
-        gather_options = GatherOptions()
-        gather_options.Init(op_options.Bytes, op_options.Pos)
-        axis = gather_options.Axis()
-
-        # Check the indices are with in bounds.
-        data_shape = to_int_list(self.get_tensor_shape(input_tensors[0]))
-        data_dim = len(data_shape)
-
-        axis = data_dim + axis if axis < 0 else axis
-        assert axis >= 0, "Axis out of bounds"
-        assert axis < data_dim, "Axis out of bounds"
-
-        if self.has_expr(indices.tensor_idx):
-            indices_expr = _op.cast(self.get_expr(indices.tensor_idx), "int32")
-        else:
-            indices_val = self.get_tensor_value(indices)
-            indices_expr = self.exp_tab.new_const(
-                indices_val,
-                dtype=self.get_tensor_type_str(indices_type),
-                source_name=indices.tensor.Name(),
-            )
-            indices_shape = list(indices_val.shape)
-            indices_len = len(indices_shape)
-
-            out_shape = data_shape[:axis] + indices_shape[:] + data_shape[axis + 1 :]
-
-            loopover = [range(s) for s in out_shape]
-            for idx in list(itertools.product(*loopover)):
-                real_indices = (
-                    list(idx[:axis])
-                    + [indices_val[idx[axis : axis + indices_len]]]
-                    + list(idx[axis + indices_len :])
-                )
-                if np.any(np.subtract(data_shape, real_indices) < 0):
-                    raise ValueError("TFLite out of bound indices are not supported.")
-
-        # Use mode 'fast' since indices are already checked within bounds.
-        out = _op.take(data, indices_expr, axis=axis, mode="fast")
-        return out
-
-    def convert_gather_nd(self, op):
-        """Method to Convert TFLite GATHER_ND operator"""
-        try:
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        for t in input_tensors:
-            assert not t.qnn_params, "Quantized input is not expected."
-
-        data = self.get_tensor_expr(input_tensors[0])
-        indices = self.get_tensor_expr(input_tensors[1])
-
-        indices_type = input_tensors[1].tensor.Type()
-        assert indices_type in (TensorType.INT32, TensorType.INT64)
-
-        indices_dims = len(_infer_shape(indices))
-        indices_t = _op.transpose(indices, axes=[-1] + list(range(indices_dims - 1)))
-
-        out = _op.gather_nd(data, indices_t)
-        return out
-
-    def convert_strided_slice(self, op):
-        """Method to Convert TFLite STRIDED_SLICE operator.
-        NOTE: Eventhough tensorflow supports begin_mask, end_mask, ellipsis_mask, new_axis_mask
-        and shrink_axis_mask, tflite doesn't support these and expect these values to be zero.
-        But in future, they may open up the mask implementation, so kept the implementation
-        same as tensorflow.
-
-        This op extracts a slice of size (end - begin) / stride from the given input tensor.
-        Starting at the location specified by begin the slice continues by adding stride to the
-        index until all dimensions are not less than end. Note that a stride can be negative,
-        which causes a reverse slice.
-
-        For slice input[val0, val1, ..., valn], begin/end/strides will be vectors of length n.
-
-        In each mask field(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)
-        the ith bit will correspond to the ith val.
-
-        If the ith bit of begin_mask is set, begin[i] is ignored and the fullest possible range
-        in that dimension is used instead.
-
-        If the ith bit of ellipsis_mask is set, as many unspecified dimensions as needed will be
-        inserted between other dimensions. Only one non-zero bit is allowed in ellipsis_mask.
-
-        If the ith bit of new_axis_mask is set, then begin, end, and stride are ignored and a
-        new length 1 dimension is added at this point in the output tensor.
-
-        If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks
-        the dimensionality by 1, taking on the value at index begin[i]. end[i] and strides[i]
-        are ignored in this case.
-        begin and end are zero-indexed. strides entries must be non-zero.
-
-        TVM Relay implementation of doesn't support mask, so the mask values are processed in
-        this function and begin/end/strides are updated accordingly. If any mask is present, and
-        since tvm doesn't support mask computation directly, the output need a final reshape.
-        """
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.StridedSliceOptions import StridedSliceOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 4, "input tensors length should be 4"
-
-        data_expr = self.get_expr(input_tensors[0].tensor_idx)
-
-        begin = list(self.get_tensor_value(input_tensors[1]))
-        end = list(self.get_tensor_value(input_tensors[2]))
-        stride = list(self.get_tensor_value(input_tensors[3]))
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.StridedSliceOptions
-        op_options = op.BuiltinOptions()
-        options = StridedSliceOptions()
-        options.Init(op_options.Bytes, op_options.Pos)
-        begin_mask = options.BeginMask()
-        end_mask = options.EndMask()
-        ellipsis_mask = options.EllipsisMask()
-        new_axis_mask = options.NewAxisMask()
-        shrink_axis_mask = options.ShrinkAxisMask()
-
-        data_shape = to_int_list(self.get_tensor_shape(input_tensors[0]))
-        data_dim = len(data_shape)
-        stride_dim = len(stride)
-
-        def _transform_mask(stride_dim, ellipsis_mask):
-            """Handle mask inputs to create new begin, end, stride and output shape"""
-            m_begin = [0] * data_dim
-            m_end = [0] * data_dim
-            m_stride = [0] * data_dim
-            fshape_indices = []
-            # Count new axis after ellipsis_mask, consider while applying ellipsis_mask.
-            ellipsis_seen = False
-            new_axes_after_ellipsis = 0
-            for i in range(stride_dim):
-                mask = 1 << i
-                if ellipsis_seen and (mask & new_axis_mask) != 0:
-                    new_axes_after_ellipsis += 1
-                if (mask & ellipsis_mask) != 0:
-                    ellipsis_seen = True
-            if not ellipsis_seen:
-                # Used later for extending the stride attributes in the below loop.
-                ellipsis_mask |= 1 << stride_dim
-                stride_dim += 1
-            final_index = 0
-            for index in range(stride_dim):
-                mask = 1 << index
-                if mask & ellipsis_mask:
-                    # Identify the end index for applying ellipsis_mask
-                    to_index = min(
-                        ((data_dim - (stride_dim - index)) + 1 + new_axes_after_ellipsis), data_dim
-                    )
-                    for i in range(final_index, to_index):
-                        m_begin[final_index] = 0
-                        m_end[final_index] = data_shape[final_index]
-                        m_stride[final_index] = 1
-                        fshape_indices.append(final_index)
-                        final_index += 1
-                elif mask & new_axis_mask:
-                    fshape_indices.append(-1)
-                elif not mask & new_axis_mask:
-                    if final_index == len(m_begin):
-                        break
-                    if mask & begin_mask:
-                        m_begin[final_index] = data_shape[final_index] if stride[index] < 0 else 0
-                    elif begin[index]:
-                        m_begin[final_index] = begin[index]
-                    if mask & end_mask:
-                        m_end[final_index] = 0 if stride[index] < 0 else data_shape[final_index]
-                    elif end[index]:
-                        m_end[final_index] = end[index]
-                    m_stride[final_index] = stride[index]
-                    if mask & shrink_axis_mask:
-                        # Tensorflow make axis with shrink_axis_mask as dimension 1
-                        m_begin[final_index] = (
-                            data_shape[final_index] + begin[index]
-                            if begin[index] < 0
-                            else begin[index]
-                        )
-                        m_end[final_index] = m_begin[final_index] + 1
-                        m_stride[final_index] = 1
-                        fshape_indices.append(-2)
-                    else:
-                        fshape_indices.append(final_index)
-
-                    final_index += 1
-            return m_begin, m_end, m_stride, fshape_indices
-
-        fshape_indices = None
-        if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask:
-            begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask)
-
-        out = _op.strided_slice(data_expr, begin=begin, end=end, strides=stride)
-        out_shape = _infer_shape(out)
-        if not fshape_indices:
-            fshape_indices = range(len(out_shape))
-
-        # Create final output shape.
-        final_output = []
-        final_len = len(fshape_indices)
-        for gather_index in fshape_indices:
-            if gather_index == -1:
-                final_output.append(1)
-                final_len += 1
-            elif gather_index == -2:
-                final_len -= 1
-            else:
-                final_output.append(out_shape[gather_index])
-
-        if final_len == 0:
-            return _op.squeeze(out, axis=tuple(range(len(fshape_indices))))
-
-        if not final_output:
-            return out
-        return _op.reshape(out, newshape=tuple(final_output))
-
-    def convert_zeros_like(self, op):
-        """Convert TFLite ZEROS LIKE"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-        out = _op.zeros_like(in_expr)
-
-        return out
-
-    def convert_fill(self, op):
-        """Convert TFLite FILL"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        if self.has_expr(input_tensors[0].tensor_idx):
-            raise tvm.error.OpNotImplemented(
-                "For dims parameter of Fill operator," " only constant values are supported."
-            )
-
-        in_dims = list(self.get_tensor_value(input_tensors[0]))
-        in_value_expr = self.get_expr(input_tensors[1].tensor_idx)
-        out = _op.full(in_value_expr, in_dims)
-
-        return out
-
-    def _convert_reduce(self, relay_op, op):
-        """Generic method to Convert TFLite REDUCE operators"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.ReducerOptions import ReducerOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        # input_tensor
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        # axis
-        axis_value = self.get_tensor_value(input_tensors[1])
-        axis = tuple(axis_value) if len(axis_value.shape) > 0 else tuple((axis_value.item(),))
-
-        # Options - keep_dims (bool)
-        # In case Options are not present, set keep_dims to False(default)
-        if op.BuiltinOptionsType():
-            assert op.BuiltinOptionsType() == BuiltinOptions.ReducerOptions
-            reduce_options = ReducerOptions()
-            op_options = op.BuiltinOptions()
-            reduce_options.Init(op_options.Bytes, op_options.Pos)
-            keep_dims = reduce_options.KeepDims()
-        else:
-            keep_dims = False
-
-        if input_tensor.qnn_params:
-            in_expr = _op.cast(in_expr, "int32")
-
-        out = relay_op(in_expr, axis, keep_dims)
-
-        # Finally if the reduce is quantized. Add a requantize at the end.
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-        output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-        if output_tensor.qnn_params:
-            out = _qnn.op.requantize(
-                out,
-                input_scale=input_tensor.qnn_params["scale"],
-                input_zero_point=input_tensor.qnn_params["zero_point"],
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-            )
-
-        return out
-
-    def convert_reduce_min(self, op):
-        return self._convert_reduce(_op.reduce.min, op)
-
-    def convert_reduce_max(self, op):
-        return self._convert_reduce(_op.reduce.max, op)
-
-    def convert_reduce_mean(self, op):
-        return self._convert_reduce(_op.reduce.mean, op)
-
-    def convert_reduce_prod(self, op):
-        return self._convert_reduce(_op.reduce.prod, op)
-
-    def convert_reduce_sum(self, op):
-        return self._convert_reduce(_op.reduce.sum, op)
-
-    def convert_reduce_any(self, op):
-        return self._convert_reduce(_op.reduce.any, op)
-
-    def _convert_arg_min_max(self, relay_op, op):
-        """Generic method converting TFLite arg_min_max"""
-        try:
-            from tflite.ArgMaxOptions import ArgMaxOptions
-            from tflite.ArgMinOptions import ArgMinOptions
-            from tflite.BuiltinOptions import BuiltinOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "two input tensor arguments expected"
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "one output tensor expected"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-        axis_tensor = input_tensors[1]
-        # In Tensorflow, `axis` argument is a Tensor, not attribute. We
-        # support the case where it inputs from a scalar constant.
-        axis_value = self.get_tensor_value(axis_tensor)
-        assert axis_value.size == 1
-        axis_value = axis_value.item()
-
-        if op.BuiltinOptionsType() == BuiltinOptions.ArgMinOptions:
-            arg_min_max_options = ArgMinOptions()
-        elif op.BuiltinOptionsType() == BuiltinOptions.ArgMaxOptions:
-            arg_min_max_options = ArgMaxOptions()
-        op_options = op.BuiltinOptions()
-        arg_min_max_options.Init(op_options.Bytes, op_options.Pos)
-
-        # set keepdims to True since tflite 1.13 removes all dims of size 1
-        # WARNING: all other versions of tflite > 1.13 need keepdims=False
-        out = relay_op(in_expr, axis=axis_value, keepdims=False, exclude=False)
-
-        return out
-
-    def convert_arg_min(self, op):
-        """Convert TFLite ARG_MIN"""
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented(
-                "TFlite quantized ARG_MIN operator is not supported yet."
-            )
-        return self._convert_arg_min_max(_op.argmin, op)
-
-    def convert_arg_max(self, op):
-        """Convert TFLite ARG_MAX"""
-        return self._convert_arg_min_max(_op.argmax, op)
-
-    def convert_fully_connected(self, op):
-        """Convert TFLite fully connected"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.FullyConnectedOptions import FullyConnectedOptions
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) in (2, 3), "input tensors length should be two or three"
-
-        input_tensor = input_tensors[0]
-        weight_tensor = input_tensors[1]
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-        output_tensor_type = output_tensor.tensor.Type()
-        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
-
-        weight_tensor_shape = to_int_list(self.get_tensor_shape(weight_tensor))
-
-        # Weight should have only 2 dimensions(TFLite convention)
-        assert len(weight_tensor_shape) == 2, "Weight should be only 2-dim"
-
-        # Input shape: [i_batch_size, ..., n_inputs]
-        # Filter shape: [n_inputs, n_units]
-        #
-        # As we will transform Fully_Connected Input to Dense Op inputs as below
-        # Dense expected Input shape: [batch_size, n_units]
-        # Dense expected Weight shape: [out_dim, n_units]
-        # Dense output shape: [batch_size, out_dim]
-        target_shape = tuple((-1, weight_tensor_shape[1]))
-        in_expr = self.get_tensor_expr(input_tensor)
-        in_expr = _op.reshape(in_expr, target_shape)
-
-        # TODO: Change the output shape calculation based on keep_dim option
-        assert op.BuiltinOptionsType() == BuiltinOptions.FullyConnectedOptions
-        op_options = op.BuiltinOptions()
-        fully_connected_options = FullyConnectedOptions()
-        fully_connected_options.Init(op_options.Bytes, op_options.Pos)
-        fused_activation_fn = fully_connected_options.FusedActivationFunction()
-        keep_num_dims = fully_connected_options.KeepNumDims()
-
-        # weight tensor type should be INT8/UINT8 (quantization) or FLOAT32
-        weight_tensor_type = weight_tensor.tensor.Type()
-        assert weight_tensor_type in (TensorType.INT8, TensorType.UINT8, TensorType.FLOAT32)
-        weight_tensor_type_str = self.get_tensor_type_str(weight_tensor_type)
-
-        if self.has_expr(weight_tensor.tensor_idx):
-            weight_expr = self.get_expr(weight_tensor.tensor_idx)
-        else:
-            weight_value = self.get_tensor_value(weight_tensor)
-            weight_expr = self.exp_tab.new_const(
-                weight_value, dtype=weight_tensor_type_str, source_name=weight_tensor.tensor.Name()
-            )
-        weight_shape = _infer_shape(weight_expr)
-
-        if input_tensor.qnn_params:
-            out = _qnn.op.dense(
-                in_expr,
-                weight_expr,
-                input_zero_point=input_tensor.qnn_params["zero_point"],
-                kernel_zero_point=weight_tensor.qnn_params["zero_point"],
-                input_scale=input_tensor.qnn_params["scale"],
-                kernel_scale=weight_tensor.qnn_params["scale"],
-                units=weight_shape[0],
-                out_dtype="int64" if output_tensor_type_str == "int16" else "int32",
-            )
-        else:
-            out = _op.nn.dense(in_expr, weight_expr, units=weight_shape[0])
-
-        # if we have bias
-        if len(input_tensors) == 3:
-            bias_tensor = input_tensors[2]
-            if bias_tensor.tensor_idx != -1:
-                bias_tensor_type = bias_tensor.tensor.Type()
-                # bias tensor type should be INT32 (quantization) or FLOAT32
-                assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32)
-                bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
-                if self.has_expr(bias_tensor.tensor_idx):
-                    bias_expr = self.get_expr(bias_tensor.tensor_idx)
-                else:
-                    bias_expr = self.exp_tab.new_const(
-                        self.get_tensor_value(bias_tensor),
-                        dtype=bias_tensor_type_str,
-                        source_name=bias_tensor.tensor.Name(),
-                    )
-                out = _op.nn.bias_add(out, bias_expr)
-
-        # Finally if the dense is quantized. Add a requantize at the end.
-        if output_tensor.qnn_params:
-            data_scale = input_tensor.qnn_params["scale"]
-            weight_scale = weight_tensor.qnn_params["scale"]
-            data_scale_val = get_scalar_from_constant(data_scale)
-            weight_scale_val = get_scalar_from_constant(weight_scale)
-            new_input_scale_val = data_scale_val * weight_scale_val
-            new_input_scale = relay.const(new_input_scale_val, "float32")
-            new_input_zero_point = relay.const(0, "int32")
-
-            # Requantize
-            out = _qnn.op.requantize(
-                out,
-                input_scale=new_input_scale,
-                input_zero_point=new_input_zero_point,
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-            )
-
-            # Call activation function
-            output_scale_val = get_scalar_from_constant(output_tensor.qnn_params["scale"])
-            output_zero_point_val = get_scalar_from_constant(output_tensor.qnn_params["zero_point"])
-            out = self.convert_qnn_fused_activation_function(
-                expr=out,
-                fused_activation_fn=fused_activation_fn,
-                scale=output_scale_val,
-                zero_point=output_zero_point_val,
-                dtype=output_tensor_type_str,
-            )
-
-        else:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
-
-        # Change the output shape calculation based on keep_dim option
-        if keep_num_dims:
-            input_shape = _infer_shape(self.get_tensor_expr(input_tensor))
-            output_shape = input_shape[:-1] + tuple([weight_tensor_shape[0]])
-            out = _op.reshape(out, output_shape)
-
-        return out
-
-    def convert_squeeze(self, op):
-        """Convert TFLite squeeze"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.SqueezeOptions import SqueezeOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        output_tensors = self.get_output_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.SqueezeOptions
-        op_options = op.BuiltinOptions()
-        squeeze_options = SqueezeOptions()
-        squeeze_options.Init(op_options.Bytes, op_options.Pos)
-        squeeze_axis = squeeze_options.SqueezeDimsAsNumpy()
-
-        in_expr = self.get_expr(input_tensor_idx)
-        out = _op.squeeze(in_expr, axis=tuple(squeeze_axis))
-
-        return out
-
-    def convert_fused_activation_function(self, in_expr, fused_activation_fn):
-        """Convert TFLite fused activation function"""
-        try:
-            from tflite.ActivationFunctionType import ActivationFunctionType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        if fused_activation_fn == ActivationFunctionType.NONE:
-            return in_expr
-        if fused_activation_fn == ActivationFunctionType.RELU6:
-            return _op.clip(in_expr, a_min=0, a_max=6)
-        if fused_activation_fn == ActivationFunctionType.RELU:
-            return _op.nn.relu(in_expr)
-        if fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1:
-            return _op.clip(in_expr, a_min=-1, a_max=1)
-        if fused_activation_fn == ActivationFunctionType.TANH:
-            return _op.tanh(in_expr)
-        fused_activation_fn_str = self.activation_fn_type[fused_activation_fn]
-        raise tvm.error.OpNotImplemented(
-            f"Fused activation {fused_activation_fn_str} is not supported yet."
-        )
-
-    def convert_conv(self, op, conv_type):
-        """convolution implementation."""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.Conv2DOptions import Conv2DOptions
-            from tflite.DepthwiseConv2DOptions import DepthwiseConv2DOptions
-            from tflite.Padding import Padding
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) >= 2, "input tensors length should be >= 2"
-
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-        weight_tensor = input_tensors[1]
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-        output_tensor_type = output_tensor.tensor.Type()
-        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
-
-        is_depthwise_conv = False
-        if conv_type == "conv2d":
-            assert op.BuiltinOptionsType() == BuiltinOptions.Conv2DOptions
-            op_options = op.BuiltinOptions()
-            conv_options = Conv2DOptions()
-            conv_options.Init(op_options.Bytes, op_options.Pos)
-        elif conv_type == "depthwise":
-            is_depthwise_conv = True
-            assert op.BuiltinOptionsType() == BuiltinOptions.DepthwiseConv2DOptions
-            op_options = op.BuiltinOptions()
-            conv_options = DepthwiseConv2DOptions()
-            conv_options.Init(op_options.Bytes, op_options.Pos)
-            depth_multiplier = conv_options.DepthMultiplier()
-        else:
-            raise tvm.error.OpNotImplemented(
-                f"Operator {conv_type} is not supported for frontend TFLite."
-            )
-
-        stride_h = conv_options.StrideH()
-        stride_w = conv_options.StrideW()
-        dilation_h = conv_options.DilationHFactor()
-        dilation_w = conv_options.DilationWFactor()
-        padding = conv_options.Padding()
-        fused_activation_fn = conv_options.FusedActivationFunction()
-
-        _, input_h, input_w, input_c = to_int_list(self.get_tensor_shape(input_tensor))
-
-        if is_depthwise_conv:
-            # TFLite depthwise convolution kernel layout is:
-            # 1 KH KW C(input_c * depth_multiplier)
-            _, kernel_h, kernel_w, in_channels = to_int_list(self.get_tensor_shape(weight_tensor))
-            assert in_channels == input_c * depth_multiplier
-        else:
-            output_channels, kernel_h, kernel_w, in_channels = to_int_list(
-                self.get_tensor_shape(weight_tensor)
-            )
-
-        dilated_kernel_h = dilation_h * (kernel_h - 1) + 1
-        dilated_kernel_w = dilation_w * (kernel_w - 1) + 1
-
-        params = {
-            "kernel_size": [kernel_h, kernel_w],
-            "strides": [stride_h, stride_w],
-            "dilation": [dilation_h, dilation_w],
-            "padding": [0, 0],
-            "data_layout": "NHWC",
-        }
-
-        if is_depthwise_conv:
-            params["channels"] = int(in_channels)
-            params["groups"] = int(input_c)
-            # If number of input channels is 1, treat as normal
-            # convolution.
-            params["kernel_layout"] = "HWIO" if input_c == 1 else "HWOI"
-        else:
-            params["channels"] = int(output_channels)
-            params["kernel_layout"] = "HWIO"
-            if input_c != in_channels:
-                assert (
-                    input_c % in_channels == 0
-                ), "Input channels is not divisible of kernel in_channels."
-                params["groups"] = int(input_c / in_channels)
-
-        # weight tensor type should be INT8/UINT8 (quantization) or FLOAT32
-        weight_tensor_type = weight_tensor.tensor.Type()
-        assert weight_tensor_type in (TensorType.INT8, TensorType.UINT8, TensorType.FLOAT32)
-        weight_tensor_type_str = self.get_tensor_type_str(weight_tensor_type)
-
-        in_expr = self.get_expr(input_tensor_idx)
-
-        # TFLite converts float32 models to float16 models by introducing
-        # a Dequantize op in every op that contains a float32 values.
-        # (weights, biases, and constants etc. )
-        # So conv op may have weight and bias as tensors instead of values.
-        if self.has_expr(weight_tensor.tensor_idx):
-            weight_expr = self.get_expr(weight_tensor.tensor_idx)
-            if is_depthwise_conv:
-                weight_expr = _op.reshape(
-                    weight_expr, (kernel_h, kernel_w, input_c, depth_multiplier)
-                )
-            else:
-                weight_expr = _op.transpose(weight_expr, axes=(1, 2, 3, 0))
-        else:
-            if self.is_prefetched(weight_tensor.tensor_idx):
-                weight_value = self.get_prefetched_node(weight_tensor.tensor_idx)
-            else:
-                weight_value = self.get_tensor_value(weight_tensor)
-
-            # TFLite kernel layout:
-            # convolution:
-            # OC KH KW IC, we require KH KW IC OC (HWIO)
-            # depthwise convolution:
-            # 1 KH KW C(input_c * depth_multiplier), we require
-            # KH KW IC M (depth_multiplier) (HWOI)
-            if is_depthwise_conv:
-                weight_value = weight_value.reshape(kernel_h, kernel_w, input_c, depth_multiplier)
-            else:
-                weight_value = weight_value.transpose((1, 2, 3, 0))
-
-            weight_expr = self.exp_tab.new_const(
-                weight_value, dtype=weight_tensor_type_str, source_name=weight_tensor.tensor.Name()
-            )
-
-        if padding == Padding.VALID:
-            pass
-        elif padding == Padding.SAME:
-            pad_top, pad_bottom = get_pad_value(input_h, dilated_kernel_h, stride_h)
-
-            pad_left, pad_right = get_pad_value(input_w, dilated_kernel_w, stride_w)
-            do_pad = not (pad_top == 0 and pad_bottom == 0 and pad_left == 0 and pad_right == 0)
-            if do_pad:
-                params["padding"] = [pad_top, pad_left, pad_bottom, pad_right]
-
-        else:
-            raise tvm.error.OpAttributeUnImplemented(
-                f"Padding format {padding} is not supported for operator Conv."
-            )
-
-        if input_tensor.qnn_params:
-            qnn_conv2d_params = dict(params)
-            qnn_conv2d_params["input_zero_point"] = input_tensor.qnn_params["zero_point"]
-            qnn_conv2d_params["kernel_zero_point"] = weight_tensor.qnn_params["zero_point"]
-            qnn_conv2d_params["out_dtype"] = (
-                "int64" if output_tensor_type_str == "int16" else "int32"
-            )
-            qnn_conv2d_params["input_scale"] = input_tensor.qnn_params["scale"]
-            qnn_conv2d_params["kernel_scale"] = weight_tensor.qnn_params["scale"]
-            out = _qnn.op.conv2d(in_expr, weight_expr, **qnn_conv2d_params)
-        else:
-            out = _op.nn.conv2d(in_expr, weight_expr, **params)
-
-        # if we have bias
-        if len(input_tensors) == 3:
-            bias_tensor = input_tensors[2]
-            bias_tensor_type = bias_tensor.tensor.Type()
-            # bias tensor type should be INT32 (int8 qnn) or INT64 (int16 qnn) or FLOAT32
-            assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32)
-            bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
-            if self.has_expr(bias_tensor.tensor_idx):
-                bias_expr = self.get_expr(bias_tensor.tensor_idx)
-            else:
-                bias_expr = self.exp_tab.new_const(
-                    self.get_tensor_value(bias_tensor),
-                    dtype=bias_tensor_type_str,
-                    source_name=bias_tensor.tensor.Name(),
-                )
-            channel_axis = 3
-            out = _op.nn.bias_add(out, bias_expr, axis=channel_axis)
-
-        # Handle fused activation.
-        if output_tensor.qnn_params:
-            # Calculate the intermediate scale and zero point of the int32 output.
-            data_scale = input_tensor.qnn_params["scale"]
-            data_scale_val = get_scalar_from_constant(data_scale)
-
-            weight_scale = weight_tensor.qnn_params["scale"]
-            # If weight scale is scalar, it is per-tensor quantization
-            if isinstance(weight_scale, float):
-                weight_scale_val = get_scalar_from_constant(weight_scale)
-            else:
-                weight_scale_val = get_tensor_from_constant(weight_scale)
-
-            new_input_scale_val = data_scale_val * weight_scale_val
-            new_input_scale = relay.const(new_input_scale_val, "float32")
-            new_input_zero_point = relay.const(0, "int32")
-
-            # Finally requantize
-            out = _qnn.op.requantize(
-                out,
-                input_scale=new_input_scale,
-                input_zero_point=new_input_zero_point,
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-                axis=3,
-            )
-
-            # Call activation function
-            output_scale_val = get_scalar_from_constant(output_tensor.qnn_params["scale"])
-            output_zero_point_val = get_scalar_from_constant(output_tensor.qnn_params["zero_point"])
-            out = self.convert_qnn_fused_activation_function(
-                expr=out,
-                fused_activation_fn=fused_activation_fn,
-                scale=output_scale_val,
-                zero_point=output_zero_point_val,
-                dtype=output_tensor_type_str,
-            )
-        else:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
-        return out
-
-    def convert_split(self, op):
-        """split implementation."""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.SplitOptions import SplitOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-
-        assert len(input_tensors) == 2, "input tensors length should be == 2"
-
-        axis_tensor = input_tensors[0]
-        split_axis = self.get_tensor_value(axis_tensor)
-        input_tensor = input_tensors[1]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.SplitOptions
-        op_options = op.BuiltinOptions()
-        split_options = SplitOptions()
-        split_options.Init(op_options.Bytes, op_options.Pos)
-        num_splits = split_options.NumSplits()
-
-        in_expr = self.get_expr(input_tensor_idx)
-        out = _op.split(in_expr, num_splits, axis=int(split_axis))
-        # Relay does not like a TupleWrapper of 1 element, further this
-        # only shows up with tf1.13 if we use a split with num_splits==1.
-        # In tf 1.14 this doesn't appear as it is automatically a reshape
-        # operation.
-        if isinstance(out, _expr.TupleWrapper):
-            if out.size == 1:
-                out = out[0]
-
-        return out
-
-    def convert_split_v(self, op):
-        """SPLIT_V implementation."""
-        input_tensors = self.get_input_tensors(op)
-
-        assert len(input_tensors) == 3, "input tensors length should be 3"
-
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-        in_expr = self.get_expr(input_tensor_idx)
-
-        if self.has_expr(input_tensors[1].tensor_idx):
-            raise tvm.error.OpNotImplemented(
-                "For size_splits parameter of SPLIT_V operator, "
-                "only constant values are supported."
-            )
-        size_splits = list(self.get_tensor_value(input_tensors[1]))
-        size_splits = tuple(np.cumsum(size_splits)[:-1])
-
-        axis_tensor = input_tensors[2]
-        split_axis = self.get_tensor_value(axis_tensor)
-
-        out = _op.split(in_expr, size_splits, axis=int(split_axis))
-        # Relay does not like a TupleWrapper of 1 element, further this
-        # only shows up with tf1.13 if we use a split with num_splits==1.
-        # In tf 1.14 this doesn't appear as it is automatically a reshape
-        # operation.
-        if isinstance(out, _expr.TupleWrapper) and out.size == 1:
-            out = out[0]
-
-        return out
-
-    def convert_slice(self, op):
-        """Convert TFLite SLICE"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 3, "input tensors length should be == 3"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        begin = list(self.get_tensor_value(input_tensors[1]))
-        size = list(self.get_tensor_value(input_tensors[2]))
-        # strided_slice(Relay) needs the slice's end indices, not the size
-        end = size
-        input_tensor_shape = to_int_list(self.get_tensor_shape(input_tensor))
-        input_tensor_rank = len(input_tensor_shape)
-        for i in range(input_tensor_rank):
-            if size[i] == -1:
-                end[i] = input_tensor_shape[i]
-            else:
-                end[i] += begin[i]
-
-        out = _op.strided_slice(in_expr, begin, end)
-
-        return out
-
-    def convert_select(self, op):
-        """Convert TFLite SELECT"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 3, "input tensors length should be == 3"
-        cond = self.get_tensor_expr(input_tensors[0])
-        x = self.get_tensor_expr(input_tensors[1])
-        y = self.get_tensor_expr(input_tensors[2])
-
-        out = _op.where(cond, x, y)
-
-        return out
-
-    def convert_transpose(self, op):
-        """transpose implementation."""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        in_expr = self.get_expr(input_tensor_idx)
-
-        # axis
-        in_axis = tuple(self.get_tensor_value(input_tensors[1]))
-
-        if not in_axis:
-            out = _op.transpose(in_expr)
-        else:
-            out = _op.transpose(in_expr, in_axis)
-
-        return out
-
-    def convert_reverse_sequence(self, op):
-        """Convert TFLite REVERSE_SEQUENCE"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.ReverseSequenceOptions import ReverseSequenceOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented(
-                "TFLite does not support quantized REVERSE_SEQUENCE operator yet."
-            )
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        in_expr = self.get_tensor_expr(input_tensors[0])
-        length_expr = self.get_tensor_expr(input_tensors[1])
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.ReverseSequenceOptions
-        op_options = op.BuiltinOptions()
-        options = ReverseSequenceOptions()
-        options.Init(op_options.Bytes, op_options.Pos)
-        batch_axis = options.BatchDim()
-        seq_axis = options.SeqDim()
-
-        return _op.reverse_sequence(in_expr, length_expr, seq_axis, batch_axis)
-
-    def convert_cast(self, op):
-        """Convert TFLite CAST"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.CastOptions import CastOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        # MLIR-based converter outputs no BuiltinOptions for Cast operator. In this
-        # case the output type can be derived from the Cast operator output tensor.
-        # When TOCO converter is used there will be "normal" BuiltinOptions.CastOptions
-        # with output type.
-        if op.BuiltinOptions() is not None:
-            assert op.BuiltinOptionsType() == BuiltinOptions.CastOptions
-            op_options = op.BuiltinOptions()
-            cast_options = CastOptions()
-            cast_options.Init(op_options.Bytes, op_options.Pos)
-            cast_dtype = cast_options.OutDataType()
-        else:
-            cast_dtype = self.get_output_tensors(op)[0].tensor.Type()
-
-        out = _op.cast(in_expr, self.get_tensor_type_str(cast_dtype))
-
-        return out
-
-    def convert_tile(self, op):
-        """tile implementation."""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        in_expr = self.get_expr(input_tensor_idx)
-
-        # reps (tuple of int) – The number of times repeating the tensor data.
-        reps = tuple(self.get_tensor_value(input_tensors[1]))
-
-        out = _op.tile(in_expr, reps)
-
-        return out
-
-    def convert_topk_v2(self, op):
-        """Convert TFLite TOPK_v2"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-        in_expr = self.get_expr(input_tensor_idx)
-        k = self.get_tensor_value(input_tensors[1])
-        out = _op.topk(in_expr, int(k))
-
-        return out
-
-    def convert_pool2d(self, op, pool_type):
-        """pool2d implementation."""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.Padding import Padding
-            from tflite.Pool2DOptions import Pool2DOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors should be 1"
-        output_tensor = output_tensors[0]
-        output_tensor_type = output_tensor.tensor.Type()
-        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.Pool2DOptions
-        op_options = op.BuiltinOptions()
-        pool2d_options = Pool2DOptions()
-        pool2d_options.Init(op_options.Bytes, op_options.Pos)
-        stride_h = pool2d_options.StrideH()
-        stride_w = pool2d_options.StrideW()
-        padding = pool2d_options.Padding()
-        filter_h = pool2d_options.FilterHeight()
-        filter_w = pool2d_options.FilterWidth()
-        fused_activation_fn = pool2d_options.FusedActivationFunction()
-
-        params = {
-            "pool_size": (filter_h, filter_w),
-            "strides": (stride_h, stride_w),
-            "padding": [0, 0],
-            "layout": "NHWC",
-        }
-
-        in_expr = self.get_expr(input_tensor_idx)
-
-        _, input_h, input_w, _ = to_int_list(self.get_tensor_shape(input_tensor))
-
-        if padding == Padding.VALID:
-            pass
-        elif padding == Padding.SAME:
-            pad_top, pad_bottom = get_pad_value(input_h, filter_h, stride_h)
-            pad_left, pad_right = get_pad_value(input_w, filter_w, stride_w)
-            params["padding"] = [pad_top, pad_left, pad_bottom, pad_right]
-        else:
-            raise tvm.error.OpAttributeUnImplemented(
-                f"Padding format {padding} for operator Pool2D is not supported."
-            )
-
-        if pool_type == "average":
-            if input_tensor.qnn_params:
-                assert self.has_same_qnn_params(input_tensor, output_tensor), (
-                    "TFLite avg_pool2dreshape requires input and output scale"
-                    "and zero points to be equal"
-                )
-                out = _op.cast(in_expr, dtype="int32")
-                out = _op.nn.avg_pool2d(out, **params)
-                out = _op.cast(out, dtype=output_tensor_type_str)
-            else:
-                out = _op.nn.avg_pool2d(in_expr, **params)
-        elif pool_type == "max":
-            if input_tensor.qnn_params:
-                assert self.has_same_qnn_params(
-                    input_tensor, output_tensor
-                ), "qnn.op.max_pool2d requires input and output qnn params to be same"
-            out = _op.nn.max_pool2d(in_expr, **params)
-        elif pool_type == "l2":
-            # L2_POOL_2D is equivalent to square_root(avg_pool(square(in_data)))
-            # TFLite does not have support for quantised L2_POOL_2D op.
-            assert (
-                not input_tensor.qnn_params
-            ), "As TFLite does not have support for quantized L2_POOL_2D, \
-                Quantized input is not expected."
-            exp_type = self.get_tensor_type_str(output_tensor.tensor.Type())
-            square_exp = _op.power(in_expr, relay.const(2, exp_type))
-            avg_pool_exp = _op.nn.avg_pool2d(square_exp, **params)
-            out = _op.sqrt(avg_pool_exp)
-        else:
-            raise tvm.error.OpNotImplemented(
-                f"Operator {pool_type} pool is not supported for frontend TFLite."
-            )
-
-        # Handle fused activations
-        if output_tensor.qnn_params:
-            scale_val = get_scalar_from_constant(output_tensor.qnn_params["scale"])
-            zero_point_val = get_scalar_from_constant(output_tensor.qnn_params["zero_point"])
-            out = self.convert_qnn_fused_activation_function(
-                expr=out,
-                fused_activation_fn=fused_activation_fn,
-                scale=scale_val,
-                zero_point=zero_point_val,
-                dtype=output_tensor_type_str,
-            )
-        else:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
-
-        return out
-
-    def convert_pad(self, op):
-        """Convert TFLite PAD/PADV2 \
-           TFLite treats PAD and PADV2 operators identically"""
-
-        input_tensors = self.get_input_tensors(op)
-
-        # TFLite PAD/PADV2 only supports CONSTANT mode
-        assert (
-            len(input_tensors) == 2 or len(input_tensors) == 3
-        ), "input tensor's length should be 2 for PAD and 3 for PADV2"
-
-        if len(input_tensors) == 3:
-            assert (
-                input_tensors[0].tensor.Type() == input_tensors[2].tensor.Type()
-            ), "constant_values tensor must be of same type as input tensor"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        # paddings
-        pad_list = self.get_tensor_value(input_tensors[1])
-
-        # convert list of lists to tuple of tuples
-        paddings = tuple(tuple(l) for l in pad_list)
-
-        # Set the pad value, by default 0, unless constant_values parameter is provided
-        pad_value = 0
-
-        if input_tensor.qnn_params:
-            # Check that input and output tensor have same qnn params.
-            output_tensors = self.get_output_tensors(op)
-            output_tensor = output_tensors[0]
-            assert self.has_same_qnn_params(
-                input_tensor, output_tensor
-            ), "TFLite PADV2 requires input and output scale and zero points to be equal"
-
-            # The pad value for quantized pad is the input zero point by default.
-            pad_value = float(input_tensor.qnn_params["zero_point"].data.numpy())
-
-        if len(input_tensors) == 3:
-            pad_value = self.get_tensor_value(input_tensors[2])
-            if isinstance(pad_value, np.ndarray):
-                pad_value = pad_value.tolist()
-            if isinstance(pad_value, list):
-                assert len(pad_value) == 1, "Only one constant value is expected."
-                pad_value = pad_value[0]
-            if input_tensor.qnn_params:
-                # Check that input tensor and constant_values have same qnn params.
-                assert self.has_same_qnn_params(
-                    input_tensor, input_tensors[2]
-                ), "TFLite PADV2 requires input and constant_values tensors' \
-                        scale and zero points to be equal"
-
-        out = _op.nn.pad(in_expr, pad_width=paddings, pad_value=pad_value)
-        return out
-
-    def convert_floor_div(self, op):
-        """Convert TFLite FLOOR_DIV"""
-        return self._convert_elemwise(_op.floor_divide, op, self.is_quantized(op))
-
-    def convert_floor_mod(self, op):
-        """Convert TFLite FLOOR_MOD"""
-        return self._convert_elemwise(_op.floor_mod, op, self.is_quantized(op))
-
-    def convert_mirror_pad(self, op):
-        """Convert TFLite MIRROR_PAD"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.MirrorPadOptions import MirrorPadOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        # tensor
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        # paddings
-        pad_list = self.get_tensor_value(input_tensors[1])
-        # convert list of lists to tuple of tuples
-        paddings = tuple(tuple(l.astype(np.int32)) for l in pad_list)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.MirrorPadOptions
-        op_options = op.BuiltinOptions()
-        mirror_pad_options = MirrorPadOptions()
-        mirror_pad_options.Init(op_options.Bytes, op_options.Pos)
-        mode_byte = mirror_pad_options.Mode()
-
-        mode = "REFLECT" if mode_byte == 0 else "SYMMETRIC"
-        out = _op.nn.mirror_pad(in_expr, paddings, mode)
-
-        return out
-
-    def convert_pack(self, op):
-        """Convert TFLite pack"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.PackOptions import PackOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-
-        if input_tensors[0].qnn_params:
-            output_tensor = output_tensors[0]
-            assert self.has_same_qnn_params(
-                input_tensors[0], output_tensor
-            ), "TFLite pack requires input and output scale and zero points to be equal"
-
-            for input_tensor in input_tensors:
-                assert self.has_same_qnn_params(
-                    input_tensors[0], input_tensor
-                ), "TFLite pack requires all input tensors to have same scale and zero point"
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.PackOptions
-        op_options = op.BuiltinOptions()
-        pack_options = PackOptions()
-        pack_options.Init(op_options.Bytes, op_options.Pos)
-        pack_axis = pack_options.Axis()
-        pack_values_count = pack_options.ValuesCount()
-        assert len(input_tensors) == pack_values_count, "Discordance in input values count"
-
-        in_exprs = [self.get_tensor_expr(_) for _ in input_tensors]
-        in_exprs_reshaped = [_op.expand_dims(_, axis=pack_axis, num_newaxis=1) for _ in in_exprs]
-        out = _op.concatenate(in_exprs_reshaped, pack_axis)
-        return out
-
-    def convert_unpack(self, op):
-        """Convert TFLite unpack"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.UnpackOptions import UnpackOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-        assert op.BuiltinOptionsType() == BuiltinOptions.UnpackOptions
-        op_options = op.BuiltinOptions()
-        unpack_options = UnpackOptions()
-        unpack_options.Init(op_options.Bytes, op_options.Pos)
-        num_unpacks = unpack_options.Num()
-        unpack_axis = unpack_options.Axis()
-
-        # Relay doesn't support 'unpack' operator so we use 'split' & 'squeeze' instead.
-        # We have to do 'squeeze' along the split axis.
-        # Relay expects squeeze_axis to be List.
-        squeeze_axis = [unpack_axis]
-
-        # Relay doesn't like TupleWrapper of 1 element so we isolate the case of unpacking
-        # a tensor by an axis with len(axis) == 1. For reference see convert_split().
-        # Such unpacking will result in the same tensor so we omit 'split' and only squeeze
-        # along the axis of dim == 1.
-        if num_unpacks == 1:
-            squeezed = _op.squeeze(in_expr, axis=squeeze_axis)
-            if isinstance(squeezed, _expr.TupleWrapper):
-                squeezed = squeezed[0]
-        else:
-            splitted = _op.split(in_expr, indices_or_sections=num_unpacks, axis=unpack_axis)
-            squeezed = _expr.TupleWrapper(
-                _expr.Tuple(
-                    [_op.squeeze(split_item, axis=squeeze_axis) for split_item in splitted]
-                ),
-                len(splitted),
-            )
-
-        return squeezed
-
-    def convert_unidirectional_sequence_lstm(self, op):
-        """Long Short Term Memory for TFLite implementation."""
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented(
-                "TFlite quantized UNIDIRECTIONALSEQUENCELSTM operator is not supported yet."
-            )
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 24, "input tensors length should be == 24"
-
-        # Extract input tensor from saved model
-        input_tensor = input_tensors[0]
-
-        # Extract tensors from input tensors from saved model
-        # Input weights
-        input_input_weights = input_tensors[1]
-        input_forget_weights = input_tensors[2]
-        input_cell_weights = input_tensors[3]
-        input_output_weights = input_tensors[4]
-        # Recurrent weights
-        recurrent_input_weights = input_tensors[5]
-        recurrent_forget_weights = input_tensors[6]
-        recurrent_cell_weights = input_tensors[7]
-        recurrent_output_weights = input_tensors[8]
-        # inputs 9, 10, 11, 16, 17, 20, 21, 22, 23 are not occupied
-        # there locations are -1 in the flatbuffer
-        # Bias weights
-        input_gate_bias = input_tensors[12]
-        forget_gate_bias = input_tensors[13]
-        cell_gate_bias = input_tensors[14]
-        output_gate_bias = input_tensors[15]
-
-        # State input
-        output_state_in = input_tensors[18]
-        cell_state_in = input_tensors[19]
-
-        # Extract output tensor from saved model
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        X_steps = self.unbind(input_tensor, axis=1)
-        weights_dict = {}
-
-        # hidden_state_weights is equivalent to output_state_in in tflite model
-        out_state_in_shape = tuple(self.get_tensor_shape(output_state_in))
-        out_state_in_dtype = self.get_tensor_type_str(output_state_in.tensor.Type())
-        out_state_in_expr = _op.zeros(out_state_in_shape, dtype=out_state_in_dtype)
-        weights_dict["hidden_state"] = _op.split(out_state_in_expr, 1)[0]
-
-        # cell_state_weights is equivalent to output_state_in tflite model
-        cell_state_in_shape = tuple(self.get_tensor_shape(cell_state_in))
-        cell_state_in_dtype = self.get_tensor_type_str(cell_state_in.tensor.Type())
-        cell_state_in_expr = _op.zeros(cell_state_in_shape, dtype=cell_state_in_dtype)
-        weights_dict["cell_state"] = _op.split(cell_state_in_expr, 1)[0]
-
-        # Process weight matrix of input: w_inp
-        # Concatenate of [input_input_weight, input_forget_weights,
-        # input_cell_weights, input_output_weights]
-        input_input_weights_default_values = self.get_tensor_value(input_input_weights)
-        input_input_weights_op = _op.split(
-            _op.const(input_input_weights_default_values.tolist()), 1
-        )
-        input_output_weights_default_values = self.get_tensor_value(input_output_weights)
-        input_output_weights_op = _op.split(
-            _op.const(input_output_weights_default_values.tolist()), 1
-        )
-        input_forget_weights_default_values = self.get_tensor_value(input_forget_weights)
-        input_forget_weights_op = _op.split(
-            _op.const(input_forget_weights_default_values.tolist()), 1
-        )
-        input_cell_weights_default_values = self.get_tensor_value(input_cell_weights)
-        input_cell_weights_op = _op.split(_op.const(input_cell_weights_default_values.tolist()), 1)
-        weights_dict["w_inp"] = _op.concatenate(
-            [
-                _op.squeeze(input_input_weights_op[0]),
-                _op.squeeze(input_forget_weights_op[0]),
-                _op.squeeze(input_cell_weights_op[0]),
-                _op.squeeze(input_output_weights_op[0]),
-            ],
-            axis=0,
-        )
-
-        # Process weight matrix of hidden state:
-        # w_hid to support lstm_cell function. Not used in tflite
-        recurrent_input_weights_values = self.get_tensor_value(recurrent_input_weights)
-        recurrent_input_weights_op = _op.split(
-            _op.const(recurrent_input_weights_values.tolist()), 1
-        )
-        recurrent_output_weights_values = self.get_tensor_value(recurrent_output_weights)
-        recurrent_output_weights_op = _op.split(
-            _op.const(recurrent_output_weights_values.tolist()), 1
-        )
-        recurrent_forget_weights_values = self.get_tensor_value(recurrent_forget_weights)
-        recurrent_forget_weights_op = _op.split(
-            _op.const(recurrent_forget_weights_values.tolist()), 1
-        )
-        recurrent_cell_weights_values = self.get_tensor_value(recurrent_cell_weights)
-        recurrent_cell_weights_op = _op.split(_op.const(recurrent_cell_weights_values.tolist()), 1)
-        weights_dict["w_hid"] = _op.concatenate(
-            [
-                recurrent_input_weights_op[0],
-                recurrent_forget_weights_op[0],
-                recurrent_cell_weights_op[0],
-                recurrent_output_weights_op[0],
-            ],
-            axis=0,
-        )
-
-        # Process weight matrix of bias: b_inp
-        input_gate_bias_values = self.get_tensor_value(input_gate_bias)
-        input_gate_bias_op = _op.split(_op.const(input_gate_bias_values.tolist()), 1)
-        output_gate_bias_values = self.get_tensor_value(output_gate_bias)
-        output_gate_bias_op = _op.split(_op.const(output_gate_bias_values.tolist()), 1)
-        forget_gate_bias_values = self.get_tensor_value(forget_gate_bias)
-        forget_gate_bias_op = _op.split(_op.const(forget_gate_bias_values.tolist()), 1)
-        cell_gate_bias_values = self.get_tensor_value(cell_gate_bias)
-        cell_gate_bias_op = _op.split(_op.const(cell_gate_bias_values.tolist()), 1)
-        weights_dict["b_inp"] = _op.concatenate(
-            [
-                input_gate_bias_op[0],
-                forget_gate_bias_op[0],
-                cell_gate_bias_op[0],
-                output_gate_bias_op[0],
-            ],
-            axis=0,
-        )
-
-        # Process weight matrix of hidden bias:
-        # b_hid (with the same shape as b_inp)
-        gate_bias_dtype = self.get_tensor_type_str(input_gate_bias.tensor.Type())
-        weights_dict["b_hid"] = _op.split(
-            _op.const(
-                np.zeros(_infer_shape(weights_dict["b_inp"]), dtype=gate_bias_dtype),
-                dtype=gate_bias_dtype,
-            ),
-            1,
-        )[0]
-
-        outputs, _, _ = lstm_cell(input_seqs=X_steps, **weights_dict)
-
-        output = _op.stack(outputs, axis=1)
-        return output
-
-    def convert_batch_to_space_nd(self, op):
-        """batch_to_space_nd implementation."""
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 3, "input tensors length should be 3"
-
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-        in_expr = self.get_expr(input_tensor_idx)
-
-        block_shape = list(self.get_tensor_value(input_tensors[1]))
-        crops = self.get_tensor_value(input_tensors[2]).tolist()
-
-        out = _op.nn.batch_to_space_nd(in_expr, block_shape, crops)
-
-        return out
-
-    def convert_batch_matmul(self, op):
-        """batch_matmul implementation."""
-        try:
-            from tflite.BatchMatMulOptions import BatchMatMulOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-
-        assert len(input_tensors) == 2, "two input tensor arguments expected"
-
-        batch_matmul_options = BatchMatMulOptions()
-        op_options = op.BuiltinOptions()
-        batch_matmul_options.Init(op_options.Bytes, op_options.Pos)
-
-        input_a = self.get_expr(input_tensors[0].tensor_idx)
-        input_b = self.get_expr(input_tensors[1].tensor_idx)
-
-        shape_a = shape_of(input_a)
-        shape_b = shape_of(input_b)
-        rank_a = _infer_shape(shape_a)[0]
-        rank_b = _infer_shape(shape_b)[0]
-
-        if rank_a > 2 or rank_b > 2:
-            # Determine the output batch dimension
-            new_a_shape = shape_a
-            new_b_shape = shape_b
-            if rank_a > rank_b:
-                rank_diff = rank_a - rank_b
-                new_b_shape = _op.concatenate(
-                    [
-                        _expr.const(
-                            [1] * rank_diff, dtype=_infer_type(new_b_shape).checked_type.dtype
-                        ),
-                        shape_b,
-                    ],
-                    0,
-                )
-            elif rank_a < rank_b:
-                rank_diff = rank_b - rank_a
-                new_a_shape = _op.concatenate(
-                    [
-                        _expr.const(
-                            [1] * rank_diff, dtype=_infer_type(new_a_shape).checked_type.dtype
-                        ),
-                        shape_a,
-                    ],
-                    0,
-                )
-            else:
-                pass
-
-            out_batch = _op.concatenate(
-                [
-                    _op.maximum(
-                        _op.strided_slice(new_b_shape, [i], [i + 1]),
-                        _op.strided_slice(new_a_shape, [i], [i + 1]),
-                    )
-                    for i in range(max(rank_a, rank_b) - 2)
-                ],
-                0,
-            )
-
-            a_broadcasted_shape = _fold_constant(
-                _op.concatenate([out_batch, _op.strided_slice(shape_a, [rank_a - 2], [rank_a])], 0)
-            )
-            b_broadcasted_shape = _fold_constant(
-                _op.concatenate([out_batch, _op.strided_slice(shape_b, [rank_b - 2], [rank_b])], 0)
-            )
-            if not tvm.ir.structural_equal(shape_a, a_broadcasted_shape):
-                input_a = _op.transform.broadcast_to(input_a, a_broadcasted_shape)
-            if not tvm.ir.structural_equal(shape_b, b_broadcasted_shape):
-                input_b = _op.transform.broadcast_to(input_b, b_broadcasted_shape)
-
-            input_a = self.flatten_to_nd(input_a, shape_a, 3)
-            input_b = self.flatten_to_nd(input_b, shape_b, 3)
-
-            if batch_matmul_options.AdjX():
-                input_a = _op.transpose(input_a, [0, 2, 1])
-            if not batch_matmul_options.AdjY():
-                input_b = _op.transpose(input_b, [0, 2, 1])
-
-            if self.is_quantized(op):
-                output = _qnn.op.batch_matmul(
-                    input_a,
-                    input_b,
-                    relay.const(0, "int32"),
-                    relay.const(0, "int32"),
-                    relay.const(1.0, "float32"),
-                    relay.const(1.0, "float32"),
-                )
-            else:
-                output = _op.nn.batch_matmul(input_a, input_b)
-
-            # Reshape output to original dimensions.
-            output_shape = shape_of(output)
-
-            rank_out = _infer_shape(output_shape)[0]
-
-        final_shape = _op.concatenate(
-            [
-                _op.strided_slice(shape_a, [0], [rank_a - 2]),
-                _op.strided_slice(output_shape, [rank_out - 2], [rank_out]),
-            ],
-            0,
-        )
-
-        reshape = _op.reshape(output, _fold_constant(final_shape))
-        # qnn batch matmul returns a int32 tensor so we need to requantize
-        if self.is_quantized(op):
-            return _qnn.op.requantize(
-                reshape,
-                relay.const(1.0, "float32"),
-                relay.const(0, "int32"),
-                relay.const(1.0, "float32"),
-                relay.const(0, "int32"),
-                out_dtype="int8",
-            )
-        else:
-            return reshape
-
-    def convert_space_to_batch_nd(self, op):
-        """space_to_batch_nd implementation."""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 3, "input tensors length should be 3"
-
-        input_tensor = input_tensors[0]
-        input_tensor_idx = input_tensor.tensor_idx
-        in_expr = self.get_expr(input_tensor_idx)
-
-        block_shape = list(self.get_tensor_value(input_tensors[1]))
-        paddings = self.get_tensor_value(input_tensors[2]).tolist()
-
-        out = _op.nn.space_to_batch_nd(in_expr, block_shape, paddings)
-
-        return out
-
-    def convert_depth_to_space(self, op):
-        """Convert TFLite DEPTH_TO_SPACE"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.DepthToSpaceOptions import DepthToSpaceOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.DepthToSpaceOptions
-        op_options = op.BuiltinOptions()
-        depth_to_space_options = DepthToSpaceOptions()
-        depth_to_space_options.Init(op_options.Bytes, op_options.Pos)
-        block_size = depth_to_space_options.BlockSize()
-        out = _op.nn.depth_to_space(in_expr, block_size, layout="NHWC")
-
-        return out
-
-    def convert_space_to_depth(self, op):
-        """Convert TFLite SPACE_TO_DEPTH"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.SpaceToDepthOptions import SpaceToDepthOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.SpaceToDepthOptions
-        op_options = op.BuiltinOptions()
-        space_to_depth_options = SpaceToDepthOptions()
-        space_to_depth_options.Init(op_options.Bytes, op_options.Pos)
-        block_size = space_to_depth_options.BlockSize()
-        out = _op.nn.space_to_depth(in_expr, block_size, layout="NHWC")
-
-        return out
-
-    def convert_sparse_to_dense(self, op):
-        """Convert TFLite SPARSE_TO_DENSE"""
-        try:
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 4, "input tensors length should be 4"
-
-        indices, values = input_tensors[0], input_tensors[2]
-        default_value = input_tensors[3]
-        output_shape = input_tensors[1]
-
-        for t in input_tensors:
-            assert not t.qnn_params, "Quantized input is not expected."
-
-        for t in [indices, output_shape]:
-            t_type = t.tensor.Type()
-            assert t_type in (TensorType.INT32, TensorType.INT64)
-
-        out = _op.sparse_to_dense(
-            self.get_tensor_expr(indices),
-            list(self.get_tensor_value(output_shape)),
-            self.get_tensor_expr(values),
-            self.get_tensor_expr(default_value),
-        )
-
-        return out
-
-    def convert_prelu(self, op):
-        """Convert TFLite PReLU"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        input_tensor = input_tensors[0]
-        alpha_tensor = input_tensors[1]
-        if self.has_expr(alpha_tensor.tensor_idx):
-            alpha_expr = self.get_expr(alpha_tensor.tensor_idx)
-        else:
-            alpha_tensor_type = alpha_tensor.tensor.Type()
-            alpha_tensor_type_str = self.get_tensor_type_str(alpha_tensor_type)
-            alpha_expr = self.exp_tab.new_const(
-                self.get_tensor_value(alpha_tensor),
-                dtype=alpha_tensor_type_str,
-                source_name=alpha_tensor.tensor.Name(),
-            )
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-        data_shape = to_int_list(self.get_tensor_shape(input_tensor))
-
-        alpha_expr = _op.broadcast_to(alpha_expr, data_shape)
-        alpha_expr = _op.reshape(alpha_expr, [-1])
-        out = _op.nn.prelu(_op.reshape(in_expr, [-1]), alpha_expr, axis=0)
-        out = _op.reshape(out, data_shape)
-        return out
-
-    def convert_transpose_conv(self, op):
-        """Convert TFLite TRANSPOSE_CONV"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.Padding import Padding
-            from tflite.TensorType import TensorType
-            from tflite.TransposeConvOptions import TransposeConvOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) >= 3, "input tensors length should be >= 3"
-
-        # Input (data) Tensor. NHWC layout
-        input_tensor = input_tensors[2]
-        _, _, _, input_c = to_int_list(self.get_tensor_shape(input_tensor))
-        # Weights tensor. TFLite uses OHWI layout
-        weights_tensor = input_tensors[1]
-        out_channels, kernel_h, kernel_w, in_channels = to_int_list(
-            self.get_tensor_shape(weights_tensor)
-        )
-
-        assert (
-            input_c == in_channels
-        ), "Input channel in the filter should match to channel in the input"
-        # output_shape Tensor. NHWC layout
-        output_shape_tensor = input_tensors[0]
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-        output_tensor_type = output_tensor.tensor.Type()
-        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.TransposeConvOptions
-        op_options = op.BuiltinOptions()
-        deconv_options = TransposeConvOptions()
-        deconv_options.Init(op_options.Bytes, op_options.Pos)
-
-        padding = deconv_options.Padding()
-        stride_h = deconv_options.StrideH()
-        stride_w = deconv_options.StrideW()
-        assert padding in (
-            Padding.VALID,
-            Padding.SAME,
-        ), f"Padding format {padding} is not supported for operator TRANSPOSE_CONV"
-
-        # Data
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        # Weights
-        weights_tensor_type = weights_tensor.tensor.Type()
-        # weights tensor type should be UINT8 (quantization) or FLOAT32
-        assert weights_tensor_type in (TensorType.INT8, TensorType.UINT8, TensorType.FLOAT32)
-        weight_tensor_type_str = self.get_tensor_type_str(weights_tensor_type)
-
-        if self.has_expr(weights_tensor.tensor_idx):
-            weight_expr_iohw = self.get_expr(weights_tensor.tensor_idx)
-            weight_expr_iohw = _op.transpose(weight_expr_iohw, axes=(3, 0, 1, 2))
-        else:
-            weight_value_ohwi = self.get_tensor_value(weights_tensor)
-            # Relay kernel_layout should be OIHW
-            # Relay weights layout should be different from kernel_layout - it should be IOHW
-            weight_value_iohw = np.transpose(weight_value_ohwi, (3, 0, 1, 2))
-            weight_expr_iohw = self.exp_tab.new_const(
-                weight_value_iohw,
-                dtype=weight_tensor_type_str,
-                source_name=weights_tensor.tensor.Name(),
-            )
-
-        # Output shape value
-        output_shape_value = self.get_tensor_value(output_shape_tensor)
-        # Relay expects filter output channel to match to output tensor channel.
-        assert (
-            out_channels == output_shape_value[3]
-        ), "Output channel in the filter should match to channel in the output_shape"
-
-        if padding == Padding.SAME:
-            output_h, output_w = output_shape_value[1], output_shape_value[2]
-            pad_top, pad_bottom = get_pad_value(output_h, kernel_h, stride_h)
-            pad_left, pad_right = get_pad_value(output_w, kernel_w, stride_w)
-            padding = (pad_top, pad_left, pad_bottom, pad_right)
-        else:
-            padding = (0, 0, 0, 0)
-
-        if input_tensor.qnn_params:
-            input_zero_point = input_tensor.qnn_params["zero_point"]
-            kernel_zero_point = weights_tensor.qnn_params["zero_point"]
-            input_scale = input_tensor.qnn_params["scale"]
-            kernel_scale = weights_tensor.qnn_params["scale"]
-            out_dtype = "int64" if output_tensor_type_str == "int16" else "int32"
-            out = _qnn.op.conv2d_transpose(
-                in_expr,
-                weight_expr_iohw,
-                input_zero_point,
-                kernel_zero_point,
-                input_scale,
-                kernel_scale,
-                strides=(stride_h, stride_w),
-                padding=padding,
-                channels=int(out_channels),
-                kernel_size=(int(kernel_h), int(kernel_w)),
-                data_layout="NHWC",
-                kernel_layout="IOHW",
-                out_dtype=out_dtype,
-            )
-        else:
-            out = _op.nn.conv2d_transpose(
-                in_expr,
-                weight_expr_iohw,
-                strides=(stride_h, stride_w),
-                padding=padding,
-                channels=int(out_channels),
-                kernel_size=(int(kernel_h), int(kernel_w)),
-                data_layout="NHWC",
-                kernel_layout="IOHW",
-                out_dtype=output_tensor_type_str,
-            )
-
-        # Checking if there is a fused bias
-        if len(input_tensors) == 4:
-            bias_tensor = input_tensors[3]
-            bias_tensor_type = bias_tensor.tensor.Type()
-            # bias tensor type should be INT32 (quantization) or FLOAT32
-            assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32)
-            bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
-            if self.has_expr(bias_tensor.tensor_idx):
-                bias_expr = self.get_expr(bias_tensor.tensor_idx)
-            else:
-                bias_expr = self.exp_tab.new_const(
-                    self.get_tensor_value(bias_tensor),
-                    dtype=bias_tensor_type_str,
-                    source_name=bias_tensor.tensor.Name(),
-                )
-            channel_axis = 3
-            out = _op.nn.bias_add(out, bias_expr, axis=channel_axis)
-
-        if output_tensor.qnn_params:
-            # Calculate the intermediate scale and zero point of the int32 output.
-            data_scale = input_tensor.qnn_params["scale"]
-            data_scale_val = get_scalar_from_constant(data_scale)
-
-            weight_scale = weights_tensor.qnn_params["scale"]
-            # If weight scale is scalar, it is per-tensor quantization
-            if isinstance(weight_scale, float):
-                weight_scale_val = get_scalar_from_constant(weight_scale)
-            else:
-                weight_scale_val = get_tensor_from_constant(weight_scale)
-
-            new_input_scale_val = data_scale_val * weight_scale_val
-            new_input_scale = relay.const(new_input_scale_val, "float32")
-            new_input_zero_point = relay.const(0, "int32")
-
-            out = _qnn.op.requantize(
-                out,
-                input_scale=new_input_scale,
-                input_zero_point=new_input_zero_point,
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-                axis=3,
-            )
-        return out
-
-    def convert_quantize(self, op):
-        """Convert TFLite Quantize"""
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-        input_tensor_type_str = self.get_tensor_type_str(input_tensor.tensor.Type())
-        in_expr = self.get_tensor_expr(input_tensor)
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-        output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
-
-        # The output must be quantized
-        assert output_tensor.qnn_params
-
-        # TFLite Quantize op can also act as Requantize op
-        if input_tensor_type_str == "float32":
-            out = self.quantize(in_expr, output_tensor)
-        else:
-            out = _qnn.op.requantize(
-                in_expr,
-                input_scale=input_tensor.qnn_params["scale"],
-                input_zero_point=input_tensor.qnn_params["zero_point"],
-                output_scale=output_tensor.qnn_params["scale"],
-                output_zero_point=output_tensor.qnn_params["zero_point"],
-                out_dtype=output_tensor_type_str,
-            )
-        return out
-
-    def convert_dequantize(self, op):
-        """Convert TFLite Dequantize"""
-        try:
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-        input_tensor = input_tensors[0]
-
-        if input_tensor.tensor.Type() == TensorType.FLOAT16:
-            dtype = self.get_tensor_type_str(input_tensor.tensor.Type())
-            input_value = self.get_tensor_value(input_tensor)
-            in_expr = self.exp_tab.new_const(
-                input_value, dtype=dtype, source_name=input_tensor.tensor.Name()
-            )
-            out = relay.cast(in_expr, dtype="float32")
-            return out
-
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        # The input must be quantized
-        assert input_tensor.qnn_params
-        # Dequantize the input.
-        out = self.dequantize(in_expr, input_tensor)
-
-        return out
-
-    def convert_detection_postprocess(self, op):
-        """Convert TFLite_Detection_PostProcess"""
-        flexbuffer = op.CustomOptionsAsNumpy().tobytes()
-        custom_options = FlexBufferDecoder(flexbuffer).decode()
-
-        use_regular_nms = "use_regular_nms" in custom_options and custom_options["use_regular_nms"]
-
-        inputs = self.get_input_tensors(op)
-        assert len(inputs) == 3, "inputs length should be 3"
-        cls_pred = self.get_expr(inputs[1].tensor_idx)
-        loc_prob = self.get_expr(inputs[0].tensor_idx)
-        batch_size = inputs[1].tensor.Shape(0)
-        anchor_values = self.get_tensor_value(inputs[2])
-        anchor_boxes = len(anchor_values)
-        anchor_type = self.get_tensor_type_str(inputs[2].tensor.Type())
-        anchor_expr = self.exp_tab.new_const(
-            anchor_values, dtype=anchor_type, source_name=inputs[2].tensor.Name()
-        )
-
-        if inputs[0].qnn_params:
-            loc_prob = _qnn.op.dequantize(
-                data=loc_prob,
-                input_scale=inputs[0].qnn_params["scale"],
-                input_zero_point=inputs[0].qnn_params["zero_point"],
-            )
-        if inputs[1].qnn_params:
-            cls_pred = _qnn.op.dequantize(
-                data=cls_pred,
-                input_scale=inputs[1].qnn_params["scale"],
-                input_zero_point=inputs[1].qnn_params["zero_point"],
-            )
-        if inputs[2].qnn_params:
-            anchor_expr = _qnn.op.dequantize(
-                data=anchor_expr,
-                input_scale=inputs[2].qnn_params["scale"],
-                input_zero_point=inputs[2].qnn_params["zero_point"],
-            )
-
-        # loc_prob coords are in yxhw format
-        # need to convert to xywh
-        loc_coords = _op.split(loc_prob, 4, axis=2)
-        loc_prob = _op.concatenate(
-            [loc_coords[1], loc_coords[0], loc_coords[3], loc_coords[2]], axis=2
-        )
-        # reshape loc_prob tensor so is can be consumed by
-        # multibox_transform_loc
-        loc_prob = _op.reshape(loc_prob, [batch_size, anchor_boxes * 4])
-
-        # anchor coords are in yxhw format
-        # need to convert to ltrb
-        anchor_coords = _op.split(anchor_expr, 4, axis=1)
-        anchor_y = anchor_coords[0]
-        anchor_x = anchor_coords[1]
-        anchor_h = anchor_coords[2]
-        anchor_w = anchor_coords[3]
-        plus_half = _expr.const(0.5, dtype="float32")
-        minus_half = _expr.const(-0.5, dtype="float32")
-        anchor_l = _op.add(anchor_x, _op.multiply(anchor_w, minus_half))
-        anchor_r = _op.add(anchor_x, _op.multiply(anchor_w, plus_half))
-        anchor_t = _op.add(anchor_y, _op.multiply(anchor_h, minus_half))
-        anchor_b = _op.add(anchor_y, _op.multiply(anchor_h, plus_half))
-        anchor_expr = _op.concatenate([anchor_l, anchor_t, anchor_r, anchor_b], axis=1)
-        anchor_expr = _op.expand_dims(anchor_expr, 0)
-
-        # attributes for multibox_transform_loc
-        multibox_transform_loc_attrs = {}
-        multibox_transform_loc_attrs["clip"] = False
-        multibox_transform_loc_attrs["threshold"] = (
-            0.0 if use_regular_nms else custom_options["nms_score_threshold"]
-        )
-        multibox_transform_loc_attrs["variances"] = (
-            1 / custom_options["x_scale"],
-            1 / custom_options["y_scale"],
-            1 / custom_options["w_scale"],
-            1 / custom_options["h_scale"],
-        )
-        multibox_transform_loc_attrs["keep_background"] = use_regular_nms
-
-        ret = _op.vision.multibox_transform_loc(
-            # reshape cls_pred so it can be consumed by
-            # multibox_transform_loc
-            _op.transpose(cls_pred, [0, 2, 1]),
-            loc_prob,
-            anchor_expr,
-            **multibox_transform_loc_attrs,
-        )
-
-        if use_regular_nms:
-            # box coordinates need to be converted from ltrb to (ymin, xmin, ymax, xmax)
-            _, transformed_boxes = _op.split(ret[0], (2,), axis=2)
-            box_l, box_t, box_r, box_b = _op.split(transformed_boxes, 4, axis=2)
-            transformed_boxes = _op.concatenate([box_t, box_l, box_b, box_r], axis=2)
-
-            return _op.vision.regular_non_max_suppression(
-                boxes=transformed_boxes,
-                scores=cls_pred,
-                max_detections_per_class=custom_options["detections_per_class"],
-                max_detections=custom_options["max_detections"],
-                num_classes=custom_options["num_classes"],
-                iou_threshold=custom_options["nms_iou_threshold"],
-                score_threshold=custom_options["nms_score_threshold"],
-            )
-
-        # attributes for non_max_suppression
-        non_max_suppression_attrs = {}
-        non_max_suppression_attrs["return_indices"] = False
-        non_max_suppression_attrs["iou_threshold"] = custom_options["nms_iou_threshold"]
-        non_max_suppression_attrs["force_suppress"] = True
-        non_max_suppression_attrs["top_k"] = anchor_boxes
-        non_max_suppression_attrs["max_output_size"] = custom_options["max_detections"]
-        non_max_suppression_attrs["invalid_to_bottom"] = False
-
-        ret = _op.vision.non_max_suppression(ret[0], ret[1], ret[1], **non_max_suppression_attrs)
-        ret = _op.vision.get_valid_counts(ret, 0)
-        valid_count = ret[0]
-        # keep only the top 'max_detections' rows
-        ret = _op.strided_slice(
-            ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], 6]
-        )
-        # the output needs some reshaping to match tflite
-        ret = _op.split(ret, 6, axis=2)
-        cls_ids = _op.reshape(ret[0], [batch_size, -1])
-        scores = _op.reshape(ret[1], [batch_size, -1])
-        boxes = _op.concatenate([ret[3], ret[2], ret[5], ret[4]], axis=2)
-        ret = _expr.TupleWrapper(_expr.Tuple([boxes, cls_ids, scores, valid_count]), size=4)
-        return ret
-
-    def convert_nms_v5(self, op):
-        """Convert TFLite NonMaxSuppressionV5"""
-        # https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/non-max-suppression-v5
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 6, "input tensor length should be 6"
-        boxes = self.get_expr(input_tensors[0].tensor_idx)
-        scores = self.get_expr(input_tensors[1].tensor_idx)
-        max_output_size = self.get_tensor_value(input_tensors[2])
-        iou_threshold = self.get_tensor_value(input_tensors[3])
-        score_threshold = self.get_tensor_value(input_tensors[4])
-        soft_nms_sigma = self.get_tensor_value(input_tensors[5])
-
-        if isinstance(max_output_size, np.ndarray):
-            assert max_output_size.size == 1, "only one value is expected."
-            max_output_size = int(max_output_size)
-
-        if isinstance(iou_threshold, np.ndarray):
-            assert iou_threshold.size == 1, "only one value is expected."
-            iou_threshold = float(iou_threshold)
-
-        if isinstance(score_threshold, np.ndarray):
-            assert score_threshold.size == 1, "only one value is expected."
-            score_threshold = float(score_threshold)
-
-        if isinstance(soft_nms_sigma, np.ndarray):
-            assert soft_nms_sigma.size == 1, "only one value is expected."
-            soft_nms_sigma = float(soft_nms_sigma)
-        if soft_nms_sigma != 0.0:
-            raise tvm.error.OpNotImplemented(
-                "It is soft_nms when soft_nms_sigma != 0, which is not supported!"
-            )
-
-        scores_expand = _op.expand_dims(scores, axis=-1, num_newaxis=1)
-        data = _op.concatenate([scores_expand, boxes], -1)
-        data = _op.expand_dims(data, axis=0, num_newaxis=1)
-
-        count, data, indices = _op.vision.get_valid_counts(
-            data, score_threshold=score_threshold, id_index=-1, score_index=0
-        )
-
-        nms_ret = _op.vision.non_max_suppression(
-            data=data,
-            valid_count=count,
-            indices=indices,
-            max_output_size=max_output_size,
-            iou_threshold=iou_threshold,
-            force_suppress=True,
-            top_k=-1,
-            coord_start=1,
-            score_index=0,
-            id_index=-1,
-            return_indices=True,
-            invalid_to_bottom=False,
-        )
-
-        selected_indices = _op.squeeze(nms_ret[0], axis=[0])
-        selected_indices = _op.strided_slice(selected_indices, [0], [max_output_size])
-        valide_num = _op.squeeze(nms_ret[1], axis=[1])
-        selected_scores = _op.take(scores, selected_indices, axis=0)
-        out = _expr.TupleWrapper(_expr.Tuple([selected_indices, selected_scores, valide_num]), 3)
-        return out
-
-    def convert_expand_dims(self, op):
-        """Convert TFLite EXPAND_DIMS"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensors length should be 2"
-
-        if input_tensors[0].qnn_params:
-            # Check that input and output tensor have same qnn params.
-            output_tensors = self.get_output_tensors(op)
-            assert self.has_same_qnn_params(
-                input_tensors[0], output_tensors[0]
-            ), "TFLite EXPAND_DIMS requires input and output tensors' \
-                    scale and zero points to be equal"
-
-        input_expr = self.get_tensor_expr(input_tensors[0])
-        axis = self.get_tensor_value(input_tensors[1])
-        if isinstance(axis, np.ndarray):
-            assert axis.size == 1, "only one value is expected."
-            axis = int(axis)
-
-        ndims = len(input_tensors[0].tensor.ShapeAsNumpy())
-        assert -1 - ndims <= axis <= ndims, "axis out of range"
-
-        out = _op.expand_dims(input_expr, axis, 1)
-
-        return out
-
-    def convert_one_hot(self, op):
-        """Convert TFLite ONE_HOT"""
-        try:
-            from tflite.BuiltinOptions import BuiltinOptions
-            from tflite.OneHotOptions import OneHotOptions
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 4, "Input tensor's length should be 4"
-
-        # Ensuring input isn't quantized
-        assert all(not i.qnn_params for i in input_tensors), "Quantized input is not expected."
-
-        # TFlite ONE_HOT requires both on_value
-        # and off_value, making dtype redundant.
-        indices = input_tensors[0]
-        depth = input_tensors[1]
-        on_value = input_tensors[2]
-        off_value = input_tensors[3]
-
-        assert (
-            on_value.tensor.Type() == off_value.tensor.Type()
-        ), "on_value and off_value should be the same type"
-
-        # Getting relay expr
-        indices_expr = self.get_expr(indices.tensor_idx)
-        on_value_expr = self.get_expr(on_value.tensor_idx)
-        off_value_expr = self.get_expr(off_value.tensor_idx)
-
-        # Getting depth value
-        depth = self.get_tensor_value(depth)
-        if isinstance(depth, np.ndarray):
-            depth = int(depth)
-
-        # Getting Axis from Option (Attributes)
-        assert op.BuiltinOptionsType() == BuiltinOptions.OneHotOptions
-        op_options = op.BuiltinOptions()
-        one_hot_options = OneHotOptions()
-        one_hot_options.Init(op_options.Bytes, op_options.Pos)
-        axis = one_hot_options.Axis()
-
-        # Setting dtype
-        dtype = self.get_tensor_type_str(on_value.tensor.Type())
-
-        out = _op.one_hot(indices_expr, on_value_expr, off_value_expr, depth, axis, dtype)
-
-        return out
-
-    def convert_reverse_v2(self, op):
-        """Convert TFLite REVERSE_V2"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensor's length should be 2"
-
-        input_expr = self.get_expr(input_tensors[0].tensor_idx)
-
-        # Getting axis value
-        axis = self.get_tensor_value(input_tensors[1])
-        if isinstance(axis, np.ndarray):
-            assert len(axis) == 1, "TFLite does not support multi-axis yet"
-            axis = int(axis)
-
-        out = _op.reverse(input_expr, axis)
-        return out
-
-    def convert_matrix_set_diag(self, op):
-        """Convert TFLite MATRIX_SET_DIAG"""
-
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 2, "input tensor's length should be 2"
-
-        assert (
-            input_tensors[0].tensor.Type() == input_tensors[1].tensor.Type()
-        ), "input and diagonal should be the same type of tensors"
-
-        if input_tensors[0].qnn_params:
-            # Check that input and output tensor have same qnn params.
-            output_tensors = self.get_output_tensors(op)
-            assert self.has_same_qnn_params(
-                input_tensors[0], output_tensors[0]
-            ), "TFLite MATRIX_SET_DIAG requires input and output tensors' \
-                    scale and zero points to be equal"
-
-            # Check that input and diagonal tensor have same qnn params.
-            assert self.has_same_qnn_params(
-                input_tensors[0], input_tensors[1]
-            ), "TFLite MATRIX_SET_DIAG requires input and diagonal tensors' \
-                    scale and zero points to be equal"
-
-        input_expr = self.get_tensor_expr(input_tensors[0])
-        diagonal_expr = self.get_tensor_expr(input_tensors[1])
-
-        out = _op.matrix_set_diag(input_expr, diagonal_expr)
-        return out
-
-    def convert_matrix_diag(self, op):
-        """Convert TFLite MATRIX_DIAG"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensor's length should be 1"
-
-        diagonal = input_tensors[0]
-
-        if diagonal.qnn_params:
-            # Check that diagonal and output tensor have same qnn params.
-            output_tensors = self.get_output_tensors(op)
-            assert self.has_same_qnn_params(
-                diagonal, output_tensors[0]
-            ), "TFLite MATRIX_DIAG requires diagonal and output tensors' \
-                    scale and zero points to be equal"
-
-        shape = to_int_list(self.get_tensor_shape(diagonal))
-        shape = np.append(shape, shape[-1])
-        dtype = self.get_tensor_type_str(diagonal.tensor.Type())
-
-        input_expr = _op.zeros(tuple(shape), dtype)
-        diagonal_expr = self.get_tensor_expr(diagonal)
-
-        out = _op.matrix_set_diag(input_expr, diagonal_expr)
-        return out
-
-    def convert_densify(self, op):
-        """Convert TFLite DENSIFY"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors length should be 1"
-        output_tensor = output_tensors[0]
-
-        sparse_weight_tensor = input_tensors[0]
-        sparse_weight_tensor_type_str = self.get_tensor_type_str(sparse_weight_tensor.tensor.Type())
-
-        # NOTE: With current implementation in TFLite, Densify Op does not need to be present
-        # in runtime.
-        # TODO(ANSHUMAN87): we need to use the sparse_indices output
-        # from below function and use that in sparse_to_dense Op.
-        # Once the stack corruption issue is resolved in sparse_to_dense Op.
-        _, dense_weight = prepare_dense_matrix_from_sparse(
-            sparse_weight_tensor.tensor,
-            self.get_tensor_value(sparse_weight_tensor, is_sparse=True),
-            sparse_weight_tensor_type_str,
-        )
-
-        self.set_prefetched_node(output_tensor.tensor_idx, dense_weight)
-
-    def convert_fake_quant(self, op):
-        """Convert TFLite FAKE_QUANT"""
-        input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 1, "input tensors length should be 1"
-
-        input_tensor = input_tensors[0]
-        in_expr = self.get_expr(input_tensor.tensor_idx)
-
-        from tflite.BuiltinOptions import BuiltinOptions
-        from tflite.FakeQuantOptions import FakeQuantOptions
-
-        assert op.BuiltinOptionsType() == BuiltinOptions.FakeQuantOptions
-
-        op_options = op.BuiltinOptions()
-        fake_quant_options = FakeQuantOptions()
-        fake_quant_options.Init(op_options.Bytes, op_options.Pos)
-
-        opt_min = fake_quant_options.Min()
-        opt_max = fake_quant_options.Max()
-        narrow_range = fake_quant_options.NarrowRange()
-        num_bits = fake_quant_options.NumBits()
-
-        assert 2 <= num_bits <= 16
-
-        quant_min = 1 if narrow_range else 0
-        quant_max = (1 << num_bits) - 1
-        scale = (opt_max - opt_min) / (quant_max - quant_min)
-
-        zero_point_from_min = quant_min - opt_min / scale
-        if zero_point_from_min <= quant_min:
-            nudged_zero_point = quant_min
-        elif zero_point_from_min >= quant_max:
-            nudged_zero_point = quant_max
-        else:
-            nudged_zero_point = round(zero_point_from_min)
-
-        nudged_min = (quant_min - nudged_zero_point) * scale
-        nudged_max = (quant_max - nudged_zero_point) * scale
-
-        nudged_min_expr = _op.const(nudged_min)
-        clamped = _op.clip(in_expr, nudged_min, nudged_max)
-        clamped_shifted = _op.subtract(clamped, nudged_min_expr)
-
-        half = _op.const(0.5)
-        one = _op.const(1.0)
-        scale_expr = _op.const(scale)
-        inv_scale = _op.divide(one, scale_expr)
-        rounded = _op.floor(_op.add(_op.multiply(clamped_shifted, inv_scale), half))
-        return _op.add(_op.multiply(rounded, scale_expr), nudged_min_expr)
-
-    def get_expr(self, input_tensor_idx):
-        return self.exp_tab.get_expr(get_tensor_name(self.subgraph, input_tensor_idx))
-
-    def has_expr(self, input_tensor_idx):
-        return self.exp_tab.has_expr(get_tensor_name(self.subgraph, input_tensor_idx))
-
-    def is_prefetched(self, input_tensor_idx):
-        return (
-            self.prefetched_nodes.get(get_tensor_name(self.subgraph, input_tensor_idx)) is not None
-        )
-
-    def set_prefetched_node(self, input_tensor_idx, value):
-        self.prefetched_nodes[get_tensor_name(self.subgraph, input_tensor_idx)] = value
-
-    def get_prefetched_node(self, input_tensor_idx):
-        return self.prefetched_nodes[get_tensor_name(self.subgraph, input_tensor_idx)]
-
-    def get_tensor_expr(self, tensor, is_sparse=False):
-        """Return the Relay expr for tensor."""
-        if self.has_expr(tensor.tensor_idx):
-            expr = self.get_expr(tensor.tensor_idx)
-        else:
-            type_str = self.get_tensor_type_str(tensor.tensor.Type())
-            expr = self.exp_tab.new_const(
-                self.get_tensor_value(tensor, is_sparse),
-                dtype=type_str,
-                source_name=tensor.tensor.Name(),
-            )
-        return expr
-
-    def get_tensor_shape(self, tensor_wrapper):
-        """Returns tensor shape. Infers shape if the shape is empty."""
-        assert isinstance(tensor_wrapper, TensorWrapper), "Expecting TensorWrapper here"
-        return (
-            tensor_wrapper.tensor.ShapeAsNumpy()
-            if tensor_wrapper.tensor.ShapeLength() > 0
-            else _infer_shape(self.get_tensor_expr(tensor_wrapper))
-        )
-
-
-# pylint: disable=no-else-return
-def prepare_dense_matrix_from_sparse(sparse_tensor, sparse_tensor_value, sparse_tensor_type):
-    """Prepare sparse indices and dense matrix from TFLite sparse parameters."""
-    # The function is implemented based on TFLite sparse parameter specifications
-    # Please refer
-    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs#L89
-    # for details about each parameters
-    sparsity = sparse_tensor.Sparsity()
-    dense_shape = sparse_tensor.ShapeAsNumpy()
-    orig_rank = len(dense_shape)
-
-    # The traversal order of the dimensions defined in the `shape` field of the to be dense tensor.
-    traversal_order = sparsity.TraversalOrderAsNumpy()
-
-    # For an n-dimensional tensor with a k-dimensional block (0 <= k <= n),
-    # stores how a block dimension in (dn, ..., dn+k-1) maps to the original
-    # tensor dimension in (d0, ..., dn). It's stored in the order of (dn, ..., dn+k-1).
-    # If not block-sparse, this field is NULL.
-    block_map = sparsity.BlockMapAsNumpy()
-
-    total_rank = sparsity.TraversalOrderLength()
-    dense_mat = np.full(shape=dense_shape, fill_value=0, dtype=sparse_tensor_type).flatten()
-
-    from enum import Enum
-
-    # NOTE: Here the Vector term is borrowed from TFLite spec.
-    class VectorType(Enum):
-        Empty = 0
-        Int32 = 1
-        Uint16 = 2
-        Uint8 = 3
-
-    def _get_vector_flag(v_type):
-        if VectorType(v_type) == VectorType.Int32:
-            return N.Int32Flags
-        elif VectorType(v_type) == VectorType.Uint16:
-            return N.Uint16Flags
-        elif VectorType(v_type) == VectorType.Uint8:
-            return N.Uint8Flags
-        else:
-            raise tvm.error.OpNotImplemented(f"The provided type {v_type} is not supported")
-
-    def _get_flattened_index(indices, shape):
-        index = 0
-        sub_elements = 1
-        for i in reversed(range(0, len(dense_shape))):
-            index += indices[i] * sub_elements
-            sub_elements *= shape[i]
-        return index
-
-    # DimensionMetadata per dimension: the metadata needed for
-    #     each dimension to locate the non-zero values in the original dense tensor
-    #     inline with traversal order parameter.
-    #
-    # sp_format has 2 possible values: {DENSE = 0, SPARSE_CSR = 1}
-    # If format = DENSE{0} : DenseSize represents size of that dimension
-    # If format = SPARSE_CSR{1} : array_segments represents how to segment the indices array,
-    #      each segment corresponds to one element in the previous dimension. array_indices
-    #      represents the index of the non-zero elements within this dimension
-    #      (as those in the CSR matrix format, where the first array is row pointers
-    #       and the second array is column indices).
-    sp_format = np.zeros(sparsity.DimMetadataLength())
-    dim_metadata = [None] * (2 * sparsity.DimMetadataLength())
-
-    # Below loop will fetch all meta data per dimension based on format type
-    # Dense or Sparse and will put it in an agnostic array for easy access
-    # while preparing dense buffer or indices.
-    for i in range(sparsity.DimMetadataLength()):
-        sp_format[i] = sparsity.DimMetadata(i).Format()
-        if sp_format[i] == 0:
-            dim_metadata[2 * i] = [sparsity.DimMetadata(i).DenseSize()]
-        else:
-            from flatbuffers import number_types as N
-
-            dim_metadata[2 * i] = (
-                sparsity.DimMetadata(i)
-                .ArraySegments()
-                .GetVectorAsNumpy(
-                    flags=_get_vector_flag(sparsity.DimMetadata(i).ArraySegmentsType()), off=4
-                )
-            )
-            dim_metadata[2 * i + 1] = (
-                sparsity.DimMetadata(i)
-                .ArrayIndices()
-                .GetVectorAsNumpy(
-                    flags=_get_vector_flag(sparsity.DimMetadata(i).ArrayIndicesType()), off=4
-                )
-            )
-
-    block_dim = 0
-    block_size = np.zeros(sparsity.BlockMapLength())
-
-    # Block size parameter if encoded in BSR format
-    for i in range(orig_rank):
-        if block_dim < sparsity.BlockMapLength() and block_map[block_dim] == i:
-            orig_dim = traversal_order[orig_rank + block_dim]
-            block_size[block_dim] = sparsity.DimMetadata(orig_dim).DenseSize()
-            block_dim += 1
-
-    indices_list = []
-
-    # Below function iterates through each applicable indices per dimension
-    # based on format type specified and finally produce the dense matrix and the NZ indices.
-    def _def_prepare_dense_matrix_from_sparse(indices, level, prev_idx):
-        if level == len(indices):
-            start_pos = 0
-            orig_idx = np.zeros(orig_rank, dtype="int32")
-            while start_pos < orig_rank:
-                orig_idx[traversal_order[start_pos]] = indices[start_pos]
-                start_pos += 1
-            while start_pos < len(indices):
-                block_idx = traversal_order[start_pos] - orig_rank
-                orig_dim = block_map[block_idx]
-                orig_idx[orig_dim] = orig_idx[orig_dim] * block_size[block_idx] + indices[start_pos]
-                start_pos += 1
-            indices_list.append(orig_idx)
-            nonlocal value_idx
-            dense_mat[_get_flattened_index(orig_idx, dense_shape)] = sparse_tensor_value[value_idx]
-            value_idx += 1
-        else:
-            metadata_idx = 2 * level
-            if sp_format[level] == 0:
-                shape_of_level = dim_metadata[metadata_idx][0]
-                for idx in range(shape_of_level):
-                    indices[level] = idx
-                    _def_prepare_dense_matrix_from_sparse(
-                        indices, level + 1, prev_idx * shape_of_level + idx
-                    )
-            else:
-                array_segments = dim_metadata[metadata_idx]
-                array_indices = dim_metadata[metadata_idx + 1]
-                for idx in range(array_segments[prev_idx], array_segments[prev_idx + 1]):
-                    indices[level] = array_indices[idx]
-                    _def_prepare_dense_matrix_from_sparse(indices, level + 1, idx)
-
-    indices = np.zeros(total_rank)
-    value_idx = 0
-    _def_prepare_dense_matrix_from_sparse(indices, 0, 0)
-    return np.array(indices_list, dtype="int32"), dense_mat.reshape(dense_shape)
-
-
-def get_scalar_from_constant(expr):
-    """Returns scalar value from Relay constant scalar."""
-    assert (
-        isinstance(expr, _expr.Constant) and not expr.data.shape
-    ), "Expr is not a constant scalar."
-    value = expr.data.numpy()
-    assert value.dtype == np.dtype(np.int32) or value.dtype == np.dtype(
-        np.float32
-    ), "value must be float32/int32"
-    return value.item(0)
-
-
-def get_tensor_from_constant(expr):
-    """Returns tensor of values from Relay constant node."""
-    assert isinstance(expr, _expr.Constant)
-    value = expr.data.numpy()
-    assert value.dtype == np.dtype(np.int32) or value.dtype == np.dtype(
-        np.float32
-    ), "value must be float32/int32"
-    return value
-
-
-def build_str_map(obj):
-    """Build string map of TFLite enum int value
-
-    Parameters
-    ----------
-    obj:
-        TFLite class which contains enum int value, such as BuiltInOptions
-
-    Returns
-    -------
-        String representation map of TFLite class enum int value
-    """
-    ret = {}
-    for field_name in dir(obj):
-        if not field_name.startswith("_"):
-            field_value = getattr(obj, field_name)
-            if isinstance(field_value, int):
-                ret[field_value] = field_name
-    return ret
-
-
-# SAME padding: https://www.tensorflow.org/api_guides/python/nn
-def get_pad_value(data, kernel, stride):
-    """Get the pad tuple of value for SAME padding
-
-    Parameters
-    ----------
-    data:
-        1D input data
-
-    kernel:
-        1D input kernel
-
-    stride:
-        1D input stride
-
-    Returns
-    -------
-        pad tuple of value
-    """
-
-    out = int(math.ceil(float(data) / float(stride)))
-    pad = max(0, (out - 1) * stride + kernel - data)
-    pad_before = pad // 2
-    pad_after = pad - pad_before
-    return pad_before, pad_after
-
-
-def get_tensor_name(subgraph, tensor_idx):
-    """Get the tensor name.
-
-    Parameters
-    ----------
-    subgraph:
-        tflite.Subgraph.Subgraph
-
-    tensor:
-        tensor index in subgraph
-
-    Returns
-    -------
-        tensor name in UTF-8 encoding
-    """
-    tensor_name = subgraph.Tensors(tensor_idx).Name()
-    if tensor_name is not None:
-        tensor_name = tensor_name.decode("utf-8")
-    else:
-        tensor_name = "tvmgen_tensor_" + str(tensor_idx)
-    return tensor_name
-
-
-def _decode_type(n):
-    _tflite_m = {
-        0: "float32",
-        1: "float16",
-        2: "int32",
-        3: "uint8",
-        4: "int64",
-        5: "string",
-        6: "bool",
-        7: "int16",
-        8: "complex64",
-        9: "int8",
-    }
-    return _tflite_m[n]
-
-
-def _input_type(model):
-    subgraph_count = model.SubgraphsLength()
-    assert subgraph_count > 0
-    shape_dict = {}
-    dtype_dict = {}
-    for subgraph_index in range(subgraph_count):
-        subgraph = model.Subgraphs(subgraph_index)
-        inputs_count = subgraph.InputsLength()
-        assert inputs_count >= 1
-        for input_index in range(inputs_count):
-            input_ = subgraph.Inputs(input_index)
-            assert subgraph.TensorsLength() > input_
-            tensor = subgraph.Tensors(input_)
-            input_shape = tuple(tensor.ShapeAsNumpy())
-            tensor_type = tensor.Type()
-            input_name = get_tensor_name(subgraph, input_)
-            shape_dict[input_name] = input_shape
-            dtype_dict[input_name] = _decode_type(tensor_type)
-
-    return shape_dict, dtype_dict
-
-
-def from_tflite(model, shape_dict=None, dtype_dict=None, op_converter=OperatorConverter):
-    """Convert from tflite model into compatible relay Function.
-
-    Parameters
-    ----------
-    model:
-        tflite.Model or tflite.Model.Model (depending on tflite version)
-
-    shape_dict : dict of str to int list/tuple
-        Input shapes of the model.
-
-    dtype_dict : dict of str to str
-        Input types of the model.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module for compilation.
-
-    params : dict of str to tvm.nd.NDArray
-        The parameter dict to be used by relay
-    """
-    try:
-        import tflite.BuiltinOperator
-        import tflite.SubGraph
-    except ImportError:
-        raise ImportError("The tflite package must be installed")
-
-    # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1
-    try:
-        import tflite
-
-        assert isinstance(model, tflite.Model)
-    except TypeError:
-        import tflite.Model
-
-        assert isinstance(model, tflite.Model.Model)
-
-    _shape_dict, _dtype_dict = _input_type(model)
-    if shape_dict is not None:
-        _shape_dict.update(shape_dict)
-    if dtype_dict is not None:
-        _dtype_dict.update(dtype_dict)
-
-    # keep the same as tflite
-    assert model.SubgraphsLength() == 1, "only support one subgraph (main subgraph)"
-    subgraph = model.Subgraphs(0)
-
-    # model inputs / outputs
-    model_inputs = subgraph.InputsAsNumpy()
-    model_outputs = subgraph.OutputsAsNumpy()
-
-    exp_tab = ExprTable()
-    for model_input in model_inputs:
-        model_input_name = get_tensor_name(subgraph, model_input)
-        shape = _shape_dict[model_input_name] if model_input_name in _shape_dict else None
-        dtype = _dtype_dict[model_input_name] if model_input_name in _dtype_dict else "float32"
-        input_var = set_span(
-            _expr.var(model_input_name, shape=shape, dtype=dtype), model_input_name
-        )
-        exp_tab.set_expr(model_input_name, input_var)
-
-    # op code in model
-    op_converter = op_converter(model, subgraph, exp_tab)
-    op_converter.check_unsupported_ops()
-    op_converter.convert_op_to_relay()
-
-    # params and outputs
-    params = {k: _nd.array(np.array(v)) for k, v in exp_tab.params.items()}
-    outputs = [exp_tab.get_expr(get_tensor_name(subgraph, i)) for i in model_outputs]
-    outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
-    attrs = tvm.ir.make_node(
-        "DictAttrs",
-        **{
-            "output_tensor_names": [
-                sanitize_name(get_tensor_name(subgraph, model_output))
-                for model_output in model_outputs
-            ]
-        },
-    )
-    func = _function.Function(analysis.free_vars(outputs), outputs, attrs=attrs)
-    mod = IRModule.from_expr(func)
-    return mod, params
diff --git a/python/tvm/relay/frontend/tflite_flexbuffer.py b/python/tvm/relay/frontend/tflite_flexbuffer.py
deleted file mode 100644
index 7a2b549addaf..000000000000
--- a/python/tvm/relay/frontend/tflite_flexbuffer.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, too-many-lines, import-outside-toplevel
-# pylint: disable=broad-exception-raised, use-list-literal
-"""Tensorflow lite frontend helper to parse custom options in Flexbuffer format."""
-
-import struct
-from enum import IntEnum
-
-
-class BitWidth(IntEnum):
-    """Flexbuffer bit width schema from flexbuffers.h"""
-
-    BIT_WIDTH_8 = 0
-    BIT_WIDTH_16 = 1
-    BIT_WIDTH_32 = 2
-    BIT_WIDTH_64 = 3
-
-
-class FlexBufferType(IntEnum):
-    """Flexbuffer type schema from flexbuffers.h"""
-
-    FBT_NULL = 0
-    FBT_INT = 1
-    FBT_UINT = 2
-    FBT_FLOAT = 3
-    # Types above stored inline, types below store an offset.
-    FBT_KEY = 4
-    FBT_STRING = 5
-    FBT_INDIRECT_INT = 6
-    FBT_INDIRECT_UINT = 7
-    FBT_INDIRECT_FLOAT = 8
-    FBT_MAP = 9
-    FBT_VECTOR = 10  # Untyped.
-    FBT_VECTOR_INT = 11  # Typed any size (stores no type table).
-    FBT_VECTOR_UINT = 12
-    FBT_VECTOR_FLOAT = 13
-    FBT_VECTOR_KEY = 14
-    FBT_VECTOR_STRING = 15
-    FBT_VECTOR_INT2 = 16  # Typed tuple (no type table, no size field).
-    FBT_VECTOR_UINT2 = 17
-    FBT_VECTOR_FLOAT2 = 18
-    FBT_VECTOR_INT3 = 19  # Typed triple (no type table, no size field).
-    FBT_VECTOR_UINT3 = 20
-    FBT_VECTOR_FLOAT3 = 21
-    FBT_VECTOR_INT4 = 22  # Typed quad (no type table, no size field).
-    FBT_VECTOR_UINT4 = 23
-    FBT_VECTOR_FLOAT4 = 24
-    FBT_BLOB = 25
-    FBT_BOOL = 26
-    FBT_VECTOR_BOOL = 36  # To Allow the same type of conversion of type to vector type
-
-
-class FlexBufferDecoder(object):
-    """
-    This implements partial flexbuffer deserialization to be able
-    to read custom options. It is not intended to be a general
-    purpose flexbuffer deserializer and as such only supports a
-    limited number of types and assumes the data is a flat map.
-    """
-
-    def __init__(self, buffer):
-        self.buffer = buffer
-
-    def indirect_jump(self, offset, byte_width):
-        """Helper function to read the offset value and jump"""
-        unpack_str = ""
-        if byte_width == 1:
-            unpack_str = "<B"
-        elif byte_width == 4:
-            unpack_str = "<i"
-        assert unpack_str != ""
-        back_jump = struct.unpack(unpack_str, self.buffer[offset : offset + byte_width])[0]
-        return offset - back_jump
-
-    def decode_keys(self, end, size, byte_width):
-        """Decodes the flexbuffer type vector. Map keys are stored in this form"""
-        # Keys are strings here. The format is all strings separated by null, followed by back
-        # offsets for each of the string. For example, (str1)\0(str1)\0(offset1)(offset2) The end
-        # pointer is pointing at the end of all strings
-        keys = list()
-        for i in range(0, size):
-            offset_pos = end + i * byte_width
-            start_index = self.indirect_jump(offset_pos, byte_width)
-            str_size = self.buffer[start_index:].find(b"\0")
-            assert str_size != -1
-            s = self.buffer[start_index : start_index + str_size].decode("utf-8")
-            keys.append(s)
-        return keys
-
-    def decode_vector(self, end, size, byte_width):
-        """Decodes the flexbuffer vector"""
-        # Each entry in the vector can have different datatype. Each entry is of fixed length. The
-        # format is a sequence of all values followed by a sequence of datatype of all values. For
-        # example - (4)(3.56)(int)(float) The end here points to the start of the values.
-        values = list()
-        for i in range(0, size):
-            value_type_pos = end + size * byte_width + i
-            value_type = FlexBufferType(self.buffer[value_type_pos] >> 2)
-            value_bytes = self.buffer[end + i * byte_width : end + (i + 1) * byte_width]
-            if value_type == FlexBufferType.FBT_BOOL:
-                value = bool(value_bytes[0])
-            elif value_type == FlexBufferType.FBT_INT:
-                value = struct.unpack("<i", value_bytes)[0]
-            elif value_type == FlexBufferType.FBT_UINT:
-                value = struct.unpack("<I", value_bytes)[0]
-            elif value_type == FlexBufferType.FBT_FLOAT:
-                value = struct.unpack("<f", value_bytes)[0]
-            else:
-                raise Exception
-            values.append(value)
-        return values
-
-    def decode_map(self, end, byte_width, parent_byte_width):
-        """Decodes the flexbuffer map and returns a dict"""
-        mid_loc = self.indirect_jump(end, parent_byte_width)
-        map_size = struct.unpack("<i", self.buffer[mid_loc - byte_width : mid_loc])[0]
-
-        # Find keys
-        keys_offset = mid_loc - byte_width * 3
-        keys_end = self.indirect_jump(keys_offset, byte_width)
-        keys = self.decode_keys(keys_end, map_size, 1)
-
-        # Find values
-        values_end = self.indirect_jump(end, parent_byte_width)
-        values = self.decode_vector(values_end, map_size, byte_width)
-        return dict(zip(keys, values))
-
-    def decode(self):
-        """Decode the buffer. Decoding is partially implemented"""
-        root_end = len(self.buffer) - 1
-        root_byte_width = self.buffer[root_end]
-        root_end -= 1
-        root_packed_type = self.buffer[root_end]
-        root_end -= root_byte_width
-
-        root_type = FlexBufferType(root_packed_type >> 2)
-        byte_width = 1 << BitWidth(root_packed_type & 3)
-
-        if root_type == FlexBufferType.FBT_MAP:
-            return self.decode_map(root_end, byte_width, root_byte_width)
-        raise NotImplementedError("Flexbuffer Decoding is partially imlpemented.")
diff --git a/python/tvm/relay/function.py b/python/tvm/relay/function.py
deleted file mode 100644
index f1eada9159e1..000000000000
--- a/python/tvm/relay/function.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, invalid-name, unused-import
-"""The expression nodes of Relay."""
-from __future__ import absolute_import
-
-import tvm._ffi
-from tvm.ir import BaseFunc
-from tvm.runtime import convert
-
-from . import _ffi_api
-from .base import astext, pretty_print
-from .expr import Call
-
-
-@tvm._ffi.register_object("relay.Function")
-class Function(BaseFunc):
-    """A function declaration expression.
-
-    Parameters
-    ----------
-    params: List[tvm.relay.Var]
-        List of input parameters to the function.
-
-    body: tvm.relay.Expr
-        The body of the function.
-
-    ret_type: Optional[tvm.relay.Type]
-        The return type annotation of the function.
-
-    type_params: Optional[List[tvm.relay.TypeParam]]
-        The additional type parameters, this is only
-        used in advanced usecase of template functions.
-
-    span: Optional[tvm.relay.Span]
-        Span that points to original source code.
-    """
-
-    def __init__(self, params, body, ret_type=None, type_params=None, attrs=None, span=None):
-        if type_params is None:
-            type_params = convert([])
-
-        if attrs is None:
-            attrs = tvm.ir.make_node("DictAttrs")
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.Function, params, body, ret_type, type_params, attrs, span
-        )
-
-    def __call__(self, *args):
-        """Invoke the global function.
-
-        Parameters
-        ----------
-        args: List[relay.Expr]
-            Arguments.
-        """
-        return Call(self, args, None, None)
-
-    def __str__(self):
-        return pretty_print(self)
-
-    def astext(self, show_meta_data=True, annotate=None):
-        """Get the text format of the expression.
-
-        Parameters
-        ----------
-        show_meta_data : bool
-            Whether to include meta data section in the text
-            if there is meta data.
-
-        annotate: Optional[Object->str]
-            Optionally annotate function to provide additional
-            information in the comment block.
-
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
-        Notes
-        -----
-        The meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big (e.g constant weights),
-        so it can be helpful to skip printing the meta data section.
-        """
-        return astext(self, show_meta_data, annotate)
-
-
-def FunctionWithFields(
-    function,
-    params=None,
-    body=None,
-    ret_type=None,
-    ty_params=None,
-    attrs=None,
-    virtual_device=None,
-    span=None,
-):
-    """
-    Returns function with the given properties. A None property denotes 'no change'.
-    Returns function if all properties are unchanged. Otherwise, returns a copy with the new
-    fields.
-    """
-    return _ffi_api.FunctionWithFields(
-        function, params, body, ret_type, ty_params, attrs, virtual_device, span
-    )
diff --git a/python/tvm/relay/loops.py b/python/tvm/relay/loops.py
deleted file mode 100644
index 61183fd5312c..000000000000
--- a/python/tvm/relay/loops.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""
-Utilities for building Relay loops.
-"""
-from .scope_builder import ScopeBuilder
-from . import expr as _expr
-from . import function as _function
-
-
-def while_loop(cond, loop_vars, loop_bodies):
-    """
-    Construct a while loop.
-
-    Parameters
-    ----------
-
-    cond: Callable[Tuple[relay.Expr], relay.Expr]
-        The condition of the loop.
-
-    loop_vars:  Tuple[relay.Expr]
-        The variables being looped over.
-        The initial values of the loop, will be used to
-        construct the loop variables.
-
-    loop_bodies: Callable[Tuple[relay.Expr], Tuple[relay.Expr]]
-        The body of the loop, should be a function which
-        given loop variables produces the output result
-        also as a tuple
-
-    Returns
-    -------
-    loop: relay.Expr
-        The loop expression.
-    """
-    sb = ScopeBuilder()
-    loop = _expr.Var("while_loop")
-    fresh_vars = []
-
-    for i, loop_var in enumerate(loop_vars):
-        name = loop_var.name_hint if isinstance(loop_var, _expr.Var) else f"arg{i}"
-        new_var = _expr.var(name, type_annotation=sb.type_of(loop_var), span=loop_var.span)
-        fresh_vars.append(new_var)
-
-    with sb.if_scope(cond(*fresh_vars)):
-        sb.ret(loop(*loop_bodies(*fresh_vars)))
-    with sb.else_scope():
-        sb.ret(_expr.Tuple(fresh_vars))
-
-    func = _function.Function(fresh_vars, sb.get())
-    let = _expr.Let(loop, func, loop)
-    return let
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
deleted file mode 100644
index 9a996838c46e..000000000000
--- a/python/tvm/relay/op/__init__.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin
-"""Relay core operators."""
-# operator defs
-from .op import (
-    get,
-    register_compute,
-    register_gradient,
-    register_pattern,
-    register_alter_op_layout,
-    register_legalize,
-    OpPattern,
-    OpStrategy,
-    debug,
-    register_external_compiler,
-    register_fake_quantization_to_integer,
-    register_optional_fake_quantization_to_integer,
-    register_mixed_precision_conversion,
-)
-from . import strategy
-
-# Operators
-from .reduce import *
-from .tensor import *
-from .transform import *
-from .algorithm import *
-from . import vm
-from . import nn
-from . import annotation
-from . import memory
-from . import image
-from . import vision
-from . import op_attrs
-from . import random
-
-
-# operator registry
-from . import _tensor
-from . import _tensor_grad
-from . import _transform
-from . import _reduce
-from . import _algorithm
-from . import _math
-
-
-def _register_op_make():
-    # pylint: disable=import-outside-toplevel
-    from . import _make
-    from .. import expr
-
-    expr._op_make = _make
-
-
-_register_op_make()
diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py
deleted file mode 100644
index dd1a65288955..000000000000
--- a/python/tvm/relay/op/_algorithm.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"Definition of classic algorithms"
-# pylint: disable=invalid-name,unused-argument
-from __future__ import absolute_import
-
-from tvm.te.hybrid import script
-from tvm.runtime import convert
-
-from . import strategy
-from . import op as _reg
-from .op import OpPattern, register_pattern
-from .op import register_strategy, register_shape_func
-from ._tensor import elemwise_shape_func
-
-# sort
-register_strategy("sort", strategy.sort_strategy)
-register_pattern("sort", OpPattern.OPAQUE)
-register_shape_func("sort", False, elemwise_shape_func)
-
-# argsort
-register_strategy("argsort", strategy.argsort_strategy)
-register_pattern("argsort", OpPattern.OPAQUE)
-register_shape_func("argsort", False, elemwise_shape_func)
-
-# topk
-register_strategy("topk", strategy.topk_strategy)
-register_pattern("topk", OpPattern.OPAQUE)
-
-# searchsorted
-register_strategy("searchsorted", strategy.searchsorted_strategy)
-register_pattern("searchsorted", OpPattern.OPAQUE)
-
-
-@script
-def _topk_shape_func_input_shape(data_shape, k, axis):
-    ndim = data_shape.shape[0]
-    val_out = output_tensor((ndim,), "int64")
-    indices_out = output_tensor((ndim,), "int64")
-
-    for i in const_range(ndim):
-        if i != axis:
-            val_out[i] = int64(data_shape[i])
-            indices_out[i] = int64(data_shape[i])
-        else:
-            if k < 1:
-                val_out[i] = int64(data_shape[i])
-                indices_out[i] = int64(data_shape[i])
-            else:
-                val_out[i] = int64(k)
-                indices_out[i] = int64(k)
-    return val_out, indices_out
-
-
-@_reg.register_shape_func("topk", False)
-def topk_shape_func(attrs, inputs, _):
-    """
-    Shape func for topk.
-    """
-    axis = attrs.axis
-    if axis < 0:
-        axis += inputs[0].shape[0]
-    val_out, indices_out = _topk_shape_func_input_shape(inputs[0], attrs.k, convert(axis))
-    ret_type = attrs.ret_type
-    if ret_type == "both":
-        ret = [val_out, indices_out]
-    elif ret_type == "values":
-        ret = [val_out]
-    else:
-        ret = [indices_out]
-
-    return ret
-
-
-@script
-def _searchsorted_shape(sorted_sequence_shape, values_shape):
-    out_shape = output_tensor((values_shape.shape[0],), "int64")
-    if sorted_sequence_shape.shape[0] > 1:
-        assert (
-            sorted_sequence_shape.shape[0] == values_shape.shape[0]
-        ), "Ranks of `sorted_sequence` and values must be the same if `sorted_sequence` is not 1-D."
-    for i in range(values_shape.shape[0]):
-        if sorted_sequence_shape.shape[0] > 1 and i < values_shape.shape[0] - 1:
-            assert (
-                sorted_sequence_shape[i] == values_shape[i]
-            ), "`sorted_sequence and `values` do not have the same shape along outer axes."
-
-        out_shape[i] = values_shape[i]
-    return out_shape
-
-
-@_reg.register_shape_func("searchsorted", False)
-def searchsorted_shape_func(attrs, inputs, _):
-    """
-    Shape func for searchsorted operator.
-    """
-    return [_searchsorted_shape(inputs[0], inputs[1])]
diff --git a/python/tvm/relay/op/_make.py b/python/tvm/relay/op/_make.py
deleted file mode 100644
index 85c2368fad4a..000000000000
--- a/python/tvm/relay/op/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op._make", __name__)
diff --git a/python/tvm/relay/op/_math.py b/python/tvm/relay/op/_math.py
deleted file mode 100644
index ff74fafcef75..000000000000
--- a/python/tvm/relay/op/_math.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Backend compiler related feature registration"""
-from . import op as _reg
-from . import strategy
-
-# einsum
-_reg.register_strategy("einsum", strategy.einsum_strategy)
diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py
deleted file mode 100644
index 2872640109d3..000000000000
--- a/python/tvm/relay/op/_reduce.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Backend compiler related feature registration"""
-from __future__ import absolute_import
-
-from tvm.runtime import convert
-from tvm.te.hybrid import script
-from tvm.topi.utils import get_const_int, get_const_tuple
-from . import op as _reg
-
-_reg.register_reduce_schedule("argmax")
-_reg.register_reduce_schedule("argmin")
-_reg.register_reduce_schedule("sum")
-_reg.register_reduce_schedule("all")
-_reg.register_reduce_schedule("any")
-_reg.register_reduce_schedule("max")
-_reg.register_reduce_schedule("min")
-_reg.register_reduce_schedule("prod")
-_reg.register_reduce_schedule("mean")
-_reg.register_reduce_schedule("variance")
-
-
-def _create_axis_record(attrs, inputs):
-    axes = attrs.axis if attrs.axis is None else list(get_const_tuple(attrs.axis))
-    exclude = get_const_int(attrs.exclude) > 0
-    keepdims = get_const_int(attrs.keepdims) > 0
-    data_shape = inputs[0]
-    shape_size = data_shape.shape[0].value
-    axis_record = [-1] * shape_size
-    if axes is None:
-        axes = list(range(shape_size))
-
-    for i, axis in enumerate(axes):
-        if axis < 0:
-            axes[i] = shape_size + axis
-
-    if exclude:
-        ex_axes = []
-        for i in range(shape_size):
-            if i not in axes:
-                ex_axes.append(i)
-        axes = ex_axes
-
-    for i in range(shape_size):
-        if i not in axes:
-            axis_record[i] = i
-
-    if not keepdims:
-        tmp = []
-        for i in axis_record:
-            if i >= 0:
-                tmp.append(i)
-        axis_record = tmp
-
-    return axis_record
-
-
-@script
-def _reduce_shape_func(data_shape, axis_record):
-    out = output_tensor((len(axis_record),), "int64")
-    for i in const_range(len(axis_record)):
-        if axis_record[i] >= 0:
-            out[i] = data_shape[axis_record[i]]
-        else:
-            out[i] = int64(1)
-
-    return out
-
-
-def reduce_shape_func(attrs, inputs, _):
-    """
-    Shape function for reduce op.
-    """
-    axis_record = _create_axis_record(attrs, inputs)
-    return [_reduce_shape_func(inputs[0], convert(axis_record))]
-
-
-_reg.register_shape_func("argmax", False, reduce_shape_func)
-_reg.register_shape_func("argmin", False, reduce_shape_func)
-_reg.register_shape_func("all", False, reduce_shape_func)
-_reg.register_shape_func("sum", False, reduce_shape_func)
-_reg.register_shape_func("max", False, reduce_shape_func)
-_reg.register_shape_func("min", False, reduce_shape_func)
-_reg.register_shape_func("prod", False, reduce_shape_func)
-_reg.register_shape_func("mean", False, reduce_shape_func)
-_reg.register_shape_func("variance", False, reduce_shape_func)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
deleted file mode 100644
index cf318a025c36..000000000000
--- a/python/tvm/relay/op/_tensor.py
+++ /dev/null
@@ -1,323 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, len-as-condition
-"""Backend compiler related feature registration"""
-
-from tvm.te.hybrid import script
-from tvm import topi
-from tvm.runtime import convert
-
-from .op import register_compute, register_shape_func, register_legalize
-from .op import register_broadcast_schedule, register_injective_schedule
-from .op import register_pattern, OpPattern
-
-
-register_broadcast_schedule("log")
-register_broadcast_schedule("log2")
-register_broadcast_schedule("log10")
-register_broadcast_schedule("tan")
-register_broadcast_schedule("cos")
-register_broadcast_schedule("cosh")
-register_broadcast_schedule("sin")
-register_broadcast_schedule("sinh")
-register_broadcast_schedule("acos")
-register_broadcast_schedule("acosh")
-register_broadcast_schedule("asin")
-register_broadcast_schedule("asinh")
-register_broadcast_schedule("atan")
-register_broadcast_schedule("atanh")
-register_broadcast_schedule("exp")
-register_broadcast_schedule("erf")
-register_broadcast_schedule("sqrt")
-register_broadcast_schedule("rsqrt")
-register_broadcast_schedule("sigmoid")
-register_broadcast_schedule("floor")
-register_broadcast_schedule("ceil")
-register_broadcast_schedule("trunc")
-register_broadcast_schedule("round")
-register_broadcast_schedule("sign")
-register_broadcast_schedule("abs")
-register_broadcast_schedule("tanh")
-register_broadcast_schedule("add")
-register_broadcast_schedule("subtract")
-register_broadcast_schedule("multiply")
-register_broadcast_schedule("divide")
-register_broadcast_schedule("floor_divide")
-register_broadcast_schedule("trunc_divide")
-register_broadcast_schedule("power")
-register_broadcast_schedule("copy")
-register_broadcast_schedule("logical_not")
-register_broadcast_schedule("logical_and")
-register_broadcast_schedule("logical_or")
-register_broadcast_schedule("logical_xor")
-register_broadcast_schedule("bitwise_not")
-register_broadcast_schedule("bitwise_and")
-register_broadcast_schedule("bitwise_or")
-register_broadcast_schedule("bitwise_xor")
-register_broadcast_schedule("negative")
-register_broadcast_schedule("mod")
-register_broadcast_schedule("floor_mod")
-register_broadcast_schedule("trunc_mod")
-register_broadcast_schedule("equal")
-register_broadcast_schedule("not_equal")
-register_broadcast_schedule("less")
-register_broadcast_schedule("less_equal")
-register_broadcast_schedule("greater")
-register_broadcast_schedule("greater_equal")
-register_broadcast_schedule("isnan")
-register_broadcast_schedule("isfinite")
-register_broadcast_schedule("isinf")
-register_injective_schedule("maximum")
-register_injective_schedule("minimum")
-register_injective_schedule("right_shift")
-register_injective_schedule("left_shift")
-register_injective_schedule("shape_of")
-register_injective_schedule("ndarray_size")
-register_injective_schedule("device_copy")
-register_broadcast_schedule("fast_exp")
-register_broadcast_schedule("fast_tanh")
-register_broadcast_schedule("fast_erf")
-
-
-@register_legalize("erf")
-def legalize_erf(attrs, inputs, types):
-    """Legalize ERF op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.math.erf_legalize(attrs, inputs, types)
-
-
-# zeros
-@register_compute("zeros")
-def zeros_compute(attrs, inputs, output_type):
-    assert not inputs
-    return [topi.full(output_type.shape, output_type.dtype, 0.0)]
-
-
-register_broadcast_schedule("zeros")
-register_pattern("zeros", OpPattern.ELEMWISE)
-
-# zeros_like
-@register_compute("zeros_like")
-def zeros_like_compute(attrs, inputs, output_type):
-    assert len(inputs) == 1
-    return [topi.full_like(inputs[0], 0.0)]
-
-
-register_broadcast_schedule("zeros_like")
-
-# ones
-@register_compute("ones")
-def ones_compute(attrs, inputs, output_type):
-    assert not inputs
-    return [topi.full(output_type.shape, output_type.dtype, 1.0)]
-
-
-register_broadcast_schedule("ones")
-register_pattern("ones", OpPattern.ELEMWISE)
-
-# ones_like
-@register_compute("ones_like")
-def ones_like_compute(attrs, inputs, output_type):
-    assert len(inputs) == 1
-    return [topi.full_like(inputs[0], 1.0)]
-
-
-register_broadcast_schedule("ones_like")
-
-# clip
-@register_compute("clip")
-def clip_compute(attrs, inputs, output_type):
-    assert len(inputs) == 1
-    return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)]
-
-
-register_injective_schedule("clip")
-
-# fixed point multiply
-@register_compute("fixed_point_multiply")
-def fixed_point_multiply_compute(attrs, inputs, output_type):
-    assert len(inputs) == 1
-    return [topi.fixed_point_multiply(inputs[0], attrs.multiplier, attrs.shift)]
-
-
-register_injective_schedule("fixed_point_multiply")
-
-# per-channel/per-axis fixed point multiply
-@register_compute("fixed_point_multiply_per_axis")
-def fixed_point_multiply_per_axis_compute(attrs, inputs, output_type):
-    assert len(inputs) == 4
-    return [
-        topi.fixed_point_multiply_per_axis(
-            *inputs, attrs.is_lshift_required, attrs.is_rshift_required, attrs.axes
-        )
-    ]
-
-
-register_broadcast_schedule("fixed_point_multiply_per_axis")
-
-# full
-@script
-def _full_shape_func(shape):
-    out_ndim = shape.shape[0]
-    out = output_tensor((out_ndim,), "int64")
-    for i in const_range(out_ndim):
-        out[i] = int64(shape[i])
-    return out
-
-
-@script
-def _convert_shape(shape):
-    out = output_tensor((len(shape),), "int64")
-    for i in const_range(len(shape)):
-        out[i] = int64(shape[i])
-    return out
-
-
-def full_shape_func(attrs, inputs, out_ndims):
-    """
-    Shape func for full.
-    """
-    if len(inputs) > 1:
-        return [_full_shape_func(inputs[1])]
-
-    return [_convert_shape(convert(attrs.shape))]
-
-
-def no_data_full_shape_func(attrs, inputs, out_ndims):
-    """
-    Shape func for zeros and ones.
-    """
-    if len(inputs) == 0:
-        return [_convert_shape(convert(attrs.shape))]
-    return [_full_shape_func(inputs[0])]
-
-
-@script
-def _broadcast_shape_func(x, y, ndim):
-    out = output_tensor((ndim,), "int64")
-    if len(x.shape) == 0:
-        for i in const_range(ndim):
-            out[i] = y[i]
-    elif len(y.shape) == 0:
-        for i in const_range(ndim):
-            out[i] = x[i]
-    else:
-        ndim1 = x.shape[0]
-        ndim2 = y.shape[0]
-        for i in const_range(1, min(ndim1, ndim2) + 1):
-            if x[ndim1 - i] == y[ndim2 - i]:
-                out[ndim - i] = x[ndim1 - i]
-            elif x[ndim1 - i] == 1:
-                out[ndim - i] = y[ndim2 - i]
-            else:
-                assert y[ndim2 - i] == 1, "Incompatible broadcast type %s and %s" % (
-                    x[ndim1 - i],
-                    y[ndim2 - i],
-                )
-                out[ndim - i] = x[ndim1 - i]
-        for i in const_range(min(ndim1, ndim2) + 1, ndim + 1):
-            if ndim1 >= ndim2:
-                out[ndim - i] = x[ndim1 - i]
-            else:
-                out[ndim - i] = y[ndim2 - i]
-    return out
-
-
-def broadcast_shape_func(attrs, inputs, out_ndims):
-    """
-    Shape function for broadcast op.
-    """
-    return [_broadcast_shape_func(*inputs, out_ndims[0])]
-
-
-def elemwise_shape_func(attrs, inputs, _):
-    """
-    Shape function for elemwise op.
-    """
-    return [topi.math.identity(inputs[0])]
-
-
-register_shape_func("cast", False, elemwise_shape_func)
-register_shape_func("cast_like", False, elemwise_shape_func)
-register_shape_func("round", False, elemwise_shape_func)
-register_shape_func("zeros", False, no_data_full_shape_func)
-register_shape_func("zeros_like", False, elemwise_shape_func)
-register_shape_func("ones", False, no_data_full_shape_func)
-register_shape_func("ones_like", False, elemwise_shape_func)
-register_shape_func("full", False, full_shape_func)
-register_shape_func("full_like", False, elemwise_shape_func)
-register_shape_func("broadcast_to", True, full_shape_func)
-
-register_shape_func("add", False, broadcast_shape_func)
-register_shape_func("subtract", False, broadcast_shape_func)
-register_shape_func("multiply", False, broadcast_shape_func)
-register_shape_func("divide", False, broadcast_shape_func)
-register_shape_func("floor_divide", False, broadcast_shape_func)
-register_shape_func("trunc_divide", False, broadcast_shape_func)
-register_shape_func("power", False, broadcast_shape_func)
-register_shape_func("mod", False, broadcast_shape_func)
-register_shape_func("floor_mod", False, broadcast_shape_func)
-register_shape_func("trunc_mod", False, broadcast_shape_func)
-register_shape_func("logical_and", False, broadcast_shape_func)
-register_shape_func("logical_or", False, broadcast_shape_func)
-register_shape_func("logical_xor", False, broadcast_shape_func)
-register_shape_func("bitwise_not", False, broadcast_shape_func)
-register_shape_func("bitwise_and", False, broadcast_shape_func)
-register_shape_func("bitwise_or", False, broadcast_shape_func)
-register_shape_func("bitwise_xor", False, broadcast_shape_func)
-register_shape_func("equal", False, broadcast_shape_func)
-register_shape_func("not_equal", False, broadcast_shape_func)
-register_shape_func("less", False, broadcast_shape_func)
-register_shape_func("less_equal", False, broadcast_shape_func)
-register_shape_func("greater", False, broadcast_shape_func)
-register_shape_func("greater_equal", False, broadcast_shape_func)
-register_shape_func("maximum", False, broadcast_shape_func)
-register_shape_func("minimum", False, broadcast_shape_func)
-register_shape_func("left_shift", False, broadcast_shape_func)
-register_shape_func("right_shift", False, broadcast_shape_func)
-
-register_shape_func("sqrt", False, elemwise_shape_func)
-register_shape_func("rsqrt", False, elemwise_shape_func)
-register_shape_func("negative", False, elemwise_shape_func)
-register_shape_func("exp", False, elemwise_shape_func)
-register_shape_func("tan", False, elemwise_shape_func)
-register_shape_func("fast_exp", False, elemwise_shape_func)
-register_shape_func("fast_tanh", False, elemwise_shape_func)
-register_shape_func("fast_erf", False, elemwise_shape_func)
-register_shape_func("floor", False, elemwise_shape_func)
-register_shape_func("log", False, elemwise_shape_func)
-register_shape_func("device_copy", False, elemwise_shape_func)
-register_shape_func("clip", False, elemwise_shape_func)
-register_shape_func("log2", False, elemwise_shape_func)
-register_shape_func("sigmoid", False, elemwise_shape_func)
-register_shape_func("tanh", False, elemwise_shape_func)
-register_shape_func("logical_not", False, elemwise_shape_func)
-register_shape_func("ceil", False, elemwise_shape_func)
diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py
deleted file mode 100644
index dca7b995b22d..000000000000
--- a/python/tvm/relay/op/_tensor_grad.py
+++ /dev/null
@@ -1,987 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Gradient definitions for Relay operators"""
-import tvm
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-from tvm.error import OpError
-
-
-from ..expr import Tuple, TupleGetItem, const, Var
-from ..ty import TensorType
-from ..loops import while_loop
-from . import nn as _nn
-from .op import register_gradient
-from .reduce import sum as _sum
-from .tensor import (
-    cos,
-    cosh,
-    exp,
-    less,
-    negative,
-    ones_like,
-    power,
-    sin,
-    sinh,
-    sqrt,
-    zeros_like,
-    equal,
-    shape_of,
-    log,
-    concatenate,
-)
-from .transform import (
-    broadcast_to_like,
-    collapse_sum_like,
-    cast_like,
-    reshape,
-    reshape_like,
-    strided_slice,
-    take,
-    transpose,
-    where,
-    repeat,
-    expand_dims,
-    full_like,
-    split,
-    squeeze,
-    strided_set,
-    arange,
-    scatter_nd,
-)
-
-
-@register_gradient("log")
-def log_grad(orig, grad):
-    """Returns [grad * (1 / x)]"""
-    x = orig.args[0]
-    return [grad * ones_like(x) / x]
-
-
-@register_gradient("log2")
-def log2_grad(orig, grad):
-    """Returns [grad * 1 / (log(2) * x)]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    two = const(2.0, dtype=x.checked_type.dtype)
-    return [grad * ones / (log(two) * x)]
-
-
-@register_gradient("log10")
-def log10_grad(orig, grad):
-    """Returns [grad * 1 / (log(10) * x)]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    ten = const(10.0, dtype=x.checked_type.dtype)
-    return [grad * ones / (log(ten) * x)]
-
-
-@register_gradient("tan")
-def tan_grad(orig, grad):
-    """Returns [grad / (cos^2(x))]"""
-    x = orig.args[0]
-    return [grad / (cos(x) * cos(x))]
-
-
-@register_gradient("cos")
-def cos_grad(orig, grad):
-    """Returns [grad * (-sin(x))]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * (-ones * sin(x))]
-
-
-@register_gradient("cosh")
-def cosh_grad(orig, grad):
-    """Returns [grad * sinh(x)]"""
-    x = orig.args[0]
-    return [grad * sinh(x)]
-
-
-@register_gradient("sin")
-def sin_grad(orig, grad):
-    """Returns [grad * cos(x)]"""
-    x = orig.args[0]
-    return [grad * cos(x)]
-
-
-@register_gradient("sinh")
-def sinh_grad(orig, grad):
-    """Returns [grad * cosh(x)]"""
-    x = orig.args[0]
-    return [grad * cosh(x)]
-
-
-@register_gradient("acos")
-def acos_grad(orig, grad):
-    """Returns [grad * -1/((1 - (x ^ 2)) ^ 1/2)]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * (-ones / sqrt(ones - (x * x)))]
-
-
-@register_gradient("acosh")
-def acosh_grad(orig, grad):
-    """Returns [grad * 1/((x - 1) ^ 1/2 * (x + 1) ^ 1/2)]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * ones / sqrt((x * x) - ones)]
-
-
-@register_gradient("asin")
-def asin_grad(orig, grad):
-    """Returns [grad * 1/((1 - (x ^ 2)) ^ (1/2))]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * ones / sqrt(ones - (x * x))]
-
-
-@register_gradient("asinh")
-def asinh_grad(orig, grad):
-    """Returns [grad * 1/((1 + (x ^ 2)) ^ (1/2))]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * ones / sqrt(ones + (x * x))]
-
-
-@register_gradient("atan")
-def atan_grad(orig, grad):
-    """Returns [grad * 1 / (1 + x ^ 2)]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * ones / (ones + (x * x))]
-
-
-@register_gradient("atanh")
-def atanh_grad(orig, grad):
-    """Returns [grad * 1 / (1 - x ^ 2)]"""
-    x = orig.args[0]
-    ones = ones_like(x)
-    return [grad * ones / (ones - (x * x))]
-
-
-@register_gradient("exp")
-def exp_grad(orig, grad):
-    """Returns [grad * exp(x)]"""
-    return [grad * exp(orig.args[0])]
-
-
-@register_gradient("sqrt")
-def sqrt_grad(orig, grad):
-    """Returns [grad * 0.5 * (x ^ -0.5)]"""
-    x = orig.args[0]
-    a = const(0.5, dtype=x.checked_type.dtype)
-    return [grad * a * power(x, negative(a))]
-
-
-@register_gradient("sigmoid")
-def sigmoid_grad(orig, grad):
-    """Returns [grad * sigmoid(x) * (1 - sigmoid(x))]."""
-    return [grad * orig * (ones_like(orig) - orig)]
-
-
-@register_gradient("tanh")
-def tanh_grad(orig, grad):
-    """Returns grad * (1 - tanh(x) * tanh(x))."""
-    return [grad * (ones_like(orig) - orig * orig)]
-
-
-@register_gradient("nn.relu")
-def relu_grad(orig, grad):
-    """Returns grad * (select(x < 0, 0, 1))."""
-    x = orig.args[0]
-    zeros = zeros_like(x)
-    ones = ones_like(x)
-    return [where(less(x, zeros), zeros, ones * grad)]
-
-
-@register_gradient("add")
-def add_grad(orig, grad):
-    """Returns [grad, grad]"""
-    return [collapse_sum_like(grad, orig.args[0]), collapse_sum_like(grad, orig.args[1])]
-
-
-@register_gradient("subtract")
-def subtract_grad(orig, grad):
-    """Returns [grad, -grad]"""
-    return [collapse_sum_like(grad, orig.args[0]), collapse_sum_like(negative(grad), orig.args[1])]
-
-
-@register_gradient("multiply")
-def multiply_grad(orig, grad):
-    """Returns [grad * y, grad * x]"""
-    x, y = orig.args
-    return [collapse_sum_like(grad * y, x), collapse_sum_like(grad * x, y)]
-
-
-@register_gradient("divide")
-def divide_grad(orig, grad):
-    """Returns [grad / y,  - grad * (x / y) / y]"""
-    x, y = orig.args
-    return [collapse_sum_like(grad / y, x), collapse_sum_like(-(grad * orig / y), y)]
-
-
-@register_gradient("zeros")
-def zeros_grad(orig, grad):
-    """Returns []"""
-    return []
-
-
-@register_gradient("dyn.zeros")
-def dyn_zeros_grad(orig, grad):
-    """Returns the gradient of dyn.zeros which is just zero."""
-    assert len(orig.args) == 1
-    return [zeros_like(orig.args[0])]
-
-
-@register_gradient("ones")
-def ones_grad(orig, grad):
-    """Returns []"""
-    return []
-
-
-@register_gradient("dyn.ones")
-def dyn_ones_grad(orig, grad):
-    """Returns the gradient of dyn.ones which is just zero."""
-    assert len(orig.args) == 1
-    return [zeros_like(orig.args[0])]
-
-
-@register_gradient("zeros_like")
-def zeros_like_grad(orig, grad):
-    """Returns [0]"""
-    return [orig]
-
-
-@register_gradient("ones_like")
-def ones_like_grad(orig, grad):
-    """Returns [0]"""
-    return [zeros_like(orig.args[0])]
-
-
-@register_gradient("collapse_sum_like")
-def collapse_sum_like_grad(orig, grad):
-    """Returns [broadcast_to_like(grad, x), 0]"""
-    x, y = orig.args
-    return [broadcast_to_like(grad, x), zeros_like(y)]
-
-
-@register_gradient("collapse_sum_to")
-def collapse_sum_to_grad(orig, grad):
-    """Returns [broadcast_to_like(grad, x), 0]"""
-    x, y = orig.args
-    return [broadcast_to_like(grad, x), zeros_like(y)]
-
-
-@register_gradient("abs")
-def abs_grad(orig, grad):
-    """Returns grad * (select(x < 0, -1, 1))."""
-    x = orig.args[0]
-    zeros = zeros_like(x)
-    ones = ones_like(x)
-    return [where(less(x, zeros), -ones * grad, ones * grad)]
-
-
-@register_gradient("erf")
-def erf_grad(orig, grad):
-    # c_2_div_sqrt_pi = 2.0 / math.sqrt(math.pi)
-    (inp,) = orig.args
-    c_2_div_sqrt_pi = const(1.1283791670955126, dtype=inp.checked_type.dtype)
-    return [c_2_div_sqrt_pi * exp(-inp * inp) * grad]
-
-
-@register_gradient("clip")
-def clip_grad(orig, grad):
-    """Returns grad * (select(x < min || max < x , 0, 1))."""
-    x = orig.args[0]
-    a_min = orig.attrs.get_int("a_min")
-    a_max = orig.attrs.get_int("a_max")
-    a_mins = broadcast_to_like(const(a_min, dtype=x.checked_type.dtype), x)
-    a_maxs = broadcast_to_like(const(a_max, dtype=x.checked_type.dtype), x)
-    zeros = zeros_like(x)
-    ones = ones_like(x)
-    return [where(less(x, a_mins), zeros, where(less(a_maxs, x), zeros, ones * grad))]
-
-
-@register_gradient("nn.max_pool2d")
-def max_pool2d_grad(orig, grad):
-    """Returns the gradient of max_pool2d."""
-    attrs = orig.attrs
-    pool_grad = _nn.max_pool2d_grad(
-        grad,
-        orig.args[0],
-        pool_size=attrs.pool_size,
-        strides=attrs.strides,
-        padding=attrs.padding,
-        layout=attrs.layout,
-        ceil_mode=attrs.ceil_mode,
-    )
-    return [pool_grad]
-
-
-@register_gradient("nn.avg_pool2d")
-def avg_pool2d_grad(orig, grad):
-    """Returns the gradient of avg_pool2d."""
-    attrs = orig.attrs
-    pool_grad = _nn.avg_pool2d_grad(
-        grad,
-        orig.args[0],
-        pool_size=attrs.pool_size,
-        strides=attrs.strides,
-        padding=attrs.padding,
-        layout=attrs.layout,
-        ceil_mode=attrs.ceil_mode,
-        count_include_pad=attrs.count_include_pad,
-    )
-    return [pool_grad]
-
-
-@register_gradient("nn.global_avg_pool2d")
-def global_avg_pool2d_grad(orig, grad):
-    """Returns the gradient of global_avg_pool2d."""
-    data = orig.args[0]
-    shape = data.checked_type.shape
-    layout = orig.attrs.layout
-
-    # we assume NCHW or NHWC layout for now, but easy to add more
-    assert layout in ["NCHW", "NHWC"]
-    if layout == "NCHW":
-        pool_size = shape[2], shape[3]
-    elif layout == "NHWC":
-        pool_size = shape[1], shape[2]
-
-    pool_grad = _nn.avg_pool2d_grad(
-        grad, data, pool_size=pool_size, strides=(1, 1), padding=(0, 0), layout=layout
-    )
-    return [pool_grad]
-
-
-@register_gradient("concatenate")
-def concatenate_grad(orig, grad):
-    """
-    Returns the gradient of concatenate, which is just the downstream gradient
-    split across the inputs.
-    """
-    assert len(orig.args) == 1
-    t = orig.args[0]
-
-    # calculate split indices. TODO(@altanh): support Any?
-    axis_dims = [ty.shape[orig.attrs.axis] for ty in t.checked_type.fields]
-    splits, cumsum = [], 0
-    for dim in axis_dims[:-1]:
-        if isinstance(dim, tvm.tir.IntImm):
-            dim = dim.value
-        cumsum += dim
-        splits.append(cumsum)
-
-    grads = split(grad, tuple(splits), axis=orig.attrs.axis).tuple_value
-    return [grads]
-
-
-@register_gradient("nn.conv2d")
-def conv2d_grad(orig, grad):
-    """Gradient of conv2d"""
-    attrs = orig.attrs
-    data, weight = orig.args
-    data_shape = get_const_tuple(data.checked_type.shape)
-    weight_shape = get_const_tuple(weight.checked_type.shape)
-    _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape)
-    _, _, in_h, in_w = data_shape
-    _, _, filter_h, filter_w = weight_shape
-
-    # infer output_padding
-    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
-        get_const_tuple(attrs.padding), (filter_h, filter_w)
-    )
-    stride_h, stride_w = get_const_tuple(attrs.strides)
-    out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
-    out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w
-    output_padding = (in_h - out_h, in_w - out_w)
-
-    assert attrs.data_layout == "NCHW", "only support NCHW data layout"
-    assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout"
-    assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout"
-
-    if attrs.out_dtype in ["", None]:
-        assert data.checked_type, "Call InferType first."
-        out_dtype = data.checked_type.dtype
-    else:
-        out_dtype = attrs.out_dtype
-
-    backward_data = _nn.conv2d_transpose(
-        grad,
-        weight,
-        strides=attrs.strides,
-        padding=attrs.padding,
-        dilation=attrs.dilation,
-        groups=attrs.groups,
-        output_padding=output_padding,
-        out_dtype=out_dtype,
-    )
-
-    backward_weight = _nn.conv2d_backward_weight(
-        grad,
-        data,
-        strides=attrs.strides,
-        padding=attrs.padding,
-        dilation=attrs.dilation,
-        groups=attrs.groups,
-        channels=attrs.channels,
-        kernel_size=(filter_h, filter_w),
-        grad_layout=attrs.out_layout if attrs.out_layout else attrs.data_layout,
-        data_layout=attrs.data_layout,
-        kernel_layout=attrs.kernel_layout,
-        out_dtype=out_dtype,
-    )
-
-    return [backward_data, backward_weight]
-
-
-def _get_reduce_axis(call):
-    """Helper function that returns the reduce axis of the call as plain python ints."""
-    x, axis = call.args[0], call.attrs.axis
-    shape = x.checked_type.concrete_shape
-
-    # should never exclude when axis is None
-    assert not (axis is None and call.attrs.exclude)
-
-    if axis is None:
-        return None
-
-    # convert to nonnegative integers and sort
-    axis = sorted([ax if ax >= 0 else len(shape) + ax for ax in map(int, axis)])
-    if call.attrs.exclude:
-        axis = [ax for ax in range(len(shape)) if ax not in axis]
-    return axis
-
-
-def _unreduce_expand(x, axis):
-    """Helper function that returns x expanded on the reduced dimensions in axis."""
-    # assume axis is sorted nonnegative ints
-    for ax in axis:
-        x = expand_dims(x, ax)
-    return x
-
-
-@register_gradient("max")
-def max_grad(orig, grad):
-    """Returns the gradient of max"""
-    x, axis = orig.args[0], _get_reduce_axis(orig)
-    shape = x.checked_type.concrete_shape
-
-    repeated = orig
-    if axis is None:
-        repeated = full_like(x, repeated)
-    else:
-        # expand dims (if necessary) and repeat along each axis
-        if not orig.attrs.keepdims:
-            repeated = _unreduce_expand(repeated, axis)
-            grad = _unreduce_expand(grad, axis)
-        for ax in axis:
-            repeated = repeat(repeated, shape[ax], ax)
-
-    indicators = cast_like(equal(repeated, x), grad)
-    num_selected = _sum(indicators, axis, keepdims=True)
-    # spread error across all max weights
-    return [indicators * grad / num_selected]
-
-
-@register_gradient("nn.softmax")
-def softmax_grad(orig, grad):
-    """Gradient of softmax"""
-    return [(grad - _sum(grad * orig, orig.attrs.axis, True)) * orig]
-
-
-@register_gradient("nn.log_softmax")
-def log_softmax_grad(orig, grad):
-    """Gradient of log_softmax"""
-    return [grad - _sum(grad, axis=orig.attrs.axis, keepdims=True) * exp(orig)]
-
-
-@register_gradient("nn.bias_add")
-def bias_add_grad(orig, grad):
-    """Returns gradient of bias_add"""
-    data = orig.args[0]
-    return [
-        collapse_sum_like(grad, data),
-        _sum(grad, orig.attrs.axis, keepdims=False, exclude=True),
-    ]
-
-
-@register_gradient("nn.dense")
-def dense_grad(orig, grad):
-    """Returns [grad' @ weight, data @ grad']"""
-    data, weight = orig.args
-    return [
-        collapse_sum_like(
-            _nn.dense(grad, transpose(weight), units=weight.checked_type.shape[1]), data
-        ),
-        collapse_sum_like(
-            _nn.dense(transpose(grad), transpose(data), units=data.checked_type.shape[1]), weight
-        ),
-    ]
-
-
-@register_gradient("nn.matmul")
-def matmul_grad(orig, grad):
-    """Returns [grad' @ tensor_b, tensor_a @ grad']"""
-    tensor_a, tensor_b = orig.args
-    if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, True):
-        return [
-            collapse_sum_like(
-                _nn.matmul(tensor_b, grad, transpose_a=True, transpose_b=True), tensor_a
-            ),
-            collapse_sum_like(
-                _nn.matmul(grad, tensor_a, transpose_a=True, transpose_b=True), tensor_b
-            ),
-        ]
-    if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, False):
-        return [
-            collapse_sum_like(_nn.matmul(tensor_b, grad, transpose_b=True), tensor_a),
-            collapse_sum_like(_nn.matmul(tensor_a, grad), tensor_b),
-        ]
-    if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, True):
-        # Keep using Dense op here for not involving extra ops
-        # TODO(jcf94): Merge all to nn.matmul when it is finally ready
-        return dense_grad(orig, grad)
-    # (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, False)
-    return [
-        collapse_sum_like(_nn.matmul(grad, tensor_b, transpose_b=True), tensor_a),
-        collapse_sum_like(_nn.matmul(tensor_a, grad, transpose_a=True), tensor_b),
-    ]
-
-
-@register_gradient("nn.batch_matmul")
-def batch_matmul_grad(orig, grad):
-    """gradient for nn.batch_matmul: in einsum LHS_bik,RHS_bjk->RES_bij
-    grads: GRAD_OUT_bij,RHS_bjk->GRAD_IN_LHS_bik
-           GRAD_OUT_bij,LHS_bik->GRAD_IN_RHS_bjk
-    """
-    lhs, rhs = orig.args
-    if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, True):
-        # ki,   jk  ->  ij
-        # jk,   ij  ->  ki
-        # ij,   ki  ->  jk
-        return [
-            collapse_sum_like(_nn.batch_matmul(rhs, grad, transpose_a=True, transpose_b=True), lhs),
-            collapse_sum_like(_nn.batch_matmul(grad, lhs, transpose_a=True, transpose_b=True), rhs),
-        ]
-    if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, False):
-        # ki,   kj  ->  ij
-        # kj,   ij  ->  ki
-        # ki,   ij  ->  kj
-        return [
-            collapse_sum_like(
-                _nn.batch_matmul(rhs, grad, transpose_a=False, transpose_b=True), lhs
-            ),
-            collapse_sum_like(
-                _nn.batch_matmul(lhs, grad, transpose_a=False, transpose_b=False), rhs
-            ),
-        ]
-    if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, True):
-        # ik,   jk  ->  ij
-        # ij,   jk  ->  ik
-        # ij,   ik  ->  jk
-        # Keep using NT format batch_matmul here for not involving extra ops
-        # TODO(jcf94): Merge all to normal batch_matmul when it is finally ready
-        return [
-            collapse_sum_like(
-                _nn.batch_matmul(
-                    grad,
-                    transpose(rhs, [0, 2, 1]),
-                    transpose_a=False,
-                    transpose_b=True,
-                ),
-                lhs,
-            ),
-            collapse_sum_like(
-                _nn.batch_matmul(
-                    transpose(grad, [0, 2, 1]),
-                    transpose(lhs, [0, 2, 1]),
-                    transpose_a=False,
-                    transpose_b=True,
-                ),
-                rhs,
-            ),
-        ]
-    # (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, False)
-    # ik,   kj  ->  ij
-    # ij,   kj  ->  ik
-    # ik,   ij  ->  kj
-    return [
-        collapse_sum_like(_nn.batch_matmul(grad, rhs, transpose_a=False, transpose_b=True), lhs),
-        collapse_sum_like(_nn.batch_matmul(lhs, grad, transpose_a=True, transpose_b=False), rhs),
-    ]
-
-
-@register_gradient("reshape")
-def reshape_grad(orig, grad):
-    """Gradient of reshape"""
-    return [reshape_like(grad, orig.args[0])]
-
-
-@register_gradient("dyn.reshape")
-def dyn_reshape_grad(orig, grad):
-    """Gradient of dyn_reshape"""
-    return [reshape_like(grad, orig.args[0]), zeros_like(orig.args[1])]
-
-
-@register_gradient("shape_of")
-def shape_of_grad(orig, grad):
-    """Gradient of shape_of"""
-    return [zeros_like(orig.args[0])]
-
-
-@register_gradient("cast")
-def cast_grad(orig, grad):
-    x = orig.args[0]
-    return [cast_like(grad, x)]
-
-
-@register_gradient("cast_like")
-def cast_like_grad(orig, grad):
-    x, like = orig.args
-    return [cast_like(grad, x), zeros_like(like)]
-
-
-@register_gradient("nn.batch_flatten")
-def batch_flatten_grad(orig, grad):
-    """Returns grad reshaped to data dims"""
-    data = orig.args[0]
-    return [reshape_like(grad, data)]
-
-
-@register_gradient("transpose")
-def transpose_grad(orig, grad):
-    """Returns grad transposed over the complement of original transpose axes"""
-    orig_axes = orig.attrs.axes
-    if orig_axes:
-        dims = len(orig_axes)
-        new_axes = [0] * dims
-        for i in range(dims):
-            new_axes[int(orig_axes[i])] = i
-    else:
-        new_axes = None
-    return [transpose(grad, axes=new_axes)]
-
-
-@register_gradient("negative")
-def negative_grad(orig, grad):
-    """Returns -grad"""
-    return [-grad]
-
-
-@register_gradient("sum")
-def sum_grad(orig, grad):
-    """Returns grad broadcasted to data dims"""
-    data, axis = orig.args[0], _get_reduce_axis(orig)
-    if not orig.attrs.keepdims:
-        if axis is None:
-            axis = list(range(len(data.checked_type.concrete_shape)))
-        grad = _unreduce_expand(grad, axis)
-    return [broadcast_to_like(grad, data)]
-
-
-@register_gradient("mean")
-def mean_grad(orig, grad):
-    """Returns grad broadcasted to data dims"""
-    data, axis = orig.args[0], _get_reduce_axis(orig)
-    shape = data.checked_type.concrete_shape
-    if axis is None:
-        axis = list(range(len(data.checked_type.concrete_shape)))
-    if not orig.attrs.keepdims:
-        grad = _unreduce_expand(grad, axis)
-    mult = 1.0
-    for a in axis:
-        mult /= shape[a]
-    return [broadcast_to_like(grad * const(mult, dtype=data.checked_type.dtype), data)]
-
-
-@register_gradient("variance")
-def variance_grad(orig, grad):
-    """Note that we take mean as an argument in the variance node"""
-    data, data_mean, axis = orig.args[0], orig.args[1], _get_reduce_axis(orig)
-    unbiased = orig.attrs.unbiased
-    shape = data.checked_type.concrete_shape
-    if axis is None:
-        axis = list(range(len(data.checked_type.concrete_shape)))
-    if not orig.attrs.keepdims:
-        grad = _unreduce_expand(grad, axis)
-    mult1 = 2.0
-    mult2 = -2.0
-    count = 1
-    for a in axis:
-        count *= shape[a]
-    if unbiased:
-        mult2 = mult2 * count / (count - 1)
-        count -= 1
-    mult1 /= count
-    return [
-        (grad * const(mult1, dtype=data.checked_type.dtype)) * data,
-        const(mult2, dtype=data.checked_type.dtype) * grad * data_mean,
-    ]
-
-
-@register_gradient("copy")
-def copy_grad(orig, grad):
-    return [grad]
-
-
-@register_gradient("nn.cross_entropy")
-def cross_entropy_grad(orig, grad):
-    x, y = orig.args
-    shape = shape_of(x)
-    batch_size = take(shape, const(0, dtype="int32"), axis=0)
-    grad = grad / batch_size.astype(x.checked_type.dtype)
-    return [-grad * y / x, -grad * log(x)]
-
-
-@register_gradient("nn.cross_entropy_with_logits")
-def cross_entropy_with_logits_grad(orig, grad):
-    x, y = orig.args
-    shape = shape_of(x)
-    batch_size = take(shape, const(0, dtype="int32"), axis=0)
-    grad = grad / batch_size.astype(x.checked_type.dtype)
-    return [-grad * y, -grad * x]
-
-
-@register_gradient("take")
-def take_grad(orig, grad):
-    """
-    Returns the gradient of take.
-    """
-
-    def make_scalar_tensor(v):
-        if isinstance(v, int):
-            v = const(v, dtype="int32")
-        return reshape(v, (1,))
-
-    # TODO(@altanh): we currently assume indices are in range
-    data, indices = orig.args
-    axis = orig.attrs.axis
-    batch_dims = orig.attrs.batch_dims
-    zero, one = map(make_scalar_tensor, [0, 1])
-    data_grad = zeros_like(data)
-    try:
-        data_shape = data.checked_type.concrete_shape
-    except TypeError as ty_err:
-        raise OpError("currently take_grad only supports data with concrete shape") from ty_err
-    if axis is None:
-        axis = 0
-        data_grad = reshape(data_grad, (-1,))
-        data_shape = 1
-        for dim in data.checked_type.concrete_shape:
-            data_shape *= dim
-        data_shape = (data_shape,)
-    else:
-        axis = int(axis)
-    if batch_dims is None:
-        batch_dims = 0
-    else:
-        batch_dims = int(batch_dims)
-    if batch_dims != 0:
-        raise OpError("take_grad only supports batch_dims equales to 0")
-    strides = [1] * len(data_shape)
-
-    if len(indices.checked_type.shape) == 0:
-        # axis on grad has been squeezed in this case
-        num_indices = one
-        indices = reshape(indices, (1,))
-        grad = expand_dims(grad, int(axis))
-    elif len(indices.checked_type.shape) == 1:
-        num_indices = take(shape_of(indices), zero, axis=0)
-    else:
-        raise OpError("take_grad only supports scalar or 1D indices")
-
-    def loop_cond(data_grad, i):
-        return squeeze(less(i, num_indices))
-
-    def loop_body(data_grad, i):
-        index = take(indices, i, axis=0)
-        grad_slice = take(grad, i, axis=axis)
-        begin, end = [], []
-        for ax, size in enumerate(data_shape):
-            size = make_scalar_tensor(size)
-            begin.append(zero if ax != axis else index)
-            end.append(size if ax != axis else index + one)
-        begin, end = concatenate(begin, axis=0), concatenate(end, axis=0)
-        # data_grad[:,...,index at axis,...,:] += grad_slice
-        update = strided_slice(data_grad, begin, end, strides=strides)
-        update = update + grad_slice  # no need to expand grad_slice since i has shape (1,)
-        next_data_grad = strided_set(data_grad, update, begin, end, strides=strides)
-        return (next_data_grad, i + one)
-
-    loop_vars = [
-        Var("data_grad", type_annotation=TensorType(data_shape, data.checked_type.dtype)),
-        Var("i", type_annotation=TensorType((1,), "int32")),
-    ]
-
-    loop = while_loop(loop_cond, loop_vars, loop_body)
-    result = loop(data_grad, zero)
-    data_grad = TupleGetItem(result, 0)
-
-    if orig.attrs.axis is None:
-        data_grad = reshape_like(data_grad, data)
-
-    return [data_grad, zeros_like(orig.args[1])]
-
-
-@register_gradient("contrib_reverse_reshape")
-def reverse_reshape_grad(orig, grad):
-    """
-    Returns the gradient of reverse_reshape (same as reshape).
-    """
-    return [reshape_like(grad, orig.args[0])]
-
-
-@register_gradient("stack")
-def stack_grad(orig, grad):
-    """
-    Returns grad split across stacked inputs.
-    """
-    stack_axis = int(orig.attrs.axis)
-    sections = len(orig.args[0].checked_type.fields)
-    splits = split(grad, sections, stack_axis)
-    splits = Tuple([squeeze(x, axis=[stack_axis]) for x in splits])
-    return [splits]
-
-
-@register_gradient("squeeze")
-def squeeze_grad(orig, grad):
-    """
-    Returns grad expanded to input size.
-    """
-    # this should work, can't use expand_dims since we lose
-    # squeeze information when axis=None
-    return [reshape_like(grad, orig.args[0])]
-
-
-@register_gradient("expand_dims")
-def expand_dims_grad(orig, grad):
-    """
-    Returns grad squeezed on expanded dims.
-    """
-    axis = int(orig.attrs.axis)
-    for _ in range(orig.attrs.num_newaxis):
-        grad = squeeze(grad, axis=[axis])
-    return [grad]
-
-
-@register_gradient("arange")
-def arange_grad(orig, grad):
-    """
-    Returns the gradient of arange.
-    """
-    start, stop, step = orig.args
-    length = take(shape_of(orig), const(0, dtype="int32"), axis=0)
-
-    grad_start = cast_like(_sum(grad), start)
-    grad_stop = zeros_like(stop)
-    grad_step = cast_like(arange(length, dtype="int32"), grad) * grad
-    grad_step = cast_like(_sum(grad_step), step)
-
-    return [grad_start, grad_stop, grad_step]
-
-
-@register_gradient("gather_nd")
-def gather_nd_grad(orig, grad):
-    """
-    Returns the gradient of gather_nd, which is simply scatter_nd.
-    """
-    data, indices = orig.args
-    return [scatter_nd(zeros_like(data), indices, grad, mode="add"), zeros_like(indices)]
-
-
-@register_gradient("reshape_like")
-def reshape_like_grad(orig, grad):
-    """
-    Returns the gradient of reshape_like.
-    """
-    data, shape_like = orig.args
-    return [reshape_like(grad, data), zeros_like(shape_like)]
-
-
-@register_gradient("where")
-def where_grad(orig, grad):
-    """
-    Returns the gradient of where.
-    """
-    cond, x, y = orig.args
-    g_zeros = zeros_like(grad)
-
-    grad_x = collapse_sum_like(where(cond, grad, g_zeros), x)
-    grad_y = collapse_sum_like(where(cond, g_zeros, grad), y)
-
-    return [zeros_like(cond), grad_x, grad_y]
-
-
-@register_gradient("less_equal")
-def less_equal_grad(orig, grad):
-    """
-    Returns the gradient of less_equal.
-    """
-    return [zeros_like(orig.args[0]), zeros_like(orig.args[1])]
-
-
-@register_gradient("not_equal")
-def not_equal_grad(orig, grad):
-    """
-    Returns the gradient of not_equal (just zeros).
-    """
-    return [zeros_like(orig.args[0]), zeros_like(orig.args[1])]
-
-
-@register_gradient("strided_slice")
-def strided_slice_grad(orig, grad):
-    """
-    Returns the gradient of strided_slice, which is equal to grad where the
-    input was sliced and zero elsewhere.
-    """
-    assert orig.attrs.axes is None, "grad for strided_slice with axes is not yet supported"
-    x = orig.args[0]
-    begin = get_const_tuple(orig.attrs.begin)
-    end = get_const_tuple(orig.attrs.end)
-    strides = get_const_tuple(orig.attrs.strides)
-    if orig.attrs.slice_mode == "size":
-        # convert sizes to ending indices and ignore strides
-        end = list(end)
-        for i, (start, size) in enumerate(zip(begin, end)):
-            if size == -1:
-                end[i] = int(x.checked_type.shape[i])
-            else:
-                end[i] = start + size
-        strides = None
-    else:
-        assert orig.attrs.slice_mode == "end"
-    return [strided_set(zeros_like(x), grad, begin, end, strides)]
-
-
-@register_gradient("one_hot")
-def one_hot_grad(orig, grad):
-    """
-    Returns the gradient of one_hot, which is the sum of grad at on and off
-    indices for on_value and off_value respectively.
-    """
-    indices, on_value, off_value = orig.args
-
-    g_zeros = zeros_like(grad)
-    on_mask = equal(orig, on_value)
-    grad_on = _sum(where(on_mask, grad, g_zeros))
-    grad_off = _sum(where(on_mask, g_zeros, grad))
-
-    return [zeros_like(indices), cast_like(grad_on, on_value), cast_like(grad_off, off_value)]
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
deleted file mode 100644
index 8bca72655491..000000000000
--- a/python/tvm/relay/op/_transform.py
+++ /dev/null
@@ -1,1286 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Backend compiler related feature registration"""
-# pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks,
-# pylint: disable=too-many-local-variables, too-many-arguments, no-else-return
-
-from __future__ import absolute_import
-
-import tvm
-from tvm import te, topi
-from tvm.runtime import convert
-from tvm.te.hybrid import script
-from tvm.topi.utils import get_const_int, get_const_tuple
-
-from . import op as _reg
-from . import strategy
-from ._tensor import elemwise_shape_func
-from .op import OpPattern
-
-_reg.register_broadcast_schedule("broadcast_to")
-_reg.register_broadcast_schedule("broadcast_to_like")
-_reg.register_broadcast_schedule("expand_dims")
-_reg.register_broadcast_schedule("repeat")
-_reg.register_broadcast_schedule("tile")
-_reg.register_broadcast_schedule("where")
-_reg.register_injective_schedule("squeeze")
-_reg.register_injective_schedule("reshape")
-_reg.register_injective_schedule("reshape_like")
-_reg.register_injective_schedule("full")
-_reg.register_injective_schedule("full_like")
-_reg.register_injective_schedule("arange")
-_reg.register_injective_schedule("meshgrid")
-_reg.register_injective_schedule("reverse")
-_reg.register_injective_schedule("reverse_sequence")
-_reg.register_injective_schedule("cast")
-_reg.register_injective_schedule("cast_like")
-_reg.register_injective_schedule("reinterpret")
-_reg.register_injective_schedule("strided_slice")
-_reg.register_injective_schedule("slice_like")
-_reg.register_injective_schedule("split")
-_reg.register_injective_schedule("take")
-_reg.register_injective_schedule("stack")
-_reg.register_injective_schedule("contrib_reverse_reshape")
-_reg.register_injective_schedule("gather")
-_reg.register_injective_schedule("gather_nd")
-_reg.register_injective_schedule("sequence_mask")
-_reg.register_injective_schedule("one_hot")
-_reg.register_reduce_schedule("collapse_sum_like")
-_reg.register_reduce_schedule("collapse_sum_to")
-_reg.register_injective_schedule("unravel_index")
-_reg.register_injective_schedule("sparse_to_dense")
-_reg.register_injective_schedule("matrix_set_diag")
-_reg.register_injective_schedule("adv_index")
-
-
-# concatenate
-@_reg.register_compute("concatenate")
-def compute_concat(attrs, inputs, output_type):
-    return [topi.concatenate(inputs, attrs.axis)]
-
-
-_reg.register_strategy("concatenate", strategy.concatenate_strategy)
-
-# sliding_window
-@_reg.register_compute("sliding_window")
-def compute_sliding_window(attrs, inputs, output_type):
-    """Compute definition of sliding_window"""
-    return [topi.sliding_window(inputs[0], attrs.axis, attrs.window_shape, attrs.strides)]
-
-
-_reg.register_strategy("sliding_window", strategy.sliding_window_strategy)
-
-# strided_set
-@_reg.register_compute("strided_set")
-def compute_strided_set(attrs, inputs, output_type):
-    """Compute definition of strided_set"""
-    return [topi.strided_set(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4])]
-
-
-_reg.register_injective_schedule("strided_set")
-
-# layout_transform
-_reg.register_strategy("layout_transform", strategy.layout_transform_strategy)
-_reg.register_pattern("layout_transform", OpPattern.INJECTIVE)
-_reg.register_injective_schedule("auto_scheduler_layout_transform")
-_reg.register_pattern("auto_scheduler_layout_transform", OpPattern.INJECTIVE)
-_reg.register_injective_schedule("meta_schedule_layout_transform")
-_reg.register_pattern("meta_schedule_layout_transform", OpPattern.INJECTIVE)
-
-# argwhere
-_reg.register_strategy("argwhere", strategy.argwhere_strategy)
-
-# sparse_fill_empty_rows
-@_reg.register_compute("sparse_fill_empty_rows")
-def compute_sparse_fill_empty_rows(attrs, inputs, output_type):
-    """Compute definition of sparse_fill_empty_rows"""
-
-    return topi.sparse_fill_empty_rows(
-        inputs[0],
-        inputs[1],
-        inputs[2],
-        inputs[3],
-        output_type.fields[0].shape,
-        output_type.fields[1].shape,
-        output_type.fields[2].shape,
-    )
-
-
-_reg.register_strategy("sparse_fill_empty_rows", strategy.sparse_fill_empty_rows_strategy)
-
-# sparse_reshape
-@_reg.register_compute("sparse_reshape")
-def compute_reshape(attrs, inputs, output_type):
-    """Compute definition of sparse_reshape"""
-
-    return topi.sparse_reshape(
-        inputs[0],
-        inputs[1],
-        inputs[2],
-        output_type.fields[0].shape,
-        output_type.fields[1].shape,
-    )
-
-
-_reg.register_strategy("sparse_reshape", strategy.sparse_reshape_strategy)
-
-# stft
-@_reg.register_compute("stft")
-def compute_stft(attrs, inputs, output_type):
-    """Compute definition of stft"""
-    return topi.stft(
-        inputs[0],
-        attrs.n_fft,
-        attrs.hop_length,
-        attrs.win_length,
-        attrs.window,
-        attrs.normalized,
-        attrs.onesided,
-        output_type.shape,
-    )
-
-
-_reg.register_strategy("stft", strategy.stft_strategy)
-
-
-@script
-def _stft_shape_func(data, n_fft, hop_length, onesided):
-    output_shape = output_tensor((4,), "int64")
-    output_shape[0] = int64(data.shape[0])
-    if onesided:
-        output_shape[1] = int64(int64(n_fft) // int64(2)) + int64(1)
-    else:
-        output_shape[1] = int64(n_fft)
-    output_shape[2] = int64(int64(data.shape[1] - n_fft) // int64(hop_length)) + int64(1)
-    output_shape[3] = int64(2)
-    return output_shape
-
-
-@_reg.register_shape_func("stft", True)
-def stft_shape_func(attrs, inputs, _):
-    """
-    Shape func for stft.
-    """
-    return [
-        _stft_shape_func(
-            inputs[0], convert(attrs.n_fft), convert(attrs.hop_length), convert(attrs.onesided)
-        )
-    ]
-
-
-# DFT
-@_reg.register_compute("dft")
-def compute_dft(attrs, inputs, _):
-    """Compute definition of DFT"""
-    return topi.dft(
-        inputs[0],
-        inputs[1],
-        attrs.inverse,
-    )
-
-
-_reg.register_strategy("dft", strategy.dft_strategy)
-
-
-# trilu
-_reg.register_strategy("trilu", strategy.trilu_strategy)
-
-
-# scatter_elements
-@_reg.register_compute("scatter_elements")
-def compute_scatter_elements(attrs, inputs, output_type):
-    """Compute definition of scatter_elements"""
-    return [topi.scatter_elements(inputs[0], inputs[1], inputs[2], attrs.axis, attrs.reduction)]
-
-
-_reg.register_strategy("scatter_elements", strategy.scatter_elements_strategy)
-
-# scatter_nd
-@_reg.register_compute("scatter_nd")
-def compute_scatter_nd(attrs, inputs, output_type):
-    """Compute definition of scatter_nd"""
-    return [topi.scatter_nd(inputs[0], inputs[1], inputs[2], attrs.mode)]
-
-
-_reg.register_strategy("scatter_nd", strategy.scatter_nd_strategy)
-
-# cumsum
-@_reg.register_compute("cumsum")
-def compute_cumsum(attrs, inputs, output_type):
-    """Compute definition of cumsum"""
-    return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
-
-
-_reg.register_strategy("cumsum", strategy.cumsum_strategy)
-_reg.register_shape_func("cumsum", False, elemwise_shape_func)
-
-# cumprod
-@_reg.register_compute("cumprod")
-def compute_cumprod(attrs, inputs, output_type):
-    """Compute definition of cumprod"""
-    return [topi.cumprod(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
-
-
-_reg.register_strategy("cumprod", strategy.cumprod_strategy)
-_reg.register_shape_func("cumprod", False, elemwise_shape_func)
-
-
-@_reg.register_compute("unique")
-def compute_unique(attrs, inputs, output_type):
-    """Compute definition of unique"""
-    return topi.unique(inputs[0], attrs.sorted, attrs.return_counts)
-
-
-_reg.register_strategy("unique", strategy.unique_strategy)
-
-# invert_permutation
-_reg.register_strategy("invert_permutation", strategy.invert_permutation_strategy)
-_reg.register_shape_func("invert_permutation", False, elemwise_shape_func)
-
-
-#####################
-#  Shape functions  #
-#####################
-
-
-@script
-def _arange_shape_func(start, stop, step):
-    out = output_tensor((1,), "int64")
-    if step[()] < 0:
-        out[0] = int64(ceil_div((int64(start[()]) - int64(stop[()])), int64(-step[()])))
-    else:
-        out[0] = int64(ceil_div((int64(stop[()]) - int64(start[()])), int64(step[()])))
-    return out
-
-
-@_reg.register_shape_func("arange", True)
-def arange_shape_func(attrs, inputs, _):
-    """
-    Shape func for arange
-    """
-    return [_arange_shape_func(*inputs)]
-
-
-@script
-def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, slice_mode):
-    ndim = len(data_shape)
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        dim_size = int64(data_shape[i])
-        cbegin = int64(0)
-        cend = dim_size
-        cstride = int64(1)
-
-        if len(strides) > i:
-            cstride = int64(strides[i])
-
-        if len(begin) > i:
-            cbegin = int64(begin[i])
-        elif cstride < 0:
-            cbegin = dim_size
-
-        if len(end) <= i:
-            if cstride < 0:
-                cend = int64(0)
-        elif slice_mode != 0:
-            cstride = int64(1)
-            if end[i] < 0:
-                cend = dim_size
-            else:
-                cend = cbegin + int64(end[i])
-        else:
-            if end[i] > data_shape[i]:
-                cend = dim_size
-            else:
-                cend = int64(end[i])
-
-        assert cstride != 0, "Strides can't be zero."
-
-        if cbegin < 0:
-            cbegin += dim_size
-        if cend < 0:
-            cend += dim_size
-
-        if cstride < 0:
-            if cend < 0:
-                cend = int64(-1)
-            if cbegin > dim_size - 1:
-                cbegin = dim_size - 1
-            slice_range = cbegin - cend
-            step = -cstride
-        else:
-            slice_range = cend - cbegin
-            step = cstride
-        out[i] = int64(ceil_div(slice_range, step))
-    return out
-
-
-@script
-def _strided_slice_shape_func_with_axes(data_shape, begin, end, strides, slice_mode, axes):
-    ndim = data_shape.shape[0]
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        out[i] = data_shape[i]
-
-    for i in const_range(len(axes)):
-        dim_size = int64(data_shape[axes[i]])
-        cbegin = int64(0)
-        cend = dim_size
-        cstride = int64(1)
-
-        if len(strides) > i:
-            cstride = int64(strides[i])
-
-        if len(begin) > i:
-            cbegin = int64(begin[i])
-        elif cstride < 0:
-            cbegin = dim_size
-
-        if len(end) <= i:
-            cend = dim_size
-        elif slice_mode != 0:
-            cstride = int64(1)
-            if end[i] < 0:
-                cend = dim_size
-            else:
-                cend = cbegin + int64(end[i])
-        else:
-            if end[i] > data_shape[axes[i]]:
-                cend = dim_size
-            else:
-                cend = int64(end[i])
-
-        assert cstride != 0, "Strides can't be zero."
-
-        if cbegin < 0:
-            cbegin += dim_size
-        if cend < 0:
-            cend += dim_size
-
-        if cstride < 0:
-            if cend < 0:
-                cend = int64(-1)
-            if cbegin > dim_size - 1:
-                cbegin = dim_size - 1
-            slice_range = cbegin - cend
-            step = -cstride
-        else:
-            slice_range = cend - cbegin
-            step = cstride
-
-        out[axes[i]] = int64(ceil_div(slice_range, step))
-    return out
-
-
-@_reg.register_shape_func("strided_slice", False)
-def strided_slice_shape_func(attrs, inputs, _):
-    """
-    Shape func for strided_slice
-    """
-    slice_mode = convert(0 if attrs.slice_mode == "end" else 1)
-    if attrs.axes is None:
-        return [
-            _strided_slice_shape_func_input_shape(
-                inputs[0], attrs.begin, attrs.end, attrs.strides, slice_mode
-            )
-        ]
-    return [
-        _strided_slice_shape_func_with_axes(
-            inputs[0], attrs.begin, attrs.end, attrs.strides, slice_mode, attrs.axes
-        )
-    ]
-
-
-@script
-def _one_hot_shape_func(indices_shape, depth, axis):
-    in_ndim = indices_shape.shape[0]
-    out_ndim = in_ndim + 1
-    true_axis = in_ndim if axis == -1 else axis
-    indices_i = 0
-    out = output_tensor((out_ndim,), "int64")
-    for i in range(out_ndim):
-        if i == true_axis:
-            out[i] = int64(depth)
-        else:
-            out[i] = int64(indices_shape[indices_i])
-            indices_i += 1
-    return out
-
-
-@_reg.register_shape_func("one_hot", False)
-def one_hot_shape_func(attrs, inputs, _):
-    """
-    Shape func for one_hot
-    """
-    shape_func = [_one_hot_shape_func(inputs[0], convert(attrs.depth), convert(attrs.axis))]
-    return shape_func
-
-
-@script
-def _concatenate_shape_func(inputs, axis):
-    ndim = inputs[0].shape[0]
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        if i != axis:
-            out[i] = inputs[0][i]
-        else:
-            out[i] = int64(0)
-            for j in const_range(len(inputs)):
-                out[i] += inputs[j][i]
-    return out
-
-
-@_reg.register_shape_func("concatenate", False)
-def concatenate_shape_func(attrs, inputs, _):
-    axis = get_const_int(attrs.axis)
-    if axis < 0:
-        axis += inputs[0].shape[0]
-    return [_concatenate_shape_func(inputs, convert(axis))]
-
-
-@script
-def _reshape_shape_func_input_shape(data_shape, newshape, ndim, allowzero):
-    out = output_tensor((ndim,), "int64")
-    src_idx = 0
-    dst_idx = 0
-    infer_idx = -1
-    copy = False
-    skip = 0
-    for i in const_range(len(newshape)):
-        if skip > 0:
-            skip -= 1
-        elif newshape[i] > 0:
-            out[dst_idx] = int64(newshape[i])
-            src_idx += 1
-            dst_idx += 1
-        elif newshape[i] == 0:
-            if allowzero:
-                out[dst_idx] = int64(newshape[i])
-            else:
-                out[dst_idx] = data_shape[src_idx]
-            src_idx += 1
-            dst_idx += 1
-        elif newshape[i] == -1:
-            assert infer_idx < 0, "One and only one dim can be inferred"
-            out[dst_idx] = int64(1)
-            infer_idx = i
-            src_idx += 1
-            dst_idx += 1
-        elif newshape[i] == -2:
-            copy = True
-        elif newshape[i] == -3:
-            assert data_shape.shape[0] - src_idx > 1, "Not enough dims in input shape for -3"
-            out[dst_idx] = data_shape[src_idx] * data_shape[src_idx + 1]
-            src_idx += 2
-            dst_idx += 1
-        elif newshape[i] == -4:
-            assert len(newshape) - i > 2, "Not enough dims in new shape for -4"
-            if newshape[i + 1] == -1:
-                assert newshape[i + 2] != -1, "Split dims cannot both be -1."
-                out[dst_idx] = data_shape[src_idx] // int64(newshape[i + 2])
-                out[dst_idx + 1] = int64(newshape[i + 2])
-            else:
-                out[dst_idx] = int64(newshape[i + 1])
-                if newshape[i + 2] == -1:
-                    out[dst_idx + 1] = data_shape[src_idx] // int64(newshape[i + 1])
-                else:
-                    out[dst_idx + 1] = int64(newshape[i + 2])
-            assert (
-                data_shape[src_idx] == out[dst_idx] * out[dst_idx + 1]
-            ), "Product of split dims doesn't match to input dim"
-            src_idx += 1
-            dst_idx += 2
-            skip = 2
-        else:
-            assert False, "Invalid special values in new shape"
-    if len(data_shape.shape) > 0:
-        # if data is not constant, we can then handle -1 and -2
-        if copy:
-            for i in range(src_idx, data_shape.shape[0]):
-                out[dst_idx] = data_shape[i]
-                dst_idx += 1
-        if infer_idx >= 0:
-            old_size = int64(1)
-            for i in const_range(data_shape.shape[0]):
-                old_size *= data_shape[i]
-            new_size = int64(1)
-            for i in const_range(out.shape[0]):
-                new_size *= out[i]
-            out[infer_idx] = old_size // new_size
-    return out
-
-
-@_reg.register_shape_func("reshape", False)
-def reshape_shape_func(attrs, inputs, out_ndims):
-    newshape = get_const_tuple(attrs.newshape)
-    allowzero = attrs.allowzero
-    return [
-        _reshape_shape_func_input_shape(
-            inputs[0], convert(newshape), out_ndims[0], convert(allowzero)
-        )
-    ]
-
-
-@script
-def _take_no_axis_shape_func(indices_shape, out_ndim):
-    out = output_tensor((out_ndim,), "int64")
-    for i in const_range(out_ndim):
-        out[i] = indices_shape[i]
-    return out
-
-
-@script
-def _take_with_axis_shape_func(data_shape, indices_shape, axis, batch_dims, out_ndim):
-    out = output_tensor((out_ndim,), "int64")
-    for i in const_range(axis):
-        out[i] = data_shape[i]
-    if len(indices_shape.shape) == 0:
-        # indices is constant
-        for i in const_range(axis + 1, len(data_shape)):
-            out[i - 1] = data_shape[i]
-    else:
-        for i in const_range(len(indices_shape) - batch_dims):
-            out[axis + i] = indices_shape[i + batch_dims]
-        for i in const_range(axis + 1, len(data_shape)):
-            out[len(indices_shape) + i - 1 - batch_dims] = data_shape[i]
-    return out
-
-
-@_reg.register_shape_func("take", False)
-def take_shape_func(attrs, inputs, out_ndims):
-    """
-    Shape function for take op.
-    """
-    if attrs.axis is None:
-        return [_take_no_axis_shape_func(inputs[1], out_ndims[0])]
-    axis = get_const_int(attrs.axis)
-    batch_dims = get_const_int(attrs.batch_dims)
-    data_ndim = int(inputs[0].shape[0])
-    if inputs[1].shape:
-        indices_ndim = int(inputs[1].shape[0])
-    if axis < 0:
-        axis += data_ndim
-    assert 0 <= axis < data_ndim
-    if batch_dims < 0:
-        batch_dims += indices_ndim
-    return [_take_with_axis_shape_func(*inputs, convert(axis), convert(batch_dims), out_ndims[0])]
-
-
-@_reg.register_legalize("take")
-def legalize_dyn_topk(attrs, inputs, types):
-    """Legalize take op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current op
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.take_legalize(attrs, inputs, types)
-
-
-@script
-def _argwhere_shape_func_1d(condition):
-    out = output_tensor((2,), "int64")
-    out[0] = int64(0)
-    out[1] = int64(1)
-    for i1 in range(condition.shape[0]):
-        if condition[i1] != 0:
-            out[0] += int64(1)
-    return out
-
-
-@script
-def _argwhere_shape_func_2d(condition):
-    out = output_tensor((2,), "int64")
-    out[0] = int64(0)
-    out[1] = int64(2)
-    for i1 in range(condition.shape[0]):
-        for i2 in range(condition.shape[1]):
-            if condition[i1, i2] != 0:
-                out[0] += int64(1)
-    return out
-
-
-@script
-def _argwhere_shape_func_3d(condition):
-    out = output_tensor((2,), "int64")
-    out[0] = int64(0)
-    out[1] = int64(3)
-    for i1 in range(condition.shape[0]):
-        for i2 in range(condition.shape[1]):
-            for i3 in range(condition.shape[2]):
-                if condition[i1, i2, i3] != 0:
-                    out[0] += int64(1)
-    return out
-
-
-@script
-def _argwhere_shape_func_4d(condition):
-    out = output_tensor((2,), "int64")
-    out[0] = int64(0)
-    out[1] = int64(4)
-    for i1 in range(condition.shape[0]):
-        for i2 in range(condition.shape[1]):
-            for i3 in range(condition.shape[2]):
-                for i4 in range(condition.shape[3]):
-                    if condition[i1, i2, i3, i4] != 0:
-                        out[0] += int64(1)
-    return out
-
-
-@script
-def _argwhere_shape_func_5d(condition):
-    out = output_tensor((2,), "int64")
-    out[0] = int64(0)
-    out[1] = int64(5)
-    for i1 in range(condition.shape[0]):
-        for i2 in range(condition.shape[1]):
-            for i3 in range(condition.shape[2]):
-                for i4 in range(condition.shape[3]):
-                    for i5 in range(condition.shape[4]):
-                        if condition[i1, i2, i3, i4, i5] != 0:
-                            out[0] += int64(1)
-    return out
-
-
-@_reg.register_shape_func("argwhere", True)
-def argwhere_shape_func(attrs, inputs, out_ndims):
-    """
-    Shape function for argwhere.
-    """
-    if len(inputs[0].shape) == 1:
-        return [_argwhere_shape_func_1d(inputs[0])]
-    if len(inputs[0].shape) == 2:
-        return [_argwhere_shape_func_2d(inputs[0])]
-    if len(inputs[0].shape) == 3:
-        return [_argwhere_shape_func_3d(inputs[0])]
-    if len(inputs[0].shape) == 4:
-        return [_argwhere_shape_func_4d(inputs[0])]
-    if len(inputs[0].shape) == 5:
-        return [_argwhere_shape_func_5d(inputs[0])]
-    return ValueError("Does not support rank higher than 5 in argwhere")
-
-
-_reg.register_shape_func("scatter_elements", False, elemwise_shape_func)
-_reg.register_shape_func("scatter_nd", False, elemwise_shape_func)
-
-
-@script
-def _sparse_fill_empty_rows_shape_func(sparse_indices, dense_shape):
-
-    new_sparse_indices_shape = output_tensor((2,), "int64")
-    new_sparse_values_shape = output_tensor((1,), "int64")
-    empty_row_indicator_shape = output_tensor((1,), "int64")
-    num_dense_rows = int64(dense_shape[0])
-
-    if int64(sparse_indices.shape[0]) == int64(0):  # Handle Empty Case
-        #  Total rows will equal dense_shape[0]
-        new_sparse_indices_shape[0] = num_dense_rows
-        new_sparse_indices_shape[1] = int64(sparse_indices.shape[1])
-        new_sparse_values_shape[0] = num_dense_rows
-        empty_row_indicator_shape[0] = num_dense_rows
-        return (new_sparse_indices_shape, new_sparse_values_shape, empty_row_indicator_shape)
-
-    else:
-        count = int64(sparse_indices.shape[0])  # Add count of all rows already in sparse_indices
-        for i in range(1, int64(sparse_indices.shape[0])):
-            index = int64(sparse_indices[i, 0])
-            prev_index = int64(sparse_indices[i - 1, 0] + 1)
-
-            if index > prev_index:
-                count += index - prev_index  # Add count of all rows between two consecutive indices
-
-        count += int64(sparse_indices[0, 0])  # Add count from 0 to first row id in sparse_indices
-        count += int64(
-            num_dense_rows - 1 - sparse_indices[sparse_indices.shape[0] - 1, 0]
-        )  # Add count from last row id to dense_shape - 1
-        new_sparse_indices_shape[0] = int64(count)
-        new_sparse_indices_shape[1] = int64(sparse_indices.shape[1])
-        new_sparse_values_shape[0] = int64(count)
-        empty_row_indicator_shape[0] = num_dense_rows
-        return (new_sparse_indices_shape, new_sparse_values_shape, empty_row_indicator_shape)
-
-
-@_reg.register_shape_func("sparse_fill_empty_rows", True)
-def sparse_fill_empty_rows_func(attrs, inputs, _):
-    return _sparse_fill_empty_rows_shape_func(inputs[0], inputs[2])
-
-
-@script
-def _sparse_reshape_shape_func(sparse_indices_shape, prev_shape_shape, new_shape_shape):
-    indices_shape = output_tensor((2,), "int64")
-    indices_shape[0] = int64(sparse_indices_shape[0])
-    indices_shape[1] = int64(new_shape_shape[0])
-    shape_tensor = output_tensor((1,), "int64")
-    shape_tensor[0] = int64(new_shape_shape[0])
-    return (indices_shape, shape_tensor)
-
-
-@_reg.register_shape_func("sparse_reshape", False)
-def sparse_reshape_shape_func(attrs, inputs, _):
-    """
-    Shape func for sparse_reshape.
-    """
-    return _sparse_reshape_shape_func(inputs[0], inputs[1], inputs[2])
-
-
-@script
-def _layout_transform_shape_func(
-    data_shape, out_layout_len, dst_equal_list, dst_mul_list, dst_div_list, dst_mix_list
-):
-    out = output_tensor((out_layout_len,), "int64")
-    for i in const_range(len(dst_equal_list)):
-        out[dst_equal_list[i][0]] = data_shape[dst_equal_list[i][1]]
-    for i in const_range(len(dst_mul_list)):
-        out[dst_mul_list[i][0]] = data_shape[dst_mul_list[i][1]] * data_shape[dst_mul_list[i][2]]
-    for i in const_range(len(dst_div_list)):
-        out[dst_div_list[i][0]] = data_shape[dst_div_list[i][1]] // dst_div_list[i][3]
-        out[dst_div_list[i][2]] = int64(dst_div_list[i][3])
-    for i in const_range(len(dst_mix_list)):
-        out[dst_mix_list[i][0]] = (
-            data_shape[dst_mix_list[i][1]] * dst_mix_list[i][2] // dst_mix_list[i][4]
-        )
-        out[dst_mix_list[i][3]] = int64(dst_mix_list[i][4])
-
-    return out
-
-
-@_reg.register_shape_func("layout_transform", False)
-def layout_transform_shape_func(attrs, inputs, _):
-    """
-    Shape function for layout_transform op.
-    """
-
-    def _fetch_axis(layout):
-        major_axes = []
-        minor_axes = {}
-        num_start = -1
-        for i, item in enumerate(layout):
-            if "A" <= item <= "Z":
-                major_axes.append(item)
-            elif "a" <= item <= "z":
-                last_num = int(layout[num_start:i])
-                minor_axes[item] = last_num
-                num_start = -1
-            elif num_start < 0:
-                num_start = i
-        return major_axes, minor_axes
-
-    _, src_minor_axes = _fetch_axis(attrs.src_layout)
-    dst_major_axes, dst_minor_axes = _fetch_axis(attrs.dst_layout)
-    src_letter_list = []
-    dst_letter_list = []
-    for item in attrs.src_layout:
-        if "A" <= item <= "Z" or "a" <= item <= "z":
-            src_letter_list.append(item)
-    for item in attrs.dst_layout:
-        if "A" <= item <= "Z" or "a" <= item <= "z":
-            dst_letter_list.append(item)
-    out_layout_len = len(dst_major_axes) + len(dst_minor_axes)
-    dst_equal_list = []
-    dst_mul_list = []
-    dst_div_list = []
-    dst_mix_list = []
-
-    for key in dst_major_axes:
-        if key.lower() not in dst_minor_axes:
-            if key.lower() not in src_minor_axes:
-                dst_equal_list.append((dst_letter_list.index(key), src_letter_list.index(key)))
-            else:
-                dst_mul_list.append(
-                    (
-                        dst_letter_list.index(key),
-                        src_letter_list.index(key),
-                        src_letter_list.index(key.lower()),
-                    )
-                )
-        else:
-            if key.lower() not in src_minor_axes:
-                dst_div_list.append(
-                    (
-                        dst_letter_list.index(key),
-                        src_letter_list.index(key),
-                        dst_letter_list.index(key.lower()),
-                        dst_minor_axes[key.lower()],
-                    )
-                )
-            else:
-                dst_mix_list.append(
-                    (
-                        dst_letter_list.index(key),
-                        src_letter_list.index(key),
-                        src_minor_axes[key.lower()],
-                        dst_letter_list.index(key.lower()),
-                        dst_minor_axes[key.lower()],
-                    )
-                )
-
-    return [
-        _layout_transform_shape_func(
-            inputs[0],
-            convert(out_layout_len),
-            convert(dst_equal_list),
-            convert(dst_mul_list),
-            convert(dst_div_list),
-            convert(dst_mix_list),
-        )
-    ]
-
-
-@script
-def _expand_dim_shape_func(data_shape, ndim, axis, num_newaxis):
-    out = output_tensor((ndim + num_newaxis,), "int64")
-    for i in const_range(out.shape[0]):
-        if i < axis:
-            out[i] = data_shape[i]
-        elif i < axis + num_newaxis:
-            out[i] = int64(1)
-        else:
-            out[i] = data_shape[i - num_newaxis]
-
-    return out
-
-
-@_reg.register_shape_func("expand_dims", False)
-def expand_dim_shape_func(attrs, inputs, _):
-    """
-    Shape function for expand_dim op.
-    """
-    axis = get_const_int(attrs.axis)
-    num_newaxis = get_const_int(attrs.num_newaxis)
-    if axis < 0:
-        axis = inputs[0].shape[0] + axis + 1
-    ndim = inputs[0].shape[0] if inputs[0].shape else 0
-    return [_expand_dim_shape_func(inputs[0], convert(ndim), convert(axis), convert(num_newaxis))]
-
-
-@script
-def _transpose_shape_func(data_shape, axes):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    for i in const_range(len(axes)):
-        out[i] = data_shape[axes[i]]
-
-    return out
-
-
-@_reg.register_shape_func("transpose", False)
-def transpose_shape_func(attrs, inputs, _):
-    """
-    Shape function for transpose op.
-    """
-    axes = attrs.axes if attrs.axes is None else get_const_tuple(attrs.axes)
-    if axes is None:
-        axes = list(range(inputs[0].shape[0].value))
-        axes.reverse()
-    axes = list(axes)
-    for i, axis in enumerate(axes):
-        if axis < 0:
-            axes[i] = inputs[0].shape[0] + axis
-    return [_transpose_shape_func(inputs[0], convert(axes))]
-
-
-_reg.register_schedule("transpose", strategy.schedule_transpose)
-
-
-@script
-def _squeeze_shape_func(data_shape, keep_axes, remove_axes):
-    out = output_tensor((len(keep_axes),), "int64")
-    for i in const_range(len(keep_axes)):
-        out[i] = data_shape[keep_axes[i]]
-
-    for i in const_range(len(remove_axes)):
-        assert data_shape[remove_axes[i]] == 1, "Removed dimension must have size 1"
-
-    return out
-
-
-@_reg.register_shape_func("squeeze", False)
-def squeeze_shape_func(attrs, inputs, _):
-    """
-    Shape function for squeeze op.
-    """
-    axis = attrs.axis if attrs.axis is None else get_const_tuple(attrs.axis)
-    keep_axes = []
-    remove_axes = []
-    if axis is not None:
-        ndim = inputs[0].shape[0].value
-        axis = [i + ndim if i < 0 else i for i in axis]
-        for i in range(ndim):
-            if i not in axis:
-                keep_axes.append(i)
-            else:
-                remove_axes.append(i)
-
-    # Due to current relay type system, it is possible even
-    # a static kernel function needs shape function. To handle
-    # this case, we allow axis to be None in squeeze shape func
-    # for now.
-    # TODO(kevinthesun): Enhance relay type system to avoid this.
-    if keep_axes:
-        out = _squeeze_shape_func(inputs[0], convert(keep_axes), convert(remove_axes))
-    else:
-        out = te.compute((), lambda *indices: 0)
-    return [out]
-
-
-@script
-def _reshape_like_shape_func(target_shape):
-    out = output_tensor((target_shape.shape[0],), "int64")
-    for i in const_range(target_shape.shape[0]):
-        out[i] = target_shape[i]
-
-    return out
-
-
-@_reg.register_shape_func("reshape_like", False)
-def reshape_like_shape_func(attrs, inputs, _):
-    """
-    Shape function for reshape_like op.
-    """
-    return [_reshape_like_shape_func(inputs[1])]
-
-
-@script
-def _tile_shape_func(data, reps, ndim, tndim, rndim):
-    out = output_tensor((tndim,), "int64")
-
-    if ndim == rndim:
-        for i in const_range(tndim):
-            out[i] = data[i] * int64(reps[i])
-    elif ndim > rndim:
-        ngap = ndim - rndim
-        for i in const_range(ndim):
-            if i < ngap:
-                out[i] = data[i]
-            else:
-                out[i] = data[i] * int64(reps[i - ngap])
-    else:
-        rgap = rndim - ndim
-        for i in const_range(rndim):
-            if i < rgap:
-                out[i] = int64(reps[i])
-            else:
-                out[i] = int64(reps[i]) * data[i - rgap]
-    return out
-
-
-@_reg.register_shape_func("tile", False)
-def tile_shape_func(attrs, inputs, _):
-    """
-    Shape function for tile op.
-    """
-    reps = get_const_tuple(attrs.reps)
-    ndim = inputs[0].shape[0].value
-    rndim = len(reps)
-    tndim = ndim if ndim > rndim else rndim
-    return [
-        _tile_shape_func(inputs[0], convert(reps), convert(ndim), convert(tndim), convert(rndim))
-    ]
-
-
-@script
-def _split_shape_func(data_shape, index, indices_or_sections, param_is_indices, axis):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    if param_is_indices:
-        for i in const_range(data_shape.shape[0]):
-            if i == axis:
-                assert (
-                    data_shape[axis] % indices_or_sections[0] == 0
-                ), "num_sections must be an integer factor of the size of axis"
-                out[i] = ceil_div(data_shape[axis], indices_or_sections[0])
-            else:
-                out[i] = data_shape[i]
-    else:
-        start = int64(0)
-        if index > 0:
-            start = int64(indices_or_sections[index - 1])
-        end = data_shape[axis]
-        if index < len(indices_or_sections):
-            end = int64(indices_or_sections[index])
-        for i in const_range(data_shape.shape[0]):
-            if i == axis:
-                out[i] = end - start
-            else:
-                out[i] = data_shape[i]
-    return out
-
-
-@_reg.register_shape_func("split", False)
-def split_shape_func(attrs, inputs, _):
-    """
-    Shape function for split op.
-    """
-    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
-        indices_or_sections = get_const_int(attrs.indices_or_sections)
-        assert indices_or_sections > 0, "Slice count must be > 0"
-    else:
-        indices_or_sections = list(get_const_tuple(attrs.indices_or_sections))
-        assert sorted(indices_or_sections)[0] > 0 and indices_or_sections == sorted(
-            indices_or_sections
-        ), "split_indices must be sorted"
-
-    axis = get_const_int(attrs.axis)
-
-    if axis < 0:
-        axis += get_const_int(inputs[0].shape[0])
-
-    num_out = (
-        indices_or_sections
-        if isinstance(indices_or_sections, int)
-        else len(indices_or_sections) + 1
-    )
-
-    param_is_indices = isinstance(indices_or_sections, int)
-    if param_is_indices:
-        indices_or_sections = [indices_or_sections]
-    return [
-        _split_shape_func(
-            inputs[0],
-            i,
-            indices_or_sections,
-            param_is_indices,
-            axis,
-        )
-        for i in range(num_out)
-    ]
-
-
-@script
-def _repeat_shape_func(data_shape, repeats, axis):
-    out = output_tensor((data_shape.shape[0],), "int64")
-
-    for i in const_range(data_shape.shape[0]):
-        if i == axis:
-            out[i] = int64(data_shape[i] * repeats)
-        else:
-            out[i] = data_shape[i]
-
-    return out
-
-
-@_reg.register_shape_func("repeat", False)
-def repeat_shape_func(attrs, inputs, _):
-    """
-    Shape func for repeat.
-    """
-    axis = get_const_int(attrs.axis)
-    if axis < 0:
-        axis = inputs[0].shape[0] + axis
-    return [_repeat_shape_func(inputs[0], attrs.repeats, convert(axis))]
-
-
-@_reg.register_shape_func("broadcast_to_like", False)
-def broadcast_to_like_shape_func(attrs, inputs, _):
-    """
-    Shape func for broadcast_to_like.
-    """
-    return [topi.math.identity(inputs[1])]
-
-
-@script
-def _stack_shape_func(data_shape, axis, num_inputs):
-    out = output_tensor((data_shape.shape[0] + 1,), "int64")
-
-    for i in const_range(data_shape.shape[0] + 1):
-        if i == axis:
-            out[i] = int64(num_inputs)
-        elif i < axis:
-            out[i] = data_shape[i]
-        else:
-            out[i] = data_shape[i - 1]
-
-    return out
-
-
-@_reg.register_shape_func("stack", False)
-def stack_shape_func(attrs, inputs, _):
-    """
-    Shape func for stack.
-    """
-    axis = get_const_int(attrs.axis)
-    if axis < 0:
-        axis += inputs[0].shape[0] + 1
-    return [_stack_shape_func(inputs[0], convert(axis), convert(len(inputs)))]
-
-
-@script
-def _broadcast_shape_tensors(shape_tensor1, shape_tensor2):
-    rank1 = shape_tensor1.shape[0]
-    rank2 = shape_tensor2.shape[0]
-    out_rank = max(rank1, rank2)
-    bcast_shape_tensor = output_tensor((out_rank,), "int64")
-
-    for index in const_range(out_rank):
-        dim1 = int64(1)
-        dim2 = int64(1)
-
-        if rank1 == out_rank:
-            dim1 = shape_tensor1[index]
-        elif rank1 - (out_rank - index) >= 0:
-            dim1 = shape_tensor1[rank1 - (out_rank - index)]
-
-        if rank2 == out_rank:
-            dim2 = shape_tensor2[index]
-        elif rank2 - (out_rank - index) >= 0:
-            dim2 = shape_tensor2[rank2 - (out_rank - index)]
-
-        assert dim1 == dim2 or dim1 == 1 or dim2 == 1, "Invalid broadcast shapes"
-        bcast_shape_tensor[index] = max(dim1, dim2)
-
-    return bcast_shape_tensor
-
-
-@_reg.register_shape_func("where", False)
-def where_shape_func(attrs, inputs, _):
-    """
-    Shape func for where.
-    """
-
-    def ensure_tensor(tensor):
-        if len(tensor.shape) == 0:
-            return topi.full((1,), "int64", 1)
-        return tensor
-
-    cond_shape = ensure_tensor(inputs[0])
-    x_shape = ensure_tensor(inputs[1])
-    y_shape = ensure_tensor(inputs[2])
-
-    bcast_shape = _broadcast_shape_tensors(x_shape, y_shape)
-    out_shape = _broadcast_shape_tensors(bcast_shape, cond_shape)
-
-    return [out_shape]
-
-
-@script
-def _adv_index_post_process(data_shape, bcast_shape, num_indices):
-    data_rank = data_shape.shape[0]
-    bcast_rank = bcast_shape.shape[0]
-    out = output_tensor((data_rank + bcast_rank - num_indices,), "int64")
-
-    for i in const_range(bcast_rank):
-        out[i] = bcast_shape[i]
-    for i in const_range(data_rank - num_indices):
-        out[i + bcast_rank] = data_shape[i + num_indices]
-    return out
-
-
-@_reg.register_shape_func("adv_index", False)
-def adv_index_shape_func(attrs, inputs, _):
-    """
-    Shape func for adv_index.
-    """
-    bcast_shape = inputs[1]
-    for i in inputs[2:]:
-        bcast_shape = _broadcast_shape_tensors(bcast_shape, i)
-    return [_adv_index_post_process(inputs[0], bcast_shape, convert(len(inputs) - 1))]
-
-
-@script
-def _unique_shape(data_shape):
-    unique_shape = output_tensor((1,), "int64")
-    indices_shape = output_tensor((1,), "int64")
-    inverse_indices_shape = output_tensor((1,), "int64")
-    num_unique_shape = output_tensor((1,), "int64")
-    unique_shape[0] = data_shape[0]
-    indices_shape[0] = data_shape[0]
-    inverse_indices_shape[0] = data_shape[0]
-    num_unique_shape[0] = int64(1)
-    return (unique_shape, indices_shape, inverse_indices_shape, num_unique_shape)
-
-
-@script
-def _unique_with_counts_shape(data_shape):
-    unique_shape = output_tensor((1,), "int64")
-    indices_shape = output_tensor((1,), "int64")
-    inverse_indices_shape = output_tensor((1,), "int64")
-    num_unique_shape = output_tensor((1,), "int64")
-    counts_shape = output_tensor((1,), "int64")
-    unique_shape[0] = data_shape[0]
-    indices_shape[0] = data_shape[0]
-    inverse_indices_shape[0] = data_shape[0]
-    num_unique_shape[0] = int64(1)
-    counts_shape[0] = data_shape[0]
-    return (unique_shape, indices_shape, inverse_indices_shape, num_unique_shape, counts_shape)
-
-
-@_reg.register_shape_func("unique", False)
-def unique_shape_func(attrs, inputs, _):
-    """
-    Shape func for unique operator.
-    """
-    if attrs.return_counts:
-        return _unique_with_counts_shape(inputs[0])
-    else:
-        return _unique_shape(inputs[0])
-
-
-@script
-def _gather_nd_shape(data_shape, indices_shape, batch_dims, index_rank):
-    ndim = data_shape.shape[0]
-    # using mdim = indices_shape[0] wouldn't work because a rank cannot
-    # depend on a runtime shape dimension of indices tensor, even if the
-    # dimension is always a known, fixed value. As a workaround, we assume that
-    # the fixed gather dimension (the size of an indexing tuple) is recorded
-    # in gather_nd op attributes.
-    mdim = index_rank
-    kdim = indices_shape.shape[0] - 1
-    out_shape = output_tensor((kdim + ndim - (mdim + batch_dims),), "int64")
-    for i in range(1, kdim + 1):
-        out_shape[i - 1] = indices_shape[i]
-    for i in range(mdim + batch_dims, ndim):
-        out_shape[kdim + i - (mdim + batch_dims)] = data_shape[i]
-    return out_shape
-
-
-@_reg.register_shape_func("gather_nd", False)
-def gather_nd_shape_func(attrs, inputs, _):
-    """
-    Shape func for gather_nd operator.
-    """
-    batch_dims = get_const_int(attrs.batch_dims)
-    index_rank = get_const_int(attrs.index_rank)
-
-    assert index_rank > 0, "index_rank needs to be specified for dynamic gather_nd"
-
-    return [_gather_nd_shape(inputs[0], inputs[1], convert(batch_dims), convert(index_rank))]
-
-
-@script
-def _gather_shape(data_shape, indices_shape, axis):
-    out_shape = output_tensor((data_shape.shape[0],), "int64")
-    for i in range(data_shape.shape[0]):
-        if i != axis:
-            assert (
-                data_shape[i] == indices_shape[i]
-            ), "data and indices size at non-gather axes must be the same"
-        out_shape[i] = indices_shape[i]
-    return out_shape
-
-
-@_reg.register_shape_func("gather", False)
-def gather_shape_func(attrs, inputs, _):
-    """
-    Shape func for gather operator.
-    """
-    return [_gather_shape(inputs[0], inputs[1], attrs.axis)]
diff --git a/python/tvm/relay/op/algorithm.py b/python/tvm/relay/op/algorithm.py
deleted file mode 100644
index 9be62150bb53..000000000000
--- a/python/tvm/relay/op/algorithm.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Classic algorithm operation"""
-from __future__ import absolute_import as _abs
-
-from ..expr import Constant, Expr, TupleWrapper
-from . import _make
-from .dyn import _make as _dyn_make
-
-
-def sort(data, axis=-1, is_ascend=1):
-    """Performs sorting along the given axis and returns data in sorted order.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data tensor.
-
-    axis : int, optional
-        Axis long which to sort the input tensor.
-
-    is_ascend : boolean, optional
-        Whether to sort in ascending or descending order.
-
-    Returns
-    -------
-    out : relay.Expr
-        Tensor with same shape as data.
-    """
-    return _make.sort(data, axis, is_ascend)
-
-
-def argsort(data, axis=-1, is_ascend=1, dtype="int32"):
-    """Performs sorting along the given axis and returns an array of indices
-    having same shape as an input array that index data in sorted order.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data tensor.
-
-    valid_count : tvm.te.Tensor
-        The number of valid elements to be sorted.
-
-    axis : int, optional
-        Axis long which to sort the input tensor.
-
-    is_ascend : boolean, optional
-        Whether to sort in ascending or descending order.
-
-    dtype : string, optional
-        The data type of the output indices.
-
-    Returns
-    -------
-    out : relay.Expr
-        Tensor with same shape as data.
-    """
-    return _make.argsort(data, axis, is_ascend, dtype)
-
-
-def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int32"):
-    """Get the top k elements in an input tensor along the given axis.
-
-    ret_type specifies the return type, can be one of ("both", "values", "indices").
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data tensor.
-
-    k : int or relay.Expr, optional
-        Number of top elements to select. Return all elements if k < 1.
-
-    axis : int, optional
-        Axis long which to sort the input tensor.
-
-    ret_type: str, optional
-        The return type [both, values, indices].
-        "both": return both top k data and indices.
-        "values": return top k data only.
-        "indices": return top k indices only.
-
-    is_ascend : boolean, optional
-        Whether to sort in ascending or descending order.
-
-    dtype : string, optional
-        The data type of the indices output.
-
-    Returns
-    -------
-    out : relay.Expr or List[relay.Expr]
-        The computed result.
-    """
-    if isinstance(k, Constant):
-        k = k.data.numpy().item()
-    if isinstance(k, Expr):
-        out = _dyn_make.topk(data, k, axis, ret_type, is_ascend, dtype)
-    else:
-        out = _make.topk(data, k, axis, ret_type, is_ascend, dtype)
-    if ret_type == "both":
-        return TupleWrapper(out, 2)
-    return out
-
-
-def searchsorted(sorted_sequence, values, right=False, dtype="int32"):
-    """Find indices where elements should be inserted to maintain order.
-       If `sorted_sequence` is N-dimensional, the innermost dimension of
-       `values` are searched in the corresponding dimension of `sorted_sequence`.
-
-    Parameters
-    ----------
-    sorted_sequence : relay.Expr
-        N-D or 1-D Tensor, containing monotonically increasing sequence
-        on the innermost dimension.
-
-    values : relay.Expr
-        N-D Tensor containing the search values. When `sorted_sequence` is 1-D,
-        the shape of `values` can be arbitrary. Otherwise, ranks of `sorted_sequence`
-        and `values` must be the same, and outer N-1 axes must have the same size.
-
-    right : bool, optional
-        Controls which index is returned if a value lands exactly on one of sorted values. If
-        False, the index of the first suitable location found is given. If true, return the
-        last such index. If there is no suitable index, return either 0 or N (where N is the
-        size of the innermost dimension).
-
-    dtype : string, optional
-        The data type of the output indices.
-
-    Returns
-    -------
-    indices : relay.Expr
-        Tensor with same shape as values, representing the indices of
-        elements of `values` if they are inserted in `sorted_sequence`.
-    """
-    return _make.searchsorted(sorted_sequence, values, right, dtype)
diff --git a/python/tvm/relay/op/annotation/__init__.py b/python/tvm/relay/op/annotation/__init__.py
deleted file mode 100644
index 366c5617092a..000000000000
--- a/python/tvm/relay/op/annotation/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Annotation related operators."""
-from __future__ import absolute_import as _abs
-from .annotation import *
diff --git a/python/tvm/relay/op/annotation/_make.py b/python/tvm/relay/op/annotation/_make.py
deleted file mode 100644
index 12ece522c854..000000000000
--- a/python/tvm/relay/op/annotation/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.annotation._make", __name__)
diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py
deleted file mode 100644
index 71a434917e72..000000000000
--- a/python/tvm/relay/op/annotation/annotation.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Annotation operations."""
-from tvm import target
-from tvm.runtime import ndarray as _nd
-from tvm.runtime import Device as _Device
-
-from . import _make
-from .. import op as reg
-
-
-def _make_virtual_device(device):
-    if isinstance(device, _Device):
-        return target.VirtualDevice(device)
-    if isinstance(device, str):
-        return target.VirtualDevice(_nd.device(device))
-    if isinstance(device, target.VirtualDevice):
-        return device
-    raise ValueError(f"expecting a Device or device name, but received a {type(device)}")
-
-
-def on_device(body, device, constrain_result=False, constrain_body=True):
-    """Annotates a body expression with device constraints. The constraint influences
-    how the body is compiled, where the body is evaluated, and where the result of
-    evaluation is stored.
-
-    Note that the defaults for the constrain_body and constrain_result parameters should
-    almost never need to be overridden by the user. These parameters are exposed here
-    to help unit tests exercise the PlanDevices pass machinery.
-
-    Parameters
-    ----------
-    body : tvm.relay.Expr
-        The expression to be annotated.
-
-    device : Union[:py:class:`Device`, str]
-        The device to annotate with.
-
-    constrain_result  : bool
-        If false (the default), the result of the on_device is not constrained to be on device.
-
-    constrain_body : bool
-        If true (the default), the body of the on_device is constrained to be on device.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The annotated expression.
-    """
-    return _make.OnDevice(body, _make_virtual_device(device), constrain_result, constrain_body)
-
-
-def function_on_device(function, param_devices, result_device):
-    """Annotates a Relay function with the device types on which its parameters and result should
-    be stored.
-
-    Parameters
-    ----------
-    function : tvm.relay.Function
-        The function to be annotated.
-
-    param_devices : Array[Union[:py:class:`Device`, str]]
-        The devices for each parameter.
-
-    result_device: Union[:py:class:`Device`, str]
-        The device for the function result.
-
-    Returns
-    -------
-    result : tvm.relay.Function
-        The annotated function.
-    """
-    return _make.FunctionOnDevice(
-        function,
-        [_make_virtual_device(d) for d in param_devices],
-        _make_virtual_device(result_device),
-    )
-
-
-def stop_fusion(data):
-    """Annotate an expression to prevent it being fused with following expressions.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The expression to be annotated.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The annotated expression.
-    """
-    return _make.stop_fusion(data)
-
-
-def checkpoint(data):
-    """Annotate an expression to be a checkpoint for the checkpointing memory optimization.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The expression to be annotated.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The annotated expression.
-    """
-    return _make.checkpoint(data)
-
-
-reg.register_injective_schedule("annotation.checkpoint")
-
-
-def compiler_begin(data, compiler):
-    """Annotate an expression to indicate that it is the beginning of
-    a regeion that will be handled by the given compiler.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The expression to be annotated.
-
-    compiler : Str
-        The compiler used to generate code of the annotated region.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The annotated expression.
-    """
-    return _make.compiler_begin(data, compiler)
-
-
-def compiler_end(data, compiler):
-    """Annotate an expression to indicate that it is the end of a region that
-    is handled by the provided compiler.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The expression to be annotated.
-
-    compiler : Str
-        The compiler used to generate code of the annotated region.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The annotated expression.
-    """
-    return _make.compiler_end(data, compiler)
diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py
deleted file mode 100644
index 33cf449db0ab..000000000000
--- a/python/tvm/relay/op/contrib/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Contrib modules."""
-from .register import get_pattern_table, register_pattern_table
-
-from .arm_compute_lib import *
-from .dnnl import *
-from .bnns import *
-from .coreml import *
-from .libtorch import *
-from .tensorrt import *
-from .cutlass import *
-from .clml import *
-from .mrvl import *
diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
deleted file mode 100644
index 6b8d000c6664..000000000000
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ /dev/null
@@ -1,573 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, dangerous-default-value
-"""Arm Compute Library supported operators."""
-import tvm
-from tvm import relay
-from tvm._ffi import register_func
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.expr import const
-
-from ...dataflow_pattern import is_constant, is_expr, is_op, is_tuple, wildcard
-from ..strategy.generic import is_depthwise_conv2d
-from .register import register_pattern_table
-
-
-def is_arm_compute_runtime_enabled():
-    """Check if the ACL graph executor is present.
-
-    Returns
-    -------
-    ret: bool
-        True if present, False if not.
-    """
-    check_enabled = tvm.get_global_func("relay.op.is_arm_compute_runtime_enabled", True)
-    if check_enabled:
-        return check_enabled()
-    return False
-
-
-def partition_for_arm_compute_lib(mod, params=None, disabled_ops=["concatenate"], **opts):
-    """Partition the graph greedily offloading supported
-    operators to Arm Compute Library.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-    params : Optional[Dict[str, NDArray]]
-        Constant input parameters.
-    disabled_ops : Optional[list]
-        Ops do not want to offload to ACL.
-
-    Returns
-    -------
-    ret : annotated and partitioned module.
-    """
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(arm_compute_lib_pattern_table(disabled_ops)),
-            transform.AnnotateTarget("arm_compute_lib", False),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    return seq(mod)
-
-
-@register_func("relay.ext.arm_compute_lib.optimize")
-def preprocess_module(mod):
-    """
-    Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI
-    kernel layout and fold the transforms away.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-
-    Returns
-    -------
-    preprocessed_mod : The processed module.
-    """
-
-    def convert_layout_conv2d(conv2d_function):
-        def convert_conv(attrs, inputs, tinfos, desired_layouts):
-            new_attrs = dict(attrs)
-            data_info = tinfos[0]
-            weight_info = tinfos[1]
-            desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-            new_attrs["data_layout"] = desired_data_layout
-            new_attrs["kernel_layout"] = desired_kernel_layout
-
-            if is_depthwise_conv2d(
-                data_info.shape,
-                attrs["data_layout"],
-                weight_info.shape,
-                attrs["kernel_layout"],
-                attrs["groups"],
-            ):
-                dkl = desired_kernel_layout
-                new_attrs["kernel_layout"] = dkl[3] + dkl[1:3] + dkl[0]
-            return conv2d_function(*inputs, **new_attrs)
-
-        return convert_conv
-
-    with OpAttrContext(
-        "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d)
-    ), OpAttrContext(
-        "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d)
-    ):
-        seq = tvm.transform.Sequential(
-            [
-                transform.ConvertLayout(
-                    {"nn.conv2d": ["NHWC", "OHWI"], "qnn.conv2d": ["NHWC", "OHWI"]}
-                ),
-                transform.FoldConstant(),
-            ]
-        )
-        preprocessed_mod = seq(mod)
-    return preprocessed_mod
-
-
-@register_pattern_table("arm_compute_lib")
-def arm_compute_lib_pattern_table(disabled_ops=["concatenate"]):
-    """Get the ACL pattern table."""
-
-    def conv_pattern():
-        """Create a convolution pattern.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution pattern.
-        """
-        pattern = is_op("nn.pad")(wildcard(), wildcard()) | wildcard()
-        pattern = is_op("nn.conv2d")(pattern, is_constant())
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = pattern.optional(is_op("nn.relu"))
-        return pattern
-
-    def qnn_conv_pattern():
-        """Create a quantized convolution pattern.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution pattern.
-        """
-        pattern = is_op("nn.pad")(wildcard(), wildcard()) | wildcard()
-        pattern = is_op("qnn.conv2d")(
-            pattern, is_constant(), is_constant(), is_constant(), is_constant(), is_constant()
-        )
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = pattern.optional(is_op("nn.relu"))
-        pattern = is_op("qnn.requantize")(
-            pattern, wildcard(), wildcard(), is_constant(), is_constant()
-        )
-        return pattern
-
-    def dense_pattern():
-        """Create a dense (fully-connected) pattern.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution pattern.
-        """
-        pattern = is_op("nn.dense")(wildcard(), is_constant())
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        return pattern
-
-    def qnn_dense_pattern():
-        """Create a quantized dense (fully-connected) pattern.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution pattern.
-        """
-        pattern = is_op("qnn.dense")(
-            wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant()
-        )
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = is_op("qnn.requantize")(
-            pattern, wildcard(), wildcard(), is_constant(), is_constant()
-        )
-        return pattern
-
-    def avg_pool2d_pattern():
-        """Creates a pattern that matches either quantized
-        avg_pool2d or quantized global_avg_pool2d.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution pattern.
-        """
-        pattern = is_op("cast")(wildcard())
-        pattern = is_op("nn.avg_pool2d")(pattern) | is_op("nn.global_avg_pool2d")(pattern)
-        pattern = is_op("cast")(pattern)
-        return pattern
-
-    def l2_pool2d_pattern():
-        """Create an l2 pooling pattern from equivalent relay operators.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution pattern.
-        """
-        pattern = is_op("power")(wildcard(), is_expr(const(2.0)))
-        pattern = is_op("nn.avg_pool2d")(pattern)
-        pattern = is_op("sqrt")(pattern)
-        return pattern
-
-    def concatenate_pattern():
-        """Create an concatenate pattern from equivalent relay operators.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the concatenate pattern.
-        """
-        pattern = is_op("concatenate")(is_tuple(None))
-        return pattern
-
-    def check_conv(extract):
-        """Check conv pattern is supported by ACL."""
-        call = extract
-        while call.op.name != "nn.conv2d":
-            call = call.args[0]
-        return conv2d(call)
-
-    def check_qnn_conv(extract):
-        """Check qnn conv pattern is supported by ACL."""
-        if extract.attrs.out_dtype not in ("uint8", "int8"):
-            return False
-        call = extract
-        while call.op.name != "qnn.conv2d":
-            call = call.args[0]
-        return qnn_conv2d(call)
-
-    def check_dense(extract):
-        """Check conv pattern is supported by ACL."""
-        call = extract
-        while call.op.name != "nn.dense":
-            call = call.args[0]
-        return dense(call)
-
-    def check_qnn_dense(extract):
-        """Check qnn conv pattern is supported by ACL."""
-        if extract.attrs.out_dtype not in ("uint8", "int8"):
-            return False
-        call = extract
-        while call.op.name != "qnn.dense":
-            call = call.args[0]
-        return qnn_dense(call)
-
-    def check_avg_pool2d(extract):
-        """Check average pool2d pattern is supported by ACL."""
-        if extract.attrs.dtype not in ("uint8", "int8"):
-            return False
-        pool = extract.args[0]
-        if pool.args[0].attrs.dtype != "int32":
-            return False
-        return avg_pool2d(pool, from_quantized_composite=True)
-
-    def check_l2_pool2d(extract):
-        """Check l2 pool2d pattern is supported by ACL."""
-        pool = extract.args[0]
-        return avg_pool2d(pool)
-
-    def check_concatenate(expr):
-        """Check concatenate pattern is supported by ACL."""
-        if "concatenate" in disabled_ops:
-            return False
-        attrs, type_args = expr.attrs, expr.type_args
-        for idx in range(len(type_args[0].fields)):
-            if type_args[0].fields[idx].dtype not in ["float32", "uint8", "int8"]:
-                return False
-        # ACL concatenate only supports maximum 4 dimensions input tensor
-        if attrs.axis not in [-4, -3, -2, -1, 0, 1, 2, 3]:
-            return False
-        return True
-
-    return [
-        ("arm_compute_lib.conv2d", conv_pattern(), check_conv),
-        ("arm_compute_lib.qnn_conv2d", qnn_conv_pattern(), check_qnn_conv),
-        ("arm_compute_lib.dense", dense_pattern(), check_dense),
-        ("arm_compute_lib.qnn_dense", qnn_dense_pattern(), check_qnn_dense),
-        ("arm_compute_lib.qnn_conv2d", qnn_conv_pattern(), check_qnn_conv),
-        ("arm_compute_lib.avg_pool2d", avg_pool2d_pattern(), check_avg_pool2d),
-        ("arm_compute_lib.l2_pool2d", l2_pool2d_pattern(), check_l2_pool2d),
-        ("arm_compute_lib.concatenate", concatenate_pattern(), check_concatenate),
-    ]
-
-
-def _register_external_op_helper(op_name, supported=True):
-    @tvm.ir.register_op_attr(op_name, "target.arm_compute_lib")
-    def _func_wrapper(expr):
-        return supported
-
-    return _func_wrapper
-
-
-_register_external_op_helper("reshape")
-
-
-@tvm.ir.register_op_attr("nn.conv2d", "target.arm_compute_lib")
-def conv2d(expr):
-    """Check if the external ACL codegen for conv2d should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.data_layout != "NHWC":
-        return False
-    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
-        return False
-    data_typ = args[0].checked_type
-    if len(data_typ.shape) != 4 or data_typ.shape[0] != 1 or data_typ.dtype != "float32":
-        return False
-    kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32":
-        return False
-    is_depthwise = is_depthwise_conv2d(
-        data_typ.shape,
-        attrs["data_layout"],
-        kernel_typ.shape,
-        attrs["kernel_layout"],
-        attrs["groups"],
-    )
-    if is_depthwise:
-        return depthwise_conv2d(attrs, args)
-    # ACL doesn't support grouped convolution
-    if attrs.groups != 1 and not is_depthwise:
-        return False
-    return True
-
-
-def qnn_conv2d(expr):
-    """Check if the external ACL codegen for qnn.conv2d should be used."""
-    attrs, args = expr.attrs, expr.args
-    qnn_dtypes = ("uint8", "int8")
-
-    if attrs.data_layout != "NHWC":
-        return False
-    if attrs.out_dtype != "int32" and attrs.out_dtype != "":
-        return False
-    data_typ = args[0].checked_type
-    if len(data_typ.shape) != 4 or data_typ.shape[0] != 1 or data_typ.dtype not in qnn_dtypes:
-        return False
-    kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 4 or kernel_typ.dtype not in qnn_dtypes:
-        return False
-    if is_per_channel_quantization(
-        zero_point=args[2], scale=args[4]
-    ) or is_per_channel_quantization(zero_point=args[3], scale=args[5]):
-        return False
-    is_depthwise = is_depthwise_conv2d(
-        data_typ.shape,
-        attrs["data_layout"],
-        kernel_typ.shape,
-        attrs["kernel_layout"],
-        attrs["groups"],
-    )
-    if is_depthwise:
-        return depthwise_conv2d(attrs, args)
-    # ACL doesn't support grouped convolution
-    if attrs.groups != 1 and not is_depthwise:
-        return False
-    return True
-
-
-def depthwise_conv2d(attrs, args):
-    """Check if the external ACL codegen for depthwise convolution should be used.
-
-    Note
-    ----
-    Relay does not have a depthwise conv2d operator whilst ACL does. We simply
-    separate the checks for depthwise for clarity.
-    """
-    kernel_typ = args[1].checked_type
-    # Only supports 3x3, 5x5 depthwise
-    if (
-        kernel_typ.shape[0] not in [3, 5]
-        or kernel_typ.shape[1] not in [3, 5]
-        or kernel_typ.shape[0] != kernel_typ.shape[1]
-    ):
-        return False
-    # Stride must be (1, 1) or (2, 2)
-    if (attrs.strides[0], attrs.strides[1]) not in [(1, 1), (2, 2)]:
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("nn.dense", "target.arm_compute_lib")
-def dense(expr):
-    """Check if the external ACL codegen for dense should be used."""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    if data_typ.dtype != "float32":
-        return False
-    kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32":
-        return False
-    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
-        return False
-    return True
-
-
-def qnn_dense(expr):
-    """Check if the external ACL codegen for qnn.dense should be used."""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    if data_typ.dtype not in ("uint8", "int8"):
-        return False
-    kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 2 or kernel_typ.dtype not in ("uint8", "int8"):
-        return False
-    if attrs.out_dtype != "int32":
-        return False
-    if is_per_channel_quantization(
-        zero_point=args[2], scale=args[4]
-    ) or is_per_channel_quantization(zero_point=args[3], scale=args[5]):
-        return False
-    return True
-
-
-def check_dilation(attrs):
-    """Prevents offloading if dilation other than (1, 1)"""
-    if not isinstance(attrs, relay.op.op_attrs.GlobalPool2DAttrs):
-        if not (len(attrs.dilation) == 2 and attrs.dilation[0] == 1 and attrs.dilation[1] == 1):
-            return False
-    return True
-
-
-@tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib")
-def max_pool2d(expr):
-    """Check if the external ACL codegen for maxpool2d should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.layout != "NHWC":
-        return False
-    typ = args[0].checked_type
-    if typ.dtype not in ["float32", "uint8", "int8"]:
-        return False
-    return check_dilation(attrs)
-
-
-@tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib")
-def avg_pool2d(expr, from_quantized_composite=False):
-    """Check if the external ACL codegen for avgpool2d should be used."""
-    attrs, args = expr.attrs, expr.args
-    typ = args[0].checked_type
-
-    if from_quantized_composite:
-        if typ.dtype != "int32":
-            return False
-    else:
-        if typ.dtype not in ["float32"]:
-            return False
-    if attrs.layout != "NHWC":
-        return False
-
-    return check_dilation(attrs)
-
-
-@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib")
-def global_max_pool2d(expr):
-    """Check if the external ACL codegen for gloval_maxpool2d should be used."""
-    attrs, args = expr.attrs, expr.args
-    typ = args[0].checked_type
-    if typ.dtype not in ["float32", "uint8", "int8"]:
-        return False
-    if attrs.layout != "NHWC":
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib")
-def global_avg_pool2d(expr):
-    """Check if the external ACL codegen for global_avgpool2d should be used."""
-    attrs, args = expr.attrs, expr.args
-    typ = args[0].checked_type
-    if typ.dtype not in ["float32"]:
-        return False
-    if attrs.layout != "NHWC":
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("maximum", "target.arm_compute_lib")
-def maximum(expr):
-    """Check if the external ACL codegen for maximum should be used."""
-    args = expr.args
-    type_a = args[0].checked_type
-    type_b = args[0].checked_type
-    return (type_a.dtype == "float32") and (type_b.dtype == "float32")
-
-
-@tvm.ir.register_op_attr("add", "target.arm_compute_lib")
-def add(expr):
-    """Check if the external ACL codegen for add should be used."""
-    args = expr.args
-    for typ in [args[0].checked_type, args[1].checked_type]:
-        if typ.dtype != "float32":
-            return False
-
-    return True
-
-
-@tvm.ir.register_op_attr("qnn.add", "target.arm_compute_lib")
-def qnn_add(expr):
-    """Check if the external ACL codegen for add should be used."""
-    args = expr.args
-    for typ in [args[0].checked_type, args[1].checked_type]:
-        if typ.dtype not in ["int8", "uint8"]:
-            return False
-    if (
-        is_per_channel_quantization(zero_point=args[3], scale=args[2])
-        or is_per_channel_quantization(zero_point=args[5], scale=args[4])
-        or is_per_channel_quantization(zero_point=args[7], scale=args[6])
-    ):
-        return False
-    return True
-
-
-def is_per_channel_quantization(zero_point, scale):
-    """Check if the quantization is per-channel"""
-    for value in [zero_point, scale]:
-        shape = value.checked_type.shape
-        if len(shape) != 0 and shape[0] != 1:
-            return True
-    return False
-
-
-class OpAttrContext(object):
-    """Temporarily changes the attr of an op."""
-
-    def __init__(self, op_name, attr_key, attr_value):
-        """Saves the required info for RAII pattern usage.
-
-        Parameters
-        ----------
-        op_name : str
-            The op name.
-
-        attr_key : str
-            The attribute name.
-
-        attr_value : object
-            The attribute value.
-        """
-        self.op = relay.op.get(op_name)
-        self.attr_key = attr_key
-        self.attr_value = attr_value
-
-    def __enter__(self):
-        self.older_attr = self.op.get_attr(self.attr_key)
-        self.op.reset_attr(self.attr_key)
-        self.op.set_attr(self.attr_key, self.attr_value)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        self.op.reset_attr(self.attr_key)
-        if self.older_attr:
-            self.op.set_attr(self.attr_key, self.older_attr)
diff --git a/python/tvm/relay/op/contrib/bnns.py b/python/tvm/relay/op/contrib/bnns.py
deleted file mode 100644
index 2ace502e6528..000000000000
--- a/python/tvm/relay/op/contrib/bnns.py
+++ /dev/null
@@ -1,327 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""BNNS library supported operators.
-Is a part of Accelerate framework on macOS/iOS platforms. Apple provide several APIs
-to handle tensor processing. Particularly:
- * BNNS (basic neural )
- * vDSP (1D and 2D tensor processing)
-"""
-import math
-import tvm.ir
-
-from tvm.relay import transform
-from tvm.relay.expr import const
-from tvm.relay.build_module import bind_params_by_name
-
-from .register import register_pattern_table, get_pattern_table
-from ...dataflow_pattern import wildcard, is_op, is_expr
-
-
-def partition_for_bnns(mod, params=None):
-    """Partition the graph greedily offloading supported
-    operators to BNNS.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-    params : Optional[Dict[str, NDArray]]
-        Constant input parameters.
-
-    Returns
-    -------
-    ret : annotated and partitioned module.
-    """
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.FoldConstant(),
-            transform.FoldScaleAxis(),
-            transform.DynamicToStatic(),
-            transform.AlterOpLayout(),
-            # TODO(apeskov): WA. AlterOpLayout call lead to constants shape transformation
-            #   Some expand_dims op may appears after constants. It breaks BNNS fusing.
-            #   So we have to call FoldConstant right before bnns composite passes.
-            transform.FoldConstant(),
-            transform.MergeComposite(get_pattern_table("bnns")),
-            transform.AnnotateTarget("bnns"),
-            #   If you no need in per layer performance statistic you can
-            #   uncomment next line
-            # transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    return seq(mod)
-
-
-def _register_external_op_helper(op_name, supported=True):
-    """The helper function to indicate that a given operator can be supported
-    by BNNS.
-
-    Parameters
-    ----------
-    op_name : Str
-        The name of supported operator that will be registered.
-
-    Returns
-    -------
-    f : callable
-        A function that returns if the operator is supported by BNNS.
-    """
-
-    @tvm.ir.register_op_attr(op_name, "target.bnns")
-    def _func_wrapper(expr):
-        return supported
-
-    return _func_wrapper
-
-
-_register_external_op_helper("nn.batch_matmul")
-
-
-@tvm.ir.register_op_attr("nn.max_pool2d", "target.bnns")
-def max_pool2d_check(expr):
-    """Check if the nn.max_pool2d can be executed in BNNS"""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    rank = len(data_typ.shape)
-    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
-        return False
-    if attrs.layout != "NCHW":
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("nn.avg_pool2d", "target.bnns")
-def avg_pool2d_check(expr):
-    """Check if the nn.avg_pool2d can be executed in BNNS"""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    rank = len(data_typ.shape)
-    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
-        return False
-    if attrs.layout != "NCHW":
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.bnns")
-def global_max_pool2d_check(expr):
-    """Check if the nn.global_max_pool2d can be executed in BNNS"""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    rank = len(data_typ.shape)
-    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
-        return False
-    if attrs.layout != "NCHW":
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.bnns")
-def global_avg_pool2d_check(expr):
-    """Check if the nn.global_avg_pool2d can be executed in BNNS"""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    rank = len(data_typ.shape)
-    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
-        return False
-    if attrs.layout != "NCHW":
-        return False
-    return True
-
-
-def dtype_is_supported(dtype):
-    """Check if data type is supported by BNNS backend"""
-    return dtype in ("", "float32")
-
-
-@tvm.ir.register_op_attr("nn.conv2d", "target.bnns")
-def conv2d_check(expr):
-    """Check if the conv2d can be executed in BNNS"""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    if len(data_typ.shape) != 4 or data_typ.dtype != "float32":
-        return False
-    if not isinstance(args[1], tvm.relay.expr.Constant):
-        return False
-    kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32":
-        return False
-    if attrs.data_layout != "NCHW":
-        return False
-    if not dtype_is_supported(attrs.out_dtype):
-        return False
-    return True
-
-
-def bias_check(expr):
-    """Check is bias added through the correct dimension"""
-    attrs, args = expr.attrs, expr.args
-    if not isinstance(args[1], tvm.relay.expr.Constant):
-        return False
-    if expr.op.name == "nn.bias_add":
-        return attrs.axis == 1
-    if expr.op.name == "add":
-        b_shape = args[1].checked_type.shape
-        if len(b_shape) == 4:
-            return bool(b_shape[0] == 1 and b_shape[2] == 1 and b_shape[3] == 1)
-        if len(b_shape) == 3:
-            return bool(b_shape[1] == 1 and b_shape[2] == 1)
-
-    return False
-
-
-@tvm.ir.register_op_attr("nn.dense", "target.bnns")
-def dense(expr):
-    """Check if the dense can be used in BNNS."""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    if data_typ.dtype != "float32":
-        return False
-    if not isinstance(args[1], tvm.relay.expr.Constant):
-        return False
-    kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32":
-        return False
-    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
-        return False
-    return True
-
-
-def make_conv_pattern(with_bias=True, activation="none"):
-    """Make pattern for bnns.conv2d primitive"""
-    data = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    pat = is_op("nn.conv2d")(data, weight)
-    if with_bias:
-        pat = is_op("add")(pat, bias) | is_op("nn.bias_add")(pat, bias)
-    if activation == "relu":
-        pat = is_op("nn.relu")(pat)
-    elif activation == "sigmoid":
-        pat = is_op("sigmoid")(pat)
-    return pat
-
-
-def check_conv(extract):
-    """Check conv pattern is supported by BNNS."""
-    bias_is_ok = True
-    call = extract
-    while call.op.name != "nn.conv2d":
-        if call.op.name in ("nn.bias_add", "add"):
-            bias_is_ok &= bias_check(call)
-        call = call.args[0]
-    return conv2d_check(call) and bias_is_ok
-
-
-def make_dense_bias_pattern():
-    """Make pattern for bnns.dense primitive"""
-    data = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    d = is_op("nn.dense")(data, weight)
-    return is_op("add")(d, bias)
-
-
-def make_dense_bias_gelu_pattern():
-    """Make pattern for bnns.dense primitive with fused bias and gelu activation"""
-    dense_bias = make_dense_bias_pattern()
-    const1 = is_expr(const(0.044715))
-    const2 = is_expr(const(math.sqrt(2 / math.pi)))
-
-    gelu = is_op("power")(dense_bias, is_expr(const(3, dtype="float32")))
-    gelu = is_op("multiply")(gelu, const1)
-    gelu = is_op("add")(gelu, dense_bias)
-    gelu = is_op("multiply")(gelu, const2)
-    gelu = is_op("tanh")(gelu)
-    gelu = is_op("add")(gelu, is_expr(const(1, dtype="float32")))
-    gelu = is_op("multiply")(gelu, is_expr(const(0.5)))
-    gelu = is_op("multiply")(gelu, dense_bias)
-    return gelu
-
-
-def check_dense(extract):
-    """Check dense pattern is supported by BNNS."""
-    call = extract
-    while call.op.name != "nn.dense":
-        call = call.args[0]
-    return dense(call)
-
-
-@tvm.ir.register_op_attr("nn.instance_norm", "target.bnns")
-def instance_norm_check(expr):
-    """Check if the nn.instance_norm can be executed in BNNS"""
-    attrs, args = expr.attrs, expr.args
-    data_typ = args[0].checked_type
-    rank = len(data_typ.shape)
-    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
-        return False
-    if not isinstance(args[1], tvm.relay.expr.Constant) or not isinstance(
-        args[2], tvm.relay.expr.Constant
-    ):
-        return False
-    if attrs.axis == 0 and rank == 3 or attrs.axis == 1 and rank == 4:
-        return True
-    return False
-
-
-@register_pattern_table("bnns")
-def pattern_table():
-    """Get BNNS specific fusing patterns collection"""
-    conv2d_bias_pat = (
-        "bnns.conv2d_bias",
-        make_conv_pattern(with_bias=True),
-        check_conv,
-    )
-    conv2d_bias_relu_pat = (
-        "bnns.conv2d_bias_relu",
-        make_conv_pattern(with_bias=True, activation="relu"),
-        check_conv,
-    )
-    conv2d_relu_pat = (
-        "bnns.conv2d_relu",
-        make_conv_pattern(with_bias=False, activation="relu"),
-        check_conv,
-    )
-    conv2d_bias_sigmoid_pat = (
-        "bnns.conv2d_bias_sigmoid",
-        make_conv_pattern(with_bias=True, activation="sigmoid"),
-        check_conv,
-    )
-    conv2d_sigmoid_pat = (
-        "bnns.conv2d_sigmoid",
-        make_conv_pattern(with_bias=False, activation="sigmoid"),
-        check_conv,
-    )
-    dense_bias_gelu = ("bnns.dense_bias_gelu", make_dense_bias_gelu_pattern(), check_dense)
-    dense_bias = ("bnns.dense_bias", make_dense_bias_pattern(), check_dense)
-    bnns_patterns = [
-        conv2d_bias_relu_pat,
-        conv2d_relu_pat,
-        conv2d_bias_sigmoid_pat,
-        conv2d_sigmoid_pat,
-        conv2d_bias_pat,
-        dense_bias_gelu,
-        dense_bias,
-    ]
-    return bnns_patterns
diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
deleted file mode 100644
index 3f79acbdc8f1..000000000000
--- a/python/tvm/relay/op/contrib/clml.py
+++ /dev/null
@@ -1,1418 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, pointless-exception-statement.
-"""CLML Library supported operators."""
-import json
-import os
-from string import Template
-import numpy as np
-import tvm
-
-from tvm import relay
-from tvm.ir import Op
-from tvm._ffi import register_func
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay import function as _function
-from tvm.relay.expr_functor import ExprMutator
-from tvm.relay.expr import Call, TupleGetItem, Var, Constant
-from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule
-
-from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple
-from .register import register_pattern_table
-from ..strategy.generic import is_depthwise_conv2d
-
-
-def clml_sdk_version():
-    """Utility function to get clml version"""
-
-    return int(tvm.support.libinfo().get("TVM_CLML_VERSION", 2))
-
-
-def is_clml_runtime_enabled():
-    """Check if the CLML graph runtime is present.
-
-    Returns
-    -------
-    ret: bool
-        True if present, False if not.
-    """
-    check_enabled = tvm.get_global_func("relay.op.is_clml_runtime_enabled", True)
-    if check_enabled:
-        return check_enabled()
-    return False
-
-
-class RemoveDropout(ExprMutator):
-    """
-    Removes all nn.dropout from an expr.
-    """
-
-    def visit_tuple_getitem(self, op: TupleGetItem) -> relay.expr.Expr:
-        visit = super().visit_tuple_getitem(op)
-        if visit.index != 0:
-            return visit
-        if (
-            isinstance(visit.tuple_value, Call)
-            and isinstance(visit.tuple_value.op, Op)
-            and visit.tuple_value.op.name == "nn.dropout"
-            and visit.index == 0
-        ):
-            return visit.tuple_value.args[0]
-        return visit
-
-
-@transform.function_pass(opt_level=0)
-class RemoveDropoutPass:
-    def transform_function(
-        self, func: relay.function.Function, mod: tvm.IRModule, _: tvm.transform.PassContext
-    ) -> relay.function.Function:
-        return RemoveDropout().visit(func)
-
-
-class OptimizeBatchnorm(ExprMutator):
-    """
-    Fuse Conv+Batchnorm and constant folder to generate Conv+Add.
-    """
-
-    def visit_call(self, call) -> relay.expr.Expr:
-        new_args = []
-        for arg in call.args:
-            if (
-                not isinstance(arg, (Var, Constant))
-                and isinstance(arg, tvm.relay.TupleGetItem)
-                and isinstance(arg.tuple_value.op, tvm.ir.op.Op)
-                and arg.tuple_value.op.name == "nn.batch_norm"
-                and (not isinstance(arg.tuple_value.args[0], (Var, Constant)))
-                and arg.tuple_value.args[0].op.name == "nn.conv2d"
-            ):
-                ep = arg.tuple_value.attrs["epsilon"]
-                wt = arg.tuple_value.args[1].data.numpy()
-                bs = arg.tuple_value.args[2].data.numpy()
-                mn = arg.tuple_value.args[3].data.numpy()
-                vr = arg.tuple_value.args[4].data.numpy() + ep
-                dino = np.sqrt(vr)
-                wt = wt / dino
-                bs = bs - mn * wt
-                conv_op = arg.tuple_value.args[0]
-                conv_args = list(conv_op.args)
-                wt_conv = conv_args[1].data.numpy()
-                if conv_op.attrs["kernel_layout"] == "OIHW":
-                    wt = wt.reshape(wt.shape[0], 1, 1, 1)
-                elif conv_op.attrs["kernel_layout"] == "IOHW":
-                    wt = wt.reshape(1, wt.shape[0], 1, 1)
-                else:
-                    raise ValueError("Unsupported Conv2d kernel layout")
-                wt_conv = wt_conv * wt
-                conv_args[1] = relay.const(tvm.nd.array(wt_conv))
-                bs_args = relay.const(tvm.nd.array(bs.reshape(-1, bs.shape[0], 1, 1)))
-                conv_out = Call(
-                    arg.tuple_value.args[0].op, conv_args, arg.tuple_value.args[0].attrs
-                )
-                mod = tvm.relay.add(conv_out, bs_args)
-                new_args.append(mod)
-            else:
-                new_args.append(arg)
-
-        call = Call(call.op, new_args, call.attrs)
-        args = [self.visit(arg) for arg in call.args]
-
-        return Call(call.op, args, call.attrs)
-
-
-@transform.function_pass(opt_level=0)
-class OptimizeBatchnormPass:
-    def transform_function(
-        self, func: relay.function.Function, mod: tvm.IRModule, _: tvm.transform.PassContext
-    ) -> relay.function.Function:
-        return OptimizeBatchnorm().visit(func)
-
-
-def partition_for_clml(mod, params=None, **opts):
-    """Partition the graph greedily offloading supported
-    operators to CLML Library.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-    params : Optional[Dict[str, NDArray]]
-        Constant input parameters.
-
-    Returns
-    -------
-    ret : annotated and partitioned module.
-    """
-
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            RemoveDropoutPass(),
-            transform.FoldConstant(),
-            OptimizeBatchnormPass(),
-            transform.MergeComposite(clml_pattern_table()),
-            transform.AnnotateTarget("clml"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    result_mod = seq(mod)
-    return result_mod
-
-
-@register_func("relay.ext.clml.optimize")
-def preprocess_module(mod):
-    """
-    Pre-process a module containing functions ready for CLML codegen. For now we enforce OIHW
-    kernel layout and fold the transforms away.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-
-    Returns
-    -------
-    preprocessed_mod : The processed module.
-    """
-
-    def alter_conv(attrs, inputs, tinfos, out_type):
-        new_attrs = dict(attrs)
-        data_info = tinfos[0]
-        weight_info = tinfos[1]
-        (desired_data_layout, desired_kernel_layout) = ("NCHW", "OIHW")
-        new_attrs["data_layout"] = desired_data_layout
-        new_attrs["kernel_layout"] = desired_kernel_layout
-
-        if is_depthwise_conv2d(
-            data_info.shape,
-            attrs["data_layout"],
-            weight_info.shape,
-            attrs["kernel_layout"],
-            attrs["groups"],
-        ):
-            dkl = desired_kernel_layout
-            new_attrs["kernel_layout"] = dkl[1] + dkl[0] + dkl[2] + dkl[3]
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    with OpAttrContext("nn.conv2d", "FTVMAlterOpLayout", alter_conv):
-        seq = tvm.transform.Sequential(
-            [
-                transform.ConvertLayout({"nn.conv2d": ["NCHW", "OIHW"]}),
-                transform.ConvertLayout({"nn.conv2d_transpose": ["NCHW", "OIHW"]}),
-                transform.AlterOpLayout(),
-                transform.FoldConstant(),
-            ]
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            preprocessed_mod = seq(mod)
-    return preprocessed_mod
-
-
-def preprocess_for_clml(mod):
-    """Preprocessing pass to alter the layouts for CLML compiler target"""
-
-    for _var in mod.get_global_vars():
-        if _var.name_hint == "main":
-            continue
-        fn = mod[_var.name_hint]
-        if "Compiler" in fn.attrs.keys() and fn.attrs["Compiler"] == "clml":
-            new_fn = fn.body
-            clml_mod = tvm.IRModule.from_expr(new_fn)
-            with tvm.transform.PassContext(opt_level=3):
-                clml_mod = preprocess_module(clml_mod)
-            new_body = clml_mod["main"].body
-            mod[_var.name_hint] = _function.Function(
-                fn.params, new_body, fn.ret_type, fn.type_params, fn.attrs
-            )
-    return mod
-
-
-@register_pattern_table("clml")
-def clml_pattern_table():
-    """Get the CLML pattern table."""
-
-    def conv_pattern():
-        """Create a convolution pattern."""
-        pattern = is_op("nn.conv2d")(wildcard(), is_constant())
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
-        pattern = pattern.optional(
-            lambda x: is_tuple_get_item(
-                is_op("nn.batch_norm")(
-                    x, is_constant(), is_constant(), is_constant(), is_constant()
-                )
-            )
-        )
-        pattern = pattern.optional(is_op("nn.relu"))
-        # Fusion pattern to support with relu6 layer.
-        pattern = pattern.optional(is_op("clip").has_attr({"a_min": 0.0, "a_max": 6.0}))
-        return pattern
-
-    def conv_transpose_pattern():
-        """Create a transposed convolution pattern."""
-        pattern = is_op("nn.conv2d_transpose")(wildcard(), is_constant())
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
-        pattern = pattern.optional(
-            lambda x: is_tuple_get_item(
-                is_op("nn.batch_norm")(
-                    x, is_constant(), is_constant(), is_constant(), is_constant()
-                )
-            )
-        )
-        pattern = pattern.optional(is_op("nn.relu"))
-        # Fusion pattern to support with relu6 layer.
-        pattern = pattern.optional(is_op("clip").has_attr({"a_min": 0.0, "a_max": 6.0}))
-        return pattern
-
-    def pad_conv_pattern():
-        """Create a pad with convolution pattern."""
-        pattern = is_op("nn.pad")(wildcard(), is_constant())
-        pattern = is_op("nn.conv2d")(pattern, is_constant())
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
-        pattern = pattern.optional(
-            lambda x: is_tuple_get_item(
-                is_op("nn.batch_norm")(
-                    x, is_constant(), is_constant(), is_constant(), is_constant()
-                )
-            )
-        )
-        pattern = pattern.optional(is_op("nn.relu"))
-        # Fusion pattern to support with relu6 layer.
-        pattern = pattern.optional(is_op("clip").has_attr({"a_min": 0.0, "a_max": 6.0}))
-        return pattern
-
-    def batch_norm_pattern():
-        """Create a batch norm pattern."""
-        pattern = is_op("nn.batch_norm")(
-            wildcard(), is_constant(), is_constant(), is_constant(), is_constant()
-        )
-        pattern = is_tuple_get_item(pattern)
-        return pattern
-
-    def concat_pattern():
-        """Create a concat pattern.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the concat pattern.
-        """
-        pattern = is_tuple(None)
-        pattern = is_op("concatenate")(pattern)
-
-        return pattern
-
-    def dense1d_pattern():
-        """Create a dense pattern for 1d vector to matrix multiple."""
-        pattern = is_op("nn.dense")(wildcard(), is_constant())
-        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
-        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
-        return pattern
-
-    def dense2d_pattern():
-        """Create a dense pattern for 2d matrix to matrix multiple."""
-        pattern = is_op("nn.dense")(wildcard(), is_constant())
-        return pattern
-
-    def pad_pattern():
-        """Create a pad pattern."""
-        pattern = is_op("nn.pad")(wildcard(), is_constant())
-        return pattern
-
-    def check_conv(extract):
-        """Check conv pattern is supported by CLML."""
-        call = extract
-        clip_found = False
-        if isinstance(call, tvm.relay.expr.TupleGetItem):
-            call = call.tuple_value
-        elif call.op.name == "nn.relu":
-            call = call.args[0]
-            if isinstance(call, tvm.relay.expr.TupleGetItem):
-                call = call.tuple_value
-        elif call.op.name == "clip":
-            clip_found = True
-            if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0:
-                return False
-            call = call.args[0]
-            if isinstance(call, tvm.relay.expr.TupleGetItem):
-                call = call.tuple_value
-
-        while call.op.name != "nn.conv2d":
-            call = call.args[0]
-
-        attrs, args = call.attrs, call.args
-        if attrs.data_layout != "NCHW":
-            return False
-
-        if call.checked_type.shape[0] > 1:
-            return False
-
-        if (
-            (not clip_found)
-            and (attrs.kernel_size[0] == 3)
-            and (attrs.dilation[0] != 1)
-            and (attrs.groups != 1)
-            and (attrs.channels == attrs.groups)
-        ):
-            return False
-
-        data_typ = args[0].checked_type
-        kernel_typ = args[1].checked_type
-        is_depthwise = is_depthwise_conv2d(
-            data_typ.shape,
-            attrs["data_layout"],
-            kernel_typ.shape,
-            attrs["kernel_layout"],
-            attrs["groups"],
-        )
-        if attrs.groups != 1 and not is_depthwise:
-            return False
-        return True
-
-    def check_conv_transpose(extract):
-        """Check transposed conv pattern is supported by CLML."""
-        call = extract
-        if isinstance(call, tvm.relay.expr.TupleGetItem):
-            call = call.tuple_value
-        elif call.op.name == "nn.relu":
-            call = call.args[0]
-            if isinstance(call, tvm.relay.expr.TupleGetItem):
-                call = call.tuple_value
-        elif call.op.name == "clip":
-            if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0:
-                return False
-            call = call.args[0]
-            if isinstance(call, tvm.relay.expr.TupleGetItem):
-                call = call.tuple_value
-
-        while call.op.name != "nn.conv2d_transpose":
-            call = call.args[0]
-
-        attrs = call.attrs
-        if attrs.data_layout != "NCHW":
-            return False
-
-        return True
-
-    def check_binary_op(extract):
-        call = extract
-        # Scalars are not supported
-        if len(call.args[1].checked_type.shape) == 0:
-            return False
-        if call.args[0] == call.args[1]:
-            return False
-
-        if tuple(call.args[0].checked_type.shape) != tuple(call.args[1].checked_type.shape):
-            return False
-
-        return check_default_op(call)
-
-    def check_pad_op(extract):
-        call = extract
-        if len(call.attrs["pad_width"]) != 4:
-            return False
-        # CLML can't process Tensor padding with out knowing layout.
-        # Pad layers before any convolution are not guarenteed to be NCHW.
-        if isinstance(call.args[0], tvm.relay.expr.Var):
-            return False
-        return check_default_op(call)
-
-    def check_softmax_op(extract):
-        call = extract
-        # supports 2D and 4D tensors.
-        if len(call.args[0].checked_type.shape) not in [2, 4]:
-            return False
-        return check_default_op(call)
-
-    def check_upsampling_op(extract):
-        call = extract
-        if call.attrs["method"] != "bilinear":
-            return False
-        return check_default_op(call)
-
-    def check_concat_op(extract):
-        call = extract
-        if call.attrs["axis"] != 1:
-            return False
-        return check_default_op(call)
-
-    def check_default_op(extract):
-        call = extract
-
-        if isinstance(call, tvm.relay.expr.TupleGetItem):
-            call = call.tuple_value
-            call_shape = call.checked_type.fields[0].shape
-            call_dtype = call.checked_type.fields[0].dtype
-        else:
-            call_shape = call.checked_type.shape
-            call_dtype = call.checked_type.dtype
-
-        # int64, int32 dtypes are not Supported in CLML
-        if call_dtype in ["int64", "int32"]:
-            return False
-
-        # Supports only upto 4 dim shapes
-        if len(call_shape) > 4:
-            return False
-        # Only support batch dim = 1
-        if isinstance(call_shape[0], tvm.tir.expr.Any) or call_shape[0] > 1:
-            return False
-        # Checking buffer indexing limit
-        for shape in call_shape:
-            if shape > 32768:
-                return False
-        # Avoid any operators with dtype Int64 and upsupported shape
-        for _arg in call.args:
-            t_arg = _arg if isinstance(_arg, tvm.relay.Tuple) else [_arg]
-            for arg in t_arg:
-                checked_type = (
-                    arg.tuple_value.checked_type.fields[arg.index]
-                    if isinstance(arg, tvm.relay.TupleGetItem)
-                    else arg.checked_type
-                )
-                if checked_type.dtype in ["int64", "int32"]:
-                    return False
-                # Supports only 4 dim shapes
-                if len(checked_type.shape) > 4:
-                    return False
-                # Only support batch dim = 1
-                if len(checked_type.shape) > 0 and checked_type.shape[0] > 1:
-                    return False
-                for shape in checked_type.shape:
-                    if shape > 32768:
-                        return False
-        return True
-
-    def check_batch_matmul_op(extract):
-        call = extract
-        # Only support single Matmul.
-        if call.args[0].checked_type.shape[0] > 1:
-            return False
-        if call.args[1].checked_type.shape[0] > 1:
-            return False
-        return check_default_op(call)
-
-    def check_dense1d_op(extract):
-        call = extract
-        # Only support single Matmul.
-        if call.args[0].checked_type.shape[0] > 1:
-            return False
-        if not (call.op.name in ["nn.bias_add", "add"] and call.args[0].op.name == "nn.dense"):
-            return False
-        return True
-
-    def check_dense2d_op(extract):
-        call = extract
-        # Only support 2D Matmul without bias
-        if call.op.name in ["nn.bias_add", "add"] and call.args[0].op.name == "nn.dense":
-            return False
-        # Avoid any operators with dtype Int64 and upsupported shape
-        for _arg in call.args:
-            t_arg = _arg if isinstance(_arg, tvm.relay.Tuple) else [_arg]
-            for arg in t_arg:
-                checked_type = (
-                    arg.tuple_value.checked_type.fields[arg.index]
-                    if isinstance(arg, tvm.relay.TupleGetItem)
-                    else arg.checked_type
-                )
-                if len(checked_type.shape) != 2:
-                    return False
-        return True
-
-    def check_depth_to_space(extract):
-        call = extract
-        call_shape = call.checked_type.shape
-        arg_shape = call.args[0].checked_type.shape
-        # Supports only upto 4 dim shapes
-        if len(call_shape) > 4 or len(arg_shape) > 4:
-            return False
-        # Only support batch dim = 1
-        if call_shape[0] > 1:
-            return False
-        # Checking buffer indexing limit
-        for shape in call_shape:
-            if shape > 32768:
-                return False
-        if call.attrs["layout"] != "NCHW" or call.attrs["mode"] != "DCR":
-            return False
-        return True
-
-    return [
-        ("clml.pad_conv2d", pad_conv_pattern(), check_conv),
-        ("clml.conv2d", conv_pattern(), check_conv),
-        ("clml.conv2d_transpose", conv_transpose_pattern(), check_conv_transpose),
-        ("clml.dense1d", dense1d_pattern(), check_dense1d_op),
-        ("clml.dense2d", dense2d_pattern(), check_dense2d_op),
-        ("clml.pad", pad_pattern(), check_pad_op),
-        ("clml.concat", concat_pattern(), check_concat_op),
-        ("clml.batch_norm", batch_norm_pattern()),
-        ("clml.add", is_op("add")(wildcard(), wildcard()), check_binary_op),
-        ("clml.subtract", is_op("subtract")(wildcard(), wildcard()), check_binary_op),
-        ("clml.multiply", is_op("multiply")(wildcard(), wildcard()), check_binary_op),
-        ("clml.divide", is_op("divide")(wildcard(), wildcard()), check_binary_op),
-        ("clml.minimum", is_op("minimum")(wildcard(), wildcard()), check_binary_op),
-        ("clml.maximum", is_op("maximum")(wildcard(), wildcard()), check_binary_op),
-        ("clml.softmax", is_op("nn.softmax")(wildcard()), check_softmax_op),
-        ("clml.reshape", is_op("reshape")(wildcard()), check_default_op),
-        ("clml.avg_pool2d", is_op("nn.avg_pool2d")(wildcard()), check_default_op),
-        ("clml.max_pool2d", is_op("nn.max_pool2d")(wildcard()), check_default_op),
-        ("clml.global_avg_pool2d", is_op("nn.global_avg_pool2d")(wildcard()), check_default_op),
-        ("clml.global_max_pool2d", is_op("nn.global_max_pool2d")(wildcard()), check_default_op),
-        ("clml.relu", is_op("nn.relu")(wildcard()), check_default_op),
-        ("clml.clip", is_op("clip")(wildcard()), check_default_op),
-        ("clml.batch_flatten", is_op("nn.batch_flatten")(wildcard()), check_default_op),
-        ("clml.depth_to_space", is_op("nn.depth_to_space")(wildcard()), check_depth_to_space),
-        ("clml.upsampling", is_op("nn.upsampling")(wildcard()), check_upsampling_op),
-        (
-            "clml.batch_matmul",
-            is_op("nn.batch_matmul")(wildcard(), wildcard()),
-            check_batch_matmul_op,
-        ),
-    ]
-
-
-def _register_external_op_helper(op_name, supported=True):
-    @tvm.ir.register_op_attr(op_name, "target.clml")
-    def _func_wrapper(expr):
-        return supported
-
-    return _func_wrapper
-
-
-class OpAttrContext(object):
-    """Temporarily changes the attr of an op."""
-
-    def __init__(self, op_name, attr_key, attr_value):
-        """Saves the required info for RAII pattern usage.
-
-        Parameters
-        ----------
-        op_name : str
-            The op name.
-
-        attr_key : str
-            The attribute name.
-
-        attr_value : object
-            The attribute value.
-        """
-        self.op = relay.op.get(op_name)
-        self.attr_key = attr_key
-        self.attr_value = attr_value
-
-    def __enter__(self):
-        self.older_attr = self.op.get_attr(self.attr_key)
-        self.op.reset_attr(self.attr_key)
-        self.op.set_attr(self.attr_key, self.attr_value)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        self.op.reset_attr(self.attr_key)
-        if self.older_attr:
-            self.op.set_attr(self.attr_key, self.older_attr)
-
-
-@register_func("runtime.ext.debug.clml")
-def process_debug(op, dump, dump_path):
-    """Dump the required debug information in given path"""
-    dump_json = json.loads(dump)
-
-    graph_json = json.loads(dump_json["graph"])
-    with open(os.path.join(dump_path, op + ".json"), "w") as outfile:
-        json.dump(graph_json, outfile, indent=4, sort_keys=False)
-
-    hex_tensors = dump_json["tensors"]
-    fload = tvm._ffi.get_global_func("runtime.LoadParams")
-    tensor_map = fload(bytearray.fromhex(hex_tensors))
-    np_tensors = {}
-    for key, val in tensor_map.items():
-        np_tensors[key] = val.asnumpy()
-    np.savez(os.path.join(dump_path, op + ".npz"), **np_tensors)
-
-
-class CLMLGetSubModuleSrc:
-    """Generates CLML API one CLML sub module out ot global TVM module"""
-
-    def __init__(self, codegen):
-        """Initialize
-        Parameters
-        ----------
-        codegen : JSON
-            The CLML sub module as JSON
-        """
-        self.codegen = codegen
-        self.nodes = None
-        self.node_map = {}
-        self.input_meta = []
-        self.output_meta = []
-        self.clml_code = []
-        self.sub_module_name = None
-
-        self.MakeCLMLTensor = Template(
-            """auto $name = runner.MakeCLMLTensor
-        (std::vector<size_t>({$shape}), "$dtype", $layout);"""
-        )
-        self.MapInsert = Template("""runner.storage_map.insert({"$nid", $tensor_desc});""")
-        self.MakeConv2D = Template(
-            """
-        // Convolution / Depthwise Convolution
-        runner.MakeConv2D($input_tensor,
-           $weight_tensor,
-           $bias_tensor,
-           $output_tensor,
-           std::vector<cl_uint>({$padding}),
-           std::vector<cl_uint>({$dilation}),
-           std::vector<cl_uint>({$strides}),
-           $groups,
-           $mode,
-           $activation,
-           $has_bias,
-           $has_act,
-           "$dtype");"""
-        )
-        self.MakeConv2DWithBN = Template(
-            """
-        // Batchnorm
-        runner.MakeConv2DWithBN($input_tensor,
-                 $weight_tensor,
-                 $bias_tensor,
-                 $output_tensor,
-                 $bn_scale_tensor,
-                 $bn_bias_tensor,
-                 $bn_mean_tensor,
-                 $bn_var_tensor,
-                 std::vector<float>  ({$bn_attrs}),
-                 std::vector<cl_uint> ({$padding}),
-                 std::vector<cl_uint> ({$dilation}),
-                 std::vector<cl_uint> ({$strides}),
-                 $groups,
-                 $mode,
-                 $activation,
-                 $has_bias,
-                 $has_act,
-                 "$dtype");"""
-        )
-        self.MakeRelu = Template(
-            """
-        // Relu / Relu6
-        runner.MakeRelu($input_tensor, $output_tensor, $relu_type, "$dtype");
-        """
-        )
-        self.MakeBN = Template(
-            """
-        // Batchnorm
-        runner.MakeBatchNorm($input_tensor,
-              $output_tensor,
-              $bn_scale_tensor,
-              $bn_bias_tensor,
-              $bn_mean_tensor,
-              $bn_var_tensor,
-              std::vector<float> ({$bn_attrs}), "$dtype");"""
-        )
-        self.MakePool2D = Template(
-            """
-        // Pool2D
-        runner.MakePool2D($input_tensor,
-           $output_tensor,
-           std::vector<cl_uint> ({$pool_size}),
-           std::vector<cl_uint> ({$strides}),
-           std::vector<cl_uint> ({$padding}),
-           "$pool_type", "$dtype");"""
-        )
-        self.MakeGlobalPool2D = Template(
-            """
-        // GlobalPool2D
-        runner.MakeGlobalPool2D($input_tensor,
-                 $output_tensor,
-                 std::vector<cl_uint> ({$in_shape}),
-                 "$pool_type", "$dtype");"""
-        )
-        self.MakeReshape = Template(
-            """
-        // Reshape
-        runner.MakeReshape($input_tensor,
-            $output_tensor, "$dtype");"""
-        )
-        self.MakeConcatenate = Template(
-            """
-        // Concatinate
-        runner.MakeConcatenate(
-                std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> ({$in_list}),
-                $output_tensor,
-                $axis, "$dtype");"""
-        )
-        self.MakeDense = Template(
-            """
-        // Dense
-        runner.MakeDense($input_tensor,
-          $weight_tensor,
-          $output_tensor,
-          std::vector<cl_uint> ({$in_shape}),
-          std::vector<cl_uint> ({$wt_shape}),
-          "$dtype");"""
-        )
-        self.MakeSoftMax = Template(
-            """
-        // Softmax
-        runner.MakeSoftMax($input_tensor,
-            $output_tensor, "$dtype");"""
-        )
-        self.MakePad = Template(
-            """
-        // Pad
-        runner.MakePad($input_tensor,
-        $output_tensor,
-        "$pad_mode",
-        std::vector<cl_uint> ({$padding}), "$dtype");"""
-        )
-        self.MakeBatchFlatten = Template(
-            """
-        // BatchFlatten
-        runner.MakeBatchFlatten($input_tensor,
-                 $output_tensor, "$dtype");"""
-        )
-        self.MakeClip = Template(
-            """
-        // Clip
-        runner.MakeClip($input_tensor,
-         $output_tensor,
-         $a_max,
-         $a_min,
-         "$dtype");"""
-        )
-        self.MakeBinaryOp = Template(
-            """
-        // BinaryOp
-        runner.MakeBinaryOp($input_a,
-             $input_b,
-             $output_tensor,
-             "$op", "$dtype");"""
-        )
-
-        self.MakeHeader = Template(
-            """
-        CLMLRunner $module(std::string name,
-                   ToolArgs& args,
-                   cl_platform_id arg_platform_id,
-                   cl_context arg_context,
-                   cl_device_id arg_device_id,
-                   cl_command_queue arg_queue) {
-        CLMLRunner runner = CLMLRunner(name,
-                                 args,
-                                 arg_platform_id,
-                                 arg_context,
-                                 arg_device_id,
-                                 arg_queue);
-        runner.MakeUnusedTensor();
-        """
-        )
-
-        self.MakeFooter = Template(
-            """
-            return runner;
-        }
-        """
-        )
-
-        self.MakeMetaInfo = Template(
-            "runner.SetMetaInfo("
-            '"Subgraph Name: $name\\n    Input Count  : $input_count\\n'
-            "    Output Count : $output_count\\n"
-            '    Input MetaInfo\\n$input_meta\\n    Output MetaInfo\\n$output_meta");'
-        )
-        self.MakeInputMetaInfo = Template(
-            "        Input: $in_name\\n          Dtype : $dtype\\n          Shape : [$shape]\\n"
-        )
-
-        self.MakeOutputMetaInfo = Template(
-            "        Output: $out_name\\n         Dtype : $dtype\\n          Shape : [$shape]\\n"
-        )
-
-    def get_src(self):
-        """Returns pair of sub module name and the generated source"""
-
-        self.sub_module_name = self.codegen["symbol"]
-        self.nodes = self.codegen["nodes"]
-        self.clml_code.append(self.MakeHeader.substitute(module=self.sub_module_name))
-
-        def get_tensor_from_map(
-            node_seq, shape=None, layout="CL_TENSOR_LAYOUT_OPTIMAL_QCOM", dtype="float32"
-        ):
-            if node_seq in self.node_map:
-                return self.node_map[node_seq]
-            else:
-                node = self.nodes[node_seq]
-                dtype = str(node["attrs"]["dtype"][0][0])
-                if node["op"] == "input":
-                    self.clml_code.append("// Input Node")
-                    node_out_name = node["name"]
-                else:
-                    node_out_name = node["name"]
-                if shape is None:
-                    shape = str(tuple(node["attrs"]["shape"][0][0]))[1:-1]
-
-                self.clml_code.append(
-                    self.MakeCLMLTensor.substitute(
-                        name=node_out_name, shape=shape, dtype=dtype, layout=layout
-                    )
-                )
-                self.clml_code.append(
-                    self.MapInsert.substitute(nid=node_out_name, tensor_desc=node_out_name)
-                )
-                if node["op"] == "input":
-                    self.clml_code.append(
-                        Template("runner.inputs.push_back($clml_input);").substitute(
-                            clml_input=node_out_name
-                        )
-                    )
-                    self.input_meta.append(
-                        self.MakeInputMetaInfo.substitute(
-                            in_name=node_out_name, dtype=dtype, shape=shape
-                        )
-                    )
-
-                if self.nodes[node_seq]["op"] == "const":
-                    self.clml_code.append(
-                        Template('runner.consts.push_back("$nid");').substitute(nid=node["name"])
-                    )
-                self.node_map[node_seq] = node_out_name
-                return node_out_name
-
-        def make_output_tensor(
-            node, node_seq, shape=None, layout="CL_TENSOR_LAYOUT_OPTIMAL_QCOM", dtype="float32"
-        ):
-            if dtype is None:
-                dtype = str(node["attrs"]["dtype"][0][0])
-            if shape is None:
-                shape = str(tuple(node["attrs"]["shape"][0][0]))[1:-1]
-            node_out_name = self.sub_module_name + "_" + "layer_out_" + str(node_seq)
-            self.clml_code.append(
-                self.MakeCLMLTensor.substitute(
-                    name=node_out_name,
-                    shape=shape,
-                    dtype=dtype,
-                    layout=layout,
-                )
-            )
-            return node_out_name
-
-        for node_seq, node in enumerate(self.nodes):
-            if node["op"] == "kernel":
-                self.clml_code.append("// Kernel Node : " + node["name"])
-                if node["name"] == "nn.conv2d" or node["name"] == "nn.depthwise_conv2d":
-                    if "padding" in node["attrs"]:
-                        padding = str(tuple(int(x) for x in node["attrs"]["padding"][0]))[1:-1]
-                    else:
-                        padding = "0, 0, 0, 0"
-                    dilation = str(tuple(int(x) for x in node["attrs"]["dilation"][0]))[1:-1]
-                    strides = str(tuple(int(x) for x in node["attrs"]["strides"][0]))[1:-1]
-                    groups = node["attrs"]["groups"][0][0]
-                    if node["name"] == "nn.conv2d":
-                        mode = "CL_CONVOLUTION_MODE_CONVOLUTION_QCOM"
-                    else:
-                        mode = "CL_CONVOLUTION_MODE_DEPTHWISE_QCOM"
-                    activation = "CL_ACTIVATION_RELU"
-                    has_act = False
-                    if "activation_type" in node["attrs"]:
-                        has_act = True
-                        activation = node["attrs"]["activation_type"][0][0]
-                        if activation == "relu":
-                            activation = "CL_ACTIVATION_RELU"
-                        elif activation == "relu6":
-                            activation = "CL_ACTIVATION_RELU6"
-                        else:
-                            raise RuntimeError("Unknown activation:" + activation)
-                    has_bias = bool((node["inputs"] == 3) or (node["inputs"] == 7))
-                    has_bn = bool((node["inputs"] == 6) or (node["inputs"] == 7))
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    weight_tensor = get_tensor_from_map(node["inputs"][1][0])
-                    if not has_bias:
-                        bias_tensor = "runner.unusedTensor"
-                    else:
-                        bias_tensor = get_tensor_from_map(node["inputs"][2][0])
-
-                    node_out_name = make_output_tensor(node, node_seq)
-
-                    if not has_bn:
-                        self.clml_code.append(
-                            self.MakeConv2D.substitute(
-                                input_tensor=input_tensor,
-                                weight_tensor=weight_tensor,
-                                bias_tensor=bias_tensor,
-                                output_tensor=node_out_name,
-                                padding=padding,
-                                dilation=dilation,
-                                strides=strides,
-                                groups=groups,
-                                mode=mode,
-                                activation=activation,
-                                has_bias="true" if has_bias else "false",
-                                has_act="true" if has_act else "false",
-                                dtype=node["attrs"]["dtype"][0][0],
-                            )
-                        )
-                    else:
-                        bn_index = 3 if has_bias else 2
-                        bn_attrs = tuple(node["attrs"]["batchnorm"][0][0])
-                        axis = bn_attrs[0]
-                        bn_shape = [1, 1, 1, 1]
-                        bn_node = self.nodes[node["inputs"][bn_index][0]]
-                        bn_shape[axis] = bn_node["attrs"]["shape"][0][0]
-                        dtype = bn_node["attrs"]["dtype"][0][0]
-
-                        bn_scale_tensor = get_tensor_from_map(
-                            node["inputs"][bn_index][0],
-                            shape=str(tuple(bn_shape))[1:-1],
-                            dtype=dtype,
-                        )
-
-                        bn_bias_tensor = get_tensor_from_map(
-                            node["inputs"][bn_index + 1][0],
-                            shape=str(tuple(bn_shape))[1:-1],
-                            dtype=dtype,
-                        )
-
-                        bn_mean_tensor = get_tensor_from_map(
-                            node["inputs"][bn_index + 2][0],
-                            shape=str(tuple(bn_shape))[1:-1],
-                            dtype=dtype,
-                        )
-
-                        bn_var_tensor = get_tensor_from_map(
-                            node["inputs"][bn_index + 3][0],
-                            shape=str(tuple(bn_shape))[1:-1],
-                            dtype=dtype,
-                        )
-
-                        self.clml_code.append(
-                            self.MakeConv2DWithBN.substitute(
-                                input_tensor=input_tensor,
-                                weight_tensor=weight_tensor,
-                                bias_tensor=bias_tensor,
-                                output_tensor=node_out_name,
-                                bn_scale_tensor=bn_scale_tensor,
-                                bn_bias_tensor=bn_bias_tensor,
-                                bn_mean_tensor=bn_mean_tensor,
-                                bn_var_tensor=bn_var_tensor,
-                                bn_attrs=str(bn_attrs)[1:-1],
-                                padding=padding,
-                                dilation=dilation,
-                                strides=strides,
-                                groups=groups,
-                                mode=mode,
-                                activation=activation,
-                                has_bias="true" if has_bias else "false",
-                                has_act="true" if has_act else "false",
-                                dtype=node["attrs"]["dtype"][0][0],
-                            )
-                        )
-                elif node["name"] == "nn.relu6" or node["name"] == "nn.relu":
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    relu_type = (
-                        "CL_ACTIVATION_RELU" if node["name"] == "nn.relu" else "CL_ACTIVATION_RELU6"
-                    )
-                    self.clml_code.append(
-                        self.MakeRelu.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            relu_type=relu_type,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "nn.batch_norm":
-                    bn_attrs = tuple(node["attrs"]["axis"])
-                    axis = int(bn_attrs[0][0])
-                    bn_shape = [1, 1, 1, 1]
-                    bn_node = self.nodes[node["inputs"][0][0]]
-                    bn_shape[axis] = bn_node["attrs"]["shape"][0][0]
-                    dtype = bn_node["attrs"]["dtype"][0][0]
-                    bn_scale_tensor = get_tensor_from_map(
-                        node["inputs"][0][0], shape=str(tuple(bn_shape))[1:-1], dtype=dtype
-                    )
-                    bn_bias_tensor = get_tensor_from_map(
-                        node["inputs"][1][0], shape=str(tuple(bn_shape))[1:-1], dtype=dtype
-                    )
-                    bn_mean_tensor = get_tensor_from_map(
-                        node["inputs"][2][0], shape=str(tuple(bn_shape))[1:-1], dtype=dtype
-                    )
-                    bn_var_tensor = get_tensor_from_map(
-                        node["inputs"][3][0], shape=str(tuple(bn_shape))[1:-1], dtype=dtype
-                    )
-
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-
-                    self.clml_code.append(
-                        self.MakeBN.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            bn_scale_tensor=bn_scale_tensor,
-                            bn_bias_tensor=bn_bias_tensor,
-                            bn_mean_tensor=bn_mean_tensor,
-                            bn_var_tensor=bn_var_tensor,
-                            bn_attrs=str(bn_attrs)[1:-1],
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] in ["nn.max_pool2d", "nn.avg_pool2d", "nn.l2_pool2d"]:
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    pool_size = str(tuple(int(x) for x in node["attrs"]["pool_size"][0]))[1:-1]
-                    strides = str(tuple(int(x) for x in node["attrs"]["strides"][0]))[1:-1]
-                    padding = str(tuple(int(x) for x in node["attrs"]["padding"][0]))[1:-1]
-                    self.clml_code.append(
-                        self.MakePool2D.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            pool_size=pool_size,
-                            strides=strides,
-                            padding=padding,
-                            pool_type=node["name"],
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] in ["nn.global_max_pool2d", "nn.global_avg_pool2d"]:
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    in_node = self.nodes[node["inputs"][0][0]]
-                    in_shape = str(tuple(in_node["attrs"]["shape"][0][0]))[1:-1]
-                    self.clml_code.append(
-                        self.MakeGlobalPool2D.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            in_shape=in_shape,
-                            pool_type=node["name"],
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "reshape":
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    self.clml_code.append(
-                        self.MakeReshape.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "concatenate":
-                    input_len = len(node["inputs"])
-                    in_list = str(
-                        [get_tensor_from_map(node["inputs"][x][0]) for x in range(input_len)]
-                    )[1:-1]
-                    node_out_name = make_output_tensor(node, node_seq)
-                    axis = node["attrs"]["axis"][0][0]
-                    self.clml_code.append(
-                        self.MakeConcatenate.substitute(
-                            in_list=in_list,
-                            output_tensor=node_out_name,
-                            axis=axis,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "nn.dense":
-                    in_node = self.nodes[node["inputs"][0][0]]
-                    in_shape = tuple(in_node["attrs"]["shape"][0][0])
-                    wt_shape = tuple(in_node["attrs"]["shape"][0][0])
-                    input_tensor = get_tensor_from_map(
-                        node["inputs"][0][0], layout="CL_TENSOR_LAYOUT_NCHW_QCOM"
-                    )
-                    weight_tensor = get_tensor_from_map(
-                        node["inputs"][1][0],
-                        shape=str(tuple([1, 1, wt_shape[0], wt_shape[1]]))[1:-1],
-                        layout="CL_TENSOR_LAYOUT_NCHW_QCOM",
-                    )
-                    node_out_name = make_output_tensor(
-                        node,
-                        node_seq,
-                        shape=str(tuple([in_shape[0], wt_shape[0], 1, 1]))[1:-1],
-                        layout="CL_TENSOR_LAYOUT_NCHW_QCOM",
-                    )
-                    self.clml_code.append(
-                        self.MakeDense.substitute(
-                            input_tensor=input_tensor,
-                            weight_tensor=weight_tensor,
-                            output_tensor=node_out_name,
-                            in_shape=str(in_shape)[1:-1],
-                            wt_shape=str(wt_shape)[1:-1],
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "nn.softmax":
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    self.clml_code.append(
-                        self.MakeSoftMax.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "nn.pad":
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    pad_mode = node["attrs"]["pad_mode"][0][0]
-                    padding = str(tuple(int(x) for x in node["attrs"]["pad_width"][0]))[1:-1]
-                    self.clml_code.append(
-                        self.MakePad.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            pad_mode=pad_mode,
-                            padding=padding,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "nn.batch_flatten":
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    self.clml_code.append(
-                        self.MakeBatchFlatten.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] == "clip":
-                    input_tensor = get_tensor_from_map(node["inputs"][0][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    a_max = node["attrs"]["a_max"][0][0]
-                    a_min = node["attrs"]["a_min"][0][0]
-                    self.clml_code.append(
-                        self.MakeClip.substitute(
-                            input_tensor=input_tensor,
-                            output_tensor=node_out_name,
-                            a_max=a_max,
-                            a_min=a_min,
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                elif node["name"] in [
-                    "add",
-                    "subtract",
-                    "multiply",
-                    "minimum",
-                    "maximum",
-                    "divide",
-                ]:
-                    input_a = get_tensor_from_map(node["inputs"][0][0])
-                    input_b = get_tensor_from_map(node["inputs"][1][0])
-                    node_out_name = make_output_tensor(node, node_seq)
-                    self.clml_code.append(
-                        self.MakeBinaryOp.substitute(
-                            input_a=input_a,
-                            input_b=input_b,
-                            output_tensor=node_out_name,
-                            op=node["name"],
-                            dtype=node["attrs"]["dtype"][0][0],
-                        )
-                    )
-                else:
-                    raise RuntimeError("Unsupported Op:" + node["name"])
-                self.clml_code.append(
-                    self.MapInsert.substitute(nid=node_out_name, tensor_desc=node_out_name)
-                )
-                self.node_map[node_seq] = node_out_name
-
-            elif node["op"] not in ["const", "input"]:
-                print("Unknown Node type:", node["op"])
-
-        # Populate outputs
-        out_nodes = self.codegen["heads"]
-        self.clml_code.append("// Populate outputs")
-        for nid_triple in out_nodes:
-            nid = nid_triple[0]
-            out_node = self.nodes[nid]
-            dtype = str(out_node["attrs"]["dtype"][0][0])
-            shape = str(tuple(out_node["attrs"]["shape"][0][0]))[1:-1]
-            out_name = self.sub_module_name + "_" + "layer_out_" + str(nid)
-            self.clml_code.append(
-                Template(
-                    'runner.outputs.insert({"$out_name", runner.storage_map["$out_name"]});'
-                ).substitute(out_name=out_name)
-            )
-            self.clml_code.append(
-                Template('runner.outputs_dtypes.insert({"$out_name", "$dtype"});').substitute(
-                    out_name=out_name, dtype=dtype
-                )
-            )
-            self.clml_code.append(
-                Template(
-                    "runner.outputs_shapes.insert" '({"$out_name", std::vector<size_t>({$shape})});'
-                ).substitute(out_name=out_name, shape=shape)
-            )
-            self.output_meta.append(
-                self.MakeOutputMetaInfo.substitute(out_name=out_name, dtype=dtype, shape=shape)
-            )
-
-        # Mem allocation & Param copy
-        self.clml_code.append("// Allocate Tensor Memory and copy params")
-        self.clml_code.append("runner.AllocateMemAndPopulateParams();")
-
-        # Meta data preparation
-        self.clml_code.append(
-            self.MakeMetaInfo.substitute(
-                name=self.sub_module_name,
-                input_count=len(self.input_meta),
-                output_count=len(self.output_meta),
-                input_meta="\\\n".join(self.input_meta),
-                output_meta="\\\n".join(self.output_meta),
-            )
-        )
-
-        self.clml_code.append(self.MakeFooter.substitute())
-        return (self.sub_module_name, self.clml_code)
-
-
-HEADER_STR = """
-    /*
-    * Licensed to the Apache Software Foundation (ASF) under one
-    * or more contributor license agreements.  See the NOTICE file
-    * distributed with this work for additional information
-    * regarding copyright ownership.  The ASF licenses this file
-    * to you under the Apache License, Version 2.0 (the
-    * "License"); you may not use this file except in compliance
-    * with the License.  You may obtain a copy of the License at
-    *
-    *   http://www.apache.org/licenses/LICENSE-2.0
-    *
-    * Unless required by applicable law or agreed to in writing,
-    * software distributed under the License is distributed on an
-    * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    * KIND, either express or implied.  See the License for the
-    * specific language governing permissions and limitations
-    * under the License.
-    */
-
-    /*!
-     * \\file clml_models.cc
-     * \\brief CLML models for all subgraph in given TVM module.
-     */
-
-    // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE!
-    // =========================================================================
-
-    #include <iostream>
-    #include <fstream>
-
-    #include <vector>
-    #include <string>
-    #include <algorithm>
-    #include <math.h>
-    #include <list>
-
-    // Project includes
-    #include "CL/cl.h"
-    #include "CL/cl_qcom_ml_ops.h"
-
-    #include "clml_runner.h"
-
-    using namespace tvm::runtime;
-"""
-
-
-class CLMLGenSrc:
-    """Generates CLML API source given a TVM compiled mod"""
-
-    def __init__(self, libm):
-        """Initialize
-        Parameters
-        ----------
-        libm : Module or json codegen object
-        """
-        self.libm = libm
-        self.gen_src = []
-        self.clml_modules = None
-        self.clml_builds = {}
-        self.codegen = None
-        self.nodes = None
-
-        self.MakeFileHeader = Template(HEADER_STR)
-
-    def get_clml_params(self):
-        """Returns parameters from the TVM module"""
-        if not isinstance(self.libm, GraphExecutorFactoryModule):
-            return {}
-
-        clml_params = {}
-        if self.libm.get_lib().type_key == "const_loader":
-            params = self.libm.get_lib().get_function("get_const_var_ndarray")()
-            clml_params.update(params)
-
-        for mod in self.libm.get_lib().imported_modules:
-            if mod.type_key == "const_loader":
-                params = mod.get_const_var_ndarray()
-                clml_params.update(params)
-
-        clml_params_save = {}
-        for key, val in clml_params.items():
-            clml_params_save[str(key)] = val.numpy()
-
-        return clml_params_save
-
-    def get_artifacts(self):
-        """Function that returns params as dict and source as list of cource code lines"""
-
-        self.clml_builds["file_header"] = [self.MakeFileHeader.substitute()]
-        if isinstance(self.libm, GraphExecutorFactoryModule):
-            self.clml_modules = list(
-                filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules)
-            )
-
-            for cmod in self.clml_modules:
-                codegen = json.loads(cmod.get_source("json"))
-                (sub_module_name, clml_code) = CLMLGetSubModuleSrc(codegen).get_src()
-                self.clml_builds[sub_module_name] = clml_code
-        elif isinstance(self.libm, dict):
-            (sub_module_name, clml_code) = CLMLGetSubModuleSrc(self.libm).get_src()
-            self.clml_builds[sub_module_name] = clml_code
-        else:
-            raise Exception("Don't know how to handle the input")
-
-        main_code = []
-        main_code.append(
-            """
-            std::vector<CLMLRunner> BuildModules(ToolArgs& args,
-                                                 cl_platform_id arg_platform,
-                                                 cl_context arg_context,
-                                                 cl_device_id arg_device_id,
-                                                 cl_command_queue arg_queue) {
-                  std::vector<CLMLRunner> runners;"""
-        )
-        for key, val in self.clml_builds.items():
-            if key != "file_header":
-                main_code.append(
-                    "runners.push_back("
-                    + key
-                    + '("'
-                    + key
-                    + '", args, arg_platform, arg_context, arg_device_id, arg_queue));'
-                )
-        main_code.append("return runners;}")
-        self.clml_builds["MainBuild"] = main_code
-
-        for key, val in self.clml_builds.items():
-            self.gen_src.extend(val)
-
-        return (self.get_clml_params(), self.gen_src)
diff --git a/python/tvm/relay/op/contrib/coreml.py b/python/tvm/relay/op/contrib/coreml.py
deleted file mode 100644
index c1c012199cec..000000000000
--- a/python/tvm/relay/op/contrib/coreml.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""CoreML codegen supported operators."""
-import tvm.ir
-from tvm.contrib.target.coreml import _convert_map
-from ...expr import Constant
-
-
-def _register_coreml_op(op_name):
-    """Register a function to check the given operator is supported by Core ML.
-
-    Paramters
-    ---------
-    op_name : Str
-        The name of operator that will be registered.
-
-    """
-
-    def _check_supported(expr):
-        attrs, args = expr.attrs, expr.args
-        if op_name == "nn.conv2d":
-            if not isinstance(args[1], Constant):
-                return False
-            if attrs["kernel_layout"] not in ["HWIO", "OIHW"]:
-                return False
-        return True
-
-    tvm.ir.register_op_attr(op_name, "target.coremlcompiler", _check_supported)
-
-
-for op in _convert_map:
-    _register_coreml_op(op)
diff --git a/python/tvm/relay/op/contrib/cublas.py b/python/tvm/relay/op/contrib/cublas.py
deleted file mode 100644
index 47b70efebdab..000000000000
--- a/python/tvm/relay/op/contrib/cublas.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""cuBLAS Relay integration."""
-from typing import Callable, List, Tuple, Dict, Optional
-
-import tvm
-import tvm.ir
-from tvm import relay
-from tvm import te
-from tvm.relay import transform
-from tvm.contrib import cublas
-
-from ...dataflow_pattern import is_op, wildcard
-from .te_target import lower_composite, relay_to_runtime
-from .register import register_pattern_table
-
-
-tvm._ffi.register_func("relay.ext.cublas", relay_to_runtime(tvm.target.cuda()))
-
-
-def partition_for_cublas(
-    mod: tvm.IRModule, params: Optional[Dict[str, tvm.runtime.NDArray]] = None
-) -> tvm.IRModule:
-    """Partition the graph to offload for cuBLAS.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The module to partition.
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        Constant input parameters.
-
-    Returns
-    -------
-    tvm.IRModule
-        The partitioned module.
-    """
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(pattern_table()),
-            transform.AnnotateTarget("cublas"),
-            transform.PartitionGraph(),
-            transform.InferType(),
-        ]
-    )
-    return seq(mod)
-
-
-@register_pattern_table("cublas")
-def pattern_table() -> List[Tuple[str, relay.Pattern, Callable[[relay.Call], bool]]]:
-    """Get the cuBLAS pattern table."""
-
-    def matmul_pattern() -> relay.Pattern:
-        """Create pattern for matmul."""
-        return is_op("nn.matmul")(wildcard(), wildcard())
-
-    def batch_matmul_pattern() -> relay.Pattern:
-        """Create pattern for batch_matmul."""
-        return is_op("nn.batch_matmul")(wildcard(), wildcard())
-
-    def dense_pattern() -> relay.Pattern:
-        """Create pattern for dense."""
-        return is_op("nn.dense")(wildcard(), wildcard())
-
-    def check_matmul_like(matched: relay.Call) -> bool:
-        """Check if matmul is supported by cuBLAS."""
-        # Input data types can't be mixed
-        if matched.args[0].checked_type.dtype != matched.args[1].checked_type.dtype:
-            return False
-
-        in_dtype = matched.args[0].checked_type.dtype
-        out_dtype = matched.checked_type.dtype
-        # Only the following data type combinations are supported
-        if (in_dtype, out_dtype) not in [
-            ("float32", "float32"),
-            ("float16", "float16"),
-            ("float16", "float32"),
-            ("int8", "int32"),
-            ("float64", "float64"),
-            ("int8", "float32"),
-        ]:
-            return False
-
-        # If inputs are int8, input column strides must be a multiple of 4
-        if in_dtype == "int8":
-            if (
-                matched.args[0].checked_type.shape[-1] % 4 != 0
-                or matched.args[1].checked_type.shape[-1] % 4 != 0
-            ):
-                return False
-
-        return True
-
-    return [
-        ("cublas.matmul", matmul_pattern(), check_matmul_like),
-        ("cublas.batch_matmul", batch_matmul_pattern(), check_matmul_like),
-        ("cublas.dense", dense_pattern(), check_matmul_like),
-    ]
-
-
-@lower_composite("cublas.matmul")
-def _lower_matmul(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a matmul using cuBLAS."""
-    return cublas.matmul(
-        inputs[0],
-        inputs[1],
-        transa=op.attrs["transpose_a"],
-        transb=op.attrs["transpose_b"],
-        dtype=op.checked_type.dtype,
-    )
-
-
-@lower_composite("cublas.batch_matmul")
-def _lower_batch_matmul(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a batch_matmul using cuBLAS."""
-    return cublas.batch_matmul(
-        inputs[0],
-        inputs[1],
-        transa=op.attrs["transpose_a"],
-        transb=op.attrs["transpose_b"],
-        dtype=op.checked_type.dtype,
-    )
-
-
-@lower_composite("cublas.dense")
-def _lower_dense(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a dense using cuBLAS."""
-    return cublas.matmul(
-        inputs[0], inputs[1], transa=False, transb=True, dtype=op.checked_type.dtype
-    )
diff --git a/python/tvm/relay/op/contrib/cudnn.py b/python/tvm/relay/op/contrib/cudnn.py
deleted file mode 100644
index e3c256f7e38a..000000000000
--- a/python/tvm/relay/op/contrib/cudnn.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""cuDNN Relay integration."""
-from typing import Callable, List, Tuple
-
-import tvm
-import tvm.ir
-from tvm import relay
-from tvm import te
-from tvm.relay import transform
-from tvm.contrib import cudnn
-
-from ...dataflow_pattern import is_op, wildcard
-from .te_target import lower_composite, relay_to_runtime
-from .register import register_pattern_table
-
-
-tvm._ffi.register_func("relay.ext.cudnn", relay_to_runtime(tvm.target.cuda()))
-
-
-def partition_for_cudnn(mod: tvm.IRModule) -> tvm.IRModule:
-    """Partition the graph to offload for cuDNN.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The module to partition.
-
-    Returns
-    -------
-    tvm.IRModule
-        The partitioned module.
-    """
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(pattern_table()),
-            transform.AnnotateTarget("cudnn"),
-            transform.PartitionGraph(),
-            transform.InferType(),
-        ]
-    )
-    return seq(mod)
-
-
-@register_pattern_table("cudnn")
-def pattern_table() -> List[Tuple[str, relay.Pattern, Callable[[relay.Call], bool]]]:
-    """Get the cuDNN pattern table."""
-
-    def softmax_pattern() -> relay.Pattern:
-        """Create pattern for softmax."""
-        return is_op("nn.softmax")(wildcard())
-
-    def log_softmax_pattern() -> relay.Pattern:
-        """Create pattern for log_softmax."""
-        return is_op("nn.log_softmax")(wildcard())
-
-    def conv2d_pattern() -> relay.Pattern:
-        """Create pattern for conv2d."""
-        return is_op("nn.conv2d")(wildcard(), wildcard())
-
-    def conv2d_bias_act_pattern() -> relay.Pattern:
-        """Create pattern for fused conv2d+bias+activation."""
-        conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-        bias = is_op("nn.bias_add")(conv2d, wildcard())
-        return bias.optional(is_op("nn.relu"))
-
-    def check_softmax(matched: relay.Call) -> bool:
-        """Check if softmax is supported by cuDNN."""
-        if matched.args[0].checked_type.dtype not in ["float64", "float32", "float16"]:
-            return False
-
-        return True
-
-    def check_log_softmax(matched: relay.Call) -> bool:
-        """Check if log_softmax is supported by cuDNN."""
-        if matched.args[0].checked_type.dtype not in ["float64", "float32", "float16"]:
-            return False
-
-        if len(matched.args[0].checked_type.shape) != 2:
-            return False
-
-        if matched.attrs["axis"] not in (1, -1):
-            return False
-
-        return True
-
-    def check_conv2d(matched: relay.Call) -> bool:
-        if matched.args[0].checked_type.dtype not in ["float64", "float32", "float16"]:
-            return False
-
-        if matched.attrs["data_layout"] != "NCHW" or matched.attrs["kernel_layout"] != "OIHW":
-            return False
-
-        padding = matched.attrs["padding"]
-        if padding[0] != padding[2] or padding[1] != padding[3]:
-            return False
-
-        return True
-
-    def check_conv2d_bias_act(matched: relay.Call) -> bool:
-        return True
-
-    return [
-        ("cudnn.softmax", softmax_pattern(), check_softmax),
-        ("cudnn.log_softmax", log_softmax_pattern(), check_log_softmax),
-        ("cudnn.conv2d_bias_act", conv2d_bias_act_pattern(), check_conv2d_bias_act),
-        ("cudnn.conv2d", conv2d_pattern(), check_conv2d),
-    ]
-
-
-@lower_composite("cudnn.softmax")
-def _lower_softmax(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a softmax using cuDNN."""
-    return cudnn.softmax(inputs[0], axis=op.attrs["axis"])
-
-
-@lower_composite("cudnn.log_softmax")
-def _lower_log_softmax(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a log_softmax using cuDNN."""
-    return cudnn.log_softmax(inputs[0], axis=op.attrs["axis"])
-
-
-@lower_composite("cudnn.conv2d_bias_act")
-def _lower_conv2d_bias_act(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a fused conv2d+bias+activation using cuDNN."""
-    conv_dtype = op.checked_type.dtype
-    if op.op.name == "nn.relu":
-        activation_mode = 1  # Relu
-        conv2d = op.args[0].args[0]
-    else:
-        activation_mode = 5  # Identity
-        conv2d = op.args[0]
-
-    conv_mode = 1
-    tensor_format = 0
-    algo = 1
-    pad = conv2d.attrs["padding"]
-    strides = conv2d.attrs["strides"]
-    dilation = conv2d.attrs["dilation"]
-    groups = conv2d.attrs["groups"]
-
-    oshape = cudnn.conv_output_shape(
-        tensor_format,
-        pad,
-        strides,
-        dilation,
-        inputs[0].shape,
-        inputs[1].shape,
-        inputs[0].dtype,
-        conv_dtype,
-        groups,
-    )
-
-    return te.extern(
-        oshape,
-        inputs,
-        lambda ins, outs: tvm.tir.call_packed(
-            "tvm.contrib.cudnn.conv2d+bias+act.forward",
-            conv_mode,
-            tensor_format,
-            algo,
-            pad[0],
-            pad[1],
-            strides[0],
-            strides[1],
-            dilation[0],
-            dilation[1],
-            activation_mode,
-            0,
-            ins[0],
-            ins[1],
-            ins[2],
-            outs[0],
-            conv_dtype,
-            groups,
-        ),
-        name="y",
-    )
-
-
-@lower_composite("cudnn.conv2d")
-def _lower_conv2d(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
-    """Lower a conv2d using cuDNN."""
-    return cudnn.conv_forward(
-        inputs[0],
-        inputs[1],
-        pad=op.attrs["padding"],
-        stride=op.attrs["strides"],
-        dilation=op.attrs["dilation"],
-        conv_mode=1,
-        tensor_format=0,
-        algo=1,
-        conv_dtype=op.checked_type.dtype,
-        groups=op.attrs["groups"],
-    )
diff --git a/python/tvm/relay/op/contrib/cutlass.py b/python/tvm/relay/op/contrib/cutlass.py
deleted file mode 100644
index 40fc22e9e82f..000000000000
--- a/python/tvm/relay/op/contrib/cutlass.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Patterns supported CUTLASS."""
-from functools import partial
-
-from tvm import relay
-from tvm.ir.transform import PassContext, Sequential
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.op.contrib.register import register_pattern_table  # type: ignore
-
-from ...dataflow_pattern import is_constant, is_op, wildcard
-
-
-def make_gelu_pattern(bias_out, out_dtype="float16"):
-    mul = is_op("multiply")(bias_out, is_constant() | wildcard())
-    if out_dtype == "float16":
-        erf = is_op("cast")(is_op("erf")(is_op("cast")(mul)))
-    else:
-        erf = is_op("erf")(mul)
-    mul_half = is_op("multiply")(erf, is_constant() | wildcard())
-    add = is_op("add")(mul_half, is_constant() | wildcard())
-    return is_op("multiply")(add, bias_out)
-
-
-def make_gemm_pattern(with_bias=True, with_act=None, out_dtype="float16"):
-    """Create a pattern for dense op followed by activations."""
-    data = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    gemm = is_op("nn.dense")(data, weight)
-    if with_bias:
-        add_or_bias_add = is_op("add") | is_op("nn.bias_add")
-        gemm_out = add_or_bias_add(gemm, bias)
-    else:
-        gemm_out = gemm
-
-    if with_act is None:
-        return gemm_out
-    if isinstance(with_act, str) and with_act == "relu":
-        return is_op("nn.relu")(gemm_out)
-
-    assert isinstance(with_act, str) and with_act == "gelu"
-    return make_gelu_pattern(gemm_out, out_dtype)
-
-
-def make_batch_matmul_pattern():
-    return is_op("nn.batch_matmul")(wildcard(), wildcard())
-
-
-def make_conv2d_pattern(with_bias=False, with_act=None):
-    """Create a pattern for dense op followed by activations."""
-    data = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    conv2d = is_op("nn.conv2d")(data, weight)
-    if with_bias:
-        add_or_bias_add = is_op("add") | is_op("nn.bias_add")
-        conv2d_out = add_or_bias_add(conv2d, bias)
-    else:
-        conv2d_out = conv2d
-
-    if with_act is not None:
-        if with_act == "relu":
-            return is_op("nn.relu")(conv2d_out)
-        if with_act == "sigmoid":
-            return is_op("sigmoid")(conv2d_out)
-        if with_act == "silu":
-            return is_op("multiply")(conv2d_out, is_op("sigmoid")(conv2d_out))
-        if with_act == "hardswish":
-            rhs = is_op("divide")(
-                is_op("clip")(is_op("add")(conv2d_out, is_constant())), is_constant()
-            )
-            return is_op("multiply")(conv2d_out, rhs)
-
-        raise ValueError(f"Unknown activation {with_act}.")
-
-    return conv2d_out
-
-
-def make_conv2d_transpose_pattern():
-    return is_op("nn.conv2d_transpose")(wildcard(), wildcard())
-
-
-def make_conv2d_backward_weight_pattern():
-    return is_op("nn.conv2d_backward_weight")(wildcard(), wildcard())
-
-
-def make_residual_block_pattern(tensor_op_out, binary_op="add", with_act="relu"):
-    """Add pattern for residual blocks."""
-    residual_input = wildcard()
-    binary_out = is_op(binary_op)(tensor_op_out, residual_input) | is_op(binary_op)(
-        residual_input, tensor_op_out
-    )
-
-    if with_act is not None and with_act == "relu":
-        return is_op("nn.relu")(binary_out)
-
-    return binary_out
-
-
-def check_dtype(lhs, rhs):
-    """Check if dtypes in the given workload are supported by CUTLASS."""
-    return (
-        (lhs.dtype == "float16" and rhs.dtype == "float16")
-        or (lhs.dtype == "float32" and rhs.dtype == "float32")
-        or (lhs.dtype in ["int8", "uint8"] and rhs.dtype in ["int8", "uint8"])
-    )
-
-
-def get_root_call(call, root_op_name):
-    if not isinstance(call, relay.Call):
-        return None
-    if str(call.op.name) == root_op_name:
-        return call
-    return get_root_call(call.args[0], root_op_name)
-
-
-def check_gemm(call):
-    """Check if the given dense workload can be offloaded to CUTLASS."""
-    dense = get_root_call(call, "nn.dense")
-    lhs = dense.args[0].checked_type
-    rhs = dense.args[1].checked_type
-    return check_dtype(lhs, rhs)
-
-
-def check_batch_matmul(call):
-    """Check if the given batch_matmul workload can be offloaded to CUTLASS."""
-    batch_matmul = get_root_call(call, "nn.batch_matmul")
-    lhs = batch_matmul.args[0].checked_type
-    rhs = batch_matmul.args[1].checked_type
-    transpose_a = batch_matmul.attrs.transpose_a
-    transpose_b = batch_matmul.attrs.transpose_b
-    return check_dtype(lhs, rhs) and not transpose_a and transpose_b
-
-
-def is_depthwise_conv2d(ic, oc, groups):
-    return ic == oc == groups
-
-
-def check_conv2d_common(op_name, expected_kernel_layout, call):
-    """Check if the given conv2d workload can be offloaded to CUTLASS."""
-    conv2d = get_root_call(call, op_name)
-    data_layout = conv2d.attrs.data_layout
-    kernel_layout = conv2d.attrs.kernel_layout
-    data = conv2d.args[0].checked_type
-    weight = conv2d.args[1].checked_type
-    if (
-        data_layout != "NHWC"
-        or kernel_layout != expected_kernel_layout
-        or not check_dtype(data, weight)
-    ):
-        return False
-    IC = data.shape[3]
-    OC = weight.shape[0]
-    return not is_depthwise_conv2d(IC, OC, conv2d.attrs.groups)
-
-
-def check_conv2d(call):
-    return check_conv2d_common("nn.conv2d", "OHWI", call)
-
-
-def check_conv2d_transpose(call):
-    # conv2d_transpose is implemented as dgrad, needs to swap the roles of C and K
-    return check_conv2d_common("nn.conv2d_transpose", "IHWO", call)
-
-
-def check_conv2d_backward_weight(call):
-    return check_conv2d_common("nn.conv2d_backward_weight", "NHWC", call)
-
-
-def check_conv2d_residual(call, binary_op):
-    """Check if the given conv2d workload can be offloaded to CUTLASS."""
-    conv2d = get_root_call(call, "nn.conv2d")
-    if not check_conv2d(call):
-        return False
-
-    residual_binop = get_root_call(call, binary_op)
-    lhs = residual_binop.args[0]
-    rhs = residual_binop.args[1]
-
-    # residual_input is pattern-matched as a wildcard. Make sure it does not sit between
-    # residual binary op and the root conv2d of this pattern.
-    # If the root conv2d is the parent of both lhs and rhs, we should reject this pattern.
-    if get_root_call(lhs, "nn.conv2d") == conv2d and get_root_call(rhs, "nn.conv2d") == conv2d:
-        return False
-
-    return all(x == y for (x, y) in zip(lhs.checked_type.shape, rhs.checked_type.shape))
-
-
-@register_pattern_table("cutlass")
-def pattern_table():
-    """Returns list of triples describing the name, dataflow pattern and predicate for all
-    the CUTLASS-supported operators."""
-    dense_pat = ("cutlass.dense", make_gemm_pattern(False, None), check_gemm)
-    dense_bias_pat = ("cutlass.dense_bias", make_gemm_pattern(True, None), check_gemm)
-    dense_bias_relu_pat = ("cutlass.dense_bias_relu", make_gemm_pattern(True, "relu"), check_gemm)
-    dense_bias_gelu_fp16_pat = (
-        "cutlass.dense_bias_gelu_fp16",
-        make_gemm_pattern(True, "gelu"),
-        check_gemm,
-    )
-    dense_bias_gelu_fp32_pat = (
-        "cutlass.dense_bias_gelu_fp32",
-        make_gemm_pattern(True, "gelu", out_dtype="float32"),
-        check_gemm,
-    )
-
-    dense_patterns = [
-        dense_bias_gelu_fp16_pat,
-        dense_bias_gelu_fp32_pat,
-        dense_bias_relu_pat,
-        dense_bias_pat,
-        dense_pat,
-        ("cutlass.batch_matmul", make_batch_matmul_pattern(), check_batch_matmul),
-    ]
-
-    conv2d_patterns = [
-        (
-            "cutlass.conv2d_bias_hardswish",
-            make_conv2d_pattern(with_bias=True, with_act="hardswish"),
-            check_conv2d,
-        ),
-        (
-            "cutlass.conv2d_bias_silu",
-            make_conv2d_pattern(with_bias=True, with_act="silu"),
-            check_conv2d,
-        ),
-        (
-            "cutlass.conv2d_bias_relu",
-            make_conv2d_pattern(with_bias=True, with_act="relu"),
-            check_conv2d,
-        ),
-        (
-            "cutlass.conv2d_bias_sigmoid",
-            make_conv2d_pattern(with_bias=True, with_act="sigmoid"),
-            check_conv2d,
-        ),
-        ("cutlass.conv2d_bias", make_conv2d_pattern(with_bias=True), check_conv2d),
-        ("cutlass.conv2d", make_conv2d_pattern(), check_conv2d),
-    ]
-
-    # For now, no fusion for grad kernels
-    conv2d_grad_patterns = [
-        ("cutlass.conv2d_transpose", make_conv2d_transpose_pattern(), check_conv2d_transpose),
-        (
-            "cutlass.conv2d_backward_weight",
-            make_conv2d_backward_weight_pattern(),
-            check_conv2d_backward_weight,
-        ),
-    ]
-
-    residual_block_patterns = []
-
-    for with_act, postfix in [("relu", "_relu"), (None, "")]:
-        for name, pat, _ in conv2d_patterns[:-1]:
-            for bin_op in ["add", "multiply"]:
-                residual_block_patterns.append(
-                    (
-                        name + "_residual_" + bin_op + postfix,
-                        make_residual_block_pattern(pat, bin_op, with_act=with_act),
-                        partial(check_conv2d_residual, binary_op=bin_op),
-                    )
-                )
-
-    return residual_block_patterns + dense_patterns + conv2d_patterns + conv2d_grad_patterns
-
-
-def partition_for_cutlass(mod, params=None):
-    """Partition the input module into CUTLASS-supported subgraphs."""
-
-    if params is not None:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-        remove_bn_pass = Sequential(
-            [
-                transform.InferType(),
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.FoldScaleAxis(),
-            ]
-        )
-        with PassContext(opt_level=3):
-            mod = remove_bn_pass(mod)
-
-    cutlass_patterns = relay.op.contrib.get_pattern_table("cutlass")
-
-    seq = Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(cutlass_patterns),
-            transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
-            transform.PartitionGraph(bind_constants=False),
-        ]
-    )
-
-    return seq(mod)
diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py
deleted file mode 100644
index aa54dc7c190d..000000000000
--- a/python/tvm/relay/op/contrib/dnnl.py
+++ /dev/null
@@ -1,1368 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, use-list-literal
-"""DNNL library supported operators.
-There are two ways to registering a function for an op to indicate if it is
-supported by DNNL.
-
-- The first and simplest way is to use the helper so that
-users only need to provide the operator name and a boolean value to indicate if
-it is supported. For example:
-
-    .. code-block:: python
-
-      add = _register_external_op_helper("add")
-      add = _register_external_op_helper("add", True)
-      add = _register_external_op_helper("add", False)
-
-- The other way is to implement the function by themselves to
-check the attributes of the op and decide if it should be offloaded to DNNL.
-"""
-import logging
-from functools import reduce
-
-import tvm.ir
-from tvm import relay
-from tvm.ir import Op
-from tvm.relay import expr as _expr
-from tvm.relay import transform
-from tvm.relay.analysis import analysis as _analysis
-from tvm.relay.expr import Call, GlobalVar, TupleGetItem, const
-from tvm.relay.expr_functor import ExprMutator, ExprVisitor
-
-from ... import _ffi_api
-from ...dataflow_pattern import DFPatternCallback, is_constant, is_expr, is_op, rewrite, wildcard
-from .register import register_pattern_table
-
-logger = logging.getLogger("DNNL")
-supported_post_elts = ["nn.relu", "tanh", "sigmoid", "clip", "gelu", "swish", "mish", None]
-
-
-def _register_external_op_helper(op_name, supported=True):
-    """The helper function to indicate that a given operator can be supported
-    by DNNL.
-
-    Parameters
-    ----------
-    op_name : Str
-        The name of operator that will be registered.
-
-    Returns
-    -------
-    f : callable
-        A function that returns if the operator is supported by DNNL.
-    """
-
-    @tvm.ir.register_op_attr(op_name, "target.dnnl")
-    def _func_wrapper(expr):
-        args = expr.args
-        if any([x.checked_type.dtype == "int64" for x in args]):
-            logger.info("DNNL does not support int64.")
-            return False
-        # DNNL does not support pooling with ceil_mode = True.
-        if "pool" in op_name:
-            attrs = dict(get_attrs(expr))
-            if "ceil_mode" in attrs.keys() and attrs["ceil_mode"]:
-                return False
-        return supported
-
-    return _func_wrapper
-
-
-_register_external_op_helper("nn.batch_norm")
-_register_external_op_helper("nn.conv1d")
-_register_external_op_helper("nn.conv2d")
-_register_external_op_helper("nn.conv3d")
-_register_external_op_helper("nn.conv2d_transpose")
-_register_external_op_helper("nn.conv3d_transpose")
-_register_external_op_helper("nn.dense")
-_register_external_op_helper("nn.max_pool2d")
-_register_external_op_helper("nn.avg_pool2d")
-_register_external_op_helper("nn.global_avg_pool2d")
-_register_external_op_helper("nn.max_pool3d")
-_register_external_op_helper("nn.avg_pool3d")
-_register_external_op_helper("abs")
-_register_external_op_helper("clip")
-_register_external_op_helper("exp")
-_register_external_op_helper("log")
-_register_external_op_helper("sqrt")
-_register_external_op_helper("round")
-_register_external_op_helper("nn.relu")
-_register_external_op_helper("nn.leaky_relu")
-_register_external_op_helper("tanh")
-_register_external_op_helper("sigmoid")
-_register_external_op_helper("nn.softmax")
-_register_external_op_helper("add")
-_register_external_op_helper("multiply")
-_register_external_op_helper("nn.layer_norm")
-_register_external_op_helper("nn.batch_matmul")
-
-
-def append_eltwise_ops(op, eltwise):
-    """Append element-wise post-ops to conv / conv_transpose / dense
-
-    Parameters
-    ----------
-    op : str
-        The op name to be attached with element-wise post-op.
-    eltwise : str
-        The attached elementwise post-op name.
-    Returns
-    -------
-    pattern : CallPattern
-        Call node sequence.
-    """
-    if eltwise == "gelu":
-        const1 = wildcard()
-        const2 = wildcard()
-        const3 = wildcard()
-        div = is_op("divide")(op, const1)
-        erf_val = is_op("erf")(div)
-        added_erf_val = is_op("add")(erf_val, const2)
-        mul_val = is_op("multiply")(op, added_erf_val)
-        op = is_op("multiply")(mul_val, const3)
-    elif eltwise == "swish":
-        sig_out = is_op("sigmoid")(op)
-        op = is_op("multiply")(op, sig_out)
-    elif eltwise == "mish":
-        const1 = wildcard()
-        exp = is_op("exp")(op)
-        add = is_op("add")(exp, const1)
-        log = is_op("log")(add)
-        tanh = is_op("tanh")(log)
-        op = is_op("multiply")(op, tanh)
-    elif eltwise:
-        op = is_op(eltwise)(op)
-    return op
-
-
-def make_conv_pattern(conv_name, with_bias=True, with_eltwise=None):
-    """Create patterns related to conv and conv_transpose.
-
-    Parameters
-    ----------
-    with_bias : bool
-        Whether attach `bias_add` to `conv / conv_transpose`.
-    with_eltwise : str
-        The attached elementwise post-op name.
-    Returns
-    -------
-    conv_out : CallPattern
-        Call node sequence.
-    """
-    if with_eltwise not in supported_post_elts:
-        raise ValueError(f"Unsupported eltwise post-op: {with_eltwise}")
-    data = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    conv = is_op(conv_name)(data, weight)
-    if with_bias:
-        conv_out = is_op("add")(conv, bias)
-    else:
-        conv_out = conv
-    return append_eltwise_ops(conv_out, with_eltwise)
-
-
-def make_conv_bias_sum_relu_pattern(conv_type, has_relu=True):
-    """Create patterns with sum op.
-
-    Parameters
-    ----------
-    conv_type : str
-        Should be nn.conv1d / nn.conv2d / nn.conv3d.
-    has_relu : bool
-        Whether attach relu.
-    Returns
-    -------
-    out : CallPattern
-        Call node sequence.
-    """
-    data1 = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    data2 = wildcard()
-    out = is_op(conv_type)(data1, weight)
-    out = is_op("add")(out, bias)
-    out = is_op("add")(out, data2)
-    if has_relu:
-        out = is_op("nn.relu")(out)
-    return out
-
-
-def make_dense_bias_sum_pattern():
-    """Create patterns with sum op.
-
-    Parameters
-    ----------
-    N/A
-
-    Returns
-    -------
-    out : CallPattern
-        Call node sequence.
-    """
-    data1 = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-    data2 = wildcard()
-    out = is_op("nn.dense")(data1, weight)
-    out = is_op("add")(out, bias)
-    out = is_op("add")(out, data2)
-    return "dnnl.dense_bias_sum", out
-
-
-def get_op_name(expr):
-    """Get the operator name from an expression."""
-    if isinstance(expr, Op):
-        return expr.name
-    if isinstance(expr, Call):
-        return get_op_name(expr.op)
-    if isinstance(expr, TupleGetItem):
-        return get_op_name(expr.tuple_value)
-    if isinstance(expr, relay.Tuple):
-        return get_op_name(expr.fields[0])
-    return ""
-
-
-def get_args(expr):
-    """Get the arguments from an expression."""
-    if isinstance(expr, Call):
-        return expr.args
-    if isinstance(expr, TupleGetItem):
-        return get_args(expr.tuple_value)
-    if isinstance(expr, relay.Tuple):
-        return [arg for args in map(get_args, expr.fields) for arg in args]
-    return []
-
-
-def get_attrs(expr):
-    """Get the attributes from an expression."""
-    if isinstance(expr, Call):
-        return expr.attrs
-    if isinstance(expr, TupleGetItem):
-        return get_attrs(expr.tuple_value)
-    return {}
-
-
-def make_sum_pattren_predicate(checker):
-    """Check whether the conv_bias_add_sum pattern is as expected."""
-
-    def predicate(expr):
-        if get_op_name(expr) == "nn.relu":
-            expr = expr.args[0]
-        for e, op_name in zip([expr, expr.args[0]], ["sum", "bias_add"]):
-            args = get_args(e)
-            attrs = get_attrs(e.args[0])
-            if not checker(attrs, args, op_name):
-                return False
-        return True
-
-    return predicate
-
-
-def make_bias_add_pattren_predicate(checker):
-    """Check whether the conv_bias pattern is as expected."""
-
-    def predicate(expr):
-        if get_op_name(expr) == "nn.relu":
-            expr = expr.args[0]
-        if get_op_name(expr) == "add":
-            args = get_args(expr)
-            attrs = get_attrs(expr.args[0])
-            if not checker(attrs, args, "bias_add"):
-                return False
-        return True
-
-    return predicate
-
-
-def add_checker(attrs, args, op_name):
-    """Check if add is aligned with elementwise_add and bias_add."""
-    if op_name == "sum":
-        if not isinstance(args[0].op, tvm.ir.op.Op):
-            return False
-        if args[0].op.name != "add":
-            return False
-        if tuple(get_shape(args[0])) != tuple(get_shape(args[1])):
-            return False
-    if op_name == "bias_add":
-        if attrs is None:
-            return False
-        if not isinstance(args[0].op, tvm.ir.op.Op):
-            return False
-        if args[0].op.name != "nn.conv2d":
-            return False
-        channel = dict(attrs)["channels"]
-        const_shape = get_shape(args[1])
-        if channel != reduce(lambda x, y: x * y, const_shape):
-            return False
-    return True
-
-
-def make_dense_pattern(with_bias=True, with_eltwise=None):
-    """Create patterns related to nn.dense.
-
-    Parameters
-    ----------
-    with_bias : bool
-        Whether attach `bias_add` to `nn.dense`.
-    with_eltwise : str
-        The attached elementwise post-op name.
-    Returns
-    -------
-    dense_out : CallPattern
-        Call node sequence.
-    """
-    if with_eltwise not in supported_post_elts:
-        raise ValueError(f"Unsupported eltwise post-op: {with_eltwise}")
-    data = wildcard()
-    weight = wildcard()
-    bias = wildcard()
-
-    dense = is_op("nn.dense")(data, weight)
-    if with_bias:
-        dense_out = is_op("add")(dense, bias)
-    else:
-        dense_out = dense
-    return append_eltwise_ops(dense_out, with_eltwise)
-
-
-def make_dnnl_pattern(op_name, with_bias, with_eltwise):
-    """Create dnnl patterns.
-
-    Parameters
-    ----------
-    op_name : str
-        The first call node's op name.
-    with_bias : bool
-        Whether attach `bias_add` to `nn.dense`.
-    with_eltwise : str
-        The attached elementwise post-op name.
-    Returns
-    -------
-    pattern : Tuple(pattern_name, CallPattern)
-        Created pattern name, along with its CallPattern.
-    """
-    pat_name = op_name.replace("nn", "dnnl")
-    if "_transpose" in op_name:
-        pat_name = "dnnl.deconv" + op_name.split("_")[0][-2::]
-    pat_name += "_bias" if with_bias else ""
-    pat_name += ("_" + with_eltwise.split(".")[-1]) if with_eltwise else ""
-    if "conv" in op_name:
-        dnnl_pattern = (
-            pat_name,
-            make_conv_pattern(op_name, with_bias, with_eltwise),
-            make_bias_add_pattren_predicate(add_checker),
-        )
-    elif op_name == "nn.dense":
-        dnnl_pattern = (pat_name, make_dense_pattern(with_bias, with_eltwise))
-    else:
-        logger.warning(
-            "Currently, only conv1d, conv2d, conv2d_transpose, conv3d_transpose, "
-            "dense op are supported, but got %s.",
-            op_name,
-        )
-        dnnl_pattern = ()
-    return dnnl_pattern
-
-
-def make_qnn_conv2d_pattern():
-    """Make qnn.conv2d based pattern supported by DNNL
-
-    Returns
-    -------
-    pattern : Tuple(pattern_name, CallPattern)
-        Created pattern name, along with its CallPattern.
-    """
-    data = wildcard()
-    weight = is_constant()
-    bias = is_constant()
-    o_scl = is_constant()
-    dst_zp = is_constant()
-    act_scl = is_constant()
-    sum_scl = is_constant()
-    sum_src = wildcard()
-
-    zero_zp = is_expr(const(0, dtype="int32"))
-
-    pat = is_op("qnn.conv2d")(data, weight, zero_zp, zero_zp, is_constant(), is_constant())
-    pat = is_op("cast")(pat)
-    pat = is_op("add")(pat, bias) | pat  # optional bias
-    pat = is_op("multiply")(pat, o_scl)
-    pat = is_op("clip")(pat)  # TBD, not only clip
-    pat = is_op("multiply")(pat, act_scl) | pat  # optional multiply. Ex: act_scl == 1
-    pat = is_op("add")(pat, sum_scl * is_op("cast")(sum_src)) | pat  # optional sum
-    pat = is_op("add")(pat, dst_zp) | pat  # optional dst_zp, can be dst_zp == 0
-    pat = is_op("cast")(pat)
-
-    return "dnnl.qnn.conv2d", pat
-
-
-def make_qnn_dense_pattern():
-    """Make qnn.dense based pattern supported by DNNL
-
-    Returns
-    -------
-    pattern : Tuple(pattern_name, CallPattern)
-        Created pattern name, along with its CallPattern.
-    """
-    data = wildcard()
-    weight = is_constant()
-    bias = is_constant()
-    o_scl = is_constant()
-    dst_zp = is_constant()
-    act_scl = is_constant()
-    sum_scl = is_constant()
-    sum_src = wildcard()
-
-    zero_zp = is_expr(const(0, dtype="int32"))
-
-    pat = is_op("qnn.dense")(data, weight, zero_zp, zero_zp, is_constant(), is_constant())
-    pat = is_op("cast")(pat)
-    pat = is_op("add")(pat, bias) | pat  # optional bias
-    pat = is_op("multiply")(pat, o_scl)
-    pat = is_op("clip")(pat)  # TBD, not only clip
-    pat = is_op("multiply")(pat, act_scl) | pat  # optional multiply. ex act_scl == 1
-    pat = is_op("add")(pat, sum_scl * is_op("cast")(sum_src)) | pat  # optional sum
-    pat = is_op("add")(pat, dst_zp) | pat  # optional dst_zp, can be dst_zp == 0
-    pat = is_op("cast")(pat)
-
-    return "dnnl.qnn.dense", pat
-
-
-@register_pattern_table("dnnl")
-def pattern_table():
-    """Create dnnl patterns.
-
-    Returns
-    -------
-    dnnl_patterns : List[dnnl_pattern]
-        Created patterns.
-    """
-    dnnl_patterns = list()
-    dnnl_patterns.append(make_qnn_conv2d_pattern())
-    dnnl_patterns.append(make_qnn_dense_pattern())
-    dnnl_patterns.append(make_dense_bias_sum_pattern())
-    dnnl_patterns.append(
-        (
-            "dnnl.conv2d_bias_sum_relu",
-            make_conv_bias_sum_relu_pattern("nn.conv2d"),
-            make_sum_pattren_predicate(add_checker),
-        )
-    )
-    dnnl_patterns.append(
-        (
-            "dnnl.conv2d_bias_sum",
-            make_conv_bias_sum_relu_pattern("nn.conv2d", False),
-            make_sum_pattren_predicate(add_checker),
-        )
-    )
-
-    elt_list = ["nn.relu", "tanh", "sigmoid", "clip", "gelu", "swish", "mish", None]
-    for with_bias in [True, False]:
-        for elt in elt_list:
-            if not with_bias and not elt:
-                continue
-            for conv_name in [
-                "nn.conv1d",
-                "nn.conv2d",
-                "nn.conv3d",
-                "nn.conv2d_transpose",
-                "nn.conv3d_transpose",
-            ]:
-                dnnl_patterns.append(make_dnnl_pattern(conv_name, with_bias, elt))
-            dnnl_patterns.append(make_dnnl_pattern("nn.dense", with_bias, elt))
-    return dnnl_patterns
-
-
-def get_optimal_layout_for_conv(
-    data_layout, kernel_layout, weight_shape, out_shape, paddings, strides, dilates, groups, dtype
-):
-    """Get the optimal layout of dnnl, given shape of conv2d.
-
-    Parameters
-    ----------
-    data_layout, kernel_layout,weight_shape, out_shape, paddings, strides, dilates, groups
-        : String
-          Input argument.
-
-    Returns
-    -------
-    layouts : string
-              The result.
-    """
-    return _ffi_api.get_optimal_layout_for_conv(
-        data_layout,
-        kernel_layout,
-        weight_shape,
-        out_shape,
-        paddings,
-        strides,
-        dilates,
-        groups,
-        dtype,
-    )
-
-
-def get_optimal_layout_for_conv_transpose(
-    data_layout,
-    kernel_layout,
-    weight_shape,
-    out_shape,
-    paddings,
-    output_paddings,
-    strides,
-    dilates,
-    groups,
-    dtype,
-):
-    """Get the optimal layout of dnnl, given shape of tranposed conv2d.
-
-    Parameters
-    ----------
-    data_layout, kernel_layout, weight_shape, out_shape, paddings, output_paddings, strides,
-    dilates, groups
-        : Int, String
-          Input argument.
-
-    Returns
-    -------
-    layouts : string
-              The result.
-    """
-    return _ffi_api.get_optimal_layout_for_conv_transpose(
-        data_layout,
-        kernel_layout,
-        weight_shape,
-        out_shape,
-        paddings,
-        output_paddings,
-        strides,
-        dilates,
-        groups,
-        dtype,
-    )
-
-
-def get_shape(tensor):
-    """Get tensor's shape."""
-    if isinstance(tensor, relay.expr.Var):
-        return tensor.type_annotation.concrete_shape
-    if isinstance(tensor, relay.expr.Constant):
-        return tensor.data.shape
-    if isinstance(tensor, tvm.ir.tensor_type.TensorType):
-        return tensor.concrete_shape
-    if isinstance(tensor, tvm.ir.container.Array):
-        return tensor[-1].shape
-    if isinstance(tensor, relay.expr.Call):
-        if tensor.op.name == "multiply":
-            return tensor.type_args[0].shape
-        return tensor.checked_type.shape
-    raise TypeError(f"Unsupport data type: {type(tensor)}")
-
-
-def get_dtype(tensor):
-    """Get tensor's dtype."""
-    if isinstance(tensor, relay.expr.Var):
-        return tensor.type_annotation.dtype
-    if isinstance(tensor, relay.expr.Constant):
-        return tensor.data.dtype
-    if isinstance(tensor, tvm.ir.tensor_type.TensorType):
-        return tensor.dtype
-    if isinstance(tensor, tvm.ir.container.Array):
-        return tensor[-1].dtype
-    if isinstance(tensor, relay.expr.Call):
-        if tensor.op.name == "multiply":
-            return tensor.type_args[0].dtype
-        return tensor.checked_type.dtype
-    raise TypeError(f"Unsupport data type: {type(tensor)}")
-
-
-def tag2layout(input_data, is_weight=False, conv_type="Conv1D"):
-    """Transfer layout, denoted with `a, b, c, d, e`,
-    into valid layout (NCHW / OIHW) of TVM."""
-    if "Conv1D" in conv_type:
-        data_dic = {"a": "N", "b": "C", "c": "W"}
-        weight_dic = {"a": "O", "b": "I", "c": "W", "d": "G"}
-    elif "Conv2D" in conv_type:
-        data_dic = {"a": "N", "b": "C", "c": "H", "d": "W"}
-        weight_dic = {"a": "O", "b": "I", "c": "H", "d": "W"}
-        if "e" in input_data:
-            weight_dic = {"a": "G", "b": "O", "c": "I", "d": "H", "e": "W"}
-    elif "Conv3D" in conv_type:
-        data_dic = {"a": "N", "b": "C", "c": "D", "d": "H", "e": "W"}
-        weight_dic = {"a": "O", "b": "I", "c": "D", "d": "H", "e": "W", "f": "G"}
-
-    dic = weight_dic if is_weight else data_dic
-    res = ""
-
-    for i in input_data:
-        if i.isupper():
-            i = i.lower()
-            res += dic[i]
-            dic[i] = dic[i].lower()
-        elif i.islower():
-            res += dic[i]
-        elif i.isdigit():
-            res += i
-        else:
-            raise ValueError(f"Unsupport layout format: {input_data}")
-
-    return res
-
-
-def legalize_pad_avg_pool(attrs, inputs, types):
-    """Legalize pad->avg_pool2d pattern.
-    Fuse this pattern into one avg_pool2d with padding = (1, 1),
-    and count_include_pad = True"""
-    data = inputs[0]
-    new_attrs = dict(attrs)
-    if isinstance(data, relay.expr.Call) and data.op.name == "nn.pad":
-        new_attrs["padding"] = (1, 1)
-        new_attrs["count_include_pad"] = True
-        return relay.nn.avg_pool2d(data.args[0], **new_attrs)
-    return relay.nn.avg_pool2d(data, **attrs)
-
-
-def legalize_group_conv(attrs, inputs, types):
-    """Legalize group conv / conv_transpose calculation.
-    Alter weight layout from OIHW to GOIHW / IOHW to GIOHW"""
-    groups = attrs.groups
-    data, weight = inputs
-    if groups == 1:
-        if "Transpose" not in type(attrs).__name__:
-            return relay.nn.conv2d(data, weight, **attrs)
-        return relay.nn.conv2d_transpose(data, weight, **attrs)
-    OC, IC, H, W = get_shape(weight)
-    new_attrs = dict(attrs)
-    weight = relay.reshape(weight, (groups, OC // groups, IC, H, W))
-    if "Transpose" not in type(attrs).__name__:
-        new_attrs["kernel_layout"] = "GOIHW"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-    new_attrs["kernel_layout"] = "GIOHW"
-    return relay.nn.conv2d_transpose(data, weight, **new_attrs)
-
-
-def alter_conv(attrs, inputs, tinfos, out_type):
-    """The convolution's layout auto-query func for dnnl."""
-
-    data, weight = inputs
-    groups = str(attrs.groups)
-    weight_shape = ",".join([str(x) for x in get_shape(weight)])
-    out_shape = ",".join([str(x) for x in get_shape(out_type)])
-    paddings = ",".join([str(x) for x in attrs.get_int_tuple("padding")])
-    strides = ",".join([str(x) for x in attrs.get_int_tuple("strides")])
-    dilates = ",".join([str(x) for x in attrs.get_int_tuple("dilation")])
-    dtype = get_dtype(weight)
-    new_attrs = dict(attrs)
-    conv_type = type(attrs).__name__.split("Attrs")[0]
-
-    res = get_optimal_layout_for_conv(
-        attrs["data_layout"],
-        attrs["kernel_layout"],
-        weight_shape,
-        out_shape,
-        paddings,
-        strides,
-        dilates,
-        groups,
-        dtype,
-    )
-    src_df, weight_df, dst_df = res.split(",")
-    new_attrs["data_layout"] = tag2layout(src_df, is_weight=False, conv_type=conv_type)
-    new_attrs["kernel_layout"] = tag2layout(weight_df, is_weight=True, conv_type=conv_type)
-    new_attrs["out_layout"] = tag2layout(dst_df, is_weight=False, conv_type=conv_type)
-
-    if conv_type == "Conv1D":
-        return relay.nn.conv1d(data, weight, **new_attrs)
-    if conv_type == "Conv2D":
-        return relay.nn.conv2d(data, weight, **new_attrs)
-    return relay.nn.conv3d(data, weight, **new_attrs)
-
-
-def alter_conv_transpose(attrs, inputs, tinfos, out_type):
-    """The transposed convolution's layout auto-query func for dnnl."""
-
-    data, weight = inputs
-    weight_shape = ",".join([str(x) for x in get_shape(weight)])
-    out_shape = ",".join([str(x) for x in get_shape(out_type)])
-    paddings = ",".join([str(x) for x in attrs.get_int_tuple("padding")])
-    output_paddings = ",".join([str(x) for x in attrs.get_int_tuple("output_padding")])
-    strides = ",".join([str(x) for x in attrs.get_int_tuple("strides")])
-    dilates = ",".join([str(x) for x in attrs.get_int_tuple("dilation")])
-    groups = str(attrs.groups)
-    dtype = get_dtype(weight)
-    new_attrs = dict(attrs)
-    conv_type = type(attrs).__name__.split("Attrs")[0]
-
-    res = get_optimal_layout_for_conv_transpose(
-        attrs["data_layout"],
-        attrs["kernel_layout"],
-        weight_shape,
-        out_shape,
-        paddings,
-        output_paddings,
-        strides,
-        dilates,
-        groups,
-        dtype,
-    )
-    src_df, weight_df, dst_df = res.split(",")
-    new_attrs["data_layout"] = tag2layout(src_df, is_weight=False, conv_type=conv_type)
-    new_attrs["kernel_layout"] = tag2layout(weight_df, is_weight=True, conv_type=conv_type)
-    new_attrs["out_layout"] = tag2layout(dst_df, is_weight=False, conv_type=conv_type)
-
-    if conv_type == "Conv1DTranspose":
-        return relay.nn.conv1d_transpose(data, weight, **new_attrs)
-    if conv_type == "Conv2DTranspose":
-        return relay.nn.conv2d_transpose(data, weight, **new_attrs)
-    return relay.nn.conv3d_transpose(data, weight, **new_attrs)
-
-
-class IsComputeIntensiveGraph(ExprVisitor):
-    """
-    Visits the Graph recursively and checks if it contains compute heavy ops like convolutions and
-    its transpose and dense.
-    """
-
-    def __init__(self):
-        ExprVisitor.__init__(self)
-        self.is_compute_intensive = False
-
-    def visit_call(self, call):
-        compute_intensive_ops = set(
-            [
-                "nn.conv1d",
-                "nn.conv2d",
-                "nn.conv2d_transpose",
-                "nn.conv3d",
-                "nn.conv3d_transpose",
-                "nn.dense",
-                "nn.layer_norm",
-                "nn.batch_matmul",
-                "nn.global_avg_pool2d",
-            ]
-        )
-        if isinstance(call.op, tvm.tir.op.Op):
-            if str(call.op.name) in compute_intensive_ops:
-                self.is_compute_intensive = True
-
-        return super().visit_call(call)
-
-    def is_graph_compute_intensive(self, subgraph) -> bool:
-        """
-        This function recursively visits the graph and checks if it's compute intensive"
-        """
-        self.visit(subgraph)
-        return self.is_compute_intensive
-
-
-def is_valid_subgraph(body):
-    """Final check on whether the subgraph is valid and should be offloaded to DNNL."""
-    return IsComputeIntensiveGraph().is_graph_compute_intensive(body)
-
-
-def prune_dnnl_subgraphs(mod):
-    """
-    Removes invalid subgraphs, which does not contain compute intensive dnnl ops.
-    """
-
-    class SubgraphRemover(ExprMutator):
-        """
-        Reverts subgraphs in subgraphs_to_remove back to TVM instead of using an external codegen.
-        """
-
-        def __init__(self, subgraphs_to_remove, mod, new_mod):
-            ExprMutator.__init__(self)
-            self.subgraphs_to_remove = subgraphs_to_remove
-            self.mod = mod
-            self.new_mod = new_mod
-
-        def visit_call(self, call):
-            if isinstance(call.op, GlobalVar):
-                name = call.op.name_hint
-                if name in self.subgraphs_to_remove:
-                    # "Inline" the subgraph back into new main function.
-                    func = self.mod[name]
-                    var_map = {}
-                    for arg, param in zip(call.args, func.params):
-                        var_map[param] = super().visit(arg)
-                    new_body = relay.bind(func.body, var_map)
-                    return new_body
-                if name != "main":
-                    args = []
-                    for arg in call.args:
-                        args.append(super().visit(arg))
-                    return call.op(*args)
-            return super().visit_call(call)
-
-    subgraphs_to_remove = []
-    # If only one subgraph, do nothing.
-    if len(mod.get_global_vars()) <= 2:
-        return mod
-    # Remove invalid subgraphs
-    for subgraph in mod.get_global_vars():
-        name = subgraph.name_hint
-        if not mod[name].attrs or mod[name].attrs["Compiler"] != "dnnl":
-            continue
-        if not is_valid_subgraph(mod[name].body):
-            subgraphs_to_remove.append(name)
-    # Create new pruned module
-    new_mod = tvm.IRModule(mod.functions, mod.type_definitions)
-    new_mod["main"] = SubgraphRemover(subgraphs_to_remove, mod, new_mod).visit(mod["main"])
-    new_mod = transform.RemoveUnusedFunctions()(new_mod)
-    return new_mod
-
-
-class LayerNormRewrite(DFPatternCallback):
-    """
-    A callback to rewrite the following operators into a single layer normalization operator.
-
-    Pattern #1:
-    1   %4 = mean(%3, axis=[-1], keepdims=True) /* ty=Tensor[(1, 3136, 1), float32] */;
-    2   %5 = subtract(%3, %4) /* ty=Tensor[(1, 3136, 64), float32] */;
-    3   %6 = cast(%5, dtype="float32") /* ty=Tensor[(1, 3136, 64), float32] */;
-    4   %7 = power(%6, 2f /* ty=float32 */) /* ty=Tensor[(1, 3136, 64), float32] */;
-    5   %8 = mean(%7, axis=[-1], keepdims=True) /* ty=Tensor[(1, 3136, 1), float32] */;
-    6   %9 = add(%8, 1e-05f /* ty=float32 */) /* ty=Tensor[(1, 3136, 1), float32] */;
-    7   %10 = sqrt(%9) /* ty=Tensor[(1, 3136, 1), float32] */;
-    8   %11 = divide(%5, %10) /* ty=Tensor[(1, 3136, 64), float32] */;
-    9   %12 = multiply(%11, meta[relay.Constant][2] /* ty=Tensor[(64), float32] */)
-            /* ty=Tensor[(1, 3136, 64), float32] */;
-    10   %13 = add(%12, meta[relay.Constant][3] /* ty=Tensor[(64), float32] */)
-            /* ty=Tensor[(1, 3136, 64), float32] */;
-
-    Pattern #2:
-    1   %0 = mean(%input, axis=[-1], keepdims=True);
-    2   %1 = variance(%input, %0, axis=[-1], keepdims=True);
-    3   %2 = add(%1, 1e-05f /* ty=float32 */) /* ty=Tensor[(1, 49, 1), float32] */;
-    4   %3 = subtract(%input, %0);
-    5   %4 = sqrt(%2) /* ty=Tensor[(1, 49, 1), float32] */;
-    6   %5 = divide(%3, %4);
-    7   %6 = multiply(%5, meta[relay.Constant][0] /* ty=Tensor[(64), float32] */)
-            /* ty=Tensor[(1, 49, 64), float32] */;
-    8   %7 = add(%6, meta[relay.Constant][1] /* ty=Tensor[(64), float32] */)
-            /* ty=Tensor[(1, 49, 64), float32] */
-
-    """
-
-    def __init__(self):
-        super(LayerNormRewrite, self).__init__()
-        self.data = wildcard()
-        self.gamma = wildcard()
-        self.beta = wildcard()
-        mu = is_op("mean")(self.data)
-        diff = is_op("subtract")(self.data, mu)
-        cdiff = diff | is_op("cast")(diff)
-        const_two = is_expr(relay.const(2)) | is_expr(relay.const(2.0))
-        p1 = is_op("power")(cdiff, const_two)
-        mp1 = is_op("mean")(p1) | is_op("variance")(self.data, mu)
-        eps = is_expr(relay.const(1e-5)) | is_expr(relay.const(1e-6))
-        added_eps = is_op("add")(mp1, eps)
-        deno = is_op("sqrt")(added_eps)
-        div_out = is_op("divide")(diff, deno)
-        div_out2 = diff * is_op("rsqrt")(added_eps)
-        weighted = is_op("multiply")(div_out | div_out2, self.gamma)
-        added_bias = is_op("add")(weighted, self.beta)
-        self.pattern = added_bias
-
-    def callback(self, pre, post, node_map):
-        data = node_map[self.data][0]
-        gamma = node_map[self.gamma][0]
-        beta = node_map[self.beta][0]
-        return relay.op.nn.layer_norm(data=data, gamma=gamma, beta=beta)
-
-
-def rewrite_layer_norm(mod):
-    """Rewrite the input graph to replace multiple operators with a TVM native layer normalization
-    operator so that we can offload them to dnnl layer normalization byoc part.
-    """
-    mod["main"] = rewrite(LayerNormRewrite(), mod["main"])
-    return mod
-
-
-class DenseReshapeBiasGeluRewrite(DFPatternCallback):
-    """
-    A callback to reorder reshape operators when the patterns are as below:
-
-    Pattern #1:
-    1   %62 = nn.dense(%61, meta[relay.Constant][13] /* ty=Tensor[(64, 64), float32] */,
-                units=None, out_dtype="float32") /* ty=Tensor[(3136, 64), float32] */;
-    2   %63 = reshape(%62, newshape=[1, 3136, 64]) /* ty=Tensor[(1, 3136, 64), float32] */;
-    3   %64 = add(meta[relay.Constant][4] /* ty=Tensor[(64), float32] */, %63)
-                /* ty=Tensor[(1, 3136, 64), float32] */;
-
-    Pattern #2:
-    1   %76 = nn.dense(%75, meta[relay.Constant][18] /* ty=Tensor[(512, 64), float32] */,
-                units=None, out_dtype="float32") /*  ty=Tensor[(3136, 512), float32] */;
-    2   %77 = reshape(%76, newshape=[1, 3136, 512]) /* ty=Tensor[(1, 3136, 512), float32] */;
-    3   %78 = add(meta[relay.Constant][15] /* ty=Tensor[(512), float32] */, %77)
-                /* ty=Tensor[(1, 3136, 512), float32] */;
-    4   %79 = divide(%78, 1.41421f /* ty=float32 */) /* ty=Tensor[(1, 3136, 512), float32] */;
-    5   %80 = erf(%79) /* ty=Tensor[(1, 3136, 512), float32] */;
-    6   %81 = add(%80, 1f /* ty=float32 */) /* ty=Tensor[(1, 3136, 512), float32] */;
-    7   %82 = multiply(%78, %81) /* ty=Tensor[(1, 3136, 512), float32] */;
-    8   %83 = multiply(%82, 0.5f /* ty=float32 */) /* ty=Tensor[(1, 3136, 512), float32] */;
-    """
-
-    def __init__(self, has_gelu=True):
-        super(DenseReshapeBiasGeluRewrite, self).__init__()
-        self.data = wildcard()
-        self.weight = wildcard()
-        self.bias = wildcard()
-        self.const1 = wildcard()
-        self.const2 = wildcard()
-        self.const3 = wildcard()
-
-        self.attr_map = {}
-        self.has_gelu = has_gelu
-
-        den = is_op("nn.dense")(self.data, self.weight)
-        re_den = is_op("reshape")(den)
-        added = is_op("add")(self.bias, re_den)
-        if self.has_gelu:
-            divisor = is_op("divide")(added, self.const1)
-            val_erf = is_op("erf")(divisor)
-            added_erf = is_op("add")(val_erf, self.const2)
-            mul1 = is_op("multiply")(added, added_erf)
-            mul2 = is_op("multiply")(mul1, self.const3)
-            self.pattern = mul2
-        else:
-            self.pattern = added
-
-    def get_attr(self, pre):
-        """Recursively retrieve attributes from reshape operator."""
-
-        def visit_func(expr):
-            if isinstance(expr, _expr.Call) and expr.op == relay.op.get("reshape"):
-                new_attrs = {}
-                for k in expr.attrs.keys():
-                    new_attrs[k] = expr.attrs[k]
-                self.attr_map["reshape"] = new_attrs
-
-        _analysis.post_order_visit(pre, visit_func)
-
-    def callback(self, pre, post, node_map):
-        self.get_attr(pre)
-
-        data = node_map[self.data][0]
-        weight = node_map[self.weight][0]
-        bias = node_map[self.bias][0]
-
-        den = relay.op.nn.dense(data, weight)
-        added = relay.op.add(bias, den)
-        if not self.has_gelu:
-            return relay.op.reshape(added, self.attr_map["reshape"]["newshape"])
-
-        const1 = node_map[self.const1][0]
-        const2 = node_map[self.const2][0]
-        const3 = node_map[self.const3][0]
-
-        divisor = relay.op.divide(added, const1)
-        val_erf = relay.op.erf(divisor)
-        added_erf = relay.op.add(val_erf, const2)
-        mul1 = relay.op.multiply(added, added_erf)
-        mul2 = relay.op.multiply(mul1, const3)
-        return relay.op.reshape(mul2, self.attr_map["reshape"]["newshape"])
-
-
-def rewrite_dense_bias_gelu_reshape_last(mod):
-    """Rewrite the input graph to reorder reshape operators so that
-    we can perform dense_bias_gelu/dense_bias fusion and then offload
-    them to byoc part.
-    """
-    mod["main"] = rewrite(
-        [DenseReshapeBiasGeluRewrite(), DenseReshapeBiasGeluRewrite(has_gelu=False)], mod["main"]
-    )
-    return mod
-
-
-class ResNetV1Rewrite(DFPatternCallback):
-    """
-    A callback to advance downsize operation when the patterns are as pattern1,
-    and the result is written in pattern2:
-    Pattern #1:
-    %26 = nn.conv2d(%25, ty=Tensor[(64, 256, 1, 1));
-    %27 = add(%26, ty=Tensor[(64, 1, 1));
-    %28 = nn.relu(%27);
-
-    %29 = nn.conv2d(%28, ty=Tensor[(64, 64, 3, 3));
-    %30 = add(%29, ty=Tensor[(64, 1, 1));
-    %31 = nn.relu(%30);
-
-    %32 = nn.conv2d(%31, ty=Tensor[(256, 64, 1, 1));
-    %33 = add(%32, ty=Tensor[(256, 1, 1));
-    %34 = add(%33, %25);
-    %35 = nn.relu(%34);
-
-    %36 = nn.conv2d(%35, ty=Tensor[(128, 256, 1, 1), strides=[2, 2]);
-    %37 = add(%36, ty=Tensor[(128, 1, 1));
-    %38 = nn.relu(%37);
-
-    %39 = nn.conv2d(%38, ty=Tensor[(128, 128, 3, 3));
-    %40 = add(%39, ty=Tensor[(128, 1, 1)]);
-    %41 = nn.relu(%40);
-
-    %42 = nn.conv2d(%41, ty=Tensor[(512, 128, 1, 1));
-    %43 = nn.conv2d(%35, ty=Tensor[(512, 256, 1, 1), strides=[2, 2]);
-    %44 = add(%42, ty=Tensor[(512, 1, 1));
-    %45 = add(%43, ty=Tensor[(512, 1, 1));
-
-    %46 = add(%44, %45);
-    %47 = nn.relu(%46);
-    Pattern #2:
-    %26 = nn.conv2d(%25, ty=Tensor[(64, 256, 1, 1));
-    %27 = add(%26, ty=Tensor[(64, 1, 1));
-    %28 = nn.relu(%27);
-
-    %29 = nn.conv2d(%28, ty=Tensor[(64, 64, 3, 3), strides=[2, 2]);
-    %30 = add(%29, ty=Tensor[(64, 1, 1));
-    %31 = nn.relu(%30);
-
-    %32 = nn.conv2d(%31, ty=Tensor[(256, 64, 1, 1));
-    %33 = add(%32, ty=Tensor[(256, 1, 1));
-    %34 = nn.max_pool2d(%25, pool_size=[1, 1], strides=[2, 2], padding=[0, 0, 0, 0]);
-    %35 = add(%33, %34);
-    %36 = nn.relu(%35);
-
-    %37 = nn.conv2d(%36, ty=Tensor[(128, 256, 1, 1));
-    %38 = add(%37, ty=Tensor[(128, 1, 1));
-    %39 = nn.relu(%38);
-
-    %40 = nn.conv2d(%39, ty=Tensor[(128, 128, 3, 3));
-    %41 = add(%40, ty=Tensor[(128, 1, 1));
-    %42 = nn.relu(%41);
-
-    %43 = nn.conv2d(%42, ty=Tensor[(512, 128, 1, 1));
-    %44 = nn.conv2d(%36, ty=Tensor[(512, 256, 1, 1));
-    %45 = add(%43, ty=Tensor[(512, 1, 1));
-    %46 = add(%44, ty=Tensor[(512, 1, 1));
-    %47 = add(%45, %46);
-    %48 = nn.relu(%47);
-    """
-
-    def __init__(self):
-        super(ResNetV1Rewrite, self).__init__()
-        self.attr_lst = []
-        self.data = wildcard()
-        self.w1, self.b1 = wildcard(), wildcard()
-        self.w2, self.b2 = wildcard(), wildcard()
-        self.w3, self.b3 = wildcard(), wildcard()
-        self.w4, self.b4 = wildcard(), wildcard()
-        self.w5, self.b5 = wildcard(), wildcard()
-        self.w6, self.b6 = wildcard(), wildcard()
-        self.w7, self.b7 = wildcard(), wildcard()
-
-        conv1 = is_op("nn.conv2d")(self.data, self.w1).has_attr({"kernel_size": [1, 1]})
-        conv1 = is_op("add")(conv1, self.b1)
-        conv1 = is_op("nn.relu")(conv1)
-
-        conv2 = is_op("nn.conv2d")(conv1, self.w2).has_attr({"kernel_size": [3, 3]})
-        conv2 = is_op("add")(conv2, self.b2)
-        conv2 = is_op("nn.relu")(conv2)
-
-        conv3 = is_op("nn.conv2d")(conv2, self.w3).has_attr({"kernel_size": [1, 1]})
-        conv3 = is_op("add")(conv3, self.b3)
-        conv3 = is_op("add")(conv3, self.data)
-        conv3 = is_op("nn.relu")(conv3)
-
-        left_conv4 = is_op("nn.conv2d")(conv3, self.w4).has_attr({"strides": [2, 2]})
-        left_conv4 = is_op("add")(left_conv4, self.b4)
-        left_conv4 = is_op("nn.relu")(left_conv4)
-
-        left_conv5 = is_op("nn.conv2d")(left_conv4, self.w5).has_attr({"kernel_size": [3, 3]})
-        left_conv5 = is_op("add")(left_conv5, self.b5)
-        left_conv5 = is_op("nn.relu")(left_conv5)
-
-        left_conv6 = is_op("nn.conv2d")(left_conv5, self.w6).has_attr({"kernel_size": [1, 1]})
-        left_conv6 = is_op("add")(left_conv6, self.b6)
-
-        right_conv7 = is_op("nn.conv2d")(conv3, self.w7).has_attr({"strides": [2, 2]})
-        right_conv7 = is_op("add")(right_conv7, self.b7)
-
-        out = is_op("add")(left_conv6, right_conv7)
-        out = is_op("nn.relu")(out)
-        self.pattern = out
-
-    def get_attr(self, pre):
-        """Recursively retrieve attributes from reshape operator."""
-
-        def visit_func(expr):
-            if isinstance(expr, _expr.Call) and expr.op == relay.op.get("nn.conv2d"):
-                self.attr_lst.append(expr.attrs)
-
-        _analysis.post_order_visit(pre, visit_func)
-
-    def callback(self, pre, post, node_map):
-        self.get_attr(pre)
-        data = node_map[self.data][0]
-        w1, b1 = node_map[self.w1][0], node_map[self.b1][0]
-        w2, b2 = node_map[self.w2][0], node_map[self.b2][0]
-        w3, b3 = node_map[self.w3][0], node_map[self.b3][0]
-        w4, b4 = node_map[self.w4][0], node_map[self.b4][0]
-        w5, b5 = node_map[self.w5][0], node_map[self.b5][0]
-        w6, b6 = node_map[self.w6][0], node_map[self.b6][0]
-        w7, b7 = node_map[self.w7][0], node_map[self.b7][0]
-
-        new_attrs = self.attr_lst[-7]
-        conv1 = relay.op.nn.conv2d(data, w1, **new_attrs)
-        conv1 = relay.op.add(conv1, b1)
-        conv1 = relay.op.nn.relu(conv1)
-
-        new_attrs = dict(self.attr_lst[-6])
-        new_attrs["strides"] = [2, 2]
-        conv2 = relay.op.nn.conv2d(conv1, w2, **new_attrs)
-        conv2 = relay.op.add(conv2, b2)
-        conv2 = relay.op.nn.relu(conv2)
-
-        new_attrs = self.attr_lst[-5]
-        conv3 = relay.op.nn.conv2d(conv2, w3, **new_attrs)
-        conv3 = relay.op.add(conv3, b3)
-        max_pool = relay.op.nn.max_pool2d(
-            data, pool_size=(1, 1), strides=(2, 2), layout=new_attrs["data_layout"]
-        )
-        conv3 = relay.op.add(conv3, max_pool)
-        conv3 = relay.op.nn.relu(conv3)
-
-        new_attrs = dict(self.attr_lst[-4])
-        new_attrs["strides"] = [1, 1]
-        left_conv4 = relay.op.nn.conv2d(conv3, w4, **new_attrs)
-        left_conv4 = relay.op.add(left_conv4, b4)
-        left_conv4 = relay.op.nn.relu(left_conv4)
-
-        new_attrs = self.attr_lst[-3]
-        left_conv5 = relay.op.nn.conv2d(left_conv4, w5, **new_attrs)
-        left_conv5 = relay.op.add(left_conv5, b5)
-        left_conv5 = relay.op.nn.relu(left_conv5)
-
-        new_attrs = self.attr_lst[-2]
-        left_conv6 = relay.op.nn.conv2d(left_conv5, w6, **new_attrs)
-        left_conv6 = relay.op.add(left_conv6, b6)
-
-        new_attrs = dict(self.attr_lst[-1])
-        new_attrs["strides"] = [1, 1]
-        right_conv7 = relay.op.nn.conv2d(conv3, w7, **new_attrs)
-        right_conv7 = relay.op.add(right_conv7, b7)
-
-        out = relay.op.add(left_conv6, right_conv7)
-        out = relay.op.nn.relu(out)
-        self.attr_lst = []
-        return out
-
-
-def rewrite_resnetv1(mod):
-    """Rewrite the ResNetV1 downsize block to reduce the computation complexity."""
-    mod["main"] = rewrite(ResNetV1Rewrite(), mod["main"])
-    return mod
-
-
-class LegalizeQnnOpForDnnl(DFPatternCallback):
-    """Legalize QNN based patterns to match DNNL
-
-    original pattern:
-      OP = qnn.dense | qnn.conv2d
-      %1 = OP<int>(SRC, WGH) - OP<int>(src_zp, WGH)   // qnn.conv2d
-      %2 = %1 + orig_bias                             // bias
-      %2 = (%1 - rq_in_zp) * rq_in_scl / rq_out_scl + rq_out_zp  // qnn.requantize
-      %3 = act(%2)                                               // activation == clip
-      %4 = ((%3 - sum_lh_zp) * sum_lh_scl + (SRC2 - sum_rh_zp) * sum_rh_scl)  // qnn.add
-           / sum_out_scl + sum_out_zp
-
-    transform to DNNL compatible:
-      %1 = OP<int>(SRC, WGH)
-      %2 = cast(%1, dtype="float")
-      %2 = (%1 + bias) * o_scl
-      %3 = act(%2) * act_scl
-      %4 = %3 + SRC2 * sum_scl
-      %5 = %4 + dst_zp
-      %6 = cast(%5, dtype="float")
-
-    where:
-      o_scl = rq_in_scl / rq_out_scl
-      act_scl = sum_lhs_scl / sum_out_scl
-      sum_scl = sum_rhs_scl / sum_out_scl
-      bias = orig_bias - OP(src_zp, WGH) - rq_in_zp + rq_out_zp * rq_out_scl / rq_in_scl
-      dst_zp = sum_out_zp - sum_lhs_zp * sum_lhs_scl / sum_out_scl -
-               sum_rhs_zp * sum_rhs_scl / sum_out_scl
-    """
-
-    def __init__(self):
-        super(LegalizeQnnOpForDnnl, self).__init__()
-        self.src = wildcard()
-        self.wgh = wildcard()
-        self.bias = wildcard()
-        self.sum_src = wildcard()
-
-        self.src_scl = is_constant()
-        self.src_zp = is_constant()
-        self.wgh_scl = is_constant()
-        self.wgh_zp = is_expr(const(0))
-
-        self.rq_in_scl = is_constant()
-        self.rq_in_zp = is_constant()
-        self.rq_out_scl = is_constant()
-        self.rq_out_zp = is_constant()
-
-        self.sum_lhs_scl = is_constant()
-        self.sum_lhs_zp = is_constant()
-        self.sum_rhs_scl = is_constant()
-        self.sum_rhs_zp = is_constant()
-        self.sum_out_scl = is_constant()
-        self.sum_out_zp = is_constant()
-
-        self.root = (is_op("qnn.conv2d") | is_op("qnn.dense"))(
-            self.src, self.wgh, self.src_zp, self.wgh_zp, self.src_scl, self.wgh_scl
-        )
-        pat = is_op("add")(self.root, self.bias) | self.root  # optional bias
-        pat = is_op("qnn.requantize")(
-            pat, self.rq_in_scl, self.rq_in_zp, self.rq_out_scl, self.rq_out_zp
-        )
-        pat = is_op("clip")(pat)
-        cast = is_op("cast")(pat)
-        pat = is_op("qnn.add")(
-            cast,
-            self.sum_src,
-            self.sum_lhs_scl,
-            self.sum_lhs_zp,
-            self.sum_rhs_scl,
-            self.sum_rhs_zp,
-            self.sum_out_scl,
-            self.sum_out_zp,
-        )
-        pat = is_op("clip")(pat)
-        self.pattern = pat | cast
-
-    def callback(self, pre, post, node_map):
-        root = node_map[self.root][0]
-        src = node_map[self.src][0]
-        wgh = node_map[self.wgh][0]
-        bias = node_map.get(self.bias, default=[relay.const(0, dtype="int32")])[0]
-        src_zp = node_map[self.src_zp][0]
-        rq_in_scl = node_map[self.rq_in_scl][0]
-        rq_in_zp = node_map[self.rq_in_zp][0]
-        rq_out_scl = node_map[self.rq_out_scl][0]
-        rq_out_zp = node_map[self.rq_out_zp][0]
-
-        final_dtype = node_map[self.pattern][0].checked_type.dtype
-
-        if root.op == relay.op.get("qnn.conv2d"):
-            dst_layout = root.attrs.out_layout
-            dst_layout = root.attrs.data_layout if dst_layout == "" else dst_layout
-            wgh_layout = root.attrs.kernel_layout
-        else:
-            # qnn.dense has no layout attributes. Assume that is plain
-            dst_layout = "NC"
-            wgh_layout = "OI"
-
-        # TODO(@apeskov): dst_layout may ne blocked
-        bias_rank = len(dst_layout) - dst_layout.index("C")
-
-        sum_src = node_map[self.sum_src][0] if self.sum_src in node_map else None
-        # Default values if qnn.sum is not present
-        sum_lhs_scl = node_map[self.sum_lhs_scl][0] if sum_src else relay.const(1, dtype="float32")
-        sum_lhs_zp = node_map[self.sum_lhs_zp][0] if sum_src else relay.const(0, dtype="int32")
-        sum_rhs_scl = node_map[self.sum_rhs_scl][0] if sum_src else relay.const(0, dtype="float32")
-        sum_rhs_zp = node_map[self.sum_rhs_zp][0] if sum_src else relay.const(0, dtype="int32")
-        sum_out_scl = node_map[self.sum_out_scl][0] if sum_src else relay.const(1, dtype="float32")
-        sum_out_zp = node_map[self.sum_out_zp][0] if sum_src else relay.const(0, dtype="int32")
-
-        def cast_fp(op):
-            return relay.op.cast(op, dtype="float32")
-
-        # recalculate some factors
-        o_scl = rq_in_scl / rq_out_scl
-        act_scl = sum_lhs_scl / sum_out_scl
-        sum_scl = sum_rhs_scl / sum_out_scl
-        dst_zp = (
-            cast_fp(sum_out_zp)
-            - cast_fp(sum_lhs_zp) * sum_lhs_scl / sum_out_scl
-            - cast_fp(sum_rhs_zp) * sum_rhs_scl / sum_out_scl
-        )
-        bias = self.squeeze_bias(bias, dst_layout)
-        bias = (
-            cast_fp(bias)
-            - cast_fp(self.fake_op(src_zp, wgh, wgh_layout))
-            - cast_fp(rq_in_zp)
-            + cast_fp(rq_out_zp) * rq_out_scl / rq_in_scl
-        )
-        bias = self.broadcast_to_rank(bias, bias_rank)
-
-        zero_zp = relay.const(0, dtype="int32")
-        one_scl = relay.const(1.0, dtype="float32")
-
-        # construct new graph with proper post op ordering
-        gr = tvm.relay.Call(
-            root.op,
-            [src, wgh, zero_zp, zero_zp, one_scl, one_scl],
-            root.attrs,
-            root.type_args,
-            root.span,
-        )
-        gr = relay.op.cast(gr, dtype="float32")
-        gr = gr + bias
-        gr = gr * o_scl
-        gr = relay.op.clip(gr, 0, 255) * act_scl
-        gr = gr + sum_scl * cast_fp(sum_src) if sum_src else gr
-        gr = gr + dst_zp
-        gr = relay.op.cast(gr, dtype=final_dtype)
-        return gr
-
-    @staticmethod
-    def fake_op(zp, wgh, layout):
-        """Fake operator implementation for zp broadcast input"""
-        # Conv:  reduce kernel {OC, IC, KH, KW} -> {OC} in case of group that is still correct
-        # Dense: reduce kernel {OC, IC} -> {OC}
-        wgh_int = relay.op.cast(wgh, dtype="int32")
-        reduced_kernel = relay.op.sum(
-            wgh_int, axis=[layout.index("O")], keepdims=False, exclude=True
-        )
-        return zp * reduced_kernel
-
-    @staticmethod
-    def squeeze_bias(bias, layout):
-        shape = transform.InferTypeLocal(bias).concrete_shape
-        c_position = layout.index("C") - len(layout) + len(shape)
-        squeeze_idxs = [i for i in range(len(shape)) if i != c_position]
-        return relay.op.squeeze(bias, squeeze_idxs)
-
-    @staticmethod
-    def broadcast_to_rank(op, rank):
-        """Scalar or 1D tensor are supported"""
-        shape = transform.InferTypeLocal(op).concrete_shape
-        if len(shape) == 0:
-            return op
-        if len(shape) == 1:
-            return relay.op.expand_dims(op, 1, rank - 1)
-        raise ValueError("Unexpected bias rank to broadcast. Only 0 and 1 are supported.")
-
-
-def legalize_qnn_for_dnnl(mod):
-    """Transform qnn primitives to DNNL compatible form. Eliminate source zero point and apply
-    strict sequence of post ops."""
-    mod["main"] = rewrite(LegalizeQnnOpForDnnl(), mod["main"])
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            # transform.SimplifyInference(),  # TODO: this pass decompose nn.layer_norm
-            # transform.FoldScaleAxis(),  # TODO: fail inside TVM in case of grouped convolutions.
-            transform.FoldConstant(),
-        ]
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    return mod
diff --git a/python/tvm/relay/op/contrib/libtorch.py b/python/tvm/relay/op/contrib/libtorch.py
deleted file mode 100644
index 2827c2abd88b..000000000000
--- a/python/tvm/relay/op/contrib/libtorch.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, no-else-return, E1102
-"""Torch codegen operators"""
-
-from tvm import relay
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-
-
-def torchop(script_fn, *params):
-    """Insert an Operation executed in the PyTorch JIT
-
-    The operation includes backend annotation
-
-    Currently, only tensors are supported. The shape inferrence
-    assumes that input shapes (and not values) determine output shapes."""
-    return compiler_end(
-        relay.op._make.torchop(
-            [compiler_begin(p, "torch") for p in params], script_fn.save_to_buffer()
-        ),
-        "torch",
-    )
diff --git a/python/tvm/relay/op/contrib/mrvl.py b/python/tvm/relay/op/contrib/mrvl.py
deleted file mode 100644
index 6100fcb991c0..000000000000
--- a/python/tvm/relay/op/contrib/mrvl.py
+++ /dev/null
@@ -1,977 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, broad-except
-"""Marvell Library supported operators."""
-
-import tvm
-from tvm import relay
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.expr_functor import ExprMutator, ExprVisitor
-from tvm.relay.expr import Call, TupleGetItem
-from tvm.contrib import mrvl as mrvl_contrib
-
-from ...dataflow_pattern import (
-    wildcard,
-    is_op,
-    is_constant,
-    is_tuple,
-    is_tuple_get_item,
-    is_var,
-)
-from .register import register_pattern_table
-from ..strategy.generic import is_depthwise_conv2d
-
-
-def partition_for_mrvl(
-    mod,
-    params=None,
-    **kwargs,
-):
-    """Partition the graph greedily into Marvell graph region(s) and a LLVM region(s). The LLVM
-    region will contain ops not supported by the Marvell backend.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-    params : Optional[Dict[str, NDArray]]
-        Constant input parameters.
-
-    Returns
-    -------
-    mod_mrvl_llvm_regions : annotated & partitioned module (of Mrvl region(s) & LLVM region(s))
-    """
-
-    # setup & register convert layout options
-    convert_layout_dict = {
-        "nn.conv2d": ["NHWC", "OHWI"],
-        "nn.max_pool2d": ["NHWC"],
-        "nn.avg_pool2d": ["NHWC"],
-        "nn.global_avg_pool2d": ["NHWC"],
-    }
-
-    mrvl_register_conv2d_attr_funcs_for_convert_layout()
-    mrvl_register_max_pool2d_attr_funcs_for_convert_layout()
-    mrvl_register_avg_pool2d_attr_funcs_for_convert_layout()
-    mrvl_register_global_avg_pool2d_attr_funcs_for_convert_layout()
-
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-    opt_level = 3
-    disabled_pass_list = ["AlterOpLayout"]
-    annotate_target_str = "mrvl"
-    annotate_target_include_non_call_ops = True
-
-    seq_tvmc_pre_repartition = tvm.transform.Sequential(
-        passes=[
-            relay.transform.InferType(),
-            MrvlRemoveDropoutPass(),
-            MrvlRemoveCopyPass(),
-            relay.transform.RemoveUnusedFunctions(),
-            relay.transform.FoldConstant(),
-            relay.transform.SimplifyExpr(),
-            relay.transform.InferType(),
-            relay.transform.ConvertLayout(convert_layout_dict),
-            relay.transform.FoldConstant(),
-            relay.transform.SimplifyExpr(),
-            relay.transform.InferType(),
-            relay.transform.MergeComposite(mrvl_pattern_table()),
-            relay.transform.AnnotateTarget(
-                annotate_target_str,
-                annotate_target_include_non_call_ops,
-            ),
-            relay.transform.MergeCompilerRegions(),
-            relay.transform.PartitionGraph(""),
-            relay.transform.InferType(),
-        ]
-    )
-
-    # convert layout back to NCHW for ops in main
-    desired_layouts_in_main = {
-        "nn.conv2d": ["NCHW", "OIHW"],
-        "nn.max_pool2d": ["NCHW"],
-        "nn.avg_pool2d": ["NCHW"],
-        "nn.global_avg_pool2d": ["NCHW"],
-    }
-
-    seq_tvmc_post_repartition = tvm.transform.Sequential(
-        passes=[
-            # Convert Layout of conv ops in main to NCHW (as expected by LLVM).
-            # This pass does not change layout of ops already partitioned into
-            # Marvell regions.
-            relay.transform.ConvertLayout(desired_layouts_in_main),
-            relay.transform.FoldConstant(),
-            relay.transform.SimplifyExpr(),
-            relay.transform.InferType(),
-        ]
-    )
-
-    with tvm.transform.PassContext(opt_level=opt_level, disabled_pass=disabled_pass_list):
-        tmp_mod1 = seq_tvmc_pre_repartition(mod)
-        tmp_mod1 = repartition_mrvl_subgraphs(tmp_mod1)
-        tmp_mod1 = seq_tvmc_post_repartition(tmp_mod1)
-        mod_mrvl_llvm_regions = add_attributes(tmp_mod1, annotate_target_str, **kwargs)
-
-    return mod_mrvl_llvm_regions
-
-
-def is_activation(pattern):
-    """
-    Check if pattern in Marvell supported activations list
-    """
-    mrvl_activations = [
-        "nn.relu",
-    ]
-    activation_pattern = None
-    for ptrn in mrvl_activations:
-        activ = is_op(ptrn)
-        if activation_pattern is None:
-            activation_pattern = activ
-        else:
-            activation_pattern |= activ
-    pattern = pattern.optional(activation_pattern)
-    return pattern
-
-
-class IsComputeIntensiveGraph(ExprVisitor):
-    """
-    Visits the graph recursively and checks if it contains compute heavy ops like
-    convolutions and dense.
-    """
-
-    def __init__(self):
-        ExprVisitor.__init__(self)
-        self.is_compute_intensive = False
-
-    def visit_call(self, call):
-        compute_intensive_ops = {
-            "nn.conv2d",
-            "nn.dense",
-        }
-        if isinstance(call.op, tvm.tir.op.Op):
-            if str(call.op.name) in compute_intensive_ops:
-                self.is_compute_intensive = True
-
-        return super().visit_call(call)
-
-    def is_graph_compute_intensive(self, subgraph):
-        """
-        This function recursively visits the graph and checks if it's compute intensive"
-        """
-        self.visit(subgraph)
-        return self.is_compute_intensive
-
-
-class IsSupportedGraph(ExprVisitor):
-    """
-    Visits the graph recursively and checks if function inputs feed into
-    any unsupported ops.
-    """
-
-    def __init__(self, function):
-        ExprVisitor.__init__(self)
-        self.is_supported = True
-        self.function = function
-        self.input_op_list = []
-
-    def _check_legal(self, node, parent_call):
-        unsupported_ops = {
-            "mrvl.sum2d",
-            "mrvl.concat",
-        }
-
-        input_ops = {
-            "mrvl.reshape",
-        }
-
-        if isinstance(node, relay.Function):
-            if node.attrs["Composite"] in unsupported_ops:
-                self.is_supported = False
-            if node.attrs["Composite"] in input_ops:
-                self.input_op_list.append(parent_call)
-
-    def visit_call(self, call):
-        for args in call.args:
-            if args in self.function.params or args in self.input_op_list:
-                relay.analysis.post_order_visit(
-                    call, lambda expr, parent_call=call: self._check_legal(expr, parent_call)
-                )
-
-        return super().visit_call(call)
-
-    def is_supported_subgraph(self):
-        """
-        This function recursively visits the graph and checks if graph is legal"
-        """
-        self.visit(self.function.body)
-        return self.is_supported
-
-
-def first_op_unsupported(function):
-    return not IsSupportedGraph(function).is_supported_subgraph()
-
-
-def repartition_subgraph(function):
-    """
-    Revert back to LLVM if the subgraph is not compute intensive or marked as
-    force_llvm.
-    """
-    if not IsComputeIntensiveGraph().is_graph_compute_intensive(function.body):
-        return True
-
-    if first_op_unsupported(function):
-        return True
-
-    return False
-
-
-def repartition_mrvl_subgraphs(mod):
-    """
-    Un-partition those partitions which:
-     - are not computationally intensive subgraph
-     - cannot be supported by the backend currently
-    """
-    global_vars_to_inline = [
-        gv
-        for gv in mod.get_global_vars()
-        if mod[gv].attrs and mod[gv].attrs["Compiler"] == "mrvl" and repartition_subgraph(mod[gv])
-    ]
-    return relay.transform.InlineCompilerFunctionsBoundTo(global_vars_to_inline)(mod)
-
-
-def add_attributes(mod, annotate_target_str, **kwargs):
-    """This method iterates across all Marvell partitioned functions in the
-    module and attaches attributes which are supplied by the user from the CLI.
-    Use good defaults in case a particular option is not specified. These options
-    are later accessed by codegen and are embedded into the runtime.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to attach attributes to
-    kwargs : Dict[str, str]
-        Dictionary with command line options
-
-    Returns
-    -------
-    mod : module with attributes
-    """
-    working_dir = mrvl_contrib.get_working_dir()
-    sim_attr_found = False
-    hw_attr_found = False
-
-    if "mattr" in kwargs:
-        base_opts_str = kwargs.get("mattr")
-
-        # Set defaults to options if explicit command line option is not given
-        if "arch" not in base_opts_str:
-            base_opts_str = f"{base_opts_str} -arch=mlip"
-
-        if "quantize" not in base_opts_str:
-            base_opts_str = f"{base_opts_str} -quantize=fp16"
-
-        if "wb_pin_ocm" not in base_opts_str:
-            base_opts_str = f"{base_opts_str} -wb_pin_ocm=0"
-
-        if "sim" in base_opts_str:
-            sim_attr_found = True
-            base_opts_str = base_opts_str.replace("sim", "")
-
-        if "hw" in base_opts_str:
-            hw_attr_found = True
-            base_opts_str = base_opts_str.replace("hw", "")
-
-    else:
-        base_opts_str = "-arch=mlip -quantize=fp16 -wb_pin_ocm=0"
-
-    if "num_tiles" in kwargs:
-        base_opts_str = f"{base_opts_str} -num_tiles={kwargs.get('num_tiles')}"
-    elif "num_tiles" not in base_opts_str:
-        base_opts_str = f"{base_opts_str} -num_tiles=8"
-
-    mode_string = "sim"
-    if sim_attr_found:
-        mode_string = "sim"
-    elif hw_attr_found:
-        mode_string = "hw"
-
-    for var in mod.get_global_vars():
-        func_name = var.name_hint
-        func = mod[func_name]
-
-        if annotate_target_str in func_name:
-            func = func.with_attr("working_dir", working_dir)
-            func = func.with_attr("compiler_opts_string", base_opts_str)
-            func = func.with_attr("mode", mode_string)
-            mod.update_func(var, func)
-
-    return mod
-
-
-def is_valid_batch_size(batch_size):
-    if isinstance(batch_size, type(relay.Any())):
-        return False
-    elif batch_size > 8:
-        return False
-    else:
-        return True
-
-
-def mrvl_register_conv2d_attr_funcs_for_convert_layout():
-    """register the conv2d attr func(s) to convert op layout"""
-    # reset first in order to register & use a new nn.conv2d convert layout function
-    relay.op.get("nn.conv2d").reset_attr("FTVMConvertOpLayout")
-
-    @tvm.ir.register_op_attr("nn.conv2d", "FTVMConvertOpLayout")
-    def convert_conv2d(attrs, inputs, tinfos, desired_layouts):
-        if not is_valid_batch_size(tinfos[0].shape[0]):
-            return relay.nn.conv2d(*inputs, **attrs)
-        new_attrs = dict(attrs)
-        weight_info_const = tinfos[1]
-        new_attrs["channels"] = weight_info_const.shape[0]
-        desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-        new_attrs["data_layout"] = desired_data_layout
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        new_attrs["out_layout"] = desired_data_layout
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    return convert_conv2d
-
-
-def mrvl_register_max_pool2d_attr_funcs_for_convert_layout():
-    """register the max_pool2d attr func(s) to convert op layout"""
-    # reset first in order to register & use a new nn.max_pool2d convert layout function
-    relay.op.get("nn.max_pool2d").reset_attr("FTVMConvertOpLayout")
-
-    @tvm.ir.register_op_attr("nn.max_pool2d", "FTVMConvertOpLayout")
-    def convert_max_pool2d(attrs, inputs, tinfos, desired_layouts):
-        if not is_valid_batch_size(tinfos[0].shape[0]):
-            return relay.nn.max_pool2d(*inputs, **attrs)
-        new_attrs = dict(attrs)
-        new_attrs["layout"] = str(desired_layouts[0])
-        new_attrs["out_layout"] = str(desired_layouts[0])
-        return relay.nn.max_pool2d(*inputs, **new_attrs)
-
-    return convert_max_pool2d
-
-
-def mrvl_register_avg_pool2d_attr_funcs_for_convert_layout():
-    """register the avg_pool2d attr func(s) to convert op layout"""
-    # reset first in order to register& use a new nn.avg_pool2d convert layout function
-    relay.op.get("nn.avg_pool2d").reset_attr("FTVMConvertOpLayout")
-
-    @tvm.ir.register_op_attr("nn.avg_pool2d", "FTVMConvertOpLayout")
-    def convert_avg_pool2d(attrs, inputs, tinfos, desired_layouts):
-        if (tinfos[0].shape[0] != 1) and not isinstance(tinfos[0].shape[0], type(relay.Any())):
-            return relay.nn.avg_pool2d(*inputs, **attrs)
-        new_attrs = dict(attrs)
-        new_attrs["layout"] = str(desired_layouts[0])
-        new_attrs["out_layout"] = str(desired_layouts[0])
-        return relay.nn.avg_pool2d(*inputs, **new_attrs)
-
-    return convert_avg_pool2d
-
-
-def mrvl_register_global_avg_pool2d_attr_funcs_for_convert_layout():
-    """register the global_avg_pool2d attr func(s) to convert op layout"""
-    # reset first in order to register& use a new nn.global_avg_pool2d convert layout function
-    relay.op.get("nn.global_avg_pool2d").reset_attr("FTVMConvertOpLayout")
-
-    @tvm.ir.register_op_attr("nn.global_avg_pool2d", "FTVMConvertOpLayout")
-    def convert_global_avg_pool2d(attrs, inputs, tinfos, desired_layouts):
-        if (tinfos[0].shape[0] != 1) and not isinstance(tinfos[0].shape[0], type(relay.Any())):
-            return relay.nn.global_avg_pool2d(*inputs, **attrs)
-        new_attrs = dict(attrs)
-        new_attrs["layout"] = str(desired_layouts[0])
-        new_attrs["out_layout"] = str(desired_layouts[0])
-        return relay.nn.global_avg_pool2d(*inputs, **new_attrs)
-
-    return convert_global_avg_pool2d
-
-
-@register_pattern_table("mrvl")
-def mrvl_pattern_table():
-    """Get the Mrvl pattern table."""
-
-    def conv2d_nhwc2nhwc_pattern():
-        """Create a convolution-2d pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the convolution-2d pattern.
-        """
-
-        def conv2d_base_pattern(pattern):
-            pattern = is_op("nn.conv2d")(pattern, is_constant())
-            pattern = pattern.optional(
-                lambda x: (is_op("nn.bias_add")(x, is_constant()) | is_op("add")(x, is_constant()))
-            )
-
-            def conv2d_no_batchnorm(pattern):
-                # conv + [add] + [relu]
-                pattern1 = is_activation(pattern)
-                return pattern1
-
-            def conv2d_batchnorm(pattern):
-                pattern2 = is_op("nn.batch_norm")(
-                    pattern, is_constant(), is_constant(), is_constant(), is_constant()
-                )
-                pattern2 = is_tuple_get_item(pattern2, 0)
-                pattern2 = is_activation(pattern2)
-                return pattern2
-
-            pattern1 = conv2d_no_batchnorm(pattern)
-            pattern2 = conv2d_batchnorm(pattern)
-
-            return pattern1 | pattern2
-
-        pad = is_op("nn.pad")(wildcard(), wildcard())
-        pad = conv2d_base_pattern(pad)
-        no_pad = wildcard()
-        no_pad = conv2d_base_pattern(no_pad)
-
-        return pad | no_pad
-
-    def sum_pattern():
-        """Create a sum pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the sum pattern.
-        """
-        pattern = is_op("add")(wildcard(), wildcard())
-        pattern = is_activation(pattern)
-        return pattern
-
-    def concat_pattern():
-        """Create a concat pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the concat pattern.
-        """
-        pattern = is_op("concatenate")(is_tuple(None))
-        return pattern
-
-    def fc_pattern():
-        """Create a fc (fully-connected) pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the fc pattern.
-        """
-
-        def fc_base_pattern(pattern):
-            pattern = is_op("nn.dense")(pattern, is_constant())
-            pattern = pattern.optional(
-                lambda x: (is_op("nn.bias_add")(x, is_constant()) | is_op("add")(x, is_constant()))
-            )
-            pattern = is_activation(pattern)
-
-            return pattern
-
-        transform1 = is_op("layout_transform")(wildcard()).has_attr(
-            {"src_layout": "NHWC", "dst_layout": "NCHW"}
-        )
-        reshape = is_op("reshape")(transform1)
-        flatten = is_op("nn.batch_flatten")(transform1)
-        flatten = reshape | flatten
-        flatten = fc_base_pattern(flatten)
-
-        no_flatten = wildcard()
-        no_flatten = fc_base_pattern(no_flatten)
-
-        return flatten | no_flatten
-
-    def maxpool2d_pattern():
-        """Create a maxpool2d pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the maxpool2d pattern.
-        """
-
-        def maxpool2d_base_pattern(pattern):
-            pattern = is_op("nn.max_pool2d")(pattern)
-            return pattern
-
-        pad = is_op("nn.pad")(wildcard(), wildcard())
-        pad = maxpool2d_base_pattern(pad)
-
-        no_pad = wildcard()
-        no_pad = maxpool2d_base_pattern(no_pad)
-
-        return pad | no_pad
-
-    def avgpool2d_pattern():
-        """Create a avgpool2d pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the avgpool2d pattern.
-        """
-
-        def avgpool2d_base_pattern(pattern):
-            pattern = is_op("nn.avg_pool2d")(pattern)
-
-            return pattern
-
-        pad = is_op("nn.pad")(wildcard(), wildcard())
-        pad = avgpool2d_base_pattern(pad)
-
-        no_pad = wildcard()
-        no_pad = avgpool2d_base_pattern(no_pad)
-
-        return pad | no_pad
-
-    def globalavgpool2d_pattern():
-        """Create a globalavgpool2d pattern.
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the globalavgpool2d pattern.
-        """
-        pattern = is_op("nn.global_avg_pool2d")(wildcard())
-        return pattern
-
-    def globalmaxpool2d_pattern():
-        """Create a globalmaxpool2d pattern.
-           review tvm/tests/python/relay/test_dataflow_pattern.py for examples
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the globalmaxpool2d pattern.
-        """
-        pattern = is_op("nn.global_max_pool2d")(wildcard())
-        return pattern
-
-    def reshape_pattern():
-        pattern = is_op("reshape")(wildcard())
-        return pattern
-
-    def batch_flatten_pattern():
-        pattern = is_op("nn.batch_flatten")(wildcard())
-        return pattern
-
-    def squeeze_pattern():
-        pattern = is_op("squeeze")(wildcard())
-        return pattern
-
-    def layout_transform_nchw2nhwc_pattern():
-        pattern = is_op("layout_transform")(is_var(), wildcard(), wildcard()).has_attr(
-            {"src_layout": "NCHW", "dst_layout": "NHWC"}
-        )
-        return pattern
-
-    def check_conv2d(extract):
-        """Check conv pattern is supported by Mrvl."""
-        call = extract
-        while isinstance(call, TupleGetItem) or (call.op.name != "nn.conv2d"):
-            if isinstance(call, TupleGetItem):
-                call = call.tuple_value
-            else:
-                call = call.args[0]
-        return conv2d_nhwc2nhwc(call)
-
-    def check_fc(extract):
-        """Check fc pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "nn.dense":
-            call = call.args[0]
-        return fc_ni2no(call)
-
-    def check_maxpool2d(extract):
-        """Check maxpool2d pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "nn.max_pool2d":
-            call = call.args[0]
-        return maxpool2d_nhwc2nhwc(call)
-
-    def check_avgpool2d(extract):
-        """Check avgpool2d pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "nn.avg_pool2d":
-            call = call.args[0]
-        return avgpool2d_nhwc2nhwc(call)
-
-    def check_globalavgpool2d(extract):
-        """Check globalavgpool2d pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "nn.global_avg_pool2d":
-            call = call.args[0]
-        return globalavgpool2d_nhwc2nhwc(call)
-
-    def check_globalmaxpool2d(extract):
-        """Check globalmaxpool2d pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "nn.global_max_pool2d":
-            call = call.args[0]
-        return globalmaxpool2d_nhwc2nhwc(call)
-
-    def check_reshape(extract):
-        call = extract
-        while call.op.name != "reshape":
-            call = call.args[0]
-        return reshape_mrvl(call)
-
-    def check_batch_flatten(extract):
-        call = extract
-        while call.op.name != "nn.batch_flatten":
-            call = call.args[0]
-        return batch_flatten_mrvl(call)
-
-    def check_squeeze(extract):
-        call = extract
-        while call.op.name != "squeeze":
-            call = call.args[0]
-        return squeeze_mrvl(call)
-
-    def check_layout_transform_nchw2nhwc(extract):
-        call = extract
-        while call.op.name != "layout_transform":
-            call = call.args[0]
-        return layout_transform_nchw2nhwc(call)
-
-    def check_sum(extract):
-        """Check sum2d pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "add":
-            call = call.args[0]
-        return summation(call)
-
-    def check_concat(extract):
-        """Check concat pattern is supported by Mrvl."""
-        call = extract
-        while call.op.name != "concatenate":
-            call = call.args[0]
-        return concat(call)
-
-    return [
-        ("mrvl.conv2d_nhwc2nhwc", conv2d_nhwc2nhwc_pattern(), check_conv2d),
-        ("mrvl.fc_ni2no", fc_pattern(), check_fc),
-        ("mrvl.maxpool2d_nhwc2nhwc", maxpool2d_pattern(), check_maxpool2d),
-        ("mrvl.avgpool2d_nhwc2nhwc", avgpool2d_pattern(), check_avgpool2d),
-        ("mrvl.globalavgpool2d_nhwc2nhwc", globalavgpool2d_pattern(), check_globalavgpool2d),
-        ("mrvl.globalmaxpool2d_nhwc2nhwc", globalmaxpool2d_pattern(), check_globalmaxpool2d),
-        ("mrvl.sum", sum_pattern(), check_sum),
-        ("mrvl.concat", concat_pattern(), check_concat),
-        (
-            "mrvl.layout_transform_nchw2nhwc",
-            layout_transform_nchw2nhwc_pattern(),
-            check_layout_transform_nchw2nhwc,
-        ),
-        ("mrvl.reshape", reshape_pattern(), check_reshape),
-        ("mrvl.batch_flatten", batch_flatten_pattern(), check_batch_flatten),
-        ("mrvl.squeeze", squeeze_pattern(), check_squeeze),
-    ]
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("nn.conv2d", "target.mrvl")
-def conv2d_nhwc2nhwc(expr):
-    """Check if the external Mrvl codegen for conv2d_nhwc2nhwc should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.data_layout != "NHWC":
-        return False
-    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
-        return False
-    data_type = args[0].checked_type
-    if (
-        (len(data_type.shape) != 4)
-        or not is_valid_batch_size(data_type.shape[0])
-        or (data_type.dtype not in ["float32"])
-    ):
-        return False
-    kernel_typ = args[1].checked_type
-    if (len(kernel_typ.shape) != 4) or (kernel_typ.dtype not in ["float32"]):
-        return False
-
-    is_depthwise = is_depthwise_conv2d(
-        data_type.shape,
-        attrs["data_layout"],
-        kernel_typ.shape,
-        attrs["kernel_layout"],
-        attrs["groups"],
-    )
-    if is_depthwise:
-        # Mrvl support grouped conv only for groups == ch
-        return bool(attrs.groups == kernel_typ.shape[0])
-    if attrs.groups != 1 and not is_depthwise:
-        return False
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("add", "target.mrvl")
-def summation(expr):
-    """Check if the external Mrvl codegen for sum should be used."""
-    arg0 = expr.args[0]
-
-    # - need to further checking if the call_func of arg0 is not nn.conv2d nor nn.dense
-    if (
-        isinstance(arg0, Call)
-        and isinstance(arg0.op, tvm.ir.Op)
-        and arg0.op.name in ["nn.conv2d", "nn.dense"]
-    ):
-        return False
-
-    # - need to further checking if dimension of input or output tensor is 4
-    data_type = arg0.checked_type
-    if (
-        (len(data_type.shape) != 4 and len(data_type.shape) != 3)
-        or not is_valid_batch_size(data_type.shape[0])
-        or (data_type.dtype not in ["float32"])
-    ):
-        return False
-
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("concatenate", "target.mrvl")
-def concat(expr):
-    """Check if the external Mrvl codegen for concat should be used."""
-    attrs, args = expr.attrs, expr.args
-    arg0 = args[0]
-    assert not isinstance(arg0, Call)
-
-    # check data types for both inputs
-    # - only support 4-dimension input tensors in NHWC
-    # - only support batch size is 1
-    data_type_a = arg0.checked_type.fields[0]
-    data_type_b = arg0.checked_type.fields[1]
-    if (
-        (len(data_type_a.shape) != 4)
-        or (len(data_type_b.shape) != 4)
-        or (data_type_a.shape[0] != 1)
-        or (data_type_b.shape[0] != 1)
-        or (data_type_a.dtype not in ["float32"])
-        or (data_type_b.dtype not in ["float32"])
-    ):
-        return False
-
-    for data_type in arg0.checked_type.fields:
-        if (
-            (len(data_type.shape) != 4)
-            or (data_type.shape[0] != 1)
-            or (data_type.dtype not in ["float32"])
-        ):
-            return False
-
-    if attrs["axis"] != 3:
-        return False
-
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("nn.dense", "target.mrvl")
-def fc_ni2no(expr):
-    """Check if the external Mrvl codegen for fc_ni2no should be used."""
-    attrs, args = expr.attrs, expr.args
-    data_type = args[0].checked_type
-    if data_type.dtype not in ["float32"]:
-        return False
-    kernel_typ = args[1].checked_type
-    if (len(kernel_typ.shape) != 2) or (kernel_typ.dtype not in ["float32"]):
-        return False
-    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
-        return False
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("nn.max_pool2d", "target.mrvl")
-def maxpool2d_nhwc2nhwc(expr):
-    """Check if the external Mrvl codegen for maxpool2d_nhwc2nhwc should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.layout != "NHWC":
-        return False
-    data_type = args[0].checked_type
-    if (
-        (len(data_type.shape) != 4)
-        or not is_valid_batch_size(data_type.shape[0])
-        or (data_type.dtype not in ["float32"])
-    ):
-        return False
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("nn.avg_pool2d", "target.mrvl")
-def avgpool2d_nhwc2nhwc(expr):
-    """Check if the external Mrvl codegen for avgpool2d_nhwc2nhwc should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.layout != "NHWC":
-        return False
-    data_type = args[0].checked_type
-    if (
-        (len(data_type.shape) != 4)
-        or ((data_type.shape[0] != 1) and not isinstance(data_type.shape[0], type(relay.Any())))
-        or (data_type.dtype not in ["float32"])
-    ):
-        return False
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.mrvl")
-def globalavgpool2d_nhwc2nhwc(expr):
-    """Check if the external Mrvl codegen for globalavgpool2d_nhwc2nhwc should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.layout != "NHWC":
-        return False
-    data_type = args[0].checked_type
-    if not (len(data_type.shape) == 4 or len(data_type.shape) == 2):
-        return False
-    if (
-        (len(data_type.shape) != 4)
-        or ((data_type.shape[0] != 1) and not isinstance(data_type.shape[0], type(relay.Any())))
-        or (data_type.dtype not in ["float32"])
-    ):
-        return False
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.mrvl")
-def globalmaxpool2d_nhwc2nhwc(expr):
-    """Check if the external Mrvl codegen for globalmaxpool2d_nhwc2nhwc should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.layout != "NHWC":
-        return False
-    data_type = args[0].checked_type
-    if not (len(data_type.shape) == 4 or len(data_type.shape) == 2):
-        return False
-    if (len(data_type.shape) != 4) or (data_type.dtype not in ["float32"]):
-        return False
-    return True
-
-
-@tvm.ir.register_op_attr("reshape", "target.mrvl")
-def reshape_mrvl(expr):
-    """Check if the external Mrvl codegen for reshape should be used."""
-    if expr.op.name != "reshape":
-        return False
-    data_type = expr.checked_type
-    if not (len(data_type.shape) == 4 or len(data_type.shape) == 2):
-        return False
-
-    args = expr.args
-    data_type = args[0].checked_type
-    return True
-
-
-@tvm.ir.register_op_attr("nn.batch_flatten", "target.mrvl")
-def batch_flatten_mrvl(expr):
-    """Check if the external Mrvl codegen for batch_flatten should be used."""
-    if expr.op.name != "nn.batch_flatten":
-        return False
-    else:
-        data_type = expr.checked_type
-        if len(data_type.shape) != 2:
-            return False
-
-        args = expr.args
-        data_type = args[0].checked_type
-
-        if not (len(data_type.shape) == 4 or len(data_type.shape) == 2):
-            return False
-
-        return True
-
-
-@tvm.ir.register_op_attr("squeeze", "target.mrvl")
-def squeeze_mrvl(expr):
-    """Check if the external Mrvl codegen for squeeze should be used."""
-    if expr.op.name != "squeeze":
-        return False
-    return True
-
-
-# register a helper function to indicate that the given operator can be supported by Mrvl.
-@tvm.ir.register_op_attr("layout_transform", "target.mrvl")
-def layout_transform_nchw2nhwc(expr):
-    """Check if the external Mrvl codegen for Layout Transform should be used."""
-    attrs, args = expr.attrs, expr.args
-    if attrs.src_layout != "NCHW":
-        return False
-    if attrs.dst_layout != "NHWC":
-        return False
-    data_type = args[0].checked_type
-    if data_type.dtype not in ["float32"]:
-        return False
-    return True
-
-
-class RemoveDropout(ExprMutator):
-    """Removes all nn.dropout from an expr."""
-
-    def visit_tuple_getitem(self, op):
-        visit = super().visit_tuple_getitem(op)
-        if visit.index != 0:
-            return visit
-        if (
-            isinstance(visit.tuple_value, Call)
-            and visit.tuple_value.op.name == "nn.dropout"
-            and visit.index == 0
-        ):
-            # skip nn.dropout call and return arg0 instead
-            return visit.tuple_value.args[0]
-        return visit
-
-
-@relay.transform.function_pass(opt_level=0)
-class MrvlRemoveDropoutPass:
-    """Removes Dropouts."""
-
-    def transform_function(self, func, mod, _):
-        """call RemoveDropout func."""
-        return RemoveDropout().visit(func)
-
-
-class RemoveCopy(ExprMutator):
-    """
-    Delete Copy expression
-    """
-
-    def visit_call(self, call):
-        visit = super().visit_call(call)
-        if visit.op.name in ["copy"]:
-            return visit.args[0]
-        return visit
-
-
-@relay.transform.function_pass(opt_level=0)
-class MrvlRemoveCopyPass:
-    """Removes Copy."""
-
-    def transform_function(self, func, mod, _):
-        """call RemoveCopy func."""
-        return RemoveCopy().visit(func)
diff --git a/python/tvm/relay/op/contrib/register.py b/python/tvm/relay/op/contrib/register.py
deleted file mode 100644
index 278a311c09b0..000000000000
--- a/python/tvm/relay/op/contrib/register.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Register utilities for external codegen."""
-_PATTERN_TABLES = {}
-
-
-def register_pattern_table(compiler, table=None):
-    """Register a pattern table for an external compiler.
-
-    Pattern tables are used to create composite functions.
-    See the MergeComposite pass.
-
-    Parameters
-    ----------
-    compiler : str
-        The name of compiler
-
-    table : function, optional
-        A function that returns the pattern table
-
-    Returns
-    -------
-    fregister : function
-        Register function if value is not specified.
-    """
-
-    def _register(t):
-        """internal register function"""
-        _PATTERN_TABLES[compiler] = t()
-        return t
-
-    return _register(table) if table is not None else _register
-
-
-def get_pattern_table(compiler):
-    """Get the pattern table associated with a compiler (if it's registered)."""
-    return _PATTERN_TABLES[compiler] if compiler in _PATTERN_TABLES else None
diff --git a/python/tvm/relay/op/contrib/te_target.py b/python/tvm/relay/op/contrib/te_target.py
deleted file mode 100644
index ab1a1d0cda28..000000000000
--- a/python/tvm/relay/op/contrib/te_target.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Support a Relay partitioning target using Tensor Expressions."""
-from typing import Callable, List, Dict
-
-import tvm
-import tvm.ir
-from tvm import relay
-from tvm import te
-
-
-_LowerFunc = Callable[[relay.Call, List[te.Tensor]], te.Tensor]
-_LOWER_MAP: Dict[str, _LowerFunc] = {}
-
-
-def lower_composite(comp_name: str) -> Callable[[_LowerFunc], _LowerFunc]:
-    """Register a lowering function for a given composite function name."""
-
-    def _register(f: _LowerFunc) -> _LowerFunc:
-        _LOWER_MAP[comp_name] = f
-        return f
-
-    return _register
-
-
-def relay_to_runtime(target: tvm.target.Target) -> Callable[[relay.Function], tvm.runtime.Module]:
-    """Create a Relay to runtime module lowering function using Tensor Expressions for lowering."""
-
-    def _relay_to_runtime(partition: relay.Function) -> tvm.runtime.Module:
-        """Compile Relay functions to a runtime module using Tensor Expressions."""
-        assert isinstance(partition, relay.Function)
-        assert isinstance(partition.body, relay.Call)
-        assert isinstance(partition.body.op, relay.Function)
-
-        global_name = str(partition.attrs.global_symbol)
-        comp_func = partition.body.op
-        comp_name = comp_func.attrs["Composite"]
-        assert comp_name in _LOWER_MAP
-        assert isinstance(comp_func.body, relay.Call)
-
-        op = comp_func.body
-        inputs = []
-        for i, param in enumerate(comp_func.params):
-            inputs.append(
-                te.placeholder(
-                    param.checked_type.shape,
-                    name=f"input_{i}",
-                    dtype=param.checked_type.dtype,
-                )
-            )
-
-        output = _LOWER_MAP[comp_name](op, inputs)
-        prim_func = te.create_prim_func(inputs + [output])
-        return tvm.build(prim_func, target=target, name=global_name)
-
-    return _relay_to_runtime
diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py
deleted file mode 100644
index 7679d0d68b2f..000000000000
--- a/python/tvm/relay/op/contrib/tensorrt.py
+++ /dev/null
@@ -1,1150 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, logging-format-interpolation
-"""TensorRT supported operators."""
-import logging
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import numpy as np  # type: ignore
-
-import tvm
-from tvm import relay
-from tvm.ir import Op
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.dataflow_pattern import (
-    is_constant,
-    is_op,
-    is_tuple,
-    is_tuple_get_item,
-    wildcard,
-)
-from tvm.relay.expr import Call, Constant, TupleGetItem
-from tvm.relay.expr_functor import ExprMutator, ExprVisitor
-from tvm.relay.op.contrib.register import register_pattern_table
-
-logger = logging.getLogger("TensorRT")
-
-
-def is_tensorrt_compiler_enabled() -> bool:
-    return tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) is not None
-
-
-def is_tensorrt_runtime_enabled() -> bool:
-    """Check if the TensorRT graph executor is present.
-    Returns
-    -------
-    ret: bool
-        True if present, False if not.
-    """
-    check_enabled = tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True)
-    if check_enabled:
-        return check_enabled()
-    return False
-
-
-def get_tensorrt_target() -> tvm.target.Target:
-    """Returns the current Target, which must be of kind "tensorrt"."""
-    target = tvm.target.Target.current()
-    if target is None or target.kind.name != "tensorrt":
-        # Create the default target.
-        return tvm.target.Target("tensorrt")
-    return target
-
-
-def get_tensorrt_version() -> Tuple[int, int, int]:
-    """Returns the version of TensorRT to assume during compilation.
-    In order of preference this is taken from:
-     - The current "tensorrt" target's "tensorrt_version" attribute string.
-     - The version linked to the TVM runtime.
-     - (6, 0, 1)
-
-    Returns
-    -------
-    ret: Tuple[int, int, int]
-        TensorRT version as a tuple of (major, minor, patch).
-    """
-    # cf logic in tensorrt/codegen.cc::SaveGlobalAttributes
-    # First check for version in target.
-    target = get_tensorrt_target()
-    version = target.attrs["tensorrt_version"]
-    if len(version) == 3:
-        return int(version[0]), int(version[1]), int(version[2])
-    assert len(version) == 0
-
-    # Next, ask runtime for its version.
-    if is_tensorrt_runtime_enabled():
-        get_version = tvm.get_global_func("relay.ext.tensorrt.get_version")
-        version = get_version()
-        assert len(version) == 3
-        return int(version[0]), int(version[1]), int(version[2])
-
-    # Finally, use default.
-    logger.warning(
-        "TVM was not built against TensorRT and no version was provided in the 'tensorrt' target."
-        "Defaulting to 6.0.1."
-    )
-    return (6, 0, 1)
-
-
-def get_tensorrt_use_implicit_batch_mode() -> bool:
-    """Returns the "use_implicit_batch" attribute of the current "tensorrt" target."""
-    target = get_tensorrt_target()
-    return target.attrs["use_implicit_batch"]
-
-
-def get_tensorrt_remove_no_mac_subgraphs() -> bool:
-    """Returns the "remove_no_mac_subgraphs" attribute of the current "tensorrt" target."""
-    target = get_tensorrt_target()
-    return target.attrs["remove_no_mac_subgraphs"]
-
-
-def get_tensorrt_use_fp16() -> bool:
-    """Returns the "use_fp16" attribute of the current "tensorrt" target."""
-    target = get_tensorrt_target()
-    return target.attrs["use_fp16"]
-
-
-def partition_for_tensorrt(
-    mod: tvm.IRModule,
-    params: Optional[Dict[str, tvm.nd.NDArray]] = None,
-    # CAUTION: Can't use default Target("tensorrt") here since the target kind is only available
-    #          if is_tensorrt_compiler_enabled() == True.
-    target: Optional[tvm.target.Target] = None,
-) -> tvm.IRModule:
-    """Partition all functions in mod to greedily offload supported operators to TensorRT.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        The module to partition.
-    target : tvm.target.Target
-        A target of kind "tensorrt" describing additional partitioning and compilation options.
-    params : Optional[Dict[str, tvm.nd.NDArray]]
-        Constant input parameters.
-
-    Returns
-    -------
-    partitioned_mod : tvm.IRModule
-        The partitioned module.
-
-    """
-    assert is_tensorrt_compiler_enabled(), "Can only partition for TensorRT if it is enabled"
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-    if target is None:
-        # Use a default target. The get_tensorrt_target() function will similarly create an
-        # equivalent default target when compilation continues after partitioning.
-        target = tvm.target.Target("tensorrt")
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            RemoveDropoutPass(),
-            transform.RemoveUnusedFunctions(),
-            transform.ConvertLayout(
-                {
-                    "nn.conv1d": ["NCW", "default"],
-                    "nn.conv2d": ["NCHW", "default"],
-                    "nn.conv3d": ["NCDHW", "default"],
-                    "nn.conv2d_transpose": ["NCHW", "default"],
-                }
-            ),
-            transform.FoldConstant(),
-            transform.MergeComposite(pattern_table()),
-            transform.AnnotateTarget("tensorrt"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-            transform.InferType(),
-        ]
-    )
-    with target:
-        mod = seq(mod)
-        mod = prune_tensorrt_subgraphs(mod)
-    return mod
-
-
-def is_supported_trt_type(typ: Union[tvm.ir.TensorType, tvm.ir.TupleType], op_name: str) -> bool:
-    """Check whether a type is supported by TensorRT."""
-    supported_dtypes = ["float32"]
-    if get_tensorrt_use_fp16():
-        supported_dtypes.append("float16")
-    if isinstance(typ, tvm.ir.TensorType):
-        if typ.dtype not in supported_dtypes:
-            logger.info(f"{op_name}: Only {supported_dtypes} tensor dtypes are supported.")
-            return False
-        dims = typ.shape
-        if get_tensorrt_use_implicit_batch_mode():
-            # The first dimension can be Any.
-            dims = dims[1:]
-        for dim in dims:
-            if isinstance(dim, tvm.tir.expr.Any):
-                logger.info(f"{op_name}: Only statically known tensor shapes are supported.")
-                return False
-    elif isinstance(typ, tvm.ir.TupleType):
-        for field_type in typ.fields:
-            if not is_supported_trt_type(field_type, op_name):
-                return False
-    else:
-        logger.info(f"{op_name}: Type {typ} is not supported.")
-        return False
-    return True
-
-
-def get_op_name(expr: relay.expr.Expr) -> str:
-    """Get the operator name from an expression."""
-    if isinstance(expr, Op):
-        return expr.name
-    if isinstance(expr, Call):
-        return get_op_name(expr.op)
-    if isinstance(expr, TupleGetItem):
-        return get_op_name(expr.tuple_value)
-    if isinstance(expr, relay.Tuple):
-        return get_op_name(expr.fields[0])
-    return ""
-
-
-def get_args(expr: relay.expr.Expr) -> List[relay.expr.Expr]:
-    """Get the arguments from an expression."""
-    if isinstance(expr, Call):
-        return expr.args
-    if isinstance(expr, TupleGetItem):
-        return get_args(expr.tuple_value)
-    if isinstance(expr, relay.Tuple):
-        return [arg for args in map(get_args, expr.fields) for arg in args]
-    return []
-
-
-def get_attrs(expr: relay.expr.Expr) -> Any:
-    """Get the attributes from an expression."""
-    if isinstance(expr, Call):
-        return expr.attrs
-    if isinstance(expr, TupleGetItem):
-        return get_attrs(expr.tuple_value)
-    return {}
-
-
-CheckFunc = Callable[[Any, List[relay.expr.Expr], str], bool]
-
-
-def make_predicate(checker: CheckFunc) -> Callable[[relay.expr.Expr], bool]:
-    """Returns the pattern predicate which performs the standard checks, then invokes the
-    more primitive checker."""
-
-    def predicate(expr: relay.expr.Expr) -> bool:
-        op_name = get_op_name(expr)
-        attrs = get_attrs(expr)
-        args = get_args(expr)
-        if not all([is_supported_trt_type(arg.checked_type, op_name) for arg in args]):
-            return False
-        if not checker(attrs, args, op_name):
-            return False
-        logger.info(f"{op_name}: Predicate passes")
-        return True
-
-    return predicate
-
-
-standard_predicate = make_predicate(lambda attrs, args, op_name: True)
-
-
-def make_trt_version_checker(version: Tuple[int, int, int]) -> CheckFunc:
-    """Helper for ops which require a minimum TRT version"""
-
-    def checker(attrs: Any, args: List[relay.expr.Expr], op_name: str) -> bool:
-        if get_tensorrt_version() < version:
-            logger.info(
-                f"{op_name}: requires TensorRT version {'.'.join(map(str, version))} or higher."
-            )
-            return False
-        return True
-
-    return checker
-
-
-def make_and_checker(*checkers: CheckFunc) -> CheckFunc:
-    def checker(attrs: Any, args: List[relay.expr.Expr], op_name: str) -> bool:
-        return all([c(attrs, args, op_name) for c in checkers])
-
-    return checker
-
-
-def multiply_checker(attrs: Any, args: List[relay.expr.Expr], op_name: str) -> bool:
-    """Helper for multiply operations."""
-    shapes = [
-        [int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 for x in arg.checked_type.shape]
-        for arg in args
-    ]
-    # TODO(mbs): Follow up
-    # Batched multiply operations don't work in implicit batch mode. The following shapes
-    # have been excluded because they occur in PT MaskRCNN model. The long term solution is
-    # to switch to explicit batch mode after performance regressions are solved.
-    if all([list(map(int, shape)) in [[300, 64, 7, 7], [300, 1, 1, 1]] for shape in shapes]):
-        logger.info(f"{op_name}: Excluding since problematic in implicit batch mode")
-        return False
-    return True
-
-
-def reduce_checker(attrs: Any, args: List[relay.expr.Expr], op_name: str) -> bool:
-    """Helper for reduce operations."""
-    if get_tensorrt_use_implicit_batch_mode() and (not attrs.axis or len(attrs.axis) == 0):
-        logger.info(f"{op_name}: cannot reduce to scalar.")
-        return False
-    if attrs.exclude:
-        logger.info(f"{op_name}: exclude not supported.")
-        return False
-    if get_tensorrt_use_implicit_batch_mode() and any([x == 0 for x in map(int, attrs.axis)]):
-        logger.info(f"{op_name}: can't modify batch dimension.")
-        return False
-    return True
-
-
-def add_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if add is supported by TensorRT."""
-    shapes = [
-        [int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 for x in arg.checked_type.shape]
-        for arg in args
-    ]
-
-    # Scalars require explicit batch mode.
-    if get_tensorrt_use_implicit_batch_mode() and any([len(shape) < 1 for shape in shapes]):
-        logger.info(f"{op_name}: Scalars not supported in implicit batch mode")
-        return False
-
-    if (
-        not get_tensorrt_use_implicit_batch_mode()
-        and (isinstance(args[0], Constant) or isinstance(args[1], Constant))
-        and len(shapes[0]) > 0
-        and len(shapes[1]) > 0
-        and shapes[0][0] == shapes[1][0]
-        and shapes[0][0] != 1
-        and (len(shapes[0]) > 3 or len(shapes[1]) > 3)
-    ):
-        logger.info(f"{op_name}: bug in TRT with adding batched constants.")
-        return False
-    return True
-
-
-def batch_norm_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.batch_norm is supported by TensorRT."""
-    if len(args[0].checked_type.shape) == 5 and get_tensorrt_version() < (6, 0, 1):
-        logger.info(f"{op_name}: TensorRT 6.0.1 or higher is required for rank 5 inputs.")
-        return False
-    if len(args[0].checked_type.shape) > 5:
-        logger.info(f"{op_name}: Input rank must be 5 or less.")
-        return False
-    if int(attrs.axis) not in (1, 3):
-        logger.info(f"{op_name}: axis is {int(attrs.axis)} but must be 1 or 3.")
-        return False
-    return True
-
-
-def softmax_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.softmax is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0:
-        logger.info(f"{op_name}: can't modify batch dimension.")
-        return False
-    return True
-
-
-def conv1d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.conv1d is supported by TensorRT."""
-    if not isinstance(args[1], Constant):
-        logger.info(f"{op_name}: kernel argument must be constant.")
-        return False
-    if attrs.data_layout != "NCW":
-        logger.info(f"{op_name}: data_layout is {attrs.data_layout} but must be NCW.")
-        return False
-    if attrs.kernel_layout != "OIW":
-        logger.info(f"{op_name}: kernel_layout is {attrs.kernel_layout} but must be OIW.")
-        return False
-    return True
-
-
-def conv2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.conv2d is supported by TensorRT."""
-    assert len(args) == 2
-    if not isinstance(args[1], Constant):
-        logger.info(f"{op_name}: kernel argument must be constant.")
-        return False
-    if attrs.data_layout != "NCHW":
-        logger.info(f"{op_name}: data_layout is {attrs.data_layout} but must be NCHW.")
-        return False
-    if attrs.kernel_layout != "OIHW":
-        logger.info(f"{op_name}: kernel_layout is {attrs.kernel_layout} but must be OIHW.")
-        return False
-    if attrs.out_layout and attrs.out_layout != "NCHW":
-        logger.info(f"{op_name}: out_layout is {attrs.out_layout} but must be NCHW.")
-        return False
-    return True
-
-
-def dense_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if dense is supported by TensorRT."""
-    if not isinstance(args[1], Constant):
-        logger.info(f"{op_name}: weight must be constant")
-        return False
-    input_rank = len(args[0].checked_type.shape)
-    weight_rank = len(args[1].checked_type.shape)
-    if input_rank not in (2, 3, 4):
-        logger.info(f"{op_name}: input has rank {input_rank} but must be 2, 3 or 4.")
-        return False
-    if weight_rank != 2:
-        logger.info(f"{op_name}: weight has rank {weight_rank} but must be 2.")
-        return False
-    return True
-
-
-def batch_matmul_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if dense is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode() and len(args[0].checked_type.shape) != len(
-        args[1].checked_type.shape
-    ):
-        logger.info(f"{op_name}: requires use_implict_batch=False.")
-        return False
-    return True
-
-
-def layer_norm_checker(attrs: Any, args: List[relay.expr.Expr], op_name: str) -> bool:
-    """Check if dense is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0:
-        logger.info(f"{op_name}: requires use_implict_batch=False.")
-        return False
-    return True
-
-
-def bias_add_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.bias_add is supported by TensorRT."""
-    input_rank = len(args[0].checked_type.shape)
-    if input_rank not in (2, 3, 4):
-        logger.info(f"{op_name}: input rank is {input_rank} but must be 2, 3 or 4.")
-        return False
-    return True
-
-
-def max_pool_2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.max_pool2d is supported by TensorRT."""
-    if attrs.layout != "NCHW":
-        logger.info(f"{op_name}: layout is {attrs.layout} but must be NCHW.")
-        return False
-    if attrs.ceil_mode and get_tensorrt_version() < (5, 1, 5):
-        logger.info(f"{op_name}: ceil_mode=True requires TensorRT 5.1.5 or greater.")
-        return False
-    return True
-
-
-def avg_pool_2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.avg_pool2d is supported by TensorRT."""
-    if attrs.layout != "NCHW":
-        logger.info(f"{op_name}: layout is {attrs.layout} but must be NCHW.")
-        return False
-    if (
-        attrs.count_include_pad
-        and len(attrs.padding) == 4
-        and (
-            int(attrs.padding[0]) != int(attrs.padding[2])
-            or int(attrs.padding[1]) != int(attrs.padding[3])
-        )
-    ):
-        logger.info(
-            f"{op_name}: inclusive-counted blended or average "
-            "pooling is not supported in combination with asymmetric padding"
-        )
-        return False
-    if attrs.ceil_mode and get_tensorrt_version() < (5, 1, 5):
-        logger.info(f"{op_name}: ceil_mode=True requires TensorRT 5.1.5 or greater.")
-        return False
-    return True
-
-
-def global_max_pool_2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.global_max_pool2d is supported by TensorRT."""
-    if attrs.layout != "NCHW":
-        logger.info(f"{op_name}: layout is {attrs.layout} but must be NCHW.")
-        return False
-    return True
-
-
-def global_avg_pool_2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.global_avg_pool2d is supported by TensorRT."""
-    if attrs.layout != "NCHW":
-        logger.info(f"{op_name}: layout is {attrs.layout} but must be NCHW.")
-        return False
-    return True
-
-
-def expand_dims_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if expand_dims is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0:
-        logger.info(f"{op_name}: can't modify batch dimension.")
-        return False
-    return True
-
-
-def squeeze_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if squeeze is supported by TensorRT."""
-    if not attrs.axis:
-        logger.info(f"{op_name}: must explicitly set axis.")
-        return False
-    if get_tensorrt_use_implicit_batch_mode() and any([axis == 0 for axis in map(int, attrs.axis)]):
-        logger.info(f"{op_name}: can't modify batch dimension.")
-        return False
-    return True
-
-
-def concatenate_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if concatenate is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode():
-        if int(attrs.axis) == 0:
-            logger.info(f"{op_name}: can't modify batch dimension.")
-            return False
-
-    if not isinstance(args[0], relay.Tuple):
-        logger.info("f{op_name}: concatenate must be applied to a literal tuple")
-        return False
-
-    for tuple_input in args[0].fields:
-        if isinstance(tuple_input, Constant):
-            logger.info(f"{op_name}: can't concatenate tensors with constants.")
-            return False
-
-    return True
-
-
-def split_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if split is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0:
-        logger.info(f"{op_name}: can't modify batch dimension.")
-        return False
-    return True
-
-
-def conv2d_transpose_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.conv2d_transpose is supported by TensorRT."""
-    if attrs.data_layout != "NCHW":
-        logger.info(f"{op_name}: data_layout is {attrs.data_layout} but must be NCHW.")
-        return False
-    if attrs.kernel_layout != "OIHW":
-        logger.info(f"{op_name}: kernel_layout is {attrs.kernel_layout} but must be OIHW.")
-        return False
-    if attrs.out_layout and attrs.out_layout != "NCHW":
-        logger.info(f"{op_name}: out_layout is {attrs.out_layout} but must be NCHW.")
-        return False
-    if attrs.dilation and any([rate != 1 for rate in map(int, attrs.dilation)]):
-        logger.info(f"{op_name}: dilation rate must be 1.")
-        return False
-    return True
-
-
-def transpose_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if transpose is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode() and int(attrs.axes[0]) != 0:
-        logger.info(f"{op_name}: can't modify batch dimension.")
-        return False
-    return True
-
-
-def layout_transform_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if layout_transform is supported by TensorRT."""
-    if (attrs.src_layout, attrs.dst_layout) not in [
-        ("NCHW", "NHWC"),
-        ("NHWC", "NCHW"),
-        ("NDHWC", "NCDHW"),
-        ("NCDHW", "NDHWC"),
-    ]:
-        logger.info(f"{op_name}: {attrs.src_layout} to {attrs.dst_layout} is not supported.")
-        return False
-    return True
-
-
-def reshape_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if reshape is supported by TensorRT."""
-    if any([x < -1 for x in map(int, attrs.newshape)]):
-        logger.info(f"{op_name}: new shape dims must be explicit.")
-        return False
-    if get_tensorrt_use_implicit_batch_mode():
-        shape = args[0].checked_type.shape
-        new_shape = attrs.newshape
-        if len(new_shape) == 0 or len(shape) == 0:
-            logger.info(f"{op_name}: Can't reshape to or from scalar.")
-            return False
-        dynamic_reshape = any([isinstance(x, tvm.tir.expr.Any) for x in shape])
-
-        if dynamic_reshape:
-            # Make sure that the batch dim is unmodified.
-            if int(new_shape[0]) < 0:
-                for shape_val, new_shape_val in zip(shape[1:], new_shape[1:]):
-                    if not (
-                        isinstance(shape_val, (int, tvm.tir.expr.IntImm))
-                        and isinstance(new_shape_val, (int, tvm.tir.expr.IntImm))
-                        and int(shape_val) == int(new_shape_val)
-                    ):
-                        logger.info(f"{op_name}: can't modify batch dimension")
-                        return False
-            elif int(new_shape[0]) > 0:
-                # Currently we only allow dim[0] to be Any, so this branch will always be False
-                if not (
-                    isinstance(shape[0], (int, tvm.tir.expr.IntImm))
-                    and isinstance(new_shape[0], (int, tvm.tir.expr.IntImm))
-                    and int(shape[0]) == int(new_shape[0])
-                ):
-                    logger.info(f"{op_name}: can't modify batch dimension")
-                    return False
-        else:
-            shape = list(map(int, shape))
-            new_shape = list(map(int, new_shape))
-
-            # TRT cannot modify batch dimension.
-            original_volume = np.prod(shape)
-            # First, resolve 0.
-            for i, value in enumerate(new_shape):
-                if value == 0:
-                    new_shape[i] = shape[i]
-            # Resolve -1.
-            for i, value in enumerate(new_shape):
-                if value == -1:
-                    new_shape[i] = original_volume // np.prod([x for x in new_shape if x != -1])
-            # Remove batch dimension and see if volumes match
-            if shape[0] != new_shape[0]:
-                logger.info(f"{op_name}: can't modify batch dimension.")
-                return False
-    return True
-
-
-def pad_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.pad is supported by TensorRT."""
-    pad_value = args[1]
-    if not isinstance(pad_value, relay.Constant):
-        logger.info(f"{op_name}: pad argument must be constant")
-        return False
-    pad_value = pad_value.data.numpy().item()
-    if attrs.pad_mode != "constant":
-        logger.info(f"{op_name}: pad mode is {attrs.pad_mode} but must be constant.")
-        return False
-    if pad_value > 0.0:
-        logger.info(f"{op_name}: pad value is {pad_value} but must be 0.0.")
-        return False
-    if len(attrs.pad_width) not in [4, 5]:
-        logger.info(f"{op_name}: can only pad 4D or 5D inputs")
-        return False
-    if any([x != 0 for x in attrs.pad_width[0]]) or any([x != 0 for x in attrs.pad_width[1]]):
-        logger.info(f"{op_name}: can't pad batch or channel dimensions.")
-        return False
-    if len(attrs.pad_width) == 5 and any([x != 0 for x in attrs.pad_width[2]]):
-        logger.info(f"{op_name}: can only pad last two dimensions for 5D inputs.")
-        return False
-    return True
-
-
-def strided_slice_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if strided_slice is supported by TensorRT."""
-    if get_tensorrt_use_implicit_batch_mode():
-        batch_dim_begin_modified = attrs.begin[0] is not None and int(attrs.begin[0]) != 0
-        batch_dim_end_modified = (
-            attrs.end[0] is not None
-            and int(attrs.end[0]) != -1
-            and int(attrs.end[0]) != int(args[0].checked_type.shape[0])
-        )
-        if batch_dim_begin_modified or batch_dim_end_modified:
-            logger.info(f"{op_name}: can't modify batch dimension.")
-            return False
-    if any([x is not None and x <= 0 for x in attrs.strides]):
-        logger.info(f"{op_name}: stride must be positive")
-        return False
-    length: int = len(attrs.axes) if attrs.axes is not None else len(args[0].checked_type.shape)
-    for i in range(0, length):
-        begin = int(attrs.begin[i])
-        if attrs.slice_mode == "end":
-            end = (
-                int(attrs.end[i])
-                if attrs.end[i] is not None and int(attrs.end[i]) != -1
-                else args[0].checked_type.shape[i]
-            )
-            size = int(end) - int(begin)
-        elif attrs.slice_mode == "size":
-            size = (
-                int(attrs.end[i])
-                if attrs.end[i] is not None and int(attrs.end[i]) != -1
-                else args[0].checked_type.shape[i] - begin
-            )
-        else:
-            logger.warning(f"{op_name}: unknown slice mode encountered")
-            size = 1
-
-        if int(size) < 1:
-            logger.info(f"{op_name}: size of slice must be at least 1")
-            return False
-
-    return True
-
-
-def adaptive_max_pool2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.adaptive_max_pool2d is supported by TensorRT."""
-    if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]):
-        logger.info(f"{op_name}: output size must be (1, 1).")
-        return False
-    return True
-
-
-def adaptive_avg_pool2d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.adaptive_avg_pool2d is supported by TensorRT."""
-    if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]):
-        logger.info(f"{op_name}: output size must be (1, 1).")
-        return False
-    return True
-
-
-def conv3d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.conv3d is supported by TensorRT."""
-    if not isinstance(args[1], Constant):
-        logger.info(f"{op_name}: kernel argument must be constant.")
-        return False
-    if attrs.data_layout != "NCDHW":
-        logger.info(f"{op_name}: data_layout is {attrs.data_layout} but must be NCDHW.")
-        return False
-    if attrs.kernel_layout != "OIDHW":
-        logger.info(f"{op_name}: kernel_layout is {attrs.kernel_layout} but must be OIDHW.")
-        return False
-    if attrs.out_layout and attrs.out_layout != "NCDHW":
-        logger.info(f"{op_name}: out_layout is {attrs.out_layout} but must be NCDHW.")
-        return False
-    return True
-
-
-def max_pool_3d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.max_pool3d is supported by TensorRT."""
-    if attrs.layout != "NCDHW":
-        logger.info(f"{op_name}: layout is {attrs.layout} but must be NCDHW.")
-        return False
-    return True
-
-
-def avg_pool_3d_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.avg_pool3d is supported by TensorRT."""
-    if attrs.layout != "NCDHW":
-        logger.info(f"{op_name}: layout is {attrs.layout} but must be NCDHW.")
-        return False
-    return True
-
-
-def conv3d_transpose_checker(
-    attrs: Any, args: List[relay.expr.Expr], op_name: str
-) -> bool:  # pylint: disable=unused-variable
-    """Check if nn.conv3d_transpose is supported by TensorRT."""
-    if attrs.data_layout != "NCDHW":
-        logger.info(f"{op_name}: data_layout is {attrs.data_layout} but must be NCDHW.")
-        return False
-    if attrs.kernel_layout != "OIDHW":
-        logger.info(f"{op_name}: kernel_layout is {attrs.kernel_layout} but must be OIDHW.")
-        return False
-    if attrs.out_layout and attrs.out_layout != "NCDHW":
-        logger.info(f"{op_name}: out_layout is {attrs.out_layout} but must be NCDHW.")
-        return False
-    if attrs.dilation and any([rate != 1 for rate in map(int, attrs.dilation)]):
-        logger.info(f"{op_name}: dilation rate must be 1.")
-        return False
-    if attrs.output_padding and any([x != 0 for x in map(int, attrs.output_padding)]):
-        logger.info(f"{op_name}: output padding is not supported.")
-        return False
-    return True
-
-
-def unary_op_pattern(op: relay.expr.Expr) -> relay.dataflow_pattern.DFPattern:
-    """Matches unary operation"""
-    return is_op(op)(wildcard())
-
-
-def unary_op_pattern_with_any_tuple(op: relay.expr.Expr) -> relay.dataflow_pattern.DFPattern:
-    """Matches unary operation with literal tuple argument"""
-    return is_op(op)(is_tuple(None))
-
-
-def binary_op_pattern(op: relay.expr.Expr) -> relay.dataflow_pattern.DFPattern:
-    """Matches binary operation"""
-    return is_op(op)(wildcard(), wildcard())
-
-
-def binary_op_pattern_with_const(op: relay.expr.Expr) -> relay.dataflow_pattern.DFPattern:
-    """Matches binary operation with rhs arg a constant"""
-    return is_op(op)(wildcard(), is_constant())
-
-
-def proj_five_op_pattern_with_const(op: relay.expr.Expr) -> relay.dataflow_pattern.DFPattern:
-    return is_tuple_get_item(
-        is_op(op)(wildcard(), is_constant(), is_constant(), is_constant(), is_constant()), 0
-    )
-
-
-@register_pattern_table("tensorrt")
-def pattern_table() -> List[
-    Tuple[str, relay.dataflow_pattern.DFPattern, Callable[[relay.expr.Call], bool]]
-]:
-    """Get the Tensorrt compiler pattern table for supported ops."""
-
-    return [
-        (
-            "tensorrt.nn.conv3d",
-            binary_op_pattern_with_const("nn.conv3d"),
-            make_predicate(make_and_checker(make_trt_version_checker((6, 0, 1)), conv3d_checker)),
-        ),
-        (
-            "tensorrt.nn.conv2d",
-            binary_op_pattern_with_const("nn.conv2d"),
-            make_predicate(conv2d_checker),
-        ),
-        (
-            "tensorrt.nn.conv1d",
-            binary_op_pattern_with_const("nn.conv1d"),
-            make_predicate(conv1d_checker),
-        ),
-        (
-            "tensorrt.nn.conv2d_transpose",
-            binary_op_pattern("nn.conv2d_transpose"),
-            make_predicate(conv2d_transpose_checker),
-        ),
-        ("tensorrt.squeeze", binary_op_pattern("squeeze"), make_predicate(squeeze_checker)),
-        ("tensorrt.add", binary_op_pattern("add"), make_predicate(add_checker)),
-        (
-            "tensorrt.nn.dense",
-            binary_op_pattern_with_const("nn.dense"),
-            make_predicate(dense_checker),
-        ),
-        (
-            "tensorrt.nn.bias_add",
-            binary_op_pattern("nn.bias_add"),
-            make_predicate(bias_add_checker),
-        ),
-        (
-            "tensorrt.nn.batch_matmul",
-            binary_op_pattern("nn.batch_matmul"),
-            make_predicate(batch_matmul_checker),
-        ),
-        ("tensorrt.divide", binary_op_pattern("divide"), standard_predicate),
-        ("tensorrt.multiply", binary_op_pattern("multiply"), make_predicate(multiply_checker)),
-        ("tensorrt.subtract", binary_op_pattern("subtract"), standard_predicate),
-        ("tensorrt.power", binary_op_pattern("power"), standard_predicate),
-        ("tensorrt.maximum", binary_op_pattern("maximum"), standard_predicate),
-        ("tensorrt.minimum", binary_op_pattern("minimum"), standard_predicate),
-        ("tensorrt.nn.relu", unary_op_pattern("nn.relu"), standard_predicate),
-        (
-            "tensorrt.nn.leaky_relu",
-            unary_op_pattern("nn.leaky_relu"),
-            make_predicate(make_trt_version_checker((5, 1, 5))),
-        ),
-        ("tensorrt.nn.pad", unary_op_pattern("nn.pad"), standard_predicate),
-        ("tensorrt.sigmoid", unary_op_pattern("sigmoid"), standard_predicate),
-        ("tensorrt.tanh", unary_op_pattern("tanh"), standard_predicate),
-        ("tensorrt.exp", unary_op_pattern("exp"), standard_predicate),
-        ("tensorrt.log", unary_op_pattern("log"), standard_predicate),
-        ("tensorrt.sqrt", unary_op_pattern("sqrt"), standard_predicate),
-        ("tensorrt.abs", unary_op_pattern("abs"), standard_predicate),
-        ("tensorrt.negative", unary_op_pattern("negative"), standard_predicate),
-        ("tensorrt.nn.batch_flatten", unary_op_pattern("nn.batch_flatten"), standard_predicate),
-        ("tensorrt.clip", unary_op_pattern("clip"), standard_predicate),
-        (
-            "tensorrt.sin",
-            unary_op_pattern("sin"),
-            make_predicate(make_trt_version_checker((5, 1, 5))),
-        ),
-        (
-            "tensorrt.cos",
-            unary_op_pattern("cos"),
-            make_predicate(make_trt_version_checker((5, 1, 5))),
-        ),
-        (
-            "tensorrt.atan",
-            unary_op_pattern("atan"),
-            make_predicate(make_trt_version_checker((5, 1, 5))),
-        ),
-        (
-            "tensorrt.ceil",
-            unary_op_pattern("ceil"),
-            make_predicate(make_trt_version_checker((5, 1, 5))),
-        ),
-        ("tensorrt.floor", unary_op_pattern("floor"), standard_predicate),
-        (
-            "tensorrt.erf",
-            unary_op_pattern("erf"),
-            make_predicate(make_trt_version_checker((7, 0, 0))),
-        ),
-        ("tensorrt.sum", unary_op_pattern("sum"), make_predicate(reduce_checker)),
-        ("tensorrt.prod", unary_op_pattern("prod"), make_predicate(reduce_checker)),
-        ("tensorrt.max", unary_op_pattern("max"), make_predicate(reduce_checker)),
-        ("tensorrt.min", unary_op_pattern("min"), make_predicate(reduce_checker)),
-        ("tensorrt.max", unary_op_pattern("max"), make_predicate(reduce_checker)),
-        ("tensorrt.mean", unary_op_pattern("mean"), make_predicate(reduce_checker)),
-        (
-            "tensorrt.concatenate",
-            unary_op_pattern_with_any_tuple("concatenate"),
-            make_predicate(concatenate_checker),
-        ),
-        (
-            "tensorrt.expand_dims",
-            unary_op_pattern("expand_dims"),
-            make_predicate(expand_dims_checker),
-        ),
-        (
-            "tensorrt.layout_transform",
-            unary_op_pattern("layout_transform"),
-            make_predicate(layout_transform_checker),
-        ),
-        ("tensorrt.transpose", unary_op_pattern("transpose"), make_predicate(transpose_checker)),
-        ("tensorrt.reshape", unary_op_pattern("reshape"), make_predicate(reshape_checker)),
-        ("tensorrt.split", unary_op_pattern("split"), make_predicate(split_checker)),
-        ("tensorrt.nn.pad", unary_op_pattern("nn.pad"), make_predicate(pad_checker)),
-        (
-            "tensorrt.strided_slice",
-            unary_op_pattern("strided_slice"),
-            make_predicate(
-                make_and_checker(make_trt_version_checker((5, 1, 5)), strided_slice_checker)
-            ),
-        ),
-        (
-            "tensorrt.nn.adaptive_avg_pool2d",
-            unary_op_pattern("nn.adaptive_avg_pool2d"),
-            make_predicate(adaptive_avg_pool2d_checker),
-        ),
-        (
-            "tensorrt.nn.adaptive_max_pool2d",
-            unary_op_pattern("nn.adaptive_max_pool2d"),
-            make_predicate(adaptive_max_pool2d_checker),
-        ),
-        (
-            "tensorrt.nn.max_pool3d",
-            unary_op_pattern("nn.max_pool3d"),
-            make_predicate(
-                make_and_checker(make_trt_version_checker((6, 0, 1)), max_pool_3d_checker)
-            ),
-        ),
-        (
-            "tensorrt.nn.avg_pool3d",
-            unary_op_pattern("nn.avg_pool3d"),
-            make_predicate(
-                make_and_checker(make_trt_version_checker((6, 0, 1)), avg_pool_3d_checker)
-            ),
-        ),
-        (
-            "tensorrt.nn.conv3d_transpose",
-            unary_op_pattern("nn.conv3d_transpose"),
-            make_predicate(
-                make_and_checker(make_trt_version_checker((6, 0, 1)), conv3d_transpose_checker)
-            ),
-        ),
-        ("tensorrt.nn.softmax", unary_op_pattern("nn.softmax"), make_predicate(softmax_checker)),
-        (
-            "tensorrt.nn.layer_norm",
-            unary_op_pattern("nn.layer_norm"),
-            make_predicate(layer_norm_checker),
-        ),
-        (
-            "tensorrt.nn.max_pool2d",
-            unary_op_pattern("nn.max_pool2d"),
-            make_predicate(max_pool_2d_checker),
-        ),
-        (
-            "tensorrt.nn.avg_pool2d",
-            unary_op_pattern("nn.avg_pool2d"),
-            make_predicate(avg_pool_2d_checker),
-        ),
-        (
-            "tensorrt.nn.global_max_pool2d",
-            unary_op_pattern("nn.global_max_pool2d"),
-            make_predicate(global_max_pool_2d_checker),
-        ),
-        (
-            "tensorrt.nn.global_avg_pool2d",
-            unary_op_pattern("nn.global_avg_pool2d"),
-            make_predicate(global_avg_pool_2d_checker),
-        ),
-        (
-            "tensorrt.nn.batch_norm",
-            proj_five_op_pattern_with_const("nn.batch_norm"),
-            make_predicate(batch_norm_checker),
-        ),
-    ]
-
-
-class IsComputeIntensiveGraph(ExprVisitor):
-    """
-    Visits the Graph recursively and checks if it contains compute heavy ops like convolutions and
-    its transpose, dense and batch mat-mul.
-    """
-
-    def __init__(self) -> None:
-        ExprVisitor.__init__(self)
-        self.is_compute_intensive = False
-
-    def visit_call(self, call: relay.expr.Call) -> None:
-        compute_intensive_ops = {
-            "nn.conv1d",
-            "nn.conv2d",
-            "nn.conv2d_transpose",
-            "nn.conv3d",
-            "nn.conv3d_transpose",
-            "nn.dense",
-            "nn.batch_matmul",
-            "sum",
-            "prod",
-            "max",
-            "min",
-            "mean",
-        }
-        if isinstance(call.op, tvm.tir.op.Op):
-            if str(call.op.name) in compute_intensive_ops:
-                self.is_compute_intensive = True
-
-        return super().visit_call(call)
-
-    def is_graph_compute_intensive(self, subgraph: relay.expr.Expr) -> bool:
-        """
-        This function recursively visits the graph and checks if it's compute intensive"
-        """
-        self.visit(subgraph)
-        return self.is_compute_intensive
-
-
-def is_valid_subgraph(params: List[relay.expr.Var], body: relay.expr.Expr) -> bool:
-    """Final check on whether the subgraph is valid and should be offloaded to TensorRT."""
-    # Remove invalid subgraphs for implicit batch mode.
-    if get_tensorrt_use_implicit_batch_mode():
-        input_batch_sizes = []
-        for var in params:
-            # In implicit batch mode, all inputs must have same batch size
-            # TODO: (codeislife99) : Fix different dynamic batch size inputs
-            if isinstance(var.checked_type, relay.TupleType):
-                for tupe_type in var.checked_type.fields:
-                    # Scalar inputs not allowed
-                    if len(tupe_type.shape) == 0:
-                        logger.info("tensorrt: scalar inputs not supported")
-                        return False
-
-                    if not isinstance(tupe_type.shape[0], tvm.tir.expr.Any):
-                        input_batch_sizes.append(int(tupe_type.shape[0]))
-            else:
-                # Scalar inputs not allowed
-                if len(var.checked_type.shape) == 0:
-                    logger.info("tensorrt: scalar inputs not supported")
-                    return False
-                if not isinstance(var.checked_type.shape[0], tvm.tir.expr.Any):
-                    input_batch_sizes.append(int(var.checked_type.shape[0]))
-
-        if len(input_batch_sizes) > 1 and len(set(input_batch_sizes)) != 1:
-            logger.info("tensorrt: inputs have different batch sizes: %s", input_batch_sizes)
-            return False
-
-    if get_tensorrt_remove_no_mac_subgraphs():
-        if not IsComputeIntensiveGraph().is_graph_compute_intensive(body):
-            logger.info("tensorrt: not a compute-intensize sub-graph")
-            return False
-
-    return True
-
-
-def prune_tensorrt_subgraphs(mod: tvm.IRModule) -> tvm.IRModule:
-    """
-    Un-partition those partitions which:
-     - have no multiply-accumulates (if remove_no_mac_subgraphs is True)
-     - can't actually be supported by TensorRT now that we see the whole partition."""
-    global_vars_to_inline = [
-        gv
-        for gv in mod.get_global_vars()
-        if mod[gv].attrs
-        and mod[gv].attrs["Compiler"] == "tensorrt"
-        and not is_valid_subgraph(mod[gv].params, mod[gv].body)
-    ]
-    return relay.transform.InlineCompilerFunctionsBoundTo(global_vars_to_inline)(mod)
-
-
-class RemoveDropout(ExprMutator):
-    """
-    Removes all nn.dropout from an expr.
-    """
-
-    def visit_tuple_getitem(self, op: TupleGetItem) -> relay.expr.Expr:
-        visit = super().visit_tuple_getitem(op)
-        if visit.index != 0:
-            return visit
-        if (
-            isinstance(visit.tuple_value, Call)
-            and isinstance(visit.tuple_value.op, Op)
-            and visit.tuple_value.op.name == "nn.dropout"
-            and visit.index == 0
-        ):
-            return visit.tuple_value.args[0]
-        return visit
-
-
-@transform.function_pass(opt_level=0)
-class RemoveDropoutPass:
-    def transform_function(
-        self, func: relay.function.Function, mod: tvm.IRModule, _: tvm.transform.PassContext
-    ) -> relay.function.Function:
-        return RemoveDropout().visit(func)
diff --git a/python/tvm/relay/op/contrib/vitis_ai.py b/python/tvm/relay/op/contrib/vitis_ai.py
deleted file mode 100644
index 185fdc2ce82e..000000000000
--- a/python/tvm/relay/op/contrib/vitis_ai.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, no-else-return, E1102
-"""Vitis-AI codegen annotation of supported operators"""
-
-import warnings
-import numpy as np
-
-from tvm import relay
-import tvm._ffi
-from tvm.relay import transform
-from tvm.relay.expr import Tuple, TupleGetItem
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-
-# Placeholder for PyXIR module
-pyxir = None
-
-
-def enabled():
-    """Return whether Vitis-AI support is available"""
-    if not tvm.get_global_func("relay.ext.vitis_ai.available", True):
-        print("Skip because Vitis-AI codegen is not available.")
-        return False
-    return True
-
-
-@transform.function_pass(opt_level=0)
-class VitisAIAnnotationPass:
-    """Responsible for annotating Relay expressions for Vitis-AI DPU accelerators
-
-    Parameters
-    ----------
-    compiler : str
-        The compiler name used for annotations (`vitis_ai`).
-    dpu_target : str
-        The Vitis AI DPU target identifier.
-    params : dict
-        A dictionary containing the module's parameters.
-    """
-
-    def __init__(self, compiler, dpu_target, params):
-        global pyxir
-        try:
-            if pyxir is None:
-                pyxir = __import__("pyxir")
-                __import__("pyxir.frontend.tvm")
-        except ImportError:
-            # add "from None" to silence
-            # "During handling of the above exception, another exception occurred"
-            raise ImportError(
-                "The pyxir package is required for the Vitis AI backend. "
-                "Please install it first. "
-                "Help: (https://tvm.apache.org/docs/deploy/vitis_ai.html) "
-            ) from None
-
-        self.compiler = compiler
-        self.dpu_target = dpu_target
-        self.params = params
-
-    def transform_function(self, func, mod, ctx):
-        """Transform function for annotating Relay module"""
-        annotator = self
-
-        class Annotator(tvm.relay.ExprMutator):
-            """Annotator for Vitis-AI DPU accelerators"""
-
-            def visit_tuple(self, tup):
-                """Add compiler_begin and compiler_end annotations to Tuple"""
-                field_list = []
-                cond = int(hash(tup))
-                for field in tup.fields:
-                    if cond in annotator.relay_ids:
-                        field_list.append(compiler_begin(super().visit(field), annotator.compiler))
-                    else:
-                        field_list.append(super().visit(field))
-                if cond in annotator.relay_ids:
-                    return compiler_end(Tuple(field_list), annotator.compiler)
-                else:
-                    return Tuple(field_list)
-
-            def visit_tuple_getitem(self, op):
-                """Add compiler_begin and compiler_end annotations to TupleGetItem"""
-                if int(hash(op.tuple_value)) in annotator.relay_ids:
-                    tuple_value = compiler_begin(super().visit(op.tuple_value), annotator.compiler)
-                    return compiler_end(TupleGetItem(tuple_value, op.index), annotator.compiler)
-                else:
-                    tuple_value = super().visit(op.tuple_value)
-                    return TupleGetItem(tuple_value, op.index)
-
-            def visit_call(self, call):
-                """Add compiler_begin and compiler_end annotations to the Call expr"""
-                if int(hash(call)) in annotator.relay_ids:
-                    new_args = []
-                    for arg in call.args:
-                        ann = compiler_begin(super().visit(arg), annotator.compiler)
-                        new_args.append(ann)
-                    new_call = relay.Call(call.op, new_args, call.attrs, call.type_args)
-                    return compiler_end(new_call, annotator.compiler)
-
-                else:
-                    return super().visit_call(call)
-
-        xgraph = pyxir.frontend.tvm.from_relay(mod, self.params, postprocessing=None)
-        xgraph = pyxir.partition(xgraph, targets=[self.dpu_target])
-
-        layers = xgraph.get_layers()
-        relay_ids = [
-            list(np.array(layer.attrs["relay_id"]).flatten())
-            for layer in layers
-            if layer.target == self.dpu_target
-        ]
-        self.relay_ids = [item for sublist in relay_ids for item in sublist]
-
-        return Annotator().visit(func)
-
-
-def annotation(mod, params, target):
-    """DEPRECATED
-
-    Annotate Relay expression for offloading operators to Vitis AI DPU accelerators
-    NOTE: This function does the same as the next one (`partition_for_vitis_ai`) but is
-    still here for backward compatibility"""
-    # We need type information for supporting models that contain operations that don't
-    #   have a Relay to XLayer translation
-    warnings.warn(
-        "tvm.relay.op.contrib.vitis_ai.annotation() is being deprecated."
-        " Please use tvm.relay.op.contrib.vitis_ai.partition_for_vitis_ai() instead. "
-        " Check out https://tvm.apache.org/docs/deploy/vitis_ai.html for documentation. "
-    )
-    mod = relay.transform.InferType()(mod)
-    mod = VitisAIAnnotationPass("vitis_ai", target, params)(mod)
-    return mod
-
-
-def partition_for_vitis_ai(mod, params=None, dpu=None, **opts):
-    """Partition the Relay expression for offloading operators to Vitis AI DPU
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-    params : Optional[Dict[str, NDArray]]
-        Constant input parameters.
-    dpu : str
-        The DPU identifier (e.g. DPUCZDX8G-zcu104, DPUCADF8H)
-
-    Returns
-    -------
-    ret : Module
-    """
-
-    if dpu is None:
-        raise ValueError("Please pass Vitis AI DPU identifier to the partitioning function")
-
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-    desired_layouts_in_partition = {
-        "nn.conv2d": ["NHWC", "default"],
-        "nn.upsampling": ["NHWC"],
-        "image.resize2d": ["NHWC"],
-    }
-    desired_layouts_in_main = {
-        "nn.conv2d": ["NCHW", "default"],
-        "nn.upsampling": ["NCHW"],
-        "image.resize2d": ["NCHW"],
-    }
-    seq = tvm.transform.Sequential(
-        [
-            transform.RemoveUnusedFunctions(),
-            transform.ConvertLayout(desired_layouts_in_partition),
-            transform.FoldConstant(),
-            transform.InferType(),
-            VitisAIAnnotationPass("vitis_ai", dpu, params),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-            transform.RemoveUnusedFunctions(),
-            transform.ConvertLayout(desired_layouts_in_main),
-            transform.FoldConstant(),
-        ]
-    )
-
-    with tvm.transform.PassContext(opt_level=3):
-        return seq(mod)
diff --git a/python/tvm/relay/op/dyn/__init__.py b/python/tvm/relay/op/dyn/__init__.py
deleted file mode 100644
index 45bab2b1c4b5..000000000000
--- a/python/tvm/relay/op/dyn/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name
-"""The Relay namespace containing dynamic ops."""
-
-from . import _algorithm
-from . import _transform
-from . import _tensor
-
-from . import image
diff --git a/python/tvm/relay/op/dyn/_algorithm.py b/python/tvm/relay/op/dyn/_algorithm.py
deleted file mode 100644
index ba903e680bbd..000000000000
--- a/python/tvm/relay/op/dyn/_algorithm.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"Definition of classic algorithms"
-# pylint: disable=invalid-name,unused-argument
-from __future__ import absolute_import
-
-from tvm.te.hybrid import script
-from tvm.runtime import convert
-
-from .. import strategy
-from .. import op as _reg
-from ..op import OpPattern, register_pattern
-from ..op import register_strategy
-
-# topk
-register_strategy("dyn.topk", strategy.topk_strategy)
-register_pattern("dyn.topk", OpPattern.OPAQUE)
-
-
-@script
-def _topk_shape_func_input_data(data, k, axis):
-    ndim = len(data.shape)
-    val_out = output_tensor((ndim,), "int64")
-    indices_out = output_tensor((ndim,), "int64")
-
-    for i in const_range(ndim):
-        if i != axis:
-            val_out[i] = int64(data.shape[i])
-            indices_out[i] = int64(data.shape[i])
-        else:
-            if k[0] < 1:
-                val_out[i] = int64(data.shape[i])
-                indices_out[i] = int64(data.shape[i])
-            else:
-                val_out[i] = int64(k[0])
-                indices_out[i] = int64(k[0])
-    return val_out, indices_out
-
-
-@_reg.register_shape_func("dyn.topk", True)
-def topk_shape_func(attrs, inputs, _):
-    """
-    Shape func for topk.
-    """
-    axis = attrs.axis
-    if axis < 0:
-        axis += len(inputs[0].shape)
-    val_out, indices_out = _topk_shape_func_input_data(inputs[0], inputs[1], convert(axis))
-
-    ret_type = attrs.ret_type
-    if ret_type == "both":
-        ret = [val_out, indices_out]
-    elif ret_type == "values":
-        ret = [val_out]
-    else:
-        ret = [indices_out]
-
-    return ret
diff --git a/python/tvm/relay/op/dyn/_make.py b/python/tvm/relay/op/dyn/_make.py
deleted file mode 100644
index ab88fe872458..000000000000
--- a/python/tvm/relay/op/dyn/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.dyn._make", __name__)
diff --git a/python/tvm/relay/op/dyn/_tensor.py b/python/tvm/relay/op/dyn/_tensor.py
deleted file mode 100644
index 5d5d5556c8ea..000000000000
--- a/python/tvm/relay/op/dyn/_tensor.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, len-as-condition
-"""Backend compiler related feature registration for dynamic ops"""
-
-from tvm import topi
-
-from ..op import register_shape_func, register_compute
-from ..op import register_broadcast_schedule
-from ..op import register_pattern, OpPattern
-from .._tensor import full_shape_func, no_data_full_shape_func
-
-# ones
-@register_compute("dyn.ones")
-def ones_compute(attrs, inputs, output_type):
-    assert len(inputs) == 1
-    return [topi.full(output_type.shape, output_type.dtype, 1.0)]
-
-
-register_broadcast_schedule("dyn.ones")
-register_pattern("dyn.ones", OpPattern.ELEMWISE)
-
-
-@register_compute("dyn.zeros")
-def zeros_compute(attrs, inputs, output_type):
-    assert len(inputs) == 1
-    return [topi.full(output_type.shape, output_type.dtype, 0.0)]
-
-
-register_broadcast_schedule("dyn.zeros")
-register_pattern("dyn.zeros", OpPattern.ELEMWISE)
-
-register_shape_func("dyn.broadcast_to", True, full_shape_func)
-register_shape_func("dyn.ones", True, no_data_full_shape_func)
-register_shape_func("dyn.zeros", True, no_data_full_shape_func)
-register_shape_func("dyn.full", True, full_shape_func)
diff --git a/python/tvm/relay/op/dyn/_transform.py b/python/tvm/relay/op/dyn/_transform.py
deleted file mode 100644
index a1f014cb422f..000000000000
--- a/python/tvm/relay/op/dyn/_transform.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Backend compiler related feature registration"""
-# pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks, too-many-local-variables, too-many-arguments
-from __future__ import absolute_import
-
-from tvm.runtime import convert
-from tvm.te.hybrid import script
-
-from .. import op as _reg
-
-_reg.register_broadcast_schedule("dyn.broadcast_to")
-_reg.register_injective_schedule("dyn.reshape")
-_reg.register_injective_schedule("dyn.expand_dims")
-_reg.register_injective_schedule("dyn.squeeze")
-_reg.register_broadcast_schedule("dyn.tile")
-_reg.register_injective_schedule("dyn.one_hot")
-_reg.register_injective_schedule("dyn.full")
-_reg.register_injective_schedule("dyn.strided_slice")
-_reg.register_injective_schedule("dyn.sparse_to_dense")
-
-
-@script
-def _reshape_shape_func_input_data(data_shape, newshape, ndim, allowzero):
-    out = output_tensor((ndim,), "int64")
-    src_idx = 0
-    dst_idx = 0
-    infer_idx = -1
-    copy = False
-    skip = 0
-    for i in const_range(len(newshape)):
-        if skip > 0:
-            skip -= 1
-        elif newshape[i] > 0:
-            out[dst_idx] = int64(newshape[i])
-            src_idx += 1
-            dst_idx += 1
-        elif newshape[i] == 0:
-            if allowzero:
-                out[dst_idx] = int64(newshape[i])
-            else:
-                out[dst_idx] = data_shape[src_idx]
-            src_idx += 1
-            dst_idx += 1
-        elif newshape[i] == -1:
-            assert infer_idx < 0, "One and only one dim can be inferred"
-            out[dst_idx] = int64(1)
-            infer_idx = i
-            src_idx += 1
-            dst_idx += 1
-        elif newshape[i] == -2:
-            assert False, "Value -2 is not valid in newshape argument of dynamic reshape"
-        elif newshape[i] == -3:
-            assert data_shape.shape[0] - src_idx > 1, "Not enough dims in input shape for -3"
-            out[dst_idx] = data_shape[src_idx] * data_shape[src_idx + 1]
-            src_idx += 2
-            dst_idx += 1
-        elif newshape[i] == -4:
-            assert False, "Value -4 is not valid in newshape argument of dynamic reshape"
-        else:
-            assert False, "Invalid special values in new shape"
-    if len(data_shape.shape) > 0:
-        # if data is not constant, we can then handle -1 and -2
-        if copy:
-            for i in range(src_idx, data_shape.shape[0]):
-                out[dst_idx] = data_shape[i]
-                dst_idx += 1
-        if infer_idx >= 0:
-            old_size = int64(1)
-            for i in const_range(data_shape.shape[0]):
-                old_size *= data_shape[i]
-            new_size = int64(1)
-            for i in const_range(out.shape[0]):
-                new_size *= out[i]
-            out[infer_idx] = old_size // new_size
-    return out
-
-
-@_reg.register_shape_func("dyn.reshape", [False, True])
-def dynamic_reshape_shape_func(attrs, inputs, out_ndims):
-    allowzero = attrs.allowzero
-    return [_reshape_shape_func_input_data(*inputs, out_ndims[0], convert(allowzero))]
-
-
-@script
-def _expand_dims_shape_func_input_data(data, axis, ndims, num_newaxis):
-    out = output_tensor((ndims,), "int64")
-
-    for i in const_range(ndims):
-        if i < axis:
-            # We multiply by a check (i < len(data.shape)) to avoid
-            # a constant folding mechanism leading to an overflow
-            out[i] = int64(data.shape[i * (i < len(data.shape))])
-        elif i - num_newaxis < axis:
-            out[i] = int64(1)
-        else:
-            out[i] = int64(
-                # We can't use axis in indices as it is not constant but we can
-                # use negative indices (kind of, have to manually do it)
-                data.shape[
-                    (i - num_newaxis) * (i - num_newaxis >= 0)
-                    + (i - num_newaxis + len(data.shape)) * (i - num_newaxis < 0)
-                ]
-            )
-
-    return out
-
-
-@_reg.register_shape_func("dyn.expand_dims", [True, True])
-def dynamic_expand_dims_shape_func(attrs, inputs, out_ndims):
-    return [
-        _expand_dims_shape_func_input_data(
-            inputs[0],
-            inputs[1],
-            out_ndims[0],
-            convert(attrs.num_newaxis),
-        )
-    ]
-
-
-@script
-def _tile_shape_func(data, reps, ndim, tndim, rndim):
-    out = output_tensor((tndim,), "int64")
-
-    if ndim == rndim:
-        for i in const_range(tndim):
-            out[i] = int64(data.shape[i] * reps[i])
-    elif ndim > rndim:
-        ngap = ndim - rndim
-        for i in const_range(ndim):
-            if i < ngap:
-                out[i] = int64(data.shape[i])
-            else:
-                out[i] = int64(data.shape[i] * reps[i - ngap])
-    else:
-        rgap = rndim - ndim
-        for i in const_range(rndim):
-            if i < rgap:
-                out[i] = int64(reps[i])
-            else:
-                out[i] = int64(reps[i] * data.shape[i - rgap])
-    return out
-
-
-@_reg.register_shape_func("dyn.tile", True)
-def tile_shape_func(attrs, inputs, _):
-    """
-    Shape function for dyn.tile op.
-    """
-    reps = inputs[1]
-    ndim = len(inputs[0].shape)
-    rndim = inputs[1].shape[0].value
-    tndim = ndim if ndim > rndim else rndim
-    return [_tile_shape_func(inputs[0], reps, convert(ndim), convert(tndim), convert(rndim))]
-
-
-@script
-def _onehot_shape_func(dshape, k, axis):
-    ndim = len(dshape) + 1
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(axis):
-        out[i] = int64(dshape[i])
-    out[axis] = int64(k[(0)])
-    for j in const_range(axis + 1, ndim):
-        out[j] = int64(dshape[j - 1])
-    return out
-
-
-@_reg.register_shape_func("dyn.one_hot", True)
-def one_hot_shape_func(attrs, inputs, _):
-    """
-    Shape function for dyn.one_hot op.
-    """
-    axis = len(inputs[0].shape) if attrs.axis == -1 else attrs.axis
-    return [_onehot_shape_func(inputs[0].shape, inputs[3], convert(axis))]
-
-
-@script
-def _strided_slice_shape_func_input_data(data_shape, begin, end, strides, slice_mode):
-    ndim = len(data_shape)
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        dim_size = int64(data_shape[i])
-        cbegin = int64(0)
-        cend = dim_size
-        cstride = int64(1)
-
-        if strides.shape[0] > i:
-            cstride = int64(strides[i])
-
-        if begin.shape[0] > i:
-            cbegin = int64(begin[i])
-        elif cstride < 0:
-            cbegin = dim_size
-
-        if end.shape[0] <= i:
-            if cstride < 0:
-                cend = int64(0)
-        elif slice_mode != 0:
-            cstride = int64(1)
-            if end[i] < 0:
-                cend = dim_size
-            else:
-                cend = cbegin + int64(end[i])
-        else:
-            if end[i] > data_shape[i]:
-                cend = dim_size
-            else:
-                cend = int64(end[i])
-
-        assert cstride != 0, "Strides can't be zero."
-
-        if cbegin < 0:
-            cbegin += dim_size
-        if cend < 0:
-            cend += dim_size
-
-        if cstride < 0:
-            if cend < 0:
-                cend = int64(-1)
-            if cbegin > dim_size - 1:
-                cbegin = dim_size - 1
-            slice_range = cbegin - cend
-            step = -cstride
-        else:
-            slice_range = cend - cbegin
-            step = cstride
-        out[i] = int64(ceil_div(slice_range, step))
-    return out
-
-
-@_reg.register_shape_func("dyn.strided_slice", [False, True, True, True])
-def strided_slice_shape_func(attrs, inputs, _):
-    """
-    Shape func for strided_slice
-    """
-    slice_mode = convert(0 if attrs.slice_mode == "end" else 1)
-    return [_strided_slice_shape_func_input_data(*inputs, slice_mode)]
-
-
-@script
-def _sparse_to_dense_shape_func(output_shape, ndim):
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        out[i] = int64(output_shape[i])
-    return out
-
-
-@_reg.register_shape_func("dyn.sparse_to_dense", True)
-def sparse_to_dense_shape_func(attrs, inputs, out_ndims):
-    return [_sparse_to_dense_shape_func(inputs[3], out_ndims[0])]
-
-
-@script
-def _squeeze_shape_func_input_data(data, axis, ndims):
-    out = output_tensor((ndims,), "int64")
-    out_i = 0
-    for i in const_range(data.shape[0]):
-        not_in_axis = True
-        for j in const_range(axis.shape[0]):
-            if i == axis[j]:
-                not_in_axis = False
-        if not_in_axis:
-            out[out_i] = int64(data[i])
-            out_i += 1
-
-    return out
-
-
-@_reg.register_shape_func("dyn.squeeze", [False, True])
-def dynamic_squeeze_shape_func(attrs, inputs, out_ndims):
-    return [_squeeze_shape_func_input_data(inputs[0], inputs[1], out_ndims[0])]
diff --git a/python/tvm/relay/op/dyn/image/__init__.py b/python/tvm/relay/op/dyn/image/__init__.py
deleted file mode 100644
index 270421a9a409..000000000000
--- a/python/tvm/relay/op/dyn/image/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name
-"""The Relay namespace containing dynamic image ops."""
-
-from . import _image
diff --git a/python/tvm/relay/op/dyn/image/_image.py b/python/tvm/relay/op/dyn/image/_image.py
deleted file mode 100644
index faebde02b2ca..000000000000
--- a/python/tvm/relay/op/dyn/image/_image.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Backend compiler related feature registration"""
-from __future__ import absolute_import
-
-import tvm.topi
-from tvm.runtime import convert
-from tvm.te.hybrid import script
-from tvm.topi.utils import nchw_pack_layout, nchw_xc_layout
-from ... import op as reg
-
-
-# resize
-@reg.register_compute("dyn.image.resize2d")
-def compute_resize2d(attrs, inputs, out_type):
-    """
-    Compute function calls into topi
-    """
-    layout = attrs.layout
-    method = attrs.method
-    coord_trans = attrs.coordinate_transformation_mode
-    rounding_method = attrs.rounding_method
-    cubic_alpha = attrs.cubic_alpha
-    cubic_exclude = attrs.cubic_exclude
-    extrapolation_value = attrs.extrapolation_value
-    out_dtype = attrs.out_dtype
-    return [
-        tvm.topi.image.resize2d(
-            inputs[0],
-            inputs[2],
-            inputs[1],
-            layout,
-            method,
-            coord_trans,
-            rounding_method,
-            cubic_alpha,
-            cubic_exclude,
-            extrapolation_value,
-            out_dtype,
-            out_type.shape,
-        )
-    ]
-
-
-reg.register_injective_schedule("dyn.image.resize2d")
-
-
-@script
-def _resize2d_shape_func(dshape, size, ndim, height_axis, width_axis):
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        out[i] = int64(dshape[i])
-    out[height_axis] = int64(size[0])
-    out[width_axis] = int64(size[1])
-    return out
-
-
-@reg.register_shape_func("dyn.image.resize2d", True)
-def resize2d_shape_func(attrs, inputs, _):
-    """
-    Shape function for dyn.image.resize op.
-    """
-    layout = attrs.layout
-    if nchw_pack_layout(layout) or nchw_xc_layout(layout):
-        out = [
-            _resize2d_shape_func(
-                inputs[0].shape, inputs[1], convert(len(inputs[0].shape)), convert(2), convert(3)
-            )
-        ]
-    else:
-        height_axis = width_axis = 1
-        for i, letter in enumerate(layout):
-            if letter == "H":
-                height_axis = i
-            if letter == "W":
-                width_axis = i
-        out = [
-            _resize2d_shape_func(
-                inputs[0].shape,
-                inputs[1],
-                convert(len(inputs[0].shape)),
-                convert(height_axis),
-                convert(width_axis),
-            )
-        ]
-    return out
diff --git a/python/tvm/relay/op/dyn/image/_make.py b/python/tvm/relay/op/dyn/image/_make.py
deleted file mode 100644
index 69830aeefd62..000000000000
--- a/python/tvm/relay/op/dyn/image/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.dyn.image._make", __name__)
diff --git a/python/tvm/relay/op/dyn/nn/__init__.py b/python/tvm/relay/op/dyn/nn/__init__.py
deleted file mode 100644
index 01a3a1bc0679..000000000000
--- a/python/tvm/relay/op/dyn/nn/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name
-"""The Relay namespace containing dynamic ops."""
-
-from . import _nn
diff --git a/python/tvm/relay/op/dyn/nn/_make.py b/python/tvm/relay/op/dyn/nn/_make.py
deleted file mode 100644
index 280fe72315ad..000000000000
--- a/python/tvm/relay/op/dyn/nn/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.dyn.nn._make", __name__)
diff --git a/python/tvm/relay/op/dyn/nn/_nn.py b/python/tvm/relay/op/dyn/nn/_nn.py
deleted file mode 100644
index ec4066561fce..000000000000
--- a/python/tvm/relay/op/dyn/nn/_nn.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, invalid-name, unused-argument, too-many-arguments, consider-using-in
-"""Backend compiler related feature registration for dynamic relay ops in nn namespace"""
-
-from __future__ import absolute_import
-
-from tvm import topi
-
-from tvm.runtime import convert
-from tvm.te.hybrid import script
-from ...op import register_shape_func, register_compute
-from ...op import register_injective_schedule, register_broadcast_schedule
-
-# upsampling
-@register_compute("dyn.nn.upsampling")
-def compute_upsampling(attrs, inputs, out_dtype):
-    data = inputs[0]
-    scale_h = inputs[1]
-    scale_w = inputs[2]
-    layout = attrs.layout
-    method = attrs.method
-    align_corners = attrs.align_corners
-    return [
-        topi.nn.upsampling(data, scale_h, scale_w, layout, method, align_corners, out_dtype.shape)
-    ]
-
-
-# upsampling3d
-@register_compute("dyn.nn.upsampling3d")
-def compute_upsampling3d(attrs, inputs, out_dtype):
-    data = inputs[0]
-    scale_d = inputs[1]
-    scale_h = inputs[2]
-    scale_w = inputs[3]
-    layout = attrs.layout
-    method = attrs.method
-    coordinate_transformation_mode = attrs.coordinate_transformation_mode
-    return [
-        topi.nn.upsampling3d(
-            data,
-            scale_d,
-            scale_h,
-            scale_w,
-            layout,
-            method,
-            coordinate_transformation_mode,
-            out_dtype.shape,
-        )
-    ]
-
-
-register_injective_schedule("dyn.nn.upsampling")
-register_injective_schedule("dyn.nn.upsampling3d")
-register_broadcast_schedule("dyn.nn.pad")
-
-#####################
-#  Shape functions  #
-#####################
-
-# upsampling
-@script
-def _upsampling_shape_func(dshape, scale_h, scale_w, height_axis, width_axis):
-    out = output_tensor((4,), "int64")
-    for i in const_range(4):
-        out[i] = int64(dshape[i])
-    out[height_axis] = int64(round(dshape[height_axis] * scale_h[()]))
-    out[width_axis] = int64(round(dshape[width_axis] * scale_w[()]))
-    return out
-
-
-@register_shape_func("dyn.nn.upsampling", True)
-def upsampling_shape_func(attrs, inputs, _):
-    """Shape function for upsampling. Supports NCHW and NHWC layouts."""
-    layout = attrs.layout
-    height_axis = width_axis = 1
-    for i, letter in enumerate(layout):
-        if letter == "H":
-            height_axis = i
-        if letter == "W":
-            width_axis = i
-    return [
-        _upsampling_shape_func(
-            inputs[0].shape, inputs[1], inputs[2], convert(height_axis), convert(width_axis)
-        )
-    ]
-
-
-# upsampling3d
-@script
-def _upsampling3d_shape_func(
-    dshape, scale_d, scale_h, scale_w, depth_axis, height_axis, width_axis
-):
-    out = output_tensor((5,), "int64")
-    for i in const_range(5):
-        out[i] = int64(dshape[i])
-    out[depth_axis] = int64(round(dshape[depth_axis] * scale_d[()]))
-    out[height_axis] = int64(round(dshape[height_axis] * scale_h[()]))
-    out[width_axis] = int64(round(dshape[width_axis] * scale_w[()]))
-    return out
-
-
-@register_shape_func("dyn.nn.upsampling3d", True)
-def upsampling3d_shape_func(attrs, inputs, _):
-    """Shape function for upsampling. Supports NCHW and NHWC layouts."""
-    layout = attrs.layout
-    depth_axis = height_axis = width_axis = 1
-    for i, letter in enumerate(layout):
-        if letter == "D":
-            depth_axis = i
-        if letter == "H":
-            height_axis = i
-        if letter == "W":
-            width_axis = i
-    return [
-        _upsampling3d_shape_func(
-            inputs[0].shape,
-            inputs[1],
-            inputs[2],
-            inputs[3],
-            convert(depth_axis),
-            convert(height_axis),
-            convert(width_axis),
-        )
-    ]
-
-
-# pad
-@script
-def _dyn_pad_shape_func(data, pad_width):
-    ndim = len(data.shape)
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        out[i] = int64(pad_width[i, 0] + pad_width[i, 1] + data.shape[i])
-    return out
-
-
-@register_shape_func("dyn.nn.pad", True)
-def pad_shape_func(attrs, inputs, data):
-    """
-    Shape function for dynamic pad op.
-    """
-    return [_dyn_pad_shape_func(inputs[0], inputs[1])]
diff --git a/python/tvm/relay/op/image/__init__.py b/python/tvm/relay/op/image/__init__.py
deleted file mode 100644
index 146b449fb04c..000000000000
--- a/python/tvm/relay/op/image/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Image network related operators."""
-from .image import *
-from . import _image
diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py
deleted file mode 100644
index f46a04bd0592..000000000000
--- a/python/tvm/relay/op/image/_image.py
+++ /dev/null
@@ -1,411 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Backend compiler related feature registration"""
-from __future__ import absolute_import
-
-from tvm.te.hybrid import script
-from tvm.runtime import convert
-
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-from .. import op as reg
-from .. import strategy
-from ..op import OpPattern
-from .image import resize2d
-
-
-# resize
-@reg.register_compute("image.resize1d")
-def compute_resize1d(attrs, inputs, out_type):
-    """compute definition for resize1d op"""
-    size = attrs.size
-    roi = attrs.roi
-    layout = attrs.layout
-    method = attrs.method
-    coord_trans = attrs.coordinate_transformation_mode
-    rounding_method = attrs.rounding_method
-    cubic_alpha = attrs.cubic_alpha
-    cubic_exclude = attrs.cubic_exclude
-    extrapolation_value = attrs.extrapolation_value
-    out_dtype = attrs.out_dtype
-    return [
-        topi.image.resize1d(
-            inputs[0],
-            roi,
-            size,
-            layout,
-            method,
-            coord_trans,
-            rounding_method,
-            cubic_alpha,
-            cubic_exclude,
-            extrapolation_value,
-            out_dtype,
-        )
-    ]
-
-
-reg.register_injective_schedule("image.resize1d")
-
-
-@reg.register_convert_op_layout("image.resize1d")
-def convert_image_resize1d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for image resize1d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current resize op
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data input.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-
-    new_attrs = dict(attrs)
-    assert len(desired_layouts) == 1, "Only one desired layout is expected"
-    desired_layout = str(desired_layouts[0])
-    assert desired_layout != "default", "Layout cannot be default"
-    new_attrs["layout"] = desired_layout
-    return resize1d(*inputs, **new_attrs)
-
-
-@script
-def _resize1d_shape_func(image_shape, size, batch_axis, width_axis, channel_axis):
-    out = output_tensor((3,), "int64")
-    out[batch_axis] = int64(image_shape[0])
-    out[width_axis] = int64(size[1])
-    out[channel_axis] = image_shape[channel_axis]
-    return out
-
-
-@reg.register_shape_func("image.resize1d", False)
-def resize1d_shape_func(attrs, inputs, _):
-    """
-    Shape function for resize2d op.
-    """
-    layout = attrs.layout
-    width_axis = channel_axis = 1
-    for i, letter in enumerate(layout):
-        if letter == "N":
-            batch_axis = i
-        if letter == "W":
-            width_axis = i
-        if letter == "C":
-            channel_axis = i
-    size = get_const_tuple(attrs.size)
-    return [
-        _resize1d_shape_func(
-            inputs[0],
-            convert(size),
-            convert(batch_axis),
-            convert(width_axis),
-            convert(channel_axis),
-        )
-    ]
-
-
-@reg.register_compute("image.resize2d")
-def compute_resize2d(attrs, inputs, out_type):
-    """compute definition for resize2d op"""
-    size = attrs.size
-    roi = attrs.roi
-    layout = attrs.layout
-    method = attrs.method
-    coord_trans = attrs.coordinate_transformation_mode
-    rounding_method = attrs.rounding_method
-    cubic_alpha = attrs.cubic_alpha
-    cubic_exclude = attrs.cubic_exclude
-    extrapolation_value = attrs.extrapolation_value
-    out_dtype = attrs.out_dtype
-    return [
-        topi.image.resize2d(
-            inputs[0],
-            roi,
-            size,
-            layout,
-            method,
-            coord_trans,
-            rounding_method,
-            cubic_alpha,
-            cubic_exclude,
-            extrapolation_value,
-            out_dtype,
-        )
-    ]
-
-
-reg.register_injective_schedule("image.resize2d")
-
-
-@reg.register_convert_op_layout("image.resize2d")
-def convert_image_resize2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for image resize2d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current resize op
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data input.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-
-    new_attrs = dict(attrs)
-    assert len(desired_layouts) == 1, "Only one desired layout is expected"
-    desired_layout = str(desired_layouts[0])
-    assert desired_layout != "default", "Layout cannot be default"
-    new_attrs["layout"] = desired_layout
-    return resize2d(*inputs, **new_attrs)
-
-
-@script
-def _resize2d_shape_func(image_shape, size, batch_axis, height_axis, width_axis, channel_axis):
-    out = output_tensor((4,), "int64")
-    out[batch_axis] = int64(image_shape[0])
-    out[height_axis] = int64(size[0])
-    out[width_axis] = int64(size[1])
-    out[channel_axis] = image_shape[channel_axis]
-    return out
-
-
-@reg.register_shape_func("image.resize2d", False)
-def resize2d_shape_func(attrs, inputs, _):
-    """
-    Shape function for resize2d op.
-    """
-    layout = attrs.layout
-    height_axis = width_axis = channel_axis = 1
-    for i, letter in enumerate(layout):
-        if letter == "N":
-            batch_axis = i
-        if letter == "H":
-            height_axis = i
-        if letter == "W":
-            width_axis = i
-        if letter == "C":
-            channel_axis = i
-    size = get_const_tuple(attrs.size)
-    return [
-        _resize2d_shape_func(
-            inputs[0],
-            convert(size),
-            convert(batch_axis),
-            convert(height_axis),
-            convert(width_axis),
-            convert(channel_axis),
-        )
-    ]
-
-
-@reg.register_compute("image.resize3d")
-def compute_resize3d(attrs, inputs, out_type):
-    """compute definition for resize3d op"""
-    size = attrs.size
-    roi = attrs.roi
-    layout = attrs.layout
-    method = attrs.method
-    coord_trans = attrs.coordinate_transformation_mode
-    rounding_method = attrs.rounding_method
-    cubic_alpha = attrs.cubic_alpha
-    cubic_exclude = attrs.cubic_exclude
-    extrapolation_value = attrs.extrapolation_value
-    out_dtype = attrs.out_dtype
-    return [
-        topi.image.resize3d(
-            inputs[0],
-            roi,
-            size,
-            layout,
-            method,
-            coord_trans,
-            rounding_method,
-            cubic_alpha,
-            cubic_exclude,
-            extrapolation_value,
-            out_dtype,
-        )
-    ]
-
-
-reg.register_injective_schedule("image.resize3d")
-
-
-# crop and resize
-@reg.register_compute("image.crop_and_resize")
-def compute_crop_and_resize(attrs, inputs, out_type):
-    crop_size = attrs.crop_size
-    layout = attrs.layout
-    method = attrs.method
-    extrapolation_value = attrs.extrapolation_value
-    out_dtype = attrs.out_dtype
-    return [
-        topi.image.crop_and_resize(
-            inputs[0],
-            inputs[1],
-            inputs[2],
-            crop_size,
-            layout,
-            method,
-            extrapolation_value,
-            out_dtype,
-        )
-    ]
-
-
-reg.register_injective_schedule("image.crop_and_resize")
-
-
-@script
-def _crop_and_resize_func(
-    image_shape, boxes_shape, crop_size, height_axis, width_axis, channel_axis
-):
-    out = output_tensor((4,), "int64")
-    out[0] = boxes_shape[0]
-    out[height_axis] = int64(crop_size[0])
-    out[width_axis] = int64(crop_size[1])
-    out[channel_axis] = image_shape[channel_axis]
-    return out
-
-
-@reg.register_shape_func("image.crop_and_resize", False)
-def crop_and_resize_func(attrs, inputs, _):
-    """
-    Shape function for crop_and_resize op.
-    """
-    layout = attrs.layout
-    height_axis = width_axis = channel_axis = 1
-    for i, letter in enumerate(layout):
-        if letter == "H":
-            height_axis = i
-        if letter == "W":
-            width_axis = i
-        if letter == "C":
-            channel_axis = i
-    crop_size = get_const_tuple(attrs.crop_size)
-    return [
-        _crop_and_resize_func(
-            inputs[0],
-            inputs[1],
-            convert(crop_size),
-            convert(height_axis),
-            convert(width_axis),
-            convert(channel_axis),
-        )
-    ]
-
-
-# dilation2d
-reg.register_strategy("image.dilation2d", strategy.dilation2d_strategy)
-reg.register_pattern("image.dilation2d", OpPattern.OUT_ELEMWISE_FUSABLE)
-
-
-# affine_grid
-@reg.register_compute("image.affine_grid")
-def compute_affine_grid(attrs, inputs, out_dtype):
-    target_shape = get_const_tuple(attrs.target_shape)
-    return [topi.image.affine_grid(inputs[0], target_shape)]
-
-
-reg.register_injective_schedule("image.affine_grid")
-
-
-@script
-def _affine_grid_func(data, target_shape):
-    out = output_tensor((4,), "int64")
-    out[0] = int64(data[0])
-    out[1] = int64(2)
-    out[2] = int64(target_shape[0])
-    out[3] = int64(target_shape[1])
-    return out
-
-
-@reg.register_shape_func("image.affine_grid", False)
-def affine_grid_func(attrs, inputs, _):
-    """
-    Shape function for affine_grid op.
-    """
-    target_shape = get_const_tuple(attrs.target_shape)
-    return [_affine_grid_func(inputs[0], convert(target_shape))]
-
-
-# grid_sample
-@reg.register_compute("image.grid_sample")
-def compute_grid_sample(attrs, inputs, out_dtype):
-    method = attrs.method
-    layout = attrs.layout
-    padding_mode = attrs.padding_mode
-    align_corners = attrs.align_corners
-    return [
-        topi.image.grid_sample(inputs[0], inputs[1], method, layout, padding_mode, align_corners)
-    ]
-
-
-reg.register_injective_schedule("image.grid_sample")
-
-
-@script
-def _grid_sample_func_nchw(data, grid):
-    out = output_tensor((4,), "int64")
-    out[0] = int64(data[0])
-    out[1] = int64(data[1])
-    out[2] = int64(grid[2])
-    out[3] = int64(grid[3])
-    return out
-
-
-@script
-def _grid_sample_func_ncdhw(data, grid):
-    out = output_tensor((5,), "int64")
-    out[0] = int64(data[0])
-    out[1] = int64(data[1])
-    out[2] = int64(grid[2])
-    out[3] = int64(grid[3])
-    out[4] = int64(grid[4])
-    return out
-
-
-@reg.register_shape_func("image.grid_sample", False)
-def grid_sample_func(attrs, inputs, _):
-    """
-    Shape function for grid_sample op.
-    """
-    if attrs.layout == "NCHW":
-        script_func = _grid_sample_func_nchw
-    elif attrs.layout == "NCDHW":
-        script_func = _grid_sample_func_ncdhw
-    else:
-        msg = f"layout {attrs.layout} is not supported"
-        raise ValueError(msg)
-    return [script_func(inputs[0], inputs[1])]
diff --git a/python/tvm/relay/op/image/_make.py b/python/tvm/relay/op/image/_make.py
deleted file mode 100644
index 1d5e02848a46..000000000000
--- a/python/tvm/relay/op/image/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.image._make", __name__)
diff --git a/python/tvm/relay/op/image/image.py b/python/tvm/relay/op/image/image.py
deleted file mode 100644
index 5a17532dd018..000000000000
--- a/python/tvm/relay/op/image/image.py
+++ /dev/null
@@ -1,523 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Image operations."""
-from . import _make
-from ..dyn.image import _make as _dyn_make
-from ...expr import Expr, Constant, const
-
-
-def resize1d(
-    data,
-    size,
-    roi=None,
-    layout="NCW",
-    method="linear",
-    coordinate_transformation_mode="half_pixel",
-    rounding_method="",
-    cubic_alpha=-0.5,
-    cubic_exclude=0,
-    extrapolation_value=0.0,
-    out_dtype=None,
-):
-    """Image resize1d operator.
-
-    This operator takes data as input and does 1D scaling to the given scale factor.
-    In the default case, where the data_layout is `NCW`
-    with data of shape (n, c, w)
-    out will have a shape (n, c, size[0])
-
-    method indicates the algorithm to be used while calculating the out value
-    and method can be one of ("linear", "nearest_neighbor", "cubic")
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    size: Tuple of Int or Expr
-        The out size to which the image will be resized.
-
-    roi: Tuple of Float or Expr, optional
-        The region of interest for cropping the input image. Expected to be of
-        size 2, and format [start_w, end_w].
-        Only used if coordinate_transformation_mode is tf_crop_and_resize.
-
-    layout : str, optional
-        Layout of the input.
-
-    method : str, optional
-        Scale method to used [nearest_neighbor, linear, cubic].
-
-    coordinate_transformation_mode : string, optional
-        Describes how to transform the coordinate in the resized tensor
-        to the coordinate in the original tensor. Defintions can be found
-        in topi/image/resize.py.
-        [half_pixel, align_corners, asymmetric, pytorch_half_pixel,
-        tf_half_pixel_for_nn, and tf_crop_and_resize].
-
-    rounding_method: string, optional
-        indicates how to find the "nearest" pixel in nearest_neighbor method
-        [round, floor, ceil]
-
-    cubic_alpha: float
-        Spline Coefficient for cubic interpolation
-
-    cubic_exclude: int
-        Flag to exclude exterior of the image during cubic interpolation
-
-    extrapolation_value: float
-        Fill value to use when roi is outside of the image
-
-    out_dtype : str, optional
-        Type to return. If left None returns the same type as input.
-
-    Returns
-    -------
-    result: relay.Expr
-        The resized result.
-    """
-    if roi is None:
-        roi = [0.0] * 2
-    if isinstance(size, Constant):
-        size = list(size.data.numpy().astype("int32"))
-    if isinstance(roi, Constant):
-        roi = list(roi.data.numpy().astype("int32"))
-    if isinstance(size, Expr) or isinstance(roi, Expr):
-        raise NotImplementedError(
-            "dyn.resize1d is not yet implemented, got size", size, "and roi", roi
-        )
-    return _make.resize1d(
-        data,
-        size,
-        roi,
-        layout,
-        method,
-        coordinate_transformation_mode,
-        rounding_method,
-        cubic_alpha,
-        cubic_exclude,
-        extrapolation_value,
-        out_dtype,
-    )
-
-
-def resize2d(
-    data,
-    size,
-    roi=None,
-    layout="NCHW",
-    method="linear",
-    coordinate_transformation_mode="half_pixel",
-    rounding_method="",
-    cubic_alpha=-0.5,
-    cubic_exclude=0,
-    extrapolation_value=0.0,
-    out_dtype=None,
-):
-    """Image resize2d operator.
-
-    This operator takes data as input and does 2D scaling to the given scale factor.
-    In the default case, where the data_layout is `NCHW`
-    with data of shape (n, c, h, w)
-    out will have a shape (n, c, size[0], size[1])
-
-    method indicates the algorithm to be used while calculating the out value
-    and method can be one of ("linear", "nearest_neighbor", "cubic")
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    size: Tuple of Int or Expr
-        The out size to which the image will be resized.
-
-    roi: Tuple of Float or Expr, optional
-        The region of interest for cropping the input image. Expected to be of
-        size 4, and format [start_h, start_w, end_h, end_w].
-        Only used if coordinate_transformation_mode is tf_crop_and_resize.
-
-    layout : str, optional
-        Layout of the input.
-
-    method : str, optional
-        Scale method to used [nearest_neighbor, linear, cubic].
-
-    coordinate_transformation_mode : string, optional
-        Describes how to transform the coordinate in the resized tensor
-        to the coordinate in the original tensor. Defintions can be found
-        in topi/image/resize.py.
-        [half_pixel, align_corners, asymmetric, pytorch_half_pixel,
-        tf_half_pixel_for_nn, and tf_crop_and_resize].
-
-    rounding_method: string, optional
-        indicates how to find the "nearest" pixel in nearest_neighbor method
-        [round, floor, ceil]
-
-    cubic_alpha: float
-        Spline Coefficient for bicubic interpolation
-
-    cubic_exclude: int
-        Flag to exclude exterior of the image during bicubic interpolation
-
-    extrapolation_value: float
-        Fill value to use when roi is outside of the image
-
-    out_dtype : str, optional
-        Type to return. If left None returns the same type as input.
-
-    Returns
-    -------
-    result: relay.Expr
-        The resized result.
-    """
-    if roi is None:
-        roi = [0.0] * 4
-    if isinstance(size, Constant):
-        size = list(size.data.numpy().astype("int32"))
-    if isinstance(roi, Constant):
-        roi = list(roi.data.numpy().astype("float32"))
-    if isinstance(size, Expr) or isinstance(roi, Expr):
-        if not isinstance(size, Expr):
-            size = const(size, "int64")
-        if not isinstance(roi, Expr):
-            roi = const(roi, "float32")
-        return _dyn_make.resize2d(
-            data,
-            size,
-            roi,
-            layout,
-            method,
-            coordinate_transformation_mode,
-            rounding_method,
-            cubic_alpha,
-            cubic_exclude,
-            extrapolation_value,
-            out_dtype,
-        )
-    return _make.resize2d(
-        data,
-        size,
-        roi,
-        layout,
-        method,
-        coordinate_transformation_mode,
-        rounding_method,
-        cubic_alpha,
-        cubic_exclude,
-        extrapolation_value,
-        out_dtype,
-    )
-
-
-def resize3d(
-    data,
-    size,
-    roi=None,
-    layout="NCDHW",
-    method="linear",
-    coordinate_transformation_mode="half_pixel",
-    rounding_method="",
-    cubic_alpha=-0.5,
-    cubic_exclude=0,
-    extrapolation_value=0.0,
-    out_dtype=None,
-):
-    """Image resize3d operator.
-
-    This operator takes data as input and does 3D scaling to the given scale factor.
-    In the default case, where the data_layout is `NCDHW`
-    with data of shape `(n, c, d, h, w)`
-    out will have a shape `(n, c, size[0], size[1], size[2])`
-
-    method indicates the algorithm to be used while calculating the out value
-    and method can be one of ("linear", "nearest_neighbor", "cubic")
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    size: Tuple of Int or Expr
-        The out size to which the image will be resized.
-
-    roi: Tuple of Float or Expr, optional
-        The region of interest for cropping the input image. Expected to be of
-        size 6, and format [start_d, start_h, start_w, end_d, end_h, end_w].
-        Only used if coordinate_transformation_mode is tf_crop_and_resize.
-
-    layout : str, optional
-        Layout of the input.
-
-    method : str, optional
-        Scale method to used [nearest_neighbor, linear, cubic].
-
-    coordinate_transformation_mode : string, optional
-        Describes how to transform the coordinate in the resized tensor
-        to the coordinate in the original tensor. Defintions can be found
-        in topi/image/resize.py.
-        [half_pixel, align_corners, asymmetric, pytorch_half_pixel,
-        tf_half_pixel_for_nn, and tf_crop_and_resize].
-
-    rounding_method: string, optional
-        indicates how to find the "nearest" pixel in nearest_neighbor method
-        [round, floor, ceil]
-
-    cubic_alpha: float
-        Spline Coefficient for cubic interpolation
-
-    cubic_exclude: int
-        Flag to exclude exterior of the image during cubic interpolation
-
-    extrapolation_value: float
-        Fill value to use when roi is outside of the image
-
-    out_dtype : str, optional
-        Type to return. If left None returns the same type as input.
-
-    Returns
-    -------
-    result: relay.Expr
-        The resized result.
-    """
-    if roi is None:
-        roi = [0.0] * 6
-    if isinstance(size, Constant):
-        size = list(size.data.numpy().astype("int32"))
-    if isinstance(roi, Constant):
-        roi = list(roi.data.numpy().astype("int32"))
-    if isinstance(size, Expr) or isinstance(roi, Expr):
-        raise NotImplementedError(
-            "dyn.resize3d is not yet implemented, got size", size, "and roi", roi
-        )
-    return _make.resize3d(
-        data,
-        size,
-        roi,
-        layout,
-        method,
-        coordinate_transformation_mode,
-        rounding_method,
-        cubic_alpha,
-        cubic_exclude,
-        extrapolation_value,
-        out_dtype,
-    )
-
-
-def crop_and_resize(
-    data,
-    boxes,
-    box_indices,
-    crop_size,
-    layout,
-    method="bilinear",
-    extrapolation_value=0,
-    out_dtype=None,
-):
-    """Crop input images and resize them.
-
-    method indicates the algorithm to be used while calculating the out value
-    and method can be either "bilinear" or "nearest_neighbor".
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    boxes : relay.Expr
-        A 2-D tensor of shape [num_boxes, 4]. Each row of the tensor specifies
-        the coordinates of a box.
-
-    box_indices : relay.Expr
-        A 1-D tensor of shape [num_boxes], box_ind[i] specifies the data that
-        the i-th box refers to.
-
-    crop_size : Tuple of PrimExpr
-        The target size to which each box will be resized.
-
-    layout : str, optional
-        Layout of the input.
-
-    method : str, optional
-        Scale method, it can be either "nearest_neighbor" or "bilinear".
-
-    extrapolation_value : float, optional
-        Value used for extrapolation, when applicable.
-
-    out_dtype : str, optional
-        Type to return. If left None returns the same type as input.
-
-    Returns
-    -------
-    result: relay.Expr
-        The computed result.
-    """
-    return _make.crop_and_resize(
-        data, boxes, box_indices, crop_size, layout, method, extrapolation_value, out_dtype
-    )
-
-
-def dilation2d(
-    data,
-    weight,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilations=(1, 1),
-    data_layout="NCHW",
-    kernel_layout="IHW",
-    out_dtype="",
-):
-    r"""Morphological Dilation 2D.
-    This operator takes the weight as the dilation kernel and dilates it with
-    data to produce an output. In the default case, where the data_layout is `NCHW`
-    and kernel_layout is `OIHW`, dilation2d takes in a data Tensor with shape
-    `(batch_size, in_channels, height, width)`, and a weight Tensor with shape
-    `(channels, kernel_height, kernel_width)` to produce an output Tensor
-    with the following rule:
-
-    .. math::
-        \mbox{out}[b, c, y, x] = \max_{dy, dx}
-           \mbox{data}[b, c, \mbox{strides}[0] * y  + dy, \mbox{strides}[1] * x + dx] +
-           \mbox{weight}[c, dy, dx]
-
-    Padding and dilation are applied to data and weight respectively before the computation.
-    This operator accepts data layout specification. Semantically, the operator
-    will convert the layout to the canonical layout
-    (`NCHW` for data and `IHW` for weight) and perform the computation.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Optional[Tuple[int]]
-        The strides of convolution.
-
-    padding : Optional[Tuple[int]]
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilations : Optional[Tuple[int]]
-        Specifies the dilation rate to be used for dilated convolution.
-
-    data_layout : Optional[str]
-        Layout of the input.
-
-    kernel_layout : Optional[str]
-        Layout of the weight.
-
-    out_dtype : Optional[str]
-        Specifies the output data type.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _make.dilation2d(
-        data, weight, strides, padding, dilations, data_layout, kernel_layout, out_dtype
-    )
-
-
-def affine_grid(data, target_shape=None):
-    """affine_grid operator that generates 2D sampling grid.
-
-    This operation is described in https://arxiv.org/pdf/1506.02025.pdf. It generates a uniform
-    sampling grid within the target shape and normalizes it to [-1, 1]. The provided affine
-    transformation is then applied on the sampling grid.
-
-    Parameters
-    ----------
-    data : tvm.Tensor
-        3-D with shape [batch, 2, 3]. The affine matrix.
-
-    target_shape: list/tuple of two int
-        Specifies the output shape (H, W).
-
-    Returns
-    -------
-    Output : tvm.Tensor
-        4-D with shape [batch, 2, target_height, target_width]
-    """
-    return _make.affine_grid(data, target_shape)
-
-
-def grid_sample(
-    data, grid, method="bilinear", layout="NCHW", padding_mode="zeros", align_corners=True
-):
-    """Applies grid sampling to input feature map.
-
-    Given :math:`data` and :math:`grid`, then for 4-D the output is computed by
-
-    .. math::
-
-        x_{src} = grid[batch, 0, y_{dst}, x_{dst}] \\
-        y_{src} = grid[batch, 1, y_{dst}, x_{dst}] \\
-        output[batch, channel, y_{dst}, x_{dst}] = G(data[batch, channel, y_{src}, x_{src}])
-
-    :math:`x_{dst}`, :math:`y_{dst}` enumerate all spatial locations in :math:`output`, and
-    :math:`G()` denotes the interpolation function.
-
-    The out-boundary points will be padded with zeros if padding_mode is "zeros", or
-    border pixel value if padding_mode is "border", or
-    inner pixel value if padding_mode is "reflection".
-
-    The left-top corner (-1, -1) and right-bottom corner (1, 1) in grid will be map to
-    (0, 0) and (h - 1, w - 1) of data if align_corners is "True", or
-    (-0.5, -0.5) and (h - 0.5, w - 0.5) of data if align_corners is "False".
-
-    The shape of the output will be
-    4-D (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3]), or
-    5-D (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3], grid.shape[4]).
-
-    The operator assumes that :math:`grid` has been normalized to [-1, 1].
-
-    grid_sample often cooperates with affine_grid which generates sampling grids for grid_sample.
-
-    Parameters
-    ----------
-    data : tvm.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width], or
-        5-D with shape [batch, in_channel, in_depth, in_height, in_width]
-
-    grid : tvm.Tensor
-        4-D with shape [batch, 2, out_height, out_width], or
-        5-D with shape [batch, 3, out_depth, out_height, out_width]
-
-    method : str
-        The interpolation method, 4-D "nearest", "bilinear", "bicubic" and
-        5-D "nearest", "bilinear"("trilinear") are supported.
-
-    layout : str
-        The layout of input data and the output.
-
-    padding_mode : str
-        The padding mode for outside grid values, "zeros", "border", "reflection" are supported.
-
-    align_corners: bool
-        Geometrically, we consider the pixels of the input as squares rather than points.
-        If set to "True", the extrema ("-1" and "1") are considered as referring
-        to the center points of the input corner pixels. If set to "False", they
-        are instead considered as referring to the corner points of the input corner
-        pixels, making the sampling more resolution agnostic.
-
-    Returns
-    -------
-    Output : tvm.Tensor
-        4-D with shape [batch, in_channel, out_height, out_width], or
-        5-D with shape [batch, in_channel, out_depth, out_height, out_width]
-    """
-    return _make.grid_sample(data, grid, method, layout, padding_mode, align_corners)
diff --git a/python/tvm/relay/op/memory/__init__.py b/python/tvm/relay/op/memory/__init__.py
deleted file mode 100644
index f3f73554f444..000000000000
--- a/python/tvm/relay/op/memory/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Operators for manipulating low level memory."""
-from __future__ import absolute_import as _abs
-from .memory import *
diff --git a/python/tvm/relay/op/memory/_make.py b/python/tvm/relay/op/memory/_make.py
deleted file mode 100644
index 52a3777a3785..000000000000
--- a/python/tvm/relay/op/memory/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.memory._make", __name__)
diff --git a/python/tvm/relay/op/memory/memory.py b/python/tvm/relay/op/memory/memory.py
deleted file mode 100644
index 9dae23d5b65e..000000000000
--- a/python/tvm/relay/op/memory/memory.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return,invalid-name,len-as-condition,too-many-nested-blocks
-"""Operators for manipulating low-level memory."""
-from __future__ import absolute_import as _abs
-from . import _make
-
-
-def alloc_tensor(storage, offset, shape, dtype="float32", assert_shape=None):
-    """Allocate a tensor with the provided shape, and dtype.
-
-    Parameters
-    ----------
-    storage : tvm.relay.Expr
-        The storage to allocate from.
-
-    offset : tvm.relay.Expr
-        The offset to allocate from.
-
-    shape : tvm.relay.Expr
-        The shape of the tensor to allocate.
-
-    dtype: str
-        The dtype of the tensor.
-
-    assert_shape: Control the static shape when computed by dynamic shape expression.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The alloc_tensor expression.
-    """
-    return _make.alloc_tensor(storage, offset, shape, dtype, assert_shape)
-
-
-def alloc_storage(size, alignment, device, dtype_hint="float32"):
-    """Allocate a piece of tensor storage.
-
-    Parameters
-    ----------
-    size : tvm.relay.Expr
-        The size of the allocation.
-    alignment : tvm.relay.Expr
-        The alignment of the allocation.
-    device : tvm.runtime.Device
-        The device of the allocation.
-    dtype_hint : str
-        The dtype hint of the allocation.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The alloc_storage expression.
-    """
-    return _make.alloc_storage(size, alignment, device, dtype_hint)
-
-
-def flatten_tuple_type(ty):
-    """Return a sequence of the types contained in the tuple type in order.
-
-    Parameters
-    ----------
-    ty: tvm.Type
-        The type to flatten.
-
-    Returns
-    -------
-    result: List[tvm.Type]
-        The types in their linear order.
-    """
-    return _make.FlattenTupleType(ty)
-
-
-def from_tuple_type(ty, expr):
-    """Convert an expression with the given type into a sequence of expressions.
-       Each expression maps to a field of the tuple or nested tuples in linear
-       order.
-
-    Parameters
-    ----------
-    ty: tvm.Type
-        The type to unpack.
-
-    expr: tvm.relay.Expr
-        The expression from which to extract each sub-field.
-
-    Returns
-    -------
-    result: List[tvm.relay.Expr]
-        The list of sub-expressions.
-    """
-    return _make.FromTupleType(ty, expr)
-
-
-def to_tuple_type(ty, exprs):
-    """Pack the sequence of expressions into the nested tuple type.
-
-    Parameters
-    ----------
-    ty: tvm.Type
-        The type to pack with.
-
-    exprs: tvm.relay.Expr
-        The expressions to pack back into the nested tuple type.
-
-    Returns
-    -------
-    result: List[tvm.relay.Expr]
-        The packed tuple expression.
-    """
-    return _make.ToTupleType(ty, exprs)
diff --git a/python/tvm/relay/op/nn/__init__.py b/python/tvm/relay/op/nn/__init__.py
deleted file mode 100644
index ebabbbcd9d3a..000000000000
--- a/python/tvm/relay/op/nn/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Neural network related operators."""
-from __future__ import absolute_import as _abs
-from .nn import *
-from . import _nn
diff --git a/python/tvm/relay/op/nn/_make.py b/python/tvm/relay/op/nn/_make.py
deleted file mode 100644
index 15ae43b35cb0..000000000000
--- a/python/tvm/relay/op/nn/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.nn._make", __name__)
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
deleted file mode 100644
index a03907f071fd..000000000000
--- a/python/tvm/relay/op/nn/_nn.py
+++ /dev/null
@@ -1,1559 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, invalid-name, unused-argument, too-many-arguments, consider-using-in
-"""Backend compiler related feature registration"""
-from __future__ import absolute_import
-import re
-
-from tvm import relay, topi
-from tvm.runtime import convert
-from tvm.te.hybrid import script
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-
-from ....ir import container
-from ....tir import expr
-from ...transform import LayoutConfig
-from .. import op as reg
-from .. import strategy
-from .._tensor import elemwise_shape_func
-from ..strategy.generic import is_depthwise_conv2d
-
-# relu
-reg.register_broadcast_schedule("nn.relu")
-
-# softmax
-reg.register_strategy("nn.softmax", strategy.softmax_strategy)
-
-
-# fast softmax
-reg.register_strategy("nn.fast_softmax", strategy.fast_softmax_strategy)
-
-
-# log_softmax
-reg.register_strategy("nn.log_softmax", strategy.log_softmax_strategy)
-
-
-@reg.register_legalize("nn.matmul")
-def legalize_matmul(attrs, inputs, types):
-    """Legalize matmul op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current matmul
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.matmul_legalize(attrs, inputs, types)
-
-
-# matmul
-reg.register_strategy("nn.matmul", strategy.matmul_strategy)
-
-
-@reg.register_legalize("nn.dense")
-def legalize_dense(attrs, inputs, types):
-    """Legalize dense op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.dense_legalize(attrs, inputs, types)
-
-
-# dense
-reg.register_strategy("nn.dense", strategy.dense_strategy)
-
-
-@reg.register_alter_op_layout("nn.dense")
-def alter_op_layout_dense(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of dense"""
-    return topi.nn.dense_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-# dense_pack
-reg.register_strategy("nn.contrib_dense_pack", strategy.dense_pack_strategy)
-
-
-# fifo_buffer
-@reg.register_compute("nn.fifo_buffer")
-def compute_fifo_buffer(attrs, inputs, out_type):
-    return [topi.nn.fifo_buffer(inputs[0], inputs[1], axis=attrs.get_int("axis"))]
-
-
-reg.register_injective_schedule("nn.fifo_buffer")
-
-
-@reg.register_legalize("nn.batch_matmul")
-def legalize_batch_matmul(attrs, inputs, types):
-    """Legalize batch_matmul op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.batch_matmul_legalize(attrs, inputs, types)
-
-
-# batch_matmul
-reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy)
-
-
-# batch_norm
-reg.register_strategy("nn.batch_norm", strategy.batch_norm_strategy)
-
-
-# sparse_dense
-@reg.register_compute("nn.sparse_dense")
-def compute_sparse_dense(attrs, inputs, out_type):
-    """Compute definition of sparse_dense"""
-    return [topi.nn.sparse_dense(inputs[0], inputs[1], inputs[2], inputs[3], attrs["sparse_lhs"])]
-
-
-reg.register_strategy("nn.sparse_dense", strategy.sparse_dense_strategy)
-
-
-@reg.register_alter_op_layout("nn.sparse_dense")
-def alter_op_layout_sparse_dense(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of sparse_dense"""
-    return topi.nn.sparse_dense_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-# sparse_add
-reg.register_strategy("nn.sparse_add", strategy.sparse_add_strategy)
-
-
-@reg.register_compute("nn.internal.sparse_dense_padded")
-def compute_sparse_dense_padded(attrs, inputs, out_type):
-    """Compute definition of sparse_dense_padded"""
-    raise NotImplementedError("nn.internal.sparse_dense_padded is only available on cuda")
-
-
-reg.register_strategy("nn.internal.sparse_dense_padded", strategy.sparse_dense_padded_strategy)
-
-
-# sparse_transpose
-@reg.register_compute("nn.sparse_transpose")
-def compute_sparse_transpose(attrs, inputs, out_type):
-    """Compute definition of sparse_transpose"""
-    return topi.nn.sparse_transpose(inputs[0], inputs[1], inputs[2])
-
-
-reg.register_schedule("nn.sparse_transpose", strategy.schedule_sparse_transpose)
-
-
-# sparse_conv2d
-@reg.register_compute("nn.sparse_conv2d")
-def compute_sparse_conv2d(attrs, inputs, out_type):
-    """Compute definition of sparse_conv2d"""
-    return [
-        topi.nn.sparse_conv2d(
-            inputs[0], inputs[1], inputs[2], inputs[3], attrs["layout"], attrs["kernel_size"]
-        )
-    ]
-
-
-reg.register_strategy("nn.sparse_conv2d", strategy.sparse_conv2d_strategy)
-
-
-# conv1d
-reg.register_strategy("nn.conv1d", strategy.conv1d_strategy)
-
-
-# conv2d
-reg.register_strategy("nn.conv2d", strategy.conv2d_strategy)
-
-
-@reg.register_alter_op_layout("nn.conv2d")
-def alter_op_layout_conv2d(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of conv2d"""
-    return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-@reg.register_legalize("nn.conv2d")
-def legalize_conv2d(attrs, inputs, types):
-    """Legalize conv2d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.conv2d_legalize(attrs, inputs, types)
-
-
-@reg.register_convert_op_layout("nn.conv2d")
-def convert_conv2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for conv2d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    data, weight = inputs
-
-    # First check if there is a LayoutConfig scope, and if so, whether
-    # it indicates we should ignore this layer or not.
-    layout_config = LayoutConfig.current
-    if layout_config is not None:
-        skip_layer = layout_config.check_skip()
-        if skip_layer:
-            return relay.nn.conv2d(data, weight, **attrs)
-
-    # Prepare new layout.
-    new_attrs = dict(attrs)
-    assert len(desired_layouts) == 2, "A desired layout is expected for both of nn.conv2d's inputs"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    new_attrs["data_layout"] = desired_data_layout
-    need_tile = re.match(r"NCHW(\d*)c", desired_data_layout)
-
-    if desired_kernel_layout != "default" and not need_tile:
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    # Handle default kernel layouts
-    if desired_data_layout == "NCHW":
-        new_attrs["kernel_layout"] = "OIHW"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-    elif desired_data_layout == "NHWC":
-        # Check for depthwise convolution.
-        data_info, weight_info = tinfos
-        if is_depthwise_conv2d(
-            data_info.shape,
-            attrs["data_layout"],
-            weight_info.shape,
-            attrs["kernel_layout"],
-            attrs["groups"],
-        ):
-            new_attrs["kernel_layout"] = "HWOI"
-        else:
-            new_attrs["kernel_layout"] = "HWIO"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-    elif desired_data_layout == "HWNC":
-        new_attrs["kernel_layout"] = "HWOI"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-    elif need_tile:
-        assert desired_kernel_layout != "default", "Kernel layout cannot be default."
-        tile = int(need_tile.group(1))
-        if isinstance(data, relay.expr.Var) and data.checked_type.shape[1] % tile != 0:
-            return relay.nn.conv2d(data, weight, **attrs)
-        else:
-            new_attrs["kernel_layout"] = desired_kernel_layout
-            return relay.nn.contrib_conv2d_nchwc(data, weight, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported.")
-
-
-# conv2d_transpose
-reg.register_strategy("nn.conv2d_transpose", strategy.conv2d_transpose_strategy)
-
-
-@reg.register_legalize("nn.conv2d_transpose")
-def legalize_conv2d_transpose(attrs, inputs, types):
-    """Legalize conv2d_transpose op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current Transposed convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.conv2d_transpose_legalize(attrs, inputs, types)
-
-
-@reg.register_alter_op_layout("nn.conv2d_transpose")
-def alter_op_layout_conv2d_transpose(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of conv2d_transpose"""
-    return topi.nn.conv2d_transpose_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-@reg.register_convert_op_layout("nn.conv2d_transpose")
-def convert_conv2d_transpose(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for conv2d_transpose op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    data, weight = inputs
-    new_attrs = dict(attrs)
-    assert len(desired_layouts) == 2, "A desired layout is expected for both of nn.conv2d's inputs"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    new_attrs["data_layout"] = desired_data_layout
-
-    if desired_kernel_layout != "default":
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.nn.conv2d_transpose(data, weight, **new_attrs)
-
-    # Handle default kernel layouts
-    if desired_data_layout == "NCHW":
-        new_attrs["kernel_layout"] = "IOHW"
-        return relay.nn.conv2d_transpose(data, weight, **new_attrs)
-    elif desired_data_layout == "NHWC":
-        new_attrs["kernel_layout"] = "HWIO"
-        return relay.nn.conv2d_transpose(data, weight, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported.")
-
-
-# conv3d_transpose
-reg.register_strategy("nn.conv3d_transpose", strategy.conv3d_transpose_strategy)
-
-
-@reg.register_convert_op_layout("nn.conv3d_transpose")
-def convert_conv3d_transpose(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for conv3d_transpose op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    data, weight = inputs
-    new_attrs = dict(attrs)
-    assert (
-        len(desired_layouts) == 2
-    ), "A desired layout is expected for both of nn.conv3d_transpose's inputs"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    new_attrs["data_layout"] = desired_data_layout
-
-    if desired_kernel_layout != "default":
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.nn.conv3d_transpose(data, weight, **new_attrs)
-
-    # Handle default kernel layouts
-    if desired_data_layout == "NCDHW":
-        new_attrs["kernel_layout"] = "IODHW"
-        return relay.nn.conv3d_transpose(data, weight, **new_attrs)
-    elif desired_data_layout == "NDHWC":
-        new_attrs["kernel_layout"] = "DHWOI"
-        return relay.nn.conv3d_transpose(data, weight, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported")
-
-
-@reg.register_legalize("nn.conv3d_transpose")
-def legalize_conv3d_transpose(attrs, inputs, types):
-    """Legalize conv3d_transpose op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current Transposed convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.conv3d_transpose_legalize(attrs, inputs, types)
-
-
-# conv3d
-reg.register_strategy("nn.conv3d", strategy.conv3d_strategy)
-
-
-@reg.register_alter_op_layout("nn.conv3d")
-def alter_op_layout_conv3d(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of conv3d"""
-    return topi.nn.conv3d_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-@reg.register_convert_op_layout("nn.conv3d")
-def convert_conv3d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for conv3d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    data, weight = inputs
-    new_attrs = dict(attrs)
-    assert len(desired_layouts) == 2, "A desired layout is expected for both of nn.conv3d's inputs"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    new_attrs["data_layout"] = desired_data_layout
-
-    if desired_kernel_layout != "default":
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.nn.conv3d(data, weight, **new_attrs)
-
-    # Handle default kernel layouts
-    if desired_data_layout == "NCDHW":
-        new_attrs["kernel_layout"] = "OIDHW"
-        return relay.nn.conv3d(data, weight, **new_attrs)
-    elif desired_data_layout == "NDHWC":
-        new_attrs["kernel_layout"] = "DHWIO"
-        return relay.nn.conv3d(data, weight, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported")
-
-
-# conv3d_winograd related operators
-reg.register_strategy(
-    "nn.contrib_conv3d_winograd_without_weight_transform",
-    strategy.conv3d_winograd_without_weight_transform_strategy,
-)
-
-
-@reg.register_compute("nn.contrib_conv3d_winograd_weight_transform")
-def compute_contrib_conv3d_winograd_weight_transform(attrs, inputs, out_dtype):
-    """Compute definition of contrib_conv3d_winograd_weight_transform"""
-    out = topi.nn.conv3d_winograd_weight_transform(inputs[0], attrs.get_int("tile_size"))
-    return [out]
-
-
-reg.register_schedule(
-    "nn.contrib_conv3d_winograd_weight_transform",
-    strategy.schedule_conv3d_winograd_weight_transform,
-)
-
-
-# conv1d_transpose
-reg.register_strategy("nn.conv1d_transpose", strategy.conv1d_transpose_strategy)
-
-
-# bias_add
-reg.register_injective_schedule("nn.bias_add")
-
-
-# max_pool1d
-reg.register_schedule("nn.max_pool1d", strategy.schedule_pool)
-
-
-# max_pool2d
-reg.register_schedule("nn.max_pool2d", strategy.schedule_pool)
-
-
-@reg.register_convert_op_layout("nn.max_pool2d")
-def convert_max_pool2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for max_pool2d op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current pooling
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of one layout string
-        layout string defining our desired layout for input and output.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    new_attrs = dict(attrs)
-    new_attrs["layout"] = str(desired_layouts[0])
-    new_attrs["out_layout"] = str(desired_layouts[0])
-    return relay.nn.max_pool2d(*inputs, **new_attrs)
-
-
-# max_pool3d
-reg.register_schedule("nn.max_pool3d", strategy.schedule_pool)
-
-
-# avg_pool1d
-reg.register_schedule("nn.avg_pool1d", strategy.schedule_pool)
-
-
-# avg_pool2d
-reg.register_schedule("nn.avg_pool2d", strategy.schedule_pool)
-
-
-@reg.register_convert_op_layout("nn.avg_pool2d")
-def convert_avg_pool2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for avg_pool2d op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current pooling
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of one layout string
-        layout string defining our desired layout for input and output.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    new_attrs = dict(attrs)
-    new_attrs["layout"] = str(desired_layouts[0])
-    new_attrs["out_layout"] = str(desired_layouts[0])
-    return relay.nn.avg_pool2d(*inputs, **new_attrs)
-
-
-# avg_pool3d
-reg.register_schedule("nn.avg_pool3d", strategy.schedule_pool)
-
-
-# max_pool2d_grad
-reg.register_schedule("nn.max_pool2d_grad", strategy.schedule_pool_grad)
-
-
-# avg_pool2d_grad
-reg.register_schedule("nn.avg_pool2d_grad", strategy.schedule_pool_grad)
-
-
-# adaptive_max_pool1d
-reg.register_schedule("nn.adaptive_max_pool1d", strategy.schedule_adaptive_pool)
-
-
-# adaptive_avg_pool1d
-reg.register_schedule("nn.adaptive_avg_pool1d", strategy.schedule_adaptive_pool)
-
-
-# global_max_pool2d
-reg.register_schedule("nn.global_max_pool2d", strategy.schedule_adaptive_pool)
-
-
-@reg.register_convert_op_layout("nn.global_max_pool2d")
-def convert_global_max_pool2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for global_max_pool2d op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current pooling
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of one layout string
-        layout string defining our desired layout for input and output.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    new_attrs = dict(attrs)
-    new_attrs["layout"] = str(desired_layouts[0])
-    new_attrs["out_layout"] = str(desired_layouts[0])
-    return relay.nn.global_max_pool2d(*inputs, **new_attrs)
-
-
-# global_avg_pool2d
-reg.register_schedule("nn.global_avg_pool2d", strategy.schedule_adaptive_pool)
-
-
-@reg.register_convert_op_layout("nn.global_avg_pool2d")
-def convert_global_avg_pool2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for global_avg_pool2d op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current pooling
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of one layout string
-        layout string defining our desired layout for input and output.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    new_attrs = dict(attrs)
-    new_attrs["layout"] = str(desired_layouts[0])
-    new_attrs["out_layout"] = str(desired_layouts[0])
-    return relay.nn.global_avg_pool2d(*inputs, **new_attrs)
-
-
-# adaptive_max_pool2d
-reg.register_schedule("nn.adaptive_max_pool2d", strategy.schedule_adaptive_pool)
-
-
-# adaptive_avg_pool2d
-reg.register_schedule("nn.adaptive_avg_pool2d", strategy.schedule_adaptive_pool)
-
-
-# adaptive_max_pool3d
-reg.register_schedule("nn.adaptive_max_pool3d", strategy.schedule_adaptive_pool)
-
-
-# adaptive_avg_pool3d
-reg.register_schedule("nn.adaptive_avg_pool3d", strategy.schedule_adaptive_pool)
-
-
-# leaky_relu
-reg.register_broadcast_schedule("nn.leaky_relu")
-
-
-# prelu
-reg.register_broadcast_schedule("nn.prelu")
-
-
-# flatten
-reg.register_broadcast_schedule("nn.batch_flatten")
-
-
-# lrn
-@reg.register_compute("nn.lrn")
-def compute_lrn(attrs, inputs, out_dtype):
-    """Compute definition of lrn"""
-    assert len(inputs) == 1
-    return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis, attrs.alpha, attrs.beta, attrs.bias)]
-
-
-reg.register_schedule("nn.lrn", strategy.schedule_lrn)
-
-
-# upsampling
-@reg.register_compute("nn.upsampling")
-def compute_upsampling(attrs, inputs, out_dtype):
-    scale_h = attrs.scale_h
-    scale_w = attrs.scale_w
-    layout = attrs.layout
-    method = attrs.method
-    align_corners = attrs.align_corners
-    return [topi.nn.upsampling(inputs[0], scale_h, scale_w, layout, method, align_corners)]
-
-
-reg.register_injective_schedule("nn.upsampling")
-
-
-# upsampling3d
-@reg.register_compute("nn.upsampling3d")
-def compute_upsampling3d(attrs, inputs, out_dtype):
-    scale_d = attrs.scale_d
-    scale_h = attrs.scale_h
-    scale_w = attrs.scale_w
-    layout = attrs.layout
-    method = attrs.method
-    coordinate_transformation_mode = attrs.coordinate_transformation_mode
-    return [
-        topi.nn.upsampling3d(
-            inputs[0], scale_d, scale_h, scale_w, layout, method, coordinate_transformation_mode
-        )
-    ]
-
-
-reg.register_injective_schedule("nn.upsampling3d")
-
-
-# pad
-reg.register_schedule("nn.pad", strategy.schedule_pad)
-
-
-# mirror_pad
-@reg.register_compute("nn.mirror_pad")
-def compute_mirror_pad(attrs, inputs, out_dtype):
-    pad_before, pad_after = list(zip(*attrs.pad_width))
-    mode = attrs.mode
-    out = topi.nn.mirror_pad(inputs[0], pad_before=pad_before, pad_after=pad_after, mode=mode)
-    return [out]
-
-
-reg.register_broadcast_schedule("nn.mirror_pad")
-
-
-@script
-def _mirror_pad_func(data_shape, pad_width):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    for i in const_range(data_shape.shape[0]):
-        out[i] = data_shape[i] + int64(pad_width[i][0]) + int64(pad_width[i][1])
-    return out
-
-
-@reg.register_shape_func("nn.mirror_pad", False)
-def mirror_pad_func(attrs, inputs, _):
-    pad_width_tuple = [get_const_tuple(p) for p in attrs.pad_width]
-    return [_mirror_pad_func(inputs[0], convert(pad_width_tuple))]
-
-
-# conv2d_winograd related operators
-reg.register_strategy(
-    "nn.contrib_conv2d_winograd_without_weight_transform",
-    strategy.conv2d_winograd_without_weight_transform_strategy,
-)
-
-
-# conv2d_gemm related operators
-reg.register_strategy(
-    "nn.contrib_conv2d_gemm_without_weight_transform",
-    strategy.conv2d_gemm_without_weight_transform_strategy,
-)
-
-
-@reg.register_compute("nn.contrib_conv2d_gemm_weight_transform")
-def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype):
-    """Compute definition of contrib_conv2d_gemm_weight_transform"""
-    out = topi.nn.conv2d_gemm_weight_transform(inputs[0], attrs.tile_N, attrs.tile_K)
-    return [out]
-
-
-reg.register_schedule(
-    "nn.contrib_conv2d_gemm_weight_transform", strategy.schedule_conv2d_gemm_weight_transform
-)
-
-
-@reg.register_compute("nn.contrib_conv2d_winograd_weight_transform")
-def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype):
-    """Compute definition of contrib_conv2d_winograd_weight_transform"""
-    out = topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int("tile_size"))
-    return [out]
-
-
-reg.register_schedule(
-    "nn.contrib_conv2d_winograd_weight_transform",
-    strategy.schedule_conv2d_winograd_weight_transform,
-)
-
-
-@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_weight_transform")
-def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype):
-    """Compute definition of contrib_conv2d_winograd_nnpack_weight_transform"""
-    convolution_algorithm = attrs.get_int("convolution_algorithm")
-    out = topi.nn.conv2d_winograd_nnpack_weight_transform(
-        inputs[0], convolution_algorithm, out_dtype
-    )
-    return [out]
-
-
-reg.register_schedule(
-    "nn.contrib_conv2d_winograd_nnpack_weight_transform",
-    strategy.schedule_conv2d_winograd_nnpack_weight_transform,
-)
-
-
-# conv2d_NCHWc
-reg.register_strategy("nn.contrib_conv2d_NCHWc", strategy.conv2d_NCHWc_strategy)
-
-# depthwise_conv2d_NCHWc
-reg.register_strategy("nn.contrib_depthwise_conv2d_NCHWc", strategy.depthwise_conv2d_NCHWc_strategy)
-
-
-# deformable_conv2d
-reg.register_strategy("nn.deformable_conv2d", strategy.deformable_conv2d_strategy)
-
-
-@reg.register_alter_op_layout("nn.deformable_conv2d")
-def alter_op_layout_deformable_conv2d(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of deformable conv2d"""
-    return None
-
-
-@reg.register_legalize("nn.deformable_conv2d")
-def legalize_deformable_conv2d(attrs, inputs, types):
-    """Legalize deformable conv2d op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return None
-
-
-@reg.register_convert_op_layout("nn.deformable_conv2d")
-def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for deformable conv2d op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    data, offset, weight = inputs
-    new_attrs = dict(attrs)
-    for attr in new_attrs:
-        if isinstance(new_attrs[attr], container.Array):
-            new_attrs[attr] = list(new_attrs[attr])
-        elif isinstance(new_attrs[attr], expr.IntImm):
-            new_attrs[attr] = new_attrs[attr].value
-
-    # First check if there is a LayoutConfig scope, and if so, whether
-    # it indicates we should ignore this layer or not.
-    layout_config = LayoutConfig.current
-    if layout_config is not None:
-        skip_layer = layout_config.check_skip()
-        if skip_layer:
-            return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)
-
-    # Prepare new layout.
-    assert len(desired_layouts) == 2, "A desired layout is expected for data and kernel"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    new_attrs["data_layout"] = desired_data_layout
-
-    if desired_kernel_layout != "default":
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)
-
-    # Handle default kernel layouts
-    if desired_data_layout == "NCHW":
-        new_attrs["kernel_layout"] = "OIHW"
-    elif desired_data_layout == "NHWC":
-        new_attrs["kernel_layout"] = "HWIO"
-    else:
-        raise ValueError(f"Layout {desired_data_layout} is not yet supported.")
-
-    return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)
-
-
-# bitpack
-@reg.register_compute("nn.bitpack")
-def compute_bitpack(attrs, inputs, out_dtype):
-    """Compute definition for bitpack"""
-    bits = attrs.bits
-    pack_axis = attrs.pack_axis
-    bit_axis = attrs.bit_axis
-    pack_type = attrs.pack_type
-    name = attrs.name
-    out = topi.nn.bitpack(inputs[0], bits, pack_axis, bit_axis, pack_type, name)
-    return [out]
-
-
-reg.register_schedule("nn.bitpack", strategy.schedule_bitpack)
-
-
-# bitserial_conv2d
-reg.register_strategy("nn.bitserial_conv2d", strategy.bitserial_conv2d_strategy)
-
-
-@reg.register_legalize("nn.bitserial_conv2d")
-def legalize_bitserial_conv2d(attrs, inputs, types):
-    """Legalize bitserial_conv2d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return topi.nn.bitserial_conv2d_legalize(attrs, inputs, types)
-
-
-# bitserial_dense
-reg.register_strategy("nn.bitserial_dense", strategy.bitserial_dense_strategy)
-
-
-# cross_entropy
-@reg.register_compute("nn.cross_entropy")
-def compute_cross_entropy(attrs, inputs, out_dtype):
-    x, y = inputs
-    return [-topi.sum(topi.log(x) * y) / x.shape[0]]
-
-
-reg.register_reduce_schedule("nn.cross_entropy")
-
-
-# dilate
-@reg.register_compute("nn.dilate")
-def compute_dilate(attrs, inputs, out_dtype):
-    return [topi.nn.dilate(inputs[0], attrs.strides, attrs.dilation_value)]
-
-
-reg.register_broadcast_schedule("nn.dilate")
-
-
-# cross_entropy_with_logits
-@reg.register_compute("nn.cross_entropy_with_logits")
-def compute_cross_entropy_with_logits(attrs, inputs, out_dtype):
-    x, y = inputs
-    return [-topi.sum(x * y) / x.shape[0]]
-
-
-reg.register_reduce_schedule("nn.cross_entropy_with_logits")
-
-
-# nll_loss
-@reg.register_compute("nn.nll_loss")
-def compute_nll_loss(attrs, inputs, out_dtype):
-    predictions, targets, weights = inputs
-    return [topi.nn.nll_loss(predictions, targets, weights, attrs.reduction, attrs.ignore_index)]
-
-
-reg.register_reduce_schedule("nn.nll_loss")
-
-
-# depth_to_space
-@reg.register_compute("nn.depth_to_space")
-def compute_depth_to_space(attrs, inputs, out_dtype):
-    block_size = attrs.block_size
-    layout = attrs.layout
-    mode = attrs.mode
-    return [topi.nn.depth_to_space(inputs[0], block_size, layout=layout, mode=mode)]
-
-
-reg.register_injective_schedule("nn.depth_to_space")
-
-
-# space_to_depth
-@reg.register_compute("nn.space_to_depth")
-def compute_space_to_depth(attrs, inputs, out_dtype):
-    block_size = attrs.block_size
-    layout = attrs.layout
-    return [topi.nn.space_to_depth(inputs[0], block_size, layout=layout)]
-
-
-reg.register_injective_schedule("nn.space_to_depth")
-
-
-# correlation
-reg.register_strategy("nn.correlation", strategy.correlation_strategy)
-
-
-# space_to_batch_nd and batch_to_space_nd
-reg.register_injective_schedule("nn.space_to_batch_nd")
-reg.register_injective_schedule("nn.batch_to_space_nd")
-
-
-reg.register_strategy("nn.conv2d_backward_weight", strategy.conv2d_backward_weight_strategy)
-
-
-@reg.register_legalize("nn.conv2d_backward_weight")
-def legalize_conv2d_backward_weight(attrs, inputs, types):
-    """Legalize conv2d_backward_weight op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current op
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    grad, data = inputs
-    data_shape = get_const_tuple(data.checked_type.shape)
-    weight_shape = get_const_tuple(types[2].shape)
-    _, out_channel, grad_h, grad_w = get_const_tuple(grad.checked_type.shape)
-    batch, in_channel, in_h, in_w = data_shape
-    _, _, filter_h, filter_w = weight_shape
-    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
-        get_const_tuple(attrs.padding), (filter_h, filter_w)
-    )
-    stride_h, stride_w = get_const_tuple(attrs.strides)
-    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
-
-    grad = relay.tile(grad, [1, in_channel // attrs.groups, 1, 1])
-    grad = relay.reshape(grad, [-1, 1, 0, 0])  # batch * oc * ic // groups, 1, oh, ow
-    data = relay.reshape(data, [1, -1, 0, 0])  # 1, batch * ic, ih, iw
-
-    backward_weight = relay.nn.conv2d(
-        data,
-        grad,
-        strides=attrs.dilation,
-        padding=attrs.padding,
-        dilation=attrs.strides,
-        groups=in_channel * batch,
-        out_dtype=attrs.out_dtype,
-    )
-
-    # infer shape of backward_weight
-    padded_weight_grad_h = (
-        in_h - (grad_h - 1) * stride_h - 1 + fpad_top + fpad_bottom
-    ) // dilation_h + 1
-    padded_weight_grad_w = (
-        in_w - (grad_w - 1) * stride_w - 1 + fpad_left + fpad_right
-    ) // dilation_w + 1
-
-    backward_weight = relay.reshape(
-        backward_weight,
-        [
-            batch,
-            in_channel // attrs.groups,
-            out_channel,
-            padded_weight_grad_h,
-            padded_weight_grad_w,
-        ],
-    )
-    backward_weight = relay.sum(backward_weight, axis=0)
-    backward_weight = relay.transpose(backward_weight, [1, 0, 2, 3])
-
-    assert padded_weight_grad_h >= filter_h
-    assert padded_weight_grad_w >= filter_w
-
-    if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w:
-        backward_weight = relay.strided_slice(
-            backward_weight,
-            begin=[0, 0, 0, 0],
-            end=[out_channel, in_channel // attrs.groups, filter_h, filter_w],
-        )
-
-    return backward_weight
-
-
-@reg.register_convert_op_layout("nn.conv2d_backward_weight")
-def convert_conv2d_backward_weight(attrs, inputs, _, desired_layouts):
-    """Convert Layout pass registration for conv2d_backward_weight op.
-    Note that `desired_layouts` must be a pair [`data_layout`, `kernel_layouts`],
-    where `kernel_layouts` affects the output of this op (since the output of this op
-    is the weight gradient). The layout of the output gradient (the second input to this op)
-    is assumed to be the same as `data_layout`.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current op
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    new_attrs = dict(attrs)
-    assert len(desired_layouts) == 2, "A desired layout is expected for both of data and gradient."
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    new_attrs["grad_layout"] = desired_data_layout
-    new_attrs["data_layout"] = desired_data_layout
-    new_attrs["kernel_layout"] = desired_kernel_layout
-    new_attrs.pop("out_layout")
-    return relay.nn.conv2d_backward_weight(inputs[0], inputs[1], **new_attrs)
-
-
-#####################
-#  Shape functions  #
-#####################
-
-
-@script
-def _conv_shape_func_nchw(dshape, kshape, strides, padding, dilation):
-    """Shape function for conv*d op with nchw & oihw layout."""
-    out = output_tensor((dshape.shape[0],), "int64")
-    out[0] = dshape[0]
-    out[1] = kshape[0]
-
-    for i in const_range(dshape.shape[0] - 2):
-        dilated_k = (kshape[i + 2] - 1) * dilation[i] + 1
-        out[i + 2] = (dshape[i + 2] + 2 * padding[i] - dilated_k) // strides[i] + 1
-    return out
-
-
-@script
-def _conv_shape_func_nhwc_hwio(dshape, kshape, strides, padding, dilation):
-    """Shape function for conv*d op with nhwc & hwio layout."""
-    out = output_tensor((dshape.shape[0],), "int64")
-    out[0] = dshape[0]
-    out[dshape.shape[0] - 1] = kshape[kshape.shape[0] - 1]
-
-    for i in const_range(dshape.shape[0] - 2):
-        dilated_k = (kshape[i] - 1) * dilation[i] + 1
-        out[i + 1] = (dshape[i + 1] + 2 * padding[i] - dilated_k) // strides[i] + 1
-    return out
-
-
-@script
-def _conv_shape_func_nhwc_hwoi(dshape, kshape, strides, padding, dilation):
-    """Shape function for conv*d op with nhwc & hwoi layout."""
-    out = output_tensor((dshape.shape[0],), "int64")
-    out[0] = dshape[0]
-    out[dshape.shape[0] - 1] = kshape[kshape.shape[0] - 2]
-
-    for i in const_range(dshape.shape[0] - 2):
-        dilated_k = (kshape[i] - 1) * dilation[i] + 1
-        out[i + 1] = (dshape[i + 1] + 2 * padding[i] - dilated_k) // strides[i] + 1
-    return out
-
-
-@script
-def _conv_shape_func_nhwc_ohwi(dshape, kshape, strides, padding, dilation):
-    """Shape function for conv*d op with nhwc & ohwi layout."""
-    out = output_tensor((dshape.shape[0],), "int64")
-    out[0] = dshape[0]
-    out[dshape.shape[0] - 1] = kshape[0]
-
-    for i in const_range(dshape.shape[0] - 2):
-        dilated_k = (kshape[i + 1] - 1) * dilation[i] + 1
-        out[i + 1] = (dshape[i + 1] + 2 * padding[i] - dilated_k) // strides[i] + 1
-    return out
-
-
-def conv_shape_func(attrs, inputs, _):
-    """Shape function for conv*d op."""
-    strides = get_const_tuple(attrs.strides)
-    padding = get_const_tuple(attrs.padding)
-    dilation = get_const_tuple(attrs.dilation)
-
-    shape_func = None
-    if attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
-        shape_func = _conv_shape_func_nchw
-    elif attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
-        shape_func = _conv_shape_func_nhwc_hwio
-    elif attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWOI":
-        shape_func = _conv_shape_func_nhwc_hwoi
-    elif attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "OHWI":
-        shape_func = _conv_shape_func_nhwc_ohwi
-    else:
-        raise ValueError(
-            "Unsupported data/kernel layout: %s, %s"
-            % (attrs["data_layout"], attrs["kernel_layout"])
-        )
-
-    return [shape_func(inputs[0], inputs[1], convert(strides), convert(padding), convert(dilation))]
-
-
-reg.register_shape_func("nn.conv1d", False, conv_shape_func)
-reg.register_shape_func("nn.conv2d", False, conv_shape_func)
-reg.register_shape_func("nn.conv3d", False, conv_shape_func)
-
-
-@script
-def _conv2d_NCHWc_shape_func(dshape, kshape, strides, padding, dilation, oc_bn):
-    out = output_tensor((dshape.shape[0],), "int64")
-    ic_chunk = dshape[1]
-    height = dshape[2]
-    width = dshape[3]
-    ic_bn = dshape[4]
-    kheight = kshape[2]
-    kwidth = kshape[3]
-    dilated_kh = (kheight - 1) * dilation[0] + 1
-    dilated_kw = (kwidth - 1) * dilation[1] + 1
-    kflatten = int64(1)
-    for i in const_range(kshape.shape[0]):
-        kflatten *= kshape[i]
-
-    oc = kflatten // (kheight * kwidth * ic_chunk * ic_bn)
-    oc_chunk = oc // oc_bn
-
-    out_height = (height + 2 * padding[0] - dilated_kh) // strides[0] + 1
-    out_width = (width + 2 * padding[1] - dilated_kw) // strides[1] + 1
-
-    out[0] = dshape[0]
-    out[1] = oc_chunk
-    out[2] = out_height
-    out[3] = out_width
-    out[4] = int64(oc_bn)
-    return out
-
-
-@reg.register_shape_func("nn.contrib_conv2d_NCHWc", False)
-def conv2d_NCHWc_shape_func(attrs, inputs, _):
-    """
-    Shape function for contrib_conv2d_NCHWc op.
-    """
-    strides = get_const_tuple(attrs.strides)
-    padding = get_const_tuple(attrs.padding)
-    dilation = get_const_tuple(attrs.dilation)
-    out_layout = attrs.out_layout
-    oc_bn = int(out_layout[4:-1])
-
-    return [
-        _conv2d_NCHWc_shape_func(
-            inputs[0],
-            inputs[1],
-            convert(strides),
-            convert(padding),
-            convert(dilation),
-            convert(oc_bn),
-        )
-    ]
-
-
-@script
-def _conv_transpose_shape_func(dshape, kshape, strides, padding, dilation, output_padding):
-    out = output_tensor((dshape.shape[0],), "int64")
-    out[0] = dshape[0]
-    out[1] = kshape[1]
-
-    for i in const_range(dshape.shape[0] - 2):
-        dilated_k = (kshape[i + 2] - 1) * dilation[i] + 1
-        out[i + 2] = (
-            strides[i] * (dshape[i + 2] - 1) + dilated_k - 2 * padding[i] + output_padding[i]
-        )
-    return out
-
-
-def conv_transpose_shape_func(attrs, inputs, _):
-    """
-    Shape function for contrib_conv2d_NCHWc op.
-    """
-    strides = get_const_tuple(attrs.strides)
-    padding = get_const_tuple(attrs.padding)
-    dilation = get_const_tuple(attrs.dilation)
-    output_padding = get_const_tuple(attrs.output_padding)
-
-    return [
-        _conv_transpose_shape_func(
-            inputs[0],
-            inputs[1],
-            convert(strides),
-            convert(padding),
-            convert(dilation),
-            convert(output_padding),
-        )
-    ]
-
-
-reg.register_shape_func("nn.conv1d_transpose", False, conv_transpose_shape_func)
-reg.register_shape_func("nn.conv2d_transpose", False, conv_transpose_shape_func)
-
-
-@script
-def _pool2d_shape_func(data_shape, pool_size, strides, padding, height_axis, width_axis):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    for i in const_range(data_shape.shape[0]):
-        if i == height_axis:
-            out[i] = (data_shape[i] + padding[0] + padding[2] - pool_size[0]) // strides[0] + 1
-        elif i == width_axis:
-            out[i] = (data_shape[i] + padding[1] + padding[3] - pool_size[1]) // strides[1] + 1
-        else:
-            out[i] = data_shape[i]
-
-    return out
-
-
-def pool2d_shape_func(attrs, inputs, _):
-    """
-    Shape function for pool2d op.
-    """
-    pool_size = get_const_tuple(attrs.pool_size)
-    strides = get_const_tuple(attrs.strides)
-    padding = get_const_tuple(attrs.padding)
-    layout = attrs.layout
-    height_axis = layout.index("H")
-    width_axis = layout.index("W")
-    if len(padding) == 1:
-        padding = [padding[0]] * 4
-    elif len(padding) == 2:
-        padding = [padding[0], padding[1], padding[0], padding[1]]
-
-    return [
-        _pool2d_shape_func(
-            inputs[0],
-            convert(pool_size),
-            convert(strides),
-            convert(padding),
-            convert(height_axis),
-            convert(width_axis),
-        )
-    ]
-
-
-reg.register_shape_func("nn.max_pool2d", False, pool2d_shape_func)
-reg.register_shape_func("nn.avg_pool2d", False, pool2d_shape_func)
-
-
-@script
-def _global_pool2d_shape_func(data_shape, height_axis, width_axis):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    for i in const_range(out.shape[0]):
-        if i == height_axis or i == width_axis:
-            out[i] = int64(1)
-        else:
-            out[i] = data_shape[i]
-
-    return out
-
-
-def global_pool2d_shape_func(attrs, inputs, _):
-    """
-    Shape function for global pool2d op.
-    """
-    layout = attrs.layout
-    height_axis = width_axis = 1
-    for i, letter in enumerate(layout):
-        if letter == "H":
-            height_axis = i
-        if letter == "W":
-            width_axis = i
-    return [_global_pool2d_shape_func(inputs[0], convert(height_axis), convert(width_axis))]
-
-
-reg.register_shape_func("nn.global_max_pool2d", False, global_pool2d_shape_func)
-reg.register_shape_func("nn.global_avg_pool2d", False, global_pool2d_shape_func)
-
-
-@script
-def _batch_flatten_shape_func(data_shape):
-    out = output_tensor((2,), "int64")
-    out[0] = data_shape[0]
-    out[1] = int64(1)
-    for i in const_range(data_shape.shape[0] - 1):
-        out[1] *= data_shape[i + 1]
-
-    return out
-
-
-@reg.register_shape_func("nn.batch_flatten", False)
-def batch_flatten_shape_func(attrs, inputs, _):
-    """
-    Shape function for batch_flatten op.
-    """
-    return [_batch_flatten_shape_func(inputs[0])]
-
-
-@script
-def _matmul_shape_func(tensor_a_shape, tensor_b_shape, transpose_a, transpose_b):
-    out = output_tensor((tensor_a_shape.shape[0],), "int64")
-    for i in const_range(out.shape[0] - 1):
-        out[i] = tensor_a_shape[i]
-    if transpose_a:
-        out[out.shape[0] - 2] = out[out.shape[0] - 1]
-    out[out.shape[0] - 1] = tensor_b_shape[0] if transpose_b else tensor_b_shape[1]
-
-    return out
-
-
-@reg.register_shape_func("nn.matmul", False)
-def matmul_shape_func(attrs, inputs, _):
-    """Shape function for matmul op."""
-    ret = [
-        _matmul_shape_func(
-            inputs[0],
-            inputs[1],
-            expr.IntImm("bool", attrs.transpose_a),
-            expr.IntImm("bool", attrs.transpose_b),
-        )
-    ]
-    return ret
-
-
-@reg.register_shape_func("nn.dense", False)
-def dense_shape_func(attrs, inputs, _):
-    """Shape function for dense op. This is an alias of matmul_nt operator for data tensor in
-    non-transposed format and weight tensor in transposed format.
-    """
-    ret = [
-        _matmul_shape_func(
-            inputs[0], inputs[1], expr.IntImm("bool", False), expr.IntImm("bool", True)
-        )
-    ]
-    return ret
-
-
-@script
-def _dense_pack_shape_func(data_shape, weight_shape):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    assert data_shape.shape[0] == 2, "Input data must be 2D"
-    out[0] = data_shape[0]
-    out[1] = weight_shape[0] * weight_shape[2]
-
-    return out
-
-
-@reg.register_shape_func("nn.contrib_dense_pack", False)
-def dense_pack_shape_func(attrs, inputs, _):
-    """
-    Shape function for dense_pack op.
-    """
-    ret = [_dense_pack_shape_func(inputs[0], inputs[1])]
-    return ret
-
-
-@script
-def _batch_matmul_shape_func(tensor_a_shape, tensor_b_shape, transpose_a, transpose_b):
-    out = output_tensor((tensor_a_shape.shape[0],), "int64")
-    out[0] = max(tensor_a_shape[0], tensor_b_shape[0])
-    out[1] = tensor_a_shape[2] if transpose_a else tensor_a_shape[1]
-    out[2] = tensor_b_shape[1] if transpose_b else tensor_b_shape[2]
-
-    return out
-
-
-@reg.register_shape_func("nn.batch_matmul", False)
-def batch_matmul_shape_func(attrs, inputs, _):
-    """
-    Shape function for batch matmul op.
-    """
-    ret = [
-        _batch_matmul_shape_func(
-            inputs[0],
-            inputs[1],
-            expr.IntImm("bool", attrs.transpose_a),
-            expr.IntImm("bool", attrs.transpose_b),
-        )
-    ]
-    return ret
-
-
-@script
-def _pad_shape_func(data_shape, pad_width):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    for i in const_range(out.shape[0]):
-        out[i] = data_shape[i] + pad_width[i][0] + pad_width[i][1]
-
-    return out
-
-
-@reg.register_shape_func("nn.pad", False)
-def pad_shape_func(attrs, inputs, _):
-    """
-    Shape function for pad op.
-    """
-    pad_width = []
-    for pair in attrs.pad_width:
-        pad_width.append(get_const_tuple(pair))
-    return [_pad_shape_func(inputs[0], convert(pad_width))]
-
-
-@script
-def _dilate_shape_func(data_shape, strides):
-    out = output_tensor((data_shape.shape[0],), "int64")
-    for i in const_range(out.shape[0]):
-        out[i] = (data_shape[i] - 1) * strides[i] + 1
-
-    return out
-
-
-@reg.register_shape_func("nn.dilate", False)
-def dilate_shape_func(attrs, inputs, _):
-    """
-    Shape function for dilate op.
-    """
-    return [_dilate_shape_func(inputs[0], convert(attrs.strides))]
-
-
-reg.register_shape_func("nn.bias_add", False, elemwise_shape_func)
-reg.register_shape_func("nn.softmax", False, elemwise_shape_func)
-reg.register_shape_func("nn.fast_softmax", False, elemwise_shape_func)
-reg.register_shape_func("nn.relu", False, elemwise_shape_func)
-reg.register_shape_func("nn.leaky_relu", False, elemwise_shape_func)
-reg.register_shape_func("nn.prelu", False, elemwise_shape_func)
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
deleted file mode 100644
index 8cb66ecaa9a2..000000000000
--- a/python/tvm/relay/op/nn/nn.py
+++ /dev/null
@@ -1,3823 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-lines
-"""Neural network operations."""
-from tvm.relay import expr
-
-from ...expr import Constant, Expr, const
-from ..dyn.nn import _make as _dyn_make
-from . import _make
-from .utils import get_pad_tuple1d, get_pad_tuple2d, get_pad_tuple3d
-
-
-def conv1d(
-    data,
-    weight,
-    strides=1,
-    padding=0,
-    dilation=1,
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCW",
-    kernel_layout="OIW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""1D convolution.
-
-    This operator takes the weight as the convolution kernel
-    and convolves it with data to produce an output.
-
-
-    In the default case, where the data_layout is `NCW`
-    and kernel_layout is `OIW`, conv1d takes in
-    a data Tensor with shape `(batch_size, in_channels, width)`,
-    and a weight Tensor with shape `(channels, in_channels, kernel_size)`
-    to produce an output Tensor with the following rule:
-
-    .. math::
-
-        \mbox{out}[b, c, w] = \sum_{dw, k}
-           \mbox{data}[b, k, \mbox{strides}[0] * w + dw] *
-           \mbox{weight}[c, k, dw]
-
-    Padding and dilation are applied to data and weight respectively before the computation.
-    This operator accepts data layout specification.
-    Semantically, the operator will convert the layout to the canonical layout
-    (`NCW` for data and `OIW` for weight), perform the computation,
-    then convert to the out_layout.
-
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Optional[int, Tuple[int]]
-        The strides of convolution.
-
-    padding : Optional[int, Tuple[int]]
-        The padding of convolution on both sides of the input before convolution.
-
-    dilation : Optional[int, Tuple[int]]
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : Optional[int]
-        Currently unused for 1D convolution.
-
-    channels : Optional[int]
-        Number of output channels of this convolution.
-
-    kernel_size : Optional[int, Tuple[int]]
-        The spatial dimension of the convolution kernel.
-
-    data_layout : Optional[str]
-        Layout of the input.
-
-    kernel_layout : Optional[str]
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(kernel_size, int):
-        kernel_size = (kernel_size,)
-    if isinstance(strides, int):
-        strides = (strides,)
-    if isinstance(dilation, int):
-        dilation = (dilation,)
-    padding = get_pad_tuple1d(padding)
-    return _make.conv1d(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def conv2d(
-    data,
-    weight,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""2D convolution.
-
-    This operator takes the weight as the convolution kernel
-    and convolves it with data to produce an output.
-
-
-    In the default case, where the data_layout is `NCHW`
-    and kernel_layout is `OIHW`, conv2d takes in
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    and a weight Tensor with shape `(channels, in_channels, kernel_size[0], kernel_size[1])`
-    to produce an output Tensor with the following rule:
-
-    .. math::
-
-        \mbox{out}[b, c, y, x] = \sum_{dy, dx, k}
-           \mbox{data}[b, k, \mbox{strides}[0] * y  + dy, \mbox{strides}[1] * x + dx] *
-           \mbox{weight}[c, k, dy, dx]
-
-    Padding and dilation are applied to data and weight respectively before the computation.
-    This operator accepts data layout specification.
-    Semantically, the operator will convert the layout to the canonical layout
-    (`NCHW` for data and `OIHW` for weight), perform the computation,
-    then convert to the out_layout.
-
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Optional[int, Tuple[int]]
-        The strides of convolution.
-
-    padding : Optional[int, Tuple[int]]
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : Optional[int, Tuple[int]]
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : Optional[int]
-        Number of groups for grouped convolution.
-
-    channels : Optional[int]
-        Number of output channels of this convolution.
-
-    kernel_size : Optional[int, Tuple[int]]
-        The spatial of the convolution kernel.
-
-    data_layout : Optional[str]
-        Layout of the input.
-
-    kernel_layout : Optional[str]
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(kernel_size, int):
-        kernel_size = (kernel_size, kernel_size)
-    if isinstance(strides, int):
-        strides = (strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation)
-    # TODO enforce 4-way padding in topi/nn/conv2d after #4644 merged
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.conv2d(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def conv3d(
-    data,
-    weight,
-    strides=(1, 1, 1),
-    padding=(0, 0, 0),
-    dilation=(1, 1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCDHW",
-    kernel_layout="OIDHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""3D convolution.
-
-    This operator takes the weight as the convolution kernel
-    and convolves it with data to produce an output.
-
-
-    In the default case, where the data_layout is `NCDHW`
-    and kernel_layout is `OIDHW`, conv3d takes in
-    a data Tensor with shape `(batch_size, in_channels, depth, height, width)`,
-    and a weight Tensor with shape `(channels, in_channels, kernel_size[0], kernel_size[1],
-    kernel_size[2])` to produce an output Tensor with the following rule:
-
-    .. math::
-
-        \mbox{out}[b, c, z, y, x] = \sum_{dz, dy, dx, k}
-           \mbox{data}[b, k, \mbox{strides}[0] * z  + dz, \mbox{strides}[1] * y  + dy,
-           \mbox{strides}[2] * x + dx] * \mbox{weight}[c, k, dz, dy, dx]
-
-    Padding and dilation are applied to data and weight respectively before the computation.
-    This operator accepts data layout specification.
-    Semantically, the operator will convert the layout to the canonical layout
-    (`NCDHW` for data and `OIDHW` for weight), perform the computation,
-    then convert to the out_layout.
-
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Optional[Tuple[int]]
-        The strides of convolution.
-
-    padding : Optional[int, Tuple[int]]
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : Optional[int, Tuple[int]]
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : Optional[int]
-        Number of groups for grouped convolution.
-
-    channels : Optional[int]
-        Number of output channels of this convolution.
-
-    kernel_size : Optional[int, Tuple[int]]
-        The spatial of the convolution kernel.
-
-    data_layout : Optional[str]
-        Layout of the input.
-
-    kernel_layout : Optional[str]
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(kernel_size, int):
-        kernel_size = (kernel_size, kernel_size, kernel_size)
-    if isinstance(strides, int):
-        strides = (strides, strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation, dilation)
-    padding = get_pad_tuple3d(padding)
-    return _make.conv3d(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def contrib_conv3d_winograd_without_weight_transform(
-    data,
-    weight,
-    tile_size,
-    strides=(1, 1, 1),
-    padding=(0, 0, 0),
-    dilation=(1, 1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCDHW",
-    kernel_layout="OIDHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""3D convolution with winograd algorithm.
-
-    The basic parameters are the same as the ones in vanilla conv3d.
-    It assumes the weight is pre-transformed by nn.contrib_conv3d_winograd_weight_transform
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    tile_size : int
-        The Tile size of winograd. E.g. 2 for F(2x2x2, 3x3x3) and 4 for F(4x4x4, 3x3x3)
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 3-way padding to 6-way padding
-    padding = get_pad_tuple3d(padding)
-    return _make.contrib_conv3d_winograd_without_weight_transform(
-        data,
-        weight,
-        tile_size,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def conv3d_transpose(
-    data,
-    weight,
-    strides=(1, 1, 1),
-    padding=(0, 0, 0),
-    dilation=(1, 1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCDHW",
-    kernel_layout="IODHW",
-    out_layout="",
-    output_padding=(0, 0, 0),
-    out_dtype="",
-):
-    r"""3D transpose convolution.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Optional[Tuple[int]]
-        The strides of convolution.
-
-    padding : Optional[int, Tuple[int]]
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : Optional[int, Tuple[int]]
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : Optional[int]
-        Number of groups for grouped convolution.
-
-    channels : Optional[int]
-        Number of output channels of this convolution.
-
-    kernel_size : Optional[int, Tuple[int]]
-        The spatial of the convolution kernel.
-
-    data_layout : Optional[str]
-        Layout of the input.
-
-    kernel_layout : Optional[str]
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision conv3d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    if isinstance(kernel_size, int):
-        kernel_size = (kernel_size, kernel_size, kernel_size)
-    if isinstance(strides, int):
-        strides = (strides, strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation, dilation)
-    padding = get_pad_tuple3d(padding)
-
-    return _make.conv3d_transpose(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        output_padding,
-        out_dtype,
-    )
-
-
-def conv2d_transpose(
-    data,
-    weight,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW",
-    kernel_layout="IOHW",
-    out_layout="",
-    output_padding=(0, 0),
-    out_dtype="",
-):
-    """Two dimensional transposed convolution operator.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Tuple[int], optional
-        The strides of convolution.
-
-    padding : Tuple[int], optional
-        The padding of convolution on both sides of inputs.
-
-    dilation : Tuple[int], optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    output_padding : Tuple[int], optional
-        Used to disambiguate the output shape.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.conv2d_transpose(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        output_padding,
-        out_dtype,
-    )
-
-
-def conv1d_transpose(
-    data,
-    weight,
-    strides=(1,),
-    padding=(0,),
-    dilation=(1,),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCW",
-    kernel_layout="IOW",
-    out_layout="",
-    output_padding=(0,),
-    out_dtype="",
-):
-    """One dimensional transposed convolution operator.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : Tuple[int], optional
-        The strides of convolution.
-
-    padding : Tuple[int], optional
-        The padding of convolution on both sides of inputs.
-
-    dilation : Tuple[int], optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    output_padding : Tuple[int], optional
-        Used to disambiguate the output shape.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.conv1d_transpose(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        output_padding,
-        out_dtype,
-    )
-
-
-def softmax(data, axis=-1):
-    r"""Computes softmax.
-
-    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
-
-    .. note::
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data: tvm.relay.Expr
-        The input data to the operator.
-
-    axis: int, optional
-        The axis to sum over when computing softmax
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.softmax(data, axis)
-
-
-def fast_softmax(data, axis=-1):
-    r"""Computes softmax.
-    Use approximation to compute exponent for faster speed.
-
-    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
-    .. note::
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data: tvm.relay.Expr
-        The input data to the operator.
-    axis: int, optional
-        The axis to sum over when computing softmax
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.fast_softmax(data, axis)
-
-
-def log_softmax(data, axis=-1):
-    r"""Computes log softmax.
-
-    .. math::
-
-        \text{log_softmax}(x)_i = \log \frac{exp(x_i)}{\sum_j exp(x_j)}
-
-    .. note::
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data: tvm.relay.Expr
-        The input data to the operator.
-
-    axis: int, optional
-        The axis to sum over when computing log softmax
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.log_softmax(data, axis)
-
-
-def max_pool1d(
-    data,
-    pool_size=(1,),
-    strides=(1,),
-    dilation=(1,),
-    padding=(0,),
-    layout="NCW",
-    out_layout="",
-    ceil_mode=False,
-):
-    r"""1D maximum pooling operator.
-
-    This operator takes data as input and does 1D max value calculation
-    with in pool_size sized window by striding defined by stride.
-
-    In the default case, where the data_layout is `NCW`
-    a data Tensor with shape `(batch_size, channels, width)`,
-    to produce an output Tensor.
-
-    The ceil_mode is used to take ceil or floor while computing out shape.
-    count_include_pad indicates including or excluding padded input values in computation.
-    This operator accepts data layout specification.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : int or tuple of int, optional
-        The strides of pooling.
-
-    dilation : int or tuple of int, optional
-        The dilation of pooling.
-
-    padding : int or tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pool_size, int):
-        pool_size = (pool_size,)
-    if isinstance(strides, int):
-        strides = (strides,)
-    if isinstance(dilation, int):
-        dilation = (dilation,)
-    padding = get_pad_tuple1d(padding)
-    return _make.max_pool1d(
-        data, pool_size, strides, dilation, padding, layout, out_layout, ceil_mode
-    )
-
-
-def max_pool2d(
-    data,
-    pool_size=(1, 1),
-    strides=(1, 1),
-    dilation=(1, 1),
-    padding=(0, 0),
-    layout="NCHW",
-    out_layout="",
-    ceil_mode=False,
-):
-    r"""2D maximum pooling operator.
-
-    This operator takes data as input and does 2D max value calculation
-    with in pool_size sized window by striding defined by stride
-
-
-    In the default case, where the data_layout is `NCHW`
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, h, w) and pool_size (kh, kw)
-
-    .. math::
-
-        \mbox{out}(b, c, y, x)  = \max_{m=0, \ldots, kh-1} \max_{n=0, \ldots, kw-1}
-             \mbox{data}(b, c, \mbox{stride}[0] * y + m, \mbox{stride}[1] * x + n)
-
-    Padding is applied to data before the computation.
-    ceil_mode is used to take ceil or floor while computing out shape.
-    This operator accepts data layout specification.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : tuple of int, optional
-        The strides of pooling.
-
-    dilation : int or tuple of int, optional
-        The dilation of pooling.
-
-    padding : tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pool_size, int):
-        pool_size = (pool_size, pool_size)
-    if isinstance(strides, int):
-        strides = (strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation)
-    padding = get_pad_tuple2d(padding)
-    return _make.max_pool2d(
-        data, pool_size, strides, dilation, padding, layout, out_layout, ceil_mode
-    )
-
-
-def max_pool3d(
-    data,
-    pool_size=(1, 1, 1),
-    strides=(1, 1, 1),
-    dilation=(1, 1, 1),
-    padding=(0, 0, 0),
-    layout="NCDHW",
-    out_layout="",
-    ceil_mode=False,
-):
-    r"""3D maximum pooling operator.
-
-    This operator takes data as input and does 3D max value calculation
-    with in pool_size sized window by striding defined by stride.
-
-
-    In the default case, where the data_layout is `NCDHW`
-    a data Tensor with shape `(batch_size, channels, depth, height, width)`,
-    to produce an output Tensor.
-
-    The ceil_mode is used to take ceil or floor while computing out shape.
-    count_include_pad indicates including or excluding padded input values in computation.
-    This operator accepts data layout specification.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : tuple of int, optional
-        The strides of pooling.
-
-    dilation : int or tuple of int, optional
-        The dilation of pooling.
-
-    padding : tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pool_size, int):
-        pool_size = (pool_size, pool_size, pool_size)
-    if isinstance(strides, int):
-        strides = (strides, strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation, dilation)
-    padding = get_pad_tuple3d(padding)
-    return _make.max_pool3d(
-        data, pool_size, strides, dilation, padding, layout, out_layout, ceil_mode
-    )
-
-
-def avg_pool1d(
-    data,
-    pool_size=(1,),
-    strides=(1,),
-    dilation=(1,),
-    padding=(0,),
-    layout="NCW",
-    out_layout="",
-    ceil_mode=False,
-    count_include_pad=False,
-):
-    r"""1D average pooling operator.
-
-    This operator takes data as input and does 1D average value calculation
-    with in pool_size sized window by striding defined by stride
-
-    In the default case, where the data_layout is `NCW`
-    a data Tensor with shape `(batch_size, channels, width)`,
-    to produce an output Tensor.
-
-    The ceil_mode is used to take ceil or floor while computing out shape.
-    count_include_pad indicates including or excluding padded input values in computation.
-    This operator accepts data layout specification.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : int or tuple of int, optional
-        The strides of pooling.
-
-    dilation : int or tuple of int, optional
-        The dilation of pooling.
-
-    padding : int or tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    count_include_pad : bool, optional
-        To include padding to compute the average.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pool_size, int):
-        pool_size = (pool_size,)
-    if isinstance(strides, int):
-        strides = (strides,)
-    if isinstance(dilation, int):
-        dilation = (dilation,)
-    padding = get_pad_tuple1d(padding)
-    return _make.avg_pool1d(
-        data,
-        pool_size,
-        strides,
-        dilation,
-        padding,
-        layout,
-        out_layout,
-        ceil_mode,
-        count_include_pad,
-    )
-
-
-def avg_pool2d(
-    data,
-    pool_size=(1, 1),
-    strides=(1, 1),
-    dilation=(1, 1),
-    padding=(0, 0),
-    layout="NCHW",
-    out_layout="",
-    ceil_mode=False,
-    count_include_pad=False,
-):
-    r"""2D average pooling operator.
-
-    This operator takes data as input and does 2D average value calculation
-    with in pool_size sized window by striding defined by stride
-
-
-    In the default case, where the data_layout is `NCHW`
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, h, w), pool_size (kh, kw)
-
-    .. math::
-
-        \mbox{out}(b, c, y, x)  = \frac{1}{kh * kw} \sum_{m=0}^{kh-1} \sum_{n=0}^{kw-1}
-             \mbox{data}(b, c, \mbox{stride}[0] * y + m, \mbox{stride}[1] * x + n)
-
-    Padding is applied to data before the computation.
-    ceil_mode is used to take ceil or floor while computing out shape.
-    count_include_pad indicates including or excluding padded input values in computation.
-    This operator accepts data layout specification.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : tuple of int, optional
-        The strides of pooling.
-
-    dilation : int or tuple of int, optional
-        The dilation of pooling.
-
-    padding : tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    count_include_pad : bool, optional
-        To include padding to compute the average.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pool_size, int):
-        pool_size = (pool_size, pool_size)
-    if isinstance(strides, int):
-        strides = (strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation)
-    padding = get_pad_tuple2d(padding)
-    return _make.avg_pool2d(
-        data,
-        pool_size,
-        strides,
-        dilation,
-        padding,
-        layout,
-        out_layout,
-        ceil_mode,
-        count_include_pad,
-    )
-
-
-def avg_pool3d(
-    data,
-    pool_size=(1, 1, 1),
-    strides=(1, 1, 1),
-    dilation=(1, 1, 1),
-    padding=(0, 0, 0),
-    layout="NCDHW",
-    out_layout="",
-    ceil_mode=False,
-    count_include_pad=False,
-):
-    r"""3D average pooling operator.
-
-    This operator takes data as input and does 3D average value calculation
-    with in pool_size sized window by striding defined by stride
-
-
-    In the default case, where the data_layout is `NCDHW`
-    a data Tensor with shape `(batch_size, channels, depth, height, width)`,
-    to produce an output Tensor.
-
-    The ceil_mode is used to take ceil or floor while computing out shape.
-    count_include_pad indicates including or excluding padded input values in computation.
-    This operator accepts data layout specification.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : tuple of int, optional
-        The strides of pooling.
-
-    dilation : int or tuple of int, optional
-        The dilation of pooling.
-
-    padding : tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    count_include_pad : bool, optional
-        To include padding to compute the average.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pool_size, int):
-        pool_size = (pool_size, pool_size, pool_size)
-    if isinstance(strides, int):
-        strides = (strides, strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation, dilation)
-    padding = get_pad_tuple3d(padding)
-    return _make.avg_pool3d(
-        data,
-        pool_size,
-        strides,
-        dilation,
-        padding,
-        layout,
-        out_layout,
-        ceil_mode,
-        count_include_pad,
-    )
-
-
-def max_pool2d_grad(
-    out_grad,
-    data,
-    pool_size=(1, 1),
-    strides=(1, 1),
-    padding=(0, 0),
-    layout="NCHW",
-    out_layout="",
-    ceil_mode=False,
-):
-    r"""Gradient of 2D maximum pooling operator.
-
-    This operator takes out_grad and data as input and calculates gradient of max_pool2d.
-
-    Parameters
-    ----------
-    out_grad : tvm.relay.Expr
-        The output gradient
-
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : tuple of int, optional
-        The strides of pooling.
-
-    padding : tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.max_pool2d_grad(
-        out_grad, data, pool_size, strides, padding, layout, out_layout, ceil_mode
-    )
-
-
-def avg_pool2d_grad(
-    out_grad,
-    data,
-    pool_size=(1, 1),
-    strides=(1, 1),
-    padding=(0, 0),
-    layout="NCHW",
-    out_layout="",
-    ceil_mode=False,
-    count_include_pad=False,
-):
-    r"""Gradient of 2D average pooling operator.
-
-    This operator takes out_grad and data as input and calculates gradient of avg_pool2d.
-
-    Parameters
-    ----------
-    out_grad : tvm.relay.Expr
-        The output gradient
-
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    pool_size : int or tuple of int, optional
-        The size of window for pooling.
-
-    strides : tuple of int, optional
-        The strides of pooling.
-
-    padding : tuple of int, optional
-        The padding for pooling.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    ceil_mode : bool, optional
-        To enable or disable ceil while pooling.
-
-    count_include_pad : bool, optional
-        To include padding to compute the average.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.avg_pool2d_grad(
-        out_grad,
-        data,
-        pool_size,
-        strides,
-        padding,
-        layout,
-        out_layout,
-        ceil_mode,
-        count_include_pad,
-    )
-
-
-def global_max_pool2d(data, layout="NCHW", out_layout=""):
-    r"""2D global maximum pooling operator.
-
-    This operator takes data as input and does 2D max value calculation
-    across each window represented by WxH.
-
-
-    In the default case, where the data_layout is `NCHW`
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, h, w)
-
-    .. math::
-
-        \mbox{out}(b, c, 1, 1)  = \max_{m=0, \ldots, h} \max_{n=0, \ldots, w}
-             \mbox{data}(b, c, m, n)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.global_max_pool2d(data, layout, out_layout)
-
-
-def global_avg_pool2d(data, layout="NCHW", out_layout=""):
-    r"""2D global average pooling operator.
-
-    This operator takes data as input and does 2D average value calculation
-    across each window represented by WxH.
-
-
-    In the default case, where the data_layout is `NCHW`
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, h, w)
-
-    .. math::
-
-        \mbox{out}(b, c, 1, 1)  = \frac{1}{h * w} \sum_{m=0}^{h-1} \sum_{n=0}^{w-1}
-             \mbox{data}(b, c, m, n)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : Optional[str]
-        Layout of the output
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.global_avg_pool2d(data, layout, out_layout)
-
-
-def upsampling(
-    data, scale_h=1, scale_w=1, layout="NCHW", method="nearest_neighbor", align_corners=False
-):
-    """Upsampling.
-
-    This operator takes data as input and does 2D scaling to the given scale factor.
-    In the default case, where the data_layout is `NCHW`
-    with data of shape (n, c, h, w)
-    out will have a shape (n, c, h*scale_h, w*scale_w)
-
-    method indicates the algorithm to be used while calculating the out value
-    and method can be one of ("bilinear", "nearest_neighbor", "bicubic")
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    scale_h : tvm.relay.Expr or int or float
-        The scale factor for height upsampling.
-
-    scale_w : tvm.relay.Expr or int or float
-        The scale factor for width upsampling.
-
-    layout : str, optional
-        Layout of the input.
-
-    method : str, optional
-        Scale method to used [nearest_neighbor, bilinear, bicubic].
-
-    align_corners : bool, optional
-        Whether to keep corners in proper place.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(scale_h, Constant):
-        scale_h = scale_h.data.numpy().item()
-    if isinstance(scale_w, Constant):
-        scale_w = scale_w.data.numpy().item()
-    if isinstance(scale_h, Expr) or isinstance(scale_w, Expr):
-        if not isinstance(scale_h, Expr):
-            scale_h = const(scale_h, "float64")
-        if not isinstance(scale_w, Expr):
-            scale_w = const(scale_w, "float64")
-        return _dyn_make.upsampling(data, scale_h, scale_w, layout, method, align_corners)
-    return _make.upsampling(data, scale_h, scale_w, layout, method, align_corners)
-
-
-def upsampling3d(
-    data,
-    scale_d=1,
-    scale_h=1,
-    scale_w=1,
-    layout="NCDHW",
-    method="nearest_neighbor",
-    coordinate_transformation_mode="half_pixel",
-):
-    """3D Upsampling.
-
-    This operator takes data as input and does 3D scaling to the given scale factor.
-    In the default case, where the data_layout is `NCDHW`
-    with data of shape (n, c, d, h, w)
-    out will have a shape (n, c, d*scale_d, h*scale_h, w*scale_w)
-
-    method indicates the algorithm to be used while calculating the out value
-    and method can be one of ("trilinear", "nearest_neighbor")
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    scale_d : tvm.relay.Expr
-        The scale factor for depth upsampling.
-
-    scale_h : tvm.relay.Expr
-        The scale factor for height upsampling.
-
-    scale_w : tvm.relay.Expr
-        The scale factor for width upsampling.
-
-    layout : str, optional
-        Layout of the input.
-
-    method : str, optional
-        Scale method to used [nearest_neighbor, trilinear].
-
-    coordinate_transformation_mode: string, optional
-        Describes how to transform the coordinate in the resized tensor
-        to the coordinate in the original tensor.
-        Refer to the ONNX Resize operator specification for details.
-        Available options are "half_pixel", "align_corners" and "asymmetric".
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(scale_d, Constant):
-        scale_d = scale_d.data.numpy().item()
-    if isinstance(scale_h, Constant):
-        scale_h = scale_h.data.numpy().item()
-    if isinstance(scale_w, Constant):
-        scale_w = scale_w.data.numpy().item()
-    if isinstance(scale_d, Expr) or isinstance(scale_h, Expr) or isinstance(scale_w, Expr):
-        if not isinstance(scale_d, Expr):
-            scale_d = const(scale_d, "float64")
-        if not isinstance(scale_h, Expr):
-            scale_h = const(scale_h, "float64")
-        if not isinstance(scale_w, Expr):
-            scale_w = const(scale_w, "float64")
-        return _dyn_make.upsampling3d(
-            data, scale_d, scale_h, scale_w, layout, method, coordinate_transformation_mode
-        )
-    return _make.upsampling3d(
-        data, scale_d, scale_h, scale_w, layout, method, coordinate_transformation_mode
-    )
-
-
-def batch_flatten(data):
-    """BatchFlatten.
-
-    This operator flattens all the dimensions except for the batch dimension.
-    which results a 2D output.
-
-    For data with shape ``(d1, d2, ..., dk)``
-    batch_flatten(data) returns reshaped output of shape ``(d1, d2*...*dk)``.
-
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The Flattened result.
-    """
-    return _make.batch_flatten(data)
-
-
-def bias_add(data, bias, axis=1):
-    """add_bias operator.
-
-    Add 1D bias to the axis of data.
-    This function is a special case of add which allows
-    inference of shape of the bias from data.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    bias : tvm.relay.Expr
-        The bias to be added.
-
-    axis : int, optional
-        The axis to add the bias.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The final result.
-    """
-    return _make.bias_add(data, bias, axis)
-
-
-def matmul(tensor_a, tensor_b, units=None, out_dtype="", transpose_a=False, transpose_b=False):
-    """Matmul operator.
-    Applies a linear transformation. The A & B can be transposed.
-
-    .. math::
-
-        `C = A * B`
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The first input of the operator,
-        of shape `(d_1, d_2, ..., d_n, units_in)` or `(d_1, d_2, ..., units_in, d_n)`.
-
-    weight : tvm.relay.Expr
-        The second input expressions, 2-D matrix,
-        of shape `(units_in, units)` or `(units, units_in)`.
-
-    units : Optional[int]
-        Number of hidden units of the matmul transformation.
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision matmul,
-        of shape `(d_1, d_2, ..., d_n, units)`.
-
-    transpose_a : Optional[bool] = False
-        Whether the data tensor is in transposed format.
-
-    transpose_b : Optional[bool] = False
-        Whether the weight tensor is in transposed format.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # Since currently `nn.dense` has better topi schedule support, will prefer to use `dense`
-    # rather than `matmul` for better compatibility
-    if not transpose_a and transpose_b:
-        # TODO(jcf94): Remove this when `nn.matmul` is finnaly ready
-        return dense(tensor_a, tensor_b, units, out_dtype)
-    return _make.matmul(tensor_a, tensor_b, units, out_dtype, transpose_a, transpose_b)
-
-
-def dense(data, weight, units=None, out_dtype=""):
-    """Dense operator.
-    Applies a linear transformation
-
-    .. math::
-
-    `Y = X * W^T`
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator,
-        of shape `(d_1, d_2, ..., d_n, units_in)`.
-
-    weight : tvm.relay.Expr
-        The weight expressions, 2-D matrix,
-        of shape `(units, units_in)`.
-
-    units : int, optional
-        Number of hidden units of the dense transformation.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision dense,
-        of shape `(d_1, d_2, ..., d_n, units)`.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.dense(data, weight, units, out_dtype)
-
-
-def contrib_dense_pack(data, weight, weight_layout="NC", units=None, out_dtype=""):
-    """Dense operator.
-    Applies a linear transformation with packed weight
-
-    .. math::
-
-    `Y = X * W^T`
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator,
-        of shape `(batch, units_in)`.
-
-    weight : tvm.relay.Expr
-        The transformed weight expressions, 3-D matrix,
-        of shape `(units // pack_weight_tile, units_in, pack_weight_tile)`.
-
-    weight_layout: str
-        The layout of weight, such as "NC" or "NC8n".
-
-    units : int, optional
-        Number of hidden units of the dense transformation.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision dense.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.contrib_dense_pack(data, weight, weight_layout, units, out_dtype)
-
-
-def fifo_buffer(data, buffer, axis):
-    """FIFO buffer to enable computation reuse in CNNs with sliding indow input
-
-    Compute equivalent of
-
-    .. code-block:: python
-
-        concat(buffer, data, axis=axis)
-        .slice_axis(axis=axis,
-                    begin=data.shape[axis],
-                    end=data.shape[axis]+buffer.shape[axis])
-
-    Useful for
-
-    * Encoding explicit re-use of computation in convolution ops operated on a sliding window input
-    * Implementing a FIFO queue to cache intermediate results, e.g. as in Fast WaveNet.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data
-    buffer : tvm.relay.Expr
-        Previous value of the FIFO buffer
-    axis : int
-        Specify which axis should be used for buffering
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        Updated value for the buffer
-    """
-    return _make.fifo_buffer(data, buffer, axis)
-
-
-def relu(data):
-    """Rectified linear unit.
-
-    .. math::
-       out = max(x, 0)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.relu(data)
-
-
-def leaky_relu(data, alpha=0.01):
-    """This operator takes data as input and does Leaky version
-    of a Rectified Linear Unit.
-
-    .. math::
-
-        `y = x > 0 ? x : alpha * x`
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    alpha : float
-        Slope coefficient for the negative half axis.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.leaky_relu(data, alpha)
-
-
-def prelu(data, alpha, axis=1):
-    """This operator takes data as input and does Leaky version
-    of a Rectified Linear Unit.
-
-    .. math::
-
-        y = x > 0 ? x : alpha * x
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    alpha : tvm.relay.Expr
-        Slope coefficient for the negative half axis.
-
-    axis : int, optional
-        Specify which shape axis the channel is specified.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.prelu(data, alpha, axis)
-
-
-def pad(data, pad_width, pad_value=0, pad_mode="constant"):
-    r"""Padding
-
-    This operator takes in a tensor and pads each axis by the specified
-    widths using the specified value.
-
-    Parameters
-    ----------
-    data: tvm.relay.Expr
-        The input data to the operator
-    pad_width: tuple of <tuple of <int>>, or tvm.relay.Expr, required
-        Number of values padded to the edges of each axis, in the format
-        of ((before_1, after_1), ..., (before_N, after_N))
-    pad_value: float, or tvm.relay.Expr, optional, default=0
-        The value used for padding
-    pad_mode: 'constant', 'edge', 'reflect'
-        'constant' pads with constant_value pad_value
-        'edge' pads using the edge values of the input array
-        'reflect' pads by reflecting values with respect to the edge
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    if isinstance(pad_width, Constant):
-        pad_width = [list(i) for i in pad_width.data.numpy()]
-    if not isinstance(pad_value, Expr):
-        pad_value = const(pad_value)
-    if isinstance(pad_width, Expr):
-        return _dyn_make.pad(data, pad_width, pad_value, pad_mode)
-    return _make.pad(data, pad_width, pad_value, pad_mode)
-
-
-def dilate(data, strides, dilation_value=0.0):
-    """Dilate data with given dilation value (0 by default).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        n-D, can be any layout.
-
-    strides : tuple of <int>
-        Dilation stride on each dimension, 1 means no dilation.
-
-    dilation_value : int/float, optional
-        Value used to dilate the input.
-
-    Returns
-    -------
-    Output : tvm.relay.Expr
-        The computed result
-    """
-    return _make.dilate(data, strides, dilation_value)
-
-
-def mirror_pad(data, pad_width, mode="SYMMETRIC"):
-    r"""MirrorPadding
-
-    This operator takes in a tensor and pads each axis by the specified
-    widths using mirroring of the border pixels.
-
-    Parameters
-    ----------
-    data: tvm.relay.Expr
-        The input data to the operator
-    pad_width: tuple of <tuple of <int>>, required
-        Number of values padded to the edges of each axis, in the format
-        of ((before_1, after_1), ..., (before_N, after_N))
-    mode: string, optional, default='SYMMETRIC'
-        What type of mirroring to use, must be SYMMETRIC or REFLECT.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.mirror_pad(data, pad_width, mode)
-
-
-def lrn(data, size=5, axis=1, bias=2, alpha=0.00001, beta=0.75):
-    """This operator takes data as input and does local response normalization.
-
-    Normalize the input in a local region across or within feature maps.
-    Each input value is divided by (data / (bias + (alpha * sum_data ^2 /size))^beta)
-    where n is the size of each local region, and the sum is taken over the region
-    centered at that value (zero padding is added where necessary).
-
-    .. math::
-        (data / (bias + (alpha * sum_data ^2 /size))^beta)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    size : int, optional
-        The size of the local region to be considered for normalization.
-
-    axis : int, optional
-        Input data layout channel axis. Default value is 1 for NCHW format
-
-    bias : float, optional
-        The offset parameter to avoid dividing by 0.
-
-    alpha : float, optional
-        The scaling parameter.
-
-    beta : float, optional
-        The exponent parameter.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.lrn(data, size, axis, alpha, beta, bias)
-
-
-def l2_normalize(data, eps, axis=None):
-    """Perform L2 normalization on the input data
-
-    .. math::
-        y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    eps : float
-        epsilon value
-
-    axis : list of int, optional
-        axis over the normalization applied
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.l2_normalize(data, eps, axis)
-
-
-def dropout(data, rate=0.5):
-    """Applies the dropout operation to the input array.
-
-    During training, each element of the input is set to zero with
-    probability ``p``. The whole array is rescaled by ``1/(1-p)``
-    to keep the expected sum of the input unchanged.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    rate : float, optional (default=0.5)
-        The probability for an element to be reset to 0.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The result of dropout
-    """
-    return expr.TupleWrapper(dropout_raw(data, rate), 2)[0]
-
-
-def dropout_raw(data, rate=0.5):
-    """Applies the dropout operation to the input array.
-
-    During training, each element of the input is set to zero with
-    probability ``p``. The whole array is rescaled by ``1/(1-p)``
-    to keep the expected sum of the input unchanged.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    rate : float, optional (default=0.5)
-        The probability for an element to be reset to 0.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The result of dropout
-    """
-    return _make.dropout(data, rate)
-
-
-def batch_norm(
-    data, gamma, beta, moving_mean, moving_var, axis=1, epsilon=1e-5, center=True, scale=True
-):
-    r"""
-    Batch normalization layer (Ioffe and Szegedy, 2014).
-    Normalizes the input at each batch, i.e. applies a transformation
-    that maintains the mean activation close to 0 and the activation
-    standard deviation close to 1.
-
-    .. math::
-
-        data\_mean[i] = mean(data[:,i,:,...]) \\
-        data\_var[i] = var(data[:,i,:,...])
-
-    Then compute the normalized output, which has the same shape as input, as following:
-
-    .. math::
-
-        out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}}
-            * gamma[i] + beta[i]
-
-    Both *mean* and *var* returns a scalar by treating the input as a vector.
-
-    Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
-    have shape *(k,)*.
-
-    Besides the inputs and the outputs, this operator accepts two auxiliary
-    states, ``moving_mean`` and ``moving_var``, which are *k*-length
-    vectors. They are global statistics for the whole dataset, which are updated by
-
-    .. code:: python
-
-        moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
-        moving_var = moving_var * momentum + data_var * (1 - momentum)
-
-    The parameter ``axis`` specifies which axis of the input shape denotes
-    the 'channel' (separately normalized groups).  The default is 1.
-    Specifying -1 sets the channel axis to be the last item in the input shape.
-
-    .. note::
-
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        Input to which batch_norm will be applied.
-
-    gamma : tvm.relay.Expr
-        The gamma scale factor.
-
-    beta : tvm.relay.Expr
-        The beta offset factor.
-
-    moving_mean : tvm.relay.Expr
-        Running mean of input,
-
-    moving_var : tvm.relay.Expr
-        Running variance of input.
-
-    axis : int, optional, default=1
-        Specify along which shape axis the channel is specified.
-
-    epsilon : double, optional, default=1e-5
-        Small float added to variance to avoid dividing by zero.
-
-    center : boolean, optional, default=True
-        If True, add offset of beta to normalized tensor, If False,
-        beta is ignored.
-
-    scale : boolean, optional, default=True
-        If true, multiply by gamma. If False, gamma is not used.
-        When the next layer is piecewise linear (also e.g. nn.relu),
-        this can be disabled since the scaling will be done by the next layer.
-
-    Returns
-    -------
-    result : relay.Tuple([tvm.relay.Expr, tvm.relay.Expr, tvm.relay.Expr])
-        Tuple of normed data (same shape as input),
-        new running mean (k-length vector),
-        and new running variance (k-length vector)
-    """
-    result = _make.batch_norm(
-        data, gamma, beta, moving_mean, moving_var, axis, epsilon, center, scale
-    )
-    return expr.TupleWrapper(result, 3)
-
-
-def instance_norm(data, gamma, beta, axis=1, epsilon=1e-5, center=True, scale=True):
-    r"""
-    Instance Normalization (Ulyanov and et al., 2016)
-    Applies instance normalization to the n-dimensional input array.
-
-    .. math::
-
-        out = \frac{data - mean(data)}{\sqrt{var(data)+\epsilon}}
-            * gamma + beta
-
-    The instance normalization is similar to batch normalization, but unlike
-    batch normalization, the mean and var are calculated per-dimension
-    separately for each object(instance) in a mini-batch, not over a batch.
-    And the same normalization is applied both at test and train time.
-
-    Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
-    have shape *(k,)*.
-
-    The parameter ``axis`` specifies which axis of the input shape denotes
-    the 'channel'.  The default is 1. Specifying -1 sets the channel axis
-    to be the last item in the input shape.
-
-    .. note::
-
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        Input to which instance_norm will be applied.
-
-    gamma : tvm.relay.Expr
-        The gamma scale factor.
-
-    beta : tvm.relay.Expr
-        The beta offset factor.
-
-    axis : int, optional, default=1
-        Specify along which shape axis the channel is specified.
-
-    epsilon : double, optional, default=1e-5
-        Small float added to variance to avoid dividing by zero.
-
-    center : boolean, optional, default=True
-        If True, add offset of beta to normalized tensor, If False,
-        beta is ignored.
-
-    scale : boolean, optional, default=True
-        If True, multiply by gamma. If False, gamma is not used.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The normalized data.
-
-    .. _`Instance Normalization: The Missing Ingredient for Fast Stylization`:
-        https://arxiv.org/abs/1607.08022
-    """
-    return _make.instance_norm(data, gamma, beta, axis, epsilon, center, scale)
-
-
-def layer_norm(data, gamma, beta, axis=-1, epsilon=1e-5, center=True, scale=True):
-    r"""
-    Layer normalization (Lei Ba and et al., 2016).
-    Applies layer normalization to the n-dimensional input array.
-    This operator takes an n-dimensional input array and normalizes
-    the input using the given axis:
-
-    .. math::
-
-        out = \frac{data - mean(data, axis)}{\sqrt{var(data, axis)+\epsilon}}
-            * gamma + beta
-
-    Unlike batch normalization, the mean and var are computed along the channel dimension.
-
-    Assume the input has size k on axis 1, then both gamma and beta have shape (k,).
-
-    .. note::
-
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        Input to which layer_norm will be applied.
-
-    gamma : tvm.relay.Expr
-        The gamma scale factor.
-
-    beta : tvm.relay.Expr
-        The beta offset factor.
-
-    axis : int, optional, default=-1
-        The axis that should be normalized, typically the axis of the channels.
-
-    epsilon : double, optional, default=1e-5
-        Small float added to variance to avoid dividing by zero.
-
-    center : boolean, optional, default=True
-        If True, add offset of beta to normalized tensor, If False,
-        beta is ignored.
-
-    scale : boolean, optional, default=True
-        If True, multiply by gamma. If False, gamma is not used.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The normalized data.
-    """
-    return _make.layer_norm(data, gamma, beta, axis, epsilon, center, scale)
-
-
-def group_norm(data, gamma, beta, num_groups, axis=1, epsilon=1e-5, center=True, scale=True):
-    r"""
-    Group normalization normalizes over group of channels for each training examples.
-    We can say that, Group Norm is in between Instance Norm and Layer Norm. When we put
-    all the channels into a single group, group normalization becomes Layer normalization.
-    And, when we put each channel into different groups it becomes Instance normalization
-
-    https://arxiv.org/pdf/1803.08494.pdf
-
-    Applies group normalization to the n-dimensional input array by seperating the input channels
-    into 'num_groups' groups, each containing 'num_channels / num_groups' channels.
-    The mean and standard-deviation are calculated separately over the each group. gamma and
-    beta are learnable per-channel affine transform parameter vectors of size num_channels.
-
-    .. math::
-
-        out = \frac{data - mean(data, axis)}{\sqrt{var(data, axis)+\epsilon}}
-            * gamma + beta
-
-    Unlike batch normalization, the mean and var are computed along a group of channels.
-
-    If the input has size k on axis 1, then both gamma and beta have shape (k,).
-
-    .. note::
-
-        This operator can be optimized away for inference.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        Input to which group_norm will be applied.
-
-    gamma : tvm.relay.Expr
-        The gamma scale factor.
-
-    beta : tvm.relay.Expr
-        The beta offset factor.
-
-    num_groups : int
-        The number of groups to separate the channels into.
-
-    axis : int, optional, default=1
-        The axis of the channels.
-
-    epsilon : double, optional, default=1e-5
-        Small float added to variance to avoid dividing by zero.
-
-    center : boolean, optional, default=True
-        If True, add offset of beta to normalized tensor, If False,
-        beta is ignored.
-
-    scale : boolean, optional, default=True
-        If True, multiply by gamma. If False, gamma is not used.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The normalized data.
-    """
-    return _make.group_norm(data, gamma, beta, num_groups, axis, epsilon, center, scale)
-
-
-def batch_matmul(tensor_a, tensor_b, out_dtype="", transpose_a=False, transpose_b=True):
-    r"""
-    Compute batch matrix multiplication of `tensor_a` and `tensor_b`.
-
-    Both `tensor_a` and `tensor_b` can be transposed. For legacy reason, we use NT format
-    (transpose_a=False, transpose_b=True) by default.
-
-    .. math::
-
-        \mbox{batch_matmul}(A, B)[i, :, :] = \mbox{matmul}(A[i, :, :], B[i, :, :])
-
-    Parameters
-    ----------
-    tensor_a : tvm.relay.Expr
-        The first input.
-
-    tensor_b : tvm.relay.Expr
-        The second input.
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision batch matmul.
-
-    transpose_a : Optional[bool] = False
-        Whether the first tensor is in transposed format.
-
-    transpose_b : Optional[bool] = True
-        Whether the second tensor is in transposed format.
-
-    Returns
-    -------
-    result: tvm.relay.Expr
-        The computed result.
-    """
-    return _make.batch_matmul(tensor_a, tensor_b, out_dtype, transpose_a, transpose_b)
-
-
-# pylint: disable=no-else-return,inconsistent-return-statements
-def sparse_dense(dense_mat, sparse_mat, sparse_lhs=False):
-    r"""
-    Computes the matrix multiplication of `dense_mat` and `sparse_mat`, where `dense_mat` is
-    a dense matrix and `sparse_mat` is a sparse (either BSR or CSR) namedtuple with
-    fields `data`, `indices`, and `indptr`.
-
-    \if sparse_lhs=False:
-        .. math::
-
-            \mbox{sparse_dense}(dense_mat, sparse_mat)[m, n]
-            = \mbox{matmul}(D, \mbox{as_dense}(S)^T)[m, n]
-
-    \if sparse_lhs=True:
-        .. math::
-
-            \mbox{sparse_dense}(dense_mat, sparse_mat)[m, n]
-            = \mbox{matmul}(\mbox{as_dense}(S), (D)^T)[m, n]
-
-    where `as_dense` returns dense equivalent of the given S(sparse matrix)
-    while performing matmul with given D(dense matrix).
-
-    See
-    https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
-    and
-    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.bsr_matrix.html
-    for more detail on the sparse matrix representation.
-
-    Parameters
-    ----------
-    dense_mat : tvm.relay.Expr
-        The input dense matrix for the matrix multiplication
-
-    sparse_mat : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]].
-        The input sparse matrix for the matrix multiplication.
-
-    sparse_lhs : bool, optional
-        Indicates whether lhs or rhs matrix is sparse. Default value is False.
-
-    Returns
-    -------
-    result: tvm.relay.Expr
-        The computed result.
-    """
-    if hasattr(sparse_mat, "indices"):
-        return _make.sparse_dense(
-            dense_mat, sparse_mat.data, sparse_mat.indices, sparse_mat.indptr, sparse_lhs
-        )
-    else:
-        return _make.sparse_dense(
-            dense_mat, sparse_mat[0], sparse_mat[1], sparse_mat[2], sparse_lhs
-        )
-
-
-def sparse_transpose(x):
-    r"""
-    Computes the fast matrix transpose of x,
-    where x is a sparse tensor in CSR format (represented as a namedtuple
-    with fields `data`, `indices`, and `indptr`).
-
-    ** Currently only support Square Matrices **
-
-    .. math::
-
-        \mbox{sparse_transpose}(x)[n, n] = (x^T)[n, n]
-
-    Please refer to https://github.com/scipy/scipy/blob/v1.3.0/scipy/sparse/csr.py
-    for the algorithm implemented in this operator.
-
-    Parameters
-    ----------
-    x : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]].
-        The sparse weight matrix for the fast matrix transpose.
-
-    Returns
-    -------
-    result : relay.Tuple([tvm.relay.Expr, tvm.relay.Expr, tvm.relay.Expr])
-        Tuple of output sparse tensor (same shape and format as input),
-        i.e. if CSR then output is in ([data, indices, indptr]) form
-    """
-    if hasattr(x, "indices"):
-        return expr.TupleWrapper(_make.sparse_transpose(x.data, x.indices, x.indptr), 3)
-    return expr.TupleWrapper(_make.sparse_transpose(x[0], x[1], x[2]), 3)
-
-
-# pylint: disable=no-else-return,inconsistent-return-statements
-def sparse_add(dense_mat, sparse_mat):
-    r"""
-    Computes the matrix addition of `dense_mat` and `sparse_mat`, where `dense_mat` is
-    a dense matrix and `sparse_mat` is a sparse (CSR) namedtuple with
-    fields `data`, `indices`, and `indptr`.
-
-    .. math::
-
-        \mbox{sparse_add}(dense_mat, sparse_mat)[m, n] = \mbox{add}(\mbox{as_dense}(S), (D))[m, n]
-
-    where `as_dense` returns dense equivalent of the given S(sparse matrix)
-    while performing addition with given D(dense matrix).
-
-    Parameters
-    ----------
-    dense_mat : tvm.relay.Expr
-        The input dense matrix for the matrix addition
-
-    sparse_mat : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]].
-        The input sparse matrix(CSR) for the matrix addition.
-
-    Returns
-    -------
-    result: tvm.relay.Expr
-        The computed result.
-
-    Examples
-    -------
-    .. code-block:: python
-
-        dense_data = [[ 3.,   4.,   4. ]
-                      [ 4.,  2.,  5. ]]
-        sparse_data = [4., 8.]
-        sparse_indices =[0, 2]
-        sparse_indptr =[0, 1, 2]
-
-        output = relay.sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr)
-
-        output = [[ 7.,   4.,   4. ]
-                  [ 4.,  2.,  13. ]]
-    """
-    if hasattr(sparse_mat, "indices"):
-        return _make.sparse_add(dense_mat, sparse_mat.data, sparse_mat.indices, sparse_mat.indptr)
-    else:
-        return _make.sparse_add(dense_mat, sparse_mat[0], sparse_mat[1], sparse_mat[2])
-
-
-def contrib_conv2d_winograd_without_weight_transform(
-    data,
-    weight,
-    tile_size,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""2D convolution with winograd algorithm.
-
-    The basic parameters are the same as the ones in vanilla conv2d.
-    It assumes the weight is pre-transformed by nn.contrib_conv2d_winograd_weight_transform
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    tile_size : int
-        The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.contrib_conv2d_winograd_without_weight_transform(
-        data,
-        weight,
-        tile_size,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def contrib_conv2d_gemm_without_weight_transform(
-    data,
-    weight,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""2D convolution with gemm algorithm.
-
-    The basic parameters are the same as the ones in vanilla conv2d.
-    It assumes the weight is pre-transformed by nn.contrib_conv2d_gemm_weight_transform
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.contrib_conv2d_gemm_without_weight_transform(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def contrib_conv2d_nchwc(
-    data,
-    kernel,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW8c",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""Variant of 2D convolution.
-
-    This operator takes the weight as the convolution kernel
-    and convolves it with data to produce an output, following a specialized
-    NCHWc data layout.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    kernel : tvm.relay.Expr
-        The kernel expressions.
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.contrib_conv2d_NCHWc(
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def contrib_depthwise_conv2d_nchwc(
-    data,
-    kernel,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW8c",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""Variant of 2D depthwise convolution.
-
-    This operator takes the weight as the depthwise convolution kernel
-    and depthwise convolves it with data to produce an output, following a specialized
-    NCHWc data layout.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    kernel : tvm.relay.Expr
-        The kernel expressions.
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.contrib_depthwise_conv2d_NCHWc(
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def contrib_conv2d_winograd_weight_transform(weight, tile_size):
-    r"""Weight Transformation part for 2D convolution with winograd algorithm.
-
-    We separate this as a single op to enable pre-compute for inference.
-    Use this together with nn.contrib_conv2d_winograd_without_weight_transform
-
-    Parameters
-    ----------
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    tile_size : int
-        The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size)
-
-
-def contrib_conv2d_gemm_weight_transform(weights, tile_N, tile_K):
-    r"""Weight Transformation part for 2D convolution with gemm algorithm.
-
-    We separate this as a single op to enable pre-compute for inference.
-    Use this together with nn.contrib_conv2d_gemm_without_weight_transform
-
-    Parameters
-    ----------
-    weights : tvm.relay.Expr
-        The weight expressions.
-    tile_N: int
-        Tile size across N axis of the weight transformation for ConvGemm. (N = OC)
-    tile_K: int
-       Tile size across K axis of the weight transformation for ConvGemm. (K = KW * KH * IC)
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.contrib_conv2d_gemm_weight_transform(weights, tile_N, tile_K)
-
-
-def contrib_conv3d_winograd_weight_transform(weight, tile_size):
-    r"""Weight Transformation part for 3D convolution with winograd algorithm.
-
-    We separate this as a single op to enable pre-compute for inference.
-    Use this together with nn.contrib_conv3d_winograd_without_weight_transform
-
-    Parameters
-    ----------
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    tile_size : int
-        The Tile size of winograd. E.g. 2 for F(2x2x2, 3x3x3) and 4 for F(4x4x4, 3x3x3)
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.contrib_conv3d_winograd_weight_transform(weight, tile_size)
-
-
-def contrib_conv2d_winograd_nnpack_weight_transform(weight, convolution_algorithm, out_dtype=""):
-    r"""Weight Transformation part for 2D convolution with winograd algorithm.
-
-    We separate this as a single op to enable pre-compute for inference.
-    Use this together with nn.contrib_conv2d_winograd_without_weight_transform
-
-    Parameters
-    ----------
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    convolution_algorithm : int
-        The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.contrib_conv2d_winograd_nnpack_weight_transform(
-        weight, convolution_algorithm, out_dtype
-    )
-
-
-def deformable_conv2d(
-    data,
-    offset,
-    weight,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    deformable_groups=1,
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="",
-):
-    r"""Deformable 2d convolution.
-
-    The deformable convolution operation is described in https://arxiv.org/abs/1703.06211
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    offset : tvm.relay.Expr
-        The offset expressions.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    deformable_groups : int, optional
-        Number of deformable groups.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.deformable_conv2d(
-        data,
-        offset,
-        weight,
-        strides,
-        padding,
-        dilation,
-        deformable_groups,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def bitpack(data, bits=1, pack_axis=1, bit_axis=2, pack_type="uint32", name="BitPack"):
-    """Tensor packing for bitserial operations.
-
-    The values along the input tensor's pack_axis are quantized
-    and packed together into the specified pack_type in a new bit axis.
-
-    For example, consider bitpacking with data to be a tensor with shape `[1, 64, 128, 128]`,
-    pack_axis=1, bit_axis=4, pack_type=uint8, and bits=2. The output in this case will
-    be of shape `[1, 8, 128, 128, 2]`. The dimension of axis 1 has been reduced by a factor
-    of 8 since each value is packed into an 8-bit uint8. Axis 4 is now two bitplanes
-    representing the quantized value of the incoming data. The output tensor is now
-    ready to be used in a bitserial operation.
-
-    Parameters
-    ----------
-    data : tvm.relay.expr
-        The incoming tensor to be packed.
-
-    bits : int
-        Number of bits that should be packed.
-
-    pack_axis : int
-        Axis that should be decomposed and packed.
-
-    bit_axis : int
-        New axis containing bitplane.
-
-    pack_type : str
-        Datatype to pack bits into.
-
-    name : str, optional
-        Name of the operation.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The packed tensor.
-    """
-    return _make.bitpack(data, bits, pack_axis, bit_axis, pack_type, name)
-
-
-def bitserial_conv2d(
-    data,
-    weight,
-    strides=(1, 1),
-    padding=(0, 0),
-    channels=None,
-    kernel_size=(3, 3),
-    activation_bits=1,
-    weight_bits=1,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    pack_dtype="uint32",
-    out_dtype="int16",
-    unipolar=True,
-):
-    r"""2D convolution using bitserial computation.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial of the convolution kernel.
-
-    activation_bits : int
-        Number of bits to pack for activations.
-
-    weight_bits : int
-        Number of bits to pack for weights.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the kernel
-
-    pack_dtype: str, optional
-        Datatype to pack bits into.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.bitserial_conv2d(
-        data,
-        weight,
-        strides,
-        padding,
-        channels,
-        kernel_size,
-        activation_bits,
-        weight_bits,
-        data_layout,
-        kernel_layout,
-        pack_dtype,
-        out_dtype,
-        unipolar,
-    )
-
-
-def bitserial_dense(
-    data,
-    weight,
-    units=None,
-    data_bits=1,
-    weight_bits=1,
-    pack_dtype="uint32",
-    out_dtype="int16",
-    unipolar=True,
-):
-    """Bitserial Dense operator.
-    Applies matrix multiplication of two quantized matrices
-    using a fast bitserial algorithm.
-
-    .. math::
-
-    `Y = X * W`
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    units : int, optional
-        Number of hidden units of the dense transformation.
-
-    data_bits : int
-        Number of bits incoming tensor should be packed with.
-
-    weight_bits : int
-        Number of bits weight tensor should be packed with.
-
-    pack_dtype : str, optional
-        Datatype to pack individual bits into before computation.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision dense.
-
-    unipolar : bool, optional
-        Whether to use unipolar or bipolar quantization for inputs.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    return _make.bitserial_dense(
-        data, weight, units, data_bits, weight_bits, pack_dtype, out_dtype, unipolar
-    )
-
-
-def cross_entropy(predictions, targets):
-    """CrossEntropy without logits.
-
-    Parameters
-    ----------
-    predictions : tvm.relay.Expr
-      The predictions.
-
-    targets : tvm.relay.Expr
-      The targets.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-      The computed result.
-    """
-    return _make.cross_entropy(predictions, targets)
-
-
-def cross_entropy_with_logits(predictions, targets):
-    """CrossEntropy with logits.
-
-    Parameters
-    ----------
-    predictions : tvm.relay.Expr
-      The predictions.
-
-    targets : tvm.relay.Expr
-      The targets.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-      The computed result.
-    """
-    return _make.cross_entropy_with_logits(predictions, targets)
-
-
-def nll_loss(predictions, targets, weights, reduction="mean", ignore_index=-100):
-    """Negative log likelihood loss.
-
-    output{n, i_1, i_2, ..., i_k} = -p * w
-      where t = target{n, i_1, i_2, ..., i_k}
-            p = predictions{n, t, i_1, i_2, i_k}
-            w = weights{n, i_1, i_2, ..., i_k} if t != ignore_index else 0
-
-    result = reduction(output)
-
-    Parameters
-    ----------
-    predictions : tvm.relay.Expr
-      The predictions.
-
-    targets : tvm.relay.Expr
-      The target value of each prediction.
-
-    weights : tvm.relay.Expr
-      The weight of each target value.
-
-    reduction : string
-      The reduction method to apply to the output.
-      Possible values are "mean", "sum" and "none".
-
-    ignore_index : int
-      The target value to ignore.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-      The computed result.
-    """
-    return _make.nll_loss(predictions, targets, weights, reduction, ignore_index)
-
-
-def depth_to_space(data, block_size, layout="NCHW", mode="DCR"):
-    """Convert channels into spatial blocks.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        Input data with channels divisible by block_size**2
-
-    block_size : int
-        Size of blocks to convert channels into.
-
-    layout : string
-        One of NCHW or NHWC, indicates channel axis.
-
-    mode : string
-        One of DCR or CDR, indicates which order channels
-        are accessed in.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        Tensor with shape [in_batch, in_channel / block_size * block_size,
-                           in_height * block_size, in_width * block_size]
-    """
-    return _make.depth_to_space(data, block_size, layout, mode)
-
-
-def space_to_depth(data, block_size, layout="NCHW"):
-    """Convert spatial blocks into channels.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        Input data with spatial dimensions divisible by block_size
-
-    block_size : int
-        Size of blocks to decompose into channels.
-
-    layout : string
-        One of NCHW or NHWC, indicates channel axis.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        Tensor with shape [in_batch, in_channel * block_size * block_size,
-                           in_height / block_size, in_width / block_size]
-    """
-    return _make.space_to_depth(data, block_size, layout)
-
-
-def adaptive_max_pool1d(data, output_size=None, layout="NCW", out_layout=""):
-    r"""1D adaptive max pooling operator. This operator is experimental.
-
-    This operator takes data as input and does 1D max value calculation
-    across each window represented by W.
-
-
-    In the default case, where the data_layout is `NCW`
-    a data Tensor with shape `(batch_size, in_channels, width)`,
-    to produce an output Tensor with shape
-    (batch_size, in_channels, output_width).
-
-    The pooling kernel and stride sizes are automatically chosen for
-    desired output sizes.
-
-    For output_size:
-        If this argument is not provided, input height and width will be used
-        as output height and width.
-
-        If a single integer is provided for output_size, the output size is
-        (N x C x output_size) for any input (NCW).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    output_size : tuple of int. optional
-        Output height and width.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [] or output_size
-    if isinstance(output_size, int):
-        output_size = [output_size]
-    return _make.adaptive_max_pool1d(data, output_size, layout, out_layout)
-
-
-def adaptive_avg_pool1d(data, output_size=None, layout="NCW", out_layout=""):
-    r"""1D adaptive average pooling operator. This operator is experimental.
-
-    This operator takes data as input and does 1D average value calculation
-    across each window represented by W.
-
-
-    In the default case, where the data_layout is `NCW`
-    a data Tensor with shape `(batch_size, in_channels, width)`,
-    to produce an output Tensor with shape
-    (batch_size, in_channels, output_width).
-
-    The pooling kernel and stride sizes are automatically chosen for
-    desired output sizes.
-
-    For output_size:
-        If this argument is not provided, input height and width will be used
-        as output width.
-
-        If a single integer is provided for output_size, the output size is
-        (N x C x output_size) for any input (NCW).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    output_size : tuple of int. optional
-        Output height and width.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [] or output_size
-    if isinstance(output_size, int):
-        output_size = [output_size]
-    return _make.adaptive_avg_pool1d(data, output_size, layout, out_layout)
-
-
-def adaptive_max_pool2d(data, output_size=None, layout="NCHW", out_layout=""):
-    r"""2D adaptive max pooling operator. This operator is experimental.
-
-    This operator takes data as input and does 2D max value calculation
-    across each window represented by WxH.
-
-
-    In the default case, where the data_layout is `NCHW`
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    to produce an output Tensor with shape
-    (batch_size, in_channels, output_height, output_width).
-
-    The pooling kernel and stride sizes are automatically chosen for
-    desired output sizes.
-
-    For output_size:
-        If this argument is not provided, input height and width will be used
-        as output height and width.
-
-        If a single integer is provided for output_size, the output size is
-        (N x C x output_size x output_size) for any input (NCHW).
-
-        If a tuple of integers (height, width) are provided for output_size,
-        the output size is (N x C x height x width) for any input (NCHW).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    output_size : tuple of int. optional
-        Output height and width.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [] or output_size
-    return _make.adaptive_max_pool2d(data, output_size, layout, out_layout)
-
-
-def adaptive_avg_pool2d(data, output_size=None, layout="NCHW", out_layout=""):
-    r"""2D adaptive average pooling operator. This operator is experimental.
-
-    This operator takes data as input and does 2D average value calculation
-    across each window represented by WxH.
-
-
-    In the default case, where the data_layout is `NCHW`
-    a data Tensor with shape `(batch_size, in_channels, height, width)`,
-    to produce an output Tensor with shape
-    (batch_size, in_channels, output_height, output_width).
-
-    The pooling kernel and stride sizes are automatically chosen for
-    desired output sizes.
-
-    For output_size:
-        If this argument is not provided, input height and width will be used
-        as output height and width.
-
-        If a single integer is provided for output_size, the output size is
-        (N x C x output_size x output_size) for any input (NCHW).
-
-        If a tuple of integers (height, width) are provided for output_size,
-        the output size is (N x C x height x width) for any input (NCHW).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    output_size : tuple of int. optional
-        Output height and width.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [] or output_size
-    return _make.adaptive_avg_pool2d(data, output_size, layout, out_layout)
-
-
-def adaptive_max_pool3d(data, output_size=None, layout="NCDHW", out_layout=""):
-    r"""3D adaptive max pooling operator. This operator is experimental.
-
-    This operator takes data as input and does 3D max value calculation
-    across each window represented by DxWxH.
-
-    In the default case, where the data_layout is `NCDHW`
-    a data Tensor with shape `(batch_size, in_channels, depth, height, width)`,
-    to produce an output Tensor with shape
-    (batch_size, in_channels, output_depth, output_height, output_width).
-
-    The pooling kernel and stride sizes are automatically chosen for
-    desired output sizes.
-
-    For output_size:
-        If this argument is not provided, input depth, height and width will be used
-        as output depth, height and width.
-
-        If a single integer is provided for output_size, the output size is
-        (N x C x output_size x output_size x output_size) for any input (NCDHW).
-
-        If a tuple of integers (depth, height, width) are provided for output_size,
-        the output size is (N x C x depth x height x width) for any input (NCDHW).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    output_size : tuple of int. optional
-        Output height and width.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [] or output_size
-    return _make.adaptive_max_pool3d(data, output_size, layout, out_layout)
-
-
-def adaptive_avg_pool3d(data, output_size=None, layout="NCDHW", out_layout=""):
-    r"""3D adaptive avg pooling operator. This operator is experimental.
-
-    This operator takes data as input and does 3D avg value calculation
-    across each window represented by DxWxH.
-
-    In the default case, where the data_layout is `NCDHW`
-    a data Tensor with shape `(batch_size, in_channels, depth, height, width)`,
-    to produce an output Tensor with shape
-    (batch_size, in_channels, output_depth, output_height, output_width).
-
-    The pooling kernel and stride sizes are automatically chosen for
-    desired output sizes.
-
-    For output_size:
-        If this argument is not provided, input depth, height and width will be used
-        as output depth, height and width.
-
-        If a single integer is provided for output_size, the output size is
-        (N x C x output_size x output_size x output_size) for any input (NCDHW).
-
-        If a tuple of integers (depth, height, width) are provided for output_size,
-        the output size is (N x C x depth x height x width) for any input (NCDHW).
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    output_size : tuple of int. optional
-        Output height and width.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [] or output_size
-    return _make.adaptive_avg_pool3d(data, output_size, layout, out_layout)
-
-
-def global_max_pool1d(data, layout="NCW", out_layout=""):
-    r"""1D global maximum pooling operator.
-
-    This operator takes data as input and does 1D max value calculation
-    across each window represented by W.
-
-    In the default case, where the data_layout is `NCW`
-    a data Tensor with shape `(batch_size, in_channels, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, w)
-    .. math::
-
-        \mbox{out}(b, c, 1)  = \max_{n=0, \ldots, w} \mbox{data}(b, c, n)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [1]
-    return _make.adaptive_max_pool1d(data, output_size, layout, out_layout)
-
-
-def global_avg_pool1d(data, layout="NCW", out_layout=""):
-    r"""1D global average pooling operator.
-
-    This operator takes data as input and does 1D average value calculation
-    across each window represented by W.
-
-    In the default case, where the data_layout is `NCW`
-    a data Tensor with shape `(batch_size, in_channels, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, w)
-
-    .. math::
-
-        \mbox{out}(b, c, 1)  = \frac{1}{w} \sum_{n=0}^{w-1} \mbox{data}(b, c, n)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [1]
-    return _make.adaptive_avg_pool1d(data, output_size, layout, out_layout)
-
-
-def global_max_pool3d(data, layout="NCDHW", out_layout=""):
-    r"""3D global maximum pooling operator.
-
-    This operator takes data as input and does 3D max value calculation
-    across each window represented by DxWxH.
-
-    In the default case, where the data_layout is `NCDHW`
-    a data Tensor with shape `(batch_size, in_channels, depth, height, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, d, h, w)
-    .. math::
-
-        \mbox{out}(b, c, 1, 1, 1)  =  \max_{l=0, \ldots, d},  \max_{m=0, \ldots, h},
-             \max_{n=0, \ldots, w} \mbox{data}(b, c, l, m, n)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [1, 1, 1]
-    return _make.adaptive_max_pool3d(data, output_size, layout, out_layout)
-
-
-def global_avg_pool3d(data, layout="NCDHW", out_layout=""):
-    r"""3D global average pooling operator.
-
-    This operator takes data as input and does 3D average value calculation
-    across each window represented by DxWxH.
-
-    In the default case, where the data_layout is `NCDHW`
-    a data Tensor with shape `(batch_size, in_channels, depth, height, width)`,
-    to produce an output Tensor with the following rule:
-
-    with data of shape (b, c, d, h, w)
-
-    .. math::
-
-        \mbox{out}(b, c, 1, 1, 1)  = \frac{1}{d * h * w} \sum_{l=0}^{d-1}  \sum_{m=0}^{h-1}
-             \sum_{n=0}^{w-1} \mbox{data}(b, c, l, m, n)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    layout : str, optional
-        Layout of the input.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    output_size = [1, 1, 1]
-    return _make.adaptive_avg_pool3d(data, output_size, layout, out_layout)
-
-
-def correlation(
-    data1, data2, kernel_size, max_displacement, stride1, stride2, padding, is_multiply, layout
-):
-    r"""Applies correlation to inputs.
-
-    The correlation layer performs multiplicative patch comparisons between two feature maps.
-    Given two multi-channel feature maps :math:`f_{1}, f_{2}`, with :math:`w`, :math:`h`, and
-    :math:`c` being their width, height, and number of channels, the correlation layer lets the
-    network compare each patch from :math:`f_{1}` with each patch from :math:`f_{2}`.
-
-    For now we consider only a single comparison of two patches. The 'correlation' of two patches
-    centered at :math:`x_{1}` in the first map and :math:`x_{2}` in the second map is then defined
-    as:
-
-    .. math::
-
-        c(x_{1}, x_{2}) = \sum_{o \in [-k,k] \times [-k,k]} <f_{1}(x_{1} + o), f_{2}(x_{2} + o)>
-
-    for a square patch of size :math:`K:=2k+1`.
-
-    Note that the equation above is identical to one step of a convolution in neural networks, but
-    instead of convolving data with a filter, it convolves data with other    data. For this
-    reason, it has no training weights.
-
-    Computing :math:`c(x_{1}, x_{2})` involves :math:`c * K^{2}` multiplications. Comparing all
-    patch combinations involves :math:`w^{2}*h^{2}` such computations.
-
-    Given a maximum displacement :math:`d`, for each location :math:`x_{1}` it computes
-    correlations :math:`c(x_{1}, x_{2})` only in a neighborhood of size :math:`D:=2d+1`,
-    by limiting the range of :math:`x_{2}`. We use strides :math:`s_{1}, s_{2}`, to quantize
-    :math:`x_{1}` globally and to quantize :math:`x_{2}` within the neighborhood
-    centered around :math:`x_{1}`.
-
-    The final output is defined by the following expression:
-
-    .. math::
-
-        out[n, q, i, j] = c(x_{i, j}, x_{q})
-
-    where :math:`i` and :math:`j` enumerate spatial locations in :math:`f_{1}`, and :math:`q`
-    denotes the :math:`q^{th}` neighborhood of :math:`x_{i,j}`.
-
-    Parameters
-    ----------
-    data1 : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    data2 : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    kernel_size: int
-        Kernel size for correlation, must be an odd number
-
-    max_displacement: int
-        Max displacement of Correlation
-
-    stride1: int
-        Stride for data1
-
-    stride2: int
-        Stride for data2 within the neightborhood centered around data1
-
-    padding : int or a list/tuple of 2 or 4 ints
-        Padding size, or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints
-
-    is_multiply: bool
-        operation type is either multiplication or substraction
-
-    layout: str
-        layout of data1, data2 and the output
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    if isinstance(padding, int):
-        padding = (padding, padding)
-    return _make.correlation(
-        data1, data2, kernel_size, max_displacement, stride1, stride2, padding, is_multiply, layout
-    )
-
-
-def space_to_batch_nd(data, block_shape, paddings, pad_value=0):
-    r"""Divide spatial dimensions of the data into a grid of blocks
-    and interleave them into batch dim.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        N-D with shape [batch, spatial_shape, remaining_shape]
-
-    block_shape : relay.Expr
-        1-D of size [M] where M is number of spatial dims, specifies block size
-        for each spatial dimension.
-
-    paddings : relay.Expr
-        2-D of shape [M, 2] where M is number of spatial dims, specifies
-        [before, after] paddings for each spatial dimension.
-
-    pad_value : float, or relay.Expr, optional, default=0
-        The value used for padding.
-
-    Returns
-    -------
-    result : relay.Expr
-        N-D Tensor with shape
-        [in_batch * prod(block_shape),
-        padded_data[1] / block_shape[0], ..., padded_data[M] / block_shape[M-1],
-        remaining_shape]
-    """
-
-    return _make.space_to_batch_nd(data, block_shape, paddings, pad_value)
-
-
-def batch_to_space_nd(data, block_shape, crops):
-    r"""Reshape the batch dimension into spatial dimensions.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        N-D with shape [batch, spatial_shape, remaining_shape]
-
-    block_shape : relay.Expr
-        1-D of size [M] where M is number of spatial dims, specifies block size
-        for each spatial dimension.
-
-    crops : relay.Expr
-        2-D of shape [M, 2] where M is number of spatial dims, specifies
-        [begin, end] crop size for each spatial dimension.
-
-    Returns
-    -------
-    result : relay.Expr
-        N-D Tensor with shape
-        [batch / prod(block_shape),
-        in_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], ...,
-        in_shape[M] * block_shape[M-1] - crops[M-1, 0] - crops[M-1, 1],
-        remaining_shape]
-    """
-
-    return _make.batch_to_space_nd(data, block_shape, crops)
-
-
-def conv2d_backward_weight(
-    grad,
-    data,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    grad_layout="NCHW",
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    out_dtype="",
-):
-    r"""The gradient of conv2d with respect to weight.
-
-    This operator takes the output gradient `grad` and convolves it with `data` as
-    the convolution kernel, to produce the gradient with respect to weight.
-
-    Note that the parameter `kernel_size` is the spatial size of the corresponding
-    forward convolution kernel, not that of `data`. `grad_layout` and
-    `kernel_layout` are the layouts of `grad` and the weight gradient respectively.
-
-    Other parameters are the same as the conv2d op. See its documentation for more
-    details.
-
-    """
-    if isinstance(kernel_size, int):
-        kernel_size = (kernel_size, kernel_size)
-    if isinstance(strides, int):
-        strides = (strides, strides)
-    if isinstance(dilation, int):
-        dilation = (dilation, dilation)
-    padding = get_pad_tuple2d(padding)
-
-    return _make.conv2d_backward_weight(
-        grad,
-        data,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        grad_layout,
-        data_layout,
-        kernel_layout,
-        out_dtype,
-    )
diff --git a/python/tvm/relay/op/nn/utils.py b/python/tvm/relay/op/nn/utils.py
deleted file mode 100644
index 0286f0a8f4fb..000000000000
--- a/python/tvm/relay/op/nn/utils.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable
-"""NN operator common utilities"""
-from tvm.ir import container
-
-
-def get_pad_tuple1d(padding):
-    """Common code to get the 1 dimensional pad option
-    Parameters
-    ----------
-    padding : Union[int, Tuple[int, ...]]
-        Padding size
-    Returns
-    -------
-    pad_left : int
-        Padding size on left
-    pad_right : int
-        Padding size on right.
-    """
-    # compute the padding size
-    if isinstance(padding, container.Array):
-        padding = list(padding)
-    if isinstance(padding, (tuple, list)):
-        if len(padding) == 1:
-            pad_w = padding[0] * 2
-        elif len(padding) == 2:
-            return padding[0], padding[1]
-        else:
-            raise ValueError("Size of padding can only be 1 or 2")
-    elif isinstance(padding, int):
-        pad_w = padding * 2
-    else:
-        raise ValueError(f"Unknown padding option {padding}")
-    pad_left = (pad_w + 1) // 2
-    return pad_left, pad_w - pad_left
-
-
-def get_pad_tuple2d(padding):
-    """Common code to get the pad option
-    Parameters
-    ----------
-    padding : Union[int, Tuple[int, ...]]
-        Padding size
-    Returns
-    -------
-    pad_top : int
-        Padding size on top
-    pad_left : int
-        Padding size on left
-    pad_down : int
-        Padding size on down.
-    pad_right : int
-        Padding size on right.
-    """
-    # compute the padding size
-    if isinstance(padding, container.Array):
-        padding = list(padding)
-    if isinstance(padding, (tuple, list)):
-        if len(padding) == 2:
-            pad_h = padding[0] * 2
-            pad_w = padding[1] * 2
-        elif len(padding) == 4:
-            return padding[0], padding[1], padding[2], padding[3]
-        else:
-            raise ValueError("Size of padding can only be 2 or 4")
-    elif isinstance(padding, int):
-        pad_h = pad_w = padding * 2
-    else:
-        raise ValueError(f"Unknown padding option {padding}")
-    pad_top = (pad_h + 1) // 2
-    pad_left = (pad_w + 1) // 2
-    return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left
-
-
-def get_pad_tuple3d(padding):
-    """Common code to get the pad option
-    Parameters
-    ----------
-    padding : Union[int, Tuple[int, ...]]
-        Padding size
-    Returns
-    -------
-    pad_front : int
-        Padding size on front
-    pad_top : int
-        Padding size on top
-    pad_left : int
-        Padding size on left
-    pad_back : int
-        Padding size on back
-    pad_down : int
-        Padding size on down.
-    pad_right : int
-        Padding size on right.
-    """
-    # compute the padding size
-    if isinstance(padding, container.Array):
-        padding = list(padding)
-    if isinstance(padding, (tuple, list)):
-        if len(padding) == 3:
-            pad_d = padding[0] * 2
-            pad_h = padding[1] * 2
-            pad_w = padding[2] * 2
-        elif len(padding) == 6:
-            return padding[0], padding[1], padding[2], padding[3], padding[4], padding[5]
-        else:
-            raise ValueError("Size of padding can only be 3 or 6")
-    elif isinstance(padding, int):
-        pad_d = pad_h = pad_w = padding * 2
-    else:
-        raise ValueError(f"Unknown padding option {padding}")
-    pad_front = (pad_d + 1) // 2
-    pad_top = (pad_h + 1) // 2
-    pad_left = (pad_w + 1) // 2
-    return pad_front, pad_top, pad_left, pad_d - pad_front, pad_h - pad_top, pad_w - pad_left
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
deleted file mode 100644
index d897a68f2056..000000000000
--- a/python/tvm/relay/op/op.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument,invalid-name
-"""The base node types for the Relay language."""
-import tvm._ffi
-import tvm.ir
-import tvm.ir._ffi_api
-from tvm.driver import build, lower
-from tvm.runtime import Object
-from tvm.target import GenericFunc, get_native_generic_func
-
-from . import _make
-
-
-def get(op_name):
-    """Get the Op for a given name
-
-    Parameters
-    ----------
-    op_name : str
-        The operator name
-
-    Returns
-    -------
-    op : Op
-        The op of the corresponding name
-    """
-    return tvm.ir.Op.get(op_name)
-
-
-def register(op_name, describe=""):
-    """Get the Op for a given name.
-    when the op_name is not registered, create a new empty op with the given name.
-    when the op_name has been registered, abort with an error message.
-
-    Parameters
-    ----------
-    op_name : str
-        The operator name
-
-    describe : Optional[str]
-        The operator description
-    """
-
-    tvm.ir._ffi_api.RegisterOp(op_name, describe)
-
-
-def register_stateful(op_name, stateful, level=10):
-    """Register stateful flag for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    stateful : bool
-        The stateful flag.
-
-    level : int
-        The priority level
-    """
-    tvm.ir.register_op_attr(op_name, "TOpIsStateful", stateful, level)
-
-
-class OpPattern(object):
-    """Operator generic patterns
-
-    See Also
-    --------
-    topi.tag : Contains explanation of the tag type.
-    """
-
-    # Elementwise operator
-    ELEMWISE = 0
-    # Broadcast operator
-    BROADCAST = 1
-    # Injective mapping
-    INJECTIVE = 2
-    # Communication
-    COMM_REDUCE = 3
-    # Complex op, can still fuse ewise into it
-    OUT_ELEMWISE_FUSABLE = 4
-    # Represents tuple node
-    TUPLE = 7
-    # Not fusable opaque op
-    OPAQUE = 8
-
-
-@tvm._ffi.register_object("relay.OpImplementation")
-class OpImplementation(Object):
-    """Operator implementation"""
-
-    def compute(self, attrs, inputs, out_type):
-        """Call compute function.
-
-        Parameters
-        ----------
-        attrs : Attrs
-            Op attributes.
-
-        inputs : list[te.tensor.Tensor]
-            The input tensors.
-
-        out_type : relay.Type
-            The output type.
-
-        Returns
-        -------
-        outs : list[te.tensor.Tensor]
-            The output tensors.
-        """
-        return _OpImplementationCompute(self, attrs, inputs, out_type)
-
-    def schedule(self, attrs, outs, target):
-        """Call schedule function.
-
-        Parameters
-        ----------
-        attrs : Attrs
-            Op attributes.
-
-        outs : list[te.tensor.Tensor]
-            The output tensors.
-
-        target : tvm.target.Target
-            The target to schedule the op.
-
-        Returns
-        -------
-        schedule : tvm.te.Schedule
-            The schedule.
-        """
-        return _OpImplementationSchedule(self, attrs, outs, target)
-
-
-@tvm._ffi.register_object("relay.OpSpecialization")
-class OpSpecialization(Object):
-    """Operator specialization"""
-
-
-@tvm._ffi.register_object("relay.OpStrategy")
-class OpStrategy(Object):
-    """Operator strategy"""
-
-    def __init__(self):
-        self.__init_handle_by_constructor__(_make.OpStrategy)
-
-    def add_implementation(self, compute, schedule, name="default", plevel=10):
-        """Add an implementation to the strategy
-
-        Parameters
-        ----------
-        compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type)
-                           -> List[Tensor]
-            The compute function.
-
-        schedule : function (attrs: Attrs, outs: List[Tensor], target:Target) -> Schedule
-            The schedule function.
-
-        name : str
-            The name of implementation.
-
-        plevel : int
-            The priority level of implementation.
-        """
-        _OpStrategyAddImplementation(self, compute, schedule, name, plevel)
-
-
-def _wrap_default_fstrategy(compute, schedule, name):
-    def _fstrategy(attrs, inputs, out_type, target):
-        strategy = OpStrategy()
-        strategy.add_implementation(compute, schedule, name=name)
-        return strategy
-
-    return _fstrategy
-
-
-def _create_fstrategy_from_schedule(op_name, schedule):
-    assert hasattr(schedule, "dispatch_dict")
-    compute = get(op_name).get_attr("FTVMCompute")
-    assert compute is not None, f"FTVMCompute is not registered for op {op_name}"
-    fstrategy = get_native_generic_func(f"{op_name}_strategy")
-    name_pfx = schedule.__name__
-    name_pfx = name_pfx[name_pfx.index("_") + 1 :]
-    fstrategy.set_default(
-        _wrap_default_fstrategy(compute, schedule.fdefault, f"{name_pfx}.generic")
-    )
-    for key, sch in schedule.dispatch_dict.items():
-        fstrategy.register(_wrap_default_fstrategy(compute, sch, f"{name_pfx}.{key}"), [key])
-    return fstrategy
-
-
-def register_compute(op_name, compute=None, level=10):
-    """Register compute function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type)
-                       -> List[Tensor]
-        The compute function.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMCompute", compute, level)
-
-
-def register_strategy(op_name, fstrategy=None, level=10):
-    """Register strategy function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    fstrategy : function (attrs: Attrs, inputs: List[Tensor], out_type: Type,
-                          target:Target) -> OpStrategy
-        The strategy function. Need to be native GenericFunc.
-
-    level : int
-        The priority level
-    """
-    if not isinstance(fstrategy, GenericFunc):
-        assert hasattr(fstrategy, "generic_func_node")
-        fstrategy = fstrategy.generic_func_node
-    return tvm.ir.register_op_attr(op_name, "FTVMStrategy", fstrategy, level)
-
-
-def register_schedule(op_name, schedule, level=10):
-    """Register schedule function for an op.
-
-    This is used when compute function is the same for all targets and only
-    schedule is different. It requires FTVMCompute is already registered to
-    the op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    schedule : function (attrs: Attrs, outs: List[Tensor], target:Target) -> Schedule
-        The schedule function. Need to be target.generic_func.
-
-    level : int
-        The priority level
-    """
-    fstrategy = _create_fstrategy_from_schedule(op_name, schedule)
-    return register_strategy(op_name, fstrategy, level)
-
-
-def register_injective_schedule(op_name, level=10):
-    """Register injective schedule function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    level : int
-        The priority level
-    """
-    return register_schedule(op_name, _schedule_injective, level)
-
-
-def register_broadcast_schedule(op_name, level=10):
-    """Register broadcast schedule function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    level : int
-        The priority level
-    """
-    return register_schedule(op_name, _schedule_injective, level)
-
-
-def register_reduce_schedule(op_name, level=10):
-    """Register reduce schedule function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    level : int
-        The priority level
-    """
-    return register_schedule(op_name, _schedule_reduce, level)
-
-
-def register_alter_op_layout(op_name, alter_layout=None, level=10):
-    """Register alter op layout function for an op
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    alter_layout: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr
-        The function for changing the layout or replacing the operator
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMAlterOpLayout", alter_layout, level)
-
-
-def register_convert_op_layout(op_name, convert_layout=None, level=10):
-    """Register convert op layout function for an op
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    convert_layout: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr
-        The function for changing the layout or replacing the operator
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMConvertOpLayout", convert_layout, level)
-
-
-def register_infer_correct_layout(op_name, infer_layout=None, level=10):
-    """Register infer op layout function for an op
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    infer_layout: function (attrs: Attrs, inputs: List[Layout]) -> InferCorrectLayoutOutput
-        The function to infer correct layout
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FInferCorrectLayout", infer_layout, level)
-
-
-def register_legalize(op_name, legal_op=None, level=10):
-    """Register legal transformation function for an op
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    legal_op: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr
-        The function for transforming an expr to another expr.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMLegalize", legal_op, level)
-
-
-def register_pattern(op_name, pattern, level=10):
-    """Register operator pattern for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    pattern : int
-        The pattern being used.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "TOpPattern", pattern, level)
-
-
-def register_gradient(op_name, fgradient=None, level=10):
-    """Register operator gradient function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    fgradient : function (orig_expr : Expr, output_grad : Expr) -> new_expr : Expr
-        The gradient being used.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FPrimalGradient", fgradient, level)
-
-
-def register_shape_func(op_name, data_dependent, shape_func=None, level=10):
-    """Register operator shape function for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the op.
-
-    data_dependent : bool or list of bool
-        Whether the shape function depends on input data. If this is a list of bool,
-        the length of the list must be the same as the number of arguments of this op.
-        The list specifies per-input data dependence of the op.
-
-    shape_func : function (attrs: Attrs, inputs: List[Tensor], out_ndims: List[IndexExpr])
-                 -> shape_tensors: List<Tensor>
-        The function for computing the dynamic output shapes
-
-    level : int
-        The priority level
-    """
-    if not isinstance(data_dependent, list):
-        data_dependent = [data_dependent]
-    get(op_name).set_attr("TShapeDataDependent", data_dependent, level)
-    return tvm.ir.register_op_attr(op_name, "FShapeFunc", shape_func, level)
-
-
-def register_external_compiler(op_name, fexternal=None, level=10):
-    """Register the external compiler for an op.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator.
-
-    fexternal : function (attrs: Attrs, args: List[Expr], compiler: str)
-              -> new_expr: Expr
-        The function for wrapping a call expr with compiler_begin and
-        compiler_end.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMExternalCompiler", fexternal, level)
-
-
-def register_fake_quantization_to_integer(op_name, func=None, level=10):
-    """Register quantize function for an op
-
-    Given an op and Affine Types on it's inputs, this function should return the op
-    in affine space/integer operators and the new type of the output, where affine
-    denotes the transformation x_real = (x_affine - zero_point) * scale
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    func: function (expr: Expr, map: Map<Expr, AffineType>) -> new_expr: Expr
-        The function for translating the op into affine space and integer operators
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMFakeQuantizationToInteger", func, level)
-
-
-def register_optional_fake_quantization_to_integer(op_name, func=None, level=10):
-    """Register optional quantize function for an op
-
-    Given an op and Affine Types on it's inputs, this function should return the op
-    in affine space/integer operators and the new type of the output, where affine
-    denotes the transformation x_real = (x_affine - zero_point) * scale
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    func: function (expr: Expr, map: Map<Expr, AffineType>) -> new_expr: Expr
-        The function for translating the op into affine space and integer operators
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMOptionalFakeQuantizationToInteger", func, level)
-
-
-def register_mixed_precision_conversion(op_name, func=None, level=10):
-    """Register mixed precision conversion function for an op
-
-    Given an op the function should return information on how the value should be
-    converted. Specifically the function should take a call node and the target
-    mixed precision datatype (e.g. FP16) and return the conversion category
-    (see python/tvm/relay/transform/mixed_precision.py) as well as the accumulation
-    and output datatype of the operation in the mixed precision dtype space.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    func: function (call_node: relay.Call, target_dtype: string)
-    -> [conversion category, accumulation dtype, output dtype]: [int, string, string]
-        A function which given a call_node and target_dtype (e.g. FP16) returns the
-        conversion category and associated accumulation/output of the operation
-        when transformed into the mixed precision dtype space.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMMixedPrecisionConversionType", func, level)
-
-
-@tvm._ffi.register_func("relay.op.compiler._lower")
-def _lower(name, schedule, inputs, outputs):
-    return lower(schedule, list(inputs) + list(outputs), name=name)
-
-
-@tvm._ffi.register_func("relay.op.compiler._build")
-def _build(lowered_funcs):
-    return build(lowered_funcs, target="llvm")
-
-
-_schedule_injective = None
-_schedule_reduce = None
-
-__DEBUG_COUNTER__ = 0
-
-
-def debug(expr, debug_func=None):
-    """The main entry point to the debugger."""
-    global __DEBUG_COUNTER__
-
-    if debug_func:
-        name = f"debugger_func{__DEBUG_COUNTER__}"
-        tvm._ffi.register_func(name, debug_func)
-        __DEBUG_COUNTER__ += 1
-    else:
-        name = ""
-
-    return _make.debug(expr, name)
-
-
-tvm._ffi._init_api("relay.op", __name__)
diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
deleted file mode 100644
index deae9e2f48be..000000000000
--- a/python/tvm/relay/op/op_attrs.py
+++ /dev/null
@@ -1,669 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The attributes node used for Relay operators"""
-from tvm.ir import Attrs
-import tvm._ffi
-
-
-@tvm._ffi.register_object("relay.attrs.Conv1DAttrs")
-class Conv1DAttrs(Attrs):
-    """Attributes for nn.conv1d"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv2DAttrs")
-class Conv2DAttrs(Attrs):
-    """Attributes for nn.conv2d"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv2DWinogradAttrs")
-class Conv2DWinogradAttrs(Attrs):
-    """Attributes for nn.contrib_conv2d_winograd_without_weight_transform"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv3DAttrs")
-class Conv3DAttrs(Attrs):
-    """Attributes for nn.conv3d"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv3DWinogradAttrs")
-class Conv3DWinogradAttrs(Attrs):
-    """Attributes for nn.contrib_conv3d_winograd_without_weight_transform"""
-
-
-@tvm._ffi.register_object("relay.attrs.ConvWinogradWeightTransformAttrs")
-class ConvWinogradWeightTransformAttrs(Attrs):
-    """Attributes for nn.contrib_convNd_winograd_weight_transform"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs")
-class Conv2DWinogradNNPACKWeightTransformAttrs(Attrs):
-    """Attributes for nn.contrib_conv2d_winograd_nnpack_weight_transform"""
-
-
-@tvm._ffi.register_object("relay.attrs.GlobalPool2DAttrs")
-class GlobalPool2DAttrs(Attrs):
-    """Attributes for nn.global_pool"""
-
-
-@tvm._ffi.register_object("relay.attrs.BiasAddAttrs")
-class BiasAddAttrs(Attrs):
-    """Atttribute of nn.bias_add"""
-
-
-@tvm._ffi.register_object("relay.attrs.MatmulAttrs")
-class MatmulAttrs(Attrs):
-    """Attributes for nn.matmul"""
-
-
-@tvm._ffi.register_object("relay.attrs.DenseAttrs")
-class DenseAttrs(Attrs):
-    """Attributes for nn.dense"""
-
-
-@tvm._ffi.register_object("relay.attrs.DensePackAttrs")
-class DensePackAttrs(Attrs):
-    """Attributes for nn.contrib_dense_pack"""
-
-
-@tvm._ffi.register_object("relay.attrs.BatchMatmulAttrs")
-class BatchMatmulAttrs(Attrs):
-    """Attributes for nn.batch_matmul"""
-
-
-@tvm._ffi.register_object("relay.attrs.SoftmaxAttrs")
-class SoftmaxAttrs(Attrs):
-    """Attributes for nn.softmax"""
-
-
-@tvm._ffi.register_object("relay.attrs.FIFOBufferAttrs")
-class FIFOBufferAttrs(Attrs):
-    """Attributes for nn.fifo_buffer"""
-
-
-@tvm._ffi.register_object("relay.attrs.UpSamplingAttrs")
-class UpSamplingAttrs(Attrs):
-    """Attributes for nn.upsampling"""
-
-
-@tvm._ffi.register_object("relay.attrs.UpSampling3DAttrs")
-class UpSampling3DAttrs(Attrs):
-    """Attributes for nn.upsampling3d"""
-
-
-@tvm._ffi.register_object("relay.attrs.PadAttrs")
-class PadAttrs(Attrs):
-    """Attributes for nn.pad"""
-
-
-@tvm._ffi.register_object("relay.attrs.MirrorPadAttrs")
-class MirrorPadAttrs(Attrs):
-    """Attributes for nn.mirror_pad"""
-
-
-@tvm._ffi.register_object("relay.attrs.LeakyReluAttrs")
-class LeakyReluAttrs(Attrs):
-    """Attributes for nn.leaky_relu"""
-
-
-@tvm._ffi.register_object("relay.attrs.PReluAttrs")
-class PReluAttrs(Attrs):
-    """Attributes for nn.prelu"""
-
-
-@tvm._ffi.register_object("relay.attrs.DropoutAttrs")
-class DropoutAttrs(Attrs):
-    """Attributes for nn.dropout"""
-
-
-@tvm._ffi.register_object("relay.attrs.BatchNormAttrs")
-class BatchNormAttrs(Attrs):
-    """Attributes for nn.batch_norm"""
-
-
-@tvm._ffi.register_object("relay.attrs.LRNAttrs")
-class LRNAttrs(Attrs):
-    """Attributes for nn.lrn"""
-
-
-@tvm._ffi.register_object("relay.attrs.L2NormalizeAttrs")
-class L2NormalizeAttrs(Attrs):
-    """Attributes for nn.l2_normalize"""
-
-
-@tvm._ffi.register_object("relay.attrs.DeformableConv2DAttrs")
-class DeformableConv2DAttrs(Attrs):
-    """Attributes for nn.deformable_conv2d"""
-
-
-@tvm._ffi.register_object("relay.attrs.Resize1DAttrs")
-class Resize1DAttrs(Attrs):
-    """Attributes for image.resize1d"""
-
-
-@tvm._ffi.register_object("relay.attrs.Resize2DAttrs")
-class Resize2DAttrs(Attrs):
-    """Attributes for image.resize2d"""
-
-
-@tvm._ffi.register_object("relay.attrs.Resize3DAttrs")
-class Resize3DAttrs(Attrs):
-    """Attributes used in resize3d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.CropAndResizeAttrs")
-class CropAndResizeAttrs(Attrs):
-    """Attributes for image.crop_and_resize"""
-
-
-@tvm._ffi.register_object("relay.attrs.Dilation2DAttrs")
-class Dilation2DAttrs(Attrs):
-    """Attributes for image.dilation2d"""
-
-
-@tvm._ffi.register_object("relay.attrs.ArgsortAttrs")
-class ArgsortAttrs(Attrs):
-    """Attributes for algorithm.argsort"""
-
-
-@tvm._ffi.register_object("relay.attrs.OnDeviceAttrs")
-class OnDeviceAttrs(Attrs):
-    """Attributes for annotation.on_device"""
-
-
-@tvm._ffi.register_object("relay.attrs.DebugAttrs")
-class DebugAttrs(Attrs):
-    """Attributes for debug"""
-
-
-@tvm._ffi.register_object("relay.attrs.CompilerAttrs")
-class CompilerAttrs(Attrs):
-    """Attributes for compiler"""
-
-
-@tvm._ffi.register_object("relay.attrs.DeviceCopyAttrs")
-class DeviceCopyAttrs(Attrs):
-    """Attributes for annotation.device_copy"""
-
-
-@tvm._ffi.register_object("relay.attrs.CastAttrs")
-class CastAttrs(Attrs):
-    """Attributes for transform.cast"""
-
-
-@tvm._ffi.register_object("relay.attrs.ConcatenateAttrs")
-class ConcatenateAttrs(Attrs):
-    """Attributes for tensor.concatenate"""
-
-
-@tvm._ffi.register_object("relay.attrs.TransposeAttrs")
-class TransposeAttrs(Attrs):
-    """Attributes for transform.transpose"""
-
-
-@tvm._ffi.register_object("relay.attrs.ReshapeAttrs")
-class ReshapeAttrs(Attrs):
-    """Attributes for transform.reshape"""
-
-
-@tvm._ffi.register_object("relay.attrs.ReshapeLikeAttrs")
-class ReshapeLikeAttrs(Attrs):
-    """Attributes for transform.reshape_like"""
-
-
-@tvm._ffi.register_object("relay.attrs.GatherAttrs")
-class GatherAttrs(Attrs):
-    """Attributes for transform.gather"""
-
-
-@tvm._ffi.register_object("relay.attrs.TakeAttrs")
-class TakeAttrs(Attrs):
-    """Attributes for transform.take"""
-
-
-@tvm._ffi.register_object("relay.attrs.InitOpAttrs")
-class InitOpAttrs(Attrs):
-    """Attributes for ops specifying a tensor"""
-
-
-@tvm._ffi.register_object("relay.attrs.ArangeAttrs")
-class ArangeAttrs(Attrs):
-    """Attributes used in arange operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.MeshgridAttrs")
-class MeshgridAttrs(Attrs):
-    """Attributes used in arange operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.StackAttrs")
-class StackAttrs(Attrs):
-    """Attributes used in stack operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.RepeatAttrs")
-class RepeatAttrs(Attrs):
-    """Attributes used in repeat operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.TileAttrs")
-class TileAttrs(Attrs):
-    """Attributes used in tile operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ReverseAttrs")
-class ReverseAttrs(Attrs):
-    """Attributes used in reverse operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ReverseSequenceAttrs")
-class ReverseSequenceAttrs(Attrs):
-    """Attributes used in reverse sequence operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SqueezeAttrs")
-class SqueezeAttrs(Attrs):
-    """Attributes used in squeeze operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SplitAttrs")
-class SplitAttrs(Attrs):
-    """Attributes for transform.split"""
-
-
-@tvm._ffi.register_object("relay.attrs.StridedSliceAttrs")
-class StridedSliceAttrs(Attrs):
-    """Attributes for transform.stranded_slice"""
-
-
-@tvm._ffi.register_object("relay.attrs.SliceLikeAttrs")
-class SliceLikeAttrs(Attrs):
-    """Attributes for transform.slice_like"""
-
-
-@tvm._ffi.register_object("relay.attrs.ClipAttrs")
-class ClipAttrs(Attrs):
-    """Attributes for transform.clip"""
-
-
-@tvm._ffi.register_object("relay.attrs.LayoutTransformAttrs")
-class LayoutTransformAttrs(Attrs):
-    """Attributes for transform.layout_transform"""
-
-
-@tvm._ffi.register_object("relay.attrs.ShapeOfAttrs")
-class ShapeOfAttrs(Attrs):
-    """Attributes for tensor.shape_of"""
-
-
-@tvm._ffi.register_object("relay.attrs.MultiBoxPriorAttrs")
-class MultiBoxPriorAttrs(Attrs):
-    """Attributes for vision.multibox_prior"""
-
-
-@tvm._ffi.register_object("relay.attrs.MultiBoxTransformLocAttrs")
-class MultiBoxTransformLocAttrs(Attrs):
-    """Attributes for vision.multibox_transform_loc"""
-
-
-@tvm._ffi.register_object("relay.attrs.GetValidCountsAttrs")
-class GetValidCountsAttrs(Attrs):
-    """Attributes for vision.get_valid_counts"""
-
-
-@tvm._ffi.register_object("relay.attrs.NonMaximumSuppressionAttrs")
-class NonMaximumSuppressionAttrs(Attrs):
-    """Attributes for vision.non_maximum_suppression"""
-
-
-@tvm._ffi.register_object("relay.attrs.AllClassNonMaximumSuppressionAttrs")
-class AllClassNonMaximumSuppressionAttrs(Attrs):
-    """Attributes for vision.all_classnon_maximum_suppression"""
-
-
-@tvm._ffi.register_object("relay.attrs.ROIAlignAttrs")
-class ROIAlignAttrs(Attrs):
-    """Attributes for vision.roi_align"""
-
-
-@tvm._ffi.register_object("relay.attrs.ROIPoolAttrs")
-class ROIPoolAttrs(Attrs):
-    """Attributes for vision.roi_pool"""
-
-
-@tvm._ffi.register_object("relay.attrs.YoloReorgAttrs")
-class YoloReorgAttrs(Attrs):
-    """Attributes for vision.yolo_reorg"""
-
-
-@tvm._ffi.register_object("relay.attrs.ProposalAttrs")
-class ProposalAttrs(Attrs):
-    """Attributes used in proposal operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.MaxPool2DAttrs")
-class MaxPool2DAttrs(Attrs):
-    """Attributes used in max_pool2d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AvgPool2DAttrs")
-class AvgPool2DAttrs(Attrs):
-    """Attributes used in avg_pool2d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.MaxPool1DAttrs")
-class MaxPool1DAttrs(Attrs):
-    """Attributes used in max_pool1d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AvgPool1DAttrs")
-class AvgPool1DAttrs(Attrs):
-    """Attributes used in avg_pool1d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.MaxPool3DAttrs")
-class MaxPool3DAttrs(Attrs):
-    """Attributes used in max_pool3d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AvgPool3DAttrs")
-class AvgPool3DAttrs(Attrs):
-    """Attributes used in avg_pool3d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.BitPackAttrs")
-class BitPackAttrs(Attrs):
-    """Attributes used in bitpack operator"""
-
-
-@tvm._ffi.register_object("relay.attrs.BinaryConv2DAttrs")
-class BinaryConv2DAttrs(Attrs):
-    """Attributes used in bitserial conv2d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.BinaryDenseAttrs")
-class BinaryDenseAttrs(Attrs):
-    """Attributes used in bitserial dense operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv2DTransposeAttrs")
-class Conv2DTransposeAttrs(Attrs):
-    """Attributes used in Transposed Conv2D operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv3DTransposeAttrs")
-class Conv3DTransposeAttrs(Attrs):
-    """Attributes used in Transposed Conv3D operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.DilateAttrs")
-class DilateAttrs(Attrs):
-    """Attributes used in dilate operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SubPixelAttrs")
-class SubPixelAttrs(Attrs):
-    """Attributes used in depth to space and space to depth operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.CorrelationAttrs")
-class CorrelationAttrs(Attrs):
-    """Attributes used in correlation operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AdaptivePool2DAttrs")
-class AdaptivePool2DAttrs(Attrs):
-    """Attributes used in 2D adaptive pooling operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AdaptivePool3DAttrs")
-class AdaptivePool3DAttrs(Attrs):
-    """Attributes used in 3D adaptive pooling operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AffineGridAttrs")
-class AffineGridAttrs(Attrs):
-    """Attributes used in affine_grid operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.CastHintAttrs")
-class CastHintAttrs(Attrs):
-    """Attributes used in cast_hint annotation operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.Conv1DTransposeAttrs")
-class Conv1DTransposeAttrs(Attrs):
-    """Attributes used in 1D transposed convolution operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ExpandDimsAttrs")
-class ExpandDimsAttrs(Attrs):
-    """Attributes used in expand_dims operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.GridSampleAttrs")
-class GridSampleAttrs(Attrs):
-    """Attributes used in grid_sample operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.GroupNormAttrs")
-class GroupNormAttrs(Attrs):
-    """Attributes used in group norm operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.InstanceNormAttrs")
-class InstanceNormAttrs(Attrs):
-    """Attributes used in instance norm operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.LayerNormAttrs")
-class LayerNormAttrs(Attrs):
-    """Attributes used in layer norm operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.NdarraySizeAttrs")
-class NdarraySizeAttrs(Attrs):
-    """Attributes used in ndarray_size operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.OneHotAttrs")
-class OneHotAttrs(Attrs):
-    """Attributes used in one_hot operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.BroadcastAttrs")
-class BroadcastAttrs(Attrs):
-    """Attributes used in broadcast operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.QuantizeAttrs")
-class QuantizeAttrs(Attrs):
-    """Attributes used in quantize operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.DequantizeAttrs")
-class DequantizeAttrs(Attrs):
-    """Attributes used in dequantize operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ReduceAttrs")
-class ReduceAttrs(Attrs):
-    """Attributes used in reduction operators (e.g. sum)"""
-
-
-@tvm._ffi.register_object("relay.attrs.ArgReduceAttrs")
-class ArgReduceAttrs(Attrs):
-    """Attributes used in reduction operators (e.g. argmin/argmax)"""
-
-
-@tvm._ffi.register_object("relay.attrs.VarianceAttrs")
-class VarianceAttrs(Attrs):
-    """Attributes used in reduction operators (e.g. sum)"""
-
-
-@tvm._ffi.register_object("relay.attrs.RequantizeAttrs")
-class RequantizeAttrs(Attrs):
-    """Attributes used in requantize operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SequenceMaskAttrs")
-class SequenceMaskAttrs(Attrs):
-    """Attributes used in sequence_mask operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ShapeFuncAttrs")
-class ShapeFuncAttrs(Attrs):
-    """Attributes used in shape func operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SimulatedQuantizeAttrs")
-class SimulatedQuantizeAttrs(Attrs):
-    """Attributes used in simulated_quantize operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SparseDenseAttrs")
-class SparseDenseAttrs(Attrs):
-    """Attributes used in sparse_dense operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SparseToDenseAttrs")
-class SparseToDenseAttrs(Attrs):
-    """Attributes used in sparse_to_dense operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SparseTransposeAttrs")
-class SparseTransposeAttrs(Attrs):
-    """Attributes used in sparse_transpose operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SparseConv2DAttrs")
-class SparseConv2DAttrs(Attrs):
-    """Attributes used in sparse_conv2d operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.TopkAttrs")
-class TopkAttrs(Attrs):
-    """Attributes used in topk operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SearchSortedAttrs")
-class SearchSortedAttrs(Attrs):
-    """Attributes used in searchsorted operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.TupleGetItemAttrs")
-class TupleGetItemAttrs(Attrs):
-    """Attributes used in tuple item access operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.WithFuncIdAttrs")
-class WithFuncIdAttrs(Attrs):
-    """Attributes used in with_funcid annotation operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SpaceToBatchNDAttrs")
-class SpaceToBatchNDAttrs(Attrs):
-    """Attributes used in SpaceToBatchND operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.BatchToSpaceNDAttrs")
-class BatchToSpaceNDAttrs(Attrs):
-    """Attributes used in BatchToSpaceNDAttrs operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ThreefryGenerateAttrs")
-class ThreefryGenerateAttrs(Attrs):
-    """Attributes used in ThreefryGenerateAttrs operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.UniformAttrs")
-class UniformAttrs(Attrs):
-    """Attributes used in UniformAttrs operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.NLLLossAttrs")
-class NLLLossAttrs(Attrs):
-    """Attributes for nn.nll_loss"""
-
-
-@tvm._ffi.register_object("relay.attrs.FixedPointMultiplyAttrs")
-class FixedPointMultiplyAttrs(Attrs):
-    """Attributes used in fixed_point_multiply operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.TriluAttrs")
-class TriluAttrs(Attrs):
-    """Attributes used in trilu operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.SlidingWindowAttrs")
-class SlidingWindowAttrs(Attrs):
-    """Attributes used in sliding_window operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.DynExpandDimsAttrs")
-class DynExpandDimsAttrs(Attrs):
-    """Attributes used in dynamic expand_dims operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ScatterElementsAttrs")
-class ScatterElementsAttrs(Attrs):
-    """Attributes used in scatter_elements operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ScatterNDAttrs")
-class ScatterNDAttrs(Attrs):
-    """Attributes used in scatter_nd operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.GatherNDAttrs")
-class GatherNDAttrs(Attrs):
-    """Attributes used in gather_nd operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.AutoSchedulerLayoutTransformAttrs")
-class AutoSchedulerLayoutTransformAttrs(Attrs):
-    """Attributes used in AutoSchedulerLayoutTransform operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.MetaScheduleLayoutTransformAttrs")
-class MetaScheduleLayoutTransformAttrs(Attrs):
-    """Attributes used in MetaScheduleLayoutTransform operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.MatrixSetDiagAttrs")
-class MatrixSetDiagAttrs(Attrs):
-    """Attributes used in matrix_set_diag operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.ScanopAttrs")
-class ScanopAttrs(Attrs):
-    """Attributes used in cumsum and cumprod operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.UniqueAttrs")
-class UniqueAttrs(Attrs):
-    """Attributes used in unique operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.EinsumAttrs")
-class EinsumAttrs(Attrs):
-    """Attributes used in einsum operators"""
-
-
-@tvm._ffi.register_object("relay.attrs.StftAttrs")
-class StftAttrs(Attrs):
-    """Attributes used in stft operators"""
diff --git a/python/tvm/relay/op/random/__init__.py b/python/tvm/relay/op/random/__init__.py
deleted file mode 100644
index 8366f4a06dac..000000000000
--- a/python/tvm/relay/op/random/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""PRNG related operators."""
-from .kernel import *
-from . import _kernel
diff --git a/python/tvm/relay/op/random/_kernel.py b/python/tvm/relay/op/random/_kernel.py
deleted file mode 100644
index e2250a8cf3fe..000000000000
--- a/python/tvm/relay/op/random/_kernel.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Splittable and parallelizable PRNG kernels."""
-# pylint: disable=invalid-name,unused-argument
-from __future__ import absolute_import
-
-from .. import strategy
-from ..op import register_strategy, register_pattern, OpPattern
-
-
-# Threefry
-register_strategy("random.threefry_generate", strategy.threefry_generate_strategy)
-register_pattern("random.threefry_generate", OpPattern.OPAQUE)
-register_strategy("random.threefry_split", strategy.threefry_split_strategy)
-register_pattern("random.threefry_split", OpPattern.OPAQUE)
-
-# Distribution
-register_strategy("random.uniform", strategy.uniform_strategy)
-register_pattern("random.uniform", OpPattern.OPAQUE)
-register_strategy("random.normal", strategy.normal_strategy)
-register_pattern("random.normal", OpPattern.OPAQUE)
-register_strategy("random.multinomial", strategy.multinomial_strategy)
-register_pattern("random.multinomial", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/random/_make.py b/python/tvm/relay/op/random/_make.py
deleted file mode 100644
index 51a8a6aa9339..000000000000
--- a/python/tvm/relay/op/random/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.random._make", __name__)
diff --git a/python/tvm/relay/op/random/kernel.py b/python/tvm/relay/op/random/kernel.py
deleted file mode 100644
index 674760d376ef..000000000000
--- a/python/tvm/relay/op/random/kernel.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Splittable and parallelizable PRNG kernels."""
-# pylint: disable=invalid-name,unused-argument
-from __future__ import absolute_import
-
-import sys
-import numpy as np
-
-from ...expr import Constant, Expr, const
-from .... import nd
-from . import _make
-
-
-def threefry_key(seed):
-    """Create a new Threefry random number generator key.
-
-    Example
-    -------
-
-    .. code-block:: python
-
-        gen = threefry_key(0)
-        _, random_number = threefry_generate(gen, (4,))
-
-    Parameters
-    ----------
-    seed : int
-        Starting seed for the key
-
-    Returns
-    -------
-    key : relay.Expr
-        New key to pass to future uses of :py:func:`threefry_split` or
-        :py:func:`threefry_generate`.
-    """
-    s = np.frombuffer(seed.to_bytes(32, sys.byteorder), dtype="uint64")
-    a = np.concatenate((s, np.array([0, 0, 0, 0, 1 << 63, 0], dtype="uint64")))
-    return Constant(nd.array(a))
-
-
-def threefry_generate(key, shape):
-    """Generate an array of random bits (`uint64`) using the Threefry algorithm
-
-    Example
-    -------
-
-    .. code-block:: python
-
-        key = threefry_key(0)
-        new_key, random1 = threefry_generate(key, (4,))
-        _, random2 = threefry_generate(new_key, (4,))
-        # random1 and random2 are different random numbers
-
-    Parameters
-    ----------
-    key : relay.Expr
-        key that uniquely determines the random values. Multiple uses with the
-        same key will generate the same random values. This key should be
-        treated as an opaque pointer. You can create one from calling
-        :py:func:`threefry_key`, :py:func:`threefry_split`, or
-        :py:func:`threefry_generate`. **Do not use this key again after calling
-        this function.**
-
-    shape : Sequence[int]
-        Desired outputs shape of random numbers.
-
-    Returns
-    -------
-    new_key : relay.Expr
-        New key to pass to future uses of :py:func:`threefry_split` or
-        :py:func:`threefry_generate`.
-
-    random_array : relay.Expr
-        Array of random numbers. Has shape `shape`.
-    """
-    return _make.threefry_generate(key, shape)
-
-
-def threefry_split(key):
-    """Split an existing Threefry key into two new ones.
-
-    This is useful if you have to subsequent calls which each need their own
-    independent random number generation.
-
-    Example
-    -------
-
-    .. code-block:: python
-
-        def foo(key):
-            new_key, num = threefry_generate(key, (4,))
-            return num
-
-        key = threefry_key(0)
-        key1, key2 = threefry_split(key)
-        assert foo(key1) != foo(key2)
-
-    Parameters
-    ----------
-    key : relay.Expr
-        key that uniquely determines the random values. Multiple uses with the
-        same generator will generate the same random values. This generator should be
-        treated as an opaque pointer. You can create one from calling
-        :py:func:`threefry_key`, :py:func:`threefry_split`, or
-        :py:func:`threefry_generate`. **Do not use this generator again after calling
-        this function.**
-
-    Returns
-    -------
-    new_key_1 : relay.Expr
-        New key to pass to future uses of :py:func:`threefry_split` or
-        :py:func:`threefry_generate`.
-
-    new_key_2 : relay.Expr
-        New key to pass to future uses of :py:func:`threefry_split` or
-        :py:func:`threefry_generate`.
-    """
-    return _make.threefry_split(key)
-
-
-def uniform(key, shape, dtype="float32", low=0.0, high=1.0):
-    """Draw samples from a uniform distribution.
-
-    Samples are uniformly distributed over the half-open interval [low, high)
-    (includes low, but excludes high). In other words, any value within the
-    given interval is equally likely to be drawn by uniform.
-
-    Example
-    -------
-
-    .. code-block:: python
-
-        key = threefry_key(0)
-        key, random_values = uniform(key, (100,), low=0, high=10)
-
-    Parameters
-    ----------
-    key : relay.Expr
-        key that uniquely determines the random values. Multiple uses with the
-        same generator will generate the same random values. This generator should be
-        treated as an opaque pointer. You can create one from calling
-        :py:func:`threefry_key`, :py:func:`threefry_split`, or
-        :py:func:`threefry_generate`. **Do not use this generator again after calling
-        this function.**
-
-    shape : Sequence[int]
-        Desired outputs shape of random numbers.
-
-    dtype : str
-        Desired outputs type of random numbers.
-
-    low : float or relay.Expr, optional
-        Lower bound of the uniform distribution.
-
-    high : float or relay.Expr, optional
-        Upper bound of the uniform distribution.
-
-    Returns
-    -------
-    new_key : relay.Expr
-        New random key to pass to future uses of random functions.
-
-    random_values : relay.Expr
-        The generated uniform distributed random numbers.
-    """
-    if not isinstance(low, Expr):
-        low = const(low, dtype=dtype)
-    if not isinstance(high, Expr):
-        high = const(high, dtype=dtype)
-    return _make.uniform(key, low, high, shape, dtype)
-
-
-def normal(key, shape, dtype="float32", mean=0.0, scale=1.0):
-    """Draw samples from a normal distribution.
-
-    Example
-    -------
-
-    .. code-block:: python
-
-        key = threefry_key(0)
-        key, random_values = normal(key, (100,), low=0, high=10)
-
-    Parameters
-    ----------
-    key : relay.Expr
-        key that uniquely determines the random values. Multiple uses with the
-        same generator will generate the same random values. This generator should be
-        treated as an opaque pointer. You can create one from calling
-        :py:func:`threefry_key`, :py:func:`threefry_split`, or
-        :py:func:`threefry_generate`. **Do not use this generator again after calling
-        this function.**
-
-    shape : Sequence[int]
-        Desired outputs shape of random numbers.
-
-    dtype : str
-        Desired outputs type of random numbers.
-
-    low : float or relay.Expr, optional
-        Mean of the normal distribution.
-
-    high : float or relay.Expr, optional
-        Standard deviation of the normal distribution.
-
-    Returns
-    -------
-    new_key : relay.Expr
-        New random key to pass to future uses of random functions.
-
-    random_values : relay.Expr
-        The generated normal distributed random numbers.
-    """
-    if not isinstance(mean, Expr):
-        mean = const(mean, dtype=dtype)
-    if not isinstance(scale, Expr):
-        scale = const(scale, dtype=dtype)
-    return _make.normal(key, mean, scale, shape, dtype)
-
-
-def multinomial(key, probs, num_samples):
-    """Draw samples from a multinomial distribution.
-
-    Example
-    -------
-
-    .. code-block:: python
-
-        key = threefry_key(0)
-        key, random_indices = multinomial(key, (3, 5, 10), num_samples=2)
-
-    Parameters
-    ----------
-    key : relay.Expr
-        key that uniquely determines the random values. Multiple uses with the
-        same generator will generate the same random values. This generator should be
-        treated as an opaque pointer. You can create one from calling
-        :py:func:`threefry_key`, :py:func:`threefry_split`, or
-        :py:func:`threefry_generate`. **Do not use this generator again after calling
-        this function.**
-
-    probs: relay.Expr
-        Array containing the probabilities of returning each respective index.
-        If a tensor is provided, the last dimension is treated independently.
-        Negative values in this tensor will be clipped to zero to
-        represent they have no chance of being selected.
-
-    num_samples : int
-        Number of samples to return
-
-    Returns
-    -------
-    new_key : relay.Expr
-        New random key to pass to future uses of random functions.
-
-    random_indices : relay.Expr
-        The generated indices.
-    """
-    return _make.multinomial(key, probs, num_samples)
diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py
deleted file mode 100644
index 67dc82efaf87..000000000000
--- a/python/tvm/relay/op/reduce.py
+++ /dev/null
@@ -1,536 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Reduce operators."""
-# pylint: disable=redefined-builtin
-
-from ..expr import Tuple, TupleWrapper
-from . import _make
-from .tensor import exp, log, sqrt
-from .transform import squeeze
-
-
-def argmax(data, axis=None, keepdims=False, exclude=False, select_last_index=False):
-    """Returns the indices of the maximum values along an axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a argmax operation is performed.
-        The default, axis=None, will find the indices of the maximum element of the elements of
-        the input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    select_last_index : bool
-        Whether to select the last index or the first index if the max element appears in
-        multiple indices, default is False (first index).
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.argmax(data, axis, keepdims, exclude, select_last_index)
-
-
-def argmin(data, axis=None, keepdims=False, exclude=False, select_last_index=False):
-    """Returns the indices of the minimum values along an axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a argmin operation is performed.
-        The default, axis=None, will find the indices of minimum element all of the elements of
-        the input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    select_last_index : bool
-        Whether to select the last index or the first index if the min element appears in
-        multiple indices, default is False (first index).
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.argmin(data, axis, keepdims, exclude, select_last_index)
-
-
-def sum(data, axis=None, keepdims=False, exclude=False):
-    """Computes the sum of array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a sum is performed. The default, axis=None,
-        will sum all of the elements of the input array. If axis is
-        negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as
-        dimensions with size one. With this option, the result will broadcast
-        correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.sum(data, axis, keepdims, exclude)
-
-
-def all(data, axis=None, keepdims=False, exclude=False):
-    """Computes the logical AND of boolean array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input boolean tensor
-
-    axis : None or int or tuple of int
-        Axis or axes along which a sum is performed. The default, axis=None,
-        will sum all of the elements of the input array. If axis is
-        negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as
-        dimensions with size one. With this option, the result will broadcast
-        correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = relay.Constant(tvm.nd.array([[[ True,  True,  True],
-                                           [ True,  True,  True],
-                                           [False,  True, False]],
-                                          [[ True, False, False],
-                                           [ True,  True, False],
-                                           [False,  True,  True]]]))
-
-        relay.all(data, axis=1)
-        # [[False,  True, False],
-        # [False, False, False]]
-
-        relay.all(data, axis=0)
-        # [[ True, False, False],
-        # [ True,  True, False],
-        # [False,  True, False]]
-
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.all(data, axis, keepdims, exclude)
-
-
-def any(data, axis=None, keepdims=False, exclude=False):
-    """Computes the logical OR of boolean array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input boolean tensor
-
-    axis : None or int or tuple of int
-        Axis or axes along which a sum is performed. The default, axis=None,
-        will sum all of the elements of the input array. If axis is
-        negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as
-        dimensions with size one. With this option, the result will broadcast
-        correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = relay.Constant(tvm.nd.array([[[ True,  True,  True],
-                                            [ True,  True,  True],
-                                            [False,  True, False]],
-                                            [[ True, False, False],
-                                            [ True,  True, False],
-                                            [False,  True,  True]]]))
-
-        relay.any(data, axis=1)
-        # [[True, True, True],
-        # [True,  True, True]]
-
-        relay.any(data, axis=0)
-        # [[ True, True, True],
-        # [ True,  True, True],
-        # [False,  True, True]]
-
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.any(data, axis, keepdims, exclude)
-
-
-def max(data, axis=None, keepdims=False, exclude=False):
-    """Computes the max of array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which the max operation is performed.
-        The default, axis=None, will find the max element from all of the elements of the input
-        array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.max(data, axis, keepdims, exclude)
-
-
-def min(data, axis=None, keepdims=False, exclude=False):
-    """Computes the min of array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a minimum operation is performed.
-        The default, axis=None, will find the minimum element from all
-        of the elements of the input array. If axis is negative it counts from
-        the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.min(data, axis, keepdims, exclude)
-
-
-def mean(data, axis=None, keepdims=False, exclude=False):
-    """Computes the mean of array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a mean operation is performed.
-        The default, axis=None, will compute the mean of all elements in the input array.
-        If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.mean(data, axis, keepdims, exclude)
-
-
-def variance(data, axis=None, keepdims=False, exclude=False, unbiased=False, with_mean=None):
-    """Computes the variance of data over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a variance operation is performed.
-        The default, axis=None, will compute the variance of all elements in the input array.
-        If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    unbiased : bool
-        If this is set to True, the unbiased estimation will be used.
-
-    with_mean : Optional[relay.Expr]
-        To compute variance given an already computed mean
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    m = mean(data, axis, True, exclude) if with_mean is None else with_mean
-    return _make._variance(data, m, axis, keepdims, exclude, unbiased)
-
-
-def std(data, axis=None, keepdims=False, exclude=False, unbiased=False):
-    """Computes the standard deviation of data over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a standard deviation operation is performed.
-        The default, axis=None, will compute the standard deviation of all elements in the
-        input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    unbiased : bool
-        If this is set to True, the unbiased estimation will be used.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    m = mean(data, axis, True, exclude)
-    return sqrt(_make._variance(data, m, axis, keepdims, exclude, unbiased))
-
-
-def mean_variance(data, axis=None, keepdims=False, exclude=False, unbiased=False):
-    """Computes the mean and variance of data over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a mean and variance operation is performed.
-        The default, axis=None, will compute the mean and variance of all elements in
-        the input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    unbiased : bool
-        If this is set to True, the unbiased estimation will be used.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    m = mean(data, axis, True, exclude)
-    var = _make._variance(data, m, axis, keepdims, exclude, unbiased)
-    if not keepdims:
-        m = squeeze(m, axis=axis)
-    return TupleWrapper(Tuple((m, var)), 2)
-
-
-def mean_std(data, axis=None, keepdims=False, exclude=False):
-    """Computes the mean and standard deviation of data over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a mean and standard deviation operation is performed.
-        The default, axis=None, will compute the mean and standard deviation of all elements in
-        the input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    m = mean(data, axis, True, exclude)
-    s = sqrt(_make._variance(data, m, axis, keepdims, exclude, False))
-    if not keepdims:
-        m = squeeze(m)
-    return TupleWrapper(Tuple((m, s)), 2)
-
-
-def prod(data, axis=None, keepdims=False, exclude=False):
-    """Computes the products of array elements over given axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a product is performed.
-        The default, axis=None, will find the indices of minimum element all of the elements of
-        the input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    exclude : bool
-        If `exclude` is true, reduction will be performed on the axes that are
-        NOT in axis instead.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    axis = [axis] if isinstance(axis, int) else axis
-    return _make.prod(data, axis, keepdims, exclude)
-
-
-def logsumexp(data, axis=None, keepdims=False):
-    """Compute the log of the sum of exponentials of input elements over given axes.
-
-       This function is more numerically stable than log(sum(exp(input))).
-       It avoids overflows caused by taking the exp of large inputs and underflows
-       caused by taking the log of small inputs.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a standard deviation operation is performed.
-        The default, axis=None, will compute the log of the sum of exponentials of all elements
-        in the input array. If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-        with size one.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-
-    axis = [axis] if isinstance(axis, int) else axis
-    max_x = max(data, axis, True)
-    exp_x = exp(data - max_x)
-    sum_x = sum(exp_x, axis, True)
-    out_x = log(sum_x) + max_x
-    if not keepdims:
-        out_x = squeeze(out_x, axis)
-    return out_x
diff --git a/python/tvm/relay/op/strategy/__init__.py b/python/tvm/relay/op/strategy/__init__.py
deleted file mode 100644
index 1be5425e702c..000000000000
--- a/python/tvm/relay/op/strategy/__init__.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Relay op strategies."""
-from __future__ import absolute_import as _abs
-
-from .generic import *
-from . import x86
-from . import arm_cpu
-from . import cuda
-from . import hls
-from . import mali
-from . import bifrost
-from . import rocm
-from . import intel_graphics
-from . import hexagon
-from . import adreno
diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py
deleted file mode 100644
index 99e4d0a405f0..000000000000
--- a/python/tvm/relay/op/strategy/adreno.py
+++ /dev/null
@@ -1,337 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of adreno operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-from tvm import topi
-from .generic import *
-from .. import op as _op
-
-
-@conv2d_NCHWc_strategy.register("adreno")
-@conv2d_strategy.register("adreno")
-def conv2d_strategy_adreno(attrs, inputs, out_type, target):
-    """conv2d adreno strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
-    stride_h, stride_w = attrs.get_int_tuple("strides")
-    groups = attrs.groups
-    data_layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if (
-            (data_layout == "NCHW" and kernel_layout == "OIHW")
-            or (data_layout == "NCHW4c" and kernel_layout == "OIHW4o")
-            or (data_layout == "NCHW" and kernel_layout == "OIHW4o")
-        ):
-            if len(kernel.shape) == 4:
-                oc, _, kh, kw = get_const_tuple(kernel.shape)
-            else:
-                oc, _, kh, kw, _ = get_const_tuple(kernel.shape)
-            # We cannot use textures for case than number of channels is less than 4.
-            # So, we use compute functions from cuda.
-            if len(kernel.shape) == 4 and oc < 4:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
-                    name="conv2d_nchw.cuda",
-                )
-                return strategy
-            if (
-                (2 < kh < 8 and 2 < kw < 8 and kh == kw)
-                and (stride_h == 1 and stride_w == 1)
-                and (dilation_h == 1 and dilation_w == 1)
-                and not (data_layout == "NCHW" and kernel_layout == "OIHW4o")
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd),
-                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd),
-                    name="conv2d_nchw_winograd.image2d",
-                    plevel=5,
-                )
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.adreno.conv2d_nchwc),
-                wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc),
-                name="conv2d_nchwc.image2d",
-                plevel=10,
-            )
-        elif (
-            (data_layout == "NHWC" and kernel_layout == "HWIO")
-            or (data_layout == "NHWC4c" and kernel_layout == "HWIO4o")
-            or (data_layout == "NHWC" and kernel_layout == "HWIO4o")
-        ):
-            if len(kernel.shape) == 4:
-                kh, kw, _, oc = get_const_tuple(kernel.shape)
-            else:
-                kh, kw, _, oc, _ = get_const_tuple(kernel.shape)
-            # We cannot use textures for case than number of channels is less than 4.
-            # So, we use compute functions from cuda.
-            if len(kernel.shape) == 4 and oc < 4:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
-                    wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
-                    name="conv2d_nhwc.gpu",
-                )
-                return strategy
-            if (
-                (2 < kh < 8 and 2 < kw < 8 and kh == kw)
-                and (stride_h == 1 and stride_w == 1)
-                and (dilation_h == 1 and dilation_w == 1)
-                and not (data_layout == "NHWC" and kernel_layout == "HWIO4o")
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd),
-                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd),
-                    name="conv2d_nhwc_winograd.image2d",
-                    plevel=5,
-                )
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.adreno.conv2d_nhwc),
-                wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.image2d",
-                plevel=10,
-            )
-        else:
-            raise RuntimeError(
-                "Layout not supported: ("
-                + data_layout
-                + ", "
-                + kernel_layout
-                + ") - only support NCHW4c / OIHW4o and NHWC / HWOI layouts for conv2d"
-            )
-    else:
-        # cannot use is_depthwise_conv2d because it does not know about NHWC4c/HWOI4o layouts
-        if data_layout == "NCHW":
-            ic = data.shape[1]
-        elif data_layout == "NCHW4c":
-            ic = data.shape[1] * data.shape[4]
-        elif data_layout == "NHWC":
-            ic = data.shape[3]
-        elif data_layout == "NHWC4c":
-            ic = data.shape[3] * data.shape[4]
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d data layout {data_layout}")
-        if kernel_layout == "OIHW":
-            oc = kernel.shape[0]
-        elif kernel_layout == "OIHW4o":
-            oc = kernel.shape[0] * kernel.shape[4]
-        elif kernel_layout == "HWOI":
-            oc = kernel.shape[2]
-        elif kernel_layout == "HWOI4o":
-            oc = kernel.shape[2] * kernel.shape[4]
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d kernel layout {kernel_layout}")
-
-        if ic == oc == groups:
-            if (data_layout == "NCHW" and kernel_layout == "OIHW") or (
-                data_layout == "NCHW4c" and kernel_layout == "OIHW4o"
-            ):
-                # We cannot use textures for case than number of channels is less than 4.
-                # So, we use compute functions from cuda.
-                if len(kernel.shape) == 4 and oc < 4:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
-                        wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
-                        name="depthwise_conv2d_nchw.cuda",
-                    )
-                else:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc),
-                        wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc),
-                        name="depthwise_conv2d_nchwc.image2d",
-                        plevel=10,
-                    )
-            elif (data_layout == "NHWC" and kernel_layout == "HWOI") or (
-                data_layout == "NHWC4c" and kernel_layout == "HWOI4o"
-            ):
-                if data.shape[-1] >= 4:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc),
-                        wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc),
-                        name="depthwise_conv2d_nhwc.image2d",
-                        plevel=10,
-                    )
-                else:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                        wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
-                        name="depthwise_conv2d_nhwc.cuda",
-                    )
-            else:
-                raise RuntimeError(
-                    "Layout not supported: ("
-                    + data_layout
-                    + ", "
-                    + kernel_layout
-                    + ") - only support NCHW4c / OIHW4o and NHWC / HWOI layouts for conv2d"
-                )
-        elif (data_layout == "NCHW4c" or data_layout == "NCHW") and (
-            kernel_layout == "OIHW" or kernel_layout == "OIHW4o"
-        ):
-            pad_in_chunks = (len(data.shape) == 5 and data.shape[1] % groups != 0) or (
-                len(data.shape) == 4 and data.shape[1] % (groups * 4) != 0
-            )
-            pad_out_chunks = (len(kernel.shape) == 5 and kernel.shape[0] % groups != 0) or (
-                len(kernel.shape) == 4 and kernel.shape[0] % (groups * 4) != 0
-            )
-
-            if not (pad_in_chunks or pad_out_chunks):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.group_conv2d_nchwc),
-                    wrap_topi_schedule(topi.adreno.schedule_group_conv2d_nchwc),
-                    name="group_conv2d_nchwc.image2d",
-                    plevel=10,
-                )
-            elif len(data.shape) == 4 and len(kernel.shape) == 4:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
-                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
-                    name="group_conv2d_nchw.cuda",
-                )
-            else:
-                raise RuntimeError(
-                    "General group convolution is not currently supported for NCHWc layouts"
-                )
-        else:
-            raise RuntimeError(
-                "General group convolution has limited support for NCHW(4c) layouts..."
-            )
-    return strategy
-
-
-@conv2d_winograd_without_weight_transform_strategy.register("adreno")
-def conv2d_winograd_without_weight_transform_strategy_adreno(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform adreno strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    assert dilation == (1, 1), "Do not support dilate now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    if layout in ("NCHW", "NCHW4c"):
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform),
-            wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform),
-            name="conv2d_nchw_winograd_without_weight_transform.image2d",
-            plevel=5,
-        )
-    elif layout in ("NHWC", "NHWC4c"):
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform),
-            wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform),
-            name="conv2d_nhwc_winograd_without_weight_transform.image2d",
-            plevel=5,
-        )
-    else:
-        raise RuntimeError(f"Unsupported conv2d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-@conv2d_transpose_strategy.register("adreno")
-def conv2d_transpose_strategy_adreno(attrs, inputs, out_type, target):
-    """conv2d_transpose adreno strategy"""
-    strategy = _op.OpStrategy()
-    _, kernel = inputs
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.groups
-    data_layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    assert dilation == (1, 1), "not support dilate now"
-
-    if (groups == 1) and (
-        (data_layout == "NCHW" and kernel_layout == "IOHW")
-        or (data_layout == "NCHW4c" and kernel_layout == "IOHW4o")
-        or (data_layout == "NCHW" and kernel_layout == "IOHW4o")
-    ):
-        if len(kernel.shape) == 4:
-            _, oc, _, _ = get_const_tuple(kernel.shape)
-        else:
-            _, oc, _, _, _ = get_const_tuple(kernel.shape)
-        # We cannot use textures for case than number of channels is less than 4.
-        # So, we use compute functions from cuda.
-        if len(kernel.shape) == 4 and oc < 4:
-            strategy.add_implementation(
-                wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw),
-                wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw),
-                name="conv2d_transpose_nchw.cuda",
-            )
-            return strategy
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.adreno.conv2d_transpose_nchwc),
-            wrap_topi_schedule(topi.adreno.schedule_conv2d_transpose_nchwc),
-            name="conv2d_transpose_nchwc.image2d",
-            plevel=10,
-        )
-    elif data_layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw, has_groups=True),
-            wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw),
-            name="conv2d_transpose_nchw.cuda",
-        )
-    else:
-        raise RuntimeError(
-            "Layout not supported: ("
-            + data_layout
-            + ", "
-            + kernel_layout
-            + ") - only support NCHW, NCHW4c / IOHW4o layouts for conv2d_transpose"
-        )
-    return strategy
-
-
-@schedule_pool.register("adreno")
-def schedule_pool_adreno(attrs, outs, target):
-    """schedule pooling ops for adreno"""
-    with target:
-        if attrs.layout == "NCHW4c":
-            return topi.adreno.schedule_pool(outs, attrs.layout)
-        return topi.cuda.schedule_pool(outs, attrs.layout)
-
-
-@schedule_injective.register(["adreno"])
-def schedule_injective_adreno(attrs, outs, target):
-    """schedule injective ops for adreno"""
-    with target:
-        return topi.adreno.schedule_injective(outs)
-
-
-@schedule_reduce.register(["adreno"])
-def schedule_reduce_adreno(attrs, outs, target):
-    """schedule reduction ops for adreno GPU"""
-    with target:
-        return topi.adreno.schedule_reduce(outs)
-
-
-@schedule_adaptive_pool.register(["adreno"])
-def schedule_adaptive_pool_adreno(attrs, outs, target):
-    """schedule adaptive pooling ops for adreno"""
-    with target:
-        return topi.adreno.schedule_adaptive_pool(outs, attrs.layout)
-
-
-@concatenate_strategy.register(["adreno"])
-def concatenate_strategy_adreno(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_concat(topi.transform.concatenate),
-        wrap_topi_schedule(topi.adreno.schedule_injective),
-        name="concatenate.adreno",
-    )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
deleted file mode 100644
index bd9a0a4d020b..000000000000
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ /dev/null
@@ -1,879 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of ARM CPU operator strategy."""
-from functools import reduce
-import logging
-
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-import re
-
-from tvm import relay, topi, tir
-from tvm.tir.schedule.analysis import has_block
-from tvm.dlight.gpu.matmul import auto_inline_consumers
-
-from ....auto_scheduler import is_auto_scheduler_enabled
-from ....meta_schedule import is_meta_schedule_enabled
-from ....topi.generic import conv2d as conv2d_generic
-from .. import op as _op
-from .generic import *
-
-logger = logging.getLogger("strategy")
-
-
-@schedule_reduce.register("arm_cpu")
-def schedule_reduce_cpu(attrs, outs, target):
-    """schedule reduction ops for arm_cpu"""
-    with target:
-        return topi.x86.schedule_reduce(outs)
-
-
-@schedule_injective.register("arm_cpu")
-def schedule_injective_arm_cpu(_, outs, target):
-    """schedule injective ops for arm cpu"""
-    with target:
-        return topi.arm_cpu.schedule_injective(outs)
-
-
-@concatenate_strategy.register(["arm_cpu"])
-def concatenate_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """concatenate arm_cpu strategy"""
-    strategy = _op.OpStrategy()
-
-    strategy.add_implementation(
-        wrap_compute_concat(topi.concatenate),
-        wrap_topi_schedule(topi.arm_cpu.schedule_concatenate),
-        name="concatenate.arm_cpu",
-    )
-    return strategy
-
-
-@schedule_pool.register(["arm_cpu"])
-def schedule_pool_arm_cpu(attrs, outs, target):
-    """schedule pooling ops arm cpu"""
-    layout = attrs.layout
-    avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs)
-    with target:
-        if (
-            avg_pool
-            and target.features.has_dsp
-            and layout in ("NCW", "NCHW")
-            or not avg_pool
-            and target.features.has_dsp
-            and layout in ("NWC", "NHWC")
-        ):
-            return topi.arm_cpu.schedule_pool(outs, layout)
-        return topi.x86.schedule_pool(outs, layout)
-
-
-def _get_padding_width(padding):
-    assert isinstance(padding, tuple)
-    if len(padding) == 2:
-        _, (pad_left, pad_right) = padding
-    else:
-        _, pad_left, _, pad_right = padding
-    return pad_left + pad_right
-
-
-def _is_simd_aligned(dtype, dimensions, padding=None):
-    if padding:
-        assert len(dimensions) == len(padding)
-        padded_dims = (sum(x) for x in zip(dimensions, padding))
-    else:
-        padded_dims = dimensions
-
-    # Multiply all elements of padded_dims together. We can't use math.prod, as it
-    # does not exist in Python 3.7.
-    size = reduce(lambda x, y: x * y, padded_dims)
-    return (
-        (dtype == "int8" and size % 4 == 0)
-        or (dtype == "int16" and size % 2 == 0)
-        or (dtype == "int32")
-    )
-
-
-@conv2d_strategy.register("arm_cpu")
-def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d arm cpu strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    data_shape = data.shape
-    kernel_shape = kernel.shape
-    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
-    stride_h, stride_w = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if layout == "NCHW":
-            if kernel_layout == "OIHW":
-                if (
-                    topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype)
-                    and kernel.shape[1] >= 64
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
-                        name="conv2d_nchw_int8.arm_cpu",
-                        plevel=15,
-                    )
-                else:
-                    # ARM conv2d spatial pack schedule.
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
-                        name="conv2d_nchw_spatial_pack.arm_cpu",
-                        plevel=10,
-                    )
-
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.x86.conv2d_nchw),
-                        wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
-                        name="conv2d_nchw.x86",
-                    )
-
-                # check if winograd algorithm is applicable
-                _, _, kh, kw = get_const_tuple(kernel.shape)
-                pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
-                is_winograd_applicable = (
-                    "float" in data.dtype
-                    and "custom" not in data.dtype
-                    and "float" in kernel.dtype
-                    and "custom" not in kernel.dtype
-                    and kh == 3
-                    and kw == 3
-                    and stride_h == 1
-                    and stride_w == 1
-                    and dilation_h == 1
-                    and dilation_w == 1
-                )
-                if is_winograd_applicable:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
-                        name="conv2d_nchw_winograd.arm_cpu",
-                        plevel=5,
-                    )
-                    if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack),
-                            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack),
-                            name="conv2d_nchw_winograd_nnpack.arm_cpu",
-                            plevel=15,
-                        )
-            elif re.match(r"OIHW\d*o", kernel_layout):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.arm_cpu",
-                )
-            else:
-                raise RuntimeError(f"Unsupported weight layout {kernel_layout} for conv2d NCHW")
-        elif layout == "HWCN":
-            assert kernel_layout == "HWIO"
-            logger.warning("conv2d_hwcn is not optimized for arm cpu.")
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
-                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
-                name="conv2d_hwcn.generic",
-            )
-        elif layout == "NHWC":
-            data_width_padding = _get_padding_width(padding)
-            if (
-                target.features.has_dsp
-                and dilation_w == dilation_h == 1
-                and kernel_layout == "OHWI"
-                # Check SIMD alignment
-                and _is_simd_aligned(data.dtype, data.shape[2:], padding=(data_width_padding, 0))
-                and _is_simd_aligned(kernel.dtype, kernel.shape[2:])
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_ohwi_dsp, need_out_layout=True),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_ohwi_dsp),
-                    name="conv2d_nhwc_ohwi_dsp.arm_cpu",
-                )
-            elif target.features.has_dsp and kernel_layout == "HWOI":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp),
-                    name="conv2d_nhwc_dsp.arm_cpu",
-                )
-            elif kernel_layout == "HWIO":
-                is_aarch64 = target.features.is_aarch64
-                has_dot_prod = target.features.has_dotprod
-                has_matmul_i8 = target.features.has_matmul_i8
-                interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved
-                interleaved_schedule = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved
-                native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native
-                native_schedule = topi.arm_cpu.schedule_conv2d_NHWC_quantized_native
-                # Quantized cases
-                if is_aarch64 and data.dtype in ["int8", "uint8"]:
-                    if has_matmul_i8 and has_dot_prod:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(interleaved_compute),
-                            wrap_topi_schedule(interleaved_schedule),
-                            name="conv2d_NHWC_quantized_interleaved.arm_cpu",
-                        )
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(native_compute),
-                            wrap_topi_schedule(native_schedule),
-                            name="conv2d_NHWC_quantized_native.arm_cpu",
-                        )
-                    elif has_matmul_i8:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(interleaved_compute),
-                            wrap_topi_schedule(interleaved_schedule),
-                            name="conv2d_NHWC_quantized_interleaved.arm_cpu",
-                        )
-                    elif has_dot_prod:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(native_compute),
-                            wrap_topi_schedule(native_schedule),
-                            name="conv2d_NHWC_quantized_native.arm_cpu",
-                        )
-                    else:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(interleaved_compute),
-                            wrap_topi_schedule(interleaved_schedule),
-                            name="conv2d_NHWC_quantized_interleaved.arm_cpu",
-                        )
-                # Non-quantized cases
-                if is_aarch64 and data.dtype in ["float32", "float16"]:
-                    if (
-                        target.features.has_sme
-                        and kernel.dtype == data.dtype
-                        and out_type.dtype == "float32"
-                        and data_shape[0] == 1
-                        # The schedule uses tensorization which does not work when the
-                        # reduction axis of the gemm has unit iters. See
-                        # https://github.com/apache/tvm/issues/16566
-                        and (data_shape[3] * kernel_shape[0] * kernel_shape[1]) > 1
-                    ):
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_hybrid_SME),
-                            lambda: None,
-                            name="conv2d_NHWC_hybrid_SME.arm_cpu",
-                            plevel=12,
-                        )
-                    if target.features.has_sve:
-                        # This strategy is currently suboptimal because of LLVM's limited support
-                        # for scalable vector alias analysis, which causes redundant loads / stores
-                        # to remain after LLVM's optimisation passes, unlike the non-scalable case.
-                        # Hence, it is given a lower priority level until these issues are resolved.
-                        # Last checked manually using: LLVM 18.1.0
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_hybrid_SVE),
-                            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_hybrid_SVE),
-                            name="conv2d_NHWC_hybrid_SVE.arm_cpu",
-                            plevel=5,
-                        )
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_hybrid),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_hybrid),
-                        name="conv2d_NHWC_hybrid.arm_cpu",
-                    )
-                if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]):
-                    # TODO(@giuseros)
-                    # This strategy errors out for quantized data types when tuning.
-                    # Let's use this only for non-aarch64 or non-quantized cases
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
-                        name="conv2d_nhwc_spatial_pack.arm_cpu",
-                        plevel=5,
-                    )
-            else:
-                raise RuntimeError(f"Unsupported kernel layout {kernel_layout} for conv2d NHWC")
-
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout} for arm cpu")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout)
-            if kernel_layout == "OIHW":
-                data_width_padding = _get_padding_width(padding)
-                if (
-                    target.features.has_dsp
-                    and dilation_w == dilation_h == 1
-                    and _is_simd_aligned(data.dtype, data.shape[3:], padding=(data_width_padding,))
-                    and _is_simd_aligned(kernel.dtype, kernel.shape[3:])
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(
-                            topi.arm_cpu.depthwise_conv2d_nchw_oihw_dsp, need_out_layout=True
-                        ),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_oihw_dsp),
-                        name="depthwise_conv2d_nchw_oihw_dsp.arm_cpu",
-                    )
-                else:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
-                        wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
-                        name="depthwise_conv2d_nchw.arm_cpu",
-                    )
-
-            # TODO:
-            # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
-            # Let us comment it out but not remove.
-            # see discussion:
-            # https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088
-            # strategy.add_implementation(
-            #     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack),
-            #     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack),
-            #     name="depthwise_conv2d_nchw_spatial_pack.arm_cpu",
-            #     plevel=15)
-
-            # Intel x86 depthwise conv2d schedule.
-            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
-            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
-                    wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
-                    name="depthwise_conv2d_nchw.x86",
-                )
-        elif layout == "NHWC":
-            if kernel_layout != "HWOI":
-                logger.warning(
-                    """
-                    depthwise_conv2d with layout NHWC and HWOI
-                    kernel layout is not optimized for arm_cpu target.
-                    """
-                )
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
-                    wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
-                    name="depthwise_conv2d_nhwc.generic",
-                )
-
-            elif target.features.is_aarch64 and target.features.has_asimd:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
-                    name="depthwise_conv2d_nhwc.arm_cpu",
-                )
-
-            # Optimized special case depthwiseConv2D operation. Requires NHWC layout,
-            # a HWOI kernel layout (which we rearrange to a custom layout) no dilation,
-            # int8/16 inputs, int32 output, and the same number of input and output channels.
-            # The int8 implementation DOES need the DSP unit (for SXTB16), but it is not
-            # possible to use the DSP unit to speed up a NHWC depthwise convolution (though
-            # an NCHW convolution would benefit).
-            elif (
-                dilation_w == dilation_h == 1
-                and kernel.shape[3] == 1  # channel_multiplier == 1
-                and out_type.dtype == "int32"
-                and (
-                    (data.shape[3] % 4 == 0 and data.dtype == "int8" and target.features.has_dsp)
-                    or (data.shape[3] % 2 == 0 and data.dtype == "int16")
-                )
-                and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
-                and target.kind.name == "c"
-                # Ideally we should check that kernel is a Relay constant, but strategy functions
-                # don't have access to the data needed to check this.
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc_dsp),
-                    name="depthwise_conv2d_nhwc_dsp.arm_cpu",
-                )
-
-            else:
-                logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.")
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
-                    wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
-                    name="depthwise_conv2d_nhwc.generic",
-                )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout} for arm cpu")
-    else:  # group_conv2d
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.arm_cpu",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            logger.warning("group_conv2d with layout NHWC is not optimized for arm cpu.")
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                name="group_conv2d_nhwc.generic",
-            )
-        else:
-            raise RuntimeError(f"Unsupported group_conv2d layout {layout} for arm cpu")
-    return strategy
-
-
-@conv2d_NCHWc_strategy.register("arm_cpu")
-def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d_NCHWc adopted from x86"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
-        strategy.add_implementation(
-            wrap_compute_conv2d(
-                topi.arm_cpu.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
-            ),
-            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NCHWc_int8),
-            name="conv2d_NCHWc_int8.arm_cpu",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
-            wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
-            name="conv2d_NCHWc.x86",
-        )
-    return strategy
-
-
-@depthwise_conv2d_NCHWc_strategy.register("arm_cpu")
-def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """depthwise_conv2d_NCHWc adopted from x86"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(
-            topi.x86.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
-        ),
-        wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc),
-        name="depthwise_conv2d_NCHWc.x86",
-    )
-    return strategy
-
-
-def wrap_compute_conv2d_winograd_nnpack(topi_compute):
-    """wrap topi compute for conv2d_winograd NNPack"""
-
-    def _compute_conv2d_nnpack(attrs, inputs, out_type):
-        padding = attrs.get_int_tuple("padding")
-        strides = attrs.get_int_tuple("strides")
-        dilation = attrs.get_int_tuple("dilation")
-        out_dtype = attrs.get_str("out_dtype")
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        return [topi_compute(inputs[0], inputs[1], None, strides, padding, dilation, out_dtype)]
-
-    return _compute_conv2d_nnpack
-
-
-@conv2d_winograd_without_weight_transform_strategy.register("arm_cpu")
-def conv2d_winograd_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform arm cpu strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    strides = attrs.get_int_tuple("strides")
-    kernel = inputs[1]
-    assert dilation == (1, 1), "Do not support dilate now"
-    assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    if layout == "NCHW":
-        if len(kernel.shape) == 5:
-            pad_kh, pad_kw, _, _, _ = get_const_tuple(inputs[1].shape)
-            tile_size = attrs.get_int("tile_size")
-            kh = pad_kh - tile_size + 1
-            kw = pad_kw - tile_size + 1
-            assert kh == 3 and kw == 3
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
-                wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
-                name="conv2d_nchw_winograd.arm_cpu",
-            )
-        elif len(kernel.shape) == 4:
-            # kernel must be packed by winograd nnpack
-            assert "nnpack" in target.libs
-            strategy.add_implementation(
-                wrap_compute_conv2d_winograd_nnpack(
-                    topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform
-                ),
-                wrap_topi_schedule(
-                    topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform
-                ),
-                name="conv2d_nchw_winograd_nnpack_withou_weight_transform.arm_cpu",
-                plevel=15,
-            )
-        else:
-            raise RuntimeError(f"Unsupported kernel shape: {kernel.shape}")
-    else:
-        raise RuntimeError(f"Unsupported conv2d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-def wrap_compute_conv2d_gemm(topi_compute):
-    """wrap topi compute for conv2d_gemm"""
-
-    def _compute_conv2d_gemm(attrs, inputs, out_type):
-        padding = attrs.get_int_tuple("padding")
-        strides = attrs.get_int_tuple("strides")
-        dilation = attrs.get_int_tuple("dilation")
-        out_dtype = attrs.get_str("out_dtype")
-        channels = attrs["channels"]
-        kernel_size = attrs["kernel_size"]
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        return [
-            topi_compute(
-                inputs[0], inputs[1], strides, padding, dilation, out_dtype, kernel_size, channels
-            )
-        ]
-
-    return _compute_conv2d_gemm
-
-
-@conv2d_gemm_without_weight_transform_strategy.register("arm_cpu")
-def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform arm cpu strategy"""
-    layout = attrs.data_layout
-    data = inputs[0]
-    kernel = inputs[1]
-    strategy = _op.OpStrategy()
-    is_aarch64 = target.features.is_aarch64
-    has_dot_prod = target.features.has_dotprod
-    has_matmul_i8 = target.features.has_matmul_i8
-
-    interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform
-    interleaved_schedule = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved_without_transform
-    native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform
-    native_schedule = topi.arm_cpu.schedule_conv2d_NHWC_quantized_native_without_transform
-    if layout == "NHWC" and data.dtype in ["int8", "uint8", "float32", "float16"]:
-        # Non-AArch64 cases
-        if not is_aarch64:
-            raise RuntimeError("Unsupported non-AArch64 conv2d_NHWC_without_transform")
-        # AArch64 cases
-        if data.dtype in ["int8", "uint8"]:
-            # Quantized cases
-            if has_matmul_i8 and has_dot_prod:
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(interleaved_compute),
-                    wrap_topi_schedule(interleaved_schedule),
-                    name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-                )
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(native_compute),
-                    wrap_topi_schedule(native_schedule),
-                    name="conv2d_NHWC_quantized_native_without_transform.arm_cpu",
-                )
-            elif has_matmul_i8:
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(interleaved_compute),
-                    wrap_topi_schedule(interleaved_schedule),
-                    name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-                )
-            elif has_dot_prod:
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(native_compute),
-                    wrap_topi_schedule(native_schedule),
-                    name="conv2d_NHWC_quantized_native_without_transform.arm_cpu",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(interleaved_compute),
-                    wrap_topi_schedule(interleaved_schedule),
-                    name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-                )
-        # Non-quantized cases
-        elif data.dtype in ["float32", "float16"]:
-            # The SME schedule for float16->float32 prearranges the two matrices to be multiplied
-            # using the ARM_SME_BLOCK2_2SVLx1SVL_FP16_TRANSPOSE_INTERLEAVE intrinsic which expects
-            # the reduction axis K as the second dimension of the matrix (i.e. shape = (_, K)).
-            # This means that the flattened weights matrix B needs to be transposed to (N, K).
-            if (
-                target.features.has_sme
-                and kernel.dtype == "float16"
-                and data.dtype == "float16"
-                and out_type.dtype == "float32"
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_SME_transposed_B),
-                    lambda: None,
-                    name="conv2d_NHWC_hybrid_SME_transposed_B.arm_cpu",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d_gemm(
-                        topi.arm_cpu.compute_conv2d_NHWC_hybrid_without_transform
-                    ),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_hybrid_without_transform),
-                    name="conv2d_NHWC_hybrid_without_transform.arm_cpu",
-                )
-    else:
-        raise RuntimeError(
-            f"Unsupported conv2d_NHWC_without_transform layout {layout}"
-            f"with datatype {data.dtype}"
-        )
-    return strategy
-
-
-@conv2d_transpose_strategy.register("arm_cpu")
-def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d_transpose arm cpu strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCHW", "only support nchw for now"
-    assert dilation == (1, 1), "not support dilate now"
-    assert groups == 1, "only support groups == 1 for now"
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw),
-        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw),
-        name="conv2d_tranpose_nchw.arm_cpu",
-    )
-    return strategy
-
-
-@bitserial_conv2d_strategy.register("arm_cpu")
-def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """bitserial_conv2d x86 strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw),
-            wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw),
-            name="bitserial_conv2d_nchw.arm_cpu",
-        )
-    elif layout == "NHWC":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.arm_cpu.bitserial_conv2d_nhwc),
-            wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_conv2d_nhwc),
-            name="bitserial_conv2d_nhwc.arm_cpu",
-        )
-    else:
-        raise ValueError(f"Data layout {layout} not supported.")
-    return strategy
-
-
-@bitserial_dense_strategy.register("arm_cpu")
-def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
-    """bitserial_dense arm cpu strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense),
-        wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense),
-        name="bitserial_dense.arm_cpu",
-    )
-    return strategy
-
-
-@dense_strategy.register(["arm_cpu"])
-def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
-    """dense arm cpu strategy"""
-    strategy = _op.OpStrategy()
-    data, weight = inputs
-
-    if target.features.has_dsp and data.dtype in ["int8", "int16"]:
-        strategy.add_implementation(
-            wrap_compute_dense(topi.arm_cpu.dense_dsp),
-            wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
-            name="dense_dsp.arm_cpu",
-        )
-        return strategy
-
-    # For dynamic matrix-vector multiply we use a hand written kernel.
-    if (
-        isinstance(inputs[0].shape[0], (int, tir.IntImm))
-        and inputs[0].shape[0] == 1
-        and (
-            topi.utils.is_dynamic_shape(inputs[0].shape)
-            or topi.utils.is_dynamic_shape(inputs[1].shape)
-        )
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.x86.dense_dynamic),
-            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
-            name="dense_dynamic.x86",
-            plevel=20,
-        )
-        return strategy
-
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-    if need_auto_scheduler_layout or need_meta_schedule_layout:
-        strategy.add_implementation(
-            wrap_compute_dense(
-                topi.nn.dense,
-                need_auto_scheduler_layout=need_auto_scheduler_layout,
-                need_meta_schedule_layout=need_meta_schedule_layout,
-            ),
-            naive_schedule,
-            name="dense.generic",
-            plevel=11,
-        )
-
-    if (
-        target.features.has_sme
-        and data.dtype in ["float32", "float16"]
-        and weight.dtype == data.dtype
-        and out_type.dtype == "float32"
-        # The schedule uses tensorization which does not work when the
-        # reduction axis has unit iters. See
-        # https://github.com/apache/tvm/issues/16566
-        and data.shape[1] > 1
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.arm_cpu.compute_matmul_sme),
-            lambda: None,
-            name="matmul.arm_cpu.sme",
-            plevel=12,
-        )
-
-    if (
-        target.features.is_aarch64
-        and data.dtype in ["float16", "float32"]
-        and weight.dtype in ["float16", "float32"]
-        and out_type.dtype in ["float16", "float32"]
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.arm_cpu.dense_gemm),
-            wrap_topi_schedule(topi.arm_cpu.schedule_dense_gemm),
-            name="dense_gemm.arm_cpu",
-            plevel=11,
-        )
-    # Fallback to x86 schedules as there is currently no arm_cpu schedule for dense
-    strategy.add_implementation(
-        wrap_compute_dense(topi.x86.dense_nopack),
-        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
-        name="dense_nopack.x86",
-        plevel=5,
-    )
-    strategy.add_implementation(
-        wrap_compute_dense(topi.x86.dense_pack),
-        wrap_topi_schedule(topi.x86.schedule_dense_pack),
-        name="dense_pack.x86",
-        plevel=10,
-    )
-
-    return strategy
-
-
-@matmul_strategy.register("arm_cpu")
-def matmul_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """matmul arm cpu strategy"""
-    strategy = _op.OpStrategy()
-    data, weight = inputs
-
-    if (
-        target.features.has_sme
-        and data.dtype in ["float32", "float16"]
-        and weight.dtype == data.dtype
-        and out_type.dtype == "float32"
-        and not attrs.transpose_a
-        and not (data.dtype == "float16" and not attrs.transpose_b)
-        and not (data.dtype == "float32" and attrs.transpose_b)
-        and len(data.shape) == 2
-        # The schedule uses tensorization which does not work when the
-        # reduction axis has unit iters. See
-        # https://github.com/apache/tvm/issues/16566
-        and data.shape[1] > 1
-    ):
-        # Ideally we should check that weight is a Relay constant, but strategy functions
-        # don't have access to the data needed to check this.
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.arm_cpu.compute_matmul_sme),
-            lambda: None,
-            name="matmul.arm_cpu.sme",
-        )
-    elif (
-        target.features.is_aarch64
-        and data.dtype in ["float16", "float32"]
-        and weight.dtype in ["float16", "float32"]
-        and out_type.dtype in ["float16", "float32"]
-        and not (attrs.transpose_a or attrs.transpose_b)
-        and len(data.shape) == 2
-    ):
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.arm_cpu.dense_gemm),
-            wrap_topi_schedule(topi.arm_cpu.schedule_dense_gemm),
-            name="matmul.arm_cpu.neon",
-        )
-        return strategy
-
-    logger.warning("matmul is not optimized for arm cpu.")
-    strategy.add_implementation(
-        wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.generic"
-    )
-    return strategy
-
-
-@conv1d_strategy.register("arm_cpu")
-def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv1d strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    dilation = get_const_tuple(attrs.dilation)
-    if dilation[0] < 1:
-        raise ValueError("dilation should be a positive value")
-
-    if kernel_layout == "WOI":
-        if layout == "NWC" and target.features.has_dsp:
-            strategy.add_implementation(
-                wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp),
-                wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp),
-                name="conv1d_dsp.arm_cpu",
-            )
-        else:
-            raise RuntimeError(
-                f"Unsupported kernel layout {kernel_layout} for conv1d {layout} for arm cpu."
-            )
-    elif layout == "NCW":
-        logger.warning("conv1d with layout %s is not optimized for arm cpu.", layout)
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.conv1d_ncw),
-            wrap_topi_schedule(topi.generic.schedule_conv1d_ncw),
-            name="conv1d_ncw.generic",
-        )
-    elif layout == "NWC":
-        logger.warning("conv1d with layout %s is not optimized for arm cpu.", layout)
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.conv1d_nwc),
-            wrap_topi_schedule(topi.generic.schedule_conv1d_nwc),
-            name="conv1d_nwc.generic",
-        )
-    else:
-        raise RuntimeError(
-            f"Unsupported kernel layout {kernel_layout} for conv1d {layout} for arm cpu."
-        )
-    return strategy
-
-
-def arm_cpu_tir_strategy(sch: tir.Schedule) -> bool:
-    """
-    Strategy for arm_cpu STIR schedules.
-    """
-    matmul_block = None
-    if has_block(sch, "T_matmul_NN"):
-        matmul_block = sch.get_block("T_matmul_NN")
-    elif has_block(sch, "T_matmul_NT"):
-        matmul_block = sch.get_block("T_matmul_NT")
-
-    if matmul_block and sch.get(matmul_block).annotations.get("schedule_type", "") == "sme":
-        topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch)
-        return True
-    elif has_block(sch, "conv2d_gemm_output"):
-        conv2d_block = sch.get_block("conv2d_gemm_output")
-        auto_inline_consumers(sch, conv2d_block)
-        topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR(sch)
-        return True
-
-    # Fallback to TE schedule for operators we have not written a special TIR schedule for
-    return False
diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py
deleted file mode 100644
index f437aa15f6a0..000000000000
--- a/python/tvm/relay/op/strategy/bifrost.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of bifrost operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-import re
-from tvm import topi
-from .generic import *
-from .. import op as _op
-
-
-@conv2d_strategy.register("bifrost")
-def conv2d_strategy_bifrost(attrs, inputs, out_type, target):
-    """conv2d mali(bifrost) strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
-    stride_h, stride_w = attrs.get_int_tuple("strides")
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if layout == "NCHW":
-            if kernel_layout == "OIHW":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.bifrost",
-                )
-
-                _, _, kh, kw = get_const_tuple(kernel.shape)
-                if (
-                    kh == 3
-                    and kw == 3
-                    and stride_h == 1
-                    and stride_w == 1
-                    and dilation_h == 1
-                    and dilation_w == 1
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd),
-                        wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd),
-                        name="conv2d_nchw_winograd.bifrost",
-                        plevel=5,
-                    )
-            elif re.match(r"OIHW\d*o", kernel_layout):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.bifrost",
-                )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            # For now just reuse general Mali strategy.
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.mali.conv2d_nhwc_spatial_pack),
-                wrap_topi_schedule(topi.mali.schedule_conv2d_nhwc_spatial_pack),
-                name="conv2d_nhwc_spatial_pack.bifrost",
-            )
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout} for Mali(Bifrost)")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.bifrost",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            # For now just reuse general Mali strategy.
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc),
-                wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_conv2d_nchw.bifrost",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout} for Mali(Bifrost)")
-    else:  # group_conv2d
-        raise RuntimeError("group_conv2d is not supported for Mali(Bifrost)")
-    return strategy
-
-
-@conv2d_winograd_without_weight_transform_strategy.register("bifrost")
-def conv2d_winograd_without_weight_transform_strategy_bifrost(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform mali(bifrost) strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    strides = attrs.get_int_tuple("strides")
-    assert dilation == (1, 1), "Do not support dilate now"
-    assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd),
-            wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd),
-            name="conv2d_nchw_winograd.bifrost",
-        )
-    else:
-        raise RuntimeError(f"Unsupported conv2d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-@dense_strategy.register("bifrost")
-def dense_strategy_bifrost(attrs, inputs, out_type, target):
-    """dense mali(bifrost) strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dense(topi.bifrost.dense),
-        wrap_topi_schedule(topi.bifrost.schedule_dense),
-        name="dense.bifrost",
-    )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
deleted file mode 100644
index 1fd806b7cf5c..000000000000
--- a/python/tvm/relay/op/strategy/cuda.py
+++ /dev/null
@@ -1,1400 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of CUDA/GPU operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-from tvm import topi
-from tvm.auto_scheduler import is_auto_scheduler_enabled
-from tvm.contrib import nvcc
-from tvm.contrib.thrust import can_use_thrust
-from tvm.meta_schedule import is_meta_schedule_enabled
-from tvm.te import SpecializedCondition
-
-from ....target import Target
-from ....tir import IntImm
-from .. import op as _op
-from .generic import *
-
-
-@schedule_injective.register(["cuda", "gpu"])
-def schedule_injective_cuda(attrs, outs, target):
-    """schedule injective ops for cuda"""
-    with target:
-        return topi.cuda.schedule_injective(outs)
-
-
-@schedule_reduce.register(["cuda", "gpu"])
-def schedule_reduce_cuda(attrs, outs, target):
-    """schedule reduction ops for cuda"""
-    with target:
-        return topi.cuda.schedule_reduce(outs)
-
-
-@concatenate_strategy.register(["cuda", "gpu"])
-def concatenate_strategy_cuda(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_concat(topi.transform.concatenate),
-        wrap_topi_schedule(topi.cuda.schedule_injective),
-        name="concatenate.cuda",
-    )
-    return strategy
-
-
-@schedule_pool.register(["cuda", "gpu"])
-def schedule_pool_cuda(attrs, outs, target):
-    """schedule pooling ops for cuda"""
-    with target:
-        return topi.cuda.schedule_pool(outs, attrs.layout)
-
-
-@schedule_pool_grad.register(["cuda", "gpu"])
-def schedule_pool_grad_cuda(attrs, outs, target):
-    """schedule pooling gradient ops for cuda"""
-    with target:
-        return topi.cuda.schedule_pool_grad(outs)
-
-
-@schedule_adaptive_pool.register(["cuda", "gpu"])
-def schedule_adaptive_pool_cuda(attrs, outs, target):
-    """schedule adaptive pooling ops for cuda"""
-    with target:
-        return topi.cuda.schedule_adaptive_pool(outs, attrs.layout)
-
-
-@softmax_strategy.register(["cuda", "gpu"])
-def softmax_strategy_cuda(attrs, inputs, out_type, target):
-    """softmax cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(topi.cuda.schedule_softmax),
-        name="softmax.cuda",
-    )
-    if target.kind.name == "cuda" and "cudnn" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_softmax(topi.cuda.softmax_cudnn),
-            wrap_topi_schedule(topi.cuda.schedule_softmax_cudnn),
-            name="softmax.cudnn",
-            plevel=15,
-        )
-    return strategy
-
-
-@fast_softmax_strategy.register(["cuda", "gpu"])
-def fast_softmax_strategy_cuda(attrs, inputs, out_type, target):
-    """fast_softmax cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.fast_softmax),
-        wrap_topi_schedule(topi.cuda.schedule_softmax),
-        name="fast_softmax.cuda",
-    )
-    return strategy
-
-
-@log_softmax_strategy.register(["cuda", "gpu"])
-def log_softmax_strategy_cuda(attrs, inputs, out_type, target):
-    """log_softmax cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.log_softmax),
-        wrap_topi_schedule(topi.cuda.schedule_softmax),
-        name="log_softmax.cuda",
-    )
-    if target.kind.name == "cuda" and "cudnn" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_softmax(topi.cuda.log_softmax_cudnn),
-            wrap_topi_schedule(topi.cuda.schedule_log_softmax_cudnn),
-            name="log_softmax.cudnn",
-            plevel=15,
-        )
-    return strategy
-
-
-@schedule_lrn.register(["cuda", "gpu"])
-def schedule_lrn_cuda(attrs, outs, target):
-    """schedule LRN for cuda"""
-    with target:
-        return topi.cuda.schedule_lrn(outs)
-
-
-@conv2d_strategy.register(["cuda", "gpu"])
-def conv2d_strategy_cuda(attrs, inputs, out_type, target):
-    """conv2d cuda strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    stride_h, stride_w = attrs.get_int_tuple("strides")
-    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
-    padding = attrs.get_int_tuple("padding")
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-    if groups == 1:
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            if (
-                (target.kind.name in ["cuda", "vulkan", "rocm"])
-                and data.dtype in ("int8", "uint8")
-                and kernel.dtype in ("int8", "uint8")
-            ):
-                assert data.dtype == kernel.dtype
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
-                    name="conv2d_nchw_int8.cuda",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
-                    name="conv2d_nchw.cuda",
-                )
-            N, _, H, W = get_const_tuple(data.shape)
-            CO, CI, KH, KW = get_const_tuple(kernel.shape)
-            (_, _, judge_winograd_auto_scheduler) = judge_winograd(
-                N,
-                H,
-                W,
-                KH,
-                KW,
-                CI,
-                CO,
-                padding,
-                stride_h,
-                stride_w,
-                dilation_h,
-                dilation_w,
-                data.dtype,
-                kernel.dtype,
-                pre_flag=False,
-            )
-            if is_meta_schedule_enabled() and judge_winograd_auto_scheduler:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nchw),
-                    naive_schedule,  # this implementation should never be picked by autotvm
-                    name="conv2d_nchw_winograd.cuda",
-                    plevel=15,
-                )
-            elif (
-                (2 < KH < 8 and 2 < KW < 8 and KH == KW)
-                and (stride_h == 1 and stride_w == 1)
-                and (dilation_h == 1 and dilation_w == 1)
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
-                    name="conv2d_nchw_winograd.cuda",
-                    plevel=5,
-                )
-        elif layout == "HWCN":
-            assert kernel_layout == "HWIO"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
-                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
-                name="conv2d_hwcn.cuda",
-            )
-        elif layout == "NHWC" and kernel_layout == "HWIO":
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
-                wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.gpu",
-            )
-
-            N, H, W, _ = get_const_tuple(data.shape)
-            KH, KW, CI, CO = get_const_tuple(kernel.shape)
-            # Winograd shape related judgment
-            (
-                judge_winograd_tensorcore,
-                judge_winograd_autotvm,
-                judge_winograd_auto_scheduler,
-            ) = judge_winograd(
-                N,
-                H,
-                W,
-                KH,
-                KW,
-                CI,
-                CO,
-                padding,
-                stride_h,
-                stride_w,
-                dilation_h,
-                dilation_w,
-                data.dtype,
-                kernel.dtype,
-                pre_flag=False,
-            )
-            if judge_winograd_autotvm:
-                if (
-                    target.kind.name == "cuda"
-                    and nvcc.have_tensorcore(target=target)
-                    and judge_winograd_tensorcore
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
-                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
-                        name="conv2d_nhwc_winograd_tensorcore.cuda",
-                        plevel=5,
-                    )
-                else:
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
-                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
-                        name="conv2d_nhwc_winograd_direct.cuda",
-                        plevel=5,
-                    )
-            if (
-                target.kind.name == "cuda"
-                and not is_auto_scheduler_enabled()
-                and not is_meta_schedule_enabled()
-                and nvcc.have_tensorcore(target=target)
-                and (
-                    (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
-                    or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
-                    or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
-                )
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
-                    name="conv2d_nhwc_tensorcore.cuda",
-                    plevel=20,
-                )
-
-            # register auto-scheduler implementations
-            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
-                    naive_schedule,  # this implementation should never be picked by autotvm
-                    name="conv2d_nhwc.winograd",
-                    plevel=15,
-                )
-            # register meta-schedule implementations
-            if is_meta_schedule_enabled() and judge_winograd_auto_scheduler:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
-                    naive_schedule,  # this implementation should never be picked by autotvm
-                    name="conv2d_nhwc.winograd",
-                    plevel=15,
-                )
-
-        elif layout == "HWNC":
-            assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"]
-            _, _, N, in_channels = get_const_tuple(data.shape)
-            pre_computed = len(kernel.shape) == 6
-            if pre_computed:
-                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape)
-                out_channels = oc_chunk * oc_block_factor
-            else:
-                _, _, out_channels, _ = get_const_tuple(kernel.shape)
-
-            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
-            if (
-                target.kind.name == "cuda"
-                and nvcc.have_tensorcore(target=target)
-                and kernel.dtype in tensorcore_dtypes
-                and (
-                    (
-                        data.dtype in ["int4", "uint4"]
-                        and N % 8 == 0
-                        and in_channels % 32 == 0
-                        and out_channels % 8 == 0
-                    )
-                    or (
-                        data.dtype in ["int8", "uint8"]
-                        and N % 8 == 0
-                        and in_channels % 16 == 0
-                        and out_channels % 32 == 0
-                    )
-                )
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore),
-                    name="conv2d_hwnc_tensorcore_direct.cuda",
-                    plevel=20,
-                )
-            else:
-                raise RuntimeError(
-                    "Unsupported shape for conv2d HWNC.\
-                                    Need to satisfy tensor core schedule."
-                )
-        elif (
-            (target.kind.name in ["cuda", "vulkan", "rocm"])
-            and layout == "NCHW4c"
-            and data.dtype in ["int8", "uint8"]
-        ):
-            assert kernel_layout == "OIHW4o4i"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, need_data_layout=True),
-                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
-                name="conv2d_NCHWc_int8.cuda",
-            )
-        elif is_auto_scheduler_enabled() or is_meta_schedule_enabled():
-            strategy.add_implementation(
-                wrap_compute_conv2d(
-                    topi.nn.conv, need_data_layout=True, need_kernel_layout=True, has_groups=True
-                ),
-                naive_schedule,
-                name="conv2d.cuda",
-                plevel=15,
-            )
-        elif target.kind.name == "cuda" and "cudnn" not in target.libs:
-            # No TVM native kernel applicable
-            raise RuntimeError(f"Unsupported conv2d layout {layout} for CUDA")
-
-        if (
-            target.kind.name == "cuda"
-            and "cudnn" in target.libs
-            and layout in ["NCHW", "NHWC"]
-            and padding[0] == padding[2]
-            and padding[1] == padding[3]
-            and not (data.dtype in ["uint8", "int8"] or kernel.dtype in ["uint8", "int8"])
-        ):
-            # add cudnn implementation
-            if layout == "NHWC":
-                assert kernel_layout == "OHWI"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True),
-                wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
-                name="conv2d_cudnn.cuda",
-                plevel=25,
-            )
-
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups) and (
-        layout == "NCHW" or "cudnn" not in target.libs
-    ):  # cuDNN requires a different kernel layout for NHWC inputs.
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.cuda",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_conv2d_nhwc.cuda",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout}")
-    else:  # group_conv2d
-        # add cudnn implementation, if any
-        cudnn_impl = False
-        if target.kind.name == "cuda" and "cudnn" in target.libs:
-            if (
-                layout in ["NCHW", "NHWC"]
-                and padding[0] == padding[2]
-                and padding[1] == padding[3]
-                and not (data.dtype in ["uint8", "int8"] or kernel.dtype in ["uint8", "int8"])
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(
-                        topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True
-                    ),
-                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
-                    name="conv2d_cudnn.cuda",
-                    plevel=25,
-                )
-                cudnn_impl = True
-
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            _, channels, _, _ = get_const_tuple(data.shape)
-            out_channels, in_channels, _, _ = get_const_tuple(kernel.shape)
-            oc_chunk = out_channels // 4
-            ic_chunk = in_channels // 4
-
-            if (
-                (target.kind.name in ["cuda", "vulkan", "rocm"])
-                and data.dtype in ["int8", "uint8"]
-                and kernel.dtype in ["int8", "uint8"]
-                and channels % groups == 0
-                and out_channels % groups == 0
-                and channels % 4 == 0
-                and out_channels % 4 == 0
-                and groups <= oc_chunk
-                and groups <= ic_chunk
-            ):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8, has_groups=True),
-                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw_int8),
-                    name="group_conv2d_nchw_int8.cuda",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
-                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
-                    name="group_conv2d_nchw.cuda",
-                )
-        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
-            assert kernel_layout == "OIHW4o4i"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, has_groups=True),
-                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
-                name="group_conv2d_NCHWc_int8.cuda",
-            )
-        elif not cudnn_impl:
-            raise RuntimeError(f"Unsupported group_conv2d layout {layout}")
-    return strategy
-
-
-def judge_winograd(
-    N,
-    H,
-    W,
-    KH,
-    KW,
-    CI,
-    CO,
-    padding,
-    stride_h,
-    stride_w,
-    dilation_h,
-    dilation_w,
-    data_dtype,
-    kernel_dtype,
-    pre_flag,
-):
-    """Winograd judgement about tensorcore and shape"""
-    if H % 8 == 0:
-        tile_size = 4
-    else:
-        tile_size = 2
-    if pre_flag:
-        alpha = KH
-        KH = KW = alpha + 1 - tile_size
-    pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (KH, KW))
-    OH = (H + pt + pb - KH) // stride_h + 1
-    OW = (W + pl + pr - KW) // stride_w + 1
-    nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size
-    if not isinstance(N, int):
-        return False, False, False
-    P = N * nH * nW
-
-    judge_winograd_tensorcore = (
-        (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
-        or (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
-        or (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
-    )
-
-    judge_winograd_autotvm = (
-        2 < KH < 8
-        and 2 < KW < 8
-        and KH == KW
-        and stride_h == 1
-        and stride_w == 1
-        and dilation_h == 1
-        and dilation_w == 1
-    )
-
-    judge_winograd_auto_scheduler = (
-        ("float" in data_dtype and "float" in kernel_dtype)
-        and (KH == 3 and KW == 3)
-        and (stride_h == 1 and stride_w == 1)
-        and (dilation_h == 1 and dilation_w == 1)
-    )
-
-    return judge_winograd_tensorcore, judge_winograd_autotvm, judge_winograd_auto_scheduler
-
-
-@conv2d_winograd_without_weight_transform_strategy.register(["cuda", "gpu"])
-def conv2d_winograd_without_weight_transform_strategy_cuda(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform cuda strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    data, kernel = inputs
-    stride_h, stride_w = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    assert dilation == (1, 1), "Do not support dilate now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    if layout == "NCHW":
-        if is_meta_schedule_enabled():
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_winograd_nchw_without_weight_transform),
-                naive_schedule,  # this implementation should never be picked by autotvm
-                name="conv2d_nchw_winograd_without_weight_transform",
-                plevel=15,
-            )
-        else:
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
-                wrap_topi_schedule(
-                    topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform
-                ),
-                name="conv2d_nchw_winograd_without_weight_transform.cuda",
-            )
-    elif layout == "NHWC":
-        N, H, W, _ = get_const_tuple(data.shape)
-        alpha, _, CI, CO = get_const_tuple(kernel.shape)
-        dilation_h, dilation_w = dilation
-        judge_winograd_tensorcore, _, _ = judge_winograd(
-            N,
-            H,
-            W,
-            alpha,
-            alpha,
-            CI,
-            CO,
-            padding,
-            stride_h,
-            stride_w,
-            dilation_h,
-            dilation_w,
-            data.dtype,
-            kernel.dtype,
-            pre_flag=True,
-        )
-        if (
-            target.kind.name == "cuda"
-            and nvcc.have_tensorcore(target=target)
-            and judge_winograd_tensorcore
-        ):
-            strategy.add_implementation(
-                wrap_compute_conv2d(
-                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform
-                ),
-                wrap_topi_schedule(
-                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
-                ),
-                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
-            )
-        else:
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
-                wrap_topi_schedule(
-                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform
-                ),
-                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
-            )
-
-        if is_auto_scheduler_enabled():
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform),
-                naive_schedule,  # this implementation should never be picked by autotvm
-                name="conv2d_nhwc_winograd_without_weight_transform",
-                plevel=15,
-            )
-        if is_meta_schedule_enabled():
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform),
-                naive_schedule,  # this implementation should never be picked by autotvm
-                name="conv2d_nhwc_winograd_without_weight_transform",
-                plevel=15,
-            )
-    else:
-        raise RuntimeError(f"Unsupported conv2d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-@deformable_conv2d_strategy.register(["cuda", "gpu"])
-def deformable_conv2d_strategy_cuda(attrs, inputs, out_type, target):
-    """deformable_conv2d cuda strategy"""
-    layout = attrs.data_layout
-    strategy = _op.OpStrategy()
-
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_deformable_conv2d(topi.cuda.deformable_conv2d_nchw),
-            wrap_topi_schedule(topi.cuda.schedule_deformable_conv2d_nchw),
-            name="deformable_conv2d_nchw.cuda",
-        )
-    elif layout == "NHWC":
-        # This implementation should never be picked by autotvm
-        strategy.add_implementation(
-            wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nhwc),
-            naive_schedule,
-            name="deformable_conv2d_nhwc.cuda",
-        )
-    else:
-        raise RuntimeError(f"Layout {layout} is not supported in deformable conv2d on CUDA")
-    return strategy
-
-
-@conv2d_backward_weight_strategy.register(["cuda"])
-def conv2d_backward_weight_strategy_cuda(attrs, inputs, out_type, target):
-    """conv2d_backward_weight cuda strategy"""
-    strategy = _op.OpStrategy()
-    if target.kind.name == "cuda" and "cudnn" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_conv2d_backward_weight(topi.cuda.conv2d_backward_weight_cudnn),
-            wrap_topi_schedule(topi.generic.schedule_extern),
-            name="conv2d_backward_weight_strategy.cudnn",
-            plevel=15,
-        )
-    else:
-        raise RuntimeError(
-            "conv2d_backward_weight on cuda is currently only supported with cudnn. "
-            "Please run Legalize pass to decompose this op into supported ops."
-        )
-    return strategy
-
-
-@conv2d_transpose_strategy.register(["cuda", "gpu"])
-def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target):
-    """conv2d_transpose cuda strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert dilation == (1, 1), "not support dilate now"
-    strategy = _op.OpStrategy()
-    num_strategies = 0
-
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw, has_groups=True),
-            wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw),
-            name="conv2d_transpose_nchw.cuda",
-        )
-        num_strategies += 1
-
-    if (
-        target.kind.name == "cuda"
-        and "cudnn" in target.libs
-        and (
-            (layout == "NCHW" and attrs.kernel_layout == "IOHW")
-            or (layout == "NHWC" and attrs.kernel_layout == "IHWO")
-        )
-    ):
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(
-                topi.cuda.conv2d_transpose_cudnn, add_layout=True, has_groups=True
-            ),
-            wrap_topi_schedule(topi.generic.schedule_extern),
-            name="conv2d_transpose.cudnn.cuda",
-            plevel=25,
-        )
-        num_strategies += 1
-
-    # TODO(masahi): Support conv2d_transpose NHWC for non-cudnn path.
-    assert (
-        num_strategies > 0
-    ), f"Unsupported conv2d_transpose workload, layout = {layout}, groups = {groups}"
-    return strategy
-
-
-@conv3d_transpose_strategy.register(["cuda", "gpu"])
-def conv3d_transpose_strategy_cuda(attrs, inputs, out_type, target):
-    """conv3d_transpose cuda strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCDHW", "only support ncdhw for now"
-    assert dilation == (1, 1, 1), "not support dilate now"
-    assert groups == 1, "only support groups == 1 for now"
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv3d_transpose(topi.cuda.conv3d_transpose_ncdhw),
-        wrap_topi_schedule(topi.cuda.schedule_conv3d_transpose_ncdhw),
-        name="conv3d_transpose_ncdhw.cuda",
-    )
-    return strategy
-
-
-@conv3d_strategy.register(["cuda", "gpu"])
-def conv3d_strategy_cuda(attrs, inputs, out_type, target):
-    """conv3d cuda strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    layout = attrs.data_layout
-    _, stride_h, stride_w = attrs.get_int_tuple("strides")
-    _, dilation_h, dilation_w = attrs.get_int_tuple("dilation")
-    assert layout in ["NCDHW", "NDHWC"], f"Not support this layout {layout} yet"
-    if layout == "NCDHW":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.cuda.conv3d_ncdhw),
-            wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw),
-            name="conv3d_ncdhw.cuda",
-            plevel=10,
-        )
-        _, _, _, kh, kw = get_const_tuple(kernel.shape)
-        if (
-            2 < kh < 8
-            and 2 < kw < 8
-            and kh == kw
-            and stride_h == 1
-            and stride_w == 1
-            and dilation_h == 1
-            and dilation_w == 1
-            and attrs["groups"] == 1
-        ):
-            strategy.add_implementation(
-                wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd),
-                wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd),
-                name="conv3d_ncdhw_winograd.cuda",
-                plevel=5,
-            )
-    else:  # layout == "NDHWC":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.cuda.conv3d_ndhwc),
-            wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc),
-            name="conv3d_ndhwc.cuda",
-            plevel=10,
-        )
-        N, _, _, _, _ = get_const_tuple(data.shape)
-        _, _, _, CI, CO = get_const_tuple(kernel.shape)
-        if target.kind.name == "cuda":
-            if nvcc.have_tensorcore(target=target):
-                if (
-                    (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
-                    or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
-                    or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
-                ) and out_type == "float16":
-                    strategy.add_implementation(
-                        wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore),
-                        wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore),
-                        name="conv3d_ndhwc_tensorcore.cuda",
-                        plevel=20,
-                    )
-
-    if target.kind.name == "cuda" and "cudnn" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True),
-            wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn),
-            name="conv3d_cudnn.cuda",
-            plevel=25,
-        )
-    return strategy
-
-
-@conv3d_winograd_without_weight_transform_strategy.register(["cuda", "gpu"])
-def conv3d_winograd_without_weight_transform_strategy_cuda(attrs, inputs, out_type, target):
-    """conv3d_winograd_without_weight_transform cuda strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    assert dilation == (1, 1, 1), "Do not support dilate now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    if layout == "NCDHW":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd_without_weight_transform),
-            wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd_without_weight_transform),
-            name="conv3d_ncdhw_winograd_without_weight_transform.cuda",
-        )
-    else:
-        raise RuntimeError(f"Unsupported conv3d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-@conv1d_strategy.register(["cuda", "gpu"])
-def conv1d_strategy_cuda(attrs, inputs, out_type, target):
-    """conv1d cuda strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    if dilation[0] < 1:
-        raise ValueError("dilation should be a positive value")
-    strategy = _op.OpStrategy()
-    if attrs.groups == 1:
-        if layout == "NCW":
-            strategy.add_implementation(
-                wrap_compute_conv1d(topi.cuda.conv1d_ncw),
-                wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw),
-                name="conv1d_ncw.cuda",
-            )
-        elif layout == "NWC":
-            strategy.add_implementation(
-                wrap_compute_conv1d(topi.cuda.conv1d_nwc),
-                wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc),
-                name="conv1d_nwc.cuda",
-            )
-        else:
-            raise ValueError(f"Unsupported conv1d layout {layout}")
-    else:
-        if layout == "NCW":
-            strategy.add_implementation(
-                wrap_compute_group_conv1d(topi.cuda.group_conv1d_ncw),
-                wrap_topi_schedule(topi.cuda.schedule_group_conv1d_ncw),
-                name="group_conv1d_ncw.cuda",
-            )
-        elif layout == "NWC":
-            strategy.add_implementation(
-                wrap_compute_group_conv1d(topi.cuda.group_conv1d_nwc),
-                wrap_topi_schedule(topi.cuda.schedule_group_conv1d_nwc),
-                name="group_conv1d_nwc.cuda",
-            )
-        else:
-            raise ValueError(f"Unsupported conv1d layout {layout}")
-    return strategy
-
-
-@conv1d_transpose_strategy.register(["cuda", "gpu"])
-def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target):
-    """conv1d_transpose cuda strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCW", "conv1d_transpose ncw only supported"
-    assert dilation == (1,), "conv1d_transpose dilation is not supported"
-    assert groups == 1, "conv1d_transpose groups == 1 only supported"
-    strategy.add_implementation(
-        wrap_compute_conv1d_transpose(topi.cuda.conv1d_transpose_ncw),
-        wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw),
-        name="conv1d_transpose_ncw.cuda",
-    )
-    return strategy
-
-
-@matmul_strategy.register(["cuda", "gpu"])
-def matmul_strategy_cuda(attrs, inputs, out_type, target):
-    """Matmul cuda strategy."""
-    strategy = _op.OpStrategy()
-
-    if is_auto_scheduler_enabled():
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.cuda"
-        )
-    elif is_meta_schedule_enabled():
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.cuda"
-        )
-    else:
-        logger.warning(
-            "Matmul is not optimized for cuda. Recommend to use cublas for better performance."
-        )
-        # Temporary use this as a basic schedule
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.gpu.matmul_default),
-            wrap_topi_schedule(topi.gpu.schedule_matmul_default),
-            name="matmul_default.gpu",
-        )
-
-    if target.kind.name == "cuda" and "cublas" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.cuda.matmul_cublas),
-            wrap_topi_schedule(topi.cuda.schedule_matmul_cublas),
-            name="matmul_cublas.cuda",
-            plevel=25,
-        )
-    return strategy
-
-
-@dense_strategy.register(["cuda", "gpu"])
-def dense_strategy_cuda(attrs, inputs, out_type, target):
-    """dense cuda strategy"""
-    strategy = _op.OpStrategy()
-    data, weights = inputs
-    b, i = get_const_tuple(data.shape)
-    o, _ = get_const_tuple(weights.shape)
-    if (
-        target.kind.name in ["cuda", "vulkan", "rocm"]
-        and data.dtype == "int8"
-        and weights.dtype == "int8"
-        and out_type.dtype == "int32"
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.cuda.dense_int8),
-            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
-            name="dense_int8.cuda",
-        )
-    else:
-        # Some AMDGPU cards have accuracy issues with this schedule
-        # See https://github.com/apache/tvm/issues/13666
-        if target.kind.name != "rocm":
-            strategy.add_implementation(
-                wrap_compute_dense(topi.gpu.dense_small_batch),
-                wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
-                name="dense_small_batch.gpu",
-            )
-
-        with SpecializedCondition(target.kind.name == "rocm" or b >= 32):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.gpu.dense_large_batch),
-                wrap_topi_schedule(topi.gpu.schedule_dense_large_batch),
-                name="dense_large_batch.gpu",
-                plevel=5,
-            )
-
-    if target.kind.name == "cuda":
-        if nvcc.have_tensorcore(target=target):
-            if (
-                (
-                    data.dtype in ["float16", "int8", "uint8"]
-                    and (
-                        (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
-                        or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
-                        or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
-                    )
-                )
-                or (data.dtype in ["int4", "uint4"] and i % 32 == 0 and b % 8 == 0 and o % 8 == 0)
-                or (data.dtype in ["int1", "uint1"] and i % 128 == 0 and b % 8 == 0 and o % 8 == 0)
-            ):
-                strategy.add_implementation(
-                    wrap_compute_dense(topi.cuda.dense_tensorcore),
-                    wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
-                    name="dense_tensorcore.cuda",
-                    plevel=20,
-                )
-
-    if target.kind.name == "cuda" and "cublas" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_dense(topi.cuda.dense_cublas),
-            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
-            name="dense_cublas.cuda",
-            plevel=25,
-        )
-    return strategy
-
-
-@batch_matmul_strategy.register(["cuda", "gpu"])
-def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
-    """batch_matmul cuda strategy"""
-    strategy = _op.OpStrategy()
-    x, y = inputs
-    if (
-        x.dtype == "int8"
-        and y.dtype == "int8"
-        and out_type.dtype == "int32"
-        and not attrs["transpose_a"]
-        and attrs["transpose_b"]
-    ):
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.cuda.batch_matmul_int8, need_out_dtype=True),
-            wrap_topi_schedule(topi.cuda.schedule_batch_matmul_int8),
-            name="batch_matmul_int8.cuda",
-            plevel=10,
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.cuda.batch_matmul, need_out_dtype=True),
-            wrap_topi_schedule(topi.cuda.schedule_batch_matmul),
-            name="batch_matmul.cuda",
-            plevel=10,
-        )
-    if target.kind.name == "cuda" and "cublas" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas, need_out_dtype=True),
-            wrap_topi_schedule(topi.generic.schedule_extern),
-            name="batch_matmul_cublas.cuda",
-            plevel=30,
-        )
-    if (
-        target.kind.name == "cuda"
-        and nvcc.have_tensorcore(target=target)
-        and not attrs["transpose_a"]
-        and attrs["transpose_b"]
-    ):
-        x, y = inputs
-        _, M, K = get_const_tuple(x.shape)
-        _, N, K = get_const_tuple(y.shape)
-        if (
-            x.dtype in ["float16", "int8", "uint8"]
-            and (
-                (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
-                or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
-                or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
-            )
-        ) or (x.dtype in ["int4", "uint4"] and K % 32 == 0 and M % 8 == 0 and N % 8 == 0):
-            strategy.add_implementation(
-                wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore, need_out_dtype=True),
-                wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore),
-                name="batch_matmul_tensorcore.cuda",
-                plevel=20,
-            )
-
-    return strategy
-
-
-@sparse_dense_strategy.register(["cuda", "gpu"])
-def sparse_dense_strategy_cuda(attrs, inputs, out_type, target):
-    """sparse dense cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_dense(topi.cuda.sparse_dense),
-        wrap_topi_schedule(topi.cuda.schedule_sparse_dense),
-        name="sparse_dense.cuda",
-        plevel=10,
-    )
-    return strategy
-
-
-@sparse_reshape_strategy.register(["cuda", "gpu"])
-def sparse_reshape_strategy_cuda(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_reshape(topi.cuda.sparse_reshape),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="sparse_reshape.cuda",
-    )
-    return strategy
-
-
-@sparse_dense_padded_strategy.register(["cuda", "gpu", "rocm"])
-def sparse_dense_padded_strategy_cuda(attrs, inputs, out_type, target):
-    """sparse dense cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_dense(topi.cuda.sparse_dense_padded),
-        wrap_topi_schedule(topi.cuda.schedule_sparse_dense_padded),
-        name="sparse_dense_padded.cuda",
-        plevel=10,
-    )
-    return strategy
-
-
-@scatter_elements_strategy.register(["cuda", "gpu"])
-def scatter_elements_cuda(attrs, inputs, out_type, target):
-    """scatter elements cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scatter_elements(topi.cuda.scatter_elements),
-        wrap_topi_schedule(topi.cuda.schedule_extern),
-        name="scatter_elements.cuda",
-        plevel=10,
-    )
-
-    rank = len(inputs[0].shape)
-
-    with SpecializedCondition(rank == 1 and attrs.reduction == "update"):
-        if can_use_thrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
-            strategy.add_implementation(
-                wrap_compute_scatter_elements(topi.cuda.scatter_via_sort),
-                wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
-                name="scatter_via_sort.cuda",
-                plevel=9,  # use the sequential version by default
-            )
-    return strategy
-
-
-@scatter_nd_strategy.register(["cuda", "gpu"])
-def scatter_nd_cuda(attrs, inputs, out_type, target):
-    """scatter_nd cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scatter_nd(topi.cuda.scatter_nd),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="scatter_nd.cuda",
-        plevel=10,
-    )
-    return strategy
-
-
-@sort_strategy.register(["cuda", "gpu"])
-def sort_strategy_cuda(attrs, inputs, out_type, target):
-    """sort cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sort(topi.cuda.sort),
-        wrap_topi_schedule(topi.cuda.schedule_sort),
-        name="sort.cuda",
-    )
-    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
-        strategy.add_implementation(
-            wrap_compute_sort(topi.cuda.sort_thrust),
-            wrap_topi_schedule(topi.cuda.schedule_sort),
-            name="sort_thrust.cuda",
-            plevel=15,
-        )
-    return strategy
-
-
-@argsort_strategy.register(["cuda", "gpu"])
-def argsort_strategy_cuda(attrs, inputs, out_type, target):
-    """argsort cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_argsort(topi.cuda.argsort),
-        wrap_topi_schedule(topi.cuda.schedule_argsort),
-        name="argsort.cuda",
-    )
-    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
-        strategy.add_implementation(
-            wrap_compute_argsort(topi.cuda.argsort_thrust),
-            wrap_topi_schedule(topi.cuda.schedule_argsort),
-            name="argsort_thrust.cuda",
-            plevel=15,
-        )
-    return strategy
-
-
-@topk_strategy.register(["cuda", "gpu"])
-def topk_strategy_cuda(attrs, inputs, out_type, target):
-    """topk cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_topk(topi.cuda.topk),
-        wrap_topi_schedule(topi.cuda.schedule_topk),
-        name="topk.cuda",
-    )
-    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
-        strategy.add_implementation(
-            wrap_compute_topk(topi.cuda.topk_thrust),
-            wrap_topi_schedule(topi.cuda.schedule_topk),
-            name="topk_thrust.cuda",
-            plevel=15,
-        )
-    return strategy
-
-
-@searchsorted_strategy.register(["cuda", "gpu"])
-def searchsorted_strategy_cuda(attrs, inputs, out_type, target):
-    """searchsorted cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_searchsorted(topi.cuda.searchsorted),
-        wrap_topi_schedule(topi.cuda.schedule_extern),
-        name="searchsorted.cuda",
-    )
-    return strategy
-
-
-@multibox_prior_strategy.register(["cuda", "gpu"])
-def multibox_prior_strategy_cuda(attrs, inputs, out_type, target):
-    """multibox_prior cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_multibox_prior(topi.cuda.multibox_prior),
-        wrap_topi_schedule(topi.cuda.schedule_multibox_prior),
-        name="multibox_prior.cuda",
-    )
-    return strategy
-
-
-@multibox_transform_loc_strategy.register(["cuda", "gpu"])
-def multibox_transform_loc_strategy_cuda(attrs, inputs, out_type, target):
-    """multibox_transform_loc cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_multibox_transform_loc(topi.cuda.multibox_transform_loc),
-        wrap_topi_schedule(topi.cuda.schedule_multibox_transform_loc),
-        name="multibox_transform_loc.cuda",
-    )
-    return strategy
-
-
-@get_valid_counts_strategy.register(["cuda", "gpu"])
-def get_valid_counts_strategy_cuda(attrs, inputs, out_type, target):
-    """get_valid_counts cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_get_valid_counts(topi.cuda.get_valid_counts),
-        wrap_topi_schedule(topi.cuda.schedule_get_valid_counts),
-        name="get_valid_counts.cuda",
-    )
-    return strategy
-
-
-@nms_strategy.register(["cuda", "gpu"])
-def nms_strategy_cuda(attrs, inputs, out_type, target):
-    """nms cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_nms(topi.cuda.non_max_suppression),
-        wrap_topi_schedule(topi.cuda.schedule_nms),
-        name="nms.cuda",
-    )
-    return strategy
-
-
-@all_class_nms_strategy.register(["cuda", "gpu"])
-def all_class_nms_strategy_cuda(attrs, inputs, out_type, target):
-    """all class nms cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_all_class_nms(topi.cuda.all_class_non_max_suppression),
-        wrap_topi_schedule(topi.cuda.schedule_nms),
-        name="all_class_nms.cuda",
-    )
-    return strategy
-
-
-@roi_align_strategy.register(["cuda", "gpu"])
-def roi_align_strategy_cuda(attrs, inputs, out_type, target):
-    """roi_align cuda strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.layout
-
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw),
-            wrap_topi_schedule(topi.cuda.schedule_roi_align),
-            name="roi_align_nchw.cuda",
-        )
-    else:
-        assert layout == "NHWC", "layout must be NCHW or NHWC."
-        strategy.add_implementation(
-            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc),
-            wrap_topi_schedule(topi.cuda.schedule_roi_align),
-            name="roi_align_nhwc.cuda",
-        )
-    return strategy
-
-
-@schedule_roi_pool.register(["cuda", "gpu"])
-def schedule_roi_pool_cuda(attrs, outs, target):
-    """schedule roi_pool for cuda"""
-    with target:
-        return topi.cuda.schedule_roi_pool(outs)
-
-
-@proposal_strategy.register(["cuda", "gpu"])
-def proposal_strategy_cuda(attrs, inputs, out_type, target):
-    """proposal cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_proposal(topi.cuda.proposal),
-        wrap_topi_schedule(topi.cuda.schedule_proposal),
-        name="proposal.cuda",
-    )
-    return strategy
-
-
-@correlation_strategy.register(["cuda", "gpu"])
-def correlation_strategy_cuda(attrs, inputs, out_type, target):
-    """correlation cuda strategy"""
-    layout = attrs.layout
-    assert layout == "NCHW", "Only support NCHW layout"
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_correlation(topi.cuda.correlation_nchw),
-        wrap_topi_schedule(topi.cuda.schedule_correlation_nchw),
-        name="correlation.cuda",
-    )
-    return strategy
-
-
-@argwhere_strategy.register(["cuda", "gpu"])
-def argwhere_strategy_cuda(attrs, inputs, out_type, target):
-    """argwhere cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_argwhere(topi.cuda.argwhere),
-        wrap_topi_schedule(topi.cuda.schedule_argwhere),
-        name="argwhere.cuda",
-    )
-    return strategy
-
-
-@cumsum_strategy.register(["cuda", "gpu"])
-def cumsum_strategy_cuda(attrs, inputs, out_type, target):
-    """cumsum cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scanop(topi.cuda.cumsum),
-        wrap_topi_schedule(topi.cuda.schedule_scan),
-        name="cumsum.cuda",
-    )
-    return strategy
-
-
-@cumprod_strategy.register(["cuda", "gpu"])
-def cumprod_strategy_cuda(attrs, inputs, out_type, target):
-    """cumprod cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scanop(topi.cuda.cumprod),
-        wrap_topi_schedule(topi.cuda.schedule_scan),
-        name="cumprod.cuda",
-    )
-    return strategy
-
-
-@unique_strategy.register(["cuda", "gpu"])
-def unique_strategy_cuda(attrs, inputs, out_type, target):
-    """unique cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_unique(topi.cuda.unique),
-        wrap_topi_schedule(topi.cuda.schedule_scan),
-        name="unique.cuda",
-    )
-    return strategy
-
-
-@schedule_transpose.register(["cuda", "gpu", "rocm"])
-def schedule_transpose_cuda(attrs, outs, target):
-    """
-    Transpose cuda strategy
-    Dispatches to and optimized schedule if the transpose is standalone (not fused).
-    """
-    warp_size = int(Target.current(allow_none=False).thread_warp_size)
-    if (
-        isinstance(outs[0].op.input_tensors[0].op, te.PlaceholderOp)
-        and len(outs[0].shape) == 2
-        and (attrs.axes is None or (len(attrs.axes) == 2 and attrs.axes == [1, 0]))
-        and isinstance(outs[0].shape[0], (int, IntImm))
-        and outs[0].shape[0] >= warp_size
-        and isinstance(outs[0].shape[1], (int, IntImm))
-        and outs[0].shape[1] >= warp_size
-    ):
-        return topi.cuda.schedule_transpose(outs)
-    return schedule_injective(attrs, outs, target)
-
-
-@invert_permutation_strategy.register(["cuda", "gpu"])
-def invert_permutation_strategy_cuda(attrs, inputs, out_type, target):
-    """invert_permutation cuda strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_invert_permutation(topi.cuda.invert_permutation),
-        wrap_topi_schedule(topi.cuda.vision._default_schedule),
-        name="invert_permutation.cuda",
-    )
-    return strategy
-
-
-@einsum_strategy.register(["cuda", "gpu"])
-def einsum_strategy_cuda(attrs, inputs, out_type, target):
-    """einsum cuda strategy"""
-    strategy = _op.OpStrategy()
-    # TODO: Add cuda-specific op implementation for einsum
-    strategy.add_implementation(
-        wrap_compute_einsum(topi.einsum),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="einsum.cuda",
-    )
-    return strategy
-
-
-@stft_strategy.register(["cuda", "gpu"])
-def stft_strategy_cuda(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_stft(topi.cuda.stft),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="stft.cuda",
-    )
-    return strategy
-
-
-@dft_strategy.register(["cuda", "gpu"])
-def dft_strategy_cuda(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dft(topi.cuda.dft),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="dft.cuda",
-    )
-    return strategy
-
-
-@layout_transform_strategy.register(["cuda", "gpu"])
-def layout_transform_strategy_cuda(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"),
-        schedule_injective,
-        name="layout_transform.cuda",
-    )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
deleted file mode 100644
index c2a4b4c302af..000000000000
--- a/python/tvm/relay/op/strategy/generic.py
+++ /dev/null
@@ -1,2097 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of generic operator strategy."""
-# pylint: disable=invalid-name,unused-argument
-import logging
-import re
-
-from tvm import _ffi, ir, te, topi
-from tvm.target import generic_func, override_native_generic_func
-from tvm.topi.utils import get_const_float, get_const_int, get_const_tuple, get_float_tuple
-
-from .. import op as _op
-
-logger = logging.getLogger("strategy")
-
-
-def naive_schedule(_, outs, target):
-    """Return the naive default schedule.
-    This function acts as a placeholder for op implementations that uses auto-scheduler.
-    Implemenations using this function should only be used along with auto-scheduler.
-    """
-    if "gpu" in target.keys:
-        # For GPU, we at least need thread binding to make a valid schedule.
-        # So the naive schedule cannot be compiled.
-        logger.debug(
-            "Cannot compile for GPU targets if no tuned schedule is found. "
-            "Please see the warning messages above for more information about the failed workloads."
-        )
-    return te.create_schedule(outs[-1].op)
-
-
-def wrap_topi_schedule(topi_schedule):
-    """Wrap TOPI schedule which doesn't use attrs"""
-
-    def wrapper(attrs, outs, target):
-        with target:
-            return topi_schedule(outs)
-
-    return wrapper
-
-
-def wrap_topi_compute(topi_compute):
-    """Wrap TOPI compute which doesn't use attrs"""
-
-    def wrapper(attrs, inputs, out_type):
-        return [topi_compute(*inputs)]
-
-    return wrapper
-
-
-def get_conv2d_in_channels(data_shape, data_layout):
-    """Get conv2d input channels"""
-    data_shape = get_const_tuple(data_shape)
-    if len(data_shape) == 4:
-        idx = data_layout.find("C")
-        assert idx >= 0, f"Invalid conv2d data layout {data_layout}"
-        return data_shape[idx]
-    if re.match(r"NCHW\d*c", data_layout):
-        # NCHW[8]c
-        return data_shape[1] * data_shape[4]
-    raise ValueError(f"Unknown conv2d data layout {data_layout}")
-
-
-def get_conv2d_out_channels(kernel_shape, kernel_layout):
-    """Get conv2d output channels"""
-    kernel_shape = get_const_tuple(kernel_shape)
-    if len(kernel_shape) == 4:
-        idx = kernel_layout.find("O")
-        assert idx >= 0, f"Invalid conv2d kernel layout {kernel_layout}"
-        return kernel_shape[idx]
-    if re.match(r"OIHW\d*i\d*o", kernel_layout):
-        return kernel_shape[0] * kernel_shape[5]
-    if re.match(r"OIHW\d*o", kernel_layout):
-        return kernel_shape[0] * kernel_shape[4]
-    raise ValueError(f"Unknown conv2d kernel layout {kernel_layout}")
-
-
-def is_depthwise_conv2d(data_shape, data_layout, kernel_shape, kernel_layout, groups):
-    ic = get_conv2d_in_channels(data_shape, data_layout)
-    oc = get_conv2d_out_channels(kernel_shape, kernel_layout)
-    return ic == oc == groups
-
-
-@generic_func
-def schedule_injective(attrs, outs, target):
-    """Schedule injective ops"""
-    with target:
-        return topi.generic.schedule_injective(outs)
-
-
-@generic_func
-def schedule_reduce(attrs, outs, target):
-    """Schedule reduction ops"""
-    with target:
-        return topi.generic.schedule_reduce(outs)
-
-
-_op._schedule_injective = schedule_injective
-_op._schedule_reduce = schedule_reduce
-
-# concatenate
-@generic_func
-def schedule_concatenate(attrs, outs, target):
-    """Schedule concatenate op"""
-    with target:
-        return topi.generic.schedule_injective(outs)
-
-
-# pool
-@generic_func
-def schedule_pool(attrs, outs, target):
-    """Schedule pooling ops"""
-    with target:
-        return topi.generic.schedule_pool(outs, attrs.layout)
-
-
-# pool_grad
-@generic_func
-def schedule_pool_grad(attrs, outs, target):
-    """Schedule pooling gradient ops"""
-    with target:
-        return topi.generic.schedule_pool_grad(outs)
-
-
-# adaptive pool
-@generic_func
-def schedule_adaptive_pool(attrs, outs, target):
-    """Schedule adaptive pooling ops"""
-    with target:
-        return topi.generic.schedule_adaptive_pool(outs)
-
-
-# softmax
-def wrap_compute_softmax(topi_compute):
-    """Wrap softmax topi compute"""
-
-    def _compute_softmax(attrs, inputs, out_type):
-        axis = attrs.get_int("axis")
-        return [topi_compute(inputs[0], axis)]
-
-    return _compute_softmax
-
-
-@override_native_generic_func("softmax_strategy")
-def softmax_strategy(attrs, inputs, out_type, target):
-    """softmax generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(topi.generic.schedule_softmax),
-        name="softmax.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("fast_softmax_strategy")
-def fast_softmax_strategy(attrs, inputs, out_type, target):
-    """fast softmax generic strategy"""
-    # NOTE: This op does not have an optimized manual schedule,
-    # so it should only be used together with auto-scheduler.
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.fast_softmax),
-        wrap_topi_schedule(topi.generic.schedule_fast_softmax),
-        name="fast_softmax.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("log_softmax_strategy")
-def log_softmax_strategy(attrs, inputs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.log_softmax),
-        wrap_topi_schedule(topi.generic.schedule_softmax),
-        name="log_softmax.generic",
-    )
-    return strategy
-
-
-# lrn
-@generic_func
-def schedule_lrn(attrs, outs, target):
-    """Schedule LRN op"""
-    with target:
-        return topi.generic.schedule_lrn(outs)
-
-
-# pad
-@generic_func
-def schedule_pad(attrs, outs, target):
-    """Schedule PAD op"""
-    with target:
-        return schedule_injective(attrs, outs, target)
-
-
-# bitpack
-@generic_func
-def schedule_bitpack(attrs, outs, target):
-    """Schedule bitpack"""
-    with target:
-        return topi.generic.schedule_bitpack(outs)
-
-
-get_auto_scheduler_rewritten_layout = _ffi.get_global_func(
-    "relay.attrs.get_auto_scheduler_rewritten_layout"
-)
-get_meta_schedule_original_shape = _ffi.get_global_func(
-    "relay.attrs.get_meta_schedule_original_shape"
-)
-
-# conv2d
-def wrap_compute_conv2d(
-    topi_compute,
-    *,
-    need_data_layout=False,
-    need_kernel_layout=False,
-    need_out_layout=False,
-    has_groups=False,
-    need_auto_scheduler_layout=False,
-    need_meta_schedule_layout=False,
-):
-    """Wrap conv2d topi compute"""
-
-    def _compute_conv2d(attrs, inputs, out_type):
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        dilation = get_const_tuple(attrs.dilation)
-        data_layout = attrs.get_str("data_layout")
-        kernel_layout = attrs.get_str("kernel_layout")
-        out_layout = attrs.get_str("out_layout")
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        args = [inputs[0], inputs[1], strides, padding, dilation]
-        if has_groups:
-            args.append(attrs.groups)
-        if need_data_layout:
-            args.append(data_layout)
-        if need_kernel_layout:
-            args.append(kernel_layout)
-        if need_out_layout:
-            args.append(out_layout)
-        args.append(out_dtype)
-        if need_auto_scheduler_layout:
-            args.append(get_auto_scheduler_rewritten_layout(attrs))
-        elif need_meta_schedule_layout:
-            args.append("")
-            args.append(get_meta_schedule_original_shape(attrs))
-        return [topi_compute(*args)]
-
-    return _compute_conv2d
-
-
-@override_native_generic_func("conv2d_strategy")
-def conv2d_strategy(attrs, inputs, out_type, target):
-    """conv2d generic strategy"""
-    logger.warning("conv2d is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    (dilation_h, dilation_w) = dilation
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_nchw),
-                wrap_topi_schedule(topi.generic.schedule_conv2d_nchw),
-                name="conv2d_nchw.generic",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_nhwc),
-                wrap_topi_schedule(topi.generic.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.generic",
-            )
-        elif layout == "HWCN":
-            assert kernel_layout == "HWIO"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
-                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
-                name="conv2d_hwcn.generic",
-            )
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout}")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.generic",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_conv2d_nhwc.generic",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout}")
-    else:  # group_conv2d
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.generic",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                name="group_conv2d_nhwc.generic",
-            )
-        else:
-            raise RuntimeError(f"Unsupported group_conv2d layout {layout}")
-    return strategy
-
-
-# conv2d_NCHWc
-@override_native_generic_func("conv2d_NCHWc_strategy")
-def conv2d_NCHWc_strategy(attrs, inputs, out_type, target):
-    """conv2d_NCHWc generic strategy"""
-    logger.warning("conv2d_NCHWc is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    if inputs[0].dtype == "int8" or inputs[0].dtype == "uint8":
-        strategy.add_implementation(
-            wrap_compute_conv2d(
-                topi.nn.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
-            ),
-            wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8),
-            name="conv2d_NCHWc_int8.generic",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.nn.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
-            wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc),
-            name="conv2d_NCHWc.generic",
-        )
-    return strategy
-
-
-# depthwise_conv2d_NCHWc
-@override_native_generic_func("depthwise_conv2d_NCHWc_strategy")
-def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target):
-    """depthwise_conv2d generic strategy"""
-    logger.warning("depthwise_conv2d_NCHWc is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(
-            topi.nn.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
-        ),
-        wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc),
-        name="depthwise_conv2d_NCHWc.generic",
-    )
-    return strategy
-
-
-# conv2d_winograd_without_weight_transform
-@override_native_generic_func("conv2d_winograd_without_weight_transform_strategy")
-def conv2d_winograd_without_weight_transform_strategy(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform generic strategy"""
-    raise ValueError("No generic implemenation for conv2d_winograd_without_weight_transform")
-
-
-# conv2d_gemm_without_weight_transform
-@override_native_generic_func("conv2d_gemm_without_weight_transform_strategy")
-def conv2d_gemm_without_weight_transform_strategy(attrs, inputs, out_type, target):
-    """conv2d_gemm_without_weight_transform generic strategy"""
-    raise ValueError("No generic implemenation for conv2d_gemm_without_weight_transform")
-
-
-# conv2d_winograd_weight_transform
-@generic_func
-def schedule_conv2d_winograd_weight_transform(attrs, outs, target):
-    """Schedule conv2d_winograd_weight_transform"""
-    with target:
-        return topi.generic.schedule_conv2d_winograd_weight_transform(outs)
-
-
-# conv2d_winograd_nnpack_weight_transform
-@generic_func
-def schedule_conv2d_winograd_nnpack_weight_transform(attrs, outs, target):
-    """Schedule conv2d_winograd_nnpack_weight_transform"""
-    with target:
-        return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs)
-
-
-# conv2d_gemm_weight_transform
-@generic_func
-def schedule_conv2d_gemm_weight_transform(attrs, outs, target):
-    """Schedule conv2d_gemm_weight_transform"""
-    with target:
-        return topi.generic.schedule_conv2d_gemm_weight_transform(outs)
-
-
-# deformable_conv2d
-def wrap_compute_deformable_conv2d(topi_compute):
-    """wrap deformable_conv2d topi compute"""
-
-    def _compute_deformable_conv2d(attrs, inputs, out_dtype):
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        dilation = get_const_tuple(attrs.dilation)
-        deformable_groups = attrs.deformable_groups
-        groups = attrs.groups
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        out = topi_compute(
-            inputs[0],
-            inputs[1],
-            inputs[2],
-            strides,
-            padding,
-            dilation,
-            deformable_groups,
-            groups,
-            out_dtype,
-        )
-        return [out]
-
-    return _compute_deformable_conv2d
-
-
-@override_native_generic_func("deformable_conv2d_strategy")
-def deformable_conv2d_strategy(attrs, inputs, out_type, target):
-    """deformable_conv2d generic strategy"""
-    layout = attrs.data_layout
-    strategy = _op.OpStrategy()
-
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nchw),
-            wrap_topi_schedule(topi.generic.schedule_deformable_conv2d_nchw),
-            name="deformable_conv2d_nchw.generic",
-        )
-    elif layout == "NHWC":
-        # This implementation should never be picked by autotvm
-        strategy.add_implementation(
-            wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nhwc),
-            naive_schedule,
-            name="deformable_conv2d_nhwc.generic",
-        )
-    else:
-        raise RuntimeError(f"Layout {layout} is not supported in deformable conv2d")
-    return strategy
-
-
-# conv2d_transpose
-def wrap_compute_conv2d_transpose(topi_compute, has_groups=False, add_layout=False):
-    """wrap conv2d_transpose topi compute"""
-
-    def compute_conv2d_transpose(attrs, inputs, out_dtype):
-        """Compute definition of conv2d_transpose"""
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        output_padding = get_const_tuple(attrs.output_padding)
-        # out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype, output_padding)
-        args = [inputs[0], inputs[1], strides, padding, out_dtype, output_padding]
-        if add_layout:
-            args.append(attrs.data_layout)
-        if has_groups:
-            args.append(attrs.groups)
-        out = topi_compute(*args)
-        return [out]
-
-    return compute_conv2d_transpose
-
-
-@override_native_generic_func("conv2d_transpose_strategy")
-def conv2d_transpose_strategy(attrs, inputs, out_type, target):
-    """conv2d_transpose generic strategy"""
-    logger.warning("conv2d_transpose is not optimized for this platform.")
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCHW", "only support nchw for now"
-    assert dilation == (1, 1), "not support dilate now"
-    strategy = _op.OpStrategy()
-    if groups == 1:
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw),
-            wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw),
-            name="conv2d_transpose_nchw.generic",
-        )
-    else:  # group_conv2d_transpose
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw, has_groups=True),
-            wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw),
-            name="group_conv2d_transpose_nchw.generic",
-        )
-    return strategy
-
-
-# conv3d_transpose
-def wrap_compute_conv3d_transpose(topi_compute, has_groups=False):
-    """wrap conv3d_transpose topi compute"""
-
-    def compute_conv3d_transpose(attrs, inputs, out_dtype):
-        """Compute definition of conv3d_transpose"""
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        output_padding = get_const_tuple(attrs.output_padding)
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        args = [inputs[0], inputs[1], strides, padding, out_dtype, output_padding]
-        if has_groups:
-            args.append(attrs.group)
-        out = topi_compute(*args)
-        return [out]
-
-    return compute_conv3d_transpose
-
-
-@override_native_generic_func("conv3d_transpose_strategy")
-def conv3d_transpose_strategy(attrs, inputs, out_type, target):
-    """conv3d_transpose generic strategy"""
-    logger.warning("conv3d_transpose is not optimized for this platform.")
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCDHW", "only support ncdhw for now"
-    assert dilation == (1, 1, 1), "not support dilate now"
-
-    strategy = _op.OpStrategy()
-    if groups == 1:
-        strategy.add_implementation(
-            wrap_compute_conv3d_transpose(topi.nn.conv3d_transpose_ncdhw),
-            wrap_topi_schedule(topi.generic.schedule_conv3d_transpose_ncdhw),
-            name="conv3d_transpose_ncdhw.generic",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_conv3d_transpose(topi.nn.group_conv3d_transpose_ncdhw, has_groups=True),
-            wrap_topi_schedule(topi.generic.schedule_group_conv3d_transpose_ncdhw),
-            name="group_conv3d_transpose_ncdhw.generic",
-        )
-    return strategy
-
-
-# conv3d
-def wrap_compute_conv3d(
-    topi_compute,
-    need_layout=False,
-    need_auto_scheduler_layout=False,
-    need_meta_schedule_layout=False,
-):
-    """wrap conv3d topi compute"""
-
-    def _compute_conv3d(attrs, inputs, out_type):
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        dilation = get_const_tuple(attrs.dilation)
-        groups = attrs.groups
-        layout = attrs.data_layout
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-
-        (dilation_d, dilation_h, dilation_w) = dilation
-        if dilation_d < 1 or dilation_h < 1 or dilation_w < 1:
-            raise ValueError("Dilation should be positive value")
-
-        args = [inputs[0], inputs[1], strides, padding, dilation, groups]
-        if need_layout:
-            args.append(layout)
-        args.append(out_dtype)
-        if need_auto_scheduler_layout:
-            args.append(get_auto_scheduler_rewritten_layout(attrs))
-        elif need_meta_schedule_layout:
-            args.append("")
-            args.append(get_meta_schedule_original_shape(attrs))
-        return [topi_compute(*args)]
-
-    return _compute_conv3d
-
-
-@override_native_generic_func("conv3d_strategy")
-def conv3d_strategy(attrs, inputs, out_type, target):
-    """conv3d generic strategy"""
-    logger.warning("conv3d is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    if layout == "NCDHW":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.nn.conv3d_ncdhw),
-            wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw),
-            name="conv3d_ncdhw.generic",
-        )
-    elif layout == "NDHWC":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.nn.conv3d_ndhwc),
-            wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc),
-            name="conv3d_ndhwc.generic",
-        )
-    else:
-        raise ValueError(f"Not support this layout {layout} yet")
-    return strategy
-
-
-# conv3d_winograd_without_weight_transform
-@override_native_generic_func("conv3d_winograd_without_weight_transform_strategy")
-def conv3d_winograd_without_weight_transform_strategy(attrs, inputs, out_type, target):
-    """conv3d_winograd_without_weight_transform generic strategy"""
-    raise ValueError("No generic implemenation for conv3d_winograd_without_weight_transform")
-
-
-# conv3d_winograd_weight_transform
-@generic_func
-def schedule_conv3d_winograd_weight_transform(attrs, outs, target):
-    """Schedule conv3d_winograd_weight_transform"""
-    with target:
-        return topi.generic.schedule_conv3d_winograd_weight_transform(outs)
-
-
-# conv1d
-def wrap_compute_conv1d(topi_compute):
-    """wrap conv1d topi compute"""
-
-    def _compute_conv1d(attrs, inputs, out_type):
-        """Compute definition of conv1d"""
-        strides = get_const_tuple(attrs.strides)
-        padding = get_const_tuple(attrs.padding)
-        dilation = get_const_tuple(attrs.dilation)
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        return [topi_compute(inputs[0], inputs[1], strides, padding, dilation, out_dtype)]
-
-    return _compute_conv1d
-
-
-@override_native_generic_func("conv1d_strategy")
-def conv1d_strategy(attrs, inputs, out_type, target):
-    """conv1d generic strategy"""
-    logger.warning("conv1d is not optimized for this platform.")
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    if dilation[0] < 1:
-        raise ValueError("dilation should be a positive value")
-    strategy = _op.OpStrategy()
-    if layout == "NCW":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.conv1d_ncw),
-            wrap_topi_schedule(topi.generic.schedule_conv1d_ncw),
-            name="conv1d_ncw.generic",
-        )
-    elif layout == "NWC":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.conv1d_nwc),
-            wrap_topi_schedule(topi.generic.schedule_conv1d_nwc),
-            name="conv1d_nwc.generic",
-        )
-    else:
-        raise ValueError(f"Unsupported conv1d layout {layout}")
-    return strategy
-
-
-def wrap_compute_group_conv1d(topi_compute):
-    """wrap conv1d topi compute"""
-
-    def _compute_group_conv1d(attrs, inputs, out_type):
-        """Compute definition of conv1d"""
-        strides = get_const_tuple(attrs.strides)
-        padding = get_const_tuple(attrs.padding)
-        dilation = get_const_tuple(attrs.dilation)
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        return [
-            topi_compute(inputs[0], inputs[1], strides, padding, dilation, attrs.groups, out_dtype)
-        ]
-
-    return _compute_group_conv1d
-
-
-@override_native_generic_func("group_conv1d_strategy")
-def group_conv1d_strategy(attrs, inputs, out_type, target):
-    """group_conv1d generic strategy"""
-    logger.warning("group_conv1d is not optimized for this platform.")
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    if dilation[0] < 1:
-        raise ValueError("dilation should be a positive value")
-    strategy = _op.OpStrategy()
-    if layout == "NCW":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.group_conv1d_ncw),
-            wrap_topi_schedule(topi.generic.schedule_group_conv1d_ncw),
-            name="group_conv1d_ncw.generic",
-        )
-    elif layout == "NWC":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.group_conv1d_nwc),
-            wrap_topi_schedule(topi.generic.schedule_group_conv1d_nwc),
-            name="group_conv1d_nwc.generic",
-        )
-    else:
-        raise ValueError(f"Unsupported conv1d layout {layout}")
-    return strategy
-
-
-# conv1d_transpose
-def wrap_compute_conv1d_transpose(topi_compute, has_groups=False):
-    """wrap conv1d_transpose topi compute"""
-
-    def _compute_conv1d_tranpsoe(attrs, inputs, out_type):
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        output_padding = get_const_tuple(attrs.output_padding)
-        args = [inputs[0], inputs[1], strides, padding, out_dtype, output_padding]
-        if has_groups:
-            args.append(attrs.groups)
-
-        out = topi_compute(*args)
-        return [out]
-
-    return _compute_conv1d_tranpsoe
-
-
-@override_native_generic_func("conv1d_transpose_strategy")
-def conv1d_transpose_strategy(attrs, inputs, out_type, target):
-    """conv1d_transpose generic strategy"""
-    logger.warning("conv1d_transpose is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCW", "conv1d_transpose ncw only supported"
-    assert dilation == (1,), "conv1d_transpose dilation is not supported"
-    if groups == 1:
-        strategy.add_implementation(
-            wrap_compute_conv1d_transpose(topi.nn.conv1d_transpose_ncw),
-            wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw),
-            name="conv1d_transpose_ncw.generic",
-        )
-    else:  # group_conv1d_transpose
-        strategy.add_implementation(
-            wrap_compute_conv1d_transpose(topi.nn.group_conv1d_transpose_ncw, has_groups=True),
-            wrap_topi_schedule(topi.generic.schedule_group_conv1d_transpose_ncw),
-            name="group_conv1d_transpose_ncw.generic",
-        )
-    return strategy
-
-
-# dilation2d
-def wrap_compute_dilation2d(topi_compute, need_data_layout=False):
-    """Wrap dilation2d topi compute"""
-
-    def _compute_dilation2d(attrs, inputs, out_type):
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        dilations = get_const_tuple(attrs.dilations)
-        data_layout = attrs.get_str("data_layout")
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        args = [inputs[0], inputs[1], strides, padding, dilations]
-        if need_data_layout:
-            args.append(data_layout)
-        args.append(out_dtype)
-        return [topi_compute(*args)]
-
-    return _compute_dilation2d
-
-
-@override_native_generic_func("dilation2d_strategy")
-def dilation2d_strategy(attrs, inputs, out_type, target):
-    """dilation2d_strategy generic strategy"""
-    logger.warning("dilation2d_strategy is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    dilations = get_const_tuple(attrs.dilations)
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-
-    assert layout in ["NCHW", "NHWC"]
-    (dilation_h, dilation_w) = dilations
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if layout == "NCHW":
-        assert kernel_layout == "IHW"
-        strategy.add_implementation(
-            wrap_compute_dilation2d(topi.image.dilation2d_nchw),
-            wrap_topi_schedule(topi.generic.schedule_dilation2d_nchw),
-            name="dilation2d_nchw.generic",
-        )
-    elif layout == "NHWC":
-        assert kernel_layout == "HWI"
-        strategy.add_implementation(
-            wrap_compute_dilation2d(topi.image.dilation2d_nhwc),
-            wrap_topi_schedule(topi.generic.schedule_dilation2d_nhwc),
-            name="dilation2d_nhwc.generic",
-        )
-    else:
-        raise RuntimeError(f"Unsupported dilation2d layout {layout}")
-    return strategy
-
-
-def copy_if_identical(tensor_a, tensor_b):
-    """
-    When two inputs to batch_matul or dense are the same tensor, e.g. batch_matmul(x, x),
-    compilation fails because TE thinks there is only one input tensor x, and doing
-    cache_read(x) on the same tensor twice results in an error.
-    To prevent such errors, we make the second tensor be the copy of the first one
-    when two input tensors are identical.
-    """
-    if tensor_a == tensor_b:
-        return te.compute(tensor_a.shape, lambda *ind: tensor_a[ind])
-    return tensor_b
-
-
-# matmul
-def wrap_compute_matmul(
-    topi_compute, need_auto_scheduler_layout=False, need_meta_schedule_layout=False
-):
-    """wrap matmul topi compute"""
-
-    def _compute_matmul(attrs, inputs, out_type):
-        """Compute definition of matmul"""
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
-        args = [inputs[0], inputs[1], None, out_dtype, attrs.transpose_a, attrs.transpose_b]
-        if need_auto_scheduler_layout:
-            args.append(get_auto_scheduler_rewritten_layout(attrs))
-        elif need_meta_schedule_layout:
-            args.append("")
-            args.append(get_meta_schedule_original_shape(attrs))
-        args[1] = copy_if_identical(inputs[0], inputs[1])
-        return [topi_compute(*args)]
-
-    return _compute_matmul
-
-
-@override_native_generic_func("matmul_strategy")
-def matmul_strategy(attrs, inputs, out_type, target):
-    """matmul generic strategy"""
-    logger.warning("matmul is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_matmul(topi.nn.matmul),
-        wrap_topi_schedule(topi.generic.schedule_matmul),
-        name="matmul.generic",
-    )
-    return strategy
-
-
-# dense
-def wrap_compute_dense(
-    topi_compute, need_auto_scheduler_layout=False, need_meta_schedule_layout=False
-):
-    """wrap dense topi compute"""
-
-    def _compute_dense(attrs, inputs, out_type):
-        """Compute definition of dense"""
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
-        args = [inputs[0], inputs[1], None, out_dtype]
-        if need_auto_scheduler_layout:
-            args.append(get_auto_scheduler_rewritten_layout(attrs))
-        elif need_meta_schedule_layout:
-            args.append("")
-            args.append(get_meta_schedule_original_shape(attrs))
-        args[1] = copy_if_identical(inputs[0], inputs[1])
-        return [topi_compute(*args)]
-
-    return _compute_dense
-
-
-@override_native_generic_func("dense_strategy")
-def dense_strategy(attrs, inputs, out_type, target):
-    """dense generic strategy"""
-    logger.warning("dense is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dense(topi.nn.dense),
-        wrap_topi_schedule(topi.generic.schedule_dense),
-        name="dense.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("dense_pack_strategy")
-def dense_pack_strategy(attrs, inputs, out_type, target):
-    """dense_pack generic strategy"""
-    logger.warning("dense_pack is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dense(topi.nn.dense_pack),
-        wrap_topi_schedule(topi.generic.schedule_dense),
-        name="dense_pack.generic",
-    )
-    return strategy
-
-
-# batch_matmul
-def wrap_compute_batch_matmul(
-    topi_compute,
-    *,
-    need_auto_scheduler_layout=False,
-    need_meta_schedule_layout=False,
-    need_out_dtype=False,
-):
-    """wrap batch_matmul topi compute"""
-
-    def _compute_batch_matmul(attrs, inputs, out_type):
-        args = [inputs[0], inputs[1], out_type.shape]
-        args.append(out_type.dtype if need_out_dtype else None)
-        args.append(attrs.transpose_a)
-        args.append(attrs.transpose_b)
-        if need_auto_scheduler_layout:
-            args.append(get_auto_scheduler_rewritten_layout(attrs))
-        elif need_meta_schedule_layout:
-            args.append("")
-            args.append(get_meta_schedule_original_shape(attrs))
-        args[1] = copy_if_identical(inputs[0], inputs[1])
-        return [topi_compute(*args)]
-
-    return _compute_batch_matmul
-
-
-@override_native_generic_func("batch_matmul_strategy")
-def batch_matmul_strategy(attrs, inputs, out_type, target):
-    """batch_matmul generic strategy"""
-    logger.warning("batch_matmul is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_batch_matmul(topi.nn.batch_matmul),
-        wrap_topi_schedule(topi.generic.schedule_batch_matmul),
-        name="batch_matmul.generic",
-    )
-    return strategy
-
-
-# batch_norm
-def wrap_compute_batch_norm(topi_compute):
-    """wrap batch_norm topi compute"""
-
-    def _compute_batch_norm(attrs, inputs, out_type):
-        return topi_compute(*inputs, attrs.axis, attrs.epsilon, attrs.center, attrs.scale)
-
-    return _compute_batch_norm
-
-
-@override_native_generic_func("batch_norm_strategy")
-def batch_norm_strategy(attrs, inputs, out_type, target):
-    """batch_norm generic strategy"""
-    logger.warning("batch_norm is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_batch_norm(topi.nn.batch_norm),
-        wrap_topi_schedule(topi.generic.schedule_batch_norm),
-        name="batch_norm.generic",
-    )
-    return strategy
-
-
-# sparse dense
-def wrap_compute_sparse_dense(topi_compute):
-    """wrap sparse dense topi compute"""
-
-    def _compute_sparse_dense(attrs, inputs, out_type):
-        return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3], attrs["sparse_lhs"])]
-
-    return _compute_sparse_dense
-
-
-@override_native_generic_func("sparse_dense_strategy")
-def sparse_dense_strategy(attrs, inputs, out_type, target):
-    """sparse dense generic strategy"""
-    logger.warning("sparse dense is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_dense(topi.nn.sparse_dense),
-        wrap_topi_schedule(topi.generic.schedule_sparse_dense),
-        name="sparse_dense.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("sparse_dense_padded_strategy")
-def sparse_dense_padded_strategy(attrs, inputs, out_type, target):
-    """sparse dense padded generic strategy"""
-    raise NotImplementedError("sparse_dense_padded is only implemented for cuda")
-
-
-# sparse_add
-def wrap_compute_sparse_add(topi_compute):
-    """wrap sparse add topi compute"""
-
-    def _compute_sparse_add(attrs, inputs, out_type):
-        return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3])]
-
-    return _compute_sparse_add
-
-
-@override_native_generic_func("sparse_add_strategy")
-def sparse_add_strategy(attrs, inputs, out_type, target):
-    """sparse add generic strategy"""
-    logger.warning("sparse add is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_add(topi.nn.sparse_add),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="sparse_add.generic",
-    )
-    return strategy
-
-
-# sparse_transpose
-@generic_func
-def schedule_sparse_transpose(attrs, outs, target):
-    """schedule sparse_transpose"""
-    with target:
-        return topi.generic.schedule_sparse_transpose(outs)
-
-
-# sparse conv2d
-def wrap_compute_sparse_conv2d(topi_compute):
-    """wrap sparse conv2d topi compute"""
-
-    def _compute_sparse_conv2d(attrs, inputs, out_type):
-        return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3], attrs["layout"])]
-
-    return _compute_sparse_conv2d
-
-
-@override_native_generic_func("sparse_conv2d_strategy")
-def sparse_conv2d_strategy(attrs, inputs, out_type, target):
-    """sparse conv2d generic strategy"""
-    logger.warning("sparse conv2d is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_conv2d(topi.nn.sparse_conv2d),
-        wrap_topi_schedule(topi.generic.schedule_sparse_conv2d),
-        name="sparse_conv2d.generic",
-    )
-    return strategy
-
-
-# sort
-def wrap_compute_sort(topi_compute):
-    """Wrap sort topi compute"""
-
-    def _compute_sort(attrs, inputs, _):
-        axis = get_const_int(attrs.axis)
-        is_ascend = bool(get_const_int(attrs.is_ascend))
-        return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend)]
-
-    return _compute_sort
-
-
-@override_native_generic_func("sort_strategy")
-def sort_strategy(attrs, inputs, out_type, target):
-    """sort generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sort(topi.sort),
-        wrap_topi_schedule(topi.generic.schedule_sort),
-        name="sort.generic",
-    )
-    return strategy
-
-
-# argsort
-def wrap_compute_argsort(topi_compute):
-    """Wrap argsort topi compute"""
-
-    def _compute_argsort(attrs, inputs, _):
-        axis = get_const_int(attrs.axis)
-        is_ascend = bool(get_const_int(attrs.is_ascend))
-        dtype = attrs.dtype
-        return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)]
-
-    return _compute_argsort
-
-
-@override_native_generic_func("argsort_strategy")
-def argsort_strategy(attrs, inputs, out_type, target):
-    """argsort generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_argsort(topi.argsort),
-        wrap_topi_schedule(topi.generic.schedule_argsort),
-        name="argsort.generic",
-    )
-    return strategy
-
-
-# topk
-def wrap_compute_topk(topi_compute):
-    """Wrap topk compute"""
-
-    def _compute_topk(attrs, inputs, out_type):
-        if attrs.k is not None:
-            k = attrs.k
-        else:
-            k = inputs[1]
-        axis = get_const_int(attrs.axis)
-        ret_type = attrs.ret_type
-        is_ascend = bool(get_const_int(attrs.is_ascend))
-        dtype = attrs.dtype
-        out = topi_compute(inputs[0], k, axis, ret_type, is_ascend, dtype)
-        out = out if isinstance(out, list) else [out]
-        return out
-
-    return _compute_topk
-
-
-@override_native_generic_func("topk_strategy")
-def topk_strategy(attrs, inputs, out_type, target):
-    """topk generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_topk(topi.topk),
-        wrap_topi_schedule(topi.generic.schedule_topk),
-        name="topk.generic",
-    )
-    return strategy
-
-
-# searchsorted
-def wrap_compute_searchsorted(topi_compute):
-    """Wrap searchsorted compute"""
-
-    def _compute_searchsorted(attrs, inputs, out_type):
-        right = attrs.right
-        dtype = attrs.dtype
-        return [topi_compute(inputs[0], inputs[1], right, dtype)]
-
-    return _compute_searchsorted
-
-
-# searchsorted_strategy
-@override_native_generic_func("searchsorted_strategy")
-def searchsorted_strategy(attrs, inputs, out_type, target):
-    """searchsorted generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_searchsorted(topi.searchsorted),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="searchsorted.generic",
-    )
-    return strategy
-
-
-# multibox_prior
-def wrap_compute_multibox_prior(topi_compute):
-    """Wrap multibox_prior compute"""
-
-    def _compute_multibox_prior(attrs, inputs, _):
-        """Compute definition of multibox_prior"""
-        sizes = get_float_tuple(attrs.sizes)
-        ratios = get_float_tuple(attrs.ratios)
-        steps = get_float_tuple(attrs.steps)
-        offsets = get_float_tuple(attrs.offsets)
-        clip = bool(get_const_int(attrs.clip))
-        return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)]
-
-    return _compute_multibox_prior
-
-
-@override_native_generic_func("multibox_prior_strategy")
-def multibox_prior_strategy(attrs, inputs, out_type, target):
-    """multibox_prior generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_multibox_prior(topi.vision.ssd.multibox_prior),
-        wrap_topi_schedule(topi.generic.schedule_multibox_prior),
-        name="multibox_prior.generic",
-    )
-    return strategy
-
-
-# multibox_transform_loc
-def wrap_compute_multibox_transform_loc(topi_compute):
-    """Wrap multibox_transform_loc compute"""
-
-    def _compute_multibox_transform_loc(attrs, inputs, _):
-        """Compute definition of multibox_detection"""
-        clip = bool(get_const_int(attrs.clip))
-        threshold = get_const_float(attrs.threshold)
-        variances = get_float_tuple(attrs.variances)
-        keep_background = bool(get_const_int(attrs.keep_background))
-        return topi_compute(
-            inputs[0], inputs[1], inputs[2], clip, threshold, variances, keep_background
-        )
-
-    return _compute_multibox_transform_loc
-
-
-@override_native_generic_func("multibox_transform_loc_strategy")
-def multibox_transform_loc_strategy(attrs, inputs, out_type, target):
-    """schedule multibox_transform_loc"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_multibox_transform_loc(topi.vision.ssd.multibox_transform_loc),
-        wrap_topi_schedule(topi.generic.schedule_multibox_transform_loc),
-        name="multibox_transform_loc.generic",
-    )
-    return strategy
-
-
-# get_valid_counts
-def wrap_compute_get_valid_counts(topi_compute):
-    """wrap get_valid_counts topi compute"""
-
-    def _compute_get_valid_counts(attrs, inputs, out_type):
-        score_threshold = inputs[1]
-        id_index = get_const_int(attrs.id_index)
-        score_index = get_const_int(attrs.score_index)
-        if attrs.score_threshold is not None:
-            score_threshold = get_const_float(attrs.score_threshold)
-        return topi_compute(inputs[0], score_threshold, id_index, score_index)
-
-    return _compute_get_valid_counts
-
-
-@override_native_generic_func("get_valid_counts_strategy")
-def get_valid_counts_strategy(attrs, inputs, out_type, target):
-    """get_valid_counts generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_get_valid_counts(topi.vision.get_valid_counts),
-        wrap_topi_schedule(topi.generic.schedule_get_valid_counts),
-        name="get_valid_counts.generic",
-    )
-    return strategy
-
-
-# non-maximum suppression
-def wrap_compute_nms(topi_compute):
-    """wrap nms topi compute"""
-
-    def _compute_nms(attrs, inputs, out_type):
-        max_output_size = inputs[3]
-        iou_threshold = inputs[4]
-        return_indices = bool(get_const_int(attrs.return_indices))
-        force_suppress = bool(get_const_int(attrs.force_suppress))
-        top_k = get_const_int(attrs.top_k)
-        coord_start = get_const_int(attrs.coord_start)
-        score_index = get_const_int(attrs.score_index)
-        id_index = get_const_int(attrs.id_index)
-        invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
-        if return_indices:
-            return topi_compute(
-                inputs[0],
-                inputs[1],
-                inputs[2],
-                max_output_size,
-                iou_threshold,
-                force_suppress,
-                top_k,
-                coord_start,
-                score_index,
-                id_index,
-                return_indices,
-                invalid_to_bottom,
-            )
-        return [
-            topi_compute(
-                inputs[0],
-                inputs[1],
-                inputs[2],
-                max_output_size,
-                iou_threshold,
-                force_suppress,
-                top_k,
-                coord_start,
-                score_index,
-                id_index,
-                return_indices,
-                invalid_to_bottom,
-            )
-        ]
-
-    return _compute_nms
-
-
-@override_native_generic_func("non_max_suppression_strategy")
-def nms_strategy(attrs, inputs, out_type, target):
-    """nms generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_nms(topi.vision.non_max_suppression),
-        wrap_topi_schedule(topi.generic.schedule_nms),
-        name="nms.generic",
-    )
-    return strategy
-
-
-def wrap_compute_all_class_nms(topi_compute):
-    """wrap all class nms topi compute"""
-
-    def _compute_nms(attrs, inputs, out_type):
-        max_output_size = inputs[2]
-        iou_threshold = inputs[3]
-        score_threshold = inputs[4]
-        output_format = attrs.output_format
-        return topi_compute(
-            inputs[0], inputs[1], max_output_size, iou_threshold, score_threshold, output_format
-        )
-
-    return _compute_nms
-
-
-@override_native_generic_func("all_class_non_max_suppression_strategy")
-def all_class_nms_strategy(attrs, inputs, out_type, target):
-    """all class nms generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_all_class_nms(topi.vision.all_class_non_max_suppression),
-        wrap_topi_schedule(topi.generic.schedule_nms),
-        name="all_class_nms.generic",
-    )
-    return strategy
-
-
-def wrap_compute_regular_nms(topi_compute):
-    """wrap regular nms topi compute"""
-
-    def _compute_nms(attrs, inputs, out_type):
-        return topi_compute(
-            inputs[0],
-            inputs[1],
-            attrs.max_detections_per_class,
-            attrs.max_detections,
-            attrs.num_classes,
-            attrs.iou_threshold,
-            attrs.score_threshold,
-        )
-
-    return _compute_nms
-
-
-@override_native_generic_func("regular_non_max_suppression_strategy")
-def regular_nms_strategy(attrs, inputs, out_type, target):
-    """regular nms generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_regular_nms(topi.vision.regular_non_max_suppression),
-        wrap_topi_schedule(topi.generic.schedule_nms),
-        name="regular_nms.generic",
-    )
-    return strategy
-
-
-# roi_align
-def wrap_compute_roi_align(topi_compute):
-    """wrap roi_align topi compute"""
-
-    def _compute_roi_align(attrs, inputs, out_type):
-        pooled_size = get_const_tuple(attrs.pooled_size)
-        mode = bytes(attrs.mode, "utf-8")
-        return [
-            topi_compute(
-                inputs[0],
-                inputs[1],
-                pooled_size=pooled_size,
-                spatial_scale=attrs.spatial_scale,
-                sample_ratio=attrs.sample_ratio,
-                mode=mode,
-            )
-        ]
-
-    return _compute_roi_align
-
-
-@override_native_generic_func("roi_align_strategy")
-def roi_align_strategy(attrs, inputs, out_type, target):
-    """roi_align generic strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.layout
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw),
-            wrap_topi_schedule(topi.generic.schedule_roi_align),
-            name="roi_align.generic",
-        )
-    else:
-        assert layout == "NHWC", "layout must be NCHW or NHWC."
-        strategy.add_implementation(
-            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc),
-            wrap_topi_schedule(topi.generic.schedule_roi_align),
-            name="roi_align.generic",
-        )
-    return strategy
-
-
-# sparse_fill_empty_rows
-@override_native_generic_func("sparse_fill_empty_rows_strategy")
-def sparse_fill_empty_rows_strategy(attrs, outs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_fill_empty_rows(topi.sparse_fill_empty_rows),
-        wrap_topi_schedule(topi.generic.schedule_sparse_fill_empty_rows),
-        name="sparse_fill_empty_rows.generic",
-    )
-    return strategy
-
-
-def wrap_compute_sparse_fill_empty_rows(topi_compute):
-    """Wrap sparse_fill_empty_rows compute"""
-
-    def _compute_sparse_fill_empty_rows(attrs, inputs, output_type):
-        return topi_compute(
-            inputs[0],
-            inputs[1],
-            inputs[2],
-            inputs[3],
-            output_type.fields[0].shape,
-            output_type.fields[1].shape,
-            output_type.fields[2].shape,
-        )
-
-    return _compute_sparse_fill_empty_rows
-
-
-# sparse_reshape
-@override_native_generic_func("sparse_reshape_strategy")
-def sparse_reshape_strategy(attrs, outs, out_type, target):
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_reshape(topi.sparse_reshape),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="sparse_reshape.generic",
-    )
-    return strategy
-
-
-def wrap_compute_sparse_reshape(topi_compute):
-    """Wrap sparse_reshape compute"""
-
-    def _compute_sparse_reshape(attrs, inputs, output_type):
-        return topi_compute(
-            inputs[0],
-            inputs[1],
-            inputs[2],
-            output_type.fields[0].shape,
-            output_type.fields[1].shape,
-        )
-
-    return _compute_sparse_reshape
-
-
-# stft
-@override_native_generic_func("stft_strategy")
-def stft_strategy(attrs, outs, out_type, target):
-    """stft generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_stft(topi.stft),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="stft.generic",
-    )
-    return strategy
-
-
-def wrap_compute_stft(topi_compute):
-    """Wrap stft compute"""
-
-    def _compute_stft(attrs, inputs, output_type):
-        return [
-            topi_compute(
-                inputs[0],
-                attrs.n_fft,
-                attrs.hop_length,
-                attrs.win_length,
-                inputs[1],
-                attrs.normalized,
-                attrs.onesided,
-                output_type.shape,
-            )
-        ]
-
-    return _compute_stft
-
-
-# dft
-@override_native_generic_func("dft_strategy")
-def dft_strategy(attrs, outs, out_type, target):
-    """DFT generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dft(topi.dft),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="dft.generic",
-    )
-    return strategy
-
-
-def wrap_compute_dft(topi_compute):
-    """Wrap DFT compute"""
-
-    def _compute_dft(attrs, inputs, _):
-        return topi_compute(inputs[0], inputs[1], attrs.inverse)
-
-    return _compute_dft
-
-
-# trilu
-@override_native_generic_func("trilu_strategy")
-def trilu_strategy(attrs, outs, out_type, target):
-    """trilu generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_trilu(topi.trilu),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="trilu.generic",
-    )
-    return strategy
-
-
-def wrap_compute_trilu(topi_compute):
-    """Wrap trilu compute"""
-
-    def _compute_trilu(attrs, inputs, output_type):
-        return [topi_compute(inputs[0], inputs[1], attrs.upper)]
-
-    return _compute_trilu
-
-
-# roi_pool
-@generic_func
-def schedule_roi_pool(attrs, outs, target):
-    """schedule roi_pool"""
-    with target:
-        return topi.generic.schedule_roi_pool(outs)
-
-
-# proposal
-def wrap_compute_proposal(topi_compute):
-    """wrap proposal topi compute"""
-
-    def _compute_proposal(attrs, inputs, out_type):
-        scales = get_float_tuple(attrs.scales)
-        ratios = get_float_tuple(attrs.ratios)
-        feature_stride = attrs.feature_stride
-        threshold = attrs.threshold
-        rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
-        rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
-        rpn_min_size = attrs.rpn_min_size
-        iou_loss = bool(get_const_int(attrs.iou_loss))
-        return [
-            topi_compute(
-                inputs[0],
-                inputs[1],
-                inputs[2],
-                scales,
-                ratios,
-                feature_stride,
-                threshold,
-                rpn_pre_nms_top_n,
-                rpn_post_nms_top_n,
-                rpn_min_size,
-                iou_loss,
-            )
-        ]
-
-    return _compute_proposal
-
-
-@override_native_generic_func("proposal_strategy")
-def proposal_strategy(attrs, inputs, out_type, target):
-    """proposal generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_proposal(topi.vision.rcnn.proposal),
-        wrap_topi_schedule(topi.generic.schedule_proposal),
-        name="proposal.generic",
-    )
-    return strategy
-
-
-# scatter_elements
-@override_native_generic_func("scatter_elements_strategy")
-def scatter_elements_strategy(attrs, inputs, out_type, target):
-    """scatter_elements generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scatter_elements(topi.scatter_elements),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="scatter_elements.generic",
-    )
-    # TODO(vvchernov): implement specialized case (rank=1, reduction="update"), see cuda strategy
-    return strategy
-
-
-def wrap_compute_scatter_elements(topi_compute):
-    """Wrap scatter_elements topi compute"""
-
-    def _compute_scatter_elements(attrs, inputs, _):
-        return [topi_compute(inputs[0], inputs[1], inputs[2], attrs.axis, attrs.reduction)]
-
-    return _compute_scatter_elements
-
-
-# scatter_nd
-@override_native_generic_func("scatter_nd_strategy")
-def scatter_nd_strategy(attrs, inputs, out_type, target):
-    """scatter_nd generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scatter_nd(topi.scatter_nd),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="scatter_nd.generic",
-    )
-    return strategy
-
-
-def wrap_compute_scatter_nd(topi_compute):
-    """Wrap scatter_nd topi compute"""
-
-    def _compute_scatter_nd(attrs, inputs, _):
-        return [topi_compute(inputs[0], inputs[1], inputs[2], attrs.mode)]
-
-    return _compute_scatter_nd
-
-
-# bitserial_conv2d
-def wrap_compute_bitserial_conv2d(topi_compute):
-    """wrap bitserial_conv2d topi compute"""
-
-    def compute_bitserial_conv2d(attrs, inputs, out_dtype):
-        """Compute definition for bitserial conv2d."""
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        activation_bits = attrs.activation_bits
-        weight_bits = attrs.weight_bits
-        pack_dtype = attrs.pack_dtype
-        out_dtype = attrs.out_dtype
-        unipolar = attrs.unipolar
-        return [
-            topi_compute(
-                inputs[0],
-                inputs[1],
-                strides,
-                padding,
-                activation_bits,
-                weight_bits,
-                pack_dtype,
-                out_dtype,
-                unipolar,
-            )
-        ]
-
-    return compute_bitserial_conv2d
-
-
-@override_native_generic_func("bitserial_conv2d_strategy")
-def bitserial_conv2d_strategy(attrs, inputs, out_type, target):
-    """bitserial_conv2d generic strategy"""
-    logger.warning("bitserial_conv2d is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw),
-            wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nchw),
-            name="bitserial_conv2d_nchw.generic",
-        )
-    elif layout == "NHWC":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc),
-            wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nhwc),
-            name="bitserial_conv2d_nhwc.generic",
-        )
-    else:
-        raise ValueError(f"Data layout {layout} not supported.")
-    return strategy
-
-
-# bitserial_dense
-def wrap_compute_bitserial_dense(topi_compute):
-    """wrap bitserial_dense topi compute"""
-
-    def compute_bitserial_dense(attrs, inputs, out_type):
-        """Compute definition of bitserial dense"""
-        data_bits = attrs.data_bits
-        weight_bits = attrs.weight_bits
-        pack_dtype = attrs.pack_dtype
-        out_dtype = attrs.out_dtype
-        out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
-        unipolar = attrs.unipolar
-        return [
-            topi_compute(
-                inputs[0], inputs[1], data_bits, weight_bits, pack_dtype, out_dtype, unipolar
-            )
-        ]
-
-    return compute_bitserial_dense
-
-
-@override_native_generic_func("bitserial_dense_strategy")
-def bitserial_dense_strategy(attrs, inputs, out_type, target):
-    """bitserial_dense generic strategy"""
-    logger.warning("bitserial_dense is not optimized for this platform.")
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_bitserial_dense(topi.nn.bitserial_dense),
-        wrap_topi_schedule(topi.generic.schedule_bitserial_dense),
-        name="bitserial_dense.generic",
-    )
-    return strategy
-
-
-# correlation
-def wrap_compute_correlation(topi_compute):
-    """wrap correlation topi compute"""
-
-    def _compute_correlation(attrs, inputs, out_type):
-        kernel_size = attrs.kernel_size
-        max_displacement = attrs.max_displacement
-        stride1 = attrs.stride1
-        stride2 = attrs.stride2
-        padding = get_const_tuple(attrs.padding)
-        is_multiply = attrs.is_multiply
-        return [
-            topi_compute(
-                inputs[0],
-                inputs[1],
-                kernel_size,
-                max_displacement,
-                stride1,
-                stride2,
-                padding,
-                is_multiply,
-            )
-        ]
-
-    return _compute_correlation
-
-
-@override_native_generic_func("correlation_strategy")
-def correlation_strategy(attrs, inputs, out_type, target):
-    """correlation generic strategy"""
-    logger.warning("correlation is not optimized for this platform.")
-    layout = attrs.layout
-    assert layout == "NCHW", "Only support NCHW layout"
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_correlation(topi.nn.correlation_nchw),
-        wrap_topi_schedule(topi.generic.schedule_correlation_nchw),
-        name="correlation.generic",
-    )
-    return strategy
-
-
-# argwhere
-def wrap_compute_argwhere(topi_compute):
-    """wrap argwhere topi compute"""
-
-    def _compute_argwhere(attrs, inputs, out_type):
-        output_shape = []
-        for s in out_type.shape:
-            if hasattr(s, "value"):
-                output_shape.append(s)
-            else:
-                output_shape.append(te.var("any_dim", "int32"))
-        new_output_type = ir.TensorType(output_shape, "int32")
-        return [topi_compute(new_output_type, inputs[0])]
-
-    return _compute_argwhere
-
-
-@override_native_generic_func("argwhere_strategy")
-def argwhere_strategy(attrs, inputs, out_type, target):
-    """argwhere generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_argwhere(topi.argwhere),
-        wrap_topi_schedule(topi.generic.schedule_argwhere),
-        name="argwhere.generic",
-    )
-    return strategy
-
-
-# threefry_generate
-def wrap_compute_threefry_generate(topi_compute):
-    """Wrap threefry_generate topi compute"""
-
-    def _compute_threefry_generate(attrs, inputs, _):
-        return topi_compute(inputs[0], attrs.out_shape)
-
-    return _compute_threefry_generate
-
-
-@override_native_generic_func("threefry_generate_strategy")
-def threefry_generate_strategy(attrs, inputs, out_type, target):
-    """threefry_generate generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_threefry_generate(topi.random.threefry_generate),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="threefry_generate.generic",
-    )
-    return strategy
-
-
-# threefry_split
-def wrap_compute_threefry_split(topi_compute):
-    """Wrap threefry_split topi compute"""
-
-    def _compute_threefry_split(attrs, inputs, _):
-        return topi_compute(inputs[0])
-
-    return _compute_threefry_split
-
-
-@override_native_generic_func("threefry_split_strategy")
-def threefry_split_strategy(attrs, inputs, out_type, target):
-    """threefry_split generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_threefry_split(topi.random.threefry_split),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="threefry_split.generic",
-    )
-    return strategy
-
-
-# uniform
-def wrap_compute_uniform(topi_compute):
-    """Wrap uniform topi compute"""
-
-    def _compute_uniform(attrs, inputs, _):
-        return list(topi_compute(inputs[0], inputs[1], inputs[2], attrs.out_shape, attrs.out_dtype))
-
-    return _compute_uniform
-
-
-@override_native_generic_func("uniform_strategy")
-def uniform_strategy(attrs, inputs, out_type, target):
-    """uniform generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_uniform(topi.random.uniform),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="uniform.generic",
-    )
-    return strategy
-
-
-# multinomial
-def wrap_compute_multinomial(topi_compute):
-    """Wrap multinomial topi compute"""
-
-    def _compute_multinomial(attrs, inputs, _):
-        return list(topi_compute(inputs[0], inputs[1], attrs.num_samples))
-
-    return _compute_multinomial
-
-
-# sliding_window
-def wrap_compute_sliding_window():
-    """Wrap sliding_window topi compute"""
-
-    def _compute_sliding_window(attrs, inputs, _):
-        return [topi.sliding_window(inputs[0], attrs.axis, attrs.window_shape, attrs.strides)]
-
-    return _compute_sliding_window
-
-
-@override_native_generic_func("sliding_window_strategy")
-def sliding_window_strategy(attrs, inputs, out_type, target):
-    """sliding_window generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sliding_window(),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="sliding_window.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("normal_strategy")
-def normal_strategy(attrs, inputs, out_type, target):
-    """normal generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_uniform(topi.random.normal),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="normal.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("multinomial_strategy")
-def multinomial_strategy(attrs, inputs, out_type, target):
-    """multinomial generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_multinomial(topi.random.multinomial),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="multinomial.generic",
-    )
-    return strategy
-
-
-def wrap_compute_scanop(topi_compute):
-    """Wrap scanop style topi compute"""
-
-    def _compute_scanop(attrs, inputs, _):
-        return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
-
-    return _compute_scanop
-
-
-def wrap_compute_concat(topi_compute):
-    """Wrap concatenate topi compute"""
-
-    def _compute_concat(attrs, inputs, _):
-        return [topi_compute(inputs, attrs.axis)]
-
-    return _compute_concat
-
-
-@override_native_generic_func("cumsum_strategy")
-def cumsum_strategy(attrs, inputs, out_type, target):
-    """cumsum generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scanop(topi.cumsum),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="cumsum.generic",
-    )
-    return strategy
-
-
-@override_native_generic_func("concat_strategy")
-def concatenate_strategy(attrs, inputs, out_type, target):
-    """concatenate generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_concat(topi.concatenate),
-        wrap_topi_schedule(topi.generic.schedule_injective),
-        name="concatenate",
-    )
-    return strategy
-
-
-@override_native_generic_func("cumprod_strategy")
-def cumprod_strategy(attrs, inputs, out_type, target):
-    """cumprod generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scanop(topi.cumprod),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="cumprod.generic",
-    )
-    return strategy
-
-
-def wrap_compute_unique(topi_compute):
-    """Wrap unique topi compute"""
-
-    def _compute_unique(attrs, inputs, _):
-        return topi_compute(inputs[0], attrs.sorted, attrs.return_counts)
-
-    return _compute_unique
-
-
-@override_native_generic_func("unique_strategy")
-def unique_strategy(attrs, inputs, out_type, target):
-    """unique generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_unique(topi.unique),
-        wrap_topi_schedule(topi.generic.schedule_unique),
-        name="unique.generic",
-    )
-    return strategy
-
-
-@generic_func
-def schedule_transpose(attrs, outs, target):
-    """schedule transpose"""
-    with target:
-        return schedule_injective(attrs, outs, target)
-
-
-# invert_permutation
-def wrap_compute_invert_permutation(topi_compute):
-    """wrap invert_permutation topi compute"""
-
-    def _compute_invert_permutation(attrs, inputs, out_type):
-        return [topi_compute(inputs[0])]
-
-    return _compute_invert_permutation
-
-
-@override_native_generic_func("invert_permutation_strategy")
-def invert_permutation_strategy(attrs, inputs, out_type, target):
-    """invert_permutation generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_invert_permutation(topi.invert_permutation),
-        wrap_topi_schedule(topi.generic.schedule_injective),
-        name="invert_permutation.generic",
-    )
-    return strategy
-
-
-def wrap_compute_einsum(topi_compute):
-    """Wrap einsum topi compute"""
-
-    def _compute_einsum(attrs, inputs, _):
-        return [topi_compute(attrs.equation, *inputs)]
-
-    return _compute_einsum
-
-
-@override_native_generic_func("einsum_strategy")
-def einsum_strategy(attrs, inputs, out_type, target):
-    """einsum generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_einsum(topi.einsum),
-        wrap_topi_schedule(topi.generic.schedule_einsum),
-        name="einsum.generic",
-    )
-    return strategy
-
-
-# conv2d_backward_weight
-def wrap_compute_conv2d_backward_weight(topi_compute):
-    """wrap conv2d_backward_weight topi compute"""
-
-    def _compute_conv2d_backward_weight(attrs, inputs, out_dtype):
-        kernel_size = get_const_tuple(attrs.kernel_size)
-        padding = get_const_tuple(attrs.padding)
-        strides = get_const_tuple(attrs.strides)
-        dilation = get_const_tuple(attrs.dilation)
-        groups = attrs.groups
-        out_dtype = attrs.out_dtype
-        layout = attrs.data_layout
-        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
-        out = topi_compute(
-            inputs[0], inputs[1], kernel_size, padding, strides, dilation, groups, layout, out_dtype
-        )
-        return [out]
-
-    return _compute_conv2d_backward_weight
-
-
-@override_native_generic_func("conv2d_backward_weight_strategy")
-def conv2d_backward_weight_strategy(attrs, inputs, out_type, target):
-    """wgrad generic strategy"""
-    raise RuntimeError(
-        "conv2d_backward_weight is currently only supported with cudnn. "
-        "Please run Legalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("layout_transform_strategy")
-def layout_transform_strategy(attrs, inputs, out_type, target):
-    """layout transform generic strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_layout_transform(topi.layout_transform),
-        # Defined earlier in the file
-        schedule_injective,
-        name="layout_transform.generic",
-    )
-    return strategy
-
-
-def wrap_compute_layout_transform(topi_compute, schedule_rule="None"):
-    """Wrap layout transform compute"""
-
-    def _compute_layout_transform(attrs, inputs, output_type):
-        return [topi_compute(inputs[0], attrs.src_layout, attrs.dst_layout, schedule_rule)]
-
-    return _compute_layout_transform
diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py
deleted file mode 100644
index 2db3b2c886f5..000000000000
--- a/python/tvm/relay/op/strategy/hexagon.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of Hexagon operator strategy."""
-
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-
-from tvm import topi
-from .generic import *
-from .. import op as _op
-
-# --- Op strategy registration
-
-
-@batch_matmul_strategy.register("hexagon")
-def batch_matmul_strategy_hexagon(attrs, inputs, out_type, target):
-    """batch_matmul strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_batch_matmul(topi.nn.batch_matmul, need_out_dtype=True),
-        wrap_topi_schedule(topi.hexagon.schedule_batch_matmul),
-        name="batch_matmul.hexagon",
-    )
-    return strategy
-
-
-@concatenate_strategy.register("hexagon")
-def concatenate_strategy_hexagon(attrs, inputs, out_type, target):
-    """concatenate strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_concat(topi.concatenate),
-        wrap_topi_schedule(topi.hexagon.schedule_injective),
-        name="concatenate.hexagon",
-    )
-    return strategy
-
-
-@conv2d_strategy.register("hexagon")
-def conv2d_strategy_hexagon(attrs, inputs, out_type, target):
-    """Conv2d strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    data_layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    groups = attrs.groups
-    data, kernel = inputs
-    layout = attrs.data_layout
-
-    if groups == 1:
-        if data_layout == "NHWC" and kernel_layout == "HWIO":
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_nhwc),
-                wrap_topi_schedule(topi.hexagon.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.hexagon",
-            )
-        elif data_layout == "NCHW" and kernel_layout == "OIHW":
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_nchw),
-                wrap_topi_schedule(topi.hexagon.schedule_conv2d_nchw),
-                name="conv2d_nchw.hexagon",
-            )
-        else:
-            raise RuntimeError(
-                f"Unsupported layouts: data_layout:{data_layout}, kernel_layout:{kernel_layout}, "
-                f"groups:{attrs.groups}"
-            )
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.hexagon.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.hexagon",
-            )
-        elif layout == "NHWC":
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
-                wrap_topi_schedule(topi.hexagon.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_conv2d_nhwc.hexagon",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout}")
-    else:  # group_conv2d
-        raise RuntimeError(f"Unsupported group_conv2d layout {layout}")
-
-    return strategy
-
-
-@dense_strategy.register("hexagon")
-def dense_strategy_hexagon(attrs, inputs, out_type, target):
-    """Dense strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dense(topi.nn.dense),
-        wrap_topi_schedule(topi.hexagon.schedule_dense),
-        name="dense.hexagon",
-    )
-    return strategy
-
-
-@softmax_strategy.register("hexagon")
-def softmax_strategy_hexagon(attrs, inputs, out_type, target):
-    """Softmax strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(topi.hexagon.schedule_softmax),
-        name="softmax.hexagon",
-    )
-    return strategy
-
-
-@conv2d_transpose_strategy.register("hexagon")
-def conv2d_transpose_strategy_hexagon(attrs, inputs, out_type, target):
-    """conv2d_transpose hexagon strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCHW", "only support nchw for now"
-    assert dilation == (1, 1), "not support dilate now"
-    strategy = _op.OpStrategy()
-    if groups == 1:
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw),
-            wrap_topi_schedule(topi.hexagon.schedule_conv2d_transpose_nchw),
-            name="conv2d_transpose_nchw.generic",
-        )
-    else:
-        raise RuntimeError(f"Unsupported conv2d_transpose layout {layout}")
-    return strategy
-
-
-# --- Op schedule registration
-
-
-@schedule_adaptive_pool.register("hexagon")
-def schedule_adaptive_pool_hexagon(attrs, outs, target):
-    """Schedule adaptive pool ops for Hexagon"""
-    with target:
-        return topi.hexagon.schedule_adaptive_pool(outs)
-
-
-@schedule_injective.register("hexagon")
-def schedule_injective_hexagon(attrs, outs, target):
-    """Schedule injective ops for Hexagon"""
-    with target:
-        return topi.hexagon.schedule_injective(outs)
-
-
-@schedule_concatenate.register("hexagon")
-def schedule_concatenate_hexagon(attrs, outs, target):
-    """Schedule concatenate ops for Hexagon"""
-    with target:
-        return topi.hexagon.schedule_injective(outs)
-
-
-@schedule_pad.register("hexagon")
-def schedule_pad_hexagon(attrs, outs, target):
-    """Schedule pad ops for Hexagon"""
-    with target:
-        return topi.hexagon.schedule_pad(outs)
-
-
-@schedule_pool.register("hexagon")
-def schedule_pool_hexagon(attrs, outs, target):
-    """Schedule pool ops for Hexagon"""
-    with target:
-        return topi.hexagon.schedule_pool(outs)
-
-
-@schedule_reduce.register("hexagon")
-def schedule_reduce_hexagon(attrs, outs, target):
-    """Schedule reduction ops for Hexagon"""
-    with target:
-        return topi.hexagon.schedule_reduce(outs)
-
-
-@conv2d_NCHWc_strategy.register("hexagon")
-def conv2d_NCHWc_strategy_hexagon(attrs, inputs, out_type, target):
-    """conv2d_NCHWc_ hexagon strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(
-            topi.hexagon.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
-        ),
-        wrap_topi_schedule(topi.hexagon.schedule_conv2d_NCHWc_int8),
-        name="conv2d_NCHWc_int8.hexagon",
-    )
-    return strategy
-
-
-@dense_pack_strategy.register("hexagon")
-def dense_pack_strategy_hexagon(attrs, inputs, out_type, target):
-    """dense_pack hexagon strategy"""
-    strategy = _op.OpStrategy()
-
-    if (
-        inputs[0].dtype == "uint8"
-        and inputs[1].dtype == "uint8"
-        and out_type.dtype == "int32"
-        and attrs["weight_layout"] == "NC32n4c"
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.hexagon.dense.dense_u8u8i32_vrmpy_compute),
-            wrap_topi_schedule(topi.hexagon.dense.dense_u8u8i32_vrmpy_schedule),
-            name="dense_uint8.hexagon",
-            plevel=12,
-        )
-
-    return strategy
-
-
-@fast_softmax_strategy.register("hexagon")
-def fast_softmax_strategy_cpu(attrs, inputs, out_type, target):
-    """fast_softmax hexagon strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.fast_softmax),
-        wrap_topi_schedule(topi.hexagon.schedule_softmax),
-        name="fast_softmax.hexagon",
-    )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py
deleted file mode 100644
index 61f5a18e9ce9..000000000000
--- a/python/tvm/relay/op/strategy/hls.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of HLS operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-from tvm import topi
-from .generic import *
-from .. import op as _op
-
-
-@schedule_injective.register("hls")
-def schedule_injective_hls(attrs, outs, target):
-    """schedule injective ops for hls"""
-    with target:
-        return topi.hls.schedule_injective(outs)
-
-
-@schedule_reduce.register("hls")
-def schedule_reduce_hls(attrs, outs, target):
-    """schedule reduction ops for hls"""
-    with target:
-        return topi.hls.schedule_reduce(outs)
-
-
-@schedule_concatenate.register("hls")
-def schedule_concatenate_hls(attrs, outs, target):
-    """schedule concatenate for hls"""
-    with target:
-        return topi.hls.schedule_injective(outs)
-
-
-@schedule_pool.register("hls")
-def schedule_pool_hls(attrs, outs, target):
-    """schedule pooling ops for hls"""
-    with target:
-        return topi.hls.schedule_pool(outs, attrs.layout)
-
-
-@schedule_adaptive_pool.register("hls")
-def schedule_adaptive_pool_hls(attrs, outs, target):
-    """schedule adaptive pooling ops for hls"""
-    with target:
-        return topi.hls.schedule_adaptive_pool(outs)
-
-
-@softmax_strategy.register("hls")
-def softmax_strategy_hls(attrs, inputs, out_type, target):
-    """softmax hls strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(topi.hls.schedule_softmax),
-        name="softmax.hls",
-    )
-    return strategy
-
-
-@log_softmax_strategy.register("hls")
-def log_softmax_strategy_hls(attrs, inputs, out_type, target):
-    """log_softmax hls strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.log_softmax),
-        wrap_topi_schedule(topi.hls.schedule_softmax),
-        name="log_softmax.hls",
-    )
-    return strategy
-
-
-@conv2d_strategy.register("hls")
-def conv2d_strategy_hls(attrs, inputs, out_type, target):
-    """conv2d hls strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    (dilation_h, dilation_w) = dilation
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_nchw),
-                wrap_topi_schedule(topi.hls.schedule_conv2d_nchw),
-                name="conv2d_nchw.hls",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_nhwc),
-                wrap_topi_schedule(topi.hls.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.hls",
-            )
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout}")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.hls",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_nhwc.hls",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout}")
-    else:  # group_conv2d
-        raise RuntimeError("group_conv2d is not supported for hls")
-    return strategy
-
-
-@conv2d_NCHWc_strategy.register("hls")
-def conv2d_NCHWc_strategy_hls(attrs, inputs, out_type, target):
-    """conv2d_NCHWc hls strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(topi.nn.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
-        wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc),
-        name="conv2d_NCHWc.hls",
-    )
-    return strategy
-
-
-@conv2d_transpose_strategy.register("hls")
-def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target):
-    """conv2d_transpose hls strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCHW", "only support nchw for now"
-    assert dilation == (1, 1), "not support dilate now"
-    assert groups == 1, "only support groups == 1 for now"
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw),
-        wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw),
-        name="conv2d_transpose_nchw.hls",
-    )
-    return strategy
-
-
-@dense_strategy.register("hls")
-def dense_strategy_hls(attrs, inputs, out_type, target):
-    """dense hls strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dense(topi.nn.dense),
-        wrap_topi_schedule(topi.hls.schedule_dense),
-        name="dense.hls",
-    )
-    return strategy
-
-
-@bitserial_conv2d_strategy.register("hls")
-def bitserial_conv2d_strategy_hls(attrs, inputs, out_type, target):
-    """bitserial_conv2d hls strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw),
-            wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nchw),
-            name="bitserial_conv2d_nchw.hls",
-        )
-    elif layout == "NHWC":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc),
-            wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nhwc),
-            name="bitserial_conv2d_nhwc.hls",
-        )
-    else:
-        raise ValueError(f"Data layout {layout} not supported.")
-    return strategy
diff --git a/python/tvm/relay/op/strategy/intel_graphics.py b/python/tvm/relay/op/strategy/intel_graphics.py
deleted file mode 100644
index 4bbafb62f2f2..000000000000
--- a/python/tvm/relay/op/strategy/intel_graphics.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of x86 operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-from tvm import topi
-from .generic import *
-from .. import op as _op
-
-
-@conv2d_strategy.register("intel_graphics")
-def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target):
-    """conv2d intel graphics strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.intel_graphics.conv2d_nchw),
-                wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_nchw),
-                name="conv2d_nchw.intel_graphics",
-            )
-            # conv2d_NCHWc won't work without alter op layout pass
-            # TODO(@Laurawly): fix this
-            strategy.add_implementation(
-                wrap_compute_conv2d(
-                    topi.intel_graphics.conv2d_NCHWc, need_data_layout=True, need_out_layout=True
-                ),
-                wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc),
-                name="conv2d_NCHWc.intel_graphics",
-                plevel=5,
-            )
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout} for intel graphics")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.intel_graphics.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.intel_graphics.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.intel_graphics",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout}")
-    else:  # group_conv2d
-        raise RuntimeError("group_conv2d is not supported for intel graphics")
-    return strategy
-
-
-@conv2d_NCHWc_strategy.register("intel_graphics")
-def conv2d_NCHWc_strategy_intel_graphics(attrs, inputs, out_type, target):
-    """conv2d_NCHWc intel_graphics strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(
-            topi.intel_graphics.conv2d_NCHWc, need_data_layout=True, need_out_layout=True
-        ),
-        wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc),
-        name="conv2d_NCHWc.intel_graphics",
-    )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py
deleted file mode 100644
index f37071c9fcbd..000000000000
--- a/python/tvm/relay/op/strategy/mali.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of mali operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-import re
-
-from tvm import topi
-from tvm.auto_scheduler import is_auto_scheduler_enabled
-from tvm.meta_schedule import is_meta_schedule_enabled
-
-from .. import op as _op
-from .generic import *
-
-
-@conv2d_strategy.register("mali")
-def conv2d_strategy_mali(attrs, inputs, out_type, target):
-    """conv2d mali strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
-    stride_h, stride_w = attrs.get_int_tuple("strides")
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    if groups == 1:
-        if layout == "NCHW":
-            if kernel_layout == "OIHW":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.mali",
-                )
-                # check if winograd algorithm is applicable
-                _, _, kh, kw = get_const_tuple(kernel.shape)
-                if (
-                    kh == 3
-                    and kw == 3
-                    and stride_h == 1
-                    and stride_w == 1
-                    and dilation_h == 1
-                    and dilation_w == 1
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
-                        wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
-                        name="conv2d_nchw_winograd.mali",
-                        plevel=5,
-                    )
-            elif re.match(r"OIHW\d*o", kernel_layout):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.mali",
-                )
-            else:
-                raise RuntimeError(f"Unsupported weight layout {kernel_layout} for conv2d NCHW")
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            need_auto_scheduler_layout = is_auto_scheduler_enabled()
-            need_meta_schedule_layout = is_meta_schedule_enabled()
-            if need_auto_scheduler_layout or need_meta_schedule_layout:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(
-                        topi.nn.conv2d_nhwc,
-                        need_auto_scheduler_layout=need_auto_scheduler_layout,
-                        need_meta_schedule_layout=need_meta_schedule_layout,
-                    ),
-                    naive_schedule,
-                    name="conv2d_nhwc.mali",
-                )
-                is_winograd_applicable = False
-                if len(kernel.shape) == 4:
-                    kernel_h, kernel_w, _, _ = get_const_tuple(kernel.shape)
-                    is_winograd_applicable = (
-                        "float" in data.dtype
-                        and "float" in kernel.dtype
-                        and kernel_h == 3
-                        and kernel_w == 3
-                        and stride_h == 1
-                        and stride_w == 1
-                        and dilation_h == 1
-                        and dilation_w == 1
-                    )
-                if is_winograd_applicable:
-                    if need_meta_schedule_layout:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(
-                                topi.nn.conv2d_winograd_nhwc,
-                                need_auto_scheduler_layout=False,
-                                need_meta_schedule_layout=True,
-                            ),
-                            naive_schedule,  # this implementation should never be picked by autotvm
-                            name="conv2d_nhwc.winograd",
-                            plevel=15,
-                        )
-                    elif need_auto_scheduler_layout:
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(
-                                topi.nn.conv2d_winograd_nhwc,
-                                need_auto_scheduler_layout=True,
-                                need_meta_schedule_layout=False,
-                            ),
-                            naive_schedule,  # this implementation should never be picked by autotvm
-                            name="conv2d_nhwc.winograd",
-                            plevel=15,
-                        )
-                    else:
-                        raise RuntimeError("Both AutoScheduler and MetaSchedule are not enabled")
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.mali.conv2d_nhwc_spatial_pack),
-                    wrap_topi_schedule(topi.mali.schedule_conv2d_nhwc_spatial_pack),
-                    name="conv2d_nhwc_spatial_pack.mali",
-                )
-
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout} for mali")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw),
-                wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.mali",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            if is_auto_scheduler_enabled():
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                    naive_schedule,
-                    name="depthwise_conv2d_nhwc.mali",
-                )
-            elif is_meta_schedule_enabled():
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                    naive_schedule,
-                    name="depthwise_conv2d_nhwc.mali",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc),
-                    wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc),
-                    name="depthwise_conv2d_nhwc.mali",
-                )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout} for mali")
-    else:  # group_conv2d
-        raise RuntimeError("group_conv2d is not supported for mali")
-    return strategy
-
-
-@conv2d_winograd_without_weight_transform_strategy.register("mali")
-def conv2d_winograd_without_weight_transform_strategy_mali(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform mali strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    strides = attrs.get_int_tuple("strides")
-    kernel = inputs[1]
-    assert dilation == (1, 1), "Do not support dilate now"
-    assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    if layout == "NCHW":
-        assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim"
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
-            wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
-            name="conv2d_nchw_winograd.mali",
-        )
-    elif layout == "NHWC":
-        need_auto_scheduler_layout = is_auto_scheduler_enabled()
-        need_meta_schedule_layout = is_meta_schedule_enabled()
-        if need_auto_scheduler_layout or need_meta_schedule_layout:
-            strategy.add_implementation(
-                wrap_compute_conv2d(
-                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
-                    need_auto_scheduler_layout=need_auto_scheduler_layout,
-                    need_meta_schedule_layout=need_meta_schedule_layout,
-                ),
-                naive_schedule,  # this implementation should never be picked by autotvm
-                name="conv2d_nhwc_winograd_without_weight_transform",
-                plevel=15,
-            )
-        else:
-            raise RuntimeError(
-                "Winograd conv2d NHWC is not enabled for mali without auto_scheduler."
-            )
-    else:
-        raise RuntimeError(f"Unsupported conv2d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-@dense_strategy.register("mali")
-def dense_strategy_mali(attrs, inputs, out_type, target):
-    """dense mali strategy"""
-    strategy = _op.OpStrategy()
-    if is_auto_scheduler_enabled():
-        strategy.add_implementation(
-            wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True),
-            naive_schedule,
-            name="dense.mali",
-        )
-    elif is_meta_schedule_enabled():
-        strategy.add_implementation(
-            wrap_compute_dense(topi.nn.dense, need_meta_schedule_layout=True),
-            naive_schedule,
-            name="dense.mali",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_dense(topi.mali.dense),
-            wrap_topi_schedule(topi.mali.schedule_dense),
-            name="dense.mali",
-        )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py
deleted file mode 100644
index d80f3479754b..000000000000
--- a/python/tvm/relay/op/strategy/rocm.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of ROCm operator strategy."""
-# pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import
-from tvm import topi
-from tvm.te import SpecializedCondition
-from tvm.contrib.thrust import can_use_rocthrust
-from tvm.contrib import miopen
-
-from .generic import *
-from .. import op as _op
-from .cuda import batch_matmul_strategy_cuda, conv2d_strategy_cuda, dense_strategy_cuda
-
-
-@conv2d_strategy.register("rocm")
-def conv2d_strategy_rocm(attrs, inputs, out_type, target):
-    """conv2d rocm strategy"""
-    groups = attrs.groups
-    layout = attrs.data_layout
-    padding = attrs.get_int_tuple("padding")
-
-    strategy = conv2d_strategy_cuda(attrs, inputs, out_type, target)
-
-    # add miopen implementation
-    if (
-        "miopen" in target.libs
-        and groups == 1
-        and layout == "NCHW"
-        and padding[0] == padding[2]
-        and padding[1] == padding[3]
-    ):
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, need_data_layout=True),
-            wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen),
-            name="conv2d_nchw_miopen.rocm",
-            plevel=50,
-        )
-
-    return strategy
-
-
-@dense_strategy.register("rocm")
-def dense_strategy_rocm(attrs, inputs, out_type, target):
-    """Dense strategy for ROCM"""
-    assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense"
-    strategy = dense_strategy_cuda(attrs, inputs, out_type, target)
-
-    if target.kind.name == "rocm" and "rocblas" in target.libs:
-        assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported."
-        strategy.add_implementation(
-            wrap_compute_dense(topi.rocm.dense_rocblas),
-            wrap_topi_schedule(topi.rocm.schedule_dense_rocblas),
-            name="dense_rocblas.rocm",
-            plevel=15,
-        )
-    return strategy
-
-
-@batch_matmul_strategy.register("rocm")
-def batch_matmul_strategy_rocm(attrs, inputs, out_type, target):
-    """Batch matmul strategy for ROCM"""
-    strategy = batch_matmul_strategy_cuda(attrs, inputs, out_type, target)
-
-    if target.kind.name == "rocm" and "rocblas" in target.libs:
-        assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported."
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.rocm.batch_matmul_rocblas),
-            wrap_topi_schedule(topi.rocm.schedule_batch_matmul_rocblas),
-            name="batch_matmul_rocblas.rocm",
-            plevel=12,
-        )
-    return strategy
-
-
-@argsort_strategy.register(["rocm"])
-def argsort_strategy_cuda(attrs, inputs, out_type, target):
-    """argsort rocm strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_argsort(topi.cuda.argsort),
-        wrap_topi_schedule(topi.cuda.schedule_argsort),
-        name="argsort.rocm",
-    )
-    if can_use_rocthrust(target, "tvm.contrib.thrust.sort"):
-        strategy.add_implementation(
-            wrap_compute_argsort(topi.cuda.argsort_thrust),
-            wrap_topi_schedule(topi.cuda.schedule_argsort),
-            name="argsort_thrust.rocm",
-            plevel=15,
-        )
-    return strategy
-
-
-@scatter_elements_strategy.register(["rocm"])
-def scatter_elements_cuda(attrs, inputs, out_type, target):
-    """scatter rocm strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scatter_elements(topi.cuda.scatter_elements),
-        wrap_topi_schedule(topi.cuda.schedule_extern),
-        name="scatter_elements.rocm",
-        plevel=10,
-    )
-
-    rank = len(inputs[0].shape)
-
-    with SpecializedCondition(rank == 1 and attrs.reduction == "update"):
-        if can_use_rocthrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
-            strategy.add_implementation(
-                wrap_compute_scatter_elements(topi.cuda.scatter_via_sort),
-                wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
-                name="scatter_via_sort.rocm",
-                plevel=9,  # use the sequential version by default
-            )
-    return strategy
-
-
-@sort_strategy.register(["rocm"])
-def sort_strategy_cuda(attrs, inputs, out_type, target):
-    """sort rocm strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sort(topi.cuda.sort),
-        wrap_topi_schedule(topi.cuda.schedule_sort),
-        name="sort.rocm",
-    )
-    if can_use_rocthrust(target, "tvm.contrib.thrust.sort"):
-        strategy.add_implementation(
-            wrap_compute_sort(topi.cuda.sort_thrust),
-            wrap_topi_schedule(topi.cuda.schedule_sort),
-            name="sort_thrust.cuda",
-            plevel=15,
-        )
-    return strategy
-
-
-@topk_strategy.register(["rocm"])
-def topk_strategy_cuda(attrs, inputs, out_type, target):
-    """topk rocm strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_topk(topi.cuda.topk),
-        wrap_topi_schedule(topi.cuda.schedule_topk),
-        name="topk.rocm",
-    )
-
-    if can_use_rocthrust(target, "tvm.contrib.thrust.sort"):
-        strategy.add_implementation(
-            wrap_compute_topk(topi.cuda.topk_thrust),
-            wrap_topi_schedule(topi.cuda.schedule_topk),
-            name="topk_thrust.rocm",
-            plevel=15,
-        )
-    return strategy
-
-
-@softmax_strategy.register(["rocm"])
-def softmax_strategy_rocm(attrs, inputs, out_type, target):
-    """rocm strategy for softmax"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(topi.cuda.schedule_softmax),
-        name="softmax.rocm",
-    )
-    if "miopen" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_softmax(miopen.softmax),
-            wrap_topi_schedule(topi.generic.schedule_extern),
-            name="softmax.miopen",
-            plevel=15,
-        )
-    return strategy
-
-
-@log_softmax_strategy.register(["rocm"])
-def log_softmax_strategy_rocm(attrs, inputs, out_type, target):
-    """rocm strategy for log softmax"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.log_softmax),
-        wrap_topi_schedule(topi.cuda.schedule_softmax),
-        name="log_softmax.rocm",
-    )
-    if "miopen" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_softmax(miopen.log_softmax),
-            wrap_topi_schedule(topi.generic.schedule_extern),
-            name="log_softmax.miopen",
-            plevel=15,
-        )
-    return strategy
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
deleted file mode 100644
index 1b69c7a6ca42..000000000000
--- a/python/tvm/relay/op/strategy/x86.py
+++ /dev/null
@@ -1,859 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of x86 operator strategy."""
-# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
-import logging
-import re
-
-from tvm import tir, topi
-from tvm.auto_scheduler import is_auto_scheduler_enabled
-from tvm.meta_schedule import is_meta_schedule_enabled
-from tvm.relay.ty import is_dynamic
-from tvm.te import SpecializedCondition
-
-from .. import op as _op
-from .generic import *
-
-logger = logging.getLogger("strategy")
-
-_NCHWc_matcher = re.compile("^NCHW[0-9]+c$")
-_OIHWio_matcher = re.compile("^OIHW[0-9]+i[0-9]+o$")
-
-
-@schedule_injective.register("cpu")
-def schedule_injective_cpu(attrs, outs, target):
-    """schedule injective ops for x86"""
-    with target:
-        return topi.x86.schedule_injective(outs)
-
-
-@schedule_reduce.register("cpu")
-def schedule_reduce_cpu(attrs, outs, target):
-    """schedule reduction ops for x86"""
-    with target:
-        return topi.x86.schedule_reduce(outs)
-
-
-@schedule_pool.register("cpu")
-def schedule_pool_cpu(attrs, outs, target):
-    """schedule pooling ops for x86"""
-    with target:
-        return topi.x86.schedule_pool(outs, attrs.layout)
-
-
-@schedule_adaptive_pool.register("cpu")
-def schedule_adaptive_pool_cpu(attrs, outs, target):
-    """schedule adaptive pooling ops for x86"""
-    with target:
-        return topi.x86.schedule_adaptive_pool(outs)
-
-
-@softmax_strategy.register("cpu")
-def softmax_strategy_cpu(attrs, inputs, out_type, target):
-    """softmax x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(topi.x86.schedule_softmax),
-        name="softmax.x86",
-    )
-    return strategy
-
-
-@fast_softmax_strategy.register("cpu")
-def fast_softmax_strategy_cpu(attrs, inputs, out_type, target):
-    """fast_softmax x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.fast_softmax),
-        wrap_topi_schedule(topi.x86.schedule_softmax),
-        name="fast_softmax.x86",
-    )
-    return strategy
-
-
-@log_softmax_strategy.register("cpu")
-def log_softmax_strategy_cpu(attrs, inputs, out_type, target):
-    """log_softmax x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_softmax(topi.nn.log_softmax),
-        wrap_topi_schedule(topi.x86.schedule_softmax),
-        name="log_softmax.x86",
-    )
-    return strategy
-
-
-@conv2d_strategy.register("cpu")
-def conv2d_strategy_cpu(attrs, inputs, out_type, target):
-    """conv2d x86 strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    stride_h, stride_w = get_const_tuple(attrs.strides)
-    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    if dilation_h < 1 or dilation_w < 1:
-        raise ValueError("dilation should be positive value")
-
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-
-    if groups == 1:
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype):
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.x86.conv2d_nchw_int8),
-                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8),
-                    name="conv2d_nchw_int8.x86",
-                )
-            elif "dnnl" in target.libs:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.x86.conv2d_nchw_dnnl),
-                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_dnnl),
-                    name="conv2d_nchw_dnnl.x86",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.x86.conv2d_nchw),
-                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
-                    name="conv2d_nchw.x86",
-                )
-        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
-            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
-            return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
-                logger.warning("conv2d NHWC layout is not optimized for x86 with autotvm.")
-            if "dnnl" in target.libs:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.x86.conv2d_nhwc_dnnl),
-                    wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc_dnnl),
-                    name="conv2d_nhwc_dnnl.x86",
-                )
-            else:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(
-                        topi.nn.conv2d_nhwc,
-                        need_auto_scheduler_layout=need_auto_scheduler_layout,
-                        need_meta_schedule_layout=need_meta_schedule_layout,
-                    ),
-                    wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
-                    name="conv2d_nhwc.x86",
-                )
-
-            judge_winograd_auto_scheduler = False
-            if len(kernel.shape) == 4:
-                kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
-                judge_winograd_auto_scheduler = (
-                    "float" in data.dtype
-                    and "float" in kernel.dtype
-                    and kernel_h == 3
-                    and kernel_w == 3
-                    and stride_h == 1
-                    and stride_w == 1
-                    and dilation_h == 1
-                    and dilation_w == 1
-                    and 64 < co < 512
-                    # The last condition of co is based on our profiling of resnet workloads
-                    # on skylake avx512 machines. We found winograd is faster than direct
-                    # only when co is within this range
-                )
-
-            # register auto-scheduler implementations
-            if (
-                need_auto_scheduler_layout or need_meta_schedule_layout
-            ) and judge_winograd_auto_scheduler:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(
-                        topi.nn.conv2d_winograd_nhwc,
-                        need_auto_scheduler_layout=need_auto_scheduler_layout,
-                        need_meta_schedule_layout=need_meta_schedule_layout,
-                    ),
-                    naive_schedule,  # this implementation should never be picked by autotvm
-                    name="conv2d_nhwc.winograd",
-                    plevel=15,
-                )
-        elif layout == "HWCN":
-            assert kernel_layout == "HWIO"
-            if (not need_auto_scheduler_layout) or (not need_meta_schedule_layout):
-                logger.warning("conv2d HWCN layout is not optimized for x86 with autotvm.")
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
-                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
-                name="conv2d_hwcn.generic",
-            )
-        else:
-            raise RuntimeError(f"Unsupported conv2d layout {layout} for x86")
-    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
-            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
-                    wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
-                    name="depthwise_conv2d_nchw.x86",
-                )
-            else:
-                logger.warning(
-                    "For x86 target, depthwise_conv2d with channel "
-                    "multiplier greater than 1 is not optimized"
-                )
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
-                    wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw),
-                    name="depthwise_conv2d_nchw.generic",
-                )
-        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
-            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
-            return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
-        elif layout == "NHWC":
-            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
-                logger.warning(
-                    "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm."
-                )
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
-                wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_conv2d_nhwc.generic",
-            )
-        else:
-            raise RuntimeError(f"Unsupported depthwise_conv2d layout {layout}")
-    else:  # group_conv2d
-        if layout == "NCHW":
-            assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.x86",
-            )
-        elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
-                logger.warning("group_conv2d is not optimized for x86 with autotvm.")
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                name="group_conv2d_nhwc.generic",
-            )
-        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
-            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
-            return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
-        else:
-            raise RuntimeError(f"Unsupported group_conv2d layout {layout}")
-    return strategy
-
-
-@conv2d_NCHWc_strategy.register("cpu")
-def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target):
-    """conv2d_NCHWc x86 strategy"""
-    strategy = _op.OpStrategy()
-    data, kernel = inputs
-    if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype):
-        strategy.add_implementation(
-            wrap_compute_conv2d(
-                topi.x86.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
-            ),
-            wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8),
-            name="conv2d_NCHWc_int8.x86",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
-            wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
-            name="conv2d_NCHWc.x86",
-        )
-    return strategy
-
-
-@depthwise_conv2d_NCHWc_strategy.register("cpu")
-def depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target):
-    """depthwise_conv2d x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(
-            topi.x86.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
-        ),
-        wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc),
-        name="depthwise_conv2d_NCHWc.x86",
-    )
-    return strategy
-
-
-@conv2d_transpose_strategy.register("cpu")
-def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target):
-    """conv2d_transpose x86 strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCHW", "only support nchw for now"
-    assert dilation == (1, 1), "not support dilate now"
-    strategy = _op.OpStrategy()
-    if groups == 1:
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw),
-            wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw),
-            name="conv2d_transpose_nchw.x86",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw, has_groups=True),
-            wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw),
-            name="group_conv2d_transpose_nchw.x86",
-        )
-    return strategy
-
-
-@conv3d_transpose_strategy.register("cpu")
-def conv3d_transpose_strategy_cpu(attrs, inputs, out_type, target):
-    """conv3d_transpose x86 strategy"""
-    layout = attrs.data_layout
-    dilation = get_const_tuple(attrs.dilation)
-    groups = attrs.groups
-    assert layout == "NCDHW", "only support ncdhw for now"
-    assert dilation == (1, 1, 1), "not support dilate now"
-    assert groups == 1, "only support groups == 1 for now"
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv3d_transpose(topi.x86.conv3d_transpose_ncdhw),
-        wrap_topi_schedule(topi.x86.schedule_conv3d_transpose_ncdhw),
-        name="conv3d_transpose_ncdhw.x86",
-    )
-    return strategy
-
-
-@conv3d_strategy.register("cpu")
-def conv3d_strategy_cpu(attrs, inputs, out_type, target):
-    """conv3d generic strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-    if need_auto_scheduler_layout or need_meta_schedule_layout:
-        # Use auto-scheduler. We should provide clear compute definition without autotvm templates
-        # or packed layouts.
-        if layout == "NCDHW":
-            strategy.add_implementation(
-                wrap_compute_conv3d(topi.nn.conv3d_ncdhw), naive_schedule, name="conv3d_ncdhw.x86"
-            )
-        elif layout == "NDHWC":
-            strategy.add_implementation(
-                wrap_compute_conv3d(
-                    topi.nn.conv3d_ndhwc,
-                    need_auto_scheduler_layout=need_auto_scheduler_layout,
-                    need_meta_schedule_layout=need_meta_schedule_layout,
-                ),
-                naive_schedule,
-                name="conv3d_ndhwc.x86",
-            )
-        else:
-            raise ValueError(f"Not support this layout {layout} yet")
-    else:
-        # Use autotvm templates
-        if layout == "NCDHW":
-            strategy.add_implementation(
-                wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
-                wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
-                name="conv3d_ncdhw.x86",
-            )
-        elif layout == "NDHWC":
-            strategy.add_implementation(
-                wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
-                wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
-                name="conv3d_ndhwc.x86",
-            )
-        else:
-            raise ValueError(f"Not support this layout {layout} yet")
-    return strategy
-
-
-@conv1d_strategy.register("cpu")
-def conv1d_strategy_cpu(attrs, inputs, out_type, target):
-    """conv1d x86 strategy"""
-    layout = attrs.data_layout
-    groups = attrs.groups
-    dilation = get_const_tuple(attrs.dilation)
-    if dilation[0] < 1:
-        raise ValueError("dilation should be a positive value")
-    strategy = _op.OpStrategy()
-    if groups == 1:
-        if layout == "NCW":
-            strategy.add_implementation(
-                wrap_compute_conv1d(topi.nn.conv1d_ncw),
-                wrap_topi_schedule(topi.x86.schedule_conv1d_ncw),
-                name="conv1d_ncw.x86",
-            )
-        elif layout == "NWC":
-            strategy.add_implementation(
-                wrap_compute_conv1d(topi.nn.conv1d_nwc),
-                wrap_topi_schedule(topi.x86.schedule_conv1d_nwc),
-                name="conv1d_nwc.x86",
-            )
-        else:
-            raise ValueError(f"Unsupported conv1d layout {layout}")
-    else:
-        if layout == "NCW":
-            strategy.add_implementation(
-                wrap_compute_group_conv1d(topi.nn.group_conv1d_ncw),
-                wrap_topi_schedule(topi.x86.schedule_group_conv1d_ncw),
-                name="group_conv1d_ncw.x86",
-            )
-        elif layout == "NWC":
-            strategy.add_implementation(
-                wrap_compute_group_conv1d(topi.nn.group_conv1d_nwc),
-                wrap_topi_schedule(topi.x86.schedule_group_conv1d_nwc),
-                name="group_conv1d_nwc.x86",
-            )
-        else:
-            raise ValueError(f"Unsupported conv1d layout {layout}")
-    return strategy
-
-
-@matmul_strategy.register("cpu")
-def matmul_strategy_cpu(attrs, inputs, out_type, target):
-    """matmul x86 strategy"""
-    strategy = _op.OpStrategy()
-
-    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
-    dtype = inputs[0].dtype
-    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
-    if "cblas" in target.libs:
-        length_before = len(strategy.specializations) if strategy.specializations else 0
-        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
-            strategy.add_implementation(
-                wrap_compute_matmul(topi.x86.matmul_cblas),
-                wrap_topi_schedule(topi.x86.schedule_matmul_cblas),
-                name="matmul_cblas.x86",
-                plevel=13,
-            )
-        length_after = len(strategy.specializations) if strategy.specializations else 0
-        if length_before == length_after:
-            logger.warning(
-                "Currently cblas only support the data type to be float32 or float64. Skip."
-            )
-    if "mkl" in target.libs:
-        length_before = len(strategy.specializations) if strategy.specializations else 0
-        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
-            strategy.add_implementation(
-                wrap_compute_matmul(topi.x86.matmul_mkl),
-                wrap_topi_schedule(topi.x86.schedule_matmul_mkl),
-                name="matmul_mkl.x86",
-                plevel=14,
-            )
-        length_after = len(strategy.specializations) if strategy.specializations else 0
-        if length_before == length_after:
-            logger.warning(
-                "Currently mkl only support the data type to be float32, float64 or input with "
-                "uint8 and int8 while output wiht int32. Skip."
-            )
-    if "dnnl" in target.libs:
-        length_before = len(strategy.specializations) if strategy.specializations else 0
-        with SpecializedCondition(same_type and dtype == "float32"):
-            strategy.add_implementation(
-                wrap_compute_matmul(topi.x86.matmul_dnnl),
-                wrap_topi_schedule(topi.x86.schedule_matmul_dnnl),
-                name="matmul_dnnl.x86",
-                plevel=15,
-            )
-        length_after = len(strategy.specializations) if strategy.specializations else 0
-        if length_before == length_after:
-            logger.warning("Currently dnnl only support the data type to be float32. Skip.")
-
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-    if need_auto_scheduler_layout or need_meta_schedule_layout:
-        strategy.add_implementation(
-            wrap_compute_matmul(
-                topi.nn.matmul,
-                need_auto_scheduler_layout=need_auto_scheduler_layout,
-                need_meta_schedule_layout=need_meta_schedule_layout,
-            ),
-            naive_schedule,
-            name="matmul.generic",
-            plevel=11,
-        )
-    else:
-        # If no cblas/mkl/dnnl strategy choosed
-        if not strategy.specializations:
-            logger.warning(
-                "Matmul is not optimized for x86. "
-                "Recommend to use cblas/mkl/dnnl for better performance."
-            )
-        strategy.add_implementation(
-            wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.generic"
-        )
-    return strategy
-
-
-@dense_strategy.register("cpu")
-def dense_strategy_cpu(attrs, inputs, out_type, target):
-    """dense x86 strategy"""
-
-    strategy = _op.OpStrategy()
-    # For dynamic matrix-vector multiply we use a hand written kernel.
-    if (
-        isinstance(inputs[0].shape[0], (int, tir.IntImm))
-        and inputs[0].shape[0] == 1
-        and (
-            topi.utils.is_dynamic_shape(inputs[0].shape)
-            or topi.utils.is_dynamic_shape(inputs[1].shape)
-        )
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.x86.dense_dynamic),
-            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
-            name="dense_dynamic.x86",
-            plevel=20,
-        )
-        return strategy
-
-    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
-    dtype = inputs[0].dtype
-    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
-    strategy.add_implementation(
-        wrap_compute_dense(topi.x86.dense_nopack),
-        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
-        name="dense_nopack.x86",
-        plevel=5,
-    )
-
-    strategy.add_implementation(
-        wrap_compute_dense(topi.x86.dense_pack),
-        wrap_topi_schedule(topi.x86.schedule_dense_pack),
-        name="dense_pack.x86",
-        plevel=10,
-    )
-
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-
-    if need_auto_scheduler_layout or need_meta_schedule_layout:
-        strategy.add_implementation(
-            wrap_compute_dense(
-                topi.nn.dense,
-                need_auto_scheduler_layout=need_auto_scheduler_layout,
-                need_meta_schedule_layout=need_meta_schedule_layout,
-            ),
-            naive_schedule,
-            name="dense.generic",
-            plevel=11,
-        )
-
-    if "cblas" in target.libs:
-        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.x86.dense_cblas),
-                wrap_topi_schedule(topi.x86.schedule_dense_cblas),
-                name="dense_cblas.x86",
-                plevel=13,
-            )
-    if "mkl" in target.libs:
-        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.x86.dense_mkl),
-                wrap_topi_schedule(topi.x86.schedule_dense_mkl),
-                name="dense_mkl.x86",
-                plevel=14,
-            )
-    if "dnnl" in target.libs:
-        with SpecializedCondition(same_type and dtype == "float32"):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.x86.dense_dnnl),
-                wrap_topi_schedule(topi.x86.schedule_dense_dnnl),
-                name="dense_dnnl.x86",
-                plevel=15,
-            )
-    return strategy
-
-
-@dense_pack_strategy.register("cpu")
-def dense_pack_strategy_cpu(attrs, inputs, out_type, target):
-    """dense_pack x86 strategy"""
-    strategy = _op.OpStrategy()
-    if (
-        inputs[0].dtype == "uint8"
-        and inputs[1].dtype == "int8"
-        and out_type.dtype == "int32"
-        and attrs["weight_layout"] == "NC16n4c"
-    ):
-        strategy.add_implementation(
-            wrap_compute_dense(topi.x86.dense_int8),
-            wrap_topi_schedule(topi.x86.schedule_dense_int8),
-            name="dense_int8.x86",
-            plevel=13,
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_dense(topi.x86.dense_pack),
-            wrap_topi_schedule(topi.x86.schedule_dense_pack),
-            name="dense_pack.x86",
-            plevel=10,
-        )
-
-    return strategy
-
-
-@batch_matmul_strategy.register("cpu")
-def batch_matmul_strategy_cpu(attrs, inputs, out_type, target):
-    """batch_matmul x86 strategy"""
-    strategy = _op.OpStrategy()
-
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-
-    if (
-        not attrs.transpose_a
-        and attrs.transpose_b
-        and inputs[0].dtype == "uint8"
-        and inputs[1].dtype == "int8"
-        and inputs[1].shape[-2] % 16 == 0
-        and inputs[1].shape[-1] % 4 == 0
-    ):
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.x86.batch_matmul_int8_compute, need_out_dtype=True),
-            wrap_topi_schedule(topi.x86.schedule_batch_matmul_int8),
-            name="batch_matmul_int8.x86",
-            plevel=10,
-        )
-    elif is_dynamic(out_type) or need_auto_scheduler_layout or need_meta_schedule_layout:
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(
-                topi.nn.batch_matmul,
-                need_out_dtype=True,
-                need_auto_scheduler_layout=need_auto_scheduler_layout,
-                need_meta_schedule_layout=need_meta_schedule_layout,
-            ),
-            wrap_topi_schedule(topi.generic.nn.schedule_batch_matmul),
-            name="batch_matmul.generic",
-            plevel=10,
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.x86.batch_matmul, need_out_dtype=True),
-            wrap_topi_schedule(topi.x86.schedule_batch_matmul),
-            name="batch_matmul.x86",
-            plevel=10,
-        )
-    if "cblas" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas),
-            wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas),
-            name="batch_matmul_cblas.x86",
-            plevel=15,
-        )
-    if "mkl" in target.libs:
-        strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl),
-            wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl),
-            name="batch_matmul_mkl.x86",
-            plevel=15,
-        )
-    return strategy
-
-
-@sparse_dense_strategy.register("cpu")
-def sparse_dense_strategy_cpu(attrs, inputs, out_type, target):
-    """sparse dense x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_sparse_dense(topi.nn.sparse_dense),
-        wrap_topi_schedule(topi.x86.schedule_sparse_dense),
-        name="sparse_dense.x86",
-        plevel=10,
-    )
-    return strategy
-
-
-@sparse_conv2d_strategy.register("cpu")
-def sparse_conv2d_strategy_cpu(attrs, inputs, out_type, target):
-    """sparse conv2d x86 strategy"""
-    strategy = _op.OpStrategy()
-    if attrs["kernel_size"][0] == 1:
-        strategy.add_implementation(
-            wrap_compute_sparse_conv2d(topi.nn.sparse_conv2d),
-            wrap_topi_schedule(topi.generic.schedule_sparse_conv2d),
-            name="sparse_conv2d.generic",
-        )
-    elif attrs["kernel_size"][0] == 3:
-        if attrs["layout"] == "NHWC":
-            strategy.add_implementation(
-                wrap_compute_sparse_conv2d(topi.x86.spconv2d_3x3_nhwc),
-                wrap_topi_schedule(topi.x86.schedule_spconv2d_3x3_nhwc),
-                name="conv3x3_spNHWC.x86",
-            )
-        elif attrs["layout"] == "NCHW":
-            strategy.add_implementation(
-                wrap_compute_sparse_conv2d(topi.x86.spconv2d_3x3_nchw),
-                wrap_topi_schedule(topi.x86.schedule_spconv2d_3x3_nchw),
-            )
-    return strategy
-
-
-@roi_align_strategy.register("cpu")
-def roi_align_strategy_cpu(attrs, inputs, out_type, target):
-    """roi_align x86 strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.layout
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_roi_align(topi.x86.roi_align_nchw),
-            wrap_topi_schedule(topi.generic.schedule_roi_align),
-            name="roi_align.x86",
-        )
-    else:
-        assert layout == "NHWC", "layout must be NCHW or NHWC."
-        strategy.add_implementation(
-            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc),
-            wrap_topi_schedule(topi.generic.schedule_roi_align),
-            name="roi_align.x86",
-        )
-    return strategy
-
-
-@bitserial_conv2d_strategy.register("cpu")
-def bitserial_conv2d_strategy_cpu(attrs, inputs, out_type, target):
-    """bitserial_conv2d x86 strategy"""
-    strategy = _op.OpStrategy()
-    layout = attrs.data_layout
-    if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw),
-            wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw),
-            name="bitserial_conv2d_nchw.x86",
-        )
-    elif layout == "NHWC":
-        strategy.add_implementation(
-            wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nhwc),
-            wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nhwc),
-            name="bitserial_conv2d_nhwc.x86",
-        )
-    else:
-        raise ValueError(f"Data layout {layout} not supported.")
-    return strategy
-
-
-@bitserial_dense_strategy.register("cpu")
-def bitserial_dense_strategy_cpu(attrs, inputs, out_type, target):
-    """bitserial_dense x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_bitserial_dense(topi.x86.bitserial_dense),
-        wrap_topi_schedule(topi.x86.schedule_bitserial_dense),
-        name="bitserial_dense.x86",
-    )
-    return strategy
-
-
-@scatter_nd_strategy.register("cpu")
-def scatter_nd_strategy_cpu(attrs, inputs, out_type, target):
-    """scatter_nd x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_scatter_nd(topi.scatter_nd),
-        wrap_topi_schedule(topi.generic.schedule_extern),
-        name="scatter_nd.x86",
-        plevel=10,
-    )
-    return strategy
-
-
-@conv2d_winograd_without_weight_transform_strategy.register("cpu")
-def conv2d_winograd_without_weight_transform_strategy_cpu(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transform cpu strategy"""
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    layout = attrs.data_layout
-    strides = attrs.get_int_tuple("strides")
-    assert dilation == (1, 1), "Do not support dilate now"
-    assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not support arbitrary group number"
-    strategy = _op.OpStrategy()
-    need_auto_scheduler_layout = is_auto_scheduler_enabled()
-    need_meta_schedule_layout = is_meta_schedule_enabled()
-    if layout == "NHWC":
-        if need_meta_schedule_layout:
-            strategy.add_implementation(
-                wrap_compute_conv2d(
-                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
-                    need_auto_scheduler_layout=False,
-                    need_meta_schedule_layout=True,
-                ),
-                naive_schedule,
-                name="ansor.winograd",
-            )
-        elif need_auto_scheduler_layout:
-            strategy.add_implementation(
-                wrap_compute_conv2d(
-                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
-                    need_auto_scheduler_layout=True,
-                    need_meta_schedule_layout=False,
-                ),
-                naive_schedule,
-                name="ansor.winograd",
-            )
-        else:
-            raise RuntimeError("Both AutoScheduler and MetaSchedule are not enabled")
-    else:
-        raise RuntimeError(f"Unsupported conv2d_winograd_without_weight_transform layout {layout}")
-    return strategy
-
-
-@concatenate_strategy.register(["cpu"])
-def concatenate_strategy_cpu(attrs, inputs, out_type, target):
-    """concatenate x86 strategy"""
-    strategy = _op.OpStrategy()
-    use_only_old_concat = False
-    for inpt in inputs:
-        shape = inpt.shape
-        for i in shape:
-            if not isinstance(i, tir.expr.IntImm):
-                use_only_old_concat = True
-                break
-    if use_only_old_concat:
-        strategy.add_implementation(
-            wrap_compute_concat(topi.transform.concatenate),
-            wrap_topi_schedule(topi.x86.injective.schedule_concatenate),
-            name="concatenate.generic",
-        )
-    else:
-        strategy.add_implementation(
-            wrap_compute_concat(topi.x86.concatenate),
-            wrap_topi_schedule(topi.x86.schedule_concatenate_cpu),
-            name="concatenate.cpu",
-        )
-        strategy.add_implementation(
-            wrap_compute_concat(topi.transform.concatenate),
-            wrap_topi_schedule(topi.x86.injective.schedule_concatenate),
-            name="concatenate.generic",
-        )
-    return strategy
-
-
-@batch_norm_strategy.register(["cpu"])
-def batch_norm_strategy_cpu(attrs, inputs, out_type, target):
-    """batch_norm x86 strategy"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_batch_norm(topi.nn.batch_norm),
-        wrap_topi_schedule(topi.x86.schedule_batch_norm),
-        name="batch_norm.cpu",
-    )
-    return strategy
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
deleted file mode 100644
index 26caa4584c79..000000000000
--- a/python/tvm/relay/op/tensor.py
+++ /dev/null
@@ -1,1354 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Basic tensor operations."""
-# pylint: disable=redefined-builtin, unused-argument
-from tvm import target
-from tvm.runtime import ndarray as _nd
-from tvm.runtime import Device as _Device
-from tvm.te.hybrid import script
-
-from . import _make
-from .dyn import _make as _dyn_make
-from ..expr import Tuple, Expr, Constant, Call
-from . import op as reg
-
-
-def _make_virtual_device(device):
-    if isinstance(device, _Device):
-        return target.VirtualDevice(device)
-    if isinstance(device, str):
-        return target.VirtualDevice(_nd.device(device))
-    raise ValueError(f"expecting a Device or device name, but received a {type(device)}")
-
-
-# We create a wrapper function for each operator in the
-# python side to call into the positional _make.OpName function.
-#
-# We make this decision so that we can:
-# - Have declare python docstring for each function
-# - Enable keyword arguments easily
-# - Not put too much burden on FFI to support complicated features
-#   like default value and keyword arguments
-
-
-def log(data):
-    """Compute elementwise log of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.log(data)
-
-
-def log2(data):
-    """Compute elementwise log to the base 2 of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.log2(data)
-
-
-def log10(data):
-    """Compute elementwise log to the base 10 of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.log10(data)
-
-
-def tan(data):
-    """Compute elementwise tan of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.tan(data)
-
-
-def cos(data):
-    """Compute elementwise cos of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.cos(data)
-
-
-def cosh(data):
-    """Compute elementwise cosh of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.cosh(data)
-
-
-def sin(data):
-    """Compute elementwise sin of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.sin(data)
-
-
-def sinh(data):
-    """Compute elementwise sinh of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.sinh(data)
-
-
-def acos(data):
-    """Compute elementwise acos of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.acos(data)
-
-
-def acosh(data):
-    """Compute elementwise acosh of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.acosh(data)
-
-
-def asin(data):
-    """Compute elementwise asin of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.asin(data)
-
-
-def asinh(data):
-    """Compute elementwise asinh of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.asinh(data)
-
-
-def atan(data):
-    """Compute elementwise atan of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.atan(data)
-
-
-def atanh(data):
-    """Compute elementwise atanh of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.atanh(data)
-
-
-def exp(data):
-    """Compute elementwise exp of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.exp(data)
-
-
-def erf(data):
-    """Compute elementwise error function of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.erf(data)
-
-
-def sqrt(data):
-    """Compute elementwise sqrt of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.sqrt(data)
-
-
-def rsqrt(data):
-    """Compute elementwise rsqrt of data.
-
-    .. math::
-
-      1/sqrt(x)
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.rsqrt(data)
-
-
-def sigmoid(data):
-    """Compute elementwise sigmoid of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.sigmoid(data)
-
-
-def floor(data):
-    """Compute element-wise floor of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.floor(data)
-
-
-def ceil(data):
-    """Compute element-wise ceil of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.ceil(data)
-
-
-def trunc(data):
-    """Compute element-wise trunc of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.trunc(data)
-
-
-def round(data):
-    """Compute element-wise round of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.round(data)
-
-
-def abs(data):
-    """Compute element-wise absolute of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.abs(data)
-
-
-def sign(data):
-    """Compute element-wise absolute of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.sign(data)
-
-
-def tanh(data):
-    """Compute element-wise tanh of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.tanh(data)
-
-
-def negative(data):
-    """Compute element-wise negative of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.negative(data)
-
-
-def logical_not(data):
-    """Compute element-wise logical not of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.logical_not(data)
-
-
-def bitwise_not(data):
-    """Compute element-wise bitwise not of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.bitwise_not(data)
-
-
-def add(lhs, rhs):
-    """Addition with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code:: python
-
-      x = relay.Var("a") # shape is [2, 3]
-      y = relay.Var("b") # shape is [2, 1]
-      z = relay.add(x, y)  # result shape is [2, 3]
-    """
-    return _make.add(lhs, rhs)
-
-
-def subtract(lhs, rhs):
-    """Subtraction with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.subtract(lhs, rhs)
-
-
-def multiply(lhs, rhs):
-    """Multiplication with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.multiply(lhs, rhs)
-
-
-def divide(lhs, rhs):
-    """Division with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.divide(lhs, rhs)
-
-
-def floor_divide(lhs, rhs):
-    """Floor division with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.floor_divide(lhs, rhs)
-
-
-def trunc_divide(lhs, rhs):
-    """Trunc division with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.trunc_divide(lhs, rhs)
-
-
-def power(lhs, rhs):
-    """Power with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.power(lhs, rhs)
-
-
-def mod(lhs, rhs):
-    """Mod with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.mod(lhs, rhs)
-
-
-def floor_mod(lhs, rhs):
-    """Floor mod with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.floor_mod(lhs, rhs)
-
-
-def trunc_mod(lhs, rhs):
-    """Trunc mod with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.trunc_mod(lhs, rhs)
-
-
-def logical_and(lhs, rhs):
-    """logical AND with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.logical_and(lhs, rhs)
-
-
-def logical_or(lhs, rhs):
-    """logical OR with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.logical_or(lhs, rhs)
-
-
-def logical_xor(lhs, rhs):
-    """logical XOR with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.logical_xor(lhs, rhs)
-
-
-def bitwise_and(lhs, rhs):
-    """bitwise AND with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.bitwise_and(lhs, rhs)
-
-
-def bitwise_or(lhs, rhs):
-    """bitwise OR with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.bitwise_or(lhs, rhs)
-
-
-def bitwise_xor(lhs, rhs):
-    """bitwise XOR with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.bitwise_xor(lhs, rhs)
-
-
-def equal(lhs, rhs):
-    """Broadcasted elementwise test for (lhs == rhs).
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.equal(lhs, rhs)
-
-
-def not_equal(lhs, rhs):
-    """Broadcasted elementwise test for (lhs != rhs).
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.not_equal(lhs, rhs)
-
-
-def less(lhs, rhs):
-    """Broadcasted elementwise test for (lhs < rhs).
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.less(lhs, rhs)
-
-
-def less_equal(lhs, rhs):
-    """Broadcasted elementwise test for (lhs <= rhs).
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.less_equal(lhs, rhs)
-
-
-def greater(lhs, rhs):
-    """Broadcasted elementwise test for (lhs > rhs).
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.greater(lhs, rhs)
-
-
-def greater_equal(lhs, rhs):
-    """Broadcasted elementwise test for (lhs >= rhs).
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.greater_equal(lhs, rhs)
-
-
-def maximum(lhs, rhs):
-    """Maximum with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.maximum(lhs, rhs)
-
-
-def minimum(lhs, rhs):
-    """Minimum with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.minimum(lhs, rhs)
-
-
-def right_shift(lhs, rhs):
-    """Right shift with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.right_shift(lhs, rhs)
-
-
-def left_shift(lhs, rhs):
-    """Left shift with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side input data
-    rhs : relay.Expr
-        The right hand side input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.left_shift(lhs, rhs)
-
-
-def zeros(shape, dtype):
-    """Fill array with zeros.
-
-    Parameters
-    ----------
-    shape : tuple of int or relay.Expr
-        The shape of the target.
-
-    dtype : data type
-        The data type of the target.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    if isinstance(shape, Constant):
-        shape = list(shape.data.numpy())
-    if isinstance(shape, Expr):
-        return _dyn_make.zeros(shape, dtype)
-    if isinstance(shape, int):
-        shape = [shape]
-    if isinstance(shape, (list, tuple)):
-        shape = list(shape)
-    return _make.zeros(shape, dtype)
-
-
-def zeros_like(data):
-    """Returns an array of zeros, with same type and shape as the input.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.zeros_like(data)
-
-
-def ones(shape, dtype):
-    """Fill array with ones.
-
-    Parameters
-    ----------
-    shape : tuple of int or relay.Expr
-        The shape of the target.
-
-    dtype : data type
-        The data type of the target.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    if isinstance(shape, Constant):
-        shape = list(shape.data.numpy())
-    if isinstance(shape, Expr):
-        return _dyn_make.ones(shape, dtype)
-    if isinstance(shape, int):
-        shape = [shape]
-    if isinstance(shape, (list, tuple)):
-        shape = list(shape)
-    return _make.ones(shape, dtype)
-
-
-def ones_like(data):
-    """Returns an array of ones, with same type and shape as the input.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.ones_like(data)
-
-
-def clip(a, a_min, a_max):
-    """Clip the elements in `a` between `a_min` and `a_max`.
-    `a_min` and `a_max` are cast to `a`'s dtype.
-
-    Parameters
-    ----------
-    a : relay.Expr
-        The input tensor.
-    a_min : float
-        The clip minimum.
-    a_max : float
-        The clip maximum.
-
-    Returns
-    -------
-    result : relay.Expr
-        `a` with elements clipped between `a_min` and `a_max`.
-
-    Examples
-    --------
-    .. code:: python
-
-      x = relay.Constant(tvm.nd.array([0, 1, 5, 3, 4, 2]))
-      relay.clip(x, 1., 4.)
-      # [1, 1, 4, 3, 4, 2]
-    """
-    return _make.clip(a, a_min, a_max)
-
-
-def fixed_point_multiply(data, multiplier, shift):
-    """Fixed point multiplication between data and a fixed point
-    constant expressed as multiplier * 2^(-shift), where multiplier
-    is a Q-number with 31 fractional bits
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-    multiplier : int
-        The integer multiplier of the fixed point constant.
-    shift : int
-        The integer shift of the fixed point constant.
-
-    Returns
-    -------
-    result : relay.Expr
-        The output of the fixed point multiplication
-    """
-    return _make.fixed_point_multiply(data, multiplier, shift)
-
-
-def concatenate(data, axis):
-    """Concatenate the input tensors along the given axis.
-
-    Parameters
-    ----------
-    data : Union(List[relay.Expr], Tuple[relay.Expr])
-        A list of tensors.
-    axis : int
-        The axis along which the tensors are concatenated.
-
-    Returns
-    -------
-    result: relay.Expr
-        The concatenated tensor.
-    """
-    if not isinstance(data, Call):
-        data = list(data)
-    if not data:
-        raise ValueError("relay.concatenate requires data to be non-empty.")
-    if not isinstance(data, Call):
-        data = Tuple(data)
-    if not isinstance(axis, int):
-        raise ValueError("For now, we only support integer axis")
-    return _make.concatenate(data, axis)
-
-
-def einsum(data, equation):
-    """Evaluates the Einstein summation convention on data
-
-    Parameters
-    ----------
-    data : Union(List[relay.Expr], Tuple[relay.Expr])
-        A list of tensors.
-    equation : str
-        The einsum expression string.
-
-    Returns
-    -------
-    result : relay.Expr
-        The output tensor from the einsum op.
-    """
-    data = list(data)
-    if not data:
-        raise ValueError("relay.einsum requires data to be non-empty.")
-    if not isinstance(equation, str):
-        raise ValueError("einsum `equation` must be a str")
-    return _make.einsum(Tuple(data), equation)
-
-
-def stack(data, axis):
-    """Join a sequence of arrays along a new axis.
-
-    Parameters
-    ----------
-    data : Union(List[relay.Expr], relay.Expr)
-        A list of tensors or a Relay expression that evaluates to a tuple of tensors.
-
-    axis : int
-        The axis in the result array along which the input arrays are stacked.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The stacked tensor.
-    """
-    if not data:
-        raise ValueError("relay.stack requires data to be non-empty.")
-    if not isinstance(axis, int):
-        raise ValueError("For now, we only support integer axis")
-    if not isinstance(data, Expr):
-        data = Tuple(list(data))
-    return _make.stack(data, axis)
-
-
-def copy(data):
-    """Copy a tensor.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The tensor to be copied.
-
-    Returns
-    -------
-    result: relay.Expr
-        The copied result.
-    """
-    return _make.copy(data)
-
-
-@script
-def _copy_shape_func_tensor(data_shape):
-    ndim = data_shape.shape[0]
-    out = output_tensor((ndim,), "int64")
-    for i in const_range(ndim):
-        out[i] = data_shape[i]
-    return out
-
-
-@script
-def _copy_shape_func_scalar(data_shape):
-    out = output_tensor((), "int64")
-    return out
-
-
-@reg.register_shape_func("copy", False)
-def copy_shape_func(attrs, inputs, _):
-    """
-    Shape function for copy op.
-    """
-    input = inputs[0]
-    if len(input.shape) == 0:
-        return [_copy_shape_func_scalar(input)]
-    return [_copy_shape_func_tensor(input)]
-
-
-def device_copy(data, src_device, dst_device):
-    """Copy data from the source device to the destination device. This
-    operator helps data transferring between difference devices for
-    heterogeneous execution.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The tensor to be copied.
-
-    src_device : Union[:py:class:`Device`, str]
-        The source device where the data is copied from.
-
-    dst_device : Union[:py:class:`Device`, str]
-        The destination device where the data is copied to.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The copied result.
-    """
-    return _make.DeviceCopy(
-        data, _make_virtual_device(src_device), _make_virtual_device(dst_device)
-    )
-
-
-def shape_of(data, dtype="int32"):
-    """Get shape of a tensor.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor.
-
-    dtype : str, optional
-        The target data type.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The shape tensor.
-    """
-    return _make.shape_of(data, dtype)
-
-
-def ndarray_size(data, dtype="int32"):
-    """Get number of elements of input tensor.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor.
-
-    dtype : str, optional
-        The target data type.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The number of elements of input tensor.
-    """
-    return _make.ndarray_size(data, dtype)
-
-
-def isnan(data):
-    """Check nan in input data element-wise.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.isnan(data)
-
-
-def isfinite(data):
-    """Compute element-wise finiteness of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.isfinite(data)
-
-
-def isinf(data):
-    """Compute element-wise infiniteness of data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.isinf(data)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
deleted file mode 100644
index dd9c670e2a37..000000000000
--- a/python/tvm/relay/op/transform.py
+++ /dev/null
@@ -1,2015 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=import-outside-toplevel
-"""Transform operators."""
-
-from typing import Optional
-
-from ...tir import expr as _expr
-from ..expr import Constant, Expr, Tuple, TupleWrapper, const
-from . import _make
-from .dyn import _make as _dyn_make
-from .tensor import shape_of
-
-
-def sliding_window(data, axis, window_shape, strides):
-    """Slide a window over the data tensor.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis : int
-        What axis the window begins sliding over. Window will be slid over
-        this axis and all following axes. The axis value determines the window
-        shape (and thus, the number of strides): window shape and strides must
-        both be of length `data.ndim-axis`.
-
-    window_shape : List[int]
-        The window shape to form over the input. Window shape must be of length
-        `data.ndim-axis`.
-
-    strides : List[int]
-        How to stride the window along each dimension. Strides must be of length
-        `data.ndim-axis`.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        # Slide a window of shape (3, 4, 5) over the x tensor, beginning with
-        # dimension 1, which slides the window over the two subtensors of
-        # shape (3, 32, 32).
-        x = relay.var("x", relay.TensorType((2, 3, 32, 32), "float32"))
-        y = relay.sliding_window(x, 1, [3, 4, 5], [1, 2, 3])
-
-        data = np.random.rand(2, 3, 32, 32).astype("float32")
-        result = create_executor().evaluate(y, {x: relay.const(data)}).numpy()
-
-        # The resulting shape still has batch size 2. Each dimension in
-        # (1, 15, 10) represents the locations where we were able to
-        # form a window; that is, we were able to place the window
-        # in one place along the dimension of length 3, 15 places along
-        # the dimension of length 32 (when striding by 2), and 10 places
-        # along the second dimension of length 32 (when striding by 3).
-        # The remaining dimension (3, 4, 5) represent the formed windows.
-        assert result.shape == (2, 1, 15, 10, 3, 4, 5)
-
-        assert np.array_equal(result[0, 0, 0, 0, :, :, :], data[0, :, 0:4, 0:5])
-        assert np.array_equal(result[1, 0, 7, 3, :, :, :], data[1, :, 14:18, 9:14])
-        assert np.array_equal(result[1, 0, 14, 9, :, :, :], data[1, :, 28:32, 27:32])
-    """
-    from .. import _ffi_api as _relay_make
-
-    return _relay_make.sliding_window(data, axis, window_shape, strides)
-
-
-def cast(data, dtype):
-    """Cast input tensor to data type.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    dtype : str
-        The target data type.
-
-    Returns
-    -------
-    result : relay.Expr
-        The casted result.
-    """
-    from .. import _ffi_api as _relay_make
-
-    return _relay_make.cast(data, dtype)
-
-
-def cast_like(data, dtype_like):
-    """Cast input tensor to data type of another tensor.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    dtype_like : relay.Expr
-        The tensor to cast to.
-
-    Returns
-    -------
-    result : relay.Expr
-        The casted result.
-    """
-    from .. import _ffi_api as _relay_make
-
-    return _relay_make.cast_like(data, dtype_like)
-
-
-def reinterpret(data, dtype):
-    """Reinterpret input tensor to data type.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    dtype : str
-        The target data type.
-
-    Returns
-    -------
-    result : relay.Expr
-        The reinterpreted result.
-    """
-    from .. import _make as _relay_make
-
-    return _relay_make.reinterpret(data, dtype)
-
-
-def expand_dims(data, axis, num_newaxis=1):
-    """Insert `num_newaxis` axes at the position given by `axis`.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis : Union[int, Expr]
-        The axis at which the input array is expanded.
-        Should lie in range `[-data.ndim - 1, data.ndim]`.
-        If `axis < 0`, it is the first axis inserted;
-        If `axis >= 0`, it is the last axis inserted in Python's negative indexing.
-
-    num_newaxis : int, optional
-        Number of axes to be inserted. Should be >= 0.
-
-    Returns
-    -------
-    result : relay.Expr
-        The reshaped result.
-    """
-    if isinstance(axis, int):
-        return _make.expand_dims(data, axis, num_newaxis)
-    if isinstance(axis, Expr):
-        # TODO (AndrewZhaoLuo): investigate performance issues with consecutive
-        # dynamic expand_dims on non-llvm targets.
-        return _dyn_make.expand_dims(data, axis, num_newaxis)
-    raise ValueError(f"Unknown type for axis: {type(axis)}")
-
-
-def transpose(data, axes=None):
-    """Permutes the dimensions of an array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axes : None or List[int]
-        The target axes order, reverse order if not specified.
-
-    Returns
-    -------
-    result : relay.Expr
-        The transposed result.
-    """
-
-    if axes is not None:
-        axes = list(axes)
-    return _make.transpose(data, axes)
-
-
-def squeeze(data, axis=None):
-    """Squeeze axes in the array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis : Union[None, int, Tuple[int], List[int]] or Expr
-        The set of axes to remove.
-        If axis = None, remove all axes of dimension 1.
-        If any specified axis has dimension that does not equal 1, it is an error.
-
-    Returns
-    -------
-    result : relay.Expr
-        The squeezed result.
-    """
-    if isinstance(axis, Constant):
-        if axis.data.shape:
-            axis = list(axis.data.numpy())
-        else:
-            axis = [axis.data.numpy().item()]
-    if isinstance(axis, Expr):
-        return _dyn_make.squeeze(data, axis)
-    if isinstance(axis, int):
-        axis = [axis]
-    if isinstance(axis, (tuple, list)):
-        tempaxis = []
-        for tmpax in axis:
-            if isinstance(tmpax, _expr.IntImm):
-                tempaxis.append(tmpax.value)
-            else:
-                try:
-                    tempaxis.append(int(tmpax))
-                except ValueError as err:
-                    raise RuntimeError(f"Unrecognized axis type: {err}")
-        axis = tempaxis
-    return _make.squeeze(data, axis)
-
-
-def reshape(data, newshape, allowzero=False):
-    """Reshape the input array.
-
-    To give user more convenience in without doing manual shape inference,
-    some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}.
-    The significance of each is explained below:
-
-    ``0`` copy this dimension from the input to the output shape.
-
-        .. code-block:: python
-
-            data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2)
-            data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4)
-
-    Note: If the parameter allowzero is manually set to true, it specifies a
-    special case where 0 actually means a true empty tensor.
-
-    ``-1`` infers the dimension of the output shape by using the remainder of
-    the input dimensions keeping the size of the new array same as that of the input array.
-    At most one dimension of shape can be -1.
-
-        .. code-block:: python
-
-            data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4)
-            data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8)
-            data.shape = (2,3,4), newshape = (-1,), result.shape = (24,)
-
-    ``-2`` copy all/remainder of the input dimensions to the output shape.
-
-        .. code-block:: python
-
-            data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4)
-            data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4)
-            data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1)
-
-    ``-3`` use the product of two consecutive dimensions of the input shape
-    as the output dimension.
-
-        .. code-block:: python
-
-            data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4)
-            data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20)
-            data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12)
-            data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4)
-
-    ``-4`` split one dimension of the input into two dimensions passed subsequent
-    to -4 in shape (can contain -1).
-
-        .. code-block:: python
-
-            data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape = (1,2,3,4)
-            data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4)
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    newshape : Union[int, Tuple[int], List[int]] or relay.Expr
-        The new shape. Should be compatible with the original shape.
-
-    allowzero : Bool, optional
-        If true, then treat zero as true empty tensor rather than a copy instruction.
-
-    Returns
-    -------
-    result : relay.Expr
-        The reshaped result.
-    """
-    if isinstance(newshape, Constant):
-        newshape = list(newshape.data.numpy())
-    if isinstance(newshape, Expr):
-        return _dyn_make.reshape(data, newshape, allowzero)
-    if isinstance(newshape, int):
-        newshape = [newshape]
-    if isinstance(newshape, (tuple, list)):
-        tempshape = []
-        for shape in newshape:
-            if isinstance(shape, _expr.IntImm):
-                tempshape.append(shape.value)
-            else:
-                try:
-                    tempshape.append(int(shape))
-                except ValueError as err:
-                    raise RuntimeError(f"Unrecognized shape type: {err}")
-        newshape = tempshape
-    return _make.reshape(data, list(newshape), allowzero)
-
-
-def argwhere(condition):
-    """Find the indices of elements of a tensor that are
-    non-zero.
-
-    Parameters
-    ----------
-    condition : relay.Expr
-        The input condition tensor.
-
-    Returns
-    -------
-    result : relay.Expr
-        Tensor with the indices of elements that are non-zero.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        condition = [[True, False], [False, True]]
-        relay.argwhere(condition) = [[0, 0], [1, 1]]
-    """
-    return _make.argwhere(condition)
-
-
-def scatter_elements(data, indices, updates, axis=0, reduction="update"):
-    """Scatter elements with updating data by reduction of values in updates
-    at positions defined by indices.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    indices : relay.Expr
-        The index locations to update.
-
-    updates : relay.Expr
-        The values to update.
-
-    axis : int
-        The axis to scatter elements on. It is zero by default.
-
-    reduction : string, optional
-        The reduction mode for scatter. Choise is from ["update", "add", "mul", "mean", "min", max"]
-        If update, the update values will replace the input data
-        If add, the update values will be added to the input data
-        If mul, the input data will be multiplied on the update values
-        If mean, the input data will be mean between the update values and the input data
-        If min, there is choice of minimal between the update values and the input data
-        If max, there is choice of maximal between the update values and the input data
-        It is "update" by default
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    return _make.scatter_elements(data, indices, updates, axis, reduction)
-
-
-def scatter_nd(data, indices, updates, mode="update"):
-    """Scatter values from an array and update.
-
-    See :py:func:`tvm.topi.scatter` for how data is scattered.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    indices : relay.Expr
-        The index locations to update.
-
-    updates : relay.Expr
-        The values to update.
-
-    mode : string, optional
-        The accumulation mode for scatter. "update", "add", "mul", "min" or "max"
-        If update, the update values will replace the input data
-        If add, the update values will be added to the input data
-        If mul, the update values will be multiply to the input data
-        If min, there is choice of minimal between the update values and the input data
-        If max, there is choice of maximal between the update values and the input data
-        It is "update" by default
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    return _make.scatter_nd(data, indices, updates, mode)
-
-
-def reshape_like(data, shape_like, lhs_begin=0, lhs_end=None, rhs_begin=0, rhs_end=None):
-    """Reshapes the input tensor by the size of another tensor.
-    For an input tensor with shape ``(d0, d1, ..., d(k-1))``, `reshape_like` operation reshapes
-    the input tensor into an output tensor with the same shape as the second input tensor,
-    in particular reshaping the dimensions of `data` in `[lhs_begin, lhs_end)` using the dimensions
-    from `shape_like` in `[rhs_begin, rhs_end)`.
-
-    .. note::
-        Sizes for `data` and the output tensor should be compatible.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    shape_like : relay.Expr
-        The tensor to reshape data like. Should be compatible with the original shape on the
-        reshaped dimensions.
-
-    lhs_begin : int, optional
-        The axis of data to begin reshaping. Default is 0.
-
-    lhs_end : int or None, optional
-        The axis of data where reshaping should stop, exclusive. Default is None which reshapes to
-        the end.
-
-    rhs_begin : int, optional
-        The axis of shape_like where the target shape begins. Default is 0.
-
-    rhs_end : int or None, optional
-        The axis of shape_like where the target shape ends, exclusive. Default is None which extends
-        to the end.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data.shape == (1, 2, 3, 4)
-        shape_like.shape == (6, 2, 2, 3)
-
-        ret = relay.reshape_like(data, shape_like, lhs_begin=1, rhs_end=3)
-        ret.shape == (1, 6, 2, 2)
-    """
-    return _make.reshape_like(data, shape_like, lhs_begin, lhs_end, rhs_begin, rhs_end)
-
-
-def take(data, indices, axis=None, batch_dims=0, mode="clip"):
-    """Take elements from an array along an axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source array.
-
-    indices : relay.Expr
-        The indices of the values to extract.
-
-    axis : int, optional
-        The axis over which to select values. By default,
-        the flattened input array is used.
-
-    batch_dims : int, optional
-        The number of batch dimensions. By default is 0.
-
-    mode : str, optional
-        Specifies how out-of-bound indices will behave [clip, wrap, fast].
-        clip: clip to the range (default).
-        wrap: wrap around the indices.
-        fast: no clip or wrap around (user must make sure indices are in-bound).
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    return _make.take(data, indices, batch_dims, axis, mode)
-
-
-def full(fill_value, shape=(), dtype=""):
-    """Fill array with scalar value.
-
-    Parameters
-    ----------
-    fill_value : relay.Expr
-        The value to fill. Must be a scalar.
-
-    shape : tuple of int or relay.Expr, optional
-        The shape of the target.
-
-    dtype : data type, optional (defaults to data type of the fill value)
-        The data type of the target.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    if isinstance(shape, Constant):
-        shape = list(shape.data.numpy())
-    if isinstance(shape, Expr):
-        return _dyn_make.full(fill_value, shape, dtype)
-    if isinstance(shape, int):
-        shape = [shape]
-    if isinstance(shape, (list, tuple)):
-        shape = list(shape)
-    return _make.full(fill_value, shape, dtype)
-
-
-def full_like(data, fill_value):
-    """Return a scalar value array with the same shape and type as the input array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-
-    fill_value : relay.Expr
-        The scalar value to fill.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    return _make.full_like(data, fill_value)
-
-
-def arange(start, stop=None, step=None, dtype="float32"):
-    """Return evenly spaced values within a given interval.
-
-    .. note::
-        Similar to ``numpy.arange``. When only one argument is given, it is used
-        as `stop` instead of `start` while `start` takes default value 0.
-
-        Warning: Undefined behavior when dtype is incompatible with start/stop/step.
-        It could lead to different results compared to numpy, MXNet, pytorch, etc.
-
-    Parameters
-    ----------
-    start : relay.Expr, optional
-        Start of interval. The interval includes this value. The default start
-        value is 0.
-
-    stop : relay.Expr
-        Stop of interval. The interval does not include this value.
-
-    step : relay.Expr, optional
-        Spacing between values. The default step size is 1.
-
-    dtype : str, optional
-        The target data type.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        relay.arange(5) = [0, 1, 2, 3, 4]
-        relay.arange(1, 5) = [1, 2, 3, 4]
-        relay.arange(1, 5, 1.5) = [1, 2.5, 4]
-    """
-    if step is None:
-        step = const(1, dtype=dtype)
-
-    if stop is None:
-        stop = start
-        start = const(0, dtype=dtype)
-
-    return _make.arange(start, stop, step, dtype)
-
-
-def meshgrid(data, indexing="ij"):
-    """Create coordinate matrices from coordinate vectors.
-
-    .. note::
-        Similar to ``numpy.meshgrid``.
-
-    Parameters
-    ----------
-    data : Union(List[relay.Expr], Tuple[relay.Expr])
-        A list of tensors, which must be either scalars or 1-D vectors.
-
-    indexing : str, optional
-        Indexing mode, either "ij" for matrix indexing or "xy" for Cartesian indexing.
-
-    Returns
-    -------
-    ret : relay.Tuple([relay.Expr, relay.Expr])
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [1, 2, 3]
-        y = [4, 5]
-
-        gx, gy = relay.meshgrid([x, y])
-
-        gx = [[1., 1.],
-              [2., 2.],
-              [3., 3.]]
-
-        gy = [[4., 5.],
-              [4., 5.],
-              [4., 5.]]
-    """
-    data = list(data)
-    ret_size = len(data)
-    return TupleWrapper(_make.meshgrid(Tuple(data), indexing), ret_size)
-
-
-def repeat(data, repeats, axis):
-    """Repeats elements of an array.
-    By default, repeat flattens the input array into 1-D and then repeats the elements.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-
-    repeats : int
-        The number of repetitions for each element.
-
-    axis: int
-        The axis along which to repeat values. The negative numbers are interpreted
-        counting from the backward. By default, use the flattened input array, and
-        return a flat output array.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[1, 2], [3, 4]]
-        relay.repeat(x, repeats=2) = [1., 1., 2., 2., 3., 3., 4., 4.]
-
-        relay.repeat(x, repeats=2, axis=1) = [[1., 1., 2., 2.],
-                                              [3., 3., 4., 4.]]
-    """
-    return _make.repeat(data, repeats, axis)
-
-
-def tile(data, reps):
-    """Repeats the whole array multiple times.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    reps : tuple of int or relay.Expr
-        The number of times repeating the tensor data.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[1, 2], [3, 4]]
-        relay.tile(x, reps=(2,3)) = [[1., 2., 1., 2., 1., 2.],
-                                     [3., 4., 3., 4., 3., 4.],
-                                     [1., 2., 1., 2., 1., 2.],
-                                     [3., 4., 3., 4., 3., 4.]]
-
-        relay.tile(x, reps=(2,)) = [[1., 2., 1., 2.],
-                                    [3., 4., 3., 4.]]
-
-    Notes
-    -----
-    Each dim size of reps must be a positive integer. If reps has length d,
-    the result will have dimension of max(d, data.ndim); If data.ndim < d,
-    data is promoted to be d-dimensional by prepending new axes.
-    If data.ndim >=  d, reps is promoted to a.ndim by pre-pending 1's to it.
-    """
-    if isinstance(reps, Constant):
-        reps = list(reps.data.numpy())
-    if isinstance(reps, Expr):
-        return _dyn_make.tile(data, reps)
-    return _make.tile(data, reps)
-
-
-def reverse(data, axis):
-    """Reverses the order of elements along given axis while preserving array shape.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis: int
-        The axis along which to reverse elements.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[1., 2.], [3., 4.]]
-        relay.reverse(x, axis=0) = [[3., 4.], [1., 2.]]
-
-        relay.reverse(x, axis=1) = [[2., 1.], [4., 3.]]
-    """
-    return _make.reverse(data, axis)
-
-
-def reverse_sequence(data, seq_lengths, seq_axis=1, batch_axis=0):
-    """Reverse the tensor for variable length slices.
-    Input is first sliced along batch axis and then elements are reversed along seq axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The tensor to be reversed.
-
-    seq_lengths : relay.Expr
-        A 1D Tensor with length a.dims[batch_axis].
-        Must be one of the following types: int32, int64.
-        If seq_lengths[i] > a.dims[seq_axis], it is rounded to a.dims[seq_axis].
-        If seq_lengths[i] < 1, it is rounded to 1.
-
-    seq_axis : int, optional
-        The axis along which the elements will be reversed. Default is 1.
-
-    batch_axis : int, optional
-        The axis along which the tensor will be sliced. Default is 0.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result of same shape and type as of input.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[0, 1, 2, 3],
-             [4, 5, 6, 7],
-             [8, 9, 10, 11],
-             [12, 13, 14, 15]]
-        relay.reverse(x, [1, 2, 3, 4], 0, 1) = [[0, 5, 10, 15],
-                                                [4, 1, 6, 11],
-                                                [8, 9, 2, 7],
-                                                [12, 13, 14, 3]]
-
-        relay.reverse(x, [1, 2, 3, 4], 1, 0) = [[0, 1, 2, 3],
-                                                [5, 4, 6, 7],
-                                                [10, 9, 8, 11],
-                                                [15, 14, 13, 12]]
-    """
-    return _make.reverse_sequence(data, seq_lengths, seq_axis, batch_axis)
-
-
-def where(condition, x, y):
-    """Selecting elements from either x or y depending on the value of the
-    condition.
-
-    .. note::
-        Shapes of condition, x, and y must be broadcastable to a common shape.
-        Semantics follow numpy where function
-        https://numpy.org/doc/stable/reference/generated/numpy.where.html
-
-    Parameters
-    ----------
-    condition : relay.Expr
-        Where True, yield x, otherwise yield y
-
-    x : relay.Expr
-        The first array or scalar to be selected.
-
-    y : relay.Expr
-        The second array or scalar to be selected.
-
-    Returns
-    -------
-    result : relay.Expr
-        The selected array. The output shape is the broadcasted shape from
-        condition, x, and y.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[1, 2], [3, 4]]
-        y = [[5, 6], [7, 8]]
-        condition = [[0, 1], [-1, 0]]
-        relay.where(conditon, x, y) = [[5, 2], [3, 8]]
-
-        condition = [[1], [0]]
-        relay.where(conditon, x, y) = [[1, 2], [7, 8]]
-    """
-    return _make.where(condition, x, y)
-
-
-def broadcast_to(data, shape):
-    """Return a scalar value array with the same type, broadcasted to
-    the provided shape.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-
-    shape : tuple of int or relay.Expr
-        Provide the shape to broadcast to.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    if isinstance(shape, Constant):
-        shape = shape.data.numpy()
-        shape = [_expr.IntImm(str(shape.dtype), int(value)) for value in shape]
-    elif isinstance(shape, Expr):
-        return _dyn_make.broadcast_to(data, shape)
-
-    if isinstance(shape, int):
-        shape = [shape]
-
-    return _make.broadcast_to(data, shape)
-
-
-def broadcast_to_like(data, broadcast_type):
-    """Return a scalar value array with the same shape and type as the input array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-
-    broadcast_type : relay.Expr
-        Provide the shape to broadcast to.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    return _make.broadcast_to_like(data, broadcast_type)
-
-
-def collapse_sum_like(data, collapse_type):
-    """Return a scalar value array with the same shape and type as the input array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-
-    collapse_type : relay.Expr
-        Provide the shape to collapse to.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    return _make.collapse_sum_like(data, collapse_type)
-
-
-def collapse_sum_to(data, shape):
-    """Return a summation of data to the specified shape.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input tensor.
-
-    shape : relay.Expr
-        Shape to collapse to.
-
-    Returns
-    -------
-    result : relay.Expr
-        The resulting tensor.
-    """
-    if isinstance(shape, (list, tuple)):
-        shape = const(list(shape), "int32")
-    return _make.collapse_sum_to(data, shape)
-
-
-def split(data, indices_or_sections, axis=0):
-    """Split input tensor along axis by sections or indices.
-
-    If indices_or_sections is an integer, the input will be divided equally
-    along given axis. If such a split is not possible, an error is raised.
-
-    If indices_or_sections is a tuple of sorted integers,
-    the entries indicate where along axis the array is split.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source array.
-
-    indices_or_sections : int or tuple of int
-        Indices or sections to split into. Accepts an int or a tuple.
-
-    axis : int, optional
-        The axis over which to split.
-
-    Returns
-    -------
-    ret : relay.Tuple([relay.Expr, relay.Expr])
-        The computed result.
-    """
-    if isinstance(indices_or_sections, int):
-        ret_size = indices_or_sections
-    else:
-        ret_size = len(indices_or_sections) + 1
-    return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size)
-
-
-def strided_slice(data, begin, end, strides=None, axes=None, slice_mode="end"):
-    """Strided slice of an array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source array to be sliced.
-
-    begin : relay.Expr, Tuple[int], or List[int]
-        The indices to begin with in the slicing.
-
-    end : relay.Expr, Tuple[int], or List[int]
-        Indices indicating end of the slice.
-
-    strides : relay.Expr, Tuple[int], or List[int], optional
-        Specifies the stride values. It can be negative. In that case,
-        the input tensor will be reversed in that particular axis.
-
-    axes : Tuple[int] or List[int], optional
-        Axes along which slicing is applied. When it is specified, the length of begin, end,
-        strides, and axes must be equal. Moreover, begin, end, strides, and axes must be
-        static (cannot be relay.Expr). Axes argument for dynamic parameter slicing is
-        not supported yet.
-
-    slice_mode : str, optional
-        The slice mode [end, size].
-        end: The ending indices for the slice [default].
-        size: The input strides will be ignored. Input end in this mode indicates
-        the size of a slice starting at the location specified by begin. If end[i]
-        is -1, all remaining elements in that dimension are included in the slice.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    strides = strides or [1]
-    if isinstance(begin, Constant):
-        begin = list(begin.data.numpy())
-    if isinstance(end, Constant):
-        end = list(end.data.numpy())
-    if isinstance(strides, Constant):
-        strides = list(strides.data.numpy())
-    if isinstance(begin, Expr) or isinstance(end, Expr) or isinstance(strides, Expr):
-        if isinstance(begin, (tuple, list)):
-            begin = const(list(begin))
-        if isinstance(end, (tuple, list)):
-            end = const(list(end))
-        if isinstance(strides, (tuple, list)):
-            strides = const(list(strides))
-
-        ishape = cast_like(shape_of(data), begin)
-        ishape_slice = slice_like(ishape, begin)
-        begin = _make.where(begin < cast_like(const(0), begin), begin + ishape_slice, begin)
-        begin = _make.where(begin >= ishape_slice, ishape_slice, begin)
-        # TODO(masahi): Support axes argument in dynamic strided slice
-        assert axes is None, "Axes argument for dynamic parameter slicing is not supported yet."
-        return _dyn_make.strided_slice(data, begin, end, strides, slice_mode)
-    return _make.strided_slice(data, begin, end, strides, slice_mode, axes)
-
-
-def strided_set(data, v, begin, end, strides=None):
-    """Strided set of an array.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source array to be sliced.
-
-    v : relay.Expr
-        The data to be set.
-
-    begin : relay.Expr, Tuple[int], or List[int]
-        The indices to begin with in the slicing.
-
-    end : relay.Expr, Tuple[int], or List[int]
-        Indices indicating end of the slice.
-
-    strides: relay.Expr, Tuple[int], or List[int], optional
-        Specifies the stride values. It can be negative. In that case,
-        the input tensor will be reversed in that particular axis.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    strides = strides or const([1], dtype="int32")
-    if isinstance(begin, (tuple, list)):
-        begin = const(list(begin))
-    if isinstance(end, (tuple, list)):
-        end = const(list(end))
-    if isinstance(strides, (tuple, list)):
-        strides = const(list(strides))
-    return _make.strided_set(data, v, begin, end, strides)
-
-
-def slice_like(data, shape_like, axes=None):
-    """Slice the first input with respect to the second input.
-
-    For an input array with shape ``(d1, d2, ..., dk)``, `slice_like` operation slices the
-    input array corresponding to the size of the second array. By default will slice on all axes.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source array.
-
-    shape_like : relay.Expr
-        An array based on which shape, the result shape is computed.
-
-    axes : Tuple[int] or List[int], optional
-        List of axes on which input data will be sliced according to the corresponding size of
-        the second input. By default will slice on all axes. Negative axes mean counting in reverse.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.slice_like(data, shape_like, axes)
-
-
-def layout_transform(data, src_layout, dst_layout):
-    """Transform the layout of a tensor.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source tensor to be transformed.
-
-    src_layout : str
-        The source layout.  (e.g NCHW)
-
-    dst_layout : str
-        The destination layout.  (e.g. NCHW16c)
-
-    Returns
-    -------
-    ret : relay.Expr
-        The transformed tensor.
-    """
-    return _make.layout_transform(data, src_layout, dst_layout)
-
-
-def reverse_reshape(data, newshape):
-    """Reshapes the input array where the special values are inferred from
-    right to left.
-
-    The special values have the same semantics as :py:class:`tvm.relay.reshape`.
-    The difference is that special values are inferred from right to left. It
-    can be explained in the example below.
-
-    .. code-block:: python
-
-        data.shape = (10,5,4), newshape = (-1,0), reshape results in (40,5)
-        data.shape = (10,5,4), newshape = (-1,0), reverse_reshape results in (50,4)
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    newshape : Union[int, Tuple[int], List[int]]
-        The new shape. Should be compatible with the original shape.
-
-    Returns
-    -------
-    result : relay.Expr
-        The reshaped result.
-    """
-    if isinstance(newshape, int):
-        newshape = [newshape]
-    return _make.contrib_reverse_reshape(data, list(newshape))
-
-
-def gather(data, axis, indices):
-    """Gather values along given axis from given indices.
-
-    E.g. for a 3D tensor, output is computed as:
-
-    .. code-block:: python
-
-        out[i][j][k] = data[indices[i][j][k]][j][k]  # if axis == 0
-        out[i][j][k] = data[i][indices[i][j][k]][k]  # if axis == 1
-        out[i][j][k] = data[i][j][indices[i][j][k]]  # if axis == 2
-
-    ``indices`` must have the same shape as ``data``, except at dimension ``axis``
-    which must just be not null. Output will have the same shape as ``indices``.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis : int
-        The axis along which to index. Negative axis is supported.
-
-    indices : relay.Expr
-        The indices of values to gather.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [[1, 2], [3, 4]]
-        axis = 1
-        indices = [[0, 0], [1, 0]]
-        relay.gather(data, axis, indices) = [[1, 1], [4, 3]]
-    """
-    return _make.gather(data, axis, indices)
-
-
-def gather_nd(data, indices, batch_dims=0, index_rank=None):
-    """Gather elements or slices from data and store them to a tensor whose shape is
-    defined by indices.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    indices : relay.Expr
-        The shape of output tensor.
-
-    batch_dims : int, optional
-        The number of batch dimensions.
-
-    index_rank : int, optional
-        The size of an indexing tuple, which is a fixed value and the same as indices.shape[0].
-        Only needed when other dimensions of indices are dynamic.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [[0, 1], [2, 3]]
-        indices = [[1, 1, 0], [0, 1, 0]]
-        relay.gather_nd(data, indices) = [2, 3, 0]
-
-        data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
-        indices = [[0, 1], [1, 0]]
-        relay.gather_nd(data, indices) = [[3, 4], [5, 6]]
-
-        data = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
-        indices = [[1, 0]]
-        relay.gather_nd(data, indices, batch_dims=1) = [[2, 3],[4, 5]]
-    """
-    return _make.gather_nd(data, indices, batch_dims, index_rank)
-
-
-def sequence_mask(data, valid_length, mask_value=0, axis=0):
-    """Sets all elements outside the expected length of the sequence to a constant value.
-
-    This function takes an n-dimensional input array of the form [MAX_LENGTH, batch_size, ...] or
-    [batch_size, MAX_LENGTH, ...] and returns an array of the same shape.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data.
-
-    valid_length : relay.Expr
-        The expected (valid) length of each sequence in the tensor.
-
-    mask_value : float, optional
-        The masking value.
-
-    axis : int, optional
-        The axis of the length dimension.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[[  1.,   2.,   3.], [  4.,   5.,   6.]],
-             [[  7.,   8.,   9.], [ 10.,  11.,  12.]],
-             [[ 13.,  14.,   15.], [ 16.,  17.,   18.]]]
-
-       relay.sequence_mask(x, valid_length=[1, 1]) =
-            [[[  1.,   2.,   3.], [  4.,   5.,   6.]],
-             [[  0.,   0.,   0.], [  0.,   0.,   0.]],
-             [[  0.,   0.,   0.], [  0.,   0.,   0.]]]
-
-       relay.sequence_mask(x, valid_length=[2, 3], mask_value=0.1) =
-            [[[  1.,   2.,   3.], [  4.,   5.,   6.]],
-             [[  7.,   8.,   9.], [  10.,  11.,  12.]],
-             [[  0.1,  0.1,  0.1], [  16.,  17.,  18.]]]
-    """
-    return _make.sequence_mask(data, valid_length, mask_value, axis)
-
-
-def one_hot(indices, on_value, off_value, depth, axis, dtype):
-    """Returns a one-hot tensor where the locations represented by indices take value on_value,
-    and other locations take value off_value.
-    Final dimension is <indices outer dimensions> x depth x <indices inner dimensions>.
-
-    Parameters
-    ----------
-    indices : relay.Expr
-        Locations to set to on_value.
-
-    on_value : relay.Expr
-        Value to fill at indices.
-
-    off_value : relay.Expr
-        Value to fill at all other positions besides indices.
-
-    depth : int or relay.Expr
-        Depth of the one-hot dimension.
-
-    axis : int
-        Axis to fill.
-
-    dtype : str
-        Data type of the output tensor.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The one-hot tensor.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        indices = [0, 1, 2]
-
-        relay.one_hot(indices, 3) =
-            [[1, 0, 0],
-             [0, 1, 0],
-             [0, 0, 1]]
-    """
-    if isinstance(depth, Constant):
-        depth = depth.data.numpy().item()
-    if isinstance(depth, Expr):
-        return _dyn_make.one_hot(indices, on_value, off_value, depth, axis, dtype)
-    return _make.one_hot(indices, on_value, off_value, depth, axis, dtype)
-
-
-def unravel_index(indices, shape):
-    """Convert a flat index or array of flat indices into a tuple of coordinate arrays.
-
-    Parameters
-    ----------
-    indices : relay.Expr
-        An integer array containing indices.
-
-    shape : relay.Expr
-        The shape of the array.
-
-    Returns
-    -------
-    result : relay.Expr
-        The tuple of coordinate arrays.
-
-    Examples
-    -------
-    .. code-block:: python
-
-        relay.unravel_index([22, 41, 37], [7, 6]) =
-            [[3, 6, 6],
-             [4, 5, 1]]
-    """
-    return _make.unravel_index(indices, shape)
-
-
-def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0):
-    """Converts a sparse representation into a dense tensor.
-
-    Parameters
-    ----------
-    sparse_indices : relay.Expr
-        A 0-D, 1-D, or 2-D tensor of integers containing location of sparse values.
-
-    output_shape : relay.Expr
-        A list of integers. Shape of the dense output tensor.
-
-    sparse_values : relay.Expr
-        A 0-D or 1-D tensor containing the sparse values for the sparse indices.
-
-    default_value : relay.Expr, optional
-        A 0-D tensor containing the default value for the remaining locations.
-        Defaults to 0.
-
-    Returns
-    -------
-    result : relay.Expr
-        Dense tensor of shape output_shape. Has the same type as sparse_values.
-
-    Examples
-    -------
-    .. code-block:: python
-
-        relay.sparse_to_dense([[0, 0], [1, 1]], [2, 2], [3, 3], 0) =
-            [[3, 0],
-             [0, 3]]
-    """
-    if default_value == 0:
-        default_value = const(0)
-    if isinstance(output_shape, Expr):
-        return _dyn_make.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value)
-    return _make.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value)
-
-
-def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"):
-    """Returns a tensor with the diagonals of input tensor replaced with the provided
-    diagonal values.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        Input tensor.
-
-    diagonal : relay.Expr
-        Values to be filled in the diagonal.
-
-    k : int or tuple of int, optional
-        Diagonal offset(s). The diagonal or range of diagonals to set. (0 by default)
-        Positive value means superdiagonal, 0 refers to the main diagonal, and
-        negative value means subdiagonals. k can be a single integer (for a single diagonal)
-        or a pair of integers specifying the low and high ends of a matrix band.
-        k[0] must not be larger than k[1].
-
-    align : string, optional
-        Some diagonals are shorter than max_diag_len and need to be padded.
-        align is a string specifying how superdiagonals and subdiagonals should be aligned,
-        respectively. There are four possible alignments: "RIGHT_LEFT" (default), "LEFT_RIGHT",
-        "LEFT_LEFT", and "RIGHT_RIGHT". "RIGHT_LEFT" aligns superdiagonals to the right
-        (left-pads the row) and subdiagonals to the left (right-pads the row). It is the packing
-        format LAPACK uses. cuSPARSE uses "LEFT_RIGHT", which is the opposite alignment.
-
-    Returns
-    -------
-    result : relay.Expr
-        New tensor with given diagonal values.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [[[7, 7, 7, 7],
-                 [7, 7, 7, 7],
-                 [7, 7, 7, 7]],
-                [[7, 7, 7, 7],
-                 [7, 7, 7, 7],
-                 [7, 7, 7, 7]]]
-
-        diagonal = [[1, 2, 3],
-                    [4, 5, 6]]
-
-        relay.matrix_set_diag(input, diagonal) =
-            [[[1, 7, 7, 7],
-              [7, 2, 7, 7],
-              [7, 7, 3, 7]],
-             [[4, 7, 7, 7],
-              [7, 5, 7, 7],
-              [7, 7, 6, 7]]]
-    """
-    if isinstance(k, (tuple, list)):
-        k_one = k[0]
-        if len(k) >= 2:
-            k_two = k[1]
-        else:
-            k_two = k[0]
-    else:
-        k_one = k
-        k_two = k
-
-    super_diag_right_align = align[:5] == "RIGHT"
-    sub_diag_right_align = align[-5:] == "RIGHT"
-
-    return _make.matrix_set_diag(
-        data, diagonal, k_one, k_two, super_diag_right_align, sub_diag_right_align
-    )
-
-
-def adv_index(inputs):
-    """Numpy style advanced indexing. Index with a list of tensors.
-
-    Parameters
-    ----------
-    inputs : Union(List[relay.Expr], Tuple[relay.Expr])
-        Input tensor and indices.
-        The first tensor is the input data and the rest are the indices.
-
-    Returns
-    -------
-    result : relay.Expr
-        Output tensor.
-    """
-    return _make.adv_index(Tuple(inputs))
-
-
-def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value):
-    """Fill rows in a sparse matrix that do not contain any values. Values are placed in the first
-    column of empty rows. The sparse array is in COO format.
-    It returns a TupleWrapper with 3 outputs.
-
-    Parameters
-    ----------
-    sparse_indices : relay.Expr
-        A 2-D tensor[N, ndims] of integers containing the locations of sparse values, where N is
-        the number of sparse values and n_dim is the number of dimensions of the dense_shape.
-        The first column of this parameter must be sorted in ascending order.
-
-    sparse_values : relay.Expr
-        A 1-D tensor[N] containing the sparse values for the sparse indices.
-
-    dense_shape : relay.Expr
-        A 1-D tensor[ndims] which contains the shape of the dense output tensor.
-
-    default_value : relay.Expr
-        A 1-D tensor[1] containing the default value for the remaining locations.
-
-    Returns
-    -------
-    new_sparse_indices : relay.Expr
-        A 2-D tensor[?, ndims] of integers containing location of new sparse
-        indices. The first column outputs must be sorted in ascending order.
-
-    new_sparse_values : relay.Expr
-        A 1-D tensor[?] containing the sparse values for the sparse indices.
-
-    empty_row_indicator : relay.Expr
-        A 1-D tensor[dense_shape[0]] filled with zeros and ones
-        indicating whether the particular row is empty or full respectively.
-
-    Note
-    ----
-    This op exactly follows the documentation here:
-    https://www.tensorflow.org/api_docs/python/tf/sparse/fill_empty_rows
-    There are two exceptions:
-    1. Input Sparse Indices are expected to be in row-major order.
-    2. Empty Row Indicator has int64 output type with 1(for True) and 0(for False).
-
-    Examples
-    -------
-    .. code-block:: python
-
-        sparse_indices = [[0, 1],
-                         [0, 3],
-                         [2, 0],
-                         [3, 1]]
-
-        sparse_values = [1, 2, 3, 4]
-
-        default_value = [10]
-
-        dense_shape = [5, 6]
-
-        new_sparse_indices, empty_row_indicator, new_sparse_values =
-                            relay.sparse_fill_empty_rows(
-                            sparse_indices,
-                            sparse_values,
-                            default_value,
-                            dense_shape)
-
-        new_sparse_indices = [[0, 1],
-                              [0, 3],
-                              [1, 0],
-                              [2, 0],
-                              [3, 1],
-                              [4, 0]]
-
-        empty_row_indicator = [False, True, False, False, True]
-
-        new_sparse_values = [1, 2, 10, 3, 4, 10]
-    """
-    new_sparse_indices, new_sparse_values, empty_row_indicator = TupleWrapper(
-        _make.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value), 3
-    )
-    new_sparse_indices = cast_like(new_sparse_indices, sparse_indices)
-    new_sparse_values = cast_like(new_sparse_values, sparse_values)
-    empty_row_indicator = cast(empty_row_indicator, "bool")
-
-    return Tuple((new_sparse_indices, new_sparse_values, empty_row_indicator))
-
-
-def sparse_reshape(sparse_indices, prev_shape, new_shape):
-    """Reshape a sparse tensor. The sparse array is in COO format.
-
-    Parameters
-    ----------
-    sparse_indices : relay.Expr
-        A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the
-        number of sparse values and n_dim is the number of dimensions of the dense_shape.
-
-    prev_shape : relay.Expr
-        A 1-D tensor containing the previous shape of the dense tensor.
-
-    new_shape : relay.Expr
-        A 1-D tensor containing the new shape of the dense tensor.
-
-    Returns
-    -------
-    result: relay.Expr
-        Output tensor.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        sparse_indices = [[0, 0, 0],
-                          [0, 0, 1],
-                          [0, 1, 0],
-                          [1, 0, 0],
-                          [1, 2, 3]]
-
-        prev_shape = [2, 3, 6]
-
-        new_shape = [9, -1]
-
-        new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices,
-                                                             prev_shape,
-                                                             new_shape)
-        new_sparse_indices = [[0, 0],
-                              [0, 1],
-                              [1, 2],
-                              [4, 2],
-                              [8, 1]]
-        new_shape = [9, 4]
-    """
-    return TupleWrapper(_make.sparse_reshape(sparse_indices, prev_shape, new_shape), 2)
-
-
-def segment_sum(data, segment_ids, num_segments=None):
-    """Computes the sum along segment_ids along axis 0. If multiple segment_ids reference the same
-    location their contributions add up.
-    result[index, j, k, ...] = Σi... data[i, j, k,..] where index = segment_ids[i]
-    This op is much better understood with visualization articulated in the following links and
-    examples at the end of this docstring.
-
-    https://www.tensorflow.org/api_docs/python/tf/math/unsorted_segment_sum
-    https://caffe2.ai/docs/sparse-operations.html#null__unsorted-segment-reduction-ops
-
-    Parameters
-    ----------
-    data : relay.Expr
-        Input tensor. It can be of any type and multi-dimensional.
-
-    segment_ids : relay.Expr
-        A 1-D int32/int64 tensor containing the segment_ids of the rows to calculate the output
-        sum upon. It defines a mapping from the zeroth dimension of data onto segment_ids. The
-        segment_ids tensor should be the size of the first dimension, d0, with consecutive IDs
-        in the range 0 to k, where k<d0. In particular, a segmentation of a matrix tensor is a
-        mapping of rows to segments. This tensor doesn't need to be sorted.
-
-    num_segments : int, optional
-        An integer describing the shape of the zeroth dimension. If unspecified, it is calculated
-        equivalent to the number of unique segment_ids.
-
-    Returns
-    -------
-    result : relay.Expr
-        Output tensor.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [[1, 2, 3, 4],
-                [4, -3, 2, -1],
-                [5, 6, 7, 8]]
-
-        segment_ids = [0, 0, 1]
-
-        relay.segment_sum(data, segment_ids) = [[5, -1, 5, 3],
-                                                [5, 6, 7, 8]]
-
-        data = [[1, 2, 3, 4],
-                [4, -3, 2, -1],
-                [5, 6, 7, 8]]
-
-        segment_ids = [2, 0, 0]
-
-        num_segments = 3
-
-        segment_sum(data, segment_ids, num_segments) = [[9, 3, 9, 7],
-                                                        [0, 0, 0, 0],
-                                                        [1, 2, 3, 4]]
-    """
-
-    one_tensor = cast_like(const([1]), segment_ids)
-    if num_segments:
-        if isinstance(num_segments, int):
-            max_segments = const([num_segments])
-            max_segments = cast_like(max_segments, segment_ids)
-        else:
-            max_segments = cast_like(num_segments, segment_ids)
-    else:
-        max_segments = _make.add(reshape(_make.max(segment_ids, [0], False, False), -1), one_tensor)
-
-    data_offrow_shape = strided_slice(_make.shape_of(data, "int32"), [1], [-1], slice_mode="size")
-    data_offrow_shape = cast_like(data_offrow_shape, max_segments)
-    new_shape = _make.concatenate(Tuple([max_segments, data_offrow_shape]), 0)
-    segment_ids_tiled_shape = _make.concatenate(
-        Tuple([reverse(data_offrow_shape, 0), one_tensor]), 0
-    )
-    expanded_segment_ids = tile(segment_ids, segment_ids_tiled_shape)
-    scatter_add_segment_ids = transpose(expanded_segment_ids)
-    src = cast_like(_dyn_make.zeros(new_shape, "float64"), data)
-    return scatter_elements(src, scatter_add_segment_ids, data, axis=0, reduction="add")
-
-
-def cumsum(data, axis=None, dtype=None, exclusive=None):
-    """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
-    a given axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis : int, optional
-        Axis along which the cumulative sum is computed. The default (None) is to compute
-        the cumsum over the flattened array.
-
-    dtype : string, optional
-        Type of the returned array and of the accumulator in which the elements are summed.
-        If dtype is not specified, it defaults to the dtype of data.
-
-    exclusive : bool, optional
-        If true will return exclusive sum in which the first element is not
-        included. In other terms, if true, the j-th output element would be
-        the sum of the first (j-1) elements. Otherwise, it would be the sum of
-        the first j elements.
-
-    Returns
-    -------
-    result : relay.Expr
-        The result has the same size as data, and the same shape as data if axis is not None.
-        If axis is None, the result is a 1-d array.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        a = [[1, 2, 3], [4, 5, 6]]
-
-        cumsum(a)  # if axis is not provided, cumsum is done over the flattened input.
-        -> [ 1,  3,  6, 10, 15, 21]
-
-        cumsum(a, dtype="float32")
-        -> [  1.,   3.,   6.,  10.,  15.,  21.]
-
-        cumsum(a, axis=0)  # sum over rows for each of the 3 columns
-        -> [[1, 2, 3],
-            [5, 7, 9]]
-
-        cumsum(a, axis=1)
-        -> [[ 1,  3,  6],
-            [ 4,  9, 15]]
-
-        a = [1, 0, 1, 0, 1, 1, 0]  # a is a boolean array
-        cumsum(a, dtype=int32)  # dtype should be provided to get the expected results
-        -> [1, 1, 2, 2, 3, 4, 4]
-    """
-    return _make.cumsum(data, axis, dtype, exclusive)
-
-
-def cumprod(data, axis=None, dtype=None, exclusive=None):
-    """Numpy style cumprod op. Return the cumulative inclusive product of the elements along
-    a given axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    axis : int, optional
-        Axis along which the cumulative product is computed. The default (None) is to compute
-        the cumprod over the flattened array.
-
-    dtype : string, optional
-        Type of the returned array and of the accumulator in which the elements are multiplied.
-        If dtype is not specified, it defaults to the dtype of data.
-
-    exclusive : bool, optional
-        If true will return exclusive product in which the first element is not
-        included. In other terms, if true, the j-th output element would be
-        the product of the first (j-1) elements. Otherwise, it would be the product of
-        the first j elements. The product of zero elements will be 1.
-
-    Returns
-    -------
-    result : relay.Expr
-        The result has the same size as data, and the same shape as data if axis is not None.
-        If axis is None, the result is a 1-d array.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        a = [[1, 2, 3], [4, 5, 6]]
-
-        cumprod(a)  # if axis is not provided, cumprod is done over the flattened input.
-        -> [ 1,  2,  6, 24, 120, 720]
-
-        cumprod(a, dtype="float32")
-        -> [  1.,  2.,  6., 24., 120., 720.]
-
-        cumprod(a, axis=0)  # multiply over rows for each of the 3 columns
-        -> [[1, 2, 3],
-            [4, 10, 18]]
-
-        cumprod(a, axis=1)
-        -> [[ 1,  2,  6],
-            [ 4,  20, 120]]
-
-        a = [1, 1, 1, 0, 1, 1, 0]  # a is a boolean array
-        cumprod(a, dtype=int32)  # dtype should be provided to get the expected results
-        -> [1, 1, 1, 0, 0, 0, 0]
-    """
-    return _make.cumprod(data, axis, dtype, exclusive)
-
-
-def unique(data, is_sorted=True, return_counts=False):
-    """Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
-    have the same length of `data` and element with index >= num_unique[0] has undefined value.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        A 1-D tensor of integers.
-
-    is_sorted : bool, optional
-        Whether to sort the unique elements in ascending order before returning as output.
-
-    return_counts : bool, optional
-        Whether to return the count of each unique element.
-
-    Returns
-    -------
-    unique : relay.Expr
-        A 1-D tensor containing the unique elements of the input data tensor.
-
-    indices : relay.Expr
-        A 1-D tensor containing the indeces of the first occurence of each unique value
-        in the input tensor.
-
-    inverse_indices : relay.Expr
-        A 1-D tensor. For each entry in data, it contains the index of that data element in the
-        unique array.
-
-    num_unique : relay.Expr
-        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
-
-    counts : relay.Expr, optional
-        A 1-D tensor containing the count of each unique element in the output.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        [output, indices, inverse_indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5],
-                                                                False,
-                                                                False)
-        output          =  [4, 5, 1, 2, 3, _, _, _]
-        indices         =  [0, 1, 2, 3, 4, _, _, _]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-
-        [output, indices, inverse_indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5],
-                                                                        False,
-                                                                        True)
-        output          =  [4, 5, 1, 2, 3, _, _, _]
-        indices         =  [0, 1, 2, 3, 4, _, _, _]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-        counts          =  [2, 2, 1, 1, 2, _, _, _]
-
-        [output, indices, inverse_indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
-        output          =  [1, 2, 3, 4, 5, _, _, _]
-        indices         =  [2, 3, 4, 0, 1, _, _, _]
-        inverse_indices =  [3, 4, 0, 1, 2, 2, 3, 4]
-        num_unique      =  [5]
-    """
-    if return_counts:
-        return TupleWrapper(_make.unique(data, is_sorted, return_counts), 5)
-    return TupleWrapper(_make.unique(data, is_sorted, return_counts), 4)
-
-
-def invert_permutation(data):
-    """Computes the inverse permutation of data.
-    This operation computes the inverse of an index permutation.
-    It takes a 1-D integer tensor x, which represents the indices of a zero-based
-    array and swaps each value with its index position.
-
-    For an output tensor y and an input tensor x, this operation computes the following:
-    y[x[i]] = i for i in [0, 1, ..., len(x) - 1]
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source data to be invert permuted.
-
-    Returns
-    -------
-    ret : relay.Expr
-        Invert permuted data. Has the same type as data.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [3, 4, 0, 2, 1]
-        relay.invert_permutation(data) = [2, 4, 3, 0, 1]
-    """
-    return _make.invert_permutation(data)
-
-
-def stft(
-    data, n_fft, hop_length=None, win_length=None, window=None, normalized=False, onesided=True
-):
-    """The STFT computes the Fourier transform of short overlapping windows of the input.
-    This gives frequency components of the signal as they change over time.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        Either a 1-D tensor or a 2-D batch tensor.
-
-    n_fft : int
-        The size of Fourier transform.
-
-    hop_length : int, optional
-        The distance between neighboring sliding window frames. If is None,
-        it is treated as equal to floor(n_fft / 4).
-
-    win_length : int, optional
-        The size of window frame and STFT filter. If is None, it is treated as equal to n_fft.
-
-    window : relay.Expr, optional
-        A 1-D tensor window frame. If is None (default), it is treated as if
-        having 1 everywhere in the window.
-
-    normalized : bool, optional
-        Whether to return the normalized STFT results. Default value is False.
-
-    onesided : bool, optional
-        Whether to return onesided result or fill with conjugate symmetry. Default value is True.
-
-    Returns
-    -------
-    output : relay.Expr
-        Tensor containing the STFT result with shape [batch, N, T, 2], where N is the
-        number of frequencies where STFT is applied and T is the total number of frames used.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [1, 2, 3, 4, 5, 6]
-        window = [4, 3, 2]
-        [n_fft, hop_length, win_length, normalized, onesided] = [3, 3, 3, False, True]
-
-        relay.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
-        -> [[[16.0000,  0.0000], [43.0000,  0.0000]], [[ -2.0000,  0.0000], [ 2.5000, -2.5981]]]
-    """
-    if hop_length is None:
-        hop_length = n_fft // 4
-
-    if win_length is None:
-        win_length = n_fft
-
-    if window is None:
-        window = _make.ones([n_fft], "int32")
-
-    return _make.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
-
-
-def dft(re_data, im_data, inverse: Optional[bool] = False):
-    """Computes the discrete Fourier transform of input (calculation along the last axis).
-    This gives frequency components of the signal as they change over time.
-
-    Parameters
-    ----------
-    re_data : relay.Expr
-        N-D tensor, real part of the input signal.
-
-    im_data : relay.Expr
-        N-D tensor, imaginary part of the input signal.
-        If the signal is real, then the values of this tensor are zeros.
-
-    inverse : Optional[bool]
-
-        Whether to perform the inverse discrete fourier transform.
-        Providing None is equivalent to False, and is maintained for
-        compatibility.
-
-    Returns
-    -------
-    re_output : relay.Expr
-        The Fourier Transform of the input (Real part).
-    im_output : relay.Expr
-        The Fourier Transform of the input (Imaginary part).
-
-    """
-    if inverse is None:
-        inverse = False
-
-    return TupleWrapper(_make.dft(re_data, im_data, inverse), 2)
-
-
-def trilu(data, k, upper=True):
-    """Given a 2-D matrix or batches of 2-D matrices, returns the
-    upper or lower triangular part of the tensor.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The tensor that trilu will be applied to. Must be either
-        a 2D matrix or a tensor of batches of 2D matrices.
-
-    k : int
-        The number of diagonals above or below the main diagonal
-        to exclude or include.
-
-    upper: bool, optional
-        If True, only upper triangular values of input are kept,
-        if False, the lower triangular values are kept.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The new tensor with appropriate diagonals set to zero.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        x = [[0, 1, 2],
-             [3, 4, 5],
-             [6, 7, 8]]
-
-        relay.trilu(x, True, 0) =
-            [[0, 1, 2],
-             [0, 4, 5],
-             [0, 0, 8]]
-    """
-    if not isinstance(k, Expr):
-        k = const(k, dtype="int32")
-    return _make.trilu(data, k, upper)
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py
deleted file mode 100644
index 55e6bf9d5fd9..000000000000
--- a/python/tvm/relay/op/vision/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Vision network related operators."""
-from .multibox import *
-from .nms import *
-from .rcnn import *
-from .yolo import *
-from . import _rcnn
-from . import _yolo
-from . import _vision
diff --git a/python/tvm/relay/op/vision/_make.py b/python/tvm/relay/op/vision/_make.py
deleted file mode 100644
index eddca15c19b5..000000000000
--- a/python/tvm/relay/op/vision/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.vision._make", __name__)
diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py
deleted file mode 100644
index a3f749236d3f..000000000000
--- a/python/tvm/relay/op/vision/_rcnn.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Faster R-CNN and Mask R-CNN operations."""
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-from .. import op as reg
-from .. import strategy
-from ..op import OpPattern
-
-# roi_align
-reg.register_strategy("vision.roi_align", strategy.roi_align_strategy)
-reg.register_pattern("vision.roi_align", OpPattern.OUT_ELEMWISE_FUSABLE)
-
-
-@reg.register_convert_op_layout("vision.roi_align")
-def convert_roi_align(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for roi_align op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current roi_align
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and rois inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    data, rois = inputs
-    new_attrs = dict(attrs)
-    assert (
-        len(desired_layouts) == 2
-    ), "A desired layout is expected for both of vision.roi_align's inputs"
-
-    desired_data_layout, desired_rois_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    assert desired_rois_layout == "default", "Rois layout must be default"
-
-    new_attrs["layout"] = desired_data_layout
-    # rois layout not change
-    if desired_data_layout in ["NCHW", "NHWC"]:
-        return relay.vision.roi_align(data, rois, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported.")
-
-
-@reg.register_convert_op_layout("vision.roi_pool")
-def convert_roi_pool(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for roi_pool op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current roi_pool
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and rois inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    data, rois = inputs
-    new_attrs = dict(attrs)
-    assert (
-        len(desired_layouts) == 2
-    ), "A desired layout is expected for both of vision.roi_pool's inputs"
-
-    desired_data_layout, desired_rois_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-    assert desired_rois_layout == "default", "Rois layout must be default"
-
-    new_attrs["layout"] = desired_data_layout
-    # rois layout not change
-    if desired_data_layout in ["NCHW", "NHWC"]:
-        return relay.vision.roi_pool(data, rois, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported.")
-
-
-# roi_pool
-@reg.register_compute("vision.roi_pool")
-def compute_roi_pool(attrs, inputs, _):
-    """Compute definition of roi_pool"""
-    assert attrs.layout == "NCHW", "only support nchw for now"
-    return [
-        topi.vision.rcnn.roi_pool_nchw(
-            inputs[0],
-            inputs[1],
-            pooled_size=get_const_tuple(attrs.pooled_size),
-            spatial_scale=attrs.spatial_scale,
-        )
-    ]
-
-
-reg.register_schedule("vision.roi_pool", strategy.schedule_roi_pool)
-reg.register_pattern("vision.roi_pool", OpPattern.OUT_ELEMWISE_FUSABLE)
-
-# proposal
-reg.register_strategy("vision.proposal", strategy.proposal_strategy)
-reg.register_pattern("vision.proposal", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py
deleted file mode 100644
index 522b79d6b56c..000000000000
--- a/python/tvm/relay/op/vision/_vision.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Definition of vision ops"""
-from __future__ import absolute_import
-
-from tvm import topi
-from tvm.te.hybrid import script
-from tvm.runtime import convert
-
-from .. import op as reg
-from .. import strategy
-from ..op import OpPattern
-
-# multibox_prior
-reg.register_strategy("vision.multibox_prior", strategy.multibox_prior_strategy)
-reg.register_pattern("vision.multibox_prior", OpPattern.OPAQUE)
-
-
-# multibox_transform_loc
-reg.register_strategy("vision.multibox_transform_loc", strategy.multibox_transform_loc_strategy)
-reg.register_pattern("vision.multibox_transform_loc", OpPattern.OPAQUE)
-
-
-# Get counts of valid boxes
-reg.register_strategy("vision.get_valid_counts", strategy.get_valid_counts_strategy)
-reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE)
-
-
-# non-maximum suppression
-reg.register_strategy("vision.non_max_suppression", strategy.nms_strategy)
-reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE)
-
-reg.register_strategy("vision.all_class_non_max_suppression", strategy.all_class_nms_strategy)
-reg.register_pattern("vision.all_class_non_max_suppression", OpPattern.OPAQUE)
-
-reg.register_strategy("vision.regular_non_max_suppression", strategy.regular_nms_strategy)
-reg.register_pattern("vision.regular_non_max_suppression", OpPattern.OPAQUE)
-
-
-@script
-def _get_valid_counts_shape_func(data_shape):
-    valid_counts_shape = output_tensor((1,), "int64")
-    out_tensor_shape = output_tensor((data_shape.shape[0],), "int64")
-    out_indices_shape = output_tensor((2,), "int64")
-
-    valid_counts_shape[0] = data_shape[0]
-    for i in const_range(data_shape.shape[0]):
-        out_tensor_shape[i] = data_shape[i]
-    out_indices_shape[0] = data_shape[0]
-    out_indices_shape[1] = data_shape[1]
-
-    return valid_counts_shape, out_tensor_shape, out_indices_shape
-
-
-@reg.register_shape_func("vision.get_valid_counts", False)
-def get_valid_counts_shape_func(attrs, inputs, _):
-    return _get_valid_counts_shape_func(inputs[0])
-
-
-@script
-def _nms_shape_func(data_shape):
-    out_shape = output_tensor((2,), "int64")
-    count_shape = output_tensor((2,), "int64")
-
-    out_shape[0] = data_shape[0]
-    out_shape[1] = data_shape[1]
-    count_shape[0] = data_shape[0]
-    count_shape[1] = int64(1)
-    return out_shape, count_shape
-
-
-@reg.register_shape_func("vision.non_max_suppression", False)
-def nms_shape_func(attrs, inputs, _):
-    if attrs.return_indices:
-        return _nms_shape_func(inputs[0])
-    return [topi.math.identity(inputs[0])]
-
-
-@script
-def _all_class_nms_shape_func_onnx(boxes_shape, scores_shape):
-    out_shape = output_tensor((2,), "int64")
-    count_shape = output_tensor((1,), "int64")
-
-    out_shape[0] = boxes_shape[0] * scores_shape[1] * boxes_shape[1]
-    out_shape[1] = int64(3)
-    count_shape[0] = int64(1)
-    return out_shape, count_shape
-
-
-@script
-def _all_class_nms_shape_func_tf(boxes_shape, scores_shape):
-    out_indices_shape = output_tensor((3,), "int64")
-    out_scores_shape = output_tensor((2,), "int64")
-    count_shape = output_tensor((1,), "int64")
-
-    out_indices_shape[0] = boxes_shape[0]
-    out_indices_shape[1] = scores_shape[1] * boxes_shape[1]
-    out_indices_shape[2] = int64(2)
-    out_scores_shape[0] = boxes_shape[0]
-    out_scores_shape[1] = scores_shape[1] * boxes_shape[1]
-    count_shape[0] = boxes_shape[0]
-
-    return out_indices_shape, out_scores_shape, count_shape
-
-
-@reg.register_shape_func("vision.all_class_non_max_suppression", False)
-def all_class_nms_shape_func(attrs, inputs, _):
-    if attrs.output_format == "onnx":
-        return _all_class_nms_shape_func_onnx(inputs[0], inputs[1])
-    return _all_class_nms_shape_func_tf(inputs[0], inputs[1])
-
-
-@script
-def _regular_nms_shape_func(boxes_shape, scores_shape, attrs):
-    out_boxes_shape = output_tensor((3,), "int64")
-    out_classes_shape = output_tensor((2,), "int64")
-    out_scores_shape = output_tensor((2,), "int64")
-    out_num_detections_shape = output_tensor((1,), "int64")
-
-    out_boxes_shape[0] = boxes_shape[0]
-    out_boxes_shape[1] = int64(attrs.max_detections)
-    out_boxes_shape[2] = int64(4)
-
-    out_classes_shape[0] = boxes_shape[0]
-    out_classes_shape[1] = int64(attrs.max_detections)
-
-    out_scores_shape[0] = boxes_shape[0]
-    out_scores_shape[1] = int64(attrs.max_detections)
-
-    out_num_detections_shape[0] = boxes_shape[0]
-
-    return out_boxes_shape, out_classes_shape, out_scores_shape, out_num_detections_shape
-
-
-@reg.register_shape_func("vision.regular_non_max_suppression", False)
-def regular_nms_shape_func(attrs, inputs, _):
-    return _regular_nms_shape_func(inputs[0], inputs[1], attrs)
-
-
-@script
-def _roi_align_shape_func_nchw(data_shape, rois_shape, pooled_size):
-    out = output_tensor((4,), "int64")
-    out[0] = rois_shape[0]
-    out[1] = data_shape[1]
-    out[2] = int64(pooled_size[0])
-    out[3] = int64(pooled_size[1])
-    return out
-
-
-@script
-def _roi_align_shape_func_nhwc(data_shape, rois_shape, pooled_size):
-    out = output_tensor((4,), "int64")
-    out[0] = rois_shape[0]
-    out[1] = int64(pooled_size[0])
-    out[2] = int64(pooled_size[1])
-    out[3] = data_shape[3]
-    return out
-
-
-@reg.register_shape_func("vision.roi_align", False)
-def roi_align_shape_func(attrs, inputs, _):
-    if attrs.layout == "NCHW":
-        return [_roi_align_shape_func_nchw(inputs[0], inputs[1], convert(attrs.pooled_size))]
-    assert attrs.layout == "NHWC", "layout must be NCHW or NHWC."
-    return [_roi_align_shape_func_nhwc(inputs[0], inputs[1], convert(attrs.pooled_size))]
diff --git a/python/tvm/relay/op/vision/_yolo.py b/python/tvm/relay/op/vision/_yolo.py
deleted file mode 100644
index 3c43cb2915f7..000000000000
--- a/python/tvm/relay/op/vision/_yolo.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Backend compiler related feature registration"""
-from __future__ import absolute_import
-from ..op import register_pattern, OpPattern
-from ..op import register_injective_schedule
-
-# reorg
-register_pattern("vision.yolo_reorg", OpPattern.INJECTIVE)
-register_injective_schedule("vision.yolo_reorg")
diff --git a/python/tvm/relay/op/vision/multibox.py b/python/tvm/relay/op/vision/multibox.py
deleted file mode 100644
index 898ff4ffeda0..000000000000
--- a/python/tvm/relay/op/vision/multibox.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Multibox operations."""
-from tvm.relay import expr
-from . import _make
-
-
-def multibox_prior(
-    data, sizes=(1.0,), ratios=(1.0,), steps=(-1.0, -1.0), offsets=(0.5, 0.5), clip=False
-):
-    """Generate prior(anchor) boxes from data, sizes and ratios.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data tensor.
-
-    sizes : tuple of float, optional
-        Tuple of sizes for anchor boxes.
-
-    ratios : tuple of float, optional
-        Tuple of ratios for anchor boxes.
-
-    steps : Tuple of float, optional
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tuple of int, optional
-        Priorbox center offsets, y and x respectively.
-
-    clip : boolean, optional
-        Whether to clip out-of-boundary boxes.
-
-    Returns
-    -------
-    out : relay.Expr
-        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
-    """
-    return _make.multibox_prior(data, sizes, ratios, steps, offsets, clip)
-
-
-def multibox_transform_loc(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    keep_background=False,
-):
-    """Location transformation for multibox detection
-
-    Parameters
-    ----------
-    cls_prob : tvm.relay.Expr
-        Class probabilities.
-
-    loc_pred : tvm.relay.Expr
-        Location regression predictions.
-
-    anchor : tvm.relay.Expr
-        Prior anchor boxes.
-
-    clip : boolean, optional
-        Whether to clip out-of-boundary boxes.
-
-    threshold : double, optional
-        Threshold to be a positive prediction.
-
-    variances : Tuple of float, optional
-        variances to be decoded from box regression output.
-
-    keep_background : boolean, optional
-        Whether to keep boxes detected as background or not.
-
-    Returns
-    -------
-    ret : tuple of tvm.relay.Expr
-    """
-    return expr.TupleWrapper(
-        _make.multibox_transform_loc(
-            cls_prob,
-            loc_pred,
-            anchor,
-            clip,
-            threshold,
-            variances,
-            keep_background,
-        ),
-        2,
-    )
diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py
deleted file mode 100644
index 4ce2c44275be..000000000000
--- a/python/tvm/relay/op/vision/nms.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Non-maximum suppression operations."""
-from tvm.relay import expr
-from . import _make
-
-
-def get_valid_counts(data, score_threshold, id_index=0, score_index=1):
-    """Get valid count of bounding boxes given a score threshold.
-    Also moves valid boxes to the top of input data.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        Input data. 3-D tensor with shape [batch_size, num_anchors, 6].
-
-    score_threshold : optional, float
-        Lower limit of score for valid bounding boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    Returns
-    -------
-    valid_count : relay.Expr
-        1-D tensor for valid number of boxes.
-
-    out_tensor : relay.Expr
-        Rearranged data tensor.
-
-    out_indices: relay.Expr
-        Indices in input data
-    """
-    if not isinstance(score_threshold, expr.Expr):
-        score_threshold = expr.const(score_threshold, "float32")
-    return expr.TupleWrapper(
-        _make.get_valid_counts(data, score_threshold, id_index, score_index), 3
-    )
-
-
-def non_max_suppression(
-    data,
-    valid_count,
-    indices,
-    max_output_size=-1,
-    iou_threshold=0.5,
-    force_suppress=False,
-    top_k=-1,
-    coord_start=2,
-    score_index=1,
-    id_index=0,
-    return_indices=True,
-    invalid_to_bottom=False,
-):
-    """Non-maximum suppression operator for object detection.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-        The last dimension should be in format of
-        [class_id, score, box_left, box_top, box_right, box_bottom]
-        or [score, box_left, box_top, box_right, box_bottom]. It could
-        be the second output out_tensor of get_valid_counts.
-
-    valid_count : relay.Expr
-        1-D tensor for valid number of boxes. It could be the output
-        valid_count of get_valid_counts.
-
-    indices: relay.Expr
-        2-D tensor with shape [batch_size, num_anchors], represents
-        the index of box in original data. It could be the third
-        output out_indices of get_valid_counts. The values in the
-        second dimension are like the output of arange(num_anchors)
-        if get_valid_counts is not used before non_max_suppression.
-
-    max_output_size : int or relay.Expr, optional
-        Max number of output valid boxes for each instance.
-        Return all valid boxes if the value of max_output_size is less than 0.
-
-    iou_threshold : float or relay.Expr, optional
-        Non-maximum suppression threshold.
-
-    force_suppress : bool, optional
-        Suppress all detections regardless of class_id.
-
-    top_k : int, optional
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : int, optional
-        The starting index of the consecutive 4 coordinates.
-
-    score_index : int, optional
-        Index of the scores/confidence of boxes.
-
-    id_index : int, optional
-        index of the class categories, -1 to disable.
-
-    return_indices : bool, optional
-        Whether to return box indices in input data.
-
-    invalid_to_bottom : bool, optional
-        Whether to move all valid bounding boxes to the top.
-
-    Returns
-    -------
-    out : relay.Expr or relay.Tuple
-        return relay.Expr if return_indices is disabled, a 3-D tensor
-        with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5].
-        If return_indices is True, return relay.Tuple of two 2-D tensors, with
-        shape [batch_size, num_anchors] and [batch_size, num_valid_anchors] respectively.
-    """
-    if not isinstance(max_output_size, expr.Expr):
-        max_output_size = expr.const(max_output_size, "int32")
-    if not isinstance(iou_threshold, expr.Expr):
-        iou_threshold = expr.const(iou_threshold, "float32")
-    out = _make.non_max_suppression(
-        data,
-        valid_count,
-        indices,
-        max_output_size,
-        iou_threshold,
-        force_suppress,
-        top_k,
-        coord_start,
-        score_index,
-        id_index,
-        return_indices,
-        invalid_to_bottom,
-    )
-    if return_indices:
-        return expr.TupleWrapper(out, 2)
-    return out
-
-
-def all_class_non_max_suppression(
-    boxes,
-    scores,
-    max_output_boxes_per_class=-1,
-    iou_threshold=-1.0,
-    score_threshold=-1.0,
-    output_format="onnx",
-):
-    """Non-maximum suppression operator for object detection, corresponding to ONNX
-    NonMaxSuppression and TensorFlow combined_non_max_suppression.
-    NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : relay.Expr
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    scores: relay.Expr
-        3-D tensor with shape (batch_size, num_classes, num_boxes)
-
-    max_output_boxes_per_class : int or relay.Expr, optional
-        The maxinum number of output selected boxes per class
-
-    iou_threshold : float or relay.Expr, optionaIl
-        IoU test threshold
-
-    score_threshold : float or relay.Expr, optional
-        Score threshold to filter out low score boxes early
-
-    output_format : string, optional
-        "onnx" or "tensorflow". Specify by which frontends the outputs are
-        intented to be consumed.
-
-    Returns
-    -------
-    out : relay.Tuple
-        If `output_format` is "onnx", the output is a relay.Tuple of two tensors, the first is
-        `indices` of size `(batch_size * num_class* num_boxes , 3)` and the second is a scalar
-        tensor `num_total_detection` of shape `(1,)` representing the total number of selected
-        boxes. The three values in `indices` encode batch, class, and box indices.
-        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come first,
-        in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
-        `batch_size * num_class* num_boxes` rows of indices,  only the first `num_total_detection`
-        rows are valid.
-
-        If `output_format` is "tensorflow", the output is a relay.Tuple of three tensors, the first
-        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
-        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
-        `(batch_size,)` representing the total number of selected boxes per batch. The two values
-        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
-        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
-        `indices` and `scores` are sorted within each class by box scores, but not across classes.
-        So the box indices and scores for the class 0 come first in a sorted order, followed by
-        the class 1 etc.
-    """
-    if not isinstance(max_output_boxes_per_class, expr.Expr):
-        max_output_boxes_per_class = expr.const(max_output_boxes_per_class, "int32")
-    if not isinstance(iou_threshold, expr.Expr):
-        iou_threshold = expr.const(iou_threshold, "float32")
-    if not isinstance(score_threshold, expr.Expr):
-        score_threshold = expr.const(score_threshold, "float32")
-
-    out = _make.all_class_non_max_suppression(
-        boxes,
-        scores,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        output_format,
-    )
-
-    if output_format == "onnx":
-        return expr.TupleWrapper(out, 2)
-
-    return expr.TupleWrapper(out, 3)
-
-
-def regular_non_max_suppression(
-    boxes,
-    scores,
-    max_detections_per_class,
-    max_detections,
-    num_classes,
-    iou_threshold,
-    score_threshold,
-):
-    """Regular non-maximum suppression operator for object detection, corresponding to TFLite's
-    regular NMS. NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : relay.Expr
-        3-D tensor with shape (batch_size, num_boxes, 4). The four values in boxes
-        encode (ymin, xmin, ymax, xmax) coordinates of a box
-
-    scores: relay.Expr
-        3-D tensor with shape (batch_size, num_boxes, num_classes_with_background)
-
-    max_detections_per_class : int
-        The maxinum number of output selected boxes per class
-
-    max_detections : int
-        The maxinum number of output selected boxes
-
-    num_classes : int
-        The number of classes without background
-
-    iou_threshold : float
-        IoU test threshold
-
-    score_threshold : float
-        Score threshold to filter out low score boxes early
-
-    Returns
-    -------
-    out : relay.Tuple
-        The output is a relay.Tuple of four tensors. The first is `detection_boxes` of size
-        `(batch_size, max_detections , 4)`, the second is `detection_classes` of size
-        `(batch_size, max_detections)`, the third is `detection_scores` of size
-        `(batch_size, max_detections)`, and the fourth is `num_detections` of size `(batch_size,)`
-        representing the total number of selected boxes per batch.
-    """
-    return expr.TupleWrapper(
-        _make.regular_non_max_suppression(
-            boxes,
-            scores,
-            max_detections_per_class,
-            max_detections,
-            num_classes,
-            iou_threshold,
-            score_threshold,
-        ),
-        4,
-    )
diff --git a/python/tvm/relay/op/vision/rcnn.py b/python/tvm/relay/op/vision/rcnn.py
deleted file mode 100644
index d25c5de89cee..000000000000
--- a/python/tvm/relay/op/vision/rcnn.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Faster R-CNN and Mask R-CNN operations."""
-from . import _make
-
-
-def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="NCHW", mode="avg"):
-    """ROI align operator.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        4-D tensor with shape [batch, channel, height, width]
-
-    rois : relay.Expr
-        2-D tensor with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : list/tuple of two ints
-        output size
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    sample_ratio : int
-        Optional sampling ratio of ROI align, using adaptive size by default.
-
-    mode : str, Optional
-        The pooling method. Relay supports two methods, 'avg' and 'max'. Default is 'avg'.
-
-    Returns
-    -------
-    output : relay.Expr
-        4-D tensor with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout, mode)
-
-
-def roi_pool(data, rois, pooled_size, spatial_scale, layout="NCHW"):
-    """ROI pool operator.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        4-D tensor with shape [batch, channel, height, width]
-
-    rois : relay.Expr
-        2-D tensor with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : list/tuple of two ints
-        output size
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    Returns
-    -------
-    output : relay.Expr
-        4-D tensor with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    return _make.roi_pool(data, rois, pooled_size, spatial_scale, layout)
-
-
-def proposal(
-    cls_prob,
-    bbox_pred,
-    im_info,
-    scales,
-    ratios,
-    feature_stride,
-    threshold,
-    rpn_pre_nms_top_n,
-    rpn_post_nms_top_n,
-    rpn_min_size,
-    iou_loss,
-):
-    """Proposal operator.
-
-    Parameters
-    ----------
-    cls_prob : relay.Expr
-        4-D tensor with shape [batch, 2 * num_anchors, height, width].
-
-    bbox_pred : relay.Expr
-        4-D tensor with shape [batch, 4 * num_anchors, height, width].
-
-    im_info : relay.Expr
-        2-D tensor with shape [batch, 3]. The last dimension should be in format of
-        [im_height, im_width, im_scale]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    threshold : float
-        Non-maximum suppression threshold.
-
-    rpn_pre_nms_top_n : int
-        Number of top scoring boxes to apply NMS. -1 to use all boxes.
-
-    rpn_post_nms_top_n : int
-        Number of top scoring boxes to keep after applying NMS to RPN proposals.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    output : relay.Expr
-        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-    """
-    return _make.proposal(
-        cls_prob,
-        bbox_pred,
-        im_info,
-        scales,
-        ratios,
-        feature_stride,
-        threshold,
-        rpn_pre_nms_top_n,
-        rpn_post_nms_top_n,
-        rpn_min_size,
-        iou_loss,
-    )
diff --git a/python/tvm/relay/op/vision/yolo.py b/python/tvm/relay/op/vision/yolo.py
deleted file mode 100644
index f556d7438e0b..000000000000
--- a/python/tvm/relay/op/vision/yolo.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Yolo operations."""
-from . import _make
-
-
-def yolo_reorg(data, stride):
-    """Yolo reorg operation used in darknet models.
-    This layer shuffles the input tensor values based on the stride value.
-    Along with the shuffling, it does the shape transform.
-    If '(n, c, h, w)' is the data shape and 's' is stride, output shape is '(n, c*s*s, h/s, w/s)'.
-
-    Example:
-
-    .. code-block:: python
-
-        data(1, 4, 2, 2) = [[[[ 0  1] [ 2  3]]
-                            [[ 4  5] [ 6  7]]
-                            [[ 8  9] [10 11]]
-                            [[12 13] [14 15]]]]
-        stride = 2
-        ret(1, 16, 1, 1) = [[[[ 0]]  [[ 2]]  [[ 8]]  [[10]]
-                            [[ 1]]  [[ 3]]  [[ 9]]  [[11]]
-                            [[ 4]]  [[ 6]]  [[12]]  [[14]]
-                            [[ 5]]  [[ 7]]  [[13]]  [[15]]]]
-
-    .. note::
-
-        stride=1 has no significance for reorg operation.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data tensor.
-
-    stride : int
-        The stride value for reorganisation.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    return _make.yolo_reorg(data, stride)
diff --git a/python/tvm/relay/op/vm/__init__.py b/python/tvm/relay/op/vm/__init__.py
deleted file mode 100644
index 7e128c9334ce..000000000000
--- a/python/tvm/relay/op/vm/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Dialect operators for Relay VM."""
-from __future__ import absolute_import as _abs
-from .vm import *
diff --git a/python/tvm/relay/op/vm/_ffi_api.py b/python/tvm/relay/op/vm/_ffi_api.py
deleted file mode 100644
index 3eeeeb811859..000000000000
--- a/python/tvm/relay/op/vm/_ffi_api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for relay.op.vm"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.op.vm", __name__)
diff --git a/python/tvm/relay/op/vm/vm.py b/python/tvm/relay/op/vm/vm.py
deleted file mode 100644
index 0fb7acec314e..000000000000
--- a/python/tvm/relay/op/vm/vm.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return,invalid-name,len-as-condition,too-many-nested-blocks
-"""Dialect operators for Relay VM."""
-from . import _ffi_api
-
-
-def shape_of(expr):
-    """Invoke a function to get the shape of a tensor.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The expr used to evaluate its tensor shape.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The expression with the evaluated tensor shape.
-    """
-    return _ffi_api.shape_of(expr)
-
-
-def invoke_tvm_op(func, inputs, outputs):
-    """Call a primitive function with the TVM operator calling convention.
-
-    Parameters
-    ----------
-    func : tvm.relay.Expr
-        The input expr.
-
-    inputs : tvm.relay.Expr
-        A tuple of the inputs to pass to the TVM function.
-
-    outputs : tvm.relay.Expr
-        A tuple of the outputs to pass to the TVM function.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The invoke_tvm_op call node.
-    """
-    return _ffi_api.invoke_tvm_op(func, inputs, outputs)
-
-
-def shape_func(func, inputs, outputs, is_inputs):
-    """Invoke the shape function of the passed function.
-
-    Parameters
-    ----------
-    func : tvm.relay.Expr
-        The primitive function from which to compute the shape function.
-
-    inputs : tvm.relay.Tuple
-        The tupled inputs.
-
-    outputs : tvm.relay.Tuple
-        The tupled outputs.
-
-    is_inputs : List[bool]
-        A boolean list indicating whether the shape function should expect
-        shape or input at each position.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The shape function expression.
-    """
-    return _ffi_api.shape_func(func, inputs, outputs, is_inputs)
-
-
-def reshape_tensor(data, shape, newshape):
-    """Invoke the VM ReshapeTensor instruction.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data.
-
-    shape : tvm.relay.Expr
-        The newshape tensor.
-
-    newshape : List[tvm.ir.PrimExpr]
-        The new shape.
-    """
-    return _ffi_api.reshape_tensor(data, shape, newshape)
diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py
deleted file mode 100644
index 2714607947f3..000000000000
--- a/python/tvm/relay/param_dict.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Helper utility to save parameter dicts."""
-import tvm.runtime
-
-
-def save_param_dict(params):
-    """Save parameter dictionary to binary bytes.
-
-    The result binary bytes can be loaded by the
-    GraphModule with API "load_params".
-
-    .. deprecated:: 0.9.0
-        Use :py:func:`tvm.runtime.save_param_dict` instead.
-
-    Parameters
-    ----------
-    params : dict of str to NDArray
-        The parameter dictionary.
-
-    Returns
-    -------
-    param_bytes: bytearray
-        Serialized parameters.
-
-    Examples
-    --------
-    .. code-block:: python
-
-       # set up the parameter dict
-       params = {"param0": arr0, "param1": arr1}
-       # save the parameters as byte array
-       param_bytes = tvm.runtime.save_param_dict(params)
-       # We can serialize the param_bytes and load it back later.
-       # Pass in byte array to module to directly set parameters
-       tvm.runtime.load_param_dict(param_bytes)
-    """
-    return tvm.runtime.save_param_dict(params)
-
-
-def load_param_dict(param_bytes):
-    """Load parameter dictionary to binary bytes.
-
-    .. deprecated:: 0.9.0
-        Use :py:func:`tvm.runtime.load_param_dict` instead.
-
-    Parameters
-    ----------
-    param_bytes: bytearray
-        Serialized parameters.
-
-    Returns
-    -------
-    params : dict of str to NDArray
-        The parameter dictionary.
-    """
-    return tvm.runtime.load_param_dict(param_bytes)
diff --git a/python/tvm/relay/parser.py b/python/tvm/relay/parser.py
deleted file mode 100644
index 5e5f00a90eea..000000000000
--- a/python/tvm/relay/parser.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""The relay parser."""
-from . import _ffi_api_parser
-
-
-def parse(source, source_name="from_string", init_module=None, init_meta_table=None):
-    if init_meta_table is None:
-        init_meta_table = {}
-    return _ffi_api_parser.ParseModuleInContext(  # type: ignore # pylint: disable=no-member
-        source_name,
-        source,
-        init_module,
-        init_meta_table,
-    )
-
-
-def parse_expr(source):
-    return _ffi_api_parser.ParseExpr("string", source)  # type: ignore # pylint: disable=no-member
-
-
-def fromtext(source, source_name="from_string"):
-    return parse(source, source_name)
-
-
-def SpanCheck():
-    """A debugging utility for reporting missing span information."""
-    return _ffi_api_parser.SpanCheck()  # type: ignore # pylint: disable=no-member
diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py
deleted file mode 100644
index 0db639a3a8e7..000000000000
--- a/python/tvm/relay/prelude.py
+++ /dev/null
@@ -1,1582 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-"""A prelude containing useful global functions and ADT definitions."""
-from tvm.ir import IRModule, TypeCall
-from tvm.tir import Any
-from tvm.relay.transform import ToANormalFormExpr
-
-from .ty import GlobalTypeVar, TensorType, scalar_type
-from .expr import Var, GlobalVar, If, const
-from .function import Function
-from .op.tensor import add, subtract, equal
-from .adt import Constructor, TypeData, Clause, Match
-from .adt import PatternConstructor, PatternVar, PatternWildcard
-from . import op, transform
-from .analysis import free_vars
-
-
-def get_tensor_array_shape(expr, dtype, prelude):
-    """Get the static shape of a tensor array if it has fixed rank shape.
-
-    By design, static ADT tensor in TVM has type name in the format
-    of static_tensor_dim0_dim1_..._dimN_t.
-
-    Parameters
-    ----------
-    expr : Relay Expr
-        Input expression.
-
-    dtype : str
-        Data type.
-
-    prelude : Prelude
-        Tensor array prelude
-
-    Returns
-    -------
-    shape : tuple of (int, Any) or None
-        The output shape. None if input tensor array
-        has dynamic shape.
-    """
-    mod = prelude.mod
-    mod["main"] = Function(free_vars(expr), expr)
-    mod = transform.InferType()(mod)
-    checked_type = mod["main"].body.checked_type
-    assert isinstance(checked_type, TypeCall), "Input must be a tensor array."
-    ta_type_str = checked_type.args[0].func.name_hint
-    static_ta_ty_start = f"static_tensor_{dtype}"
-    if ta_type_str.startswith(static_ta_ty_start):
-        shape_str = ta_type_str.replace(f"{static_ta_ty_start}_", "").replace("_t", "")
-        shape = []
-        if "scalar" not in shape_str:
-            for dim_str in shape_str.split("_"):
-                if dim_str in ["?", "any"]:
-                    shape.append(Any())
-                else:
-                    shape.append(int(dim_str))
-        return tuple(shape)
-    return None
-
-
-def _get_name_static(canonical, dtype, shape, batch_dim=None, extra_shapes=None):
-    """Get name for static shape tensor array op
-
-    By design, static ADT tensor in TVM has type name in the format
-    of static_tensor_dim0_dim1_..._dimN_t
-    or static_tensor_batch1_dim0_dim1_..._dimN_t if tensorlist stack only have one item.
-
-    Parameters
-    ----------
-    canonical : String
-        Tensor array op name
-
-    dtype : str
-        Data type.
-
-    shape : tuple of (int, Any) or None
-        Tensor array shape
-
-    batch_dim: None or int
-        1 if tensorlist stack only have one item.
-        None by default
-
-    Returns
-    -------
-    name : String
-        The tensor array op name
-    """
-    shape_str = _to_str(shape)
-
-    if extra_shapes is not None:
-        for n, s in extra_shapes.items():
-            extra_shape_str = f"_{n}_{_to_str(s)}"
-            shape_str += extra_shape_str
-
-    if len(shape_str) == 0:
-        shape_str = "scalar"
-    if canonical == "tensor_t":
-        return f"static_tensor_{dtype}_{shape_str}_t"
-    if batch_dim is None or canonical in ["tensor_constructor", "tensor_nil"]:
-        return f"{canonical}_{dtype}_{shape_str}"
-    if batch_dim != 1:
-        return f"{canonical}_{dtype}_{shape_str}"
-    return f"{canonical}_{dtype}_batch{batch_dim}_{shape_str}"
-
-
-def _to_str(shape):
-    dim_names = []
-    for dim in shape:
-        if isinstance(dim, Any):
-            dim_names.append("any")
-        else:
-            dim_names.append(str(dim))
-    return "_".join(dim_names)
-
-
-class StaticTensorArrayOps(object):
-    """Contains tensor array related ops for fixed rank tensor array"""
-
-    def __init__(self, prelude, dtype, shape, batch_dim=None):
-        """Create tensor array ops registry"""
-        self.prelude = prelude
-        self.dtype = dtype
-        self.shape = shape
-        self.batch_dim = batch_dim
-        self.list, self.cons, self.nil = self.prelude.mod.get_type("List")
-
-    def get_name(self, canonical, extra_shapes=None):
-        """Get name corresponding to the canonical name"""
-        return _get_name_static(canonical, self.dtype, self.shape, self.batch_dim, extra_shapes)
-
-    def get_global_var(self, canonical):
-        """Get global corresponding to the canonical name"""
-        return self.prelude.get_global_var_static(canonical, self.dtype, self.shape, self.batch_dim)
-
-    def get_type(self, canonical):
-        """Get type corresponding to the canonical name"""
-        return self.prelude.get_type_static(canonical, self.dtype, self.shape)
-
-    def get_ctor(self, canonical):
-        """Get ctor corresponding to the canonical name"""
-        return self.prelude.get_ctor_static("tensor_t", canonical, self.dtype, self.shape)
-
-    def define_tensor_adt(self):
-        """Defines the static tensor ADT, which is the container for tensors
-        with fixed shapes."""
-        tensor_type_name = self.get_name("tensor_t")
-
-        # This is effectively functioning as a monomorphizer.
-        # TODO(@jroesch): we should add full shape polymoprhism
-        # and do monomorphization.
-        #
-        # Skip register if tensor type is already registered.
-        global_type_names = set()
-        for g_ty_var in self.prelude.mod.get_global_type_vars():
-            global_type_names.add(g_ty_var.name_hint)
-
-        if tensor_type_name in global_type_names:
-            self.tensor_type_var = self.get_type("tensor_t")
-            return
-
-        self.tensor_type_var = GlobalTypeVar(tensor_type_name)
-
-        tensor_type = TensorType(self.shape, self.dtype)
-        tensor_constructor_name = self.get_name("tensor_constructor")
-
-        tensor_nil_name = self.get_name("tensor_nil")
-        tensor_nil_case = Constructor(tensor_nil_name, [], self.tensor_type_var)
-        tensor_case = Constructor(tensor_constructor_name, [tensor_type], self.tensor_type_var)
-
-        self.prelude.mod[self.tensor_type_var] = TypeData(
-            self.tensor_type_var, [], [tensor_nil_case, tensor_case]
-        )
-
-    def define_tensor_array(self):
-        """Defines a function to create a tensor array with size n.
-        tensor_array(n) : Tensor[(), int32] -> list[tensor_t]
-        """
-        tensor_array_constructor_name = self.get_name("tensor_array")
-        tensor_array_constructor_var = self._create_global_var(tensor_array_constructor_name)
-
-        tensor_nil_var = self.get_ctor("tensor_nil")
-        tensor_type_var = self.get_ctor("tensor_t")
-        n = Var("x", scalar_type("int32"))
-        body = If(
-            equal(n, const(0)),
-            self.nil(),
-            self.cons(tensor_nil_var(), tensor_array_constructor_var(subtract(n, const(1)))),
-        )
-        self.prelude.mod[tensor_array_constructor_var] = Function(
-            [n], body, self.list(tensor_type_var()), []
-        )
-
-    def define_tensor_take(self):
-        """Defines a function to return a range of tensor_t on axis 0.
-        tensor_take(t, lower, upper) :
-        tensor_t -> Tensor[(), int32] -> Tensor[(), int32] -> tensor_t
-        """
-        # We don't register take for scalar tensor.
-        ndim = len(self.shape)
-        if ndim == 0:
-            return
-
-        take_name = self.get_name("tensor_take")
-
-        if self.is_cached(take_name):
-            return
-
-        take_var = GlobalVar(take_name)
-
-        origin_tensor_constructor = self.get_ctor("tensor_constructor")
-
-        output_shape = [Any()] + list(self.shape[1:])
-        tensor_type_var, tensor_constructor, _ = self._get_adt_by_shape(output_shape)
-
-        t = Var("tensor", self.tensor_type_var())
-        lower = Var("lower", scalar_type("int32"))
-        upper = Var("upper", scalar_type("int32"))
-        tvar = Var("t")
-        case = Clause(
-            PatternConstructor(origin_tensor_constructor, [PatternVar(tvar)]),
-            tensor_constructor(op.take(tvar, op.arange(lower, upper, dtype="int32"), axis=0)),
-        )
-        self.prelude.mod[take_var] = Function(
-            [t, lower, upper], Match(t, [case], False), tensor_type_var(), []
-        )
-
-    def define_tensor_concatenate(self):
-        """Defines a function to concatenate two tensor_t on axis 0.
-        tensor_concatenate(t) : tensor_t -> tensor_t -> tensor_t
-        """
-        # We don't register concatenate for scalar tensor.
-        ndim = len(self.shape)
-        if ndim == 0:
-            return
-
-        concat_name = self.get_name("tensor_concatenate")
-        concat_var = GlobalVar(concat_name)
-        if self.is_cached(concat_name):
-            return
-
-        output_shape = [Any()] + list(self.shape[1:])
-        tensor_type_var, tensor_constructor, _ = self._get_adt_by_shape(output_shape)
-
-        origin_tensor_constructor = self.get_ctor("tensor_constructor")
-        origin_tensor_type_var = self.tensor_type_var
-        x = Var("x", origin_tensor_type_var())
-        y = Var("y", origin_tensor_type_var())
-        t1 = Var("t1")
-        t2 = Var("t2")
-
-        case = Clause(
-            PatternConstructor(origin_tensor_constructor, [PatternVar(t1)]),
-            Match(
-                y,
-                [
-                    Clause(
-                        PatternConstructor(origin_tensor_constructor, [PatternVar(t2)]),
-                        tensor_constructor(op.concatenate([t1, t2], axis=0)),
-                    )
-                ],
-                False,
-            ),
-        )
-
-        self.prelude.mod[concat_var] = Function(
-            [x, y], Match(x, [case], False), tensor_type_var(), []
-        )
-
-    def define_tensor_expand_dims(self):
-        """Defines a function to grow a tensor_t's rank by adding one dimension in front
-        of the original tensor_t.
-        tensor_expand_dims(t) : tensor_t -> tensor_t
-        """
-        expand_dims_name = self.get_name("tensor_expand_dims")
-        expand_dims_var = self._create_global_var(expand_dims_name)
-        setattr(self.prelude, expand_dims_name, expand_dims_var)
-        origin_tensor_type_var = self.tensor_type_var
-        origin_tensor_constructor = self.get_ctor("tensor_constructor")
-        x = Var("x", origin_tensor_type_var())
-
-        # Note: we set the added axis to be Any() instead of 1 due to
-        # in stack op, we need to recursively concatenate.
-        new_axis = Any() if self.batch_dim is None or self.batch_dim != 1 else self.batch_dim
-        tensor_type_var, tensor_constructor, _ = self._get_adt_by_shape(
-            [new_axis] + list(self.shape)
-        )
-        t = Var("t")
-        case = Clause(
-            PatternConstructor(origin_tensor_constructor, [PatternVar(t)]),
-            tensor_constructor(op.expand_dims(t, 0, 1)),
-        )
-
-        self.prelude.mod[expand_dims_var] = Function(
-            [x], Match(x, [case], False), tensor_type_var(), []
-        )
-
-    def define_tensor_array_read(self):
-        """Defines a function to get the nth element of a list. Assume the list has at least one
-        element.
-        tensor_array_read(ta, n) : list[static_tensor_t] -> Tensor[(), int32] ->
-        Tensor[self.shape, self.dtype]
-        """
-        read_name = self.get_name("tensor_array_read")
-
-        if self.is_cached(read_name):
-            return
-
-        read_var = GlobalVar(read_name)
-
-        tensor_array = Var("tensor_array", self.list(self.tensor_type_var()))
-        n = Var("x", scalar_type("int32"))
-        self.prelude.mod[read_var] = Function(
-            [tensor_array, n], self.prelude.nth(tensor_array, n), self.tensor_type_var(), []
-        )
-
-    def is_cached(self, name):
-        try:
-            self.prelude.mod.get_global_var(name)
-            return True
-        except ValueError:
-            return False
-
-    def define_tensor_array_write(self):
-        """Defines a function to update a tensor array at index n with value v.
-        tensor_array_write(ta, n, v) :
-            list[static_tensor_t] -> Tensor[(), int32] -> Tensor[self.shape, self.dtype] ->
-            list[static_tensor_t]
-        """
-        write_name = self.get_name("tensor_array_write")
-        if self.is_cached(write_name):
-            return
-
-        write_var = GlobalVar(write_name)
-        tensor_array = Var("tensor_array", self.list(self.tensor_type_var()))
-        n = Var("x", scalar_type("int32"))
-        v = Var("v", self.tensor_type_var())
-        self.prelude.mod[write_var] = Function(
-            [tensor_array, n, v],
-            self.prelude.update(tensor_array, n, v),
-            self.list(self.tensor_type_var()),
-            [],
-        )
-
-    def define_tensor_array_unstack(self):
-        """Defines a function to unstack the values of a tensor_t in a tensor array.
-        tensor_array_unstack_tensor(t) : tensor_t -> list[tensor_t]
-        """
-        ndim = len(self.shape)
-        # We don't register unstack for scalar tensor array
-        if ndim == 0:
-            return
-
-        helper_name = self.get_name("tensor_array_unstack_helper")
-        helper_var = self._create_global_var(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor = Var("t", TensorType(self.shape, self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-        tensor_var = Var("tensor", TensorType(self.shape, self.dtype))
-
-        reduced_tensor_type_var, tensor_constructor, _ = self._get_adt_by_shape(self.shape[1:])
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(
-                tensor_constructor(op.take(tensor, i, axis=0)),
-                helper_var(add(i, const(1)), up, tensor),
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(reduced_tensor_type_var()), []
-        )
-
-        unstack_name = self.get_name("tensor_array_unstack")
-        unstack_var = self._create_global_var(unstack_name)
-        setattr(self.prelude, unstack_name, unstack_var)
-        shape = op.shape_of(tensor_var)
-        unstack_length = op.take(shape, const(0))
-        self.prelude.mod[unstack_var] = Function(
-            [tensor_var],
-            helper_var(const(0), unstack_length, tensor_var),
-            self.list(reduced_tensor_type_var()),
-            [],
-        )
-
-    def define_tensor_array_scatter(self, indices_shape=None, force_update=False):
-        """Defines a function to scatter the values of a tensor_t in indices of a tensor array.
-        tensor_array_scatter(ta, indices, value) :
-            list[tensor_t] -> Tensor[(Any), int32] -> tensor_t -> list[tensor_t]
-
-        Set static indices shape by specifying indices_shape.
-        Set force_update to get static indices shape operator.
-        """
-        # When this operator has already been registered, only update
-        # when force_update is set. This should be used only when we need to
-        # redefine this op for static indices shape.
-
-        extra_shapes = {"indices": indices_shape} if indices_shape is not None else None
-        tensor_array_scatter_name = self.get_name("tensor_array_scatter", extra_shapes)
-        if hasattr(self.prelude, tensor_array_scatter_name) and not force_update:
-            return
-
-        tensor_array_scatter_helper_name = self.get_name(
-            "tensor_array_scatter_helper", extra_shapes
-        )
-
-        tensor_array_scatter_helper_var = self._create_global_var(tensor_array_scatter_helper_name)
-        ta = Var("ta", self.list(self.tensor_type_var()))
-        current = Var("current", scalar_type("int32"))
-        limit = Var("limit", scalar_type("int32"))
-        indices_ = Var("indices_", TensorType(indices_shape or [Any()], "int32"))
-        values_ = Var("values_", self.list(self.tensor_type_var()))
-        write_var = self.get_global_var("tensor_array_write")
-        read_var = self.get_global_var("tensor_array_read")
-        helper_body = If(
-            equal(current, limit),
-            ta,
-            tensor_array_scatter_helper_var(
-                write_var(ta, op.take(indices_, current), read_var(values_, current)),
-                add(current, const(1)),
-                limit,
-                indices_,
-                values_,
-            ),
-        )
-        self.prelude.mod[tensor_array_scatter_helper_var] = Function(
-            [ta, current, limit, indices_, values_],
-            helper_body,
-            self.list(self.tensor_type_var()),
-            [],
-        )
-
-        tensor_array_scatter_var = self._create_global_var(tensor_array_scatter_name)
-        setattr(self.prelude, tensor_array_scatter_name, tensor_array_scatter_var)
-        tensor_array = Var("tensor_array", self.list(self.tensor_type_var()))
-
-        indices = Var("indices", TensorType(indices_shape or [Any()], "int32"))
-        values = Var("values", self.list(self.tensor_type_var()))
-        if indices_shape is None:
-            indices_shape = op.shape_of(indices)
-            limit = op.take(indices_shape, const(0))
-        else:
-            limit = const(indices_shape[0])
-
-        body = tensor_array_scatter_helper_var(tensor_array, const(0), limit, indices, values)
-        self.prelude.mod[tensor_array_scatter_var] = Function(
-            [tensor_array, indices, values], body, self.list(self.tensor_type_var()), []
-        )
-
-    def define_tensor_array_split(self, value_shape=None, lengths_shape=None, force_update=False):
-        """Defines a function to split the values of a tensor_t into a tensor array.
-        tensor_array_split(ta, value, lengths) :
-            list[tensor_t] -> tensor_t -> Tensor[(Any), int32] -> list[tensor_t]
-
-        Set static value and lengths shapes by specifying value_shape and lengths_shape.
-        Set force_update to get static value and lengths shape operator.
-        """
-        # Skip scalar case
-        ndim = len(self.shape)
-        if ndim == 0:
-            return
-
-        # When this operator has already been registered, only update
-        # when force_update is set. This should be used only when we need to
-        # redefine this op for static value/indices shape.
-        split_name = self.get_name("tensor_array_split")
-
-        if self.is_cached(split_name):
-            if not force_update:
-                return
-            tensor_array_split_helper_var = self.get_global_var("ta_split_helper")
-            split_var = self.get_global_var("tensor_array_split")
-        else:
-            tensor_array_split_helper_name = self.get_name("ta_split_helper")
-            tensor_array_split_helper_var = GlobalVar(tensor_array_split_helper_name)
-            split_var = GlobalVar(split_name)
-
-        output_shape = [Any()] + list(self.shape[1:])
-        output_tensor_type_var, _, output_ops = self._get_adt_by_shape(output_shape)
-        output_ops.define_tensor_array_write()
-        write_var = output_ops.get_global_var("tensor_array_write")
-
-        if value_shape is None:
-            value_type_var = self.tensor_type_var
-            take_var = self.get_global_var("tensor_take")
-        else:
-            value_type_var, _, value_adts = self._get_adt_by_shape(value_shape)
-            value_adts.define_tensor_take()
-            take_var = value_adts.get_global_var("tensor_take")
-
-        ta1 = Var("tensor_array", self.list(output_tensor_type_var()))
-        value1 = Var("value1", value_type_var())
-        offset1 = Var("offset1", scalar_type("int32"))
-        current1 = Var("current1", scalar_type("int32"))
-        limit1 = Var("limit1", scalar_type("int32"))
-        lengths1 = Var("lengths", TensorType(lengths_shape or [Any()], "int32"))
-
-        helper1_body = If(
-            equal(current1, limit1),
-            ta1,
-            write_var(
-                tensor_array_split_helper_var(
-                    ta1,
-                    value1,
-                    add(offset1, op.take(lengths1, current1)),
-                    add(current1, const(1)),
-                    limit1,
-                    lengths1,
-                ),
-                current1,
-                take_var(value1, offset1, add(op.take(lengths1, current1), offset1)),
-            ),
-        )
-
-        self.prelude.mod[tensor_array_split_helper_var] = Function(
-            [ta1, value1, offset1, current1, limit1, lengths1],
-            helper1_body,
-            self.list(output_tensor_type_var()),
-            [],
-        )
-        tensor_array = Var("tensor_array", self.list(output_tensor_type_var()))
-
-        value = Var("value", value_type_var())
-        lengths = Var("lengths", TensorType(lengths_shape or [Any()], "int32"))
-        if lengths_shape is None:
-            lengths_shape = op.shape_of(lengths)
-            lengths_limit = op.take(lengths_shape, const(0))
-        else:
-            lengths_limit = const(lengths_shape[0])
-        body = tensor_array_split_helper_var(
-            tensor_array, value, const(0), const(0), lengths_limit, lengths
-        )
-
-        self.prelude.mod[split_var] = Function(
-            [tensor_array, value, lengths], body, self.list(output_tensor_type_var()), []
-        )
-
-    def define_tensor_array_concat(self):
-        """Defines a function to return the values in the tensor array as concatenated tensor_t.
-        tensor_array_concat(ta) : list[tensor_t] -> tensor_t
-        """
-        # We don't register concat for scalar tensor array.
-        ndim = len(self.shape)
-        if ndim == 0:
-            return
-
-        concat_name = self.get_name("tensor_array_concat")
-
-        if self.is_cached(concat_name):
-            return
-
-        concat_var = GlobalVar(concat_name)
-
-        output_shape = [Any()] + list(self.shape[1:])
-
-        tensor_type_var, _, output_ops = self._get_adt_by_shape(output_shape)
-
-        # Register tensor concatenate and get tensor_nil var for output shape
-        output_ops.define_tensor_concatenate()
-        tensor_concat_var = output_ops.get_global_var("tensor_concatenate")
-        tensor_nil_var = output_ops.get_ctor("tensor_nil")
-
-        tensor_array = Var("tensor_array", self.list(tensor_type_var()))
-        hd = Var("hd")
-        tl = Var("tl")
-        nil_case = Clause(PatternConstructor(self.nil), tensor_nil_var())
-        cons_case = Clause(
-            PatternConstructor(self.cons, [PatternVar(hd), PatternVar(tl)]),
-            Match(
-                tl,
-                [
-                    Clause(PatternConstructor(self.nil), hd),
-                    Clause(PatternWildcard(), tensor_concat_var(hd, concat_var(tl))),
-                ],
-                False,
-            ),
-        )
-        self.prelude.mod[concat_var] = Function(
-            [tensor_array], Match(tensor_array, [nil_case, cons_case], False), tensor_type_var(), []
-        )
-
-    def define_tensor_array_stack(self):
-        """Defines a function to get the values in the tensor array as a stack tensor_t.
-        tensor_array_stack(l) : list[tensor_t] -> tensor_t
-        """
-        stack_name = self.get_name("tensor_array_stack")
-        stack_var = self._create_global_var(stack_name)
-        setattr(self.prelude, stack_name, stack_var)
-        tensor_array = Var("tensor_array", self.list(self.tensor_type_var()))
-        expand_dims_var = self.get_global_var("tensor_expand_dims")
-
-        # Register tensor_concatenate for output_shape
-        new_axis = Any() if not self.batch_dim or self.batch_dim != 1 else self.batch_dim
-        output_shape = [new_axis] + list(self.shape)
-        _, _, output_ops = self._get_adt_by_shape(output_shape)
-        output_ops.define_tensor_concatenate()
-        concat_var = output_ops.get_global_var("tensor_concatenate")
-
-        tensor_array_expand_dims = self.prelude.map(expand_dims_var, tensor_array)
-        if self.batch_dim is not None and self.batch_dim == 1:
-            # only one element
-            tensors = self.prelude.id(self.prelude.hd(tensor_array_expand_dims))
-        else:
-            tensors = self.prelude.foldl(
-                concat_var,
-                self.prelude.hd(tensor_array_expand_dims),
-                self.prelude.tl(tensor_array_expand_dims),
-            )
-
-        output_tensor_type_var, _, _ = self._get_adt_by_shape(output_shape)
-        self.prelude.mod[stack_var] = Function(
-            [tensor_array], tensors, output_tensor_type_var(), []
-        )
-
-    def define_tensor_array_gather(self):
-        """Defines a function to return the selected values in a tensor array as tensor_t.
-        tensor_array_gather(ta, indices) : list[tensor_t] -> Tensor[(Any), int32] -> tensor_t
-        """
-        helper_name = self.get_name("tensor_array_gather_helper")
-        helper_var = self._create_global_var(helper_name)
-
-        new_axis = Any() if self.batch_dim is None or self.batch_dim != 1 else self.batch_dim
-        output_shape = [new_axis] + list(self.shape)
-        output_tensor_type_var, _, _ = self._get_adt_by_shape(output_shape)
-        stack_var = self.get_global_var("tensor_array_stack")
-        read_var = self.get_global_var("tensor_array_read")
-        ta = Var("ta", self.list(self.tensor_type_var()))
-        accu = Var("accu", self.list(self.tensor_type_var()))
-        current = Var("current", scalar_type("int32"))
-        limit = Var("limit", scalar_type("int32"))
-        indices_ = Var("indices_", TensorType([Any()], "int32"))
-        helper_body = If(
-            equal(current, const(0)),
-            stack_var(accu),
-            helper_var(
-                ta,
-                self.cons(read_var(ta, op.take(indices_, subtract(current, const(1)))), accu),
-                subtract(current, const(1)),
-                limit,
-                indices_,
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [ta, accu, current, limit, indices_], helper_body, output_tensor_type_var(), []
-        )
-        gather_name = self.get_name("tensor_array_gather")
-        gather_var = self._create_global_var(gather_name)
-
-        tensor_array = Var("tensor_array", self.list(self.tensor_type_var()))
-        indices = Var("indices", TensorType([Any()], "int32"))
-        indices_shape = op.shape_of(indices)
-        limit = op.take(indices_shape, const(0))
-        body = helper_var(tensor_array, self.nil(), limit, limit, indices)
-        self.prelude.mod[gather_var] = Function(
-            [tensor_array, indices], body, output_tensor_type_var(), []
-        )
-
-    def define_tensor_get_data(self):
-        """Defines a function to get a Tensor from tensor_t with given shape."""
-        tensor_get_data_name = self.get_name("tensor_get_data")
-        tensor_get_data_var = self._create_global_var(tensor_get_data_name)
-
-        tensor_constructor = self.get_ctor("tensor_constructor")
-        t = Var("tensor", self.tensor_type_var())
-        tvar = Var("t")
-        case = Clause(PatternConstructor(tensor_constructor, [PatternVar(tvar)]), tvar)
-        self.prelude.mod[tensor_get_data_var] = Function(
-            [t], Match(t, [case], False), TensorType(self.shape, self.dtype), []
-        )
-
-    def register(self):
-        """Register all tensor array ops in Prelude"""
-        self.define_tensor_adt()
-        self.define_tensor_take()
-        self.define_tensor_concatenate()
-        self.define_tensor_expand_dims()
-        self.define_tensor_array()
-        self.define_tensor_array_read()
-        self.define_tensor_array_write()
-        self.define_tensor_array_unstack()
-        self.define_tensor_array_scatter()
-        self.define_tensor_array_split()
-        self.define_tensor_array_concat()
-        self.define_tensor_array_stack()
-        self.define_tensor_array_gather()
-        self.define_tensor_get_data()
-
-    def _get_adt_by_shape(self, shape):
-        """Get ADT type and constructor with given shape."""
-        adt_ops = StaticTensorArrayOps(self.prelude, self.dtype, shape, self.batch_dim)
-        adt_ops.define_tensor_adt()
-        tensor_type_var = adt_ops.get_type("tensor_t")
-        tensor_constructor = adt_ops.get_ctor("tensor_constructor")
-        return tensor_type_var, tensor_constructor, adt_ops
-
-    def _create_global_var(self, name):
-        """Create a GlobalVar if doesn't exist in prelude."""
-        global_var_name_set = set()
-        for g_var_name in self.prelude.mod.get_global_vars():
-            global_var_name_set.add(g_var_name.name_hint)
-        if name not in global_var_name_set:
-            gvar = GlobalVar(name)
-        else:
-            gvar = self.prelude.mod.get_global_var(name)
-
-        return gvar
-
-
-class TensorArrayOps(object):
-    """Contains tensor array related ops"""
-
-    def __init__(self, prelude, dtype):
-        """Create tensor array ops registry"""
-        self.prelude = prelude
-        self.dtype = dtype
-        self.list, self.cons, self.nil = self.prelude.mod.get_type("List")
-
-    def get_name(self, canonical):
-        """Get name corresponding to the canonical name"""
-        return self.prelude.get_name(canonical, self.dtype)
-
-    def get_global_var(self, canonical):
-        """Get global corresponding to the canonical name"""
-        return self.prelude.get_global_var(canonical, self.dtype)
-
-    def get_type(self, canonical):
-        """Get type corresponding to the canonical name"""
-        return self.prelude.get_type(canonical, self.dtype)
-
-    def get_ctor(self, canonical):
-        """Get ctor corresponding to the canonical name"""
-        return self.prelude.get_ctor(self.tensor_type_var.name_hint, canonical, self.dtype)
-
-    def define_tensor_adt(self):
-        """Defines the dynamic tensor ADT, which is the container for tensors
-        with variable shapes."""
-        tensor_type_name = self.get_name("tensor_t")
-        self.tensor_type_var = tensor_type_var = GlobalTypeVar(tensor_type_name)
-
-        tensor0_type = TensorType([], self.dtype)
-        tensor1_type = TensorType([Any()], self.dtype)
-        tensor2_type = TensorType([Any(), Any()], self.dtype)
-        tensor3_type = TensorType([Any(), Any(), Any()], self.dtype)
-        tensor4_type = TensorType([Any(), Any(), Any(), Any()], self.dtype)
-        tensor5_type = TensorType([Any(), Any(), Any(), Any(), Any()], self.dtype)
-        tensor6_type = TensorType([Any(), Any(), Any(), Any(), Any(), Any()], self.dtype)
-        tensor_nil_name = self.get_name("tensor_nil")
-        tensor0_name = self.get_name("tensor0")
-        tensor1_name = self.get_name("tensor1")
-        tensor2_name = self.get_name("tensor2")
-        tensor3_name = self.get_name("tensor3")
-        tensor4_name = self.get_name("tensor4")
-        tensor5_name = self.get_name("tensor5")
-        tensor6_name = self.get_name("tensor6")
-        tensor_nil_case = Constructor(tensor_nil_name, [], tensor_type_var)
-        tensor0_case = Constructor(tensor0_name, [tensor0_type], tensor_type_var)
-        tensor1_case = Constructor(tensor1_name, [tensor1_type], tensor_type_var)
-        tensor2_case = Constructor(tensor2_name, [tensor2_type], tensor_type_var)
-        tensor3_case = Constructor(tensor3_name, [tensor3_type], tensor_type_var)
-        tensor4_case = Constructor(tensor4_name, [tensor4_type], tensor_type_var)
-        tensor5_case = Constructor(tensor5_name, [tensor5_type], tensor_type_var)
-        tensor6_case = Constructor(tensor6_name, [tensor6_type], tensor_type_var)
-
-        self.prelude.mod[tensor_type_var] = TypeData(
-            tensor_type_var,
-            [],
-            [
-                tensor_nil_case,
-                tensor0_case,
-                tensor1_case,
-                tensor2_case,
-                tensor3_case,
-                tensor4_case,
-                tensor5_case,
-                tensor6_case,
-            ],
-        )
-
-    def define_tensor_take(self):
-        """Defines a function to return a range of tensor_t on axis 0.
-        tensor_take(t, lower, upper) :
-        tensor_t -> Tensor[(), int32] -> Tensor[(), int32] -> tensor_t
-        """
-        take_name = self.get_name("tensor_take")
-        take_var = GlobalVar(take_name)
-
-        tensor_t = self.tensor_type_var
-        tensor1_var = self.get_ctor("tensor1")
-        tensor2_var = self.get_ctor("tensor2")
-        tensor3_var = self.get_ctor("tensor3")
-        tensor4_var = self.get_ctor("tensor4")
-        tensor5_var = self.get_ctor("tensor5")
-        tensor6_var = self.get_ctor("tensor6")
-
-        t = Var("tensor", tensor_t())
-        lower = Var("lower", scalar_type("int32"))
-        upper = Var("upper", scalar_type("int32"))
-        t1 = Var("t1")
-        t2 = Var("t2")
-        t3 = Var("t3")
-        t4 = Var("t4")
-        t5 = Var("t5")
-        t6 = Var("t6")
-        tensor1_case = Clause(
-            PatternConstructor(tensor1_var, [PatternVar(t1)]),
-            tensor1_var(op.take(t1, op.arange(lower, upper, dtype="int32"))),
-        )
-        tensor2_case = Clause(
-            PatternConstructor(tensor2_var, [PatternVar(t2)]),
-            tensor2_var(op.take(t2, op.arange(lower, upper, dtype="int32"), axis=0)),
-        )
-        tensor3_case = Clause(
-            PatternConstructor(tensor3_var, [PatternVar(t3)]),
-            tensor3_var(op.take(t3, op.arange(lower, upper, dtype="int32"), axis=0)),
-        )
-        tensor4_case = Clause(
-            PatternConstructor(tensor4_var, [PatternVar(t4)]),
-            tensor4_var(op.take(t4, op.arange(lower, upper, dtype="int32"), axis=0)),
-        )
-        tensor5_case = Clause(
-            PatternConstructor(tensor5_var, [PatternVar(t5)]),
-            tensor5_var(op.take(t5, op.arange(lower, upper, dtype="int32"), axis=0)),
-        )
-        tensor6_case = Clause(
-            PatternConstructor(tensor6_var, [PatternVar(t6)]),
-            tensor6_var(op.take(t6, op.arange(lower, upper, dtype="int32"), axis=0)),
-        )
-        self.prelude.mod[take_var] = Function(
-            [t, lower, upper],
-            Match(
-                t,
-                [
-                    tensor1_case,
-                    tensor2_case,
-                    tensor3_case,
-                    tensor4_case,
-                    tensor5_case,
-                    tensor6_case,
-                ],
-                False,
-            ),
-            tensor_t(),
-            [],
-        )
-
-    def define_tensor_expand_dims(self):
-        """Defines a function to grow a tensor_t's rank by adding one dimension in front
-        of the original tensor_t.
-        tensor_expand_dims(t) : tensor_t -> tensor_t
-        """
-        expand_dims_name = self.get_name("tensor_expand_dims")
-        expand_dims_var = GlobalVar(expand_dims_name)
-        tensor_type_var = self.tensor_type_var
-
-        x = Var("x", tensor_type_var())
-        t0 = Var("t0")
-        t1 = Var("t1")
-        t2 = Var("t2")
-        t3 = Var("t3")
-        t4 = Var("t4")
-        t5 = Var("t5")
-        tensor0_var = self.get_ctor("tensor0")
-        tensor1_var = self.get_ctor("tensor1")
-        tensor2_var = self.get_ctor("tensor2")
-        tensor3_var = self.get_ctor("tensor3")
-        tensor4_var = self.get_ctor("tensor4")
-        tensor5_var = self.get_ctor("tensor5")
-        tensor6_var = self.get_ctor("tensor6")
-        tensor0_case = Clause(
-            PatternConstructor(tensor0_var, [PatternVar(t0)]), tensor1_var(op.expand_dims(t0, 0, 1))
-        )
-        tensor1_case = Clause(
-            PatternConstructor(tensor1_var, [PatternVar(t1)]), tensor2_var(op.expand_dims(t1, 0, 1))
-        )
-        tensor2_case = Clause(
-            PatternConstructor(tensor2_var, [PatternVar(t2)]), tensor3_var(op.expand_dims(t2, 0, 1))
-        )
-        tensor3_case = Clause(
-            PatternConstructor(tensor3_var, [PatternVar(t3)]), tensor4_var(op.expand_dims(t3, 0, 1))
-        )
-        tensor4_case = Clause(
-            PatternConstructor(tensor4_var, [PatternVar(t4)]), tensor5_var(op.expand_dims(t4, 0, 1))
-        )
-        tensor5_case = Clause(
-            PatternConstructor(tensor5_var, [PatternVar(t5)]), tensor6_var(op.expand_dims(t5, 0, 1))
-        )
-        self.prelude.mod[expand_dims_var] = Function(
-            [x],
-            Match(
-                x,
-                [
-                    tensor0_case,
-                    tensor1_case,
-                    tensor2_case,
-                    tensor3_case,
-                    tensor4_case,
-                    tensor5_case,
-                ],
-                False,
-            ),
-            tensor_type_var(),
-        )
-
-    def define_tensor_concat(self):
-        """Defines a function to concatenate two tensor_t on the first axis
-
-        tensor_concatenate(t) : tensor_t -> tensor_t -> tensor_t
-        """
-        concat_name = self.get_name("tensor_concatenate")
-        concat_var = GlobalVar(concat_name)
-
-        tensor_type_var = self.tensor_type_var
-        x = Var("x", tensor_type_var())
-        y = Var("y", tensor_type_var())
-
-        tensor1_var = self.get_ctor("tensor1")
-        tensor2_var = self.get_ctor("tensor2")
-        tensor3_var = self.get_ctor("tensor3")
-        tensor4_var = self.get_ctor("tensor4")
-        t11 = Var("t11")
-        t12 = Var("t12")
-        t21 = Var("t21")
-        t22 = Var("t22")
-        t31 = Var("t31")
-        t32 = Var("t32")
-        t41 = Var("t41")
-        t42 = Var("t42")
-        tensor1_case = Clause(
-            PatternConstructor(tensor1_var, [PatternVar(t11)]),
-            Match(
-                y,
-                [
-                    Clause(
-                        PatternConstructor(tensor1_var, [PatternVar(t12)]),
-                        tensor1_var(op.concatenate([t11, t12], axis=0)),
-                    )
-                ],
-                False,
-            ),
-        )
-        tensor2_case = Clause(
-            PatternConstructor(tensor2_var, [PatternVar(t21)]),
-            Match(
-                y,
-                [
-                    Clause(
-                        PatternConstructor(tensor2_var, [PatternVar(t22)]),
-                        tensor2_var(op.concatenate([t21, t22], axis=0)),
-                    )
-                ],
-                False,
-            ),
-        )
-        tensor3_case = Clause(
-            PatternConstructor(tensor3_var, [PatternVar(t31)]),
-            Match(
-                y,
-                [
-                    Clause(
-                        PatternConstructor(tensor3_var, [PatternVar(t32)]),
-                        tensor3_var(op.concatenate([t31, t32], axis=0)),
-                    )
-                ],
-                False,
-            ),
-        )
-        tensor4_case = Clause(
-            PatternConstructor(tensor4_var, [PatternVar(t41)]),
-            Match(
-                y,
-                [
-                    Clause(
-                        PatternConstructor(tensor4_var, [PatternVar(t42)]),
-                        tensor4_var(op.concatenate([t41, t42], axis=0)),
-                    )
-                ],
-                False,
-            ),
-        )
-        # op.concatenate does not support tensor with rank higher than 4
-        self.prelude.mod[concat_var] = Function(
-            [x, y],
-            Match(x, [tensor1_case, tensor2_case, tensor3_case, tensor4_case], False),
-            tensor_type_var(),
-        )
-
-    def define_tensor_array(self):
-        """Defines a function to create a tensor array with size n.
-        tensor_array(n) : Tensor[(), int32] -> list[tensor_t]
-        """
-        tensor_array_constructor_name = self.get_name("tensor_array")
-        tensor_array_constructor_var = GlobalVar(tensor_array_constructor_name)
-        setattr(self.prelude, tensor_array_constructor_name, tensor_array_constructor_var)
-        tensor_nil_var = self.get_ctor("tensor_nil")
-        tensor_type_var = self.get_ctor("tensor_t")
-        n = Var("x", scalar_type("int32"))
-        body = If(
-            equal(n, const(0)),
-            self.nil(),
-            self.cons(tensor_nil_var(), tensor_array_constructor_var(subtract(n, const(1)))),
-        )
-        self.prelude.mod[tensor_array_constructor_var] = Function(
-            [n], body, self.list(tensor_type_var()), []
-        )
-
-    def define_tensor_array_read(self):
-        """Defines a function to get the head of a list. Assume the list has at least one
-        element.
-
-        tensor_array_read(ta, n) : list[tensor_t] -> Tensor[(), int32] -> tensor_t
-        """
-        read_name = self.get_name("tensor_array_read")
-        read_var = GlobalVar(read_name)
-        setattr(self.prelude, read_name, read_var)
-        tensor_type_var = self.tensor_type_var
-
-        tensor_array = Var("tensor_array", self.list(tensor_type_var()))
-        n = Var("x", scalar_type("int32"))
-        self.prelude.mod[read_var] = Function(
-            [tensor_array, n], self.prelude.nth(tensor_array, n), tensor_type_var(), []
-        )
-
-    def define_tensor_array_write(self):
-        """Defines a function to update a tensor array at index n with value v.
-        tensor_array_write(ta, n, v) :
-            list[tensor_t] -> Tensor[(), int32] -> tensor_t -> list[tensor_t]
-        """
-        write_name = self.get_name("tensor_array_write")
-        write_var = GlobalVar(write_name)
-
-        tensor_type_var = self.tensor_type_var
-        tensor_array = Var("tensor_array", self.list(tensor_type_var()))
-        n = Var("x", scalar_type("int32"))
-        v = Var("v", tensor_type_var())
-        self.prelude.mod[write_var] = Function(
-            [tensor_array, n, v],
-            self.prelude.update(tensor_array, n, v),
-            self.list(tensor_type_var()),
-            [],
-        )
-
-    def define_tensor_array_unstack_tensor1(self):
-        """Defines a function to unstack the values of a tensor_t with rank 1 in a tensor array.
-        tensor_array_unstack_tensor1(t) : tensor_t -> list[tensor_t]
-        """
-        helper_name = self.get_name("tensor_array_unstack_tensor1_helper")
-        helper_var = GlobalVar(helper_name)
-        tensor = Var("t", TensorType([Any()], self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-        tensor_type_var = self.tensor_type_var
-        tensor0_var = self.get_ctor("tensor0")
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(tensor0_var(op.take(tensor, i)), helper_var(add(i, const(1)), up, tensor)),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(tensor_type_var()), []
-        )
-        unstack_name = self.get_name("tensor_array_unstack_tensor1")
-        unstack_var = GlobalVar(unstack_name)
-        tensor1 = Var("tensor", TensorType([Any()], self.dtype))
-        shape = op.shape_of(tensor1)
-        ndim = op.take(shape, const(0))
-        self.prelude.mod[unstack_var] = Function(
-            [tensor1], helper_var(const(0), ndim, tensor1), self.list(tensor_type_var()), []
-        )
-
-    def define_tensor_array_unstack_tensor2(self):
-        """Defines a function to unstack the values of a tensor_t with rank 2 in a tensor array.
-
-        tensor_array_unstack_tensor2(t) : tensor_t -> list[tensor_t]
-        """
-        helper_name = self.get_name("tensor_array_unstack_tensor2_helper")
-        helper_var = GlobalVar(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor = Var("t", TensorType([Any(), Any()], self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(
-                self.get_ctor("tensor1")(op.take(tensor, i, axis=0)),
-                helper_var(add(i, const(1)), up, tensor),
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(self.tensor_type_var()), []
-        )
-
-        tensor_array_unstack_tensor2_name = self.get_name("tensor_array_unstack_tensor2")
-        tensor_array_unstack_tensor2_var = GlobalVar(tensor_array_unstack_tensor2_name)
-        setattr(self.prelude, tensor_array_unstack_tensor2_name, tensor_array_unstack_tensor2_var)
-        tensor2 = Var("tensor", TensorType([Any(), Any()], self.dtype))
-        shape = op.shape_of(tensor2)
-        ndim = op.take(shape, const(0))
-        self.prelude.mod[tensor_array_unstack_tensor2_var] = Function(
-            [tensor2], helper_var(const(0), ndim, tensor2), self.list(self.tensor_type_var()), []
-        )
-
-    def define_tensor_array_unstack_tensor3(self):
-        """Defines a function to unstack the values of a tensor_t with rank 3 in a tensor array.
-
-        tensor_array_unstack_tensor3(t) : tensor_t -> list[tensor_t]
-        """
-        helper_name = self.get_name("tensor_array_unstack_tensor3_helper")
-        helper_var = GlobalVar(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor = Var("t", TensorType([Any(), Any(), Any()], self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(
-                self.get_ctor("tensor2")(op.take(tensor, i, axis=0)),
-                helper_var(add(i, const(1)), up, tensor),
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(self.tensor_type_var()), []
-        )
-
-        tensor_array_unstack_tensor3_name = self.get_name("tensor_array_unstack_tensor3")
-        tensor_array_unstack_tensor3_var = GlobalVar(tensor_array_unstack_tensor3_name)
-        setattr(self.prelude, tensor_array_unstack_tensor3_name, tensor_array_unstack_tensor3_var)
-        tensor3 = Var("tensor", TensorType([Any(), Any(), Any()], self.dtype))
-        shape = op.shape_of(tensor3)
-        ndim = op.take(shape, const(0))
-        self.prelude.mod[tensor_array_unstack_tensor3_var] = Function(
-            [tensor3], helper_var(const(0), ndim, tensor3), self.list(self.tensor_type_var()), []
-        )
-
-    def define_tensor_array_unstack_tensor4(self):
-        """Defines a function to unstack the values of a tensor_t with rank 4 in a tensor array.
-
-        tensor_array_unstack_tensor4(t) : tensor_t -> list[tensor_t]
-        """
-        helper_name = self.get_name("tensor_array_unstack_tensor4_helper")
-        helper_var = GlobalVar(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor = Var("t", TensorType([Any(), Any(), Any(), Any()], self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(
-                self.get_ctor("tensor3")(op.take(tensor, i, axis=0)),
-                helper_var(add(i, const(1)), up, tensor),
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(self.tensor_type_var()), []
-        )
-
-        tensor_array_unstack_tensor4_name = self.get_name("tensor_array_unstack_tensor4")
-        tensor_array_unstack_tensor4_var = GlobalVar(tensor_array_unstack_tensor4_name)
-        setattr(self.prelude, tensor_array_unstack_tensor4_name, tensor_array_unstack_tensor4_var)
-        tensor4 = Var("tensor", TensorType([Any(), Any(), Any(), Any()], self.dtype))
-        shape = op.shape_of(tensor4)
-        ndim = op.take(shape, const(0))
-        self.prelude.mod[tensor_array_unstack_tensor4_var] = Function(
-            [tensor4], helper_var(const(0), ndim, tensor4), self.list(self.tensor_type_var()), []
-        )
-
-    def define_tensor_array_unstack_tensor5(self):
-        """Defines a function to unstack the values of a tensor_t with rank 5 in a tensor array.
-
-        tensor_array_unstack_tensor5(t) : tensor_t -> list[tensor_t]
-        """
-        helper_name = self.get_name("tensor_array_unstack_tensor5_helper")
-        helper_var = GlobalVar(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor = Var("t", TensorType([Any(), Any(), Any(), Any(), Any()], self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(
-                self.get_ctor("tensor4")(op.take(tensor, i, axis=0)),
-                helper_var(add(i, const(1)), up, tensor),
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(self.tensor_type_var()), []
-        )
-
-        tensor_array_unstack_tensor5_name = self.get_name("tensor_array_unstack_tensor5")
-        tensor_array_unstack_tensor5_var = GlobalVar(tensor_array_unstack_tensor5_name)
-        setattr(self.prelude, tensor_array_unstack_tensor5_name, tensor_array_unstack_tensor5_var)
-        tensor5 = Var("tensor", TensorType([Any(), Any(), Any(), Any(), Any()], self.dtype))
-        shape = op.shape_of(tensor5)
-        ndim = op.take(shape, const(0))
-        self.prelude.mod[tensor_array_unstack_tensor5_var] = Function(
-            [tensor5], helper_var(const(0), ndim, tensor5), self.list(self.tensor_type_var()), []
-        )
-
-    def define_tensor_array_unstack_tensor6(self):
-        """Defines a function to unstack the values of a tensor_t with rank 6 in a tensor array.
-
-        tensor_array_unstack_tensor6(t) : tensor_t -> list[tensor_t]
-        """
-        helper_name = self.get_name("tensor_array_unstack_tensor6_helper")
-        helper_var = GlobalVar(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor = Var("t", TensorType([Any(), Any(), Any(), Any(), Any(), Any()], self.dtype))
-        up = Var("up", scalar_type("int32"))
-        i = Var("i", scalar_type("int32"))
-
-        helper_body = If(
-            equal(i, up),
-            self.nil(),
-            self.cons(
-                self.get_ctor("tensor5")(op.take(tensor, i, axis=0)),
-                helper_var(add(i, const(1)), up, tensor),
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [i, up, tensor], helper_body, self.list(self.tensor_type_var()), []
-        )
-
-        tensor_array_unstack_tensor6_name = self.get_name("tensor_array_unstack_tensor6")
-        tensor_array_unstack_tensor6_var = GlobalVar(tensor_array_unstack_tensor6_name)
-        setattr(self.prelude, tensor_array_unstack_tensor6_name, tensor_array_unstack_tensor6_var)
-        tensor6 = Var("tensor", TensorType([Any(), Any(), Any(), Any(), Any(), Any()], self.dtype))
-        shape = op.shape_of(tensor6)
-        ndim = op.take(shape, const(0))
-        self.prelude.mod[tensor_array_unstack_tensor6_var] = Function(
-            [tensor6], helper_var(const(0), ndim, tensor6), self.list(self.tensor_type_var()), []
-        )
-
-    def define_tensor_array_scatter(self):
-        """Defines a function to scatter the values of a tensor_t in indices of a tensor array.
-        tensor_array_scatter(ta, indices, value) :
-            list[tensor_t] -> Tensor[(Any), int32] -> tensor_t -> list[tensor_t]
-        """
-        tensor_array_scatter_helper_name = self.get_name("tensor_array_scatter_helper")
-        tensor_array_scatter_helper_var = GlobalVar(tensor_array_scatter_helper_name)
-        tensor_t = self.tensor_type_var
-        ta = Var("ta", self.list(tensor_t()))
-        current = Var("current", scalar_type("int32"))
-        limit = Var("limit", scalar_type("int32"))
-        indices_ = Var("indices_", TensorType([Any()], "int32"))
-        values_ = Var("values_", self.list(tensor_t()))
-        write_var = self.get_global_var("tensor_array_write")
-        read_var = self.get_global_var("tensor_array_read")
-        helper_body = If(
-            equal(current, limit),
-            ta,
-            tensor_array_scatter_helper_var(
-                write_var(ta, op.take(indices_, current), read_var(values_, current)),
-                add(current, const(1)),
-                limit,
-                indices_,
-                values_,
-            ),
-        )
-        self.prelude.mod[tensor_array_scatter_helper_var] = Function(
-            [ta, current, limit, indices_, values_], helper_body, self.list(tensor_t()), []
-        )
-        tensor_array_scatter_name = self.get_name("tensor_array_scatter")
-        tensor_array_scatter_var = GlobalVar(tensor_array_scatter_name)
-        setattr(self.prelude, tensor_array_scatter_name, tensor_array_scatter_var)
-        tensor_array = Var("tensor_array", self.list(tensor_t()))
-        indices = Var("indices", TensorType([Any()], "int32"))
-        values = Var("values", self.list(tensor_t()))
-        indices_shape = op.shape_of(indices)
-        limit = op.take(indices_shape, const(0))
-        body = tensor_array_scatter_helper_var(tensor_array, const(0), limit, indices, values)
-        self.prelude.mod[tensor_array_scatter_var] = Function(
-            [tensor_array, indices, values], body, self.list(tensor_t()), []
-        )
-
-    def define_tensor_array_split(self):
-        """Defines a function to split the values of a tensor_t into a tensor array.
-        tensor_array_split(ta, value, lengths) :
-            list[tensor_t] -> tensor_t -> Tensor[(Any), int32] -> list[tensor_t]
-        """
-        tensor_t = self.tensor_type_var
-        tensor_array_split_helper_name = self.get_name("ta_split_helper")
-        tensor_array_split_helper_var = GlobalVar(tensor_array_split_helper_name)
-        setattr(self.prelude, tensor_array_split_helper_name, tensor_array_split_helper_var)
-        ta1 = Var("tensor_array", self.list(tensor_t()))
-        value1 = Var("value1", tensor_t())
-        offset1 = Var("offset1", scalar_type("int32"))
-        current1 = Var("current1", scalar_type("int32"))
-        limit1 = Var("limit1", scalar_type("int32"))
-        lengths1 = Var("lengths", TensorType([Any()], "int32"))
-        write_var = self.get_global_var("tensor_array_write")
-        take_var = self.get_global_var("tensor_take")
-        helper1_body = If(
-            equal(current1, limit1),
-            ta1,
-            write_var(
-                tensor_array_split_helper_var(
-                    ta1,
-                    value1,
-                    add(offset1, op.take(lengths1, current1)),
-                    add(current1, const(1)),
-                    limit1,
-                    lengths1,
-                ),
-                current1,
-                take_var(value1, offset1, add(op.take(lengths1, current1), offset1)),
-            ),
-        )
-        self.prelude.mod[tensor_array_split_helper_var] = Function(
-            [ta1, value1, offset1, current1, limit1, lengths1],
-            helper1_body,
-            self.list(tensor_t()),
-            [],
-        )
-        split_name = self.get_name("tensor_array_split")
-        split_var = GlobalVar(split_name)
-        setattr(self.prelude, split_name, split_var)
-        tensor_array = Var("tensor_array", self.list(tensor_t()))
-        value = Var("value", tensor_t())
-        lengths = Var("lengths", TensorType([Any()], "int32"))
-        lengths_shape = op.shape_of(lengths)
-        lengths_limit = op.take(lengths_shape, const(0))
-        body = tensor_array_split_helper_var(
-            tensor_array, value, const(0), const(0), lengths_limit, lengths
-        )
-        self.prelude.mod[split_var] = Function(
-            [tensor_array, value, lengths], body, self.list(tensor_t()), []
-        )
-
-    def define_tensor_array_concat(self):
-        """Defines a function to return the values in the tensor array as concatenated tensor_t.
-        tensor_array_concat(ta) : list[tensor_t] -> tensor_t
-        """
-        concat_name = self.get_name("tensor_array_concat")
-        concat_var = GlobalVar(concat_name)
-        setattr(self.prelude, concat_name, concat_var)
-        tensor_concat_var = self.get_global_var("tensor_concatenate")
-        tensor_t = self.tensor_type_var
-        tensor_nil_var = self.get_ctor("tensor_nil")
-        tensor_array = Var("tensor_array", self.list(tensor_t()))
-        hd = Var("hd")
-        tl = Var("tl")
-        nil_case = Clause(PatternConstructor(self.nil), tensor_nil_var())
-        cons_case = Clause(
-            PatternConstructor(self.cons, [PatternVar(hd), PatternVar(tl)]),
-            Match(
-                tl,
-                [
-                    Clause(PatternConstructor(self.nil), hd),
-                    Clause(PatternWildcard(), tensor_concat_var(hd, concat_var(tl))),
-                ],
-                False,
-            ),
-        )
-        self.prelude.mod[concat_var] = Function(
-            [tensor_array], Match(tensor_array, [nil_case, cons_case], False), tensor_t(), []
-        )
-
-    def define_tensor_array_gather(self):
-        """Defines a function to return the selected values in a tensor array as tensor_t.
-        tensor_array_gather(ta, indices) : list[tensor_t] -> Tensor[(Any), int32] -> tensor_t
-        """
-        helper_name = self.get_name("tensor_array_gather_helper")
-        helper_var = GlobalVar(helper_name)
-        setattr(self.prelude, helper_name, helper_var)
-        tensor_type_var = self.tensor_type_var
-        stack_var = self.get_var("tensor_array_stack")
-        read_var = self.get_var("tensor_array_read")
-        ta = Var("ta", self.list(tensor_type_var()))
-        accu = Var("accu", self.list(tensor_type_var()))
-        current = Var("current", scalar_type("int32"))
-        limit = Var("limit", scalar_type("int32"))
-        indices_ = Var("indices_", TensorType([Any()], "int32"))
-        helper_body = If(
-            equal(current, const(0)),
-            stack_var(accu),
-            helper_var(
-                ta,
-                self.cons(read_var(ta, op.take(indices_, subtract(current, const(1)))), accu),
-                subtract(current, const(1)),
-                limit,
-                indices_,
-            ),
-        )
-        self.prelude.mod[helper_var] = Function(
-            [ta, accu, current, limit, indices_], helper_body, tensor_type_var(), []
-        )
-        gather_name = self.get_name("tensor_array_gather")
-        gather_var = GlobalVar(gather_name)
-        setattr(self.prelude, gather_name, gather_var)
-        tensor_array = Var("tensor_array", self.list(tensor_type_var()))
-        indices = Var("indices", TensorType([Any()], "int32"))
-        indices_shape = op.shape_of(indices)
-        limit = op.take(indices_shape, const(0))
-        body = helper_var(tensor_array, self.nil(), limit, limit, indices)
-        self.prelude.mod[gather_var] = Function(
-            [tensor_array, indices], body, tensor_type_var(), []
-        )
-
-    def define_tensor_array_stack(self):
-        """Defines a function to get the values in the tensor array as a stack tensor_t.
-        tensor_array_stack(l) : list[tensor_t] -> tensor_t
-        """
-        stack_name = self.get_name("tensor_array_stack")
-        stack_var = GlobalVar(stack_name)
-        setattr(self.prelude, stack_name, stack_var)
-        tensor_type_var = self.tensor_type_var
-        tensor_array = Var("tensor_array", self.list(tensor_type_var()))
-        expand_dims_var = self.get_global_var("tensor_expand_dims")
-        concat_var = self.get_global_var("tensor_concatenate")
-
-        tensor_array_expand_dims = self.prelude.map(expand_dims_var, tensor_array)
-        tensors = self.prelude.foldl(
-            concat_var,
-            self.prelude.hd(tensor_array_expand_dims),
-            self.prelude.tl(tensor_array_expand_dims),
-        )
-        self.prelude.mod[stack_var] = Function(
-            [tensor_array], ToANormalFormExpr(tensors), tensor_type_var(), []
-        )
-
-    def register(self):
-        """Register all tensor array ops in Prelude"""
-        self.define_tensor_adt()
-        self.define_tensor_take()
-        self.define_tensor_expand_dims()
-        self.define_tensor_concat()
-        self.define_tensor_array()
-        self.define_tensor_array_read()
-        self.define_tensor_array_write()
-        self.define_tensor_array_unstack_tensor1()
-        self.define_tensor_array_unstack_tensor2()
-        self.define_tensor_array_unstack_tensor3()
-        self.define_tensor_array_unstack_tensor4()
-        self.define_tensor_array_unstack_tensor5()
-        self.define_tensor_array_unstack_tensor6()
-        self.define_tensor_array_scatter()
-        self.define_tensor_array_split()
-        self.define_tensor_array_concat()
-        self.define_tensor_array_stack()
-        # TODO(wweic): Gather fails in PartialEvaluate
-        # self.define_tensor_array_gather()
-
-
-class Prelude:
-    """Contains standard definitions."""
-
-    def __init__(self, mod=None):
-        if mod is None:
-            mod = IRModule()
-        self.mod = mod
-        self.load_prelude()
-
-    def get_name(self, canonical, dtype):
-        """Get name corresponding to the canonical name"""
-        if canonical == "tensor_t":
-            return f"tensor_{dtype}_t"
-        return f"{canonical}_{dtype}"
-
-    def get_global_var(self, canonical, dtype):
-        """Get global var corresponding to the canonical name"""
-        name = self.get_name(canonical, dtype)
-        return self.mod.get_global_var(name)
-
-    def get_type(self, canonical, dtype):
-        """Get type corresponding to the canonical name"""
-        name = self.get_name(canonical, dtype)
-        return self.mod.get_global_type_var(name)
-
-    def get_ctor(self, ty_name, canonical, dtype):
-        """Get constructor corresponding to the canonical name"""
-        name = self.get_name(canonical, dtype)
-        ctors = self.mod.get_type(ty_name)
-        for ctor in ctors:
-            if ctor.name_hint == name:
-                return ctor
-        raise Exception(f"could not find {name}")
-
-    def get_tensor_ctor(self, canonical, dtype):
-        ty = self.get_type("tensor_t", dtype)
-        return self.get_ctor(ty.name_hint, canonical, dtype)
-
-    def get_name_static(self, canonical, dtype, shape, batch_dim=None):
-        """Get name corresponding to the canonical name"""
-        return _get_name_static(canonical, dtype, shape, batch_dim)
-
-    def get_global_var_static(self, canonical, dtype, shape, batch_dim=None):
-        """Get var corresponding to the canonical name"""
-        name = self.get_name_static(canonical, dtype, shape, batch_dim)
-        return self.mod.get_global_var(name)
-
-    def get_type_static(self, canonical, dtype, shape):
-        """Get type corresponding to the canonical name"""
-        name = self.get_name_static(canonical, dtype, shape)
-        return self.mod.get_global_type_var(name)
-
-    def get_ctor_static(self, ty_name, name, dtype, shape):
-        """Get constructor corresponding to the canonical name"""
-        ty_name = self.get_name_static(ty_name, dtype, shape)
-        name = self.get_name_static(name, dtype, shape)
-        ctors = self.mod.get_type(ty_name)
-        for ctor in ctors:
-            if ctor.name_hint == name:
-                return ctor
-        raise Exception(f"could not find {name}")
-
-    def get_tensor_ctor_static(self, name, dtype, shape):
-        """Get constructor corresponding to the canonical name"""
-        return self.get_ctor_static("tensor_t", name, dtype, shape)
-
-    def load_prelude(self):
-        """Parses the Prelude from Relay's text format into a module."""
-        # TODO(@jroesch): we should remove this helper when we port over prelude
-        self.mod.import_from_std("prelude.rly")
-
-        GLOBAL_DEFS = [
-            "id",
-            "compose",
-            "flip",
-            "hd",
-            "tl",
-            "nth",
-            "update",
-            "map",
-            "foldl",
-            "foldr",
-            "foldr1",
-            "concat",
-            "filter",
-            "zip",
-            "rev",
-            "map_accuml",
-            "map_accumr",
-            "unfoldl",
-            "unfoldr",
-            "sum",
-            "length",
-            "tmap",
-            "size",
-            "iterate",
-        ]
-
-        for global_def in GLOBAL_DEFS:
-            setattr(self, global_def, self.mod.get_global_var(global_def))
-
-        for dtype in [
-            "float32",
-            "float16",
-            "float64",
-            "int32",
-            "uint8",
-            "int8",
-            "int16",
-            "uint16",
-            "int64",
-        ]:
-            tensor_array_ops = TensorArrayOps(self, dtype)
-            tensor_array_ops.register()
-
-        # Renamer doesn't properly deal with constructors, etc
-        # self.mod = AnnotateSpans()(self.mod)
diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py
deleted file mode 100644
index af6a0b0449a9..000000000000
--- a/python/tvm/relay/qnn/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import,redefined-builtin
-"""QNN dialect operators and IR passes."""
-from __future__ import absolute_import as _abs
-from . import op
-from . import transform
-from .op.qnn import *
diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py
deleted file mode 100644
index 0f087dec1fa8..000000000000
--- a/python/tvm/relay/qnn/op/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin
-
-"""QNN dialect related operators."""
-from __future__ import absolute_import as _abs
-from .qnn import *
-from .op import register_qnn_legalize, register_qnn_canonicalize
-from . import _qnn, legalizations, layout_conversions, canonicalizations
diff --git a/python/tvm/relay/qnn/op/_make.py b/python/tvm/relay/qnn/op/_make.py
deleted file mode 100644
index 4472bc77c6cd..000000000000
--- a/python/tvm/relay/qnn/op/_make.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constructor APIs"""
-import tvm._ffi
-
-tvm._ffi._init_api("relay.qnn.op._make", __name__)
diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py
deleted file mode 100644
index f036e6cf840d..000000000000
--- a/python/tvm/relay/qnn/op/_qnn.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, len-as-condition
-"""QNN operator feature registration"""
-
-import numpy as np
-
-from tvm import topi
-
-from .. import strategy
-from ...op.op import register_compute
-from ...op.op import register_injective_schedule
-from ...op.op import (
-    OpPattern,
-    register_alter_op_layout,
-    register_legalize,
-    register_pattern,
-    register_strategy,
-)
-
-
-@register_compute("qnn.simulated_quantize")
-def simulated_quantize_compute(attrs, inputs, output_type):
-    assert len(inputs) == 4
-    return [
-        topi.nn.simulated_quantize(
-            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
-        )
-    ]
-
-
-register_injective_schedule("qnn.simulated_quantize")
-register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE)
-
-
-@register_compute("qnn.simulated_dequantize")
-def simulated_dequantize_compute(attrs, inputs, output_type):
-    assert len(inputs) == 4
-    return [
-        topi.nn.simulated_dequantize(
-            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
-        )
-    ]
-
-
-register_injective_schedule("qnn.simulated_dequantize")
-register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE)
-
-# qnn.quantize
-register_strategy("qnn.quantize", strategy.qnn_quantize_strategy)
-register_pattern("qnn.quantize", OpPattern.ELEMWISE)
-
-# qnn.dequantize
-register_strategy("qnn.dequantize", strategy.qnn_dequantize_strategy)
-register_pattern("qnn.dequantize", OpPattern.ELEMWISE)
-
-# qnn.requantize
-register_strategy("qnn.requantize", strategy.qnn_requantize_strategy)
-register_pattern("qnn.requantize", OpPattern.ELEMWISE)
-
-# qnn.add
-register_strategy("qnn.add", strategy.qnn_add_strategy)
-
-# qnn.subtract
-register_strategy("qnn.subtract", strategy.qnn_subtract_strategy)
-
-# qnn.mul
-register_strategy("qnn.mul", strategy.qnn_mul_strategy)
-
-# qnn.tanh
-register_strategy("qnn.tanh", strategy.qnn_tanh_strategy)
-register_pattern("qnn.tanh", OpPattern.ELEMWISE)
-
-# qnn.concatenate
-register_strategy("qnn.concatenate", strategy.qnn_concatenate_strategy)
-register_pattern("qnn.concatenate", OpPattern.INJECTIVE)
-
-# qnn.conv2d
-register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy)
-
-
-@register_legalize("clip")
-def legalize_clip(attrs, inputs, tinfos):
-    """Removes clip operators with bounds matching the defaults for their dtype.
-
-    This is already done after alter_op by TVM's simplification passes, but certain QNN operator
-    implementations (like Cortex-M) need it to be done earlier in legalization.
-    """
-
-    if (
-        hasattr(inputs[0], "op")
-        and hasattr(inputs[0].op, "name")
-        and inputs[0].op.name == "qnn.requantize"
-    ):
-        dtype_info = np.iinfo(tinfos[0].dtype)
-        if dtype_info.min == attrs.a_min and dtype_info.max == attrs.a_max:
-            return inputs[0]
-
-    return None
-
-
-@register_legalize("nn.bias_add")
-def legalize_bias_add(attrs, inputs, tinfos):
-    """Legalize a bias add operator.
-
-    May be used to "fold in" unused channels from quantized convolution operators. This should
-    be done before layout rewrites occur to minimize the amount of "extra" overhead operators
-    like "cast" and "layout_transform".
-    """
-    return topi.nn.bias_add_legalize(attrs, inputs, tinfos)
-
-
-@register_alter_op_layout("qnn.conv2d")
-def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type):
-    """Alter the layout of a qnn conv2d op.
-
-    May be used to alter the current QNN Conv2D op, but can also be used to alter previous ops to
-    better match the current op. For example, Arm Cortex-M uses this to set the out_layout of
-    previous ops to the input layout preferred by future layouts.
-    """
-    return topi.nn.qnn_conv2d_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-@register_alter_op_layout("add")
-def alter_op_layout_add(attrs, inputs, tinfos, out_type):
-    """Alter the layout of a add op.
-
-    Useful for fusing the bias constant with an input zero point constant in a previous quantized
-    op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn.
-    """
-    return topi.nn.add_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-@register_alter_op_layout("qnn.requantize")
-def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
-    """Alter the layout of a requantization op."""
-    return topi.nn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-# qnn.dense
-register_strategy("qnn.dense", strategy.qnn_dense_strategy)
-
-
-@register_alter_op_layout("qnn.dense")
-def alter_op_layout_qnn_dense(attrs, inputs, tinfos, out_type):
-    """Alternate the layout of qnn.dense"""
-    return topi.nn.qnn_dense_alter_layout(attrs, inputs, tinfos, out_type)
-
-
-# qnn.contrib_dense_pack
-register_strategy("qnn.contrib_dense_pack", strategy.qnn_dense_pack_strategy)
-
-# qnn.batch_matmul
-register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy)
-register_pattern("qnn.batch_matmul", OpPattern.OUT_ELEMWISE_FUSABLE)
-
-# qnn.avg_pool2d
-register_strategy("qnn.avg_pool2d", strategy.qnn_avg_pool2d_strategy)
diff --git a/python/tvm/relay/qnn/op/_requantize.py b/python/tvm/relay/qnn/op/_requantize.py
deleted file mode 100644
index 2e2fd9fd2980..000000000000
--- a/python/tvm/relay/qnn/op/_requantize.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""Internal module for qnn requantization."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay._requantize", __name__)
diff --git a/python/tvm/relay/qnn/op/canonicalizations.py b/python/tvm/relay/qnn/op/canonicalizations.py
deleted file mode 100644
index 6bfcd34aba90..000000000000
--- a/python/tvm/relay/qnn/op/canonicalizations.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Consist of utilities and methods for lowering QNN into mainline relay."""
-from typing import Callable
-
-import numpy as np
-import tvm
-from tvm import relay
-
-
-def run_const_expr(expr: "relay.Expr") -> np.ndarray:
-    """Evaluate a const expression, receiving result as np array.
-
-    If a number of passes are disabled in the current Pass Context, then there is no need to disable
-    these passes for const expression evaluation as well. That's why we use empty list
-    "disabled_pass=[]", all other arguments are inherited from the current Pass Context.
-    """
-    curr_pass_ctx = tvm.ir.transform.PassContext.current()
-    with tvm.ir.transform.PassContext(
-        opt_level=curr_pass_ctx.opt_level,
-        required_pass=curr_pass_ctx.required_pass,
-        disabled_pass=[],
-        instruments=curr_pass_ctx.instruments,
-        config=curr_pass_ctx.config,
-    ):
-        mod = tvm.IRModule.from_expr(expr)
-        vm_exe = relay.create_executor("vm", mod=mod)
-        output = vm_exe.evaluate()().asnumpy()
-
-    return output
-
-
-def create_integer_lookup_table(
-    floating_point_func: Callable[[np.ndarray], np.ndarray],
-    input_scale: "relay.Expr",
-    input_zero_point: "relay.Expr",
-    output_scale: "relay.Expr",
-    output_zero_point: "relay.Expr",
-    in_axis: int = -1,
-    out_axis: int = -1,
-    in_dtype: str = "uint8",
-    out_dtype: str = "uint8",
-) -> np.ndarray:
-    """
-    Return a table where each input indexes to the output quantizing the given function.
-
-    Note this also supports mapping unsigned and signed integers to each other.
-
-    Args:
-      floating_point_func: The numpy function which this table is to approximate
-      input_scale: The scale of the quantized input tensor.
-      input_zero_point: The zero point of the quantized input tensor.
-      output_scale: The scale of the quantized output tensor.
-      output_zero_point: The zero point of the quantized output tensor.
-      in_axis: The axis for multi-channel quantization of the input if applicable.
-      out_axis: The axis for multi-channel quantization of the output if applicable.
-      in_dtype: The dtype of the input tensor.
-      out_dtype: The wanted dtype of the output tensor.
-
-    Returns:
-      A numpy array where values in quantized space will index to the output in quantized space
-      approximating the given function.
-    """
-    if not np.issubdtype(np.dtype(in_dtype), np.integer) or not np.issubdtype(
-        np.dtype(out_dtype), np.integer
-    ):
-        raise ValueError(
-            f"Only integer dtypes allowed got {in_dtype} and {out_dtype} for in and out dtypes."
-        )
-
-    dtype_info = np.iinfo(in_dtype)
-
-    num_bits = dtype_info.bits
-
-    # Use TVMs quantization methods via relay to be consistent
-    # inputs_quantized = np.array(range(dtype_info.min, dtype_info.max + 1)).astype(in_dtype)
-
-    # First generate a list of all num_bit integer patterns
-    inputs_quantized = np.array(range(0, 2**num_bits), dtype=f"uint{num_bits}")
-
-    # Reinterpret bits as the real datatype
-    # Note what we are doing here is a bit tricky, the canonical view of our lookup table
-    # is using the uintX version. When we run the lookup in the relay graph, we cast the
-    # bit pattern back into this form.
-    inputs_quantized = inputs_quantized.view(in_dtype)
-    inputs_quantized = relay.const(inputs_quantized, dtype=in_dtype)
-    inputs_dequantized = run_const_expr(
-        relay.qnn.op.dequantize(
-            inputs_quantized,
-            input_scale=input_scale,
-            input_zero_point=input_zero_point,
-            axis=in_axis,
-        )
-    )
-
-    output_dequantized = relay.const(floating_point_func(inputs_dequantized))
-    output_quantized = run_const_expr(
-        relay.qnn.op.quantize(
-            output_dequantized, output_scale, output_zero_point, out_axis, out_dtype
-        )
-    )
-
-    return output_quantized
-
-
-def create_integer_lookup_op(
-    input_arg: "relay.Expr",
-    floating_point_func: Callable[[np.array], np.array],
-    in_scale: "relay.Expr",
-    in_zero_point: "relay.Expr",
-    out_scale: "relay.Expr",
-    out_zero_point: "relay.Expr",
-    in_axis: int = -1,
-    out_axis: int = -1,
-    in_dtype: str = "uint8",
-    out_dtype: str = "uint8",
-) -> "relay.Expr":
-    """
-    Create a quantized version of the given floating point unary operation using table lookup.
-
-    Args:
-      input_arg: The quantized input to the final function.
-      floating_point_func: The numpy function which this table is to approximate
-      in_scale: The scale of the quantized input tensor.
-      in_zero_point: The zero point of the quantized input tensor.
-      out_scale: The scale of the quantized output tensor.
-      out_zero_point: The zero point of the quantized output tensor.
-      in_axis: The axis for multi-channel quantization of the input if applicable.
-      out_axis: The axis for multi-channel quantization of the output if applicable.
-      in_dtype: The dtype of the input tensor.
-      out_dtype: The wanted dtype of the output tensor.
-
-    Returns:
-      A Relay expression representing a quantized version of the given function.
-    """
-
-    # TODO: handle multi-channel q, below will fail with multi-channel q
-    in_scale = in_scale.data.numpy().item()
-    in_zero_point = in_zero_point.data.numpy().item()
-    out_scale = out_scale.data.numpy().item()
-    out_zero_point = out_zero_point.data.numpy().item()
-
-    lookup_table = create_integer_lookup_table(
-        floating_point_func,
-        relay.const(in_scale),
-        relay.const(in_zero_point, dtype="int32"),
-        relay.const(out_scale),
-        relay.const(out_zero_point, dtype="int32"),
-        in_axis=in_axis,
-        in_dtype=in_dtype,
-        out_axis=out_axis,
-        out_dtype=out_dtype,
-    )
-
-    in_dtype_info = np.iinfo(in_dtype)
-    in_dtype_num_bits = in_dtype_info.bits
-
-    lookup_table = relay.const(lookup_table)
-    index_tensor = relay.reinterpret(input_arg, f"uint{in_dtype_num_bits}")
-    result = relay.take(lookup_table, index_tensor, axis=0, mode="fast")
-    return result
diff --git a/python/tvm/relay/qnn/op/layout_conversions.py b/python/tvm/relay/qnn/op/layout_conversions.py
deleted file mode 100644
index 587993603139..000000000000
--- a/python/tvm/relay/qnn/op/layout_conversions.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Convert layout related registration"""
-from __future__ import absolute_import
-
-from tvm.relay.op import op as reg
-
-from ...op.strategy.generic import is_depthwise_conv2d
-
-
-@reg.register_convert_op_layout("qnn.conv2d")
-def convert_qnn_conv2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for QNN conv2d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    assert len(desired_layouts) == 2, "A desired layout is expected for both of qnn.conv2d's inputs"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-
-    new_attrs = dict(attrs)
-    new_attrs["data_layout"] = desired_data_layout
-
-    if desired_kernel_layout != "default":
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.qnn.op.conv2d(*inputs, **new_attrs)
-
-    if desired_data_layout == "NCHW":
-        new_attrs["kernel_layout"] = "OIHW"
-        return relay.qnn.op.conv2d(*inputs, **new_attrs)
-    if desired_data_layout == "NHWC":
-        # Check for depthwise convolution.
-        data_info = tinfos[0]
-        weight_info = tinfos[1]
-        if is_depthwise_conv2d(
-            data_info.shape,
-            attrs["data_layout"],
-            weight_info.shape,
-            attrs["kernel_layout"],
-            attrs["groups"],
-        ):
-            new_attrs["kernel_layout"] = "HWOI"
-        else:
-            new_attrs["kernel_layout"] = "HWIO"
-        return relay.qnn.op.conv2d(*inputs, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported")
-
-
-@reg.register_convert_op_layout("qnn.conv2d_transpose")
-def convert_qnn_conv2d_transpose(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for QNN conv2d_transpose op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data and kernel inputs respectively.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    assert (
-        len(desired_layouts) == 2
-    ), "A desired layout is expected for both of qnn.conv2d_transpose's inputs"
-    desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
-    assert desired_data_layout != "default", "Data layout cannot be default"
-
-    new_attrs = dict(attrs)
-    new_attrs["data_layout"] = desired_data_layout
-
-    if desired_kernel_layout != "default":
-        new_attrs["kernel_layout"] = desired_kernel_layout
-        return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs)
-
-    # Handle default kernel layouts
-    if desired_data_layout == "NCHW":
-        new_attrs["kernel_layout"] = "IOHW"
-        return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs)
-    if desired_data_layout == "NHWC":
-        new_attrs["kernel_layout"] = "HWIO"
-        return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported")
-
-
-@reg.register_convert_op_layout("qnn.avg_pool2d")
-def convert_qnn_avg_pool2d(attrs, inputs, tinfos, desired_layouts):
-    """Convert Layout pass registration for QNN avg_pool2d op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current avg_pool2d
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    tinfos : list of types
-        List of input and output types
-    desired_layouts : list of layout strings
-        List of layouts defining our desired
-        layout for the data input.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The transformed expr
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    assert len(desired_layouts) == 1, "A desired layout is expected for qnn.avg_pool2d's input"
-    desired_data_layout = desired_layouts[0]
-    if desired_data_layout == "NCHW" or desired_data_layout == "NHWC":
-        new_attrs = dict(attrs)
-        new_attrs["layout"] = str(desired_data_layout)
-        new_attrs["out_layout"] = str(desired_data_layout)
-        return relay.qnn.op.avg_pool2d(*inputs, **new_attrs)
-
-    raise ValueError(f"Layout {desired_data_layout} is not yet supported")
diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py
deleted file mode 100644
index 81df386fc297..000000000000
--- a/python/tvm/relay/qnn/op/legalizations.py
+++ /dev/null
@@ -1,699 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Backend QNN related feature registration"""
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm._ffi.base import TVMError
-from tvm.relay.qnn.op.canonicalizations import create_integer_lookup_op
-
-from ....target.x86 import target_has_features
-from ....topi.utils import is_target
-from .. import op as reg
-
-#################################################
-# Register the functions for different operators.
-#################################################
-
-# Registering QNN Conv2D legalization function.
-
-
-@reg.register_qnn_legalize("qnn.conv2d")
-def legalize_qnn_conv2d(attrs, inputs, types):
-    return qnn_conv2d_legalize(attrs, inputs, types)
-
-
-# Registering QNN Conv2DTranspose legalization function.
-@reg.register_qnn_legalize("qnn.conv2d_transpose")
-def legalize_qnn_conv2d_transpose(attrs, inputs, types):
-    return qnn_conv2d_transpose_legalize(attrs, inputs, types)
-
-
-# Registering QNN dense legalization function.
-@reg.register_qnn_legalize("qnn.dense")
-def legalize_qnn_dense(attrs, inputs, types):
-    return qnn_dense_legalize(attrs, inputs, types)
-
-
-def register_qnn_unary_op_legalize(op_name, floating_point_func):
-    """Register unary qnn op for legalization via table lookup op."""
-
-    def legalize_qnn_unary_op(attrs, inputs, types):
-        return create_integer_lookup_op(
-            input_arg=inputs[0],
-            floating_point_func=floating_point_func,
-            in_scale=inputs[1],
-            in_zero_point=inputs[2],
-            out_scale=inputs[3],
-            out_zero_point=inputs[4],
-            in_dtype=types[0].dtype,
-            out_dtype=types[0].dtype,
-        )
-
-    return reg.register_qnn_legalize(op_name, legalize_qnn_unary_op)
-
-
-def hardswish_func(x):
-    x2 = x + 3.0
-    x2 = np.clip(x2, 0.0, 6.0)
-    return x * x2 / 6.0
-
-
-register_qnn_unary_op_legalize("qnn.sqrt", np.sqrt)
-register_qnn_unary_op_legalize("qnn.rsqrt", lambda arr: 1 / np.sqrt(arr))
-register_qnn_unary_op_legalize("qnn.exp", np.exp)
-register_qnn_unary_op_legalize("qnn.sigmoid", lambda arr: 1 / (1 + np.exp(-arr)))
-register_qnn_unary_op_legalize("qnn.hardswish", hardswish_func)
-register_qnn_unary_op_legalize("qnn.tanh", np.tanh)
-register_qnn_unary_op_legalize("qnn.log", np.log)
-register_qnn_unary_op_legalize("qnn.abs", np.abs)
-
-
-@reg.register_qnn_legalize("qnn.erf")
-def _legalize_qnn_erf(attrs, inputs, types):
-    from scipy import special  # pylint: disable=import-outside-toplevel
-
-    return create_integer_lookup_op(
-        input_arg=inputs[0],
-        floating_point_func=special.erf,
-        in_scale=inputs[1],
-        in_zero_point=inputs[2],
-        out_scale=inputs[3],
-        out_zero_point=inputs[4],
-        in_dtype=types[0].dtype,
-        out_dtype=types[0].dtype,
-    )
-
-
-# Default to None. If overridden by target, this will not be run.
-# Generic QNN Conv2D legalization function.
-@tvm.target.generic_func
-def qnn_conv2d_legalize(attrs, inputs, types):
-    """Default legalization is None."""
-    return None
-
-
-# Generic QNN Conv2DTranspose legalization function.
-@tvm.target.generic_func
-def qnn_conv2d_transpose_legalize(attrs, inputs, types):
-    """Convert kernel and data to int16, subtract offsets upfront
-    and calls into relay.nn.conv2d_transpose."""
-
-    # Collect the input exprs.
-    data, kernel, input_zero_point, kernel_zero_point, _, _ = inputs
-
-    # If input zero point is a scalar, we can directly subtract it.
-    if len(types[2].shape) == 0:
-        shift_data = relay.subtract(
-            relay.cast(data, dtype="int16"), relay.cast(input_zero_point, "int16")
-        )
-    # Otherwise it needs to be broadcast.
-    else:
-        shift_data = relay.nn.bias_add(
-            relay.cast(data, dtype="int16"),
-            -relay.cast(input_zero_point, dtype="int16"),
-        )
-
-    # If kernel zero point is a scalar, we can directly subtract it.
-    if len(types[3].shape) == 0:
-        shift_kernel = relay.subtract(
-            relay.cast(kernel, dtype="int16"), relay.cast(kernel_zero_point, "int16")
-        )
-    # Otherwise it needs to be broadcast.
-    else:
-        shift_kernel = relay.nn.bias_add(
-            relay.cast(kernel, dtype="int16"),
-            -relay.cast(kernel_zero_point, dtype="int16"),
-        )
-
-    return relay.nn.conv2d_transpose(shift_data, shift_kernel, **attrs)
-
-
-# Generic QNN Conv2D legalization function.
-@tvm.target.generic_func
-def qnn_dense_legalize(attrs, inputs, types):
-    """Default legalization is None."""
-    return None
-
-
-###################
-# Helper functions.
-###################
-
-
-def get_scalar_from_constant(expr):
-    """Returns scalar value from Relay constant scalar."""
-    assert (
-        isinstance(expr, relay.Constant) and not expr.data.shape
-    ), "Expr is not a constant scalar."
-    value = expr.data.numpy()
-    assert value.dtype == np.dtype(np.int32) or value.dtype == np.dtype(
-        np.float32
-    ), "value must be float32/int32"
-    return value.item(0)
-
-
-def _shift(data, zero_point, out_dtype):
-    """Shifts (add/subtracts) the qnn tensor with +/-128)"""
-    if out_dtype == "uint8":
-        shift = 128
-    elif out_dtype == "int8":
-        shift = -128
-    else:
-        raise ValueError("Unsupported out dtype.")
-    data_modified = relay.cast(data, "int32")
-    data_modified = relay.add(data_modified, relay.const(shift, "int32"))
-    data_modified = relay.cast(data_modified, out_dtype)
-    if isinstance(zero_point, relay.Constant):
-        zero_point_val = get_scalar_from_constant(zero_point)
-        zero_point_modified = relay.const(zero_point_val + shift, "int32")
-    else:
-        zero_point_modified = zero_point + relay.const(shift, "int32")
-    return (data_modified, zero_point_modified)
-
-
-# Helper function for lowering in the abscence of fast Int8 arithmetic units.
-def helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay_op):
-    """Converts QNN operators into a sequence of Relay operators that are friendly to HW that do
-    not have fast Int8 arithmetic. For example, for ARM, LLVM utilizes the assembly instructions
-    much more efficiently if the convolution or dense operator input datatypes are int16 instead of
-    int8. More details are present at https://github.com/apache/tvm/pull/4277.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-
-    # Collect the input exprs.
-    data, kernel, input_zero_point, kernel_zero_point, _, _ = inputs
-
-    shift_data = relay.subtract(
-        relay.cast(data, dtype="int16"), relay.cast(input_zero_point, dtype="int16")
-    )
-    # If kernel zero point is a scalar we can directly subtract it.
-    if len(types[3].shape) == 0:
-        shift_kernel = relay.subtract(
-            relay.cast(kernel, dtype="int16"), relay.cast(kernel_zero_point, dtype="int16")
-        )
-    # Otherwise it needs to be broadcast.
-    else:
-        # Determine output axis of kernel for spatial operations.
-        if hasattr(attrs, "kernel_layout"):
-            output_axis = tvm.tir.layout(attrs["kernel_layout"]).index_of("O")
-        # For dense operations, broadcast to [N, K] layout.
-        elif isinstance(attrs, relay.op.op_attrs.DenseAttrs):
-            output_axis = 0
-        # For matrix multiplication instead expand to [K, N] layout.
-        elif isinstance(attrs, relay.op.op_attrs.MatmulAttrs):
-            output_axis = 1
-        else:
-            raise TVMError(
-                "Legalization of %s is not yet supported with per channel parameters"
-                % str(type(attrs))
-            )
-
-        shift_kernel = relay.nn.bias_add(
-            relay.cast(kernel, dtype="int16"),
-            -relay.cast(kernel_zero_point, dtype="int16"),
-            output_axis,
-        )
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    return relay_op(shift_data, shift_kernel, **new_attrs)
-
-
-# Helper function to change dtypes to uint8 x int8. Intel VNNI instructions prefer this setting.
-def helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay_op):
-    """Legalizes QNN conv2d/dense op for Intel HW. VNNI supports u8 x i8 fast conv/MM. If the dtypes
-    are already good, we dont transform. Else, we shift the tensor values and zero points to change
-    the dtype.
-
-    Converting from int8 to uint8 can be done in following manner.
-
-    Original equation
-      scale * (QA - zp_a)
-      scale * (QA + 128 - 128 - zp_a)
-      scale * ( (QA + 128) - (zp_a + 128))
-
-    Replacing QA + 128 with QA' and (zp_a + 128) with zp_a'
-    We get our new quantized uint8 tensor - scale * (QA' - zp_a')
-
-    Similarly we can convert from uint8 to int8.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Collect the dtypes.
-    data_dtype = types[0].dtype
-    kernel_dtype = types[1].dtype
-
-    # Collect the input exprs.
-    data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale = inputs
-
-    # VNNI supports u8 x i8 fast conv/MM. Don't do anything if it is already satisfied.
-    if data_dtype == "uint8" and kernel_dtype == "int8":
-        return None
-
-    # Shift input if necessary.
-    if data_dtype == "int8":
-        # Compute (QA + 128) and (zp_a + 128)
-        data, input_zero_point = _shift(data, input_zero_point, "uint8")
-
-    # Shift kernel if necessary.
-    if kernel_dtype == "uint8":
-        # Compute (QA - 128) and (zp_a - 128)
-        kernel, kernel_zero_point = _shift(kernel, kernel_zero_point, "int8")
-
-    # Call qnn.conv2d with modified inputs and zero points.
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    return relay_op(
-        data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale, **new_attrs
-    )
-
-
-# Helper function to change dtypes to int8 x int8. Cuda dp4a instructions prefer this setting.
-def helper_change_dtypes_to_int8(attrs, inputs, types, relay_op):
-    """Legalizes QNN conv2d/dense op for Nvidia HW. dp4a supports i8 x i8 fast conv/MM. If the
-    dtypes are already good, we dont transform. Else, we shift the tensor values and zero points
-    to change the dtype.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Collect the dtypes.
-    data_dtype = types[0].dtype
-    kernel_dtype = types[1].dtype
-
-    # Collect the input exprs.
-    data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale = inputs
-
-    # dp4a supports i8 x i8 fast conv/MM. Don't do anything if it is already satisfied.
-    if data_dtype == "int8" and kernel_dtype == "int8":
-        return None
-
-    # Shift input if necessary.
-    if data_dtype == "uint8":
-        # Compute (QA + 128) and (zp_a + 128)
-        data, input_zero_point = _shift(data, input_zero_point, "int8")
-
-    # Shift kernel if necessary.
-    if kernel_dtype == "uint8":
-        # Compute (QA - 128) and (zp_a - 128)
-        kernel, kernel_zero_point = _shift(kernel, kernel_zero_point, "int8")
-
-    # Call qnn.conv2d with modified inputs and zero points.
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    return relay_op(
-        data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale, **new_attrs
-    )
-
-
-def helper_change_dtypes_to_uint8(attrs, inputs, types, relay_op):
-    """Helper function to change dtypes to uint8 x uint8.
-    Legalizes QNN dense op for Hexagon DSP. It supports fast u8 x u8 vrmpy instruction.
-
-    Converting from int8 to uint8 can be done in following manner:
-
-    Original equation
-      scale * (QA - zp_a)
-      scale * (QA + 128 - 128 - zp_a)
-      scale * ( (QA + 128) - (zp_a + 128))
-
-    Replacing QA + 128 with QA' and (zp_a + 128) with zp_a'
-    We get our new quantized uint8 tensor - scale * (QA' - zp_a')
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Collect the dtypes.
-    data_dtype = types[0].dtype
-    kernel_dtype = types[1].dtype
-
-    # Do nothing since it is already uint8.
-    if data_dtype == "uint8" and kernel_dtype == "uint8":
-        return None
-
-    # Collect the input exprs.
-    data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale = inputs
-
-    # Shift input if necessary.
-    if data_dtype == "int8":
-        # Compute (QA + 128) and (zp_a + 128)
-        data, input_zero_point = _shift(data, input_zero_point, "uint8")
-
-    # Shift kernel if necessary.
-    if kernel_dtype == "int8":
-        # Compute (QA + 128) and (zp_a + 128)
-        kernel, kernel_zero_point = _shift(kernel, kernel_zero_point, "uint8")
-
-    # Call qnn.conv2d/qnn.dense with modified inputs and zero points.
-    new_attrs = dict(attrs)
-    return relay_op(
-        data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale, **new_attrs
-    )
-
-
-# Helper function to change dtypes to be same. ARM dotprod instructions prefer this setting.
-def helper_change_dtypes_to_be_same(attrs, inputs, types, relay_op):
-    """Sometimes MxNet + MLDNN can lead to uint8 x int8 datatypes for the conv inputs. However,
-    many devices like ARM prefer the datatypes to be same for the HW units. This helper transforms
-    conv2d/dense such that both the dtypes are same.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-
-    def _shift(data, zero_point, out_dtype):
-        """Shifts (adds/subtracts) the qnn tensor by 128)"""
-        if out_dtype == "uint8":
-            shift = 128
-        elif out_dtype == "int8":
-            shift = -128
-        else:
-            raise ValueError("Unsupported out dtype.")
-        data_modified = relay.cast(data, "int32")
-        data_modified = relay.add(data_modified, relay.const(shift, "int32"))
-        data_modified = relay.cast(data_modified, out_dtype)
-        zero_point_val = get_scalar_from_constant(zero_point)
-        zero_point_modified = relay.const(zero_point_val + shift, "int32")
-        return (data_modified, zero_point_modified)
-
-    # Collect the dtypes.
-    data_dtype = types[0].dtype
-    kernel_dtype = types[1].dtype
-
-    if data_dtype == kernel_dtype:
-        return None
-
-    # Collect the input exprs.
-    data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale = inputs
-
-    assert (
-        "int8" in data_dtype and "int8" in kernel_dtype
-    ), "Qnn Conv2D/Dense only accepts uint8 or int8 inputs"
-
-    # Shift input if necessary.
-    data, input_zero_point = _shift(data, input_zero_point, kernel_dtype)
-
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    return relay_op(
-        data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale, **new_attrs
-    )
-
-
-def is_fast_int8_on_intel():
-    """Checks whether the hardware has support for fast Int8 arithmetic operations."""
-    return target_has_features("sse4.2")
-
-
-# Helper function to align up given value.
-def helper_align_up(value, aligner):
-    return ((value + aligner) // aligner) * aligner
-
-
-########################
-# ARM CPU legalizations.
-########################
-
-
-@qnn_conv2d_legalize.register("arm_cpu")
-def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
-    target = tvm.target.Target.current(allow_none=False)
-    is_depthwise = relay.op.strategy.is_depthwise_conv2d(
-        types[0].shape,
-        attrs["data_layout"],
-        types[1].shape,
-        attrs["kernel_layout"],
-        attrs["groups"],
-    )
-    use_int8_on_arm = (not is_depthwise) and attrs["data_layout"] == "NHWC"
-    other_options = use_int8_on_arm or target.features.has_dotprod
-    if target.features.has_asimd and not other_options:
-        return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
-    # ARM prefers the dtypes to be same.
-    return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
-
-
-@qnn_dense_legalize.register("arm_cpu")
-def _qnn_dense_legalize_arm_cpu(attrs, inputs, types):
-    target = tvm.target.Target.current(allow_none=False)
-    if target.features.has_asimd and not target.features.has_dotprod:
-        return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
-    # ARM prefers the dtypes to be same.
-    return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
-
-
-##########################
-# Intel CPU legalizations.
-##########################
-
-
-@qnn_conv2d_legalize.register("cpu")
-def _qnn_conv2d_legalize_intel_cpu(attrs, inputs, types):
-    # TODO(vvchernov): not only VNNI
-    # The VNNI transformations prefer uint8 x int8 datatypes.
-    if is_fast_int8_on_intel():
-        return helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay.qnn.op.conv2d)
-    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
-
-
-@qnn_dense_legalize.register("cpu")
-def _qnn_dense_legalize_intel_cpu(attrs, inputs, types):
-    # TODO(vvchernov): not only VNNI
-    # The VNNI transformations prefer uint8 x int8 datatypes.
-    if is_fast_int8_on_intel():
-        return helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay.qnn.op.dense)
-    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
-
-
-#####################
-# CUDA and vulkan legalizations.
-#####################
-
-
-@qnn_conv2d_legalize.register(["cuda", "gpu"])
-def _qnn_conv2d_legalize_cuda(attrs, inputs, types):
-    if is_target("vulkan"):
-        # prefers the dtypes to be same. Mixed type is not yet supported.
-        return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
-    if is_target(["cuda", "rocm"]):
-        # CUDA prefers both datatypes to be int8.
-        return helper_change_dtypes_to_int8(attrs, inputs, types, relay.qnn.op.conv2d)
-    return None
-
-
-@qnn_dense_legalize.register(["cuda", "gpu"])
-def _qnn_dense_legalize_cuda(attrs, inputs, types):
-    if is_target("vulkan"):
-        # prefers the dtypes to be same. Mixed type is not yet supported.
-        return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
-    if is_target(["cuda", "rocm"]):
-        # CUDA prefers both datatypes to be the int8.
-        return helper_change_dtypes_to_int8(attrs, inputs, types, relay.qnn.op.dense)
-    return None
-
-
-########################
-# Hexagon legalizations.
-########################
-
-IN_CHANNEL_VECTOR_LENGTH = 4
-OUT_CHANNEL_VECTOR_LENGTH = 32
-
-
-@qnn_conv2d_legalize.register("hexagon")
-def _qnn_conv2d_legalize_hexagon(attrs, inputs, types):
-    """Legalize qnn.conv2d op for vrmpy tensorization.
-
-    If the inputs are signed or unsigned int8 and data/kernel layouts are NCHW/OIHW, then the input
-    and output channels are padded to be a multiple of 4 and 32 respectively.
-    """
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-
-    if data_layout != "NCHW" or kernel_layout != "OIHW":
-        return None
-
-    data_tensor, kernel_tensor = types[0], types[1]
-
-    if "int8" in data_tensor.dtype and "int8" in kernel_tensor.dtype:
-        in_channel = data_tensor.shape[1].value
-        out_channel = kernel_tensor.shape[0].value
-        ic_modified = False
-        oc_modified = False
-        data, kernel, data_zp, kernel_zp, data_scale, kernel_scale = inputs
-
-        if in_channel % IN_CHANNEL_VECTOR_LENGTH != 0:
-            new_in_channel = helper_align_up(in_channel, IN_CHANNEL_VECTOR_LENGTH)
-            diff = new_in_channel - in_channel
-            pad_width = ((0, 0), (0, diff), (0, 0), (0, 0))
-            data = relay.nn.pad(data, pad_width=pad_width)
-            kernel = relay.nn.pad(kernel, pad_width=pad_width)
-            ic_modified = True
-
-        new_out_channel = out_channel
-        if out_channel % OUT_CHANNEL_VECTOR_LENGTH != 0:
-            new_out_channel = helper_align_up(out_channel, OUT_CHANNEL_VECTOR_LENGTH)
-            diff = new_out_channel - out_channel
-            kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0), (0, 0), (0, 0)))
-            oc_modified = True
-
-            # Pad kernel zero point by 'diff' elements of 0 if it is not scalar
-            kernel_zp_tensor = types[3]
-            if len(kernel_zp_tensor.shape) != 0:
-                assert isinstance(kernel_zp, relay.Constant)
-                padded_kernel_zp_np = np.append(kernel_zp.data.numpy(), [0] * diff)
-                kernel_zp = relay.const(padded_kernel_zp_np)
-
-            # Pad kernel scale by 'diff' elements of 1.0 if it is not scalar
-            kernel_scale_tensor = types[5]
-            if len(kernel_scale_tensor.shape) != 0:
-                assert isinstance(kernel_scale, relay.Constant)
-                padded_kernel_scale_np = np.append(kernel_scale.data.numpy(), [1.0] * diff)
-                kernel_scale = relay.const(padded_kernel_scale_np)
-
-        if ic_modified is True or oc_modified is True:
-            new_attrs = dict(attrs)
-            if oc_modified:
-                new_attrs["channels"] = new_out_channel
-                out = relay.qnn.op.conv2d(
-                    data, kernel, data_zp, kernel_zp, data_scale, kernel_scale, **new_attrs
-                )
-                output_tensor = types[6]
-                original_out_shape = list(output_tensor.shape)
-                out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
-            else:
-                out = relay.qnn.op.conv2d(
-                    data, kernel, data_zp, kernel_zp, data_scale, kernel_scale, **new_attrs
-                )
-
-            return out
-
-    return None
-
-
-@qnn_dense_legalize.register("hexagon")
-def _qnn_dense_legalize_hexagon(attrs, inputs, types):
-    """Legalize qnn.dense op for vrmpy tensorization.
-
-    N dimension of weights should be aligned on vector length. If not, then N dimension is padded to
-    be a multiple of 32.
-    """
-    assert len(types) == 7
-    assert len(inputs) == 6
-
-    data_tensor, kernel_tensor = types[0], types[1]
-    if "int8" not in data_tensor.dtype or "int8" not in kernel_tensor.dtype:
-        return None
-
-    N, _ = kernel_tensor.shape
-
-    if N % OUT_CHANNEL_VECTOR_LENGTH != 0:
-        N_padded = helper_align_up(N, OUT_CHANNEL_VECTOR_LENGTH)
-        diff = N_padded - N
-
-        data, kernel, data_zp, kernel_zp, data_scale, kernel_scale = inputs
-
-        # Pad weights by 'diff'
-        padded_kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0)))
-
-        kernel_zp_tensor, kernel_scale_tensor = types[3], types[5]
-
-        # Pad kernel zero point by 'diff' elements of 0 if it is not scalar
-        if len(kernel_zp_tensor.shape) != 0:
-            assert isinstance(kernel_zp, relay.Constant)
-            assert isinstance(diff, tvm.tir.IntImm)
-            padded_kernel_zp_np = np.append(kernel_zp.data.numpy(), [0] * diff.value)
-            kernel_zp = relay.const(padded_kernel_zp_np)
-
-        # Pad kernel scale by 'diff' elements of 1.0 if it is not scalar
-        if len(kernel_scale_tensor.shape) != 0:
-            assert isinstance(kernel_scale, relay.Constant)
-            assert isinstance(diff, tvm.tir.IntImm)
-            padded_kernel_scale_np = np.append(kernel_scale.data.numpy(), [1.0] * diff.value)
-            kernel_scale = relay.const(padded_kernel_scale_np)
-
-        # If units is explicitly specified, it is used to compute the output shape.
-        # We need to update units after padding to prevent a type error.
-        new_attrs = dict(attrs)
-        if attrs["units"] is not None:
-            new_attrs["units"] = N + diff
-
-        new_inputs = (data, padded_kernel, data_zp, kernel_zp, data_scale, kernel_scale)
-
-        out = relay.qnn.op.dense(*new_inputs, **new_attrs)
-
-        output_tensor = types[6]
-        out = relay.strided_slice(out, begin=[0, 0], end=list(output_tensor.shape))
-        return out
-
-    return None
diff --git a/python/tvm/relay/qnn/op/op.py b/python/tvm/relay/qnn/op/op.py
deleted file mode 100644
index 335947b9f7ce..000000000000
--- a/python/tvm/relay/qnn/op/op.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""The register functions for the QNN dialect."""
-import tvm.ir
-
-
-def register_qnn_legalize(op_name, legal_op=None, level=10):
-    """Register legal transformation function for a QNN op.
-
-    This helps QNN match hardware intrinsics better and is run before
-    canonicalization.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    legal_op: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr
-        The function for transforming an expr to another expr.
-
-    level : int
-        The priority level
-    """
-    return tvm.ir.register_op_attr(op_name, "FTVMQnnLegalize", legal_op, level)
-
-
-def register_qnn_canonicalize(op_name, legal_op=None, level=10):
-    """Register canonicalization function for a QNN op.
-
-    This transforms QNN ops to mainline Relay components.
-
-    Parameters
-    ----------
-    op_name : str
-        The name of the operator
-
-    legal_op: function (Attrs, List[Expr], List[relay.Type]) -> Expr
-        The function for transforming an expr to another expr.
-
-    level : int
-        The priority level
-    """
-
-    return tvm.ir.register_op_attr(op_name, "FTVMQnnCanonicalize", legal_op, level)
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
deleted file mode 100644
index 312fd18bf6b2..000000000000
--- a/python/tvm/relay/qnn/op/qnn.py
+++ /dev/null
@@ -1,1321 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-argument, not-context-manager
-"""QNN dialect operators."""
-
-from __future__ import absolute_import as _abs
-
-import tvm
-import tvm.ir
-from tvm import relay
-from tvm.relay.expr import Tuple, TupleWrapper
-from tvm.relay.op.nn.utils import get_pad_tuple2d
-from tvm.runtime import Object
-from tvm.target import Target
-from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
-from tvm.target.x86 import target_has_features
-
-from . import _make, _requantize
-
-
-@tvm._ffi.register_object("relay.qnn.op.RequantizeConfig")
-class RequantizeConfig(Object):
-    """Configure the requantization behavior by setting config variables.
-
-    Note
-    ----
-    This object is backed by node system in C++, with arguments that can be
-    exchanged between python and C++.
-
-    Do not construct directly, use requantize_config instead.
-
-    The fields that are backed by the C++ node are immutable once an instance
-    is constructed. Use _node_defaults getters to get results for the fields.
-    """
-
-    @staticmethod
-    def _get_node_default_rounding():
-        return "UPWARD"
-
-    @staticmethod
-    def _get_node_default_compute_dtype():
-        target = Target.current(True)
-        if target and str(target.kind) == "llvm":
-            if target_has_features("sse4.1", target):
-                return "float32"
-
-        return "int64"
-
-    _node_defaults = {
-        "rounding": _get_node_default_rounding.__func__,
-        "compute_dtype": _get_node_default_compute_dtype.__func__,
-    }
-
-    # pylint: disable=no-member
-    def __init__(self, handle):
-        """Initialize the function with handle
-
-        Parameters
-        ----------
-        handle : SymbolHandle
-            the handle to the underlying C++ Symbol
-        """
-        super(RequantizeConfig, self).__init__(handle)
-        self.handle = handle
-
-    def __enter__(self):
-        # pylint: disable=protected-access
-        _requantize._EnterRequantizeConfigScope(self)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        _requantize._ExitRequantizeConfigScope()
-
-    def __setattr__(self, name, value):
-        if name in RequantizeConfig._node_defaults:
-            raise AttributeError(f"'{type(self)}' object cannot set attribute '{name}'")
-        return super(RequantizeConfig, self).__setattr__(name, value)
-
-
-def current_requantize_config():
-    """Get the current requantization configuration."""
-    return _requantize._GetCurrentRequantizeConfig()
-
-
-def requantize_config(**kwargs):
-    """Configure the requantization behavior by setting config variables.
-
-    Parameters
-    ---------
-    rounding: "UPWARD" or "TONEAREST"
-        Rounding direction for fixed point multiplications.
-    compute_dtype:
-        Specifies the data type used during requantize.
-        Supported options: \"int64\", \"float32\", \"float64\"
-
-    Returns
-    -------
-    config: RequantizeConfig
-        The requantization configuration
-    """
-    node_args = {
-        k: v() if k not in kwargs else kwargs[k] for k, v in RequantizeConfig._node_defaults.items()
-    }
-    return tvm.ir.make_node("relay.qnn.op.RequantizeConfig", **node_args)
-
-
-def requantize(
-    data,
-    input_scale,
-    input_zero_point,
-    output_scale,
-    output_zero_point,
-    axis=-1,
-    rounding="None",
-    compute_dtype="None",
-    out_dtype="int8",
-):
-    r"""Requantized operator.
-
-    The requantize operator converts one quantized tensor representation to
-    another quantized tensor representation. For the output tensor, we are
-    provided with output scale and zero point. The computation is as follows
-
-    Q_output = zp_output +  (scale_input)/(scale_output) * (Q_input - zp_input)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    input_scale: tvm.relay.Expr
-        The quantization scale for the input tensor.
-
-    input_zero_point: tvm.relay.Expr
-        The zero point of the input tensor.
-
-    output_scale: tvm.relay.Expr
-        The quantization scale for the output tensor.
-
-    output_zero_point: tvm.relay.Expr
-        The zero point of the output tensor.
-
-    axis : int
-        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
-
-    rounding : string, optional
-        Defines the rounding direction when the value is midway between two
-        representable values.
-    compute_dtype:
-        Specifies the data type used during requantize.
-        Supported options: \"int64\", \"float32\", \"float64\"
-    out_dtype : str, optional
-        Specifies the output data type.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _make.requantize(
-        data,
-        input_scale,
-        input_zero_point,
-        output_scale,
-        output_zero_point,
-        axis,
-        rounding,
-        compute_dtype,
-        out_dtype,
-    )
-
-
-def quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"):
-    r"""Quantize op
-    This operator takes float32 input and produces quantized output. The input
-    tensor can be of any shape. The output shape is the same as input shape.
-
-    Q_output = clamp((round(input_tensor/output_scale) + output_zero_point),
-                     out_dtype::min,
-                     out_dtype::max)
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-
-    output_scale : tvm.relay.Expr
-        The output scale.
-
-    output_zero_point : tvm.relay.Expr
-        The output zero_point.
-
-    axis : int
-        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
-
-    out_dtype : str, optional
-        The data type of the output tensor. Can be [int8, unit8, int16, uint16, int32].
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _make.quantize(data, output_scale, output_zero_point, axis, out_dtype)
-
-
-def simulated_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"):
-    r"""Simulated Quantize op
-    Mimics the quantize op but has more flexibility in valid inputs and always
-    outputs the same type as the input. This can be useful for
-    calibrating or training a quantized network.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be quantized. Can be of type float32.
-
-    out_dtype : string or tvm.relay.Expr
-        A string or tensor indicating which datatype to quantize to.
-
-    output_scale : tvm.relay.Expr
-        The output scale.
-
-    output_zero_point : tvm.relay.Expr
-        The output zero_point.
-
-    axis : int
-        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # Convert string dtype to a constant if needed.
-    if isinstance(out_dtype, str):
-        type_code = SQNN_DTYPE_TO_CODE[out_dtype]
-        out_dtype = relay.const(type_code, dtype="int32")
-    # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility.
-    output_scale = relay.op.reshape(output_scale, [-1])
-    output_zero_point = relay.op.reshape(output_zero_point, [-1])
-    return _make.simulated_quantize(data, out_dtype, output_scale, output_zero_point, axis)
-
-
-def dequantize(data, input_scale, input_zero_point, axis=-1, out_dtype="float32"):
-    r"""Dequantize op
-    This operator takes quantized input and produces dequantized float output.
-    The output shape is the same as input shape. The input tensor can be of any shape.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be dequantized. Can be of type [int8, unit8, int16, uint16, int32].
-
-    input_scale : tvm.relay.Expr
-        The input scale.
-
-    input_zero_point : tvm.relay.Expr
-        The input zero_point.
-
-    axis : int
-        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
-
-    out_dtype : str, optional
-        The data type of the output tensor. Can be [float16, float32].
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _make.dequantize(data, input_scale, input_zero_point, axis, out_dtype)
-
-
-def simulated_dequantize(data, input_scale, input_zero_point, axis=-1, in_dtype="int8"):
-    r"""Simulated Dequantize op
-    Mimics the dequantize op but has more flexibility in valid inputs and always
-    outputs the same type as the input. This can be useful for calibrating or
-    training a quantized network.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input tensor to be dequantized.
-
-    in_dtype : string or tvm.relay.Expr
-        A string or tensor indicating which datatype to dequantize from.
-
-    input_scale : tvm.relay.Expr
-        The input scale.
-
-    input_zero_point : tvm.relay.Expr
-        The input zero_point.
-
-    axis : int
-        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # Convert string dtype to a constant if needed.
-    if isinstance(in_dtype, str):
-        type_code = SQNN_DTYPE_TO_CODE[in_dtype]
-        in_dtype = relay.const(type_code, dtype="int32")
-    # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility.
-    input_scale = relay.op.reshape(input_scale, [-1])
-    input_zero_point = relay.op.reshape(input_zero_point, [-1])
-    return _make.simulated_dequantize(data, in_dtype, input_scale, input_zero_point, axis)
-
-
-def concatenate(data, input_scales, input_zero_points, output_scale, output_zero_point, axis):
-    """Concatenate the quantized input tensors along the given axis.
-
-    Parameters
-    ----------
-    data : Union(List[relay.Expr], Tuple[relay.Expr], TupleWrapper[relay.Expr])
-        The list of quantized tensors.
-
-    input_scales : List[relay.Expr]
-        The list of scales of input quantized tensors.
-
-    input_zero_points : List[relay.Expr]
-        The list of zero points of input quantized tensors.
-
-    output_scale : relay.Expr
-        The scale of the output quantized tensor.
-
-    output_zero_point : relay.Expr
-        The zero point of the output quantized tensor.
-
-    axis : int
-        The axis along which the tensors are concatenated.
-
-    Returns
-    -------
-    result: relay.Expr
-        The concatenated quantized tensor.
-    """
-
-    if isinstance(data, (list, tuple)):
-        data = Tuple(data)
-    elif isinstance(data, TupleWrapper):
-        data = data.tuple_value
-    if not isinstance(axis, int):
-        raise ValueError("For now, we only support integer axis")
-    input_scales = list(input_scales)
-    input_zero_points = list(input_zero_points)
-
-    return _make.concatenate(
-        data, Tuple(input_scales), Tuple(input_zero_points), output_scale, output_zero_point, axis
-    )
-
-
-def conv2d(
-    data,
-    kernel,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    channels,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    out_layout="",
-    out_dtype="int32",
-):
-    r"""Quantized 2D convolution.
-
-    This operator convolves quantized data with quantized kernel.
-    If doing Per-channel quantization, qnn expects the kernel_zero_scale
-    and optionally the kernel_zero_point will be 1-D vectors instead of scalars.
-    The scale of the output quantized tensor is the product of the kernel_scale and
-    input_scale of the input quantized tensors. The zero point of the output
-    quantized tensor is 0. By default, the dtype of output is int32. Please also
-    refer to Requantize operator to understand how to scale back the int32
-    output to (u)int8.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    kernel : tvm.relay.Expr
-        The kernel expressions.
-
-    input_zero_point: tvm.relay.Expr
-           The zero point of the data distribution.
-
-    kernel_zero_point: tvm.relay.Expr
-           The zero point of the quantized_kernel distribution.
-
-    input_scale: tvm.relay.Expr
-           The scale for the input tensor. The scale for the input tensor is
-           stored purely for convenience here. See more commentary below.
-
-    kernel_scale: tvm.relay.Expr
-           The scale for the weight tensor. The scale for the weight tensor is
-           stored for access to this during relay. This information is not
-           needed in the pass pipeline after qnn.conv2d is lowered to the
-           sequence of steps as in nn.conv2d. See also input_scale in Requantize.
-
-    kernel_size : tuple of int
-        The spatial width and height of the convolution kernel.
-
-    channels : int
-        Number of output channels of this convolution.
-
-    strides : tuple of int, optional
-        The strides of convolution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs before convolution.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the kernel.
-
-    out_layout : str, optional
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    # TODO enforce 4-way padding in topi/nn/conv2d after #4644 merged
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.conv2d(
-        data,
-        kernel,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        out_dtype,
-    )
-
-
-def conv2d_transpose(
-    data,
-    weight,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    strides=(1, 1),
-    padding=(0, 0),
-    dilation=(1, 1),
-    groups=1,
-    channels=None,
-    kernel_size=None,
-    data_layout="NCHW",
-    kernel_layout="IOHW",
-    out_layout="",
-    output_padding=(0, 0),
-    out_dtype="int32",
-):
-    """This operator deconvolves quantized data with quantized kernel. The scale of
-    the output quantized tensor is the product of the kernel_scale and
-    input_scale of the input quantized tensors. The zero point of the output
-    quantized tensor is 0. By default, the dtype of output is int32. Please also
-    refer to Requantize operator to understand how to scale back the int32
-    output to (u)int8.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The input data to the operator.
-
-    weight : tvm.relay.Expr
-        The weight expressions.
-
-    input_zero_point: tvm.relay.Expr
-           The zero point of the data distribution.
-
-    kernel_zero_point: tvm.relay.Expr
-           The zero point of the quantized_kernel distribution.
-
-    input_scale: tvm.relay.Expr
-           The scale for the input tensor. The scale for the input tensor is
-           stored purely for convenience here. See more commentary below.
-
-    kernel_scale: tvm.relay.Expr
-           The scale for the weight tensor. The scale for the weight tensor is
-           stored for access to this during relay. This information is not
-           needed in the pass pipeline after qnn.conv2d_transpose is lowered to the
-           sequence of steps as in nn.conv2d_transpose. See also input_scale in Requantize.
-
-    strides : Tuple[int], optional
-        The strides of convolution.
-
-    padding : Tuple[int], optional
-        The padding of convolution.
-
-    dilation : Tuple[int], optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    channels : int, optional
-        Number of output channels of this convolution.
-
-    kernel_size : tuple of int, optional
-        The spatial dimensions of the convolution kernel.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    kernel_layout : str, optional
-        Layout of the weight.
-
-    out_layout : Optional[str]
-        Layout of the output, by default, out_layout is the same as data_layout
-
-    output_padding : Tuple[int], optional
-        Used to identify the padding within the output shape
-        (only used in training, where transpose_conv represents the gradient of a convolution )
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-    # convert 2-way padding to 4-way padding
-    padding = get_pad_tuple2d(padding)
-    return _make.conv2d_transpose(
-        data,
-        weight,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        strides,
-        padding,
-        dilation,
-        groups,
-        channels,
-        kernel_size,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        output_padding,
-        out_dtype,
-    )
-
-
-def add(
-    lhs,
-    rhs,
-    lhs_scale,
-    lhs_zero_point,
-    rhs_scale,
-    rhs_zero_point,
-    output_scale,
-    output_zero_point,
-    lhs_axis=-1,
-    rhs_axis=-1,
-):
-    """Quantized addition with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side quantized input data.
-
-    rhs : relay.Expr
-        The right hand side quantized input data.
-
-    lhs_scale: relay.Expr
-        The scale of the lhs quantized expr.
-
-    lhs_zero_point: relay.Expr
-       The zero point of lhs quantized expr.
-
-    rhs_scale: relay.Expr
-        The scale of the rhs quantized expr.
-
-    rhs_zero_point: relay.Expr
-       The zero point of rhs quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    lhs_axis: int
-        The channel axis for lhs quantization. Default value is -1 which corresponds
-        to the last axis.
-
-    rhs_axis: int
-        The channel axis for rhs quantization. Default value is -1 which corresponds
-        to the last axis.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.add(
-        lhs,
-        rhs,
-        lhs_scale,
-        lhs_zero_point,
-        rhs_scale,
-        rhs_zero_point,
-        output_scale,
-        output_zero_point,
-        lhs_axis,
-        rhs_axis,
-    )
-
-
-def dense(
-    data,
-    weight,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    units,
-    out_dtype="int32",
-):
-    """Qnn Dense operator.
-    Applies a quantized linear transformation
-
-     .. math::
-
-     `Y = X * W`
-
-    If doing Per-channel quantization, qnn expects the kernel_zero_scale
-    and optionally the kernel_zero_point will be 1-D vectors instead of scalars.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The quantized input data to the operator.
-    weight : tvm.relay.Expr
-        The quantized weight expressions.
-    input_zero_point: tvm.relay.Expr
-        The input zero point.
-    kernel_zero_point: tvm.relay.Expr
-        The kernel zero point.
-    input_scale: tvm.relay.Expr
-        The scale for the input tensor.
-    kernel_scale: tvm.relay.Expr
-        The scale for the weight tensor. The scale for the weight tensor is
-        stored for access to this during relay. This information is not
-        needed in the pass pipeline after qnn.conv2d is lowered to the
-        sequence of steps as in nn.conv2d. See also input_scale in Requantize.
-    units : int
-        Number of hidden units of the dense transformation.
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision dense can be int32 or int16.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _make.dense(
-        data,
-        weight,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        units,
-        out_dtype,
-    )
-
-
-def contrib_dense_pack(
-    data,
-    weight,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_layout="NC",
-    units=None,
-    out_dtype="int32",
-):
-    """Qnn contrib_dense_pack operator.
-    Applies a quantized linear transformation
-
-     .. math::
-
-     `Y = X * W`
-
-    If doing Per-channel quantization, qnn expects the kernel_zero_scale
-    and optionally the kernel_zero_point will be 1-D vectors instead of scalars.
-
-    Parameters
-    ----------
-    data : tvm.relay.Expr
-        The quantized input data to the operator.
-    weight : tvm.relay.Expr
-        The quantized weight expressions.
-    input_zero_point: tvm.relay.Expr
-        The input zero point.
-    kernel_zero_point: tvm.relay.Expr
-        The kernel zero point.
-    input_scale: tvm.relay.Expr
-        The scale for the input tensor.
-    kernel_scale: tvm.relay.Expr
-        The scale for the weight tensor. The scale for the weight tensor is
-        stored for access to this during relay. This information is not
-        needed in the pass pipeline after qnn.conv2d is lowered to the
-        sequence of steps as in nn.conv2d. See also input_scale in Requantize.
-    kernel_layout: str
-        The layout of weight, such as "NC" or "NC32n4c".
-    units : int, optional
-        Number of hidden units of the dense transformation.
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision dense can be int32 or int16.
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The computed result.
-    """
-
-    return _make.contrib_dense_pack(
-        data,
-        weight,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        kernel_layout,
-        units,
-        out_dtype,
-    )
-
-
-def mul(
-    lhs,
-    rhs,
-    lhs_scale,
-    lhs_zero_point,
-    rhs_scale,
-    rhs_zero_point,
-    output_scale,
-    output_zero_point,
-    lhs_axis=-1,
-    rhs_axis=-1,
-):
-    """Quantized multiplication with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side quantized input data.
-
-    rhs : relay.Expr
-        The right hand side quantized input data.
-
-    lhs_scale: relay.Expr
-        The scale of the lhs quantized expr.
-
-    lhs_zero_point: relay.Expr
-       The zero point of lhs quantized expr.
-
-    rhs_scale: relay.Expr
-        The scale of the rhs quantized expr.
-
-    rhs_zero_point: relay.Expr
-       The zero point of rhs quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    lhs_axis: int
-        The channel axis for lhs quantization. Default value is -1 which corresponds
-        to the last axis.
-
-    rhs_axis: int
-        The channel axis for rhs quantization. Default value is -1 which corresponds
-        to the last axis.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.mul(
-        lhs,
-        rhs,
-        lhs_scale,
-        lhs_zero_point,
-        rhs_scale,
-        rhs_zero_point,
-        output_scale,
-        output_zero_point,
-        lhs_axis,
-        rhs_axis,
-    )
-
-
-def tanh(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized tanh.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.tanh(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def exp(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized exponential function.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.exp(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def sqrt(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized square root.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.sqrt(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def rsqrt(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized reciprocal square root.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.rsqrt(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def erf(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized error function.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.erf(x, scale, zero_point, output_scale, output_zero_point)
-
-
-# pylint: disable=redefined-builtin
-
-
-def abs(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized abs function.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.abs(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def sigmoid(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized sigmoid.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.sigmoid(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def hardswish(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized hardswish.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.hardswish(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def log(x, scale, zero_point, output_scale, output_zero_point):
-    """Quantized log.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-
-    scale: relay.Expr
-        The scale of the quantized expr.
-
-    zero_point: relay.Expr
-       The zero point of quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.log(x, scale, zero_point, output_scale, output_zero_point)
-
-
-def subtract(
-    lhs,
-    rhs,
-    lhs_scale,
-    lhs_zero_point,
-    rhs_scale,
-    rhs_zero_point,
-    output_scale,
-    output_zero_point,
-    lhs_axis=-1,
-    rhs_axis=-1,
-):
-    """Quantized subtraction with numpy-style broadcasting.
-
-    Parameters
-    ----------
-    lhs : relay.Expr
-        The left hand side quantized input data.
-
-    rhs : relay.Expr
-        The right hand side quantized input data.
-
-    lhs_scale: relay.Expr
-        The scale of the lhs quantized expr.
-
-    lhs_zero_point: relay.Expr
-       The zero point of lhs quantized expr.
-
-    rhs_scale: relay.Expr
-        The scale of the rhs quantized expr.
-
-    rhs_zero_point: relay.Expr
-       The zero point of rhs quantized expr.
-
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-
-    lhs_axis: int
-        The channel axis for lhs quantization. Default value is -1 which corresponds
-        to the last axis.
-
-    rhs_axis: int
-        The channel axis for rhs quantization. Default value is -1 which corresponds
-        to the last axis.
-
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-
-    """
-    return _make.subtract(
-        lhs,
-        rhs,
-        lhs_scale,
-        lhs_zero_point,
-        rhs_scale,
-        rhs_zero_point,
-        output_scale,
-        output_zero_point,
-        lhs_axis,
-        rhs_axis,
-    )
-
-
-def batch_matmul(x, y, x_zero_point, y_zero_point, x_scale, y_scale, out_dtype="int32"):
-    r"""
-    Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data
-    in batch.
-
-    .. math::
-
-        \mbox{batch_matmul}(x, y)[i, :, :] = \mbox{matmul}(x[i, :, :], y[i, :, :]^T)
-
-    Parameters
-    ----------
-    x : tvm.relay.Expr
-        The first quantized input.
-        A quantized tensor is represented in following manner
-        `A = scale_a x (QA - zp_A)`
-        where QA is quantized tensor, scale_a and zp_A are quantization
-        params.
-    y : tvm.relay.Expr
-        The second quantized input.
-    x_zero_point: tvm.relay.Expr
-        The first input zero point.
-    y_zero_point: tvm.relay.Expr
-        The second input zero point.
-    x_scale: tvm.relay.Expr
-        The scale for the first input tensor.
-    y_scale: tvm.relay.Expr
-        The scale for the second input tensor.
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision dense can be int32 or int16.
-
-    Returns
-    -------
-    result: tvm.relay.Expr
-        The computed result.
-    """
-    return _make.batch_matmul(x, y, x_zero_point, y_zero_point, x_scale, y_scale, out_dtype)
-
-
-def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zero_point):
-    """Quantized leaky relu.
-
-    Parameters
-    ----------
-    x : relay.Expr
-        The quantized input tensor.
-    alpha: double
-        The alpha value.
-    input_scale: relay.Expr
-        The scale of the input quantized expr.
-    input_zero_point: relay.Expr
-       The zero point of input quantized expr.
-    output_scale: relay.Expr
-        The scale of the output quantized expr.
-    output_zero_point: relay.Expr
-       The zero point of output quantized expr.
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.leaky_relu(
-        x, alpha, input_scale, input_zero_point, output_scale, output_zero_point
-    )
-
-
-def softmax(x, scale, zero_point, output_scale, output_zero_point, axis=-1):
-    return _make.softmax(x, axis, scale, zero_point, output_scale, output_zero_point)
-
-
-def avg_pool2d(
-    data,
-    input_scale,
-    input_zero_point,
-    output_scale,
-    output_zero_point,
-    pool_size,
-    strides,
-    padding,
-    dilation,
-    ceil_mode=False,
-    count_include_pad=True,
-    layout="NHWC",
-    out_layout="",
-):
-
-    """Quantized avg_pool2d
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The quantized input tensor.
-    input_scale: float
-        The scale of the input quantized expr.
-    input_zero_point: int
-        The zero point of input quantized expr.
-    output_scale: flaot
-        The scale of the output quantized expr.
-    output_zero_point: int
-       The zero point of output quantized expr.
-    pool_size : relay.Expr
-        The pool_size
-    strides : relay.Expr
-        The strides
-    padding : relay.Expr
-        The padding size
-    dilation : relay.Expr
-        The dilation size
-    ceil_mode : bool, optional
-        Whether to use ceil or floor for calculating the output shape
-    count_include_pad : bool, optional
-        Determines if padding should be taken into account in the computation
-    layout: string, optinal
-    out_layout: string, optional
-    Returns
-    -------
-    result : relay.Expr
-        The computed result.
-    """
-    return _make.avg_pool2d(
-        data,
-        input_scale,
-        input_zero_point,
-        output_scale,
-        output_zero_point,
-        pool_size,
-        strides,
-        padding,
-        dilation,
-        ceil_mode,
-        count_include_pad,
-        layout,
-        out_layout,
-    )
diff --git a/python/tvm/relay/qnn/strategy/__init__.py b/python/tvm/relay/qnn/strategy/__init__.py
deleted file mode 100644
index d7b669a4fa42..000000000000
--- a/python/tvm/relay/qnn/strategy/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""QNN op strategies."""
-from __future__ import absolute_import as _abs
-
-from .generic import *
-from . import arm_cpu
-from . import hexagon
diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py
deleted file mode 100644
index bddfd7de3a56..000000000000
--- a/python/tvm/relay/qnn/strategy/arm_cpu.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Quantized operator strategy for Arm CPU.
-
-As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current
-schedules only work for fused operators with bias, as this is the most common use case. Only
-regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""
-
-from tvm import topi, TVMError
-from tvm.topi.utils import get_const_tuple
-from ... import op as _op
-from ...op.strategy.generic import is_depthwise_conv2d
-from .generic import (
-    qnn_conv2d_strategy,
-    qnn_dense_strategy,
-    qnn_dequantize_strategy,
-    qnn_quantize_strategy,
-    wrap_compute_dequantize,
-    wrap_compute_quantize,
-    wrap_topi_qnn_dense,
-    wrap_topi_schedule,
-)
-
-
-@qnn_quantize_strategy.register("arm_cpu")
-def qnn_quantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
-    """qnn.quantize strategy for arm_cpu"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_quantize(topi.hexagon.qnn_quantize),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_quantize),
-        name="qnn_quantize.arm_cpu",
-    )
-    return strategy
-
-
-@qnn_dequantize_strategy.register("arm_cpu")
-def qnn_dequantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
-    """qnn.dequantize strategy for arm_cpu"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dequantize(topi.hexagon.qnn_dequantize),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_dequantize),
-        name="qnn_dequantize.arm_cpu",
-    )
-    return strategy
-
-
-@qnn_dense_strategy.register("arm_cpu")
-def qnn_dense_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
-    """qnn.dense strategy for arm_cpu"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_qnn_dense(topi.hexagon.qnn_dense),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_dense),
-        name="qnn_dense.arm_cpu",
-    )
-    return strategy
-
-
-@qnn_conv2d_strategy.register("arm_cpu")
-def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
-    """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.
-
-    When computing convolutions, we want data that will be used to compute the same output values to
-    be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions.
-
-    For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW
-    layouts to the best job of keeping "related" data close. In contrast, computing one output of a
-    regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these
-    are the layouts we support.
-    """
-
-    if not (target.features.has_dsp and "cortex-m" in target.mcpu):
-        raise TVMError(
-            "Quantized Arm schedules only exist for Cortex-M with DSP! "
-            "The qnn.Legalize pass should be run for other Arm processors."
-        )
-
-    data = inputs[0]
-    kernel = inputs[1]
-    data_layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    groups = attrs.groups
-    strategy = _op.OpStrategy()
-
-    if groups == 1:
-        if data_layout == "NHWC" and kernel_layout == "OHWI":
-            strategy.add_implementation(
-                topi.arm_cpu.qnn_conv2d,
-                topi.arm_cpu.schedule_qnn_conv2d,
-                name="qnn_conv2d.arm_cpu",
-            )
-        else:
-            raise TVMError("QNN regular Conv2D for Arm Cortex-M DSP got incorrect input layout!")
-    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
-        if data_layout == "NCHW" and kernel_layout == "IOHW":
-            height, width = data.shape[2:]
-            y_stride, x_stride = get_const_tuple(attrs.strides)
-            if height * width * y_stride % 2 == 0:
-                strategy.add_implementation(
-                    topi.arm_cpu.qnn_depthwise_conv2d,
-                    topi.arm_cpu.schedule_qnn_depthwise_conv2d,
-                    name="qnn_depthwise_conv2d.arm_cpu",
-                )
-            elif y_stride == x_stride == 1:
-                strategy.add_implementation(
-                    topi.arm_cpu.qnn_unrolled_depthwise_conv2d,
-                    topi.arm_cpu.schedule_qnn_unrolled_depthwise_conv2d,
-                    name="qnn_unrolled_depthwise_conv2d.arm_cpu",
-                )
-            else:
-                raise TVMError("No QNN depthwise Conv2D Cortex-M schedule supports these params!")
-        else:
-            raise TVMError("QNN depthwise Conv2D for Arm Cortex-M DSP got incorrect input layout!")
-    else:
-        raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d")
-
-    return strategy
diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py
deleted file mode 100644
index 4c5884ffdc15..000000000000
--- a/python/tvm/relay/qnn/strategy/generic.py
+++ /dev/null
@@ -1,321 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of generic operator strategy."""
-
-from tvm.target import override_native_generic_func
-
-
-def wrap_topi_schedule(topi_schedule):
-    """Wrap TOPI schedule which doesn't use attrs"""
-
-    def wrapper(_attrs, outs, target):
-        with target:
-            return topi_schedule(outs)
-
-    return wrapper
-
-
-def wrap_topi_compute(topi_compute):
-    """Wrap TOPI compute which doesn't use attrs"""
-
-    def wrapper(_attrs, inputs, _out_type):
-        return [topi_compute(*inputs)]
-
-    return wrapper
-
-
-def wrap_compute_quantize(topi_compute):
-    """Wrap TOPI compute which use axis and out data type from attrs"""
-
-    def wrapper(attrs, inputs, _out_type):
-        axis = attrs.axis
-        out_dtype = attrs.out_dtype
-        args = [*inputs, axis, out_dtype]
-        return [topi_compute(*args)]
-
-    return wrapper
-
-
-def wrap_compute_dequantize(topi_compute):
-    """Wrap TOPI compute which use axis from attrs"""
-
-    def wrapper(attrs, inputs, _out_type):
-        args = [*inputs, attrs.axis]
-        return [topi_compute(*args)]
-
-    return wrapper
-
-
-def wrap_topi_qnn_conv2d(topi_compute):
-    """Wrap TOPI compute which use conv2d attrs and output data type"""
-
-    def wrapper(attrs, inputs, out_type):
-        out_dtype = out_type.dtype
-        oshape = out_type.shape
-        strides = attrs.strides
-        padding = attrs.padding
-        dilation = attrs.dilation
-        if len([*inputs]) == 11:
-            args = [*inputs, strides, padding, dilation, oshape, out_dtype]
-        elif len([*inputs]) == 10:
-            args = [  # QNN Conv2d params:
-                inputs[0],
-                inputs[1],
-                inputs[2],
-                inputs[3],
-                inputs[4],
-                inputs[5],
-                # Bias argument
-                None,
-                # Requantization params:
-                inputs[6],
-                inputs[7],
-                inputs[8],
-                inputs[9],
-                # Conv2d attrs:
-                strides,
-                padding,
-                dilation,
-                oshape,
-                out_dtype,
-            ]
-        else:
-            assert len([*inputs]) == 6
-            args = [  # QNN Conv2d params:
-                *inputs,
-                # Bias argument:
-                None,
-                # Requantization params:
-                None,
-                None,
-                None,
-                None,
-                strides,
-                padding,
-                dilation,
-                oshape,
-                out_dtype,
-            ]
-        return [topi_compute(*args)]
-
-    return wrapper
-
-
-def wrap_topi_qnn_dense(topi_compute):
-    """Wrap TOPI compute which use qnn.dense attrs"""
-
-    def wrapper(_attrs, inputs, out_type):
-        out_dtype = out_type.dtype
-        if len([*inputs]) == 11:
-            args = [*inputs, out_dtype]
-        elif len([*inputs]) == 10:
-            args = [  # QNN Dense params:
-                inputs[0],
-                inputs[1],
-                inputs[2],
-                inputs[3],
-                inputs[4],
-                inputs[5],
-                # Bias argument
-                None,
-                # Requantization params:
-                inputs[6],
-                inputs[7],
-                inputs[8],
-                inputs[9],
-                out_dtype,
-            ]
-        else:
-            assert len([*inputs]) == 6
-            args = [  # QNN Dense params:
-                *inputs,
-                # Bias argument:
-                None,
-                # Requantization params:
-                None,
-                None,
-                None,
-                None,
-                out_dtype,
-            ]
-        return [topi_compute(*args)]
-
-    return wrapper
-
-
-def wrap_compute_qnn_avg_pool2d(topi_compute):
-    """Wrap qnn.avg_pool2d topi compute"""
-
-    def wrapper(attrs, inputs, out_type):
-        kernel = attrs.pool_size
-        strides = attrs.strides
-        padding = attrs.padding
-        dilation = attrs.dilation
-        count_include_pad = attrs.count_include_pad
-        oshape = out_type.shape
-        odtype = out_type.dtype
-        args = [
-            inputs[0],
-            kernel,
-            strides,
-            padding,
-            dilation,
-            count_include_pad,
-            oshape,
-            odtype,
-            inputs[1],
-            inputs[2],
-            inputs[3],
-            inputs[4],
-        ]
-        return [topi_compute(*args)]
-
-    return wrapper
-
-
-def wrap_topi_concatenate(topi_compute):
-    """Wrap TOPI compute which use qnn.concatenate attrs"""
-
-    def wrapper(attrs, inputs, out_type):
-        return [topi_compute(inputs, attrs.axis, out_type.dtype)]
-
-    return wrapper
-
-
-def wrap_topi_qnn_batch_matmul(topi_compute):
-    """Wrap TOPI compute which use qnn.batch_matmul attrs"""
-
-    def wrapper(attrs, inputs, _out_type):
-        assert len([*inputs]) == 6
-        args = [*inputs, attrs.transpose_a, attrs.transpose_b, attrs.out_dtype]
-        return [topi_compute(*args)]
-
-    return wrapper
-
-
-@override_native_generic_func("qnn_quantize_strategy")
-def qnn_quantize_strategy(attrs, inputs, out_type, target):
-    """qnn.quantize generic strategy"""
-    raise RuntimeError(
-        "qnn.quantize is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_dequantize_strategy")
-def qnn_dequantize_strategy(attrs, inputs, out_type, target):
-    """qnn.dequantize generic strategy"""
-    raise RuntimeError(
-        "qnn.dequantize is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_requantize_strategy")
-def qnn_requantize_strategy(attrs, inputs, out_type, target):
-    """qnn.requantize generic strategy"""
-    raise RuntimeError(
-        "qnn.requantize is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_add_strategy")
-def qnn_add_strategy(attrs, inputs, out_type, target):
-    """qnn.add generic strategy"""
-    raise RuntimeError(
-        "qnn.add is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_subtract_strategy")
-def qnn_subtract_strategy(attrs, inputs, out_type, target):
-    """qnn.subtract generic strategy"""
-    raise RuntimeError(
-        "qnn.subtract is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_mul_strategy")
-def qnn_mul_strategy(attrs, inputs, out_type, target):
-    """qnn.mul generic strategy"""
-    raise RuntimeError(
-        "qnn.mul is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_tanh_strategy")
-def qnn_tanh_strategy(attrs, inputs, out_type, target):
-    """qnn.tanh generic strategy"""
-    raise RuntimeError(
-        "qnn.tanh is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_concatenate_strategy")
-def qnn_concatenate_strategy(attrs, inputs, out_type, target):
-    """qnn.concatenate generic strategy"""
-    raise RuntimeError(
-        "qnn.concatenate is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_conv2d_strategy")
-def qnn_conv2d_strategy(attrs, inputs, out_type, target):
-    """qnn.conv2d generic strategy"""
-    raise RuntimeError(
-        "qnn.conv2d is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_dense_strategy")
-def qnn_dense_strategy(attrs, inputs, out_type, target):
-    """qnn.dense generic strategy"""
-    raise RuntimeError(
-        "qnn.dense is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_dense_pack_strategy")
-def qnn_dense_pack_strategy(attrs, inputs, out_type, target):
-    """qnn.contrib_dense_pack generic strategy"""
-    raise RuntimeError("qnn.contrib_dense_pack is currently only supported with Hexagon. ")
-
-
-@override_native_generic_func("qnn_batch_matmul_strategy")
-def qnn_batch_matmul_strategy(attrs, inputs, out_type, target):
-    """qnn.batch_matmul generic strategy"""
-    raise RuntimeError(
-        "qnn.batch_matmul is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
-
-
-@override_native_generic_func("qnn_avg_pool2d_strategy")
-def qnn_avg_pool2d_strategy(attrs, inputs, out_type, target):
-    """qnn.avg_pool2d generic strategy"""
-    raise RuntimeError(
-        "qnn.avg_pool2d is currently only supported with Hexagon. "
-        "Please run QNN Canonicalize pass to decompose this op into supported ops."
-    )
diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py
deleted file mode 100644
index 3edbce34e30f..000000000000
--- a/python/tvm/relay/qnn/strategy/hexagon.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of Hexagon operator strategy."""
-# pylint: disable=unused-argument,wildcard-import,unused-wildcard-import
-
-import re
-
-from tvm import topi
-from .generic import *
-from ... import op as _op
-from ...op.strategy.generic import is_depthwise_conv2d
-
-
-NCHWC_MATCHER = re.compile("^NCHW[0-9]+c$")
-OIHWIOI_MATCHER = re.compile("^OIHW[0-9]+i[0-9]+o[0-9]+i$")
-
-
-@qnn_quantize_strategy.register("hexagon")
-def qnn_quantize_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.quantize strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_quantize(topi.hexagon.qnn_quantize),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_quantize),
-        name="qnn_quantize.hexagon",
-    )
-    return strategy
-
-
-@qnn_dequantize_strategy.register("hexagon")
-def qnn_dequantize_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.dequantize strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dequantize(topi.hexagon.qnn_dequantize),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_dequantize),
-        name="qnn_dequantize.hexagon",
-    )
-    return strategy
-
-
-@qnn_requantize_strategy.register("hexagon")
-def qnn_requantize_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.requantize strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_quantize(topi.hexagon.qnn_requantize),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_requantize),
-        name="qnn_requantize.hexagon",
-    )
-    return strategy
-
-
-@qnn_add_strategy.register("hexagon")
-def qnn_add_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.add strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_compute(topi.hexagon.qnn_add),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_add),
-        name="qnn_add.hexagon",
-    )
-    return strategy
-
-
-@qnn_subtract_strategy.register("hexagon")
-def qnn_subtract_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.subtract strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_compute(topi.hexagon.qnn_subtract),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_subtract),
-        name="qnn_subtract.hexagon",
-    )
-    return strategy
-
-
-@qnn_mul_strategy.register("hexagon")
-def qnn_mul_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.mul strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_compute(topi.hexagon.qnn_mul),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_mul),
-        name="qnn_mul.hexagon",
-    )
-    return strategy
-
-
-@qnn_tanh_strategy.register("hexagon")
-def qnn_tanh_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.tanh strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_compute(topi.hexagon.qnn_tanh),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_tanh),
-        name="qnn_tanh.hexagon",
-    )
-    return strategy
-
-
-@qnn_concatenate_strategy.register("hexagon")
-def qnn_concatenate_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.concatenate strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_concatenate(topi.hexagon.qnn_concatenate),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_concatenate),
-        name="qnn_concatenate.hexagon",
-    )
-    return strategy
-
-
-@qnn_conv2d_strategy.register("hexagon")
-def qnn_conv2d_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.conv2d strategy for Hexagon"""
-    data = inputs[0]
-    kernel = inputs[1]
-    data_layout = attrs.data_layout
-    kernel_layout = attrs.kernel_layout
-    groups = attrs.groups
-    strategy = _op.OpStrategy()
-    if groups == 1:
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            strategy.add_implementation(
-                wrap_topi_qnn_conv2d(topi.hexagon.qnn_conv2d),
-                wrap_topi_schedule(topi.hexagon.schedule_qnn_conv2d),
-                name="qnn_conv2d.hexagon",
-            )
-        elif NCHWC_MATCHER.match(data_layout) and OIHWIOI_MATCHER.match(kernel_layout):
-            if data.dtype == "uint8" and kernel.dtype == "int8":
-                strategy.add_implementation(
-                    wrap_topi_qnn_conv2d(topi.hexagon.qnn_conv2d_NCHWc_int8),
-                    wrap_topi_schedule(topi.hexagon.schedule_qnn_conv2d_NCHWc_int8),
-                    name="qnn_conv2d_NCHWc_int8.hexagon",
-                )
-    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            strategy.add_implementation(
-                wrap_topi_qnn_conv2d(topi.hexagon.qnn_depthwise_conv2d),
-                wrap_topi_schedule(topi.hexagon.schedule_qnn_depthwise_conv2d),
-                name="qnn_depthwise_conv2d.hexagon",
-            )
-    else:
-        raise RuntimeError("Unsupported strategy for group qnn.conv2d")
-
-    return strategy
-
-
-@qnn_dense_strategy.register("hexagon")
-def qnn_dense_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.dense strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_qnn_dense(topi.hexagon.qnn_dense),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_dense),
-        name="qnn_dense.hexagon",
-    )
-    return strategy
-
-
-@qnn_dense_pack_strategy.register("hexagon")
-def qnn_dense_pack_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.contrib_dense_pack strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    if (
-        "uint8" in inputs[0].dtype
-        and "int8" in inputs[1].dtype
-        and attrs["weight_layout"] == "NC32n4c"
-    ):
-        # uint8 + uint8|int8 case
-        strategy.add_implementation(
-            wrap_topi_qnn_dense(topi.hexagon.qnn_dense_pack_vrmpy),
-            wrap_topi_schedule(topi.hexagon.schedule_qnn_dense_pack_vrmpy),
-            name="qnn_dense_pack_vrmpy.hexagon",
-        )
-    return strategy
-
-
-@qnn_batch_matmul_strategy.register("hexagon")
-def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.batch_matmul strategy for Hexagon"""
-    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_topi_qnn_batch_matmul(topi.hexagon.qnn_batch_matmul),
-        wrap_topi_schedule(topi.hexagon.schedule_qnn_batch_matmul),
-        name="qnn_batch_matmul.hexagon",
-    )
-    return strategy
-
-
-@qnn_avg_pool2d_strategy.register(["hexagon"])
-def qnn_avg_pool2d_strategy_hexagon(attrs, inputs, out_type, target):
-    """qnn.avg_pool2d strategy for Hexagon"""
-    data_layout = attrs.layout
-    if data_layout == "NHWC":
-        strategy = _op.OpStrategy()
-        strategy.add_implementation(
-            wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NHWC),
-            wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d),
-            name="qnn_avg_pool2d.hexagon",
-        )
-        return strategy
-    elif data_layout == "NCHW":
-        strategy = _op.OpStrategy()
-        strategy.add_implementation(
-            wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NCHW),
-            wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d),
-            name="qnn_avg_pool2d.hexagon",
-        )
-        return strategy
-    else:
-        raise RuntimeError("Unsupported strategy for qnn.avg_pool2d")
diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py
deleted file mode 100644
index 0485cecb99c5..000000000000
--- a/python/tvm/relay/qnn/transform.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,arguments-differ,no-else-return,unused-argument,missing-docstring
-"""
-QNN pass transformation infrastructure.
-"""
-from tvm import relay
-
-
-def CanonicalizeOps():
-    """Converts/Lowers an expression containing QNN ops to an expression containing only core
-    (non-Dialect) Relay ops. Each QNN op is lowered to a sequence of existing Relay ops. This is a
-    target-independent pass. One can register the lowering/transformation function for this op using
-    FTVMQnnCanonicalize attr_name for FTVMLegalize op attribute.  An example of this transformation
-    is below
-
-    Examples
-    ________
-
-    .. code-block:: python
-
-        # Original expression
-        qnn_expr = relay.qnn.op.requantize(y,
-                                           input_scale=1,
-                                           input_zero_point=0,
-                                           output_scale=1,
-                                           output_zero_point=0,
-                                           out_dtype='int8')
-
-        # We want to utilize all the existing Relay infrastructure. So, instead of supporting this
-        # QNN requantize op, we convert it into a sequence of existing Relay operators.
-        mod = tvm.IRModule.from_expr(qnn_expr)
-        mod = relay.qnn.transform.CanonicalizeOps()(mod)
-        relay_expr = mod['main']
-        print(relay_expr)
-
-        def @main(%quantized_data: Tensor[(200), int32]) -> Tensor[(200), int8] {
-          %0 = cast(%quantized_data, dtype="int64") /* ty=Tensor[(200), int64] */;
-          %1 = multiply(%0, 2 /* ty=int64 */) /* ty=Tensor[(200), int64] */;
-          %2 = multiply(%1, 1073741824 /* ty=int64 */) /* ty=Tensor[(200), int64] */;
-          %3 = add(%2, 1073741824 /* ty=int64 */) /* ty=Tensor[(200), int64] */;
-          %4 = right_shift(%3, 31 /* ty=int64 */) /* ty=Tensor[(200), int64] */;
-          %5 = add(0 /* ty=int64 */, %4) /* ty=Tensor[(200), int64] */;
-          %6 = clip(%5, a_min=-128f, a_max=127f) /* ty=Tensor[(200), int64] */;
-          cast(%6, dtype="int8") /* ty=Tensor[(200), int8] */
-        }
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that canonicalizes QNN ops to Relay ops.
-    """
-
-    return relay.transform.Legalize("FTVMQnnCanonicalize")
-
-
-def Legalize():
-    """Legalizes QNN ops. As opposed to Relay Legalize, this one legalizes only QNN ops. One can
-    register a transformation/legalization function for an op by using the FTVMQnnLegalize attr_name
-    for FTVMLegalize op attribute. The isolation of QNN and Relay Legalize gives us separation of
-    concerns, leading to a better software practice. The legalization can be configured to happen
-    per target. An example of this type of legalization is shown below.
-
-    Examples
-    ________
-
-    Suppose the original graph is as follows
-
-            data(u8)  weight(u8)
-                |       |
-                |       |
-               qnn.conv2d (int32)
-                   |
-                   |
-                nn.relu (int32)
-
-    Now, we know that Intel Cascade Lake has VNNI instructions to speedup convolution. However, it
-    only works on u8 x i8 inputs. So, here, we can use QNN Legalize to transform the above graph as
-    follows
-
-            data(u8)  weight(u8)
-               |          |
-               |          |
-               |     requantize(i8)
-               |        |
-               |        |
-               qnn.conv2d (int32)
-                   |
-                   |
-                 nn.relu (int32)
-
-    In this legalization, since we have isolated legalization for QNN ops, it will only trigger the
-    transformation for qnn.conv2d (and not nn.relu). This pass can be followed by CanonicalizeOps to
-    further lower the qnn.requantize and qnn.conv2d into an expr containing only Relay ops.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that legalizes QNN ops.
-    """
-
-    return relay.transform.Legalize("FTVMQnnLegalize")
diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py
deleted file mode 100644
index 428c6e97e032..000000000000
--- a/python/tvm/relay/quantize/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin
-"""Automatic quantization utilities."""
-from __future__ import absolute_import as _abs
-
-from .quantize import *
-from ._partition import register_partition_function
-from ._annotate import register_annotate_function
diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py
deleted file mode 100644
index b6d6c921a8a1..000000000000
--- a/python/tvm/relay/quantize/_annotate.py
+++ /dev/null
@@ -1,444 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument,inconsistent-return-statements
-"""Internal module for registering attribute for annotation."""
-import warnings
-from tvm import topi
-import tvm._ffi
-from tvm.relay.op import op as _reg
-from .. import expr as _expr
-from .. import analysis as _analysis
-from .. import op as _op
-from . import _quantize
-from .quantize import QAnnotateKind, current_qconfig, quantize_context
-from .quantize import _forward_op
-
-
-@_op.register_compute("relay.op.annotation.simulated_quantize")
-def simulated_quantize_compute(attrs, inputs, out_type):
-    """Compiler for simulated_quantize."""
-    assert len(inputs) == 4
-    assert attrs.sign
-    assert attrs.rounding == "round"
-
-    data, scale, clip_min, clip_max = inputs
-
-    if attrs.kind == QAnnotateKind.IDENTITY:
-        return [topi.identity(data)]
-
-    # simulate rounding error
-    scaled_data = topi.divide(data, scale)
-    clipped_data = topi.maximum(topi.minimum(scaled_data, clip_max), clip_min)
-    round_data = topi.round(clipped_data)
-
-    # recover data
-    rdata = topi.multiply(round_data, scale)
-    return [rdata]
-
-
-_reg.register_injective_schedule("relay.op.annotation.simulated_quantize")
-_reg.register_pattern("relay.op.annotation.simulated_quantize", _reg.OpPattern.ELEMWISE)
-_reg.register_injective_schedule("annotation.cast_hint")
-
-
-@tvm._ffi.register_object("relay.QAnnotateExpr")
-class QAnnotateExpr(_expr.TempExpr):
-    """A special kind of Expr for Annotating.
-
-    Parameters
-    ---------
-    expr: Expr
-        the original relay ir expr.
-
-    kind: QAnnotateKind
-        the kind of annotation field.
-    """
-
-    def __init__(self, expr, kind):
-        self.__init_handle_by_constructor__(_quantize.make_annotate_expr, expr, kind)
-
-
-def _get_expr_kind(anno):
-    """Get the expression and QAnnotateKind from QAnnotateExpr or Expr"""
-    if isinstance(anno, QAnnotateExpr):
-        return anno.expr, anno.kind
-    return anno, None
-
-
-def register_annotate_function(op_name, frewrite=None, level=10):
-    """register a rewrite function for operator, used by annotation.
-
-    Parameters
-    ---------
-    op_name: str
-        The name of operation
-
-    frewrite : function, optional
-        The function to be registered.
-
-    level : int, optional
-        The priority level
-    """
-
-    def default_rewrite(ref_call, new_args, ctx):
-        # recover from QAnnotateExpr
-        args = [_get_expr_kind(x)[0] for x in new_args]
-        return _forward_op(ref_call, args)
-
-    def _register(func):
-        """internal register function"""
-
-        def frewrite_with_guard(ref_call, new_args, ctx):
-            if not current_qconfig().guard(ref_call):
-                return default_rewrite(ref_call, new_args, ctx)
-            return func(ref_call, new_args, ctx)
-
-        return tvm.ir.register_op_attr(op_name, "FQAnnotateRewrite", frewrite_with_guard, level)
-
-    return _register(frewrite) if frewrite is not None else _register
-
-
-def attach_simulated_quantize(data, kind, sign=True, rounding="round"):
-    """Attach a simulated quantize operation after input data expr.
-
-    Parameters
-    ---------
-    data: Expr
-        the original data expr.
-
-    kind: QAnnotateKind
-        the kind of annotation field.
-    """
-    quantize_op = _op.get("relay.op.annotation.simulated_quantize")
-    if isinstance(data, _expr.Call) and data.op == quantize_op:
-        if data.attrs.kind == kind and data.attrs.sign == sign and data.attrs.rounding == rounding:
-            return data
-
-    qctx = quantize_context()
-    key = tuple([data, kind, sign, rounding])
-    if key in qctx.qnode_map:
-        return qctx.qnode_map[key]
-
-    dom_scale = _expr.var("dom_scale")
-    clip_min = _expr.var("clip_min")
-    clip_max = _expr.var("clip_max")
-    qnode = _quantize.simulated_quantize(data, dom_scale, clip_min, clip_max, kind, sign, rounding)
-    qctx.qnode_map[key] = qnode
-    return qnode
-
-
-tvm._ffi.register_func("relay.quantize.attach_simulated_quantize", attach_simulated_quantize)
-
-
-@register_annotate_function("nn.contrib_conv2d_NCHWc")
-def conv2d_nchwc_rewrite(ref_call, new_args, ctx):
-    warnings.warn(
-        "NCHWc layout Conv2D detected, please use a lower "
-        "optimization level before applying the quantization "
-        "pass as quantization will have no effect here..."
-    )
-
-
-@register_annotate_function("nn.conv2d")
-def conv2d_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for conv2d. Lhs of conv will be quantized to
-    input field, and rhs of conv will be quantized to weight field.
-    Output would be in activation field"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
-    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
-
-    if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION:
-        lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-
-    assert rhs_kind is None
-    rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
-
-    expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-
-    return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-
-@register_annotate_function("nn.conv1d")
-def conv1d_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for conv1d. Lhs of conv will be quantized to
-    input field, and rhs of conv will be quantized to weight field.
-    Output would be in activation field"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
-    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
-
-    if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION:
-        lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-
-    assert rhs_kind is None
-    rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
-
-    expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-
-    return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-
-@register_annotate_function("nn.dense")
-def dense_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of
-    dense will be quantized to weight field. Output would be in activation field."""
-
-    if current_qconfig().skip_dense_layer:
-        return None
-
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
-    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
-
-    if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION:
-        lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-
-    assert rhs_kind is None
-    rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
-
-    expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-
-    return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-
-@register_annotate_function("multiply")
-def multiply_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for multiply."""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
-    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
-
-    if lhs_kind is None and rhs_kind is None:
-        return None
-
-    if lhs_kind in [QAnnotateKind.ACTIVATION, QAnnotateKind.INPUT] and rhs_kind is None:
-        # quantize lhs to INPUT field
-        if lhs_kind == QAnnotateKind.ACTIVATION:
-            lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-        if _analysis.check_constant(rhs_expr):
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
-        else:
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
-        expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-        return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-    if rhs_kind in [QAnnotateKind.ACTIVATION, QAnnotateKind.INPUT] and lhs_kind is None:
-        # quantize rhs to INPUT field
-        if rhs_kind == QAnnotateKind.ACTIVATION:
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
-        if _analysis.check_constant(lhs_expr):
-            lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.WEIGHT)
-        else:
-            lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-        expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-        return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-    raise ValueError
-
-
-@register_annotate_function("add")
-def add_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for add."""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
-    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
-
-    if lhs_kind is None and rhs_kind is None:
-        # trivial case
-        return None
-
-    if lhs_kind is None and rhs_kind is not None:
-        # quantize lhs to INPUT field if it is normal expression
-        assert rhs_kind in [QAnnotateKind.INPUT, QAnnotateKind.ACTIVATION]
-        lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-        expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-        return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-    if lhs_kind is not None and rhs_kind is None:
-        if _analysis.check_constant(rhs_expr):
-            # - introduced by batch_norm: add(out, const)
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
-        else:
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
-        expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-        return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-    if lhs_kind is not None and rhs_kind is not None:
-        if lhs_kind == QAnnotateKind.INPUT and rhs_kind == QAnnotateKind.INPUT:
-            expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-            return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-        if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION:
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
-            expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-            return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-        if (lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.INPUT) or (
-            lhs_kind == QAnnotateKind.INPUT and rhs_kind == QAnnotateKind.ACTIVATION
-        ):
-            expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-            return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-    raise ValueError()
-
-
-def identity_rewrite(ref_call, new_args, ctx):
-    """Simply forward the original operation"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    x_expr, x_kind = _get_expr_kind(new_args[0])
-    if x_kind is None:
-        return None
-
-    ret_expr = _forward_op(ref_call, [x_expr])
-    return QAnnotateExpr(ret_expr, x_kind)
-
-
-register_annotate_function("reshape", identity_rewrite)
-register_annotate_function("clip", identity_rewrite)
-register_annotate_function("nn.relu", identity_rewrite)
-register_annotate_function("strided_slice", identity_rewrite)
-register_annotate_function("nn.avg_pool2d", identity_rewrite)
-register_annotate_function("nn.batch_flatten", identity_rewrite)
-register_annotate_function("transpose", identity_rewrite)
-register_annotate_function("annotation.stop_fusion", identity_rewrite)
-
-
-def pool2d_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for max pool2d"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    expr, x_kind = _get_expr_kind(new_args[0])
-
-    if x_kind is None:
-        return None
-    if x_kind == QAnnotateKind.ACTIVATION:
-        expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT)
-
-    expr = _forward_op(ref_call, [expr])
-    return QAnnotateExpr(expr, QAnnotateKind.INPUT)
-
-
-register_annotate_function("nn.max_pool2d", pool2d_rewrite)
-
-
-def pool1d_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for max pool1d"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    expr, x_kind = _get_expr_kind(new_args[0])
-
-    if x_kind is None:
-        return None
-    if x_kind == QAnnotateKind.ACTIVATION:
-        expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT)
-
-    expr = _forward_op(ref_call, [expr])
-    return QAnnotateExpr(expr, QAnnotateKind.INPUT)
-
-
-register_annotate_function("nn.max_pool1d", pool1d_rewrite)
-
-
-@register_annotate_function("annotation.cast_hint")
-def cast_hint_rewrite(ref_call, new_args, ctx):
-    """Rewrite function to force cast"""
-    expr, x_kind = _get_expr_kind(new_args[0])
-
-    if quantize_context().check_to_skip(ref_call):
-        return expr
-
-    if x_kind is None:
-        return new_args[0]
-    if x_kind == QAnnotateKind.ACTIVATION:
-        expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT)
-
-    expr = _forward_op(ref_call, [expr])
-    return QAnnotateExpr(expr, QAnnotateKind.INPUT)
-
-
-@register_annotate_function("concatenate")
-def concatenate_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for concatenate"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    input_tuple = new_args[0]
-    expr_list = [_get_expr_kind(x)[0] for x in input_tuple]
-    kind_list = [_get_expr_kind(x)[1] for x in input_tuple]
-
-    # make sure the inputs of concatenate are all normal
-    # expression or annotate expression
-    if all([k is None for k in kind_list]):
-        return None
-    for i, k in enumerate(kind_list):
-        if k is None:
-            expr_list[i] = attach_simulated_quantize(expr_list[i], QAnnotateKind.ACTIVATION)
-    expr = _forward_op(ref_call, [_expr.Tuple(expr_list)])
-    return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
-
-
-@register_annotate_function("nn.global_avg_pool2d")
-def global_avg_pool2d_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for global_avg_pool2d for stopping quantize"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    expr, x_kind = _get_expr_kind(new_args[0])
-
-    if x_kind is None:
-        return None
-    expr = _forward_op(ref_call, [new_args[0].realize()])
-
-    # stop quantize after global_avg_pool2d
-    quantize_context().stop_quantize()
-    return expr
-
-
-@register_annotate_function("nn.batch_matmul")
-def batch_matmul_rewrite(ref_call, new_args, ctx):
-    """Rewrite function for batch_matmul"""
-    if quantize_context().check_to_skip(ref_call):
-        return None
-
-    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
-    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
-
-    if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION:
-        if _analysis.check_constant(lhs_expr):
-            lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.WEIGHT)
-        else:
-            lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
-
-    if rhs_kind is None or rhs_kind == QAnnotateKind.ACTIVATION:
-        if _analysis.check_constant(rhs_expr):
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
-        else:
-            rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
-
-    expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
-    return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
diff --git a/python/tvm/relay/quantize/_calibrate.py b/python/tvm/relay/quantize/_calibrate.py
deleted file mode 100644
index f03d556814a0..000000000000
--- a/python/tvm/relay/quantize/_calibrate.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Find scales for quantization on the dataset."""
-from __future__ import absolute_import
-import logging
-import multiprocessing as mp
-import numpy as np
-import tvm
-import tvm.driver
-from tvm.ir import IRModule
-
-from . import _quantize
-from . import quantize
-from .. import op as _op
-from .. import expr as _expr
-from .. import analysis as _analysis
-from .. import build_module as _build_module
-from ...contrib import graph_executor
-from .kl_divergence import _find_scale_by_kl
-
-
-def _get_profile_runtime(mod):
-    func = mod["main"]
-    func = _quantize.CreateStatsCollector(func)
-
-    if tvm.target.Target.current():
-        target = tvm.target.Target.current()
-        dev = tvm.device(target.kind.name)
-    else:
-        target = "llvm"
-        dev = tvm.device(target)
-
-    with tvm.transform.PassContext(opt_level=3):
-        lib = _build_module.build(func, target=target)
-    runtime = graph_executor.GraphModule(lib["default"](dev))
-
-    return runtime
-
-
-def collect_stats(mod, dataset, chunk_by=-1):
-    """Given an annotated graph, create a profile graph to collect profile data from the
-    calibration dataset. This pass collects simulated_quantize op input into a tuple.
-    Simulated_quantize ops are rewritten to identity mode. The tuple is the output of the profile
-    graph.
-
-    Parameters
-    ----------
-    mod: Module
-        The simulation graph after annotation.
-
-    dataset: Iterable[NDArray]
-        The calibration dataset.
-
-    chunk_by: optional, int
-        The size of chunk to be returned in one iteration. It is meant to be
-        used for reducing memory usage. If not specified, return samples for
-        all layers in one chunk.
-
-    Returns
-    -------
-    ret: Iterable[list of ndarray]
-        List of output data of each layer, chunked by the chunk_by parameter
-    """
-    logging.info("collecting statistics for calibration...")
-    runtime = _get_profile_runtime(mod)
-    num_outputs = runtime.get_num_outputs()
-    chunk_by = num_outputs if chunk_by == -1 else chunk_by
-
-    for i in range(0, num_outputs, chunk_by):
-        outputs = [[] for i in range(min(chunk_by, num_outputs - i))]
-        for batch in dataset:
-            runtime.set_input(**batch)
-            runtime.run()
-            for j in range(i, min(i + chunk_by, num_outputs)):
-                outputs[j - i].append(runtime.get_output(j).numpy())
-        yield [np.concatenate(output).reshape(-1) for output in outputs]
-
-
-def _kl_scale(mod, dataset):
-    cfg = quantize.current_qconfig()
-    chunk_by = cfg.calibrate_chunk_by
-    scales = []
-    for samples in collect_stats(mod, dataset, chunk_by):
-        logging.info("finding threshold with kl for calibration...")
-        with mp.Pool() as pool:
-            scales += list(pool.map(_find_scale_by_kl, samples))
-
-    def func(_):
-        scale = scales[func.scale_idx]
-        func.scale_idx += 1
-        return scale
-
-    func.scale_idx = 0
-
-    return func
-
-
-def _find_scale_by_percentile(arr, percentile=0.99999):
-    assert isinstance(arr, np.ndarray)
-    x = np.abs(arr)
-    max_k = int(x.size * percentile)
-    return np.partition(x, max_k)[max_k]
-
-
-def _percentile_scale(mod, dataset):
-    cfg = quantize.current_qconfig()
-    chunk_by = cfg.calibrate_chunk_by
-    scales = []
-    for samples in collect_stats(mod, dataset, chunk_by):
-        logging.info("finding threshold with percentile for calibration...")
-        with mp.Pool() as pool:
-            scales += list(pool.map(_find_scale_by_percentile, samples))
-
-    def func(_):
-        scale = scales[func.scale_idx]
-        func.scale_idx += 1
-        return scale
-
-    func.scale_idx = 0
-
-    return func
-
-
-def _set_params(mod, input_scale_func, weight_scale_func):
-    quantize_op = _op.get("relay.op.annotation.simulated_quantize")
-    cfg = quantize.current_qconfig()
-    const_params = {}
-
-    def visit_func(expr):
-        """visitor function for traverse"""
-        if isinstance(expr, _expr.Call) and expr.op == quantize_op:
-            _, ndom_scale, nclip_min, nclip_max = expr.args
-            attrs = expr.attrs
-            kind = attrs.kind
-            nbit = cfg.get_nbit_by_kind(kind)
-            valid_bit = nbit - attrs.sign
-
-            # set scale
-            if kind == quantize.QAnnotateKind.WEIGHT:
-                assert isinstance(expr.args[0], _expr.Constant)
-                scale = weight_scale_func(expr)
-            else:
-                scale = input_scale_func(expr)
-
-            def _make_const(val):
-                return _expr.const(val, "float32")
-
-            valid_range = 2**valid_bit
-            const_params[ndom_scale] = _make_const(scale / valid_range)
-            const_params[nclip_min] = _make_const(-(valid_range - 1))
-            const_params[nclip_max] = _make_const((valid_range - 1))
-
-    main_func = mod["main"]
-    _analysis.post_order_visit(main_func, visit_func)
-    main_func = _expr.bind(main_func, const_params)
-    func_dict = {}
-    for global_var, func in mod.functions.items():
-        if global_var.name_hint != "main":
-            func_dict[global_var] = func
-    return IRModule.from_expr(main_func, func_dict)
-
-
-# weight scale functions
-def _power2_scale(sq_call):  # pylint: disable=unused-argument
-    """calculate weight scale with nearest mode-2 scale"""
-    var = sq_call.args[0]
-    assert isinstance(var, _expr.Constant)
-    val = np.amax(np.abs(var.data.numpy()))
-    return 2 ** np.math.ceil(np.math.log(val, 2)) if val > 0 else 1.0
-
-
-def _max_scale(sq_call):
-    """calculate weight scale with maximum absolute value"""
-    var = sq_call.args[0]
-    assert isinstance(var, _expr.Constant)
-    val = np.amax(np.abs(var.data.numpy()))
-    return val
-
-
-# input scale functions
-def _global_scale(sq_call):  # pylint: disable=unused-argument
-    cfg = quantize.current_qconfig()
-    return cfg.global_scale
-
-
-def calibrate(dataset=None):
-    """The calibrate procedure will try to calculate the content of
-    dom_scale, nbit, clip_min, clip_max for every `simulated_quantize`
-    operator.
-
-    Parameters
-    ---------
-    dataset: Optional[Iterable[NDArray]]
-        The calibration dataset.
-
-    Returns
-    -------
-    ret: Function
-        The module pass function.
-    """
-
-    def wrapped_func(mod, _):
-        """make transform.module pass happy"""
-        cfg = quantize.current_qconfig()
-
-        if cfg.calibrate_mode == "kl_divergence":
-            input_scale_func = _kl_scale(mod, dataset)
-        elif cfg.calibrate_mode == "global_scale":
-            input_scale_func = _global_scale
-        elif cfg.calibrate_mode == "percentile":
-            input_scale_func = _percentile_scale(mod, dataset)
-        else:
-            raise ValueError(f"Unknown calibrate mode {cfg.calibrate_mode}")
-
-        if cfg.weight_scale == "max":
-            weight_scale_func = _max_scale
-        elif cfg.weight_scale == "power2":
-            weight_scale_func = _power2_scale
-        else:
-            raise ValueError(f"Unknown weight scale mode {cfg.weight_scale}")
-
-        return _set_params(mod, input_scale_func, weight_scale_func)
-
-    return wrapped_func
diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py
deleted file mode 100644
index ca980dd9f8fc..000000000000
--- a/python/tvm/relay/quantize/_partition.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument,inconsistent-return-statements
-"""Internal module for registering attribute for annotation."""
-import tvm
-from .. import expr as _expr
-from .. import analysis as _analysis
-from . import _quantize
-from .quantize import _forward_op
-
-
-def register_partition_function(op_name, frewrite=None, level=10):
-    return tvm.ir.register_op_attr(op_name, "FQPartitionRewrite", frewrite, level)
-
-
-@tvm._ffi.register_object("relay.QPartitionExpr")
-class QPartitionExpr(_expr.TempExpr):
-    def __init__(self, expr):
-        self.__init_handle_by_constructor__(_quantize.make_partition_expr, expr)
-
-
-def partition_expr_check(expr):
-    if isinstance(expr, QPartitionExpr):
-        return True, expr.expr
-    return False, expr
-
-
-@register_partition_function("nn.conv2d")
-def conv2d_partition_function(ref_call, new_args, ctx):
-    """Rewrite function for conv2d for partition"""
-    data_cond, data = partition_expr_check(new_args[0])
-    kernel_cond, kernel = partition_expr_check(new_args[1])
-
-    assert not kernel_cond
-    if data_cond:
-        data = new_args[0].realize()
-    ret = _forward_op(ref_call, [data, kernel])
-    return QPartitionExpr(ret)
-
-
-def identity_partition_function(ref_call, new_args, ctx):
-    cond, expr = partition_expr_check(new_args[0])
-    if cond:
-        return QPartitionExpr(_forward_op(ref_call, [expr]))
-    return None
-
-
-register_partition_function("clip", identity_partition_function)
-register_partition_function("nn.relu", identity_partition_function)
-register_partition_function("nn.max_pool2d", identity_partition_function)
-
-
-def add_partition_generic(ref_call, new_args, ctx):
-    """Rewrite function for ewise add for partition for generic devices"""
-    lhs_cond, lhs = partition_expr_check(new_args[0])
-    rhs_cond, rhs = partition_expr_check(new_args[1])
-    if lhs_cond and rhs_cond:
-        # - introduced by ResNet, when for the first residual connection
-        #     ...
-        #     %0 = nn.conv2d(%data, %meta[relay.Constant])
-        #     %1 = add(%0, %meta[relay.Constant])
-        #     %2 = nn.relu(%1)
-        #     %3 = nn.max_pool2d(%2)
-        #     ...
-        #     %9 = nn.conv2d(%8, %meta[relay.Constant])
-        #     %10 = add(%9, %meta[relay.Constant])
-        #     %11 = add(%3, %10)  <- need to insert annotations for %3, %10
-        #     ...
-        lhs = new_args[0].realize()
-        rhs = new_args[1].realize()
-        return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
-    if not lhs_cond and rhs_cond:
-        # - introduced by residual connection in ResNet
-        #     ...
-        #     %13 = nn.conv2d(%12, %meta[relay.Constant])
-        #     %14 = add(%13, %meta[relay.Constant])
-        #     %15 = annotation.cast_hint(%15, 'int8')
-        #     %16 = annotation.stop_fusion(%16)
-        #     %17 = add(%5, %16)
-        #     %18 = nn.relu(%17)
-        #     ...
-        #     %24 = nn.conv2d(%23, %meta[relay.Constant])
-        #     %25 = add(%24, %meta[relay.Constant])
-        #     %26 = add(%18, %25)  <- need to insert annotations for %25
-        #     ...
-        rhs = new_args[1].realize()
-        return _forward_op(ref_call, [lhs, rhs])
-    if lhs_cond and not rhs_cond:
-        if _analysis.check_constant(rhs):
-            # - introduced by batch_norm: add(out, bias)
-            return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
-        # - introduced by residual connection in MobileNetV2
-        #     ...
-        #     %81 = add(%80, meta[relay.Constant])
-        #     %82 = annotation.cast_hint(%81, 'int8')
-        #     %83 = annotation.stop_fusion(%82)
-        #     %84 = add(%79, %83)
-        #     ...
-        #     %96 = nn.conv2d(%94, %meta[relay.Constant])
-        #     %96 = add(%95, %meta[relay.Constant])
-        #     %97 = add(%96, %84)  <- need to insert annotations for %96
-        #     ...
-        lhs = new_args[0].realize()
-        return _forward_op(ref_call, [lhs, rhs])
-    if not lhs_cond and not rhs_cond:
-        # trivial case
-        return None
-
-    raise ValueError
-
-
-def mul_partition_generic(ref_call, new_args, ctx):
-    """Rewrite function for ewise mul for partition for generic devices"""
-    lhs_cond, lhs = partition_expr_check(new_args[0])
-    rhs_cond, rhs = partition_expr_check(new_args[1])
-
-    if lhs_cond:
-        # introduced by bn: multiply(out, scale)
-        lhs = new_args[0].realize()
-        return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
-
-    if rhs_cond:
-        # introduced by efficientnet
-        rhs = new_args[1].realize()
-        return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
-
-    if not lhs_cond and not rhs_cond:
-        # trivial case
-        return None
-
-    raise ValueError
-
-
-# TODO(ziheng) enhance `register_partition_function` to dispatch
-# for target automatically
-@register_partition_function("add")
-def add_partition_function(ref_call, new_args, ctx):
-    """Rewrite function for ewise add for partition"""
-    target = tvm.target.Target.current()
-    if target and "cuda" in target.keys:
-        # TODO(wuwei/ziheng) cuda specific rules
-        return add_partition_generic(ref_call, new_args, ctx)
-    return add_partition_generic(ref_call, new_args, ctx)
-
-
-@register_partition_function("multiply")
-def multiply_partition_function(ref_call, new_args, ctx):
-    """Rewrite function for ewise multiply for partition"""
-    return mul_partition_generic(ref_call, new_args, ctx)
-
-
-# add cast after the relu op to make it run on vta
-@register_partition_function("nn.global_avg_pool2d")
-def global_avg_pool2d_partition_function(ref_call, new_args, ctx):
-    cond, expr = partition_expr_check(new_args[0])
-    if cond:
-        expr = new_args[0].realize()
-    else:
-        expr = QPartitionExpr(new_args[0]).realize()
-
-    return _forward_op(ref_call, [expr])
diff --git a/python/tvm/relay/quantize/_partition_conversions.py b/python/tvm/relay/quantize/_partition_conversions.py
deleted file mode 100644
index 8fec69cdf53e..000000000000
--- a/python/tvm/relay/quantize/_partition_conversions.py
+++ /dev/null
@@ -1,342 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Utilities for partitioning input quantization and output dequantization expressions."""
-import tvm
-from tvm import relay
-from tvm.relay.expr_functor import ExprMutator, ExprVisitor
-
-# operators that are allowed in prefix/suffix partitions, because they are used
-# to quantize/dequantize
-ALLOWED_CONVERSION_OPS = ["add", "multiply", "right_shift", "clip", "round", "cast"]
-
-
-def partition_conversions(mod, quantized_dtypes, ensure_fully_integral):
-    """Partition mod into input quantization, core quantized inference, and output dequantization.
-
-    The resulting module includes an additional `main` that fuses all three
-    partitions together.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        Quantized module to partition
-
-    quantized_dtypes : Set[str]
-        Set of data types allowed in quantized operators
-
-    ensure_fully_integral : bool
-        Whether to raise an exception if there are unquantized operators in the result
-
-    Returns
-    -------
-    fused_mod : tvm.IRModule
-        Module containing the input quantization (`quantize_inputs`), core
-        quantized inference (`quantized_main`), output dequantization
-        (`dequantize_outputs`), and full quantized inference functions
-    """
-    # Partitioning is implemented as in the diagram below:
-    #
-    #   +----------------------------+
-    #   |Quantized Inference Function|
-    #   +--------------+-------------+
-    #                  |
-    #           partition_prefix
-    #                  |
-    #            +-----+-------------------------+
-    #            |                               |
-    #   +--------v---------+   +-----------------v------------------+
-    #   |Input Quantization|   |Rest of Quantized Inference Function|
-    #   +------------------+   +-----------------+------------------+
-    #                                            |
-    #                                    partition_suffix
-    #                                            |
-    #                                     +------+---------------------+
-    #                                     |                            |
-    #   +------------------+   +----------v------------+   +-----------v---------+
-    #   |Input Quantization|   |Core Quantized Function|   |Output Dequantization|
-    #   +------------------+   +-----------------------+   +---------------------+
-    #
-    # The final module contains all three partitions, as well as a
-    # `main` function that composes these three functions (depicted below).
-    #
-    # +--------------------+-------------------------+-----------------------+
-    # | Input Quantization | Core Quantized Function | Output Dequantization |
-    # +--------------------+-------------------------+-----------------------+
-    assert len(mod.functions) == 1
-    pre_mod, mid_mod = partition_prefix(mod, quantized_dtypes)
-    mid_mod, post_mod = partition_suffix(mid_mod, quantized_dtypes)
-    if ensure_fully_integral:
-        assert has_only_conversion_ops(pre_mod["main"])
-        assert relay.analysis.all_dtypes(mid_mod["main"]).issubset(quantized_dtypes)
-        assert has_only_conversion_ops(post_mod["main"])
-    return fuse_partitions(pre_mod, mid_mod, post_mod)
-
-
-def fuse_partitions(pre_mod, mid_mod, post_mod):
-    """Combine prefix, middle, and suffix modules into a single module.
-
-    The combined module includes an additional `main` that fuses all three
-    partitions together.
-
-    Parameters
-    ----------
-    pre_mod : tvm.IRModule
-        Module containing an input quantization function
-
-    mid_mod : tvm.IRModule
-        Module containing core of a quantized inference function
-
-    post_mod : tvm.IRModule
-        Module containing an output dequantization function
-
-    Returns
-    -------
-    fused_mod : tvm.IRModule
-        Module containing the input quantization, core quantized inference,
-        output dequantization, and full quantized inference functions
-    """
-    pre_func = pre_mod["main"]
-    mid_func = mid_mod["main"]
-    post_func = post_mod["main"]
-    # create a module containing the prefix, middle, and suffix partitions
-    fused_mod = tvm.IRModule(
-        functions={
-            relay.GlobalVar("quantize_inputs"): pre_func,
-            relay.GlobalVar("quantized_main"): mid_func,
-            relay.GlobalVar("dequantize_outputs"): post_func,
-        }
-    )
-
-    # construct a `main` that strings together the partitions, such that its
-    # behaviour is equivalent to `main` in an *unpartitioned* module
-    scope_builder = relay.ScopeBuilder()
-    fused_mod_main_params = [relay.Var(param.name_hint) for param in pre_func.params]
-    quantized_inputs = scope_builder.let(
-        "quantized_inputs",
-        relay.Call(fused_mod.get_global_var("quantize_inputs"), fused_mod_main_params),
-    )
-    quantized_outputs = scope_builder.let(
-        "quantized_outputs",
-        relay.Call(
-            fused_mod.get_global_var("quantized_main"),
-            [relay.TupleGetItem(quantized_inputs, i) for i in range(len(pre_func.ret_type.fields))],
-        ),
-    )
-    dequantized_outputs = scope_builder.let(
-        "dequantized_outputs",
-        relay.Call(fused_mod.get_global_var("dequantize_outputs"), [quantized_outputs]),
-    )
-    scope_builder.ret(dequantized_outputs)
-    fused_mod["main"] = relay.Function(fused_mod_main_params, scope_builder.get())
-    return relay.transform.InferType()(fused_mod)
-
-
-class PrefixCutter(ExprMutator):
-    """A mutator for extracting input quantization expressions from a function
-
-    The result of `visit` is the core function, and the input quantization
-    expressions are stored in the `prefix_sb` scope builder.
-    """
-
-    def __init__(self, params, quantized_dtypes):
-        ExprMutator.__init__(self)
-        self.params = set(params)
-        self.quantized_dtypes = quantized_dtypes
-        self.subtree_params = set()
-        self.new_func_params = []
-        self.prefix_sb = relay.ScopeBuilder()
-        self.prefix_binding_map = {}
-
-    def visit_var(self, var):
-        if var in self.params:
-            self.subtree_params.add(var)
-        return var
-
-    def visit_call(self, call):
-        # TODO(weberlo) use graph pattern matching?
-        if not hasattr(call.op, "name") or call.op.name not in ALLOWED_CONVERSION_OPS:
-            new_args = []
-            for arg in call.args:
-                new_arg = self.visit(arg)
-                if len(self.subtree_params) == 0:
-                    new_args.append(new_arg)
-                else:
-                    assert len(self.subtree_params) == 1
-                    param = next(iter(self.subtree_params))
-                    pre_param = self.prefix_sb.let(param.name_hint, new_arg)
-                    self.subtree_params.clear()
-                    mid_param = relay.Var(param.name_hint, arg.checked_type)
-                    self.prefix_binding_map[mid_param] = pre_param
-                    # return new parameter, then we can use
-                    # relay.analysis.free_vars at the end of the pass to generate
-                    # new `mid_func` type signature
-                    new_args.append(mid_param)
-            return relay.Call(call.op, new_args, call.attrs)
-
-        return super().visit_call(call)
-
-
-def partition_prefix(mod, quantized_dtypes):
-    """Extract input quantization expressions from `mod['main']`.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        Module containing a quantized inference function
-
-    quantized_dtypes : Set[str]
-        Set of data types allowed in quantized operators
-
-    Returns
-    -------
-    pre_mod : tvm.IRModule
-        Module containing the input quantization function
-
-    mid_mod : tvm.IRModule
-        Module containing a function with everything except for input quantization
-    """
-    assert len(mod.functions) == 1
-    func = mod["main"]
-    prefix_cutter = PrefixCutter(func.params, quantized_dtypes)
-    mid_body = prefix_cutter.visit(func.body)
-    assert not func.type_params, "unimplemented"
-    assert not func.attrs, "unimplemented"
-    mid_func = relay.Function(relay.analysis.free_vars(mid_body), mid_body)
-    mid_mod = tvm.IRModule.from_expr(mid_func)
-    mid_mod = relay.transform.InferType()(mid_mod)
-
-    scope_builder = prefix_cutter.prefix_sb
-    # make sure we pass through all inputs in the prefix function's return expr
-    # (even those that don't require quantization)
-    ret_expr = []
-    for param in mid_func.params:
-        if param in prefix_cutter.prefix_binding_map:
-            # this param required a conversion, so we collected it in the
-            # prefix cutter pass, and we can use the pass's mapping from mid
-            # func params to pre func params
-            ret_expr.append(prefix_cutter.prefix_binding_map[param])
-        else:
-            # there was no detected conversion for this argument, so we thread
-            # it through the prefix function untouched
-            ret_expr.append(relay.Var(param.name_hint, param.checked_type))
-    ret_expr = relay.Tuple(ret_expr)
-    scope_builder.ret(ret_expr)
-    pre_func_body = scope_builder.get()
-    pre_func = relay.Function(relay.analysis.free_vars(pre_func_body), pre_func_body)
-    pre_mod = tvm.IRModule.from_expr(pre_func)
-    pre_mod = relay.transform.InferType()(pre_mod)
-
-    return pre_mod, mid_mod
-
-
-class SuffixCutter(ExprMutator):
-    """A mutator for extracting output dequantization expressions from a function
-
-    The result of `visit` is a function containing the output dequantization
-    expressions, and the middle of the function is stored in `mid_body`.
-    """
-
-    def __init__(self, quantized_dtypes):
-        ExprMutator.__init__(self)
-        self.mid_body = None
-        self.quantized_dtypes = quantized_dtypes
-
-    def visit(self, expr):
-        if hasattr(expr, "checked_type") and expr.checked_type.dtype in self.quantized_dtypes:
-            self.mid_body = expr
-            return relay.Var("input", expr.checked_type)
-
-        return super().visit(expr)
-
-
-def partition_suffix(mod, quantized_dtypes):
-    """Extract output dequantization expressions from `mod['main']`.
-
-    Parameters
-    ----------
-    mod : tvm.IRModule
-        Module containing a quantized inference function
-
-    quantized_dtypes : Set[str]
-        Set of data types allowed in quantized operators
-
-    Returns
-    -------
-    pre_mod : tvm.IRModule
-        Module containing the input quantization function
-
-    mid_mod : tvm.IRModule
-        Module containing a function with everything except for input quantization
-    """
-    assert len(mod.functions) == 1
-    func = mod["main"]
-    suffix_cutter = SuffixCutter(quantized_dtypes)
-    post_body = suffix_cutter.visit(func.body)
-    assert not func.type_params, "unimplemented"
-    assert not func.attrs, "unimplemented"
-    post_func = relay.Function(relay.analysis.free_vars(post_body), post_body, func.ret_type)
-    post_mod = tvm.IRModule.from_expr(post_func)
-    post_mod = relay.transform.InferType()(post_mod)
-
-    mid_body = suffix_cutter.mid_body
-    if mid_body is None:
-        # The suffix contains the entire function, meaning there was no
-        # quantization boundary in the given mod.  In this case, we use the
-        # suffix mod as the middle mod and make the suffix an identity function.
-        mid_mod = post_mod
-        post_body = relay.Var("input", mid_mod["main"].ret_type)
-        post_func = relay.Function([post_body], post_body)
-        post_mod = tvm.IRModule.from_expr(post_func)
-        post_mod = relay.transform.InferType()(post_mod)
-    else:
-        mid_func = relay.Function(func.params, mid_body)
-        mid_mod = tvm.IRModule.from_expr(mid_func)
-        mid_mod = relay.transform.InferType()(mid_mod)
-
-    return mid_mod, post_mod
-
-
-class ConversionOpChecker(ExprVisitor):
-    """A pass for checking that the visited function contains only conversion ops"""
-
-    def __init__(self):
-        ExprVisitor.__init__(self)
-        self.valid = True
-
-    def visit_call(self, call):
-        if not hasattr(call.op, "name") or call.op.name not in ALLOWED_CONVERSION_OPS:
-            self.valid = False
-        super().visit_call(call)
-
-
-def has_only_conversion_ops(func):
-    """Return true iff the given function contains only quantization/dequantization ops.
-
-    Parameters
-    ----------
-    func : relay.Function
-        Function being checked
-
-    Returns
-    -------
-    valid : bool
-        Whether the function contains only conversion ops
-    """
-    checker = ConversionOpChecker()
-    checker.visit(func)
-    return checker.valid
diff --git a/python/tvm/relay/quantize/_quantize.py b/python/tvm/relay/quantize/_quantize.py
deleted file mode 100644
index 70f8f175f512..000000000000
--- a/python/tvm/relay/quantize/_quantize.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""Internal module for quantization."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay._quantize", __name__)
diff --git a/python/tvm/relay/quantize/kl_divergence.py b/python/tvm/relay/quantize/kl_divergence.py
deleted file mode 100644
index ca6c0b6dc5bc..000000000000
--- a/python/tvm/relay/quantize/kl_divergence.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Find optimal scale for quantization by minimizing KL-divergence"""
-
-import ctypes
-import numpy as np
-
-from . import _quantize
-
-
-def _find_scale_by_kl(arr, quantized_dtype="int8", num_bins=8001, num_quantized_bins=255):
-    """Given a tensor, find the optimal threshold for quantizing it.
-    The reference distribution is `q`, and the candidate distribution is `p`.
-    `q` is a truncated version of the original distribution.
-
-    Ref:
-    http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
-    """
-    assert isinstance(arr, np.ndarray)
-    min_val = np.min(arr)
-    max_val = np.max(arr)
-    thres = max(abs(min_val), abs(max_val))
-
-    if min_val >= 0 and quantized_dtype in ["uint8"]:
-        # We need to move negative bins to positive bins to fit uint8 range.
-        num_quantized_bins = num_quantized_bins * 2 + 1
-
-    def get_pointer(arr, ctypes_type):
-        ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes_type))
-        return ctypes.cast(ptr, ctypes.c_void_p)
-
-    hist, hist_edges = np.histogram(arr, bins=num_bins, range=(-thres, thres))
-    hist_ptr = get_pointer(hist.astype(np.int32), ctypes.c_int)
-    hist_edges_ptr = get_pointer(hist_edges, ctypes.c_float)
-
-    return _quantize.FindScaleByKLMinimization(
-        hist_ptr, hist_edges_ptr, num_bins, num_quantized_bins
-    )
diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py
deleted file mode 100644
index 41343061da3e..000000000000
--- a/python/tvm/relay/quantize/quantize.py
+++ /dev/null
@@ -1,379 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument, not-context-manager
-"""Automatic quantization toolkit."""
-import tvm.ir
-import tvm
-from tvm.runtime import Object
-
-from . import _quantize
-from ._calibrate import calibrate
-from ._partition_conversions import partition_conversions
-from .. import expr as _expr
-from .. import transform as _transform
-
-
-class QAnnotateKind(object):
-    """Denote the kind of annotation field, corresponding
-    to different nbit configure."""
-
-    IDENTITY = 0
-    INPUT = 1
-    WEIGHT = 2
-    ACTIVATION = 3
-
-
-def kind2str(kind):
-    """Convert a `QAnnotateKind` to string"""
-    str_map = {
-        QAnnotateKind.INPUT: "input",
-        QAnnotateKind.WEIGHT: "weight",
-        QAnnotateKind.ACTIVATION: "activation",
-        QAnnotateKind.IDENTITY: "identity",
-    }
-    assert kind in str_map
-    return str_map[kind]
-
-
-def _forward_op(ref_call, args):
-    """forward the operator of ref_call with provided arguments"""
-    return _expr.Call(ref_call.op, args, ref_call.attrs, ref_call.type_args, ref_call.span)
-
-
-@tvm._ffi.register_object("relay.quantize.QConfig")
-class QConfig(Object):
-    """Configure the quantization behavior by setting config variables.
-
-    Note
-    ----
-    This object is backed by node system in C++, with arguments that can be
-    exchanged between python and C++.
-
-    Do not construct directly, use qconfig instead.
-
-    The fields that are backed by the C++ node are immutable once an instance
-    is constructed. See _node_defaults for the fields.
-    """
-
-    _node_defaults = {
-        "nbit_input": 8,
-        "nbit_weight": 8,
-        "nbit_activation": 32,
-        "dtype_input": "int8",
-        "dtype_weight": "int8",
-        "dtype_activation": "int32",
-        "calibrate_mode": "global_scale",
-        "global_scale": 8.0,
-        "weight_scale": "power2",
-        "skip_dense_layer": True,
-        "skip_conv_layers": [0],
-        "do_simulation": False,
-        "round_for_shift": True,
-        "debug_enabled_ops": None,
-        "rounding": "UPWARD",
-        "calibrate_chunk_by": -1,
-        "partition_conversions": "disabled",
-    }
-
-    # pylint: disable=no-member
-    def __init__(self, handle):
-        """Initialize the function with handle
-
-        Parameters
-        ----------
-        handle : SymbolHandle
-            the handle to the underlying C++ Symbol
-        """
-        super(QConfig, self).__init__(handle)
-        self.handle = handle
-
-    def guard(self, ref_call):
-        """Return true if op is enabled, otherwise return false"""
-        op_name = ref_call.op.name
-        if self.debug_enabled_ops is not None:
-            name_list = [x.value for x in self.debug_enabled_ops]
-            if op_name not in name_list:
-                return False
-        return True
-
-    def get_nbit_by_kind(self, kind):
-        name = kind2str(kind)
-        return getattr(self, "nbit_" + name)
-
-    def get_dtype_by_kind(self, kind):
-        name = kind2str(kind)
-        return getattr(self, "dtype_" + name)
-
-    def __enter__(self):
-        # pylint: disable=protected-access
-        _quantize._EnterQConfigScope(self)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        _quantize._ExitQConfigScope()
-
-    def __setattr__(self, name, value):
-        if name in QConfig._node_defaults:
-            raise AttributeError(f"'{type(self)}' object cannot set attribute '{name}'")
-        return super(QConfig, self).__setattr__(name, value)
-
-
-def current_qconfig():
-    """Get the current quantization configuration."""
-    return _quantize._GetCurrentQConfig()
-
-
-def qconfig(**kwargs):
-    """Configure the quantization behavior by setting config variables.
-
-    Parameters
-    ---------
-    nbit_dict: dict of QAnnotateKind -> int
-        Number of bit for every kind of annotate field.
-
-    calibrate_mode: str
-        The calibration mode. 'global_scale' or 'kl_divergence'.
-        global_scale: use global scale
-        kl_divergence: find scales by kl divergence on the dataset.
-
-    global_scale: float
-        The global scale for calibration.
-
-    weight_scale: str
-        The way to calculate scales for weights (annotated with QAnnotateKind.WEIGHT).
-        power2: Find the maximum of the absolute value of the tensor, and then round up to power
-        of two.
-        max: Find the maximum of the absolute value of the tensor
-
-    skip_dense_layer: boolean
-        Whether to skip all nn.dense layer type. By default are skipped.
-
-    skip_conv_layers: list
-        Specifying which layers to be skipped. Provide a list of indices
-        that indicate which conv2d layers to leave untouched. Start from 0.
-
-    do_simulation: boolean
-        Whether to do simulation with float operation only.
-
-    round_for_shift: boolean
-        Whether to add bias for rounding during shift.
-
-    debug_enabled_ops: None or list of str
-        Partially quantize specified operators for debugging. The default value
-        is None, which means will try to call all operartors' annotate rewrite
-        function.
-
-    rounding: "UPWARD" or "TONEAREST"
-        Rounding direction for fixed point multiplications.
-
-    partition_conversions: 'disabled', 'enabled', or 'fully_integral'
-        If set to 'enabled' or 'fully_integral', partitions a quantized
-        result into a module containing
-        a prefix function (consisting of input conversion into the quantized data space),
-        a middle function (consisting of the core quantized network),
-        a suffix function (consisting of output dequantization),
-        and a main function (that calls the prefix, middle, and suffix functions in succession).
-        If set to 'fully_integral' and there are unquantized operators in the result,
-        an exception is raised.
-        The default value is 'disabled'.
-
-    Returns
-    -------
-    config: QConfig
-        The quantization configuration
-    """
-    node_args = {k: v if k not in kwargs else kwargs[k] for k, v in QConfig._node_defaults.items()}
-    return tvm.ir.make_node("relay.quantize.QConfig", **node_args)
-
-
-class QuantizeContext(object):
-    """An internal used global context object for annotation,
-    for putting some state variables like `conv2d_counter`."""
-
-    Current = None
-
-    def __init__(self):
-        self.qnode_map = dict()
-        self._conv2d_counter = 0
-        self._stop_quantize = False
-
-    def check_to_skip(self, ref_call):
-        """Check the index of conv2d layer to decide whether to
-        skip the current operator."""
-        if self._stop_quantize:
-            return True
-
-        if current_qconfig().skip_conv_layers is not None:
-            # check skip conv layers
-            skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers]
-            if self._conv2d_counter in skipped_indices and ref_call.op.name == "nn.conv2d":
-                self._conv2d_counter += 1
-                return True
-            if ref_call.op.name == "nn.conv2d":
-                self._conv2d_counter += 1
-
-        return False
-
-    def stop_quantize(self):
-        self._stop_quantize = True
-
-    def reset(self):
-        self._conv2d_counter = 0
-        self._stop_quantize = False
-
-    def __enter__(self):
-        self.reset()
-        return self
-
-    def __exit__(self, ptype, value, traceback):
-        pass
-
-
-def quantize_context():
-    """Get the global singleton scope"""
-    if QuantizeContext.Current is None:
-        QuantizeContext.Current = QuantizeContext()
-    return QuantizeContext.Current
-
-
-def partition():
-    """Partition graph into small low-precision sections by `cast_hint` and
-    `stop_fusion`.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass for VTA rewrite.
-    """
-    return _quantize.QuantizePartition()
-
-
-def annotate():
-    """Given a float32 graph, this pass will rewrite the graph and return
-    a graph which simulates the error brought by the current quantization
-    scheme.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass for quantization annotation.
-    """
-    return _quantize.QuantizeAnnotate()
-
-
-def realize():
-    """The realize pass will transform the simulated quantized graph, which
-    actually computes with float32, to a real low-bit integer graph. It will
-    replace the `simulated_quantize` with several fine-grained operators like
-    add, multiply, and shift as much as possible for better performance.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass for quantization realization.
-    """
-    return _quantize.QuantizeRealize()
-
-
-def _bind_params(func, params):
-    """Bind the params to the expression."""
-    name_dict = {}
-    for arg in func.params:
-        name = arg.name_hint
-        if name in name_dict:
-            name_dict[name] = None
-        else:
-            name_dict[name] = arg
-    bind_dict = {}
-    for k, v in params.items():
-        if k not in name_dict:
-            continue
-        arg = name_dict[k]
-        if arg is None:
-            raise ValueError(f"Multiple args in the function have name {k}")
-        bind_dict[arg] = _expr.const(v)
-    return _expr.bind(func, bind_dict)
-
-
-def prerequisite_optimize(mod, params=None):
-    """Prerequisite optimization passes for quantization. Perform
-    "SimplifyInference", "FoldScaleAxis", "FoldConstant", and
-    "CanonicalizeOps" optimization before quantization."""
-    optimize = tvm.transform.Sequential(
-        [
-            _transform.SimplifyInference(),
-            _transform.FoldConstant(),
-            _transform.FoldScaleAxis(),
-            _transform.CanonicalizeOps(),
-            _transform.FoldConstant(),
-        ]
-    )
-
-    if params:
-        mod["main"] = _bind_params(mod["main"], params)
-
-    mod = optimize(mod)
-    return mod
-
-
-def quantize(mod, params=None, dataset=None):
-    """The quantization procedure. Before running the three main
-    procedure of quantization, "annotate", "calibrate" and "realize"
-    , we need to do "SimplifyInference", "FoldScaleAxis", "FoldConstant"
-    first for optimizing.
-
-    Parameters
-    ---------
-    mod: Module
-        The original module.
-
-    params : dict of str to NDArray
-        Input parameters to the graph that do not change
-        during inference time. Used for constant folding.
-
-    dataset: list of dict of Var -> NDArray
-        The calibration dataset.
-
-    Returns
-    -------
-    ret: Function
-        The graph after quantization
-    """
-    mod = prerequisite_optimize(mod, params)
-
-    calibrate_pass = tvm.transform.module_pass(
-        calibrate(dataset), opt_level=1, name="QuantizeCalibrate"
-    )
-    quant_passes = [partition(), annotate(), calibrate_pass, tvm.relay.transform.InferType()]
-    if not current_qconfig().do_simulation:
-        quant_passes.append(realize())
-    quant_passes.append(_transform.FoldConstant())
-    quantize_seq = tvm.transform.Sequential(quant_passes)
-    with tvm.transform.PassContext(
-        opt_level=3, required_pass=["QuantizeAnnotate", "QuantizeCalibrate", "QuantizeRealize"]
-    ):
-        with quantize_context():
-            mod = quantize_seq(mod)
-
-    q_cfg = current_qconfig()
-    assert q_cfg.partition_conversions in ["disabled", "enabled", "fully_integral"]
-    if q_cfg.partition_conversions != "disabled":
-        quantized_dtypes = {q_cfg.dtype_input, q_cfg.dtype_weight, q_cfg.dtype_activation}
-        ensure_fully_integral = q_cfg.partition_conversions == "fully_integral"
-        return partition_conversions(mod, quantized_dtypes, ensure_fully_integral)
-
-    return mod
diff --git a/python/tvm/relay/scope_builder.py b/python/tvm/relay/scope_builder.py
deleted file mode 100644
index 726b3c6241fe..000000000000
--- a/python/tvm/relay/scope_builder.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""The scope builder interface."""
-from __future__ import absolute_import
-
-from . import ty as _ty
-from . import expr as _expr
-from .._ffi import base as _base
-
-
-class WithScope(object):
-    """A wrapper for builder methods which introduce scoping.
-
-    Parameters
-    ----------
-    enter_value: object
-        The value returned by enter.
-    """
-
-    def __init__(self, enter_value, exit_cb):
-        self._enter_value = enter_value
-        self._exit_cb = exit_cb
-
-    def __enter__(self):
-        return self._enter_value
-
-    def __exit__(self, ptype, value, trace):
-        if value:
-            raise value
-        self._exit_cb()
-
-
-def _make_lets(bindings, ret_value):
-    """Make a nested let expressions.
-
-    Parameters
-    ----------
-    bindings: List[Tuple[tvm.relay.Var,tvm.relay.Expr]]
-        The sequence of let bindings
-
-    ret_value: tvm.relay.Expr
-        The final value of the expression.
-
-    Returns
-    -------
-    lets: tvm.relay.Expr
-        A nested let expression.
-    """
-    if ret_value is None:
-        raise RuntimeError("ret is not called in this scope")
-    if isinstance(ret_value, _expr.If) and ret_value.false_branch is None:
-        raise RuntimeError("Creating an If expression without else.")
-    let_expr = ret_value
-    for var, value in reversed(bindings):
-        let_expr = _expr.Let(var, value, let_expr)
-    return let_expr
-
-
-class ScopeBuilder(object):
-    """Scope builder class.
-
-    Enables users to build up a nested
-    scope(let, if) expression easily.
-
-    Examples
-    --------
-    .. code-block: python
-
-        sb = relay.ScopeBuilder()
-        cond = relay.var("cond", 'bool')
-        x = relay.var("x")
-        y = relay.var("y")
-
-        with sb.if_scope(cond):
-            one = relay.const(1, "float32")
-            t1 = sb.let(t1, relay.add(x, one))
-            sb.ret(t1)
-        with sb.else_scope():
-            sb.ret(y)
-
-        print(sb.get().astext())
-    """
-
-    def __init__(self):
-        self._bindings = [[]]
-        self._ret_values = [None]
-
-    def _enter_scope(self):
-        self._bindings.append([])
-        self._ret_values.append(None)
-
-    def _exit_scope(self):
-        bindings = self._bindings.pop()
-        ret_value = self._ret_values.pop()
-        return bindings, ret_value
-
-    def let(self, var, value):
-        """Create a new let binding.
-
-        Parameters
-        ----------
-        var: Union[Tuple[str, relay.Type], tvm.relay.Var]
-            The variable or name of variable.
-
-        value: tvm.relay.Expr
-            The value to be bound
-        """
-        if isinstance(var, (tuple, list)):
-            if len(var) > 2:
-                raise ValueError("Expect var to be Tuple[str, relay.Type]")
-            var = _expr.var(*var)
-        elif isinstance(var, _base.string_types):
-            var = _expr.var(var)
-        self._bindings[-1].append((var, value))
-        return var
-
-    def if_scope(self, cond):
-        """Create a new if scope.
-
-        Parameters
-        ----------
-        cond: tvm.relay.expr.Expr
-            The condition
-
-        Returns
-        -------
-        scope: WithScope
-            The if scope.
-
-        Note
-        ----
-        The user must follows with an else scope.
-        """
-        self._enter_scope()
-
-        def _on_exit():
-            bindings, ret_value = self._exit_scope()
-            if self._ret_values[-1] is not None:
-                raise RuntimeError("result already returned before if scope")
-            true_branch = _make_lets(bindings, ret_value)
-            self._ret_values[-1] = _expr.If(cond, true_branch, None)
-
-        return WithScope(None, _on_exit)
-
-    def else_scope(self):
-        """Create a new else scope.
-
-        Returns
-        -------
-        scope: WithScope
-            The if scope.
-        """
-        self._enter_scope()
-
-        def _on_exit():
-            bindings, ret_value = self._exit_scope()
-            partial_if = self._ret_values[-1]
-            no_else = not isinstance(partial_if, _expr.If) or partial_if.false_branch is not None
-            if no_else:
-                raise RuntimeError("else scope must follows")
-            false_branch = _make_lets(bindings, ret_value)
-            self._ret_values[-1] = _expr.If(partial_if.cond, partial_if.true_branch, false_branch)
-
-        return WithScope(None, _on_exit)
-
-    def type_of(self, expr):
-        """
-        Compute the type of an expression.
-
-        Parameters
-        ----------
-        expr: relay.Expr
-            The expression to compute the type of.
-        """
-        if isinstance(expr, _expr.Var):
-            return expr.type_annotation
-
-        ity = _ty.IncompleteType()
-        var = _expr.var("unify", ity)
-        self.let(var, expr)
-        return ity
-
-    def ret(self, value):
-        """Set the return value of this scope.
-
-        Parameters
-        ----------
-        value: tvm.relay.expr.Expr
-            The return value.
-        """
-        if self._ret_values[-1] is not None:
-            raise RuntimeError("ret value is already set in this scope.")
-        self._ret_values[-1] = value
-
-    def get(self):
-        """Get the generated result.
-
-        Returns
-        -------
-        value: tvm.relay.expr.Expr
-            The final result of the expression.
-        """
-        if len(self._bindings) != 1:
-            raise RuntimeError("can only call get at the outmost scope")
-        return _make_lets(self._bindings[-1], self._ret_values[-1])
diff --git a/python/tvm/relay/std/core.rly b/python/tvm/relay/std/core.rly
deleted file mode 100644
index f469491a56f1..000000000000
--- a/python/tvm/relay/std/core.rly
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#[version = "0.0.5"]
-
-extern type Storage
diff --git a/python/tvm/relay/std/gradient.rly b/python/tvm/relay/std/gradient.rly
deleted file mode 100644
index 7594f4ebc5f4..000000000000
--- a/python/tvm/relay/std/gradient.rly
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#[version = "0.0.5"]
-
-/*
- * Store the Gradient Value of a Tensor of type T.
- * Note that Gradient of T is stored inside a Ref(GradCell[T]) instead of GradCell[T].
- */
-type GradCell[T] {
-  Raw(T),
-  One(fn() -> T),
-  Zero(fn() -> T)
-}
-
-def @FromGradCell[T](%g: GradCell[T]) -> T {
-  match (%g) {
-    Raw(%x) => %x,
-    One(%x) => %x(),
-    Zero(%x) => %x()
-  }
-}
-
-def @MultiplyGradCell[T](%multiply: fn(T, T) -> T, %l: GradCell[T], %r: GradCell[T]) -> GradCell[T] {
-  match((%l, %r)) {
-    (Zero(_), _) => %l,
-    (_, Zero(_)) => %r,
-    (One(_), _) => %r,
-    (_, One(_)) => %l,
-    _ => Raw(%multiply(@FromGradCell(%l), @FromGradCell(%r)))
-  }
-}
-
-def @AddGradCell[T](%add: fn(T, T) -> T, %l: GradCell[T], %r: GradCell[T]) -> GradCell[T] {
-  match ((%l, %r)) {
-    (Zero(_), _) => %r,
-    (_, Zero(_)) => %l,
-    _ => Raw(%add(@FromGradCell(%l), @FromGradCell(%r)))
-  }
-}
diff --git a/python/tvm/relay/std/nat.rly b/python/tvm/relay/std/nat.rly
deleted file mode 100644
index de71beb3379c..000000000000
--- a/python/tvm/relay/std/nat.rly
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#[version = "0.0.5"]
-
-/* Defines a Peano (unary) natural number ADT.
-    Zero is represented by z(). s(n) adds 1 to a nat n.
-    Adds the fields nat, z, and s to the prelude, representing
-    (respectively) the nat ADT and the z and s constructors.
-*/
-type nat {
-    zero,
-    succ(nat),
-}
-
-/*
-  Defines a function that doubles a nat. Adds a field called
-  'double' to the prelude, giving the GlobalVar pointing to
-  the function.
-*/
-def @nat_double(%x: nat) -> nat {
-    match %x {
-        zero => zero,
-        succ(%y) => succ(succ(@nat_double(%y)))
-    }
-}
-
-def @nat_add(%x: nat, %y: nat) -> nat {
-    match %x {
-        zero => %y,
-        succ(%z) => succ(@nat_add(%z, %y))
-    }
-}
-
-/* Defines a function to get the nth eleemnt of a list using
-   a nat to index into the list.
-*/
-def @nat_nth[A](%l: List[A], %n: nat) -> A {
-    match %n {
-        zero => @hd(%l),
-        succ(%y) => @nat_nth(@tl(%l), %y)
-    }
-}
-
-/* Defines a function to update the nth element of a list and return the updated list. */
-def @nat_update[A](%list: List[A], %index: nat, %value: A) -> List[A] {
-    match %index {
-        zero => Cons(%value, @tl(%list)),
-        succ(%index_pred) => @nat_update(@tl(%list), %index_pred, %value)
-    }
-}
-
-/* Defines a function that takes a number n and a function f;
-    returns a closure that takes an argument and applies f
-    n times to its argument.
-*/
-def @nat_iterate[A](%f: fn(A) -> A, %num: nat) -> fn(A) -> A {
-    match %num {
-        zero => fn(%x: A) -> A { %x },
-        succ(%y) => fn (%i: A) { %f(@nat_iterate(%f, %y)(%i)) }
-    }
-}
diff --git a/python/tvm/relay/std/prelude.rly b/python/tvm/relay/std/prelude.rly
deleted file mode 100644
index 57512a0369b3..000000000000
--- a/python/tvm/relay/std/prelude.rly
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#[version = "0.0.5"]
-
-def @id[A](%x: A) -> A {
-  %x
-}
-
-def @compose[A, B, C](%f: fn(B) -> C, %g: fn(A) -> B) {
-  fn (%x: A) -> C {
-    %f(%g(%x))
-  }
-}
-
-def @flip[A, B, C](%f: fn(A, B) -> C) -> fn(B, A) -> C {
-  fn(%b: B, %a: A) -> C {
-    %f(%a, %b)
-  }
-}
-
-/*
- * A LISP-style list ADT. An empty list is represented by `Nil`, and a member
- * `x` can be appended to the front of a list `l` via the constructor `Cons(x, l)`.
- */
-type List[A] {
-  Cons(A, List[A]),
-  Nil,
-}
-
-/*
- * Get the head of a list. Assume the list has at least one element.
- */
-def @hd[A](%xs: List[A]) -> A {
-  match? (%xs) {
-    Cons(%x, _) => %x,
-  }
-}
-
-/*
- * Get the tail of a list.
- */
-def @tl[A](%xs: List[A]) -> List[A] {
-  match? (%xs) {
-    Cons(_, %rest) => %rest,
-  }
-}
-
-/*
- * Get the `n`th element of a list.
- */
-def @nth[A](%xs: List[A], %n: Tensor[(), int32]) -> A {
-  if (%n == 0) {
-    @hd(%xs)
-  } else {
-    @nth(@tl(%xs), %n - 1)
-  }
-}
-
-/*
- * Return the length of a list.
- */
-def @length[A](%xs: List[A]) -> Tensor[(), int32] {
-  match (%xs) {
-    Cons(_, %rest) => 1 + @length(%rest),
-    Nil => 0,
-  }
-}
-
-/*
- * Update the `n`th element of a list and return the updated list.
- */
-def @update[A](%xs: List[A], %n: Tensor[(), int32], %v: A) -> List[A] {
-  if (%n == 0) {
-    Cons(%v, @tl(%xs))
-  } else {
-    Cons(@hd(%xs), @update(@tl(%xs), %n - 1, %v))
-  }
-}
-
-/*
- * Map a function over a list's elements. That is, `map(f, xs)` returns a new
- * list where the `i`th member is `f` applied to the `i`th member of `xs`.
- */
-def @map[A, B](%f: fn(A) -> B, %xs: List[A]) -> List[B] {
-  match (%xs) {
-    Cons(%x, %rest) => Cons(%f(%x), @map(%f, %rest)),
-    Nil => Nil,
-  }
-}
-
-/*
- * A left-way fold over a list.
- *
- * `foldl(f, z, cons(a1, cons(a2, cons(a3, cons(..., nil)))))`
- * evaluates to `f(...f(f(f(z, a1), a2), a3)...)`.
- */
-def @foldl[A, B](%f: fn(A, B) -> A, %acc: A, %xs: List[B]) -> A {
-  match (%xs) {
-    Cons(%x, %rest) => @foldl(%f, %f(%acc, %x), %rest),
-    Nil => %acc,
-  }
-}
-
-/*
- * A right-way fold over a list.
- *
- * `foldr(f, z, cons(a1, cons(a2, cons(..., cons(an, nil)))))`
- * evaluates to `f(a1, f(a2, f(..., f(an, z)))...)`.
- */
-def @foldr[A, B](%f: fn(A, B) -> B, %acc: B, %xs: List[A]) -> B {
-  match (%xs) {
-    Cons(%x, %rest) => %f(%x, @foldr(%f, %acc, %rest)),
-    Nil => %acc,
-  }
-}
-
-/*
- * A right-way fold over a nonempty list.
- *
- * `foldr1(f, cons(a1, cons(a2, cons(..., cons(an, nil)))))`
- * evaluates to `f(a1, f(a2, f(..., f(an-1, an)))...)`
- */
-def @foldr1[A](%f: fn(A, A) -> A, %xs: List[A]) -> A {
-  match? (%xs) {
-    Cons(%x, Nil) => %x,
-    Cons(%x, %rest) => %f(%x, @foldr1(%f, %rest)),
-  }
-}
-
-/*
- * Computes the sum of a list of integer scalars.
- */
- // (@jroesch): if we leave off the return type this doesn't work
-def @sum(%xs: List[Tensor[(), int32]]) -> int32 {
-  let %add_f = fn(%x: Tensor[(), int32], %y: Tensor[(), int32]) -> Tensor[(), int32] {
-    %x + %y
-  };
-  @foldl(%add_f, 0, %xs)
-}
-
-/*
- * Concatenates two lists.
- */
-
-def @concat[A](%xs: List[A], %ys: List[A]) -> List[A] {
-  @foldr(Cons, %ys, %xs)
-}
-
-/*
- * Filters a list, returning a sublist of only the values which satisfy the given predicate.
- */
-def @filter[A](%f: fn(A) -> Tensor[(), bool], %xs: List[A]) -> List[A] {
-  match (%xs) {
-    Cons(%x, %rest) => {
-      if (%f(%x)) {
-        Cons(%x, @filter(%f, %rest))
-      } else {
-        @filter(%f, %rest)
-      }
-    },
-    Nil => Nil,
-  }
-}
-
-/*
- * Combines two lists into a list of tuples of their elements.
- *
- * The zipped list will be the length of the shorter list.
- */
-def @zip[A, B](%xs: List[A], %ys: List[B]) -> List[(A, B)] {
-  match (%xs, %ys) {
-    (Cons(%x, %x_rest), Cons(%y, %y_rest)) => Cons((%x, %y), @zip(%x_rest, %y_rest)),
-    _ => Nil,
-  }
-}
-
-/*
- * Reverses a list.
- */
-def @rev[A](%xs: List[A]) -> List[A] {
-  @foldl(@flip(fn (%h: A, %t: List[A]) { Cons(%h, %t) }), Nil, %xs)
-}
-
-/*
- * An accumulative map, which is a fold that simulataneously updates an
- * accumulator value and a list of results.
- *
- * This map proceeds through the list from right to left.
- */
-def @map_accumr[A, B, C](%f: fn(A, B) -> (A, C), %init: A, %xs: List[B]) -> (A, List[C]) {
-  let %updater = fn(%x: B, %acc: (A, List[C])) -> (A, List[C]) {
-    let %f_out = %f(%acc.0, %x);
-    (%f_out.0, Cons(%f_out.1, %acc.1))
-  };
-  @foldr(%updater, (%init, Nil), %xs)
-}
-
-/*
- * an accumulative map, which is a fold that simulataneously updates an
- * accumulator value and a list of results.
- *
- * This map proceeds through the list from left to right.
- */
-def @map_accuml[A, B, C](%f: fn(A, B) -> (A, C), %init: A, %xs: List[B]) -> (A, List[C]) {
-  let %updater = fn(%acc: (A, List[C]), %x: B) -> (A, List[C]) {
-    let %f_out = %f(%acc.0, %x);
-    (%f_out.0, Cons(%f_out.1, %acc.1))
-  };
-  @foldl(%updater, (%init, Nil), %xs)
-}
-
-/*
- * An optional ADT, which can either contain some other type or nothing at all.
- */
-type Option[A] {
-  Some(A),
-  None,
-}
-
-/*
- * Builds up a list starting from a seed value.
- *
- * `f` returns an option containing a new seed and an output value. `f` will
- * continue to be called on the new seeds until it returns `None`. All the output
- * values will be combined into a list, right to left.
- */
-def @unfoldr[A, B](%f: fn(A) -> Option[(A, B)], %seed: A) -> List[B] {
-  match (%f(%seed)) {
-    Some(%val) => Cons(%val.1, @unfoldr(%f, %val.0)),
-    None => Nil,
-  }
-}
-
-/*
- * Builds up a list starting from a seed value.
- *
- * `f` returns an option containing a new seed and an output value. `f` will
- * continue to be called on the new seeds until it returns `None`. All the
- * output values will be combined into a list, left to right.
- */
-def @unfoldl[A, B](%f: fn(A) -> Option[(A, B)], %seed: A) -> List[B] {
-  @rev(@unfoldr(%f, %seed))
-}
-
-/*
- * A tree ADT. A tree can contain any type. It has only one
- * constructor, rose(x, l), where x is the content of that point of the tree
- * and l is a list of more trees of the same type. A leaf is thus rose(x,
- * nil()).
- */
-type Tree[A] {
-  Rose(A, List[Tree[A]]),
-}
-
-/*
- * Maps over a tree. The function is applied to each subtree's contents.
- */
-def @tmap[A, B](%f: fn(A) -> B, %t: Tree[A]) -> Tree[B] {
-  match(%t) {
-    Rose(%v, %sub_trees) => {
-      let %list_f = fn(%tt: Tree[A]) -> Tree[B] {
-        @tmap(%f, %tt)
-      };
-      Rose(%f(%v), @map(%list_f, %sub_trees))
-    },
-  }
-}
-
-/*
- * Computes the size of a tree.
- */
-def @size[A](%t: Tree[A]) -> Tensor[(), int32] {
-  match(%t) {
-    Rose(_, %sub_trees) => {
-      1 + @sum(@map(@size, %sub_trees))
-    },
-  }
-}
-
-/*
- * Takes a number n and a function f; returns a closure that takes an argument
- * and applies f n times to its argument.
- */
-def @iterate[A](%f: fn(A) -> A, %n: Tensor[(), int32]) -> fn(A) -> A {
-  if (%n == 0) {
-    @id
-  } else {
-    @compose(%f, @iterate(%f, %n - 1))
-  }
-}
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
deleted file mode 100644
index 2399a474de88..000000000000
--- a/python/tvm/relay/testing/__init__.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Utilities for testing and benchmarks"""
-from __future__ import absolute_import as _abs
-import collections
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import op
-from tvm.relay.prelude import Prelude
-from tvm.testing import enabled_targets
-
-from . import mlp
-from . import resnet
-from . import resnet_3d
-from . import dqn
-from . import dcgan
-from . import mobilenet
-from . import lstm
-from . import inception_v3
-from . import squeezenet
-from . import vgg
-from . import densenet
-from . import yolo_detection
-from . import temp_op_attr
-from . import synthetic
-
-from .init import create_workload
-from .nat import count, make_nat_value, make_nat_expr
-from .py_converter import to_python, run_as_python
-from ..transform import gradient
-
-
-def run_opt_pass(expr, opt_pass, import_prelude=False):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    if import_prelude:
-        Prelude(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def run_infer_type(expr):
-    return run_opt_pass(expr, relay.transform.InferType())
-
-
-def _np_randn_from_type(t, scale=1, mean=0):
-    res = mean + (scale * np.random.randn(*(int(d) for d in t.shape)))
-    # if t.shape == (), then randn returns a scalar so we need to wrap for dtype conversion
-    if np.isscalar(res):
-        res = np.array(res)
-    return res.astype(t.dtype)
-
-
-def check_grad(
-    func,
-    inputs=None,
-    test_inputs=None,
-    eps=1e-6,
-    atol=1e-5,
-    rtol=1e-3,
-    scale=None,
-    mean=0,
-    mode="higher_order",
-    target_devices=None,
-    executor_kind="debug",
-):
-    """Perform numerical gradient checking given a relay function.
-
-    Compare analytical gradients to numerical gradients derived from two-sided approximation. Note
-    that this test may fail if your function input types are not of high enough precision.
-
-    Parameters
-    ----------
-    func : tvm.relay.Function
-        The relay function to test.
-
-    inputs: List[np.array]
-        Optional user-provided input parameters to use. If not given, will generate random normal
-        inputs scaled to be close to the chosen epsilon value to avoid numerical precision loss.
-
-    test_inputs: List[np.array]
-        The inputs to test for gradient matching. Useful in cases where some inputs are not
-        differentiable, such as symbolic inputs to dynamic ops. If not given, all inputs are
-        tested.
-
-    eps: float
-        The epsilon value to use for computing numerical gradient approximation.
-
-    atol: float
-        The absolute tolerance on difference between numerical and analytical gradients. Note that
-        this needs to be scaled appropriately relative to the chosen eps and inputs.
-
-    rtol: float
-        The relative tolerance on difference between numerical and analytical gradients. Note that
-        this needs to be scaled appropriately relative to the chosen eps.
-
-    scale: float
-        The standard deviation of the inputs.
-
-    mean: float
-        The mean of the inputs.
-
-    target_devices: Optional[List[Tuple[tvm.target.Target, tvm.runtime.Device]]]
-        A list of targets/devices on which the gradient should be
-        tested.  If not specified, will default to `tvm.testing.enabled_targets()`.
-
-    """
-
-    fwd_func = run_infer_type(func)
-    bwd_func = run_infer_type(gradient(fwd_func, mode=mode))
-    bwd_func = run_opt_pass(bwd_func, relay.transform.Legalize())
-
-    if scale is None:
-        scale = 10 * eps
-
-    if inputs is None:
-        params = fwd_func.params
-        # Generate random inputs on the same scale as epsilon to avoid numerical precision loss.
-        inputs = [_np_randn_from_type(x.checked_type, scale=scale, mean=mean) for x in params]
-
-    if test_inputs is None:
-        test_inputs = inputs
-
-    if target_devices is None:
-        target_devices = enabled_targets()
-
-    for target, dev in target_devices:
-        # Eval the backward and forward functions
-        # TODO(mbs): Evaluate a pair of functions so can share preparation between them.
-        bwd_func_compiled = relay.create_executor(
-            executor_kind, device=dev, target=target
-        ).evaluate(bwd_func)
-        fwd_func_compiled = relay.create_executor(
-            executor_kind, device=dev, target=target
-        ).evaluate(fwd_func)
-
-        # Get analytic gradients.
-        _, grads = bwd_func_compiled(*inputs)
-        grads = [grad.numpy().astype("float64") for grad in grads]
-
-        # Throw out gradients we aren't testing
-        if inputs != test_inputs:
-            tmp = []
-            # find the gradient that corresponds to every test input
-            for test_input in test_inputs:
-                for i, grad in enumerate(grads):
-                    if inputs[i] is test_input:
-                        tmp.append(grad)
-                        break
-            grads = tmp
-
-        assert len(grads) > 0, "You must test at least one gradient."
-
-        # Get numeric gradients for each dimension of each param, using two-sided approximation.
-        approx_grads = []
-        for x in test_inputs:
-            approx_grad = np.zeros(x.shape)
-            for i in np.ndindex(*x.shape):
-                x_i = x[i]
-                x[i] = x_i + eps
-                fwd_plus = fwd_func_compiled(*inputs).numpy().astype("float64")
-                x[i] = x_i - eps
-                fwd_minus = fwd_func_compiled(*inputs).numpy().astype("float64")
-                x[i] = x_i
-                approx_grad[i] = np.sum((fwd_plus - fwd_minus) / (2 * eps))
-            approx_grads.append(approx_grad)
-        # Compare gradients by checking that relative difference is below tolerance.
-        for grad, approx_grad in zip(grads, approx_grads):
-            np.testing.assert_allclose(grad, approx_grad, atol=atol, rtol=rtol)
-
-
-def rand(dtype, *shape):
-    return tvm.nd.array(np.random.rand(*shape).astype(dtype))
-
-
-def count_ops(expr):
-    """count number of times a given op is called in the graph"""
-
-    class OpCounter(tvm.relay.ExprVisitor):
-        """OpCounter"""
-
-        def visit_call(self, call):
-            if hasattr(call, "op"):
-                self.node_counter[call.op.name] += 1
-            return super().visit_call(call)
-
-        def count(self, expr):
-            self.node_set = {}
-            self.node_counter = collections.Counter()
-            self.visit(expr)
-            return self.node_counter
-
-    return OpCounter().count(expr)
diff --git a/python/tvm/relay/testing/byoc.py b/python/tvm/relay/testing/byoc.py
deleted file mode 100644
index 619c9b99ca1d..000000000000
--- a/python/tvm/relay/testing/byoc.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Defines test utilties useful for testing BYOC flows."""
-
-from tvm import relay
-from tvm.relay.expr_functor import ExprMutator
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-
-
-class CcompilerAnnotator(ExprMutator):
-    """
-    This is used to create external functions for ccompiler.
-    A simple annotator that creates the following program:
-           |
-      -- begin --
-           |
-          add
-           |
-        subtract
-           |
-        multiply
-           |
-       -- end --
-           |
-    """
-
-    def __init__(self):
-        super(CcompilerAnnotator, self).__init__()
-        self.in_compiler = 0
-
-    def visit_call(self, call):
-        if call.op.name == "add":  # Annotate begin at args
-            if self.in_compiler == 1:
-                lhs = compiler_begin(super().visit(call.args[0]), "ccompiler")
-                rhs = compiler_begin(super().visit(call.args[1]), "ccompiler")
-                op = relay.add(lhs, rhs)
-                self.in_compiler = 2
-                return op
-        elif call.op.name == "subtract":
-            if self.in_compiler == 1:
-                lhs = super().visit(call.args[0])
-                rhs = super().visit(call.args[1])
-                if isinstance(lhs, relay.expr.Var):
-                    lhs = compiler_begin(lhs, "ccompiler")
-                if isinstance(rhs, relay.expr.Var):
-                    rhs = compiler_begin(rhs, "ccompiler")
-                return relay.subtract(lhs, rhs)
-        elif call.op.name == "multiply":  # Annotate end at output
-            self.in_compiler = 1
-            lhs = super().visit(call.args[0])
-            rhs = super().visit(call.args[1])
-            if isinstance(lhs, relay.expr.Var):
-                lhs = compiler_begin(lhs, "ccompiler")
-            if isinstance(rhs, relay.expr.Var):
-                rhs = compiler_begin(rhs, "ccompiler")
-            op = relay.multiply(lhs, rhs)
-            if self.in_compiler == 2:
-                op = compiler_end(op, "ccompiler")
-            self.in_compiler = 0
-            return op
-        return super().visit_call(call)
diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py
deleted file mode 100644
index b1f364273e1b..000000000000
--- a/python/tvm/relay/testing/darknet.py
+++ /dev/null
@@ -1,540 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, no-init
-"""
-Compile DarkNet Models
-====================
-DarkNet helper functions for darknet model parsing and image loading.
-This functions will not be loaded by default.
-These are utility functions used for testing and tutorial file.
-"""
-from __future__ import division
-from cffi import FFI
-import numpy as np
-import cv2
-
-
-def convert_image(image):
-    """Convert the image with numpy."""
-    imagex = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    imagex = np.array(imagex)
-    imagex = imagex.transpose((2, 0, 1))
-    imagex = np.divide(imagex, 255.0)
-    imagex = np.flip(imagex, 0)
-    return imagex
-
-
-def load_image_color(test_image):
-    """To load the image using opencv api and do preprocessing."""
-    imagex = cv2.imread(test_image)
-    return convert_image(imagex)
-
-
-def _letterbox_image(img, w_in, h_in):
-    """To get the image in boxed format."""
-    imh, imw, imc = img.shape
-    if (w_in / imw) < (h_in / imh):
-        new_w = w_in
-        new_h = imh * w_in // imw
-    else:
-        new_h = h_in
-        new_w = imw * h_in // imh
-    dim = (new_w, new_h)
-    # Default interpolation method is INTER_LINEAR
-    # Other methods are INTER_AREA, INTER_NEAREST, INTER_CUBIC and INTER_LANCZOS4
-    # For more information see:
-    # https://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html#resize
-    resized = cv2.resize(src=img, dsize=dim, interpolation=cv2.INTER_CUBIC)
-    resized = convert_image(resized)
-    boxed = np.full((imc, h_in, w_in), 0.5, dtype=float)
-    _, resizedh, resizedw = resized.shape
-    boxed[
-        :,
-        int((h_in - new_h) / 2) : int((h_in - new_h) / 2) + resizedh,
-        int((w_in - new_w) / 2) : int((w_in - new_w) / 2) + resizedw,
-    ] = resized
-    return boxed
-
-
-def load_image(img, resize_width, resize_height):
-    """Load the image and convert to the darknet model format.
-    The image processing of darknet is different from normal.
-    Parameters
-    ----------
-    image : string
-        The image file name with path
-
-    resize_width : integer
-        The width to which the image needs to be resized
-
-    resize_height : integer
-        The height to which the image needs to be resized
-
-    Returns
-    -------
-    img : Float array
-        Array of processed image
-    """
-    imagex = cv2.imread(img)
-    return _letterbox_image(imagex, resize_width, resize_height)
-
-
-class LAYERTYPE(object):
-    """Darknet LAYERTYPE Class constant."""
-
-    CONVOLUTIONAL = 0
-    DECONVOLUTIONAL = 1
-    CONNECTED = 2
-    MAXPOOL = 3
-    SOFTMAX = 4
-    DETECTION = 5
-    DROPOUT = 6
-    CROP = 7
-    ROUTE = 8
-    COST = 9
-    NORMALIZATION = 10
-    AVGPOOL = 11
-    LOCAL = 12
-    SHORTCUT = 13
-    ACTIVE = 14
-    RNN = 15
-    GRU = 16
-    LSTM = 17
-    CRNN = 18
-    BATCHNORM = 19
-    NETWORK = 20
-    XNOR = 21
-    REGION = 22
-    YOLO = 23
-    REORG = 24
-    UPSAMPLE = 25
-    LOGXENT = 26
-    L2NORM = 27
-    BLANK = 28
-
-
-class ACTIVATION(object):
-    """Darknet ACTIVATION Class constant."""
-
-    LOGISTIC = 0
-    RELU = 1
-    RELIE = 2
-    LINEAR = 3
-    RAMP = 4
-    TANH = 5
-    PLSE = 6
-    LEAKY = 7
-    ELU = 8
-    LOGGY = 9
-    STAIR = 10
-    HARDTAN = 11
-    LHTAN = 12
-
-
-__darknetffi__ = FFI()
-
-__darknetffi__.cdef(
-    """
-typedef struct network network;
-typedef struct layer layer;
-
-typedef struct{
-    int *leaf;
-    int n;
-    int *parent;
-    int *child;
-    int *group;
-    char **name;
-
-    int groups;
-    int *group_size;
-    int *group_offset;
-} tree;
-
-typedef enum{
-    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN
-} ACTIVATION;
-
-
-typedef enum {
-    CONVOLUTIONAL,
-    DECONVOLUTIONAL,
-    CONNECTED,
-    MAXPOOL,
-    SOFTMAX,
-    DETECTION,
-    DROPOUT,
-    CROP,
-    ROUTE,
-    COST,
-    NORMALIZATION,
-    AVGPOOL,
-    LOCAL,
-    SHORTCUT,
-    ACTIVE,
-    RNN,
-    GRU,
-    LSTM,
-    CRNN,
-    BATCHNORM,
-    NETWORK,
-    XNOR,
-    REGION,
-    YOLO,
-    REORG,
-    UPSAMPLE,
-    LOGXENT,
-    L2NORM,
-    BLANK
-} LAYERTYPE;
-
-typedef enum{
-    SSE, MASKED, L1, SEG, SMOOTH, WGAN
-} COSTTYPE;
-
-
-struct layer{
-    LAYERTYPE type;
-    ACTIVATION activation;
-    COSTTYPE cost_type;
-    void (*forward);
-    void (*backward);
-    void (*update);
-    void (*forward_gpu);
-    void (*backward_gpu);
-    void (*update_gpu);
-    int batch_normalize;
-    int shortcut;
-    int batch;
-    int forced;
-    int flipped;
-    int inputs;
-    int outputs;
-    int nweights;
-    int nbiases;
-    int extra;
-    int truths;
-    int h,w,c;
-    int out_h, out_w, out_c;
-    int n;
-    int max_boxes;
-    int groups;
-    int size;
-    int side;
-    int stride;
-    int reverse;
-    int flatten;
-    int spatial;
-    int pad;
-    int sqrt;
-    int flip;
-    int index;
-    int binary;
-    int xnor;
-    int steps;
-    int hidden;
-    int truth;
-    float smooth;
-    float dot;
-    float angle;
-    float jitter;
-    float saturation;
-    float exposure;
-    float shift;
-    float ratio;
-    float learning_rate_scale;
-    float clip;
-    int softmax;
-    int classes;
-    int coords;
-    int background;
-    int rescore;
-    int objectness;
-    int joint;
-    int noadjust;
-    int reorg;
-    int log;
-    int tanh;
-    int *mask;
-    int total;
-
-    float alpha;
-    float beta;
-    float kappa;
-
-    float coord_scale;
-    float object_scale;
-    float noobject_scale;
-    float mask_scale;
-    float class_scale;
-    int bias_match;
-    int random;
-    float ignore_thresh;
-    float truth_thresh;
-    float thresh;
-    float focus;
-    int classfix;
-    int absolute;
-
-    int onlyforward;
-    int stopbackward;
-    int dontload;
-    int dontsave;
-    int dontloadscales;
-
-    float temperature;
-    float probability;
-    float scale;
-
-    char  * cweights;
-    int   * indexes;
-    int   * input_layers;
-    int   * input_sizes;
-    int   * map;
-    float * rand;
-    float * cost;
-    float * state;
-    float * prev_state;
-    float * forgot_state;
-    float * forgot_delta;
-    float * state_delta;
-    float * combine_cpu;
-    float * combine_delta_cpu;
-
-    float * concat;
-    float * concat_delta;
-
-    float * binary_weights;
-
-    float * biases;
-    float * bias_updates;
-
-    float * scales;
-    float * scale_updates;
-
-    float * weights;
-    float * weight_updates;
-
-    float * delta;
-    float * output;
-    float * loss;
-    float * squared;
-    float * norms;
-
-    float * spatial_mean;
-    float * mean;
-    float * variance;
-
-    float * mean_delta;
-    float * variance_delta;
-
-    float * rolling_mean;
-    float * rolling_variance;
-
-    float * x;
-    float * x_norm;
-
-    float * m;
-    float * v;
-
-    float * bias_m;
-    float * bias_v;
-    float * scale_m;
-    float * scale_v;
-
-
-    float *z_cpu;
-    float *r_cpu;
-    float *h_cpu;
-    float * prev_state_cpu;
-
-    float *temp_cpu;
-    float *temp2_cpu;
-    float *temp3_cpu;
-
-    float *dh_cpu;
-    float *hh_cpu;
-    float *prev_cell_cpu;
-    float *cell_cpu;
-    float *f_cpu;
-    float *i_cpu;
-    float *g_cpu;
-    float *o_cpu;
-    float *c_cpu;
-    float *dc_cpu;
-
-    float * binary_input;
-
-    struct layer *input_layer;
-    struct layer *self_layer;
-    struct layer *output_layer;
-
-    struct layer *reset_layer;
-    struct layer *update_layer;
-    struct layer *state_layer;
-
-    struct layer *input_gate_layer;
-    struct layer *state_gate_layer;
-    struct layer *input_save_layer;
-    struct layer *state_save_layer;
-    struct layer *input_state_layer;
-    struct layer *state_state_layer;
-
-    struct layer *input_z_layer;
-    struct layer *state_z_layer;
-
-    struct layer *input_r_layer;
-    struct layer *state_r_layer;
-
-    struct layer *input_h_layer;
-    struct layer *state_h_layer;
-
-    struct layer *wz;
-    struct layer *uz;
-    struct layer *wr;
-    struct layer *ur;
-    struct layer *wh;
-    struct layer *uh;
-    struct layer *uo;
-    struct layer *wo;
-    struct layer *uf;
-    struct layer *wf;
-    struct layer *ui;
-    struct layer *wi;
-    struct layer *ug;
-    struct layer *wg;
-
-    tree *softmax_tree;
-
-    size_t workspace_size;
-};
-
-
-typedef enum {
-    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
-} LEARNINGRATEPOLICY;
-
-typedef struct network{
-    int n;
-    int batch;
-    size_t *seen;
-    int *t;
-    float epoch;
-    int subdivisions;
-    layer *layers;
-    float *output;
-    LEARNINGRATEPOLICY policy;
-
-    float learning_rate;
-    float momentum;
-    float decay;
-    float gamma;
-    float scale;
-    float power;
-    int time_steps;
-    int step;
-    int max_batches;
-    float *scales;
-    int   *steps;
-    int num_steps;
-    int burn_in;
-
-    int adam;
-    float B1;
-    float B2;
-    float eps;
-
-    int inputs;
-    int outputs;
-    int truths;
-    int notruth;
-    int h, w, c;
-    int max_crop;
-    int min_crop;
-    float max_ratio;
-    float min_ratio;
-    int center;
-    float angle;
-    float aspect;
-    float exposure;
-    float saturation;
-    float hue;
-    int random;
-
-    int gpu_index;
-    tree *hierarchy;
-
-    float *input;
-    float *truth;
-    float *delta;
-    float *workspace;
-    int train;
-    int index;
-    float *cost;
-    float clip;
-} network;
-
-
-typedef struct {
-    int w;
-    int h;
-    int c;
-    float *data;
-} image;
-
-network *load_network(char *cfg, char *weights, int clear);
-image letterbox_image(image im, int w, int h);
-int resize_network(network *net, int w, int h);
-void top_predictions(network *net, int n, int *index);
-void free_image(image m);
-image load_image_color(char *filename, int w, int h);
-float *network_predict_image(network *net, image im);
-float *network_predict(network *net, float *input);
-network *make_network(int n);
-layer make_convolutional_layer(
-    int batch,
-    int h, int w, int c, int n,
-    int groups, int size, int stride, int padding,
-    ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
-layer make_connected_layer(int batch, int inputs, int outputs,
-    ACTIVATION activation, int batch_normalize, int adam);
-layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding);
-layer make_avgpool_layer(int batch, int w, int h, int c);
-layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2);
-layer make_batchnorm_layer(int batch, int w, int h, int c);
-layer make_reorg_layer(
-    int batch, int w, int h, int c,
-    int stride, int reverse, int flatten, int extra);
-layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
-layer make_softmax_layer(int batch, int inputs, int groups);
-layer make_rnn_layer(int batch, int inputs, int outputs,
-    int steps, ACTIVATION activation, int batch_normalize, int adam);
-layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes);
-layer make_crnn_layer(
-    int batch, int h, int w, int c,
-    int hidden_filters, int output_filters, int steps,
-    ACTIVATION activation, int batch_normalize);
-layer make_lstm_layer(
-    int batch, int inputs, int outputs, int steps,
-    int batch_normalize, int adam);
-layer make_gru_layer(int batch, int inputs,
-    int outputs, int steps, int batch_normalize, int adam);
-layer make_upsample_layer(int batch, int w, int h, int c, int stride);
-layer make_l2norm_layer(int batch, int inputs);
-void free_network(network *net);
-"""
-)
diff --git a/python/tvm/relay/testing/dcgan.py b/python/tvm/relay/testing/dcgan.py
deleted file mode 100644
index 4749d76dbcce..000000000000
--- a/python/tvm/relay/testing/dcgan.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""
-Net of the generator of DCGAN
-
-Adopted from:
-https://github.com/tqchen/mxnet-gan/blob/main/mxgan/generator.py
-
-Reference:
-Radford, Alec, Luke Metz, and Soumith Chintala.
-"Unsupervised representation learning with deep convolutional generative adversarial networks."
-arXiv preprint arXiv:1511.06434 (2015).
-"""
-from tvm import relay
-
-from . import layers
-from .init import create_workload
-
-
-def deconv2d(data, ishape, oshape, kshape, layout, name, stride=(2, 2)):
-    """a deconv layer that enlarges the feature map"""
-    target_shape = (oshape[-2], oshape[-1])
-
-    pad_y = (kshape[0] - 1) // 2
-    pad_x = (kshape[1] - 1) // 2
-    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
-    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
-
-    if layout == "NCHW":
-        kernel_layout = "IOHW"
-    elif layout == "NHWC":
-        kernel_layout = "HWOI"
-    else:
-        raise ValueError("Invalid layout: " + layout)
-
-    net = layers.conv2d_transpose(
-        data,
-        kernel_size=kshape,
-        strides=stride,
-        channels=oshape[0],
-        padding=(pad_y, pad_x),
-        output_padding=(adj_y, adj_x),
-        data_layout=layout,
-        kernel_layout=kernel_layout,
-        name=name,
-    )
-    return net
-
-
-def deconv2d_bn_relu(data, prefix, **kwargs):
-    """a block of deconv + batch norm + relu"""
-    eps = 1e-5 + 1e-12
-    net = deconv2d(data, name=f"{prefix}_deconv", **kwargs)
-    bn_axis = kwargs.get("layout", "NCHW").index("C")
-    net = layers.batch_norm_infer(
-        net, epsilon=eps, scale=False, axis=bn_axis, name=f"{prefix}_batch_norm"
-    )
-    net = relay.nn.relu(net)
-    return net
-
-
-def get_net(
-    batch_size,
-    random_len=100,
-    oshape=(3, 64, 64),
-    ngf=128,
-    code=None,
-    layout="NCHW",
-    dtype="float32",
-):
-    """get net of dcgan generator"""
-    assert oshape[-1] == 64, "Only support 64x64 image"
-    assert oshape[-2] == 64, "Only support 64x64 image"
-
-    code = relay.var("data", dtype=dtype, shape=(batch_size, random_len)) if code is None else code
-    dense_weight = relay.var("dense_weight")
-    dense = relay.nn.dense(code, weight=dense_weight, units=4 * 4 * ngf * 8)
-    relu = relay.nn.relu(dense)
-    # 4 x 4
-    if layout == "NCHW":
-        reshape = relay.reshape(relu, newshape=(-1, ngf * 8, 4, 4))
-    elif layout == "NHWC":
-        reshape = relay.reshape(relu, newshape=(-1, 4, 4, ngf * 8))
-    else:
-        raise ValueError("Invalid layout: " + layout)
-    # 8 x 8
-    dc8 = deconv2d_bn_relu(
-        reshape,
-        ishape=(ngf * 8, 4, 4),
-        oshape=(ngf * 4, 8, 8),
-        kshape=(4, 4),
-        layout=layout,
-        prefix="g2",
-    )
-    # 16x16
-    dc16 = deconv2d_bn_relu(
-        dc8,
-        ishape=(ngf * 4, 8, 8),
-        oshape=(ngf * 2, 16, 16),
-        kshape=(4, 4),
-        layout=layout,
-        prefix="g3",
-    )
-    # 32x32
-    dc32 = deconv2d_bn_relu(
-        dc16,
-        ishape=(ngf * 2, 16, 16),
-        oshape=(ngf, 32, 32),
-        kshape=(4, 4),
-        layout=layout,
-        prefix="g4",
-    )
-    # 64x64
-    dc64 = deconv2d(
-        dc32,
-        ishape=(ngf, 32, 32),
-        oshape=oshape[-3:],
-        kshape=(4, 4),
-        layout=layout,
-        name="g5_deconv",
-    )
-    tanh = relay.tanh(dc64)
-
-    args = relay.analysis.free_vars(tanh)
-    return relay.Function(args, tanh)
-
-
-def get_workload(
-    batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, layout="NCHW", dtype="float32"
-):
-    """Get benchmark workload for a DCGAN generator
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-    oshape : tuple, optional
-        The shape of output image, layout="CHW"
-    ngf: int, optional
-        The number of final feature maps in the generator
-    random_len : int, optional
-        The length of random input
-    layout: str, optional
-        The layout of conv2d transpose
-    dtype : str, optional
-        The data type
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a DCGAN network.
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(batch_size, random_len, oshape=oshape, ngf=ngf, layout=layout, dtype=dtype)
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/densenet.py b/python/tvm/relay/testing/densenet.py
deleted file mode 100644
index c9deb7868330..000000000000
--- a/python/tvm/relay/testing/densenet.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name, line-too-long
-"""
-Port of MxNet version of Densenet to Relay.
-https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/model_zoo/vision/densenet.py
-"""
-# pylint: enable=line-too-long
-from tvm import relay
-from . import layers
-from .init import create_workload
-
-
-def _make_dense_layer(data, growth_rate, bn_size, index):
-    """Single densenet layer."""
-    bn1 = layers.batch_norm_infer(data, name=f"batch_1_{index}")
-    relu1 = relay.nn.relu(bn1)
-    conv1 = layers.conv2d(
-        relu1, channels=bn_size * growth_rate, kernel_size=(1, 1), name=f"conv2d_1_{index}"
-    )
-    bn2 = layers.batch_norm_infer(conv1, name="batch_2_" + index)
-    relu2 = relay.nn.relu(bn2)
-    conv2 = layers.conv2d(
-        relu2, channels=growth_rate, kernel_size=(3, 3), padding=(1, 1), name=f"conv2d_2_{index}"
-    )
-    return conv2
-
-
-def _make_dense_block(data, num_layers, bn_size, growth_rate, index):
-    """Makes a block of dense layers of the specified size."""
-    layer_out = data
-    blocks = []
-    for i in range(num_layers):
-        layer_out = _make_dense_layer(layer_out, growth_rate, bn_size, f"{index}_{i}")
-        blocks.append(layer_out)
-    block_out = relay.concatenate(blocks, 1)
-    return block_out
-
-
-def _make_transition(data, num_output_features, index):
-    """Transition between layers."""
-    bn = layers.batch_norm_infer(data, name=f"batch_t_{index}")
-    relu = relay.nn.relu(bn)
-    conv = layers.conv2d(
-        relu, channels=num_output_features, kernel_size=(1, 1), name=f"conv_t_{index}"
-    )
-    return relay.nn.avg_pool2d(conv, pool_size=(2, 2), strides=(2, 2))
-
-
-def _make_dense_net(
-    num_init_features, growth_rate, block_config, data_shape, data_dtype, bn_size=4, classes=1000
-):
-    """Builds up a densenet."""
-    data = relay.Var(
-        "data", relay.TensorType(data_shape, data_dtype)
-    )  # (batch_size, 3, 224, 224)))
-    conv1 = layers.conv2d(
-        data,
-        channels=num_init_features,
-        kernel_size=(7, 7),
-        strides=(2, 2),
-        padding=(3, 3),
-        name="conv1",
-    )
-    bn1 = layers.batch_norm_infer(conv1, name="batch1")
-    relu1 = relay.nn.relu(bn1)
-    mp = relay.nn.max_pool2d(relu1, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
-
-    num_features = num_init_features
-    layer_out = mp
-    for i, num_layers in enumerate(block_config):
-        layer_out = _make_dense_block(layer_out, num_layers, bn_size, growth_rate, i)
-        num_features = num_features + num_layers * growth_rate
-        if i != len(block_config) - 1:
-            layer_out = _make_transition(layer_out, num_features // 2, i)
-            num_features = num_features // 2
-    bn2 = layers.batch_norm_infer(layer_out, name="batch2")
-    relu2 = relay.nn.relu(bn2)
-    avg = relay.nn.avg_pool2d(relu2, pool_size=(7, 7))
-    flat = relay.nn.batch_flatten(avg)
-
-    ret = layers.dense_add_bias(flat, units=classes, name="dense")
-
-    return relay.Function(relay.analysis.free_vars(ret), ret)
-
-
-def get_workload(
-    densenet_size=121, classes=1000, batch_size=4, image_shape=(3, 224, 224), dtype="float32"
-):
-    """Gets benchmark workload for densenet.
-
-    Parameters
-    ----------
-    densenet_size : int, optional (default 121)
-        Parameter for the network size. The supported sizes
-        are 121, 161, 169, and 201.
-
-    classes : int, optional (default 1000)
-        The number of classes.
-
-    batch_size : int, optional (detault 4)
-        The batch size for the network.
-
-    image_shape : shape, optional (default (3, 224, 224))
-        The shape of the input data.
-
-    dtype : data type, optional (default 'float32')
-        The data type of the input data.
-
-    Returns
-    -------
-    mod: tvm.IRModule
-        The relay module that contains a DenseNet network.
-
-    params : dict of str to NDArray
-        The benchmark paraeters.
-    """
-    specs = {
-        121: (64, 32, [6, 12, 24, 16]),
-        161: (96, 48, [6, 12, 36, 24]),
-        169: (69, 32, [6, 12, 32, 32]),
-        201: (64, 32, [6, 12, 48, 32]),
-    }
-    bn_size = 4
-    num_init_features, growth_rate, block_config = specs[densenet_size]
-    data_shape = tuple([batch_size] + list(image_shape))
-    net = _make_dense_net(
-        num_init_features, growth_rate, block_config, data_shape, dtype, bn_size, classes
-    )
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/dqn.py b/python/tvm/relay/testing/dqn.py
deleted file mode 100644
index dd31ab850119..000000000000
--- a/python/tvm/relay/testing/dqn.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Net of Nature DQN
-Reference:
-Mnih, Volodymyr, et al. "Human-level control through deep reinforcement learning."
-Nature 518.7540 (2015): 529.
-"""
-
-from tvm import relay
-from . import layers
-from .init import create_workload
-
-
-def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32", layout="NCHW"):
-    """get symbol of nature dqn"""
-    data_shape = (batch_size,) + image_shape
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-
-    bias_axis = layout.index("C")
-
-    conv1_bias = relay.var("conv1_bias")
-    conv1 = layers.conv2d(
-        data,
-        kernel_size=(8, 8),
-        strides=(4, 4),
-        padding=(0, 0),
-        channels=32,
-        name="conv1",
-        data_layout=layout,
-        kernel_layout=layers.conv_kernel_layout(layout),
-    )
-    conv1 = relay.nn.bias_add(conv1, conv1_bias, bias_axis)
-    relu1 = relay.nn.relu(conv1)
-
-    conv2_bias = relay.var("conv2_bias")
-    conv2 = layers.conv2d(
-        relu1,
-        kernel_size=(4, 4),
-        strides=(2, 2),
-        padding=(0, 0),
-        channels=64,
-        name="conv2",
-        data_layout=layout,
-        kernel_layout=layers.conv_kernel_layout(layout),
-    )
-    conv2 = relay.nn.bias_add(conv2, conv2_bias, bias_axis)
-    relu2 = relay.nn.relu(conv2)
-
-    conv3_bias = relay.var("conv3_bias")
-    conv3 = layers.conv2d(
-        relu2,
-        kernel_size=(3, 3),
-        strides=(1, 1),
-        padding=(0, 0),
-        channels=64,
-        name="conv3",
-        data_layout=layout,
-        kernel_layout=layers.conv_kernel_layout(layout),
-    )
-    conv3 = relay.nn.bias_add(conv3, conv3_bias, bias_axis)
-    relu3 = relay.nn.relu(conv3)
-
-    bf1 = relay.nn.batch_flatten(relu3)
-    dense1 = layers.dense_add_bias(bf1, units=512, name="dense1")
-    relu4 = relay.nn.relu(dense1)
-    dense2 = layers.dense_add_bias(relu4, units=num_actions, name="dense2")
-
-    args = relay.analysis.free_vars(dense2)
-    return relay.Function(args, dense2)
-
-
-def get_workload(
-    batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32", layout="NCHW"
-):
-    """Get benchmark workload for a Deep Q Network
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-    num_actions : int, optional
-        Number of actions
-    image_shape : tuple, optional
-        The input image shape
-    dtype : str, optional
-        The data type
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a DQN network.
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(
-        batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype, layout=layout
-    )
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/inception_v3.py b/python/tvm/relay/testing/inception_v3.py
deleted file mode 100644
index e5b89ccdecce..000000000000
--- a/python/tvm/relay/testing/inception_v3.py
+++ /dev/null
@@ -1,425 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Inception V3, suitable for images with around 299 x 299
-
-Reference:
-Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision."
-arXiv preprint arXiv:1512.00567 (2015).
-
-Adopted from https://github.com/apache/incubator-mxnet/blob/master/
-             example/image-classification/symbols/inception-v3.py
-"""
-# pylint: disable=invalid-name,missing-docstring,unused-argument, superfluous-parens
-from tvm import relay
-from .init import create_workload
-from . import layers
-
-
-def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=""):
-    conv = layers.conv2d(
-        data=data,
-        channels=int(num_filter),
-        kernel_size=kernel,
-        strides=stride,
-        padding=pad,
-        name=f"{name}{suffix}_conv1",
-    )
-
-    bn = layers.batch_norm_infer(data=conv, epsilon=2e-5, scale=False, name=f"{name}{suffix}_bn")
-    act = relay.nn.relu(data=bn)
-    return act
-
-
-def Pooling(data, kernel, stride, pad, pool_type, name):
-    if pool_type == "max":
-        return relay.nn.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad)
-    if pool_type == "avg":
-        return relay.nn.avg_pool2d(
-            data=data, pool_size=kernel, strides=stride, padding=pad, count_include_pad=True
-        )
-    raise ValueError("Invalid pooling type: " + pool_type)
-
-
-def Inception7A(
-    data, num_1x1, num_3x3_red, num_3x3_1, num_3x3_2, num_5x5_red, num_5x5, pool, proj, name
-):
-    tower_1x1 = Conv(data, num_1x1, name=f"{name}_conv")
-    tower_5x5 = Conv(data, num_5x5_red, name=f"{name}_tower", suffix="_conv")
-    tower_5x5 = Conv(
-        tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=f"{name}_tower", suffix="_conv_1"
-    )
-    tower_3x3 = Conv(data, num_3x3_red, name=f"{name}_tower_1", suffix="_conv")
-    tower_3x3 = Conv(
-        tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=f"{name}_tower_1", suffix="_conv_1"
-    )
-    tower_3x3 = Conv(
-        tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=f"{name}_tower_1", suffix="_conv_2"
-    )
-    pooling = Pooling(
-        data=data,
-        kernel=(3, 3),
-        stride=(1, 1),
-        pad=(1, 1),
-        pool_type=pool,
-        name=f"{pool}_pool_{name}_pool",
-    )
-
-    cproj = Conv(pooling, proj, name=f"{name}_tower_2", suffix="_conv")
-    concat = relay.concatenate((tower_1x1, tower_5x5, tower_3x3, cproj), axis=1)
-    return concat
-
-
-# First Downsample
-def Inception7B(data, num_3x3, num_d3x3_red, num_d3x3_1, num_d3x3_2, pool, name):
-    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=f"{name}_conv")
-    tower_d3x3 = Conv(data, num_d3x3_red, name=f"{name}_tower", suffix="_conv")
-    tower_d3x3 = Conv(
-        tower_d3x3,
-        num_d3x3_1,
-        kernel=(3, 3),
-        pad=(1, 1),
-        stride=(1, 1),
-        name=f"{name}_tower",
-        suffix="_conv_1",
-    )
-    tower_d3x3 = Conv(
-        tower_d3x3,
-        num_d3x3_2,
-        kernel=(3, 3),
-        pad=(0, 0),
-        stride=(2, 2),
-        name=f"{name}_tower",
-        suffix="_conv_2",
-    )
-    pooling = Pooling(
-        data=data,
-        kernel=(3, 3),
-        stride=(2, 2),
-        pad=(0, 0),
-        pool_type="max",
-        name=f"max_pool_{name}_pool",
-    )
-    concat = relay.concatenate((tower_3x3, tower_d3x3, pooling), axis=1)
-    return concat
-
-
-def Inception7C(
-    data,
-    num_1x1,
-    num_d7_red,
-    num_d7_1,
-    num_d7_2,
-    num_q7_red,
-    num_q7_1,
-    num_q7_2,
-    num_q7_3,
-    num_q7_4,
-    pool,
-    proj,
-    name,
-):
-    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=f"{name}_conv")
-    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=f"{name}_tower", suffix="_conv")
-    tower_d7 = Conv(
-        data=tower_d7,
-        num_filter=num_d7_1,
-        kernel=(1, 7),
-        pad=(0, 3),
-        name=f"{name}_tower",
-        suffix="_conv_1",
-    )
-    tower_d7 = Conv(
-        data=tower_d7,
-        num_filter=num_d7_2,
-        kernel=(7, 1),
-        pad=(3, 0),
-        name=f"{name}_tower",
-        suffix="_conv_2",
-    )
-    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=f"{name}_tower_1", suffix="_conv")
-    tower_q7 = Conv(
-        data=tower_q7,
-        num_filter=num_q7_1,
-        kernel=(7, 1),
-        pad=(3, 0),
-        name=f"{name}_tower_1",
-        suffix="_conv_1",
-    )
-    tower_q7 = Conv(
-        data=tower_q7,
-        num_filter=num_q7_2,
-        kernel=(1, 7),
-        pad=(0, 3),
-        name=f"{name}_tower_1",
-        suffix="_conv_2",
-    )
-    tower_q7 = Conv(
-        data=tower_q7,
-        num_filter=num_q7_3,
-        kernel=(7, 1),
-        pad=(3, 0),
-        name=f"{name}_tower_1",
-        suffix="_conv_3",
-    )
-    tower_q7 = Conv(
-        data=tower_q7,
-        num_filter=num_q7_4,
-        kernel=(1, 7),
-        pad=(0, 3),
-        name=f"{name}_tower_1",
-        suffix="_conv_4",
-    )
-    pooling = Pooling(
-        data=data,
-        kernel=(3, 3),
-        stride=(1, 1),
-        pad=(1, 1),
-        pool_type=pool,
-        name=f"{pool}_pool_{name}_pool",
-    )
-    cproj = Conv(
-        data=pooling, num_filter=proj, kernel=(1, 1), name=f"{name}_tower_2", suffix="_conv"
-    )
-    # concat
-    concat = relay.concatenate((tower_1x1, tower_d7, tower_q7, cproj), axis=1)
-    return concat
-
-
-def Inception7D(
-    data, num_3x3_red, num_3x3, num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, pool, name
-):
-    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=f"{name}_tower", suffix="_conv")
-    tower_3x3 = Conv(
-        data=tower_3x3,
-        num_filter=num_3x3,
-        kernel=(3, 3),
-        pad=(0, 0),
-        stride=(2, 2),
-        name=f"{name}_tower",
-        suffix="_conv_1",
-    )
-    tower_d7_3x3 = Conv(
-        data=data, num_filter=num_d7_3x3_red, name=f"{name}_tower_1", suffix="_conv"
-    )
-    tower_d7_3x3 = Conv(
-        data=tower_d7_3x3,
-        num_filter=num_d7_1,
-        kernel=(1, 7),
-        pad=(0, 3),
-        name=f"{name}_tower_1",
-        suffix="_conv_1",
-    )
-    tower_d7_3x3 = Conv(
-        data=tower_d7_3x3,
-        num_filter=num_d7_2,
-        kernel=(7, 1),
-        pad=(3, 0),
-        name=f"{name}_tower_1",
-        suffix="_conv_2",
-    )
-    tower_d7_3x3 = Conv(
-        data=tower_d7_3x3,
-        num_filter=num_d7_3x3,
-        kernel=(3, 3),
-        stride=(2, 2),
-        name=f"{name}_tower_1",
-        suffix="_conv_3",
-    )
-    pooling = Pooling(
-        data=data,
-        kernel=(3, 3),
-        stride=(2, 2),
-        pool_type=pool,
-        pad=(0, 0),
-        name=f"{pool}_pool_{name}_pool",
-    )
-    # concat
-    concat = relay.concatenate((tower_3x3, tower_d7_3x3, pooling), axis=1)
-    return concat
-
-
-def Inception7E(
-    data,
-    num_1x1,
-    num_d3_red,
-    num_d3_1,
-    num_d3_2,
-    num_3x3_d3_red,
-    num_3x3,
-    num_3x3_d3_1,
-    num_3x3_d3_2,
-    pool,
-    proj,
-    name,
-):
-    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=f"{name}_conv")
-    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=f"{name}_tower", suffix="_conv")
-    tower_d3_a = Conv(
-        data=tower_d3,
-        num_filter=num_d3_1,
-        kernel=(1, 3),
-        pad=(0, 1),
-        name=f"{name}_tower",
-        suffix="_mixed_conv",
-    )
-    tower_d3_b = Conv(
-        data=tower_d3,
-        num_filter=num_d3_2,
-        kernel=(3, 1),
-        pad=(1, 0),
-        name=f"{name}_tower",
-        suffix="_mixed_conv_1",
-    )
-    tower_3x3_d3 = Conv(
-        data=data, num_filter=num_3x3_d3_red, name=f"{name}_tower_1", suffix="_conv"
-    )
-    tower_3x3_d3 = Conv(
-        data=tower_3x3_d3,
-        num_filter=num_3x3,
-        kernel=(3, 3),
-        pad=(1, 1),
-        name=f"{name}_tower_1",
-        suffix="_conv_1",
-    )
-    tower_3x3_d3_a = Conv(
-        data=tower_3x3_d3,
-        num_filter=num_3x3_d3_1,
-        kernel=(1, 3),
-        pad=(0, 1),
-        name=f"{name}_tower_1",
-        suffix="_mixed_conv",
-    )
-    tower_3x3_d3_b = Conv(
-        data=tower_3x3_d3,
-        num_filter=num_3x3_d3_2,
-        kernel=(3, 1),
-        pad=(1, 0),
-        name=f"{name}_tower_1",
-        suffix="_mixed_conv_1",
-    )
-    pooling = Pooling(
-        data=data,
-        kernel=(3, 3),
-        stride=(1, 1),
-        pad=(1, 1),
-        pool_type=pool,
-        name=f"{pool}_pool_{name}_pool",
-    )
-    cproj = Conv(
-        data=pooling, num_filter=proj, kernel=(1, 1), name=f"{name}_tower_2", suffix="_conv"
-    )
-    # concat
-    concat = relay.concatenate(
-        (tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj), axis=1
-    )
-    return concat
-
-
-def get_net(batch_size, num_classes, image_shape, dtype):
-    """Get network a Inception v3 network.
-
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of claseses
-
-    image_shape : tuple, optional
-        The input image shape
-
-    dtype : str, optional
-        The data type
-
-    Returns
-    -------
-    net : relay.Function
-        The dataflow.
-    """
-    data_shape = (batch_size,) + image_shape
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-
-    # stage 1
-    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
-    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
-    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
-    pool = Pooling(
-        data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0), name="pool"
-    )
-    # stage 2
-    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
-    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
-    pool1 = Pooling(
-        data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0), name="pool1"
-    )
-
-    # stage 3
-    in3a = Inception7A(pool1, 64, 64, 96, 96, 48, 64, "avg", 32, "mixed")
-
-    in3b = Inception7A(in3a, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_1")
-    in3c = Inception7A(in3b, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_2")
-    in3d = Inception7B(in3c, 384, 64, 96, 96, "max", "mixed_3")
-    # stage 4
-    in4a = Inception7C(in3d, 192, 128, 128, 192, 128, 128, 128, 128, 192, "avg", 192, "mixed_4")
-    in4b = Inception7C(in4a, 192, 160, 160, 192, 160, 160, 160, 160, 192, "avg", 192, "mixed_5")
-    in4c = Inception7C(in4b, 192, 160, 160, 192, 160, 160, 160, 160, 192, "avg", 192, "mixed_6")
-    in4d = Inception7C(in4c, 192, 192, 192, 192, 192, 192, 192, 192, 192, "avg", 192, "mixed_7")
-    in4e = Inception7D(in4d, 192, 320, 192, 192, 192, 192, "max", "mixed_8")
-    # stage 5
-    in5a = Inception7E(in4e, 320, 384, 384, 384, 448, 384, 384, 384, "avg", 192, "mixed_9")
-    in5b = Inception7E(in5a, 320, 384, 384, 384, 448, 384, 384, 384, "max", 192, "mixed_10")
-
-    # pool
-    pool = Pooling(
-        data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0), name="global_pool"
-    )
-
-    flatten = relay.nn.batch_flatten(pool)
-    fc1 = relay.nn.dense(flatten, relay.var("fc1_weight"), units=num_classes)
-    fc1 = relay.nn.bias_add(fc1, relay.var("fc2_bias"), axis=-1)
-    inception_v3 = relay.nn.softmax(data=fc1)
-    args = relay.analysis.free_vars(inception_v3)
-    return relay.Function(args, inception_v3)
-
-
-def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 299, 299), dtype="float32"):
-    """Get benchmark workload for InceptionV3
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of classes
-
-    image_shape : tuple, optional
-        The input image shape
-
-    dtype : str, optional
-        The data type
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains an Inception V3 network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(batch_size, num_classes, image_shape, dtype)
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/init.py b/python/tvm/relay/testing/init.py
deleted file mode 100644
index 373b5a8ec3ac..000000000000
--- a/python/tvm/relay/testing/init.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Initializer of parameters."""
-from functools import reduce
-import numpy as np
-
-import tvm
-from tvm import relay
-
-
-class Initializer(object):
-    """The base class of an initializer."""
-
-    def __init__(self, **kwargs):
-        self._kwargs = kwargs
-
-    def __call__(self, desc, arr):
-        """Initialize an array
-
-        Parameters
-        ----------
-        desc : str
-            Initialization pattern descriptor.
-
-        arr : NDArray
-            The array to be initialized.
-        """
-        if desc.endswith("weight"):
-            self._init_weight(desc, arr)
-        elif desc.endswith("bias"):
-            self._init_bias(desc, arr)
-        elif desc.endswith("gamma"):
-            self._init_gamma(desc, arr)
-        elif desc.endswith("beta"):
-            self._init_beta(desc, arr)
-        elif desc.endswith("mean"):
-            self._init_mean(desc, arr)
-        elif desc.endswith("var"):
-            self._init_var(desc, arr)
-        else:
-            self._init_default(desc, arr)
-
-    def _init_bias(self, _, arr):
-        arr[:] = 0.0
-
-    def _init_gamma(self, _, arr):
-        arr[:] = 1.0
-
-    def _init_beta(self, _, arr):
-        arr[:] = 0.0
-
-    def _init_mean(self, _, arr):
-        arr[:] = 0.0
-
-    def _init_var(self, _, arr):
-        arr[:] = 1.0
-
-    def _init_weight(self, name, arr):
-        """Abstract method to Initialize weight."""
-        raise NotImplementedError("Must override it")
-
-    def _init_default(self, name, _):
-        raise ValueError(
-            f"Unknown initialization pattern for {name}. "
-            f"Default initialization is now limited to "
-            f'"weight", "bias", "gamma" (1.0), and "beta" (0.0).'
-            f"Please use mx.sym.Variable(init=mx.init.*) to set initialization pattern"
-        )
-
-
-class Xavier(Initializer):
-    """ "Xavier" initialization for weights
-
-    Parameters
-    ----------
-    rnd_type: str, optional
-        Random generator type, can be ``'gaussian'`` or ``'uniform'``.
-
-    factor_type: str, optional
-        Can be ``'avg'``, ``'in'``, or ``'out'``.
-
-    magnitude: float, optional
-        Scale of random number.
-    """
-
-    def __init__(self, rnd_type="uniform", factor_type="avg", magnitude=3):
-        super(Xavier, self).__init__(
-            rnd_type=rnd_type, factor_type=factor_type, magnitude=magnitude
-        )
-        self.rnd_type = rnd_type
-        self.factor_type = factor_type
-        self.magnitude = float(magnitude)
-
-    def _init_weight(self, name, arr):
-        shape = arr.shape
-        hw_scale = 1.0
-        if len(shape) < 2:
-            raise ValueError(
-                f"Xavier initializer cannot be applied to vector {name}. It requires at least 2D."
-            )
-        if len(shape) > 2:
-            hw_scale = np.prod(shape[2:])
-        fan_in, fan_out = shape[1] * hw_scale, shape[0] * hw_scale
-        factor = 1.0
-        if self.factor_type == "avg":
-            factor = (fan_in + fan_out) / 2.0
-        elif self.factor_type == "in":
-            factor = fan_in
-        elif self.factor_type == "out":
-            factor = fan_out
-        else:
-            raise ValueError("Incorrect factor type")
-        # Hack for mobilenet, because there is less connectivity
-        if "depthwise" in name:
-            factor = hw_scale
-        scale = np.sqrt(self.magnitude / factor)
-        if self.rnd_type == "uniform":
-            arr[:] = np.random.uniform(-scale, scale, size=arr.shape)
-        else:
-            raise ValueError("Unknown random type")
-
-
-class Constant(Initializer):
-    """Constant initialization of weights. Sum of weights in the matrix is 1."""
-
-    def _init_weight(self, name, arr):
-        num_elements = reduce(lambda x, y: x * y, arr.shape)
-        arr[:] = 1.0 / num_elements
-
-
-def create_workload(net, initializer=None, seed=0):
-    """Helper function to create benchmark image classification workload.
-
-    Parameters
-    ----------
-    net : tvm.relay.Function
-        The selected function of the network.
-
-    initializer : Initializer
-        The initializer used
-
-    seed : int
-        The seed used in initialization.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The created relay module.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    mod = tvm.IRModule.from_expr(net)
-    mod = relay.transform.InferType()(mod)
-    shape_dict = {v.name_hint: v.checked_type for v in mod["main"].params}
-    np.random.seed(seed)
-    initializer = initializer if initializer else Xavier()
-    params = {}
-    for k, v in shape_dict.items():
-        if k == "data":
-            continue
-        init_value = np.zeros(v.concrete_shape).astype(v.dtype)
-        initializer(k, init_value)
-        params[k] = tvm.nd.array(init_value, device=tvm.cpu(0))
-    return mod, params
diff --git a/python/tvm/relay/testing/layers.py b/python/tvm/relay/testing/layers.py
deleted file mode 100644
index 8496c56400b8..000000000000
--- a/python/tvm/relay/testing/layers.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Simple Layer DSL wrapper to ease creation of neural nets."""
-from tvm import relay
-
-
-def batch_norm_infer(data, gamma=None, beta=None, moving_mean=None, moving_var=None, **kwargs):
-    """Wrapper of batch_norm.
-
-    This function automatically creates weights and return
-    the first output(normalized result).
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input expression.
-
-    gamma : relay.Expr
-        The gamma scale factor.
-
-    beta : relay.Expr
-        The beta offset factor.
-
-    moving_mean : relay.Expr
-        Running mean of input,
-
-    moving_var : relay.Expr
-        Running variance of input.
-
-    kwargs : dict
-        Additional arguments.
-
-    Returns
-    -------
-    result : relay.Expr
-        The result.
-    """
-    name = kwargs.get("name")
-    kwargs.pop("name")
-    if not gamma:
-        gamma = relay.var(name + "_gamma")
-    if not beta:
-        beta = relay.var(name + "_beta")
-    if not moving_mean:
-        moving_mean = relay.var(name + "_moving_mean")
-    if not moving_var:
-        moving_var = relay.var(name + "_moving_var")
-    return relay.nn.batch_norm(
-        data, gamma=gamma, beta=beta, moving_mean=moving_mean, moving_var=moving_var, **kwargs
-    )[0]
-
-
-def conv2d(data, weight=None, **kwargs):
-    """Wrapper of conv2d which automatically creates weights if not given.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input expression.
-
-    weight : relay.Expr
-        The weight to conv2d.
-
-    kwargs : dict
-        Additional arguments.
-
-    Returns
-    -------
-    result : relay.Expr
-        The result.
-    """
-    name = kwargs.get("name")
-    kwargs.pop("name")
-    if not weight:
-        weight = relay.var(name + "_weight")
-    return relay.nn.conv2d(data, weight, **kwargs)
-
-
-def conv3d(data, weight=None, **kwargs):
-    """Wrapper of conv3d which automatically creates weights if not given.
-    Parameters
-    ----------
-    data : relay.Expr
-        The input expression.
-    weight : relay.Expr
-        The weight to conv3d.
-    kwargs : dict
-        Additional arguments.
-    Returns
-    -------
-    result : relay.Expr
-        The result.
-    """
-    name = kwargs.get("name")
-    kwargs.pop("name")
-    if not weight:
-        weight = relay.var(name + "_weight")
-    return relay.nn.conv3d(data, weight, **kwargs)
-
-
-def conv2d_transpose(data, weight=None, **kwargs):
-    """Wrapper of conv2d_transpose which automatically creates weights if not given.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input expression.
-
-    weight : relay.Expr
-        The weight to conv2d_transpose.
-
-    kwargs : dict
-        Additional arguments.
-
-    Returns
-    -------
-    result : relay.Expr
-        The result.
-    """
-    name = kwargs.get("name")
-    kwargs.pop("name")
-    if not weight:
-        weight = relay.var(name + "_weight")
-    return relay.nn.conv2d_transpose(data, weight, **kwargs)
-
-
-def dense_add_bias(data, weight=None, bias=None, units=None, **kwargs):
-    """Wrapper of dense which automatically creates weights if not given.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input expression.
-
-    weight : relay.Expr
-        The weight to conv2d.
-
-    bias : relay.Expr
-        The bias.
-
-    kwargs : dict
-        Additional arguments.
-
-    Returns
-    -------
-    result : relay.Expr
-        The result.
-    """
-    name = kwargs.get("name")
-    kwargs.pop("name")
-    if not weight:
-        weight = relay.var(name + "_weight")
-    if not bias:
-        bias = relay.var(name + "_bias")
-    data = relay.nn.dense(data, weight, units, **kwargs)
-    data = relay.nn.bias_add(data, bias, axis=-1)
-    return data
-
-
-def conv_kernel_layout(data_layout, is_depthwise=False):
-    """Map the data layout to corresponding kernel layout.
-
-    Arbitrary layout is not fully supported in TOPI yet.
-
-    Parameters
-    ----------
-    data_layout : str
-        The data_layout, can be 'NCHW', 'NHWC'.
-
-    is_depthwise : bool, optional
-        Whether the conv is a depthwise convolution.
-
-    Returns
-    -------
-    result : str
-        The corresponding kernel layout.
-    """
-    conv_layout_map = {"NCHW": "OIHW", "NHWC": "HWIO"}
-    depthwise_conv_layout_map = {"NCHW": "OIHW", "NHWC": "HWOI"}
-    mapping = depthwise_conv_layout_map if is_depthwise else conv_layout_map
-    assert data_layout in mapping, f"Unknown data layout {data_layout}"
-    return mapping[data_layout]
diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py
deleted file mode 100644
index bf054592b0a9..000000000000
--- a/python/tvm/relay/testing/lstm.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Implementation of a Long Short-Term Memory (LSTM) cell.
-
-Adapted from:
-https://gist.github.com/merrymercy/5eb24e3b019f84200645bd001e9caae9
-"""
-
-from tvm import relay
-from . import layers
-from .init import create_workload
-
-
-def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
-    """Long-Short Term Memory (LSTM) network cell.
-
-    Parameters
-    ----------
-    num_hidden : int
-        Number of units in output symbol.
-
-    batch_size : int
-        Batch size (length of states).
-
-    Returns
-    -------
-    result : tvm.relay.Function
-        A Relay function that evaluates an LSTM cell.
-        The function takes in a tensor of input data, a tuple of two
-        states, and weights and biases for dense operations on the
-        inputs and on the state. It returns a tuple with two members,
-        an output tensor and a tuple of two new states.
-    """
-    builder = relay.ScopeBuilder()
-
-    input_type = relay.TensorType((batch_size, num_hidden), dtype)
-    weight_type = relay.TensorType((4 * num_hidden, num_hidden), dtype)
-    bias_type = relay.TensorType((4 * num_hidden,), dtype)
-
-    dense_type = relay.TensorType((batch_size, 4 * num_hidden), dtype)
-    slice_type = relay.TupleType([input_type, input_type, input_type, input_type])
-    ret_type = relay.TupleType([input_type, relay.TupleType([input_type, input_type])])
-
-    inputs = relay.Var("inputs", input_type)
-    states = relay.Var("states", relay.TupleType([input_type, input_type]))
-
-    i2h_weight = relay.Var("i2h_weight", weight_type)
-    i2h_bias = relay.Var("i2h_bias", bias_type)
-
-    h2h_weight = relay.Var("h2h_weight", weight_type)
-    h2h_bias = relay.Var("h2h_bias", bias_type)
-
-    i2h = builder.let(
-        ("i2h", dense_type),
-        layers.dense_add_bias(
-            data=inputs, units=num_hidden * 4, weight=i2h_weight, bias=i2h_bias, name=f"{name}i2h"
-        ),
-    )
-    h2h = builder.let(
-        ("h2h", dense_type),
-        layers.dense_add_bias(
-            data=relay.TupleGetItem(states, 0),
-            units=num_hidden * 4,
-            weight=h2h_weight,
-            bias=h2h_bias,
-            name=f"{name}h2h",
-        ),
-    )
-
-    gates = builder.let(("gates", dense_type), relay.add(i2h, h2h))
-    slice_gates = builder.let(
-        ("slice_gates", slice_type), relay.split(gates, indices_or_sections=4, axis=1).astuple()
-    )
-
-    in_gate = builder.let(
-        ("in_gate", input_type), relay.sigmoid(relay.TupleGetItem(slice_gates, 0))
-    )
-    forget_gate = builder.let(
-        ("forget_gate", input_type), relay.sigmoid(relay.TupleGetItem(slice_gates, 1))
-    )
-    in_transform = builder.let(
-        ("in_transform", input_type), relay.tanh(relay.TupleGetItem(slice_gates, 2))
-    )
-    out_gate = builder.let(
-        ("out_gate", input_type), relay.sigmoid(relay.TupleGetItem(slice_gates, 3))
-    )
-
-    next_c = builder.let(
-        ("next_c", input_type),
-        relay.add(
-            relay.multiply(forget_gate, relay.TupleGetItem(states, 1)),
-            relay.multiply(in_gate, in_transform),
-        ),
-    )
-    next_h = builder.let(("next_h", input_type), relay.multiply(out_gate, relay.tanh(next_c)))
-    ret = builder.let(("ret", ret_type), relay.Tuple([next_h, relay.Tuple([next_h, next_c])]))
-    builder.ret(ret)
-
-    body = builder.get()
-
-    return relay.Function(
-        [inputs, states, i2h_weight, i2h_bias, h2h_weight, h2h_bias], body, ret_type
-    )
-
-
-def get_net(iterations, num_hidden, batch_size=1, dtype="float32"):
-    """Constructs an unrolled RNN with LSTM cells"""
-    input_type = relay.TensorType((batch_size, num_hidden), dtype)
-    weight_type = relay.TensorType((4 * num_hidden, num_hidden), dtype)
-    bias_type = relay.TensorType((4 * num_hidden,), dtype)
-
-    state_type = relay.TupleType([input_type, input_type])
-    cell_type = relay.TupleType([input_type, state_type])
-
-    builder = relay.ScopeBuilder()
-
-    zeros = builder.let(("zeros", input_type), relay.zeros((batch_size, num_hidden), dtype))
-    init_states = builder.let(("init_states", state_type), relay.Tuple([zeros, zeros]))
-
-    states = init_states
-    out = None
-
-    for i in range(iterations):
-        inputs = relay.Var("data", input_type)
-        i2h_weight = relay.Var(f"i2h_{i}_weight", weight_type)
-        i2h_bias = relay.Var(f"i2h_{i}_bias", bias_type)
-        h2h_weight = relay.Var(f"h2h_{i}_weight", weight_type)
-        h2h_bias = relay.Var(f"h2h_{i}_bias", bias_type)
-
-        cell_fn = lstm_cell(num_hidden, batch_size, dtype, f"lstm_{i}")
-
-        call = builder.let(
-            (f"call_{i}", cell_type),
-            relay.Call(cell_fn, [inputs, states, i2h_weight, i2h_bias, h2h_weight, h2h_bias]),
-        )
-        new_out = builder.let((f"out_{i}", input_type), relay.TupleGetItem(call, 0))
-        new_states = builder.let((f"states_{i}", state_type), relay.TupleGetItem(call, 1))
-        states = new_states
-        out = new_out
-
-    builder.ret(out)
-    body = builder.get()
-    args = relay.analysis.free_vars(body)
-    return relay.Function(args, body, input_type)
-
-
-def get_workload(iterations, num_hidden, batch_size=1, dtype="float32"):
-    """Get benchmark workload for an LSTM RNN.
-
-    Parameters
-    ----------
-    iterations : int
-        The number of iterations in the desired LSTM RNN.
-    num_hidden : int
-        The size of the hiddxen state
-    batch_size : int, optional (default 1)
-        The batch size used in the model
-    dtype : str, optional (default "float32")
-        The data type
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a LSTM network.
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(iterations, num_hidden, batch_size, dtype)
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/mlp.py b/python/tvm/relay/testing/mlp.py
deleted file mode 100644
index ac2d4224660c..000000000000
--- a/python/tvm/relay/testing/mlp.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-a simple multilayer perceptron
-"""
-from __future__ import absolute_import
-from tvm import relay
-from .init import create_workload
-
-
-def get_net(batch_size, num_classes=10, image_shape=(1, 28, 28), dtype="float32"):
-    """Get network a simple multilayer perceptron.
-
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of claseses
-
-    image_shape : tuple, optional
-        The input image shape
-
-    dtype : str, optional
-        The data type
-
-    Returns
-    -------
-    net : relay.Function
-        The dataflow.
-    """
-    data_shape = (batch_size,) + image_shape
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    data = relay.nn.batch_flatten(data)
-    fc1 = relay.nn.dense(data, relay.var("fc1_weight"), units=128)
-    fc1 = relay.nn.bias_add(fc1, relay.var("fc1_bias"), axis=-1)
-    act1 = relay.nn.relu(fc1)
-    fc2 = relay.nn.dense(act1, relay.var("fc2_weight"), units=64)
-    fc2 = relay.nn.bias_add(fc2, relay.var("fc2_bias"), axis=-1)
-    act2 = relay.nn.relu(fc2)
-    fc3 = relay.nn.dense(act2, relay.var("fc3_weight"), units=num_classes)
-    fc3 = relay.nn.bias_add(fc3, relay.var("fc3_bias"), axis=-1)
-    mlp = relay.nn.softmax(data=fc3)
-    args = relay.analysis.free_vars(mlp)
-    return relay.Function(args, mlp)
-
-
-def get_workload(batch_size, num_classes=10, image_shape=(1, 28, 28), dtype="float32"):
-    """Get benchmark workload for a simple multilayer perceptron.
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of claseses
-
-    image_shape : tuple, optional
-        The input image shape
-
-    dtype : str, optional
-        The data type
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a mlp network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(batch_size, num_classes, image_shape, dtype)
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py
deleted file mode 100644
index 4c600966d24a..000000000000
--- a/python/tvm/relay/testing/mobilenet.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Port of NNVM version of MobileNet to Relay.
-"""
-# pylint: disable=invalid-name
-
-from tvm import relay
-from . import layers
-from .init import create_workload
-
-
-def conv_block(
-    data,
-    name,
-    channels,
-    kernel_size=(3, 3),
-    strides=(1, 1),
-    padding=(1, 1),
-    epsilon=1e-5,
-    layout="NCHW",
-):
-    """Helper function to construct conv_bn-relu"""
-    # convolution + bn + relu
-    conv = layers.conv2d(
-        data=data,
-        channels=channels,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_layout=layout,
-        kernel_layout=layers.conv_kernel_layout(layout),
-        name=name + "_conv",
-    )
-    bn = layers.batch_norm_infer(data=conv, epsilon=epsilon, name=name + "_bn")
-    act = relay.nn.relu(data=bn)
-    return act
-
-
-def separable_conv_block(
-    data,
-    name,
-    depthwise_channels,
-    pointwise_channels,
-    kernel_size=(3, 3),
-    downsample=False,
-    padding=(1, 1),
-    epsilon=1e-5,
-    layout="NCHW",
-    dtype="float32",
-):
-    """Helper function to get a separable conv block"""
-    if downsample:
-        strides = (2, 2)
-    else:
-        strides = (1, 1)
-
-    # depthwise convolution + bn + relu
-    if layout == "NCHW":
-        wshape = (depthwise_channels, 1) + kernel_size
-    elif layout == "NHWC":
-        wshape = kernel_size + (depthwise_channels, 1)
-    else:
-        raise ValueError("Invalid layout: " + layout)
-    bn_axis = layout.index("C")
-    weight = relay.var(name + "_weight", shape=wshape, dtype=dtype)
-    conv1 = layers.conv2d(
-        data=data,
-        weight=weight,
-        channels=depthwise_channels,
-        groups=depthwise_channels,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_layout=layout,
-        kernel_layout=layers.conv_kernel_layout(layout, True),
-        name=name + "_depthwise_conv1",
-    )
-    bn1 = layers.batch_norm_infer(data=conv1, epsilon=epsilon, axis=bn_axis, name=name + "_bn1")
-    act1 = relay.nn.relu(data=bn1)
-    # pointwise convolution + bn + relu
-    conv2 = layers.conv2d(
-        data=act1,
-        channels=pointwise_channels,
-        kernel_size=(1, 1),
-        strides=(1, 1),
-        padding=(0, 0),
-        data_layout=layout,
-        kernel_layout=layers.conv_kernel_layout(layout),
-        name=name + "_conv2",
-    )
-    bn2 = layers.batch_norm_infer(data=conv2, epsilon=epsilon, axis=bn_axis, name=name + "_bn2")
-    act2 = relay.nn.relu(data=bn2)
-    return act2
-
-
-def mobile_net(
-    num_classes=1000,
-    data_shape=(1, 3, 224, 224),
-    dtype="float32",
-    alpha=1.0,
-    is_shallow=False,
-    layout="NCHW",
-):
-    """Function to construct a MobileNet"""
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    body = conv_block(data, "conv_block_1", int(32 * alpha), strides=(2, 2), layout=layout)
-    body = separable_conv_block(
-        body, "separable_conv_block_1", int(32 * alpha), int(64 * alpha), layout=layout, dtype=dtype
-    )
-    body = separable_conv_block(
-        body,
-        "separable_conv_block_2",
-        int(64 * alpha),
-        int(128 * alpha),
-        downsample=True,
-        layout=layout,
-        dtype=dtype,
-    )
-    body = separable_conv_block(
-        body,
-        "separable_conv_block_3",
-        int(128 * alpha),
-        int(128 * alpha),
-        layout=layout,
-        dtype=dtype,
-    )
-    body = separable_conv_block(
-        body,
-        "separable_conv_block_4",
-        int(128 * alpha),
-        int(256 * alpha),
-        downsample=True,
-        layout=layout,
-        dtype=dtype,
-    )
-    body = separable_conv_block(
-        body,
-        "separable_conv_block_5",
-        int(256 * alpha),
-        int(256 * alpha),
-        layout=layout,
-        dtype=dtype,
-    )
-    body = separable_conv_block(
-        body,
-        "separable_conv_block_6",
-        int(256 * alpha),
-        int(512 * alpha),
-        downsample=True,
-        layout=layout,
-        dtype=dtype,
-    )
-    if is_shallow:
-        body = separable_conv_block(
-            body,
-            "separable_conv_block_7",
-            int(512 * alpha),
-            int(1024 * alpha),
-            downsample=True,
-            layout=layout,
-            dtype=dtype,
-        )
-        body = separable_conv_block(
-            body,
-            "separable_conv_block_8",
-            int(1024 * alpha),
-            int(1024 * alpha),
-            downsample=True,
-            layout=layout,
-            dtype=dtype,
-        )
-    else:
-        for i in range(7, 12):
-            body = separable_conv_block(
-                body,
-                f"separable_conv_block_{i}",
-                int(512 * alpha),
-                int(512 * alpha),
-                layout=layout,
-                dtype=dtype,
-            )
-        body = separable_conv_block(
-            body,
-            "separable_conv_block_12",
-            int(512 * alpha),
-            int(1024 * alpha),
-            downsample=True,
-            layout=layout,
-            dtype=dtype,
-        )
-        body = separable_conv_block(
-            body,
-            "separable_conv_block_13",
-            int(1024 * alpha),
-            int(1024 * alpha),
-            layout=layout,
-            dtype=dtype,
-        )
-    pool = relay.nn.global_avg_pool2d(data=body, layout=layout)
-    flatten = relay.nn.batch_flatten(data=pool)
-    weight = relay.var("fc_weight")
-    bias = relay.var("fc_bias")
-    fc = relay.nn.dense(data=flatten, weight=weight, units=num_classes)
-    fc = relay.nn.bias_add(fc, bias)
-    softmax = relay.nn.softmax(data=fc)
-    return relay.Function(relay.analysis.free_vars(softmax), softmax)
-
-
-def get_workload(
-    batch_size=1, num_classes=1000, image_shape=(3, 224, 224), dtype="float32", layout="NCHW"
-):
-    """Get benchmark workload for mobilenet
-
-    Parameters
-    ----------
-    batch_size : int, optional
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of classes
-
-    image_shape : tuple, optional
-        The input image shape, cooperate with layout
-
-    dtype : str, optional
-        The data type
-
-    layout : str, optional
-        The data layout of image_shape and the operators
-        cooperate with image_shape
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a MobileNet network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    data_shape = tuple([batch_size] + list(image_shape))
-    net = mobile_net(
-        num_classes=num_classes,
-        data_shape=data_shape,
-        dtype=dtype,
-        alpha=1.0,
-        is_shallow=False,
-        layout=layout,
-    )
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/nat.py b/python/tvm/relay/testing/nat.py
deleted file mode 100644
index 914a7ffdde74..000000000000
--- a/python/tvm/relay/testing/nat.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Defines a unary natural number (Peano natural number) abstract
-data type for Relay and provides some utility functions for it.
-Nats are useful for testing purposes, as they make it easy to write
-test cases for recursion and pattern matching."""
-
-from tvm.relay.backend.interpreter import ConstructorValue
-
-
-def get_type(prelude, name):
-    ty_var = prelude.mod.get_global_type_var(name)
-    ty_data = prelude.mod.type_definitions[ty_var]
-    return tuple([ty_var] + list(ty_data.constructors))
-
-
-def count(prelude, n):
-    """Takes a ConstructorValue corresponding to a nat ADT
-    and converts it into a Python integer. This is an example of
-    using an ADT value in Python.
-    """
-    assert isinstance(n, ConstructorValue)
-    _, z, s = prelude.mod.get_type("nat")
-    if n.tag == z.tag:
-        return 0
-    assert n.tag == s.tag
-    return 1 + count(prelude, n.fields[0])
-
-
-def make_nat_value(prelude, n):
-    """The inverse of count(): Given a non-negative Python integer,
-    constructs a ConstructorValue representing that value as a nat.
-    """
-    _, z, s = prelude.mod.get_type("nat")
-    if n == 0:
-        return ConstructorValue(z.tag, [], z)
-    return ConstructorValue(s.tag, [make_nat_value(prelude, n - 1)], s)
-
-
-def make_nat_expr(prelude, n):
-    """Given a non-negative Python integer, constructs a Python
-    expression representing that integer's value as a nat.
-    """
-    assert n >= 0
-    _, z, s = prelude.mod.get_type("nat")
-    ret = z()
-    while n > 0:
-        ret = s(ret)
-        n = n - 1
-    return ret
diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py
deleted file mode 100644
index 8e2cbe10822c..000000000000
--- a/python/tvm/relay/testing/py_converter.py
+++ /dev/null
@@ -1,656 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return
-"""Utility for converting Relay code into a Python script with equivalent semantics"""
-import sys
-import ast
-from ast import alias, Assign, Load, Name, NameConstant, Num, Return, Store, Str
-import re
-
-import tvm
-from tvm import relay
-from tvm.relay.adt import Pattern
-from tvm.relay.backend import te_compiler
-from tvm.relay.expr import Expr, GlobalVar, Var
-from tvm.relay.function import Function
-from tvm.relay.expr_functor import ExprFunctor
-
-__MAJOR__, __MINOR__, _, _, _ = sys.version_info
-
-OUTPUT_VAR_NAME = "_py_out"
-
-# corresponds to:
-#     import numpy
-#     import tvm
-#     from tvm import relay
-#     from tvm import nd
-#     from tvm.runtime import container as _container
-#     from tvm.relay.backend.interpreter import RefValue, ConstructorValue
-PROLOGUE = [
-    ast.Import([alias("numpy", None)]),
-    ast.Import([alias("tvm", None)]),
-    ast.ImportFrom("tvm", [alias("relay", None)], 0),
-    ast.ImportFrom("tvm", [alias("nd", None)], 0),
-    ast.ImportFrom("tvm.runtime", [alias("container", "_container")], 0),
-    ast.ImportFrom(
-        "tvm.relay.backend.interpreter",
-        [alias("RefValue", None), alias("ConstructorValue", None)],
-        0,
-    ),
-]
-
-
-class PythonConverter(ExprFunctor):
-    """Functor for translating Relay programs into Python ASTs."""
-
-    def __init__(self, mod, target) -> None:
-        super().__init__()
-        self.mod = mod
-        self.tgt = target if isinstance(target, tvm.target.Target) else tvm.target.Target(target)
-        self.tec = te_compiler.get()
-        self.fun_no = 0
-        self.var_no = 0
-        self.var_map = {}
-
-    def convert(self, prog: Expr):
-        """This method converts the passed Relay expression into a Python
-        AST object with equivalent semantics.
-
-        The Python AST can be executed using exec(); it can be turned
-        into text and inspected using astor.
-        """
-        optimized = self.optimize(prog)
-
-        # start with conversion prelude (imports) and convert global defs
-        body = []
-        body += PROLOGUE
-        body += self.convert_module()
-
-        prog_body, extra_defs = self.visit(optimized)
-        body += extra_defs
-
-        # we finally must assign the final expression to the output var
-        # so it can be read after running EXEC
-        body.append(Assign([Name(OUTPUT_VAR_NAME, Store())], prog_body))
-        global __MAJOR__, __MINOR__
-
-        if __MAJOR__ == 3 and __MINOR__ >= 8:
-            return ast.fix_missing_locations(ast.Module(body=body, type_ignores=[]))
-        else:
-            return ast.fix_missing_locations(ast.Module(body=body))
-
-    def optimize(self, prog: Expr):
-        """Performs optimizations necessary to be able to generate code for prog."""
-        # unwrap tuple wrappers (some op calls produce them)
-        unwrapped = prog.astuple() if isinstance(prog, relay.TupleWrapper) else prog
-        assert relay.analysis.well_formed(unwrapped)
-        # For a lone global var, there is nothing we need to do
-        if isinstance(unwrapped, relay.GlobalVar):
-            return unwrapped
-
-        # main might be in the mod already and from_expr will not override it if it's there,
-        # so we need a new name
-        target_name = self.generate_function_name("target")
-
-        wrapped = unwrapped
-        if not isinstance(unwrapped, relay.Function):
-            wrapped = relay.Function(relay.analysis.free_vars(unwrapped), unwrapped)
-
-        # easiest way to make a deep copy -- note that main will not be overridden if it's present
-        copy_mod = tvm.IRModule.from_expr(
-            relay.Tuple([]), self.mod.functions, self.mod.type_definitions
-        )
-        copy_mod[target_name] = wrapped
-
-        # necessary pass: SimplifyInference (otherwise we can't generate code for some operators)
-        # and fusion (to get primitive functions)
-        opts = tvm.transform.Sequential(
-            [relay.transform.SimplifyInference(), relay.transform.FuseOps(fuse_opt_level=0)]
-        )
-        copy_mod = opts(copy_mod)
-        optimized = copy_mod[target_name]
-        return optimized if isinstance(unwrapped, Function) else optimized.body
-
-    def sanitize(self, name: str) -> str:
-        """Removes any invalid characters (only underscores, numbers, and letters permitted)
-        from the given name. Since we append a number and underscore to var names anyway,
-        it doesn't matter if the name is the empty string."""
-        return re.sub(r"\W", "", name)
-
-    def generate_var_name(self, name_hint: str) -> str:
-        """Generates a unique variable name starting from the hint."""
-        name = f"{self.sanitize(name_hint)}_var_{self.var_no}"
-        self.var_no += 1
-        return name
-
-    def generate_function_name(self, name_hint: str) -> str:
-        """Generates a unique function name starting from the hint."""
-        name = f"{self.sanitize(name_hint)}_fun_{self.fun_no}"
-        self.fun_no += 1
-        return name
-
-    def get_var_name(self, var: Expr) -> str:
-        """Returns the var name for the given Realy variable."""
-        if var in self.var_map:
-            return self.var_map[var]
-        name = self.generate_var_name(var.name_hint)
-        self.var_map[var] = name
-        return name
-
-    def include_var(self, var: Expr, assign=False):
-        """Returns a variable AST node for the given Relay var depending on
-        whether it must appear in an assignment or not."""
-        name = self.get_var_name(var)
-        return Name(name, Store() if assign else Load())
-
-    def parse_name(self, name: str):
-        """Given the name of a Python method with dots (e.g., 'relay.var'),
-        returns an appropriate AST object corresponding to that name."""
-        attributes = name.split(".")
-        ret = Name(attributes[0], Load())
-        for i in range(len(attributes) - 1):
-            ret = ast.Attribute(ret, attributes[i + 1], Load())
-        return ret
-
-    def parse_numpy_array(self, arr):
-        """Given a Numpy array, produces an appropriate Python array
-        or numerical literal representing its contents."""
-
-        def parse_single(i):
-            return NameConstant(i) if isinstance(i, bool) else Num(i)
-
-        if arr.ndim == 0:
-            return parse_single(arr.item())
-        if arr.ndim == 1:
-            return ast.List([parse_single(i.item()) for i in arr], Load())
-
-        elts = []
-        for row in arr:
-            elts.append(self.parse_numpy_array(row))
-        return ast.List(elts, Load())
-
-    def convert_fields(self, fields: [Expr]):
-        """Given a list of call args or tuple fields, converts
-        each and returns their ASTs and their defs lists (in order)."""
-        bodies = []
-        defs = []
-        for field in fields:
-            member_body, member_defs = self.visit(field)
-            bodies.append(member_body)
-            defs += member_defs
-        return (bodies, defs)
-
-    def convert_to_thunk(self, name_hint: str, expr: Expr):
-        """Wraps the passed expression in a thunk."""
-        body, defs = self.visit(expr)
-        thunk_name = self.generate_function_name(name_hint)
-        thunk = self.create_def(thunk_name, [], defs + [Return(body)])
-        return (thunk, thunk_name)
-
-    def convert_func_node(self, func: Function, name_var=None):
-        """Converts the given Relay function into a Python function, with
-        special for named functions (locally or globally)"""
-        if name_var is None:
-            func_name = self.generate_function_name("_anon_func")
-        if isinstance(name_var, GlobalVar):
-            func_name = str(name_var.name_hint)
-        if isinstance(name_var, Var):
-            func_name = self.get_var_name(name_var)
-
-        var_names = [self.get_var_name(var) for var in func.params]
-        body, defs = self.visit(func.body)
-        ret = self.create_def(func_name, var_names, defs + [Return(body)], register_packed=True)
-        return (ret, func_name)
-
-    def convert_module(self):
-        """Converts all the global functions defined in the module and returns
-        them as a list of definitions"""
-        defs = []
-        for var, func in self.mod.functions.items():
-            # optimize the definition so any operators used are lowered
-            opt_func = self.optimize(func)
-            try:
-                converted_func, _ = self.convert_func_node(opt_func, var)
-                defs.append(converted_func)
-            except TypeError:
-                # TODO(wweic): fix conversion for Any
-                pass
-        return defs
-
-    def create_call(self, func_name: str, arguments):
-        """Creates a simple function call."""
-        return ast.Call(self.parse_name(func_name), arguments, [])
-
-    def create_def(self, func_name: str, arguments: [str], body, register_packed: bool = False):
-        """
-        Wrapper over function definition AST node, whose constructor is inconvenient.
-
-        register_packed includes a tvm.register_func decorator on the generated function if true.
-        This option should be used for Relay functions (warning: clobbers registry!)
-        """
-        inner_args = [ast.arg(argument, None) for argument in arguments]
-
-        # add a decorator to register as a PackedFunc so the function will be an ObjectRef
-        # and will allow for putting functions into tuples or refs
-        decorator_list = [
-            ast.Call(
-                self.parse_name("tvm.register_func"),
-                [ast.Constant(value=func_name)],
-                [ast.keyword(arg="override", value=ast.Constant(value=True))],
-            )
-        ]
-
-        global __MAJOR__, __MINOR__
-        if __MAJOR__ == 3 and __MINOR__ >= 8:
-            arguments = ast.arguments([], inner_args, None, [], [], None, [])
-        else:
-            arguments = ast.arguments(inner_args, None, [], [], None, [])
-
-        return ast.FunctionDef(
-            func_name, arguments, body, decorator_list if register_packed else [], None
-        )
-
-    def create_tuple(self, fields):
-        """
-        Given the ASTs for tuple fields, produce an AST that creates a
-        tuple value with those fields
-        """
-        # Use the FFI API directly so that PackedFuncs will be correctly converted to ObjectRef.
-        # Using tvm.runtime.container.tuple_object fails to convert PackedFuncs in Python
-        return self.create_call("_container._ffi_api.Tuple", fields)
-
-    def create_op_call(self, op: Function, relay_args, py_args):
-        """Lowers the passed primitive function, registers it in TVM's
-        global compiler, and produces a call to the lowered function in
-        the generated Python code."""
-
-        # compile the function and register globally
-        cc_key = te_compiler.CCacheKey(op, self.tgt)
-        func_hash = tvm.ir.structural_hash(op)
-        op_name = f"_lowered_op_{func_hash}"
-        if not tvm.get_global_func(op_name, allow_missing=True):
-            jitted = self.tec.jit(cc_key, self.tgt)
-            tvm.register_func(op_name, jitted)
-
-        def convert_input(py_input, arg_type):
-            """Use the types of the function arguments to determine whether we expect
-            a tensor or tuple (returns list of inputs to the lowered op call)"""
-            # equivalent: input.data
-            if isinstance(arg_type, relay.TensorType):
-                return [py_input]
-            assert isinstance(arg_type, relay.TupleType)
-            # convert each input.fields[i]
-            ret = []
-            for i in range(len(arg_type.fields)):
-                ret += convert_input(
-                    ast.Subscript(py_input, ast.Index(Num(i)), Load()), arg_type.fields[i]
-                )
-            return ret
-
-        def convert_output(ret_type):
-            """Use the function return type to produce auxiliary variables to store outputs.
-            Returns ([assignments of output vars], [extra arguments to pass to op call],
-            expression collecting output)"""
-            if isinstance(ret_type, relay.TensorType):
-                output_var_name = self.generate_var_name("_out")
-                output_var = Name(output_var_name, Load())
-                shape = ast.Tuple([Num(dim) for dim in ret_type.concrete_shape], Load())
-                # create a new NDArray of the right shape and dtype
-                assign_output = Assign(
-                    [Name(output_var_name, Store())],
-                    self.create_call(
-                        "nd.array", [self.create_call("numpy.empty", [shape, Str(ret_type.dtype)])]
-                    ),
-                )
-                return ([assign_output], [output_var], output_var)
-            assert isinstance(ret_type, relay.TupleType)
-            assignments = []
-            extra_args = []
-            fields = []
-            for t in ret_type.fields:
-                inner_assignments, inner_args, inner_output = convert_output(t)
-                assignments += inner_assignments
-                extra_args += inner_args
-                fields.append(inner_output)
-            return (assignments, extra_args, self.create_tuple(fields))
-
-        # create a function to wrap the call of the lowered op and return
-        # a call to that function
-        wrap_name = self.generate_function_name(f"_{op_name}_wrapper")
-        wrap_args = [self.generate_var_name(f"_arg_{i}") for i in range(len(py_args))]
-
-        inner_call_args = []
-        for i in range(len(py_args)):
-            inner_call_args += convert_input(Name(wrap_args[i], Load()), relay_args[i].checked_type)
-        output_assignments, aux_args, output = convert_output(op.checked_type.ret_type)
-        # equiv: _op = tvm.get_global_func(op_name)
-        op_var = self.generate_var_name("_op")
-        op_call = self.create_call("tvm.get_global_func", [Str(op_name)])
-        op_assign = Assign([Name(op_var, Store())], op_call)
-        # equiv: _op(args)
-        inner_call = self.create_call(op_var, inner_call_args + aux_args)
-        body = output_assignments + [op_assign, ast.Expr(inner_call), Return(output)]
-        wrap_def = self.create_def(wrap_name, wrap_args, body)
-        return wrap_def, self.create_call(wrap_name, py_args)
-
-    def create_match_check(self, pattern: Pattern, data):
-        """Given an ADT match pattern and a (Python) expression pointing to
-        an ADT value, this generates a Python expression that checks if the
-        ADT value matches the given pattern (returning True or False)."""
-
-        # wildcard or var match everything
-        if isinstance(pattern, (relay.PatternWildcard, relay.PatternVar)):
-            return NameConstant(True)
-
-        conds = []
-
-        if isinstance(pattern, relay.PatternConstructor):
-            # constructor patterns check whether the constructors match
-            # and also the matches of any nested patterns
-
-            # equiv: (arg.tag == patern_constructor.tag)
-            conds.append(
-                ast.Compare(
-                    ast.Attribute(data, "tag", Load()),
-                    [ast.Eq()],
-                    [ast.Num(pattern.constructor.tag)],
-                )
-            )
-
-        assert isinstance(pattern, (relay.PatternConstructor, relay.PatternTuple))
-        # now check for any nested patterns
-        for i in range(len(pattern.patterns)):
-            nested_pat = pattern.patterns[i]
-            # can safely skip var or wildcard patterns: they will
-            # never cause a check to fail
-            if not isinstance(nested_pat, relay.PatternConstructor):
-                continue
-
-            # index into the value corresponding to the subpattern
-            field_index = ast.Subscript(
-                ast.Attribute(data, "fields", Load()), ast.Index(Num(i)), Load()
-            )
-            conds.append(self.create_match_check(nested_pat, field_index))
-
-        # if we do not need to check nested pattern, just return the single check
-        if len(conds) == 1:
-            return conds[0]
-        # otherwise AND together any nested checks
-        return ast.BoolOp(ast.And(), conds)
-
-    def create_match_clause_body(self, pattern: Pattern, body: Expr):
-        """Given a match clause pattern and a clause body,
-        generates a Python function that when called with an ADT
-        that matches the pattern, returns the result of evaluating
-        the clause body. This function returns a function definition
-        and the name of the generated function."""
-
-        def collect_var_assignments(pat, val):
-            """This helper function ensures that the pattern is used to
-            properly assign all subfields of the given AST for use
-            in the clause body
-
-            E.g., for PatternConstructor(A, PatternVar(v), PatternWildcard(),
-            PatternConstructor(B, PatternVar(w)))
-            we would want to have
-            v = a.fields[0]
-            w = a.fields[2].fields[0]
-            """
-            if isinstance(pat, relay.PatternWildcard):
-                return []
-            if isinstance(pat, relay.PatternVar):
-                return [Assign([self.include_var(pat.var, assign=True)], val)]
-            # constructor pattern: assign each field of the value
-            # based on subpatterns
-            assignments = []
-            for i in range(len(pat.patterns)):
-                # we want the assignments for val.fields[i]
-                field = ast.Subscript(
-                    ast.Attribute(val, "fields", Load()), ast.Index(Num(i)), Load()
-                )
-                assignments += collect_var_assignments(pat.patterns[i], field)
-            return assignments
-
-        func_name = self.generate_function_name("_match_clause_body")
-        arg_name = self.generate_var_name("_match_clause_body")
-
-        clause_body, defs = self.visit(body)
-        assignments = collect_var_assignments(pattern, Name(arg_name, Load()))
-
-        func_def = self.create_def(
-            func_name, [arg_name], defs + assignments + [Return(clause_body)]
-        )
-        return (func_def, func_name)
-
-    # Convention for the expr visitor: Each visit function returns a tuple of two members.
-    #
-    # The first is a Python AST comprised of a single *expression* that evaluates to an equivalent
-    # result to the desired Relay expression (and executes all effects in the right order).
-    #
-    # The second is a list of function definition *statements* defining thunks and other
-    # auxiliary functions needed in the translated AST object. The defs in the second object
-    # will always have unique names and will never perform any effects, so as long as they
-    # appear in the Python program before the first statement is executed, there should not
-    # be any problems.
-
-    def visit_var(self, var: Expr):
-        return (self.include_var(var, assign=False), [])
-
-    def visit_global_var(self, gvar: Expr):
-        # we don't need to add numbers to global var names because
-        # the *names* are checked for uniqueness in the mod
-        func_name = str(gvar.name_hint)
-        # load in the packed func
-        return (self.create_call("tvm.get_global_func", [ast.Constant(value=func_name)]), [])
-
-    def visit_let(self, letexp: Expr):
-        # To properly account for scoping and ensure that the entire node produces an expression,
-        # we translate the let binding as a function that we call with the value we intend to bind.
-        # Yes, this is somewhat ugly.
-        """
-        let var = value in body
-        =======================
-        def let_thunk(var):
-            return body
-        let_thunk(value)
-        """
-        bind_body, bind_defs = self.visit(letexp.body)
-
-        func_name = self.generate_function_name("_let_func")
-        binding_func = self.create_def(
-            func_name, [self.get_var_name(letexp.var)], bind_defs + [Return(bind_body)]
-        )
-
-        # we call the binding func with the intended value for the bound variable
-
-        # special case: if the value is a function literal, we must ensure it can be
-        # recursive by naming it after the var
-        if isinstance(letexp.value, Function):
-            value_def, value_name = self.convert_func_node(letexp.value, letexp.var)
-            return (
-                self.create_call(func_name, [Name(value_name, Load())]),
-                [value_def, binding_func],
-            )
-
-        value_body, value_defs = self.visit(letexp.value)
-        value_defs.append(binding_func)
-        binding_call = self.create_call(func_name, [value_body])
-        return (binding_call, value_defs)
-
-    def visit_tuple(self, tup: Expr):
-        fields, ret_defs = self.convert_fields(tup.fields)
-        return (self.create_tuple(fields), ret_defs)
-
-    def visit_tuple_getitem(self, tgi: Expr):
-        tup, tup_defs = self.visit(tgi.tuple_value)
-        ret = ast.Subscript(tup, ast.Index(Num(tgi.index)), Load())
-        return (ret, tup_defs)
-
-    def visit_if(self, if_block: Expr):
-        cond_body, cond_defs = self.visit(if_block.cond)
-        true_body, true_defs = self.visit(if_block.true_branch)
-        false_body, false_defs = self.visit(if_block.false_branch)
-
-        # need to get the value out of a NDArray to check the condition
-        # equvialent to: val.numpy()
-        cond_check = ast.Call(ast.Attribute(cond_body, "numpy", Load()), [], [])
-        ret = ast.IfExp(cond_check, true_body, false_body)
-        return (ret, cond_defs + true_defs + false_defs)
-
-    def visit_constant(self, constant: Expr):
-        """Proceeds by converting constant value to a numpy array
-        and converting it to the appropriate value in the generated
-        code (whether it be a Python scalar or a Numpy array)"""
-        value = constant.data.numpy()
-        const_expr = ast.Call(
-            ast.Attribute(Name("numpy", Load()), "array", Load()),
-            [self.parse_numpy_array(value)],
-            [ast.keyword("dtype", Str(constant.checked_type.dtype))],
-        )
-        return (self.create_call("nd.array", [const_expr]), [])
-
-    def visit_function(self, func: Expr):
-        # Python's lambdas are very restrictive, so we do "name" inline functions
-        converted_func, func_name = self.convert_func_node(func)
-        # load in the PackedFunc
-        return (
-            self.create_call("tvm.get_global_func", [ast.Constant(value=func_name)]),
-            [converted_func],
-        )
-
-    def visit_call(self, call: Expr):
-        """For calls, we must distinguish between ordinary functions,
-        operators, and constructor calls."""
-        func = call.op
-        fields, field_defs = self.convert_fields(call.args)
-
-        if isinstance(func, tvm.ir.Op):
-            raise Exception("Operators should have been lowered and eliminated")
-
-        if isinstance(func, relay.Constructor):
-            # produce a constructor value
-            return (
-                self.create_call(
-                    "ConstructorValue",
-                    [ast.Num(func.tag), ast.List(fields, Load()), NameConstant(None)],
-                ),
-                field_defs,
-            )
-
-        # lowered operator: generate a call to a function that gets the PackedFunc
-        # from TVM's registry
-        if (
-            isinstance(func, Function)
-            and hasattr(func.attrs, "Primitive")
-            and int(func.attrs.Primitive) == 1
-        ):
-            op_call_def, op_call = self.create_op_call(func, call.args, fields)
-            return (op_call, field_defs + [op_call_def])
-
-        # ordinary function
-        converted_func, defs = self.visit(func)
-        defs += field_defs
-        return (ast.Call(converted_func, fields, []), defs)
-
-    def visit_ref_create(self, ref: Expr):
-        val, defs = self.visit(ref.value)
-        return (self.create_call("RefValue", [val]), defs)
-
-    def visit_ref_read(self, read: Expr):
-        ref, defs = self.visit(read.ref)
-        return (ast.Attribute(ref, "value", Load()), defs)
-
-    def visit_ref_write(self, write: Expr):
-        """For writing refs, we wrap the update in a thunk
-        (returning an empty tuple to match Relay's semantics)
-        that we execute at the right time. This ensures such assignments
-        can be properly nested, since assignments are statements
-        in Python but expressions in Relay"""
-        ref, ref_defs = self.visit(write.ref)
-        val, val_defs = self.visit(write.value)
-        thunk_name = self.generate_function_name("_ref_write_thunk")
-        thunk = self.create_def(
-            thunk_name,
-            [],
-            ref_defs
-            + val_defs
-            + [Assign([ast.Attribute(ref, "value", Store())], val), Return(self.create_tuple([]))],
-        )
-        return (self.create_call(thunk_name, []), [thunk])
-
-    def visit_match(self, match: Expr):
-        """For matches, we wrap the entire expression in a thunk
-        because it is easiest to implement them using if statements.
-        For each clause, we generate a function that checks if the
-        pattern matches. If yes, we call a function that assigns
-        the variables appropriately and invokes the clause body."""
-        data, defs = self.visit(match.data)
-        data_var = self.generate_var_name("_match_data")
-
-        # must ensure the data clause is executed exactly once
-        thunk_body = [Assign([Name(data_var, Store())], data)]
-        for clause in match.clauses:
-            check_expr = self.create_match_check(clause.lhs, Name(data_var, Load()))
-            body_def, body_name = self.create_match_clause_body(clause.lhs, clause.rhs)
-            defs.append(body_def)
-
-            # equiv: if check(data): return body(data)
-            thunk_body.append(
-                ast.If(
-                    check_expr, [Return(self.create_call(body_name, [Name(data_var, Load())]))], []
-                )
-            )
-
-        # finally if nothing matches we have a failed assert (should never happen)
-        thunk_body.append(ast.Assert(NameConstant(False), Str("Match was not exhaustive")))
-
-        thunk_name = self.generate_function_name("_match_thunk")
-        thunk_def = self.create_def(thunk_name, [], defs + thunk_body)
-        return (self.create_call(thunk_name, []), [thunk_def])
-
-    # these are both handled in the "call" case
-    def visit_constructor(self, _):
-        pass
-
-    def visit_op(self, _):
-        pass
-
-
-def to_python(expr: Expr, mod=None, target=tvm.target.Target("llvm")):
-    """Converts the given Relay expression into a Python script (as a Python AST object).
-    For easiest debugging, import the astor package and use to_source()."""
-    mod = mod if mod is not None else tvm.IRModule()
-    mod = relay.transform.InferType()(mod)
-    converter = PythonConverter(mod, target)
-    python = converter.convert(expr)
-    assert python
-    return python
-
-
-def run_as_python(expr: Expr, mod=None, target=tvm.target.Target("llvm")):
-    """Converts the given Relay expression into a Python script and
-    executes it.
-
-    Note that closures will be returned as PackedFuncs
-    """
-    mod = mod if mod is not None else tvm.IRModule()
-    py_ast = to_python(expr, mod, target)
-    code = compile(py_ast, "<string>", "exec")
-    var_map = {OUTPUT_VAR_NAME: None}
-    # pylint: disable=exec-used
-    exec(code, var_map, var_map)
-    return var_map[OUTPUT_VAR_NAME]
diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py
deleted file mode 100644
index e1e4069f5412..000000000000
--- a/python/tvm/relay/testing/resnet.py
+++ /dev/null
@@ -1,386 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
-Original author Wei Wu
-
-Implemented the following paper:
-
-Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
-"""
-# pylint: disable=unused-argument
-from tvm import relay
-from .init import create_workload
-from . import layers
-
-
-def residual_unit(
-    data,
-    num_filter,
-    stride,
-    dim_match,
-    name,
-    bottle_neck=True,
-    data_layout="NCHW",
-    kernel_layout="IOHW",
-):
-    """Return ResNet Unit symbol for building ResNet
-
-    Parameters
-    ----------
-    data : str
-        Input data
-
-    num_filter : int
-        Number of output channels
-
-    bnf : int
-        Bottle neck channels factor with regard to num_filter
-
-    stride : tuple
-        Stride used in convolution
-
-    dim_match : bool
-        True means channel number between input and output is the same,
-        otherwise means differ
-
-    name : str
-        Base name of the operators
-    """
-    bn_axis = data_layout.index("C")
-    if bottle_neck:
-        bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, axis=bn_axis, name=name + "_bn1")
-        act1 = relay.nn.relu(data=bn1)
-        conv1 = layers.conv2d(
-            data=act1,
-            channels=int(num_filter * 0.25),
-            kernel_size=(1, 1),
-            strides=stride,
-            padding=(0, 0),
-            name=name + "_conv1",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, axis=bn_axis, name=name + "_bn2")
-        act2 = relay.nn.relu(data=bn2)
-        conv2 = layers.conv2d(
-            data=act2,
-            channels=int(num_filter * 0.25),
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=(1, 1),
-            name=name + "_conv2",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, axis=bn_axis, name=name + "_bn3")
-        act3 = relay.nn.relu(data=bn3)
-        conv3 = layers.conv2d(
-            data=act3,
-            channels=num_filter,
-            kernel_size=(1, 1),
-            strides=(1, 1),
-            padding=(0, 0),
-            name=name + "_conv3",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        if dim_match:
-            shortcut = data
-        else:
-            shortcut = layers.conv2d(
-                data=act1,
-                channels=num_filter,
-                kernel_size=(1, 1),
-                strides=stride,
-                name=name + "_sc",
-                data_layout=data_layout,
-                kernel_layout=kernel_layout,
-            )
-        return relay.add(conv3, shortcut)
-
-    bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, axis=bn_axis, name=name + "_bn1")
-    act1 = relay.nn.relu(data=bn1)
-    conv1 = layers.conv2d(
-        data=act1,
-        channels=num_filter,
-        kernel_size=(3, 3),
-        strides=stride,
-        padding=(1, 1),
-        name=name + "_conv1",
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-    bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, axis=bn_axis, name=name + "_bn2")
-    act2 = relay.nn.relu(data=bn2)
-    conv2 = layers.conv2d(
-        data=act2,
-        channels=num_filter,
-        kernel_size=(3, 3),
-        strides=(1, 1),
-        padding=(1, 1),
-        name=name + "_conv2",
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-
-    if dim_match:
-        shortcut = data
-    else:
-        shortcut = layers.conv2d(
-            data=act1,
-            channels=num_filter,
-            kernel_size=(1, 1),
-            strides=stride,
-            name=name + "_sc",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-    return relay.add(conv2, shortcut)
-
-
-def resnet(
-    units,
-    num_stages,
-    filter_list,
-    num_classes,
-    data_shape,
-    bottle_neck=True,
-    layout="NCHW",
-    dtype="float32",
-):
-    """Return ResNet Program.
-
-    Parameters
-    ----------
-    units : list
-        Number of units in each stage
-
-    num_stages : int
-        Number of stages
-
-    filter_list : list
-        Channel size of each stage
-
-    num_classes : int
-        Output size of symbol
-
-    data_shape : tuple of int.
-        The shape of input data.
-
-    bottle_neck : bool
-        Whether apply bottleneck transformation.
-
-    layout: str
-        The data layout for conv2d
-
-    dtype : str
-        The global data type.
-    """
-
-    data_layout = layout
-    kernel_layout = "OIHW" if layout == "NCHW" else "HWIO"
-    bn_axis = data_layout.index("C")
-
-    num_unit = len(units)
-    assert num_unit == num_stages
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    data = layers.batch_norm_infer(
-        data=data, epsilon=2e-5, axis=bn_axis, scale=False, name="bn_data"
-    )
-    (_, _, height, _) = data_shape
-    if layout == "NHWC":
-        (_, height, _, _) = data_shape
-    if height <= 32:  # such as cifar10
-        body = layers.conv2d(
-            data=data,
-            channels=filter_list[0],
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=(1, 1),
-            name="conv0",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-    else:  # often expected to be 224 such as imagenet
-        body = layers.conv2d(
-            data=data,
-            channels=filter_list[0],
-            kernel_size=(7, 7),
-            strides=(2, 2),
-            padding=(3, 3),
-            name="conv0",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        body = layers.batch_norm_infer(data=body, epsilon=2e-5, axis=bn_axis, name="bn0")
-        body = relay.nn.relu(data=body)
-        body = relay.nn.max_pool2d(
-            data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1), layout=data_layout
-        )
-
-    for i in range(num_stages):
-        body = residual_unit(
-            body,
-            filter_list[i + 1],
-            (1 if i == 0 else 2, 1 if i == 0 else 2),
-            False,
-            name=f"stage{i + 1}_unit1",
-            bottle_neck=bottle_neck,
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        for j in range(units[i] - 1):
-            body = residual_unit(
-                body,
-                filter_list[i + 1],
-                (1, 1),
-                True,
-                name=f"stage{i + 1}_unit{j + 2}",
-                bottle_neck=bottle_neck,
-                data_layout=data_layout,
-                kernel_layout=kernel_layout,
-            )
-    bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, axis=bn_axis, name="bn1")
-    relu1 = relay.nn.relu(data=bn1)
-    # Although kernel is not used here when global_pool=True, we should put one
-    pool1 = relay.nn.global_avg_pool2d(data=relu1, layout=data_layout)
-    flat = relay.nn.batch_flatten(data=pool1)
-    fc1 = layers.dense_add_bias(data=flat, units=num_classes, name="fc1")
-    net = relay.nn.softmax(data=fc1)
-    return relay.Function(relay.analysis.free_vars(net), net)
-
-
-def get_net(
-    batch_size,
-    num_classes,
-    num_layers=50,
-    image_shape=(3, 224, 224),
-    layout="NCHW",
-    dtype="float32",
-    **kwargs,
-):
-    """
-    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
-    Original author Wei Wu
-    """
-    (_, height, _) = image_shape
-    if layout == "NHWC":
-        (height, _, _) = image_shape
-    data_shape = (batch_size,) + image_shape
-    if height <= 28:
-        num_stages = 3
-        if (num_layers - 2) % 9 == 0 and num_layers >= 164:
-            per_unit = [(num_layers - 2) // 9]
-            filter_list = [16, 64, 128, 256]
-            bottle_neck = True
-        elif (num_layers - 2) % 6 == 0 and num_layers < 164:
-            per_unit = [(num_layers - 2) // 6]
-            filter_list = [16, 16, 32, 64]
-            bottle_neck = False
-        else:
-            raise ValueError(f"no experiments done on num_layers {num_layers}")
-        units = per_unit * num_stages
-    else:
-        if num_layers >= 50:
-            filter_list = [64, 256, 512, 1024, 2048]
-            bottle_neck = True
-        else:
-            filter_list = [64, 64, 128, 256, 512]
-            bottle_neck = False
-        num_stages = 4
-        if num_layers == 18:
-            units = [2, 2, 2, 2]
-        elif num_layers == 34:
-            units = [3, 4, 6, 3]
-        elif num_layers == 50:
-            units = [3, 4, 6, 3]
-        elif num_layers == 101:
-            units = [3, 4, 23, 3]
-        elif num_layers == 152:
-            units = [3, 8, 36, 3]
-        elif num_layers == 200:
-            units = [3, 24, 36, 3]
-        elif num_layers == 269:
-            units = [3, 30, 48, 8]
-        else:
-            raise ValueError(f"no experiments done on num_layers {num_layers}")
-
-    return resnet(
-        units=units,
-        num_stages=num_stages,
-        filter_list=filter_list,
-        num_classes=num_classes,
-        data_shape=data_shape,
-        bottle_neck=bottle_neck,
-        layout=layout,
-        dtype=dtype,
-    )
-
-
-def get_workload(
-    batch_size=1,
-    num_classes=1000,
-    num_layers=18,
-    image_shape=(3, 224, 224),
-    layout="NCHW",
-    dtype="float32",
-    **kwargs,
-):
-    """Get benchmark workload for resnet
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of classes
-
-    num_layers : int, optional
-        Number of layers
-
-    image_shape : tuple, optional
-        The input image shape
-
-    layout: str
-        The data layout for conv2d
-
-    dtype : str, optional
-        The data type
-
-    kwargs : dict
-        Extra arguments
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a ResNet network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(
-        batch_size=batch_size,
-        num_classes=num_classes,
-        num_layers=num_layers,
-        image_shape=image_shape,
-        dtype=dtype,
-        layout=layout,
-        **kwargs,
-    )
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/resnet_3d.py b/python/tvm/relay/testing/resnet_3d.py
deleted file mode 100644
index b20833402af4..000000000000
--- a/python/tvm/relay/testing/resnet_3d.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Network definition of 3D ResNet for Action Recognition (CVPR 2018)
-
-Reference : https://github.com/kenshohara/3D-ResNets-PyTorch
-"""
-
-# pylint: disable=unused-argument
-from tvm import relay
-from .init import create_workload
-from . import layers
-
-
-def residual_unit(
-    data,
-    num_filter,
-    stride,
-    dim_match,
-    name,
-    bottle_neck=True,
-    data_layout="NCDHW",
-    kernel_layout="OIDHW",
-):
-    """Return ResNet Unit symbol for building ResNet
-
-    Parameters
-    ----------
-    data : str
-        Input data
-
-    num_filter : int
-        Number of output channels
-
-    bnf : int
-        Bottle neck channels factor with regard to num_filter
-
-    stride : tuple
-        Stride used in convolution
-
-    dim_match : bool
-        True means channel number between input and output is the same,
-        otherwise means differ
-
-    name : str
-        Base name of the operators
-    """
-    if bottle_neck:
-        bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, name=name + "_bn1")
-        act1 = relay.nn.relu(data=bn1)
-        conv1 = layers.conv3d(
-            data=act1,
-            channels=int(num_filter * 0.25),
-            kernel_size=(1, 1, 1),
-            strides=stride,
-            padding=(0, 0, 0),
-            name=name + "_conv1",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + "_bn2")
-        act2 = relay.nn.relu(data=bn2)
-        conv2 = layers.conv3d(
-            data=act2,
-            channels=int(num_filter * 0.25),
-            kernel_size=(3, 3, 3),
-            strides=(1, 1, 1),
-            padding=(1, 1, 1),
-            name=name + "_conv2",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, name=name + "_bn3")
-        act3 = relay.nn.relu(data=bn3)
-        conv3 = layers.conv3d(
-            data=act3,
-            channels=num_filter,
-            kernel_size=(1, 1, 1),
-            strides=(1, 1, 1),
-            padding=(0, 0, 0),
-            name=name + "_conv3",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        if dim_match:
-            shortcut = data
-        else:
-            shortcut = layers.conv3d(
-                data=act1,
-                channels=num_filter,
-                kernel_size=(1, 1, 1),
-                strides=stride,
-                name=name + "_sc",
-                data_layout=data_layout,
-                kernel_layout=kernel_layout,
-            )
-        return relay.add(conv3, shortcut)
-
-    bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, name=name + "_bn1")
-    act1 = relay.nn.relu(data=bn1)
-    conv1 = layers.conv3d(
-        data=act1,
-        channels=num_filter,
-        kernel_size=(3, 3, 3),
-        strides=stride,
-        padding=(1, 1, 1),
-        name=name + "_conv1",
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-    bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + "_bn2")
-    act2 = relay.nn.relu(data=bn2)
-    conv2 = layers.conv3d(
-        data=act2,
-        channels=num_filter,
-        kernel_size=(3, 3, 3),
-        strides=(1, 1, 1),
-        padding=(1, 1, 1),
-        name=name + "_conv2",
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-
-    if dim_match:
-        shortcut = data
-    else:
-        shortcut = layers.conv3d(
-            data=act1,
-            channels=num_filter,
-            kernel_size=(1, 1, 1),
-            strides=stride,
-            name=name + "_sc",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-    return relay.add(conv2, shortcut)
-
-
-def resnet(
-    units,
-    num_stages,
-    filter_list,
-    num_classes,
-    data_shape,
-    bottle_neck=True,
-    layout="NCDHW",
-    dtype="float32",
-):
-    """Return ResNet Program.
-
-    Parameters
-    ----------
-    units : list
-        Number of units in each stage
-
-    num_stages : int
-        Number of stages
-
-    filter_list : list
-        Channel size of each stage
-
-    num_classes : int
-        Output size of symbol
-
-    data_shape : tuple of int.
-        The shape of input data.
-
-    bottle_neck : bool
-        Whether apply bottleneck transformation.
-
-    layout: str
-        The data layout for conv3d
-
-    dtype : str
-        The global data type.
-    """
-
-    data_layout = layout
-    kernel_layout = "OIDHW" if layout == "NCDHW" else "DHWIO"
-
-    num_unit = len(units)
-    assert num_unit == num_stages
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    data = layers.batch_norm_infer(data=data, epsilon=2e-5, scale=False, name="bn_data")
-    if layout == "NCDHW":
-        (_, _, _, height, _) = data_shape
-    else:
-        (_, _, height, _, _) = data_shape
-    if height <= 32:  # such as cifar10
-        body = layers.conv3d(
-            data=data,
-            channels=filter_list[0],
-            kernel_size=(3, 3, 3),
-            strides=(1, 1, 1),
-            padding=(1, 1, 1),
-            name="conv0",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-    else:  # often expected to be 224 such as imagenet
-        body = layers.conv3d(
-            data=data,
-            channels=filter_list[0],
-            kernel_size=(3, 7, 7),
-            strides=(1, 2, 2),
-            padding=(1, 3, 3),
-            name="conv0",
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        body = layers.batch_norm_infer(data=body, epsilon=2e-5, name="bn0")
-        body = relay.nn.relu(data=body)
-        # body = relay.nn.max_pool3d(data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1),
-        #                           layout=data_layout)
-
-    for i in range(num_stages):
-        body = residual_unit(
-            body,
-            filter_list[i + 1],
-            (1 if i == 0 else 2, 1 if i == 0 else 2, 1 if i == 0 else 2),
-            False,
-            name=f"stage{i + 1}_unit1",
-            bottle_neck=bottle_neck,
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        for j in range(units[i] - 1):
-            body = residual_unit(
-                body,
-                filter_list[i + 1],
-                (1, 1, 1),
-                True,
-                name=f"stage{i + 1}_unit{j + 2}",
-                bottle_neck=bottle_neck,
-                data_layout=data_layout,
-                kernel_layout=kernel_layout,
-            )
-    bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, name="bn1")
-    relu1 = relay.nn.relu(data=bn1)
-    # Although kernel is not used here when global_pool=True, we should put one
-    pool1 = relay.nn.global_avg_pool3d(data=relu1, layout=data_layout)
-    flat = relay.nn.batch_flatten(data=pool1)
-    fc1 = layers.dense_add_bias(data=flat, units=num_classes, name="fc1")
-    net = relay.nn.softmax(data=fc1)
-    return relay.Function(relay.analysis.free_vars(net), net)
-
-
-def get_net(
-    batch_size,
-    num_classes,
-    num_layers=50,
-    image_shape=(3, 16, 112, 112),
-    layout="NCDHW",
-    dtype="float32",
-    **kwargs,
-):
-    """
-    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
-    Original author Wei Wu
-    """
-    if layout == "NCDHW":
-        (_, _, height, _) = image_shape
-    else:
-        (_, height, _, _) = image_shape
-    data_shape = (batch_size,) + image_shape
-    if height <= 28:
-        num_stages = 3
-        if (num_layers - 2) % 9 == 0 and num_layers >= 164:
-            per_unit = [(num_layers - 2) // 9]
-            filter_list = [16, 64, 128, 256]
-            bottle_neck = True
-        elif (num_layers - 2) % 6 == 0 and num_layers < 164:
-            per_unit = [(num_layers - 2) // 6]
-            filter_list = [16, 16, 32, 64]
-            bottle_neck = False
-        else:
-            raise ValueError(f"no experiments done on num_layers {num_layers}")
-        units = per_unit * num_stages
-    else:
-        if num_layers >= 50:
-            filter_list = [64, 256, 512, 1024, 2048]
-            bottle_neck = True
-        else:
-            filter_list = [64, 64, 128, 256, 512]
-            bottle_neck = False
-        num_stages = 4
-        if num_layers == 18:
-            units = [2, 2, 2, 2]
-        elif num_layers == 34:
-            units = [3, 4, 6, 3]
-        elif num_layers == 50:
-            units = [3, 4, 6, 3]
-        elif num_layers == 101:
-            units = [3, 4, 23, 3]
-        elif num_layers == 152:
-            units = [3, 8, 36, 3]
-        elif num_layers == 200:
-            units = [3, 24, 36, 3]
-        elif num_layers == 269:
-            units = [3, 30, 48, 8]
-        else:
-            raise ValueError(f"no experiments done on num_layers {num_layers}")
-
-    return resnet(
-        units=units,
-        num_stages=num_stages,
-        filter_list=filter_list,
-        num_classes=num_classes,
-        data_shape=data_shape,
-        bottle_neck=bottle_neck,
-        layout=layout,
-        dtype=dtype,
-    )
-
-
-def get_workload(
-    batch_size=1,
-    num_classes=1000,
-    num_layers=18,
-    image_shape=(3, 16, 112, 112),
-    layout="NCDHW",
-    dtype="float32",
-    **kwargs,
-):
-    """Get benchmark workload for resnet
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of classes
-
-    num_layers : int, optional
-        Number of layers
-
-    image_shape : tuple, optional
-        The input image shape
-
-    layout: str
-        The data layout for conv3d
-
-    dtype : str, optional
-        The data type
-
-    kwargs : dict
-        Extra arguments
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a ResNet network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(
-        batch_size=batch_size,
-        num_classes=num_classes,
-        num_layers=num_layers,
-        image_shape=image_shape,
-        dtype=dtype,
-        layout=layout,
-        **kwargs,
-    )
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/squeezenet.py b/python/tvm/relay/testing/squeezenet.py
deleted file mode 100644
index ce918fd879d9..000000000000
--- a/python/tvm/relay/testing/squeezenet.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-# pylint: disable=unused-argument
-
-"""
-Symbol of SqueezeNet
-
-Reference:
-Iandola, Forrest N., et al.
-"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
-"""
-
-from tvm import relay
-from .init import create_workload
-from . import layers
-
-# Helpers
-def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels, prefix):
-    net = _make_fire_conv(net, squeeze_channels, 1, 0, f"{prefix}_input")
-
-    left = _make_fire_conv(net, expand1x1_channels, 1, 0, f"{prefix}_left")
-    right = _make_fire_conv(net, expand3x3_channels, 3, 1, f"{prefix}_right")
-    # NOTE : Assume NCHW layout here
-    net = relay.concatenate((left, right), axis=1)
-    return net
-
-
-def _make_fire_conv(net, channels, kernel_size, padding=0, prefix=""):
-    net = layers.conv2d(
-        net,
-        channels=channels,
-        kernel_size=(kernel_size, kernel_size),
-        padding=(padding, padding),
-        name=f"{prefix}_conv",
-    )
-    net = relay.nn.bias_add(net, relay.var(f"{prefix}_conv_bias"))
-    net = relay.nn.relu(net)
-    return net
-
-
-# Net
-def get_net(batch_size, image_shape, num_classes, version, dtype):
-    """Get symbol of SqueezeNet
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    image_shape : tuple, optional
-        The input image shape
-
-    num_classes: int
-        The number of classification results
-
-    version : str, optional
-        "1.0" or "1.1" of SqueezeNet
-    """
-    assert version in ["1.0", "1.1"], (
-        f"Unsupported SqueezeNet version {version}:" "1.0 or 1.1 expected"
-    )
-    data_shape = (batch_size,) + image_shape
-    net = relay.var("data", shape=data_shape, dtype=dtype)
-    if version == "1.0":
-        net = layers.conv2d(
-            net, channels=96, kernel_size=(7, 7), strides=(2, 2), padding=(3, 3), name="conv1"
-        )
-        net = relay.nn.bias_add(net, relay.var("conv1_bias"))
-        net = relay.nn.relu(net)
-        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 16, 64, 64, "fire1")
-        net = _make_fire(net, 16, 64, 64, "fire2")
-        net = _make_fire(net, 32, 128, 128, "fire3")
-        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 32, 128, 128, "fire4")
-        net = _make_fire(net, 48, 192, 192, "fire5")
-        net = _make_fire(net, 48, 192, 192, "fire6")
-        net = _make_fire(net, 64, 256, 256, "fire7")
-        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 64, 256, 256, "fire8")
-    else:
-        net = layers.conv2d(
-            net, channels=64, kernel_size=(3, 3), strides=(2, 2), padding=(1, 1), name="conv1"
-        )
-        net = relay.nn.bias_add(net, relay.var("conv1_bias"))
-        net = relay.nn.relu(net)
-        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 16, 64, 64, "fire1")
-        net = _make_fire(net, 16, 64, 64, "fire2")
-        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 32, 128, 128, "fire3")
-        net = _make_fire(net, 32, 128, 128, "fire4")
-        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 48, 192, 192, "fire5")
-        net = _make_fire(net, 48, 192, 192, "fire6")
-        net = _make_fire(net, 64, 256, 256, "fire7")
-        net = _make_fire(net, 64, 256, 256, "fire8")
-    net = relay.nn.dropout(net, rate=0.5)
-    net = layers.conv2d(net, channels=num_classes, kernel_size=(1, 1), name="conv_final")
-    net = relay.nn.bias_add(net, relay.var("conv_final_bias"))
-    net = relay.nn.relu(net)
-    net = relay.nn.global_avg_pool2d(net)
-    net = relay.nn.batch_flatten(net)
-    net = relay.nn.softmax(net)
-    args = relay.analysis.free_vars(net)
-    return relay.Function(args, net)
-
-
-def get_workload(
-    batch_size=1, num_classes=1000, version="1.0", image_shape=(3, 224, 224), dtype="float32"
-):
-    """Get benchmark workload for SqueezeNet
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of classes
-
-    version : str, optional
-        "1.0" or "1.1" of SqueezeNet
-
-    image_shape : tuple, optional
-        The input image shape
-
-    dtype : str, optional
-        The data type
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a SqueezeNet network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-
-    net = get_net(batch_size, image_shape, num_classes, version, dtype)
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/synthetic.py b/python/tvm/relay/testing/synthetic.py
deleted file mode 100644
index 7b7778990cb0..000000000000
--- a/python/tvm/relay/testing/synthetic.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Synthetic networks for testing purposes. Ideally, these networks are similar in
-structure to real world networks, but are much smaller in order to make testing
-faster.
-"""
-from __future__ import absolute_import
-from tvm import relay
-from .init import create_workload, Constant
-from . import layers
-
-
-def get_net(input_shape=(1, 3, 24, 12), dtype="float32", wtype=None):
-    """Get synthetic testing network.
-
-    Parameters
-    ----------
-    image_shape : tuple, optional
-        The input shape as (batch_size, channels, height, width).
-
-    dtype : str, optional
-        The data type for the input.
-
-    wtype : str, optional
-        The data type for weights. Defaults to `dtype`.
-
-    Returns
-    -------
-    net : relay.Function
-        The dataflow.
-    """
-    if wtype is None:
-        wtype = dtype
-    data = relay.var("data", shape=input_shape, dtype=dtype)
-    dense_shape = [-1, input_shape[3]]
-    dense = relay.nn.relu(
-        relay.nn.dense(
-            relay.reshape(data, dense_shape),
-            relay.var("dense_weight", shape=[input_shape[3], dense_shape[1]], dtype=wtype),
-        )
-    )
-    dense = relay.reshape_like(dense, data)
-    conv_shape = [input_shape[1], input_shape[1], 3, 3]
-    conv = relay.nn.softmax(
-        relay.nn.conv2d(
-            data,
-            relay.var("conv_weight", shape=conv_shape, dtype=wtype),
-            padding=1,
-            kernel_size=3,
-        )
-    )
-    added = relay.add(dense, conv)
-    biased = layers.batch_norm_infer(
-        relay.nn.bias_add(added, relay.var("bias", dtype=wtype)), name="batch_norm"
-    )
-    dense = relay.nn.relu(
-        relay.nn.dense(
-            relay.reshape(biased, dense_shape),
-            relay.var("dense2_weight", shape=[input_shape[3], dense_shape[1]], dtype=wtype),
-        )
-    )
-    dense = relay.reshape_like(dense, data)
-    conv = relay.nn.softmax(
-        relay.nn.conv2d(
-            biased,
-            relay.var("conv2_weight", shape=conv_shape, dtype=wtype),
-            padding=1,
-            kernel_size=3,
-        )
-    )
-    added = relay.add(dense, conv)
-    args = relay.analysis.free_vars(added)
-    return relay.Function(args, added)
-
-
-def get_workload(input_shape=(1, 3, 24, 12), dtype="float32", wtype=None):
-    """Get benchmark workload for the synthetic net.
-
-    Parameters
-    ----------
-    image_shape : tuple, optional
-        The input shape as (batch_size, channels, height, width).
-
-    dtype : str, optional
-        The data type for the input.
-
-    wtype : str, optional
-        The data type for weights. Defaults to `dtype`.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a synthetic network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    return create_workload(
-        get_net(input_shape=input_shape, dtype=dtype, wtype=wtype),
-        initializer=Constant(),
-    )
diff --git a/python/tvm/relay/testing/temp_op_attr.py b/python/tvm/relay/testing/temp_op_attr.py
deleted file mode 100644
index e2d2e6bbcd42..000000000000
--- a/python/tvm/relay/testing/temp_op_attr.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Defines a TempOpAttr class that allows temporarily changing an attr of the
-operator to allow unit testing. This is useful for AlterOpLayout and Legalize
-tests."""
-
-from tvm import relay
-
-
-class TempOpAttr(object):
-    """Temporarily changes the attr of an op."""
-
-    def __init__(self, op_name, attr_key, attr_value):
-        """Saves the required info for RAII pattern usage.
-
-        Parameters
-        ----------
-        op_name : str
-            The op name.
-
-        attr_key : str
-            The attribute name.
-
-        attr_value : object
-            The attribute value.
-
-        Examples
-        --------
-        .. code-block:: python
-
-        # Temporarily update FTVMAlterOpLayout to a user-defined packed function.
-        # After the test is finished, the attr value will be set back to the original value.
-
-        with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-            my_mod = relay.transform.AlterOpLayout()(my_mod)
-
-        """
-        self.op = relay.op.get(op_name)
-        self.attr_key = attr_key
-        self.attr_value = attr_value
-
-    def __enter__(self):
-        self.older_attr = self.op.get_attr(self.attr_key)
-        self.op.reset_attr(self.attr_key)
-        self.op.set_attr(self.attr_key, self.attr_value)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        self.op.reset_attr(self.attr_key)
-        if self.older_attr:
-            self.op.set_attr(self.attr_key, self.older_attr)
diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py
deleted file mode 100644
index 158de22eea8a..000000000000
--- a/python/tvm/relay/testing/tf.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, no-init, import-outside-toplevel
-"""
-Tensorflow Model Helpers
-========================
-Some helper definitions for tensorflow models.
-"""
-import re
-import os.path
-import collections
-import numpy as np
-
-# Tensorflow imports
-import tensorflow as tf
-from tensorflow.core.framework import graph_pb2
-
-import tvm
-from tvm.contrib.download import download_testdata
-
-try:
-    tf_compat_v1 = tf.compat.v1
-except (ImportError, AttributeError):
-    tf_compat_v1 = tf
-
-######################################################################
-# Some helper functions
-# ---------------------
-
-
-def ProcessGraphDefParam(graph_def):
-    """Type-checks and possibly canonicalizes `graph_def`.
-
-    Parameters
-    ----------
-    graph_def : Obj
-        tensorflow graph definition.
-
-    Returns
-    -------
-    graph_def : Obj
-        tensorflow graph definition
-
-    """
-
-    if not isinstance(graph_def, graph_pb2.GraphDef):
-        # `graph_def` could be a dynamically-created message, so try a duck-typed
-        # approach
-        try:
-            old_graph_def = graph_def
-            graph_def = graph_pb2.GraphDef()
-            graph_def.MergeFrom(old_graph_def)
-        except TypeError:
-            raise TypeError("graph_def must be a GraphDef proto.")
-    return graph_def
-
-
-def convert_to_list(x):
-    if not isinstance(x, list):
-        x = [x]
-    return x
-
-
-def vmobj_to_list(o):
-    """Converts TVM objects returned by VM execution to Python List.
-
-    Parameters
-    ----------
-    o : Obj
-        VM Object as output from VM runtime executor.
-
-    Returns
-    -------
-    result : list
-        Numpy objects as list with equivalent values to the input object.
-
-    """
-
-    if isinstance(o, tvm.nd.NDArray):
-        result = [o.numpy()]
-    elif isinstance(o, tvm.runtime.container.ADT):
-        result = []
-        for f in o:
-            result.extend(vmobj_to_list(f))
-    elif isinstance(o, tvm.relay.backend.interpreter.ConstructorValue):
-        if o.constructor.name_hint == "Cons":
-            tl = vmobj_to_list(o.fields[1])
-            hd = vmobj_to_list(o.fields[0])
-            hd.extend(tl)
-            result = hd
-        elif o.constructor.name_hint == "Nil":
-            result = []
-        elif "tensor_nil" in o.constructor.name_hint:
-            result = [0]
-        elif "tensor" in o.constructor.name_hint:
-            result = [o.fields[0].numpy()]
-        else:
-            raise RuntimeError(f"Unknown object type: {o.constructor.name_hint}")
-    else:
-        raise RuntimeError(f"Unknown object type: {type(o)}")
-    return result
-
-
-def AddShapesToGraphDef(session, out_node):
-    """Add shapes attribute to nodes of the graph.
-        Input graph here is the default graph in context.
-
-    Parameters
-    ----------
-    session : tf.Session
-        Tensorflow session
-    out_node : String or List
-        Final output node of the graph.
-
-    Returns
-    -------
-    graph_def : Obj
-        tensorflow graph definition with shapes attribute added to nodes.
-
-    """
-
-    graph_def = tf_compat_v1.graph_util.convert_variables_to_constants(
-        session, session.graph.as_graph_def(add_shapes=True), convert_to_list(out_node)
-    )
-    return graph_def
-
-
-class NodeLookup(object):
-    """Converts integer node ID's to human readable labels."""
-
-    def __init__(self, label_lookup_path=None, uid_lookup_path=None):
-        self.node_lookup = self.load(label_lookup_path, uid_lookup_path)
-
-    def load(self, label_lookup_path, uid_lookup_path):
-        """Loads a human readable English name for each softmax node.
-
-        Parameters
-        ----------
-        label_lookup_path: String
-            File containing String UID to integer node ID mapping .
-
-        uid_lookup_path: String
-            File containing String UID to human-readable string mapping.
-
-        Returns
-        -------
-        node_id_to_name: dict
-            dict from integer node ID to human-readable string.
-
-        """
-        if not tf_compat_v1.gfile.Exists(uid_lookup_path):
-            tf.logging.fatal("File does not exist %s", uid_lookup_path)
-        if not tf_compat_v1.gfile.Exists(label_lookup_path):
-            tf.logging.fatal("File does not exist %s", label_lookup_path)
-
-        # Loads mapping from string UID to human-readable string
-        proto_as_ascii_lines = tf_compat_v1.gfile.GFile(uid_lookup_path).readlines()
-        uid_to_human = {}
-        p = re.compile(r"[n\d]*[ \S,]*")
-        for line in proto_as_ascii_lines:
-            parsed_items = p.findall(line)
-            uid = parsed_items[0]
-            human_string = parsed_items[2]
-            uid_to_human[uid] = human_string
-
-        # Loads mapping from string UID to integer node ID.
-        node_id_to_uid = {}
-        proto_as_ascii = tf_compat_v1.gfile.GFile(label_lookup_path).readlines()
-        for line in proto_as_ascii:
-            if line.startswith("  target_class:"):
-                target_class = int(line.split(": ")[1])
-            if line.startswith("  target_class_string:"):
-                target_class_string = line.split(": ")[1]
-                node_id_to_uid[target_class] = target_class_string[1:-2]
-
-        # Loads the final mapping of integer node ID to human-readable string
-        node_id_to_name = {}
-        for key, val in node_id_to_uid.items():
-            if val not in uid_to_human:
-                tf.logging.fatal("Failed to locate: %s", val)
-            name = uid_to_human[val]
-            node_id_to_name[key] = name
-
-        return node_id_to_name
-
-    def id_to_string(self, node_id):
-        if node_id not in self.node_lookup:
-            return ""
-        return self.node_lookup[node_id]
-
-
-def get_workload_official(model_url, model_sub_path, retries=5):
-    """Import workload from tensorflow official
-
-    Parameters
-    ----------
-    model_url: str
-        URL from where it will be downloaded.
-
-    model_sub_path:
-        Sub path in extracted tar for the ftozen protobuf file.
-
-    retries: int
-        The number of retries to attempt downloading and uncompressing
-        the model in the CI, due to possible network and CI node issues.
-
-    Returns
-    -------
-    model_path: str
-        Full path to saved model file
-
-    """
-    attempts = retries + 1
-    error = None
-    for current_attempt_idx in range(attempts):
-        try:
-            model_tar_name = os.path.basename(model_url)
-            model_path = download_testdata(model_url, model_tar_name, module=["tf", "official"])
-            dir_path = os.path.dirname(model_path)
-
-            if model_path.endswith("tgz") or model_path.endswith("gz"):
-                import tarfile
-
-                tar = tarfile.open(model_path)
-                tar.extractall(path=dir_path)
-                tar.close()
-            elif model_path.endswith("zip"):
-                import zipfile
-
-                zip_object = zipfile.ZipFile(model_path)
-                zip_object.extractall(path=dir_path)
-                zip_object.close()
-            else:
-                raise RuntimeError("Could not decompress the file: " + model_path)
-            return os.path.join(dir_path, model_sub_path)
-        except (EOFError, RuntimeError) as err:
-            error = err
-            print(f"Raised : {str(error)}, current attempt : {current_attempt_idx} ...")
-    raise error
-
-
-def get_workload(model_path, model_sub_path=None, inputs_dict=None, output=None):
-    """Import workload from frozen protobuf
-
-    Parameters
-    ----------
-    model_path: str
-        model_path on remote repository to download from.
-
-    model_sub_path: str
-        Model path in the compressed archive.
-
-    Returns
-    -------
-    graph_def: graphdef
-        graph_def is the tensorflow workload.
-
-    """
-
-    if model_sub_path:
-        path_model = get_workload_official(model_path, model_sub_path)
-    else:
-        repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/"
-        model_url = os.path.join(repo_base, model_path)
-        path_model = download_testdata(model_url, model_path, module="tf")
-
-    # Creates graph from saved graph_def.pb.
-    with tf_compat_v1.gfile.FastGFile(path_model, "rb") as f:
-        graph_def = tf_compat_v1.GraphDef()
-        graph_def.ParseFromString(f.read())
-        graph = tf_compat_v1.import_graph_def(graph_def, name="", input_map=inputs_dict)
-
-    if inputs_dict is not None:
-        # graph is changed so generate graph_def again
-        with tf_compat_v1.Session(graph=graph) as sess:
-            graph_def = AddShapesToGraphDef(sess, output)
-
-    return graph_def
-
-
-#######################################################################
-# PTB LSTMBlockCell Model
-# -----------------------
-
-
-class PTBSmallConfig(object):
-    """Small config.
-    This configurations are used when training the model
-    """
-
-    num_layers = 2
-    num_steps = 1
-    hidden_size = 200
-    batch_size = 1
-    vocab_size = 10000
-    init_scale = 0.1
-
-
-def get_config():
-    """Configuration used for training the model"""
-    return PTBSmallConfig()
-
-
-def pick_from_weight(weight, pows=1.0):
-    """Identify token from Softmax output.
-    This token will be mapped to word in the vocabulary.
-    """
-    weight = weight**pows
-    t = np.cumsum(weight)
-    s = np.sum(weight)
-    return int(np.searchsorted(t, 0.5 * s))
-
-
-def do_tf_sample(session, data, in_states, num_samples):
-    """Sampled from the model"""
-    samples = []
-    sample = None
-    # Cell inputs c and h should be passed for each layer explicitly.
-    state_input_name = [
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState/zeros:0",
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState/zeros_1:0",
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState_1/zeros:0",
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState_1/zeros_1:0",
-    ]
-    state = in_states
-
-    # Graph nodes to be fetched as run output. Tensorflow LSTMBlockCell create internal
-    # nodes for intermediate operations (gates) in the cell during run.
-    # Cell state (c) is ':1'and cell output (h) is ':6' for each layer.
-    fetches = [
-        [
-            "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:1",
-            "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:6",
-            "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:1",
-            "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:6",
-        ],
-        "Model/Softmax:0",
-    ]
-
-    def _get_feed_dict(input_name, input_data):
-        """Create feed dict"""
-        feed_dict = {}
-        if isinstance(input_data, list):
-            for i, e in enumerate(input_name):
-                feed_dict[e] = input_data[i]
-        else:
-            feed_dict[input_name] = input_data
-        return feed_dict
-
-    for x in data:
-        feed_dict = _get_feed_dict(state_input_name, state)
-        feed_dict["Model/Placeholder:0"] = [[x]]
-        state, probs = session.run(fetches, feed_dict)
-        sample = pick_from_weight(probs[0])
-    if sample is not None:
-        samples.append(sample)
-    else:
-        samples.append(0)
-
-    k = 1
-    while k < num_samples:
-        feed_dict = _get_feed_dict(state_input_name, state)
-        feed_dict["Model/Placeholder:0"] = [[samples[-1]]]
-        state, probs = session.run(fetches, feed_dict)
-        sample = pick_from_weight(probs[0])
-        samples.append(sample)
-        k += 1
-    return samples, state
-
-
-def _create_ptb_vocabulary(data_dir):
-    """Read the PTB sample data input to create vocabulary"""
-    data_path = os.path.join(data_dir, "simple-examples/data/")
-    file_name = "ptb.train.txt"
-
-    def _read_words(filename):
-        """Read the data for creating vocabulary"""
-        with tf_compat_v1.gfile.GFile(filename, "r") as f:
-            return f.read().encode("utf-8").decode("utf-8").replace("\n", "<eos>").split()
-
-    def _build_vocab(filename):
-        """Create vocabulary"""
-        data = _read_words(filename)
-        counter = collections.Counter(data)
-        count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
-        words, _ = list(zip(*count_pairs))
-        word_to_id = dict(zip(words, range(len(words))))
-        # for python 3.x
-        id_to_word = dict((v, k) for k, v in word_to_id.items())
-        return word_to_id, id_to_word
-
-    def ptb_raw_data(data_path, file_name):
-        """Read the sample data and create vocabulary"""
-        train_path = os.path.join(data_path, file_name)
-        word_to_id, id_2_word = _build_vocab(train_path)
-        return word_to_id, id_2_word
-
-    return ptb_raw_data(data_path, file_name)
-
-
-def get_workload_ptb():
-    """Import ptb workload from frozen protobuf
-
-    Parameters
-    ----------
-        Nothing.
-
-    Returns
-    -------
-    graph_def: graphdef
-        graph_def is the tensorflow workload for ptb.
-
-    word_to_id : dict
-        English word to integer id mapping
-
-    id_to_word : dict
-        Integer id to English word mapping
-    """
-    sample_repo = "http://www.fit.vutbr.cz/~imikolov/rnnlm/"
-    sample_data_file = "simple-examples.tgz"
-    sample_url = sample_repo + sample_data_file
-    ptb_model_file = "RNN/ptb/ptb_model_with_lstmblockcell.pb"
-    # pylint: disable=import-outside-toplevel
-    import tarfile
-
-    file_path = download_testdata(sample_url, sample_data_file, module=["data", "ptb_data"])
-    dir_path = os.path.dirname(file_path)
-    t = tarfile.open(file_path, "r")
-    t.extractall(dir_path)
-
-    word_to_id, id_to_word = _create_ptb_vocabulary(dir_path)
-    dtype = "float32"
-    shape = (1, 200)
-
-    # Convert states of LSTMBlockCell to placeholder, so TVM can feed data
-    state_name = [
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState/zeros:0",
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState/zeros_1:0",
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState_1/zeros:0",
-        "Model/MultiRNNCellZeroState/LSTMBlockCellZeroState_1/zeros_1:0",
-    ]
-
-    inputs_dict = {
-        state_name[0]: tf_compat_v1.placeholder(dtype, shape, state_name[0].split(":")[0]),
-        state_name[1]: tf_compat_v1.placeholder(dtype, shape, state_name[1].split(":")[0]),
-        state_name[2]: tf_compat_v1.placeholder(dtype, shape, state_name[2].split(":")[0]),
-        state_name[3]: tf_compat_v1.placeholder(dtype, shape, state_name[3].split(":")[0]),
-    }
-    return (
-        word_to_id,
-        id_to_word,
-        get_workload(ptb_model_file, inputs_dict=inputs_dict, output="Model/Softmax"),
-    )
diff --git a/python/tvm/relay/testing/tflite.py b/python/tvm/relay/testing/tflite.py
deleted file mode 100644
index 29f6bc62cad2..000000000000
--- a/python/tvm/relay/testing/tflite.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Common utilities for creating TFLite models"""
-from packaging.version import parse
-import numpy as np
-import pytest
-import tflite.Model  # pylint: disable=wrong-import-position
-import tensorflow as tf  # pylint: disable=wrong-import-position
-import tvm
-
-pytest.importorskip("tflite")
-pytest.importorskip("tensorflow")
-
-
-class TFLiteModel:
-    """Creates TFLite Model and facilitates reference data generation"""
-
-    def __init__(self, dtype):
-        self.serial_model = None  # This is what TFLite convert() provides
-        self.dtype = dtype  # This is the dtype of graph inputs
-        self.shape_dict = {}
-        self.dtype_dict = {}
-
-    def create_conv2d_single(self, kernel_shape, strides, padding, dilation, activation):
-        """Returns tf.function that creates TFLite Conv2d layer"""
-
-        @tf.function
-        def conv2d_single_function(ifm_tensor):
-            """Returns TFLite Conv2d layer"""
-            op = tf.nn.conv2d(
-                ifm_tensor,
-                filters=tf.constant(
-                    np.random.uniform(size=[kernel_shape[0], kernel_shape[1], 3, 3]),
-                    dtype=tf.float32,
-                ),
-                strides=[1, strides[0], strides[1], 1],
-                padding=padding,
-                dilations=dilation,
-            )
-            if activation == "RELU":
-                op = tf.nn.relu(op)
-            elif activation == "NONE":
-                pass
-            else:
-                assert False, f"Unsupported activation {activation}"
-            return op
-
-        return conv2d_single_function
-
-    def load_from_file(self, model_file, shapes):
-        """Load tflite model from a tflite file"""
-        for i, shape in enumerate(shapes):
-            input_name = "input_" + str(i)
-            self.shape_dict.update({input_name: shape})
-            self.dtype_dict.update({input_name: self.dtype})
-
-        with open(model_file, "rb") as f:
-            self.serial_model = f.read()
-
-    def create_tflite_model(self, tfl_function, shapes, ranges=None):
-        """Creates TFLite serial graph"""
-        tensor_specs = []
-        for i, shape in enumerate(shapes):
-            input_name = "input_" + str(i)
-            self.shape_dict.update({input_name: shape})
-            self.dtype_dict.update({input_name: self.dtype})
-            tensor_specs.append(tf.TensorSpec(shape, dtype=tf.float32, name=input_name))
-        concrete_func = tfl_function.get_concrete_function(*tensor_specs)
-
-        if not ranges:
-            ranges = [(0, 1) for _ in shapes]
-
-        def representative_dataset():
-            for _ in range(100):
-                inputs = []
-                for i, shape in enumerate(shapes):
-                    data = np.random.uniform(
-                        low=ranges[i][0], high=ranges[i][1], size=tuple(shape)
-                    ).astype("float32")
-                    inputs.append(data)
-
-                yield inputs
-
-        converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
-        converter.optimizations = [tf.lite.Optimize.DEFAULT]
-        converter.representative_dataset = representative_dataset
-        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
-        converter.inference_input_type = tf.int8
-        converter.inference_output_type = tf.int8
-        self.serial_model = converter.convert()
-
-    def convert_to_relay(self):
-        """Converts TFLite serialized graph into Relay"""
-        assert self.serial_model is not None, "TFLite model is empty!"
-
-        tflite_model = tflite.Model.Model.GetRootAsModel(self.serial_model, 0)
-        relay_module, relay_params = tvm.relay.frontend.from_tflite(
-            tflite_model, self.shape_dict, self.dtype_dict
-        )
-        return relay_module, relay_params
-
-    def generate_randomized_input_data(self, seed, shape, dtype):
-        """Generates randomized input numpy arrays based on shape and dtype."""
-        random_state = np.random.RandomState(seed)
-        random_data = None
-        if dtype == np.float32:
-            random_data = random_state.uniform(-1, 1, size).astype(dtype)
-        else:
-            low = np.iinfo(dtype).min
-            high = np.iinfo(dtype).max + 1
-            random_data = random_state.randint(low, high, shape, dtype)
-        return random_data
-
-    # pylint: disable=import-outside-toplevel
-    def generate_reference_data(self):
-        """
-        This method uses TFLite reference kernels to generate reference output.
-        It returns randomized inputs and reference outputs.
-        """
-        assert self.serial_model is not None, "TFLite model was not created."
-
-        output_tolerance = None
-        if parse(tf.__version__) < parse("2.5.0"):
-            output_tolerance = 1
-            interpreter = tf.lite.Interpreter(model_content=self.serial_model)
-        else:
-            output_tolerance = 0
-            interpreter = tf.lite.Interpreter(
-                model_content=self.serial_model,
-                experimental_op_resolver_type=tf.lite.experimental.OpResolverType.BUILTIN_REF,
-                experimental_preserve_all_tensors=False,
-            )
-
-        interpreter.allocate_tensors()
-        input_details = interpreter.get_input_details()
-        output_details = interpreter.get_output_details()
-
-        # Generate predictable randomized input
-        seed = 0
-        input_data = {}
-        for input_detail in input_details:
-            input_values = self.generate_randomized_input_data(
-                seed, input_detail["shape"], input_detail["dtype"]
-            )
-            interpreter.set_tensor(input_detail["index"], input_values)
-            input_data.update({input_detail["name"]: input_values})
-
-        interpreter.invoke()
-
-        # Obtain the expected output from interpreter
-        expected_output_data = {}
-        for output_detail in output_details:
-            expected_output_data.update(
-                {output_detail["name"]: interpreter.get_tensor(output_detail["index"])}
-            )
-
-        return input_data, expected_output_data, output_tolerance
diff --git a/python/tvm/relay/testing/vgg.py b/python/tvm/relay/testing/vgg.py
deleted file mode 100644
index 426cd9e60850..000000000000
--- a/python/tvm/relay/testing/vgg.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""References:
-
-Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
-large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
-"""
-from tvm import relay
-from .init import create_workload
-from . import layers as wrapper
-
-
-def get_feature(internal_layer, layers, filters, batch_norm=False):
-    """Get VGG feature body as stacks of convolutions."""
-    for i, num in enumerate(layers):
-        for j in range(num):
-            internal_layer = wrapper.conv2d(
-                data=internal_layer,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                channels=filters[i],
-                name=f"conv{i + 1}_{j + 1}",
-            )
-            internal_layer = relay.nn.bias_add(
-                internal_layer, relay.var(f"conv{i + 1}_{j + 1}_bias")
-            )
-            if batch_norm:
-                internal_layer = wrapper.batch_norm_infer(
-                    data=internal_layer, name=f"bn{i + 1}_{j + 1}"
-                )
-            internal_layer = relay.nn.relu(data=internal_layer)
-        internal_layer = relay.nn.max_pool2d(data=internal_layer, pool_size=(2, 2), strides=(2, 2))
-    return internal_layer
-
-
-def get_classifier(input_data, num_classes):
-    """Get VGG classifier layers as fc layers."""
-    flatten = relay.nn.batch_flatten(data=input_data)
-    fc6 = wrapper.dense_add_bias(data=flatten, units=4096, name="fc6")
-    relu6 = relay.nn.relu(data=fc6)
-    drop6 = relay.nn.dropout(data=relu6, rate=0.5)
-    fc7 = wrapper.dense_add_bias(data=drop6, units=4096, name="fc7")
-    relu7 = relay.nn.relu(data=fc7)
-    drop7 = relay.nn.dropout(data=relu7, rate=0.5)
-    fc8 = wrapper.dense_add_bias(data=drop7, units=num_classes, name="fc8")
-    return fc8
-
-
-def get_net(batch_size, image_shape, num_classes, dtype, num_layers=11, batch_norm=False):
-    """
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    image_shape : tuple, optional
-        The input image shape
-
-    num_classes : int, optional
-        Number of claseses
-
-    dtype : str, optional
-        The data type
-
-    num_layers : int
-        Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
-
-    batch_norm : bool, default False
-        Use batch normalization.
-    """
-    vgg_spec = {
-        11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
-        13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
-        16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
-        19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512]),
-    }
-    if num_layers not in vgg_spec:
-        raise ValueError(f"Invalid num_layers {num_layers}. Choices are 11,13,16,19.")
-    layers, filters = vgg_spec[num_layers]
-    data_shape = (batch_size,) + image_shape
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    feature = get_feature(data, layers, filters, batch_norm)
-    classifier = get_classifier(feature, num_classes)
-    symbol = relay.nn.softmax(data=classifier)
-    args = relay.analysis.free_vars(symbol)
-    return relay.Function(args, symbol)
-
-
-def get_workload(
-    batch_size,
-    num_classes=1000,
-    image_shape=(3, 224, 224),
-    dtype="float32",
-    num_layers=11,
-    batch_norm=False,
-):
-    """Get benchmark workload for VGG nets.
-
-    Parameters
-    ----------
-    batch_size : int
-        The batch size used in the model
-
-    num_classes : int, optional
-        Number of claseses
-
-    image_shape : tuple, optional
-        The input image shape
-
-    dtype : str, optional
-        The data type
-
-    num_layers : int
-        Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
-
-    batch_norm : bool
-        Use batch normalization.
-
-    Returns
-    -------
-    mod : tvm.IRModule
-        The relay module that contains a VGG network.
-
-    params : dict of str to NDArray
-        The parameters.
-    """
-    net = get_net(batch_size, image_shape, num_classes, dtype, num_layers, batch_norm)
-    return create_workload(net)
diff --git a/python/tvm/relay/testing/yolo_detection.py b/python/tvm/relay/testing/yolo_detection.py
deleted file mode 100644
index 7a54961891b1..000000000000
--- a/python/tvm/relay/testing/yolo_detection.py
+++ /dev/null
@@ -1,380 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, no-init,
-"""
-Yolo detection boxes helper functions
-====================
-DarkNet helper functions for yolo and image loading.
-This functions will not be loaded by default.
-These are utility functions used for testing and tutorial file.
-"""
-from __future__ import division
-import math
-from collections import namedtuple
-from functools import cmp_to_key
-import numpy as np
-
-Box = namedtuple("Box", ["x", "y", "w", "h"])
-
-
-def nms_comparator(a, b):
-    if "sort_class" in b and b["sort_class"] >= 0:
-        diff = a["prob"][b["sort_class"]] - b["prob"][b["sort_class"]]
-    else:
-        diff = a["objectness"] - b["objectness"]
-    return diff
-
-
-def _correct_boxes(dets, w, h, netw, neth, relative):
-    new_w, new_h = (netw, (h * netw) // w) if (netw / w < neth / h) else ((w * neth // h), neth)
-    for det in dets:
-        b = det["bbox"]
-        b = b._replace(x=(b.x - (netw - new_w) / 2 / netw) / (new_w / netw))
-        b = b._replace(y=(b.y - (neth - new_h) / 2 / neth) / (new_h / neth))
-        b = b._replace(w=b.w * netw / new_w)
-        b = b._replace(h=b.h * neth / new_h)
-        if not relative:
-            b = b._replace(x=b.x * w)
-            b = b._replace(w=b.w * w)
-            b = b._replace(y=b.y * h)
-            b = b._replace(h=b.h * h)
-        det["bbox"] = b
-    return dets
-
-
-def _overlap(x1, w1, x2, w2):
-    l1 = x1 - w1 / 2
-    l2 = x2 - w2 / 2
-    left = l1 if l1 > l2 else l2
-    r1 = x1 + w1 / 2
-    r2 = x2 + w2 / 2
-    right = r1 if r1 < r2 else r2
-    return right - left
-
-
-def _box_intersection(a, b):
-    w = _overlap(a.x, a.w, b.x, b.w)
-    h = _overlap(a.y, a.h, b.y, b.h)
-    if w < 0 or h < 0:
-        return 0
-    return w * h
-
-
-def _box_union(a, b):
-    i = _box_intersection(a, b)
-    u = a.w * a.h + b.w * b.h - i
-    return u
-
-
-def _box_iou(a, b):
-    return _box_intersection(a, b) / _box_union(a, b)
-
-
-def _get_box(data, biases, n, location, lw, lh, w, h):
-    bx = (location[2] + data[location[0]][0][location[1]][location[2]]) / lw
-    by = (location[1] + data[location[0]][1][location[1]][location[2]]) / lh
-    bw = np.exp(data[location[0]][2][location[1]][location[2]]) * biases[2 * n] / w
-    bh = np.exp(data[location[0]][3][location[1]][location[2]]) * biases[2 * n + 1] / h
-    return Box(bx, by, bw, bh)
-
-
-def _get_yolo_detections(l, im_shape, net_shape, thresh, relative, dets):
-    data = l["output"]
-    active_data_loc = np.asarray(np.where(data[:, 4, :, :] > thresh))
-    before_correct_dets = []
-    for i in range(active_data_loc.shape[1]):
-        location = [active_data_loc[0][i], active_data_loc[1][i], active_data_loc[2][i]]
-        box_b = _get_box(
-            data,
-            l["biases"],
-            np.asarray(l["mask"])[location[0]],
-            location,
-            data.shape[3],
-            data.shape[2],
-            net_shape[0],
-            net_shape[1],
-        )
-        objectness = data[location[0]][4][location[1]][location[2]]
-        classes = l["classes"]
-        prob = objectness * data[location[0], 5 : 5 + 1 + classes, location[1], location[2]]
-        prob[prob < thresh] = 0
-        detection = {}
-        detection["bbox"] = box_b
-        detection["classes"] = classes
-        detection["prob"] = prob
-        detection["objectness"] = objectness
-        before_correct_dets.append(detection)
-    dets.extend(
-        _correct_boxes(
-            before_correct_dets, im_shape[0], im_shape[1], net_shape[0], net_shape[1], relative
-        )
-    )
-
-
-def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets):
-    data = l["output"]
-    before_correct_dets = []
-    for row in range(data.shape[2]):
-        for col in range(data.shape[3]):
-            for n in range(data.shape[0]):
-                prob = [0] * l["classes"]
-                scale = data[n, l["coords"], row, col] if not l["background"] else 1
-                location = [n, row, col]
-                box_b = _get_box(
-                    data,
-                    l["biases"],
-                    n,
-                    location,
-                    data.shape[3],
-                    data.shape[2],
-                    data.shape[3],
-                    data.shape[2],
-                )
-                objectness = scale if scale > thresh else 0
-                if objectness:
-                    prob = (
-                        scale * data[n, l["coords"] + 1 : l["coords"] + 1 + l["classes"], row, col]
-                    )
-                    prob[prob < thresh] = 0
-                detection = {}
-                detection["bbox"] = box_b
-                detection["prob"] = prob
-                detection["objectness"] = objectness
-                before_correct_dets.append(detection)
-    _correct_boxes(
-        before_correct_dets, im_shape[0], im_shape[1], net_shape[0], net_shape[1], relative
-    )
-    dets.extend(before_correct_dets)
-
-
-def fill_network_boxes(net_shape, im_shape, thresh, relative, tvm_out):
-    dets = []
-    for layer in tvm_out:
-        if layer["type"] == "Yolo":
-            _get_yolo_detections(layer, im_shape, net_shape, thresh, relative, dets)
-        elif layer["type"] == "Region":
-            _get_region_detections(layer, im_shape, net_shape, thresh, relative, dets)
-    return dets
-
-
-def do_nms_sort(dets, classes, thresh):
-    "Does the sorting based on the threshold values"
-    k = len(dets) - 1
-    cnt = 0
-    while cnt < k:
-        if dets[cnt]["objectness"] == 0:
-            dets[k], dets[cnt] = dets[cnt], dets[k]
-            k = k - 1
-        else:
-            cnt = cnt + 1
-    total = k + 1
-    for k in range(classes):
-        for i in range(total):
-            dets[i]["sort_class"] = k
-        dets[0:total] = sorted(dets[0:total], key=cmp_to_key(nms_comparator), reverse=True)
-        for i in range(total):
-            if dets[i]["prob"][k] == 0:
-                continue
-            a = dets[i]["bbox"]
-            for j in range(i + 1, total):
-                b = dets[j]["bbox"]
-                if _box_iou(a, b) > thresh:
-                    dets[j]["prob"][k] = 0
-
-
-def get_detections(im, det, thresh, names, classes):
-    "Draw the markings around the detected region"
-    labelstr = []
-    category = -1
-    detection = None
-    valid = False
-    for j in range(classes):
-        if det["prob"][j] > thresh:
-            if category == -1:
-                category = j
-            labelstr.append(names[j] + " " + str(round(det["prob"][j], 4)))
-
-    if category > -1:
-        valid = True
-        imc, imh, imw = im.shape
-        width = int(imh * 0.006)
-        offset = category * 123457 % classes
-        red = _get_color(2, offset, classes)
-        green = _get_color(1, offset, classes)
-        blue = _get_color(0, offset, classes)
-        rgb = [red, green, blue]
-        b = det["bbox"]
-        left = int((b.x - b.w / 2.0) * imw)
-        right = int((b.x + b.w / 2.0) * imw)
-        top = int((b.y - b.h / 2.0) * imh)
-        bot = int((b.y + b.h / 2.0) * imh)
-
-        if left < 0:
-            left = 0
-        if right > imw - 1:
-            right = imw - 1
-        if top < 0:
-            top = 0
-        if bot > imh - 1:
-            bot = imh - 1
-
-        detection = {
-            "category": category,
-            "labelstr": labelstr,
-            "left": left,
-            "top": top,
-            "right": right,
-            "bot": bot,
-            "width": width,
-            "rgb": rgb,
-        }
-
-    return valid, detection
-
-
-def draw_detections(font_path, im, dets, thresh, names, classes):
-    "Draw the markings around the detected region"
-    for det in dets:
-        valid, detection = get_detections(im, det, thresh, names, classes)
-        if valid:
-            rgb = detection["rgb"]
-            label = _get_label(font_path, "".join(detection["labelstr"]), rgb)
-            _draw_box_width(
-                im,
-                detection["left"],
-                detection["top"],
-                detection["right"],
-                detection["bot"],
-                detection["width"],
-                rgb[0],
-                rgb[1],
-                rgb[2],
-            )
-            _draw_label(im, detection["top"] + detection["width"], detection["left"], label, rgb)
-
-
-def show_detections(im, dets, thresh, names, classes):
-    "Print the markings and the detected region"
-    for det in dets:
-        valid, detection = get_detections(im, det, thresh, names, classes)
-        if valid:
-            print(
-                "class:{} left:{} top:{} right:{} bottom:{}".format(
-                    detection["labelstr"],
-                    detection["left"],
-                    detection["top"],
-                    detection["right"],
-                    detection["bot"],
-                )
-            )
-
-
-def _get_pixel(im, x, y, c):
-    return im[c][y][x]
-
-
-def _set_pixel(im, x, y, c, val):
-    if x < 0 or y < 0 or c < 0 or x >= im.shape[2] or y >= im.shape[1] or c >= im.shape[0]:
-        return
-    im[c][y][x] = val
-
-
-def _draw_label(im, r, c, label, rgb):
-    w = label.shape[2]
-    h = label.shape[1]
-    if (r - h) >= 0:
-        r = r - h
-
-    for j in range(h):
-        if j < h and (j + r) < im.shape[1]:
-            for i in range(w):
-                if i < w and (i + c) < im.shape[2]:
-                    for k in range(label.shape[0]):
-                        val = _get_pixel(label, i, j, k)
-                        _set_pixel(im, i + c, j + r, k, val)  # rgb[k] * val)
-
-
-def _get_label(font_path, labelstr, rgb):
-    # pylint: disable=import-outside-toplevel
-    from PIL import Image
-    from PIL import ImageDraw
-    from PIL import ImageFont
-
-    text = labelstr
-    textSize = 25
-    colorText = "black"
-    testDraw = ImageDraw.Draw(Image.new("RGB", (1, 1)))
-    font = ImageFont.truetype(font_path, textSize)
-    width = int(testDraw.textlength(labelstr, font=font))
-    height = textSize + 5
-    img = Image.new(
-        "RGB", (width, height), color=(int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255))
-    )
-    d = ImageDraw.Draw(img)
-    d.text((0, 0), text, fill=colorText, font=font)
-    opencvImage = np.divide(np.asarray(img), 255)
-    return opencvImage.transpose(2, 0, 1)
-
-
-def _get_color(c, x, max_value):
-    c = int(c)
-    colors = [[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]
-    ratio = (float(x) / float(max_value)) * 5
-    i = int(math.floor(ratio))
-    j = int(math.ceil(ratio))
-    ratio -= i
-    r = (1 - ratio) * colors[i][c] + ratio * colors[j][c]
-    return r
-
-
-def _draw_box(im, x1, y1, x2, y2, r, g, b):
-    y1 = int(y1)
-    y2 = int(y2)
-    x1 = int(x1)
-    x2 = int(x2)
-    ac, ah, aw = im.shape
-    if x1 < 0:
-        x1 = 0
-    if x1 >= aw:
-        y1 = 0
-    if y1 >= ah:
-        y1 = ah - 1
-    if y2 < 0:
-        y2 = 0
-    if y2 >= ah:
-        y2 = ah - 1
-
-    for i in range(x1, x2):
-        im[0][y1][i] = r
-        im[0][y2][i] = r
-        im[1][y1][i] = g
-        im[1][y2][i] = g
-        im[2][y1][i] = b
-        im[2][y2][i] = b
-
-    for i in range(y1, y2):
-        im[0][i][x1] = r
-        im[0][i][x2] = r
-        im[1][i][x1] = g
-        im[1][i][x2] = g
-        im[2][i][x1] = b
-        im[2][i][x2] = b
-
-
-def _draw_box_width(im, x1, y1, x2, y2, w, r, g, b):
-    for i in range(int(w)):
-        _draw_box(im, x1 + i, y1 + i, x2 - i, y2 - i, r, g, b)
diff --git a/python/tvm/relay/transform/__init__.py b/python/tvm/relay/transform/__init__.py
deleted file mode 100644
index c10b8f8ff3c3..000000000000
--- a/python/tvm/relay/transform/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import, redefined-builtin, invalid-name
-"""The Relay IR namespace containing transformations."""
-# transformation passes
-from .transform import *
-from .recast import recast
-from . import fake_quantization_to_integer, mixed_precision
-from .flexible_shape import FlexibleShapeDispatch
diff --git a/python/tvm/relay/transform/_ffi_api.py b/python/tvm/relay/transform/_ffi_api.py
deleted file mode 100644
index 32c79cb6b2a3..000000000000
--- a/python/tvm/relay/transform/_ffi_api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI APIs for Relay transformation passes."""
-import tvm._ffi
-
-tvm._ffi._init_api("relay._transform", __name__)
diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py
deleted file mode 100644
index 6eef6ff3ffae..000000000000
--- a/python/tvm/relay/transform/fake_quantization_to_integer.py
+++ /dev/null
@@ -1,662 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Relay functions for rewriting fake quantized ops."""
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.ir import TensorAffineType, TupleAffineType
-
-# import to register canonicalization funcs for fq2i
-# pylint: disable=unused-import
-from tvm.relay.qnn.op import canonicalizations
-from tvm.tir import bijective_layout
-
-from ..op import (
-    register_fake_quantization_to_integer,
-    register_optional_fake_quantization_to_integer,
-)
-
-
-def fold_constant(expr):
-    return relay.transform.FoldConstantExpr(expr, tvm.IRModule())
-
-
-def get_zeros(scale):
-    return fold_constant(relay.op.cast(relay.op.zeros_like(scale), "int32"))
-
-
-def infer_shape(expr):
-    return relay.transform.InferType()(tvm.IRModule.from_expr(expr))["main"].body.checked_type.shape
-
-
-def approx_equal(x, y):
-    x = fold_constant(x)
-    y = fold_constant(y)
-    if isinstance(x, relay.Constant) and isinstance(y, relay.Constant):
-        equal = np.allclose(x.data.asnumpy(), y.data.asnumpy())
-    else:
-        equal = tvm.ir.structural_equal(x, y)
-    return equal
-
-
-@register_fake_quantization_to_integer("qnn.dequantize")
-def dequantize(expr, type_map):
-    """Remove dequantize op"""
-    out = expr.args[0]
-    t = type_map[expr]
-    return [out, t]
-
-
-@register_fake_quantization_to_integer("qnn.quantize")
-def quantize(expr, type_map):
-    """Turn a quantize op into requantize or remove it"""
-    out = expr.args[0]
-    t = type_map[out]
-    in_scale = fold_constant(t.scale)
-    in_zero_point = fold_constant(t.zero_point)
-    if not (
-        approx_equal(in_scale, expr.args[1])
-        and approx_equal(in_zero_point, expr.args[2])
-        and tvm.ir.structural_equal(t.dtype, expr.attrs.out_dtype)
-    ):
-        out = relay.qnn.op.requantize(
-            out,
-            in_scale,
-            in_zero_point,
-            expr.args[1],
-            expr.args[2],
-            out_dtype=expr.attrs.out_dtype,
-            axis=t.axis,
-        )
-    return [
-        out,
-        TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype, expr.attrs.axis),
-    ]
-
-
-def register_unary_identity(op_name):
-    def identity(expr, type_map):
-        assert len(expr.args) == 1
-        arg = expr.args[0]
-        t = type_map[arg]
-        return [expr, t]
-
-    return register_fake_quantization_to_integer(op_name, identity)
-
-
-register_unary_identity("reshape")
-register_unary_identity("squeeze")
-register_unary_identity("strided_slice")
-register_unary_identity("transpose")
-register_unary_identity("expand_dims")
-register_unary_identity("nn.max_pool2d")
-register_unary_identity("nn.batch_flatten")
-register_unary_identity("nn.depth_to_space")
-register_unary_identity("max")
-register_unary_identity("min")
-register_unary_identity("image.resize2d")
-
-
-@register_fake_quantization_to_integer("nn.avg_pool2d")
-def avgpool2d(expr, type_map):
-    """Rewrite an avgpool op"""
-    attrs = {**expr.attrs}
-    arg = expr.args[0]
-    t = type_map[arg]
-    out_t = type_map[expr]
-
-    # dq > nn.avg_pool2d > q
-    # Use the same input quantization parameters for output if the pattern is not the above.
-    # Type_map is a map of graphs and their Tensoraffinetypes
-    # Find the current "nn.avg_pool2d" op after checking for the "qnn.quantize" op in the graph.
-    # Structure for .. dq > op > q will be q [op [dq ..
-    def check(y, expr):
-        if isinstance(y, type(expr)):
-            if y.op.name != "nn.avg_pool2d":
-                return True
-            # check if this is the expr avg_pool
-            if y.attrs != expr.attrs:
-                return True
-        return False
-
-    for x in type_map.items():
-        if isinstance(x[0], type(expr)):
-            if x[0].op.name == "qnn.quantize":
-                prev = x[0]
-                y = prev.args[0]
-                while check(y, expr):
-                    prev = y
-                    y = prev.args[0]
-                if (
-                    isinstance(y, type(expr))
-                    and y.op.name == "nn.avg_pool2d"
-                    and y.attrs == expr.attrs
-                ):
-                    if prev.op.name != "qnn.quantize":
-                        out_t = t
-                    break
-
-    out = relay.qnn.op.avg_pool2d(
-        arg,
-        t.scale,
-        t.zero_point,
-        out_t.scale,
-        out_t.zero_point,
-        attrs["pool_size"],
-        attrs["strides"],
-        attrs["padding"],
-        attrs["dilation"],
-        attrs["ceil_mode"],
-        attrs["count_include_pad"],
-        attrs["layout"],
-    )
-
-    return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)]
-
-
-@register_fake_quantization_to_integer("nn.adaptive_avg_pool1d")
-def adaptive_avgpool1d(expr, type_map):
-    """Rewrite an adaptive avgpool op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    out_t = type_map[expr]
-    if not (
-        approx_equal(t.scale, out_t.scale)
-        and approx_equal(t.zero_point, out_t.zero_point)
-        and tvm.ir.structural_equal(t.dtype, out_t.dtype)
-    ):
-        arg = relay.qnn.op.requantize(
-            arg,
-            t.scale,
-            t.zero_point,
-            out_t.scale,
-            out_t.zero_point,
-            out_dtype="int32",
-            axis=t.axis,
-        )
-    else:
-        arg = relay.op.cast(arg, "int32")
-    output_size = expr.attrs.output_size
-    out = relay.op.nn.adaptive_avg_pool1d(arg, output_size)
-    return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)]
-
-
-@register_fake_quantization_to_integer("nn.global_avg_pool2d")
-def global_avgpool2d(expr, type_map):
-    """Rewrite a global_avgpool op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    out_t = type_map[expr]
-    out_t = type_map[expr]
-    if not (
-        approx_equal(t.scale, out_t.scale)
-        and approx_equal(t.zero_point, out_t.zero_point)
-        and tvm.ir.structural_equal(t.dtype, out_t.dtype)
-    ):
-        arg = relay.qnn.op.requantize(
-            arg,
-            t.scale,
-            t.zero_point,
-            out_t.scale,
-            out_t.zero_point,
-            out_dtype="int32",
-            axis=t.axis,
-        )
-    else:
-        arg = relay.op.cast(arg, "int32")
-    out = relay.op.nn.global_avg_pool2d(arg)
-    return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)]
-
-
-@register_fake_quantization_to_integer("broadcast_to")
-def broadcast_to(expr, type_map):
-    """Rewrite a broadcast_to op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    shape = expr.attrs.shape
-    out = relay.op.broadcast_to(arg, shape)
-    return [out, t]
-
-
-@register_fake_quantization_to_integer("nn.bias_add")
-def bias_add(expr, type_map):
-    """Rewrite a bias_add op"""
-    x, b = expr.args
-    x_t = type_map[x]
-    if b in type_map:
-        # Ensure bias matches the previous op
-        b_t = type_map[b]
-        in_scale = fold_constant(x_t.scale)
-        in_zero_point = fold_constant(x_t.zero_point)
-        if not (
-            approx_equal(x_t.scale, b_t.scale)
-            and approx_equal(x_t.zero_point, b_t.zero_point)
-            and tvm.ir.structural_equal(x_t.dtype, b_t.dtype)
-        ):
-            b = relay.qnn.op.requantize(
-                b, b_t.scale, b_t.zero_point, in_scale, in_zero_point, out_dtype=x_t.dtype, axis=0
-            )
-    else:
-        # If the bias is a constant, we need to quantize it
-        assert isinstance(b, relay.expr.Constant)
-        assert b.checked_type.dtype in ["float32", "float64", "float16", "bfloat16"]
-        b = relay.qnn.op.quantize(b, x_t.scale, x_t.zero_point, axis=0, out_dtype=x_t.dtype)
-    out = relay.op.nn.bias_add(x, b, **expr.attrs)
-    return [out, x_t]
-
-
-@register_fake_quantization_to_integer("nn.conv2d")
-def conv2d(expr, type_map):
-    """Rewrite a conv2d op"""
-    attrs = {**expr.attrs}
-    attrs.pop("out_dtype")
-    x, weight = expr.args
-    x_t = type_map[x]
-    w_t = type_map[weight]
-    conv_scale = fold_constant(x_t.scale * w_t.scale)
-    conv_zp = get_zeros(conv_scale)
-    out = relay.qnn.op.conv2d(
-        x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs
-    )
-    out_layout = attrs["out_layout"] if attrs["out_layout"] != "" else attrs["data_layout"]
-    out_axis = bijective_layout(out_layout, "NCHW").backward_index(list(range(4)))[1]
-    return [out, TensorAffineType(conv_scale, conv_zp, out.attrs.out_dtype, out_axis.value)]
-
-
-@register_fake_quantization_to_integer("nn.conv2d_transpose")
-def conv2d_transpose(expr, type_map):
-    """Rewrite a conv2d_transpose op"""
-    attrs = {**expr.attrs}
-    attrs.pop("out_dtype")
-    x, weight = expr.args
-    x_t = type_map[x]
-    w_t = type_map[weight]
-    conv_scale = fold_constant(x_t.scale * w_t.scale)
-    conv_zp = get_zeros(conv_scale)
-
-    out = relay.qnn.op.conv2d_transpose(
-        x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs
-    )
-    out_layout = attrs["out_layout"] if attrs["out_layout"] != "" else attrs["data_layout"]
-    out_axis = bijective_layout(out_layout, "NCHW").backward_index(list(range(4)))[1]
-    return [out, TensorAffineType(conv_scale, conv_zp, out.attrs.out_dtype, out_axis.value)]
-
-
-@register_fake_quantization_to_integer("nn.dense")
-def dense(expr, type_map):
-    """Rewrite a dense op"""
-    attrs = {**expr.attrs}
-    attrs.pop("out_dtype")
-    x, weight = expr.args
-    x_t = type_map[x]
-    w_t = type_map[weight]
-    dense_scale = fold_constant(x_t.scale * w_t.scale)
-    dense_zp = get_zeros(dense_scale)
-    out = relay.qnn.op.dense(
-        x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs
-    )
-    return [out, TensorAffineType(dense_scale, dense_zp, out.attrs.out_dtype, 1)]
-
-
-@register_fake_quantization_to_integer("nn.batch_matmul")
-def batch_matmul(expr, type_map):
-    """Rewrite a batch_matmul op"""
-    x, y = expr.args
-    x_t = type_map[x]
-    y_t = type_map[y]
-    matmul_scale = fold_constant(x_t.scale * y_t.scale)
-    matmul_zp = relay.const(0)
-    out = relay.qnn.op.batch_matmul(x, y, x_t.zero_point, y_t.zero_point, x_t.scale, y_t.scale)
-    return [out, TensorAffineType(matmul_scale, matmul_zp, out.attrs.out_dtype, x_t.axis)]
-
-
-@register_fake_quantization_to_integer("concatenate")
-def concat(expr, type_map):
-    """Rewrite a concat op"""
-    scales = []
-    zps = []
-
-    tuple_type = type_map[expr.args[0]]
-    for t in tuple_type.types:
-        scales.append(t.scale)
-        zps.append(t.zero_point)
-
-    out_type = type_map[expr]
-
-    out = relay.qnn.op.concatenate(
-        expr.args[0],
-        relay.Tuple(scales),
-        relay.Tuple(zps),
-        out_type.scale,
-        out_type.zero_point,
-        **expr.attrs,
-    )
-    return [out, out_type]
-
-
-@register_fake_quantization_to_integer("topk")
-def topk(expr, type_map):
-    """Rewrite a topk op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    attrs = {**expr.attrs}
-    assert "ret_type" in attrs and attrs["ret_type"] == "values"
-    return [expr, t]
-
-
-@register_fake_quantization_to_integer("split")
-def split(expr, type_map):
-    """Rewrite a split op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    attrs = {**expr.attrs}
-    if isinstance(attrs["indices_or_sections"], int):
-        num_split = attrs["indices_or_sections"]
-    else:
-        num_split = len(attrs["indices_or_sections"]) + 1
-    return [expr, TupleAffineType([t] * num_split)]
-
-
-@register_fake_quantization_to_integer("clip")
-def clip(expr, type_map):
-    """Rewrite a clip op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    amin = expr.attrs.a_min
-    amax = expr.attrs.a_max
-    scale = fold_constant(t.scale)
-    z_p = fold_constant(t.zero_point)
-    if (
-        isinstance(scale, relay.expr.Constant)
-        and scale.data.numpy().size == 1
-        and isinstance(z_p, relay.expr.Constant)
-        and z_p.data.numpy().size == 1
-    ):
-        scale = scale.data.numpy().item()
-        z_p = z_p.data.numpy().item()
-        new_min = int(amin / scale + z_p)
-        new_max = int(amax / scale + z_p)
-        out = relay.op.clip(arg, new_min, new_max)
-    else:
-        if not isinstance(amin, relay.expr.Constant):
-            amin = relay.op.const(amin)
-        if not isinstance(amax, relay.expr.Constant):
-            amax = relay.op.const(amax)
-
-        scale_shape = infer_shape(scale)
-        if len(scale_shape) > 0 and scale_shape[0] > 1:
-            b_shape = [1] * len(infer_shape(arg))
-            b_shape[t.axis] = -1
-            amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape), b_shape)
-            amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape), b_shape)
-        amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
-        amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
-        out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)), fold_constant(amax))
-
-    return [out, t]
-
-
-@register_fake_quantization_to_integer("nn.relu")
-def relu(expr, type_map):
-    """Rewrite a relu op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    scale_shape = infer_shape(t.scale)
-    z_p = t.zero_point
-    assert len(scale_shape) <= 1
-    if len(scale_shape) == 1 and scale_shape[0] > 1:
-        b_shape = [1] * len(infer_shape(arg))
-        b_shape[t.axis] = -1
-        z_p = relay.op.reshape(relay.op.broadcast_to(z_p, scale_shape), b_shape)
-    zero = relay.op.cast(z_p, t.dtype)
-    return [relay.op.maximum(arg, fold_constant(zero)), t]
-
-
-@register_fake_quantization_to_integer("nn.leaky_relu")
-def leaky_relu(expr, type_map):
-    """Rewrite a leaky relu op"""
-    arg = expr.args[0]
-    x_t = type_map[arg]
-    out_t = type_map[expr]
-    alpha = expr.attrs.alpha
-    output = relay.qnn.op.leaky_relu(
-        expr, alpha, x_t.scale, x_t.zero_point, out_t.scale, out_t.zero_point
-    )
-    return [output, x_t]
-
-
-@register_fake_quantization_to_integer("nn.pad")
-def pad(expr, type_map):
-    """Rewite an nn.pad op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-    pad_value = expr.args[1]
-    # TF2ONNX will sometimes implement the pad_value as a constant without a quantize
-    # To support that, the pass lets branches that terminate in a constant through
-    if pad_value in type_map:
-        # if the pad value is calcuated from a dequantize op, it should be in the type map
-        # and we need to make sure it's affine type matches the arg
-        pad_t = type_map[pad_value]
-        if not tvm.ir.structural_equal(t, pad_t):
-            pad_value = relay.qnn.op.requantize(
-                pad_value,
-                pad_t.scale,
-                pad_t.zero_point,
-                t.scale,
-                t.zero_point,
-                out_dtype=t.dtype,
-                axis=pad_t.axis,
-            )
-    else:
-        # If the pad-value is a constant, we need to quantize it
-        assert isinstance(pad_value, relay.expr.Constant)
-        assert pad_value.checked_type.dtype in ["float32", "float64", "float16", "bfloat16"]
-        pad_value = relay.qnn.op.quantize(pad_value, t.scale, t.zero_point, out_dtype=t.dtype)
-
-    out = relay.op.nn.pad(arg, pad_value=pad_value, **expr.attrs)
-    return [out, t]
-
-
-@register_fake_quantization_to_integer("mean")
-def mean(expr, type_map):
-    """Rewrite a mean op"""
-    arg = expr.args[0]
-    t = type_map[arg]
-
-    arg = relay.op.cast(arg, "int32")
-    out = relay.op.mean(arg, **expr.attrs)
-    out = relay.op.cast(out, t.dtype)
-    return [out, t]
-
-
-def get_binary_types(expr, type_map):
-    """Get Affine types of a binary op's inputs and unify them"""
-    # Support the case where one input is quantized and the other is a constant float
-    left = expr.args[0]
-    right = expr.args[1]
-    left_t = None
-    right_t = None
-
-    if left in type_map:
-        left_t = type_map[left]
-    if right in type_map:
-        right_t = type_map[right]
-
-    out_t = type_map[expr]
-    if left_t is None and right_t is None:
-        raise TypeError("neither input is quantized!")
-    if left_t is None:
-        assert isinstance(left, relay.expr.Constant)
-        left = relay.qnn.op.quantize(
-            left, right_t.scale, right_t.zero_point, out_dtype=right_t.dtype
-        )
-        left_t = right_t
-    if right_t is None:
-        assert isinstance(right, relay.expr.Constant)
-        right = relay.qnn.op.quantize(
-            right, left_t.scale, left_t.zero_point, out_dtype=left_t.dtype
-        )
-        right_t = left_t
-
-    # Handle the case of mismatched inputs
-    if not left_t.dtype == out_t.dtype:
-        out_t = left_t
-
-    return left, right, left_t, right_t, out_t
-
-
-def register_binary_qnn(op_name, op):
-    """Register a Binary Op that converts to QNN"""
-
-    def binary(expr, type_map):
-        left, right, left_t, right_t, out_t = get_binary_types(expr, type_map)
-
-        if (
-            op_name == "add"
-            and approx_equal(left_t.scale, right_t.scale)
-            and approx_equal(left_t.zero_point, right_t.zero_point)
-            and tvm.ir.structural_equal(left_t.dtype, right_t.dtype)
-            and left_t.dtype == "int32"
-            and approx_equal(left_t.scale, out_t.scale)
-            and approx_equal(left_t.zero_point, out_t.zero_point)
-            and np.all(out_t.zero_point.data.numpy() == 0)
-        ):
-            # If this add op comes after conv2d or dense, out_t.scale and out_t.zero_point
-            # can be a vector, which is not supported by QNN binary operators.
-            # In particular, the pattern of an `add` op following `dense`, where the addition is
-            # really a bias addtion, can come up often. We identify that pattern and convert it to
-            # `qnn.dense` -> `add`.
-            # To avoid overflow, we do this conversion only when the input data type is 32 bit (bias
-            # addition is typically done in 32 bit).
-            return [left + right, left_t]
-
-        assert len(out_t.scale.data.shape) == 0, (
-            f"The output scale needs to be a scalar, but got a tensor of shape "
-            f"{out_t.scale.data.shape}"
-        )
-        assert len(out_t.zero_point.data.shape) == 0, (
-            f"The output zero point needs to be a scalar, but got a tensor of shape "
-            f"{out_t.zero_point.data.shape}"
-        )
-
-        out = op(
-            left,
-            right,
-            left_t.scale,
-            left_t.zero_point,
-            right_t.scale,
-            right_t.zero_point,
-            out_t.scale,
-            out_t.zero_point,
-            left_t.axis,
-            right_t.axis,
-        )
-
-        return [out, out_t]
-
-    return register_fake_quantization_to_integer(op_name, binary)
-
-
-# Use lambdas here to avoid a circular import problem
-# pylint: disable=unnecessary-lambda
-register_binary_qnn("add", lambda *args: relay.qnn.op.add(*args))
-register_binary_qnn("multiply", lambda *args: relay.qnn.op.mul(*args))
-register_binary_qnn("subtract", lambda *args: relay.qnn.op.subtract(*args))
-
-
-def register_binary_identity(op_name, op):
-    """Register a binary op that works directly on int8"""
-
-    def binary(expr, type_map):
-        left, right, left_t, right_t, out_t = get_binary_types(expr, type_map)
-        if left_t != out_t:
-            left = relay.qnn.op.requantize(
-                left,
-                left_t.scale,
-                left_t.zero_point,
-                out_t.scale,
-                out_t.zero_point,
-                out_dtype=out_t.dtype,
-                axis=left_t.axis,
-            )
-
-        if right_t != out_t:
-            right = relay.qnn.op.requantize(
-                right,
-                right_t.scale,
-                right_t.zero_point,
-                out_t.scale,
-                out_t.zero_point,
-                out_dtype=out_t.dtype,
-                axis=right_t.axis,
-            )
-        out = op(left, right)
-        return [out, out_t]
-
-    return register_fake_quantization_to_integer(op_name, binary)
-
-
-register_binary_identity("minimum", relay.op.minimum)
-register_binary_identity("maximum", relay.op.maximum)
-
-
-def register_unary_qnn(op_name, op):
-    """Rewrite a unary op"""
-
-    def unary(expr, type_map):
-        arg = expr.args[0]
-        x_t = type_map[arg]
-        out_t = type_map[expr]
-        out = op(arg, x_t.scale, x_t.zero_point, out_t.scale, out_t.zero_point)
-        return [out, out_t]
-
-    return register_fake_quantization_to_integer(op_name, unary)
-
-
-register_unary_qnn("sqrt", relay.qnn.op.sqrt)
-register_unary_qnn("rsqrt", relay.qnn.op.rsqrt)
-register_unary_qnn("exp", relay.qnn.op.exp)
-register_unary_qnn("erf", relay.qnn.op.erf)
-register_unary_qnn("sigmoid", relay.qnn.op.sigmoid)
-register_unary_qnn("hardswish", relay.qnn.op.hardswish)
-register_unary_qnn("tanh", relay.qnn.op.tanh)
-register_unary_qnn("abs", relay.qnn.op.abs)
-register_unary_qnn("log", relay.qnn.op.log)
-
-
-@register_fake_quantization_to_integer("take")
-def take(expr, type_map):
-    """Rewrite a take op"""
-    arg = expr.args[0]
-    indices = expr.args[1]
-    t = type_map[arg]
-
-    out = relay.op.take(arg, indices, **expr.attrs)
-    return [out, t]
-
-
-@register_optional_fake_quantization_to_integer("nn.softmax")
-def softmax(expr, type_map):
-    """Rewrite a softmax op"""
-    arg = expr.args[0]
-    arg_t = type_map[arg]
-    out_t = type_map[expr]
-
-    out = relay.qnn.op.softmax(
-        arg, arg_t.scale, arg_t.zero_point, out_t.scale, out_t.zero_point, **expr.attrs
-    )
-    return [out, out_t]
diff --git a/python/tvm/relay/transform/flexible_shape.py b/python/tvm/relay/transform/flexible_shape.py
deleted file mode 100644
index c38fde0e704e..000000000000
--- a/python/tvm/relay/transform/flexible_shape.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Relay functions for wrapping a module with flexible shape dispatch."""
-import tvm
-from tvm import relay
-
-
-def override_shape(tensor_type, axis, dim):
-    """Change a dimension in a tensor shape."""
-    # Handle multiple tensors by overriding the shape of each.
-    if isinstance(tensor_type, relay.TupleType):
-        tensor_type = tensor_type.fields
-    else:
-        tensor_type = [tensor_type]
-
-    # Create new tensortypes for each input.
-    new_types = []
-    for t_type in tensor_type:
-        new_dims = list(t_type.shape)
-        new_dims[axis] = dim
-        new_types.append(relay.TensorType(new_dims, t_type.dtype))
-
-    # Dont return a tuple if there is a single tensor.
-    if len(new_types) == 1:
-        return new_types[0]
-    return relay.TupleType(tvm.runtime.convert(new_types))
-
-
-def specialize_body(mod, function, axis, dim, input_indices, affects_output=True):
-    """
-    Create a subgraph to handle specific input shapes
-
-    This function takes in a module and one of it's functions and creates a
-    similar function with a specific input shape. It then attaches the new function
-    to the module. Calling this function multiple times results in a module that
-    contains several similar functions each specialized to a specific input shape.
-    This allows a dispatch handler to be built on top of the module to deal with
-    flexible shapes.
-
-    There are a few modes to this function. When the specialized function has multiple
-    flexible inputs, the index of those inputs must be provided to the input_indices argument.
-    In this case, the axis of the flexible dimension for each of those inputs must be the same.
-
-    By default, this function assumes that the output shape is dependent on the input
-    shape (as is the case in dynamic batching) and will also specialize the output type
-    accordingly. If this is not true, the affects_output argument must be set to False.
-
-    Parameters
-    ----------
-    mod: IRModule
-        The module that contains specialized functions and the dispatcher.
-    function: Function
-        The original non-specialized function that will be transformed.
-    axis: int
-        Which axis the flexible shape is on.
-    dim: int
-        The shape to specialize the new subgraph for along the axis dim.
-    input_indices: List[int]
-        Which inputs should be dispatched dynamically, provided by index. All inputs
-        must share the same dynamic axis.
-    affects_output: Optional[bool]
-        Whether the change in input shape has a corresponding effect on the output shape.
-        Batching for example effects both the input and output whereas changing sequence
-        length in an NLP model typically does not.
-
-    Returns
-    -------
-    gvar : GlobalVar
-        The new variable for the specialized subgraph.
-    spec_types : List[TensorType]
-        A list of the new specialized types for each input in the graph.
-    """
-    # Iterate through specified inputs and construct specialized shapes for each.
-    new_params = list(function.params)
-    data_binding = {}
-    dyn_data_array = []
-    for inp in input_indices:
-        data = function.params[inp]
-        flex_ty = override_shape(data.type_annotation, axis, dim)
-        dyn_data = relay.Var(data.name_hint, type_annotation=flex_ty)
-        new_params[inp] = dyn_data
-        data_binding[data] = dyn_data
-        dyn_data_array.append(dyn_data)
-
-    # Create a new function body for the modified shapes.
-    new_body = relay.expr.bind(function.body, data_binding)
-    # Only change the output shape if the input shape affects it.
-    if affects_output:
-        new_ret_ty = override_shape(function.ret_type, axis, dim)
-    else:
-        new_ret_ty = function.ret_type
-    gvar = relay.GlobalVar("main_" + str(dim))
-    # Add the new function to the main IRModule.
-    mod[gvar] = relay.Function(
-        new_params, new_body, new_ret_ty, function.type_params, function.attrs
-    )
-    return gvar, [d.type_annotation for d in dyn_data_array]
-
-
-def flexible_dispatch(
-    mod, buckets, axis=0, auto_pad=False, pad_value=0, input_indices=None, affects_output=True
-):
-    """
-    Enable inference of multiple shaped inputs in one module.
-
-    This transformation adds a handler around a module that
-    checks input shapes and dispatches to a subgraph specialized
-    to handle the specific shapes of that input. If no exactly matching
-    subgraph is available, the input will be run using full dynamism.
-    For best performance, specify all the sizes the module will
-    be likely to see using the buckets argument.
-
-    By default, this function will dispatch shapes that exactly match one
-    of the buckets to a corresponding subgraph. All non-matching shapes
-    use the same fully dynamic fallback. This can be detrimental to performance
-    for those non-matching shapes. Setting auto_pad to True causes this
-    function to round-up the shape of non-matching inputs to the closest
-    bucket. This allows them to use the tuned kernels of bucket shapes
-    which can improve performance.
-
-    Functions that have multiple inputs sharing a dynamic axis, which
-    is common for batch size or sequence length dynamism, are supported
-    through the input_indices argument.
-
-    Many types of dynamism such as batching affect both the input and output
-    shape, however this is not always the case. If the output shape
-    is independent of the input, the affects_output argument of this
-    function must be set to False.
-
-    Parameters
-    ----------
-    buckets: list[int]
-        The sizes of the input dimension that should be explicitly handled.
-        Each value in buckets will have a corresponding subgraph constructed to
-        handle it.
-    axis: int
-        The dimension of the input that should be made flexible. This will
-        most often be used for the batch dimension.
-    auto_pad: Optional[bool]
-        If True, then padding will be inserted to values that don't match one of
-        the provided buckets.
-    pad_value: Optional[float]
-        When auto_pad is true, padding will be done with this value.
-    input_indices: Optional[List[int]]
-        Which inputs should be dispatched dynamically, provided by index. All inputs
-        must share the same dynamic axis.
-    affects_output: Optional[bool]
-        Whether the change in input shape has a corresponding effect on the output shape.
-        Batching for example effects both the input and output whereas changing sequence
-        length in an NLP model typically does not.
-
-    Returns
-    -------
-    mod : IRModule
-        The new module wrapped with a flexible shape dispatch handler.
-    """
-    main_fn = mod["main"]
-
-    # Default to single input if not specified.
-    if input_indices is None:
-        input_indices = [0]
-
-    # Extract all input data and create a new dynamic variable for each.
-    data = []
-    dyn_data = []
-    for i in input_indices:
-        data.append(main_fn.params[i])
-        dyn_shape = override_shape(data[i].type_annotation, axis, relay.Any())
-        dyn_data.append(relay.Var(data[i].name_hint, type_annotation=dyn_shape))
-
-    # Extract the dynamic shape value from one of the inputs.
-    rt_sh = relay.op.shape_of(dyn_data[0])
-    flex_value = relay.op.take(rt_sh, relay.const(axis))
-
-    if_exprs = []
-
-    for i, bucket in enumerate(buckets):
-        input_data = dyn_data
-        check_dim = flex_value
-
-        # Apply automatic padding if specified.
-        if auto_pad:
-            input_data = []
-            # Construct padding expression for inputs.
-            for j, inp in enumerate(dyn_data):
-                pad_width = relay.const(bucket) - flex_value
-                rank = len(data[j].type_annotation.shape)
-                pads = relay.zeros([rank, 2], "int32")
-                pads = relay.scatter_nd(pads, relay.const([axis, 1]), pad_width)
-                padded_value = relay.nn.pad(inp, pads, pad_value)
-
-                # Determine if this is the proper bucket to pad to. Do this by checking if the
-                # input shape is between this bucket and the previous.
-                if i == 0:
-                    padded_value = relay.If(
-                        relay.op.less_equal(flex_value, relay.const(bucket)), padded_value, inp
-                    )
-                else:
-                    padded_value = relay.If(
-                        relay.op.logical_and(
-                            relay.op.less_equal(flex_value, relay.const(bucket)),
-                            relay.op.greater(flex_value, relay.const(buckets[i - 1])),
-                        ),
-                        padded_value,
-                        inp,
-                    )
-                # Update input value and test dimension to reflect possible padding.
-                input_data.append(padded_value)
-            # Grab the new possibly padded shape for checking bucket size.
-            check_dim = relay.op.take(relay.op.shape_of(input_data[0]), relay.const(axis))
-
-        # Create a specialized subgraph for the current bucket.
-        spec_call, spec_ty = specialize_body(
-            mod, main_fn, axis, bucket, input_indices=input_indices, affects_output=affects_output
-        )
-        # Apply hard casting to shape to create statically typed graphs.
-        spec_data = []
-        for j, inp in enumerate(input_data):
-            spec_data.append(relay.op.reshape(inp, spec_ty[j].shape))
-
-        # Create a dispatch statement for the current specialized graph.
-        call_args = list(main_fn.params)
-        for j, inp in enumerate(input_indices):
-            call_args[inp] = spec_data[j]
-        new_call = spec_call(*call_args)
-
-        # Remove meaningless padded outputs if applicable.
-        if auto_pad and affects_output:
-            new_call = relay.take(
-                new_call,
-                relay.arange(start=relay.const(0), stop=flex_value, dtype="int32"),
-                axis=axis,
-            )
-
-        # Add this new case to the dispatch handler.
-        if_exprs.append((relay.op.equal(check_dim, relay.const(bucket)), new_call))
-
-    # Create a subgraph to handle all other shapes.
-    default_dyn_call, _ = specialize_body(
-        mod, main_fn, axis, relay.Any(), input_indices=input_indices, affects_output=affects_output
-    )
-    call_args = list(main_fn.params)
-    for j, inp in enumerate(input_indices):
-        call_args[inp] = dyn_data[j]
-    new_body = default_dyn_call(*call_args)
-
-    # Create an If chain to dispatch shapes to the appropriate specialized subgraph.
-    for cond, true_branch in if_exprs:
-        new_body = relay.If(cond, true_branch, new_body)
-
-    # Assign new parameters to the function.
-    new_params = list(main_fn.params)
-    for j, inp in enumerate(input_indices):
-        new_params[inp] = dyn_data[j]
-
-    # Update the output shape to be dynamic if needed.
-    if affects_output:
-        dyn_ret_type = override_shape(main_fn.ret_type, axis, relay.Any())
-    else:
-        dyn_ret_type = main_fn.ret_type
-
-    # Assign the handler as the new entrypoint in the module.
-    new_main = relay.Function(
-        new_params, new_body, dyn_ret_type, main_fn.type_params, main_fn.attrs
-    )
-    mod["main"] = new_main
-    # Do type inference to make sure everything worked.
-    mod = relay.transform.InferType()(mod)
-    return mod
-
-
-class FlexibleShapeDispatch(object):
-    """Enable inference of multiple shaped inputs in one module.
-
-    This transformation adds a handler around a module that
-    checks input shapes and dispatches to a subgraph specialized
-    to handle the specific shapes of that input. If no exactly matching
-    subgraph is available, the input will be run using full dynamism.
-    For best performance, specify all the sizes the module will
-    be likely to see using the buckets argument.
-
-    By default, this pass will dispatch shapes that exactly match one
-    of the buckets to a corresponding subgraph. All non-matching shapes
-    use the same fully dynamic fallback. This can be detrimental to performance
-    for those non-matching shapes. Setting auto_pad to True causes this
-    pass to round-up the shape of non-matching inputs to the closest
-    bucket. This allows them to use the tuned kernels of bucket shapes
-    which can improve performance.
-
-    Models that have multiple inputs sharing a dynamic axis, which
-    is common for batch size or sequence length dynamism, are supported
-    through the input_indices argument.
-
-    Many types of dynamism such as batching affect both the input and output
-    shape, however this is not always the case. If the output shape
-    is independent of the input, the affects_output argument of this
-    pass must be set to False.
-
-    Parameters
-    ----------
-    buckets: list[int]
-        The sizes of the input dimension that should be explicitly handled.
-        Each value in buckets will have a corresponding subgraph constructed to
-        handle it.
-    axis: int
-        The dimension of the input that should be made flexible. This will
-        most often be used for the batch dimension.
-    auto_pad: Optional[bool]
-        If True, then padding will be inserted to values that don't match one of
-        the provided buckets.
-    pad_value: Optional[float]
-        When auto_pad is true, padding will be done with this value.
-    input_indices: Optional[List[int]]
-        Which inputs should be dispatched dynamically, provided by index. All inputs
-        must share the same dynamic axis.
-    affects_output: Optional[bool]
-        Whether the change in input shape has a corresponding effect on the output shape.
-        Batching for example effects both the input and output whereas changing sequence
-        length in an NLP model typically does not.
-
-    Returns
-    -------
-    ret : FlexibleShapeDispatch
-        A pass that can be applied to a module to add flexible shape handling.
-    """
-
-    def __init__(
-        self,
-        buckets,
-        axis=0,
-        auto_pad=False,
-        pad_value=0,
-        input_indices=None,
-        affects_output=True,
-    ):
-        self.axis = axis
-        self.buckets = buckets
-        self.auto_pad = auto_pad
-        self.pad_value = pad_value
-        self.input_indices = input_indices
-        self.affects_output = affects_output
-        super(FlexibleShapeDispatch, self).__init__()
-
-    def __call__(self, mod):
-        # Shape information is required for this pass.
-        mod = relay.transform.InferType()(mod)
-        return flexible_dispatch(
-            mod,
-            self.buckets,
-            self.axis,
-            self.auto_pad,
-            self.pad_value,
-            self.input_indices,
-            self.affects_output,
-        )
diff --git a/python/tvm/relay/transform/infer_layout_utils.py b/python/tvm/relay/transform/infer_layout_utils.py
deleted file mode 100644
index 2dc0d25e2dcd..000000000000
--- a/python/tvm/relay/transform/infer_layout_utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, missing-docstring, unused-import
-"""
-Relay infer correct layout pass.
-"""
-import tvm
-from tvm.runtime import Object
-from . import _ffi_api
-
-
-@tvm._ffi.register_object("relay._transform.InferCorrectLayoutOutput")
-class InferCorrectLayoutOutput(Object):
-    """An output structure to hold results from FInferCorrectLayout calls."""
-
-    def __init__(self, input_layouts, output_layouts, new_attrs):
-        self.__init_handle_by_constructor__(
-            _ffi_api.InferCorrectLayoutOutput, input_layouts, output_layouts, new_attrs
-        )
diff --git a/python/tvm/relay/transform/memory_plan.py b/python/tvm/relay/transform/memory_plan.py
deleted file mode 100644
index 814adb2b4ff1..000000000000
--- a/python/tvm/relay/transform/memory_plan.py
+++ /dev/null
@@ -1,376 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return,invalid-name,len-as-condition,too-many-nested-blocks
-"""
-A pass for manifesting explicit memory allocations.
-"""
-from typing import Optional, Dict, List, Tuple
-from collections import defaultdict
-from dataclasses import dataclass
-
-from ..expr_functor import ExprMutator
-from .. import op, expr
-from ..function import Function
-from ... import register_func, ir, cpu
-from ..._ffi.runtime_ctypes import Device
-from ... import IRModule
-from .. import transform
-from . import function_pass
-
-
-def is_primitive(call):
-    return (
-        hasattr(call, "op")
-        and hasattr(call.op, "attrs")
-        and hasattr(call.op.attrs, "Primitive")
-        and int(call.op.attrs.Primitive) == 1
-    )
-
-
-@dataclass
-class Region:
-    """
-    Represents a control-free allocation region.
-
-    The below pass groups sets of allocations into regions,
-    then replaces the region with a single allocation.
-    """
-
-    var: expr.Var
-    size: expr.Expr
-    alignment: Optional[expr.Expr]
-    dtype: Optional[str]
-    device: Device
-    offsets: Dict[expr.Var, Tuple[expr.Expr, expr.Expr]]
-
-    @staticmethod
-    def empty(region_no):
-        zero = expr.const(0, dtype="int64")
-        assert len(zero.data.shape) == 0
-        region_var = expr.var(f"region{region_no}")
-        return Region(region_var, zero, None, None, None, {})
-
-    def grow(
-        self,
-        old_storage: expr.Var,
-        size: expr.Expr,
-        alignment: expr.Expr,
-        dev: Device,
-        dtype: str,
-    ) -> None:
-        """Grow the region by a given allocation as well as track the old storage
-        for later rewriting the program to use the allocated region.
-        """
-        if self.dtype:
-            assert self.dtype == dtype, "must have matching dtypes in a region"
-        else:
-            self.dtype = dtype
-
-        if self.alignment:
-            assert ir.structural_equal(
-                self.alignment, alignment
-            ), "must have matching alignments in a region"
-        else:
-            self.alignment = alignment
-
-        if self.device:
-            assert (
-                self.device.device_type == dev.device_type
-                and self.device.device_id == dev.device_id
-            ), "must have matching device"
-        else:
-            assert dev
-            self.device = dev
-
-        new_size = (
-            (size + self.alignment - expr.const(1, "int64")) / self.alignment * self.alignment
-        )
-
-        # Record the offset at which we allocate the storage.
-        offset_var: expr.RelayExpr = expr.var(f"offset{len(self.offsets)}")
-        self.offsets[old_storage] = (offset_var, self.size)
-
-        self.size = self.size + new_size
-
-    def offset_for(self, alloc: expr.Expr) -> expr.Expr:
-        return self.offsets.get(alloc, [None])[0]
-
-    def to_expr(self, body: expr.Expr) -> expr.Expr:
-        """
-        Generate the prelude code for a region, wrapping the body in it.
-
-        The prelude contains the single allocation for a region, and
-        all offset computations.
-        """
-
-        if self.device is None:
-            self.device = cpu(0)
-
-        # Generate bindings for each and every size computation
-        # we must do this to maintain ANF.
-        bindings: List[Tuple[expr.Expr, expr.Expr]] = []
-
-        # First compute the total size.
-        total_size = expr.var(f"total_size{hash(body)}")
-        bindings.append((total_size, self.size))
-
-        # Allocate the entire region with a single call.
-        alloc = op.memory.alloc_storage(total_size, self.alignment, self.device, self.dtype)
-        bindings.append((self.var, alloc))
-
-        # Generate variables which contain all of the offset math.
-        # Ensure we constant evaluate away all the math here.
-        #
-        # In theory we can support dynamic offsets but this
-        # requires another round of memory planning and
-        # potentially colaescing.
-        for alloc in self.offsets:
-            (var, offset) = self.offsets[alloc]
-            bindings.append((var, offset))
-
-        body = mk_let(bindings, body)
-        return body
-
-
-def iterative_let(let, each_binding, kont):
-    bindings = []
-    while isinstance(let, expr.Let):
-        lhs = let.var
-        rhs = let.value
-        bindings.append(each_binding(lhs, rhs))
-        let = let.body
-
-    return kont(bindings, let)
-
-
-def mk_let(bindings, body):
-    for var, value in reversed(bindings):
-        assert var
-        assert value
-        assert body
-        body = expr.Let(var, value, body)
-
-    return body
-
-
-def const_eval(mod, exp):
-    mod = IRModule.from_expr(exp, type_defs=mod.type_definitions)
-    mod = transform.FoldConstant()(mod)
-    return mod["main"]
-
-
-class StorageCoalesce(ExprMutator):
-    """
-    A pass for coalescing allocations into region/arena allocations.
-
-    After this pass each allocation comes from the same backing storage,
-    but will never overlap even in time, i.e. the allocations are just
-    packed into a contiguous block of memory.
-
-    A secondary part of memory planning will perform liveness analysis to
-    overlap these in time, i.e when an early tensor dies we will attempt
-    to reuse its slot.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.regions = []
-
-    def enter_scope(self) -> None:
-        region_no = len(self.regions)
-        self.regions.append(defaultdict(lambda: Region.empty(region_no)))
-
-    def exit_scope(self, body: expr.Expr) -> expr.Expr:
-        """When leaving a scope build a region allocation for the scope."""
-        dtype_region = self.regions.pop()
-        for _, region in reversed(list(dtype_region.items())):
-            if len(region.offsets) != 0:
-                body = region.to_expr(body)
-
-        return body
-
-    def current_region(self, dtype) -> Region:
-        current_scope = self.regions[-1]
-        return current_scope[dtype]
-
-    def new_region_and_offset(self, old_storage):
-        for dtype_region in reversed(self.regions):
-            for dtype in dtype_region:
-                region = dtype_region[dtype]
-                offset = region.offset_for(old_storage)
-                if offset:
-                    return region, offset
-
-        raise Exception("could not find offset in any valid region")
-
-    def visit_function(self, fn):
-        """Transform the function body to use region allocation scheme."""
-        func = fn
-        if getattr(func.attrs, "Primitive", 0) == 1:
-            return super().visit_function(func)
-        else:
-            self.enter_scope()
-            body = self.visit(func.body)
-            body = self.exit_scope(body)
-            return Function(
-                func.params,
-                body,
-                func.ret_type,
-                func.type_params,
-                func.attrs,
-            )
-
-    def visit_if(self, ite):
-        self.enter_scope()
-        true_branch = self.visit(ite.true_branch)
-        true_branch = self.exit_scope(true_branch)
-
-        self.enter_scope()
-        false_branch = self.visit(ite.false_branch)
-        false_branch = self.exit_scope(false_branch)
-
-        return expr.If(ite.cond, true_branch, false_branch)
-
-    def mk_let(self, dynamic_regions):
-        """Let bind the dynamic regions"""
-
-        def _mk_let(bindings, body):
-            for var, value in reversed(bindings):
-                assert var
-                assert value is not None
-                assert body
-                body = expr.Let(var, value, body)
-                if var in dynamic_regions:
-                    body = self.exit_scope(body)
-
-            return body
-
-        return _mk_let
-
-    def visit_let(self, let):
-        dynamic_regions = []
-
-        def _each_binding(lhs, rhs):
-            if isinstance(rhs, expr.Call) and rhs.op == op.op.get("memory.alloc_storage"):
-                return self.process_alloc_storage(dynamic_regions, lhs, rhs)
-            elif isinstance(rhs, expr.Call) and rhs.op == op.op.get("memory.alloc_tensor"):
-                return self.process_alloc_tensor(lhs, rhs)
-            else:
-                return lhs, rhs
-
-        result = iterative_let(let, _each_binding, self.mk_let(dynamic_regions))
-        assert result
-        return result
-
-    def process_alloc_storage(self, dynamic_regions, lhs, call):
-        """Process alloc_storage"""
-        size, alignment = call.args
-        dtype = call.attrs.dtype
-        dev = Device(call.attrs.device_type, call.attrs.device_id)
-
-        if not isinstance(size, expr.Constant):
-            self.enter_scope()
-            dynamic_regions.append(lhs)
-        else:
-            # A new scope is created when entering a new region with different
-            # device.
-            region = self.current_region(dtype)
-            if region.device and region.device.device_type != dev.device_type:
-                self.enter_scope()
-                dynamic_regions.append(lhs)
-
-        region = self.current_region(dtype)
-        region.grow(lhs, size, alignment, dev, dtype)
-        return lhs, region.var
-
-    def process_alloc_tensor(self, lhs, call):
-        """Process alloc tensor. Region and offset are computed"""
-        storage, old_offset, shape = call.args
-        region, offset = self.new_region_and_offset(storage)
-
-        assert old_offset.data.numpy().item() == 0, "no offsets should yet be allocated"
-        return (
-            lhs,
-            expr.Call(call.op, [region.var, offset, shape], call.attrs),
-        )
-
-
-class LiftConst(ExprMutator):
-    """An internal pass to lift constants to the top level of function."""
-
-    def __init__(self):
-        self.i = 0
-        self.constants = []
-        self.top_level = True
-        super().__init__()
-
-    def visit_constant(self, const):
-        var = expr.var(f"const{self.i}")
-        self.i += 1
-        self.constants.append((var, const))
-        return var
-
-    def visit_function(self, fn):
-        if int(getattr(fn.attrs, "Primitive", 0)) == 1:
-            return fn
-
-        outer_constant = self.constants
-        self.constants = []
-        # Populates self.constants.
-        body = self.visit(fn.body)
-        body = mk_let(self.constants, body)
-        self.constants = outer_constant
-
-        return Function(fn.params, body, fn.ret_type, fn.type_params, fn.attrs)
-
-    def visit_let(self, let):
-        bindings = []
-        while isinstance(let, expr.Let):
-            new_var = self.visit(let.var)
-            new_val = self.visit(let.value)
-            bindings.append((new_var, new_val))
-            let = let.body
-
-        new_body = self.visit(let)
-        return mk_let(bindings, new_body)
-
-
-@function_pass(opt_level=0)
-class MemoryPlan:
-    """An explicit pass wrapper around StorageCoalesce."""
-
-    def transform_function(self, func, mod, _):
-        mod.import_from_std("core.rly")
-        sc = StorageCoalesce()
-        func = sc.visit(func)
-        return func
-
-
-register_func("relay.transform.MemoryPlan", MemoryPlan)
-
-
-@function_pass(opt_level=0)
-class LiftConstants:
-    """An explicit pass wrapper around LiftConst."""
-
-    def transform_function(self, func, mod, _):
-        mod.import_from_std("core.rly")
-        func = LiftConst().visit(func)
-        return func
-
-
-register_func("relay.transform.LiftConstants", LiftConstants)
diff --git a/python/tvm/relay/transform/mixed_precision.py b/python/tvm/relay/transform/mixed_precision.py
deleted file mode 100644
index f6bb8b815085..000000000000
--- a/python/tvm/relay/transform/mixed_precision.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=line-too-long,unused-argument
-"""Default behavior for ops in mixed_precision pass. Import this file to use."""
-from typing import List
-
-from tvm.relay.op import register_mixed_precision_conversion
-
-# MIXED_PRECISION_ALWAYS ops should always be done in lower precision due to the speed and memory
-# savings. MIXED_PRECISION_FOLLOW ops can be done in lower precision but don't have speedups to
-# justify a cast. MIXED_PRECISION_NEVER colored ops should not be done in lower precision due to
-# numerical reasons.
-MIXED_PRECISION_ALWAYS = 0
-MIXED_PRECISION_FOLLOW = 1
-MIXED_PRECISION_NEVER = 2
-
-# Default lists inspired from TF's classifications:
-# github.com/tensorflow/tensorflow/blob/v2.5.0/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h
-# They have a bias toward Nvidia Tensor Cores so modify lists per your hardware choice.
-DEFAULT_ALWAYS_LIST = [
-    "nn.conv1d",
-    "nn.conv2d",
-    "nn.conv3d",
-    "nn.conv1d_transpose",
-    "nn.conv2d_transpose",
-    "nn.conv3d_transpose",
-    "nn.dense",
-    "nn.batch_matmul",
-]
-DEFAULT_FOLLOW_LIST = [
-    # These ops add new data or change shape
-    "nn.pad",
-    "nn.batch_flatten",
-    "concatenate",
-    "zeros",
-    "split",
-    "squeeze",
-    "transpose",
-    "expand_dims",
-    "reshape",
-    "dyn.reshape",
-    "broadcast_to_like",
-    "dyn.broadcast_to",
-    "strided_slice",
-    "dyn.strided_slice",
-    "take",
-    "argwhere",
-    "where",
-    "tile",
-    "dyn.tile",
-    "scatter",
-    "scatter_elements",
-    "scatter_nd",
-    "full",
-    "dyn.full",
-    "nn.depth_to_space",
-    # Comparison
-    "less",
-    "greater",
-    "less_equal",
-    "greater_equal",
-    # By definition copy and cast will depend on inputs for output.
-    "copy",
-    "cast",
-    "cast_like",
-    # Simple arithmetic
-    "add",
-    "subtract",
-    "multiply",
-    "divide",
-    "nn.bias_add",
-    "nn.batch_norm",
-    "sqrt",
-    "shape_of",
-    # Simple activations
-    "max",
-    "min",
-    "maximum",
-    "minimum",
-    "argmax",
-    "argmin",
-    "nn.relu",
-    "nn.leaky_relu",
-    "nn.prelu",
-    "nn.dropout",
-    # Complicated activations which saturate in a narrow range
-    "sigmoid",
-    "tanh",
-    "fast_tanh",  # Some coefficients outside of representable range, but probably ok
-    "fast_exp",
-    "fast_erf",
-    "clip",  # Usually safe, may result in oddity if clip greater than fp16 range
-    # Pooling operations
-    "nn.max_pool1d",
-    "nn.max_pool2d",
-    "nn.max_pool3d",
-    "nn.avg_pool1d",
-    "nn.avg_pool2d",
-    "nn.avg_pool3d",
-    # "nn.global_max_pool1d", # does not exist yet
-    "nn.global_max_pool2d",
-    # "nn.global_max_pool3d", # does not exist yet
-    "nn.adaptive_max_pool1d",
-    "nn.adaptive_max_pool2d",
-    "nn.adaptive_max_pool3d",
-    "image.resize2d",
-]
-DEFAULT_NEVER_LIST = [
-    # In general if |f(x)| >> |x| for expected inputs then put the op here.
-    "exp",
-    "power",
-    "nn.cross_entropy",
-    "nn.cross_entropy_with_logits",
-    "nn.softmax",
-    "nn.l2_normalize",
-    # Error function doesn't seem to be able to be lowered into fp16 version in llvm.
-    # Move to follow list when it does.
-    "erf",
-    # Do not allow arange arguments (begin/end) to be fp16. "end" can be a big fp32 number
-    # not representable in fp16.
-    "arange",
-    # Ops that could involve a large summation are not allowed in fp16.
-    "nn.global_avg_pool2d",
-    "nn.adaptive_avg_pool1d",
-    "nn.adaptive_avg_pool2d",
-    "nn.adaptive_avg_pool3d",
-    "sum",
-    "mean",
-    "variance",
-    "nn.layer_norm",
-]
-
-# Returns a decorator which registers for every given op, the function under FTVMMixedPrecisionConversionType
-def register_func_to_op_list(list_ops: List):
-    def decorator(func):
-        for op_name in list_ops:
-            register_mixed_precision_conversion(op_name, func=func)
-
-    return decorator
-
-
-def get_generic_out_dtypes(call_node: "relay.Call", mixed_precision_type: str) -> List[str]:
-    """A function which returns output dtypes in a way which works for most ops.
-
-    Parameters
-    ---------
-    call_node: relay.Call
-        The call node containing the op.
-    mixed_precision_type: str
-        The target type to run the operation in.
-    Returns
-    -------
-    output_dtypes : [str, str]
-        A list of two strings. The first represents the datatype used for accumulation
-        in the operation. The second represents the actual output datatype.
-    """
-    # Assume support accumulation dtypes <---> has out_dtype attr.
-    # This is because there is no better way right now to tell which ops support accumulating
-    # at different data types.
-    # Some discussion here about making this better is here:
-    # https://discuss.tvm.apache.org/t/rfc-relay-fp32-fp16-model-support/9994/4?u=andrewzhaoluo
-    if hasattr(call_node.attrs, "out_dtype"):
-        # TODO (AndrewZhaoLuo): evaluate consistent support for mixed_type accumulators
-        # return ["float32", mixed_precision_type]
-        return [mixed_precision_type, mixed_precision_type]
-
-    # [accumulation_dtype, output_dtype] for the operations
-    return [mixed_precision_type, mixed_precision_type]
-
-
-# Functions for FTVMMixedPrecisionConversionType which
-# Take in CallNodes and a DType and returns a conversion type,
-# an accumulation dtype, and an output_dtype.
-@register_func_to_op_list(list_ops=DEFAULT_ALWAYS_LIST)
-def generic_always_op(call_node: "relay.Call", mixed_precision_type: str) -> List:
-    return [MIXED_PRECISION_ALWAYS] + get_generic_out_dtypes(call_node, mixed_precision_type)
-
-
-@register_func_to_op_list(list_ops=DEFAULT_FOLLOW_LIST)
-def generic_follow_op(call_node: "relay.Call", mixed_precision_type: str) -> List:
-    return [MIXED_PRECISION_FOLLOW] + get_generic_out_dtypes(call_node, mixed_precision_type)
-
-
-@register_func_to_op_list(list_ops=DEFAULT_NEVER_LIST)
-def generic_never_op(call_node: "relay.Call", mixed_precision_type: str) -> List:
-    return [MIXED_PRECISION_NEVER] + get_generic_out_dtypes(call_node, mixed_precision_type)
diff --git a/python/tvm/relay/transform/recast.py b/python/tvm/relay/transform/recast.py
deleted file mode 100644
index 39f07b2eb926..000000000000
--- a/python/tvm/relay/transform/recast.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=use-list-literal
-"""Relay type recasting pass"""
-import tvm
-from tvm import relay
-from tvm.ir import IRModule
-from .transform import InferType
-from ..analysis import count_layers
-from ..expr_functor import ExprMutator, Call
-
-
-class RecastMutator(ExprMutator):
-    """Cast operations to the target type."""
-
-    def __init__(self, dtype, out_dtype, valid_ops, valid_op_count, skip_layers):
-        self.dtype = dtype
-        self.out_dtype = out_dtype
-        self.depth_count = 0
-        self.valid_ops = [relay.op.get(op) for op in valid_ops]
-        self.valid_op_count = valid_op_count
-        self.skip_layers = skip_layers
-        # Convert negative indices to positive ones.
-        for i, layer in enumerate(skip_layers):
-            if layer < 0:
-                skip_layers[i] = self.valid_op_count + layer
-        super().__init__()
-
-    def visit_call(self, call):
-        # Keep track of our current depth and layer count
-        # so we can know whether to skip this layer or not.
-        current_depth = self.depth_count
-        current_layer = self.valid_op_count - current_depth - 1
-        if call.op in self.valid_ops:
-            self.depth_count += 1
-        # Visit current call operation
-        new_fn = self.visit(call.op)
-        # Visit current arguments
-        args = []
-        for arg in call.args:
-            args.append(self.visit(arg))
-            self.depth_count = current_depth
-
-        # Downcast this op if its the correct type and not skipped.
-        if call.op in self.valid_ops and current_layer not in self.skip_layers:
-            # Recast inputs to specified type.
-            if call.op == relay.op.get("concatenate"):
-                if len(call.args) != 1 or not isinstance(call.args[0], relay.expr.Tuple):
-                    return Call(new_fn, args, call.attrs)
-
-                tuple_args = [self.visit(arg) for arg in call.args[0].fields]
-                new_args = list()
-                for arg in tuple_args:
-                    new_args.append(relay.cast(arg, dtype=self.dtype))
-                new_args = [relay.expr.Tuple(new_args)]
-            else:
-                args = [self.visit(arg) for arg in call.args]
-                new_args = list()
-                for arg in args:
-                    new_args.append(relay.cast(arg, dtype=self.dtype))
-
-            # If out_dtype is in the attributes, we need to update it.
-            orig_dtype = None
-            if call.attrs is not None and "out_dtype" in call.attrs.keys():
-                new_attr_dict = {}
-                for attr in call.attrs.keys():
-                    attr_value = call.attrs[attr]
-                    if isinstance(attr_value, tvm.ir.container.Array):
-                        attr_value = tuple(attr_value)
-                    new_attr_dict[str(attr)] = attr_value
-                new_attr_dict["out_dtype"] = self.out_dtype
-                attr_type = str(call.attrs).split("(")[0]
-                new_attrs = tvm.ir.make_node(attr_type, **new_attr_dict)
-                if call.attrs["out_dtype"] != "":
-                    orig_dtype = call.attrs["out_dtype"]
-            else:
-                new_attrs = call.attrs
-
-            if orig_dtype is None:
-                # Perform type inference to determine the original type.
-                new_mod = IRModule.from_expr(call)
-                new_mod = InferType()(new_mod)
-                checked_arg = new_mod["main"].body
-                orig_dtype = checked_arg.checked_type.dtype
-            # Recast the output for compatibility with other graph operations.
-            return relay.cast(Call(new_fn, new_args, new_attrs), orig_dtype)
-
-        # Otherwise return the unchanged call.
-        return Call(new_fn, args, call.attrs)
-
-
-def recast(expr, dtype, out_dtype, ops=None, skip_layers=None):
-    """Convert the types of operations in a graph to a new value.
-    Note that this is primarily useful for testing performance of individual
-    operations at the new datatype. In a real setting, this pass will
-    almost certainly do a poor job converting from one datatype to another
-    as it just applies hard casting. For example, when recasting from float
-    to integer, many small values will simply be set to 0. Although this will
-    allow autotuning and benchmarking to produce proper timings at the new
-    data type, the output of the model will of course be heavily impacted.
-
-    Parameters
-    ---------
-    expr: tvm.relay.Expr, tvm.relay.Function, or tvm.ir.IRModule
-        The original function that will have its type changed.
-    dtype: str
-        The target type to cast to.
-    out_dtype: str
-        The output type to cast to.
-    ops: List[str]
-        A list of operations that should have their type changed,
-        others will be left as is.
-    skip_layers: List[int]
-        A list of integers indicating operations that should
-        not have their type changed, counted starting with the
-        first valid operation encountered. Negative indices are
-        allowed and indicate starting at the last layer.
-    Returns
-    -------
-    output_expr : tvm.relay.Expr, tvm.relay.Function, or tvm.ir.IRModule
-        The graph after recasting to the specified datatype.
-    """
-    return_mod = False
-    if isinstance(expr, tvm.ir.IRModule):
-        expr = expr["main"]
-        return_mod = True
-    if ops is None:
-        ops = ["nn.conv2d"]
-    if skip_layers is None:
-        skip_layers = []
-    layer_depth = count_layers(expr, ops)
-    recast_pass = RecastMutator(dtype, out_dtype, ops, layer_depth, skip_layers)
-    expr = recast_pass.visit(expr)
-    if return_mod:
-        return tvm.IRModule.from_expr(expr)
-    return expr
diff --git a/python/tvm/relay/transform/suffixes.py b/python/tvm/relay/transform/suffixes.py
deleted file mode 100644
index eaca11607020..000000000000
--- a/python/tvm/relay/transform/suffixes.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"Add suffix to the relay.Call's span fields"
-from collections import defaultdict
-
-import tvm
-
-from ..expr_functor import ExprMutator
-from .. import expr as _expr
-
-SUFFIX_STRING = r"_PART_"
-
-
-class _SuffixTagger(ExprMutator):
-    """A pass to traverse the Relay graph to add suffix to the call's span fields.
-    This making span an unique indicator of a Relay line and we can use it to
-    obtain the mapping between the Relay that gets generated from a relay frontend
-    and the Relay after partitioning.
-    """
-
-    def __init__(self):
-        ExprMutator.__init__(self)
-        # key: span or source name, value: counter, indexed from 0
-        self.lookup = defaultdict(int)
-        self.suffix = SUFFIX_STRING
-        # a set to record hashes of an expressions which spans have been already rewritten
-        self.hashes = set()
-
-    def _tag_suffix(self, span):
-        # To avoid error once we introduce the SequentialSpan in the future
-        """https://discuss.tvm.apache.org/
-        t/pre-rfc-tvm-explorer-infrastructure/13457#pass-source-information-builder-6
-        """
-        # Don't need this if currently
-        if isinstance(span, tvm.relay.Span):
-            ori_name = span.source_name.name
-            new_name = ori_name + self.suffix + str(self.lookup[ori_name])
-            self.lookup[ori_name] += 1
-            return tvm.relay.Span(
-                tvm.relay.SourceName(new_name),
-                span.line,
-                span.end_line,
-                span.column,
-                span.end_column,
-            )
-        return span
-
-    def visit(self, expr):
-        if hasattr(expr, "span"):
-            return super().visit(expr)
-        return expr
-
-    def visit_call(self, call):
-        new_args = [self.visit(arg) for arg in call.args]
-        new_op = self.visit(call.op)
-        if tvm.ir.structural_hash(call) not in self.hashes:
-            self.hashes.add(tvm.ir.structural_hash(call))
-            expr__ = _expr.CallWithFields(
-                call,
-                new_op,
-                new_args,
-                call.attrs,
-                call.type_args,
-                None,
-                self._tag_suffix(call.span),
-            )
-        else:
-            expr__ = _expr.CallWithFields(
-                call, new_op, new_args, call.attrs, call.type_args, None, call.span
-            )
-        return expr__
-
-
-def tag_suffixes(mod):
-    """Traverses the Relay graph to add suffix to the call's span fields.
-    That making span as an unique indicator of a Relay call and we can use it to
-    obtain the mapping between the offloaded result and the frontend operators.
-
-    Parameters
-    ----------
-    tvm.ir.IRModule
-        The IRModule that gets generated from a relay frontend.
-
-    Returns
-    -------
-    tvm.ir.IRModule
-        The IRModule with call's span fields tagged with suffixes.
-    """
-    tagger = _SuffixTagger()
-    for global_var, func in mod.functions.items():
-        func = tagger.visit(func)
-        mod.update_func(global_var, func)
-    return mod
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
deleted file mode 100644
index 902c1a6a4576..000000000000
--- a/python/tvm/relay/transform/transform.py
+++ /dev/null
@@ -1,1512 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, missing-docstring, unused-import
-"""
-Relay pass transformation infrastructure.
-"""
-import functools
-import inspect
-import types
-import warnings
-
-import tvm.ir
-from tvm import relay, te
-from tvm.runtime import ndarray as _nd
-
-from ..backend.utils import mangle_module_name
-from . import _ffi_api
-
-
-def build_config(opt_level=2, required_pass=None, disabled_pass=None, trace=None):
-    """Configure the build behavior by setting config variables. This function
-    will be deprecated in TVM v0.7. Instead, we should directly use
-    tvm.transform.PassContext.
-
-    Parameters
-    ----------
-    opt_level: int, optional
-        Optimization level. The optimization pass name and level are as the
-        following:
-
-        .. code-block:: python
-
-            OPT_PASS_LEVEL = {
-                "SimplifyInference": 0,
-                "OpFusion": 1,
-                "FoldConstant": 2,
-                "FoldScaleAxis": 3,
-                "AlterOpLayout": 3,
-                "CanonicalizeOps": 3,
-                "CanonicalizeCast": 3,
-                "EliminateCommonSubexpr": 3,
-                "CombineParallelConv2D": 4,
-                "CombineParallelDense": 4,
-                "CombineParallelBatchMatmul": 4,
-                "FastMath": 4
-            }
-
-    required_pass: set of str, optional
-        Optimization passes that are required regardless of optimization level.
-
-    disabled_pass: set of str, optional
-        Optimization passes to be disabled during optimization.
-
-    trace: Callable[[IRModule, PassInfo, bool], None]
-        A tracing function for debugging or introspection.
-
-    Returns
-    -------
-    pass_context: PassContext
-        The pass context for optimizations.
-    """
-    warnings.warn(
-        "relay.build_config will be deprecated. Please use \
-                  tvm.transform.PassContext directly",
-        DeprecationWarning,
-    )
-    return tvm.transform.PassContext(opt_level, required_pass, disabled_pass, trace)
-
-
-@tvm._ffi.register_object("relay.FunctionPass")
-class FunctionPass(tvm.ir.transform.Pass):
-    """A pass that works on each tvm.relay.Function in a module. A function
-    pass class should be created through `function_pass`.
-    """
-
-
-def InferType():
-    """Infer the type of an expr.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered type inference pass.
-    """
-    return _ffi_api.InferType()
-
-
-def InferTypeLocal(expr):
-    """Infer the type of a single expr, reusing type information to do so.
-
-    This populates the checked_type field in expr. We assume existing type information
-    in the graph is correct!
-
-    Parameters
-    ----------
-    expr: relay.Expr
-        The expression we want to know the type of
-
-    Returns
-    -------
-    type: relay.Type
-        The type of the expression
-    """
-    return _ffi_api.InferTypeLocal(expr)
-
-
-def FoldScaleAxis():
-    """Fold the scaling of axis into weights of conv2d/dense. This pass will
-    invoke both forward and backward scale folding.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass to fold expressions.
-
-    Note
-    ----
-    Internally, we will call backward_fold_scale_axis before using
-    forward_fold_scale_axis as backward folding targets the common conv->bn
-    pattern.
-    """
-    return _ffi_api.FoldScaleAxis()
-
-
-def BackwardFoldScaleAxis():
-    """Backward fold axis scaling into weights of conv2d/dense.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass to backward fold expressions.
-
-    Note
-    ----
-    It is recommended to call backward_fold_scale_axis
-    before using forward_fold_scale_axis as backward folding targets the common
-    conv->bn pattern.
-    """
-    return _ffi_api.BackwardFoldScaleAxis()
-
-
-def RemoveUnusedFunctions(entry_functions=None):
-    """Remove unused global relay functions in a relay module.
-
-    Parameters
-    ----------
-    entry_functions: list[string]
-        The set of entry functions to start from.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass to remove unused functions.
-    """
-    if entry_functions is None:
-        entry_functions = ["main"]
-    return _ffi_api.RemoveUnusedFunctions(entry_functions)
-
-
-def ForwardFoldScaleAxis():
-    """Fold the scaling of axis into weights of conv2d/dense.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass to forward fold expressions.
-
-    Note
-    ----
-    It is recommended to call backward_fold_scale_axis
-    before using forward_fold_scale_axis, as backward folding targets the
-    common conv->bn pattern.
-    """
-    return _ffi_api.ForwardFoldScaleAxis()
-
-
-def SimplifyInference():
-    """Simplify the data-flow graph for inference phase. An simplified expression
-    which is semantically equal to the input expression will be returned.
-
-    Note that batch norms will only be simplified if their result is indexed at
-    tuple index 0.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass to perform operator simplification.
-
-    """
-    return _ffi_api.SimplifyInference()
-
-
-def FastMath():
-    """Converts the expensive non linear functions to their fast but approximate counterparts.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass to perform fast math operations.
-    """
-    return _ffi_api.FastMath()
-
-
-def CanonicalizeOps():
-    """Canonicalize special operators to basic operators.
-    This can simplify followed analysis, e.g. expanding bias_add to
-    expand_dims and broadcast_add.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass performing the canonicalization.
-    """
-    return _ffi_api.CanonicalizeOps()
-
-
-def DeadCodeElimination(inline_once=False, ignore_impurity=False):
-    """Remove expressions that do not have any users (dead code).
-
-    Parameters
-    ----------
-    inline_once: Optional[Bool]
-        Whether to inline a binding that is referenced exactly once.
-    ignore_impurity: Optional[Bool]
-        Whether to ignore possible side-effects in let-bound expressions.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that eliminates the dead code in a Relay program.
-    """
-    return _ffi_api.DeadCodeElimination(inline_once, ignore_impurity)
-
-
-def LazyGradientInit():
-    """Reduces memory usage of gradient tensors
-
-    Parameters
-    ----------
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        A pass which delays and/or reduces memory allocation,
-        by lazily allocating 0 or one filled tensors.
-    """
-    return _ffi_api.LazyGradientInit()
-
-
-def FoldConstantExpr(expr, mod, fold_qnn=False):
-    """Fold the constant expressions in a Relay program.
-    Parameters
-    ----------
-    expr: Expr
-        The expression to fold
-    mod: IRModule
-        The module the expr lives in (for global calls)
-    fold_qnn: bool
-        Whether to fold constants for QNN operations.
-
-    Returns
-    -------
-    new_expr: Expr
-        The expr after Constant Folding
-    """
-    return _ffi_api.FoldConstantExpr(expr, mod, fold_qnn)
-
-
-def FoldConstant(fold_qnn=False):
-    """Fold the constant expressions in a Relay program.
-
-    Because of backward compatibility reason it skips QNN primitives from folding by default.
-    There are some transformation passes like FakeQuantizationToInteger, which requires to keep QNN
-    primitives for constant subgraphs. Uncontrolled constant folding of QNN primitives may break
-    applicability of FakeQuantizationToInteger. We suggest to use FoldConstant pass with none
-    default fold_qnn=True value only when all other QNN sensitive passes were already applied.
-
-    Parameters
-    ----------
-    fold_qnn: bool
-        Whether to fold constants for QNN operations.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass for constant folding.
-    """
-    return _ffi_api.FoldConstant(fold_qnn)
-
-
-def FuseOps(fuse_opt_level=-1):
-    """Fuse operators in an expr to a larger operator according to some rules.
-
-    Parameters
-    ----------
-    fuse_opt_level : int
-        The level of fuse optimization. -1 indicates that the level will be
-        inferred from pass context.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass for operator fusion.
-    """
-    return _ffi_api.FuseOps(fuse_opt_level)
-
-
-def DefuseOps():
-    """The inverse operation of FuseOps. It transforms a fused program returned by FuseOps into the
-    program before FuseOps. (i.e., x == DefuseOps(FuseOps(x)))
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass for operator defusion.
-    """
-    return _ffi_api.DefuseOps()
-
-
-def CombineParallelConv2D(min_num_branches=3):
-    """Combine multiple conv2d operators into one.
-
-    Parameters
-    ----------
-    min_num_branches : int
-        The minimum number of required parallel branches for performing this
-        optimization.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that combines parallel conv2d operators.
-    """
-    return _ffi_api.CombineParallelConv2D(min_num_branches)
-
-
-def CombineParallelDense(min_num_branches=3, to_batch=True):
-    """Combine multiple dense operators into one. For example:
-
-    .. code-block
-                    data
-            /              \
-        dense (2,2)         dense (2,2)
-            |                 |
-        elemwise/bcast (2,2)  elemwise/bcast (2,2)
-
-    Would become:
-
-    .. code-block
-
-                data
-                |
-            batch_matmul+elemwise/bcast (2,2,2)
-
-    or (if to_batch=False)
-
-    .. code-block
-
-                data
-                |
-            dense+elemwise/bcast (2,2+2)
-
-    Parameters
-    ----------
-    min_num_branches : int
-        The minimum number of required parallel branches for performing this
-        optimization.
-
-    to_batch_matmul : bool
-        If True, combine parallel dense ops into batch_matmul op.
-        If False, combine parallel dense ops into dense op.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that combines parallel dense operators.
-    """
-    return _ffi_api.CombineParallelDense(min_num_branches, to_batch)
-
-
-def CombineParallelBatchMatmul(min_num_branches=3):
-    """Combine multiple batch matmul operators into one. For example:
-
-    .. code-block
-                             data (1, 2, 3)
-                         /                  \
-        batch_matmul(data, (1, 4, 3))    batch_matmul(data, (1, 5, 3))
-            |                                |
-        elemwise/bcast (1, 2, 4)         elemwise/bcast (1, 2, 5)
-
-    Would become:
-
-    .. code-block
-
-                data (1, 2, 3)
-                |
-            batch_matmul(data, (1, 4+5, 3))
-                |
-            elemwise/bcast (1 ,2, 4+5)
-
-    Parameters
-    ----------
-    min_num_branches : int
-        The minimum number of required parallel branches for performing this
-        optimization.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that combines parallel dense operators.
-    """
-    return _ffi_api.CombineParallelBatchMatmul(min_num_branches)
-
-
-def BatchingOps():
-    """Batching parallel operators into one for Conv2D, Dense and BatchMatmul.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The sequential pass which apply batching for different operator types.
-    """
-    return tvm.transform.Sequential(
-        [CombineParallelConv2D(), CombineParallelDense(), CombineParallelBatchMatmul()]
-    )
-
-
-def AlterOpLayout():
-    """Alternate the layouts of operators or replace primitive operators with
-    other expressions.
-    This pass can be used for computing convolution in custom layouts or
-    other general weight pre-transformation.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that alters the layout of operators.
-    """
-    return _ffi_api.AlterOpLayout()
-
-
-class LayoutConfig(object):
-    """A structure for customizing the ConvertLayout pass."""
-
-    current = None
-
-    def __init__(self, skip_layers=None):
-        self.skip_counter = 0
-        self.skip_layers = skip_layers if skip_layers is not None else []
-
-    def check_skip(self):
-        skip = self.skip_counter in self.skip_layers
-        self.skip_counter += 1
-        return skip
-
-    def reset(self):
-        self.skip_counter = 0
-        self.skip_layers = []
-
-    def __enter__(self):
-        self._old_manager = LayoutConfig.current
-        LayoutConfig.current = self
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        LayoutConfig.current = self._old_manager
-
-
-def ConvertLayout(desired_layouts):
-    """Given a dest layout, this pass transforms the expr such that most of the ops input data
-    layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms,
-    one at the start and one at the end.
-
-    This pass is not a part of relay.build and is expected to be called between framework-relay
-    parser and relay.build call. This is very helpful for hardware backends that support/prefer only
-    type of data layout.
-
-    RFC - https://discuss.tvm.apache.org/t/layout-conversion-pass/4009
-
-    This pass uses most of the AlterOpLayout and InferCorrectLayout infrastructure. We can define
-    new layouts for conv2d ops for now. Most of the other operators try to adapt to their input
-    layout using the InferCorrectLayout infrastructure.
-
-    Parameters
-    ----------
-    desired_layouts : map of op_name to list of layouts
-        Specify a mapping of operator names to a list of layouts to convert to, in the order
-        defined by the operator. An example for nn.conv2d could be: {"nn.conv2d", ["NHWC", "OHWI]},
-        where the first item in the list specifies the data layout and the second specifies the
-        kernel layout.
-
-    Returns
-    -------
-    pass: FunctionPass
-      The pass.
-    """
-    return _ffi_api.ConvertLayout(desired_layouts)
-
-
-def Legalize(legalize_map_attr_name="FTVMLegalize"):
-    """Legalizes an expression with another expression.
-    This pass can be used to replace an expr with another expr for target
-    dependent optimizations. For example, one expr, though semnatically
-    equivalent to the other, can have better performance on a target. This pass
-    can be used to legalize the expr in a target-dependent manner.
-
-    Parameters
-    ----------
-    legalize_map_attr_name : str
-        The Op's attr name which corresponds to the legalize rule function.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that rewrites an expr.
-    """
-    return _ffi_api.Legalize(legalize_map_attr_name)
-
-
-def MergeComposite(pattern_table):
-    """Merge multiple operators into a single composite relay function.
-
-    Parameters
-    ----------
-    pattern_table : List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Function]]
-        A list of (pattern_name, pattern, check) tuples.
-        The order of the patterns in the list will determine the order
-        of priority in which they are matched.
-        'check' is a function to check whether an extracted pattern matches.
-        It can be implemented by pattern writer but if not specified it will
-        always return True.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that merges operators into a single composite
-        relay function.
-    """
-    pattern_names = []
-    patterns = []
-    checks = []
-    for tup in pattern_table:
-        if len(tup) == 2:
-            pattern_name, pattern = tup
-            check = lambda extract: True
-        elif len(tup) == 3:
-            pattern_name, pattern, check = tup
-
-        pattern_names.append(pattern_name)
-        patterns.append(pattern)
-        checks.append(check)
-
-    return _ffi_api.MergeComposite(pattern_names, patterns, *checks)
-
-
-def MergeCompilerRegions():
-    """Merge together compiler regions.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that merges compiler regions.
-    """
-    return _ffi_api.MergeCompilerRegions()
-
-
-def ToANormalForm():
-    """Turn Graph Normal Form expression into A Normal Form Expression.
-    The scope of the root expression is the global scope.
-    The scope of any non root expression is the least common ancestor of all it's scope.
-    Values are ordered by post-DFS order in each scope.
-
-    Returns
-    -------
-    ret : Union[tvm.transform.Pass, tvm.relay.Expr]
-        The registered pass that transforms an expression into A Normal Form.
-    """
-    return _ffi_api.ToANormalForm()
-
-
-def ToANormalFormExpr(e):
-    """ToANormalForm, but on expression level.
-
-    Parameters
-    ----------
-    e : Expr
-        The graph expression.
-
-    Returns
-    -------
-    ret : Expr
-        The transformed expresion.
-    """
-    return _ffi_api.ToANormalFormExpr(e)
-
-
-def ToBasicBlockNormalForm():
-    """Turn an expression to Basic Block Normal Form.
-    We define a block as a group of expressions implied by the scope structure.
-    Each graph node can only belong to a single block.
-    For any value that is being used in multiple blocks, it has to be referred
-    by a Var which is defined in a block, whose scope is the least common ancestor
-    of blocks this value is used.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that transforms an expression into Basic Block Normal Form.
-    """
-    return _ffi_api.ToBasicBlockNormalForm()
-
-
-def ToCPS(expr, mod=None):
-    """
-    Turn expression into continuation passing style(CPS).
-
-    Every intermediate compute will be passed to a continuation.
-
-    Returns
-    -------
-    result: tvm.transform.Pass
-        The registered pass that transforms an expression into CPS.
-    """
-    return _ffi_api.to_cps(expr, mod)
-
-
-def EtaExpand(expand_constructor=False, expand_global_var=False):
-    """Add abstraction over a constructor or global variable bound to a function
-
-    Parameters
-    ----------
-    expand_constructor: bool
-        Whether to expand constructors.
-
-    expand_global_var: bool
-        Whether to expand global variables.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that eta expands an expression.
-    """
-    return _ffi_api.EtaExpand(expand_constructor, expand_global_var)
-
-
-def ToGraphNormalForm():
-    """Turn a Relay program in A Normal Form into Graph Normal Form
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that transforms an expression into Graph Normal Form.
-    """
-    return _ffi_api.ToGraphNormalForm()
-
-
-def EliminateCommonSubexpr(fskip=None):
-    """Eliminate common subexpressions.
-
-    Parameters
-    ----------
-    fskip: Callable
-        The callback function that decides whether an expression should be
-        skipped.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that eliminates common subexpressions.
-    """
-    return _ffi_api.EliminateCommonSubexpr(fskip)
-
-
-def PartialEvaluate():
-    """Evaluate the static fragment of the code.
-
-    Note
-    ----
-    This transformation could be either `Module -> Module` or `Expr -> Expr`.
-    It will directly transform the input expression to a new one if the target
-    expression is provided. Otherwise, it will rely on the pass manager to
-    carry out transformation.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that performs partial evaluation on an expression.
-    """
-    return _ffi_api.PartialEvaluate()
-
-
-def CanonicalizeCast():
-    """
-    Canonicalize cast expressions to make operator fusion more efficient.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that canonicalizes cast expression.
-    """
-    return _ffi_api.CanonicalizeCast()
-
-
-def LambdaLift():
-    """
-    Lift the closure to global function.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass that lifts the lambda function.
-    """
-    return _ffi_api.LambdaLift()
-
-
-def PartitionGraph(mod_name="default", bind_constants=True):
-    """Partition a Relay program into regions that can be executed on different
-    backends.
-
-    Parameters
-    ----------
-    mod_name : string
-        Controls the prefix of the name of each partitioned subraph.
-        If `mod_name` is None, then `tvmgen_` prefix is used.
-        Otherwise, `tvmgen_mod_name_` prefix is used.
-
-    bind_constants: bool
-        Whether or not to bind constants in partitioned subgraphs. Note that the codegen needs
-        to maintain the bound constants; Otherwise the constants will be maintained by
-        the metadata module. So it is recommended for C-source based codegens to
-        set bind_constants=False to avoid embedding large constants in a C source file.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that partitions the Relay program.
-    """
-    mod_name = mangle_module_name(mod_name)
-    return _ffi_api.PartitionGraph(mod_name, bind_constants)
-
-
-def AnnotateTarget(targets, include_non_call_ops=True):
-    """Annotate ops in an experession with a provied compiler/target and then
-    use it for codegen.
-
-    Parameters
-    ----------
-    targets : str or List[str]
-        The list of target compilers used for codegen.
-    include_non_call_ops : boolean
-        If True then non-call ops also will be annotated with targets
-        If False then non-call ops will not be processed
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The annotated pass that wrapps ops with subgraph_start and
-        subgraph_end.
-    """
-    if isinstance(targets, str):
-        targets = [targets]
-    return _ffi_api.AnnotateTarget(
-        [tvm.runtime.container.String(t) for t in targets], include_non_call_ops
-    )
-
-
-def DynamicToStatic():
-    """If possible, convert tvm.relay.dynamic* ops to static versions
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass for dynamic->static conversion.
-    """
-    return _ffi_api.DynamicToStatic()
-
-
-def Inline():
-    """Perform inlining on the given Relay IR module. The global functions that
-    are marked as `inline` should be always inlined. A cost model will be
-    needed in the future to decide if it is profitable to inline the function.
-
-    Returns
-    -------
-    ret: tvm.transform.Pass
-        The registered pass that performs inlining for a Relay IR module.
-    """
-    return _ffi_api.Inline()
-
-
-def gradient(expr, mod=None, mode="higher_order"):
-    """
-    Transform the input function,
-    returning a function that calculate the original result,
-    paired with gradient of the input.
-
-    Parameters
-    ----------
-    expr : tvm.relay.Expr
-        The input expression, which is a Function or a GlobalVar.
-
-    mod : Optional[tvm.IRModule]
-
-    mode : Optional[String]
-        The mode of the automatic differentiation algorithm.
-        'first_order' only works on first order code, but will not produce
-        reference nor closure.
-        'higher_order' works on all code using reference and closure.
-
-    Returns
-    -------
-    expr : tvm.relay.Expr
-      The transformed expression.
-    """
-    if mode == "first_order":
-        warnings.warn(
-            "using transform.gradient for first-order AD is deprecated, please use the"
-            "FirstOrderGradient module pass",
-            DeprecationWarning,
-        )
-        if mod is not None:
-            raise RuntimeError(
-                "to run first-order AD on a module, please use the FirstOrderGradient module pass."
-            )
-        return FirstOrderGradient()(tvm.IRModule.from_expr(expr))["main"]
-    if mode == "higher_order":
-        return _ffi_api.gradient(expr, mod)
-    raise Exception("unknown mode")
-
-
-def FirstOrderGradient():
-    """
-    Transforms all global functions in the module to return the original result, paired with the
-    gradients of the inputs. This pass transforms each global function independently and does not
-    support interprocedural AD. Additionally, this pass does not support any control-flow or
-    references, and should only be used on pure data-flow graphs.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered FirstOrderGradient pass.
-    """
-    return _ffi_api.FirstOrderGradient()
-
-
-def Defunctionalization(func, mod):
-    """
-    Performs defunctionalization on func,
-    transforming func from a higher-order program to a first-order program.
-
-    At each call site, the function is cloned and type parameters are substituted in.
-    Function arguments are encoded as datatypes
-    and additional apply functions are used for application.
-
-    Parameters
-    ----------
-    func : tvm.relay.Function
-        The input function, which should not be polymorphic or be higher-order.
-        This is because all types must be known and we can't encode function arguments
-        to the program itself.
-
-    mod : tvm.IRModule
-        The IRModule containing function and type definitions,
-        which is also mutated during this pass.
-
-    Returns
-    -------
-    expr : tvm.relay.Function
-      The output function.
-    """
-    return _ffi_api.Defunctionalization(func, mod)
-
-
-def to_cps(func, mod=None):
-    """
-    Turn expression into CPS expression.
-
-    Every intermediate compute will be passed to a continuation.
-
-    Parameters
-    ----------
-    func: tvm.relay.Function
-        The input function.
-
-    mod: Optional[tvm.IRModule]
-        The global module.
-
-    Returns
-    -------
-    result: tvm.relay.Function
-      The output function.
-    """
-    use_mod = mod if mod is not None else tvm.ir.IRModule()
-    return _ffi_api.to_cps(func, use_mod)
-
-
-def un_cps(func):
-    """
-    Turn an cps function into a Function without the continuation argument.
-
-    Note that this will not give the exact same interface as before cps:
-      If the input/output is higher order, they will still be in cps form.
-
-    Parameters
-    ----------
-    func: tvm.relay.Function
-        The input function
-
-    Returns
-    -------
-    result: tvm.relay.Function
-        The output function
-    """
-    return _ffi_api.un_cps(func)
-
-
-def _wrap_class_function_pass(pass_cls, pass_info):
-    """Wrap a python class as function pass"""
-
-    class PyFunctionPass(FunctionPass):
-        """Internal wrapper class to create a class instance."""
-
-        def __init__(self, *args, **kwargs):
-            # initialize handle in cass pass_cls creation failed.fg
-            self.handle = None
-            inst = pass_cls(*args, **kwargs)
-
-            # it is important not to capture self to
-            # avoid a cyclic dependency
-            def _pass_func(func, mod, ctx):
-                return inst.transform_function(func, mod, ctx)
-
-            self.__init_handle_by_constructor__(_ffi_api.MakeFunctionPass, _pass_func, pass_info)
-            self._inst = inst
-
-        def __getattr__(self, name):
-            # fall back to instance attribute if there is not any
-            return self._inst.__getattribute__(name)
-
-    functools.update_wrapper(PyFunctionPass.__init__, pass_cls.__init__)
-    PyFunctionPass.__name__ = pass_cls.__name__
-    PyFunctionPass.__doc__ = pass_cls.__doc__
-    PyFunctionPass.__module__ = pass_cls.__module__
-    return PyFunctionPass
-
-
-def function_pass(pass_func=None, opt_level=None, name=None, required=None):
-    """Decorate a function pass.
-
-    This function returns a callback when pass_func
-    is provided. Otherwise, it returns the created function pass using the
-    given optimization function.
-
-    Parameters
-    ----------
-    pass_func : Optional[Callable[(Function, Module, PassContext) -> Function]]
-        The transformation function or class.
-
-    opt_level : int
-        The optimization level of this module pass.
-
-    name : Optional[str]
-        The name of the function pass. The name could be empty. In this case, the
-        name of the optimization function will be used as the pass name.
-
-    required : Optional[List[str]]
-        The list of passes that the module pass is dependent on.
-
-    Returns
-    -------
-    create_function_pass : Union[Callable, FunctionPass]
-
-        A decorator will be returned if pass_func is not provided,
-        otherwise return the decorated result.
-        The returned decorator has two behaviors depending on the input:
-        A new FunctionPass will be returned when we decorate a pass function.
-        A new FunctionPass class will be returned when we decorate a class type.
-
-    Examples
-    --------
-    The following code block decorates a function pass class.
-
-    .. code-block:: python
-
-        @relay.transform.function_pass(opt_level=1)
-        class TestReplaceFunc:
-            def __init__(self, new_func):
-                self.new_func = new_func
-
-            def transform_function(self, func, mod, ctx):
-                # just for demo purposes
-                # transform func to new_func
-                return self.new_func
-
-        x = relay.var("x", shape=(10, 20))
-        f1 = relay.Function([x], x)
-        f2 = relay.Function([x], relay.log(x))
-        # fpass is now a special pass that replaces every
-        # function to f1
-        fpass = TestReplaceFunc(f1)
-        # now every function in input_mod is replaced by f1
-        res_mod = fpass(input_mod)
-
-
-    The following code creates a function pass by decorating
-    a user defined transform function.
-
-    .. code-block:: python
-
-        @relay.transform.function_pass(opt_level=2)
-        def transform(func, mod, ctx):
-            # my transformations here.
-            return func
-
-        function_pass = transform
-        assert isinstance(function_pass, transform.FunctionPass)
-        assert function_pass.info.opt_level == 2
-
-        # Given a module m, the optimization could be invoked as the follwoing:
-        updated_mod = function_pass(m)
-        # Now constant folding should have been applied to every function in
-        # the provided module m. And the updated module will be returned.
-    """
-
-    if opt_level is None:
-        raise ValueError("Please provide opt_level for the function pass.")
-
-    required = required if required else []
-    if not isinstance(required, (list, tuple)):
-        raise TypeError("Required is expected to be the type of " + "list/tuple.")
-
-    def create_function_pass(pass_arg):
-        """Internal function that creates a function pass"""
-        fname = name if name else pass_arg.__name__
-        info = tvm.transform.PassInfo(opt_level, fname, required)
-        if inspect.isclass(pass_arg):
-            return _wrap_class_function_pass(pass_arg, info)
-        if not callable(pass_arg):
-            raise TypeError("pass_func must be a callable for Module pass")
-        return _ffi_api.MakeFunctionPass(pass_arg, info)
-
-    if pass_func:
-        return create_function_pass(pass_func)
-    return create_function_pass
-
-
-@function_pass(opt_level=1)
-class ChangeBatch:
-    """
-    Change the batch size.
-
-    Parameters
-    ----------
-    data: Dict[relay.Var, int]
-      A dictionary of all the params to change.
-      The keys are all params, and the values are which dimension hold the batch.
-
-    batch_size: int
-      The batch size to change to.
-
-    Returns
-    -------
-    pass: FunctionPass
-      The pass.
-    """
-
-    def __init__(self, data, batch_size=16):
-        self.data = data
-        self.batch_size = batch_size
-
-    def transform_function(self, func, mod, ctx):
-        func = relay.Function(func.params, func.body, None, func.type_params, func.attrs)
-        change_batch = self
-
-        class ChangeBatchMutator(tvm.relay.ExprMutator):
-            def visit_var(self, var):
-                if var in change_batch.data:
-                    ty = var.type_annotation
-                    new_shape = list(ty.shape)
-                    new_shape[change_batch.data[var]] = change_batch.batch_size
-                    return relay.Var(var.name_hint, relay.TensorType(new_shape, ty.dtype))
-                return var
-
-        return ChangeBatchMutator().visit(func)
-
-
-def DenseToSparse(weight_name, weight_shape):
-    """
-    Rewrite qualified ```nn.dense operation``` to ```nn.sparse_dense```
-    This pass is used in ```data_dep_optimization.bsr_dense```
-    Parameters of this pass is generated by ```analysis.sparse_dense.process_params```
-
-    Parameters
-    ----------
-    weight_name: Array[String]
-      Names of weights which qualified sparse contrains
-
-    weight_shape: Array[Array[IntImm]]
-      Weights shape in BSR format.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered DenseToSparse pass.
-    """
-    return _ffi_api.DenseToSparse(weight_name, weight_shape)
-
-
-def Conv2dToSparse(weight_name, weight_shape, layout, kernel_size):
-    """
-    Rewrite qualified ```nn.conv2d operation``` to ```nn.sparse_conv2d```
-
-    Parameters
-    ----------
-    weight_name: Array[String]
-      Names of weights which qualified sparse contrains
-
-    weight_shape: Array[Array[IntImm]]
-      Weights shape in BSR format.
-
-    layout : str
-        layout of data
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered DenseToSparse pass.
-    """
-    return _ffi_api.Conv2dToSparse(weight_name, weight_shape, layout, kernel_size)
-
-
-def Conv2dToSparse2(layout, kernel_size, blocksize, sparsity_threshold):
-    """
-    Rewrite freezed ```nn.conv2d``` operation to ```nn.sparse_conv2d```
-
-    Parameters
-    ----------
-    layout : str
-        layout of data
-
-    kernel_size : int
-        kernel size of conv2d
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered DenseToSparse pass.
-    """
-    return _ffi_api.Conv2dToSparse2(layout, kernel_size, *blocksize, sparsity_threshold)
-
-
-def SimplifyFCTranspose(target_weight_name):
-    """
-    Rewrite ```y = nn.dense(x, transpose(w, [1, 0]))``` to ```y = nn.dense(x, wt)```
-    This pass is used in ```data_dep_optimization.simplify_fc_transpose```
-
-    Parameters
-    ----------
-    weight_name: Array[String]
-      Names of weights which qualified ```y = nn.dense(x, transpose(w, [1, 0]))```
-      This parameter is generated by ```analysis.search_fc_transpose``` function
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered SimplifyFCTranspose pass.
-    """
-    return _ffi_api.SimplifyFCTranspose(target_weight_name)
-
-
-def SimplifyExpr():
-    """
-    Simplify the Relay expression, including merging consecutive reshapes.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered SimplifyExpr pass.
-    """
-    return _ffi_api.SimplifyExpr()
-
-
-def PlanDevices(config):
-    """
-    Uses existing "on_device" and "device_copy" calls to infer the virtual device on which
-    every Relay sub-expression should run and the result stored. Captures the result of that
-    analysis using new "on_device" and "device_copy" calls. Sub-expressions which are
-    not otherwise constrained are assigned to the default primitive virtual device describe by
-    config. However data and computations which must be hosted on a CPU (such as shapes and
-    shape functions) use the host virtual device of the config.
-
-    Parameters
-    ----------
-    config : tvm.CompilationConfig
-        The compilation configuration, specifying available targets and default devices.
-
-    Returns
-    -------
-    ret : tvm.transforms.Pass
-        The pass.
-    """
-    return _ffi_api.PlanDevices(config)
-
-
-def ManifestLifetimes():
-    """
-    Manifest the lifetimes of variables after allocations have been manifested, by inserting kill
-    operations once variables become dead.
-    """
-    return _ffi_api.ManifestLifetimes()
-
-
-def FoldExplicitPadding():
-    """
-    FoldExplicitPadding finds explict padding before an op that can support
-    implicit padding and fuses them.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered ImplicitPadding pass.
-    """
-    return _ffi_api.FoldExplicitPadding()
-
-
-def AnnotateSpans():
-    """
-    Annotate a program with span information by first generating its textual
-    representation and then parsing it back into a Relay AST annotated with
-    span information.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered AnnotateSpans pass.
-    """
-    return _ffi_api.AnnotateSpans()
-
-
-def FakeQuantizationToInteger(hard_fail=False, use_qat=False, optional_qnn_ops=None):
-    # pylint: disable=anomalous-backslash-in-string
-    """
-    Find regions of the graph of the form
-
-    .. code-block:: text
-
-        x    w
-        |    |
-        dq   dq
-         \\   /
-          op1
-           |
-          op2
-           |
-           q
-
-    where ``q == qnn.quantize`` and ``dq = qnn.dequantize``
-    and rewrite them into integer versions of ``op1`` and ``op2``
-
-    Rules for rewriting indivdual ops are in fake_quantization_to_integer.py
-
-    Parameters
-    ----------
-    hard_fail : boolean
-        How do deal with errors during graph rewriting.
-        If true, raise an error.
-        If false, skip rewriting the subgraph.
-
-    use_qat : boolean
-        To perform an additional QAT pass - convert enabled operations with dequantized inputs.
-        Example: in the graph above op2 is not registered with the FakeQuantizationToInteger
-        attribute, op1 operation can still be converted. Converted pattern below:
-
-        .. code-block:: text
-
-            x    w
-            |    |
-            \\   /
-              op1
-              |
-              dq
-              |
-              op2
-              |
-              q
-
-    optional_qnn_ops : List[str]
-        Specify a list of operator names to explicitly enable conversion for
-        specific ops disabled by default.
-        Example: ['nn.softmax']
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered FakeQuantizationToInteger pass.
-    """
-    if optional_qnn_ops is None:
-        optional_qnn_ops = []
-    return _ffi_api.FakeQuantizationToInteger(hard_fail, use_qat, optional_qnn_ops)
-
-
-def FlattenAtrousConv():
-    # pylint: disable=anomalous-backslash-in-string
-    """
-    The purpose of this pass is to find a sequence of space_to_batch_nd-conv2d-batch_to_space_nd
-    operations:
-
-    .. code-block:: text
-
-      x     w
-      |     |
-      s2b   |
-       \\   /
-        conv2d
-         |
-         b2s
-
-    and convert them into subgraphs with a convolution with the modified "dilation" and
-    recalculated "padding" parameters.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered FlattenAtrousConv pass.
-    """
-    return _ffi_api.FlattenAtrousConv()
-
-
-def ToMixedPrecision(mixed_precision_type="float16", missing_op_mode=1):
-    """
-    Automatic mixed precision rewriter. Rewrite an FP32 relay graph into a version
-    where as many operations as possible are in the target mixed_precision_type.
-
-    Parameters
-    ----------
-    mixed_precision_type: str
-      The target datatype to transform operations in the graph to use.
-
-    missing_op_mode: int
-      Determines how to handle ops not registered with FTVMMixedPrecisionConversionType
-        0: Does not allow any missing ops. Will throw errors when encountering any.
-        1: Allow missing ops but emit warnings.
-        2: Allow missing ops and silently ignore them.
-
-    relay.ToMixedPrecision.keep_orig_output_dtype: boolean
-      Defines if outputs should be retained in original data type or convert to
-      mixed_precision_type. By default this parameter is False and transformation
-      modifies the data types of outputs to mixed_precision_type.
-      This parameter is not part of explicit arguments of the transformation, but should
-      be passed through tvm.transform.PassContext.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass.
-    """
-    if missing_op_mode < 0 or missing_op_mode > 2:
-        raise ValueError("Missing op mode is either 0, 1, or 2")
-    return _ffi_api.ToMixedPrecision(mixed_precision_type, missing_op_mode)
-
-
-def SplitArgs(max_function_args):
-    """Split function with huge number of arguments to smaller pieces.
-
-    Parameters
-    ----------
-    max_function_args: int
-      Maximum number of function arguments. If it equals 0 then SplitArgs
-      shouldn't split the function.
-
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The registered pass.
-    """
-    return _ffi_api.SplitArgs(max_function_args)
-
-
-def OutlineCompilerFunctionsWithExistingGlobalSymbols(compiler_filter=""):
-    """Outlines all literal functions in direct call positions which have a "Compiler"
-    attribute.
-
-    The outlined functions are bound to unique global vars according to their existing
-    "global_symbol" attribute. At most one function with the same global symbol is outlined.
-
-    If compiler_filter is non-empty only functions with that as their attribute value are
-    outlined.
-
-    This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism
-    to prepare the IRModule before custom lowering.
-
-    Parameters
-    ----------
-    compiler_filter : String
-        If non-empty, the "Compiler" attribute to filter on.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The pass.
-    """
-    return _ffi_api.OutlineCompilerFunctionsWithExistingGlobalSymbols(compiler_filter)
-
-
-def MarkCompilerFunctionsAsExtern(compiler_filter=""):
-    """Marks all global functions which have a "Compiler" attribute matching
-    compiler_filter as 'extern'.
-
-    The function's attributes are replaced with a single "Extern" attribute, and
-    all calls to the function are switched to use the 'call_lowered' calling convention.
-
-    If compiler_filter is non-empty only functions with that as their attribute value are
-    outlined.
-
-    This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism to
-    cleanup the IRModule after custom lowering.
-
-    Parameters
-    ----------
-    compiler_filter : String
-        If non-empty, the "Compiler" attribute to filter on.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The pass.
-    """
-    return _ffi_api.MarkCompilerFunctionsAsExtern(compiler_filter)
-
-
-def CapturePostDfsIndexInSpans():
-    """Captures the post-dfs index and dominator post-dfs index of (most) expression nodes in
-    their span, in the form "index:<post-dfs index>:<dominator post-dfs index>".
-
-    This is useful for debugging since a) it helps identify pretty-printed sub-expressions within
-    the overall model and b) the indexes are heavily used by Collage for its compact representation
-    of sub-graphs.
-
-    Note that Op and Constructor nodes are not changed even though they are assigned an
-    post-dfs index.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The pass.
-    """
-    return _ffi_api.CapturePostDfsIndexInSpans()
-
-
-def InlineCompilerFunctionsBoundTo(global_vars):
-    """Inlines all global functions bound to a global var in global_vars.
-
-    Both the global "Compiler" attributed function, and any calls to "Composite" functions it its
-    body are inlined.
-
-    This pass may be useful for external codegen which needs to undo partitioning based on
-    properties of the entire partition.
-
-    Parameters
-    ----------
-    global_vars : Array[tvm.relay.GlobalVar]
-        The global vars of all 'Compiler' functions to inline.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The pass.
-    """
-    return _ffi_api.InlineCompilerFunctionsBoundTo(global_vars)
-
-
-def CollagePartition(config, cost_estimator=None):
-    """Partition the bodies of all functions according to the available targets so as to
-    minimize model latency. See https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md.
-
-    Parameters
-    ----------
-    config : CompilationConfig
-        The available targets.
-    cost_estimator : CostEstimator, optional
-        The custom cost estimator to use for costing each candidate partition.
-
-    Returns
-    -------
-    ret : tvm.transform.Pass
-        The pass.
-
-    """
-    if cost_estimator is None:
-        cost_estimator = relay.collage.CostEstimator()
-
-    return _ffi_api.CollagePartition(config, cost_estimator)
-
-
-def DivToMul():
-    """Transform division by a constant to multiplication by the inverse of the constant"""
-    return _ffi_api.DivToMul()
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
deleted file mode 100644
index affd7f493030..000000000000
--- a/python/tvm/relay/ty.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-import
-"""The type nodes of the Relay language."""
-from tvm.ir import Type, TypeKind, TypeVar, GlobalTypeVar
-from tvm.ir import TypeConstraint, FuncType, TupleType, IncompleteType
-from tvm.ir import TypeCall, TypeRelation, TensorType, RelayRefType as RefType
-
-from .base import RelayNode
-from . import _ffi_api
-
-Any = _ffi_api.Any
-
-
-def is_dynamic(tensor_type):
-    """Check whether type has any or symbolic variables as a shape.
-
-    tensor_type : Type
-        The type to be inspected
-
-    Returns
-    -------
-    has_any : bool
-        The check result.
-    """
-    return _ffi_api.IsDynamic(tensor_type)
-
-
-def ShapeVar(name):
-    """A helper which constructs a type var of which the shape kind.
-
-    Parameters
-    ----------
-    name : str
-
-    Returns
-    -------
-    type_var : tvm.relay.TypeVar
-        The shape variable.
-    """
-    return TypeVar(name, kind=TypeKind.ShapeVar)
-
-
-def scalar_type(dtype):
-    """Creates a scalar type.
-
-    This function returns TensorType((), dtype)
-
-    Parameters
-    ----------
-    dtype : str
-        The content data type.
-
-    Returns
-    -------
-    s_type : tvm.relay.TensorType
-        The result type.
-    """
-    return TensorType((), dtype)
diff --git a/python/tvm/relay/type_functor.py b/python/tvm/relay/type_functor.py
deleted file mode 100644
index 39f94aeca747..000000000000
--- a/python/tvm/relay/type_functor.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""The type functor of Relay."""
-from .ty import (
-    TypeVar,
-    IncompleteType,
-    TensorType,
-    FuncType,
-    TupleType,
-    TypeRelation,
-    RefType,
-    GlobalTypeVar,
-    TypeCall,
-)
-from .adt import TypeData
-
-
-class TypeFunctor:
-    """
-    An abstract visitor defined over Type.
-
-    Defines the default dispatch over types.
-    """
-
-    def __init__(self):
-        # TODO(weberlo): make type vars hashable, so we can memoize
-        pass
-
-    # pylint: disable=no-else-return
-    def visit(self, typ):
-        """Apply the visitor to a type."""
-        if isinstance(typ, TypeVar):
-            return self.visit_type_var(typ)
-        elif isinstance(typ, IncompleteType):
-            return self.visit_incomplete_type(typ)
-        elif isinstance(typ, TensorType):
-            return self.visit_tensor_type(typ)
-        elif isinstance(typ, FuncType):
-            return self.visit_func_type(typ)
-        elif isinstance(typ, TupleType):
-            return self.visit_tuple_type(typ)
-        elif isinstance(typ, TypeRelation):
-            return self.visit_type_relation(typ)
-        elif isinstance(typ, RefType):
-            return self.visit_ref_type(typ)
-        elif isinstance(typ, GlobalTypeVar):
-            return self.visit_global_type_var(typ)
-        elif isinstance(typ, TypeCall):
-            return self.visit_type_call(typ)
-        elif isinstance(typ, TypeData):
-            return self.visit_type_data(typ)
-        else:
-            raise Exception(f"unhandled case: {type(typ)}")
-
-    def visit_type_var(self, _):
-        raise NotImplementedError()
-
-    def visit_incomplete_type(self, _):
-        raise NotImplementedError()
-
-    def visit_tensor_type(self, _):
-        raise NotImplementedError()
-
-    def visit_func_type(self, _):
-        raise NotImplementedError()
-
-    def visit_tuple_type(self, _):
-        raise NotImplementedError()
-
-    def visit_type_relation(self, _):
-        raise NotImplementedError()
-
-    def visit_ref_type(self, _):
-        raise NotImplementedError()
-
-    def visit_global_type_var(self, _):
-        raise NotImplementedError()
-
-    def visit_type_call(self, _):
-        raise NotImplementedError()
-
-    def visit_type_data(self, _):
-        raise NotImplementedError()
-
-
-class TypeVisitor(TypeFunctor):
-    """
-    A visitor over Type.
-
-    The default behavior recursively traverses the AST.
-    """
-
-    def visit_type_var(self, tv):
-        pass
-
-    def visit_incomplete_type(self, it):
-        pass
-
-    def visit_tensor_type(self, tt):
-        pass
-
-    def visit_func_type(self, ft):
-        for arg_type in ft.arg_types:
-            self.visit(arg_type)
-        self.visit(ft.ret_type)
-        for type_param in getattr(ft, "type_params", []):
-            self.visit(type_param)
-        for type_constraint in getattr(ft, "type_constraints", []):
-            self.visit(type_constraint)
-
-    def visit_tuple_type(self, tt):
-        for field in tt.fields:
-            self.visit(field)
-
-    def visit_type_relation(self, tr):
-        for arg in tr.args:
-            self.visit(arg)
-
-    def visit_ref_type(self, rt):
-        self.visit(rt.value)
-
-    def visit_global_type_var(self, gtv):
-        pass
-
-    def visit_type_call(self, tc):
-        self.visit(tc.func)
-        for arg in tc.args:
-            self.visit(arg)
-
-    def visit_type_data(self, td):
-        self.visit(td.header)
-        for type_var in td.type_vars:
-            self.visit(type_var)
-
-
-class TypeMutator(TypeFunctor):
-    """
-    A functional visitor over Type.
-
-    The default behavior recursively traverses the AST
-    and reconstructs the AST.
-    """
-
-    def visit_type_var(self, tv):
-        return TypeVar(tv.name_hint, tv.kind)
-
-    def visit_incomplete_type(self, it):
-        return IncompleteType(it.kind)
-
-    def visit_tensor_type(self, tt):
-        return TensorType(tt.shape, tt.dtype)
-
-    def visit_func_type(self, ft):
-        new_arg_types = [self.visit(arg_type) for arg_type in ft.arg_types]
-        new_ret_type = self.visit(ft.ret_type)
-        new_type_params = [self.visit(type_param) for type_param in getattr(ft, "type_params", [])]
-        new_type_constraints = [
-            self.visit(type_constraint) for type_constraint in getattr(ft, "type_constraints", [])
-        ]
-        return FuncType(new_arg_types, new_ret_type, new_type_params, new_type_constraints)
-
-    def visit_tuple_type(self, tt):
-        return TupleType([self.visit(field) for field in tt.fields])
-
-    def visit_type_relation(self, tr):
-        return TypeRelation(tr.func, [self.visit(arg) for arg in tr.args], tr.num_inputs, tr.attrs)
-
-    def visit_ref_type(self, rt):
-        return RefType(self.visit(rt.value))
-
-    def visit_global_type_var(self, gtv):
-        return GlobalTypeVar(gtv.name_hint, gtv.kind)
-
-    def visit_type_call(self, tc):
-        return TypeCall(self.visit(tc.func), [self.visit(arg) for arg in tc.args])
-
-    def visit_type_data(self, td):
-        return TypeData(
-            self.visit(td.header),
-            [self.visit(type_var) for type_var in td.type_vars],
-            td.constructors,
-        )
diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py
index 301f0ef66286..b748f84beca4 100644
--- a/python/tvm/runtime/__init__.py
+++ b/python/tvm/runtime/__init__.py
@@ -39,7 +39,6 @@
     load_param_dict_from_file,
 )
 
-from . import executor
 from . import disco
 
 from .support import _regex_match
diff --git a/python/tvm/runtime/executor/__init__.py b/python/tvm/runtime/executor/__init__.py
deleted file mode 100644
index ecc4097dbaa0..000000000000
--- a/python/tvm/runtime/executor/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""This module contains Python wrappers for the TVM C++ Executor implementations.
-
-NOTE: at present, only AOT Executor is contained here. The others are:
- - GraphExecutor, in python/tvm/contrib/graph_executor.py
- - VM Executor, in python/tvm/runtime/vm.py
-
-TODO(areusch): Consolidate these into this module.
-"""
-from .aot_executor import AotModule
diff --git a/python/tvm/runtime/executor/aot_executor.py b/python/tvm/runtime/executor/aot_executor.py
deleted file mode 100644
index 9e1e4bbd18f4..000000000000
--- a/python/tvm/runtime/executor/aot_executor.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""A Python wrapper for the Module-based Model Runtime Interface for Ahead-of-Time compilation."""
-
-import numpy as np
-
-
-class AotModule(object):
-    """Wraps the AOT executor runtime.Module.
-
-    This is a thin wrapper of the underlying TVM module.
-    you can also directly call set_input, run, and get_output
-    of underlying module functions
-
-    Parameters
-    ----------
-    module : tvm.runtime.Module
-        The internal tvm module that holds the implemented model functions.
-
-    Attributes
-    ----------
-    module : tvm.runtime.Module
-        The internal tvm module that holds the implemented model functions.
-
-    Examples
-    --------
-
-    .. code-block:: python
-
-        import tvm
-        from tvm import relay
-        from tvm.contrib import graph_executor
-
-        # build the library using graph executor
-        lib = relay.build(...)
-        lib.export_library("compiled_lib.so")
-        # load it back as a runtime
-        lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
-        # Call the library factory function for default and create
-        # a new runtime.Module, wrap with aot module.
-        gmod = tvm.runtime.executor.AotModule(lib["default"](dev))
-        # use the aot  module.
-        gmod.set_input("x", data)
-        gmod.run()
-    """
-
-    def __init__(self, module):
-        self.module = module
-        self._set_input = module["set_input"]
-        self._run = module["run"]
-        self._get_output = module["get_output"]
-        self._get_input = module["get_input"]
-        self._get_num_outputs = module["get_num_outputs"]
-        self._get_input_index = module["get_input_index"]
-        self._get_num_inputs = module["get_num_inputs"]
-        self._get_input_name = module["get_input_name"]
-
-    def set_input(self, key=None, value=None, **params):
-        """Set inputs to the module via kwargs
-
-        Parameters
-        ----------
-        key : int or str
-           The input key
-
-        value : the input value.
-           The input key
-
-        params : dict of str to NDArray
-           Additional arguments
-        """
-        if key is not None:
-            v = self._get_input(key)
-            if v is None:
-                raise RuntimeError(f"Could not find '{key}' in model's inputs")
-            v.copyfrom(value)
-
-        if params:
-            # upload big arrays first to avoid memory issue in rpc mode
-            keys = list(params.keys())
-            keys.sort(key=lambda x: -np.prod(params[x].shape))
-            for k in keys:
-                # TODO(zhiics) Skip the weights for submodule in a better way.
-                # We should use MetadataModule for initialization and remove
-                # params from set_input
-                val = self._get_input(k)
-                if val:
-                    self._get_input(k).copyfrom(params[k])
-
-    def run(self, **input_dict):
-        """Run forward execution of the model
-
-        Parameters
-        ----------
-        input_dict: dict of str to NDArray
-            List of input values to be feed to
-        """
-        if input_dict:
-            self.set_input(**input_dict)
-        self._run()
-
-    def get_num_outputs(self):
-        """Get the number of outputs from the model
-
-        Returns
-        -------
-        count : int
-            The number of outputs.
-        """
-        return self._get_num_outputs()
-
-    def get_num_inputs(self):
-        """Get the number of inputs to the model
-
-        Returns
-        -------
-        count : int
-            The number of inputs.
-        """
-        return self._get_num_inputs()
-
-    def get_input(self, index, out=None):
-        """Get index-th input to out
-
-        Parameters
-        ----------
-        index : int
-            The input index
-
-        out : NDArray
-            The output array container
-        """
-        if out:
-            self._get_input(index).copyto(out)
-            return out
-
-        return self._get_input(index)
-
-    def get_input_index(self, name):
-        """Get inputs index via input name.
-
-        Parameters
-        ----------
-        name : str
-           The input key name
-
-        Returns
-        -------
-        index: int
-            The input index. -1 will be returned if the given input name is not found.
-        """
-        return self._get_input_index(name)
-
-    def get_output(self, index, out=None):
-        """Get index-th output to out
-
-        Parameters
-        ----------
-        index : int
-            The output index
-
-        out : NDArray
-            The output array container
-        """
-        if out:
-            self._get_output(index, out)
-            return out
-
-        return self._get_output(index)
-
-    def get_input_name(self, index: int) -> str:
-        """Return the name of input with index `index`"""
-        return self._get_input_name(index)
-
-    def get_input_info(self):
-        """Return the 'shape' and 'dtype' dictionaries of the module."""
-        self.get_input_name(0)
-
-        shape_dict = dict()
-        dtype_dict = dict()
-        for ind in range(0, self.get_num_inputs()):
-            input_name = self.get_input_name(ind)
-            input_tensor = self.get_input(ind)
-            shape_dict[input_name] = input_tensor.shape
-            dtype_dict[input_name] = input_tensor.dtype
-
-        return shape_dict, dtype_dict
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
deleted file mode 100644
index cf6d3e3f9ce8..000000000000
--- a/python/tvm/runtime/vm.py
+++ /dev/null
@@ -1,738 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, invalid-name, redefined-builtin
-"""
-The Relay Virtual Machine runtime.
-
-Implements a Python interface to executing the compiled VM object.
-"""
-import numpy as np
-
-import tvm
-from tvm.runtime import Module
-from tvm._ffi.runtime_ctypes import TVMByteArray
-from tvm._ffi import base as _base
-from .object import Object
-from . import _ffi_api, container
-from ..rpc.base import RPC_SESS_MASK
-
-
-def _convert(arg, cargs):
-    def _gettype(arg):
-        if isinstance(arg, np.float16):
-            return "float16"
-        elif isinstance(arg, (_base.integer_types, bool)):
-            return "int32"
-        else:
-            return "float32"
-
-    if isinstance(arg, Object):
-        cargs.append(arg)
-    elif arg is None:
-        cargs.append(tvm.nd.array([], device=tvm.cpu(0)))
-    elif isinstance(arg, np.ndarray):
-        nd_arr = tvm.nd.array(arg, device=tvm.cpu(0))
-        cargs.append(nd_arr)
-    elif isinstance(arg, tvm.runtime.NDArray):
-        cargs.append(arg)
-    elif isinstance(arg, (tuple, list)):
-        field_args = []
-        for field in arg:
-            _convert(field, field_args)
-        cargs.append(container.tuple_object(field_args))
-    elif isinstance(arg, (_base.numeric_types, bool)):
-        dtype = _gettype(arg)
-        value = tvm.nd.array(np.array(arg, dtype=dtype), device=tvm.cpu(0))
-        cargs.append(value)
-    elif isinstance(arg, str):
-        cargs.append(arg)
-    else:
-        raise TypeError(f"Unsupported type: {type(arg)}")
-
-
-def convert(args):
-    cargs = []
-    for arg in args:
-        _convert(arg, cargs)
-
-    return cargs
-
-
-class Executable(object):
-    """Relay VM executable"""
-
-    def __init__(self, mod):
-        self.mod = mod
-        self._function_params = {}
-        self._save = self.mod["save"]
-        self._get_lib = self.mod["get_lib"]
-        self._get_bytecode = self.mod["get_bytecode"]
-        self._get_constants = self.mod["get_constants"]
-        self._get_virtual_devices = self.mod["get_virtual_devices"]
-        self._get_primitives = self.mod["get_primitives"]
-        self._get_stats = self.mod["get_stats"]
-        self._get_function_arity = self.mod["get_function_arity"]
-        self._get_function_param_name = self.mod["get_function_param_name"]
-        self._move_late_bound_consts = self.mod["move_late_bound_consts"]
-        self._get_late_bound_consts = self.mod["get_late_bound_consts"]
-        self._load_late_bound_consts = self.mod["load_late_bound_consts"]
-        self._load_late_bound_consts_from_map = self.mod["load_late_bound_consts_from_map"]
-
-    def save(self):
-        """Save the Relay VM Executable.
-
-        Returns
-        -------
-        code : bytearray
-            The binary blob representing a serialized Relay VM executable. It
-            can then be saved to disk and later deserialized into a new
-            Executable.
-
-        lib : :py:class:`~tvm.runtime.Module`
-            The runtime module that contains the generated code. It is
-            basically a library that is composed of hardware dependent code.
-
-        Notes
-        -----
-        The returned code is organized with the following sections in order.
-         - Global section. This section contains the globals used by the
-         virtual machine.
-
-         - Constant section. This section is used to store the constant pool of
-         a virtual machine.
-
-         - Primitive name section. This section is introduced to accommodate
-         the list of primitive operator names that will be invoked by the
-         virtual machine.
-
-         - Code section. The VM functions, including bytecode, are sitting in
-         this section.
-
-        Examples
-        --------
-
-        .. code-block:: python
-
-            import numpy as np
-            import tvm
-            from tvm import te
-            from tvm import relay
-            # define a simple network.
-            x = relay.var('x', shape=(10, 10))
-            f = relay.Function([x], x + x)
-            mod = tvm.IRModule({"main": f})
-            # create a Relay VM.
-            dev = tvm.cpu()
-            target = "llvm"
-            executable = relay.vm.compile(mod, target)
-            code, lib = executable.save()
-            # save and load the code and lib file.
-            tmp = tvm.contrib.utils.tempdir()
-            path_lib = tmp.relpath("lib.so")
-            lib.export_library(path_lib)
-            with open(tmp.relpath("code.ro"), "wb") as fo:
-                fo.write(code)
-            loaded_lib = tvm.runtime.load_module(path_lib)
-            loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read())
-            # deserialize.
-            des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib)
-            # execute the deserialized executable.
-            x_data = np.random.rand(10, 10).astype('float32')
-            des_vm = tvm.runtime.vm.VirtualMachine(des_exec, dev)
-            res = des_vm.run(x_data)
-            print(res.numpy())
-        """
-        return self._save(), self._get_lib()
-
-    @staticmethod
-    def load_exec(bytecode, lib):
-        """Construct an executable from saved artifacts.
-
-        Parameters
-        ----------
-        bytecode : bytearray
-            The binary blob representing a the Relay VM bytecode.
-
-        lib : :py:class:`~tvm.runtime.Module`
-            The runtime module that contains the generated code.
-
-        Returns
-        -------
-        exec: Executable
-            An executable constructed using the provided artifacts.
-        """
-        if isinstance(bytecode, (bytes, str)):
-            bytecode = bytearray(bytecode)
-        elif not isinstance(bytecode, (bytearray, TVMByteArray)):
-            raise TypeError(
-                "bytecode is expected to be the type of bytearray or TVMByteArray, but received "
-                f"{type(bytecode)}"
-            )
-
-        if lib is not None and not isinstance(lib, tvm.runtime.Module):
-            raise TypeError(
-                f"lib is expected to be the type of tvm.runtime.Module, but received {type(lib)}"
-            )
-
-        return Executable(_ffi_api.Load_Executable(bytecode, lib))
-
-    @property
-    def lib(self):
-        """Get the library that contains hardware dependent code.
-
-        Returns
-        -------
-        ret : :py:class:`~tvm.runtime.Module`
-            The runtime module that contains hardware dependent code.
-        """
-        return self._get_lib()
-
-    @property
-    def stats(self):
-        """Get the statistics of the Relay VM executable.
-
-        Returns
-        -------
-        ret : String
-            The statistic information of the VM executable.
-        """
-        return self._get_stats()
-
-    @property
-    def primitive_ops(self):
-        """Get the name of the primitive ops contained in the executable.
-
-        Returns
-        -------
-        ret : List[String]
-            The list of primitive ops.
-        """
-        ret = []
-        num_primitives = _ffi_api.GetNumOfPrimitives(self.module)
-        for i in range(num_primitives):
-            ret.append(_ffi_api.GetPrimitiveFields(self.module, i))
-        return ret
-
-    @property
-    def bytecode(self):
-        """Get the bytecode of the Relay VM executable.
-
-        Returns
-        -------
-        ret : String
-            The bytecode of the executable.
-
-        Notes
-        -----
-        The bytecode is in the following format:
-          func_name reg_file_size num_instructions
-
-          param1 param2 ... paramM
-
-          instruction1
-
-          instruction2
-
-          ...
-
-          instructionN
-
-        Each instruction is printed in the following format:
-          hash opcode field1 ... fieldX # The text format.
-
-        The part starting from # is only used for visualization and debugging.
-        The real serialized code doesn't contain it, therefore the deserializer
-        doesn't need to deal with it as well.
-        """
-        return self._get_bytecode()
-
-    @property
-    def constants(self):
-        """Returns a human-readable description of all the constants in the executable.
-        Useful for debugging and diffing generated executables in unit tests."""
-        return self._get_constants()
-
-    @property
-    def virtual_devices(self):
-        """Returns a human-readable description of all the (virtual) devices in the executable."""
-        return self._get_virtual_devices()
-
-    @property
-    def primitives(self):
-        """Returns a human-readable description of all the primitives (ie PackedFuncs) in the
-        executable"""
-        return self._get_primitives()
-
-    @property
-    def globals(self):
-        """Get the globals used by the Relay VM executable.
-
-        Returns
-        -------
-        ret : List[String]
-            The globals contained in the executable.
-        """
-        ret = []
-        num_globals = _ffi_api.GetNumOfGlobals(self.module)
-        for i in range(num_globals):
-            ret.append(_ffi_api.GetGlobalFields(self.module, i))
-        return ret
-
-    @property
-    def module(self):
-        """Return the runtime module contained in a virtual machine executable."""
-        return self.mod
-
-    def get_function_params(self, func_name):
-        """Get VM Function parameters"""
-        if func_name in self._function_params:
-            return self._function_params[func_name]
-        arity = self._get_function_arity(func_name)
-        assert arity >= 0
-        params = []
-        for i in range(arity):
-            p = self._get_function_param_name(func_name, i)
-            assert p
-            params.append(p)
-        self._function_params[func_name] = params
-        return params
-
-    def move_late_bound_consts(self, path, byte_limit):
-        """Move all constants of byte size greater or equal to byte_limit to file at path"""
-        return self._move_late_bound_consts(path, byte_limit)
-
-    def get_late_bound_consts(self, byte_limit):
-        """Return all constants of byte size greater or equal to byte_limit"""
-        return self._get_late_bound_consts(byte_limit)
-
-    def load_late_bound_consts(self, path):
-        """Re-load constants previously saved to file at path"""
-        return self._load_late_bound_consts(path)
-
-    def load_late_bound_consts_from_map(self, map):
-        """Re-load constants supplied in map"""
-        return self._load_late_bound_consts_from_map(map)
-
-
-class VirtualMachine(object):
-    """Relay VM runtime.
-
-    Parameters
-    ----------
-    exe : Executable
-        The VM executable.
-
-    device : tvm.runtime.Device or List[tvm.runtime.Device]
-        The device(s) on which the model will run.
-        Currently at most one device per device type is supported.
-
-    memory_cfg : str or Dict[tvm.runtime.Device, str], optional
-        Config the type of memory allocator. The allocator type can be ["naive",
-        "pooled"]. If memory_cfg is None, all devices will use pooled allocator
-        by default. If memory_cfg is string, all devices will use the specified
-        allocator type. If memory_cfg is a dict, each device uses the allocator
-        type specified in the dict, or pooled allocator if not specified in the
-        dict.
-    """
-
-    NAIVE_ALLOCATOR = 1
-    POOLED_ALLOCATOR = 2
-
-    def __init__(self, exe, device, memory_cfg=None):
-        """
-        Construct a VirtualMachine wrapper class which provides a simple
-        interface over the raw C++ Module based API.
-
-        Parameters
-        ----------
-        exe: Union[Executable, Module]
-            The executable either with the wrapper Python type or the raw runtime.Module.
-
-            In most cases this will be the Python wrapper class tvm.runtime.vm.Executable but
-            if you instead get the underlying runtime.Module subclass (i.e `exe.mod`) you
-            can directly pass it to this method.
-
-            This case can occur when doing things such as RPC where TVM's module APIs
-            return the raw modules, not the wrapped modules. This constructor will
-            handle this internally.
-
-        device: Union[Device, List[Device]]
-            The device, or devices on which to execute the VM code.
-
-        memory_cfg: Optional[str]
-            The allocator behavior to use for the VM.
-
-        Returns
-        -------
-        vm: VirtualMachine
-            A VM wrapper object.
-        """
-        if not isinstance(exe, Executable) and not isinstance(exe, Module):
-            raise TypeError(
-                f"exe is expected to be the type of Executable, but received {type(exe)}"
-            )
-
-        if not isinstance(exe, Executable):
-            exe = Executable(exe)
-
-        self.module = exe.mod["vm_load_executable"]()
-        self._exec = exe
-        self._init = self.module["init"]
-        self._invoke = self.module["invoke"]
-        self._invoke_stateful = self.module["invoke_stateful"]
-        self._get_output = self.module["get_output"]
-        self._get_num_outputs = self.module["get_num_outputs"]
-        self._get_input_index = self.module["get_input_index"]
-        self._set_input = self.module["set_input"]
-        self._set_one_input = self.module["set_one_input"]
-        self._set_outputs = self.module["set_outputs"]
-        self._setup_device(device, memory_cfg)
-
-    def _setup_device(self, dev, memory_cfg):
-        """Init devices and allocators."""
-        devs = dev
-        if not isinstance(dev, (list, tuple)):
-            if not isinstance(dev, tvm.runtime.Device):
-                raise TypeError("dev is expected to be Device or List[Device]")
-            devs = [dev]
-
-        # CPU is required for executing shape functions
-        if not any(c.device_type % RPC_SESS_MASK == tvm.cpu().device_type for c in devs):
-            devs.append(tvm.cpu())
-
-        default_alloc_type = VirtualMachine.POOLED_ALLOCATOR
-        if memory_cfg is None:
-            memory_cfg = {}
-        elif isinstance(memory_cfg, str):
-            assert memory_cfg in ["naive", "pooled"]
-            if memory_cfg == "naive":
-                default_alloc_type = VirtualMachine.NAIVE_ALLOCATOR
-            memory_cfg = {}
-        elif not isinstance(memory_cfg, dict):
-            raise TypeError(
-                f"memory_cfg is expected be string or dictionary, but received {type(memory_cfg)}"
-            )
-        init_args = []
-        for device in devs:
-            init_args.append(device.device_type % RPC_SESS_MASK)
-            init_args.append(device.device_id)
-            alloc_type = memory_cfg[device] if device in memory_cfg else default_alloc_type
-            init_args.append(alloc_type)
-        self._init(*init_args)
-
-    def set_input(self, func_name, *args, **kwargs):
-        """Set the input to a function.
-        If device type and device id for input tensor are the same as
-        for target one the zero copy is used. It means that internal
-        tensor is reference to memory allocated by input one.
-        Otherwise new internal NDarray is created and data is copied
-
-        Parameters
-        ----------
-        func_name : str
-            The name of the function.
-
-        args : list[tvm.runtime.NDArray] or list[np.ndarray]
-            The arguments to the function.
-
-        kwargs: dict of str to tvm.runtime.NDArray or np.ndarray
-            Named arguments to the function.
-        """
-        if kwargs:
-            # kwargs is a super set of the required function parameters. We
-            # only find the ones that are needed.
-            func_params = self._exec.get_function_params(func_name)
-            new_args = [None] * len(func_params)
-            cnt = 0
-            for k in kwargs:
-                if k in func_params:
-                    idx = func_params.index(k)
-                    new_args[idx] = kwargs[k]
-                    cnt += 1
-            assert len(args) + cnt == len(func_params)
-            idx = 0
-            for i, arg in enumerate(new_args):
-                if arg is None:
-                    new_args[i] = args[idx]
-                    idx += 1
-            args = new_args
-        cargs = convert(args)
-        self._set_input(func_name, *cargs)
-
-    def set_one_input(self, func_name, *args, **kwargs):
-        """Set the one input tensor with tag to a function.
-
-        Parameters
-        ----------
-        func_name : str
-            The name of the function.
-        args : [str or int, tvm.runtime.NDArray]
-            name or index of tensor and input tensor, optional
-        kwargs: dict of str or int to tvm.runtime.NDArray, optional
-            taged arguments to the function.
-        Only args or kwargs should exist
-        """
-        if kwargs:
-            assert len(kwargs) == 1
-            tag = next(iter(kwargs))
-            if isinstance(tag, str):
-                func_params = self._exec.get_function_params(func_name)
-                assert tag in func_params
-            self._set_one_input(func_name, tag, kwargs[tag])
-        else:
-            assert len(args) == 2
-            self._set_one_input(func_name, args[0], args[1])
-
-    def invoke(self, func_name, *args, **kwargs):
-        """Invoke a function.
-
-        Parameters
-        ----------
-        func_name : str
-            The name of the function.
-
-        args : list[tvm.runtime.NDArray] or list[np.ndarray]
-            The arguments to the function.
-
-        kwargs: dict of str to tvm.runtime.NDArray or np.ndarray
-            Named arguments to the function.
-
-        Returns
-        -------
-        result : Object
-            The output.
-        """
-        if args or kwargs:
-            self.set_input(func_name, *args, **kwargs)
-        return self._invoke(func_name)
-
-    def run(self, *args, **kwargs):
-        """Run the main function.
-
-        Parameters
-        ----------
-        args : list[tvm.runtime.NDArray] or list[np.ndarray]
-            The arguments to the function.
-
-        kwargs: dict of str to tvm.runtime.NDArray or np.ndarray
-            Named arguments to the function.
-
-        Returns
-        -------
-        result : Object
-            The output.
-        """
-        return self.invoke("main", *args, **kwargs)
-
-    def invoke_stateful(self, func_name, *args, **kwargs):
-        """Invoke a function and ignore the returned result.
-
-        Use this function when running over rpc because it is currently
-        impossible to return a ADT object over rpc. To get the outputs, use
-        :py:func`get_outputs`.
-
-        Parameters
-        ----------
-        func_name : str
-            The name of the function.
-
-        args : list[tvm.runtime.NDArray] or list[np.ndarray]
-            The arguments to the function.
-
-        kwargs: dict of str to tvm.runtime.NDArray or np.ndarray
-            Named arguments to the function.
-        """
-        if args or kwargs:
-            self.set_input(func_name, *args, **kwargs)
-        self._invoke_stateful(func_name)
-
-    def invoke_with_outputs(self, func_name, input_args, output_args):
-        # TODO(vvchernov): consider scenario then output tensors set once
-        """Invoke a function with pre-allocated output tensors.
-        The output tensors should be set every invocation.
-        input_args can be None if set_input method was used before.
-
-        This invoke method allows to avoid excess copying if memory for output tensors
-        was allocated before inference.
-
-        Parameters
-        ----------
-        func_name : str
-            The name of the function.
-
-        input_args: dict of str to tvm.runtime.NDArray or np.ndarray
-            Named arguments to the function.
-
-        output_args : list[tvm.runtime.NDArray] or list[DLTensor]
-            The output tensors of the function.
-        """
-        if input_args:
-            func_params = self._exec.get_function_params(func_name)
-            new_args = [None] * len(func_params)
-            cnt = 0
-            for k in input_args:
-                if k in func_params:
-                    idx = func_params.index(k)
-                    new_args[idx] = input_args[k]
-                    cnt += 1
-            assert cnt == len(func_params)
-        cargs = convert(new_args)
-        self._set_input(func_name, *cargs)
-        self._set_outputs(func_name, *output_args)
-        self._invoke(func_name)
-
-    def get_outputs(self):
-        """Get the outputs from a call to :py:func`invoke_stateful`.
-
-        Returns
-        -------
-        outputs : List[NDArray]
-        """
-        return [self._get_output(i) for i in range(self._get_num_outputs())]
-
-    def get_input_index(self, input_name, func_name="main"):
-        """Get inputs index via input name.
-        Parameters
-        ----------
-        name : str
-          The input key name
-        func_name : str
-          The function name
-
-        Returns
-        -------
-        index: int
-          The input index. -1 will be returned if the given input name is not found.
-        """
-        return self._get_input_index(input_name, func_name)
-
-    def benchmark(
-        self,
-        device,
-        *args,
-        func_name="main",
-        repeat=5,
-        number=5,
-        min_repeat_ms=None,
-        limit_zero_time_iterations=100,
-        end_to_end=False,
-        cooldown_interval_ms=0,
-        repeats_to_cooldown=1,
-        **kwargs,
-    ):
-        """Calculate runtime of a function by repeatedly calling it.
-
-        Use this function to get an accurate measurement of the runtime of a function. The function
-        is run multiple times in order to account for variability in measurements, processor speed
-        or other external factors.  Mean, median, standard deviation, min and max runtime are all
-        reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
-        synchonization and data transfer operations are not counted towards the runtime. This allows
-        for fair comparison of runtimes across different functions and models. The `end_to_end` flag
-        switches this behavior to include data transfer operations in the runtime.
-
-        The benchmarking loop looks approximately like so:
-
-        .. code-block:: python
-
-            for r in range(repeat):
-                time_start = now()
-                for n in range(number):
-                    func_name()
-                time_end = now()
-                total_times.append((time_end - time_start)/number)
-
-
-        Parameters
-        ----------
-        func_name : str
-            The function to benchmark
-
-        repeat : int
-            Number of times to run the outer loop of the timing code (see above). The output will
-            contain `repeat` number of datapoints.
-
-        number : int
-            Number of times to run the inner loop of the timing code. This inner loop is run in
-            between the timer starting and stopping. In order to amortize any timing overhead,
-            `number` should be increased when the runtime of the function is small (less than a 1/10
-            of a millisecond).
-
-        min_repeat_ms : Optional[int]
-            If set, the inner loop will be run until it takes longer than `min_repeat_ms`
-            milliseconds. This can be used to ensure that the function is run enough to get an
-            accurate measurement.
-
-        limit_zero_time_iterations : Optional[int]
-            The maximum number of repeats when measured time is equal to 0.
-            It helps to avoid hanging during measurements.
-
-        end_to_end : bool
-            If set, include time to transfer input tensors to the device and time to transfer
-            returned tensors in the total runtime. This will give accurate timings for end to end
-            workloads.
-
-        cooldown_interval_ms: Optional[int]
-            The cooldown interval in milliseconds between the number of repeats defined by
-            `repeats_to_cooldown`.
-
-        repeats_to_cooldown: Optional[int]
-            The number of repeats before the cooldown is activated.
-
-        args : Sequence[Object]
-            Arguments to the function. These are cached before running timing code, so that data
-            transfer costs are not counted in the runtime.
-
-        kwargs : Dict[str, Object]
-            Named arguments to the function. These are cached like `args`.
-
-        Returns
-        -------
-        timing_results : BenchmarkResult
-            Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
-            access the individual runtimes (in seconds).
-        """
-        min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
-        if end_to_end:
-            # We need to unpack keyword arguments into positional arguments
-            packed_args = list(args)
-            for k, v in kwargs.items():
-                i = self.get_input_index(k, func_name)
-                if i < 0:
-                    raise TypeError(f"{func_name}() got an unexpected keyword argument '{k}'")
-                while i >= len(packed_args):
-                    packed_args.append(None)
-                packed_args[i] = v
-            return self.module.time_evaluator(
-                "invoke_return_to_device",
-                device,
-                repeat=repeat,
-                number=number,
-                min_repeat_ms=min_repeat_ms,
-                limit_zero_time_iterations=limit_zero_time_iterations,
-            )(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args)
-        if args or kwargs:
-            self.set_input(func_name, *args, **kwargs)
-        return self.module.time_evaluator(
-            "invoke",
-            device,
-            repeat=repeat,
-            number=number,
-            min_repeat_ms=min_repeat_ms,
-            limit_zero_time_iterations=limit_zero_time_iterations,
-            cooldown_interval_ms=cooldown_interval_ms,
-            repeats_to_cooldown=repeats_to_cooldown,
-        )(func_name)
diff --git a/python/tvm/script/parser/tir/parser.py b/python/tvm/script/parser/tir/parser.py
index 3107354ac353..9cc3e785febb 100644
--- a/python/tvm/script/parser/tir/parser.py
+++ b/python/tvm/script/parser/tir/parser.py
@@ -454,7 +454,7 @@ def visit_expr_stmt(self: Parser, node: doc.Expr) -> None:
         T.evaluate(res)
     elif isinstance(res, (int, bool)):
         T.evaluate(tvm.tir.const(res))
-    elif isinstance(res, (tvm.relay.Call, tvm.relax.Call)) and not res.args:
+    elif isinstance(res, tvm.relax.Call) and not res.args:
         # Using GlobalVar.__call__ with no arguments is ambiguous, as
         # each IR has a different function Call representation.  If
         # this occurs, convert to the TIR representation.
diff --git a/python/tvm/target/datatype.py b/python/tvm/target/datatype.py
index 03dfb9995e6d..aaf30afaf535 100644
--- a/python/tvm/target/datatype.py
+++ b/python/tvm/target/datatype.py
@@ -303,7 +303,7 @@ def lower(op):
             key = (src_bits, t.bits)
 
         if key not in extern_func_map:
-            raise RuntimeError(f"missing key {key} in extern_func_map for {op.astext()}")
+            raise RuntimeError(f"missing key {key} in extern_func_map for {op}")
 
         if isinstance(op, _Cast):
             return call_pure_extern(dtype, extern_func_map[key], op.value)
@@ -314,7 +314,7 @@ def lower(op):
         if isinstance(op, _BinaryOpExpr):
             return call_pure_extern(dtype, extern_func_map[key], op.a, op.b)
 
-        raise RuntimeError(f"lowering unsupported op: {op.astext()}")
+        raise RuntimeError(f"lowering unsupported op: {op}")
 
     return lower
 
diff --git a/python/tvm/testing/__init__.py b/python/tvm/testing/__init__.py
index 041207b66fe7..2a4516e537a2 100644
--- a/python/tvm/testing/__init__.py
+++ b/python/tvm/testing/__init__.py
@@ -17,7 +17,6 @@
 
 # pylint: disable=redefined-builtin, wildcard-import
 """Utility Python functions for TVM testing"""
-from . import auto_scheduler, autotvm
 from ._ffi_api import (
     ErrorTest,
     FrontendTestModule,
diff --git a/python/tvm/testing/auto_scheduler.py b/python/tvm/testing/auto_scheduler.py
deleted file mode 100644
index bc335c82d324..000000000000
--- a/python/tvm/testing/auto_scheduler.py
+++ /dev/null
@@ -1,266 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, missing-function-docstring
-"""Common functions for auto_scheduler test cases"""
-import tvm
-from tvm import auto_scheduler, te, topi
-from tvm.topi.nn.winograd_util import winograd_transform_matrices
-from tvm.topi.utils import get_const_tuple
-
-
-@auto_scheduler.register_workload
-def matmul_auto_scheduler_test(N, M, K):
-    A = te.placeholder((N, K), name="A")
-    B = te.placeholder((K, M), name="B")
-    k = te.reduce_axis((0, K), name="k")
-    C = te.compute(
-        (N, M),
-        lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]),
-        name="C",
-        attrs={"layout_free_placeholders": [B]},
-    )
-    return [A, B, C]
-
-
-@auto_scheduler.register_workload
-def double_matmul_auto_scheduler_test(N):
-    A = te.placeholder((N, N), name="A", dtype="float32")
-    B = te.placeholder((N, N), name="B", dtype="float32")
-    C = te.placeholder((N, N), name="C", dtype="float32")
-    k = te.reduce_axis((0, N), name="k")
-    D = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="D")
-    k = te.reduce_axis((0, N), name="k")
-    E = te.compute((N, N), lambda i, j: te.sum(D[i][k] * C[k][j], axis=[k]), name="E")
-
-    return [A, B, C, E]
-
-
-@auto_scheduler.register_workload
-def parallel_matmul_auto_scheduler_test(N):
-    """Two parallel matmuls with shared A."""
-    A = te.placeholder((N, N), name="A", dtype="float32")
-    B = te.placeholder((N, N), name="B", dtype="float32")
-    C = te.placeholder((N, N), name="C", dtype="float32")
-    k = te.reduce_axis((0, N), name="k")
-    D = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="D")
-    k = te.reduce_axis((0, N), name="k")
-    E = te.compute((N, N), lambda i, j: te.sum(A[i][k] * C[k][j], axis=[k]), name="E")
-
-    return [A, B, C, D, E]
-
-
-# Test for register_workload with different name
-@auto_scheduler.register_workload("matmul_auto_scheduler_test_rename_1")
-def matmul_auto_scheduler_test_rename_0(N, M, K):
-    A = te.placeholder((N, K), name="A")
-    B = te.placeholder((K, M), name="B")
-    k = te.reduce_axis((0, K), name="k")
-    C = te.compute((N, M), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-    return [A, B, C]
-
-
-@auto_scheduler.register_workload
-def conv2d_nchw_bn_relu_auto_scheduler_test(
-    N, H, W, CI, CO, kernel_size, strides, padding, dilation=1
-):
-    data = te.placeholder((N, CI, H, W), name="Data")
-    kernel = te.placeholder((CO, CI, kernel_size, kernel_size), name="Kernel")
-    bias = te.placeholder((CO, 1, 1), name="Bias")
-    bn_scale = te.placeholder((CO, 1, 1), name="Bn_scale")
-    bn_offset = te.placeholder((CO, 1, 1), name="Bn_offset")
-
-    OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-    OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    conv = te.compute(
-        (N, CO, OH, OW), lambda i, j, k, l: conv[i, j, k, l] + bias[j, 0, 0], name="Bias_add"
-    )
-    conv = te.compute(
-        (N, CO, OH, OW), lambda i, j, k, l: conv[i, j, k, l] * bn_scale[j, 0, 0], name="Bn_mul"
-    )
-    conv = te.compute(
-        (N, CO, OH, OW), lambda i, j, k, l: conv[i, j, k, l] + bn_offset[j, 0, 0], name="Bn_add"
-    )
-    out = topi.nn.relu(conv)
-
-    return [data, kernel, bias, bn_offset, bn_scale, out]
-
-
-@auto_scheduler.register_workload
-def max_pool2d_auto_scheduler_test(N, H, W, CI, padding):
-    data = te.placeholder((N, CI, H, W), name="Data")
-    out = topi.nn.pool2d(data, [2, 2], [1, 1], [1, 1], [padding, padding, padding, padding], "max")
-
-    return [data, out]
-
-
-@auto_scheduler.register_workload
-def min_nm_auto_scheduler_test(N, M):
-    A = te.placeholder((N, M), name="A")
-    B = topi.min(A, axis=-1)
-
-    return [A, B]
-
-
-@auto_scheduler.register_workload
-def softmax_nm_auto_scheduler_test(N, M):
-    A = te.placeholder((N, M), name="A")
-    B = topi.nn.softmax(A, axis=1)
-
-    return [A, B]
-
-
-@auto_scheduler.register_workload
-def softmax_abcd_auto_scheduler_test(a, b, c, d):
-    A = te.placeholder((a, b, c, d), name="A")
-    B = topi.nn.softmax(A, axis=-1)
-
-    return [A, B]
-
-
-@auto_scheduler.register_workload
-def invalid_compute_definition():
-    A = te.placeholder((10, 10), name="A")
-    # The names of the following two iterators are the same.
-    # This is invalid.
-    r1 = te.reduce_axis((0, 2), name="r1")
-    r2 = te.reduce_axis((0, 2), name="r1")
-    B = te.compute((10,), lambda i: te.sum(A[i][r1 + r2], axis=[r1, r2]), name="B")
-    return [A, B]
-
-
-@auto_scheduler.register_workload
-def zero_rank_reduce_auto_scheduler_test(N):
-    A = tvm.te.placeholder((N,), name="A")
-    k = tvm.te.reduce_axis((0, N), name="k")
-    B = tvm.te.compute((), lambda: tvm.te.sum(A[k], k), name="B")
-
-    return [A, B]
-
-
-@auto_scheduler.register_workload
-def zero_rank_compute_auto_scheduler_test(N):
-    A = tvm.te.placeholder((N,), name="A")
-    B = tvm.te.compute((), lambda: A[0], name="B")
-
-    return [A, B]
-
-
-@auto_scheduler.register_workload
-def conv2d_winograd_nhwc_auto_scheduler_test(
-    N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1
-):
-    tile_size = 4
-    inputs = te.placeholder((N, H, W, CI), name="inputs")
-    N, H, W, CI = get_const_tuple(inputs.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
-
-    KH = KW = kernel_size
-    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
-    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
-    assert HSTR == 1 and WSTR == 1 and KH == KW
-
-    data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, _ = winograd_transform_matrices(m, r, "float32")
-
-    H = (H + 2 * HPAD - KH) // HSTR + 1
-    W = (W + 2 * WPAD - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-    kshape = (alpha, alpha, CI, CO)
-    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")
-
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    # pack input tile
-    input_tile = te.compute(
-        (alpha, alpha, P, CI),
-        lambda eps, nu, p, ci: data_pad[idxdiv(p, (nH * nW))][idxmod(idxdiv(p, nW), nH) * m + eps][
-            idxmod(p, nW) * m + nu
-        ][ci],
-        name="input_tile",
-    )
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    data_pack = te.compute(
-        (alpha, alpha, P, CI),
-        lambda eps, nu, p, ci: te.sum(
-            input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="data_pack",
-        attrs={"auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"]},
-    )
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    bgemm = te.compute(
-        (alpha, alpha, P, CO),
-        lambda eps, nu, p, co: te.sum(
-            data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][ci][co], axis=[ci]
-        ),
-        name="bgemm",
-    )
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    inverse = te.compute(
-        (m, m, P, CO),
-        lambda vh, vw, p, co: te.sum(
-            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
-        ),
-        name="inverse",
-        attrs={"auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"]},
-    )
-
-    # output
-    output = te.compute(
-        (N, H, W, CO),
-        lambda n, h, w, co: inverse[
-            idxmod(h, m), idxmod(w, m), n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), co
-        ],
-        name="conv2d_winograd",
-    )
-
-    return [inputs, kernel_pack, output]
-
-
-def get_tiled_matmul():
-    """Get a compute dag and a state for tiled matmul"""
-    A, B, C = matmul_auto_scheduler_test(512, 512, 512)
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-
-    s0 = dag.get_init_state()
-    its0 = s0.split(C, s0[C].iters[0], [4, 8, 8])
-    its1 = s0.split(C, s0[C].iters[4], [8, 4, 4])
-    s0.reorder(
-        C, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], its0[3], its1[3], s0[C].iters[8]]
-    )
-
-    return dag, s0
diff --git a/python/tvm/testing/autotvm.py b/python/tvm/testing/autotvm.py
deleted file mode 100644
index b1132cd1faa7..000000000000
--- a/python/tvm/testing/autotvm.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, missing-function-docstring, missing-class-docstring
-"""Common utilities for testing autotvm"""
-import time
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm import MeasureInput, MeasureResult
-from tvm.autotvm.measure.measure import Runner
-
-
-class DummyRunner(Runner):
-    def __init__(self):
-        super(DummyRunner, self).__init__(1, 1)
-
-    def run(self, measure_inputs, build_results):
-        return [
-            MeasureResult((np.random.random(),), 0, 0.2, time.time())
-            for _ in range(len(measure_inputs))
-        ]
-
-    def get_build_kwargs(self):
-        return {}
-
-
-@autotvm.template("testing/matmul")
-def matmul(N, L, M, dtype):
-    A = te.placeholder((N, L), name="A", dtype=dtype)
-    B = te.placeholder((L, M), name="B", dtype=dtype)
-
-    k = te.reduce_axis((0, L), name="k")
-    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
-    s = te.create_schedule(C.op)
-
-    # schedule
-    y, x = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    ##### define space begin #####
-    cfg = autotvm.get_config()
-    cfg.define_split("tile_y", y, num_outputs=2)
-    cfg.define_split("tile_x", x, num_outputs=2)
-    ##### define space end #####
-
-    # schedule according to config
-    yo, yi = cfg["tile_y"].apply(s, C, y)
-    # Make sure configurations have a varied number of itervars. Splitting adds
-    # new itervars, so conditionally splitting with cause the number of
-    # itervars to depend on the tile size.
-    if cfg["tile_x"].size[-1] > 1:
-        xo, xi = cfg["tile_x"].apply(s, C, x)
-        s[C].reorder(yo, xo, k, yi, xi)
-    else:
-        s[C].reorder(yo, k, yi, x)
-
-    return s, [A, B, C]
-
-
-@autotvm.template("testing/bad_matmul")
-def bad_matmul(N, L, M, dtype):
-    if "bad_device" in tvm.target.Target.current().keys:
-        A = te.placeholder((N, L), name="A", dtype=dtype)
-        B = te.placeholder((L, M), name="B", dtype=dtype)
-
-        k = te.reduce_axis((0, L - 1), name="k")
-        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
-        s = te.create_schedule(C.op)
-
-        # schedule
-        y, x = s[C].op.axis
-        cfg = autotvm.get_config()
-        cfg.define_split("tile_y", y, num_outputs=2)
-        cfg.define_split("tile_x", x, num_outputs=2)
-        return s, [A, B, C]
-
-    return matmul(N, L, M, dtype)
-
-
-def get_sample_task(n=128):
-    """return a sample task for testing"""
-    target = tvm.target.Target("llvm")
-    task = autotvm.task.create("testing/matmul", args=(n, n, n, "float32"), target=target)
-    return task, target
-
-
-def get_sample_records(n):
-    """get sample records for testing"""
-    tsk, target = get_sample_task()
-
-    inps, ress = [], []
-    for i in range(n):
-        inps.append(MeasureInput(target, tsk, tsk.config_space.get(i % len(tsk.config_space))))
-        ress.append(MeasureResult((i + 1,), 0, i, time.time()))
-    return list(zip(inps, ress))
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 8546d4aef233..8df32c810543 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -90,11 +90,9 @@ def test_something():
 import tvm.te
 import tvm._ffi
 
-from tvm import relay
 from tvm.target import codegen
-from tvm.contrib import nvcc, cudnn, rocm, graph_executor
+from tvm.contrib import nvcc, cudnn, rocm
 import tvm.contrib.hexagon._ci_env_check as hexagon
-from tvm.driver.tvmc.frontends import load_model
 from tvm.error import TVMError
 import tvm.contrib.utils
 
@@ -1647,35 +1645,6 @@ def get_dtype_range(dtype: str) -> Tuple[int, int]:
     return type_info.min, type_info.max
 
 
-def generate_ref_data(mod, input_data, params=None, target="llvm"):
-    """Generate reference data through executing the relay module"""
-    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
-        lib = relay.build(mod, target=target, params=params)
-
-    lib_name = "mod.so"
-    temp = tvm.contrib.utils.tempdir()
-    lib_path = temp.relpath(lib_name)
-    lib.export_library(lib_path)
-    lib = tvm.runtime.load_module(lib_path)
-    grt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu()))
-    grt_mod.set_input(**input_data)
-    grt_mod.run()
-    output_count = grt_mod.get_num_outputs()
-    out = [grt_mod.get_output(i).numpy() for i in range(output_count)]
-    if isinstance(mod, tvm.relay.Function):
-        main = mod
-    else:
-        main = mod["main"]
-    if "output_tensor_names" in main.attrs:
-        output_tensor_names = main.attrs["output_tensor_names"]
-    else:
-        output_tensor_names = (
-            ["output"] if output_count == 1 else [f"output{i}" for i in range(output_count)]
-        )
-
-    return dict(zip(output_tensor_names, out))
-
-
 class _DeepCopyAllowedClasses(dict):
     def __init__(self, allowed_class_list):
         self.allowed_class_list = allowed_class_list
@@ -1831,8 +1800,8 @@ def terminate_self():
 def is_ampere_or_newer():
     """Check if the target environment has an NVIDIA Ampere GPU or newer."""
     arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    return major >= 8
+    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
+    return major >= 8 and minor != 9
 
 
 def install_request_hook(depth: int) -> None:
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index fc316fd19307..3588c04d8fa2 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -47,24 +47,12 @@
 from .unique import *
 from .searchsorted import *
 from .signal import *
-from . import generic
 from . import nn
-from . import x86
-from . import cuda
-from . import gpu
-from . import arm_cpu
-from . import mali
-from . import bifrost
-from . import intel_graphics
 from . import utils
-from . import rocm
 from . import vision
 from . import image
-from . import sparse
-from . import hls
 from . import random
-from . import hexagon
-from . import adreno
+from . import gpu
 
 # error reporting
 from .utils import InvalidShapeError
diff --git a/python/tvm/topi/adreno/__init__.py b/python/tvm/topi/adreno/__init__.py
deleted file mode 100644
index 2c0ed20f1011..000000000000
--- a/python/tvm/topi/adreno/__init__.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""Qualcomm Adreno GPU specific declaration and schedules."""
-from .conv2d_nchw import *
-from .depthwise_conv2d_nchw import *
-from .conv2d_nhwc import *
-from .group_conv2d_nchw import *
-from .depthwise_conv2d_nhwc import *
-from .pooling import *
-from .conv2d_alter_op import *
-from .conv2d_transpose_alter_op import *
-from .conv2d_nchw_winograd import *
-from .conv2d_nhwc_winograd import *
-from .injective import schedule_injective
-from .reduction import *
-from .conv2d_transpose_nchw import *
diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py
deleted file mode 100644
index 47030606ddfb..000000000000
--- a/python/tvm/topi/adreno/conv2d_alter_op.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D alter op for Qualcomm Adreno GPU"""
-
-import logging
-
-import re
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-from ..utils import get_const_tuple
-from .utils import infer_tile_size
-from ..nn import conv2d_alter_layout
-
-logger = logging.getLogger("topi")
-
-# Number of wildcards for matching of supported layouts to be transformed
-_NCHWc_matcher = re.compile("^NCHW[0-9]+c$")
-_OIHWo_matcher = re.compile("^OIHW[0-9]+o$")
-_NHWCc_matcher = re.compile("^NHWC[0-9]+c$")
-_HWIOo_matcher = re.compile("^HWIO[0-9]+o$")
-_HWOIo_matcher = re.compile("^HWOI[0-9]+o$")
-
-
-@conv2d_alter_layout.register("adreno")
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    """
-    Prepare of the new conv2d with proper target blocked layout attributes
-    OpenCL Textures supports 1d/2d/3d/4d tetures but read happens always only for 4 elements
-    in a line. Thus way we are supporting for now only 4d conversions on the end
-    NCHW -> NCHW4c & OIHW ->OIHW4o
-    NHWC -> NHWC4c & HWIO -> HWIO4o & HWOI -> HWOI4o
-    """
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    # Parse the attributes.
-    padding = attrs.get_int_tuple("padding")
-    strides = attrs.get_int_tuple("strides")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data_tensor, kernel_tensor = tinfos
-    data_dtype = data_tensor.dtype
-    out_dtype = out_type.dtype
-
-    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
-        cfg = dispatch_ctx.query(target, None)
-        workload = cfg.workload
-    else:
-        impl, outs = relay.backend.te_compiler.select_implementation(
-            relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-        )
-        workload = autotvm.task.get_workload(outs)
-        if workload is None:
-            if impl.name.find("winograd") != -1:
-                if dilation != (1, 1):
-                    logger.warning("Does not support weight pre-transform for dilated convolution.")
-                    return None
-
-                assert (data_layout == "NCHW" and kernel_layout == "OIHW") or (
-                    data_layout == "NHWC" and kernel_layout == "HWIO"
-                )
-                if data_layout == "NCHW":
-                    N, CI, H, W = get_const_tuple(data_tensor.shape)
-                    CO, _, KH, KW = get_const_tuple(kernel_tensor.shape)
-                    weight = inputs[1]
-                else:
-                    N, H, W, CI = get_const_tuple(data_tensor.shape)
-                    KH, KW, _, CO = get_const_tuple(kernel_tensor.shape)
-                    weight = relay.layout_transform(inputs[1], "HWIO", "OIHW")
-
-                # Pre-compute weight transformation in winograd
-                tile_size = infer_tile_size(data_tensor, data_layout)
-
-                # alpha, alpha, CO, CI
-                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-                    weight, tile_size=tile_size
-                )
-                new_attrs["tile_size"] = tile_size
-                new_attrs["channels"] = CO
-                return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                    inputs[0], weight, **new_attrs
-                )
-            return None
-
-        cfg = dispatch_ctx.query(target, workload)
-
-    topi_tmpl = workload[0]
-
-    if "conv2d_nchw_winograd" in topi_tmpl:
-        suffix = "_acc32" if "acc32" in topi_tmpl else ""
-        wkl_name = "conv2d_nchw_winograd_without_weight_transform" + suffix + ".image2d"
-        if dilation != (1, 1):
-            logger.warning("Does not support weight pre-transform for dilated convolution.")
-            return None
-
-        tile_size = infer_tile_size(data_tensor, data_layout)
-        if len(data_tensor.shape) == 5:
-            assert data_layout == "NCHW4c" and kernel_layout == "OIHW4o"
-            N, CI, H, W, CB = get_const_tuple(data_tensor.shape)
-            CO, _, KH, KW, COB = get_const_tuple(kernel_tensor.shape)
-            weight = relay.layout_transform(inputs[1], "OIHW4o", "OIHW")
-            weight = relay.nn.contrib_conv2d_winograd_weight_transform(weight, tile_size=tile_size)
-            weight = relay.layout_transform(weight, "HWOI", "HWIO4o")
-
-            new_attrs["tile_size"] = tile_size
-            new_attrs["channels"] = CO * COB
-
-            new_data = data_tensor
-            new_weight = te.placeholder(
-                (KH + tile_size - 1, KW + tile_size - 1, CI * CB, CO, COB),
-                dtype=kernel_tensor.dtype,
-            )
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_weight, strides, padding, dilation, out_dtype], wkl_name
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                inputs[0], weight, **new_attrs
-            )
-
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data_tensor.shape)
-        CO, _, KH, KW = get_const_tuple(kernel_tensor.shape)
-
-        # pre-compute weight transformation in winograd
-        # alpha, alpha, CO, CI
-        weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], tile_size=tile_size)
-        weight = relay.transpose(weight, axes=[2, 3, 0, 1])  # HWOI -> OIHW
-        # (oc, ic, h, w) -> (h, w, ic, oc)
-        new_attrs["kernel_layout"] = "HWIO"
-        new_attrs["tile_size"] = tile_size
-        new_attrs["channels"] = CO
-
-        # Store the same config for the altered operator (workload)
-        new_data = data_tensor
-        new_weight = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel_tensor.dtype
-        )
-        in_channel_block = CI % 4
-        if in_channel_block == 0:
-            in_channel_block = 4
-        num_filter_block = CO % 4
-        if num_filter_block == 0:
-            num_filter_block = 4
-
-        if in_channel_block != 4 or num_filter_block != 4:
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_weight, strides, padding, dilation, out_dtype], wkl_name
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                inputs[0], weight, **new_attrs
-            )
-
-        new_attrs["data_layout"] = f"NCHW{in_channel_block}c"
-        # (oc, ic, h, w) -> (h, w, ic, oc // 4, oc % 4)
-        new_attrs["kernel_layout"] = f"HWIO{num_filter_block}o"
-        new_attrs["out_layout"] = f"NCHW{num_filter_block}c"
-        # Store altered operator's config
-        new_data = te.placeholder(
-            (N, CI // in_channel_block, H, W, in_channel_block), dtype=data_dtype
-        )
-        new_weight = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, CI, CO // num_filter_block, num_filter_block),
-            dtype=kernel_tensor.dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_weight, strides, padding, dilation, out_dtype], wkl_name
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight, **new_attrs
-        )
-
-    if "conv2d_nhwc_winograd" in topi_tmpl:
-        suffix = "_acc32" if "acc32" in topi_tmpl else ""
-        wkl_name = "conv2d_nhwc_winograd_without_weight_transform" + suffix + ".image2d"
-        if dilation != (1, 1):
-            logger.warning("Does not support weight pre-transform for dilated convolution.")
-            return None
-
-        tile_size = infer_tile_size(data_tensor, data_layout)
-        if len(data_tensor.shape) == 5:
-            assert data_layout == "NHWC4c" and kernel_layout == "HWIO4o"
-            N, CI, H, W, CB = get_const_tuple(data_tensor.shape)
-            KH, KW, _, CO, COB = get_const_tuple(kernel_tensor.shape)
-            weight = relay.layout_transform(inputs[1], "HWIO4o", "OIHW")
-            weight = relay.nn.contrib_conv2d_winograd_weight_transform(weight, tile_size=tile_size)
-            weight = relay.layout_transform(weight, "HWOI", "HWIO4o")
-
-            new_attrs["tile_size"] = tile_size
-            new_attrs["channels"] = CO * COB
-
-            new_data = data_tensor
-            new_weight = te.placeholder(
-                (KH + tile_size - 1, KW + tile_size - 1, CI * CB, CO, COB),
-                dtype=kernel_tensor.dtype,
-            )
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_weight, strides, padding, dilation, out_dtype], wkl_name
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                inputs[0], weight, **new_attrs
-            )
-
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-        N, H, W, CI = get_const_tuple(data_tensor.shape)
-        KH, KW, _, CO = get_const_tuple(kernel_tensor.shape)
-
-        # pre-compute weight transformation in winograd
-        weight = relay.layout_transform(inputs[1], "HWIO", "OIHW")
-        weight = relay.nn.contrib_conv2d_winograd_weight_transform(weight, tile_size=tile_size)
-        weight = relay.transpose(weight, axes=[0, 1, 3, 2])  # HWOI -> HWIO
-        new_attrs["tile_size"] = tile_size
-        new_attrs["channels"] = CO
-
-        # Store the same config for the altered operator (workload)
-        new_data = data_tensor
-        new_weight = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel_tensor.dtype
-        )
-        in_channel_block = CI % 4
-        if in_channel_block == 0:
-            in_channel_block = 4
-        num_filter_block = CO % 4
-        if num_filter_block == 0:
-            num_filter_block = 4
-
-        if in_channel_block != 4 or num_filter_block != 4:
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_weight, strides, padding, dilation, out_dtype], wkl_name
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                inputs[0], weight, **new_attrs
-            )
-
-        new_attrs["data_layout"] = f"NHWC{in_channel_block}c"
-        # (oc, ic, h, w) -> (h, w, ic, oc // 4, oc % 4)
-        new_attrs["kernel_layout"] = f"HWIO{num_filter_block}o"
-        new_attrs["out_layout"] = f"NHWC{num_filter_block}c"
-        # Store altered operator's config
-        new_data = te.placeholder(
-            (N, H, W, CI // in_channel_block, in_channel_block), dtype=data_dtype
-        )
-        new_weight = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, CI, CO // num_filter_block, num_filter_block),
-            dtype=kernel_tensor.dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_weight, strides, padding, dilation, out_dtype], wkl_name
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight, **new_attrs
-        )
-
-    if "conv2d_nchwc" in topi_tmpl:  # covers both conv2d_nchwc and depthwise_conv2d_nchwc
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            batch, in_channels, in_height, in_width = data_tensor.shape
-            out_channles, _, kernel_h, kernel_w = kernel_tensor.shape
-            in_channel_block = in_channels % 4
-            if in_channel_block == 0:
-                in_channel_block = 4
-            num_filter_block = out_channles % 4
-            if num_filter_block == 0:
-                num_filter_block = 4
-
-            # no support yet for tensors that cannot be divisible by factor 4
-            if num_filter_block != 4:
-                return None
-
-            batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-            out_channel, in_filter_channel, kh, kw = get_const_tuple(kernel_tensor.shape)
-
-            # update new attrs
-            new_attrs["channels"] = out_channel
-            if in_channel_block == 4:
-                new_attrs["data_layout"] = f"NCHW{in_channel_block}c"
-            else:
-                new_attrs["data_layout"] = "NCHW"
-            # (oc, ic, h, w) -> (OC, ic, h, w, oc)
-            new_attrs["kernel_layout"] = f"OIHW{num_filter_block}o"
-            new_attrs["out_layout"] = f"NCHW{num_filter_block}c"
-
-            # Store altered operator's config for applying of tuned AutoTVM statistics
-            if in_channel_block == 4:
-                new_data = te.placeholder(
-                    (batch_size, in_channel // in_channel_block, height, width, in_channel_block),
-                    dtype=data_dtype,
-                )
-            else:
-                new_data = data_tensor
-            new_kernel = te.placeholder(
-                (out_channel // num_filter_block, in_filter_channel, kh, kw, num_filter_block),
-                dtype=kernel_tensor.dtype,
-            )
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_kernel, strides, padding, dilation, out_dtype],
-                topi_tmpl,  # "conv2d_nchwc.image2d",
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-        else:
-            assert _NCHWc_matcher.match(data_layout)
-            assert _OIHWo_matcher.match(kernel_layout)
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    if "conv2d_nhwc" in topi_tmpl:  # covers both conv2d_nhwcc and depthwise_conv2d_nhwcc
-        if (data_layout == "NHWC" and kernel_layout == "HWIO") or (
-            data_layout == "NHWC" and kernel_layout == "HWOI"
-        ):
-            if kernel_layout == "HWIO":
-                batch_size, in_height, in_width, in_channels = data_tensor.shape
-                kernel_h, kernel_w, in_filter_channel, out_channles = kernel_tensor.shape
-            else:
-                batch_size, in_height, in_width, in_channels = data_tensor.shape
-                kernel_h, kernel_w, out_channles, in_filter_channel = kernel_tensor.shape
-            in_channel_block = in_channels % 4
-            if in_channel_block == 0:
-                in_channel_block = 4
-            num_filter_block = out_channles % 4
-            if num_filter_block == 0:
-                num_filter_block = 4
-
-            # no support yet for tensors cannot be divisible by factor 4
-            if num_filter_block != 4:
-                return None
-
-            # update new attrs
-            new_attrs["channels"] = out_channles
-            if in_channel_block == 4:
-                new_attrs["data_layout"] = f"NHWC{in_channel_block}c"
-            else:
-                new_attrs["data_layout"] = "NHWC"
-            # (h, w, ic, oc) -> (h, w, ic, OC, oc)
-            if kernel_layout == "HWIO":
-                new_attrs["kernel_layout"] = f"HWIO{num_filter_block}o"
-            else:
-                new_attrs["kernel_layout"] = f"HWOI{num_filter_block}o"
-            new_attrs["out_layout"] = f"NHWC{num_filter_block}c"
-
-            # Store altered operator's config for applying of tuned AutoTVM statistics
-            if in_channel_block == 4:
-                new_data = te.placeholder(
-                    (
-                        batch_size,
-                        in_height,
-                        in_width,
-                        in_channels // in_channel_block,
-                        in_channel_block,
-                    ),
-                    dtype=data_dtype,
-                )
-            else:
-                new_data = data_tensor
-            if kernel_layout == "HWIO":
-                new_kernel = te.placeholder(
-                    (
-                        kernel_h,
-                        kernel_w,
-                        in_filter_channel,
-                        out_channles // num_filter_block,
-                        num_filter_block,
-                    ),
-                    dtype=kernel_tensor.dtype,
-                )
-            else:
-                new_kernel = te.placeholder(
-                    (
-                        kernel_h,
-                        kernel_w,
-                        out_channles // num_filter_block,
-                        in_filter_channel,
-                        num_filter_block,
-                    ),
-                    dtype=kernel_tensor.dtype,
-                )
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_kernel, strides, padding, dilation, out_dtype], topi_tmpl
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-        else:
-            assert _NHWCc_matcher.match(data_layout)
-            assert _HWIOo_matcher.match(kernel_layout) or _HWOIo_matcher.match(kernel_layout)
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    return None
diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py
deleted file mode 100644
index bd128ed7bf75..000000000000
--- a/python/tvm/topi/adreno/conv2d_nchw.py
+++ /dev/null
@@ -1,367 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""conv2d nchw schedule on Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from ..utils import get_const_tuple, traverse_inline
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    expand_spatial_dimensions,
-    add_pad,
-    bind_data_copy,
-    get_default_conv2d_config,
-    get_texture_storage,
-)
-
-
-@autotvm.register_topi_schedule("conv2d_nchwc.image2d")
-def schedule_conv2d_nchwc(cfg, outs):
-    """Create the schedule for conv2d_nchw"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "adreno_conv2d_latest_op":
-            schedule_conv2d_NCHWc_KCRSk(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nchwc.image2d")
-def conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """
-    Convolution operator in NCHWc layout.
-    Algo:
-      1. Convert into blocked format if we have 4d original tensor.
-         In case of AutoTVM we override the convert by just tensors since such conversion
-         will be absent for real blocked convolution, no sense to include into tuning
-      2. Expand spatial dimensions to have width and height be dividable by factor 4
-         This leads to slightly bigger amount of compute but allow utilize GPU much better
-      3. Add paddings. This happens even if we do not need pad originaly. This is useful
-         due to work arounding of the gaps of texture annotation between Primary Functions
-         and limited support of textures in schedules. Later on this pad will be executed
-         separately and will produce texture
-      4. 5d Convolution compute with accumulating into out_dtype
-      5. Cast to the origin output data type
-      6. For case of 4d convolution: convert of output from 5d to 4d
-    """
-
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    convert_from4d = False
-    if len(Input.shape) == 4:
-        batch, in_channels, in_height, in_width = Input.shape
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block)
-            Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-        else:
-            Input = pack_input(
-                Input,
-                "NCHW",
-                batch,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                in_height,
-                in_width,
-            )
-    else:
-        batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape
-
-    if len(Filter.shape) == 4:
-        out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
-        else:
-            convert_from4d = True
-            Filter = pack_filter(
-                Filter,
-                "OIHW",
-                out_channel_chunks,
-                out_channel_block,
-                out_channel_tail,
-                in_filter_channels,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                kernel_h,
-                kernel_w,
-            )
-    else:
-        out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape
-
-    out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
-        in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w
-    )
-
-    temp = add_pad(
-        Input,
-        "NCHW",
-        out_height_orig,
-        out_width_orig,
-        kernel_h,
-        kernel_w,
-        dilation_h,
-        dilation_w,
-        padding,
-        stride_h,
-        stride_w,
-    )
-
-    rcc = te.reduce_axis((0, in_channel_chunks), name="rc")
-    rcb = te.reduce_axis((0, in_channel_block), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    conv = te.compute(
-        (batch, out_channel_chunks, out_height, out_width, out_channel_block),
-        lambda nn, ffc, yy, xx, ffb: te.sum(
-            (
-                temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb]
-                * Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb]
-            ).astype(out_dtype),
-            axis=[rcc, rcb, ry, rx],
-        ),
-        tag="conv2d_nchwc",
-    )
-
-    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
-        dummy_cast = te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
-            tag="dummy_cast",
-        )
-        return te.compute(
-            (batch, out_channles, out_height_orig, out_width_orig),
-            lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block],
-            tag="adreno_conv2d_latest_op",
-        )
-    else:
-        return te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
-            tag="adreno_conv2d_latest_op",
-        )
-
-
-def schedule_conv2d_NCHWc_KCRSk(cfg, s, output):
-    """
-    schedule optimized for batch size = 1
-
-    Algo:
-    1. Split output axis to three parts: global work size, vthread, local worksize.
-       The limitations for tuning includes heuristics from some tuned networks to limit
-       search space and not pay much time for useles configurations.
-    2. In case of 4d convolution schedule copying of the input (and filter) into
-      5d tensors
-    4. pad should be scheduled separately to create independent opencl kernel. If pad is
-       inlined into convolution, this gives 1.5x performance drop
-    5. We are using cache_read for intermediate tensors to produce texture and guarantee
-       the best performance on the next stage.
-       The weights are managed through static texture planning mechanism and guarantied come
-       in texture memory scope.
-       Thus way we are calling cache_read only for data tensor
-    6. For 5d convolution we schedule the latest op with binding 5d axis and vectorize
-       for textures
-       For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion
-       of data type
-    7. In case of 4d conv we need to schedule postops as well
-    """
-    latest = s.outputs[0].output(0)
-    if len(latest.op.axis) == 4:
-        latest_blocked = dummy = output.op.input_tensors[0]
-        conv = dummy.op.input_tensors[0]
-    else:
-        conv = output.op.input_tensors[0]
-        latest_blocked = latest
-
-    pad_data, kernel = s[conv].op.input_tensors
-    filter_pack_rt = bool(
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    )
-
-    if "pad_temp" in pad_data.op.name:
-        input_pad_temp = pad_data.op.input_tensors[0]
-    else:
-        input_pad_temp = pad_data
-
-    input_pack_rt = bool(
-        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag
-    )
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-
-    if conv.shape[1] % 2 == 0:
-        min_threads_div = 2
-    else:
-        min_threads_div = 1
-    cfg.define_split(
-        "tile_fc",
-        fc,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8
-        and entity.size[2] >= min_threads_div
-        and entity.size[2] < 256,
-    )
-    cfg.define_split(
-        "tile_y",
-        y,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-    cfg.define_split(
-        "tile_x",
-        x,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-
-    cfg.define_split("tile_rcc", rcc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
-    cfg.multi_filter(
-        filter=lambda entity: (  # pylint: disable=chained-comparison
-            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
-        )
-        <= 24
-        and 32
-        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
-        < 1024
-    )
-    if cfg.is_fallback:
-        get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3])
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-    # There are several conditions that have to be handled:
-    # 1. If we are in the tuning, we always add cache read for data to main conv kernel
-    #    to get texture in tuning opencl kernel
-    # 2. If we are repacking input in runtime, we should always explicit schedule this one more
-    #    stage of data copy from 4d to 5d (referred as pack_data).
-    # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the
-    #    cache_read("texture")
-    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            if "pad_temp" in pad_data.op.name:
-                s[pad_data].compute_inline()
-        else:
-            if "pad_temp" in pad_data.op.name:
-                pack_data = pad_data.op.input_tensors[0]
-                bind_data_copy(s[pack_data])
-                s[pad_data].compute_inline()
-            else:
-                pack_data = pad_data
-                bind_data_copy(s[pack_data])
-
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-    elif "pad_temp" in pad_data.op.name:
-        s[pad_data].compute_inline()
-        # create cache stage
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-
-    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            bind_data_copy(s[kernel])
-        if kernel.shape[2] == 1 and kernel.shape[3] == 1:
-            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-            bind_data_copy(s[WT])
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[latest_blocked].op.axis
-
-    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
-
-    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
-    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
-    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
-
-    bf = s[latest_blocked].fuse(n, bf)
-    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
-    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
-    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
-    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
-    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
-    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
-    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
-    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
-    s[latest_blocked].vectorize(fb)
-
-    s[conv].compute_at(s[latest_blocked], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[conv].op.axis
-
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-    rco, rci = cfg["tile_rcc"].apply(s, conv, rcc)
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
-    s[conv].vectorize(fb)
-    s[conv].unroll(rcb)
-
-    # unroll
-    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    if latest_blocked != latest:
-        s[latest].compute_root()
-        bind_data_copy(s[latest], 1)
-        if latest != output:
-            s[output].compute_inline()
-
-    N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape)
-    _, IC, KH, KW, _ = get_const_tuple(kernel.shape)
-    ICKHKW = IC * KH * KW
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
diff --git a/python/tvm/topi/adreno/conv2d_nchw_winograd.py b/python/tvm/topi/adreno/conv2d_nchw_winograd.py
deleted file mode 100644
index 0ddc0e7f2c0d..000000000000
--- a/python/tvm/topi/adreno/conv2d_nchw_winograd.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Winograd NCHW template for Adreno backend"""
-
-import logging
-from tvm import autotvm
-from .conv2d_winograd_common import conv2d_winograd_comp, schedule_conv2d_winograd_impl
-
-
-logger = logging.getLogger("conv2d_nchw_winograd")
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd.image2d")
-def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    return conv2d_nchw_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd.image2d")
-def schedule_conv2d_nchw_winograd(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at")
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.image2d")
-def conv2d_nchw_winograd_without_weight_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    return conv2d_nchw_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.image2d")
-def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True)
-
-
-def conv2d_nchw_winograd_comp(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed
-):
-    """Compute declaration for winograd
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data: tvm.te.Tensor
-        4-D or 5-D Data tensor with shape NCHW or NCHW4c
-
-    kernel: tvm.te.Tensor
-        4-D or 5-D tensor with shape OIHW or OIHW4o
-
-    strides: int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding: int or a list/tuple of 2 or 4 ints
-        padding size, or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints
-
-    dilation: int or a list/tuple of two ints
-        dilation size, or [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    pre_computed: bool
-        Flag if weights were pre computed if true or the weights should be
-        computed in runtime
-
-    Returns
-    -------
-    output: tvm.te.Tensor
-        4-D or 5-D with shape NCHW or NCHW4c
-    """
-    return conv2d_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NCHW"
-    )
diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py
deleted file mode 100644
index e391495b5384..000000000000
--- a/python/tvm/topi/adreno/conv2d_nhwc.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""conv2d nhwc schedule on Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from ..utils import get_const_tuple, traverse_inline
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    expand_spatial_dimensions,
-    add_pad,
-    bind_data_copy,
-    get_texture_storage,
-    get_default_conv2d_config,
-)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc.image2d")
-def schedule_conv2d_nhwc(cfg, outs):
-    """Create the schedule for conv2d_nhwc"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "adreno_conv2d_latest_op":
-            schedule_conv2d_NHWC(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc.image2d")
-def conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """
-    Convolution operator in NHWC layout.
-    Algo:
-      1. Convert into blocked format if we have 4d original tensor.
-         In case of AutoTVM we override the convert by just tensors since such conversion
-         will be absent for real blocked convolution, no sense to include into tuning
-      2. Expand spatial dimensions to have width and height be dividable by factor 4
-         This leads to slightly bigger amount of compute but allow utilize GPU much better
-      3. Add paddings. This happens even if we do not need pad originaly. This is useful
-         due to work arounding of the gaps of texture annotation between Primary Functions
-         and limited support of textures in schedules. Later on this pad will be executed
-         separately and will produce texture
-      4. 5d Convolution compute with accumulating into out_dtype
-      5. Cast to the origin output data type
-      6. For case of 4d convolution: convert of output from 5d to 4d
-    """
-
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    convert_from4d = False
-    if len(Input.shape) == 4:
-        batch, in_height, in_width, in_channels = Input.shape
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block)
-            Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-        else:
-            Input = pack_input(
-                Input,
-                "NHWC",
-                batch,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                in_height,
-                in_width,
-            )
-    else:
-        batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape
-
-    if len(Filter.shape) == 4:
-        kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
-        else:
-            convert_from4d = True
-            Filter = pack_filter(
-                Filter,
-                "HWIO",
-                out_channel_chunks,
-                out_channel_block,
-                out_channel_tail,
-                in_filter_channels,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                kernel_h,
-                kernel_w,
-            )
-    else:
-        kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block = Filter.shape
-
-    out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
-        in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w
-    )
-
-    temp = add_pad(
-        Input,
-        "NHWC",
-        out_height_orig,
-        out_width_orig,
-        kernel_h,
-        kernel_w,
-        dilation_h,
-        dilation_w,
-        padding,
-        stride_h,
-        stride_w,
-    )
-
-    rcc = te.reduce_axis((0, in_channel_chunks), name="rcc")
-    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    conv = te.compute(
-        (batch, out_height, out_width, out_channel_chunks, out_channel_block),
-        lambda nn, yy, xx, fc, fb: te.sum(
-            (
-                temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcc, rcb]
-                * Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb]
-            ).astype(out_dtype),
-            axis=[ry, rx, rcc, rcb],
-        ),
-        tag="conv2d_nhwc",
-    )
-
-    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
-        dummy_cast = te.compute(
-            (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block),
-            lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
-            tag="dummy_cast",
-        )
-        return te.compute(
-            (batch, out_height_orig, out_width_orig, out_channles),
-            lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block],
-            tag="adreno_conv2d_latest_op",
-        )
-    else:
-        return te.compute(
-            (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block),
-            lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype),
-            tag="adreno_conv2d_latest_op",
-        )
-
-
-def schedule_conv2d_NHWC(cfg, s, output):
-    """
-    schedule optimized for batch size = 1
-
-    Algo:
-    1. Split output axis to three parts: global work size, vthread, local worksize.
-       The limitations for tuning includes heuristics from some tuned networks to limit
-       search space and not pay much time for useles configurations.
-    2. In case of 4d convolution schedule copying of the input (and filter) into
-      5d tensors
-    4. pad should be scheduled separately to create independent opencl kernel. If pad is
-       inlined into convolution, this gives 1.5x performance drop
-    5. We are using cache_read for intermediate tensors to produce texture and guarantee
-       the best performance on the next stage.
-       The weights are managed through static texture planning mechanism and guarantied come
-       in texture memory scope.
-       Thus way we are calling cache_read only for data tensor
-    6. For 5d convolution we schedule the latest op with binding 5d axis and vectorize
-       for textures
-       For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion
-       of data type
-    7. In case of 4d conv we need to schedule postops as well
-    """
-    latest = s.outputs[0].output(0)
-    if len(latest.op.axis) == 4:
-        latest_blocked = dummy = output.op.input_tensors[0]
-        conv = dummy.op.input_tensors[0]
-    else:
-        conv = output.op.input_tensors[0]
-        latest_blocked = latest
-
-    pad_data, kernel = s[conv].op.input_tensors
-    filter_pack_rt = bool(
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    )
-
-    if "pad_temp" in pad_data.op.name:
-        input_pad_temp = pad_data.op.input_tensors[0]
-    else:
-        input_pad_temp = pad_data
-
-    input_pack_rt = bool(
-        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag
-    )
-    ##### space definition begin #####
-    n, y, x, fc, fb = s[conv].op.axis
-    ry, rx, rcc, rcb = s[conv].op.reduce_axis
-
-    if conv.shape[3] % 2 == 0:
-        min_threads_div = 2
-    else:
-        min_threads_div = 1
-
-    cfg.define_split(
-        "tile_fc",
-        fc,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8
-        and entity.size[2] >= min_threads_div
-        and entity.size[2] < 256,
-    )
-    cfg.define_split(
-        "tile_y",
-        y,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-    cfg.define_split(
-        "tile_x",
-        x,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-
-    cfg.define_split("tile_rcc", rcc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
-    cfg.multi_filter(
-        filter=lambda entity: (  # pylint: disable=chained-comparison
-            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
-        )
-        <= 24
-        and 32
-        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
-        < 1024
-    )
-    if cfg.is_fallback:
-        get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2])
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-    # There are several conditions that have to be handled:
-    # 1. If we are in the tuning, we always add cache read for data to main conv kernel
-    #    to get texture in tuning opencl kernel
-    # 2. If we are repacking input in runtime, we should always explicit schedule this one more
-    #    stage of data copy from 4d to 5d (referred as pack_data).
-    # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the
-    #    cache_read("texture")
-    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            if "pad_temp" in pad_data.op.name:
-                s[pad_data].compute_inline()
-        else:
-            if "pad_temp" in pad_data.op.name:
-                s[pad_data].compute_inline()
-                pack_data = pad_data.op.input_tensors[0]
-                bind_data_copy(s[pack_data])
-            else:
-                pack_data = pad_data
-                bind_data_copy(s[pack_data])
-
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-    elif "pad_temp" in pad_data.op.name:
-        s[pad_data].compute_inline()
-        # create cache stage
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-
-    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            bind_data_copy(s[kernel])
-        if kernel.shape[0] == 1 and kernel.shape[1] == 1:
-            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-            bind_data_copy(s[WT])
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    # tile and bind spatial axes
-    n, y, x, fc, fb = s[latest_blocked].op.axis
-
-    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
-
-    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
-    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
-    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
-
-    by = s[latest_blocked].fuse(n, by)
-    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
-    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
-    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
-    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
-    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
-    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
-    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
-    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
-    s[latest_blocked].vectorize(fb)
-
-    s[conv].compute_at(s[latest_blocked], tx)
-
-    # tile reduction axes
-    n, y, x, fc, fb = s[conv].op.axis
-
-    ry, rx, rcc, rcb = s[conv].op.reduce_axis
-    rco, rci = cfg["tile_rcc"].apply(s, conv, rcc)
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
-    s[conv].vectorize(fb)
-    s[conv].unroll(rcb)
-
-    # unroll
-    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    if latest_blocked != latest:
-        s[latest].compute_root()
-        bind_data_copy(s[latest], 1)
-        if latest != output:
-            s[output].compute_inline()
-
-    N, OH, OW, OCC, OCB = get_const_tuple(latest_blocked.shape)
-    KH, KW, IC, _, _ = get_const_tuple(kernel.shape)
-    ICKHKW = IC * KH * KW
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
diff --git a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py
deleted file mode 100644
index b055b388e1a7..000000000000
--- a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Winograd NHWC template for Adreno backend"""
-
-import logging
-from tvm import autotvm
-from .conv2d_winograd_common import conv2d_winograd_comp, schedule_conv2d_winograd_impl
-
-
-logger = logging.getLogger("conv2d_nhwc_winograd")
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd.image2d")
-def conv2d_nhwc_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    return conv2d_nhwc_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd.image2d")
-def schedule_conv2d_nhwc_winograd(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at")
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform.image2d")
-def conv2d_nhwc_winograd_without_weight_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    return conv2d_nhwc_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform.image2d")
-def schedule_conv2d_nhwc_winograd_without_weight_transform(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True)
-
-
-def conv2d_nhwc_winograd_comp(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed
-):
-    """Compute declaration for winograd
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data: tvm.te.Tensor
-        4-D or 5-D Data tensor with shape NCHW or NCHW4c
-
-    kernel: tvm.te.Tensor
-        4-D or 5-D tensor with shape OIHW or OIHW4o
-
-    strides: int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding: int or a list/tuple of 2 or 4 ints
-        padding size, or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints
-
-    dilation: int or a list/tuple of two ints
-        dilation size, or [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    pre_computed: bool
-        Flag if weights were pre computed if true or the weights should be
-        computed in runtime
-
-    Returns
-    -------
-    output: tvm.te.Tensor
-        4-D or 5-D with shape NCHW or NCHW4c
-    """
-    return conv2d_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NHWC"
-    )
diff --git a/python/tvm/topi/adreno/conv2d_transpose_alter_op.py b/python/tvm/topi/adreno/conv2d_transpose_alter_op.py
deleted file mode 100644
index c68e5cb7a558..000000000000
--- a/python/tvm/topi/adreno/conv2d_transpose_alter_op.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D Transpose alter op for Qualcomm Adreno GPU"""
-
-import logging
-
-import re
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-from ..utils import get_const_tuple
-from ..nn import conv2d_transpose_alter_layout
-
-logger = logging.getLogger("topi")
-
-# Number of wildcards for matching of supported layouts to be transformed
-_NCHWc_matcher = re.compile("^NCHW[0-9]+c$")
-_IOHWo_matcher = re.compile("^IOHW[0-9]+o$")
-
-
-@conv2d_transpose_alter_layout.register("adreno")
-def _alter_conv2d_transpose_layout(attrs, inputs, tinfos, out_type):
-    """
-    Prepare of the new conv2d_transpose with proper target blocked layout attributes
-    OpenCL Textures supports 1d/2d/3d/4d tetures but read happens always only for 4 elements
-    in a line. Thus way we are supporting for now only 4d conversions on the end
-    NCHW -> NCHW4c & IOHW ->IOHW4o
-    """
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    # Parse the attributes.
-    padding = attrs.get_int_tuple("padding")
-    strides = attrs.get_int_tuple("strides")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data_tensor, kernel_tensor = tinfos
-    data_dtype = data_tensor.dtype
-    out_dtype = out_type.dtype
-
-    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
-        cfg = dispatch_ctx.query(target, None)
-        workload = cfg.workload
-    else:
-        impl, outs = relay.backend.te_compiler.select_implementation(
-            relay.op.get("nn.conv2d_transpose"), attrs, tinfos, out_type, target
-        )
-        workload = autotvm.task.get_workload(outs)
-        cfg = dispatch_ctx.query(target, workload)
-
-    topi_tmpl = workload[0]
-
-    if "conv2d_transpose_nchwc" in topi_tmpl:  # covers conv2d_transpose_nchwc
-        if data_layout == "NCHW" and kernel_layout == "IOHW":
-            batch, in_channels, in_height, in_width = data_tensor.shape
-            _, out_channles, kernel_h, kernel_w = kernel_tensor.shape
-            in_channel_block = in_channels % 4
-            if in_channel_block == 0:
-                in_channel_block = 4
-            num_filter_block = out_channles % 4
-            if num_filter_block == 0:
-                num_filter_block = 4
-
-            # no support yet for tensors that cannot be divisible by factor 4
-            if num_filter_block != 4:
-                return None
-
-            batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-            in_filter_channel, out_channel, kh, kw = get_const_tuple(kernel_tensor.shape)
-
-            # update new attrs
-            new_attrs["channels"] = out_channel
-            if in_channel_block == 4:
-                new_attrs["data_layout"] = f"NCHW{in_channel_block}c"
-            else:
-                new_attrs["data_layout"] = "NCHW"
-            # (oc, ic, h, w) -> (ic, OC, h, w, oc)
-            new_attrs["kernel_layout"] = f"IOHW{num_filter_block}o"
-            new_attrs["out_layout"] = f"NCHW{num_filter_block}c"
-
-            # Store altered operator's config for applying of tuned AutoTVM statistics
-            if in_channel_block == 4:
-                new_data = te.placeholder(
-                    (batch_size, in_channel // in_channel_block, height, width, in_channel_block),
-                    dtype=data_dtype,
-                )
-            else:
-                new_data = data_tensor
-            new_kernel = te.placeholder(
-                (in_filter_channel, out_channel // num_filter_block, kh, kw, num_filter_block),
-                dtype=kernel_tensor.dtype,
-            )
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_kernel, strides, padding, dilation, out_dtype],
-                topi_tmpl,  # "conv2d_transpose_nchwc.image2d",
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-        else:
-            assert _NCHWc_matcher.match(data_layout)
-            assert _IOHWo_matcher.match(kernel_layout)
-        return relay.nn.conv2d_transpose(*inputs, **new_attrs)
-
-    return None
diff --git a/python/tvm/topi/adreno/conv2d_transpose_nchw.py b/python/tvm/topi/adreno/conv2d_transpose_nchw.py
deleted file mode 100644
index ad8c7b88ef50..000000000000
--- a/python/tvm/topi/adreno/conv2d_transpose_nchw.py
+++ /dev/null
@@ -1,412 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""conv2d_transpose nchw schedule on Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from .. import nn
-
-
-from ..utils import get_const_tuple, traverse_inline
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    bind_data_copy,
-    get_default_conv2d_config,
-    get_texture_storage,
-)
-
-
-@autotvm.register_topi_compute("conv2d_transpose_nchwc.image2d")
-def conv2d_transpose_nchwc(
-    cfg, Input, Filter, stride, padding, out_dtype, output_padding, groups=1
-):
-    """
-    Transposed Convolution operator in NCHWc layout.
-    Algo:
-      1. Convert into blocked format if we have 4d original tensor.
-         In case of AutoTVM we override the convert by just tensors since such conversion
-         will be absent for real blocked convolution, no sense to include into tuning
-      2. Expand spatial dimensions to have width and height be dividable by factor 4
-         This leads to slightly bigger amount of compute but allow utilize GPU much better
-      3. Add paddings. This happens even if we do not need pad originaly. This is useful
-         due to work arounding of the gaps of texture annotation between Primary Functions
-         and limited support of textures in schedules. Later on this pad will be executed
-         separately and will produce texture
-      4. 5d Convolution compute with accumulating into out_dtype
-      5. Cast to the origin output data type
-      6. For case of 4d convolution: convert of output from 5d to 4d
-    """
-
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    outpad_height, outpad_width = output_padding
-    assert outpad_height < stride_h and outpad_width < stride_w
-
-    convert_from4d = False
-    if len(Input.shape) == 4:
-        batch, in_channels, in_height, in_width = Input.shape
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block)
-            Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-        else:
-            Input = pack_input(
-                Input,
-                "NCHW",
-                batch,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                in_height,
-                in_width,
-            )
-    else:
-        batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape
-
-    if len(Filter.shape) == 4:
-        in_filter_channels, out_channels, kernel_h, kernel_w = Filter.shape
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channels, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kshape = (in_filter_channels, out_channel_chunks, kernel_h, kernel_w, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
-        else:
-            convert_from4d = True
-            Filter = pack_filter(
-                Filter,
-                "IOHW",
-                out_channel_chunks,
-                out_channel_block,
-                out_channel_tail,
-                in_filter_channels,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                kernel_h,
-                kernel_w,
-            )
-    else:
-        in_filter_channels, out_channel_chunks, kernel_h, kernel_w, out_channel_block = Filter.shape
-
-    cfg.stride = stride
-
-    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w))
-
-    out_width_orig = out_width = (
-        (in_width - 1) * stride_w + kernel_w - pad_left - pad_right + outpad_width
-    )
-    pad_left = kernel_w - 1 - pad_left
-    pad_right = kernel_w - 1 - pad_right + outpad_width
-    dilated_width = stride_w * (in_width - 1) + 1
-
-    out_height_orig = out_height = (
-        (in_height - 1) * stride_h + kernel_h - pad_top - pad_bottom + outpad_height
-    )
-    pad_top = kernel_h - 1 - pad_top
-    pad_bottom = kernel_h - 1 - pad_bottom + outpad_height
-    dilated_height = stride_h * (in_height - 1) + 1
-
-    if out_height % 2 != 0:
-        out_height += 1
-    if out_width % 2 != 0:
-        out_width += 1
-
-    if out_height % 4 != 0:
-        out_height += 2
-    if out_width % 4 != 0:
-        out_width += 2
-
-    # compute pad
-    temp = te.compute(
-        (
-            batch,
-            in_channel_chunks,
-            pad_top + dilated_height + pad_bottom,
-            pad_left + dilated_width + pad_right,
-            in_channel_block,
-        ),
-        lambda n, c, y, x, cb: tvm.tir.if_then_else(
-            tvm.tir.all(
-                x >= pad_left,
-                x < pad_left + dilated_width,
-                tvm.tir.indexmod(x - pad_left, stride_w).equal(0),
-                y >= pad_top,
-                y < pad_top + dilated_height,
-                tvm.tir.indexmod(y - pad_top, stride_h).equal(0),
-            ),
-            Input[
-                n,
-                c,
-                tvm.tir.indexdiv(y - pad_top, stride_h),
-                tvm.tir.indexdiv(x - pad_left, stride_w),
-                cb,
-            ],
-            tvm.tir.const(0.0, Input.dtype),
-        ),
-        name="pad_temp",
-    )
-
-    # compute transposed conv
-    dcc = te.reduce_axis((0, in_channel_chunks), name="dcc")
-    dcb = te.reduce_axis((0, in_channel_block), name="dcb")
-    dh = te.reduce_axis((0, kernel_h), name="dh")
-    dw = te.reduce_axis((0, kernel_w), name="dw")
-    conv = te.compute(
-        (batch, out_channel_chunks, out_height, out_width, out_channel_block),
-        lambda b, c, h, w, cb: te.sum(
-            temp[
-                b, c // out_channel_chunks * (in_channel_chunks) + dcc, h + dh, w + dw, dcb
-            ].astype(out_dtype)
-            * Filter[
-                dcc * in_channel_block + dcb,
-                c % out_channel_chunks,
-                kernel_h - 1 - dh,
-                kernel_w - 1 - dw,
-                cb,
-            ].astype(out_dtype),
-            axis=[dcc, dcb, dh, dw],
-        ),
-        tag="conv2d_transpose_nchwc",
-    )
-
-    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
-        dummy_cast = te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
-            tag="dummy_cast",
-        )
-        return te.compute(
-            (batch, out_channels, out_height_orig, out_width_orig),
-            lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block],
-            tag="adreno_conv2d_transpose_latest_op",
-        )
-    else:
-        return te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
-            tag="adreno_conv2d_transpose_latest_op",
-        )
-
-
-@autotvm.register_topi_schedule("conv2d_transpose_nchwc.image2d")
-def schedule_conv2d_transpose_nchwc(cfg, outs):
-    """Create the schedule for conv2d_nchw"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "adreno_conv2d_transpose_latest_op":
-            schedule_conv2d_transpose_NCHWc(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def schedule_conv2d_transpose_NCHWc(cfg, s, output):
-    """
-    schedule optimized for batch size = 1
-
-    Algo:
-    1. Split output axis to three parts: global work size, vthread, local worksize.
-       The limitations for tuning includes heuristics from some tuned networks to limit
-       search space and not pay much time for useles configurations.
-    2. In case of 4d convolution schedule copying of the input (and filter) into
-      5d tensors
-    4. pad should be scheduled separately to create independent opencl kernel. If pad is
-       inlined into convolution, this gives 1.5x performance drop
-    5. We are using cache_read for intermediate tensors to produce texture and guarantee
-       the best performance on the next stage.
-       The weights are managed through static texture planning mechanism and guarantied come
-       in texture memory scope.
-       Thus way we are calling cache_read only for data tensor
-    6. For 5d convolution we schedule the latest op with binding 5d axis and vectorize
-       for textures
-       For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion
-       of data type
-    7. In case of 4d conv we need to schedule postops as well
-    """
-    latest = s.outputs[0].output(0)
-    if len(latest.op.axis) == 4:
-        latest_blocked = dummy = output.op.input_tensors[0]
-        conv = dummy.op.input_tensors[0]
-    else:
-        conv = output.op.input_tensors[0]
-        latest_blocked = latest
-
-    pad_data, kernel = s[conv].op.input_tensors
-    filter_pack_rt = bool(
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    )
-
-    if "pad_temp" in pad_data.op.name:
-        input_pad_temp = pad_data.op.input_tensors[0]
-    else:
-        input_pad_temp = pad_data
-
-    input_pack_rt = bool(
-        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag
-    )
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-
-    if conv.shape[1] % 2 == 0:
-        min_threads_div = 2
-    else:
-        min_threads_div = 1
-    cfg.define_split(
-        "tile_fc",
-        fc,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8
-        and entity.size[2] >= min_threads_div
-        and entity.size[2] < 256,
-    )
-    cfg.define_split(
-        "tile_y",
-        y,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-    cfg.define_split(
-        "tile_x",
-        x,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-
-    cfg.define_split("tile_rcc", rcc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 64])
-    cfg.define_knob("unroll_explicit", [0, 1])
-    cfg.multi_filter(
-        filter=lambda entity: (  # pylint: disable=chained-comparison
-            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
-        )
-        <= 24
-        and 32
-        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
-        < 1024
-    )
-    if cfg.is_fallback:
-        get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3])
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-    # There are several conditions that have to be handled:
-    # 1. If we are in the tuning, we always add cache read for data to main conv kernel
-    #    to get texture in tuning opencl kernel
-    # 2. If we are repacking input in runtime, we should always explicit schedule this one more
-    #    stage of data copy from 4d to 5d (referred as pack_data).
-    # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the
-    #    cache_read("texture")
-    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            if "pad_temp" in pad_data.op.name:
-                s[pad_data].compute_inline()
-        else:
-            if "pad_temp" in pad_data.op.name:
-                pack_data = pad_data.op.input_tensors[0]
-                bind_data_copy(s[pack_data])
-                s[pad_data].compute_inline()
-            else:
-                pack_data = pad_data
-                bind_data_copy(s[pack_data])
-
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-    elif "pad_temp" in pad_data.op.name:
-        s[pad_data].compute_inline()
-        # create cache stage
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-
-    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            bind_data_copy(s[kernel])
-        if kernel.shape[2] == 1 and kernel.shape[3] == 1:
-            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-            bind_data_copy(s[WT])
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[latest_blocked].op.axis
-
-    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
-
-    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
-    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
-    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
-
-    bf = s[latest_blocked].fuse(n, bf)
-    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
-    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
-    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
-    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
-    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
-    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
-    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
-    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
-    s[latest_blocked].vectorize(fb)
-
-    s[conv].compute_at(s[latest_blocked], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[conv].op.axis
-
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-    rco, rci = cfg["tile_rcc"].apply(s, conv, rcc)
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
-    s[conv].vectorize(fb)
-    s[conv].unroll(rcb)
-
-    # unroll
-    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    if latest_blocked != latest:
-        s[latest].compute_root()
-        bind_data_copy(s[latest], 1)
-        if latest != output:
-            s[output].compute_inline()
-
-    N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape)
-    _, IC, KH, KW, _ = get_const_tuple(kernel.shape)
-    ICKHKW = IC * KH * KW
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py
deleted file mode 100644
index d10acb73123d..000000000000
--- a/python/tvm/topi/adreno/conv2d_winograd_common.py
+++ /dev/null
@@ -1,521 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Common Winograd implementation for Adreno backend"""
-
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from tvm.topi import nn
-from tvm.topi.utils import get_const_int, get_const_tuple, traverse_inline
-from ..nn.winograd_util import winograd_transform_matrices
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    bind_data_copy,
-    get_texture_storage,
-    infer_tile_size,
-)
-
-
-def conv2d_winograd_comp(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, layout
-):
-    """Compute declaration for winograd
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data: tvm.te.Tensor
-        4-D or 5-D Data tensor with shape NCHW or NCHW4c
-
-    kernel: tvm.te.Tensor
-        4-D or 5-D tensor with shape OIHW or OIHW4o
-
-    strides: int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding: int or a list/tuple of 2 or 4 ints
-        padding size, or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints
-
-    dilation: int or a list/tuple of two ints
-        dilation size, or [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    pre_computed: bool
-        Flag if weights were pre computed if true or the weights should be
-        computed in runtime
-
-    layout: str
-        NHWC or NCHW values are accepted
-
-    Returns
-    -------
-    output: tvm.te.Tensor
-        4-D or 5-D with shape NCHW or NCHW4c
-    """
-    assert layout in ("NCHW", "NHWC")
-    tile_size = infer_tile_size(data, layout)
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
-
-    convert_from4d = False
-    if len(data.shape) == 4:
-        convert_from4d = True
-        if layout == "NCHW":
-            N, DCI, H, W = get_const_tuple(data.shape)
-        else:
-            N, H, W, DCI = get_const_tuple(data.shape)
-        if not pre_computed:
-            if layout == "NCHW":
-                out_channels, CI, KH, KW = get_const_tuple(kernel.shape)
-            else:
-                KH, KW, CI, out_channels = get_const_tuple(kernel.shape)
-        else:
-            alpha, _, CI, out_channels = get_const_tuple(kernel.shape)
-            KH = KW = alpha + 1 - tile_size
-
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(CI, 4)
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channels, 4)
-        if autotvm.GLOBAL_SCOPE.in_tuning is True:
-            if layout == "NCHW":
-                dshape = (N, in_channel_chunks, H, W, in_channel_block)
-            else:
-                dshape = (N, H, W, in_channel_chunks, in_channel_block)
-            if not pre_computed:  # kernel tensor is raw tensor, do strict check
-                if layout == "NCHW":
-                    kshape = (out_channel_chunks, CI, KH, KW, out_channel_block)
-                else:
-                    kshape = (KH, KW, CI, out_channel_chunks, out_channel_block)
-            else:
-                kshape = (alpha, alpha, CI, out_channel_chunks, out_channel_block)
-            data = tvm.te.placeholder(dshape, data.dtype, name="data_placeholder")
-            kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel_placeholder")
-        else:
-            data = pack_input(
-                data, layout, N, in_channel_chunks, in_channel_block, in_channel_tail, H, W
-            )
-            kernel_layout = "OIHW" if layout == "NCHW" else "HWIO"
-            if not pre_computed:  # kernel tensor is raw tensor, do strict check
-                kernel = pack_filter(
-                    kernel,
-                    kernel_layout,
-                    out_channel_chunks,
-                    out_channel_block,
-                    out_channel_tail,
-                    CI,
-                    in_channel_chunks,
-                    in_channel_block,
-                    in_channel_tail,
-                    KH,
-                    KW,
-                )
-            else:
-                kernel = pack_filter(
-                    kernel,
-                    "HWIO",
-                    out_channel_chunks,
-                    out_channel_block,
-                    out_channel_tail,
-                    CI,
-                    in_channel_chunks,
-                    in_channel_block,
-                    in_channel_tail,
-                    alpha,
-                    alpha,
-                )
-    if layout == "NCHW":
-        N, DCI, H, W, CB = get_const_tuple(data.shape)
-    else:
-        N, H, W, DCI, CB = get_const_tuple(data.shape)
-    if not pre_computed:  # kernel tensor is raw tensor, do strict check
-        if layout == "NCHW":
-            CO, CI, KH, KW, COB = get_const_tuple(kernel.shape)
-        else:
-            KH, KW, CI, CO, COB = get_const_tuple(kernel.shape)
-        alpha = KW + tile_size - 1
-        assert HSTR == 1 and WSTR == 1 and KH == KW
-    else:
-        alpha, _, CI, CO, COB = get_const_tuple(kernel.shape)
-        KH = KW = alpha + 1 - tile_size
-        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
-
-    if isinstance(N, tvm.tir.Any):
-        N = tvm.te.size_var("n")
-
-    if not isinstance(H, int) or not isinstance(W, int):
-        raise RuntimeError(
-            "adreno winograd conv2d doesn't support dynamic input\
-                           height or width."
-        )
-
-    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
-    if layout == "NCHW":
-        data_pad = nn.pad(data, (0, 0, pt, pl, 0), (0, 0, pb, pr, 0), name="data_pad")
-    else:
-        data_pad = nn.pad(data, (0, pt, pl, 0, 0), (0, pb, pr, 0, 0), name="data_pad")
-
-    r = KW
-    m = tile_size
-    A, B, G = winograd_transform_matrices(m, r, data.dtype)
-
-    H = (H + pt + pb - KH) // HSTR + 1
-    W = (W + pl + pr - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-
-    P = N * nH * nW if isinstance(N, int) else nH * nW
-
-    # transform kernel
-    if not pre_computed:
-        r_kh = te.reduce_axis((0, KH), name="r_kh")
-        r_kw = te.reduce_axis((0, KW), name="r_kw")
-        if layout == "NCHW":
-            kernel_pack = te.compute(
-                (alpha, alpha, CI, CO, COB),
-                lambda eps, nu, ci, co, cob: te.sum(
-                    kernel[co][ci][r_kh][r_kw][cob] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
-                ),
-                name="kernel_pack",
-            )
-        else:
-            kernel_pack = te.compute(
-                (alpha, alpha, CI, CO, COB),
-                lambda eps, nu, ci, co, cob: te.sum(
-                    kernel[r_kh][r_kw][ci][co][cob] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
-                ),
-                name="kernel_pack",
-            )
-    else:
-        kernel_pack = kernel
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-    if layout == "NCHW":
-        N, CI, _, _, CB = get_const_tuple(data.shape)
-    else:
-        N, _, _, CI, CB = get_const_tuple(data.shape)
-
-    # pack input tile
-    if layout == "NCHW":
-        input_tile = te.compute(
-            (alpha, alpha, CI, P, CB),
-            lambda eps, nu, c, p, cb: data_pad[idxdiv(p, (nH * nW))][c][
-                idxmod(idxdiv(p, nW), nH) * m + eps
-            ][idxmod(p, nW) * m + nu][cb],
-            name="d",
-        )
-    else:
-        input_tile = te.compute(
-            (alpha, alpha, CI, P, CB),
-            lambda eps, nu, c, p, cb: data_pad[idxdiv(p, (nH * nW))][
-                idxmod(idxdiv(p, nW), nH) * m + eps
-            ][idxmod(p, nW) * m + nu][c][cb],
-            name="d",
-        )
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_a")
-    data_pack = te.compute(
-        (P, CI, alpha, alpha, CB),
-        lambda p, ci, eps, nu, cb: te.sum(
-            input_tile[r_a][r_b][ci][p][cb] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="data_pack",
-    )
-
-    # repack transformed data
-    data_pack_trans = te.compute(
-        (alpha, alpha, CI, P, CB),
-        lambda eps, nu, c, p, cb: data_pack[p][c][eps][nu][cb],
-        name="data_pack_trans",
-    )
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    cb = te.reduce_axis((0, CB), name="cb")
-    bgemm = te.compute(
-        (alpha, alpha, CO, P, COB),
-        lambda eps, nu, co, p, cob: te.sum(
-            (
-                kernel_pack[eps][nu][ci * CB + cb][co][cob] * data_pack_trans[eps][nu][ci][p][cb]
-            ).astype(out_dtype),
-            axis=[ci, cb],
-        ),
-        name="bgemm",
-    )
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_a")
-    inverse = te.compute(
-        (CO, P, m, m, COB),
-        lambda co, p, vh, vw, cob: te.sum(
-            bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(out_dtype),
-            axis=[r_a, r_b],
-        ),
-        name="inverse",
-    )
-
-    # output
-    if layout == "NCHW":
-        if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False:
-            output = te.compute(
-                (N, out_channels, H, W),
-                lambda n, c, h, w: inverse[c // CB][n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)][
-                    idxmod(h, m)
-                ][idxmod(w, m)][c % CB].astype(out_dtype),
-                name="output",
-                tag="dummy_compute_at",
-            )
-        else:
-            output = te.compute(
-                (N, CO, H, W, COB),
-                lambda n, co, h, w, cob: inverse[co][
-                    n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)
-                ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype),
-                name="output",
-                tag="dummy_compute_at",
-            )
-    else:
-        if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False:
-            output = te.compute(
-                (N, H, W, out_channels),
-                lambda n, h, w, c: inverse[c // CB][n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)][
-                    idxmod(h, m)
-                ][idxmod(w, m)][c % CB].astype(out_dtype),
-                name="output",
-                tag="dummy_compute_at",
-            )
-        else:
-            output = te.compute(
-                (N, H, W, CO, COB),
-                lambda n, h, w, co, cob: inverse[co][
-                    n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)
-                ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype),
-                name="output",
-                tag="dummy_compute_at",
-            )
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * CO * COB * H * W * CI * CB * KH * KW)
-
-    return output
-
-
-def schedule_conv2d_winograd_impl(cfg, outs, tag, pre_computed=False):
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == tag:
-            schedule_conv2d_winograd(cfg, s, op.output(0), pre_computed=pre_computed)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def schedule_conv2d_winograd(cfg, s, output, pre_computed):
-    """Schedule winograd template"""
-    inverse = s[output].op.input_tensors[0]
-    bgemm, A = s[inverse].op.input_tensors
-    kernel_pack, data_pack_trans = s[bgemm].op.input_tensors
-    data_pack = s[data_pack_trans].op.input_tensors[0]
-    input_tile, B = s[data_pack].op.input_tensors
-    pad_data = s[input_tile].op.input_tensors[0]
-
-    # data transform
-    s[B].compute_inline()
-    s[A].compute_inline()
-
-    # probably will improve real topology execution
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # Padding to texture
-        AA = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [input_tile])
-        bind_data_copy(s[AA])
-
-    s[input_tile].compute_inline()
-
-    OL = s.cache_write(data_pack, "local")
-    c, p, eps, nu, cb = s[data_pack].op.axis
-    fused = s[data_pack].fuse(c, p, eps, nu)
-    bx, tx = s[data_pack].split(fused, 128)
-    s[data_pack].vectorize(cb)
-    s[data_pack].bind(bx, te.thread_axis("blockIdx.x"))
-    s[data_pack].bind(tx, te.thread_axis("threadIdx.x"))
-
-    _, _, eps, nu, cb = s[OL].op.axis
-    r_a, r_b = s[OL].op.reduce_axis
-    s[OL].unroll(eps)
-    s[OL].unroll(nu)
-    s[OL].unroll(r_a)
-    s[OL].unroll(r_b)
-    s[OL].vectorize(cb)
-    s[OL].compute_at(s[data_pack], tx)
-    s[data_pack].set_scope(get_texture_storage(data_pack.shape))
-
-    s[data_pack_trans].compute_inline()
-
-    # transform kernel
-    if not pre_computed:
-        kernel, G = s[kernel_pack].op.input_tensors
-        eps, nu, ci, co, cob = s[kernel_pack].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # skip this part during tuning to make recrods accurate
-            # this part will be pre-computed during pre-compute optimization pass
-            s[G].pragma(s[G].op.axis[0], "debug_skip_region")
-            s[kernel_pack].pragma(eps, "debug_skip_region")
-        else:
-            s[G].compute_inline()
-            r_a, r_b = s[kernel_pack].op.reduce_axis
-            for axis in [eps, nu, r_a, r_b]:
-                s[kernel_pack].unroll(axis)
-
-            fused = s[kernel_pack].fuse(ci, co)
-            bb, tt = s[kernel_pack].split(fused, 128)
-            s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b, cob)
-            s[kernel_pack].vectorize(cob)
-            s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x"))
-            s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x"))
-    else:
-        kernel = kernel_pack
-
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag:
-        # manage scheduling of datacopy
-        pack_data = pad_data.op.input_tensors[0]
-        bind_data_copy(s[pack_data])
-        bind_data_copy(s[kernel])
-    elif isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-    s[pad_data].compute_inline()
-
-    ##### space definition begin #####
-    cfg.define_knob("auto_unroll_max_step", [0, 4, 16])
-    b1, b2, y, x, cb = s[bgemm].op.axis
-    rcc = s[bgemm].op.reduce_axis[0]
-    alpha = get_const_int(b1.dom.extent)
-
-    cfg.define_split(
-        "tile_y", y, num_outputs=3, filter=lambda entry: entry.size[2] <= 64 and entry.size[1] <= 16
-    )
-
-    min_x_div = 1
-    for bn in range(4, 0, -1):
-        if bgemm.shape[3] % bn == 0:
-            min_x_div = bn
-            break
-
-    cfg.define_split(
-        "tile_x",
-        x,
-        num_outputs=3,
-        filter=lambda entry: entry.size[2] <= 64
-        and entry.size[1] >= min_x_div
-        and entry.size[1] <= 16,
-    )
-    cfg.define_split("tile_rc", rcc, num_outputs=2)
-    cfg.multi_filter(
-        filter=lambda entity: 32 <= (entity["tile_y"].size[2] * entity["tile_x"].size[2]) < 1024
-    )
-    ##### space definition end #####
-
-    # batch gemm
-    OL = s.cache_write(bgemm, "local")
-    if (
-        autotvm.GLOBAL_SCOPE.in_tuning
-        or isinstance(kernel.op, tvm.te.ComputeOp)
-        and "filter_pack" in kernel.op.tag
-        and kernel.shape[2] == 1
-        and kernel.shape[3] == 1
-    ):
-        BB = s.cache_read(kernel_pack, get_texture_storage(kernel_pack.shape), [OL])
-        bind_data_copy(s[BB])
-
-    by = s[bgemm].fuse(b1, b2, y)
-
-    # tile and bind spatial axes
-    bgemm_scope, by = s[bgemm].split(by, nparts=1)
-    by, vy, ty = cfg["tile_y"].apply(s, bgemm, by)
-    bx, vx, tx = cfg["tile_x"].apply(s, bgemm, x)
-    s[bgemm].bind(by, te.thread_axis("blockIdx.y"))
-    s[bgemm].bind(bx, te.thread_axis("blockIdx.x"))
-    s[bgemm].bind(vy, te.thread_axis("vthread"))
-    s[bgemm].bind(vx, te.thread_axis("vthread"))
-    s[bgemm].bind(ty, te.thread_axis("threadIdx.y"))
-    s[bgemm].bind(tx, te.thread_axis("threadIdx.x"))
-    s[bgemm].reorder(bgemm_scope, by, bx, vy, vx, ty, tx, cb)
-    s[bgemm].vectorize(cb)
-    s[bgemm].set_scope(get_texture_storage(bgemm.shape))
-
-    # tile reduction axes
-    s[OL].compute_at(s[bgemm], tx)
-    b1, b2, y, x, cb = s[OL].op.axis
-    (rcc, rcb) = s[OL].op.reduce_axis
-    b = s[OL].fuse(b1, b2)
-    s[OL].reorder(b, y, x, rcc, rcb, cb)
-    # s[OL].unroll(rcb)
-    s[OL].pragma(rcb, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[OL].pragma(rcb, "unroll_explicit", True)
-    s[OL].vectorize(cb)
-
-    # schedule inverse, output and fusion
-    if output.op in s.outputs:
-        OL = None
-    else:
-        OL = output
-        s[OL].set_scope("local")
-        output = s.outputs[0]
-
-    if len(s[output].op.axis) == 4:
-        n, co, h, w = s[output].op.axis
-        cb = None
-    else:
-        n, co, h, w, cb = s[output].op.axis
-    inverse_scope, n = s[output].split(n, nparts=1)
-
-    fused = s[output].fuse(n, co, h, w)
-    bb, tt = s[output].split(fused, 128)
-    if cb is not None:
-        s[output].reorder(bb, tt, cb)
-        s[output].vectorize(cb)
-
-    s[output].bind(bb, te.thread_axis("blockIdx.x"))
-    s[output].bind(tt, te.thread_axis("threadIdx.x"))
-
-    if OL is not None:
-        s[OL].compute_at(s[output], tt)
-
-    co, p, vh, vw, cb = s[inverse].op.axis
-    r_a, r_b = s[inverse].op.reduce_axis
-    for axis in [vh, vw, r_a, r_b]:
-        s[inverse].unroll(axis)
-    s[inverse].vectorize(cb)
-    s[inverse].compute_at(s[output], tt)
-
-    return s
diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
deleted file mode 100644
index 7fae354dee0e..000000000000
--- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
+++ /dev/null
@@ -1,308 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""depthwise_conv2d_nchw(c) schedule on Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from ..utils import get_const_tuple, traverse_inline
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    expand_spatial_dimensions,
-    add_pad,
-    bind_data_copy,
-    get_texture_storage,
-    get_default_conv2d_config,
-)
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchwc.image2d")
-def schedule_depthwise_conv2d_nchwc(cfg, outs):
-    """Create the schedule for depthwise conv2d_nchw4c_ohwi4o"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "adreno_dw_conv2d_latest_op":
-            schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d")
-def depthwise_conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """
-    Depthwise convolution operator in NCHWc layout.
-    Algo:
-      1. Convert into blocked format if we have 4d original tensor.
-         In case of AutoTVM we override the convert by just tensors since such conversion
-         will be absent for real blocked convolution, no sense to include into tuning
-      2. Expand spatial dimensions to have width and height be dividable by factor 4
-         This leads to slightly bigger amount of compute but allow utilize GPU much better
-      3. Add paddings. This happens even if we do not need pad originaly. This is useful
-         due to work arounding of the gaps of texture annotation between Primary Functions
-         and limited support of textures in schedules. Later on this pad will be executed
-         separately and will produce texture
-      4. 5d Convolution compute with accumulating into out_dtype
-      5. Cast to the origin output data type
-      6. For case of 4d convolution: convert of output from 5d to 4d
-    """
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    convert_from4d = False
-    if len(Input.shape) == 4:
-        batch, in_channels, in_height, in_width = Input.shape
-        out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape
-
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block)
-            Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-            kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
-        else:
-            convert_from4d = True
-            Input = pack_input(
-                Input,
-                "NCHW",
-                batch,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                in_height,
-                in_width,
-            )
-            Filter = pack_filter(
-                Filter,
-                "OIHW",
-                out_channel_chunks,
-                out_channel_block,
-                out_channel_tail,
-                in_filter_channels,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                kernel_h,
-                kernel_w,
-            )
-
-    else:
-        batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape
-        out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape
-
-    out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
-        in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w
-    )
-
-    temp = add_pad(
-        Input,
-        "NCHW",
-        out_height_orig,
-        out_width_orig,
-        kernel_h,
-        kernel_w,
-        dilation_h,
-        dilation_w,
-        padding,
-        stride_h,
-        stride_w,
-    )
-
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    conv = te.compute(
-        (batch, out_channel_chunks, out_height, out_width, out_channel_block),
-        lambda nn, ffc, yy, xx, ffb: te.sum(
-            (
-                temp[
-                    nn,
-                    ffc // in_filter_channels,
-                    yy * stride_h + ry * dilation_h,
-                    xx * stride_w + rx * dilation_w,
-                    ffb,
-                ]
-                * Filter[ffc // in_filter_channels, ffc % in_filter_channels, ry, rx, ffb]
-            ).astype(out_dtype),
-            axis=[ry, rx],
-        ),
-        tag="depthwise_conv2d_nchwc",
-    )
-
-    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
-        dummy_cast = te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
-            tag="dummy_cast",
-        )
-        return te.compute(
-            (batch, out_channles, out_height_orig, out_width_orig),
-            lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block],
-            tag="adreno_dw_conv2d_latest_op",
-        )
-    else:
-        return te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
-            tag="adreno_dw_conv2d_latest_op",
-        )
-
-
-def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output):
-    """
-    schedule optimized for batch size = 1
-
-    Algo:
-    1. Split output axis to three parts: global work size, vthread, local worksize.
-       The limitations for tuning includes heuristics from some tuned networks to limit
-       search space and not pay much time for useles configurations.
-    2. For depthwise convolution it's better to inline pad into the conv2d compute, the
-       divergence in opencl kernel will not so significant as for regular conv2d.
-    3. For 5d convolution we schedule the latest op with binding 5d axis and vectorize
-       for textures
-       For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion
-       of data type
-    4. In case of 4d conv we need to schedule postops as well
-    """
-    latest = s.outputs[0].output(0)
-    if len(latest.op.axis) == 4:
-        latest_blocked = dummy = output.op.input_tensors[0]
-        conv = dummy.op.input_tensors[0]
-    else:
-        conv = output.op.input_tensors[0]
-        latest_blocked = latest
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_fc", fc, num_outputs=3)
-    cfg.define_split("tile_y", y, num_outputs=3)
-    cfg.define_split("tile_x", x, num_outputs=3)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
-    cfg.multi_filter(
-        filter=lambda entity: (  # pylint: disable=chained-comparison
-            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
-        )
-        <= 32
-        and 32
-        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
-        < 1024
-    )
-
-    if cfg.is_fallback:
-        get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3])
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-    if (
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    ):  # len(latest.op.axis) == 4:
-        # manage scheduling of datacopy
-        pad_data, kernel = s[conv].op.input_tensors
-        if "pad_temp" in pad_data.op.name:
-            pack_data = pad_data.op.input_tensors[0]
-            bind_data_copy(s[pack_data])
-        else:
-            bind_data_copy(s[pad_data])
-        bind_data_copy(s[kernel])
-
-    pad_data, kernel = s[conv].op.input_tensors
-
-    if "pad_temp" in pad_data.op.name:
-        s[pad_data].compute_inline()
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    if autotvm.GLOBAL_SCOPE.in_tuning or len(latest.op.axis) == 4:
-        # create cache stage for tuning only or in case of 4d case
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-        if kernel.shape[2] == 1 and kernel.shape[3] == 1:
-            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-            bind_data_copy(s[WT])
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[latest_blocked].op.axis
-    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
-
-    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
-    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
-    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
-
-    bf = s[latest_blocked].fuse(n, bf)
-    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
-    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
-    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
-    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
-    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
-    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
-    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
-    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
-    s[latest_blocked].vectorize(fb)
-
-    s[conv].compute_at(s[latest_blocked], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[conv].op.axis
-
-    ry, rx = s[conv].op.reduce_axis
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb)
-    s[conv].vectorize(fb)
-
-    # unroll
-    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-    if latest_blocked != latest:
-        s[latest].compute_root()
-        bind_data_copy(s[latest], 1)
-        if latest != output:
-            s[output].compute_inline()
-
-    N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape)
-    _, _, KH, KW, ICB = get_const_tuple(kernel.shape)
-    KHKW = KH * KW
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW * ICB)
diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
deleted file mode 100644
index f224fe3c88dc..000000000000
--- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
+++ /dev/null
@@ -1,304 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""depthwise_conv2d_nhwc(c) schedule on Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from ..utils import get_const_tuple, traverse_inline
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    expand_spatial_dimensions,
-    add_pad,
-    bind_data_copy,
-    get_texture_storage,
-    get_default_conv2d_config,
-)
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.image2d")
-def schedule_depthwise_conv2d_nhwc(cfg, outs):
-    """Create the schedule for depthwise conv2d_nchw4c_ohwi4o"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "adreno_dw_conv2d_latest_op":
-            schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d")
-def depthwise_conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """
-    Depthwise convolution operator in NCHWc layout.
-    Algo:
-      1. Convert into blocked format if we have 4d original tensor.
-         In case of AutoTVM we override the convert by just tensors since such conversion
-         will be absent for real blocked convolution, no sense to include into tuning
-      2. Expand spatial dimensions to have width and height be dividable by factor 4
-         This leads to slightly bigger amount of compute but allow utilize GPU much better
-      3. Add paddings. This happens even if we do not need pad originaly. This is useful
-         due to work arounding of the gaps of texture annotation between Primary Functions
-         and limited support of textures in schedules. Later on this pad will be executed
-         separately and will produce texture
-      4. 5d Convolution compute with accumulating into out_dtype
-      5. Cast to the origin output data type
-      6. For case of 4d convolution: convert of output from 5d to 4d
-    """
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    convert_from4d = False
-    if len(Input.shape) == 4:
-        batch, in_height, in_width, in_channels = Input.shape
-        kernel_h, kernel_w, out_channles, in_filter_channels = Filter.shape
-
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block)
-            Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-            kshape = (kernel_h, kernel_w, out_channel_block, in_filter_channels, out_channel_chunks)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
-        else:
-            convert_from4d = True
-            Input = pack_input(
-                Input,
-                "NHWC",
-                batch,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                in_height,
-                in_width,
-            )
-            Filter = pack_filter(
-                Filter,
-                "HWOI",
-                out_channel_chunks,
-                out_channel_block,
-                out_channel_tail,
-                in_filter_channels,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                kernel_h,
-                kernel_w,
-            )
-
-    else:
-        batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape
-        kernel_h, kernel_w, out_channel_chunks, in_filter_channels, out_channel_block = Filter.shape
-
-    out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
-        in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w
-    )
-
-    temp = add_pad(
-        Input,
-        "NHWC",
-        out_height_orig,
-        out_width_orig,
-        kernel_h,
-        kernel_w,
-        dilation_h,
-        dilation_w,
-        padding,
-        stride_h,
-        stride_w,
-    )
-
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    conv = te.compute(
-        (batch, out_height, out_width, out_channel_chunks, out_channel_block),
-        lambda nn, yy, xx, ffc, ffb: te.sum(
-            (
-                temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffc, ffb]
-                * Filter[ry, rx, ffc, 0, ffb]
-            ).astype(out_dtype),
-            axis=[ry, rx],
-        ),
-        tag="depthwise_conv2d_nhwc",
-    )
-
-    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
-        dummy_cast = te.compute(
-            (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block),
-            lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
-            tag="dummy_cast",
-        )
-        return te.compute(
-            (batch, out_height_orig, out_width_orig, out_channles),
-            lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block],
-            tag="adreno_dw_conv2d_latest_op",
-        )
-    else:
-        return te.compute(
-            (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block),
-            lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype),
-            tag="adreno_dw_conv2d_latest_op",
-        )
-
-
-def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output):
-    """
-    schedule optimized for batch size = 1
-
-    Algo:
-    1. Split output axis to three parts: global work size, vthread, local worksize.
-       The limitations for tuning includes heuristics from some tuned networks to limit
-       search space and not pay much time for useles configurations.
-    2. In case of 4d convolution schedule copying of the input (and filter) into
-      5d tensors
-    3. For depthwise convolution it's better to inline pad into the conv2d compute, the
-       divergence in opencl kernel will not so significant as for regular conv2d.
-    4. For 5d convolution we schedule the latest op with binding 5d axis and vectorize
-       for textures
-       For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion
-       of data type
-    5. In case of 4d conv we need to schedule postops as well
-    """
-    latest = s.outputs[0].output(0)
-    if len(latest.op.axis) == 4:
-        latest_blocked = dummy = output.op.input_tensors[0]
-        conv = dummy.op.input_tensors[0]
-    else:
-        conv = output.op.input_tensors[0]
-        latest_blocked = latest
-
-    ##### space definition begin #####
-    n, y, x, fc, fb = s[conv].op.axis
-    ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_fc", fc, num_outputs=3)
-    cfg.define_split("tile_y", y, num_outputs=3)
-    cfg.define_split("tile_x", x, num_outputs=3)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
-
-    cfg.multi_filter(
-        filter=lambda entity: (  # pylint: disable=chained-comparison
-            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
-        )
-        <= 32
-        and 32
-        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
-        < 1024
-    )
-    if cfg.is_fallback:
-        get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2])
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-    if (
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    ):  # len(latest.op.axis) == 4:
-        # manage scheduling of datacopy
-        pad_data, kernel = s[conv].op.input_tensors
-        if "pad_temp" in pad_data.op.name:
-            pack_data = pad_data.op.input_tensors[0]
-            bind_data_copy(s[pack_data])
-        else:
-            bind_data_copy(s[pad_data])
-        bind_data_copy(s[kernel])
-
-    pad_data, kernel = s[conv].op.input_tensors
-
-    if "pad_temp" in pad_data.op.name:
-        s[pad_data].compute_inline()
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    if autotvm.GLOBAL_SCOPE.in_tuning or len(latest.op.axis) == 4:
-        # create cache stage for tuning only or in case of 4d case
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-        if kernel.shape[0] == 1 and kernel.shape[1] == 1:
-            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-            bind_data_copy(s[WT])
-
-    # tile and bind spatial axes
-    n, y, x, fc, fb = s[latest_blocked].op.axis
-    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
-
-    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
-    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
-    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
-
-    by = s[latest_blocked].fuse(n, by)
-    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
-    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
-    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
-    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
-    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
-    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
-    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
-    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
-    s[latest_blocked].vectorize(fb)
-
-    s[conv].compute_at(s[latest_blocked], tx)
-
-    # tile reduction axes
-    n, y, x, fc, fb = s[conv].op.axis
-
-    ry, rx = s[conv].op.reduce_axis
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb)
-    s[conv].vectorize(fb)
-
-    # unroll
-    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-    if latest_blocked != latest:
-        s[latest].compute_root()
-        bind_data_copy(s[latest], 1)
-        if latest != output:
-            s[output].compute_inline()
-
-    N, OH, OW, OCC, OCB = get_const_tuple(latest_blocked.shape)
-    KH, KW, _, _, _ = get_const_tuple(kernel.shape)
-    KHKW = KH * KW
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW)
diff --git a/python/tvm/topi/adreno/group_conv2d_nchw.py b/python/tvm/topi/adreno/group_conv2d_nchw.py
deleted file mode 100644
index f1ab7fcf0e64..000000000000
--- a/python/tvm/topi/adreno/group_conv2d_nchw.py
+++ /dev/null
@@ -1,386 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-
-"""Group Conv2d NCHW Operator wt Schedule on Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from ..utils import get_const_tuple, traverse_inline
-from .utils import (
-    split_to_chunks,
-    pack_input,
-    pack_filter,
-    expand_spatial_dimensions,
-    add_pad,
-    bind_data_copy,
-    get_default_conv2d_config,
-    get_texture_storage,
-)
-
-
-@autotvm.register_topi_schedule("group_conv2d_nchwc.image2d")
-def schedule_group_conv2d_nchwc(cfg, outs):
-    """Create the schedule for group_conv2d_nchw"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "adreno_group_conv2d_latest_op":
-            schedule_group_conv2d_NCHWc_KCRSk(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("group_conv2d_nchwc.image2d")
-def group_conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """
-    Group Convolution Operator in NCHWc layout.
-    Algo:
-      1. Convert into blocked format if we have 4d original tensor.
-         In case of AutoTVM we override the convert by just tensors since such conversion
-         will be absent for real blocked convolution, no sense to include into tuning
-      2. Expand spatial dimensions to have width and height be dividable by factor 4
-         This leads to slightly bigger amount of compute but allow utilize GPU much better
-      3. Add paddings. This happens even if we do not need pad originaly. This is useful
-         due to work surrounding the gaps of texture annotation between Primary Functions
-         and limited support of textures in schedules. Later on this pad will be executed
-         separately and will produce texture
-      4. 5d Convolution compute with accumulating into out_dtype
-      5. Cast to the origin output data type
-      6. For case of 4d convolution: convert of output from 5d to 4d
-    """
-
-    if out_dtype is None:
-        out_dtype = Input.dtype
-
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    convert_from4d = False
-    if len(Input.shape) == 4:
-        batch, in_channels, in_height, in_width = Input.shape
-        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block)
-            Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-        else:
-            Input = pack_input(
-                Input,
-                "NCHW",
-                batch,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                in_height,
-                in_width,
-            )
-    else:
-        batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape
-        in_channels = in_channel_chunks * in_channel_block
-
-    if len(Filter.shape) == 4:
-        out_channels, in_filter_channels, kernel_h, kernel_w = Filter.shape
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channels, 4)
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
-        else:
-            convert_from4d = True
-            Filter = pack_filter(
-                Filter,
-                "OIHW",
-                out_channel_chunks,
-                out_channel_block,
-                out_channel_tail,
-                in_filter_channels,
-                in_channel_chunks,
-                in_channel_block,
-                in_channel_tail,
-                kernel_h,
-                kernel_w,
-            )
-    else:
-        out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape
-        out_channels = out_channel_chunks * out_channel_block
-
-    assert in_channels % in_filter_channels == 0
-    groups = in_channels // in_filter_channels
-
-    # Compute Constraints...
-    assert out_channel_chunks % groups == 0
-    assert in_channel_chunks % groups == 0
-
-    out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
-        in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w
-    )
-
-    temp = add_pad(
-        Input,
-        "NCHW",
-        out_height_orig,
-        out_width_orig,
-        kernel_h,
-        kernel_w,
-        dilation_h,
-        dilation_w,
-        padding,
-        stride_h,
-        stride_w,
-    )
-
-    in_group_channel_chunks = in_channel_chunks // groups
-    in_group_channel_block = in_channel_block
-    out_group_channel_chunks = out_channel_chunks // groups
-    rcc = te.reduce_axis((0, in_group_channel_chunks), name="rcc")
-    rcb = te.reduce_axis((0, in_group_channel_block), name="rcb")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    conv = te.compute(
-        (batch, out_channel_chunks, out_height, out_width, out_channel_block),
-        lambda nn, occ, yy, xx, obb: te.sum(
-            (
-                temp[
-                    nn,
-                    occ // out_group_channel_chunks * in_group_channel_chunks + rcc,
-                    yy * stride_h + ry * dilation_h,
-                    xx * stride_w + rx * dilation_w,
-                    rcb,
-                ]
-                * Filter[occ, rcc * in_group_channel_block + rcb, ry, rx, obb]
-            ).astype(out_dtype),
-            axis=[rcc, rcb, ry, rx],
-        ),
-        tag="conv2d_nchwc_group",
-    )
-
-    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
-        dummy_cast = te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
-            tag="dummy_cast",
-        )
-        return te.compute(
-            (batch, out_channels, out_height_orig, out_width_orig),
-            lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block],
-            tag="adreno_group_conv2d_latest_op",
-        )
-    else:
-        return te.compute(
-            (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
-            lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
-            tag="adreno_group_conv2d_latest_op",
-        )
-
-
-def schedule_group_conv2d_NCHWc_KCRSk(cfg, s, output):
-    """
-    Schedule optimized for batch size = 1
-
-    Algo:
-    1. Split output axis to three parts: global work size, vthread, local worksize.
-       The limitations for tuning includes heuristics from some tuned networks to limit
-       search space and not pay much time for useles configurations.
-    2. In case of 4d convolution schedule copying of the input (and filter) into
-      5d tensors
-    4. pad should be scheduled separately to create independent opencl kernel. If pad is
-       inlined into convolution, this gives 1.5x performance drop
-    5. We are using cache_read for intermediate tensors to produce texture and guarantee
-       the best performance on the next stage.
-       The weights are managed through static texture planning mechanism and guarantied come
-       in texture memory scope.
-       Thus way we are calling cache_read only for data tensor
-    6. For 5d convolution we schedule the latest op with binding 5d axis and vectorize
-       for textures
-       For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion
-       of data type
-    7. In case of 4d conv we need to schedule postops as well
-    """
-    latest = s.outputs[0].output(0)
-    if len(latest.op.axis) == 4:
-        latest_blocked = dummy = output.op.input_tensors[0]
-        conv = dummy.op.input_tensors[0]
-    else:
-        conv = output.op.input_tensors[0]
-        latest_blocked = latest
-
-    pad_data, kernel = s[conv].op.input_tensors
-    filter_pack_rt = bool(
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    )
-
-    if "pad_temp" in pad_data.op.name:
-        input_pad_temp = pad_data.op.input_tensors[0]
-    else:
-        input_pad_temp = pad_data
-
-    input_pack_rt = bool(
-        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag
-    )
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-
-    if conv.shape[1] % 2 == 0:
-        min_threads_div = 2
-    else:
-        min_threads_div = 1
-    cfg.define_split(
-        "tile_fc",
-        fc,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8
-        and entity.size[2] >= min_threads_div
-        and entity.size[2] < 256,
-    )
-    cfg.define_split(
-        "tile_y",
-        y,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-    cfg.define_split(
-        "tile_x",
-        x,
-        num_outputs=3,
-        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
-    )
-
-    cfg.define_split("tile_rcc", rcc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
-    cfg.multi_filter(
-        filter=lambda entity: (  # pylint: disable=chained-comparison
-            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
-        )
-        <= 24
-        and 32
-        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
-        < 1024
-    )
-    if cfg.is_fallback:
-        get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3])
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-    # There are several conditions that have to be handled:
-    # 1. If we are in the tuning, we always add cache read for data to main conv kernel
-    #    to get texture in tuning opencl kernel
-    # 2. If we are repacking input in runtime, we should always explicit schedule this one more
-    #    stage of data copy from 4d to 5d (referred as pack_data).
-    # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the
-    #    cache_read("texture")
-    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            if "pad_temp" in pad_data.op.name:
-                s[pad_data].compute_inline()
-        else:
-            if "pad_temp" in pad_data.op.name:
-                pack_data = pad_data.op.input_tensors[0]
-                bind_data_copy(s[pack_data])
-                s[pad_data].compute_inline()
-            else:
-                pack_data = pad_data
-                bind_data_copy(s[pack_data])
-
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-    elif "pad_temp" in pad_data.op.name:
-        s[pad_data].compute_inline()
-        # create cache stage
-        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-        bind_data_copy(s[AT])
-
-    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            bind_data_copy(s[kernel])
-        if kernel.shape[2] == 1 and kernel.shape[3] == 1:
-            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-            bind_data_copy(s[WT])
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[latest_blocked].op.axis
-
-    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
-
-    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
-    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
-    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
-
-    bf = s[latest_blocked].fuse(n, bf)
-    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
-    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
-    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
-    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
-    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
-    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
-    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
-    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
-    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
-    s[latest_blocked].vectorize(fb)
-
-    s[conv].compute_at(s[latest_blocked], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[conv].op.axis
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-
-    rco, rci = cfg["tile_rcc"].apply(s, conv, rcc)
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
-    s[conv].unroll(rcb)
-    s[conv].vectorize(fb)
-
-    # unroll
-    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    if latest_blocked != latest:
-        s[latest].compute_root()
-        bind_data_copy(s[latest], 1)
-        if latest != output:
-            s[output].compute_inline()
-
-    N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape)
-    _, IC, KH, KW, _ = get_const_tuple(kernel.shape)
-    ICKHKW = IC * KH * KW
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
diff --git a/python/tvm/topi/adreno/injective.py b/python/tvm/topi/adreno/injective.py
deleted file mode 100644
index 52ab0eab33fb..000000000000
--- a/python/tvm/topi/adreno/injective.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable,
-"""Schedule for composition of injective operator"""
-import tvm
-from tvm import te
-from .utils import bind_data_copy
-from .. import utils
-
-
-def schedule_injective_from_existing(sch, out):
-    """Schedule for injective op from existing schedule.
-
-    Parameters
-    ----------
-    sch: Schedule
-         The schedule to update.
-    out: Tensor
-         The tensor representing the injective op.
-
-    Returns
-    -------
-    sch: Schedule
-         The updated schedule.
-    """
-
-    bind_data_copy(sch[out])
-    return sch
-
-
-def schedule_injective(outs):
-    """Schedule for injective op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    tvm.te.schedule.AutoInlineInjective(s)
-    for out in outs:
-        if not utils.is_empty_shape(out.shape):
-            schedule_injective_from_existing(s, out)
-    return s
diff --git a/python/tvm/topi/adreno/pooling.py b/python/tvm/topi/adreno/pooling.py
deleted file mode 100644
index c6eb35a4c9bd..000000000000
--- a/python/tvm/topi/adreno/pooling.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""pooling schedules for Qualcomm Adreno GPU"""
-import tvm
-from tvm import te
-from .. import tag
-from .utils import get_div
-
-
-def schedule_adaptive_pool(outs, layout="NCHW"):
-    """Schedule for adaptive_pool.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of adaptive_pool
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for adaptive_pool.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule_global(Pool, layout):
-        # examples of latest pool op is global max pool and non latest is global avg pooling
-        # OL - an Expr will be used for rfactor
-        # Out - programming of the parallelizm on the global level
-        # shared is not required, local could be enough but shared scope gives quite significant
-        # perf boost
-        if Pool.op in s.outputs:
-            Out = Pool
-            OL = s.cache_write(Pool, "shared")
-        else:
-            Out = outs[0].op.output(0)
-            s[Pool].set_scope("shared")
-            OL = Pool
-
-        PaddedInput = Pool.op.input_tensors[0]
-
-        # detect axis for later reorder and binding of batch/channel to blocks and
-        # spatial to threads
-        if layout in ("NCHW", "NCHW4c"):
-            channel_index = 1
-            height_index = 2
-            width_index = 3
-        else:
-            channel_index = 3
-            height_index = 1
-            width_index = 2
-
-        if isinstance(PaddedInput.op, tvm.te.ComputeOp):
-            s[PaddedInput].compute_inline()
-
-        fused_reduce = s[OL].fuse(*s[OL].op.reduce_axis)
-
-        spatial = PaddedInput.shape[height_index].value * PaddedInput.shape[width_index].value
-        # below values were selected empirically assuming that we should have some work in each
-        # thread (currently from 25-49) and number of threads not exceeding some threshold that
-        # was selected as 256 from performance point of view after experiments on Adreno 660
-        max_threads = spatial // 25 if spatial > 25 else 1
-        max_threads = 256 if max_threads > 256 else max_threads
-        num_thread = get_div(spatial, max_threads)
-
-        thread_y = te.thread_axis((0, num_thread), "threadIdx.y")
-
-        _, ki = s[OL].split(fused_reduce, factor=num_thread)
-        data_out_rf = s.rfactor(OL, ki)
-        s[data_out_rf].compute_at(s[OL], s[OL].op.reduce_axis[0])
-        s[OL].bind(s[OL].op.reduce_axis[0], thread_y)
-
-        naxis = s[Out].op.axis[0]
-        caxis = s[Out].op.axis[channel_index]
-        haxis = s[Out].op.axis[height_index]
-        waxis = s[Out].op.axis[width_index]
-
-        if layout in ("NHWC4c", "NCHW4c"):
-            texture_axis = s[Out].op.axis[-1]
-            s[Out].reorder(naxis, caxis, haxis, waxis, texture_axis)
-            s[Out].vectorize(texture_axis)
-        else:
-            texture_axis = None
-            s[Out].reorder(naxis, caxis, haxis, waxis)
-
-        bx = s[Out].fuse(naxis, caxis, haxis, waxis)
-        s[Out].bind(bx, te.thread_axis("blockIdx.x"))
-
-        s[OL].compute_at(s[Out], bx)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule global_pool
-        elif OP.tag.startswith("adaptive_pool"):
-            Pool = OP.output(0)
-            _schedule_global(Pool, layout)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-def schedule_pool(outs, layout):
-    """Schedule for various pooling operators.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of pool
-        in the format of an array of tensors.
-
-    layout: str
-        Data layout.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for pool.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(PaddedInput, Pool):
-        if isinstance(PaddedInput.op, tvm.te.ComputeOp):
-            s[PaddedInput].compute_inline()
-        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
-        num_thread = int(num_thread * 2)
-        if Pool.op in s.outputs:
-            Out = Pool
-            OL = s.cache_write(Pool, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Pool].set_scope("local")
-        fused = s[Out].fuse(*s[Out].op.axis[:-1])
-        bx, tx = s[Out].split(fused, factor=num_thread)
-        s[Out].bind(bx, te.thread_axis("blockIdx.x"))
-        s[Out].bind(tx, te.thread_axis("threadIdx.x"))
-        s[Out].vectorize(s[Out].op.axis[-1])
-        if Pool.op in s.outputs:
-            s[OL].compute_at(s[Out], tx)
-            s[OL].vectorize(s[OL].op.axis[-1])
-        else:
-            s[Pool].compute_at(s[Out], tx)
-            s[Pool].vectorize(s[Pool].op.axis[-1])
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule pool
-        elif OP.tag.startswith("pool"):
-            PaddedInput = OP.input_tensors[0]
-            Pool = OP.output(0)
-            _schedule(PaddedInput, Pool)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/adreno/reduction.py b/python/tvm/topi/adreno/reduction.py
deleted file mode 100644
index a208e2e27414..000000000000
--- a/python/tvm/topi/adreno/reduction.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,too-many-locals,len-as-condition
-"""Schedule for reduce operators"""
-import numpy
-from tvm import te
-from ..utils import get_const_tuple
-from .injective import schedule_injective_from_existing
-from .utils import get_div
-from ..cuda.reduction import schedule_reduce_impl
-
-
-def _schedule_reduce_adreno(op, sch, is_idx_reduce=False):
-    sch_output = sch.outputs[0].output(0)
-    use_rfactor = False
-    if not is_idx_reduce:
-        rdomain = 1
-        whole_rop_output = op.output(0)
-        for axis in sch[whole_rop_output].op.reduce_axis:
-            rdomain = rdomain * axis.dom.extent
-        if rdomain > 50:
-            use_rfactor = True
-            # shared goves better perf, but works only for rfactor flow
-            scope = "shared"
-        else:
-            # in case of direct scheduling, shared is failed to be compiled
-            scope = "local"
-        if op in sch.outputs:
-            whole_rop_output = sch.cache_write(sch_output, scope)
-        else:
-            # no change for whole_rop_output def, but need to set proper scope
-            sch[whole_rop_output].set_scope(scope)
-    else:
-        temp_idx_input = op.input_tensors[0].op.output(0)
-        temp_val_input = op.input_tensors[0].op.output(1)
-        sch[temp_idx_input].set_scope("local")
-        sch[temp_val_input].set_scope("local")
-
-    shape = get_const_tuple(sch_output.shape)
-    latest4 = len(shape) > 0 and shape[-1] == 4
-    div4 = numpy.prod(shape) % 4 == 0
-
-    # Fuse and split the axis
-    if latest4:
-        fused_outer = sch[sch_output].fuse(
-            *[sch[sch_output].op.axis[i] for i in range(len(sch[sch_output].op.axis) - 1)]
-        )
-    else:
-        fused_outer = sch[sch_output].fuse(
-            *[sch[sch_output].op.axis[i] for i in range(len(sch[sch_output].op.axis))]
-        )
-
-    ftc = numpy.prod(shape)
-    a = fused_outer
-
-    if not is_idx_reduce:
-        if use_rfactor:
-            # below values were selected empirically assuming that we should have some work in each
-            # thread (currently from 25-49) and number of threads not exceeding some threshold that
-            # was selected as 256 from performance point of view after experiments on Adreno 660
-            max_threads = rdomain.value // 25 if rdomain > 25 else 1
-            max_threads = 256 if max_threads > 256 else max_threads
-            num_thread = get_div(rdomain, max_threads)
-
-            fused_reduce = sch[whole_rop_output].fuse(*sch[whole_rop_output].op.reduce_axis)
-            thread_y = te.thread_axis((0, num_thread), "threadIdx.y")
-            _, ki = sch[whole_rop_output].split(fused_reduce, factor=num_thread)
-            data_out_rf = sch.rfactor(whole_rop_output, ki)
-            sch[data_out_rf].compute_at(
-                sch[whole_rop_output], sch[whole_rop_output].op.reduce_axis[0]
-            )
-            sch[whole_rop_output].bind(sch[whole_rop_output].op.reduce_axis[0], thread_y)
-
-    if div4:
-        if latest4:
-            b = sch[sch_output].op.axis[-1]
-        else:
-            a, b = sch[sch_output].split(fused_outer, factor=4)
-        sch[sch_output].vectorize(b)
-        if not use_rfactor:
-            if is_idx_reduce:
-                sch[temp_idx_input].compute_at(sch[sch_output], b)
-                sch[temp_val_input].compute_at(sch[sch_output], b)
-            else:
-                sch[whole_rop_output].compute_at(sch[sch_output], b)
-
-    if not use_rfactor:
-        num_thread = get_div(ftc, 128)
-        bx, outer_in = sch[sch_output].split(a, factor=num_thread)
-        sch[sch_output].bind(bx, te.thread_axis("blockIdx.x"))
-        sch[sch_output].bind(outer_in, te.thread_axis("threadIdx.x"))
-
-        if not div4:
-            if is_idx_reduce:
-                sch[temp_idx_input].compute_at(sch[sch_output], outer_in)
-                sch[temp_val_input].compute_at(sch[sch_output], outer_in)
-            else:
-                sch[whole_rop_output].compute_at(sch[sch_output], outer_in)
-    else:
-        sch[sch_output].bind(a, te.thread_axis("blockIdx.x"))
-        if not div4 or use_rfactor:
-            if is_idx_reduce:
-                sch[temp_idx_input].compute_at(sch[sch_output], a)
-                sch[temp_val_input].compute_at(sch[sch_output], a)
-            else:
-                sch[whole_rop_output].compute_at(sch[sch_output], a)
-
-
-def schedule_reduce(outs):
-    return schedule_reduce_impl(
-        outs, _schedule_reduce_adreno, schedule_injective_from_existing, True
-    )
diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py
deleted file mode 100644
index a42cbeeb773b..000000000000
--- a/python/tvm/topi/adreno/utils.py
+++ /dev/null
@@ -1,697 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""util functions to be reused in different compute/schedule on Qualcomm Adreno GPU"""
-
-import numpy
-import tvm
-from tvm import te
-from tvm._ffi.registry import register_func
-from tvm.topi.utils import simplify
-from tvm.topi import nn
-from tvm.autotvm.task.space import SplitEntity
-from ..utils import get_const_tuple
-
-
-def get_div(value, start):
-    """Returns the maximum divider for `value` starting from `start` value"""
-    div = 1
-    for d in range(start, 0, -1):
-        if (value % d) == 0:
-            div = d
-            break
-    return div
-
-
-def split_to_chunks(extent, block):
-    """
-    Splits the trip count value to chunks and block, returns the remainder as well
-    the chunks and blocks covers or overlaps the origin value
-
-    If extent can be divisible by block:
-        extent = chunks * block
-    else
-        extent = (chunks - 1) * block + tail
-
-    Parameters
-    ----------
-    extent: int
-        tripcount for original compute
-
-    block: int
-        size of the block
-
-    Returns
-    ----------
-    out: tuple of the (chunks, block, tail)
-         chunks = ceildiv(extent, block)
-         tail = number of origin elements in the latest chunk
-    """
-    tail = extent % block
-    chunks = extent // block
-    if tail == 0:
-        tail = block
-    else:
-        chunks += 1
-    return chunks, block, tail
-
-
-def pack_input(Input, layout, batch, chunks, block, original_tail, in_height, in_width):
-    """
-    Adds compute stages for packing of the data in runtime. Extends channel dimensions
-    to be dividable by factor 4
-
-    This function should be substituted by Schedule.transform_layout() in the future: see
-    https://github.com/apache/tvm-rfcs/blob/main/rfcs/0039-buffer-physical-layout.md
-
-    Parameters
-    ----------
-    Input: tvm.te.Tensor
-        Input tensor to be repacked in runtime
-
-    layout: string
-        Layout of origin 4d tensor
-        NCHW or NHWC are acceptable
-
-    batch: int
-        Batch size
-
-    chunks: int
-        Number of channel chunks been in the final tensor
-
-    block: int
-        size of the channel block
-
-    original_tail: int
-        Tail in the latest chunk diffing original number of channels vs blocked one
-        If original_tail != block:
-          original_channels = chunks * block - original_tail
-        else
-          original_channels = chunks * block
-
-    in_height: int
-        Height of the feature map
-
-    in_width: int
-        Width of the feature map
-    """
-
-    pad_value = tvm.tir.const(0, Input.dtype)
-
-    def _reorder_data_nchw(*indices):
-        condition = []
-        condition.append(indices[1] == chunks - 1)
-        condition.append(indices[4] >= original_tail)
-        condition = tvm.tir.all(*condition)
-        return tvm.tir.if_then_else(
-            condition,
-            pad_value,
-            Input[indices[0], indices[1] * block + indices[4], indices[2], indices[3]],
-        )
-
-    def _reorder_data_nhwc(*indices):
-        condition = []
-        condition.append(indices[3] == chunks - 1)
-        condition.append(indices[4] >= original_tail)
-        condition = tvm.tir.all(*condition)
-        return tvm.tir.if_then_else(
-            condition,
-            pad_value,
-            Input[indices[0], indices[1], indices[2], indices[3] * block + indices[4]],
-        )
-
-    # compute:
-    if layout == "NCHW":
-        reordered_data = te.compute(
-            [batch, chunks, in_height, in_width, block],
-            _reorder_data_nchw,
-            name="input_pack",
-            tag="input_pack",
-        )
-    elif layout == "NHWC":
-        reordered_data = te.compute(
-            [batch, in_height, in_width, chunks, block],
-            _reorder_data_nhwc,
-            name="input_pack",
-            tag="input_pack",
-        )
-    else:
-        assert False, "Adreno util function pack_input does not accept unknown layout"
-    return reordered_data
-
-
-def pack_filter(
-    Filter,
-    layout,
-    out_chunks,
-    out_block,
-    out_original_tail,
-    in_filter_channels,
-    in_chunks,
-    in_block,
-    in_original_tail,
-    kernel_h,
-    kernel_w,
-):
-    """
-    Adds compute stages for packing of the filter in runtime. Extends channels dimensions
-    to be dividable by factor 4
-
-    This function should be substituted by Schedule.transform_layout() in the future: see
-    https://github.com/apache/tvm-rfcs/blob/main/rfcs/0039-buffer-physical-layout.md
-
-    Parameters
-    ----------
-    Filter: tvm.te.Tensor
-        Filter tensor to be repacked in runtime
-
-    layout: string
-        Layout of origin 4d tensor
-        NCHW or NHWC are acceptable
-
-    out_chunks: int
-        Number of chunks for filters
-
-    out_block: int
-        Size of the block for output channels
-
-    out_original_tail: int
-        Original size of the latest chunk of output filters
-
-    in_filter_channels: int
-        Number of filter channels. might be different vs input channels in the
-        data due to groups/depthwise nature
-
-    in_chunks: int
-        Number of input data channel chunks
-
-    in_block: int
-        Size of the block for input data channels
-
-    in_original_tail
-        Original size of the latest chunk for input data channels
-
-    kernel_h: int
-        Height of the conv2d kernel
-
-    kernel_w: int
-        Width of the conv2d kernel
-    """
-    pad_value = tvm.tir.const(0, Filter.dtype)
-
-    def _reorder_weights_depthwise_oihw(*indices):
-        conditionA = []
-        conditionA.append(indices[0] == out_chunks - 1)
-        conditionA.append(indices[4] >= out_original_tail)
-        conditionAT = tvm.tir.all(*conditionA)
-
-        return tvm.tir.if_then_else(
-            conditionAT,
-            pad_value,
-            Filter[indices[0] * out_block + indices[4], indices[1], indices[2], indices[3]],
-        )
-
-    def _reorder_weights_depthwise_hwoi(*indices):
-        conditionA = []
-        conditionA.append(indices[2] == out_chunks - 1)
-        conditionA.append(indices[4] >= out_original_tail)
-        conditionAT = tvm.tir.all(*conditionA)
-
-        return tvm.tir.if_then_else(
-            conditionAT,
-            pad_value,
-            Filter[indices[0], indices[1], indices[2] * out_block + indices[4], indices[3]],
-        )
-
-    def _reorder_weights_depthwise_hwio(*indices):
-        conditionA = []
-        conditionA.append(indices[3] == out_chunks - 1)
-        conditionA.append(indices[4] >= out_original_tail)
-        conditionAT = tvm.tir.all(*conditionA)
-
-        return tvm.tir.if_then_else(
-            conditionAT,
-            pad_value,
-            Filter[indices[0], indices[1], indices[2], indices[3] * out_block + indices[4]],
-        )
-
-    def _reorder_weights_oihw(*indices):
-        conditionA = []
-        conditionA.append(indices[0] == out_chunks - 1)
-        conditionA.append(indices[4] >= out_original_tail)
-        conditionAT = tvm.tir.all(*conditionA)
-
-        conditionO = []
-        conditionO.append(conditionAT)
-        conditionO.append(indices[1] >= in_chunks * in_block + in_original_tail)
-        conditionOT = tvm.tir.any(*conditionO)
-        return tvm.tir.if_then_else(
-            conditionOT,
-            pad_value,
-            Filter[indices[0] * out_block + indices[4], indices[1], indices[2], indices[3]],
-        )
-
-    def _reorder_weights_hwio(*indices):
-        conditionA = []
-        conditionA.append(indices[3] == out_chunks - 1)
-        conditionA.append(indices[4] >= out_original_tail)
-        conditionAT = tvm.tir.all(*conditionA)
-
-        conditionO = []
-        conditionO.append(conditionAT)
-        conditionO.append(indices[2] >= in_chunks * in_block + in_original_tail)
-        conditionOT = tvm.tir.any(*conditionO)
-        return tvm.tir.if_then_else(
-            conditionOT,
-            pad_value,
-            Filter[indices[0], indices[1], indices[2], indices[3] * out_block + indices[4]],
-        )
-
-    def _reorder_weights_iohw(*indices):
-        conditionA = []
-        conditionA.append(indices[1] == out_chunks - 1)
-        conditionA.append(indices[4] >= out_original_tail)
-        conditionAT = tvm.tir.all(*conditionA)
-
-        conditionO = []
-        conditionO.append(conditionAT)
-        conditionO.append(indices[0] >= in_chunks * in_block + in_original_tail)
-        conditionOT = tvm.tir.any(*conditionO)
-        return tvm.tir.if_then_else(
-            conditionOT,
-            pad_value,
-            Filter[indices[0], indices[1] * out_block + indices[4], indices[2], indices[3]],
-        )
-
-    if in_filter_channels == 1:
-        if layout == "OIHW":
-            reordered_filter = te.compute(
-                [out_chunks, in_filter_channels, kernel_h, kernel_w, out_block],
-                _reorder_weights_depthwise_oihw,
-                name="filter_pack",
-                tag="filter_pack",
-            )
-        elif layout == "HWOI":
-            reordered_filter = te.compute(
-                [kernel_h, kernel_w, out_chunks, in_filter_channels, out_block],
-                _reorder_weights_depthwise_hwoi,
-                name="filter_pack",
-                tag="filter_pack",
-            )
-        elif layout == "HWIO":
-            reordered_filter = te.compute(
-                [kernel_h, kernel_w, in_filter_channels, out_chunks, out_block],
-                _reorder_weights_depthwise_hwio,
-                name="filter_pack",
-                tag="filter_pack",
-            )
-        else:
-            assert False, "Adreno util function def pack_filter does not accept unknown layout"
-    else:
-        if layout == "OIHW":
-            reordered_filter = te.compute(
-                [out_chunks, in_filter_channels, kernel_h, kernel_w, out_block],
-                _reorder_weights_oihw,
-                name="filter_pack",
-                tag="filter_pack",
-            )
-        elif layout == "IOHW":
-            reordered_filter = te.compute(
-                [in_filter_channels, out_chunks, kernel_h, kernel_w, out_block],
-                _reorder_weights_iohw,
-                name="filter_pack",
-                tag="filter_pack",
-            )
-        elif layout == "HWIO":
-            reordered_filter = te.compute(
-                [kernel_h, kernel_w, in_filter_channels, out_chunks, out_block],
-                _reorder_weights_hwio,
-                name="filter_pack",
-                tag="filter_pack",
-            )
-        else:
-            assert False, "Adreno util function def pack_filter does not accept unknown layout"
-    return reordered_filter
-
-
-def expand_spatial_dimensions(
-    in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w
-):
-    """
-    Expands spatial dimensions to be dividable by factor 4. This will allow us to do extrimely
-    better parallel computation on GPU. The drawback of this solution - it will be number of
-    useless computations. By fact the speed-up of parallelism significantly overcomes the slowdown
-    of extra compute and eventuially this is useful approach, at least for GPU
-
-    Parameters
-    ----------
-    in_height: int
-        Height of the feature map
-
-    in_width: int
-        Width of the feature map
-
-    kernel_h: int
-        Height of the conv2d kernel
-
-    kernel_w: int
-        Width of the conv2d kernel
-
-    dilation_h: int
-        Vertical dilation of the conv2d kernel
-
-    dilation_w: int
-        Horizontal dilation of the conv2d kernel
-
-    padding: tuple or list
-        Conv2d paddings
-
-    stride_h: int
-        Vertical stride  of the conv2d kernel
-
-    stride_w: int
-        Horizontal stride  of the conv2d kernel
-    """
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    out_height_orig = out_height = simplify(
-        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1
-    )
-    out_width_orig = out_width = simplify(
-        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1
-    )
-
-    # can output shape be divded by 2 or even 4?
-    # if it cannot be divided, need to extend for further help with split
-    # theortically there should be addition padding for inputs, but it will be optimized by
-    # cache_read InferBound. We must proceed pad here exactly to produce tensor which is
-    # required for calculation of original out size, not more! In other case intermediate
-    # tensor might be allcoated with less sizes while compute will try to fill the expanded
-    # one - data discrepancy as a result
-    # And in case of textures it is not a problem if we provide texture of less size because
-    # 1. It is not important which values would be for extra calc - these calculations are
-    #    required only for better utilizatin of GPU fit to working groups
-    # 2. When we request pixel out opf bound, texture will handle this correctly. As mentioned
-    #    above, the value itself is not important
-    if out_height % 2 != 0:
-        out_height += 1
-    if out_width % 2 != 0:
-        out_width += 1
-
-    if out_height % 4 != 0:
-        out_height += 2
-    if out_width % 4 != 0:
-        out_width += 2
-    return out_height_orig, out_height, out_width_orig, out_width
-
-
-def add_pad(
-    data,
-    layout,
-    out_height,
-    out_width,
-    kernel_h,
-    kernel_w,
-    dilation_h,
-    dilation_w,
-    padding,
-    stride_h,
-    stride_w,
-):
-    """Computes required padding values by the parameters of conv2d and adds
-        compute for extending of original tensor
-
-    Parameters
-    ----------
-    data: tvm.te.Tensor
-        5d tensor, the layout of spatial dimensions are defined as separate argument
-
-    layout: string
-        Layout of origin 4d tensor
-
-    out_height: int
-        Height of the output feature map
-
-    out_width: int
-        Width of the output feature map
-
-    kernel_h: int
-        Height of the conv2d kernel
-
-    kernel_w: int
-        Width of the conv2d kernel
-
-    dilation_h: int
-        Height dilation value from conv2d attributes
-
-    dilation_w: int
-        Width dilation value from conv2d attributes
-
-    padding: list / tuple of n ints
-        Padding values from conv2d attributes
-
-    stride_h: int
-        Height stride value from conv2d attributes
-
-    stride_w: int
-        Width stride value from conv2d attributes
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        n-D, the same layout as Input.
-    """
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    # compute graph
-    if layout == "NCHW":
-        y_axis = 2
-        x_axis = 3
-        if len(data.shape) == 4:
-            _, _, in_height, in_width = data.shape
-        else:
-            _, _, in_height, in_width, _ = data.shape
-    elif layout == "NHWC":
-        y_axis = 1
-        x_axis = 2
-        if len(data.shape) == 4:
-            _, in_height, in_width, _ = data.shape
-        else:
-            _, in_height, in_width, _, _ = data.shape
-    else:
-        assert False, "not supported layout in adreno util add_pad"
-    pad_before = [0, 0, 0, 0, 0]
-    pad_after = [0, 0, 0, 0, 0]
-    pad_before[y_axis] = pad_top
-    pad_before[x_axis] = pad_left
-    pad_after[y_axis] = pad_down
-    pad_after[x_axis] = pad_right
-
-    # calculation of real used input size:
-    input_latest_w = (out_width - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
-    input_latest_h = (out_height - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
-    if input_latest_w < in_width + pad_before[x_axis] + pad_after[x_axis]:
-        pad_after[x_axis] -= in_width + pad_before[x_axis] + pad_after[x_axis] - input_latest_w
-    if input_latest_h < in_height + pad_before[y_axis] + pad_after[y_axis]:
-        pad_after[y_axis] -= in_height + pad_before[y_axis] + pad_after[y_axis] - input_latest_h
-    if (
-        pad_before[0] == 0
-        and pad_before[1] == 0
-        and pad_before[2] == 0
-        and pad_before[3] == 0
-        and pad_after[0] == 0
-        and pad_after[1] == 0
-        and pad_after[2] == 0
-        and pad_after[3] == 0
-    ):
-        return data
-    else:
-        return nn.pad(data, pad_before, pad_after, name="pad_temp")
-
-
-def bind_data_copy(stage, axis_to_vectorize=None):
-    """
-    Schedules the eltwise stages like copying of data or postops
-
-    Parameters
-    ----------
-    stage: tvm.te.Tensor
-
-    axis_to_vectorize:
-        Causes to split certain axis, moves inner part to the end of schedule
-        and enable vectorization by this axis
-        If parameter is not pointed, the schedule will be vectorized if the most inner
-        dim is eq to 4 (size of the vector in texture)
-    """
-    shape = get_const_tuple(stage.op.output(0).shape)
-    if axis_to_vectorize and len(shape) == 4 and shape[axis_to_vectorize] % 4 == 0:
-        ax0, ax1, ax2, ax3 = stage.op.axis
-        if axis_to_vectorize == 1:
-            oax1, iax1 = stage.split(ax1, factor=4)
-            stage.reorder(ax0, oax1, ax2, ax3, iax1)
-            stage.vectorize(iax1)
-            fused = stage.fuse(ax0, oax1, ax2, ax3)
-        elif axis_to_vectorize == 3:
-            oax3, iax3 = stage.split(ax3, factor=4)
-            stage.reorder(ax0, ax1, ax2, oax3, iax3)
-            stage.vectorize(iax3)
-            fused = stage.fuse(ax0, ax1, ax2, oax3)
-
-        ftc = numpy.prod(shape) // 4
-        div = get_div(ftc, 128)
-        block, thread = stage.split(fused, factor=div)
-
-        stage.bind(block, te.thread_axis("blockIdx.z"))
-        stage.bind(thread, te.thread_axis("threadIdx.z"))
-    else:
-        if len(shape) > 0 and shape[-1] == 4:
-            axes = stage.op.axis
-            fused = stage.fuse(*axes[:-1])
-            ftc = numpy.prod(shape[:-1])
-            div = get_div(ftc, 64)
-            block, thread = stage.split(fused, factor=div)
-            stage.bind(block, te.thread_axis("blockIdx.x"))
-            stage.bind(thread, te.thread_axis("threadIdx.x"))
-            stage.vectorize(axes[-1])
-        else:
-            ftc = numpy.prod(shape)
-            vthread = get_div(ftc, 8)
-            fused = stage.fuse(*stage.op.axis)
-            ftc = ftc // vthread
-            # 1024 is a maximum work group size on the most Adreno GPU
-            num_thread = get_div(ftc, 1024 // vthread)
-            a, b = stage.split(fused, factor=num_thread)
-            a, c = stage.split(a, factor=vthread)
-            stage.bind(c, te.thread_axis("vthread"))
-            stage.bind(a, te.thread_axis("blockIdx.x"))
-            stage.bind(b, te.thread_axis("threadIdx.x"))
-
-
-def get_texture_storage(shape):
-    """
-    Returns the texture layout acceptable for the shape
-
-    Parameters
-    ----------
-    shape: array
-        Shape of the tensor to be packed to texture
-    """
-    # certain limitation of the Qualcomm devices. Subject to be determined for certain device
-    # individually, but until we have access to remote device during compilation, we have to
-    # define it uniformly for all target devices
-    # limit = 16384
-    limit = tvm.target.Target.current().attrs["texture_spatial_limit"]
-
-    if shape[0] * shape[1] * shape[2] < limit and shape[3] < limit:
-        return "global.texture"
-    elif shape[0] * shape[1] < limit and shape[2] * shape[3] < limit:
-        return "global.texture-nhwc"
-    else:
-        return "global.texture-weight"
-
-
-@register_func("tvm.info.mem.global.texture")
-@register_func("tvm.info.mem.global.texture-nhwc")
-@register_func("tvm.info.mem.global.texture-weight")
-def mem_info_global_texture_variants():
-    return tvm.ir.make_node(
-        "MemoryInfo",
-        unit_bits=16,
-        max_num_bits=16384 * 16384 * 4 * 32,
-        max_simd_bits=4 * 32,
-        head_address=None,
-    )
-
-
-def infer_tile_size(data, layout):
-    """Compute the tile size for Winograd algorithm
-
-    Parameters
-    ----------
-    data: tvm.te.Tensor
-        Data tensor
-
-    layout: string
-        Layout of data tebsir
-        NCHW, NCHW4c, NHWC or NHWC4c are acceptable
-
-    Returns
-    -------
-    tile_size : int
-        Calculated tile size
-    """
-    assert layout in ("NCHW", "NCHW4c", "NHWC", "NHWC4c"), "Incompatible layout"
-    if layout in ("NCHW", "NCHW4c"):
-        H = get_const_tuple(data.shape)[2]
-    else:
-        H = get_const_tuple(data.shape)[1]
-
-    if H % 8 == 0:
-        return 4
-    return 2
-
-
-def get_default_conv2d_config(cfg, fc, y, x):
-    """Defines conv2d default parameters for split axis for Adreno conv2d and depthwise conv2d"""
-    # look for vthread params:
-    vy = 1
-    for n in range(5, 0, -1):
-        if y % n == 0:
-            vy = n
-            break
-
-    vx = 1
-    for n in range(5, 0, -1):
-        if x % n == 0 and vy * n < 9:
-            vx = n
-            break
-
-    y = y // vy
-    x = x // vx
-
-    tfc = 1
-    for n in range(64, 0, -1):
-        if fc % n == 0:
-            tfc = n
-            break
-    ty = 1
-    for n in range(16, 0, -1):
-        if y % n == 0 and tfc * n <= 512:
-            ty = n
-            break
-    tx = 1
-    for n in range(16, 0, -1):
-        if x % n == 0 and tfc * ty * n <= 512:
-            tx = n
-            break
-
-    fc = fc // tfc
-    y = y // ty
-    x = x // tx
-
-    cfg["tile_fc"] = SplitEntity([fc, 1, tfc])
-    cfg["tile_y"] = SplitEntity([y, vy, ty])
-    cfg["tile_x"] = SplitEntity([x, vx, tx])
diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py
deleted file mode 100644
index 5484adaa6409..000000000000
--- a/python/tvm/topi/arm_cpu/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=wildcard-import
-"""Schedule for ARM CPU"""
-
-from .conv1d import *
-from .conv2d import *
-from .depthwise_conv2d import *
-from .conv2d_transpose import *
-from .conv2d_int8 import *
-from .bitserial_conv2d import *
-from .bitserial_dense import *
-from .injective import *
-from .group_conv2d import *
-from .pooling import *
-from .dense import *
-from .matmul import *
-from .qnn import *
-
-from . import conv2d_alter_op
-from . import dense_alter_op
-from . import qnn_alter_op
-from . import qnn_legalize
diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py
deleted file mode 100644
index f690b2273112..000000000000
--- a/python/tvm/topi/arm_cpu/arm_utils.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Arm target utility functions"""
-
-import tvm
-from tvm.target import Target
-from tvm.tir.expr import PrimExpr
-
-
-def get_tiling_A(interleave_A, in_dtype, use_sme=False):
-    """Compute the tiling information for matrix A in C=A*B,
-    which corresponds to the im2col-transformed input matrix.
-
-    The tiling information is chosen to maximize register usage during
-    the tile computation.
-
-    Please refer to:
-    - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-performance-for-armv8-architectures # pylint: disable=line-too-long
-    - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product
-    - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-through-mmla-instruction
-    - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h
-     In order to have more information
-
-    Parameters
-    ----------
-    interleave_A : bool
-        determines if A is expected to be interleaved
-    in_dtype : str
-        input datatype
-    use_sme : bool
-        determines if SME operations on scalable vectors are expected
-
-    Returns
-    ----------
-    tile_M: the output tile size of A on M axis (M = OH * OW)
-    tile_K: the output tile size of A on K axis (K = KW * KH * IC)
-    """
-    target = Target.current(allow_none=False)
-    if in_dtype in ["int8", "uint8"]:
-        if target.features.has_matmul_i8:
-            # If smmla/ummla is enabled, we are loading 8 rows from A. Each row
-            # will contain 8 elements
-            tile_M = 8
-            tile_K = 8
-        elif target.features.has_dotprod and interleave_A:
-            # If dot product has been enabled, and we are interleaving A
-            # tile size should be 8x4
-            tile_M = 8
-            tile_K = 4
-        else:
-            # If either there is no dot product or if we are using a native strategy
-            # tile size should be 4x16
-            tile_M = 4
-            tile_K = 16
-    elif use_sme:
-        tile_M = 2 * tvm.tir.get_vscale_expr(in_dtype)
-        if in_dtype == "float16":
-            tile_K = tvm.tir.get_vscale_expr(in_dtype)
-        else:
-            tile_K = 2 * tvm.tir.get_vscale_expr(in_dtype)
-    else:
-        # In non-SME, non-quantized cases, A is not interleaved.
-        # We are loading 4 rows from A.
-        # Each row will contain 4 elements, along the dimension of reduction
-        tile_M = 4
-        tile_K = 4
-
-    return tile_M, tile_K
-
-
-def get_tiling_B_transformed(interleave_A, in_dtype, use_scalable_vectors=False, use_sme=False):
-    """Compute the tiling information for matrix B', where B'
-    is the tiled, interleaved (and transposed) version of matrix B in C=A*B.
-
-    The tiling information is chosen to maximize register usage during the
-    tile computation.
-
-    Please refer to:
-    - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-performance-for-armv8-architectures # pylint: disable=line-too-long
-    - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product
-    - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-through-mmla-instruction
-    - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h
-     In order to have more information
-
-    Parameters
-    ----------
-    interleave_A : bool
-        determines if A is expected to be interleaved
-    in_dtype : str
-        input datatype
-    use_scalable_vectors : bool
-        determines if operations on scalable vectors are expected
-    use_sme : bool
-        determines if SME operations on scalable vectors are expected
-
-
-    Returns
-    ----------
-    tile_N: the output tile size of B' on N axis (N = OC)
-    tile_K: the output tile size of B' on K axis (K = KW * KH * IC)
-    """
-    target = Target.current(allow_none=False)
-    if in_dtype in ["int8", "uint8"]:
-        if target.features.has_matmul_i8:
-            # If smmla/ummla is available,  A must be interleaved.
-            # Each load from B' will contain 8 elements
-            # and we are loading 12 rows of B' (i.e., 12 columns of B)
-            tile_N = 12
-            tile_K = 8
-        elif target.features.has_dotprod:
-            # The number of tile rows of B' vary depending on the
-            # strategy:
-            # * If we are interleaving A, then we select 12 columns from B'(i.e.,
-            #   12 rows from B).
-            # * If we are not interleaving A, then we select 16 columns from B'(i.e.,
-            #   16 rows from B).
-            tile_N = 12 if interleave_A else 16
-
-            # Dot product instruction groups 2 (u)int16x8 vectors in
-            # groups of 4 and compute the dot product among those groups
-            # This means that the number of columns in a tile of B' (i.e.,  the
-            # rows of the original matrix B)  need to be 4.
-            tile_K = 4
-        else:
-            # If no acceleration is available, A must be interleaved. In this case
-            # we load 4 rows of B' (i.e., 4 columns of B). Each of them will contain 16 elements
-            tile_N = 4
-            tile_K = 16
-    elif use_sme:
-        tile_N = 2 * tvm.tir.get_vscale_expr(in_dtype)
-        if in_dtype == "float16":
-            tile_K = tvm.tir.get_vscale_expr(in_dtype)
-        else:
-            tile_K = 2 * tvm.tir.get_vscale_expr(in_dtype)
-    # In non-SME, non-quantized cases, A is not interleaved.
-    elif use_scalable_vectors:
-        # Each load from B' contains 4 * scalable vectors (i.e. 4 * SVL columns from B)
-        # We are loading 4 rows from B', in the dimension of reduction (i.e. 4 rows from B)
-        tile_N = 4 * tvm.tir.get_vscale_expr(in_dtype)
-        tile_K = 4
-    elif in_dtype == "float16" and target.features.has_fp16_simd:
-        # Each load from B' contains 32 elements (i.e. 32 columns from B)
-        # We are loading 4 rows from B', in the dimension of reduction (i.e. 4 rows from B)
-        tile_N = 32
-        tile_K = 4
-    else:
-        # Each load from B' contains 16 elements (i.e. 16 columns from B)
-        # We are loading 4 rows from B', in the dimension of reduction (i.e. 4 rows from B)
-        tile_N = 16
-        tile_K = 4
-
-    return tile_N, tile_K
-
-
-def get_conv2d_im2col_padding(M, K, tile_M, tile_K):
-    """Compute the necessary padding for matrix A in C=A*B,
-    which corresponds to the im2col-transformed input matrix.
-
-    Parameters
-    ----------
-    M : int
-        Number of rows in A = OH * OW
-    K : int
-        Number of columns in A = KW * KH * IC
-    tile_M : int
-             tile size of A on M axis
-    tile_K : int
-             tile size of A on K axis
-
-    Returns
-    ----------
-    pad_M : padding for M axis
-    pad_K : padding for K axis
-    """
-    pad_M = 0
-    pad_K = 0
-
-    if M % tile_M != 0:
-        pad_M = tile_M - (M % tile_M)
-
-    if K % tile_K != 0:
-        pad_K = tile_K - (K % tile_K)
-
-    return pad_M, pad_K
-
-
-def pad_dim_to_multiple(dim: PrimExpr, multiple: PrimExpr):
-    """
-    Compute the padding required to reach specified multiple.
-
-    Parameters
-    ----------
-    dim : PrimExpr
-        Current size of the dim.
-    multiple : PrimExpr
-        Multiple to pad up to.
-
-    Returns
-    -------
-    padded_dim : PrimExpr
-        The new dim size.
-    pad_value : PrimExpr
-        The padding required.
-    """
-    pad_value = 0
-    if dim % multiple != 0:
-        pad_value = multiple - (dim % multiple)
-    padded_dim = dim + pad_value
-    return padded_dim, pad_value
-
-
-def get_conv2d_weights_padding(N, K, tile_N, tile_K):
-    """Compute the necessary padding for matrix B', where B'
-    is the transformed version of matrix B in C=A*B.
-
-    Parameters
-    ----------
-    N : int
-        Number of columns in B = OC
-    K : int
-        Number of rows in B = KW * KH * IC
-    tile_N : int
-             tile size of B' on N axis
-    tile_K : int
-             tile size of B' on K axis
-
-    Returns
-    ----------
-    pad_N : padding for N axis
-    pad_K : padding for K axis
-    """
-    pad_N = 0
-    pad_K = 0
-
-    if N % tile_N != 0:
-        pad_N = tile_N - (N % tile_N)
-
-    # Tensorize will later make use of 4 tiles at once across the K axis so make sure we pad such
-    # that K is multiple of 4
-    K_multiplier = 4
-    tile_K_multiplied = tile_K * K_multiplier
-    K_misalignment = K % tile_K_multiplied
-
-    if K_misalignment != 0:
-        pad_K = tile_K_multiplied - K_misalignment
-
-    return pad_N, pad_K
diff --git a/python/tvm/topi/arm_cpu/bitserial_conv2d.py b/python/tvm/topi/arm_cpu/bitserial_conv2d.py
deleted file mode 100644
index def9b8345cd8..000000000000
--- a/python/tvm/topi/arm_cpu/bitserial_conv2d.py
+++ /dev/null
@@ -1,468 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument
-"""Bitserial conv2d schedule on arm cpu"""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import relay
-from ..nn.pad import pad
-from ..nn.bitserial_conv2d import bitserial_conv2d_legalize
-from ..nn.bitserial_util import bitpack, binary_op_multiplier
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_int, get_const_tuple, traverse_inline
-
-
-def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True):
-    if use_bitpack:
-        kernel_q = bitpack(kernel, kernel_bits, pack_axis=2, bit_axis=2, pack_type="uint8")
-    else:
-        kernel_q = kernel
-    KH, KW, KB, CI, CO = kernel_q.shape
-    kvshape = (CO // VC, KH, KW, KB, VC, CI)
-    return te.compute(
-        kvshape,
-        lambda co, dh, dw, b, vc, ci: kernel_q[dh][dw][b][ci][co * VC + vc],
-        name="kernel_vec",
-    )
-
-
-@autotvm.register_topi_compute("bitserial_conv2d_nhwc.arm_cpu")
-def bitserial_conv2d_nhwc(
-    cfg,
-    data,
-    kernel,
-    stride,
-    padding,
-    activation_bits,
-    weight_bits,
-    pack_dtype,
-    out_dtype,
-    unipolar,
-):
-    """Compute convolution with pack on spatial axes."""
-    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
-    assert pack_dtype == "uint8", "only support packing into uint8 bits"
-    assert out_dtype == "int16", "only support output type of int16"
-
-    N, H, W, CI = get_const_tuple(data.shape)
-    if len(kernel.shape) == 4:
-        KH, KW, _, CO = get_const_tuple(kernel.shape)
-        CI_packed = CI // 8
-    else:
-        KH, KW, KB, CI_packed, CO = get_const_tuple(kernel.shape)
-
-    if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2):
-        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
-    else:
-        TPAD, LPAD, DPAD, RPAD = padding
-
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-    HCAT, WCAT = KH - 1, KW - 1
-
-    PAD_H = H + (TPAD + DPAD)
-    PAD_W = W + (LPAD + RPAD)
-    OH = (PAD_H - KH) // HSTR + 1
-    OW = (PAD_W - KW) // WSTR + 1
-    oshape = (1, OH, OW, CO)
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    # Pad input channels of weights and data when it is not a multiple of 8
-    if CI_packed % 8 != 0:
-        CI_PAD = CI_packed % 8
-        CI_packed += CI_PAD
-    else:
-        CI_PAD = 0
-
-    # ==================== define configuration space ====================
-    n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO)
-    ci, kh, kw = cfg.reduce_axis(CI_packed), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-    ib, kb = cfg.reduce_axis(activation_bits), cfg.reduce_axis(weight_bits)
-
-    co, vc = cfg.define_split("tile_co", co, num_outputs=2, filter=lambda x: x.size[-1] == 8)
-    oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2, filter=lambda x: x.size[-1] >= 2)
-    ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda x: x.size[-1] >= 2)
-    ci_o, ci_i = cfg.define_split(
-        "tile_ci", ci, num_outputs=2, filter=lambda x: x.size[-1] == 8 or x.size[-1] == 16
-    )
-    re_axes = cfg.define_reorder(
-        "reorder_0",
-        [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i],
-        policy="candidate",
-        candidate=[
-            [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i],
-            [n, oh, ow, co, vh, vw, kw, kh, ci_o, kb, ib, vc, ci_i],
-        ],
-    )
-    # binary ops
-    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype))
-    # ====================
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type="uint8")
-
-    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC, len(kernel.shape) == 4)
-    idxm = tvm.tir.indexmod
-    if idxm(kernel_vec.shape[-1], 8) != 0 and CI_PAD != 0:
-        kernel_vec = pad(kernel_vec, [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, CI_PAD])
-
-    N, H, W, IB, CI = data_q.shape
-    OCO, KH, KW, KB, VC, CI = kernel_vec.shape
-
-    dvshape = (
-        N,
-        PAD_H // (VH * HSTR),
-        PAD_W // (VW * WSTR),
-        VH * HSTR + HCAT,
-        VW * WSTR + WCAT,
-        IB,
-        CI,
-    )
-    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
-
-    if TPAD != 0 and RPAD != 0:
-        data_pad = pad(data_q, (0, TPAD, LPAD, 0, 0), (0, DPAD, RPAD, 0, CI_PAD), name="data_pad")
-    elif CI_PAD != 0:
-        data_pad = pad(data_q, (0, 0, 0, 0, 0), (0, 0, 0, 0, CI_PAD), name="data_pad")
-    else:
-        data_pad = data_q
-
-    data_vec = te.compute(
-        dvshape,
-        lambda n, h, w, vh, vw, b, ci: data_pad[n][h * VH * HSTR + vh][w * VW * WSTR + vw][b][ci],
-        name="data_vec",
-    )
-    ci = te.reduce_axis((0, CI), name="ci")
-    dh = te.reduce_axis((0, KH), name="dh")
-    dw = te.reduce_axis((0, KW), name="dw")
-    ib = te.reduce_axis((0, IB), name="ib")
-    kb = te.reduce_axis((0, KB), name="kb")
-
-    def _bipolar_conv(n, h, w, co, vh, vw, vc):
-        return te.sum(
-            (
-                tvm.tir.popcount(
-                    kernel_vec[co, dh, dw, kb, vc, ci].astype("uint16")
-                    & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci].astype("uint16")
-                )
-                << (kb + ib).astype("uint16")
-            ),
-            axis=[dh, dw, kb, ib, ci],
-        )
-
-    def _unipolar_conv(n, h, w, co, vh, vw, vc):
-        return te.sum(
-            (
-                (
-                    tvm.tir.popcount(
-                        kernel_vec[co, dh, dw, kb, vc, ci].astype("int16")
-                        & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci].astype("int16")
-                    )
-                    - tvm.tir.popcount(
-                        ~kernel_vec[co, dh, dw, kb, vc, ci].astype("int16")
-                        & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci]
-                    ).astype("int16")
-                )
-                << (kb + ib).astype("int16")
-            ),
-            axis=[dh, dw, kb, ib, ci],
-        )
-
-    if unipolar:
-        conv_vec = te.compute(ovshape, _unipolar_conv, name="conv_vec", tag="unipolar")
-    else:
-        conv_vec = te.compute(ovshape, _bipolar_conv, name="conv_vec", tag="bipolar")
-
-    conv = te.compute(
-        oshape,
-        lambda n, h, w, co: conv_vec[
-            n, idxd(h, VH), idxd(w, VW), idxd(co, VC), idxm(h, VH), idxm(w, VW), idxm(co, VC)
-        ].astype(out_dtype),
-        name="conv",
-        tag="spatial_bitserial_conv_nhwc",
-    )
-
-    return conv
-
-
-def _intrin_popcount(m, k_i, w_b, x_b, unipolar):
-    pack_dtype = "uint8"
-    w = te.placeholder((w_b, m, k_i), dtype=pack_dtype, name="w")
-    x = te.placeholder(
-        (
-            x_b,
-            k_i,
-        ),
-        dtype=pack_dtype,
-        name="x",
-    )
-    k = te.reduce_axis((0, k_i), name="k")
-    bw = te.reduce_axis((0, w_b), name="bw")
-    bx = te.reduce_axis((0, x_b), name="bx")
-    if unipolar:
-        dtype = "int16"
-        z = te.compute(
-            (m,),
-            lambda i: te.sum(
-                (
-                    tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype))
-                    - tvm.tir.popcount(~w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype))
-                )
-                << (bw + bx).astype(dtype),
-                axis=[bw, bx, k],
-            ),
-            name="z",
-        )
-    else:
-        dtype = "uint16"
-        z = te.compute(
-            (m,),
-            lambda i: te.sum(
-                tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype))
-                << (bw + bx).astype(dtype),
-                axis=[bw, bx, k],
-            ),
-            name="z",
-        )
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=k_i, strides=[te.var("ldw"), te.var("ldw"), 1]
-    )  # stride can be inferred
-    Xb = tvm.tir.decl_buffer(
-        x.shape, x.dtype, name="X", offset_factor=k_i, strides=[te.var("ldw"), 1]
-    )
-    Zb = tvm.tir.decl_buffer(z.shape, z.dtype, name="Z", offset_factor=1, strides=[1])
-
-    def _intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-
-        args_2 = tvm.tir.const(2, "uint32")
-
-        if unipolar:
-            vpadd = "llvm.arm.neon.vpadd.v8i8"
-            vpadalu = "llvm.arm.neon.vpadals.v16i8.v8i16"
-            full_dtype = "int8x16"
-            half_dtype = "int8x8"
-            return_dtype = "int16x8"
-        else:
-            vpadd = "llvm.arm.neon.vpadd.v8u8"
-            vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
-            full_dtype = "uint8x16"
-            half_dtype = "uint8x8"
-            return_dtype = "uint16x8"
-
-        def _instr(index):
-            irb = tvm.tir.ir_builder.create()
-            if index == 1:  # reduce reset
-                irb.emit(zz.vstore(0, tvm.tir.const(0, return_dtype)))
-                return irb.get()
-            # body and reduce update
-            cnts8 = [None] * 8
-            cnts4 = [None] * 4
-            cnts2 = [None] * 2
-            for bw in range(w_b):
-                for bx in range(x_b):
-                    if k_i == 16:
-                        for i in range(m):
-                            w_ = ww.vload([bw, i, 0], "uint8x16").astype(full_dtype)
-                            x_ = xx.vload([bx, 0], "uint8x16").astype(full_dtype)
-                            if unipolar:
-                                cnts = tvm.tir.popcount(w_ & x_) - tvm.tir.popcount(~w_ & x_)
-                            else:
-                                cnts = tvm.tir.popcount(w_ & x_)
-                            upper_half = tvm.tir.call_intrin(half_dtype, "tir.vectorhigh", cnts)
-                            lower_half = tvm.tir.call_intrin(half_dtype, "tir.vectorlow", cnts)
-                            cnts8[i] = upper_half + lower_half
-                        for i in range(m // 2):
-                            cnts4[i] = tvm.tir.call_llvm_pure_intrin(
-                                half_dtype, vpadd, args_2, cnts8[i * 2], cnts8[i * 2 + 1]
-                            )
-                        for i in range(m // 4):
-                            cnts2[i] = tvm.tir.call_llvm_pure_intrin(
-                                half_dtype, vpadd, args_2, cnts4[i * 2], cnts4[i * 2 + 1]
-                            )
-                        cnts = tvm.tir.call_intrin(
-                            full_dtype, "tir.vectorcombine", cnts2[0], cnts2[1]
-                        )
-                        shifted_cnts = cnts << tvm.tir.const(bw + bx, pack_dtype)
-                        out = tvm.tir.call_llvm_pure_intrin(
-                            return_dtype, vpadalu, args_2, zz.vload(0, return_dtype), shifted_cnts
-                        )
-                    else:  # ki == 8
-                        for i in range(m):
-                            w_ = ww.vload([bw, i, 0], "uint8x8").astype(half_dtype)
-                            x_ = xx.vload([bx, 0], "uint8x8").astype(half_dtype)
-                            if unipolar:
-                                cnts8[i] = tvm.tir.popcount(w_ & x_) - tvm.tir.popcount(~w_ & x_)
-                            else:
-                                cnts8[i] = tvm.tir.popcount(w_ & x_)
-                        for i in range(m // 2):
-                            cnts4[i] = tvm.tir.call_llvm_pure_intrin(
-                                half_dtype, vpadd, args_2, cnts8[i * 2], cnts8[i * 2 + 1]
-                            )
-                        for i in range(m // 4):
-                            cnts2[i] = tvm.tir.call_llvm_pure_intrin(
-                                half_dtype, vpadd, args_2, cnts4[i * 2], cnts4[i * 2 + 1]
-                            )
-                        cnts = tvm.tir.call_intrin(
-                            full_dtype, "tir.vectorcombine", cnts2[0], cnts2[1]
-                        )
-                        shifted_cnts = cnts << tvm.tir.const(bw + bx, pack_dtype)
-                        out = tvm.tir.call_llvm_pure_intrin(
-                            return_dtype, vpadalu, args_2, zz.vload(0, return_dtype), shifted_cnts
-                        )
-                    irb.emit(zz.vstore(0, out))
-            return irb.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        z.op, _intrin_func, binds={w: Wb, x: Xb, z: Zb}, default_buffer_params=buffer_params
-    )
-
-
-# ARM specific schedule that using custom microkernel
-def _schedule_spatial_conv2d_nhwc(
-    cfg, s, data_pad, data_vec, kernel_vec, conv_out, output, last, unipolar
-):
-    _, _, _, _, _, IB, CI = data_vec.shape
-    _, KH, KW, KB, _, _ = kernel_vec.shape
-    KB = get_const_int(KB)
-    IB = get_const_int(IB)
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    ##### Schedule data padding and  packing
-    if data_pad is not None:
-        s[data_pad].compute_inline()
-
-    _, h, _, _, _, _, _ = s[data_vec].op.axis
-    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
-    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
-    s[data_vec].parallel(oh)
-
-    #### Schedule kernel packing
-    co, _, _, _, _, _ = s[kernel_vec].op.axis
-    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
-    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
-    s[kernel_vec].parallel(oco)
-
-    ##### Schedule Convolution
-    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
-    kh, kw, kb, ib, ci = s[conv_out].op.reduce_axis
-
-    ci_o, ci_i = cfg["tile_ci"].apply(s, conv_out, ci)
-    re_axes = cfg["reorder_0"].apply(
-        s, conv_out, [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i]
-    )
-
-    # Use microkernel
-    kfactor = cfg["tile_ci"].size[1]
-    if kfactor % 8 == 0:
-        pc = _intrin_popcount(VC, kfactor, KB, IB, unipolar)
-        s[conv_out].tensorize(kb, pc)
-
-    n, h, w, co = s[last].op.axis
-    co, vc = cfg["tile_co"].apply(s, last, co)
-    oh, vh = cfg["tile_oh"].apply(s, last, h)
-    ow, vw = cfg["tile_ow"].apply(s, last, w)
-    s[last].reorder(n, oh, ow, co, vh, vw, vc)
-    s[last].vectorize(vc)
-    if last != output:
-        s[output].compute_inline()
-
-    s[conv_out].compute_at(s[last], co)
-    s[last].parallel(oh)
-    return s
-
-
-@autotvm.register_topi_schedule("bitserial_conv2d_nhwc.arm_cpu")
-def schedule_bitserial_conv2d_nhwc(cfg, outs):
-    """Arm cpu schedule for bitserial conv2d"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "spatial_bitserial_conv_nhwc" in op.tag:
-            output = op.output(0)
-            conv_out = op.input_tensors[0]
-            kernel_vec = conv_out.op.input_tensors[0]
-            data_vec = conv_out.op.input_tensors[1]
-            data_q = data_vec.op.input_tensors[0]
-            data = data_q.op.input_tensors[0]
-            data_pad = None
-            if isinstance(data_q.op, te.tensor.ComputeOp) and "pad" in data_q.op.tag:
-                data_pad = data_q
-                data_q = data
-                data = data.op.input_tensors[0]
-            unipolar = "unipolar" in conv_out.op.tag
-            _schedule_spatial_conv2d_nhwc(
-                cfg, s, data_pad, data_vec, kernel_vec, conv_out, output, outs[0], unipolar
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@bitserial_conv2d_legalize.register("arm_cpu")
-def _bitserial_conv2d_legalize(attrs, inputs, arg_types):
-    """Legalizes Bitserial Conv2D op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-
-    # Fix different kernel layouts where possible.
-    if attrs["data_layout"] == "NHWC":
-        data, kernel = inputs
-        if len(kernel.data.shape) == 4:
-            # HWIO layout is expected for NHWC input.
-            if attrs["kernel_layout"] == "HWOI":
-                # Handle HWOI layout. This is common in TF depthwise conv2d graph.
-                kernel = relay.transpose(kernel, axes=(0, 1, 3, 2))
-            elif attrs["kernel_layout"] == "OIHW":
-                kernel = relay.transpose(kernel, axes=(2, 3, 1, 0))
-            ## Set new attrs for the tranposed conv.
-            new_attrs = {k: attrs[k] for k in attrs.keys()}
-            new_attrs["kernel_layout"] = "HWIO"
-
-            conv = relay.nn.bitserial_conv2d(data, kernel, **new_attrs)
-            return conv
-    return None
diff --git a/python/tvm/topi/arm_cpu/bitserial_dense.py b/python/tvm/topi/arm_cpu/bitserial_dense.py
deleted file mode 100644
index a9ce846cf163..000000000000
--- a/python/tvm/topi/arm_cpu/bitserial_dense.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, invalid-name, too-many-locals, too-many-arguments, condition-evals-to-constant
-"""Schedule for bitserial dense operator."""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.topi.utils import get_const_tuple
-from .. import tag
-from .bitserial_conv2d import _intrin_popcount
-from ..nn.pad import pad
-from ..nn.bitserial_util import bitpack, binary_op_multiplier
-
-
-@autotvm.register_topi_compute("bitserial_dense.arm_cpu")
-def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_dtype, unipolar):
-    """The default implementation of bitserial dense in topi.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        2-D with shape [batch, in_dim]
-
-    weight : tvm.te.Tensor
-        2-D with shape [out_dim, in_dim]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [batch, out_dim]
-    """
-    data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
-    if len(weight.shape) == 2:
-        weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
-    else:
-        weight_packed = weight
-
-    batch, DB, in_dim = get_const_tuple(data_packed.shape)
-    out_dim, WB, in_dim = get_const_tuple(weight_packed.shape)
-
-    # Pad Inputs so that microkernel can be used
-    # out_dim and in_dim need to be multiples of 8
-    if out_dim % 8 != 0:
-        out_dim_pad = out_dim % 8
-        data_packed = pad(data_packed, [0, 0, 0], [out_dim_pad, 0, 0], name="PaddedInput")
-        out_dim += out_dim_pad
-
-    ######## Search space
-
-    x, y = cfg.axis(batch), cfg.axis(out_dim)
-    db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(in_dim)
-
-    ko, ki = cfg.define_split(
-        "tile_k", k, num_outputs=2, filter=lambda xx: xx.size[-1] == 8 or xx.size[-1] == 16
-    )
-    xo, xi = cfg.define_split("tile_x", x, num_outputs=2)
-    yo, yi = cfg.define_split("tile_y", y, num_outputs=2, filter=lambda xx: xx.size[-1] == 8)
-
-    cfg.define_reorder(
-        "reorder_0",
-        [yo, xo, ko, xi, wb, db, yi, ki],
-        policy="candidate",
-        candidate=[
-            [yo, xo, ko, xi, wb, db, yi, ki],
-            [yo, xo, xi, ko, wb, db, yi, ki],
-            [yo, xo, ko, xi, wb, db, yi, ki],
-        ],
-    )
-
-    ###### Compute rule
-    VY = cfg["tile_y"].size[-1]
-    VK = cfg["tile_k"].size[-1]
-
-    wvshape = (out_dim // VY, in_dim // VK, WB, VY, VK)
-    oshape = (batch, out_dim)
-
-    k = te.reduce_axis((0, in_dim), name="k")
-    db = te.reduce_axis((0, DB), name="db")
-    wb = te.reduce_axis((0, WB), name="wb")
-
-    # Tile data and weights
-    weight_vec = te.compute(
-        wvshape,
-        lambda yo, ko, wb, vy, vk: weight_packed[yo * VY + vy][wb][ko * VK + vk],
-        name="weight_vec",
-    )
-    matmul_unipolar = te.compute(
-        oshape,
-        lambda x, y: te.sum(
-            (
-                tvm.tir.popcount(
-                    weight_vec[y // VY, k // VK, wb, y % VY, k % VK].astype(out_dtype)
-                    & data_packed[x, db, k].astype(out_dtype)
-                )
-                - tvm.tir.popcount(
-                    ~weight_vec[y // VY, k // VK, wb, y % VY, k % VK].astype(out_dtype)
-                    & data_packed[x, db, k].astype(out_dtype)
-                )
-            )
-            << (wb + db).astype(out_dtype),
-            axis=[wb, db, k],
-        ),
-        tag="bitserial_dense_unipolar",
-    )
-
-    matmul = te.compute(
-        oshape,
-        lambda x, y: te.sum(
-            tvm.tir.popcount(
-                weight_vec[y // VY, k // VK, wb, y % VY, k % VK].astype(out_dtype)
-                & data_packed[x, db, k].astype(out_dtype)
-            )
-            << (wb + db).astype(out_dtype),
-            axis=[wb, db, k],
-        ),
-        tag="bitserial_dense",
-    )
-
-    cfg.add_flop(batch * out_dim * in_dim * binary_op_multiplier(pack_dtype))
-
-    if unipolar:
-        return matmul_unipolar
-    return matmul
-
-
-@autotvm.register_topi_schedule("bitserial_dense.arm_cpu")
-def schedule_bitserial_dense(cfg, outs):
-    """Schedule for binary_dense.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of bitserial dense operator.
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for bitserial_dense.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(cfg, s, data_vec, weight_vec, output, unipolar):
-
-        z, k, _, y, x = s[weight_vec].op.axis
-        s[weight_vec].parallel(z)
-        s[weight_vec].vectorize(x)
-
-        x, y = s[output].op.axis
-        wb, db, k = s[output].op.reduce_axis
-        _, DB, _ = get_const_tuple(data_vec.shape)
-        _, _, WB, _, _ = get_const_tuple(weight_vec.shape)
-
-        yo, yi = cfg["tile_y"].apply(s, output, y)
-        xo, xi = cfg["tile_x"].apply(s, output, x)
-        ko, ki = cfg["tile_k"].apply(s, output, k)
-
-        cfg["reorder_0"].apply(s, output, [yo, xo, ko, xi, wb, db, yi, ki])
-
-        fused = s[output].fuse(xo, yo)
-        s[output].parallel(fused)
-
-        nfactor = cfg["tile_y"].size[-1]
-        kfactor = cfg["tile_k"].size[-1]
-        if nfactor % 8 == 0:
-            pc = _intrin_popcount(nfactor, kfactor, WB, DB, unipolar)
-            s[output].tensorize(wb, pc)
-
-        return s
-
-    def traverse(op):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag) or "elemwise" in op.tag:
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-
-        elif op.tag == "bitserial_dense" or "bitserial_dense_unipolar":
-            output = op.output(0)
-            weight_vec = op.input_tensors[0]
-
-            data_vec = op.input_tensors[1]
-            data = data_vec.op.input_tensors[0]
-            if "QuantizeInput" in data.op.name:
-                data = data.op.input_tensors[0]
-            unipolar = output.op.tag == "bitserial_dense_unipolar"
-            _schedule(cfg, s, data_vec, weight_vec, output, unipolar)
-        else:
-            raise RuntimeError(f"Unsupported operator: {op.tag}")
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/arm_cpu/conv1d.py b/python/tvm/topi/arm_cpu/conv1d.py
deleted file mode 100644
index 54a6968777e7..000000000000
--- a/python/tvm/topi/arm_cpu/conv1d.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel
-"""Conv1D schedule for ARM CPU"""
-from __future__ import absolute_import as _abs
-
-from tvm import autotvm
-
-from .mprofile.dsp.conv1d import (
-    conv1d_nwc_dsp_compute,
-    conv1d_nwc_dsp_schedule,
-)
-
-
-@autotvm.register_topi_compute("conv1d_nwc_dsp.arm_cpu")
-def conv1d_nwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv1d with v7e-m DSP instructions."""
-    return conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv1d_nwc_dsp.arm_cpu")
-def schedule_conv1d_nwc_dsp(cfg, outs):
-    return conv1d_nwc_dsp_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py
deleted file mode 100644
index b7327d5b52e8..000000000000
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ /dev/null
@@ -1,956 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel
-"""Conv2D schedule for ARM CPU"""
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.script import tir as T
-import tvm.contrib.nnpack
-from tvm.tir.schedule.analysis import has_block
-
-from ..utils import traverse_inline, get_const_tuple
-from .. import nn
-from ..nn.utils import get_const_int, get_pad_tuple
-from ..nn.winograd_util import winograd_transform_matrices
-from .arm_utils import get_tiling_A, get_tiling_B_transformed
-from .conv2d_spatial_pack import (
-    conv2d_spatial_pack_nchw,
-    conv2d_spatial_pack_nhwc,
-    schedule_conv2d_spatial_pack_nchw,
-    schedule_conv2d_spatial_pack_nhwc,
-)
-from .conv2d_gemm import (
-    compute_conv2d_gemm_without_weight_transform,
-    schedule_conv2d_gemm_interleaved,
-    schedule_conv2d_gemm_native,
-)
-from .mprofile.dsp.conv2d import conv2d_nhwc_dsp_compute, conv2d_nhwc_dsp_schedule
-
-
-@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
-def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with NCHW layout"""
-    return conv2d_spatial_pack_nchw(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.arm_cpu")
-def schedule_conv2d_nchw_spatial_pack(cfg, outs):
-    """Create schedule for conv2d_nchw"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        # schedule conv2d
-        if "spatial_conv2d_output" in op.tag:
-            output = op.output(0)
-            conv = op.input_tensors[0]
-
-            data_vec = conv.op.input_tensors[0]
-            data_pad = data_vec.op.input_tensors[0]
-            s[data_pad].compute_inline()
-
-            kernel_vec = conv.op.input_tensors[1]
-            if kernel_vec.op.name == "kernel_vec":
-                kernel = kernel_vec.op.input_tensors[0]
-            else:
-                kernel = kernel_vec
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_spatial_pack.arm_cpu")
-def conv2d_nhwc_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with NHWC layout"""
-    return conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_spatial_pack.arm_cpu")
-def schedule_conv2d_nhwc_spatial_pack(cfg, outs):
-    """Create schedule for conv2d_nhwc"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "spatial_conv_output_NHWC" in op.tag:
-            schedule_conv2d_spatial_pack_nhwc(cfg, s, op, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd.arm_cpu")
-def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d_nchw layout using Winograd with weight transform"""
-    tile_size = 4
-    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size)
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd.arm_cpu")
-def schedule_conv2d_nchw_winograd(cfg, outs):
-    """Create schedule for conv2d_nchw_winograd"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "winograd_conv2d_output" in op.tag:
-            output = op.output(0)
-            _schedule_winograd(cfg, s, output, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size):
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    if isinstance(N, tvm.tir.Any):
-        N = tvm.te.size_var("n")
-    if not isinstance(IH, int) or not isinstance(IW, int):
-        raise RuntimeError("ARM winograd conv2d doesn't support dynamic input height or width.")
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    if len(kernel.shape) == 4:
-        if dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
-        pre_computed = False
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-    else:
-        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
-        pre_computed = True
-        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
-        CO *= VC
-        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
-
-    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
-    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
-
-    K = CO
-    C = CI
-
-    H = (IH + pt + pb - 3) // HSTR + 1
-    W = (IW + pl + pr - 3) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-
-    # TODO(@kevinthesun): Support tuning/optimization for dynamic shape.
-    tile_p = P if isinstance(N, int) else nH * nW
-    cfg.define_split("tile_p", cfg.axis(tile_p), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
-    cfg.define_split("tile_k", cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
-    VP = cfg["tile_p"].size[-1]
-    VK = cfg["tile_k"].size[-1]
-
-    # pack input tile
-    input_tile = te.compute(
-        (C, idxd(P, VP), alpha, alpha, VP),
-        lambda c, b, eps, nu, bb: data_pad[
-            idxd(b * VP + bb, nH * nW),
-            c,
-            idxm(idxd(b * VP + bb, nW), nH) * m + eps,
-            idxm(b * VP + bb, nW) * m + nu,
-        ],
-        name="d",
-    )
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        VC = cfg["tile_k"].size[-1]
-        kvshape = (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC)
-        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
-    else:
-        # transform kernel
-        if pre_computed:
-            U = kernel
-        else:
-            r_kh = te.reduce_axis((0, KH), "r_kh")
-            r_kw = te.reduce_axis((0, KW), "r_kw")
-            U = te.compute(
-                (alpha, alpha, idxd(K, VK), C, VK),
-                lambda eps, nu, k, c, kk: te.sum(
-                    kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype)
-                    * G[eps][r_kh]
-                    * G[nu][r_kw],
-                    axis=[r_kh, r_kw],
-                ),
-                name="U",
-            )
-
-    # transform image
-    r_eps = te.reduce_axis((0, alpha), "r_eps")
-    r_nu = te.reduce_axis((0, alpha), "r_nu")
-    V = te.compute(
-        (alpha, alpha, idxd(P, VP), C, VP),
-        lambda eps, nu, b, c, bb: te.sum(
-            input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu],
-            axis=[r_eps, r_nu],
-        ),
-        name="V",
-    )
-
-    # batch gemm
-    c = te.reduce_axis((0, C), name="c")
-    M = te.compute(
-        (alpha, alpha, K, P),
-        lambda eps, nu, k, b: te.sum(
-            U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * V[eps][nu][idxd(b, VP)][c][idxm(b, VP)],
-            axis=c,
-        ),
-        name="M",
-    )
-
-    # inverse transform
-    r_eps = te.reduce_axis((0, alpha), "r_eps")
-    r_nu = te.reduce_axis((0, alpha), "r_nu")
-    Y = te.compute(
-        (K, P, m, m),
-        lambda k, b, vh, vw: te.sum(
-            M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu]
-        ),
-        name="Y",
-    )
-
-    # unpack output
-    output = te.compute(
-        (N, K, H, W),
-        lambda n, k, h, w: Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), idxm(h, m), idxm(w, m)],
-        name="output",
-        tag="winograd_conv2d_output",
-    )
-
-    # we have to manually assign effective GFLOP for winograd
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * K * H * W * KH * KW * C)
-    return output
-
-
-def _schedule_winograd(cfg, s, output, last):
-    Y = output.op.input_tensors[0]
-    M, A = Y.op.input_tensors
-    U, V = M.op.input_tensors
-    d, B = V.op.input_tensors
-    data_pad = d.op.input_tensors[0]
-
-    # padding
-    s[data_pad].compute_inline()
-
-    # pack input tiles
-    s[d].compute_inline()
-
-    # transform kernel
-    if isinstance(U.op, tvm.te.ComputeOp):
-        kernel, G = U.op.input_tensors
-        s[G].compute_inline()
-        (eps, nu, k, c, kk) = s[U].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel transformation will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[U].pragma(eps, "debug_skip_region")
-        else:
-            r_kh, r_kw = s[U].op.reduce_axis
-            s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
-            for axis in [eps, nu, r_kh, r_kw]:
-                s[U].unroll(axis)
-            s[U].vectorize(kk)
-            s[U].parallel(k)
-
-        if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-            s[kernel].compute_inline()
-
-    # transform image
-    DD = s.cache_read(d, "global", [V])
-    s[B].compute_inline()
-    eps, nu, b, c, bb = s[V].op.axis
-    r_eps, r_nu = s[V].op.reduce_axis
-    s[V].reorder(b, c, eps, nu, r_eps, r_nu, bb)
-    for axis in [eps, nu, r_eps, r_nu]:
-        s[V].unroll(axis)
-    s[DD].compute_at(s[V], c)
-    s[V].vectorize(bb)
-    s[V].parallel(b)
-
-    # batch gemm
-    eps, nu, k, b = s[M].op.axis
-    c = s[M].op.reduce_axis[0]
-    cfg.define_split("tile_c", c, num_outputs=2, filter=lambda x: x.size[-1] <= 16)
-    co, ci = cfg["tile_c"].apply(s, M, c)
-    xo, xi = cfg["tile_p"].apply(s, M, b)
-    s[M].reorder(eps, nu, xo, co, k, ci, xi)
-    cfg.define_annotate("ann_reduce", [ci], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [k, xi], policy="try_unroll_vec")
-    cfg["ann_reduce"].apply(s, M, [ci], axis_lens=[cfg["tile_c"].size[-1]], max_unroll=16, cfg=cfg)
-    cfg["ann_spatial"].apply(s, M, [k, xi])
-
-    # inverse transform
-    s[A].compute_inline()
-    k, b, vh, vw = s[Y].op.axis
-    r_eps, r_nu = s[Y].op.reduce_axis
-    for axis in [vh, vw, r_eps, r_nu]:
-        s[Y].unroll(axis)
-
-    # output
-    n, co, h, w = s[last].op.axis
-    co, coi = cfg["tile_k"].apply(s, last, co)
-    p = s[last].fuse(n, co)
-    s[M].compute_at(s[last], p)
-    s[last].parallel(p)
-
-    MM = s.cache_read(M, "global", [Y])
-    m = get_const_int(V.shape[0]) + 1 - 3
-    ho, wo, hi, wi = s[last].tile(h, w, m, m)
-    s[Y].compute_at(s[last], wo)
-    s[MM].compute_at(s[last], wo)
-
-    if output != last:
-        s[output].compute_inline()
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack.arm_cpu")
-def conv2d_nchw_winograd_nnpack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d_nchw using nnpack Winograd implementation"""
-    dtype = data.dtype
-    if dtype == "float32":
-        return _conv2d_arm_cpu_winograd_nnpack(
-            cfg,
-            data,
-            kernel,
-            strides,
-            padding,
-            dilation,
-            out_dtype,
-            tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8,
-        )
-    elif dtype == "float16":
-        return _conv2d_arm_cpu_winograd_nnpack(
-            cfg,
-            data,
-            kernel,
-            strides,
-            padding,
-            dilation,
-            out_dtype,
-            tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
-        )
-    else:
-        raise ValueError(f"Unsupported data type {dtype} for conv2d winograd nnpack")
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack.arm_cpu")
-def schedule_conv2d_nchw_winograd_nnpack(cfg, outs):
-    """Create schedule for conv2d_nchw_winograd_nnpack"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "winograd_nnpack_conv2d_output" in op.tag:
-            output = op.output(0)
-            _schedule_winograd_nnpack(cfg, s, output, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _conv2d_arm_cpu_winograd_nnpack(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, convolution_algorithm
-):
-    """TOPI compute callback. Use winograd NNPACK template"""
-    N, CI, IH, IW = get_const_tuple(data.shape)
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    assert (dilation_h, dilation_w) == (1, 1)
-    assert len(kernel.shape) == 4
-    CO, _, KH, KW = get_const_tuple(kernel.shape)
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
-
-    assert (
-        KH == 3
-        and KW == 3
-        and pt == 1
-        and pb == 1
-        and pl == 1
-        and pr == 1
-        and HSTR == 1
-        and WSTR == 1
-    )
-    H = (IH + pt + pb - 3) // HSTR + 1
-    W = (IW + pl + pr - 3) // WSTR + 1
-
-    cfg.define_knob("winograd_nnpack_algorithm", [convolution_algorithm])
-
-    assert N == 1
-    with tvm.te.tag_scope("winograd_nnpack_conv2d_weight_transform"):
-        transformed_kernel = tvm.contrib.nnpack.convolution_inference_weight_transform(
-            kernel, algorithm=cfg["winograd_nnpack_algorithm"].val
-        )
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            transformed_kernel = te.compute(transformed_kernel.shape, lambda *args: 0.0)
-
-    with tvm.te.tag_scope("winograd_nnpack_conv2d_output"):
-        output = tvm.contrib.nnpack.convolution_inference_without_weight_transform(
-            data,
-            transformed_kernel,
-            bias=None,
-            padding=[pt, pb, pl, pr],
-            stride=[HSTR, WSTR],
-            algorithm=cfg["winograd_nnpack_algorithm"].val,
-        )
-
-    # we have to manually assign effective GFLOP for winograd
-    cfg.add_flop(2 * N * CI * H * W * KH * KW * CO)
-    return output
-
-
-def _schedule_winograd_nnpack(cfg, s, output, last):
-    # Could have bias.
-
-    (X, TK) = output.op.input_tensors[:2]
-
-    # transform kernel
-    assert isinstance(TK.op, (te.tensor.ComputeOp, te.tensor.ExternOp, te.tensor.PlaceholderOp))
-    if autotvm.GLOBAL_SCOPE.in_tuning and isinstance(TK.op, te.tensor.ComputeOp):
-        # kernel transformation will be pre-computed during compilation, so we skip
-        # this part to make tuning records correct
-        s[TK].pragma(s[TK].op.axis[0], "debug_skip_region")
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu")
-def conv2d_nchw_winograd_nnpack_without_weight_transform(
-    cfg, data, transformed_kernel, bias, strides, padding, dilation, out_dtype
-):
-    """Compute conv2d_nchw using NNPack winograd without weight transform"""
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    assert (dilation_h, dilation_w) == (1, 1)
-    assert len(transformed_kernel.shape) == 4
-    CO, _, _, _ = get_const_tuple(transformed_kernel.shape)
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    KH, KW = 3, 3
-    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
-
-    assert (
-        KH == 3
-        and KW == 3
-        and pt == 1
-        and pb == 1
-        and pl == 1
-        and pr == 1
-        and HSTR == 1
-        and WSTR == 1
-    )
-    H = (IH + pt + pb - 3) // HSTR + 1
-    W = (IW + pl + pr - 3) // WSTR + 1
-
-    assert N == 1
-    with tvm.te.tag_scope("winograd_nnpack_conv2d_output"):
-        output = tvm.contrib.nnpack.convolution_inference_without_weight_transform(
-            data=data,
-            transformed_kernel=transformed_kernel,
-            bias=bias,
-            padding=[pt, pb, pl, pr],
-            stride=[HSTR, WSTR],
-            algorithm=cfg["winograd_nnpack_algorithm"].val,
-        )
-
-    # we have to manually assign effective GFLOP for winograd
-    cfg.add_flop(2 * N * CI * H * W * KH * KW * CO)
-    return output
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu")
-def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "winograd_nnpack_conv2d_output" in op.tag:
-            output = op.output(0)
-            _schedule_winograd_nnpack(cfg, s, output, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_dsp.arm_cpu")
-def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d_nhwc with v7e-m DSP instructions."""
-    return conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_dsp.arm_cpu")
-def schedule_conv2d_nhwc_dsp(cfg, outs):
-    """Create schedule for conv2d_nhwc_dsp"""
-    return conv2d_nhwc_dsp_schedule(cfg, outs)
-
-
-def compute_conv2d_NHWC(
-    cfg,
-    data,
-    kernel,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    interleave_A,
-    use_scalable_vectors=False,
-    use_sme=False,
-):
-    """Compute definition for conv2d NHWC"""
-    N, IH, IW, IC = get_const_tuple(data.shape)
-    KH, KW, _, OC = get_const_tuple(kernel.shape)
-    tile_N, tile_K = get_tiling_B_transformed(
-        interleave_A, data.dtype, use_scalable_vectors, use_sme
-    )
-
-    kernel = nn.conv2d_gemm_weight_transform(kernel, tile_N, tile_K, use_scalable_vectors, use_sme)
-    return compute_conv2d_gemm_without_weight_transform(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        (KH, KW),
-        OC,
-        interleave_A,
-        use_scalable_vectors,
-        use_sme,
-    )
-
-
-def compute_conv2d_NHWC_without_transform(
-    cfg,
-    data,
-    B,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    kernel_size=None,
-    output_channels=None,
-    interleave_A=False,
-):
-    """Compute conv2d NHWC without weight transform"""
-    return compute_conv2d_gemm_without_weight_transform(
-        cfg,
-        data,
-        B,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        kernel_size,
-        output_channels,
-        interleave_A,
-    )
-
-
-def schedule_conv2d_NHWC(cfg, outs, interleave_A):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    # Vectorize the output and then inline all the rest
-    out = outs[0]
-    n, h, w, c = out.op.axis
-    n_h_fused = s[out].fuse(n, h)
-    _, inner = s[out].split(c, 4)
-    s[out].vectorize(inner)
-    s[out].parallel(n_h_fused)
-
-    def _callback(op):
-        """Traverse operators from computation graph"""
-        if op.name == "conv2d_gemm_output":
-            conv_out = op.output(0)
-            if interleave_A:
-                schedule_conv2d_gemm_interleaved(cfg, s, conv_out, out)
-            else:
-                schedule_conv2d_gemm_native(cfg, s, conv_out, out)
-            if out != conv_out:
-                s[conv_out].compute_at(s[out], inner)
-            else:
-                C = conv_out.op.input_tensors[0]
-                if interleave_A:
-                    s[C].compute_at(s[out], inner)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_hybrid.arm_cpu")
-def compute_conv2d_NHWC_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Interface for hybrid compute_conv2d_NHWC_hybrid"""
-    return compute_conv2d_NHWC(cfg, data, kernel, strides, padding, dilation, out_dtype, False)
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_hybrid_without_transform.arm_cpu")
-def compute_conv2d_NHWC_hybrid_without_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels
-):
-    """Interface for hybrid compute_conv2d_NHWC_hybrid_without_transform"""
-    return compute_conv2d_NHWC_without_transform(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        kernel_size,
-        output_channels,
-        False,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_hybrid.arm_cpu")
-def schedule_conv2d_NHWC_hybrid(cfg, outs):
-    """Interface for hybrid schedule_conv2d_NHWC_hybrid"""
-    return schedule_conv2d_NHWC(cfg, outs, False)
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_hybrid_without_transform.arm_cpu")
-def schedule_conv2d_NHWC_hybrid_without_transform(cfg, outs):
-    """Interface for hybrid schedule_conv2d_NHWC_hybrid"""
-    return schedule_conv2d_NHWC(cfg, outs, False)
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_hybrid_SVE.arm_cpu")
-def compute_conv2d_NHWC_hybrid_SVE(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Interface for hybrid compute_conv2d_NHWC_hybrid_SVE"""
-    return compute_conv2d_NHWC(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, False, True
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_hybrid_SVE.arm_cpu")
-def schedule_conv2d_NHWC_hybrid_SVE(cfg, outs):
-    """Interface for hybrid schedule_conv2d_NHWC_hybrid_SVE"""
-    return schedule_conv2d_NHWC(cfg, outs, False)
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_hybrid_SME.arm_cpu")
-def compute_conv2d_NHWC_hybrid_SME(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Interface for hybrid compute_conv2d_NHWC_hybrid_SME"""
-    return compute_conv2d_NHWC(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        False,
-        True,
-        True,
-    )
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_hybrid_SME_transposed_B.arm_cpu")
-def compute_conv2d_NHWC_SME_transposed_B(
-    cfg,
-    data,
-    kernel,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    kernel_size,
-    output_channels,
-):
-    """Compute conv2d NHWC hybrid SME transposed B"""
-    N, K = get_const_tuple(kernel.shape)
-    tile_N, tile_K = get_tiling_B_transformed(False, data.dtype, True, True)
-    pad_N, pad_K = tvm.topi.arm_cpu.arm_utils.get_conv2d_weights_padding(N, K, tile_N, tile_K)
-
-    kernel = tvm.topi.nn.pad(
-        kernel, pad_before=(0, 0), pad_after=(pad_N, pad_K), name="weight_padding"
-    )
-
-    return compute_conv2d_gemm_without_weight_transform(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        kernel_size,
-        output_channels,
-        interleave_A=False,
-        use_scalable_vectors=True,
-        use_sme=True,
-    )
-
-
-def schedule_conv2d_NHWC_hybrid_TIR(sch: tvm.tir.Schedule):
-    """
-    Perform TIR scheduling for conv2d NHWC.
-    """
-    # Get ordered buffer list
-    primfunc = sch.mod["main"]
-    buffer_names = primfunc.params
-    buffer_list = [primfunc.buffer_map[buf] for buf in buffer_names]
-    in_dtype = buffer_list[0].dtype
-    out_dtype = "float32"
-
-    # Determine PrimFunc blocks
-    block_list = [
-        "data_pad",
-        "data_im2col",
-        "T_reshape",
-        "A_padded_K",
-        "A_padded_M",
-        "weight_flatten",
-        "weight_padding",
-        "weight_transpose",
-        "C",
-        "conv2d_gemm_output",
-    ]
-    func_blocks = {}
-    for block in block_list:
-        func_blocks[block] = sch.get_block(block) if has_block(sch, block) else None
-
-    gemm_block = func_blocks["C"]
-    b, m, n, k = sch.get_loops(gemm_block)
-
-    # Get tiling information
-    use_scalable_vectors = sch.get(func_blocks["conv2d_gemm_output"]).annotations[
-        "use_scalable_vectors"
-    ]
-    use_sme = sch.get(func_blocks["conv2d_gemm_output"]).annotations["use_sme"]
-    M_padded = sch.get(m).extent
-    N_padded = sch.get(n).extent
-    K_padded = sch.get(k).extent
-    tile_M, tile_K = get_tiling_A(False, in_dtype, use_sme)
-    tile_N, _ = get_tiling_B_transformed(False, in_dtype, use_scalable_vectors, use_sme)
-    tile_M = T.cast(tile_M, M_padded.dtype)
-    tile_N = T.cast(tile_N, N_padded.dtype)
-    tile_K = T.cast(tile_K, K_padded.dtype)
-
-    # GeMM
-    # Compute each tile_M x tile_N tile
-    # By summing up K outer products
-    if use_sme:
-        # pylint: disable=import-outside-toplevel
-        from tvm.topi.arm_cpu.pstate_attributes import SMEAttributes
-        from tvm.tir.tensor_intrin.arm_cpu import (
-            ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA,
-            ARM_SME_INIT,
-            get_sme_gemm_interleaved_mopa_2svlx2svl_intrin,
-            get_transpose_interleave_intrin_name,
-        )
-
-        # Interleave the padded im2col matrix utilizing the matrix tile
-        interleave_t_A_block = sch.cache_read(gemm_block, 0, "global")
-        sch.transform_layout(interleave_t_A_block, ("write", 0), lambda b, m, k: (b, k, m))
-        b, m, k = sch.get_loops(interleave_t_A_block)
-        mo, mi = sch.split(m, factors=(None, tile_M), disable_predication=True)
-        ko, ki = sch.split(k, factors=(None, tile_K), disable_predication=True)
-        sch.parallel(b)
-        sch.reorder(b, ko, mo, ki, mi)
-        sch.tensorize(
-            ki, get_transpose_interleave_intrin_name(in_dtype, out_dtype, M_padded, K_padded)
-        )
-
-        # Interleave the padded weights matrix utilizing the matrix tile
-        if in_dtype == "float16":
-            interleave_b_block = sch.cache_read(gemm_block, 1, "global")
-            sch.transform_layout(interleave_b_block, ("write", 0), lambda n, k: (k, n))
-            n, k = sch.get_loops(interleave_b_block)
-            ko, ki = sch.split(k, factors=(None, tile_K), disable_predication=True)
-            no, ni = sch.split(n, factors=(None, tile_N), disable_predication=True)
-            sch.reorder(ko, no, ki, ni)
-            sch.tensorize(
-                ki, get_transpose_interleave_intrin_name(in_dtype, out_dtype, M_padded, K_padded)
-            )
-
-        # Split and reorder the loops of the GeMM for tensorization
-        b, m, n, k = sch.get_loops(gemm_block)
-        tile_M, _ = get_tiling_A(False, out_dtype, True)
-        tile_N, _ = get_tiling_B_transformed(False, out_dtype, True, True)
-        tile_M = T.cast(tile_M, M_padded.dtype)
-        tile_N = T.cast(tile_N, N_padded.dtype)
-        mo, mi = sch.split(m, factors=(None, tile_M), disable_predication=True)
-        no, ni = sch.split(n, factors=(None, tile_N), disable_predication=True)
-        sch.parallel(b)
-        sch.reorder(b, mo, no, mi, ni, k)
-
-        # Tensorize the GeMM initialization
-        init_block = sch.decompose_reduction(gemm_block, mi)
-        sch.tensorize(sch.get_loops(init_block)[-2], ARM_SME_INIT)
-
-        # Tensorize the GeMM update
-        sme_gemm_interleaved_intrin_name = (
-            ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA + f"_{M_padded}_{K_padded}_{in_dtype}"
-        )
-        tvm.tir.TensorIntrin.register(
-            sme_gemm_interleaved_intrin_name,
-            *get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(M_padded, K_padded, in_dtype),
-            override=True,
-        )
-        sch.tensorize(mi, sme_gemm_interleaved_intrin_name)
-
-        # Add pstate annotations
-        root_block = sch.get_block("root")
-        sch.annotate(
-            root_block, SMEAttributes.STREAMING_MODE, SMEAttributes.StreamingModeValues.ENABLED
-        )
-        sch.annotate(root_block, SMEAttributes.ZA_STORAGE, SMEAttributes.ZAStorageValues.NEW)
-    elif use_scalable_vectors:
-        mo, mi = sch.split(m, [None, tile_M])
-        no, ni = sch.split(n, [None, tile_N], disable_predication=True)
-        ko, ki = sch.split(k, [None, tile_K])
-        b_mo_fused = sch.fuse(b, mo)
-        sch.parallel(b_mo_fused)
-        sch.reorder(
-            b_mo_fused,
-            no,
-            ko,
-            ki,
-            mi,
-            ni,
-        )
-        sch.vectorize(ni)
-        sch.unroll(mi)
-
-        # GeMM - Init
-        # Initialise an entire GeMM tile at once
-        sch.decompose_reduction(gemm_block, ko)
-    else:
-        mo, mi = sch.split(m, [None, tile_M])
-        no, ni = sch.split(n, [None, tile_N])
-        ko, ki = sch.split(k, [None, tile_K])
-        ni_outer, ni_inner = sch.split(ni, [4, None])
-        b_mo_fused = sch.fuse(b, mo)
-        sch.parallel(b_mo_fused)
-        sch.reorder(
-            b_mo_fused,
-            no,
-            ko,
-            ki,
-            ni_outer,
-            mi,
-            ni_inner,
-        )
-        sch.vectorize(ni_inner)
-        sch.unroll(mi)
-        sch.unroll(ni_outer)
-
-        # GeMM - Init
-        # Initialise an entire GeMM tile at once
-        sch.decompose_reduction(gemm_block, ko)
-
-    # Input padding
-    if func_blocks["data_pad"]:
-        input_padding_block = func_blocks["data_pad"]
-        b, h, w, ic = sch.get_loops(input_padding_block)
-        b_h_fused = sch.fuse(b, h)
-        sch.parallel(b_h_fused)
-
-    # Im2col + padding to tile size
-    # Computed outside GeMM
-    if func_blocks["data_im2col"]:
-        im2col_block = func_blocks["data_im2col"]
-        b1, m1, k1 = sch.get_loops(im2col_block)
-        b_m_fused_1 = sch.fuse(b1, m1)
-        if func_blocks["A_padded_K"]:
-            im2col_pad_K_block = func_blocks["A_padded_K"]
-            b2, m2, k2 = sch.get_loops(im2col_pad_K_block)
-            b_m_fused_2 = sch.fuse(b2, m2)
-            sch.parallel(b_m_fused_2)
-            sch.compute_at(im2col_block, b_m_fused_2)
-            _, k1 = sch.get_loops(sch.get_block("data_im2col"))
-        elif func_blocks["A_padded_M"]:
-            im2col_pad_M_block = func_blocks["A_padded_M"]
-            b2, m2, k2 = sch.get_loops(im2col_pad_M_block)
-            b_m_fused_2 = sch.fuse(b2, m2)
-            sch.parallel(b_m_fused_1)
-            sch.parallel(b_m_fused_2)
-        else:
-            sch.parallel(b_m_fused_1)
-
-        K = sch.get(k1).extent.value
-        if K % 16 == 0:
-            split_factor = 16
-        elif K % 8 == 0:
-            split_factor = 8
-        else:
-            IC = buffer_list[0].shape[3]
-            split_factor = IC
-        k_outer, k_inner = sch.split(k1, [None, split_factor])
-        sch.vectorize(k_inner)
-        sch.unroll(k_outer)
-
-    # Reshape + padding to tile size
-    # Computed inside GeMM
-    elif func_blocks["T_reshape"]:
-        reshape_block = func_blocks["T_reshape"]
-        A_pad_block = func_blocks["A_padded_K"] if func_blocks["A_padded_K"] else None
-        A_pad_block = func_blocks["A_padded_M"] if func_blocks["A_padded_M"] else A_pad_block
-        use_explicit_predication = use_sme and in_dtype == "float32"
-        if not use_explicit_predication:
-            if use_sme:
-                sch.compute_inline(reshape_block)
-            elif A_pad_block:
-                sch.compute_inline(reshape_block)
-                b, m, k = sch.get_loops(A_pad_block)
-                _, k_inner = sch.split(k, [None, tile_N])
-                sch.vectorize(k_inner)
-                sch.compute_at(A_pad_block, mi)
-            else:
-                sch.compute_at(reshape_block, mi)
-
-    # Weight flattening
-    if func_blocks["weight_flatten"]:
-        weight_flatten_block = func_blocks["weight_flatten"]
-        sch.compute_inline(weight_flatten_block)
-
-    # Weight transpose
-    if func_blocks["weight_transpose"] and func_blocks["weight_padding"]:
-        weight_padding_block = func_blocks["weight_padding"]
-        sch.compute_inline(weight_padding_block)
-
-    # Conv2d output block
-    output_block = func_blocks["conv2d_gemm_output"]
-    n, h, w, c = sch.get_loops(output_block)
-    n_h_fused = sch.fuse(n, h)
-    _, inner = sch.split(c, [None, 4])
-    sch.vectorize(inner)
-    sch.parallel(n_h_fused)
-
-    return sch
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
deleted file mode 100644
index 2476cb92b915..000000000000
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ /dev/null
@@ -1,571 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D alter op and legalize functions for arm cpu"""
-
-import logging
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-from tvm.target.target import Target
-
-from ..nn import conv2d_alter_layout, conv2d_legalize
-from ..utils import get_const_tuple
-from ..x86.conv2d import _get_default_config as _get_x86_default_config
-from ..x86.conv2d_int8 import _get_default_config_int8
-from .conv2d_int8 import is_int8_hw_support
-from .arm_utils import get_tiling_B_transformed, get_conv2d_weights_padding
-from ..generic.conv2d import conv2d_alter_int8_common
-from .mprofile.dsp.micro_kernel.common import num_simd_lanes_per_word
-
-logger = logging.getLogger("topi")
-
-
-def transform_weights(inputs, data, kernel, interleave_A):
-    """Transform the weight matrix by tiling, interleaving (and transposing it)
-
-    Parameters
-    ----------
-    inputs : tvm.relay.Expr
-        Grouped input symbols
-    data :
-        Input shape and dtype
-    kernel :
-        Input shape and dtype
-    interleave_A: indicates if we expect matrix A to be interleaved
-
-    Returns
-    ----------
-    new_kernel : tvm.te.placeholder
-                 A placeholder with the new shape
-    new_kernel_expr : tvm.relay.Expr
-                The relay expression of the weights
-    """
-
-    KH, KW, IC, OC = get_const_tuple(kernel.shape)
-    K = KH * KW * IC
-    N = OC
-
-    # Get tiling information for the transformed version of B
-    tile_N, tile_K = get_tiling_B_transformed(interleave_A, data.dtype)
-    pad_N, pad_K = get_conv2d_weights_padding(N, K, tile_N, tile_K)
-
-    N_padded = N + pad_N
-    K_padded = K + pad_K
-    new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], tile_N, tile_K)
-    if data.dtype in ["int8", "uint8"]:
-        new_kernel = te.placeholder(
-            (N_padded // tile_N, K_padded // tile_K, tile_N, tile_K),
-            kernel.dtype,
-        )
-    else:
-        new_kernel = te.placeholder(
-            (N_padded // tile_N, K_padded // tile_K, tile_K, tile_N),
-            kernel.dtype,
-        )
-    return new_kernel, new_kernel_expr
-
-
-@conv2d_alter_layout.register(["arm_cpu"])
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-
-    _, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is None:
-        # The best implementation is not an AutoTVM template,
-        # we then assume it's not necessary to alter this op.
-        return None
-    cfg = dispatch_ctx.query(target, workload)
-
-    topi_tmpl = workload[0]
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    strides = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data, kernel = tinfos
-    out_dtype = out_type.dtype
-
-    # Extract data types
-    data_tensor, kernel_tensor = tinfos
-    data_dtype = data_tensor.dtype
-    kernel_dtype = kernel_tensor.dtype
-
-    idxd = tvm.tir.indexdiv
-
-    if topi_tmpl == "depthwise_conv2d_nhwc_dsp.arm_cpu":
-        assert data_layout == "NHWC" and kernel_layout == "HWOI"
-
-        # We are not able to check if inputs[1] (the kernel) is a constant in the
-        # strategy function, so as a stopgap solution we use an assert here.
-        assert isinstance(
-            inputs[1], relay.Constant
-        ), "depthwise_conv2d_nhwc_dsp.arm_cpu requires kernel be a relay Constant"
-
-        channels = get_const_tuple(data.shape)[3]
-        KH, KW, _, _ = get_const_tuple(kernel.shape)
-        simd_lanes = num_simd_lanes_per_word(data.dtype)
-
-        HWOI_kernel_np = inputs[1].data.numpy()
-        CHWc_kernel_np = np.zeros((channels // simd_lanes, KH, KW, simd_lanes), dtype=kernel.dtype)
-        for i in range(channels // simd_lanes):
-            CHWc_kernel_np[i] = HWOI_kernel_np[:, :, simd_lanes * i : simd_lanes * (i + 1), 0]
-        reshaped_new_kernel = CHWc_kernel_np.reshape((KH, KW, channels, 1))
-
-        # Store the same config for the altered operator (workload)
-        new_data = data
-        new_kernel = te.placeholder((KH, KW, channels, 1), dtype=kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "depthwise_conv2d_nhwc_dsp.arm_cpu",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.conv2d(
-            inputs[0], relay.Constant(tvm.nd.array(reshaped_new_kernel)), **new_attrs
-        )
-
-    if topi_tmpl == "conv2d_NHWC_hybrid.arm_cpu":
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-        KH, KW, _, OC = get_const_tuple(kernel.shape)
-        new_workload_name = "conv2d_NHWC_hybrid_without_transform.arm_cpu"
-        new_kernel, new_kernel_expr = transform_weights(inputs, data, kernel, interleave_A=False)
-        new_workload = autotvm.task.args_to_workload(
-            [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
-            new_workload_name,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_gemm_without_weight_transform(
-            inputs[0], new_kernel_expr, **new_attrs
-        )
-
-    if (
-        topi_tmpl == "conv2d_NHWC_hybrid_SME.arm_cpu"
-        and data_dtype == "float16"
-        and kernel_dtype == "float16"
-        and out_dtype == "float32"
-    ):
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-        KH, KW, IC, OC = get_const_tuple(kernel.shape)
-        K = KH * KW * IC
-        N = OC
-        # The SME schedule for float16->float32 prearranges the two matrices to be multiplied
-        # using the ARM_SME_BLOCK2_2SVLx1SVL_FP16_TRANSPOSE_INTERLEAVE intrinsic which expects
-        # the reduction axis K as the second dimension of the matrix (i.e. shape = (_, K)).
-        # This means that the flattened weights matrix B needs to be transposed to (N, K).
-        transposed_kernel_expr = relay.transpose(inputs[1], axes=[3, 0, 1, 2])
-        transposed_flattened_kernel_expr = relay.reshape(transposed_kernel_expr, newshape=(N, K))
-        new_kernel_expr = transposed_flattened_kernel_expr
-        new_kernel = te.placeholder((N, K), kernel.dtype)
-        new_workload_name = "conv2d_NHWC_hybrid_SME_transposed_B.arm_cpu"
-        new_workload = autotvm.task.args_to_workload(
-            [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
-            new_workload_name,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_gemm_without_weight_transform(
-            inputs[0], new_kernel_expr, **new_attrs
-        )
-
-    # Only microTVM does layout alteration for NHWC layout with real data types
-    if data_layout == "NHWC" and data_dtype not in ["uint8", "int8"]:
-        return None
-
-    if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        VC = cfg["tile_co"].size[-1]
-
-        new_attrs["kernel_layout"] = f"OIHW{VC}o"
-
-        new_data = data
-        new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_spatial_pack.arm_cpu",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu":
-        assert (
-            data.dtype == "int8"
-            and kernel.dtype == "int8"
-            or data.dtype == "uint8"
-            and kernel.dtype == "uint8"
-        )
-
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-
-        data_expr, kernel_expr = inputs
-
-        data_int16 = relay.cast(data_expr, dtype="int16")
-        kernel_int16 = relay.cast(kernel_expr, dtype="int16")
-
-        new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-        new_data = te.placeholder(data.shape, "int16")
-        new_kernel = te.placeholder(kernel.shape, "int16")
-
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nhwc_spatial_pack.arm_cpu",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.conv2d(data_int16, kernel_int16, **new_attrs)
-
-    if topi_tmpl == "conv2d_nchw_winograd.arm_cpu":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        VC = cfg["tile_k"].size[-1]
-        tile_size = 4
-
-        weight_expr = inputs[1]
-        weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform(
-            weight_expr, tile_size=tile_size
-        )
-        weight_expr = relay.reshape(
-            weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI)
-        )
-        weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3])
-
-        new_attrs["tile_size"] = tile_size
-        new_attrs["channels"] = CO
-
-        new_data = data
-        new_kernel = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC), kernel.dtype
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_winograd.arm_cpu",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight_expr, **new_attrs
-        )
-
-    if topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        new_attrs["channels"] = CO
-
-        # pre-compute winograd_nnpack transform
-        # for winograd_nnpack_fp16, the precompute prune pass must run on device,
-        # where float16 is supported
-        weight_dtype = "float32"
-        weight_expr = inputs[1]
-        transformed_weight = relay.nn.contrib_conv2d_winograd_nnpack_weight_transform(
-            weight_expr,
-            convolution_algorithm=cfg["winograd_nnpack_algorithm"].val,
-            out_dtype=weight_dtype,
-        )
-
-        new_data = data
-        new_kernel = te.placeholder((CO, CI, 8, 8), "float32")
-
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, None, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], transformed_weight, **new_attrs
-        )
-
-    if topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, M, KH, KW = get_const_tuple(kernel.shape)
-        VC = cfg["tile_co"].size[-1]
-
-        new_attrs["kernel_layout"] = f"OIHW{cfg['tile_co'].size[-1]}o"
-
-        # Store the same config for the altered operator (workload)
-        new_data = data
-        new_kernel = te.placeholder((idxd(CO, VC), M, KH, KW, VC), dtype=kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "depthwise_conv2d_nchw_spatial_pack.arm_cpu",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_NCHWc.x86":
-        # Converting NCHW to NCHWc.
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        if cfg.is_fallback:
-            _get_x86_default_config(
-                cfg,
-                data_tensor,
-                kernel_tensor,
-                strides,
-                padding,
-                dilation,
-                out_dtype,
-                False,
-                data_layout,
-            )
-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
-        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-        # update new attrs
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-        new_attrs["kernel_layout"] = f"OIHW{ic_bn}i{oc_bn}o"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        # Store altered operator's config
-        new_data = te.placeholder(
-            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-        )
-        new_kernel = te.placeholder(
-            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn),
-            dtype=kernel_tensor.dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [
-                new_data,
-                new_kernel,
-                strides,
-                padding,
-                dilation,
-                new_attrs["data_layout"],
-                new_attrs["out_layout"],
-                out_dtype,
-            ],
-            topi_tmpl,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
-
-    if topi_tmpl == "depthwise_conv2d_NCHWc.x86":
-        # Converting NCHW to NCHWc.
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        if cfg.is_fallback:
-            _get_x86_default_config(
-                cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout
-            )
-
-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
-        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-        assert channel_multiplier == 1
-
-        # update new attrs
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        new_attrs["kernel_layout"] = f"OIHW1i{oc_bn}o"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        # Store altered operator's config.
-        new_data = te.placeholder(
-            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-        )
-        new_kernel = te.placeholder((out_channel // oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [
-                new_data,
-                new_kernel,
-                strides,
-                padding,
-                dilation,
-                new_attrs["data_layout"],
-                new_attrs["out_layout"],
-                out_dtype,
-            ],
-            topi_tmpl,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
-
-        n_elems = 4
-
-        if cfg.is_fallback:
-            _get_default_config_int8(
-                cfg,
-                data_tensor,
-                kernel_tensor,
-                strides,
-                padding,
-                dilation,
-                out_dtype,
-                False,
-                data_layout,
-                int32_lanes=4,
-            )
-
-        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-        if cfg.is_fallback:
-            # ic_bn needs to be divided by n_elems below
-            ic_bn = max(ic_bn, n_elems)
-
-        # update new attrs
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        new_attrs["kernel_layout"] = f"OIHW{ic_bn // n_elems:n}i{oc_bn:n}o{n_elems:n}i"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        # Store altered operator's config.
-        new_data = te.placeholder(
-            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-        )
-        new_kernel = te.placeholder(
-            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
-            dtype=kernel_dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [
-                new_data,
-                new_kernel,
-                strides,
-                padding,
-                dilation,
-                new_attrs["data_layout"],
-                new_attrs["out_layout"],
-                out_dtype,
-            ],
-            topi_tmpl,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-        KH, KW, _, OC = get_const_tuple(kernel.shape)
-        new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu"
-        new_kernel, new_kernel_expr = transform_weights(inputs, data, kernel, interleave_A=True)
-        new_workload = autotvm.task.args_to_workload(
-            [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
-            new_workload_name,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.contrib_conv2d_gemm_without_weight_transform(
-            inputs[0], new_kernel_expr, **new_attrs
-        )
-    if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu":
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-        KH, KW, _, OC = get_const_tuple(kernel.shape)
-        new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu"
-        new_kernel, new_kernel_expr = transform_weights(inputs, data, kernel, interleave_A=False)
-        new_workload = autotvm.task.args_to_workload(
-            [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
-            new_workload_name,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_gemm_without_weight_transform(
-            inputs[0], new_kernel_expr, **new_attrs
-        )
-    return None
-
-
-@conv2d_legalize.register("arm_cpu")
-def _conv2d_legalize(attrs, inputs, arg_types):
-    """Legalizes Conv2D op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Collect the input tensors.
-    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
-    data_dtype = data_tensor.dtype
-    kernel_dtype = kernel_tensor.dtype
-
-    # Collect the output tensor.
-    output_tensor = arg_types[2]
-
-    # Collect the input exprs.
-    data, kernel = inputs
-
-    # Determine conv2d implementation
-    target = Target.current(allow_none=False)
-    _, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.conv2d"),
-        attrs,
-        [
-            te.placeholder(data_tensor.shape, data_dtype),
-            te.placeholder(kernel_tensor.shape, kernel_dtype),
-        ],
-        output_tensor,
-        target,
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is not None:
-        topi_tmpl = workload[0]
-
-    # ARM vector instructions operate on the same dtype for data and kernel, we
-    # provide those here and conv2d_alter_int8_common will convert to the
-    # correct datatype.
-    if is_int8_hw_support(kernel_dtype, kernel_dtype):
-        # ARM intrinsics need the datatypes of data and kernel to be the same
-        if (
-            attrs["data_layout"] == "NHWC"
-            and attrs["kernel_layout"] == "HWIO"
-            and topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu"
-        ):
-            in_channel_vector_length = data_tensor.shape[3]
-        else:
-            in_channel_vector_length = 8
-
-        return conv2d_alter_int8_common(
-            data,
-            data_tensor,
-            kernel,
-            kernel_tensor,
-            output_tensor,
-            attrs,
-            kernel_dtype,
-            in_channel_vector_length,
-            8,
-        )
-    return None
diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py
deleted file mode 100644
index cc1a28b9dee0..000000000000
--- a/python/tvm/topi/arm_cpu/conv2d_gemm.py
+++ /dev/null
@@ -1,586 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, too-many-locals
-# pylint: disable=unused-argument, redefined-builtin
-"""GEMM Convolution schedule on ARM"""
-import tvm
-from tvm.target import Target
-from tvm import te
-from tvm.topi import nn
-from tvm.topi.arm_cpu import arm_utils
-from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity
-from ..utils import get_const_tuple, get_const_int
-from ..nn.utils import get_pad_tuple
-from .tensor_intrin import (
-    gemm_4x4_int8_int8_int32,
-    gemm_acc_4x4_int8_int8_int32,
-    gemm_acc_nx16_int8_int8_int32,
-    gemm_acc_2x2_int8_int8_int32,
-)
-
-
-def configure_knobs(cfg, M, K, target):
-    """Configure auto-tuning knobs for the interleaved strategy"""
-
-    x, y = cfg.axis(M // 4), cfg.axis(K // 16)
-    cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]])
-
-    outer_loop, inner_loop = cfg.axis(4), cfg.axis(16)
-    cfg.define_annotate(
-        "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec"
-    )
-
-    # Fallback configuration
-    if cfg.is_fallback:
-        cfg["reorder_gemm"] = ReorderEntity([0, 1])
-        cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"])
-
-    if not target.features.has_dotprod:
-        cfg.define_knob("gemm_quantized_unroll", [True, False])
-        if cfg.is_fallback:
-            cfg["gemm_quantized_unroll"] = OtherOptionEntity(False)
-
-
-# Compute function
-def compute_conv2d_gemm_without_weight_transform(
-    cfg,
-    data,
-    B_interleaved_t,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    kernel_size,
-    output_channels,
-    interleave_A,
-    use_scalable_vectors=False,
-    use_sme=False,
-):
-    """Compute conv2d by transforming the input,
-    executing GEMM and transforming the output back"""
-    batches, IH, IW, IC = get_const_tuple(data.shape)
-    in_dtype = data.dtype
-
-    KH, KW = get_const_tuple(kernel_size)
-    OC = get_const_int(output_channels)
-    kernel_area = KH * KW
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = get_const_tuple(dilation)
-
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-
-    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
-    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
-
-    # Input padding (if necessary)
-    if pad_top or pad_left or pad_down or pad_right:
-        data_pad = nn.pad(
-            data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], name="data_pad"
-        )
-    else:
-        data_pad = data
-
-    # Im2col transformation
-    M = OH * OW
-    K = IC * kernel_area
-    N = OC
-
-    A_shape = (batches, M, K)
-    if kernel_area == 1:
-        A = tvm.topi.reshape(data_pad, A_shape)
-    else:
-        A = te.compute(
-            A_shape,
-            lambda n, x, y: data_pad[
-                n,
-                HSTR * (x // OW) + dilation_h * ((y // IC) // KW),
-                WSTR * (x % OW) + dilation_w * ((y // IC) % KW),
-                y % IC,
-            ],
-            name="data_im2col",
-        )
-
-    # Select the tiling strategy for A and B
-    tile_M, tile_K_A = arm_utils.get_tiling_A(interleave_A, in_dtype, use_sme)
-    tile_N, tile_K_B = arm_utils.get_tiling_B_transformed(
-        interleave_A,
-        in_dtype,
-        use_scalable_vectors,
-        use_sme,
-    )
-
-    # Pad to tiles (if necessary)
-    use_explicit_predication = use_sme and in_dtype == "float32"
-    if not use_explicit_predication:
-        pad_M, pad_K = arm_utils.get_conv2d_im2col_padding(M, K, tile_M, tile_K_A)
-        pad_N, _ = arm_utils.get_conv2d_weights_padding(N, K, tile_N, tile_K_B)
-
-        M_padded = M + pad_M
-        K_padded = K + pad_K
-        N_padded = N + pad_N
-
-        pad_before = (0, 0, 0)
-        pad_after = (0, pad_M, pad_K)
-
-        if pad_K != 0:
-            A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded_K")
-        elif pad_M != 0:
-            A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded_M")
-
-    idxm = tvm.tir.indexmod
-    k = te.reduce_axis((0, K if use_explicit_predication else K_padded), "k")
-
-    # Determine matrix multiplication compute definition
-    target = Target.current(allow_none=False)
-    if in_dtype in ["int8", "uint8"]:
-        assert len(B_interleaved_t.shape) == 4
-        if interleave_A:
-            # Configuration space
-            configure_knobs(cfg, M_padded, K_padded, target)
-
-            # Pack the input data
-            A_interleaved = te.compute(
-                (
-                    batches,
-                    M_padded // tile_M,
-                    K_padded // tile_K_A,
-                    tile_M,
-                    tile_K_A,
-                ),
-                lambda b, x, y, z, w: A[b, z + tile_M * x, w + tile_K_A * y],
-                name="A_interleaved",
-            )
-            N_transformed = B_interleaved_t.shape[0]
-            if target.features.has_matmul_i8:
-                # Execute GEMM. In the case of mmla, we need to enforce the tiling
-                # from the compute. This is because mmla is doing a tiled computation
-                # as well. So we have a big 8x12 tile, with small 2x2 sub-tiles
-                # generated by mmla. In theory we could make the tile 2x2 and
-                # fuse and split during scheduling, but this would not work
-                # because of possible padding
-                C_interleaved = te.compute(
-                    (
-                        batches,
-                        M_padded // tile_M,
-                        N_transformed,
-                        tile_M // 2,
-                        tile_N // 2,
-                        2,
-                        2,
-                    ),
-                    lambda b, x, y, w, z, s, t: te.sum(
-                        A_interleaved[b, x, k // tile_K_A, 2 * w + s, idxm(k, tile_K_A)].astype(
-                            "int32"
-                        )
-                        * B_interleaved_t[y, k // tile_K_B, 2 * z + t, idxm(k, tile_K_B)].astype(
-                            "int32"
-                        ),
-                        axis=k,
-                    ),
-                    name="C_interleaved",
-                )
-                # Ensure the padding needed for tensorize does not get removed during tir passes
-                # by adding a dummy reference to the specific padded area of the result
-                zero = (
-                    tvm.tir.const(1, C_interleaved.dtype)
-                    * C_interleaved[
-                        batches - 1,
-                        M // tile_M,
-                        N_transformed - 1,
-                        idxm(M, tile_M) // 2,
-                        tile_N // 2 - 1,
-                        1,
-                        1,
-                    ]
-                    - tvm.tir.const(1, C_interleaved.dtype)
-                    * C_interleaved[
-                        batches - 1,
-                        M // tile_M,
-                        N_transformed - 1,
-                        idxm(M, tile_M) // 2,
-                        tile_N // 2 - 1,
-                        1,
-                        1,
-                    ]
-                )
-                # Unpack the result
-                C = te.compute(
-                    (batches, M, N),
-                    lambda b, x, y: (
-                        C_interleaved[
-                            b,
-                            x // tile_M,
-                            y // tile_N,
-                            idxm(x, tile_M) // 2,
-                            idxm(y, tile_N) // 2,
-                            idxm(idxm(x, tile_M), 2),
-                            idxm(idxm(y, tile_N), 2),
-                        ]
-                        + zero
-                    ).astype(out_dtype),
-                    name="C",
-                )
-            else:
-                # Execute GEMM
-                C_interleaved = te.compute(
-                    (batches, M_padded // tile_M, N_transformed, tile_M, tile_N),
-                    lambda b, x, y, w, z: te.sum(
-                        A_interleaved[b, x, k // tile_K_A, w, idxm(k, tile_K_A)].astype("int32")
-                        * B_interleaved_t[y, k // tile_K_B, z, idxm(k, tile_K_B)].astype("int32"),
-                        axis=k,
-                    ),
-                    name="C_interleaved",
-                )
-                # Unpack the result
-                C = te.compute(
-                    (batches, M, N),
-                    lambda b, x, y: C_interleaved[
-                        b,
-                        x // tile_M,
-                        y // tile_N,
-                        idxm(x, tile_M),
-                        idxm(y, tile_N),
-                    ].astype(out_dtype),
-                    name="C",
-                )
-            zero = tvm.tir.const(0)
-        else:
-            # No need to pack/unpack, execute GEMM directly
-            C = te.compute(
-                (batches, M_padded, N_padded),
-                lambda b, x, y: te.sum(
-                    A[b, x, k].astype("int32")
-                    * B_interleaved_t[
-                        y // tile_N,
-                        k // tile_K_B,
-                        idxm(y, tile_N),
-                        idxm(k, tile_K_B),
-                    ].astype("int32"),
-                    axis=k,
-                ),
-                name="C",
-            )
-
-            # We need to ensure that infer bound pass does not remove the padding
-            # which is necessary for the tensorizations to work. So we need to
-            # add a dummy reference to the padding area of the result
-            zero = (
-                tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1]
-                - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1]
-            )
-    elif use_sme and in_dtype == "float16" and out_dtype == "float32":
-        assert len(B_interleaved_t.shape) == 2
-        C = te.compute(
-            (batches, M_padded, N_padded),
-            lambda b, x, y: te.sum(
-                A[b, x, k].astype(out_dtype) * B_interleaved_t[y, k].astype(out_dtype),
-                axis=k,
-            ),
-            name="C",
-        )
-        zero = tvm.tir.const(0)
-    elif use_explicit_predication:
-        assert len(B_interleaved_t.shape) == 2
-        C = te.compute(
-            (batches, M, N),
-            lambda b, x, y: te.sum(
-                A[b, x, k].astype(in_dtype) * B_interleaved_t[k, y].astype(in_dtype),
-                axis=k,
-            ),
-            name="C",
-        )
-        zero = tvm.tir.const(0)
-    elif use_scalable_vectors:
-        assert len(B_interleaved_t.shape) == 2
-        C = te.compute(
-            (batches, M_padded, N_padded),
-            lambda b, x, y: te.sum(
-                A[b, x, k].astype(in_dtype) * B_interleaved_t[k, y].astype(in_dtype),
-                axis=k,
-            ),
-            name="C",
-        )
-        # Ensure padding on the N axis does not get removed during tir passes
-        # by adding a dummy reference to the specific padded area of the result
-        zero = (
-            tvm.tir.const(1, C.dtype) * C[0, 0, N_padded - 1]
-            - tvm.tir.const(1, C.dtype) * C[0, 0, N_padded - 1]
-        )
-    else:
-        assert len(B_interleaved_t.shape) == 4
-        C = te.compute(
-            (batches, M_padded, N_padded),
-            lambda b, x, y: te.sum(
-                A[b, x, k].astype(in_dtype)
-                * B_interleaved_t[
-                    y // tile_N,
-                    k // tile_K_B,
-                    idxm(k, tile_K_B),
-                    idxm(y, tile_N),
-                ].astype(in_dtype),
-                axis=k,
-            ),
-            name="C",
-        )
-        # Ensure padding on the N axis does not get removed during tir passes
-        # by adding a dummy reference to the specific padded area of the result
-        if in_dtype == "float16" and target.features.has_fp16_simd:
-            zero = (
-                tvm.tir.const(1, C.dtype) * C[0, 0, N_padded - 1]
-                - tvm.tir.const(1, C.dtype) * C[0, 0, N_padded - 1]
-            )
-        else:
-            zero = tvm.tir.const(0)
-
-    # Reshape the result into a convolution output
-    out_shape = (batches, OH, OW, OC)
-    out = te.compute(
-        out_shape,
-        lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype),
-        name="conv2d_gemm_output",
-        attrs={"use_scalable_vectors": use_scalable_vectors, "use_sme": use_sme},
-    )
-    return out
-
-
-def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out):
-    """Schedule the conv2d_gemm interleaved strategy"""
-    C = out.op.input_tensors[0]
-    C_interleaved = C.op.input_tensors[0]
-    A_interleaved = C_interleaved.op.input_tensors[0]
-    in_type = A_interleaved.dtype
-    tile_M, tile_K = arm_utils.get_tiling_A(True, in_type)
-
-    # Input transform
-    A_interleaved_input = A_interleaved.op.input_tensors[0]
-    if A_interleaved_input.op.name == "A_padded_K" or A_interleaved_input.op.name == "A_padded_M":
-        s[A_interleaved_input].compute_at(s[A_interleaved], A_interleaved.op.axis[3])
-        s[A_interleaved_input].vectorize(A_interleaved_input.op.axis[2])
-        s[A_interleaved_input].compute_inline()
-        data_im2col = A_interleaved_input.op.input_tensors[0]
-    else:
-        data_im2col = A_interleaved_input
-
-    b, m, n = data_im2col.op.axis
-    if data_im2col.op.name == "data_im2col":
-        n_size = data_im2col.shape[2]
-        if n_size % 16 == 0:
-            split_factor = 16
-        else:
-            split_factor = 8
-        n_outer, n_inner = s[data_im2col].split(n, split_factor)
-        s[data_im2col].unroll(n_outer)
-        s[data_im2col].vectorize(n_inner)
-        b_m_fused = s[data_im2col].fuse(b, m)
-        s[data_im2col].parallel(b_m_fused)
-    else:
-        s[data_im2col].compute_inline()
-
-    # Computation(through tensorize)
-    b, xo, yo, xi, yi = C_interleaved.op.axis[0:5]
-    outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo])
-
-    b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm)
-    s[C_interleaved].parallel(b_outer_gemm_fused)
-    s[A_interleaved].compute_at(s[C_interleaved], b_outer_gemm_fused)
-    _, _, _, outer_A_interleaved, inner_A_interleaved = A_interleaved.op.axis
-    cfg["A_interleaved_unroll_vec"].apply(
-        s, A_interleaved, [outer_A_interleaved, inner_A_interleaved]
-    )
-
-    k = C_interleaved.op.reduce_axis[0]
-    _, M, N = C.shape
-    if in_type in ["int8", "uint8"]:
-        target = Target.current(allow_none=False)
-        if target.features.has_matmul_i8:
-            gemm_acc = gemm_acc_2x2_int8_int8_int32(in_type)
-            xi_inner, yi_inner = C_interleaved.op.axis[-2:]
-            k_outer, k_inner = s[C_interleaved].split(k, tile_K)
-            s[C_interleaved].reorder(
-                b_outer_gemm_fused, inner_gemm, k_outer, xi, yi, xi_inner, yi_inner, k_inner
-            )
-            s[C_interleaved].tensorize(xi_inner, gemm_acc)
-            s[C_interleaved].unroll(xi)
-            s[C_interleaved].unroll(yi)
-        elif target.features.has_dotprod:
-            gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type)
-            xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile(
-                xi, yi, x_factor=tile_M, y_factor=4
-            )
-            k_outer, k_inner = s[C_interleaved].split(k, tile_K)
-            xi_inner_outer, xi_inner_inner = s[C_interleaved].split(xi_inner, 4)
-            s[C_interleaved].reorder(
-                b_outer_gemm_fused,
-                inner_gemm,
-                xi_outer,
-                yi_outer,
-                k_outer,
-                xi_inner_outer,
-                xi_inner_inner,
-                yi_inner,
-                k_inner,
-            )
-            s[C_interleaved].tensorize(xi_inner_inner, gemm_acc)
-            s[C_interleaved].unroll(xi_inner_outer)
-
-        elif target.features.has_asimd:
-            s[C_interleaved].reorder(yi, xi)
-            K = A_interleaved_input.shape[2]
-            assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported"
-            unroll = cfg["gemm_quantized_unroll"].val
-            gemm = gemm_4x4_int8_int8_int32(M, N, K, unroll, in_type)
-            s[C_interleaved].tensorize(yi, gemm)
-
-    # Output transform
-    if out != final_out:
-        n, h, w, c = out.op.axis
-        _, inner = s[out].split(c, 4)
-        s[C].compute_at(s[out], inner)
-        s[out].vectorize(inner)
-    return s
-
-
-def schedule_conv2d_gemm_native(cfg, s, out, final_out):
-    """Schedule the conv2d_gemm hybrid strategy"""
-    C = out.op.input_tensors[0]
-    A = C.op.input_tensors[0]
-    in_type = A.dtype
-    use_scalable_vectors = bool(out.op.attrs["use_scalable_vectors"])
-    tile_M, tile_K = arm_utils.get_tiling_A(False, in_type)
-    tile_N, _ = arm_utils.get_tiling_B_transformed(False, in_type, use_scalable_vectors)
-
-    # Computation
-    b, x, y = C.op.axis
-    (k,) = C.op.reduce_axis
-
-    if in_type in ["int8", "uint8"]:
-        k_outer, k_inner = s[C].split(k, tile_K)
-        x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=tile_M, y_factor=tile_N)
-        s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner)
-        gemm_acc = gemm_acc_nx16_int8_int8_int32(in_type, rows=1)
-        s[C].unroll(x_inner)
-        s[C].tensorize(y_inner, gemm_acc)
-        s[C].parallel(x_outer)
-    elif use_scalable_vectors:
-        k_outer, k_inner = s[C].split(k, factor=tile_K)
-        x_outer, x_inner = s[C].split(x, factor=tile_M)
-        y_outer, y_inner = s[C].split(y, factor=tile_N, disable_predication=use_scalable_vectors)
-        b_x_outer_fused = s[C].fuse(b, x_outer)
-        s[C].parallel(b_x_outer_fused)
-        s[C].reorder(
-            b_x_outer_fused,
-            y_outer,
-            k_outer,
-            k_inner,
-            x_inner,
-            y_inner,
-        )
-        s[C].unroll(x_inner)
-        s[C].vectorize(y_inner)
-    else:
-        k_outer, k_inner = s[C].split(k, factor=tile_K)
-        x_outer, x_inner = s[C].split(x, factor=tile_M)
-        y_outer, y_inner = s[C].split(y, factor=tile_N)
-        y_inner_outer, y_inner_inner = s[C].split(y_inner, nparts=4)
-        b_x_outer_fused = s[C].fuse(b, x_outer)
-        s[C].parallel(b_x_outer_fused)
-        s[C].reorder(
-            b_x_outer_fused,
-            y_outer,
-            k_outer,
-            k_inner,
-            y_inner_outer,
-            x_inner,
-            y_inner_inner,
-        )
-        s[C].unroll(y_inner_outer)
-        s[C].unroll(x_inner)
-        s[C].vectorize(y_inner_inner)
-
-    # Input transform
-    if A.op.name == "A_padded_K" or A.op.name == "A_padded_M":
-        padding_A = True
-        data_im2col = A.op.input_tensors[0]
-    else:
-        padding_A = False
-        data_im2col = A
-
-    b, m, n = data_im2col.op.axis
-    if data_im2col.op.name == "data_im2col":
-        # Either only pad_K or both pad_K and pad_M applied
-        if A.op.name == "A_padded_K":
-            s[data_im2col].compute_at(s[A], A.op.axis[1])
-            s[A].parallel(A.op.axis[1])
-        # Only pad_M applied
-        elif A.op.name == "A_padded_M":
-            s[data_im2col].parallel(m)
-            s[A].parallel(A.op.axis[1])
-        # No padding
-        else:
-            s[data_im2col].parallel(m)
-
-        split_factor = 16
-        n_size = data_im2col.shape[2]
-        if n_size % 16 == 0:
-            split_factor = 16
-        elif n_size % 8 == 0:
-            split_factor = 8
-        else:
-            # Split by kernel area (KH * KW) to ensure proper vectorization
-            ic = data_im2col.op.input_tensors[0].shape[3]
-            split_factor = n_size // ic
-
-        n_outer, n_inner = s[data_im2col].split(n, split_factor)
-        s[data_im2col].unroll(n_outer)
-        s[data_im2col].vectorize(n_inner)
-    elif padding_A:
-        s[data_im2col].compute_inline()
-        _, n_inner = s[A].split(A.op.axis[2], tile_N)
-        s[A].vectorize(n_inner)
-        s[A].compute_at(s[C], x_inner)
-    else:
-        s[data_im2col].compute_at(s[C], x_inner)
-
-    A_pad = data_im2col.op.input_tensors[0]
-    if A_pad.op.name == "data_pad":
-        n, h, w, c = A_pad.op.axis
-        n_h_fused = s[A_pad].fuse(n, h)
-        s[A_pad].parallel(n_h_fused)
-        s[A_pad].vectorize(c)
-
-    # Weight transform
-    if use_scalable_vectors:
-        B_pad = C.op.input_tensors[1]
-        s[B_pad].parallel(B_pad.op.axis[0])
-        B_flat = B_pad.op.input_tensors[0]
-        s[B_flat].compute_inline()
-
-    # Output transform
-    if out != final_out:
-        n, h, w, c = out.op.axis
-        _, inner = s[out].split(c, 4)
-        s[out].vectorize(inner)
-    return s
diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py
deleted file mode 100644
index 721385c189e7..000000000000
--- a/python/tvm/topi/arm_cpu/conv2d_int8.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D int8 schedule on ARM"""
-from tvm import te, target, autotvm
-from ..utils import traverse_inline, get_const_tuple
-from ..generic import conv2d as conv2d_generic
-from .. import nn
-from ...target import codegen
-from ..nn.conv2d import _get_workload as _get_conv2d_workload, unpack_NCHWc_to_nchw
-from ..x86.conv2d_int8 import _pack_data
-from ..nn.utils import get_pad_tuple
-from .tensor_intrin import dot_int8_int8_int32_neon_82, dot_int8_int8_int32_neon
-from .conv2d import compute_conv2d_NHWC, compute_conv2d_NHWC_without_transform, schedule_conv2d_NHWC
-
-
-def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """
-    Get default int8 schedule config for the workload
-    """
-    wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype)
-    is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
-    if is_kernel_1x1:
-        conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=4, num_int8_elements=4)
-    else:
-        conv2d_generic.fallback_schedule_cpu_common_int8(
-            cfg, wkl, int32_lanes=4, num_int8_elements=4
-        )
-
-
-@autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu")
-def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype):
-    """Compute conv2d int8 with NCHWc layout"""
-    # layout and out_layout are not used here,
-    # we keep them for debug convenience when dumping autotvm workload
-
-    if len(data.shape) == 5:  # data is in nchwc
-        n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
-        in_channel = ic_chunk * ic_bn
-
-        oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn = get_const_tuple(kernel.shape)[:6]
-        num_filter = oc_chunk * oc_bn
-    else:
-        # data is nchw, implicitly treat it as nchw1c
-        n, in_channel, ih, iw = get_const_tuple(data.shape)
-        num_filter, _, kh, kw = get_const_tuple(kernel.shape)
-
-    # Define autotvm tuning space
-    is_kernel_1x1 = kh == 1 and kw == 1
-    pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw))
-    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-    dilated_kernel_h = (kh - 1) * dh + 1
-    dilated_kernel_w = (kw - 1) * dw + 1
-    oh = (ih - dilated_kernel_h + pt + pb) // sh + 1
-    ow = (iw - dilated_kernel_w + pl + pr) // sw + 1
-
-    # input and output should be a multiple of 8 (intrinsics are 8 lanes)
-    cfg.define_split(
-        "tile_ic", in_channel, num_outputs=2, filter=lambda y: y.size[-1] % min(8, in_channel) == 0
-    )
-    cfg.define_split(
-        "tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % min(8, num_filter) == 0
-    )
-    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
-    if is_kernel_1x1:
-        cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
-    else:
-        cfg.define_knob("unroll_kw", [True, False])
-
-    # If no config was set, we can fallback to NCHW config.
-    if cfg.is_fallback:
-        _get_default_config(
-            cfg,
-            te.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
-            te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype),
-            strides,
-            padding,
-            dilation,
-            out_dtype,
-        )
-    # Pack data if raw 4-D data is provided.
-    # This can only happen when autotuning.
-    if len(data.shape) == 4:
-        data, kernel = _pack_data(cfg, data, kernel)
-
-    n_elems = int(kernel.shape[-1])
-
-    return nn.conv2d_NCHWc_int8(
-        data, kernel, strides, padding, dilation, layout, out_layout, out_dtype, n_elems=n_elems
-    )
-
-
-def is_int8_hw_support(data_dtype, kernel_dtype):
-    """
-    Checks to ensure that we can use int8 on arm
-    1) The datatypes are correct.
-    2) LLVM version has support for the instructions.
-    """
-    # 1) Check datatypes
-    is_dtype_support = data_dtype == kernel_dtype and "int8" in data_dtype
-
-    # 2) Check LLVM support
-    llvm_version = codegen.llvm_version_major()
-    is_llvm_support = llvm_version >= 8
-
-    # 3) Check target
-    current_target = target.Target.current(allow_none=False)
-    is_target_support = bool(
-        current_target.features.has_asimd or current_target.features.has_dotprod
-    )
-
-    return is_dtype_support and is_llvm_support and is_target_support
-
-
-@autotvm.register_topi_schedule("conv2d_NCHWc_int8.arm_cpu")
-def schedule_conv2d_NCHWc_int8(cfg, outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def _callback(op):
-        if "conv2d_NCHWc_int8" in op.tag:
-            conv_out = op.output(0)
-            kernel_vec = conv_out.op.input_tensors[1]
-            data_vec = conv_out.op.input_tensors[0]
-            data = (
-                data_vec.op.input_tensors[0]
-                if isinstance(data_vec.op, te.tensor.ComputeOp) and "pad" not in data_vec.op.tag
-                else data_vec
-            )
-            if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]]
-            # int8 conv kernel is 7-dim
-            _, _, kh, kw, _, _, n_elems = get_const_tuple(kernel_vec.shape)
-            assert n_elems == 4
-            dtype = "uint" if data.dtype == "uint8" else "int"
-            current_target = target.Target.current(allow_none=False)
-            if current_target.features.has_dotprod:
-                intrin = dot_int8_int8_int32_neon_82(int32_lanes=4, dtype=dtype)
-            elif current_target.features.has_asimd:
-                assert dtype == "int", "uint8 not supported if dot product is not available"
-                intrin = dot_int8_int8_int32_neon()
-            else:
-                raise RuntimeError(
-                    "Cannot schedule schedule_NCHWc_int8 without neon or arm v8.2 neon support"
-                )
-            # On raspberry pi 4s, we see poor performance when the fused
-            # operations are inlined into the main computation body. These
-            # fused ops dominated the runtime on small convolutions repeatedly
-            # blow the cache. Using workloads from resnet50, inceptionv3, and
-            # mobilenetv3, we empirically determine the size at which inline is
-            # not worth it to be kernel heigh * kernel width < 500. These tests
-            # were only run on raspberry pi 4, other arm cpus may have larger
-            # caches where inlining has good performance.
-            if target.Target.current().mcpu == "cortex-a72" and kh * kw < 500:
-                inline_fused = False
-            else:
-                inline_fused = True
-            if kh == 1 and kw == 1:
-                conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(
-                    *args, int32_lanes=4, int8_elems=4, intrin=intrin, inline_fused=inline_fused
-                )
-            else:
-                conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(
-                    *args, int32_lanes=4, int8_elems=4, intrin=intrin, inline_fused=inline_fused
-                )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with NCHW layout and int8 dtype"""
-    layout = "NCHW"
-    # pylint: disable=no-value-for-parameter
-    packed_out = conv2d_NCHWc_int8(
-        data, kernel, strides, padding, dilation, layout, layout, out_dtype
-    )
-    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
-
-
-def schedule_conv2d_nchw_int8(outs):
-    """Create the schedule for conv2d_nchw_int8"""
-    # pylint: disable=no-value-for-parameter
-    return schedule_conv2d_NCHWc_int8(outs)
-
-
-# Interleaved schedules: those schedule will interleave the input data. The
-# weights are interleaved and transposed
-@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved.arm_cpu")
-def compute_conv2d_NHWC_quantized_interleaved(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    """Interface for interleaved compute_conv2d_NHWC_quantized_interleaved"""
-    return compute_conv2d_NHWC(cfg, data, kernel, strides, padding, dilation, out_dtype, True)
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu")
-def compute_conv2d_NHWC_quantized_interleaved_without_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels
-):
-    """Interface for interleaved compute_conv2d_NHWC_quantized_interleaved_without_transform"""
-    return compute_conv2d_NHWC_without_transform(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved.arm_cpu")
-def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs):
-    """Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved"""
-    return schedule_conv2d_NHWC(cfg, outs, True)
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu")
-def schedule_conv2d_NHWC_quantized_interleaved_without_transform(cfg, outs):
-    """Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved"""
-    return schedule_conv2d_NHWC(cfg, outs, True)
-
-
-# Native schedules: those schedule won't interleave A (which is left in its native form).
-# The weights are interleaved and transposed
-@autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu")
-def compute_conv2d_NHWC_quantized_native(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Interface for native compute_conv2d_NHWC_quantized"""
-    return compute_conv2d_NHWC(cfg, data, kernel, strides, padding, dilation, out_dtype, False)
-
-
-@autotvm.register_topi_compute("conv2d_NHWC_quantized_native_without_transform.arm_cpu")
-def compute_conv2d_NHWC_quantized_native_without_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels
-):
-    """Interface for compute_conv2d_NHWC_quantized_native_without_transform"""
-    return compute_conv2d_NHWC_without_transform(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        kernel_size,
-        output_channels,
-        False,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu")
-def schedule_conv2d_NHWC_quantized_native(cfg, outs):
-    """Interface for native schedule_conv2d_NHWC_quantized"""
-    return schedule_conv2d_NHWC(cfg, outs, False)
-
-
-@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native_without_transform.arm_cpu")
-def schedule_conv2d_NHWC_quantized_native_without_transform(cfg, outs):
-    """Interface for native schedule_conv2d_NHWC_quantized"""
-    return schedule_conv2d_NHWC(cfg, outs, False)
diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
deleted file mode 100644
index da3afd642b0b..000000000000
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ /dev/null
@@ -1,476 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,no-else-return
-"""Conv2D spatial pack implementation for ARM CPU"""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.target import Target
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity
-from .. import nn
-from ..utils import get_const_tuple
-from ..nn.utils import get_const_int, get_pad_tuple
-
-
-def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile):
-    """compute define for Conv2d Spatial Pack with NCHW layout"""
-    out_dtype = out_dtype or data.dtype
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    if isinstance(N, tvm.tir.Any):
-        N = tvm.te.size_var("n")
-    if not isinstance(IH, int) or not isinstance(IW, int):
-        raise RuntimeError("ARM winograd conv2d doesn't support dynamic input height or width.")
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    if len(kernel.shape) == 4:
-        pre_packed = False
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-    else:  # kernel tensor is pre packed
-        pre_packed = True
-        CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
-        CO = CO * VC
-
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
-    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
-    data_pad = nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
-
-    # ==================== define configuration space ====================
-    # TODO(@kevinthesun): Support tuning/optimization for dynamic shape.
-    n_tuning_axis = N if isinstance(N, int) else 1
-    n, co, oh, ow = cfg.axis(n_tuning_axis), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
-    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-
-    if num_tile == 2:  # for arm cpu
-        co, vc = cfg.define_split("tile_co", co, num_outputs=2)
-        oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2)
-        ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2)
-    elif num_tile == 3:  # for mali gpu
-        co, _, vc = cfg.define_split("tile_co", co, num_outputs=3)
-        oh, _, vh = cfg.define_split("tile_oh", oh, num_outputs=3)
-        ow, _, vw = cfg.define_split("tile_ow", ow, num_outputs=3)
-    else:
-        raise RuntimeError("Invalid num_tile")
-
-    cfg.define_reorder(
-        "reorder_0",
-        [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
-        policy="candidate",
-        candidate=[
-            [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
-            [n, co, oh, ow, ci, kh, kw, vc, vh, vw],
-        ],
-    )
-
-    cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll_vec")
-
-    # fallback support
-    if cfg.is_fallback:
-        if num_tile == 2:  # arm cpu
-            ref_log = autotvm.tophub.load_reference_log(
-                "arm_cpu", "rk3399", "conv2d_nchw_spatial_pack.arm_cpu"
-            )
-            cfg.fallback_with_reference_log(ref_log)
-        elif num_tile == 3:  # mali gpu
-            ref_log = autotvm.tophub.load_reference_log(
-                "mali", "rk3399", "conv2d_nchw_spatial_pack.mali"
-            )
-            cfg.fallback_with_reference_log(ref_log)
-    # ====================================================================
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    kvshape = (CO // VC, CI, KH, KW, VC)
-    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (N, CO, OH, OW)
-
-    if dilation_h != 1 or dilation_w != 1:
-        # undilate input data
-        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
-        data_vec = te.compute(
-            dvshape,
-            lambda n, h, w, ci, kh, kw, vh, vw: data_pad[n][ci][
-                (h * VH + vh) * HSTR + kh * dilation_h
-            ][(w * VW + vw) * WSTR + kw * dilation_w],
-            name="data_vec_undilated",
-        )
-    else:
-        dvshape = (N, OH // VH, OW // VW, CI, VH * HSTR + KH - 1, VW * WSTR + KW - 1)
-        data_vec = te.compute(
-            dvshape,
-            lambda n, h, w, ci, vh, vw: data_pad[n][ci][h * VH * HSTR + vh][w * VW * WSTR + vw],
-            name="data_vec",
-        )
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # use "kernel_autotvm" instead of "kernel" to avoid naming conflict with OpenCL keyword
-        kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel_autotvm")
-    else:
-        if pre_packed:
-            kernel_vec = kernel
-        else:
-            kernel_vec = te.compute(
-                kvshape,
-                lambda co, ci, kh, kw, vc: kernel[co * VC + vc][ci][kh][kw],
-                name="kernel_vec",
-            )
-
-    ci = te.reduce_axis((0, CI), name="ci")
-    kh = te.reduce_axis((0, KH), name="kh")
-    kw = te.reduce_axis((0, KW), name="kw")
-
-    if dilation_h != 1 or dilation_w != 1:
-        conv = te.compute(
-            ovshape,
-            lambda n, co, h, w, vh, vw, vc: te.sum(
-                data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype)
-                * kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
-                axis=[ci, kh, kw],
-            ),
-            name="conv",
-        )
-    else:
-        conv = te.compute(
-            ovshape,
-            lambda n, co, h, w, vh, vw, vc: te.sum(
-                data_vec[n, h, w, ci, vh * HSTR + kh, vw * WSTR + kw].astype(out_dtype)
-                * kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
-                axis=[ci, kh, kw],
-            ),
-            name="conv",
-        )
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    output = te.compute(
-        oshape,
-        lambda n, co, h, w: conv[
-            n,
-            idxdiv(co, VC),
-            idxdiv(h, VH),
-            idxdiv(w, VW),
-            idxmod(h, VH),
-            idxmod(w, VW),
-            idxmod(co, VC),
-        ],
-        name="output_unpack",
-        tag="spatial_conv2d_output",
-    )
-    return output
-
-
-def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, conv, output, last):
-    """schedule implementation"""
-    n, co, oh, ow, vh, vw, vc = s[conv].op.axis
-    ci, kh, kw = s[conv].op.reduce_axis
-
-    # schedule conv
-    cfg["reorder_0"].apply(s, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
-    cfg["ann_reduce"].apply(
-        s,
-        conv,
-        [kh, kw],
-        axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)],
-        max_unroll=None,
-        cfg=cfg,
-    )
-    cfg["ann_spatial"].apply(
-        s,
-        conv,
-        [vh, vw, vc],
-        axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]],
-        max_unroll=None,
-        cfg=cfg,
-    )
-
-    # schedule fusion
-    n, co, h, w = s[last].op.axis
-    co, vc = cfg["tile_co"].apply(s, last, co)
-    oh, vh = cfg["tile_oh"].apply(s, last, h)
-    ow, vw = cfg["tile_ow"].apply(s, last, w)
-    s[last].reorder(n, co, oh, ow, vh, vw, vc)
-    if last != output:
-        s[output].compute_inline()
-        cfg["ann_spatial"].apply(
-            s,
-            last,
-            [vh, vw, vc],
-            axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]],
-            max_unroll=16,
-            cfg=cfg,
-        )
-    s[conv].compute_at(s[last], ow)
-
-    # mark parallel
-    s[last].parallel(co)
-
-    if data_vec.op.name == "data_vec_undilated":
-        _, h, _, _, _, _, _, _ = s[data_vec].op.axis
-    else:
-        _, h, _, _, _, _ = s[data_vec].op.axis
-    s[data_vec].parallel(h)
-
-    if kernel_vec.op.name == "kernel_vec":
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            co, _, _, _, _ = s[kernel_vec].op.axis
-            s[kernel_vec].parallel(co)
-    elif kernel_vec.op.name == "kernel_vec_conv2d_transpose":  # for conv2d transpose
-        co, _, _, _, _ = s[kernel_vec].op.axis
-        s[kernel_vec].parallel(co)
-
-    return s
-
-
-def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2):
-    """Spatial pack compute for Conv2d NHWC"""
-    out_dtype = out_dtype or data.dtype
-
-    N, IH, IW, IC = get_const_tuple(data.shape)
-    assert len(kernel.shape) == 4, "AlterOpLayout not enabled for NHWC yet"
-    KH, KW, _, OC = get_const_tuple(kernel.shape)
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-
-    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
-    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
-    data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0])
-
-    # ==================== define configuration space ====================
-    # If it has dynamic shape in batch, we fix the split factor to 1
-    n = cfg.axis(N) if isinstance(N, int) else cfg.axis(1)
-    oc, oh, ow = cfg.axis(OC), cfg.axis(OH), cfg.axis(OW)
-    ic, kh, kw = cfg.reduce_axis(IC), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-
-    if num_tile == 2:  # for arm cpu
-        oco, oci = cfg.define_split("tile_co", oc, num_outputs=2)
-        oho, ohi = cfg.define_split("tile_oh", oh, num_outputs=2)
-        owo, owi = cfg.define_split("tile_ow", ow, num_outputs=2)
-    elif num_tile == 3:  # for mali gpu
-        oco, _, oci = cfg.define_split("tile_co", oc, num_outputs=3)
-        oho, _, ohi = cfg.define_split("tile_oh", oh, num_outputs=3)
-        owo, _, owi = cfg.define_split("tile_ow", ow, num_outputs=3)
-    else:
-        raise RuntimeError("Invalid num_tile")
-
-    cfg.define_reorder(
-        "reorder_conv",
-        [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci],
-        policy="candidate",
-        candidate=[
-            [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci],
-            [n, oho, owo, oco, ohi, kh, kw, ic, owi, oci],
-            [n, oho, owo, oco, ohi, kh, kw, owi, ic, oci],
-            [n, oho, owo, ohi, oco, kh, kw, owi, ic, oci],
-        ],
-    )
-
-    cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [owi, oci], policy="try_unroll_vec")
-    # ====================================================================
-
-    # If there are no tuning records, use this config
-    if cfg.is_fallback:
-
-        def _tile_size(axis, candidates):
-            for candidate in candidates:
-                tiles_divisible_by_candidate = axis % candidate == 0
-                if tiles_divisible_by_candidate:
-                    return candidate
-            return 1
-
-        # For data tensors with unity height and width we can leave it to the
-        # backend to vectorize the inner loop. This has been observed to be more
-        # performant on SVE targets with a vector width > 128bits.
-        target = Target.current(allow_none=False)
-        if target.features.has_sve and OW == OH and OW == 1:
-            tile_size = [OC]
-            vectorize = "none"
-        else:
-            # Tile size 8 results in efficient vectorization for these schedules.
-            # If the axis is not divisible by 8, try 4
-            tile_size = [8, 4]
-            vectorize = "vec"
-
-        cfg["tile_oh"] = SplitEntity([-1, 1])
-        cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])])
-        cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, tile_size)])
-        cfg["ann_spatial"] = AnnotateEntity(["none", vectorize])
-        cfg["ann_reduce"] = AnnotateEntity(["none", "none"])
-        cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
-        cfg["compat"] = OtherOptionEntity(0)
-
-    OCI = cfg["tile_co"].size[-1]
-    OHI = cfg["tile_oh"].size[-1]
-    OWI = cfg["tile_ow"].size[-1]
-    OCO = OC // OCI
-    OHO = OH // OHI
-    OWO = OW // OWI
-
-    kvshape = (OCO, KH, KW, IC, OCI)
-    ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI)
-    oshape = (N, OH, OW, OC)
-
-    if dilation_h != 1 or dilation_w != 1:
-        # undilate input data
-        dvshape = (N, OHO, OWO, KH, KW, IC, OHI, OWI)
-        data_vec = te.compute(
-            dvshape,
-            lambda n, oho, owo, kh, kw, ic, ohi, owi: data_pad[n][
-                (oho * OHI + ohi) * HSTR + kh * dilation_h
-            ][(owo * OWI + owi) * WSTR + kw * dilation_w][ic],
-            name="data_vec_undilated",
-        )
-    else:
-        dvshape = (N, OHO, OWO, KH + (OHI - 1) * HSTR, KW + (OWI - 1) * WSTR, IC)
-        data_vec = te.compute(
-            dvshape,
-            lambda n, oho, owo, ohi, owi, ic: data_pad[n][oho * OHI * HSTR + ohi][
-                owo * OWI * WSTR + owi
-            ][ic],
-            name="data_vec",
-        )
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel")
-    else:
-        kernel_vec = te.compute(
-            kvshape,
-            lambda oco, kh, kw, ic, oci: kernel[kh][kw][ic][oco * OCI + oci],
-            name="kernel_vec",
-        )
-
-    ic = te.reduce_axis((0, IC), name="ic")
-    kh = te.reduce_axis((0, KH), name="kh")
-    kw = te.reduce_axis((0, KW), name="kw")
-
-    if dilation_h != 1 or dilation_w != 1:
-        conv = te.compute(
-            ovshape,
-            lambda n, oho, owo, oco, ohi, owi, oci: te.sum(
-                data_vec[n, oho, owo, kh, kw, ic, ohi, owi].astype(out_dtype)
-                * kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype),
-                axis=[ic, kh, kw],
-            ),
-            name="conv",
-        )
-    else:
-        conv = te.compute(
-            ovshape,
-            lambda n, oho, owo, oco, ohi, owi, oci: te.sum(
-                data_vec[n, oho, owo, ohi * HSTR + kh, owi * WSTR + kw, ic].astype(out_dtype)
-                * kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype),
-                axis=[ic, kh, kw],
-            ),
-            name="conv",
-        )
-
-    idiv = tvm.tir.indexdiv
-    imod = tvm.tir.indexmod
-    output = te.compute(
-        oshape,
-        lambda n, oho, owo, oc: conv[n][idiv(oho, OHI)][idiv(owo, OWI)][idiv(oc, OCI)][
-            imod(oho, OHI)
-        ][imod(owo, OWI)][imod(oc, OCI)],
-        name="output_unpack",
-        tag="spatial_conv_output_NHWC",
-    )
-    return output
-
-
-def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
-    """Spatial Pack schedule for Conv2d NHWC"""
-    unpack = op.output(0)
-    conv = unpack.op.input_tensors[0]
-    data_vec = conv.op.input_tensors[0]
-    kernel_vec = conv.op.input_tensors[1]
-    data_pad = data_vec.op.input_tensors[0]
-
-    OWI = cfg["tile_ow"].size[-1]
-    OCI = cfg["tile_co"].size[-1]
-
-    # schedule unpack/output
-    if output != unpack:
-        s[unpack].compute_inline()
-    n, oh, ow, oc = s[output].op.axis
-    oco, oci = cfg["tile_co"].apply(s, output, oc)
-    oho, ohi = cfg["tile_oh"].apply(s, output, oh)
-    owo, owi = cfg["tile_ow"].apply(s, output, ow)
-    s[output].reorder(n, oho, owo, oco, ohi, owi, oci)
-    cfg["ann_spatial"].apply(s, output, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg)
-
-    cfg.define_knob("compat", [0, 1])
-    compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
-    s[conv].compute_at(s[output], compat_axis)
-    paxis = s[output].fuse(n, oho)
-    s[output].parallel(paxis)
-
-    # schedule conv
-    n, oho, owo, oco, ohi, owi, oci = s[conv].op.axis
-    ic, kh, kw = s[conv].op.reduce_axis
-    cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci])
-    cfg["ann_reduce"].apply(
-        s,
-        conv,
-        [kh, kw],
-        axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)],
-        max_unroll=16,
-        cfg=cfg,
-    )
-    cfg["ann_spatial"].apply(s, conv, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg)
-
-    # schedule data_vec, data_pad and kernel_vec
-    compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
-    s[kernel_vec].compute_at(s[conv], compat_axis)
-    s[data_vec].compute_at(s[conv], compat_axis)
-
-    # Inlining kernel vec brings a performance improvement, but the tuner seems to not
-    # like it, so inline only when we are using the fallback config
-    if cfg.is_fallback:
-        s[kernel_vec].compute_inline()
-
-    if data_vec.op.name == "data_vec_undilated":
-        n, oho, owo, kh, kw, ic, ohi, owi = s[data_vec].op.axis
-    else:
-        n, oho, owo, ohi, owi, ic = s[data_vec].op.axis
-    s[data_pad].compute_at(s[data_vec], n)
-
-    return s
diff --git a/python/tvm/topi/arm_cpu/conv2d_transpose.py b/python/tvm/topi/arm_cpu/conv2d_transpose.py
deleted file mode 100644
index c9f1e1efddfc..000000000000
--- a/python/tvm/topi/arm_cpu/conv2d_transpose.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable
-"""Transposed 2D convolution operators (sometimes called Deconvolution)."""
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from ..nn import dilate, pad, get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
-from .conv2d_spatial_pack import schedule_conv2d_spatial_pack_nchw
-
-
-@autotvm.register_topi_compute("conv2d_transpose_nchw.arm_cpu")
-def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype, output_padding):
-    """Transposed 2D convolution nchw forward operator.
-
-    Parameters
-    ----------
-    Input : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    Filter : tvm.te.Tensor
-        4-D with shape [in_channel, num_filter, filter_height, filter_width]
-
-    strides : tuple of two ints
-        The spatial stride along height and width
-
-    padding : int or str
-        Padding size, or ['VALID', 'SAME']
-
-    out_dtype: str
-        The output data type. This is used for mixed precision.
-
-    output_padding : tuple of int
-        Used to get the right output shape in gradients
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    return _decl_spatial_pack(
-        cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2, output_padding
-    )
-
-
-def _decl_spatial_pack(
-    cfg, data, kernel, strides, padding, layout, out_dtype, num_tile, output_padding
-):
-    assert layout == "NCHW", "Only support NCHW"
-    out_dtype = out_dtype or data.dtype
-
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    if isinstance(N, tvm.tir.Any):
-        N = tvm.te.size_var("n")
-    if not isinstance(IH, int) or not isinstance(IW, int):
-        raise RuntimeError("ARM winograd conv2d doesn't support dynamic input height or width.")
-
-    _, CO, KH, KW = get_const_tuple(kernel.shape)
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    opad_h, opad_w = output_padding
-    assert opad_h < HSTR and opad_w < WSTR
-
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
-    bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom + opad_h
-    bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right + opad_w
-
-    OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH + opad_h
-    OW = (IW - 1) * WSTR - pad_left - pad_right + KW + opad_w
-
-    dilated_input = dilate(data, [1, 1, HSTR, WSTR])
-    data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right])
-
-    # ==================== define configuration space ====================
-    # TODO(@kevinthesun): Support tuning/optimization for dynamic shape.
-    n_tuning_axis = N if isinstance(N, int) else 1
-    n, co, oh, ow = cfg.axis(n_tuning_axis), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
-    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-
-    if num_tile == 2:  # for arm cpu
-        co, vc = cfg.define_split("tile_co", co, num_outputs=2)
-        oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2)
-        ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2)
-    elif num_tile == 3:  # for mali gpu
-        co, _, vc = cfg.define_split("tile_co", co, num_outputs=3)
-        oh, _, vh = cfg.define_split("tile_oh", oh, num_outputs=3)
-        ow, _, vw = cfg.define_split("tile_ow", ow, num_outputs=3)
-    else:
-        raise RuntimeError("Invalid num_tile")
-
-    cfg.define_reorder(
-        "reorder_0",
-        [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
-        policy="candidate",
-        candidate=[
-            [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
-            [n, co, oh, ow, ci, kh, kw, vc, vh, vw],
-        ],
-    )
-
-    cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll_vec")
-    # ====================================================================
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    dvshape = (N, OH // VH, OW // VW, CI, VH + KH - 1, VW + KW - 1)
-    kvshape = (CO // VC, CI, KH, KW, VC)
-    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (N, CO, OH, OW)
-
-    data_vec = te.compute(
-        dvshape,
-        lambda n, h, w, ci, vh, vw: data_pad[n][ci][h * VH + vh][w * VW + vw],
-        name="data_vec",
-    )
-
-    kernel_vec = te.compute(
-        kvshape,
-        lambda co, ci, kh, kw, vc: kernel[ci][co * VC + vc][kh][kw],
-        name="kernel_vec_conv2d_transpose",
-    )
-
-    ci = te.reduce_axis((0, CI), name="ci")
-    kh = te.reduce_axis((0, KH), name="kh")
-    kw = te.reduce_axis((0, KW), name="kw")
-
-    conv = te.compute(
-        ovshape,
-        lambda n, co, h, w, vh, vw, vc: te.sum(
-            data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype)
-            * kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype),
-            axis=[ci, kh, kw],
-        ),
-        name="conv",
-    )
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    output = te.compute(
-        oshape,
-        lambda n, co, h, w: conv[
-            n,
-            idxdiv(co, VC),
-            idxdiv(h, VH),
-            idxdiv(w, VW),
-            idxmod(h, VH),
-            idxmod(w, VW),
-            idxmod(co, VC),
-        ],
-        name="output_unpack",
-        tag="spatial_conv2d_transpose_output",
-    )
-    return output
-
-
-# register customized schedule for arm cpu.
-@autotvm.register_topi_schedule("conv2d_transpose_nchw.arm_cpu")
-def schedule_conv2d_transpose_nchw(cfg, outs):
-    """Schedule conv2d transpose for arm cpu"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "spatial_conv2d_transpose_output" in op.tag:
-            output = op.output(0)
-            conv = op.input_tensors[0]
-
-            data_vec = conv.op.input_tensors[0]
-            data_pad = data_vec.op.input_tensors[0]
-            dilated_input = data_pad.op.input_tensors[0]
-            s[data_pad].compute_inline()
-            s[dilated_input].compute_inline()
-
-            kernel_vec = conv.op.input_tensors[1]
-            if kernel_vec.op.name == "kernel_vec":
-                kernel = kernel_vec.op.input_tensors[0]
-            else:
-                kernel = kernel_vec
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/arm_cpu/dense.py b/python/tvm/topi/arm_cpu/dense.py
deleted file mode 100644
index 929413893b7b..000000000000
--- a/python/tvm/topi/arm_cpu/dense.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Dense schedule for ARM CPU"""
-from tvm import autotvm
-from .mprofile.dsp.dense import dense_dsp_schedule, dense_dsp_compute
-from .dense_gemm import dense_gemm_compute, dense_gemm_schedule
-
-
-@autotvm.register_topi_compute("dense_dsp.arm_cpu")
-def dense_dsp(cfg, data, weight, bias, out_dtype):
-    """Compute dense with DSP instructions."""
-    return dense_dsp_compute(cfg, data, weight, bias=bias, out_dtype=out_dtype)
-
-
-@autotvm.register_topi_schedule("dense_dsp.arm_cpu")
-def schedule_dense_dsp(cfg, outs):
-    """Create schedule for dense_dsp"""
-    return dense_dsp_schedule(cfg, outs)
-
-
-@autotvm.register_topi_compute("dense_gemm.arm_cpu")
-def dense_gemm(cfg, data, weight, bias, out_dtype, transpose_a=False, transpose_b=True):
-    """Compute dense using GeMM."""
-    return dense_gemm_compute(cfg, data, weight, bias, out_dtype, transpose_a, transpose_b)
-
-
-@autotvm.register_topi_schedule("dense_gemm.arm_cpu")
-def schedule_dense_gemm(cfg, outs):
-    """Create schedule for dense using GeMM."""
-    return dense_gemm_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/dense_alter_op.py b/python/tvm/topi/arm_cpu/dense_alter_op.py
deleted file mode 100644
index 973ab85d20f9..000000000000
--- a/python/tvm/topi/arm_cpu/dense_alter_op.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Dense alter op definitions for the `arm_cpu` device key."""
-
-import tvm
-from tvm import relay
-from tvm import autotvm
-from tvm import te
-
-from ..nn import dense_alter_layout
-
-
-@dense_alter_layout.register("arm_cpu")
-def _alter_dense(attrs, inputs, tinfos, out_type):
-    from tvm.relay.op.nn import _make  # pylint: disable=import-outside-toplevel
-
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-
-    _, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.dense"),
-        attrs,
-        tinfos,
-        out_type,
-        target,
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is None:
-        # The best implementation is not an AutoTVM template,
-        # we then assume it's not necessary to alter this op.
-        return None
-
-    cfg = dispatch_ctx.query(target, workload)
-    topi_impl = workload[0]
-
-    if topi_impl == "matmul.arm_cpu.sme":
-
-        weight_dtype = tinfos[1].dtype
-        N, K = tinfos[1].shape
-        encoded_weight = inputs[1]
-
-        # For dense the weights (rhs) are provided in transposed format,
-        # i.e. they are of the shape (n, k).
-        transpose_b = True
-
-        # The SME schedule expects the rhs to be in the format (k, n). We can do this
-        # transformation at compile time in the case of float32. Note: For the
-        # float16->float32 schedule the transformation currently happens at runtime
-        # with the ARM_SME_BLOCK2_2SVLx1SVL_FP16_TRANSPOSE_INTERLEAVE intrinsic.
-        if weight_dtype == "float32":
-            encoded_weight = relay.transpose(encoded_weight)
-            transpose_b = False
-
-        new_weight = te.placeholder(([K, N]), dtype=weight_dtype)
-
-        new_workload = autotvm.task.args_to_workload(
-            [tinfos[0], new_weight, None, out_type.dtype, False, transpose_b], topi_impl
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return _make.matmul(
-            inputs[0],
-            encoded_weight,
-            attrs.units,
-            attrs.out_dtype,
-            False,
-            transpose_b,
-        )
-    elif topi_impl == "dense_gemm.arm_cpu":
-
-        weight_dtype = tinfos[1].dtype
-        N, K = tinfos[1].shape
-
-        encoded_weight = relay.transpose(inputs[1])
-        new_weight = te.placeholder(([K, N]), dtype=weight_dtype)
-
-        new_workload = autotvm.task.args_to_workload(
-            [tinfos[0], new_weight, None, out_type.dtype, False, False], topi_impl
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return _make.matmul(
-            inputs[0],
-            encoded_weight,
-            attrs.units,
-            attrs.out_dtype,
-            False,
-            False,
-        )
-
-    # x86 schedules are used as a fallback
-    return tvm.topi.x86.dense_alter_op._alter_dense_layout(attrs, inputs, tinfos, out_type)
diff --git a/python/tvm/topi/arm_cpu/dense_gemm.py b/python/tvm/topi/arm_cpu/dense_gemm.py
deleted file mode 100644
index 316d5731c5f9..000000000000
--- a/python/tvm/topi/arm_cpu/dense_gemm.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, too-many-locals
-# pylint: disable=unused-argument, redefined-builtin
-"""GeMM dense schedule on AArch64"""
-import tvm
-from tvm import te
-from tvm.topi import nn
-from tvm.topi.arm_cpu.arm_utils import get_tiling_A, get_tiling_B_transformed, pad_dim_to_multiple
-from ..utils import get_const_tuple, traverse_inline
-from .. import tag
-
-# Compute function
-def dense_gemm_compute(
-    cfg, data, weight, bias=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """
-    Compute dense using GeMM.
-
-    Parameters
-    ----------
-    cfg : Autotvm tuning space config file,
-        empty in this case, but it's needed as an arg.
-
-    data : tvm.te.Tensor
-        2-D with shape [M, K] or [K, M].
-
-    weight : tvm.te.Tensor
-        2-D with shape [K, N] or [N, K].
-
-    bias : Optional[tvm.te.Tensor]
-        1-D with shape [N]
-
-
-    out_dtype : Optional[str]
-        Specifies the output data type.
-
-    transpose_a : Optional[bool] = False
-    Whether the data tensor is in transposed format.
-
-    transpose_b : Optional[bool] = True
-    Whether the weight tensor is in transposed format.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        1-D with shape [out_dim]
-    """
-
-    if out_dtype is None:
-        out_dtype = data.dtype
-    M, K = get_const_tuple(data.shape)  # batch, in_dim
-    if bool(transpose_b):  # out_dim
-        (N, _) = get_const_tuple(weight.shape)
-    else:
-        (_, N) = get_const_tuple(weight.shape)
-
-    tile_M, tile_K = get_tiling_A(False, out_dtype)
-    tile_N, _ = get_tiling_B_transformed(False, out_dtype, False)
-
-    M_padded, pad_M = pad_dim_to_multiple(M, tile_M)
-    K_padded, pad_K = pad_dim_to_multiple(K, tile_K)
-    N_padded, pad_N = pad_dim_to_multiple(N, tile_N)
-    m_pad_after = (pad_M, pad_K)
-    n_pad_after = (pad_N, pad_K) if transpose_b else (pad_K, pad_N)
-
-    if pad_M != 0 or pad_K != 0:
-        data = nn.pad(data, pad_before=(0, 0), pad_after=m_pad_after, name="data_padded")
-
-    k = te.reduce_axis((0, K_padded), name="k")
-
-    if bool(transpose_b):
-        weight = te.compute(
-            (K_padded, N_padded), lambda x, y: weight[y, x], name="weight_transposed"
-        )
-
-    if pad_N != 0 or pad_K != 0:
-        weight = nn.pad(weight, pad_before=(0, 0), pad_after=n_pad_after, name="weight_padded")
-
-    C = te.compute(
-        (M_padded, N_padded),
-        lambda x, y: te.sum(
-            data[x, k].astype(out_dtype) * weight[k, y].astype(out_dtype),
-            axis=k,
-        ).astype(out_dtype),
-        name="C",
-    )
-
-    if bias is not None:
-        C = te.compute(
-            (M_padded, N_padded),
-            lambda i, j: C[i, j] + bias[j].astype(out_dtype),
-            tag=tag.BROADCAST,
-            name="dense_biased_output",
-        )
-
-    # We need to ensure that infer bound pass does not remove the padding
-    # which is necessary for the tensorizations to work. So we need to
-    # add a dummy reference to the padding area of the result
-    zero = (
-        tvm.tir.const(1, C.dtype) * C[0, N_padded - 1]
-        - tvm.tir.const(1, C.dtype) * C[0, N_padded - 1]
-    )
-
-    out = te.compute(
-        (M, N), lambda x, y: (C[x, y] + zero).astype(out_dtype), name="dense_gemm_output"
-    )
-
-    return out
-
-
-def _dense_gemm_schedule(s, out):
-    C = out.op.input_tensors[0]
-    A = C.op.input_tensors[0]
-    out_type = A.dtype
-    tile_M, tile_K = get_tiling_A(False, out_type)
-    tile_N, _ = get_tiling_B_transformed(False, out_type, False)
-
-    if C.op.name == "dense_biased_output":
-        s[C].compute_inline()
-        C = C.op.input_tensors[0]
-    x, y = s[C].op.axis
-    (k,) = s[C].op.reduce_axis
-
-    k_outer, k_inner = s[C].split(k, factor=tile_K)
-    x_outer, x_inner = s[C].split(x, factor=tile_M)
-    y_outer, y_inner = s[C].split(y, factor=tile_N)
-    y_inner_outer, y_inner_inner = s[C].split(y_inner, nparts=4)
-    s[C].parallel(x_outer)
-    s[C].reorder(
-        x_outer,
-        y_outer,
-        k_outer,
-        k_inner,
-        y_inner_outer,
-        x_inner,
-        y_inner_inner,
-    )
-    s[C].unroll(y_inner_outer)
-    s[C].unroll(x_inner)
-    s[C].vectorize(y_inner_inner)
-
-    return s
-
-
-def dense_gemm_schedule(cfg, outs):
-    """Schedule the dense_gemm strategy"""
-    s = te.create_schedule([x.op for x in outs])
-    out = outs[0]
-    x, y = out.op.axis
-    _, inner = s[out].split(y, 4)
-    s[out].parallel(x)
-    s[out].vectorize(inner)
-
-    def _callback(op):
-        if "dense_gemm_output" in op.name:
-            _dense_gemm_schedule(s, op.output(0))
-
-    traverse_inline(s, out.op, _callback)
-    return s
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
deleted file mode 100644
index 59660e6bb90c..000000000000
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ /dev/null
@@ -1,721 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable
-"""Depthwise convolution schedule for ARM CPU"""
-
-import tvm
-from tvm.target import Target
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from .. import nn
-from ..utils import traverse_inline, get_const_tuple, get_const_int
-from ..nn.utils import get_pad_tuple
-from .tensor_intrin import smlal_int16_int32
-from .mprofile.dsp.depthwise_conv2d import (
-    depthwise_conv2d_nhwc_dsp_compute,
-    depthwise_conv2d_nhwc_dsp_schedule,
-)
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
-def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute depthwise_conv2d with NCHW layout"""
-    return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchw.arm_cpu")
-def schedule_depthwise_conv2d_nchw(cfg, outs):
-    """Schedule depthwise conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The configuration of this template
-    outs: Array of Tensor
-        The computation graph description of depthwise convolution2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(cfg, s, data, data_pad, kernel, output):
-        A, B, C = data, kernel, output
-        s[data_pad].compute_inline()
-
-        ##### space definition begin #####
-        n, c, h, w = s[output].op.axis
-        _, vc = cfg.define_split("tile_c", c, num_outputs=2)
-        _, vh = cfg.define_split("tile_h", h, num_outputs=2)
-        _, vw = cfg.define_split("tile_w", w, num_outputs=2)
-        cfg.define_annotate("ann", [vh, vw, vc], policy="try_unroll_vec")
-
-        # fallback support
-        if cfg.is_fallback:
-            ref_log = autotvm.tophub.load_reference_log(
-                "arm_cpu", "rk3399", "depthwise_conv2d_nchw.arm_cpu"
-            )
-            cfg.fallback_with_reference_log(ref_log)
-        ##### space definition end #####
-
-        # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
-        A0 = s.cache_read(data_pad, "global", C)
-        n, c, h, w = s[A0].op.axis
-        c, vc = cfg["tile_c"].apply(s, A0, c)
-        s[A0].reorder(n, c, h, w, vc)
-        A1 = s.cache_write(A0, "global")
-        s[A0].compute_inline()
-
-        # park kernel to vector form  [co, ci, kh, kw] -> [CO, ci, kh, kw, VC]
-        B0 = s.cache_read(B, "global", C)
-        c, m, h, w = s[B0].op.axis
-        c, vc, = cfg[
-            "tile_c"
-        ].apply(s, B0, c)
-        s[B0].reorder(c, m, h, w, vc)
-        B1 = s.cache_write(B0, "global")
-        s[B0].compute_inline()
-
-        n, c, h, w = s[C].op.axis
-        c, vc, = cfg[
-            "tile_c"
-        ].apply(s, C, c)
-        s[C].reorder(n, c, h, w, vc)
-
-        # depthwise conv
-        C0 = s.cache_write(C, "global")
-        _, c, h, w, vc = s[C0].op.axis
-        dh, dw = s[C0].op.reduce_axis
-        oh, ih = cfg["tile_h"].apply(s, C0, h)
-        ow, iw = cfg["tile_w"].apply(s, C0, w)
-        s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
-        s[A1].compute_at(s[C0], oh)
-
-        # try unroll and vectorization
-        cfg["ann"].apply(
-            s,
-            C0,
-            [ih, iw, vc],
-            axis_lens=[cfg["tile_h"].size[-1], cfg["tile_w"].size[-1], cfg["tile_c"].size[-1]],
-            max_unroll=16,
-            cfg=cfg,
-        )
-
-        # fusion
-        if C.op not in s.outputs:
-            s[C].compute_inline()
-
-        # mark parallel
-        last = outs[0]
-        n, c, h, w = s[last].op.axis
-        s[last].parallel(c)
-
-        n, c, h, w, vc = s[C0].op.axis
-        s[C0].parallel(c)
-
-        c, m, h, w, vc = s[B1].op.axis
-        s[B1].parallel(c)
-
-        return s
-
-    def _callback(op):
-        if op.tag == "depthwise_conv2d_nchw":
-            output = op.output(0)
-            kernel = op.input_tensors[1]
-            data = op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-            _schedule(cfg, s, data, data_pad, kernel, output)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-# TODO:
-# This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
-# Let us comment it out but not remove.
-# see discussion:
-# https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu
-@autotvm.register_topi_compute("depthwise_conv2d_nchw_spatial_pack.arm_cpu")
-def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """TOPI compute callback for depthwise_conv2d nchw
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    kernel : tvm.te.Tensor
-        4-D with shape [num_filter, multiplier, filter_height, filter_width] or
-        pre-packed 5-D with shape [num_filter_chunk, multiplier, filter_height,
-        filter_width, num_filter_block]
-
-    strides : list of two ints
-        [stride_height, stride_width]
-
-    padding : list of two ints
-        [pad_height, pad_width]
-
-    dilation : list of two ints
-        [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-
-    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2)
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nhwc.arm_cpu")
-def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, out_dtype):
-    """TOPI compute callback for depthwise_conv2d nhwc
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_height, in_width, in_channel]
-
-    kernel : tvm.te.Tensor
-        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]
-
-    strides : list of two ints
-        [stride_height, stride_width]
-
-    padding : list of two ints
-        [pad_height, pad_width]
-
-    dilation : list of two ints
-        [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_height, out_width, out_channel]
-    """
-    out_dtype = out_dtype or data.dtype
-
-    N, IH, IW, IC = get_const_tuple(data.shape)
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    KH, KW, IC, channel_multiplier = get_const_tuple(kernel.shape)
-
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-
-    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
-    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
-
-    if pad_top or pad_left or pad_down or pad_right:
-        data_pad = nn.pad(
-            data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], name="data_pad"
-        )
-    else:
-        data_pad = data
-
-    output_shape = (N, OH, OW, IC * channel_multiplier)
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    reduce_h = te.reduce_axis((0, KH), name="reduce_h")
-    reduce_w = te.reduce_axis((0, KW), name="reduce_w")
-
-    out = te.compute(
-        output_shape,
-        lambda n, h, w, c: te.sum(
-            data_pad[
-                n,
-                HSTR * h + dilation_h * reduce_h,
-                w * WSTR + reduce_w * dilation_w,
-                idxdiv(c, channel_multiplier),
-            ].astype(out_dtype)
-            * kernel[
-                reduce_h, reduce_w, idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier)
-            ].astype(out_dtype),
-            axis=[reduce_h, reduce_w],
-        ),
-        name="depthwise_conv2d_nhwc_output",
-    )
-    return out
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.arm_cpu")
-def schedule_depthwise_conv2d_nhwc(cfg, outs):
-    """Create the schedule for depthwise_conv2d_nchw_spatial_pack"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    out = outs[0]
-
-    ##### space definition begin #####
-    _, h, w, c = s[out].op.axis
-    # Split the number of input/output channels
-    cfg.define_split("tile_c", c, num_outputs=2, filter=lambda entry: entry.size[1] <= 8)
-    # Split the height of the convolution
-    cfg.define_split("tile_h", h, num_outputs=2)
-    # Split the width of the convolution
-    cfg.define_split("tile_w", w, num_outputs=2)
-    # Additional out (e.g., requantization, bias addition, etc..)
-    # 0: locate the output on the second last axis of the main compuation
-    # 1: locate the output closest to the main computation
-    cfg.define_knob("locate_output", [0, 1])
-    # Determine if we should unroll the computation of the inner tile
-    cfg.define_knob("unroll_tile", [True, False])
-
-    # fallback support
-    if cfg.is_fallback:
-        cfg["tile_c"] = SplitEntity([-1, 8])
-        cfg["tile_h"] = SplitEntity([-1, 2])
-        cfg["tile_w"] = SplitEntity([-1, 2])
-        cfg["locate_output"] = OtherOptionEntity(1)
-        cfg["unroll_tile"] = OtherOptionEntity(True)
-    ##### space definition end #####
-
-    def schedule_conv(conv):
-        conv_data = conv.op.input_tensors[0]
-        kernel_data = conv.op.input_tensors[1]
-        in_type = conv_data.dtype
-
-        _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)
-
-        n, w, h, c = conv.op.axis
-        r_h, r_w = conv.op.reduce_axis
-        ho, hi = cfg["tile_h"].apply(s, conv, h)
-        wo, wi = cfg["tile_w"].apply(s, conv, w)
-        co, ci = cfg["tile_c"].apply(s, conv, c)
-
-        split_val = cfg["tile_c"].size[-1]
-        target = Target.current(allow_none=False)
-        use_tensorization = (
-            (in_type == "int16")
-            and (split_val == 8)
-            and (IC % split_val == 0)
-            and (channel_multiplier == 1)
-            and target.features.has_asimd
-        )
-
-        data_pad_value = -1
-        if conv_data.name == "data_pad":
-            assert isinstance(conv_data.op, tvm.te.ComputeOp)
-            # Define a strategy for padding computation
-            cfg.define_knob("data_pad_strategy", [1, 2, 3])
-            if cfg.is_fallback:
-                # We cannot inline padding when tensorizing.
-                # So, if we can tensorize, let's compute_at the closest axis
-                cfg["data_pad_strategy"] = (
-                    OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
-                )
-            # Compute padding on the third to last axis of the computation
-            if cfg["data_pad_strategy"].val == 1:
-                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
-                s[conv_data].compute_at(s[conv], ho)
-            # Compute padding on the second to last axis of the computation
-            if cfg["data_pad_strategy"].val == 2:
-                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
-                s[conv_data].compute_at(s[conv], wo)
-            # Inline padding during computation
-            if cfg["data_pad_strategy"].val == 3:
-                s[conv_data].compute_inline()
-            data_pad_value = cfg["data_pad_strategy"].val
-
-        if use_tensorization and data_pad_value != 3:
-            smlal = smlal_int16_int32()
-            s[conv].tensorize(ci, smlal)
-        else:
-            s[conv].vectorize(ci)
-
-        if cfg["unroll_tile"].val:
-            s[conv].unroll(r_h)
-            s[conv].unroll(r_w)
-            s[conv].unroll(wi)
-            s[conv].unroll(hi)
-
-        s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
-        fused_n_ho = s[conv].fuse(n, ho)
-        return fused_n_ho
-
-    def schedule_conv_out(out):
-        n, h, w, c = out.op.axis
-        co, ci = cfg["tile_c"].apply(s, out, c)
-        wo, wi = cfg["tile_w"].apply(s, out, w)
-        ho, hi = cfg["tile_h"].apply(s, out, h)
-        s[out].reorder(n, ho, wo, co, hi, wi, ci)
-        if cfg["unroll_tile"]:
-            s[out].unroll(wi)
-            s[out].unroll(hi)
-
-        if out.dtype in ["int8", "uint8"]:
-            # In case of quantized convolution further split the channel in batches of 4 elements
-            # so that we can use arm intrinsics to run fixed_point_multiplication
-            ci_outer, ci_inner = s[out].split(ci, 4)
-            s[out].vectorize(ci_inner)
-            s[out].unroll(ci_outer)
-        else:
-            s[out].vectorize(ci)
-        fused_n_ho = s[out].fuse(n, ho)
-        return hi, wi, fused_n_ho
-
-    def _callback(op):
-        if op.name == "depthwise_conv2d_nhwc_output":
-            conv = op.output(0)
-            if conv != out:
-                hi, wi, p_axis = schedule_conv_out(out)
-                schedule_conv(conv)
-                if cfg["locate_output"].val == 0:
-                    s[conv].compute_at(s[out], hi)
-                if cfg["locate_output"].val == 1:
-                    s[conv].compute_at(s[out], wi)
-            else:
-                p_axis = schedule_conv(out)
-
-            s[out].parallel(p_axis)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchw_spatial_pack.arm_cpu")
-def schedule_depthwise_conv2d_nchw_spatial_pack(cfg, outs):
-    """Create the schedule for depthwise_conv2d_nchw_spatial_pack"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "spatial_depthwise_conv2d_nchw_output":
-            output = op.output(0)
-            conv = op.input_tensors[0]
-            data_vec = conv.op.input_tensors[0]
-            kernel_vec = conv.op.input_tensors[1]
-            if kernel_vec.op.name == "kernel_vec":
-                kernel = kernel_vec.op.input_tensors[0]
-            else:
-                kernel = kernel_vec
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile):
-    out_dtype = out_dtype or data.dtype
-
-    N, C, IH, IW = get_const_tuple(data.shape)
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    if len(kernel.shape) == 4:
-        pre_packed = False
-        C, M, KH, KW = get_const_tuple(kernel.shape)
-    else:  # kernel tensor is pre packed
-        pre_packed = True
-        C, M, KH, KW, VC = get_const_tuple(kernel.shape)
-        C = C * VC
-
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
-    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
-    # pack data
-    HPAD = pad_top + pad_down
-    WPAD = pad_left + pad_right
-    DOPAD = HPAD != 0 or WPAD != 0
-    if DOPAD:
-        data_pad = nn.pad(
-            data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), name="data_pad"
-        )
-    else:
-        data_pad = data
-
-    # fallback support
-    # Currently, Mali schedule doesn't use it like conv2d.
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(
-            "arm_cpu", "rk3399", "depthwise_conv2d_nchw_spatial_pack.arm_cpu"
-        )
-        cfg.fallback_with_reference_log(ref_log)
-
-    # ==================== define configuration space ====================
-    n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW)
-    kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-
-    # Currently, Mali schedule doesn't use it like conv2d.
-    # Leave num_tile for possible future use of Mali schedule
-    if num_tile == 2:  # for arm cpu
-        co, vc = cfg.define_split("tile_co", c, num_outputs=2)
-        oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2)
-        ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2)
-    else:
-        raise RuntimeError("Invalid num_tile")
-
-    cfg.define_reorder(
-        "reorder_0",
-        [n, co, oh, ow, kh, kw, vh, vw, vc],
-        policy="candidate",
-        candidate=[[n, co, oh, ow, kh, kw, vh, vw, vc], [n, co, oh, ow, kh, kw, vc, vh, vw]],
-    )
-
-    cfg.define_reorder(
-        "reorder_1",
-        [n, co, oh, ow, vh, vw, vc],
-        policy="candidate",
-        candidate=[
-            [n, co, oh, ow, vh, vw, vc],
-            [n, co, oh, ow, vc, vh, vw],
-            [n, co, oh, ow, vh, vc, vw],
-        ],
-    )
-
-    cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll_vec")
-    # ====================================================================
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    kvshape = (C // VC, M, KH, KW, VC)
-    ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (N, C * M, OH, OW)
-
-    if dilation_h != 1 or dilation_w != 1:
-        # undilate input data
-        dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW)
-        data_vec = te.compute(
-            dvshape,
-            lambda n, h, w, c, kh, kw, vh, vw: data_pad[n][c][
-                (h * VH + vh) * HSTR + kh * dilation_h
-            ][(w * VW + vw) * WSTR + kw * dilation_w],
-            name="data_vec_undilated",
-        )
-    else:
-        dvshape = (N, OH // VH, OW // VW, C, VH * HSTR + KH - 1, VW * WSTR + KW - 1)
-        data_vec = te.compute(
-            dvshape,
-            lambda n, h, w, c, vh, vw: data_pad[n][c][h * VH * HSTR + vh][w * VW * WSTR + vw],
-            name="data_vec",
-        )
-
-    if pre_packed:
-        kernel_vec = kernel
-    else:
-        kernel_vec = te.compute(
-            kvshape, lambda co, m, kh, kw, vc: kernel[co * VC + vc][m][kh][kw], name="kernel_vec"
-        )
-
-    kh = te.reduce_axis((0, KH), name="kh")
-    kw = te.reduce_axis((0, KW), name="kw")
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    if dilation_h != 1 or dilation_w != 1:
-        conv = te.compute(
-            ovshape,
-            lambda n, co, h, w, vh, vw, vc: te.sum(
-                data_vec[n, h, w, idxdiv(co * VC + vc, M), kh, kw, vh, vw].astype(out_dtype)
-                * kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype),
-                axis=[kh, kw],
-            ),
-            name="depthwise_conv",
-        )
-    else:
-        conv = te.compute(
-            ovshape,
-            lambda n, co, h, w, vh, vw, vc: te.sum(
-                data_vec[n, h, w, idxdiv((co * VC + vc), M), vh * HSTR + kh, vw * WSTR + kw].astype(
-                    out_dtype
-                )
-                * kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype),
-                axis=[kh, kw],
-            ),
-            name="depthwise_conv",
-        )
-
-    output = te.compute(
-        oshape,
-        lambda n, co, h, w: conv[
-            n,
-            idxdiv(co, VC),
-            idxdiv(h, VH),
-            idxdiv(w, VW),
-            idxmod(h, VH),
-            idxmod(w, VW),
-            idxmod(co, VC),
-        ],
-        name="output_unpack",
-        tag="spatial_depthwise_conv2d_nchw_output",
-    )
-    return output
-
-
-def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, last):
-    """schedule implementation"""
-    n, co, oh, ow, vh, vw, vc = s[conv].op.axis
-    kh, kw = s[conv].op.reduce_axis
-
-    if data_vec.op.name == "data_vec_undilated":
-        _, dv_oh, dv_ow, dv_c, _, _, dv_vh, dv_vw = s[data_vec].op.axis
-    else:
-        _, dv_oh, dv_ow, dv_c, dv_vh, dv_vw = s[data_vec].op.axis
-
-    data_pad = data_vec.op.input_tensors[0]
-    if data_pad.op.name == "data_pad":
-        assert isinstance(data_pad.op, tvm.te.ComputeOp)
-        has_padding = True
-    else:
-        assert isinstance(data_pad.op, tvm.te.PlaceholderOp)
-        has_padding = False
-
-    cfg.define_knob("data_pad_inline", [0, 1, 2, 3, 4])
-
-    if cfg["data_pad_inline"].val == 1 and has_padding:
-        s[data_pad].compute_inline()
-    if cfg["data_pad_inline"].val == 2 and has_padding:
-        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
-    if cfg["data_pad_inline"].val == 3 and has_padding:
-        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
-        s[data_pad].compute_at(s[data_vec], dv_oh)
-    if cfg["data_pad_inline"].val == 4 and has_padding:
-        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
-        s[data_pad].compute_at(s[data_vec], dv_ow)
-
-    cfg.define_knob("data_vec_inline", [0, 1, 2, 3])
-    if cfg["data_vec_inline"].val == 1:
-        s[data_vec].compute_at(s[conv], oh)
-    if cfg["data_vec_inline"].val == 2:
-        s[data_vec].compute_at(s[conv], ow)
-    if cfg["data_vec_inline"].val == 3:
-        s[data_vec].compute_at(s[conv], co)
-
-    # schedule conv
-    cfg["reorder_0"].apply(s, conv, [n, co, oh, ow, kh, kw, vh, vw, vc])
-    cfg["ann_reduce"].apply(
-        s,
-        conv,
-        [kh, kw],
-        axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)],
-        max_unroll=16,
-        cfg=cfg,
-    )
-    cfg["ann_spatial"].apply(
-        s,
-        conv,
-        [vh, vw, vc],
-        axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]],
-        max_unroll=16,
-        cfg=cfg,
-    )
-
-    # schedule fusion
-    n, co, h, w = s[last].op.axis
-    co, vc = cfg["tile_co"].apply(s, last, co)
-    oh, vh = cfg["tile_oh"].apply(s, last, h)
-    ow, vw = cfg["tile_ow"].apply(s, last, w)
-    cfg["reorder_1"].apply(s, last, [n, co, oh, ow, vh, vw, vc])
-    if last != output:
-        s[output].compute_inline()
-        cfg["ann_spatial"].apply(
-            s,
-            last,
-            [vh, vw, vc],
-            axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]],
-            max_unroll=16,
-            cfg=cfg,
-        )
-    else:
-        s[last].vectorize(vw)
-    cfg.define_knob("conv_inline", [0, 1, 2, 3])
-    if cfg["conv_inline"].val == 1:
-        s[conv].compute_at(s[last], ow)
-    if cfg["conv_inline"].val == 2:
-        s[conv].compute_at(s[last], oh)
-    if cfg["conv_inline"].val == 3:
-        s[conv].compute_at(s[last], co)
-
-    # mark parallel
-    s[last].parallel(co)
-
-    if data_vec.op.name == "data_vec_undilated":
-        _, h, _, _, _, _, _, _ = s[data_vec].op.axis
-    else:
-        _, h, _, _, _, _ = s[data_vec].op.axis
-    s[data_vec].parallel(h)
-
-    if kernel_vec.op.name == "kernel_vec":
-        co, _, _, _, _ = s[kernel_vec].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[kernel_vec].pragma(co, "debug_skip_region")
-        else:
-            s[kernel_vec].parallel(co)
-
-    return s
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nhwc_dsp.arm_cpu")
-def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d_nhwc with v7e-m DSP instructions."""
-    return depthwise_conv2d_nhwc_dsp_compute(
-        cfg, data, kernel, strides, padding, dilation, out_dtype
-    )
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_dsp.arm_cpu")
-def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
-    """Create schedule for conv2d_nhwc_dsp"""
-    return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
deleted file mode 100644
index 81b2c7260f05..000000000000
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
-
-import tvm
-from tvm import autotvm
-from tvm import te
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from ..utils import get_const_tuple
-from ..nn.pad import pad
-from .. import tag
-
-from ..nn.conv2d import _get_workload as _get_conv2d_workload
-
-
-def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
-    """Compute group_conv2d with NCHW layout"""
-    return group_conv2d_nchw_spatial_pack(
-        data, kernel, strides, padding, dilation, groups, out_dtype
-    )
-
-
-def schedule_group_conv2d_nchw(outs):
-    """Compute group_conv2d with NCHW layout"""
-    return schedule_group_conv2d_nchwc(outs)
-
-
-def _get_default_config(
-    cfg, data, kernel, strides, padding, dilation, groups, out_dtype, layout="NCHW"
-):
-    """
-    Get default schedule config for the workload
-    """
-    static_data_shape = []
-    for dim in get_const_tuple(data.shape):
-        if isinstance(dim, tvm.tir.Var):
-            static_data_shape.append(1)
-        else:
-            static_data_shape.append(dim)
-    data = te.placeholder(static_data_shape, dtype=data.dtype)
-
-    wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout)
-    _fallback_schedule(cfg, wkl)
-
-
-def _fallback_schedule(cfg, wkl):
-    simd_width = 4  # assume ARM SIMD Width is 4
-    pad_left, pad_right = wkl.padl, wkl.padr
-    stride_w = wkl.stride_w
-    out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1
-    groups = wkl.groups
-    kernels_per_group = wkl.out_filter // groups
-    kernel_depth = wkl.in_filter // groups
-
-    oc_bn = 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if kernels_per_group % bn == 0:
-            oc_bn = bn
-            break
-    if oc_bn > kernels_per_group:
-        oc_bn = kernels_per_group
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if kernel_depth % bn == 0:
-            ic_bn = bn
-            break
-    if ic_bn > kernel_depth:
-        ic_bn = kernel_depth
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-@autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu")
-def group_conv2d_nchw_spatial_pack(
-    cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
-):
-    """
-    Compute group conv2d with NCHW layout, using GSPC algorithm.
-    https://arxiv.org/abs/2006.09791
-    """
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(dilation, int):
-        dilation_h, dilation_w = dilation, dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
-    if isinstance(padding, int):
-        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
-    elif len(padding) == 2:
-        hpad, wpad = padding
-        pad_top, pad_bottom = hpad, hpad
-        pad_left, pad_right = wpad, wpad
-    else:
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-    hpad = pad_top + pad_bottom
-    wpad = pad_left + pad_right
-
-    assert isinstance(strides, int) or len(strides) == 2
-    if isinstance(strides, int):
-        stride_h, stride_w = strides, strides
-    else:
-        stride_h, stride_w = strides
-
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
-
-    pad_height = in_height + pad_top + pad_bottom
-    pad_width = in_width + pad_left + pad_right
-
-    dilated_kernel_h = (k_height - 1) * dilation_h + 1
-    dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
-
-    kernels_per_group = out_channel // groups
-
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
-    cfg.define_knob("unroll_kw", [True, False])
-
-    # If no config was set, we can fallback to default config.
-    if cfg.is_fallback:
-        _get_default_config(
-            cfg,
-            te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
-            te.placeholder(
-                (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype
-            ),
-            strides,
-            padding,
-            dilation,
-            groups,
-            out_dtype,
-        )
-
-    oc_bn = cfg["tile_oc"].size[-1]
-    ic_bn = cfg["tile_ic"].size[-1]
-
-    # pack data
-    DOPAD = hpad != 0 or wpad != 0
-    if DOPAD:
-        data_pad = pad(
-            data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad"
-        )
-    else:
-        data_pad = data
-
-    shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width)
-
-    data_vec = te.compute(
-        shape,
-        lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
-        name="data_vec",
-    )
-
-    # pack kernel
-    shape = (
-        groups,
-        kernels_per_group // oc_bn,
-        kernel_depth // ic_bn,
-        k_height,
-        k_width,
-        ic_bn,
-        oc_bn,
-    )
-
-    kernel_vec = te.compute(
-        shape,
-        lambda g, out_channel, in_channel, h, w, ci, co: kernel[
-            (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w
-        ],
-        name="kernel_vec",
-    )
-
-    # convolution
-    oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn)
-    unpack_shape = (batch_size, out_channel, out_height, out_width)
-
-    ic = te.reduce_axis((0, (kernel_depth)), name="ic")
-    kh = te.reduce_axis((0, k_height), name="kh")
-    kw = te.reduce_axis((0, k_width), name="kw")
-
-    idxmod = tvm.tir.indexmod
-    idxdiv = tvm.tir.indexdiv
-
-    conv = te.compute(
-        oshape,
-        lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(
-            data_vec[
-                g,
-                n,
-                idxdiv(ic, ic_bn),
-                oh * stride_h + kh * dilation_h,
-                idxmod(ic, ic_bn),
-                ow * stride_w + kw * dilation_w,
-            ].astype(out_dtype)
-            * kernel_vec[
-                g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block
-            ].astype(out_dtype),
-            axis=[ic, kh, kw],
-        ),
-        name="conv",
-    )
-
-    unpack = te.compute(
-        unpack_shape,
-        lambda n, c, h, w: conv[
-            idxdiv(c, kernels_per_group),
-            n,
-            idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
-            h,
-            w,
-            idxmod(idxmod(c, oc_bn), kernels_per_group),
-        ].astype(out_dtype),
-        name="output_unpack",
-        tag="group_conv2d_nchw",
-    )
-
-    return unpack
-
-
-@autotvm.register_topi_schedule("group_conv2d_nchw.arm_cpu")
-def schedule_group_conv2d_nchwc(cfg, outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-
-        if "group_conv2d_nchw" in op.tag:
-            output = op.output(0)
-
-            if "tile_ic" not in cfg:
-                return
-            conv_out = op.input_tensors[0]
-            kernel_vec = conv_out.op.input_tensors[1]
-            kernel = kernel_vec.op.input_tensors[0]
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-            data_vec = conv_out.op.input_tensors[0]
-            data = data_vec.op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
-            _schedule_gspc_nchw(*args)
-
-        scheduled_ops.append(op)
-
-    traverse(outs[0].op)
-    return s
-
-
-def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
-    """Schedule GSPC"""
-    ic_bn, oc_bn, reg_n, unroll_kw = (
-        cfg["tile_ic"].size[-1],
-        cfg["tile_oc"].size[-1],
-        cfg["tile_ow"].size[-1],
-        cfg["unroll_kw"].val,
-    )
-
-    _, W = data, kernel_vec
-    A0, A1 = data_pad, data_vec
-
-    # schedule data
-    if (
-        data_pad is not None
-        and isinstance(data_pad.op, tvm.te.ComputeOp)
-        and "pad" in data_pad.op.tag
-    ):
-        s[A0].compute_inline()
-
-    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
-
-    parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
-    s[A1].parallel(parallel_axis)
-
-    # schedule kernel pack
-    groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
-    s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-
-    if oc_bn > 1:
-        s[W].vectorize(oc_block)
-
-    parallel_axis = s[W].fuse(groups, oc_chunk, oh)
-    s[W].parallel(parallel_axis)
-
-    # schedule conv
-    C, O0, O = conv_out, output, last
-    CC = s.cache_write(C, "global")
-
-    _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-
-    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
-
-    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-    s[C].fuse(oc_chunk, oh)
-    s[C].vectorize(oc_block)
-
-    groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-
-    ic, kh, kw = s[CC].op.reduce_axis
-    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
-
-    if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
-        s[CC].unroll(kw)
-    else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
-
-    parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
-    s[CC].parallel(parallel_axis)
-
-    s[CC].vectorize(oc_block)
-
-    s[CC].unroll(ow_block)
-
-    if O0 != O:
-        s[O0].compute_inline()
-
-    batch, oc, oh, ow = s[O].op.axis
-    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-
-    s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block)
-    parallel_axis = s[O].fuse(oc_chunk, oh)
-    s[O].vectorize(oc_block)
-    s[O].parallel(parallel_axis)
-    return s
diff --git a/python/tvm/topi/arm_cpu/injective.py b/python/tvm/topi/arm_cpu/injective.py
deleted file mode 100644
index fbc071092503..000000000000
--- a/python/tvm/topi/arm_cpu/injective.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable
-"""Schedule for pooling operators"""
-import tvm
-from tvm import te
-from ..utils import is_empty_shape
-
-
-def schedule_injective_from_existing(sch, out):
-    """Schedule for injective op from existing schedule.
-
-    Parameters
-    ----------
-    sch: Schedule
-         The schedule to update.
-    out: Tensor
-         The tensor representing the injective op.
-
-    Returns
-    -------
-    sch: Schedule
-         The updated schedule.
-    """
-    if len(sch[out].op.axis) >= 4:
-        fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1], sch[out].op.axis[2])
-        sch[out].parallel(fused)
-    elif len(sch[out].op.axis) >= 3:
-        fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1])
-        sch[out].parallel(fused)
-    elif len(sch[out].op.axis) >= 2:
-        sch[out].parallel(sch[out].op.axis[0])
-    return sch
-
-
-def schedule_injective(outs):
-    """ARM CPU schedule for injective op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    x = outs[0]
-
-    if list(s[x].op.axis):
-        # do not vectorize for broadcast
-        dtype = "uint16" if x.dtype == "bfloat16" else x.dtype
-        itemsize = max(1, tvm.DataType(dtype).bits // 8)
-        (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // itemsize)
-        s[x].vectorize(ii)
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    if not is_empty_shape(x.shape):
-        schedule_injective_from_existing(s, x)
-    return s
-
-
-def schedule_concatenate(outs):
-    """Schedule for concatenate op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of concatenate in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    x = outs[0]
-    tvm.te.schedule.AutoInlineInjective(s)
-    if len(s[x].op.axis) >= 4:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 3:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 2:
-        s[x].parallel(s[x].op.axis[0])
-    return s
diff --git a/python/tvm/topi/arm_cpu/matmul.py b/python/tvm/topi/arm_cpu/matmul.py
deleted file mode 100644
index 63f6289f0eb7..000000000000
--- a/python/tvm/topi/arm_cpu/matmul.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-argument
-
-"""Matmul schedules for the `arm_cpu` device key."""
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.script import tir as T
-from tvm.topi import nn
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.arm_cpu.pstate_attributes import SMEAttributes
-from tvm.topi.arm_cpu.arm_utils import pad_dim_to_multiple
-from tvm.dlight.base.analysis import normalize_prim_func
-
-
-@autotvm.register_topi_compute("matmul.arm_cpu.sme")
-def compute_matmul_sme(cfg, data_a, data_b, _, out_dtype, transpose_a=False, transpose_b=True):
-    """
-    SME Matmul compute definition.
-    """
-    assert bool(transpose_a) is False, "Transposed lhs not currently supported."
-    if data_b.dtype == "float16":
-        assert bool(transpose_b) is True, "Rhs must be transposed when dtype is float16."
-
-    M, K = get_const_tuple(data_a.shape)
-    if transpose_b:
-        N = get_const_tuple(data_b.shape)[0]
-    else:
-        N = get_const_tuple(data_b.shape)[1]
-
-    if not out_dtype:
-        out_dtype = data_a.dtype
-
-    tile_m = 2 * tvm.tir.get_vscale_expr(data_a.dtype)
-    tile_k = tvm.tir.get_vscale_expr(data_a.dtype)
-    if data_a.dtype == "float32":
-        tile_k *= 2
-    tile_n = 2 * tvm.tir.get_vscale_expr(data_a.dtype)
-
-    if data_a.dtype == "float16":
-        _, pad_M = pad_dim_to_multiple(M, tile_m)
-        _, pad_K = pad_dim_to_multiple(K, tile_k)
-        _, pad_N = pad_dim_to_multiple(N, tile_n)
-        m_pad_after = (pad_M, pad_K)
-        n_pad_after = (pad_N, pad_K) if transpose_b else (pad_K, pad_N)
-        if pad_M != 0:
-            data_a = nn.pad(data_a, pad_before=(0, 0), pad_after=m_pad_after)
-        if pad_N != 0:
-            data_b = nn.pad(data_b, pad_before=(0, 0), pad_after=n_pad_after)
-
-    if out_dtype is None:
-        out_dtype = data_a.dtype
-
-    k = te.reduce_axis((0, K), name="k")
-
-    def compute(*indices):
-        i, j = indices[-2:]
-        a_indices = (k, i) if transpose_a else (i, k)
-        b_indices = (j, k) if transpose_b else (k, j)
-        return te.sum(
-            data_a[a_indices].astype(out_dtype) * data_b[b_indices].astype(out_dtype), axis=k
-        )
-
-    compute_name = {
-        (True, True): "T_matmul_TT",
-        (True, False): "T_matmul_TN",
-        (False, True): "T_matmul_NT",
-        (False, False): "T_matmul_NN",
-    }[(transpose_a, transpose_b)]
-
-    return te.compute(
-        (M, N),
-        compute,
-        name=compute_name,
-        attrs={"schedule_type": "sme"},
-    )
-
-
-def tir_schedule_matmul_sme(sch):
-    """
-    SME STIR Matmul schedule.
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm.tir.tensor_intrin.arm_cpu import (
-        ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA,
-        ARM_SME_INIT,
-        get_sme_gemm_interleaved_mopa_2svlx2svl_intrin,
-        get_transpose_interleave_intrin_name,
-    )
-
-    main_func = sch.mod["main"]
-    data_handle = main_func.params[0]
-    in_dtype = main_func.buffer_map[data_handle].dtype
-    out_dtype = "float32"
-
-    block_infos = normalize_prim_func(sch)
-    reduction_block_infos = [block_info for block_info in block_infos if block_info.is_reduction()]
-    assert len(reduction_block_infos) == 1, "Expected a single gemm reduction block."
-    gemm_block = reduction_block_infos[0].block_rv
-    gemm_block_name = sch.get(gemm_block).name_hint
-    transpose = gemm_block_name.split("_")[-1]
-    transpose_b = transpose[1] == "T"
-
-    m, n, k = sch.get_loops(gemm_block)
-
-    extent_m = sch.get(m).extent
-    extent_k = sch.get(k).extent
-    extent_n = sch.get(n).extent
-
-    if in_dtype == "float16":
-        tile_m = T.cast(2 * tvm.tir.get_vscale_expr(in_dtype), extent_m.dtype)
-        tile_k = T.cast(tvm.tir.get_vscale_expr(in_dtype), extent_k.dtype)
-        tile_n = T.cast(2 * tvm.tir.get_vscale_expr(in_dtype), extent_n.dtype)
-    else:
-        tile_m = T.cast(2 * tvm.tir.get_vscale_expr(in_dtype), extent_m.dtype)
-        tile_k = T.cast(2 * tvm.tir.get_vscale_expr(in_dtype), extent_k.dtype)
-        tile_n = T.cast(2 * tvm.tir.get_vscale_expr(in_dtype), extent_n.dtype)
-
-    # Interleave the input utilizing the matrix tile
-    interleave_a_block = sch.cache_read(gemm_block, 0, "global")
-    sch.transform_layout(interleave_a_block, ("write", 0), lambda m, k: (k, m))
-    m, k = sch.get_loops(interleave_a_block)
-    outer_m, inner_m = sch.split(m, factors=(None, tile_m), disable_predication=True)
-    outer_k, inner_k = sch.split(k, factors=(None, tile_k), disable_predication=True)
-    sch.reorder(outer_k, outer_m, inner_k, inner_m)
-    sch.tensorize(
-        inner_k, get_transpose_interleave_intrin_name(in_dtype, out_dtype, extent_m, extent_k)
-    )
-
-    # Interleave the weights utilizing the matrix tile
-    if transpose_b:
-        interleave_b_block = sch.cache_read(gemm_block, 1, "global")
-        sch.transform_layout(interleave_b_block, ("write", 0), lambda n, k: (k, n))
-        n, k = sch.get_loops(interleave_b_block)
-        outer_k, inner_k = sch.split(k, factors=(None, tile_k), disable_predication=True)
-        outer_n, inner_n = sch.split(n, factors=(None, tile_n), disable_predication=True)
-        sch.reorder(outer_k, outer_n, inner_k, inner_n)
-        sch.tensorize(
-            inner_k, get_transpose_interleave_intrin_name(in_dtype, out_dtype, extent_k, extent_n)
-        )
-
-    # Split and reorder the loops of the GeMM for tensorization
-    tile_m = T.cast(2 * tvm.tir.get_vscale_expr(out_dtype), extent_m.dtype)
-    tile_n = T.cast(2 * tvm.tir.get_vscale_expr(out_dtype), extent_n.dtype)
-    m, n, k = sch.get_loops(gemm_block)
-    outer_m, inner_m = sch.split(m, factors=(None, tile_m), disable_predication=True)
-    outer_n, inner_n = sch.split(n, factors=(None, tile_n), disable_predication=True)
-    sch.reorder(outer_m, outer_n, inner_m, inner_n, k)
-
-    # Tensorize the GeMM initialization
-    init_block = sch.decompose_reduction(gemm_block, inner_m)
-    sch.tensorize(sch.get_loops(init_block)[-2], ARM_SME_INIT)
-
-    # Tensorize the GeMM update
-    sme_gemm_interleaved_intrin_name = (
-        ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA + f"_{extent_m}_{extent_k}_{in_dtype}"
-    )
-    tvm.tir.TensorIntrin.register(
-        sme_gemm_interleaved_intrin_name,
-        *get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(extent_m, extent_k, in_dtype),
-        override=True,
-    )
-    sch.tensorize(inner_m, sme_gemm_interleaved_intrin_name)
-
-    # Add pstate annotations
-    root_block = sch.get_block("root")
-    sch.annotate(
-        root_block, SMEAttributes.STREAMING_MODE, SMEAttributes.StreamingModeValues.ENABLED
-    )
-    sch.annotate(root_block, SMEAttributes.ZA_STORAGE, SMEAttributes.ZAStorageValues.NEW)
diff --git a/python/tvm/topi/arm_cpu/mprofile/__init__.py b/python/tvm/topi/arm_cpu/mprofile/__init__.py
deleted file mode 100644
index 32ce4d3a5447..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Schedules specialized for cortex-m DSP instructions."""
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py b/python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py
deleted file mode 100644
index 13a83393a912..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py
deleted file mode 100644
index 521a58d0c1fc..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Direct implementation of conv1d."""
-from tvm import autotvm
-from tvm.autotvm.task import deserialize_args
-from tvm import te
-from tvm.topi.utils import simplify, traverse_inline
-from tvm.topi.nn.pad import pad
-from tvm.topi.nn.utils import get_pad_tuple1d
-from tvm.tir.expr import Mul
-
-from .micro_kernel.gemm import (
-    intrin_gemm_MxKxN,
-    gemm_MxKxN_impl,
-)
-
-
-def conv1d_nwc_dsp(*args, **kwargs):
-    """Defines the v7e-m DSP instructions of conv1d on NWC layout."""
-    assert not kwargs, "Do not support kwargs in template function call"
-    args = deserialize_args(args)
-    data, kernel = args[:2]
-    layout = args[-2]
-    cfg = autotvm.get_config()
-    args = [cfg] + args
-    assert layout == "NWC"
-    conv = conv1d_nwc_dsp_compute(*args)
-    sched = conv1d_nwc_dsp_schedule(cfg, [data, kernel, conv])
-    return sched, [data, kernel, conv]
-
-
-conv1d_nwc_dsp.template_key = "dsp"
-conv1d_nwc_dsp.default_data_layout = "NWC"
-conv1d_nwc_dsp.default_kernel_layout = "WOI"
-
-
-def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute function for v7e-m DSP instructions of conv1d on NWC layout."""
-    if isinstance(strides, (tuple, list)):
-        strides = strides[0]
-    if isinstance(dilation, (tuple, list)):
-        dilation = dilation[0]
-
-    batch_size, data_width, in_channels = data.shape
-    kernel_size, out_channels, _ = kernel.shape
-
-    # Compute the output shape
-    dilated_kernel_size = (kernel_size - 1) * dilation + 1
-    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
-    out_channels = simplify(out_channels)
-    out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)
-
-    # Apply padding
-    pad_before = [0, pad_left, 0]
-    pad_after = [0, pad_right, 0]
-    padded_data = pad(data, pad_before, pad_after, name="padded_data")
-
-    # Compute graph
-    rc = te.reduce_axis((0, in_channels), name="rc")
-    rw = te.reduce_axis((0, kernel_size), name="rw")
-
-    conv = te.compute(
-        (batch_size, out_width, out_channels),
-        lambda b, w, c: te.sum(
-            padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype)
-            * kernel[rw, c, rc].astype(out_dtype),
-            axis=[rw, rc],
-        ),
-        name="conv1d",
-        tag="conv1d_nwc",
-    )
-
-    ###########################
-    # Config Space Definition #
-    ###########################
-    n, ow, co = (
-        cfg.axis(batch_size.value),
-        cfg.axis(out_width.value),
-        cfg.axis(out_channels.value),
-    )
-    kw, ci = (
-        cfg.reduce_axis(kernel_size.value),
-        cfg.reduce_axis(in_channels.value),
-    )
-
-    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
-    cio, cii = cfg.define_split(
-        "tile_ci",
-        ci,
-        policy="factors",
-        num_outputs=2,
-        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
-        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
-    )
-    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)
-
-    cfg.define_reorder(
-        "reorder_0_simd",
-        [n, owo, owi, coo, coi, kw, cio, cii],
-        policy="candidate",
-        candidate=[
-            [n, kw, owo, coo, cio, owi, coi, cii],
-            [n, kw, coo, owo, cio, owi, coi, cii],
-            [n, kw, owo, coo, cio, owi, coi, cii],
-            [n, kw, coo, owo, cio, owi, coi, cii],
-        ],
-    )
-
-    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
-    cfg.define_knob("unroll_explicit", [0, 1])
-
-    if cfg.is_fallback:
-        cfg.fallback_split("tile_ow", [-1, out_width.value])
-        cfg.fallback_split("tile_ci", [-1, in_channels.value])
-        cfg.fallback_split("tile_co", [-1, out_channels.value])
-
-    return conv
-
-
-def conv1d_nwc_dsp_schedule(cfg, outs):
-    """Schedule function for v7e-m DSP instructions of conv1d on NWC layout."""
-    sched = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv1d_nwc" not in op.tag:
-            return
-
-        # extract tensors
-        output = op.output(0)
-        conv = op
-        data_vec = conv.input_tensors[0]
-
-        source_index_w = output.op.body[0].source[0].a.value.indices[1].a
-        stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1
-
-        # tile reduction axes
-        n, ow, co = sched[conv].op.axis
-        kw, ci = sched[conv].op.reduce_axis
-
-        M = cfg["tile_ow"].size[-1]
-        K = cfg["tile_ci"].size[-1]
-        N = cfg["tile_co"].size[-1]
-
-        owo, owi = cfg["tile_ow"].apply(sched, conv, ow)
-        cio, cii = cfg["tile_ci"].apply(sched, conv, ci)
-        coo, coi = cfg["tile_co"].apply(sched, conv, co)
-
-        cfg["reorder_0_simd"].apply(sched, conv, [n, owo, owi, coo, coi, kw, cio, cii])
-
-        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w)
-        sched[output].tensorize(owi, gemm)
-        sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))
-
-        # this is the scope to attach global config inside this kernel
-        kernel_scope = n
-
-        # tune unroll
-        sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-        sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    traverse_inline(sched, outs[-1].op, _callback)
-    return sched
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py
deleted file mode 100644
index 470d46b92a7a..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Direct implementation of conv2d."""
-
-from tvm import autotvm
-from tvm.autotvm.task import deserialize_args
-from tvm import te
-from tvm.topi.utils import simplify, traverse_inline
-from tvm.topi.nn.pad import pad
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.tir.expr import Mul
-
-from .micro_kernel.gemm import (
-    intrin_gemm_MxKxN,
-    gemm_MxKxN_impl,
-)
-
-
-def conv2d_nhwc_dsp(*args, **kwargs):
-    """Defines the v7e-m DSP instructions of conv2d."""
-    assert not kwargs, "Do not support kwargs in template function call"
-    args = deserialize_args(args)
-    data, kernel = args[:2]
-    layout = args[-2]
-    cfg = autotvm.get_config()
-    args = [cfg] + args
-    assert layout == "NHWC"
-    conv = conv2d_nhwc_dsp_compute(*args)
-    sched = conv2d_nhwc_dsp_schedule(cfg, [data, kernel, conv])
-    return sched, [data, kernel, conv]
-
-
-conv2d_nhwc_dsp.template_key = "dsp"
-conv2d_nhwc_dsp.default_data_layout = "NHWC"
-conv2d_nhwc_dsp.default_kernel_layout = "HWOI"
-
-
-def conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute function for v7e-m DSP instructions of conv2d."""
-    assert isinstance(strides, int) or len(strides) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(strides, int):
-        stride_h = stride_w = strides
-    else:
-        stride_h, stride_w = strides
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch_size, in_height, in_width, in_channels = data.shape
-    kernel_h, kernel_w, out_channels, _ = kernel.shape
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-
-    pad_before = [0, pad_top, pad_left, 0]
-    pad_after = [0, pad_down, pad_right, 0]
-    padded_data = pad(data, pad_before, pad_after, name="padded_data")
-
-    rc = te.reduce_axis((0, in_channels), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    conv = te.compute(
-        (batch_size, out_height, out_width, out_channels),
-        lambda nn, yy, xx, ff: te.sum(
-            padded_data[
-                nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc
-            ].astype(out_dtype)
-            * kernel[ry, rx, ff, rc].astype(out_dtype),
-            axis=[ry, rx, rc],
-        ),
-        name="conv2d",
-        tag="conv2d_nhwc",
-    )
-
-    ###########################
-    # Config Space Definition #
-    ###########################
-    n, oh, ow, co = (
-        cfg.axis(batch_size.value),
-        cfg.axis(out_height.value),
-        cfg.axis(out_width.value),
-        cfg.axis(out_channels.value),
-    )
-    kh, kw, ci = (
-        cfg.reduce_axis(kernel_h.value),
-        cfg.reduce_axis(kernel_w.value),
-        cfg.reduce_axis(in_channels.value),
-    )
-
-    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
-    cio, cii = cfg.define_split(
-        "tile_ci",
-        ci,
-        policy="factors",
-        num_outputs=2,
-        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
-        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
-    )
-    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)
-
-    cfg.define_reorder(
-        "reorder_0_simd",
-        [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
-        policy="candidate",
-        candidate=[
-            [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
-            [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
-            [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
-            [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
-        ],
-    )
-
-    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
-    cfg.define_knob("unroll_explicit", [0, 1])
-
-    if cfg.is_fallback:
-        cfg.fallback_split("tile_ow", [-1, out_width.value])
-        cfg.fallback_split("tile_ci", [-1, in_channels.value])
-        cfg.fallback_split("tile_co", [-1, out_channels.value])
-
-    return conv
-
-
-def conv2d_nhwc_dsp_schedule(cfg, outs):
-    """Schedule function for v7e-m DSP instructions of conv2d."""
-    sched = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nhwc" not in op.tag:
-            return
-
-        # extract tensors
-        output = op.output(0)
-        conv = op
-        data_vec = conv.input_tensors[0]
-        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
-        last = outs[0]  # pylint: disable=unused-variable
-
-        source_index_w = output.op.body[0].source[0].a.value.indices[2].a
-        stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1
-
-        # tile reduction axes
-        n, oh, ow, co = sched[conv].op.axis
-        kh, kw, ci = sched[conv].op.reduce_axis
-
-        M = cfg["tile_ow"].size[-1]
-        K = cfg["tile_ci"].size[-1]
-        N = cfg["tile_co"].size[-1]
-
-        owo, owi = cfg["tile_ow"].apply(sched, conv, ow)
-        cio, cii = cfg["tile_ci"].apply(sched, conv, ci)
-        coo, coi = cfg["tile_co"].apply(sched, conv, co)
-
-        cfg["reorder_0_simd"].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])
-
-        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w)
-        sched[output].tensorize(owi, gemm)
-        sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))
-
-        # this is the scope to attach global config inside this kernel
-        kernel_scope = n
-
-        # tune unroll
-        sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-        sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    traverse_inline(sched, outs[-1].op, _callback)
-    return sched
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py
deleted file mode 100644
index 123ad1ce3c6b..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Direct implementation of dense."""
-
-from tvm import te
-from tvm.topi.utils import traverse_inline, get_const_tuple
-
-from .micro_kernel.gemm import (
-    intrin_gemm_MxKxN,
-    gemm_MxKxN_impl,
-)
-from .... import tag
-
-
-def dense_dsp_compute(cfg, data, weight, bias=None, out_dtype=None):
-    """Defines the v7e-m DSP instructions of dense."""
-    M, K = get_const_tuple(data.shape)
-    N, _ = get_const_tuple(weight.shape)
-
-    cfg.define_split("tile_x", M, policy="factors", num_outputs=2)
-    cfg.define_split("tile_y", N, policy="factors", num_outputs=2)
-    cfg.define_split("tile_k", K, policy="factors", num_outputs=2)
-
-    k = te.reduce_axis((0, K), "k")
-    C = te.compute(
-        (M, N),
-        lambda x, y: te.sum(
-            data[x, k].astype(out_dtype) * weight[y, k].astype(out_dtype),
-            axis=k,
-        ),
-        name="dense",
-        tag="dense_dsp",
-    )
-
-    if bias is not None:
-        C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
-    return C
-
-
-def dense_dsp_schedule(cfg, outs):
-    """Schedule function for v7e-m DSP instructions of dense."""
-    sched = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "dense" not in op.tag:
-            return
-
-        output = op.output(0)
-        dense = op
-
-        data = dense.input_tensors[0]
-
-        M = cfg["tile_x"].size[-1]
-        N = cfg["tile_y"].size[-1]
-        K = cfg["tile_k"].size[-1]
-
-        x, y = sched[dense].op.axis
-        k = sched[dense].op.reduce_axis[0]
-
-        x_o, x_i = cfg["tile_x"].apply(sched, dense, x)
-        y_o, y_i = cfg["tile_y"].apply(sched, dense, y)
-        k_o, k_i = cfg["tile_k"].apply(sched, dense, k)
-
-        sched[dense].reorder(x_o, y_o, k_o, x_i, y_i, k_i)
-
-        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data.dtype, output.dtype, stride_w=1)
-        sched[output].tensorize(x_i, gemm)
-        sched[output].pragma(x_o, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))
-
-    traverse_inline(sched, outs[-1].op, _callback)
-    return sched
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
deleted file mode 100644
index b8da15dadf13..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""ARM Cortex-M DSP schedule for depthwise_conv2d"""
-
-import random
-import string
-
-from tvm import te, topi
-from tvm.topi.utils import traverse_inline
-from tvm.topi.nn.pad import pad
-
-from .micro_kernel.multi_channel_convolve import (
-    intrin_multi_channel_convolve,
-    multi_channel_convolve_impl,
-)
-from .micro_kernel.common import num_simd_lanes_per_word
-
-
-def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute function for v7e-m DSP instructions of DepthwiseConv2D. Has a lot of requirements
-    for use - if not all apply, the fallback implementation will be used instead."""
-    assert isinstance(strides, int) or len(strides) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(strides, int):
-        stride_h = stride_w = strides
-    else:
-        stride_h, stride_w = strides
-
-    # We do not support dilation currently. It would be possible, but it would require
-    # modifying the way the kernel is packed. Gnarly.
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    assert dilation_h == dilation_w == 1
-
-    batch_size, height, width, channels = data.shape
-    kernel_h, kernel_w, _, _ = kernel.shape
-    simd_lanes = num_simd_lanes_per_word(data.dtype)
-
-    # We don't support different numbers of input and output channels.
-    assert channels == kernel.shape[2]
-    assert kernel.shape[3] == 1
-
-    # We take in int8 as our dtype, but we spit out int32. This is because we cannot
-    # round until we compute activations.
-    assert out_dtype == "int32"
-
-    # Padding the data requires COPYING THE ENTIRE INPUT TENSOR, which
-    # is slow and bad. We should really implement a strip mining
-    # routine to avoid this, but TVM has terrible support for that.
-
-    if padding == "SAME":
-        # This assumption makes the logic easier. Could be removed with work.
-        assert height % stride_h == width % stride_w == 0
-
-        output_h = height // stride_h
-        output_w = width // stride_w
-
-        # This padding behavior is consistent with other TVM depthwise_conv2d schedules. However it
-        # differs from the TensorFlow, which only pads the bottom right if stride > 1. This probably
-        # brings down accuracy slightly for models imported from TFLite.
-        pad_down = 1 if stride_h == 1 else 0
-        pad_right = 1 if stride_w == 1 else 0
-
-        padded_data = pad(
-            data,
-            [0, kernel_h // 2, kernel_w // 2, 0],
-            [0, pad_down, pad_right, 0],
-            name="padded_data",
-        )
-
-    elif padding == "VALID":
-        assert height > kernel_h and width > kernel_w
-        output_h = (height - kernel_h) // stride_h + 1
-        output_w = (width - kernel_w) // stride_w + 1
-        padded_data = data
-
-    elif isinstance(padding, tuple):
-        if len(padding) == 2:
-            pad_up, pad_down = padding[0]
-            pad_left, pad_right = padding[1]
-        else:
-            pad_up, pad_left, pad_down, pad_right = padding
-
-        output_h = (height - kernel_h + pad_up + pad_down) // stride_h + 1
-        output_w = (width - kernel_w + pad_left + pad_right) // stride_w + 1
-        padded_data = pad(
-            data,
-            [0, pad_up, pad_left, 0],
-            [0, pad_down, pad_right, 0],
-            name="padded_data",
-        )
-
-    else:
-        raise RuntimeError()
-    _, padded_h, padded_w, _ = padded_data.shape
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-    reshaped_kernel = topi.reshape(kernel, (channels // simd_lanes, kernel_h, kernel_w, simd_lanes))
-    return te.compute(
-        (batch_size, output_h, output_w, channels),
-        lambda h, i, j, k: te.sum(
-            padded_data[h, (i * stride_h) + kh_i, (j * stride_w) + kw_i, k].astype("int32")
-            * reshaped_kernel[k // simd_lanes, kh_i, kw_i, k % simd_lanes].astype("int32"),
-            axis=(kh_i, kw_i),
-        ),
-        name="depthwise_conv2d",
-        tag=f"depthwise_conv2d_nhwc_{padded_h}_{padded_w}_dsp",
-    )
-
-
-def depthwise_conv2d_nhwc_dsp_schedule(_cfg, outs):
-
-    """Schedule function for v7e-m DSP instructions of conv2d."""
-    schedule = te.create_schedule([x.op for x in outs])
-
-    def _callback(operator):
-        if "depthwise_conv2d_nhwc" not in operator.tag:
-            return
-
-        # extract tensors
-        output = operator.output(0)
-        padded_data = output.op.input_tensors[0]
-        reshaped_kernel = output.op.input_tensors[1]
-        in_dtype = padded_data.dtype
-
-        _, padded_h, padded_w, channels = padded_data.shape
-        _, kernel_h, kernel_w, _ = reshaped_kernel.shape
-        suffix = "".join(random.choices(string.ascii_uppercase, k=8))
-
-        b_ax, y_ax, x_ax, c_ax = schedule[output].op.axis
-        ky_ax, kx_ax = schedule[output].op.reduce_axis
-        simd_lanes = num_simd_lanes_per_word(in_dtype)
-        c_ax_o, c_ax_i = schedule[output].split(c_ax, factor=simd_lanes)
-        schedule[output].reorder(b_ax, c_ax_o, y_ax, x_ax, ky_ax, kx_ax, c_ax_i)
-
-        multi_channel_convolve = intrin_multi_channel_convolve(
-            in_dtype, padded_h, padded_w, channels, kernel_h, kernel_w, suffix
-        )
-        schedule[output].tensorize(ky_ax, multi_channel_convolve)
-        schedule[output].pragma(
-            b_ax,
-            "import_c",
-            multi_channel_convolve_impl(
-                in_dtype, padded_h, padded_w, channels, kernel_h, kernel_w, suffix
-            ),
-        )
-
-    traverse_inline(schedule, outs[-1].op, _callback)
-    return schedule
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py
deleted file mode 100644
index 13a83393a912..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py
deleted file mode 100644
index 3eb32d8fdb16..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Defines sum intrinsics for sum operation with v7e-m DSP instructions."""
-
-import random
-import string
-
-import tvm
-from tvm import te
-from . import common
-
-
-def intrin_sum(shape, in_dtype, out_dtype, reset=False):
-    """Defines a v7e-m DSP-accelerated sum operation."""
-    UNIQ_ID_LEN = 8
-    uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
-    func_prefix = "sum16"
-
-    assert in_dtype == "int16"
-    assert out_dtype == "int16"
-
-    width = shape[-1]
-    x = te.placeholder(shape, name="x", dtype=in_dtype)
-    k = te.reduce_axis((0, width), name="rc")
-
-    def get_slice(indices, k):
-        s = list(indices)
-        s[-1] = s[-1] + k
-        return tuple(s)
-
-    z = te.compute(
-        (1,) * len(shape), lambda *i: te.sum(x[get_slice(i, k)], axis=[k]).astype(out_dtype)
-    )
-
-    def _intrin_func(ins, outs):
-        aa = ins[0]
-        cc = outs[0]
-
-        def _body():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "int32",
-                    f"{func_prefix}_{width}_{uniq_id}",
-                    aa.access_ptr("r"),
-                    cc.access_ptr("w"),
-                    aa.elem_offset,
-                    1 if reset else 0,
-                )
-            )
-            return ib.get()
-
-        def _reduce_reset():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern("int32", f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w"))
-            )
-            return ib.get()
-
-        def _reduce_update():
-            return _body()
-
-        return _body(), _reduce_reset(), _reduce_update()
-
-    binds = {
-        t: tvm.tir.decl_buffer(
-            t.shape,
-            t.dtype,
-            t.op.name,
-            strides=[te.var(f"{t.op.name}_s_{i}") for i in range(0, len(t.shape))],
-            offset_factor=1,
-        )
-        for t in [x, z]
-    }
-
-    intrin_decl = te.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
-    return intrin_decl, uniq_id
-
-
-def sum_impl(N, uniq_id):
-    """Emit C code for sum impl."""
-    cc_code = (
-        common.common_includes
-        + f"""
-
-#ifdef __cplusplus
-extern "C"
-#endif // __cplusplus
-__attribute__((always_inline)) static inline int32_t sum16_reset_{uniq_id}(
-    int16_t *res) {{
-  *res = (int16_t)0;
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t sum16_{N}_{uniq_id}(
-    int16_t *arr,
-    int16_t *res16,
-    int32_t arr_offset,
-    int32_t reset) {{
-  int n;
-  int32_t *p32;
-  int32_t res = reset ? 0 : *res16;
-
-  if ( arr_offset % 4 != 0 ) {{
-    res += *arr;
-    p32 = (int32_t *)(&arr[1]);
-    n = {N} - 1;
-  }} else {{
-    p32 = (int32_t *)arr;
-    n = {N};
-  }}
-
-  for ( int i = 0; i < n / 2; ++ i ) {{
-    res = __smlad(*p32, 0x00010001, res);
-    ++ p32;
-  }}
-
-  if ( n % 2 != 0 )
-    res += *(int16_t *)p32;
-
-  *res16 = res;
-
-  return 0;
-}}
-
-"""
-    )
-    return cc_code
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
deleted file mode 100644
index e89bf7c1b4fc..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Defines common C code for all microkernel operations."""
-
-
-common_includes = """
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <arm_acle.h>
-
-#include <tvm/runtime/crt/error_codes.h>
-
-
-#ifndef ARM_CPU_INTRINSICS_EXIST
-#define ARM_CPU_INTRINSICS_EXIST
-__attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2)
-{
-  op2 %= 32U;
-  if (op2 == 0U)
-  {
-    return op1;
-  }
-  return (op1 >> op2) | (op1 << (32U - op2));
-}
-
-#define __pkhbt(ARG1,ARG2,ARG3) \
-__extension__ \
-({                          \
-  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
-  __asm("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
-  __RES; \
- })
-
-#define __pkhtb(ARG1,ARG2,ARG3) \
-__extension__ \
-({                          \
-  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
-  if (ARG3 == 0) \
-    __asm("pkhtb %0, %1, %2" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2)  ); \
-  else \
-    __asm("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
-  __RES; \
- })
-#endif
-"""
-
-MICRO_WORD_LENGTH_BITS = 32
-
-
-def num_simd_lanes_per_word(dtype: str) -> int:
-    """Takes a dtype, and returns how many of that dtype fit into a single microcontroller word.
-
-    >>> num_simd_lanes_per_word("int8")
-    4
-    >>> num_simd_lanes_per_word("int16")
-    2
-    """
-    assert dtype.startswith("int")
-    dtype_width = int(dtype[3:])
-    return MICRO_WORD_LENGTH_BITS // dtype_width
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
deleted file mode 100644
index e26e818fbd7e..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
+++ /dev/null
@@ -1,587 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter, f-string-without-interpolation
-"""Defines gemm intrinsics for matrix multiplication with v7e-m DSP instructions."""
-
-import random
-import string
-
-import tvm
-from tvm import te
-from . import common
-
-
-##########################
-# MxKxN MatMul Intrinsic #
-##########################
-
-# NOTE this is transposed matmul (A * B^T)
-def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype, stride_w=1):
-    """Defines a v7e-m DSP-accelerated transposed matmul."""
-    # we generate a unique ID for every intrinsic definition, to prevent name
-    # collisions in the generated source (e.g., if there are multiple operators
-    # in the same module that use the same intrinsic)
-    #
-    # TODO(weberlo, areusch): to cut down on memory usage, we should cache each intrinsic
-    # instantiation and include it only once, eliminating the need for unique
-    # IDs
-    UNIQ_ID_LEN = 8
-    uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
-
-    if isinstance(M, tvm.tir.IntImm):
-        M = M.value
-    if isinstance(K, tvm.tir.IntImm):
-        K = K.value
-    if isinstance(N, tvm.tir.IntImm):
-        N = N.value
-    # TODO(weberlo, areusch): support more dtypes?
-    assert in_dtype in ("int8", "int16")
-    assert out_dtype == "int32"
-    A = te.placeholder((M * stride_w - (stride_w - 1), K), name="a", dtype=in_dtype)
-    B = te.placeholder((N, K), name="b", dtype=in_dtype)
-    k = te.reduce_axis((0, K), name="k")
-    C = te.compute(
-        (M, N),
-        lambda i, j: te.sum(
-            A[i * stride_w, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k
-        ),
-        name="c",
-    )
-    A_buf = tvm.tir.decl_buffer(
-        A.shape, A.dtype, name="A", offset_factor=1, strides=[te.var("A_s"), 1]
-    )
-    B_buf = tvm.tir.decl_buffer(
-        B.shape, B.dtype, name="B", offset_factor=1, strides=[te.var("B_s"), 1]
-    )
-    C_buf = tvm.tir.decl_buffer(
-        C.shape, C.dtype, name="C", offset_factor=1, strides=[te.var("C_s"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        aa, bb = ins
-        cc = outs[0]
-        gemm_func_prefix = "gemm" if in_dtype == "int8" else "gemm16"
-
-        def _reduce_update():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "int32",
-                    f"{gemm_func_prefix}_{M}x{K}x{N}_update_{uniq_id}",
-                    aa.access_ptr("r"),
-                    bb.access_ptr("r"),
-                    cc.access_ptr("w"),
-                    aa.strides[0] * stride_w,
-                    bb.strides[0],
-                    cc.strides[0],
-                )
-            )
-            return ib.get()
-
-        def _reduce_reset():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "int32", f"gemm_{M}x{K}x{N}_reset_{uniq_id}", cc.access_ptr("w"), cc.strides[0]
-                )
-            )
-            return ib.get()
-
-        def _body():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "int32",
-                    f"{gemm_func_prefix}_{M}x{K}x{N}_body_{uniq_id}",
-                    aa.access_ptr("r"),
-                    bb.access_ptr("r"),
-                    cc.access_ptr("w"),
-                    aa.strides[0] * stride_w,
-                    bb.strides[0],
-                    cc.strides[0],
-                )
-            )
-            return ib.get()
-
-        return _body(), _reduce_reset(), _reduce_update()
-
-    intrin_decl = te.decl_tensor_intrin(C.op, intrin_func, binds={A: A_buf, B: B_buf, C: C_buf})
-    return intrin_decl, uniq_id
-
-
-def gemm_MxKxN_impl(M, K, N, uniq_id):
-    """Emit C code for gemm impl."""
-    # TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly?
-    # aa_pad_size = M * K
-    bb_pad_size = N * K
-    # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)
-    cc_code = (
-        common.common_includes
-        + f"""
-#ifndef ARM_CPU_MPROFILE_READ_AND_PAD_EXISTS
-#define ARM_CPU_MPROFILE_READ_AND_PAD_EXISTS
-__attribute__((always_inline)) static inline const int8_t *read_and_pad(const int8_t *source, int32_t *out1, int32_t *out2)
-{{
-    int32_t inA;
-    memcpy(&inA, source, 4);
-    source += 4;
-
-    int32_t inAbuf1 = __sxtb16(__ror((uint32_t)inA, 8));
-    int32_t inAbuf2 = __sxtb16(inA);
-    *out2 = (int32_t)(__pkhtb(inAbuf1, inAbuf2, 16));
-    *out1 = (int32_t)(__pkhbt(inAbuf2, inAbuf1, 16));
-
-    return source;
-}}
-#endif
-"""
-        + f"""
-
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{N}_body_rest_{uniq_id}(
-    int32_t K_arg,
-    int8_t *aa, int8_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int K = K_arg;
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int k_base = (K / 4) * 4;
-  switch ( K % 4 ) {{
-  case 1:
-    for (int i = 0; i < {M}; i++) {{
-      for (int j = 0; j < {N}; j++) {{
-        int8_t *a_ptr = &aa[i * A_stride + k_base];
-        int8_t *b_ptr = &bb[j * B_stride + k_base];
-        cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
-      }}
-    }}
-    break;
-  case 2:
-    for (int i = 0; i < {M}; i++) {{
-      for (int j = 0; j < {N}; j++) {{
-        int8_t *a_ptr = &aa[i * A_stride + k_base];
-        int8_t *b_ptr = &bb[j * B_stride + k_base];
-        cc[i * C_stride + j] =   (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
-                               + (int32_t) a_ptr[1] * (int32_t) b_ptr[1];
-      }}
-    }}
-    break;
-  case 3:
-    for (int i = 0; i < {M}; i++) {{
-      for (int j = 0; j < {N}; j++) {{
-        int8_t *a_ptr = &aa[i * A_stride + k_base];
-        int8_t *b_ptr = &bb[j * B_stride + k_base];
-        cc[i * C_stride + j] =   (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
-                               + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]
-                               + (int32_t) a_ptr[2] * (int32_t) b_ptr[2];
-      }}
-    }}
-    break;
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}(
-    int8_t *aa, int8_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int32_t sum = 0;
-      for (int l = 0; l < {K}; l++) {{
-        sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
-      }}
-      // NOTE: this is the line where `*_body` differs from `*_update`. here
-      // we're *setting* the result, instead of accumulating, because we know
-      // the `i` and `j` itervars span their entire respective axes.
-      cc[i*C_stride + j] = sum;
-    }}
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
-    int8_t *aa, int8_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int16_t bb_pad[{bb_pad_size}];
-  int32_t retcode = 0;
-
-  if ( {M} < 2 && {N} < 2 ) {{
-    retcode = gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
-    goto out;
-  }}
-
-  for (int i = 0; i < {N}; i++)
-    for (int j = 0; j < {K} / 4; j++)
-      read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
-
-  for (int i = 0; i < {M}; i++) {{
-    int16_t aa_pad_line[{K}];
-    for (int l = 0; l < {K} / 4; l++)
-      read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4], (int32_t*) &aa_pad_line[l*4 + 2]);
-
-    for (int j = 0; j < {N}; j++) {{
-      int32_t *aa_ptr = (int32_t *) aa_pad_line;
-      int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}];
-      int32_t sum = 0;
-      for (int l = 0; l < 2 * ({K} / 4); l++) {{
-        sum = __smlad(*aa_ptr, *bb_ptr, sum);
-        ++ aa_ptr; ++ bb_ptr;
-      }}
-      // NOTE: this is the line where `*_body` differs from `*_update`. here
-      // we're *setting* the result, instead of accumulating, because we know
-      // the `i` and `j` itervars span their entire respective axes.
-      cc[i*C_stride + j] = sum;
-    }}
-  }}
-
-  if ( {K} % 4 != 0 )
-    gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride);
-
-out:
-  return retcode;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{N}_update_rest_{uniq_id}(
-    int32_t K_arg,
-    int8_t *aa, int8_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int K = K_arg;
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int k_base = (K / 4) * 4;
-  switch ( K % 4 ) {{
-  case 1:
-    for (int i = 0; i < {M}; i++) {{
-      for (int j = 0; j < {N}; j++) {{
-        int8_t *a_ptr = &aa[i * A_stride + k_base];
-        int8_t *b_ptr = &bb[j * B_stride + k_base];
-        cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
-      }}
-    }}
-    break;
-  case 2:
-    for (int i = 0; i < {M}; i++) {{
-      for (int j = 0; j < {N}; j++) {{
-        int8_t *a_ptr = &aa[i * A_stride + k_base];
-        int8_t *b_ptr = &bb[j * B_stride + k_base];
-        cc[i * C_stride + j] +=   (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
-                                + (int32_t) a_ptr[1] * (int32_t) b_ptr[1];
-      }}
-    }}
-    break;
-  case 3:
-    for (int i = 0; i < {M}; i++) {{
-      for (int j = 0; j < {N}; j++) {{
-        int8_t *a_ptr = &aa[i * A_stride + k_base];
-        int8_t *b_ptr = &bb[j * B_stride + k_base];
-        cc[i * C_stride + j] +=   (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
-                                + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]
-                                + (int32_t) a_ptr[2] * (int32_t) b_ptr[2];
-      }}
-    }}
-    break;
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}(
-    int8_t *aa, int8_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int32_t sum = 0;
-      for (int l = 0; l < {K}; l++) {{
-        sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
-      }}
-      cc[i*C_stride + j] += sum;
-    }}
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
-    int8_t *aa, int8_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int16_t bb_pad[{bb_pad_size}];
-  int32_t retcode = 0;
-
-  if ( {M} < 2 && {N} < 2 ) {{
-    retcode = gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
-    goto out;
-  }}
-
-  for (int i = 0; i < {N}; i++)
-    for (int j = 0; j < {K} / 4; j++)
-      read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
-
-  for (int i = 0; i < {M}; i++) {{
-    int16_t aa_pad_line[{K}];
-    for (int l = 0; l < {K} / 4; l++)
-      read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4], (int32_t*) &aa_pad_line[l*4 + 2]);
-
-    for (int j = 0; j < {N}; j++) {{
-      int32_t *aa_ptr = (int32_t *) aa_pad_line;
-      int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}];
-      int32_t sum = 0;
-      for (int l = 0; l < 2 * ({K} / 4); l++) {{
-        sum = __smlad(*aa_ptr, *bb_ptr, sum);
-        ++ aa_ptr; ++ bb_ptr;
-      }}
-      cc[i*C_stride + j] += sum;
-    }}
-  }}
-
-  if ( {K} % 4 != 0 )
-    gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride);
-
-out:
-  return retcode;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm16_{M}x{N}_body_rest_{uniq_id}(
-    int32_t K_arg,
-    int16_t *aa, int16_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int K = K_arg;
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int k_base = (K / 2) * 2;
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int16_t *a_ptr = &aa[i * A_stride + k_base];
-      int16_t *b_ptr = &bb[j * B_stride + k_base];
-      cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
-    }}
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(
-    int16_t *aa, int16_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int32_t sum = 0;
-      for (int l = 0; l < {K}; l++) {{
-        sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
-      }}
-      // NOTE: this is the line where `*_body` differs from `*_update`. here
-      // we're *setting* the result, instead of accumulating, because we know
-      // the `i` and `j` itervars span their entire respective axes.
-      cc[i*C_stride + j] = sum;
-    }}
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}(
-    int16_t *aa, int16_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int32_t retcode = 0;
-
-  if ( {M} < 2 && {N} < 2 ) {{
-    retcode = gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
-    goto out;
-  }}
-
-  if(((uint32_t)aa & 0x3) != 0 || ((uint32_t)bb & 0x3) != 0){{
-    retcode = kTvmErrorFunctionCallInvalidArg;
-    goto out;
-  }}
-
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int32_t aa_vector[{K} / 2];
-      int32_t bb_vector[{K} / 2];
-      memcpy(&aa_vector, &aa[i * A_stride], sizeof(aa_vector));
-      memcpy(&bb_vector, &bb[j * B_stride], sizeof(bb_vector));
-
-      int32_t sum = 0;
-      for (int l = 0; l < {K} / 2; l++) {{
-        sum = __smlad(aa_vector[l], bb_vector[l], sum);
-      }}
-      // NOTE: this is the line where `*_body` differs from `*_update`. here
-      // we're *setting* the result, instead of accumulating, because we know
-      // the `i` and `j` itervars span their entire respective axes.
-      cc[i*C_stride + j] = sum;
-    }}
-  }}
-
-  if ( {K} % 2 != 0 )
-    gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride);
-
-out:
-  return retcode;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm16_{M}x{N}_update_rest_{uniq_id}(
-    int32_t K_arg,
-    int16_t *aa, int16_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int K = K_arg;
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int k_base = (K / 2) * 2;
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int16_t *a_ptr = &aa[i * A_stride + k_base];
-      int16_t *b_ptr = &bb[j * B_stride + k_base];
-      cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
-    }}
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(
-    int16_t *aa, int16_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int32_t sum = 0;
-      for (int l = 0; l < {K}; l++) {{
-        sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
-      }}
-      cc[i*C_stride + j] += sum;
-    }}
-  }}
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}(
-    int16_t *aa, int16_t *bb, int32_t *cc,
-    int32_t A_stride_arg, int32_t B_stride_arg, int32_t C_stride_arg) {{
-  int A_stride = A_stride_arg;
-  int B_stride = B_stride_arg;
-  int C_stride = C_stride_arg;
-
-  int32_t retcode = 0;
-
-  if ( {M} < 2 && {N} < 2 ) {{
-    retcode = gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
-    goto out;
-  }}
-
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      int32_t aa_vector[{K} / 2];
-      int32_t bb_vector[{K} / 2];
-      memcpy(&aa_vector, &aa[i * A_stride], sizeof(aa_vector));
-      memcpy(&bb_vector, &bb[j * B_stride], sizeof(bb_vector));
-
-      int32_t sum = 0;
-      for (int l = 0; l < {K} / 2; l++) {{
-        sum = __smlad(aa_vector[l], bb_vector[l], sum);
-      }}
-      cc[i*C_stride + j] += sum;
-    }}
-  }}
-
-  if ( {K} % 2 != 0 )
-    gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride);
-
-out:
-  return retcode;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int32_t C_stride) {{
-  for (int i = 0; i < {M}; i++) {{
-    for (int j = 0; j < {N}; j++) {{
-      cc[i*C_stride + j] = 0;
-    }}
-  }}
-  return 0;
-}}
-
-"""
-    )
-    return cc_code
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py
deleted file mode 100644
index cfed417c9fe7..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Defines max intrinsics for elemwise max operation with v7e-m DSP instructions."""
-
-import random
-import string
-
-import tvm
-from tvm import te
-from . import common
-
-
-def intrin_max(shape, in_dtype, out_dtype):
-    """Defines a v7e-m DSP-accelerated max pool."""
-    UNIQ_ID_LEN = 8
-    uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
-    func_prefix = "max8"
-
-    assert in_dtype == "int8"
-    assert out_dtype == "int8"
-
-    x = te.placeholder(shape, name="x", dtype=in_dtype)
-    k = te.reduce_axis((0, 1), name="rc")
-    z = te.compute(shape, lambda *i: tvm.tir.max(x[i], axis=[k]).astype(out_dtype))
-
-    def _intrin_func(ins, outs):
-        aa = ins[0]
-        cc = outs[0]
-
-        def _body():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "int32",
-                    f"{func_prefix}_{uniq_id}",
-                    aa.access_ptr("r"),
-                    cc.access_ptr("w"),
-                    cc.strides[0],
-                )
-            )
-            return ib.get()
-
-        def _reduce_reset():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "int32", f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w"), cc.strides[0]
-                )
-            )
-            return ib.get()
-
-        def _reduce_update():
-            return _body()
-
-        return _body(), _reduce_reset(), _reduce_update()
-
-    binds = {
-        t: tvm.tir.decl_buffer(
-            t.shape,
-            t.dtype,
-            t.op.name,
-            strides=[te.var(f"{t.op.name}_s_{i}") for i in range(0, len(t.shape))],
-            offset_factor=1,
-        )
-        for t in [x, z]
-    }
-
-    intrin_decl = te.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
-    return intrin_decl, uniq_id
-
-
-def max_impl(uniq_id):
-    """Emit C code for pool impl."""
-    cc_code = (
-        common.common_includes
-        + f"""
-
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t max8_reset_{uniq_id}(
-    int8_t *res,
-    int32_t N) {{
-  memset(res, (int8_t)-128, N * sizeof(*res));
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t max8_loop_{uniq_id}(
-    int8_t *arg,
-    int8_t *res,
-    int32_t N_arg) {{
-  int N = N_arg;
-
-  for ( int i = 0; i < N; ++ i )
-    if ( arg[i] > res[i] )
-      res[i] = arg[i];
-  return 0;
-}}
-
-#ifdef __cplusplus
-extern "C"
-#endif
-__attribute__((always_inline)) static inline int32_t max8_{uniq_id}(
-    int8_t *arg,
-    int8_t *res,
-    int32_t N_arg) {{
-  int N = N_arg;
-  int32_t *parg32, *pres32;
-  int una_arg = (int32_t)arg & 0x3, una_res = (int32_t)res & 0x3;
-  int32_t retcode = 0;
-
-  if ( N < 4 || ((una_arg || una_res) && una_arg != una_res) ) {{
-    retcode = max8_loop_{uniq_id}(arg, res, N);
-    goto out;
-  }}
-  if ( una_arg ) {{
-    int n = (4 - una_arg);
-    if ( n > N || (N - n) < 4 )
-      n = N;
-    retcode = max8_loop_{uniq_id}(arg, res, n);
-    N -= n;
-    if ( N == 0 )
-      goto out;
-    arg += n; res += n;
-  }}
-
-  parg32 = (int32_t *)arg;
-  pres32 = (int32_t *)res;
-
-  for ( int i = 0; i < N / 4; ++ i ) {{
-    int32_t arg32 = *parg32 ++;
-    int32_t res32 = *pres32;
-    __ssub8(arg32, res32);
-    res32 = __sel(arg32, res32);
-    *pres32 ++ = res32;
-  }}
-
-  if ( N & 0x3 ) {{
-    retcode = max8_loop_{uniq_id}((int8_t *)parg32, (int8_t *)pres32, N & 0x3);
-    goto out;
-  }}
-
-out:
-  return retcode;
-}}
-
-"""
-    )
-    return cc_code
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
deleted file mode 100644
index 90ca04ac9f19..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""This is a special intrinsic used for depthwise convolution using Cortex-M DSP instructions
-(v7e-m). It takes as inputs an int8 HWC data tensor and an int8 CHWc kernel. This intrinsic "lays"
-the kernel on top of the data tensors starting from a given pointer, performs signed sixteen-bit
-multiplies on each pair of values, and sums all the products in an int32 accumlator. This process is
-repeated four times giving four int32 outputs - one per channel."""
-
-import textwrap
-
-from tvm import te, tir
-from .common import num_simd_lanes_per_word, common_includes
-
-
-def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
-    """Gets the C function name of the tensorized function."""
-    return f"kernel_convolve_{in_dtype}_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}"
-
-
-def intrin_multi_channel_convolve(
-    in_dtype, _tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix
-):
-    """Defines a v7e-m DSP-accelerated multi-channel convolution. Works on two
-    channels if in_dtype==int16, and four channels if in_dtype==int8."""
-    simd_lanes = num_simd_lanes_per_word(in_dtype)
-
-    overlap_dims = (kernel_h, kernel_w, simd_lanes)
-    data_slice = te.placeholder(overlap_dims, name="data_slice", dtype=in_dtype)
-    kernel_slice = te.placeholder(overlap_dims, name="kernel_slice", dtype=in_dtype)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-
-    output_slice = te.compute(
-        (simd_lanes,),
-        lambda k: te.sum(
-            data_slice[kh_i, kw_i, k].astype("int32") * kernel_slice[kh_i, kw_i, k].astype("int32"),
-            axis=(kh_i, kw_i),
-        ),
-        name="c",
-    )
-
-    data_buf = tir.decl_buffer(
-        data_slice.shape,
-        data_slice.dtype,
-        name="data",
-        offset_factor=1,
-        strides=[tensor_w * channels, channels, 1],
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel_slice.shape,
-        kernel_slice.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=[kernel_w * simd_lanes, simd_lanes, 1],
-    )
-    output_buf = tir.decl_buffer(
-        output_slice.shape, output_slice.dtype, name="output", offset_factor=1, strides=[1]
-    )
-
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
-
-    return te.decl_tensor_intrin(
-        output_slice.op,
-        intrin_func,
-        binds={data_slice: data_buf, kernel_slice: kernel_buf, output_slice: output_buf},
-    )
-
-
-def multi_channel_convolve_impl(in_dtype, *args) -> str:
-    """Generates C code for a fast multi-channel convolution function for ARM Cortex-M. This is done
-    by calling a sub-function depending on the input data type, as since v7e-m has no quad multiply
-    accumulate instruction, the int8 and int16 cases work differently."""
-    if in_dtype == "int8":
-        return _quad_int8_channel_convolve_impl(*args)
-    if in_dtype == "int16":
-        return _dual_int16_channel_convolve_impl(*args)
-
-    raise NotImplementedError(f"No Cortex-M {in_dtype} depthwise_conv2d implementation exists!")
-
-
-def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
-    return textwrap.dedent(
-        (
-            common_includes
-            + f"""
-        // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
-
-        #define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
-            arranged_kernel, \
-            tensor_c3210, \
-            sum_c0, sum_c1, sum_c2, sum_c3) {{ \
-          \
-          int32_t kernel_c3210 = *arranged_kernel++; \
-          \
-          int32_t tensor_c20 = __sxtb16(tensor_c3210); \
-          int32_t kernel_c20 = __sxtb16(kernel_c3210); \
-          sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
-          sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
-          \
-          int32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
-          int32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
-          sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
-          sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
-        }}
-
-        /* We do four channels at once to get this speed boost. */
-        #ifdef __cplusplus
-        extern "C"
-        #endif
-        int32_t {_get_func_name("int8", tensor_w, channels, kernel_h, kernel_w, suffix)}(
-            int32_t *out,
-            int8_t *tensor,
-            int8_t *kernel) {{
-
-          int32_t sum_c0 = 0;
-          int32_t sum_c1 = 0;
-          int32_t sum_c2 = 0;
-          int32_t sum_c3 = 0;
-
-          int32_t kernel_i32[{kernel_h} * {kernel_w}];
-          memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
-          int32_t *arranged_kernel = kernel_i32;
-
-          int32_t tensor_length = {((kernel_w - 1) * (channels // 4) + (kernel_h - 1) * tensor_w * (channels // 4)) + 1};
-          int32_t tensor_i32[tensor_length];
-          memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
-
-          #pragma GCC unroll 3
-          for (int i = 0; i < {kernel_h}; i++) {{
-            #pragma GCC unroll 3
-            for (int j = 0; j < {kernel_w}; j++) {{
-              TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP(
-                arranged_kernel,
-                *(tensor_i32 + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
-                sum_c0, sum_c1, sum_c2, sum_c3)
-            }}
-          }}
-
-          out[0] = sum_c0;
-          out[1] = sum_c1;
-          out[2] = sum_c2;
-          out[3] = sum_c3;
-          return 0;
-        }}
-
-        #undef TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP
-        """
-        )
-    )
-
-
-def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
-    return textwrap.dedent(
-        (
-            common_includes
-            + f"""
-        #include <stdint.h>
-
-        /* We do four channels at once to get this speed boost. */
-        #ifdef __cplusplus
-        extern "C"
-        #endif
-        int32_t {_get_func_name("int16", tensor_w, channels, kernel_h, kernel_w, suffix)}(
-            int32_t *out,
-            int16_t *tensor,
-            int16_t *kernel) {{
-
-          int32_t sum_c0 = 0;
-          int32_t sum_c1 = 0;
-
-          int32_t kernel_i32[{kernel_h} * {kernel_w}];
-          memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
-
-          int32_t tensor_length = {((kernel_w - 1) * (channels // 2) + (kernel_h - 1) * tensor_w * (channels // 2)) + 1};
-          int32_t tensor_i32[tensor_length];
-          memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
-
-          #pragma GCC unroll 3
-          for (int i = 0; i < {kernel_h}; i++) {{
-            #pragma GCC unroll 3
-            for (int j = 0; j < {kernel_w}; j++) {{
-              int32_t tensor_c10 = tensor_i32[j * {channels // 2} + i * {tensor_w * (channels // 2)}];
-              int32_t kernel_c10 = kernel_i32[{kernel_w} * i + j];
-              sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
-              sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
-            }}
-          }}
-
-          out[0] = sum_c0;
-          out[1] = sum_c1;
-          return 0;
-        }}
-
-        #undef TVMGEN_DUAL_INT16_CHANNEL_REARRANGE_SUM
-        """
-        )
-    )
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
deleted file mode 100644
index af3b23e01dcb..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
+++ /dev/null
@@ -1,416 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
-
-This function can be used to tensorize many common operators including regular conv2d, depthwise
-conv2d, and grouped conv2d for some data and kernel layouts. When for regular convolution, use data
-layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout data layout is NCHW
-and kernel layout OIHW.
-
-The generated code will also work on v8-M chips that have the DSP instructions (unlike v7E-M, they
-are optional in v8-M). Note that the generated code does not use the (potentially very useful) MVE
-instructions present on some v8-M chips.
-"""
-
-from dataclasses import dataclass
-from itertools import chain
-import textwrap
-from typing import Iterator, Optional, Tuple
-
-
-@dataclass
-class SMLAInstruction:
-    """Class for keeping track of an item in inventory."""
-
-    instruction: str
-    tensor_var: str
-    kernel_var: str
-
-    def call_with_acle(self, accumulator_var: str) -> str:
-        return (
-            f"{accumulator_var} = __{self.instruction}"
-            f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
-        )
-
-    def has_same_operands(self, other: "SMLAInstruction") -> bool:
-        return self.tensor_var == other.tensor_var and self.kernel_var == other.kernel_var
-
-
-def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
-    """Generates a C function name for tensordot.
-
-    We do not need a suffix, as the generated function will have an #include guard. Unlike other
-    microTVM operators, _get_c_function_name is never called externally.
-    """
-    tensor_w, kernel_h, kernel_w = dimensions
-    return (
-        f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
-        + f"{kernel_h}x{kernel_w}_"
-        + "".join(map(str, offsets))
-        + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
-    )
-
-
-def _init_biased_accumulators(num_outputs):
-    """Generates code to load the bias into the accumulators.
-
-    Addition is commutative, so we could add the bias before, during, or after performing our
-    multiply-accumulate operations. Where we add the bias does not change the overflow behavior.
-
-    Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY
-    trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register,
-    so we'll do it first.
-    """
-    assignments = [f"sum_{x:x} = *bias" for x in range(num_outputs)]
-    joined_assignments = ", ".join(assignments)
-    return f"int32_t {joined_assignments};"
-
-
-def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> Iterator[Optional[Tuple]]:
-    """Gets the logical indices of the data that will be stored in memory at the tensor pointer.
-
-    Returns an Iterator of Optional[Tuple], while skipping over word-aligned pairs of unrelated
-    halfwords. The returned iterator is as short as possible while having even length and containing
-    all relevant tensor data. Tuples in the returned Iterator represent an (y, x) offset from the
-    top-left tensor position being used in this convolution. We need to be aware of the None values
-    so our code is correctly word-aligned.
-
-    One consequence of these requirements - each row in the tensor is broken into word-aligned pairs
-    of halfwords (which are later combined into full words). See the test cases (located in
-    tests/python/topi/python/test_topi_conv2d_tensordot_opts.py) for usage examples.
-    """
-
-    tensor_w, kernel_h, kernel_w = dimensions
-    max_x_val = (num_outputs - 1) * in_stride + kernel_w
-    halfwords = []
-
-    for y in range(kernel_h):
-        # If needed, pad so the beginning of the row is word-aligned
-        if (y * tensor_w + offset) % 2 == 1:
-            halfwords.append(None)
-
-        for x in range(max_x_val):
-            halfwords.append((y, x))
-
-        # If needed, pad so the row length is word aligned
-        if (y * tensor_w + offset + max_x_val) % 2 == 1:
-            halfwords.append(None)
-    return halfwords
-
-
-def _get_kernel_halfwords(dimensions, offset) -> Iterator[Optional[Tuple]]:
-    """Gets the logical indices of the data that will be stored in memory at the kernel pointer.
-
-    Returns an Iterator of Optional[Tuple]. The returned iterator is as short as possible while
-    having even length and containing all kernel data. Tuples in the returned Iterator represent
-    an (y, x) position in the kernel, while None values represent other, irrelevant data. We need
-    to be aware of the None values so our code is correctly word-aligned.
-
-    See test cases in tests/python/topi/python/test_topi_conv2d_tensordot_opts.py for examples.
-    """
-    _, kernel_h, kernel_w = dimensions
-    halfwords = []
-
-    # Kernel data starts `offset` places after the pointer value
-    if offset == 1:
-        halfwords.append(None)
-
-    for y in range(kernel_h):
-        for x in range(kernel_w):
-            halfwords.append((y, x))
-
-    # Make sure the returned iterator has even length by padding with an "unknown" value. We want
-    # even length as this corresponds to an integer number of int32 words.
-    if (kernel_h * kernel_w + offset) % 2 == 1:
-        halfwords.append(None)
-    return halfwords
-
-
-def _get_int16_alias(position) -> str:
-    if position is None:
-        return "unknown"
-    y, x = position
-    return f"y{y:0>2x}_x{x:0>2x}"
-
-
-def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]:
-    assert len(halfwords) % 2 == 0
-    offset = int(not bool(halfwords[0]))
-
-    for i in range(0, len(halfwords), 2):
-        var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
-        y, x = halfwords[i + 1] or halfwords[i]
-        tensor_index = (y * tensor_w + x + offset) // 2
-        yield f"int32_t tensor__{var_name} = tensor[{tensor_index}];"
-
-
-def _load_kernel_vars(halfwords) -> Iterator[str]:
-    assert len(halfwords) % 2 == 0
-    for i in range(0, len(halfwords), 2):
-        var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
-        yield f"int32_t kernel__{var_name} = kernel[{i // 2}];"
-
-
-def _get_draft_macs(
-    kernel_dims, tensor_halfwords, kernel_halfwords, offset
-) -> Iterator[SMLAInstruction]:
-    """Generates unrolled MAC instructions to compute one tensordot sum.
-
-    Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code
-    much faster. The generated code does not use SIMD instructions - they are added later by
-    _apply_simd_optimizations.
-
-    We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do
-    optimizations by iterator chaining.
-    """
-
-    def get_var(y, x, halfwords) -> Tuple[str, str]:
-        i = halfwords.index((y, x))
-        if i % 2 == 0:
-            return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b"
-        return f"{_get_int16_alias(halfwords[i - 1])}__{_get_int16_alias((y, x))}", "t"
-
-    kernel_h, kernel_w = kernel_dims
-    for y in range(kernel_h):
-        for x in range(kernel_w):
-            tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords)
-            kernel_var, kernel_half = get_var(y, x, kernel_halfwords)
-            instruction = f"smla{tensor_half}{kernel_half}"
-            yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}")
-
-
-def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
-    """When possible, fuses single MACs into SIMD MAC instructions.
-
-    The compiler cannot do this automatically, as calling __smlaxy forces the SMLAxy instruction to
-    be used. This function takes as input an iterator of SMLAInstructions and returns an iterator of
-    SMLAInstructions (possibly of different length).
-    """
-    curr_tuple = next(instruction_tuples, None)
-    while curr_tuple:
-        next_tuple = next(instruction_tuples, None)
-        if next_tuple is None:
-            yield curr_tuple
-            break
-
-        if curr_tuple.has_same_operands(next_tuple):
-            instructions = sorted([curr_tuple.instruction, next_tuple.instruction])
-            if instructions == ["smlabb", "smlatt"]:
-                yield SMLAInstruction("smlad", curr_tuple.tensor_var, curr_tuple.kernel_var)
-                next_tuple = next(instruction_tuples, None)
-            elif instructions == ["smlabt", "smlatb"]:
-                yield SMLAInstruction("smladx", curr_tuple.tensor_var, curr_tuple.kernel_var)
-                next_tuple = next(instruction_tuples, None)
-            else:
-                yield curr_tuple
-
-        else:
-            yield curr_tuple
-        curr_tuple = next_tuple
-
-
-def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
-    """Converts an iterator of SMLAInstructions into lines of C code.
-
-    We want the compiler to re-order these with the memory loads, so we generate them as a series of
-    calls to instruction aliases instead of as a single `asm` block.
-    """
-
-    for smla_instruction in instruction_tuples:
-        assert "smla" in smla_instruction.instruction
-
-        # We call the instruction using the Arm C Language Extensions. Using ACLE gives better
-        # cross-compiler compatibility than using __builtin functions.
-        yield smla_instruction.call_with_acle(f"sum_{index}")
-
-
-def _requantize_sums(num_outputs, requantize_shift, output_zero_point) -> Iterator[str]:
-    """Generates code to requantize the accumulator values.
-
-    The generated code does not use floating point instructions, as it simulates floating point
-    multiplication with an a int64 multiply + shift. The bias is added at the beginning, so we can
-    skip doing it now. The shift is hard-coded, as this saves a few cycles without hurting accuracy
-    in "most" cases.
-
-    It's *possible* we could save one more cycle here by pre-multiplying the bias with the
-    requantize multiplier, and then doing the bias addition and shift in the same cycle (via <op2>).
-    However, it's complicated and only saves one cycle.
-
-    It's also worth noting the SSAT16 operation doesn't help us here. The data isn't stored as two
-    halfwords in a word, and rearrainging it would take at least one cycle. Two SSAT operations is
-    just as good.
-
-    Calling __ssat directly is a little bit gross, but GCC and Clang are unreliable about compiling
-    other ways of writing this. Both the multiply + shift and shift + saturation combine to one
-    instruction each.
-    """
-
-    yield "int32_t scale_val = *scale;"
-    for i in range(num_outputs):
-        yield f"int32_t requant_{i} = (sum_{i} * (int64_t) scale_val) >> {requantize_shift - 1};"
-        yield f"requant_{i} = (requant_{i} + 1) >> 1;"
-        yield f"requant_{i} = __ssat(requant_{i} + {output_zero_point}, 8);"
-
-
-def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
-    """Generates code to write the requantized sums to memory.
-
-    Note - halfword packing here *does* help. It seems
-    like it wouldn't, as doing two pipelined int16 stores takes two cycles - the same as halfword
-    packing plus a pipelined int32 store. We still do the int16 stores when there is an output
-    stride, though.
-
-    However, this lets the compiler re-order instructions to better preserve memory, as it doesn't
-    like breaking apart the store instructions (as this messes up pipelining).
-    """
-
-    if stride > 1:
-        for i in range(num_outputs):
-            yield f"((int16_t*) output)[{i * stride + offset}] = (int16_t) requant_{i};"
-
-    else:
-        num_packed = (num_outputs - offset) // 2
-        for i in range(num_packed):
-            index = 2 * i + offset
-            # We must explicitly call asm inline to use the PKHBT instruction. It is not part of
-            # ACLE and has no __builtin. Writing it using masks and bitshifts does not work either:
-            # Arm GCC 12 with -O3 does not compile these efficiently.
-            yield f"int packed_res_{i};"
-            yield (
-                f'__asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_{i}) : '
-                f'"r" (requant_{index}), "r" (requant_{index + 1}));'
-            )
-
-        if offset == 1:
-            yield "((int16_t*) output)[1] = (int16_t) requant_0;"
-
-        for i in range(num_packed):
-            yield f"output[{offset + i}] = packed_res_{i};"
-
-        if (offset + num_outputs) % 2 == 1:
-            yield f"((int16_t*) output)[{num_packed * 2}] = (int16_t) requant_{num_packed * 2};"
-
-
-def tensordot_int16_impl(
-    num_outputs: int,
-    dimensions: Tuple[int, int, int],
-    offsets: Tuple[int, int, int],
-    x_strides: Tuple[int, int],
-    requantize_shift: int = 33,
-    output_zero_point: int = -128,
-) -> Tuple[str, str]:
-    """Generates code to compute a tensor dot product with requantization.
-
-    The generated function takes pointers to the output, tensor, and kernel as input. All pointers
-    must be word aligned. Only works with `int16` data type. The generated code is optimized for the
-    ARMv7E-M architecture.
-
-    Parameters
-    ----------
-    num_outputs: int
-        The number of tensordot outputs to compute per function call. Computing more than one at
-        once makes us much faster by reducing how often overlapping data is loaded. However, setting
-        this too high causes us to run out of registers and need to store data on the stack. We
-        should autotune this, but num_outputs=2 is usually OK.
-
-    dimensions: Tuple[int, int, int]
-        The dimensions of each tensordot operation. dimensions[1] and dimensions[2] are the height
-        and width of the kernel, respectively. dimensions[0] is the width of the data tensor, which
-        is usually larger than the kernel.
-
-    offsets: Tuple[int, int, int]
-        Each value is 0 or 1, and represents how far after the given data, kernel, and output
-        pointers (respectively) we should start reading/writing. This prevents us from having to
-        check if each pointer is aligned or unaligned at runtime, making us faster.
-
-    x_strides: Tuple[int, int]
-        The distance (in halfwords) between the start of each input tensor, and where to write each
-        output result respectively. Only used when num_outputs > 1.
-
-    requantize_shift: int
-        The distance to right shift after multiplying by the requantization scale. Defaults to 33,
-        as this lets us skip a shift operation.
-
-    outout_zero_point: int
-        The output zero point, which will be subtracted after scale multiplication but before
-        clipping. Defaults to -128, as most models always use this.
-
-    Returns
-    -------
-    func_name, func_code: Tuple[str, str]
-        The name and source code of the generated function.
-    """
-    function_name = _get_c_function_name(num_outputs, dimensions, offsets, x_strides)
-    tensor_w, kernel_h, kernel_w = dimensions
-    tensor_offset, kernel_offset, output_offset = offsets
-    assert tensor_offset < 2 and kernel_offset < 2 and output_offset < 2
-    in_stride, out_stride = x_strides
-
-    tensor_halfwords = _get_tensor_halfwords(dimensions, tensor_offset, num_outputs, in_stride)
-    kernel_halfwords = _get_kernel_halfwords(dimensions, kernel_offset)
-    load_tensor_lines = _load_tensor_vars(tensor_halfwords, tensor_w)
-    load_kernel_lines = _load_kernel_vars(kernel_halfwords)
-
-    def gen_single_loop_macs(index):
-        draft_macs_iter = _get_draft_macs(
-            (kernel_h, kernel_w), tensor_halfwords, kernel_halfwords, index * in_stride
-        )
-        draft_macs_iter = _apply_simd_optimizations(draft_macs_iter)
-        return _expand_instruction_tuples(draft_macs_iter, index)
-
-    multiply_acc_lines = chain.from_iterable(gen_single_loop_macs(i) for i in range(num_outputs))
-    requantize_lines = _requantize_sums(
-        num_outputs, requantize_shift=requantize_shift, output_zero_point=output_zero_point
-    )
-    write_out_lines = _write_sums_to_memory(num_outputs, output_offset, out_stride)
-
-    def insert_lines(lines):
-        return ("\n" + " " * 10).join(lines)
-
-    # It's very common for one model to have different layers that use identical tensordot
-    # functions. To prevent function re-definition errors, we need an #include guard. This is better
-    # than adding a random suffix, as it saves flash memory.
-    code = textwrap.dedent(
-        f"""
-        #ifndef {function_name.upper()}_EXISTS
-        #define {function_name.upper()}_EXISTS
-        #include <arm_acle.h>
-        __attribute__((always_inline)) static inline int32_t {function_name}(
-            int16_t *output_arg, int16_t *tensor_arg, int16_t *kernel_arg,
-            int32_t *bias, int32_t *scale
-        ) {{
-          int32_t *output = output_arg;
-          int32_t *tensor = tensor_arg;
-          int32_t *kernel = kernel_arg;
-
-          {_init_biased_accumulators(num_outputs)}
-
-          {insert_lines(load_tensor_lines)}
-
-          {insert_lines(load_kernel_lines)}
-
-          {insert_lines(multiply_acc_lines)}
-
-          {insert_lines(requantize_lines)}
-
-          {insert_lines(write_out_lines)}
-          return 0;
-        }}
-        #endif
-        """
-    )
-    return (function_name, code)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
deleted file mode 100644
index 441683112447..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-value-for-parameter
-"""Direct implementation of pool."""
-import logging
-
-import tvm
-
-from tvm import te
-from tvm.topi.utils import traverse_inline
-
-from .micro_kernel.max_pool import (
-    intrin_max,
-    max_impl,
-)
-
-from .micro_kernel.avg_pool import (
-    intrin_sum,
-    sum_impl,
-)
-
-logger = logging.getLogger("topi")
-
-
-def schedule_maxpool_1d_nwc(s, op):
-    """Schedule function for v7e-m DSP instructions of maxpool 1d NWC layout."""
-    output = op.output(0)
-    data_vec = op.input_tensors[0]
-
-    channels = data_vec.shape[-1]
-    if isinstance(channels, tvm.tir.IntImm):
-        channels = channels.value
-
-    n, w, c = s[op].op.axis
-    (k,) = s[op].op.reduce_axis
-
-    s[op].reorder(n, w, k, c)
-    max_val, uniq_id = intrin_max((1, 1, channels), data_vec.dtype, output.dtype)
-    s[op].tensorize(c, max_val)
-    s[output].pragma(n, "import_c", max_impl(uniq_id))
-
-
-def schedule_maxpool_2d_nhwc(s, op):
-    """Schedule function for v7e-m DSP instructions of maxpool 2d NHWC layout."""
-    output = op.output(0)
-    data_vec = op.input_tensors[0]
-
-    channels = data_vec.shape[-1]
-    if isinstance(channels, tvm.tir.IntImm):
-        channels = channels.value
-
-    n, h, w, c = s[op].op.axis
-    ko, ki = s[op].op.reduce_axis
-
-    s[op].reorder(n, h, w, ko, ki, c)
-    max_val, uniq_id = intrin_max((1, 1, 1, channels), data_vec.dtype, output.dtype)
-    s[op].tensorize(c, max_val)
-    s[output].pragma(n, "import_c", max_impl(uniq_id))
-
-
-def schedule_avgpool_1d_ncw(s, op):
-    """Schedule function for v7e-m DSP instructions of avgpool 1d NCW layout."""
-    output = op.output(0)
-    data_vec = op.input_tensors[0]
-
-    n, _, _ = s[op].op.axis
-    (k,) = s[op].op.reduce_axis
-    pool_w = k.dom.extent.value
-
-    summary, uniq_id = intrin_sum((1, 1, pool_w), data_vec.dtype, output.dtype, reset=True)
-    s[op].tensorize(k, summary)
-    s[output].pragma(n, "import_c", sum_impl(pool_w, uniq_id))
-
-
-def schedule_avgpool_2d_nchw(s, op):
-    """Schedule function for v7e-m DSP instructions of avgpool 2d NCHW layout."""
-    output = op.output(0)
-    data_vec = op.input_tensors[0]
-
-    n, _, _, _ = s[op].op.axis
-    _, ki = s[op].op.reduce_axis
-    pool_w = ki.dom.extent.value
-
-    summary, uniq_id = intrin_sum((1, 1, 1, pool_w), data_vec.dtype, output.dtype)
-    s[op].tensorize(ki, summary)
-    s[output].pragma(n, "import_c", sum_impl(pool_w, uniq_id))
-
-
-def pool_dsp_schedule(outs, layout):
-    """Schedule function for v7e-m DSP instructions of pooling."""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "pool_max" in op.tag:
-            in_dtype = op.input_tensors[0].dtype
-            if in_dtype != "int8":
-                logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype)
-            elif layout == "NWC":
-                schedule_maxpool_1d_nwc(s, op)
-            elif layout == "NHWC":
-                schedule_maxpool_2d_nhwc(s, op)
-        elif "pool_sum" in op.tag:
-            in_dtype = op.input_tensors[0].dtype
-            if in_dtype != "int16":
-                logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype)
-            elif layout == "NCW":
-                schedule_avgpool_1d_ncw(s, op)
-            elif layout == "NCHW":
-                schedule_avgpool_2d_nchw(s, op)
-
-    traverse_inline(s, outs[-1].op, _callback)
-    return s
diff --git a/python/tvm/topi/arm_cpu/pooling.py b/python/tvm/topi/arm_cpu/pooling.py
deleted file mode 100644
index f09f0089342d..000000000000
--- a/python/tvm/topi/arm_cpu/pooling.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable
-"""Schedule for pooling operators"""
-
-from .mprofile.dsp.pool import pool_dsp_schedule
-
-
-def schedule_pool(outs, layout):
-    """Create schedule for avgpool/maxpool with dsp"""
-    return pool_dsp_schedule(outs, layout)
diff --git a/python/tvm/topi/arm_cpu/pstate_attributes.py b/python/tvm/topi/arm_cpu/pstate_attributes.py
deleted file mode 100644
index 439337bac5b2..000000000000
--- a/python/tvm/topi/arm_cpu/pstate_attributes.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Specialized attributes that can be added to schedules to alter
-the behaviour of AArch64 codegen.
-"""
-
-
-class SMEAttributes:
-    """
-    This class serves as a convenience wrapper for processor state annotations
-    relating to the Scalable Matrix Extension (SME). Processor state annotations
-    are inserted at compile time and alter some global state of the processor
-    during execution. For example, the streaming mode attribute can be used to
-    transfer some vector operations to a separate processing element. These
-    attributes can be added to block-level annotations in AArch64 schedules to
-    define a desired state.
-
-    Please refer to the following pages for more information regarding the SME
-    attributes and their behaviours:
-     - https://arm-software.github.io/acle/main/acle.html#markdown-toc-sme-attributes
-     - https://llvm.org/docs/AArch64SME.html
-
-    Attributes
-    ----------
-    STREAMING_MODE : str
-        Whether execution should occur in regular mode or streaming mode. When
-        enabled, some vector operations may be transferred to a separate processing
-        element.
-    ZA_STORAGE : str
-        Defines how the ZA area of storage provided by the SME extension should be
-        utilized.
-    """
-
-    STREAMING_MODE = "pragma_aarch64_pstate_sm"
-
-    class StreamingModeValues:
-        """
-        Streaming mode attribute values. By default, a function is considered
-        'non-streaming' (often referred to as 'regular').
-
-        Attributes
-        ----------
-        ENABLED : str
-            The processor state must be in streaming mode before executing the marked function.
-        COMPATIBLE : str
-            The marked function can be run in either streaming or non-streaming mode.
-        """
-
-        ENABLED = "enabled"
-        COMPATIBLE = "compatible"
-
-    ZA_STORAGE = "pragma_aarch64_pstate_za"
-
-    class ZAStorageValues:
-        """
-        ZA Storage attribure values. By default, a function has no ZA state. In other words, it
-        does not use the ZA storage.
-
-        Attributes
-        ----------
-        NEW : str
-            A new ZA state is created "from scratch".
-        SHARED : str
-            The ZA state is shared with the calling function.
-        """
-
-        NEW = "new"
-        SHARED = "shared"
diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py
deleted file mode 100644
index b90ee994478d..000000000000
--- a/python/tvm/topi/arm_cpu/qnn.py
+++ /dev/null
@@ -1,590 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Contains TVMScript implementations of some QNN operators for Arm.
-
-Currently, the only ops with compute functions are fused regular and depthwise convolutions for
-Arm Cortex-M with DSP. Additionally, these functions explicitly do not support padding - it
-must be done in a separate Relay op for memory reasons.
-"""
-
-from typing import Callable, Dict, Tuple
-
-import tvm
-from tvm import te, tir, TVMError
-from tvm.script import tir as T
-from tvm.tir import const
-
-from ..utils import get_const_tuple
-from .mprofile.dsp.micro_kernel import tensordot
-
-
-def _int_ceil_division(x, y):
-    return -(x // -y)
-
-
-def _compute_output_dim(data_length, kernel_length, stride):
-    return _int_ceil_division(data_length + 1 - kernel_length, stride)
-
-
-def _pick_num_outputs(out_width):
-    """Guess a good value for num_outputs."""
-
-    assert out_width > 1
-
-    # num_outputs is capped at 8
-    for i in range(2, min(out_width + 1, 8)):
-        if out_width % i == 0:
-            return i
-
-    raise TVMError(f"Cannot pick a good num_outputs value for out_width = {out_width}!")
-
-
-def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False):
-    """Helper function that chooses the right implementation of micro_kernel.tensordot.
-
-    Takes as input the parameters of the conv2d, and returns a tuple of TWO (function_name,
-    function_code). The first pair (the aligned one) is for even numbered output channels, and the
-    second pair (the offset one) is for odd-numbered output channels. This function is used for
-    regular and depthwise convolutions.
-
-    We need different implementations for even vs odd numbered output channels, because the "start"
-    of an odd output channel in the data tensor or kernel might or might not be on a word boundary,
-    and the tensordot code expects all input pointers to be word-aligned.
-    """
-    data, kernel = inputs[0:2]
-    rq_output_zero_point_const = inputs[10]
-    assert len(rq_output_zero_point_const.op.body) == 1
-    output_zero_point = rq_output_zero_point_const.op.body[0]
-
-    _, stride_w = get_const_tuple(attrs.strides)
-
-    if is_depthwise:
-        assert attrs.data_layout == "NCHW"
-        assert attrs.kernel_layout == "IOHW"
-        _, _, height, width = get_const_tuple(data.shape)
-        _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
-
-        dimensions = (width, kernel_h, kernel_w)
-        in_stride = stride_w
-        data_per_oc_size = height * width
-    else:
-        assert attrs.data_layout == "NHWC"
-        assert attrs.kernel_layout == "OHWI"
-        _, height, width, in_channels = get_const_tuple(data.shape)
-        out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
-
-        dimensions = (width * in_channels, kernel_h, kernel_w * in_channels)
-        in_stride = in_channels * stride_w
-        data_per_oc_size = 0
-
-    assert attrs.out_layout is not None
-    if attrs.out_layout == "NHWC":
-        out_stride = out_channels
-    elif attrs.out_layout == "NCHW":
-        out_stride = 1
-    else:
-        raise ValueError(f"Unsupported output layout {attrs.out_layout}!")
-
-    x_strides = (in_stride, out_stride)
-    aligned_func = tensordot.tensordot_int16_impl(
-        num_outputs,
-        dimensions,
-        (0, 0, 0),
-        x_strides,
-        output_zero_point=output_zero_point,
-    )
-
-    kernel_per_oc_size = dimensions[1] * dimensions[2]
-
-    offsets = (data_per_oc_size % 2, kernel_per_oc_size % 2, 0)
-    offset_func = tensordot.tensordot_int16_impl(
-        num_outputs,
-        dimensions,
-        offsets,
-        x_strides,
-        output_zero_point=output_zero_point,
-    )
-
-    return (aligned_func, offset_func)
-
-
-def _make_tscript_ptr(buffer, offset, length, dtype="int16"):
-    return T.tvm_access_ptr(
-        T.type_annotation(dtype=dtype),
-        buffer.data,
-        offset,
-        length,
-        1,
-        dtype="handle",
-    )
-
-
-def _bias_ptr(bias, c):
-    return _make_tscript_ptr(bias, c, 1, dtype="int32")
-
-
-def _scale_ptr(scale, c):
-    return _make_tscript_ptr(scale, c, 1, dtype="int32")
-
-
-def _make_tscript_call(func_name, *args):
-    return T.evaluate(T.call_extern(func_name, *args, dtype="int32"))
-
-
-def _make_conv2d_primfunc(
-    output_dimensions: Tuple[int, int, int, int],
-    buffer_shapes: Tuple,
-    aligned_func: Tuple[str, str],
-    offset_func: Tuple[str, str],
-    ptr_gens: Tuple[Callable, Callable],
-    output_layout: str = "NHWC",
-) -> tir.function.PrimFunc:
-    """Makes a TIR PrimFunc computing Conv2D using a call to tensordot.
-
-    Can be used to generate regular, depthwise, and grouped Conv2D operators by passing different
-    arguments and ptr_gen functions. However, it only works for Conv2D operators where the height
-    stride of the tensor is divisible by two.
-
-    Parameters
-    ----------
-    output_dimensions : Tuple[int, int, int, int]
-        A tuple containing the out_height, out_width, out_channels, and desired num_outputs values
-        in that order.
-
-    buffer_shapes: Tuple[tvm.ir.container.Array]
-        The shapes of the data, kernel, bias, scale, and output tensors, in that order. Each shape
-        should be a TVM Array.
-
-    aligned_func: Tuple[str, str]
-        A tuple containing the (name, C implementation) of a word-aligned tensordot operator.
-
-    offset_func: Tuple[str, str]
-        A tuple containing the (name, C implementation) of a word-unaligned tensordot operator. Can
-        be a tuple of empty strings if the Conv2D in question does not need an unaligned operator.
-
-    ptr_gens: Tuple[Callable, Callable]
-        A tuple of two functions to generate data and kernel access pointers. They should take as
-        inputs the buffer, (y, x, c) indices, and an alignment offset. They should return a
-        T.tvm_access_ptr object which can be used in T.call_extern.
-
-    output_layout: str
-        The tensor layout that will be prosued by the generated PrimFunc. Should be NHWC or NCHW.
-    """
-
-    out_height, out_width, out_channels, num_outputs = output_dimensions
-    data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes
-    aligned_func_name, aligned_func_code = aligned_func
-    offset_func_name, offset_func_code = offset_func
-    data_ptr, kernel_ptr = ptr_gens
-
-    # If the functions are identical, we can skip the second loop
-    if aligned_func_name == offset_func_name:
-        aligned_channels = out_channels
-        offset_channels = 0
-        c_step = const(1)
-    else:
-        aligned_channels = out_channels // 2
-        offset_channels = out_channels // 2
-        c_step = const(2)
-
-    def output_ptr(output, y, x, c):
-        if output_layout == "NHWC":
-            return _make_tscript_ptr(
-                output,
-                y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c,
-                1,
-            )
-        elif output_layout == "NCHW":
-            return _make_tscript_ptr(
-                output,
-                c * const(out_height * out_width) + y * const(out_width) + x * const(num_outputs),
-                1,
-            )
-        else:
-            raise TVMError(f"Unsupported out_layout '{output_layout}'!")
-
-    @T.prim_func
-    def biased_quantized_conv2d(
-        data_handle: T.handle,
-        kernel_handle: T.handle,
-        bias_handle: T.handle,
-        scale_handle: T.handle,
-        output_handle: T.handle,
-    ) -> None:
-
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        data = T.match_buffer(data_handle, data_shape, dtype="int16")
-        kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16")
-        bias = T.match_buffer(bias_handle, bias_shape, dtype="int32")
-
-        # We don't specify a data type for the requantization scale, even though we will read it as
-        # an int32. This is because we must pretend it is a float32, as Relay's requantize op only
-        # allows floating point scales.
-        scale = T.match_buffer(scale_handle, scale_shape)
-        output = T.match_buffer(output_handle, output_shape, dtype="int16")
-
-        # This hack prevents TVM from seeing these variables as "unused". I should be using T.reads
-        # and T.writes, but they don't work. I think it's an issue with BufferTouchedDomain.
-        # pylint: disable=unused-variable
-        output[0, 0, 0, 0] = 0
-        __1 = data[0, 0, 0, 0]
-        __2 = kernel[0, 0, 0, 0]
-        __3 = bias[0, 0, 0, 0]
-        __4 = scale[0]
-        # pylint: enable=unused-variable
-
-        for c_ax, y_ax, x_ax in T.grid(
-            const(aligned_channels), const(out_height), const(out_width // num_outputs)
-        ):
-            with T.block("conv2d_aligned"):
-                T.block_attr({"pragma_import_c": aligned_func_code})
-                y, x, c_interval = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
-                c = c_interval * c_step
-                _make_tscript_call(
-                    aligned_func_name,
-                    output_ptr(output, y, x, c),
-                    data_ptr(data, y, x, c),
-                    kernel_ptr(kernel, c),
-                    _bias_ptr(bias, c),
-                    _scale_ptr(scale, c),
-                )
-
-        for c_ax, y_ax, x_ax in T.grid(
-            const(offset_channels), const(out_height), const(out_width // num_outputs)
-        ):
-            with T.block("conv2d_offset"):
-                T.block_attr({"pragma_import_c": offset_func_code})
-                y, x, c_interval = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
-                c = c_interval * c_step + 1
-                _make_tscript_call(
-                    offset_func_name,
-                    output_ptr(output, y, x, c),
-                    data_ptr(data, y, x, c, offset=1),
-                    kernel_ptr(kernel, c, offset=1),
-                    _bias_ptr(bias, c),
-                    _scale_ptr(scale, c),
-                )
-
-    return biased_quantized_conv2d
-
-
-def qnn_conv2d(attrs, inputs, out_type):
-    """Compute for qnn.conv2d with NHWC layout.
-
-    Note that this is a DIFFERENT layout from the Hexagon variant, because they have special
-    instructions Cortex-M doesn't have. We expect the kernel to have OHWI layout. We also assume
-    that padding is not necessary, as it will have been done by another pass.
-    """
-
-    # Make a few checks to unpack the function arguments and ensure it was called with the right
-    # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper.
-    assert len(inputs) == 11
-    assert not any(get_const_tuple(attrs.padding))
-
-    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
-    _, height, width, in_channels = get_const_tuple(data.shape)
-    out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
-
-    y_stride, x_stride = get_const_tuple(attrs.strides)
-    out_height = _compute_output_dim(height, kernel_h, y_stride)
-    out_width = _compute_output_dim(width, kernel_w, x_stride)
-
-    # Decide how many sums our function should have running at the same time. Doing
-    # this lets us do "more work" for each memory load, but doing too many of them causes us to run
-    # out of registers. Currently this is set to the smallest value greater than one that divides
-    # the output width, but autotuning this value would improve performance a lot.
-    num_outputs = _pick_num_outputs(out_width)
-
-    # Next, decide whether we need "parity alternation". For example, if we have an
-    # 8x3x3x3 kernel (8 output channels, height 3, width 3, input channels 3) in the OHWI layout,
-    # then every output channel kernel slice will be 27 halfwords. This means every other output
-    # channel will not be word aligned, which will cause slowness/crashes!
-
-    # We solve this problem by handling the "aligned" and "offset" output channels with different
-    # versions of our tensordot function. The "aligned func" assumes that the start positions of the
-    # output, data, and kernel are given exactly by their pointer. The "offset" version assumes that
-    # the "true" start of the output is the value in the output pointer, plus an offset of 0 or 1.
-    # _pick_tensordot_impl decides whether this is the case. If not, we only want to generate one
-    # function (to save flash), so offset_func is a tuple of empty strings.
-
-    aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, False)
-
-    # We need to disable pylint's unused argument checker, as the kwarg offset is unused but must
-    # be present for compatibility. We cannot add an underscore as we normally would, as this makes
-    # the keyword not match.
-
-    # pylint: disable=unused-argument
-    def data_ptr(buffer, y, x, c, offset=0):
-        return _make_tscript_ptr(
-            buffer,
-            y * const(y_stride * width * in_channels)
-            + x * const(x_stride * num_outputs * in_channels),
-            1,
-        )
-
-    # pylint: enable=unused-argument
-
-    def kernel_ptr(buffer, c, offset=0):
-        return _make_tscript_ptr(
-            buffer,
-            c * const(kernel_h * kernel_w * in_channels) - offset,
-            1,
-        )
-
-    prim_func = _make_conv2d_primfunc(
-        (out_height, out_width, out_channels, num_outputs),
-        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
-        aligned_func,
-        offset_func,
-        (data_ptr, kernel_ptr),
-        output_layout=attrs.out_layout,
-    )
-
-    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16")
-    return [output]
-
-
-def schedule_qnn_conv2d(_attrs, _outs, _target):
-    """Schedule function for qnn.conv2d."""
-    return None
-
-
-def qnn_depthwise_conv2d(attrs, inputs, out_type):
-    """Compute for qnn.depthwise_conv2d with NCHW layout.
-
-    Works basically the same way as regular conv2d - see above.
-    """
-
-    assert len(inputs) == 11
-    assert not any(get_const_tuple(attrs.padding))
-    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
-    _, _, height, width = get_const_tuple(data.shape)
-    _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
-
-    y_stride, x_stride = get_const_tuple(attrs.strides)
-    out_height = _compute_output_dim(height, kernel_h, y_stride)
-    out_width = _compute_output_dim(width, kernel_w, x_stride)
-
-    num_outputs = _pick_num_outputs(out_width)
-
-    aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, True)
-
-    def data_ptr(buffer, y, x, c, offset=0):
-        if height * width % 2 == 1:
-            x_ptr_offset = tvm.tir.const(-1)
-        else:
-            x_ptr_offset = tvm.tir.const(0)
-
-        return _make_tscript_ptr(
-            buffer,
-            c * const(width * height)
-            + y * const(y_stride * width)
-            + x * const(x_stride * num_outputs)
-            + offset * x_ptr_offset,
-            1,
-        )
-
-    def kernel_ptr(buffer, c, offset=0):
-        return _make_tscript_ptr(
-            buffer,
-            c * tvm.tir.const(kernel_h * kernel_w) - offset,
-            1,
-        )
-
-    prim_func = _make_conv2d_primfunc(
-        (out_height, out_width, out_channels, num_outputs),
-        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
-        aligned_func,
-        offset_func,
-        (data_ptr, kernel_ptr),
-        output_layout=attrs.out_layout,
-    )
-
-    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16")
-    return [output]
-
-
-def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target):
-    """Schedule function for qnn.depthwise_conv2d."""
-    return None
-
-
-def _make_unrolled_conv2d_primfunc(
-    output_dimensions: Tuple[int, int, int],
-    buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple],
-    function_names: Dict[Tuple, str],
-    function_code: str,
-    ptr_gens: Tuple[Callable, Callable],
-    output_layout: str = "NHWC",
-) -> tir.function.PrimFunc:
-    """Makes a TIR PrimFunc computing Conv2D using a call to tensordot.
-
-    Can be used to generate regular, depthwise, and grouped Conv2D operators by passing different
-    arguments and ptr_gen functions. Takes some of the same arguments as _make_conv2d_primfunc, but
-    requires the tensordot function variations to be passed differently. The generated PrimFunc is
-    simlar to the one produced by _make_conv2d_primfunc, but unrolls the height and width loops
-    over the input tensor. This results in longer code, but unlike _make_conv2d_primfunc this
-    function does not require the height stride be an even number of words.
-
-    This is required to compute layer 25 in MobileNetV1 models, among other things.
-
-    Parameters
-    ----------
-    output_dimensions : Tuple[int, int, int, int]
-        A tuple containing the out_height, out_width, out_channels, and desired num_outputs values
-        in that order.
-
-    buffer_shapes: Tuple[tvm.ir.container.Array]
-        The shapes of the data, kernel, bias, scale, and output tensors, in that order. Each shape
-        should be a TVM Array.
-
-    function_names: Dict[Tuple, str]
-        A dictionary mapping a tuple of (data, kernel, output) alignments to the name of the
-        appropriate tensordot function.
-
-    function_code: str
-        A string containing all verions of tensordot function our PrimFunc needs. This will usually
-        be a string of 4+ function variations concatenated together.
-
-    ptr_gens: Tuple[Callable, Callable]
-        A tuple of two functions to generate data and kernel access pointers. They should take as
-        inputs the buffer, (y, x, c) indices, and an alignment offset. They should return a
-        T.tvm_access_ptr object which can be used in T.call_extern.
-
-    output_layout: str
-        The tensor layout that will be prosued by the generated PrimFunc. Should be NHWC or NCHW.
-    """
-
-    out_height, out_width, out_channels = output_dimensions
-    data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes
-    data_ptr, kernel_ptr = ptr_gens
-
-    def output_ptr(output, y, c):
-        if output_layout == "NHWC":
-            return _make_tscript_ptr(output, y * const(out_width * out_channels) + c, 1)
-        elif output_layout == "NCHW":
-            return _make_tscript_ptr(
-                output, c * const(out_height * out_width) + y * const(out_width), 1
-            )
-        else:
-            raise TVMError(f"Unsupported out_layout '{output_layout}'!")
-
-    def make_row_calls(buffers, c_var, out_height):
-        output, data, kernel, bias, scale = buffers
-        for y in range(out_height):
-            for c in range(2):
-                _make_tscript_call(
-                    function_names[(y + c) % 2, c % 2, 0],
-                    output_ptr(output, y, c_var + c),
-                    data_ptr(data, y, c_var + c, offset=(y + c) % 2),
-                    kernel_ptr(kernel, c_var + c, offset=c),
-                    _bias_ptr(bias, c_var + c),
-                    _scale_ptr(scale, c_var + c),
-                )
-
-    @T.prim_func
-    def biased_quantized_conv2d(
-        data_handle: T.handle,
-        kernel_handle: T.handle,
-        bias_handle: T.handle,
-        scale_handle: T.handle,
-        output_handle: T.handle,
-    ) -> None:
-        # Same setup is used as in _make_conv2d_primfunc
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        data = T.match_buffer(data_handle, data_shape, dtype="int16")
-        kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16")
-        bias = T.match_buffer(bias_handle, bias_shape, dtype="int32")
-        scale = T.match_buffer(scale_handle, scale_shape)
-        output = T.match_buffer(output_handle, output_shape, dtype="int16")
-
-        # pylint: disable=unused-variable
-        output[0, 0, 0, 0] = 0
-        __1 = data[0, 0, 0, 0]
-        __2 = kernel[0, 0, 0, 0]
-        __3 = bias[0, 0, 0, 0]
-        __4 = scale[0]
-        # pylint: enable=unused-variable
-
-        for c_ax in T.grid(out_channels // 2):
-            with T.block("conv2ds"):
-                T.block_attr({"pragma_import_c": function_code})
-                c = T.axis.remap("S", [c_ax]) * 2
-                make_row_calls((output, data, kernel, bias, scale), c, out_height)
-
-    return biased_quantized_conv2d
-
-
-def qnn_unrolled_depthwise_conv2d(attrs, inputs, out_type):
-    """Compute for qnn.depthwise_conv2d with NCHW layout for convolutions with small width, height.
-
-    Behaves similarly to qnn_depthwise_conv2d, but does not iterate over the output width and height
-    and instead calls these functions explicitly. This gives a tiny performance boost in exchange
-    for larger code size, but more importantly does not require out_width * out_height
-    * y_stride % 2 == 0. This does, however, require y_stride == x_stride == 1.
-    """
-
-    assert len(inputs) == 11
-    assert not any(get_const_tuple(attrs.padding))
-    y_stride, x_stride = get_const_tuple(attrs.strides)
-    assert y_stride == x_stride == 1
-
-    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
-    _, _, height, width = get_const_tuple(data.shape)
-    _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
-
-    y_stride, x_stride = get_const_tuple(attrs.strides)
-    out_height = _compute_output_dim(height, kernel_h, y_stride)
-    out_width = _compute_output_dim(width, kernel_w, x_stride)
-
-    rq_output_zero_point_const = inputs[10]
-    assert len(rq_output_zero_point_const.op.body) == 1
-    output_zero_point = rq_output_zero_point_const.op.body[0]
-
-    dimensions = (width, kernel_h, kernel_w)
-    x_strides = (1, out_channels)
-
-    func_names = {}
-    impls = []
-    for alignment in ((0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0)):
-        func_name, impl = tensordot.tensordot_int16_impl(
-            out_width, dimensions, alignment, x_strides, output_zero_point=output_zero_point
-        )
-        func_names[alignment] = func_name
-        impls.append(impl)
-
-    def data_ptr(buffer, y, c, offset=0):
-        return _make_tscript_ptr(buffer, c * const(width * height) + y * const(width) - offset, 1)
-
-    def kernel_ptr(buffer, c, offset=0):
-        return _make_tscript_ptr(buffer, c * const(kernel_h * kernel_w) - offset, 1)
-
-    prim_func = _make_unrolled_conv2d_primfunc(
-        (out_height, out_width, out_channels),
-        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
-        func_names,
-        "\n".join(impls),
-        (data_ptr, kernel_ptr),
-        output_layout=attrs.out_layout,
-    )
-    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16")
-    return [output]
-
-
-def schedule_qnn_unrolled_depthwise_conv2d(_attrs, _outs, _target):
-    """Schedule function for qnn.depthwise_conv2d."""
-    return None
diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py
deleted file mode 100644
index 31782d69d032..000000000000
--- a/python/tvm/topi/arm_cpu/qnn_alter_op.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Cortex-M specific optimizations for quantized operators."""
-
-from typing import Iterable
-
-import numpy as np
-
-from tvm import nd, relay, target
-from ..utils import get_const_tuple
-from ..nn import qnn_conv2d_alter_layout, add_alter_layout, qnn_requantize_alter_layout
-
-
-def prev_ops_match(curr_op: relay.expr.Call, pattern: Iterable[str]):
-    """Checks if the names of nested Relay operators match a pattern.
-
-    Note this function considers `curr_op` as a linear stack of operators, only considering args[0]
-    when traversing backwards. `pattern` should be an Iterable of operator names, written backwards
-    from last to first.
-    """
-    prev_op = curr_op
-    for op_name in pattern:
-        if (not hasattr(prev_op, "op")) or prev_op.op.name != op_name:
-            return False
-        prev_op = prev_op.args[0]
-    return True
-
-
-def edit_attrs(attrs, **kwargs):
-    return {**attrs, **kwargs}
-
-
-def change_numpy_layout(arr, src_layout, dst_layout):
-    assert src_layout.isalpha() and dst_layout.isalpha()
-    axis_order = [src_layout.index(c) for c in dst_layout]
-    return np.transpose(arr, axis_order)
-
-
-def _squash_transformations(expr):
-    if isinstance(expr, relay.expr.Constant):
-        return expr.data.numpy()
-    assert isinstance(expr, relay.expr.Call)
-    assert len(expr.args) == 1
-
-    prev_kernel = _squash_transformations(expr.args[0])
-    attrs = expr.attrs
-
-    if expr.op.name == "layout_transform":
-        return change_numpy_layout(prev_kernel, attrs.src_layout, attrs.dst_layout)
-    elif expr.op.name == "cast":
-        return prev_kernel.astype(attrs.dtype)
-    elif kernel.op.name == "expand_dims":
-        new_axes = range(attrs.axis, attrs.axis + attrs.num_newaxis)
-        return np.expand_dims(prev_kernel, tuple(new_axes))
-    else:
-        raise RuntimeError(f"Invalid kernel transformation '{expr}'!")
-
-
-def _alter_depthwise_conv2d_layout(depthwise_conv2d):
-    cast_op = depthwise_conv2d.args[0]
-    requantize_op = cast_op.args[0]
-    add_op = requantize_op.args[0]
-    prev_conv2d_op = add_op.args[0]
-
-    return relay.qnn.op.conv2d(
-        relay.layout_transform(
-            relay.cast(
-                relay.qnn.op.requantize(
-                    relay.op.add(
-                        relay.qnn.op.conv2d(
-                            *prev_conv2d_op.args,
-                            **edit_attrs(prev_conv2d_op.attrs, out_layout="NCHW"),
-                        ),
-                        relay.layout_transform(
-                            add_op.args[1],
-                            src_layout="NHWC",
-                            dst_layout="NCHW",
-                        ),
-                    ),
-                    *requantize_op.args[1:],
-                    **edit_attrs(requantize_op.attrs, axis=1),
-                ),
-                dtype="int16",
-            ),
-            src_layout="NCHW",
-            dst_layout="NHWC",
-        ),
-        *depthwise_conv2d.args[1:],
-        **edit_attrs(depthwise_conv2d.attrs, data_layout="NCHW"),
-    )
-
-
-@qnn_conv2d_alter_layout.register(["arm_cpu"])
-def alter_conv2d_layout(attrs, inputs, _tinfos, _out_type):
-    """Adjust a qnn.conv2d and preceeding ops to better fit on Cortex-M."""
-    current_target = target.Target.current(allow_none=False)
-    if not "cortex-m" in current_target.mcpu:
-        return None
-
-    # Always cast to int16 and pick a our desired kernel layout - this won't affect anything
-    data_expr, kernel_expr = inputs[:2]
-    is_depthwise = attrs.groups > 1
-    new_kernel_layout = "IOHW" if is_depthwise else "OHWI"
-
-    op = relay.qnn.op.conv2d(
-        relay.cast(data_expr, dtype="int16"),
-        relay.cast(kernel_expr, dtype="int16"),
-        *inputs[2:],
-        **edit_attrs(attrs, kernel_layout=new_kernel_layout, out_layout="NHWC"),
-    )
-
-    # If possible, modify depthwise ops to take as input NCHW instead.
-    if is_depthwise and prev_ops_match(op.args[0], ("cast", "qnn.requantize", "add", "qnn.conv2d")):
-        op = _alter_depthwise_conv2d_layout(op)
-
-    return op
-
-
-@add_alter_layout.register(["arm_cpu"])
-def alter_add_layout(_attrs, inputs, _tinfos, _out_type):
-    """Fuses the zero point for a previous quantized operator with this add operation.
-
-    Currently only supports qnn.conv2d, but qnn.dense support should be added. Note that this
-    optimization means we must pad tensors with the input zero point, and NOT with zero.
-    """
-    prev_op, biases_data_op = inputs
-    if not prev_ops_match(inputs[0], ("qnn.conv2d",)):
-        return None
-
-    # We should not perform this alteration if the target has a uint * int SIMD MAC operation (since
-    # these do (x - (-128)) * y efficiently, and conv_input_zp is usually -128). For now, we
-    # restrict this optimization to just Cortex-M devices, but it might be helpful on others too.
-    current_target = target.Target.current(allow_none=False)
-    if not "cortex-m" in current_target.mcpu:
-        return None
-
-    conv_input_zp = prev_op.args[2].data.numpy().item()
-    kernel = _squash_transformations(prev_op.args[1])
-
-    if prev_op.attrs.groups == prev_op.attrs.channels:
-        axes_to_sum = "HW"
-    elif prev_op.attrs.groups == 1:
-        axes_to_sum = "HWI"
-    else:
-        # This alteration does not currently support grouped conv2d
-        return None
-    axes_to_sum = tuple(map(prev_op.attrs.kernel_layout.index, axes_to_sum))
-    element_sums = np.sum(kernel, axis=axes_to_sum).flatten()
-
-    # The zero point is subtracted from the input elements, so we need a "-" sign here
-    zp_shifted_sums = element_sums * (-conv_input_zp)
-
-    # The bias values may or may not be wrapped in an expand_dims op
-    if isinstance(biases_data_op, relay.expr.Call):
-        biases = biases_data_op.args[0]
-    else:
-        biases = biases_data_op
-    assert isinstance(biases, relay.expr.Constant)
-
-    # We want to make sure new_biases is representable as an int32. It's tempting to just check
-    # whether arr.dtype == "int32" (since Numpy will automatically increase dtype in some cases)
-    # but this leads to weird wrapping behavior and doesn't work. We must do it manually.
-    new_biases = biases.data.numpy().astype("int64") + zp_shifted_sums
-    if new_biases.min() < -(2**31) or new_biases.max() > 2**31 - 1:
-        return None
-
-    current_target = target.Target.current(allow_none=False)
-    new_input_zp = relay.Constant(nd.array(np.int32(0)))
-    new_conv_args = [*prev_op.args[:2], new_input_zp, *prev_op.args[3:]]
-    bias_constant = relay.Constant(nd.array(new_biases.astype("int32")))
-
-    # We should handle padding separately from convolution, so the original tensor can be
-    # de-allocated immediately. This may also help with fusing padding onto a previous
-    # operator. However, only do this if we're working with Cortex-M devices.
-    padding = get_const_tuple(prev_op.attrs.padding)
-    if "cortex-m" in current_target.mcpu and any(padding):
-        data_layout = prev_op.attrs.data_layout
-        assert data_layout.isupper()
-
-        pad_up, pad_left, pad_down, pad_right = padding
-        pad_op_arg = [(0, 0)] * len(data_layout)
-        pad_op_arg[data_layout.index("H")] = (pad_up, pad_down)
-        pad_op_arg[data_layout.index("W")] = (pad_left, pad_right)
-        new_conv_args[0] = relay.nn.pad(new_conv_args[0], tuple(pad_op_arg), conv_input_zp)
-
-    new_conv_op = relay.qnn.op.conv2d(
-        *new_conv_args,
-        **edit_attrs(prev_op.attrs, padding=(0, 0, 0, 0)),
-    )
-    # If biases was wrapped in an expand_dims op, we must re-wrap it
-    if isinstance(biases_data_op, relay.expr.Call):
-        new_biases_op = relay.expand_dims(bias_constant, **biases_data_op.attrs)
-    else:
-        new_biases_op = bias_constant
-
-    return relay.add(new_conv_op, new_biases_op)
-
-
-@qnn_requantize_alter_layout.register(["arm_cpu"])
-def alter_requantize_layout(attrs, inputs, _tinfos, _out_type):
-    """Changes a floating point requantize op to use int64 multiply + shift for microTVM.
-
-    Usually, this is done by QNN legalization. However, microTVM wants to manually choose the
-    integer rounding constants in order to:
-        (a) Have int32, not int64 constants
-        (b) Use a constant rounding shift to skip a memory load.
-
-    Ideally, we would pick these constants in the requantize (or fused) schedule. Unfortunately that
-    is not currently possible, so we pick them with `alter_layout` as a hack. This will only work if
-    the requantize schedule "plays along" with this hack.
-    """
-
-    # Only microTVM Cortex-M boards with DSP use the relevant schedules
-    current_target = target.Target.current(allow_none=False)
-    if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu):
-        return None
-
-    if not prev_ops_match(inputs[0], ("add", "qnn.conv2d")):
-        return None
-
-    _, in_scale, _, out_scale, _ = inputs
-    in_scale_numpy = in_scale.data.numpy().astype("float64")
-    out_scale_scalar = out_scale.data.numpy().item()
-
-    # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting by 1 again. This is
-    # useful, because shifting a multiplication product by 32 can be done for "free" with SMMUL
-    scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32")
-
-    # Requantize ops in Relay do not support int32 scales - if we try to use one, requantize.cc will
-    # raise an error. As a hacky work-around, we change the scale dtype to float32, without changing
-    # underlying data. This works, as our compute function knows to interpret the scale as an int32.
-
-    # This is only a work-around - a better long-term solution would be adding a new integer
-    # requantize op, which takes integer scales, shifts, and rounding behavior.
-    fake_float_scales = scales.view("float32")
-
-    scale_constant = relay.Constant(nd.array(fake_float_scales))
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    new_attrs["out_dtype"] = "int16"
-    return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], **new_attrs)
diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py
deleted file mode 100644
index dae869fcb7ee..000000000000
--- a/python/tvm/topi/arm_cpu/qnn_legalize.py
+++ /dev/null
@@ -1,384 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""QNN legalization transforms that help eliminate sparse channels.
-
-Some models (like MobileNetV1 when fine-tuned) have output channels in their kernels which are
-completely full of zeros. Sometimes these can be optimized away by the C compiler, but this does not
-happen when complex schedules (like the ACLE tensordot convolutions) are used.
-
-Instead, we will remove these channels by replacing blocks of operators with equivalent "denser"
-ones during legalization. This is harder than it looks - while the outputs of channels with all-zero
-kernels do not depend on the input data, they are usually not zero. We work around this by computing
-how these constant values affect subsequent operators, and "folding" these effects into a bias_add.
-
-It would eventually be nice to have a generalized, cross-target solution for removing zero channels,
-as there is no downside. This may be possible with Relax, but I'm unsure.
-"""
-
-import numpy as np
-
-from tvm import nd, relay
-from tvm.topi.utils import get_const_tuple
-
-from ..nn import bias_add_legalize
-from .qnn_alter_op import edit_attrs, prev_ops_match
-
-
-def _compute_fixed_conv2d_outputs(requantize_op):
-    """Compute all conv2d output values that do not depend on the layer input.
-
-    Parameters
-    ----------
-    requantize_op : relay.expr.Call
-        A qnn.requantize Relay operator, which must be preceeded by a nn.bias_add op and a
-        qnn.conv2d operator. The qnn.conv2d operator must have groups==1. All arguments to all three
-        operators, besides the main tensor, must be constants.
-
-    Returns
-    -------
-    fixed_outputs : Dict[int, int]
-        A dictionary showing which of the conv2d -> bias_add -> requantize output channels are
-        "fixed" - i.e. those that do not depend on the input tensor. Each key in the dictionary is
-        an output channel index, and each value is the value that all entries in that output channel
-        will have. If the block has no fixed output channels, this dictionary will be empty.
-    """
-
-    bias_add_op = requantize_op.args[0]
-    conv2d_op = bias_add_op.args[0]
-
-    assert conv2d_op.attrs.kernel_layout.isalpha()
-    assert conv2d_op.attrs.groups == 1
-    kernel = conv2d_op.args[1].data.numpy()
-    oc_axis = conv2d_op.attrs.kernel_layout.index("O")
-
-    num_channels = kernel.shape[oc_axis]
-    rq_input_scale = requantize_op.args[1].data.numpy()
-    rq_output_scale = requantize_op.args[3].data.numpy().item()
-    rq_output_zero_point = requantize_op.args[4].data.numpy().item()
-    bias_data = bias_add_op.args[1].data.numpy()
-
-    fixed_outputs = {}
-
-    for i in range(num_channels):
-        if np.any(np.take(kernel, i, axis=oc_axis)):
-            continue
-        scale = rq_input_scale[i] / rq_output_scale
-        channel_constant = round(bias_data[i] * scale + rq_output_zero_point)
-        clipped = min(127, max(-128, channel_constant))
-        fixed_outputs[i] = clipped
-
-    return fixed_outputs
-
-
-def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs):
-    """Compute all depthwise conv2d output values that do not depend on the PREVIOUS layer input.
-
-    We take as input a requantize operator, and a dictionary of which inputs to our depthwise
-    operator are fixed and what values they are fixed to. However, a fixed input to one channel
-    of our depthwise operator does NOT guarantee we can remove the output, because of padding.
-    This function checks if the padding makes a difference in the outputs, and if not, removes
-    the channels from the depthwise_conv2d.
-
-    Parameters
-    ----------
-    requantize_op : relay.expr.Call
-        A qnn.requantize Relay operator, which must be preceeded by a nn.bias_add op and a
-        qnn.conv2d operator. The qnn.conv2d operator must be depthwise. All arguments to all three
-        operators, besides the main tensor, must be constants.
-
-    fixed_channel_inputs : Dict[int, int]
-        A dictionary showing which input channels to the qnn.conv2d operator have fixed values, and
-        what those values are fixed to. Can be empty. Usually, this will be generated by
-        _compute_fixed_conv2d_outputs.
-
-    Returns
-    -------
-    fixed_outputs : Dict[int, int]
-        A dictionary showing which of the conv2d -> bias_add -> requantize output channels are
-        "fixed" - i.e. those that do not depend on the input tensor. Each key in the dictionary is
-        an output channel index, and each value is the value that all entries in that output channel
-        will have. If the block has no fixed output channels, this dictionary will be empty.
-    """
-    from scipy.signal import convolve2d  # pylint: disable=import-outside-toplevel
-
-    bias_add_op = requantize_op.args[0]
-    depthwise_op = bias_add_op.args[0]
-
-    assert depthwise_op.attrs.kernel_layout.isalpha()
-    assert depthwise_op.attrs.groups > 1
-    kernel = depthwise_op.args[1].data.numpy()
-    oc_axis = depthwise_op.attrs.kernel_layout.index("O")
-
-    conv_input_zero_point = depthwise_op.args[2].data.numpy().item()
-    rq_input_scale = requantize_op.args[1].data.numpy()
-    rq_output_scale = requantize_op.args[3].data.numpy().item()
-    rq_output_zero_point = requantize_op.args[4].data.numpy().item()
-    bias_data = bias_add_op.args[1].data.numpy()
-
-    kernel_size = get_const_tuple(depthwise_op.attrs.kernel_size)
-    fixed_outputs = {}
-
-    for i, fixed_input in fixed_channel_inputs.items():
-        input_array = np.full(kernel_size, fixed_input, dtype="int32") - conv_input_zero_point
-        kernel_channel = np.take(kernel, i, axis=oc_axis).reshape(kernel_size)
-        scale = rq_input_scale[i] / rq_output_scale
-
-        convolved = convolve2d(input_array, kernel_channel, mode="same")
-        rounded = np.around((convolved + bias_data[i]) * scale).astype("int32")
-        clipped = np.clip(rounded + rq_output_zero_point, -128, 127)
-
-        # We require the ENTIRE padded convolution to all have the same clipped value before we do
-        # a replacement. This is excessive - we only have to check for the padding that will
-        # actually be performed on the depthwise convolution, which is often less. If we felt even
-        # more ambitious, we could do the replacement for "close enough" looking convolution
-        # outputs, which in theory could reduce accuracy but in practice does not. Doing this would
-        # yield a ~0.5% speed gain on MobileNetV1, and nothing on other models.
-
-        if np.all(clipped == clipped[0, 0]):
-            fixed_outputs[i] = clipped[0, 0]
-
-    # TODO @guberti look for all-zero entries in the depthwise kernel. I don't think these really
-    # occur in practice, but it would be nice for theoretical completeness.
-
-    return fixed_outputs
-
-
-def _excise_conv2d_channels(empty_channels, input_op, requantize_op, is_depthwise=False):
-    bias_add_op = requantize_op.args[0]
-    conv2d_op = bias_add_op.args[0]
-    axis = conv2d_op.attrs.kernel_layout.index("O")
-
-    kernel_data = np.delete(conv2d_op.args[1].data.numpy(), empty_channels, axis=axis)
-    bias_data = np.delete(bias_add_op.args[1].data.numpy(), empty_channels)
-    in_scale_data = np.delete(conv2d_op.args[5].data.numpy(), empty_channels)
-    out_scale_data = np.delete(requantize_op.args[1].data.numpy(), empty_channels)
-    num_channels = kernel_data.shape[axis]
-    if is_depthwise:
-        num_groups = num_channels
-    else:
-        num_groups = 1
-
-    return relay.qnn.op.requantize(
-        relay.nn.bias_add(
-            relay.qnn.op.conv2d(
-                input_op,
-                relay.Constant(nd.array(kernel_data)),
-                *conv2d_op.args[2:5],
-                relay.Constant(nd.array(in_scale_data)),
-                **edit_attrs(conv2d_op.attrs, channels=num_channels, groups=num_groups),
-            ),
-            relay.Constant(nd.array(bias_data)),
-            **bias_add_op.attrs,
-        ),
-        relay.Constant(nd.array(out_scale_data)),
-        *requantize_op.args[2:],
-        **requantize_op.attrs,
-    )
-
-
-def _excise_avg_pool_channels(empty_channels, input_op, first_reshape_op, axis=1):
-    outer_cast = first_reshape_op.args[0].args[0]
-    avg_pool = outer_cast.args[0]
-    inner_cast = avg_pool.args[0]
-
-    new_shape = list(get_const_tuple(first_reshape_op.attrs.newshape))
-    new_shape[axis] -= len(empty_channels)
-
-    return relay.reshape(
-        relay.cast(
-            relay.nn.avg_pool2d(relay.cast(input_op, **inner_cast.attrs), **avg_pool.attrs),
-            **outer_cast.attrs,
-        ),
-        **edit_attrs(first_reshape_op.attrs, newshape=new_shape),
-    )
-
-
-def _fold_into_conv_bias(fixed_inputs, conv2d_op, input_op):
-    assert not any(get_const_tuple(conv2d_op.attrs.padding))
-    in_axis = conv2d_op.attrs.kernel_layout.index("I")
-    out_axis = conv2d_op.attrs.kernel_layout.index("O")
-
-    kernel = conv2d_op.args[1].data.numpy()
-    zero_point = conv2d_op.args[2].data.numpy().item()
-
-    extra_bias = np.zeros((kernel.shape[out_axis],), dtype="int32")
-
-    # For every output channel
-    for i in range(kernel.shape[out_axis]):
-        out_kernel_slice = np.expand_dims(np.take(kernel, i, axis=out_axis), axis=out_axis)
-
-        # For every input channel that is being removed:
-        for j, val in fixed_inputs.items():
-            kernel_slice = np.take(out_kernel_slice, j, axis=in_axis)
-            accumulator = np.sum(kernel_slice * (val - zero_point))
-            extra_bias[i] += accumulator
-
-    stripped_kernel = np.delete(kernel, tuple(fixed_inputs.keys()), axis=in_axis)
-    new_conv = relay.qnn.op.conv2d(
-        input_op,
-        relay.Constant(nd.array(stripped_kernel)),
-        *conv2d_op.args[2:],
-        **conv2d_op.attrs,
-    )
-
-    return new_conv, extra_bias
-
-
-def _fold_into_dense_bias(fixed_inputs, dense_op, input_op, channel_axis=1):
-    weights = dense_op.args[1].data.numpy()
-    assert channel_axis < 2
-    assert len(weights.shape) == 2
-    zero_point = dense_op.args[2].data.numpy().item()
-
-    extra_bias = np.zeros((weights.shape[1 - channel_axis],), dtype="int32")
-
-    # For every output channel
-    for i in range(weights.shape[1 - channel_axis]):
-        out_weights_slice = np.take(weights, i, axis=1 - channel_axis)
-
-        # For every input channel that is being removed:
-        for j, val in fixed_inputs.items():
-            weight = out_weights_slice[j]
-            extra_bias[i] += (val - zero_point) * weight
-
-    stripped_weights = np.delete(weights, tuple(fixed_inputs.keys()), axis=channel_axis)
-    new_dense = relay.qnn.op.dense(
-        input_op,
-        relay.Constant(nd.array(stripped_weights)),
-        *dense_op.args[2:],
-        **dense_op.attrs,
-    )
-
-    return new_dense, extra_bias
-
-
-def _densify_conv_depthwise_conv_pattern(attrs, inputs):
-    """Rewrites a regular -> depthwise -> regular convolution pattern to excise empty out channels.
-
-    Should be called as part of legalization (before dtypes and layouts are rewritten) and with the
-    BIAS ADD OPERATOR'S (the one we'll use to "fold in" our constants) `attrs` and `inputs`. The
-    last regular conv2d operator must be unpadded.
-    """
-    current_conv = inputs[0]
-    depthwise_requantize = current_conv.args[0]
-    top_requantize = depthwise_requantize.args[0].args[0].args[0]
-    top_conv2d = top_requantize.args[0].args[0]
-
-    fixed_conv2d_outputs = _compute_fixed_conv2d_outputs(top_requantize)
-    fixed_dw_outputs = _compute_fixed_depthwise_outputs(depthwise_requantize, fixed_conv2d_outputs)
-
-    # Ensure number of channels is divisible by two
-    if len(fixed_dw_outputs) % 2 > 0:
-        fixed_dw_outputs.popitem()
-
-    if not fixed_dw_outputs:
-        return None
-
-    unneeded_channels = tuple(fixed_dw_outputs.keys())
-    new_top_conv2d = _excise_conv2d_channels(unneeded_channels, top_conv2d.args[0], top_requantize)
-    new_dw_conv2d = _excise_conv2d_channels(
-        unneeded_channels, new_top_conv2d, depthwise_requantize, is_depthwise=True
-    )
-    new_conv, extra_bias = _fold_into_conv_bias(fixed_dw_outputs, current_conv, new_dw_conv2d)
-
-    new_bias = inputs[1].data.numpy() + extra_bias
-    new_op = relay.nn.bias_add(new_conv, relay.Constant(nd.array(new_bias)), **attrs)
-    return new_op
-
-
-def _densify_conv_pool_dense_pattern(attrs, inputs):
-    """Rewrites a regular conv -> pool -> dense pattern to excise empty out channels from the conv.
-
-    Should be called as part of legalization (before dtypes and layouts are rewritten) and with the
-    BIAS ADD operator's `attrs` and `inputs` (the one we'll use to "fold in" our constants). The
-    average pool operator must reduce the height and width dimensions to 1x1.
-    """
-    first_reshape = inputs[0].args[0]
-    top_requantize = first_reshape.args[0].args[0].args[0].args[0].args[0]
-    top_conv2d = top_requantize.args[0].args[0]
-
-    fixed_conv2d_outputs = _compute_fixed_conv2d_outputs(top_requantize)
-
-    # Ensure number of channels is divisible by two
-    if len(fixed_conv2d_outputs) % 2 > 0:
-        fixed_dw_outputs.popitem()
-
-    if not fixed_conv2d_outputs:
-        return None
-
-    unneeded_channels = tuple(fixed_conv2d_outputs.keys())
-    new_top_conv2d = _excise_conv2d_channels(unneeded_channels, top_conv2d.args[0], top_requantize)
-    new_avg_pool = _excise_avg_pool_channels(unneeded_channels, new_top_conv2d, first_reshape)
-    new_conv, extra_bias = _fold_into_dense_bias(fixed_conv2d_outputs, inputs[0], new_avg_pool)
-
-    new_bias = inputs[1].data.numpy() + extra_bias
-    new_op = relay.nn.bias_add(new_conv, relay.Constant(nd.array(new_bias)), **attrs)
-    return new_op
-
-
-@bias_add_legalize.register(["arm_cpu"])
-def legalize_bias_add(attrs, inputs, _tinfos):
-    """Remove empty convolution channels when possible, and "fold" them into the bias add.
-
-    TODO @guberti: these rewrites are always beneficial and will improve performance cross-platform,
-    should we enable them for all platforms, not just arm_cpu?
-    """
-
-    if prev_ops_match(
-        inputs[0],
-        (
-            "qnn.conv2d",
-            "qnn.requantize",
-            "nn.bias_add",
-            "qnn.conv2d",
-            "qnn.requantize",
-            "nn.bias_add",
-            "qnn.conv2d",
-        ),
-    ):
-        current_conv = inputs[0]
-        depthwise_conv2d = current_conv.args[0].args[0].args[0]
-        top_conv2d = depthwise_conv2d.args[0].args[0].args[0]
-        if (
-            not any(get_const_tuple(current_conv.attrs.padding))
-            and current_conv.attrs.groups == 1
-            and depthwise_conv2d.attrs.groups > 1
-            and top_conv2d.attrs.groups == 1
-        ):
-            return _densify_conv_depthwise_conv_pattern(attrs, inputs)
-
-    if prev_ops_match(
-        inputs[0],
-        (
-            "qnn.dense",
-            "reshape",
-            "reshape",
-            "cast",
-            "nn.avg_pool2d",
-            "cast",
-            "qnn.requantize",
-            "nn.bias_add",
-            "qnn.conv2d",
-        ),
-    ):
-        avg_pool = inputs[0].args[0].args[0].args[0].args[0]
-        top_requantize = avg_pool.args[0].args[0]
-        top_conv2d = top_requantize.args[0].args[0]
-        if top_conv2d.attrs.groups == 1:
-            return _densify_conv_pool_dense_pattern(attrs, inputs)
-
-    return None
diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py
deleted file mode 100644
index de38b944c27a..000000000000
--- a/python/tvm/topi/arm_cpu/tensor_intrin.py
+++ /dev/null
@@ -1,1158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D int8 schedule on ARM"""
-
-import tvm
-from tvm import te
-from tvm.ir import register_intrin_lowering
-
-
-def gemm_4x4_int8_int8_int32(M, N, K, unroll, in_type):
-    """
-    Int8 4x4 matrix multiplication and accumulation using a sequence of
-    umull -> uadalp -> umull2 -> uadalp instructions. This function
-    takes two arrays of int8 data type  A[4][K] and B[4][K], and produces
-    a 4x4 matrix which is equal to A*B'.
-
-    The pseudo code is as follows.
-
-    .. code-block:: c
-
-        void gemm_4x4_int8_int8_int32(int8 A[4][K], int8 B[4][K], int32 C[4][4]){
-            for (int i = 0; i < 4; i++){
-                for (int j = 0; j < 4; j++){
-                    for (int k = 0; k < K; k++){
-                        C[i][j] += A[i][k] * B[j][k]
-                    }
-            }
-        }
-
-    Notes:
-        * The tiling strategy is picked to maximize register usage.
-
-    Parameters
-    ----------
-    M : int
-        rows of the matrix A
-    N : int
-        columns of the matrix B
-    K : int
-        columns of matrix A
-    unroll : bool
-        Unroll the loop accumulation if True
-    in_type : str, {'uint8', 'int8'}
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The ARM uint8/int8 TensorIntrin that can be used in tensorizing schedule
-    """
-    assert in_type in ["uint8", "int8"]
-    A = te.placeholder((K // 16, te.var("m"), 16), dtype=in_type, name="A")
-    B = te.placeholder((K // 16, te.var("n"), 16), dtype=in_type, name="B")
-    dtype_vec = in_type + "x16"
-    idxm = tvm.tir.indexmod
-
-    k = te.reduce_axis((0, K), "k")
-    C = te.compute(
-        (te.var("m"), te.var("n")),
-        lambda x, y: te.sum(
-            A[k // 16, x, idxm(k, 16)].astype("int32") * B[k // 16, y, idxm(k, 16)].astype("int32"),
-            axis=k,
-        ),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        A.shape,
-        dtype=in_type,
-        name="a_buffer",
-        offset_factor=1,
-        strides=[te.var("sa_1"), te.var("sa_2"), 1],
-    )
-
-    b_buffer = tvm.tir.decl_buffer(
-        B.shape,
-        dtype=in_type,
-        name="b_buffer",
-        offset_factor=1,
-        strides=[te.var("sb_1"), te.var("sb_2"), 1],
-    )
-
-    c_buffer = tvm.tir.decl_buffer(
-        C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1]
-    )
-
-    # Intrinsics used in the following algorithm
-    umull_intrin = "llvm.aarch64.neon.umull" if in_type == "uint8" else "llvm.aarch64.neon.smull"
-    uaddlp_intrin = "llvm.aarch64.neon.uaddlp" if in_type == "uint8" else "llvm.aarch64.neon.saddlp"
-    addp_intrin = "llvm.aarch64.neon.addp"
-
-    def uadalp(a, b):
-        """Add pair and accumulate
-
-        Parameters:
-        ----------
-        a: int16x8 vector
-        b: int16x8 vector
-
-        Returns:
-        --------
-            return a int32x4 vector
-
-        Pseudocode:
-        ----------
-            a += (b0+b1, b2+b3, b4+b5, b6+b7)
-        """
-
-        return a + tvm.tir.call_llvm_pure_intrin(
-            "int32x4", uaddlp_intrin, tvm.tir.const(1, "uint32"), b
-        )
-
-    def umull(a, b):
-        """Multiply long (higher part)
-
-        Parameters:
-        ----------
-        a: int8x16 vector
-        b: int8x16 vector
-
-        Returns:
-        --------
-            return a int16x8 vector
-
-        Pseudocode:
-        ----------
-            c = (a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7)
-        """
-        a_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", a)
-        b_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", b)
-        c = tvm.tir.call_llvm_pure_intrin(
-            "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_high, b_high
-        )
-        return c
-
-    def umull2(a, b):
-        """Multiply long (lower part)
-
-        Parameters:
-        ----------
-        a: int8x16 vector
-        b: int8x16 vector
-
-        Returns:
-        --------
-            return a int16x8 vector
-
-        Pseudocode:
-        ----------
-            c = (a8*b8, a9*b9, a10*b10, a11*b11, a12*b12, a13*b13, a14*b14, a15*b15)
-        """
-        a_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", a)
-        b_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", b)
-        c = tvm.tir.call_llvm_pure_intrin(
-            "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_low, b_low
-        )
-        return c
-
-    def addp(a, b):
-        """Add two vectors in pairs
-
-        Parameters:
-        ----------
-        a: int32x4 vector
-        b: int32x4 vector
-
-        Returns:
-        --------
-            return a int32x4 vector
-
-        Pseudocode:
-        ----------
-            c = (a0+a1, a2+a3, b0+b1, b0+b3)
-        """
-        return tvm.tir.call_llvm_pure_intrin(
-            "int32x4", addp_intrin, tvm.tir.const(2, "uint32"), a, b
-        )
-
-    def accumulation_loop(M, N, ins, acc, tile_idx):
-        """Internal tile accumulation. This function
-        takes two arrays of int8 data type  A[tile_idx][4][16] and B[tile_idx][4][16], produces
-        a 4x4 matrix which is equal to A*B' and accumulates into C[4][4]
-
-        The pseudo code is as follows.
-
-        .. code-block:: c
-
-            void gemm_4x4_int8_int8_int32(int8 A[tile_idx][4][K],
-                                          int8 B[tile_idx][4][K],
-                                          int32 C[4][4]){
-                for (int i = 0; i < 4; i++){
-                    for (int j = 0; j < 4; j++){
-                        for (int k = 0; k < 16; k++){
-                            C[i][j] += A[tile_idx][i][k] * B[tile_idx][j][k]
-                        }
-                }
-            }
-
-        Notes:
-            * The tiling strategy is picked to maximize register usage.
-
-        Parameters:
-        ----------
-        M : int
-            Number of total rows of the output matrix
-        N : int
-            Number of total columns of the output matrix
-        ins : list of tvm.tir.buffer
-            Input buffers
-        acc : tvm.tir.ir_builder.BufferVar
-            Bank of register accumulators
-        tiled_idx : int
-            Index of a sub-tile of A and B in A[tile_idx][:][:] and B[tile_idx][:][:].
-            Please note that  0 <= tile_idx <= K//16
-
-        """
-        a0 = ins[0].vload([tile_idx, 0, 0], dtype_vec)
-        a1 = tvm.tir.const(0, "int8x16")
-        if M > 1:
-            a1 = ins[0].vload([tile_idx, 1, 0], dtype_vec)
-        a2 = tvm.tir.const(0, "int8x16")
-        if M > 2:
-            a2 = ins[0].vload([tile_idx, 2, 0], dtype_vec)
-        a3 = tvm.tir.const(0, "int8x16")
-        if M > 3:
-            a3 = ins[0].vload([tile_idx, 3, 0], dtype_vec)
-
-        b0 = ins[1].vload([tile_idx, 0, 0], dtype_vec)
-        b1 = tvm.tir.const(0, "int8x16")
-        if N > 1:
-            b1 = ins[1].vload([tile_idx, 1, 0], dtype_vec)
-        b2 = tvm.tir.const(0, "int8x16")
-        if N > 2:
-            b2 = ins[1].vload([tile_idx, 2, 0], dtype_vec)
-        b3 = tvm.tir.const(0, "int8x16")
-        if N > 3:
-            b3 = ins[1].vload([tile_idx, 3, 0], dtype_vec)
-
-        # First half
-        # Lower part of a0 * {b0,b1,b2,b3}
-        d00 = umull(a0, b0)
-        d01 = umull(a0, b1)
-        d02 = umull(a0, b2)
-        d03 = umull(a0, b3)
-
-        # Lower part of a1 * {b0,b1,b2,b3}
-        d10 = umull(a1, b0)
-        d11 = umull(a1, b1)
-        d12 = umull(a1, b2)
-        d13 = umull(a1, b3)
-
-        # Accumulate
-        acc[0] = uadalp(acc[0], d00)
-        acc[1] = uadalp(acc[1], d01)
-        acc[2] = uadalp(acc[2], d02)
-        acc[3] = uadalp(acc[3], d03)
-        acc[4] = uadalp(acc[4], d10)
-        acc[5] = uadalp(acc[5], d11)
-        acc[6] = uadalp(acc[6], d12)
-        acc[7] = uadalp(acc[7], d13)
-
-        # Higher part of a0 * {b0,b1,b2,b3}
-        d00 = umull2(a0, b0)
-        d01 = umull2(a0, b1)
-        d02 = umull2(a0, b2)
-        d03 = umull2(a0, b3)
-
-        # Higher part of a1 * {b0,b1,b2,b3}
-        d10 = umull2(a1, b0)
-        d11 = umull2(a1, b1)
-        d12 = umull2(a1, b2)
-        d13 = umull2(a1, b3)
-
-        # Accumulate again
-        acc[0] = uadalp(acc[0], d00)
-        acc[1] = uadalp(acc[1], d01)
-        acc[2] = uadalp(acc[2], d02)
-        acc[3] = uadalp(acc[3], d03)
-        acc[4] = uadalp(acc[4], d10)
-        acc[5] = uadalp(acc[5], d11)
-        acc[6] = uadalp(acc[6], d12)
-        acc[7] = uadalp(acc[7], d13)
-
-        # Second half
-        # Lower part of a2 * {b0,b1,b2,b3}
-        d00 = umull(a2, b0)
-        d01 = umull(a2, b1)
-        d02 = umull(a2, b2)
-        d03 = umull(a2, b3)
-
-        # Lower part of a3 * {b0,b1,b2,b3}
-        d10 = umull(a3, b0)
-        d11 = umull(a3, b1)
-        d12 = umull(a3, b2)
-        d13 = umull(a3, b3)
-
-        # Accumulate
-        acc[8] = uadalp(acc[8], d00)
-        acc[9] = uadalp(acc[9], d01)
-        acc[10] = uadalp(acc[10], d02)
-        acc[11] = uadalp(acc[11], d03)
-        acc[12] = uadalp(acc[12], d10)
-        acc[13] = uadalp(acc[13], d11)
-        acc[14] = uadalp(acc[14], d12)
-        acc[15] = uadalp(acc[15], d13)
-
-        # Higher part of a2 * {b0,b1,b2,b3}
-        d00 = umull2(a2, b0)
-        d01 = umull2(a2, b1)
-        d02 = umull2(a2, b2)
-        d03 = umull2(a2, b3)
-
-        # Lower part of a3 * {b0,b1,b2,b3}
-        d10 = umull2(a3, b0)
-        d11 = umull2(a3, b1)
-        d12 = umull2(a3, b2)
-        d13 = umull2(a3, b3)
-
-        # Accumulate
-        acc[8] = uadalp(acc[8], d00)
-        acc[9] = uadalp(acc[9], d01)
-        acc[10] = uadalp(acc[10], d02)
-        acc[11] = uadalp(acc[11], d03)
-        acc[12] = uadalp(acc[12], d10)
-        acc[13] = uadalp(acc[13], d11)
-        acc[14] = uadalp(acc[14], d12)
-        acc[15] = uadalp(acc[15], d13)
-
-    def _intrin_func(ins, outs):
-        def _instr():
-            ib = tvm.tir.ir_builder.create()
-            # Allocate a local buffer (possibly translates to registers)
-            acc = ib.allocate("int32x4", 16, name="accs", scope="local")
-            m = outs[0].shape[0]
-            n = outs[0].shape[1]
-            # Initialization
-            for i in range(0, 16):
-                acc[i] = tvm.tir.const(0, "int32x4")
-
-            if unroll:
-                for i in range(0, int(K // 16)):
-                    accumulation_loop(M, N, ins, acc, i)
-            else:
-                with ib.for_range(0, K // 16, name="i") as i:
-                    accumulation_loop(M, N, ins, acc, i)
-
-            # Final accumulations
-            # acc[4*r + c] contains the partial accumulations of element C[r][c]
-            #
-            # In particular:
-            # acc[4*r] contains the partial sums of a[r,0:K].*b[0,0:K] -> (a,b,c,d)
-            # acc[4*r+1] contains the partial sums of a[r, 0:K].*b[1,0:K] -> (e,f,g,h)
-            # acc[4*r+2] contains the partial sums of a[r, 0:K].*b[2,0:K] -> (i,j,k,l)
-            # acc[4*r+3] contains the partial sums of a[r, 0:K].*b[3,0:K] -> (m,n,o,p)
-            #
-            # Please note that 0<= r, c < 4
-
-            acc[0] = addp(acc[0], acc[1])  # (a+b, c+d, e+f, g+h)
-            acc[1] = addp(acc[2], acc[3])  # (i+j, k+l, m+n, o+p)
-            acc[0] = addp(acc[0], acc[1])  # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)
-
-            acc[4] = addp(acc[4], acc[5])  # (a+b, c+d, e+f, g+h)
-            acc[5] = addp(acc[6], acc[7])  # (i+j, k+l, m+n, o+p)
-            acc[4] = addp(acc[4], acc[5])  # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)
-
-            acc[8] = addp(acc[8], acc[9])  # (a+b, c+d, e+f, g+h)
-            acc[9] = addp(acc[10], acc[11])  # (i+j, k+l, m+n, o+p)
-            acc[8] = addp(acc[8], acc[9])  # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)
-
-            acc[12] = addp(acc[12], acc[13])  # (a+b, c+d, e+f, g+h)
-            acc[13] = addp(acc[14], acc[15])  # (i+j, k+l, m+n, o+p)
-            acc[12] = addp(acc[12], acc[13])  # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)
-
-            # Store the result
-            if N > 3:
-                out_0 = acc[0]
-                out_1 = acc[4]
-                out_2 = acc[8]
-                out_3 = acc[12]
-            elif N > 2:
-                out_0 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[0])
-                out_1 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[4])
-                out_2 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[8])
-                out_3 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[12])
-            elif N > 1:
-                out_0 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[0])
-                out_1 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[4])
-                out_2 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[8])
-                out_3 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[12])
-            else:
-                out_0 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[0])
-                out_1 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[4])
-                out_2 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[8])
-                out_3 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[12])
-
-            ib.emit(outs[0].vstore([0, 0], out_0))
-            if M > 1:
-                ib.emit(outs[0].vstore([1, 0], out_1))
-            if M > 2:
-                ib.emit(outs[0].vstore([2, 0], out_2))
-            if M > 3:
-                ib.emit(outs[0].vstore([3, 0], out_3))
-            return ib.get()
-
-        # body, reset, update
-        return _instr()
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={A: a_buffer, B: b_buffer, C: c_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def dot_int8_int8_int32_neon_82(int32_lanes, dtype="uint"):
-    """
-    Int8 dot product by every 4 elements using ARM v8.2 udot.
-    This function takes two arrays of int8 datatype -- data[4] and
-    kernel[int32_lanes][4] -- and computes a dot product of data[4] with every
-    4 elements of kernels, resulting in output[int32_lanes] of uint32 datatype.
-    The pseudo code is as follows.
-
-    .. code-block:: c
-
-        void dot_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){
-            for (int i = 0; i < int32_lanes; i++){
-                out[i] = 0;
-                for (int k = 0; k < 4; k++){
-                    out[i] += data[k] * kernel[i][k]
-                }
-            }
-        }
-
-    Physically, the kernel array sits in a vector register and
-    the data[4] is broadcasted to another vector register. This
-    function returns a TensorIntrin that can be used to tensorize
-    a schedule.
-
-    Parameters
-    ----------
-    int32_lanes : int
-        How many int32/uint32 to produce
-    dtype : str, optional, {"uint", "int"}
-        Whether it works on unsigned int or signed int
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The ARM uint8 TensorIntrin that can be used in tensorizing schedule
-    """
-    num_int8_elements = 4  # 4 int8 elements in int32
-
-    data = te.placeholder((num_int8_elements,), dtype=f"{dtype}8", name="data")
-    kernel = te.placeholder((int32_lanes, num_int8_elements), dtype=f"{dtype}8", name="kernel")
-
-    k = te.reduce_axis((0, num_int8_elements), name="k")
-    C = te.compute(
-        (int32_lanes,),
-        lambda i: te.sum(data[k].astype(f"{dtype}32") * kernel[i, k].astype(f"{dtype}32"), axis=k),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        data.shape, dtype=f"{dtype}8", name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(
-        kernel.shape, dtype=f"{dtype}8", name="b_buffer", offset_factor=1, strides=[te.var("s"), 1]
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.tir.const(0, f"{dtype}32x{int32_lanes}")))
-                return ib.get()
-
-            dtype_a = f"{dtype}8x{num_int8_elements}"
-            dtype_b = f"{dtype}8x{int32_lanes * num_int8_elements}"
-            dtype_c = f"{dtype}32x{int32_lanes}"
-
-            a_int8 = ins[0].vload([0], dtype_a)
-            re_int32 = tvm.tir.call_intrin(f"{dtype}32", "tir.reinterpret", a_int8)
-            # broadcast a
-            vec_ai32 = re_int32.astype(dtype_c)
-
-            vec_a = tvm.tir.call_intrin(dtype_b, "tir.reinterpret", vec_ai32)
-            vec_b = ins[1].vload([0, 0], dtype_b)
-            vec_c = outs[0].vload([0], dtype_c)
-
-            inst = "udot" if dtype == "uint" else "sdot"
-            inst = "llvm.aarch64.neon.%s.v%di32.v%di8" % (
-                inst,
-                int32_lanes,
-                int32_lanes * num_int8_elements,
-            )
-            vdot = tvm.tir.call_llvm_pure_intrin(
-                dtype_c, inst, tvm.tir.const(3, "uint32"), vec_c, vec_a, vec_b
-            )
-            ib.emit(outs[0].vstore(0, vdot))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={data: a_buffer, kernel: b_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def dot_int8_int8_int32_neon():
-    """
-    Int8 dot product using vmlal instructions
-
-    .. code-block:: c
-
-        void dot_int8_int8_int32(int8 data[4], int8 kernel[4][4], int32 output[4]){
-            for (int i = 0; i < 4; i++){
-                out[i] = 0;
-                for (int k = 0; k < 4; k++){
-                    out[i] += data[k] * kernel[i][k]
-                }
-            }
-        }
-
-    We use the smull and saddlp instructions to compute the dot product.
-    smull : int8x16 -> int8x16 -> int16x8 elementwise multiplication
-    saddlp: int16x8 -> int32x4 pairwise addition of elements
-
-    Data is broadcast across the register
-    int8 elements
-    |         data      |         data      |
-    |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |
-
-                      smull
-
-    int8 elements
-    |     kernel[i]     |     kernel[i+1]   |
-    |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |
-
-                        =
-
-    int16 elements
-    |               data * kernel[i]        |         data * kernel[i+1]            |
-    |    0    |    1    |    2    |    3    |    4    |    5    |    6    |    7    |
-
-                                          saddlp =
-
-    int32 elements
-    |    partial sum(data * kernel[i])      |  partial sum(data * kernel[i+1])      |
-    |         0         |         1         |         2         |         3         |
-
-
-    We apply the above kernel twice and use addp to compute the second set of pairwise additions
-
-    int32 elements (narrowed for so they fit on a line)
-    |    psum d*k[i]    |   psum d*k[i+1]   |           |   psum d*k[i+2]   |   psum d*k[i+3]   |
-    |    0    |    1    |    2    |    3    |   addp    |    4    |    5    |    6    |    7    |
-                                                 =
-    |sum d*ki |sum d*ki1|sum d*ki2|sum d*ki3|
-    |    0    |    1    |    2    |    3    |
-
-
-    """
-    int32_lanes = 4  # 4 int32 lanes = 128
-    num_int8_elements = 4  # 4 int8 elements in int32
-    data = te.placeholder((num_int8_elements,), dtype="int8", name="data")
-    kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel")
-    k = te.reduce_axis((0, num_int8_elements), name="k")
-    C = te.compute(
-        (int32_lanes,),
-        lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        data.shape, dtype="int8", name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(
-        kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            int_8xl = "int8x8"
-            int_32xl = "int32x4"
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.tir.const(0, int_32xl)))
-                return ib.get()
-
-            # this broadcasts data to the vector size
-            a_int8 = ins[0].vload([0], "int8x4")
-            re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
-            vec_ai32 = re_int32.astype("int32x2")
-            vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32)
-
-            vec_b = ins[1].vload([0, 0], "int8x16")
-
-            def pairwise_add_mul(extract_half):
-                vec_b_half = tvm.tir.call_intrin("int8x8", extract_half, vec_b)
-                multiply = tvm.tir.call_llvm_pure_intrin(
-                    "int16x8",
-                    "llvm.aarch64.neon.smull.v8i16",  # saturating pairwise multiplication
-                    tvm.tir.const(2, "uint32"),
-                    vec_a,
-                    vec_b_half,
-                )
-                pairwise_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int32x4",
-                    "llvm.aarch64.neon.saddlp.v4i32.v8i16",
-                    tvm.tir.const(1, "uint32"),
-                    multiply,
-                )
-                return pairwise_reduction
-
-            pair_1 = pairwise_add_mul("tir.vectorlow")
-            pair_2 = pairwise_add_mul("tir.vectorhigh")
-            quad_reduction = tvm.tir.call_llvm_pure_intrin(
-                "int32x4",
-                "llvm.aarch64.neon.addp.v4i32",
-                tvm.tir.const(2, "uint32"),
-                pair_1,
-                pair_2,
-            )
-            if index == 0:
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:
-                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], int_32xl)))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={data: a_buffer, kernel: b_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def select_word(vec, lane, dtype_vec):
-    """
-    Utility function used to select a int8x4 word within a int8x16 vector
-    and replicate 4 times.
-    The pseudo-code for this operation is:
-
-    v = [x0, ..., x15]
-    vsub(lane) = v[4*lane:4*lane+3]
-    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
-
-    Note that 0<=lane<4
-
-     Parameters
-    ----------
-    vec : tvm.tir.Expr
-         int8x16 vector expression
-    lane : int
-        vector lane we want to replicate
-    dtype_vec : str
-        vector data type (e.g., int8x16)
-
-    Returns
-    ----------
-    output : tvm.tir.Expr
-        replicated vector
-    """
-    # Reinterpret vec_a as 4 int32 words
-    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
-    # Broadcast the lane-th word
-    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
-    # Convert back to uint8x16
-    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", vec_int32_shuffled)
-    return vec_int8_broadcast
-
-
-def gemm_acc_4x4_int8_int8_int32(dtype):
-    """
-    Int8 4x4 matrix multiplication and accumulation using sdot/udot
-    instructions. This function takes two arrays of int8 datatype
-    -- A[4][4] and B[4][4] and produces a 4x4 matrix
-    which is equal to A*B'.
-
-    The pseudo code is as follows.
-
-    .. code-block:: c
-
-        void gemm_acc_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 C[4][4]){
-            for (int i = 0; i < 4; i++){
-                for (int j = 0; j < 4; j++){
-                    for (int k = 0; k < 4; k++){
-                        C[i][j] += A[i][k] * B[j][k]
-                    }
-            }
-        }
-
-    Notes:
-        * The tiling strategy is picked to maximize register usage.
-
-    Parameters
-    ----------
-    dtype : str, {"uint8", "int8"}
-        Whether it works on unsigned int or signed int
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Arm TensorIntrin that can be used in tensorizing schedule
-    """
-    assert dtype in ["uint8", "int8"]
-    # This needs to be a variable number of "rows" since TVM
-    # "thinks" I only need to compute one row because of
-    # padding
-    A = te.placeholder((te.var("rows"), 4), dtype, name="A")
-    B = te.placeholder((4, 4), dtype, name="B")
-    dtype_vec = dtype + "x16"
-
-    k = te.reduce_axis((0, 4), name="k")
-    C = te.compute(
-        (te.var("rows"), 4),
-        lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k),
-        name="C",
-    )
-
-    aa_buffer = tvm.tir.decl_buffer(
-        A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1]
-    )
-    bb_buffer = tvm.tir.decl_buffer(
-        B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1]
-    )
-    cc_buffer = tvm.tir.decl_buffer(
-        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1]
-    )
-
-    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot"
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                for i in range(0, 4):
-                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4")))
-                return ib.get()
-            # Load all the elements of tile A.
-            # vec_a = [a, b, c, d,
-            #          e, f, g, h,
-            #          l, m, n, o,
-            #          p, q, r, s];
-            vec_a = ins[0].vload([0, 0], dtype_vec)
-
-            # Replicate 4 times the i-th row of A. For instance,
-            # vec_a[0] = [a, b, c, d,
-            #             a, b, c, d,
-            #             a, b, c, d,
-            #             a, b, c, d,];
-            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
-
-            # Load all the elements of B. Remember that B
-            # is transposed:
-            # vec_b = [0, 4, 8, 12,
-            #          1, 5, 9, 13,
-            #          2, 6, 10, 14,
-            #          3, 7, 11, 15,];
-            vec_b = ins[1].vload([0, 0], dtype_vec)
-
-            # Execute the dot product
-            for i in range(0, 4):
-                vec_c = outs[0].vload([i, 0], "int32x4")
-                # Compute the product between the i-th row of A
-                # and all the rows of B. Remember that sdot/udot
-                # subdive the input vectors in 16 elements
-                # and then take the dot product among each group.
-                # The result is stored in a int32x4 register
-                #
-                # For instance, for i=0, we have:
-                # sdot(vec_aa[0], vec_b) = [a*0+b*4+c*8+d*12,
-                #                           a*1+b*5+c*9+d*13,
-                #                           a*2+b*6+c*10+d*14,
-                #                           a*3+b*7+c*11+d*15]
-                vdot = tvm.tir.call_llvm_intrin(
-                    "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa[i]
-                )
-
-                # Store the result
-                ib.emit(outs[0].vstore([i, 0], vdot))
-
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={A: aa_buffer, B: bb_buffer, C: cc_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def gemm_acc_nx16_int8_int8_int32(dtype, rows):
-    """
-    Int8 nx16 matrix multiplication and accumulation using sdot/udot instructions
-    This function takes two arrays of int8 datatype -- A[n][4] and
-    B[4][16] and produces a rowsx16 matrix which is equal to A*B'
-    The pseudo code is as follows.
-
-    .. code-block:: c
-
-        void mmla_nx16_int8_int8_int32(int8 A[n][16], int8 B[4][16][4], int32 output[n][16]){
-            for (int i = 0; i < n; i++){
-                for (int j = 0; j < 16; j++){
-                    for (int k = 0; k < 16; k++){
-                        out[i][j] += A[i][k] * B[k//4][j][k%4]
-                    }
-                }
-            }
-        }
-
-    Notes:
-        * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16
-          we need 4 tiles of B to compute a single row of the output. The first 4 values of
-          k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on
-        * The tiling strategy is picked to maximize register usage.
-
-    Parameters
-    ----------
-    dtype : str, {"uint8", "int8"}
-        Whether it works on unsigned int or signed int
-    rows : int
-        Number of the output rows "n"
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Arm TensorIntrin that can be used in tensorizing schedule
-    """
-    assert dtype in ["uint8", "int8"]
-    A = te.placeholder((rows, 16), dtype, name="A")
-    B = te.placeholder((4, 16, 4), dtype, name="B")
-    dtype_vec = dtype + "x16"
-    idxm = tvm.tir.indexmod
-    k = te.reduce_axis((0, 16), name="k")
-    C = te.compute(
-        (rows, 16),
-        lambda i, j: te.sum(
-            A[i, k].astype("int32") * B[k // 4, j, idxm(k, 4)].astype("int32"), axis=k
-        ),
-        name="C",
-    )
-
-    aa_buffer = tvm.tir.decl_buffer(
-        A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1]
-    )
-    bb_buffer = tvm.tir.decl_buffer(
-        B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb0"), te.var("sb1"), 1]
-    )
-    cc_buffer = tvm.tir.decl_buffer(
-        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1]
-    )
-
-    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot"
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                for i in range(0, rows):
-                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16")))
-                return ib.get()
-            # Iterate on the number of rows of the output
-            for k in range(0, rows):
-                # Load 16 elements of A
-                # vec_a = [a, b, c, d, e, f, g, h, l, m, n, o, p, q, r, s];
-                vec_a = ins[0].vload([k, 0], dtype_vec)
-
-                # Iterate over each of the 4 rowsx4 tiles of the output
-                for j in range(0, 4):
-                    # Accumulate over each of the 4 (16x4) tiles contained in B
-                    for i in range(0, 4):
-                        # Replicate a single 4-element group of A (A[k, i:i+4])
-                        vec_aa = select_word(vec_a, i, dtype_vec)
-
-                        # Load 4 rows (each rows with 4 elements) from B (B[i:i+4, j:j+4])
-                        # vec_b = [0, 16, 32, 48,
-                        #          1, 17, 33, 49,
-                        #          2, 18, 34, 50,
-                        #          3, 19, 35, 51,];
-                        vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec)
-
-                        # Accumulate in the correct part of the output
-                        vec_c = outs[0].vload([k, 4 * j], "int32x4")
-
-                        # Compute the dot product between the rowsx4 tile
-                        # from A and the 4x4 tile from B
-                        #
-                        # For instance, for i=0, we have:
-                        # sdot(vec_aa[0], vec_b) = [a*0+b*16+c*32+d*48,
-                        #                           a*1+b*17+c*33+d*49,
-                        #                           a*2+b*18+c*34+d*50,
-                        #                           a*3+b*19+c*35+d*51]
-                        vdot = tvm.tir.call_llvm_intrin(
-                            "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa
-                        )
-                        ib.emit(outs[0].vstore([k, 4 * j], vdot))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={A: aa_buffer, B: bb_buffer, C: cc_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def smlal_int16_int32():
-    """
-    Intrinsic to be used in order to load two int16x8 vectors and multiply
-    them together through a pair of smlal/smlal2 instructions. The pseudo-code
-    for the algorithm is as follows:
-
-        vec_a = vload(A, "int16x8")
-        vec_b = vload(B, "int16x8")
-
-        vec_c[0:4] += vec_a[0:4]*vec_b[0:4] //  -> smlal instruction
-        vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
-
-    So we load a single int16x8 vector and we accumulate its lower (0:4) and
-    higher part separately.
-    """
-    int16_lanes = 8
-    A = te.placeholder((int16_lanes,), dtype="int16", name="A")
-    B = te.placeholder((int16_lanes, 1), dtype="int16", name="B")
-    C = te.compute(
-        (int16_lanes,), lambda i: A[i].astype("int32") * B[i, 0].astype("int32"), name="C"
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(
-        B.shape, dtype="int16", name="b_buffer", offset_factor=1, strides=[te.var("sb"), 1]
-    )
-    c_buffer = tvm.tir.decl_buffer(
-        C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[1]
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8")))
-                return ib.get()
-
-            vec_a = ins[0].vload([0], "int16x8")
-            vec_b = ins[1].vload([0, 0], "int16x8")
-            inst = "llvm.aarch64.neon.smull"
-
-            # Higher part of the vector
-            vec_c_h = outs[0].vload([4], "int32x4")
-            vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a)
-            vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b)
-            vmull_h = tvm.tir.call_llvm_pure_intrin(
-                "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_h, vec_b_h
-            )
-            vec_out_h = vec_c_h + vmull_h
-
-            # Lower part of the vector
-            vec_c_l = outs[0].vload([0], "int32x4")
-            vec_a_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_a)
-            vec_b_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_b)
-            vmull_l = tvm.tir.call_llvm_pure_intrin(
-                "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_l, vec_b_l
-            )
-            vec_out_l = vec_c_l + vmull_l
-
-            # Combine higher and lower part in a single int32x8 vector to store
-            # (this will require two different store instructions, since the
-            # length of a NEON vector is fixed at 128
-            vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h)
-            ib.emit(outs[0].vstore(0, vec_out))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={A: a_buffer, B: b_buffer, C: c_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def gemm_acc_2x2_int8_int8_int32(dtype):
-    """
-    Int8 2x2 matrix multiplication using smmla/ummla instructions
-    This function takes two arrays of int8 datatype -- A[2][8] and
-    B[2][8] and produces a 2x2 matrix which is equal to A*B'
-    The pseudo code is as follows.
-
-    .. code-block:: c
-
-        void mmla_2x2_int8_int8_int32(int8 A[2][8], int8 B[2][8], int32 C[2][2]){
-            for (int i = 0; i < 2; i++){
-                for (int j = 0; j < 2; j++){
-                    for (int k = 0; k < 8; k++){
-                        C[i][j] += A[i][k] * B[j][k]
-                    }
-            }
-        }
-
-    Parameters
-    ----------
-    dtype : str, {"uint8", "int8"}
-        Whether it works on unsigned int or signed int
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Arm TensorIntrin that can be used in tensorizing schedule
-    """
-    assert dtype in ["uint8", "int8"]
-    A = te.placeholder((2, 8), dtype, name="A")
-    B = te.placeholder((2, 8), dtype, name="B")
-    dtype_vec = dtype + "x16"
-
-    k = te.reduce_axis((0, 8), name="k")
-    C = te.compute(
-        (2, 2),
-        lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k),
-        name="C",
-    )
-
-    aa_buffer = tvm.tir.decl_buffer(
-        A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1]
-    )
-    bb_buffer = tvm.tir.decl_buffer(
-        B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1]
-    )
-    cc_buffer = tvm.tir.decl_buffer(
-        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1]
-    )
-
-    llvm_intrin = "llvm.aarch64.neon.smmla" if dtype == "int8" else "llvm.aarch64.neon.ummla"
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore([0, 0], tvm.tir.const(0, "int32x4")))
-                return ib.get()
-            # Load in vec_a the two rows of A
-            # vec_a = [a, b, c, d, e, f, g, h;
-            #          i, j, k, l, m, n, o, p,]
-            vec_a = ins[0].vload([0, 0], dtype_vec)
-            # Load in vec_b the two rows of B
-            # vec_b = [0, 2, 4, 6, 8, 10, 12, 14;
-            #          1, 3, 5, 7, 9, 11, 13, 14,]
-            vec_b = ins[1].vload([0, 0], dtype_vec)
-
-            # Execute the matrix multiplication via (s/u)mmla:
-            # vec_c = [a*0 + b*2 + c*4 + d*6 +e*8 + f*10 + g*12 + h*14;
-            #          a*1 + b*3 + c*5 + d*7 +e*9 + f*11 + g*13 + h*15;
-            #          i*0 + j*2 + k*4 + l*6 +m*8 + n*10 + o*12 + p*14;
-            #          i*1 + j*3 + k*5 + l*7 +m*9 + n*11 + o*13 + p*15]
-            vec_c = outs[0].vload([0, 0], "int32x4")
-            vmmla = tvm.tir.call_llvm_intrin(
-                "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_a, vec_b
-            )
-            # Store the result
-            ib.emit(outs[0].vstore([0, 0], vmmla))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={A: aa_buffer, B: bb_buffer, C: cc_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def _q_multiply_shift_arm(op):
-    """
-    Implementation of q_multiply_shift_arm through arm intrinsics
-    sqrdmulh and srshl when q == 31.
-
-    Please note that this is introducing a small round-up error for
-    some corner cases. This is because we are rounding twice instead
-    than only once. I.e.:
-
-        * original q_multiply_shift: round(x*y*2^-s)
-        * arm q_multiply_shift: round(round(x*y)*2^-s)
-    """
-    x = op.args[0]
-    y = op.args[1]
-    q = op.args[2]
-    s = op.args[3]
-
-    # Don't use this intrinsic if we don't have a int32x4 vector
-    # or if we are not multiplying q31 numbers
-    if x.dtype != "int32x4" or q.value != 31:
-        return op
-
-    # Case 1, shift is negative
-    sqrdmulh = tvm.tir.call_llvm_intrin(
-        op.dtype, "llvm.aarch64.neon.sqrdmulh", tvm.tir.const(2, "uint32"), x, y
-    )
-
-    fixup = (sqrdmulh & (-s)) >> 31
-    fixed_up_x = sqrdmulh + fixup
-    out_1 = tvm.tir.call_llvm_intrin(
-        op.dtype, "llvm.aarch64.neon.srshl", tvm.tir.const(2, "uint32"), sqrdmulh, s
-    )
-
-    # Case 2, shift is positive
-    x = x * (1 << (s))
-    out_2 = tvm.tir.call_llvm_intrin(
-        op.dtype, "llvm.aarch64.neon.sqrdmulh", tvm.tir.const(2, "uint32"), x, y
-    )
-
-    # Select depending on the shift
-    return tvm.tir.Select(s < 0, out_1, out_2)
-
-
-register_intrin_lowering(
-    "tir.q_multiply_shift", target="llvm.aarch64", f=_q_multiply_shift_arm, level=99
-)
diff --git a/python/tvm/topi/bifrost/__init__.py b/python/tvm/topi/bifrost/__init__.py
deleted file mode 100644
index bd875ec09a19..000000000000
--- a/python/tvm/topi/bifrost/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""ARM Mali GPU specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-from .gemm import *
-from .conv2d import *
-from .dense import *
-from .depthwise_conv2d import *
diff --git a/python/tvm/topi/bifrost/conv2d.py b/python/tvm/topi/bifrost/conv2d.py
deleted file mode 100644
index 30d39b476946..000000000000
--- a/python/tvm/topi/bifrost/conv2d.py
+++ /dev/null
@@ -1,552 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""conv2d schedule on ARM Mali (Bifrost) GPU"""
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-
-from .gemm import decl_winograd_gemm, schedule_gemm
-from .transforms import tile_and_bind, tile_and_bind3d
-from ..utils import traverse_inline, get_const_int, get_const_tuple
-from .. import nn
-from ..nn.winograd_util import winograd_transform_matrices
-
-# reuse some compute declarations from ARM CPU
-from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw
-
-
-@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.bifrost")
-def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """TOPI compute callback for conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    kernel : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
-        pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height,
-        filter_width, num_filter_block]
-
-    strides : list of two ints
-        [stride_height, stride_width]
-
-    padding : list of two ints
-        [pad_height, pad_width]
-
-    dilation : list of two ints
-        [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    return conv2d_spatial_pack_nchw(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=3
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.bifrost")
-def schedule_conv2d_nchw_spatial_pack(cfg, outs):
-    """TOPI schedule callback for conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The configuration of this template
-    outs: Array of Tensor
-        The computation graph description of convolution2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d
-    """
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        # schedule conv2d
-        if "spatial_conv2d_output" in op.tag:
-            output = op.output(0)
-            conv = op.input_tensors[0]
-
-            data_vec = conv.op.input_tensors[0]
-            data_pad = data_vec.op.input_tensors[0]
-            s[data_pad].compute_inline()
-
-            kernel_vec = conv.op.input_tensors[1]
-            if kernel_vec.op.name == "kernel_vec":
-                kernel = kernel_vec.op.input_tensors[0]
-            else:
-                kernel = kernel_vec
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
-    """schedule the spatial packing for conv2d"""
-    data = s[data_vec].op.input_tensors[0]
-
-    max_unroll = 16
-    vec_size = [1, 2, 4, 8, 16]
-    # get tunable parameters (they are defined in compute)
-    BC, TC, VC = cfg["tile_co"].size
-    BH, TH, VH = cfg["tile_oh"].size
-    BW, TW, VW = cfg["tile_ow"].size
-
-    # schedule padding
-    if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-        data_pad = data
-        s[data_pad].compute_inline()
-
-    # schedule data packing
-    if isinstance(data_vec.op, te.tensor.ComputeOp) and data_vec.op.name == "data_vec_undilated":
-        _, h, w, ci, _, _, vh, vw = s[data_vec].op.axis
-    else:
-        _, h, w, ci, vh, vw = s[data_vec].op.axis
-    tile_and_bind3d(s, data_vec, h, w, ci, 1)
-    if vh.dom.extent.value < max_unroll:
-        s[data_vec].unroll(vh)
-    if vw.dom.extent.value < max_unroll:
-        s[data_vec].unroll(vw)
-
-    if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == "kernel_vec":
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
-            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
-            fused, vec = s[kernel_vec].split(fused, VC)
-            bb, tt = s[kernel_vec].split(fused, max_threads)
-            s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
-            s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
-            if VC in vec_size:
-                s[kernel_vec].vectorize(vec)
-
-    # schedule convolution
-    n, c, h, w, vh, vw, vc = s[conv].op.axis
-    kc, kh, kw = s[conv].op.reduce_axis
-
-    cfg["reorder_0"].apply(s, conv, [n, c, h, w, kc, kh, kw, vh, vw, vc])
-    tile_and_bind3d(s, conv, c, h, w, TC, TH, TW)
-
-    cfg["ann_reduce"].apply(
-        s,
-        conv,
-        [kh, kw],
-        axis_lens=[get_const_int(kernel_vec.shape[2]), get_const_int(kernel_vec.shape[3])],
-        max_unroll=max_unroll,
-    )
-
-    cfg["ann_spatial"].apply(
-        s,
-        conv,
-        [vh, vw, vc],
-        axis_lens=[VH, VW, VC],
-        max_unroll=max_unroll,
-        vec_size=vec_size,
-        cfg=cfg,
-    )
-
-    # schedule output
-    if output.op not in s.outputs:  # has bias
-        s[output].compute_inline()
-        output = s.outputs[0]
-
-    _, co, oh, ow = s[output].op.axis
-    tile_and_bind3d(s, output, co, oh, ow, TC, TH, TW)
-
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd.bifrost")
-def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Use Winograd as the convolution method"""
-    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd.bifrost")
-def schedule_conv2d_nchw_winograd(cfg, outs):
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "winograd_conv2d_output" in op.tag:
-            _schedule_winograd(cfg, s, op)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _decl_winograd_kernel_transform(kernel, tile_size, G):
-    """Declare a Winograd kernel transform
-    This exists separately to allow for precomputation
-    The precomputation will most often happen on CPU
-
-    Parameters
-    ----------
-    kernel : tvm.te.Tensor
-        The kernel to transform
-
-    tile_size : int
-        The size of the tile to use for the Winograd filter
-
-    Returns
-    -------
-    U : tvm.te.Tensor
-        Transformed kernel
-
-    """
-    CO, CI, KH, KW = [get_const_int(x) for x in kernel.shape]
-    # Only support 32 bit floats
-    out_dtype = "float32"
-
-    alpha = G.shape[0]
-    K = CO
-    C = CI
-
-    def upround(x, align):
-        return (x + align - 1) // align * align
-
-    ALIGN = 16
-    K_round = upround(K, ALIGN)
-
-    # Padded Kernel [K_round, C, KH, KW]
-    # Pad the number of kernels to multiple of ALIGN
-    padded_kernel = te.compute(
-        (K_round, C, KH, KW),
-        lambda k, c, h, w: tvm.tir.if_then_else(
-            k < K, kernel[k][c][h][w], tvm.tir.const(0, out_dtype)
-        ),
-        name="padded_kernel",
-    )
-
-    # U [alpha, alpha, K_round, C]
-    # Perform the kernel transform
-    r_kh = te.reduce_axis((0, KH), "r_kh")
-    r_kw = te.reduce_axis((0, KW), "r_kw")
-    U = te.compute(
-        (alpha, alpha, K_round, C),
-        lambda eps, nu, k, c: te.sum(
-            padded_kernel[k][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
-        ),
-        name="U",
-    )
-
-    return U
-
-
-def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size=2):
-    """Declare a winograd convolution - only tile_size=2 is currently supported"""
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    if int(kernel.shape[2]) == 3:
-        if dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
-        pre_computed = False
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-    else:
-        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
-        pre_computed = True
-        H_CAT, W_CAT, CO, CI = get_const_tuple(kernel.shape)
-        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
-
-    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
-    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
-
-    K = CO
-    C = CI
-    H = (IH + pt + pb - 3) // HSTR + 1
-    W = (IW + pl + pr - 3) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-
-    def upround(x, align):
-        return (x + align - 1) // align * align
-
-    ALIGN = 16
-    P_round = upround(P, ALIGN)
-    K_round = upround(K, ALIGN)
-
-    # CONFIG
-
-    cfg.define_knob("data_transform_wgx", [1, 2, 4, 8, 16, 32, 64])
-    cfg.define_knob("data_transform_wgy", [1, 2, 4, 8, 16, 32, 64])
-
-    # Pack input tile
-    input_tile = te.compute((N, C, H + 2, W + 2), lambda n, c, h, w: data_pad[n][c][h][w], name="d")
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        VC = cfg["tile_k"].size[-1]
-        kvshape = (KH + tile_size - 1, KW + tile_size - 1, tvm.tir.indexdiv(CO, VC), CI, VC)
-        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
-    else:
-        if pre_computed:
-            U = kernel
-        else:
-            U = _decl_winograd_kernel_transform(kernel, tile_size, G)
-
-    # V [alpha * alpha, C, P_round)
-    # Perform the image transform
-    r_eps = te.reduce_axis((0, alpha), "r_eps")
-    r_nu = te.reduce_axis((0, alpha), "r_nu")
-    V = te.compute(
-        (alpha * alpha, C, P_round),
-        lambda epsnu, c, b: te.sum(
-            input_tile[b // (nH * nW)][c][b // nW % nH * m + r_eps][b % nW * m + r_nu]
-            * B[r_eps][epsnu // alpha]
-            * B[r_nu][epsnu % alpha],
-            axis=[r_eps, r_nu],
-        ),
-        name="V",
-    )
-
-    # Winograd GEMM is a wrapper around batched GEMM to convert U to a 3D Tensor
-    _, M = decl_winograd_gemm(cfg, U, V)
-
-    # Y [K, P, m, m]
-    # Winograd output transform
-    r_eps = te.reduce_axis((0, alpha), "r_eps")
-    r_nu = te.reduce_axis((0, alpha), "r_nu")
-    Y = te.compute(
-        (K, P, m, m),
-        lambda k, b, vh, vw: te.sum(
-            M[r_eps * alpha + r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu]
-        ),
-        name="Y",
-    )
-
-    # Output [N, K, H, W]
-    # Unpack back to NCHW format
-    # The last term ensures alignment is not lost to bound inference
-    output = te.compute(
-        (N, K, H, W),
-        lambda n, k, h, w: Y[k][n * nH * nW + (h // m) * nW + w // m][h % m][w % m]
-        + tvm.tir.const(0, out_dtype) * M[(alpha * alpha) - 1][K_round - 1][P_round - 1],
-        name="output",
-        tag="winograd_conv2d_output",
-    )
-
-    return output
-
-
-def _schedule_winograd(cfg, s, op):
-    """Schedule Winograd convolution for Bifrost"""
-
-    # Get ops and tensors
-    output = op.output(0)
-
-    Y = op.input_tensors[0]
-    M, A = s[Y].op.input_tensors
-    U_3D, V = s[M].op.input_tensors
-    U = s[U_3D].op.input_tensors[0]
-    d, B = s[V].op.input_tensors
-    data_pad = s[d].op.input_tensors[0]
-
-    if isinstance(U.op, tvm.te.ComputeOp):
-        padded_kernel, G = s[U].op.input_tensors
-        kernel = s[padded_kernel].op.input_tensors[0]
-        s[G].compute_inline()
-        eps, _, _, _ = s[U].op.axis
-        y, _, _, _ = s[padded_kernel].op.axis
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            # Pad kernel
-            y, x, ky, kx = s[padded_kernel].op.axis
-            s[padded_kernel].unroll(ky)
-            s[padded_kernel].unroll(kx)
-            tile_and_bind(s, padded_kernel, y, x, 1, 8)
-
-            # Transform kernel
-            eps, nu, k, c = s[U].op.axis
-            s[U].reorder(k, c, eps, nu)
-            r_kh, r_kw = s[U].op.reduce_axis
-            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-
-            yo, xo, yi, xi = tile_and_bind(s, U, k, c, 1, 4)
-
-        # Dilation
-        if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-            s[kernel].compute_inline()
-
-    # Pad data
-    s[data_pad].compute_inline()
-
-    # Pack data
-    n, c, h, w = s[d].op.axis
-    w, wi = s[d].split(w, 4)
-    s[d].unroll(wi)
-    b = s[d].fuse(n, c)
-    tile_and_bind3d(s, d, b, h, w, 1, 4, 2)
-
-    # Transform data
-    bIL_d = s.cache_read(d, "local", [V])
-
-    s[B].compute_inline()
-    epsnu, c, b = s[V].op.axis
-    r_eps, r_nu = s[V].op.reduce_axis
-    s[V].reorder(b, c, epsnu, r_nu, r_eps)
-    _ = [s[V].unroll(x) for x in [epsnu, r_eps, r_nu]]
-    yo, xo, yi, xi = tile_and_bind(
-        s, V, b, c, cfg["data_transform_wgy"].val, cfg["data_transform_wgx"].val
-    )
-
-    s[bIL_d].compute_at(s[V], xi)
-    n, c, h, w = s[bIL_d].op.axis
-    s[bIL_d].unroll(h)
-    s[bIL_d].vectorize(w)
-
-    # Batched GEMM
-    # Inline the 4D -> 3D tensor transform on the kernel
-    s[U_3D].compute_inline()
-    U_transform, V_transform = schedule_gemm(
-        cfg, s, U_3D, V, M, batched=True, schedule_transforms=True
-    )
-
-    # Inverse transform
-    CR_M = s.cache_read(M, "local", [Y])
-    CW_Y = s.cache_write(Y, "local")
-
-    s[A].compute_inline()
-    k, b, vh, vw = s[Y].op.axis
-    fused = s[Y].fuse(vh, vw)
-    s[Y].vectorize(fused)
-    yo, xo, yi, xi = tile_and_bind(s, Y, k, b, 1, 4)
-
-    s[CR_M].compute_at(s[Y], xi)
-    k, b, epsnu = s[CR_M].op.axis
-    s[CR_M].unroll(k)
-
-    s[CW_Y].compute_at(s[Y], xi)
-    k, b, vh, vw = s[CW_Y].op.axis
-    r_eps, r_nu = s[CW_Y].op.reduce_axis
-    _ = [s[CW_Y].unroll(x) for x in [vh, vw, r_eps, r_nu]]
-
-    # Schedule output and fusion
-    if output.op not in s.outputs:
-        s[output].compute_inline()
-        output = s.outputs[0]
-
-    _, k, h, w = s[output].op.axis
-    tile_and_bind3d(s, output, k, h, w, 1, 2, 2)
-
-
-##### REGISTER ALTER OP LAYOUT #####
-@nn.conv2d_alter_layout.register("bifrost")
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-
-    _, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is None:
-        # The best implementation is not an AutoTVM template,
-        # we then assume it's not necessary to alter this op.
-        return None
-    cfg = dispatch_ctx.query(target, workload)
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        return None
-
-    topi_tmpl = workload[0]
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    strides = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data, kernel = tinfos
-    out_dtype = out_type.dtype
-
-    idxd = tvm.tir.indexdiv
-
-    if topi_tmpl == "conv2d_nchw_spatial_pack.bifrost":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        VC = cfg["tile_co"].size[-1]
-
-        new_attrs["kernel_layout"] = f"OIHW{VC}o"
-
-        new_data = data
-        new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_spatial_pack.bifrost",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_nchw_winograd.bifrost":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        tile_size = 2
-
-        weight_expr = inputs[1]
-        weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform(
-            weight_expr, tile_size=tile_size
-        )
-        weight_expr = relay.reshape(
-            weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, CO, CI)
-        )
-
-        new_attrs["tile_size"] = tile_size
-
-        new_data = data
-        new_kernel = te.placeholder((KH + tile_size - 1, KW + tile_size - 1, CO, CI), kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_winograd.bifrost",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight_expr, **new_attrs
-        )
-
-    return None
diff --git a/python/tvm/topi/bifrost/dense.py b/python/tvm/topi/bifrost/dense.py
deleted file mode 100644
index 7e827813ed66..000000000000
--- a/python/tvm/topi/bifrost/dense.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable
-"""dense schedule on ARM Mali Biforst GPU"""
-from tvm import te
-from tvm import autotvm
-
-from .. import nn
-from ..utils import traverse_inline
-
-
-@autotvm.register_topi_compute("dense.bifrost")
-def dense(_, data, weight, bias=None, out_dtype=None):
-    """Dense operator on Biforst"""
-    return nn.dense(data, weight, bias, out_dtype)
-
-
-@autotvm.register_topi_schedule("dense.bifrost")
-def schedule_dense(cfg, outs):
-    """Schedule for dense operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config entity for this template
-    outs: Array of Tensor
-        The computation graph description of dense
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for dense.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "dense":
-            vec_size = [1, 2, 4, 8, 16]
-            max_unroll = 32
-
-            dense_out = op.output(0)
-            output = outs[0]
-
-            y, x = s[output].op.axis
-            c = s[dense_out].op.reduce_axis[0]
-
-            ##### space definition begin #####
-            cfg.define_split("tile_y", y, num_outputs=3)
-            cfg.define_split("tile_x", x, num_outputs=3)
-            cfg.define_split("c_unroll", c, num_outputs=2, max_factor=64)
-
-            # fallback support
-            if cfg.is_fallback:
-                ref_log = autotvm.tophub.load_reference_log("mali", "rk3399", "dense.bifrost")
-                cfg.fallback_with_reference_log(ref_log)
-            ##### space definition end #####
-
-            if dense_out.op in s.outputs:
-                dense_out = s.cache_write(output, "local")
-
-            by, ty, yi = cfg["tile_y"].apply(s, output, y)
-            bx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            s[output].bind(by, te.thread_axis("blockIdx.y"))
-            s[output].bind(bx, te.thread_axis("blockIdx.x"))
-            s[output].bind(ty, te.thread_axis("threadIdx.y"))
-            s[output].bind(tx, te.thread_axis("threadIdx.x"))
-
-            if cfg["tile_y"].size[-1] < max_unroll:
-                s[output].unroll(yi)
-            if cfg["tile_x"].size[-1] in vec_size:
-                s[output].vectorize(xi)
-            s[dense_out].compute_at(s[output], tx)
-
-            k = s[dense_out].op.reduce_axis[0]
-            y, x = s[dense_out].op.axis
-            k, k_unroll = cfg["c_unroll"].apply(s, dense_out, k)
-            s[dense_out].reorder(k, k_unroll, y, x)
-            s[dense_out].unroll(k_unroll)
-            if cfg["tile_y"].size[-1] < max_unroll:
-                s[dense_out].unroll(y)
-            if cfg["tile_x"].size[-1] in vec_size:
-                s[dense_out].vectorize(x)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-    """fuse all the axis and bind to GPU threads"""
-    axis = axis or s[tensor].op.axis
-    fused = s[tensor].fuse(*axis)
-    bx, tx = s[tensor].split(fused, num_thread)
-    s[tensor].bind(bx, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(tx, te.thread_axis("threadIdx.x"))
-    return bx, tx
diff --git a/python/tvm/topi/bifrost/depthwise_conv2d.py b/python/tvm/topi/bifrost/depthwise_conv2d.py
deleted file mode 100644
index 801acd676aa6..000000000000
--- a/python/tvm/topi/bifrost/depthwise_conv2d.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""depthwise_conv2d schedule on ARM Mali GPU"""
-
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-
-from .. import utils
-from .. import tag
-
-
-def schedule_depthwise_conv2d_nchw(outs):
-    """Schedule for depthwise_conv2d nchw forward.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(pad_data, kernel, conv):
-        raw_data = s[pad_data].op.input_tensors[0]
-
-        if conv.op not in s.outputs:  # has bias or relu
-            output = outs[0]
-        else:  # no bias or relu
-            output = conv
-
-        def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-            """tile and bind 3d"""
-            y_factor = y_factor or z_factor
-            x_factor = x_factor or y_factor
-            zo, zi = s[tensor].split(z, z_factor)
-            yo, yi = s[tensor].split(y, y_factor)
-            xo, xi = s[tensor].split(x, x_factor)
-            s[tensor].bind(zo, te.thread_axis("blockIdx.z"))
-            s[tensor].bind(zi, te.thread_axis("threadIdx.z"))
-            s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-            s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
-            s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-            s[tensor].bind(xi, te.thread_axis("threadIdx.x"))
-            return zo, zi, yo, yi, xo, xi
-
-        # set tunable parameters
-        VH = 1
-        VW = 1
-        num_thread = 4
-        while utils.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4:
-            VW = VW * 2
-        while utils.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2:
-            VH = VH * 2
-        if raw_data.dtype == "float16":
-            if utils.get_const_int(conv.shape[3]) % (VW * 2) == 0:
-                VW *= 2
-                num_thread *= 2
-            else:
-                num_thread *= 2
-
-        # schedule padding
-        _, c, y, x = s[pad_data].op.axis
-        tile_and_bind3d(pad_data, c, y, x, num_thread, 1, 1)
-
-        # schedule conv
-        di, dj = s[conv].op.reduce_axis
-        s[conv].unroll(di)
-        s[conv].unroll(dj)
-
-        _, c, y, x = s[output].op.axis
-        y, x, yi, xi = s[output].tile(y, x, VH, VW)
-        s[output].unroll(yi)
-        s[output].vectorize(xi)
-
-        _, _, _, _, _, ji = tile_and_bind3d(output, c, y, x, num_thread, 1, 1)
-
-        if conv.op not in s.outputs:
-            _, c, y, x = s[conv].op.axis
-            y, x, yi, xi = s[conv].tile(y, x, VH, VW)
-            s[conv].unroll(yi)
-            s[conv].vectorize(xi)
-            s[conv].compute_at(s[output], ji)
-
-    def traverse(op):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors:
-                    traverse(tensor.op)
-
-        # schedule depthwise_conv2d
-        if op.tag == "depthwise_conv2d_nchw":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-            conv = op.output(0)
-            _schedule(pad_data, kernel, conv)
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/bifrost/gemm.py b/python/tvm/topi/bifrost/gemm.py
deleted file mode 100644
index 6224493109ef..000000000000
--- a/python/tvm/topi/bifrost/gemm.py
+++ /dev/null
@@ -1,377 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""GEMM schedules for Mali Bifrost"""
-from tvm import te
-
-from .transforms import tile_and_bind, tile_and_bind3d, interleave_transpose, transpose_interleave
-from .. import utils
-
-
-def decl_gemm(cfg, A, B):
-    """Declare a single GEMM computation for Mali Bifrost GPUs
-
-    Parameters
-    ----------
-    cfg : Config
-        Schedule configuration
-
-    A : tvm.te.Tensor
-        2D Tensor, shape [n, k]
-
-    B : tvm.te.Tensor
-        2D Tensor, shape [k, m]
-
-    Returns
-    -------
-    C : tvm.te.Tensor
-        2D Tensor, shape [n, m]
-    """
-
-    cfg.define_knob("work_group_x", [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 64])
-    cfg.define_knob("work_group_y", [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 64])
-    cfg.define_knob("unroll_k_factor", [1, 2, 4])
-    cfg.define_knob("A_interleave", [1, 4, 8, 16, 24, 32, 48, 64])
-    cfg.define_knob("B_interleave", [1, 4, 8, 16, 32])
-    cfg.define_knob("split_k_factor", [1, 4, 16])
-
-    # Mutual k axis must be of equal extent
-    assert utils.get_const_int(A.shape[1]) == utils.get_const_int(B.shape[0])
-    n = A.shape[0]
-    m = B.shape[1]
-    k_size = utils.get_const_int(A.shape[1])
-    unroll_gemm = cfg["split_k_factor"].val
-    if unroll_gemm == 1:
-        # No unrolling case must have the same set of tensors to keep scheduling consistent
-        # Create identity tensors to take the place of A_unrolled, B_unrolled and R
-        A_unrolled = te.compute((n, k_size), lambda i, j: A[i, j], name="A_unrolled")
-        B_unrolled = te.compute((k_size, m), lambda i, j: B[i, j], name="B_unrolled")
-
-        # Declare standard GEMM
-        k = te.reduce_axis((0, A.shape[1]), name="k")
-        C = te.compute(
-            (n, m), lambda i, j: te.sum(A_unrolled[i, k] * B_unrolled[k, j], axis=k), name="C"
-        )
-
-        R = te.compute((n, m), lambda i, j: C[i, j], name="R")
-
-    else:
-        unrolled_k_size = k_size // unroll_gemm
-
-        # Unroll the two input matrices along the shared k axis
-        A_unrolled = te.compute(
-            (unroll_gemm, n, unrolled_k_size),
-            lambda b, i, j: A[i][unrolled_k_size * b + j],
-            name="A_unrolled",
-        )
-
-        B_unrolled = te.compute(
-            (unroll_gemm, unrolled_k_size, m),
-            lambda b, i, j: B[unrolled_k_size * b + i][j],
-            name="B_unrolled",
-        )
-
-        # Declare a batched GEMM
-        k = te.reduce_axis((0, unrolled_k_size), name="k")
-        C = te.compute(
-            (unroll_gemm, n, m),
-            lambda b, i, j: te.sum(A_unrolled[b][i][k] * B_unrolled[b][k][j], axis=k),
-            name="C",
-        )
-
-        # Then declare a reduction to reduce the sub matrices
-        k = te.reduce_axis((0, unroll_gemm), name="k")
-        R = te.compute((n, m), lambda i, j: te.sum(C[k][i][j], axis=k), name="R")
-
-    return R
-
-
-def decl_batched_gemm(cfg, A, B):
-    """Declare a batched GEMM computation for Mali Bifrost GPUs
-    Parameters
-    ----------
-    cfg : Config
-        Schedule configuration
-
-    A : tvm.te.Tensor
-        3D Tensor, shape [b, n, k]
-
-    B : tvm.te.Tensor
-        3D Tensor, shape [b, k, m]
-
-    Returns
-    -------
-    C : tvm.te.Tensor
-        3D Tensor, shape [b, n, m]
-
-    """
-    # Mutual b and k axis must be of equal extent
-    assert utils.get_const_int(A.shape[2]) == utils.get_const_int(B.shape[1])
-    assert utils.get_const_int(A.shape[0]) == utils.get_const_int(B.shape[0])
-
-    cfg.define_knob("work_group_x", [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 64])
-    cfg.define_knob("work_group_y", [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 64])
-    cfg.define_knob("unroll_k_factor", [1, 2, 4])
-    cfg.define_knob("A_interleave", [1, 4, 8, 16, 32, 64])
-    cfg.define_knob("B_interleave", [1, 4, 8, 16, 32])
-
-    n = A.shape[1]
-    m = B.shape[2]
-    k_size = utils.get_const_int(A.shape[2])
-    b_size = utils.get_const_int(A.shape[0])
-
-    # Declare a batched GEMM
-    k = te.reduce_axis((0, k_size), name="k")
-    C = te.compute(
-        (b_size, n, m), lambda b, i, j: te.sum(A[b][i][k] * B[b][k][j], axis=k), name="C"
-    )
-
-    return C
-
-
-def decl_winograd_gemm(cfg, A, B):
-    """Declare a winograd GEMM for Mali Bifrost GPUs
-    Winograd uses batched GEMM, however the input tensors are 4D
-    This wraps decl_batched_gemm to provide it with 3D tensors
-
-    Parameters
-    ----------
-    cfg : Config
-        Schedule configuration
-
-    A : tvm.te.Tensor
-        4D Tensor, shape [a, a, n, k]
-
-    B : tvm.te.Tensor
-        4D Tensor, shape [a * a, k, m]
-
-    Returns
-    -------
-
-    """
-    alpha = utils.get_const_int(A.shape[0])
-    n = utils.get_const_int(A.shape[2])
-    k = utils.get_const_int(A.shape[3])
-
-    A_3D = te.compute(
-        (alpha * alpha, n, k), lambda b, i, j: A[b // alpha][b % alpha][i][j], name="A_3D"
-    )
-
-    C = decl_batched_gemm(cfg, A_3D, B)
-    return A_3D, C
-
-
-def schedule_gemm(cfg, s, A, B, C, batched=False, schedule_transforms=True):
-    """Schedule GEMM, single and batched
-
-    Parameters
-    ----------
-    cfg : Config
-        Schedule configuration
-
-    s : tvm.te.schedule.Schedule
-        Operator schedule
-
-    A : tvm.te.Tensor
-        2D/3D Tensor, shape [n, k]/[b, n, k]
-
-    B : tvm.te.Tensor
-        2D/3D Tensor, shape [k, m]/[b, k, m]
-
-    C : tvm.te.Tensor
-        2D/3D Tensor, shape [n, m]/[b, n, m]
-
-    batched : bool
-        Whether the GEMM is batched
-
-    Returns
-    -------
-
-    """
-    block_size_x = 4
-    block_size_y = 4
-    warp_size_x = 2
-    warp_size_y = 2
-
-    work_group_x = cfg["work_group_x"].val
-    work_group_y = cfg["work_group_y"].val
-    k_unroll = cfg["unroll_k_factor"].val
-
-    if not batched:
-        y_index, x_index = (0, 1)
-    else:
-        y_index, x_index = (1, 2)
-
-    trans_inter, A_transposed_interleaved = transpose_interleave(
-        s, A, cfg["A_interleave"].val, y_index, x_index, [C], batched=batched
-    )
-    inter_trans, B_interleaved_transposed = interleave_transpose(
-        s, B, cfg["B_interleave"].val, y_index, x_index, [C], batched=batched
-    )
-
-    if schedule_transforms:
-        # Schedule A
-        y, x = s[trans_inter].op.axis
-        y, x, yi, xi = s[trans_inter].tile(y, x, 1, 8)
-        s[trans_inter].unroll(yi)
-        s[trans_inter].unroll(xi)
-        tile_and_bind(s, trans_inter, y, x, 1, 4)
-
-        # Schedule B
-        y, x = s[inter_trans].op.axis
-        xo, xi = s[inter_trans].split(x, 4)
-        s[inter_trans].vectorize(xi)
-        tile_and_bind(s, inter_trans, y, xo, 4, 4)
-
-    # Schedule C
-    CR_A = s.cache_read(A_transposed_interleaved, "local", [C])
-    CR_B = s.cache_read(B_interleaved_transposed, "local", [C])
-    CW_C = s.cache_write(C, "local")
-
-    if not batched:
-        y, x = s[C].op.axis
-    else:
-        z, y, x = s[C].op.axis
-    y, x, yt, xt = s[C].tile(y, x, block_size_y, block_size_x)
-    s[C].unroll(yt)
-    s[C].vectorize(xt)
-    # Tile the global work space to generate 'square' warps -> 2x2 for warp size of 4
-    y, x, wy, wx = s[C].tile(y, x, warp_size_y, warp_size_x)
-    x = s[C].fuse(x, wy, wx)
-    if not batched:
-        yo, xo, yi, xi = tile_and_bind(s, C, y, x, work_group_y, work_group_x)
-    else:
-        # For batched GEMM bind batch to z axis
-        zo, yo, xo, zi, yi, xi = tile_and_bind3d(s, C, z, y, x, 1, work_group_y, work_group_x)
-
-    s[CW_C].compute_at(s[C], xi)
-    if not batched:
-        y, x = s[CW_C].op.axis
-    else:
-        _, y, x = s[CW_C].op.axis
-    y, x, yt, xt = s[CW_C].tile(y, x, block_size_y, block_size_x)
-    k = s[CW_C].op.reduce_axis[0]
-    s[CW_C].reorder(k, yt, xt)
-    ko, ki = s[CW_C].split(k, k_unroll)
-    s[CW_C].unroll(ki)
-    s[CW_C].unroll(yt)
-    s[CW_C].unroll(xt)
-
-    if not batched:
-        i, j = s[CR_A].op.axis
-    else:
-        _, i, j = s[CR_A].op.axis
-    s[CR_A].reorder(j, i)
-    s[CR_A].compute_at(s[CW_C], ki)
-    s[CR_A].unroll(j)
-    s[CR_A].vectorize(i)
-
-    if not batched:
-        i, j = s[CR_B].op.axis
-    else:
-        _, i, j = s[CR_B].op.axis
-    s[CR_B].compute_at(s[CW_C], ki)
-    s[CR_B].unroll(i)
-    s[CR_B].vectorize(j)
-
-    return trans_inter, inter_trans
-
-
-def schedule_unrollable_gemm(cfg, s, A, B, C, R):
-    """Schedule a GEMM that can be unrolled by a constant factor
-    along its inner dimension
-
-    Parameters
-    ----------
-    cfg : Config
-        Schedule configuration
-
-    s : tvm.te.schedule.Schedule
-        Operator schedule
-
-    A : tvm.te.Tensor
-        2D/3D Tensor, shape [n, k]/[b, n, k]
-
-    B : tvm.te.Tensor
-        2D/3D Tensor, shape [k, m]/[b, k, m]
-
-    C : tvm.te.Tensor
-        2D/3D Tensor, shape [n, m]/[b, n, m]
-
-    R : tvm.te.Tensor
-        2D Tensor, shape [n, m]
-
-    """
-    # If the GEMM is 2D, no unrolling has taken place
-    # Use non-batched GEMM schedule and inline identity matrices A, B and R
-    if len(C.op.axis) == 2:
-        s[A].compute_inline()
-        s[B].compute_inline()
-        schedule_gemm(cfg, s, A, B, C)
-        s[R].compute_inline()
-
-    # GEMM is 3D, use batched GEMM schedule, inline A and B and schedule R
-    else:
-        s[A].compute_inline()
-        s[B].compute_inline()
-        schedule_gemm(cfg, s, A, B, C, batched=True)
-
-        CR_C = s.cache_read(C, "local", [R])
-
-        y, x = s[R].op.axis
-        xo, xi = s[R].split(x, 4)
-        k = s[R].op.reduce_axis[0]
-        s[R].reorder(k, xi)
-        ko, ki = s[R].split(k, 4)
-        s[R].unroll(xi)
-        s[R].unroll(ki)
-        tile_and_bind(s, R, y, xo, 1, 2)
-
-        s[CR_C].compute_at(s[R], ko)
-        _, y, x = s[CR_C].op.axis
-        s[CR_C].unroll(y)
-        s[CR_C].vectorize(x)
-
-
-def get_unrollable_gemm_ops(R):
-    """Get all GEMM operators from the final reduction
-    This is a helper function to more easily get all the GEMM operations
-    from an operator
-
-    Parameters
-    ----------
-    R : tvm.te.Tensor
-        Reduced tensor, final stage of GEMM
-
-    Returns
-    -------
-    A_unrolled : tvm.te.Tensor
-        Matrix A unrolled along k
-
-    B_unrolled: tvm.te.Tensor
-        Matrix B unrolled along k
-
-    C : tvm.te.Tensor
-        Result of batched GEMM
-
-    R : tvm.te.Tensor
-        Reduction of C, result of unrollable GEMM
-
-    """
-    C = R.op.input_tensors[0]
-    A_unrolled, B_unrolled = C.op.input_tensors
-    return A_unrolled, B_unrolled, C, R
diff --git a/python/tvm/topi/bifrost/transforms.py b/python/tvm/topi/bifrost/transforms.py
deleted file mode 100644
index 6a39f195dace..000000000000
--- a/python/tvm/topi/bifrost/transforms.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Utility scheduling functions for the Bifrost schedules"""
-
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-
-
-def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-    """Fuse all the axis and bind to GPU threads"""
-    axis = axis or s[tensor].op.axis
-    fused = s[tensor].fuse(*axis)
-    max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-    bx, tx = s[tensor].split(fused, num_thread or max_threads)
-    s[tensor].bind(bx, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(tx, te.thread_axis("threadIdx.x"))
-    return bx, tx
-
-
-def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
-    """Tile and bind to GPU threads"""
-    x_factor = x_factor or y_factor
-    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
-    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, te.thread_axis("threadIdx.x"))
-    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
-    return yo, xo, yi, xi
-
-
-def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-    """Tile and bind 3d"""
-    y_factor = y_factor or z_factor
-    x_factor = x_factor or y_factor
-    zo, zi = s[tensor].split(z, z_factor)
-    yo, yi = s[tensor].split(y, y_factor)
-    xo, xi = s[tensor].split(x, x_factor)
-    s[tensor].bind(zo, te.thread_axis("blockIdx.z"))
-    s[tensor].bind(zi, te.thread_axis("threadIdx.z"))
-    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
-    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, te.thread_axis("threadIdx.x"))
-    return zo, yo, xo, zi, yi, xi
-
-
-def pack_tensor(s, tensor, factor, readers):
-    """Do transform X[n, m] -> X[n / factor, m, factor]"""
-    tmp = s.cache_read(tensor, "global", readers)
-    y, x = s[tmp].op.axis
-    yo, yi = s[tmp].split(y, factor)
-    s[tmp].reorder(yo, x, yi)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, "global"), tmp
-
-
-def transpose(s, tensor, y_index, x_index, readers):
-    """Do transform X[n, m] -> X[m, n]"""
-    tmp = s.cache_read(tensor, "global", readers)
-    y, x = s[tmp].op.axis[y_index], s[tmp].op.axis[x_index]
-    s[tmp].reorder(x, y)
-    s[tmp].compute_inline()
-    A_transpose = s.cache_write(tmp, "global")
-
-    CR_A = s.cache_read(tensor, "local", [A_transpose])
-    CW_A_transpose = s.cache_write(A_transpose, "local")
-
-    y, x = s[A_transpose].op.axis[y_index], s[A_transpose].op.axis[x_index]
-    yo, xo, yi, xi = s[A_transpose].tile(y, x, 4, 4)
-    s[A_transpose].unroll(yi)
-    s[A_transpose].vectorize(xi)
-    _, _, _, xi = tile_and_bind(s, A_transpose, yo, xo, 32, 2)
-
-    s[CW_A_transpose].compute_at(s[A_transpose], xi)
-    y, x = s[CW_A_transpose].op.axis[y_index], s[CW_A_transpose].op.axis[x_index]
-    s[CW_A_transpose].unroll(x)
-    s[CW_A_transpose].unroll(y)
-
-    s[CR_A].compute_at(s[A_transpose], xi)
-    y, x = s[CR_A].op.axis[y_index], s[CR_A].op.axis[x_index]
-    s[CR_A].unroll(y)
-    s[CR_A].vectorize(x)
-
-    return tmp
-
-
-def interleave_transpose(s, tensor, width, y_index, x_index, readers, batched=False):
-    """Interleave the tensor, then transpose it"""
-    tmp = s.cache_read(tensor, "global", readers)
-    y, x = s[tmp].op.axis[y_index], s[tmp].op.axis[x_index]
-    xo, xi = s[tmp].split(x, width)
-    s[tmp].reorder(xo, y, xi)
-    s[tmp].fuse(y, xi)
-    if batched:
-        z = s[tmp].op.axis[0]
-        s[tmp].fuse(z, xo)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, "global"), tmp
-
-
-def transpose_interleave(s, tensor, width, y_index, x_index, readers, batched=False):
-    """Transpose the tensor, then interleave it"""
-    tmp = s.cache_read(tensor, "global", readers)
-    y, x = s[tmp].op.axis[y_index], s[tmp].op.axis[x_index]
-    yo, yi = s[tmp].split(y, width)
-    s[tmp].reorder(yo, x, yi)
-    s[tmp].fuse(x, yi)
-    if batched:
-        z = s[tmp].op.axis[0]
-        s[tmp].fuse(z, yo)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, "global"), tmp
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
deleted file mode 100644
index a6ced5bcf9bc..000000000000
--- a/python/tvm/topi/cuda/__init__.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""CUDA specific declaration and schedules."""
-from .conv1d import *
-from .conv1d_transpose_ncw import *
-from .conv2d import *
-from .conv2d_hwcn import *
-from .conv2d_int8 import *
-from .conv2d_winograd import *
-from .conv2d_nhwc_winograd import *
-from .depthwise_conv2d import *
-from .group_conv2d_nchw import *
-from . import conv2d_alter_op
-from .conv2d_transpose import *
-from .conv3d_transpose_ncdhw import *
-from .deformable_conv2d import *
-from .conv3d import *
-from .conv3d_winograd import *
-from . import conv3d_alter_op
-from .reduction import schedule_reduce
-from .softmax import *
-from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
-from .dense import *
-from .pooling import *
-from .nn import schedule_lrn
-from .batch_matmul import *
-from .batch_matmul_tensorcore import *
-from .vision import *
-from .ssd import *
-from .nms import get_valid_counts, non_max_suppression, all_class_non_max_suppression
-from .rcnn import *
-from .scatter import *
-from .scatter_elements import *
-from .sort import *
-from .conv2d_nhwc_tensorcore import *
-from .conv3d_ndhwc_tensorcore import *
-from .dense_tensorcore import *
-from .conv2d_hwnc_tensorcore import *
-from .correlation import *
-from .sparse import *
-from . import tensorcore_alter_op
-from .argwhere import *
-from .scan import *
-from .sparse_reshape import *
-from .transform import *
-from .unique import *
-from .searchsorted import *
-from .signal import *
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
deleted file mode 100644
index cc6c4c26eddb..000000000000
--- a/python/tvm/topi/cuda/argwhere.py
+++ /dev/null
@@ -1,324 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=too-many-arguments, invalid-name
-"""Argwhere operator"""
-
-import logging
-
-import tvm
-from tvm import te
-from .injective import schedule_injective_from_existing
-from .scan import exclusive_scan
-from .. import tag
-from ..utils import ceil_div, prod
-from ..transform import reshape
-from ..broadcast import not_equal
-from ..math import cast
-
-
-logger = logging.getLogger("topi")
-
-fdiv = tvm.tir.floordiv
-fmod = tvm.tir.floormod
-
-
-def compact_nonzero_indices_ir(condition, write_indices, out, do_write_func):
-    """Copy nonzero indices to the corresponding write locations.
-
-    Parameters
-    ----------
-    condition : Buffer
-        The input condition.
-
-    write_indices : Buffer
-        The result of exclusive scan on a boolean array, where True indicates that
-        the condition is non zero at that position.
-
-    out : Buffer
-        The output buffer to copy indices to.
-
-    do_write_func : a function
-        A callback that accepts an output buffer, a dst index to write to, and a src index.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    ib = tvm.tir.ir_builder.create()
-    size_1d = prod(condition.shape)
-
-    condition = ib.buffer_ptr(condition)
-    write_indices = ib.buffer_ptr(write_indices)
-    out = ib.buffer_ptr(out)
-
-    nthread_tx = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_bx = ceil_div(size_1d, nthread_tx)
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-
-    with ib.new_scope():
-        idx = bx * nthread_tx + tx
-        with ib.if_scope(idx < size_1d):
-            with ib.if_scope(condition[idx] != 0):
-                do_write_func(out, write_indices[idx], idx)
-
-    return ib.get()
-
-
-def argwhere_common(output_shape, condition, do_write_func):
-    """A common compute used by argwhere of various ranks.
-
-    Parameters
-    ----------
-    output_shape : list of int or tvm.tir.Any
-        Tensor with output shape info.
-
-    condition : tvm.te.Tensor
-        The input condition.
-
-    do_write_func : a function
-        A callback that accepts an output buffer, a dst index to write to, and a src index.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-
-    flags = not_equal(condition, tvm.tir.const(0))
-    flags_1d = reshape(flags, (prod(flags.shape),))
-    write_indices = exclusive_scan(cast(flags_1d, dtype="int32"))
-
-    condition_buf = tvm.tir.decl_buffer(
-        condition.shape, condition.dtype, "data_buf", data_alignment=8
-    )
-    write_indices_buf = tvm.tir.decl_buffer(
-        write_indices.shape, write_indices.dtype, "write_indices_buf", data_alignment=8
-    )
-    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
-
-    out = te.extern(
-        [output_shape],
-        [condition, write_indices],
-        lambda ins, outs: compact_nonzero_indices_ir(ins[0], ins[1], outs[0], do_write_func),
-        dtype=["int32"],
-        in_buffers=[condition_buf, write_indices_buf],
-        out_buffers=[out_buf],
-        name="argwhere",
-        tag="argwhere_gpu",
-    )
-
-    return out
-
-
-def argwhere_1d(output_shape, condition):
-    """Compute for argwhere 1D
-
-    Parameters
-    ----------
-    condition : list of int or tvm.tir.Any
-        The output shape
-
-    out : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def do_write(out, write_index, idx):
-        out[write_index] = idx
-
-    return argwhere_common(output_shape, condition, do_write)
-
-
-def argwhere_2d(output_shape, condition):
-    """Compute for argwhere 2D
-
-    Parameters
-    ----------
-    condition : list of int or tvm.tir.Any
-        The output shape
-
-    out : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def do_write(out, write_index, idx):
-        a1 = condition.shape[1]
-        out[write_index * 2] = tvm.tir.floordiv(idx, a1)
-        out[write_index * 2 + 1] = tvm.tir.floormod(idx, a1)
-
-    return argwhere_common(output_shape, condition, do_write)
-
-
-def argwhere_3d(output_shape, condition):
-    """Compute for argwhere 3D
-
-    Parameters
-    ----------
-    condition : list of int or tvm.tir.Any
-        The output shape
-
-    out : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def do_write(out, write_index, idx):
-        _, a1, a2 = condition.shape
-        s1 = a1 * a2
-        out[write_index * 3] = fdiv(idx, s1)
-        out[write_index * 3 + 1] = fdiv(fmod(idx, s1), a2)
-        out[write_index * 3 + 2] = fmod(idx, a2)
-
-    return argwhere_common(output_shape, condition, do_write)
-
-
-def argwhere_4d(output_shape, condition):
-    """Compute for argwhere 4D
-
-    Parameters
-    ----------
-    condition : list of int or tvm.tir.Any
-        The output shape
-
-    out : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def do_write(out, write_index, idx):
-        _, a1, a2, a3 = condition.shape
-        s1 = a2 * a3
-        s2 = a1 * s1
-        out[write_index * 4] = fdiv(idx, s2)
-        out[write_index * 4 + 1] = fdiv(fmod(idx, s2), s1)
-        out[write_index * 4 + 2] = fdiv(fmod(idx, s1), a3)
-        out[write_index * 4 + 3] = fmod(idx, a3)
-
-    return argwhere_common(output_shape, condition, do_write)
-
-
-def argwhere_5d(output_shape, condition):
-    """Compute for argwhere 5D
-
-    Parameters
-    ----------
-    condition : list of int or tvm.tir.Any
-        The output shape
-
-    out : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def do_write(out, write_index, idx):
-        _, a1, a2, a3, a4 = condition.shape
-        s1 = a3 * a4
-        s2 = a2 * s1
-        s3 = a1 * s2
-        out[write_index * 5] = fdiv(idx, s3)
-        out[write_index * 5 + 1] = fdiv(fmod(idx, s3), s2)
-        out[write_index * 5 + 2] = fdiv(fmod(idx, s2), s1)
-        out[write_index * 5 + 3] = fdiv(fmod(idx, s1), a4)
-        out[write_index * 5 + 4] = fmod(idx, a4)
-
-    return argwhere_common(output_shape, condition, do_write)
-
-
-def argwhere(output_shape, condition):
-    """Find the indices of elements of a tensor that are non-zero.
-
-    Parameters
-    ----------
-    output_shape : tvm.te.Tensor
-        Tensor with output shape info.
-
-    condition : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    if len(condition.shape) == 1:
-        return argwhere_1d(output_shape.shape, condition)
-    if len(condition.shape) == 2:
-        return argwhere_2d(output_shape.shape, condition)
-    if len(condition.shape) == 3:
-        return argwhere_3d(output_shape.shape, condition)
-    if len(condition.shape) == 4:
-        return argwhere_4d(output_shape.shape, condition)
-    if len(condition.shape) == 5:
-        return argwhere_5d(output_shape.shape, condition)
-    raise ValueError("Argwhere does not support rank higher than 5")
-
-
-def schedule_argwhere(outs):
-    """Schedule for argwhere on cuda.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of argwhere
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for argwhere
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        if tag.is_injective(op.tag):
-            schedule_injective_from_existing(s, op.output(0))
-        for tensor in op.input_tensors:
-            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                traverse(tensor.op)
-        scheduled_ops.append(op)
-
-    for out in outs:
-        traverse(out.op)
-    return s
diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
deleted file mode 100644
index 0a7acfa50444..000000000000
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ /dev/null
@@ -1,424 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument
-"""cuda batch_matmul operators"""
-import tvm
-from tvm import autotvm
-from tvm import te
-from tvm.contrib import cublas
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-from .. import nn, generic
-from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor
-from .tensor_intrin import dp4a
-
-
-@autotvm.register_topi_compute("batch_matmul.cuda")
-def batch_matmul(cfg, x, y, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True):
-    """Compute batch matrix multiplication of `tensor_a` and `tensor_b`.
-
-    Both `tensor_a` and `tensor_b` can be transposed. For legacy reason, we use NT format
-    (transpose_a=False, transpose_b=True) by default.
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        Autotvm tuning space config file.
-
-    tensor_a : tvm.te.Tensor
-        3-D with shape [batch, M, K] or [batch, K, M].
-
-    tensor_b : tvm.te.Tensor
-        3-D with shape [batch, K, N] or [batch, N, K].
-
-    out_shape : List[Optional]
-        Explicit intended output shape of the computation. Can be useful in cases
-        with dynamic input shapes.
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision batch matmul.
-
-    transpose_a : Optional[bool] = False
-        Whether the first tensor is in transposed format.
-
-    transpose_b : Optional[bool] = True
-        Whether the second tensor is in transposed format.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    return nn.batch_matmul(
-        x,
-        y,
-        oshape=out_shape,
-        out_dtype=out_dtype,
-        transpose_a=transpose_a,
-        transpose_b=transpose_b,
-    )
-
-
-@autotvm.register_topi_schedule("batch_matmul.cuda")
-def schedule_batch_matmul(cfg, outs):
-    """Schedule for batch_matmul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of batch_matmul
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(cfg, op):
-        C = op.output(0)
-        A, B = s[C].op.input_tensors
-        if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A:
-            s[B].compute_inline()
-        _, M, N = get_const_tuple(C.shape)
-        AA = s.cache_read(A, "shared", [C])
-        AL = s.cache_read(AA, "local", [C])
-        BB = s.cache_read(B, "shared", [C])
-        BL = s.cache_read(BB, "local", [C])
-        CC = s.cache_write(C, "local")
-        if op not in s.outputs:
-            s[C].compute_inline()
-            C = s.outputs[0].output(0)
-
-        b, y, x = s[C].op.axis
-        (k,) = s[CC].op.reduce_axis
-
-        cfg.define_split("tile_y", y, num_outputs=3)
-        cfg.define_split("tile_x", x, num_outputs=3)
-        cfg.define_split("tile_k", k, num_outputs=2)
-        cfg.define_knob("auto_unroll_max_step", [8, 16, 32, 64])
-        target = tvm.target.Target.current()
-        if target.kind.name in ["nvptx", "rocm"]:
-            # llvm-based backends cannot do non-explicit unrolling
-            cfg.define_knob("unroll_explicit", [1])
-        else:
-            cfg.define_knob("unroll_explicit", [0, 1])
-
-        if cfg.is_fallback:
-            y_bn = get_max_power2_factor(M, 64)
-            x_bn = get_max_power2_factor(N, 64)
-            y_nthreads = min(y_bn, 8)
-            x_nthreads = min(x_bn, 8)
-            cfg["tile_x"] = SplitEntity([-1, x_nthreads, x_bn // x_nthreads])
-            cfg["tile_y"] = SplitEntity([-1, y_nthreads, y_bn // y_nthreads])
-            cfg["tile_k"] = SplitEntity([-1, 8])
-            cfg["auto_unroll_max_step"] = OtherOptionEntity(16)
-
-        by, ty, yi = cfg["tile_y"].apply(s, C, y)
-        bx, tx, xi = cfg["tile_x"].apply(s, C, x)
-
-        thread_x = te.thread_axis("threadIdx.x")
-        thread_y = te.thread_axis("threadIdx.y")
-
-        s[C].reorder(b, by, bx, ty, tx, yi, xi)
-        s[C].bind(b, te.thread_axis("blockIdx.z"))
-        s[C].bind(by, te.thread_axis("blockIdx.y"))
-        s[C].bind(bx, te.thread_axis("blockIdx.x"))
-        s[C].bind(ty, thread_y)
-        s[C].bind(tx, thread_x)
-        s[C].pragma(yi, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-        s[C].pragma(yi, "unroll_explicit", cfg["unroll_explicit"].val)
-
-        s[CC].compute_at(s[C], tx)
-        _, yi, xi = s[CC].op.axis
-        ko, ki = cfg["tile_k"].apply(s, CC, k)
-        s[CC].reorder(ko, ki, yi, xi)
-        s[CC].pragma(ki, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-        s[CC].pragma(ki, "unroll_explicit", cfg["unroll_explicit"].val)
-
-        s[AA].compute_at(s[CC], ko)
-        s[AL].compute_at(s[CC], ki)
-        s[BB].compute_at(s[CC], ko)
-        s[BL].compute_at(s[CC], ki)
-        _, y, k = s[AA].op.axis
-        ty, yi = s[AA].split(y, nparts=cfg["tile_y"].size[1])
-        tx, ki = s[AA].split(k, nparts=cfg["tile_x"].size[1])
-        s[AA].reorder(ty, tx, yi, ki)
-        s[AA].bind(ty, thread_y)
-        s[AA].bind(tx, thread_x)
-        s[AA].pragma(yi, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-        s[AA].pragma(yi, "unroll_explicit", cfg["unroll_explicit"].val)
-
-        _, x, k = s[BB].op.axis
-        ty, xi = s[BB].split(x, nparts=cfg["tile_y"].size[1])
-        tx, ki = s[BB].split(k, nparts=cfg["tile_x"].size[1])
-        s[BB].bind(ty, thread_y)
-        s[BB].bind(tx, thread_x)
-        s[BB].reorder(ty, tx, xi, ki)
-        s[BB].pragma(xi, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-        s[BB].pragma(xi, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    def _callback(op):
-        if "batch_matmul" in op.tag:
-            _schedule(cfg, op)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("batch_matmul_cublas.cuda")
-def batch_matmul_cublas(
-    cfg, x, y, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """Compute batch matrix multiplication of `x` and `y`.
-
-    Both `x` and `y` can be transposed. For legacy reason, we use NT format
-    (transpose_a=False, transpose_b=True) by default.
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        Autotvm tuning space config file.
-
-    x : tvm.te.Tensor
-        3-D with shape [batch, M, K] or [batch, K, M].
-
-    y : tvm.te.Tensor
-        3-D with shape [batch, K, N] or [batch, N, K].
-
-    out_shape : List[Optional]
-        Explicit intended output shape of the computation. Can be useful in cases
-        with dynamic input shapes.
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision batch matmul.
-
-    transpose_a : Optional[bool] = False
-        Whether the first tensor is in transposed format.
-
-    transpose_b : Optional[bool] = True
-        Whether the second tensor is in transposed format.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    if transpose_a:
-        b, k, m = get_const_tuple(x.shape)
-    else:
-        b, m, k = get_const_tuple(x.shape)
-    if transpose_b:
-        b, n, k = get_const_tuple(y.shape)
-    else:
-        b, k, n = get_const_tuple(y.shape)
-    if all([isinstance(s, int) for s in [b, m, n, k]]):
-        cfg.add_flop(b * m * k * n * 2)
-    return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b, dtype=out_dtype)
-
-
-@autotvm.register_topi_schedule("batch_matmul_cublas.cuda")
-def schedule_batch_matmul_cublas(_, outs):
-    """Schedule batch_matmul operator using CUBLAS"""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("batch_matmul_int8.cuda")
-def batch_matmul_int8(
-    cfg, x, y, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """Batch Matmul operator for int8 on CUDA.
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        Autotvm tuning space config file.
-
-    x : tvm.te.Tensor
-        3-D with shape [batch, M, K] or [batch, K, M].
-
-    y : tvm.te.Tensor
-        3-D with shape [batch, K, N] or [batch, N, K].
-
-    out_shape : List[Optional]
-        Explicit intended output shape of the computation. Can be useful in cases
-        with dynamic input shapes.
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision batch matmul.
-
-    transpose_a : Optional[bool] = False
-        Whether the first tensor is in transposed format.
-
-    transpose_b : Optional[bool] = True
-        Whether the second tensor is in transposed format.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    del out_shape
-    # TODO(jcf94): Deal with different transpose combinations
-    assert not transpose_a and transpose_b
-    if out_dtype is None:
-        out_dtype = x.dtype
-
-    x_shape = get_const_tuple(x.shape)
-    y_shape = get_const_tuple(y.shape)
-    assert len(x_shape) == 3 and len(y_shape) == 3, "only support 3-dim batch_matmul"
-
-    XB, M, XK = x.shape
-    YB, N, YK = y.shape
-    assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
-    assert XK == YK, "shapes of x and y is inconsistent"
-
-    nB = tvm.te.max(XB, YB)
-    nK = ((XK + 3) // 4) * 4
-    reduce_k = te.reduce_axis((0, nK), name="k")
-
-    # pad for _dp4a vectorize
-    pad_x = te.compute(
-        (XB, M, nK),
-        lambda b, i, j: tvm.te.if_then_else(j >= XK, tvm.tir.const(0, x.dtype), x[b, i, j]),
-    )
-    pad_y = te.compute(
-        (YB, N, nK),
-        lambda b, i, j: tvm.te.if_then_else(j >= YK, tvm.tir.const(0, y.dtype), y[b, i, j]),
-    )
-
-    out = te.compute(
-        (nB, M, N),
-        lambda b, i, j: te.sum(
-            pad_x[b if XB != 1 else 0, i, reduce_k].astype(out_dtype)
-            * pad_y[b if YB != 1 else 0, j, reduce_k].astype(out_dtype),
-            axis=[reduce_k],
-        ),
-        tag="batch_matmul_int8",
-    )
-    cfg.add_flop(XB * M * N * nK * 2)
-    return out
-
-
-@autotvm.register_topi_schedule("batch_matmul_int8.cuda")
-def schedule_batch_matmul_int8(cfg, outs):
-    """Batch Matmul schedule for int8 on CUDA"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "batch_matmul_int8" in op.tag:
-            _schedule_batch_matmul_int8(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_batch_matmul_int8(cfg, s, output):
-    input_x, input_y = s[output].op.input_tensors
-    if len(input_y.op.input_tensors) == 1 and input_y.op.input_tensors[0] == input_x:
-        s[input_y].compute_inline()
-
-    B, M, K = get_const_tuple(input_x.shape)
-    _, N, _ = get_const_tuple(input_y.shape)
-
-    k_factor = 4
-    assert K % k_factor == 0, f"Input dimension must divide {k_factor}"
-    if K % 16 == 0:
-        k_factor = 16
-
-    cfg.define_split("tile_f", B, num_outputs=4)
-    cfg.define_split("tile_m", M, num_outputs=4)
-    cfg.define_split("tile_n", N, num_outputs=4)
-    cfg.define_split("tile_k", K // k_factor, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 256, 512, 1024])
-
-    batch_matmul_op = s[output].op
-    s[input_x].compute_inline()
-    s[input_y].compute_inline()
-
-    x_cache = s.cache_read(input_x, "shared", [batch_matmul_op])
-    y_cache = s.cache_read(input_y, "shared", [batch_matmul_op])
-    batch_matmul_cache = s.cache_write(batch_matmul_op.output(0), "local")
-
-    # tile reduce axis
-    ko = batch_matmul_cache.op.reduce_axis[0]
-    ko, ki = s[batch_matmul_cache].split(ko, factor=4)
-    ko, kt = cfg["tile_k"].apply(s, batch_matmul_cache, ko)
-    # dp4a tensorize
-
-    target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
-
-    if do_tensorize:
-        dtypes = (input_x.dtype, input_y.dtype)
-        s[batch_matmul_cache].tensorize(ki, dp4a("shared", "shared", "local", dtypes))
-
-    if batch_matmul_op not in s.outputs:
-        s[output].compute_inline()
-        batch_matmul_op = s.outputs[0]
-
-    # tile axis
-    f, m, n = batch_matmul_op.axis
-    kernel_scope, f = s[batch_matmul_op].split(f, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, batch_matmul_op, f)
-    bm, vm, tm, mi = cfg["tile_m"].apply(s, batch_matmul_op, m)
-    bn, vn, tn, ni = cfg["tile_n"].apply(s, batch_matmul_op, n)
-    s[batch_matmul_op].reorder(bf, bm, bn, vf, vm, vn, tf, tm, tn, fi, mi, ni)
-
-    # bind axis
-    s[batch_matmul_op].bind(bf, tvm.te.thread_axis("blockIdx.z"))
-    s[batch_matmul_op].bind(bm, tvm.te.thread_axis("blockIdx.y"))
-    s[batch_matmul_op].bind(bn, tvm.te.thread_axis("blockIdx.x"))
-
-    s[batch_matmul_op].bind(vf, tvm.te.thread_axis("vthread"))
-    s[batch_matmul_op].bind(vm, tvm.te.thread_axis("vthread"))
-    s[batch_matmul_op].bind(vn, tvm.te.thread_axis("vthread"))
-
-    s[batch_matmul_op].bind(tf, tvm.te.thread_axis("threadIdx.z"))
-    s[batch_matmul_op].bind(tm, tvm.te.thread_axis("threadIdx.y"))
-    s[batch_matmul_op].bind(tn, tvm.te.thread_axis("threadIdx.x"))
-
-    # cache compute at
-    s[batch_matmul_cache].compute_at(s[batch_matmul_op], tn)
-    fo, mo, no = batch_matmul_cache.op.axis[:3]
-    s[batch_matmul_cache].reorder(ko, kt, fo, mo, no, ki)
-
-    # for load in [splited_x_op, splited_y_op]
-    for load in [x_cache, y_cache]:
-        s[load].compute_at(s[batch_matmul_cache], ko)
-        outer, inner = s[load].split(s[load].op.axis[-1], factor=k_factor)
-        s[load].vectorize(inner)
-
-        fused = s[load].op.axis[:-1] + [outer]
-        fused = s[load].fuse(*fused)
-
-        fused, tx = s[load].split(fused, factor=cfg["tile_n"].size[2])
-        fused, ty = s[load].split(fused, factor=cfg["tile_m"].size[2])
-        fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2])
-
-        s[load].bind(tz, tvm.te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, tvm.te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, tvm.te.thread_axis("threadIdx.x"))
-
-    # max unroll
-    s[batch_matmul_op].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[batch_matmul_op].pragma(kernel_scope, "unroll_explicit", False)
-
-    return s
diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py
deleted file mode 100644
index 920f162b103a..000000000000
--- a/python/tvm/topi/cuda/batch_matmul_tensorcore.py
+++ /dev/null
@@ -1,329 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument
-"""cuda batch_matmul operators"""
-import tvm
-from tvm import autotvm
-from tvm import te
-from ..utils import traverse_inline, get_const_tuple
-from .tensor_intrin import (
-    intrin_wmma_load_matrix_A,
-    intrin_wmma_load_matrix_W,
-    intrin_wmma_store_matrix,
-    intrin_wmma_gemm,
-)
-
-
-@autotvm.register_topi_compute("batch_matmul_tensorcore.cuda")
-def batch_matmul_tensorcore(
-    cfg, x, y, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """batch matmul tensorcore operator on cuda"""
-    # TODO(jcf94): Deal with different transpose combinations
-    assert not transpose_a and transpose_b
-    # TODO(liuxin.ai): Deal with out_shape for broadcast
-    del out_shape
-    return batch_matmul_tensorcore_cuda(x, y, out_dtype)
-
-
-@autotvm.register_topi_schedule("batch_matmul_tensorcore.cuda")
-def schedule_batch_matmul_tensorcore(cfg, outs):
-    """Schedule for batch_matmul operator using Tensorcore
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of batch_matmul
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(cfg, s, C):
-        A, B = s[C].op.input_tensors
-        if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A:
-            s[B].compute_inline()
-        batch, m_dim, k_dim = get_const_tuple(A.shape)
-        batch, n_dim, k_dim = get_const_tuple(B.shape)
-        data_dtype = A.dtype
-        out_dtype = C.dtype
-
-        # Explicit memory access
-        AS = s.cache_read(A, "shared", [C])
-        BS = s.cache_read(B, "shared", [C])
-        AF = s.cache_read(AS, "wmma.matrix_a", [C])
-        BF = s.cache_read(BS, "wmma.matrix_b", [C])
-        CF = s.cache_write(C, "wmma.accumulator")
-        CS = s.cache_read(CF, "shared", [C])
-
-        # fallback support
-        target = tvm.target.Target.current()
-        if cfg.is_fallback:
-            ref_log = autotvm.tophub.load_reference_log(
-                target.kind.name, target.model, "batch_matmul_tensorcore.cuda"
-            )
-            cfg.fallback_with_reference_log(ref_log)
-
-        # Deal with op fusion, such as bias/relu and slice after padding
-        if C.op not in s.outputs and "injective" in s.outputs[0].tag:
-            s[C].compute_inline()
-            C = s.outputs[0].output(0)
-
-        # create tuning space
-        cfg.define_knob("block_row_warps", [1, 2, 4])
-        cfg.define_knob("block_col_warps", [1, 2, 4])
-        cfg.define_knob("warp_row_tiles", [1, 2, 4])
-        cfg.define_knob("warp_col_tiles", [1, 2, 4])
-        cfg.define_knob("chunk", [1, 2, 4, 8])
-        cfg.define_knob("offset", [0, 8])
-        cfg.define_knob("offsetCS", [0, 8])
-        cfg.define_knob("vec", [1, 2, 4, 8])
-
-        # Ensure that the default parameters are applicable when autotvm is not in use
-        if data_dtype in ["float16", "uint8", "int8"]:
-            if m_dim % 32 == 0 and n_dim % 8 == 0:
-                cfg.define_knob("wmma_m", [32, 16, 8])
-            elif m_dim % 16 == 0 and n_dim % 16 == 0:
-                cfg.define_knob("wmma_m", [16, 8, 32])
-            elif m_dim % 8 == 0 and n_dim % 32 == 0:
-                cfg.define_knob("wmma_m", [8, 16, 32])
-            wmma_k = 16
-            wmma_m = cfg["wmma_m"].val
-            if wmma_m == 16:
-                wmma_n = 16
-            elif wmma_m == 8:
-                wmma_n = 32
-            elif wmma_m == 32:
-                wmma_n = 8
-        elif data_dtype in ["int4", "uint4"]:
-            wmma_m = wmma_n = 8
-            wmma_k = 32
-        else:
-            raise ValueError(f"data dtype {data_dtype} is not yet supported")
-
-        warp_size = 32
-        block_row_warps = cfg["block_row_warps"].val
-        block_col_warps = cfg["block_col_warps"].val
-        warp_row_tiles = cfg["warp_row_tiles"].val
-        warp_col_tiles = cfg["warp_col_tiles"].val
-        chunk = cfg["chunk"].val
-        offset = cfg["offset"].val
-        offsetCS = cfg["offsetCS"].val
-        vec = cfg["vec"].val
-
-        # Define the stride of intrin functions
-        AS_align = chunk * wmma_k + offset
-        BS_align = chunk * wmma_k + offset
-        CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
-        AS_stride = [AS_align, 1]
-        BS_stride = [BS_align, 1]
-        AF_stride = [wmma_k, 1]
-        BF_stride = [wmma_k, 1]
-        CF_stride = [warp_col_tiles * wmma_n, 1]
-        CS_stride = [CS_align, 1]
-
-        block_x = te.thread_axis("blockIdx.x")
-        block_y = te.thread_axis("blockIdx.y")
-        block_z = te.thread_axis("blockIdx.z")
-        thread_x = te.thread_axis("threadIdx.x")
-        thread_y = te.thread_axis("threadIdx.y")
-        thread_z = te.thread_axis("threadIdx.z")
-
-        # Schedule for dense computation
-        block_factor_m = wmma_m * warp_row_tiles * block_row_warps
-        block_factor_n = wmma_n * warp_col_tiles * block_col_warps
-        b, m, n = C.op.axis
-        block_i, bc = s[C].split(m, factor=block_factor_m)
-        block_j, oc = s[C].split(n, factor=block_factor_n)
-        s[C].reorder(b, block_i, block_j, bc, oc)
-        t = s[C].fuse(bc, oc)
-        t, vi = s[C].split(t, factor=vec)
-        t, tx = s[C].split(t, factor=warp_size)
-        t, ty = s[C].split(t, factor=block_row_warps)
-        t, tz = s[C].split(t, factor=block_col_warps)
-        s[C].bind(block_i, block_x)
-        s[C].bind(block_j, block_y)
-        s[C].bind(b, block_z)
-        s[C].bind(tz, thread_z)
-        s[C].bind(ty, thread_y)
-        s[C].bind(tx, thread_x)
-        s[C].vectorize(vi)
-
-        # Schedule for wmma store
-        s[CS].compute_at(s[C], block_j)
-        bs, bb, oo = CS.op.axis
-        s[CS].storage_align(bb, CS_align - 1, CS_align)
-        bb, bbi = s[CS].split(bb, factor=wmma_m)
-        oo, ooi = s[CS].split(oo, factor=wmma_n)
-        bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
-        oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
-        s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi)
-        s[CS].bind(bb, thread_z)
-        s[CS].bind(oo, thread_y)
-
-        # Schedule for wmma computation
-        s[CF].compute_at(s[CS], oo)
-        bs, warp_i, warp_j = CF.op.axis
-        warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
-        warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
-        (k,) = CF.op.reduce_axis
-        k, _k = s[CF].split(k, factor=wmma_k)
-        ko, ki = s[CF].split(k, factor=chunk)
-        s[CF].reorder(bs, ko, ki, warp_i, warp_j, _ii, _jj, _k)
-
-        # Schedule for  wmma_matrix_a load
-        s[AF].compute_at(s[CF], ki)
-        bs, b, i = AF.op.axis
-        b, b_ii = s[AF].split(b, factor=wmma_m)
-        i, i_jj = s[AF].split(i, factor=wmma_k)
-        s[AF].reorder(bs, b, i, b_ii, i_jj)
-
-        # Schedule for  wmma_matrix_b load
-        s[BF].compute_at(s[CF], ki)
-        bs, o, i = BF.op.axis
-        o, o_ii = s[BF].split(o, factor=wmma_n)
-        i, i_ii = s[BF].split(i, factor=wmma_k)
-        s[BF].reorder(bs, o, i, o_ii, i_ii)
-
-        # Schedule for A's(B's) shared memory load
-        def shared_schedule(stage, strides):
-            s[stage].compute_at(s[CF], ko)
-            bs, xo, yo = stage.op.axis
-            s[stage].storage_align(xo, strides - 1, strides)
-            t = s[stage].fuse(xo, yo)
-            t, vi = s[stage].split(t, factor=vec)
-            t, tx = s[stage].split(t, factor=warp_size)
-            t, ty = s[stage].split(t, factor=block_row_warps)
-            _, tz = s[stage].split(t, factor=block_col_warps)
-            s[stage].bind(ty, thread_y)
-            s[stage].bind(tz, thread_z)
-            s[stage].bind(tx, thread_x)
-            s[stage].vectorize(vi)
-
-        shared_schedule(AS, AS_align)
-        shared_schedule(BS, BS_align)
-
-        shape = (wmma_m, wmma_n, wmma_k)
-        AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=data_dtype)
-        BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=data_dtype)
-        k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm")
-        CL_compute = te.compute(
-            (wmma_m, wmma_n),
-            lambda ii, jj: te.sum(
-                AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].astype(out_dtype),
-                axis=k_gemm,
-            ),
-            name="CL_compute",
-        )
-
-        # lower the computation loops down to TensorCore hardware intrinsics
-        # by mapping the dense tensorcore to tensor intrinsics
-        s[AF].tensorize(
-            b_ii,
-            intrin_wmma_load_matrix_A(
-                AF_stride,
-                AS_stride,
-                shape,
-                "row_major",
-                (wmma_m, wmma_k),
-                (wmma_m, wmma_k),
-                data_dtype,
-            ),
-        )
-        s[BF].tensorize(
-            o_ii,
-            intrin_wmma_load_matrix_W(
-                BF_stride,
-                BS_stride,
-                shape,
-                "col_major",
-                (wmma_n, wmma_k),
-                (wmma_n, wmma_k),
-                data_dtype,
-            ),
-        )
-        s[CF].tensorize(
-            _ii,
-            intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape),
-        )
-        s[CS].tensorize(
-            bbi,
-            intrin_wmma_store_matrix(
-                CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n)
-            ),
-        )
-
-    def _callback(op):
-        if "batch_matmul_tensorcore" in op.tag:
-            _schedule(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def batch_matmul_tensorcore_cuda(x, y, out_dtype=None):
-    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
-    data in batch.
-
-    Parameters
-    ----------
-    x : tvm.te.Tensor
-        3-D with shape [batch, M, K]
-
-    y : tvm.te.Tensor
-        3-D with shape [batch, N, K]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
-    x_shape = get_const_tuple(x.shape)
-    y_shape = get_const_tuple(y.shape)
-    assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
-    assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent"
-    batch, M, K = x.shape
-    N = y.shape[1]
-
-    if out_dtype is None:
-        out_dtype = x.dtype
-
-    assert x.dtype == y.dtype
-    assert x.dtype in ["float16", "uint8", "int8", "uint4", "int4"]
-    if x.dtype in ["float16", "uint8", "int8"]:
-        assert (
-            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
-            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
-            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
-        ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)"
-    else:
-        assert (
-            M % 8 == 0 and K % 32 == 0 and N % 8 == 0
-        ), "The shape of (M, K, N) must be multiple of (8, 32, 8)"
-
-    k = te.reduce_axis((0, K), name="k")
-    return te.compute(
-        (batch, M, N),
-        lambda b, i, j: te.sum(x[b, i, k].astype(out_dtype) * y[b, j, k].astype(out_dtype), axis=k),
-        tag="batch_matmul_tensorcore",
-    )
diff --git a/python/tvm/topi/cuda/conv1d.py b/python/tvm/topi/cuda/conv1d.py
deleted file mode 100644
index b2fc4ca02dc9..000000000000
--- a/python/tvm/topi/cuda/conv1d.py
+++ /dev/null
@@ -1,284 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Compute definition for conv1d with cuda backend"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from .. import nn
-from ..utils import traverse_inline, get_const_tuple
-
-
-@autotvm.register_topi_compute("conv1d_ncw.cuda")
-def conv1d_ncw(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
-    return nn.conv1d_ncw(data, kernel, strides, padding, dilation, out_dtype)
-
-
-def _schedule_conv1d_ncw(cfg, outs):
-    """TOPI schedule callback of conv1d ncw for cuda gpu
-
-    Parameters
-    ----------
-    cfg : ConfigEntity
-        the config for this template.
-
-    outs : Array of Tensor
-        The computation graph description of conv1d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s : Schedule
-        The computation schedule for conv1d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv1d_ncw" or op.tag == "group_conv1d_ncw":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-
-            ##### space definition begin #####
-            n, f, x = s[conv].op.axis
-            rc = s[conv].op.reduce_axis[0]
-            cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
-            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
-            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])
-
-            target = tvm.target.Target.current()
-            if target.kind.name in ["nvptx", "rocm"]:
-                cfg.define_knob("unroll_explicit", [1])
-            else:
-                cfg.define_knob("unroll_explicit", [0, 1])
-
-            ##### space definition end #####
-
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            if conv.op in s.outputs:
-                output = conv
-                OL = s.cache_write(conv, "local")
-            else:
-                output = s.outputs[0].output(0)
-                s[conv].set_scope("local")
-                OL = conv
-
-            # create cache stage
-            s[pad_data].set_scope("shared")
-            AA = pad_data
-            WW = s.cache_read(kernel, "shared", [OL])
-
-            # tile and bind spatial axes
-            n, f, x = s[output].op.axis
-            kernel_scope, n = s[output].split(n, nparts=1)
-            bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            s[output].reorder(bn, bf, bx, vn, vf, vx, tn, tf, tx, ni, fi, xi)
-            s[output].bind(bn, te.thread_axis("blockIdx.z"))
-            s[output].bind(bf, te.thread_axis("blockIdx.y"))
-            s[output].bind(bx, te.thread_axis("blockIdx.x"))
-            s[output].bind(vn, te.thread_axis("vthread"))
-            s[output].bind(vf, te.thread_axis("vthread"))
-            s[output].bind(vx, te.thread_axis("vthread"))
-
-            s[output].bind(tx, te.thread_axis("threadIdx.x"))
-            s[OL].compute_at(s[output], tx)
-            # number of threads
-            n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
-            n_tx = cfg["tile_x"].size[2]
-
-            # tile reduction axes
-            n, f, x = s[OL].op.axis
-            rc, rx = s[OL].op.reduce_axis
-            rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
-            s[OL].reorder(rco, rcm, rx, rci, n, f, x)
-
-            s[AA].compute_at(s[OL], rx)
-            s[WW].compute_at(s[OL], rx)
-
-            # cooperative fetching
-            for load in [AA, WW]:
-                n, f, x = s[load].op.axis
-                fused = s[load].fuse(f, x)
-                tz, fused = s[load].split(fused, nparts=n_tz)
-                tx, fused = s[load].split(fused, nparts=n_tx)
-                s[load].bind(tz, te.thread_axis("threadIdx.y"))
-                s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-            s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-            s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-            N, CO, OW = get_const_tuple(output.shape)
-            _, CI, KW = get_const_tuple(kernel.shape)
-            cfg.add_flop(2 * N * OW * CO * KW * CI)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
-
-
-@autotvm.register_topi_schedule("conv1d_ncw.cuda")
-def schedule_conv1d_ncw(cfg, outs):
-    return _schedule_conv1d_ncw(cfg, outs)
-
-
-@autotvm.register_topi_compute("group_conv1d_ncw.cuda")
-def group_conv1d_ncw(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
-    return nn.group_conv1d_ncw(data, kernel, strides, padding, dilation, groups, out_dtype)
-
-
-@autotvm.register_topi_schedule("group_conv1d_ncw.cuda")
-def schedule_group_conv1d_ncw(cfg, outs):
-    return _schedule_conv1d_ncw(cfg, outs)
-
-
-@autotvm.register_topi_compute("conv1d_nwc.cuda")
-def conv1d_nwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
-    return nn.conv1d_nwc(data, kernel, strides, padding, dilation, out_dtype)
-
-
-def _schedule_conv1d_nwc(cfg, outs):
-    """TOPI schedule callback of conv1d nwc for cuda gpu
-
-    Parameters
-    ----------
-    cfg : ConfigEntity
-        the config for this template.
-
-    outs : Array of Tensor
-        The computation graph description of conv1d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s : Schedule
-        The computation schedule for conv1d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv1d_nwc" or op.tag == "group_conv1d_nwc":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-
-            ##### space definition begin #####
-            n, x, f = s[conv].op.axis
-            rc = s[conv].op.reduce_axis[0]
-            cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
-            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
-            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])
-
-            target = tvm.target.Target.current()
-            if target.kind.name in ["nvptx", "rocm"]:
-                cfg.define_knob("unroll_explicit", [1])
-            else:
-                cfg.define_knob("unroll_explicit", [0, 1])
-
-            ##### space definition end #####
-
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            if conv.op in s.outputs:
-                output = conv
-                OL = s.cache_write(conv, "local")
-            else:
-                output = s.outputs[0].output(0)
-                s[conv].set_scope("local")
-                OL = conv
-
-            # create cache stage
-            s[pad_data].set_scope("shared")
-            AA = pad_data
-            WW = s.cache_read(kernel, "shared", [OL])
-
-            # tile and bind spatial axes
-            n, f, x = s[output].op.axis
-            kernel_scope, n = s[output].split(n, nparts=1)
-            bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-
-            s[output].reorder(bn, bx, bf, vn, vx, vf, tn, tx, tf, ni, xi, fi)
-            s[output].bind(bn, te.thread_axis("blockIdx.z"))
-            s[output].bind(bx, te.thread_axis("blockIdx.y"))
-            s[output].bind(bf, te.thread_axis("blockIdx.x"))
-            s[output].bind(vn, te.thread_axis("vthread"))
-            s[output].bind(vx, te.thread_axis("vthread"))
-            s[output].bind(vf, te.thread_axis("vthread"))
-
-            s[output].bind(tf, te.thread_axis("threadIdx.x"))
-            s[OL].compute_at(s[output], tf)
-            # number of threads
-            n_tz = cfg["tile_n"].size[2] * cfg["tile_x"].size[2]
-            n_tx = cfg["tile_f"].size[2]
-
-            # tile reduction axes
-            n, x, f = s[OL].op.axis
-            rc, rx = s[OL].op.reduce_axis
-            rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
-            s[OL].reorder(rco, rcm, rx, rci, n, x, f)
-
-            s[AA].compute_at(s[OL], rx)
-            s[WW].compute_at(s[OL], rx)
-
-            # cooperative fetching
-            for load in [AA, WW]:
-                n, x, f = s[load].op.axis
-                fused = s[load].fuse(x, f)
-                tz, fused = s[load].split(fused, nparts=n_tz)
-                tx, fused = s[load].split(fused, nparts=n_tx)
-                s[load].bind(tz, te.thread_axis("threadIdx.y"))
-                s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-            s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-            s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-            N, OW, CO = get_const_tuple(output.shape)
-            KW, CI, _ = get_const_tuple(kernel.shape)
-            cfg.add_flop(2 * N * OW * CO * KW * CI)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
-
-
-@autotvm.register_topi_schedule("conv1d_nwc.cuda")
-def schedule_conv1d_nwc(cfg, outs):
-    return _schedule_conv1d_nwc(cfg, outs)
-
-
-@autotvm.register_topi_compute("group_conv1d_nwc.cuda")
-def group_conv1d_nwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
-    return nn.group_conv1d_nwc(data, kernel, strides, padding, dilation, groups, out_dtype)
-
-
-@autotvm.register_topi_schedule("group_conv1d_nwc.cuda")
-def schedule_group_conv1d_nwc(cfg, outs):
-    return _schedule_conv1d_nwc(cfg, outs)
diff --git a/python/tvm/topi/cuda/conv1d_transpose_ncw.py b/python/tvm/topi/cuda/conv1d_transpose_ncw.py
deleted file mode 100644
index 2098aa9089c6..000000000000
--- a/python/tvm/topi/cuda/conv1d_transpose_ncw.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Conv1d transpose template for cuda backend"""
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from .. import nn
-from ..utils import get_const_tuple, traverse_inline
-
-
-@autotvm.task.register_topi_compute("conv1d_transpose_nchw.cuda")
-def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype, output_padding):
-    """Transposed 1D convolution ncw forward operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-    Input : tvm.te.Tensor
-        3-D with shape [batch, in_channel, inp_width]
-    Filter : tvm.te.Tensor
-        3-D with shape [in_channel, num_filter, kernel_size]
-    stride : tuple of one int
-        The spatial stride along width
-    padding : int, tuple, or string
-        int: padding size
-        tuple of 2 ints: (pad_left, pad_right) for left and right padding
-        string: ['VALID', 'SAME']
-    out_dtype: str
-        The output type. This is used in mixed precision
-    output_padding : ints
-        Used to disambiguate the output shape.
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-    u    3-D with shape [batch, out_channel, out_width]
-    """
-    if isinstance(stride, (tuple, list)):
-        stride = stride[0]
-    if isinstance(output_padding, (tuple, list)):
-        output_padding = output_padding[0]
-    assert output_padding < stride
-    cfg.stride = stride
-    cfg.output_padding = output_padding
-    batch, inp_channels, inp_width = get_const_tuple(data.shape)
-    _, out_channels, kernel_size = get_const_tuple(kernel.shape)
-    pad_left, pad_right = nn.get_pad_tuple1d(padding, kernel_size)
-    out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right + output_padding
-    pad_left = kernel_size - 1 - pad_left
-    pad_right = kernel_size - 1 - pad_right + output_padding
-    padded_width = pad_left + inp_width + pad_right
-
-    padded_data = te.compute(
-        (batch, inp_channels, padded_width),
-        lambda n, c, x: tvm.tir.if_then_else(
-            tvm.tir.all(x >= pad_left, x < pad_left + inp_width),
-            data[n, c, x - pad_left],
-            tvm.tir.const(0.0, "float32"),
-        ),
-        name="data_pad",
-    )
-
-    padded_kernel = te.compute(
-        (inp_channels, out_channels, kernel_size + stride - 1),
-        lambda ci, co, k: tvm.tir.if_then_else(
-            tvm.tir.all(k < kernel_size),
-            kernel[ci, co, kernel_size - k - 1],
-            tvm.tir.const(0.0, "float32"),
-        ),
-        name="kernel_pad",
-    )
-
-    ci = te.reduce_axis((0, inp_channels), name="ci")
-    k = te.reduce_axis((0, tvm.tir.indexdiv(kernel_size + stride - 1, stride)), name="k")
-    border = pad_left * (stride - 1)
-
-    # Skip multiplication by 0 values in the input data inserted when stride is greater then 1.
-    # During multiplication of kernel by padded data:
-    #  Kernel indices are: 0, 1 * stride, 2 * stride, ..., ceil(kernel_size / stride) plus
-    #  data offset mod stride
-    data_out = te.compute(
-        (batch, out_channels, out_width),
-        lambda b, co, w: te.sum(
-            padded_data[b, ci, tvm.tir.indexdiv(border + w + stride - 1, stride) + k].astype(
-                out_dtype
-            )
-            * padded_kernel[
-                ci, co, k * stride + tvm.tir.indexmod(stride - w - border, stride)
-            ].astype(out_dtype),
-            axis=[ci, k],
-        ),
-        tag="conv1d_transpose_ncw",
-    )
-
-    return data_out
-
-
-@autotvm.task.register_topi_schedule("conv1d_transpose_nchw.cuda")
-def schedule_conv1d_transpose_ncw(cfg, outs):
-    """TOPI Schedule callback for conv1d_transpose operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The parameters for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv1d transpose
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv1d transpose.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv1d_transpose_ncw":
-            padded_data = op.input_tensors[0]
-            padded_kernel = op.input_tensors[1]
-            conv = op.output(0)
-
-            ##### space definition begin #####
-            n, f, x = s[conv].op.axis
-            rc = s[conv].op.reduce_axis[0]
-            cfg.define_split("tile_n", cfg.axis(n if isinstance(n, int) else 1), num_outputs=4)
-            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
-            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])
-
-            target = tvm.target.Target.current()
-            if target.kind.name in ["nvptx", "rocm"]:
-                cfg.define_knob("unroll_explicit", [1])
-            else:
-                cfg.define_knob("unroll_explicit", [0, 1])
-
-            ##### space definition end #####
-
-            if conv.op in s.outputs:
-                output = conv
-                OL = s.cache_write(conv, "local")
-            else:
-                output = s.outputs[0].output(0)
-                s[conv].set_scope("local")
-                OL = conv
-
-            s[padded_kernel].compute_inline()
-            s[padded_data].compute_inline()
-
-            # tile and bind spatial axes
-            n, f, x = s[output].op.axis
-            kernel_scope, n = s[output].split(n, nparts=1)
-            bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            s[output].reorder(bn, bf, bx, vn, vf, vx, tn, tf, tx, ni, fi, xi)
-            s[output].bind(bn, te.thread_axis("blockIdx.z"))
-            s[output].bind(bf, te.thread_axis("blockIdx.y"))
-            s[output].bind(bx, te.thread_axis("blockIdx.x"))
-            s[output].bind(vn, te.thread_axis("vthread"))
-            s[output].bind(vf, te.thread_axis("vthread"))
-            s[output].bind(vx, te.thread_axis("vthread"))
-
-            s[output].bind(tx, te.thread_axis("threadIdx.x"))
-            s[OL].compute_at(s[output], tx)
-
-            # tile reduction axes
-            n, f, x = s[OL].op.axis
-            rc, rx = s[OL].op.reduce_axis
-            rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
-            s[OL].reorder(rco, rcm, rx, rci, n, f, x)
-
-            s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-            s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
diff --git a/python/tvm/topi/cuda/conv2d.py b/python/tvm/topi/cuda/conv2d.py
deleted file mode 100644
index fc9d51b2dd40..000000000000
--- a/python/tvm/topi/cuda/conv2d.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Compute definition for conv2d with cuda backend"""
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm.task.space import OtherOptionEntity
-from tvm.contrib import cudnn
-
-from .. import nn, generic
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
-from .conv2d_direct import schedule_direct_cuda
-
-
-@autotvm.register_topi_compute("conv2d_nchw.cuda")
-def conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
-    """Compute conv2d with NCHW layout"""
-    return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nchw.cuda")
-def schedule_conv2d_nchw(cfg, outs):
-    """Create the schedule for conv2d_nchw"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv2d_nchw":
-            schedule_direct_cuda(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_cudnn.cuda")
-def conv2d_cudnn(
-    cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"
-):
-    """Compute conv2d using CuDNN library"""
-    if layout == "NCHW":
-        tensor_format = 0  # CUDNN_TENSOR_NCHW
-        N, _, H, W = get_const_tuple(data.shape)
-    elif layout == "NHWC":
-        tensor_format = 1  # CUDNN_TENSOR_NHWC
-        N, H, W, _ = get_const_tuple(data.shape)
-    else:
-        raise ValueError(f"Unsupported layout {layout} in cudnn")
-    CO, CI, KH, KW = get_const_tuple(kernel.shape)
-
-    # handle dilation
-    stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
-    dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
-    KH_dilated = (KH - 1) * dilation_h + 1
-    KW_dilated = (KW - 1) * dilation_h + 1
-
-    pt, pl, pb, pr = get_pad_tuple(padding, (KH_dilated, KW_dilated))
-    if (pt != pb) or (pl != pr):
-        raise ValueError("Cudnn doesn't support asymmetric padding.")
-
-    OH = (H + pt + pb - KH) // stride_h + 1
-    OW = (W + pl + pr - KW) // stride_w + 1
-
-    if isinstance(N, int):
-        cfg.add_flop(
-            groups
-            * 2
-            * N
-            * OH
-            * OW
-            * CO
-            * CI
-            * ((KH - 1) * dilation_h + 1)
-            * ((KW - 1) * dilation_w + 1)
-        )
-
-    if data.dtype == "int8" or kernel.dtype == "int8":
-        if layout == "NCHW":
-            raise ValueError("NCHW layout do not support int8 in cudnn")
-        dtype = "int32"
-    else:
-        dtype = data.dtype
-
-    cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT")))
-    if cfg.is_fallback:
-        if cudnn.exists():
-            # Let CUDNN choose the best algo, based on benchmarks run
-            # on the local machine.  In the future, this should be
-            # based on parameters stored in the Target.
-            cfg["algo"] = OtherOptionEntity(-1)
-        else:
-            cfg["algo"] = OtherOptionEntity(0)
-
-    return cudnn.conv_forward(
-        data,
-        kernel,
-        [pt, pl],  # cudnn padding pt, pl on both sides of input
-        [stride_h, stride_w],
-        [dilation_h, dilation_w],
-        conv_mode=1,
-        tensor_format=tensor_format,
-        algo=cfg["algo"].val,
-        conv_dtype=dtype,
-        groups=groups,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_cudnn.cuda")
-def schedule_conv2d_cudnn(cfg, outs):
-    """Create the schedule for conv2d_cudnn"""
-    return generic.schedule_extern(outs)
-
-
-def conv2d_backward_weight_cudnn(
-    dy, x, kernel_size, padding, stride, dilation, groups, layout, output_dtype
-):
-    """Compute conv2d wgrad using CuDNN library"""
-    assert layout in ["NCHW", "NHWC"]
-
-    if dy.dtype == "float16":
-        # cuDNN does not seem to support other combination.
-        assert output_dtype == "float16", "Only supports fp16 output for cuDNN fp16 wgrad."
-
-    conv_dtype = "float32"  # Accumulation is always fp32
-    return cudnn.conv_backward_filter(
-        dy,
-        x,
-        kernel_size,
-        padding,
-        stride,
-        dilation,
-        conv_mode=1,
-        tensor_format=0 if layout == "NCHW" else 1,
-        conv_dtype=conv_dtype,
-        groups=groups,
-    )
diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
deleted file mode 100644
index 93512ca07d9e..000000000000
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ /dev/null
@@ -1,550 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Conv2D alter op and legalize functions for cuda backend"""
-
-import logging
-
-import tvm
-from tvm import autotvm, relay, te
-
-from .. import nn
-from ..nn import conv2d_legalize
-from ..utils import get_const_tuple, is_target
-from .conv2d_winograd import _infer_tile_size
-from .tensorcore_alter_op import pad_to_tensorcore
-
-logger = logging.getLogger("topi")
-
-
-@nn.conv2d_alter_layout.register(["cuda", "gpu"])
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    if not is_target(["vulkan", "rocm", "cuda"]):
-        return None
-    dispatch_ctx = autotvm.task.DispatchContext.current
-
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    strides = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data, kernel = tinfos
-    out_dtype = out_type.dtype
-
-    impl, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is None:
-        # The best implementation is not an AutoTVM template.
-        # It may be from the auto-scheduler
-
-        if impl.name.find("winograd") != -1:
-            if dilation != (1, 1):
-                logger.warning("Does not support weight pre-transform for dilated convolution.")
-                return None
-
-            if data_layout == "NHWC" and kernel_layout == "HWIO":
-                N, H, W, CI = get_const_tuple(data.shape)
-                KH, KW, _, CO = get_const_tuple(kernel.shape)
-                # Pre-compute weight transformation in winograd
-                tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NHWC")
-                # HWIO -> OIHW
-                kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
-                # alpha, alpha, CO, CI
-                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-                    kernel_transform, tile_size=tile_size
-                )
-                new_attrs["tile_size"] = tile_size
-                new_attrs["channels"] = CO
-                return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                    inputs[0], weight, **new_attrs
-                )
-            elif data_layout == "NCHW" and kernel_layout == "OIHW":
-                N, CI, H, W = get_const_tuple(data.shape)
-                CO, _, KH, KW = get_const_tuple(kernel.shape)
-                # Pre-compute weight transformation in winograd
-                tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NCHW")
-                # alpha, alpha, CO, CI
-                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-                    inputs[1], tile_size=tile_size
-                )
-                # alpha, alpha, CI, CO
-                weight = relay.transpose(weight, axes=[0, 1, 3, 2])
-                new_attrs["tile_size"] = tile_size
-                new_attrs["channels"] = CO
-                return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                    inputs[0], weight, **new_attrs
-                )
-
-        return None
-
-    cfg = dispatch_ctx.query(target, workload)
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        do_new_layout = False
-        if is_target(["vulkan", "rocm"]):
-            do_new_layout = "+dotprod" in target.mattr or target.supports_integer_dot_product
-        if not do_new_layout:
-            return None
-
-    topi_tmpl = workload[0]
-    if topi_tmpl == "conv2d_NCHWc_int8.cuda":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        assert CO % 4 == 0, "Number of output channels should be multiple of 4"
-        new_layout = "NCHW4c"
-        new_attrs["channels"] = CO
-        new_attrs["data_layout"] = new_layout
-        new_attrs["out_layout"] = new_layout
-        new_attrs["kernel_layout"] = "OIHW4o4i"
-        ic_block_factor = oc_block_factor = 4
-
-        # Store the same config for the altered operator (workload)
-        new_data = te.placeholder(
-            (N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype
-        )
-        new_kernel = te.placeholder(
-            (
-                CO // oc_block_factor,
-                CI // ic_block_factor,
-                KH,
-                KW,
-                oc_block_factor,
-                ic_block_factor,
-            ),
-            dtype=kernel.dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
-            "conv2d_NCHWc_int8.cuda",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_nchw_winograd.cuda":
-        if dilation != (1, 1):
-            logger.warning("Does not support weight pre-transform for dilated convolution.")
-            return None
-
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-
-        # pre-compute weight transformation in winograd
-        tile_size = _infer_tile_size(tinfos[0], tinfos[1])
-
-        weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], tile_size=tile_size)
-        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
-        new_attrs["tile_size"] = tile_size
-        new_attrs["channels"] = CO
-
-        # Store the same config for the altered operator (workload)
-        new_data = data
-        new_weight = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_weight, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_winograd_without_weight_transform.cuda",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight, **new_attrs
-        )
-
-    if topi_tmpl in ("conv2d_nhwc_winograd_direct.cuda", "conv2d_nhwc_winograd_tensorcore.cuda"):
-        if dilation != (1, 1):
-            logger.warning("Does not support weight pre-transform for dilated convolution.")
-            return None
-
-        assert data_layout == "NHWC" and kernel_layout == "HWIO"
-        N, H, W, CI = get_const_tuple(data.shape)
-        KH, KW, _, CO = get_const_tuple(kernel.shape)
-
-        # Pre-compute weight transformation in winograd
-        tile_size = _infer_tile_size(data, kernel, layout="NHWC")
-        kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
-        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-            kernel_transform, tile_size=tile_size
-        )
-        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
-        new_attrs["tile_size"] = tile_size
-        new_attrs["channels"] = CO
-        # Store the same config for the altered operator (workload)
-        new_data = data
-        new_weight = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype
-        )
-        if topi_tmpl == "conv2d_nhwc_winograd_direct.cuda":
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_weight, strides, padding, dilation, out_dtype],
-                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
-            )
-        elif topi_tmpl == "conv2d_nhwc_winograd_tensorcore.cuda":
-            new_workload = autotvm.task.args_to_workload(
-                [new_data, new_weight, strides, padding, dilation, out_dtype],
-                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
-            )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight, **new_attrs
-        )
-
-    if topi_tmpl == "group_conv2d_NCHWc_int8.cuda":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-
-        new_layout = "NCHW4c"
-        new_attrs["channels"] = CO
-        new_attrs["data_layout"] = new_layout
-        new_attrs["out_layout"] = new_layout
-        new_attrs["kernel_layout"] = "OIHW4o4i"
-        ic_block_factor = oc_block_factor = 4
-
-        # Store the same config for the altered operator (workload)
-        new_data = te.placeholder(
-            (N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype
-        )
-        new_kernel = te.placeholder(
-            (
-                CO // oc_block_factor,
-                CI // ic_block_factor // groups,
-                KH,
-                KW,
-                oc_block_factor,
-                ic_block_factor,
-            ),
-            dtype=kernel.dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
-            "group_conv2d_NCHWc_int8.cuda",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_HWNCnc_tensorcore.cuda":
-        assert data_layout == "HWNC" and kernel_layout == "HWOI"
-        assert float(tvm.cuda(0).compute_version) >= 7.5
-        H, W, N, CI = get_const_tuple(data.shape)
-        KH, KW, CO, _ = get_const_tuple(kernel.shape)
-
-        if (
-            kernel.dtype in ["int4", "uint4"]
-            and (CI % 32 != 0 or CO % 8 != 0)
-            or kernel.dtype in ["int8", "uint8"]
-            and (CI % 16 != 0 or CO % 32 != 0)
-        ):
-            return relay.nn.conv2d(*inputs, **new_attrs)
-
-        new_attrs["channels"] = CO
-        if kernel.dtype in ["int4", "uint4"]:
-            new_attrs["kernel_layout"] = "HWOI8o32i"
-            ic_block_factor = 32
-            oc_block_factor = 8
-        else:
-            new_attrs["kernel_layout"] = "HWOI32o16i"
-            ic_block_factor = 16
-            oc_block_factor = 32
-
-        new_kernel = te.placeholder(
-            (
-                KH,
-                KW,
-                CO // oc_block_factor,
-                CI // ic_block_factor,
-                oc_block_factor,
-                ic_block_factor,
-            ),
-            dtype=kernel.dtype,
-        )
-
-        new_workload = autotvm.task.args_to_workload(
-            [data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_HWNCnc_tensorcore.cuda",
-        )
-
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.conv2d(*inputs, **new_attrs)
-
-    return None
-
-
-def _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor):
-    # Pad batch size
-    if db != 0:
-        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, db), (0, 0)))
-
-    # Pad input channel
-    if di != 0:
-        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
-        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
-
-    # Pad output channel
-    if do != 0:
-        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, do), (0, 0)))
-
-    if do != 0:
-        new_out_channel = out_channel + do
-        new_attrs["channels"] = new_out_channel
-
-    out = relay.nn.conv2d(data, kernel, **new_attrs)
-
-    if db != 0 or do != 0:
-        original_out_shape = [x.value for x in output_tensor.shape]
-        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
-
-    return out
-
-
-def _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor):
-    # Pad batch size
-    if db != 0:
-        data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
-
-    # Pad input channel
-    if di != 0:
-        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
-        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
-
-    # Pad output channel
-    if do != 0:
-        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
-
-    if do != 0:
-        new_out_channel = out_channel + do
-        new_attrs["channels"] = new_out_channel
-
-    out = relay.nn.conv2d(data, kernel, **new_attrs)
-
-    if db != 0 or do != 0:
-        original_out_shape = [x.value for x in output_tensor.shape]
-        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
-
-    return out
-
-
-@conv2d_legalize.register(["cuda", "gpu"])
-def _conv2d_legalize(attrs, inputs, arg_types):
-    """Legalizes Conv2D op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    if not is_target(["vulkan", "rocm", "cuda"]):
-        return None
-    # Dilation not supported yet. Return None if dilation is not (1, 1)
-    dilation = attrs.get_int_tuple("dilation")
-    if not (dilation[0] == 1 and dilation[1] == 1):
-        return None
-
-    # No legalization for depthwise convolutions yet.
-    groups = attrs.get_int("groups")
-    if groups != 1:
-        return None
-
-    # Collect the input tensors.
-    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
-    data_dtype = data_tensor.dtype
-
-    # Collect the output tensor.
-    output_tensor = arg_types[2]
-
-    # Collect the input exprs.
-    data, kernel = inputs
-
-    # Get the conv attrs
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    # Get data layout. Return None if not NCHW
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-
-    # Pad input and output channels to use int8 schedule.
-    if data_dtype in ["int8", "uint8"]:
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            oc_modified = False
-            in_channel = data_tensor.shape[1].value
-            out_channel = kernel_tensor.shape[0].value
-
-            # Pad input channel
-            if in_channel % 4 != 0:
-                new_in_channel = ((in_channel + 4) // 4) * 4
-                diff = new_in_channel - in_channel
-                pad_width = ((0, 0), (0, diff), (0, 0), (0, 0))
-                data = relay.nn.pad(data, pad_width=pad_width)
-                kernel = relay.nn.pad(kernel, pad_width=pad_width)
-
-            # Pad output channel
-            new_out_channel = out_channel
-            if out_channel % 4 != 0:
-                new_out_channel = ((out_channel + 4) // 4) * 4
-                diff = new_out_channel - out_channel
-                kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0), (0, 0), (0, 0)))
-                oc_modified = True
-
-            if oc_modified:
-                new_attrs["channels"] = new_out_channel
-                out = tvm.relay.nn.conv2d(data, kernel, **new_attrs)
-                original_out_shape = [x.value for x in output_tensor.shape]
-                out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
-            else:
-                out = relay.nn.conv2d(data, kernel, **new_attrs)
-            return out
-
-        if data_layout == "NHWC" and kernel_layout == "HWIO":
-            batch = data_tensor.shape[0].value
-            in_channel = data_tensor.shape[3].value
-            out_channel = kernel_tensor.shape[3].value
-
-            if (
-                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
-                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
-                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
-            ):
-                # no need to pad
-                return None
-
-            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(
-                batch, in_channel, out_channel, candidates
-            )
-
-            if extra_flops > 2:
-                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
-                return None
-
-            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-
-            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-
-        if data_layout == "HWNC" and kernel_layout == "HWOI":
-            batch = data_tensor.shape[2].value
-            in_channel = data_tensor.shape[3].value
-            out_channel = kernel_tensor.shape[2].value
-
-            if batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0:
-                return None
-
-            candidates = [(8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(
-                batch, in_channel, out_channel, candidates
-            )
-
-            if extra_flops > 2:
-                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
-                return None
-            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-
-            return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-
-    elif data_dtype in ["float16"]:
-        if data_layout == "NHWC" and kernel_layout == "HWIO":
-            if isinstance(data_tensor.shape[0], tvm.tir.expr.Any):
-                # Skip legalize when the batch size is dynamic
-                return None
-
-            batch = data_tensor.shape[0].value
-            in_channel = data_tensor.shape[3].value
-            out_channel = kernel_tensor.shape[3].value
-
-            if (
-                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
-                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
-                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
-            ):
-                # no need to pad
-                return None
-
-            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(
-                batch, in_channel, out_channel, candidates
-            )
-
-            if extra_flops > 2:
-                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
-                return None
-
-            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-
-            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-
-    elif data_dtype in ["int4", "uint4"]:
-        if data_layout == "NHWC" and kernel_layout == "HWIO":
-            batch = data_tensor.shape[0].value
-            in_channel = data_tensor.shape[3].value
-            out_channel = kernel_tensor.shape[3].value
-
-            if (
-                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
-                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
-                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
-            ):
-                # no need to pad
-                return None
-
-            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(
-                batch, in_channel, out_channel, candidates
-            )
-
-            if extra_flops > 2:
-                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
-                return None
-
-            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-
-            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-
-        if data_layout == "HWNC" and kernel_layout == "HWOI":
-            batch = data_tensor.shape[2].value
-            in_channel = data_tensor.shape[3].value
-            out_channel = kernel_tensor.shape[2].value
-
-            if batch % 8 == 0 and in_channel % 32 == 0 and out_channel % 8 == 0:
-                return None
-
-            candidates = [(8, 32, 8)]
-            (db, di, do), extra_flops = pad_to_tensorcore(
-                batch, in_channel, out_channel, candidates
-            )
-
-            if extra_flops > 2:
-                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
-                return None
-            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-
-            return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-
-    return None
diff --git a/python/tvm/topi/cuda/conv2d_direct.py b/python/tvm/topi/cuda/conv2d_direct.py
deleted file mode 100644
index 2dc6635e680e..000000000000
--- a/python/tvm/topi/cuda/conv2d_direct.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""The templates for cuda conv2d operators"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import get_const_tuple
-
-
-def schedule_direct_cuda(cfg, s, conv):
-    """schedule optimized for batch size = 1"""
-
-    ##### space definition begin #####
-    n, f, y, x = s[conv].op.axis
-    rc, ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_f", f, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-
-    # fallback support
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(
-            target.kind.name, target.model, "conv2d_nchw.cuda"
-        )
-        cfg.fallback_with_reference_log(ref_log)
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-
-    s[pad_data].compute_inline()
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AA = s.cache_read(pad_data, "shared", [OL])
-    WW = s.cache_read(kernel, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, f, y, x = s[output].op.axis
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-    s[OL].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, f, y, x = s[OL].op.axis
-    rc, ry, rx = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    ryo, ryi = cfg["tile_ry"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, OL, rx)
-    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        n, f, y, x = s[load].op.axis
-        fused = s[load].fuse(n, f, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    N, CO, OH, OW = get_const_tuple(output.shape)
-    _, KH, KW, CI = get_const_tuple(kernel.shape)
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
diff --git a/python/tvm/topi/cuda/conv2d_hwcn.py b/python/tvm/topi/cuda/conv2d_hwcn.py
deleted file mode 100644
index 8786fbcc1aa0..000000000000
--- a/python/tvm/topi/cuda/conv2d_hwcn.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
-"""Schedule for conv2d_hwcn with auto fusion"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from tvm.autotvm.task.space import SplitEntity
-
-from .. import nn, tag
-
-
-@autotvm.register_topi_compute("conv2d_hwcn.cuda")
-def conv2d_hwcn(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
-    """Compute conv2d with HWCN layout on CUDA"""
-    return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_hwcn.cuda")
-def schedule_conv2d_hwcn(cfg, outs):
-    """Schedule for conv2d_hwcn and any element-wise operations.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_hwcn in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d_hwcn.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    sch = te.create_schedule([x.op for x in outs])
-
-    def schedule(Apad, W, B):
-        """Schedule conv2d_hwcn"""
-        sch[Apad].compute_inline()
-        AA = sch.cache_read(Apad, "shared", [B])
-        WW = sch.cache_read(W, "shared", [B])
-        AL = sch.cache_read(AA, "local", [B])
-        WL = sch.cache_read(WW, "local", [B])
-
-        if B.op in sch.outputs:
-            Out = B
-            BL = sch.cache_write(Out, "local")
-        else:
-            Out = sch.outputs[0].output(0)
-            sch[B].set_scope("local")
-            BL = B
-
-        hi, wi, fi, ni = sch[Out].op.axis
-
-        # Create tuning space
-        n_thread_cand = [1, 2, 4, 8, 16, 32]
-        vthread_cand = [1, 2, 4, 8]
-
-        cfg.define_split(
-            "tile_fi",
-            fi,
-            num_outputs=4,
-            filter=lambda x: (x.size[1] in vthread_cand and x.size[2] in n_thread_cand),
-        )
-        cfg.define_split(
-            "tile_ni",
-            ni,
-            num_outputs=4,
-            filter=lambda x: (x.size[1] in vthread_cand and x.size[2] in n_thread_cand),
-        )
-
-        if cfg.is_fallback:
-            cfg["tile_fi"] = SplitEntity([-1, 2, 8, 4])
-            cfg["tile_ni"] = SplitEntity([-1, 2, 8, 4])
-
-        # Scheduling
-        step = 8
-
-        bz = sch[Out].fuse(hi, wi)
-        by, tyz, ty, fi = cfg["tile_fi"].apply(sch, Out, fi)
-        bx, txz, tx, ni = cfg["tile_ni"].apply(sch, Out, ni)
-        sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)
-
-        sch[Out].bind(bz, te.thread_axis("blockIdx.z"))
-        sch[Out].bind(by, te.thread_axis("blockIdx.y"))
-        sch[Out].bind(bx, te.thread_axis("blockIdx.x"))
-        sch[Out].bind(tyz, te.thread_axis("vthread"))
-        sch[Out].bind(txz, te.thread_axis("vthread"))
-        sch[Out].bind(ty, te.thread_axis("threadIdx.y"))
-        sch[Out].bind(tx, te.thread_axis("threadIdx.x"))
-
-        # Schedule BL local write
-        sch[BL].compute_at(sch[Out], tx)
-        yi, xi, fi, ni = sch[BL].op.axis
-        ry, rx, rc = sch[BL].op.reduce_axis
-        rco, rci = sch[BL].split(rc, factor=step)
-        sch[BL].reorder(rco, ry, rx, rci, fi, ni)
-        fuse_index = sch[BL].fuse(ry, rx)
-        fuse_index = sch[BL].fuse(fuse_index, rco)
-        rx = fuse_index
-
-        sch[AA].compute_at(sch[BL], rx)
-        sch[WW].compute_at(sch[BL], rx)
-        sch[AL].compute_at(sch[BL], rci)
-        sch[WL].compute_at(sch[BL], rci)
-        # Schedule for A's shared memory load
-        yi, xi, ci, ni = sch[AA].op.axis
-        ty, ci = sch[AA].split(ci, nparts=cfg["tile_fi"].size[2])
-        tx, ni = sch[AA].split(ni, nparts=cfg["tile_ni"].size[2])
-        _, ni = sch[AA].split(ni, factor=4)
-        sch[AA].reorder(ty, tx, yi, xi, ci, ni)
-        sch[AA].bind(ty, te.thread_axis("threadIdx.y"))
-        sch[AA].bind(tx, te.thread_axis("threadIdx.x"))
-        sch[AA].vectorize(ni)
-        # Schedule for W's shared memory load
-        yi, xi, ci, fi = sch[WW].op.axis
-        ty, ci = sch[WW].split(ci, nparts=cfg["tile_fi"].size[2])
-        tx, fi = sch[WW].split(fi, nparts=cfg["tile_ni"].size[2])
-        _, fi = sch[WW].split(fi, factor=4)
-        sch[WW].reorder(ty, tx, yi, xi, ci, fi)
-        sch[WW].bind(ty, te.thread_axis("threadIdx.y"))
-        sch[WW].bind(tx, te.thread_axis("threadIdx.x"))
-        sch[WW].vectorize(fi)
-
-    scheduled_ops = []
-
-    def traverse(operator):
-        """Traverse operators from computation graph"""
-        if tag.is_broadcast(operator.tag):
-            if operator not in sch.outputs:
-                sch[operator].compute_inline()
-            for tensor in operator.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        elif operator.tag == "conv2d_hwcn":
-            Apad = operator.input_tensors[0]
-            W = operator.input_tensors[1]
-            if isinstance(W.op, tvm.te.ComputeOp) and "dilate" in W.op.tag:
-                sch[W].compute_inline()
-            B = operator.output(0)
-            schedule(Apad, W, B)
-        else:
-            raise RuntimeError(f"Unsupported operator: {operator.tag}")
-
-        scheduled_ops.append(operator)
-
-    traverse(outs[0].op)
-    return sch
diff --git a/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py b/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py
deleted file mode 100644
index be9218431c85..000000000000
--- a/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py
+++ /dev/null
@@ -1,428 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-function-args
-# pylint: disable=too-many-statements, unused-argument, too-many-arguments
-"""Tensorcore template for cuda backend"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.target import Target
-from tvm.topi.cuda.injective import schedule_injective_from_existing
-from ..utils import get_const_tuple, traverse_inline, simplify, tag
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple
-from .tensor_intrin import intrin_wmma_load_matrix_A
-from .tensor_intrin import intrin_wmma_load_matrix_W
-from .tensor_intrin import intrin_wmma_store_matrix
-from .tensor_intrin import intrin_wmma_gemm
-
-
-def unpack_HWNCnc_to_hwnc(packed_out, out_dtype):
-    """Unpack conv2d_hwnc output from layout hwncnc to hwnc
-
-     Parameters
-    -----------
-    packed_out : tvm.te.Tensor
-        The output tensor of conv2d_hwnc.
-
-    out_dtype : str
-        The output dtype.
-
-    Returns
-    -------
-    unpacked_out : tvm.te.Tensor
-        The unpacked output tensor in hwnc layout.
-    """
-    H, W, N, O, wmma_m, wmma_n = get_const_tuple(packed_out.shape)
-
-    idxmod = tvm.tir.indexmod
-    idxdiv = tvm.tir.indexdiv
-
-    oshape = (H, W, N * wmma_m, O * wmma_n)
-    unpacked_out = te.compute(
-        oshape,
-        lambda h, w, n, o: packed_out[
-            h, w, idxdiv(n, wmma_m), idxdiv(o, wmma_n), idxmod(n, wmma_m), idxmod(o, wmma_n)
-        ].astype(out_dtype),
-        name="output_unpack",
-        tag=tag.INJECTIVE + ",unpack_hwncc",
-    )
-    return unpacked_out
-
-
-def conv2d_hwnc_tensorcore(data, kernel, strides, padding, dilation, in_dtype, out_dtype="int32"):
-    """ "Compute conv2d with tensorcore for HWNC layout with int8/int4"""
-    assert data.dtype in ("int4", "uint4", "int8", "uint8")
-    assert kernel.dtype in ("int4", "uint4", "int8", "uint8")
-    packed_out = hwnc_tensorcore_cuda(data, kernel, strides, padding, dilation, out_dtype)
-    return unpack_HWNCnc_to_hwnc(packed_out, out_dtype)
-
-
-@autotvm.register_topi_compute("conv2d_HWNCnc_tensorcore.cuda")
-def hwnc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtype="int32"):
-    """Compute declaration for tensorcore"""
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    in_dtype = Input.dtype
-
-    if in_dtype in ["int4", "uint4"]:
-        wmma_n = wmma_m = 8
-        wmma_k = 32
-    else:
-        wmma_m = 8
-        wmma_n = 32
-        wmma_k = 16
-
-    pre_computed = len(Filter.shape) == 6
-    in_height, in_width, batch, in_channels = get_const_tuple(Input.shape)
-    if pre_computed:
-        kernel_h, kernel_w, oc_chunk, _, oc_block_factor, _ = get_const_tuple(Filter.shape)
-        num_filter = oc_block_factor * oc_chunk
-    else:
-        kernel_h, kernel_w, num_filter, _ = get_const_tuple(Filter.shape)
-
-    if in_dtype in ["int4", "uint4"]:
-        assert batch % 8 == 0 and in_channels % 32 == 0 and num_filter % 8 == 0
-    else:
-        assert batch % 8 == 0 and in_channels % 16 == 0 and num_filter % 32 == 0, (
-            "The shape of (batch, in_channels, num_filter) "
-            "must be multiple of (8, 16, 32) for int8, "
-            "and (8, 32, 8) for int4"
-        )
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    out_channels = num_filter
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-
-    cfg.add_flop(
-        2 * batch * out_height * out_width * out_channels * in_channels * kernel_h * kernel_w
-    )
-
-    # Input feature map: (H, W, N, IC, n, ic)
-    data_shape = (in_height, in_width, batch // wmma_m, in_channels // wmma_k, wmma_m, wmma_k)
-
-    # Kernel: (H, W, OC, IC, oc, ic)
-    kernel_shape = (
-        kernel_h,
-        kernel_w,
-        out_channels // wmma_n,
-        in_channels // wmma_k,
-        wmma_n,
-        wmma_k,
-    )
-
-    # Reduction axes
-    kh = te.reduce_axis((0, kernel_h), name="kh")
-    kw = te.reduce_axis((0, kernel_w), name="kw")
-    ic = te.reduce_axis((0, in_channels // wmma_k), name="ic")
-    ii = te.reduce_axis((0, wmma_k), name="ii")
-
-    if pre_computed:
-        packed_kernel = Filter
-    else:
-        packed_kernel = te.compute(
-            kernel_shape,
-            lambda kh, kw, o, i, oo, ii: Filter[kh, kw, o * wmma_n + oo, i * wmma_k + ii],
-            name="packed_kernel",
-        )
-
-    packed_data = te.compute(
-        data_shape, lambda h, w, n, i, nn, ii: Input[h, w, n * wmma_m + nn, i * wmma_k + ii]
-    )
-
-    pad_before = [pad_top, pad_left, 0, 0, 0, 0]
-    pad_after = [pad_down, pad_right, 0, 0, 0, 0]
-    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
-
-    Conv = te.compute(
-        (out_height, out_width, batch // wmma_m, out_channels // wmma_n, wmma_m, wmma_n),
-        lambda h, w, n, o, nn, oo: te.sum(
-            (
-                pad_data[h * stride_h + kh, w * stride_w + kw, n, ic, nn, ii].astype("int32")
-                * packed_kernel[kh, kw, o, ic, oo, ii].astype("int32")
-            ),
-            axis=[ic, kh, kw, ii],
-        ),
-        name="Conv",
-        tag="conv2d_HWNCnc_tensorcore",
-    )
-    return Conv
-
-
-def schedule_hwnc_tensorcore_cuda(cfg, s, Conv):
-    """Schedule tensorcore template"""
-    pad_data, packed_kernel = s[Conv].op.input_tensors
-    ic, kh, kw, ii = s[Conv].op.reduce_axis
-    packed_data = s[pad_data].op.input_tensors[0]
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    # Designate the memory hierarchy
-    AS = s.cache_read(pad_data, "shared", [Conv])
-    WS = s.cache_read(packed_kernel, "shared", [Conv])
-    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
-    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
-    ConvF = s.cache_write(Conv, "wmma.accumulator")
-
-    if Conv.op in s.outputs:
-        output = Conv
-        ConvS = s.cache_read(ConvF, "shared", [Conv])
-        OL = ConvS
-    else:
-        output = s.outputs[0].output(0)
-        s[Conv].set_scope("shared")
-        OL = Conv
-
-    out_dtype = Conv.dtype
-
-    if isinstance(packed_kernel.op, te.tensor.ComputeOp) and packed_kernel.name == "packed_kernel":
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
-        else:
-            with Target("cuda"):
-                schedule_injective_from_existing(s, packed_kernel)
-
-    if isinstance(pad_data.op, te.tensor.ComputeOp) and "pad" in pad_data.op.tag:
-        s[pad_data].compute_inline()
-        data = pad_data.op.input_tensors[0]
-
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # skip this part during tuning to make recrods accurate
-            # this part will be pre-computed during NNVM's pre-compute optimization pass
-            s[pad_data].pragma(s[pad_data].op.axis[0], "debug_skip_region")
-    else:
-        data = pad_data
-        s[data].compute_inline()
-
-    data_dtype = data.dtype
-    kernel_dtype = packed_kernel.dtype
-
-    # Schedule for autotvm
-    cfg.define_knob("block_row_warps", [1, 2, 4])
-    cfg.define_knob("block_col_warps", [1, 2, 4])
-    cfg.define_knob("warp_row_tiles", [1, 2, 4, 8, 16])
-    cfg.define_knob("warp_col_tiles", [1, 2, 4, 8, 16])
-    cfg.define_knob("chunk", [1, 2, 4, 8])
-    cfg.define_knob("split_block_k_nums", [1, 2, 4, 8, 16, 32])
-    cfg.define_knob("vector_ws", [1, 8])
-    cfg.define_knob("vector_as", [1, 8, 16])
-
-    block_row_warps = cfg["block_row_warps"].val
-    block_col_warps = cfg["block_col_warps"].val
-    warp_row_tiles = cfg["warp_row_tiles"].val
-    warp_col_tiles = cfg["warp_col_tiles"].val
-    chunk = cfg["chunk"].val
-    vector_as = cfg["vector_as"].val
-    vector_ws = cfg["vector_ws"].val
-    split_block_k_nums = cfg["split_block_k_nums"].val
-
-    s[packed_data].compute_inline()
-
-    if data_dtype in ["int4", "uint4"]:
-        wmma_m = wmma_n = 8
-        wmma_k = 32
-    else:
-        wmma_m = 8
-        wmma_n = 32
-        wmma_k = 16
-
-    warp_size = 32
-
-    # Schedule for output
-    if len(s[output].op.axis) == 4:
-        (
-            hc,
-            wc,
-            nc,
-            oc,
-        ) = output.op.axis
-        nc, nnc = s[output].split(nc, factor=wmma_m)
-        oc, ooc = s[output].split(oc, factor=wmma_n)
-    else:
-        hc, wc, nc, oc, nnc, ooc = output.op.axis
-
-    kernel_scope, hc = s[output].split(hc, nparts=1)
-
-    block_k = s[output].fuse(hc, wc)
-    block_k, split_block_k = s[output].split(block_k, factor=split_block_k_nums)
-    nc, nci = s[output].split(nc, factor=warp_row_tiles)
-    block_i, nc = s[output].split(nc, factor=block_row_warps)
-    oc, oci = s[output].split(oc, factor=warp_col_tiles)
-    block_j, oc = s[output].split(oc, factor=block_col_warps)
-    s[output].reorder(block_k, split_block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
-    t = s[output].fuse(nnc, ooc)
-    _, tx = s[output].split(t, factor=warp_size)
-    s[output].bind(block_k, block_z)
-    s[output].bind(block_i, block_x)
-    s[output].bind(block_j, block_y)
-    s[output].bind(tx, thread_x)
-    s[output].bind(nc, thread_y)
-    s[output].bind(oc, thread_z)
-
-    # Schedule wmma store
-    s[OL].compute_at(s[output], block_j)
-    hc, wc, nc, oc, nnc, ooc = OL.op.axis
-    oc, oci = s[OL].split(oc, factor=warp_col_tiles)
-    _, oc = s[OL].split(oc, factor=block_col_warps)
-    nc, nci = s[OL].split(nc, factor=warp_row_tiles)
-    _, nc = s[OL].split(nc, factor=block_row_warps)
-    s[OL].reorder(nc, oc, nci, oci, nnc, ooc)
-    s[OL].bind(nc, thread_y)
-    s[OL].bind(oc, thread_z)
-
-    # Schedule local computation
-    s[ConvF].compute_at(s[OL], oc)
-    _, _, n, o, nnf, oof = ConvF.op.axis
-    ko, ki = s[ConvF].split(ic, factor=chunk)
-    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)
-
-    cfg.define_reorder("reorder_inner", [ko, kh], policy="all")
-    cfg["reorder_inner"].apply(s, ConvF, [ko, kh])
-    cfg["reorder_inner"].apply(s, ConvF, [ki, kw])
-
-    # Move intermediate computation into each output compute tile
-    s[AF].compute_at(s[ConvF], kw)
-    s[WF].compute_at(s[ConvF], kw)
-
-    # Schedule for A's share memory
-    s[AS].compute_at(s[ConvF], ko)
-
-    _, _, n, _, nn, ii = AS.op.axis
-    tx, xo = s[AS].split(n, nparts=block_row_warps)
-    ty, _ = s[AS].split(xo, nparts=block_col_warps)
-    t = s[AS].fuse(nn, ii)
-    to, ti = s[AS].split(t, nparts=warp_size)
-    ti, _t = s[AS].split(ti, factor=vector_as)
-    s[AS].bind(tx, thread_y)
-    s[AS].bind(ty, thread_z)
-    s[AS].bind(to, thread_x)
-    s[AS].vectorize(_t)
-
-    # Schedule for W's share memory
-    s[WS].compute_at(s[ConvF], kw)
-    kh, kw, ic, o, ii, oo = WS.op.axis
-    tx, xo = s[WS].split(o, nparts=block_row_warps)
-    ty, _ = s[WS].split(xo, nparts=block_col_warps)
-    t = s[WS].fuse(ii, oo)
-    to, ti = s[WS].split(t, nparts=warp_size)
-    ti, _t = s[WS].split(ti, factor=vector_ws)
-    s[WS].bind(tx, thread_y)
-    s[WS].bind(ty, thread_z)
-    s[WS].bind(to, thread_x)
-    s[WS].vectorize(ti)
-
-    # double buffer
-    cfg.define_knob("AS_double_buffer", [0, 1])
-    cfg.define_knob("WS_double_buffer", [0, 1])
-    if cfg["AS_double_buffer"].val:
-        s[AS].double_buffer()
-    if cfg["WS_double_buffer"].val:
-        s[WS].double_buffer()
-
-    # unroll
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", False)
-
-    shape = (wmma_m, wmma_n, wmma_k)
-
-    AS_shape = (wmma_m, wmma_k)
-    AL_shape = (wmma_m, wmma_k)
-    WS_shape = (wmma_n, wmma_k)
-    WL_shape = (wmma_n, wmma_k)
-    CL_shape = (wmma_m, wmma_n)
-    CS_shape = (wmma_m, wmma_n)
-
-    AL_gemm = te.placeholder(AL_shape, name="A", dtype=data_dtype)
-    WL_gemm = te.placeholder(WL_shape, name="B", dtype=kernel_dtype)
-    k_gemm = te.reduce_axis((0, wmma_k), name="k")
-    CL_compute = te.compute(
-        CL_shape,
-        lambda ii, jj: te.sum(
-            (AL_gemm[ii, k_gemm].astype("int32") * WL_gemm[jj, k_gemm].astype("int32")), axis=k_gemm
-        ),
-        name="C",
-    )
-
-    AL_strides = [wmma_k, 1]
-    AS_strides = [wmma_k, 1]
-    WL_strides = [wmma_k, 1]
-    WS_strides = [wmma_k, 1]
-    CL_strides = [wmma_n, 1]
-    CS_strides = [wmma_n, 1]
-
-    s[AF].tensorize(
-        AF.op.axis[-2],
-        intrin_wmma_load_matrix_A(
-            AL_strides, AS_strides, shape, "row_major", AS_shape, AL_shape, data_dtype
-        ),
-    )
-
-    s[WF].tensorize(
-        WF.op.axis[-2],
-        intrin_wmma_load_matrix_W(
-            WL_strides, WS_strides, shape, "col_major", WS_shape, WL_shape, kernel_dtype
-        ),
-    )
-
-    s[OL].tensorize(
-        nnc, intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype, CL_shape, CS_shape)
-    )
-
-    s[ConvF].tensorize(
-        nnf,
-        intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides, CL_strides, shape),
-    )
-
-    return s
-
-
-@autotvm.register_topi_schedule("conv2d_HWNCnc_tensorcore.cuda")
-def schedule_conv2d_hwnc_tensorcore(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_HWNCnc_tensorcore" in op.tag:
-            schedule_hwnc_tensorcore_cuda(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py
deleted file mode 100644
index b959136999bc..000000000000
--- a/python/tvm/topi/cuda/conv2d_int8.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-# pylint: disable=no-value-for-parameter
-"""Int8 conv2d in NCHWc layout"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from .injective import schedule_injective_from_existing
-from .tensor_intrin import dp4a
-from ..nn.pad import pad
-from ..nn.conv2d import unpack_NCHWc_to_nchw
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
-
-
-def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"):
-    """Compute conv2d internally using conv2d_nchwc layout for int8 dtype"""
-    assert data.dtype in ("int8", "uint8")
-    assert kernel.dtype in ("int8", "uint8")
-    assert data.dtype == kernel.dtype
-    packed_out = conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, "NCHW", out_dtype)
-    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
-
-
-def schedule_conv2d_nchw_int8(outs):
-    """Create schedule for tensors"""
-    return schedule_conv2d_NCHWc_int8(outs)
-
-
-@autotvm.register_topi_compute("conv2d_NCHWc_int8.cuda")
-def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype):
-    """Convolution operator in NCHW[x]c layout for int8.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width] or
-        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
-
-    kernel : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
-        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
-        filter_width, num_filter_block, in_channel_block]
-
-    stride : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding: int or a list/tuple of two ints
-        padding size, or [pad_height, pad_width]
-
-    dilation: int or a list/tuple of two ints
-        dilation size, or [dilation_height, dilation_width]
-
-    layout : str
-        layout of data
-
-    out_dtype : str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
-    """
-    assert layout in ["NCHW", "NCHW4c"]
-    ic_block_factor = 4
-    oc_block_factor = 4
-
-    pre_computed = len(kernel.shape) == 6
-    if not pre_computed:
-        batch, channels, height, width = get_const_tuple(data.shape)
-        assert (
-            channels % ic_block_factor == 0
-        ), f"Number of input channels should be multiple of {ic_block_factor}"
-        packed_data = te.compute(
-            (batch, channels // ic_block_factor, height, width, ic_block_factor),
-            lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w],
-            name="packed_data",
-        )
-
-        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
-        assert (
-            out_channels % oc_block_factor == 0
-        ), f"Number of output channels should be multiple of {oc_block_factor}"
-        packed_kernel = te.compute(
-            (
-                out_channels // oc_block_factor,
-                in_channels // ic_block_factor,
-                kernel_h,
-                kernel_w,
-                oc_block_factor,
-                ic_block_factor,
-            ),
-            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[
-                oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw
-            ],
-            name="packed_kernel",
-        )
-
-    else:
-        packed_data = data
-        packed_kernel = kernel
-
-    batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(packed_data.shape)
-    oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
-        packed_kernel.shape
-    )
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
-    # compute graph
-    pad_before = [0, 0, pad_top, pad_left, 0]
-    pad_after = [0, 0, pad_down, pad_right, 0]
-    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1
-    out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1
-    oshape = (batch, oc_chunk, out_height, out_width, oc_block)
-
-    icc = te.reduce_axis((0, ic_chunk), name="ic_chunk")
-    icb = te.reduce_axis((0, ic_block), name="ic_block")
-    kh = te.reduce_axis((0, kernel_h), name="kh")
-    kw = te.reduce_axis((0, kernel_w), name="kw")
-
-    packed_kernel_dtype = packed_kernel.dtype
-    packed_dtype = "int32" if packed_kernel_dtype == "int8" else "uint32"
-    conv = te.compute(
-        oshape,
-        lambda n, oc_chunk, oh, ow, oc_block: te.sum(
-            pad_data[
-                n, icc, oh * stride_h + kh * dilation_h, ow * stride_w + kw * dilation_w, icb
-            ].astype(packed_dtype)
-            * packed_kernel[oc_chunk, icc, kh, kw, oc_block, icb].astype(packed_dtype),
-            axis=[icc, kh, kw, icb],
-        ),
-    )
-
-    output = te.compute(
-        oshape,
-        lambda n, oc_chunk, oh, ow, oc_block: conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype),
-        tag="conv2d_NCHWc_int8",
-    )
-
-    # num flop
-    num_flop = (
-        batch
-        * oc_chunk
-        * oc_block
-        * out_height
-        * out_width
-        * ic_chunk
-        * ic_block
-        * kernel_h
-        * kernel_w
-        * 2
-    )
-    cfg.add_flop(num_flop)
-
-    return output
-
-
-@autotvm.register_topi_schedule("conv2d_NCHWc_int8.cuda")
-def schedule_conv2d_NCHWc_int8(cfg, outs):
-    """Schedule conv2d int8 NCHWc template"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv2d_NCHWc_int8":
-            _schedule_conv2d_NCHWc_int8(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_conv2d_NCHWc_int8(cfg, s, output):
-    conv = output.op.input_tensors[0]
-    packed_data, packed_kernel = conv.op.input_tensors
-
-    if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag:
-        pad_data = packed_data
-        packed_data = pad_data.op.input_tensors[0]
-    else:
-        pad_data = packed_data
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # skip this part during tuning to make recrods accurate
-        # this part will be pre-computed during NNVM's pre-compute optimization pass
-        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
-        s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
-    else:
-        if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel":
-            # data and kernel are not pre-computed, schedule layout transform here
-            schedule_injective_from_existing(s, packed_data)
-            schedule_injective_from_existing(s, packed_kernel)
-
-    if pad_data != packed_data:
-        s[pad_data].compute_inline()
-
-    # create cache stage
-    AA = s.cache_read(pad_data, "shared", [conv])
-    WW = s.cache_read(packed_kernel, "shared", [conv])
-
-    s[conv].set_scope("local")
-
-    # handle bias
-    if output.op not in s.outputs:
-        s[output].compute_inline()
-        output = s.outputs[0].output(0)
-
-    # tile and bind spatial axes
-    if len(s[output].op.axis) == 5:
-        n, f, y, x, c = s[output].op.axis
-    else:
-        # For task extraction of auto-tuning, the expected output is 4D.  Since auto-tuning tasks
-        # are created from scratch, therefore the real auto-tuning will still happen on 5D output.
-        n, f, y, x = s[output].op.axis
-
-    cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
-    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
-    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
-
-    # this is the scope to attach global config inside this kernel
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi)
-    s[output].bind(bn, te.thread_axis("blockIdx.z"))
-    s[output].bind(bf, te.thread_axis("blockIdx.y"))
-    s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x"))
-    s[output].bind(vn, te.thread_axis("vthread"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-
-    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
-    if cfg["fuse_yx"].val:
-        s[output].bind(tn, te.thread_axis("threadIdx.z"))
-        s[output].bind(tf, te.thread_axis("threadIdx.y"))
-        tyx = s[output].fuse(ty, tx)
-        s[output].bind(tyx, te.thread_axis("threadIdx.x"))
-        s[conv].compute_at(s[output], tyx)
-
-        # number of threads
-        n_tz = cfg["tile_n"].size[2]
-        n_ty = cfg["tile_f"].size[2]
-        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
-    else:
-        s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z"))
-        s[output].bind(ty, te.thread_axis("threadIdx.y"))
-        s[output].bind(tx, te.thread_axis("threadIdx.x"))
-        s[conv].compute_at(s[output], tx)
-
-        # number of threads
-        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
-        n_ty = cfg["tile_y"].size[2]
-        n_tx = cfg["tile_x"].size[2]
-
-    # tile and bind reduction axes
-    n, f, y, x, c = s[conv].op.axis
-
-    rc, ry, rx, rc_block = s[conv].op.reduce_axis
-    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
-    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
-    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
-    rco, rci = cfg["tile_rc"].apply(s, conv, rc)
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
-
-    cfg.define_reorder("reorder_inner", [rco, ryo, rxo], policy="all")
-    cfg["reorder_inner"].apply(s, conv, [rco, ryo, rxo])
-    cfg["reorder_inner"].apply(s, conv, [rci, ryi, rxi])
-
-    _, rc_block = s[conv].split(rc_block, factor=4)
-    target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
-
-    if do_tensorize:
-        dtypes = (pad_data.dtype, packed_kernel.dtype)
-        s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes))
-
-    cache_loc = [rco, ryo, rxo][cfg["reorder_inner"].perm[-1]]
-    s[AA].compute_at(s[conv], cache_loc)
-    s[WW].compute_at(s[conv], cache_loc)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        c = s[load].op.axis[-1]
-        c_outer, c = s[load].split(c, factor=4)
-        s[load].vectorize(c)
-        fused = s[load].op.axis[:-1] + [c_outer]
-        fused = s[load].fuse(*fused)
-
-        fused, tx = s[load].split(fused, factor=n_tx)
-        fused, ty = s[load].split(fused, factor=n_ty)
-        fused, tz = s[load].split(fused, factor=n_tz)
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # double buffer
-    cfg.define_knob("AA_double_buffer", [0, 1])
-    cfg.define_knob("WW_double_buffer", [0, 1])
-    if cfg["AA_double_buffer"].val:
-        s[AA].double_buffer()
-    if cfg["WW_double_buffer"].val:
-        s[WW].double_buffer()
-
-    # unroll
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", False)
-
-    return s
diff --git a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py
deleted file mode 100644
index 76f082f07b44..000000000000
--- a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py
+++ /dev/null
@@ -1,343 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-function-args
-# pylint: disable=too-many-statements, unused-argument, too-many-arguments
-"""Tensorcore template for cuda backend"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import get_const_tuple, traverse_inline, simplify
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple
-from .tensor_intrin import intrin_wmma_load_matrix_A
-from .tensor_intrin import intrin_wmma_load_matrix_W
-from .tensor_intrin import intrin_wmma_store_matrix
-from .tensor_intrin import intrin_wmma_gemm
-
-
-def nhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """Compute declaration for tensorcore"""
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_height, in_width, in_channel = get_const_tuple(Input.shape)
-    kernel_h, kernel_w, _, num_filter = get_const_tuple(Filter.shape)
-    assert (
-        (batch % 16 == 0 and in_channel % 16 == 0 and num_filter % 16 == 0)
-        or (batch % 8 == 0 and in_channel % 16 == 0 and num_filter % 32 == 0)
-        or (batch % 32 == 0 and in_channel % 16 == 0 and num_filter % 8 == 0)
-    ), (
-        "The shape of (batch, in_channel, num_filter) "
-        "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
-    )
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    out_channel = num_filter
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    pad_before = [0, pad_top, pad_left, 0]
-    pad_after = [0, pad_down, pad_right, 0]
-    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    # convert data type of input feature maps and weights
-    # TODO: add checking here, datatype casting may cause precision loss
-    TransPaddedInput = te.compute(
-        PaddedInput.shape, lambda n, h, w, c: PaddedInput[n, h, w, c].astype("float16")
-    )
-    TransFilter = te.compute(Filter.shape, lambda h, w, i, o: Filter[h, w, i, o].astype("float16"))
-    Output = te.compute(
-        (batch, out_height, out_width, out_channel),
-        lambda nn, yy, xx, ff: te.sum(
-            TransPaddedInput[
-                nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc
-            ].astype(out_dtype)
-            * TransFilter[ry, rx, rc, ff].astype(out_dtype),
-            axis=[ry, rx, rc],
-        ),
-        name="Conv2dOutput",
-        tag="conv2d_nhwc_tensorcore",
-    )
-    return Output
-
-
-def schedule_nhwc_tensorcore_cuda(cfg, s, Conv):
-    """Schedule tensorcore template"""
-    kh, kw, ic = s[Conv].op.reduce_axis
-    out_dtype = Conv.dtype
-    trans_paddata, kernel = s[Conv].op.input_tensors
-    in_dtype = trans_paddata.dtype
-    batch, _, _, _ = get_const_tuple(Conv.shape)
-    _, _, _, out_channels = get_const_tuple(kernel.shape)
-    paddata = s[trans_paddata].op.input_tensors
-
-    # inline the pad and dtype transform
-    s[trans_paddata].compute_inline()
-    s[kernel].compute_inline()
-    s[paddata[0]].compute_inline()
-
-    # Designate the memory hierarchy
-    AS = s.cache_read(trans_paddata, "shared", [Conv])
-    WS = s.cache_read(kernel, "shared", [Conv])
-    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
-    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
-    ConvF = s.cache_write(Conv, "wmma.accumulator")
-
-    if Conv.op in s.outputs:
-        output = Conv
-        ConvS = s.cache_read(ConvF, "shared", [Conv])
-        OL = ConvS
-    else:
-        output = s.outputs[0].output(0)
-        s[Conv].set_scope("shared")
-        OL = Conv
-
-    # Schedule for autotvm
-    cfg.define_knob("block_row_warps", [1, 2, 4])
-    cfg.define_knob("block_col_warps", [1, 2, 4])
-    cfg.define_knob("warp_row_tiles", [1, 2, 4])
-    cfg.define_knob("warp_col_tiles", [1, 2, 4])
-    cfg.define_knob("chunk", [1, 2, 4, 8])
-    cfg.define_knob("offset", [0, 8])
-    cfg.define_knob("vector_width", [1, 2, 4, 8])
-
-    if batch % 16 == 0 and out_channels % 16 == 0:
-        cfg.define_knob("wmma_m", [16, 8, 32])
-    elif batch % 8 == 0 and out_channels % 32 == 0:
-        cfg.define_knob("wmma_m", [8, 16, 32])
-    elif batch % 32 == 0 and out_channels % 8 == 0:
-        cfg.define_knob("wmma_m", [32, 16, 8])
-
-    # fallback support
-    target = tvm.target.Target.current()
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(
-            target.kind.name, target.model, "conv2d_nhwc_tensorcore.cuda"
-        )
-        cfg.fallback_with_reference_log(ref_log)
-
-    block_row_warps = cfg["block_row_warps"].val
-    block_col_warps = cfg["block_col_warps"].val
-    warp_row_tiles = cfg["warp_row_tiles"].val
-    warp_col_tiles = cfg["warp_col_tiles"].val
-    chunk = cfg["chunk"].val
-    offset = cfg["offset"].val
-    wmma_m = cfg["wmma_m"].val
-    vector_width = cfg["vector_width"].val
-
-    wmma_k = 16
-    if wmma_m == 16:
-        wmma_n = 16
-    elif wmma_m == 8:
-        wmma_n = 32
-    elif wmma_m == 32:
-        wmma_n = 8
-
-    warp_size = 32
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    # Define the intrin strides
-    def get_strides(extents):
-        return [np.prod(extents[i:]).tolist() for i in range(len(extents))]
-
-    AS_align = chunk * wmma_k + offset
-    WS_align = warp_col_tiles * block_col_warps * wmma_n + offset
-    block_factor_n = wmma_m * warp_row_tiles * block_row_warps
-    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
-    CS_align = block_factor_o + offset
-    AS_strides = get_strides([1, 1, AS_align, 1])
-    AL_strides = get_strides([1, 1, wmma_k, 1])
-    WS_strides = get_strides([WS_align, 1])
-    WL_strides = get_strides([wmma_n * warp_col_tiles, 1])
-    CL_strides = get_strides([1, 1, wmma_n * warp_col_tiles, 1])
-    CS_strides = get_strides([1, 1, CS_align, 1])
-
-    # Schedule for output
-    nc, hc, wc, oc = output.op.axis
-    block_k = s[output].fuse(hc, wc)
-    s[output].bind(block_k, block_z)
-    block_i, nc = s[output].split(nc, factor=block_factor_n)
-    block_j, oc = s[output].split(oc, factor=block_factor_o)
-    s[output].reorder(block_k, block_i, block_j, nc, oc)
-    t = s[output].fuse(nc, oc)
-    t, ti = s[output].split(t, factor=vector_width)
-    t, tx = s[output].split(t, factor=warp_size)
-    t, ty = s[output].split(t, factor=block_row_warps)
-    t, tz = s[output].split(t, factor=block_col_warps)
-    s[output].bind(block_i, block_x)
-    s[output].bind(block_j, block_y)
-    s[output].bind(tz, thread_z)
-    s[output].bind(ty, thread_y)
-    s[output].bind(tx, thread_x)
-    s[output].vectorize(ti)
-
-    # Schedule wmma store
-    s[OL].compute_at(s[output], block_j)
-    nc, hc, wc, oc = OL.op.axis
-    s[OL].reorder(hc, wc, nc, oc)
-    s[OL].storage_align(wc, CS_align - 1, CS_align)
-    oc, ooc = s[OL].split(oc, factor=wmma_n)
-    oc, oci = s[OL].split(oc, factor=warp_col_tiles)
-    _, oc = s[OL].split(oc, factor=block_col_warps)
-    nc, nnc = s[OL].split(nc, factor=wmma_m)
-    nc, nci = s[OL].split(nc, factor=warp_row_tiles)
-    _, nc = s[OL].split(nc, factor=block_row_warps)
-    s[OL].reorder(nc, oc, nci, oci, nnc, ooc)
-    s[OL].bind(nc, thread_y)
-    s[OL].bind(oc, thread_z)
-
-    # Schedule wmma computation
-    s[ConvF].compute_at(s[OL], oc)
-    n, h, w, o = ConvF.op.axis
-    n, nnf = s[ConvF].split(n, factor=wmma_m)
-    o, oof = s[ConvF].split(o, factor=wmma_n)
-    ic, ii = s[ConvF].split(ic, factor=wmma_k)
-    ko, ki = s[ConvF].split(ic, factor=chunk)
-    s[ConvF].reorder(kh, kw, ko, ki, n, o, nnf, oof, ii)
-
-    s[AF].compute_at(s[ConvF], ki)
-    s[WF].compute_at(s[ConvF], ki)
-
-    # Schedule wmma load
-    n, h, w, i = AF.op.axis
-    n, nn = s[AF].split(n, factor=wmma_m)
-    i, ii = s[AF].split(i, factor=wmma_k)
-    s[AF].reorder(n, i, nn, ii)
-
-    kh, kw, i, o = WF.op.axis
-    i, ii = s[WF].split(i, factor=wmma_k)
-    o, oo = s[WF].split(o, factor=wmma_n)
-    s[WF].reorder(o, i, oo)
-    s[WF].reorder(i, o, ii, oo)
-
-    s[WS].compute_at(s[ConvF], ko)
-    s[AS].compute_at(s[ConvF], ko)
-
-    # Schedule for data's share memory
-    n, h, w, i = AS.op.axis
-    s[AS].reorder(h, w, n, i)
-    s[AS].storage_align(w, AS_align - 1, AS_align)
-    t = s[AS].fuse(n, i)
-    t, ti = s[AS].split(t, factor=vector_width)
-    t, tx = s[AS].split(t, factor=warp_size)
-    t, ty = s[AS].split(t, factor=block_row_warps)
-    _, tz = s[AS].split(t, factor=block_col_warps)
-    s[AS].bind(ty, thread_y)
-    s[AS].bind(tz, thread_z)
-    s[AS].bind(tx, thread_x)
-    s[AS].vectorize(ti)
-
-    # Schedule for kernel's share memory
-    kh, kw, ic, o = WS.op.axis
-    t = s[WS].fuse(ic, o)
-    s[WS].storage_align(ic, WS_align - 1, WS_align)
-    t, ti = s[WS].split(t, factor=vector_width)
-    t, tx = s[WS].split(t, factor=warp_size)
-    t, ty = s[WS].split(t, factor=block_row_warps)
-    _, tz = s[WS].split(t, factor=block_col_warps)
-    s[WS].bind(ty, thread_y)
-    s[WS].bind(tz, thread_z)
-    s[WS].bind(tx, thread_x)
-    s[WS].vectorize(ti)
-
-    shape = (wmma_m, wmma_n, wmma_k)
-
-    # tensorize the wmma process
-    AS_shape = (wmma_m, 1, 1, wmma_k)
-    AL_shape = (wmma_m, 1, 1, wmma_k)
-    WS_shape = (wmma_k, wmma_n)
-    WL_shape = (wmma_k, wmma_n)
-    CL_shape = (wmma_m, 1, 1, wmma_n)
-    CS_shape = (wmma_m, 1, 1, wmma_n)
-
-    AL_gemm = te.placeholder(AL_shape, name="A", dtype=in_dtype)
-    WL_gemm = te.placeholder(WL_shape, name="B", dtype=in_dtype)
-    k_gemm = te.reduce_axis((0, wmma_k), name="k")
-    CL_compute = te.compute(
-        CL_shape,
-        lambda ii, t0, t1, jj: te.sum(
-            AL_gemm[ii, t0, t1, k_gemm].astype(out_dtype) * WL_gemm[k_gemm, jj].astype(out_dtype),
-            axis=k_gemm,
-        ),
-        name="C",
-    )
-
-    s[AF].tensorize(
-        nn,
-        intrin_wmma_load_matrix_A(
-            AL_strides, AS_strides, shape, "row_major", AS_shape, AL_shape, in_dtype
-        ),
-    )
-    s[WF].tensorize(
-        ii,
-        intrin_wmma_load_matrix_W(
-            WL_strides, WS_strides, shape, "row_major", WS_shape, WL_shape, in_dtype
-        ),
-    )
-    s[OL].tensorize(
-        nnc, intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype, CL_shape, CS_shape)
-    )
-    s[ConvF].tensorize(
-        nnf,
-        intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides, CL_strides, shape),
-    )
-
-    N, OH, OW, CO = get_const_tuple(output.shape)
-    KH, KW, CI, _ = get_const_tuple(kernel.shape)
-    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_tensorcore.cuda")
-def conv2d_nhwc_tensorcore(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with tensorcore for NCHW layout"""
-    return nhwc_tensorcore_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_tensorcore.cuda")
-def schedule_conv2d_nhwc_tensorcore(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nhwc_tensorcore" in op.tag:
-            schedule_nhwc_tensorcore_cuda(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
deleted file mode 100644
index 77b332400d0b..000000000000
--- a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
+++ /dev/null
@@ -1,748 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-# pylint: disable=too-many-arguments,too-many-locals
-# pylint: disable=too-many-statements
-"""Winograd template for cuda backend"""
-
-import tvm
-from tvm import autotvm, te
-
-from .. import nn
-from ..nn.winograd_util import winograd_transform_matrices
-from ..utils import get_const_int, get_const_tuple, traverse_inline
-from .tensor_intrin import (
-    intrin_wmma_gemm,
-    intrin_wmma_load_matrix_A,
-    intrin_wmma_load_matrix_W,
-    intrin_wmma_store_matrix,
-)
-
-
-def _infer_tile_size(data, kernel):
-    """Compute the tile size"""
-    N, H, W, CI = get_const_tuple(data.shape)
-    if H % 8 == 0:
-        return 4
-    return 2
-
-
-def schedule_bgemm_tensorcore(cfg, s, bgemm, data_pack, kernel_pack):
-    """Schedule for bgemm tensorcore"""
-    A = data_pack
-    B = kernel_pack
-    C = bgemm
-    _, _, P, out_dim = get_const_tuple(C.shape)
-    out_dtype = C.dtype
-
-    # Explicit memory access
-    AS = s.cache_read(A, "shared", [C])
-    BS = s.cache_read(B, "shared", [C])
-    AF = s.cache_read(AS, "wmma.matrix_a", [C])
-    BF = s.cache_read(BS, "wmma.matrix_b", [C])
-    CF = s.cache_write(C, "wmma.accumulator")
-    CS = s.cache_read(CF, "shared", [C])
-
-    # Create tuning space
-    cfg.define_knob("block_row_warps", [1, 2, 4])
-    cfg.define_knob("block_col_warps", [1, 2, 4])
-    cfg.define_knob("warp_row_tiles", [1, 2, 4, 8])
-    cfg.define_knob("warp_col_tiles", [1, 2, 4, 8])
-    cfg.define_knob("chunk", [1, 2, 4, 8])
-    cfg.define_knob("offset", [0, 1, 2, 4, 8])
-    cfg.define_knob("offsetCS", [0, 1, 2, 4, 8])
-    cfg.define_knob("vec", [1, 2, 4, 8])
-
-    # Ensure that the default parameters are applicable when autotvm is not in use
-    if P % 16 == 0 and out_dim % 16 == 0:
-        cfg.define_knob("wmma_m", [16, 8, 32])
-    elif P % 32 == 0 and out_dim % 8 == 0:
-        cfg.define_knob("wmma_m", [32, 16, 8])
-    elif P % 8 == 0 and out_dim % 32 == 0:
-        cfg.define_knob("wmma_m", [8, 16, 32])
-
-    warp_size = 32
-    wmma_k = 16
-    block_row_warps = cfg["block_row_warps"].val
-    block_col_warps = cfg["block_col_warps"].val
-    warp_row_tiles = cfg["warp_row_tiles"].val
-    warp_col_tiles = cfg["warp_col_tiles"].val
-    chunk = cfg["chunk"].val
-    offsetAB = cfg["offset"].val
-    offsetCS = cfg["offsetCS"].val
-    wmma_m = cfg["wmma_m"].val
-    vec = cfg["vec"].val
-
-    if wmma_m == 16:
-        wmma_n = 16
-    elif wmma_m == 8:
-        wmma_n = 32
-    elif wmma_m == 32:
-        wmma_n = 8
-
-    # Define the stride of intrin functions
-    AS_align = chunk * wmma_k + offsetAB
-    BS_align = warp_col_tiles * block_col_warps * wmma_n + offsetAB
-    CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
-    AS_stride = [AS_align, 1]
-    BS_stride = [BS_align, 1]
-    AF_stride = [wmma_k, 1]
-    BF_stride = [wmma_n * warp_col_tiles, 1]
-    CF_stride = [warp_col_tiles * wmma_n, 1]
-    CS_stride = [CS_align, 1]
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    # Schedule for computation
-    block_factor_b = wmma_m * warp_row_tiles * block_row_warps
-    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
-    alpha_1, alpha_2, b, o = C.op.axis
-    block_k = s[C].fuse(alpha_1, alpha_2)
-    block_i, bc = s[C].split(b, factor=block_factor_b)
-    block_j, oc = s[C].split(o, factor=block_factor_o)
-    s[C].reorder(block_k, block_i, block_j, bc, oc)
-    t = s[C].fuse(bc, oc)
-    t, vi = s[C].split(t, factor=vec)
-    t, tx = s[C].split(t, factor=warp_size)
-    t, ty = s[C].split(t, factor=block_row_warps)
-    t, tz = s[C].split(t, factor=block_col_warps)
-    s[C].bind(block_k, block_z)
-    s[C].bind(block_i, block_x)
-    s[C].bind(block_j, block_y)
-    s[C].bind(tz, thread_z)
-    s[C].bind(ty, thread_y)
-    s[C].bind(tx, thread_x)
-    s[C].vectorize(vi)
-
-    # Schedule for wmma store
-    s[CS].compute_at(s[C], block_j)
-    _, _, bb, oo = CS.op.axis
-    s[CS].storage_align(bb, CS_align - 1, CS_align)
-    bb, bbi = s[CS].split(bb, factor=wmma_m)
-    oo, ooi = s[CS].split(oo, factor=wmma_n)
-    bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
-    oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
-    s[CS].reorder(bb, oo, bbii, ooii, bbi, ooi)
-
-    # Schedule for wmma computation
-    s[CF].compute_at(s[CS], oo)
-    _, _, warp_i, warp_j = CF.op.axis
-    warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
-    warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
-    (k,) = CF.op.reduce_axis
-    k, _k = s[CF].split(k, factor=wmma_k)
-    ko, ki = s[CF].split(k, factor=chunk)
-    s[CF].reorder(ko, ki, warp_i, warp_j, _ii, _jj, _k)
-
-    # Schedule for  wmma_matrix_a load
-    s[AF].compute_at(s[CF], ki)
-    _, _, b, i = AF.op.axis
-    b, b_ii = s[AF].split(b, factor=wmma_m)
-    i, i_jj = s[AF].split(i, factor=wmma_k)
-    s[AF].reorder(b, i, b_ii, i_jj)
-
-    # Schedule for  wmma_matrix_b load
-    s[BF].compute_at(s[CF], ki)
-    _, _, i, o = BF.op.axis
-    o, o_ii = s[BF].split(o, factor=wmma_n)
-    i, i_ii = s[BF].split(i, factor=wmma_k)
-    s[BF].reorder(i, o, i_ii, o_ii)
-
-    # Schedule for A's(B's) shared memory load
-    def shared_schedule(stage, strides):
-        s[stage].compute_at(s[CF], ko)
-        _, _, xo, yo = stage.op.axis
-        s[stage].storage_align(xo, strides - 1, strides)
-        t = s[stage].fuse(xo, yo)
-        t, vi = s[stage].split(t, factor=vec)
-        t, tx = s[stage].split(t, factor=warp_size)
-        t, ty = s[stage].split(t, factor=block_row_warps)
-        _, tz = s[stage].split(t, factor=block_col_warps)
-        s[stage].bind(ty, thread_y)
-        s[stage].bind(tz, thread_z)
-        s[stage].bind(tx, thread_x)
-        s[stage].vectorize(vi)
-
-    shared_schedule(AS, AS_align)
-    shared_schedule(BS, BS_align)
-
-    shape = (wmma_m, wmma_n, wmma_k)
-    in_dtype = "float16"
-    AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype)
-    BL_gemm = te.placeholder((wmma_k, wmma_n), name="BL_gemm", dtype=in_dtype)
-    k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm")
-    CL_compute = te.compute(
-        (wmma_m, wmma_n),
-        lambda ii, jj: te.sum(
-            AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[k_gemm, jj].astype(out_dtype),
-            axis=k_gemm,
-        ),
-        name="CL_compute",
-    )
-
-    # Lower the computation loops down to TensorCore hardware intrinsics
-    # by mapping the tensorcore to tensor intrinsics
-    s[AF].tensorize(
-        b_ii,
-        intrin_wmma_load_matrix_A(
-            AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), "float16"
-        ),
-    )
-    s[BF].tensorize(
-        i_ii,
-        intrin_wmma_load_matrix_W(
-            BF_stride, BS_stride, shape, "row_major", (wmma_k, wmma_n), (wmma_k, wmma_n), "float16"
-        ),
-    )
-    s[CF].tensorize(
-        _ii, intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape)
-    )
-    s[CS].tensorize(
-        bbi,
-        intrin_wmma_store_matrix(
-            CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n)
-        ),
-    )
-
-
-def schedule_bgemm_direct(cfg, s, bgemm, data_pack, kernel_pack):
-    """Schedule for bgemm direct"""
-    b1, b2, y, x = s[bgemm].op.axis
-    rc = s[bgemm].op.reduce_axis[0]
-    alpha = get_const_int(b1.dom.extent)
-
-    # Create tuning space
-    cfg.define_split(
-        "tile_b", cfg.axis(alpha * alpha), num_outputs=4, filter=lambda x: x.size[-3:] == [1, 1, 1]
-    )
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_knob("offset_bgemm", [0, 1, 2, 4, 8])
-    cfg.define_knob("vector_bgemm", [1, 2, 4, 8])
-    offset_bgemm = cfg["offset_bgemm"].val
-    vector_bgemm = cfg["vector_bgemm"].val
-
-    C = bgemm
-    A0, B0 = kernel_pack, data_pack
-
-    # Designate the memory hierarchy
-    OL = s.cache_write(C, "local")
-    AA = s.cache_read(A0, "shared", [OL])
-    BB = s.cache_read(B0, "shared", [OL])
-
-    # Tile and bind spatial axes
-    b = s[bgemm].fuse(b1, b2)
-    bgemm_scope, b = s[bgemm].split(b, nparts=1)
-    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, C, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].bind(bz, te.thread_axis("blockIdx.z"))
-    s[C].bind(by, te.thread_axis("blockIdx.y"))
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(vz, te.thread_axis("vthread"))
-    s[C].bind(vy, te.thread_axis("vthread"))
-    s[C].bind(vx, te.thread_axis("vthread"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
-
-    # Tile reduction axes
-    s[OL].compute_at(s[C], tx)
-    b1, b2, y, x = s[OL].op.axis
-    b = s[OL].fuse(b1, b2)
-    (rc,) = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    s[OL].reorder(rco, b, y, x, rci)
-
-    s[AA].compute_at(s[OL], rco)
-    _, _, k, n = s[AA].op.axis
-    AA_align = offset_bgemm + cfg["tile_x"].size[1] * cfg["tile_x"].size[2] * cfg["tile_x"].size[3]
-    s[AA].storage_align(k, AA_align - 1, AA_align)
-
-    s[BB].compute_at(s[OL], rco)
-    _, _, m, k = s[BB].op.axis
-    BB_align = offset_bgemm + cfg["tile_rc"].size[1]
-    s[BB].storage_align(m, BB_align - 1, BB_align)
-
-    # Schedule for A and B shared memory load
-    for load in [AA, BB]:
-        fused = s[load].fuse(*list(s[load].op.axis))
-        fused, ti = s[load].split(fused, factor=vector_bgemm)
-        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
-        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
-        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-        s[load].vectorize(ti)
-
-
-def nhwc_winograd_cuda(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, use_tensorcore, pre_computed
-):
-    """Compute declaration for winograd"""
-    tile_size = _infer_tile_size(data, kernel)
-    N, H, W, CI = get_const_tuple(data.shape)
-
-    if isinstance(N, tvm.tir.Any):
-        N = tvm.te.size_var("n")
-
-    if not isinstance(H, int) or not isinstance(W, int):
-        raise RuntimeError(
-            "cuda winograd nhwc conv2d doesn't support dynamic \
-                           input height or width."
-        )
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
-
-    if not pre_computed:  # Kernel tensor is raw tensor, do strict check
-        if dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (dilation_h, dilation_w, 1, 1))
-        KH, KW, CI, CO = get_const_tuple(kernel.shape)
-        alpha = KW + tile_size - 1
-        assert HSTR == 1 and WSTR == 1 and KH == KW
-    else:
-        # Kernel tensor is pre-transfomred. This op is created by conv2d_alter_op.
-        # Dilation is not supported
-        alpha, _, CI, CO = get_const_tuple(kernel.shape)
-        KH = KW = alpha + 1 - tile_size
-        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
-
-    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
-    data_pad = nn.pad(
-        data,
-        (0, pt, pl, 0),
-        (0, pb, pr, 0),
-        name="data_pad",
-        attrs={"schedule_rule": "None"},
-    )
-
-    r = KW
-    m = tile_size
-    H = (H + pt + pb - KH) // HSTR + 1
-    W = (W + pl + pr - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW if isinstance(N, int) else nH * nW
-
-    # Determine whether the shape is available with tensorcore
-    shape_judge = (
-        (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
-        or (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
-        or (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
-    )
-
-    if shape_judge and use_tensorcore:
-        trans_type = "float16"
-    else:
-        trans_type = data.dtype
-
-    # Compute transform matrix
-    A, _, _ = winograd_transform_matrices(m, r, out_dtype)
-    _, B, G = winograd_transform_matrices(m, r, data.dtype)
-
-    # Transform kernel
-    if not pre_computed:
-        # Check if we are currently tuning, if so we want to avoid counting
-        # prepacking in time costs. Just use a placeholder with the packed shape instead.
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kernel_pack = te.placeholder(
-                (alpha, alpha, CI, CO), dtype=kernel.dtype, name="kernel_pack"
-            )
-        else:
-            r_kh = te.reduce_axis((0, KH), name="r_kh")
-            r_kw = te.reduce_axis((0, KW), name="r_kw")
-            kernel_pack = te.compute(
-                (alpha, alpha, CI, CO),
-                lambda eps, nu, ci, co: te.sum(
-                    (kernel[r_kh][r_kw][ci][co]) * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
-                ),
-                name="kernel_pack",
-            )
-    else:
-        kernel_pack = kernel
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    # Pack input tile
-    input_tile = te.compute(
-        (P, CI, alpha, alpha),
-        lambda p, c, eps, nu: data_pad[
-            idxdiv(p, (nH * nW)), idxmod(idxdiv(p, nW), nH) * m + eps, idxmod(p, nW) * m + nu, c
-        ],
-        name="d",
-        attrs={"schedule_rule": "None"},
-    )
-
-    # Transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    data_pack = te.compute(
-        (alpha, alpha, P, CI),
-        lambda eps, nu, p, ci: te.sum(
-            input_tile[p][ci][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="data_pack",
-    )
-
-    # Convert data type of input feature maps and weights for tensorcore
-    Transdata = te.compute(
-        data_pack.shape, lambda eps, nu, p, ci: data_pack[eps, nu, p, ci].astype(trans_type)
-    )
-    TransFilter = te.compute(
-        kernel_pack.shape, lambda eps, nu, ci, co: kernel_pack[eps, nu, ci, co].astype(trans_type)
-    )
-
-    # Do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    bgemm = te.compute(
-        (alpha, alpha, P, CO),
-        lambda eps, nu, p, co: te.sum(
-            (Transdata[eps][nu][p][ci]).astype(out_dtype)
-            * (TransFilter[eps][nu][ci][co]).astype(out_dtype),
-            axis=[ci],
-        ),
-        name="bgemm",
-    )
-
-    # Inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    inverse = te.compute(
-        (P, CO, m, m),
-        lambda p, co, vh, vw: te.sum(
-            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
-        ),
-        name="inverse",
-    )
-
-    # Output
-    output = te.compute(
-        (N, H, W, CO),
-        lambda n, h, w, co: inverse[
-            n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), co, idxmod(h, m), idxmod(w, m)
-        ],
-        name="output",
-        tag="conv2d_nhwc_winograd",
-    )
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)
-    return output
-
-
-def data_weight_transform(s, data_trans, input_tile, thread_num_trans, offset_trans, trans_tag):
-    """Schedule for data or kernel transform"""
-    kernel_align = thread_num_trans + offset_trans
-    indata_s = s.cache_read(input_tile, "shared", [data_trans])
-    data_l = s.cache_write(data_trans, "local")
-    # Schedule for data or kernel transform
-    eps, nu, p, c = s[data_trans].op.axis
-
-    block_x, thread_x = s[data_trans].split(c, thread_num_trans)
-    block_x = s[data_trans].fuse(p, block_x)
-    s[data_trans].reorder(block_x, thread_x, eps, nu)
-    s[data_trans].bind(thread_x, te.thread_axis("threadIdx.x"))
-    s[data_trans].bind(block_x, te.thread_axis("blockIdx.x"))
-
-    s[data_l].compute_at(s[data_trans], thread_x)
-    eps_l, nu_l, p_l, c_l = s[data_l].op.axis
-    r_a, r_b = s[data_l].op.reduce_axis
-    block_x_l, thread_x_l = s[data_l].split(c_l, thread_num_trans)
-    block_x_l = s[data_l].fuse(p_l, block_x_l)
-
-    s[data_l].reorder(block_x_l, thread_x_l, eps_l, nu_l, r_a, r_b)
-
-    for axis in [eps_l, nu_l, r_a, r_b]:
-        s[data_l].unroll(axis)
-
-    # Schedule for share memory load
-    s[indata_s].compute_at(s[data_l], block_x_l)
-    if trans_tag == "data":
-        p_is, c_is, eps_is, nu_is = s[indata_s].op.axis
-        data_align = (
-            get_const_int(eps_is.dom.extent) * get_const_int(nu_is.dom.extent) + offset_trans
-        )
-        s[indata_s].storage_align(c_is, data_align - 1, data_align)
-        block_x_is, thread_x_is = s[indata_s].split(c_is, thread_num_trans)
-        s[indata_s].bind(thread_x_is, te.thread_axis("threadIdx.x"))
-    else:
-        eps_is, nu_is, ci_is, co_is = s[indata_s].op.axis
-        s[indata_s].storage_align(nu_is, kernel_align - 1, kernel_align)
-        block_x_is, thread_x_is = s[indata_s].split(co_is, thread_num_trans)
-        s[indata_s].reorder(ci_is, block_x_is, eps_is, nu_is, thread_x_is)
-        s[indata_s].bind(thread_x_is, te.thread_axis("threadIdx.x"))
-
-
-def schedule_nhwc_winograd_cuda(cfg, s, output, use_tensorcore, pre_computed):
-    """Schedule winograd template"""
-    # Get stages
-    inverse = s[output].op.input_tensors[0]
-    bgemm, A = s[inverse].op.input_tensors
-    Transdata, TransFilter = s[bgemm].op.input_tensors
-    data_pack = s[Transdata].op.input_tensors[0]
-    kernel_pack = s[TransFilter].op.input_tensors[0]
-    s[Transdata].compute_inline()
-    s[TransFilter].compute_inline()
-
-    input_tile, B = s[data_pack].op.input_tensors
-    pad_data = s[input_tile].op.input_tensors[0]
-
-    # Define the stride of intrin functions
-    cfg.define_knob("thread_num_inverse", [1, 32, 64, 128, 256])
-    cfg.define_knob("thread_num_data", [1, 32, 64, 128, 256])
-    cfg.define_knob("thread_num_kernel", [1, 32, 64, 128, 256])
-    cfg.define_knob("offset_inverse", [0, 2, 4])
-    cfg.define_knob("offset_data", [0, 1, 2, 4])
-    cfg.define_knob("offset_kernel", [0, 1, 2, 4])
-    cfg.define_knob("inverse_in_vector", [1, 2, 4])
-
-    thread_num_data = cfg["thread_num_data"].val
-    thread_num_kernel = cfg["thread_num_kernel"].val
-    thread_num_inverse = cfg["thread_num_inverse"].val
-    offset_data = cfg["offset_data"].val
-    offset_kernel = cfg["offset_kernel"].val
-    offset_inverse = cfg["offset_inverse"].val
-    inverse_in_vector = cfg["inverse_in_vector"].val
-
-    # Data transform
-    s[B].compute_inline()
-    data_weight_transform(s, data_pack, input_tile, thread_num_data, offset_data, trans_tag="data")
-    s[input_tile].compute_inline()
-    s[pad_data].compute_inline()
-
-    # Kernel transform
-    if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning:
-        kernel, G = s[kernel_pack].op.input_tensors
-        s[G].compute_inline()
-        data_weight_transform(
-            s, kernel_pack, kernel, thread_num_kernel, offset_kernel, trans_tag="kernel"
-        )
-    else:
-        kernel = kernel_pack
-
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    b1, b2, y, x = s[bgemm].op.axis
-    alpha = get_const_int(b1.dom.extent)
-    _, _, P, CI = get_const_tuple(Transdata.shape)
-    _, _, _, CO = get_const_tuple(TransFilter.shape)
-
-    # Determine whether the shape is available with tensorcore
-    shape_judge = (
-        (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
-        or (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
-        or (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
-    )
-
-    if shape_judge and use_tensorcore:
-        schedule_bgemm_tensorcore(cfg, s, bgemm, Transdata, TransFilter)
-    else:
-        schedule_bgemm_direct(cfg, s, bgemm, Transdata, TransFilter)
-
-    # Schedule inverse, output and fusion
-    if output.op in s.outputs:
-        OL = None
-    else:
-        OL = output
-        s[OL].set_scope("local")
-        output = s.outputs[0]
-
-    s[A].compute_inline()
-    inverse_s = s.cache_read(bgemm, "shared", [inverse])
-
-    m = alpha - 3 + 1
-    offset_inverse_in = offset_inverse
-    vector_width_inverse_in = inverse_in_vector
-
-    # Schedule for output
-    n, h, w, co = s[output].op.axis
-    ho, wo, hi, wi = s[output].tile(h, w, m, m)
-    s[output].reorder(n, ho, wo, co, hi, wi)
-    fused = s[output].fuse(n, ho, wo)
-
-    block_x_s, thread_x_s = s[output].split(co, thread_num_inverse)
-    block_x_s = s[output].fuse(fused, block_x_s)
-    s[output].reorder(block_x_s, thread_x_s, hi, wi)
-
-    if OL is not None:
-        s[OL].compute_inline()
-
-    # Schedule for inverse
-    s[inverse].compute_at(s[output], thread_x_s)
-    p_inv, co_inv, eps_inv, nu_inv = s[inverse].op.axis
-    block_x_inv, thread_x_inv = s[inverse].split(co_inv, thread_num_inverse)
-    r_a, r_b = s[inverse].op.reduce_axis
-    for axis in [eps_inv, nu_inv, r_a, r_b]:
-        s[inverse].unroll(axis)
-
-    # Schedule for share memory load
-    s[inverse_s].compute_at(s[output], block_x_s)
-    eps_inv_s, nu_inv_s, p_inv_s, co_inv_s = s[inverse_s].op.axis
-    inverse_in_align = offset_inverse_in + thread_num_inverse
-    s[inverse_s].storage_align(p_inv_s, inverse_in_align - 1, inverse_in_align)
-    block_x_inv_s, thread_x_inv_s = s[inverse_s].split(co_inv_s, thread_num_inverse)
-    block_x_inv_s = s[inverse_s].fuse(p_inv_s, block_x_inv_s)
-    s[inverse_s].reorder(block_x_inv_s, eps_inv_s, nu_inv_s, thread_x_inv_s)
-    t = s[inverse_s].fuse(eps_inv_s, nu_inv_s, thread_x_inv_s)
-    t, ti = s[inverse_s].split(t, factor=vector_width_inverse_in)
-    t, tx = s[inverse_s].split(t, factor=thread_num_inverse)
-    s[inverse_s].bind(tx, te.thread_axis("threadIdx.x"))
-    s[inverse_s].vectorize(ti)
-
-    s[output].bind(thread_x_s, te.thread_axis("threadIdx.x"))
-    s[output].bind(block_x_s, te.thread_axis("blockIdx.x"))
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_direct.cuda")
-def conv2d_nhwc_winograd_direct(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with winograd for NHWC layout"""
-    return nhwc_winograd_cuda(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        use_tensorcore=False,
-        pre_computed=False,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_direct.cuda")
-def schedule_conv2d_nhwc_winograd_direct(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nhwc_winograd" in op.tag:
-            schedule_nhwc_winograd_cuda(
-                cfg, s, op.output(0), use_tensorcore=False, pre_computed=False
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_tensorcore.cuda")
-def conv2d_nhwc_winograd_tensorcore(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with winograd for NHWC layout"""
-    return nhwc_winograd_cuda(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        use_tensorcore=True,
-        pre_computed=False,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_tensorcore.cuda")
-def schedule_conv2d_nhwc_winograd_tensorcore(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nhwc_winograd" in op.tag:
-            schedule_nhwc_winograd_cuda(
-                cfg, s, op.output(0), use_tensorcore=True, pre_computed=False
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
-def conv2d_nhwc_winograd_direct_without_weight_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    """Compute conv2d with winograd for NHWC layout"""
-    return nhwc_winograd_cuda(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        use_tensorcore=False,
-        pre_computed=True,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
-def schedule_conv2d_nhwc_winograd_direct_without_weight_transform(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nhwc_winograd" in op.tag:
-            schedule_nhwc_winograd_cuda(
-                cfg, s, op.output(0), use_tensorcore=False, pre_computed=True
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
-def conv2d_nhwc_winograd_tensorcore_without_weight_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    """Compute conv2d with winograd for NHWC layout"""
-    return nhwc_winograd_cuda(
-        cfg,
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        use_tensorcore=True,
-        pre_computed=True,
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
-def schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nhwc_winograd" in op.tag:
-            schedule_nhwc_winograd_cuda(
-                cfg, s, op.output(0), use_tensorcore=True, pre_computed=True
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/conv2d_transpose.py b/python/tvm/topi/cuda/conv2d_transpose.py
deleted file mode 100644
index 006b67a5515e..000000000000
--- a/python/tvm/topi/cuda/conv2d_transpose.py
+++ /dev/null
@@ -1,320 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Conv2d transpose template for cuda backend"""
-
-import tvm
-from tvm import te
-from tvm.contrib import cudnn
-from tvm import autotvm
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-from .. import nn
-from ..utils import get_const_tuple, traverse_inline
-
-
-@autotvm.register_topi_compute("conv2d_transpose_nchw.cuda")
-def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype, output_padding, groups=1):
-    """Transposed 2D convolution nchw forward operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-    Input : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-    Filter : tvm.te.Tensor
-        4-D with shape [in_channel, num_filter, filter_height, filter_width]
-    strides : tuple of two ints
-        The spatial stride along height and width
-    padding : int or str
-        Padding size, or ['VALID', 'SAME']
-    out_dtype: str
-        The output type. This is used in mixed precision
-    output_padding : tuple of two ints
-        Used to disambiguate output shape.
-    groups : int
-        number of groups
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
-    _, out_channels, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-    stride_height, stride_width = stride
-    outpad_height, outpad_width = output_padding
-    assert outpad_height < stride_height and outpad_width < stride_width
-    assert (
-        inp_channels % groups == 0
-    ), f"input channels {inp_channels} must divide group size {groups}"
-    cfg.stride = stride
-    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(
-        padding, (kernel_height, kernel_width)
-    )
-
-    out_width = (inp_width - 1) * stride_width + kernel_width - pad_left - pad_right + outpad_width
-    pad_left = kernel_width - 1 - pad_left
-    pad_right = kernel_width - 1 - pad_right + outpad_width
-    dilated_width = stride_width * (inp_width - 1) + 1
-
-    out_height = (
-        (inp_height - 1) * stride_height + kernel_height - pad_top - pad_bottom + outpad_height
-    )
-    pad_top = kernel_height - 1 - pad_top
-    pad_bottom = kernel_height - 1 - pad_bottom + outpad_height
-    dilated_height = stride_height * (inp_height - 1) + 1
-
-    # compute pad
-    data = te.compute(
-        (
-            batch,
-            inp_channels,
-            pad_top + dilated_height + pad_bottom,
-            pad_left + dilated_width + pad_right,
-        ),
-        lambda n, c, y, x: tvm.tir.if_then_else(
-            tvm.tir.all(
-                x >= pad_left,
-                x < pad_left + dilated_width,
-                tvm.tir.indexmod(x - pad_left, stride_width).equal(0),
-                y >= pad_top,
-                y < pad_top + dilated_height,
-                tvm.tir.indexmod(y - pad_top, stride_height).equal(0),
-            ),
-            data[
-                n,
-                c,
-                tvm.tir.indexdiv(y - pad_top, stride_height),
-                tvm.tir.indexdiv(x - pad_left, stride_width),
-            ],
-            tvm.tir.const(0.0, data.dtype),
-        ),
-        name="data_pad",
-    )
-
-    # compute transposed conv
-    dc = te.reduce_axis((0, inp_channels // groups), name="dc")
-    dh = te.reduce_axis((0, kernel_height), name="dh")
-    dw = te.reduce_axis((0, kernel_width), name="dw")
-    data_out = te.compute(
-        (batch, out_channels * groups, out_height, out_width),
-        lambda b, c, h, w: te.sum(
-            data[b, c // out_channels * (inp_channels // groups) + dc, h + dh, w + dw].astype(
-                out_dtype
-            )
-            * kernel[
-                c // out_channels * (inp_channels // groups) + dc,
-                c % out_channels,
-                kernel_height - 1 - dh,
-                kernel_width - 1 - dw,
-            ].astype(out_dtype),
-            axis=[dc, dh, dw],
-        ),
-        tag="conv2d_transpose_nchw",
-    )
-
-    return data_out
-
-
-@autotvm.register_topi_schedule("conv2d_transpose_nchw.cuda")
-def schedule_conv2d_transpose_nchw(cfg, outs):
-    """TOPI Schedule callback for conv2d transpose operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The parameters for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d transpose
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d transpose.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _fallback_schedule(N, F, Y, X):
-        # pylint: disable=unused-argument
-        # split N (batch dimension)
-        if N > 1:
-            cfg["tile_n"] = SplitEntity([-1, 1, 1, 4])
-        else:
-            cfg["tile_n"] = SplitEntity([1, 1, 1, 1])
-        # split F (output channel dimension)
-        if F > 1:
-            cfg["tile_f"] = SplitEntity([-1, 1, 4, 1])
-        # split Y (height dimension)
-        y_split_factor = 1
-        for candidate in range(5, 17):
-            if Y % candidate == 0:
-                y_split_factor = candidate
-                break
-        cfg["tile_y"] = SplitEntity([-1, 1, 1, y_split_factor])
-        # split X (width dimension)
-        x_split_factor = 1
-        for candidate in range(5, 17):
-            if X % candidate == 0:
-                x_split_factor = candidate
-                break
-        cfg["tile_x"] = SplitEntity([-1, x_split_factor, 1, 1])
-        # split RC (input channel dimension, which is a reduction axis)
-        cfg["tile_rc"] = SplitEntity([-1, 1, 16])
-        # other configurations
-        cfg["fuse_yx"] = OtherOptionEntity(False)
-        cfg["unroll_explicit"] = OtherOptionEntity(True)
-        cfg["auto_unroll_max_step"] = OtherOptionEntity(1500)
-
-    def _callback(op):
-        if op.tag == "conv2d_transpose_nchw":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-
-            ##### space definition begin #####
-            n, f, y, x = s[conv].op.axis
-            rc = s[conv].op.reduce_axis[0]
-            # TODO(@kevinthesun): Support tuning/optimization for dynamic shape.
-            bs = pad_data.shape[0]
-            n_tuning_axis = n if isinstance(bs, tvm.tir.IntImm) else 1
-            cfg.define_split("tile_n", cfg.axis(n_tuning_axis), num_outputs=4)
-            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-            cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
-            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
-            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])
-
-            target = tvm.target.Target.current()
-            if target.kind.name in ["nvptx", "rocm"]:
-                cfg.define_knob("unroll_explicit", [1])
-            else:
-                cfg.define_knob("unroll_explicit", [0, 1])
-
-            if cfg.is_fallback:
-                N, F, Y, X = get_const_tuple(conv.shape)
-                if not isinstance(N, int):
-                    N = 1
-                _fallback_schedule(N, F, Y, X)
-
-            ##### space definition end #####
-
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            if conv.op in s.outputs:
-                output = conv
-                OL = s.cache_write(conv, "local")
-            else:
-                output = s.outputs[0].output(0)
-                s[conv].set_scope("local")
-                OL = conv
-
-            # create cache stage
-            s[pad_data].set_scope("shared")
-            AA = pad_data
-            WW = s.cache_read(kernel, "shared", [OL])
-
-            # tile and bind spatial axes
-            n, f, y, x = s[output].op.axis
-            kernel_scope, n = s[output].split(n, nparts=1)
-            bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi)
-            s[output].bind(bn, te.thread_axis("blockIdx.z"))
-            s[output].bind(bf, te.thread_axis("blockIdx.y"))
-            s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x"))
-            s[output].bind(vn, te.thread_axis("vthread"))
-            s[output].bind(vf, te.thread_axis("vthread"))
-            s[output].bind(vy, te.thread_axis("vthread"))
-            s[output].bind(vx, te.thread_axis("vthread"))
-
-            cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
-
-            if cfg["fuse_yx"].val:
-                s[output].bind(tn, te.thread_axis("threadIdx.z"))
-                s[output].bind(tf, te.thread_axis("threadIdx.y"))
-                tyx = s[output].fuse(ty, tx)
-                s[output].bind(s[output].fuse(ty, tx), te.thread_axis("threadIdx.x"))
-                s[OL].compute_at(s[output], tyx)
-
-                # number of threads
-                n_tz = cfg["tile_n"].size[2]
-                n_ty = cfg["tile_f"].size[2]
-                n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
-            else:
-                s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z"))
-                s[output].bind(ty, te.thread_axis("threadIdx.y"))
-                s[output].bind(tx, te.thread_axis("threadIdx.x"))
-                s[OL].compute_at(s[output], tx)
-
-                # number of threads
-                n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
-                n_ty = cfg["tile_y"].size[2]
-                n_tx = cfg["tile_x"].size[2]
-
-            # tile reduction axes
-            n, f, y, x = s[OL].op.axis
-            rc, ry, rx = s[OL].op.reduce_axis
-            rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
-            s[OL].reorder(rco, rcm, ry, rx, rci, n, f, y, x)
-
-            s[AA].compute_at(s[OL], rx)
-            s[WW].compute_at(s[OL], rx)
-
-            # cooperative fetching
-            for load in [AA, WW]:
-                n, f, y, x = s[load].op.axis
-                fused = s[load].fuse(f, y, x)
-                tz, fused = s[load].split(fused, nparts=n_tz)
-                ty, fused = s[load].split(fused, nparts=n_ty)
-                tx, fused = s[load].split(fused, nparts=n_tx)
-                s[load].bind(tz, te.thread_axis("threadIdx.z"))
-                s[load].bind(ty, te.thread_axis("threadIdx.y"))
-                s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-            s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-            s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
-
-
-def conv2d_transpose_cudnn(
-    x, w, stride, padding, out_dtype, output_padding=(0, 0), layout="NCHW", groups=1
-):
-    """Compute conv2d_tranpose using cudnn dgrad kernel"""
-    tensor_format = 0 if layout == "NCHW" else 1
-    return cudnn.conv_backward_data(
-        x,
-        w,
-        padding,
-        stride,
-        (1, 1),
-        1,
-        tensor_format,
-        out_dtype,
-        groups=groups,
-        output_padding=output_padding,
-    )
diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py
deleted file mode 100644
index eca51c921016..000000000000
--- a/python/tvm/topi/cuda/conv2d_winograd.py
+++ /dev/null
@@ -1,412 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Winograd template for cuda backend"""
-
-import logging
-
-import tvm
-from tvm import autotvm, te
-
-from .. import nn
-from ..nn.conv2d import (
-    _conv2d_winograd_nchw_impl,
-    _conv2d_winograd_nhwc_impl,
-    conv2d_winograd_nchw,
-    conv2d_winograd_nhwc,
-)
-from ..nn.winograd_util import winograd_transform_matrices
-from ..utils import get_const_int, get_const_tuple, traverse_inline
-
-logger = logging.getLogger("conv2d_winograd")
-
-
-def _infer_tile_size(data, kernel, layout="NCHW"):
-    if layout == "NCHW":
-        N, CI, H, W = get_const_tuple(data.shape)
-    else:
-        assert layout == "NHWC"
-        N, H, W, CI = get_const_tuple(data.shape)
-
-    if H % 8 == 0:
-        return 4
-    return 2
-
-
-def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed):
-    """Compute declaration for winograd"""
-    tile_size = _infer_tile_size(data, kernel)
-
-    N, CI, H, W = get_const_tuple(data.shape)
-
-    if isinstance(N, tvm.tir.Any):
-        N = tvm.te.size_var("n")
-
-    if not isinstance(H, int) or not isinstance(W, int):
-        raise RuntimeError(
-            "cuda winograd conv2d doesn't support dynamic input\
-                           height or width."
-        )
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
-
-    if not pre_computed:  # kernel tensor is raw tensor, do strict check
-        if dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
-        CO, CI, KH, KW = get_const_tuple(kernel.shape)
-        alpha = KW + tile_size - 1
-        assert HSTR == 1 and WSTR == 1 and KH == KW
-    else:
-        # kernel tensor is pre-transfomred. this op is created by alter op layout.
-        # dilation is not supported
-        alpha, _, CI, CO = get_const_tuple(kernel.shape)
-        KH = KW = alpha + 1 - tile_size
-        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
-
-    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
-    data_pad = nn.pad(
-        data,
-        (0, 0, pt, pl),
-        (0, 0, pb, pr),
-        name="data_pad",
-    )
-
-    r = KW
-    m = tile_size
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
-
-    H = (H + pt + pb - KH) // HSTR + 1
-    W = (W + pl + pr - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-
-    P = N * nH * nW if isinstance(N, int) else nH * nW
-
-    # transform kernel
-    if not pre_computed:
-        r_kh = te.reduce_axis((0, KH), name="r_kh")
-        r_kw = te.reduce_axis((0, KW), name="r_kw")
-        kernel_pack = te.compute(
-            (alpha, alpha, CI, CO),
-            lambda eps, nu, ci, co: te.sum(
-                kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
-            ),
-            name="kernel_pack",
-        )
-    else:
-        kernel_pack = kernel
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-    # pack input tile
-    input_tile = te.compute(
-        (CI, P, alpha, alpha),
-        lambda c, p, eps, nu: data_pad[idxdiv(p, (nH * nW))][c][
-            idxmod(idxdiv(p, nW), nH) * m + eps
-        ][idxmod(p, nW) * m + nu],
-        name="d",
-    )
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_a")
-    data_pack = te.compute(
-        (alpha, alpha, CI, P),
-        lambda eps, nu, ci, p: te.sum(
-            input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="data_pack",
-    )
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    bgemm = te.compute(
-        (alpha, alpha, CO, P),
-        lambda eps, nu, co, p: te.sum(
-            kernel_pack[eps][nu][ci][co] * data_pack[eps][nu][ci][p], axis=[ci]
-        ),
-        name="bgemm",
-    )
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_a")
-    inverse = te.compute(
-        (CO, P, m, m),
-        lambda co, p, vh, vw: te.sum(
-            bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
-        ),
-        name="inverse",
-    )
-
-    # output
-    output = te.compute(
-        (N, CO, H, W),
-        lambda n, co, h, w: inverse[
-            co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), idxmod(h, m), idxmod(w, m)
-        ],
-        name="output",
-        tag="conv2d_nchw_winograd",
-    )
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)
-
-    return output
-
-
-def schedule_winograd_cuda(cfg, s, output, pre_computed):
-    """Schedule winograd template"""
-    # get stages
-    inverse = s[output].op.input_tensors[0]
-    bgemm, A = s[inverse].op.input_tensors
-    kernel_pack, data_pack = s[bgemm].op.input_tensors
-    input_tile, B = s[data_pack].op.input_tensors
-    pad_data = s[input_tile].op.input_tensors[0]
-
-    # data transform
-    s[B].compute_inline()
-
-    data_l = s.cache_write(data_pack, "local")
-    eps, nu, c, p = s[data_l].op.axis
-    r_a, r_b = s[data_l].op.reduce_axis
-    for axis in [eps, nu, r_a, r_b]:
-        s[data_l].unroll(axis)
-
-    eps, nu, c, p = s[data_pack].op.axis
-    p, pi = s[data_pack].split(p, 1)
-    fused = s[data_pack].fuse(c, p)
-    bb, tt = s[data_pack].split(fused, 128)
-    s[data_pack].reorder(bb, tt, pi, eps, nu)
-    s[data_pack].bind(bb, te.thread_axis("blockIdx.x"))
-    s[data_pack].bind(tt, te.thread_axis("threadIdx.x"))
-
-    s[data_l].compute_at(s[data_pack], pi)
-    s[input_tile].compute_at(s[data_pack], pi)
-    s[pad_data].compute_inline()
-
-    # transform kernel
-    if not pre_computed:
-        kernel, G = s[kernel_pack].op.input_tensors
-        eps, nu, ci, co = s[kernel_pack].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # skip this part during tuning to make recrods accurate
-            # this part will be pre-computed during pre-compute optimization pass
-            s[G].pragma(s[G].op.axis[0], "debug_skip_region")
-            s[kernel_pack].pragma(eps, "debug_skip_region")
-        else:
-            s[G].compute_inline()
-            r_a, r_b = s[kernel_pack].op.reduce_axis
-            for axis in [eps, nu, r_a, r_b]:
-                s[kernel_pack].unroll(axis)
-
-            fused = s[kernel_pack].fuse(ci, co)
-            bb, tt = s[kernel_pack].split(fused, 128)
-            s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b)
-            s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x"))
-            s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x"))
-    else:
-        kernel = kernel_pack
-
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    ##### space definition begin #####
-    b1, b2, y, x = s[bgemm].op.axis
-    rc = s[bgemm].op.reduce_axis[0]
-    alpha = get_const_int(b1.dom.extent)
-
-    cfg.define_split(
-        "tile_b", cfg.axis(alpha * alpha), num_outputs=4, filter=lambda x: x.size[-3:] == [1, 1, 1]
-    )
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 128, 1500])
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-    ##### space definition end #####
-
-    # batch gemm
-    C = bgemm
-    A0, B0 = kernel_pack, data_pack
-
-    OL = s.cache_write(C, "local")
-    AA = s.cache_read(A0, "shared", [OL])
-    BB = s.cache_read(B0, "shared", [OL])
-
-    b = s[bgemm].fuse(b1, b2)
-
-    # tile and bind spatial axes
-    bgemm_scope, b = s[bgemm].split(b, nparts=1)
-    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, C, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].bind(bz, te.thread_axis("blockIdx.z"))
-    s[C].bind(by, te.thread_axis("blockIdx.y"))
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(vz, te.thread_axis("vthread"))
-    s[C].bind(vy, te.thread_axis("vthread"))
-    s[C].bind(vx, te.thread_axis("vthread"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
-
-    # tile reduction axes
-    s[OL].compute_at(s[C], tx)
-    b1, b2, y, x = s[OL].op.axis
-    b = s[OL].fuse(b1, b2)
-    (rc,) = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    s[OL].reorder(rco, rci, b, y, x)
-
-    s[AA].compute_at(s[OL], rco)
-    s[BB].compute_at(s[OL], rco)
-
-    # cooperative fetching
-    for load in [AA, BB]:
-        fused = s[load].fuse(*list(s[load].op.axis))
-        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
-        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
-        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    s[C].pragma(bgemm_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[C].pragma(bgemm_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    # schedule inverse, output and fusion
-    if output.op in s.outputs:
-        OL = None
-    else:
-        OL = output
-        s[OL].set_scope("local")
-        output = s.outputs[0]
-
-    m = alpha - 3 + 1
-    n, co, h, w = s[output].op.axis
-    ho, wo, hi, wi = s[output].tile(h, w, m, m)
-    inverse_scope, n = s[output].split(n, nparts=1)
-
-    fused = s[output].fuse(n, co, ho, wo)
-    bb, tt = s[output].split(fused, 128)
-
-    s[output].bind(bb, te.thread_axis("blockIdx.x"))
-    s[output].bind(tt, te.thread_axis("threadIdx.x"))
-
-    if OL is not None:
-        s[OL].compute_at(s[output], tt)
-
-    s[A].compute_inline()
-    co, p, vh, vw = s[inverse].op.axis
-    r_a, r_b = s[inverse].op.reduce_axis
-    for axis in [vh, vw, r_a, r_b]:
-        s[inverse].unroll(axis)
-    s[inverse].compute_at(s[output], tt)
-
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd.cuda")
-def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    return winograd_cuda(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd.cuda")
-def schedule_conv2d_nchw_winograd(cfg, outs):
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nchw_winograd" in op.tag:
-            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.cuda")
-def conv2d_nchw_winograd_without_weight_transform(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    return winograd_cuda(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.cuda")
-def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_nchw_winograd" in op.tag:
-            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@conv2d_winograd_nhwc.register(["cuda", "gpu"])
-def conv2d_winograd_nhwc_cuda(
-    data,
-    weight,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    pre_computed=False,
-    auto_scheduler_rewritten_layout="",
-    meta_schedule_original_shape=None,
-):
-    """Conv2D Winograd in NHWC layout.
-    This is a clean version to be used by the auto-scheduler for both CPU and GPU.
-    """
-    tile_size = _infer_tile_size(data, weight, layout="NHWC")
-    return _conv2d_winograd_nhwc_impl(
-        data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed
-    )
-
-
-@conv2d_winograd_nchw.register(["cuda", "gpu"])
-def conv2d_winograd_nchw_cuda(
-    data,
-    weight,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    pre_computed=False,
-    auto_scheduler_rewritten_layout="",
-    meta_schedule_original_shape=None,
-):
-    """Conv2D Winograd in NCHW layout.
-    This is a clean version to be used by the auto-scheduler for both CPU and GPU.
-    """
-    tile_size = _infer_tile_size(data, weight, layout="NCHW")
-    return _conv2d_winograd_nchw_impl(
-        data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed
-    )
diff --git a/python/tvm/topi/cuda/conv3d.py b/python/tvm/topi/cuda/conv3d.py
deleted file mode 100644
index 7a5e8ce69cb4..000000000000
--- a/python/tvm/topi/cuda/conv3d.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Compute definition for conv3d with cuda backend"""
-from tvm import te
-from tvm import autotvm
-from tvm.contrib import cudnn
-
-from .. import nn, generic
-from ..utils import get_const_tuple, traverse_inline
-from .conv3d_direct import schedule_direct_conv3d_cuda
-
-
-@autotvm.register_topi_compute("conv3d_ncdhw.cuda")
-def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
-    """Conv3D operator in NCDHW layout for cuda backend.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        5-D with shape [batch, in_channel, in_depth, in_height, in_width]
-
-    kernel : tvm.te.Tensor
-        5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width]
-
-    strides : int or a list/tuple of three ints
-        stride size, or [stride_depth, stride_height, stride_width]
-
-    padding : int or a list/tuple of three ints
-        padding size, or [pad_depth, pad_height, pad_width]
-
-    dilation: int or a list/tuple of three ints
-        dilation size, or [dilation_depth, dilation_height, dilation_width]
-
-    groups: int
-        Number of groups
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
-    """
-    return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, groups, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv3d_ncdhw.cuda")
-def schedule_conv3d_ncdhw(cfg, outs):
-    """TOPI schedule callback of conv3d for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv3d_ncdhw" in op.tag:
-            schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NCDHW", "conv3d_ncdhw.cuda")
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv3d_ndhwc.cuda")
-def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
-    """Conv3d operator in NDHWC layout for cuda backend.
-
-    Parameters
-    ----------
-    Input : tvm.te.Tensor
-        5-D with shape [batch, in_depth, in_height, in_width, in_channel]
-
-    Filter : tvm.te.Tensor
-        5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter]
-
-    stride : int or a list/tuple of three ints
-        Stride size, or [stride_depth, stride_height, stride_width]
-
-    padding : int or str
-        Padding size, or ['VALID', 'SAME']
-
-    dilation: int or a list/tuple of three ints
-        dilation size, or [dilation_depth, dilation_height, dilation_width]
-
-    groups: int
-        Number of groups
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        5-D with shape [batch, out_depth, out_height, out_width, out_channel]
-    """
-    return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, groups, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv3d_ndhwc.cuda")
-def schedule_conv3d_ndhwc(cfg, outs):
-    """TOPI schedule callback of conv3d for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv3d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv3d_ndhwc" in op.tag:
-            schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NDHWC", "conv3d_ndhwc.cuda")
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv3d_cudnn.cuda")
-def conv3d_cudnn(
-    cfg, data, kernel, strides, padding, dilation, groups, layout="NCDHW", out_dtype="float32"
-):
-    """Conv3D operator for cuda backend.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        5-D with shape [batch, in_channel, in_depth, in_height, in_width]
-
-    kernel : tvm.te.Tensor
-        5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width]
-
-    strides : int or a list/tuple of three ints
-        stride size, or [stride_depth, stride_height, stride_width]
-
-    padding : int or a list/tuple of three ints
-        padding size, or [pad_depth, pad_height, pad_width]
-
-    dilation: int or a list/tuple of three ints
-        dilation size, or [dilation_depth, dilation_height, dilation_width]
-
-    layout : str
-        layout of data
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
-    """
-    if layout == "NCDHW":
-        tensor_format = 0  # CUDNN_TENSOR_NCHW
-        N, _, D, H, W = get_const_tuple(data.shape)
-    elif layout == "NDHWC":
-        tensor_format = 1  # CUDNN_TENSOR_NHWC
-        N, D, H, W, _ = get_const_tuple(data.shape)
-    else:
-        raise ValueError(f"Unsupported layout {layout} in cudnn")
-    CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
-
-    assert groups == 1, "conv3d_cudnn does not support groups"
-
-    # handle dilation
-    stride_d, stride_h, stride_w = (
-        (strides, strides, strides) if isinstance(strides, int) else strides
-    )
-    pad_d, pad_h, pad_w = (padding, padding, padding) if isinstance(padding, int) else padding
-    dilation_d, dilation_h, dilation_w = (
-        (dilation, dilation, dilation) if isinstance(dilation, int) else dilation
-    )
-
-    OD = (D + 2 * pad_d - KD) // stride_d + 1
-    OH = (H + 2 * pad_h - KH) // stride_h + 1
-    OW = (W + 2 * pad_w - KW) // stride_w + 1
-
-    if isinstance(N, int):
-        cfg.add_flop(
-            2
-            * N
-            * OD
-            * OH
-            * OW
-            * CO
-            * CI
-            * ((KD - 1) * dilation_d + 1)
-            * ((KH - 1) * dilation_h + 1)
-            * ((KW - 1) * dilation_w + 1)
-        )
-
-    cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT")))
-    if cfg.is_fallback:
-        if cudnn.exists():
-            # Let CUDNN choose the best algo, based on benchmarks run
-            # on the local machine.  In the future, this should be
-            # based on parameters stored in the Target.
-            cfg["algo"] = OtherOptionEntity(-1)
-        else:
-            cfg["algo"] = OtherOptionEntity(0)
-
-    return cudnn.conv_forward(
-        data,
-        kernel,
-        [pad_d, pad_h, pad_w],
-        [stride_d, stride_h, stride_w],
-        [dilation_d, dilation_h, dilation_w],
-        conv_mode=1,
-        tensor_format=tensor_format,
-        algo=cfg["algo"].val,
-        conv_dtype=dtype,
-    )
-
-
-@autotvm.register_topi_schedule("conv3d_cudnn.cuda")
-def schedule_conv3d_cudnn(_, outs):
-    """TOPI schedule callback of conv3d for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d.
-    """
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/cuda/conv3d_alter_op.py b/python/tvm/topi/cuda/conv3d_alter_op.py
deleted file mode 100644
index c7ec7cb21fcf..000000000000
--- a/python/tvm/topi/cuda/conv3d_alter_op.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Conv3D alter op and legalize functions for cuda backend"""
-
-import logging
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-
-from .. import nn
-from ..utils import get_const_tuple
-from .conv3d_winograd import _infer_tile_size
-
-logger = logging.getLogger("topi")
-
-
-@nn.conv3d_alter_layout.register(["cuda", "gpu"])
-def _alter_conv3d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-
-    _, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.conv3d"), attrs, tinfos, out_type, target
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is None:
-        # The best implementation is not an AutoTVM template,
-        # we then assume it's not necessary to alter this op.
-        return None
-    cfg = dispatch_ctx.query(target, workload)
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        return None
-
-    topi_tmpl = workload[0]
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    strides = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    dilation = attrs.get_int_tuple("dilation")
-    groups = attrs.get_int("groups")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data, kernel = tinfos
-    out_dtype = out_type.dtype
-
-    if topi_tmpl == "conv3d_ncdhw_winograd.cuda":
-        if dilation != (1, 1, 1):
-            logger.warning("Does not support weight pre-transform for dilated 3D convolution.")
-            return None
-
-        assert data_layout == "NCDHW" and kernel_layout == "OIDHW"
-        N, CI, D, H, W = get_const_tuple(data.shape)
-        CO, _, KD, KH, KW = get_const_tuple(kernel.shape)
-
-        # Pre-compute weight transformation in winograd
-        tile_size = _infer_tile_size(tinfos[0], tinfos[1])
-
-        weight = relay.nn.contrib_conv3d_winograd_weight_transform(inputs[1], tile_size=tile_size)
-        new_attrs["tile_size"] = tile_size
-        new_attrs["channels"] = CO
-
-        # Store the same config for the altered operators (workload)
-        new_data = data
-        # Check if depth is transformed or not
-        if 2 < KD < 8 and KD == KH:
-            new_weight = te.placeholder(
-                (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CO, CI),
-                dtype=kernel.dtype,
-            )
-        else:
-            new_weight = te.placeholder(
-                (KH + tile_size - 1, KW + tile_size - 1, KD, CO, CI), dtype=kernel.dtype
-            )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_weight, strides, padding, dilation, out_dtype],
-            "conv3d_ncdhw_winograd_without_weight_transform.cuda",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv3d_winograd_without_weight_transform(
-            inputs[0], weight, **new_attrs
-        )
-
-    return None
diff --git a/python/tvm/topi/cuda/conv3d_direct.py b/python/tvm/topi/cuda/conv3d_direct.py
deleted file mode 100644
index 2a8e573621f6..000000000000
--- a/python/tvm/topi/cuda/conv3d_direct.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""The templates for cuda conv3d operators"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import get_const_tuple
-
-
-def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name):
-    """schedule optimized for batch size = 1"""
-
-    ##### space definition begin #####
-    if layout == "NCDHW":
-        n, f, d, y, x = s[conv].op.axis
-    elif layout == "NDHWC":
-        n, d, y, x, f = s[conv].op.axis
-    else:
-        raise ValueError(f"not support this layout {layout} yet")
-    rc, rd, ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_f", f, num_outputs=4)
-    cfg.define_split("tile_d", d, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_split("tile_rd", ry, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-
-    # fallback support
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(target.kind.name, target.model, workload_name)
-        cfg.fallback_with_reference_log(ref_log)
-    ##### space definition end #####
-
-    pad_data, kernel = s[conv].op.input_tensors
-
-    s[pad_data].compute_inline()
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AA = s.cache_read(pad_data, "shared", [OL])
-    WW = s.cache_read(kernel, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, f, d, y, x = s[output].op.axis
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    bd, vd, td, di = cfg["tile_d"].apply(s, output, d)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].reorder(bf, bd, by, bx, vf, vd, vy, vx, tf, td, ty, tx, fi, di, yi, xi)
-
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(s[output].fuse(bd, by), te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vd, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(s[output].fuse(td, tf), te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[OL].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, f, d, y, x = s[OL].op.axis
-    rc, rd, ry, rx = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    rdo, rdi = cfg["tile_rd"].apply(s, OL, rd)
-    ryo, ryi = cfg["tile_ry"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, OL, rx)
-    s[OL].reorder(rco, rdo, ryo, rxo, rci, rdi, ryi, rxi, n, f, d, y, x)
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        n, f, d, y, x = s[load].op.axis
-        fused = s[load].fuse(n, f, d, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
-        td, fused = s[load].split(fused, nparts=cfg["tile_d"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(s[load].fuse(td, ty), te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    N, CO, OD, OH, OW = get_const_tuple(output.shape)
-    _, KD, KH, KW, CI = get_const_tuple(kernel.shape)
-    cfg.add_flop(2 * N * OD * OH * OW * CO * CI * KD * KH * KW)
diff --git a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
deleted file mode 100644
index cf96794dd4f0..000000000000
--- a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-function-args
-# pylint: disable=too-many-statements, unused-argument, too-many-arguments
-"""Tensorcore template for cuda backend"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import get_const_tuple, traverse_inline, simplify
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple3d
-from .tensor_intrin import intrin_wmma_load_matrix_A
-from .tensor_intrin import intrin_wmma_load_matrix_W
-from .tensor_intrin import intrin_wmma_store_matrix
-from .tensor_intrin import intrin_wmma_gemm
-
-
-def ndhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    """Compute declaration for conv3d tensorcore function"""
-    assert isinstance(stride, int) or len(stride) == 3
-    assert isinstance(dilation, int) or len(dilation) == 3
-
-    if isinstance(stride, int):
-        stride_d = stride_h = stride_w = stride
-    else:
-        stride_d, stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_d = dilation_h = dilation_w = dilation
-    else:
-        dilation_d, dilation_h, dilation_w = dilation
-
-    batch, in_depth, in_height, in_width, in_channel = get_const_tuple(Input.shape)
-    kernel_d, kernel_h, kernel_w, _, num_filter = get_const_tuple(Filter.shape)
-    assert (
-        (batch % 16 == 0 and in_channel % 16 == 0 and num_filter % 16 == 0)
-        or (batch % 8 == 0 and in_channel % 16 == 0 and num_filter % 32 == 0)
-        or (batch % 32 == 0 and in_channel % 16 == 0 and num_filter % 8 == 0)
-    ), (
-        "The shape of (batch, in_channel, num_filter) "
-        "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
-    )
-
-    # compute the output shape
-    dilated_kernel_d = (kernel_d - 1) * dilation_d + 1
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
-        padding, (dilated_kernel_d, dilated_kernel_h, dilated_kernel_w)
-    )
-    out_channel = num_filter
-    out_depth = simplify((in_depth - dilated_kernel_d + pad_front + pad_back) // stride_d + 1)
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    pad_before = [0, pad_front, pad_top, pad_left, 0]
-    pad_after = [0, pad_back, pad_down, pad_right, 0]
-    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    rz = te.reduce_axis((0, kernel_d), name="rz")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    # convert data type of input feature maps and weights
-    # TODO: add checking here, datatype casting may cause precision loss
-    TransPaddedInput = te.compute(
-        PaddedInput.shape, lambda n, d, h, w, c: PaddedInput[n, d, h, w, c].astype("float16")
-    )
-    TransFilter = te.compute(
-        Filter.shape, lambda d, h, w, i, o: Filter[d, h, w, i, o].astype("float16")
-    )
-    Output = te.compute(
-        (batch, out_depth, out_height, out_width, out_channel),
-        lambda nn, zz, yy, xx, ff: te.sum(
-            TransPaddedInput[
-                nn,
-                zz * stride_d + rz * dilation_d,
-                yy * stride_h + ry * dilation_h,
-                xx * stride_w + rx * dilation_w,
-                rc,
-            ].astype(out_dtype)
-            * TransFilter[rz, ry, rx, rc, ff].astype(out_dtype),
-            axis=[rz, ry, rx, rc],
-        ),
-        name="Conv3dOutput",
-        tag="conv3d_ndhwc_tensorcore",
-    )
-    return Output
-
-
-def schedule_ndhwc_tensorcore_cuda(cfg, s, Conv):
-    """Schedule tensorcore template"""
-    kd, kh, kw, ic = s[Conv].op.reduce_axis
-    out_dtype = Conv.dtype
-    trans_paddata, kernel = s[Conv].op.input_tensors
-    in_dtype = trans_paddata.dtype
-    batch, _, _, _, _ = get_const_tuple(Conv.shape)
-    _, _, _, _, out_channels = get_const_tuple(kernel.shape)
-    paddata = s[trans_paddata].op.input_tensors
-
-    # inline the pad and dtype transform
-    s[trans_paddata].compute_inline()
-    s[kernel].compute_inline()
-    s[paddata[0]].compute_inline()
-
-    # Designate the memory hierarchy
-    AS = s.cache_read(trans_paddata, "shared", [Conv])
-    WS = s.cache_read(kernel, "shared", [Conv])
-    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
-    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
-    ConvF = s.cache_write(Conv, "wmma.accumulator")
-
-    if Conv.op in s.outputs:
-        output = Conv
-        ConvS = s.cache_read(ConvF, "shared", [Conv])
-        OL = ConvS
-    else:
-        output = s.outputs[0].output(0)
-        s[Conv].set_scope("shared")
-        OL = Conv
-
-    # Schedule for autotvm
-    cfg.define_knob("block_row_warps", [1, 2, 4])
-    cfg.define_knob("block_col_warps", [1, 2, 4])
-    cfg.define_knob("warp_row_tiles", [1, 2, 4])
-    cfg.define_knob("warp_col_tiles", [1, 2, 4])
-    cfg.define_knob("chunk", [1, 2, 4, 8])
-    cfg.define_knob("offset", [0, 8])
-    cfg.define_knob("vector_width", [1, 2, 4, 8])
-
-    if batch % 16 == 0 and out_channels % 16 == 0:
-        cfg.define_knob("wmma_m", [16, 8, 32])
-    elif batch % 8 == 0 and out_channels % 32 == 0:
-        cfg.define_knob("wmma_m", [8, 16, 32])
-    elif batch % 32 == 0 and out_channels % 8 == 0:
-        cfg.define_knob("wmma_m", [32, 16, 8])
-
-    # fallback support
-    target = tvm.target.Target.current()
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(
-            target.kind.name, target.model, "conv3d_ndhwc_tensorcore.cuda"
-        )
-        cfg.fallback_with_reference_log(ref_log)
-
-    block_row_warps = cfg["block_row_warps"].val
-    block_col_warps = cfg["block_col_warps"].val
-    warp_row_tiles = cfg["warp_row_tiles"].val
-    warp_col_tiles = cfg["warp_col_tiles"].val
-    chunk = cfg["chunk"].val
-    offset = cfg["offset"].val
-    wmma_m = cfg["wmma_m"].val
-    vector_width = cfg["vector_width"].val
-
-    wmma_k = 16
-    if wmma_m == 16:
-        wmma_n = 16
-    elif wmma_m == 8:
-        wmma_n = 32
-    elif wmma_m == 32:
-        wmma_n = 8
-
-    warp_size = 32
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    # Define the intrin strides
-    def get_strides(extents):
-        return [np.prod(extents[i:]).tolist() for i in range(len(extents))]
-
-    AS_align = chunk * wmma_k + offset
-    WS_align = warp_col_tiles * block_col_warps * wmma_n + offset
-    block_factor_n = wmma_m * warp_row_tiles * block_row_warps
-    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
-    CS_align = block_factor_o + offset
-    AS_strides = get_strides([1, 1, 1, AS_align, 1])
-    AL_strides = get_strides([1, 1, 1, wmma_k, 1])
-    WS_strides = get_strides([WS_align, 1])
-    WL_strides = get_strides([wmma_n * warp_col_tiles, 1])
-    CL_strides = get_strides([1, 1, 1, wmma_n * warp_col_tiles, 1])
-    CS_strides = get_strides([1, 1, 1, CS_align, 1])
-
-    # Schedule for output
-    nc, dc, hc, wc, oc = output.op.axis
-    block_k = s[output].fuse(dc, hc, wc)
-    s[output].bind(block_k, block_z)
-    block_i, nc = s[output].split(nc, factor=block_factor_n)
-    block_j, oc = s[output].split(oc, factor=block_factor_o)
-    s[output].reorder(block_k, block_i, block_j, nc, oc)
-    t = s[output].fuse(nc, oc)
-    t, ti = s[output].split(t, factor=vector_width)
-    t, tx = s[output].split(t, factor=warp_size)
-    t, ty = s[output].split(t, factor=block_row_warps)
-    t, tz = s[output].split(t, factor=block_col_warps)
-    s[output].bind(block_i, block_x)
-    s[output].bind(block_j, block_y)
-    s[output].bind(tz, thread_z)
-    s[output].bind(ty, thread_y)
-    s[output].bind(tx, thread_x)
-    s[output].vectorize(ti)
-
-    # Schedule wmma store
-    s[OL].compute_at(s[output], block_j)
-    nc, dc, hc, wc, oc = OL.op.axis
-    s[OL].reorder(dc, hc, wc, nc, oc)
-    s[OL].storage_align(wc, CS_align - 1, CS_align)
-    oc, ooc = s[OL].split(oc, factor=wmma_n)
-    oc, oci = s[OL].split(oc, factor=warp_col_tiles)
-    _, oc = s[OL].split(oc, factor=block_col_warps)
-    nc, nnc = s[OL].split(nc, factor=wmma_m)
-    nc, nci = s[OL].split(nc, factor=warp_row_tiles)
-    _, nc = s[OL].split(nc, factor=block_row_warps)
-    s[OL].reorder(nc, oc, nci, oci, nnc, ooc)
-    s[OL].bind(nc, thread_y)
-    s[OL].bind(oc, thread_z)
-
-    # Schedule wmma computation
-    s[ConvF].compute_at(s[OL], oc)
-    n, d, h, w, o = ConvF.op.axis
-    n, nnf = s[ConvF].split(n, factor=wmma_m)
-    o, oof = s[ConvF].split(o, factor=wmma_n)
-    ic, ii = s[ConvF].split(ic, factor=wmma_k)
-    ko, ki = s[ConvF].split(ic, factor=chunk)
-    s[ConvF].reorder(kd, kh, kw, ko, ki, n, o, nnf, oof, ii)
-
-    s[AF].compute_at(s[ConvF], ki)
-    s[WF].compute_at(s[ConvF], ki)
-
-    # Schedule wmma load
-    n, d, h, w, i = AF.op.axis
-    n, nn = s[AF].split(n, factor=wmma_m)
-    i, ii = s[AF].split(i, factor=wmma_k)
-    s[AF].reorder(n, i, nn, ii)
-
-    kd, kh, kw, i, o = WF.op.axis
-    i, ii = s[WF].split(i, factor=wmma_k)
-    o, oo = s[WF].split(o, factor=wmma_n)
-    s[WF].reorder(o, i, oo)
-    s[WF].reorder(i, o, ii, oo)
-
-    s[WS].compute_at(s[ConvF], ko)
-    s[AS].compute_at(s[ConvF], ko)
-
-    # Schedule for data's share memory
-    n, d, h, w, i = AS.op.axis
-    s[AS].reorder(d, h, w, n, i)
-    s[AS].storage_align(w, AS_align - 1, AS_align)
-    t = s[AS].fuse(n, i)
-    t, ti = s[AS].split(t, factor=vector_width)
-    t, tx = s[AS].split(t, factor=warp_size)
-    t, ty = s[AS].split(t, factor=block_row_warps)
-    _, tz = s[AS].split(t, factor=block_col_warps)
-    s[AS].bind(ty, thread_y)
-    s[AS].bind(tz, thread_z)
-    s[AS].bind(tx, thread_x)
-    s[AS].vectorize(ti)
-
-    # Schedule for kernel's share memory
-    kd, kh, kw, ic, o = WS.op.axis
-    t = s[WS].fuse(ic, o)
-    s[WS].storage_align(ic, WS_align - 1, WS_align)
-    t, ti = s[WS].split(t, factor=vector_width)
-    t, tx = s[WS].split(t, factor=warp_size)
-    t, ty = s[WS].split(t, factor=block_row_warps)
-    _, tz = s[WS].split(t, factor=block_col_warps)
-    s[WS].bind(ty, thread_y)
-    s[WS].bind(tz, thread_z)
-    s[WS].bind(tx, thread_x)
-    s[WS].vectorize(ti)
-
-    shape = (wmma_m, wmma_n, wmma_k)
-
-    # tensorize the wmma process
-    AS_shape = (wmma_m, 1, 1, 1, wmma_k)
-    AL_shape = (wmma_m, 1, 1, 1, wmma_k)
-    WS_shape = (wmma_k, wmma_n)
-    WL_shape = (wmma_k, wmma_n)
-    CL_shape = (wmma_m, 1, 1, 1, wmma_n)
-    CS_shape = (wmma_m, 1, 1, 1, wmma_n)
-
-    AL_gemm = te.placeholder(AL_shape, name="A", dtype=in_dtype)
-    WL_gemm = te.placeholder(WL_shape, name="B", dtype=in_dtype)
-    k_gemm = te.reduce_axis((0, wmma_k), name="k")
-    CL_compute = te.compute(
-        CL_shape,
-        lambda ii, t0, t1, t2, jj: te.sum(
-            AL_gemm[ii, t0, t1, t2, k_gemm].astype(out_dtype)
-            * WL_gemm[k_gemm, jj].astype(out_dtype),
-            axis=k_gemm,
-        ),
-        name="C",
-    )
-
-    s[AF].tensorize(
-        nn,
-        intrin_wmma_load_matrix_A(
-            AL_strides, AS_strides, shape, "row_major", AS_shape, AL_shape, in_dtype
-        ),
-    )
-    s[WF].tensorize(
-        ii,
-        intrin_wmma_load_matrix_W(
-            WL_strides, WS_strides, shape, "row_major", WS_shape, WL_shape, in_dtype
-        ),
-    )
-    s[OL].tensorize(
-        nnc, intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype, CL_shape, CS_shape)
-    )
-    s[ConvF].tensorize(
-        nnf,
-        intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides, CL_strides, shape),
-    )
-
-    N, OD, OH, OW, CO = get_const_tuple(output.shape)
-    KD, KH, KW, CI, _ = get_const_tuple(kernel.shape)
-    cfg.add_flop(2 * N * OD * OH * OW * CO * CI * KD * KH * KW)
-
-
-@autotvm.register_topi_compute("conv3d_ndhwc_tensorcore.cuda")
-def conv3d_ndhwc_tensorcore(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
-    """Compute conv3d with tensorcore for NDHWC layout"""
-    assert groups == 1, "tensorcore conv3d does not support groups"
-    return ndhwc_tensorcore_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv3d_ndhwc_tensorcore.cuda")
-def schedule_conv3d_ndhwc_tensorcore(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv3d_ndhwc_tensorcore" in op.tag:
-            schedule_ndhwc_tensorcore_cuda(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py b/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py
deleted file mode 100644
index 3ad85b9bbee7..000000000000
--- a/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Conv3d transpose template for cuda backend"""
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from .. import nn
-from ..utils import get_const_tuple, traverse_inline
-from .conv3d_direct import schedule_direct_conv3d_cuda
-
-
-@autotvm.register_topi_compute("conv3d_transpose_ncdhw.cuda")
-def conv3d_transpose_ncdhw(cfg, data, kernel, stride, padding, out_dtype, output_padding):
-    """Transposed 3D convolution ncdhw forward operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-    Input : tvm.te.Tensor
-        5-D with shape [batch, in_channel, in_depth, in_height, in_width]
-    Filter : tvm.te.Tensor
-        5-D with shape [in_channel, num_filter, filter_depth, filter_height, filter_width]
-    strides : int or a list/tuple of three ints
-        The spatial stride along height and width
-    padding : int or str
-        Padding size, or ['VALID', 'SAME']
-    out_dtype: str
-        The output type. This is used in mixed precision
-    output_padding : tuple of three ints
-        Used to disambiguate output shape
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
-    """
-    batch, inp_channels, inp_depth, inp_height, inp_width = get_const_tuple(data.shape)
-    _, out_channels, kernel_depth, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-    stride_depth, stride_height, stride_width = stride
-    outpad_depth, outpad_height, outpad_width = output_padding
-    assert (
-        outpad_height < stride_height
-        and outpad_width < stride_width
-        and outpad_depth < stride_depth
-    )
-    cfg.stride = stride
-    pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = nn.get_pad_tuple3d(
-        padding, (kernel_depth, kernel_height, kernel_width)
-    )
-
-    out_depth = (inp_depth - 1) * stride_depth + kernel_depth - pad_front - pad_back + outpad_depth
-    pad_front = kernel_depth - 1 - pad_front
-    pad_back = kernel_depth - 1 - pad_back
-    dilated_depth = stride_depth * (inp_depth - 1) + 1
-
-    out_width = (inp_width - 1) * stride_width + kernel_width - pad_left - pad_right + outpad_width
-    pad_left = kernel_width - 1 - pad_left
-    pad_right = kernel_width - 1 - pad_right
-    dilated_width = stride_width * (inp_width - 1) + 1
-
-    out_height = (
-        (inp_height - 1) * stride_height + kernel_height - pad_top - pad_bottom + outpad_height
-    )
-    pad_top = kernel_height - 1 - pad_top
-    pad_bottom = kernel_height - 1 - pad_bottom
-    dilated_height = stride_height * (inp_height - 1) + 1
-
-    # compute pad
-    data = te.compute(
-        (
-            batch,
-            inp_channels,
-            pad_front + dilated_depth + pad_back,
-            pad_top + dilated_height + pad_bottom,
-            pad_left + dilated_width + pad_right,
-        ),
-        lambda n, c, d, y, x: tvm.tir.if_then_else(
-            tvm.tir.all(
-                x >= pad_left,
-                x < pad_left + dilated_width,
-                tvm.tir.indexmod(x - pad_left, stride_width).equal(0),
-                y >= pad_top,
-                y < pad_top + dilated_height,
-                tvm.tir.indexmod(y - pad_top, stride_height).equal(0),
-                d >= pad_front,
-                d < pad_front + dilated_depth,
-                tvm.tir.indexmod(d - pad_front, stride_depth).equal(0),
-            ),
-            data[
-                n,
-                c,
-                tvm.tir.indexdiv(d - pad_front, stride_depth),
-                tvm.tir.indexdiv(y - pad_top, stride_height),
-                tvm.tir.indexdiv(x - pad_left, stride_width),
-            ],
-            tvm.tir.const(0.0, "float32"),
-        ),
-        name="data_pad",
-    )
-
-    # compute transposed conv
-    dc = te.reduce_axis((0, inp_channels), name="dc")
-    dd = te.reduce_axis((0, kernel_depth), name="dd")
-    dh = te.reduce_axis((0, kernel_height), name="dh")
-    dw = te.reduce_axis((0, kernel_width), name="dw")
-    data_out = te.compute(
-        (batch, out_channels, out_depth, out_height, out_width),
-        lambda b, c, d, h, w: te.sum(
-            data[b, dc, d + dd, h + dh, w + dw].astype(out_dtype)
-            * kernel[
-                dc, c, kernel_depth - 1 - dd, kernel_height - 1 - dh, kernel_width - 1 - dw
-            ].astype(out_dtype),
-            axis=[dc, dd, dh, dw],
-        ),
-        tag="conv3d_transpose_ncdhw",
-    )
-
-    return data_out
-
-
-@autotvm.register_topi_schedule("conv3d_transpose_ncdhw.cuda")
-def schedule_conv3d_transpose_ncdhw(cfg, outs):
-    """TOPI Schedule callback for conv3d transpose operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The parameters for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv3d transpose
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv3d transpose.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv3d_transpose_ncdhw":
-            schedule_direct_conv3d_cuda(
-                cfg, s, op.output(0), "NCDHW", "conv3d_transpose_ncdhw.cuda"
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/conv3d_winograd.py b/python/tvm/topi/cuda/conv3d_winograd.py
deleted file mode 100644
index 2f53d0458af1..000000000000
--- a/python/tvm/topi/cuda/conv3d_winograd.py
+++ /dev/null
@@ -1,685 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Winograd template for cuda backend"""
-
-import logging
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from .. import nn
-from ..utils import get_const_int, get_const_tuple, traverse_inline, simplify
-from ..nn.winograd_util import winograd_transform_matrices
-
-logger = logging.getLogger("conv3d_winograd")
-
-
-def _infer_tile_size(data, kernel):
-    N, CI, D, H, W = get_const_tuple(data.shape)
-
-    if H % 8 == 0:
-        return 4
-    return 2
-
-
-def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed):
-    """Compute declaration for winograd"""
-    tile_size = _infer_tile_size(data, kernel)
-
-    N, CI, D, H, W = get_const_tuple(data.shape)
-
-    if isinstance(dilation, int):
-        dilation_d = dilation_h = dilation_w = dilation
-    else:
-        dilation_d, dilation_h, dilation_w = dilation
-    DSTR, HSTR, WSTR = (strides, strides, strides) if isinstance(strides, int) else strides
-
-    if not pre_computed:  # kernel tensor is raw tensor, do strict check
-        if dilation_d != 1 or dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (1, 1, dilation_d, dilation_h, dilation_w))
-        CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
-        alpha = KW + tile_size - 1
-        assert DSTR == 1 and HSTR == 1 and WSTR == 1 and KD == KH and KH == KW
-    else:
-        # kernel tensor is pre-transformed. this op is created by alter op layout.
-        # dilation is not supported
-        alpha, _, _, CO, CI = get_const_tuple(kernel.shape)
-        KD = KH = KW = alpha + 1 - tile_size
-        assert (
-            DSTR == 1
-            and HSTR == 1
-            and WSTR == 1
-            and dilation_d == 1
-            and dilation_h == 1
-            and dilation_w == 1
-        )
-
-    pf, pt, pl, pb, pd, pr = nn.get_pad_tuple3d(padding, (KD, KH, KW))
-    data_pad = nn.pad(data, (0, 0, pf, pt, pl), (0, 0, pb, pd, pr), name="data_pad")
-
-    r = KW
-    m = tile_size
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
-
-    D = (D + pf + pb - KD) // DSTR + 1
-    H = (H + pt + pd - KH) // HSTR + 1
-    W = (W + pl + pr - KW) // WSTR + 1
-    nD, nH, nW = (D + m - 1) // m, (H + m - 1) // m, (W + m - 1) // m
-    P = N * nD * nH * nW
-
-    # transform kernel
-    if not pre_computed:
-        # Check if we are currently tuning, if so we want to avoid counting
-        # prepacking in time costs. Just use a placeholder with the packed shape instead.
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kernel_pack = te.placeholder(
-                (alpha, alpha, alpha, CO, CI), dtype=kernel.dtype, name="kernel_pack"
-            )
-        else:
-            r_kd = te.reduce_axis((0, KD), name="r_kd")
-            r_kh = te.reduce_axis((0, KH), name="r_kh")
-            r_kw = te.reduce_axis((0, KW), name="r_kw")
-            kernel_pack = te.compute(
-                (alpha, alpha, alpha, CO, CI),
-                lambda omg, eps, nu, co, ci: te.sum(
-                    kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw],
-                    axis=[r_kd, r_kh, r_kw],
-                ),
-                name="kernel_pack",
-            )
-    else:
-        kernel_pack = kernel
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-    # pack input tile
-    input_tile = te.compute(
-        (CI, P, alpha, alpha, alpha),
-        lambda c, p, omg, eps, nu: data_pad[idxdiv(p, (nD * nH * nW))][c][
-            idxmod(idxdiv(p, nH * nW), nD) * m + omg
-        ][idxmod(idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu],
-        name="d",
-    )
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    r_c = te.reduce_axis((0, alpha), "r_c")
-    data_pack = te.compute(
-        (alpha, alpha, alpha, CI, P),
-        lambda omg, eps, nu, ci, p: te.sum(
-            input_tile[ci][p][r_a][r_b][r_c] * B[r_a][omg] * B[r_b][eps] * B[r_c][nu],
-            axis=[r_a, r_b, r_c],
-        ),
-        name="data_pack",
-    )
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    bgemm = te.compute(
-        (alpha, alpha, alpha, CO, P),
-        lambda omg, eps, nu, co, p: te.sum(
-            kernel_pack[omg][eps][nu][co][ci] * data_pack[omg][eps][nu][ci][p], axis=[ci]
-        ),
-        name="bgemm",
-    )
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    r_c = te.reduce_axis((0, alpha), "r_c")
-    inverse = te.compute(
-        (CO, P, m, m, m),
-        lambda co, p, vd, vh, vw: te.sum(
-            bgemm[r_a][r_b][r_c][co][p] * A[r_a][vd] * A[r_b][vh] * A[r_c][vw], axis=[r_a, r_b, r_c]
-        ),
-        name="inverse",
-    )
-
-    # output
-    output = te.compute(
-        (N, CO, D, H, W),
-        lambda n, co, d, h, w: inverse[
-            co,
-            n * nD * nH * nW + idxdiv(d, m) * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
-            idxmod(d, m),
-            idxmod(h, m),
-            idxmod(w, m),
-        ],
-        name="output",
-        tag="conv3d_ncdhw_winograd",
-    )
-    cfg.add_flop(2 * N * CO * D * H * W * CI * KD * KH * KW)
-
-    return output
-
-
-def winograd_without_depth_cuda(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed
-):
-    """Compute declaration for winograd without transforming depth"""
-    tile_size = _infer_tile_size(data, kernel)
-
-    N, CI, D, H, W = get_const_tuple(data.shape)
-
-    if isinstance(dilation, int):
-        dilation_d = dilation_h = dilation_w = dilation
-    else:
-        dilation_d, dilation_h, dilation_w = dilation
-    DSTR, HSTR, WSTR = (strides, strides, strides) if isinstance(strides, int) else strides
-
-    if not pre_computed:  # kernel tensor is raw tensor, do strict check
-        if dilation_d != 1 or dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (1, 1, dilation_d, dilation_h, dilation_w))
-        CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
-        alpha = KW + tile_size - 1
-        assert HSTR == 1 and WSTR == 1 and KH == KW
-    else:
-        # kernel tensor is pre-transfomred. this op is created by alter op layout.
-        # dilation is not supported
-        alpha, _, KD, CO, CI = get_const_tuple(kernel.shape)
-        KH = KW = alpha + 1 - tile_size
-        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
-
-    pf, pt, pl, pb, pd, pr = nn.get_pad_tuple3d(padding, (KD, KH, KW))
-    data_pad = nn.pad(data, (0, 0, pf, pt, pl), (0, 0, pb, pd, pr), name="data_pad")
-    out_depth = simplify((D - KD + pf + pb) // DSTR + 1)
-    D += pf + pb
-
-    r = KW
-    m = tile_size
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
-
-    H = (H + pt + pd - KH) // HSTR + 1
-    W = (W + pl + pr - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-
-    # transform kernel
-    if not pre_computed:
-        # During autotuning dont count kernel packing as a time cost
-        # as it will later be removed via alter_op_layout.
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            kernel_pack = te.placeholder(
-                (alpha, alpha, KD, CO, CI), dtype=kernel.dtype, name="kernel_pack"
-            )
-        else:
-            r_kh = te.reduce_axis((0, KH), name="r_kh")
-            r_kw = te.reduce_axis((0, KW), name="r_kw")
-            kernel_pack = te.compute(
-                (alpha, alpha, KD, CO, CI),
-                lambda eps, nu, d, co, ci: te.sum(
-                    kernel[co][ci][d][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
-                ),
-                name="kernel_pack",
-            )
-    else:
-        kernel_pack = kernel
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-    # pack input tile
-    input_tile = te.compute(
-        (CI, D, P, alpha, alpha),
-        lambda c, d, p, eps, nu: data_pad[idxdiv(p, (nH * nW))][c][d][
-            idxmod(idxdiv(p, nW), nH) * m + eps
-        ][idxmod(p, nW) * m + nu],
-        name="d",
-    )
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    data_pack = te.compute(
-        (alpha, alpha, CI, D, P),
-        lambda eps, nu, ci, d, p: te.sum(
-            input_tile[ci][d][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="data_pack",
-    )
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    rz = te.reduce_axis((0, KD), name="rz")
-    bgemm = te.compute(
-        (alpha, alpha, CO, out_depth, P),
-        lambda eps, nu, co, d, p: te.sum(
-            kernel_pack[eps][nu][rz][co][ci] * data_pack[eps][nu][ci][d * DSTR + rz][p],
-            axis=[ci, rz],
-        ),
-        name="bgemm",
-    )
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    inverse = te.compute(
-        (CO, out_depth, P, m, m),
-        lambda co, d, p, vh, vw: te.sum(
-            bgemm[r_a][r_b][co][d][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
-        ),
-        name="inverse",
-    )
-
-    # output
-    output = te.compute(
-        (N, CO, out_depth, H, W),
-        lambda n, co, d, h, w: inverse[
-            co, d, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), idxmod(h, m), idxmod(w, m)
-        ],
-        name="output",
-        tag="conv3d_ncdhw_winograd_without_depth",
-    )
-    cfg.add_flop(2 * N * CO * D * H * W * CI * KD * KH * KW)
-
-    return output
-
-
-def schedule_winograd_cuda(cfg, s, output, pre_computed):
-    """Schedule winograd template"""
-    # get stages
-    inverse = s[output].op.input_tensors[0]
-    bgemm, A = s[inverse].op.input_tensors
-    kernel_pack, data_pack = s[bgemm].op.input_tensors
-    input_tile, B = s[data_pack].op.input_tensors
-    pad_data = s[input_tile].op.input_tensors[0]
-
-    # data transform
-    s[B].compute_inline()
-
-    data_l = s.cache_write(data_pack, "local")
-    omg, eps, nu, c, p = s[data_l].op.axis
-    r_a, r_b, r_c = s[data_l].op.reduce_axis
-    # TODO unrolling by omg, eps, nu may improve performance but
-    # in some cases causes extremely long build times due to imperfect tiling.
-    for axis in [r_a, r_b, r_c]:
-        s[data_l].unroll(axis)
-
-    omg, eps, nu, c, p = s[data_pack].op.axis
-    p, pi = s[data_pack].split(p, 1)
-    fused = s[data_pack].fuse(c, p)
-    bb, tt = s[data_pack].split(fused, 128)
-    s[data_pack].reorder(bb, tt, pi, omg, eps, nu)
-    s[data_pack].bind(bb, te.thread_axis("blockIdx.x"))
-    s[data_pack].bind(tt, te.thread_axis("threadIdx.x"))
-
-    s[data_l].compute_at(s[data_pack], pi)
-    s[input_tile].compute_at(s[data_pack], pi)
-    s[pad_data].compute_inline()
-
-    # transform kernel
-    if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning:
-        kernel, G = s[kernel_pack].op.input_tensors
-        omg, eps, nu, co, ci = s[kernel_pack].op.axis
-        s[G].compute_inline()
-        r_a, r_b, r_c = s[kernel_pack].op.reduce_axis
-        # Could add additional unrolling by omg, eps, nu in the future.
-        for axis in [r_a, r_b, r_c]:
-            s[kernel_pack].unroll(axis)
-
-        fused = s[kernel_pack].fuse(co, ci)
-        bb, tt = s[kernel_pack].split(fused, 128)
-        s[kernel_pack].reorder(bb, tt, omg, eps, nu, r_a, r_b, r_c)
-        s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x"))
-        s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x"))
-    else:
-        kernel = kernel_pack
-
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    ##### space definition begin #####
-    b1, b2, b3, y, x = s[bgemm].op.axis
-    rc = s[bgemm].op.reduce_axis[0]
-    alpha = get_const_int(b1.dom.extent)
-
-    cfg.define_split(
-        "tile_b",
-        cfg.axis(alpha * alpha * alpha),
-        num_outputs=4,
-        filter=lambda x: x.size[-3:] == [1, 1, 1],
-    )
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 128, 1500])
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-    ##### space definition end #####
-
-    # batch gemm
-    C = bgemm
-    A0, B0 = kernel_pack, data_pack
-
-    OL = s.cache_write(C, "local")
-    AA = s.cache_read(A0, "shared", [OL])
-    BB = s.cache_read(B0, "shared", [OL])
-
-    b = s[bgemm].fuse(b1, b2, b3)
-
-    # tile and bind spatial axes
-    bgemm_scope, b = s[bgemm].split(b, nparts=1)
-    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, C, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].bind(bz, te.thread_axis("blockIdx.z"))
-    s[C].bind(by, te.thread_axis("blockIdx.y"))
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(vz, te.thread_axis("vthread"))
-    s[C].bind(vy, te.thread_axis("vthread"))
-    s[C].bind(vx, te.thread_axis("vthread"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
-
-    # tile reduction axes
-    s[OL].compute_at(s[C], tx)
-    b1, b2, b3, y, x = s[OL].op.axis
-    b = s[OL].fuse(b1, b2, b3)
-    (rc,) = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    s[OL].reorder(rco, rci, b, y, x)
-
-    s[AA].compute_at(s[OL], rco)
-    s[BB].compute_at(s[OL], rco)
-
-    # cooperative fetching
-    for load in [AA, BB]:
-        fused = s[load].fuse(*list(s[load].op.axis))
-        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
-        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
-        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    s[C].pragma(bgemm_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[C].pragma(bgemm_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    # schedule inverse, output and fusion
-    if output.op in s.outputs:
-        OL = None
-    else:
-        OL = output
-        s[OL].set_scope("local")
-        output = s.outputs[0]
-
-    m = alpha - 3 + 1
-    n, co, d, h, w = s[output].op.axis
-    do, di = s[output].split(d, m)
-    ho, hi = s[output].split(h, m)
-    wo, wi = s[output].split(w, m)
-    s[output].reorder(n, co, do, ho, wo, di, hi, wi)
-    inverse_scope, n = s[output].split(n, nparts=1)
-
-    fused = s[output].fuse(n, co, do, ho, wo)
-    bb, tt = s[output].split(fused, 128)
-
-    s[output].bind(bb, te.thread_axis("blockIdx.x"))
-    s[output].bind(tt, te.thread_axis("threadIdx.x"))
-
-    if OL is not None:
-        s[OL].compute_at(s[output], tt)
-
-    s[A].compute_inline()
-    co, p, vd, vh, vw = s[inverse].op.axis
-    r_a, r_b, r_c = s[inverse].op.reduce_axis
-    # Could add additional unrolling of vd, vh, vw, in the future
-    for axis in [r_a, r_b, r_c]:
-        s[inverse].unroll(axis)
-    s[inverse].compute_at(s[output], tt)
-
-    return s
-
-
-def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed):
-    """Schedule winograd template"""
-    # get stages
-    inverse = s[output].op.input_tensors[0]
-    bgemm, A = s[inverse].op.input_tensors
-    kernel_pack, data_pack = s[bgemm].op.input_tensors
-    input_tile, B = s[data_pack].op.input_tensors
-    pad_data = s[input_tile].op.input_tensors[0]
-
-    # data transform
-    s[B].compute_inline()
-
-    data_l = s.cache_write(data_pack, "local")
-    eps, nu, c, d, p = s[data_l].op.axis
-    r_a, r_b = s[data_l].op.reduce_axis
-    for axis in [eps, nu, r_a, r_b]:
-        s[data_l].unroll(axis)
-
-    eps, nu, c, d, p = s[data_pack].op.axis
-    p, pi = s[data_pack].split(p, 1)
-    fused = s[data_pack].fuse(c, d, p)
-    bb, tt = s[data_pack].split(fused, 128)
-    s[data_pack].reorder(bb, tt, pi, eps, nu)
-    s[data_pack].bind(bb, te.thread_axis("blockIdx.x"))
-    s[data_pack].bind(tt, te.thread_axis("threadIdx.x"))
-
-    s[data_l].compute_at(s[data_pack], pi)
-    s[input_tile].compute_at(s[data_pack], pi)
-    s[pad_data].compute_inline()
-
-    # transform kernel
-    if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning:
-        kernel, G = s[kernel_pack].op.input_tensors
-        eps, nu, kd, co, ci = s[kernel_pack].op.axis
-        s[G].compute_inline()
-        r_a, r_b = s[kernel_pack].op.reduce_axis
-        for axis in [eps, nu, r_a, r_b]:
-            s[kernel_pack].unroll(axis)
-
-        fused = s[kernel_pack].fuse(kd, co, ci)
-        bb, tt = s[kernel_pack].split(fused, 128)
-        s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b)
-        s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x"))
-        s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x"))
-    else:
-        kernel = kernel_pack
-
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    ##### space definition begin #####
-    b1, b2, z, y, x = s[bgemm].op.axis
-    # Combine channel and depth axes.
-    rc = s[bgemm].op.reduce_axis[0]
-    rz = s[bgemm].op.reduce_axis[1]
-    alpha = get_const_int(b1.dom.extent)
-
-    cfg.define_split(
-        "tile_b", cfg.axis(alpha * alpha), num_outputs=4, filter=lambda x: x.size[-3:] == [1, 1, 1]
-    )
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_split("tile_rz", rz, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 128, 1500])
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-    ##### space definition end #####
-
-    # batch gemm
-    C = bgemm
-    A0, B0 = kernel_pack, data_pack
-
-    OL = s.cache_write(C, "local")
-    AA = s.cache_read(A0, "shared", [OL])
-    BB = s.cache_read(B0, "shared", [OL])
-
-    b = s[bgemm].fuse(b1, b2)
-    # Allow two different tiling strategies as both seem
-    # to work best in different cases.
-    cfg.define_knob("unroll_axis", [0, 1])
-    # tile and bind spatial axes
-    bgemm_scope, b = s[bgemm].split(b, nparts=1)
-    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, C, z)
-    if cfg["unroll_axis"].val:
-        bx, vx, tx, xi = cfg["tile_x"].apply(s, C, y)
-    else:
-        bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].bind(bz, te.thread_axis("blockIdx.z"))
-    s[C].bind(by, te.thread_axis("blockIdx.y"))
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(vz, te.thread_axis("vthread"))
-    s[C].bind(vy, te.thread_axis("vthread"))
-    s[C].bind(vx, te.thread_axis("vthread"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
-    if cfg["unroll_axis"].val:
-        s[C].unroll(x)
-    else:
-        s[C].unroll(y)
-
-    # tile reduction axes
-    s[OL].compute_at(s[C], tx)
-    b1, b2, y1, y2, x = s[OL].op.axis
-    y = s[OL].fuse(y1, y2)
-    b = s[OL].fuse(b1, b2)
-    rc, rz = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    rzo, rzi = cfg["tile_rz"].apply(s, OL, rz)
-    s[OL].reorder(rco, rzo, rci, rzi, b, y, x)
-
-    s[AA].compute_at(s[OL], rco)
-    s[BB].compute_at(s[OL], rco)
-
-    # cooperative fetching
-    for load in [AA, BB]:
-        fused = s[load].fuse(*list(s[load].op.axis))
-        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
-        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
-        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    s[C].pragma(bgemm_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[C].pragma(bgemm_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    # schedule inverse, output and fusion
-    if output.op in s.outputs:
-        OL = None
-    else:
-        OL = output
-        s[OL].set_scope("local")
-        output = s.outputs[0]
-
-    m = alpha - 3 + 1
-    n, co, d, h, w = s[output].op.axis
-    do, di = s[output].split(d, m)
-    ho, hi = s[output].split(h, m)
-    wo, wi = s[output].split(w, m)
-    s[output].reorder(n, co, do, ho, wo, di, hi, wi)
-    inverse_scope, n = s[output].split(n, nparts=1)
-
-    fused = s[output].fuse(n, co, do, ho, wo)
-    bb, tt = s[output].split(fused, 128)
-
-    s[output].bind(bb, te.thread_axis("blockIdx.x"))
-    s[output].bind(tt, te.thread_axis("threadIdx.x"))
-
-    if OL is not None:
-        s[OL].compute_at(s[output], tt)
-
-    s[A].compute_inline()
-    co, d, p, vh, vw = s[inverse].op.axis
-    r_a, r_b = s[inverse].op.reduce_axis
-    for axis in [vh, vw, r_a, r_b]:
-        s[inverse].unroll(axis)
-    s[inverse].compute_at(s[output], tt)
-
-    return s
-
-
-@autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda")
-def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
-    """Conv3d NCDHW using winograd optimization"""
-    assert groups == 1, "conv3d_ncdhw_winograd only supports a single group"
-    CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
-    # Check if we can transform depth.
-    if 2 < KD < 8 and KD == KH:
-        return winograd_cuda(
-            cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
-        )
-
-    return winograd_without_depth_cuda(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
-    )
-
-
-@autotvm.register_topi_schedule("conv3d_ncdhw_winograd.cuda")
-def schedule_conv3d_ncdhw_winograd(cfg, outs):
-    """Dispatch to schedule approriate for conv3d winograd algorithm used."""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv3d_ncdhw_winograd_without_depth" in op.tag:
-            schedule_winograd_no_depth_cuda(cfg, s, op.output(0), pre_computed=False)
-        elif "conv3d_ncdhw_winograd" in op.tag:
-            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda")
-def conv3d_ncdhw_winograd_without_weight_transform(
-    cfg, data, kernel, strides, padding, dilation, groups, out_dtype
-):
-    """Conv3d NCDHW winograd without weight transform."""
-    assert (
-        groups == 1
-    ), "conv3d_ncdhw_winograd_without_weight_transform does not support more than one group"
-    A, B, C, _, _ = get_const_tuple(kernel.shape)
-    # Check if we can transform depth.
-    if A == B == C:
-        return winograd_cuda(
-            cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
-        )
-
-    return winograd_without_depth_cuda(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
-    )
-
-
-@autotvm.register_topi_schedule("conv3d_ncdhw_winograd_without_weight_transform.cuda")
-def schedule_conv3d_ncdhw_winograd_without_weight_transform(cfg, outs):
-    """TOPI schedule callback"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv3d_ncdhw_winograd_without_depth" in op.tag:
-            schedule_winograd_no_depth_cuda(cfg, s, op.output(0), pre_computed=True)
-        elif "conv3d_ncdhw_winograd" in op.tag:
-            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/correlation.py b/python/tvm/topi/cuda/correlation.py
deleted file mode 100644
index 9b1698329fd3..000000000000
--- a/python/tvm/topi/cuda/correlation.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Correlation operators on CUDA"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from .. import nn
-from ..utils import traverse_inline
-
-
-@autotvm.register_topi_compute("correlation_nchw.cuda")
-def correlation_nchw(
-    cfg, data1, data2, kernel_size, max_displacement, stride1, stride2, padding, is_multiply
-):
-    """Correlation operator in NCHW layout.
-
-    Parameters
-    ----------
-    data1 : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    data2 : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    kernel_size: int
-        Kernel size for correlation, must be an odd number
-
-    max_displacement: int
-        Max displacement of Correlation
-
-    stride1: int
-        Stride for data1
-
-    stride2: int
-        Stride for data2 within the neightborhood centered around data1
-
-    padding : int or a list/tuple of 2 or 4 ints
-        Padding size, or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints
-
-    is_multiply: bocorrelation
-        operation type is either multiplication or substraction
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    # pylint: disable=unused-argument
-    return nn.correlation_nchw(
-        data1, data2, kernel_size, max_displacement, stride1, stride2, padding, is_multiply
-    )
-
-
-def _schedule_correlation_nchw(cfg, s, correlation):
-    """Schedule correlation_nchw direct template"""
-    # pylint: disable=invalid-name
-    ##### space definition begin #####
-    n, f, y, x = s[correlation].op.axis
-    rc, ry, rx = s[correlation].op.reduce_axis
-    cfg.define_split("tile_f", f, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-
-    ##### space definition end #####
-
-    padded_data1, padded_data2 = s[correlation].op.input_tensors
-    s[padded_data1].compute_inline()
-    s[padded_data2].compute_inline()
-
-    # create cache stage
-    s[correlation].set_scope("local")
-    AA = s.cache_read(padded_data1, "shared", [correlation])
-    BB = s.cache_read(padded_data2, "shared", [correlation])
-
-    output = s.outputs[0].output(0)
-
-    # tile and bind spatial axes
-    n, f, y, x = s[output].op.axis
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-    s[correlation].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, f, y, x = s[correlation].op.axis
-    rc, ry, rx = s[correlation].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, correlation, rc)
-    ryo, ryi = cfg["tile_ry"].apply(s, correlation, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, correlation, rx)
-    s[correlation].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
-
-    s[AA].compute_at(s[correlation], rxo)
-    s[BB].compute_at(s[correlation], rxo)
-
-    # cooperative fetching
-    for load in [AA, BB]:
-        n, f, y, x = s[load].op.axis
-        fused = s[load].fuse(n, f, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-
-@autotvm.register_topi_schedule("correlation_nchw.cuda")
-def schedule_correlation_nchw(cfg, outs):
-    """schedule of correlation_nchw for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of correlation
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for correlation.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "correlation_nchw":
-            _schedule_correlation_nchw(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/deformable_conv2d.py b/python/tvm/topi/cuda/deformable_conv2d.py
deleted file mode 100644
index 911588cad5a3..000000000000
--- a/python/tvm/topi/cuda/deformable_conv2d.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-argument
-"""Schedule template of deformable conv2d with cuda backend"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from .. import nn
-from ..utils import traverse_inline
-
-
-@autotvm.register_topi_compute("deformable_conv2d_nchw.cuda")
-def deformable_conv2d_nchw(
-    cfg, data, offset, kernel, strides, padding, dilation, deformable_groups, groups, out_dtype
-):
-    """Deformable Conv2d."""
-    return nn.deformable_conv2d_nchw(
-        data, offset, kernel, strides, padding, dilation, deformable_groups, groups, out_dtype
-    )
-
-
-@autotvm.register_topi_schedule("deformable_conv2d_nchw.cuda")
-def schedule_deformable_conv2d_nchw(cfg, outs):
-    """TOPI schedule callback of deformable conv2d for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "deformable_conv2d_nchw":
-            _schedule_direct_cuda(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_direct_cuda(cfg, s, conv):
-    """Schedule template of deformable conv2d"""
-    n, f, y, x = s[conv].op.axis
-    rc, ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_f", f, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-
-    data_deform, kernel = s[conv].op.input_tensors
-
-    s[data_deform].compute_inline()
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AA = s.cache_read(data_deform, "shared", [OL])
-    WW = s.cache_read(kernel, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, f, y, x = s[output].op.axis
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-    s[OL].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, f, y, x = s[OL].op.axis
-    rc, ry, rx = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    ryo, ryi = cfg["tile_ry"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, OL, rx)
-    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
-    cfg.define_reorder("reorder_inner", [rco, ryo, rxo], "all")
-    cfg["reorder_inner"].apply(s, OL, [rco, ryo, rxo])
-    cfg["reorder_inner"].apply(s, OL, [rci, ryi, rxi])
-
-    cache_loc = [rco, ryo, rxo][cfg["reorder_inner"].perm[-1]]
-    s[AA].compute_at(s[OL], cache_loc)
-    s[WW].compute_at(s[OL], cache_loc)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        fused = s[load].fuse(*s[load].op.axis)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py
deleted file mode 100644
index fa2c4a0f9d6d..000000000000
--- a/python/tvm/topi/cuda/dense.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Schedule for dense operator"""
-import logging
-import tvm
-from tvm import te, autotvm
-from tvm.contrib import cublas
-from .tensor_intrin import dp4a
-from .. import tag
-from .. import generic
-from ..utils import traverse_inline, get_const_tuple
-
-logger = logging.getLogger("topi")
-
-
-def _matmul_cublas_common(
-    cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False
-):
-    assert len(tensor_a.shape) == 2 and len(tensor_b.shape) == 2, "only support 2-dim matmul"
-    if bias is not None:
-        assert len(bias.shape) == 1
-    if out_dtype is None:
-        out_dtype = tensor_a.dtype
-    if out_dtype not in [tensor_a.dtype, "int32"]:
-        assert out_dtype == tensor_a.dtype, "Mixed precision other than int8 + int32 not supported."
-    batch, in_dim = get_const_tuple(tensor_a.shape)
-    out_dim, _ = get_const_tuple(tensor_b.shape)
-    matmul = cublas.matmul(tensor_a, tensor_b, transpose_a, transpose_b, dtype=out_dtype)
-    if all(isinstance(d, int) for d in [batch, in_dim, out_dim]):
-        cfg.add_flop(batch * in_dim * out_dim * 2)
-    if bias is not None:
-        matmul = te.compute(
-            (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST
-        )
-    return matmul
-
-
-@autotvm.register_topi_compute("matmul_cublas.cuda")
-def matmul_cublas(
-    cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False
-):
-    """Matmul operator on CUDA with CUBLAS"""
-    return _matmul_cublas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b)
-
-
-@autotvm.register_topi_schedule("matmul_cublas.cuda")
-def schedule_matmul_cublas(_, outs):
-    """Schedule matmul operator using CUBLAS"""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("dense_cublas.cuda")
-def dense_cublas(cfg, data, weight, bias=None, out_dtype=None):
-    """Dense operator on CUDA with CUBLAS. This is an alias of matmul_nt operator."""
-    return _matmul_cublas_common(cfg, data, weight, bias, out_dtype, False, True)
-
-
-@autotvm.register_topi_schedule("dense_cublas.cuda")
-def schedule_dense_cublas(_, outs):
-    """Schedule dense operator using CUBLAS"""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("dense_int8.cuda")
-def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
-    """Dense operator for int8 on CUDA"""
-    if out_dtype is None:
-        out_dtype = data.dtype
-
-    batch, in_dim = get_const_tuple(data.shape)
-    out_dim, _ = get_const_tuple(weight.shape)
-    k = te.reduce_axis((0, in_dim), name="k")
-
-    matmul = te.compute(
-        (batch, out_dim),
-        lambda i, j: te.sum(
-            data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=[k]
-        ),
-        tag="dense_int8",
-    )
-
-    cfg.add_flop(batch * in_dim * out_dim * 2)
-
-    if bias is not None:
-        matmul = te.compute(
-            (batch, out_dim),
-            lambda i, j: matmul[i, j] + bias[j].astype(out_dtype),
-            tag=tag.BROADCAST,
-        )
-        cfg.add_flop(batch * out_dim)
-
-    return matmul
-
-
-@autotvm.register_topi_schedule("dense_int8.cuda")
-def schedule_dense_int8(cfg, outs):
-    """Dense schedule for int8 on CUDA"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "dense_int8" in op.tag:
-            _schedule_dense_int8(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_dense_int8(cfg, s, output):
-    data, weight = s[output].op.input_tensors
-    if len(weight.op.input_tensors) == 1 and weight.op.input_tensors[0] == data:
-        s[weight].compute_inline()
-
-    batch, in_dim = get_const_tuple(data.shape)
-    out_dim, _ = get_const_tuple(weight.shape)
-
-    in_dim_factor = 4
-    assert in_dim % in_dim_factor == 0, f"Input dimension must divide {in_dim_factor}"
-    if in_dim % 16 == 0:
-        in_dim_factor = 16
-
-    # create tuning space
-    cfg.define_split("tile_y", batch, num_outputs=4)
-    cfg.define_split("tile_x", out_dim, num_outputs=4)
-    cfg.define_split("tile_k", in_dim // in_dim_factor, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    # create cache stage
-    AA = s.cache_read(data, "shared", [output])
-    WW = s.cache_read(weight, "shared", [output])
-    CC = s.cache_write(output, "local")
-
-    # handle bias
-    if output.op not in s.outputs:
-        s[output].compute_inline()
-        output = s.outputs[0].output(0)
-
-    n, x = s[output].op.axis
-
-    # this is the scope to attach global config inside this kernel
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    ko = CC.op.reduce_axis[0]
-    ko, ki = s[CC].split(ko, factor=4)
-    ko, kt = cfg["tile_k"].apply(s, CC, ko)
-    target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
-
-    if do_tensorize:
-        dtypes = (data.dtype, weight.dtype)
-        s[CC].tensorize(ki, dp4a("shared", "shared", "local", dtypes))
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, n)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    s[output].reorder(by, bx, vy, vx, ty, tx, yi, xi)
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    n_ty = cfg["tile_y"].size[2]
-    n_tx = cfg["tile_x"].size[2]
-
-    s[CC].compute_at(s[output], tx)
-    yo, xo = CC.op.axis[:2]
-    s[CC].reorder(ko, kt, yo, xo, ki)
-
-    for load in [AA, WW]:
-        s[load].compute_at(s[CC], ko)
-
-        outer, inner = s[load].split(s[load].op.axis[-1], factor=in_dim_factor)
-        s[load].vectorize(inner)
-        fused = s[load].op.axis[:-1] + [outer]
-        fused = s[load].fuse(*fused)
-
-        fused, tx = s[load].split(fused, factor=n_tx)
-        fused, ty = s[load].split(fused, factor=n_ty)
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", False)
-    return s
diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py
deleted file mode 100644
index 506d94e60ea0..000000000000
--- a/python/tvm/topi/cuda/dense_tensorcore.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
-"""Compute and Schedule definition for dense tensorcore with cuda backend"""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te, autotvm
-from .. import tag
-from ..utils import traverse_inline, get_const_tuple
-from .tensor_intrin import (
-    intrin_wmma_load_matrix_A,
-    intrin_wmma_load_matrix_W,
-    intrin_wmma_store_matrix,
-    intrin_wmma_gemm,
-)
-
-
-@autotvm.register_topi_compute("dense_tensorcore.cuda")
-def dense_tensorcore(cfg, data, weight, bias=None, out_dtype=None):
-    """Dense tensorcore operator on CUDA"""
-    matmul = dense_tensorcore_cuda(data, weight, bias, out_dtype)
-    return matmul
-
-
-@autotvm.register_topi_schedule("dense_tensorcore.cuda")
-def schedule_dense_tensorcore(cfg, outs):
-    """Schedule dense operator using Tensorcore"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "dense_tensorcore":
-            _schedule_dense_tensorcore(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def dense_tensorcore_cuda(data, weight, bias=None, out_dtype=None):
-    """Dense tensorcore operator on CUDA"""
-    assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim dense"
-    if bias is not None:
-        assert len(bias.shape) == 1
-    if out_dtype is None:
-        out_dtype = data.dtype
-    batch, in_dim = get_const_tuple(data.shape)
-    out_dim, _ = get_const_tuple(weight.shape)
-
-    assert data.dtype == weight.dtype
-    assert data.dtype in ["float16", "int8", "uint8", "int4", "uint4"]
-    if data.dtype in ["float16", "int8", "uint8"]:
-        assert (
-            (batch % 8 == 0 and in_dim % 16 == 0 and out_dim % 32 == 0)
-            or (batch % 16 == 0 and in_dim % 16 == 0 and out_dim % 16 == 0)
-            or (batch % 32 == 0 and in_dim % 16 == 0 and out_dim % 8 == 0)
-        ), (
-            "The shape of (batch, in_dim, out_dim) "
-            "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
-        )
-    else:
-        assert (
-            batch % 8 == 0 and in_dim % 32 == 0 and out_dim % 8 == 0
-        ), "The shape of (batch, in_dim, out_dim) must be multiple of (8, 32, 8)"
-
-    k = te.reduce_axis((0, in_dim), name="k")
-    matmul = te.compute(
-        (batch, out_dim),
-        lambda i, j: te.sum(data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k),
-        name="T_dense",
-        tag="dense_tensorcore",
-    )
-    if bias is not None:
-        matmul = te.compute(
-            (batch, out_dim),
-            lambda i, j: matmul[i, j] + bias[j].astype(out_dtype),
-            tag=tag.BROADCAST,
-        )
-    return matmul
-
-
-def _schedule_dense_tensorcore(cfg, s, C):
-    """Schedule dense operator using Tensorcore"""
-    A, B = s[C].op.input_tensors
-    if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A:
-        s[B].compute_inline()
-    batch, out_dim = get_const_tuple(C.shape)
-    data_dtype = A.dtype
-    out_dtype = C.dtype
-
-    # Explicit memory access
-    AS = s.cache_read(A, "shared", [C])
-    BS = s.cache_read(B, "shared", [C])
-    AF = s.cache_read(AS, "wmma.matrix_a", [C])
-    BF = s.cache_read(BS, "wmma.matrix_b", [C])
-    CF = s.cache_write(C, "wmma.accumulator")
-    CS = s.cache_read(CF, "shared", [C])
-
-    # fallback support
-    target = tvm.target.Target.current()
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(
-            target.kind.name, target.model, "dense_tensorcore.cuda"
-        )
-        cfg.fallback_with_reference_log(ref_log)
-
-    # Deal with op fusion, such as bias and relu
-    if C.op not in s.outputs:
-        s[C].compute_inline()
-        C = s.outputs[0].output(0)
-
-    # create tuning space
-    cfg.define_knob("block_row_warps", [1, 2, 4])
-    cfg.define_knob("block_col_warps", [1, 2, 4])
-    cfg.define_knob("warp_row_tiles", [1, 2, 4])
-    cfg.define_knob("warp_col_tiles", [1, 2, 4])
-    cfg.define_knob("chunk", [1, 2, 4, 8])
-    cfg.define_knob("offset", [0, 8])
-    cfg.define_knob("offsetCS", [0, 8])
-    cfg.define_knob("vec", [1, 2, 4, 8])
-
-    if data_dtype in ["float16", "int8", "uint8"]:
-        # Ensure that the default parameters are applicable when autotvm is not in use
-        if batch % 32 == 0 and out_dim % 8 == 0:
-            cfg.define_knob("wmma_m", [32, 16, 8])
-        elif batch % 16 == 0 and out_dim % 16 == 0:
-            cfg.define_knob("wmma_m", [16, 8, 32])
-        elif batch % 8 == 0 and out_dim % 32 == 0:
-            cfg.define_knob("wmma_m", [8, 16, 32])
-        wmma_k = 16
-        wmma_m = cfg["wmma_m"].val
-        if wmma_m == 16:
-            wmma_n = 16
-        elif wmma_m == 8:
-            wmma_n = 32
-        elif wmma_m == 32:
-            wmma_n = 8
-    elif data_dtype in ["int4", "uint4"]:
-        wmma_m = wmma_n = 8
-        wmma_k = 32
-    else:
-        raise ValueError(f"data dtype {data_dtype} is not yet supported")
-
-    warp_size = 32
-    block_row_warps = cfg["block_row_warps"].val
-    block_col_warps = cfg["block_col_warps"].val
-    warp_row_tiles = cfg["warp_row_tiles"].val
-    warp_col_tiles = cfg["warp_col_tiles"].val
-    chunk = cfg["chunk"].val
-    offset = cfg["offset"].val
-    offsetCS = cfg["offsetCS"].val
-    vec = cfg["vec"].val
-
-    # Define the stride of intrin functions
-    AS_align = chunk * wmma_k + offset
-    BS_align = chunk * wmma_k + offset
-    CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
-    AS_stride = [AS_align, 1]
-    BS_stride = [BS_align, 1]
-    AF_stride = [wmma_k, 1]
-    BF_stride = [wmma_k, 1]
-    CF_stride = [warp_col_tiles * wmma_n, 1]
-    CS_stride = [CS_align, 1]
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    # Schedule for dense computation
-    block_factor_b = wmma_m * warp_row_tiles * block_row_warps
-    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
-    b, o = C.op.axis
-    block_i, bc = s[C].split(b, factor=block_factor_b)
-    block_j, oc = s[C].split(o, factor=block_factor_o)
-    s[C].reorder(block_i, block_j, bc, oc)
-    t = s[C].fuse(bc, oc)
-    t, vi = s[C].split(t, factor=vec)
-    t, tx = s[C].split(t, factor=warp_size)
-    t, ty = s[C].split(t, factor=block_row_warps)
-    t, tz = s[C].split(t, factor=block_col_warps)
-    s[C].bind(block_i, block_x)
-    s[C].bind(block_j, block_y)
-    s[C].bind(tz, thread_z)
-    s[C].bind(ty, thread_y)
-    s[C].bind(tx, thread_x)
-    s[C].vectorize(vi)
-
-    # Schedule for wmma store
-    s[CS].compute_at(s[C], block_j)
-    bb, oo = CS.op.axis
-    s[CS].storage_align(bb, CS_align - 1, CS_align)
-    bb, bbi = s[CS].split(bb, factor=wmma_m)
-    oo, ooi = s[CS].split(oo, factor=wmma_n)
-    bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
-    oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
-    s[CS].reorder(bb, oo, bbii, ooii, bbi, ooi)
-    s[CS].bind(bb, thread_y)
-    s[CS].bind(oo, thread_z)
-
-    # Schedule for wmma computation
-    s[CF].compute_at(s[CS], oo)
-    warp_i, warp_j = CF.op.axis
-    warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
-    warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
-    (k,) = CF.op.reduce_axis
-    k, _k = s[CF].split(k, factor=wmma_k)
-    ko, ki = s[CF].split(k, factor=chunk)
-    s[CF].reorder(ko, ki, warp_i, warp_j, _ii, _jj, _k)
-
-    # Schedule for  wmma_matrix_a load
-    s[AF].compute_at(s[CF], ki)
-    b, i = AF.op.axis
-    b, b_ii = s[AF].split(b, factor=wmma_m)
-    i, i_jj = s[AF].split(i, factor=wmma_k)
-    s[AF].reorder(b, i, b_ii, i_jj)
-
-    # Schedule for  wmma_matrix_b load
-    s[BF].compute_at(s[CF], ki)
-    o, i = BF.op.axis
-    o, o_ii = s[BF].split(o, factor=wmma_n)
-    i, i_ii = s[BF].split(i, factor=wmma_k)
-    s[BF].reorder(o, i, o_ii, i_ii)
-
-    # Schedule for A's(B's) shared memory load
-    def shared_schedule(stage, strides):
-        s[stage].compute_at(s[CF], ko)
-        xo, yo = stage.op.axis
-        s[stage].storage_align(xo, strides - 1, strides)
-        t = s[stage].fuse(xo, yo)
-        t, vi = s[stage].split(t, factor=vec)
-        t, tx = s[stage].split(t, factor=warp_size)
-        t, ty = s[stage].split(t, factor=block_row_warps)
-        _, tz = s[stage].split(t, factor=block_col_warps)
-        s[stage].bind(ty, thread_y)
-        s[stage].bind(tz, thread_z)
-        s[stage].bind(tx, thread_x)
-        s[stage].vectorize(vi)
-
-    shared_schedule(AS, AS_align)
-    shared_schedule(BS, BS_align)
-
-    shape = (wmma_m, wmma_n, wmma_k)
-    AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=data_dtype)
-    BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=data_dtype)
-    k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm")
-    CL_compute = te.compute(
-        (wmma_m, wmma_n),
-        lambda ii, jj: te.sum(
-            AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].astype(out_dtype),
-            axis=k_gemm,
-        ),
-        name="CL_compute",
-    )
-
-    # lower the computation loops down to TensorCore hardware intrinsics
-    # by mapping the dense tensorcore to tensor intrinsics
-    s[AF].tensorize(
-        b_ii,
-        intrin_wmma_load_matrix_A(
-            AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), data_dtype
-        ),
-    )
-    s[BF].tensorize(
-        o_ii,
-        intrin_wmma_load_matrix_W(
-            BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), data_dtype
-        ),
-    )
-    s[CF].tensorize(
-        _ii, intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape)
-    )
-    s[CS].tensorize(
-        bbi,
-        intrin_wmma_store_matrix(
-            CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n)
-        ),
-    )
diff --git a/python/tvm/topi/cuda/depthwise_conv2d.py b/python/tvm/topi/cuda/depthwise_conv2d.py
deleted file mode 100644
index 4a91bd95811e..000000000000
--- a/python/tvm/topi/cuda/depthwise_conv2d.py
+++ /dev/null
@@ -1,319 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Schedule for depthwise_conv2d with auto fusion"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import traverse_inline
-from .. import tag
-from .. import nn
-
-# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-@autotvm.register_topi_compute("depthwise_conv2d_nchw.cuda")
-def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute depthwise_conv2d with NCHW layout."""
-    return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchw.cuda")
-def schedule_depthwise_conv2d_nchw(cfg, outs):
-    """Schedule for depthwise_conv2d nchw forward.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "depthwise_conv2d_nchw":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-
-            ##### space definition begin #####
-            n, f, y, x = s[conv].op.axis
-            cfg.define_split("tile_f", f, num_outputs=4)
-            cfg.define_split("tile_y", y, num_outputs=4)
-            cfg.define_split("tile_x", x, num_outputs=4)
-            cfg.define_knob("auto_unroll_max_step", [0, 256, 1500])
-
-            target = tvm.target.Target.current()
-            if target.kind.name in ["nvptx", "rocm"]:
-                cfg.define_knob("unroll_explicit", [1])
-            else:
-                cfg.define_knob("unroll_explicit", [0, 1])
-
-            # fallback support
-            if cfg.is_fallback:
-                ref_log = autotvm.tophub.load_reference_log(
-                    target.kind.name, target.model, "depthwise_conv2d_nchw.cuda"
-                )
-                cfg.fallback_with_reference_log(ref_log)
-                # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround
-                cfg["unroll_explicit"].val = 0
-            ##### space definition end #####
-
-            s[pad_data].compute_inline()
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            if conv.op in s.outputs:
-                output = conv
-                OL = s.cache_write(conv, "local")
-            else:
-                output = s.outputs[0].output(0)
-                s[conv].set_scope("local")
-                OL = conv
-
-            # create cache stage
-            AA = s.cache_read(pad_data, "shared", [OL])
-            WW = s.cache_read(kernel, "shared", [OL])
-            AL = s.cache_read(AA, "local", [OL])
-            WL = s.cache_read(WW, "local", [OL])
-
-            # tile and bind spatial axes
-            n, f, y, x = s[output].op.axis
-            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            kernel_scope, n = s[output].split(n, nparts=1)
-            bf = s[output].fuse(n, bf)
-            s[output].bind(bf, te.thread_axis("blockIdx.z"))
-            s[output].bind(by, te.thread_axis("blockIdx.y"))
-            s[output].bind(bx, te.thread_axis("blockIdx.x"))
-            s[output].bind(vf, te.thread_axis("vthread"))
-            s[output].bind(vy, te.thread_axis("vthread"))
-            s[output].bind(vx, te.thread_axis("vthread"))
-            s[output].bind(tf, te.thread_axis("threadIdx.z"))
-            s[output].bind(ty, te.thread_axis("threadIdx.y"))
-            s[output].bind(tx, te.thread_axis("threadIdx.x"))
-            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-            s[OL].compute_at(s[output], tx)
-
-            # cooperative fetching
-            s[AA].compute_at(s[output], bx)
-            s[WW].compute_at(s[output], bx)
-            s[AL].compute_at(s[output], tx)
-            s[WL].compute_at(s[output], tx)
-
-            for load in [AA, WW]:
-                fused = s[load].fuse(*list(s[load].op.axis))
-                fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
-                fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
-                fused, tz = s[load].split(fused, cfg["tile_f"].size[2])
-                s[load].bind(tz, te.thread_axis("threadIdx.z"))
-                s[load].bind(ty, te.thread_axis("threadIdx.y"))
-                s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-            s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-            s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def schedule_depthwise_conv2d_nhwc(outs):
-    """Schedule for depthwise_conv2d nhwc forward.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nhwc.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(temp, Filter, DepthwiseConv2d):
-        s[temp].compute_inline()
-        FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
-        if DepthwiseConv2d.op in s.outputs:
-            Output = DepthwiseConv2d
-            CL = s.cache_write(DepthwiseConv2d, "local")
-        else:
-            Output = outs[0].op.output(0)
-            s[DepthwiseConv2d].set_scope("local")
-
-        block_x = te.thread_axis("blockIdx.x")
-        thread_x = te.thread_axis("threadIdx.x")
-
-        b, h, w, c = s[Output].op.axis
-
-        # make sure the size of our parallelism is not larger than the number of threads
-        num_thread = min(
-            tvm.arith.Analyzer().simplify(temp.shape[3]).value,
-            tvm.target.Target.current().max_num_threads,
-        )
-        xoc, xic = s[Output].split(c, factor=num_thread)
-        s[Output].reorder(xoc, b, h, w, xic)
-        xo, yo, _, _ = s[Output].tile(h, w, x_factor=2, y_factor=2)
-        fused = s[Output].fuse(yo, xo)
-        fused = s[Output].fuse(fused, b)
-        fused = s[Output].fuse(fused, xoc)
-
-        s[Output].bind(fused, block_x)
-        s[Output].bind(xic, thread_x)
-
-        if DepthwiseConv2d.op in s.outputs:
-            s[CL].compute_at(s[Output], xic)
-        else:
-            s[DepthwiseConv2d].compute_at(s[Output], xic)
-
-        _, _, ci, fi = s[FS].op.axis
-        s[FS].compute_at(s[Output], fused)
-        fused = s[FS].fuse(fi, ci)
-        s[FS].bind(fused, thread_x)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule depthwise_conv2d
-        if OP.tag == "depthwise_conv2d_nhwc":
-            PaddedInput = OP.input_tensors[0]
-            Filter = OP.input_tensors[1]
-            if isinstance(Filter.op, tvm.te.ComputeOp) and "dilate" in Filter.op.tag:
-                s[Filter].compute_inline()
-            DepthwiseConv2d = OP.output(0)
-            _schedule(PaddedInput, Filter, DepthwiseConv2d)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-def schedule_depthwise_conv2d_backward_input_nhwc(outs):
-    """Schedule for depthwise_conv2d nhwc backward wrt input.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
-        backward wrt input in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d backward
-        wrt input with layout nhwc.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(Padded_out_grad, In_grad):
-        s[Padded_out_grad].compute_inline()
-
-        block_x = te.thread_axis("blockIdx.x")
-        thread_x = te.thread_axis("threadIdx.x")
-        _, h, w, c = In_grad.op.axis
-
-        fused_hwc = s[In_grad].fuse(h, w, c)
-        xoc, xic = s[In_grad].split(fused_hwc, factor=128)
-
-        s[In_grad].bind(xoc, block_x)
-        s[In_grad].bind(xic, thread_x)
-
-    def traverse(OP):
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if OP.tag == "depthwise_conv2d_backward_input_nhwc":
-            Padded_out_grad = OP.input_tensors[0]
-            Dilated_out_grad = Padded_out_grad.op.input_tensors[0]
-            s[Dilated_out_grad].compute_inline()
-            In_grad = OP.output(0)
-            _schedule(Padded_out_grad, In_grad)
-        else:
-            raise ValueError("Depthwise conv backward wrt input for non-NHWC is not supported.")
-
-    traverse(outs[0].op)
-    return s
-
-
-def schedule_depthwise_conv2d_backward_weight_nhwc(outs):
-    """Schedule for depthwise_conv2d nhwc backward wrt weight.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
-        backward wrt weight in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d backward
-        wrt weight with layout nhwc.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(Weight_grad):
-        block_x = te.thread_axis("blockIdx.x")
-        thread_y = te.thread_axis("threadIdx.y")
-        thread_x = te.thread_axis("threadIdx.x")
-
-        db, dh, dw = Weight_grad.op.reduce_axis
-
-        fused_dbdhdw = s[Weight_grad].fuse(db, dh, dw)
-        _, ki = s[Weight_grad].split(fused_dbdhdw, factor=8)
-        BF = s.rfactor(Weight_grad, ki)
-
-        fused_fwcm = s[Weight_grad].fuse(*s[Weight_grad].op.axis)
-
-        xo, xi = s[Weight_grad].split(fused_fwcm, factor=32)
-
-        s[Weight_grad].bind(xi, thread_x)
-        s[Weight_grad].bind(xo, block_x)
-
-        s[Weight_grad].bind(s[Weight_grad].op.reduce_axis[0], thread_y)
-        s[BF].compute_at(s[Weight_grad], s[Weight_grad].op.reduce_axis[0])
-
-    def traverse(OP):
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if OP.tag == "depthwise_conv2d_backward_weight_nhwc":
-            Padded_in = OP.input_tensors[1]
-            s[Padded_in].compute_inline()
-            Weight_grad = OP.output(0)
-            _schedule(Weight_grad)
-        else:
-            raise ValueError("Depthwise conv backward wrt weight for non-NHWC is not supported.")
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py
deleted file mode 100644
index ba0fa8a4c405..000000000000
--- a/python/tvm/topi/cuda/group_conv2d_nchw.py
+++ /dev/null
@@ -1,542 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-# pylint: disable=no-value-for-parameter
-"""The template for cuda group_conv2d_nchw"""
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from .injective import schedule_injective_from_existing
-from .tensor_intrin import dp4a
-from ..nn.pad import pad
-from ..nn.conv2d import unpack_NCHWc_to_nchw
-from ..nn.utils import get_pad_tuple
-from ..utils import traverse_inline, get_const_tuple, get_const_int
-from .. import nn
-
-
-def group_conv2d_nchw_int8(data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
-    """Compute group_conv2d internally using group_conv2d_nchwc layout for int8 dtype"""
-    assert data.dtype in ("int8", "uint8")
-    assert kernel.dtype in ("int8", "uint8")
-    assert data.dtype == kernel.dtype
-    packed_out = group_conv2d_NCHWc_int8(
-        data, kernel, strides, padding, dilation, groups, out_dtype
-    )
-    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
-
-
-def schedule_group_conv2d_nchw_int8(outs):
-    """Create schedule for tensors"""
-    return schedule_group_conv2d_NCHWc_int8(outs)
-
-
-@autotvm.register_topi_compute("group_conv2d_nchw.cuda")
-def group_conv2d_nchw(_, data, kernel, stride, padding, dilation, groups, out_dtype="float32"):
-    return nn.group_conv2d_nchw(data, kernel, stride, padding, dilation, groups, out_dtype)
-
-
-@autotvm.register_topi_schedule("group_conv2d_nchw.cuda")
-def schedule_group_conv2d_nchw(cfg, outs):
-    """TOPI schedule callback of group conv2d for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for group conv2d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "group_conv2d_nchw":
-            _schedule_group_conv2d_nchw_direct(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_group_conv2d_nchw_direct(cfg, s, conv):
-    """Schedule group conv2d NCHW direct template"""
-    workload = conv.op.attrs["workload"]
-    groups = get_const_int(workload[6])
-    num_filters = get_const_int(conv.shape[1])
-
-    ##### space definition begin #####
-    n, f, y, x = s[conv].op.axis
-    rc, ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_n", n, num_outputs=4)
-    cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2)
-    cfg.define_split("tile_f", cfg.axis(num_filters // groups), num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    target = tvm.target.Target.current()
-    if target.kind.name in ["nvptx", "rocm"]:
-        cfg.define_knob("unroll_explicit", [1])
-    else:
-        cfg.define_knob("unroll_explicit", [0, 1])
-
-    pad_data, kernel = s[conv].op.input_tensors
-
-    s[pad_data].compute_inline()
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AA = s.cache_read(pad_data, "shared", [OL])
-    WW = s.cache_read(kernel, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, f, y, x = s[output].op.axis
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    g, f = s[output].split(f, nparts=groups)
-    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-    bg, vg = cfg["tile_g"].apply(s, output, g)
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi)
-    s[output].bind(bn, te.thread_axis("blockIdx.z"))
-    s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y"))
-    s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x"))
-    s[output].bind(vn, te.thread_axis("vthread"))
-    s[output].bind(vg, te.thread_axis("vthread"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-
-    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
-    if cfg["fuse_yx"].val:
-        s[output].bind(tn, te.thread_axis("threadIdx.z"))
-        s[output].bind(tf, te.thread_axis("threadIdx.y"))
-        tyx = s[output].fuse(ty, tx)
-        s[output].bind(tyx, te.thread_axis("threadIdx.x"))
-        s[OL].compute_at(s[output], tyx)
-
-        # number of threads
-        n_tz = cfg["tile_n"].size[2]
-        n_ty = cfg["tile_f"].size[2]
-        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
-    else:
-        s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z"))
-        s[output].bind(ty, te.thread_axis("threadIdx.y"))
-        s[output].bind(tx, te.thread_axis("threadIdx.x"))
-        s[OL].compute_at(s[output], tx)
-
-        # number of threads
-        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
-        n_ty = cfg["tile_y"].size[2]
-        n_tx = cfg["tile_x"].size[2]
-
-    # tile reduction axes
-    n, f, y, x = s[OL].op.axis
-    rc, ry, rx = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rc"].apply(s, OL, rc)
-    ryo, ryi = cfg["tile_rx"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_ry"].apply(s, OL, rx)
-    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        n, f, y, x = s[load].op.axis
-        fused = s[load].fuse(n, f, y, x)
-        fused, tx = s[load].split(fused, factor=n_tx)
-        fused, ty = s[load].split(fused, factor=n_ty)
-        fused, tz = s[load].split(fused, factor=n_tz)
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    N, CO, OH, OW = get_const_tuple(output.shape)
-    _, CI_div_groups, KH, KW = get_const_tuple(kernel.shape)
-    cfg.add_flop(2 * N * OH * OW * CO * CI_div_groups * KH * KW)
-
-
-@autotvm.register_topi_compute("group_conv2d_NCHWc_int8.cuda")
-def group_conv2d_NCHWc_int8(
-    cfg, data, kernel, stride, padding, dilation, groups, out_dtype="float32"
-):
-    """Group convolution operator for 'group_conv2d_NCHWc_int8'.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width] or
-        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
-
-    kernel : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] or
-        6-D with shape [num_filter_chunk, in_channel_chunk // groups, filter_height,
-        filter_width, num_filter_block, in_channel_block]
-
-    stride : int or a list/tuple of two ints
-        Stride size, or [stride_height, stride_width]
-
-    padding : int or str
-        Padding size, or ['VALID', 'SAME']
-
-    dilation : int or a list/tuple of two ints
-        dilation size, or [dilation_height, dilation_width]
-
-    groups : int
-        number of groups
-
-    out_dtype : str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        5-D with shape [batch, out_channel, out_height, out_width, out_channel_block]
-    """
-    ic_block_factor = 4
-    oc_block_factor = 4
-
-    pre_computed = len(kernel.shape) == 6
-    if not pre_computed:
-        batch, channels, height, width = get_const_tuple(data.shape)
-        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
-
-        assert channels % groups == 0, "input channels must divide group size"
-        assert out_channels % groups == 0, "output channels must divide group size"
-        assert (
-            channels % ic_block_factor == 0
-        ), f"Number of input channels per group must divide {ic_block_factor}"
-        assert (
-            out_channels % oc_block_factor == 0
-        ), f"Number of output channels per group must divide {oc_block_factor}"
-
-        packed_data = te.compute(
-            (batch, channels // ic_block_factor, height, width, ic_block_factor),
-            lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w],
-            name="packed_data",
-        )
-        packed_kernel = te.compute(
-            (
-                out_channels // oc_block_factor,
-                in_channels // ic_block_factor,
-                kernel_h,
-                kernel_w,
-                oc_block_factor,
-                ic_block_factor,
-            ),
-            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[
-                oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw
-            ],
-            name="packed_kernel",
-        )
-    else:
-        packed_data = data
-        packed_kernel = kernel
-
-    batch, ic_chunk, in_height, in_width, _ = get_const_tuple(packed_data.shape)
-    oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(packed_kernel.shape)
-
-    # TODO(kumasento): these assertions ensure that the number of groups
-    # should be smaller or equal to the number of blocks, so that each
-    # group will have at least one block.
-    # Shall we pad the channels to avoid raising assertions?
-    assert (
-        groups <= oc_chunk
-    ), f"Number of groups {groups} should be less than output channel chunk size {oc_chunk}"
-    assert (
-        groups <= ic_chunk
-    ), f"Number of groups {groups} should be less than input channel chunk size {ic_chunk}"
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    # pad the input data
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
-    pad_before = [0, 0, pad_top, pad_left, 0]
-    pad_after = [0, 0, pad_down, pad_right, 0]
-    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
-
-    # compute the output shape
-    out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
-    out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1
-
-    oshape = (batch, oc_chunk, out_height, out_width, oc_block)
-
-    icc = te.reduce_axis((0, ic_chunk // groups), name="ic_chunk")
-    icb = te.reduce_axis((0, ic_block_factor), name="ic_block")
-    kh = te.reduce_axis((0, kernel_h), name="kh")
-    kw = te.reduce_axis((0, kernel_w), name="kw")
-
-    # NOTE(kumasento): explanation of this snippet -
-    # oc_chunk//groups and ic_chunk//groups give you the number of blocks,
-    # i.e., chunk, per group.
-    # occ is the ID of the output channel block, so that occ//(oc_chunk//groups)
-    # produces the ID of the group.
-    # Multiplying that result with ic_chunk//groups resulting in the ID
-    # of the beginning block of the corresponding input group.
-    # Adding the block offset (icc) will give you the exact block ID.
-    #
-    # Compared with a normal convolution, group convolution only sums
-    # input channels from the group that an output channel resides in.
-    conv = te.compute(
-        oshape,
-        lambda n, occ, oh, ow, ocb: te.sum(
-            pad_data[
-                n,
-                occ // (oc_chunk // groups) * (ic_chunk // groups) + icc,
-                oh * stride_h + kh * dilation_h,
-                ow * stride_w + kw * dilation_w,
-                icb,
-            ].astype("int32")
-            * packed_kernel[occ, icc, kh, kw, ocb, icb].astype("int32"),
-            axis=[icc, kh, kw, icb],
-        ),
-    )
-
-    # Type conversion
-    output = te.compute(
-        oshape, lambda *index: conv(*index).astype(out_dtype), tag="group_conv2d_NCHWc_int8"
-    )
-
-    num_flop = (
-        batch
-        * oc_chunk
-        * oc_block
-        * out_height
-        * out_width
-        * ic_chunk
-        * ic_block
-        * kernel_h
-        * kernel_w
-        * 2
-        // groups
-    )
-    cfg.add_flop(num_flop)
-
-    return output
-
-
-@autotvm.register_topi_schedule("group_conv2d_NCHWc_int8.cuda")
-def schedule_group_conv2d_NCHWc_int8(cfg, outs):
-    """TOPI schedule callback of group conv2d for cuda gpu
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for group conv2d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "group_conv2d_NCHWc_int8":
-            _schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_group_conv2d_NCHWc_int8(cfg, s, output):
-    """Schedule group conv2d int8 NCHWc template"""
-    workload = output.op.attrs["workload"]
-    groups = get_const_int(workload[6])
-
-    conv = output.op.input_tensors[0]
-    packed_data, packed_kernel = conv.op.input_tensors
-
-    if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag:
-        pad_data = packed_data
-        packed_data = pad_data.op.input_tensors[0]
-    else:
-        pad_data = packed_data
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # skip this part during tuning to make records accurate
-        # this part will be pre-computed during NNVM's pre-compute optimization pass
-        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
-        s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
-    else:
-        if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel":
-            # data and kernel are not pre-computed, schedule layout transform here
-            schedule_injective_from_existing(s, packed_data)
-            schedule_injective_from_existing(s, packed_kernel)
-
-    if pad_data != packed_data:
-        s[pad_data].compute_inline()
-
-    # create cache stage
-    AA = s.cache_read(pad_data, "shared", [conv])
-    WW = s.cache_read(packed_kernel, "shared", [conv])
-
-    s[conv].set_scope("local")
-
-    # handle bias
-    if output.op not in s.outputs:
-        s[output].compute_inline()
-        output = s.outputs[0].output(0)
-
-    oc_chunk = get_const_int(output.shape[1])
-    # tile and bind spatial axes
-    if len(s[output].op.axis) == 5:
-        n, f, y, x, c = s[output].op.axis
-    else:
-        # For task extraction of auto-tuning, the expected output is 4D.  Since auto-tuning tasks
-        # are created from scratch, therefore the real auto-tuning will still happen on 5D output.
-        n, f, y, x = s[output].op.axis
-
-    cfg.define_split("tile_n", n, num_outputs=4)
-    cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2)
-    cfg.define_split("tile_f", cfg.axis(oc_chunk // groups), num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-
-    # this is the scope to attach global config inside this kernel
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    g, f = s[output].split(f, nparts=groups)
-    s[output].bind(n, te.thread_axis("blockIdx.z"))
-    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
-    bg, vg = cfg["tile_g"].apply(s, output, g)
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi)
-    s[output].bind(bn, te.thread_axis("blockIdx.z"))
-    s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y"))
-    s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x"))
-    s[output].bind(vn, te.thread_axis("vthread"))
-    s[output].bind(vg, te.thread_axis("vthread"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
-    if cfg["fuse_yx"].val:
-        s[output].bind(tn, te.thread_axis("threadIdx.z"))
-        s[output].bind(tf, te.thread_axis("threadIdx.y"))
-        tyx = s[output].fuse(ty, tx)
-        s[output].bind(tyx, te.thread_axis("threadIdx.x"))
-        s[conv].compute_at(s[output], tyx)
-
-        # number of threads
-        n_tz = cfg["tile_n"].size[2]
-        n_ty = cfg["tile_f"].size[2]
-        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
-    else:
-        s[output].bind(tn, te.thread_axis("threadIdx.z"))
-        s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z"))
-        s[output].bind(ty, te.thread_axis("threadIdx.y"))
-        s[output].bind(tx, te.thread_axis("threadIdx.x"))
-        s[conv].compute_at(s[output], tx)
-
-        # number of threads
-        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
-        n_ty = cfg["tile_y"].size[2]
-        n_tx = cfg["tile_x"].size[2]
-
-    # tile and bind reduction axes
-    n, f, y, x, c = s[conv].op.axis
-    rc, ry, rx, rc_block = s[conv].op.reduce_axis
-    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
-    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
-    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
-    rco, rci = cfg["tile_rc"].apply(s, conv, rc)
-    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
-
-    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
-    _, rc_block = s[conv].split(rc_block, factor=4)
-    target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
-    if do_tensorize:
-        dtypes = (pad_data.dtype, packed_kernel.dtype)
-        s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes))
-
-    s[AA].compute_at(s[conv], rxo)
-    s[WW].compute_at(s[conv], rxo)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        c = s[load].op.axis[-1]
-        c_outer, c = s[load].split(c, factor=4)
-        s[load].vectorize(c)
-        fused = s[load].op.axis[:-1] + [c_outer]
-        fused = s[load].fuse(*fused)
-
-        fused, tx = s[load].split(fused, factor=n_tx)
-        fused, ty = s[load].split(fused, factor=n_ty)
-        fused, tz = s[load].split(fused, factor=n_tz)
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # double buffer
-    cfg.define_knob("AA_double_buffer", [0, 1])
-    cfg.define_knob("WW_double_buffer", [0, 1])
-    if cfg["AA_double_buffer"].val:
-        s[AA].double_buffer()
-    if cfg["WW_double_buffer"].val:
-        s[WW].double_buffer()
-
-    # unroll
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", False)
-
-    return s
diff --git a/python/tvm/topi/cuda/injective.py b/python/tvm/topi/cuda/injective.py
deleted file mode 100644
index 0faddc31c25a..000000000000
--- a/python/tvm/topi/cuda/injective.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable,
-"""Schedule for composition of injective operator"""
-import numpy as np
-
-import tvm
-from tvm import te
-from .. import utils
-
-
-def schedule_injective_from_existing(sch, out):
-    """Schedule for injective op from existing schedule.
-
-    Parameters
-    ----------
-    sch: Schedule
-         The schedule to update.
-    out: Tensor
-         The tensor representing the injective op.
-
-    Returns
-    -------
-    sch: Schedule
-         The updated schedule.
-    """
-
-    def find_nearest_small_factor(num, target):
-        """Find the nearest factor of the given number that is smaller than the target."""
-        for i in range(target, 0, -1):
-            if num % i == 0:
-                return i
-        # Unreachable because i=1 must hold.
-        return -1
-
-    fused = sch[out].fuse(*sch[out].op.axis)
-    num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
-    max_block = 256
-
-    # Vectorize on fp16 data type to enable half2 for better memory bandwidth utilization.
-    vector_width = 2 if out.dtype == "float16" else 1
-
-    is_dynamic_output = False
-    for dim in out.shape:
-        if not isinstance(dim, tvm.tir.IntImm):
-            is_dynamic_output = True
-            break
-
-    out_len = utils.prod(out.shape)
-
-    try:
-        const_size = utils.get_const_int(out_len)
-
-        # Adjust block and thread to make sure they are dividable so that vectorize can be
-        # correctly applied.
-        if vector_width > 1 and const_size % vector_width == 0:
-            remain_total_size = const_size // vector_width
-            cand_sizes = []
-            for max_size in [num_thread, max_block]:
-                cand_sizes.append(
-                    max_size
-                    if remain_total_size % max_size == 0
-                    else find_nearest_small_factor(remain_total_size, max_size)
-                )
-                remain_total_size //= cand_sizes[-1]
-
-            # If the product of candidate dividable (block * thread) is too small,
-            # then the performance may be worse even half2 is enabled. Note that 0.7
-            # is just a heuristic ratio and may not be optimal for all workloads.
-            if np.prod(cand_sizes) / (max_block * num_thread) >= 0.7:
-                num_thread, max_block = cand_sizes
-
-        need_block_split = const_size > max_block * num_thread * vector_width
-    except ValueError:
-        need_block_split = False
-        const_size = 0
-
-    if vector_width > 1:
-        fused, v = sch[out].split(fused, vector_width)
-        sch[out].vectorize(v)
-
-    if need_block_split:
-        xo, xi = sch[out].split(fused, factor=num_thread * max_block)
-        bx, tx = sch[out].split(xi, factor=num_thread)
-        sch[out].reorder(bx, tx, xo)
-        sch[out].bind(bx, te.thread_axis("blockIdx.x"))
-        sch[out].bind(tx, te.thread_axis("threadIdx.x"))
-    else:
-        # Use less threads for dynamic shape ops to avoid runtime error.
-        if is_dynamic_output:
-            num_thread //= 2
-        if const_size != 0 and const_size < num_thread:
-            bx, tx = sch[out].split(fused, factor=const_size)
-        else:
-            bx, tx = sch[out].split(fused, factor=num_thread)
-        sch[out].bind(tx, te.thread_axis("threadIdx.x"))
-        sch[out].bind(bx, te.thread_axis("blockIdx.x"))
-
-    return sch
-
-
-def schedule_injective(outs):
-    """Schedule for injective op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    tvm.te.schedule.AutoInlineInjective(s)
-    for out in outs:
-        if not utils.is_empty_shape(out.shape):
-            schedule_injective_from_existing(s, out)
-    return s
-
-
-schedule_elemwise = schedule_injective
-schedule_broadcast = schedule_injective
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
deleted file mode 100644
index f258bffc3e8f..000000000000
--- a/python/tvm/topi/cuda/nms.py
+++ /dev/null
@@ -1,1149 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison
-# pylint: disable=bad-continuation, unused-argument
-"""Non-maximum suppression operator"""
-import tvm
-from tvm import te
-from tvm.contrib import nvcc
-from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
-from tvm.ir import register_intrin_lowering
-from tvm.tir import if_then_else
-from .sort import argsort, argsort_thrust
-from .scan import exclusive_scan
-from ..utils import ceil_div
-from ..math import cast
-from ..transform import reshape
-from ..vision.nms_util import (
-    calculate_overlap,
-    binary_search,
-    collect_selected_indices,
-    collect_selected_indices_and_scores,
-    run_all_class_nms,
-)
-
-
-def cuda_atomic_add_rule(op):
-    if op.dtype == "float32":
-        return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
-    if op.dtype == "float64":
-        return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
-    if op.dtype == "int32":
-        return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
-    raise RuntimeError("only support int32, float32 and float64")
-
-
-def opencl_atomic_add_rule(op):
-    if op.dtype == "int32":
-        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
-    elif op.dtype == "float32":
-        return tvm.tir.call_pure_extern("float32", "atomic_add", op.args[0], op.args[1])
-    raise RuntimeError("only support int32, float32")
-
-
-register_intrin_lowering("tir.atomic_add", target="cuda", f=cuda_atomic_add_rule, level=99)
-
-register_intrin_lowering("tir.atomic_add", target="opencl", f=opencl_atomic_add_rule, level=99)
-
-
-def atomic_add(x, y):
-    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
-
-
-def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index):
-    """Low level IR to identify bounding boxes given a score threshold.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
-
-    score_threshold : Buffer or float32
-        Lower limit of score for valid bounding boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    Returns
-    -------
-    valid_boxes: Buffer
-        2D Buffer  indicating valid boxes with shape [batch_size, num_anchors].
-
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    elem_length = data.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-
-    valid_boxes = ib.buffer_ptr(valid_boxes)
-    if isinstance(score_threshold, float):
-        score_threshold = tvm.tir.FloatImm("float32", score_threshold)
-    id_index = tvm.tir.IntImm("int32", id_index)
-    score_index = tvm.tir.IntImm("int32", score_index)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(num_anchors, max_threads)
-        nthread_by = batch_size
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        tid = bx * max_threads + tx
-
-        with ib.if_scope(tid < num_anchors):
-            i = by
-            j = tid
-            score = data[(i * num_anchors + j) * elem_length + score_index]
-            with ib.if_scope(
-                tvm.tir.all(
-                    score > score_threshold,
-                    tvm.tir.any(
-                        id_index < 0, data[(i * num_anchors + j) * elem_length + id_index] >= 0
-                    ),
-                )
-            ):
-                valid_boxes[i * num_anchors + j] = 1
-            with ib.else_scope():
-                valid_boxes[i * num_anchors + j] = 0
-    return ib.get()
-
-
-def get_valid_counts_ir(data, valid_indices, valid_boxes, out, out_indices):
-    """Low level IR to get valid count of bounding boxes
-    given a score threshold. Also prepares to move valid boxes to the
-    top of input data.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
-
-    valid_indices: Buffer
-        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
-
-    Returns
-    -------
-    out : Buffer
-        Sorted valid boxes
-
-    out_indices : Buffer
-        Incidices of valid boxes in original data
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    elem_length = data.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-    valid_indices = ib.buffer_ptr(valid_indices)
-    valid_boxes = ib.buffer_ptr(valid_boxes)
-
-    out = ib.buffer_ptr(out)
-    out_indices = ib.buffer_ptr(out_indices)
-    one = tvm.tir.const(1, dtype=out.dtype)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = num_anchors // max_threads + 1
-    nthread_by = batch_size
-    with ib.new_scope():
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < num_anchors):
-            i = by
-            j = tid
-            with ib.for_range(0, elem_length) as k:
-                out[(i * num_anchors + j) * elem_length + k] = -one
-            out_indices[i * num_anchors + j] = -1
-    with ib.new_scope():
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < num_anchors):
-            i = by
-            j = tid
-            with ib.if_scope(valid_boxes[i, tid] > 0):
-                with ib.for_range(0, elem_length) as k:
-                    out[(i * num_anchors + valid_indices[i, tid]) * elem_length + k] = data[
-                        (i * num_anchors + j) * elem_length + k
-                    ]
-                out_indices[i * num_anchors + valid_indices[i, tid]] = j
-    return ib.get()
-
-
-def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
-    """Get valid count of bounding boxes given a score threshold.
-    Also moves valid boxes to the top of input data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].
-
-    score_threshold : optional, tvm.te.Tensor or float
-        Lower limit of score for valid bounding boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    Returns
-    -------
-    valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes.
-
-    out_tensor : tvm.te.Tensor
-        Rearranged data tensor.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    valid_boxes_buf = tvm.tir.decl_buffer(
-        (batch_size, num_anchors), "int32", "valid_boxes_buf", data_alignment=8
-    )
-    valid_boxes = te.extern(
-        [(batch_size, num_anchors)],
-        [data],
-        lambda ins, outs: get_valid_boxes_ir(
-            ins[0], outs[0], score_threshold, id_index, score_index
-        ),
-        dtype=["int32"],
-        in_buffers=[data_buf],
-        out_buffers=[valid_boxes_buf],
-        name="get_valid_boxes",
-        tag="get_valid_boxes_gpu",
-    )
-
-    valid_indices_buf = tvm.tir.decl_buffer(
-        (batch_size, num_anchors), "int32", "valid_indices_buf", data_alignment=8
-    )
-
-    valid_indices, valid_count = exclusive_scan(valid_boxes, axis=1, return_reduction=True)
-
-    out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8)
-    out_indices_buf = tvm.tir.decl_buffer(
-        (batch_size, num_anchors), "int32", "out_buf", data_alignment=8
-    )
-
-    out, out_indices = te.extern(
-        [data.shape, (batch_size, num_anchors)],
-        [data, valid_indices, valid_boxes],
-        lambda ins, outs: get_valid_counts_ir(ins[0], ins[1], ins[2], outs[0], outs[1]),
-        dtype=["int32", data.dtype],
-        in_buffers=[data_buf, valid_indices_buf, valid_boxes_buf],
-        out_buffers=[out_buf, out_indices_buf],
-        name="get_valid_counts",
-        tag="get_valid_counts_gpu",
-    )
-
-    return [valid_count, out, out_indices]
-
-
-def _nms_loop(
-    ib,
-    batch_size,
-    top_k,
-    iou_threshold,
-    max_output_size,
-    valid_count,
-    on_new_valid_box_func,
-    on_new_invalidated_box_func,
-    needs_bbox_check_func,
-    calc_overlap_func,
-    out_scores,
-    num_valid_boxes,
-):
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-
-    with ib.new_scope():
-        nthread_by = batch_size
-        nthread_tx = max_threads
-
-        # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
-        # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
-        # be exceeded with 1024 threads.
-        target = tvm.target.Target.current(allow_none=False)
-        if target.kind.name == "cuda":
-            if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
-                nthread_tx = 512
-
-        by = te.thread_axis("blockIdx.y")
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-
-        num_valid_boxes_local = ib.allocate(
-            "int32", (1,), name="num_valid_boxes_local", scope="local"
-        )
-        num_valid_boxes_local[0] = 0
-
-        def nms_inner_loop(ib, i, j, nkeep):
-            # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
-            on_new_valid_box_func(ib, tx, num_valid_boxes_local[0], i, j)
-            num_valid_boxes_local[0] += 1
-
-            num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)
-
-            with ib.for_range(0, num_iter_per_thread, name="_k") as _k:
-                k = j + 1 + _k * nthread_tx + tx
-
-                with ib.if_scope(
-                    tvm.tir.all(
-                        k < nkeep,
-                        out_scores[i, k] > 0,  # is the box k still valid?
-                        needs_bbox_check_func(i, j, k),
-                    )
-                ):
-                    iou = calc_overlap_func(i, j, k)
-
-                    with ib.if_scope(iou >= iou_threshold):
-                        # invalidate the box k
-                        out_scores[i, k] = -1.0
-                        on_new_invalidated_box_func(i, k)
-
-                ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
-
-        i = by
-
-        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
-        max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep)
-
-        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
-            # Apply nms
-            # No need to do more iteration if we have already reached max_output_size boxes
-            box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
-            box_idx[0] = 0
-            with ib.while_loop(
-                tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
-            ):
-                # Proceed to the inner loop if the box with id box_idx is still valid
-                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
-                    nms_inner_loop(ib, i, box_idx[0], nkeep)
-                box_idx[0] += 1
-
-            with ib.if_scope(tx + 0 == 0):
-                num_valid_boxes[i] = num_valid_boxes_local[0]
-
-        with ib.else_scope():
-            num_valid_boxes[i] = 0
-
-    return ib.get()
-
-
-def nms_ir(
-    data,
-    sorted_index,
-    valid_count,
-    indices,
-    out_bboxes,
-    out_scores,
-    out_class_ids,
-    out_features,
-    box_indices,
-    num_valid_boxes,
-    max_output_size,
-    iou_threshold,
-    force_suppress,
-    top_k,
-    coord_start,
-    id_index,
-    score_index,
-    return_indices,
-):
-    """Low level IR routing for transform location in multibox_detection operator.
-
-    Parameters
-    ----------
-    data : Buffer
-        Buffer of output boxes with class and score.
-
-    sorted_index : Buffer
-        Buffer of output box indexes sorted by score.
-
-    valid_count : Buffer
-        Buffer of number of valid output boxes.
-
-    indices : Buffer
-        indices in original tensor, with shape [batch_size, num_anchors],
-        represents the index of box in original data. It could be the third
-        output out_indices of get_valid_counts. The values in the second
-        dimension are like the output of arange(num_anchors) if get_valid_counts
-        is not used before non_max_suppression.
-
-    out_bboxes : Buffer
-        Output buffer, to be filled with sorted box coordinates.
-
-    out_scores : Buffer
-        Output buffer, to be filled with sorted scores.
-
-    out_class_ids : Buffer
-        Output buffer, to be filled with sorted class ids.
-
-    box_indices : Buffer
-        A indices tensor mapping sorted indices to original indices
-        This is the first output of NMS when return_indices=True.
-
-    num_valid_boxes : Buffer
-        Record the number of boxes that have survived IOU tests.
-        This is the second output of NMS when return_indices=True.
-
-    max_output_size : int
-        Max number of output valid boxes for each instance.
-        By default all valid boxes are returned.
-
-    iou_threshold : float
-        Overlapping(IoU) threshold to suppress object with smaller score.
-
-    force_suppress : boolean
-        Whether to suppress all detections regardless of class_id.
-
-    top_k : int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : int
-        Start index of the consecutive 4 coordinates.
-
-    id_index : int
-        index of the class categories, -1 to disable.
-
-    score_index : optional, int
-        Index of the scores/confidence of boxes.
-
-    return_indices : boolean
-        Whether to return box indices in input data.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    box_data_length = data.shape[2]
-    num_features = out_features.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-    sorted_index = ib.buffer_ptr(sorted_index)
-    valid_count = ib.buffer_ptr(valid_count)
-    indices = ib.buffer_ptr(indices)
-
-    # outputs
-    out_bboxes = ib.buffer_ptr(out_bboxes)
-    out_scores = ib.buffer_ptr(out_scores)
-    out_class_ids = ib.buffer_ptr(out_class_ids)
-    out_features = ib.buffer_ptr(out_features)
-    box_indices = ib.buffer_ptr(box_indices)
-    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
-
-    if isinstance(iou_threshold, float):
-        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
-    top_k = tvm.tir.IntImm("int32", top_k)
-    coord_start = tvm.tir.IntImm("int32", coord_start)
-    id_index = tvm.tir.IntImm("int32", id_index)
-    score_index = tvm.tir.IntImm("int32", score_index)
-    force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(num_anchors, max_threads)
-        nthread_by = batch_size
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        i = by
-        base_src_idx = i * num_anchors * box_data_length
-        base_bbox_idx = i * num_anchors * 4
-        base_features_idx = i * num_anchors * num_features
-
-        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
-            # Reorder output
-            nkeep = if_then_else(
-                tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]
-            )
-            j = bx * max_threads + tx
-            with ib.if_scope(j < nkeep):
-                src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
-                with ib.for_range(0, 4, kind="unroll") as k:
-                    out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
-                with ib.for_range(0, num_features, kind="unroll") as k:
-                    out_features[(base_features_idx + j * num_features + k)] = data[
-                        src_idx + coord_start + 4 + k
-                    ]
-
-                out_scores[i * num_anchors + j] = data[src_idx + score_index]
-
-                if id_index >= 0:
-                    out_class_ids[i * num_anchors + j] = data[src_idx + id_index]
-
-            with ib.else_scope():
-                # Indices > nkeep are discarded
-                # Only needed for return_indices = False case
-                if return_indices is False:
-                    with ib.if_scope(j < num_anchors):
-                        with ib.for_range(0, 4, kind="unroll") as k:
-                            out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
-                        with ib.for_range(0, num_features, kind="unroll") as k:
-                            out_features[(base_features_idx + j * num_features + k)] = -1.0
-
-                        out_scores[i, j] = -1.0
-
-                        if id_index >= 0:
-                            out_class_ids[i, j] = -1.0
-
-            if return_indices:
-                with ib.if_scope(j < num_anchors):
-                    box_indices[i * num_anchors + j] = -1
-
-        with ib.else_scope():
-            # Need to copy all boxes if not using return_indices
-            bounds = valid_count[i] if return_indices else num_anchors
-            with ib.if_scope(j < bounds):
-                src_offset = base_src_idx + j * box_data_length
-
-                with ib.for_range(0, 4, kind="unroll") as k:
-                    out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
-                with ib.for_range(0, num_features, kind="unroll") as k:
-                    out_features[(base_features_idx + j * num_features + k)] = data[
-                        src_offset + coord_start + 4 + k
-                    ]
-                out_scores[i * num_anchors + j] = data[src_offset + score_index]
-
-                if id_index >= 0:
-                    out_class_ids[i * num_anchors + j] = data[src_offset + id_index]
-
-                box_indices[i * num_anchors + j] = j
-
-    if isinstance(max_output_size, int):
-        max_output_size = tvm.tir.const(max_output_size)
-
-    def calc_overlap(i, j, k):
-        offset_j = j * 4
-        offset_k = k * 4
-        base_bbox_idx = i * num_anchors * 4
-        return calculate_overlap(
-            out_bboxes,
-            base_bbox_idx + offset_j,
-            base_bbox_idx + offset_k,
-        )
-
-    def on_new_valid_box(ib, tid, num_current_valid_box, i, j):
-        # When return_indices is False, no need to populate box_indices
-        if return_indices:
-            with ib.if_scope(tid + 0 == 0):
-                orig_idx = sorted_index[i * num_anchors + j]
-                box_indices[i, num_current_valid_box] = indices[i, orig_idx]
-
-    def on_new_invalidated_box(i, k):
-        if return_indices is False and id_index >= 0:
-            out_class_ids[i, k] = -1.0
-
-    def needs_bbox_check(i, j, k):
-        return tvm.tir.any(
-            force_suppress > 0,
-            id_index < 0,
-            out_class_ids[i, k] == out_class_ids[i, j],
-        )
-
-    return _nms_loop(
-        ib,
-        batch_size,
-        top_k,
-        iou_threshold,
-        max_output_size,
-        valid_count,
-        on_new_valid_box,
-        on_new_invalidated_box,
-        needs_bbox_check,
-        calc_overlap,
-        out_scores,
-        num_valid_boxes,
-    )
-
-
-def _fetch_score_ir(data, score, axis):
-    """
-    Fetch score from data.
-    This routine is required for dynamic shape nms.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    elem_length = data.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-    score = ib.buffer_ptr(score)
-    with ib.if_scope(num_anchors > 0):
-        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-        nthread_tx = max_threads
-        nthread_bx = batch_size * num_anchors // max_threads + 1
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < batch_size * num_anchors):
-            score[tid] = data[tid * elem_length + axis]
-
-    return ib.get()
-
-
-def _dispatch_sort(scores, ret_type="indices"):
-    target = tvm.target.Target.current()
-    if target and (
-        can_use_thrust(target, "tvm.contrib.thrust.sort")
-        or can_use_rocthrust(target, "tvm.contrib.thrust.sort")
-    ):
-        return argsort_thrust(scores, axis=1, is_ascend=False, dtype="int32", ret_type=ret_type)
-    return argsort(scores, axis=1, is_ascend=False, dtype="int32", ret_type=ret_type)
-
-
-def _get_sorted_indices(data, data_buf, score_index, score_shape):
-    """Extract a 1D score tensor from the packed input and do argsort on it."""
-    score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8)
-    score_tensor = te.extern(
-        [score_shape],
-        [data],
-        lambda ins, outs: _fetch_score_ir(
-            ins[0],
-            outs[0],
-            score_index,
-        ),
-        dtype=[data.dtype],
-        in_buffers=[data_buf],
-        out_buffers=[score_buf],
-        name="fetch_score",
-        tag="fetch_score",
-    )
-    return _dispatch_sort(score_tensor)
-
-
-def _run_nms(
-    data,
-    data_buf,
-    sort_tensor,
-    valid_count,
-    indices,
-    max_output_size,
-    iou_threshold,
-    force_suppress,
-    top_k,
-    coord_start,
-    id_index,
-    score_index,
-    return_indices,
-):
-    """Run NMS using sorted scores."""
-    sort_tensor_buf = tvm.tir.decl_buffer(
-        sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8
-    )
-
-    valid_count_dtype = "int32"
-    valid_count_buf = tvm.tir.decl_buffer(
-        valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4
-    )
-    indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8)
-
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    # Number of extra features per box beyond coords, score, and id.
-    num_features = data.shape[2] - 6 if id_index >= 0 else data.shape[2] - 5
-
-    # output shapes
-    bbox_shape = (batch_size, num_anchors, 4)
-    score_shape = (batch_size, num_anchors)
-    class_id_shape = score_shape
-    out_features_shape = (batch_size, num_anchors, num_features)
-    box_indices_shape = score_shape
-    num_valid_boxes_shape = (batch_size, 1)
-
-    return te.extern(
-        [
-            bbox_shape,
-            score_shape,
-            class_id_shape,
-            out_features_shape,
-            box_indices_shape,
-            num_valid_boxes_shape,
-        ],
-        [data, sort_tensor, valid_count, indices],
-        lambda ins, outs: nms_ir(
-            ins[0],
-            ins[1],
-            ins[2],
-            ins[3],
-            outs[0],  # sorted bbox
-            outs[1],  # sorted scores
-            outs[2],  # sorted class ids
-            outs[3],  # sorted box feats
-            outs[4],  # box_indices
-            outs[5],  # num_valid_boxes
-            max_output_size,
-            iou_threshold,
-            force_suppress,
-            top_k,
-            coord_start,
-            id_index,
-            score_index,
-            return_indices,
-        ),
-        dtype=[data.dtype, "float32", "float32", "float32", "int32", "int32"],
-        in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
-        name="nms",
-        tag="nms",
-    )
-
-
-def _concatenate_outputs(
-    out_bboxes,
-    out_scores,
-    out_class_ids,
-    out_features,
-    out_shape,
-    coord_start,
-    score_index,
-    id_index,
-):
-    """Pack the results from NMS into a single 5D or 6D tensor."""
-    batch_size = out_bboxes.shape[0]
-    num_anchors = out_bboxes.shape[1]
-    num_features = out_features.shape[2]
-
-    def ir(out_bboxes, out_scores, out_class_ids, out):
-        ib = tvm.tir.ir_builder.create()
-
-        out_bboxes = ib.buffer_ptr(out_bboxes)
-        out_scores = ib.buffer_ptr(out_scores)
-        out_class_ids = ib.buffer_ptr(out_class_ids)
-        out = ib.buffer_ptr(out)
-
-        with ib.if_scope(num_anchors > 0):
-            max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(num_anchors, nthread_tx)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            by = te.thread_axis("blockIdx.y")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            ib.scope_attr(by, "thread_extent", batch_size)
-
-            tid = bx * nthread_tx + tx
-            i = by
-
-            with ib.if_scope(tid < num_anchors):
-                with ib.for_range(0, 4, kind="unroll") as j:
-                    out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
-                with ib.for_range(0, num_features, kind="unroll") as j:
-                    out[i, tid, coord_start + 4 + j] = out_features[i, tid, j]
-                out[i, tid, score_index] = out_scores[i, tid]
-                if id_index >= 0:
-                    out[i, tid, id_index] = out_class_ids[i, tid]
-
-        return ib.get()
-
-    return te.extern(
-        [out_shape],
-        [out_bboxes, out_scores, out_class_ids],
-        lambda ins, outs: ir(ins[0], ins[1], ins[2], outs[0]),
-        dtype=["float32"],
-        name="nms_output_concat",
-        tag="nms_output_concat",
-    )
-
-
-def non_max_suppression(
-    data,
-    valid_count,
-    indices,
-    max_output_size=-1,
-    iou_threshold=0.5,
-    force_suppress=False,
-    top_k=-1,
-    coord_start=2,
-    score_index=1,
-    id_index=0,
-    return_indices=True,
-    invalid_to_bottom=False,
-):
-    """Non-maximum suppression operator for object detection.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, elem_length].
-        The last dimension should be in format of
-        [class_id, score, box_left, box_top, box_right, box_bottom].
-        It could be the second output out_tensor of get_valid_counts.
-
-    valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes. It could be the output
-        valid_count of get_valid_counts.
-
-    indices : tvm.te.Tensor
-        2-D tensor with shape [batch_size, num_anchors], represents
-        the index of box in original data. It could be the third
-        output out_indices of get_valid_counts. The values in the
-        second dimension are like the output of arange(num_anchors)
-        if get_valid_counts is not used before non_max_suppression.
-
-    max_output_size : optional, tvm.te.Tensor or int
-        Max number of output valid boxes for each instance.
-        By default all valid boxes are returned.
-
-    iou_threshold : optional, tvm.te.Tensor or float
-        Non-maximum suppression threshold.
-
-    force_suppress : optional, boolean
-        Whether to suppress all detections regardless of class_id.
-
-    top_k : optional, int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : required, int
-        Start index of the consecutive 4 coordinates.
-
-    score_index : optional, int
-        Index of the scores/confidence of boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    return_indices : boolean
-        Whether to return box indices in input data.
-
-    invalid_to_bottom : optional, boolean
-        Whether to move all valid bounding boxes to the top.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, elem_length].
-
-    Example
-    --------
-    .. code-block:: python
-
-        # An example to use nms
-        dshape = (1, 5, 6)
-        data = te.placeholder(dshape, name="data")
-        valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count")
-        iou_threshold = 0.7
-        force_suppress = True
-        top_k = -1
-        out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold,
-                                 force_suppress=force_supress, top_k=top_k, return_indices=False)
-        np_data = np.random.uniform(dshape)
-        np_valid_count = np.array([4])
-        s = topi.generic.schedule_nms(out)
-        f = tvm.build(s, [data, valid_count, out], "cuda")
-        dev = tvm.cuda(0)
-        tvm_data = tvm.nd.array(np_data, dev)
-        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
-        f(tvm_data, tvm_valid_count, tvm_out)
-    """
-    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-
-    sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1]))
-
-    out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes = _run_nms(
-        data,
-        data_buf,
-        sort_tensor,
-        valid_count,
-        indices,
-        max_output_size,
-        iou_threshold,
-        force_suppress,
-        top_k,
-        coord_start,
-        id_index,
-        score_index,
-        return_indices,
-    )
-
-    if return_indices:
-        return [box_indices, num_valid_boxes]
-
-    return _concatenate_outputs(
-        out_bboxes,
-        out_scores,
-        out_class_ids,
-        out_features,
-        data.shape,
-        coord_start,
-        score_index,
-        id_index,
-    )
-
-
-def _get_valid_box_count(scores, score_threshold):
-    batch_classes, num_boxes = scores.shape
-
-    def searchsorted_ir(scores, valid_count):
-        ib = tvm.tir.ir_builder.create()
-        scores = ib.buffer_ptr(scores)
-        valid_count = ib.buffer_ptr(valid_count)
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-
-        with ib.new_scope():
-            ib.scope_attr(bx, "thread_extent", ceil_div(batch_classes, max_threads))
-            ib.scope_attr(tx, "thread_extent", max_threads)
-            tid = bx * max_threads + tx
-
-            with ib.if_scope(tid < batch_classes):
-                binary_search(ib, tid, num_boxes, scores, score_threshold, valid_count)
-
-        return ib.get()
-
-    scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8)
-
-    return te.extern(
-        [(batch_classes,)],
-        [scores],
-        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[scores_buf],
-        name="searchsorted",
-        tag="searchsorted",
-    )
-
-
-def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out):
-    batch_classes, num_boxes = selected_indices.shape
-
-    ib = tvm.tir.ir_builder.create()
-
-    selected_indices = ib.buffer_ptr(selected_indices)
-    num_detections = ib.buffer_ptr(num_detections)
-    row_offsets = ib.buffer_ptr(row_offsets)
-    out = ib.buffer_ptr(out)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = ceil_div(num_boxes, nthread_tx)
-    nthread_by = batch_classes
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    by = te.thread_axis("blockIdx.y")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    ib.scope_attr(by, "thread_extent", nthread_by)
-
-    with ib.new_scope():
-        idx = bx * nthread_tx + tx
-        idy = cast(by, "int64")
-        batch_id = idy // num_class
-        class_id = idy % num_class
-        with ib.if_scope(idx < num_detections[idy]):
-            out[row_offsets[idy] + idx, 0] = batch_id
-            out[row_offsets[idy] + idx, 1] = class_id
-            out[row_offsets[idy] + idx, 2] = cast(selected_indices[idy, idx], "int64")
-
-    return ib.get()
-
-
-def _collect_selected_indices_and_scores_ir(
-    selected_indices,
-    selected_scores,
-    num_detections,
-    row_offsets,
-    num_total_detections,
-    collected_indices,
-    collected_scores,
-):
-    batch_size, num_class = row_offsets.shape
-    num_boxes = selected_indices.shape[1]
-
-    ib = tvm.tir.ir_builder.create()
-
-    selected_indices = ib.buffer_ptr(selected_indices)
-    selected_scores = ib.buffer_ptr(selected_scores)
-    num_detections = ib.buffer_ptr(num_detections)
-    row_offsets = ib.buffer_ptr(row_offsets)
-    num_total_detections = ib.buffer_ptr(num_total_detections)
-    collected_indices = ib.buffer_ptr(collected_indices)
-    collected_scores = ib.buffer_ptr(collected_scores)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = ceil_div(num_boxes, nthread_tx)
-    nthread_by = batch_size * num_class
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    by = te.thread_axis("blockIdx.y")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    ib.scope_attr(by, "thread_extent", nthread_by)
-    zero = cast(0, "int64")
-
-    with ib.new_scope():
-        idx = bx * nthread_tx + tx
-        idy = cast(by, "int64")
-        batch_id = idy // num_class
-        class_id = idy % num_class
-
-        with ib.if_scope(idx < num_detections[batch_id, class_id]):
-            offset = row_offsets[batch_id, class_id] + idx
-            collected_indices[batch_id, offset, 0] = class_id
-            collected_indices[batch_id, offset, 1] = cast(selected_indices[idy, idx], "int64")
-            collected_scores[batch_id, offset] = selected_scores[idy, idx]
-        with ib.else_scope():
-            with ib.if_scope(idx < num_boxes):
-                offset = (
-                    num_total_detections[batch_id]
-                    + class_id * num_boxes
-                    - row_offsets[batch_id, class_id]
-                    + idx
-                    - num_detections[batch_id, class_id]
-                )
-                collected_indices[batch_id, offset, 0] = zero
-                collected_indices[batch_id, offset, 1] = zero
-                collected_scores[batch_id, offset] = 0.0
-
-    return ib.get()
-
-
-def all_class_non_max_suppression(
-    boxes,
-    scores,
-    max_output_boxes_per_class,
-    iou_threshold,
-    score_threshold,
-    output_format="onnx",
-):
-    """Non-maximum suppression operator for object detection, corresponding to ONNX
-    NonMaxSuppression and TensorFlow combined_non_max_suppression.
-    NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_classes, num_boxes)
-
-    max_output_boxes_per_class : int or tvm.te.Tensor, optional
-        The maxinum number of output selected boxes per class
-
-    iou_threshold : float or tvm.te.Tensor, optionaIl
-        IoU test threshold
-
-    score_threshold : float or tvm.te.Tensor, optional
-        Score threshold to filter out low score boxes early
-
-    output_format : str, optional
-        "onnx" or "tensorflow", see below
-
-    Returns
-    -------
-    out : list of tvm.te.Tensor
-        If `output_format` is "onnx", the output is two tensors. The first is `indices` of size
-        `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor
-        `num_total_detection` of shape `(1,)` representing the total number of selected
-        boxes. The three values in `indices` encode batch, class, and box indices.
-        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come
-        first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
-        `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
-        rows are valid.
-
-        If `output_format` is "tensorflow", the output is three tensors, the first
-        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
-        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
-        `(batch_size,)` representing the total number of selected boxes per batch. The two values
-        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
-        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
-        `indices` and `scores` are sorted within each class by box scores, but not across classes.
-        So the box indices and scores for the class 0 come first in a sorted order, followed by
-        the class 1 etc.
-    """
-    batch, num_class, num_boxes = scores.shape
-
-    scores = reshape(scores, (batch * num_class, num_boxes))
-    sorted_scores, sorted_indices = _dispatch_sort(scores, ret_type="both")
-    valid_count = _get_valid_box_count(sorted_scores, score_threshold)
-
-    selected_indices, selected_scores, num_detections = run_all_class_nms(
-        boxes,
-        sorted_scores,
-        sorted_indices,
-        valid_count,
-        max_output_boxes_per_class,
-        iou_threshold,
-        _nms_loop,
-        return_scores=(output_format == "tensorflow"),
-    )
-
-    if output_format == "onnx":
-        row_offsets, num_total_detections = exclusive_scan(
-            num_detections, return_reduction=True, output_dtype="int64"
-        )
-        selected_indices = collect_selected_indices(
-            num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
-        )
-        return [selected_indices, num_total_detections]
-
-    num_detections_per_batch = reshape(num_detections, (batch, num_class))
-    row_offsets, num_total_detections = exclusive_scan(
-        num_detections_per_batch, return_reduction=True, output_dtype="int64", axis=1
-    )
-
-    selected_indices, selected_scores = collect_selected_indices_and_scores(
-        selected_indices,
-        selected_scores,
-        num_detections_per_batch,
-        row_offsets,
-        num_total_detections,
-        _collect_selected_indices_and_scores_ir,
-    )
-
-    return [selected_indices, selected_scores, num_total_detections]
diff --git a/python/tvm/topi/cuda/nn.py b/python/tvm/topi/cuda/nn.py
deleted file mode 100644
index e29bb440de35..000000000000
--- a/python/tvm/topi/cuda/nn.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""scheduler functions for cuda backend"""
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import te
-from ..utils import traverse_inline
-
-
-def schedule_lrn(outs):
-    """Schedule for LRN
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of LRN
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-
-    def _callback(op):
-        if "sqr_sum" in op.tag:
-            pad = op.input_tensors[0]
-            s[pad].compute_inline()
-            fused_axis = s[outs[0]].fuse(*s[outs[0]].op.axis)
-            bx, tx = s[outs[0]].split(fused_axis, factor=max_threads)
-            s[outs[0]].bind(bx, te.thread_axis("blockIdx.x"))
-            s[outs[0]].bind(tx, te.thread_axis("threadIdx.x"))
-            s[op].compute_at(s[outs[0]], tx)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/cuda/pooling.py b/python/tvm/topi/cuda/pooling.py
deleted file mode 100644
index a443f222b63b..000000000000
--- a/python/tvm/topi/cuda/pooling.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument
-"""Schedule for pooling operators"""
-import tvm
-from tvm import te
-from .. import tag
-from ..utils import traverse_inline
-from .reduction import _schedule_reduce
-from .injective import schedule_injective_from_existing
-
-
-def schedule_adaptive_pool(outs, layout="NCHW"):
-    """Schedule for adaptive_pool.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of adaptive_pool
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for adaptive_pool.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule_non_global(Pool):
-        if Pool.op in s.outputs:
-            Out = Pool
-            OL = s.cache_write(Pool, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Pool].set_scope("local")
-
-        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-        fused_axis = s[Out].fuse(*s[Out].op.axis)
-        bx, tx = s[Out].split(fused_axis, factor=max_threads)
-        s[Out].bind(bx, te.thread_axis("blockIdx.x"))
-        s[Out].bind(tx, te.thread_axis("threadIdx.x"))
-
-        if Pool.op in s.outputs:
-            s[OL].compute_at(s[Out], tx)
-        else:
-            s[Pool].compute_at(s[Out], tx)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule global_pool
-        elif OP.tag.startswith("adaptive_pool"):
-            Pool = OP.output(0)
-            oshape = Pool.shape
-            if (layout == "NCHW" and oshape[2] == 1 and oshape[3] == 1) or (
-                layout == "NHWC" and oshape[1] == 1 and oshape[2] == 1
-            ):
-                _schedule_reduce(OP, s)
-                if OP != outs[0].op:
-                    # the final division for adaptive pool or fused elemwise ops
-                    schedule_injective_from_existing(s, outs[0])
-            else:
-                _schedule_non_global(Pool)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-def schedule_pool(outs, layout):
-    """Schedule for pool.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of pool
-        in the format of an array of tensors.
-
-    layout: str
-        Data layout.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for pool.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(PaddedInput, Pool):
-        if isinstance(PaddedInput.op, tvm.te.ComputeOp):
-            s[PaddedInput].compute_inline()
-        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
-        if Pool.op in s.outputs:
-            Out = Pool
-            OL = s.cache_write(Pool, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Pool].set_scope("local")
-        fused = s[Out].fuse(*s[Out].op.axis)
-        bx, tx = s[Out].split(fused, factor=num_thread)
-        s[Out].bind(bx, te.thread_axis("blockIdx.x"))
-        s[Out].bind(tx, te.thread_axis("threadIdx.x"))
-        if Pool.op in s.outputs:
-            s[OL].compute_at(s[Out], tx)
-        else:
-            s[Pool].compute_at(s[Out], tx)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule pool
-        elif OP.tag.startswith("pool"):
-            PaddedInput = OP.input_tensors[0]
-            Pool = OP.output(0)
-            _schedule(PaddedInput, Pool)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-def schedule_pool_grad(outs):
-    """Schedule for pool_grad on CUDA
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of pool_grad
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for pool_grad.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule_pool_grad(op):
-        if op in s.outputs:
-            out = op
-        else:
-            out = outs[0].op.output(0)
-        fused = s[out].fuse(*s[out].op.axis)
-        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
-        bx, tx = s[out].split(fused, factor=num_thread)
-        s[out].bind(bx, te.thread_axis("blockIdx.x"))
-        s[out].bind(tx, te.thread_axis("threadIdx.x"))
-
-        if tag.COMM_REDUCE_IDX in op.input_tensors[0].op.tag:
-            max_pool_index = op.input_tensors[0]
-            s[max_pool_index].compute_at(s[out], tx)
-
-            pool_input = max_pool_index.op.input_tensors[0]
-            if isinstance(pool_input.op, tvm.te.ComputeOp):
-                # handle padding
-                s[pool_input].compute_inline()
-        if op not in s.outputs:
-            s[op].compute_at(s[out], tx)
-
-    def _callback(op):
-        if op.tag.startswith("pool_grad"):
-            _schedule_pool_grad(op)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
diff --git a/python/tvm/topi/cuda/rcnn/__init__.py b/python/tvm/topi/cuda/rcnn/__init__.py
deleted file mode 100644
index da55b070a807..000000000000
--- a/python/tvm/topi/cuda/rcnn/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Faster R-CNN and Mask R-CNN operators"""
-from .proposal import proposal
diff --git a/python/tvm/topi/cuda/rcnn/proposal.py b/python/tvm/topi/cuda/rcnn/proposal.py
deleted file mode 100644
index 12f7a23abe35..000000000000
--- a/python/tvm/topi/cuda/rcnn/proposal.py
+++ /dev/null
@@ -1,426 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, singleton-comparison, bad-continuation
-"""Proposal operator"""
-import math
-import tvm
-from tvm import te
-from ...vision.rcnn import generate_anchor, reg_bbox, reg_iou
-from ...utils import get_const_tuple, get_const_int
-
-
-def predict_bbox_ir(
-    cls_prob_buf,
-    bbox_pred_buf,
-    im_info_buf,
-    out_buf,
-    scales,
-    ratios,
-    feature_stride,
-    rpn_min_size,
-    iou_loss,
-):
-    """Predict bounding boxes based on anchors, scores and deltas.
-
-    Parameters
-    ----------
-    cls_prob_buf : tvm.te.schedule.Buffer
-        4-D with shape [batch, 2 * num_anchors, height, width]
-
-    bbox_pred_buf : tvm.te.schedule.Buffer
-        4-D with shape [batch, 4 * num_anchors, height, width]
-
-    im_info_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, 3]
-
-    out_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]
-        The last dimension is in format of [w_start, h_start, w_end, h_end, score]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape)
-    num_anchors //= 2
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = (batch * height * width) // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    tid = bx * max_threads + tx
-    ib = tvm.tir.ir_builder.create()
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-
-    p_score = ib.buffer_ptr(cls_prob_buf)
-    p_delta = ib.buffer_ptr(bbox_pred_buf)
-    p_im_info = ib.buffer_ptr(im_info_buf)
-    p_out = ib.buffer_ptr(out_buf)
-
-    idxm = tvm.tir.indexmod
-    idxd = tvm.tir.indexdiv
-
-    with ib.if_scope(tid < batch * height * width):
-        w = idxm(tid, width)
-        h = idxm(idxd(tid, width), height)
-        b = idxd(idxd(tid, width), height)
-
-        for k in range(num_anchors):
-            out_index = tid * num_anchors + k
-            ratio = ratios[k // len(scales)]
-            scale = scales[k % len(scales)]
-            anchor = generate_anchor(ratio, scale, feature_stride)
-            im_height = p_im_info[b * 3]
-            im_width = p_im_info[b * 3 + 1]
-            x1 = anchor[0] + w * feature_stride
-            y1 = anchor[1] + h * feature_stride
-            x2 = anchor[2] + w * feature_stride
-            y2 = anchor[3] + h * feature_stride
-
-            delta = [
-                p_delta[((((b * num_anchors + k) * 4 + i) * height + h) * width + w)]
-                for i in range(4)
-            ]
-            regression_func = reg_iou if iou_loss else reg_bbox
-            pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta)
-
-            pred_x1 = tvm.te.max(tvm.te.min(pred_x1, im_width - 1.0), 0.0)
-            pred_y1 = tvm.te.max(tvm.te.min(pred_y1, im_height - 1.0), 0.0)
-            pred_x2 = tvm.te.max(tvm.te.min(pred_x2, im_width - 1.0), 0.0)
-            pred_y2 = tvm.te.max(tvm.te.min(pred_y2, im_height - 1.0), 0.0)
-
-            real_height = (im_height / feature_stride).astype("int32")
-            real_width = (im_width / feature_stride).astype("int32")
-
-            bbox_w = pred_x2 - pred_x1 + 1.0
-            bbox_h = pred_y2 - pred_y1 + 1.0
-            min_size = p_im_info[b * 3 + 2] * rpn_min_size
-
-            pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w]
-            pred_score = tvm.tir.Select(
-                tvm.tir.any(h >= real_height, w >= real_width), -1.0, pred_score
-            )
-            p_out[out_index * 5 + 0] = pred_x1
-            p_out[out_index * 5 + 1] = pred_y1
-            p_out[out_index * 5 + 2] = pred_x2
-            p_out[out_index * 5 + 3] = pred_y2
-            p_out[out_index * 5 + 4] = pred_score
-
-            with ib.if_scope(tvm.tir.any(bbox_w < min_size, bbox_h < min_size)):
-                p_out[out_index * 5 + 0] -= min_size / 2.0
-                p_out[out_index * 5 + 1] -= min_size / 2.0
-                p_out[out_index * 5 + 2] += min_size / 2.0
-                p_out[out_index * 5 + 3] += min_size / 2.0
-                p_out[out_index * 5 + 4] = -1.0
-
-    return ib.get()
-
-
-def argsort_ir(data_buf, out_index_buf):
-    """Batched odd-even transposition sort.
-
-    Parameters
-    ----------
-    data_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]
-
-    out_index_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Indices of data in sorted order.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_bbox = get_const_tuple(data_buf.shape)
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    ib = tvm.tir.ir_builder.create()
-    p_data = ib.buffer_ptr(data_buf)
-    index_out = ib.buffer_ptr(out_index_buf)
-    nthread_tx = max_threads
-    nthread_bx = (num_bbox + 1) // 2 // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("vthread")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "virtual_thread", nthread_bx)
-    tid = bx * nthread_tx + tx
-    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
-    temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")
-
-    idxm = tvm.tir.indexmod
-
-    with ib.for_range(0, batch, kind="unroll") as b:
-        start = b * num_bbox
-        for i in range(2):
-            bbox_id = tid * 2 + i
-            with ib.if_scope(bbox_id < num_bbox):
-                index_out[start + bbox_id] = bbox_id
-        with ib.for_range(0, num_bbox) as k:
-            offset = start + 2 * tid + idxm(k, 2)
-            with ib.if_scope(
-                tvm.tir.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])
-            ):
-                temp_data[0] = p_data[offset]
-                p_data[offset] = p_data[offset + 1]
-                p_data[offset + 1] = temp_data[0]
-                temp_index[0] = index_out[offset]
-                index_out[offset] = index_out[offset + 1]
-                index_out[offset + 1] = temp_index[0]
-            ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
-    return ib.get()
-
-
-def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
-    """Non-maximum suppression.
-
-    Parameters
-    ----------
-    sorted_bbox_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
-        [w_start, h_start, w_end, h_end, score].
-
-    out_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.
-
-    nms_threshold : float
-        Non-maximum suppression threshold.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
-        """Calculate overlap of two boxes."""
-        w = tvm.te.max(
-            0.0,
-            tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
-            - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx])
-            + 1.0,
-        )
-        h = tvm.te.max(
-            0.0,
-            tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
-            - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])
-            + 1.0,
-        )
-        i = w * h
-        u = (
-            (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0)
-            * (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0)
-            + (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0)
-            * (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0)
-            - i
-        )
-        return i / u
-
-    batch, num_bbox = get_const_tuple(out_buf.shape)
-    max_threads = int(math.sqrt(tvm.target.Target.current(allow_none=False).max_num_threads))
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib = tvm.tir.ir_builder.create()
-    p_data = ib.buffer_ptr(sorted_bbox_buf)
-    p_out = ib.buffer_ptr(out_buf)
-    nthread_tx = max_threads
-    nthread_bx = num_bbox // max_threads + 1
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    i = bx * max_threads + tx
-    with ib.for_range(0, batch, kind="unroll", name="n") as b:
-        base_idx = b * num_bbox
-        with ib.if_scope(i < num_bbox):
-            p_out[base_idx + i] = False
-        with ib.for_range(0, num_bbox - 1) as l:
-            with ib.if_scope(tvm.tir.all(i < num_bbox, i > l, p_out[base_idx + l] == False)):
-                iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5)
-                with ib.if_scope(iou > nms_threshold):
-                    p_out[base_idx + i] = True
-        ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
-    return ib.get()
-
-
-def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
-    """Copy output after applying nms to continuous memory.
-
-    Parameters
-    ----------
-    sorted_bbox_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
-        [w_start, h_start, w_end, h_end, score].
-
-    remove_mask_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.
-
-    out_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
-    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
-    nthread_tx = batch
-    tx = te.thread_axis("threadIdx.x")
-    ib = tvm.tir.ir_builder.create()
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    i = ib.allocate("int32", (1,), "i", scope="local")
-    i[0] = 0
-    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
-    p_remove = ib.buffer_ptr(remove_mask_buf)
-    p_out = ib.buffer_ptr(out_buf)
-    b = tx
-
-    nkeep = ib.allocate("int32", (1,), "nkeep", scope="local")
-    nkeep[0] = 0  # number of bbox after nms
-
-    with ib.for_range(0, num_bbox) as j:
-        with ib.if_scope(p_remove[b * num_bbox + j] == False):
-            nkeep[0] += 1
-    with ib.if_scope(nkeep[0] > 0):
-        with ib.for_range(
-            0, te.ceil(tvm.tir.const(rpn_post_nms_top_n, "float32") / nkeep[0]).astype("int32")
-        ):
-            with ib.for_range(0, num_bbox) as j:
-                offset_j = (b * num_bbox + j) * 5
-                offset_i = (b * rpn_post_nms_top_n + i[0]) * 5
-                with ib.if_scope(
-                    tvm.tir.all(i[0] < rpn_post_nms_top_n, p_remove[(b * num_bbox + j)] == False)
-                ):
-                    p_out[offset_i] = tvm.tir.Cast("float32", b)
-                    with ib.for_range(0, 4, kind="unroll") as k:
-                        p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
-                    i[0] = i[0] + 1
-
-    body = ib.get()
-    return body
-
-
-def proposal(
-    cls_prob,
-    bbox_pred,
-    im_info,
-    scales,
-    ratios,
-    feature_stride,
-    threshold,
-    rpn_pre_nms_top_n,
-    rpn_post_nms_top_n,
-    rpn_min_size,
-    iou_loss,
-):
-    """Proposal operator.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        4-D with shape [batch, 2 * num_anchors, height, width]
-
-    bbox_pred : tvm.te.Tensor
-        4-D with shape [batch, 4 * num_anchors, height, width]
-
-    im_info : tvm.te.Tensor
-        2-D with shape [batch, 3]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    threshold : float
-        Non-maximum suppression threshold.
-
-    rpn_pre_nms_top_n : int
-        Number of top scoring boxes to apply NMS. -1 to use all boxes.
-
-    rpn_post_nms_top_n : int
-        Number of top scoring boxes to keep after applying NMS to RPN proposals.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-    """
-
-    batch, _, height, width = get_const_tuple(cls_prob.shape)
-    num_anchors = len(scales) * len(ratios)
-    num_bbox = height * width * num_anchors
-    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox
-
-    bbox = te.extern(
-        (batch, num_bbox, 5),
-        [cls_prob, bbox_pred, im_info],
-        lambda ins, outs: predict_bbox_ir(
-            ins[0], ins[1], ins[2], outs[0], scales, ratios, feature_stride, rpn_min_size, iou_loss
-        ),
-        dtype=bbox_pred.dtype,
-    )
-    score = te.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag="bbox_score")
-    sorted_index = te.extern(
-        [score.shape], [score], lambda ins, outs: argsort_ir(ins[0], outs[0]), dtype="int32"
-    )
-    sorted_bbox = te.compute(
-        (batch, rpn_pre_nms_top_n, 5),
-        lambda b, i, j: bbox[b, sorted_index[b, i], j],
-        tag="sorted_bbox",
-    )
-    nms_remove_mask = te.extern(
-        (batch, rpn_pre_nms_top_n),
-        [sorted_bbox],
-        lambda ins, outs: nms_ir(ins[0], outs[0], threshold),
-        dtype="bool",
-    )
-    nms_out = te.extern(
-        (batch * rpn_post_nms_top_n, 5),
-        [sorted_bbox, nms_remove_mask],
-        lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]),
-        dtype=sorted_bbox.dtype,
-    )
-    return nms_out
diff --git a/python/tvm/topi/cuda/reduction.py b/python/tvm/topi/cuda/reduction.py
deleted file mode 100644
index c3ddb59605be..000000000000
--- a/python/tvm/topi/cuda/reduction.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,too-many-locals,len-as-condition
-"""Schedule for reduce operators"""
-from __future__ import absolute_import as _abs
-from operator import mul
-from functools import reduce
-import tvm
-from tvm import te
-from .. import tag
-from .injective import schedule_injective_from_existing
-
-
-def _schedule_reduce(op, sch, is_idx_reduce=False):
-    if is_idx_reduce:
-        data_out = op.input_tensors[0]
-    else:
-        data_in = op.input_tensors[0]
-        data_out = op.output(0)
-
-    if not sch[data_out].op.reduce_axis:
-        return schedule_injective_from_existing(sch, op.output(0))
-
-    if len(sch[data_out].op.axis) > 0:
-        all_reduce = False
-        num_thread = 32
-        target = tvm.target.Target.current()
-        if target and (target.kind.name == "opencl" or target.kind.name == "metal"):
-            # without it, CL_INVALID_WORK_GROUP_SIZE occurred when running test_topi_reduce.py
-            # don't know why
-            num_thread = 16
-        block_x = te.thread_axis("blockIdx.x")
-        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-        thread_y = te.thread_axis((0, num_thread), "threadIdx.y")
-    else:
-        all_reduce = True
-        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
-        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-
-    # Fuse and refactor the reduce axis
-    fused_reduce = sch[data_out].fuse(
-        *[sch[data_out].op.reduce_axis[i] for i in range(len(sch[data_out].op.reduce_axis))]
-    )
-    ko, ki = sch[data_out].split(fused_reduce, factor=num_thread)
-    if is_idx_reduce:
-        data_out_rf, _ = sch.rfactor(data_out, ki)
-    else:
-        data_out_rf = sch.rfactor(data_out, ki)
-    tx = sch[data_out].op.reduce_axis[0]
-    sch[data_out].bind(tx, thread_x)
-    sch[data_out_rf].compute_at(sch[data_out], tx)
-    if is_idx_reduce:
-        real_output = op.output(0)
-        temp_idx_input = data_out.op.output(0)
-        temp_val_input = data_out.op.output(1)
-    else:
-        real_output = data_out
-    if not all_reduce:
-        # Fuse and split the axis
-        fused_outer = sch[real_output].fuse(
-            *[sch[real_output].op.axis[i] for i in range(len(sch[real_output].op.axis))]
-        )
-        bx, outer_in = sch[real_output].split(fused_outer, factor=num_thread)
-
-        # Bind the axes to threads and blocks
-        sch[real_output].bind(outer_in, thread_y)
-        sch[real_output].bind(bx, block_x)
-        if is_idx_reduce:
-            sch[temp_idx_input].compute_at(sch[real_output], outer_in)
-            sch[temp_val_input].compute_at(sch[real_output], outer_in)
-        sch[real_output].set_store_predicate(
-            tvm.tir.all(
-                thread_x.equal(0), block_x * num_thread + thread_y < reduce(mul, real_output.shape)
-            )
-        )
-    else:
-        if is_idx_reduce:
-            spatial_axis = sch[real_output].fuse(*(sch[real_output].op.axis))
-            sch[real_output].bind(spatial_axis, te.thread_axis("blockIdx.x"))
-            sch[temp_idx_input].compute_at(sch[real_output], spatial_axis)
-            sch[temp_val_input].compute_at(sch[real_output], spatial_axis)
-        sch[real_output].set_store_predicate(thread_x.equal(0))
-    return sch
-
-
-def _enable_auto_inline(sch):
-    def is_scheduled(stage):
-        # auto inline requires the attach type is AttachType.kGroupRoot
-        conds = [
-            len(stage.relations) == 0,
-            stage.attach_type == 1,
-            stage.all_iter_vars == stage.leaf_iter_vars,
-        ]
-        if not all(conds):
-            return True
-        return False
-
-    for s in sch.stages:
-        if not s.is_output and isinstance(s.op, tvm.te.ComputeOp):
-            if is_scheduled(s) or len(s.op.reduce_axis) != 0:
-                return False
-    return True
-
-
-def schedule_reduce_impl(
-    outs, schedule_reduce_stage, schedule_injective_stage, inline_postops=False
-):
-    """Schedule for inject->reduce->bcast ops.
-    Traverse over the stages in the schedule and schedule separate stages depending
-    on the position of the stage. Injecteve post-ops of reduction will be scheduled using
-    injection schedule, injective pre-ops of reduction will be inlined, reduction stage
-    will be scheduled using reduction schedule
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of reduce in the format
-          of an array of tensors.
-    schedule_reduce_stage: Function responsible for scheduling the reduction
-          stage
-    schedule_injective_stage: Function responsible for scheduling the
-          standalone injection stage
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    sch = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-    enable_auto_inline = _enable_auto_inline(sch)
-
-    def traverse_before_reduce(operator):
-        """Internal traverse function"""
-        if isinstance(operator, tvm.te.PlaceholderOp):
-            return
-        if tag.is_injective(operator.tag):
-            sch[operator].compute_inline()
-            for tensor in operator.input_tensors:
-                if tensor.op not in scheduled_ops:
-                    traverse_before_reduce(tensor.op)
-        else:
-            raise RuntimeError(f"Unsupported operator: {operator.tag}")
-
-        scheduled_ops.append(operator)
-
-    def traverse_after_reduce(operator):
-        """Internal traverse function"""
-        if tag.is_broadcast(operator.tag):
-            if operator not in scheduled_ops and not inline_postops:
-                schedule_injective_stage(sch, operator.output(0))
-            for tensor in operator.input_tensors:
-                if tensor.op not in scheduled_ops:
-                    if enable_auto_inline:
-                        traverse_before_reduce(tensor.op)
-                    else:
-                        traverse_after_reduce(tensor.op)
-        elif operator.tag == "comm_reduce":
-            if operator not in scheduled_ops:
-                schedule_reduce_stage(operator, sch, is_idx_reduce=False)
-            for tensor in operator.input_tensors:
-                if tensor.op not in scheduled_ops:
-                    traverse_before_reduce(tensor.op)
-        elif operator.tag == "comm_reduce_idx":
-            if operator not in scheduled_ops:
-                schedule_reduce_stage(operator, sch, is_idx_reduce=True)
-            input_tensors = operator.input_tensors[0].op.input_tensors
-            for tensor in input_tensors:
-                if tensor.op not in scheduled_ops:
-                    traverse_before_reduce(tensor.op)
-        elif isinstance(operator, tvm.te.PlaceholderOp):
-            pass
-        else:
-            raise RuntimeError(f"Unsupported operator: {operator.tag}")
-
-        scheduled_ops.append(operator)
-
-    for out in outs:
-        traverse_after_reduce(out.op)
-    return sch
-
-
-def schedule_reduce(outs):
-    return schedule_reduce_impl(outs, _schedule_reduce, schedule_injective_from_existing)
diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py
deleted file mode 100644
index 7f5fb8aa8770..000000000000
--- a/python/tvm/topi/cuda/scatter.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Scatter operators"""
-import tvm
-from tvm import te, tir, autotvm
-from ..scatter import _verify_scatter_nd_inputs
-from ..generic import schedule_extern
-from .nms import atomic_add
-from .sort import stable_sort_by_key_thrust
-from ..utils import ceil_div
-
-
-def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out):
-    """Generate scatter ir for 1d inputs, using a sorting based approach.
-    By sorting indices and comparing neighboring two indices, we can tell which
-    of elements in the indices tensor can scatter its update value into the output.
-    Sorting of indices, and sorting of updates with respect to indices, can be done
-    at the same time by thrust's sort_by_key function. It is important that sorting
-    be done in a "stable" way via stable_sort, to guarantee deterministic output.
-    Negative indices are assumed to have been converted to corresponding positive
-    indices.
-
-    Parameters
-    ----------
-    data : tir.Tensor
-        The input data to the operator.
-
-    indices_sorted : tir.Tensor
-        The sorted index locations to update.
-
-    updates : tir.Tensor
-        The values to update, sorted by indices.
-
-    out : tir.Tensor
-        The output tensor.
-
-    Returns
-    -------
-    ret : tir
-        The computational ir.
-    """
-    n = data.shape[0]
-
-    ib = tvm.tir.ir_builder.create()
-
-    out_ptr = ib.buffer_ptr(out)
-    data_ptr = ib.buffer_ptr(data)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-
-    with ib.new_scope():
-        nthread_bx = ceil_div(n, nthread_tx)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * nthread_tx + tx
-        with ib.if_scope(tid < n):
-            out_ptr[tid] = data_ptr[tid]
-
-    indices_ptr = ib.buffer_ptr(indices_sorted)
-    updates_ptr = ib.buffer_ptr(updates_sorted)
-
-    ni = indices_sorted.shape[0]
-
-    with ib.new_scope():
-        nthread_bx = ceil_div(ni, nthread_tx)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * nthread_tx + tx
-
-        with ib.if_scope(tid == ni - 1):
-            # The last element can always update.
-            index = indices_ptr[tid]
-            update = updates_ptr[tid]
-            out_ptr[index] = update
-
-        with ib.else_scope():
-            with ib.if_scope(tid < ni - 1):
-                index = indices_ptr[tid]
-                index_next = indices_ptr[tid + 1]
-
-                # If the next neighbor in the sorted list of indices has a different index,
-                # that means thread tid is the last one to have this index.
-                # This thread can update the output.
-                with ib.if_scope(index != index_next):
-                    update = updates_ptr[tid]
-                    out_ptr[index] = update
-
-    return ib.get()
-
-
-@autotvm.register_topi_compute("scatter_via_sort.cuda")
-def scatter_via_sort(cfg, data, indices, updates, axis=0, reduction="add"):
-    """Update data at positions defined by indices with values in updates
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    indices : relay.Expr
-        The index locations to update.
-
-    updates : relay.Expr
-        The values to update.
-
-    axis : int
-        The axis to scatter on
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    assert reduction == "add"
-    if axis < 0:
-        axis += len(data.shape)
-    assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input"
-
-    cfg.add_flop(1)  # A dummy value to satisfy AutoTVM
-
-    out_shape = data.shape
-    out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf")
-
-    indices_sorted, updates_sorted = stable_sort_by_key_thrust(indices, updates, for_scatter=True)
-
-    out = te.extern(
-        [out_shape],
-        [data, indices_sorted, updates_sorted],
-        lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]),
-        dtype=data.dtype,
-        out_buffers=[out_buf],
-        name="scatter_via_sort_gpu",
-        tag="scatter_via_sort_gpu",
-    )
-
-    return out
-
-
-@autotvm.register_topi_schedule("scatter_via_sort.cuda")
-def schedule_scatter_via_sort(_, outs):
-    return schedule_extern(outs)
-
-
-def scatter_nd(data, indices, updates, mode):
-    """Scatter elements from a n-dimension array.
-
-    Given updates with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}), indices with shape
-    (M, Y_0, ..., Y_{K-1}), and output copied from data with shape (X_0, X_1, ..., X_{N-1}),
-    scatter_nd computes
-
-    .. code-block::
-
-        output[indices[0, y_0, ..., y_{K-1}],
-               ...,
-               indices[M-1, y_0, ..., y_{K-1}],
-               x_M,
-               ...,
-               x_{N-1}
-              ] = f(output[...], updates[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}])
-
-    where the update function f is determinted by the mode.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        The source array.
-
-    indices : tvm.te.Tensor
-        The indices of the values to extract.
-
-    updates : tvm.te.Tensor
-        The updates to apply at the Indices
-
-    mode : string
-        The update mode for the algorithm, either "update" or "add"
-        If update, the update values will replace the input data
-        If add, the update values will be added to the input data
-
-    Returns
-    -------
-    ret : tvm.te.Tensor
-    """
-    _verify_scatter_nd_inputs(data, indices, updates)
-
-    def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr):
-        ib = tvm.tir.ir_builder.create()
-
-        data = ib.buffer_ptr(data_ptr)
-        indices = ib.buffer_ptr(indices_ptr)
-        updates = ib.buffer_ptr(updates_ptr)
-        out = ib.buffer_ptr(out_ptr)
-
-        atomic_add_return = ib.allocate(
-            updates.dtype, (1,), name="atomic_add_return", scope="local"
-        )
-
-        fused_indices_dimension = 1
-        for i in indices_ptr.shape[1:]:
-            fused_indices_dimension *= i
-
-        fused_updates_dimension = 1
-        for i in updates_ptr.shape[len(indices_ptr.shape) - 1 :]:
-            fused_updates_dimension *= i
-
-        fused_shape = 1
-        for i in data_ptr.shape:
-            fused_shape *= i
-
-        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-
-        tdim = tvm.tir.min(max_threads, fused_updates_dimension)
-        with ib.new_scope():
-            bdim = ceil_div(fused_shape, tdim)
-            bx = te.thread_axis("blockIdx.x")
-            tx = te.thread_axis("threadIdx.x")
-            ib.scope_attr(bx, "thread_extent", bdim)
-            ib.scope_attr(tx, "thread_extent", tdim)
-
-            index = bx * tdim + tx
-            with ib.if_scope(index < fused_shape):
-                out[index] = data[index]
-
-        # For better performance, we introduce blockIdx.y to implement for-loops
-        # within one thread.
-        # The code is parallel over the scattered indices, so we use atomic_add
-        # to guarantee correctness when mode=="add"
-
-        # For now, atomic is not supported by target "vulkan", "metal", or "cuda" with "int64"
-        # So we fallback to normal algorithm, using "+=" rather than atomic_add
-
-        # TODO (CaptainDuke):
-        # Since multiple threads compete for the same write index, which leads to
-        # non-determinstic output for update mode. We could add a new attribute,
-        # "allow_non_deterministic", which can be conditionally set to True by
-        # each frontend when non-determinsm is allowed.
-        cur_target_kind = str(tvm.target.Target.current(allow_none=False).kind)
-        with ib.new_scope():
-            if (
-                mode == "add"
-                and cur_target_kind not in ["vulkan", "metal"]
-                and updates.dtype in ["int32", "float32"]
-            ):
-                bdim_x = fused_indices_dimension
-                bdim_y = ceil_div(fused_updates_dimension, tdim)
-                # In case of large input sizes, fused_indices_dimension might be too large.
-                # So we use blockIdx.x because holds larger scales.
-                bx = te.thread_axis("blockIdx.x")
-                by = te.thread_axis("blockIdx.y")
-                tx = te.thread_axis("threadIdx.x")
-                ib.scope_attr(bx, "thread_extent", bdim_x)
-                ib.scope_attr(by, "thread_extent", bdim_y)
-                ib.scope_attr(tx, "thread_extent", tdim)
-
-                j = by * tdim + tx
-                with ib.if_scope(j < fused_updates_dimension):
-                    offset = fused_updates_dimension
-                    index = j  # This is x_M, .. x_{N-1} part of the index into out.
-                    # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}]
-                    # part of the index into out.
-                    up_index = bx * fused_updates_dimension + j
-                    for l in reversed(range(indices_ptr.shape[0].value)):
-                        # indices[bx * l * fused_indices_dimension] = indices[l, y_0, ... y_{k-1}]
-                        index += offset * indices[bx + l * fused_indices_dimension]
-                        offset *= data_ptr.shape[l]
-                    atomic_add_return[0] = atomic_add(
-                        tvm.tir.call_intrin("handle", "tir.address_of", out[index]),
-                        updates[up_index],
-                    )
-            else:
-                bdim_x = ceil_div(fused_updates_dimension, tdim)
-                bx = te.thread_axis("blockIdx.x")
-                tx = te.thread_axis("threadIdx.x")
-                ib.scope_attr(bx, "thread_extent", bdim_x)
-                ib.scope_attr(tx, "thread_extent", tdim)
-                with ib.for_range(0, fused_indices_dimension) as i:
-                    j = bx * tdim + tx
-                    with ib.if_scope(j < fused_updates_dimension):
-                        offset = fused_updates_dimension
-                        index = j  # This is x_M, .. x_{N-1} part of the index into out.
-                        # Build up the
-                        # indices[0, y_0, .. y_{K-1}], ... indices[M-1, y_0, .. y_{K-1}]
-                        # part of the index into out.
-                        for l in reversed(range(indices_ptr.shape[0].value)):
-                            # indices[i * l * fused_indices_dimension] = indices[l, y_0,
-                            #                                                   ... y_{k-1}]
-                            index += offset * indices[i + l * fused_indices_dimension]
-                            offset *= data_ptr.shape[l]
-                        if mode == "update":
-                            out[index] = updates[i * fused_updates_dimension + j]
-                        elif mode == "add":
-                            out[index] += updates[i * fused_updates_dimension + j]
-                        elif mode == "mul":
-                            out[index] *= updates[i * fused_updates_dimension + j]
-                        elif mode == "min":
-                            out[index] = tir.min(
-                                out[index], updates[i * fused_updates_dimension + j]
-                            )
-                        elif mode == "max":
-                            out[index] = tir.max(
-                                out[index], updates[i * fused_updates_dimension + j]
-                            )
-                        else:
-                            raise NotImplementedError(
-                                "scatter_nd mode not in [update, add, mul, min, max]:", mode
-                            )
-
-        return ib.get()
-
-    out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf")
-    return te.extern(
-        [data.shape],
-        [data, indices, updates],
-        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0]),
-        dtype=data.dtype,
-        out_buffers=[out_buf],
-        name="scatter_nd_cuda",
-        tag="scatter_nd_cuda",
-    )
diff --git a/python/tvm/topi/cuda/scatter_elements.py b/python/tvm/topi/cuda/scatter_elements.py
deleted file mode 100644
index 27567ea23e21..000000000000
--- a/python/tvm/topi/cuda/scatter_elements.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Scatter operator """
-import tvm
-from tvm import te, tir
-from ..utils import ceil_div, get_const_int
-from ..math import cast
-from .nms import atomic_add
-
-
-def gen_scatter_add_1d_atomic(data, indices, updates, out, axis, _):
-    """Generate ir for scatter elements for reduction sum for 1d inputs,
-    using atomic_add instruction
-
-    Parameters
-    ----------
-    data : tir.Tensor
-        The input data to the operator.
-
-    indices : tir.Tensor
-        The index locations to update.
-
-    updates : tir.Tensor
-        The values to update.
-
-    out : tir.Tensor
-        The output tensor.
-
-    axis : int
-        The axis to scatter on
-
-    Returns
-    -------
-    ret : tir
-        The computational ir.
-    """
-    assert axis == 0
-    n = data.shape[0]
-
-    ib = tvm.tir.ir_builder.create()
-
-    out_ptr = ib.buffer_ptr(out)
-    data_ptr = ib.buffer_ptr(data)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-
-    with ib.new_scope():
-        nthread_bx = ceil_div(n, nthread_tx)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * nthread_tx + tx
-        with ib.if_scope(tid < n):
-            out_ptr[tid] = data_ptr[tid]
-
-    indices_ptr = ib.buffer_ptr(indices)
-    updates_ptr = ib.buffer_ptr(updates)
-
-    ni = indices.shape[0]
-
-    atomic_add_return = ib.allocate(updates.dtype, (1,), name="atomic_add_return", scope="local")
-
-    with ib.new_scope():
-        nthread_bx = ceil_div(ni, nthread_tx)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * nthread_tx + tx
-
-        with ib.if_scope(tid < ni):
-            index = indices_ptr[tid]
-            with ib.if_scope(index < 0):
-                atomic_add_return[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", out_ptr[index + n]),
-                    updates_ptr[tid],
-                )
-            with ib.else_scope():
-                atomic_add_return[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", out_ptr[index]),
-                    updates_ptr[tid],
-                )
-
-    return ib.get()
-
-
-def gen_ir(data, indices, updates, out, axis, reduce_func):
-    """Generate ir for scatter elements
-
-    Parameters
-    ----------
-    data : tir.Tensor
-        The input data to the operator.
-
-    indices : tir.Tensor
-        The index locations to update.
-
-    updates : tir.Tensor
-        The values to update.
-
-    out : tir.Tensor
-        The output tensor.
-
-    axis : int
-        The axis to scatter on
-
-    reduce_func : Any
-        The function reduced update and output to output
-
-    Returns
-    -------
-    ret : tir
-        The computational ir.
-    """
-    ib = tir.ir_builder.create()
-
-    data_ptr = ib.buffer_ptr(data)
-    indices_ptr = ib.buffer_ptr(indices)
-    updates_ptr = ib.buffer_ptr(updates)
-    out_ptr = ib.buffer_ptr(out)
-
-    # Prepare ranges and strides
-    shape = data.shape
-    if axis < 0:
-        axis = len(shape) + axis
-    axis_range = cast(shape[axis], indices.dtype)
-
-    before_axis_range = 1
-    after_axis_range = 1
-    for i, value in enumerate(shape, 0):
-        if i < axis:
-            before_axis_range *= value
-        elif i > axis:
-            after_axis_range *= value
-    before_axis_stride = axis_range * after_axis_range
-    full_range = before_axis_range * before_axis_stride
-
-    ind_shape = indices.shape
-    ind_axis_range = ind_shape[axis]
-
-    ind_before_axis_range = 1
-    ind_after_axis_range = 1
-    for i, value in enumerate(ind_shape, 0):
-        if i < axis:
-            ind_before_axis_range *= value
-        elif i > axis:
-            ind_after_axis_range *= value
-    ind_before_axis_stride = ind_axis_range * ind_after_axis_range
-    ind_full_range_excl_axis = ind_before_axis_range * ind_after_axis_range
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    # Copy initial input data to output
-    with ib.new_scope():
-        num_blocks = cast(ceil_div(full_range, max_threads), "int32")
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(bx, "thread_extent", num_blocks)
-        ib.scope_attr(tx, "thread_extent", max_threads)
-
-        index = bx * max_threads + tx
-        with ib.if_scope(index < full_range):
-            out_ptr[index] = data_ptr[index]
-
-    with ib.new_scope():
-        num_blocks_2 = ceil_div(ind_full_range_excl_axis, max_threads)
-        bx2 = te.thread_axis("blockIdx.x")
-        tx2 = te.thread_axis("threadIdx.x")
-        ib.scope_attr(bx2, "thread_extent", num_blocks_2)
-        ib.scope_attr(tx2, "thread_extent", max_threads)
-
-        ind_fused = bx2 * max_threads + tx2
-        with ib.if_scope(ind_fused < ind_full_range_excl_axis):
-            i = ind_fused // ind_after_axis_range
-            j = ind_fused % ind_after_axis_range
-            pre_index1 = i * ind_before_axis_stride + j
-            pre_index2 = i * before_axis_stride + j
-            with ib.for_range(0, ind_axis_range, "k") as k:
-                # Offset along indices or updates
-                index1 = pre_index1 + k * ind_after_axis_range
-                # Get index and shift to positive side if need
-                new_index = indices_ptr[index1]
-                shifted_index = new_index + (new_index < 0) * axis_range
-                # Offset along data
-                index2 = pre_index2 + shifted_index * after_axis_range
-                reduce_func(out_ptr, index2, updates_ptr[index1])
-
-    return ib.get()
-
-
-def scatter_elements(data, indices, updates, axis=0, reduction="update"):
-    """Scatter elements from updates to corresponding indices of copied data.
-
-    Data, indices, updates and output have the same shape.
-    Indices can not have duplicates (if idx1 != idx2, then indices[idx1] != indices[idx2])
-    if reduction == "update".
-
-    .. code-block::
-
-        output[indices[i][j]][j] = f(output[indices[i][j]][j], updates[i][j]) if axis = 0
-        output[i][indices[i][j]] = f(output[i][indices[i][j]], updates[i][j]) if axis = 1
-
-    where the update function f is determinted by the reduction.
-    Five types of the function are supported: "update", "add", "mul", "min" and "max" (see below)
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        The source array.
-
-    indices : tvm.te.Tensor
-        The indices of the values to extract.
-
-    updates : tvm.te.Tensor
-        The updates to apply at the Indices
-
-    axis : optional, int
-        The axis to scatter on. It is zero by default.
-
-    reduction : optional, string
-        The update mode for the algorithm, either "update", "add", "mul", "mean", "min" or "max"
-        If update, the update values will replace the input data
-        If add, the update values will be added to the input data
-        If mul, the input data will be multiplied on the update values
-        If mean, the input data will be mean between the update values and the input data
-        If min, there is choice of minimal between the update values and the input data
-        If max, there is choice of maximal between the update values and the input data
-        It is "update" by default
-
-    Returns
-    -------
-    ret : tvm.te.Tensor
-    """
-    if not isinstance(axis, int):
-        axis = get_const_int(axis)
-
-    def update_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] = update
-
-    def add_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] += update
-
-    def mul_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] *= update
-
-    def mean_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] = (dst_ptr[dst_index] + update) / 2
-
-    def min_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] = tir.min(dst_ptr[dst_index], update)
-
-    def max_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] = tir.max(dst_ptr[dst_index], update)
-
-    reduce_func = None
-    if reduction == "update":
-        reduce_func = update_func
-    elif reduction == "add":
-        reduce_func = add_func
-    elif reduction == "mul":
-        reduce_func = mul_func
-    elif reduction == "mean":
-        reduce_func = mean_func
-    elif reduction == "min":
-        reduce_func = min_func
-    elif reduction == "max":
-        reduce_func = max_func
-    else:
-        raise NotImplementedError(
-            "scatter_elements reduction not in [update, add, mul, min, max]:", reduction
-        )
-
-    shape = data.shape
-    rank = len(shape)
-    cur_target_kind = str(tvm.target.Target.current(allow_none=False).kind)
-    gen_scatter_elements_ir = None
-    if (
-        reduction == "add"
-        and rank == 1
-        and cur_target_kind not in ["vulkan", "metal"]
-        and updates.dtype in ["int32", "float32"]
-    ):
-        gen_scatter_elements_ir = gen_scatter_add_1d_atomic
-    else:
-        gen_scatter_elements_ir = gen_ir
-
-    out_buf = tir.decl_buffer(shape, data.dtype, "out_buf")
-    return te.extern(
-        [shape],
-        [data, indices, updates],
-        lambda ins, outs: gen_scatter_elements_ir(
-            ins[0], ins[1], ins[2], outs[0], axis, reduce_func
-        ),
-        dtype=data.dtype,
-        out_buffers=[out_buf],
-        name="scatter_elements_cuda",
-        tag="scatter_elements_cuda",
-    )
diff --git a/python/tvm/topi/cuda/searchsorted.py b/python/tvm/topi/cuda/searchsorted.py
deleted file mode 100644
index 1c39ccaa8632..000000000000
--- a/python/tvm/topi/cuda/searchsorted.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""searchsorted operator for GPU"""
-import tvm
-from tvm import te
-from .. import utils
-from ..searchsorted import binary_search
-
-
-def searchsorted(sorted_sequence, values, right, out_dtype="int64"):
-    """Find indices where elements should be inserted to maintain order.
-       If `sorted_sequence` is N-dimensional, the innermost dimension of
-       `values` are searched in the corresponding dimension of `sorted_sequence`.
-
-    Parameters
-    ----------
-    sorted_sequence : te.Tensor
-        N-D or 1-D Tensor, containing monotonically increasing sequence
-        on the innermost dimension.
-
-    values : te.Tensor
-        N-D Tensor containing the search values. When `sorted_sequence` is 1-D,
-        the shape of `values` can be arbitrary. Otherwise, ranks of `sorted_sequence`
-        and `values` must be the same, and outer N-1 axes must have the same size.
-
-    right : bool, optional
-        Controls which index is returned if a value lands exactly on one of sorted values. If
-        False, the index of the first suitable location found is given. If true, return the
-        last such index. If there is no suitable index, return either 0 or N (where N is the
-        size of the innermost dimension).
-
-    dtype : string, optional
-        The data type of the output indices.
-
-    Returns
-    -------
-    indices : te.Tensor
-        Tensor with same shape as values, representing the indices of
-        elements of `values` if they are inserted in `sorted_sequence`.
-    """
-
-    def ir(sorted_sequence, values, indices):
-        ib = tvm.tir.ir_builder.create()
-        sorted_sequence_shape = sorted_sequence.shape
-        values_shape = values.shape
-        num_search = utils.prod(values_shape)
-        search_range = sorted_sequence_shape[-1]
-
-        sorted_sequence = ib.buffer_ptr(sorted_sequence)
-        values = ib.buffer_ptr(values)
-        indices = ib.buffer_ptr(indices)
-
-        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(
-            bx, "thread_extent", tvm.tir.indexdiv(num_search + max_threads - 1, max_threads)
-        )
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        tid = bx * max_threads + tx
-
-        with ib.if_scope(tid < num_search):
-            if len(sorted_sequence_shape) == 1:
-                sequence_offset = 0
-            else:
-                sequence_id = tid // values_shape[-1]
-                sequence_offset = sequence_id * search_range
-
-            indices[tid] = binary_search(
-                ib,
-                sequence_offset,
-                search_range,
-                sorted_sequence,
-                values[tid],
-                right,
-                out_dtype,
-            )
-
-        return ib.get()
-
-    return te.extern(
-        values.shape,
-        [sorted_sequence, values],
-        lambda ins, outs: ir(ins[0], ins[1], outs[0]),
-        name="searchsorted",
-        dtype=out_dtype,
-    )
diff --git a/python/tvm/topi/cuda/signal.py b/python/tvm/topi/cuda/signal.py
deleted file mode 100644
index d08f41ab8912..000000000000
--- a/python/tvm/topi/cuda/signal.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks, unused-argument
-"""STFT operator"""
-from math import pi
-import tvm
-from tvm import te, tir
-from ..utils import ceil_div
-
-
-def _get_max_threads(batch_row):
-    max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-    return tir.min(batch_row, max_threads)
-
-
-def stft(
-    data,
-    n_fft,
-    hop_length,
-    win_length,
-    window,
-    normalized,
-    onesided,
-    output_shape,
-):
-    """
-    The STFT computes the Fourier transform of short overlapping windows of the input.
-    This gives frequency components of the signal as they change over time.
-    Parameters
-    ----------
-    data : relay.Expr
-        Either a 1-D tensor or a 2-D batch tensor.
-    n_fft : int
-        The size of Fourier transform
-    hop_length : int
-        The distance between neighboring sliding window frames
-    win_length : int
-        The size of window frame and STFT filter
-    window : relay.Expr
-        A 1-D tensor window frame
-    normalized : bool
-        Whether to return the normalized STFT results
-    onesided : bool
-        Whether to return onesided result or fill with conjugate symmetry
-    Returns
-    -------
-    output : relay.Expr
-        Tensor containing the STFT result
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [1, 2, 3, 4, 5, 6]
-        window = [4, 3, 2]
-        [n_fft, hop_length, win_length, normalized, onesided] = [3, 3, 3, False, True]
-        relay.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
-        -> [[[15.0000,  0.0000], [34.0000,  0.0000]], [[ 4.5000,  0.8660], [ 1.0000, -1.7321]]]
-    """
-
-    def gen_ir(
-        data_ptr,
-        n_fft,
-        hop_length,
-        win_length,
-        window_ptr,
-        normalized,
-        onesided,
-        output_ptr,
-    ):
-        ib = tir.ir_builder.create()
-        data = ib.buffer_ptr(data_ptr)
-        window = ib.buffer_ptr(window_ptr)
-        output = ib.buffer_ptr(output_ptr)
-        max_threads = _get_max_threads(output_ptr.shape[0] * output_ptr.shape[1])
-        output_size = output_ptr.shape[0] * output_ptr.shape[1] * output_ptr.shape[2]
-        with ib.new_scope():
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(output_size, max_threads)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            tid = bx * max_threads + tx
-
-            with ib.if_scope(tid < output_size):
-                matrix_size = output_ptr.shape[1] * output_ptr.shape[2]
-                batch = tir.floordiv(tid, matrix_size)
-                row = tir.floordiv(tir.indexmod(tid, matrix_size), output_ptr.shape[2])
-                col = tir.indexmod(tir.indexmod(tid, matrix_size), output_ptr.shape[2])
-                output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0)
-                output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0)
-                with ib.for_range(0, win_length) as wlen:
-                    output[batch, row, col, 0] += (
-                        window[wlen]
-                        * data[batch, col * hop_length + wlen]
-                        * tir.cos(2 * pi * row * wlen / win_length)
-                    )
-                    output[batch, row, col, 1] -= (
-                        window[wlen]
-                        * data[batch, col * hop_length + wlen]
-                        * tir.sin(2 * pi * row * wlen / win_length)
-                    )
-                with ib.if_scope(normalized):
-                    output[batch, row, col, 0] /= tir.sqrt(tir.const(n_fft, "float32"))
-                    output[batch, row, col, 1] /= tir.sqrt(tir.const(n_fft, "float32"))
-
-        return ib.get()
-
-    output_buf = tir.decl_buffer(output_shape, data.dtype, "output_buf")
-
-    return te.extern(
-        output_shape,
-        [data, window],
-        lambda ins, outs: gen_ir(
-            ins[0], n_fft, hop_length, win_length, ins[1], normalized, onesided, outs[0]
-        ),
-        dtype=[data.dtype],
-        out_buffers=[output_buf],
-        name="stft_cuda",
-        tag="stft_cuda",
-    )
-
-
-def dft(
-    re_data: te.Tensor,
-    im_data: te.Tensor,
-    inverse: tir.IntImm,
-):
-    """
-    Computes the discrete Fourier transform of input (calculation along the last axis).
-    This gives frequency components of the signal as they change over time.
-
-    Parameters
-    ----------
-    re_data : relay.Expr
-        N-D tensor, real part of the input signal.
-
-    im_data : relay.Expr
-        N-D tensor, imaginary part of the input signal.
-        If the signal is real, then the values of this tensor are zeros.
-
-    inverse : bool
-        Whether to perform the inverse discrete fourier transform.
-
-    Returns
-    -------
-    re_output : relay.Expr
-        The Fourier Transform of the input (Real part).
-    im_output : relay.Expr
-        The Fourier Transform of the input (Imaginary part).
-    """
-
-    def gen_ir(
-        re_data_buf,
-        im_data_buf,
-        re_output_buf,
-        im_output_buf,
-    ):
-        ib = tir.ir_builder.create()
-        re_data_ptr = ib.buffer_ptr(re_data_buf)
-        im_data_ptr = ib.buffer_ptr(im_data_buf)
-        re_output_ptr = ib.buffer_ptr(re_output_buf)
-        im_output_ptr = ib.buffer_ptr(im_output_buf)
-
-        shape = re_data.shape
-        n_fft = shape[len(shape) - 1]
-        base_range = 1
-        for i in range(len(shape) - 1):
-            base_range *= shape[i]
-
-        sign = -1 if inverse else 1
-        factor = 1.0 / n_fft if inverse else 1.0
-
-        max_threads = _get_max_threads(base_range)
-        with ib.new_scope():
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(base_range, max_threads)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-
-            tid = bx * max_threads + tx
-            with ib.if_scope(tid < base_range):
-                base_idx = tid * n_fft
-                with ib.for_range(0, n_fft) as n:
-                    n_idx = base_idx + n
-                    re_output_ptr[n_idx] = tir.Cast(re_output_ptr.dtype, 0)
-                    im_output_ptr[n_idx] = tir.Cast(im_output_ptr.dtype, 0)
-                    _w = sign * -2 * pi * n / n_fft
-                    with ib.for_range(0, n_fft) as k:
-                        k_idx = base_idx + k
-                        w = _w * k
-                        cos_w = tir.Cast(re_output_ptr.dtype, tir.cos(w))
-                        sin_w = tir.Cast(re_output_ptr.dtype, tir.sin(w))
-                        re_output_ptr[n_idx] += (
-                            re_data_ptr[k_idx] * cos_w - im_data_ptr[k_idx] * sin_w
-                        )
-                        im_output_ptr[n_idx] += (
-                            re_data_ptr[k_idx] * sin_w + im_data_ptr[k_idx] * cos_w
-                        )
-
-                    re_output_ptr[n_idx] *= tir.Cast(re_output_ptr.dtype, factor)
-                    im_output_ptr[n_idx] *= tir.Cast(im_output_ptr.dtype, factor)
-
-        return ib.get()
-
-    output_shape = [re_data.shape] * 2
-
-    return te.extern(
-        shape=output_shape,
-        inputs=[re_data, im_data],
-        fcompute=lambda ins, outs: gen_ir(ins[0], ins[1], outs[0], outs[1]),
-        dtype=[re_data.dtype, im_data.dtype],
-        name="dft_cuda",
-        tag="dft_cuda",
-    )
diff --git a/python/tvm/topi/cuda/softmax.py b/python/tvm/topi/cuda/softmax.py
deleted file mode 100644
index 3919eac8167d..000000000000
--- a/python/tvm/topi/cuda/softmax.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, trailing-whitespace
-"""Schedule for softmax operator"""
-from tvm.target import Target
-from tvm import te
-from tvm.contrib import cudnn
-from .. import generic
-from .injective import schedule_injective_from_existing
-from ..utils import get_const_int, traverse_inline
-
-
-def _schedule_softmax(softmax_op, s, outs, tgt):
-    op_tag = softmax_op.tag
-    axis = get_const_int(softmax_op.attrs["axis"])  # reduce axis
-    if op_tag == "softmax_output":
-        expsum = softmax_op.input_tensors[1]
-        exp = softmax_op.input_tensors[0]
-        max_elem = s[exp].op.input_tensors[1]
-        delta = None
-    elif op_tag == "fast_softmax_output":
-        expsum = softmax_op.input_tensors[1]
-        exp = softmax_op.input_tensors[0]
-        delta = s[exp].op.input_tensors[0]
-        max_elem = s[delta].op.input_tensors[1]
-    elif op_tag == "log_softmax_output":
-        exp = None
-        delta = None
-        max_elem = softmax_op.input_tensors[1]
-        expsum = softmax_op.input_tensors[2]
-    else:
-        raise ValueError(
-            f"Tag is expected to be softmax_output or log_softmax_output. Got {op_tag}"
-        )
-
-    # The nvptx and rocm backends only supports 32-bits warp shuffle
-    # instructions.
-    #
-    # TODO(tvm-team) Fix nvptx codegen or deprecate nvptx backend.
-    def sched_warp_softmax():
-        if tgt.kind.name in ["nvptx", "rocm"]:
-            dtype = softmax_op.output(0).dtype
-            return dtype in ["float32", "int32"]
-        if tgt.kind.name != "cuda":
-            # this is used as the gpu schedule for other arches which
-            # may not have warp reductions
-            return False
-        return True
-
-    if len(outs[0].shape) != 2:
-        ops = [max_elem.op, expsum.op, softmax_op]
-        if delta is not None:
-            ops.append(delta.op)
-        if exp is not None:
-            ops.append(exp.op)
-        if softmax_op != outs[0].op:
-            ops.append(outs[0].op)
-
-        for op in ops:
-            s = schedule_injective_from_existing(s, op.output(0))
-
-    elif sched_warp_softmax():
-        # A warp of 32 threads performs a row reduction.
-        num_thread = tgt.thread_warp_size
-        block_x = te.thread_axis("blockIdx.x")
-        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-
-        # (4) softmax
-        output = outs[0]
-        xo, xi = s[output].split(output.op.axis[axis], nparts=num_thread)
-        xio, xii = s[output].split(xi, factor=4)
-        s[output].vectorize(xii)
-        s[output].bind(xo, thread_x)
-        s[output].bind(output.op.axis[axis ^ 1], block_x)
-        s[output].reorder(output.op.axis[axis ^ 1], xo, xio, xii)
-
-        if softmax_op != outs[0].op:
-            s[softmax_op].compute_at(s[output], xio)
-            s[softmax_op].vectorize(softmax_op.axis[axis])  # vec_len == 4
-
-        # (3) expsum
-        k = expsum.op.reduce_axis[0]
-        ko, _ = s[expsum].split(k, nparts=num_thread)
-        s[expsum].bind(ko, thread_x)
-        s[expsum].compute_at(s[output], xo)
-
-        # (2) exp
-        if delta is not None:
-            s[exp].compute_inline()
-            s[delta].compute_inline()
-        elif exp is not None:
-            xo, xi = s[exp].split(exp.op.axis[axis], nparts=num_thread)
-            _, xii = s[exp].split(xi, factor=4)
-            s[exp].vectorize(xii)
-            s[exp].bind(xo, thread_x)
-            s[exp].compute_at(s[expsum], expsum.op.axis[0])
-            s[exp].compute_at(s[output], output.op.axis[axis ^ 1])
-            s[exp].set_scope("warp")
-
-        # (1) max_elem
-        k = max_elem.op.reduce_axis[0]
-        ko, _ = s[max_elem].split(k, nparts=num_thread)
-        s[max_elem].bind(ko, thread_x)
-        if exp is not None and delta is None:
-            s[max_elem].compute_at(s[exp], xo)
-        else:
-            s[max_elem].bind(ko, thread_x)
-            s[max_elem].bind(max_elem.op.axis[0], block_x)
-
-    else:
-        num_thread = 64
-        block_x = te.thread_axis("blockIdx.x")
-        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-
-        if delta is not None:
-            s[exp].compute_inline()
-            s[delta].compute_inline()
-        elif exp is not None:
-            s[exp].bind(exp.op.axis[axis ^ 1], block_x)
-
-        s[max_elem].bind(max_elem.op.axis[0], block_x)
-        k = expsum.op.reduce_axis[0]
-        ko, ki = s[expsum].split(k, factor=num_thread)
-        EF = s.rfactor(expsum, ki)
-        s[expsum].bind(s[expsum].op.axis[0], block_x)
-        s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x)
-        s[EF].compute_at(s[expsum], s[expsum].op.reduce_axis[0])
-        s[expsum].set_store_predicate(thread_x.var.equal(0))
-
-        output = outs[0]
-        tx, xi = s[output].split(output.op.axis[axis], nparts=num_thread)
-        s[output].bind(output.op.axis[axis ^ 1], block_x)
-        s[output].bind(tx, thread_x)
-        s[output].reorder(output.op.axis[axis ^ 1], tx, xi)
-
-        if softmax_op != outs[0].op:
-            s[softmax_op].compute_at(s[output], tx)
-
-
-def schedule_softmax(outs):
-    """Schedule for softmax op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of softmax in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tgt = Target.current(allow_none=False)
-
-    def _callback(op):
-        if "softmax" in op.tag:
-            _schedule_softmax(op, s, outs, tgt)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def softmax_cudnn(x, axis=-1):
-    """Perform softmax on the data using cudnn"""
-    return cudnn.softmax(x, axis)
-
-
-def schedule_softmax_cudnn(outs):
-    """Schedule for softmax cudnn op"""
-    return generic.schedule_extern(outs)
-
-
-def log_softmax_cudnn(x, axis=-1):
-    """Perform log_softmax on the data using cudnn"""
-    return cudnn.log_softmax(x, axis)
-
-
-def schedule_log_softmax_cudnn(outs):
-    """Schedule for log_softmax cudnn op"""
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
deleted file mode 100644
index cd977fb1b868..000000000000
--- a/python/tvm/topi/cuda/sparse.py
+++ /dev/null
@@ -1,423 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Sparse operators"""
-import numpy as np
-
-import tvm
-from tvm import relay, te
-
-from .. import nn
-from ..utils import ceil_div, get_const_int, get_const_tuple, prod, traverse_inline
-from .transform import schedule_transpose_from_existing
-
-
-def sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False):
-    """
-    Computes sparse-dense matrix multiplication of `data` and
-    `(weight_data, weight_indices, weight_indptr).T`
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        2-D with shape [M, K], float32
-
-    weight_data : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        3-D with shape [num_blocks, bs_r, bs_c] (BSR)
-
-    weight_indices : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        1-D with shape [num_blocks] (BSR)
-
-    weight_indptr : tvm.te.Tensor
-        1-D with shape [N + 1] (CSR) or
-        1-D with shape [(N + 1) // bs_r] (BSR)
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [M, N]
-    """
-    # pylint:disable=unused-argument
-    return nn.sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs)
-
-
-def schedule_sparse_dense(outs):
-    """Create schedule for sparse dense"""
-    # pylint:disable=invalid-name
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "sparse_dense_sp_rhs_bsrmm" or op.tag == "sparse_dense_sp_lhs_bsrmm":
-            y_bsrmm = op.input_tensors[0]
-            assert (
-                y_bsrmm.op.tag == "sparse_dense_sp_rhs_bsrmm_block"
-                or y_bsrmm.op.tag == "sparse_dense_sp_lhs_bsrmm_block"
-            )
-            out = s.outputs[0].output(0)
-
-            if op not in s.outputs:
-                y_reshape = op.output(0)
-                s[y_reshape].compute_at(s[out], s[out].op.axis[1])
-
-            (_, c) = s[y_bsrmm].op.reduce_axis
-
-            (m_o, n_o) = s[out].op.axis
-            s[out].bind(m_o, te.thread_axis("blockIdx.x"))
-            s[out].bind(n_o, te.thread_axis("blockIdx.y"))
-            s[y_bsrmm].compute_at(s[out], n_o)
-
-            thread_x = te.thread_axis("threadIdx.x")
-
-            y_bsrmm_factored = s.rfactor(y_bsrmm, c)
-            tx = s[y_bsrmm].op.reduce_axis[0]
-            s[y_bsrmm].bind(tx, thread_x)
-            s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx)
-            s[y_bsrmm].set_store_predicate(thread_x.var.equal(0))
-            s[out].set_store_predicate(thread_x.var.equal(0))
-        elif op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_rhs_csrmm":
-            out = op.output(0)
-            const_size = get_const_int(prod(out.shape))
-            fused = s[out].fuse(*s[out].op.axis)
-            bx, tx = s[out].split(fused, factor=const_size)
-            s[out].bind(tx, te.thread_axis("threadIdx.x"))
-            s[out].bind(bx, te.thread_axis("blockIdx.x"))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def sparse_dense_tir(data, w_data, w_indices, w_indptr):
-    """Compute data * w^T.
-
-    Actually computes (w * data^T) ^ T as data needs to be in column-major
-    format for performance reasons.
-
-    Good resources:
-    Yang, Carl, Aydın Buluç, and John D. Owens. "Design principles for sparse
-    matrix multiplication on the GPU." European Conference on Parallel
-    Processing. Springer, Cham, 2018. <- This code is basically row-split from here.
-    Gale, Trevor, et al. "Sparse GPU Kernels for Deep Learning." arXiv preprint
-    arXiv:2006.10901 (2020).
-
-
-    Profile with
-    `/opt/nvidia/nsight-compute/2020.1.2/ncu -k default_function_kernel1
-    --section '.*' -s 1 -c 1 venv/bin/python3 test_topi_sparse.py manual`
-    with either default_function_kernel0 for the transpose or
-    default_function_kernel1 for the multiply.
-    """
-
-    def gen_ir(data, w_data, w_indices, w_indptr, out):
-        # pylint: disable=invalid-name, simplifiable-if-statement
-        # TODO(tkonolige): use tensorcores for block multiply
-        # TODO(tkonolige): use vectorize on loads
-        # TODO(tkonolige): separate implementation if M is small
-        # TODO(tkonolige): separate implementation for large block sizes
-        ib = tvm.tir.ir_builder.create()
-
-        if tvm.target.Target.current(allow_none=False).kind.name == "cuda":
-            use_warp_storage = True
-        else:
-            # TVMs warp shuffle intrinsics are slow on ROCM because they use
-            # LDS (shared memory) to do the shuffling. Instead, we could use
-            # ROCM's support for accessing neighboring threads memory, but we
-            # those intrinsics aren't accessible from TVM. For now, we just use
-            # shared memory. We also default to shared memory on platforms
-            # where we do not know how warp storage performs.
-            use_warp_storage = False
-
-        warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
-        m = data.shape[1]
-        nb = w_indptr.shape[0] - 1
-        # treat csr like block size 1 bsr
-        if len(w_data.shape) == 1:
-            bs_n = 1
-            bs_k = 1
-        else:
-            bs_n = w_data.shape[1]
-            bs_k = w_data.shape[2]
-        bs_m = bs_n
-        mb = m // bs_m
-        mi = warp_size
-        assert mb >= mi, (
-            f"Number of block rows in dense matrix must be larger than warp size: "
-            f"{warp_size} vs {mb}."
-        )
-        mo = ceil_div(mb, mi)
-        ni = 1  # TODO(tkonolige): how do I compute the number of warps per block?
-        no = ceil_div(nb, ni)
-        rowlength_bi = warp_size
-
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(bx, "thread_extent", mo)
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(by, "thread_extent", no)
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", warp_size)
-        warp = te.thread_axis("threadIdx.y")
-        ib.scope_attr(warp, "thread_extent", ni)
-
-        out_ptr = ib.buffer_ptr(out)
-        data_ptr = ib.buffer_ptr(data)
-        w_data_ptr = ib.buffer_ptr(w_data)
-        w_indices_ptr = ib.buffer_ptr(w_indices)
-        w_indptr_ptr = ib.buffer_ptr(w_indptr)
-
-        n_index = by * ni + warp
-        m_index = bx * mi + tx
-        row_start = w_indptr_ptr[n_index]
-
-        # Guaranteed to be evenly divisible
-        rowlength_bo = ceil_div(w_indptr_ptr[n_index + 1] - row_start, rowlength_bi)
-
-        # thread local storage for bs_m x bs_n block
-        block = ib.allocate(data.dtype, (bs_m, bs_n), name="block", scope="local")
-        data_cache = ib.allocate(data.dtype, (mi, bs_m, bs_k), name="data_cache", scope="local")
-        if use_warp_storage:
-            indices = ib.allocate(w_indices.dtype, (rowlength_bi,), name="indices", scope="warp")
-            w_data_cache = ib.allocate(
-                w_data.dtype, (rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="warp"
-            )
-        else:
-            indices = ib.allocate(
-                w_indices.dtype, (ni, rowlength_bi), name="indices", scope="shared"
-            )
-            w_data_cache = ib.allocate(
-                w_data.dtype, (ni, rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="shared"
-            )
-
-        # zero block
-        with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
-            with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
-                block[x, y] = 0.0
-        # compute into thread local storage using warp_size chunks
-        with ib.for_range(0, rowlength_bo, name="bb") as bb:
-            elem_idx = bb * rowlength_bi + tx
-            # Cache indices. Guaranteed to be multiple of warp_size.
-            if use_warp_storage:
-                indices[tx] = w_indices_ptr[row_start + elem_idx]
-            else:
-                indices[warp, tx] = w_indices_ptr[row_start + elem_idx]
-            # cache dense matrix
-            # each thread has a row
-            # TODO: ideally we could vectorize this
-            with ib.for_range(0, rowlength_bi, name="bi") as bi:
-                with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
-                    with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
-                        # This memory acces should be out of bounds when
-                        # m_index >= mb (which occurs when the dense matrix
-                        # rows % 32 != 0), but it seems to work just fine...
-                        if use_warp_storage:
-                            ind = indices[bi]
-                        else:
-                            ind = indices[warp, bi]
-                        data_cache[bi, x, z] = data_ptr[ind * bs_k + z, m_index * bs_m + x]
-            # cache w_data
-            elem_idx = bb * rowlength_bi + tx
-            with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
-                with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
-                    data_indices = [row_start + elem_idx] + (
-                        [y, z] if len(w_data.shape) > 1 else []
-                    )
-                    cache_indices = [tx, y, z] if use_warp_storage else [warp, tx, y, z]
-                    w_data_cache[cache_indices] = w_data_ptr[data_indices]
-            with ib.for_range(0, mi, name="i") as i:
-                # thread local block matmul
-                with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
-                    with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
-                        with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
-                            if use_warp_storage:
-                                w = w_data_cache[i, y, z]
-                            else:
-                                w = w_data_cache[warp, i, y, z]
-                            block[x, y] += data_cache[i, x, z] * w
-        # store results
-        with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
-            with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
-                with ib.if_scope(m_index < mb):
-                    with ib.if_scope(n_index < nb):
-                        # It doesn't seem like we would be getting coelesced
-                        # writes here, but it doesn't seem to matter
-                        out_ptr[m_index * bs_m + x, n_index * bs_n + y] = block[x, y]
-
-        return ib.get()
-
-    data_t = tvm.topi.transpose(data)
-    # handle csr
-    if len(w_data.shape) == 1:
-        blocksize = 1
-    else:
-        blocksize = w_data.shape[1]
-    out_shape = (data_t.shape[1], (w_indptr.shape[0] - 1) * blocksize)
-    out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf")
-    out = te.extern(
-        [out_shape],
-        [data_t, w_data, w_indices, w_indptr, data],
-        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
-        dtype=data.dtype,
-        out_buffers=[out_buf],
-        name="sparse_dense_gpu",
-        tag="sparse_dense_gpu",
-    )
-    return out
-
-
-def is_valid_for_sparse_dense_padded(data, weight_data):
-    """
-    Check whether input is applicable for sparse_dense_padded op.
-    If not we should fall back to default scheduling.
-    """
-    # pylint:disable=invalid-name
-    warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
-    # If there are multiple alter_ops in a model, the first alteration does not
-    # run type inference for the subsequent ones. In this case, we don't have
-    # the shape information, so we run the inferencer manually.
-    try:
-        m = get_const_tuple(data.checked_type.shape)[1]
-    except ValueError:
-        data_infered = relay.transform.InferType()(tvm.IRModule.from_expr(data))["main"]
-        m = get_const_tuple(data_infered.ret_type.shape)[1]
-    if len(weight_data.shape) == 1:
-        bs_m = 1
-    else:
-        bs_m = weight_data.shape[1]
-
-    mb = m // bs_m
-    if mb >= warp_size:
-        return True
-    return False
-
-
-def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False):
-    """
-    Computes sparse-dense matrix multiplication of `data` and
-    `(weight_data, weight_indices, weight_indptr).T`
-
-    This variation uses a padded matrix where all row lengths are a multiple of the warp size.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        2-D with shape [M, K], float32
-
-    weight_data : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        3-D with shape [num_blocks, bs_r, bs_c] (BSR)
-
-    weight_indices : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        1-D with shape [num_blocks] (BSR)
-
-    weight_indptr : tvm.te.Tensor
-        1-D with shape [N + 1] (CSR) or
-        1-D with shape [(N + 1) // bs_r] (BSR)
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [M, N]
-    """
-    # TODO(ANSHUMAN87): Handle for sparse_lhs case too
-    assert not sparse_lhs, "Currently only sparse weight is supported."
-    return sparse_dense_tir(data, weight_data, weight_indices, weight_indptr)
-
-
-def schedule_sparse_dense_padded(outs):
-    """Create schedule for sparse dense"""
-    # XXX: this will fail if we don't include the data_t Tensor in the schedule
-    # ops. Maybe create_schedule should do some analysis so this isn't
-    # necessary
-    data_t = outs[0].op.input_tensors[0]
-    s = te.create_schedule([outs[0].op, data_t.op])
-    schedule_transpose_from_existing(s, outs[0].op.input_tensors[0])
-    return s
-
-
-def pad_sparse_matrix(matrix, blocksize):
-    """Pad rows of sparse matrix matrix so that they are a multiple of blocksize."""
-    import scipy.sparse as sp  # pylint: disable=import-outside-toplevel
-
-    assert isinstance(matrix, sp.bsr_matrix)
-    new_entries = np.zeros(matrix.shape[0], dtype=matrix.indptr.dtype)
-    bsr = matrix.blocksize[0]
-    for i in range(matrix.shape[0] // bsr):
-        row_length = matrix.indptr[i + 1] - matrix.indptr[i]
-        if row_length % blocksize != 0:
-            new_entries[i] = blocksize - (row_length % blocksize)
-    additional = np.sum(new_entries)
-    indices = np.zeros(matrix.indices.shape[0] + additional, dtype=matrix.indices.dtype)
-    data = np.zeros(
-        (matrix.data.shape[0] + additional, matrix.data.shape[1], matrix.data.shape[2]),
-        dtype=matrix.data.dtype,
-    )
-
-    n = matrix.shape[0] // bsr
-    indptr = np.zeros(n + 1, dtype=matrix.indptr.dtype)
-    indptr[: matrix.indptr.shape[0]] = matrix.indptr
-
-    for i in range(matrix.shape[0] // bsr):
-        indptr[i + 1] = indptr[i] + new_entries[i] + (matrix.indptr[i + 1] - matrix.indptr[i])
-        indices[indptr[i] : indptr[i + 1] - new_entries[i]] = matrix.indices[
-            matrix.indptr[i] : matrix.indptr[i + 1]
-        ]
-        data[indptr[i] : indptr[i + 1] - new_entries[i], :, :] = matrix.data[
-            matrix.indptr[i] : matrix.indptr[i + 1], :, :
-        ]
-
-    return sp.bsr_matrix((data, indices, indptr), matrix.shape)
-
-
-@nn.sparse_dense_alter_layout.register(["cuda", "gpu", "rocm"])
-def _alter_sparse_dense_layout(_attrs, inputs, _tinfos, _out_type):
-    """With cuda, we modify use alter_op_layout to swap the default
-    sparse_dense implementation for one that operates on a padded matrix. We
-    also pad the matrix.
-    """
-    import scipy.sparse as sp  # pylint: disable=import-outside-toplevel
-
-    # TODO(ANSHUMAN87): Handle for sparse_lhs case too
-    if (
-        isinstance(inputs[1], relay.Constant)
-        and isinstance(inputs[2], relay.Constant)
-        and isinstance(inputs[3], relay.Constant)
-        and is_valid_for_sparse_dense_padded(inputs[0], inputs[1].data.numpy())
-    ):
-        if len(inputs[1].data.numpy().shape) == 1:
-            sparse_matrix = sp.csr_matrix(
-                (inputs[1].data.numpy(), inputs[2].data.numpy(), inputs[3].data.numpy())
-            ).tobsr()
-        else:
-            sparse_matrix = sp.bsr_matrix(
-                (inputs[1].data.numpy(), inputs[2].data.numpy(), inputs[3].data.numpy())
-            )
-        warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
-        sparse_matrix = pad_sparse_matrix(sparse_matrix, warp_size)
-        return relay.nn._make.sparse_dense_padded(
-            inputs[0],
-            relay.Constant(tvm.nd.array(sparse_matrix.data)),
-            relay.Constant(tvm.nd.array(sparse_matrix.indices)),
-            relay.Constant(tvm.nd.array(sparse_matrix.indptr)),
-        )
-    return None
diff --git a/python/tvm/topi/cuda/sparse_reshape.py b/python/tvm/topi/cuda/sparse_reshape.py
deleted file mode 100644
index 7a796fa42696..000000000000
--- a/python/tvm/topi/cuda/sparse_reshape.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks
-"""Sparse_Reshape operator"""
-import tvm
-from tvm import te
-from ...tir import decl_buffer, ir_builder, Cast
-from ...te import extern, div, floordiv, floormod
-from ..utils import ceil_div
-
-
-def sparse_reshape(
-    sparse_indices,
-    prev_shape,
-    new_shape,
-    new_sparse_indices_shape,
-    new_shape_shape,
-):
-    """
-    Reshape a Sparse Tensor
-    Parameters
-    ----------
-    sparse_indices : relay.Expr
-        A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the
-        number of sparse values and n_dim is the number of dimensions of the dense_shape
-    prev_shape : relay.Expr
-        A 1-D tensor containing the previous shape of the dense tensor
-    new_shape : relay.Expr
-        A 1-D tensor containing the new shape of the dense tensor
-    Returns
-    -------
-    result: relay.Expr
-        Output tensor.
-    Examples
-    --------
-    .. code-block:: python
-
-        sparse_indices = [[0, 0, 0],
-                            [0, 0, 1],
-                            [0, 1, 0],
-                            [1, 0, 0],
-                            [1, 2, 3]]
-        prev_shape = [2, 3, 4]
-        new_shape = [9, -1]
-        new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices,
-                            prev_shape,
-                            new_shape)
-        new_sparse_indices = [[0, 0],
-                              [0, 1],
-                              [1, 2],
-                              [4, 2],
-                              [8, 1]]
-        new_shape = [9, 4]
-    """
-
-    def gen_ir(
-        sparse_indices_ptr,
-        prev_shape_ptr,
-        new_shape_ptr,
-        new_sparse_indices_ptr,
-        out_new_shape_ptr,
-    ):
-        ib = ir_builder.create()
-
-        sparse_indices = ib.buffer_ptr(sparse_indices_ptr)
-        prev_shape = ib.buffer_ptr(prev_shape_ptr)
-
-        new_shape = ib.buffer_ptr(new_shape_ptr)
-        out_new_shape = ib.buffer_ptr(out_new_shape_ptr)
-        new_sparse_indices = ib.buffer_ptr(new_sparse_indices_ptr)
-        out_new_shape = ib.buffer_ptr(out_new_shape_ptr)
-
-        prev_shape_size = prev_shape_ptr.shape[0]
-        new_shape_size = new_shape_ptr.shape[0]
-
-        multipliers = ib.allocate(
-            new_shape_ptr.dtype, (prev_shape_size,), name="multipliers", scope="global"
-        )
-        dividers = ib.allocate(
-            new_shape_ptr.dtype, (new_shape_size,), name="dividers", scope="global"
-        )
-        flattened_indices = ib.allocate(
-            new_shape_ptr.dtype,
-            (sparse_indices_ptr.shape[0],),
-            name="flattened_indices",
-            scope="global",
-        )
-        total_ele = ib.allocate(new_shape_ptr.dtype, (1,), name="total_ele", scope="global")
-        division_total_ele = ib.allocate(
-            new_shape_ptr.dtype, (1,), name="division_total_ele", scope="global"
-        )
-        equal_shape = ib.allocate("bool", (1,), name="equal_shape", scope="global")
-        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-        with ib.new_scope():
-            # The computation in this block is very very miniscule since we are just iterating over
-            # shape tensors which are very small (< 10) and there is no need of parallelization
-            nthread_tx = 1
-            nthread_bx = 1
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-
-            total_ele[0] = prev_shape[0]
-
-            # Cumulative Reverse Exclusive Multiply
-            multipliers[prev_shape_size - 1] = Cast(new_shape_ptr.dtype, 1)
-            with ib.for_range(0, prev_shape_size - 1) as i_:
-                i = i_ + 1
-                multipliers[prev_shape_size - 1 - i] = (
-                    prev_shape[prev_shape_size - i] * multipliers[prev_shape_size - i]
-                )
-                total_ele[0] *= prev_shape[prev_shape_size - i]
-
-            division_total_ele[0] = Cast(new_shape_ptr.dtype, 1)
-            with ib.for_range(0, new_shape_size) as i:
-                with ib.if_scope(new_shape[i] != -1):
-                    division_total_ele[0] *= new_shape[i]
-
-            # Compute true output shape (replace negative ones)
-            with ib.for_range(0, new_shape_size) as i:
-                with ib.if_scope(new_shape[i] == -1):
-                    out_new_shape[i] = Cast(
-                        new_shape_ptr.dtype, div(total_ele[0], division_total_ele[0])
-                    )
-                with ib.else_scope():
-                    out_new_shape[i] = new_shape[i]
-
-            # Check if prev_shape and new_shape are equal
-            equal_shape[0] = True
-            with ib.if_scope(prev_shape_size == new_shape_size):
-                with ib.for_range(0, prev_shape_size) as i:
-                    with ib.if_scope(prev_shape[i] != out_new_shape[i]):
-                        equal_shape[0] = False
-            with ib.else_scope():
-                equal_shape[0] = False
-
-        with ib.new_scope():
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(sparse_indices_ptr.shape[0], max_threads)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-
-            row_number = bx * max_threads + tx
-
-            # Return same inputs if shapes are equal
-            with ib.if_scope(equal_shape[0]):
-                with ib.if_scope(row_number < sparse_indices_ptr.shape[0]):
-                    with ib.for_range(0, sparse_indices_ptr.shape[1]) as j:
-                        new_sparse_indices[row_number, j] = sparse_indices[row_number, j]
-
-            # Else compute new_sparse_indices
-            with ib.else_scope():
-                dividers[new_shape_size - 1] = Cast(new_shape_ptr.dtype, 1)
-                with ib.for_range(0, new_shape_size - 1) as i_:
-                    i = i_ + 1
-                    dividers[new_shape_size - 1 - i] = (
-                        dividers[new_shape_size - i] * out_new_shape[new_shape_size - i]
-                    )
-
-                with ib.if_scope(row_number < sparse_indices_ptr.shape[0]):
-                    flattened_indices[row_number] = Cast(new_shape_ptr.dtype, 0)
-                    with ib.for_range(0, sparse_indices_ptr.shape[1]) as j:
-                        flattened_indices[row_number] += (
-                            sparse_indices[row_number, j] * multipliers[j]
-                        )
-
-                with ib.if_scope(row_number < sparse_indices_ptr.shape[0]):
-                    current_element = ib.allocate(
-                        new_shape_ptr.dtype, (1,), name="current_element", scope="local"
-                    )
-                    current_element[0] = flattened_indices[row_number]
-
-                    with ib.for_range(0, new_sparse_indices_ptr.shape[1]) as j:
-                        new_sparse_indices[row_number, j] = Cast(
-                            sparse_indices_ptr.dtype, floordiv(current_element[0], dividers[j])
-                        )
-                        current_element[0] = floormod(current_element[0], dividers[j])
-
-        return ib.get()
-
-    new_sparse_indices_buf = decl_buffer(
-        new_sparse_indices_shape, sparse_indices.dtype, "new_sparse_indices_buf"
-    )
-    new_shape_buf = decl_buffer(new_shape_shape, prev_shape.dtype, "new_shape_buf")
-
-    return extern(
-        [new_sparse_indices_shape, new_shape_shape],
-        [sparse_indices, prev_shape, new_shape],
-        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0], outs[1]),
-        out_buffers=[new_sparse_indices_buf, new_shape_buf],
-        name="sparse_reshape_cuda",
-        tag="sparse_reshape_cuda",
-    )
diff --git a/python/tvm/topi/cuda/ssd/__init__.py b/python/tvm/topi/cuda/ssd/__init__.py
deleted file mode 100644
index 1ac388da9a1e..000000000000
--- a/python/tvm/topi/cuda/ssd/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""VISION network operators"""
-from __future__ import absolute_import as _abs
-
-from .multibox import *
diff --git a/python/tvm/topi/cuda/ssd/multibox.py b/python/tvm/topi/cuda/ssd/multibox.py
deleted file mode 100644
index e401547db474..000000000000
--- a/python/tvm/topi/cuda/ssd/multibox.py
+++ /dev/null
@@ -1,618 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args
-"""SSD multibox operators"""
-import math
-import tvm
-from tvm import te
-from tvm.tir import if_then_else, exp
-
-from tvm import topi
-
-from ..nms import non_max_suppression
-
-
-def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
-    """Low level IR routing for multibox_prior operator.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input data buffer.
-
-    out : Buffer
-        Output buffer.
-
-    sizes : tuple of float
-        Tuple of sizes for anchor boxes.
-
-    ratios : tuple of float
-        Tuple of ratios for anchor boxes.
-
-    steps : Tuple of float
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tuple of int
-        Priorbox center offsets, y and x respectively.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    max_threads = int(math.sqrt(tvm.target.Target.current(allow_none=False).max_num_threads))
-    tx = te.thread_axis("threadIdx.x")
-    ty = te.thread_axis("threadIdx.y")
-    bx = te.thread_axis("blockIdx.x")
-    by = te.thread_axis("blockIdx.y")
-    ib = tvm.tir.ir_builder.create()
-    p_out = ib.buffer_ptr(out)
-    in_height = data.shape[2]
-    in_width = data.shape[3]
-    nthread_tx = max_threads
-    nthread_bx = in_height // max_threads + 1
-    nthread_ty = max_threads
-    nthread_by = in_width // max_threads + 1
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(ty, "thread_extent", nthread_ty)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    ib.scope_attr(by, "thread_extent", nthread_by)
-
-    num_sizes = len(sizes)
-    num_ratios = len(ratios)
-    size_ratio_concat = sizes + ratios
-    steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
-    steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
-    offset_h = offsets[0]
-    offset_w = offsets[1]
-
-    i = bx * max_threads + tx
-    j = by * max_threads + ty
-    with ib.if_scope((i < in_height)):
-        with ib.if_scope((j < in_width)):
-            center_h = (i + offset_h) * steps_h
-            center_w = (j + offset_w) * steps_w
-
-            for k in range(num_sizes + num_ratios - 1):
-                w = if_then_else(
-                    k < num_sizes,
-                    float(size_ratio_concat[k]) * in_height / in_width / 2.0,
-                    float(size_ratio_concat[0])
-                    * in_height
-                    / in_width
-                    * math.sqrt(size_ratio_concat[k + 1])
-                    / 2.0,
-                )
-                h = if_then_else(
-                    k < num_sizes,
-                    size_ratio_concat[k] / 2.0,
-                    size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0,
-                )
-                count = (
-                    i * in_width * (num_sizes + num_ratios - 1)
-                    + j * (num_sizes + num_ratios - 1)
-                    + k
-                ) * 4
-                p_out[count] = center_w - w
-                p_out[count + 1] = center_h - h
-                p_out[count + 2] = center_w + w
-                p_out[count + 3] = center_h + h
-
-    body = ib.get()
-    return body
-
-
-def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False):
-    """Generate prior(anchor) boxes from data, sizes and ratios.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]]
-
-    sizes : tuple of float
-        Tuple of sizes for anchor boxes.
-
-    ratios : tuple of float
-        Tuple of ratios for anchor boxes.
-
-    steps : Tuple of float
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tuple of int
-        Priorbox center offsets, y and x respectively.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
-    """
-    num_sizes = len(sizes)
-    num_ratios = len(ratios)
-    oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
-    out = te.extern(
-        oshape,
-        [data],
-        lambda ins, outs: multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets),
-        tag="multibox_prior",
-    )
-    if clip:
-        out = topi.clip(out, 0, 1)
-    return out
-
-
-def transform_loc_pre(
-    cls_prob,
-    valid_count,
-    temp_valid_count,
-    temp_cls_id,
-    temp_score,
-    threshold,
-    keep_background,
-):
-    """Low level IR routing for transform location data preparation.
-
-    Parameters
-    ----------
-    cls_prob : Buffer
-        Buffer of class probabilities.
-
-    valid_count : Buffer
-        Buffer of number of valid output boxes.
-
-    temp_valid_count : Buffer
-        Output intermediate result buffer
-
-    temp_cls_id : Buffer
-        Output intermediate result buffer
-
-    temp_score : Buffer
-        Output buffer
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    keep_background : int
-        1 to keep background, 0 to remove it.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = cls_prob.shape[0]
-    num_classes = cls_prob.shape[1]
-    num_anchors = cls_prob.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    cls_prob = ib.buffer_ptr(cls_prob)
-    cls_id = ib.buffer_ptr(temp_cls_id)
-    valid_count = ib.buffer_ptr(valid_count)
-    temp_valid_count = ib.buffer_ptr(temp_valid_count)
-    score = ib.buffer_ptr(temp_score)
-
-    threshold = tvm.tir.FloatImm("float32", threshold)
-    keep_background = tvm.tir.IntImm("int8", keep_background)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = (batch_size * num_anchors) // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    with ib.if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors)
-        j = idxm(tid, num_anchors)
-        valid_count[i] = 0
-        score[tid] = -1.0
-        cls_id[tid] = 0
-        with ib.for_range(0, num_classes) as k:
-            with ib.if_scope(tvm.tir.any(keep_background == 1, k > 0)):
-                temp = cls_prob[i * num_classes * num_anchors + k * num_anchors + j]
-                cls_id[tid] = if_then_else(temp > score[tid], k, cls_id[tid])
-                score[tid] = tvm.te.max(temp, score[tid])
-        with ib.if_scope(tvm.tir.all(cls_id[tid] > 0, score[tid] < threshold)):
-            cls_id[tid] = 0
-        with ib.if_scope(tvm.tir.any(keep_background == 1, cls_id[tid] > 0)):
-            temp_valid_count[tid] = 1
-        with ib.else_scope():
-            temp_valid_count[tid] = 0
-
-        with ib.if_scope(tid < batch_size):
-            with ib.for_range(0, num_anchors) as k:
-                with ib.if_scope(k > 0):
-                    temp_valid_count[tid * num_anchors + k] += temp_valid_count[
-                        tid * num_anchors + k - 1
-                    ]
-            valid_count[tid] = temp_valid_count[tid * num_anchors + num_anchors - 1]
-
-    return ib.get()
-
-
-def transform_loc_ir(
-    loc_pred,
-    anchor,
-    temp_valid_count,
-    temp_cls_id,
-    temp_score,
-    out,
-    clip,
-    variances,
-    batch_size,
-    num_anchors,
-    keep_background,
-):
-    """Low level IR routing for transform location in multibox_detection operator.
-
-    Parameters
-    ----------
-    loc_pred : Buffer
-        Buffer of location regression predictions.
-
-    anchor : Buffer
-        Buffer of prior anchor boxes.
-
-    temp_valid_count : Buffer
-        Intermediate result buffer.
-
-    temp_cls_id : Buffer
-        Intermediate result buffer.
-
-    temp_score : Buffer
-        Input buffer which stores intermediate results.
-
-    out : Buffer
-        Output buffer.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    batch_size : int
-        Batch size
-
-    num_anchors : int
-        Number of anchors
-
-    keep_background : int
-        1 to keep background, 0 to remove it.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
-        """Transform prior anchor box to output box through location predictions."""
-        al = anchor[anchor_base_idx]
-        at = anchor[anchor_base_idx + 1]
-        ar = anchor[anchor_base_idx + 2]
-        ab = anchor[anchor_base_idx + 3]
-        aw = ar - al
-        ah = ab - at
-        ax = (al + ar) / 2.0
-        ay = (at + ab) / 2.0
-        px = loc[loc_base_idx]
-        py = loc[loc_base_idx + 1]
-        pw = loc[loc_base_idx + 2]
-        ph = loc[loc_base_idx + 3]
-        ox = px * vx * aw + ax
-        oy = py * vy * ah + ay
-        ow = exp(pw * vw) * aw / 2.0
-        oh = exp(ph * vh) * ah / 2.0
-        return (
-            tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, ox - ow)), ox - ow),
-            tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, oy - oh)), oy - oh),
-            tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, ox + ow)), ox + ow),
-            tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, oy + oh)), oy + oh),
-        )
-
-    ib = tvm.tir.ir_builder.create()
-
-    loc_pred = ib.buffer_ptr(loc_pred)
-    anchor = ib.buffer_ptr(anchor)
-    temp_valid_count = ib.buffer_ptr(temp_valid_count)
-    cls_id = ib.buffer_ptr(temp_cls_id)
-    score = ib.buffer_ptr(temp_score)
-    out_loc = ib.buffer_ptr(out)
-
-    keep_background = tvm.tir.IntImm("int8", keep_background)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = (batch_size * num_anchors) // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    with ib.if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors)
-        j = idxm(tid, num_anchors)
-
-        with ib.if_scope(tvm.tir.any(keep_background == 1, cls_id[tid] > 0)):
-            with ib.if_scope(j == 0):
-                out_base_idx = i * num_anchors * 6
-                out_loc[out_base_idx] = (
-                    cls_id[tid] * 1.0 if keep_background == 1 else cls_id[tid] - 1.0
-                )
-                out_loc[out_base_idx + 1] = score[tid]
-                (
-                    out_loc[out_base_idx + 2],
-                    out_loc[out_base_idx + 3],
-                    out_loc[out_base_idx + 4],
-                    out_loc[out_base_idx + 5],
-                ) = transform_loc(
-                    loc_pred,
-                    tid * 4,
-                    anchor,
-                    j * 4,
-                    clip,
-                    variances[0],
-                    variances[1],
-                    variances[2],
-                    variances[3],
-                )
-            with ib.else_scope():
-                out_base_idx = i * num_anchors * 6 + temp_valid_count[tid - 1] * 6
-                out_loc[out_base_idx] = (
-                    cls_id[tid] * 1.0 if keep_background == 1 else cls_id[tid] - 1.0
-                )
-                out_loc[out_base_idx + 1] = score[tid]
-                (
-                    out_loc[out_base_idx + 2],
-                    out_loc[out_base_idx + 3],
-                    out_loc[out_base_idx + 4],
-                    out_loc[out_base_idx + 5],
-                ) = transform_loc(
-                    loc_pred,
-                    tid * 4,
-                    anchor,
-                    j * 4,
-                    clip,
-                    variances[0],
-                    variances[1],
-                    variances[2],
-                    variances[3],
-                )
-
-    return ib.get()
-
-
-def multibox_transform_loc(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    keep_background=False,
-):
-    """Location transformation for multibox detection
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        Class probabilities.
-
-    loc_pred : tvm.te.Tensor
-        Location regression predictions.
-
-    anchor : tvm.te.Tensor
-        Prior anchor boxes.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    keep_background : boolean
-        Whether to keep boxes detected as background or not.
-
-    Returns
-    -------
-    ret : tuple of tvm.te.Tensor composed of
-
-    out : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_anchors, 6)
-
-    valid_count : tvm.te.Tensor
-        1-D tensor with shape (batch_size,), number of valid anchor boxes.
-    """
-    batch_size = cls_prob.shape[0]
-    num_anchors = cls_prob.shape[2]
-    oshape = (batch_size, num_anchors, 6)
-    # Define data alignment for intermediate buffer
-    valid_count_dtype = "int32"
-    out_loc_dtype = loc_pred.dtype
-
-    valid_count_buf = tvm.tir.decl_buffer(
-        (batch_size,), valid_count_dtype, "valid_count_buf", data_alignment=4
-    )
-    loc_pred_buf = tvm.tir.decl_buffer(
-        loc_pred.shape, loc_pred.dtype, "loc_pred_buf", data_alignment=8
-    )
-    anchor_buf = tvm.tir.decl_buffer(anchor.shape, anchor.dtype, "anchor_buf", data_alignment=8)
-
-    temp_valid_count_buf = tvm.tir.decl_buffer(
-        (
-            batch_size,
-            num_anchors,
-        ),
-        valid_count_dtype,
-        "temp_valid_count",
-        data_alignment=8,
-    )
-    temp_cls_id_buf = tvm.tir.decl_buffer(
-        (
-            batch_size,
-            num_anchors,
-        ),
-        valid_count_dtype,
-        "temp_cls_id",
-        data_alignment=8,
-    )
-    temp_score_buf = tvm.tir.decl_buffer(
-        (
-            batch_size,
-            num_anchors,
-        ),
-        cls_prob.dtype,
-        "temp_score",
-        data_alignment=8,
-    )
-
-    valid_count, temp_valid_count, temp_cls_id, temp_score = te.extern(
-        [
-            (batch_size,),
-            (
-                batch_size,
-                num_anchors,
-            ),
-            (
-                batch_size,
-                num_anchors,
-            ),
-            (
-                batch_size,
-                num_anchors,
-            ),
-        ],
-        [cls_prob],
-        lambda ins, outs: transform_loc_pre(
-            ins[0],
-            outs[0],
-            outs[1],
-            outs[2],
-            outs[3],
-            threshold,
-            int(keep_background),
-        ),
-        dtype=[valid_count_dtype, valid_count_dtype, valid_count_dtype, cls_prob.dtype],
-        out_buffers=[valid_count_buf, temp_valid_count_buf, temp_cls_id_buf, temp_score_buf],
-        tag="multibox_transform_loc_phase_one",
-    )
-
-    out_loc = te.extern(
-        [oshape],
-        [loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score],
-        lambda ins, outs: transform_loc_ir(
-            ins[0],
-            ins[1],
-            ins[2],
-            ins[3],
-            ins[4],
-            outs[0],
-            clip,
-            variances,
-            batch_size,
-            num_anchors,
-            int(keep_background),
-        ),
-        in_buffers=[
-            loc_pred_buf,
-            anchor_buf,
-            temp_valid_count_buf,
-            temp_cls_id_buf,
-            temp_score_buf,
-        ],
-        dtype=[out_loc_dtype],
-        tag="multibox_transform_loc",
-    )
-
-    return [out_loc, valid_count]
-
-
-def multibox_detection(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    nms_threshold=0.5,
-    force_suppress=False,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    nms_topk=-1,
-):
-    """Convert multibox detection predictions.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        Class probabilities.
-
-    loc_pred : tvm.te.Tensor
-        Location regression predictions.
-
-    anchor : tvm.te.Tensor
-        Prior anchor boxes.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    nms_threshold : float
-        Non-maximum suppression threshold.
-
-    force_suppress : boolean
-        Whether to suppress all detections regardless of class_id.
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    nms_topk : int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_anchors, 6)
-    """
-    inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances)
-    out = non_max_suppression(
-        inter_out[0],
-        inter_out[1],
-        inter_out[1],
-        max_output_size=-1,
-        iou_threshold=nms_threshold,
-        force_suppress=force_suppress,
-        top_k=nms_topk,
-        return_indices=False,
-    )
-    return out
diff --git a/python/tvm/topi/cuda/tensor_intrin.py b/python/tvm/topi/cuda/tensor_intrin.py
deleted file mode 100644
index 0a504906c053..000000000000
--- a/python/tvm/topi/cuda/tensor_intrin.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unnecessary-lambda, too-many-arguments
-"""Tensor intrinsics on CUDA."""
-import tvm
-from tvm import te
-from ..utils import is_target
-
-
-def dp4a(x_scope="local", y_scope="local", z_scope="local", dtypes=("int8", "int8")):
-    """
-    Int8 dot product reduced by every 4 elements using __dp4a
-
-    Parameters
-    ----------
-    x_scope : str, optional
-        The storage scope of buffer for lhs
-    y_scope : str, optional
-        The storage scope of buffer for rhs
-    z_scope : str, optional
-        The storage scope of buffer for result
-    dtypes:  tuple of strs, optional
-        The dtype of x and y
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The dp4a TensorIntrin that can be used in tensorizing schedule.
-    """
-
-    n = 4  # dp4a requires operands packed by 4
-    result_dtype = "int32" if dtypes[1] == "int8" else "uint32"
-
-    x = te.placeholder((n,), name="x", dtype=dtypes[0])
-    y = te.placeholder((n,), name="y", dtype=dtypes[1])
-
-    k = te.reduce_axis((0, n), name="rc")
-
-    z = te.compute(
-        (1,), lambda i: te.sum(x[k].astype(result_dtype) * y[k].astype(result_dtype), axis=[k])
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            xx, yy = ins
-            zz = outs[0]
-            zz_dtype = zz.dtype
-
-            if index == 1:
-                return zz.vstore(0, tvm.tir.const(0, zz_dtype))
-
-            ib = tvm.tir.ir_builder.create()
-
-            vec_x_dtype = "int8x4" if xx.dtype == "int8" else "uint8x4"
-            vec_y_dtype = "int8x4" if yy.dtype == "int8" else "uint8x4"
-
-            vec_x = xx.vload(0, dtype=vec_x_dtype)
-            vec_y = yy.vload(0, dtype=vec_y_dtype)
-            prev_z = 0 if index == 0 else zz.vload(0)
-
-            if is_target("rocm"):
-                # TODO(masahi): Here we are assuming that we are compiling for gfx10 or later
-                # We can refine the specification for dot product on rocm if needed later.
-
-                # We can just use "llvm.amdgcn.udot4" for u8u8u32, but it is not tested.
-                assert (
-                    dtypes[0] == "int8" and dtypes[0] == "int8"
-                ), "u8u8u32 dot product for rocm not supported yet"
-
-                new_z = tvm.tir.call_llvm_pure_intrin(
-                    zz_dtype,
-                    "llvm.amdgcn.sdot4",
-                    tvm.tir.const(4, "uint32"),
-                    tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x),
-                    tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y),
-                    prev_z,
-                    True,
-                )
-            else:
-                new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z)
-
-            ib.emit(zz.vstore(0, new_z))
-
-            return ib.get()
-
-        return _instr(0), _instr(1), _instr(2)  # body, reset, update
-
-    default_buffer_params = {"data_alignment": 4, "offset_factor": 1}
-    scopes = {x: x_scope, y: y_scope, z: z_scope}
-    binds = {
-        t: tvm.tir.decl_buffer(
-            t.shape, t.dtype, t.op.name, scope=scopes[t], **default_buffer_params
-        )
-        for t in [x, y, z]
-    }
-
-    return te.decl_tensor_intrin(
-        z.op, _intrin_func, binds=binds, default_buffer_params=default_buffer_params
-    )
-
-
-def intrin_wmma_load_matrix_A(strides_dst, strides_from, shape, layout, A_shape, C_shape, in_dtype):
-    """Intrin function for loading data from shared memory to wmma.matrix_a"""
-    wmma_m, wmma_n, wmma_k = shape
-
-    A = te.placeholder(A_shape, name="A", dtype=in_dtype)
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, scope="shared", strides=strides_from, data_alignment=32, offset_factor=8
-    )
-    C = te.compute(C_shape, lambda *i: A(*i), name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape,
-        C.dtype,
-        scope="wmma.matrix_a",
-        strides=strides_dst,
-        data_alignment=32,
-        offset_factor=8,
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        row = wmma_m * wmma_k
-        warp_index = BC.elem_offset // row + BC.elem_offset % row // wmma_k
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_load_matrix_sync",
-                BC.data,
-                wmma_m,
-                wmma_n,
-                wmma_k,
-                warp_index,
-                BA.access_ptr("r"),
-                strides_from[0],
-                layout,
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-def intrin_wmma_load_matrix_W(strides_dst, strides_from, shape, layout, A_shape, C_shape, in_dtype):
-    """Intrin function for loading data from shared memory to wmma.matrix_b"""
-    wmma_m, wmma_n, wmma_k = shape
-
-    A = te.placeholder(A_shape, name="A", dtype=in_dtype)
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, scope="shared", strides=strides_from, data_alignment=32, offset_factor=8
-    )
-    C = te.compute(C_shape, lambda *i: A(*i), name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape,
-        C.dtype,
-        scope="wmma.matrix_b",
-        strides=strides_dst,
-        data_alignment=32,
-        offset_factor=8,
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        row = wmma_n * wmma_k
-        warp_index = BC.elem_offset // row + BC.elem_offset % row // wmma_n
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_load_matrix_sync",
-                BC.data,
-                wmma_m,
-                wmma_n,
-                wmma_k,
-                warp_index,
-                BA.access_ptr("r"),
-                strides_from[0],
-                layout,
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-def intrin_wmma_store_matrix(strides_dst, strides_from, shape, out_dtype, A_shape, C_shape):
-    """Intrin function for storing the results from wmma.accumulator to shared"""
-    wmma_m, wmma_n, wmma_k = shape
-    A = te.placeholder(A_shape, name="A", dtype=out_dtype)
-    BA = tvm.tir.decl_buffer(
-        A.shape,
-        A.dtype,
-        scope="wmma.accumulator",
-        strides=strides_from,
-        data_alignment=32,
-        offset_factor=8,
-    )
-    C = te.compute(C_shape, lambda *i: A(*i), name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape, C.dtype, scope="shared", strides=strides_dst, data_alignment=32, offset_factor=8
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        row = wmma_m * wmma_n
-        warp_index = BA.elem_offset // row + BA.elem_offset % row // wmma_n
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_store_matrix_sync",
-                BA.data,
-                wmma_m,
-                wmma_n,
-                wmma_k,
-                warp_index,
-                BC.access_ptr("w"),
-                strides_dst[0],
-                "row_major",
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-def intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, strides_A, strides_W, strides_Conv, shape):
-    """Intrin for wmma fill_fragment and mma_sync
-
-    Parameters
-    ----------
-    AL_gemm : tvm.te.placeholder
-        wmma matrix A
-    WL_gemm : tvm.te.placeholder
-        wmma matrix B
-    CL_compute : tvm.te.compute
-        The definition of wmma gemm
-    """
-    wmma_m, wmma_n, wmma_k = shape
-    A = AL_gemm
-    B = WL_gemm
-    C = CL_compute
-
-    BA = tvm.tir.decl_buffer(
-        A.shape,
-        A.dtype,
-        name="BA",
-        scope="wmma.matrix_a",
-        data_alignment=32,
-        offset_factor=8,
-        strides=strides_A,
-    )
-    BB = tvm.tir.decl_buffer(
-        B.shape,
-        B.dtype,
-        name="BB",
-        scope="wmma.matrix_b",
-        data_alignment=32,
-        offset_factor=8,
-        strides=strides_W,
-    )
-    BC = tvm.tir.decl_buffer(
-        C.shape,
-        C.dtype,
-        name="BC",
-        scope="wmma.accumulator",
-        data_alignment=32,
-        offset_factor=8,
-        strides=strides_Conv,
-    )
-
-    def intrin_func(ins, outs):
-        BA, BB = ins
-        (BC,) = outs
-
-        def warp_idnex(offset, row, col):
-            row = row * col
-            return offset // row + offset % row // col
-
-        warp_index_A = warp_idnex(BA.elem_offset, wmma_m, wmma_k)
-        warp_index_B = warp_idnex(BB.elem_offset, wmma_k, wmma_n)
-        warp_index_C = warp_idnex(BC.elem_offset, wmma_m, wmma_n)
-
-        def init():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_intrin(
-                    "handle",
-                    "tir.tvm_fill_fragment",
-                    BC.data,
-                    wmma_m,
-                    wmma_n,
-                    wmma_k,
-                    warp_index_C,
-                    0.0,
-                )
-            )
-            return ib.get()
-
-        def update():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_intrin(
-                    "handle",
-                    "tir.tvm_mma_sync",
-                    BC.data,
-                    warp_index_C,
-                    BA.data,
-                    warp_index_A,
-                    BB.data,
-                    warp_index_B,
-                    BC.data,
-                    warp_index_C,
-                )
-            )
-            return ib.get()
-
-        return update(), init(), update()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
deleted file mode 100644
index dbbf9e74903c..000000000000
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""Tensorcore alter op and legalize functions for cuda backend"""
-
-import logging
-import math
-from tvm import relay, tir
-
-from .. import nn
-
-logger = logging.getLogger("topi")
-
-
-@nn.batch_matmul_legalize.register("cuda")
-def _batch_matmul_legalize(attrs, inputs, arg_types):
-    """Legalizes batch_matmul op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    arg_types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Collect the input tensors.
-    x_tensor, y_tensor = arg_types[0], arg_types[1]
-    dtype = x_tensor.dtype
-
-    if attrs.transpose_a:
-        B, K, M = x_tensor.shape
-    else:
-        B, M, K = x_tensor.shape
-
-    if attrs.transpose_b:
-        B, N, K = y_tensor.shape
-    else:
-        B, K, N = y_tensor.shape
-
-    # Collect the output tensor.
-    output_tensor = arg_types[2]
-
-    # Collect the input exprs.
-    x, y = inputs
-
-    if (
-        isinstance(B, tir.expr.Any)
-        or isinstance(M, tir.expr.Any)
-        or isinstance(K, tir.expr.Any)
-        or isinstance(N, tir.expr.Any)
-    ):
-        # Dynamic shape do not support alter op layout now
-        return None
-
-    M = M.value
-    K = K.value
-    N = N.value
-
-    # Pad input and output channels to use tensorcore schedule.
-    if dtype in ["float16", "int8", "uint8"]:
-        # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)
-        if (
-            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
-            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
-            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
-        ):
-            # no need to pad
-            return None
-        candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-    elif dtype in ["int4", "uint4"]:
-        if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
-            # no need to pad
-            return None
-
-        candidates = [(8, 32, 8)]
-    else:
-        return None
-
-    (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
-
-    if extra_flops > 2:
-        logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops)
-        return None
-
-    logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
-
-    if attrs.transpose_a:
-        pad_width = ((0, 0), (0, dk), (0, dm))
-    else:
-        pad_width = ((0, 0), (0, dm), (0, dk))
-
-    x_ = relay.nn.pad(x, pad_width=pad_width) if dm or dk else x
-
-    if attrs.transpose_b:
-        pad_width = ((0, 0), (0, dn), (0, dk))
-    else:
-        pad_width = ((0, 0), (0, dk), (0, dn))
-
-    y_ = relay.nn.pad(y, pad_width=pad_width) if dn or dk else y
-
-    out_ = relay.nn.batch_matmul(x_, y_, **attrs)
-
-    out = (
-        relay.strided_slice(out_, begin=[0, 0, 0], end=[x.value for x in output_tensor.shape])
-        if dm or dn
-        else out_
-    )
-    return out
-
-
-@nn.dense_legalize.register("cuda")
-def _dense_legalize(attrs, inputs, arg_types):
-    """Legalizes dense op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    # Collect the input tensors.
-    x_tensor, y_tensor = arg_types[0], arg_types[1]
-    dtype = x_tensor.dtype
-
-    # Collect the output tensor.
-    output_tensor = arg_types[2]
-
-    # Collect the input exprs.
-    x, y = inputs
-
-    M, K = x_tensor.shape
-    N, K = y_tensor.shape
-    try:
-        M = M.value
-        K = K.value
-        N = N.value
-    except AttributeError:
-        # todo: deal with unfixed shape when compiling wdl model
-        return None
-
-    # Pad input and output channels to use tensorcore schedule.
-    if dtype in ["float16", "int8", "uint8"]:
-        # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)
-        if (
-            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
-            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
-            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
-        ):
-            # no need to pad
-            return None
-
-        candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-    elif dtype in ["int4", "uint4"]:
-        if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
-            # no need to pad
-            return None
-        candidates = [(8, 32, 8)]
-    else:
-        return None
-
-    (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates)
-    skip_pad = extra_flops_ratio > 2
-
-    if skip_pad and dtype in ["int8", "uint8"]:
-        skip_pad = False
-        # If tensorcore schedule padding fails, pad to nearest upward 4x4x4 as long as
-        # the additional flops ratio isn't double or more.
-        # Note that 4x4x4 is invalid for tensorcore scheduling, but padding upwards to 4x4x4
-        # doesn't hurt if tensorcore padding has already failed.
-        if M % 4 == 0 and K % 4 == 0 and N % 4 == 0:
-            # No need to pad
-            return None
-        (dm, dk, dn) = _pad_to(M, K, N, (4, 4, 4))
-        extra_flops_ratio = _extra_flops(M, K, N, dm, dk, dn) / (M * K * N)
-        skip_pad = extra_flops_ratio > 2
-
-    if skip_pad:
-        logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio)
-        return None
-
-    logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio)
-
-    x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) if dm or dk else x
-    y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk))) if dn or dk else y
-
-    # If units is explicitly specified, it is used to compute the output shape.
-    # We need to update units after padding to prevent a type error.
-    if attrs["units"] is not None:
-        new_attrs["units"] = N + dn
-
-    out_ = relay.nn.dense(x_, y_, **new_attrs)
-    out = (
-        relay.strided_slice(out_, begin=[0, 0], end=[x.value for x in output_tensor.shape])
-        if dm or dn
-        else out_
-    )
-    return out
-
-
-def pad_to_tensorcore(M, K, N, candidates):
-    """pad shape to enable tensorcore"""
-    flops = M * K * N
-    extra_flops = math.inf
-    best_pad = (0, 0, 0)
-    for padding in candidates:
-        dm, dk, dn = _pad_to(M, K, N, padding)
-        e = _extra_flops(M, K, N, dm, dk, dn)
-        # print(dm, dk, dn, e, flops)
-        if e < extra_flops:
-            extra_flops = e
-            best_pad = (dm, dk, dn)
-    return best_pad, extra_flops / flops
-
-
-def _extra_flops(M, K, N, dm, dk, dn):
-    return (M + dm) * (N + dn) * (K + dk) - M * N * K
-
-
-def _pad_to(M, K, N, PADDING):
-    dm, dk, dn = 0, 0, 0
-
-    if M % PADDING[0] != 0:
-        M_ = ((M + PADDING[0]) // PADDING[0]) * PADDING[0]
-        dm = M_ - M
-    if K % PADDING[1] != 0:
-        K_ = ((K + PADDING[1]) // PADDING[1]) * PADDING[1]
-        dk = K_ - K
-    if N % PADDING[2] != 0:
-        N_ = ((N + PADDING[2]) // PADDING[2]) * PADDING[2]
-        dn = N_ - N
-
-    return dm, dk, dn
diff --git a/python/tvm/topi/cuda/transform.py b/python/tvm/topi/cuda/transform.py
deleted file mode 100644
index 16b1273def47..000000000000
--- a/python/tvm/topi/cuda/transform.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""CUDA implementations of transforms"""
-import tvm
-from ... import te
-from ...target import Target
-from ..utils import traverse_inline
-
-
-def schedule_transpose(outs):
-    """Schedule a unfused transpose"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    schedule_transpose_from_existing(s, outs[0])
-    return s
-
-
-def schedule_transpose_from_existing(s, out):
-    """Schedule for transpose on the gpu.
-
-    Roughly follows this:
-    https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/, but
-    without the padding for shared memory. For better performance, we could
-    rewrite it in tir to add the padding. Also, rewriting in tir would allow
-    use to use warp shuffles instead of shared memory (see
-    https://github.com/bryancatanzaro/trove).
-    """
-
-    def _callback(op):
-        # pylint: disable=invalid-name
-        m, n = s[op].op.axis
-        warp_size = int(Target.current(allow_none=False).thread_warp_size)
-        no, ni = s[op].split(n, factor=warp_size)
-        mo, mi = s[op].split(m, factor=warp_size)
-        s[op].reorder(mo, no, mi, ni)
-        s[op].bind(mo, te.thread_axis("blockIdx.x"))
-        s[op].bind(no, te.thread_axis("blockIdx.y"))
-        c = s.cache_read(op.input_tensors[0], "shared", op)
-        s[c].compute_at(s[op], no)
-        thread_x = te.thread_axis("threadIdx.x")
-        thread_y = te.thread_axis("threadIdx.y")
-        s[op].bind(ni, thread_x)
-        # This is a hack to make the scheduling language realize that this axis
-        # can be scheduled.
-        a, _ = s[c].split(s[c].op.axis[1], factor=1)
-        s[c].bind(a, thread_x)
-        # Use 4 warps per block. Slightly faster than 1 warp per block
-        ao, _ = s[op].split(mi, nparts=4)
-        s[op].bind(ao, thread_y)
-        ao, _ = s[c].split(s[c].op.axis[0], nparts=4)
-        s[c].bind(ao, thread_y)
-
-    traverse_inline(s, out.op, _callback)
-
-
-def _invert_permutation_ir(data, out):
-    """Low level IR to get invert_permutation.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input data. 1-D Buffer with shape [elem_num].
-
-    out : Buffer
-        1D buffer for invert permutation result with the same shape with data.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    elem_num = data.shape[0]
-
-    irb = tvm.tir.ir_builder.create()
-    data = irb.buffer_ptr(data)
-    out = irb.buffer_ptr(out)
-
-    max_threads = int(Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = elem_num // max_threads + 1
-    thread_x = te.thread_axis("threadIdx.x")
-    block_x = te.thread_axis("blockIdx.x")
-    irb.scope_attr(thread_x, "thread_extent", nthread_tx)
-    irb.scope_attr(block_x, "thread_extent", nthread_bx)
-    tid = block_x * max_threads + thread_x
-
-    with irb.if_scope(tid < elem_num):
-        r_ind = data[tid]
-        out[r_ind] = tid
-    return irb.get()
-
-
-def invert_permutation(data):
-    """Compute definition of invert_permutation.
-    For an output tensor y and an input tensor x, this operation computes the following:
-
-       y[x[i]] = i for i in [0, 1, ..., len(x) - 1]
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        1-D tensor
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-    """
-    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8)
-
-    out = te.extern(
-        [data.shape],
-        [data],
-        lambda ins, outs: _invert_permutation_ir(ins[0], outs[0]),
-        in_buffers=[
-            data_buf,
-        ],
-        out_buffers=[
-            out_buf,
-        ],
-        name="invert_permutation",
-        tag="invert_permutation_gpu",
-    )
-    return out
diff --git a/python/tvm/topi/cuda/unique.py b/python/tvm/topi/cuda/unique.py
deleted file mode 100644
index 9b16bd69b7d0..000000000000
--- a/python/tvm/topi/cuda/unique.py
+++ /dev/null
@@ -1,419 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Unique operator"""
-import tvm
-from tvm import te, tir
-from ...te import hybrid
-from .scan import cumsum
-from .sort import sort, argsort
-from ..utils import ceil_div
-
-
-def _get_max_threads(batch_size):
-    target = tvm.target.Target.current()
-    max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-    if "vulkan" in str(target) and not isinstance(batch_size, tvm.tir.IntImm):
-        # SPIR-V does not support dynamic thread group size
-        return max_threads
-    return tir.min(batch_size, max_threads)
-
-
-def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
-    """Low level IR to calculate adjacent difference in an 1-D array.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input 1-D Buffer.
-
-    output: Buffer
-        A buffer to store adjacent difference, of the same shape as data. The adjacent difference
-        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
-        where i > 0 and i < len(data).
-
-    binop: function, optional
-        A binary associative op to use for calculating adjacent difference. The function takes two
-        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
-        compute the adjacent difference.
-    """
-    ib = tir.ir_builder.create()
-    data_ptr = ib.buffer_ptr(data)
-    output_ptr = ib.buffer_ptr(output)
-    batch_size = data.shape[0]
-    max_threads = _get_max_threads(batch_size)
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(batch_size, max_threads)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < batch_size):
-            with ib.if_scope(tid == 0):
-                output_ptr[tid] = 0
-            with ib.else_scope():
-                output_ptr[tid] = tir.Cast(output.dtype, binop(data_ptr[tid], data_ptr[tid - 1]))
-    return ib.get()
-
-
-def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub):
-    """Function calculate adjacent difference in an 1-D array.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        Input 1-D tensor.
-
-    output_dtype : str
-        The output tensor data type.
-
-    binop: function, optional
-        A binary associative op to use for calculating difference. The function takes two
-        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
-        compute the adjacent difference.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        1-D tensor storing the adjacent difference of the input tensor. The adjacent difference
-        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
-        where i > 0 and i < len(data).
-    """
-    data_buf = tir.decl_buffer(data.shape, data.dtype, "sorted_data_buf", data_alignment=8)
-    output_buf = tir.decl_buffer(data.shape, out_dtype, "output_buf", data_alignment=8)
-    return te.extern(
-        [data.shape],
-        [data],
-        lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0], binop=binop),
-        dtype=[out_dtype],
-        in_buffers=[data_buf],
-        out_buffers=[output_buf],
-        name="_calc_adjacent_diff",
-        tag="_calc_adjacent_diff_gpu",
-    )
-
-
-@hybrid.script
-def _calc_num_unique(inc_scan):
-    """Helper function to get the number of unique elements fron inc_scan tensor"""
-    output = output_tensor((1,), "int32")
-    for i in bind("threadIdx.x", 1):
-        output[i] = inc_scan[inc_scan.shape[0] - 1] + int32(1)
-    return output
-
-
-def _calc_unique_ir(
-    data, argsorted_indices, inc_scan, index_converter, unique_elements, inverse_indices, counts
-):
-    """Low level IR to calculate unique elements, inverse indices, and counts (optional) of
-    unique elements of 1-D array.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input 1-D Buffer.
-
-    argsorted_indices : Buffer
-        A buffer that stores the argsorted indices of the input data.
-
-    inc_scan : Buffer
-        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
-        of the sorted data.
-
-    index_converter (optional) : Buffer
-        An optional index converter that transforms the unique element index
-        such that new_idx = index_converter[old_idx].
-
-    unique_elements : Buffer
-        A buffer that stores the unique elements.
-
-    inverse_indices : Buffer
-        A buffer that stores the index of each input data element in the unique element array.
-
-    counts (optional) : Buffer
-        A buffer that stores the count of each unique element.
-    """
-    ib = tir.ir_builder.create()
-    data_ptr = ib.buffer_ptr(data)
-    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
-    inc_scan_ptr = ib.buffer_ptr(inc_scan)
-    unique_elements_ptr = ib.buffer_ptr(unique_elements)
-    inverse_indices_ptr = ib.buffer_ptr(inverse_indices)
-
-    index_converter_ptr = None
-    if isinstance(index_converter, tir.Buffer):
-        index_converter_ptr = ib.buffer_ptr(index_converter)
-
-    if isinstance(counts, tir.Buffer):
-        counts_ptr = ib.buffer_ptr(counts)
-        # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1]
-        unique_seq_indices_ptr = ib.buffer_ptr(inverse_indices)
-
-    batch_size = data.shape[0]
-    max_threads = _get_max_threads(batch_size)
-
-    # if need to return counts
-    if isinstance(counts, tir.Buffer):
-        num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1
-        num_elements = data.shape[0]
-        with ib.new_scope():
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(batch_size, max_threads)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            tid = bx * max_threads + tx
-            with ib.if_scope(tid < batch_size):
-                with ib.if_scope(tid == 0):
-                    unique_seq_indices_ptr[num_unique - 1] = num_elements
-                with ib.else_scope():
-                    with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]):
-                        unique_seq_indices_ptr[inc_scan_ptr[tid] - 1] = tid
-        with ib.new_scope():
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(batch_size, max_threads)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            tid = bx * max_threads + tx
-            with ib.if_scope(tid < num_unique):
-                unique_idx = tid if not index_converter_ptr else index_converter_ptr[tid]
-                with ib.if_scope(tid == 0):
-                    counts_ptr[unique_idx] = unique_seq_indices_ptr[tid]
-                with ib.else_scope():
-                    counts_ptr[unique_idx] = (
-                        unique_seq_indices_ptr[tid] - unique_seq_indices_ptr[tid - 1]
-                    )
-    # calculate unique elements and inverse indices
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(batch_size, max_threads)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < batch_size):
-            data_idx = argsorted_indices_ptr[tid]
-            unique_idx = (
-                inc_scan_ptr[tid]
-                if not index_converter_ptr
-                else index_converter_ptr[inc_scan_ptr[tid]]
-            )
-            inverse_indices_ptr[data_idx] = unique_idx
-            with ib.if_scope(tid == 0):
-                unique_elements_ptr[unique_idx] = data_ptr[data_idx]
-            with ib.else_scope():
-                with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]):
-                    unique_elements_ptr[unique_idx] = data_ptr[data_idx]
-    return ib.get()
-
-
-def _calc_first_occurence_ir(argsorted_indices, inc_scan, first_occurence):
-    """Low level IR to calculate the first occurence of each unique element in the input data.
-
-    Parameters
-    ----------
-    argsorted_indices : Buffer
-        A buffer that stores the argsorted indices of the input data.
-
-    inc_scan : Buffer
-        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
-        of the sorted data.
-
-    first_occurence : Buffer
-        A buffer that stores the first occurence of each unique element in the input data.
-    """
-    ib = tir.ir_builder.create()
-    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
-    inc_scan_ptr = ib.buffer_ptr(inc_scan)
-    first_occurence_ptr = ib.buffer_ptr(first_occurence)
-    batch_size = argsorted_indices.shape[0]
-    max_threads = _get_max_threads(batch_size)
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(batch_size, max_threads)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < batch_size):
-            first_occurence_ptr[tid] = batch_size
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(batch_size, max_threads)
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * max_threads + tx
-        with ib.if_scope(tid < batch_size):
-            with ib.if_scope(tid == 0):
-                first_occurence_ptr[inc_scan_ptr[tid]] = argsorted_indices_ptr[tid]
-            with ib.else_scope():
-                with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]):
-                    first_occurence_ptr[inc_scan_ptr[tid]] = argsorted_indices_ptr[tid]
-    return ib.get()
-
-
-def unique(data, is_sorted=True, return_counts=False):
-    """
-    Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
-    have the same length of `data` and element with index >= num_unique[0] has undefined value.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        A 1-D tensor of integers.
-
-    sorted : bool
-        Whether to sort the unique elements in ascending order before returning as output.
-
-    return_counts : bool
-        Whether to return the count of each unique element.
-
-    Returns
-    -------
-    unique : tvm.te.Tensor
-        A 1-D tensor containing the unique elements of the input data tensor. The same size as
-        the input data. If there are less unique elements than input data, the end of the tensor
-        is padded with zeros.
-
-    indices : tvm.te.Tensor
-        A 1-D tensor. The same size as output. For each entry in output, it contains
-        the index of its first occurence in the input data. The end of the tensor is padded
-        with the length of the input data.
-
-    inverse_indices : tvm.te.Tensor
-        A 1-D tensor. For each entry in data, it contains the index of that data element in the
-        unique array. (Note that inverse_indices is very similar to indices if output is not
-        sorted)
-
-    num_unique : tvm.te.Tensor
-        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
-
-    counts (optional) : tvm.te.Tensor
-        A 1-D tensor containing the count of each unique element in the output.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False)
-        output          =  [4, 5, 1, 2, 3, ?, ?, ?]
-        indices         =  [0, 1, 2, 3, 4, ?, ?, ?]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-
-        [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True)
-        output          =  [4, 5, 1, 2, 3, ?, ?, ?]
-        indices         =  [0, 1, 2, 3, 4, ?, ?, ?]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-        counts          =  [2, 2, 1, 1, 2, ?, ?, ?]
-
-        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
-        output          =  [1, 2, 3, 4, 5, ?, ?, ?]
-        indices         =  [2, 3, 4, 0, 1, ?, ?, ?]
-        inverse_indices =  [3, 4, 0, 1, 2, 2, 3, 4]
-        num_unique      =  [5]
-    """
-    sorted_data = sort(data)
-    argsorted_indices = argsort(data, dtype="int32")
-    # adjacent difference
-    adjacent_diff = _calc_adjacent_diff(sorted_data, out_dtype="int32", binop=tir.NE)
-    # inclusive scan
-    inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0)
-    # total number of unique elements
-    num_unique_elements = _calc_num_unique(inc_scan)
-    # buffers
-    data_buf = tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    argsorted_indices_buf = tir.decl_buffer(
-        data.shape, "int32", "argsorted_indices_buf", data_alignment=8
-    )
-    inc_scan_buf = tvm.tir.decl_buffer(data.shape, "int32", "inc_scan_buf", data_alignment=8)
-    unique_elements_buf = tir.decl_buffer(
-        data.shape, data.dtype, "unique_elements_buf", data_alignment=8
-    )
-    inverse_indices_buf = tvm.tir.decl_buffer(
-        data.shape, "int32", "inverse_indices_buf", data_alignment=8
-    )
-    # prepare outputs
-    if return_counts:
-        counts_buf = tir.decl_buffer(data.shape, "int32", "counts_buf", data_alignment=8)
-        out_data_shape = [data.shape] * 3
-        out_buffers = [unique_elements_buf, inverse_indices_buf, counts_buf]
-        out_dtypes = [data.dtype, "int32", "int32"]
-    else:
-        out_data_shape = [data.shape] * 2
-        out_buffers = [unique_elements_buf, inverse_indices_buf]
-        out_dtypes = [data.dtype, "int32"]
-    # prepare inputs and fcompute
-    # calculate first occurence
-    first_occurence_buf = tir.decl_buffer(
-        data.shape, "int32", "first_occurence_buf", data_alignment=8
-    )
-    first_occurence = te.extern(
-        [data.shape],
-        [argsorted_indices, inc_scan],
-        lambda ins, outs: _calc_first_occurence_ir(ins[0], ins[1], outs[0]),
-        dtype=["int32"],
-        in_buffers=[argsorted_indices_buf, inc_scan_buf],
-        out_buffers=[first_occurence_buf],
-        name="_calc_first_occurence",
-        tag="_calc_first_occurence_gpu",
-    )
-    if is_sorted:
-        in_data = [data, argsorted_indices, inc_scan]
-        in_buffers = [data_buf, argsorted_indices_buf, inc_scan_buf]
-        if return_counts:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs)
-        else:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None)
-        indices = first_occurence
-    else:
-        # calculate index converter by sorting unique elements by their first occurence
-        argsorted_first_occurence = argsort(first_occurence, dtype="int32")
-        index_converter = argsort(argsorted_first_occurence, dtype="int32")
-        index_converter_buf = tir.decl_buffer(
-            data.shape, "int32", "index_converter_buf", data_alignment=8
-        )
-        in_data = [data, argsorted_indices, inc_scan, index_converter]
-        in_buffers = [data_buf, argsorted_indices_buf, inc_scan_buf, index_converter_buf]
-        if return_counts:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs)
-        else:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None)
-        indices = sort(first_occurence)
-    outs = te.extern(
-        out_data_shape,
-        in_data,
-        fcompute,
-        dtype=out_dtypes,
-        in_buffers=in_buffers,
-        out_buffers=out_buffers,
-        name="_calc_unique",
-        tag="_calc_unique_gpu",
-    )
-    if return_counts:
-        return [outs[0], indices, outs[1], num_unique_elements, outs[2]]
-    return [outs[0], indices, outs[1], num_unique_elements]
diff --git a/python/tvm/topi/cuda/vision.py b/python/tvm/topi/cuda/vision.py
deleted file mode 100644
index 5208aeccd413..000000000000
--- a/python/tvm/topi/cuda/vision.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, no-member, import-outside-toplevel
-"""Schedule for vision operators"""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-from .. import cpp
-from .. import tag
-from .pooling import schedule_pool
-from .injective import schedule_injective_from_existing
-
-
-def _default_schedule(outs):
-    """Default schedule for gpu."""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        if tag.is_injective(op.tag) or op.tag in ["bbox_score", "sorted_bbox"]:
-            schedule_injective_from_existing(s, op.output(0))
-        for tensor in op.input_tensors:
-            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                traverse(tensor.op)
-        scheduled_ops.append(op)
-
-    for o in outs:
-        traverse(o.op)
-
-    return s
-
-
-def schedule_reorg(outs):
-    """Schedule for reorg operator.
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of reorg
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for reorg.
-    """
-    target = tvm.target.Target.current(allow_none=False)
-    cpp_target = cpp.TEST_create_target(target.kind.name)
-    return cpp.cuda.schedule_injective(cpp_target, outs)
-
-
-def schedule_nms(outs):
-    """Schedule for non-maximum suppression
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of nms
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs)
-
-
-def schedule_multibox_prior(outs):
-    """Schedule for multibox_prior operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of multibox_prior
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for multibox_prior.
-    """
-    return _default_schedule(outs)
-
-
-def schedule_multibox_transform_loc(outs):
-    """Schedule for multibox_transform_loc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of
-      multibox_transform_loc in the format
-      of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs)
-
-
-def schedule_multibox_detection(outs):
-    """Schedule for multibox_detection operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of multibox_detection
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for multibox_detection.
-    """
-    return _default_schedule(outs)
-
-
-def schedule_roi_align(outs):
-    return schedule_pool(outs, "NCHW")
-
-
-def schedule_roi_pool(outs):
-    return schedule_pool(outs, "NCHW")
-
-
-def schedule_proposal(outs):
-    """Schedule for proposal operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of proposal
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs)
-
-
-def schedule_get_valid_counts(outs):
-    """Schedule for get_valid_counts operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of get_valid_counts
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs)
diff --git a/python/tvm/topi/generic/__init__.py b/python/tvm/topi/generic/__init__.py
deleted file mode 100644
index 021f9a1bbe1d..000000000000
--- a/python/tvm/topi/generic/__init__.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Generic declaration and schedules.
-
-This is a recommended way of using TOPI API.
-To use the generic schedule function, user must set
-the current target scope using with block. See also :any:`tvm.target`
-
-Example
--------
-.. code-block:: python
-
-  # create schedule that dispatches to topi.cuda.schedule_injective
-  with tvm.target.Target("cuda"):
-    s = tvm.tir.generic.schedule_injective(outs)
-"""
-from __future__ import absolute_import as _abs
-
-from .nn import *
-from .injective import *
-from .extern import *
-from .vision import *
-from .sort import *
-from .search import *
-from .image import *
-from .math import *
diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py
deleted file mode 100644
index 189bdf9cbd7c..000000000000
--- a/python/tvm/topi/generic/conv2d.py
+++ /dev/null
@@ -1,592 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, too-many-locals
-# pylint: disable=unused-argument, redefined-builtin
-"""Generic convolution schedules"""
-from tvm import te
-from tvm import autotvm
-from tvm import relay
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-from ..utils import get_const_tuple, traverse_inline
-from ..nn.utils import get_pad_tuple
-
-
-def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements):
-    """Fallback schedule for conv2d int8 on cpu.
-    Normally the inner most pattern takes two int8/uint8 tensors
-    data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
-    produces a dot product int32/uint32 output[int32_lanes].
-
-    Parameters
-    ----------
-    int32_lanes : int
-        How many numbers of int32/uint32 will be produced using intrinsic.
-        This is related to output channel.
-    num_int8_elements : int
-        How many numbers of input int32/uint32 will be multiplied and reduced.
-        This is related to input channel.
-    """
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
-    HSTR, WSTR = wkl.stride_h, wkl.stride_w
-    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
-    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
-
-    assert (
-        wkl.out_filter % int32_lanes == 0
-    ), f"wkl.out_filter={wkl.out_filter}, int32_lanes={int32_lanes}"
-    assert (
-        wkl.in_filter % num_int8_elements == 0
-    ), f"wkl.in_filter={wkl.in_filter}, num_int8_elements={num_int8_elements}"
-
-    oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -4):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements):
-    """Fallback schedule for 1x1 conv2d int8 on cpu.
-    Normally the inner most pattern takes two int8/uint8 tensors
-    data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
-    produces a dot product int32/uint32 output[int32_lanes].
-
-    Parameters
-    ----------
-    int32_lanes : int
-        How many numbers of int32/uint32 will be produced using intrinsic.
-        This is related to output channel.
-    num_int8_elements : int
-        How many numbers of input int32/uint32 will be multiplied and reduced.
-        This is related to input channel.
-    """
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
-    HSTR, WSTR = wkl.stride_h, wkl.stride_w
-    out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1
-    out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1
-
-    assert (
-        wkl.out_filter % int32_lanes == 0
-    ), f"wkl.out_filter={wkl.out_filter}, int32_lanes={int32_lanes}"
-    assert (
-        wkl.in_filter % num_int8_elements == 0
-    ), f"wkl.in_filter={wkl.in_filter}, num_int8_elements={num_int8_elements}"
-
-    oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -4):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-
-    for ow_factor in range(out_width, 0, -1):
-        if out_width % ow_factor == 0:
-            for oh_factor in range(out_height, 0, -1):
-                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
-                    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-                    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
-                    cfg["tile_ow"] = SplitEntity([out_width // ow_factor, ow_factor])
-                    return
-    raise ValueError(f"cannot decide default schedule for workload: {wkl}")
-
-
-def schedule_conv_NCHWc_cpu_common_int8(
-    s,
-    cfg,
-    data_vec,
-    kernel_vec,
-    conv_out,
-    last,
-    int32_lanes=16,
-    int8_elems=4,
-    intrin=None,
-    inline_fused=True,
-    mem_scope="global",
-):
-    """
-    Defines the schedule for INT8 for Intel and ARM machines
-    Uses the Intel/ARM intrinsics to use INT8 operations
-    More details - https://software.intel.com/en-us/articles/
-    lower-numerical-precision-deep-learning-inference-and-training
-    """
-    if isinstance(cfg["tile_ow"], int):
-        reg_n = cfg["tile_ow"]
-    else:
-        reg_n = cfg["tile_ow"].size[-1]
-
-    if isinstance(cfg["unroll_kw"], (int, bool)):
-        unroll_kw = cfg["unroll_kw"]
-    else:
-        unroll_kw = cfg["unroll_kw"].val
-
-    _, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
-    _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
-
-    # schedule pad
-    if isinstance(s[data_vec].op, te.tensor.ComputeOp) and "pad" in data_vec.op.tag:
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-        data_vec = data_vec.op.input_tensors[0]
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # only in autotuning, input data of conv2d_NCHWc will be 4-D.
-        # skip this part during tuning to make records accurate.
-        # this part will be folded during Relay fold_constant pass.
-        if isinstance(data_vec.op, te.tensor.ComputeOp):
-            s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
-        if isinstance(kernel_vec.op, te.tensor.ComputeOp):
-            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
-    elif isinstance(kernel_vec.op, te.tensor.ComputeOp) and kernel_vec.name == "kernel_vec":
-        # data and kernel are not pre-computed, schedule layout transform here.
-        # this should only be used by x86 conv2d_nchw, which is for
-        # testing purpose.
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-
-        # conv2d_nchwc_int8 has 7D kernel
-        oc_chunk, ic_chunk, oh, ow, ic_block, oc_block, _ = s[kernel_vec].op.axis
-        s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-        oc_bn = cfg["tile_oc"].size[-1]
-        if oc_bn > 1:
-            s[kernel_vec].vectorize(oc_block)
-        parallel_axis = s[kernel_vec].fuse(oc_chunk, oh)
-        s[kernel_vec].parallel(parallel_axis)
-
-    # schedule 5-D NCHW[x]c conv
-    C, O = conv_out, last
-    CC = s.cache_write(C, mem_scope)
-
-    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
-    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-    parallel_axis = s[C].fuse(batch, oc_chunk, oh)
-    s[C].vectorize(oc_block)
-    if C == O:
-        s[C].parallel(parallel_axis)
-
-    s[CC].compute_at(s[C], parallel_axis)
-    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
-
-    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
-
-    assert oc_bn % int32_lanes == 0, f"oc_bn={oc_bn} % int32_lanes={int32_lanes} != 0"
-    assert (
-        ic_bn % int8_elems == 0
-    ), f"ic_bn={ic_bn} % int8_elems={int8_elems} != 0"  # (u)int8 elements in (u)int32
-
-    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
-
-    if unroll_kw:
-        s[CC].reorder(
-            oc_chunk,
-            oh,
-            ow_chunk,
-            ic_outer,
-            kh,
-            ic_f_inner,
-            kw,
-            ow_block,
-            oc_f_inner,
-            oc_s_inner,
-            ic_s_inner,
-        )
-        s[CC].unroll(kw)
-    else:
-        s[CC].reorder(
-            oc_chunk,
-            oh,
-            ow_chunk,
-            ic_outer,
-            kh,
-            kw,
-            ic_f_inner,
-            ow_block,
-            oc_f_inner,
-            oc_s_inner,
-            ic_s_inner,
-        )
-
-    if intrin is not None:
-        s[CC].tensorize(oc_s_inner, intrin)
-    s[CC].unroll(ow_block)
-    s[CC].unroll(oc_f_inner)
-
-    if C != O:
-        out_ndim = len(s[O].op.axis)
-        if out_ndim == 5:
-            batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-            ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-            s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-        elif out_ndim == 4:
-            batch, oc, oh, ow = s[O].op.axis
-            ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-            oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-            s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-        else:
-            raise ValueError(f"Unsupported output ndim: {out_ndim}")
-        parallel_axis = s[O].fuse(batch, oc_chunk, oh)
-        if inline_fused:
-            s[C].compute_at(s[O], ow_block)
-        else:
-            s[C].compute_at(s[O], parallel_axis)
-        s[O].vectorize(oc_block)
-        s[O].parallel(parallel_axis)
-
-    return s
-
-
-def schedule_conv_NCHWc_cpu_1x1_int8(
-    s,
-    cfg,
-    data_vec,
-    kernel_vec,
-    conv_out,
-    last,
-    int32_lanes=16,
-    int8_elems=4,
-    intrin=None,
-    inline_fused=False,
-    mem_scope="global",
-):
-    """
-    Defines the 1x1 conv schedule for INT8 for Intel and ARM machines
-    Uses the Intel/ARM intrinsics to use INT8 operations
-    More details - https://software.intel.com/en-us/articles/
-    lower-numerical-precision-deep-learning-inference-and-training
-    """
-    oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
-    _, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
-    _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
-
-    # schedule pad
-    if isinstance(s[data_vec].op, te.tensor.ComputeOp) and "pad" in data_vec.op.tag:
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-        data_vec = data_vec.op.input_tensors[0]
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # only in autotuning, input data of conv2d_NCHWc will be 4-D.
-        # skip this part during tuning to make records accurate.
-        # this part will be folded during Relay fold_constant pass.
-        if isinstance(data_vec.op, te.tensor.ComputeOp):
-            s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
-        if isinstance(kernel_vec.op, te.tensor.ComputeOp):
-            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
-    elif isinstance(kernel_vec.op, te.tensor.ComputeOp) and kernel_vec.name == "kernel_vec":
-        # data and kernel are not pre-computed, schedule layout transform here.
-        # this should only be used by x86 conv2d_nchw, which is for
-        # testing purpose.
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-
-        # Conv2d int8 schedule has 7D kernel
-        oc_chunk, ic_chunk, oh, ow, ic_block, oc_block, _ = s[kernel_vec].op.axis
-        s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-        oc_bn = cfg["tile_oc"].size[-1]
-        if oc_bn > 1:
-            s[kernel_vec].vectorize(oc_block)
-        parallel_axis = s[kernel_vec].fuse(oc_chunk, oh)
-        s[kernel_vec].parallel(parallel_axis)
-
-    C, O = conv_out, last
-    CC = s.cache_write(C, mem_scope)
-
-    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
-    ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
-    s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
-    s[C].vectorize(oc_block)
-
-    parallel_axis = s[C].fuse(batch, oc_chunk, oh_outer)
-    if C == O:
-        s[C].parallel(parallel_axis)
-    s[CC].compute_at(s[C], parallel_axis)  # good perf on mobilenet, but not on individuals?
-
-    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
-
-    assert oc_bn % int32_lanes == 0
-    assert ic_bn % int8_elems == 0  # (u)int8 elements in (u)int32
-
-    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
-
-    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
-
-    s[CC].reorder(
-        oc_chunk,
-        oh_outer,
-        ow_outer,
-        kh,
-        kw,
-        ic_outer,
-        ic_f_inner,
-        oh_inner,
-        ow_inner,
-        oc_f_inner,
-        oc_s_inner,
-        ic_s_inner,
-    )
-    s[CC].fuse(oc_chunk, oh_outer)
-
-    if intrin is not None:
-        s[CC].tensorize(oc_s_inner, intrin)
-    s[CC].unroll(ow_inner)
-    s[CC].unroll(oh_inner)
-
-    if C != O:
-        out_ndim = len(s[O].op.axis)
-        if out_ndim == 5:
-            batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-            oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
-            ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
-        elif out_ndim == 4:
-            batch, oc, oh, ow = s[O].op.axis
-            oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-            oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
-            ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
-        else:
-            raise ValueError(f"Unsupported output ndim: {out_ndim}")
-
-        s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
-        parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer)
-        if inline_fused:
-            s[C].compute_at(s[O], ow_inner)
-        else:
-            s[C].compute_at(s[O], parallel_axis)
-        s[O].vectorize(oc_block)
-        s[O].parallel(parallel_axis)
-
-    return s
-
-
-def schedule_depthwise_conv2d_nhwc(outs):
-    """Create schedule for depthwise conv2d in NHWC layout.
-    Parameters
-    ----------
-    outs : list[te.tensor.Tensor]
-            The output tensors.
-    Returns
-    -------
-    s : tvm.te.schedule.Schedule
-        The computation schedule for depthwise conv2d.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """Traverse operators from computation graph"""
-        if "depthwise_conv2d_nhwc" in op.tag:
-            out = outs[0]
-            depthwise_conv2d_out = op.output(0)
-            data_pad = depthwise_conv2d_out.op.input_tensors[0]
-            s[data_pad].compute_inline()
-            if depthwise_conv2d_out != out:
-                s[depthwise_conv2d_out].compute_at(s[out], s[out].op.axis[3])
-            s[out].fuse(*s[out].op.axis)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def conv2d_alter_int8_common(
-    data,
-    data_tensor,
-    kernel,
-    kernel_tensor,
-    output_tensor,
-    attrs,
-    data_dtype: str,
-    in_channel_vector_length: int,
-    out_channel_vector_length: int,
-):
-    """
-    Convert TE inputs/outputs so that they are suitable for fast Int8 instructions.
-
-    Int8 instructions require input channels and output channels to be a
-    multiple of the vector length. For input channels, we pad both the inputs
-    and weights channels. For output channels, we pad the weight and
-    stride_slice the output.
-
-    Arguments
-    ---------
-    data: Expr
-        Data Expr
-    data_tensor: Tensor
-        Data tensor
-    kernel: Expr
-        Kernel Expr
-    kernel_tensor: Tensor
-        Kernel tensor
-    output_tensor: Tensor
-        Output tensor
-    attrs: Conv2dAttrs
-        Attributes of the computation
-    data_dtype: "int8" or "uint8"
-        Desired dtype of data. Data will be converted to this dtype before the main computation.
-    in_channel_vector_length: int
-        Length of vector units on target hardware. Input channels are padded to this length.
-    out_channel_vector_length: int
-        Output size of vector instruction. Output channels are padded to this length.
-
-    Returns
-    -------
-    out : Tensor
-        Conv2d computation with inputs in the correct order for tensorization.
-    """
-    # Dilation not supported yet. Return None if dilation is not (1, 1)
-    dilation = attrs.get_int_tuple("dilation")
-    if not (dilation[0] == 1 and dilation[1] == 1):
-        return None
-
-    # No legalization for depthwise convolutions yet.
-    groups = attrs.get_int("groups")
-    if groups != 1:
-        return None
-
-    # Get the conv attrs
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    padding = attrs.get_int_tuple("padding")
-    kh, kw = attrs.get_int_tuple("kernel_size")
-    pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw))
-
-    if data_tensor.dtype != data_dtype:
-        # How to convert data to uint8
-        # Original --> C = A (conv) B
-        # A and B are int8
-        #   C = (A + 128 - 128) (conv) B
-        #   C = (A' conv B) - 128 (conv) B
-        # where A' = A + 128
-        # and 128 (conv) B is basically a reduce on CRS axis for weights.
-        #
-        # How to convert data to int8
-        #   C = (A - 128 + 128) (conv) B
-        #   C = (A' conv B) + 128 (conv) B
-        # where A' = A - 128
-        if data_dtype == "uint8":
-            # shift data to uint8
-            before_shift = relay.add
-            after_shift = relay.subtract
-            pad_value = 128
-        else:
-            # shift data to int8
-            before_shift = relay.subtract
-            after_shift = relay.add
-            pad_value = -128
-
-        if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
-            adjust_shift = relay.sum(relay.cast(kernel, dtype="int32"), axis=(0, 1, 2))
-            pad_width = ((0, 0), (pt, pb), (pl, pr), (0, 0))
-        elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
-            pad_width = ((0, 0), (0, 0), (pt, pb), (pl, pr))
-            adjust_shift = relay.sum(relay.cast(kernel, dtype="int32"), axis=(1, 2, 3))
-            adjust_shift = relay.expand_dims(adjust_shift, axis=1, num_newaxis=2)
-        else:
-            return None
-
-        data = relay.cast(data, "int32")
-        data = before_shift(data, relay.const(128, "int32"))
-        data = relay.cast(data, data_dtype)
-
-        # Do external padding as pad value has to be 128.
-        if any(padding):
-            data = relay.nn.pad(data, pad_width=pad_width, pad_value=pad_value)
-
-        new_attrs["padding"] = (0, 0)
-
-        # Multiply 128 to adjust shift.
-        adjust_shift = relay.multiply(adjust_shift, relay.const(128, "int32"))
-
-    # Flags to remember if the expr is modified
-    ic_modified = False
-    oc_modified = False
-
-    # Find the value of input and output channel.
-    in_channel = -1
-    out_channel = -1
-    if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
-        in_channel = data_tensor.shape[3].value
-        out_channel = kernel_tensor.shape[3].value
-    elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
-        in_channel = data_tensor.shape[1].value
-        out_channel = kernel_tensor.shape[0].value
-    else:
-        return None
-
-    if in_channel % in_channel_vector_length != 0:
-        new_in_channel = (
-            (in_channel + in_channel_vector_length) // in_channel_vector_length
-        ) * in_channel_vector_length
-        diff = new_in_channel - in_channel
-        if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
-            data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, diff)))
-            kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, diff), (0, 0)))
-            ic_modified = True
-        elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
-            pad_width = ((0, 0), (0, diff), (0, 0), (0, 0))
-            data = relay.nn.pad(data, pad_width=pad_width)
-            kernel = relay.nn.pad(kernel, pad_width=pad_width)
-            ic_modified = True
-        else:
-            return None
-
-    new_out_channel = out_channel
-    if out_channel % out_channel_vector_length != 0:
-        new_out_channel = (
-            (out_channel + out_channel_vector_length) // out_channel_vector_length
-        ) * out_channel_vector_length
-        diff = new_out_channel - out_channel
-        if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
-            kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, diff)))
-            oc_modified = True
-        elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
-            kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0), (0, 0), (0, 0)))
-            oc_modified = True
-        else:
-            return None
-
-    if oc_modified:
-        new_attrs["channels"] = new_out_channel
-        out = relay.nn.conv2d(data, kernel, **new_attrs)
-        original_out_shape = [x.value for x in output_tensor.shape]
-        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
-    else:
-        out = relay.nn.conv2d(data, kernel, **new_attrs)
-
-    if data_tensor.dtype != data_dtype:
-        out = after_shift(out, adjust_shift)
-
-    return out
diff --git a/python/tvm/topi/generic/default.py b/python/tvm/topi/generic/default.py
deleted file mode 100644
index 65f24019de15..000000000000
--- a/python/tvm/topi/generic/default.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-argument
-"""The default schedule used by various operators"""
-import tvm
-from tvm import te
-
-
-def default_schedule(outs, auto_inline):
-    """Default schedule for llvm."""
-    target = tvm.target.Target.current(allow_none=False)
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    if target.kind.name not in ("llvm", "c"):
-        raise RuntimeError(f"schedule not registered for '{target}'")
-    s = te.create_schedule([x.op for x in outs])
-    if auto_inline:
-        x = outs[0]
-        te.schedule.AutoInlineInjective(s)
-        s[x].fuse(s[x].op.axis)
-    return s
diff --git a/python/tvm/topi/generic/extern.py b/python/tvm/topi/generic/extern.py
deleted file mode 100644
index cd6fd7ae284c..000000000000
--- a/python/tvm/topi/generic/extern.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""generic declaration and schedules."""
-import tvm
-from .. import cpp
-
-
-def schedule_extern(outs):
-    """Schedule for an extern op followed by injective operations.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of extern plus injective ops in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    target = tvm.target.Target.current()
-    return cpp.generic.schedule_extern(target, outs)
diff --git a/python/tvm/topi/generic/image.py b/python/tvm/topi/generic/image.py
deleted file mode 100644
index 44d134de9921..000000000000
--- a/python/tvm/topi/generic/image.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Generic image operators"""
-from .default import default_schedule as _default_schedule
-
-
-def schedule_dilation2d_nchw(outs):
-    """Schedule for dilation2d
-    Parameters
-    ----------
-    outs : Array of Tensor
-        The computation graph description of dilation2d
-        in the format of an array of tensors.
-    Returns
-    -------
-    sch : Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_dilation2d_nhwc(outs):
-    """Schedule for dilation2d
-    Parameters
-    ----------
-    outs : Array of Tensor
-        The computation graph description of dilation2d
-        in the format of an array of tensors.
-    Returns
-    -------
-    sch : Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/generic/injective.py b/python/tvm/topi/generic/injective.py
deleted file mode 100644
index 00c35b22b6c8..000000000000
--- a/python/tvm/topi/generic/injective.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""generic declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import te
-
-
-def schedule_injective_from_existing(sch, out):
-    """Schedule for injective op from existing schedule.
-
-    Parameters
-    ----------
-    sch: Schedule
-         The schedule to update.
-    out: Tensor
-         The tensor representing the injective op.
-
-    Returns
-    -------
-    sch: Schedule
-         The updated schedule.
-    """
-    sch[out].fuse(*sch[out].op.axis)
-    return sch
-
-
-def schedule_injective(outs):
-    """Schedule for injective op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    target = tvm.target.Target.current(allow_none=False)
-    if target.kind.name != "llvm":
-        raise RuntimeError(f"schedule_injective not registered for '{target}'")
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    x = outs[0]
-    s = te.create_schedule([x.op for x in outs])
-    te.schedule.AutoInlineInjective(s)
-    schedule_injective_from_existing(s, x)
-    return s
-
-
-schedule_elemwise = schedule_injective
-schedule_broadcast = schedule_injective
diff --git a/python/tvm/topi/generic/math.py b/python/tvm/topi/generic/math.py
deleted file mode 100644
index 3af6cd16a374..000000000000
--- a/python/tvm/topi/generic/math.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Generic math operators"""
-from .default import default_schedule as _default_schedule
-
-
-def schedule_einsum(outs):
-    """Schedule for einsum operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of einsum.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py
deleted file mode 100644
index a3da7a395151..000000000000
--- a/python/tvm/topi/generic/nn.py
+++ /dev/null
@@ -1,933 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-argument
-"""Generic nn operators"""
-from tvm import te
-from .default import default_schedule as _default_schedule
-
-
-def schedule_conv1d_ncw(outs):
-    """Schedule for conv1d_ncw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv1d_ncw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv1d_nwc(outs):
-    """Schedule for conv1d_nwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv1d_nwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv1d_ncw(outs):
-    """Schedule for group_conv1d_ncw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of group_conv1d_ncw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv1d_nwc(outs):
-    """Schedule for group_conv1d_nwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of group_conv1d_nwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_hwcn(outs):
-    """Schedule for conv2d_hwcn
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_hwcn
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_nhwc_pack(outs):
-    """Schedule for conv2d_nhwc_pack
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_nhwc_pack
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_nhwc(outs):
-    """Schedule for conv2d_nhwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_NCHWc(outs):
-    """Schedule for conv2d_NCHW[x]c
-
-    Parameters
-    ----------
-    outs : Array of Tensor
-        The computation graph description of conv2d_NCHWc
-        in the format of an array of tensors.
-        The number of filter, i.e., the output channel.
-
-    Returns
-    -------
-    sch : Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_NCHWc_int8(outs):
-    """Schedule for conv2d_NCHW[x]c_int8
-
-    Parameters
-    ----------
-    outs : Array of Tensor
-        The computation graph description of conv2d_NCHWc_int8
-        in the format of an array of tensors.
-        The number of filter, i.e., the output channel.
-
-    Returns
-    -------
-    sch : Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_winograd_weight_transform(outs):
-    """Schedule for weight transformation of winograd
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of this operator
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    # Typically this is computed in PreCompute pass
-    # so we make a schedule here for cpu llvm
-    s = te.create_schedule([x.op for x in outs])
-    output = outs[0]
-    _, G = s[output].op.input_tensors
-    s[G].compute_inline()
-    eps, nu, co, ci = s[output].op.axis
-    r_kh, r_kw = s[output].op.reduce_axis
-    s[output].reorder(co, ci, r_kh, r_kw, eps, nu)
-    for axis in [r_kh, r_kw, eps, nu]:
-        s[output].unroll(axis)
-    s[output].parallel(co)
-    return s
-
-
-def schedule_conv2d_gemm_weight_transform(outs):
-    """Schedule for weight transformation of gemm
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of this operator
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    # Typically this is computed in PreCompute pass
-    s = te.create_schedule([x.op for x in outs])
-    return s
-
-
-def schedule_conv3d_winograd_weight_transform(outs):
-    """Schedule for weight transformation of 3D winograd
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of this operator
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    # Typically this is computed in PreCompute pass
-    # so we make a schedule here for cpu llvm
-    s = te.create_schedule([x.op for x in outs])
-    output = outs[0]
-    _, G = s[output].op.input_tensors
-    s[G].compute_inline()
-    transform_depth = len(s[output].op.reduce_axis) == 3
-    if transform_depth:
-        omg, eps, nu, ci, co = s[output].op.axis
-        r_kd, r_kh, r_kw = s[output].op.reduce_axis
-        s[output].reorder(co, ci, omg, eps, nu, r_kd, r_kh, r_kw)
-        for axis in [r_kd, r_kh, r_kw]:
-            s[output].unroll(axis)
-    else:
-        eps, nu, d, ci, co = s[output].op.axis
-        r_kh, r_kw = s[output].op.reduce_axis
-        s[output].reorder(co, ci, d, eps, nu, r_kh, r_kw)
-        for axis in [r_kh, r_kw]:
-            s[output].unroll(axis)
-    s[output].parallel(co)
-    return s
-
-
-def schedule_conv2d_winograd_without_weight_transform(outs):
-    """Schedule for winograd without weight transformation
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of this operator
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_winograd_nnpack_weight_transform(outs):
-    """Schedule for weight transformation of winograd
-     Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of this operator
-          in the format of an array of tensors.
-     Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    # Typically this is computed in PreCompute pass
-    s = te.create_schedule([x.op for x in outs])
-    return s
-
-
-def schedule_conv3d_ncdhw(outs):
-    """Schedule for conv3d_ncdhw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv3d_ndhwc(outs):
-    """Schedule for conv3d_ndhwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv3d_ndhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv3d_transpose_ncdhw(outs):
-    """Schedule for conv3d_transpose_ncdhw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv3d_transpose_ncdhw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv2d_transpose_nchw(outs):
-    """Schedule for conv2d_transpose_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_transpose_nchw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_conv1d_transpose_ncw(outs):
-    """Schedule for conv1d_transpose_ncw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_transpose_ncw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv1d_transpose_ncw(outs):
-    """Schedule for group_conv1d_transpose_ncw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of group conv1d_transpose_ncw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_depthwise_conv2d_nchw(outs):
-    """Schedule for depthwise_conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of depthwise_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_depthwise_conv2d_nhwc(outs):
-    """Schedule for depthwise_conv2d_nhwc
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of depthwise_conv2d_nhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_depthwise_conv2d_NCHWc(outs):
-    """Schedule for depthwise_conv2d_NCHWc
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of depthwise_conv2d_nhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv2d_nchw(outs):
-    """Schedule for group_conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of group_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv2d_transpose_nchw(outs):
-    """Schedule for group_conv2d_transpose_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of group_conv2d_nhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv3d_transpose_ncdhw(outs):
-    """Schedule for schedule_group_conv3d_transpose_ncdhw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of schedule_group_conv3d_transpose_ncdhw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_group_conv2d_nhwc(outs):
-    """Schedule for group_conv2d_nhwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of group_conv2d_nhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_deformable_conv2d_nchw(outs):
-    """Schedule for deformable_conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of deformable_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_deformable_conv2d_nhwc(outs):
-    """Schedule for deformable_conv2d_nhwc.
-    We only use the default schedule here and rely on auto_scheduler.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of deformable_conv2d_nhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_bitserial_conv2d_nchw(outs):
-    """Schedule for bitserial_conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of bitserial_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_bitserial_conv2d_nhwc(outs):
-    """Schedule for bitserial_conv2d_nhwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of bitserial_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_bitserial_dense(outs):
-    """Schedule for bitserial_dense
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of bitserial_dense
-          in the format of an array of tensors.
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_reduce(outs):
-    """Schedule for reduction
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of reduce
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, True)
-
-
-def schedule_softmax(outs):
-    """Schedule for softmax
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of softmax
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_fast_softmax(outs):
-    """Schedule for fast_softmax
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of fast_softmax
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_matmul(outs):
-    """Schedule for matmul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of matmul
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_dense(outs):
-    """Schedule for dense
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of dense
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_pool(outs, layout):
-    """Schedule for pool
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of pool
-          in the format of an array of tensors.
-
-    layout: str
-        Data layout.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_pool_grad(outs):
-    """Schedule for pool_grad
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of pool
-          in the format of an array of tensors.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_adaptive_pool(outs):
-    """Schedule for adaptive pool
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of adaptive pool
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_binarize_pack(outs):
-    """Schedule for binarize_pack
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of binarize_pack
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_bitpack(outs):
-    """Schedule for bitpack
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of bitpack
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_binary_dense(outs):
-    """Schedule for binary_dense
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of binary_dense
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_lrn(outs):
-    """Schedule for lrn
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of lrn
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_sparse_dense(outs):
-    """Schedule for sparse_dense
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of sparse_dense
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_sparse_transpose(outs):
-    """Schedule for sparse_transpose
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of sparse_transpose
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_sparse_conv2d(outs):
-    """Schedule for sparse_conv2d
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of sparse_conv2d
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_batch_matmul(outs):
-    """Schedule for batch_matmul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of sparse_transpose
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_batch_norm(outs):
-    """Schedule for batch_norm
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of sparse_transpose
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_correlation_nchw(outs):
-    """Schedule for correlation_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of correlation_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_lstm(outs):
-    """Schedule for LSTM
-
-    Parameters
-    ----------
-    outs : Array of Tensor
-        The outputs of LSTM (hidden states and cell states).
-
-    Returns
-    -------
-    sch: Schedule
-        The default schedule for LSTM.
-    """
-    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/generic/search.py b/python/tvm/topi/generic/search.py
deleted file mode 100644
index 9a80e678c212..000000000000
--- a/python/tvm/topi/generic/search.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member
-"""Generic search operators"""
-from __future__ import absolute_import as _abs
-from .default import default_schedule as _default_schedule
-
-
-def schedule_argwhere(outs):
-    """Schedule for argwhere operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of argwhere.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_sparse_fill_empty_rows(outs):
-    return _default_schedule(outs, False)
-
-
-def schedule_unique(outs):
-    """Schedule for unique operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of unique.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/generic/sort.py b/python/tvm/topi/generic/sort.py
deleted file mode 100644
index 65df7a1a2569..000000000000
--- a/python/tvm/topi/generic/sort.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member
-"""Generic sort operators"""
-from __future__ import absolute_import as _abs
-from .default import default_schedule as _default_schedule
-
-
-def schedule_sort(outs):
-    """Schedule for sort operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The indices that would sort an input array along
-      the given axis.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_argsort(outs):
-    """Schedule for argsort operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The indices that would sort an input array along
-      the given axis.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_topk(outs):
-    """Schedule for topk operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The indices that would sort an input array along
-      the given axis.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/generic/vision.py b/python/tvm/topi/generic/vision.py
deleted file mode 100644
index e7518b1110a1..000000000000
--- a/python/tvm/topi/generic/vision.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member
-"""Generic vision operators"""
-from __future__ import absolute_import as _abs
-import tvm
-from .. import cpp
-from .default import default_schedule as _default_schedule
-
-
-def schedule_reorg(outs):
-    """Schedule for reorg
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of reorg
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    target = tvm.target.Target.current(allow_none=False)
-    cpp_target = cpp.TEST_create_target(target.kind.name)
-    return cpp.generic.default_schedule(cpp_target, outs, False)
-
-
-def schedule_get_valid_counts(outs):
-    """Schedule for get_valid_counts
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of nms
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_nms(outs):
-    """Schedule for non-maximum suppression
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of nms
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_multibox_prior(outs):
-    """Schedule for multibox_prior
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of multibox_prior
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_multibox_transform_loc(outs):
-    """Schedule for multibox_transform_loc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of
-      multibox_transform_loc in the format
-      of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_multibox_detection(outs):
-    """Schedule for multibox_detection
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of multibox_detection
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_roi_align(outs):
-    """Schedule for roi_align
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of roi_align
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_roi_pool(outs):
-    """Schedule for roi_align
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of roi_pool
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
-def schedule_proposal(outs):
-    """Schedule for proposal operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-      The computation graph description of proposal
-      in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/gpu/__init__.py b/python/tvm/topi/gpu/__init__.py
index 8ed9362a3cf2..14f1fa3aab58 100644
--- a/python/tvm/topi/gpu/__init__.py
+++ b/python/tvm/topi/gpu/__init__.py
@@ -16,6 +16,6 @@
 # under the License.
 
 # pylint: disable=redefined-builtin, wildcard-import
-"""GPU specific declaration and schedules."""
-from .dense import *
-from .conv2d import *
+"""GPU specific declaration."""
+from .scan import cumsum, cumprod
+from .sort import *
diff --git a/python/tvm/topi/gpu/conv2d.py b/python/tvm/topi/gpu/conv2d.py
deleted file mode 100644
index 87c900e1d4d7..000000000000
--- a/python/tvm/topi/gpu/conv2d.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Schedule for conv2d operator"""
-from tvm import te, autotvm
-
-from .. import nn
-from ..utils import traverse_inline
-from .conv2d_nhwc import schedule_conv2d_nhwc_direct
-
-
-@autotvm.register_topi_compute("conv2d_nhwc.gpu")
-def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
-    """Compute conv2d with NHWC layout"""
-    return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc.gpu")
-def schedule_conv2d_nhwc(cfg, outs):
-    """Create the schedule for conv2d_nhwc"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv2d_nhwc":
-            schedule_conv2d_nhwc_direct(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/gpu/conv2d_nhwc.py b/python/tvm/topi/gpu/conv2d_nhwc.py
deleted file mode 100644
index ff0610394eac..000000000000
--- a/python/tvm/topi/gpu/conv2d_nhwc.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
-"""Direct conv2d in NHWC layout"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import get_const_tuple
-
-
-def schedule_conv2d_nhwc_direct(cfg, s, Conv):
-    """schedule optimized for NHWC direct conv2d"""
-    pad_data, kernel = s[Conv].op.input_tensors
-    s[pad_data].compute_inline()
-
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    if Conv.op in s.outputs:
-        output = Conv
-        OL = s.cache_write(Conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[Conv].set_scope("local")
-        OL = Conv
-    # create cache stage
-    AA = s.cache_read(pad_data, "shared", [OL])
-    WW = s.cache_read(kernel, "shared", [OL])
-    AL = s.cache_read(AA, "local", [OL])
-    WL = s.cache_read(WW, "local", [OL])
-
-    # Currently Conv2d NHWC only support dynamic shpe in batch
-    dynamic_batch = isinstance(s[output].op.axis[0].dom.extent, tvm.tir.expr.Var)
-
-    # Schedule for autotvm
-    cfg.define_knob("tile_n", [1] if dynamic_batch else [2, 4, 8])
-    cfg.define_knob("tile_c", [2, 4, 8])
-    cfg.define_knob("num_thread_n", [1] if dynamic_batch else [4, 8, 16])
-    cfg.define_knob("num_thread_c", [4, 8, 16])
-    cfg.define_knob("vthread_n", [1] if dynamic_batch else [1, 2])
-    cfg.define_knob("vthread_c", [1, 2])
-    cfg.define_knob("step", [16, 3, 32, 64])
-    cfg.define_knob("vectorize", [1, 2, 4, 8])
-
-    # fallback support
-    target = tvm.target.Target.current()
-    if cfg.is_fallback:
-        ref_log = autotvm.tophub.load_reference_log(
-            target.kind.name, target.model, "conv2d_nhwc.gpu"
-        )
-        cfg.fallback_with_reference_log(ref_log)
-
-    tile_n = cfg["tile_n"].val
-    tile_c = cfg["tile_c"].val
-    num_thread_n = cfg["num_thread_n"].val
-    num_thread_c = cfg["num_thread_c"].val
-    vthread_n = cfg["vthread_n"].val
-    vthread_c = cfg["vthread_c"].val
-    step = cfg["step"].val
-    vec_factor = cfg["vectorize"].val
-    block_factor_c = tile_c * num_thread_c * vthread_c
-
-    offset = 8
-    A_align = step + offset
-    W_align = block_factor_c + offset
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis((0, num_thread_c), "threadIdx.x")
-    thread_y = te.thread_axis((0, num_thread_n), "threadIdx.y")
-    thread_xz = te.thread_axis((0, vthread_c), "vthread", name="vx")
-    thread_yz = te.thread_axis((0, vthread_n), "vthread", name="vy")
-
-    # Schedule for output
-    ni, _, wi, fi = s[output].op.axis
-    bx = wi
-    fi, vec = s[output].split(fi, factor=vec_factor)
-    s[output].vectorize(vec)
-    tx, fi = s[output].split(fi, factor=tile_c)
-    txz, tx = s[output].split(tx, factor=num_thread_c)
-    bz, txz = s[output].split(txz, factor=vthread_c)
-    ty, ni = s[output].split(ni, factor=tile_n)
-    tyz, ty = s[output].split(ty, factor=num_thread_n)
-    by, tyz = s[output].split(tyz, factor=vthread_n)
-    s[output].reorder(bx, by, bz, tyz, txz, ty, tx, ni, fi, vec)
-    s[output].bind(bz, block_z)
-    s[output].bind(by, block_y)
-    s[output].bind(bx, block_x)
-    s[output].bind(tyz, thread_yz)
-    s[output].bind(txz, thread_xz)
-    s[output].bind(ty, thread_y)
-    s[output].bind(tx, thread_x)
-    # Schedule local computation
-    s[OL].compute_at(s[output], tx)
-    ni, yi, xi, fi = s[OL].op.axis
-    ry, rx, rc = s[OL].op.reduce_axis
-    rco, rci = s[OL].split(rc, factor=step)
-    s[OL].vectorize(fi)
-    s[OL].reorder(rco, ry, rx, rci, ni, fi)
-
-    s[AA].compute_at(s[OL], rx)
-    s[WW].compute_at(s[OL], rx)
-    s[AL].compute_at(s[OL], rci)
-    s[WL].compute_at(s[OL], rci)
-    # Schedule for data's share memory
-    ni, yi, xi, ci = s[AA].op.axis
-    s[AA].reorder(yi, xi, ni, ci)
-    s[AA].storage_align(xi, A_align - 1, A_align)
-    t = s[AA].fuse(ni, ci)
-    ty, tx = s[AA].split(t, factor=num_thread_c)
-    _, ty = s[AA].split(ty, factor=num_thread_n)
-    s[AA].bind(tx, thread_x)
-    s[AA].bind(ty, thread_y)
-    # Schedule for kernel's share memory
-    _, _, ic, o = s[WW].op.axis
-    t = s[WW].fuse(ic, o)
-    s[WW].storage_align(ic, W_align - 1, W_align)
-    t, vec = s[WW].split(t, factor=vec_factor)
-    s[WW].vectorize(vec)
-    ty, tx = s[WW].split(t, factor=num_thread_c)
-    _, ty = s[WW].split(ty, factor=num_thread_n)
-    s[WW].bind(tx, thread_x)
-    s[WW].bind(ty, thread_y)
-
-    N, OH, OW, CO = get_const_tuple(output.shape)
-    KH, KW, CI, _ = get_const_tuple(kernel.shape)
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
diff --git a/python/tvm/topi/gpu/dense.py b/python/tvm/topi/gpu/dense.py
deleted file mode 100644
index 5f2f36c46bf5..000000000000
--- a/python/tvm/topi/gpu/dense.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name, unused-argument
-"""Schedule for dense operator"""
-
-import logging
-
-from tvm import autotvm, te
-from tvm.autotvm.task.space import SplitEntity
-
-from .. import nn
-from ..utils import traverse_inline, get_const_tuple
-
-logger = logging.getLogger("topi")
-
-
-@autotvm.register_topi_compute("dense_small_batch.gpu")
-def dense_small_batch(cfg, data, weight, bias=None, out_dtype=None):
-    """Dense operator on GPU"""
-    return nn.dense(data, weight, bias, out_dtype)
-
-
-@autotvm.register_topi_schedule("dense_small_batch.gpu")
-def schedule_dense_small_batch(cfg, outs):
-    """Schedule float32/64 dense with small batch size"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "dense":
-            _schedule_dense_small_batch(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("matmul_default.gpu")
-def matmul_default(
-    cfg,
-    tensor_a,
-    tensor_b,
-    bias=None,
-    out_dtype=None,
-    transpose_a=False,
-    transpose_b=False,
-):
-    """Matmul operator on GPU"""
-    return nn.matmul(tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b)
-
-
-@autotvm.register_topi_schedule("matmul_default.gpu")
-def schedule_matmul_default(cfg, outs):
-    """Schedule matmul on GPU"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "matmul":
-            # Temporary use this as a basic schedule for matmul
-            # TODO(jcf94): Add a more general schedule for matmul
-            _schedule_dense_small_batch(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_dense_small_batch(cfg, s, C):
-    A, weights = C.op.input_tensors
-    if len(weights.op.input_tensors) == 1 and weights.op.input_tensors[0] == A:
-        s[weights].compute_inline()
-
-    _, in_dim_weights = get_const_tuple(weights.shape)
-    _, in_dim_A = get_const_tuple(A.shape)
-
-    if isinstance(in_dim_A, int):
-        in_dim = in_dim_A
-    elif isinstance(in_dim_weights, int):
-        in_dim = in_dim_weights
-    else:
-        in_dim = None
-
-    if in_dim is not None:
-        cfg.define_split("tile_k", in_dim, num_outputs=2)
-        if cfg.is_fallback:
-            cfg["tile_k"] = SplitEntity([-1, 64] if in_dim > 64 else [1, 64])
-        _, kf = cfg["tile_k"].apply(s, C, C.op.reduce_axis[0])
-    else:
-        tile_k = 64
-        _, kf = s[C].split(C.op.reduce_axis[0], tile_k)
-
-    CF = s.rfactor(C, kf)
-
-    if C.op in s.outputs:
-        Out = C
-    else:
-        Out = s.outputs[0].output(0)
-        s[C].compute_at(s[Out], s[Out].op.axis[1])
-    s[Out].bind(s[Out].op.axis[0], te.thread_axis("blockIdx.y"))
-    s[Out].bind(s[Out].op.axis[1], te.thread_axis("blockIdx.x"))
-
-    tx = s[C].op.reduce_axis[0]
-    thread_x = te.thread_axis("threadIdx.x")
-    s[C].bind(tx, thread_x)
-    s[CF].compute_at(s[C], tx)
-    s[C].set_store_predicate(thread_x.var.equal(0))
-    s[Out].set_store_predicate(thread_x.var.equal(0))
-
-
-@autotvm.register_topi_compute("dense_large_batch.gpu")
-def dense_large_batch(cfg, data, weight, bias=None, out_dtype=None):
-    """Dense operator on GPU"""
-    return nn.dense(data, weight, bias, out_dtype)
-
-
-@autotvm.register_topi_schedule("dense_large_batch.gpu")
-def schedule_dense_large_batch(cfg, outs):
-    """Schedule float32/64 dense with large batch size"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "dense":
-            _schedule_dense_large_batch(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_dense_large_batch(cfg, s, C):
-    """Schedule float32/64 dense with large batch size"""
-    A, B = C.op.input_tensors
-    if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A:
-        s[B].compute_inline()
-    batch, in_dim = get_const_tuple(A.shape)
-    out_dim, _ = get_const_tuple(B.shape)
-    k = C.op.reduce_axis[0]
-
-    # create tuning space
-    try:
-        block_cand = [64, 128]
-        vthread_cand = [2**x for x in range(1, 7)]
-        n_thread_cand = [2**x for x in range(3, 7)]
-        cfg.define_split(
-            "tile_x",
-            batch,
-            num_outputs=4,
-            filter=lambda x: (
-                x.size[1] in vthread_cand
-                and x.size[2] in n_thread_cand
-                and (x.size[1] * x.size[2] * x.size[3]) in block_cand
-            ),
-        )
-        cfg.define_split(
-            "tile_y",
-            out_dim,
-            num_outputs=4,
-            filter=lambda x: (
-                x.size[1] in vthread_cand
-                and x.size[2] in n_thread_cand
-                and (x.size[1] * x.size[2] * x.size[3]) in block_cand
-            ),
-        )
-        cfg.define_split("tile_k", in_dim, num_outputs=3, filter=lambda x: x.size[0] > 2)
-    except IndexError:
-        # Index error happens when no entities left after filtering, which was designed
-        # to prune tuning space for better search efficiency.
-        logger.debug("Tuning space was created without pruning due to unfit shapes")
-        cfg.define_split("tile_x", batch, num_outputs=4)
-        cfg.define_split("tile_y", out_dim, num_outputs=4)
-        cfg.define_split("tile_k", in_dim, num_outputs=3)
-
-    if cfg.is_fallback:
-        if batch > 1:
-            cfg["tile_x"] = SplitEntity([-1, 2, 16, 2])
-        else:
-            cfg["tile_x"] = SplitEntity([1, 1, 1, 1])
-        if out_dim > 1:
-            cfg["tile_y"] = SplitEntity([-1, 2, 16, 2])
-        else:
-            cfg["tile_y"] = SplitEntity([1, 1, 1, 1])
-        if in_dim > 8:
-            cfg["tile_k"] = SplitEntity([-1, 8, 1])
-        else:
-            cfg["tile_k"] = SplitEntity([-1, 1, 1])
-
-    # Explicit memory access
-    AA = s.cache_read(A, "shared", [C])
-    BB = s.cache_read(B, "shared", [C])
-    AL = s.cache_read(AA, "local", [C])
-    BL = s.cache_read(BB, "local", [C])
-    CC = s.cache_write(C, "local")
-
-    # Deal with op fusion
-    if C.op not in s.outputs:
-        s[C].compute_inline()
-        C = s.outputs[0].output(0)
-
-    # Split and reorder computation
-    bx, txz, tx, xi = cfg["tile_x"].apply(s, C, C.op.axis[0])
-    by, tyz, ty, yi = cfg["tile_y"].apply(s, C, C.op.axis[1])
-    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
-    s[CC].compute_at(s[C], tx)
-
-    # Binding
-    s[C].bind(by, te.thread_axis("blockIdx.y"))
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tyz, te.thread_axis("vthread"))
-    s[C].bind(txz, te.thread_axis("vthread"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # Split reduction
-    yo, xo = CC.op.axis
-    ko, kt, ki = cfg["tile_k"].apply(s, CC, k)
-    s[CC].reorder(ko, kt, ki, yo, xo)
-    s[AA].compute_at(s[CC], ko)
-    s[BB].compute_at(s[CC], ko)
-    s[CC].unroll(kt)
-    s[AL].compute_at(s[CC], kt)
-    s[BL].compute_at(s[CC], kt)
-
-    # Schedule for A's shared memory load
-    num_thread_x = cfg["tile_x"].size[2]
-    ty, _ = s[AA].split(s[AA].op.axis[0], nparts=num_thread_x)
-    _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread_x * 4)
-    tx, xi = s[AA].split(xi, nparts=num_thread_x)
-    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-    s[AA].double_buffer()
-
-    # Schedule for B' shared memory load
-    num_thread_y = cfg["tile_y"].size[2]
-    ty, _ = s[BB].split(s[BB].op.axis[0], nparts=num_thread_y)
-    _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread_y * 4)
-    tx, xi = s[BB].split(xi, nparts=num_thread_y)
-    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-    s[BB].double_buffer()
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/gpu/scan.py
similarity index 96%
rename from python/tvm/topi/cuda/scan.py
rename to python/tvm/topi/gpu/scan.py
index c1f2eded6be1..f45702c6341f 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/gpu/scan.py
@@ -22,11 +22,9 @@
 from tvm import te
 from tvm.contrib.thrust import can_use_rocthrust, can_use_thrust
 
-from .. import tag
 from ..math import cast, ceil_log2
 from ..transform import expand_dims, reshape, squeeze, transpose
 from ..utils import ceil_div, get_const_int, prod, swap
-from .injective import schedule_injective_from_existing
 
 
 def _get_thrust_func_name(tvmop):
@@ -555,37 +553,6 @@ def inclusive_scan(
     return binop(data, ex_scan)
 
 
-def schedule_scan(outs):
-    """Schedule for scan operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of scan
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        if tag.is_injective(op.tag):
-            schedule_injective_from_existing(s, op.output(0))
-        for tensor in op.input_tensors:
-            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                traverse(tensor.op)
-        scheduled_ops.append(op)
-
-    for out in outs:
-        traverse(out.op)
-    return s
-
-
 def scanop(
     data: tvm.te.Tensor,
     binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"],
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/gpu/sort.py
similarity index 80%
rename from python/tvm/topi/cuda/sort.py
rename to python/tvm/topi/gpu/sort.py
index 9151744b6961..71854e43997a 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/gpu/sort.py
@@ -19,44 +19,11 @@
 import tvm
 from tvm import te
 
-from .injective import schedule_injective_from_existing
 from ..transform import strided_slice, transpose
-from .. import tag
 from ..utils import ceil_div, swap
 from ..math import cast, ceil_log2
 
 
-def _schedule_sort(outs):
-    """Schedule for argsort operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of argsort
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        if tag.is_injective(op.tag):
-            schedule_injective_from_existing(s, op.output(0))
-        for tensor in op.input_tensors:
-            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                traverse(tensor.op)
-        scheduled_ops.append(op)
-
-    for out in outs:
-        traverse(out.op)
-    return s
-
-
 def _get_threads(ib, nthread_tx, nthread_bx, nthread_by):
     tx = te.thread_axis("threadIdx.x")
     bx = te.thread_axis("blockIdx.x")
@@ -556,83 +523,6 @@ def sort_ir(
     return ib.get()
 
 
-def sort_by_key_ir(
-    keys_in, values_in, keys_out, values_out, keys_out_swap, values_out_swap, axis, is_ascend
-):
-    """Low level IR to do sort by key on the GPU.
-
-    Parameters
-    ----------
-    keys_in: Buffer
-        Buffer of input keys.
-
-    values_in: Buffer
-        Buffer of input keys.
-
-    keys_out : Buffer
-        Buffer of output sorted keys.
-
-    values_out : Buffer
-        Buffer of output sorted values.
-
-    keys_out_swap : Buffer
-        Output buffer of values with same shape as keys_in to use as swap.
-
-    values_out_swap : Buffer
-        Output buffer of values with same shape as values_in to use as swap.
-
-    axis : Int
-        Axis long which to sort the input tensor.
-
-    is_ascend : Boolean
-        Whether to sort in ascending or descending order.
-
-    indicess_out : Buffer
-        Output buffer of indices of sorted tensor with same shape as keys_in.
-
-    values_out_swap : Buffer
-        Output buffer of indices with same shape as keys_in to use as swap.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    ib = tvm.tir.ir_builder.create()
-    shape = keys_in.shape
-
-    keys_in = ib.buffer_ptr(keys_in)
-    values_in = ib.buffer_ptr(values_in)
-    keys_out = ib.buffer_ptr(keys_out)
-    keys_out_swap = ib.buffer_ptr(keys_out_swap)
-    values_out = ib.buffer_ptr(values_out)
-    values_out_swap = ib.buffer_ptr(values_out_swap)
-
-    with ib.if_scope(shape[axis] > 0):
-        axis_mul_before, axis_mul_after = _sort_init(
-            ib,
-            shape,
-            axis,
-            keys_in,
-            keys_out,
-            values_out,
-            value_init_func=lambda idx, _: values_in[idx],
-        )
-
-        _sort_common(
-            ib,
-            shape[axis],
-            axis_mul_before,
-            axis_mul_after,
-            is_ascend,
-            keys_out,
-            keys_out_swap,
-            values=values_out,
-            values_swap=values_out_swap,
-        )
-    return ib.get()
-
-
 def sort(data, axis=-1, is_ascend=1):
     """Performs sorting along the given axis and returns an array of
     sorted values with the same shape as the input data.
@@ -845,40 +735,6 @@ def argsort_thrust(data, axis=-1, is_ascend=1, dtype="float32", ret_type="indice
     return topk_thrust(data, 0, axis, ret_type, is_ascend, dtype, workspace)
 
 
-def schedule_sort(outs):
-    """Schedule for sort operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of argsort
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _schedule_sort(outs)
-
-
-def schedule_argsort(outs):
-    """Schedule for argsort operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of argsort
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _schedule_sort(outs)
-
-
 def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
     """Get the top k elements in an input tensor along the given axis.
 
@@ -1081,143 +937,3 @@ def f_compute(ins, outs):
         out = out[1]
 
     return out
-
-
-def schedule_topk(outs):
-    """Schedule for argsort operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of argsort
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-      The computation schedule for the op.
-    """
-    return _schedule_sort(outs)
-
-
-def sort_by_key(keys, values, axis=-1, is_ascend=1):
-    """Sort values with respect to keys. Both keys and values will
-     be sorted and returned.
-
-    Parameters
-    ----------
-    keys: tvm.te.Tensor
-        The input keys.
-
-    values : tvm.te.Tensor,
-        The input values.
-
-    axis : int, optional
-        Axis long which to sort the input tensor.
-
-    is_ascend : boolean, optional
-        Whether to sort in ascending or descending order.
-
-    Returns
-    -------
-    keys_sorted : tvm.te.Tensor
-        The sorted keys
-
-    values_sorted : tvm.te.Tensor
-        The values sorted with respect to the keys
-    """
-    keys_buf = tvm.tir.decl_buffer(keys.shape, keys.dtype, "keys_buf", data_alignment=8)
-    values_buf = tvm.tir.decl_buffer(values.shape, values.dtype, "values_buf", data_alignment=8)
-
-    out_bufs = [
-        tvm.tir.decl_buffer(keys.shape, keys.dtype, "keys_buf", data_alignment=8),
-        tvm.tir.decl_buffer(values.shape, values.dtype, "values_buf", data_alignment=8),
-        tvm.tir.decl_buffer(keys.shape, keys.dtype, "keys_swap_buf", data_alignment=8),
-        tvm.tir.decl_buffer(values.shape, values.dtype, "values_swap_buf", data_alignment=8),
-    ]
-    out = te.extern(
-        [keys.shape, values.shape, keys.shape, values.shape],
-        [keys, values],
-        lambda ins, outs: sort_by_key_ir(
-            ins[0], ins[1], outs[0], outs[1], outs[2], outs[3], axis, is_ascend
-        ),
-        in_buffers=[keys_buf, values_buf],
-        out_buffers=out_bufs,
-        dtype=[keys.dtype, values.dtype],
-        name="sort_by_key",
-        tag="sort_by_key",
-    )
-    return out[0], out[1]
-
-
-def stable_sort_by_key_thrust(keys, values, for_scatter=False, workspace=None):
-    """Sort values with respect to keys using thrust.
-    Both keys and values will be sorted and returned.
-    Sorting is done via stable sort, so relative ordering among
-    ties are preserved.
-
-    Parameters
-    ----------
-    keys: tvm.te.Tensor
-        The 1D input keys.
-
-    values : tvm.te.Tensor,
-        The 1D input values.
-
-    for_scatter: bool, optional
-        If True, negative keys are interpreted as negative indices.
-        Before sorting, negative indices are converted to corresponding positive indices.
-        The output keys (indices) are all positive.
-        This option is introduced to optimize the scatter implementation.
-
-    workspace : Optional[tvm.te.Tensor]
-        A buffer to store intermediate results. The size of the workspace should be sufficiently
-        large, this can be obtained by overestimation or memory usage profiling. If None, it will
-        fallback to use thrust internal memory allocation.
-
-    Returns
-    -------
-    keys_sorted : tvm.te.Tensor
-        The sorted keys
-
-    values_sorted : tvm.te.Tensor
-        The values sorted with respect to the keys
-    """
-    keys_buf = tvm.tir.decl_buffer(keys.shape, keys.dtype, "keys_buf", data_alignment=8)
-    values_buf = tvm.tir.decl_buffer(values.shape, values.dtype, "values_buf", data_alignment=8)
-    workspace_buf = (
-        tvm.tir.decl_buffer(workspace.shape, workspace.dtype, "workspace_buf", data_alignment=8)
-        if workspace is not None
-        else None
-    )
-    out_bufs = [
-        tvm.tir.decl_buffer(keys.shape, keys.dtype, "keys_buf", data_alignment=8),
-        tvm.tir.decl_buffer(keys.shape, values.dtype, "values_buf", data_alignment=8),
-    ]
-
-    def f_compute(ins, outs):
-        args = [
-            "tvm.contrib.thrust.stable_sort_by_key",
-            ins[0],
-            ins[1],
-            outs[0],
-            outs[1],
-            for_scatter,
-        ]
-        if workspace is not None:
-            args.append(ins[2])
-        return tvm.tir.call_packed(*args)
-
-    out = te.extern(
-        [keys.shape, values.shape],
-        [keys, values] if workspace is None else [keys, values, workspace],
-        f_compute,
-        in_buffers=[keys_buf, values_buf]
-        if workspace is None
-        else [keys_buf, values_buf, workspace_buf],
-        out_buffers=out_bufs,
-        dtype=[keys.dtype, values.dtype],
-        name="stable_sort_by_key",
-        tag="stable_sort_by_key",
-    )
-    return out[0], out[1]
diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py
deleted file mode 100644
index b94526e5b919..000000000000
--- a/python/tvm/topi/hexagon/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Schedules for Hexagon. """
-
-# pylint: disable=wildcard-import
-
-from .batch_matmul import *
-from .conv2d import *
-from .dense import *
-from .injective import *
-from .pad import *
-from .pooling import *
-from .reduce import *
-from .resize2d import *
-from .tensor_intrin import *
-from .qnn import *
-from .dense_alter_op import *
-from .conv2d_alter_op import *
diff --git a/python/tvm/topi/hexagon/batch_matmul.py b/python/tvm/topi/hexagon/batch_matmul.py
deleted file mode 100644
index bf2ca3c9c7fc..000000000000
--- a/python/tvm/topi/hexagon/batch_matmul.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for composition of batch_matmul operator"""
-
-import tvm
-
-
-def schedule_batch_matmul(outs):
-    """Schedule for batch_matmul op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of batch_matmul in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    return s
diff --git a/python/tvm/topi/hexagon/compute_poolarea.py b/python/tvm/topi/hexagon/compute_poolarea.py
deleted file mode 100644
index 0e1130edd8dc..000000000000
--- a/python/tvm/topi/hexagon/compute_poolarea.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
-
-"""Compute PoolArea size which is used to exclude the zero-padding elements in the averaging
-   calculation.
-"""
-
-from tvm import te, tir
-
-
-def compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left):
-    """
-    Parameters
-    ----------
-    i,j:
-        index of output tensor along H and W axis
-        This is equal to the starting point of the sliding window for which the average is computed
-    ih, iw:
-        input data size along H and W axis
-    kh, kw:
-        Kernel size along H and W axis
-    sh, sw:
-        Stride size along H and W axis
-    dh, dw:
-        Dilation size along H and W axis
-    pad_top, pad_left:
-        Pad size on Top and left side of input data
-
-    # PoolArea refers to the area of that portion of each sliding window which only includes
-    # the input data and not the padded area.
-
-    # Motivation: The following example shows the location of the first sliding window (at i=0, j=0)
-    # on a 6*6 array, with kernel=[3,3] and padding=[1, 1, 1, 1].
-    # The input data elements are shown with (X) and padding data with (0).
-    # As shown, the number of non-padding elements that should be used for computing
-    # the average of values inside this window is 4, while the windows area is 3*3=9.
-    # To compute the PoolArea, we have to move the top/left edge of the window down/right
-    # to exclude zero-padding elements. The edge adjustment can be formulated as
-    #    top_edge = max(i , pad_top)
-    #    left_edge= max(j , pad_left)
-    # Note that pad_top and pad_left represent point 0 of the input data along i and j direction.
-    # In this example, bottom_edge and right_edge of the PoolArea do not need any adjustment,
-    # because there is no padding data on those side of the window.
-    # However, as we slide the window down and to the right, the window might go
-    # beyond the input data boundaries (ih and iw). In these cases, bottom/right edge should be
-    # moved up/left to be located inside the input data.
-    # This can be formulated as
-    #    bottom_edge = min(i + kh, ih + pad_top)
-    #    left_edge   = min(j + kw, iw + pad_left)
-    # Having all the edges,
-    #    PoolArea = (bottom_edge - top_edge) * (right_edge - left_edge)
-
-    #    _______
-    #    |0 0 0|0 0 0 0 0                         0 0 0 0 0 0 0 0
-    #    |     |                                 _______
-    #    |0 X X|X X X X 0                        |0 X X|X X X X 0
-    #    |     |                                 |     |
-    #    |0 X X|X X X X 0        ====>           |0 X X|X X X X 0
-    #    |_____|                                 |_____|
-    #    0 X X X X X X 0                          0 X X X X X X 0
-    #    0 X X X X X X 0                          0 X X X X X X 0
-    #    0 X X X X X X 0                          0 X X X X X X 0
-    #    0 X X X X X X 0                          0 X X X X X X 0
-    #    0 0 0 0 0 0 0 0                          0 0 0 0 0 0 0 0
-
-
-    # The above equations are derived under the assumption of having default value (1)
-    # for stride and dilation. However, we need to expand them to support non-default
-    # stride and dilation values.
-    # Stride impacts the starting location of the sliding windows, so i and j should be
-    # replaced by (i * sh) and j by (j * sw) in the equations.
-    # Dilation changes the window size, making k kernel elements scattered into a d*(k - 1) + 1
-    # window.
-    # Non-1 dilation means that, we need to divide the adjusted window size by the dilation value
-    # to find out how many kernel elements inside the sliding window are inside the input data
-    # boundaries:
-    #    top_edge= max(i * sh , pad_top)
-    #    left_edge= max(j * sw , pad_left)
-    #    bottom_edge = min(i * sh + (kh - 1) * dh + 1, ih + pad_top)
-    #    left_edge   = min(j * sw + (kw - 1) * dw + 1, data_w + pad_left)
-    #    PoolArea = ceil_div((bottom_edge - top_edge), dh) * ceil_div((right_edge - left_edge), dw)
-    #
-    # Finally, we need to address one corner case related to the non-default dilation:
-    # Consider the following example along W axis, where iw = 3, kw = 3 and dw = 2.
-    # The first figure on the left shows the sliding window of size 5 starting at index 0,
-    # and the first figure on the right shows the same example with sliding window at index 1.
-    # The second row of figures show the PoolArea after adjusting the edges
-    # (both left_edge - right_edge = 3)
-    # The third row of figures show the location of dialated kernel points(*).
-    # As shown, although the distance between left and right edge in both cases is 3 and
-    # dilation is 2 and ceil_div(3,2)=2, the right PoolArea only includes 1 kernel point.
-
-    #  Sliding Window:                       |0 0 X X X |0                         0 |0 X X X  0|
-    #  PoolArea(after edge adjustment):       0 0|X X X |0                         0  0|X X X| 0
-    #  location of dilated kernel points:     * 0|* X * |0                         0  *|X * X| 0
-    #  PoolArea (dilated_point_aware):        * 0|* X * |0                         0  * X|* X| 0
-
-    # To address this issue, instead of moving the left_edge to bring it just inside the input
-    # data boundary, we should move the edge to the right untill we get to the first dilated kernel
-    # point inside the input data boundary.
-    # The third row of figures shows how this row adjustment can solve the problem.
-    # So the problem is reduced to finding the first dilated kernel point inside the data
-    # boundary.# For that, we can find the number of dialted points which are mapped to the padded
-    # area and find the location of the next one which should be inside the input data:
-    #    num_of_prev_points = (pad_top - i * sh - 1) // dh
-    #    next_point_index = i * sh + (num_prev_points + 1) * dh
-    #
-    # With that, Top_edge and left_edge can be reformulated as:
-    #    if i*sh - pad_top < 0:
-    #        top_edge = i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh
-    #    else:
-    #        top_edge = i * sh
-    #
-    #    if j * sw - pad_left < 0:
-    #        left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw
-    #    else:
-    #        left_edge= j * sw
-
-    """
-    top_edge = tir.if_then_else(
-        tir.all(i * sh - pad_top < 0), i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh, i * sh
-    )
-    bottom_edge = te.min(i * sh + (kh - 1) * dh + 1, ih + pad_top)
-    left_edge = tir.if_then_else(
-        tir.all(j * sw - pad_left < 0), j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw, j * sw
-    )
-    right_edge = te.min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
-    return -((bottom_edge - top_edge) // -dh) * -((right_edge - left_edge) // -dw)
diff --git a/python/tvm/topi/hexagon/conv2d.py b/python/tvm/topi/hexagon/conv2d.py
deleted file mode 100644
index aa1b7e57e464..000000000000
--- a/python/tvm/topi/hexagon/conv2d.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Schedule for conv2d"""
-
-import tvm
-from tvm import te
-from .. import nn
-from ..utils import traverse_inline
-from .tensor_intrin import dot_vrmpy
-from ..generic import conv2d as conv2d_generic
-
-
-def schedule_conv2d_nhwc(outs):
-    """Schedule for conv2d NHWC operator.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    return s
-
-
-def schedule_conv2d_nchw(outs):
-    return schedule_conv2d_nhwc(outs)
-
-
-def schedule_conv2d(outs, layout="NHWC"):
-    layout_uncase = layout.casefold()
-    if layout_uncase == "NHWC".casefold():
-        return schedule_conv2d_nhwc(outs)
-    if layout_uncase == "NCHW".casefold():
-        return schedule_conv2d_nchw(outs)
-
-    raise ValueError(f"Unexpected layout={layout}")
-
-
-def schedule_depthwise_conv2d_nchw(outs):
-    return schedule_conv2d_nchw(outs)
-
-
-def schedule_depthwise_conv2d_nhwc(out):
-    return schedule_conv2d_nhwc(out)
-
-
-def schedule_conv2d_transpose_nchw(outs):
-    """Create schedule for tensors"""
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = schedule_conv2d_nchw(outs)
-
-    def _callback(op):
-        if "unpack_nchwc" in op.tag:
-            conv_out = op.input_tensors[0]
-            # retrieve data
-            data_vec = conv_out.op.input_tensors[0]
-            if isinstance(data_vec, tvm.te.ComputeOp):
-                data_pad = data_vec.op.input_tensors[0]
-                data_dilate = data_pad.op.input_tensors[0]
-                s[data_dilate].compute_inline()
-                s[data_pad].compute_inline()
-            # retrieve kernel
-            kernel_vec = conv_out.op.input_tensors[1]
-            if isinstance(kernel_vec, tvm.te.ComputeOp):
-                kernel_transform = kernel_vec.op.input_tensors[0]
-                s[kernel_transform].compute_inline()
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def conv2d_NCHWc_int8(
-    data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="int32"
-):
-    """Compute definition for int8 conv2d in NCHWc layout"""
-    n_elems = int(kernel.shape[-1])
-    return nn.conv2d_NCHWc_int8(
-        data, kernel, stride, padding, dilation, layout, out_layout, out_dtype, n_elems=n_elems
-    )
-
-
-def schedule_conv2d_NCHWc_int8(outs):
-    """Schedule for int8 conv2d in NCHWc layout using vrmpy tensorization"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_NCHWc_int8" in op.tag:
-            conv_out = op.output(0)
-            kernel_vec = conv_out.op.input_tensors[1]
-            data_vec = conv_out.op.input_tensors[0]
-            out_width = conv_out.shape[3]
-
-            reg_n = 1
-            for n in range(31, 0, -1):
-                if out_width % n == 0:
-                    reg_n = n
-                    break
-
-            cfg = {"tile_ow": reg_n, "unroll_kw": False}
-            args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]]
-            intrin = dot_vrmpy(data_vec.dtype, kernel_vec.dtype)
-
-            conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(
-                *args,
-                int32_lanes=32,
-                int8_elems=4,
-                intrin=intrin,
-                inline_fused=True,
-            )
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/hexagon/conv2d_alter_op.py b/python/tvm/topi/hexagon/conv2d_alter_op.py
deleted file mode 100644
index a4affb8a82b7..000000000000
--- a/python/tvm/topi/hexagon/conv2d_alter_op.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2d alter op functions for Hexagon"""
-
-from tvm import relay
-from ..utils import get_const_tuple
-from .. import nn
-from ..nn import conv2d_alter_layout
-from ..generic.conv2d import conv2d_alter_int8_common
-
-
-@conv2d_alter_layout.register("hexagon")
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    """Convert nn.conv2d into nn.contrib_conv2d_nchwc if vrmpy is applicable."""
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data_tensor, kernel_tensor = tinfos
-    out_channel, in_channel, _, _ = get_const_tuple(kernel_tensor.shape)
-
-    if (
-        "int8" in data_tensor.dtype
-        and "int8" in kernel_tensor.dtype
-        and out_channel % 32 == 0
-        and in_channel % 4 == 0
-        and data_layout == "NCHW"
-        and kernel_layout == "OIHW"
-    ):
-        out_channel, in_channel, _, _ = get_const_tuple(kernel_tensor.shape)
-
-        n_elems = 4
-        oc_bn = 32
-        ic_bn = min(in_channel, 32)
-
-        new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        new_attrs["kernel_layout"] = f"OIHW{ic_bn // n_elems:n}i{oc_bn:n}o{n_elems:n}i"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
-
-    return None
-
-
-@nn.conv2d_legalize.register("hexagon")
-def _conv2d_legalize(attrs, inputs, arg_types):
-    """Legalize conv2d op for vrmpy tensorization.
-
-    If the inputs are signed or unsigned int8, the input and output channels are padded to be
-    a multiple of 4 and 32 respectively.
-
-    If the input data types are (int8, int8), they are converted to (uint8, int8) and
-    the vector-by-vector variant of vrmpy is applied.
-    If the input data types are (uint8, uint8), the more efficient vector-by-scalar variant of vrmpy
-    is applied.
-
-    Unlike the nn.dense case (see dense_alter_op.py), we do not convert (uint8, int8) to
-    (uint8, uint8). That would introduce another convolution by a constant (128 or 1) filter,
-    to compensate for the dtype legalization. In the nn.dense case, such compensation factor is
-    just a sum over the K axis.
-    """
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-
-    output_tensor = arg_types[2]
-
-    data, kernel = inputs
-
-    if data_layout != "NCHW" or kernel_layout != "OIHW":
-        return None
-
-    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
-
-    if "int8" in data_tensor.dtype and "int8" in data_tensor.dtype:
-        output_tensor = arg_types[2]
-        data, kernel = inputs
-        desired_data_dtype = "uint8"
-        in_channel_vector_length = 4
-        out_channel_vector_length = 32
-
-        return conv2d_alter_int8_common(
-            data,
-            data_tensor,
-            kernel,
-            kernel_tensor,
-            output_tensor,
-            attrs,
-            desired_data_dtype,
-            in_channel_vector_length,
-            out_channel_vector_length,
-        )
-
-    return None
diff --git a/python/tvm/topi/hexagon/dense.py b/python/tvm/topi/hexagon/dense.py
deleted file mode 100644
index 02ad141ecb5a..000000000000
--- a/python/tvm/topi/hexagon/dense.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Schedule for dense operator"""
-
-import tvm
-from tvm.topi.utils import traverse_inline
-from tvm import te
-from .. import tag
-from .tensor_intrin import dot_vrmpy
-
-
-def schedule_dense(outs):
-    """Schedule for dense op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of dense in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    return s
-
-
-def dense_u8u8i32_vrmpy_compute(X, packed_w, bias, out_dtype):
-    """Compute for uint8 x uint8 -> int32 dense using vrmpy"""
-    assert X.dtype == "uint8" and packed_w.dtype == "uint8" and out_dtype == "int32"
-    m, k = X.shape
-    n_o, _, n_i, _ = packed_w.shape
-    assert n_i == 32
-    ak = te.reduce_axis((0, k), name="k")
-
-    C = te.compute(
-        (m, n_o * n_i),
-        lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packed_w[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
-                "int32"
-            ),
-            axis=ak,
-        ),
-        tag="dense_u8u8i32_vrmpy",
-        name="compute",
-    )
-
-    if bias is not None:
-        C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j], tag=tag.BROADCAST)
-
-    return C
-
-
-def dense_u8u8i32_vrmpy_schedule(outs):
-    """Schedule for vrmpy dense"""
-    s = te.create_schedule([x.op for x in outs])
-    # O: The output of the fused op
-    O = outs[0]
-
-    def _schedule_dense(s, C, O):
-        (a_k,) = C.op.reduce_axis
-        a_y = C.op.axis[-2]
-        a_yo, a_yi = s[C].split(a_y, factor=32)
-        a_xo, a_xi = s[C].split(C.op.axis[-1], factor=32)
-        a_ko, a_ki = s[C].split(a_k, factor=4)
-
-        s[C].reorder(a_yo, a_xo, a_yi, a_ko, a_xi, a_ki)
-
-        pc = dot_vrmpy("uint8", "uint8")
-        s[C].tensorize(a_xi, pc)
-        s[C].parallel(s[C].fuse(a_yo, a_xo))
-
-        if C != O:
-            a_y = O.op.axis[-2]
-            a_yo, a_yi = s[O].split(a_y, factor=32)
-            a_xo, a_xi = s[O].split(O.op.axis[-1], factor=32)
-
-            s[O].reorder(a_yo, a_xo, a_yi, a_xi)
-            s[O].vectorize(a_xi)
-            s[C].compute_at(s[O], a_yi)
-            s[O].parallel(s[O].fuse(a_yo, a_xo))
-
-    def _callback(op):
-        if "u8u8i32_vrmpy" in op.tag:
-            # C: The output of GEMM
-            C = op.output(0)
-            _schedule_dense(s, C, O)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
diff --git a/python/tvm/topi/hexagon/dense_alter_op.py b/python/tvm/topi/hexagon/dense_alter_op.py
deleted file mode 100644
index cb5feb56d68e..000000000000
--- a/python/tvm/topi/hexagon/dense_alter_op.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Dense alter op functions for ARM"""
-
-import tvm
-from tvm import relay
-from .. import nn
-from ..nn import dense_alter_layout
-
-
-def check_vrmpy_applicable(x, y):
-    return (
-        "int8" in x.dtype and "int8" in y.dtype and y.shape[-2] % 32 == 0 and y.shape[-1] % 4 == 0
-    )
-
-
-@dense_alter_layout.register(["hexagon"])
-def _alter_dense_layout(attrs, inputs, tinfos, out_type):
-    data_tensor, weight_tensor = tinfos
-    out_dtype = out_type.dtype
-
-    if check_vrmpy_applicable(data_tensor, weight_tensor):
-        weight_layout = "NC32n4c"
-        return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype)
-    else:
-        return None
-
-
-def vrmpy_legalize(x, w, arg_types, op, attrs):
-    """
-    Legalizes int8 inputs to dense for vrmpy.
-    X'_u8 = X_s8 + 128
-    X_s8 * W_s8 = (X'_u8 - 128) * (W'_u8 - 128)
-                = X'_u8 * W'_u8 - X'_u8 * 128 - 128 * W'_u8 + 128 * 128
-    X_u8 * W_s8 = X_u8 * (W'_u8 - 128)
-                = X'_u8 * W'_u8 - X_u8 * 128
-    """
-    if not check_vrmpy_applicable(arg_types[0], arg_types[1]):
-        return None
-
-    def cast_to_uint8(x):
-        x = relay.cast(x, "int32")
-        x = relay.add(x, relay.const(128, "int32"))
-        return relay.cast(x, "uint8")
-
-    if arg_types[0].dtype == "int8" and arg_types[1].dtype == "int8":
-        x = cast_to_uint8(x)
-        w = cast_to_uint8(w)
-
-        W_u8x128 = relay.const(-128, "int32") * relay.sum(relay.cast(w, "int32"), axis=[-1])
-        X_u8x128 = relay.const(-128, "int32") * relay.sum(relay.cast(x, "int32"), axis=[-1])
-        X_u8x128 = relay.expand_dims(X_u8x128, axis=1)
-
-        out = op(x, w, **attrs)
-
-        out += W_u8x128
-        out += X_u8x128
-
-        k_dim = int(arg_types[0].shape[-1])
-        return out + relay.const(128 * 128 * k_dim, "int32")
-
-    if arg_types[0].dtype == "uint8" and arg_types[1].dtype == "int8":
-        w = cast_to_uint8(w)
-
-        X_u8x128 = relay.expand_dims(
-            relay.const(-128, "int32") * relay.sum(relay.cast(x, "int32"), axis=[-1]), axis=1
-        )
-
-        out = op(x, w, **attrs)
-
-        return out + X_u8x128
-
-    return None
-
-
-@nn.dense_legalize.register("hexagon")
-def _dense_legalize(attrs, inputs, arg_types):
-    """Legalize dense op for HVX vectorization and vrmpy tensorization.
-
-    Given a workload with a matrix X of shape (M, K) and a matrix Y of (N, K),
-    we first pad the N dimension to be a multiple of the output vector length.
-
-    And if the inputs are signed or unsigned int8 and the Y matrix can be packed into the
-    NK32n4k layout, we convert both inputs to uint8 to apply the most efficient variant of vrmpy.
-    """
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-    # Collect the input tensors.
-    x_tensor, y_tensor = arg_types[0], arg_types[1]
-    dtype = x_tensor.dtype
-
-    # Collect the output tensor.
-    output_tensor = arg_types[2]
-
-    # Collect the input exprs.
-    x, y = inputs
-
-    N, _ = y_tensor.shape
-
-    if dtype == "float16":
-        vec_len = 64
-    elif "int8" in dtype:
-        vec_len = 32
-    else:
-        return None
-
-    if N % vec_len != 0:
-        N_padded = ((N + vec_len) // vec_len) * vec_len
-        dn = N_padded - N
-
-        y_ = relay.nn.pad(y, pad_width=((0, dn), (0, 0)))
-
-        # If units is explicitly specified, it is used to compute the output shape.
-        # We need to update units after padding to prevent a type error.
-        if attrs["units"] is not None:
-            new_attrs["units"] = N + dn
-
-        arg_types = [
-            arg_types[0],
-            tvm.ir.tensor_type.TensorType([N + dn, arg_types[1].shape[1]], arg_types[1].dtype),
-        ]
-
-        vrmpy_out = vrmpy_legalize(x, y_, arg_types, relay.nn.dense, new_attrs)
-
-        if vrmpy_out is None:
-            out_ = relay.nn.dense(x, y_, **new_attrs)
-        else:
-            out_ = vrmpy_out
-
-        out = relay.strided_slice(out_, begin=[0, 0], end=[x.value for x in output_tensor.shape])
-        return out
-
-    return vrmpy_legalize(inputs[0], inputs[1], arg_types, relay.nn.dense, attrs)
diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py
deleted file mode 100644
index 1da745a6774d..000000000000
--- a/python/tvm/topi/hexagon/injective.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for injective operators"""
-import numpy as np
-import tvm
-
-
-def schedule_injective(outs):
-    """Schedule for injective op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of injective in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    # Fuse axes and vectorize inner elements
-    for x in outs:
-        fused = s[x].fuse(*x.op.axis)
-        outer, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize)
-        s[x].vectorize(inner)
-        s[x].parallel(outer)
-    return s
-
-
-def schedule_softmax(outs):
-    return schedule_injective(outs)
-
-
-def schedule_elemwise(outs):
-    return schedule_injective(outs)
-
-
-def schedule_broadcast(outs):
-    return schedule_injective(outs)
diff --git a/python/tvm/topi/hexagon/pad.py b/python/tvm/topi/hexagon/pad.py
deleted file mode 100644
index 631079080231..000000000000
--- a/python/tvm/topi/hexagon/pad.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for nn.pad operator"""
-
-import numpy as np
-import tvm
-
-
-def schedule_pad(outs):
-    """Schedule for pad op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of injective in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    # Fuse axes and vectorize only if last output tensor dimension is divisible by a factor:
-    factor = 128 // np.dtype(outs[0].dtype).itemsize
-    last_dim = outs[0].shape[-1]
-    if last_dim % factor == 0 and last_dim // factor >= 0:
-        fused = s[outs[0]].fuse(*outs[0].op.axis)
-        _, inner = s[outs[0]].split(fused, factor=factor)
-        s[outs[0]].vectorize(inner)
-
-    return s
diff --git a/python/tvm/topi/hexagon/pooling.py b/python/tvm/topi/hexagon/pooling.py
deleted file mode 100644
index eb8adac35f84..000000000000
--- a/python/tvm/topi/hexagon/pooling.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for pooling operators"""
-
-import tvm
-
-
-def schedule_pool(outs, layout="NHWC"):  # pylint: disable=unused-argument
-    """Schedule for pooling op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of injective in the format
-        of an array of tensors.
-
-    layout: str
-        The tensor layout.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    return s
-
-
-def schedule_adaptive_pool(outs):
-    return schedule_pool(outs)
diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
deleted file mode 100644
index f7c4502301c0..000000000000
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Computes and schedules for Hexagon quantized ops """
-
-from .adaptive_avg_pool1d import *
-from .avg_pool2d import *
-from .conv2d_alter_op import *
-from .dense_alter_op import *
-from .dequantize import dequantize_compute, dequantize_schedule
-from .global_avg_pool2d import *
-from .nn import *
-from .qadd_qsub_qmul import *
-from .qdense import *
-from .qdepthwise_conv2d_slice import qdepthwise_conv2d_compute, qdepthwise_conv2d_schedule
-from .quantize import quantize_compute, tir_quantize_schedule
diff --git a/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py b/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py
deleted file mode 100644
index 14bdd45b56f7..000000000000
--- a/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Compute and schedule for adaptive_avg_pool1d slice op
-
-Following are few notes and assumptions made by the implementation:
-
-Assumptions:
-1) The input is in NCW layout. Distilbert is the only model that calls
-   nn.adaptive_avg_pool1d and the only layout it uses is 'NCW'.
-2) The op takes output_size as an argument and
-   only handles the specialized case where output_size is 1.
-   The argument output_size is used as the value of output_width.
-3) Both input and output dtype is uint8/int8 and
-   quantization parameter is provided to the op.
-4) Input is assumed to always be multiple of fixed chunk 32c64w.
-
-Notes:
-1) If input width is used as output width, there can be two cases:
-    a. If the quantization parameters of input and output are same,
-       it can return the input as output so the op will be a no-op.
-    b. If the quantization parameters of input and output are different,
-       it will essentially be a requantize op.
-2) If output_size is a value besides 1 or input_width,
-   adaptive_avg_pool1d may use dynamic stride and kernel for each output element.
-   When this case occurs, kernel won't be known at compile time. We want to use
-   the generic implementation nn.adaptive_avg_pool1d() for this case.
-"""
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate
-
-
-def adaptive_avg_pool1d(
-    data: te.Tensor,
-    output_size: list,
-    odtype: str,
-    input_zero_point: int,
-    input_scale: float,
-    output_zero_point: int,
-    output_scale: float,
-):
-    """adaptive_avg_pool1d compute"""
-    _, _, inw = data.shape
-
-    out_width = output_size[0]
-
-    n, c = data.shape[:2]
-    oshape = (n, c) + (out_width,)
-
-    # Kernel is same as input_width since output_width is assumed to be 1
-    if out_width == 1:
-        kw_r = inw
-    else:
-        raise RuntimeError(f"Unsupported output_size, {out_width}'")
-
-    if odtype == "uint8":
-        temp_dtype = "uint32"
-    elif odtype == "int8":
-        temp_dtype = "int32"
-    else:
-        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
-
-    scale_with_area = input_scale / (output_scale * int(kw_r))
-    scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16")
-    corr = (output_zero_point << rsh) - input_zero_point * kw_r * scale_fixed_point
-
-    rw_r = te.reduce_axis((0, kw_r), name="rw_r")
-
-    sum_compute = te.compute(
-        oshape,
-        lambda n, c, w: te.sum(data[n, c, w + rw_r].astype(temp_dtype), axis=[rw_r]),
-        name="sum",
-    )
-
-    avg_compute = te.compute(
-        oshape,
-        lambda n, c, w: saturate(
-            ((sum_compute[n, c, w] * scale_fixed_point) + corr) >> rsh, odtype
-        ).astype(odtype),
-        name="adaptive_avg_1d",
-    )
-    return avg_compute
-
-
-def stir_schedule_ncw_32c64w(outs, ins, input_layout: str):
-    """Schedule for input layout ncw-32c64w and output layout ncw"""
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-
-    sum_block = s.get_block("sum")
-
-    # Input is multiple of fixed chunk but output is NxCx1
-    # Hence transform_layout is only applied on input
-    input_transformed_layout = get_layout_transform_fn(input_layout)
-    s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout)
-
-    return s
-
-
-def tir_adaptive_avg_pool1d_schedule(outs, ins, output_layout: str, input_layout: str):
-    """STIR based schedule"""
-    if output_layout == "ncw":
-        return stir_schedule_ncw_32c64w(outs, ins, input_layout)
-    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
deleted file mode 100644
index 4e88f39b0552..000000000000
--- a/python/tvm/topi/hexagon/qnn/avg_pool2d.py
+++ /dev/null
@@ -1,415 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
-
-""" Compute and schedule for quantized avg_pool2d op """
-
-import tvm
-from tvm import te
-from tvm import tir
-from ..utils import (
-    get_layout_transform_fn,
-    get_fixed_point_value,
-    is_scalar,
-    get_const_int_value,
-    get_const_float_value,
-)
-from ...utils import get_const_tuple
-from ...nn.utils import get_pad_tuple
-from ...nn.pad import pad
-from ..compute_poolarea import compute_PoolArea
-
-
-def saturate(x: te.Tensor, dtype: str):
-    """Saturate value for the specified data type"""
-    return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype)))
-
-
-def get_temp_dtype(h, w, dtype):
-    temp_dtype = "int16" if h * w < 256 else "int32"
-    if dtype in ("uint8", "int8"):
-        return temp_dtype
-    else:
-        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
-
-
-def qnn_avg_pool2d_NCHW(
-    data: te.Tensor,
-    kernel: list,
-    stride: list,
-    padding: list,
-    dilation: list,
-    count_include_pad: bool,
-    oshape: list,
-    odtype: str,
-    # quantization params:
-    input_scale: float,
-    input_zero_point: int,
-    output_scale: float,
-    output_zero_point: int,
-):
-    """Compute for quantized avg_pool2d"""
-    kh, kw = kernel
-    rh = te.reduce_axis((0, kh), name="rh")
-    rw = te.reduce_axis((0, kw), name="rw")
-
-    temp_dtype = get_temp_dtype(kh, kw, odtype)
-
-    sh, sw = stride
-    dh, dw = dilation
-
-    scale = input_scale / output_scale
-    scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
-    corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point
-
-    dilated_kh = (kh - 1) * dh + 1
-    dilated_kw = (kw - 1) * dw + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        get_const_tuple(padding), (dilated_kh, dilated_kw)
-    )
-
-    # DOPAD
-    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
-        pad_before = (0, 0, pad_top, pad_left)
-        pad_after = (0, 0, pad_down, pad_right)
-        data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad")
-    else:
-        # By definition when True, zero-padding will be included in the averaging calculation
-        # This is equivalent to PoolArea = (kh * kw)
-        count_include_pad = True
-        data_pad = data
-
-    Sum = te.compute(
-        oshape,
-        lambda b, c, h, w: te.sum(
-            data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype(temp_dtype), axis=[rh, rw]
-        ),
-        name="pool_sum",
-    )
-
-    if not count_include_pad:
-        # Compute PoolArea using unpadded input tensor
-        _, _, oh, ow = oshape
-        _, _, ih, iw = data.shape
-
-        PoolArea = te.compute(
-            (oh, ow),
-            lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left),
-            name="pool_area",
-        )
-
-        ScaleWithArea = te.compute(
-            (oh, ow),
-            lambda i, j: (scale_fixed_point // PoolArea[i, j]).astype("int32"),
-            name="scale_with_area",
-        )
-
-        Avg = te.compute(
-            oshape,
-            lambda b, c, h, w: saturate(
-                ((Sum[b, c, h, w] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype
-            ).astype(odtype),
-            name="pool_avg",
-        )
-    else:
-        ScaleWithArea = scale_fixed_point // (kh * kw)
-        Avg = te.compute(
-            oshape,
-            lambda b, c, h, w: saturate(
-                ((Sum[b, c, h, w] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype
-            ).astype(odtype),
-            name="pool_avg",
-        )
-    return Avg
-
-
-def qnn_avg_pool2d_NHWC(
-    data: te.Tensor,
-    kernel: list,
-    stride: list,
-    padding: list,
-    dilation: list,
-    count_include_pad: bool,
-    oshape: list,
-    odtype: str,
-    # quantization params:
-    input_scale: float,
-    input_zero_point: int,
-    output_scale: float,
-    output_zero_point: int,
-):
-    """Compute for quantized avg_pool2d"""
-    kh, kw = kernel
-    rh = te.reduce_axis((0, kh), name="rh")
-    rw = te.reduce_axis((0, kw), name="rw")
-
-    temp_dtype = get_temp_dtype(kh, kw, odtype)
-
-    sh, sw = stride
-    dh, dw = dilation
-
-    scale = input_scale / output_scale
-    scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
-    corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point
-
-    dilated_kh = (kh - 1) * dh + 1
-    dilated_kw = (kw - 1) * dw + 1
-    # Compute Area
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        get_const_tuple(padding), (dilated_kh, dilated_kw)
-    )
-    # DOPAD
-    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
-        pad_before = (0, pad_top, pad_left, 0)
-        pad_after = (0, pad_down, pad_right, 0)
-        data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad")
-    else:
-        # By definition when True, zero-padding will be included in the averaging calculation
-        # This is equivalent to PoolArea = (kh * kw)
-        count_include_pad = True
-        data_pad = data
-
-    Sum = te.compute(
-        oshape,
-        lambda b, h, w, c: te.sum(
-            data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw]
-        ),
-        name="pool_sum",
-    )
-
-    if not count_include_pad:
-        # Compute PoolArea using unpadded input tensor
-        _, oh, ow, _ = oshape
-        _, ih, iw, _ = data.shape
-
-        PoolArea = te.compute(
-            (oh, ow),
-            lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left),
-            name="pool_area",
-        )
-
-        ScaleWithArea = te.compute(
-            (oh, ow),
-            lambda i, j: tir.if_then_else(
-                tir.all(PoolArea[i, j] > 0),
-                (scale_fixed_point // PoolArea[i, j]).astype("int32"),
-                0,
-            ),
-            name="scale_with_area",
-        )
-
-        Avg = te.compute(
-            oshape,
-            lambda b, h, w, c: saturate(
-                ((Sum[b, h, w, c] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype
-            ).astype(odtype),
-            name="pool_avg",
-        )
-    else:
-        ScaleWithArea = scale_fixed_point // (kh * kw)
-        Avg = te.compute(
-            oshape,
-            lambda b, h, w, c: saturate(
-                ((Sum[b, h, w, c] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype
-            ).astype(odtype),
-            name="pool_avg",
-        )
-
-    return Avg
-
-
-def qnn_avg_pool2d_wrapper_compute_NCHW(
-    data: te.Tensor,
-    kernel: list,
-    stride: list,
-    padding: list,
-    dilation: list,
-    count_include_pad: bool,
-    oshape: list,
-    odtype: str,
-    # quantization params:
-    input_scale: float,
-    input_zero_point: int,
-    output_scale: float,
-    output_zero_point: int,
-):
-    """Extract qnn params"""
-    if (
-        is_scalar(input_scale)
-        and is_scalar(output_scale)
-        and is_scalar(input_zero_point)
-        and is_scalar(output_zero_point)
-    ):
-        iscale = get_const_float_value(input_scale)
-        oscale = get_const_float_value(output_scale)
-        izero_point = get_const_int_value(input_zero_point)
-        ozero_point = get_const_int_value(output_zero_point)
-        return qnn_avg_pool2d_NCHW(
-            data,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            count_include_pad,
-            oshape,
-            odtype,
-            iscale,
-            izero_point,
-            oscale,
-            ozero_point,
-        )
-    else:
-        raise RuntimeError("quantization parameters should be scalar tensors")
-
-
-def qnn_avg_pool2d_wrapper_compute_NHWC(
-    data: te.Tensor,
-    kernel: list,
-    stride: list,
-    padding: list,
-    dilation: list,
-    count_include_pad: bool,
-    oshape: list,
-    odtype: str,
-    # quantization params:
-    input_scale: float,
-    input_zero_point: int,
-    output_scale: float,
-    output_zero_point: int,
-):
-    """Extract qnn params"""
-    if (
-        is_scalar(input_scale)
-        and is_scalar(output_scale)
-        and is_scalar(input_zero_point)
-        and is_scalar(output_zero_point)
-    ):
-        iscale = get_const_float_value(input_scale)
-        oscale = get_const_float_value(output_scale)
-        izero_point = get_const_int_value(input_zero_point)
-        ozero_point = get_const_int_value(output_zero_point)
-        return qnn_avg_pool2d_NHWC(
-            data,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            count_include_pad,
-            oshape,
-            odtype,
-            iscale,
-            izero_point,
-            oscale,
-            ozero_point,
-        )
-    else:
-        raise RuntimeError("quantization parameters should be scalar tensors")
-
-
-def schedule_qnn_avg_pool2d(outs):
-    """Schedule for qnn.avg_pool2d
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.avg_pool2d
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    return s
-
-
-def schedule_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str):
-    """Schedule for input and output layout 8h8w32c"""
-
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-    Sum = s.get_block("pool_sum")
-    Avg = s.get_block("pool_avg")
-    mem_scope = "global.vtcm"
-    sum_read = s.cache_read(Sum, 0, mem_scope)
-    avg_read = s.cache_read(Avg, 0, mem_scope)
-    avg_write = s.cache_write(Avg, 0, mem_scope)
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-    s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0)
-    s.transform_layout(Avg, ("read", 0), input_transform_fn, pad_value=0)
-    s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0)
-    return s
-
-
-def schedule_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str):
-    """Schedule for output layout: 2048c, input layout: 8h8w32c"""
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-    Sum = s.get_block("pool_sum")
-    Avg = s.get_block("pool_avg")
-
-    mem_scope = "global.vtcm"
-    sum_read = s.cache_read(Sum, 0, mem_scope)
-    avg_write = s.cache_write(Avg, 0, mem_scope)
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-    s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0)
-    s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0)
-
-    # Schedule 'Avg'
-    # Split and reorder the axes to iterate over the output tensor chunks.
-    # Each chunk consists for 2048 bytes. For n11c-2048c tensor layout, each chunk
-    # only contains 2048 channels which get split by a factor of 128 to be vectorized.
-    # NOTE: These schedules are a work in progress and may require
-    # adjustments in future as some of the missing features for 2-d tensors
-    # become available.
-
-    if output_layout == "n11c-2048c-2d":
-        _, _, _, c = s.get_loops(Avg)
-    else:
-        _, c, _, _ = s.get_loops(Avg)
-
-    # n, h, w, c = s.get_loops(Avg)
-    co, ci = s.split(c, [None, 2048])
-    cio, cii = s.split(ci, [None, 128])
-    s.vectorize(cii)
-
-    # Schedule 'Sum'
-    # Compute for 'Sum' includes reduction along height and width. The axes are being
-    # reordered so that 128 channels become the inner-most loop and can be vectorized.
-    # However, vectorization of the 2-d tensors doesn't work when reduction is
-    # involved and requires codegen support that is yet to be added.
-    s.compute_at(Sum, cio)
-    Sum_axis = s.get_loops(Sum)
-    s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3])
-    # s.vectorize(Sum_axis[-3]) # Doesn't work
-    return s
-
-
-def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str):
-    """Quantized avg_pool2d schedule"""
-    if output_layout == "nhwc-8h8w32c-2d" or output_layout == "nchw-8h8w32c-2d":
-        return schedule_8h8w32c(outs, ins, output_layout, input_layout)
-    if output_layout == "n11c-2048c-2d" or output_layout == "nc11-2048c-2d":
-        return schedule_2048c(outs, ins, output_layout, input_layout)
-
-    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/qnn/conv2d_alter_op.py b/python/tvm/topi/hexagon/qnn/conv2d_alter_op.py
deleted file mode 100644
index b8240dccaf9f..000000000000
--- a/python/tvm/topi/hexagon/qnn/conv2d_alter_op.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""QNN Conv2d alter op functions for Hexagon"""
-
-from tvm import relay
-from ...nn import qnn_conv2d_alter_layout
-from ...utils import get_const_tuple
-
-
-@qnn_conv2d_alter_layout.register("hexagon")
-def _alter_qnn_conv2d_layout(attrs, inputs, tinfos, _out_type):
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data_tensor, kernel_tensor, _, _, _, _ = tinfos
-
-    if (
-        "int8" in data_tensor.dtype
-        and "int8" in kernel_tensor.dtype
-        and data_layout == "NCHW"
-        and kernel_layout == "OIHW"
-    ):
-        out_channel, in_channel, _, _ = get_const_tuple(kernel_tensor.shape)
-
-        if out_channel % 32 != 0 or in_channel % 4 != 0:
-            return None
-
-        n_elems = 4
-        oc_bn = 32
-        ic_bn = min(in_channel, 32)
-
-        new_attrs = dict(attrs)
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        new_attrs["kernel_layout"] = f"OIHW{ic_bn // n_elems:n}i{oc_bn:n}o{n_elems:n}i"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        return relay.qnn.op.conv2d(*inputs, **new_attrs)
-
-    return None
diff --git a/python/tvm/topi/hexagon/qnn/dense_alter_op.py b/python/tvm/topi/hexagon/qnn/dense_alter_op.py
deleted file mode 100644
index 1935bbda036e..000000000000
--- a/python/tvm/topi/hexagon/qnn/dense_alter_op.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""QNN Dense alter op functions for Hexagon"""
-
-from tvm import relay
-from ..dense_alter_op import check_vrmpy_applicable
-from ...nn import qnn_dense_alter_layout
-
-
-@qnn_dense_alter_layout.register("hexagon")
-def _alter_qnn_dense_layout(_attrs, inputs, tinfos, out_type):
-    data_tensor = tinfos[0]
-    weight_tensor = tinfos[1]
-
-    if check_vrmpy_applicable(data_tensor, weight_tensor):
-        weight_layout = "NC32n4c"
-        return relay.qnn.op.contrib_dense_pack(*inputs, weight_layout, None, out_type.dtype)
-    else:
-        return None
diff --git a/python/tvm/topi/hexagon/qnn/dequantize.py b/python/tvm/topi/hexagon/qnn/dequantize.py
deleted file mode 100644
index 3e1466e88b38..000000000000
--- a/python/tvm/topi/hexagon/qnn/dequantize.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-""" Hexagon qnn.dequantize slice op compute and schedule"""
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn
-
-
-def dequantize_compute(tensor_A, scale_A, zero_point_A):
-
-    return te.compute(
-        tensor_A.shape,
-        lambda *indices: (scale_A * (tensor_A[indices] - zero_point_A)).astype("float32"),
-        name="dequantize",
-    )
-
-
-def dequantize_stir_schedule_nhwc_8h8w32c(
-    _in,
-    _out,
-    in_layout,
-    out_layout,
-):
-    """Schedule for nhwc int8/uint8 to f32 : nhwc layout"""
-    func = te.create_prim_func([_in, _out])
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "dequantize"
-    n, h, w, c = sch.get_loops(sch.get_block(block_name))
-    ho, hi = sch.split(h, [None, 4])
-    wo, wi = sch.split(w, [None, 8])
-    wio, wii = sch.split(wi, [None, 4])
-    co, ci = sch.split(c, [None, 32])
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    sch.reorder(n, ho, wo, co, hi, wio, wii, ci)
-    wii_ci = sch.fuse(wii, ci)
-    sch.vectorize(wii_ci)
-    return sch
-
-
-def dequantize_stir_schedule_nc(
-    _in,
-    _out,
-    in_layout,
-    out_layout,
-):
-    """Schedule for nc int8/uint8 to f32 : nc layout"""
-    func = te.create_prim_func([_in, _out])
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "dequantize"
-    _, c_orig = sch.get_loops(sch.get_block(block_name))
-    _, c_inner = sch.split(c_orig, [None, 512])
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    sch.vectorize(c_inner)
-    return sch
-
-
-def dequantize_schedule(_in, _output, in_layout_str, out_layout_str):
-    """Schedule for int8/uint8 to f32 : top level function"""
-    f32_layout_transform_func = get_layout_transform_fn(out_layout_str)
-    in_layout_transform_func = get_layout_transform_fn(in_layout_str)
-    if out_layout_str == "nhwc-4h2w32c2w-2d":
-        return dequantize_stir_schedule_nhwc_8h8w32c(
-            _in,
-            _output,
-            in_layout_transform_func,
-            f32_layout_transform_func,
-        )
-    if out_layout_str == "nc-512c-2d":
-        return dequantize_stir_schedule_nc(
-            _in,
-            _output,
-            in_layout_transform_func,
-            f32_layout_transform_func,
-        )
-    raise RuntimeError(f"Unexpected layout '{layout}'")
diff --git a/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py b/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py
deleted file mode 100644
index 24d5224f71cf..000000000000
--- a/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Assumptions:
-1) The input is in NCHW layout. Squeezenet is the only model that calls
-   nn.global_avg_pool2d and the only layout it uses is 'NCHW'.
-2) Both input and output dtype is uint8 and
-   quantization parameter is provided to the op.
-3) Input is assumed to always be multiple of fixed chunk 32c8h8w.
-"""
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate
-
-
-def global_avg_pool2d_u8(
-    data: te.Tensor,
-    odtype: str,
-    input_zero_point: int,
-    input_scale: float,
-    output_zero_point: int,
-    output_scale: float,
-):
-    """global_avg_pool2d"""
-    input_b, input_c, input_h, input_w = data.shape
-    oshape = (input_b, input_c) + (1, 1)
-
-    if input_h * input_w < 256:
-        bits = "16"
-    else:
-        bits = "32"
-
-    if odtype == "uint8":
-        temp_dtype = "uint" + bits
-    elif odtype == "int8":
-        temp_dtype = "int" + bits
-    else:
-        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
-
-    pool_area = input_h * input_w
-    rh_r = te.reduce_axis((0, input_h), name="rh_r")
-    rw_r = te.reduce_axis((0, input_w), name="rw_r")
-
-    scale_with_area = input_scale / (output_scale * int(pool_area))
-    scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16")
-    corr = (output_zero_point << rsh) - input_zero_point * pool_area * scale_fixed_point
-
-    sum_compute = te.compute(
-        oshape,
-        lambda n, c, h, w: te.sum(
-            data[n, c, h + rh_r, w + rw_r].astype(temp_dtype), axis=[rh_r, rw_r]
-        ),
-        name="sum",
-    )
-
-    avg_compute = te.compute(
-        oshape,
-        lambda n, c, h, w: saturate(
-            ((sum_compute[n, c, h, w] * scale_fixed_point) + corr) >> rsh, odtype
-        ).astype(odtype),
-        name="global_avg_pool2d",
-    )
-
-    return avg_compute
-
-
-def stir_global_avg_pool2d_u8_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str):
-    """Schedule"""
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-
-    sum_block = s.get_block("sum")
-
-    # Input is multiple of fixed chunk but output is NxCx1x1
-    # Hence transform_layout is only applied on input
-    input_transformed_layout = get_layout_transform_fn(input_layout)
-    s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout)
-
-    return s
diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py
deleted file mode 100644
index 28eea59e59b5..000000000000
--- a/python/tvm/topi/hexagon/qnn/nn.py
+++ /dev/null
@@ -1,1032 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hexagon QNN operators"""
-# pylint: disable=invalid-name
-
-from typing import Union
-import numpy as np
-
-import tvm
-from tvm import te, topi
-from ..utils import (
-    saturate,
-    is_scalar,
-    get_const_int_value,
-    get_const_float_value,
-    get_fixed_point_value,
-)
-from ...utils import get_const_tuple
-from ...nn.utils import get_pad_tuple
-from ...nn.pad import pad
-from ... import tag, nn
-from ..conv2d import conv2d_NCHWc_int8
-from ...transform import concatenate
-
-
-def clip_cast(val, dtype):
-    # clip + cast:
-    const_min = tvm.tir.min_value(dtype)
-    const_max = tvm.tir.max_value(dtype)
-    return te.max(tvm.te.min(val, const_max), const_min).astype(dtype)
-
-
-def is_relax_constant(expr):
-    return hasattr(expr.op, "value") and isinstance(expr.op.value, tvm.relax.expr.Constant)
-
-
-def get_relax_scalar_const_value(expr):
-    assert len(expr.op.value.data.shape) == 0
-    return expr.op.value.data.numpy()[()]
-
-
-def get_qnn_param(param, indices, axis):
-    # Account scalar and 1D quantization parameters:
-    if is_scalar(param):
-        return param
-
-    param_idx = tvm.tir.indexmod(indices[axis], topi.shape(param)[0])
-    return param[param_idx]
-
-
-def subtract_zero_point(tensor: te.Tensor, zero_point: Union[te.Tensor, tvm.tir.IntImm], name: str):
-    """
-    Subtract zero point from given tensor. If zero point is scalar constant and is equal to 0, then
-    it can be optimized and return tensor as it is.
-    This new block is marked with 'meta_schedule.inline_rule = disable' attribute to disable inline.
-    Otherwise, inline prevents from tensorization and leveraging vrmpy intrinsic
-    """
-    if is_scalar(zero_point) and get_const_int_value(zero_point) == 0:
-        return tensor
-    else:
-        return te.compute(
-            tensor.shape,
-            lambda *i: te.subtract(tensor(*i), zero_point).astype(tensor.dtype),
-            name=name,
-            attrs={"meta_schedule.inline_rule": "disable"},
-        )
-
-
-def default_schedule(outs):
-    """Simple default schedule for QNN ops.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of dense in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    for x in outs:
-        fused = s[x].fuse(*x.op.axis)
-        outer, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize)
-        s[x].vectorize(inner)
-        s[x].parallel(outer)
-    return s
-
-
-def qnn_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"):
-    """Compute for qnn.quantize
-
-    Q_output = clamp((round(input_tensor/output_scale) + output_zero_point),
-                     out_dtype::min,
-                     out_dtype::max)
-    """
-
-    assert len(output_scale.shape) == 0 or len(output_scale.shape) == 1
-    assert len(output_zero_point.shape) == 0 or len(output_zero_point.shape) == 1
-
-    def _compute(*indices):
-        value = data(*indices)
-        scale = get_qnn_param(output_scale, indices, axis)
-        zp = get_qnn_param(output_zero_point, indices, axis)
-
-        val = te.add(te.round(te.div(value, scale)), zp)
-        return clip_cast(val, out_dtype)
-
-    return te.compute(data.shape, _compute, tag=tag.ELEMWISE)
-
-
-def schedule_qnn_quantize(outs):
-    """Schedule for qnn.quantize
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.quantize
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_dequantize(data, input_scale, input_zero_point, axis=-1):
-    """Compute for qnn.dequantize
-
-    fp_output = input_scale * (Q_input - input_zero_point)
-    """
-
-    def _compute(*indices):
-        value = data(*indices)
-        scale = get_qnn_param(input_scale, indices, axis)
-        zp = get_qnn_param(input_zero_point, indices, axis)
-
-        return te.multiply(scale, te.subtract(value, zp))
-
-    return te.compute(data.shape, _compute, tag=tag.ELEMWISE)
-
-
-def schedule_qnn_dequantize(outs):
-    """Schedule for qnn.dequantize
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.dequantize
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_requantize(
-    data: te.Tensor, input_scale, input_zp, output_scale, output_zp, axis=-1, out_dtype="int8"
-):
-    """Compute for qnn.requantize
-
-    If both input and output scales are constant scalars then we convert scale to fixed point value
-    and use integer arithmetic only for performance optimization purpose.
-    But this is a tradeoff between performance and accuracy, since we use int16 data type to
-    represent fixed point values (against QNN lowering approach where we use int32 for that).
-
-    if input and/or output scales are not constant scalars then we use the following formula:
-        Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
-
-    TODO: support 'rounding' and 'compute_dtype' arguments.
-    """
-    if is_scalar(input_scale) and is_scalar(output_scale):
-        iscale = get_const_float_value(input_scale)
-        oscale = get_const_float_value(output_scale)
-        scale = iscale / oscale
-        scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
-
-        def _compute(*indices):
-            value = data(*indices)
-            # Subtract input zero point:
-            sub = te.subtract(value, input_zp)
-            # Fixed point multiply + roundup delta:
-            mul = (sub * scale_fixed_point + (1 << (rsh - 1))) >> rsh
-            # Add output zero point + clip + cast:
-            return saturate(te.add(mul, output_zp), out_dtype).astype(out_dtype)
-
-        return te.compute(data.shape, _compute, name="requantize_scalar")
-
-    else:
-
-        def _compute(*indices):
-            value = data(*indices)
-            iscale = get_qnn_param(input_scale, indices, axis)
-            oscale = get_qnn_param(output_scale, indices, axis)
-
-            # Subtract input zero point:
-            sub = te.subtract(value, input_zp)
-            mul = te.div(iscale, oscale)
-            val = te.add(te.round(te.multiply(mul, sub)), output_zp)
-            # clip + cast:
-            return saturate(val, out_dtype).astype(out_dtype)
-
-        return te.compute(data.shape, _compute, name="requantize")
-
-
-def schedule_qnn_requantize(outs):
-    """Schedule for qnn.requantize
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.requantize
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def compute_qnn_binary_op(
-    lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp, func
-):
-    """Compute for QNN binary operation
-
-    If rhs/lhs/output scales are constant scalars then we convert scale to fixed point value
-    and use integer arithmetic only for performance optimization purpose.
-    But this is a tradeoff between performance and accuracy, since we use int16 data type to
-    represent fixed point values (against QNN lowering approach where we use int32 for that).
-
-    if rhs/lhs/output scales are not constant scalars then we use the following formula:
-        Q_output = output_zp + round((lhs_scale)/(output_scale) * (lhs_input - lhs_zp))
-                        _OP_ round((rhs_scale)/(output_scale) * (rhs_input - rhs_zp))
-        where _OP_ is add/subtract
-    """
-    assert lhs.dtype == rhs.dtype
-    dtype = lhs.dtype
-
-    def _compute_const(x: te.Tensor, iscale, input_zp):
-        return te.round(te.multiply(te.div(iscale, output_scale), te.subtract(x, input_zp))).astype(
-            "int32"
-        )
-
-    def _compute_tensor(x: te.Tensor, input_scale, input_zp):
-        if is_scalar(input_scale) and is_scalar(output_scale):
-            iscale = get_const_float_value(input_scale)
-            oscale = get_const_float_value(output_scale)
-            scale = iscale / oscale
-            scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
-            return te.compute(
-                x.shape,
-                lambda *i: (te.subtract(x(*i), input_zp) * scale_fixed_point + (1 << (rsh - 1)))
-                >> rsh,
-            )
-        else:
-            return te.compute(
-                x.shape,
-                lambda *i: te.round(
-                    te.multiply(te.div(input_scale, output_scale), te.subtract(x(*i), input_zp))
-                ).astype("int32"),
-            )
-
-    if is_scalar(lhs):
-        lhs_tensor = _compute_const(lhs, lhs_scale, lhs_zp)
-    else:
-        lhs_tensor = _compute_tensor(lhs, lhs_scale, lhs_zp)
-
-    if is_scalar(rhs):
-        rhs_tensor = _compute_const(rhs, rhs_scale, rhs_zp)
-    else:
-        rhs_tensor = _compute_tensor(rhs, rhs_scale, rhs_zp)
-
-    # Binary op with broadcasting
-    tensor = func(lhs_tensor, rhs_tensor)
-
-    # Add output zero point and clip+cast.
-    def _compute(*indices):
-        return saturate(te.add(tensor(*indices), output_zp), dtype).astype(dtype)
-
-    return te.compute(tensor.shape, _compute)
-
-
-def qnn_add(lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp):
-    """Compute for qnn.add
-    TODO: support 'axis' argument.
-    """
-    return compute_qnn_binary_op(
-        lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp, topi.add
-    )
-
-
-def schedule_qnn_add(outs):
-    """Schedule for qnn.add
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.add
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_subtract(lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp):
-    """Compute for qnn.subtract"""
-
-    return compute_qnn_binary_op(
-        lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp, topi.subtract
-    )
-
-
-def schedule_qnn_subtract(outs):
-    """Schedule for qnn.subtract
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.add
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_mul(
-    lhs: te.Tensor,
-    rhs: te.Tensor,
-    lhs_scale: te.Tensor,
-    lhs_zp: te.Tensor,
-    rhs_scale: te.Tensor,
-    rhs_zp: te.Tensor,
-    output_scale: te.Tensor,
-    output_zp: te.Tensor,
-):
-    """Compute for qnn.mul
-
-    mul = (lhs_input - lhs_zp) * (rhs_input - rhs_zp)
-    Q_output = requantize(mul, lhs_scale * rhs_scale, 0, output_scale, output_zp)
-    """
-    assert lhs.dtype == rhs.dtype
-    odtype = lhs.dtype
-
-    def _compute_tensor(tensor, zero_point):
-        if is_scalar(tensor):
-            return tensor - zero_point
-        else:
-            return te.compute(tensor.shape, lambda *i: te.subtract(tensor(*i), zero_point))
-
-    lhs_tensor = _compute_tensor(lhs, lhs_zp)
-    rhs_tensor = _compute_tensor(rhs, rhs_zp)
-
-    # Multiply with broadcasting.
-    mul = topi.multiply(lhs_tensor, rhs_tensor)
-
-    if is_scalar(lhs_scale) and is_scalar(rhs_scale):
-        assert isinstance(lhs_scale, te.Tensor)
-        assert isinstance(rhs_scale, te.Tensor)
-        iscale_val = get_const_float_value(lhs_scale.op.body[0]) * get_const_float_value(
-            rhs_scale.op.body[0]
-        )
-        iscale = tvm.tir.const(iscale_val)
-    else:
-        iscale = lhs_scale * rhs_scale
-
-    return qnn_requantize(mul, iscale, tvm.tir.const(0), output_scale, output_zp, out_dtype=odtype)
-
-
-def schedule_qnn_mul(outs):
-    """Schedule for qnn.mul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.add
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_tanh(data, input_scale, input_zp, output_scale, output_zp):
-    """Compute for qnn.tanh
-
-    Q_output = quantize(tanh(dequantize(data)))
-    """
-    dq_tensor = qnn_dequantize(data, input_scale, input_zp)
-    tanh = te.compute(dq_tensor.shape, lambda *i: te.tanh(dq_tensor(*i)))
-    return qnn_quantize(tanh, output_scale, output_zp, out_dtype=data.dtype)
-
-
-def schedule_qnn_tanh(outs):
-    """Schedule for qnn.tanh
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.add
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_concatenate(data, axis, out_dtype):
-    """Compute for qnn.concatenate
-
-    Parameters
-    ----------
-    data: Array of Tensor
-          The computation graph description of qnn.concatenate
-          in the format of an array of tensors.
-
-    axis: int
-          The axis along which the tensors are concatenated.
-
-    out_dtype: string
-          Data type of output tensor
-
-    Returns
-    -------
-    out: Tensor
-        The computation for the op.
-    """
-
-    # Get output quantization parameters.
-    o_scale = data[-2]
-    o_zp = data[-1]
-
-    # Initially qnn.concatenate had 3 tuples: (1) tuple with input tensors, (2) tuple with input
-    # scales and (3) tuple with input zero points.
-    # Last 2 elements in data represent output scale and zero point.
-    num_of_tuples = 3
-    assert ((len(data) - 2) % num_of_tuples) == 0
-    args_num = (len(data) - 2) // num_of_tuples
-
-    args = []
-    for i in range(args_num):
-        # Get next tensor and its quantization parameters.
-        tensor = data[i]
-        i_scale = data[i + args_num]
-        i_zp = data[i + args_num * 2]
-
-        # Requantize tensors and add them to the list.
-        args.append(qnn_requantize(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype=out_dtype))
-
-    # Call generic implementation of concatenate.
-    return concatenate(args, axis)
-
-
-def schedule_qnn_concatenate(outs):
-    """Schedule for qnn.concatenate
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.add
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_conv2d(  # Conv2d inputs
-    data,
-    weight,
-    # Conv2d quantization params:
-    input_zero_point,
-    kernel_zero_point,
-    _input_scale,
-    _kernel_scale,
-    # bias
-    bias,
-    # Requantization params:
-    rq_input_scale,
-    rq_input_zero_point,
-    rq_output_scale,
-    rq_output_zero_point,
-    # Conv2d attributes:
-    strides,
-    padding,
-    dilation,
-    oshape,
-    odtype,
-):
-    """Compute for qnn.conv2d with NCHW layout.
-
-    Output data type should be specified through the 'odtype' parameter. qnn.conv2d leverages int32
-    type to store intermediate results. If 'odtype' differs from int32, you need to specify
-    requantization parameters.
-    """
-    in_channel = data.shape[1]  # NCHW layout
-    kernel_height = weight.shape[2]  # OIHW layout
-    kernel_width = weight.shape[3]  # OIHW layout
-
-    height_stride, width_stride = strides
-    dilation_h, dilation_w = dilation
-
-    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        get_const_tuple(padding), (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    # Subtract zero point from weights. axis=0 in get_qnn_param means 'O' dimension in "OIHW"
-    # weights layout.
-    weight = te.compute(
-        weight.shape,
-        lambda *indices: te.subtract(
-            weight(*indices), get_qnn_param(kernel_zero_point, indices, axis=0)
-        ),
-    )
-
-    # Subtract zero point from input and then do padding with 0 value
-    data = te.compute(data.shape, lambda *indices: te.subtract(data(*indices), input_zero_point))
-
-    # DOPAD
-    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
-        pad_before = (0, 0, pad_top, pad_left)
-        pad_after = (0, 0, pad_down, pad_right)
-        data_pad = pad(data, pad_before, pad_after, name="data_pad")
-    else:
-        data_pad = data
-
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-
-    out = te.compute(
-        oshape,
-        lambda n, oc, oh, ow: te.sum(
-            data_pad[
-                n, ic, oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w
-            ].astype("int32")
-            * weight[oc, ic, kh, kw].astype("int32"),
-            axis=[ic, kh, kw],
-        ),
-    )
-
-    # Add bias
-    if bias is not None:
-        assert len(out.shape) == len(bias.shape)
-        assert bias.shape[2] == 1 and bias.shape[3] == 1
-        out = te.compute(out.shape, lambda n, c, h, w: out[n, c, h, w] + bias[n, c, 0, 0])
-
-    # Requantize output of convolution
-    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
-    if rq_input_scale is not None and rq_output_scale is not None:
-        # Now supported only scalar and 1D quantization parameters
-        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
-        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
-        axis = -1
-        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
-            axis = 1  # Axis param should correspond to 'C' dimension.
-
-        return qnn_requantize(
-            out,
-            rq_input_scale,
-            rq_input_zero_point,
-            rq_output_scale,
-            rq_output_zero_point,
-            axis,
-            odtype,
-        )
-
-    return out
-
-
-def schedule_qnn_conv2d(outs):
-    """Schedule for qnn.conv2d
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.conv2d
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_conv2d_NCHWc_int8(  # Conv2d inputs
-    data,
-    weight,
-    # Conv2d quantization params:
-    input_zero_point,
-    kernel_zero_point,
-    _input_scale,
-    _kernel_scale,
-    # bias
-    bias,
-    # Requantization params:
-    rq_input_scale,
-    rq_input_zero_point,
-    rq_output_scale,
-    rq_output_zero_point,
-    # Conv2d attributes:
-    strides,
-    padding,
-    dilation,
-    _oshape,
-    odtype,
-):
-    """Compute for qnn.conv2d with NCHWc layout."""
-
-    # Subtract zero point from input and weights.
-    weight = subtract_zero_point(weight, kernel_zero_point, "weight_zp")
-    data = subtract_zero_point(data, input_zero_point, "data_zp")
-
-    strides = get_const_tuple(strides)
-    padding = get_const_tuple(padding)
-    dilation = get_const_tuple(dilation)
-    out = conv2d_NCHWc_int8(data, weight, strides, padding, dilation, "NCHW32c", "NCHW32c")
-
-    # Add bias
-    if bias is not None:
-        assert len(out.shape) == len(bias.shape)
-        assert bias.shape[2] == 1 and bias.shape[3] == 1
-        out = te.compute(
-            out.shape,
-            lambda n, c, h, w, ci: out[n, c, h, w, ci] + bias[n, c, 0, 0, ci],
-            name="bias_add",
-        )
-
-    # Requantize output of convolution
-    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
-    if rq_input_scale is not None and rq_output_scale is not None:
-        # Now supported only scalar and 1D quantization parameters
-        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
-        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
-        axis = -1
-        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
-            axis = 1  # Axis param should correspond to 'C' dimension.
-
-        return qnn_requantize(
-            out,
-            rq_input_scale,
-            rq_input_zero_point,
-            rq_output_scale,
-            rq_output_zero_point,
-            axis,
-            odtype,
-        )
-
-    return out
-
-
-def schedule_qnn_conv2d_NCHWc_int8(outs):
-    """Schedule for qnn.conv2d with NCHWc layout."""
-
-    return default_schedule(outs)
-
-
-def qnn_depthwise_conv2d(  # Conv2d inputs
-    data,
-    weight,
-    # Conv2d quantization params:
-    input_zero_point,
-    kernel_zero_point,
-    _input_scale,
-    _kernel_scale,
-    # bias
-    bias,
-    # Requantization params:
-    rq_input_scale,
-    rq_input_zero_point,
-    rq_output_scale,
-    rq_output_zero_point,
-    # Conv2d attributes:
-    strides,
-    padding,
-    dilation,
-    oshape,
-    odtype,
-):
-    """Compute for qnn.conv2d with NCHW layout
-
-    Output data type should be specified through the 'odtype' parameter. qdepthwise nn.conv2d
-    leverages int32 type to store intermediate results. If 'odtype' differs from int32, you need to
-    specify requantization parameters.
-    """
-    kernel_height = weight.shape[2]  # OIHW layout
-    kernel_width = weight.shape[3]  # OIHW layout
-
-    height_stride, width_stride = strides
-    dilation_h, dilation_w = dilation
-
-    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        get_const_tuple(padding), (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    # Subtract zero point from input and then do padding with 0 value
-    data = te.compute(data.shape, lambda *indices: te.subtract(data(*indices), input_zero_point))
-
-    # DOPAD
-    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
-        pad_before = (0, 0, pad_top, pad_left)
-        pad_after = (0, 0, pad_down, pad_right)
-        data_pad = pad(data, pad_before, pad_after, name="data_pad")
-    else:
-        data_pad = data
-
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-
-    out = te.compute(
-        oshape,
-        lambda n, oc, oh, ow: te.sum(
-            data_pad[
-                n, oc, oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w
-            ].astype("int32")
-            * te.subtract(weight[oc, 0, kh, kw], kernel_zero_point).astype("int32"),
-            axis=[kh, kw],
-        ),
-    )
-
-    # Add bias
-    if bias is not None:
-        assert len(out.shape) == len(bias.shape)
-        assert bias.shape[2] == 1 and bias.shape[3] == 1
-        out = te.compute(out.shape, lambda n, c, h, w: out[n, c, h, w] + bias[n, c, 0, 0])
-
-    # Requantize output of convolution
-    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
-    if rq_input_scale is not None and rq_output_scale is not None:
-        # Now supported only scalar and 1D quantization parameters
-        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
-        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
-        axis = -1
-        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
-            axis = 1  # Axis param should correspond to 'C' dimension.
-
-        return qnn_requantize(
-            out,
-            rq_input_scale,
-            rq_input_zero_point,
-            rq_output_scale,
-            rq_output_zero_point,
-            axis,
-            odtype,
-        )
-
-    return out
-
-
-def schedule_qnn_depthwise_conv2d(outs):
-    """Schedule for depthwise qnn.conv2d
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.conv2d
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_dense(
-    data,
-    weight,
-    # Dense quantization params:
-    input_zero_point,
-    kernel_zero_point,
-    _input_scale,
-    _kernel_scale,
-    # bias
-    bias,
-    # Requantization params:
-    rq_input_scale,
-    rq_input_zero_point,
-    rq_output_scale,
-    rq_output_zero_point,
-    out_dtype,
-):
-    """Compute for qnn.dense
-
-    Output data type should be specified through the 'odtype' parameter. qnn.dense leverages int32
-    type to store intermediate results. If 'odtype' differs from int32, you need to specify
-    requantization parameters.
-    """
-    M, K = get_const_tuple(data.shape)
-    N, _ = get_const_tuple(weight.shape)
-    k = te.reduce_axis((0, K), "k")
-    # This implementation uses "int32" dense output data type.
-    # axis=0 in get_qnn_param mean 'N' dimension in "NK" weights layout.
-    out = te.compute(
-        (M, N),
-        lambda m, n: te.sum(
-            te.subtract(data[m, k], input_zero_point).astype("int32")
-            * te.subtract(weight[n, k], get_qnn_param(kernel_zero_point, (n, k), axis=0)).astype(
-                "int32"
-            ),
-            axis=k,
-        ),
-    )
-
-    # Add bias
-    if bias is not None:
-        out = te.compute(out.shape, lambda n, c: out[n, c] + bias[0, c])
-
-    # Requantize output of dense
-    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
-    if rq_input_scale is not None and rq_output_scale is not None:
-        # Now supported only scalar and 1D quantization parameters
-        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
-        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
-        axis = -1
-        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
-            axis = 1  # Axis param should correspond to 'N' dimension.
-
-        return qnn_requantize(
-            out,
-            rq_input_scale,
-            rq_input_zero_point,
-            rq_output_scale,
-            rq_output_zero_point,
-            axis,
-            out_dtype,
-        )
-
-    return out
-
-
-def schedule_qnn_dense(outs):
-    """Schedule for qnn.dense
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.dense
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_dense_pack_vrmpy(
-    data: te.Tensor,
-    weight: te.Tensor,
-    # Dense quantization params:
-    input_zero_point: te.Tensor,
-    kernel_zero_point: te.Tensor,
-    _input_scale: te.Tensor,
-    _kernel_scale: te.Tensor,
-    # bias
-    bias: te.Tensor,
-    # Requantization params:
-    rq_input_scale: te.Tensor,
-    rq_input_zero_point: te.Tensor,
-    rq_output_scale: te.Tensor,
-    rq_output_zero_point: te.Tensor,
-    out_dtype: str,
-):
-    """Compute for qnn.contrib_dense_pack
-
-    Output data type should be specified through the 'odtype' parameter. qnn.dense leverages int32
-    type to store intermediate results. If 'odtype' differs from int32, you need to specify
-    requantization parameters.
-    """
-    # Subtract zero point from input and weights.
-    weight = subtract_zero_point(weight, kernel_zero_point, "weight_zp")
-    data = subtract_zero_point(data, input_zero_point, "data_zp")
-
-    # Required for vrmpy intrinsic
-    assert "int8" in weight.dtype and "int8" in data.dtype
-
-    M, K = get_const_tuple(data.shape)
-    N_O, _, N_I, _ = get_const_tuple(weight.shape)
-    k = te.reduce_axis((0, K), "k")
-    out = te.compute(
-        (M, N_O * N_I),
-        lambda m, n: te.sum(
-            data[m, k].astype("int32")
-            * weight[
-                tvm.tir.indexdiv(n, 32),
-                tvm.tir.indexdiv(k, 4),
-                tvm.tir.indexmod(n, 32),
-                tvm.tir.indexmod(k, 4),
-            ].astype("int32"),
-            axis=k,
-        ),
-        name="qnn_dense_pack",
-    )
-
-    # Add bias
-    if bias is not None:
-        assert bias.ndim == 2
-        out = te.compute(out.shape, lambda n, c: out[n, c] + bias[0, c])
-
-    # Requantize output of qnn.contrib_dense_pack
-    if rq_input_scale is not None and rq_output_scale is not None:
-        # Now supported only scalar and 1D quantization parameters
-        assert rq_input_scale.ndim == 0 or rq_input_scale.ndim == 1
-        assert rq_output_scale.ndim == 0 or rq_output_scale.ndim == 1
-        axis = -1
-        if rq_input_scale.ndim == 1 or rq_output_scale.ndim == 1:
-            axis = 1  # Axis param should correspond to 'C' dimension.
-
-        return qnn_requantize(
-            out,
-            rq_input_scale,
-            rq_input_zero_point,
-            rq_output_scale,
-            rq_output_zero_point,
-            axis,
-            out_dtype,
-        )
-
-    return out
-
-
-def schedule_qnn_dense_pack_vrmpy(outs):
-    """Schedule for qnn.contrib_dense_pack
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.dense
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
-
-
-def qnn_batch_matmul(
-    tensor_a,
-    tensor_b,
-    # batch_matmul quantization params:
-    a_zero_point,
-    b_zero_point,
-    _a_scale,
-    _b_scale,
-    # Attributes
-    transpose_a,
-    transpose_b,
-    out_dtype,
-):
-    """Compute for qnn.batch_matmul"""
-
-    # Preprocess tensor_a: subtract zp
-    a_sub_zp = te.compute(
-        tensor_a.shape, lambda *indices: te.subtract(tensor_a(*indices), a_zero_point)
-    )
-    # Preprocess tensor_b: subtract zp
-    b_sub_zp = te.compute(
-        tensor_b.shape, lambda *indices: te.subtract(tensor_b(*indices), b_zero_point)
-    )
-
-    return nn.batch_matmul(a_sub_zp, b_sub_zp, None, out_dtype, transpose_a, transpose_b)
-
-
-def schedule_qnn_batch_matmul(outs):
-    """Schedule for qnn.batch_matmul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of qnn.batch_matmul
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return default_schedule(outs)
diff --git a/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py
deleted file mode 100644
index a974ad643107..000000000000
--- a/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py
+++ /dev/null
@@ -1,270 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""Compute and schedule for quantized add, multiply, subtract op
-
-Please note the following assumptions made by the implementation:
-
-1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting."""
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn, get_fixed_point_value
-
-
-def broadcast_axis(tensor_A, tensor_B):
-    """Find out the indices that will have broadcasting"""
-    A_broadcast = []
-    B_broadcast = []
-
-    for i in range(len(tensor_A.shape)):
-        if tensor_A.shape[i] == tensor_B.shape[i]:
-            A_broadcast.append(1)
-            B_broadcast.append(1)
-        elif tensor_A.shape[i] == 1:
-            A_broadcast.append(0)
-            B_broadcast.append(1)
-        elif tensor_B.shape[i] == 1:
-            A_broadcast.append(1)
-            B_broadcast.append(0)
-    return A_broadcast, B_broadcast
-
-
-def saturate(x: te.Tensor, dtype: str):
-    """Saturate value for the specified data type"""
-    return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype)))
-
-
-def get_int_scale(
-    scale_A: float,
-    scale_B: float,
-    scale_M: float,
-    zero_point_A: int,
-    zero_point_B: int,
-    zero_point_M: int,
-    op: str,
-):
-    """
-    Get fixed-point number and exp_scale_factor from topi.hexagon.utils.get_fixed_point_value.
-    Also, depending on the op, this function uses exp_scale_factor(log2 of the scale factor)
-    to adjust the output's zero_point.
-    """
-
-    C_recip = 1 / scale_M
-
-    if op == "qmul":
-        scale = scale_A * scale_B * C_recip
-        scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
-
-        # We need to adjust output's zero point value since the compute for the op is multiplied
-        # by a scaling factor.
-        # The scaling factor is 2^x where x is the exp_scale_factor which is assigned to rsh here.
-        # Since zero_point_M is multipled by 2^rsh while converting floating-point scale value
-        # into fixed-point number, we left shift it by rsh in our compute to reflect that.
-
-        corr = zero_point_M << rsh
-
-        return scale_fixed_point, rsh, corr
-
-    a_scale_f = scale_A * C_recip
-    b_scale_f = scale_B * C_recip
-    scale_fixed_point_a, rsh_a = get_fixed_point_value(a_scale_f, "int16")
-    scale_fixed_point_b, rsh_b = get_fixed_point_value(b_scale_f, "int16")
-
-    # Here we have two exp_scale_factors rsh_a and rsh_b.
-    # To avoid complexity, we want to use a common exp_scale_factor and
-    # we want to use the lowest of the two.
-
-    # Since, either of scale_fixed_point_a or scale_fixed_point_b has already been multiplied
-    # by 2^max(rsh_a, rsh_b) in topi.hexagon.utils.get_fixed_point_value,
-    # we want to undo that by right shifting that scale_fixed_point value
-    # by the difference of rsh_a and rsh_b.
-
-    # This results into having a common exp_scale_factor for both scale_fixed_point_a
-    # and scale_fixed_point_b.
-
-    # We also set rsh here which is used to adjust the zero_point_M and compute the corr value,
-    # computation of which comes from the original equation of the op's compute.
-
-    if rsh_a > rsh_b:
-        scale_fixed_point_a = scale_fixed_point_a >> (rsh_a - rsh_b)
-        rsh = rsh_b
-    else:
-        scale_fixed_point_b = scale_fixed_point_b >> (rsh_b - rsh_a)
-        rsh = rsh_a
-
-    if op == "qadd":
-        corr = (zero_point_M << rsh) - (
-            zero_point_A * scale_fixed_point_a + zero_point_B * scale_fixed_point_b
-        )
-    else:
-        corr = (zero_point_M << rsh) - (
-            zero_point_A * scale_fixed_point_a - zero_point_B * scale_fixed_point_b
-        )
-
-    return scale_fixed_point_a, scale_fixed_point_b, rsh, corr
-
-
-def qadd_broadcast_compute(
-    tensor_A: te.Tensor,
-    tensor_B: te.Tensor,
-    output_shape: list,
-    zero_point_A: int,
-    scale_A: float,
-    zero_point_B: int,
-    scale_B: float,
-    zero_point_M: int,
-    scale_M: float,
-    dtype: str,
-):
-    """Compute quantized add with broadcasting"""
-    A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B)
-    n_a, h_a, w_a, c_a = A_broadcast
-    n_b, h_b, w_b, c_b = B_broadcast
-
-    scale_a, scale_b, rsh, corr = get_int_scale(
-        scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qadd"
-    )
-
-    return te.compute(
-        output_shape,
-        lambda n, h, w, c: saturate(
-            (
-                (
-                    (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a)
-                    + (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b)
-                    + corr
-                )
-                >> rsh
-            ),
-            dtype,
-        ).astype(dtype),
-    )
-
-
-def qsubtract_broadcast_compute(
-    tensor_A: te.Tensor,
-    tensor_B: te.Tensor,
-    output_shape: list,
-    zero_point_A: int,
-    scale_A: float,
-    zero_point_B: int,
-    scale_B: float,
-    zero_point_M: int,
-    scale_M: float,
-    dtype: str,
-):
-    """Compute quantized subtract with broadcasting"""
-    A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B)
-    n_a, h_a, w_a, c_a = A_broadcast
-    n_b, h_b, w_b, c_b = B_broadcast
-
-    scale_a, scale_b, rsh, corr = get_int_scale(
-        scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qsub"
-    )
-
-    return te.compute(
-        output_shape,
-        lambda n, h, w, c: saturate(
-            (
-                (
-                    (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a)
-                    - (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b)
-                    + corr
-                )
-                >> rsh
-            ),
-            dtype,
-        ).astype(dtype),
-    )
-
-
-def qmultiply_broadcast_compute(
-    tensor_A: te.Tensor,
-    tensor_B: te.Tensor,
-    output_shape: list,
-    zero_point_A: int,
-    scale_A: float,
-    zero_point_B: int,
-    scale_B: float,
-    zero_point_M: int,
-    scale_M: float,
-    dtype: str,
-):
-    """Compute quantized multiply with broadcasting"""
-    A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B)
-    n_a, h_a, w_a, c_a = A_broadcast
-    n_b, h_b, w_b, c_b = B_broadcast
-
-    scale_int, rsh, corr = get_int_scale(
-        scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qmul"
-    )
-
-    return te.compute(
-        output_shape,
-        lambda n, h, w, c: saturate(
-            (
-                (
-                    scale_int
-                    * (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] - zero_point_A)
-                    * (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] - zero_point_B)
-                    + corr
-                )
-                >> rsh
-            ),
-            dtype,
-        ).astype(dtype),
-    )
-
-
-def tir_schedule_quant(
-    out_M: te.Tensor,
-    tensor_A: te.Tensor,
-    tensor_B: te.Tensor,
-    output_layout: str,
-    tensor_A_layout: str,
-    tensor_B_layout: str,
-):
-    """Schedule for output layout nhwc-8h8w32c-2d"""
-    func = te.create_prim_func([tensor_A, tensor_B, out_M])
-
-    s = tir.Schedule(func)
-
-    block = s.get_block("compute")
-
-    if tensor_A_layout == "nhwc-8h8w32c-2d":
-        tensor_A_transformed_layout = get_layout_transform_fn(tensor_A_layout)
-        s.transform_layout(block, buffer=tensor_A.name, index_map=tensor_A_transformed_layout)
-
-    if tensor_B_layout == "nhwc-8h8w32c-2d":
-        tensor_B_transformed_layout = get_layout_transform_fn(tensor_B_layout)
-        s.transform_layout(block, buffer=tensor_B.name, index_map=tensor_B_transformed_layout)
-
-    output_transformed_layout = get_layout_transform_fn(output_layout)
-    s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout)
-
-    n, h, w, c = s.get_loops(block)
-
-    h_o, h_i = s.split(h, [None, 8])
-    w_o, w_i = s.split(w, [None, 8])
-    c_o, c_i = s.split(c, [None, 32])
-    wio, wii = s.split(w_i, [None, 4])
-
-    s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i)
-
-    return s
diff --git a/python/tvm/topi/hexagon/qnn/qdense.py b/python/tvm/topi/hexagon/qnn/qdense.py
deleted file mode 100644
index 53f9077e56ba..000000000000
--- a/python/tvm/topi/hexagon/qnn/qdense.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for dense operator"""
-
-from tvm import te, tir
-from tvm.topi import tag
-from ..utils import get_layout_transform_fn
-
-
-def qdense_compute(
-    tensor_a,
-    tensor_b,
-    zero_a,
-    scale_a,
-    zero_b,
-    scale_b,
-    zero_out=None,
-    scale_out=None,
-    bias=None,
-    q_dtype=None,
-):
-    """Hexagon's implementation of a sliced dense operator in Topi.
-    Uses matmul.
-
-    Parameters
-    ----------
-    tensor_a : tvm.te.Tensor
-        data 2-D with shape [batch, in_dim]
-
-    tensor_b : tvm.te.Tensor
-        weight 2-D with shape [in_dim, out_dim]
-
-    zero_a : integer
-        quantization zero point for tensor a.
-
-    scale_a : float
-        quantization scale for tensor a.
-
-    zero_b : integer
-        quantization zero point for tensor b.
-
-    scale_b : float
-        quantization scale for tensor b.
-
-    zero_out : Optional[integer]
-        quantization zero point for output.
-
-    scale_out : Optional[float]
-        quantization scale for output.
-
-    bias : Optional[tvm.te.Tensor]
-        1-D with shape [out_dim]
-
-    q_dtype : Optional[str]
-        The output type.
-
-    Returns
-    -------
-    mat : tvm.te.Tensor
-        2-D with shape [batch, out_dim]
-
-    """
-    if bias is not None:
-        assert len(bias.shape) == 1
-    if q_dtype is None:
-        q_dtype = tensor_a.dtype
-
-    batch, in_dim = tensor_a.shape
-    out_dim, red_dim = tensor_b.shape
-
-    # cmp should be done by values
-    assert int(in_dim) == int(red_dim)
-
-    k = te.reduce_axis((0, in_dim), name="k")
-    compute_lambda = lambda n, m: te.sum(
-        scale_a
-        * (tensor_a[n, k].astype("float32") - zero_a)
-        * scale_b
-        * (tensor_b[k, m].astype("float32") - zero_b),
-        axis=k,
-    )
-    compute_name = "qmatmul_sliced"
-
-    out = te.compute(
-        (batch, out_dim),
-        compute_lambda,
-        name=compute_name,
-        attrs={"layout_free_placeholders": [tensor_b]},
-    )
-
-    if bias is not None:
-        out = te.compute(
-            (batch, out_dim),
-            lambda i, j: out[i, j] + bias[j],
-            tag=tag.BROADCAST,
-            name="bias",
-        )
-
-    # Requantization of dense
-    if scale_out is not None:
-        out = te.compute(
-            (batch, out_dim),
-            lambda *i: (out[i] / scale_out + zero_out).astype(q_dtype),
-            name="requantize",
-        )
-
-    return out
-
-
-def qdense_schedule(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for dense op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of dense in the format
-        of an array of tensors.
-
-    ins: Array of Tensor
-        Input tensors into graph.
-
-    output_layout: str
-        Descriptor string for physical layout
-
-    input_layout: str
-        Descriptor string for physical layout
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    if not isinstance(ins, list):
-        ins = [ins]
-    if not isinstance(outs, list):
-        outs = [outs]
-
-    func = te.create_prim_func([*ins, *outs])
-    s = tir.Schedule(func)
-
-    matmul = s.get_block("qmatmul_sliced")
-    try:
-        requantize = s.get_block("requantize")
-    except tir.schedule.schedule.ScheduleError:
-        requantize = None
-    try:
-        bias = s.get_block("bias")
-    except tir.schedule.schedule.ScheduleError:
-        bias = None
-
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-
-    # Transform input and output buffer
-    s.transform_layout(matmul, ("read", 0), input_transform_fn)
-    if requantize is not None:
-        s.transform_layout(requantize, ("write", 0), output_transform_fn)
-    elif bias is not None:
-        s.transform_layout(bias, ("write", 0), output_transform_fn)
-    else:
-        s.transform_layout(matmul, ("write", 0), output_transform_fn)
-
-    # Vectorize
-    _, matmul_c, _ = s.get_loops(matmul)
-    _, matmul_c_inner = s.split(matmul_c, [None, 128])
-    s.vectorize(matmul_c_inner)
-
-    # Compute everything inline
-    if bias is not None and requantize is not None:
-        _, bias_c = s.get_loops(bias)
-        s.compute_at(matmul, bias_c)
-        _, out_c = s.get_loops(requantize)
-        s.compute_at(bias, out_c)
-    elif bias is not None and requantize is None:
-        _, out_c = s.get_loops(bias)
-        s.compute_at(matmul, out_c)
-
-    return s
diff --git a/python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py b/python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py
deleted file mode 100644
index 9a275c1cc370..000000000000
--- a/python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
-"""
-Please note the following assumptions made by the implementation:
-1) The input must be padded in advance to account for 'padding'. In addition,
-   both input and output must be padded as per the physical buffer layout.
-2) 'padding' is ignored. It must be handled outside of the sliced op.
-3) The weights are expected to be as per physical layout
-
-The initial compute for quantized depthwise conv2d is as follows
-where cm = channel_multiplier; assumed to be 1,
-zp_a = Activation_zero_point,
-zp_w = Weight_zero_point,
-Qa = Quantized Activation,
-Qw = Quantized Weights.
-
-     a) Qc(n, oh, ow, oc) = (Sigma(r, s) (Qw(r, s, oc%cm, oc/cm) - zp_w)
-                                      * (Qa(n, oh + r, ow + s, oc/cm) - zp_a))
-                                      * scale_value
-        where scale_value = (activation_scale * weight_scale) / output_scale
-
-        This can be written as
-
-     b) Qc(n, oh, ow, oc) = (t1 - t2 - t3 + t4) * scale_value
-
-        where t1 = Sigma(r, s) Qw(r, s, oc%cm, oc/cm) * Qa(n, oh + r, ow + s, oc/cm)
-              t2 = Sigma(r, s) zp_w * Qa(n, oh + r, ow + s, oc/cm)
-              t3 = Sigma(r, s) zp_a * Qw(r, s, oc%cm, oc/cm)
-              t4 = Sigma(r, s) zp_a * zp_w
-
-     c) Qc(n, oh, ow, oc) = saturate(((t1 - t2 - t3 + t4) * fixed_scale_value)) >> rsh)
-
-        where fixed_scale_value, rsh are fixed point values for scale_value.
-
-
-Compute and schedule for quantized depthwise conv2d slice op"""
-
-import typing
-import tvm
-from tvm import te
-from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate
-
-
-def qdepthwise_conv2d_compute(
-    activations: te.Tensor,
-    weights: te.Tensor,
-    out_shape: typing.Tuple,
-    stride: typing.Tuple,
-    dilation: typing.Tuple,
-    dtype: str,
-    # quantization params:
-    activation_zero_point,
-    activation_scale,
-    weight_zero_point,
-    weight_scale,
-    output_zero_point,
-    output_scale,
-):
-    """Compute for quantized depthwise conv2d"""
-    filt_shape = weights.shape
-    ob, oh, ow, oc = out_shape
-
-    if dtype == "uint8":
-        temp_dtype = "int32"
-        big_dtype = "int64"
-    elif dtype == "int8":
-        temp_dtype = "int32"
-        big_dtype = "int64"
-    else:
-        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
-
-    reduce_height = tvm.te.reduce_axis((0, filt_shape[0]), name="reduce_height")
-    reduce_width = tvm.te.reduce_axis((0, filt_shape[1]), name="reduce_width")
-    stride_height, stride_width = stride
-    dilation_height, dilation_width = dilation
-
-    scale_value = (activation_scale * weight_scale) / output_scale
-    fixed_scale_value, rsh = get_fixed_point_value(scale_value, "int16")
-
-    t1 = tvm.te.compute(
-        out_shape,
-        lambda n, h, w, c: tvm.te.sum(
-            (
-                (
-                    activations[
-                        n,
-                        h * stride_height + reduce_height * dilation_height,
-                        w * stride_width + reduce_width * dilation_width,
-                        c,
-                    ].astype(temp_dtype)
-                )
-                * (weights[reduce_height, reduce_width, 0, c].astype(temp_dtype))
-            ).astype(temp_dtype),
-            axis=[reduce_height, reduce_width],
-        ),
-        name="t1",
-    )
-
-    t2 = tvm.te.compute(
-        out_shape,
-        lambda n, h, w, c: tvm.te.sum(
-            (
-                (
-                    activations[
-                        n,
-                        h * stride_height + reduce_height * dilation_height,
-                        w * stride_width + reduce_width * dilation_width,
-                        c,
-                    ].astype(temp_dtype)
-                )
-                * weight_zero_point
-            ).astype(temp_dtype),
-            axis=[reduce_height, reduce_width],
-        ),
-        name="t2",
-    )
-
-    t3 = tvm.te.compute(
-        (oc,),
-        lambda c: tvm.te.sum(
-            (
-                ((weights[reduce_height, reduce_width, 0, c].astype(temp_dtype)))
-                * activation_zero_point
-            ).astype(temp_dtype),
-            axis=[reduce_height, reduce_width],
-        ),
-        name="t3",
-    )
-
-    t4 = activation_zero_point * weight_zero_point * reduce_height * reduce_width
-
-    output = tvm.te.compute(
-        out_shape,
-        lambda n, h, w, c: saturate(
-            (
-                (
-                    (
-                        ((t1[n, h, w, c]).astype(big_dtype) - t2[n, h, w, c] - t3[c] + t4)
-                        * fixed_scale_value
-                    )
-                    >> rsh
-                )
-                + (output_zero_point).astype(big_dtype)
-            ),
-            dtype,
-        ).astype(dtype),
-        name="output",
-    )
-
-    return output
-
-
-def qdepthwise_conv2d_schedule(
-    outs: te.Tensor,
-    ins: typing.List[te.Tensor],
-    transform_activation_layout: str,
-    transform_weights: str,
-):
-    """
-    Schedule for quantized depthwise conv2d for input layout nhwc-8h8w32c
-    assert len(ins) == 2, "This schedule expects only 2 inputs - Activations and Weights
-    """
-    source_expr = ins + [outs]
-    prim_func = tvm.te.create_prim_func(source_expr)
-    sch = tvm.tir.Schedule(prim_func)
-
-    compute = sch.get_block("output")
-    compute1 = sch.get_block("t1")
-
-    transform_layout_fn = get_layout_transform_fn(transform_activation_layout)
-    transform_layout_weights = get_layout_transform_fn(transform_weights)
-
-    # Apply layout_transform for activation
-    sch.transform_layout(compute1, ins[0].name, transform_layout_fn)
-
-    # Apply layout_transform for weights
-    sch.transform_layout(compute1, ins[1].name, transform_layout_weights)
-
-    # Apply layout_transform for output
-    sch.transform_layout(compute, outs.name, transform_layout_fn)
-
-    # This returns the original 6d loop
-    batch, height, width, channel, reduce_height, reduce_width = sch.get_loops(compute1)
-    h_outer, h_inner = sch.split(height, [None, 8])
-    w_outer, w_inner = sch.split(width, [None, 8])
-    c_outer, c_inner = sch.split(channel, [None, 32])
-    sch.reorder(
-        batch,
-        h_outer,
-        w_outer,
-        c_outer,
-        h_inner,
-        reduce_height,
-        reduce_width,
-        w_inner,
-        c_inner,
-    )
-
-    sch.decompose_reduction(compute1, reduce_height)
-    # wi_ci = sch.fuse(w_inner,c_inner)
-    # sch.vectorize(wi_ci)
-    return sch
diff --git a/python/tvm/topi/hexagon/qnn/quantize.py b/python/tvm/topi/hexagon/qnn/quantize.py
deleted file mode 100644
index 3fd91ddce6ca..000000000000
--- a/python/tvm/topi/hexagon/qnn/quantize.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Compute and schedule for hexagon quantize
-Please note the following assumptions made by the implementation:
-1) The input and output data will be multiple of crouton layout
-2) And the supported layout is NHWC
-3) The input layout will be nhwc-4h2w32c2w-2d and
-   output layout will be nhwc-8h8w32c-2d"""
-
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn, saturate
-
-
-def quantize_compute(tensor_A: te.Tensor, scale: float, zero_point: int, dtype: str):
-    """Compute for quantize"""
-    scale_recip = 1 / scale
-
-    return te.compute(
-        tensor_A.shape,
-        lambda n, h, w, c: saturate(
-            ((tensor_A[n, h, w, c] * scale_recip).astype("int32") + zero_point),
-            dtype,
-        ).astype(dtype),
-        name="quantize",
-    )
-
-
-def tir_quantize_schedule(
-    out_M: te.Tensor,
-    tensor_A: te.Tensor,
-    input_layout: str,
-    output_layout: str,
-):
-    """Schedule for output layout nhwc-8h8w32c-2d"""
-    func = te.create_prim_func([tensor_A, out_M])
-
-    s = tir.Schedule(func)
-
-    block = s.get_block("quantize")
-
-    input_transformed_layout = get_layout_transform_fn(input_layout)
-    s.transform_layout(block, buffer=tensor_A.name, index_map=input_transformed_layout)
-
-    output_transformed_layout = get_layout_transform_fn(output_layout)
-    s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout)
-
-    # Fixed chunk size is 2048 byte
-    # For uint8 the layout for fixed chunk is 8x8x32
-    # where each element is 1 bytes
-    # Split and reorder is done to iterate over the fixed chunk
-    # Channel is split by a factor of 32
-    # Width is split by a factor of 8
-    # Height is split by a factor of 8
-    n, h, w, c = s.get_loops(block)
-
-    h_o, h_i = s.split(h, [None, 8])
-    w_o, w_i = s.split(w, [None, 8])
-    c_o, c_i = s.split(c, [None, 32])
-    wio, wii = s.split(w_i, [None, 4])
-
-    s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i)
-
-    return s
diff --git a/python/tvm/topi/hexagon/reduce.py b/python/tvm/topi/hexagon/reduce.py
deleted file mode 100644
index ea10cd492a7b..000000000000
--- a/python/tvm/topi/hexagon/reduce.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for composition of reduction operator"""
-
-import tvm
-
-
-def schedule_reduce(outs):
-    """Schedule for reduction op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of reduction in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
-    s = tvm.te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    return s
diff --git a/python/tvm/topi/hexagon/resize2d.py b/python/tvm/topi/hexagon/resize2d.py
deleted file mode 100644
index 6e6c0e471db0..000000000000
--- a/python/tvm/topi/hexagon/resize2d.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""Compute and schedule for resize2d
-Please note the following assumptions made by the implementation:
-1) The input and output data will be multiple of crouton layout
-2) And the supported layout is NHWC"""
-
-from tvm import te
-from tvm import tir
-from tvm import topi
-from .utils import get_layout_transform_fn
-
-
-def resize2d_compute(
-    data,
-    roi,
-    size,
-    layout,
-    method="linear",
-    coordinate_transformation_mode="half_pixel",
-    rounding_method="",
-    bicubic_alpha=-0.5,
-    bicubic_exclude=0,
-    extrapolation_value=0.0,
-    out_dtype=None,
-    output_shape=None,
-):
-    """Call resize2d op from topi.image"""
-    return topi.image.resize2d(
-        data,
-        roi,
-        size,
-        layout,
-        method,
-        coordinate_transformation_mode,
-        rounding_method,
-        bicubic_alpha,
-        bicubic_exclude,
-        extrapolation_value,
-        out_dtype,
-        output_shape,
-    )
-
-
-def tir_resize2d_schedule(
-    out_m,
-    input_a,
-    input_layout: str,
-    output_layout: str,
-):
-    """Schedule for input and output layout nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d"""
-    func = te.create_prim_func([input_a, out_m])
-
-    s = tir.Schedule(func)
-
-    block = s.get_block("resize")
-
-    if input_layout in (
-        "nhwc-8h2w32c2w-2d",
-        "nhwc-8h8w32c-2d",
-    ):
-        input_transformed_layout = get_layout_transform_fn(input_layout)
-        s.transform_layout(block, buffer=("read", 0), index_map=input_transformed_layout)
-
-    output_transformed_layout = get_layout_transform_fn(output_layout)
-    s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout)
-
-    if output_layout == "nhwc-8h2w32c2w-2d":
-        # Fixed chunk size is 2048 byte
-        # For fp16 the layout for fixed chunk is 8x4x32
-        # where each element is 2 bytes
-        # Split and reorder is done to iterate over the fixed chunk
-        # Channel is split by a factor of 32
-        # Width is split by a factor of 4
-        # Height is split by a factor of 8
-        n, h, w, c = s.get_loops(block)
-
-        ho, hi = s.split(h, [None, 8])
-        wo, wi = s.split(w, [None, 4])
-        co, ci = s.split(c, [None, 32])
-
-        s.reorder(n, ho, wo, co, hi, wi, ci)
-
-    elif output_layout == "nhwc-8h8w32c-2d":
-        # Fixed chunk size is 2048 byte
-        # For uint8 the layout for fixed chunk is 8x8x32
-        # where each element is 1 bytes
-        # Split and reorder is done to iterate over the fixed chunk
-        # Channel is split by a factor of 32
-        # Width is split by a factor of 8
-        # Height is split by a factor of 8
-        n, h, w, c = s.get_loops(block)
-
-        ho, hi = s.split(h, [None, 8])
-        wo, wi = s.split(w, [None, 8])
-        co, ci = s.split(c, [None, 32])
-
-        s.reorder(n, ho, wo, co, hi, wi, ci)
-
-    return s
diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py
deleted file mode 100644
index b38dd5ecb3c1..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Computes and Schedules for Hexagon slice ops. """
-
-from .avg_pool2d import avg_pool2d_NHWC, avg_pool2d_NCHW, avg_pool2d_schedule
-from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule
-from .add_subtract_multiply import *
-from .argmax import argmax_compute, argmax_schedule
-from .batch_flatten import batch_flatten_compute, batch_flatten_stir_schedule
-from .softmax_slice import *
-from .clip import *
-from .cast import (
-    cast_f16_f32_compute,
-    cast_f16_f32_schedule,
-    cast_f32_f16_compute,
-    cast_f32_f16_schedule,
-)
-from .conv2d import *
-from .reshape import reshape_compute, reshape_stir_schedule
-from .relu import relu_compute, relu_stir_schedule
-from .tanh import tanh_te_compute, tanhf16_schedule
-from .dwconv2d import *
-from .depth_to_space import d2s_compute, d2s_schedule
-from .global_avg_pool2d import *
-from .dense import *
diff --git a/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py b/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py
deleted file mode 100644
index 0596f79b66a8..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""Compute and schedule for add, multiply, subtract slice op
-
-Please note the following assumptions made by the implementation:
-
-1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting."""
-
-from tvm import te
-from tvm import tir
-from tvm import topi
-from ..utils import get_layout_transform_fn
-
-
-def add_broadcast_compute(input_a, input_b):
-    """Call the add op from topi"""
-    return topi.add(input_a, input_b)
-
-
-def subtract_broadcast_compute(input_a, input_b):
-    """Call the subtract op from topi"""
-    return topi.subtract(input_a, input_b)
-
-
-def multiply_broadcast_compute(input_a, input_b):
-    """Call the multiply op from topi"""
-    return topi.multiply(input_a, input_b)
-
-
-def tir_broadcast_schedule(
-    out_m,
-    input_a,
-    input_b,
-    output_layout: str,
-    input_a_layout: str,
-    input_b_layout: str,
-    op_name: str,
-):
-    """Schedule for input and output layout nhwc-8h2w32c2w-2d considering broadcast"""
-    func = te.create_prim_func([input_a, input_b, out_m])
-
-    s = tir.Schedule(func)
-
-    block_dict = {"add": "T_add", "subtract": "T_subtract", "multiply": "T_multiply"}
-
-    block = s.get_block(block_dict[op_name])
-
-    if input_a_layout == "nhwc-8h2w32c2w-2d":
-        input_a_transformed_layout = get_layout_transform_fn(input_a_layout)
-        s.transform_layout(block, buffer=("read", 0), index_map=input_a_transformed_layout)
-
-    if input_b_layout == "nhwc-8h2w32c2w-2d":
-        input_b_transformed_layout = get_layout_transform_fn(input_b_layout)
-        s.transform_layout(block, buffer=("read", 1), index_map=input_b_transformed_layout)
-
-    output_transformed_layout = get_layout_transform_fn(output_layout)
-    s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout)
-
-    n, h, w, c = s.get_loops(block)
-
-    h_o, h_i = s.split(h, [None, 8])
-    w_o, w_i = s.split(w, [None, 4])
-    c_o, c_i = s.split(c, [None, 32])
-    wio, wii = s.split(w_i, [None, 2])
-
-    s.reorder(n, h_o, w_o, c_o, h_i, wio, c_i, wii)
-
-    fused = s.fuse(c_i, wii)
-    s.vectorize(fused)
-
-    return s
diff --git a/python/tvm/topi/hexagon/slice_ops/argmax.py b/python/tvm/topi/hexagon/slice_ops/argmax.py
deleted file mode 100644
index a3a0ea37c37c..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/argmax.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Hexagon slice argmax compute and schedule"""
-
-from tvm import tir
-from tvm import topi
-from ..utils import get_layout_transform_fn
-
-
-def argmax_compute(in_tensor, axis):
-    out_tensor = topi.argmax(in_tensor, axis)
-    return out_tensor
-
-
-def argmax_stir_schedule_nhwc(func, in_layout, out_layout):
-    """Schedule for nhwc argmax"""
-    sch = tir.Schedule(func, debug_mask="all")
-    sch.transform_layout("A_red_temp", "A", in_layout)
-    sch.transform_layout("A_red", "A_red", out_layout)
-    return sch
-
-
-def argmax_schedule(argmax_func, in_layout_str, out_layout_str):
-    """Schedule for argmax: top level function"""
-    if (in_layout_str == "nhwc-8h2w32c2w-2d") and (out_layout_str == "nhw-32h16w-2d"):
-        fp16_layout_transform = get_layout_transform_fn(in_layout_str)
-        int32_layout_transform = get_layout_transform_fn(out_layout_str)
-        tir_s = argmax_stir_schedule_nhwc(
-            argmax_func, fp16_layout_transform, int32_layout_transform
-        )
-        return tir_s
-    if (in_layout_str == "nhwc-8h8w32c-2d") and (out_layout_str == "nhw-32h16w-2d"):
-        int8_layout_transform = get_layout_transform_fn(in_layout_str)
-        int32_layout_transform = get_layout_transform_fn(out_layout_str)
-        tir_s = argmax_stir_schedule_nhwc(
-            argmax_func, int8_layout_transform, int32_layout_transform
-        )
-        return tir_s
-    raise RuntimeError(f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'")
diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py
deleted file mode 100644
index 0c7b00e287c3..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals, pointless-exception-statement
-
-""" Compute and schedule for avg_pool2d slice op """
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn
-from ...utils import get_const_tuple
-from ...nn.utils import get_pad_tuple
-from ...nn.pad import pad
-from ..compute_poolarea import compute_PoolArea
-
-
-def avg_pool2d_NCHW(
-    data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16"
-):
-    """avg_pool2d compute"""
-    if odtype != "float16":
-        raise RuntimeError(f"Unsupported output dtype '{odtype}'")
-    kh, kw = kernel
-    rh = te.reduce_axis((0, kh), name="rh")
-    rw = te.reduce_axis((0, kw), name="rw")
-    sh, sw = stride
-    dh, dw = dilation
-
-    dilated_kh = (kh - 1) * dh + 1
-    dilated_kw = (kw - 1) * dw + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        get_const_tuple(padding), (dilated_kh, dilated_kw)
-    )
-
-    # DOPAD
-
-    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
-        pad_before = (0, 0, pad_top, pad_left)
-        pad_after = (0, 0, pad_down, pad_right)
-        data_pad = pad(data, pad_before, pad_after, name="data_pad")
-    else:
-        # By definition when True, zero-padding will be included in the averaging calculation
-        # This is equivalent to PoolArea = (kh * kw)
-        count_include_pad = True
-        data_pad = data
-
-    Sum = te.compute(
-        oshape,
-        lambda b, c, h, w: te.sum(
-            data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype("float32"), axis=[rh, rw]
-        ),
-        name="pool_sum",
-    )
-
-    if not count_include_pad:
-        # Compute PoolArea using unpadded input tensor
-        _, _, oh, ow = oshape
-        _, _, ih, iw = data.shape
-
-        PoolArea = te.compute(
-            (oh, ow),
-            lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left),
-            name="pool_area",
-        )
-
-        InvArea = te.compute(
-            (oh, ow),
-            lambda i, j: tir.if_then_else(
-                tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0
-            ),
-            name="inverse_area",
-        )
-
-        Avg = te.compute(
-            oshape,
-            lambda b, c, h, w: (Sum[b, c, h, w] * InvArea[h, w]).astype(odtype),
-            name="pool_avg",
-        )
-    else:
-        InvArea = float(1) / (kh * kw)
-        Avg = te.compute(
-            oshape, lambda b, c, h, w: (Sum[b, c, h, w] * InvArea).astype(odtype), name="pool_avg"
-        )
-
-    return Avg
-
-
-def avg_pool2d_NHWC(
-    data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16"
-):
-    """avg_pool2d compute"""
-    if odtype != "float16":
-        raise RuntimeError(f"Unsupported output dtype '{odtype}'")
-    kh, kw = kernel
-    rh = te.reduce_axis((0, kh), name="rh")
-    rw = te.reduce_axis((0, kw), name="rw")
-
-    sh, sw = stride
-    dh, dw = dilation
-    InvArea = float(1) / (kh * kw)
-
-    dilated_kh = (kh - 1) * dh + 1
-    dilated_kw = (kw - 1) * dw + 1
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        get_const_tuple(padding), (dilated_kh, dilated_kw)
-    )
-
-    # DOPAD
-    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
-        pad_before = (0, pad_top, pad_left, 0)
-        pad_after = (0, pad_down, pad_right, 0)
-        data_pad = pad(data, pad_before, pad_after, name="data_pad")
-    else:
-        # By definition when True, zero-padding will be included in the averaging calculation
-        # This is equivalent to PoolArea = (kh * kw)
-        count_include_pad = True
-        data_pad = data
-
-    Sum = te.compute(
-        oshape,
-        lambda b, h, w, c: te.sum(
-            data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw]
-        ),
-        name="pool_sum",
-    )
-
-    if not count_include_pad:
-        # Compute PoolArea using unpadded input tensor
-        _, oh, ow, _ = oshape
-        _, ih, iw, _ = data.shape
-
-        PoolArea = te.compute(
-            (oh, ow),
-            lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left),
-            name="pool_area",
-        )
-
-        InvArea = te.compute(
-            (oh, ow),
-            lambda i, j: tir.if_then_else(
-                tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0
-            ),
-            name="inverse_area",
-        )
-
-        Avg = te.compute(
-            oshape,
-            lambda b, h, w, c: (Sum[b, h, w, c] * InvArea[h, w]).astype(odtype),
-            name="pool_avg",
-        )
-    else:
-        InvArea = float(1) / (kh * kw)
-        Avg = te.compute(
-            oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(odtype), name="pool_avg"
-        )
-
-    return Avg
-
-
-def schedule_8h2w32c2w(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for input and output layout 8h2w32c2w"""
-    func = te.create_prim_func([ins, outs])
-    print(func)
-    s = tir.Schedule(func)
-    Sum = s.get_block("pool_sum")
-    Avg = s.get_block("pool_avg")
-
-    mem_scope = "global.vtcm"
-    sum_read = s.cache_read(Sum, 0, mem_scope)
-    avg_write = s.cache_write(Avg, 0, mem_scope)
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-    s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0)
-    s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0)
-    return s
-
-
-def schedule_1024c(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for output layout: 1024c, input layout: 8h2w32c2w"""
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-    Sum = s.get_block("pool_sum")
-    Avg = s.get_block("pool_avg")
-
-    mem_scope = "global.vtcm"
-    sum_read = s.cache_read(Sum, 0, mem_scope)
-    avg_write = s.cache_write(Avg, 0, mem_scope)
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-    s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0)
-    s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0)
-
-    # Schedule 'Avg'
-    if output_layout == "n11c-1024c-2d":
-        n, h, w, c = s.get_loops(Avg)
-    else:
-        n, c, h, w = s.get_loops(Avg)
-    _, ci = s.split(c, [None, 1024])
-    cio, cii = s.split(ci, [None, 64])
-    s.vectorize(cii)
-
-    # Schedule 'Sum'
-    Sum_axis = s.get_loops(Sum)
-    s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3])
-    return s
-
-
-def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str):
-    """avg_pool2d schedule"""
-    if output_layout == "nhwc-8h2w32c2w-2d" or output_layout == "nchw-8h2w32c2w-2d":
-        return schedule_8h2w32c2w(outs, ins, output_layout, input_layout)
-    if output_layout == "n11c-1024c-2d" or output_layout == "nc11-1024c-2d":
-        return schedule_1024c(outs, ins, output_layout, input_layout)
-    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/slice_ops/batch_flatten.py b/python/tvm/topi/hexagon/slice_ops/batch_flatten.py
deleted file mode 100644
index 6dc0914e91b4..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/batch_flatten.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Hexagon slice batch flatten compute and schedule"""
-from tvm import te, tir, topi
-from ..utils import get_layout_transform_fn
-
-
-def batch_flatten_compute(inp: te.Tensor) -> te.Tensor:
-    """Compute for slice batch flatten op for hexagon.
-    This op makes the following assumptions:
-    1. This op is written for a sliced batch flatten operation.
-    2. The input is assumed to be in NHWC layout.
-
-    Parameters
-    ----------
-    Input : te.Tensor
-        Input activations padded for inner dimension size
-    Returns
-    -------
-    Output : te.Tensor
-        Output of applying batch flatten operation on input
-    """
-    return topi.nn.flatten(inp)
-
-
-def batch_flatten_stir_schedule(
-    out: te.Tensor,
-    inp: te.Tensor,
-    out_layout: str,
-    in_layout: str,
-) -> tir.Schedule:
-    """STIR schedule definition for the compute of batch flatten compute.
-    Parameters
-    ----------
-    outputs : te.Tensor
-        The output tensor as returned by a call to batch_flatten_compute
-    input : te.Tensor
-        Input tensor to batch_flatten
-    out_layout: typing.Callable
-        The transformation function definition for the expected output layout
-    in_layout: typing.Callable
-        The transformation function definition for the input layout
-    Returns
-    -------
-    sch : tvm.tir.Schedule
-        The STIR schedule for slice batch flatten compute
-    """
-
-    batch_flatten_func = te.create_prim_func([inp, out])
-    sch = tir.Schedule(batch_flatten_func, debug_mask="all")
-    compute = sch.get_block("compute")
-
-    sch.transform_layout(compute, inp.name, get_layout_transform_fn(in_layout))
-    sch.transform_layout(compute, out.name, get_layout_transform_fn(out_layout))
-    i, j = sch.get_loops(compute)
-    jout, channel = sch.split(j, [None, inp.shape[3]])
-    height, width = sch.split(jout, [inp.shape[1], inp.shape[2]])
-    channelo, channeli = sch.split(channel, [None, 1024])
-    channelio, channelii = sch.split(channeli, [None, 64])
-    sch.reorder(i, height, width, channelo, channelio, channelii)
-    sch.vectorize(channelii)
-    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/cast.py b/python/tvm/topi/hexagon/slice_ops/cast.py
deleted file mode 100644
index ac2e4c32e3e0..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/cast.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Hexagon slice cast op compute and schedule"""
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn
-
-
-def get_layout_transform_for_f32(f32_layout_string):
-    """
-    Given f32 layout string, return transform_layout function and
-    channel/height split factor to be used for scheduling
-    """
-    layout_transform_fn = get_layout_transform_fn(f32_layout_string)
-    if f32_layout_string == "nhwc-8h2w32c2w-2d":
-        return [layout_transform_fn, 8]
-    if f32_layout_string == "nhwc-4h2w32c2w-2d":
-        return [layout_transform_fn, 4]
-    if f32_layout_string == "nc-1024c-2d":
-        return [layout_transform_fn, 1024]
-    if f32_layout_string == "nc-512c-2d":
-        return [layout_transform_fn, 512]
-    raise RuntimeError(f"Unexpected f32_layout '{f32_layout_string}'")
-
-
-def cast_f16_f32_compute(in_tensor):
-    out_tensor = te.compute(
-        in_tensor.shape, lambda *indices: in_tensor[indices].astype("float32"), name="CastF16F32"
-    )
-    return out_tensor
-
-
-def cast_f16_f32_stir_schedule_nhwc(func, in_layout, out_layout, h_split_factor):
-    """Schedule for nhwc f16 to f32 cast: nhwc layout"""
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "CastF16F32"
-    n_orig, h_orig, w_orig, c_orig = sch.get_loops(sch.get_block(block_name))
-    h_outer, h_inner = sch.split(h_orig, [None, h_split_factor])
-    w_outer, w_inner = sch.split(w_orig, [None, 4])
-    c_outer, c_inner = sch.split(c_orig, [None, 32])
-    w_inner_o, w_inner_i = sch.split(w_inner, [None, 2])
-    sch.reorder(n_orig, h_outer, w_outer, c_outer, h_inner, w_inner_o, c_inner, w_inner_i)
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    fused = sch.fuse(c_inner, w_inner_i)
-    sch.vectorize(fused)
-    return sch
-
-
-def cast_f16_f32_stir_schedule_nc(func, in_layout, out_layout, c_split_factor):
-    """Schedule for nc f16 to f32 cast: nc layout"""
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "CastF16F32"
-    _, c_orig = sch.get_loops(sch.get_block(block_name))
-    _, c_inner = sch.split(c_orig, [None, c_split_factor])
-    _, c_inner_inner = sch.split(c_inner, [None, 64])
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    sch.vectorize(c_inner_inner)
-    return sch
-
-
-def cast_f16_f32_schedule(cast_func, in_layout_str, out_layout_str):
-    """Schedule for f16 to f32 cast: top level function"""
-    f32_layout_transform_func, split_factor = get_layout_transform_for_f32(out_layout_str)
-    f16_layout_transform_func = get_layout_transform_fn(in_layout_str)
-    if in_layout_str == "nhwc-8h2w32c2w-2d":
-        return cast_f16_f32_stir_schedule_nhwc(
-            cast_func,
-            f16_layout_transform_func,
-            f32_layout_transform_func,
-            split_factor,
-        )
-    if in_layout_str == "nc-1024c-2d":
-        return cast_f16_f32_stir_schedule_nc(
-            cast_func, f16_layout_transform_func, f32_layout_transform_func, split_factor
-        )
-    raise RuntimeError(f"Unexpected input_layout, output_layout '{input_layout, output_layout}'")
-
-
-def cast_f32_f16_compute(in_tensor):
-    out_tensor = te.compute(
-        in_tensor.shape, lambda *indices: in_tensor[indices].astype("float16"), name="CastF32F16"
-    )
-    return out_tensor
-
-
-def cast_f32_f16_stir_schedule_nhwc(func, in_layout, out_layout, h_split_factor):
-    """Schedule for nhwc f32 to f16 cast: nhwc layout"""
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "CastF32F16"
-    n_orig, h_orig, w_orig, c_orig = sch.get_loops(sch.get_block(block_name))
-    h_outer, h_inner = sch.split(h_orig, [None, h_split_factor])
-    w_outer, w_inner = sch.split(w_orig, [None, 4])
-    c_outer, c_inner = sch.split(c_orig, [None, 32])
-    w_inner_o, w_inner_i = sch.split(w_inner, [None, 2])
-    sch.reorder(n_orig, h_outer, w_outer, c_outer, h_inner, w_inner_o, c_inner, w_inner_i)
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    fused = sch.fuse(c_inner, w_inner_i)
-    sch.vectorize(fused)
-    return sch
-
-
-def cast_f32_f16_stir_schedule_nc(func, in_layout, out_layout, c_split_factor):
-    """Schedule for nc f32 to f16 cast: nc layout"""
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "CastF32F16"
-    _, c_orig = sch.get_loops(sch.get_block(block_name))
-    _, c_inner = sch.split(c_orig, [None, c_split_factor])
-    _, c_inner_inner = sch.split(c_inner, [None, 64])
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    sch.vectorize(c_inner_inner)
-    return sch
-
-
-def cast_f32_f16_schedule(cast_func, in_layout_str, out_layout_str):
-    """Schedule for f32 to f16 cast: top level function"""
-    f32_layout_transform_func, split_factor = get_layout_transform_for_f32(in_layout_str)
-    f16_layout_transform_func = get_layout_transform_fn(out_layout_str)
-    if out_layout_str == "nhwc-8h2w32c2w-2d":
-        return cast_f32_f16_stir_schedule_nhwc(
-            cast_func, f32_layout_transform_func, f16_layout_transform_func, split_factor
-        )
-    if out_layout_str == "nc-1024c-2d":
-        return cast_f32_f16_stir_schedule_nc(
-            cast_func, f32_layout_transform_func, f16_layout_transform_func, split_factor
-        )
-    raise RuntimeError(f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'")
diff --git a/python/tvm/topi/hexagon/slice_ops/clip.py b/python/tvm/topi/hexagon/slice_ops/clip.py
deleted file mode 100644
index 2beb2df643bb..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/clip.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name
-
-"""
-Clip the elements in `A` between `A_min` and `A_max`.
-"""
-
-from tvm import te, tir, topi
-from ..utils import get_layout_transform_fn
-
-
-def clip_compute(A, A_min, A_max):
-    """
-    Use topi clip implementation
-    """
-    return topi.clip(A, A_min, A_max)
-
-
-def clip_schedule(outs, ins, output_layout: str, input_layout: str):
-    """
-    Hexagon clip schedule
-    """
-    A = ins
-    M = outs
-
-    func = te.create_prim_func([A, M])
-
-    s = tir.Schedule(func)
-
-    block = s.get_block("compute")
-
-    input_transformed_layout = get_layout_transform_fn(input_layout)
-    s.transform_layout(block, buffer=("read", 0), index_map=input_transformed_layout)
-
-    output_transformed_layout = get_layout_transform_fn(output_layout)
-    s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout)
-
-    n, h, w, c = s.get_loops(block)
-
-    ho, hi = s.split(h, [None, 8])
-    wo, wi = s.split(w, [None, 4])
-    co, ci = s.split(c, [None, 32])
-    wio, wii = s.split(wi, [None, 2])
-
-    s.reorder(n, ho, wo, co, hi, wio, ci, wii)
-
-    fused = s.fuse(ci, wii)
-    s.vectorize(fused)
-
-    return s
diff --git a/python/tvm/topi/hexagon/slice_ops/conv2d.py b/python/tvm/topi/hexagon/slice_ops/conv2d.py
deleted file mode 100644
index ab782b5fa21a..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/conv2d.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=line-too-long
-
-"""Hexagon slice conv2d compute and schedule"""
-import typing
-
-import tvm
-from tvm import te
-
-from ..utils import get_layout_transform_fn
-
-
-def conv2d_compute(
-    activations: te.Tensor,
-    weights: te.Tensor,
-    out_shape: typing.Tuple,
-    stride: typing.Tuple,
-    dilation: typing.Tuple,
-    dtype: str,
-    output_name: str,
-    weights_width_reversed: bool = True,
-) -> te.Tensor:
-    """Compute for slice conv2d op for hexagon.
-
-    This op makes the following assumptions:
-    1. This op is written for a sliced convolution with 2d physical buffers
-    2. The input activations is assumed to be in NHWC layout and filter is in HWIO layout
-    3. Grouped convolutions are not supported. and there will be a separate compute definition for depthwise convolution
-    4. In order to get grouped convolutions, it is assumed that the op will be sliced according to the groups and multiple calls to this compute would be placed.
-
-
-    Parameters
-    ----------
-    activations : te.Tensor
-        Input activations padded for inner dimension size
-    weights : te.Tensor
-        Weights without dilation
-    out_shape : typing.Tuple
-        The logical output shape without considering input padding
-    stride : typing.Tuple
-        stride
-    dilation : typing.Tuple
-        dilation
-    dtype : str
-        dtype
-    output_name : str
-        The name to be given to output. This would become the block name for the corresponding STIR compute
-    weights_width_reversed : bool
-        The width axis of weights are expected in reverse order if weights_width_reversed is True
-
-    Returns
-    -------
-    output : te.Tensor
-        Output of applying 2D convolution of Weights on Input
-    """
-
-    filt_shape = weights.shape
-
-    reduce_channel = tvm.te.reduce_axis((0, filt_shape[2]), name="reduce_channel")
-    reduce_height = tvm.te.reduce_axis((0, filt_shape[0]), name="reduce_height")
-    reduce_width = tvm.te.reduce_axis((0, filt_shape[1]), name="reduce_width")
-    stride_height, stride_width = stride
-    dilation_height, dilation_width = dilation
-
-    if weights_width_reversed:
-        weights_width_var = filt_shape[1] - reduce_width - 1
-    else:
-        weights_width_var = reduce_width
-
-    output = tvm.te.compute(
-        out_shape,
-        lambda n, h, w, c: tvm.te.sum(
-            (
-                activations[
-                    n,
-                    h * stride_height + reduce_height * dilation_height,
-                    w * stride_width + reduce_width * dilation_width,
-                    reduce_channel,
-                ]
-                * weights[reduce_height, weights_width_var, reduce_channel, c]
-            ).astype(dtype),
-            axis=[reduce_channel, reduce_height, reduce_width],
-        ),
-        name=output_name,
-    )
-    return output
-
-
-def conv2d_te_schedule(
-    out: te.Tensor,
-    ins: typing.List[te.Tensor],
-    transform_activation_layout: str,
-    transform_weights_layout: str,
-    transform_output_layout: str,
-) -> te.Schedule:
-    """TE Schedule for the sliced conv2d op
-
-    This schedule makes the following assumptions:
-    1. There is only one output tensor
-    2. The activations and weights have specific layouts defined by the last 2 arguments
-    3. All transformation functions are expected to be a bijection for now
-
-    Parameters
-    ----------
-    out : te.Tensor
-        The output tensor returned by a call to conv2d_compute
-    ins : typing.List[te.Tensor]
-        The list of 2 Tensors which would be the input activations and weights
-    transform_activation_layout : str
-        The expected activations layout
-    transform_weights_layout : str
-        String representing the weights layout as defined in get_layout_transform_fn
-    transform_output_layout: str
-        String representing the output layout as defined in get_layout_transform_fn
-
-    Returns
-    -------
-    sch : te.Schedule
-        The TE schedule for slice conv2d
-    """
-    activations, weights = ins
-    output = out
-    sch = tvm.te.create_schedule(output.op)
-    reduce_channel, reduce_height, reduce_width = sch[output].op.reduce_axis
-    sch[activations].transform_layout(get_layout_transform_fn(transform_activation_layout))
-    sch[weights].transform_layout(get_layout_transform_fn(transform_weights_layout))
-    transformed_axis = sch[output].transform_layout(
-        get_layout_transform_fn(transform_output_layout)
-    )
-    fused_out_axis = sch[output].fuse(transformed_axis[-1], transformed_axis[-2])
-    sch[output].reorder(
-        *[*transformed_axis[:-2], reduce_height, reduce_width, reduce_channel, fused_out_axis]
-    )
-    # The below code doesn't work yet as vectorization across 2D boundary is not yet supported
-    # s[output].vectorize(fused_out_axis)
-    return sch
-
-
-def conv2d_schedule(
-    outs: te.Tensor,
-    ins: typing.List[te.Tensor],
-    transform_activation_layout: str,
-    transform_weights_layout: str,
-    transform_output_layout: str,
-    output_name: str,
-) -> tvm.tir.Schedule:
-    """STIR schedule definition for the compute defined above by conv2d_compute.
-
-    - Auto-generated prim_func before applying schedule primitives for reference
-    - The below TVMScript code is for conv2d with padded input dimensions and a stride of 1x1
-
-    # from tvm.script import tir as T
-    @T.prim_func
-    def func(InputTensor: T.Buffer((1, 24, 12, 32), "float16"), Weights: T.Buffer((3, 3, 32, 32), "float16"), compute: T.Buffer((1, 16, 8, 32), "float16")) -> None:
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        # with T.block("root")
-        for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 16, 8, 32, 32, 3, 3):
-            with T.block("compute"):
-                n, h, w, c, rc, rh, rw = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
-                T.reads(InputTensor[n, h + rh, w + rw, rc], Weights[rh, rw, rc, c])
-                T.writes(compute[n, h, w, c])
-                with T.init():
-                    compute[n, h, w, c] = T.float16(0)
-                compute[n, h, w, c] = compute[n, h, w, c] + InputTensor[n, h + rh, w + rw, rc] * Weights[rh, rw, rc, c]
-
-    Parameters
-    ----------
-    outs : te.Tensor
-        The output Tensor as returned by a call to conv2d_compute
-    ins : typing.List[te.Tensor]
-        This is a list of 2 tensors - Input activations and Weights
-    transform_activation_layout : str
-        String representing the activations layout as defined in get_layout_transform_fn
-    transform_weights_layout : str
-        String representing the weights layout as defined in get_layout_transform_fn
-    transform_output_layout: str
-        String representing the output layout as defined in get_layout_transform_fn
-    output_name : str
-        The name that was given to the output compute and which can be used to get the block name
-
-    Returns
-    -------
-    sch : tvm.tir.Schedule
-        The STIR schedule for slice conv2d compute
-    """
-
-    assert len(ins) == 2, "This schedule expects only 2 inputs - Activations and Weights"
-    source_expr = ins + [outs]
-    prim_func = tvm.te.create_prim_func(source_expr)
-    sch = tvm.tir.Schedule(prim_func)
-
-    compute = sch.get_block(output_name)
-    # Apply layout_transform for activation
-    sch.transform_layout(compute, ins[0].name, get_layout_transform_fn(transform_activation_layout))
-
-    # Apply layout_transform for weights
-    sch.transform_layout(compute, ins[1].name, get_layout_transform_fn(transform_weights_layout))
-
-    # Apply layout_transform for output
-    sch.transform_layout(compute, outs.name, get_layout_transform_fn(transform_output_layout))
-
-    batch, height, width, channel, reduce_channel, reduce_height, reduce_width = sch.get_loops(
-        compute
-    )  # This still returns the original 7d loop
-    h_outer, h_inner = sch.split(height, [None, 8])
-    w_outer, w_inner = sch.split(width, [None, 4])
-    w_inner_outer, w_inner_inner = sch.split(w_inner, [2, 2])
-    c_outer, c_inner = sch.split(channel, [None, 32])
-    sch.reorder(
-        batch,
-        h_outer,
-        w_outer,
-        c_outer,
-        h_inner,
-        w_inner_outer,
-        reduce_height,
-        reduce_width,
-        reduce_channel,
-        c_inner,
-        w_inner_inner,
-    )
-    sch.decompose_reduction(compute, reduce_height)
-    # ci_wii = s.fuse(ci, wii)
-    # s.vectorize(ci_wii)
-    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/dense.py b/python/tvm/topi/hexagon/slice_ops/dense.py
deleted file mode 100644
index a298ff4bc98e..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/dense.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Schedule for dense operator"""
-
-from tvm import te, tir
-from tvm.topi import tag
-from ..utils import get_layout_transform_fn
-
-
-def dense_compute(tensor_a, tensor_b, bias=None, out_dtype=None):
-    """Hexagon's implementation of a sliced dense operator in Topi.
-    Uses matmul.
-
-    Parameters
-    ----------
-    tensor_a : tvm.te.Tensor
-        data 2-D with shape [batch, in_dim]
-
-    tensor_b : tvm.te.Tensor
-        weight 2-D with shape [in_dim, out_dim]
-
-    bias : Optional[tvm.te.Tensor]
-        1-D with shape [out_dim]
-
-    out_dtype : Optional[str]
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [batch, out_dim]
-
-    """
-    if bias is not None:
-        assert len(bias.shape) == 1
-    if out_dtype is None:
-        out_dtype = tensor_a.dtype
-
-    batch, in_dim = tensor_a.shape
-    out_dim, red_dim = tensor_b.shape
-
-    # cmp should be done by values
-    assert int(in_dim) == int(red_dim)
-
-    k = te.reduce_axis((0, in_dim), name="k")
-    compute_lambda = lambda n, m: te.sum(
-        tensor_a[n, k].astype(out_dtype) * tensor_b[k, m].astype(out_dtype), axis=k
-    )
-    compute_name = "matmul_sliced"
-    compute_tag = "matmul"
-
-    mat = te.compute(
-        (batch, out_dim),
-        compute_lambda,
-        name=compute_name,
-        tag=compute_tag,
-        attrs={"layout_free_placeholders": [tensor_b]},
-    )
-
-    if bias is not None:
-        mat = te.compute(
-            (batch, out_dim),
-            lambda i, j: mat[i, j] + bias[j],
-            tag=tag.BROADCAST,
-            name="bias",
-        )
-
-    return mat
-
-
-def dense_schedule(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for dense op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of dense in the format
-        of an array of tensors.
-
-    ins: Array of Tensor
-        Input tensors into graph.
-
-    output_layout: str
-        Descriptor string for physical layout
-
-    input_layout: str
-        Descriptor string for physical layout
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    if not isinstance(ins, list):
-        ins = [ins]
-    if not isinstance(outs, list):
-        outs = [outs]
-
-    func = te.create_prim_func([*ins, *outs])
-    s = tir.Schedule(func)
-
-    matmul = s.get_block("matmul_sliced")
-    try:
-        bias = s.get_block("bias")
-    except tir.schedule.schedule.ScheduleError:
-        bias = None
-
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-
-    # No bias
-    if bias is None:
-        s.transform_layout(matmul, ("read", 0), input_transform_fn)
-        # s.transform_layout(matmul, ("read", 1), input_transform_fn)
-        s.transform_layout(matmul, ("write", 0), output_transform_fn)
-    else:
-        s.transform_layout(matmul, ("read", 0), input_transform_fn)
-        s.transform_layout(bias, ("write", 0), output_transform_fn)
-
-    _, matmul_c, _ = s.get_loops(matmul)
-    _, matmul_c_inner = s.split(matmul_c, [None, 64])
-    s.vectorize(matmul_c_inner)
-
-    if bias is not None:
-        _, bias_c = s.get_loops(bias)
-        _, bias_c_inner = s.split(bias_c, [None, 64])
-        s.vectorize(bias_c_inner)
-
-    return s
diff --git a/python/tvm/topi/hexagon/slice_ops/depth_to_space.py b/python/tvm/topi/hexagon/slice_ops/depth_to_space.py
deleted file mode 100644
index aa14a97f5ee9..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/depth_to_space.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Compute and schedule for depth to space slice op
-"""
-
-from tvm import te, tir, topi
-from ..utils import get_layout_transform_fn
-
-
-def d2s_compute(inp, block_size, layout, mode):
-    """depth_to_space compute"""
-    return topi.nn.depth_to_space(inp, block_size=block_size, layout=layout, mode=mode)
-
-
-def d2s_schedule(inp, out, input_layout, output_layout):
-    """Schedule for depth to space: top level function"""
-    if (input_layout != output_layout) or (
-        output_layout not in ("nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d")
-    ):
-        raise RuntimeError(
-            f"Unexpected input_layout, output_layout '{input_layout, output_layout}'"
-        )
-    d2s_func = te.create_prim_func([inp, out])
-    sch = tir.Schedule(d2s_func, debug_mask="all")
-    compute = sch.get_block("depth_to_space")
-    sch.transform_layout(compute, inp.name, get_layout_transform_fn(input_layout))
-    sch.transform_layout(compute, out.name, get_layout_transform_fn(output_layout))
-    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/dwconv2d.py b/python/tvm/topi/hexagon/slice_ops/dwconv2d.py
deleted file mode 100644
index d94afe98bc61..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/dwconv2d.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=line-too-long
-
-"""Hexagon slice dwconv2d compute and schedule"""
-import typing
-
-import tvm
-from tvm import te
-from ..utils import get_layout_transform_fn
-
-
-def dwconv2d_compute(
-    activations: te.Tensor,
-    weights: te.Tensor,
-    out_shape: typing.Tuple,
-    stride: typing.Tuple,
-    dilation: typing.Tuple,
-    dtype: str,
-) -> te.Tensor:
-    """Compute for slice dwconv2d op for hexagon.
-    This op makes the following assumptions:
-    1. This op is written for a sliced dw convolution with 2d physical buffers
-    2. The input activations is assumed to be in NHWC layout and filter is in HWIO layout
-    Parameters
-    ----------
-    activations : te.Tensor
-        Input activations padded for inner dimension size
-    weights : te.Tensor
-        Weights without dilation
-    out_shape : typing.Tuple
-        The logical output shape without considering input padding
-    stride : typing.Tuple
-        stride
-    dilation : typing.Tuple
-        dilation
-    dtype : str
-        dtype
-    Returns
-    -------
-    output : te.Tensor
-        Output of applying 2D depthwise convolution of Weights on Input
-    """
-
-    filt_shape = weights.shape
-
-    reduce_height = tvm.te.reduce_axis((0, filt_shape[0]), name="reduce_height")
-    reduce_width = tvm.te.reduce_axis((0, filt_shape[1]), name="reduce_width")
-    stride_height, stride_width = stride
-    dilation_height, dilation_width = dilation
-    output = tvm.te.compute(
-        out_shape,
-        lambda n, h, w, c: tvm.te.sum(
-            (
-                activations[
-                    n,
-                    h * stride_height + reduce_height * dilation_height,
-                    w * stride_width + reduce_width * dilation_width,
-                    c,
-                ]
-                * weights[reduce_height, reduce_width, 0, c]
-            ).astype(dtype),
-            axis=[reduce_height, reduce_width],
-        ),
-        name="Output",
-    )
-    return output
-
-
-def dwconv2d_schedule(
-    outs: te.Tensor,
-    ins: typing.List[te.Tensor],
-    transform_activation_layout: str,
-    transform_weights: str,
-) -> tvm.tir.Schedule:
-    """STIR schedule definition for the compute defined above by dwconv2d_compute.
-        - Auto-generated prim_func before applying schedule primitives for reference
-        - The below TVMScript code is for dwconv2d with padded input dimensions and a stride of 1x1
-    # from tvm.script import tir as T
-    @tvm.script.ir_module
-    class Module:
-        @T.prim_func
-        def main(InputTensor: T.Buffer((1, 16, 8, 32), "float16"), Weights: T.Buffer((3, 3, 1, 32), "float16"), Output: T.Buffer((1, 8, 4, 32), "float16")) -> None:
-            # function attr dict
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            # body
-            # with T.block("root")
-            for i0, i1, i2, i3, i4, i5 in T.grid(1, 8, 4, 32, 3, 3):
-                with T.block("Output"):
-                    n, h, w, c, reduce_height, reduce_width = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
-                    T.reads(InputTensor[n, h + reduce_height, w + reduce_width, c], Weights[reduce_height, reduce_width, 0, c])
-                    T.writes(Output[n, h, w, c])
-                    with T.init():
-                        Output[n, h, w, c] = T.float16(0)
-                    Output[n, h, w, c] = Output[n, h, w, c] + InputTensor[n, h + reduce_height, w + reduce_width, c] * Weights[reduce_height, reduce_width, 0, c]
-        Parameters
-        ----------
-        outs : te.Tensor
-            The output Tensor as returned by a call to dwconv2d_compute
-        ins : typing.List[te.Tensor]
-            This is a list of 2 tensors - Input activations and Weights
-        transform_activation_layout : str
-            The transformation string representing the expected activations layout
-        transform_weights : typing.Callable
-            The transformation function definition for the expected weights layout
-        Returns
-        -------
-        sch : tvm.tir.Schedule
-            The STIR schedule for slice dwconv2d compute
-    """
-    assert len(ins) == 2, "This schedule expects only 2 inputs - Activations and Weights"
-    source_expr = ins + [outs]
-    prim_func = tvm.te.create_prim_func(source_expr)
-    sch = tvm.tir.Schedule(prim_func)
-    compute = sch.get_block("Output")
-    transform_layout_fn = get_layout_transform_fn(transform_activation_layout)
-    transform_layout_weights = get_layout_transform_fn(transform_weights)
-    # Apply layout_transform for activation
-    sch.transform_layout(compute, ins[0].name, transform_layout_fn)
-
-    # Apply layout_transform for weights
-    sch.transform_layout(compute, ins[1].name, transform_layout_weights)
-
-    # Apply layout_transform for output
-    sch.transform_layout(compute, outs.name, transform_layout_fn)
-
-    batch, height, width, channel, reduce_height, reduce_width = sch.get_loops(
-        compute
-    )  # This still returns the original 6d loop
-    h_outer, h_inner = sch.split(height, [None, 8])
-    w_outer, w_inner = sch.split(width, [None, 4])
-    w_inner_outer, w_inner_inner = sch.split(w_inner, [2, 2])
-    c_outer, c_inner = sch.split(channel, [None, 32])
-    sch.reorder(
-        batch,
-        h_outer,
-        w_outer,
-        c_outer,
-        h_inner,
-        w_inner_outer,
-        reduce_height,
-        reduce_width,
-        c_inner,
-        w_inner_inner,
-    )
-    sch.decompose_reduction(compute, reduce_height)
-    # ci_wii = sch.fuse(c_inner, w_inner_inner)
-    # sch.vectorize(ci_wii)
-    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py
deleted file mode 100644
index 9e6ae077851e..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Assumptions:
-1) The input is in NCHW layout. Squeezenet is the only model that calls
-   nn.global_avg_pool2d and the only layout it uses is 'NCHW'.
-2) The op takes input data as an argument.
-3) Both input and output dtype is float32 and
-4) Input is assumed to always be multiple of fixed chunk 32c8h4w.
-"""
-
-from tvm import te
-from tvm import tir
-from tvm import topi
-from ..utils import get_layout_transform_fn
-
-
-def global_avg_pool2d(
-    data: te.Tensor,
-):
-    """global_avg_pool2d"""
-    return topi.nn.global_pool(data, "avg", "NCHW")
-
-
-def stir_global_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str):
-    """Schedule"""
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-
-    sum_block = s.get_block("adaptive_pool_sum")
-
-    # Input is multiple of fixed chunk but output is NxCx1x1
-    # Hence transform_layout is only applied on input
-    input_transformed_layout = get_layout_transform_fn(input_layout)
-    s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout)
-
-    return s
diff --git a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
deleted file mode 100644
index 06911657954a..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals, condition-evals-to-constant
-
-""" Compute and schedule for max_pool2d slice op
-
-Please note the following assumptions made by the implementation:
-
-1) The input must be padded in advance to account for 'padding'. In addition,
-   both input and output must be padded as per the physical buffer layout.
-
-2) The current implementation assumes 'count_include_pad' to be 'True'. It can be
-   modified to support 'False' case but the element count for the pooling window
-   must be pre-computed and provided as an input to reduce the run-time overhead.
-
-3) 'padding' is ignored. It must be handled outside of the sliced op.
-
-4) This implementation will not work if the output includes any physical layout
-   related padding, as it can result into out-of-bound access for the input.
-"""
-
-from tvm import te
-from tvm import tir
-from ..utils import get_layout_transform_fn
-
-
-def validate_out_shape(out_shape, in_shape, kernel, stride, dilation):
-    """Validate output shape"""
-    _, oh, ow, _ = out_shape
-    _, ih, iw, _ = in_shape
-    kh, kw = kernel
-    sh, sw = stride
-    dh, dw = dilation
-    if ih < (oh - 1) * sh + dh * (kh - 1) + 1:
-        raise RuntimeError("Output height is too large")
-    if iw < (ow - 1) * sw + dw * (kw - 1) + 1:
-        raise RuntimeError("Output width is too large")
-
-
-def max_pool2d_compute(A, out_shape, kernel, stride, dilation):
-    """max_pool2d compute"""
-    kh, kw = kernel
-    rh = te.reduce_axis((0, kh), name="rh")
-    rw = te.reduce_axis((0, kw), name="rw")
-    ob, oh, ow, oc = out_shape
-    if isinstance(ob, int):
-        validate_out_shape(out_shape, A.shape, kernel, stride, dilation)
-
-    sh, sw = stride
-    dh, dw = dilation
-
-    Max = te.compute(
-        out_shape,
-        lambda b, h, w, c: te.max(
-            A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(A.dtype), axis=[rh, rw]
-        ),
-        name="max",
-    )
-    return Max
-
-
-def STIR_schedule_nhwc_8h2w32c2w_nhwc_8h8w32c(
-    outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str
-):
-    """Schedule for input and output layout nhwc-8h2w32c2w and nhwc-8h8w32c"""
-    func = te.create_prim_func([ins, outs])
-    s = tir.Schedule(func)
-
-    # NOTE!!! This scheduling logic is a work in progress.
-    # It is not known to ultimately result in near-optimal Hexagon performance.
-    # The schedule below strives to implement these heuristics:
-    #
-    # (1) For mathematical operations on tensor values, prefer HVX SIMD operations
-    #     over per-element scalar operations.
-    #
-    # (2) Minimize the number of memory transfers used to operate on tensor values:
-    #     host-memory <--> Hexagon DDR <--> VTCM <--> HVX registers
-    #
-    # As a consequence of (1) + (2), prefer TIR schedules that load each value
-    # into an HVX SIMD tensor exactly once.
-
-    Max = s.get_block("max")
-
-    if input_layout in (
-        "nhwc-8h2w32c2w-2d",
-        "nhwc-8h8w32c-2d",
-    ):
-        input_transform_fn = get_layout_transform_fn(input_layout)
-        s.transform_layout(Max, ("read", 0), input_transform_fn)
-
-    output_transform_fn = get_layout_transform_fn(output_layout)
-    s.transform_layout(Max, ("write", 0), output_transform_fn)
-
-    # pylint: disable=line-too-long
-    #
-    # Restructure the loop nestings to have this overall structure:
-    # (loop over different 128-byte output-tensor chunks) : n, ho, wo, co   }- the first level of a two-level tensor layout
-    #    (loop within one 128-byte output-tensor chunk) : hi, wio, ci, wii  }- the second level of a two-level tensor layout
-    #        (loop over reduction axes) : rh, rw                            }- loop over multiple elements of the input tensor
-    #
-    # Note: This schedule is a work in progress.  We *expect* that it's
-    # crucially important for the loops to have this relative ordering:
-    #    n ... ho ... wo ... co ... hi ... wio ... ci ... wii
-    # because it lets us visit each of the 128-byte output chunks precisely once.
-
-    (
-        n,
-        h,
-        w,
-        c,
-        rh,
-        rw,
-    ) = s.get_loops(Max)
-
-    # Restructure the loops from NHWC to nhwc_8h2w32c2w or nhwc_8h8w32c, with loops for 'max's reduction
-    # axes at the very end.
-    # nhwc_8h2w32c2w layout is for float16 and nhwc-8h8w32c-2d layout is for uint8/int8
-    if output_layout == "nhwc-8h2w32c2w-2d":
-        ho, hi = s.split(h, [None, 8])
-        wo, wi = s.split(w, [None, 4])
-        wio, wii = s.split(wi, [None, 2])
-        co, ci = s.split(c, [None, 32])
-        s.reorder(n, ho, wo, co, hi, wio, ci, wii, rh, rw)
-    elif output_layout == "nhwc-8h8w32c-2d":
-        ho, hi = s.split(h, [None, 8])
-        wo, wi = s.split(w, [None, 8])
-        co, ci = s.split(c, [None, 32])
-
-        s.reorder(n, ho, wo, co, hi, wi, ci, rh, rw)
-
-    # TODO: Enable vectorization.
-    # Hexagon v69's HVX units support SIMD operations on 64-element float16 vectors.
-    #
-    # TVM's 'vectorize' schedule primitive is the idiomatic way to encourage lower layers of the
-    # compiler to generate this kind of SIMD object code.
-    #
-    # Several requirements must be met to use 'vectorize':
-    #
-    # 1) It can only be applied to a schedule's innermost loop variable.
-    #
-    # 2) Any block-iterator(s) bound to that innermost loop variable must be
-    #    *data-parallel* block iterators.
-    #
-    # 3) Ideally, the innermost loop variable will iterate only over the output
-    #    tensor's fastest-changing indices and nothing else.  But in our case,
-    #    our two innermost loops correspond to the max operator's reduction axes.
-    #
-    # Finding a good way to satisfy all of these requirements at the same time is
-    # left for future work.
-
-    # ci_wii = s.fuse(ci, wii)
-    # s.vectorize(ci_wii_rh_rw)
-
-    return s
-
-
-def STIR_schedule_n11c(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for output layout: n11c-1024c, n11c-2048c-2d;"""
-
-    # NOTE: This function is a variation of the STIR_schedule_maxpool2d
-    # functions.  Most of that function's code comments apply to this function
-    # as well, but are ommited for brevity.
-
-    # NOTE: the "n11c-1024c" output layout is shorthand for this axis mapping:
-    # [n, h, w, c // 1024, te.AXIS_SEPARATOR, c % 1024]
-    func = te.create_prim_func([ins, outs])
-
-    s = tir.Schedule(func)
-    Max = s.get_block("max")
-
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
-    s.transform_layout(Max, ("read", 0), input_transform_fn)
-    s.transform_layout(Max, ("write", 0), output_transform_fn)
-
-    (
-        n,
-        h,
-        w,
-        c,
-        rh,
-        rw,
-    ) = s.get_loops(Max)
-    if output_layout == "n11c-1024c-2d":
-        co, ci = s.split(c, [None, 1024])
-    else:
-        co, ci = s.split(c, [None, 2048])
-    # s.vectorize(ci)
-
-    return s
-
-
-def max_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str):
-    """STIR based schedule"""
-    if output_layout == "nhwc-8h2w32c2w-2d" or "nhwc-8h8w32c-2d":
-        return STIR_schedule_nhwc_8h2w32c2w_nhwc_8h8w32c(outs, ins, output_layout, input_layout)
-    if output_layout == "n11c-1024c-2d" or "n11c-2048c-2d":
-        return STIR_schedule_n11c(outs, ins, output_layout, input_layout)
-    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/slice_ops/relu.py b/python/tvm/topi/hexagon/slice_ops/relu.py
deleted file mode 100644
index c6d03ddccb32..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/relu.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Hexagon slice relu op"""
-
-from tvm import te, tir, topi
-from ..utils import get_layout_transform_fn
-
-
-def relu_compute(Input):
-    """Relu topi compute"""
-    return topi.nn.relu(Input)
-
-
-def relu_te_sched(Output, Input, layout):
-    """
-    Schedule assumes the layout function to be bijective
-    """
-    s = te.create_schedule(Output.op)
-    s[Input].transform_layout(layout)
-    out_axes = s[Output].transform_layout(layout)
-    fused = s[Output].fuse(out_axes[6], out_axes[7])
-    s[Output].vectorize(fused)
-    return s
-
-
-def relu_stir_schedule(Input, Output, input_layout, output_layout):
-    """
-    Schedule assumes the layout function to be bijective
-    """
-    if (input_layout != output_layout) or (output_layout != "nhwc-8h2w32c2w-2d"):
-        raise RuntimeError(
-            f"Unexpected input_layout, output_layout '{input_layout, output_layout}'"
-        )
-    relu_func = te.create_prim_func([Input, Output])
-    sch = tir.Schedule(relu_func, debug_mask="all")
-    block = sch.get_block("compute")
-    sch.transform_layout(block, Input.name, get_layout_transform_fn(input_layout))
-    sch.transform_layout(block, Output.name, get_layout_transform_fn(output_layout))
-
-    n, h, w, c = sch.get_loops(block)
-    h_o, h_i = sch.split(h, [None, 8])
-    w_o, w_i = sch.split(w, [None, 4])
-    c_o, c_i = sch.split(c, [None, 32])
-    wio, wii = sch.split(w_i, [None, 2])
-
-    sch.reorder(n, h_o, w_o, c_o, h_i, wio, c_i, wii)
-
-    fused = sch.fuse(c_i, wii)
-    sch.vectorize(fused)
-    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/reshape.py b/python/tvm/topi/hexagon/slice_ops/reshape.py
deleted file mode 100644
index 2220253e21be..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/reshape.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Hexagon slice reshape compute and schedule"""
-from tvm import te, tir, topi
-from ..utils import get_layout_transform_fn
-
-
-def reshape_compute(inp: te.Tensor, new_shape: tuple) -> te.Tensor:
-    """Compute for slice reshape op for hexagon.
-    This op makes the following assumptions:
-    1. This op is written for a sliced reshape operation.
-    2. The input is assumed to be in NHWC layout.
-
-    Parameters
-    ----------
-    Input : te.Tensor
-        Input tensor
-    New Shape: tuple
-        Output shape
-    Returns
-    -------
-    Output : te.Tensor
-        Output of applying reshape operation on input
-    """
-    return topi.transform.reshape(inp, new_shape)
-
-
-def stir_sched_nhwc_2d_op(
-    out: te.Tensor,
-    inp: te.Tensor,
-    out_layout: str,
-    in_layout: str,
-    c_split: int,
-) -> tir.Schedule:
-    """Schedule for output layout: nc-1024-2d, nc-2048-2d"""
-    reshape_func = te.create_prim_func([inp, out])
-    sch = tir.Schedule(reshape_func, debug_mask="all")
-    compute = sch.get_block("T_reshape")
-
-    sch.transform_layout(compute, inp.name, get_layout_transform_fn(in_layout))
-    sch.transform_layout(compute, out.name, get_layout_transform_fn(out_layout))
-    i, j = sch.get_loops(compute)
-    jout, channel = sch.split(j, [None, inp.shape[3]])
-    height, width = sch.split(jout, [inp.shape[1], inp.shape[2]])
-    channelo, channeli = sch.split(channel, [None, 1024])
-    channelio, channelii = sch.split(channeli, [None, c_split])
-    sch.reorder(i, height, width, channelo, channelio, channelii)
-    sch.vectorize(channelii)
-    return sch
-
-
-def stir_schedule_nhwc_8h2w32c2w(
-    out: te.Tensor,
-    inp: te.Tensor,
-    out_layout: str,
-    in_layout: str,
-) -> tir.Schedule:
-    """Schedule for input and output layout nhwc-8h2w32c2w"""
-    reshape_func = te.create_prim_func([inp, out])
-    sch = tir.Schedule(reshape_func, debug_mask="all")
-    compute = sch.get_block("T_reshape")
-
-    sch.transform_layout(compute, inp.name, get_layout_transform_fn(in_layout))
-    sch.transform_layout(compute, out.name, get_layout_transform_fn(out_layout))
-    return sch
-
-
-def reshape_stir_schedule(
-    out: te.Tensor,
-    inp: te.Tensor,
-    output_layout: str,
-    input_layout: str,
-) -> tir.Schedule:
-    """STIR schedule definition for the compute of reshape compute.
-    Parameters
-    ----------
-    outputs : te.Tensor
-        The output tensor as returned by a call to reshape_compute
-    input : te.Tensor
-        Input tensor to reshape
-    out_layout: str
-        The transformation function definition for the expected output layout
-    in_layout: str
-        The transformation function definition for the input layout
-    Returns
-    -------
-    sch : tvm.tir.Schedule
-        The STIR schedule for slice reshape compute
-    """
-    if output_layout in ["nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"]:
-        return stir_schedule_nhwc_8h2w32c2w(out, inp, output_layout, input_layout)
-    if output_layout == "nc-1024-2d":
-        return stir_sched_nhwc_2d_op(out, inp, output_layout, input_layout, 64)
-    if output_layout == "nc-2048-2d":
-        return stir_sched_nhwc_2d_op(out, inp, output_layout, input_layout, 128)
-    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/slice_ops/softmax_slice.py b/python/tvm/topi/hexagon/slice_ops/softmax_slice.py
deleted file mode 100644
index f95e58f3aec6..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/softmax_slice.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hexagon slice softmax compute and schedule"""
-
-import typing
-
-from tvm import te, tir, topi
-from ..utils import get_layout_transform_fn
-
-
-def softmax_compute(in_tensor):
-    """
-    Compute for slice softmax op for hexagon.
-    This op makes the following assumptions:
-    1. This op is written for a sliced softmax operation.
-    2. The input is assumed to be in NC layout.
-    """
-    return topi.nn.softmax(in_tensor, axis=1)
-
-
-def softmax_stir_schedule(
-    out: te.Tensor, inp: te.Tensor, out_layout: typing.Callable, in_layout: typing.Callable
-):
-    """
-    STIR schedule definition for the compute of softmax
-    """
-
-    in_layout = get_layout_transform_fn(in_layout)
-    out_layout = get_layout_transform_fn(out_layout)
-
-    func = te.create_prim_func([inp, out])
-    sch = tir.Schedule(func, debug_mask="all")
-
-    max_tensor = sch.get_block("T_softmax_maxelem")
-    exp_tensor = sch.get_block("T_softmax_exp")
-    sum_tensor = sch.get_block("T_softmax_expsum")
-    out_tensor = sch.get_block("T_softmax_norm")
-
-    sch.transform_layout(max_tensor, inp.name, in_layout)
-    sch.transform_layout(out_tensor, out.name, out_layout)
-
-    _, c_inner = sch.get_loops(max_tensor)
-    _, c_inner_i = sch.split(c_inner, [None, 64])
-    rf_max = sch.rfactor(c_inner_i, 0)
-    _, _, max_inner = sch.get_loops(rf_max)
-    sch.vectorize(max_inner)
-
-    _, loopi = sch.get_loops(exp_tensor)
-    _, loopi_i = sch.split(loopi, [None, 512])
-    sch.vectorize(loopi_i)
-
-    _, c_sum_inner = sch.get_loops(sum_tensor)
-    _, c_sum_inner_i = sch.split(c_sum_inner, [None, 64])
-    rf_sum = sch.rfactor(c_sum_inner_i, 0)
-    _, _, sum_inner = sch.get_loops(rf_sum)
-    sch.vectorize(sum_inner)
-
-    _, c_out_inner = sch.get_loops(out_tensor)
-    _, c_out_inner_i = sch.split(c_out_inner, [None, 512])
-    sch.vectorize(c_out_inner_i)
-
-    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/tanh.py b/python/tvm/topi/hexagon/slice_ops/tanh.py
deleted file mode 100644
index 3e10ec599cda..000000000000
--- a/python/tvm/topi/hexagon/slice_ops/tanh.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-""" Hexagon tanh slice op compute and schedule """
-import tvm
-from tvm import te, tir
-from ..utils import get_layout_transform_fn
-
-
-def tanh_te_compute(in_tensor):
-    out_tensor = te.compute(
-        in_tensor.shape, lambda n, h, w, c: tvm.tir.tanh(in_tensor[n, h, w, c]), name="tanhf16"
-    )
-    return out_tensor
-
-
-def tanhf16_stir_sched_nhwc(func, in_layout, out_layout, h_split_factor=8):
-    """Schedule for nhwc fp16 to nchw fp16 layout"""
-    sch = tir.Schedule(func, debug_mask="all")
-    block_name = "tanhf16"
-    n, h, w, c = sch.get_loops(sch.get_block(block_name))
-    h_outer, h_inner = sch.split(h, [None, h_split_factor])
-    w_outer, w_inner = sch.split(w, [None, 4])
-    c_outer, c_inner = sch.split(c, [None, 32])
-    w_inner_o, w_inner_i = sch.split(w_inner, [None, 2])
-    sch.reorder(n, h_outer, w_outer, c_outer, h_inner, w_inner_o, c_inner, w_inner_i)
-    sch.transform_layout(block_name, "A", in_layout)
-    sch.transform_layout(block_name, block_name, out_layout)
-    fused = sch.fuse(c_inner, w_inner_i)
-    sch.vectorize(fused)
-    return sch
-
-
-def tanhf16_schedule(tanh_func, in_layout_str, out_layout_str):
-    in_layout_transform_func = get_layout_transform_fn(in_layout_str)
-    out_layout_transform_func = get_layout_transform_fn(out_layout_str)
-    return tanhf16_stir_sched_nhwc(
-        tanh_func,
-        in_layout_transform_func,
-        out_layout_transform_func,
-    )
diff --git a/python/tvm/topi/hexagon/tensor_intrin.py b/python/tvm/topi/hexagon/tensor_intrin.py
deleted file mode 100644
index 367e8ccb5a4a..000000000000
--- a/python/tvm/topi/hexagon/tensor_intrin.py
+++ /dev/null
@@ -1,368 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Optimized implementation of q_multiply_shift based on LLVM intrinsics"""
-
-import tvm
-from tvm.ir import register_intrin_lowering
-from tvm import te
-
-
-def get_lanes(dtype: str):
-    if "x" not in dtype:
-        return 1
-
-    _, lanes = dtype.split("x")
-    return int(lanes)
-
-
-def is_vector_type(dtype: str):
-    return get_lanes(dtype) != 1
-
-
-def is_power_of_2(n: int):
-    return (n & (n - 1) == 0) and n != 0
-
-
-def _adapt_to_highest_lanes(*args, intrinsic=None, intrinsic_lanes: int = 0):
-    """Apply provided lowering intrinsic to arguments with longer vector data type.
-
-    This wrapper will do next actions:
-      * Split each argument into chunks with size equal intrinsic_lanes
-      * Apply provided intrinsic for each argument chunk
-      * Concatenate results
-
-    Parameters
-    ----------
-    args: List[PrimExpr]
-        List of arguments. Each arg expression should have vector type with lanes
-        equal `intrinsic_lanes * 2**n`.
-
-    intrinsic: callable
-        Intrinsic implementation to apply.
-
-    intrinsic_lanes: int
-        Vector length required by intrinsic implementation.
-
-    Returns
-    -------
-    res : PrimExpr
-        Resulting expression.
-    """
-
-    def split_args(args_set):
-        res_args_set = []
-        for args_chunk in args_set:
-            res_args_chunk_l = []
-            res_args_chunk_h = []
-            for arg_chunk in args_chunk:
-                element, lanes = arg_chunk.dtype.split("x")
-                res_arg_chunk_dtype = f"{element}x{int(lanes) // 2}"
-
-                res_args_chunk_l.append(tvm.tir.op.vectorlow(res_arg_chunk_dtype, arg_chunk))
-                res_args_chunk_h.append(tvm.tir.op.vectorhigh(res_arg_chunk_dtype, arg_chunk))
-            res_args_set += [res_args_chunk_l, res_args_chunk_h]
-
-        return res_args_set
-
-    def concat_args(res_chunks):
-        merged_res_chunks = []
-        for i in range(0, len(res_chunks), 2):
-            arg_chunk_l = res_chunks[i]
-            arg_chunk_h = res_chunks[i + 1]
-            element, lanes = arg_chunk_l.dtype.split("x")
-            res_arg_chunk_dtype = f"{element}x{int(lanes) * 2}"
-
-            merged_res_chunks.append(
-                tvm.tir.op.vectorcombine(res_arg_chunk_dtype, arg_chunk_l, arg_chunk_h)
-            )
-
-        return merged_res_chunks
-
-    num_chunks = None
-    for arg in args:
-        _, lanes = arg.dtype.split("x")
-        lanes = int(lanes)
-        assert lanes % intrinsic_lanes == 0
-        if num_chunks is None:
-            assert is_power_of_2(lanes // intrinsic_lanes)
-            num_chunks = lanes // intrinsic_lanes
-
-        assert num_chunks == lanes // intrinsic_lanes
-
-    # Split arguments
-    lowered_args = [args]
-    while len(lowered_args) != num_chunks:
-        lowered_args = split_args(lowered_args)
-
-    # Intrinsic application
-    lowered_res = []
-    for l_arg in lowered_args:
-        res = intrinsic(*l_arg)
-        lowered_res.append(res)
-
-    # Result concatenation
-    while len(lowered_res) != 1:
-        lowered_res = concat_args(lowered_res)
-
-    return lowered_res[0]
-
-
-def _q_multiply_shift_hexagon(op):
-    """
-    Implementation of q_multiply_shift through hexagon intrinsics vmpyewuh and vmpyowh when q == 31.
-    """
-    arg_x = op.args[0]
-    arg_fractional_bits = op.args[2]
-
-    # Don't use this intrinsic if we are not multiplying q31 numbers
-    if arg_fractional_bits.value != 31:
-        return op
-
-    x_lanes = get_lanes(arg_x.dtype)
-    if x_lanes % 32 != 0 or not is_power_of_2(x_lanes // 32):
-        return op
-
-    # pylint: disable=unused-argument
-    def intrinsic_lowering_32(x, y, fractional_bits, shift):
-        lowered_dtype = "int32x32"
-
-        # Case 1, shift is negative
-        mul_e_1 = tvm.tir.call_llvm_intrin(
-            lowered_dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
-        )
-        mul_o_1 = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vmpyowh.sacc.128B",
-            tvm.tir.const(3, "uint32"),
-            mul_e_1,
-            x,
-            y,
-        )
-        fixup = 1 << (-shift - 1)
-        round_mul = mul_o_1 + fixup
-        out_negative_shift = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vaslwv.128B",
-            tvm.tir.const(2, "uint32"),
-            round_mul,
-            shift,
-        )
-
-        # Case 2, shift is positive
-        x = x * (1 << (shift))
-        mul_e_2 = tvm.tir.call_llvm_intrin(
-            lowered_dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
-        )
-        mul_o_2 = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B",
-            tvm.tir.const(3, "uint32"),
-            mul_e_2,
-            x,
-            y,
-        )
-
-        # Select depending on the shift
-        return tvm.tir.Select(shift < 0, out_negative_shift, mul_o_2)
-
-    return _adapt_to_highest_lanes(*op.args, intrinsic=intrinsic_lowering_32, intrinsic_lanes=32)
-
-
-register_intrin_lowering(
-    "tir.q_multiply_shift", target="hexagon", f=_q_multiply_shift_hexagon, level=99
-)
-
-
-def _q_multiply_shift_per_axis_hexagon(op):
-    """
-    Implementation of q_multiply_shift_per_axis through hexagon intrinsics vmpyewuh and vmpyowh when
-    q == 31.
-    """
-    arg_x = op.args[0]
-    arg_fractional_bits = op.args[4]
-    arg_is_lshift_required = op.args[5]
-    arg_is_rshift_required = op.args[6]
-
-    # Don't use this intrinsic if we are not multiplying q31 numbers
-    if arg_fractional_bits.value != 31:
-        return op
-
-    x_lanes = get_lanes(arg_x.dtype)
-    if x_lanes % 32 != 0 or not is_power_of_2(x_lanes // 32):
-        return op
-
-    # Don't use this intrinsic when we need do both: left and right shifts.
-    # For now it is not clear how to implement this case through vector HVX instructions without
-    # accuracy drop.
-    if arg_is_rshift_required.value and arg_is_lshift_required.value:
-        return op
-
-    # pylint: disable=unused-argument
-    def intrinsic_impl_32(
-        x, y, left_shift, right_shift, fractional_bits, is_lshift_required, is_rshift_required
-    ):
-        lowered_dtype = "int32x32"
-
-        # Case 1: do the left shift
-        shifted_x = x << left_shift
-        mul_e_1 = tvm.tir.call_llvm_intrin(
-            lowered_dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), shifted_x, y
-        )
-        left_shift_out = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B",
-            tvm.tir.const(3, "uint32"),
-            mul_e_1,
-            shifted_x,
-            y,
-        )
-
-        # Case 2: do the right shift
-        mul_e_2 = tvm.tir.call_llvm_intrin(
-            lowered_dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
-        )
-        mul_o_2 = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vmpyowh.sacc.128B",
-            tvm.tir.const(3, "uint32"),
-            mul_e_2,
-            x,
-            y,
-        )
-        fixup = 1 << (right_shift - 1)
-        round_mul = mul_o_2 + fixup
-        right_shift_out = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vasrwv.128B",
-            tvm.tir.const(2, "uint32"),
-            round_mul,
-            right_shift,
-        )
-
-        # Case 3: do neither right nor left shift
-        mul_e_3 = tvm.tir.call_llvm_intrin(
-            lowered_dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
-        )
-        no_shift_out = tvm.tir.call_llvm_intrin(
-            lowered_dtype,
-            "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B",
-            tvm.tir.const(3, "uint32"),
-            mul_e_3,
-            x,
-            y,
-        )
-
-        return tvm.tir.Select(
-            tvm.tir.Not(tvm.tir.Or(is_lshift_required, is_rshift_required)),
-            no_shift_out,
-            tvm.tir.Select(is_lshift_required, left_shift_out, right_shift_out),
-        )
-
-    return _adapt_to_highest_lanes(*op.args, intrinsic=intrinsic_impl_32, intrinsic_lanes=32)
-
-
-register_intrin_lowering(
-    "tir.q_multiply_shift_per_axis",
-    target="hexagon",
-    f=_q_multiply_shift_per_axis_hexagon,
-    level=99,
-)
-
-
-def dot_vrmpy(x_ty, y_ty):
-    """Generates vrmpy instruciton for tensorization."""
-    int32_lanes = 32
-    num_int8_elements = 4  # 4 int8 elements in int32
-    data = te.placeholder((num_int8_elements,), dtype=x_ty, name="data")
-    kernel = te.placeholder((int32_lanes, num_int8_elements), dtype=y_ty, name="kernel")
-    k = te.reduce_axis((0, num_int8_elements), name="k")
-    C = te.compute(
-        (int32_lanes,),
-        lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        data.shape, dtype=x_ty, name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(
-        kernel.shape, dtype=y_ty, name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x32")))
-                return ib.get()
-
-            vec_zero = tvm.tir.const(0, "int32x32")
-
-            if x_ty == "uint8" and y_ty == "uint8":
-                a_uint8 = ins[0].vload([0], "uint8x4")
-                re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_uint8)
-                vec_b = ins[1].vload([0, 0], "uint8x128")
-
-                vrmpy_inst_name = "llvm.hexagon.V6.vrmpyub.acc.128B"
-
-                vec_bi32 = tvm.tir.call_intrin("int32x32", "tir.reinterpret", vec_b)
-
-                quad_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int32x32",
-                    vrmpy_inst_name,
-                    tvm.tir.const(3, "uint32"),
-                    vec_zero,
-                    vec_bi32,
-                    re_int32,
-                )
-            elif x_ty == "uint8" and y_ty == "int8":
-                a_uint8 = ins[0].vload([0], "uint8x4")
-                re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_uint8)
-                vec_b = ins[1].vload([0, 0], "int8x128")
-
-                vrmpy_inst_name = "llvm.hexagon.V6.vrmpybusv.acc.128B"
-
-                vec_bi32 = tvm.tir.call_intrin("int32x32", "tir.reinterpret", vec_b)
-
-                quad_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int32x32",
-                    vrmpy_inst_name,
-                    tvm.tir.const(3, "uint32"),
-                    vec_zero,
-                    re_int32.astype("int32x32"),
-                    vec_bi32,
-                )
-            else:
-                raise ValueError("Only (u8, u8) or (u8, i8) dtype pairs are supported by vrmpy.")
-
-            if index == 0:
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:
-                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x32")))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={data: a_buffer, kernel: b_buffer},
-        default_buffer_params=buffer_params,
-    )
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
deleted file mode 100644
index aa1af5de43db..000000000000
--- a/python/tvm/topi/hexagon/utils.py
+++ /dev/null
@@ -1,465 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name
-
-
-"""Common hexagon specific utilities"""
-import math
-import struct
-from typing import Dict, Tuple, Union
-
-import tvm
-from tvm import IRModule, te, tir
-from tvm.tir import IndexMap, PrimFunc
-
-
-def is_scalar(expr):
-    if isinstance(expr, te.Tensor):
-        return expr.ndim == 0 and (isinstance(expr.op.body[0], (tir.FloatImm, tir.IntImm)))
-    return isinstance(expr, (tir.FloatImm, tir.IntImm))
-
-
-def get_const_int_value(expr):
-    if isinstance(expr, te.Tensor):
-        assert isinstance(expr.op.body[0], tir.IntImm)
-        return expr.op.body[0].value
-    return tvm.topi.utils.get_const_int(expr)
-
-
-def get_const_float_value(expr):
-    if isinstance(expr, te.Tensor):
-        assert isinstance(expr.op.body[0], tir.FloatImm)
-        return expr.op.body[0].value
-    return tvm.topi.utils.get_const_float(expr)
-
-
-def n11c_1024c_2d(n, h, w, c):
-    """Return index map for n11c_1024 2d layout"""
-    return [n, h, w, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024]
-
-
-def n11c_1024c_1d(n, h, w, c):
-    """Return index map for n11c_1024 1d layout"""
-    return [n, h, w, c // 1024, c % 1024]
-
-
-def nc11_1024c_2d(n, c, h, w):
-    """Return index map for nc11_1024 2d layout"""
-    return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024, h, w]
-
-
-def nhwc_8h2w32c2w_2d(n, h, w, c):
-    """Return index map for nhwc_8h2w32c2w 2d layout"""
-    return [n, h // 8, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2]
-
-
-def nhwc_8h2w32c2w_1d(n, h, w, c):
-    """Return index map for nhwc_8h2w32c2w 1d layout"""
-    return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2]
-
-
-def nchw_8h2w32c2w_2d(n, c, h, w):
-    """Return index map for nchw_8h2w32c2w 2d layout"""
-    return [n, c // 32, h // 8, w // 4, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2]
-
-
-def nhw_32h16w_2d(n, h, w):
-    """Return index map for nhw_32h16w 2d layout"""
-    return [n, h // 32, w // 16, IndexMap.AXIS_SEPARATOR, h % 32, w % 16]
-
-
-def nhwc_4h4w32c_1d(n, h, w, c):
-    """Return index map for nhwc_4h4232c 1d layout"""
-    return [n, h // 4, w // 4, c // 32, h % 4, w % 4, c % 32]
-
-
-def nhwc_4h4w32c_2d(n, h, w, c):
-    """Return index map for nhwc_4h4w32c 2d layout"""
-    return [n, h // 4, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 4, w % 4, c % 32]
-
-
-def nc_512c_1d(n, c):
-    """Return index map for nc_512c 1d layout"""
-    return [n, c // 512, c % 512]
-
-
-def nc_512c_2d(n, c):
-    """Return index map for nc_512c 2d layout"""
-    return [n, c // 512, IndexMap.AXIS_SEPARATOR, c % 512]
-
-
-def nc_1024c_2d(n, c):
-    """Return index map for nc_1024c 2d layout"""
-    return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024]
-
-
-def nc_2048c_1d(n, c):
-    """Return index map for nc_2024c 1d layout"""
-    return [n, c // 2048, c % 2048]
-
-
-def nc_2048c_2d(n, c):
-    """Return index map for nc_2024c 2d layout"""
-    return [n, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048]
-
-
-def nc11_2048c_2d(n, c, h, w):
-    """Return index map for nc11_2048c 2d layout"""
-    return [n, c // 2048, IndexMap.AXIS_SEPARATOR, h, w, c % 2048]
-
-
-def nc_1024c_1d(n, c):
-    """Return index map for nc_1024c 1d layout"""
-    return [n, c // 1024, c % 1024]
-
-
-def nhwc_4h2w32c2w_2d(n, h, w, c):
-    """Return index map for nhwc_4h2w32c2w 2d layout"""
-    return [n, h // 4, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 4, (w % 4) // 2, c % 32, w % 2]
-
-
-def nhwc_1024c_2d(n, h, w, c):
-    """Return index map for nhwc_1024 2d layout"""
-    return [n, h, w, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024]
-
-
-def nc_1024_2d(n, c):
-    """Return index map for nc_1024 2d layout"""
-    return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024]
-
-
-def nhwc_2048c_2d(n, h, w, c):
-    """Return index map for nhwc_2048 2d layout"""
-    return [n, h, w, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048]
-
-
-def nc_2048_2d(n, c):
-    """Return index map for nc_2048 2d layout"""
-    return [n, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048]
-
-
-def nhwc_8h8w32c_2d(n, h, w, c):
-    """Return index map for nhwc_8h8w32c 2d layout"""
-    return [n, h // 8, w // 8, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32]
-
-
-def nhwc_8h8w32c_1d(n, h, w, c):
-    """Return index map for nhwc_8h8w32c 1d layout"""
-    return [n, h // 8, w // 8, c // 32, h % 8, w % 8, c % 32]
-
-
-def nchw_8h8w32c_2d(n, c, h, w):
-    return [n, c // 32, h // 8, w // 8, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32]
-
-
-def n11c_2048c_2d(n, h, w, c):
-    """Return index map for n11c_2048c 2d layout"""
-    return [n, h, w, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048]
-
-
-def n11c_2048c_1d(n, h, w, c):
-    """Return index map for n11c_2048c 1 layout"""
-    return [n, h, w, c // 2048, c % 2048]
-
-
-def iohw_16i32o2i_1d(height, width, in_channel, out_channel):
-    return [
-        in_channel // 32,
-        out_channel // 32,
-        height,
-        width,
-        (in_channel % 32) // 2,
-        out_channel % 32,
-        in_channel % 2,
-    ]
-
-
-def ohwi32o_1d(height, width, in_channel, out_channel):
-    return [out_channel // 32, height, width, in_channel, out_channel % 32]
-
-
-def ncw_32c64w_2d(n, c, w):
-    """Return index map for ncw_32c64w 2d layout"""
-    return [n, c // 32, w // 64, IndexMap.AXIS_SEPARATOR, c % 32, w % 64]
-
-
-def nchw_32c8h8w_2d(n, c, h, w):
-    return [n, c // 32, h // 8, w // 8, IndexMap.AXIS_SEPARATOR, c % 32, h % 8, w % 8]
-
-
-def nchw_32c8h4w_2d(n, c, h, w):
-    return [n, c // 32, h // 8, w // 4, IndexMap.AXIS_SEPARATOR, c % 32, h % 8, w % 4]
-
-
-def get_layout_transform_fn(layout):
-    """Return index map function as per the layout string"""
-    if layout == "nhwc-8h2w32c2w-2d":
-        return nhwc_8h2w32c2w_2d
-    if layout == "nhwc-8h2w32c2w-1d":
-        return nhwc_8h2w32c2w_1d
-    if layout == "nchw-8h2w32c2w-2d":
-        return nchw_8h2w32c2w_2d
-    if layout == "n11c-1024c-2d":
-        return n11c_1024c_2d
-    if layout == "n11c-1024c-1d":
-        return n11c_1024c_1d
-    if layout == "nhwc-1024c-2d":
-        return nhwc_1024c_2d
-    if layout == "nc11-1024c-2d":
-        return nc11_1024c_2d
-    if layout == "nc-1024-2d":
-        return nc_1024_2d
-    if layout == "nhw-32h16w-2d":
-        return nhw_32h16w_2d
-    if layout == "nhwc-4h4w32c-2d":
-        return nhwc_4h4w32c_2d
-    if layout == "nhwc-4h4w32c-1d":
-        return nhwc_4h4w32c_1d
-    if layout == "nc-512c-2d":
-        return nc_512c_2d
-    if layout == "nc-512c-1d":
-        return nc_512c_1d
-    if layout == "nhwc-4h2w32c2w-2d":
-        return nhwc_4h2w32c2w_2d
-    if layout == "nc-2048c-1d":
-        return nc_2048c_1d
-    if layout == "nc-2048c-2d":
-        return nc_2048c_2d
-    if layout == "nc-1024c-2d":
-        return nc_1024c_2d
-    if layout == "nc-1024c-1d":
-        return nc_1024c_1d
-    if layout == "iohw-16i32o2i-1d":
-        return iohw_16i32o2i_1d
-    if layout == "nhwc-2048c-2d":
-        return nhwc_2048c_2d
-    if layout == "nc-2048-2d":
-        return nc_2048_2d
-    if layout == "nc-2048c-2d":
-        return nc_2048c_2d
-    if layout == "nhwc-8h8w32c-2d":
-        return nhwc_8h8w32c_2d
-    if layout == "nhwc-8h8w32c-1d":
-        return nhwc_8h8w32c_1d
-    if layout == "nchw-8h8w32c-2d":
-        return nchw_8h8w32c_2d
-    if layout == "n11c-2048c-2d":
-        return n11c_2048c_2d
-    if layout == "n11c-2048c-1d":
-        return n11c_2048c_1d
-    if layout == "ohwi32o-1d":
-        return ohwi32o_1d
-    if layout == "nc11-2048c-2d":
-        return nc11_2048c_2d
-    if layout == "ncw-32c64w-2d":
-        return ncw_32c64w_2d
-    if layout == "nchw-32c8h8w-2d":
-        return nchw_32c8h8w_2d
-    if layout == "nchw-32c8h4w-2d":
-        return nchw_32c8h4w_2d
-    if layout == "nchw-8h8w32c-2d":
-        return nchw_8h8w32c_2d
-    raise RuntimeError(f"Unexpected layout '{layout}'")
-
-
-def get_fixed_point_value(flp: float, dtype: str = "int16") -> Tuple[int, int]:
-    """
-    Return fixed-point value and the corresponding log2 of the scale factor used to compute
-    this value.
-
-    Parameters
-    ----------
-    flp : float
-        Floating-point value to be converted
-    dtype : str
-        Type of the resulting fixed-point value. By default, it's set to "int16"
-
-    Returns
-    -------
-    fixed_point_value : int
-        Fixed-point value for the given floating-point value
-    exp_scale_factor : int
-        log2 of the scale factor
-
-    Convert floating-point value into fixed-point number. This is done by
-    multiplying the value by a scaling factor and then rounding it to the nearest
-    integer value.
-
-    As per IEEE-754 standard, a floating-point value can be represented as follows
-    [see: https://en.wikipedia.org/wiki/IEEE_754-1985]:
-        (-1)^S * M * 2^(E-Bias)
-
-    Here,
-    * S is the signed bit (0 or 1).
-    * M is the mantissa. It's composed of an implicit 1 for the normalized floating-point
-      values or 0 for the denormalized values, and the fraction part. This ensures that
-      mantissa is always within [0, 2) range. Please note that this function doesn't
-      handle denormalized values.
-    * E is the exponent.
-
-    In single precision, 23 bits are used to represent the fraction part of
-    the mantissa (and therefore, '23' shows up in one of the computations below) and
-    8 bits are used for the exponent. Since exponent field needs to reperesent both
-    positive and negative values, a bias (127 for single precision) is added to the actual
-    value. Therefore, to compute the actual exponent, 127 must be subtracted from the stored
-    value.
-
-    As mentioned above, to find the corresponding fixed-point number, we multiply the
-    value with a scaling factor and then round it to the nearest integer. The scaling factor
-    is chosen to be a power for 2 and it's the largest value that can be safely multiplied
-    to the floating-point value, without causing the resulting value to overflow the range
-    of the integer type used to represent the fixed-point value.
-
-    So, if we assume the scaling factor to be 2^x, the resulting fixed-point value will be:
-        round((-1)^S * (M) * 2^(E-Bias) * 2^x)
-
-    This can be simplified to:
-        round((-1)^S * M * 2^(E-Bias+x)
-
-    Now, if 'int16' is used for fixed-point value, then it has to be >= -(2 * 2^14)
-    and <= (2 * 2^14) - 1. Since M (Mantissa) is always < 2, in order for the fixed-point value
-    to be within this range, 2^(E - Bias + x) must be <= 2^14 - 1.
-    And, if we ignore -1, (E - Bias + x) should be <= 14. Note: if mantissa gets too close to 2,
-    this will cause the resulting value to go out of range and require it to be saturated.
-    In the following implementation, we perform range check and adjust the scale to avoid
-    saturation.
-    For most cases, 2^x, where x = 14 - (E - Bias) or 14 - (E - 127) for single precision, is the
-    best scaling factor for 'int16' type that can be used to convert the floating-point value to
-    fixed-point with the least amount of precision loss.
-
-
-    Here is a more rigorous explanation of the above, for non-negative scale values, which are of
-    interest. M < 2, so M * 2^(E-Bias+x) < 2 ^ (E-Bias+x+1)   [Note: LHS is a fraction, RHS int]
-    => round(M * 2^(E-Bias+x)) <= 2 ^ (E-Bias+x+1)  [Note the "<=", not "<"]
-    We want x s.t. round(M * 2^(E-Bias+x)) <= 2^15 - 1
-    We know round(M * 2^(E-Bias+x)) <= 2^(E-Bias+x+1)
-    It will be sufficient to choose x s.t. 2^(E-Bias+x+1) <= 2^15 - 1
-    That is, max x. s.t. 2^(E-Bias+x+1) < 2^15
-    E-Bias+x+1 < 15
-    E-Bias+x+1 <= 14
-    Max x will make E-Bias+x+1 = 14
-    x = 13 - E + Bias
-
-    Additonal notes on various floating-point values:
-    ------------------------------------------------
-    1) Denormalized values: causes assertion failure. The problem with the denormalized values
-        is that they require a very large scale factor (>= 2^127) to be converted to a fixed-point
-        value. As the denormalzied values get smaller, the scale factor becomes too large to be
-        represented as a IEEE-754 floating point value (as being done in the computaton below)
-        and therefore, the denormalized values aren't being handled here.
-    2) NaN and INF: assertion failure
-    """
-
-    def within_range(val, dtype):
-        if dtype == "int16":
-            return -32768 <= val <= 32767
-        raise RuntimeError(f"Unsupported dtype, {dtype}'")
-
-    # Make sure that 'flp' isn't NaN or infinity
-    if math.isnan(flp) or math.isinf(flp):
-        raise RuntimeError("NaN or INF can not be represented as fixed-point")
-
-    flp_f = struct.pack("f", flp)
-    flp_i = struct.unpack("I", flp_f)
-    exp_stored_value = (flp_i[0] >> 23) & 0xFF
-
-    if exp_stored_value == 0:
-        raise RuntimeError(
-            "Denormalized values are not considered for float -> fixed-point conversion!"
-        )
-
-    exp_value = ((flp_i[0] >> 23) & 0xFF) - 127
-    if dtype == "int16":
-        max_bits = 14
-    else:
-        raise RuntimeError(f"Unsupported dtype, {dtype}'")
-
-    exp_scale_factor = max_bits - exp_value  # log2 of the scale_factor
-
-    if exp_scale_factor > 127:
-        raise RuntimeError("Value too small for fixed-point conversion!")
-
-    # Scaling factor = 2^exp_scale_factor
-    # Since exp_scale_factor can be -ve or +ve, scaling factor is calculated by first
-    # representing the value in the binary format as per IEEE floating-point standand and then
-    # reinterpreting it as a float using struct.pack and struct.unpack functions.
-    # struct.pack returns a bytes object packed as integer and struct.unpack
-    # unpacks this bytes object into float.
-    scale = ((exp_scale_factor + 127) & 0xFF) << 23
-    scale_i = struct.pack("I", scale)
-    scale_f = struct.unpack("f", scale_i)
-    fixed_point_value = int(round(flp * scale_f[0]))
-
-    if not within_range(fixed_point_value, dtype):
-        # Adjust scale factor to avoid overflow.
-        exp_scale_factor -= 1
-        scale = ((exp_scale_factor + 127) & 0xFF) << 23
-        scale_i = struct.pack("I", scale)
-        scale_f = struct.unpack("f", scale_i)
-        fixed_point_value = int(round(flp * scale_f[0]))
-
-    return fixed_point_value, exp_scale_factor
-
-
-def saturate(x: te.Tensor, dtype: str):
-    """Saturate value for the specified data type"""
-    return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype)))
-
-
-def get_vtcm_allocation_sizes(
-    func_or_mod: Union[PrimFunc, IRModule], compacted=True
-) -> Dict[str, int]:
-    """Calculate and return the vtcm allocation sizes for all the functions in
-    the IRModule or just the vtcm size if a single PrimFunc is passed
-
-    Parameters
-    ----------
-    func_or_mod : Union[PrimFunc, IRModule]
-        PrimFunc or IRModule for which VTCM allocation size is to be calculated
-    compacted :
-        Whether to calculate the sizes after applying VTCM lowering passes for
-        buffer compaction. This helps return the VTCM size that would get
-        allocated after lowering
-
-    Returns
-    -------
-    result : Dict[str, int]
-        A dict with function names as keys and vtcm allocated
-        inside that function as values
-
-    """
-    if not isinstance(func_or_mod, (PrimFunc, IRModule)):
-        raise TypeError(
-            f"Expected argument to be PrimFunc or IRModule, but received {type(func_or_mod)}"
-        )
-    if isinstance(func_or_mod, tvm.tir.PrimFunc):
-        mod = tvm.IRModule.from_expr(func_or_mod)
-    else:
-        mod = func_or_mod
-    if compacted:
-        passes = tvm.tir.analysis.get_vtcm_compaction_passes()
-        mod = tvm.transform.Sequential(list(passes))(mod)
-
-    result = {}
-    all_sizes = tvm.tir.analysis.calculate_allocated_bytes(mod)
-    for func_name, sizes in all_sizes.items():
-        if "global.vtcm" in sizes:
-            result[func_name] = sizes["global.vtcm"]
-        else:
-            result[func_name] = 0
-    return result
diff --git a/python/tvm/topi/hls/__init__.py b/python/tvm/topi/hls/__init__.py
deleted file mode 100644
index dff460b6149f..000000000000
--- a/python/tvm/topi/hls/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""HLS specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
-from .nn import *
diff --git a/python/tvm/topi/hls/injective.py b/python/tvm/topi/hls/injective.py
deleted file mode 100644
index 931935973ed1..000000000000
--- a/python/tvm/topi/hls/injective.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable,
-"""Schedule for composition of injective operator"""
-import tvm
-from tvm import te
-
-
-def schedule_injective_from_existing(sch, out):
-    """Schedule for injective op from existing schedule.
-
-    Parameters
-    ----------
-    sch: Schedule
-         The schedule to update.
-    out: Tensor
-         The tensor representing the injective op.
-
-    Returns
-    -------
-    sch: Schedule
-         The updated schedule.
-    """
-    fused = sch[out].fuse(*sch[out].op.axis)
-    px, x = sch[out].split(fused, nparts=1)
-    sch[out].bind(px, te.thread_axis("pipeline"))
-    return sch
-
-
-def schedule_injective(outs):
-    """Schedule for injective op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-    for out in outs:
-        schedule_injective_from_existing(s, out)
-    return s
-
-
-schedule_elemwise = schedule_injective
-schedule_broadcast = schedule_injective
diff --git a/python/tvm/topi/hls/nn.py b/python/tvm/topi/hls/nn.py
deleted file mode 100644
index 4d0f1f66d74a..000000000000
--- a/python/tvm/topi/hls/nn.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""HLS nn operators"""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-from .. import tag
-
-
-def _schedule_conv2d(outs):
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-        # schedule conv2d
-        elif OP.tag.find("conv2d") >= 0:
-            Conv2d = OP.output(0)
-            if not Conv2d.op in s.outputs:
-                Out = outs[0].op.output(0)
-                s[Conv2d].compute_at(s[Out], s[Out].op.axis[1])
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-    traverse(outs[0].op)
-
-    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
-    s[outs[0]].bind(px, te.thread_axis("pipeline"))
-    return s
-
-
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_conv2d_nhwc(outs):
-    """Schedule for conv2d_nhwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_conv2d_NCHWc(outs):
-    """Schedule for conv2d_NCHW[x]c
-
-    Parameters
-    ----------
-    outs : Array of Tensor
-        The computation graph description of conv2d_NCHWc
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    sch : Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_conv2d_transpose_nchw(outs):
-    """Schedule for conv2d_transpose_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_transpose_nchw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_depthwise_conv2d_nchw(outs):
-    """Schedule for depthwise_conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of depthwise_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_depthwise_conv2d_nhwc(outs):
-    """Schedule for depthwise_conv2d_nhwc
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of depthwise_conv2d_nhwc
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_bitserial_conv2d_nchw(outs):
-    """Schedule for bitserial_conv2d_nchw
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of bitserial_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_bitserial_conv2d_nhwc(outs):
-    """Schedule for bitserial_conv2d_nhwc
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of bitserial_conv2d_nchw
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _schedule_conv2d(outs)
-
-
-def schedule_reduce(outs):
-    """Schedule for reduction
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of reduce
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-        elif OP.tag in ["comm_reduce", "comm_reduce_idx"]:
-            if OP.tag == "comm_reduce":
-                Reduce = OP.output(0)
-            else:
-                Reduce = OP.input_tensors[0]
-            if not Reduce.op in s.outputs:
-                Out = outs[0].op.output(0)
-                s[Reduce].compute_at(s[Out], s[Out].op.axis[0])
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-    traverse(outs[0].op)
-
-    fused = s[outs[0]].fuse()
-    px, x = s[outs[0]].split(fused, nparts=1)
-    s[outs[0]].bind(px, te.thread_axis("pipeline"))
-    return s
-
-
-def schedule_softmax(outs):
-    """Schedule for softmax
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of softmax
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    softmax = outs[0]
-
-    op_tag = softmax.op.tag
-    if op_tag == "softmax_output":
-        expsum = softmax.op.input_tensors[1]
-        exp = softmax.op.input_tensors[0]
-        max_elem = s[exp].op.input_tensors[1]
-    elif op_tag == "log_softmax_output":
-        exp = None
-        max_elem = softmax.op.input_tensors[1]
-        expsum = softmax.op.input_tensors[2]
-    else:
-        raise ValueError(
-            f"Tag is expected to be softmax_output or log_softmax_output. Got {op_tag}"
-        )
-
-    if exp is not None:
-        s[exp].compute_at(s[softmax], s[softmax].op.axis[1])
-
-    s[expsum].compute_at(s[softmax], s[softmax].op.axis[1])
-    s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1])
-
-    px, x = s[softmax].split(softmax.op.axis[0], nparts=1)
-    s[softmax].bind(px, te.thread_axis("pipeline"))
-    return s
-
-
-def schedule_dense(outs):
-    """Schedule for dense
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of dense
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-        # schedule dense
-        elif OP.tag == "dense":
-            Dense = OP.output(0)
-            if not Dense.op in s.outputs:
-                Out = outs[0].op.output(0)
-                s[Dense].compute_at(s[Out], s[Out].op.axis[1])
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-    traverse(outs[0].op)
-
-    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
-    s[outs[0]].bind(px, te.thread_axis("pipeline"))
-    return s
-
-
-def schedule_pool(outs, layout):
-    """Schedule for pool
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of pool
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-        # schedule pool
-        elif OP.tag.startswith("pool"):
-            Pool = OP.output(0)
-            if not Pool.op in s.outputs:
-                Out = outs[0].op.output(0)
-                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-    traverse(outs[0].op)
-
-    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
-    s[outs[0]].bind(px, te.thread_axis("pipeline"))
-    return s
-
-
-def schedule_adaptive_pool(outs):
-    """Schedule for adaptive_pool
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of adaptive_pool
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    tvm.te.schedule.AutoInlineInjective(s)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-        # schedule global_pool
-        elif OP.tag.startswith("adaptive_pool"):
-            Pool = OP.output(0)
-            if not Pool.op in s.outputs:
-                Out = outs[0].op.output(0)
-                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-    traverse(outs[0].op)
-
-    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
-    s[outs[0]].bind(px, te.thread_axis("pipeline"))
-    return s
diff --git a/python/tvm/topi/intel_graphics/__init__.py b/python/tvm/topi/intel_graphics/__init__.py
deleted file mode 100644
index 5f82fe758786..000000000000
--- a/python/tvm/topi/intel_graphics/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""Intel Gen9 GPU specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-from .conv2d import *
-from . import conv2d_alter_op
-from .depthwise_conv2d import *
diff --git a/python/tvm/topi/intel_graphics/conv2d.py b/python/tvm/topi/intel_graphics/conv2d.py
deleted file mode 100644
index b7906cdb9108..000000000000
--- a/python/tvm/topi/intel_graphics/conv2d.py
+++ /dev/null
@@ -1,656 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches, too-many-boolean-expressions
-"""conv2d schedule on Intel Graphics"""
-
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from .. import nn
-from .. import utils
-from ..utils import simplify, get_const_tuple, traverse_inline
-
-
-def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False):
-    if is_depthwise:
-        raise RuntimeError("Depthwise not supported for intel graphics.")
-
-    batch_size, in_channel, height, width = get_const_tuple(data.shape)
-    out_channel, _, hkernel, _ = get_const_tuple(kernel.shape)
-    HSTR, _ = strides
-
-    ic_bn = 1
-    oc_bn, oc_bn_upper = 16, 16
-    for i in range(oc_bn_upper, 0, -1):
-        if out_channel % i == 0:
-            oc_bn = i
-            break
-
-    if HSTR == 2:
-        if out_channel + hkernel == 515:
-            block_oh = 4
-            block_ow = 4
-        else:
-            block_oh = 4
-            block_ow = 5
-    elif hkernel == 3:
-        if out_channel == 512:
-            block_oh = 2
-            block_ow = 7
-        else:
-            block_oh = 2
-            block_ow = 14
-    else:
-        block_oh = 1
-        block_ow = 16
-    cfg["tile_ic"] = SplitEntity([in_channel // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([out_channel // oc_bn, oc_bn])
-    cfg["block_oh"] = OtherOptionEntity(block_oh)
-    cfg["block_ow"] = OtherOptionEntity(block_ow)
-
-
-def _create_schedule_template(cfg, dshape, kshape, strides, padding, dilation):
-    """Create schedule configuration from input arguments"""
-    n, ic, h, w = dshape
-    oc, _, kh, kw = kshape
-
-    pt, pl, pb, pr = nn.get_pad_tuple(padding, (kh, kw))
-    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    oh = (h - kh + pt + pb) // sh + 1
-    ow = (w - kw + pl + pr) // sw + 1
-    ic_bn_upper = 32
-    oc_bn_upper = 64
-    oc_bn_lower = min(oc, 8)
-    ic_bn_candidates, oc_bn_candidates = [], []
-    for i in range(1, ic + 1):
-        if ic % i == 0 and i <= ic_bn_upper:
-            ic_bn_candidates.append(i)
-    if not ic_bn_candidates:
-        ic_bn_candidates.append(1)
-        ic_bn_candidates.append(ic)
-
-    for i in range(1, oc + 1):
-        if oc % i == 0 and oc_bn_lower <= i <= oc_bn_upper:
-            oc_bn_candidates.append(i)
-    if not oc_bn_candidates:
-        oc_bn_candidates.append(1)
-        oc_bn_candidates.append(oc)
-
-    blk_candidates_low_limits = 5
-    blk_oh_list, blk_ow_list = [], []
-    for i, j in zip(range(oh, 0, -1), range(ow, 0, -1)):
-        if i <= 16 and oh % i == 0:
-            blk_oh_list.append(i)
-        if j <= 16 and ow % j == 0:
-            blk_ow_list.append(j)
-
-    if len(blk_oh_list) < blk_candidates_low_limits:
-        for i in range(2, oh):
-            if i not in blk_oh_list:
-                blk_oh_list.append(i)
-                if len(blk_oh_list) >= 5:
-                    break
-
-    if len(blk_ow_list) < blk_candidates_low_limits:
-        for i in range(min(ow - 1, 16), 1, -1):
-            if i not in blk_ow_list:
-                blk_ow_list.append(i)
-                if len(blk_ow_list) >= 5:
-                    break
-
-    # Create schedule config
-    cfg.define_knob("tile_ic", ic_bn_candidates)
-    cfg.define_knob("tile_oc", oc_bn_candidates)
-    cfg.define_knob("block_oh", blk_oh_list)
-    cfg.define_knob("block_ow", blk_ow_list)
-
-
-##### SCHEDULE UTILITIES #####
-def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-    """tile and bind 3d"""
-    y_factor = y_factor or z_factor
-    x_factor = x_factor or y_factor
-    zo, zi = s[tensor].split(z, z_factor)
-    yo, yi = s[tensor].split(y, y_factor)
-    xo, xi = s[tensor].split(x, x_factor)
-    s[tensor].reorder(zo, yo, xo, zi, yi, xi)
-
-    thread_z = te.thread_axis((0, z_factor), "threadIdx.z")
-    thread_y = te.thread_axis((0, y_factor), "threadIdx.y")
-    thread_x = te.thread_axis((0, x_factor), "threadIdx.x")
-    s[tensor].bind(zo, te.thread_axis("blockIdx.z"))
-    s[tensor].bind(zi, thread_z)
-    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, thread_y)
-    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, thread_x)
-    return xi, thread_z, thread_y, thread_x
-
-
-def _pack_data(data, kernel, ic_bn, oc_bn):
-    n, _, ih, iw = get_const_tuple(data.shape)
-    oc, ic, kh, kw = get_const_tuple(kernel.shape)
-
-    ic_chunk = ic // ic_bn
-    oc_chunk = oc // oc_bn
-
-    data = te.compute(
-        (n, ic_chunk, ih, iw, ic_bn),
-        lambda bs, c, h, w, vc: data[bs, c * ic_bn + vc, h, w],
-        name="data_vec",
-    )
-
-    kernel = te.compute(
-        (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn),
-        lambda occ, icc, k_h, k_w, icb, ocb: kernel[occ * oc_bn + ocb, icc * ic_bn + icb, k_h, k_w],
-        name="kernel_vec",
-    )
-
-    return data, kernel
-
-
-@autotvm.register_topi_compute("conv2d_NCHWc.intel_graphics")
-def conv2d_NCHWc(
-    cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype="float32"
-):
-    """Conv2D operator for Intel Graphics backend.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    kernel : tvm.te.Tensor
-        5-D with shape [num_filter, in_channel, filter_height, filter_width, nnum_filter_vec]
-
-    stride : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding : int or a list/tuple of two ints
-        padding size, or [pad_height, pad_width]
-
-    layout : str
-        layout of data
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    if len(data.shape) == 5:
-        batch, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
-        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)
-        in_channel = ic_chunk * ic_bn
-        num_filter = oc_chunk * oc_bn
-    else:
-        batch, in_channel, ih, iw = get_const_tuple(data.shape)
-        num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
-        padding, (kernel_height, kernel_width)
-    )
-    assert (dh, dw) == (1, 1), "Does not support dilation"
-    if isinstance(strides, (tuple, list)):
-        stride_h, stride_w = strides
-    else:
-        stride_h, stride_w = strides, strides
-
-    data_shape = (batch, in_channel, ih, iw)
-    kernel_shape = (num_filter, in_channel, kernel_height, kernel_width)
-    _create_schedule_template(cfg, data_shape, kernel_shape, strides, padding, dilation)
-
-    if cfg.is_fallback:
-        _get_default_config(
-            cfg,
-            te.placeholder((batch, in_channel, ih, iw), dtype=data.dtype),
-            te.placeholder(
-                (num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype
-            ),
-            strides,
-            padding,
-            out_dtype,
-        )
-
-    ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1]
-    oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1]
-
-    # Pack data if raw 4-D data is provided.
-    if len(data.shape) == 4:
-        data, kernel = _pack_data(data, kernel, ic_bn, oc_bn)
-
-    out_channel = num_filter
-    out_height = simplify((ih - kernel_height + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((iw - kernel_width + pad_left + pad_right) // stride_w + 1)
-    oshape = (batch, out_channel // oc_bn, out_height, out_width, oc_bn)
-
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    ry = te.reduce_axis((0, kernel_height), name="ry")
-    rx = te.reduce_axis((0, kernel_width), name="rx")
-
-    block_h = cfg["block_oh"].val
-    block_w = cfg["block_ow"].val
-
-    c_h = out_height
-    c_w = out_width
-
-    if out_height % block_h != 0:
-        c_h = (out_height // block_h + 1) * block_h
-
-    if out_width % block_w != 0:
-        c_w = (out_width // block_w + 1) * block_w
-
-    cshape = (batch, out_channel // oc_bn, c_h, c_w, oc_bn)
-
-    pad_before = [0, 0, pad_top, pad_left, 0]
-    pad_after = [0, 0, pad_down + c_h - out_height, pad_right + c_w - out_width, 0]
-    DOPAD = (
-        pad_top != 0
-        or pad_left != 0
-        or pad_down + c_h - out_height != 0
-        or pad_right + c_w - out_width != 0
-    )
-    DOUNPACK = c_h - out_height != 0 or c_w - out_width != 0
-    if DOPAD:
-        temp = nn.pad(data, pad_before, pad_after, name="pad_temp")
-    else:
-        temp = data
-
-    conv = te.compute(
-        cshape,
-        lambda nn, ff, yy, xx, ff_v: te.sum(
-            temp[nn, rc // ic_bn, yy * stride_h + ry, xx * stride_w + rx, rc % ic_bn].astype(
-                out_dtype
-            )
-            * kernel[ff, rc // ic_bn, ry, rx, rc % ic_bn, ff_v].astype(out_dtype),
-            axis=[rc, ry, rx],
-        ),
-        tag="conv2d_NCHWc",
-        name="conv2d_NCHWc",
-    )
-
-    if DOUNPACK:
-        output = te.compute(
-            oshape,
-            lambda nn, ff, yy, xx, ff_v: conv[nn][ff][yy][xx][ff_v],
-            name="output_unpack",
-            tag="conv2d_NCHWc_unpack",
-        )
-    else:
-        output = conv
-
-    return output
-
-
-@autotvm.register_topi_schedule("conv2d_NCHWc.intel_graphics")
-def schedule_conv2d_NCHWc(cfg, outs):
-    """Schedule for conv2d_nchw for Intel Graphics
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_nchw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d_nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """inline all one-to-one-mapping operators except the last stage (output)"""
-        if "conv2d_NCHWc" in op.tag:
-            _schedule_cl_spatialpack_NCHWc(cfg, s, op)
-
-    traverse_inline(s, outs[0].op, _callback)
-
-    return s
-
-
-def _schedule_cl_spatialpack_NCHWc(cfg, s, op):
-    output = op.output(0)
-    if op.name == "conv2d_NCHWc":
-        temp = op.input_tensors[0]
-        kernel = op.input_tensors[1]
-        temp_W = s.cache_read(temp, "warp", [output])
-        conv_L = s.cache_write(output, "local")
-        if output.op in s.outputs:
-            conv = output
-        else:
-            s[output].compute_inline()
-            conv = s.outputs[0]
-        SCHEDULE_OUTPUT = False
-    else:  # conv2d_NCHWc_unpack
-        conv = op.input_tensors[0]
-        temp = s[conv].op.input_tensors[0]
-        kernel = s[conv].op.input_tensors[1]
-        temp_W = s.cache_read(temp, "warp", [conv])
-        conv_L = s.cache_write(conv, "local")
-        SCHEDULE_OUTPUT = True
-    kernel_L = s.cache_read(kernel, "local", [conv_L])
-
-    if temp.name == "pad_temp":
-        data = temp.op.input_tensors[0]
-        # TODO(@Laurawly): Do we need to schedule pad op here?
-    else:
-        data = temp
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        # only in autotuning, input data of conv2d_NCHWc will be 4-D.
-        # skip this part during tuning to make records accurate.
-        # this part will be folded during Relay fold_constant pass.
-        s[data].pragma(s[data].op.axis[0], "debug_skip_region")
-        s[kernel].pragma(s[kernel].op.axis[0], "debug_skip_region")
-    elif isinstance(kernel.op, tvm.te.ComputeOp) and kernel.name == "kernel_vec":
-        # data and kernel are not pre-computed, schedule layout transform here.
-        # TODO(@Laurawly): Add schedule for data and kernel pack
-        pass
-
-    OUTPUT_BLOCK_HEIGHT = cfg["block_oh"].val
-    OUTPUT_BLOCK_WIDTH = cfg["block_ow"].val
-
-    # schedule conv
-    z_factor = 1
-    y_factor = 1
-    x_factor = 16
-    thread_z = te.thread_axis((0, z_factor), "threadIdx.z")
-    thread_y = te.thread_axis((0, y_factor), "threadIdx.y")
-    thread_x = te.thread_axis((0, x_factor), "threadIdx.x")
-    _, co, oh, ow, vc = s[conv].op.axis
-    ooh, ioh = s[conv].split(oh, factor=OUTPUT_BLOCK_HEIGHT)
-    oow, iow = s[conv].split(ow, factor=OUTPUT_BLOCK_WIDTH)
-    s[conv].reorder(_, co, ooh, oow, vc, ioh, iow)
-    coo, coi = s[conv].split(co, nparts=1)
-    ooho, oohi = s[conv].split(ooh, factor=z_factor)
-    oowo, oowi = s[conv].split(oow, factor=y_factor)
-    vco, vci = s[conv].split(vc, factor=x_factor)
-    s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow)
-    s[conv].bind(oohi, thread_z)
-    s[conv].bind(oowi, thread_y)
-    s[conv].bind(vci, thread_x)
-    s[conv].bind(ooho, te.thread_axis("blockIdx.z"))
-    s[conv].bind(oowo, te.thread_axis("blockIdx.y"))
-    s[conv].bind(coi, te.thread_axis("blockIdx.x"))
-
-    # schedule conv_L
-    s[conv_L].compute_at(s[conv], vci)
-    i, oc, h, w, vc = s[conv_L].op.axis
-    rc, ry, rx = s[conv_L].op.reduce_axis
-    s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
-    s[temp_W].compute_at(s[conv_L], rc)
-    if kernel.shape[3].value != 7:
-        s[conv_L].unroll(ry)
-        s[conv_L].unroll(rx)
-
-    # schedule temp
-    if temp.op.name == "pad_temp":
-        _, ci, h, w, vci = s[temp].op.axis
-        tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16)
-
-    # schedule temp_W
-    _, ci, h, w, vci = s[temp_W].op.axis
-    zo, zi = s[temp_W].split(vci, 1)
-    yo, yi = s[temp_W].split(h, 1)
-    xo, xi = s[temp_W].split(w, 16)
-    s[temp_W].reorder(zo, yo, xo, zi, yi, xi)
-    s[temp_W].bind(zi, thread_z)
-    s[temp_W].bind(yi, thread_y)
-    s[temp_W].bind(xi, thread_x)
-    s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0)
-
-    # schedule kernel_L
-    if OUTPUT_BLOCK_HEIGHT == 2 and OUTPUT_BLOCK_WIDTH == 14:
-        s[kernel_L].compute_at(s[conv_L], ry)
-    else:
-        s[kernel_L].compute_at(s[conv_L], rx)
-
-    # schedule output
-    if SCHEDULE_OUTPUT:
-        if output.op in s.outputs:
-            out = output
-        else:
-            s[output].compute_inline()
-            out = s.outputs[0]
-
-        _, co, h, w, vc = s[out].op.axis
-        tile_and_bind3d(s, out, w, h, vc, 4, 8, 8)
-
-
-def conv2d_nchw(data, kernel, stride, padding, dilation, out_dtype="float32"):
-    """Conv2D operator for Intel Graphics backend.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-    kernel : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
-    stride : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-    padding : int or a list/tuple of two ints
-        padding size, or [pad_height, pad_width]
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu"
-    assert data.dtype == kernel.dtype, "Do not support inputs with different data types now."
-
-    return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype)
-
-
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw for Intel Graphics
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_nchw
-        in the format of an array of tensors.
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d_nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """inline all one-to-one-mapping operators except the last stage (output)"""
-        if "conv2d" in op.tag:
-            _schedule_cl_spatialpack(s, op)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype="float16"):
-    batch, in_channel, in_height, in_width = [utils.get_const_int(x) for x in data.shape]
-    num_filter, channel, kernel_h, kernel_w = [utils.get_const_int(x) for x in kernel.shape]
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w))
-
-    if isinstance(stride, (tuple, list)):
-        stride_h, stride_w = stride
-    else:
-        stride_h, stride_w = stride, stride
-
-    out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
-    oshape = (batch, out_channel, out_height, out_width)
-
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    if stride_h == 2:
-        if num_filter + kernel_h == 515:
-            block_h = 4
-            block_w = 4
-        else:
-            block_h = 4
-            block_w = 5
-    elif kernel_h == 3:
-        if num_filter == 512:
-            block_h = 2
-            block_w = 7
-        else:
-            block_h = 2
-            block_w = 14
-    elif kernel_h == 7 and padding == 3 and stride == 1:
-        block_h = 3
-        block_w = 4
-    else:
-        block_h = 1
-        block_w = 16
-    attrs = {"block_h": block_h, "block_w": block_w}
-    c_h = out_height
-    c_w = out_width
-
-    if out_height % block_h != 0:
-        c_h = (out_height // block_h + 1) * block_h
-
-    if out_width % block_w != 0:
-        c_w = (out_width // block_w + 1) * block_w
-
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
-    temp = nn.pad(data, pad_before, pad_after, name="pad_temp")
-
-    nv = 16
-    if num_filter % nv != 0:
-        num_filter = (num_filter // nv + 1) * nv
-        out_channel = num_filter
-
-    cshape = (batch, out_channel // nv, c_h, c_w, nv)
-    kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv)
-
-    kernel_vec = te.compute(
-        kvshape, lambda co, ci, kh, kw, vc: kernel[co * nv + vc][ci][kh][kw], name="kernel_vec"
-    )
-
-    conv = te.compute(
-        cshape,
-        lambda nn, ff, yy, xx, vc: te.sum(
-            temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype)
-            * kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
-            axis=[rc, ry, rx],
-        ),
-        name="conv",
-        attrs=attrs,
-    )
-
-    output = te.compute(
-        oshape,
-        lambda nn, ff, yy, xx: conv[nn][ff // nv][yy][xx][ff % nv],
-        name="output_unpack",
-        tag="conv2d",
-    )
-
-    return output
-
-
-def _schedule_cl_spatialpack(s, op):
-    output = op.output(0)
-    _, _, out_height, out_width = [utils.get_const_int(x) for x in output.shape]
-
-    conv = op.input_tensors[0]
-    temp = s[conv].op.input_tensors[0]
-    kernel_vec = s[conv].op.input_tensors[1]
-    kernel = s[kernel_vec].op.input_tensors[0]
-    temp_W = s.cache_read(temp, "shared", [conv])
-    conv_L = s.cache_write(conv, "local")
-
-    kernel_L = s.cache_read(kernel_vec, "local", [conv_L])
-    _, in_channel, temp_h, temp_w = [utils.get_const_int(x) for x in temp.shape]
-
-    attrs = s[conv].op.attrs
-    OUTPUT_BLOCK_HEIGHT = attrs["block_h"]
-    OUTPUT_BLOCK_WIDTH = attrs["block_w"]
-
-    # schedule conv
-    y_factor = z_factor = 1
-    x_factor = 16
-    thread_z = te.thread_axis((0, z_factor), "threadIdx.z")
-    thread_y = te.thread_axis((0, y_factor), "threadIdx.y")
-    thread_x = te.thread_axis((0, x_factor), "threadIdx.x")
-    _, co, oh, ow, vc = s[conv].op.axis
-    ooh, ioh = s[conv].split(oh, factor=OUTPUT_BLOCK_HEIGHT)
-    oow, iow = s[conv].split(ow, factor=OUTPUT_BLOCK_WIDTH)
-    s[conv].reorder(_, co, ooh, oow, vc, ioh, iow)
-    coo, coi = s[conv].split(co, nparts=1)
-    ooho, oohi = s[conv].split(ooh, factor=z_factor)
-    oowo, oowi = s[conv].split(oow, factor=y_factor)
-    vco, vci = s[conv].split(vc, factor=x_factor)
-    s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow)
-    s[conv].bind(oohi, thread_z)
-    s[conv].bind(oowi, thread_y)
-    s[conv].bind(vci, thread_x)
-    s[conv].bind(ooho, te.thread_axis("blockIdx.z"))
-    s[conv].bind(oowo, te.thread_axis("blockIdx.y"))
-    s[conv].bind(coi, te.thread_axis("blockIdx.x"))
-
-    # schedule conv_L
-    s[conv_L].compute_at(s[conv], vci)
-    i, oc, h, w, vc = s[conv_L].op.axis
-    rc, ry, rx = s[conv_L].op.reduce_axis
-    s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
-    s[temp_W].compute_at(s[conv_L], rc)
-    if kernel.shape[3].value != 7:
-        s[conv_L].unroll(ry)
-        s[conv_L].unroll(rx)
-
-    # schedule temp
-    _, ci, h, w = s[temp].op.axis
-    tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16)
-
-    # schedule temp_W
-    _, ci, h, w = s[temp_W].op.axis
-    zo, zi = s[temp_W].split(ci, 1)
-    yo, yi = s[temp_W].split(h, 1)
-    xo, xi = s[temp_W].split(w, 16)
-    s[temp_W].reorder(zo, yo, xo, zi, yi, xi)
-    s[temp_W].bind(zi, thread_z)
-    s[temp_W].bind(yi, thread_y)
-    s[temp_W].bind(xi, thread_x)
-    s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0)
-
-    s[kernel_vec].compute_inline()
-
-    # schedule kernel_L
-    if OUTPUT_BLOCK_HEIGHT == 2 and OUTPUT_BLOCK_WIDTH == 14:
-        s[kernel_L].compute_at(s[conv_L], ry)
-    else:
-        s[kernel_L].compute_at(s[conv_L], rx)
-
-    # schedule output
-    if output.op in s.outputs:
-        out = output
-    else:
-        s[output].compute_inline()
-        out = s.outputs[0]
-
-    _, co, h, w = s[out].op.axis
-    tile_and_bind3d(s, out, w, h, co, 4, 8, 8)
diff --git a/python/tvm/topi/intel_graphics/conv2d_alter_op.py b/python/tvm/topi/intel_graphics/conv2d_alter_op.py
deleted file mode 100644
index 3dc587e8710e..000000000000
--- a/python/tvm/topi/intel_graphics/conv2d_alter_op.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D alter op and legalize functions for x86"""
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-
-from ..utils import get_const_tuple
-from ..nn import conv2d_alter_layout, conv2d_infer_layout
-from .conv2d import _get_default_config
-
-
-@conv2d_alter_layout.register(["intel_graphics"])
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
-        cfg = dispatch_ctx.query(target, None)
-        workload = cfg.workload
-    else:
-        _, outs = relay.backend.te_compiler.select_implementation(
-            relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-        )
-        workload = autotvm.task.get_workload(outs)
-        if workload is None:
-            # The best implementation is not an AutoTVM template,
-            # we then assume it's not necessary to alter this op.
-            return None
-        cfg = dispatch_ctx.query(target, workload)
-
-    topi_tmpl = workload[0]
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    padding = attrs.get_int_tuple("padding")
-    strides = attrs.get_int_tuple("strides")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data_tensor, kernel_tensor = tinfos
-    data_dtype = data_tensor.dtype
-    kernel_dtype = kernel_tensor.dtype
-    out_dtype = out_type.dtype
-
-    if topi_tmpl == "conv2d_NCHWc.intel_graphics":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        if cfg.is_fallback:
-            _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False)
-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
-        ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1]
-        oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1]
-
-        # update new attrs
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-        new_attrs["kernel_layout"] = f"OIHW{ic_bn}i{oc_bn}o"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        # Store altered operator's config
-        new_data = te.placeholder(
-            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-        )
-        new_kernel = te.placeholder(
-            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn), dtype=kernel_dtype
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [
-                new_data,
-                new_kernel,
-                strides,
-                padding,
-                dilation,
-                new_attrs["data_layout"],
-                new_attrs["out_layout"],
-                out_dtype,
-            ],
-            "conv2d_NCHWc.intel_graphics",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
-
-    return None
-
-
-@conv2d_infer_layout.register("intel_graphics")
-def _conv2d_infer_layout(workload, cfg):
-    _, data, kernel, strides, padding, dilation, layout, dtype = workload
-    batch_size, in_channel, in_height, in_width = data[1]
-    out_channel, _, k_height, k_width = kernel[1]
-    out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1
-    out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1
-    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic)
-    in_layout = f"NCHW{tile_ic}c"
-    out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc)
-    out_layout = f"NCHW{tile_oc}c"
-    return ((in_shape, in_layout),), ((out_shape, out_layout),)
diff --git a/python/tvm/topi/intel_graphics/depthwise_conv2d.py b/python/tvm/topi/intel_graphics/depthwise_conv2d.py
deleted file mode 100644
index 02af465248a6..000000000000
--- a/python/tvm/topi/intel_graphics/depthwise_conv2d.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Schedule for depthwise_conv2d with auto fusion"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from ..utils import traverse_inline
-from .. import nn
-from ..nn.depthwise_conv2d import depthwise_conv2d_infer_layout
-
-# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-@autotvm.register_topi_compute("depthwise_conv2d_nchw.intel_graphics")
-def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype):
-    return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchw.intel_graphics")
-def schedule_depthwise_conv2d_nchw(cfg, outs):
-    """Schedule for depthwise_conv2d nchw forward.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "depthwise_conv2d_nchw":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-
-            ##### space definition begin #####
-            n, f, y, x = s[conv].op.axis
-            cfg.define_split("tile_f", f, num_outputs=4)
-            cfg.define_split("tile_y", y, num_outputs=4)
-            cfg.define_split("tile_x", x, num_outputs=4)
-            cfg.define_knob("auto_unroll_max_step", [0, 256, 1500])
-
-            target = tvm.target.Target.current()
-            if target.kind.name in ["nvptx", "rocm"]:
-                cfg.define_knob("unroll_explicit", [1])
-            else:
-                cfg.define_knob("unroll_explicit", [0, 1])
-
-            # fallback support
-            if cfg.is_fallback:
-                ref_log = autotvm.tophub.load_reference_log(
-                    target.kind.name, target.model, "depthwise_conv2d_nchw.intel_graphics"
-                )
-                cfg.fallback_with_reference_log(ref_log)
-                cfg["unroll_explicit"].val = 0
-            ##### space definition end #####
-
-            s[pad_data].compute_inline()
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            if conv.op in s.outputs:
-                output = conv
-                OL = s.cache_write(conv, "local")
-            else:
-                output = s.outputs[0].output(0)
-                s[conv].set_scope("local")
-                OL = conv
-
-            # create cache stage
-            AA = s.cache_read(pad_data, "shared", [OL])
-            WW = s.cache_read(kernel, "shared", [OL])
-            AL = s.cache_read(AA, "local", [OL])
-            WL = s.cache_read(WW, "local", [OL])
-
-            # tile and bind spatial axes
-            n, f, y, x = s[output].op.axis
-            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            kernel_scope, n = s[output].split(n, nparts=1)
-            bf = s[output].fuse(n, bf)
-            s[output].bind(bf, te.thread_axis("blockIdx.z"))
-            s[output].bind(by, te.thread_axis("blockIdx.y"))
-            s[output].bind(bx, te.thread_axis("blockIdx.x"))
-            s[output].bind(vf, te.thread_axis("vthread"))
-            s[output].bind(vy, te.thread_axis("vthread"))
-            s[output].bind(vx, te.thread_axis("vthread"))
-            s[output].bind(tf, te.thread_axis("threadIdx.z"))
-            s[output].bind(ty, te.thread_axis("threadIdx.y"))
-            s[output].bind(tx, te.thread_axis("threadIdx.x"))
-            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-            s[OL].compute_at(s[output], tx)
-
-            # cooperative fetching
-            s[AA].compute_at(s[output], bx)
-            s[WW].compute_at(s[output], bx)
-            s[AL].compute_at(s[output], tx)
-            s[WL].compute_at(s[output], tx)
-
-            for load in [AA, WW]:
-                fused = s[load].fuse(*list(s[load].op.axis))
-                fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
-                fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
-                fused, tz = s[load].split(fused, cfg["tile_f"].size[2])
-                s[load].bind(tz, te.thread_axis("threadIdx.z"))
-                s[load].bind(ty, te.thread_axis("threadIdx.y"))
-                s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-            s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-            s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@depthwise_conv2d_infer_layout.register("intel_graphics")
-def _depthwise_conv2d_infer_layout(workload, _):
-    """Infer input/output shapes and layouts from a workload and cfg.
-
-    Parameters
-    ----------
-    workload : tuple
-        conv2d workload
-
-    cfg : tuple
-        tvm.autotvm config
-
-    Returns
-    -------
-    Output : [tuple of tuple and str, tuple of tuple and str]
-        Input shapes and layouts, and output shapes and layouts
-    """
-    _, data, kernel, strides, padding, _, _ = workload
-    batch_size, in_channel, in_height, in_width = data[1]
-    filter_channel, channel_multiplier, k_height, k_width = kernel[1]
-    out_channel = filter_channel * channel_multiplier
-    out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1
-    out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1
-    in_shape = (batch_size, in_channel, in_height, in_width)
-    out_shape = (batch_size, out_channel, out_height, out_width)
-    in_layout = out_layout = "NCHW"
-    return ((in_shape, in_layout),), ((out_shape, out_layout),)
diff --git a/python/tvm/topi/mali/__init__.py b/python/tvm/topi/mali/__init__.py
deleted file mode 100644
index 36a464b42e1d..000000000000
--- a/python/tvm/topi/mali/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""ARM Mali GPU specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-from .conv2d import *
-from .depthwise_conv2d import *
-from .dense import *
diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py
deleted file mode 100644
index ccd3090a9838..000000000000
--- a/python/tvm/topi/mali/conv2d.py
+++ /dev/null
@@ -1,668 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
-"""conv2d schedule on ARM Mali GPU"""
-import logging
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-from tvm.autotvm.task.space import get_factors
-
-from ..utils import traverse_inline, get_const_int, get_const_tuple
-from .. import nn
-from ..nn.winograd_util import winograd_transform_matrices
-from ..nn.conv2d import conv2d_winograd_nhwc, _conv2d_winograd_nhwc_impl
-
-# reuse some compute declarations from ARM CPU
-from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw
-from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nhwc
-
-logger = logging.getLogger("topi")
-
-
-@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.mali")
-def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """TOPI compute callback for conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    data : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    kernel : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
-        pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height,
-        filter_width, num_filter_block]
-
-    strides : list of two ints
-        [stride_height, stride_width]
-
-    padding : list of two ints
-        [pad_height, pad_width]
-
-    dilation : list of two ints
-        [dilation_height, dilation_width]
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    return conv2d_spatial_pack_nchw(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=3
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.mali")
-def schedule_conv2d_nchw_spatial_pack(cfg, outs):
-    """TOPI schedule callback for conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The configuration of this template
-    outs: Array of Tensor
-        The computation graph description of convolution2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d
-    """
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        # schedule conv2d
-        if "spatial_conv2d_output" in op.tag:
-            _schedule_spatial_pack(cfg, s, op, layout="NCHW")
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_spatial_pack.mali")
-def conv2d_nhwc_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with NHWC layout"""
-    return conv2d_spatial_pack_nhwc(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=3
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_spatial_pack.mali")
-def schedule_conv2d_nhwc_spatial_pack(cfg, outs):
-    """Create schedule for conv2d_nhwc"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        # schedule conv2d
-        if "spatial_conv_output_NHWC" in op.tag:
-            _schedule_spatial_pack(cfg, s, op, layout="NHWC")
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_spatial_pack(cfg, s, op, layout):
-    """schedule the spatial packing for conv2d"""
-
-    assert layout in ("NCHW", "NHWC")
-
-    output = op.output(0)
-    conv = op.input_tensors[0]
-    data_vec = conv.op.input_tensors[0]
-    data_pad = data_vec.op.input_tensors[0]
-    s[data_pad].compute_inline()
-    kernel_vec = conv.op.input_tensors[1]
-    if kernel_vec.op.name == "kernel_vec":
-        kernel = kernel_vec.op.input_tensors[0]
-    else:
-        kernel = kernel_vec
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-    data = s[data_vec].op.input_tensors[0]
-
-    max_unroll = 16
-    vec_size = [1, 2, 4, 8, 16]
-    # get tunable parameters (they are defined in compute)
-    _, TC, VC = cfg["tile_co"].size
-    _, TH, VH = cfg["tile_oh"].size
-    _, TW, VW = cfg["tile_ow"].size
-
-    # schedule padding
-    if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-        data_pad = data
-        s[data_pad].compute_inline()
-
-    # schedule data packing
-    if layout == "NCHW":
-        if isinstance(data_vec.op, tvm.te.ComputeOp) and data_vec.op.name == "data_vec_undilated":
-            _, h, w, ci, _, _, vh, vw = s[data_vec].op.axis
-        else:
-            _, h, w, ci, vh, vw = s[data_vec].op.axis
-        z, y, x, unroll1, unroll2 = h, w, ci, vh, vw
-    else:
-        if isinstance(data_vec.op, tvm.te.ComputeOp) and data_vec.op.name == "data_vec_undilated":
-            _, oho, owo, _, _, ic, ohi, owi = s[data_vec].op.axis
-        else:
-            _, oho, owo, ohi, owi, ic = s[data_vec].op.axis
-        z, y, x, unroll1, unroll2 = oho, owo, ohi, ic, owi
-    tile_and_bind3d(s, data_vec, z, y, x, 1)
-    if unroll1.dom.extent.value < max_unroll:
-        s[data_vec].unroll(unroll1)
-    if unroll2.dom.extent.value < max_unroll:
-        s[data_vec].unroll(unroll2)
-
-    if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == "kernel_vec":
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            ax1, ax2, ax3, ax4, ax5 = s[kernel_vec].op.axis
-            fused = s[kernel_vec].fuse(ax1, ax2, ax3, ax4, ax5)
-            fused, vec = s[kernel_vec].split(fused, VC)
-            bb, tt = s[kernel_vec].split(fused, max_threads)
-            s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
-            s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
-            if VC in vec_size:
-                s[kernel_vec].vectorize(vec)
-
-    # schedule convolution
-    ic, kh, kw = s[conv].op.reduce_axis
-    if layout == "NCHW":
-        kh_dim, kw_dim = kernel_vec.shape[2], kernel_vec.shape[3]
-    else:
-        kh_dim, kw_dim = kernel_vec.shape[0], kernel_vec.shape[1]
-    cfg["ann_reduce"].apply(
-        s,
-        conv,
-        [kh, kw],
-        axis_lens=[get_const_int(kh_dim), get_const_int(kw_dim)],
-        max_unroll=max_unroll,
-    )
-
-    if layout == "NCHW":
-        n, c, h, w, vh, vw, vc = s[conv].op.axis
-        cfg["reorder_0"].apply(s, conv, [n, c, h, w, ic, kh, kw, vh, vw, vc])
-        tile_and_bind3d(s, conv, c, h, w, TC, TH, TW)
-        unroll_vec_axes = [vh, vw, vc]
-        axis_lens = [VH, VW, VC]
-    else:
-        n, oho, owo, oco, ohi, owi, oci = s[conv].op.axis
-        cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci])
-        tile_and_bind3d(s, conv, oho, owo, oco, TH, TW, TC)
-        unroll_vec_axes = [ohi, owi, oci]
-        axis_lens = [VH, VW, VC]
-
-    cfg["ann_spatial"].apply(
-        s, conv, unroll_vec_axes, axis_lens, max_unroll=max_unroll, vec_size=vec_size, cfg=cfg
-    )
-
-    # schedule output
-    if output.op not in s.outputs:  # has bias
-        s[output].compute_inline()
-        output = s.outputs[0]
-    if layout == "NCHW":
-        _, co, oh, ow = s[output].op.axis
-        tile_and_bind3d(s, output, co, oh, ow, TC, TH, TW)
-    else:
-        _, oh, ow, co = s[output].op.axis
-        tile_and_bind3d(s, output, oh, ow, co, TH, TW, TC)
-
-    return s
-
-
-##### WINOGRAD TEMPLATE #####
-def _pick_tile_size(data, kernel, layout="NCHW"):
-    if layout == "NCHW":
-        N, CI, H, W = get_const_tuple(data.shape)
-    else:
-        assert layout == "NHWC"
-        N, H, W, CI = get_const_tuple(data.shape)
-
-    if H % 4 == 0:
-        return 4
-    else:
-        return 2
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd.mali")
-def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    tile_size = _pick_tile_size(data, kernel)
-    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size)
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd.mali")
-def schedule_conv2d_nchw_winograd(cfg, outs):
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "winograd_conv2d_output" in op.tag:
-            _schedule_winograd(cfg, s, op)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size):
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    if len(kernel.shape) == 4:
-        if dilation_h != 1 or dilation_w != 1:
-            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
-        pre_computed = False
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-    else:
-        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
-        pre_computed = True
-        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
-        CO *= VC
-        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
-    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
-
-    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
-    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
-
-    H = (IH + pt + pb - 3) // HSTR + 1
-    W = (IW + pl + pr - 3) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-
-    ##### space definition begin #####
-    tile_bna_candidates = [1, 2, 4, 8, 16]
-    factors = get_factors(CO)
-    cfg.define_knob("tile_bna", [x for x in tile_bna_candidates if x in factors])
-    cfg.define_knob("tile_bnb", [1, 2, 4, 8, 16])
-    cfg.define_split("tile_t1", CI, num_outputs=2, max_factor=128)
-    cfg.define_split("tile_t2", CO, num_outputs=2, max_factor=128)
-    cfg.define_split("c_unroll", CI, num_outputs=2, max_factor=8)
-    cfg.define_knob("yt", [1, 2, 4, 8, 16, 32])
-    ##### space definition end #####
-
-    if cfg.is_fallback:
-        cfg["tile_bnb"].val = 4
-        cfg["tile_bna"].val = 4
-        while CO % cfg["tile_bna"].val != 0:
-            cfg["tile_bna"].val //= 2
-        cfg["yt"].val = 8
-        cfg.fallback_split("tile_t1", [-1, 128])
-        cfg.fallback_split("tile_t2", [-1, 128])
-        cfg.fallback_split("c_unroll", [-1, 8])
-
-    bna = cfg["tile_bna"].val
-    bnb = cfg["tile_bnb"].val
-
-    P_round = (P + bnb - 1) // bnb * bnb
-    assert CO % bna == 0 and P_round % bnb == 0
-
-    # pack input tile
-    input_tile = te.compute(
-        (CI, P_round // bnb, alpha, alpha, bnb),
-        lambda ci, b, eps, nu, bb: tvm.tir.if_then_else(
-            b * bnb + bb < P,
-            data_pad[(b * bnb + bb) // (nH * nW)][ci][(b * bnb + bb) // nW % nH * m + eps][
-                (b * bnb + bb) % nW * m + nu
-            ],
-            tvm.tir.const(0, data_pad.dtype),
-        ),
-        name="d",
-    )
-
-    if autotvm.GLOBAL_SCOPE.in_tuning:
-        kvshape = (alpha, alpha, CO // bna, CI, bna)
-        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
-    else:
-        # transform kernel
-        if pre_computed:
-            U = kernel
-        else:
-            r_kh = te.reduce_axis((0, KH), "r_kh")
-            r_kw = te.reduce_axis((0, KW), "r_kw")
-            U = te.compute(
-                (alpha, alpha, CO // bna, CI, bna),
-                lambda eps, nu, co, ci, vco: te.sum(
-                    kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
-                    axis=[r_kh, r_kw],
-                ),
-                name="U",
-            )
-
-    # transform image
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    V = te.compute(
-        (alpha, alpha, P_round // bnb, CI, bnb),
-        lambda eps, nu, p, ci, vp: te.sum(
-            input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="V",
-    )
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    # batch gemm
-    ci = te.reduce_axis((0, CI), name="c")
-    M = te.compute(
-        (alpha, alpha, CO, P_round),
-        lambda eps, nu, co, p: te.sum(
-            U[eps][nu][idxdiv(co, bna)][ci][idxmod(co, bna)]
-            * V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)],
-            axis=ci,
-        ),
-        name="M",
-    )
-
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    Y = te.compute(
-        (CO, P, m, m),
-        lambda co, p, vh, vw: te.sum(M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
-        name="Y",
-    )
-
-    # unpack output
-    output = te.compute(
-        (N, CO, H, W),
-        lambda n, co, h, w: Y[
-            co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), idxmod(h, m), idxmod(w, m)
-        ]
-        # The following hack term is used to make the padding in batch gemm ("M")
-        # effective, otherwise the padding will be eliminated by bound inference.
-        # Use `tvm.tir.Mul` instead of `*` to avoid issues in const folding.
-        + tvm.tir.Mul(tvm.tir.const(0, out_dtype), M[alpha - 1][alpha - 1][CO - 1][P_round - 1]),
-        name="output",
-        tag="winograd_conv2d_output",
-    )
-
-    # we have to manually assign effective GFLOP for winograd
-    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
-    return output
-
-
-def _schedule_winograd(cfg, s, op):
-    """schedule winograd fast convolution F(2x2, 3x3) for conv2d"""
-    # get ops and tensors
-    output = op.output(0)
-
-    Y = op.input_tensors[0]
-    M, A = s[Y].op.input_tensors
-    U, V = s[M].op.input_tensors
-    d, B = s[V].op.input_tensors
-    data_pad = s[d].op.input_tensors[0]
-
-    # padding
-    s[data_pad].compute_inline()
-
-    # transform kernel
-    if isinstance(U.op, tvm.te.ComputeOp):
-        kernel, G = s[U].op.input_tensors
-        s[G].compute_inline()
-        (eps, nu, co, ci, vco) = s[U].op.axis
-        if not autotvm.GLOBAL_SCOPE.in_tuning:
-            r_kh, r_kw = s[U].op.reduce_axis
-            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
-            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-            s[U].vectorize(vco)
-            tile_and_bind(s, U, co, ci, 1, 256)
-
-        # dilation
-        if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-            s[kernel].compute_inline()
-
-    # transform image
-    s[B].compute_inline()
-    VL = s.cache_write(V, "local")
-
-    eps, nu, p, ci, vp = s[V].op.axis
-    s[V].reorder(p, ci, eps, nu, vp)
-    for axis in [eps, nu]:
-        s[V].unroll(axis)
-    s[V].vectorize(vp)
-    fused = s[V].fuse(p, ci)
-
-    bb, tt = cfg["tile_t1"].apply(s, V, fused)
-    s[V].bind(bb, te.thread_axis("blockIdx.x"))
-    s[V].bind(tt, te.thread_axis("threadIdx.x"))
-
-    eps, nu, p, ci, vp = s[VL].op.axis
-    r_a, r_b = s[VL].op.reduce_axis
-    for axis in [eps, nu, r_a, r_b]:
-        s[VL].unroll(axis)
-    s[VL].vectorize(vp)
-    s[d].compute_at(s[V], tt)
-    s[VL].compute_at(s[V], tt)
-
-    # batch gemm
-    bna = cfg["tile_bna"].val
-    bnb = cfg["tile_bnb"].val
-
-    eps, nu, k, b = s[M].op.axis
-    alpha = eps.dom.extent
-    c = s[M].op.reduce_axis[0]
-    yo, xo, yi, xi = s[M].tile(k, b, bna, bnb)
-    c, c_unroll = cfg["c_unroll"].apply(s, M, c)
-    s[M].reorder(yo, xo, c, c_unroll, yi, xi)
-    s[M].unroll(c_unroll)
-    s[M].unroll(yi)
-    s[M].vectorize(xi)
-    z = s[M].fuse(eps, nu)
-    tile_and_bind3d(s, M, z, yo, xo, 1, cfg["yt"].val, 1)
-
-    # inverse transform
-    s[A].compute_inline()
-    k, b, vh, vw = s[Y].op.axis
-    r_a, r_b = s[Y].op.reduce_axis
-    for axis in [vh, vw, r_a, r_b]:
-        s[Y].unroll(axis)
-
-    # schedule output and fusion
-    if output.op not in s.outputs:
-        s[output].compute_inline()
-        output = s.outputs[0]
-
-    n, co, h, w = s[output].op.axis
-    m = alpha - 3 + 1
-    h, w, hi, wi = s[output].tile(h, w, m, m)
-    s[output].unroll(hi)
-    s[output].unroll(wi)
-    fused = s[output].fuse(n, co, h, w)
-    bb, tt = cfg["tile_t2"].apply(s, output, fused)
-    s[output].bind(bb, te.thread_axis("blockIdx.x"))
-    s[output].bind(tt, te.thread_axis("threadIdx.x"))
-
-    s[Y].compute_at(s[output], tt)
-
-
-##### REGISTER ALTER OP LAYOUT #####
-@nn.conv2d_alter_layout.register(["mali"])
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    strides = attrs.get_int_tuple("strides")
-    padding = attrs.get_int_tuple("padding")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data, kernel = tinfos
-    out_dtype = out_type.dtype
-
-    impl, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-    )
-    workload = autotvm.task.get_workload(outs)
-    if workload is None:
-        # The best implementation is not an AutoTVM template.
-        # It may be from the auto-scheduler
-        if impl.name.find("winograd") != -1:
-            if dilation != (1, 1):
-                logger.warning("Does not support weight pre-transform for dilated convolution.")
-                return None
-
-            assert data_layout == "NHWC" and kernel_layout == "HWIO"
-            N, H, W, CI = get_const_tuple(data.shape)
-            KH, KW, _, CO = get_const_tuple(kernel.shape)
-
-            # Pre-compute weight transformation in winograd
-            tile_size = _pick_tile_size(tinfos[0], tinfos[1], layout="NHWC")
-
-            # HWIO -> OIHW
-            kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
-            # alpha, alpha, CO, CI
-            weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-                kernel_transform, tile_size=tile_size
-            )
-            new_attrs["tile_size"] = tile_size
-            new_attrs["channels"] = CO
-            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                inputs[0], weight, **new_attrs
-            )
-
-        return None
-    cfg = dispatch_ctx.query(target, workload)
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        return None
-
-    topi_tmpl = workload[0]
-    idxd = tvm.tir.indexdiv
-
-    if topi_tmpl == "conv2d_nchw_spatial_pack.mali":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        VC = cfg["tile_co"].size[-1]
-
-        new_attrs["kernel_layout"] = f"OIHW{VC}o"
-
-        new_data = data
-        new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_spatial_pack.mali",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.conv2d(*inputs, **new_attrs)
-    elif topi_tmpl == "conv2d_nchw_winograd.mali":
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        N, CI, H, W = get_const_tuple(data.shape)
-        CO, _, KH, KW = get_const_tuple(kernel.shape)
-        tile_size = _pick_tile_size(data, kernel)
-        VC = cfg["tile_bna"].val
-
-        weight_expr = inputs[1]
-        weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform(
-            weight_expr, tile_size=tile_size
-        )
-        weight_expr = relay.reshape(
-            weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), VC, CI)
-        )
-        weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3])
-
-        new_attrs["tile_size"] = tile_size
-
-        new_data = data
-        new_kernel = te.placeholder(
-            (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC), kernel.dtype
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, out_dtype],
-            "conv2d_nchw_winograd.mali",
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-            inputs[0], weight_expr, **new_attrs
-        )
-    else:
-        return None
-
-
-@conv2d_winograd_nhwc.register(["mali"])
-def conv2d_winograd_nhwc_mali(
-    data,
-    weight,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    pre_computed=False,
-    auto_scheduler_rewritten_layout="",
-):
-    """Conv2D Winograd in NHWC layout.
-    This is a clean version to be used by the auto-scheduler for mali.
-    """
-    tile_size = _pick_tile_size(data, weight, layout="NHWC")
-    return _conv2d_winograd_nhwc_impl(
-        data,
-        weight,
-        strides,
-        padding,
-        dilation,
-        out_dtype,
-        tile_size,
-        pre_computed,
-        auto_scheduler_rewritten_layout,
-    )
-
-
-##### SCHECULE UTILITIES #####
-def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
-    """tile and bind to GPU threads"""
-    x_factor = x_factor or y_factor
-    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
-    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, te.thread_axis("threadIdx.x"))
-    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
-    return yo, xo, yi, xi
-
-
-def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-    """tile and bind 3d"""
-    y_factor = y_factor or z_factor
-    x_factor = x_factor or y_factor
-    zo, zi = s[tensor].split(z, z_factor)
-    yo, yi = s[tensor].split(y, y_factor)
-    xo, xi = s[tensor].split(x, x_factor)
-    s[tensor].bind(zo, te.thread_axis("blockIdx.z"))
-    s[tensor].bind(zi, te.thread_axis("threadIdx.z"))
-    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
-    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, te.thread_axis("threadIdx.x"))
-    s[tensor].reorder(zo, yo, xo, zi, yi, xi)
-    return zo, yo, xo, zi, yi, xi
diff --git a/python/tvm/topi/mali/dense.py b/python/tvm/topi/mali/dense.py
deleted file mode 100644
index a8ca66b09cd5..000000000000
--- a/python/tvm/topi/mali/dense.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable
-"""dense schedule on ARM Mali GPU"""
-from tvm import te
-from tvm import autotvm
-
-from .. import nn
-from ..utils import traverse_inline
-
-
-@autotvm.register_topi_compute("dense.mali")
-def dense(_, data, weight, bias=None, out_dtype=None):
-    """Dense operator on Mali"""
-    return nn.dense(data, weight, bias, out_dtype)
-
-
-@autotvm.register_topi_schedule("dense.mali")
-def schedule_dense(cfg, outs):
-    """Schedule for dense operator.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config entity for this template
-    outs: Array of Tensor
-        The computation graph description of dense
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for dense.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "dense":
-            vec_size = [1, 2, 4, 8, 16]
-            max_unroll = 32
-
-            dense_out = op.output(0)
-            output = outs[0]
-
-            y, x = s[output].op.axis
-            c = s[dense_out].op.reduce_axis[0]
-
-            ##### space definition begin #####
-            cfg.define_split("tile_y", y, num_outputs=3)
-            cfg.define_split("tile_x", x, num_outputs=3)
-            cfg.define_split("c_unroll", c, num_outputs=2, max_factor=64)
-
-            # fallback support
-            if cfg.is_fallback:
-                ref_log = autotvm.tophub.load_reference_log("mali", "rk3399", "dense.mali")
-                cfg.fallback_with_reference_log(ref_log)
-            ##### space definition end #####
-
-            if dense_out.op in s.outputs:
-                dense_out = s.cache_write(output, "local")
-
-            by, ty, yi = cfg["tile_y"].apply(s, output, y)
-            bx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-            s[output].bind(by, te.thread_axis("blockIdx.y"))
-            s[output].bind(bx, te.thread_axis("blockIdx.x"))
-            s[output].bind(ty, te.thread_axis("threadIdx.y"))
-            s[output].bind(tx, te.thread_axis("threadIdx.x"))
-
-            if cfg["tile_y"].size[-1] < max_unroll:
-                s[output].unroll(yi)
-            if cfg["tile_x"].size[-1] in vec_size:
-                s[output].vectorize(xi)
-            s[dense_out].compute_at(s[output], tx)
-
-            k = s[dense_out].op.reduce_axis[0]
-            y, x = s[dense_out].op.axis
-            k, k_unroll = cfg["c_unroll"].apply(s, dense_out, k)
-            s[dense_out].reorder(k, k_unroll, y, x)
-            s[dense_out].unroll(k_unroll)
-            if cfg["tile_y"].size[-1] < max_unroll:
-                s[dense_out].unroll(y)
-            if cfg["tile_x"].size[-1] in vec_size:
-                s[dense_out].vectorize(x)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-    """fuse all the axis and bind to GPU threads"""
-    # TODO(@comaniac): figure out where this function is used.
-    axis = axis or s[tensor].op.axis
-    fused = s[tensor].fuse(*axis)
-    bx, tx = s[tensor].split(fused, num_thread)
-    s[tensor].bind(bx, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(tx, te.thread_axis("threadIdx.x"))
-    return bx, tx
diff --git a/python/tvm/topi/mali/depthwise_conv2d.py b/python/tvm/topi/mali/depthwise_conv2d.py
deleted file mode 100644
index 98109ab4535f..000000000000
--- a/python/tvm/topi/mali/depthwise_conv2d.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument
-"""depthwise_conv2d schedule on ARM Mali GPU"""
-
-import tvm
-from tvm import te
-from tvm import autotvm
-
-from .. import nn
-from ..utils import traverse_inline
-
-# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-@autotvm.register_topi_compute("depthwise_conv2d_nchw.mali")
-def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
-
-
-# register customized schedule for Mali.
-@autotvm.register_topi_schedule("depthwise_conv2d_nchw.mali")
-def schedule_depthwise_conv2d_nchw(cfg, outs):
-    """Schedule depthwise conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The configuration of this template
-    outs: Array of Tensor
-        The computation graph description of depthwise convolution2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """traverse to find op to schedule"""
-        # schedule depthwise_conv2d
-        if op.tag == "depthwise_conv2d_nchw":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-            _schedule(cfg, s, pad_data, kernel, conv, "NCHW")
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-# register original implementation of depthwise_conv2d_nhwc since we don't need to change this part
-@autotvm.register_topi_compute("depthwise_conv2d_nhwc.mali")
-def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    return nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
-
-
-# register customized schedule for Mali.
-@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.mali")
-def schedule_depthwise_conv2d_nhwc(cfg, outs):
-    """Schedule depthwise conv2d
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The configuration of this template
-    outs: Array of Tensor
-        The computation graph description of depthwise convolution2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for depthwise_conv2d nchw.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """traverse to find op to schedule"""
-        # schedule depthwise_conv2d
-        if op.tag == "depthwise_conv2d_nhwc":
-            pad_data = op.input_tensors[0]
-            kernel = op.input_tensors[1]
-            conv = op.output(0)
-            _schedule(cfg, s, pad_data, kernel, conv, "NHWC")
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule(cfg, s, pad_data, kernel, conv, layout):
-    """schedule depthwise_conv2d"""
-    assert layout in ("NCHW", "NHWC")
-
-    max_unroll = 16
-    vec_size = [1, 2, 4, 8, 16]
-
-    ##### space definition begin #####
-    if layout == "NCHW":
-        n, c, h, w = s[conv].op.axis
-    else:
-        n, h, w, c = s[conv].op.axis
-
-    bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3)
-    bh, th, hi = cfg.define_split("tile_y", h, num_outputs=3)
-    bw, tw, wi = cfg.define_split("tile_x", w, num_outputs=3)
-    cfg.define_annotate("ann_spatial", [ci, hi, wi], policy="try_unroll_vec")
-
-    # fallback support
-    if cfg.is_fallback:
-        if layout == "NCHW":
-            ref_log = autotvm.tophub.load_reference_log(
-                "mali", "rk3399", "depthwise_conv2d_nchw.mali"
-            )
-            cfg.fallback_with_reference_log(ref_log)
-        else:
-            cfg.fallback_split("tile_c", [-1, 4, 2])
-            cfg.fallback_split("tile_y", [-1, 4, 2])
-            cfg.fallback_split("tile_x", [-1, 4, 2])
-    ###### space definition end ######
-
-    # schedule padding
-    if layout == "NCHW":
-        n, c, h, w = s[pad_data].op.axis
-        z, y, x = c, h, w
-        z_factor, y_factor, x_factor = cfg["tile_c"].size[1], 1, 1
-    else:
-        n, h, w, c = s[pad_data].op.axis
-        z, y, x = h, w, c
-        z_factor, y_factor, x_factor = 1, 1, cfg["tile_c"].size[1]
-    tile_and_bind3d(s, pad_data, z, y, x, z_factor, y_factor, x_factor)
-
-    # schedule dilation
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    # schedule conv
-    if conv.op not in s.outputs:
-        s[conv].set_scope("local")
-        OL = conv
-        output = s.outputs[0].output(0)
-    else:
-        OL = s.cache_write(conv, "local")
-        output = conv
-
-    if layout == "NCHW":
-        n, c, h, w = s[output].op.axis
-    else:
-        n, h, w, c = s[output].op.axis
-
-    bc, tc, ci = cfg["tile_c"].apply(s, output, c)
-    bh, th, hi = cfg["tile_y"].apply(s, output, h)
-    bw, tw, wi = cfg["tile_x"].apply(s, output, w)
-
-    if layout == "NCHW":
-        bz, tz, by, ty, bx, tx = bc, tc, bh, th, bw, tw
-    else:
-        bz, tz, by, ty, bx, tx = bh, th, bw, tw, bc, tc
-
-    bz = s[output].fuse(n, bz)
-    s[output].bind(bz, te.thread_axis("blockIdx.z"))
-    s[output].bind(tz, te.thread_axis("threadIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-
-    di, dj = s[OL].op.reduce_axis
-    s[OL].unroll(di)
-    s[OL].unroll(dj)
-
-    s[OL].compute_at(s[output], tx)
-
-    if layout == "NCHW":
-        n, ci, hi, wi = s[OL].op.axis
-    else:
-        n, hi, wi, ci = s[OL].op.axis
-
-    cfg["ann_spatial"].apply(
-        s,
-        OL,
-        [ci, hi, wi],
-        axis_lens=[cfg["tile_c"].size[2], cfg["tile_y"].size[2], cfg["tile_x"].size[2]],
-        max_unroll=max_unroll,
-        vec_size=vec_size,
-        cfg=cfg,
-    )
-
-
-def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-    """tile and bind 3d"""
-    y_factor = y_factor or z_factor
-    x_factor = x_factor or y_factor
-    zo, zi = s[tensor].split(z, z_factor)
-    yo, yi = s[tensor].split(y, y_factor)
-    xo, xi = s[tensor].split(x, x_factor)
-    s[tensor].bind(zo, te.thread_axis("blockIdx.z"))
-    s[tensor].bind(zi, te.thread_axis("threadIdx.z"))
-    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
-    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, te.thread_axis("threadIdx.x"))
-    return zo, zi, yo, yi, xo, xi
diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py
index 2c549cc5b9cf..6a4e38754492 100644
--- a/python/tvm/topi/nn/__init__.py
+++ b/python/tvm/topi/nn/__init__.py
@@ -47,7 +47,6 @@
 from .bitserial_dense import *
 from .batch_matmul import *
 from .batch_norm import *
-from .sparse import *
 from .pad import *
 from .fifo_buffer import *
 from .depth_to_space import *
diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 2156fe11ed43..a99659afac3d 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -19,7 +19,7 @@
 import logging
 
 import tvm
-from tvm import auto_scheduler, te
+from tvm import te
 
 from ..utils import get_const_tuple
 
@@ -79,23 +79,15 @@ def batch_matmul(
     else:
         XB, XI, XK = get_const_tuple(tensor_a.shape)
     if auto_scheduler_rewritten_layout:
-        # Infer shape for the rewritten layout
-        YB, YK, YJ = auto_scheduler.get_shape_from_rewritten_layout(
-            auto_scheduler_rewritten_layout, ["b", "k", "j"]
-        )
-        auto_scheduler.remove_index_check(tensor_b)
-    elif meta_schedule_original_shape:
-        auto_scheduler.rewrite_tensor_shape(tensor_b, meta_schedule_original_shape)
-        if transpose_b:
-            YB, YJ, YK = get_const_tuple(tensor_b.shape)
-        else:
-            YB, YK, YJ = get_const_tuple(tensor_b.shape)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
+    if meta_schedule_original_shape:
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
+
+    assert len(tensor_b.shape) == 3, "tensor_b only support 3-dim"
+    if transpose_b:
+        YB, YJ, YK = get_const_tuple(tensor_b.shape)
     else:
-        assert len(tensor_b.shape) == 3, "tensor_b only support 3-dim"
-        if transpose_b:
-            YB, YJ, YK = get_const_tuple(tensor_b.shape)
-        else:
-            YB, YK, YJ = get_const_tuple(tensor_b.shape)
+        YB, YK, YJ = get_const_tuple(tensor_b.shape)
 
     assert XK == YK or isinstance(YK, tvm.tir.expr.Var), "shapes of x and y are inconsistent"
     k = te.reduce_axis((0, XK), name="k")
@@ -153,29 +145,6 @@ def batch_matmul(
         attrs={"layout_free_placeholders": [tensor_b]},
     )
     if auto_scheduler_rewritten_layout:
-        output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     return output
-
-
-@tvm.target.generic_func
-def batch_matmul_legalize(attrs, inputs, types):
-    """Legalizes batch_matmul op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current batch_matmul
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # not to change by default
-    # pylint: disable=unused-argument
-    return None
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 205730ff22d6..5ee625577e38 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -25,7 +25,7 @@
 
 import numpy as np
 import tvm
-from tvm import auto_scheduler, te
+from tvm import te
 
 from ..utils import get_const_int, get_const_tuple, simplify, tag
 from .pad import pad
@@ -890,7 +890,7 @@ def conv(
     kernel_permutation_from = np.argsort(kernel_permutation_to)
 
     if meta_schedule_original_shape:
-        auto_scheduler.rewrite_tensor_shape(filt, meta_schedule_original_shape)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
     batch, in_channel, *dimensions = np.array(get_const_tuple(inp.shape))[
         data_permutation_to
     ].tolist()
@@ -901,11 +901,7 @@ def conv(
     # Autoscheduler may have messed with the input layout, so we extract the
     # dimensions that it gives us
     if auto_scheduler_rewritten_layout:
-        num_filter, _, *kernel_dimensions = auto_scheduler.get_shape_from_rewritten_layout(
-            auto_scheduler_rewritten_layout,
-            ["ff", "rc"] + [f"r{i}" for i in ["y", "x", "z"][: len(kernel_dimensions)]],
-        )
-        auto_scheduler.remove_index_check(filt)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     assert in_channel % groups == 0, "input channels must divide group size"
     assert num_filter % groups == 0, "output channels must divide group size"
@@ -967,7 +963,7 @@ def compute(*args):
     # if we used autoscheduler's changed layout we need to rewrite the ordering
     # of the output dimensions
     if auto_scheduler_rewritten_layout:
-        out = auto_scheduler.rewrite_compute_body(out, auto_scheduler_rewritten_layout)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
     return out
 
 
@@ -1207,23 +1203,13 @@ def _conv2d_winograd_nhwc_impl(
     else:
         dilation_h, dilation_w = dilation
     if meta_schedule_original_shape:
-        auto_scheduler.rewrite_tensor_shape(weight, meta_schedule_original_shape)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
     if not pre_computed:
         KH, KW, CI, CO = get_const_tuple(weight.shape)
     else:
-        if auto_scheduler_rewritten_layout:
-            H_CAT, W_CAT, CO, CI = get_const_tuple(
-                auto_scheduler.get_shape_from_rewritten_layout(
-                    auto_scheduler_rewritten_layout, ["eps", "nu", "co", "ci"]
-                )
-            )
-            auto_scheduler.remove_index_check(weight)
-        else:
-            H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
-
-        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
     HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
@@ -1305,7 +1291,7 @@ def _conv2d_winograd_nhwc_impl(
     )
 
     if auto_scheduler_rewritten_layout:
-        bgemm = auto_scheduler.rewrite_compute_body(bgemm, auto_scheduler_rewritten_layout)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     # inverse transform
 
@@ -1358,7 +1344,7 @@ def _conv2d_winograd_nchw_impl(
     else:
         dilation_h, dilation_w = dilation
     if meta_schedule_original_shape:
-        auto_scheduler.rewrite_tensor_shape(weight, meta_schedule_original_shape)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
     HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py
index 5638d3d77fd2..eb86a167c27c 100644
--- a/python/tvm/topi/nn/conv2d_transpose.py
+++ b/python/tvm/topi/nn/conv2d_transpose.py
@@ -18,8 +18,7 @@
 """Transposed 2D convolution operators (sometimes called Deconvolution)."""
 import collections
 
-import tvm
-from tvm import relay, te
+from tvm import te
 
 from ..utils import simplify
 from .dilate import dilate
@@ -243,91 +242,3 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output
         ),
         tag="group_conv2d_transpose_nchw",
     )
-
-
-def layout_transform(tensor: "relay.Expr", current_layout: str, desired_layout: str):
-    """Transform a tensor with the current layout to the desired layout.
-
-    E.g. layout_transform(t, "NCHW", "CNHW") --> relay.transpose(t, [1, 0, 2, 3])
-
-    Parameters
-    ----------
-    tensor: relay.Expr
-        The Tensor to transpose
-
-    current_layout: str
-        The current layout e.g. NCHW or OIHW
-
-    desired_layout: str
-        The desired layout, must be compatible with current_layout
-
-    Returns
-    -------
-    The layout_transformed tensor.
-    """
-    if sorted(current_layout) != sorted(desired_layout):
-        raise ValueError(f"Incompatible layouts: {current_layout} vs {desired_layout}")
-
-    if current_layout == desired_layout:
-        return tensor
-
-    current_layout_map = {c: i for i, c in enumerate(current_layout)}
-    desired_layout_map = {c: i for i, c in enumerate(desired_layout)}
-
-    axes = [None] * len(current_layout)
-    for c, i in desired_layout_map.items():
-        axes[i] = current_layout_map[c]
-    return relay.transpose(tensor, axes=axes)
-
-
-@tvm.target.generic_func
-def conv2d_transpose_legalize(attrs, inputs, types):
-    """Legalizes Transposed 2D convolution op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current Transposed 2D convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    data, kernel = inputs
-    kernel_layout = attrs["kernel_layout"]
-
-    target = tvm.target.Target.current(allow_none=True)
-    if target and "cudnn" in target.libs:
-        # cuDNN backend can directly operate on NHWC layout.
-        return None
-
-    if attrs["data_layout"] == "NHWC":
-        kernel = layout_transform(kernel, kernel_layout, "IOHW")
-
-        # Set new attrs for conv2d_transpose.
-        new_attrs = {k: attrs[k] for k in attrs.keys()}
-        new_attrs["data_layout"] = "NCHW"
-        # layout of kernel should be IOHW, but kernel_layout will be swapped - OIHW
-        new_attrs["kernel_layout"] = "IOHW"
-
-        # Convert data to NCHW.
-        data = relay.transpose(data, axes=(0, 3, 1, 2))
-        deconv = relay.nn.conv2d_transpose(data, kernel, **new_attrs)
-        # Convert back to original NHWC layout.
-        out = relay.transpose(deconv, axes=(0, 2, 3, 1))
-        return out
-
-    if attrs["data_layout"] == "NCHW":
-        kernel = layout_transform(kernel, kernel_layout, "IOHW")
-        new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-        # layout of kernel should be IOHW, but kernel_layout will be swapped - OIHW
-        new_attrs["kernel_layout"] = "IOHW"
-        return relay.nn.conv2d_transpose(data, kernel, **new_attrs)
-
-    return None
diff --git a/python/tvm/topi/nn/conv3d_transpose.py b/python/tvm/topi/nn/conv3d_transpose.py
index 275ba8c6317f..993fdbff27ae 100644
--- a/python/tvm/topi/nn/conv3d_transpose.py
+++ b/python/tvm/topi/nn/conv3d_transpose.py
@@ -16,7 +16,6 @@
 # under the License.
 # pylint: disable=invalid-name, unused-variable, unused-argument
 """Transposed 3D convolution operators (sometimes called Deconvolution)."""
-import tvm
 from tvm import te
 
 from ..utils import simplify
@@ -198,63 +197,3 @@ def group_conv3d_transpose_ncdhw(data, kernel, strides, padding, out_dtype, outp
         ),
         tag="group_conv3d_transpose_ncdhw",
     )
-
-
-@tvm.target.generic_func
-def conv3d_transpose_legalize(attrs, inputs, types):
-    """Legalizes Transposed 3D convolution op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current Transposed 3D convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    from tvm import relay  # pylint: disable=import-outside-toplevel
-
-    if attrs["data_layout"] == "NDHWC":
-        data, kernel = inputs
-        kernel_layout = attrs["kernel_layout"]
-        # Convert Kernel layout to IODHW
-        if kernel_layout == "DHWIO":
-            # input kernel layout is swapped to DHWOI
-            # output kernel layout will be IODHW
-            kernel = relay.transpose(kernel, axes=(3, 4, 0, 1, 2))
-        elif kernel_layout == "DHWOI":
-            # input kernel layout is swapped to DHWIO
-            # output kernel layout will be IODHW
-            kernel = relay.transpose(kernel, axes=(4, 3, 0, 1, 2))
-        elif kernel_layout == "OIDHW":
-            # input kernel layout is swapped to OIDHW
-            # output kernel layout will be IODHW
-            kernel = relay.transpose(kernel, axes=(1, 0, 2, 3, 4))
-        elif kernel_layout == "IODHW":
-            # input kernel layout is swapped to IODHW
-            # output kernel layout will be IODHW
-            pass
-        else:
-            # Skip legalize. Let relay.nn.conv2d_transpose to handle the case
-            return None
-
-        # Set new attrs for conv3d_transpose.
-        new_attrs = {k: attrs[k] for k in attrs.keys()}
-        new_attrs["data_layout"] = "NCDHW"
-        # layout of kernel should be IODHW, but kernel_layout should be swapped - OIDHW
-        new_attrs["kernel_layout"] = "IODHW"
-
-        # Convert data to NCDHW.
-        data = relay.transpose(data, axes=(0, 4, 1, 2, 3))
-        deconv = relay.nn.conv3d_transpose(data, kernel, **new_attrs)
-        # Convert back to original NDHWC layout.
-        out = relay.transpose(deconv, axes=(0, 2, 3, 4, 1))
-        return out
-
-    return None
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index 76315670641e..5df1674627c3 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -17,7 +17,7 @@
 # pylint: disable=invalid-name,unused-argument
 """TVM operator fully connected compute."""
 import tvm
-from tvm import auto_scheduler, te
+from tvm import te
 
 from .. import tag, add
 
@@ -83,18 +83,11 @@ def matmul(
 
     if auto_scheduler_rewritten_layout:
         # Infer shape for the rewritten layout
-        assert len(tensor_b).shape == 2, "only support 2-dim matmul when using auto-scheduler"
-        out_dim, reduce_dim_b = auto_scheduler.get_shape_from_rewritten_layout(
-            auto_scheduler_rewritten_layout, ["j", "k"]
-        )
-        auto_scheduler.remove_index_check(tensor_b)
-    elif meta_schedule_original_shape:
-        auto_scheduler.rewrite_tensor_shape(tensor_b, meta_schedule_original_shape)
-        if transpose_b:
-            out_dim, reduce_dim_b = tensor_b.shape[-2:]
-        else:
-            reduce_dim_b, out_dim = tensor_b.shape[-2:]
-    elif transpose_b:
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
+    if meta_schedule_original_shape:
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
+
+    if transpose_b:
         out_dim, reduce_dim_b = tensor_b.shape[-2:]
     else:
         reduce_dim_b, out_dim = tensor_b.shape[-2:]
@@ -165,7 +158,7 @@ def compute(*indices):
         mat = add(mat, bias.astype(out_dtype))
 
     if auto_scheduler_rewritten_layout:
-        mat = auto_scheduler.rewrite_compute_body(mat, auto_scheduler_rewritten_layout)
+        raise RuntimeError("LEGACY-FLOW triggered, to be removed")
 
     return mat
 
@@ -310,45 +303,3 @@ def dense_pack(data, weight, bias=None, out_dtype=None):
     if bias is not None:
         C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
     return C
-
-
-@tvm.target.generic_func
-def dense_alter_layout(attrs, inputs, tinfos, out_type):
-    """Change dense layout.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : tvm.relay.Expr
-        Grouped input symbols
-    tinfos : list
-        Input shape and dtype
-    out_type: type
-        The output type
-
-    Note
-    ----
-    Unlike other TOPI functions, this function operates on both graph level and operator level.
-    """
-    # not to change by default
-    return None
-
-
-@tvm.target.generic_func
-def batch_matmul_legalize(attrs, inputs, types):
-    """Legalizes batch_matmul op.
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current batch_matmul
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    return None
diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py
deleted file mode 100644
index d3475653715d..000000000000
--- a/python/tvm/topi/nn/sparse.py
+++ /dev/null
@@ -1,816 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Sparse operators"""
-from __future__ import absolute_import
-import tvm
-from tvm import te, auto_scheduler
-
-from ..utils import get_const_tuple
-
-
-def sparse_dense_sp_rhs(data, weight_data, weight_indices, weight_indptr):
-    """
-    Computes sparse-dense matrix multiplication of `data` and
-    `(weight_data, weight_indices, weight_indptr).T`
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        2-D with shape [M, K]
-
-    weight_data : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        3-D with shape [num_blocks, bs_r, bs_c] (BSR)
-
-    weight_indices : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        1-D with shape [num_blocks] (BSR)
-
-    weight_indptr : tvm.te.Tensor
-        1-D with shape [N + 1] (CSR) or
-        1-D with shape [(N + 1) // bs_r] (BSR)
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [M, N]
-    """
-    assert len(weight_data.shape) in (1, 3)
-    if len(weight_data.shape) == 1:
-        func = _sparse_dense_sp_rhs_csrmm
-    if len(weight_data.shape) == 3:
-        func = _sparse_dense_sp_rhs_bsrmm
-    return func(data, weight_data, weight_indices, weight_indptr)
-
-
-def sparse_dense_sp_lhs(data_data, data_indices, data_indptr, weight):
-    """
-    Computes sparse-dense matrix multiplication of
-    `(data_data, data_indices, data_indptr)` and `weight.T`
-
-    Parameters
-    ----------
-    data_data:
-        1-D with shape [nnz] (CSR) or
-        3-D with shape [num_blocks, bs_r, bs_c] (BSR)
-
-    data_indices:
-        1-D with shape [nnz] (CSR) or
-        1-D with shape [num_blocks] (BSR)
-
-    data_indptr:
-        1-D with shape [M + 1] (CSR) or
-        1-D with shape [(M + 1) // bs_r] (BSR)
-
-    weight:
-        2-D with shape [N, K]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [M, N]
-    """
-    assert len(data_data.shape) in (1, 3)
-    if len(data_data.shape) == 1:
-        func = _sparse_dense_sp_lhs_csrmm
-    if len(data_data.shape) == 3:
-        func = _sparse_dense_sp_lhs_bsrmm
-    return func(data_data, data_indices, data_indptr, weight)
-
-
-# pylint: disable=no-else-return,inconsistent-return-statements
-def sparse_dense(dense_data, sparse_data, sparse_indices, sparse_indptr, sparse_lhs=False):
-    """
-    Computes sparse-dense matrix multiplication of `data` and
-    `(weight_data, weight_indices, weight_indptr).T`, if sparse_lhs=False
-    or
-    Computes sparse-dense matrix multiplication of
-    `(data_data, data_indices, data_indptr)` and `weight.T`, if sparse_lhs=True
-
-    Parameters
-    ----------
-    dense_data : tvm.te.Tensor
-        2-D with shape [M, K]
-
-    sparse_data : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        3-D with shape [num_blocks, bs_r, bs_c] (BSR)
-
-    sparse_indices : tvm.te.Tensor
-        1-D with shape [nnz] (CSR) or
-        1-D with shape [num_blocks] (BSR)
-
-    sparse_indptr : tvm.te.Tensor
-        1-D with shape [N + 1] (CSR) or
-        1-D with shape [(N + 1) // bs_r] (BSR)
-
-    sparse_lhs : bool, optional
-        Indicates whether lhs or rhs matrix is sparse. Default value is False.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [M, N]
-    """
-    if sparse_lhs:
-        return sparse_dense_sp_lhs(sparse_data, sparse_indices, sparse_indptr, dense_data)
-    else:
-        return sparse_dense_sp_rhs(dense_data, sparse_data, sparse_indices, sparse_indptr)
-
-
-def _sparse_dense_sp_lhs_csrmm(data_data, data_indices, data_indptr, weight):
-    oshape = (get_const_tuple(data_indptr.shape)[0] - 1, get_const_tuple(weight.shape)[0])
-
-    def f(row, i):
-        row_start = data_indptr[row]
-        row_end = data_indptr[row + 1]
-        row_elems = row_end - row_start
-        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
-        elem = row_start + elem_idx
-        a_val = data_data[elem]
-        weight_val = weight[i, data_indices[elem]]
-        return te.sum(a_val * weight_val, axis=elem_idx)
-
-    return te.compute(oshape, f, tag="sparse_dense_sp_lhs_csrmm")
-
-
-def _sparse_dense_sp_rhs_csrmm(data, weight_data, weight_indices, weight_indptr):
-    oshape = (get_const_tuple(data.shape)[0], get_const_tuple(weight_indptr.shape)[0] - 1)
-
-    def f(i, row):
-        row_start = weight_indptr[row]
-        row_end = weight_indptr[row + 1]
-        row_elems = row_end - row_start
-        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
-        elem = row_start + elem_idx
-        a_val = weight_data[elem]
-        weight_val = data[i, weight_indices[elem]]
-        return te.sum(a_val * weight_val, axis=elem_idx)
-
-    return te.compute(oshape, f, tag="sparse_dense_sp_rhs_csrmm")
-
-
-def _sparse_dense_sp_lhs_bsrmm(data_data, data_indices, data_indptr, weight):
-    (m, _) = get_const_tuple(weight.shape)
-    (_, bs_r, bs_c) = get_const_tuple(data_data.shape)
-    (num_blocks_plus_1,) = get_const_tuple(data_indptr.shape)
-    num_blocks = num_blocks_plus_1 - 1
-
-    def _compute_block(nb_j, j, i):
-        row_start = data_indptr[nb_j]
-        row_end = data_indptr[nb_j + 1]
-        row_elems = row_end - row_start
-        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
-        block_offset = row_start + elem_idx
-        c = te.reduce_axis((0, bs_c), name="c")
-        block_j = data_indices[block_offset]
-        block_ij_val = data_data[block_offset][j][c]
-        x_val = weight[i, bs_c * block_j + c]
-        return te.sum(block_ij_val * x_val, axis=[elem_idx, c])
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    bsrmm_block = te.compute(
-        (num_blocks, bs_r, m), _compute_block, tag="sparse_dense_sp_lhs_bsrmm_block"
-    )
-    return te.compute(
-        (num_blocks * bs_r, m),
-        lambda m, n: bsrmm_block[idxd(m, bs_r), idxm(m, bs_r), n],
-        tag="sparse_dense_sp_lhs_bsrmm",
-    )
-
-
-def _sparse_dense_sp_rhs_bsrmm(data, weight_data, weight_indices, weight_indptr):
-    (m, k) = get_const_tuple(data.shape)
-    (_, bs_r, bs_c) = get_const_tuple(weight_data.shape)
-    (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape)
-    num_blocks = num_blocks_plus_1 - 1
-
-    def _compute_block(i, nb_j, j):
-        row_start = weight_indptr[nb_j]
-        row_end = weight_indptr[nb_j + 1]
-        row_elems = row_end - row_start
-        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
-        block_offset = row_start + elem_idx
-        c = te.reduce_axis((0, bs_c), name="c")
-        block_j = weight_indices[block_offset]
-        block_ij_val = weight_data[block_offset][j][c]
-        x_val = data[i, bs_c * block_j + c]
-        return te.sum(block_ij_val * x_val, axis=[elem_idx, c])
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    bsrmm_block = te.compute(
-        (m, num_blocks, bs_r),
-        _compute_block,
-        tag="sparse_dense_sp_rhs_bsrmm_block",
-        attrs={"FLOP": 2 * m * num_blocks * bs_r * k},
-    )
-    return te.compute(
-        (m, num_blocks * bs_r),
-        lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)],
-        tag="sparse_dense_sp_rhs_bsrmm",
-    )
-
-
-def sparse_transpose(sparse_data, sparse_indices, sparse_indptr):
-    """
-    Transpose a square sparse matrix,
-    `A` is an n-by-n sparse matrix in the CSR format.
-    ** Currently only support Square Matrices **
-
-    Parameters
-    ----------
-    sparse_data : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    sparse_indices : tvm.te.Tensor
-        1-D with shape [nonzeros], dtype of 'int32'
-
-    sparse_indptr : tvm.te.Tensor
-        1-D with shape [n+1], dtype of 'int32'
-
-    Returns
-    -------
-    out_data : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    out_indices : tvm.te.Tensor
-        1-D with shape [nonzeros], dtype of 'int32'
-
-    out_indptr : tvm.te.Tensor
-        1-D with shape [n+1], dtype of 'int32'
-    """
-    assert len(sparse_data.shape) == 1, "error in data dimension"
-    assert len(sparse_indices.shape) == 1, "error in indices dimension"
-    assert len(sparse_indptr.shape) == 1, "error in indptr dimension"
-
-    nnz = get_const_tuple(sparse_data.shape)[0]
-    n = get_const_tuple(sparse_indptr.shape)[0] - 1
-    output_shape = [(nnz,), (nnz,), (n + 1,)]
-
-    # TODO: Add BSR transpose support
-
-    output_data, output_indices, output_indptr = te.extern(
-        shape=output_shape,
-        inputs=[sparse_data, sparse_indices, sparse_indptr],
-        fcompute=lambda ins, outs: _csr_transpose_ir(
-            ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]
-        ),
-        tag="sparse_transpose_csr",
-        dtype=[sparse_data.dtype, "int32", "int32"],
-        name="out",
-    )
-
-    return [output_data, output_indices, output_indptr]
-
-
-def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr):
-    """define ir for csr_transpose"""
-    irb = tvm.tir.ir_builder.create()
-
-    data_ptr = irb.buffer_ptr(data)
-    indices_ptr = irb.buffer_ptr(indices)
-    indptr_ptr = irb.buffer_ptr(indptr)
-
-    out_data_ptr = irb.buffer_ptr(out_data)
-    out_indices_ptr = irb.buffer_ptr(out_indices)
-    out_indptr_ptr = irb.buffer_ptr(out_indptr)
-
-    n = get_const_tuple(indptr.shape)[0] - 1
-    nnz = get_const_tuple(data.shape)[0]
-
-    with irb.for_range(0, n, kind="parallel", name="col") as col:
-        out_indptr_ptr[col] = 0
-
-    with irb.for_range(0, nnz, kind="serial", name="nz_idx") as nz_idx:
-        out_indptr_ptr[indices_ptr[nz_idx]] += 1
-
-    cumsum = irb.allocate("int32", (1,), name="cumsum", scope="local")
-    temp = irb.allocate("int32", (1,), name="temp", scope="local")
-    cumsum[0] = 0
-    with irb.for_range(0, n, kind="serial", name="col") as col:
-        temp[0] = out_indptr_ptr[col]
-        out_indptr_ptr[col] = cumsum[0]
-        cumsum[0] += temp[0]
-
-    out_indptr_ptr[n] = nnz
-
-    with irb.for_range(0, n, kind="serial", name="row") as row:
-        offset = indptr_ptr[row]
-        diff = indptr_ptr[row + 1] - indptr_ptr[row]
-        with irb.for_range(0, diff, kind="serial", name="idx") as idx:
-            real_idx = offset + idx
-            col = indices_ptr[real_idx]
-            dest = out_indptr_ptr[col]
-
-            out_indices_ptr[dest] = row
-            out_data_ptr[dest] = data_ptr[real_idx]
-            out_indptr_ptr[col] += 1
-
-    last = irb.allocate("int32", (1,), name="last", scope="local")
-    temp2 = irb.allocate("int32", (1,), name="temp2", scope="local")
-    last[0] = 0
-    with irb.for_range(0, n, kind="serial", name="col") as col:
-        temp2[0] = out_indptr_ptr[col]
-        out_indptr_ptr[col] = last[0]
-        last[0] = temp2[0]
-
-    return irb.get()
-
-
-@tvm.target.generic_func
-def sparse_dense_alter_layout(_attrs, _inputs, _tinfos, _out_type):
-    """Change Sparse Dense layout.
-
-    This is used for modifying the inputs weights so they are more amenable for
-    the target.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : tvm.relay.Expr
-        Grouped input symbols
-    tinfos : list
-        Input shape and dtype
-    out_type: type
-        The output type
-
-    Note
-    ----
-    Unlike other TOPI functions, this function operates on both graph level and operator level.
-    """
-    return None
-
-
-@auto_scheduler.register_task_input_check_func
-def try_get_sparse_input(args):
-    """Analyze the input data from the given args.
-
-    Parameters
-    ----------
-    args : List[Tensor]
-        Input/output Tensor of a TVM subgraph.
-
-    Returns
-    -------
-    Dict[Tensor, str] :
-        Map from the input Tensor to its buffer name.
-
-    Notes
-    -----
-    The buffer name is specially designed, and these buffer should be provided in
-    `SearchTask(..., task_inputs={...})`.
-    """
-    sparse_prefix = sparse_data = sparse_indices = sparse_indptr = None
-
-    def _process_inputs(input_tensors, m, n, prefix_init):
-        nonlocal sparse_prefix
-        nonlocal sparse_data
-        nonlocal sparse_indices
-        nonlocal sparse_indptr
-
-        assert len(input_tensors) == 4
-        unsure_tensors = list(input_tensors)
-        # Get the Dense data
-        dense_data = None
-        for tensor in unsure_tensors:
-            if len(tensor.shape) == 2:
-                assert dense_data is None
-                dense_data = tensor
-                assert m == dense_data.shape[0]
-                k = dense_data.shape[1]
-        unsure_tensors.remove(dense_data)
-
-        # Get the Sparse data
-        sparse_data = None
-        for tensor in unsure_tensors:
-            if len(tensor.shape) == 3:
-                assert sparse_data is None
-                sparse_data = tensor
-                block_size, bs_r, bs_c = sparse_data.shape
-        unsure_tensors.remove(sparse_data)
-
-        # Get the Sparse indptr & indices
-        sparse_indices = None
-        for tensor in unsure_tensors:
-            assert len(tensor.shape) == 1
-            if tensor.shape[0] == block_size:
-                assert sparse_indices is None
-                sparse_indices = tensor
-        unsure_tensors.remove(sparse_indices)
-        assert len(unsure_tensors) == 1
-        sparse_indptr = unsure_tensors[0]
-
-        # Generate the sparse_prefix
-        density = 1.0
-        for i in sparse_data.shape:
-            density *= i
-        density /= k * n
-        density = density.value
-        sparse_prefix = "%s_%d_%d_%d_%d_%d_%d_" % (
-            prefix_init,
-            n,
-            k,
-            bs_r,
-            bs_c,
-            sparse_indices.shape[0],
-            sparse_indptr.shape[0],
-        )
-
-    visited = set()
-
-    def _traverse(t):
-        # We cannot directly add tensors to the set, because the comparison of
-        # two tensors with ndim=0 is ambiguous.
-        assert t.handle is not None
-        if t.handle.value in visited:
-            return
-
-        if isinstance(t.op, te.ComputeOp):
-            # TODO(jcf94): Currently only support to one sparse op, add more support here
-            if t.op.tag == "sparse_dense_sp_rhs_bsrmm":
-                m, n = t.shape
-                assert len(t.op.input_tensors) == 1
-                block_tensor = t.op.input_tensors[0]
-                _process_inputs(block_tensor.op.input_tensors, m, n, "sparse_dense_bsr")
-            if sparse_prefix is not None:
-                # Early stop if we find a sparse_prefix
-                # Notice: If any workload has more than one sparse input, this may get problem
-                return
-            for x in t.op.input_tensors:
-                _traverse(x)
-        visited.add(t.handle.value)
-
-    try:
-        for arg in args:
-            _traverse(arg)
-    # pylint: disable=broad-except
-    except Exception:
-        return {}
-
-    if sparse_data is None or sparse_indices is None or sparse_indptr is None:
-        return {}
-
-    sparse_input_map = {}
-    sparse_input_map[sparse_data] = sparse_prefix + "W_data"
-    sparse_input_map[sparse_indices] = sparse_prefix + "W_indices"
-    sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr"
-
-    return sparse_input_map
-
-
-def _sparse_conv2d_bsr_compute_nhwc(data, weight_data, weight_indices, weight_indptr):
-    (m, h, w, k) = get_const_tuple(data.shape)  # pylint: disable=C0103
-    if len(weight_data.shape) == 2:
-        _, bs_r = get_const_tuple(weight_data.shape)
-    elif len(weight_data.shape) == 3:
-        _, bs_r, bs_c = get_const_tuple(weight_data.shape)
-    (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape)
-    num_blocks = num_blocks_plus_1 - 1
-
-    def _compute_block(i, h, w, nb_j, j):  # pylint: disable=C0103
-        row_start = weight_indptr[nb_j]
-        row_end = weight_indptr[nb_j + 1]
-        row_elems = row_end - row_start
-        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
-        block_offset = row_start + elem_idx
-        block_j = weight_indices[block_offset]
-        if len(weight_data.shape) == 3:
-            c = te.reduce_axis((0, bs_c), name="c")
-            block_ij_val = weight_data[block_offset][j][c]
-            x_val = data[i, h, w, bs_c * block_j + c]
-            return te.sum(block_ij_val * x_val, axis=[elem_idx, c])
-        else:
-            block_ij_val = weight_data[block_offset][j]
-            x_val = data[i, h, w, block_j]
-            return te.sum(block_ij_val * x_val, axis=[elem_idx])
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    bsrmm_block = te.compute(
-        (m, h, w, num_blocks, bs_r),
-        _compute_block,
-        tag="sparse_conv2d_sp_bsrmm_block",
-        attrs={"FLOP": 2 * m * num_blocks * bs_r * k * h * w},
-    )
-    return te.compute(
-        (m, h, w, num_blocks * bs_r),
-        lambda m, h, w, n: bsrmm_block[m, h, w, idxd(n, bs_r), idxm(n, bs_r)],
-        tag="sparse_conv2d_sp_bsrmm",
-        name="sparse_conv2d",
-        attrs={"layout": "NHWC"},
-    )
-
-
-def _sparse_conv2d_bsr_compute_nchw(data, weight_data, weight_indices, weight_indptr):
-    (m, k, h, w) = get_const_tuple(data.shape)  # pylint: disable=C0103
-    if len(weight_data.shape) == 2:
-        _, bs_r = get_const_tuple(weight_data.shape)
-    elif len(weight_data.shape) == 3:
-        _, bs_r, bs_c = get_const_tuple(weight_data.shape)
-    (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape)
-    num_blocks = num_blocks_plus_1 - 1
-
-    def _compute_block(i, nb_j, j, h, w):  # pylint: disable=C0103
-        row_start = weight_indptr[nb_j]
-        row_end = weight_indptr[nb_j + 1]
-        row_elems = row_end - row_start
-        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
-        block_offset = row_start + elem_idx
-        block_j = weight_indices[block_offset]
-        if len(weight_data.shape) == 3:
-            c = te.reduce_axis((0, bs_c), name="c")
-            block_ij_val = weight_data[block_offset][j][c]
-            x_val = data[i, bs_c * block_j + c, h, w]
-            return te.sum(block_ij_val * x_val, axis=[elem_idx, c])
-        else:
-            block_ij_val = weight_data[block_offset][j]
-            x_val = data[i, block_j, h, w]
-            return te.sum(block_ij_val * x_val, axis=[elem_idx])
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    bsrmm_block = te.compute(
-        (m, num_blocks, bs_r, h, w),
-        _compute_block,
-        tag="sparse_conv2d_sp_bsrmm_block",
-        attrs={"FLOP": 2 * m * num_blocks * bs_r * k * h * w},
-    )
-    return te.compute(
-        (m, num_blocks * bs_r, h, w),
-        lambda m, n, h, w: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r), h, w],
-        tag="sparse_conv2d_sp_bsrmm",
-        name="sparse_conv2d",
-        attrs={"layout": "NCHW"},
-    )
-
-
-def sparse_conv2d(
-    dense_data, sparse_data, sparse_indices, sparse_indptr, layout="NHWC", kernel_size=1
-):
-    """
-    Computes sparse-conv2d(1*1) of ``data`` and
-    ``(weight_data, weight_indices, weight_indptr)``
-
-    Parameters
-    ----------
-    dense_data : tvm.te.Tensor
-        4-D with shape ``[M, H, W, K]`` (layout=NHWC)
-
-        4-D with shape ``[M, K, H, W]`` (layout=NCHW)
-
-    sparse_data : tvm.te.Tensor
-        2-D with shape ``[num_blocks, bs_r]`` (BSR)
-
-        3-D with shape ``[num_blocks, bs_r, bs_c]`` (BSR)
-
-    sparse_indices : tvm.te.Tensor
-        1-D with shape ``[num_blocks]`` (BSR)
-
-    sparse_indptr : tvm.te.Tensor
-        1-D with shape ``[(N + 1) // bs_r]`` (BSR)
-
-    layout : str
-        layout of data
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [M, H, W, N] (layout=NHWC)
-        4-D with shape [M, N, H ,W] (layout=NCHW)
-    """
-    if kernel_size == 1:
-        if layout == "NHWC":
-            return _sparse_conv2d_bsr_compute_nhwc(
-                dense_data, sparse_data, sparse_indices, sparse_indptr
-            )
-        elif layout == "NCHW":
-            return _sparse_conv2d_bsr_compute_nchw(
-                dense_data, sparse_data, sparse_indices, sparse_indptr
-            )
-    else:
-        raise ValueError(f"Unsupport Layout {layout}")
-
-
-@auto_scheduler.register_task_input_check_func
-def try_get_conv2d_sparse_input(args):
-    """Analyze the input data from the given args.
-
-    Parameters
-    ----------
-    args : List[Tensor]
-        Input/output Tensor of a TVM subgraph.
-
-    Returns
-    -------
-    Dict[Tensor, str] :
-        Map from the input Tensor to its buffer name.
-
-    Notes
-    -----
-    The buffer name is specially designed, and these buffer should be provided in
-    `SearchTask(..., task_inputs={...})`.
-    """
-    sparse_prefix = sparse_data = sparse_indices = sparse_indptr = None
-
-    def _process_inputs(input_tensors, m, h, w, n, prefix_init, layout):  # pylint: disable=C0103
-        nonlocal sparse_prefix
-        nonlocal sparse_data
-        nonlocal sparse_indices
-        nonlocal sparse_indptr
-
-        assert len(input_tensors) == 4
-        unsure_tensors = list(input_tensors)
-        # Get the Dense data
-        dense_data = None
-        for tensor in unsure_tensors:
-            if len(tensor.shape) == 4:
-                assert dense_data is None
-                dense_data = tensor
-                if layout == "NHWC":
-                    assert m == dense_data.shape[0]
-                    assert h == dense_data.shape[1]
-                    assert w == dense_data.shape[2]
-                    k = dense_data.shape[3]
-                elif layout == "NCHW":
-                    assert m == dense_data.shape[0]
-                    assert h == dense_data.shape[2]
-                    assert w == dense_data.shape[3]
-                    k = dense_data.shape[1]
-        unsure_tensors.remove(dense_data)
-        # Get the Sparse data
-        sparse_data = None
-        for tensor in unsure_tensors:
-            if len(tensor.shape) == 3:
-                assert sparse_data is None
-                sparse_data = tensor
-                block_size, bs_r, bs_c = sparse_data.shape
-            if len(tensor.shape) == 2:
-                assert sparse_data is None
-                sparse_data = tensor
-                block_size, bs_r = sparse_data.shape
-                bs_c = 1
-        unsure_tensors.remove(sparse_data)
-        # Get the Sparse indptr & indices
-        sparse_indices = None
-        for tensor in unsure_tensors:
-            assert len(tensor.shape) == 1
-            if tensor.shape[0] == block_size:
-                assert sparse_indices is None
-                sparse_indices = tensor
-        unsure_tensors.remove(sparse_indices)
-        assert len(unsure_tensors) == 1
-        sparse_indptr = unsure_tensors[0]
-        # Generate the sparse_prefix
-        density = 1.0
-        for i in sparse_data.shape:
-            density *= i
-        density /= k * n
-        density = density.value
-        sparse_prefix = "%s_%d_%d_%d_%d_%d_%d_" % (
-            prefix_init,
-            n,
-            k,
-            bs_r,
-            bs_c,
-            sparse_indices.shape[0],
-            sparse_indptr.shape[0],
-        )
-
-    visited = set()
-
-    def _traverse(t):
-        # We cannot directly add tensors to the set, because the comparison of
-        # two tensors with ndim=0 is ambiguous.
-        assert t.handle is not None
-        if t.handle.value in visited:
-            return
-
-        if isinstance(t.op, te.ComputeOp):
-            if t.op.tag == "sparse_conv2d_sp_bsrmm":
-                m, h, w, n = t.shape  # pylint: disable=C0103
-                assert len(t.op.input_tensors) == 1
-                block_tensor = t.op.input_tensors[0]
-                _process_inputs(
-                    block_tensor.op.input_tensors,
-                    m,
-                    h,
-                    w,
-                    n,
-                    "sparse_conv2d_bsr",
-                    t.op.attrs["layout"],
-                )
-            if sparse_prefix is not None:
-                # Early stop if we find a sparse_prefix
-                # Notice: If any workload has more than one sparse input, this may get problem
-                return
-            for x in t.op.input_tensors:
-                _traverse(x)
-        visited.add(t.handle.value)
-
-    try:
-        for arg in args:
-            _traverse(arg)
-    # pylint: disable=broad-except
-    except Exception:
-        return {}
-
-    if sparse_data is None or sparse_indices is None or sparse_indptr is None:
-        return {}
-
-    sparse_input_map = {}
-    sparse_input_map[sparse_data] = sparse_prefix + "W_data"
-    sparse_input_map[sparse_indices] = sparse_prefix + "W_indices"
-    sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr"
-
-    return sparse_input_map
-
-
-def sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr):
-    """
-    Computes sparse-dense addition
-
-    Parameters
-    ----------
-    dense_data : tvm.te.Tensor
-        2-D with shape [M, N]
-
-    sparse_data : tvm.te.Tensor
-        1-D with shape [nnz] (CSR)
-
-    sparse_indices : tvm.te.Tensor
-        1-D with shape [nnz] (CSR)
-
-    sparse_indptr : tvm.te.Tensor
-        1-D with shape [M + 1] (CSR)
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [M, N]
-    """
-    # TODO(ANSHUMAN87): support BSR format too
-    assert len(sparse_data.shape) == 1, "only CSR format is supported"
-    return _sparse_add_csr(dense_data, sparse_data, sparse_indices, sparse_indptr)
-
-
-def _sparse_add_csr(dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp):
-    oshape = get_const_tuple(dense_data_inp.shape)
-
-    def _csr_add_ir(dense_data, sparse_data, sparse_indices, sparse_indptr, out_data):
-        irb = tvm.tir.ir_builder.create()
-        dense_data_ptr = irb.buffer_ptr(dense_data)
-        sparse_data_ptr = irb.buffer_ptr(sparse_data)
-        sparse_indices_ptr = irb.buffer_ptr(sparse_indices)
-        sparse_indptr_ptr = irb.buffer_ptr(sparse_indptr)
-
-        out_data_ptr = irb.buffer_ptr(out_data)
-
-        with irb.for_range(0, oshape[0], kind="vectorize", name="row") as row:
-            with irb.for_range(0, oshape[1], kind="parallel", name="col") as col:
-                out_data_ptr[row, col] = dense_data_ptr[row, col]
-
-        with irb.for_range(0, oshape[0], kind="parallel", name="row") as row:
-            offset = sparse_indptr_ptr[row]
-            diff = sparse_indptr_ptr[row + 1] - sparse_indptr_ptr[row]
-            with irb.for_range(0, diff, kind="serial", name="idx") as idx:
-                real_idx = offset + idx
-                col = sparse_indices_ptr[real_idx]
-                out_data_ptr[row, col] = sparse_data_ptr[real_idx] + out_data_ptr[row, col]
-
-        return irb.get()
-
-    return te.extern(
-        shape=oshape,
-        inputs=[dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp],
-        fcompute=lambda ins, outs: _csr_add_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
-        tag="sparse_add_csr",
-        dtype=[
-            dense_data_inp.dtype,
-            sparse_data_inp.dtype,
-            sparse_indices_inp.dtype,
-            sparse_indptr_inp.dtype,
-        ],
-        name="sparse_add_csr_output",
-    )
diff --git a/python/tvm/topi/rocm/__init__.py b/python/tvm/topi/rocm/__init__.py
deleted file mode 100644
index f61039ab91cc..000000000000
--- a/python/tvm/topi/rocm/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""rocm specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-from .batch_matmul import *
-from .conv2d import *
-from .dense import *
diff --git a/python/tvm/topi/rocm/batch_matmul.py b/python/tvm/topi/rocm/batch_matmul.py
deleted file mode 100644
index 53b51eedf6d9..000000000000
--- a/python/tvm/topi/rocm/batch_matmul.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument
-"""Schedule for batch_matmul operator"""
-from tvm import autotvm
-from tvm.contrib import rocblas
-from .. import generic
-from ..utils import get_const_tuple
-
-
-@autotvm.register_topi_compute("batch_matmul_rocblas.rocm")
-def batch_matmul_rocblas(
-    cfg, x, y, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """Computes matrix multiplication of `x` and `y` via rocblas when
-    `x` and `y` are batched matrices.
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        Autotvm tuning space config file
-    x : tvm.te.Tensor
-        3-D with shape [batch, M, K]
-    y : tvm.te.Tensor
-        3-D with shape [batch, N, K]
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    del out_dtype
-    batch, M, K = get_const_tuple(x.shape)
-    _, N, _ = get_const_tuple(y.shape)
-    if out_shape is not None:
-        assert out_shape[0] == batch, "Input and output batch sizes must match"
-        assert out_shape[1] == M and out_shape[2] == N, "Invalid output shape"
-    result = rocblas.batch_matmul(x, y, transpose_a, transpose_b)
-    cfg.add_flop(batch * M * N * K * 2)
-    return result
-
-
-@autotvm.register_topi_schedule("batch_matmul_rocblas.rocm")
-def schedule_batch_matmul_rocblas(_, outs):
-    """Schedule for batch_matmul operator with rocm cblas"""
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/rocm/conv2d.py b/python/tvm/topi/rocm/conv2d.py
deleted file mode 100644
index fac77f02b456..000000000000
--- a/python/tvm/topi/rocm/conv2d.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument
-"""Compute definition for conv2d with rocm backend"""
-from tvm import autotvm
-from tvm.contrib import miopen
-
-from .. import generic
-from ..utils import get_const_tuple
-from ..nn.utils import get_pad_tuple
-
-
-@autotvm.register_topi_compute("conv2d_nchw_miopen.rocm")
-def conv2d_nchw_miopen(
-    cfg, data, kernel, strides, padding, dilation, layout="NCHW", out_dtype="float32"
-):
-    """Conv2D operator for rocm backend.
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    input : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    filter : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
-
-    strides : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding : int or a list/tuple of 2 or 4 ints
-        padding size, or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints
-
-    layout : str
-        layout of data
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-
-    CO, CI, KH, KW = get_const_tuple(kernel.shape)
-    N, _, H, W = get_const_tuple(data.shape)
-
-    assert layout == "NCHW"
-
-    # handle dilation
-    stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
-    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
-    pad_h, pad_w = pt + pb, pl + pr
-    dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
-    assert (pt == pb) and (pl == pr)
-    OH = (H + 2 * pad_h - KH) // stride_h + 1
-    OW = (W + 2 * pad_w - KW) // stride_w + 1
-    cfg.add_flop(
-        2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)
-    )
-
-    return miopen.conv2d_forward(
-        data, kernel, stride_h, stride_w, pt, pl, dilation_h, dilation_w, conv_mode=0, data_type=1
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_miopen.rocm")
-def schedule_conv2d_nchw_miopen(cfg, outs):
-    """TOPI schedule callback of conv2d for rocm
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d.
-    """
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/rocm/dense.py b/python/tvm/topi/rocm/dense.py
deleted file mode 100644
index 983f235f0ec8..000000000000
--- a/python/tvm/topi/rocm/dense.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument
-"""Schedule for dense operator"""
-from tvm import te
-from tvm import autotvm
-from tvm.contrib import rocblas
-from .. import generic
-from .. import tag
-
-
-@autotvm.register_topi_compute("dense_rocblas.rocm")
-def dense_rocblas(cfg, data, weight, bias=None, out_dtype=None):
-    """Dense operator for rocm backend with cblas.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        2-D with shape [batch, in_dim]
-
-    weight : tvm.te.Tensor
-        2-D with shape [out_dim, in_dim]
-
-    bias : tvm.te.Tensor, optional
-        1-D with shape [out_dim]
-
-    out_dtype : str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [batch, out_dim]
-    """
-    if out_dtype is None:
-        out_dtype = data.dtype
-    assert out_dtype == data.dtype, "Mixed precision not supported."
-    matmul = rocblas.matmul(data, weight, False, True)
-    batch, in_dim = data.shape
-    out_dim, _ = weight.shape
-    cfg.add_flop(batch * in_dim * out_dim * 2)
-    if bias is not None:
-        matmul = te.compute(
-            (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST
-        )
-    return matmul
-
-
-@autotvm.register_topi_schedule("dense_rocblas.rocm")
-def schedule_dense_rocblas(_, outs):
-    """Schedule for dense operator with rocm cblas"""
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/sparse/__init__.py b/python/tvm/topi/sparse/__init__.py
deleted file mode 100644
index f8c248f7a283..000000000000
--- a/python/tvm/topi/sparse/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Sparse operators"""
-from __future__ import absolute_import as _abs
-
-from .csrmv import csrmv
-from .csrmm import csrmm
-from .dense import dense
diff --git a/python/tvm/topi/sparse/csrmm.py b/python/tvm/topi/sparse/csrmm.py
deleted file mode 100644
index 7af9d30bddde..000000000000
--- a/python/tvm/topi/sparse/csrmm.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""TVM operator compute SpMM in CSR format."""
-from __future__ import absolute_import
-import tvm
-from tvm import te
-from .. import tag
-from ..utils import simplify
-from ...tir.generic import cast
-
-
-def csrmm_default(data, indices, indptr, weight, bias=None):
-    # pylint: disable=invalid-name
-    """The default implementation of csrmm in topi.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    indices : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    indptr : tvm.te.Tensor
-        1-D with shape [m+1]
-
-    weight : tvm.te.Tensor
-        2-D with shape [k, n]
-
-    bias : tvm.te.Tensor, optional
-        1-D with shape [m]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [m, n]
-    """
-    assert (
-        len(data.shape) == 1
-        and len(indices.shape) == 1
-        and len(indptr.shape) == 1
-        and len(weight.shape) == 2
-    ), "only support 2-dim csrmm"
-    assert isinstance(
-        weight, te.tensor.Tensor
-    ), f"weight matrix is assumed to be tvm.te.Tensor, but weight is `{type(weight)}`"
-    assert (
-        data.dtype == weight.dtype
-    ), f"Data and weight must have the same dtype, but they have {data.dtype} and {weight.dtype}"
-    if bias is not None:
-        assert len(bias.shape) == 1
-    M = simplify(indptr.shape[0] - 1)
-    _, N = weight.shape
-
-    def csrmm_default_ir(data, indices, indptr, weight, out):
-        """define ir for csrmm"""
-        irb = tvm.tir.ir_builder.create()
-        data_ptr = irb.buffer_ptr(data)
-        indices_ptr = irb.buffer_ptr(indices)
-        indptr_ptr = irb.buffer_ptr(indptr)
-        weight_ptr = irb.buffer_ptr(weight)
-        out_ptr = irb.buffer_ptr(out)
-        M = simplify(indptr.shape[0] - 1)
-        _, N = weight.shape
-        with irb.for_range(0, N, kind="vectorize", name="n") as n:
-            with irb.for_range(0, M, kind="parallel", name="row") as row:
-                dot = irb.allocate(data.dtype, (1,), name="dot", scope="local")
-                out_ptr[row * N + n] = cast(0, data.dtype)
-                dot[0] = cast(0, data.dtype)
-                row_start = indptr_ptr[row]
-                row_end = indptr_ptr[row + 1]
-                row_elems = row_end - row_start
-                with irb.for_range(0, row_elems, name="idx") as idx:
-                    elem = row_start + idx
-                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem] * N + n]
-                out_ptr[row * N + n] += dot[0]
-        return irb.get()
-
-    oshape = (M, N)
-    matmul = te.extern(
-        oshape,
-        [data, indices, indptr, weight],
-        lambda ins, outs: csrmm_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
-        tag="csrmm",
-        dtype=data.dtype,
-        name="out",
-    )
-    if bias is not None:
-        matmul = te.compute(oshape, lambda i, j: matmul[i, j] + bias[i], tag=tag.BROADCAST)
-    return matmul
-
-
-def csrmm(a, b, c=None):
-    """The `csrmm` routine performs a matrix-matrix operation defined as :math:`C := A*B + C`,
-    where `B` and `C` are dense matrices, `A` is an m-by-k sparse matrix in the CSR format.
-
-    Parameters
-    ----------
-    a : tvm.contrib.sparse.CSRNDArray
-        2-D sparse matrix with shape [m, k]
-
-    b : tvm.te.Tensor
-        2-D dense matrix with shape [k, n]
-
-    c : tvm.te.Tensor, optional
-        1-D dense vector with shape [n]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [m, n]
-    """
-    return csrmm_default(a.data, a.indices, a.indptr, b, c)
diff --git a/python/tvm/topi/sparse/csrmv.py b/python/tvm/topi/sparse/csrmv.py
deleted file mode 100644
index d585b27ca7ab..000000000000
--- a/python/tvm/topi/sparse/csrmv.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""TVM operator compute SpMV in CSR format."""
-from __future__ import absolute_import
-import tvm
-from tvm import te
-from .. import tag
-from ...tir.generic import cast
-
-
-def csrmv_default(data, indices, indptr, weight, bias=None):
-    """The default implementation of csrmv in topi.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    indices : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    indptr : tvm.te.Tensor
-        1-D with shape [m+1]
-
-    weight : tvm.te.Tensor
-        2-D with shape [k, 1]
-
-    bias : tvm.te.Tensor, optional
-        1-D with shape [1]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [m, 1]
-    """
-    assert len(data.shape) == 1 and len(weight.shape) == 2, "only support 2-dim csrmv"
-    assert isinstance(
-        weight, te.tensor.Tensor
-    ), f"weight matrix is assumed to be tvm.te.Tensor, but weight is `{type(weight)}`"
-    assert (
-        data.dtype == weight.dtype
-    ), f"Data and weight must have the same dtype, but they have {data.dtype} and {weight.dtype}"
-    if bias is not None:
-        assert len(bias.shape) == 1
-    batch = indptr.shape[0] - 1
-
-    def csrmv_default_ir(data, indices, indptr, weight, out):
-        """define ir for csrmv"""
-        irb = tvm.tir.ir_builder.create()
-        data_ptr = irb.buffer_ptr(data)
-        indices_ptr = irb.buffer_ptr(indices)
-        indptr_ptr = irb.buffer_ptr(indptr)
-        weight_ptr = irb.buffer_ptr(weight)
-        out_ptr = irb.buffer_ptr(out)
-        num_rows = indptr.shape[0] - 1
-        with irb.for_range(0, num_rows, kind="parallel", name="row") as row:
-            dot = irb.allocate(data.dtype, (1,), name="dot", scope="local")
-            out_ptr[row] = cast(0, data.dtype)
-            dot[0] = cast(0, data.dtype)
-            row_start = indptr_ptr[row]
-            row_end = indptr_ptr[row + 1]
-            row_elems = row_end - row_start
-            with irb.for_range(0, row_elems, name="elemidx") as elemidx:
-                elem = row_start + elemidx
-                dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]]
-            out_ptr[row] += dot[0]
-        return irb.get()
-
-    oshape = (batch, 1)
-    matmul = te.extern(
-        oshape,
-        [data, indices, indptr, weight],
-        lambda ins, outs: csrmv_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
-        tag="csrmv",
-        dtype=data.dtype,
-        name="csrmv",
-    )
-    if bias is not None:
-        matmul = te.compute((batch, 1), lambda i, j: matmul[i, 0] + bias[i], tag=tag.BROADCAST)
-    return matmul
-
-
-def csrmv(a, x, y=None):
-    """The `csrmv` routine performs a matrix-vector operation defined as :math:`y := A*x + y`,
-    where `x` and `y` are vectors, `A` is an m-by-k sparse matrix in the CSR format.
-
-    Parameters
-    ----------
-    a : tvm.contrib.sparse.CSRNDArray
-        2-D sparse matrix with shape [m, k]
-
-    x : tvm.te.Tensor
-        2-D dense matrix with shape [k, 1]
-
-    y : tvm.te.Tensor, optional
-        1-D dense vector with shape [1]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D dense matrix with shape [m, 1]
-    """
-    return csrmv_default(a.data, a.indices, a.indptr, x, y)
diff --git a/python/tvm/topi/sparse/dense.py b/python/tvm/topi/sparse/dense.py
deleted file mode 100644
index 9c13c4bae918..000000000000
--- a/python/tvm/topi/sparse/dense.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""TVM operator compute Dense in CSR format."""
-from __future__ import absolute_import
-import tvm
-from tvm import te
-from .. import tag
-from ..utils import simplify
-
-
-def dense_si(data, indices, indptr, weight, bias=None):
-    # pylint: disable=invalid-name
-    """The implementation of dense in topi, assuming sparse input.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        1-D with shape [num_nonzeros]
-
-    indices : tvm.te.Tensor
-        1-D with shape [num_nonzeros]
-
-    indptr : tvm.te.Tensor
-        1-D with shape [m+1]
-
-    weight : tvm.te.Tensor
-        2-D with shape [k, n]
-
-    bias : tvm.te.Tensor, optional
-        1-D with shape [m]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [m, n]
-    """
-    assert (
-        len(data.shape) == 1
-        and len(indices.shape) == 1
-        and len(indptr.shape) == 1
-        and len(weight.shape) == 2
-    ), "only support 2-dim dense"
-    assert isinstance(
-        weight, te.tensor.Tensor
-    ), f"weight matrix is assumed to be tvm.te.Tensor, but weight is `{type(weight)}`"
-    if bias is not None:
-        assert len(bias.shape) == 1
-    dtype = data.dtype
-    M = simplify(indptr.shape[0] - 1)
-    N, _ = weight.shape
-
-    def dense_default_ir(data, indices, indptr, weight, out):
-        """Define IR for Dense"""
-        dtype = data.dtype
-        irb = tvm.tir.ir_builder.create()
-        data_ptr = irb.buffer_ptr(data)
-        indices_ptr = irb.buffer_ptr(indices)
-        indptr_ptr = irb.buffer_ptr(indptr)
-        weight_ptr = irb.buffer_ptr(weight)
-        out_ptr = irb.buffer_ptr(out)
-        M = simplify(indptr.shape[0] - 1)
-        N, K = weight.shape
-        with irb.for_range(0, N, kind="vectorize", name="n") as n:
-            with irb.for_range(0, M, kind="parallel", name="m") as m:
-                dot = irb.allocate(dtype, (1,), name="dot", scope="local")
-                out_ptr[m * N + n] = tvm.tir.const(0, dtype)
-                dot[0] = tvm.tir.const(0, dtype)
-                row_start = indptr_ptr[m]
-                row_elems = indptr_ptr[m + 1] - row_start
-                with irb.for_range(0, row_elems, name="k") as k:
-                    elem = row_start + k
-                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem] + n * K]
-                out_ptr[m * N + n] += dot[0]
-        return irb.get()
-
-    oshape = (M, N)
-    matmul = te.extern(
-        oshape,
-        [data, indices, indptr, weight],
-        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
-        tag="dense",
-        dtype=dtype,
-        name="out",
-    )
-    if bias is not None:
-        matmul = te.compute(oshape, lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST)
-    return matmul
-
-
-def dense_sw(data, w_data, w_indices, w_indptr, bias=None):
-    # pylint: disable=invalid-name
-    """The implementation of dense in topi, assuming sparse weight.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        2-D with shape [m, k]
-
-    w_data : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    w_indices : tvm.te.Tensor
-        1-D with shape [nonzeros]
-
-    w_indptr : tvm.te.Tensor
-        1-D with shape [n+1]
-
-    bias : tvm.te.Tensor, optional
-        1-D with shape [n]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [m, n]
-    """
-    assert (
-        len(w_data.shape) == 1
-        and len(w_indices.shape) == 1
-        and len(w_indptr.shape) == 1
-        and len(data.shape) == 2
-    ), "only support 2-dim dense"
-    assert isinstance(
-        data, te.tensor.Tensor
-    ), f"data matrix is assumed to be tvm.te.Tensor, but weight is `{type(data)}`"
-    if bias is not None:
-        assert len(bias.shape) == 1
-    dtype = data.dtype
-    M, _ = data.shape
-    N = simplify(w_indptr.shape[0] - 1)
-
-    def dense_default_ir(data, w_data, w_indices, w_indptr, out):
-        """Define IR for Dense"""
-        dtype = data.dtype
-        irb = tvm.tir.ir_builder.create()
-        data_ptr = irb.buffer_ptr(data)
-        w_data_ptr = irb.buffer_ptr(w_data)
-        w_indices_ptr = irb.buffer_ptr(w_indices)
-        w_indptr_ptr = irb.buffer_ptr(w_indptr)
-        out_ptr = irb.buffer_ptr(out)
-        M, K = data.shape
-        N = simplify(w_indptr.shape[0] - 1)
-        with irb.for_range(0, M, kind="vectorize", name="m") as m:
-            with irb.for_range(0, N, kind="parallel", name="n") as n:
-                dot = irb.allocate(dtype, (1,), name="dot", scope="local")
-                out_ptr[m * N + n] = tvm.tir.const(0, dtype)
-                dot[0] = tvm.tir.const(0, dtype)
-                row_start = w_indptr_ptr[n]
-                row_elems = w_indptr_ptr[n + 1] - row_start
-                with irb.for_range(0, row_elems, name="k") as k:
-                    elem = row_start + k
-                    dot[0] += w_data_ptr[elem] * data_ptr[w_indices_ptr[elem] + m * K]
-                out_ptr[m * N + n] += dot[0]
-        return irb.get()
-
-    oshape = (M, N)
-    matmul = te.extern(
-        oshape,
-        [data, w_data, w_indices, w_indptr],
-        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
-        tag="dense",
-        dtype=dtype,
-        name="out",
-    )
-    if bias is not None:
-        matmul = te.compute(oshape, lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST)
-    return matmul
-
-
-def dense(data, weight, bias=None):
-    """Applies a linear transformation: :math:`Y = XW^T + b`.
-    Either data or weight should be tvm.contrib.sparse.CSRNDArray.
-
-    Parameters
-    ----------
-    data : tvm.contrib.sparse.CSRNDArray or te.tensor.Tensor
-        2-D with shape [batch, in_dim]
-
-    weight : te.tensor.Tensor or tvm.contrib.sparse.CSRNDArray
-        2-D with shape [out_dim, in_dim]
-
-    bias : te.tensor.Tensor, optional
-        1-D with shape [out_dim]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [batch, out_dim]
-    """
-    ret = None
-    if isinstance(data, tvm.contrib.sparse.CSRPlaceholderOp) and isinstance(
-        weight, te.tensor.Tensor
-    ):
-        ret = dense_si(data.data, data.indices, data.indptr, weight, bias)
-    elif isinstance(data, te.tensor.Tensor) and isinstance(
-        weight, tvm.contrib.sparse.CSRPlaceholderOp
-    ):
-        ret = dense_sw(data, weight.data, weight.indices, weight.indptr, bias)
-    else:
-        raise NotImplementedError(
-            "implementation for %s as data and %s as weights, "
-            "is not supported yet." % (type(data), type(weight))
-        )
-    return ret
diff --git a/python/tvm/topi/sparse/utils.py b/python/tvm/topi/sparse/utils.py
deleted file mode 100644
index e8636f95fcc6..000000000000
--- a/python/tvm/topi/sparse/utils.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Some utils for Sparse operation."""
-import tvm
-from tvm import relay, auto_scheduler
-from tvm.relay import data_dep_optimization as ddo
-from tvm.auto_scheduler import _ffi_api
-
-
-def random_bsr_matrix(m, n, bs_r, bs_c, density, dtype):
-    """Generate a random sparse matrix in bsr format.
-
-    Returns
-    -------
-    scipy.sparse.bsr_matrix
-    """
-    # pylint: disable=import-outside-toplevel
-    import numpy as np
-    import itertools
-    import scipy.sparse as sp
-
-    y = np.zeros((m, n), dtype=dtype)
-    assert m % bs_r == 0
-    assert n % bs_c == 0
-    nnz = int(density * m * n)
-    num_blocks = int(nnz / (bs_r * bs_c)) + 1
-    candidate_blocks = np.asarray(list(itertools.product(range(0, m, bs_r), range(0, n, bs_c))))
-    assert candidate_blocks.shape[0] == m // bs_r * n // bs_c
-    chosen_blocks = candidate_blocks[
-        np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
-    ]
-    # pylint: disable=invalid-name
-    for (r, c) in chosen_blocks:
-        y[r : r + bs_r, c : c + bs_c] = np.random.randn(bs_r, bs_c)
-    s = sp.bsr_matrix(y, blocksize=(bs_r, bs_c))
-    assert s.data.shape == (num_blocks, bs_r, bs_c)
-    assert s.indices.shape == (num_blocks,)
-    assert s.indptr.shape == (m // bs_r + 1,)
-    return s
-
-
-def random_sparse_dense_params(func, params, bs_r, bs_c, density):
-    """Replace the dense parameters with random sparse parameters. Mainly used for testing.
-
-    Parameters
-    ----------
-    func : tvm.relay.Expr
-        Expr will be optimized to sparse operation.
-    params : Dict[Srting, tvm.nd.array]
-        Parameters of the Expr.
-    bs_r : int
-        The row of BSR matrix block.
-    bs_c : int
-        The column of BSR matrix block.
-    density : float
-        The density of the random sparse parameters.
-
-    Returns
-    -------
-    Dict[Srting, tvm.nd.array]
-        The generated random parameters.
-    """
-
-    def deepcopy(param_dic):
-        ret = {}
-        for k, v in param_dic.items():
-            ret[k] = tvm.nd.array(v.numpy())
-        return ret
-
-    new_params = deepcopy(params)
-    dense_weight_names = relay.analysis.sparse_dense._search_dense_op_weight(func)
-    for item in dense_weight_names:
-        name = str(item)
-        shape = new_params[name].shape
-        if shape[0] % bs_r == 0 and shape[1] % bs_c == 0:
-            new_w = random_bsr_matrix(shape[0], shape[1], bs_r, bs_c, density, "float32").todense()
-            new_params[name] = tvm.nd.array(new_w)
-    return new_params
-
-
-def random_sparse_conv2d_params(func, params, bs_r, bs_c, density, layout):
-    """Replace the dense parameters with random sparse parameters. Mainly used for testing.
-
-    Parameters
-    ----------
-    func : tvm.relay.Expr
-        Expr will be optimized to sparse operation.
-    params : Dict[Srting, tvm.nd.array]
-        Parameters of the Expr.
-    bs_r : int
-        The row of BSR matrix block.
-    bs_c : int
-        The column of BSR matrix block.
-    density : float
-        The density of the random sparse parameters.
-    layout : str
-        layout of network
-
-    Returns
-    -------
-    Dict[Srting, tvm.nd.array]
-        The generated random parameters.
-    """
-    # pylint: disable=import-outside-toplevel
-    import numpy as np
-
-    def deepcopy(param_dic):
-        ret = {}
-        for k, v in param_dic.items():
-            ret[k] = tvm.nd.array(v.numpy())
-        return ret
-
-    new_params = deepcopy(params)
-    conv2d_weight_names = relay.analysis.sparse_conv2d._search_conv2d_op_weight(func)
-    for item in conv2d_weight_names:
-        name = str(item)
-        shape = new_params[name].shape
-        if not ((shape[0] == 1 and shape[1] == 1) or (shape[2] == 1 and shape[3] == 1)):
-            continue
-        if layout == "NCHW" and shape[0] % bs_r == 0 and shape[1] % bs_c == 0:
-            new_w = random_bsr_matrix(shape[0], shape[1], bs_r, bs_c, density, "float32").todense()
-            new_params[name] = tvm.nd.array(np.array(new_w).reshape(shape))
-        elif layout == "NHWC" and shape[3] % bs_r == 0 and shape[2] % bs_c == 0:
-            new_w = random_bsr_matrix(shape[3], shape[2], bs_r, bs_c, density, "float32").todense()
-            new_params[name] = tvm.nd.array(np.array(new_w).reshape(shape))
-    return new_params
-
-
-def convert_model_dense_to_sparse(
-    mod, params, random_params=False, bs_r=1, bs_c=1, sparsity=0.85, layout="NHWC"
-):
-    """Convert a dense model to sparse model.
-
-    Parameters
-    ----------
-    mod : tvm.Module
-        The dense model.
-    params : Dict[Srting, tvm.nd.array]
-        Parameters of the dense model.
-    random_params : Bool = False
-        True to replace the parameters of the dense model with some random sparse tensors.
-        This is mainly used for testing.
-    bs_r : int
-        The row of BSR matrix block.
-    bs_c : int
-        The column of BSR matrix block.
-    sparsity : float
-        The sparsity of the random sparse parameters.
-    layout : str
-        layout of network
-
-    Returns
-    -------
-    tvm.Module
-        The updated sparse model.
-    Dict[Srting, tvm.nd.array]
-        The updated parameters.
-    """
-
-    mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params)
-    if random_params:
-        # Manually replace the parameters of dense to sparse tensors
-        params = random_sparse_dense_params(mod, params, bs_r=bs_r, bs_c=bs_c, density=1 - sparsity)
-        # Manually replace the parameters of conv2d to sparse tensors
-        params = random_sparse_conv2d_params(
-            mod, params, bs_r=bs_r, bs_c=bs_c, density=1 - sparsity, layout=layout
-        )
-    # convert dense matmul to sparse matmul
-    mod, params = ddo.bsr_dense.convert(mod, params, (bs_r, bs_c), sparsity_threshold=0.8)
-    # convert dense conv2d to sparse conv2d
-    mod, params = ddo.bsr_conv2d.convert(
-        mod, params, (bs_r, bs_c), sparsity_threshold=0.8, layout=layout
-    )
-
-    return tvm.IRModule.from_expr(mod), params
-
-
-def sparse_sketch_rules():
-    """Return the sketch rules for sparse op"""
-    sparse_sketch_rule_list = [
-        auto_scheduler.PreloadCustomSketchRule(
-            sparse_conv2d_meet_condition_func, sparse_conv2d_apply_func, "SparseConv2D"
-        ),
-        auto_scheduler.PreloadCustomSketchRule(
-            sparse_dense_meet_condition_func, sparse_dense_apply_func, "SparseDense"
-        ),
-        # Add more sketch rules for sparse
-    ]
-    return sparse_sketch_rule_list
-
-
-def sparse_conv2d_meet_condition_func(search_policy, state, stage_id):
-    state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
-    if state.stages[stage_id].op.tag in [
-        "sparse_conv2d_sp_bsrmm",
-        "sparse_conv2d_sp_bsrmm_block",
-    ]:
-        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
-    return auto_scheduler.PreloadCustomSketchRule.PASS
-
-
-def sparse_conv2d_apply_func(search_policy, state, stage_id):
-    """Describe how to generate the initial sketch for sparse conv2d"""
-    ret = []
-    s_0 = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
-    if s_0.stages[stage_id].op.tag == "sparse_conv2d_sp_bsrmm_block":
-        return [s_0.state_object, stage_id - 1]
-
-    sparse_conv2d = s_0.stages[stage_id].op
-    sparse_conv2d_block = s_0.stages[stage_id - 1].op
-    assert sparse_conv2d.tag == "sparse_conv2d_sp_bsrmm"
-    assert sparse_conv2d_block.tag == "sparse_conv2d_sp_bsrmm_block"
-    layout = sparse_conv2d.attrs["layout"]
-
-    # Set the default consumer of compute block
-    consumer = sparse_conv2d
-
-    # If sparse conv2d has a single elementwise consumer
-    # We can compute inline the sparse_conv2d output stage
-    consumers = _ffi_api.SearchPolicyUtilsGetConsumers(
-        search_policy.search_task, s_0.state_object, stage_id
-    )
-    if len(consumers) == 1:
-        consumer_id = int(consumers.items()[0][0])
-        if _ffi_api.SearchPolicyUtilsIsElementwiseMatch(
-            search_policy.search_task, s_0.state_object, stage_id, consumer_id
-        ):
-            consumer = s_0.stages[consumer_id].op
-            s_0.compute_inline(sparse_conv2d)
-
-    c = None
-    if layout == "NHWC":
-        if len(s_0[sparse_conv2d_block].iters) == 6:
-            # bs_c = 1
-            i, h, w, nb_j, j, row_offset = s_0[  # pylint: disable=invalid-name
-                sparse_conv2d_block
-            ].iters
-        else:
-            i, h, w, nb_j, j, row_offset, c = s_0[  # pylint: disable=invalid-name
-                sparse_conv2d_block
-            ].iters
-        m, x, y, n = s_0[consumer].iters
-    elif layout == "NCHW":
-        if len(s_0[sparse_conv2d_block].iters) == 6:
-            # bs_c = 1
-            i, nb_j, j, h, w, row_offset = s_0[  # pylint: disable=invalid-name
-                sparse_conv2d_block
-            ].iters
-        else:
-            i, nb_j, j, h, w, row_offset, c = s_0[  # pylint: disable=invalid-name
-                sparse_conv2d_block
-            ].iters
-        m, n, x, y = s_0[consumer].iters
-
-    i_0, i_1, i_2 = s_0.split(sparse_conv2d_block, i, [None, None])
-    m_0, m_1 = s_0.follow_split(consumer, m, len(s_0.transform_steps) - 1, 1)
-    h_0, h_1, h_2 = s_0.split(sparse_conv2d_block, h, [None, None])
-    x_0, x_1 = s_0.follow_split(consumer, x, len(s_0.transform_steps) - 1, 1)
-    w_0, w_1, w_2 = s_0.split(sparse_conv2d_block, w, [None, None])  # pylint: disable=invalid-name
-    y_0, y_1 = s_0.follow_split(consumer, y, len(s_0.transform_steps) - 1, 1)
-    j_0, j_1 = s_0.split(sparse_conv2d_block, nb_j, [None])
-    n_0, n_1 = s_0.follow_split(consumer, n, len(s_0.transform_steps) - 1, 1)
-    if layout == "NHWC":
-        if c is None:
-            s_0.reorder(
-                sparse_conv2d_block,
-                [i_0, h_0, w_0, j_0, i_1, h_1, w_1, j_1, row_offset, i_2, h_2, w_2, j],
-            )
-        else:
-            s_0.reorder(
-                sparse_conv2d_block,
-                [i_0, h_0, w_0, j_0, i_1, h_1, w_1, j_1, row_offset, i_2, h_2, w_2, j, c],
-            )
-        s_0.reorder(consumer, [m_0, x_0, y_0, n_0, m_1, x_1, y_1, n_1])
-    elif layout == "NCHW":
-        if c is None:
-            s_0.reorder(
-                sparse_conv2d_block,
-                [i_0, j_0, h_0, w_0, i_1, j_1, h_1, w_1, row_offset, i_2, j, h_2, w_2],
-            )
-        else:
-            s_0.reorder(
-                sparse_conv2d_block,
-                [i_0, j_0, h_0, w_0, i_1, j_1, h_1, w_1, row_offset, i_2, j, c, h_2, w_2],
-            )
-        s_0.reorder(consumer, [m_0, n_0, x_0, y_0, m_1, n_1, x_1, y_1])
-    s_0.compute_at(sparse_conv2d_block, consumer, n_0)
-
-    ret.append([s_0.state_object, stage_id - 2])
-
-    return ret
-
-
-def sparse_dense_meet_condition_func(search_policy, state, stage_id):
-    state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
-    if state.stages[stage_id].op.tag in [
-        "sparse_dense_sp_rhs_bsrmm",
-        "sparse_dense_sp_rhs_bsrmm_block",
-    ]:
-        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
-    return auto_scheduler.PreloadCustomSketchRule.PASS
-
-
-def sparse_dense_apply_func(search_policy, state, stage_id):
-    """Describe how to generate the initial sketch for sparse dense"""
-    ret = []
-    s_0 = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
-    if s_0.stages[stage_id].op.tag == "sparse_dense_sp_rhs_bsrmm_block":
-        return [s_0.state_object, stage_id - 1]
-
-    sparse_dense = s_0.stages[stage_id].op
-    sparse_dense_block = s_0.stages[stage_id - 1].op
-    assert sparse_dense.tag == "sparse_dense_sp_rhs_bsrmm"
-    assert sparse_dense_block.tag == "sparse_dense_sp_rhs_bsrmm_block"
-
-    # Set the default consumer of compute block
-    consumer = sparse_dense
-
-    # If sparse dense has a single elementwise consumer
-    # We can compute inline the sparse_dense output stage
-    consumers = _ffi_api.SearchPolicyUtilsGetConsumers(
-        search_policy.search_task, s_0.state_object, stage_id
-    )
-    if len(consumers) == 1:
-        consumer_id = int(consumers.items()[0][0])
-        if _ffi_api.SearchPolicyUtilsIsElementwiseMatch(
-            search_policy.search_task, s_0.state_object, stage_id, consumer_id
-        ):
-            consumer = s_0.stages[consumer_id].op
-            s_0.compute_inline(sparse_dense)
-
-    i, nb_j, j, row_offset, c = s_0[sparse_dense_block].iters
-    m, n = s_0[consumer].iters
-    i_0, i_1, i_2 = s_0.split(sparse_dense_block, i, [None, None])
-    m_0, m_1 = s_0.follow_split(consumer, m, len(s_0.transform_steps) - 1, 1)
-    j_0, j_1 = s_0.split(sparse_dense_block, nb_j, [None])
-    n_0, n_1 = s_0.follow_split(consumer, n, len(s_0.transform_steps) - 1, 1)
-    s_0.reorder(sparse_dense_block, [i_0, j_0, i_1, j_1, row_offset, i_2, j, c])
-    s_0.reorder(consumer, [m_0, n_0, m_1, n_1])
-    s_0.compute_at(sparse_dense_block, consumer, n_0)
-
-    ret.append([s_0.state_object, stage_id - 2])
-
-    return ret
diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py
index 1486e9986e0e..55bcda6a309c 100644
--- a/python/tvm/topi/testing/__init__.py
+++ b/python/tvm/topi/testing/__init__.py
@@ -65,15 +65,6 @@
 from .depth_to_space import depth_to_space_python
 from .space_to_depth import space_to_depth_python
 from .crop_and_resize_python import crop_and_resize_python
-from .common import (
-    compare_numpy_tvm,
-    get_injective_schedule,
-    get_reduce_schedule,
-    get_broadcast_schedule,
-    get_elemwise_schedule,
-    get_conv2d_nchw_implement,
-    dispatch,
-)
 from .adaptive_pool_python import adaptive_pool
 from .grid_sample_python import affine_grid_python, grid_sample_python
 from .matrix_set_diag import matrix_set_diag
diff --git a/python/tvm/topi/testing/common.py b/python/tvm/topi/testing/common.py
index c84c5eaa1e27..8c99087f5c7b 100644
--- a/python/tvm/topi/testing/common.py
+++ b/python/tvm/topi/testing/common.py
@@ -20,98 +20,6 @@
 import numpy as np
 import scipy.signal
 
-import tvm
-from tvm import topi
-from tvm.testing import assert_allclose
-
-_injective_schedule = {
-    "generic": topi.generic.schedule_injective,
-    "cpu": topi.x86.schedule_injective,
-    "arm_cpu": topi.arm_cpu.schedule_injective,
-    "gpu": topi.cuda.schedule_injective,
-    "hls": topi.hls.schedule_injective,
-    "adreno": topi.adreno.schedule_injective,
-}
-
-_reduce_schedule = {
-    "generic": topi.generic.schedule_reduce,
-    "cpu": topi.x86.schedule_reduce,
-    "gpu": topi.cuda.schedule_reduce,
-    "hls": topi.cuda.schedule_reduce,
-}
-
-
-def dispatch(target, dispatch_map):
-    if isinstance(target, str):
-        target = tvm.target.Target(target)
-    assert isinstance(target, tvm.target.Target)
-    for key in target.keys:
-        if key in dispatch_map:
-            return dispatch_map[key]
-    return dispatch_map["generic"]
-
-
-def get_injective_schedule(target):
-    return dispatch(target, _injective_schedule)
-
-
-def get_reduce_schedule(target):
-    return dispatch(target, _reduce_schedule)
-
-
-get_broadcast_schedule = get_injective_schedule
-get_elemwise_schedule = get_injective_schedule
-
-_conv2d_nchw_implement = {
-    "generic": (topi.nn.conv2d_nchw, topi.generic.schedule_conv2d_nchw),
-    "cpu": (topi.x86.conv2d_nchw, topi.x86.schedule_conv2d_nchw),
-    "arm_cpu": (
-        topi.arm_cpu.conv2d_nchw_spatial_pack,
-        topi.arm_cpu.schedule_conv2d_nchw_spatial_pack,
-    ),
-    "gpu": (topi.cuda.conv2d_nchw, topi.cuda.schedule_conv2d_nchw),
-    "mali": (topi.mali.conv2d_nchw_spatial_pack, topi.mali.schedule_conv2d_nchw_spatial_pack),
-    "bifrost": (
-        topi.bifrost.conv2d_nchw_spatial_pack,
-        topi.bifrost.schedule_conv2d_nchw_spatial_pack,
-    ),
-    "intel_graphics": (topi.intel_graphics.conv2d_nchw, topi.intel_graphics.schedule_conv2d_nchw),
-    "hls": (topi.nn.conv2d_nchw, topi.hls.schedule_conv2d_nchw),
-}
-
-
-def get_conv2d_nchw_implement(target):
-    return dispatch(target, _conv2d_nchw_implement)
-
-
-def compare_numpy_tvm(inputs, output, target, device, compute, schedule):
-    """Compare a numpy inputs and output of a function to the results of the TVM version.
-
-    Parameters
-    ----------
-    inputs : Sequence[numpy.nd.array]
-        List of input numpy arrays to pass to the function.
-    output : numpy.nd.array
-        Verified correct function output.
-    target : tvm.target.Target
-        Target to run on.
-    device : tvm.runtime.Device
-        Context to run on.
-    compute : callable
-        Topi compute function to test against.
-    schedule : callable
-        Topi scheduling function to test against.
-    """
-    te_inputs = [tvm.te.placeholder(shape=i.shape, dtype=str(i.dtype)) for i in inputs]
-    te_out = tvm.nd.array(np.zeros(output.shape).astype(output.dtype), device=device)
-    with tvm.target.Target(target):
-        out = compute(*te_inputs)
-        s = schedule([out])
-        func = tvm.build(s, te_inputs + [out])
-        arys = [tvm.nd.array(x, device=device) for x in inputs]
-        func(*(arys + [te_out]))
-        assert_allclose(te_out.numpy(), output, atol=1e-4, rtol=1e-4)
-
 
 def _convolve2d(data, weights):
     """2d convolution operator in HW layout.
diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py
deleted file mode 100644
index a54b156380d0..000000000000
--- a/python/tvm/topi/x86/__init__.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=redefined-builtin, wildcard-import
-"""x86 specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
-from .conv1d import *
-from .conv2d import *
-from .conv3d import *
-from .binarize_pack import schedule_binarize_pack
-from .binary_dense import schedule_binary_dense
-from .nn import *
-from .conv2d_int8 import *
-from .injective import *
-from .reduction import *
-from .pooling import schedule_pool, schedule_adaptive_pool
-from .bitserial_conv2d import *
-from .bitserial_dense import *
-from .depthwise_conv2d import *
-from .dense import *
-from .batch_matmul import *
-from .roi_align import roi_align_nchw
-from .conv2d_transpose import *
-from .conv3d_transpose import *
-from .sparse import *
-from .conv2d_alter_op import *
-from .dense_alter_op import *
-from .group_conv2d import *
-from .math_alter_op import *
-from .concat import *
diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py
deleted file mode 100644
index e10313323089..000000000000
--- a/python/tvm/topi/x86/batch_matmul.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-locals,unused-variable
-# pylint: disable=unused-argument
-"""x86 batch_matmul operators"""
-import tvm
-from tvm import autotvm, te
-from tvm.autotvm.task.space import SplitEntity
-from tvm.contrib import cblas, mkl
-from tvm.target.codegen import target_has_features
-
-from .. import generic, nn
-from ..transform import layout_transform
-from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline
-from .dense import dense_amx_int8_schedule, dense_int8_schedule
-from .injective import schedule_injective_from_existing
-
-
-@autotvm.register_topi_compute("batch_matmul_int8.x86")
-def batch_matmul_int8_compute(cfg, x, y, *_):
-    """Compute for uint8 x int8 -> int32 batch_matmul"""
-    batch, m, k = x.shape
-    packed_y_layout = "BNK16n4k"
-    packed_y = layout_transform(y, "BNK", packed_y_layout)
-    _, n_o, _, n_i, _ = packed_y.shape
-    ak = te.reduce_axis((0, k), name="k")
-    if target_has_features(["avx512bw", "avx512f"]):
-        attrs_info = {"schedule_rule": "batch_matmul_int8"}
-    else:
-        attrs_info = None
-
-    z = te.compute(
-        (batch, m, n_o * n_i),
-        lambda b, i, j: te.sum(
-            x[b, i, ak].astype("int32")
-            * packed_y[b, tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4), j % 16, ak % 4].astype(
-                "int32"
-            ),
-            axis=ak,
-        ),
-        tag="batch_matmul_int8",
-        attrs=attrs_info,
-    )
-
-    return z
-
-
-def batch_matmul_int8_schedule(cfg, s, C, O, layout_trans):
-    """Schedule batch_matmul compute using avx512 or lower instructions
-    including VNNI vpdpbusd instruction if possible"""
-    # C: The output of batched GEMM
-    # O: The output of the fused op
-
-    # Schedule the GEMM part
-    s, fused_inner = dense_int8_schedule(cfg, s, C, O, do_parallel=False)
-    # Parallelize over batch
-    fused = s[O].fuse(O.op.axis[0], fused_inner)
-    s[O].parallel(fused)
-    cfg.define_knob("layout_trans_compute_root", [0, 1])
-
-    if cfg["layout_trans_compute_root"].val:
-        s[layout_trans].compute_root()
-        schedule_injective_from_existing(s, layout_trans)
-    else:
-        s[layout_trans].compute_at(s[O], fused)
-        _, _, _, ni, ki = s[layout_trans].op.axis
-        s[layout_trans].vectorize(ki)
-        s[layout_trans].unroll(ni)
-
-    return s
-
-
-def batch_matmul_amx_schedule(cfg, s, C, O, layout_trans):
-    """Schedule batch_matmul compute using AMX tdpbusd instruction"""
-    # C: The output of batched GEMM
-    # O: The output of the fused op
-
-    # Schedule the GEMM part
-    s, fused_inner = dense_amx_int8_schedule(cfg, s, C, O, do_parallel=False)
-    # Parallelize over ouuter loop
-    fused = s[O].fuse(O.op.axis[0], fused_inner)
-    s[O].parallel(fused)
-    cfg.define_knob("layout_trans_compute_root", [0, 1])
-
-    if cfg["layout_trans_compute_root"].val:
-        s[layout_trans].compute_root()
-        schedule_injective_from_existing(s, layout_trans)
-    else:
-        _, _, _, ni, ki = s[layout_trans].op.axis
-        s[layout_trans].vectorize(ki)
-        s[layout_trans].unroll(ni)
-
-    return s
-
-
-@autotvm.register_topi_compute("batch_matmul.x86")
-def batch_matmul(
-    cfg, tensor_a, tensor_b, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """Compute batch matrix multiplication of `tensor_a` and `tensor_b`.
-
-    Both `tensor_a` and `tensor_b` can be transposed. For legacy reason, we use NT format
-    (transpose_a=False, transpose_b=True) by default.
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        Autotvm tuning space config file.
-
-    tensor_a : tvm.te.Tensor
-        3-D with shape [batch, M, K] or [batch, K, M].
-
-    tensor_b : tvm.te.Tensor
-        3-D with shape [batch, K, N] or [batch, N, K].
-
-    out_shape : List[Optional]
-        Explicit intended output shape of the computation. Can be useful in cases
-        with dynamic input shapes.
-
-    out_dtype : Optional[str]
-        Specifies the output data type for mixed precision batch matmul.
-
-    transpose_a : Optional[bool] = False
-        Whether the first tensor is in transposed format.
-
-    transpose_b : Optional[bool] = True
-        Whether the second tensor is in transposed format.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    if cfg.is_fallback:
-        if transpose_a:
-            _, K, M = get_const_tuple(tensor_a.shape)
-        else:
-            _, M, K = get_const_tuple(tensor_a.shape)
-        if transpose_b:
-            _, N, _ = get_const_tuple(tensor_b.shape)
-        else:
-            _, _, N = get_const_tuple(tensor_b.shape)
-        _default_batch_matmul_config(cfg, M, N, K)
-    return nn.batch_matmul(
-        tensor_a,
-        tensor_b,
-        out_shape,
-        out_dtype,
-        transpose_a,
-        transpose_b,
-    )
-
-
-@autotvm.register_topi_schedule("batch_matmul.x86")
-def schedule_batch_matmul(cfg, outs):
-    """Schedule for batch_matmul
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        AutoTVM tuning space config file.
-    outs : Array of Tensor
-        The computation graph description of batch_matmul
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "batch_matmul" in op.tag:
-            C = op.output(0)
-            A, B = op.input_tensors
-            if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A:
-                s[B].compute_inline()
-            _, M, K = get_const_tuple(A.shape)
-            _, _, N = get_const_tuple(C.shape)
-
-            if op not in s.outputs:
-                s[C].compute_inline()
-                O = outs[0]
-            else:
-                O = C
-
-            CC = s.cache_write(C, "global")
-
-            # create tuning space
-            cfg.define_split("tile_y", M, num_outputs=2)
-            cfg.define_split("tile_x", N, num_outputs=2)
-            cfg.define_split("tile_k", K, num_outputs=2)
-
-            b, y, x = s[O].op.axis
-            yo, yi = cfg["tile_y"].apply(s, O, y)
-            xo, xi = cfg["tile_x"].apply(s, O, x)
-            s[O].reorder(b, yo, xo, yi, xi)
-            bxyo = s[O].fuse(b, yo, xo)
-            s[O].parallel(bxyo)
-
-            s[CC].compute_at(s[O], bxyo)
-            (k,) = s[CC].op.reduce_axis
-            ko, ki = cfg["tile_k"].apply(s, CC, k)
-
-            Crf = s.rfactor(CC, ki)
-            s[Crf].compute_at(s[CC], s[CC].op.axis[0])
-            _, _, y, x = s[Crf].op.axis
-            s[Crf].fuse(y, x)
-            s[Crf].vectorize(s[Crf].op.axis[0])
-            s[O].pragma(bxyo, "auto_unroll_max_step", 16)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_schedule("batch_matmul_int8.x86")
-def schedule_batch_matmul_int8(cfg, outs):
-    """Schedule for batch_matmul_int8"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "batch_matmul_int8" in op.tag:
-            layout_trans = op.input_tensors[1]
-            if target_has_features("amx-int8"):
-                batch_matmul_amx_schedule(cfg, s, op.output(0), outs[0], layout_trans)
-            elif target_has_features(["avx512bw", "avx512f"]):
-                batch_matmul_int8_schedule(cfg, s, op.output(0), outs[0], layout_trans)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _default_batch_matmul_config(cfg, M, N, K):
-    cfg["tile_k"] = SplitEntity([K // 16, 16])
-    x_bn = get_max_power2_factor(N, 8)
-    cfg["tile_x"] = SplitEntity([N // x_bn, x_bn])
-    y_bn = get_max_power2_factor(M, 8)
-    cfg["tile_y"] = SplitEntity([M // y_bn, y_bn])
-
-
-def batch_matmul_blas_common(cfg, tensor_a, tensor_b, out_shape, trans_a, trans_b, lib):
-    """Computes batch matrix multiplication of `tensor_a` and `tensor_b` when `tensor_a` and
-    `tensor_b` are data in batch, using one of BLAS libraries. Supports broadcasting in batch
-    dimension.
-
-    Parameters
-    ----------
-    cfg : ConfigSpace
-        Autotvm tuning space config file
-
-    tensor_a : tvm.te.Tensor
-        3-D with shape [batch, M, K] or [batch, K, M].
-
-    tensor_b : tvm.te.Tensor
-        3-D with shape [batch, K, N] or [batch, N, K].
-
-    out_shape : List[Optional]
-        Explicit intended output shape of the computation. Can be useful in cases
-        with dynamic input shapes.
-
-    trans_a : Optional[bool] = False
-        Whether the first tensor is in transposed format.
-
-    trans_b : Optional[bool] = True
-        Whether the second tensor is in transposed format.
-
-    lib : A contrib module which implements batch_matmul function
-        cblas and mkl are supported
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D with shape [batch, M, N]
-    """
-    assert len(tensor_a.shape) == 3 and len(tensor_b.shape) == 3, "only support 3-dim batch_matmul"
-    if trans_a:
-        XB, XK, M = get_const_tuple(tensor_a.shape)
-    else:
-        XB, M, XK = get_const_tuple(tensor_a.shape)
-    if trans_b:
-        YB, N, YK = get_const_tuple(tensor_b.shape)
-    else:
-        YB, YK, N = get_const_tuple(tensor_a.shape)
-    assert (XB == YB) or (YB == 1) or (XB == 1), "batch dimension doesn't match"
-    assert XK == YK, "shapes of x and y is inconsistent"
-    if out_shape is not None:
-        assert out_shape[0] in (XB, YB), "got invalid output shape"
-        assert out_shape[1] == M, "got invalid output shape"
-        assert out_shape[2] == N, "got invalid output shape"
-    cfg.add_flop(XB * M * N * XK * 2)
-    return lib.batch_matmul(tensor_a, tensor_b, trans_a, trans_b)
-
-
-@autotvm.register_topi_compute("batch_matmul_cblas.x86")
-def batch_matmul_cblas(
-    cfg, tensor_a, tensor_b, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """Compute batch_matmul using cblas"""
-    del out_dtype  # Unused argument
-    return batch_matmul_blas_common(
-        cfg, tensor_a, tensor_b, out_shape, transpose_a, transpose_b, cblas
-    )
-
-
-@autotvm.register_topi_schedule("batch_matmul_cblas.x86")
-def schedule_batch_matmul_cblas(_, outs):
-    """Create schedule for batch_matmul_cblas"""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("batch_matmul_mkl.x86")
-def batch_matmul_mkl(
-    cfg, tensor_a, tensor_b, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
-):
-    """Compute batch_matmul using mkl"""
-    del out_dtype  # Unused argument
-    return batch_matmul_blas_common(
-        cfg, tensor_a, tensor_b, out_shape, transpose_a, transpose_b, mkl
-    )
-
-
-@autotvm.register_topi_schedule("batch_matmul_mkl.x86")
-def schedule_batch_matmul_mkl(_, outs):
-    """Create schedule for batch_matmul_mul"""
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/x86/binarize_pack.py b/python/tvm/topi/x86/binarize_pack.py
deleted file mode 100644
index 53c346c37969..000000000000
--- a/python/tvm/topi/x86/binarize_pack.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Schedule for binarization and bit-packing."""
-from tvm import te
-
-
-def schedule_binarize_pack(outs):
-    """Schedule for binarize_pack.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of binarize_pack
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for binarize_pack.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(Out):
-        s[Out].parallel(Out.op.axis[0])
-
-    def traverse(OP):
-        # schedule binarize_pack
-        if OP.tag == "binarize_pack":
-            Out = OP.output(0)
-            _schedule(Out)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/x86/binary_dense.py b/python/tvm/topi/x86/binary_dense.py
deleted file mode 100644
index 0940af4fb161..000000000000
--- a/python/tvm/topi/x86/binary_dense.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument
-"""Schedule for binary dense operator."""
-from tvm import te
-from .. import tag
-
-
-def schedule_binary_dense(outs):
-    """Schedule for binary_dense.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of binary_dense
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for binary_dense.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def _schedule(A, B, C):
-        s[C].split(s[C].op.reduce_axis[0], factor=8)
-        s[C].parallel(s[C].op.axis[0])
-        if C.op in s.outputs:
-            Out = C
-        else:
-            Out = outs[0].op.output(0)
-        xo, xi = s[Out].split(Out.op.axis[1], factor=8)
-        s[Out].vectorize(xi)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule binary_dense
-        elif OP.tag == "binary_dense":
-            output = OP.output(0)
-            data = OP.input_tensors[0]
-            weight = OP.input_tensors[1]
-            _schedule(data, weight, output)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/x86/bitserial_conv2d.py b/python/tvm/topi/x86/bitserial_conv2d.py
deleted file mode 100644
index 73c9dd56517f..000000000000
--- a/python/tvm/topi/x86/bitserial_conv2d.py
+++ /dev/null
@@ -1,562 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,invalid-name
-"""Bitserial conv2d schedule on x86"""
-import tvm
-from tvm import te
-from tvm import autotvm
-from .. import tag
-from ..utils import get_const_int, get_const_tuple
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple
-from ..nn.bitserial_util import bitpack, binary_op_multiplier
-
-
-@autotvm.register_topi_compute("bitserial_conv2d_nchw.x86")
-def bitserial_conv2d_nchw(
-    cfg,
-    data,
-    kernel,
-    stride,
-    padding,
-    in_bits,
-    weight_bits,
-    pack_dtype="uint32",
-    out_dtype="int16",
-    unipolar=True,
-):
-    """Compute convolution with pack on spatial axes."""
-    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
-    data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype)
-    # Check if kernel is already bitpacked
-    if len(kernel.shape) == 4:
-        kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype)
-        KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape)
-    else:
-        kernel_vec = kernel
-        OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape)
-        CO = OCO * VC
-
-    IB, N, CI, H, W = get_const_tuple(data_q.shape)
-    KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape)
-
-    if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2):
-        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
-    else:
-        TPAD, LPAD, DPAD, RPAD = padding
-    pad_before = [0, 0, 0, TPAD, LPAD]
-    pad_after = [0, 0, 0, DPAD, RPAD]
-
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-    HCAT, WCAT = KH - 1, KW - 1
-
-    TH = H + TPAD + DPAD
-    TW = W + LPAD + RPAD
-    OH = (H + TPAD + DPAD - KH) // HSTR + 1
-    OW = (W + LPAD + RPAD - KW) // WSTR + 1
-
-    # ==================== define configuration space ====================
-    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
-    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-    ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)
-
-    co, vc = cfg.define_split("tile_co", co, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16)
-    oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16)
-    ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16)
-    cfg.define_annotate("ann_reduce", [ib, kb, kh, kw], policy="try_unroll")
-
-    cfg.define_reorder(
-        "reorder_0",
-        [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci],
-        policy="interval_all",
-        interval=(6, 11),
-    )
-    # binary ops
-    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype))
-    # ====================
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    dvshape = (1, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT, VW * WSTR + WCAT, IB)
-    kvshape = (CO // VC, CI, KH, KW, KB, VC)
-    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (1, CO, OH, OW)
-
-    if TPAD != 0 and RPAD != 0:
-        data_pad = pad(data_q, pad_before, pad_after, name="data_pad")
-    else:
-        data_pad = data_q
-
-    data_vec = te.compute(
-        dvshape,
-        lambda n, h, w, ci, vh, vw, b: data_pad[b][n][ci][h * VH * HSTR + vh][w * VW * WSTR + vw],
-        name="data_vec",
-    )
-
-    if len(kernel.shape) == 4:
-        kernel_vec = te.compute(
-            kvshape,
-            lambda co, ci, dh, dw, b, vc: kernel_q[b][co * VC + vc][ci][dh][dw],
-            name="kernel_vec",
-        )
-
-    ci = te.reduce_axis((0, CI), name="ci")
-    dh = te.reduce_axis((0, KH), name="dh")
-    dw = te.reduce_axis((0, KW), name="dw")
-    b1 = te.reduce_axis((0, IB), name="ib")
-    b2 = te.reduce_axis((0, KB), name="kb")
-
-    def _conv(n, co, h, w, vh, vw, vc):
-        b1b2 = (b1 + b2).astype(out_dtype)
-        if unipolar:
-            return te.sum(
-                (
-                    tvm.tir.popcount(
-                        data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw, b1].astype(out_dtype)
-                        & kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)
-                    )
-                    - tvm.tir.popcount(
-                        data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw, b1].astype(out_dtype)
-                        & ~kernel_vec[co, ci, dh, dw, b2, vc]
-                    ).astype(out_dtype)
-                )
-                << b1b2,
-                axis=[ci, dh, dw, b1, b2],
-            )
-
-        return te.sum(
-            (
-                tvm.tir.popcount(
-                    data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw, b1]
-                    & kernel_vec[co, ci, dh, dw, b2, vc]
-                )
-            ).astype(out_dtype)
-            << b1b2,
-            axis=[ci, dh, dw, b1, b2],
-        )
-
-    conv = te.compute(ovshape, _conv, name="conv_out")
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    return te.compute(
-        oshape,
-        lambda n, co, h, w: conv[
-            n, idxd(co, VC), idxd(h, VH), idxd(w, VW), idxm(h, VH), idxm(w, VW), idxm(co, VC)
-        ],
-        name="conv_vec",
-        tag="spatial_bitserial_conv_nchw",
-    )
-
-
-@autotvm.register_topi_compute("bitserial_conv2d_nhwc.x86")
-def bitserial_conv2d_nhwc(
-    cfg,
-    data,
-    kernel,
-    stride,
-    padding,
-    in_bits,
-    weight_bits,
-    pack_dtype="uint32",
-    out_dtype="int16",
-    unipolar=True,
-):
-    """Compute convolution with pack on spatial axes."""
-    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
-    data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype)
-    pack_kernel = len(kernel.shape) == 4
-
-    if pack_kernel:
-        kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype)
-    else:
-        kernel_q = kernel
-
-    KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape)
-    N, H, W, CI, IB = get_const_tuple(data_q.shape)
-
-    if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2):
-        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
-    else:
-        TPAD, LPAD, DPAD, RPAD = padding
-    pad_before = [0, TPAD, LPAD, 0, 0]
-    pad_after = [0, DPAD, RPAD, 0, 0]
-
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-    HCAT, WCAT = KH - 1, KW - 1
-
-    PAD_H = H + (TPAD + DPAD)
-    PAD_W = W + (LPAD + RPAD)
-    OH = (PAD_H - KH) // HSTR + 1
-    OW = (PAD_W - KW) // WSTR + 1
-    oshape = (1, OH, OW, CO)
-
-    # ==================== define configuration space ====================
-    n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO)
-    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
-    ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)
-
-    co, vc = cfg.define_split("tile_co", co, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16)
-    oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16)
-    ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16)
-    cfg.define_annotate("ann_reduce", [ib, kb, kh, kw], policy="try_unroll")
-    cfg.define_reorder(
-        "reorder_0",
-        [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci],
-        policy="interval_all",
-        interval=(3, 7),
-    )
-    # binary ops
-    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype))
-    # ====================
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    dvshape = (
-        1,
-        PAD_H // (VH * HSTR),
-        PAD_W // (VW * WSTR),
-        VH * HSTR + HCAT,
-        VW * WSTR + WCAT,
-        CI,
-        IB,
-    )
-    kvshape = (CO, KH, KW, CI, VC, KB)
-    ovshape = (1, OH, OW, CO, VH, VW, VC)
-    oshape = (1, OH, OW, CO)
-
-    if DPAD != 0 and RPAD != 0:
-        data_pad = pad(data_q, pad_before, pad_after, name="data_pad")
-    else:
-        data_pad = data_q
-
-    data_vec = te.compute(
-        dvshape,
-        lambda n, h, w, vh, vw, ci, b: data_pad[n][h * VH * HSTR + vh][w * VW * WSTR + vw][ci][b],
-        name="data_vec",
-    )
-
-    kernel_vec = te.compute(
-        kvshape,
-        lambda co, dh, dw, ci, vc, b: kernel_q[dh][dw][ci][co * VC + vc][b],
-        name="kernel_vec",
-    )
-
-    ci = te.reduce_axis((0, CI), name="ci")
-    dh = te.reduce_axis((0, KH), name="dh")
-    dw = te.reduce_axis((0, KW), name="dw")
-    b1 = te.reduce_axis((0, IB), name="ib")
-    b2 = te.reduce_axis((0, KB), name="kb")
-
-    def _conv(n, h, w, co, vh, vw, vc):
-        b1b2 = (b1 + b2).astype(out_dtype)
-        if unipolar:
-            return te.sum(
-                (
-                    (
-                        tvm.tir.popcount(
-                            data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1]
-                            & kernel_vec[co, dh, dw, ci, vc, b2]
-                        ).astype(out_dtype)
-                        - tvm.tir.popcount(
-                            data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1]
-                            & ~kernel_vec[co, dh, dw, ci, vc, b2]
-                        ).astype(out_dtype)
-                    )
-                    << b1b2
-                ),
-                axis=[dh, dw, ci, b1, b2],
-            )
-
-        return te.sum(
-            tvm.tir.popcount(
-                data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1]
-                & kernel_vec[co, dh, dw, ci, vc, b2]
-            ).astype(out_dtype)
-            << b1b2,
-            axis=[dh, dw, ci, b1, b2],
-        )
-
-    conv = te.compute(ovshape, _conv, name="conv")
-
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-    return te.compute(
-        oshape,
-        lambda n, h, w, co: conv[
-            n, idxd(h, VH), idxd(w, VW), idxd(co, VC), idxm(h, VH), idxm(w, VW), idxm(co, VC)
-        ],
-        name="output_unpack",
-        tag="spatial_bitserial_conv_nhwc",
-    )
-
-
-@autotvm.register_topi_schedule("bitserial_conv2d_nchw.x86")
-def schedule_bitserial_conv2d_nchw(cfg, outs):
-    return _schedule_bitserial_conv2d(cfg, outs)
-
-
-@autotvm.register_topi_schedule("bitserial_conv2d_nhwc.x86")
-def schedule_bitserial_conv2d_nhwc(cfg, outs):
-    return _schedule_bitserial_conv2d(cfg, outs)
-
-
-def _schedule_bitserial_conv2d(cfg, outs):
-    """CPU schedule for bitserial convolutions NCHW and NHWC"""
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        output = op.output(0)
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag) or "elemwise" in op.tag:
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and (tensor.op not in scheduled_ops):
-                    if isinstance(tensor.op, tvm.te.ComputeOp):
-                        traverse(tensor.op)
-
-        elif "spatial_bitserial_conv_nchw" in op.tag or "spatial_bitserial_conv_nhwc" in op.tag:
-            conv_out = op.input_tensors[0]
-            kernel_vec = conv_out.op.input_tensors[1]
-            kernel_q = kernel_vec.op.input_tensors[0]
-            data_vec = conv_out.op.input_tensors[0]
-            data_q = data_vec.op.input_tensors[0]
-            data = data_q.op.input_tensors[0]
-            data_pad = None
-            if isinstance(data_q.op, tvm.te.ComputeOp) and "pad" in data_q.op.tag:
-                data_pad = data_q
-                data_q = data
-                data = data_q.op.input_tensors[0]
-
-            if "QuantizeInput" in data.op.name:
-                # Need to go up 1 further, from the combine in bitpack
-                data = data.op.input_tensors[0]
-
-            if "spatial_bitserial_conv_nchw" in op.tag:
-                _schedule_bitserial_conv2d_nchw(
-                    cfg,
-                    s,
-                    data_q,
-                    data_pad,
-                    data_vec,
-                    kernel_q,
-                    kernel_vec,
-                    conv_out,
-                    output,
-                    outs[0],
-                )
-            elif "spatial_bitserial_conv_nhwc" in op.tag:
-                _schedule_bitserial_conv2d_nhwc(
-                    cfg,
-                    s,
-                    data_q,
-                    data_pad,
-                    data_vec,
-                    kernel_q,
-                    kernel_vec,
-                    conv_out,
-                    output,
-                    outs[0],
-                )
-        scheduled_ops.append(op)
-
-    traverse(outs[0].op)
-    return s
-
-
-def _schedule_bitserial_conv2d_nchw(
-    cfg, s, data_q, data_pad, data_vec, kernel_q, kernel_vec, conv_out, output, last
-):
-    IB, _, CI, IH, IW = data_q.shape
-    KB, CO, _, KH, KW = kernel_q.shape
-    _, _, OH, OW = output.shape
-
-    # Infer padding and stride
-    if data_pad is None:
-        padding = (0, 0)
-        TH, TW = IH, IW
-    else:
-        _, _, _, TH, TW = data_pad.shape
-        hpad = get_const_int((TH - IH) // 2)
-        wpad = get_const_int((TW - IW) // 2)
-        padding = (hpad, wpad)
-
-    hstride = get_const_int((TH - KH) // (OH - 1))
-    wstride = get_const_int((TW - KW) // (OW - 1))
-    stride = (hstride, wstride)
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    ##### Schedule Data padding, and bitpacking
-    if data_pad is not None:
-        s[data_pad].compute_inline()
-
-    _, _, h, _, _, _, _ = s[data_vec].op.axis
-    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
-    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
-    if cfg["tile_ah"].size[1] == 1:
-        oaxis = oh
-        paxis = oh
-    else:
-        oaxis = oh
-        paxis = ih
-
-    s[data_vec].parallel(paxis)
-    s[data_vec].pragma(oaxis, "parallel_launch_point")
-    s[data_vec].pragma(paxis, "parallel_stride_pattern")
-    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")
-
-    ##### Schedule Kenerl bitpacking
-    co, _, _, _, _, _ = s[kernel_vec].op.axis
-    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
-    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
-    if cfg["tile_bco"].size[1] == 1:
-        oaxis = oco
-        paxis = oco
-    else:
-        oaxis = oco
-        paxis = ico
-
-    s[kernel_vec].parallel(paxis)
-    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
-    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
-    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")
-
-    ##### Schedule Convolution
-    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
-    ci, dh, dw, ib, kb = s[conv_out].op.reduce_axis
-
-    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
-    cfg["reorder_0"].apply(s, conv_out, [n, co, oh, ow, vc, vh, vw, dh, dw, kb, ib, ci])
-    cfg["ann_reduce"].apply(
-        s,
-        conv_out,
-        [kb, ib, dh, dw],
-        axis_lens=[
-            get_const_int(kb.dom.extent),
-            get_const_int(ib.dom.extent),
-            get_const_int(dh.dom.extent),
-            get_const_int(dw.dom.extent),
-        ],
-        max_unroll=16,
-        cfg=cfg,
-    )
-
-    s[conv_out].vectorize(vc)
-
-    # # Schedule output
-    n, co, h, w = s[last].op.axis
-    co, vc = s[last].split(co, VC)
-    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
-    s[last].reorder(n, co, oh, ow, vh, vw, vc)
-    if last != output:
-        s[output].compute_inline()
-    s[conv_out].compute_at(s[last], ow)
-
-    oco, ico = cfg["tile_oh"].apply(s, last, co)
-    if cfg["tile_oh"].size[1] == 1:
-        oaxis = oco
-        paxis = oco
-    else:
-        oco, ico = s[last].split(co, bc)
-        oaxis = oco
-        paxis = ico
-
-    s[last].parallel(oco)
-    return s
-
-
-def _schedule_bitserial_conv2d_nhwc(
-    cfg, s, data_q, data_pad, data_vec, kernel_q, kernel_vec, conv_out, output, last
-):
-    # no stride and padding info here
-    _, IH, IW, CI, IB = data_q.shape
-    KH, KW, _, CO, KB = kernel_q.shape
-    _, OH, OW, _ = output.shape
-
-    VC = cfg["tile_co"].size[-1]
-    VH = cfg["tile_oh"].size[-1]
-    VW = cfg["tile_ow"].size[-1]
-
-    ##### Schedule data padding and packing
-    if data_pad is not None:
-        s[data_pad].compute_inline()
-
-    _, h, _, _, _, _, _ = s[data_vec].op.axis
-    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
-    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
-    s[data_vec].parallel(oh)
-
-    ##### Schedule kernel packing
-    co, _, _, _, _, _ = s[kernel_vec].op.axis
-    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
-    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
-    s[kernel_vec].parallel(oco)
-
-    ##### Schedule Convolution
-    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
-    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis
-
-    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
-    cfg["reorder_0"].apply(s, conv_out, [n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2])
-    cfg["ann_reduce"].apply(
-        s,
-        conv_out,
-        [b1, b2, dh, dw],
-        axis_lens=[
-            get_const_int(b1.dom.extent),
-            get_const_int(b2.dom.extent),
-            get_const_int(dh.dom.extent),
-            get_const_int(dw.dom.extent),
-        ],
-        max_unroll=16,
-        cfg=cfg,
-    )
-
-    s[conv_out].unroll(b1)
-    s[conv_out].unroll(b2)
-    s[conv_out].vectorize(vc)
-
-    # # Schedule output
-    n, h, w, co = s[last].op.axis
-    co, vc = s[last].split(co, VC)
-    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
-    s[last].reorder(n, oh, ow, co, vh, vw, vc)
-    s[last].vectorize(vc)
-    if last != output:
-        s[output].compute_inline()
-    s[conv_out].compute_at(s[last], ow)
-
-    oho, iho = cfg["tile_oh"].apply(s, last, oh)  # reuse parameter
-    s[last].parallel(oho)
-
-    return s
diff --git a/python/tvm/topi/x86/bitserial_dense.py b/python/tvm/topi/x86/bitserial_dense.py
deleted file mode 100644
index 86c58b60eaf2..000000000000
--- a/python/tvm/topi/x86/bitserial_dense.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-arguments, condition-evals-to-constant
-"""Schedule for bitserial dense operator."""
-from __future__ import absolute_import as _abs
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.topi.utils import get_const_int, get_const_tuple
-from .. import tag
-from ..nn.bitserial_util import bitpack, binary_op_multiplier
-
-
-@autotvm.register_topi_compute("bitserial_dense.x86")
-def bitserial_dense(
-    cfg, data, weight, data_bits, weight_bits, pack_dtype="uint32", out_dtype="int16", unipolar=True
-):
-    """Bitserial dense implementation. TODO: Why are these separate
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        2-D with shape [batch, in_dim]
-    weight : tvm.te.Tensor
-        2-D with shape [out_dim, in_dim] or
-        3-D with shape [out_dim, weight_bits, in_dim]
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [batch, out_dim]
-    """
-    data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
-    if len(weight.shape) == 2:
-        weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
-    else:
-        weight_packed = weight
-    Y, DB, K = get_const_tuple(data_packed.shape)
-    X, WB, _ = get_const_tuple(weight_packed.shape)
-    ######## Search space
-    x, y = cfg.axis(X), cfg.axis(Y)
-    db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K)
-    ko, ki = cfg.define_split("tile_k", k, num_outputs=2)
-    yo, yi = cfg.define_split("tile_y", y, num_outputs=2)
-    xo, xi = cfg.define_split("tile_x", x, num_outputs=2)
-
-    cfg.define_reorder(
-        "reorder_0",
-        [yo, xo, ko, yi, wb, db, ki, xi],
-        policy="candidate",
-        candidate=[[yo, xo, ko, yi, wb, db, ki, xi], [yo, xo, yi, ko, wb, db, ki, xi]],
-    )
-
-    cfg.define_annotate("ann_reduce", [db, wb], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [yi, xi], policy="try_unroll_vec")
-
-    ###### Compute rule
-    VX = cfg["tile_x"].size[-1]
-
-    wvshape = (X // VX, WB, VX, K)
-    oshape = (Y, X)
-
-    k = te.reduce_axis((0, K), name="k")
-    db = te.reduce_axis((0, DB), name="db")
-    wb = te.reduce_axis((0, WB), name="wb")
-
-    # Tile data and weights
-    weight_vec = te.compute(
-        wvshape, lambda xo, wb, vx, k: weight_packed[xo * VX + vx][wb][k], name="weight_vec"
-    )
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    matmul_unipolar = te.compute(
-        oshape,
-        lambda i, j: te.sum(
-            (
-                tvm.tir.popcount(
-                    weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]
-                )
-                - tvm.tir.popcount(
-                    ~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]
-                )
-            ).astype(out_dtype)
-            << (db + wb).astype(out_dtype),
-            axis=[wb, db, k],
-        ),
-        tag="bitserial_dense_unipolar",
-    )
-
-    matmul = te.compute(
-        oshape,
-        lambda i, j: te.sum(
-            tvm.tir.popcount(
-                weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]
-            ).astype(out_dtype)
-            << (db + wb).astype(out_dtype),
-            axis=[wb, db, k],
-        ),
-        tag="bitserial_dense",
-    )
-
-    # binary ops
-    cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype))
-
-    if unipolar:
-        return matmul_unipolar
-    return matmul
-
-
-@autotvm.register_topi_schedule("bitserial_dense.x86")
-def schedule_bitserial_dense(cfg, outs):
-    """Schedule for bitserial_dense.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of bitserial dense operator.
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for bitserial_dense.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _schedule(cfg, s, data_vec, weight_vec, output):
-        s[data_vec].parallel(s[data_vec].op.axis[0])
-        s[weight_vec].parallel(s[weight_vec].op.axis[0])
-
-        y, x = s[output].op.axis
-        wb, db, k = s[output].op.reduce_axis
-
-        yo, yi = cfg["tile_y"].apply(s, output, y)
-        xo, xi = cfg["tile_x"].apply(s, output, x)
-        ko, ki = cfg["tile_k"].apply(s, output, k)
-
-        cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi])
-        cfg["ann_reduce"].apply(
-            s,
-            output,
-            [db, wb],
-            axis_lens=[get_const_int(db.dom.extent), get_const_int(wb.dom.extent)],
-            max_unroll=8,
-            cfg=cfg,
-        )
-        cfg["ann_spatial"].apply(
-            s,
-            output,
-            [yi, xi],
-            axis_lens=[cfg["tile_y"].size[-1], cfg["tile_x"].size[-1]],
-            max_unroll=8,
-            cfg=cfg,
-        )
-        s[output].vectorize(xi)
-        s[output].parallel(yo)
-        return s
-
-    def traverse(op):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag) or "elemwise" in op.tag:
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp):
-                    traverse(tensor.op)
-
-        elif op.tag == "bitserial_dense" or "bitserial_dense_unipolar":
-            output = op.output(0)
-            weight_vec = op.input_tensors[0]
-
-            data_vec = op.input_tensors[1]
-            data = data_vec.op.input_tensors[0]
-            if "QuantizeInput" in data.op.name:
-                data = data.op.input_tensors[0]
-            _schedule(cfg, s, data_vec, weight_vec, output)
-        else:
-            raise RuntimeError(f"Unsupported operator: {op.tag}")
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/x86/concat.py b/python/tvm/topi/x86/concat.py
deleted file mode 100644
index ae131686dca6..000000000000
--- a/python/tvm/topi/x86/concat.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"concatenate related operators"
-from typing import Optional
-import numpy as np
-import tvm
-from tvm import te
-from ..utils import get_const_int
-
-
-def concatenate(data: tvm.te.Tensor, axis: Optional[int] = 0):
-    """Join a sequence of arrays along an existing axis.
-    Optimized for CPU execution.
-
-    Parameters
-    ----------
-    data : tuple of tvm.te.Tensor
-        The arrays to concatenate
-
-    axis : int, optional
-        The axis along which the arrays will be joined. Default is 0.
-
-    Returns
-    -------
-    ret : tvm.te.Tensor
-    """
-
-    in_outers = [int(np.prod(i.shape[axis:])) for i in data]
-    in_outers_cumsum = [0, *np.cumsum(in_outers, dtype="int64")[0:-1]]
-
-    def gen_ir_1d(data_bufs, out_buf):
-        """Custom concatenation execution."""
-        i_b = tvm.tir.ir_builder.create()
-        data_bufs1 = [i_b.buffer_ptr(data_buf) for data_buf in data_bufs]
-        out_buf = i_b.buffer_ptr(out_buf)
-
-        for i in range(len(data)):
-            with i_b.for_range(0, in_outers[i], name="j") as j:
-                out_buf[in_outers_cumsum[i] + j] = data_bufs1[i][j]
-        return i_b.get()
-
-    def gen_ir(data_bufs, out_buf, inner, outer):
-        """Common case of concatenation execution."""
-        i_b = tvm.tir.ir_builder.create()
-        data_bufs1 = [i_b.buffer_ptr(data_buf) for data_buf in data_bufs]
-        out_buf = i_b.buffer_ptr(out_buf)
-        if inner > 1:
-            with i_b.for_range(0, inner, name="inn", kind="parallel") as inn:
-                pos = inn * outer
-                for i in range(len(data)):
-                    offset = inn * in_outers[i]
-                    with i_b.for_range(0, in_outers[i], name="j") as j:
-                        out_buf[pos + in_outers_cumsum[i] + j] = data_bufs1[i][offset + j]
-        else:
-            for i in range(len(data)):
-                with i_b.for_range(0, in_outers[i], name="j", kind="parallel") as j:
-                    out_buf[in_outers_cumsum[i] + j] = data_bufs1[i][j]
-        return i_b.get()
-
-    if axis < 0:
-        axis += len(data[0].shape)
-    concat_axis_sizes = [int(t.shape[axis]) for t in data]
-    join_size = int(np.sum(concat_axis_sizes))
-
-    dtype = data[0].dtype
-    out_shape = data[0].shape[:axis] + [join_size] + data[0].shape[axis + 1 :]
-    right_val = np.prod(out_shape[axis:])
-    left_val = np.prod(out_shape[:axis])
-
-    if (
-        len(data[0].shape) == 1
-        or (left_val == 1 and axis == len(data[0].shape) - 1)
-        or (left_val == 1 and right_val == 1)
-    ):
-        # badly parallelized case
-        return te.extern(
-            [out_shape],
-            list(data),
-            lambda ins, outs: gen_ir_1d(ins, outs[0]),
-            dtype=dtype,
-            name="concatenate_ext",
-        )
-
-    inner = get_const_int(int(left_val))
-    outer = get_const_int(int(right_val))
-    return te.extern(
-        [out_shape],
-        list(data),
-        lambda ins, outs: gen_ir(ins, outs[0], inner, outer),
-        dtype=dtype,
-        name="concatenate_ext",
-    )
diff --git a/python/tvm/topi/x86/conv1d.py b/python/tvm/topi/x86/conv1d.py
deleted file mode 100644
index 76fc40cab3a4..000000000000
--- a/python/tvm/topi/x86/conv1d.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
-"""Conv1D schedule on for Intel CPU"""
-from tvm import te
-from .. import tag
-
-
-def schedule_conv1d_ncw(outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    output_op = outs[0].op
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            else:  # inject custom schedule
-                if len(op.axis) == 3:  # schedule bias + bn + relu
-                    n, c, w = op.axis
-                    fused = s[op].fuse(n, c)
-                    s[op].parallel(fused)
-                    s[op].vectorize(w)
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-
-        if "conv1d_ncw" in op.tag:
-            conv = op.output(0)
-            kernel = op.input_tensors[1]
-            if isinstance(kernel.op, te.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            data = op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            n_pad, c_pad, w_pad = data_pad.op.axis
-            pad_fused = s[data_pad].fuse(n_pad, c_pad)
-            s[data_pad].parallel(pad_fused)
-            C = conv
-            n, c, w = C.op.axis
-            rc, rw = C.op.reduce_axis
-            n_out, c_out, w_out = output_op.axis
-            s[C].vectorize(w)
-            if op != output_op:  # fuse bias + bn + relu into conv
-                s[C].compute_at(s[output_op], w_out)
-            else:
-                fused = s[C].fuse(n, c)
-                s[C].parallel(fused)
-
-        scheduled_ops.append(op)
-
-    traverse(output_op)
-    return s
-
-
-def schedule_conv1d_nwc(outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    output_op = outs[0].op
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            else:  # inject custom schedule
-                if len(op.axis) == 3:  # schedule bias + bn + relu
-                    n, w, c = op.axis
-                    fused = s[op].fuse(n, w)
-                    s[op].parallel(fused)
-                    s[op].vectorize(c)
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-
-        if "conv1d_nwc" in op.tag:
-            conv = op.output(0)
-            kernel = op.input_tensors[1]
-            if isinstance(kernel.op, te.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            data = op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            n_pad, w_pad, c_pad = data_pad.op.axis
-            pad_fused = s[data_pad].fuse(n_pad, w_pad)
-            s[data_pad].parallel(pad_fused)
-            C = conv
-            n, w, c = C.op.axis
-            rc, rw = C.op.reduce_axis
-            n_out, w_out, c_out = output_op.axis
-            s[C].vectorize(c)
-            if op != output_op:  # fuse bias + bn + relu into conv
-                s[C].compute_at(s[output_op], c_out)
-            else:
-                fused = s[C].fuse(n, w)
-                s[C].parallel(fused)
-
-        scheduled_ops.append(op)
-
-    traverse(output_op)
-    return s
-
-
-def schedule_group_conv1d_ncw(outs):
-    return schedule_conv1d_ncw(outs)
-
-
-def schedule_group_conv1d_nwc(outs):
-    return schedule_conv1d_nwc(outs)
diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py
deleted file mode 100644
index 1b7f020d5014..000000000000
--- a/python/tvm/topi/x86/conv2d.py
+++ /dev/null
@@ -1,303 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Conv2D schedule on x86"""
-
-import logging
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.contrib import dnnl
-from .. import nn
-from ..generic import schedule_extern
-from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload
-from ..nn.conv2d import unpack_NCHWc_to_nchw
-from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
-from . import conv2d_avx_1x1, conv2d_avx_common
-
-logger = logging.getLogger("topi")
-
-
-def _get_default_config(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW"
-):
-    """
-    Get default schedule config for the workload
-    """
-    static_data_shape = []
-    for dim in get_const_tuple(data.shape):
-        if isinstance(dim, tvm.tir.Var):
-            static_data_shape.append(1)
-        else:
-            static_data_shape.append(dim)
-    data = te.placeholder(static_data_shape, dtype=data.dtype)
-    if is_depthwise:
-        wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype)
-        from .depthwise_conv2d import _fallback_schedule
-
-        _fallback_schedule(cfg, wkl)
-    else:
-        wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout)
-        is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
-        if is_kernel_1x1:
-            conv2d_avx_1x1._fallback_schedule(cfg, wkl)
-        else:
-            conv2d_avx_common._fallback_schedule(cfg, wkl)
-
-
-@conv2d_infer_layout.register("cpu")
-def _conv2d_infer_layout(workload, cfg):
-    _, data, kernel, strides, padding, dilation, layout, _, dtype = workload
-    batch_size, in_channel, in_height, in_width = data[1]
-    out_channel, _, k_height, k_width = kernel[1]
-    idxdiv = tvm.tir.indexdiv
-
-    pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width))
-    hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-    dilated_kernel_h = (k_height - 1) * hdilation + 1
-    dilated_kernel_w = (k_width - 1) * wdilation + 1
-    out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1
-    out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1
-    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic)
-    in_layout = f"NCHW{tile_ic}c"
-    out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc)
-    out_layout = f"NCHW{tile_oc}c"
-    return ((in_shape, in_layout),), ((out_shape, out_layout),)
-
-
-def schedule_conv2d_nhwc(outs):
-    """Create schedule for conv2d_nhwc"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    output_op = outs[0].op
-
-    def _callback(op):
-        if "conv2d_nhwc" in op.tag:
-            conv = op.output(0)
-            kernel = op.input_tensors[1]
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-
-            data = op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            n_pad, h_pad, w_pad, c_pad = data_pad.op.axis
-            pad_fused = s[data_pad].fuse(n_pad, h_pad)
-            s[data_pad].parallel(pad_fused)
-            C = conv
-            n, h, w, c = C.op.axis
-            s[C].vectorize(c)
-
-            O = output_op.output(0)
-            if len(O.op.axis) == 4:  # schedule bias + bn + relu
-                n, h, w, c = O.op.axis
-                fused = s[O].fuse(n, h, w)
-                s[O].parallel(fused)
-                channels = int(O.shape[-1])
-                if channels % 64 == 0:
-                    c, ci = s[O].split(c, 64)
-                    s[O].vectorize(ci)
-                if C != O:
-                    s[C].compute_at(s[O], c)
-
-    traverse_inline(s, output_op, _callback)
-    return s
-
-
-def conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype):
-    layout = "NCHW"
-    packed_out = conv2d_NCHWc(data, kernel, strides, padding, dilation, layout, layout, out_dtype)
-    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
-
-
-def schedule_conv2d_nchw(outs):
-    """Create schedule for tensors"""
-    return schedule_conv2d_NCHWc(outs)
-
-
-def _pack_data(cfg, data, kernel):
-    n, _, ih, iw = get_const_tuple(data.shape)
-    oc, ic, kh, kw = get_const_tuple(kernel.shape)
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-    ic_chunk = ic // ic_bn
-    oc_chunk = oc // oc_bn
-
-    # Handle dynamic shape to pass tuning dispatch.
-    if isinstance(n, tvm.tir.Any):
-        n = tvm.te.size_var("n")
-    if isinstance(ih, tvm.tir.Any):
-        ih = tvm.te.size_var("ih")
-    if isinstance(iw, tvm.tir.Any):
-        iw = tvm.te.size_var("iw")
-    if isinstance(ic, tvm.tir.Any):
-        raise RuntimeError("Dynamic input channel is not supported for conv2d.")
-
-    data = te.compute(
-        (n, ic_chunk, ih, iw, ic_bn),
-        lambda bs, c, h, w, vc: data[bs, c * ic_bn + vc, h, w],
-        name="data_vec",
-    )
-
-    kernel = te.compute(
-        (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn),
-        lambda occ, icc, k_h, k_w, icb, ocb: kernel[occ * oc_bn + ocb, icc * ic_bn + icb, k_h, k_w],
-        name="kernel_vec",
-    )
-
-    return data, kernel
-
-
-@autotvm.register_topi_compute("conv2d_NCHWc.x86")
-def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype):
-    """Compute conv2d with NCHWc layout."""
-    # layout and out_layout are not used here,
-    # we keep them for debug convenience when dumping autotvm workload
-    if len(data.shape) == 5:
-        n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
-        oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = get_const_tuple(
-            kernel.shape
-        )
-        in_channel = ic_chunk * ic_bn
-        num_filter = oc_chunk * oc_bn
-    else:
-        n, in_channel, ih, iw = get_const_tuple(data.shape)
-        num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    # Define autotvm tuning space
-    is_kernel_1x1 = kernel_height == 1 and kernel_width == 1
-    pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width))
-    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    oh = (ih - kernel_height + pt + pb) // sh + 1
-    ow = (iw - kernel_width + pl + pr) // sw + 1
-
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", num_filter, num_outputs=2)
-    if isinstance(ow, (tvm.tir.IntImm, int)):
-        cfg.define_split(
-            "tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64, policy="verbose"
-        )
-    if is_kernel_1x1:
-        if isinstance(oh, (tvm.tir.IntImm, int)):
-            cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
-    else:
-        cfg.define_knob("unroll_kw", [True, False])
-
-    # If no config was set, we can fallback to default config.
-    if cfg.is_fallback:
-        _get_default_config(
-            cfg,
-            te.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
-            te.placeholder(
-                (num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype
-            ),
-            strides,
-            padding,
-            dilation,
-            out_dtype,
-        )
-
-    # Pack data if raw 4-D data is provided.
-    # This can only happen when autotuning.
-    if len(data.shape) == 4:
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # Directly use modified data layout placeholder.
-            dshape = (n, in_channel // cfg["tile_ic"].size[-1], ih, iw, cfg["tile_ic"].size[-1])
-            data = tvm.te.placeholder(dshape, data.dtype, name="data")
-            kshape = (
-                num_filter // cfg["tile_oc"].size[-1],
-                in_channel // cfg["tile_ic"].size[-1],
-                kernel_height,
-                kernel_width,
-                cfg["tile_ic"].size[-1],
-                cfg["tile_oc"].size[-1],
-            )
-            kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel")
-        else:
-            data, kernel = _pack_data(cfg, data, kernel)
-
-    return nn.conv2d_NCHWc(data, kernel, strides, padding, dilation, layout, out_layout, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_NCHWc.x86")
-def schedule_conv2d_NCHWc(cfg, outs):
-    """Create schedule for tensors"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "conv2d_NCHWc" in op.tag:
-            conv_out = op.output(0)
-            kernel_vec = conv_out.op.input_tensors[1]
-            data_vec = conv_out.op.input_tensors[0]
-
-            args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]]
-            (_, _, kh, kw, _, _) = get_const_tuple(kernel_vec.shape)
-            if kh == 1 and kw == 1:
-                conv2d_avx_1x1._schedule_conv_NCHWc(*args)
-            else:
-                conv2d_avx_common._schedule_conv_NCHWc(*args)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv2d_nchw_dnnl.x86")
-def conv2d_nchw_dnnl(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d in NCHW format using dnnl."""
-    groups = 1
-    _out = dnnl.dnnl_conv2d(data, kernel, strides, padding, dilation, groups, False, out_dtype)
-    return _out
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_dnnl.x86")
-def schedule_conv2d_nchw_dnnl(_, outs):
-    """Create schedule for conv2d_nchw_dnnl"""
-    return schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_dnnl.x86")
-def conv2d_nhwc_dnnl(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d in NHWC format using dnnl."""
-    groups = 1
-    _out = dnnl.dnnl_conv2d(data, kernel, strides, padding, dilation, groups, True, out_dtype)
-    return _out
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_dnnl.x86")
-def schedule_conv2d_nhwc_dnnl(_, outs):
-    """Create schedule for conv2d_nhwc_dnnl"""
-    return schedule_extern(outs)
-
-
-# FIXME - https://github.com/apache/tvm/issues/4122
-# _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO
-# layout. Commenting until we have clarity about the nhwc_pack implementation from the author.
-# elif layout == 'NHWC' and kh == 1 and kw == 1 and kernel.dtype == "int8":
-#     if cfg.is_fallback:
-#         _get_default_config(cfg, data, kernel, strides, padding, out_dtype, False, layout)
-#     # specialize for INT8 1X1 conv on X86
-#     return conv2d_avx_1x1._declaration_conv_nhwc_pack(cfg, data, kernel, strides,
-#                                                       padding, dilation, out_dtype)
diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py
deleted file mode 100644
index 3772aaec046d..000000000000
--- a/python/tvm/topi/x86/conv2d_alter_op.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D alter op and legalize functions for x86"""
-
-import logging
-
-import re
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
-from .conv2d import _get_default_config
-from .conv2d_int8 import is_int8_hw_support, _get_default_config_int8
-from ..utils import get_const_tuple
-from ..nn import conv2d_legalize, conv2d_alter_layout
-from ..generic.conv2d import conv2d_alter_int8_common
-
-logger = logging.getLogger("topi")
-
-_NCHWc_matcher = re.compile("^NCHW[0-9]+c$")
-_OIHWio_matcher = re.compile("^OIHW[0-9]+i[0-9]+o$")
-
-
-@conv2d_alter_layout.register("cpu")
-def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    new_attrs = {k: attrs[k] for k in attrs.keys()}
-
-    # Parse the attributes.
-    padding = attrs.get_int_tuple("padding")
-    strides = attrs.get_int_tuple("strides")
-    dilation = attrs.get_int_tuple("dilation")
-    data_layout = attrs["data_layout"]
-    kernel_layout = attrs["kernel_layout"]
-    data_tensor, kernel_tensor = tinfos
-    data_dtype = data_tensor.dtype
-    kernel_dtype = kernel_tensor.dtype
-    out_dtype = out_type.dtype
-
-    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
-        cfg = dispatch_ctx.query(target, None)
-        workload = cfg.workload
-    else:
-        impl, outs = relay.backend.te_compiler.select_implementation(
-            relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-        )
-        workload = autotvm.task.get_workload(outs)
-        if workload is None:
-            # The best implementation is not an AutoTVM template.
-            # It may be from the auto-scheduler
-            if impl.name.find("winograd") != -1:
-                if dilation != (1, 1):
-                    logger.warning("Does not support weight pre-transform for dilated convolution.")
-                    return None
-
-                assert data_layout == "NHWC" and kernel_layout == "HWIO"
-                N, H, W, CI = get_const_tuple(data_tensor.shape)
-                KH, KW, _, CO = get_const_tuple(kernel_tensor.shape)
-
-                # Pre-compute weight transformation in winograd
-                tile_size = 4
-                # HWIO -> OIHW
-                kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
-                # alpha, alpha, CO, CI
-                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-                    kernel_transform, tile_size=tile_size
-                )
-                new_attrs["tile_size"] = tile_size
-                new_attrs["channels"] = CO
-                return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                    inputs[0], weight, **new_attrs
-                )
-            return None
-
-        cfg = dispatch_ctx.query(target, workload)
-
-    topi_tmpl = workload[0]
-
-    if topi_tmpl == "conv2d_NCHWc.x86":
-        # we only convert conv2d_NCHW to conv2d_NCHWc for x86
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            if cfg.is_fallback:
-                _get_default_config(
-                    cfg,
-                    data_tensor,
-                    kernel_tensor,
-                    strides,
-                    padding,
-                    dilation,
-                    out_dtype,
-                    False,
-                    data_layout,
-                )
-            batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-            out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
-            ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-            # update new attrs
-            new_attrs["channels"] = out_channel
-            new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-            # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-            new_attrs["kernel_layout"] = f"OIHW{ic_bn}i{oc_bn}o"
-            new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-            # Store altered operator's config
-            new_data = te.placeholder(
-                (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-            )
-            new_kernel = te.placeholder(
-                (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn),
-                dtype=kernel_tensor.dtype,
-            )
-            new_workload = autotvm.task.args_to_workload(
-                [
-                    new_data,
-                    new_kernel,
-                    strides,
-                    padding,
-                    dilation,
-                    new_attrs["data_layout"],
-                    new_attrs["out_layout"],
-                    out_dtype,
-                ],
-                topi_tmpl,
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-        else:
-            assert _NCHWc_matcher.match(data_layout)
-            assert _OIHWio_matcher.match(kernel_layout)
-        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
-
-    if topi_tmpl == "conv2d_NCHWc_int8.x86":
-        # TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO
-        assert data_layout == "NCHW" and kernel_layout == "OIHW"
-        if cfg.is_fallback:
-            _get_default_config_int8(
-                cfg,
-                data_tensor,
-                kernel_tensor,
-                strides,
-                padding,
-                dilation,
-                out_dtype,
-                False,
-                data_layout,
-                int32_lanes=16,
-            )
-
-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
-        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-        # update new attrs
-        n_elems = 4
-        new_attrs["channels"] = out_channel
-        new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-        new_attrs["kernel_layout"] = f"OIHW{ic_bn // n_elems:n}i{oc_bn:n}o{n_elems:n}i"
-        new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-        # Store altered operator's config.
-        new_data = te.placeholder(
-            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-        )
-        new_kernel = te.placeholder(
-            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
-            dtype=kernel_dtype,
-        )
-        new_workload = autotvm.task.args_to_workload(
-            [
-                new_data,
-                new_kernel,
-                strides,
-                padding,
-                dilation,
-                new_attrs["data_layout"],
-                new_attrs["out_layout"],
-                out_dtype,
-            ],
-            topi_tmpl,
-        )
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
-
-    if topi_tmpl == "depthwise_conv2d_NCHWc.x86":
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            if cfg.is_fallback:
-                _get_default_config(
-                    cfg,
-                    data_tensor,
-                    kernel_tensor,
-                    strides,
-                    padding,
-                    dilation,
-                    out_dtype,
-                    True,
-                    data_layout,
-                )
-
-            batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-            out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
-            ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-            assert channel_multiplier == 1
-
-            # update new attrs
-            new_attrs["channels"] = out_channel
-            new_attrs["data_layout"] = f"NCHW{ic_bn}c"
-            new_attrs["kernel_layout"] = f"OIHW1i{oc_bn}o"
-            new_attrs["out_layout"] = f"NCHW{oc_bn}c"
-
-            # Store altered operator's config.
-            new_data = te.placeholder(
-                (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
-            )
-            new_kernel = te.placeholder(
-                (out_channel // oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype
-            )
-            new_workload = autotvm.task.args_to_workload(
-                [
-                    new_data,
-                    new_kernel,
-                    strides,
-                    padding,
-                    dilation,
-                    new_attrs["data_layout"],
-                    new_attrs["out_layout"],
-                    out_dtype,
-                ],
-                topi_tmpl,
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-        else:
-            assert _NCHWc_matcher.match(data_layout)
-            assert _OIHWio_matcher.match(kernel_layout)
-        return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)
-
-    return None
-
-
-@conv2d_legalize.register("cpu")
-def _conv2d_legalize(attrs, inputs, arg_types):
-    """Legalizes Conv2D op.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Collect the input tensors.
-    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
-    data_dtype = data_tensor.dtype
-    kernel_dtype = kernel_tensor.dtype
-
-    # Collect the output tensor.
-    output_tensor = arg_types[2]
-
-    # Collect the input exprs.
-    data, kernel = inputs
-
-    # Intel vector intructions require data and kernel to have different dtypes.
-    if data_tensor.dtype == "int8" and kernel_tensor.dtype == "int8":
-        data_dtype = "uint8"
-    if is_int8_hw_support(data_dtype, kernel_dtype):
-        return conv2d_alter_int8_common(
-            data, data_tensor, kernel, kernel_tensor, output_tensor, attrs, data_dtype, 4, 16
-        )
-    return None
diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py
deleted file mode 100644
index 047377f83e86..000000000000
--- a/python/tvm/topi/x86/conv2d_avx_1x1.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
-"""1x1 Conv2D schedule on for Intel CPU"""
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import te
-from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity
-from tvm.target.x86 import get_simd_32bit_lanes
-
-from ..generic import conv2d as conv2d_generic
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, simplify
-from .tensor_intrin import dot_16x1x16_uint8_int8_int32
-
-
-def _fallback_schedule(cfg, wkl):
-    simd_width = get_simd_32bit_lanes()
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
-    HSTR, WSTR = wkl.stride_h, wkl.stride_w
-    dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1
-    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
-
-    out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1
-    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if wkl.out_filter % bn == 0:
-            oc_bn = bn
-            break
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-
-    for ow_factor in range(out_width, 0, -1):
-        if out_width % ow_factor == 0:
-            for oh_factor in range(out_height, 0, -1):
-                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
-                    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-                    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
-                    cfg["tile_ow"] = SplitEntity([out_width // ow_factor, ow_factor])
-                    return
-    raise ValueError(f"cannot decide default schedule for workload: {wkl}")
-
-
-def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last):
-    # fetch schedule
-    oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
-    _, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
-
-    # schedule pad
-    if isinstance(s[data_vec].op, tvm.te.ComputeOp) and "pad" in data_vec.op.tag:
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        s[data_vec].vectorize(ic_block)
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-        data_vec = data_vec.op.input_tensors[0]
-
-    oc_bn = cfg["tile_oc"].size[-1]
-    if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == "kernel_vec":
-        # data and kernel are not pre-computed, schedule layout transform here.
-        # this should only be used by x86 conv2d_nchw, which is for
-        # testing purpose.
-        batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-
-        oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis
-        s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-        if oc_bn > 1:
-            s[kernel_vec].vectorize(oc_block)
-        parallel_axis = s[kernel_vec].fuse(oc_chunk, oh)
-        s[kernel_vec].parallel(parallel_axis)
-
-    C, O = conv_out, last
-    CC = s.cache_write(C, "global")
-
-    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
-    ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
-    s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
-    s[C].vectorize(oc_block)
-
-    parallel_axis = s[C].fuse(batch, oc_chunk, oh_outer)
-    s[CC].compute_at(s[C], parallel_axis)
-    if C == O:
-        s[C].parallel(parallel_axis)
-
-    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, _, _ = s[CC].op.reduce_axis
-
-    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
-
-    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
-
-    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block)
-    s[CC].fuse(oc_chunk, oh_outer)
-    s[CC].vectorize(oc_block)
-
-    s[CC].unroll(ow_inner)
-    s[CC].unroll(oh_inner)
-
-    if C != O:
-        out_ndim = len(s[O].op.axis)
-        if out_ndim == 5:
-            batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-            oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
-            ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
-            s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
-
-            parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer)
-            s[C].compute_at(s[O], parallel_axis)
-            s[O].vectorize(oc_block)
-            s[O].parallel(parallel_axis)
-        elif out_ndim == 4:
-            batch, oc, oh, ow = s[O].op.axis
-            oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-            oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
-            ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
-            s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
-            parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer)
-            s[C].compute_at(s[O], parallel_axis)
-            s[O].vectorize(oc_block)
-            s[O].parallel(parallel_axis)
-        else:
-            raise ValueError(f"Unsupported output ndim: {out_ndim}")
-
-    return s
-
-
-def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last):
-    return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(
-        s,
-        cfg,
-        data_vec,
-        kernel_vec,
-        conv_out,
-        last,
-        int32_lanes=get_simd_32bit_lanes(),
-        intrin=dot_16x1x16_uint8_int8_int32(),
-    )
-
-
-def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, out_dtype):
-    # more assertion for the shapes
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_height, in_width, in_channel = Input.shape
-    kernel_h, kernel_w, num_filter, channel = Filter.shape
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    out_channel = num_filter
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    pad_before = [0, pad_top, pad_left, 0]
-    pad_after = [0, pad_down, pad_right, 0]
-    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
-    # todo: padding filter to accommodate the intrinsic
-
-    # packing the Filter to let memory access be consecutive for AVX512 intrinsic
-    # Done in pre-compute stage
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    packw_shape = (kernel_h, kernel_w, idxd(num_filter, 16), 16 * idxd(channel, 4), 4)
-    PackW = te.compute(
-        packw_shape,
-        lambda a, b, c, d, e: Filter[a, b, c * 16 + idxm(d, 16), idxd(d, 16) * 4 + e],
-        name="packed_filter",
-    )
-
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    Output = te.compute(
-        (batch, out_height, out_width, out_channel),
-        lambda nn, yy, xx, ff: te.sum(
-            PaddedInput[
-                nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc
-            ].astype(out_dtype)
-            * PackW[ry, rx, idxd(ff, 16), idxd(rc, 4) * 16 + idxm(ff, 16), idxm(rc, 4)].astype(
-                out_dtype
-            ),
-            axis=[ry, rx, rc],
-        ),
-        name="Conv2d_1x1_Output_int8",
-        tag="conv2d_nhwc_pack_int8",
-    )
-    return Output
-
-
-def _schedule_conv_nhwc_pack_int8(s, cfg, data, conv_out, last):
-    """
-    Defines the schedule for the int8 nhwc layout. For 1x1 conv, it
-    is a matrix-multiply operation by using nhwc layout. We will do
-    packing of weight to make the address access be friendly to int8
-    intrinsic
-    """
-    # FIXME - https://github.com/apache/tvm/issues/3598
-    # pylint: disable=unreachable
-    return s
-
-    int32_lanes = 16
-
-    # assertion to fail the unhandled case
-    _, _, _, ic_num = get_const_tuple(data.shape)
-    _, _, _, oc_num = get_const_tuple(conv_out.shape)
-    assert ic_num % 4 == 0
-    assert oc_num % 16 == 0
-
-    ic_factor, oc_factor = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    # schedule data
-    A = data
-    if isinstance(s[A].op, tvm.te.ComputeOp):
-        batch, ih, iw, ic = s[A].op.axis
-        d_ic_chunk, d_ic_block = s[A].split(ic, factor=4)
-        s[A].vectorize(d_ic_block)
-
-    C, O = conv_out, last
-
-    batch, oh, ow, oc = s[C].op.axis
-    kh, kw, ic = s[C].op.reduce_axis
-    # match the x86 intrinsic
-    ic_outer, ic_inner = s[C].split(ic, factor=4)
-    oc_outer, oc_inner = s[C].split(oc, factor=int32_lanes)
-
-    ic_f_outer, ic_s_outer = s[C].split(ic_outer, factor=ic_factor)
-    s[C].reorder(oc_outer, oh, ow, ic_f_outer, ic_s_outer, kh, kw, oc_inner, ic_inner)
-
-    pc = dot_16x1x16_uint8_int8_int32()
-    s[C].tensorize(oc_inner, pc)
-
-    if C != O:
-        batch, last_oh, last_ow, last_oc = s[O].op.axis
-        oc_chunk, oc_block = s[O].split(ochannel, 16)
-        # not saw perf improvement to split oh/ow here
-        s[O].vectorize(oc_block)
-
-    return s
diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py
deleted file mode 100644
index 73283e7888dd..000000000000
--- a/python/tvm/topi/x86/conv2d_avx_common.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
-"""Conv2D schedule on for Intel CPU"""
-import tvm
-from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity
-from tvm.target.x86 import get_simd_32bit_lanes
-
-from ..generic import conv2d as conv2d_generic
-from ..utils import get_const_tuple
-from .tensor_intrin import dot_16x1x16_uint8_int8_int32
-
-
-def _fallback_schedule(cfg, wkl):
-    simd_width = get_simd_32bit_lanes()
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
-    HSTR, WSTR = wkl.stride_h, wkl.stride_w
-    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
-
-    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if wkl.out_filter % bn == 0:
-            oc_bn = bn
-            break
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-def _fallback_schedule_int8(cfg, wkl):
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
-    HSTR, WSTR = wkl.stride_h, wkl.stride_w
-    out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1
-
-    oc_bn = 16
-    assert wkl.out_filter % oc_bn == 0
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -4):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-    assert wkl.in_filter % 4 == 0
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last):
-    # fetch schedule
-    reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val
-    _, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
-
-    # schedule pad
-    if isinstance(s[data_vec].op, tvm.te.ComputeOp) and "pad" in data_vec.op.tag:
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        s[data_vec].vectorize(ic_block)
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-        data_vec = data_vec.op.input_tensors[0]
-
-    oc_bn = cfg["tile_oc"].size[-1]
-    if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == "kernel_vec":
-        # data and kernel are not pre-computed, schedule layout transform here.
-        # this should only be used by x86 conv2d_nchw, which is for
-        # testing purpose.
-        batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-
-        oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis
-        s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-        if oc_bn > 1:
-            s[kernel_vec].vectorize(oc_block)
-        parallel_axis = s[kernel_vec].fuse(oc_chunk, oh)
-        s[kernel_vec].parallel(parallel_axis)
-
-    # schedule 5-D NCHW[x]c conv
-    C, O = conv_out, last
-    CC = s.cache_write(C, "global")
-
-    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
-    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-    parallel_axis = s[C].fuse(batch, oc_chunk, oh)
-    s[C].vectorize(oc_block)
-    if C == O:
-        s[C].parallel(parallel_axis)
-
-    s[CC].compute_at(s[C], ow_chunk)
-    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, kh, kw = s[CC].op.reduce_axis
-
-    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
-
-    if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
-        s[CC].unroll(kw)
-    else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
-
-    s[CC].vectorize(oc_block)
-    s[CC].unroll(ow_block)
-
-    if C != O:
-        out_ndim = len(s[O].op.axis)
-        if out_ndim == 5:
-            batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-            ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-            s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-            parallel_axis = s[O].fuse(batch, oc_chunk, oh)
-            s[C].compute_at(s[O], parallel_axis)
-            s[O].vectorize(oc_block)
-            s[O].parallel(parallel_axis)
-        elif out_ndim == 4:
-            batch, oc, oh, ow = s[O].op.axis
-            ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-            oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-            s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-            parallel_axis = s[O].fuse(batch, oc_chunk, oh)
-            s[C].compute_at(s[O], parallel_axis)
-            s[O].vectorize(oc_block)
-            s[O].parallel(parallel_axis)
-        else:
-            raise ValueError(f"Unsupported output ndim: {out_ndim}")
-
-    return s
-
-
-def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last):
-    return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(
-        s,
-        cfg,
-        data_vec,
-        kernel_vec,
-        conv_out,
-        last,
-        int32_lanes=get_simd_32bit_lanes(),
-        intrin=dot_16x1x16_uint8_int8_int32(),
-        inline_fused=True,
-    )
diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py
deleted file mode 100644
index 7c01967e87d3..000000000000
--- a/python/tvm/topi/x86/conv2d_int8.py
+++ /dev/null
@@ -1,265 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Conv2D int8 schedule on x86"""
-
-import tvm
-from tvm import autotvm, te
-from tvm.target.x86 import target_has_features
-
-from .. import nn, tag
-from ..generic import conv2d as conv2d_generic
-from ..nn.conv2d import _get_workload as _get_conv2d_workload
-from ..nn.conv2d import unpack_NCHWc_to_nchw
-from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
-from . import conv2d_avx_1x1, conv2d_avx_common
-
-
-def _get_default_config_int8(
-    cfg,
-    data,
-    kernel,
-    strides,
-    padding,
-    dilation,
-    out_dtype,
-    is_depthwise=False,
-    layout="NCHW",
-    int32_lanes=4,
-):
-    """
-    Get default schedule config for the workload
-    """
-    if is_depthwise:
-        # Fallback to FP32 default config until a VNNI schedule is defined.
-        wkl = _get_depthwise_conv2d_workload(
-            data, kernel, strides, padding, dilation, out_dtype, layout
-        )
-
-        from .depthwise_conv2d import _fallback_schedule
-
-        _fallback_schedule(cfg, wkl)
-    else:
-        wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout)
-        is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
-        if is_kernel_1x1:
-            conv2d_generic.fallback_schedule_cpu_1x1_int8(
-                cfg, wkl, int32_lanes=int32_lanes, num_int8_elements=4
-            )
-        else:
-            conv2d_generic.fallback_schedule_cpu_common_int8(
-                cfg, wkl, int32_lanes=int32_lanes, num_int8_elements=4
-            )
-
-
-def is_int8_hw_support(data_dtype, kernel_dtype):
-    """
-    Checks to ensure that we can use Intel DLBoost instructions
-    1) The datatypes are correct.
-    2) LLVM version has support for the instructions.
-    3) Target is skylake and above.
-    """
-    # 1) Check datatypes
-    is_dtype_support = data_dtype == "uint8" and kernel_dtype == "int8"
-
-    # 2) Check LLVM support
-    llvm_version = tvm.target.codegen.llvm_version_major()
-    is_llvm_support = llvm_version >= 8
-
-    # 3) Check target
-    is_target_support = target_has_features("sse4.2")
-
-    return is_dtype_support and is_llvm_support and is_target_support
-
-
-def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype):
-    """Compute conv2d with NCHW layout and int8 dtype"""
-    layout = "NCHW"
-    packed_out = conv2d_NCHWc_int8(
-        data, kernel, strides, padding, dilation, layout, layout, out_dtype
-    )
-    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
-
-
-def schedule_conv2d_nchw_int8(outs):
-    """Create the schedule for conv2d_nchw_int8"""
-    return schedule_conv2d_NCHWc_int8(outs)
-
-
-def _pack_data(cfg, data, kernel):
-    n_elems = 4
-    n, _, ih, iw = get_const_tuple(data.shape)
-    oc, ic, kh, kw = get_const_tuple(kernel.shape)
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-    ic_chunk = ic // ic_bn
-    oc_chunk = oc // oc_bn
-
-    data = te.compute(
-        (n, ic_chunk, ih, iw, ic_bn),
-        lambda bs, c, h, w, vc: data[bs, c * ic_bn + vc, h, w],
-        name="data_vec",
-    )
-
-    kernel = te.compute(
-        (oc_chunk, ic_chunk, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
-        lambda occ, icc, k_h, k_w, icbc, ocb, icbb: kernel[
-            occ * oc_bn + ocb, icc * ic_bn + icbc * n_elems + icbb, k_h, k_w
-        ],
-        name="kernel_vec",
-    )
-
-    return data, kernel
-
-
-@autotvm.register_topi_compute("conv2d_NCHWc_int8.x86")
-def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype):
-    """Compute conv2d with NCHWc layout and int8 dtype"""
-    if len(data.shape) == 5:
-        n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
-        in_channel = ic_chunk * ic_bn
-        oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(
-            kernel.shape
-        )
-        num_filter = oc_chunk * oc_bn
-    else:
-        n, in_channel, ih, iw = get_const_tuple(data.shape)
-        num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    # Define autotvm tuning space
-    is_kernel_1x1 = kernel_height == 1 and kernel_width == 1
-    pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width))
-    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-    dilated_kernel_h = (kernel_height - 1) * dh + 1
-    dilated_kernel_w = (kernel_width - 1) * dw + 1
-    oh = (ih - dilated_kernel_h + pt + pb) // sh + 1
-    ow = (iw - dilated_kernel_w + pl + pr) // sw + 1
-
-    cfg.define_split("tile_ic", in_channel, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0)
-    cfg.define_split("tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0)
-    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
-    if is_kernel_1x1:
-        cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
-    else:
-        cfg.define_knob("unroll_kw", [True, False])
-
-    # If no config was set, we can fallback to default config.
-    if cfg.is_fallback:
-        _get_default_config_int8(
-            cfg,
-            te.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
-            te.placeholder(
-                (num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype
-            ),
-            strides,
-            padding,
-            dilation,
-            out_dtype,
-            int32_lanes=16,
-        )
-
-    # Pack data if raw 4-D data is provided.
-    # This can only happen when autotuning.
-    if len(data.shape) == 4:
-        data, kernel = _pack_data(cfg, data, kernel)
-
-    return nn.conv2d_NCHWc_int8(
-        data, kernel, strides, padding, dilation, layout, out_layout, out_dtype
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_NCHWc_int8.x86")
-def schedule_conv2d_NCHWc_int8(cfg, outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """Traverse operators from computation graph"""
-        if "conv2d_NCHWc_int8" in op.tag:
-            conv_out = op.output(0)
-            kernel_vec = conv_out.op.input_tensors[1]
-            data_vec = conv_out.op.input_tensors[0]
-
-            args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]]
-            # int8 conv kernel is 7-dim
-            _, _, kh, kw, _, _, _ = get_const_tuple(kernel_vec.shape)
-            if kh == 1 and kw == 1:
-                conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args)
-            else:
-                conv2d_avx_common._schedule_conv_NCHWc_int8(*args)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_pack_int8.x86")
-def schedule_conv2d_nhwc_pack_int8(cfg, outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    output_op = outs[0].op
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            else:  # inject custom schedule
-                if len(op.axis) == 4:  # schedule bias + bn + relu
-                    n, h, w, c = op.axis
-                    fused = s[op].fuse(n, h, w)
-                    s[op].parallel(fused)
-                    s[op].vectorize(c)
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-
-        if "conv2d_nhwc_pack_int8" in op.tag:
-            conv_out = op.output(0)
-            kernel = conv_out.op.input_tensors[1]
-            data_vec = conv_out.op.input_tensors[0]
-            data = (
-                data_vec.op.input_tensors[0]
-                if isinstance(data_vec.op, te.tensor.ComputeOp) and "pad" not in data_vec.op.tag
-                else data_vec
-            )
-            if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            args = [s, cfg, data_vec, conv_out, outs[0]]
-            if data.dtype == "uint8":
-                kh, kw, _, _, _ = get_const_tuple(kernel.shape)
-                if kh == 1 and kw == 1:
-                    conv2d_avx_1x1._schedule_conv_nhwc_pack_int8(*args)
-                else:
-                    raise ValueError("Only support 1x1 kernel with schedule_conv2d_nhwc_pack.")
-            else:
-                raise ValueError(
-                    f"Not support this data type {data.dtype} with "
-                    f"schedule_conv2d_nhwc_pack. Only support int8"
-                )
-
-        scheduled_ops.append(op)
-
-    traverse(output_op)
-    return s
diff --git a/python/tvm/topi/x86/conv2d_transpose.py b/python/tvm/topi/x86/conv2d_transpose.py
deleted file mode 100644
index 865b62bb3e87..000000000000
--- a/python/tvm/topi/x86/conv2d_transpose.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Conv2D Transpose schedule on x86"""
-from tvm import te
-from ..utils import traverse_inline
-from .. import nn
-from .conv2d import conv2d_nchw, schedule_conv2d_nchw
-
-
-def conv2d_transpose_nchw(data, kernel, strides, padding, out_dtype, output_padding):
-    data_pad, kernel_transform = nn.conv2d_transpose_nchw_preprocess(
-        data, kernel, strides, padding, out_dtype, output_padding
-    )
-    # reuse conv2d_nchw implementation
-    return conv2d_nchw(
-        data_pad,
-        kernel_transform,
-        strides=(1, 1),
-        padding=(0, 0),
-        dilation=(1, 1),
-        out_dtype=out_dtype,
-    )
-
-
-def schedule_conv2d_transpose_nchw(outs):
-    """Create schedule for tensors"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = schedule_conv2d_nchw(outs)
-
-    def _callback(op):
-        if "unpack_nchwc" in op.tag:
-            conv_out = op.input_tensors[0]
-            # retrieve data
-            data_vec = conv_out.op.input_tensors[0]
-            if isinstance(data_vec, te.ComputeOp):
-                data_pad = data_vec.op.input_tensors[0]
-                data_dilate = data_pad.op.input_tensors[0]
-                s[data_dilate].compute_inline()
-                s[data_pad].compute_inline()
-            # retrieve kernel
-            kernel_vec = conv_out.op.input_tensors[1]
-            if isinstance(kernel_vec, te.ComputeOp):
-                kernel_transform = kernel_vec.op.input_tensors[0]
-                s[kernel_transform].compute_inline()
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/x86/conv3d.py b/python/tvm/topi/x86/conv3d.py
deleted file mode 100644
index 20f2c4ac128c..000000000000
--- a/python/tvm/topi/x86/conv3d.py
+++ /dev/null
@@ -1,713 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, too-many-locals
-# pylint: disable=unused-argument, redefined-builtin, no-else-return
-"""Conv3D operators"""
-from collections import namedtuple
-
-import tvm
-from tvm import autotvm, te
-from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity
-from tvm.target.x86 import get_simd_32bit_lanes
-
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple3d, infer_pad3d
-from ..utils import get_const_int, get_const_tuple, simplify, traverse_inline
-
-Workload3D = namedtuple(
-    "Workload",
-    [
-        "in_dtype",
-        "out_dtype",
-        "depth",
-        "height",
-        "width",
-        "in_filter",
-        "groups",
-        "out_filter",
-        "dkernel",
-        "hkernel",
-        "wkernel",
-        "dpad",
-        "hpad",
-        "wpad",
-        "dstride",
-        "hstride",
-        "wstride",
-    ],
-)
-
-
-@autotvm.register_topi_compute("conv3d_ndhwc.x86")
-def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
-    """3D convolution forward operator.
-
-    Parameters
-    ----------
-    input : tvm.te.Tensor
-        5-D input data with shapes:
-        [batch, in_depth, in_height, in_width, in_channel] for NDHWC layout
-
-    filter : tvm.te.Tensor
-        5-D filter with shape [kernel_depth, kernel_height, kernel_width, in_channels, out_channels]
-
-    strides : int or a list/tuple of three ints
-        stride size, or [stride_depth, stride_height, stride_width]
-
-    padding : int or a list/tuple of three ints
-        padding size, or [pad_depth, pad_height, pad_width]
-
-    dilation: int or a list/tuple of three ints
-        dilation size, or [dilation_depth, dilation_height, dilation_width]
-
-    groups: int
-        Number of groups
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        5-D with shape [batch, out_depth, out_height, out_width, out_channel] for NDHWC layout
-    """
-    layout = "NDHWC"
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides, strides)
-    dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation, dilation)
-
-    _create_tuning_space(cfg, data, kernel, strides, padding, dilation, groups, layout)
-    if cfg.is_fallback:
-        _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout)
-    return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype)
-
-
-@autotvm.register_topi_compute("conv3d_ncdhw.x86")
-def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
-    """3D convolution forward operator.
-
-    Parameters
-    ----------
-    input : tvm.te.Tensor
-        5-D input data with shapes:
-        [batch, in_channel, in_depth, in_height, in_width] for NCDHW layout
-
-    filter : tvm.te.Tensor
-        5-D filter with shape [out_channels, in_channels, kernel_depth, kernel_height, kernel_width]
-
-    strides : int or a list/tuple of three ints
-        stride size, or [stride_depth, stride_height, stride_width]
-
-    padding : int or a list/tuple of three ints
-        padding size, or [pad_depth, pad_height, pad_width]
-
-    dilation: int or a list/tuple of three ints
-        dilation size, or [dilation_depth, dilation_height, dilation_width]
-
-    groups: int
-        Number of groups
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        5-D with shape [batch, out_channel, out_depth, out_height, out_width] for NCDHW layout
-    """
-    # assert groups == 1, "conv3d_ncdhw.x86 does not support groups"
-    layout = "NCDHW"
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides, strides)
-    dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation, dilation)
-
-    _create_tuning_space(cfg, data, kernel, strides, padding, dilation, groups, layout)
-    if cfg.is_fallback:
-        _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout)
-    return _conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout, groups, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv3d_ndhwc.x86")
-def schedule_conv3d_ndhwc(cfg, outs):
-    """TOPI schedule callback for conv3d
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv3d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv3d.
-    """
-    s = te.create_schedule([x.op for x in outs])
-
-    def _traverse(op):
-        if "conv3d_ndhwc" in op.tag:
-            output = op.output(0)
-            conv_out = op.input_tensors[0]
-            kernel_vec = conv_out.op.input_tensors[1]
-            kernel = kernel_vec.op.input_tensors[0]
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-            data_vec = conv_out.op.input_tensors[0]
-            data = data_vec.op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            kd, kh, kw, i, o = get_const_tuple(kernel.shape)
-            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
-            _schedule_conv3d_ndhwc(*args)
-
-    traverse_inline(s, outs[0].op, _traverse)
-    return s
-
-
-@autotvm.register_topi_schedule("conv3d_ncdhw.x86")
-def schedule_conv3d_ncdhw(cfg, outs):
-    """TOPI schedule callback for conv3d
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv3d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv3d.
-    """
-    s = te.create_schedule([x.op for x in outs])
-
-    def _traverse(op):
-        if "conv3d_ncdhw" in op.tag:
-            output = op.output(0)
-            conv_out = op.input_tensors[0]
-            kernel_vec = conv_out.op.input_tensors[1]
-            kernel = kernel_vec.op.input_tensors[0]
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-            data_vec = conv_out.op.input_tensors[0]
-            data = data_vec.op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            kd, kh, kw, i, o = get_const_tuple(kernel.shape)
-            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
-            _schedule_conv3d_ncdhw(*args)
-
-    traverse_inline(s, outs[0].op, _traverse)
-    return s
-
-
-def _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-
-    assert isinstance(dilation, int) or len(dilation) == 3
-    if isinstance(dilation, int):
-        dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation)
-    else:
-        dilation_d, dilation_h, dilation_w = dilation
-
-    DSTR, HSTR, WSTR = strides
-    batch_size, in_depth, in_height, in_width, in_channel = get_const_tuple(data.shape)
-    kernel_depth, kernel_height, kernel_width, _, num_filter = get_const_tuple(kernel.shape)
-
-    assert in_channel % groups == 0, "input channels must be a multiple of group size"
-    assert num_filter % groups == 0, "number of filters must be a multiple of group size"
-
-    dilated_kernel_d = (kernel_depth - 1) * dilation_d + 1
-    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
-
-    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
-        padding, (dilated_kernel_d, dilated_kernel_h, dilated_kernel_w)
-    )
-
-    pad_d = pad_front + pad_back
-    pad_h = pad_top + pad_down
-    pad_w = pad_left + pad_right
-
-    pad_depth = in_depth + pad_d
-    pad_height = in_height + pad_h
-    pad_width = in_width + pad_w
-
-    out_depth = simplify((in_depth + pad_d - dilated_kernel_d) // DSTR + 1)
-    out_height = simplify((in_height + pad_h - dilated_kernel_h) // HSTR + 1)
-    out_width = simplify((in_width + pad_w - dilated_kernel_w) // WSTR + 1)
-
-    # pack data
-    DOPAD = pad_d != 0 or pad_h != 0 or pad_w != 0
-    if DOPAD:
-        data_pad = pad(
-            data,
-            (0, pad_front, pad_top, pad_left, 0),
-            (0, pad_back, pad_down, pad_right, 0),
-            name="data_pad",
-        )
-    else:
-        data_pad = data
-
-    # fetch schedule
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    assert groups == 1 or ic_bn <= groups
-    assert groups == 1 or oc_bn <= groups
-    shape = (batch_size, in_channel // ic_bn, pad_depth, pad_height, ic_bn, pad_width)
-    data_vec = te.compute(
-        shape, lambda n, C, d, h, c, w: data_pad[n, d, h, w, C * ic_bn + c], name="data_vec"
-    )
-
-    ci_tile = in_channel // groups // ic_bn
-    if ci_tile == 0 or ci_tile * ic_bn * groups < in_channel:
-        ci_tile += 1
-
-    # pack kernel
-    shape = (num_filter // oc_bn, ci_tile, kernel_depth, kernel_height, kernel_width, ic_bn, oc_bn)
-    kernel_vec = te.compute(
-        shape,
-        lambda CO, CI, d, h, w, ci, co: kernel[d, h, w, CI * ic_bn + ci, CO * oc_bn + co],
-        name="kernel_vec",
-    )
-
-    # convolution
-    oshape = (batch_size, num_filter // oc_bn, out_depth, out_height, out_width, oc_bn)
-    unpack_shape = (batch_size, out_depth, out_height, out_width, num_filter)
-
-    ic = te.reduce_axis((0, in_channel // groups), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    kd = te.reduce_axis((0, kernel_depth), name="kd")
-    idxmod = tvm.tir.indexmod
-    idxdiv = tvm.tir.indexdiv
-
-    conv = te.compute(
-        oshape,
-        lambda n, oc_chunk, od, oh, ow, oc_block: te.sum(
-            data_vec[
-                n,
-                idxdiv(
-                    (oc_chunk * oc_bn + oc_block) // (num_filter // groups) * (in_channel // groups)
-                    + ic,
-                    ic_bn,
-                ),
-                od * DSTR + kd * dilation_d,
-                oh * HSTR + kh * dilation_h,
-                idxmod(
-                    (oc_chunk * oc_bn + oc_block) // (num_filter // groups) * (in_channel // groups)
-                    + ic,
-                    ic_bn,
-                ),
-                ow * WSTR + kw * dilation_w,
-            ].astype(out_dtype)
-            * kernel_vec[
-                oc_chunk, idxdiv(ic, ic_bn), kd, kh, kw, idxmod(ic, ic_bn), oc_block
-            ].astype(out_dtype),
-            axis=[kd, kh, kw, ic],
-        ),
-        name="conv",
-    )
-    conv_unpacked = te.compute(
-        unpack_shape,
-        lambda n, d, h, w, c: conv[n, idxdiv(c, oc_bn), d, h, w, idxmod(c, oc_bn)].astype(
-            out_dtype
-        ),
-        name="output_unpack",
-        tag="conv3d_ndhwc",
-    )
-    return conv_unpacked
-
-
-def _conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout, groups, out_dtype):
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-
-    assert isinstance(dilation, int) or len(dilation) == 3
-    if isinstance(dilation, int):
-        dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation)
-    else:
-        dilation_d, dilation_h, dilation_w = dilation
-
-    DSTR, HSTR, WSTR = strides
-    batch_size, in_channel, in_depth, in_height, in_width = get_const_tuple(data.shape)
-    num_filter, _, kernel_depth, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    dilated_kernel_d = (kernel_depth - 1) * dilation_d + 1
-    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
-
-    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
-        padding, (dilated_kernel_d, dilated_kernel_h, dilated_kernel_w)
-    )
-
-    pad_d = pad_front + pad_back
-    pad_h = pad_top + pad_down
-    pad_w = pad_left + pad_right
-
-    pad_depth = in_depth + pad_d
-    pad_height = in_height + pad_h
-    pad_width = in_width + pad_w
-
-    out_depth = simplify((in_depth + pad_d - dilated_kernel_d) // DSTR + 1)
-    out_height = simplify((in_height + pad_h - dilated_kernel_h) // HSTR + 1)
-    out_width = simplify((in_width + pad_w - dilated_kernel_w) // WSTR + 1)
-
-    # pack data
-    DOPAD = pad_d != 0 or pad_h != 0 or pad_w != 0
-    if DOPAD:
-        data_pad = pad(
-            data,
-            (0, 0, pad_front, pad_top, pad_left),
-            (0, 0, pad_back, pad_down, pad_right),
-            name="data_pad",
-        )
-    else:
-        data_pad = data
-
-    # fetch schedule
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-    shape = (batch_size, in_channel // ic_bn, pad_depth, pad_height, ic_bn, pad_width)
-    data_vec = te.compute(
-        shape, lambda n, C, d, h, c, w: data_pad[n, C * ic_bn + c, d, h, w], name="data_vec"
-    )
-
-    ci_tile = in_channel // groups // ic_bn
-    if ci_tile == 0 or ci_tile * ic_bn * groups < in_channel:
-        ci_tile += 1
-
-    # pack kernel
-    shape = (num_filter // oc_bn, ci_tile, kernel_depth, kernel_height, kernel_width, ic_bn, oc_bn)
-    kernel_vec = te.compute(
-        shape,
-        lambda CO, CI, d, h, w, ci, co: kernel[CO * oc_bn + co, CI * ic_bn + ci, d, h, w],
-        name="kernel_vec",
-    )
-
-    # convolution
-    oshape = (batch_size, num_filter // oc_bn, out_depth, out_height, out_width, oc_bn)
-    unpack_shape = (batch_size, num_filter, out_depth, out_height, out_width)
-
-    ic = te.reduce_axis((0, in_channel // groups), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    kd = te.reduce_axis((0, kernel_depth), name="kd")
-    idxmod = tvm.tir.indexmod
-    idxdiv = tvm.tir.indexdiv
-
-    conv = te.compute(
-        oshape,
-        lambda n, oc_chunk, od, oh, ow, oc_block: te.sum(
-            data_vec[
-                n,
-                idxdiv(
-                    (oc_chunk * oc_bn + oc_block) // (num_filter // groups) * (in_channel // groups)
-                    + ic,
-                    ic_bn,
-                ),
-                od * DSTR + kd * dilation_d,
-                oh * HSTR + kh * dilation_h,
-                idxmod(
-                    (oc_chunk * oc_bn + oc_block) // (num_filter // groups) * (in_channel // groups)
-                    + ic,
-                    ic_bn,
-                ),
-                ow * WSTR + kw * dilation_w,
-            ].astype(out_dtype)
-            * kernel_vec[
-                oc_chunk, idxdiv(ic, ic_bn), kd, kh, kw, idxmod(ic, ic_bn), oc_block
-            ].astype(out_dtype),
-            axis=[ic, kd, kh, kw],
-        ),
-        name="conv",
-    )
-    conv_unpacked = te.compute(
-        unpack_shape,
-        lambda n, c, d, h, w: conv[n, idxdiv(c, oc_bn), d, h, w, idxmod(c, oc_bn)].astype(
-            out_dtype
-        ),
-        name="output_unpack",
-        tag="conv3d_ncdhw",
-    )
-    return conv_unpacked
-
-
-def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, groups, layout):
-    """Create schedule configuration from input arguments"""
-    dshape = get_const_tuple(data.shape)
-    kshape = get_const_tuple(kernel.shape)
-    if layout == "NDHWC":
-        n, d, h, w, ic = dshape
-        kd, kh, kw, _, oc = kshape
-    elif layout == "NCDHW":
-        n, ic, d, h, w = dshape
-        oc, _, kd, kh, kw = kshape
-    else:
-        raise ValueError(f"Not support this layout {layout} with schedule template.")
-
-    # pad_front, pad_top, pad_left, pad_back, pad_down(bottom), pad_right
-    pf, pt, pl, pb, pd, pr = get_pad_tuple3d(padding, (kd, kh, kw))
-    sd, sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides, strides)
-    od = (d - kd + pf + pb) // sd + 1
-    oh = (h - kh + pt + pd) // sh + 1
-    ow = (w - kw + pl + pr) // sw + 1
-
-    # Create schedule config
-    cfg.define_split("tile_ic", ic, num_outputs=2)
-    cfg.define_split("tile_oc", oc, num_outputs=2)
-    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 8)
-    cfg.define_knob("unroll_kw", [True, False])
-
-
-def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout):
-    """
-    Get default schedule config for the workload
-    """
-    if layout not in ["NDHWC", "NCDHW"]:
-        raise ValueError(f"Layout {layout} is not supported")
-
-    static_data_shape = []
-    for dim in get_const_tuple(data.shape):
-        if isinstance(dim, tvm.tir.Var):
-            static_data_shape.append(1)
-        else:
-            static_data_shape.append(dim)
-    data = te.placeholder(static_data_shape, dtype=data.dtype)
-    wkl = _get_conv3d_workload(data, kernel, strides, padding, groups, out_dtype, layout)
-    _fallback_schedule(cfg, wkl)
-
-
-def _get_conv3d_workload(data, kernel, stride, padding, groups, out_dtype, data_layout="NCHW"):
-    """Get the workload structure."""
-    if data_layout == "NCDHW":
-        _, CI, ID, IH, IW = get_const_tuple(data.shape)
-        CO, CIG, KD, KH, KW = get_const_tuple(kernel.shape)
-    elif data_layout == "NDHWC":
-        _, ID, IH, IW, CI = get_const_tuple(data.shape)
-        KD, KH, KW, CIG, CO = get_const_tuple(kernel.shape)
-    else:
-        raise ValueError(f"not support this layout {data_layout} yet")
-
-    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
-        padding, (get_const_int(KD), get_const_int(KH), get_const_int(KW))
-    )
-    DPAD = pad_front + pad_back
-    HPAD = pad_top + pad_down
-    WPAD = pad_left + pad_right
-    if isinstance(stride, (tuple, list)):
-        DSTR, HSTR, WSTR = stride
-    else:
-        DSTR, HSTR, WSTR = stride, stride, stride
-    assert (data.dtype == kernel.dtype) or (
-        data.dtype == "uint8" and kernel.dtype == "int8"
-    ), f"Do not support inputs with different data types now. {data.dtype} vs. {kernel.dtype}"
-    return Workload3D(
-        data.dtype,
-        out_dtype,
-        ID,
-        IH,
-        IW,
-        CI,
-        groups,
-        CO,
-        KD,
-        KH,
-        KW,
-        DPAD,
-        HPAD,
-        WPAD,
-        DSTR,
-        HSTR,
-        WSTR,
-    )
-
-
-def _fallback_schedule(cfg, wkl):
-    simd_width = get_simd_32bit_lanes()
-    DPAD, HPAD, WPAD = wkl.dpad, wkl.hpad, wkl.wpad
-    DSTR, HSTR, WSTR = wkl.dstride, wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if wkl.out_filter % bn == 0:
-            oc_bn = bn
-            break
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-
-    reg_n = 1
-    for n in range(7, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-def _schedule_conv3d_ndhwc(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
-    # fetch schedule
-    ic_bn, oc_bn, reg_n, unroll_kw = (
-        cfg["tile_ic"].size[-1],
-        cfg["tile_oc"].size[-1],
-        cfg["tile_ow"].size[-1],
-        cfg["unroll_kw"].val,
-    )
-
-    # get padding size
-    padding = infer_pad3d(data, data_pad, "NDHWC")
-    DPAD, HPAD, WPAD = padding
-    DOPAD = DPAD != 0 or HPAD != 0 or WPAD != 0
-
-    A, W = data, kernel_vec
-    A0, A1 = data_pad, data_vec
-
-    # schedule data
-    if DOPAD:
-        s[A0].compute_inline()
-    batch, ic_chunk, idd, ih, ic_block, iw = s[A1].op.axis
-    parallel_axis = s[A1].fuse(batch, ic_chunk, idd, ih)
-    s[A1].parallel(parallel_axis)
-
-    # schedule kernel pack
-    oc_chunk, ic_chunk, od, oh, ow, ic_block, oc_block = s[W].op.axis
-    s[W].reorder(oc_chunk, od, oh, ic_chunk, ow, ic_block, oc_block)
-    if oc_bn > 1:
-        s[W].vectorize(oc_block)
-    parallel_axis = s[W].fuse(oc_chunk, od, oh)
-    s[W].parallel(parallel_axis)
-
-    # schedule conv
-    C, O0, O = conv_out, output, last
-    CC = s.cache_write(C, "global")
-
-    _, oc_chunk, od, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
-    s[C].reorder(oc_chunk, od, oh, ow_chunk, ow_block, oc_block)
-    s[C].fuse(oc_chunk, od, oh)
-    s[C].vectorize(oc_block)
-
-    s[CC].compute_at(s[C], ow_chunk)
-    _, oc_chunk, od, oh, ow, oc_block = s[CC].op.axis
-    kd, kh, kw, ic = s[CC].op.reduce_axis
-
-    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
-
-    if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kd, kh, ic_block, kw, ow_block, oc_block)
-        s[CC].unroll(kw)
-    else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kd, kh, kw, ic_block, ow_block, oc_block)
-
-    s[CC].fuse(oc_chunk, od, oh)
-    s[CC].vectorize(oc_block)
-    s[CC].unroll(ow_block)
-
-    if O0 != O:
-        s[O0].compute_inline()
-
-    # unpacking
-    batch, od, oh, ow, oc = s[O].op.axis
-    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-    s[O].reorder(oc_chunk, od, oh, ow_chunk, ow_block, oc_block)
-    parallel_axis = s[O].fuse(batch, oc_chunk, od, oh)
-    s[C].compute_at(s[O], parallel_axis)
-    s[O].vectorize(oc_block)
-    s[O].parallel(parallel_axis)
-    return s
-
-
-def _schedule_conv3d_ncdhw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
-    # fetch schedule
-    ic_bn, oc_bn, reg_n, unroll_kw = (
-        cfg["tile_ic"].size[-1],
-        cfg["tile_oc"].size[-1],
-        cfg["tile_ow"].size[-1],
-        cfg["unroll_kw"].val,
-    )
-
-    # get padding size
-    padding = infer_pad3d(data, data_pad, "NCDHW")
-    DPAD, HPAD, WPAD = padding
-    DOPAD = DPAD != 0 or HPAD != 0 or WPAD != 0
-
-    A, W = data, kernel_vec
-    A0, A1 = data_pad, data_vec
-
-    # schedule data
-    if DOPAD:
-        s[A0].compute_inline()
-    batch, ic_chunk, idd, ih, ic_block, iw = s[A1].op.axis
-    parallel_axis = s[A1].fuse(batch, ic_chunk, idd, ih)
-    s[A1].parallel(parallel_axis)
-
-    # schedule kernel pack
-    oc_chunk, ic_chunk, od, oh, ow, ic_block, oc_block = s[W].op.axis
-    s[W].reorder(oc_chunk, od, oh, ic_chunk, ow, ic_block, oc_block)
-    if oc_bn > 1:
-        s[W].vectorize(oc_block)
-    parallel_axis = s[W].fuse(oc_chunk, od, oh)
-    s[W].parallel(parallel_axis)
-
-    # schedule conv
-    C, O0, O = conv_out, output, last
-    CC = s.cache_write(C, "global")
-
-    _, oc_chunk, od, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
-    s[C].reorder(oc_chunk, od, oh, ow_chunk, ow_block, oc_block)
-    s[C].fuse(oc_chunk, od, oh)
-    s[C].vectorize(oc_block)
-
-    s[CC].compute_at(s[C], ow_chunk)
-    _, oc_chunk, od, oh, ow, oc_block = s[CC].op.axis
-    ic, kd, kh, kw = s[CC].op.reduce_axis
-
-    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
-
-    if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kd, kh, ic_block, kw, ow_block, oc_block)
-        s[CC].unroll(kw)
-    else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kd, kh, kw, ic_block, ow_block, oc_block)
-
-    s[CC].fuse(oc_chunk, od, oh)
-    s[CC].vectorize(oc_block)
-    s[CC].unroll(ow_block)
-
-    if O0 != O:
-        s[O0].compute_inline()
-
-    # unpacking
-    batch, oc, od, oh, ow = s[O].op.axis
-    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-    s[O].reorder(oc_chunk, od, oh, ow_chunk, ow_block, oc_block)
-    parallel_axis = s[O].fuse(batch, oc_chunk, od, oh)
-    s[C].compute_at(s[O], parallel_axis)
-    s[O].vectorize(oc_block)
-    s[O].parallel(parallel_axis)
-
-    return s
diff --git a/python/tvm/topi/x86/conv3d_transpose.py b/python/tvm/topi/x86/conv3d_transpose.py
deleted file mode 100644
index cb814a29d60d..000000000000
--- a/python/tvm/topi/x86/conv3d_transpose.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter
-
-"""Conv3D Transpose schedule on x86"""
-from tvm import te
-from ..utils import traverse_inline
-from .. import nn
-from .conv3d import conv3d_ncdhw, schedule_conv3d_ncdhw
-
-
-def conv3d_transpose_ncdhw(data, kernel, strides, padding, out_dtype, output_padding):
-    data_pad, kernel_transform = nn.conv3d_transpose_ncdhw_preprocess(
-        data, kernel, strides, padding, out_dtype, output_padding
-    )
-
-    # reuse conv3d_ncdhw implementation
-    return conv3d_ncdhw(data_pad, kernel_transform, (1, 1, 1), (0, 0, 0), (1, 1, 1), 1, out_dtype)
-
-
-def schedule_conv3d_transpose_ncdhw(outs):
-    """Create schedule for tensors"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = schedule_conv3d_ncdhw(outs)
-
-    def _callback(op):
-        if "unpack_ncdhwc" in op.tag:
-            conv_out = op.input_tensors[0]
-            # retrieve data
-            data_vec = conv_out.op.input_tensors[0]
-            data_pad = data_vec.op.input_tensors[0]
-            data_dilate = data_pad.op.input_tensors[0]
-            s[data_dilate].compute_inline()
-            s[data_pad].compute_inline()
-            # retrieve kernel
-            kernel_vec = conv_out.op.input_tensors[1]
-            kernel_transform = kernel_vec.op.input_tensors[0]
-            s[kernel_transform].compute_inline()
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
deleted file mode 100644
index 4151ea0b7006..000000000000
--- a/python/tvm/topi/x86/dense.py
+++ /dev/null
@@ -1,650 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-locals,unused-argument
-# pylint: disable=no-value-for-parameter,unused-variable
-"""x86 dense operators"""
-from __future__ import absolute_import as _abs
-
-import tvm
-from tvm import autotvm, te
-from tvm.autotvm.task.space import SplitEntity
-from tvm.contrib import cblas, dnnl, mkl
-from tvm.target.x86 import get_simd_32bit_lanes
-from tvm.target.codegen import target_has_features
-
-from .. import generic, tag
-from ..utils import get_const_tuple, traverse_inline
-from .tensor_intrin import (
-    acc_32x32_int32_sapphirerapids,
-    dot_16x1x16_uint8_int8_int32,
-    dot_32x128x32_u8s8s32_sapphirerapids,
-)
-
-
-def _schedule_dense_pack_template(cfg, s, C, O):
-    A, packedB = s[C].op.input_tensors
-
-    CC = s.cache_write(C, "global")
-    y, x = s[C].op.axis
-    (k,) = s[CC].op.reduce_axis
-
-    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
-    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(xt, yt, yo, xo, yi, xi)
-    xyt = s[C].fuse(xt, yt)
-    if C == O:
-        s[C].parallel(xyt)
-    xyo = s[C].fuse(yo, xo)
-    s[C].unroll(yi)
-    s[C].vectorize(xi)
-
-    s[CC].compute_at(s[C], xyo)
-    y, x = s[CC].op.axis
-    ko, ki = cfg["tile_k"].apply(s, CC, k)
-    s[CC].reorder(ko, ki, y, x)
-    s[CC].vectorize(x)
-
-    tile_inner = cfg["tile_inner"].size[-1]
-    if tile_inner > 1:
-        yo, yi = s[CC].split(y, tile_inner)
-        s[CC].reorder(ko, yo, ki, yi, x)
-        s[CC].unroll(yo)
-        s[CC].unroll(ki)
-        s[CC].unroll(yi)
-    else:
-        s[CC].unroll(ki)
-        s[CC].unroll(y)
-
-    if C != O:
-        y, x = s[O].op.axis
-        yt, yo, yi = cfg["tile_y"].apply(s, O, y)
-        xt, xo, xi = cfg["tile_x"].apply(s, O, x)
-        s[O].reorder(xt, yt, yo, xo, yi, xi)
-        xyt = s[O].fuse(xt, yt)
-        s[C].compute_at(s[O], xyt)
-        s[O].vectorize(xi)
-        s[O].parallel(xyt)
-    return s
-
-
-def _schedule_dense_nopack_template(cfg, s, C):
-    y, x = s[C].op.axis
-    (kk,) = s[C].op.reduce_axis
-    yo, yi = cfg["tile_y"].apply(s, C, y)
-    xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(yo, xo, yi, xi)
-    xyo = s[C].fuse(yo, xo)
-    s[C].parallel(xyo)
-    s[C].unroll(kk)
-
-    (CC,) = s[C].op.input_tensors
-    s[CC].compute_at(s[C], xyo)
-    z, y, x = s[CC].op.axis
-    (k,) = s[CC].op.reduce_axis
-    yz = s[CC].fuse(z, y)
-    s[CC].reorder(k, yz, x)
-    s[CC].unroll(yz)
-    s[CC].vectorize(x)
-    return s
-
-
-def _default_dense_pack_config(cfg, M, N, K):
-    # Generate default schedule for dynamic shape.
-    if isinstance(M, (tvm.tir.Var, tvm.tir.Any)):
-        M = 16
-    if isinstance(N, (tvm.tir.Var, tvm.tir.Any)):
-        N = 16
-    if isinstance(K, (tvm.tir.Var, tvm.tir.Any)):
-        K = 16
-
-    vec_width = get_simd_32bit_lanes()
-    tilex_ii = 1
-    for bn in range(vec_width * 2, 0, -1):
-        if N % bn == 0:
-            tilex_ii = bn
-            break
-    NN = N // tilex_ii
-    tilex_oi = 1
-    while NN // tilex_oi > 4:
-        if (NN // tilex_oi) % 2 == 1:
-            break
-        tilex_oi *= 2
-
-    tiley_ii = 8
-    while M % tiley_ii != 0:
-        tiley_ii //= 2
-    MM = M // tiley_ii
-    tiley_oi = 1
-    while MM // tiley_oi > 4:
-        if (MM // tiley_oi) % 2 == 1:
-            break
-        tiley_oi *= 2
-
-    cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii])
-    cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii])
-    cfg["tile_k"] = SplitEntity([K, 1])
-    cfg["tile_inner"] = SplitEntity([M // tiley_ii, tiley_ii])
-
-
-def _default_dense_nopack_config(cfg, M, N, K):
-    # Generate default schedule for dynamic shape.
-    if isinstance(M, (tvm.tir.Var, tvm.tir.Any)):
-        M = 16
-    if isinstance(N, (tvm.tir.Var, tvm.tir.Any)):
-        N = 16
-    if isinstance(K, (tvm.tir.Var, tvm.tir.Any)):
-        K = 16
-
-    vec_width = get_simd_32bit_lanes()
-    tilek_bn = 1
-    for bn in range(vec_width * 2, 0, -1):
-        if K % bn == 0:
-            tilek_bn = bn
-            break
-    cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn])
-    cfg["tile_x"] = SplitEntity([N, 1])
-    cfg["tile_y"] = SplitEntity([1, M])
-
-
-@autotvm.register_topi_compute("dense_nopack.x86")
-def dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute dense without packing"""
-    if out_dtype is None:
-        out_dtype = data.dtype
-    M, K = get_const_tuple(data.shape)
-    N, _ = get_const_tuple(weight.shape)
-    # create tuning space
-    cfg.define_split(
-        "tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=2
-    )
-    cfg.define_split(
-        "tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=2
-    )
-    cfg.define_split(
-        "tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2
-    )
-    if cfg.is_fallback:
-        _default_dense_nopack_config(cfg, M, N, K)
-
-    vec = cfg["tile_k"].size[-1]
-    k = te.reduce_axis((0, K // vec), "k")
-    CC = te.compute(
-        (M, N, vec),
-        lambda z, y, x: te.sum(
-            data[z, k * vec + x].astype(out_dtype) * weight[y, k * vec + x].astype(out_dtype),
-            axis=k,
-        ),
-    )
-
-    kk = te.reduce_axis((0, vec), "kk")
-    C = te.compute((M, N), lambda y, x: te.sum(CC[y, x, kk], axis=kk), tag="dense_nopack")
-    if bias is not None:
-        C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
-    return C
-
-
-@autotvm.register_topi_schedule("dense_nopack.x86")
-def schedule_dense_nopack(cfg, outs):
-    """Create the schedule for dense_nopack"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "dense_nopack" in op.tag:
-            _schedule_dense_nopack_template(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("dense_pack.x86")
-def dense_pack(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute dense with transformed weight."""
-    if out_dtype is None:
-        out_dtype = data.dtype
-    M, K = get_const_tuple(data.shape)  # batch, in_dim
-    if len(weight.shape) == 3:
-        N, _, packw_bn = get_const_tuple(weight.shape)  # out_dim
-        N = N * packw_bn
-    else:
-        N, _ = get_const_tuple(weight.shape)  # out_dim
-    # create tuning space
-    cfg.define_split(
-        "tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=3
-    )
-    cfg.define_split(
-        "tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=3
-    )
-    cfg.define_split(
-        "tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2
-    )
-    cfg.define_split(
-        "tile_inner",
-        32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M,
-        num_outputs=2,
-        filter=lambda y: y.size[-1] <= 16,
-    )
-    if cfg.is_fallback:
-        _default_dense_pack_config(cfg, M, N, K)
-
-    if len(weight.shape) == 2:
-        packw_bn = cfg["tile_x"].size[-1]
-        packw_shape = (N // packw_bn, K, packw_bn)
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # Directly use modified data layout placeholder.
-            packw = tvm.te.placeholder(packw_shape, weight.dtype, name="packed_weight")
-        else:
-            packw = te.compute(
-                packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight"
-            )
-    else:
-        packw = weight
-
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-    k = te.reduce_axis((0, K), name="k")
-    C = te.compute(
-        (M, N),
-        lambda y, x: te.sum(
-            data[y, k].astype(out_dtype)
-            * packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype),
-            axis=k,
-        ),
-        tag="dense_pack",
-    )
-    if bias is not None:
-        C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
-    return C
-
-
-@autotvm.register_topi_schedule("dense_pack.x86")
-def schedule_dense_pack(cfg, outs):
-    """Create the schedule for dense_pack"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "dense_pack" in op.tag:
-            _schedule_dense_pack_template(cfg, s, op.output(0), outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("dense_int8.x86")
-def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute for uint8 x int8 -> int32 dense"""
-    if out_dtype is None:
-        out_dtype = data.dtype
-    assert len(weight.shape) == 4
-    assert data.dtype == "uint8" and weight.dtype == "int8"
-    _, _, n_inner, k_inner = get_const_tuple(weight.shape)  # out_dim
-    assert n_inner == 16 and k_inner == 4
-    return dense_int8_compute(cfg, data, weight, bias)
-
-
-@autotvm.register_topi_schedule("dense_int8.x86")
-def schedule_dense_int8(cfg, outs):
-    """Create a schedule for dense__int8"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "dense_int8" in op.tag:
-            if target_has_features("amx-int8"):
-                dense_amx_int8_schedule(cfg, s, op.output(0), outs[0])
-            elif target_has_features(["avx512bw", "avx512f"]):
-                dense_int8_schedule(cfg, s, op.output(0), outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def dense_int8_compute(cfg, X, packed_w, bias=None):
-    """Compute for uint8 x int8 -> int32 dense"""
-    m, k = X.shape
-    n_o, _, n_i, _ = packed_w.shape
-    ak = te.reduce_axis((0, k), name="k")
-    if target_has_features(["avx512bw", "avx512f"]):
-        target_attr = {"schedule_rule": "meta_schedule.x86.dense_int8"}
-    else:
-        target_attr = None
-
-    C = te.compute(
-        (m, n_o * n_i),
-        lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packed_w[tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4), j % 16, ak % 4].astype(
-                "int32"
-            ),
-            axis=ak,
-        ),
-        tag="dense_int8",
-        attrs=target_attr,
-    )
-
-    if bias is not None:
-        C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j], tag=tag.BROADCAST)
-
-    return C
-
-
-def dense_int8_schedule(cfg, s, C, O, do_parallel=True):
-    """Schedule dense compute using avx512 or lower instructions
-    including VNNI vpdpbusd instruction if possible"""
-    # C: The output of GEMM
-    # O: The output of the fused op
-    def split_y(out):
-        default_y_split_factor = 32
-        a_y = out.op.axis[-2]
-
-        if cfg.is_fallback:
-            return s[out].split(a_y, factor=default_y_split_factor)
-
-        cfg.define_split("tile_y", a_y, num_outputs=2)
-        return cfg["tile_y"].apply(s, out, a_y)
-
-    (a_k,) = C.op.reduce_axis
-
-    a_yo, a_yi = split_y(C)
-    a_xo, a_xi = s[C].split(C.op.axis[-1], factor=16)
-    a_ko, a_ki = s[C].split(a_k, factor=4)
-
-    s[C].reorder(a_yo, a_xo, a_yi, a_ko, a_xi, a_ki)
-
-    pc = dot_16x1x16_uint8_int8_int32()
-    s[C].tensorize(a_xi, pc)
-
-    if C == O:
-        fused = s[O].fuse(a_yo, a_xo)
-    else:
-        a_yo, a_yi = split_y(O)
-        a_xo, a_xi = s[O].split(O.op.axis[-1], factor=16)
-
-        s[O].reorder(a_yo, a_xo, a_yi, a_xi)
-        s[O].vectorize(a_xi)
-        s[C].compute_at(s[O], a_yi)
-
-        fused = s[O].fuse(a_yo, a_xo)
-
-    if do_parallel:
-        s[O].parallel(fused)
-
-    return s, fused
-
-
-def dense_amx_int8_schedule(cfg, s, C, O, do_parallel=True):
-    """Schedule dense compute using AMX TMUL instruction"""
-    # C: The output of GEMM
-    # O: The output of the fused op
-    def split_x(out):
-        default_x_split_factor1 = 32
-        default_x_split_factor2 = 2
-        default_x_split_factor3 = 2
-        default_x_split_factor4 = 2
-        a_x = s[out].op.axis[-2]
-
-        if cfg.is_fallback:
-            a_xo, a_xi = s[out].split(a_x, factor=default_x_split_factor1)
-            a_xo2, a_xo1 = s[out].split(a_xo, factor=default_x_split_factor2)
-            a_xo3, a_xo2 = s[out].split(a_xo2, factor=default_x_split_factor3)
-            a_xo4, a_xo3 = s[out].split(a_xo3, factor=default_x_split_factor4)
-            return [a_xo4, a_xo3, a_xo2, a_xo1, a_xi]
-
-        cfg.define_split("tile_x", a_x, num_outputs=5, filter=lambda x: x.size[-1] == 32)
-        return cfg["tile_x"].apply(s, out, a_x)
-
-    def split_y(out):
-        default_y_split_factor1 = 32
-        default_y_split_factor2 = 4
-        default_y_split_factor3 = 4
-        default_y_split_factor4 = 4
-        a_y = s[out].op.axis[-1]
-
-        if cfg.is_fallback:
-            a_yo1, a_yo = s[out].split(a_y, factor=default_y_split_factor1)
-            a_yo2, a_yo1 = s[out].split(a_yo1, factor=default_y_split_factor2)
-            a_yo3, a_yo2 = s[out].split(a_yo2, factor=default_y_split_factor3)
-            a_yo4, a_yo3 = s[out].split(a_yo3, factor=default_y_split_factor4)
-            return [a_yo4, a_yo3, a_yo2, a_yo1, a_yo]
-
-        cfg.define_split("tile_y", a_y, num_outputs=5, filter=lambda y: y.size[-1] == 32)
-        return cfg["tile_y"].apply(s, out, a_y)
-
-    def split_k(out, rd_axis):
-        default_k_split_factor1 = 128
-        default_k_split_factor2 = 2
-        default_k_split_factor3 = 2
-        default_k_split_factor4 = 2
-
-        if cfg.is_fallback:
-            a_ko, a_ki = s[out].split(rd_axis, factor=default_k_split_factor1)
-            a_ko2, a_ko1 = s[out].split(a_ko, factor=default_k_split_factor2)
-            a_ko3, a_ko2 = s[out].split(a_ko2, factor=default_k_split_factor3)
-            a_ko4, a_ko3 = s[out].split(a_ko3, factor=default_k_split_factor4)
-            return [a_ko4, a_ko3, a_ko2, a_ko1, a_ki]
-
-        cfg.define_split("tile_k", rd_axis, num_outputs=5, filter=lambda y: y.size[-1] == 128)
-        return cfg["tile_k"].apply(s, out, rd_axis)
-
-    a_x, a_y = C.op.axis[-2:]
-    (a_k,) = C.op.reduce_axis
-    CF = s.cache_write(C, "amx.tmm")
-
-    a_x3, a_x2, a_x1, a_xo, a_xi = split_x(C)
-    a_y3, a_y2, a_y1, a_yo, a_yi = split_y(C)
-    s[C].reorder(a_x3, a_y3, a_x2, a_y2, a_x1, a_y1, a_xo, a_yo, a_xi, a_yi)
-
-    s[CF].compute_at(s[C], a_yo)
-
-    (a_k_f,) = CF.op.reduce_axis
-    a_x_f, a_y_f = CF.op.axis[-2:]
-
-    a_xo_f, a_xi_f = s[CF].split(a_x_f, factor=32)
-
-    a_yo_f, a_yi_f = s[CF].split(a_y_f, factor=32)
-    a_k3_f, a_k2_f, a_k1_f, a_ko_f, a_ki_f = split_k(CF, a_k_f)
-    s[CF].reorder(a_k3_f, a_k2_f, a_k1_f, a_ko_f, a_xo_f, a_yo_f, a_ki_f, a_xi_f, a_yi_f)
-
-    (m, k) = CF.op.input_tensors[0].shape[-2:]
-    (n, c, n_i, c_i) = CF.op.input_tensors[1].shape[-4:]
-    n = n * n_i
-
-    s[CF].tensorize(a_ki_f, dot_32x128x32_u8s8s32_sapphirerapids(LDA=int(k)))
-    s[C].tensorize(a_xi, acc_32x32_int32_sapphirerapids(LDC=int(n)))
-
-    if C == O:
-        fused = s[O].fuse(a_x3, a_y3)
-    else:
-        a_y3, a_y2, a_y1, a_yr, a_yi = split_y(O)
-        a_x3, a_x2, a_x1, a_xr, a_xi = split_x(O)
-
-        s[O].reorder(a_y3, a_x3, a_y2, a_x2, a_y1, a_x1, a_yr, a_xr, a_yi, a_xi)
-        s[O].vectorize(a_xi)
-
-        fused = s[O].fuse(a_x3, a_y3)
-
-    if do_parallel:
-        s[O].parallel(fused)
-
-    return s, fused
-
-
-def matmul_blas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, lib):
-    """Compute matmul/dense using a BLAS library"""
-    M, K = get_const_tuple(tensor_a.shape)
-    N, _ = get_const_tuple(tensor_b.shape)
-    if isinstance(M, int) and isinstance(K, int) and isinstance(N, int):
-        cfg.add_flop(M * K * N * 2)
-    if tensor_a.dtype == "uint8" and tensor_b.dtype == "int8" and out_dtype == "int32":
-        if not hasattr(lib, "matmul_u8s8s32"):
-            raise NotImplementedError(
-                f"Matmul/Dense with {lib.__name__} for {tensor_a.dtype} is not supported "
-                "(matmulu8s8s32 not imlemented)"
-            )
-        C = lib.matmul_u8s8s32(tensor_a, tensor_b, transpose_a, transpose_b, dtype=out_dtype)
-    elif tensor_a.dtype == "float32" or tensor_a.dtype == "float64":
-        C = lib.matmul(tensor_a, tensor_b, transpose_a, transpose_b)
-    else:
-        raise NotImplementedError(
-            f"Matmul/Dense with {lib.__name__} for {tensor_a.dtype} is not supported"
-        )
-
-    if bias is not None:
-        C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
-    return C
-
-
-@autotvm.register_topi_compute("dense_cblas.x86")
-def dense_cblas(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute dense using cblas. This is an alias of matmul_nt operator."""
-    return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, cblas)
-
-
-@autotvm.register_topi_schedule("dense_cblas.x86")
-def schedule_dense_cblas(_, outs):
-    """Create schedule for dense_cblas. This is an alias of matmul_nt operator."""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("dense_mkl.x86")
-def dense_mkl(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute dense using mkl. This is an alias of matmul_nt operator."""
-    return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkl)
-
-
-@autotvm.register_topi_schedule("dense_mkl.x86")
-def schedule_dense_mkl(_, outs):
-    """Create schedule for dense_mkl. This is an alias of matmul_nt operator."""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("dense_dnnl.x86")
-def dense_dnnl(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute dense using dnnl. This is an alias of matmul_nt operator."""
-    return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, dnnl)
-
-
-@autotvm.register_topi_schedule("dense_dnnl.x86")
-def schedule_dense_dnnl(_, outs):
-    """Create schedule for dense_dnnl. This is an alias of matmul_nt operator."""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("matmul_cblas.x86")
-def matmul_cblas(
-    cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False
-):
-    """Compute matmul using cblas."""
-    return matmul_blas_common(
-        cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, cblas
-    )
-
-
-@autotvm.register_topi_schedule("matmul_cblas.x86")
-def schedule_matmul_cblas(_, outs):
-    """Create schedule for matmul_cblas."""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("matmul_mkl.x86")
-def matmul_mkl(
-    cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False
-):
-    """Compute matmul using mkl."""
-    return matmul_blas_common(
-        cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkl
-    )
-
-
-@autotvm.register_topi_schedule("matmul_mkl.x86")
-def schedule_matmul_mkl(_, outs):
-    """Create schedule for matmul_mkl."""
-    return generic.schedule_extern(outs)
-
-
-@autotvm.register_topi_compute("matmul_dnnl.x86")
-def matmul_dnnl(
-    cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False
-):
-    """Compute matmul using dnnl."""
-    return matmul_blas_common(
-        cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, dnnl
-    )
-
-
-@autotvm.register_topi_schedule("matmul_dnnl.x86")
-def schedule_matmul_dnnl(_, outs):
-    """Create schedule for matmul_dnnl."""
-    return generic.schedule_extern(outs)
-
-
-def dense_dynamic(A, B, bias, dtype):
-    """Compute for dense with dynamic shape"""
-
-    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
-
-    # Right now we only support matrix-vector multiplication with lhs as the
-    # vector. We don't need to do much optimization here because the access
-    # pattern and parallelization are straight forward.
-    def gen_ir(a, b, c):
-        ib = tvm.tir.ir_builder.create()
-        A = ib.buffer_ptr(a)
-        B = ib.buffer_ptr(b)
-        C = ib.buffer_ptr(c)
-        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
-            C[0, j] = 0.0
-            with ib.for_range(0, b.shape[1], name="k") as k:
-                C[0, j] += A[0, k] * B[j, k]
-        return ib.get()
-
-    def gen_ir_bias(a, b, bias, c):
-        ib = tvm.tir.ir_builder.create()
-        A = ib.buffer_ptr(a)
-        B = ib.buffer_ptr(b)
-        C = ib.buffer_ptr(c)
-        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
-            C[0, j] = bias[j]
-            with ib.for_range(0, b.shape[1], name="k") as k:
-                C[0, j] += A[0, k] * B[j, k]
-        return ib.get()
-
-    out_shape = (A.shape[0], B.shape[0])
-    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
-    if bias is None:
-        out = te.extern(
-            [out_shape],
-            [A, B],
-            lambda ins, outs: gen_ir(*ins, *outs),
-            dtype=dtype,
-            out_buffers=[out_buf],
-            name="dense_dynamic_cpu",
-            tag="dense_dynamic_cpu",
-        )
-    else:
-        out = te.extern(
-            [out_shape],
-            [A, B, bias],
-            lambda ins, outs: gen_ir_bias(*ins, *outs),
-            dtype=dtype,
-            out_buffers=[out_buf],
-            name="dense_dynamic_cpu",
-            tag="dense_dynamic_cpu",
-        )
-    return out
-
-
-def schedule_dense_dynamic(outs):
-    """Create schedule for dense_dynamic."""
-    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py
deleted file mode 100644
index 10b1248c6a3a..000000000000
--- a/python/tvm/topi/x86/dense_alter_op.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Dense alter op functions for x86"""
-
-import tvm
-from tvm import autotvm, relay, te
-from tvm.target.codegen import target_has_features
-
-from .. import nn
-from ..nn import dense_alter_layout
-from ..utils import get_const_tuple
-from .dense import _default_dense_pack_config
-
-
-def check_int8_applicable(x, y, allow_padding=False):
-    simd_avai = target_has_features(["avx512bw", "avx512f"])
-    simd_avai |= target_has_features("amx-int8")
-    # TODO(vvchernov): may be also target_has_features("avx2") or lower?
-    return (
-        simd_avai
-        and "int8" in x.dtype
-        and "int8" in y.dtype
-        and (allow_padding or (y.shape[-2] % 16 == 0 and y.shape[-1] % 4 == 0))
-    )
-
-
-@dense_alter_layout.register(["cpu"])
-def _alter_dense_layout(attrs, inputs, tinfos, out_type):
-    target = tvm.target.Target.current(allow_none=False)
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    data_tensor, weight_tensor = tinfos
-    out_dtype = out_type.dtype
-    M, K = get_const_tuple(data_tensor.shape)
-    N, _ = get_const_tuple(weight_tensor.shape)
-
-    if check_int8_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8":
-        weight_layout = "NC16n4c"
-        return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype)
-
-    _, outs = relay.backend.te_compiler.select_implementation(
-        relay.op.get("nn.dense"), attrs, tinfos, out_type, target
-    )
-    workload = autotvm.task.get_workload(outs)
-
-    if workload:
-        cfg = dispatch_ctx.query(target, workload)
-        topi_impl = workload[0]
-        if topi_impl == "dense_pack.x86":
-            if cfg.is_fallback:
-                _default_dense_pack_config(cfg, M, N, K)
-            packw_bn = cfg["tile_x"].size[-1]
-            weight_layout = f"NC{packw_bn}n"
-            new_weight = te.placeholder((N // packw_bn, K, packw_bn), dtype=weight_tensor.dtype)
-            # Relay dense doesn't have bias.
-            new_workload = autotvm.task.args_to_workload(
-                [data_tensor, new_weight, None, out_dtype], topi_impl
-            )
-            dispatch_ctx.update(target, new_workload, cfg)
-            return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype)
-
-    return None
-
-
-def int8_int8_legalize(inputs, arg_types, op, attrs, need_expand=False):
-    """Legalizes s8, s8 -> s32 GEMM op for VNNI."""
-    if (
-        check_int8_applicable(arg_types[0], arg_types[1], allow_padding=True)
-        and arg_types[0].dtype == "int8"
-    ):
-        x, y = inputs
-        x = relay.cast(x, "int32")
-        x = relay.add(x, relay.const(128, "int32"))
-        x = relay.cast(x, "uint8")
-
-        adjust_shift = relay.const(128, "int32") * relay.sum(relay.cast(y, "int32"), axis=[-1])
-
-        if need_expand:
-            adjust_shift = relay.expand_dims(adjust_shift, axis=1)
-
-        analyzer = tvm.arith.Analyzer()
-        x_shape = arg_types[0].shape
-        y_shape = arg_types[1].shape
-        inst_n = 16
-        inst_k = 4
-        pad_n = analyzer.simplify((inst_n - y_shape[-2] % inst_n) % inst_n)
-        pad_k = analyzer.simplify((inst_k - y_shape[-1] % inst_k) % inst_k)
-        if pad_k != 0 or pad_n != 0:
-            ndim = len(x_shape)
-            unpadded_dims = [(0, 0)] * (ndim - 2)
-            padding_y = [(0, 0)] * (len(y_shape) - 2) + [(0, pad_n), (0, pad_k)]
-            padded_y = relay.nn.pad(y, pad_width=padding_y, pad_value=0)
-            if pad_k != 0:
-                padding_x = [(0, 0)] * (len(x_shape) - 1) + [(0, pad_k)]
-                padded_x = relay.nn.pad(x, pad_width=padding_x, pad_value=0)
-            else:
-                padded_x = x
-            out = op(padded_x, padded_y, **attrs)
-            if pad_n != 0:
-                begin = [0] * len(x_shape)
-                end = x_shape[:-2] + [x_shape[-2], y_shape[-2]]
-                out = relay.strided_slice(out, begin, end, slice_mode="size")
-        else:
-            out = op(x, y, **attrs)
-
-        return relay.subtract(out, adjust_shift)
-
-    return None
-
-
-@nn.dense_legalize.register("cpu")
-def _dense_legalize(attrs, inputs, arg_types):
-    """Legalizes s8, s8 -> s32 dense for VNNI."""
-    return int8_int8_legalize(inputs, arg_types, relay.nn.dense, attrs)
-
-
-@nn.batch_matmul_legalize.register("cpu")
-def _batch_matmul_legalize(attrs, inputs, arg_types):
-    """Legalizes s8, s8 -> s32 batch_matmul for VNNI."""
-    if attrs["transpose_a"] or not attrs["transpose_b"]:
-        return None
-    return int8_int8_legalize(inputs, arg_types, relay.nn.batch_matmul, attrs, need_expand=True)
diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py
deleted file mode 100644
index 59d7412befc0..000000000000
--- a/python/tvm/topi/x86/depthwise_conv2d.py
+++ /dev/null
@@ -1,320 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter
-"""Depthwise Conv2D schedule on x86"""
-import tvm
-from tvm import autotvm, te
-from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity
-from tvm.target.x86 import get_simd_32bit_lanes
-
-from ..nn.conv2d import unpack_NCHWc_to_nchw
-from ..nn.depthwise_conv2d import _get_workload, depthwise_conv2d_infer_layout
-from ..nn.pad import pad
-from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
-
-
-def _fallback_schedule(cfg, wkl):
-    """
-    Get default schedule for the workload
-    Parameters
-    ----------
-    cfg : tvm.autotvm.task.space.FallbackConfigEntity
-        Fallback config to be updated
-    wkl : topi.nn.depthwise_conv2d.Workload
-        Convolution workload
-    """
-    simd_width = get_simd_32bit_lanes()
-
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
-    HSTR, WSTR = wkl.stride_h, wkl.stride_w
-    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
-
-    out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if wkl.out_filter % bn == 0:
-            oc_bn = bn
-            break
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if wkl.in_filter % bn == 0:
-            ic_bn = bn
-            break
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-def depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype):
-    """Compute depthwise conv2d with NCHW layout."""
-    layout = "NCHW"
-    packed_out = depthwise_conv2d_NCHWc(
-        data, kernel, strides, padding, dilation, layout, layout, out_dtype
-    )
-    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
-
-
-def schedule_depthwise_conv2d_nchw(outs):
-    """Create schedule for depthwise_conv2d_nchw."""
-    return schedule_depthwise_conv2d_NCHWc(outs)
-
-
-def _pack_data(cfg, data, kernel):
-    n, ic, ih, iw = get_const_tuple(data.shape)
-    filters, cm, kh, kw = get_const_tuple(kernel.shape)
-    oc = filters * cm
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-
-    ic_chunk = ic // ic_bn
-    oc_chunk = oc // oc_bn
-
-    data = te.compute(
-        (n, ic_chunk, ih, iw, ic_bn),
-        lambda bs, c, h, w, vc: data[bs, c * ic_bn + vc, h, w],
-        name="data_vec",
-    )
-
-    kernel = te.compute(
-        (oc_chunk, 1, kh, kw, 1, oc_bn),
-        lambda occ, icc, k_h, k_w, icb, ocb: kernel[
-            (occ * oc_bn + ocb) // cm, (occ * oc_bn + ocb) % cm, k_h, k_w
-        ],
-        name="kernel_vec",
-    )
-
-    return data, kernel
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_NCHWc.x86")
-def depthwise_conv2d_NCHWc(
-    cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype=None
-):
-    """Compute depthwise conv2d with NCHWc layout"""
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-
-    if len(data.shape) == 5:
-        batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
-        (
-            out_channel_chunk,
-            cm_chunk,
-            filter_height,
-            filter_width,
-            cm_block,
-            out_channel_block,
-        ) = get_const_tuple(kernel.shape)
-        in_channel = in_channel_chunk * in_channel_block
-        out_channel = out_channel_chunk * out_channel_block
-        channel_multiplier = cm_chunk * cm_block
-        assert channel_multiplier * in_channel == out_channel
-    else:
-        batch, in_channel, in_height, in_width = get_const_tuple(data.shape)
-        out_channel, channel_multiplier, filter_height, filter_width = get_const_tuple(kernel.shape)
-    assert channel_multiplier == 1
-
-    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    HSTR, WSTR = strides
-
-    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-
-    dilated_kernel_h = (filter_height - 1) * dh + 1
-    dilated_kernel_w = (filter_width - 1) * dw + 1
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    HPAD = pad_top + pad_down
-    WPAD = pad_left + pad_right
-
-    out_height = (in_height + HPAD - dilated_kernel_h) // HSTR + 1
-    out_width = (in_width + WPAD - dilated_kernel_w) // WSTR + 1
-
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
-    cfg.define_knob("unroll_kw", [True, False])
-
-    # get workload and related schedule config
-    wkl = _get_workload(
-        te.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype),
-        te.placeholder(
-            (out_channel, channel_multiplier, filter_height, filter_width), dtype=kernel.dtype
-        ),
-        strides,
-        (pad_top, pad_down),
-        dilation,
-        out_dtype,
-    )
-    if cfg.is_fallback:
-        _fallback_schedule(cfg, wkl)
-
-    # Pack data if raw 4-D data is provided.
-    # This can only happen when autotuning.
-    if len(data.shape) == 4:
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # Directly use modified data layout placeholder.
-            in_channel_block = cfg["tile_ic"].size[-1]
-            in_channel_chunk = in_channel // in_channel_block
-            out_channel_block = cfg["tile_oc"].size[-1]
-            out_channel_chunk = out_channel // out_channel_block
-            dshape = (batch, in_channel_chunk, in_height, in_width, in_channel_block)
-            data = tvm.te.placeholder(dshape, data.dtype, name="data")
-            kshape = (out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block)
-            kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel")
-        else:
-            data, kernel = _pack_data(cfg, data, kernel)
-            _, _, _, _, in_channel_block = get_const_tuple(data.shape)
-            out_channel_chunk, _, _, _, _, out_channel_block = get_const_tuple(kernel.shape)
-
-    # padding stage
-    DOPAD = pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0
-    if DOPAD:
-        pad_before = [0, 0, pad_top, pad_left, 0]
-        pad_after = [0, 0, pad_down, pad_right, 0]
-        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
-    else:
-        data_pad = data
-
-    # depthconv stage
-    idxdiv = tvm.tir.indexdiv
-    idxmod = tvm.tir.indexmod
-
-    kh = te.reduce_axis((0, filter_height), name="kh")
-    kw = te.reduce_axis((0, filter_width), name="kw")
-    Output = te.compute(
-        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
-        lambda b, oco, oh, ow, oci: te.sum(
-            (
-                data_pad[
-                    b,
-                    idxdiv(
-                        idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block
-                    ),
-                    oh * HSTR + kh * dh,
-                    ow * WSTR + kw * dw,
-                    idxmod(
-                        idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block
-                    ),
-                ].astype(out_dtype)
-                * kernel[oco, 0, kh, kw, 0, oci].astype(out_dtype)
-            ),
-            axis=[kh, kw],
-        ),
-        name="DepthwiseConv2d",
-        tag="depthwise_conv2d_NCHWc",
-    )
-    return Output
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_NCHWc.x86")
-def schedule_depthwise_conv2d_NCHWc(cfg, outs):
-    """CPU schedule for depthwise conv2d in NCHW[x]c layout"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        """Traverse operators from computation graph"""
-        if "depthwise_conv2d_NCHWc" in op.tag:
-            conv_out = op.output(0)
-            data = conv_out.op.input_tensors[0]
-            kernel = conv_out.op.input_tensors[1]
-            _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out, output):
-    tile_ow, oc_bn = cfg["tile_ow"].size[-1], cfg["tile_oc"].size[-1]
-    unroll_kw = cfg["unroll_kw"].val
-
-    # schedule pad
-    if isinstance(s[data_vec].op, tvm.te.ComputeOp) and "pad" in data_vec.op.tag:
-        batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
-        s[data_vec].vectorize(ic_block)
-        parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
-        s[data_vec].parallel(parallel_axis)
-
-    C, O = conv_out, output
-    CC = s.cache_write(C, "global")
-
-    _, ic_chunk, oh, ow, ic_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=tile_ow)
-    s[C].reorder(ic_chunk, oh, ow_chunk, ow_block, ic_block)
-    s[C].vectorize(ic_block)
-    parallel_axis = s[C].fuse(ic_chunk, oh)
-    s[C].parallel(parallel_axis)
-    s[CC].compute_at(s[C], ow_chunk)
-
-    # the ow axis in the cached block CC is the ow_block in C
-    _, ic_chunk, oh, ow, ic_block = s[CC].op.axis
-    kh, kw = s[CC].op.reduce_axis
-    s[CC].reorder(ic_chunk, oh, kh, kw, ow, ic_block)
-    if unroll_kw:
-        s[CC].unroll(kw)
-    s[CC].vectorize(ic_block)
-    s[CC].unroll(ow)
-
-    if C != O:
-        out_ndim = len(s[O].op.axis)
-        if out_ndim == 5:
-            batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-            ow_chunk, ow_block = s[O].split(ow, factor=tile_ow)
-            s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-            parallel_axis = s[O].fuse(oc_chunk, oh)
-            s[C].compute_at(s[O], parallel_axis)
-            s[O].vectorize(oc_block)
-            s[O].parallel(parallel_axis)
-        elif out_ndim == 4:
-            batch, oc, oh, ow = s[O].op.axis
-            ow_chunk, ow_block = s[O].split(ow, factor=tile_ow)
-            oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-            s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-            parallel_axis = s[O].fuse(oc_chunk, oh)
-            s[C].compute_at(s[O], parallel_axis)
-            s[O].vectorize(oc_block)
-            s[O].parallel(parallel_axis)
-        else:
-            raise ValueError(f"Unsupported output ndim: {out_ndim}")
-
-    return s
-
-
-@depthwise_conv2d_infer_layout.register("cpu")
-def _depthwise_conv2d_infer_layout(workload, cfg):
-    _, data, kernel, strides, padding, dilation, _, _, dtype = workload
-    batch_size, in_channel, in_height, in_width = data[1]
-    filter_channel, channel_multiplier, k_height, k_width = kernel[1]
-    out_channel = filter_channel * channel_multiplier
-    out_height = (in_height + padding[0] + padding[2] - k_height) // strides[0] + 1
-    out_width = (in_width + padding[1] + padding[3] - k_width) // strides[1] + 1
-    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic)
-    in_layout = f"NCHW{tile_ic}c"
-    out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc)
-    out_layout = f"NCHW{tile_oc}c"
-    return ((in_shape, in_layout),), ((out_shape, out_layout),)
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
deleted file mode 100644
index 60b99f796bf9..000000000000
--- a/python/tvm/topi/x86/group_conv2d.py
+++ /dev/null
@@ -1,372 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
-
-import tvm
-from tvm import autotvm, te
-from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity
-from tvm.target.x86 import get_simd_32bit_lanes
-
-from .. import tag
-from ..nn.conv2d import _get_workload as _get_conv2d_workload
-from ..nn.pad import pad
-from ..utils import get_const_tuple
-
-
-def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
-    """Compute group_conv2d with NCHW layout"""
-    return group_conv2d_nchw_spatial_pack(
-        data, kernel, strides, padding, dilation, groups, out_dtype
-    )
-
-
-def schedule_group_conv2d_nchw(outs):
-    """Compute group_conv2d with NCHW layout"""
-    return schedule_group_conv2d_nchwc(outs)
-
-
-def _get_default_config(
-    cfg, data, kernel, strides, padding, dilation, groups, out_dtype, layout="NCHW"
-):
-    """
-    Get default schedule config for the workload
-    """
-    static_data_shape = []
-    for dim in get_const_tuple(data.shape):
-        if isinstance(dim, tvm.tir.Var):
-            static_data_shape.append(1)
-        else:
-            static_data_shape.append(dim)
-    data = te.placeholder(static_data_shape, dtype=data.dtype)
-
-    wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout)
-    _fallback_schedule(cfg, wkl)
-
-
-def _fallback_schedule(cfg, wkl):
-    simd_width = get_simd_32bit_lanes()
-    pad_left, pad_right = wkl.padl, wkl.padr
-    stride_w = wkl.stride_w
-    out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1
-    groups = wkl.groups
-    kernels_per_group = wkl.out_filter // groups
-    kernel_depth = wkl.in_filter // groups
-
-    oc_bn = 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if kernels_per_group % bn == 0:
-            oc_bn = bn
-            break
-    if oc_bn > kernels_per_group:
-        oc_bn = kernels_per_group
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if kernel_depth % bn == 0:
-            ic_bn = bn
-            break
-    if ic_bn > kernel_depth:
-        ic_bn = kernel_depth
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
-    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
-    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
-    cfg["unroll_kw"] = OtherOptionEntity(False)
-
-
-@autotvm.register_topi_compute("group_conv2d_nchw.x86")
-def group_conv2d_nchw_spatial_pack(
-    cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
-):
-    """
-    Compute group conv2d with NCHW layout, using GSPC algorithm.
-    https://arxiv.org/abs/2006.09791
-    """
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(dilation, int):
-        dilation_h, dilation_w = dilation, dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
-    if isinstance(padding, int):
-        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
-    elif len(padding) == 2:
-        hpad, wpad = padding
-        pad_top, pad_bottom = hpad, hpad
-        pad_left, pad_right = wpad, wpad
-    else:
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-    hpad = pad_top + pad_bottom
-    wpad = pad_left + pad_right
-
-    assert isinstance(strides, int) or len(strides) == 2
-    if isinstance(strides, int):
-        stride_h, stride_w = strides, strides
-    else:
-        stride_h, stride_w = strides
-
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
-
-    pad_height = in_height + pad_top + pad_bottom
-    pad_width = in_width + pad_left + pad_right
-
-    dilated_kernel_h = (k_height - 1) * dilation_h + 1
-    dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
-
-    kernels_per_group = out_channel // groups
-
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
-    cfg.define_knob("unroll_kw", [True, False])
-
-    # If no config was set, we can fallback to default config.
-    if cfg.is_fallback:
-        _get_default_config(
-            cfg,
-            te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
-            te.placeholder(
-                (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype
-            ),
-            strides,
-            padding,
-            dilation,
-            groups,
-            out_dtype,
-        )
-
-    oc_bn = cfg["tile_oc"].size[-1]
-    ic_bn = cfg["tile_ic"].size[-1]
-
-    # pack data
-    DOPAD = hpad != 0 or wpad != 0
-    if DOPAD:
-        data_pad = pad(
-            data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad"
-        )
-    else:
-        data_pad = data
-
-    shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width)
-
-    data_vec = te.compute(
-        shape,
-        lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
-        name="data_vec",
-    )
-
-    # pack kernel
-    shape = (
-        groups,
-        kernels_per_group // oc_bn,
-        kernel_depth // ic_bn,
-        k_height,
-        k_width,
-        ic_bn,
-        oc_bn,
-    )
-
-    kernel_vec = te.compute(
-        shape,
-        lambda g, out_channel, in_channel, h, w, ci, co: kernel[
-            (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w
-        ],
-        name="kernel_vec",
-    )
-
-    # convolution
-    oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn)
-    unpack_shape = (batch_size, out_channel, out_height, out_width)
-
-    ic = te.reduce_axis((0, (kernel_depth)), name="ic")
-    kh = te.reduce_axis((0, k_height), name="kh")
-    kw = te.reduce_axis((0, k_width), name="kw")
-
-    idxmod = tvm.tir.indexmod
-    idxdiv = tvm.tir.indexdiv
-    conv = te.compute(
-        oshape,
-        lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(
-            data_vec[
-                g,
-                n,
-                idxdiv(ic, ic_bn),
-                oh * stride_h + kh * dilation_h,
-                idxmod(ic, ic_bn),
-                ow * stride_w + kw * dilation_w,
-            ].astype(out_dtype)
-            * kernel_vec[
-                g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block
-            ].astype(out_dtype),
-            axis=[ic, kh, kw],
-        ),
-        name="conv",
-    )
-
-    unpack = te.compute(
-        unpack_shape,
-        lambda n, c, h, w: conv[
-            idxdiv(c, kernels_per_group),
-            n,
-            idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
-            h,
-            w,
-            idxmod(idxmod(c, oc_bn), kernels_per_group),
-        ].astype(out_dtype),
-        name="output_unpack",
-        tag="group_conv2d_nchw",
-    )
-
-    return unpack
-
-
-@autotvm.register_topi_schedule("group_conv2d_nchw.x86")
-def schedule_group_conv2d_nchwc(cfg, outs):
-    """Create schedule for tensors"""
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-
-        if "group_conv2d_nchw" in op.tag:
-            output = op.output(0)
-
-            if "tile_ic" not in cfg:
-                return
-            conv_out = op.input_tensors[0]
-            kernel_vec = conv_out.op.input_tensors[1]
-            kernel = kernel_vec.op.input_tensors[0]
-            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-                s[kernel].compute_inline()
-            data_vec = conv_out.op.input_tensors[0]
-            data = data_vec.op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-
-            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
-            _schedule_gspc_nchw(*args)
-
-        scheduled_ops.append(op)
-
-    traverse(outs[0].op)
-    return s
-
-
-def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
-    """Schedule GSPC"""
-    ic_bn, oc_bn, reg_n, unroll_kw = (
-        cfg["tile_ic"].size[-1],
-        cfg["tile_oc"].size[-1],
-        cfg["tile_ow"].size[-1],
-        cfg["unroll_kw"].val,
-    )
-
-    _, W = data, kernel_vec
-    A0, A1 = data_pad, data_vec
-
-    # schedule data
-    if (
-        data_pad is not None
-        and isinstance(data_pad.op, tvm.te.ComputeOp)
-        and "pad" in data_pad.op.tag
-    ):
-        s[A0].compute_inline()
-
-    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
-
-    parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
-    s[A1].parallel(parallel_axis)
-
-    # schedule kernel pack
-    groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
-    s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-
-    if oc_bn > 1:
-        s[W].vectorize(oc_block)
-
-    parallel_axis = s[W].fuse(groups, oc_chunk, oh)
-    s[W].parallel(parallel_axis)
-
-    # schedule conv
-    C, O0, O = conv_out, output, last
-    CC = s.cache_write(C, "global")
-
-    _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-
-    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
-
-    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
-    s[C].fuse(oc_chunk, oh)
-    s[C].vectorize(oc_block)
-
-    groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-
-    ic, kh, kw = s[CC].op.reduce_axis
-    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
-
-    if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
-        s[CC].unroll(kw)
-    else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
-
-    parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
-
-    s[CC].parallel(parallel_axis)
-
-    s[CC].vectorize(oc_block)
-
-    s[CC].unroll(ow_block)
-
-    if O0 != O:
-        s[O0].compute_inline()
-
-    batch, oc, oh, ow = s[O].op.axis
-    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
-    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
-
-    s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block)
-    parallel_axis = s[O].fuse(oc_chunk, oh)
-    s[O].vectorize(oc_block)
-    s[O].parallel(parallel_axis)
-    return s
diff --git a/python/tvm/topi/x86/injective.py b/python/tvm/topi/x86/injective.py
deleted file mode 100644
index d197b50469f6..000000000000
--- a/python/tvm/topi/x86/injective.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""x86 declaration and schedules."""
-from tvm import te
-from tvm.topi import tag
-from tvm.tir import IntImm
-from tvm.topi.generic.injective import (
-    schedule_injective_from_existing as schedule_injective_for_concat,
-)
-from ..utils import is_empty_shape
-
-
-def schedule_injective_from_existing(sch, out):
-    """Schedule for injective op from existing schedule.
-    Parameters
-    ----------
-    sch: Schedule
-         The schedule to update.
-    out: Tensor
-         The tensor representing the injective op.
-    Returns
-    -------
-    sch: Schedule
-         The updated schedule.
-    """
-    if len(sch[out].op.axis) >= 5:
-        fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1], sch[out].op.axis[2])
-        sch[out].parallel(fused)
-    elif len(sch[out].op.axis) >= 3:
-        fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1])
-        sch[out].parallel(fused)
-    elif len(sch[out].op.axis) >= 1:
-        sch[out].parallel(sch[out].op.axis[0])
-
-    # Vectorize the inner most for loop. Tiling first to get a const extent
-    if len(sch[out].op.axis) >= 1:
-        l = sch[out].op.axis[-1]
-        lo, li = sch[out].split(l, factor=16)
-        sch[out].vectorize(li)
-
-        # for 1D loop, the above split will break the parallel axis
-        # Need to make the outer loop parallel again
-        if len(sch[out].op.axis) == 1:
-            sch[out].parallel(lo)
-
-    return sch
-
-
-def schedule_injective(outs):
-    """X86 schedule for injective op.
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    te.schedule.AutoInlineInjective(s)
-    for x in outs:
-        if not is_empty_shape(x.shape):
-            schedule_injective_from_existing(s, x)
-    return s
-
-
-def schedule_concatenate(outs):
-    """X86 schedule for concatenate op.
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-
-    def vectorize(sch, tensor, vectorize_limit):
-        """Internal vectorization function for concatenate."""
-        inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1]
-        # Check that the tensor shape is static. Otherwise skip vectorization.
-        if isinstance(tensor.shape[len(tensor.shape) - 1], IntImm):
-            inner_length = tensor.shape[len(tensor.shape) - 1].value
-            if inner_length <= vectorize_limit:
-                sch[tensor].vectorize(inner_axis)
-            else:
-                split_factor = 1
-                for i in range(vectorize_limit, 1, -1):
-                    if inner_length % i == 0:
-                        split_factor = i
-                        break
-                if split_factor > 1:
-                    _, inner_i = sch[tensor].split(inner_axis, split_factor)
-                    sch[tensor].vectorize(inner_i)
-
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    x = outs[0]
-    s = te.create_schedule([x.op for x in outs])
-    te.schedule.AutoInlineInjective(s)
-    if len(s[x].op.axis) >= 5:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
-        vectorize(s, x, 64)
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 3:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
-        s[x].parallel(fused)
-    else:
-        s[x].parallel(s[x].op.axis[0])
-    return s
-
-
-def schedule_concatenate_cpu(outs):
-    """X86 schedule for concatenate op.
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description in the format
-          of an array of tensors.
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        if tag.is_injective(op.tag):
-            schedule_injective_for_concat(s, op.output(0))
-
-        for tensor in op.input_tensors:
-            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                traverse(tensor.op)
-        scheduled_ops.append(op)
-
-    for out in outs:
-        traverse(out.op)
-
-    return s
-
-
-schedule_elemwise = schedule_injective
-schedule_broadcast = schedule_injective
diff --git a/python/tvm/topi/x86/math_alter_op.py b/python/tvm/topi/x86/math_alter_op.py
deleted file mode 100644
index 9ddc75891628..000000000000
--- a/python/tvm/topi/x86/math_alter_op.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-"""Legalization transforms for math operations on x86"""
-
-import logging
-
-from tvm import relay
-from ..math import erf_legalize
-
-logger = logging.getLogger("topi")
-
-
-@erf_legalize.register("cpu")
-def _erf_legalize(attrs, inputs, arg_types):
-    """Legalizes ERF op if needed.
-
-    Parameters
-    ----------
-    attrs : tvm.ir.Attrs
-        Attributes of current convolution
-    inputs : list of tvm.relay.Expr
-        The args of the Relay expr to be legalized
-    types : list of types
-        List of input and output types
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The legalized expr
-    """
-    # Extract types and expressions.
-    data = inputs[0]
-    data_tensor = arg_types[0]
-    # Check if the input type is supported.
-    data_dtype = data_tensor.dtype
-    # If input is not fp32, we must cast to it.
-    if data_dtype != "float32":
-        data = relay.cast(data, "float32")
-        output = relay.erf(data)
-        return relay.cast(output, data_dtype)
-
-    # Otherwise do nothing.
-    return None
diff --git a/python/tvm/topi/x86/nn.py b/python/tvm/topi/x86/nn.py
deleted file mode 100644
index 734c9f6e70cf..000000000000
--- a/python/tvm/topi/x86/nn.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-locals,unused-variable
-"""x86 nn operators"""
-from tvm import te
-from ..utils import traverse_inline
-from .injective import schedule_injective_from_existing
-
-
-def _schedule_softmax(softmax_op, s, outs):
-    op_tag = softmax_op.tag
-    if op_tag == "softmax_output":
-        exp = softmax_op.input_tensors[0]
-        expsum = softmax_op.input_tensors[1]
-        max_elem = s[exp].op.input_tensors[1]
-        delta = None
-        axis = int(softmax_op.attrs["axis"])
-    elif op_tag == "fast_softmax_output":
-        exp = softmax_op.input_tensors[0]
-        expsum = softmax_op.input_tensors[1]
-        delta = s[exp].op.input_tensors[0]
-        max_elem = s[delta].op.input_tensors[1]
-        axis = int(softmax_op.attrs["axis"])
-    elif op_tag == "log_softmax_output":
-        exp = None
-        delta = None
-        max_elem = softmax_op.input_tensors[1]
-        expsum = softmax_op.input_tensors[2]
-        axis = int(softmax_op.attrs["axis"])
-    else:
-        raise ValueError(
-            f"Tag is expected to be softmax_output or log_softmax_output. Got {op_tag}"
-        )
-
-    output = outs[0]
-
-    def _schedule(output_op, softmax_op):
-        # only parallelize outer dimensions up to axis
-        outer_axes = [output_op.axis[i] for i in range(0, axis)]
-        fused_outer_axes = s[output_op].fuse(*outer_axes)
-        s[output_op].parallel(fused_outer_axes)
-
-        if softmax_op != output_op:
-            # fuse softmax output with following elemwise ops.
-            s[softmax_op].compute_at(s[output_op], fused_outer_axes)
-
-        # move computations with the same outer dimensions under the same root
-        s[max_elem].compute_at(s[output_op], fused_outer_axes)
-        s[expsum].compute_at(s[output_op], fused_outer_axes)
-
-        if delta is not None:
-            s[exp].compute_inline()
-            s[delta].compute_inline()
-        if exp is not None:
-            s[exp].compute_at(s[output_op], fused_outer_axes)
-
-    if list(output.shape) == list(softmax_op.output(0).shape):
-        _schedule(output.op, softmax_op)
-    else:
-        # This case can happen, for example, if the 4D input to softmax
-        # is in the NCHW layout while the fused elemwise op takes the NCHWc layout.
-        # Since we parallelize over outer axes up to the "axis" parameter of softmax,
-        # softmax and the fused op need to be in the same layout if we want to
-        # fuse them under the same parallel loop.
-        # This case can be removed if softmax supported AlterLayout.
-        schedule_injective_from_existing(s, output)
-        _schedule(softmax_op, softmax_op)
-
-
-def schedule_softmax(outs):
-    """Schedule for softmax
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of softmax
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "softmax" in op.tag:
-            _schedule_softmax(op, s, outs)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def schedule_batch_norm(outs):
-    """Schedule for batch_norm
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of batch_norm
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    s = te.create_schedule([x.op for x in outs])
-    # only parallelize outer dimensions up to axis
-    output_op = outs[0].op
-    axis = output_op.axis
-    outer_axes = [output_op.axis[i] for i in range(0, len(axis) - 1)]
-    fused_outer_axes = s[output_op].fuse(*outer_axes)
-    s[output_op].parallel(fused_outer_axes)
-    # when scale or center is enabled
-    if "divide" not in output_op.name:
-        div = output_op.input_tensors[0]
-        substract = s[div].op.input_tensors[0]
-        s[div].compute_inline()
-        s[substract].compute_inline()
-    return s
diff --git a/python/tvm/topi/x86/pooling.py b/python/tvm/topi/x86/pooling.py
deleted file mode 100644
index c70046e771f8..000000000000
--- a/python/tvm/topi/x86/pooling.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable
-"""Schedule for pooling operators"""
-from tvm import te
-from .. import tag
-
-
-def _parallel_sch(sch, oshape, do_vectorize=False):
-    def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64):
-        """Internal vectorization utility function."""
-        reorder_axis = [fused_axis]
-        for i in range(num_parallel_axis, len(sch.op.axis) - 1):
-            reorder_axis.append(sch.op.axis[i])
-        k = sch.op.reduce_axis
-        fuse_k = sch.fuse(*k)
-        c = sch.op.axis[len(sch.op.axis) - 1]
-        reorder_axis += [fuse_k, c]
-        sch.reorder(*reorder_axis)
-        inner_length = oshape[len(oshape) - 1].value
-        if inner_length <= vectorize_limit:
-            sch.vectorize(c)
-        else:
-            split_factor = 1
-            for i in range(vectorize_limit, 1, -1):
-                if inner_length % i == 0:
-                    split_factor = i
-                    break
-            if split_factor > 1:
-                _, c_i = sch.split(c, split_factor)
-                sch.vectorize(c_i)
-
-    if len(sch.op.axis) >= 5:
-        fused = sch.fuse(sch.op.axis[0], sch.op.axis[1], sch.op.axis[2])
-        if do_vectorize:
-            vectorize(fused, 3)
-
-    elif len(sch.op.axis) >= 3:
-        fused = sch.fuse(sch.op.axis[0], sch.op.axis[1])
-        if do_vectorize:
-            vectorize(fused, 2)
-    else:
-        sch.parallel(sch.op.axis[0])
-        return
-    sch.parallel(fused)
-
-
-def schedule_pool(outs, layout):
-    """Schedule for pool
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of pool
-          in the format of an array of tensors.
-
-    layout: str
-        Data layout.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def _schedule(PaddedInput, Pool):
-        if isinstance(PaddedInput.op, te.tensor.ComputeOp):
-            s[PaddedInput].compute_inline()
-        do_vectorize = layout[-1] not in "DHWdhw"
-        _parallel_sch(s[Pool], outs[0].shape, do_vectorize)
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule pool
-        elif OP.tag.startswith("pool"):
-            # Average pool accumulation and division happens in different for loops (#3607).
-            # To ensure good parallel support, apply multi-threading on the second loop.
-            if OP != outs[0].op:
-                output = outs[0]
-                output_fused = s[output].fuse(output.op.axis[0], output.op.axis[1])
-                s[output].parallel(output_fused)
-
-            PaddedInput = OP.input_tensors[0]
-            Pool = OP.output(0)
-            _schedule(PaddedInput, Pool)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-def schedule_adaptive_pool(outs):
-    """Schedule for adaptive pool
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of adaptive pool
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal traverse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule pool
-        elif OP.tag.startswith("adaptive_pool"):
-            if OP != outs[0].op:
-                output = outs[0]
-                output_fused = s[output].fuse(output.op.axis[0], output.op.axis[1])
-                s[output].parallel(output_fused)
-
-            Pool = OP.output(0)
-            _parallel_sch(s[Pool], outs[0].shape)
-        else:
-            raise RuntimeError(f"Unsupported operator: {OP.tag}")
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
diff --git a/python/tvm/topi/x86/reduction.py b/python/tvm/topi/x86/reduction.py
deleted file mode 100644
index 349d4561497f..000000000000
--- a/python/tvm/topi/x86/reduction.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""x86 declaration and schedules."""
-import tvm
-from tvm import te
-from .injective import schedule_injective_from_existing
-from .. import tag
-from ..utils import get_const_tuple
-
-
-def _schedule_reduce(sch, op, is_idx_reduce=False):
-    if is_idx_reduce:
-        real_out = op.output(0)
-        fused = sch[real_out].fuse(*sch[real_out].op.axis)
-        out = op.input_tensors[0]
-    else:
-        out = op.output(0)
-
-    const_shape = True
-    out_shape = get_const_tuple(out.shape)
-    for d in out_shape:
-        if not isinstance(d, int):
-            const_shape = False
-            break
-
-    if const_shape:
-        naxes = len(sch[out].op.axis)
-        parallelism = 1
-        fuse_axes = []
-        # We choose a heuristic number 128 to limit the maximum parallelism
-        while len(fuse_axes) < naxes and parallelism < 128:
-            ivar = sch[out].op.axis[len(fuse_axes)]
-            parallelism *= int(ivar.dom.extent)
-            fuse_axes.append(ivar)
-        fused = sch[out].fuse(*fuse_axes)
-        sch[out].parallel(fused)
-    else:
-        if len(sch[out].op.axis) >= 5:
-            # avoid too many parallelism
-            fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1], sch[out].op.axis[2])
-            sch[out].parallel(fused)
-        else:
-            fused = sch[out].fuse(*sch[out].op.axis)
-            sch[out].parallel(fused)
-
-
-def schedule_reduce(outs):
-    """X86 schedule for reduction op.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of injective in the format
-          of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    sch = te.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse_before_reduce(operator):
-        """Internal traverse function"""
-        if isinstance(operator, tvm.te.PlaceholderOp):
-            return
-        if tag.is_injective(operator.tag):
-            sch[operator].compute_inline()
-            for tensor in operator.input_tensors:
-                if tensor.op not in scheduled_ops:
-                    traverse_before_reduce(tensor.op)
-        else:
-            raise RuntimeError(f"Unsupported operator: {operator.tag}")
-
-        scheduled_ops.append(operator)
-
-    def traverse_after_reduce(operator):
-        """Internal traverse function"""
-        if tag.is_broadcast(operator.tag):
-            if operator not in scheduled_ops:
-                schedule_injective_from_existing(sch, operator)
-            for tensor in operator.input_tensors:
-                traverse_after_reduce(tensor.op)
-        elif operator.tag == "comm_reduce":
-            _schedule_reduce(sch, operator, is_idx_reduce=False)
-            for tensor in operator.input_tensors:
-                if tensor.op not in scheduled_ops:
-                    traverse_before_reduce(tensor.op)
-        elif operator.tag == "comm_reduce_idx":
-            _schedule_reduce(sch, operator, is_idx_reduce=True)
-            input_tensors = operator.input_tensors[0].op.input_tensors
-            for tensor in input_tensors:
-                if tensor.op not in scheduled_ops:
-                    traverse_before_reduce(tensor.op)
-        elif isinstance(operator, tvm.te.PlaceholderOp):
-            pass
-        else:
-            raise RuntimeError(f"Unsupported operator: {operator} (tag: {operator.tag})")
-
-        scheduled_ops.append(operator)
-
-    traverse_after_reduce(outs[0].op)
-    return sch
diff --git a/python/tvm/topi/x86/roi_align.py b/python/tvm/topi/x86/roi_align.py
deleted file mode 100644
index 336a336f50e5..000000000000
--- a/python/tvm/topi/x86/roi_align.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements
-"""Non-maximum suppression operator for intel cpu"""
-import math
-
-import tvm
-from tvm.te import hybrid
-from ..tensor import full
-from ..utils import get_const_tuple
-
-
-@hybrid.script
-def roi_align_nchw_ir(
-    data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_scale, sample_ratio, mode
-):
-    """Hybrid routing fo ROI align operator in NCHW layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        4-D with shape [batch, channel, height, width]
-
-    rois : tvm.te.Tensor or numpy NDArray
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    num_rois : tvm.tir.IntImm or tvm.tir.Var
-        Number of roi. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    w_pc : tvm.te.Tensor or numpy NDArray
-        3-D weight pre-calculation buffer
-
-    pos_pc : tvm.te.Tensor or numpy NDArray
-        3-D position pre-calculation buffer
-
-    pooled_size : tvm ConsExpr
-        [out_height, out_width]
-
-    spatial_scale : tvm.tir.const
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    sample_ratio : tvm.tir.const
-        Sampling ratio of ROI align, using adaptive size by default.
-
-    mode : tvm.tir.const
-        Mode of RoiAlign. A value of 0 corrensponds to b'avg', while a value of 1 corresponds to
-        b'max'.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        4-D with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    channels = data.shape[1]
-    height = data.shape[2]
-    width = data.shape[3]
-    pooled_size_h = pooled_size[0]
-    pooled_size_w = pooled_size[1]
-    output = output_tensor((num_rois, channels, pooled_size_h, pooled_size_w), data.dtype)
-
-    for n in parallel(num_rois):
-        roi_batch_index = int32(rois[n, 0])
-        roi_start_w = rois[n, 1] * spatial_scale
-        roi_start_h = rois[n, 2] * spatial_scale
-        roi_end_w = rois[n, 3] * spatial_scale
-        roi_end_h = rois[n, 4] * spatial_scale
-
-        roi_h = max(roi_end_h - roi_start_h, 1.0)
-        roi_w = max(roi_end_w - roi_start_w, 1.0)
-
-        bin_h = roi_h / pooled_size_h
-        bin_w = roi_w / pooled_size_w
-
-        roi_bin_grid_h = sample_ratio
-        roi_bin_grid_w = roi_bin_grid_h
-        rounded_bin_h = int32(bin_h) * 1.0
-        rounded_bin_w = int32(bin_w) * 1.0
-        if sample_ratio <= 0:
-            # Cannot use ceil function since hybrid script
-            # doesn't support Call as indexing
-            roi_bin_grid_h = int32(bin_h)
-            roi_bin_grid_w = int32(bin_w)
-            if rounded_bin_h < bin_h:
-                roi_bin_grid_h += 1
-            if rounded_bin_w < bin_w:
-                roi_bin_grid_w += 1
-
-        count = roi_bin_grid_h * roi_bin_grid_w
-
-        # Pre-calculate indices and weights shared by all channels.
-        # This is the key point of optimization.
-        pre_calc_index = 0
-        iy_upper = roi_bin_grid_h
-        ix_upper = roi_bin_grid_w
-        for ph in range(pooled_size_h):
-            for pw in range(pooled_size_w):
-                for iy in range(iy_upper):
-                    yy = roi_start_h + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
-                    for ix in range(ix_upper):
-                        xx = roi_start_w + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
-                        x = xx
-                        y = yy
-                        if y < -1.0 or y > height or x < -1.0 or x > width:
-                            for i in range(4):
-                                w_pc[n, pre_calc_index, i] = 0.0
-                                pos_pc[n, pre_calc_index, i] = 0
-                        else:
-                            if y < 0.0:
-                                y = 0.0
-                            if x < 0.0:
-                                x = 0.0
-
-                            y_low = int32(y)
-                            x_low = int32(x)
-                            x_high = x_low + 1
-                            y_high = y_low + 1
-
-                            if y_low >= height - 1:
-                                y_high = height - 1
-                                y_low = y_high
-                                y = float32(y_low)
-
-                            if x_low >= width - 1:
-                                x_high = width - 1
-                                x_low = x_high
-                                x = float32(x_low)
-
-                            ly = y - y_low
-                            lx = x - x_low
-                            hy = 1.0 - ly
-                            hx = 1.0 - lx
-                            w1 = hy * hx
-                            w2 = hy * lx
-                            w3 = ly * hx
-                            w4 = ly * lx
-
-                            pos_pc[n, pre_calc_index, 0] = x_low
-                            pos_pc[n, pre_calc_index, 1] = x_high
-                            pos_pc[n, pre_calc_index, 2] = y_low
-                            pos_pc[n, pre_calc_index, 3] = y_high
-                            w_pc[n, pre_calc_index, 0] = w1
-                            w_pc[n, pre_calc_index, 1] = w2
-                            w_pc[n, pre_calc_index, 2] = w3
-                            w_pc[n, pre_calc_index, 3] = w4
-
-                        pre_calc_index += 1
-
-        for c in range(channels):
-            pre_calc_index = 0
-            for ph in range(pooled_size_h):
-                for pw in range(pooled_size_w):
-                    output_val = 0.0  # Avg mode
-                    if mode == 1:  # Max mode
-                        output_val = ninf("float32")
-                    for iy in range(roi_bin_grid_h):
-                        for ix in range(roi_bin_grid_w):
-                            bilinear_val = (
-                                w_pc[n, pre_calc_index, 0]
-                                * data[
-                                    roi_batch_index,
-                                    c,
-                                    pos_pc[n, pre_calc_index, 2],
-                                    pos_pc[n, pre_calc_index, 0],
-                                ]
-                                + w_pc[n, pre_calc_index, 1]
-                                * data[
-                                    roi_batch_index,
-                                    c,
-                                    pos_pc[n, pre_calc_index, 2],
-                                    pos_pc[n, pre_calc_index, 1],
-                                ]
-                                + w_pc[n, pre_calc_index, 2]
-                                * data[
-                                    roi_batch_index,
-                                    c,
-                                    pos_pc[n, pre_calc_index, 3],
-                                    pos_pc[n, pre_calc_index, 0],
-                                ]
-                                + w_pc[n, pre_calc_index, 3]
-                                * data[
-                                    roi_batch_index,
-                                    c,
-                                    pos_pc[n, pre_calc_index, 3],
-                                    pos_pc[n, pre_calc_index, 1],
-                                ]
-                            )
-                            pre_calc_index += 1
-                            if mode == 0:  # Avg mode
-                                output_val += bilinear_val / count
-                            if mode == 1:  # Max mode
-                                output_val = max(output_val, bilinear_val)
-                        output[n, c, ph, pw] = output_val
-    return output
-
-
-def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
-    """ROI align operator in NCHW layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    mode : str
-        Mode of RoiAlign. Should be b'max' or b'avg'.
-
-    sample_ratio : int
-        Optional sampling ratio of ROI align, using adaptive size by default.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    if not isinstance(pooled_size, (tuple, list)):
-        pooled_size = (pooled_size, pooled_size)
-
-    # Pre-allocate intermediate buffer
-    if sample_ratio > 0:
-        max_roi_bin_grid_w = max_roi_bin_grid_h = sample_ratio
-    else:
-        _, _, height, width = get_const_tuple(data.shape)
-        max_roi_bin_grid_h = math.ceil(height / pooled_size[0])
-        max_roi_bin_grid_w = math.ceil(width / pooled_size[1])
-    num_rois = rois.shape[0]
-    max_pc_shape = (
-        rois.shape[0],
-        max_roi_bin_grid_h * max_roi_bin_grid_w * pooled_size[0] * pooled_size[1],
-        4,
-    )
-    w_pc_buffer = full(max_pc_shape, data.dtype, 0)
-    pos_pc_buffer = full(max_pc_shape, "int32", 0)
-
-    pooled_size = tvm.runtime.convert(pooled_size)
-    spatial_scale = tvm.tir.const(spatial_scale, "float32")
-    sample_ratio = tvm.tir.const(sample_ratio, "int32")
-    if mode in (b"avg", 0):
-        mode = tvm.tir.const(0, dtype="float32")
-    elif mode in (b"max", 1):
-        mode = tvm.tir.const(1, dtype="float32")
-    else:
-        raise ValueError(mode, "Value %s passed in for mode not supported", mode)
-
-    return roi_align_nchw_ir(
-        data,
-        rois,
-        num_rois,
-        w_pc_buffer,
-        pos_pc_buffer,
-        pooled_size,
-        spatial_scale,
-        sample_ratio,
-        mode,
-    )
diff --git a/python/tvm/topi/x86/sparse.py b/python/tvm/topi/x86/sparse.py
deleted file mode 100644
index fdbbaf1002de..000000000000
--- a/python/tvm/topi/x86/sparse.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""sparse_dense schedule on x86"""
-from functools import partial, reduce
-
-from tvm import autotvm, te, tir
-from tvm.target.x86 import get_simd_32bit_lanes
-
-from ..transform import reshape
-from ..utils import get_const_int, traverse_inline
-
-
-def schedule_sparse_dense(outs):
-    """Create schedule for sparse dense"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        simd_width = get_simd_32bit_lanes()
-        if op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_lhs_csrmm":
-            (y_o, y_i) = s[op].split(s[op].op.axis[1], 2)
-            fused = s[op].fuse(s[op].op.axis[0], y_o)
-            s[op].parallel(fused)
-            s[op].vectorize(y_i)
-        elif op.tag == "sparse_dense_sp_rhs_bsrmm" or op.tag == "sparse_dense_sp_rhs_bsrmm":
-            y_bsrmm = op.input_tensors[0]
-            assert (
-                y_bsrmm.op.tag == "sparse_dense_sp_rhs_bsrmm_block"
-                or y_bsrmm.op.tag == "sparse_dense_sp_lhs_bsrmm_block"
-            )
-            y_reshape = op
-            (m, num_blocks, b_r) = s[y_bsrmm].op.axis
-            bs_r = get_const_int(b_r.dom.extent)
-            (elem_idx, c) = s[y_bsrmm].op.reduce_axis
-            s[y_bsrmm].reorder(num_blocks, m, elem_idx, b_r, c)
-            s[y_bsrmm].vectorize(b_r)
-            (m_o, n_o) = s[y_reshape].op.axis
-            (noo, noi) = s[y_reshape].split(n_o, bs_r)
-            s[y_bsrmm].compute_at(s[y_reshape], noi)
-            s[y_reshape].vectorize(noi)
-            if op != s[outs[0]].op:
-                (y_o, y_i) = s[outs[0].op].split(s[outs[0].op].op.axis[1], 2 * simd_width)
-                s[y_reshape].compute_at(s[outs[0]], y_o)
-                s[outs[0].op].parallel(y_o)
-                s[outs[0].op].vectorize(y_i)
-            else:
-                m_o_noo = s[y_reshape].fuse(m_o, noo)
-                s[y_reshape].parallel(m_o_noo)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv3x3_spNHWC.x86")
-def spconv2d_3x3_nhwc(cfg, data, wdat, wind, wptr, layout="NHWC"):
-    """Sparse Conv2d 3x3 compute (NHWC)."""
-    assert layout == "NHWC"
-    nsamples, imh, imw, chanin = [i.value for i in data.shape]
-    nelems, bsrr, bsrc = [i.value for i in wdat.shape]
-    chanout = (wptr.shape[0].value - 1) * bsrr
-
-    imglen, chanlen = nsamples * imh * imw, 9 * chanin
-    cfg.define_split("tile_y", imglen, num_outputs=3)
-    cfg.define_split("tile_x", chanout // bsrr, num_outputs=2)
-    cfg.add_flop(imglen * (nelems * bsrc * bsrr * 2 - chanout))
-    if cfg.is_fallback:
-        cfg["tile_y"] = autotvm.task.space.SplitEntity([-1, 160, 8])
-        cfg["tile_x"] = autotvm.task.space.SplitEntity([-1, 4])
-
-    idxsplit = lambda x, y: reduce(lambda a, b: a[:-1] + [a[-1] % b, a[-1] // b], y, [x])
-
-    @partial(te.compute, (imglen, chanlen), name="Im2Col")
-    def im2col(row, col):
-        j_w, j_h, j_n = idxsplit(row, [imw, imh])
-        j_c, k_w, k_h = idxsplit(col, [chanin, 3])
-        i_h, i_w = j_h + k_h - 1, j_w + k_w - 1
-        return tir.if_then_else(
-            tir.all(i_h >= 0, i_h < imh, i_w >= 0, i_w < imw), data[j_n, i_h, i_w, j_c], 0
-        )
-
-    @partial(te.compute, (imglen, chanout // bsrr, bsrr, bsrc), name="CC")
-    def matmul(drow, wrow, brow, bcol):
-        row_start, row_end = wptr[wrow], wptr[wrow + 1]
-        elem_idx = te.reduce_axis((0, row_end - row_start), name="elem_idx")
-        elem = row_start + elem_idx
-        return te.sum(
-            im2col[drow, wind[elem] * bsrc + bcol] * wdat[elem, brow, bcol], axis=elem_idx
-        )
-
-    sum_bsrc = te.reduce_axis((0, bsrc), name="k")
-    ret = te.compute(
-        (imglen, chanout),
-        lambda y, x: te.sum(matmul[y, x // bsrr, x % bsrr, sum_bsrc], axis=sum_bsrc),
-        name="C",
-        tag="conv3x3_spNHWC",
-    )
-    return reshape(ret, (nsamples, imh, imw, chanout))
-
-
-@autotvm.register_topi_schedule("conv3x3_spNHWC.x86")
-def schedule_spconv2d_3x3_nhwc(cfg, outs):
-    """Sparse Conv2d 3x3 schedule (NHWC)."""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv3x3_spNHWC":
-            (matmul,) = op.input_tensors
-            # wptr, wind, im2col, wdat
-            _, _, im2col, _ = matmul.op.input_tensors
-            (data,) = im2col.op.input_tensors
-            bsrr = matmul.shape[-2].value
-            chanin = data.shape[-1].value
-
-            mm_y, mm_x = s[op].op.axis
-            y_t, y_o, y_i = cfg["tile_y"].apply(s, op, mm_y)
-            x_o, x_i = s[op].split(mm_x, factor=bsrr)
-            x_t, x_o = cfg["tile_x"].apply(s, op, x_o)
-            (sum_ax,) = s[op].op.reduce_axis
-            s[op].reorder(y_t, x_t, y_o, x_o, y_i, x_i, sum_ax)
-            s[op].unroll(sum_ax)
-            s[op].vectorize(x_i)
-            s[op].unroll(y_i)
-
-            s[matmul].compute_at(s[op], x_o)
-            y_i, x_i, bsrr, bsrc = s[matmul].op.axis
-            (sum_ax,) = s[matmul].op.reduce_axis
-            s[matmul].reorder(x_i, sum_ax, y_i, bsrr, bsrc)
-            s[matmul].unroll(bsrc)
-            s[matmul].vectorize(bsrr)
-            s[matmul].unroll(y_i)
-
-            s[im2col].compute_at(s[op], y_o)
-            y_i, sum_ax = s[im2col].op.axis
-            _, k_i = s[im2col].split(sum_ax, factor=chanin)
-            s[im2col].vectorize(k_i)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-@autotvm.register_topi_compute("conv3x3_spNCHW.x86")
-def spconv2d_3x3_nchw(cfg, data, wdat, wind, wptr, layout="NCHW"):
-    """Sparse Conv2d 3x3 compute (NCHW)."""
-    nsamples, chanin, imgh, imgw = [i.value for i in data.shape]
-    nelems, veclen, bsrc = [i.value for i in wdat.shape]
-    chanout = (wptr.shape[0].value - 1) * veclen
-    assert bsrc == 1 and layout == "NCHW"
-
-    cfg.add_flop(nsamples * imgh * imgw * (nelems * veclen * bsrc * 2 - chanout))
-    cfg.define_split("tile_hw", imgh * imgw, num_outputs=3)
-    cfg.define_split("tile_ckk", chanin * 9, num_outputs=3)
-
-    @partial(te.compute, (nsamples, chanin * 3 * 3, imgh * imgw), name="im2col")
-    def im2col(nsamples, ckk, imglen):
-        j_h, j_w = imglen // imgw, imglen % imgw
-        i_c, k_h, k_w = ckk // 9, ckk // 3 % 3, ckk % 3
-        i_h, i_w = j_h + k_h - 1, j_w + k_w - 1
-        return tir.if_then_else(
-            tir.all(i_h >= 0, i_h < imgh, i_w >= 0, i_w < imgw), data[nsamples, i_c, i_h, i_w], 0
-        )
-
-    @partial(
-        te.compute,
-        (nsamples, chanout // veclen, veclen, bsrc, imgh * imgw),
-        name="CC",
-        tag="conv3x3_spNCHW",
-    )
-    def matmul(nsamples, f_o, f_i, bsrk, imglen):
-        row_start, row_end = wptr[f_o], wptr[f_o + 1]
-        elem_idx = te.reduce_axis((0, row_end - row_start), name="elem_idx")
-        elem = row_start + elem_idx
-        return te.sum(
-            im2col[nsamples, wind[elem] * bsrc + bsrk, imglen] * wdat[elem, f_i, bsrk],
-            axis=elem_idx,
-        )
-
-    return reshape(matmul, [nsamples, chanout, imgh, imgw])
-
-
-@autotvm.register_topi_schedule("conv3x3_spNCHW.x86")
-def schedule_spconv2d_3x3_nchw(cfg, outs):
-    """Sparse Conv2d 3x3 schedule (NCHW)."""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv3x3_spNCHW":
-            # wptr, wind, im2col, wdat
-            _, _, im2col, _ = op.input_tensors
-
-            n_samples, f_o, f_i, b_c, imglen = s[op].op.axis
-            (sum_ax,) = s[op].op.reduce_axis
-            hw1, hw2, hw3 = cfg["tile_hw"].apply(s, op, imglen)
-            s[op].reorder(n_samples, hw1, f_o, hw2, sum_ax, f_i, b_c, hw3)
-            s[op].unroll(f_i)
-            s[op].unroll(b_c)
-            s[op].vectorize(hw3)
-
-            s[im2col].compute_at(s[op], hw1)
-            n_samples, ckk, imglen = s[im2col].op.axis
-            ckk1, ckk2, ckk3 = cfg["tile_ckk"].apply(s, im2col, ckk)
-            hw2, hw3 = s[im2col].split(imglen, factor=cfg["tile_hw"].size[-1])
-            s[im2col].reorder(n_samples, ckk1, ckk2, hw2, ckk3, hw3)
-            s[im2col].unroll(ckk3)
-            s[im2col].vectorize(hw3)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py
deleted file mode 100644
index f2e84a62ecbd..000000000000
--- a/python/tvm/topi/x86/tensor_intrin.py
+++ /dev/null
@@ -1,574 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Core kernel of dot product of 4 Int8 operations"""
-# pylint: disable=invalid-name,unused-variable
-import tvm
-from tvm import te
-import tvm.target.codegen
-from tvm.target.x86 import target_has_features, get_simd_32bit_lanes
-
-
-def dot_16x1x16_uint8_int8_int32():
-    """Dispatch the most optimized intrin depending on the target"""
-    assert target_has_features(
-        "sse4.2"
-    ), "An old Intel machine that does not have fast Int8 support."
-    if target_has_features("avx512vnni") or target_has_features("avxvnni"):
-        # VNNI capable platform
-        return dot_16x1x16_uint8_int8_int32_cascadelake()
-    # vpmaddubsw/vpmaddwd fallback
-    return dot_16x1x16_uint8_int8_int32_skylake()
-
-
-def dot_16x1x16_uint8_int8_int32_skylake():
-    """
-    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
-    This function takes two arrays of uint8 and int8 datatype -- data[4] and
-    kernel[16][4] -- and computes a dot product of data[4] with every
-    4 elements of kernels, resulting in output[16] of int32 datatype.
-    The pseudo code is as follows.
-    .. code-block:: c
-        void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4],
-                int32 output[16]){
-            for (int i = 0; i < 16; i++){
-                output[i] = 0;
-                for (int k = 0; k < 4; k++){
-                    output[i] += data[k] * kernel[i][k]
-                }
-            }
-        }
-
-    Physically, the kernel array sits in an AVX512 vector register and
-    the data[4] is broadcasted to another AVX512 vector register. This
-    function returns a TensorIntrin that can be used to tensorize
-    a schedule.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
-    """
-
-    int32_lanes = get_simd_32bit_lanes()
-    num_int8_elements = 4  # 4 int8 elements in int32
-    data = te.placeholder((num_int8_elements,), dtype="uint8", name="data")
-    kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel")
-    k = te.reduce_axis((0, num_int8_elements), name="k")
-    C = te.compute(
-        (int32_lanes,),
-        lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(
-        kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            # int_lx32 - output datatype after pmaddubs - 16 bits to number of lanes
-            # int_8xl - input datatype to pmaddubs - 8 bits to number of lanes
-            # int_32xl - output datatype after pmaddw - 32 bits per number of lanes
-
-            if int32_lanes == 4:
-                int_lx32 = "int16x8"
-                int_8xl = "int8x16"
-                int_32xl = "int32x4"
-                pmaddubs = "llvm.x86.ssse3.pmadd.ub.sw.128"
-                pmaddw = "llvm.x86.sse2.pmadd.wd"
-            elif int32_lanes == 8:
-                int_lx32 = "int16x16"
-                int_8xl = "int8x32"
-                int_32xl = "int32x8"
-                pmaddubs = "llvm.x86.avx2.pmadd.ub.sw"
-                pmaddw = "llvm.x86.avx2.pmadd.wd"
-            elif int32_lanes == 16:
-                int_lx32 = "int16x32"
-                int_8xl = "int8x64"
-                int_32xl = "int32x16"
-                pmaddubs = "llvm.x86.avx512.pmaddubs.w.512"
-                pmaddw = "llvm.x86.avx512.pmaddw.d.512"
-
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.tir.const(0, int_32xl)))
-                return ib.get()
-
-            a_int8 = ins[0].vload([0], "uint8x4")
-            re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
-            vec_ai32 = re_int32.astype(int_32xl)
-            vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32)
-            vec_b = ins[1].vload([0, 0], int_8xl)
-            vec_one = tvm.tir.const(1, int_lx32)
-            pair_reduction = tvm.tir.call_llvm_pure_intrin(
-                int_lx32,
-                pmaddubs,
-                tvm.tir.const(2, "uint32"),
-                vec_a,
-                vec_b,
-            )
-            quad_reduction = tvm.tir.call_llvm_pure_intrin(
-                int_32xl,
-                pmaddw,
-                tvm.tir.const(2, "uint32"),
-                pair_reduction,
-                vec_one,
-            )
-            if index == 0:
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:
-                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], int_32xl)))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={data: a_buffer, kernel: b_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def dot_16x1x16_uint8_int8_int16():
-    """
-    Int8 dot product by every 2 elements using AVX512 Skylake instructions.
-    This function takes two arrays of uint8 and int8 datatype -- data[2] and
-    kernel[4][32][2] -- and computes a dot product of data[2] with every
-    2 elements of kernels, resulting in output[4][32] of int16 datatype.
-    The pseudo code is as follows.
-    .. code-block:: c
-        void dot_16x1x16_uint8_int8_int16(uint8 data[2], int8 kernel[32*4][2],
-                int16 output[32*4]){
-            for (int i = 0; i< 4; i++){
-                for (int j = 0; j < 32; j++){
-                    output[i][i] = 0;
-                    for (int k = 0; k < 2; k++){
-                        output[i][j][k] += data[k] * kernel[i][j][k]
-                    }
-                }
-            }
-        }
-
-    Physically, the kernel array sits in four AVX512 vector registers and
-    the data[2] is broadcasted to another AVX512 vector register. This
-    function returns a TensorIntrin that can be used to tensorize
-    a schedule.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
-    """
-
-    int16_lanes = 4 * 32  # 4*32 int32 lanes in 4 AVX512 vector registers
-    num_int8_elements = 2  # 2 int8 elements in int16
-    data = te.placeholder((num_int8_elements,), dtype="uint8", name="data")
-    kernel = te.placeholder((int16_lanes, num_int8_elements), dtype="int8", name="kernel")
-    k = te.reduce_axis((0, num_int8_elements), name="k")
-    C = te.compute(
-        (int16_lanes,),
-        lambda i: te.sum(data[k].astype("int16") * kernel[i, k].astype("int16"), axis=k),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype="int8", name="b_buffer", offset_factor=1)
-    # strides=[te.var('ldw'), 1, 1])
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                for i in range(4):
-                    ib.emit(outs[0].vstore([i * 32], tvm.tir.const(0, "int16x32")))
-                return ib.get()
-
-            a_int8 = ins[0].vload([0], "uint8x2")
-            re_int16 = tvm.tir.call_intrin("int16", "tir.reinterpret", a_int8)
-            vec_ai16 = re_int16.astype("int16x32")
-            vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai16)
-
-            for i in range(4):
-                vec_b = ins[1].vload([i * 32, 0], "int8x64")
-                pair_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int16x32",
-                    "llvm.x86.avx512.pmaddubs.w.512",
-                    tvm.tir.const(2, "uint32"),
-                    vec_a,
-                    vec_b,
-                )
-                if index == 0:
-                    ib.emit(outs[0].vstore([i * 32], pair_reduction))
-                else:
-                    ib.emit(
-                        outs[0].vstore(
-                            [i * 32], pair_reduction + outs[0].vload([i * 32], "int16x32")
-                        )
-                    )
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={data: a_buffer, kernel: b_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def dot_16x1x16_uint8_int8_int32_cascadelake():
-    """
-    Int8 dot product by every 4 elements using AVX512VNNI Cascade Lake instructions.
-    This function takes two arrays of uint8 and int8 datatype -- data[4] and
-    kernel[16][4] -- and computes a dot product of data[4] with every
-    4 elements of kernels, resulting in output[16] of int32 datatype.
-    The pseudo code is as follows.
-    .. code-block:: c
-        void dot_16x1x16_uint8_int8_int32_cascadelake(uint8 data[4], int8 kernel[16][4],
-                int32 output[16]){
-            for (int i = 0; i < 16; i++){
-                output[i] = 0;
-                for (int k = 0; k < 4; k++){
-                    output[i] += data[k] * kernel[i][k]
-                }
-            }
-        }
-
-    Physically, the kernel array sits in an AVX512 vector register and
-    the data[4] is broadcasted to another AVX512 vector register. This
-    function returns a TensorIntrin that can be used to tensorize
-    a schedule.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Cascade Lake int8 TensorIntrin that can be used in tensorizing schedule
-    """
-
-    int32_lanes = 16  # 16 int32 lanes in AVX512
-    num_int8_elements = 4  # 4 int8 elements in int32
-    data = te.placeholder((num_int8_elements,), dtype="uint8", name="data")
-    kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel")
-    k = te.reduce_axis((0, num_int8_elements), name="k")
-    C = te.compute(
-        (int32_lanes,),
-        lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
-        name="C",
-    )
-
-    a_buffer = tvm.tir.decl_buffer(
-        data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]
-    )
-    b_buffer = tvm.tir.decl_buffer(
-        kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
-    )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.tir.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x16")))
-                return ib.get()
-
-            a_int8 = ins[0].vload([0], "uint8x4")
-            re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
-            vec_ai32 = re_int32.astype("int32x16")
-            vec_b = ins[1].vload([0, 0], "int8x64")
-
-            vnni_inst_name = "llvm.x86.avx512.vpdpbusd.512"
-            llvm_id = tvm.target.codegen.llvm_lookup_intrinsic_id(vnni_inst_name)
-
-            if llvm_id != 0:  # VNNI is available for current LLVM version
-                vec_bi32 = tvm.tir.call_intrin("int32x16", "tir.reinterpret", vec_b)
-                vec_c = outs[0].vload([0], "int32x16")
-                quad_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int32x16",
-                    "llvm.x86.avx512.vpdpbusd.512",
-                    tvm.tir.const(3, "uint32"),
-                    vec_c,
-                    vec_ai32,
-                    vec_bi32,
-                )
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:  # Fall back to the normal AVX512
-                vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32)
-                vec_one = tvm.tir.const(1, "int16x32")
-                pair_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int16x32",
-                    "llvm.x86.avx512.pmaddubs.w.512",
-                    tvm.tir.const(2, "uint32"),
-                    vec_a,
-                    vec_b,
-                )
-                quad_reduction = tvm.tir.call_llvm_pure_intrin(
-                    "int32x16",
-                    "llvm.x86.avx512.pmaddw.d.512",
-                    tvm.tir.const(2, "uint32"),
-                    pair_reduction,
-                    vec_one,
-                )
-                if index == 0:
-                    ib.emit(outs[0].vstore(0, quad_reduction))
-                else:
-                    ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x16")))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    buffer_params = {"offset_factor": 1}
-    return te.decl_tensor_intrin(
-        C.op,
-        _intrin_func,
-        binds={data: a_buffer, kernel: b_buffer},
-        default_buffer_params=buffer_params,
-    )
-
-
-def dot_32x128x32_u8s8s32_sapphirerapids(LDA):
-    """
-    Int8 dot product by every 16x64 elements using AMX-TMUL Sapphire Rapids instructions.
-    The tdpxxd instruction takes two tile of uint8 and int8 datatype -- data[16][64] and
-    kernel[1][16][16][4] -- and computes a dot product of data[16][16] in int32 datatype.
-
-    (Physically, to efficiently leveraging the tile register, we constructing a 2x2 tiles
-    matmul which performs 32x128x32 in total)
-
-    The pseudo code is as follows:
-        for(k=0; k<2; k++){
-            for(n=0; n<2; n++){
-                tileload64(tmm_b, B)
-                for(m=0; m<2; m++){
-                    if(n==0)
-                        tileload64(tmm_a, A)
-                    tdpbusd(tmm_c, tmm_a, tmm_b)
-                }
-            }
-        }
-
-    Args:
-        LDA (int): the stride of the matrix A, which is uint8 type and use it to determine
-                    memory strides of macro reduce axis.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Sapphire Rapids AMX-TMUL int8 tdpbusd TensorIntrin that can be used in tensorizing
-        schedule
-    """
-    A = te.placeholder((32, 128), name="A", dtype="uint8")
-    B = te.placeholder((2, 32, 16, 4), name="B", dtype="int8")
-    k = te.reduce_axis((0, 128), name="k")
-
-    C = te.compute(
-        (32, 32),
-        lambda i, j: te.sum(
-            A[i, k].astype("int32")
-            * B[tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(k, 4), j % 16, k % 4].astype("int32"),
-            axis=k,
-        ),
-        name="C",
-    )
-
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, offset_factor=1, strides=[te.var("ldw"), 1], name="BA"
-    )
-    BB = tvm.tir.decl_buffer(
-        B.shape,
-        B.dtype,
-        offset_factor=1,
-        strides=[te.var("ldw"), te.var("ldw"), te.var("ldw"), 1],
-        name="BB",
-    )
-    BC = tvm.tir.decl_buffer(
-        C.shape, C.dtype, offset_factor=1, strides=[te.var("ldw"), 1], name="BC", scope="amx.tmm"
-    )
-
-    def intrin_func(ins, outs):  # pylint: disable=unused-variable
-        bufA = ins[0]
-        bufB = ins[1]
-        bufC = outs[0]
-
-        assert LDA
-        _strides_A = tvm.tir.const(LDA, dtype="uint64")
-        _strides_B_tile = tvm.tir.const(LDA / 128, dtype="uint64")
-
-        def init():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_llvm_intrin(
-                    "int32",
-                    "llvm.x86.tilezero",
-                    tvm.tir.const(1, "uint8"),
-                    tvm.tir.const(0, dtype="uint8"),
-                )
-            )  # tile C 0
-            ib.emit(
-                tvm.tir.call_llvm_intrin(
-                    "int32",
-                    "llvm.x86.tilezero",
-                    tvm.tir.const(1, "uint8"),
-                    tvm.tir.const(1, dtype="uint8"),
-                )
-            )  # tile C 1
-            ib.emit(
-                tvm.tir.call_llvm_intrin(
-                    "int32",
-                    "llvm.x86.tilezero",
-                    tvm.tir.const(1, "uint8"),
-                    tvm.tir.const(2, dtype="uint8"),
-                )
-            )  # tile C 2
-            ib.emit(
-                tvm.tir.call_llvm_intrin(
-                    "int32",
-                    "llvm.x86.tilezero",
-                    tvm.tir.const(1, "uint8"),
-                    tvm.tir.const(3, dtype="uint8"),
-                )
-            )  # tile C 3
-
-            return ib.get()
-
-        def body():  # load A, load B, dpbusd, store C
-            ib = tvm.tir.ir_builder.create()
-
-            for k_tile in range(2):  # reduced data blocks
-                for n_acc in range(2):  # broadcast data blocks
-                    tmm_B_ = tvm.tir.const(n_acc + 6, dtype="uint8")
-                    ib.emit(
-                        tvm.tir.call_llvm_intrin(
-                            "int32",
-                            "llvm.x86.tileloaddt164",  # load B: tmm6, tmm7
-                            tvm.tir.const(3, "uint8"),
-                            tmm_B_,
-                            bufB.access_ptr(
-                                "r", offset=64 * 16 * (n_acc * 2 * _strides_B_tile + k_tile)
-                            ),
-                            tvm.tir.const(64, dtype="uint64"),
-                        )
-                    )
-
-                    for m_acc in range(2):  # loaded data blocks
-                        tmm_A_ = tvm.tir.const(m_acc + 4, dtype="uint8")
-                        if n_acc == 0:
-                            ib.emit(
-                                tvm.tir.call_llvm_intrin(
-                                    "int32",
-                                    "llvm.x86.tileloaddt164",  # load A: , tmm4, tmm5
-                                    tvm.tir.const(3, "uint8"),
-                                    tmm_A_,
-                                    bufA.access_ptr(
-                                        "r", offset=m_acc * 16 * _strides_A + k_tile * 64
-                                    ),
-                                    _strides_A,
-                                )
-                            )
-
-                        tmm_C_ = tvm.tir.const(m_acc * 2 + n_acc, dtype="uint8")
-                        ib.emit(
-                            tvm.tir.call_llvm_intrin(
-                                "int32",
-                                "llvm.x86.tdpbusd",
-                                tvm.tir.const(3, "uint8"),
-                                tmm_C_,
-                                tmm_A_,
-                                tmm_B_,
-                            )
-                        )  # tdpxxd
-
-            return ib.get()
-
-        # body, reset, store
-        return (
-            body(),
-            init(),
-            body(),
-        )
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
-
-
-def acc_32x32_int32_sapphirerapids(LDC):
-    """
-    Store the accumulated tile register in scope amx.tmm to global memory.
-    (tmm0, tmm1, tmm2, tmm3 --> global 4 tiles)
-
-    Args:
-        LDC (int): the stride of the matrix C, which is int32 type and use it to
-                    determine memory strides.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Sapphirerapids AMX-TMUL int8 tilestored64 TensorIntrin that can be used
-        in tensorizing schedule
-    """
-    A = te.placeholder((32, 32), name="A", dtype="int32")
-    bufA = tvm.tir.decl_buffer(
-        A.shape,
-        A.dtype,
-        scope="amx.tmm",
-        name="a_buffer",
-        offset_factor=1,
-        strides=[te.var("ldw"), 1],
-    )
-
-    C = te.compute((32, 32), lambda i, j: A[i, j], name="C")
-    bufC = tvm.tir.decl_buffer(
-        C.shape,
-        C.dtype,
-        scope="global",
-        name="c_buffer",
-        offset_factor=1,
-        strides=[te.var("ldw"), 1],
-    )
-
-    assert LDC
-    _strides_C = tvm.tir.const(4 * LDC, dtype="uint64")
-
-    def intrin_func(ins, outs):  # pylint: disable=unused-variable
-        ib = tvm.tir.ir_builder.create()
-        bufA = ins[0]
-        bufC = outs[0]
-        for n_acc in range(2):  # broadcast data blocks
-            for m_acc in range(2):  # loaded data blocks
-                ib.emit(
-                    tvm.tir.call_llvm_intrin(
-                        "int32",
-                        "llvm.x86.tilestored64",
-                        tvm.tir.const(3, "uint8"),
-                        tvm.tir.const(m_acc * 2 + n_acc, dtype="uint8"),
-                        bufC.access_ptr("w", offset=n_acc * 16 + m_acc * 16 * _strides_C / 4),
-                        _strides_C,
-                    )
-                )
-
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: bufA, C: bufC})
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index e75150859f90..26b4398b427d 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -19,14 +19,5 @@
 members = [
 	"tvm-sys",
 	"tvm-macros",
-	"tvm-rt",
-	"tvm",
-	"tvm/tests/basics",
-	"tvm/tests/callback",
-	"tvm/examples/resnet",
-	"tvm-graph-rt",
-	"tvm-graph-rt/tests/test_tvm_basic",
-	"tvm-graph-rt/tests/test_tvm_dso",
-	"tvm-graph-rt/tests/test_nn",
-	"compiler-ext",
+	"tvm-rt"
 ]
diff --git a/rust/compiler-ext/Cargo.toml b/rust/compiler-ext/Cargo.toml
deleted file mode 100644
index b830b7a84135..000000000000
--- a/rust/compiler-ext/Cargo.toml
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "compiler-ext"
-version = "0.1.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[lib]
-crate-type = ["staticlib", "cdylib"]
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-tvm = { path = "../tvm", default-features = false, features = ["static-linking"] }
-log = "*"
-env_logger = "*"
diff --git a/rust/compiler-ext/src/lib.rs b/rust/compiler-ext/src/lib.rs
deleted file mode 100644
index 278060ef4897..000000000000
--- a/rust/compiler-ext/src/lib.rs
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use env_logger;
-use tvm::export;
-
-fn diagnostics() -> Result<(), tvm::Error> {
-    tvm::ir::diagnostics::codespan::init()
-}
-
-export!(diagnostics);
-
-#[no_mangle]
-extern "C" fn compiler_ext_initialize() -> i32 {
-    let _ = env_logger::try_init();
-    tvm_export("rust_ext").expect("failed to initialize the Rust compiler extensions.");
-    log::debug!("Loaded the Rust compiler extension.");
-    return 0;
-}
diff --git a/rust/tvm-graph-rt/Cargo.toml b/rust/tvm-graph-rt/Cargo.toml
deleted file mode 100644
index c8db44eadf9b..000000000000
--- a/rust/tvm-graph-rt/Cargo.toml
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "tvm-graph-rt"
-version = "0.1.0-alpha"
-license = "Apache-2.0"
-description = "A static graph executor for TVM."
-repository = "https://github.com/apache/tvm"
-readme = "README.md"
-keywords = ["tvm"]
-categories = ["api-bindings", "science"]
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-crossbeam-channel = "0.4"
-thiserror = "1"
-
-itertools = "0.8"
-lazy_static = "1.4"
-ndarray="0.12"
-nom = "5.0"
-num_cpus = "1.10"
-serde = { version = "^1.0", features = ["derive"] }
-serde_json = "^1.0"
-tvm-sys = { version = "0.1.1-alpha", path = "../tvm-sys" }
-tvm-macros = { version = "0.1.1-alpha", path = "../tvm-macros" }
-
-[target.'cfg(not(any(target_arch = "wasm32", target_env = "sgx")))'.dependencies]
-libloading = "0.5"
diff --git a/rust/tvm-graph-rt/README.md b/rust/tvm-graph-rt/README.md
deleted file mode 100644
index f1355b042882..000000000000
--- a/rust/tvm-graph-rt/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# tvm-graph-rt
-
-An implementation of TVM's graph runtime in Rust. See `tvm` crate for more documentation.
diff --git a/rust/tvm-graph-rt/src/allocator.rs b/rust/tvm-graph-rt/src/allocator.rs
deleted file mode 100644
index fe741aa69c23..000000000000
--- a/rust/tvm-graph-rt/src/allocator.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::alloc::{self, Layout, LayoutError};
-
-const DEFAULT_ALIGN_BYTES: usize = 4;
-
-#[derive(PartialEq, Eq)]
-pub struct Allocation {
-    layout: Layout,
-    ptr: *mut u8,
-}
-
-impl Allocation {
-    /// Allocates a chunk of memory of `size` bytes with optional alignment.
-    pub fn new(size: usize, align: Option<usize>) -> Result<Self, LayoutError> {
-        let alignment = align.unwrap_or(DEFAULT_ALIGN_BYTES);
-        let layout = Layout::from_size_align(size, alignment)?;
-        let ptr = unsafe { alloc::alloc(layout) };
-        if ptr.is_null() {
-            alloc::handle_alloc_error(layout);
-        }
-        Ok(Self { ptr, layout })
-    }
-
-    pub fn as_mut_ptr(&self) -> *mut u8 {
-        self.ptr
-    }
-
-    /// Returns the size of the Allocation in bytes.
-    pub fn size(&self) -> usize {
-        self.layout.size()
-    }
-
-    /// Returns the byte alignment of the Allocation.
-    pub fn align(&self) -> usize {
-        self.layout.align()
-    }
-
-    /// Returns a view of the Allocation.
-    pub fn as_slice(&self) -> &[u8] {
-        unsafe { std::slice::from_raw_parts(self.as_mut_ptr(), self.size()) }
-    }
-
-    /// Returns a mutable view of the Allocation.
-    pub fn as_mut_slice(&mut self) -> &mut [u8] {
-        unsafe { std::slice::from_raw_parts_mut(self.as_mut_ptr(), self.size()) }
-    }
-}
-
-impl Drop for Allocation {
-    fn drop(&mut self) {
-        unsafe {
-            alloc::dealloc(self.ptr, self.layout);
-        }
-    }
-}
diff --git a/rust/tvm-graph-rt/src/array.rs b/rust/tvm-graph-rt/src/array.rs
deleted file mode 100644
index 1a8ff81f56c4..000000000000
--- a/rust/tvm-graph-rt/src/array.rs
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{convert::TryFrom, mem, os::raw::c_void, ptr, slice};
-
-use ndarray;
-use tvm_sys::{ffi::DLTensor, DataType, Device};
-
-use crate::allocator::Allocation;
-use crate::errors::ArrayError;
-use std::alloc::LayoutError;
-
-/// A `Storage` is a container which holds `Tensor` data.
-#[derive(PartialEq)]
-pub enum Storage<'a> {
-    /// A `Storage` which owns its contained bytes.
-    Owned(Allocation),
-
-    /// A view of an existing `Storage`.
-    View(&'a mut [u8], usize), // ptr, align
-}
-
-impl<'a> Storage<'a> {
-    pub fn new(size: usize, align: Option<usize>) -> Result<Storage<'static>, LayoutError> {
-        Ok(Storage::Owned(Allocation::new(size, align)?))
-    }
-
-    pub fn as_mut_ptr(&self) -> *mut u8 {
-        match self {
-            Storage::Owned(alloc) => alloc.as_mut_ptr(),
-            Storage::View(slice, _) => slice.as_ptr() as *mut u8,
-        }
-    }
-
-    pub fn size(&self) -> usize {
-        match self {
-            Storage::Owned(alloc) => alloc.size(),
-            Storage::View(slice, _) => slice.len(),
-        }
-    }
-
-    pub fn align(&self) -> usize {
-        match self {
-            Storage::Owned(alloc) => alloc.align(),
-            Storage::View(_, align) => *align,
-        }
-    }
-
-    pub fn as_ptr(&self) -> *const u8 {
-        self.as_mut_ptr() as *const _
-    }
-
-    /// Returns a `Storage::View` which points to an owned `Storage::Owned`.
-    pub fn view(&self) -> Storage<'a> {
-        match self {
-            Storage::Owned(alloc) => Storage::View(
-                unsafe { slice::from_raw_parts_mut(alloc.as_mut_ptr(), self.size()) },
-                self.align(),
-            ),
-            Storage::View(slice, _) => Storage::View(
-                unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), slice.len()) },
-                self.align(),
-            ),
-        }
-    }
-
-    pub fn is_owned(&self) -> bool {
-        match self {
-            Storage::Owned(_) => true,
-            _ => false,
-        }
-    }
-
-    /// Returns an owned version of this storage via cloning.
-    pub fn to_owned(&self) -> Storage<'static> {
-        let s = Storage::new(self.size(), Some(self.align())).unwrap();
-        unsafe {
-            s.as_mut_ptr()
-                .copy_from_nonoverlapping(self.as_ptr(), self.size());
-        }
-        s
-    }
-
-    /// Returns a view of the stored data.
-    pub fn as_slice(&self) -> &[u8] {
-        match self {
-            Storage::Owned(alloc) => alloc.as_slice(),
-            Storage::View(slice, _) => &*slice,
-        }
-    }
-
-    /// Returns a mutable view of the stored data.
-    pub fn as_mut_slice(&mut self) -> &mut [u8] {
-        match self {
-            Storage::Owned(alloc) => alloc.as_mut_slice(),
-            Storage::View(slice, _) => slice,
-        }
-    }
-}
-
-impl<'d, 's, T> From<&'d [T]> for Storage<'s> {
-    fn from(data: &'d [T]) -> Self {
-        let data = unsafe {
-            slice::from_raw_parts_mut(
-                data.as_ptr() as *const u8 as *mut u8,
-                data.len() * mem::size_of::<T>() as usize,
-            )
-        };
-        Storage::View(data, mem::align_of::<T>())
-    }
-}
-
-/// A n-dimensional array type which can be converted to/from `tvm::DLTensor` and `ndarray::Array`.
-/// `Tensor` is primarily a holder of data which can be operated on via TVM (via `DLTensor`) or
-/// converted to `ndarray::Array` for non-TVM processing.
-///
-/// # Examples
-///
-/// ```
-/// extern crate ndarray;
-/// use std::convert::TryInto;
-/// use tvm_graph_rt::{call_packed, DLTensor, ArgValue, RetValue, Tensor};
-///
-/// let mut a_nd: ndarray::Array1<f32> = ndarray::Array::from_vec(vec![1f32, 2., 3., 4.]);
-/// let mut a: Tensor = a_nd.into();
-/// let mut a_dl: DLTensor = (&mut a).into();
-///
-/// let tvm_fn = |args: &[ArgValue]| -> Result<RetValue, ()> { Ok(RetValue::default()) };
-/// call_packed!(tvm_fn, &mut a_dl);
-///
-/// // Array -> Tensor is mostly useful when post-processing TVM graph outputs.
-/// let mut a_nd: ndarray::ArrayD<f32> = a.try_into().unwrap();
-/// ```
-#[derive(PartialEq)]
-pub struct Tensor<'a> {
-    /// The bytes which contain the data this `Tensor` represents.
-    pub(crate) data: Storage<'a>,
-    pub(crate) device: Device,
-    pub(crate) dtype: DataType,
-    pub(crate) shape: Vec<i64>,
-    // ^ not usize because `typedef int64_t tvm_index_t` in c_runtime_api.h
-    /// The `Tensor` strides. Can be `None` if the `Tensor` is contiguous.
-    pub(crate) strides: Option<Vec<usize>>,
-    pub(crate) byte_offset: isize,
-    /// The number of elements in the `Tensor`.
-    pub(crate) size: usize,
-}
-
-unsafe impl<'a> Send for Tensor<'a> {}
-
-impl<'a> Tensor<'a> {
-    pub fn shape(&self) -> Vec<i64> {
-        self.shape.clone()
-    }
-
-    pub fn data(&self) -> &Storage {
-        &self.data
-    }
-
-    pub fn data_mut(&mut self) -> &'a mut Storage {
-        &mut self.data
-    }
-
-    /// Returns the data of this `Tensor` as a `Vec`.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `Tensor` is not contiguous or does not contain elements of type `T`.
-    pub fn to_vec<T: 'static + std::fmt::Debug + Clone>(&self) -> Vec<T> {
-        assert!(self.is_contiguous());
-        assert!(self.dtype.is_type::<T>());
-        unsafe { slice::from_raw_parts(self.data.as_ptr() as *const T, self.size).to_vec() }
-    }
-
-    /// Returns `true` iff this `Tensor` is represented by a contiguous region of memory.
-    pub fn is_contiguous(&self) -> bool {
-        match self.strides {
-            None => true,
-            Some(ref strides) => {
-                // check that stride for each dimension is the
-                // product of all trailing dimensons' shapes
-                self.shape
-                    .iter()
-                    .zip(strides)
-                    .rfold(
-                        (true, 1),
-                        |(is_contig, expected_stride), (shape, stride)| {
-                            (
-                                is_contig && *stride == expected_stride,
-                                expected_stride * (*shape as usize),
-                            )
-                        },
-                    )
-                    .0
-            }
-        }
-    }
-
-    /// Returns a clone of this `Tensor`.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `Tensor` is not contiguous or does not contain elements of type `T`.
-    pub fn copy(&mut self, other: &Tensor) {
-        assert!(
-            self.dtype == other.dtype && self.size == other.size,
-            "Tensor shape/dtype mismatch."
-        );
-        assert!(
-      self.is_contiguous() && other.is_contiguous(),
-      "copy currently requires contiguous tensors\n`self.strides = {:?}` `other.strides = {:?}`",
-      self.strides,
-      other.strides
-    );
-        unsafe {
-            self.data
-                .as_mut_ptr()
-                .offset(self.byte_offset as isize)
-                .copy_from_nonoverlapping(
-                    other.data.as_mut_ptr().offset(other.byte_offset),
-                    other.size * other.dtype.itemsize(),
-                );
-        }
-    }
-
-    /// Returns an owned version of this `Tensor` via cloning.
-    pub fn to_owned(&self) -> Tensor<'static> {
-        let t = Tensor {
-            data: self.data.to_owned(),
-            device: self.device,
-            dtype: self.dtype,
-            size: self.size,
-            shape: self.shape.clone(),
-            strides: None,
-            byte_offset: 0,
-        };
-        unsafe { mem::transmute::<Tensor<'a>, Tensor<'static>>(t) }
-    }
-
-    fn from_array_storage<'s, T, D: ndarray::Dimension>(
-        arr: &ndarray::Array<T, D>,
-        storage: Storage<'s>,
-        dtype_fn: fn(u8, u16) -> DataType,
-    ) -> Tensor<'s> {
-        let type_width = mem::size_of::<T>() as u8;
-
-        Tensor {
-            data: storage,
-            device: Device::default(),
-            dtype: dtype_fn(8 * type_width, 1),
-            size: arr.len(),
-            shape: arr.shape().iter().map(|&v| v as i64).collect(),
-            strides: Some(arr.strides().iter().map(|&v| v as usize).collect()),
-            byte_offset: 0,
-        }
-    }
-
-    pub fn as_dltensor(&self, flatten: bool) -> DLTensor {
-        assert!(!flatten || self.is_contiguous());
-        DLTensor {
-            data: unsafe { self.data.as_mut_ptr().offset(self.byte_offset) } as *mut c_void,
-            device: self.device.into(),
-            ndim: if flatten { 1 } else { self.shape.len() } as i32,
-            dtype: self.dtype.into(),
-            shape: if flatten {
-                &self.size as *const _ as *mut i64
-            } else {
-                self.shape.as_ptr()
-            } as *mut i64,
-            strides: if flatten || self.is_contiguous() {
-                ptr::null_mut()
-            } else {
-                self.strides.as_ref().unwrap().as_ptr()
-            } as *mut i64,
-            byte_offset: 0,
-            ..Default::default()
-        }
-    }
-}
-
-/// Conversions to `ndarray::Array` from `Tensor`, if the types match.
-macro_rules! impl_ndarray_try_from_tensor {
-    ($type:ty, $dtype:expr) => {
-        impl<'t> TryFrom<Tensor<'t>> for ndarray::ArrayD<$type> {
-            type Error = ArrayError;
-            fn try_from(tensor: Tensor) -> Result<ndarray::ArrayD<$type>, Self::Error> {
-                if tensor.dtype != $dtype {
-                    return Err(ArrayError::IncompatibleDataType(tensor.dtype));
-                }
-                Ok(ndarray::Array::from_shape_vec(
-                    tensor
-                        .shape
-                        .iter()
-                        .map(|s| *s as usize)
-                        .collect::<Vec<usize>>(),
-                    tensor.to_vec::<$type>(),
-                )
-                .map_err(|_| ArrayError::ShapeError(tensor.shape.clone()))?)
-            }
-        }
-    };
-}
-
-macro_rules! make_dtype_const {
-    ($name: ident, $cnst:expr) => {
-        pub const $name: DataType = $cnst;
-    };
-}
-
-make_dtype_const!(DTYPE_INT32, DataType::int(32, 1));
-make_dtype_const!(DTYPE_UINT32, DataType::uint(32, 1));
-make_dtype_const!(DTYPE_FLOAT32, DataType::float(32, 1));
-make_dtype_const!(DTYPE_FLOAT64, DataType::float(64, 1));
-impl_ndarray_try_from_tensor!(i32, DTYPE_INT32);
-impl_ndarray_try_from_tensor!(u32, DTYPE_UINT32);
-impl_ndarray_try_from_tensor!(f32, DTYPE_FLOAT32);
-impl_ndarray_try_from_tensor!(f64, DTYPE_FLOAT64);
-
-impl<'a, 't> From<&'a Tensor<'t>> for DLTensor {
-    fn from(tensor: &'a Tensor<'t>) -> Self {
-        Tensor::as_dltensor(tensor, false /* flatten */)
-    }
-}
-
-impl<'a, 't> From<&'a mut Tensor<'t>> for DLTensor {
-    fn from(tensor: &'a mut Tensor<'t>) -> Self {
-        Tensor::as_dltensor(tensor, false /* flatten */)
-    }
-}
-
-impl<'a> From<DLTensor> for Tensor<'a> {
-    fn from(dlt: DLTensor) -> Self {
-        unsafe {
-            let dtype = DataType::from(dlt.dtype);
-            let shape = slice::from_raw_parts(dlt.shape, dlt.ndim as usize).to_vec();
-            let size = shape.iter().map(|v| *v as usize).product::<usize>() as usize;
-            let storage = Storage::from(slice::from_raw_parts(
-                dlt.data as *const u8,
-                dtype.itemsize() * size,
-            ));
-            Self {
-                data: storage,
-                device: Device::default(),
-                dtype,
-                size,
-                shape,
-                strides: if dlt.strides.is_null() {
-                    None
-                } else {
-                    Some(slice::from_raw_parts_mut(dlt.strides as *mut usize, size).to_vec())
-                },
-                byte_offset: dlt.byte_offset as isize,
-            }
-        }
-    }
-}
-
-/// `From` conversions to `Tensor` for owned or borrowed `ndarray::Array`.
-///
-/// # Panics
-///
-/// Panics if the ndarray is not contiguous.
-macro_rules! impl_tensor_from_ndarray {
-    ($type:ty, $dtype_fn:expr) => {
-        impl<D: ndarray::Dimension> From<ndarray::Array<$type, D>> for Tensor<'static> {
-            fn from(arr: ndarray::Array<$type, D>) -> Self {
-                let storage = Storage::from(arr.as_slice().expect("NDArray must be contiguous"));
-                Tensor::from_array_storage(&arr, storage.to_owned(), $dtype_fn)
-            }
-        }
-        impl<'a, D: ndarray::Dimension> From<&'a ndarray::Array<$type, D>> for Tensor<'a> {
-            fn from(arr: &'a ndarray::Array<$type, D>) -> Self {
-                let storage = Storage::from(arr.as_slice().expect("NDArray must be contiguous"));
-                Tensor::from_array_storage(arr, storage, $dtype_fn)
-            }
-        }
-    };
-}
-
-impl_tensor_from_ndarray!(f32, DataType::float);
-impl_tensor_from_ndarray!(f64, DataType::float);
-impl_tensor_from_ndarray!(i32, DataType::int);
-impl_tensor_from_ndarray!(i64, DataType::int);
-impl_tensor_from_ndarray!(u32, DataType::uint);
-impl_tensor_from_ndarray!(u64, DataType::uint);
diff --git a/rust/tvm-graph-rt/src/errors.rs b/rust/tvm-graph-rt/src/errors.rs
deleted file mode 100644
index c4bddb85b0de..000000000000
--- a/rust/tvm-graph-rt/src/errors.rs
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use thiserror::Error;
-use tvm_sys::DataType;
-
-#[derive(Debug, Error)]
-pub enum GraphFormatError {
-    #[error("Failed to parse graph with error: {0}")]
-    Parse(#[source] serde_json::Error),
-    #[error("Failed to parse graph parameters with error: {0:?}")]
-    Params(#[source] Option<nom::Err<(Vec<u8>, nom::error::ErrorKind)>>),
-    #[error("{0} is missing attribute: {1}")]
-    MissingAttr(String, String),
-    #[error("Failed to parse graph attribute '{0}' with error: {1}")]
-    InvalidAttr(String, #[source] std::num::ParseIntError),
-    #[error("Missing field: {0}")]
-    MissingField(&'static str),
-    #[error("Invalid DLType: {0}")]
-    InvalidDLType(String),
-    #[error("Unsupported Op: {0}")]
-    UnsupportedOp(String),
-}
-
-#[derive(Debug, Error)]
-#[error("Function {0} not found")]
-pub struct FunctionNotFound(pub String);
-
-#[derive(Debug, Error)]
-#[error("Pointer {0:?} invalid when freeing")]
-pub struct InvalidPointer(pub *mut u8);
-
-#[derive(Debug, Error)]
-pub enum ArrayError {
-    #[error("Cannot convert Tensor with dtype {0} to ndarray")]
-    IncompatibleDataType(DataType),
-    #[error("Shape error when casting ndarray to TVM Array with shape {0:?}")]
-    ShapeError(Vec<i64>),
-}
diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs
deleted file mode 100644
index 058e55b0261c..000000000000
--- a/rust/tvm-graph-rt/src/graph.rs
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    cmp, collections::HashMap, convert::TryFrom, error::Error, iter::FromIterator, mem, str,
-};
-
-use itertools::izip;
-use nom::{
-    character::complete::{alpha1, digit1},
-    complete, count, do_parse, length_count, map, named,
-    number::complete::{le_i32, le_i64, le_u16, le_u32, le_u64, le_u8},
-    opt, tag, take, tuple, Err as NomErr,
-};
-use serde::{Deserialize, Serialize};
-use serde_json;
-
-use tvm_sys::ffi::{DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt};
-
-use tvm_sys::{ffi::DLTensor, ArgValue, DataType, Device, DeviceType};
-
-use crate::{errors::*, Module, Storage, Tensor};
-
-// @see `kTVMNDArrayMagic` in `ndarray.h`
-const _NDARRAY_MAGIC: u64 = 0xDD5E_40F0_96B4_A13F;
-// @see `kTVMNDArrayListMagic` in `graph_executor.h`
-const _NDARRAY_LIST_MAGIC: u64 = 0xF7E5_8D4F_0504_9CB7;
-
-/// A TVM computation graph.
-///
-/// # Examples
-///
-/// ```no_run
-/// use tvm_graph_rt::Graph;
-/// use std::convert::TryFrom;
-/// let graph_json = std::fs::read_to_string("graph.json").unwrap();
-/// let graph = Graph::try_from(&graph_json).unwrap();
-/// ```
-#[derive(Serialize, Deserialize, Debug)]
-pub struct Graph {
-    pub nodes: Vec<Node>,
-    pub arg_nodes: Vec<usize>,
-    pub heads: Vec<Entry>,
-    pub node_row_ptr: Option<Vec<usize>>,
-    pub attrs: Option<HashMap<String, serde_json::Value>>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct Entry {
-    pub id: usize,
-    pub index: usize,
-    pub version: usize,
-}
-
-impl Graph {
-    fn entry_index(&self, entry: &Entry) -> Result<usize, GraphFormatError> {
-        self.node_row_ptr
-            .as_ref()
-            .map(|nrp| nrp[entry.id] + entry.index)
-            .ok_or_else(|| GraphFormatError::MissingField("node_row_ptr"))
-    }
-
-    /// Attempt to deserialize a JSON attribute to a type `T`.
-    fn get_attr<T: serde::de::DeserializeOwned>(&self, attr: &str) -> Result<T, GraphFormatError> {
-        Ok(serde_json::from_value::<T>(
-            self.attrs
-                .as_ref()
-                .ok_or(GraphFormatError::MissingField("attrs"))?
-                .get(attr)
-                .ok_or_else(|| {
-                    GraphFormatError::MissingAttr("graph".to_string(), attr.to_string())
-                })?
-                .to_owned(),
-        )
-        .map_err(|err| GraphFormatError::Parse(err.into()))?)
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct Node {
-    pub op: String,
-    pub name: String,
-    pub inputs: Vec<Entry>,
-    pub attrs: Option<HashMap<String, String>>,
-    pub control_deps: Option<Vec<Entry>>,
-}
-
-struct NodeAttrs {
-    func_name: String,
-    num_outputs: usize,
-    flatten_data: bool,
-}
-
-macro_rules! get_node_attr {
-    ($node:expr, $attrs:ident, $attr:literal) => {
-        $attrs
-            .get($attr)
-            .ok_or_else(|| GraphFormatError::MissingAttr($node.to_owned(), $attr.to_owned()))
-    };
-}
-
-impl Node {
-    fn parse_attrs(&self) -> Result<NodeAttrs, GraphFormatError> {
-        let attrs = self
-            .attrs
-            .as_ref()
-            .ok_or_else(|| GraphFormatError::MissingAttr(self.name.clone(), "attrs".to_owned()))?;
-
-        let func_name = get_node_attr!(self.name, attrs, "func_name")?.to_owned();
-
-        let num_outputs = get_node_attr!(self.name, attrs, "num_outputs")?
-            .parse::<usize>()
-            .map_err(|error| GraphFormatError::InvalidAttr("num_outputs".to_string(), error))?;
-
-        let flatten_data = get_node_attr!(self.name, attrs, "flatten_data")?
-            .parse::<u8>()
-            .map(|val| val == 1)
-            .map_err(|error| GraphFormatError::InvalidAttr("flatten_data".to_string(), error))?;
-
-        Ok(NodeAttrs {
-            func_name,
-            num_outputs,
-            flatten_data,
-        })
-    }
-}
-
-impl<'a> TryFrom<&'a String> for Graph {
-    type Error = GraphFormatError;
-    fn try_from(graph_json: &String) -> Result<Self, GraphFormatError> {
-        serde_json::from_str(graph_json).map_err(|error| GraphFormatError::Parse(error))
-    }
-}
-
-impl<'a> TryFrom<&'a str> for Graph {
-    type Error = GraphFormatError;
-    fn try_from(graph_json: &'a str) -> Result<Self, Self::Error> {
-        serde_json::from_str(graph_json).map_err(|error| GraphFormatError::Parse(error))
-    }
-}
-
-/// A executor for a TVM computation graph.
-///
-/// # Examples
-///
-/// ```no_compile
-/// use ndarray::Array;
-///
-/// let syslib = SystemLibModule::default(); // a provider of TVM functions
-///
-/// let mut params_bytes = Vec::new();
-/// fs::File::open("graph.params").unwrap().read_to_end(&mut params_bytes).unwrap();
-/// let params = tvm::runtime::load_param_dict(&params_bytes).unwrap();
-///
-/// let graph = Graph::try_from(&fs::read_to_string("graph.json").unwrap()).unwrap();
-///
-/// let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
-/// exec.load_params(params);
-///
-/// let x = Array::from_vec(vec![1f32, 2., 3., 4.]);
-/// exec.set_input("data", x.into());
-/// exec.run();
-/// let output = exec.get_output(0).unwrap();
-///
-/// println!("{:#?}", Array::try_from(output).unwrap());
-/// ```
-pub struct GraphExecutor<'m, 't> {
-    graph: Graph,
-    op_execs: Vec<Box<dyn Fn() + 'm>>,
-    tensors: Vec<Tensor<'t>>,
-}
-
-unsafe impl<'m, 't> Send for GraphExecutor<'m, 't> {}
-
-impl<'m, 't> GraphExecutor<'m, 't> {
-    pub fn new<M: 'm + Module>(graph: Graph, lib: &'m M) -> Result<Self, Box<dyn Error>> {
-        let tensors = Self::setup_storages(&graph)?;
-        Ok(GraphExecutor {
-            op_execs: Self::setup_op_execs(&graph, lib, &tensors)?,
-            tensors,
-            graph,
-        })
-    }
-
-    /// Runs the computation graph.
-    pub fn run(&mut self) {
-        self.op_execs.iter().for_each(|op_exec| {
-            op_exec();
-        });
-    }
-
-    /// Allocates `Storages` for each `storage_id` and returns `Tensor`s to hold each output.
-    fn setup_storages<'a>(graph: &'a Graph) -> Result<Vec<Tensor<'t>>, Box<dyn Error>> {
-        let storage_ids = graph.get_attr::<(String, Vec<usize>)>("storage_id")?.1;
-        let shapes = graph.get_attr::<(String, Vec<Vec<i64>>)>("shape")?.1;
-        let dtypes = graph
-            .get_attr::<(String, Vec<String>)>("dltype")?
-            .1
-            .iter()
-            .map(|dltype| {
-                if let Ok((_, dtype)) = tvm_str_to_type(dltype) {
-                    Ok(dtype)
-                } else {
-                    Err(GraphFormatError::InvalidDLType(dltype.to_string()))
-                }
-            })
-            .collect::<Result<Vec<DataType>, GraphFormatError>>()?;
-
-        let align = dtypes.iter().map(|dtype| dtype.bits() as usize).max();
-        let mut storage_num_bytes = vec![0usize; *storage_ids.iter().max().unwrap_or(&1) + 1];
-        for (i, &storage_id) in storage_ids.iter().enumerate() {
-            let dtype_size = (dtypes[i].bits() * dtypes[i].lanes()) >> 3;
-            let nbytes = dtype_size * shapes[i].iter().product::<i64>() as usize;
-            storage_num_bytes[storage_id] = cmp::max(nbytes, storage_num_bytes[storage_id]);
-        }
-
-        let mut storages: Vec<Storage> = storage_num_bytes
-            .into_iter()
-            .map(|nbytes| Storage::new(nbytes, align))
-            .collect::<Result<Vec<Storage>, std::alloc::LayoutError>>()?;
-
-        let tensors = izip!(storage_ids, shapes, dtypes)
-            .map(|(storage_id, shape, dtype)| {
-                let storage = storages[storage_id].view();
-                Tensor {
-                    data: mem::replace(&mut storages[storage_id], storage),
-                    device: Device::default(),
-                    dtype,
-                    size: shape.iter().product::<i64>() as usize,
-                    shape,
-                    strides: None,
-                    byte_offset: 0,
-                }
-            })
-            .collect();
-
-        Ok(tensors)
-    }
-
-    /// Creates closures which represent the computation performed by this graph.
-    fn setup_op_execs<M: 'm + Module>(
-        graph: &Graph,
-        lib: &'m M,
-        tensors: &[Tensor<'t>],
-    ) -> Result<Vec<Box<dyn Fn() + 'm>>, Box<dyn Error + 'static>> {
-        if !graph.node_row_ptr.is_some() {
-            return Err(GraphFormatError::MissingField("node_row_ptr").into());
-        }
-        let node_row_ptr = graph.node_row_ptr.as_ref().unwrap();
-
-        let mut op_execs = Vec::new();
-        for (i, node) in graph.nodes.iter().enumerate() {
-            if node.op == "null" {
-                continue;
-            }
-            if node.op != "tvm_op" {
-                return Err(GraphFormatError::UnsupportedOp(node.op.to_owned()).into());
-            }
-            if !node.attrs.is_some() {
-                return Err(GraphFormatError::MissingAttr(node.op.clone(), "".to_string()).into());
-            }
-
-            let attrs: NodeAttrs = node.parse_attrs()?.into();
-
-            if attrs.func_name == "__nop" {
-                continue;
-            }
-
-            let func = lib
-                .get_function(&attrs.func_name)
-                .ok_or_else(|| FunctionNotFound(attrs.func_name.clone()))?;
-            let arg_indices = node
-                .inputs
-                .iter()
-                .map(|entry| graph.entry_index(entry))
-                .chain((0..attrs.num_outputs).map(|oi| Ok(node_row_ptr[i] + oi)));
-
-            let dl_tensors: Vec<DLTensor> = arg_indices
-                .map(|idx| {
-                    let tensor = &tensors[idx?];
-                    Ok(if attrs.flatten_data {
-                        Tensor::as_dltensor(tensor, true /* flatten */)
-                    } else {
-                        DLTensor::from(tensor)
-                    })
-                })
-                .collect::<Result<Vec<DLTensor>, GraphFormatError>>()?
-                .into();
-            let op: Box<dyn Fn()> = Box::new(move || {
-                let args: Vec<ArgValue> = dl_tensors
-                    .iter()
-                    .map(|t| t.into())
-                    .collect::<Vec<ArgValue>>();
-                let err_str = format!("Function {} failed to execute", attrs.func_name);
-                func(&args).expect(&err_str);
-            });
-            op_execs.push(op);
-        }
-        Ok(op_execs)
-    }
-
-    pub fn load_params(&mut self, params: HashMap<String, Tensor>) {
-        params.into_iter().for_each(|(name, param)| {
-            self.set_input(name, param);
-        })
-    }
-
-    #[allow(clippy::if_same_then_else)]
-    pub fn set_input<S: AsRef<str>>(&mut self, name: S, value: Tensor) {
-        if let Some(idx) = self.get_input_index(name.as_ref()) {
-            // TODO: consider `new_with_params` to avoid ever allocating
-            let ptr = self.tensors[idx].data.as_ptr();
-            let mut to_replace = self.tensors.iter_mut().filter(|t| t.data.as_ptr() == ptr);
-            let owner = to_replace.nth(0).unwrap();
-            if value.data.is_owned() {
-                // FIXME: for no-copy, need setup_op_execs to not capture tensor ptr
-                // mem::replace(&mut (*owner), value);
-                // to_replace.for_each(|t| {
-                //   panic!("replacing");
-                //   t.data = owner.data.view();
-                // });
-                owner.copy(&value);
-            } else {
-                owner.copy(&value);
-            }
-        } else {
-            println!("Unexpected input `{}`", name.as_ref());
-        }
-    }
-
-    /// Returns the graph input with name `name`, if it exists.
-    pub fn get_input<S: AsRef<str>>(&mut self, name: S) -> Option<&Tensor> {
-        self.get_input_index(name.as_ref())
-            .map(move |idx| &self.tensors[idx])
-    }
-
-    /// Returns the graph output with index `index`, if it exists.
-    pub fn get_output(&self, idx: usize) -> Option<&Tensor> {
-        let graph = &self.graph;
-        graph.heads.get(idx).and_then(|entry| {
-            graph
-                .entry_index(entry)
-                .map(|idx| self.tensors.get(idx))
-                .unwrap_or(None)
-        })
-    }
-
-    /// Returns the index for graph input with name `name`, if it exists.
-    pub fn get_input_index<S: AsRef<str>>(&self, name: S) -> Option<usize> {
-        let graph = &self.graph;
-        (0..graph.nodes.len())
-            .skip_while(|&i| graph.nodes[i].name != name.as_ref())
-            .nth(0)
-            .and_then(|i| {
-                if graph.arg_nodes.iter().any(|&id| id == i) {
-                    graph.node_row_ptr.as_ref().map(|nrp| nrp[i])
-                } else {
-                    None
-                }
-            })
-    }
-}
-
-// Converts a string to TVM DLDataTypeCode. @see `String2DLDataType` in packed_func.h
-named! {
-  tvm_str_to_type<&str, DataType>,
-  do_parse!(
-    type_name: alpha1 >>
-    bits:      digit1 >>
-    lanes:     opt!(complete!(tuple!(tag!("x"), digit1))) >>
-    (
-        DataType::new(
-            match type_name {
-                "int" => DLDataTypeCode_kDLInt,
-                "uint" => DLDataTypeCode_kDLUInt,
-                "float" => DLDataTypeCode_kDLFloat,
-                _ => DLDataTypeCode_kDLFloat,
-            } as u8,
-            bits.parse::<u8>().unwrap() as u8,
-            lanes
-                .map(|(_, lanes)| lanes.parse::<u16>().unwrap() as u16)
-                .unwrap_or(1),
-        )
-    )
-  )
-}
-
-// Converts a bytes to String.
-named! {
-    name<String>,
-    do_parse!(
-        len_l: le_u32 >>
-        len_h: le_u32 >>
-        data: take!(len_l) >>
-        (
-            if len_h == 0 {
-                String::from_utf8(data.to_vec()).unwrap()
-            } else {
-                panic!("Too long string")
-            }
-        )
-    )
-}
-
-// Parses a Device
-named! {
-  tvm_device<&[u8], Device>,
-  do_parse!(
-    device_type: le_u32 >>
-    device_id:   le_i32 >>
-    (
-        Device {
-            device_type: DeviceType::from(device_type),
-            device_id: device_id as usize,
-        }
-    )
-  )
-}
-
-// Parses a DataType
-named! {
-  data_type<&[u8], DataType>,
-  do_parse!(
-    code:  le_u8  >>
-    bits:  le_u8  >>
-    lanes: le_u16 >>
-    (DataType::new(code, bits, lanes)))
-}
-
-// Parses a Tensor from a TVM array file.
-named! {
-    tensor<Tensor>,
-    do_parse!(
-                take!(8)      >>
-                le_u64        >>
-        device: tvm_device    >>
-        ndim:   le_u32        >>
-        dtype:  data_type     >>
-        shape:  count!(map!(le_i64, |sz| sz as i64), ndim as usize) >>
-        length: le_i64        >>
-        data:   take!(length) >>
-        (
-            Tensor {
-                data: Storage::from(data),
-                device: device,
-                dtype: dtype,
-                size: shape.iter().product::<i64>() as usize,
-                shape: shape,
-                strides: None,
-                byte_offset: 0,
-            }
-        )
-    )
-}
-
-// Parses a graph params dict from a params binary file.
-named! {
-    parse_param_dict<HashMap<String, Tensor>>,
-    do_parse!(
-                 take!(8)                      >>
-                 le_u64                        >>
-        names:   length_count!(le_u64, name)   >>
-        tensors: length_count!(le_u64, tensor) >>
-        (
-            HashMap::from_iter(names.into_iter().zip(tensors.into_iter()))
-        )
-    )
-}
-
-/// Loads a param dict saved using `runtime.save_param_dict`.
-pub fn load_param_dict(bytes: &[u8]) -> Result<HashMap<String, Tensor>, GraphFormatError> {
-    match parse_param_dict(bytes) {
-        Ok((remaining_bytes, param_dict)) => {
-            if remaining_bytes.is_empty() {
-                Ok(param_dict)
-            } else {
-                Err(GraphFormatError::Params(None))
-            }
-        }
-        Err(error) => Err(match error {
-            NomErr::Incomplete(error) => GraphFormatError::Params(Some(NomErr::Incomplete(error))),
-            NomErr::Error((remainder, error_kind)) => {
-                GraphFormatError::Params(Some(NomErr::Error((remainder.into(), error_kind))))
-            }
-            NomErr::Failure((remainder, error_kind)) => {
-                GraphFormatError::Params(Some(NomErr::Failure((remainder.into(), error_kind))))
-            }
-        }),
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_str_to_type() {
-        assert_eq!(
-            tvm_str_to_type("float24").unwrap().1,
-            DataType::float(24, 1)
-        );
-        assert_eq!(
-            tvm_str_to_type("uint111x44").unwrap().1,
-            DataType::uint(111, 44)
-        );
-    }
-}
diff --git a/rust/tvm-graph-rt/src/lib.rs b/rust/tvm-graph-rt/src/lib.rs
deleted file mode 100644
index a37c712acc54..000000000000
--- a/rust/tvm-graph-rt/src/lib.rs
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-//! This crate is an implementation of the TVM runtime for modules compiled with `--system-lib`.
-//! It's mainly useful for compiling to WebAssembly and SGX,
-//! but also native if you prefer Rust to C++.
-//!
-//! For TVM graphs, the entrypoint to this crate is `runtime::GraphExecutor`.
-//! Single-function modules are used via the `packed_func!` macro after obtaining
-//! the function from `runtime::SystemLibModule`
-//!
-//! The main entrypoints to this crate are `GraphExecutor`
-//! For examples of use, please refer to the multi-file tests in the `tests` directory.
-
-extern crate tvm_macros;
-extern crate tvm_sys;
-
-// Re-export the import_module macro.
-pub use tvm_macros::import_module;
-
-// Re-export the called pack macro, eventually remove as its not a very good
-// abstraction.
-pub use tvm_sys::call_packed;
-
-use lazy_static::lazy_static;
-
-mod allocator;
-mod array;
-pub mod errors;
-mod graph;
-mod module;
-mod threading;
-mod workspace;
-
-pub use tvm_sys::{
-    errors::*,
-    ffi::{self, DLTensor},
-    packed_func::{self, *},
-    ArgValue, RetValue,
-};
-
-pub use self::{array::*, errors::*, graph::*, module::*, threading::*, workspace::*};
-
-lazy_static! {
-    static ref LAST_ERROR: std::sync::RwLock<Option<&'static std::ffi::CStr>> =
-        std::sync::RwLock::new(None);
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn TVMAPISetLastError(cmsg: *const i8) {
-    *LAST_ERROR.write().unwrap() = Some(std::ffi::CStr::from_ptr(cmsg));
-}
-
-#[no_mangle]
-pub extern "C" fn TVMGetLastError() -> *const std::os::raw::c_char {
-    match *LAST_ERROR.read().unwrap() {
-        Some(err) => err.as_ptr(),
-        None => std::ptr::null(),
-    }
-}
diff --git a/rust/tvm-graph-rt/src/module/dso.rs b/rust/tvm-graph-rt/src/module/dso.rs
deleted file mode 100644
index f1145da4b4de..000000000000
--- a/rust/tvm-graph-rt/src/module/dso.rs
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    cell::RefCell,
-    collections::HashMap,
-    ffi::CStr,
-    os::raw::{c_char, c_int, c_void},
-    pin::Pin,
-};
-
-use tvm_sys::{ffi::BackendPackedCFunc, packed_func::PackedFunc};
-
-use crate::{
-    threading::{TVMBackendParallelBarrier, TVMBackendParallelLaunch},
-    workspace::{TVMBackendAllocWorkspace, TVMBackendFreeWorkspace},
-    TVMAPISetLastError,
-};
-
-use super::Module;
-
-const TVM_MAIN: &[u8] = b"__tvm_main__";
-const TVM_MODULE_CTX: &[u8] = b"__tvm_module_ctx";
-
-/// A module backed by a Dynamic Shared Object (dylib).
-pub struct DsoModule<'a> {
-    lib: libloading::Library,
-    packed_funcs: RefCell<HashMap<String, &'a (dyn PackedFunc)>>,
-    _pin: std::marker::PhantomPinned,
-}
-
-macro_rules! init_context_func {
-    ($lib:ident, $( ($fn:ident, $sig:ty) ),+ $(,)?) => {
-        unsafe {
-            $(
-                let fn_ptr = $lib.get::<*mut $sig>(concat!("__", stringify!($fn)).as_bytes());
-                if let Ok(fn_ptr) = fn_ptr {
-                    **fn_ptr = $fn;
-                }
-            )+
-        }
-    };
-}
-
-impl<'a> DsoModule<'a> {
-    pub fn new<P: AsRef<std::ffi::OsStr>>(filename: P) -> Result<Pin<Box<Self>>, std::io::Error> {
-        let lib = libloading::Library::new(filename)?;
-
-        init_context_func!(
-            lib,
-            (TVMAPISetLastError, unsafe extern "C" fn(*const i8)),
-            (
-                TVMBackendAllocWorkspace,
-                unsafe extern "C" fn(c_int, c_int, u64, c_int, c_int) -> *mut c_void
-            ),
-            (
-                TVMBackendFreeWorkspace,
-                unsafe extern "C" fn(c_int, c_int, *mut c_void) -> c_int
-            ),
-            (
-                TVMBackendParallelLaunch,
-                unsafe extern "C" fn(
-                    crate::threading::FTVMParallelLambda,
-                    *const c_void,
-                    usize,
-                ) -> c_int
-            ),
-            (
-                TVMBackendParallelBarrier,
-                unsafe extern "C" fn(usize, *const tvm_sys::ffi::TVMParallelGroupEnv)
-            ),
-        );
-
-        // Pin the module in memory so that `ctx` pointer (below) is stable.
-        let dso_mod = Box::pin(Self {
-            lib,
-            packed_funcs: RefCell::new(HashMap::new()),
-            _pin: std::marker::PhantomPinned,
-        });
-
-        unsafe {
-            if let Ok(ctx) = dso_mod.lib.get::<*mut *const c_void>(TVM_MODULE_CTX) {
-                **ctx = &dso_mod as *const _ as *const c_void;
-            }
-        }
-
-        Ok(dso_mod)
-    }
-}
-
-impl<'a> Module for DsoModule<'a> {
-    fn get_function<S: AsRef<str>>(&self, name: S) -> Option<&(dyn PackedFunc)> {
-        let name = name.as_ref();
-        let func = match unsafe {
-            self.lib
-                .get::<BackendPackedCFunc>(if name.as_bytes() == TVM_MAIN {
-                    // If __tvm_main__ is present, it contains the name of the
-                    // actual main function.
-                    match self
-                        .lib
-                        .get::<*const c_char>(TVM_MAIN)
-                        .map(|p| CStr::from_ptr(*p))
-                    {
-                        Ok(m) => m.to_bytes(),
-                        _ => return None,
-                    }
-                } else {
-                    name.as_bytes()
-                })
-        } {
-            Ok(func) => unsafe { func.into_raw() },
-            Err(_) => return None,
-        };
-
-        self.packed_funcs.borrow_mut().insert(
-            name.to_string(),
-            &*Box::leak(super::wrap_backend_packed_func(name.to_string(), *func)),
-        );
-
-        self.packed_funcs.borrow().get(name).copied()
-    }
-}
-
-impl<'a> Drop for DsoModule<'a> {
-    fn drop(&mut self) {
-        self.packed_funcs
-            .replace(HashMap::new())
-            .into_iter()
-            .map(|(_name, f)| unsafe { Box::from_raw(f as *const _ as *mut (dyn PackedFunc)) })
-            .for_each(std::mem::drop);
-    }
-}
diff --git a/rust/tvm-graph-rt/src/module/mod.rs b/rust/tvm-graph-rt/src/module/mod.rs
deleted file mode 100644
index a345758deca1..000000000000
--- a/rust/tvm-graph-rt/src/module/mod.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#[cfg(not(any(target_arch = "wasm32", target_env = "sgx")))]
-mod dso;
-mod syslib;
-
-use tvm_sys::{
-    ffi::BackendPackedCFunc,
-    packed_func::{ArgValue, PackedFunc, RetValue, TVMValue},
-};
-
-#[cfg(not(any(target_arch = "wasm32", target_env = "sgx")))]
-pub use dso::DsoModule;
-pub use syslib::SystemLibModule;
-
-pub trait Module {
-    fn get_function<S: AsRef<str>>(&self, name: S) -> Option<&(dyn PackedFunc)>;
-}
-
-// @see `WrapPackedFunc` in `llvm_module.cc`.
-fn wrap_backend_packed_func(func_name: String, func: BackendPackedCFunc) -> Box<dyn PackedFunc> {
-    Box::new(move |args: &[ArgValue]| {
-        let (values, type_codes): (Vec<TVMValue>, Vec<i32>) = args
-            .iter()
-            .map(|arg| {
-                let (val, code) = arg.to_tvm_value();
-                (val, code as i32)
-            })
-            .unzip();
-        let ret: RetValue = RetValue::default();
-        let (mut ret_val, mut ret_type_code) = ret.to_tvm_value();
-        let exit_code = func(
-            values.as_ptr(),
-            type_codes.as_ptr(),
-            values.len() as i32,
-            &mut ret_val,
-            &mut ret_type_code,
-            std::ptr::null_mut(),
-        );
-        if exit_code == 0 {
-            Ok(RetValue::from_tvm_value(ret_val, ret_type_code))
-        } else {
-            Err(tvm_sys::errors::FuncCallError::get_with_context(
-                func_name.clone(),
-            ))
-        }
-    })
-}
diff --git a/rust/tvm-graph-rt/src/module/syslib.rs b/rust/tvm-graph-rt/src/module/syslib.rs
deleted file mode 100644
index efc29a336620..000000000000
--- a/rust/tvm-graph-rt/src/module/syslib.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::RwLock,
-};
-
-use lazy_static::lazy_static;
-
-use tvm_sys::{ffi::BackendPackedCFunc, packed_func::PackedFunc};
-
-use super::Module;
-
-pub struct SystemLibModule;
-
-#[cfg(target_env = "sgx")]
-extern "C" {
-    fn __tvm_module_startup();
-}
-
-lazy_static! {
-    static ref SYSTEM_LIB_FUNCTIONS: RwLock<HashMap<String, &'static (dyn PackedFunc)>> =
-        RwLock::new(HashMap::new());
-}
-
-impl Module for SystemLibModule {
-    fn get_function<S: AsRef<str>>(&self, name: S) -> Option<&(dyn PackedFunc)> {
-        SYSTEM_LIB_FUNCTIONS
-            .read()
-            .unwrap()
-            .get(name.as_ref())
-            .copied()
-    }
-}
-
-impl Default for SystemLibModule {
-    fn default() -> Self {
-        #[cfg(target_env = "sgx")]
-        unsafe {
-            __tvm_module_startup();
-        }
-        SystemLibModule {}
-    }
-}
-
-#[no_mangle]
-pub extern "C" fn TVMBackendRegisterSystemLibSymbol(
-    cname: *const c_char,
-    func: BackendPackedCFunc,
-) -> i32 {
-    let name = unsafe { CStr::from_ptr(cname).to_str().unwrap() };
-    SYSTEM_LIB_FUNCTIONS.write().unwrap().insert(
-        name.to_string(),
-        &*Box::leak(super::wrap_backend_packed_func(name.to_string(), func)),
-    );
-    0
-}
diff --git a/rust/tvm-graph-rt/src/threading.rs b/rust/tvm-graph-rt/src/threading.rs
deleted file mode 100644
index 03765e0a049b..000000000000
--- a/rust/tvm-graph-rt/src/threading.rs
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    os::raw::{c_int, c_void},
-    sync::{
-        atomic::{AtomicUsize, Ordering},
-        Arc, Barrier,
-    },
-    thread::{self, JoinHandle},
-};
-
-#[cfg(not(target_arch = "wasm32"))]
-use std::env;
-
-use crossbeam_channel::{bounded, Receiver, Sender};
-use tvm_sys::ffi::TVMParallelGroupEnv;
-
-pub(crate) type FTVMParallelLambda =
-    extern "C" fn(task_id: usize, penv: *const TVMParallelGroupEnv, cdata: *const c_void) -> i32;
-
-/// Holds a parallel job request made by a TVM library function.
-struct Job {
-    cb: FTVMParallelLambda,
-    cdata: *const c_void,
-    req_num_tasks: usize,
-    pending: Arc<AtomicUsize>,
-}
-
-impl Job {
-    /// Splits this job into a number of `Task`s which can be scheduled.
-    fn tasks(&self, num_workers: usize) -> Vec<Task> {
-        let num_tasks = if self.req_num_tasks == 0 {
-            num_workers
-        } else {
-            self.req_num_tasks.min(num_workers)
-        };
-        self.pending.store(num_tasks, Ordering::SeqCst);
-
-        let barrier = Arc::new(Barrier::new(num_tasks));
-
-        (0..num_tasks)
-            .map(move |i| Task {
-                id: i,
-                flambda: self.cb,
-                penv: TVMParallelGroupEnv {
-                    sync_handle: &Arc::clone(&barrier) as *const _ as *mut c_void,
-                    num_task: num_tasks as i32,
-                },
-                cdata: self.cdata,
-                pending: Arc::clone(&self.pending),
-            })
-            .collect()
-    }
-
-    /// Waits for all tasks in this `Job` to be completed.
-    fn wait(&self) {
-        while self.pending.load(Ordering::Acquire) > 0 {
-            thread::yield_now();
-        }
-    }
-}
-
-/// A chunk of work requested by a TVM function.
-struct Task {
-    id: usize,
-    flambda: FTVMParallelLambda,
-    penv: TVMParallelGroupEnv,
-    cdata: *const c_void,
-    pending: Arc<AtomicUsize>,
-}
-unsafe impl Send for Task {}
-unsafe impl Sync for Task {}
-
-impl Task {
-    fn run(self) -> i32 {
-        let status = (self.flambda)(self.id, &self.penv as *const _, self.cdata);
-        self.pending.fetch_sub(1, Ordering::AcqRel);
-        status
-    }
-}
-
-#[derive(Default)]
-struct Threads {
-    #[allow(unused)]
-    handles: Vec<JoinHandle<()>>,
-    queues: Vec<Sender<Task>>,
-}
-
-impl<'a> Threads {
-    fn launch<F: Sync + Send + FnOnce(Receiver<Task>) + 'static + Copy>(
-        num_threads: usize,
-        cb: F,
-    ) -> Self {
-        let (handles, queues) = (0..num_threads)
-            .map(|_| {
-                let (p, c) = bounded(2);
-                let handle = thread::spawn(move || cb(c.into()));
-                (handle, p)
-            })
-            .unzip();
-        Threads { handles, queues }
-    }
-}
-
-struct ThreadPool {
-    num_workers: usize,
-    #[allow(unused)]
-    threads: Threads,
-}
-
-thread_local!(static THREAD_POOL: ThreadPool = ThreadPool::new());
-
-impl ThreadPool {
-    fn new() -> Self {
-        let num_workers = max_concurrency();
-        ThreadPool {
-            num_workers,
-            threads: Threads::launch(num_workers, ThreadPool::run_worker),
-        }
-    }
-
-    fn launch(&self, job: Job) {
-        let mut tasks = job.tasks(self.num_workers + 1);
-
-        for (i, task) in tasks.split_off(1).into_iter().enumerate() {
-            self.threads.queues[i].send(task).expect("should send");
-        }
-
-        tasks.pop().unwrap().run();
-        job.wait();
-    }
-
-    fn run_worker(queue: Receiver<Task>) {
-        loop {
-            let task = match queue.recv() {
-                Ok(v) => v,
-                Err(_) => break,
-            };
-            let result = task.run();
-            if result == <i32>::min_value() {
-                break;
-            } else if result != 0 {
-                panic!("Error running task.");
-            }
-        }
-    }
-}
-
-#[cfg(not(target_arch = "wasm32"))]
-fn max_concurrency() -> usize {
-    if let Ok(threads_str) = env::var("TVM_NUM_THREADS").or_else(|_| env::var("OMP_NUM_THREADS")) {
-        if let Ok(threads) = usize::from_str_radix(&threads_str, 10) {
-            return threads;
-        }
-    }
-    num_cpus::get()
-}
-
-#[cfg(target_arch = "wasm32")]
-fn max_concurrency() -> usize {
-    0 // wasm doesn't support threads yet
-}
-
-#[no_mangle]
-pub extern "C" fn TVMBackendParallelLaunch(
-    cb: FTVMParallelLambda,
-    cdata: *const c_void,
-    num_task: usize,
-) -> c_int {
-    if max_concurrency() < 2 {
-        let penv = TVMParallelGroupEnv {
-            sync_handle: std::ptr::null_mut(),
-            num_task: 1,
-        };
-        cb(0, &penv as *const _, cdata);
-    } else {
-        THREAD_POOL.with(|pool| {
-            pool.launch(Job {
-                cb,
-                cdata,
-                req_num_tasks: num_task,
-                pending: Arc::new(AtomicUsize::new(0)),
-            });
-        });
-    }
-    0
-}
-
-// @see issue 988 for information on why this function is used.
-#[no_mangle]
-pub unsafe extern "C" fn TVMBackendParallelBarrier(
-    _task_id: usize,
-    penv: *const TVMParallelGroupEnv,
-) {
-    let barrier: &Arc<Barrier> = &*((*penv).sync_handle as *const Arc<Barrier>);
-    barrier.wait();
-}
-
-#[cfg(test)]
-mod tests {
-    use std::{thread, time::Duration};
-
-    use super::*;
-
-    #[test]
-    fn test_max_concurrency() {
-        env::set_var("TVM_NUM_THREADS", "42");
-        env::set_var("OMP_NUM_THREADS", "24");
-        assert_eq!(max_concurrency(), 42);
-        env::remove_var("TVM_NUM_THREADS");
-        assert_eq!(max_concurrency(), 24);
-    }
-
-    extern "C" fn _flambda(
-        task_id: usize,
-        penv: *const TVMParallelGroupEnv,
-        cdata: *const c_void,
-    ) -> i32 {
-        if cdata.is_null() {
-            return 0;
-        }
-        unsafe {
-            let &(ref counter, ref task_ids_sum) = &*(cdata as *const (AtomicUsize, AtomicUsize));
-            thread::sleep(Duration::from_millis(50 * task_id as u64));
-            counter.fetch_add(1, Ordering::SeqCst);
-            task_ids_sum.fetch_add(task_id, Ordering::SeqCst);
-            assert_eq!((*penv).num_task, 3);
-        }
-        0
-    }
-
-    // #[test]
-    // fn test_parallel_launch() {
-    //     TVMBackendParallelLaunch(flambda, ptr::null(), 6);
-    //     let counter = AtomicUsize::new(0);
-    //     let task_ids_sum = AtomicUsize::new(0);
-    //     let cdata = (counter, task_ids_sum);
-    //     let num_tasks = 3;
-    //     TVMBackendParallelLaunch(flambda, &cdata as *const _ as *const c_void, num_tasks);
-    //     assert_eq!(cdata.0.load(Ordering::SeqCst), num_tasks);
-    //     assert_eq!(
-    //         cdata.1.load(Ordering::SeqCst),
-    //         (0..num_tasks).sum::<usize>()
-    //     );
-    // }
-}
diff --git a/rust/tvm-graph-rt/src/workspace.rs b/rust/tvm-graph-rt/src/workspace.rs
deleted file mode 100644
index 82bbfddcf261..000000000000
--- a/rust/tvm-graph-rt/src/workspace.rs
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    cell::RefCell,
-    error::Error,
-    os::raw::{c_int, c_void},
-    ptr,
-};
-
-use crate::allocator::Allocation;
-use crate::errors::InvalidPointer;
-use std::alloc::LayoutError;
-
-const WS_ALIGN: usize = 64; // taken from `kTempAllocaAlignment` in `device_api.h`
-
-pub fn remove_item<T: PartialEq>(vec: &mut Vec<T>, item: &T) -> Option<T> {
-    let pos = vec.iter().position(|x| *x == *item)?;
-    Some(vec.remove(pos))
-}
-
-struct WorkspacePool {
-    workspaces: Vec<Allocation>,
-    free: Vec<usize>,
-    in_use: Vec<usize>,
-}
-
-impl WorkspacePool {
-    fn new() -> Self {
-        WorkspacePool {
-            workspaces: Vec::new(),
-            free: Vec::new(),
-            in_use: Vec::new(),
-        }
-    }
-
-    fn alloc_new(&mut self, size: usize) -> Result<*mut u8, LayoutError> {
-        self.workspaces.push(Allocation::new(size, Some(WS_ALIGN))?);
-        self.in_use.push(self.workspaces.len() - 1);
-        Ok(self.workspaces[self.workspaces.len() - 1].as_mut_ptr())
-    }
-
-    fn alloc(&mut self, size: usize) -> Result<*mut u8, LayoutError> {
-        if self.free.is_empty() {
-            return self.alloc_new(size);
-        }
-        let idx = self
-            .free
-            .iter()
-            .fold(None, |cur_ws_idx: Option<usize>, &idx| {
-                let ws_size = self.workspaces[idx].size();
-                if ws_size < size {
-                    return cur_ws_idx;
-                }
-                cur_ws_idx.or(Some(idx)).and_then(|cur_idx| {
-                    let cur_size = self.workspaces[cur_idx].size();
-                    Some(if ws_size <= cur_size { idx } else { cur_idx })
-                })
-            });
-        match idx {
-            Some(idx) => {
-                remove_item(&mut self.free, &idx).unwrap();
-                self.in_use.push(idx);
-                Ok(self.workspaces[idx].as_mut_ptr())
-            }
-            None => self.alloc_new(size),
-        }
-    }
-
-    fn free(&mut self, ptr: *mut u8) -> Result<(), Box<dyn Error>> {
-        let mut ws_idx = None;
-        for i in 0..self.in_use.len() {
-            let idx = self.in_use[i];
-            if self.workspaces[idx].as_mut_ptr() == ptr {
-                self.in_use.remove(i);
-                ws_idx = Some(idx);
-                break;
-            }
-        }
-        let ws_idx = ws_idx.ok_or_else(|| InvalidPointer(ptr))?;
-        self.free.push(ws_idx);
-        Ok(())
-    }
-}
-
-thread_local!(static WORKSPACE_POOL: RefCell<WorkspacePool> = RefCell::new(WorkspacePool::new()));
-
-const WORKSPACE_PAGE_SIZE: usize = 4 << 10;
-
-#[no_mangle]
-pub extern "C" fn TVMBackendAllocWorkspace(
-    _device_type: c_int,
-    _device_id: c_int,
-    size: u64,
-    _dtype_code_hint: c_int,
-    _dtype_bits_hint: c_int,
-) -> *mut c_void {
-    let nbytes = if size == 0 {
-        WORKSPACE_PAGE_SIZE
-    } else {
-        size as usize
-    };
-    WORKSPACE_POOL.with(|pool_cell| {
-        pool_cell
-            .borrow_mut()
-            .alloc(nbytes as usize)
-            .unwrap_or(ptr::null_mut()) as *mut c_void
-    })
-}
-
-#[no_mangle]
-pub extern "C" fn TVMBackendFreeWorkspace(
-    _device_type: c_int,
-    _device_id: c_int,
-    ptr: *mut c_void,
-) -> c_int {
-    WORKSPACE_POOL.with(|pool_cell| {
-        (match pool_cell.borrow_mut().free(ptr as *mut u8) {
-            Ok(()) => 0,
-            Err(_) => -1,
-        }) as c_int
-    })
-}
diff --git a/rust/tvm-graph-rt/tests/.gitignore b/rust/tvm-graph-rt/tests/.gitignore
deleted file mode 100644
index 811076739bfa..000000000000
--- a/rust/tvm-graph-rt/tests/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*.json
-*.params
-*.o
diff --git a/rust/tvm-graph-rt/tests/build_model.py b/rust/tvm-graph-rt/tests/build_model.py
deleted file mode 100755
index 969075929a42..000000000000
--- a/rust/tvm-graph-rt/tests/build_model.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Builds a simple graph for testing."""
-
-from os import path as osp
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import relay, runtime
-from tvm.relay import testing
-
-CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-
-
-def _get_model(dshape):
-    data = relay.var("data", shape=dshape)
-    fc = relay.nn.dense(data, relay.var("dense_weight"), units=dshape[-1] * 2)
-    fc = relay.nn.bias_add(fc, relay.var("dense_bias"))
-    left, right = relay.split(fc, indices_or_sections=2, axis=1)
-    one = relay.const(1, dtype="float32")
-    return relay.Tuple([(left + one), (right - one), fc])
-
-
-def main():
-    dshape = (32, 16)
-    net = _get_model(dshape)
-    mod, params = testing.create_workload(net)
-    graph, lib, params = relay.build(mod, "llvm", params=params)
-
-    with open(osp.join(CWD, "graph.json"), "w") as f_resnet:
-        f_resnet.write(graph)
-    with open(osp.join(CWD, "graph.params"), "wb") as f_params:
-        f_params.write(runtime.save_param_dict(params))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/rust/tvm-graph-rt/tests/test_graph_serde.rs b/rust/tvm-graph-rt/tests/test_graph_serde.rs
deleted file mode 100644
index aaa33ef6dd4f..000000000000
--- a/rust/tvm-graph-rt/tests/test_graph_serde.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{convert::TryFrom, fs, io::Read};
-
-use tvm_graph_rt::Graph;
-
-macro_rules! mf_dir {
-    ($p:literal) => {
-        concat!(env!("CARGO_MANIFEST_DIR"), $p)
-    };
-}
-
-static PARAMS_FIXTURE_PATH: &str = mf_dir!("/tests/graph.params");
-
-#[test]
-fn test_load_graph() {
-    let output = std::process::Command::new(mf_dir!("/tests/build_model.py"))
-        .env(
-            "PYTHONPATH",
-            concat!(mf_dir!("/../../python"), ":", mf_dir!("/../../nnvm/python")),
-        )
-        .output()
-        .expect("Failed to build test model");
-    assert!(
-        std::path::Path::new(PARAMS_FIXTURE_PATH).exists(),
-        "Could not build test graph fixture: STDOUT:\n\n{}\nSTDERR: {}\n\n",
-        String::from_utf8(output.stdout).unwrap(),
-        String::from_utf8(output.stderr).unwrap()
-    );
-    let mut params_bytes = Vec::new();
-    fs::File::open(PARAMS_FIXTURE_PATH)
-        .unwrap()
-        .read_to_end(&mut params_bytes)
-        .unwrap();
-    let _params = tvm_graph_rt::load_param_dict(&params_bytes);
-
-    let graph = Graph::try_from(
-        &fs::read_to_string(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.json")).unwrap(),
-    )
-    .unwrap();
-
-    assert_eq!(graph.nodes[3].op, "tvm_op");
-    assert_eq!(
-        graph.nodes[3]
-            .attrs
-            .as_ref()
-            .unwrap()
-            .get("func_name")
-            .unwrap(),
-        "tvmgen_default_fused_nn_dense_nn_bias_add"
-    );
-    assert_eq!(graph.nodes[3].inputs[0].index, 0);
-    assert_eq!(graph.nodes[4].inputs[0].index, 0);
-    assert_eq!(graph.heads.len(), 3);
-}
diff --git a/rust/tvm-graph-rt/tests/test_nn/Cargo.toml b/rust/tvm-graph-rt/tests/test_nn/Cargo.toml
deleted file mode 100644
index a3ed3624a3b9..000000000000
--- a/rust/tvm-graph-rt/tests/test_nn/Cargo.toml
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "test-rt-nn"
-version = "0.0.0"
-license = "Apache-2.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-ndarray="0.12"
-serde = "1.0"
-serde_json = "1.0"
-tvm-graph-rt = { path = "../../" }
-
-[build-dependencies]
-ar = "0.6"
-anyhow = "^1.0"
diff --git a/rust/tvm-graph-rt/tests/test_nn/build.rs b/rust/tvm-graph-rt/tests/test_nn/build.rs
deleted file mode 100644
index 5cf4cc848afe..000000000000
--- a/rust/tvm-graph-rt/tests/test_nn/build.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-extern crate ar;
-
-use std::{env, fs::File, path::Path, process::Command};
-
-use anyhow::{Context, Result};
-use ar::Builder;
-
-fn main() -> Result<()> {
-    let out_dir = env::var("OUT_DIR")?;
-    let out_dir = Path::new(&out_dir).join("test_nn");
-
-    std::fs::create_dir_all(&out_dir)?;
-
-    let manifest_dir = env::var("CARGO_MANIFEST_DIR")?;
-    let manifest_dir = Path::new(&manifest_dir);
-
-    let generator = manifest_dir.join("src").join("build_test_graph.py");
-
-    let graph_path = out_dir.join("graph.o");
-
-    let output = Command::new(&generator)
-        .arg(&out_dir)
-        .output()
-        .with_context(|| format!("Failed to execute: {:?}", generator))?;
-
-    assert!(
-        graph_path.exists(),
-        "Could not build graph lib: {}",
-        String::from_utf8(output.stderr)
-            .unwrap()
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-
-    let lib_file = out_dir.join("libtestnn.a");
-    let file = File::create(&lib_file).context("failed to create library file")?;
-    let mut builder = Builder::new(file);
-    builder.append_path(graph_path)?;
-
-    let status = Command::new("ranlib").arg(&lib_file).status()?;
-
-    assert!(status.success());
-
-    println!("cargo:rustc-link-lib=static=testnn");
-    println!("cargo:rustc-link-search=native={}", out_dir.display());
-    println!("cargo:rerun-if-changed={}", generator.display());
-
-    Ok(())
-}
diff --git a/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py b/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py
deleted file mode 100755
index a6e6958e4637..000000000000
--- a/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Builds a simple graph for testing."""
-
-from os import path as osp
-import sys
-
-from tvm import runtime as tvm_runtime
-from tvm import relay
-from tvm.relay import testing
-
-
-def _get_model(dshape):
-    data = relay.var("data", shape=dshape)
-    fc = relay.nn.dense(data, relay.var("dense_weight"), units=dshape[-1] * 2)
-    fc = relay.nn.bias_add(fc, relay.var("dense_bias"))
-    left, right = relay.split(fc, indices_or_sections=2, axis=1)
-    one = relay.const(1, dtype="float32")
-    return relay.Tuple([(left + one), (right - one), fc])
-
-
-def main():
-    dshape = (4, 8)
-    net = _get_model(dshape)
-    mod, params = testing.create_workload(net)
-    runtime = relay.backend.Runtime("cpp", {"system-lib": True})
-    graph, lib, params = relay.build(mod, "llvm", runtime=runtime, params=params)
-
-    out_dir = sys.argv[1]
-    lib.save(osp.join(sys.argv[1], "graph.o"))
-    with open(osp.join(out_dir, "graph.json"), "w") as f_resnet:
-        f_resnet.write(graph)
-
-    with open(osp.join(out_dir, "graph.params"), "wb") as f_params:
-        f_params.write(tvm_runtime.save_param_dict(params))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/rust/tvm-graph-rt/tests/test_nn/src/main.rs b/rust/tvm-graph-rt/tests/test_nn/src/main.rs
deleted file mode 100644
index 88cc68b946c9..000000000000
--- a/rust/tvm-graph-rt/tests/test_nn/src/main.rs
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{collections::HashMap, convert::TryFrom, fs, io::Read};
-
-use ndarray::{s, Array};
-use tvm_graph_rt::{Graph, GraphExecutor, SystemLibModule, Tensor};
-
-const BATCH_SIZE: usize = 4;
-const IN_DIM: usize = 8;
-
-macro_rules! check_sum {
-    ($e:expr, $a:ident, $b:ident) => {
-        let a = Array::try_from($e.get_input(stringify!($a)).unwrap().to_owned()).unwrap();
-        check_sum!(a, $b);
-    };
-    ($e:expr, $a:expr, $b:ident) => {
-        let a = Array::try_from($e.get_output($a).unwrap().to_owned()).unwrap();
-        check_sum!(a, $b);
-    };
-    ($a:ident, $b:ident) => {
-        let a_sum: f32 = $a.scalar_sum();
-        let b_sum: f32 = $b.scalar_sum();
-        assert!((a_sum - b_sum).abs() < 1e-2, "{} != {}", a_sum, b_sum);
-    };
-}
-
-fn main() {
-    let syslib = SystemLibModule::default();
-
-    let mut params_bytes = Vec::new();
-    fs::File::open(concat!(env!("OUT_DIR"), "/test_nn/graph.params"))
-        .unwrap()
-        .read_to_end(&mut params_bytes)
-        .unwrap();
-    let params = tvm_graph_rt::load_param_dict(&params_bytes)
-        .unwrap()
-        .into_iter()
-        .map(|(k, v)| (k, v.to_owned()))
-        .collect::<HashMap<String, Tensor<'static>>>();
-
-    let graph = Graph::try_from(
-        &fs::read_to_string(concat!(env!("OUT_DIR"), "/test_nn/graph.json")).unwrap(),
-    )
-    .unwrap();
-    let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
-
-    let x = Array::from_shape_vec(
-        (BATCH_SIZE, IN_DIM),
-        (0..BATCH_SIZE * IN_DIM)
-            .map(|x| x as f32)
-            .collect::<Vec<f32>>(),
-    )
-    .unwrap();
-
-    let p0 = params.get("p0").unwrap().to_owned();
-    let p1 = params.get("p1").unwrap().to_owned();
-    println!("p0: {:?}", p0.shape());
-    println!("p1: {:?}", p1.shape());
-    let w = Array::try_from(p0)
-        .unwrap()
-        .into_shape((BATCH_SIZE * 4, IN_DIM))
-        .unwrap();
-    let b = Array::try_from(p1).unwrap();
-    let dense = x.dot(&w.t()) + &b;
-    let left = dense.slice(s![.., 0..IN_DIM]);
-    let right = dense.slice(s![.., IN_DIM..]);
-    let expected_o0 = &left + 1f32;
-    let expected_o1 = &right - 1f32;
-
-    exec.load_params(params);
-    exec.set_input("data", (&x).into());
-
-    check_sum!(exec, data, x);
-    check_sum!(exec, p0, w);
-    check_sum!(exec, p1, b);
-
-    exec.run();
-
-    check_sum!(exec, 0, expected_o0);
-    check_sum!(exec, 1, expected_o1);
-    check_sum!(exec, 2, dense);
-}
diff --git a/rust/tvm-graph-rt/tests/test_tvm_basic/Cargo.toml b/rust/tvm-graph-rt/tests/test_tvm_basic/Cargo.toml
deleted file mode 100644
index 7e86ef046356..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_basic/Cargo.toml
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "test-rt-tvm-basic"
-version = "0.0.1"
-license = "Apache-2.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-ndarray = "0.12"
-tvm-graph-rt = { path = "../../" }
-tvm-rt = { path = "../../../tvm-rt" }
-
-[build-dependencies]
-ar = "0.6"
-anyhow = "^1.0"
diff --git a/rust/tvm-graph-rt/tests/test_tvm_basic/build.rs b/rust/tvm-graph-rt/tests/test_tvm_basic/build.rs
deleted file mode 100644
index e1b4cfea74d5..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_basic/build.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-extern crate ar;
-
-use std::{path::PathBuf, process::Command};
-
-use std::fs::File;
-
-use anyhow::Result;
-use ar::Builder;
-
-fn main() -> Result<()> {
-    let mut out_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-    out_dir.push("lib");
-
-    if !out_dir.is_dir() {
-        std::fs::create_dir(&out_dir)?;
-    }
-
-    let obj_file = out_dir.join("test.o");
-    let lib_file = out_dir.join("libtest_basic.a");
-
-    let output = Command::new(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/build_test_lib.py"
-    ))
-    .arg(&out_dir)
-    .output()?;
-
-    assert!(
-        obj_file.exists(),
-        "Could not build tvm lib: {}",
-        String::from_utf8(output.stderr)?
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-
-    let mut builder = Builder::new(File::create(&lib_file)?);
-    builder.append_path(&obj_file)?;
-
-    drop(builder);
-
-    let status = Command::new("ranlib").arg(&lib_file).status()?;
-
-    assert!(status.success());
-
-    println!("cargo:rustc-link-lib=static=test_basic");
-    println!("cargo:rustc-link-search=native={}", out_dir.display());
-
-    Ok(())
-}
diff --git a/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py
deleted file mode 100755
index d6e1922efa85..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Prepares a simple TVM library for testing."""
-
-from os import path as osp
-import sys
-
-import tvm
-from tvm.relay.backend import Runtime
-from tvm import te
-
-
-def main():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = tvm.te.create_schedule(C.op)
-    s[C].parallel(s[C].op.axis[0])
-    runtime = Runtime("cpp", {"system-lib": True})
-    print(tvm.lower(s, [A, B, C], simple_mode=True))
-    tvm.build(s, [A, B, C], "llvm", runtime=runtime).save(osp.join(sys.argv[1], "test.o"))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/rust/tvm-graph-rt/tests/test_tvm_basic/src/main.rs b/rust/tvm-graph-rt/tests/test_tvm_basic/src/main.rs
deleted file mode 100644
index 9d774ce1670b..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_basic/src/main.rs
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use ndarray::Array;
-use tvm_graph_rt::{DLTensor, Module as _, SystemLibModule};
-
-mod tvm_mod {
-    tvm_graph_rt::import_module!("lib/test.o");
-}
-
-fn main() {
-    // try static
-    let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]);
-    let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]);
-    let mut c = Array::from_vec(vec![0f32; 4]);
-    let e = Array::from_vec(vec![2f32, 2., 4., 4.]);
-    let mut a_dl: DLTensor = (&mut a).into();
-    let mut b_dl: DLTensor = (&mut b).into();
-    let mut c_dl: DLTensor = (&mut c).into();
-    let args = vec![(&mut a_dl).into(), (&mut b_dl).into(), (&mut c_dl).into()];
-    tvm_mod::default_function(&args[..]).unwrap();
-    assert!(c.all_close(&e, 1e-8f32));
-
-    // try runtime
-    let syslib = SystemLibModule::default();
-    let add = syslib
-        .get_function("default_function")
-        .expect("main function not found");
-    add(&args[..]).unwrap();
-    assert!(c.all_close(&e, 1e-8f32));
-}
diff --git a/rust/tvm-graph-rt/tests/test_tvm_dso/Cargo.toml b/rust/tvm-graph-rt/tests/test_tvm_dso/Cargo.toml
deleted file mode 100644
index 1ff645b9c400..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_dso/Cargo.toml
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "test-rt-tvm-dso"
-version = "0.0.0"
-license = "Apache-2.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-ndarray="0.12"
-tvm-graph-rt = { path = "../../" }
-
-[build-dependencies]
-ar = "0.6"
-anyhow = "^1.0"
diff --git a/rust/tvm-graph-rt/tests/test_tvm_dso/build.rs b/rust/tvm-graph-rt/tests/test_tvm_dso/build.rs
deleted file mode 100644
index 1e3a9ab0770b..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_dso/build.rs
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{env, path::Path, process::Command};
-
-use anyhow::{Context, Result};
-
-fn main() -> Result<()> {
-    let out_dir = env::var("OUT_DIR").unwrap();
-
-    let exe = concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_test_lib.py");
-
-    let output = Command::new(exe)
-        .arg(&out_dir)
-        .output()
-        .with_context(|| anyhow::anyhow!("Failed to execute: {} {}", exe, &out_dir))?;
-
-    assert!(
-        Path::new(&format!("{}/test.so", out_dir)).exists(),
-        "Could not build tvm lib: {}",
-        String::from_utf8(output.stderr)
-            .unwrap()
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-
-    Ok(())
-}
diff --git a/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py
deleted file mode 100755
index 4b270fa17cbc..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Prepares a simple TVM library for testing."""
-
-from os import path as osp
-import sys
-
-import tvm
-from tvm import te
-from tvm.contrib import cc
-
-
-def main():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = tvm.te.create_schedule(C.op)
-    s[C].parallel(s[C].op.axis[0])
-    print(tvm.lower(s, [A, B, C], simple_mode=True))
-    obj_file = osp.join(sys.argv[1], "test.o")
-    tvm.build(s, [A, B, C], "llvm").save(obj_file)
-    cc.create_shared(osp.join(sys.argv[1], "test.so"), [obj_file])
-
-
-if __name__ == "__main__":
-    main()
diff --git a/rust/tvm-graph-rt/tests/test_tvm_dso/src/main.rs b/rust/tvm-graph-rt/tests/test_tvm_dso/src/main.rs
deleted file mode 100644
index 797d96ad7c73..000000000000
--- a/rust/tvm-graph-rt/tests/test_tvm_dso/src/main.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use ndarray::Array;
-use tvm_graph_rt::{DLTensor, DsoModule, Module};
-
-fn main() {
-    tvm_graph_rt::TVMGetLastError();
-    let module = DsoModule::new(concat!(env!("OUT_DIR"), "/test.so")).unwrap();
-    let add = module
-        .get_function("__tvm_main__")
-        .expect("main function not found");
-    let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]);
-    let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]);
-    let mut c = Array::from_vec(vec![0f32; 4]);
-    let e = Array::from_vec(vec![2f32, 2., 4., 4.]);
-    let mut a_dl: DLTensor = (&mut a).into();
-    let mut b_dl: DLTensor = (&mut b).into();
-    let mut c_dl: DLTensor = (&mut c).into();
-    let args = vec![(&mut a_dl).into(), (&mut b_dl).into(), (&mut c_dl).into()];
-    add(&args[..]).unwrap();
-    assert!(c.all_close(&e, 1e-8f32));
-}
diff --git a/rust/tvm-graph-rt/tests/test_wasm32/.cargo/config b/rust/tvm-graph-rt/tests/test_wasm32/.cargo/config
deleted file mode 100644
index 6b77899cb333..000000000000
--- a/rust/tvm-graph-rt/tests/test_wasm32/.cargo/config
+++ /dev/null
@@ -1,2 +0,0 @@
-[build]
-target = "wasm32-wasi"
diff --git a/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml b/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml
deleted file mode 100644
index 02e77d106f28..000000000000
--- a/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "test-rt-wasm32"
-version = "0.0.0"
-license = "Apache-2.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-ndarray = "0.12"
-tvm-graph-rt = { path = "../../" }
-
-[build-dependencies]
-anyhow = "^1.0"
diff --git a/rust/tvm-graph-rt/tests/test_wasm32/build.rs b/rust/tvm-graph-rt/tests/test_wasm32/build.rs
deleted file mode 100644
index 5c816c336825..000000000000
--- a/rust/tvm-graph-rt/tests/test_wasm32/build.rs
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{path::PathBuf, process::Command};
-
-use anyhow::{Context, Result};
-
-fn main() -> Result<()> {
-    let mut out_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-    out_dir.push("lib");
-
-    if !out_dir.is_dir() {
-        std::fs::create_dir(&out_dir).context("failed to create directory for WASM outputs")?;
-    }
-
-    let obj_file = out_dir.join("test.o");
-    let lib_file = out_dir.join("libtest_wasm32.a");
-
-    let output = Command::new(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/build_test_lib.py"
-    ))
-    .arg(&out_dir)
-    .output()
-    .context("failed to execute Python script for generating TVM library")?;
-
-    assert!(
-        obj_file.exists(),
-        "Could not build tvm lib: {}",
-        String::from_utf8(output.stderr)
-            .unwrap()
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-
-    let ar = option_env!("LLVM_AR").unwrap_or("llvm-ar-8");
-
-    let output = Command::new(ar)
-        .arg("rcs")
-        .arg(&lib_file)
-        .arg(&obj_file)
-        .output()
-        .context("failed to run LLVM_AR command")?;
-
-    assert!(
-        lib_file.exists(),
-        "Could not create archive: {}",
-        String::from_utf8(output.stderr)
-            .unwrap()
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-
-    println!("cargo:rustc-link-lib=static=test_wasm32");
-    println!("cargo:rustc-link-search=native={}", out_dir.display());
-    Ok(())
-}
diff --git a/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py
deleted file mode 100755
index 2bf327a31b1b..000000000000
--- a/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Prepares a simple TVM library for testing."""
-
-from os import path as osp
-import sys
-
-import tvm
-from tvm import te
-from tvm.relay.backend import Runtime
-
-
-def main():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = tvm.te.create_schedule(C.op)
-    s[C].parallel(s[C].op.axis[0])
-    print(tvm.lower(s, [A, B, C], simple_mode=True))
-    runtime = Runtime("cpp", {"system-lib": True})
-    tvm.build(s, [A, B, C], "llvm -mtriple=wasm32-unknown-unknown", runtime=runtime).save(
-        osp.join(sys.argv[1], "test.o")
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/rust/tvm-graph-rt/tests/test_wasm32/src/main.rs b/rust/tvm-graph-rt/tests/test_wasm32/src/main.rs
deleted file mode 100644
index 67ef21779cde..000000000000
--- a/rust/tvm-graph-rt/tests/test_wasm32/src/main.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-extern "C" {
-    static __tvm_module_ctx: i32;
-}
-
-#[no_mangle]
-unsafe fn __get_tvm_module_ctx() -> i32 {
-    // Refer a symbol in the libtest_wasm32.a to make sure that the link of the
-    // library is not optimized out.
-    __tvm_module_ctx
-}
-
-extern crate ndarray;
-#[macro_use]
-extern crate tvm_graph_rt;
-
-use ndarray::Array;
-use tvm_graph_rt::{DLTensor, Module as _, SystemLibModule};
-
-fn main() {
-    // try static
-    let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]);
-    let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]);
-    let mut c = Array::from_vec(vec![0f32; 4]);
-    let e = Array::from_vec(vec![2f32, 2., 4., 4.]);
-    let mut a_dl: DLTensor = (&mut a).into();
-    let mut b_dl: DLTensor = (&mut b).into();
-    let mut c_dl: DLTensor = (&mut c).into();
-
-    let syslib = SystemLibModule::default();
-    let add = syslib
-        .get_function("default_function")
-        .expect("main function not found");
-    call_packed!(add, &mut a_dl, &mut b_dl, &mut c_dl).unwrap();
-    assert!(c.all_close(&e, 1e-8f32));
-}
diff --git a/rust/tvm/.gitignore b/rust/tvm/.gitignore
deleted file mode 100644
index 2430329c78b6..000000000000
--- a/rust/tvm/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-target
-**/*.rs.bk
-Cargo.lock
-/tests/basics/add_*
-/examples/resnet/deploy_*
-/examples/resnet/*.png
-/examples/resnet/synset.*
diff --git a/rust/tvm/.travis.yml b/rust/tvm/.travis.yml
deleted file mode 100644
index e963b7c0ede5..000000000000
--- a/rust/tvm/.travis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-language: rust
-rust:
-  - nightly
-matrix:
-  fast_finish: true
diff --git a/rust/tvm/Cargo.toml b/rust/tvm/Cargo.toml
deleted file mode 100644
index 22dc546c93f7..000000000000
--- a/rust/tvm/Cargo.toml
+++ /dev/null
@@ -1,103 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "tvm"
-version = "0.1.1-alpha"
-license = "Apache-2.0"
-description = "Rust frontend support for TVM"
-repository = "https://github.com/apache/tvm"
-homepage = "https://github.com/apache/tvm"
-readme = "README.md"
-keywords = ["rust", "tvm"]
-categories = ["api-bindings", "science"]
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[features]
-default = ["python", "dynamic-linking"]
-dynamic-linking = ["tvm-rt/dynamic-linking"]
-static-linking = ["tvm-rt/static-linking"]
-blas = ["ndarray/blas"]
-python = ["pyo3"]
-# Enabling any of the following features is like setting the value to "ON" in config.cmake.
-use-cuda = ["tvm-rt/use-cuda"]
-use-opencl = ["tvm-rt/use-opencl"]
-use-vulkan = ["tvm-rt/use-vulkan"]
-use-metal = ["tvm-rt/use-metal"]
-use-rocm = ["tvm-rt/use-rocm"]
-use-hexagon-device = ["tvm-rt/use-hexagon-device"]
-use-rpc = ["tvm-rt/use-rpc"]
-use-threads = ["tvm-rt/use-threads"]
-use-llvm = ["tvm-rt/use-llvm"]
-use-stackvm-runtime = ["tvm-rt/use-stackvm-runtime"]
-use-graph-runtime = ["tvm-rt/use-graph-runtime"]
-use-graph-runtime-debug = ["tvm-rt/use-graph-runtime-debug"]
-use-openmp = ["tvm-rt/use-openmp"]
-use-relay-debug = ["tvm-rt/use-relay-debug"]
-use-rtti = ["tvm-rt/use-rtti"]
-use-mscv-mt = ["tvm-rt/use-mscv-mt"]
-use-install-dev = ["tvm-rt/use-install-dev"]
-hide-private-symbols = ["tvm-rt/hide-private-symbols"]
-use-fallback-stl-map = ["tvm-rt/use-fallback-stl-map"]
-use-index-default-i64 = ["tvm-rt/use-index-default-i64"]
-use-tf-tvmdsoop = ["tvm-rt/use-tf-tvmdsoop"]
-use-byodt-posit = ["tvm-rt/use-byodt-posit"]
-use-mkl = ["tvm-rt/use-mkl"]
-use-mkldnn = ["tvm-rt/use-mkldnn"]
-use-dnnl-codegen = ["tvm-rt/use-dnnl-codegen"]
-use-cudnn = ["tvm-rt/use-cudnn"]
-use-cublas = ["tvm-rt/use-cublas"]
-use-thrust = ["tvm-rt/use-thrust"]
-use-miopen = ["tvm-rt/use-miopen"]
-use-rocblas = ["tvm-rt/use-rocblas"]
-use-sort = ["tvm-rt/use-sort"]
-use-nnpack = ["tvm-rt/use-nnpack"]
-use-random = ["tvm-rt/use-random"]
-use-cpp-rpc = ["tvm-rt/use-cpp-rpc"]
-use-tflite = ["tvm-rt/use-tflite"]
-use-coreml = ["tvm-rt/use-coreml"]
-use-target-onnx = ["tvm-rt/use-target-onnx"]
-use-arm-compute-lib = ["tvm-rt/use-arm-compute-lib"]
-use-arm-compute-lib-graph-runtime = ["tvm-rt/use-arm-compute-lib-graph-runtime"]
-use-tensorrt-codegen = ["tvm-rt/use-tensorrt-codegen"]
-use-tensorrt-runtime = ["tvm-rt/use-tensorrt-runtime"]
-use-vitis-ai = ["tvm-rt/use-vitis-ai"]
-
-[dependencies.tvm-rt]
-version = "0.1.0-alpha"
-default-features = false
-path = "../tvm-rt/"
-
-[dependencies]
-thiserror = "^1.0"
-anyhow = "^1.0"
-lazy_static = "1.1"
-ndarray = "0.12"
-num-traits = "0.2"
-tvm-macros = { version = "0.1.1-alpha", path = "../tvm-macros/" }
-paste = "0.1"
-mashup = "0.1"
-once_cell = "^1.3.1"
-pyo3 = { version = "^0.13", optional = true }
-codespan-reporting = "0.9.5"
-structopt = { version = "0.3" }
-tracing = "^0.1"
-
-[[bin]]
-name = "tyck"
-required-features = ["dynamic-linking"]
diff --git a/rust/tvm/README.md b/rust/tvm/README.md
deleted file mode 100644
index 3455975ad81d..000000000000
--- a/rust/tvm/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# TVM
-
-This crate provides an idiomatic Rust API for [Apache TVM](https://github.com/apache/tvm).
-The code works on **Stable Rust** and is tested against `rustc 1.47`.
-
-You can find the API Documentation [here](https://tvm.apache.org/docs/api/rust/tvm/index.html).
-
-## What Does This Crate Offer?
-
-The goal of this crate is to provide bindings to both the TVM compiler and runtime
-APIs. First train your **Deep Learning** model using any major framework such as
-[PyTorch](https://pytorch.org/) or [TensorFlow](https://www.tensorflow.org/).
-Then use **TVM** to build and deploy optimized model artifacts on a supported devices such as CPU, GPU, OpenCL and specialized accelerators.
-
-The Rust bindings are composed of a few crates:
-- The [tvm](https://tvm.apache.org/docs/api/rust/tvm/index.html) crate which exposes Rust bindings to
-  both the compiler and runtime.
-- The [tvm_macros](https://tvm.apache.org/docs/api/rust/tvm/index.html) crate which provides macros
-  which generate unsafe boilerplate for TVM's data structures.
-- The [tvm_rt](https://tvm.apache.org/docs/api/rust/tvm_rt/index.html) crate which exposes Rust
-  bindings to the TVM runtime APIs.
-- The [tvm_sys] crate which provides raw bindings and linkage to the TVM C++ library.
-- The [tvm_graph_rt] crate which implements a version of the TVM graph executor in Rust vs. C++.
-
-These crates have been recently refactored and reflect a much different philosophy than
-previous bindings, as well as much increased support for more of the TVM API including
-exposing all of the compiler internals.
-
-These are still very much in development and should not be considered stable, but contributions
-and usage is welcome and encouraged. If you want to discuss design issues check our Discourse
-[forum](https://discuss.tvm.ai) and for bug reports check our GitHub [repository](https://github.com/apache/tvm).
-
-## Install
-
-Please follow the TVM [install](https://tvm.apache.org/docs/install/index.html) instructions, `export TVM_HOME=/path/to/tvm` and add `libtvm_runtime` to your `LD_LIBRARY_PATH`.
-
-*Note:* To run the end-to-end examples and tests, `tvm` and `topi` need to be added to your `PYTHONPATH` or it's automatic via an Anaconda environment when it is installed individually.
-
-### Disclaimers
-
-*Apache TVM is a top level project from the Apache software foundation. Please refer to the official Apache TVM website for Apache source releases. Apache TVM, Apache, the Apache feather, and the Apache TVM project logo are either trademarks or registered trademarks of the Apache Software Foundation.*
diff --git a/rust/tvm/examples/resnet/Cargo.toml b/rust/tvm/examples/resnet/Cargo.toml
deleted file mode 100644
index 1e45739dd93d..000000000000
--- a/rust/tvm/examples/resnet/Cargo.toml
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "resnet"
-version = "0.0.0"
-authors = ["TVM Contributors"]
-license = "Apache-2.0"
-build = "build.rs"
-edition = "2018"
-
-[dependencies]
-ndarray = "0.12"
-tvm-rt = { path = "../../../tvm-rt", features = ["standalone"] }
-image = "0.20"
-csv = "1.1"
-anyhow = "^1.0"
-
-[build-dependencies]
-anyhow = "1.0"
diff --git a/rust/tvm/examples/resnet/README.md b/rust/tvm/examples/resnet/README.md
deleted file mode 100644
index ad76ac0048a0..000000000000
--- a/rust/tvm/examples/resnet/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-## Resnet example
-
-This end-to-end example shows how to:
-* build `Resnet 18` with `tvm` from Python
-* use the provided Rust frontend API to test for an input image
-
-To run the example with pretrained resnet weights, first `tvm`  and `torchvision` must be installed for the python build. To install torchvision for cpu, run `pip install torch torchvision`
-and to install `tvm` with `llvm` follow the [TVM installation guide](https://tvm.apache.org/docs/install/index.html).
-
-* **Build the example**: `cargo build
-
-To have a successful build, note that it is required to instruct Rust compiler to link to the compiled shared library, for example with
-`println!("cargo:rustc-link-search=native={}", build_path)`. See the `build.rs` for more details.
-
-* **Run the example**: `cargo run`
-
-Note: To use pretrained weights, one can enable `--pretrained` in `build.rs` with
-
-```
-let output = Command::new("python")
-        .arg(concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_resnet.py"))
-        .arg(&format!("--build-dir={}", env!("CARGO_MANIFEST_DIR")))
-        .arg(&format!("--pretrained"))
-        .output()
-        .expect("Failed to execute command");
-```
-
-Otherwise, *random weights* are used, therefore, the prediction will be `limpkin, Aramus pictus`!
diff --git a/rust/tvm/examples/resnet/build.rs b/rust/tvm/examples/resnet/build.rs
deleted file mode 100644
index 9e3a76433ffc..000000000000
--- a/rust/tvm/examples/resnet/build.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use anyhow::{Context, Result};
-use std::{io::Write, path::Path, process::Command};
-
-fn main() -> Result<()> {
-    let out_dir = std::env::var("CARGO_MANIFEST_DIR")?;
-    let python_script = concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_resnet.py");
-    let synset_txt = concat!(env!("CARGO_MANIFEST_DIR"), "/synset.txt");
-
-    println!("cargo:rerun-if-changed={}", python_script);
-    println!("cargo:rerun-if-changed={}", synset_txt);
-
-    let output = Command::new("python3")
-        .arg(python_script)
-        .arg(&format!("--build-dir={}", out_dir))
-        .output()
-        .with_context(|| anyhow::anyhow!("failed to run python3"))?;
-
-    if !output.status.success() {
-        std::io::stdout()
-            .write_all(&output.stderr)
-            .context("Failed to write error")?;
-        panic!("Failed to execute build script");
-    }
-
-    assert!(
-        Path::new(&format!("{}/deploy_lib.o", out_dir)).exists(),
-        "Could not prepare demo: {}",
-        String::from_utf8(output.stderr)
-            .unwrap()
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-    println!("cargo:rustc-link-search=native={}", out_dir);
-
-    Ok(())
-}
diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py
deleted file mode 100644
index 4e8ae01c413b..000000000000
--- a/rust/tvm/examples/resnet/src/build_resnet.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import logging
-import shutil
-from os import path as osp
-
-import numpy as np
-import torch
-import torchvision
-import tvm
-from PIL import Image
-from tvm import relay, runtime
-from tvm.contrib import cc, graph_executor
-from tvm.contrib.download import download_testdata
-
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-parser = argparse.ArgumentParser(description="Resnet build example")
-aa = parser.add_argument
-aa("--build-dir", type=str, required=True, help="directory to put the build artifacts")
-aa("--batch-size", type=int, default=1, help="input image batch size")
-aa(
-    "--opt-level",
-    type=int,
-    default=3,
-    help="level of optimization. 0 is unoptimized and 3 is the highest level",
-)
-aa("--target", type=str, default="llvm", help="target for compilation")
-aa("--image-shape", type=str, default="3,224,224", help="input image dimensions")
-aa("--image-name", type=str, default="cat.png", help="name of input image to download")
-args = parser.parse_args()
-
-build_dir = args.build_dir
-batch_size = args.batch_size
-opt_level = args.opt_level
-target = tvm.target.create(args.target)
-image_shape = tuple(map(int, args.image_shape.split(",")))
-data_shape = (batch_size,) + image_shape
-
-
-def build(target_dir):
-    """Compiles resnet18 with TVM"""
-    # Download the pretrained model from Torchvision.
-    weights = torchvision.models.ResNet18_Weights.IMAGENET1K_V1
-    torch_model = torchvision.models.resnet18(weights=weights).eval()
-
-    input_shape = [1, 3, 224, 224]
-    input_data = torch.randn(input_shape)
-    scripted_model = torch.jit.trace(torch_model, input_data)
-    input_infos = [("data", input_data.shape)]
-    mod, params = relay.frontend.from_pytorch(scripted_model, input_infos)
-
-    # Add softmax to do classification in last layer.
-    func = mod["main"]
-    func = relay.Function(
-        func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs
-    )
-
-    target = "llvm"
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(func, target, params=params)
-
-    # save the model artifacts
-    deploy_lib = osp.join(target_dir, "deploy_lib.o")
-    lib.save(deploy_lib)
-    cc.create_shared(osp.join(target_dir, "deploy_lib.so"), [osp.join(target_dir, "deploy_lib.o")])
-
-    with open(osp.join(target_dir, "deploy_graph.json"), "w") as fo:
-        fo.write(graph)
-
-    with open(osp.join(target_dir, "deploy_param.params"), "wb") as fo:
-        fo.write(runtime.save_param_dict(params))
-
-
-def download_img_labels():
-    """Download an image and imagenet1k class labels for test"""
-
-    synset_url = "".join(
-        [
-            "https://gist.githubusercontent.com/zhreshold/",
-            "4d0b62f3d01426887599d4f7ede23ee5/raw/",
-            "596b27d23537e5a1b5751d2b0481ef172f58b539/",
-            "imagenet1000_clsid_to_human.txt",
-        ]
-    )
-    synset_name = "synset.txt"
-    synset_path = download_testdata(synset_url, synset_name + ".raw", module="data", overwrite=True)
-
-    with open(synset_path) as fin:
-        data = fin.read()
-        synset = eval(data)
-
-    with open(synset_name, "w") as f:
-        for key in synset:
-            f.write(synset[key])
-            f.write("\n")
-
-    print(synset_path)
-    print(synset_name)
-
-    return synset
-
-
-def transform_image(image):
-    image = np.array(image) - np.array([123.0, 117.0, 104.0])
-    image /= np.array([58.395, 57.12, 57.375])
-    image = image.transpose((2, 0, 1))
-    image = image[np.newaxis, :]
-    return image
-
-
-def get_cat_image():
-    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
-    img_path = download_testdata(img_url, "cat.png", module="data")
-    shutil.copyfile(img_path, "cat.png")
-    img = Image.open(img_path).resize((224, 224))
-    return transform_image(img)
-
-
-def test_build(build_dir):
-    """Sanity check with the cat image we download."""
-    graph = open(osp.join(build_dir, "deploy_graph.json")).read()
-    lib = tvm.runtime.load_module(osp.join(build_dir, "deploy_lib.so"))
-    params = bytearray(open(osp.join(build_dir, "deploy_param.params"), "rb").read())
-    input_data = get_cat_image()
-    dev = tvm.cpu()
-    module = graph_executor.create(graph, lib, dev)
-    module.load_params(params)
-    module.run(data=input_data)
-    out = module.get_output(0).numpy()
-    top1 = np.argmax(out[0])
-    synset = download_img_labels()
-    print("TVM prediction top-1:", top1, synset[top1])
-
-
-if __name__ == "__main__":
-    logger.info("Compiling the model to graph executor.")
-    build(build_dir)
-    logger.info("Testing the model's predication on test data.")
-    test_build(build_dir)
diff --git a/rust/tvm/examples/resnet/src/main.rs b/rust/tvm/examples/resnet/src/main.rs
deleted file mode 100644
index c22d55f2e4da..000000000000
--- a/rust/tvm/examples/resnet/src/main.rs
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::{
-    fs::{self, File},
-    io::{BufRead, BufReader},
-    path::Path,
-};
-
-use ::ndarray::{Array, ArrayD, Axis};
-use image::{FilterType, GenericImageView};
-
-use anyhow::Context as _;
-use tvm_rt::graph_rt::GraphRt;
-use tvm_rt::*;
-
-fn main() -> anyhow::Result<()> {
-    let dev = Device::cpu(0);
-    println!("{}", concat!(env!("CARGO_MANIFEST_DIR"), "/cat.png"));
-
-    let img = image::open(concat!(env!("CARGO_MANIFEST_DIR"), "/cat.png"))
-        .context("Failed to open cat.png")?;
-
-    println!("original image dimensions: {:?}", img.dimensions());
-    // for bigger size images, one needs to first resize to 256x256
-    // with `img.resize_exact` method and then `image.crop` to 224x224
-    let img = img.resize(224, 224, FilterType::Nearest).to_rgb();
-    println!("resized image dimensions: {:?}", img.dimensions());
-    let mut pixels: Vec<f32> = vec![];
-    for pixel in img.pixels() {
-        let tmp = pixel.data;
-        // normalize the RGB channels using mean, std of imagenet1k
-        let tmp = [
-            (tmp[0] as f32 - 123.0) / 58.395, // R
-            (tmp[1] as f32 - 117.0) / 57.12,  // G
-            (tmp[2] as f32 - 104.0) / 57.375, // B
-        ];
-        for e in &tmp {
-            pixels.push(*e);
-        }
-    }
-
-    let arr = Array::from_shape_vec((224, 224, 3), pixels)?;
-    let arr: ArrayD<f32> = arr.permuted_axes([2, 0, 1]).into_dyn();
-    // make arr shape as [1, 3, 224, 224] acceptable to resnet
-    let arr = arr.insert_axis(Axis(0));
-    // create input tensor from rust's ndarray
-    let input = NDArray::from_rust_ndarray(&arr, Device::cpu(0), DataType::float(32, 1))?;
-    println!(
-        "input shape is {:?}, len: {}, size: {}",
-        input.shape(),
-        input.len(),
-        input.size(),
-    );
-
-    let graph = fs::read_to_string(concat!(env!("CARGO_MANIFEST_DIR"), "/deploy_graph.json"))
-        .context("Failed to open graph")?;
-
-    // load the built module
-    let lib = Module::load(&Path::new(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/deploy_lib.so"
-    )))?;
-
-    // parse parameters and convert to TVMByteArray
-    let params: Vec<u8> = fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/deploy_param.params"))?;
-    println!("param bytes: {}", params.len());
-
-    // If you want an easy way to test a memory leak simply replace the program below with:
-    // let mut output: Vec<f32>;
-
-    // loop {
-    //     let mut graph_rt = GraphRt::create_from_parts(&graph, lib.clone(), dev)?;
-    //     graph_rt.load_params(params.clone())?;
-    //     graph_rt.set_input("data", input.clone())?;
-    //     graph_rt.run()?;
-
-    //     // prepare to get the output
-    //     let output_shape = &[1, 1000];
-    //     let output_nd = NDArray::empty(output_shape, Device::cpu(0), DataType::float(32, 1));
-    //     graph_rt.get_output_into(0, output_nd.clone())?;
-
-    //     // flatten the output as Vec<f32>
-    //     output = output_nd.to_vec::<f32>()?;
-    // }
-
-    let mut graph_rt = GraphRt::create_from_parts(&graph, lib, dev)?;
-    graph_rt.load_params(params)?;
-    graph_rt.set_input("data", input)?;
-    graph_rt.run()?;
-
-    // prepare to get the output
-    let output_shape = &[1, 1000];
-    let output_nd = NDArray::empty(output_shape, Device::cpu(0), DataType::float(32, 1));
-    graph_rt.get_output_into(0, output_nd.clone())?;
-
-    // flatten the output as Vec<f32>
-    let output: Vec<f32> = output_nd.to_vec::<f32>()?;
-
-    // find the maximum entry in the output and its index
-    let (argmax, max_prob) = output
-        .iter()
-        .copied()
-        .enumerate()
-        .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
-        .unwrap();
-
-    // create a hash map of (class id, class name)
-    let file = File::open("synset.txt").context("failed to open synset")?;
-    let synset: Vec<std::string::String> = BufReader::new(file)
-        .lines()
-        .into_iter()
-        .map(|x| x.expect("readline failed"))
-        .collect();
-
-    let label = &synset[argmax];
-    println!(
-        "input image belongs to the class `{}` with probability {}",
-        label, max_prob
-    );
-
-    Ok(())
-}
diff --git a/rust/tvm/src/bin/tyck.rs b/rust/tvm/src/bin/tyck.rs
deleted file mode 100644
index 839a6bd1c17f..000000000000
--- a/rust/tvm/src/bin/tyck.rs
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::path::PathBuf;
-
-use anyhow::Result;
-use structopt::StructOpt;
-
-use tvm::ir::diagnostics::codespan;
-use tvm::ir::{self, IRModule};
-use tvm::runtime::Error;
-
-#[derive(Debug, StructOpt)]
-#[structopt(name = "tyck", about = "Parse and type check a Relay program.")]
-struct Opt {
-    /// Input file
-    #[structopt(parse(from_os_str))]
-    input: PathBuf,
-}
-
-fn main() -> Result<()> {
-    codespan::init().expect("Failed to initialize Rust based diagnostics.");
-    let opt = Opt::from_args();
-    let _module = match IRModule::parse_file(opt.input) {
-        Err(ir::module::Error::TVM(Error::DiagnosticError(_))) => return Ok(()),
-        Err(e) => {
-            return Err(e.into());
-        }
-        Ok(module) => module,
-    };
-
-    Ok(())
-}
diff --git a/rust/tvm/src/compiler/graph_rt.rs b/rust/tvm/src/compiler/graph_rt.rs
deleted file mode 100644
index 8313e47bea20..000000000000
--- a/rust/tvm/src/compiler/graph_rt.rs
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::convert::TryInto;
-use std::io::Read;
-use std::path::Path;
-
-use once_cell::sync::Lazy;
-use thiserror::Error;
-
-use crate::ir::IRModule;
-use crate::python;
-use crate::runtime::{map::Map, Function, Module as RtModule, NDArray, String};
-
-#[derive(Error, Debug)]
-pub enum Error {
-    #[error("{0}")]
-    IO(#[from] std::io::Error),
-    #[error("{0}")]
-    TVM(#[from] crate::errors::Error),
-}
-
-static TVM_BUILD: Lazy<Function> = Lazy::new(|| {
-    python::import("tvm").unwrap();
-    python::import("tvm.relay").unwrap();
-    Function::get("tvm.relay.build").unwrap()
-});
-
-fn _compile_module(
-    module: IRModule,
-    target: String,
-    target_host: String,
-    params: Map<String, NDArray>,
-    module_name: String,
-) -> Result<RtModule, Error> {
-    // The RAW API is Fn(IRModule, String, String, Map<String, NDArray>, String);
-    let module = TVM_BUILD.invoke(vec![
-        (&module).into(),
-        (&target).into(),
-        (&target_host).into(),
-        (&params).into(),
-        (&module_name).into(),
-    ])?;
-    let module: RtModule = module.try_into().unwrap();
-    Ok(module)
-}
-
-#[derive(Debug)]
-pub struct CompilerConfig {
-    target: Option<String>,
-    target_host: Option<String>,
-    params: Map<String, NDArray>,
-    module_name: Option<String>,
-}
-
-impl Default for CompilerConfig {
-    fn default() -> Self {
-        CompilerConfig {
-            target: None,
-            target_host: None,
-            params: Map::empty(),
-            module_name: None,
-        }
-    }
-}
-
-/// Compile a module from a configuration and IRModule.
-///
-/// # Arguments
-///
-/// * `config` - The configuration for the compiler.
-/// * `module` - The IRModule to compile.
-pub fn compile_module(config: CompilerConfig, module: IRModule) -> Result<RtModule, Error> {
-    let target = config.target.unwrap_or("llvm".into());
-    _compile_module(
-        module,
-        target,
-        "llvm".into(),
-        Map::<String, NDArray>::empty(),
-        "default".into(),
-    )
-}
-
-/// Compile an IRModule on disk and output a runtime module to disk.
-///
-/// # Arguments
-/// * `config` - The configuration for the compiler.
-/// * `ir_mod_path` - The path the serialized IRModule.
-//
-/// * `output_rt_mod_path` - The path to the output runtime module.
-pub fn compile_from_disk<P1, P2>(
-    config: CompilerConfig,
-    ir_mod_path: P1,
-    output_rt_mod_path: P2,
-) -> Result<(), Error>
-where
-    P1: AsRef<Path>,
-    P2: AsRef<Path>,
-{
-    let mut input_file = std::fs::File::open(ir_mod_path.as_ref())?;
-    let mut input_module_text = std::string::String::new();
-    input_file.read_to_string(&mut input_module_text)?;
-    let input_module = IRModule::parse("name", input_module_text)?;
-    let rt_module = compile_module(config, input_module)?;
-    let output_path_str = output_rt_mod_path.as_ref().display().to_string();
-    rt_module.export_library(output_path_str)?;
-    Ok(())
-}
diff --git a/rust/tvm/src/compiler/mod.rs b/rust/tvm/src/compiler/mod.rs
deleted file mode 100644
index ed8b47edbad4..000000000000
--- a/rust/tvm/src/compiler/mod.rs
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-pub mod graph_rt;
diff --git a/rust/tvm/src/ir/arith.rs b/rust/tvm/src/ir/arith.rs
deleted file mode 100644
index 672e6e6113a0..000000000000
--- a/rust/tvm/src/ir/arith.rs
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::runtime::{Object, ObjectPtr};
-
-use tvm_macros::Object;
-
-macro_rules! define_node {
-    ($name:ident, $ref:expr, $typekey:expr; $node:ident { $($id:ident : $t:ty),*}) => {
-        #[repr(C)]
-        #[derive(Object, Debug)]
-        #[ref_name = $ref]
-        #[type_key = $typekey]
-        pub struct $node {
-            base: Object,
-            $(pub $id : $t),*
-        }
-
-        impl $name {
-            pub fn new($($id : $t,)*) -> $name {
-                let base = Object::base::<$node>();
-                let node = $node { base, $($id),* };
-                $name(Some(ObjectPtr::new(node)))
-            }
-        }
-    }
-}
-
-define_node!(ConstIntBound, "ConstIntBound", "arith.ConstIntBound";
-             ConstIntBoundNode { min_value: i64, max_value: i64 });
diff --git a/rust/tvm/src/ir/attrs.rs b/rust/tvm/src/ir/attrs.rs
deleted file mode 100644
index 739ed405c906..000000000000
--- a/rust/tvm/src/ir/attrs.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::runtime::Object;
-use tvm_macros::Object;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Attrs"]
-#[type_key = "Attrs"]
-pub struct BaseAttrsNode {
-    pub base: Object,
-}
diff --git a/rust/tvm/src/ir/diagnostics/codespan.rs b/rust/tvm/src/ir/diagnostics/codespan.rs
deleted file mode 100644
index 22e51e4e7396..000000000000
--- a/rust/tvm/src/ir/diagnostics/codespan.rs
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-//! A TVM diagnostics renderer which uses the Rust `codespan` library
-//! to produce error messages.
-//!
-//! This is an example of using the exposed API surface of TVM to
-//! customize the compiler behavior.
-use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
-
-use codespan_reporting::diagnostic::{Diagnostic as CDiagnostic, Label, Severity};
-use codespan_reporting::files::SimpleFiles;
-use codespan_reporting::term::termcolor::{ColorChoice, StandardStream};
-use codespan_reporting::term::{self};
-
-use super::*;
-use crate::ir::source_map::*;
-
-/// A representation of a TVM Span as a range of bytes in a file.
-struct ByteRange<FileId> {
-    /// The file in which the range occurs.
-    #[allow(dead_code)]
-    file_id: FileId,
-    /// The range start.
-    start_pos: usize,
-    /// The range end.
-    end_pos: usize,
-}
-
-/// A mapping from Span to ByteRange for a single file.
-enum FileSpanToByteRange {
-    AsciiSource(Vec<usize>),
-    #[allow(dead_code)]
-    Utf8 {
-        /// Map character regions which are larger then 1-byte to length.
-        lengths: HashMap<isize, isize>,
-        /// The source of the program.
-        source: String,
-    },
-}
-
-impl FileSpanToByteRange {
-    /// Construct a span to byte range mapping from the program source.
-    fn new(source: String) -> FileSpanToByteRange {
-        if source.is_ascii() {
-            let line_lengths = source.lines().map(|line| line.len()).collect();
-            FileSpanToByteRange::AsciiSource(line_lengths)
-        } else {
-            panic!()
-        }
-    }
-
-    /// Lookup the corresponding ByteRange for a given Span.
-    fn lookup(&self, span: &Span) -> ByteRange<String> {
-        use FileSpanToByteRange::*;
-
-        let source_name: String = span.source_name.name.as_str().unwrap().into();
-
-        match self {
-            AsciiSource(ref line_lengths) => {
-                let start_pos = (&line_lengths[0..(span.line - 1) as usize])
-                    .into_iter()
-                    .sum::<usize>()
-                    + (span.column) as usize;
-                let end_pos = (&line_lengths[0..(span.end_line - 1) as usize])
-                    .into_iter()
-                    .sum::<usize>()
-                    + (span.end_column) as usize;
-                ByteRange {
-                    file_id: source_name,
-                    start_pos,
-                    end_pos,
-                }
-            }
-            _ => panic!(),
-        }
-    }
-}
-
-/// A mapping for all files in a source map to byte ranges.
-struct SpanToByteRange {
-    map: HashMap<String, FileSpanToByteRange>,
-}
-
-impl SpanToByteRange {
-    fn new() -> SpanToByteRange {
-        SpanToByteRange {
-            map: HashMap::new(),
-        }
-    }
-
-    /// Add a source file to the span mapping.
-    pub fn add_source(&mut self, source: Source) {
-        let source_name: String = source.source_name.name.as_str().expect("foo").into();
-
-        if self.map.contains_key(&source_name) {
-            panic!()
-        } else {
-            let source = source.source.as_str().expect("fpp").into();
-            self.map
-                .insert(source_name, FileSpanToByteRange::new(source));
-        }
-    }
-
-    /// Lookup a span to byte range mapping.
-    ///
-    /// First resolves the Span to a file, and then maps the span to a byte range in the file.
-    pub fn lookup(&self, span: &Span) -> ByteRange<String> {
-        let source_name: String = span.source_name.name.as_str().expect("foo").into();
-
-        match self.map.get(&source_name) {
-            Some(file_span_to_bytes) => file_span_to_bytes.lookup(span),
-            None => panic!(),
-        }
-    }
-}
-
-/// The state of the `codespan` based diagnostics.
-struct DiagnosticState {
-    files: SimpleFiles<String, String>,
-    span_map: SpanToByteRange,
-    // todo unify wih source name
-    source_to_id: HashMap<String, usize>,
-}
-
-impl DiagnosticState {
-    fn new() -> DiagnosticState {
-        DiagnosticState {
-            files: SimpleFiles::new(),
-            span_map: SpanToByteRange::new(),
-            source_to_id: HashMap::new(),
-        }
-    }
-
-    fn add_source(&mut self, source: Source) {
-        let source_str: String = source.source.as_str().unwrap().into();
-        let source_name: String = source.source_name.name.as_str().unwrap().into();
-        self.span_map.add_source(source);
-        let file_id = self.files.add(source_name.clone(), source_str);
-        self.source_to_id.insert(source_name, file_id);
-    }
-
-    fn to_diagnostic(&self, diag: super::Diagnostic) -> CDiagnostic<usize> {
-        let severity = match diag.level {
-            DiagnosticLevel::Error => Severity::Error,
-            DiagnosticLevel::Warning => Severity::Warning,
-            DiagnosticLevel::Note => Severity::Note,
-            DiagnosticLevel::Help => Severity::Help,
-            DiagnosticLevel::Bug => Severity::Bug,
-        };
-
-        let source_name: String = diag.span.source_name.name.as_str().unwrap().into();
-        let file_id = *self.source_to_id.get(&source_name).unwrap();
-
-        let message: String = diag.message.as_str().unwrap().into();
-
-        let byte_range = self.span_map.lookup(&diag.span);
-
-        let diagnostic = CDiagnostic::new(severity)
-            .with_message(message)
-            .with_code("EXXX")
-            .with_labels(vec![Label::primary(
-                file_id,
-                byte_range.start_pos..byte_range.end_pos,
-            )]);
-
-        diagnostic
-    }
-}
-
-fn renderer(state: &mut DiagnosticState, diag_ctx: DiagnosticContext) {
-    let source_map = diag_ctx.module.source_map.clone();
-    let writer = StandardStream::stderr(ColorChoice::Always);
-    let config = codespan_reporting::term::Config::default();
-    for diagnostic in diag_ctx.diagnostics.clone() {
-        match source_map.source_map.get(&diagnostic.span.source_name) {
-            Err(err) => panic!("{}", err),
-            Ok(source) => {
-                state.add_source(source);
-                let diagnostic = state.to_diagnostic(diagnostic);
-                term::emit(&mut writer.lock(), &config, &state.files, &diagnostic).unwrap();
-            }
-        }
-    }
-}
-
-/// Initialize the `codespan` based diagnostics.
-///
-/// Calling this function will globally override the TVM diagnostics renderer.
-pub fn init() -> Result<()> {
-    let diag_state = Arc::new(Mutex::new(DiagnosticState::new()));
-    let render_fn = move |diag_ctx: DiagnosticContext| {
-        let mut guard = diag_state.lock().unwrap();
-        renderer(&mut *guard, diag_ctx);
-    };
-
-    override_renderer(Some(render_fn))?;
-    Ok(())
-}
diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs
deleted file mode 100644
index 91e221131216..000000000000
--- a/rust/tvm/src/ir/diagnostics/mod.rs
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use super::module::IRModule;
-use super::span::*;
-use crate::runtime::function::Result;
-use crate::runtime::object::{Object, ObjectPtr};
-use crate::runtime::{
-    array::Array,
-    function::{self, Function, ToFunction},
-    string::String as TString,
-};
-/// The diagnostic interface to TVM, used for reporting and rendering
-/// diagnostic information by the compiler. This module exposes
-/// three key abstractions: a Diagnostic, the DiagnosticContext,
-/// and the DiagnosticRenderer.
-use tvm_macros::{external, Object};
-
-pub mod codespan;
-
-external! {
-    #[name("runtime.ArrayGetItem")]
-    fn get_renderer() -> DiagnosticRenderer;
-
-    #[name("diagnostics.DiagnosticRenderer")]
-    fn diagnostic_renderer(func: Function) -> DiagnosticRenderer;
-
-    #[name("diagnostics.Emit")]
-    fn emit(ctx: DiagnosticContext, diagnostic: Diagnostic) -> ();
-
-    #[name("diagnostics.DiagnosticContextDefault")]
-    fn diagnostic_context_default(module: IRModule) -> DiagnosticContext;
-
-    #[name("diagnostics.DiagnosticContextRender")]
-    fn diagnostic_context_render(ctx: DiagnosticContext) -> ();
-
-    #[name("diagnostics.DiagnosticRendererRender")]
-    fn diagnositc_renderer_render(renderer: DiagnosticRenderer, ctx: DiagnosticContext) -> ();
-
-    #[name("diagnostics.ClearRenderer")]
-    fn clear_renderer() -> ();
-}
-
-/// The diagnostic level, controls the printing of the message.
-#[repr(C)]
-#[derive(PartialEq, Eq, Debug)]
-pub enum DiagnosticLevel {
-    Bug = 10,
-    Error = 20,
-    Warning = 30,
-    Note = 40,
-    Help = 50,
-}
-
-/// A compiler diagnostic.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Diagnostic"]
-#[type_key = "Diagnostic"]
-pub struct DiagnosticNode {
-    pub base: Object,
-    /// The level.
-    pub level: DiagnosticLevel,
-    /// The span at which to report an error.
-    pub span: Span,
-    /// The diagnostic message.
-    pub message: TString,
-}
-
-impl Diagnostic {
-    pub fn new(level: DiagnosticLevel, span: Span, message: TString) -> Diagnostic {
-        let node = DiagnosticNode {
-            base: Object::base::<DiagnosticNode>(),
-            level,
-            span,
-            message,
-        };
-        ObjectPtr::new(node).into()
-    }
-
-    pub fn bug(span: Span) -> DiagnosticBuilder {
-        DiagnosticBuilder::new(DiagnosticLevel::Bug, span)
-    }
-
-    pub fn error(span: Span) -> DiagnosticBuilder {
-        DiagnosticBuilder::new(DiagnosticLevel::Error, span)
-    }
-
-    pub fn warning(span: Span) -> DiagnosticBuilder {
-        DiagnosticBuilder::new(DiagnosticLevel::Warning, span)
-    }
-
-    pub fn note(span: Span) -> DiagnosticBuilder {
-        DiagnosticBuilder::new(DiagnosticLevel::Note, span)
-    }
-
-    pub fn help(span: Span) -> DiagnosticBuilder {
-        DiagnosticBuilder::new(DiagnosticLevel::Help, span)
-    }
-}
-
-/// A wrapper around std::stringstream to build a diagnostic.
-pub struct DiagnosticBuilder {
-    /// The level.
-    pub level: DiagnosticLevel,
-
-    /// The span of the diagnostic.
-    pub span: Span,
-
-    /// The in progress message.
-    pub message: String,
-}
-
-impl DiagnosticBuilder {
-    pub fn new(level: DiagnosticLevel, span: Span) -> DiagnosticBuilder {
-        DiagnosticBuilder {
-            level,
-            span,
-            message: "".into(),
-        }
-    }
-}
-
-/// Display diagnostics in a given display format.
-///
-/// A diagnostic renderer is responsible for converting the
-/// raw diagnostics into consumable output.
-///
-/// For example the terminal renderer will render a sequence
-/// of compiler diagnostics to std::out and std::err in
-/// a human readable form.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "DiagnosticRenderer"]
-#[type_key = "DiagnosticRenderer"]
-/// A diagnostic renderer, which given a diagnostic context produces a "rendered"
-/// form of the diagnostics for either human or computer consumption.
-pub struct DiagnosticRendererNode {
-    /// The base type.
-    pub base: Object,
-    // TODO(@jroesch): we can't easily exposed packed functions due to
-    // memory layout
-    // missing field here
-}
-
-impl DiagnosticRenderer {
-    /// Render the provided context.
-    pub fn render(&self, ctx: DiagnosticContext) -> Result<()> {
-        diagnositc_renderer_render(self.clone(), ctx)
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "DiagnosticContext"]
-#[type_key = "DiagnosticContext"]
-/// A diagnostic context for recording errors against a source file.
-pub struct DiagnosticContextNode {
-    // The base type.
-    pub base: Object,
-
-    /// The Module to report against.
-    pub module: IRModule,
-
-    /// The set of diagnostics to report.
-    pub diagnostics: Array<Diagnostic>,
-
-    /// The renderer set for the context.
-    pub renderer: DiagnosticRenderer,
-}
-
-/// A diagnostic context which records active errors
-/// and contains a renderer.
-impl DiagnosticContext {
-    pub fn new<F>(module: IRModule, render_func: F) -> DiagnosticContext
-    where
-        F: Fn(DiagnosticContext) -> () + 'static,
-    {
-        let renderer = diagnostic_renderer(render_func.to_function()).unwrap();
-        let node = DiagnosticContextNode {
-            base: Object::base::<DiagnosticContextNode>(),
-            module,
-            diagnostics: Array::from_vec(vec![]).unwrap(),
-            renderer,
-        };
-        DiagnosticContext(Some(ObjectPtr::new(node)))
-    }
-
-    pub fn default(module: IRModule) -> DiagnosticContext {
-        diagnostic_context_default(module).unwrap()
-    }
-
-    /// Emit a diagnostic.
-    pub fn emit(&mut self, diagnostic: Diagnostic) -> Result<()> {
-        emit(self.clone(), diagnostic)
-    }
-
-    /// Render the errors and raise a DiagnosticError exception.
-    pub fn render(&mut self) -> Result<()> {
-        diagnostic_context_render(self.clone())
-    }
-
-    /// Emit a diagnostic and then immediately attempt to render all errors.
-    pub fn emit_fatal(&mut self, diagnostic: Diagnostic) -> Result<()> {
-        self.emit(diagnostic)?;
-        self.render()?;
-        Ok(())
-    }
-}
-
-/// Override the global diagnostics renderer.
-// render_func: Option[Callable[[DiagnosticContext], None]]
-//     If the render_func is None it will remove the current custom renderer
-//     and return to default behavior.
-fn override_renderer<F>(opt_func: Option<F>) -> Result<()>
-where
-    F: Fn(DiagnosticContext) -> () + 'static,
-{
-    match opt_func {
-        None => clear_renderer(),
-        Some(func) => {
-            let func = func.to_function();
-            let render_factory = move || diagnostic_renderer(func.clone()).unwrap();
-
-            function::register_override(render_factory, "diagnostics.OverrideRenderer", true)?;
-
-            Ok(())
-        }
-    }
-}
diff --git a/rust/tvm/src/ir/expr.rs b/rust/tvm/src/ir/expr.rs
deleted file mode 100644
index 1a0e7aea39c9..000000000000
--- a/rust/tvm/src/ir/expr.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use tvm_macros::Object;
-
-use crate::runtime::String as TString;
-use crate::runtime::{self, external, IsObject, IsObjectRef, Object, ObjectPtr, ObjectRef};
-use crate::DataType;
-
-use super::relay;
-use super::span::Span;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "BaseExpr"]
-#[type_key = "Expr"]
-pub struct BaseExprNode {
-    pub base: Object,
-    pub span: Span,
-}
-
-impl BaseExprNode {
-    pub fn base<T: IsObject>(span: Span) -> BaseExprNode {
-        BaseExprNode {
-            base: Object::base::<T>(),
-            span,
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PrimExpr"]
-#[type_key = "PrimExpr"]
-pub struct PrimExprNode {
-    pub base: BaseExprNode,
-    pub datatype: DataType,
-}
-
-impl PrimExprNode {
-    pub fn base<T: IsObject>(datatype: DataType, span: Span) -> PrimExprNode {
-        PrimExprNode {
-            base: BaseExprNode::base::<T>(span),
-            datatype,
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "GlobalVar"]
-#[type_key = "GlobalVar"]
-pub struct GlobalVarNode {
-    pub base: relay::ExprNode,
-    pub name_hint: TString,
-}
-
-impl GlobalVar {
-    pub fn new(name_hint: String, span: Span) -> GlobalVar {
-        let node = GlobalVarNode {
-            base: relay::ExprNode::base::<GlobalVarNode>(span),
-            name_hint: name_hint.into(),
-        };
-        GlobalVar(Some(ObjectPtr::new(node)))
-    }
-}
-
-// TODO(@jroesch): update to match TVM
-// Move IntImm
-// Define FloatImm
-// Define Bool
-// Define tvm::Integer?
-// Define RangeNode
-
-// TODO: figure out how to type the last argument runtime::TypedPackedFunc<String(ObjectRef)> annotate)
-external! {
-    #[name("relay.ir.AsText")]
-    fn _as_text(object: ObjectRef, show_meta_data: i32, annotate: runtime::Function) -> TString;
-}
-
-pub fn as_text<T: IsObjectRef>(object: T) -> String {
-    let no_func = unsafe { runtime::Function::null() };
-    _as_text(object.upcast(), 0, no_func)
-        .unwrap()
-        .as_str()
-        .unwrap()
-        .into()
-}
diff --git a/rust/tvm/src/ir/function.rs b/rust/tvm/src/ir/function.rs
deleted file mode 100644
index 43aca869f385..000000000000
--- a/rust/tvm/src/ir/function.rs
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use tvm_macros::Object;
-
-use super::span::Span;
-
-use crate::ir::relay::ExprNode;
-use crate::runtime::{IsObject, IsObjectRef, ObjectRef};
-
-// TODO(@jroesch): define DictAttrs
-pub type DictAttrs = ObjectRef;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "BaseFunc"]
-#[type_key = "BaseFunc"]
-pub struct BaseFuncNode {
-    pub base: ExprNode,
-    pub attrs: DictAttrs,
-}
-
-impl BaseFuncNode {
-    pub fn base<T: IsObject>() -> BaseFuncNode {
-        BaseFuncNode {
-            base: ExprNode::base::<T>(Span::null()),
-            attrs: <ObjectRef as IsObjectRef>::null(),
-        }
-    }
-}
diff --git a/rust/tvm/src/ir/mod.rs b/rust/tvm/src/ir/mod.rs
deleted file mode 100644
index 6d5158005497..000000000000
--- a/rust/tvm/src/ir/mod.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-pub mod arith;
-pub mod attrs;
-pub mod diagnostics;
-pub mod expr;
-pub mod function;
-pub mod module;
-pub mod op;
-pub mod relay;
-pub mod source_map;
-pub mod span;
-pub mod tir;
-pub mod ty;
-
-pub use expr::*;
-pub use module::IRModule;
diff --git a/rust/tvm/src/ir/module.rs b/rust/tvm/src/ir/module.rs
deleted file mode 100644
index bb1d2b730d2b..000000000000
--- a/rust/tvm/src/ir/module.rs
+++ /dev/null
@@ -1,399 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::collections::HashMap;
-use std::iter::FromIterator;
-use std::path::Path;
-
-use thiserror::Error;
-use tvm_macros::Object;
-
-use crate::runtime::array::Array;
-use crate::runtime::function::Result;
-use crate::runtime::map::Map;
-use crate::runtime::string::String as TVMString;
-use crate::runtime::{external, IsObjectRef, Object, ObjectRef};
-
-use super::expr::GlobalVar;
-use super::function::BaseFunc;
-use super::source_map::SourceMap;
-use super::{relay, ty::GlobalTypeVar, ty::TypeData};
-
-#[derive(Error, Debug)]
-pub enum Error {
-    #[error("{0}")]
-    IO(#[from] std::io::Error),
-    #[error("{0}")]
-    TVM(#[from] crate::runtime::Error),
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "IRModule"]
-#[type_key = "IRModule"]
-pub struct IRModuleNode {
-    pub base: Object,
-    pub functions: Map<GlobalVar, BaseFunc>,
-    pub type_definitions: Map<GlobalTypeVar, TypeData>,
-    pub source_map: SourceMap,
-    // TODO(@jroesch): this is missing some fields
-}
-
-external! {
-    // Parser functions
-    #[name("relay.parser.ParseModule")]
-    fn parse_module(file_name: TVMString, source: TVMString) -> IRModule;
-    #[name("relay.parser.ParseExpr")]
-    fn parse_expression(file_name: TVMString, source: TVMString) -> IRModule;
-    #[name("ir.IRModule")]
-    fn module_new(funcs: Map<GlobalVar, BaseFunc>, types: Map<GlobalTypeVar, TypeData>, attrs: Map<TVMString, ObjectRef>, global_infos: Map<TVMString, Array<ObjectRef>>) -> IRModule;
-    // Module methods
-    #[name("ir.Module_Add")]
-    fn module_add(module: IRModule, type_name: GlobalVar, expr: BaseFunc, update: bool) -> IRModule;
-    #[name("ir.Module_AddDef")]
-    fn module_add_def(module: IRModule, type_name: GlobalTypeVar, type_data: TypeData, update: bool) -> ();
-    #[name("ir.Module_GetGlobalVar")]
-    fn module_get_global_var(module: IRModule, name: TVMString) -> GlobalVar;
-    #[name("ir.Module_GetGlobalVars")]
-    fn module_get_global_vars(module: IRModule) -> Array<GlobalVar>;
-    #[name("ir.Module_Lookup")]
-    fn module_lookup(module: IRModule, var: GlobalVar) -> BaseFunc;
-    #[name("ir.Module_Lookup_str")]
-    fn module_lookup_str(module: IRModule, name: TVMString) -> BaseFunc;
-    #[name("ir.Module_GetGlobalTypeVars")]
-    fn module_get_global_type_vars(module: IRModule) -> Array<GlobalTypeVar>;
-    #[name("ir.Module_ContainGlobalVar")]
-    fn module_contains_global_var(module: IRModule, name: TVMString) -> bool;
-    #[name("ir.Module_ContainGlobalTypeVar")]
-    fn module_contains_global_type_var(module: IRModule, name: TVMString) -> bool;
-    #[name("ir.Module_LookupDef")]
-    fn module_lookup_def(module: IRModule, global: GlobalTypeVar) -> TypeData;
-    #[name("ir.Module_LookupDef_str")]
-    fn module_lookup_def_str(module: IRModule, global: TVMString) -> TypeData;
-    #[name("ir.Module_LookupTag")]
-    fn module_lookup_tag(module: IRModule, tag: i32) -> relay::Constructor;
-    #[name("ir.Module_FromExpr")]
-    fn module_from_expr(expr: relay::Expr, funcs: Map<GlobalVar, BaseFunc>, types: Map<GlobalTypeVar, TypeData>) -> IRModule;
-    #[name("ir.Module_Import")]
-    fn module_import(module: IRModule, path: TVMString);
-    #[name("ir.Module_ImportFromStd")]
-    fn module_import_from_std(module: IRModule, path: TVMString);
-}
-
-// Note: we don't expose update here as update is going to be removed.
-
-impl IRModule {
-    pub fn new<'a, F, T, A, G>(funcs: F, types: T, attrs: A, global_infos: G) -> Result<IRModule>
-    where
-        F: IntoIterator<Item = (&'a GlobalVar, &'a BaseFunc)>,
-        T: IntoIterator<Item = (&'a GlobalTypeVar, &'a TypeData)>,
-        A: IntoIterator<Item = (&'a TVMString, &'a ObjectRef)>,
-        G: IntoIterator<Item = (&'a TVMString, &'a Array<ObjectRef>)>,
-    {
-        module_new(
-            Map::from_iter(funcs),
-            Map::from_iter(types),
-            Map::from_iter(attrs),
-            Map::from_iter(global_infos),
-        )
-    }
-
-    pub fn empty() -> Result<IRModule> {
-        let funcs = HashMap::<GlobalVar, BaseFunc>::new();
-        let types = HashMap::<GlobalTypeVar, TypeData>::new();
-        let attrs = HashMap::<TVMString, ObjectRef>::new();
-        let global_infos = HashMap::<TVMString, Array<ObjectRef>>::new();
-        IRModule::new(
-            funcs.iter(),
-            types.iter(),
-            attrs.iter(),
-            global_infos.iter(),
-        )
-    }
-
-    pub fn parse<N, S>(file_name: N, source: S) -> Result<IRModule>
-    where
-        N: Into<TVMString>,
-        S: Into<TVMString>,
-    {
-        parse_module(file_name.into(), source.into())
-    }
-
-    pub fn parse_file<P: 'static + AsRef<Path>>(
-        file_path: P,
-    ) -> std::result::Result<IRModule, Error> {
-        let file_path = file_path.as_ref();
-        let file_path_as_str = file_path.to_str().unwrap().to_string();
-        let source = std::fs::read_to_string(file_path)?;
-        let module = IRModule::parse(file_path_as_str, source)?;
-        Ok(module)
-    }
-
-    pub fn add<F>(&mut self, var: GlobalVar, func: F) -> Result<IRModule>
-    // todo(@jroesch): can we do better here? why doesn't BaseFunc::Object work?
-    where
-        F: IsObjectRef,
-        F::Object: AsRef<<BaseFunc as IsObjectRef>::Object>,
-    {
-        module_add(self.clone(), var, func.upcast(), true)
-    }
-
-    pub fn add_def(
-        &mut self,
-        type_name: GlobalTypeVar,
-        type_data: TypeData,
-        update: bool,
-    ) -> Result<()> {
-        module_add_def(self.clone(), type_name, type_data, update)
-    }
-
-    pub fn get_global_var<S>(&self, name: S) -> Result<GlobalVar>
-    where
-        S: Into<TVMString>,
-    {
-        module_get_global_var(self.clone(), name.into())
-    }
-
-    pub fn get_global_vars(&self) -> Result<Array<GlobalVar>> {
-        module_get_global_vars(self.clone())
-    }
-
-    pub fn lookup(&self, var: GlobalVar) -> Result<BaseFunc> {
-        module_lookup(self.clone(), var)
-    }
-
-    pub fn lookup_str<S>(&self, name: S) -> Result<BaseFunc>
-    where
-        S: Into<TVMString>,
-    {
-        module_lookup_str(self.clone(), name.into())
-    }
-
-    pub fn get_global_type_vars(&self) -> Result<Array<GlobalTypeVar>> {
-        module_get_global_type_vars(self.clone())
-    }
-
-    pub fn contains_global_var<S: Into<TVMString>>(&self, name: S) -> Result<bool> {
-        module_contains_global_var(self.clone(), name.into())
-    }
-
-    pub fn contains_global_type_var<S: Into<TVMString>>(&self, name: S) -> Result<bool> {
-        module_contains_global_type_var(self.clone(), name.into())
-    }
-
-    pub fn lookup_def(&self, global: GlobalTypeVar) -> Result<TypeData> {
-        module_lookup_def(self.clone(), global)
-    }
-
-    pub fn lookup_def_str<S>(&self, global: S) -> Result<TypeData>
-    where
-        S: Into<TVMString>,
-    {
-        module_lookup_def_str(self.clone(), global.into())
-    }
-
-    pub fn lookup_tag(&self, tag: i32) -> Result<relay::Constructor> {
-        module_lookup_tag(self.clone(), tag)
-    }
-
-    pub fn from_expr<E>(expr: E) -> Result<IRModule>
-    where
-        E: IsObjectRef,
-        E::Object: AsRef<<relay::Expr as IsObjectRef>::Object>,
-    {
-        Self::from_expr_with_items(expr, HashMap::new(), HashMap::new())
-    }
-
-    pub fn from_expr_with_items<'a, E, F, T>(expr: E, funcs: F, types: T) -> Result<IRModule>
-    where
-        F: IntoIterator<Item = (&'a GlobalVar, &'a BaseFunc)>,
-        T: IntoIterator<Item = (&'a GlobalTypeVar, &'a TypeData)>,
-        E: IsObjectRef,
-        E::Object: AsRef<<relay::Expr as IsObjectRef>::Object>,
-    {
-        module_from_expr(expr.upcast(), Map::from_iter(funcs), Map::from_iter(types))
-    }
-
-    pub fn import<S: Into<TVMString>>(&mut self, path: S) -> Result<()> {
-        module_import(self.clone(), path.into())
-    }
-
-    pub fn import_from_std<S: Into<TVMString>>(&mut self, path: S) -> Result<()> {
-        module_import_from_std(self.clone(), path.into())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::relay::*;
-    use super::*;
-    use crate::ir::span::Span;
-    use crate::ir::ty::{GlobalTypeVar, TypeData, TypeKind};
-    use tvm_rt::IsObjectRef;
-
-    fn add_dummy_functions(names: Vec<&str>) -> Result<IRModule> {
-        let mut module = IRModule::empty()?;
-        let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32());
-        let params = vec![x.clone()];
-        let func = relay::Function::simple(params, x);
-
-        for name in names {
-            let gv = GlobalVar::new(name.into(), Span::null());
-            module = module.add(gv, func.clone())?;
-        }
-
-        Ok(module)
-    }
-
-    fn add_dummy_types(names: Vec<&str>) -> Result<IRModule> {
-        let mut module = IRModule::empty()?;
-
-        for name in names {
-            let name: String = name.into();
-            let name = GlobalTypeVar::new(name, TypeKind::Type, Span::null());
-            let type_data = TypeData::new(name.clone(), vec![], vec![], Span::null());
-            module.add_def(name, type_data, true)?;
-        }
-
-        Ok(module)
-    }
-
-    #[test]
-    fn test_module_add() -> anyhow::Result<()> {
-        let mut module = IRModule::empty()?;
-        let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32());
-        let params = vec![x.clone()];
-        let func = relay::Function::simple(params, x);
-        let module = module.add(GlobalVar::new("foo".into(), Span::null()), func)?;
-        let lfunc = module.lookup_str("foo")?;
-        let lfunc = lfunc.downcast::<relay::Function>()?;
-        assert_eq!(lfunc.params.len(), 1);
-        Ok(())
-    }
-
-    #[test]
-    fn test_module_add_def() -> Result<()> {
-        let mut module = IRModule::empty()?;
-        let name = GlobalTypeVar::new("my_type", TypeKind::Type, Span::null());
-        let type_data = TypeData::new(name.clone(), vec![], vec![], Span::null());
-        module.add_def(name.clone(), type_data, true)?;
-        let _by_gtv = module.lookup_def(name)?;
-        let _by_gv = module.lookup_def_str("my_type")?;
-        Ok(())
-    }
-
-    #[test]
-    fn test_get_global_var() -> Result<()> {
-        let mut module = IRModule::empty()?;
-        let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32());
-        let params = vec![x.clone()];
-        let func = relay::Function::simple(params, x);
-        let gv_foo = GlobalVar::new("foo".into(), Span::null());
-        let module = module.add(gv_foo.clone(), func)?;
-        let gv = module.get_global_var("foo")?;
-        assert_eq!(gv_foo, gv);
-        Ok(())
-    }
-
-    #[test]
-    fn test_get_global_vars() -> Result<()> {
-        let names = vec!["foo", "bar", "baz"];
-        let module = add_dummy_functions(names.clone())?;
-        let gvars: Vec<String> = module
-            .get_global_vars()?
-            .into_iter()
-            .map(|gv| gv.name_hint.as_str().unwrap().to_string())
-            .collect();
-
-        for name in names {
-            assert!(gvars.contains(&name.to_string()));
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_get_global_type_vars() -> Result<()> {
-        let names = vec!["foo", "bar", "baz"];
-        let module = add_dummy_types(names.clone())?;
-        let gvars: Vec<String> = module
-            .get_global_type_vars()?
-            .into_iter()
-            .map(|gv| gv.name_hint.as_str().unwrap().to_string())
-            .collect();
-
-        for name in names {
-            assert!(gvars.contains(&name.to_string()));
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_contains_global_var() -> Result<()> {
-        let module = add_dummy_functions(vec!["foo"])?;
-        assert!(module.contains_global_var("foo")?);
-        Ok(())
-    }
-
-    #[test]
-    fn test_contains_global_type_var() -> Result<()> {
-        let module = add_dummy_types(vec!["foo"])?;
-        assert!(module.contains_global_type_var("foo")?);
-        Ok(())
-    }
-
-    // TODO(@jroesch): not really sure about this API at all.
-    // pub fn lookup_tag(&self, tag: i32) -> Result<relay::Constructor> {
-    //     module_lookup_tag(self.clone(), tag)
-    // }
-
-    #[test]
-    fn test_from_expr() -> Result<()> {
-        let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32());
-        let params = vec![x.clone()];
-        let func = relay::Function::simple(params, x);
-        let module = IRModule::from_expr(func.clone())?;
-        let main_fn = module.lookup_str("main")?;
-        let main_fn = main_fn.downcast::<relay::Function>()?;
-        assert_eq!(main_fn, func);
-        Ok(())
-    }
-
-    #[test]
-    fn test_import() -> Result<()> {
-        let mut std_path: String = env!("CARGO_MANIFEST_DIR").into();
-        std_path += "/../../python/tvm/relay/std/prelude.rly";
-
-        let mut mod1 = IRModule::empty()?;
-        mod1.import(std_path.clone())?;
-        mod1.lookup_str("map")?;
-
-        // TODO(@jroesch): this requires another patch of mine to enable.
-
-        // if cfg!(feature = "python") {
-        //     crate::python::load().unwrap();
-        //     let mut mod2 = IRModule::empty()?;
-        //     mod2.import_from_std("prelude.rly")?;
-        //     mod2.lookup_str("map")?;
-        // }
-
-        Ok(())
-    }
-}
diff --git a/rust/tvm/src/ir/op.rs b/rust/tvm/src/ir/op.rs
deleted file mode 100644
index d222ead0391b..000000000000
--- a/rust/tvm/src/ir/op.rs
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::ir::relay::ExprNode;
-use crate::runtime::array::Array;
-use crate::runtime::ObjectRef;
-use crate::runtime::String as TString;
-use tvm_macros::Object;
-
-type FuncType = ObjectRef;
-type AttrFieldInfo = ObjectRef;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Op"]
-#[type_key = "Op"]
-pub struct OpNode {
-    pub base: ExprNode,
-    pub name: TString,
-    pub op_type: FuncType,
-    pub description: TString,
-    pub arguments: Array<AttrFieldInfo>,
-    pub attrs_type_key: TString,
-    pub attrs_type_index: u32,
-    pub num_inputs: i32,
-    pub support_level: i32,
-}
diff --git a/rust/tvm/src/ir/relay/attrs/mod.rs b/rust/tvm/src/ir/relay/attrs/mod.rs
deleted file mode 100644
index 333ed26752fc..000000000000
--- a/rust/tvm/src/ir/relay/attrs/mod.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-pub mod nn;
-pub mod reduce;
-pub mod transform;
diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs
deleted file mode 100644
index c4807c72e9a7..000000000000
--- a/rust/tvm/src/ir/relay/attrs/nn.rs
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::ir::attrs::BaseAttrsNode;
-use crate::ir::PrimExpr;
-use crate::runtime::array::Array;
-use crate::runtime::DataType;
-use crate::runtime::String as TString;
-use tvm_macros::Object;
-
-type IndexExpr = PrimExpr;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PadAttrs"]
-#[type_key = "relay.attrs.PadAttrs"]
-pub struct PadAttrsNode {
-    pub base: BaseAttrsNode,
-    pub pad_width: Array<Array<IndexExpr>>,
-    pub pad_mode: TString,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Conv1DAttrs"]
-#[type_key = "relay.attrs.Conv1DAttrs"]
-pub struct Conv1DAttrsNode {
-    pub base: BaseAttrsNode,
-    pub strides: Array<IndexExpr>,
-    pub padding: Array<IndexExpr>,
-    pub dilation: Array<IndexExpr>,
-    // TODO(@gussmith23) groups is "int", what should it be here?
-    pub groups: i32,
-    pub channels: IndexExpr,
-    pub kernel_size: Array<IndexExpr>,
-    pub data_layout: TString,
-    pub kernel_layout: TString,
-    pub out_layout: TString,
-    pub out_dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Conv2DAttrs"]
-#[type_key = "relay.attrs.Conv2DAttrs"]
-pub struct Conv2DAttrsNode {
-    pub base: BaseAttrsNode,
-    pub strides: Array<IndexExpr>,
-    pub padding: Array<IndexExpr>,
-    pub dilation: Array<IndexExpr>,
-    // TODO(@gussmith23) groups is "int", what should it be here?
-    pub groups: i32,
-    pub channels: IndexExpr,
-    pub kernel_size: Array<IndexExpr>,
-    pub data_layout: TString,
-    pub kernel_layout: TString,
-    pub out_layout: TString,
-    pub auto_scheduler_rewritten_layout: TString,
-    pub out_dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Conv3DAttrs"]
-#[type_key = "relay.attrs.Conv3DAttrs"]
-pub struct Conv3DAttrsNode {
-    pub base: BaseAttrsNode,
-    pub strides: Array<IndexExpr>,
-    pub padding: Array<IndexExpr>,
-    pub dilation: Array<IndexExpr>,
-    pub groups: i32,
-    pub channels: IndexExpr,
-    pub kernel_size: Array<IndexExpr>,
-    pub data_layout: TString,
-    pub kernel_layout: TString,
-    pub out_layout: TString,
-    pub auto_scheduler_rewritten_layout: TString,
-    pub out_dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Conv3DTransposeAttrs"]
-#[type_key = "relay.attrs.Conv3DTransposeAttrs"]
-pub struct Conv3DTransposeAttrsNode {
-    pub base: BaseAttrsNode,
-    pub channels: IndexExpr,
-    pub kernel_size: Array<IndexExpr>,
-    pub strides: Array<IndexExpr>,
-    pub padding: Array<IndexExpr>,
-    pub output_padding: Array<IndexExpr>,
-    pub dilation: Array<IndexExpr>,
-    pub groups: i32,
-    pub data_layout: TString,
-    pub kernel_layout: TString,
-    pub out_layout: TString,
-    pub out_dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "BiasAddAttrs"]
-#[type_key = "relay.attrs.BiasAddAttrs"]
-pub struct BiasAddAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: i32,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "MatmulAttrs"]
-#[type_key = "relay.attrs.MatmulAttrs"]
-pub struct MatmulAttrsNode {
-    pub base: BaseAttrsNode,
-    pub units: IndexExpr,
-    pub out_dtype: DataType,
-    pub transpose_a: bool,
-    pub transpose_b: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "DenseAttrs"]
-#[type_key = "relay.attrs.DenseAttrs"]
-pub struct DenseAttrsNode {
-    pub base: BaseAttrsNode,
-    pub units: IndexExpr,
-    pub auto_scheduler_rewritten_layout: TString,
-    pub out_dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "GlobalPool2DAttrs"]
-#[type_key = "relay.attrs.GlobalPool2DAttrs"]
-pub struct GlobalPool2DAttrsNode {
-    pub base: BaseAttrsNode,
-    pub layout: TString,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "MaxPool2DAttrs"]
-#[type_key = "relay.attrs.MaxPool2DAttrs"]
-pub struct MaxPool2DAttrsNode {
-    pub base: BaseAttrsNode,
-    pub pool_size: Array<IndexExpr>,
-    pub strides: Array<IndexExpr>,
-    pub padding: Array<IndexExpr>,
-    pub dilation: Array<IndexExpr>,
-    pub layout: TString,
-    pub ceil_mode: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "SoftmaxAttrs"]
-#[type_key = "relay.attrs.SoftmaxAttrs"]
-pub struct SoftmaxAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: i32,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "BatchNormAttrs"]
-#[type_key = "relay.attrs.BatchNormAttrs"]
-pub struct BatchNormAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: i32,
-    pub epsilon: f64,
-    pub center: bool,
-    pub scale: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "LeakyReluAttrs"]
-#[type_key = "relay.attrs.LeakyReluAttrs"]
-pub struct LeakyReluAttrsNode {
-    pub base: BaseAttrsNode,
-    pub alpha: f64,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "AvgPool2DAttrs"]
-#[type_key = "relay.attrs.AvgPool2DAttrs"]
-pub struct AvgPool2DAttrsNode {
-    pub base: BaseAttrsNode,
-    pub pool_size: Array<IndexExpr>,
-    pub strides: Array<IndexExpr>,
-    pub padding: Array<IndexExpr>,
-    pub dilation: Array<IndexExpr>,
-    pub layout: TString,
-    pub ceil_mode: bool,
-    pub count_include_pad: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "UpSamplingAttrs"]
-#[type_key = "relay.attrs.UpSamplingAttrs"]
-pub struct UpSamplingAttrsNode {
-    pub base: BaseAttrsNode,
-    pub scale_h: f64,
-    pub scale_w: f64,
-    pub layout: TString,
-    pub method: TString,
-    pub align_corners: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "DropoutAttrs"]
-#[type_key = "relay.attrs.DropoutAttrs"]
-pub struct DropoutAttrsNode {
-    pub base: BaseAttrsNode,
-    pub rate: f64,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "BatchMatmulAttrs"]
-#[type_key = "relay.attrs.BatchMatmulAttrs"]
-pub struct BatchMatmulAttrsNode {
-    pub base: BaseAttrsNode,
-    pub auto_scheduler_rewritten_layout: TString,
-    pub out_dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "LayerNormAttrs"]
-#[type_key = "relay.attrs.LayerNormAttrs"]
-pub struct LayerNormAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: i32,
-    pub epsilon: f64,
-    pub center: bool,
-    pub scale: bool,
-}
diff --git a/rust/tvm/src/ir/relay/attrs/reduce.rs b/rust/tvm/src/ir/relay/attrs/reduce.rs
deleted file mode 100644
index aed84fdf2aad..000000000000
--- a/rust/tvm/src/ir/relay/attrs/reduce.rs
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::ir::attrs::BaseAttrsNode;
-use crate::ir::PrimExpr;
-use crate::runtime::array::Array;
-use tvm_macros::Object;
-
-type IndexExpr = PrimExpr;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "ReduceAttrs"]
-#[type_key = "relay.attrs.ReduceAttrs"]
-pub struct ReduceAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: Array<IndexExpr>,
-    pub keepdims: bool,
-    pub exclude: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "VarianceAttrs"]
-#[type_key = "relay.attrs.ReduceAttrs"]
-pub struct VarianceAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: Array<IndexExpr>,
-    pub keepdims: bool,
-    pub exclude: bool,
-    pub unbiased: bool,
-}
diff --git a/rust/tvm/src/ir/relay/attrs/transform.rs b/rust/tvm/src/ir/relay/attrs/transform.rs
deleted file mode 100644
index d86c46a6f6bb..000000000000
--- a/rust/tvm/src/ir/relay/attrs/transform.rs
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::ir::attrs::BaseAttrsNode;
-use crate::ir::relay::TString;
-use crate::ir::tir::IntImm;
-use crate::ir::PrimExpr;
-use crate::runtime::array::Array;
-use crate::runtime::ObjectRef;
-use tvm_macros::Object;
-use tvm_rt::DataType;
-
-type IndexExpr = PrimExpr;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "ClipAttrs"]
-#[type_key = "relay.attrs.ClipAttrs"]
-pub struct ClipAttrsNode {
-    pub base: BaseAttrsNode,
-    pub a_min: f64,
-    pub a_max: f64,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "CastAttrs"]
-#[type_key = "relay.attrs.CastAttrs"]
-pub struct CastAttrsNode {
-    pub base: BaseAttrsNode,
-    pub dtype: DataType,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "ExpandDimsAttrs"]
-#[type_key = "relay.attrs.ExpandDimsAttrs"]
-pub struct ExpandDimsAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: i32,
-    pub num_newaxis: i32,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "ConcatenateAttrs"]
-#[type_key = "relay.attrs.ConcatenateAttrs"]
-pub struct ConcatenateAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: i32,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "ReshapeAttrs"]
-#[type_key = "relay.attrs.ReshapeAttrs"]
-pub struct ReshapeAttrsNode {
-    pub base: BaseAttrsNode,
-    pub newshape: Array<IndexExpr>,
-    pub reverse: bool,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "SplitAttrs"]
-#[type_key = "relay.attrs.SplitAttrs"]
-pub struct SplitAttrsNode {
-    pub base: BaseAttrsNode,
-    pub indices_or_sections: ObjectRef,
-    pub axis: i32,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TransposeAttrs"]
-#[type_key = "relay.attrs.TransposeAttrs"]
-pub struct TransposeAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axes: Array<IndexExpr>,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "SqueezeAttrs"]
-#[type_key = "relay.attrs.SqueezeAttrs"]
-pub struct SqueezeAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: Array<IntImm>,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TakeAttrs"]
-#[type_key = "relay.attrs.TakeAttrs"]
-pub struct TakeAttrsNode {
-    pub base: BaseAttrsNode,
-    pub batch_dims: IntImm,
-    pub axis: IntImm,
-    pub mode: TString,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "StackAttrs"]
-#[type_key = "relay.attrs.StackAttrs"]
-pub struct StackAttrsNode {
-    pub base: BaseAttrsNode,
-    pub axis: IntImm,
-}
-
-// TODO(@gussmith23) How to support Optional type? This "just works" when values
-// are provided for begin/end/strides, but I'm not sure what happens if None is
-// passed from the C++ side.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "StridedSliceAttrs"]
-#[type_key = "relay.attrs.StridedSliceAttrs"]
-pub struct StridedSliceAttrsNode {
-    pub base: BaseAttrsNode,
-    pub begin: Array<IntImm>,
-    pub end: Array<IntImm>,
-    pub strides: Array<IntImm>,
-    pub slice_mode: TString,
-}
diff --git a/rust/tvm/src/ir/relay/mod.rs b/rust/tvm/src/ir/relay/mod.rs
deleted file mode 100644
index 08ce082c4586..000000000000
--- a/rust/tvm/src/ir/relay/mod.rs
+++ /dev/null
@@ -1,589 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-use crate::runtime::array::Array;
-use crate::runtime::{self, object::*, IsObjectRef, String as TString};
-
-use super::attrs::Attrs;
-use super::expr::BaseExprNode;
-use super::function::BaseFuncNode;
-use super::span::Span;
-use super::ty::Type;
-
-use tvm_macros::Object;
-use tvm_rt::NDArray;
-
-pub use super::expr::{GlobalVar, GlobalVarNode};
-pub use crate::runtime::DataType;
-
-pub mod attrs;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Expr"]
-#[type_key = "RelayExpr"]
-pub struct ExprNode {
-    pub base: BaseExprNode,
-    pub checked_type: Type,
-    pub struct_info: ObjectRef,
-    pub virtual_device: ObjectRef,
-}
-
-impl ExprNode {
-    pub fn base<T: IsObject>(span: Span) -> ExprNode {
-        ExprNode {
-            base: BaseExprNode::base::<T>(span.clone()),
-            checked_type: Type::null(),
-            struct_info: ObjectRef::null(),
-            virtual_device: ObjectRef::null(),
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Id"]
-#[type_key = "relay.Id"]
-pub struct IdNode {
-    pub base: Object,
-    pub name_hint: TString,
-}
-
-impl Id {
-    fn new(name_hint: TString) -> Id {
-        let node = IdNode {
-            base: Object::base::<IdNode>(),
-            name_hint: name_hint,
-        };
-        Id(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Constant"]
-#[type_key = "relay.Constant"]
-pub struct ConstantNode {
-    pub base: ExprNode,
-    pub data: NDArray,
-}
-
-impl Constant {
-    pub fn new(data: NDArray, span: Span) -> Constant {
-        let node = ConstantNode {
-            base: ExprNode::base::<ConstantNode>(span),
-            data: data,
-        };
-        Constant(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Tuple"]
-#[type_key = "relay.Tuple"]
-pub struct TupleNode {
-    pub base: ExprNode,
-    pub fields: Array<Expr>,
-}
-
-impl Tuple {
-    pub fn new(fields: Array<Expr>, span: Span) -> Tuple {
-        let node = TupleNode {
-            base: ExprNode::base::<TupleNode>(span),
-            fields,
-        };
-        Tuple(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Var"]
-#[type_key = "relay.Var"]
-pub struct VarNode {
-    pub base: ExprNode,
-    pub vid: Id,
-    pub type_annotation: Type,
-}
-
-impl Var {
-    pub fn new(name_hint: String, type_annotation: Type, span: Span) -> Var {
-        let node = VarNode {
-            base: ExprNode::base::<VarNode>(span),
-            vid: Id::new(name_hint.into()),
-            type_annotation: type_annotation,
-        };
-        Var(Some(ObjectPtr::new(node)))
-    }
-
-    pub fn name_hint(&self) -> &TString {
-        &self.vid.0.as_ref().unwrap().name_hint
-    }
-
-    pub fn static_tensor(name_hint: String, sh: Vec<i32>, dtype: DataType) -> Var {
-        let sh = Array::from_vec(sh.into_iter().map(Into::into).collect()).unwrap();
-        Self::new(
-            name_hint,
-            super::ty::TensorType::new(sh, dtype, Span::null()).upcast(),
-            Span::null(),
-        )
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Call"]
-#[type_key = "relay.Call"]
-pub struct CallNode {
-    pub base: ExprNode,
-    deleter: ObjectRef,
-    pub op: Expr,
-    pub args: Array<Expr>,
-    pub attrs: Attrs,
-    pub type_args: Array<Type>,
-}
-
-impl Call {
-    pub fn new(
-        op: Expr,
-        args: Array<Expr>,
-        attrs: Attrs,
-        type_args: Array<Type>,
-        span: Span,
-    ) -> Call {
-        let node = CallNode {
-            base: ExprNode::base::<CallNode>(span),
-            deleter: todo!("Don't know how to construct this"),
-            op: op,
-            args: args,
-            attrs: attrs,
-            type_args: type_args,
-        };
-        Call(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Let"]
-#[type_key = "relay.Let"]
-pub struct LetNode {
-    pub base: ExprNode,
-    pub var: Var,
-    pub value: Expr,
-    pub body: Expr,
-}
-
-impl Let {
-    pub fn new(var: Var, value: Expr, body: Expr, span: Span) -> Let {
-        let node = LetNode {
-            base: ExprNode::base::<LetNode>(span),
-            var,
-            value,
-            body,
-        };
-        Let(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "If"]
-#[type_key = "relay.If"]
-pub struct IfNode {
-    pub base: ExprNode,
-    pub cond: Expr,
-    pub true_branch: Expr,
-    pub false_branch: Expr,
-}
-
-impl If {
-    pub fn new(cond: Expr, true_branch: Expr, false_branch: Expr, span: Span) -> If {
-        let node = IfNode {
-            base: ExprNode::base::<IfNode>(span),
-            cond,
-            true_branch,
-            false_branch,
-        };
-        If(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TupleGetItem"]
-#[type_key = "relay.TupleGetItem"]
-pub struct TupleGetItemNode {
-    pub base: ExprNode,
-    pub tuple: Expr,
-    pub index: i32,
-}
-
-impl TupleGetItem {
-    pub fn new(tuple: Expr, index: i32, span: Span) -> TupleGetItem {
-        let node = TupleGetItemNode {
-            base: ExprNode::base::<TupleGetItemNode>(span),
-            tuple,
-            index,
-        };
-        TupleGetItem(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "RefCreate"]
-#[type_key = "relay.RefCreate"]
-pub struct RefCreateNode {
-    pub base: ExprNode,
-    pub value: Expr,
-}
-
-impl RefCreate {
-    pub fn new(value: Expr, span: Span) -> RefCreate {
-        let node = RefCreateNode {
-            base: ExprNode::base::<RefCreateNode>(span),
-            value,
-        };
-        RefCreate(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "RefRead"]
-#[type_key = "relay.RefRead"]
-pub struct RefReadNode {
-    pub base: ExprNode,
-    pub ref_value: Expr,
-}
-
-impl RefRead {
-    pub fn new(ref_value: Expr, span: Span) -> RefRead {
-        let node = RefReadNode {
-            base: ExprNode::base::<RefReadNode>(span),
-            ref_value,
-        };
-        RefRead(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "RefWrite"]
-#[type_key = "relay.RefWrite"]
-pub struct RefWriteNode {
-    pub base: ExprNode,
-    pub ref_value: Expr,
-    pub value: Expr,
-}
-
-impl RefWrite {
-    pub fn new(ref_value: Expr, value: Expr, span: Span) -> RefWrite {
-        let node = RefWriteNode {
-            base: ExprNode::base::<RefWriteNode>(span),
-            ref_value,
-            value,
-        };
-        RefWrite(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Constructor"]
-#[type_key = "relay.Constructor"]
-pub struct ConstructorNode {
-    pub base: ExprNode,
-    pub name_hint: String,
-    pub inputs: Array<Type>,
-    pub tag: i32,
-}
-
-impl Constructor {
-    pub fn new(name_hint: String, inputs: Array<Type>, tag: i32, span: Span) -> Constructor {
-        let node = ConstructorNode {
-            base: ExprNode::base::<ConstructorNode>(span),
-            name_hint,
-            inputs,
-            tag,
-        };
-        Constructor(Some(ObjectPtr::new(node)))
-    }
-}
-
-// TODO(@jroesch): define the type data
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Pattern"]
-#[type_key = "relay.Pattern"]
-pub struct PatternNode {
-    pub base: Object,
-    pub span: Span,
-}
-
-impl PatternNode {
-    pub fn base<T: IsObject>(span: Span) -> PatternNode {
-        PatternNode {
-            base: Object::base::<T>(),
-            span: span,
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PatternWildcard"]
-#[type_key = "relay.PatternWildcard"]
-pub struct PatternWildcardNode {
-    pub base: PatternNode,
-}
-
-impl PatternWildcard {
-    pub fn new(span: Span) -> PatternWildcard {
-        let node = PatternWildcardNode {
-            base: PatternNode::base::<PatternWildcardNode>(span),
-        };
-        PatternWildcard(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PatternVar"]
-#[type_key = "relay.PatternVar"]
-pub struct PatternVarNode {
-    pub base: PatternNode,
-    pub var: Var,
-}
-
-impl PatternVar {
-    pub fn new(var: Var, span: Span) -> PatternVar {
-        let node = PatternVarNode {
-            base: PatternNode::base::<PatternVarNode>(span),
-            var: var,
-        };
-        PatternVar(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PatternConstructor"]
-#[type_key = "relay.PatternConstructor"]
-pub struct PatternConstructorNode {
-    pub base: PatternNode,
-    pub constructor: Constructor,
-    pub patterns: Array<Pattern>,
-}
-
-impl PatternConstructor {
-    pub fn new(
-        constructor: Constructor,
-        patterns: Array<Pattern>,
-        span: Span,
-    ) -> PatternConstructor {
-        let node = PatternConstructorNode {
-            base: PatternNode::base::<PatternConstructorNode>(span),
-            constructor,
-            patterns,
-        };
-        PatternConstructor(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PatternTuple"]
-#[type_key = "relay.PatternTuple"]
-pub struct PatternTupleNode {
-    pub base: PatternNode,
-    pub patterns: Array<Pattern>,
-}
-
-impl PatternTuple {
-    pub fn new(patterns: Array<Pattern>, span: Span) -> PatternTuple {
-        let node = PatternTupleNode {
-            base: PatternNode::base::<PatternTupleNode>(span),
-            patterns,
-        };
-        PatternTuple(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Clause"]
-#[type_key = "relay.Clause"]
-pub struct ClauseNode {
-    pub base: Object,
-    pub lhs: Pattern,
-    pub rhs: Expr,
-}
-
-impl Clause {
-    pub fn new(lhs: Pattern, rhs: Expr, _span: Span) -> Clause {
-        let node = ClauseNode {
-            base: Object::base::<ClauseNode>(),
-            lhs,
-            rhs,
-        };
-        Clause(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Match"]
-#[type_key = "relay.Match"]
-pub struct MatchNode {
-    pub base: ExprNode,
-    pub data: Expr,
-    pub clauses: Array<Clause>,
-    pub complete: bool,
-}
-
-impl Match {
-    pub fn new(data: Expr, clauses: Array<Clause>, complete: bool, span: Span) -> Match {
-        let node = MatchNode {
-            base: ExprNode::base::<MatchNode>(span),
-            data,
-            clauses,
-            complete,
-        };
-        Match(Some(ObjectPtr::new(node)))
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Function"]
-#[type_key = "relay.Function"]
-pub struct FunctionNode {
-    pub base: BaseFuncNode,
-    pub params: Array<Var>,
-    pub body: Expr,
-    pub ret_type: Type,
-    pub type_params: Array<Type>,
-}
-
-impl Function {
-    pub fn new(
-        params: Array<Var>,
-        body: Expr,
-        ret_type: Type,
-        type_params: Array<Type>,
-    ) -> Function {
-        let node = FunctionNode {
-            base: BaseFuncNode::base::<FunctionNode>(),
-            params: params,
-            body: body,
-            ret_type: ret_type,
-            type_params: type_params,
-        };
-        Function(Some(ObjectPtr::new(node)))
-    }
-
-    pub fn simple<E>(params: Vec<Var>, body: E) -> Function
-    where
-        E: IsObjectRef,
-        E::Object: AsRef<<Expr as IsObjectRef>::Object>,
-    {
-        let params = Array::from_vec(params).unwrap();
-        Self::new(
-            params,
-            body.upcast(),
-            Type::null(),
-            Array::from_vec(vec![]).unwrap(),
-        )
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::ir::as_text;
-    use crate::runtime::String as TString;
-    use anyhow::Result;
-
-    #[test]
-    fn test_id() -> Result<()> {
-        let string = TString::from("foo");
-        let id = Id::new(string);
-        let text = as_text(id.clone());
-        assert!(text.contains("relay.Id"));
-        Ok(())
-    }
-
-    #[test]
-    fn test_global() -> Result<()> {
-        let gv = GlobalVar::new("main".to_string(), Span::null());
-        let text = as_text(gv.clone());
-        assert!(text.contains("@main"));
-        Ok(())
-    }
-
-    #[test]
-    fn test_var() -> Result<()> {
-        let var = Var::new("local".to_string(), Type::null(), Span::null());
-        let text = as_text(var.clone());
-        assert!(text.contains("%local"));
-        Ok(())
-    }
-
-    #[test]
-    fn test_parse_constant() -> Result<()> {
-        let module = crate::ir::module::IRModule::parse(
-            "",
-            r#"
-#[version = "0.0.5"]
-def @main() -> float32 {
-  0.01639530062675476f
-}
-"#,
-        )
-        .unwrap();
-        let main = module
-            .lookup(module.get_global_var("main").unwrap())
-            .unwrap();
-        let func = main.downcast::<crate::ir::relay::Function>().unwrap();
-        let constant = func
-            .body
-            .clone()
-            .downcast::<crate::ir::relay::Constant>()
-            .unwrap();
-        let tuple_type = constant
-            .clone()
-            .upcast::<Expr>()
-            .checked_type
-            .clone()
-            .downcast::<crate::ir::ty::TensorType>()
-            .unwrap();
-        // Test type
-        assert_eq!(tuple_type.shape.len(), 0,);
-        assert_eq!(tuple_type.dtype, "float32".parse().unwrap(),);
-        // Check that actual data matches up with type
-        assert_eq!(constant.data.dtype(), "float32".parse().unwrap(),);
-        assert_eq!(constant.data.len(), 1);
-        assert_eq!(constant.data.size(), 4);
-        assert_eq!(constant.data.shape(), &[]);
-        Ok(())
-    }
-}
diff --git a/rust/tvm/src/ir/source_map.rs b/rust/tvm/src/ir/source_map.rs
deleted file mode 100644
index 7376f4b74022..000000000000
--- a/rust/tvm/src/ir/source_map.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either exprss or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::runtime::map::Map;
-use crate::runtime::object::Object;
-use crate::runtime::string::String as TString;
-
-use super::span::SourceName;
-
-use tvm_macros::Object;
-
-/// A program source in any language.
-///
-/// Could represent the source from an ML framework or a source of an IRModule.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[type_key = "Source"]
-#[ref_name = "Source"]
-pub struct SourceNode {
-    pub base: Object,
-    /// The source name.
-    pub source_name: SourceName,
-
-    /// The raw source.
-    pub source: TString,
-    // TODO(@jroesch): Non-ABI compat field
-    // A mapping of line breaks into the raw source.
-    // std::vector<std::pair<int, int>> line_map;
-}
-
-/// A mapping from a unique source name to source fragments.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[type_key = "SourceMap"]
-#[ref_name = "SourceMap"]
-pub struct SourceMapNode {
-    /// The base object.
-    pub base: Object,
-    /// The source mapping.
-    pub source_map: Map<SourceName, Source>,
-}
diff --git a/rust/tvm/src/ir/span.rs b/rust/tvm/src/ir/span.rs
deleted file mode 100644
index be74745b60ca..000000000000
--- a/rust/tvm/src/ir/span.rs
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-
-* specific language governing permissions and limitations
-* under the License.
-*/
-
-use crate::runtime::{Object, ObjectPtr, String as TString};
-use tvm_macros::Object;
-
-/// A source file name, contained in a Span.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[type_key = "SourceName"]
-#[ref_name = "SourceName"]
-pub struct SourceNameNode {
-    pub base: Object,
-    pub name: TString,
-}
-
-/// Span information for diagnostic purposes.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[type_key = "Span"]
-#[ref_name = "Span"]
-pub struct SpanNode {
-    pub base: Object,
-    /// The source name.
-    pub source_name: SourceName,
-    /// The line number.
-    pub line: i32,
-    /// The column offset.
-    pub column: i32,
-    /// The end line number.
-    pub end_line: i32,
-    /// The end column number.
-    pub end_column: i32,
-}
-
-impl Span {
-    pub fn new(
-        source_name: SourceName,
-        line: i32,
-        end_line: i32,
-        column: i32,
-        end_column: i32,
-    ) -> Span {
-        let span_node = SpanNode {
-            base: Object::base::<SpanNode>(),
-            source_name,
-            line,
-            end_line,
-            column,
-            end_column,
-        };
-        Span(Some(ObjectPtr::new(span_node)))
-    }
-}
diff --git a/rust/tvm/src/ir/tir.rs b/rust/tvm/src/ir/tir.rs
deleted file mode 100644
index dcbec520d3b6..000000000000
--- a/rust/tvm/src/ir/tir.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use super::{PrimExpr, PrimExprNode};
-
-use crate::ir::span::Span;
-use crate::runtime::{IsObjectRef, String as TVMString};
-use crate::DataType;
-
-use tvm_macros::Object;
-
-macro_rules! define_node {
-    ($name:ident, $ref:expr, $typekey:expr; $node:ident { $($id:ident : $t:ty),*}) => {
-        #[repr(C)]
-        #[derive(Object, Debug)]
-        #[ref_name = $ref]
-        #[type_key = $typekey]
-        pub struct $node {
-            base: PrimExprNode,
-            $(pub $id : $t),*
-        }
-
-        impl $name {
-            pub fn new(datatype: DataType, $($id : $t,)*) -> $name {
-                let base = PrimExprNode::base::<$node>(datatype, Span::null());
-                let node = $node { base, $($id),* };
-                node.into()
-            }
-        }
-    }
-}
-
-// TODO(@jroesch): should move up to expr.rs to mirror TVM.
-define_node!(IntImm, "IntImm", "IntImm";
-             IntImmNode { value: i64 });
-
-impl From<i32> for IntImm {
-    fn from(i: i32) -> IntImm {
-        IntImm::new(DataType::int(32, 1), i as i64)
-    }
-}
-
-impl From<i32> for PrimExpr {
-    fn from(i: i32) -> PrimExpr {
-        IntImm::from(i).upcast()
-    }
-}
-
-define_node!(Var, "Var", "tir.Var";
-             VarNode { name_hint: TVMString });
-
-define_node!(Add, "Add", "tir.Add"; AddNode { a: PrimExpr, b: PrimExpr });
-define_node!(Sub, "Sub", "tir.Sub"; SubNode { a: PrimExpr, b: PrimExpr });
-define_node!(Mul, "Mul", "tir.Mul"; MulNode { a: PrimExpr, b: PrimExpr });
-
-define_node!(Div, "Div", "tir.Div"; DivNode { a: PrimExpr, b: PrimExpr });
-define_node!(Mod, "Mod", "tir.Mod"; ModNode { a: PrimExpr, b: PrimExpr });
-define_node!(FloorDiv, "FloorDiv", "tir.FloorDiv"; FloorDivNode { a: PrimExpr, b: PrimExpr });
-define_node!(FloorMod, "FloorMod", "tir.FloorMod"; FloorModNode { a: PrimExpr, b: PrimExpr });
-
-define_node!(Min, "Min", "tir.Min"; MinNode { a: PrimExpr, b: PrimExpr });
-define_node!(Max, "Max", "tir.Max"; MaxNode { a: PrimExpr, b: PrimExpr });
-
-// the new datatype is in the base expr
-define_node!(Cast, "Cast", "tir.Cast"; CastNode { value: PrimExpr });
-
-// renamed base to start to avoid name clash
-define_node!(Ramp, "Ramp", "tir.Ramp"; RampNode { start: PrimExpr, stride: PrimExpr, lanes: i32 });
-
-define_node!(Select, "Select", "tir.Select";
-             SelectNode { condition: PrimExpr, true_value: PrimExpr, false_value: PrimExpr });
-
-define_node!(Eq, "Eq", "tir.EQ"; EqNode { a: PrimExpr, b: PrimExpr });
-define_node!(Ne, "Ne", "tir.NE"; NeNode { a: PrimExpr, b: PrimExpr });
-define_node!(Lt, "Lt", "tir.LT"; LtNode { a: PrimExpr, b: PrimExpr });
-define_node!(Le, "Le", "tir.LE"; LeNode { a: PrimExpr, b: PrimExpr });
-define_node!(Gt, "Gt", "tir.GT"; GtNode { a: PrimExpr, b: PrimExpr });
-define_node!(Ge, "Ge", "tir.GE"; GeNode { a: PrimExpr, b: PrimExpr });
-
-define_node!(And, "And", "tir.And"; AndNode { a: PrimExpr, b: PrimExpr });
-define_node!(Or,  "Or",  "tir.Or";  OrNode  { a: PrimExpr, b: PrimExpr });
-define_node!(Not, "Not", "tir.Not"; NotNode { value: PrimExpr });
-
-define_node!(Let, "Let", "tir.Let"; LetNode { var: Var, value: PrimExpr, body: PrimExpr });
diff --git a/rust/tvm/src/ir/ty.rs b/rust/tvm/src/ir/ty.rs
deleted file mode 100644
index 83fdbfeb66aa..000000000000
--- a/rust/tvm/src/ir/ty.rs
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use tvm_macros::Object;
-use tvm_rt::{array::Array, DataType};
-
-use crate::ir::relay::Constructor;
-use crate::ir::span::Span;
-use crate::ir::PrimExpr;
-use crate::runtime::{string::String as TString, IsObject, IsObjectRef, Object, ObjectPtr};
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "Type"]
-#[type_key = "Type"]
-pub struct TypeNode {
-    pub base: Object,
-    pub span: Span,
-}
-
-impl TypeNode {
-    fn base<T: IsObject>(span: Span) -> Self {
-        TypeNode {
-            base: Object::base::<T>(),
-            span,
-        }
-    }
-}
-
-/*
- * \brief Primitive data types used in the low-level IR.
- *
- * PrimType represents POD-values and handles that are
- * not automatically managed by the runtime.
- *
- * \sa PrimType
- */
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PrimType"]
-#[type_key = "PrimType"]
-pub struct PrimTypeNode {
-    pub base: TypeNode,
-    /// The corresponding dtype field.
-    pub dtype: DataType,
-}
-
-/*
- *!
- * \brief Low-level raw pointer type.
- *
- *  PointerType represents type hints in the TIR to be
- *  passed to the final code generator.
- *
- *  PointerType should not occur in the high-level analysis.
- *
- * \sa PointerType
- */
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PointerType"]
-#[type_key = "PointerType"]
-pub struct PointerTypeNode {
-    pub base: TypeNode,
-    /// The type of the element which the pointer points to.
-    pub element_type: Type,
-}
-
-/// Possible kinds of type variables.
-#[derive(PartialEq, Eq, Debug)]
-pub enum TypeKind {
-    Type = 0,
-    /// Template variable in shape expression.
-    ShapeVar = 1,
-    Constraint = 4,
-    AdtHandle = 5,
-    TypeData = 6,
-}
-
-/// Type parameter in functions.
-///
-/// A type variable can be viewed as template parameter in c++ template function.
-///
-/// For example, in the following pesudo code,
-/// the TypeVar of f is TypeVar("n", kind=kShapeVar).
-/// This function can take in a Tensor with shape=(3, 3) and
-/// returns a Tensor with shape=(9,)
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TypeVar"]
-#[type_key = "TypeVar"]
-pub struct TypeVarNode {
-    pub base: TypeNode,
-    pub name_hint: TString,
-    pub kind: TypeKind,
-}
-
-/// A global type variable that is used for defining new types or type aliases.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "GlobalTypeVar"]
-#[type_key = "GlobalTypeVar"]
-pub struct GlobalTypeVarNode {
-    pub base: TypeNode,
-    pub name_hint: TString,
-    pub kind: TypeKind,
-}
-
-impl GlobalTypeVar {
-    pub fn new<S>(name_hint: S, kind: TypeKind, span: Span) -> GlobalTypeVar
-    where
-        S: Into<TString>,
-    {
-        let node = GlobalTypeVarNode {
-            base: TypeNode::base::<GlobalTypeVarNode>(span),
-            name_hint: name_hint.into(),
-            kind: kind,
-        };
-        ObjectPtr::new(node).into()
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TupleType"]
-#[type_key = "TupleType"]
-pub struct TupleTypeNode {
-    pub base: TypeNode,
-    pub fields: Array<Type>,
-}
-
-impl TupleType {
-    // todo add coercion
-    pub fn new(fields: Vec<Type>, span: Span) -> Self {
-        let node = TupleTypeNode {
-            base: TypeNode::base::<TupleTypeNode>(span),
-            fields: Array::from_vec(fields).unwrap(),
-        };
-        ObjectPtr::new(node).into()
-    }
-
-    pub fn empty() -> TupleType {
-        TupleType::new(vec![], Span::null())
-    }
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TypeConstraint"]
-#[type_key = "TypeConstraint"]
-pub struct TypeConstraintNode {
-    pub base: TypeNode,
-}
-
-/// The representation of a polymorphic function type.
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "FuncType"]
-#[type_key = "FuncType"]
-pub struct FuncTypeNode {
-    pub base: TypeNode,
-    /// The type of arguments.
-    pub arg_types: Array<Type>,
-    /// The return type of the function.
-    pub ret_type: Type,
-    /// ...
-    pub type_params: Array<TypeVar>,
-    /// Type constraints that must hold when
-    /// calling this function.
-    pub type_constraints: Array<TypeConstraint>,
-}
-
-/*
- * \brief Intermediate values that is used to indicate incomplete type
- *         during type inference.
- *
- * If we view the type relations as "computational graph of types",
- * then IncompleteType represents intermediate values of the graph,
- * TypeVar represents the input to the graph.
- */
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "IncompleteType"]
-#[type_key = "IncompleteType"]
-pub struct IncompleteTypeNode {
-    pub base: TypeNode,
-    pub kind: TypeKind,
-}
-
-/*
- * \brief Reference Type High-level Relay IR.
- *
- * \sa RelayRefType.
- */
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "RefType"]
-#[type_key = "relay.RefType"]
-pub struct RelayRefTypeNode {
-    pub base: TypeNode,
-    pub value: Type,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "BaseTensorType"]
-#[type_key = "relay.BaseTensorType"]
-pub struct BaseTensorTypeNode {
-    pub base: TypeNode,
-}
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TensorType"]
-#[type_key = "relay.TensorType"]
-pub struct TensorTypeNode {
-    pub base: TypeNode,
-    pub shape: Array<PrimExpr>,
-    pub dtype: DataType,
-}
-
-impl TensorType {
-    pub fn new(shape: Array<PrimExpr>, dtype: DataType, span: Span) -> TensorType {
-        let node = TensorTypeNode {
-            base: TypeNode::base::<TensorTypeNode>(span),
-            shape,
-            dtype,
-        };
-        ObjectPtr::new(node).into()
-    }
-
-    pub fn static_sh(shape: Vec<i32>, dtype: DataType, span: Span) -> TensorType {
-        let sh = Array::from_vec(shape.into_iter().map(Into::into).collect()).unwrap();
-        Self::new(sh, dtype, span)
-    }
-}
-
-// TODO(@jroesch): implement these in future.
-//
-// using TypeCall = tvm::TypeCall;
-// using TypeCallNode = tvm::TypeCallNode;
-// using TypeRelation = tvm::TypeRelation;
-// using TypeRelationNode = tvm::TypeRelationNode;
-// using TypeRelationFn = tvm::TypeRelationFn;
-// using TypeReporter = tvm::TypeReporter;
-// using TypeReporterNode = tvm::TypeReporterNode;
-
-/* TypeData container node.
-\brief Stores all data for an Algebraic Data Type (ADT).
-
-In particular, it stores the handle (global type var) for an ADT
-and the constructors used to build it and is kept in the module. Note
-that type parameters are also indicated in the type data: this means that
-for any instance of an ADT, the type parameters must be indicated. That is,
-an ADT definition is treated as a type-level function, so an ADT handle
-must be wrapped in a TypeCall node that instantiates the type-level arguments.
-The kind checker enforces this. */
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "TypeData"]
-#[type_key = "relay.TypeData"]
-pub struct TypeDataNode {
-    /// The header is simply the name of the ADT.
-    /// We adopt nominal typing for ADT definitions;
-    /// that is, differently-named ADT definitions with same constructors
-    /// have different types.
-    pub base: TypeNode,
-    pub type_name: GlobalTypeVar,
-    /// The type variables (to allow for polymorphism).
-    pub type_vars: Array<TypeVar>,
-    /// The constructors.
-    pub constructors: Array<Constructor>,
-}
-
-impl TypeData {
-    pub fn new<TypeVars, Ctors>(
-        type_name: GlobalTypeVar,
-        type_vars: TypeVars,
-        constructors: Ctors,
-        span: Span,
-    ) -> TypeData
-    where
-        TypeVars: IntoIterator<Item = TypeVar>,
-        Ctors: IntoIterator<Item = Constructor>,
-    {
-        use std::iter::FromIterator;
-        let type_data = TypeDataNode {
-            base: TypeNode::base::<TypeDataNode>(span),
-            type_name,
-            type_vars: Array::from_iter(type_vars),
-            constructors: Array::from_iter(constructors),
-        };
-        TypeData(Some(ObjectPtr::new(type_data)))
-    }
-}
diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs
deleted file mode 100644
index 81abe338bd1b..000000000000
--- a/rust/tvm/src/lib.rs
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-//! [TVM](https://github.com/apache/tvm) is a compiler stack for deep learning systems.
-//!
-//! This crate provides an idiomatic Rust API for TVM runtime frontend.
-//!
-//! One particular use case is that given optimized deep learning model artifacts,
-//! (compiled with TVM) which include a shared library
-//! `lib.so`, `graph.json` and a byte-array `param.params`, one can load them
-//! in Rust idiomatically to create a TVM Graph Executor and
-//! run the model for some inputs and get the
-//! desired predictions *all in Rust*.
-//!
-//! Checkout the `examples` repository for more details.
-
-pub use crate::{errors::*, function::Function, module::Module, ndarray::NDArray};
-
-pub use tvm_rt::{DataType, Device, DeviceType};
-
-pub use tvm_rt::device;
-pub use tvm_rt::errors;
-pub use tvm_rt::function;
-pub use tvm_rt::module;
-pub use tvm_rt::ndarray;
-
-#[cfg(feature = "python")]
-pub mod compiler;
-pub mod ir;
-#[cfg(feature = "python")]
-pub mod python;
-pub mod runtime;
-pub mod transform;
-
-pub use runtime::version;
-
-#[macro_export]
-macro_rules! export {
-    ($($fn_name:expr),*) => {
-        pub fn tvm_export(ns: &str) -> Result<(), tvm::Error> {
-            $(
-                let name = String::from(ns) + ::std::stringify!($fn_name);
-                tvm::runtime::function::register_override($fn_name, name, true)?;
-            )*
-            Ok(())
-        }
-    }
-}
-
-#[macro_export]
-macro_rules! export_mod {
-    ($ns:expr, $($mod_name:expr),*) => {
-        pub fn tvm_mod_export() -> Result<(), tvm::Error> {
-            $(
-                $mod_name::tvm_export($ns)?;
-            )*
-            Ok(())
-        }
-    }
-}
diff --git a/rust/tvm/src/python.rs b/rust/tvm/src/python.rs
deleted file mode 100644
index c224fb4db372..000000000000
--- a/rust/tvm/src/python.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use pyo3::prelude::*;
-
-/// Load the Python interpreter into the address space.
-///
-/// This enables the ability for Rust code to call TVM
-/// functionality defined in Python.
-///
-/// For example registered TVM functions can now be
-/// obtained via `Function::get`.
-pub fn load() -> Result<String, ()> {
-    let gil = Python::acquire_gil();
-    let py = gil.python();
-    // let main_mod = initialize();
-    //let main_mod = main_mod.as_ref(py);
-    load_python_tvm_(py).map_err(|e| {
-        // We can't display Python exceptions via std::fmt::Display,
-        // so print the error here manually.
-        e.print_and_set_sys_last_vars(py);
-    })
-}
-
-pub fn import(mod_to_import: &str) -> PyResult<()> {
-    let gil = Python::acquire_gil();
-    let py = gil.python();
-    import_python(py, mod_to_import)?;
-    Ok(())
-}
-
-fn import_python<'p, 'b: 'p>(py: Python<'p>, to_import: &'b str) -> PyResult<&'p PyModule> {
-    let imported_mod = py.import(to_import)?;
-    Ok(imported_mod)
-}
-
-fn load_python_tvm_(py: Python) -> PyResult<String> {
-    let imported_mod = import_python(py, "tvm")?;
-    let version: String = imported_mod.get("__version__")?.extract()?;
-    Ok(version)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use anyhow::Result;
-
-    #[ignore]
-    #[test]
-    fn test_run() -> Result<()> {
-        load().unwrap();
-        Ok(())
-    }
-}
diff --git a/rust/tvm/src/runtime/mod.rs b/rust/tvm/src/runtime/mod.rs
deleted file mode 100644
index 69fbb371824a..000000000000
--- a/rust/tvm/src/runtime/mod.rs
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-pub use tvm_rt::*;
diff --git a/rust/tvm/src/transform.rs b/rust/tvm/src/transform.rs
deleted file mode 100644
index b49633777b65..000000000000
--- a/rust/tvm/src/transform.rs
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use crate::ir::relay::Function;
-use crate::runtime::array::Array;
-use crate::runtime::{
-    external,
-    function::{self, Result, ToFunction},
-    String as TString,
-};
-use crate::runtime::{Object, ObjectPtr, ObjectRef};
-
-use tvm_macros::Object;
-
-pub type Pass = ObjectRef;
-pub type IRModule = ObjectRef;
-pub type PassContext = ObjectRef;
-
-#[repr(C)]
-#[derive(Object, Debug)]
-#[ref_name = "PassInfo"]
-#[type_key = "transform.PassInfo"]
-pub struct PassInfoNode {
-    pub base: Object,
-    pub opt_level: i32,
-    pub name: TString,
-    pub required: Array<TString>,
-}
-
-impl PassInfo {
-    pub fn new(opt_level: i32, name: String, required: Vec<String>) -> Result<PassInfo> {
-        let required = required.into_iter().map(|name| name.into()).collect();
-
-        let required = Array::from_vec(required)?;
-
-        let node = PassInfoNode {
-            base: Object::base::<PassInfoNode>(),
-            opt_level,
-            name: name.into(),
-            required,
-        };
-
-        Ok(PassInfo(Some(ObjectPtr::new(node))))
-    }
-}
-
-external! {
-    #[name("relay._transform.MakeFunctionPass")]
-    fn create_func_pass(func: function::Function, pass_info: PassInfo) -> Pass;
-}
-
-pub fn function_pass<F: Fn(Function, IRModule, PassContext) -> Function + 'static>(
-    pass_fn: F,
-    pass_info: PassInfo,
-) -> Result<Pass> {
-    let func = pass_fn.to_function();
-    create_func_pass(func, pass_info)
-}
-
-/// A macro for generating the correct TVM symbols for plugin loading.
-///
-/// The expression passed to the macro will be run when TVM loads the
-/// shared library.
-///
-/// This is useful for calling register to register packed functions
-/// to consume via TVM's packed function APIs.
-#[macro_export]
-macro_rules! initialize {
-    ($body:expr) => {
-        #[no_mangle]
-        pub unsafe extern "C" fn initialize(
-            args: *mut tvm_sys::ffi::TVMValue,
-            type_codes: *mut c_int,
-            num_args: c_int,
-            ret: tvm_sys::ffi::TVMRetValueHandle,
-        ) -> c_int {
-            $body
-            return 0;
-        }
-    };
-}
-
-#[macro_export]
-macro_rules! export_pass {
-    ($name:literal,$func:expr) => {
-        #[no_mangle]
-        pub unsafe extern "C" fn initialize(
-            args: *mut tvm_sys::ffi::TVMValue,
-            type_codes: *mut c_int,
-            num_args: c_int,
-            ret: tvm_sys::ffi::TVMRetValueHandle,
-        ) -> c_int {
-            register($func, $name).unwrap();
-            return 0;
-        }
-    };
-}
diff --git a/rust/tvm/tests/basics/.gitignore b/rust/tvm/tests/basics/.gitignore
deleted file mode 100644
index 10a4b225a705..000000000000
--- a/rust/tvm/tests/basics/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-/target
-**/*.rs.bk
-Cargo.lock
-*.o
-*.so
-*.ptx
-*.json
diff --git a/rust/tvm/tests/basics/Cargo.toml b/rust/tvm/tests/basics/Cargo.toml
deleted file mode 100644
index 421a7ae3cb7d..000000000000
--- a/rust/tvm/tests/basics/Cargo.toml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "basics"
-version = "0.0.0"
-authors = ["TVM Contributors"]
-license = "Apache-2.0"
-build = "build.rs"
-edition = "2018"
-
-[dependencies]
-ndarray = "0.12"
-tvm = { path = "../../" }
-
-[build-dependencies]
-anyhow = "^1.0"
-
-[features]
-default = ["cpu"]
-cpu = []
-gpu = []
diff --git a/rust/tvm/tests/basics/build.rs b/rust/tvm/tests/basics/build.rs
deleted file mode 100644
index 6d5807c3d419..000000000000
--- a/rust/tvm/tests/basics/build.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use anyhow::{Context, Result};
-
-fn main() -> Result<()> {
-    let out_dir = std::env::var("OUT_DIR").unwrap();
-    let tvm_mk_add = concat!(env!("CARGO_MANIFEST_DIR"), "/src/tvm_add.py");
-
-    let output = std::process::Command::new(tvm_mk_add)
-        .args(&[
-            if cfg!(feature = "cpu") {
-                "llvm"
-            } else {
-                "cuda"
-            },
-            &std::env::var("OUT_DIR").unwrap(),
-        ])
-        .output()
-        .with_context(|| anyhow::anyhow!(tvm_mk_add))?;
-
-    assert!(
-        std::path::Path::new(&format!("{}/test_add.so", out_dir)).exists(),
-        "Could not build tvm lib: {}",
-        String::from_utf8(output.stderr)
-            .context("utf-8 conversion failed")?
-            .trim()
-            .split("\n")
-            .last()
-            .unwrap_or("")
-    );
-
-    println!("cargo:rustc-link-search=native={}", out_dir);
-
-    Ok(())
-}
diff --git a/rust/tvm/tests/basics/src/main.rs b/rust/tvm/tests/basics/src/main.rs
deleted file mode 100644
index b7c30364f294..000000000000
--- a/rust/tvm/tests/basics/src/main.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::str::FromStr;
-
-use tvm::*;
-
-fn main() {
-    let shape = &mut [2];
-    let mut data = vec![3f32, 4.0];
-
-    let (dev, dev_name) = if cfg!(feature = "cpu") {
-        (Device::cpu(0), "cpu")
-    } else {
-        (Device::cuda(0), "cuda")
-    };
-
-    let dtype = DataType::from_str("float32").unwrap();
-    let mut arr = NDArray::empty(shape, dev, dtype);
-    arr.copy_from_buffer(data.as_mut_slice());
-    let ret = NDArray::empty(shape, dev, dtype);
-    let fadd = Module::load(&concat!(env!("OUT_DIR"), "/test_add.so")).unwrap();
-    if !fadd.enabled(dev_name) {
-        return;
-    }
-
-    if cfg!(feature = "cuda") {
-        fadd.import_module(Module::load(&concat!(env!("OUT_DIR"), "/test_add.ptx")).unwrap());
-    }
-
-    // todo(@jroesch): fix the entry_name
-    fadd.get_function("__tvm_main__", false)
-        .expect("module must have entry point")
-        .invoke(vec![(&arr).into(), (&arr).into(), (&ret).into()])
-        .unwrap();
-
-    assert_eq!(ret.to_vec::<f32>().unwrap(), vec![6f32, 8.0]);
-}
diff --git a/rust/tvm/tests/basics/src/tvm_add.py b/rust/tvm/tests/basics/src/tvm_add.py
deleted file mode 100755
index fc5c4213bd08..000000000000
--- a/rust/tvm/tests/basics/src/tvm_add.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os.path as osp
-import sys
-
-import tvm
-from tvm import te
-from tvm.contrib import cc
-
-
-def main(target, out_dir):
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-
-    if target == "cuda":
-        bx, tx = s[C].split(C.op.axis[0], factor=64)
-        s[C].bind(bx, te.thread_axis("blockIdx.x"))
-        s[C].bind(tx, te.thread_axis("threadIdx.x"))
-
-    fadd = tvm.build(s, [A, B, C], tvm.target.Target(target, host="llvm"), name="myadd")
-    fadd.save(osp.join(out_dir, "test_add.o"))
-    if target == "cuda":
-        fadd.imported_modules[0].save(osp.join(out_dir, "test_add.ptx"))
-    cc.create_shared(osp.join(out_dir, "test_add.so"), [osp.join(out_dir, "test_add.o")])
-
-
-if __name__ == "__main__":
-    main(sys.argv[1], sys.argv[2])
diff --git a/rust/tvm/tests/callback/Cargo.toml b/rust/tvm/tests/callback/Cargo.toml
deleted file mode 100644
index 5c89d2ac6375..000000000000
--- a/rust/tvm/tests/callback/Cargo.toml
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "callback"
-version = "0.0.0"
-authors = ["TVM Contributors"]
-edition = "2018"
-
-[dependencies]
-ndarray = "0.12"
-tvm = { path = "../../" }
diff --git a/rust/tvm/tests/callback/src/bin/array.rs b/rust/tvm/tests/callback/src/bin/array.rs
deleted file mode 100644
index 8deae30c076d..000000000000
--- a/rust/tvm/tests/callback/src/bin/array.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#![allow(unused_imports)]
-
-extern crate ndarray as rust_ndarray;
-
-use rust_ndarray::ArrayD;
-use std::{
-    convert::{TryFrom, TryInto},
-    str::FromStr,
-};
-
-use tvm::{
-    errors::Error,
-    function::register_untyped,
-    runtime::{ArgValue, RetValue},
-    *,
-};
-
-fn main() {
-    fn sum<'a>(args: Vec<ArgValue<'a>>) -> Result<RetValue, Error> {
-        let mut ret = 0.0;
-        for arg in args {
-            let arg: NDArray = arg.try_into()?;
-            let rnd: ArrayD<f32> = ArrayD::try_from(&arg)?;
-            ret += rnd.scalar_sum();
-        }
-        Ok(RetValue::from(ret))
-    }
-
-    let shape = &[2];
-    let data = vec![3.0, 4.0];
-    let mut arr = NDArray::empty(shape, Device::cpu(0), DataType::float(32, 1));
-    arr.copy_from_buffer(data.as_slice());
-
-    register_untyped(sum, "sum", true).unwrap();
-    let func = Function::get("sum").expect("function registered");
-
-    let ret: f32 = func
-        .invoke(vec![(&arr).into()])
-        .unwrap()
-        .try_into()
-        .expect("call should succeed");
-
-    assert_eq!(ret, 7.0);
-}
diff --git a/rust/tvm/tests/callback/src/bin/error.rs b/rust/tvm/tests/callback/src/bin/error.rs
deleted file mode 100644
index f8886a55c3a2..000000000000
--- a/rust/tvm/tests/callback/src/bin/error.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::panic;
-
-use tvm::{
-    errors::Error,
-    runtime::{ArgValue, RetValue},
-    *,
-};
-
-fn main() {
-    fn error<'a>(_args: Vec<ArgValue<'a>>) -> Result<RetValue, Error> {
-        Err(errors::NDArrayError::DataTypeMismatch {
-            expected: DataType::int(64, 1),
-            actual: DataType::float(64, 1),
-        }
-        .into())
-    }
-
-    function::register_untyped(error, "error", true).unwrap();
-
-    let func = Function::get("error");
-    assert!(func.is_some());
-    match func.unwrap().invoke(vec![10.into(), 20.into()]) {
-        Err(_) => {}
-        Ok(_) => panic!("expected error"),
-    }
-}
diff --git a/rust/tvm/tests/callback/src/bin/float.rs b/rust/tvm/tests/callback/src/bin/float.rs
deleted file mode 100644
index d575f47c87cd..000000000000
--- a/rust/tvm/tests/callback/src/bin/float.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#![allow(unused_imports)]
-
-use std::convert::TryInto;
-use tvm::{
-    errors::Error,
-    runtime::{ArgValue, RetValue},
-    *,
-};
-
-fn main() {
-    fn sum<'a>(args: Vec<ArgValue<'a>>) -> Result<RetValue, Error> {
-        let mut ret = 0.0;
-        for arg in args.into_iter() {
-            let val: f64 = arg.try_into()?;
-            ret += val;
-        }
-        Ok(RetValue::from(ret))
-    }
-
-    function::register_untyped(sum, "sum", true).expect("registration should succeed");
-
-    let func = Function::get("sum").expect("sum was just registered.");
-
-    let ret: f64 = func
-        .invoke(vec![10.0f64.into(), 20.0.into(), 30.0.into()])
-        .unwrap()
-        .try_into()
-        .unwrap();
-
-    assert_eq!(ret, 60f64);
-}
diff --git a/rust/tvm/tests/callback/src/bin/int.rs b/rust/tvm/tests/callback/src/bin/int.rs
deleted file mode 100644
index fc2e40d8de4d..000000000000
--- a/rust/tvm/tests/callback/src/bin/int.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::convert::TryInto;
-use tvm::{
-    errors::Error,
-    runtime::{ArgValue, RetValue},
-    *,
-};
-
-fn main() {
-    fn sum<'a>(args: Vec<ArgValue<'a>>) -> Result<RetValue, Error> {
-        let mut ret = 0i64;
-        for arg in args.iter() {
-            let val: i64 = arg.try_into()?;
-            ret += val;
-        }
-        Ok(RetValue::from(ret))
-    }
-
-    tvm::function::register_untyped(sum, "mysum".to_owned(), false).unwrap();
-    let func = Function::get("mysum").unwrap();
-    let ret: i64 = func
-        .invoke(vec![10.into(), 20.into(), 30.into()])
-        .unwrap()
-        .try_into()
-        .unwrap();
-    assert_eq!(ret, 60);
-}
diff --git a/rust/tvm/tests/callback/src/bin/string.rs b/rust/tvm/tests/callback/src/bin/string.rs
deleted file mode 100644
index 4f3d67e95d64..000000000000
--- a/rust/tvm/tests/callback/src/bin/string.rs
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-use std::convert::TryInto;
-use tvm::{
-    errors::Error,
-    runtime::{ArgValue, RetValue},
-    *,
-};
-
-// FIXME
-fn main() {
-    fn concat_str<'a>(args: Vec<ArgValue<'a>>) -> Result<RetValue, Error> {
-        let mut ret = "".to_string();
-        for arg in args.iter() {
-            let val: &str = arg.try_into()?;
-            ret += val;
-        }
-        Ok(RetValue::from(ret))
-    }
-
-    let a = std::ffi::CString::new("a").unwrap();
-    let b = std::ffi::CString::new("b").unwrap();
-    let c = std::ffi::CString::new("c").unwrap();
-
-    tvm::function::register_untyped(concat_str, "concat_str".to_owned(), false).unwrap();
-
-    let func = Function::get("concat_str").expect("just registered a function");
-
-    let args = vec![
-        a.as_c_str().into(),
-        b.as_c_str().into(),
-        c.as_c_str().into(),
-    ];
-
-    let ret: String = func
-        .invoke(args)
-        .expect("function call should succeed")
-        .try_into()
-        .unwrap();
-
-    assert_eq!(ret, "abc".to_owned());
-}
diff --git a/tests/cpp/relay/backend/aot/aot_lower_main_test.cc b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc
deleted file mode 100644
index 0157f031c214..000000000000
--- a/tests/cpp/relay/backend/aot/aot_lower_main_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "../../../../../src/relay/backend/aot/aot_lower_main.h"
-
-#include <gtest/gtest.h>
-#include <tvm/relay/parser.h>
-
-namespace tvm {
-namespace relay {
-namespace backend {
-namespace aot {
-
-TEST(AOTLowerMain, ExprAllocatorSkipNestedFunc) {
-  constexpr const char* mod_text = R"(
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32]) {
-          nn.relu(%FunctionVar_01)
-        };
-        %0(%x)
-      }
-    )";
-  IRModule mod = ParseModule("string", mod_text, {}, {});
-  auto host_target = tvm::Target("llvm");
-  auto prim_target = tvm::Target(host_target, host_target);
-  auto ctxt = tvm::transform::PassContext::Current();
-  auto config = tvm::CompilationConfig(ctxt, {prim_target});
-  mod = tvm::relay::transform::PlanDevices(config)(mod);
-  mod = tvm::relay::transform::InferType()(mod);
-
-  StorageMap storage_map;
-  std::vector<int> return_sids;
-  auto func = Downcast<Function>(mod->Lookup("main"));
-  std::tie(storage_map, return_sids) = CreateStorage(func);
-
-  auto nested_func = Downcast<Function>(Downcast<Call>(func->body)->op);
-  EXPECT_EQ(storage_map.find(nested_func->body), storage_map.end());
-  EXPECT_EQ(storage_map.find(nested_func->params[0]), storage_map.end());
-  EXPECT_NE(storage_map.find(func->body), storage_map.end());
-  EXPECT_NE(storage_map.find(func->params[0]), storage_map.end());
-}
-
-}  // namespace aot
-}  // namespace backend
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/backend/executor_test.cc b/tests/cpp/relay/backend/executor_test.cc
deleted file mode 100644
index 3367390b27f2..000000000000
--- a/tests/cpp/relay/backend/executor_test.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/relay/executor.h>
-
-#include <cmath>
-#include <string>
-namespace tvm {
-namespace relay {
-
-TVM_REGISTER_EXECUTOR("TestExecutor")
-    .add_attr_option<Bool>("my_bool")
-    .add_attr_option<Array<String>>("your_names")
-    .add_attr_option<String>("another_option")
-    .add_attr_option<Bool>("defaulty_the_default_option", Bool(false));
-
-TEST(Executor, Create) {
-  Map<String, ObjectRef> attrs = {{"my_bool", Bool(true)}};
-  Executor my_exec = Executor::Create("TestExecutor", attrs);
-  ASSERT_EQ(my_exec->GetAttr<Bool>("my_bool"), true);
-  ASSERT_EQ(my_exec->GetAttr<Array<String>>("your_names").defined(), false);
-  ASSERT_EQ(my_exec->GetAttr<Bool>("defaulty_the_default_option"), false);
-}
-
-TEST(Executor, UnknownAttr) {
-  Map<String, ObjectRef> attrs = {{"woofles", Bool(true)}};
-  ASSERT_THROW(Executor::Create("TestExecutor", attrs), Error);
-}
-
-TEST(Executor, IncorrectAttrType) {
-  Map<String, ObjectRef> attrs = {{"my_bool", String("snuck_in")}};
-  ASSERT_THROW(Executor::Create("TestExecutor", attrs), Error);
-}
-
-TEST(Executor, UnregisteredName) {
-  Map<String, ObjectRef> attrs = {};
-  ASSERT_THROW(Executor::Create("NeverNameAnExecutorThis", attrs), Error);
-}
-
-TEST(ExecutorRegistry, ListExecutors) {
-  Array<String> names = Executor::ListExecutors();
-  ICHECK_EQ(names.empty(), false);
-  ICHECK_EQ(std::count(std::begin(names), std::end(names), "TestExecutor"), 1);
-}
-
-TEST(ExecutorRegistry, ListExecutorOptions) {
-  Map<String, String> attrs = Executor::ListExecutorOptions("TestExecutor");
-
-  ICHECK_EQ(attrs.empty(), false);
-  ICHECK_EQ(attrs["my_bool"], "IntImm");
-  ICHECK_EQ(attrs["your_names"], "Array");
-  ICHECK_EQ(attrs["another_option"], "runtime.String");
-}
-
-TEST(ExecutorRegistry, ListExecutorOptionsNoExecutor) {
-  ASSERT_THROW(Executor::ListExecutorOptions("NeverNameAnExecutorThis"), Error);
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/backend/graph_plan_token_alloc.cc b/tests/cpp/relay/backend/graph_plan_token_alloc.cc
deleted file mode 100644
index 7fca4b26a985..000000000000
--- a/tests/cpp/relay/backend/graph_plan_token_alloc.cc
+++ /dev/null
@@ -1,540 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-
-#include "../src/relay/backend/token_allocator.h"
-
-namespace tvm {
-namespace relay {
-
-// TokenAllocatorMixed is necessary because in class TokenAllocatorMixed we don't
-// have an access to its protected members. In this class we add new methods
-// which allow us to get and check internal state of class TokenAllocatorMixed
-class TokenAllocatorMixedWrapper : public TokenAllocatorMixed {
- public:
-  inline size_t FreeListSize() const { return free_.size(); }
-  inline size_t AllocListSize() const { return data_.size(); }
-};
-
-TEST(TokenMixedAlloc, TextureOneToken) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-nhwc"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto size2d = alloc.GetSize2D(&tok1);
-  EXPECT_EQ(size2d, 140800);
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, TextureEqualSizeTokenReuse) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-nhwc"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto size2d = alloc.GetSize2D(&tok1);
-  EXPECT_EQ(size2d, 140800);
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req = alloc.Request(&tok2);
-  EXPECT_NE(req, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req->storage_id, storage_ids - 1);
-  EXPECT_EQ(req->ref_counter, 1);
-  auto sizeReq = alloc.GetSize2D(req);
-  EXPECT_EQ(sizeReq, 140800);
-
-  req->ref_counter -= 1;
-  alloc.CheckForRelease(req);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  // Try reuse of the texture memory for buffer object
-  VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), MemoryScope("global"));
-  StorageToken tok3 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd2,  // virtual device
-      -1    // storage_id
-  };
-  auto req1 = alloc.Request(&tok3);
-  EXPECT_NE(req1, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req1->storage_id, storage_ids - 1);
-  EXPECT_EQ(req1->ref_counter, 1);
-  sizeReq = alloc.GetSize2D(req1);
-  EXPECT_EQ(sizeReq, 140800);
-
-  req1->ref_counter -= 1;
-  alloc.CheckForRelease(req1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, TextureEqualSizeDiffTypes) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-nhwc"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto sizeReq = alloc.GetSize2D(&tok1);
-  EXPECT_EQ(sizeReq, 140800);
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1));
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt2,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-
-  auto req1 = alloc.Request(&tok2);
-  EXPECT_NE(req1, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  req1->ref_counter -= 1;
-  alloc.CheckForRelease(req1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-nhwc"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto sizeReq = alloc.GetSize2D(&tok1);
-  EXPECT_EQ(sizeReq, 140800);
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1));
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt2,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req = alloc.Request(&tok2);
-  EXPECT_NE(req, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req->storage_id, storage_ids - 1);
-  EXPECT_EQ(req->ref_counter, 1);
-  sizeReq = alloc.GetSize2D(req);
-  EXPECT_EQ(sizeReq, 576000);
-
-  req->ref_counter -= 1;
-  alloc.CheckForRelease(req);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1));
-  StorageToken tok3 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt3,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req2 = alloc.Request(&tok3);
-  EXPECT_NE(req2, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req2->storage_id, storage_ids - 1);
-  EXPECT_EQ(req2->ref_counter, 1);
-  sizeReq = alloc.GetSize2D(req2);
-  EXPECT_EQ(sizeReq, 576000);
-}
-
-TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse2) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-nhwc"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto sizeReq = alloc.GetSize2D(&tok1);
-  EXPECT_EQ(sizeReq, 140800);
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt2({1, 5, 30, 20, 4}, DataType(kDLFloat, 32, 1));
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt2,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req = alloc.Request(&tok2);
-  EXPECT_NE(req, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req->storage_id, storage_ids - 1);
-  EXPECT_EQ(req->ref_counter, 1);
-  sizeReq = alloc.GetSize2D(req);
-  EXPECT_EQ(sizeReq, 140800);
-}
-
-TEST(TokenMixedAlloc, TextureSameSizesButDiffMemoryScopes) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({28, 676, 1, 1, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-weight"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto sizeReq = alloc.GetSize2D(&tok1);
-  EXPECT_EQ(sizeReq, 302848);
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt2({1, 28, 26, 26, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"),
-                    MemoryScope("global.texture-nhwc"));
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt2,  // tensor type
-      vd2,  // virtual device
-      -1    // storage_id
-  };
-  auto tok2Size = alloc.GetSize2D(&tok2);
-  EXPECT_EQ(tok2Size, 302848);
-
-  auto req = alloc.Request(&tok2);
-  EXPECT_NE(req, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  req->ref_counter -= 1;
-  alloc.CheckForRelease(req);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, OneToken) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, EqualSizeTokenReuse) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req = alloc.Request(&tok2);
-  EXPECT_NE(req, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req->storage_id, storage_ids - 1);
-  EXPECT_EQ(req->ref_counter, 1);
-
-  req->ref_counter -= 1;
-  alloc.CheckForRelease(req);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, EqualSizeDiffTypes) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1));
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt2,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-
-  auto req1 = alloc.Request(&tok2);
-  EXPECT_NE(req1, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  req1->ref_counter -= 1;
-  alloc.CheckForRelease(req1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-}
-
-TEST(TokenMixedAlloc, DifferentSizesTokenReuse) {
-  TokenAllocatorMixedWrapper alloc;
-  int storage_ids = 0;
-  EXPECT_EQ(alloc.AllocListSize(), 0);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl"));
-  StorageToken tok1 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt1,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  EXPECT_EQ(alloc.Request(&tok1), nullptr);
-
-  alloc.Alloc(&tok1, storage_ids++);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-
-  tok1.ref_counter -= 1;
-  alloc.CheckForRelease(&tok1);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1));
-  StorageToken tok2 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt2,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req = alloc.Request(&tok2);
-  EXPECT_NE(req, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req->storage_id, storage_ids - 1);
-  EXPECT_EQ(req->ref_counter, 1);
-
-  req->ref_counter -= 1;
-  alloc.CheckForRelease(req);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 1);
-
-  TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1));
-  StorageToken tok3 = {
-      1,    // ref_counter
-      0,    // max bytes
-      tt3,  // tensor type
-      vd1,  // virtual device
-      -1    // storage_id
-  };
-  auto req2 = alloc.Request(&tok3);
-  EXPECT_NE(req2, nullptr);
-  EXPECT_EQ(alloc.AllocListSize(), 1);
-  EXPECT_EQ(alloc.FreeListSize(), 0);
-  EXPECT_EQ(req2->storage_id, storage_ids - 1);
-  EXPECT_EQ(req2->ref_counter, 1);
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/backend/runtime_test.cc b/tests/cpp/relay/backend/runtime_test.cc
deleted file mode 100644
index adabb9b9b6cf..000000000000
--- a/tests/cpp/relay/backend/runtime_test.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/relay/runtime.h>
-
-#include <cmath>
-#include <string>
-namespace tvm {
-namespace relay {
-
-TVM_REGISTER_RUNTIME("TestRuntime")
-    .add_attr_option<runtime::Bool>("my_bool")
-    .add_attr_option<Array<String>>("your_names")
-    .add_attr_option<String>("another_option")
-    .add_attr_option<runtime::Bool>("defaulty_the_default_option", runtime::Bool(false));
-
-TEST(Runtime, Create) {
-  Map<String, ObjectRef> attrs = {{"my_bool", runtime::Bool(true)}};
-  Runtime my_runtime = Runtime::Create("TestRuntime", attrs);
-  ASSERT_EQ(my_runtime->GetAttr<Bool>("my_bool"), true);
-  ASSERT_EQ(my_runtime->GetAttr<Array<String>>("your_names").defined(), false);
-  ASSERT_EQ(my_runtime->GetAttr<Bool>("defaulty_the_default_option"), false);
-}
-
-TEST(Runtime, UnknownAttr) {
-  Map<String, ObjectRef> attrs = {{"woofles", runtime::Bool(true)}};
-  ASSERT_THROW(Runtime::Create("TestRuntime", attrs), Error);
-}
-
-TEST(Runtime, IncorrectAttrType) {
-  Map<String, ObjectRef> attrs = {{"my_bool", String("snuck_in")}};
-  ASSERT_THROW(Runtime::Create("TestRuntime", attrs), Error);
-}
-
-TEST(Runtime, UnregisteredName) {
-  Map<String, ObjectRef> attrs = {};
-  ASSERT_THROW(Runtime::Create("NeverNameAnRuntimeThis", attrs), Error);
-}
-
-TEST(RuntimeRegistry, ListRuntimes) {
-  Array<String> names = Runtime::ListRuntimes();
-  ICHECK_EQ(names.empty(), false);
-  ICHECK_EQ(std::count(std::begin(names), std::end(names), "TestRuntime"), 1);
-}
-
-TEST(RuntimeRegistry, ListRuntimeOptions) {
-  Map<String, String> attrs = Runtime::ListRuntimeOptions("TestRuntime");
-
-  ICHECK_EQ(attrs.empty(), false);
-  ICHECK_EQ(attrs["my_bool"], "runtime.BoxBool");
-  ICHECK_EQ(attrs["your_names"], "Array");
-  ICHECK_EQ(attrs["another_option"], "runtime.String");
-}
-
-TEST(RuntimeRegistry, ListRuntimeOptionsNoRuntime) {
-  ASSERT_THROW(Runtime::ListRuntimeOptions("NeverNameAnRuntimeThis"), Error);
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/collage/candidate_partition_test.cc b/tests/cpp/relay/collage/candidate_partition_test.cc
deleted file mode 100644
index d298a493c11f..000000000000
--- a/tests/cpp/relay/collage/candidate_partition_test.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "../../../src/relay/collage/candidate_partition.h"
-
-#include <gtest/gtest.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/parser.h>
-#include <tvm/relay/transform.h>
-
-#include "../../../../src/relay/collage/mock_cost_estimator.h"
-#include "../../../src/relay/collage/partition_spec.h"
-
-namespace tvm {
-namespace relay {
-namespace collage {
-namespace {
-
-// NOTE: CandidatePartition::ParallelRewrite is effectively tested in partition_rule_test.cc
-// so not re-tested here. The only other non-trivial code is CandidatePartition::EstimateCost
-
-Function MakeTestFunction(const std::string& mod_text) {
-  IRModule mod = ParseModule("string", mod_text, {}, {});
-  mod = transform::CapturePostDfsIndexInSpans()(mod);
-  auto func = Downcast<Function>(mod->Lookup("main"));
-  LOG(INFO) << "------- input function -------";
-  LOG(INFO) << PrettyPrint(func);
-  LOG(INFO) << "------------------------------";
-  return func;
-}
-
-PartitionSpec StandardSpec() { return PartitionSpec("test_spec", Target("llvm"), {}); }
-
-String AlwaysInvalid(const Function& function) { return "invalid"; }
-
-PartitionSpec AlwaysInvalidSpec() {
-  return PartitionSpec("test_spec", Target("llvm"), {}, AlwaysInvalid);
-}
-
-/*!
- * \brief Returns candidate containing nodes with given \p indexes wrapped within a
- * "Primitive" and "Compiler" function.
- */
-CandidatePartition MakeCandidate(const DataflowGraph& graph, const PartitionSpec& spec,
-                                 const std::vector<PostDfsIndex>& indexes) {
-  IndexSet inside(graph.size(), indexes);
-  SubGraph inner_sub_graph(graph, inside);
-  FunctionAttrsMap attrs_map;
-  attrs_map.Set(attr::kPrimitive, Integer(1));
-  attrs_map.Set(attr::kCompiler, String("llvm"));
-  NestedSubGraph nested_sub_graph(inner_sub_graph, attrs_map);
-  SubGraph outer_sub_graph(graph, inside, inner_sub_graph->kind_, inner_sub_graph->label_,
-                           {nested_sub_graph});
-  return CandidatePartition(/*rule_name=*/"", outer_sub_graph, spec);
-}
-
-CostEstimator StandardEstimator() {
-  Map<String, Integer> target_costs;
-  target_costs.Set("llvm", 3);
-  return MockCostEstimator(std::move(target_costs));
-}
-
-CostEstimator AlternateEstimator() {
-  Map<String, Integer> target_costs;
-  target_costs.Set("llvm", 7);
-  return MockCostEstimator(std::move(target_costs));
-}
-
-std::shared_ptr<CandidateFunctionCache> Cache() {
-  return std::make_shared<CandidateFunctionCache>(std::make_shared<NameSupply>("test"));
-}
-
-TEST(CandidatePartition, EstimateCost_Simple) {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-      %0 = abs(%x);                      //  3
-      %1 = nn.relu(%0);                  //  4
-      nn.relu(%1)                        //  5
-    }
-  )";
-  auto func = MakeTestFunction(kMod);
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-  auto candidate = MakeCandidate(graph, spec, {3, 4});
-  auto estimator = StandardEstimator();
-  auto cache = Cache();
-
-  {
-    auto cost = candidate->EstimatedCost(graph, estimator, cache);
-    ASSERT_TRUE(cost.is_value());
-    // cost is 3 for nn.rulu plus 3 * 0.9 for the nested abs
-    ASSERT_EQ(cost.value(), 5.7);
-  }
-}
-
-TEST(CandidatePartition, EstimateCost_AlreadyCached) {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-      %0 = abs(%x);                      //  3
-      %1 = nn.relu(%0);                  //  4
-      nn.relu(%1)                        //  5
-    }
-  )";
-  auto func = MakeTestFunction(kMod);
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-  auto candidate = MakeCandidate(graph, spec, {3, 4});
-  candidate->cost_ = Cost::Value(42.0);
-  auto estimator = StandardEstimator();
-  auto cache = Cache();
-
-  {
-    auto cost = candidate->EstimatedCost(graph, estimator, cache);
-    ASSERT_TRUE(cost.is_value());
-    ASSERT_EQ(cost.value(), 42.0);
-  }
-}
-
-TEST(CandidatePartition, EstimateCost_Invalid) {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-      %0 = abs(%x);                      //  3
-      %1 = nn.relu(%0);                  //  4
-      nn.relu(%1)                        //  5
-    }
-  )";
-  auto func = MakeTestFunction(kMod);
-  auto graph = DataflowGraph(func);
-  auto spec = AlwaysInvalidSpec();
-  auto candidate = MakeCandidate(graph, spec, {3, 4});
-  auto estimator = StandardEstimator();
-  auto cache = Cache();
-
-  {
-    auto cost = candidate->EstimatedCost(graph, estimator, cache);
-    ASSERT_TRUE(cost.is_invalid());
-  }
-}
-
-TEST(CandidatePartition, EstimateCost_Cached) {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-      %0 = abs(%x);                      //  4
-      %1 = nn.relu(%0);                  //  5
-      %2 = abs(%1);                      //  6
-      %3 = nn.relu(%2);                  //  7
-      add(%1, %3)                        //  8
-    }
-  )";
-  auto func = MakeTestFunction(kMod);
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-  auto candidateA = MakeCandidate(graph, spec, {4, 5});
-  auto candidateB = MakeCandidate(graph, spec, {6, 7});
-  auto standard_estimator = StandardEstimator();
-  auto alternate_estimator = AlternateEstimator();
-  auto cache = Cache();
-
-  {
-    // First candidate estimated as per usual.
-    auto costA = candidateA->EstimatedCost(graph, standard_estimator, cache);
-    ASSERT_TRUE(costA.is_value());
-    ASSERT_EQ(costA.value(), 5.7);
-
-    // Second candidate is structurally equal to first, so reuse first's cost even though
-    // estimator has different weights.
-    auto costB = candidateB->EstimatedCost(graph, alternate_estimator, cache);
-    ASSERT_TRUE(costB.is_value());
-    ASSERT_EQ(costB.value(), costA.value());
-  }
-}
-
-TEST(CandidatePartition, EstimateCost_EtaExpandTuples) {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-      %0 = abs(%x);                      //  3
-      %1 = nn.relu(%0);                  //  5
-      %2 = (%0, %1);                     //  6
-      concatenate(%2)                    //  7
-    }
-  )";
-  auto func = MakeTestFunction(kMod);
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-  auto candidate = MakeCandidate(graph, spec, {7});
-  auto estimator = StandardEstimator();
-  auto cache = Cache();
-
-  {
-    auto cost = candidate->EstimatedCost(graph, estimator, cache);
-    ASSERT_TRUE(cost.is_value());
-    ASSERT_EQ(cost.value(), 3);
-  }
-}
-
-}  // namespace
-}  // namespace collage
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/collage/partition_rule_test.cc b/tests/cpp/relay/collage/partition_rule_test.cc
deleted file mode 100644
index effe0b1fa030..000000000000
--- a/tests/cpp/relay/collage/partition_rule_test.cc
+++ /dev/null
@@ -1,715 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "../../../src/relay/collage/partition_rule.h"
-
-#include <gtest/gtest.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/parser.h>
-#include <tvm/relay/transform.h>
-
-#include "../../../src/relay/collage/partition_spec.h"
-
-namespace tvm {
-namespace relay {
-namespace collage {
-namespace {
-
-Constant MakeConstant(std::initializer_list<ShapeTuple::index_type> shape) {
-  return Constant(runtime::NDArray::Empty(shape, DataType::Float(32), {kDLCPU, 0}));
-}
-
-Function MakeTestFunction(
-    const std::string& mod_text,
-    const std::initializer_list<std::initializer_list<ShapeTuple::index_type>>& constant_shapes =
-        {}) {
-  Array<ObjectRef> constants;
-  for (const auto& shape : constant_shapes) {
-    constants.push_back(MakeConstant(shape));
-  }
-  Map<String, Array<ObjectRef>> metatable;
-  metatable.Set("relay.Constant", constants);
-  IRModule mod = ParseModule("string", mod_text, {}, metatable);
-  mod = transform::CapturePostDfsIndexInSpans()(mod);
-  auto func = Downcast<Function>(mod->Lookup("main"));
-  LOG(INFO) << "------- input function -------";
-  LOG(INFO) << PrettyPrint(func);
-  LOG(INFO) << "------------------------------";
-  return func;
-}
-
-Function StandardTestFunction() {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-                                         //  index, kind
-      %0 = abs(%x);                      //  3, E
-      %1 = nn.relu(%0);                  //  4, E
-      nn.relu(%1)                        //  5, E
-    }
-  )";
-  return MakeTestFunction(kMod);
-}
-
-Function VariantTestFunction() {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(10, 10), float32]) {
-                                         // index, kind
-      %0 = abs(%x);                      // 4, E
-      %1 = add(%0, %x);                  // 5, E
-      shape_of(%1)                       // 6, O
-    }
-  )";
-  return MakeTestFunction(kMod);
-}
-
-Function GPT2ExtractOps() {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(1600, 768), float32]) {
-                                                                               // index, kind
-      %60 = nn.dense(%x, meta[relay.Constant][0] /*(3072, 768)*/, units=3072); // 6,  A
-      %61 = add(%60, meta[relay.Constant][1] /*(3072)*/);                      // 8,  B
-      %62 = reshape(%61, newshape=[50, 32, 3072]);                             // 9,  I
-      %63 = power(%62, 3f);                                                    // 15, B
-      %64 = multiply(%63, 0.044715f);                                          // 17, B
-      %65 = add(%62, %64);                                                     // 18, B
-      %66 = multiply(%65, 0.797885f);                                          // 20, B
-      %67 = tanh(%66);                                                         // 21, E
-      %68 = multiply(%62, 0.5f);                                               // 11, B
-      %69 = add(%67, 1f);                                                      // 23, B
-      multiply(%68, %69)                                                       // 24, B
-    }
-  )";
-  return MakeTestFunction(kMod, {{3072, 768}, {3072}});
-}
-
-Function GPT2ExtractTuples() {
-  constexpr const char* kMod = R"(
-    #[version = "0.0.5"]
-    def @main(%x: Tensor[(50, 32, 2304), float32]) {
-                                                                           // index, kind
-      %19 = split(%x, indices_or_sections=[768, 1536], axis=2);            // 6,  I
-      %23 = %19.1;                                                         // 7
-      %24 = reshape(%23, newshape=[50, 32, 12, 64]);                       // 8,  I
-      %35 = %19.2;                                                         // 11
-      %36 = reshape(%35, newshape=[50, 32, 12, 64]);                       // 12, I
-      %37 = transpose(%36, axes=[0, 2, 1, 3]);                             // 13, I
-      %855 = transpose(%24, axes=[0, 2, 1, 3]);                            // 9,  I
-      %856 = expand_dims(%855, axis=0);                                    // 10, B
-      %857 = expand_dims(%37, axis=0);                                     // 14, B
-      %858 = (%856, %857);                                                 // 15, B
-      concatenate(%858)                                                    // 16, I
-    }
-  )";
-  return MakeTestFunction(kMod);
-}
-
-PartitionSpec StandardSpec(const std::string& spec_name = "test_spec",
-                           const std::string& target = "llvm") {
-  return PartitionSpec(spec_name, Target(target), {});
-}
-
-std::vector<CandidatePartition> ActualCandidates(const DataflowGraph& graph, const Function& func,
-                                                 const PartitionSpec& spec,
-                                                 const PartitionRule& rule) {
-  auto candidates = rule->AllCandidates(graph, spec);
-  LOG(INFO) << "--------- actual candidates -------------";
-  for (const auto& candidate : candidates) {
-    LOG(INFO) << candidate->ToString();
-  }
-  LOG(INFO) << "-----------------------------------------";
-  return candidates;
-}
-
-std::vector<CandidatePartition> ExpectedCandidates(
-    const DataflowGraph& graph, const PartitionSpec& spec,
-    const std::vector<std::vector<PostDfsIndex>>& index_sets) {
-  std::vector<CandidatePartition> candidate_partitions;
-  for (const auto& indexes : index_sets) {
-    auto subgraph = SubGraph(graph, IndexSet(graph.size(), indexes));
-    auto candidate = CandidatePartition(/*rule_name=*/"", subgraph, spec);
-    candidate_partitions.emplace_back(std::move(candidate));
-  }
-  return candidate_partitions;
-}
-
-void AssertEqual(const std::vector<CandidatePartition>& actual,
-                 const std::vector<CandidatePartition>& expected) {
-  ASSERT_EQ(actual.size(), expected.size());
-  std::set<CandidatePartition, CandidatePartitionCompare> actual_set(actual.begin(), actual.end());
-  std::set<CandidatePartition, CandidatePartitionCompare> expected_set(expected.begin(),
-                                                                       expected.end());
-  ASSERT_EQ(actual_set.size(), expected_set.size());
-  for (const auto& actual_candidate : actual_set) {
-    ASSERT_EQ(expected_set.count(actual_candidate), 1) << actual_candidate->ToString();
-  }
-}
-
-void AssertEqual(const Expr& actual, const Expr& expected) {
-  ASSERT_TRUE(StructuralEqual()(actual, expected)) << PrettyPrint(actual);
-}
-
-TEST(PartitionRule, DFPatternSingleOp) {
-  auto func = StandardTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    auto pattern = IsOp("nn.relu")({IsWildcard()});
-    auto rule = DFPatternPartitionRule("relu_pattern", pattern);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{4}, {5}});
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, DFPatternOverlap) {
-  auto func = StandardTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    auto pattern =
-        IsOp("nn.relu")({IsOp("nn.relu")({IsWildcard()}) || IsOp("abs")({IsWildcard()})});
-    auto rule = DFPatternPartitionRule("relu+abs_pattern", pattern);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{3, 4}, {4, 5}});
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Composite) {
-  auto func = StandardTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  constexpr const char* kExpectedMod = R"(
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = abs(%x);
-        %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="composite") {
-          nn.relu(%FunctionVar_01)
-        };
-        %2 = %1(%0);
-        %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Composite="composite") {
-          nn.relu(%FunctionVar_0)
-        };
-        %3(%2)
-      }
-    )";
-  Expr expected_expr = MakeTestFunction(kExpectedMod);
-
-  {
-    auto pattern = IsOp("nn.relu")({IsWildcard()});
-    auto df_rule = DFPatternPartitionRule("relu_pattern", pattern);
-    auto composite_rule = CompositePartitionRule("composite", df_rule);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, composite_rule);
-    auto actual_expr = CandidatePartition::ParallelRewrite(graph, actual_candidates);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{4}, {5}});
-    AssertEqual(actual_candidates, expected_candidates);
-    AssertEqual(actual_expr, expected_expr);
-  }
-}
-
-TEST(PartitionRule, PrimitiveTVM) {
-  auto func = StandardTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  constexpr const char* kExpectedMod = R"(
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = abs(%x);
-        %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1) {
-          nn.relu(%FunctionVar_01)
-        };
-        %2 = %1(%0);
-        %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1) {
-          nn.relu(%FunctionVar_0)
-        };
-        %3(%2)
-      }
-    )";
-  Expr expected_expr = MakeTestFunction(kExpectedMod);
-
-  {
-    auto pattern = IsOp("nn.relu")({IsWildcard()});
-    auto df_rule = DFPatternPartitionRule("relu_pattern", pattern);
-    auto primitive_rule = PrimitivePartitionRule("primitive", df_rule);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, primitive_rule);
-    auto actual_expr = CandidatePartition::ParallelRewrite(graph, actual_candidates);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{4}, {5}});
-    AssertEqual(actual_candidates, expected_candidates);
-    AssertEqual(actual_expr, expected_expr);
-  }
-}
-
-TVM_REGISTER_TARGET_KIND("test_ext_codegen", kDLCUDA)
-    .set_attr<Bool>(tvm::attr::kIsExternalCodegen, Bool(true));
-
-TEST(PartitionRule, PrimitiveExternal) {
-  auto func = StandardTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec("test_ext_codegen", "test_ext_codegen");
-
-  constexpr const char* kExpectedMod = R"(
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = abs(%x);
-        %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") {
-          nn.relu(%FunctionVar_01)
-        };
-        %2 = %1(%0);
-        %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") {
-          nn.relu(%FunctionVar_0)
-        };
-        %3(%2)
-      }
-    )";
-  Expr expected_expr = MakeTestFunction(kExpectedMod);
-
-  {
-    auto pattern = IsOp("nn.relu")({IsWildcard()});
-    auto df_rule = DFPatternPartitionRule("relu_pattern", pattern);
-    auto primitive_rule = PrimitivePartitionRule("primitive", df_rule);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, primitive_rule);
-    auto actual_expr = CandidatePartition::ParallelRewrite(graph, actual_candidates);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{4}, {5}});
-    AssertEqual(actual_candidates, expected_candidates);
-    AssertEqual(actual_expr, expected_expr);
-  }
-}
-
-TEST(PartitionRule, Union) {
-  auto func = StandardTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    auto abs_pattern = IsOp("abs")({IsWildcard()});
-    auto abs_rule = DFPatternPartitionRule("abs_pattern", abs_pattern);
-    auto relu_pattern = IsOp("nn.relu")({IsWildcard()});
-    auto relu_rule = DFPatternPartitionRule("relu_pattern", relu_pattern);
-    auto union_rule = UnionPartitionRule("union", {abs_rule, relu_rule});
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, union_rule);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{3}, {4}, {5}});
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, OpCallByKind) {
-  auto func = VariantTestFunction();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    auto rule = OpCallByKindPartitionRule("op_call_by_kind");
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, {{4}, {5}});
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Combine_ByKind) {
-  auto func = GPT2ExtractOps();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    // Prime the system by picking out all 11 calls to non-opaque ops.
-    auto sub_rule = OpCallByKindPartitionRule("op_call_by_kind");
-    // Combine all <= kOutEWiseFusable (A) actual_candidates (ie anything) with downstream
-    // <= kBroadcast (B) actual_candidates (ie B or E).
-    Array<SimpleCombinerRule> simple_rules;
-    simple_rules.push_back(ByKindSimpleCombinerRule(/*upstream_kind=*/kOutEWiseFusable,
-                                                    /*downstream_kind=*/kBroadcast));
-    Array<CombinerRule> combiner_rules;
-    combiner_rules.push_back(AllSimpleCombinerRule("all_simple", std::move(simple_rules)));
-    // Build the overall partition rule.
-    auto rule = CombinePartitionRule("combine_by_kind_A_B", std::move(sub_rule),
-                                     std::move(combiner_rules), /*max_depth=*/3);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    // The original calls.
-    std::vector<std::vector<PostDfsIndex>> expected;
-    expected.push_back({6});
-    expected.push_back({8});
-    expected.push_back({9});
-    expected.push_back({11});
-    expected.push_back({15});
-    expected.push_back({17});
-    expected.push_back({18});
-    expected.push_back({20});
-    expected.push_back({21});
-    expected.push_back({23});
-    expected.push_back({24});
-
-    // nn.dense (A) and the following add (B)
-    expected.push_back({6, 8});
-
-    // reshape (I) and the following power or multiply or both
-    expected.push_back({9, 11});
-    expected.push_back({9, 15});
-    expected.push_back({9, 11, 15});
-
-    // reshape (I) and the following power and multiply
-    expected.push_back({9, 15, 17});
-
-    // reshape (I) and everything after it to the max depth of 3
-    expected.push_back({9, 11, 15, 17});
-
-    // pairs of broadcasts
-    expected.push_back({11, 24});  // multiply / multiply
-    expected.push_back({15, 17});  // power / multiply
-    expected.push_back({17, 18});  // multiply / add
-    expected.push_back({18, 20});  // add / multiply
-    expected.push_back({20, 21});  // multiply / tanh
-    expected.push_back({21, 23});  // tanh / add
-    expected.push_back({23, 24});  // add / multiply
-
-    // triples of broadcasts
-    expected.push_back({15, 17, 18});  // power / multiply / add
-    expected.push_back({17, 18, 20});  // multiply / add / multiply
-    expected.push_back({18, 20, 21});  // add / multiply / tanh
-    expected.push_back({20, 21, 23});  // multiply / tanh / add
-    expected.push_back({21, 23, 24});  // tanh / add / multiply
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Combine_TupleArg) {
-  auto func = GPT2ExtractTuples();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    // Prime the system by picking out all 8 calls to non-opaque ops.
-    auto sub_rule = OpCallByKindPartitionRule("op_call_by_kind");
-    // Merge args of tuples of <= injective (I) fields into the call's group.
-    Array<CombinerRule> combiner_rules;
-    combiner_rules.push_back(TupleArgCombinerRule("tuple_arg"));
-    // Build the overall partition rule.
-    auto rule = CombinePartitionRule("combine_tuple_arg", std::move(sub_rule),
-                                     std::move(combiner_rules), /*max_depth=*/3);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    // The original calls
-    std::vector<std::vector<PostDfsIndex>> expected;
-    expected.push_back({6});
-    expected.push_back({8});
-    expected.push_back({9});
-    expected.push_back({10});
-    expected.push_back({12});
-    expected.push_back({13});
-    expected.push_back({14});
-    expected.push_back({16});
-
-    // The concatenate((expand_dims(...), expand_dims(...)) is grouped.
-    expected.push_back({10, 14, 15, 16});
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Combine_TupleProj) {
-  auto func = GPT2ExtractTuples();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    // Prime the system by picking out all 8 calls to non-opaque ops.
-    auto sub_rule = OpCallByKindPartitionRule("op_call_by_kind");
-    // Merge projections from injective groups.
-    Array<CombinerRule> combiner_rules;
-    combiner_rules.push_back(TupleProjCombinerRule("tuple_proj"));
-    // Build the overall partition rule.
-    auto rule = CombinePartitionRule("combine_tuple_proj", std::move(sub_rule),
-                                     std::move(combiner_rules), /*max_depth=*/3);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    // The original calls
-    std::vector<std::vector<PostDfsIndex>> expected;
-    expected.push_back({6});
-    expected.push_back({8});
-    expected.push_back({9});
-    expected.push_back({10});
-    expected.push_back({12});
-    expected.push_back({13});
-    expected.push_back({14});
-    expected.push_back({16});
-
-    // split / proj 1
-    expected.push_back({6, 7});
-    // split / proj 2
-    expected.push_back({6, 11});
-    // split and both projections
-    expected.push_back({6, 7, 11});
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Combine_Constant) {
-  auto func = GPT2ExtractOps();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    // Prime the system by picking out all 11 calls to non-opaque ops.
-    auto sub_rule = OpCallByKindPartitionRule("op_call_by_kind");
-    // Merge constant args into injective groups
-    Array<CombinerRule> combiner_rules;
-    combiner_rules.push_back(ConstantCombinerRule("constant"));
-    // Build the overall partition rule.
-    auto rule = CombinePartitionRule("combine_constant", std::move(sub_rule),
-                                     std::move(combiner_rules), /*max_depth=*/3);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    // The original calls
-    std::vector<std::vector<PostDfsIndex>> expected;
-    expected.push_back({6});
-    expected.push_back({8});
-    expected.push_back({9});
-    expected.push_back({11});
-    expected.push_back({15});
-    expected.push_back({17});
-    expected.push_back({18});
-    expected.push_back({20});
-    expected.push_back({21});
-    expected.push_back({23});
-    expected.push_back({24});
-
-    // Constant arg to nn.dense
-    expected.push_back({5, 6});
-
-    // Constant arg to add
-    expected.push_back({7, 8});
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Combine_Mixed) {
-  auto func = GPT2ExtractOps();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    // Prime the system by picking out all 11 calls to non-opaque ops.
-    auto sub_rule = OpCallByKindPartitionRule("op_call_by_kind");
-
-    // Mimic the FuseOps rules.
-    Array<SimpleCombinerRule> simple_rules;
-    simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kBroadcast));
-    simple_rules.push_back(ByKindSimpleCombinerRule(kBroadcast, kCommReduce));
-    simple_rules.push_back(ByKindSimpleCombinerRule(kInjective, kInjective));
-    Array<CombinerRule> combiner_rules;
-    combiner_rules.push_back(AllSimpleCombinerRule("all_simple", std::move(simple_rules)));
-
-    // Merge constant args into injective groups
-    combiner_rules.push_back(ConstantCombinerRule("constant"));
-
-    // Build the overall partition rule.
-    auto rule = CombinePartitionRule("combine_mixed", std::move(sub_rule),
-                                     std::move(combiner_rules), /*max_depth=*/3);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    // The original calls
-    std::vector<std::vector<PostDfsIndex>> expected;
-    expected.push_back({6});
-    expected.push_back({8});
-    expected.push_back({9});
-    expected.push_back({11});
-    expected.push_back({15});
-    expected.push_back({17});
-    expected.push_back({18});
-    expected.push_back({20});
-    expected.push_back({21});
-    expected.push_back({23});
-    expected.push_back({24});
-
-    // A -> B merging
-    expected.push_back({6, 8});
-    expected.push_back({9, 11});
-    expected.push_back({9, 15});
-    expected.push_back({9, 11, 15});
-    expected.push_back({9, 15, 17});
-    expected.push_back({9, 11, 15, 17});
-    expected.push_back({11, 24});
-    expected.push_back({15, 17});
-    expected.push_back({17, 18});
-    expected.push_back({18, 20});
-    expected.push_back({20, 21});
-    expected.push_back({21, 23});
-    expected.push_back({23, 24});
-    expected.push_back({15, 17, 18});
-    expected.push_back({17, 18, 20});
-    expected.push_back({18, 20, 21});
-    expected.push_back({20, 21, 23});
-    expected.push_back({21, 23, 24});
-
-    // Constant args
-    expected.push_back({5, 6});
-    expected.push_back({7, 8});
-
-    // B -> R
-    expected.push_back({8, 9});
-    expected.push_back({8, 9, 11});
-    expected.push_back({8, 9, 15});
-
-    // Constant's and A -> B
-    expected.push_back({5, 6, 8});
-    expected.push_back({5, 6, 7, 8});
-
-    // Constants and B -> R
-    expected.push_back({7, 8, 9});
-    expected.push_back({7, 8, 9, 11});
-    expected.push_back({7, 8, 9, 15});
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, OnlyValid) {
-  auto func = GPT2ExtractOps();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    // Prime the system by picking out all 11 calls to non-opaque ops.
-    auto sub_rule = OpCallByKindPartitionRule("op_call_by_kind");
-    // Combine all <= kOutEWiseFusable (A) actual_candidates (ie anything) with downstream
-    // <= kBroadcast (B) actual_candidates (ie B or E).
-    Array<SimpleCombinerRule> simple_rules;
-    simple_rules.push_back(ByKindSimpleCombinerRule(/*upstream_kind=*/kOutEWiseFusable,
-                                                    /*downstream_kind=*/kBroadcast));
-    Array<CombinerRule> combiner_rules;
-    combiner_rules.push_back(AllSimpleCombinerRule("all_simple", std::move(simple_rules)));
-    auto combine_rule = CombinePartitionRule("combine_by_kind_A_B", std::move(sub_rule),
-                                             std::move(combiner_rules), /*max_depth=*/3);
-    // Only allow up to depth 2, no taps and 1 exit.
-    SubGraphConfig config;
-    config.allow_taps = false;
-    config.max_depth = 2;
-    config.max_exits = 1;
-
-    // Build the overall partition rule.
-    auto rule = OnlyValidPartitionRule("only_valid", std::move(combine_rule), config);
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    // The original calls.
-    std::vector<std::vector<PostDfsIndex>> expected;
-    expected.push_back({6});
-    expected.push_back({8});
-    expected.push_back({9});
-    expected.push_back({11});
-    expected.push_back({15});
-    expected.push_back({17});
-    expected.push_back({18});
-    expected.push_back({20});
-    expected.push_back({21});
-    expected.push_back({23});
-    expected.push_back({24});
-
-    // nn.dense (A) and the following add (B)
-    expected.push_back({6, 8});
-
-    // pairs of broadcasts
-    expected.push_back({11, 24});  // multiply / multiply
-    expected.push_back({15, 17});  // power / multiply
-    expected.push_back({17, 18});  // multiply / add
-    expected.push_back({18, 20});  // add / multiply
-    expected.push_back({20, 21});  // multiply / tanh
-    expected.push_back({21, 23});  // tanh / add
-    expected.push_back({23, 24});  // add / multiply
-
-    // The following candidates are filtered out because they have 2 or 3 exits:
-    // {9, 11}, {9, 15}, {9,11,15}, {9,15,17}, {15,17,18}, {17,18,20},
-    // {18,20,21}, {20,21,23}, {21,23,24}, {9,11,15,17}
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-TEST(PartitionRule, Host) {
-  auto func = GPT2ExtractTuples();
-  auto graph = DataflowGraph(func);
-  auto spec = StandardSpec();
-
-  {
-    auto rule = HostPartitionRule("host");
-
-    auto actual_candidates = ActualCandidates(graph, func, spec, rule);
-
-    std::vector<std::vector<PostDfsIndex>> expected;
-
-    // Function arg %x
-    expected.push_back({0});
-    // Operators
-    expected.push_back({1});  // concatenate
-    expected.push_back({2});  // expand_dims
-    expected.push_back({3});  // transpose
-    expected.push_back({4});  // reshape
-    expected.push_back({5});  // split
-    // Tuple projection
-    expected.push_back({7});
-    expected.push_back({11});
-    // Tuple construction
-    expected.push_back({15});
-    // The overall @main function
-    expected.push_back({17});
-
-    auto expected_candidates = ExpectedCandidates(graph, spec, expected);
-    AssertEqual(actual_candidates, expected_candidates);
-  }
-}
-
-}  // namespace
-}  // namespace collage
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/df_pattern_rewrite_test.cc b/tests/cpp/relay/df_pattern_rewrite_test.cc
deleted file mode 100644
index 374887c12a22..000000000000
--- a/tests/cpp/relay/df_pattern_rewrite_test.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/relay/attrs/transform.h>
-#include <tvm/relay/dataflow_matcher.h>
-#include <tvm/relay/dataflow_pattern.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/parser.h>
-
-#include "../../../src/relay/transforms/simplify_expr.h"
-
-namespace tvm {
-namespace relay {
-namespace {
-
-// Demonstrates rewriting a deeply nested sub-graph with specific
-// attributes on the inner-most operator call.
-class TestRewriter : public DFPatternRewrite {
- public:
-  TestRewriter() {
-    x_ = IsWildcard();
-    const1_ = IsWildcard();
-    const2_ = IsWildcard();
-    const3_ = IsWildcard();
-    const4_ = IsWildcard();
-
-    auto biasadd = IsOp("nn.bias_add");
-    auto relu = IsOp("nn.relu");
-    auto conv2d = IsOp("nn.conv2d");
-
-    Map<String, ObjectRef> attrs;
-    attrs.Set("groups", Integer(304));
-    auto maybedepthwise = conv2d({x_, const1_}).HasAttr(attrs);
-
-    pattern_ =
-        relu({biasadd({conv2d({relu({biasadd({maybedepthwise, const2_})}), const3_}), const4_})});
-  }
-
-  Expr Callback(const Expr& pre, const Expr& post,
-                const Map<DFPattern, Array<Expr>>& node_map) const override {
-    LOG(INFO) << "depthwise conv2d detected!";
-    auto attrs = runtime::make_object<InitOpAttrs>();
-    attrs->shape = Array<Integer>({Integer(1), Integer(256), Integer(128), Integer(128)});
-    attrs->dtype = DataType::Float(32);
-    return Call(Op::Get("zeros"), {}, Attrs(attrs));
-  }
-
-  DFPattern x_, const1_, const2_, const3_, const4_;
-};
-
-TEST(DFPatternRewrite, DeeplyNestedWithCallAttributes) {
-  constexpr const char* kModel = R"(
-    #[version = "0.0.5"]
-    def @main(%data : Tensor[(1, 304, 128, 128), float32],
-             %weight1 : Tensor[(304, 1, 3, 3), float32],
-             %bias1 : Tensor[(304), float32],
-             %weight2 : Tensor[(256, 304, 1, 1), float32],
-             %bias2 : Tensor[(256), float32]) -> Tensor[(1, 256, 128, 128), float32] {
-      %0 = nn.conv2d(%data, %weight1, padding=[1, 1, 1, 1], groups=304, channels=304, kernel_size=[3, 3]);
-      %1 = nn.bias_add(%0, %bias1);
-      %2 = nn.relu(%1);
-      %3 = nn.conv2d(%2, %weight2, padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-      %4 = nn.bias_add(%3, %bias2);
-      nn.relu(%4)
-    }
-  )";
-
-  IRModule module = ParseModule("string", kModel);
-  DFPatternRewriteComposer composer;
-  composer.AddRewrite<TestRewriter>();
-  Function in_function = Downcast<Function>(module->Lookup("main"));
-  LOG(INFO) << "input function:\n" << PrettyPrint(in_function);
-  Function out_function =
-      Downcast<Function>(RewritePatterns(composer.MakeCallbacks(), in_function, module));
-  LOG(INFO) << "output function:\n" << PrettyPrint(out_function);
-  const auto* call_node = out_function->body.as<CallNode>();
-  ASSERT_TRUE(call_node != nullptr);
-  ASSERT_TRUE(call_node->op == Op::Get("zeros"));
-}
-
-}  // namespace
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/ir/indexed_graph_test.cc b/tests/cpp/relay/ir/indexed_graph_test.cc
deleted file mode 100644
index 486d027fbc21..000000000000
--- a/tests/cpp/relay/ir/indexed_graph_test.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "../../../src/relay/ir/indexed_graph.h"
-
-#include <gtest/gtest.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/parser.h>
-
-namespace tvm {
-namespace relay {
-namespace {
-
-// A module stolen from onnx/test_forward.py::test_loop which combines functions, recursion,
-// control flow, tuples as well as the usual operator calls.
-// We include the known post-dfs indexes in comments to help write the tests.
-IRModule TestRecursiveIRModule() {
-  Device device = {kDLCPU, 0};
-  Constant const0(runtime::NDArray::Empty(ShapeTuple({1}), DataType::Int(64), device));
-  Constant const1(runtime::NDArray::Empty(ShapeTuple({0, 1}), DataType::Float(32), device));
-  Map<String, Array<ObjectRef>> metadata;
-  metadata.Set("relay.Constant", Array<ObjectRef>({const0, const1}));
-  constexpr const char* kModel = R"(
-    #[version = "0.0.5"]
-    def @main(%trip_count: int64,                                        // 0
-              %cond: bool,                                               // 1
-              %y: Tensor[(1), float32])                                  // 2
-              -> (Tensor[(1), float32], Tensor[(?, ?), float32]) {
-      %17 = (
-        let %while_loop = fn (%iter_count: int64,                        // 3
-                              %max_count: int64,                         // 4
-                              %cond_in: bool,                            // 5
-                              %y_in: Tensor[(1), float32],               // 6
-                              %scan_out: Tensor[(?, ?), float32])        // 7
-                              -> (int64, int64, bool, Tensor[(1), float32], Tensor[(?, ?), float32]) {
-          %0 = equal(%cond_in, True);                                    // 11
-          %1 = less(%iter_count, %max_count);                            // 13
-          %2 = logical_and(%0, %1);                                      // 14
-          if (%2) {
-            %3 = cast(%iter_count, dtype="float32");                     // 20
-            %4 = add(%y_in, %3);                                         // 21
-            %5 = less(%4, 5f);                                           // 23
-            %6 = squeeze(%5);                                            // 24
-            %7 = reshape(%iter_count, newshape=[1]);                     // 29
-            %8 = (%7, meta[relay.Constant][0]);                          // 31
-            %9 = concatenate(%8);                                        // 32
-            %10 = copy(%4);                                              // 36
-            %11 = dyn.broadcast_to(%scan_out, %9, shape=None);           // 33
-            %12 = expand_dims(%10, axis=0);                              // 37
-            %13 = (%11, %12);                                            // 38
-            %14 = add(%iter_count, 1i64);                                // 17
-            %15 = cast(%6, dtype="bool");                                // 25
-            %16 = concatenate(%13);                                      // 39
-            %while_loop(%14, %max_count, %15, %4, %16)                   // 40
-          } else {
-            (%iter_count, %max_count, %cond_in, %y_in, %scan_out)        // 41
-          }                                                              // 42
-        };                                                               // 43
-        %while_loop                                                      // 44
-      );                                                                 // 45
-      %18 = %17(0i64, %trip_count, %cond, %y, meta[relay.Constant][1]);  // 48
-      %19 = %18.3;                                                       // 49
-      %20 = %18.4;                                                       // 50
-      (%19, %20)                                                         // 51
-    }                                                                    // 52
-  )";
-  return ParseModule("string", kModel, /*init_module=*/{}, metadata);
-}
-
-TEST(IndexedGraph, RecursiveExprRegression) {
-  IRModule ir_mod = TestRecursiveIRModule();
-  auto main = Downcast<Function>(ir_mod->Lookup("main"));
-  auto graph = CreateIndexedGraph(main);
-  graph->CheckValid();
-
-  {
-    // Dataflow node properties for %4
-    auto node = graph->index_to_node(21);
-    const auto* call_node = node->ref().as<CallNode>();
-    ASSERT_NE(call_node, nullptr);
-    const auto* op_node = call_node->op.as<OpNode>();
-    ASSERT_NE(op_node, nullptr);
-    ASSERT_EQ(op_node->name, "add");
-
-    // 3 inputs (the op itself is an input)
-    ASSERT_EQ(node->inputs_.size(), 3);
-    ASSERT_EQ(node->inputs_[0]->index_, 15);  // the add op
-    ASSERT_EQ(node->inputs_[1]->index_, 6);   // %y_in
-    ASSERT_EQ(node->inputs_[2]->index_, 20);  // %3
-
-    // 3 outputs
-    ASSERT_EQ(node->outputs_.size(), 3);
-    ASSERT_EQ(node->outputs_[0]->index_, 23);  // %5
-    ASSERT_EQ(node->outputs_[1]->index_, 36);  // %10
-    ASSERT_EQ(node->outputs_[2]->index_, 40);  // recursive %while_loop call
-
-    // In the 'if' basic block
-    ASSERT_EQ(node->basic_block_->index_, 42);
-
-    // Dominator 'parent' is recursive call
-    ASSERT_EQ(node->dominator_parent_->index_, 40);
-
-    // One dominator child from %3
-    ASSERT_EQ(node->dominator_children_.size(), 1);
-    ASSERT_EQ(node->dominator_children_[0]->index_, 20);
-  }
-
-  {
-    // The recursive call to %while_loop does not depend on %while_loop
-    auto node = graph->index_to_node(40);
-    const auto* call_node = node->ref().as<CallNode>();
-    ASSERT_NE(call_node, nullptr);
-    const auto* var_node = call_node->op.as<VarNode>();
-    ASSERT_NE(var_node, nullptr);
-    ASSERT_EQ(var_node->name_hint(), "while_loop");
-
-    ASSERT_EQ(node->inputs_.size(), 5);
-    ASSERT_EQ(node->inputs_[0]->index_, 17);  // %14
-    ASSERT_EQ(node->inputs_[1]->index_, 4);   // %max_count
-    ASSERT_EQ(node->inputs_[2]->index_, 25);  // %15
-    ASSERT_EQ(node->inputs_[3]->index_, 21);  // %4
-    ASSERT_EQ(node->inputs_[4]->index_, 39);  // %16
-  }
-
-  {
-    // Downstream nodes of %18
-    auto node = graph->index_to_node(48);
-    std::unordered_set<const IndexedGraph<Expr>::Node*> downstreams;
-    node->AccumulateDownstreamNodes(&downstreams);
-    ASSERT_EQ(downstreams.size(), 4);
-    for (const auto* downstream : downstreams) {
-      ASSERT_TRUE(downstream->index_ >= 49 && downstream->index_ <= 52);
-    }
-  }
-
-  {
-    // Dominates relation for %4
-    auto upstream = graph->index_to_node(21);
-    // Path 1: 21->23->24->25->40
-    // Path 2: 21->36->37->38->39->40
-    // Then 40->43
-    auto downstream = graph->index_to_node(43);
-    ASSERT_TRUE(downstream->Dominates(upstream));
-  }
-}
-
-// A module with unused let-bound function. The 'add' operator should have no dominator
-// since it is used both in the unused function and in the main body.
-IRModule TestUnusedLetBoundIRModule() {
-  constexpr const char* kModel = R"(
-    #[version = "0.0.5"]
-    def @main(%x: int64) -> int64 {   // 0
-      let %f = fn (                   // 5
-        %y: int64                     // 1
-      ) {
-        add(%x, %y)                   // 3
-      };
-      if (less(%x, 5i64)) {
-        add(%x, 3i64)                 // 10
-      } else {
-        %x
-      }
-    }
-  )";
-  return ParseModule("string", kModel);
-}
-
-TEST(IndexedGraph, UnusedLetVars) {
-  IRModule ir_mod = TestUnusedLetBoundIRModule();
-  auto main = Downcast<Function>(ir_mod->Lookup("main"));
-  auto graph = CreateIndexedGraph(main);
-  graph->CheckValid();
-
-  {
-    auto node = graph->index_to_node(2);
-    const auto* op_node = node->ref().as<OpNode>();
-    ICHECK(op_node);
-    ICHECK_EQ(op_node->name, "add");
-    ICHECK_EQ(node->outputs_.size(), 2);
-    ICHECK_EQ(node->outputs_[0]->index_, 3);
-    ICHECK_EQ(node->outputs_[1]->index_, 10);
-    ICHECK(node->dominator_parent_ == nullptr);
-  }
-}
-
-}  // namespace
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/op/memory/on_device_test.cc b/tests/cpp/relay/op/memory/on_device_test.cc
deleted file mode 100644
index 6f0a0b0d8beb..000000000000
--- a/tests/cpp/relay/op/memory/on_device_test.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "../../../../../src/relay/op/memory/on_device.h"
-
-#include <gtest/gtest.h>
-
-#include <string>
-
-namespace tvm {
-namespace relay {
-
-TEST(OnDeviceOp, Name) { EXPECT_EQ(OnDeviceOp()->name, "on_device"); }
-
-TEST(OnDevice, Default) {
-  Var body("x", {});
-  VirtualDevice virtual_device = VirtualDevice::ForDeviceType(kDLCPU, 3);
-  Call call = OnDevice(body, virtual_device);
-  EXPECT_EQ(call->op, OnDeviceOp());
-  EXPECT_EQ(call->args.size(), 1);
-  EXPECT_EQ(call->args[0], body);
-  const auto* attrs = call->attrs.as<OnDeviceAttrs>();
-  ASSERT_TRUE(attrs != nullptr);
-  EXPECT_EQ(attrs->virtual_device, virtual_device);
-  EXPECT_FALSE(attrs->constrain_result);
-  EXPECT_TRUE(attrs->constrain_body);
-}
-
-TEST(OnDevice, Fixed) {
-  Var body("x", {});
-  VirtualDevice virtual_device = VirtualDevice::ForDeviceType(kDLCPU, 3);
-  Call call = OnDevice(body, virtual_device, /*constrain_result=*/true);
-  const auto* attrs = call->attrs.as<OnDeviceAttrs>();
-  ASSERT_TRUE(attrs != nullptr);
-  EXPECT_TRUE(attrs->constrain_result);
-  EXPECT_TRUE(attrs->constrain_body);
-}
-
-TEST(OnDevice, Free) {
-  Var body("x", {});
-  VirtualDevice virtual_device = VirtualDevice::ForDeviceType(kDLCPU, 3);
-  Call call = OnDevice(body, virtual_device, /*constrain_result=*/false, /*constrain_body=*/false);
-  const auto* attrs = call->attrs.as<OnDeviceAttrs>();
-  ASSERT_TRUE(attrs != nullptr);
-  EXPECT_FALSE(attrs->constrain_result);
-  EXPECT_FALSE(attrs->constrain_body);
-}
-
-TEST(GetOnDeviceProps, Correct) {
-  Var body("x", {});
-  VirtualDevice virtual_device = VirtualDevice::ForDeviceType(kDLCPU, 3);
-  Call call = OnDevice(body, virtual_device, /*constrain_result=*/true, /*constrain_body=*/false);
-  OnDeviceProps props = GetOnDeviceProps(call);
-  ASSERT_TRUE(props.body.defined());
-  ASSERT_EQ(props.virtual_device, virtual_device);
-  ASSERT_TRUE(props.constrain_result);
-  ASSERT_FALSE(props.constrain_body);
-}
-
-TEST(MaybeOnDevice, Wrapped) {
-  VirtualDevice virtual_device = VirtualDevice::ForDeviceType(kDLCPU, 3);
-  Var body("x", {});
-  Call inner = OnDevice(body, virtual_device);
-  Call outer = OnDevice(inner, virtual_device);
-  OnDeviceProps props = GetOnDeviceProps(outer);
-  ASSERT_TRUE(props.body.defined());
-  ASSERT_EQ(props.virtual_device, virtual_device);
-  ASSERT_FALSE(props.constrain_result);
-  ASSERT_TRUE(props.constrain_body);
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/transforms/device_domains_test.cc b/tests/cpp/relay/transforms/device_domains_test.cc
deleted file mode 100644
index 47e303996b3b..000000000000
--- a/tests/cpp/relay/transforms/device_domains_test.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Just a smoke test for the device planner's unification domain, mostly to tease out how we'd
- * like to organize our cpp unit tests for functionality that's not obviously a Pass or should
- * be exposed via FFI.
- */
-
-// TODO(mbs): Revisit cpp unit test layout or setup include dir at root of src/
-#include "../../../../src/relay/transforms/device_domains.h"
-
-#include <gtest/gtest.h>
-#include <tvm/relay/parser.h>
-#include <tvm/relay/transform.h>
-
-namespace tvm {
-namespace relay {
-namespace transform {
-namespace {
-
-IRModule TestModule() {
-  return InferType()(ParseModule("test", R"(
-    #[version = "0.0.5"]
-    def @f(%x : Tensor[(3, 7), float32], %y : Tensor[(3, 7), float32]) {
-      add(%x, %y)
-    }
-  )"));
-}
-
-TEST(DeviceDomains, SmokeTest) {
-  VirtualDevice cpu = VirtualDevice::ForDeviceType(kDLCPU);
-  VirtualDevice cuda = VirtualDevice::ForDeviceType(kDLCUDA);
-  transform::PassContext ctxt = transform::PassContext::Create();
-  CompilationConfig config(ctxt, {Target("llvm"), Target("cuda")});
-  DeviceDomains domains(config);
-  IRModule mod = TestModule();
-  Function f = Downcast<Function>(mod->Lookup("f"));
-
-  DeviceDomainPtr actual_add_domain = domains.DomainForCallee(Downcast<Call>(f->body));
-  DeviceDomainPtr x_domain = domains.DomainFor(f->params[0]);
-  DeviceDomainPtr y_domain = domains.DomainFor(f->params[1]);
-  DeviceDomainPtr result_domain = domains.Free(f->ret_type);
-  std::vector<DeviceDomainPtr> arg_and_results;
-  arg_and_results.push_back(x_domain);
-  arg_and_results.push_back(y_domain);
-  arg_and_results.push_back(result_domain);
-  DeviceDomainPtr implied_add_domain = domains.MakeHigherOrderDomain(std::move(arg_and_results));
-  EXPECT_FALSE(domains.UnifyOrNull(actual_add_domain, implied_add_domain) == nullptr);
-  EXPECT_FALSE(domains.UnifyOrNull(x_domain, domains.ForVirtualDevice(f->params[0]->checked_type(),
-                                                                      cuda)) == nullptr);
-
-  EXPECT_EQ(domains.ResultVirtualDevice(y_domain), config->CanonicalVirtualDevice(cuda));
-  EXPECT_EQ(domains.ResultVirtualDevice(result_domain), config->CanonicalVirtualDevice(cuda));
-}
-
-}  // namespace
-}  // namespace transform
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay/with_fields_test.cc b/tests/cpp/relay/with_fields_test.cc
deleted file mode 100644
index 6114fa97a9fd..000000000000
--- a/tests/cpp/relay/with_fields_test.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and lixmitations
- * under the License.
- */
-
-/*!
- * \brief Proof-of-concept unit tests for the family of WithFields helpers.
- * Only Call, GlobalVar and Constant are currently tested.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/relay/adt.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/parser.h>
-
-namespace tvm {
-namespace relay {
-namespace {
-
-IRModule TestIRModule() {
-  return ParseModule("string",
-                     R"(
-    #[version = "0.0.5"]
-    def @main(%data : Tensor[(1, 304, 128, 128), float32],
-             %weight1 : Tensor[(304, 1, 3, 3), float32],
-             %bias1 : Tensor[(304), float32],
-             %weight2 : Tensor[(256, 304, 1, 1), float32],
-             %bias2 : Tensor[(256), float32]) -> Tensor[(1, 256, 128, 128), float32] {
-      %0 = nn.conv2d(%data, %weight1, padding=[1, 1, 1, 1], groups=304, channels=304, kernel_size=[3, 3]);
-      %1 = nn.bias_add(%0, %bias1);
-      %2 = nn.relu(%1);
-      %3 = nn.conv2d(%2, %weight2, padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-      %4 = nn.bias_add(%3, %bias2);
-      nn.relu(%4)
-    }
-  )");
-}
-
-Function TestFunction() { return Downcast<Function>(TestIRModule()->Lookup("main")); }
-Call TestCall() { return Downcast<Call>(TestFunction()->body); }
-GlobalVar TestGlobalVar() { return TestIRModule()->GetGlobalVar("main"); }
-VirtualDevice TestVirtualDevice() { return VirtualDevice::ForDevice({kDLCUDA, 3}); }
-Span TestSpan() { return Span(SourceName::Get("foo"), 3, 4, 6, 42); }
-Constant TestConstant() {
-  return Constant(runtime::NDArray::Empty({}, DataType::Int(32), {kDLCPU, 0}));
-}
-
-//
-// Call
-//
-
-TEST(WithFields, Call_Noop) {
-  Call call = TestCall();
-  Call result = WithFields(call);
-  ASSERT_TRUE(result.same_as(call));
-}
-
-TEST(WithFields, Call_Op) {
-  Call call = TestCall();
-  Op new_op = Op::Get("tanh");
-  Call result = WithFields(call, new_op);
-  ASSERT_FALSE(result.same_as(call));
-  ASSERT_FALSE(call->op.same_as(new_op));
-  ASSERT_TRUE(result->op.same_as(new_op));
-}
-
-TEST(WithFields, Call_Args) {
-  Call call = TestCall();
-  Array<Expr> new_args = {Tuple(Array<Expr>())};
-  Call result = WithFields(call, /*opt_op=*/{}, new_args);
-  ASSERT_FALSE(result.same_as(call));
-  ASSERT_FALSE(call->args.same_as(new_args));
-  ASSERT_TRUE(result->args.same_as(new_args));
-}
-
-TEST(WithFields, Call_Attrs) {
-  Call call = TestCall();
-  Attrs new_attrs = DictAttrs(Map<String, ObjectRef>());
-  Call result = WithFields(call, /*opt_op=*/{}, /*opt_args=*/{}, new_attrs);
-  ASSERT_FALSE(result.same_as(call));
-  ASSERT_FALSE(call->attrs.same_as(new_attrs));
-  ASSERT_TRUE(result->attrs.same_as(new_attrs));
-}
-
-TEST(WithFields, Call_TypeArgs) {
-  Call call = TestCall();
-  Array<Type> new_type_args;
-  Call result = WithFields(call, /*opt_op=*/{}, /*opt_args=*/{}, /*opt_attrs=*/{}, new_type_args);
-  ASSERT_FALSE(result.same_as(call));
-  ASSERT_FALSE(call->type_args.same_as(new_type_args));
-  ASSERT_TRUE(result->type_args.same_as(new_type_args));
-}
-
-TEST(WithFields, Call_VirtualDevice) {
-  Call call = TestCall();
-  VirtualDevice new_virtual_device = TestVirtualDevice();
-  Call result = WithFields(call, /*opt_op=*/{}, /*opt_args=*/{}, /*opt_attrs=*/{},
-                           /*opt_type_args=*/{}, new_virtual_device);
-  ASSERT_FALSE(result.same_as(call));
-  ASSERT_FALSE(call->virtual_device().same_as(new_virtual_device));
-  ASSERT_TRUE(result->virtual_device().same_as(new_virtual_device));
-}
-
-TEST(WithFields, Call_Span) {
-  Call call = TestCall();
-  Span new_span = TestSpan();
-  Call result = WithFields(call, /*opt_op=*/{}, /*opt_args=*/{}, /*opt_attrs=*/{},
-                           /*opt_type_args=*/{}, /*opt_virtual_device=*/{}, new_span);
-  ASSERT_FALSE(result.same_as(call));
-  ASSERT_FALSE(call->span.same_as(new_span));
-  ASSERT_TRUE(result->span.same_as(new_span));
-}
-
-//
-// GlobalVar
-//
-
-TEST(WithFields, GlobalVar_Noop) {
-  GlobalVar gv = TestGlobalVar();
-  GlobalVar result = WithFields(gv);
-  ASSERT_TRUE(result.same_as(gv));
-}
-
-TEST(WithFields, GlobalVar_Name) {
-  GlobalVar gv = TestGlobalVar();
-  String new_name("foo");
-  GlobalVar result = WithFields(gv, new_name);
-  ASSERT_FALSE(result.same_as(gv));
-  ASSERT_FALSE(gv->name_hint.same_as(new_name));
-  ASSERT_TRUE(result->name_hint.same_as(new_name));
-}
-
-TEST(WithFields, GlobalVar_Type) {
-  GlobalVar gv = TestGlobalVar();
-  Type new_type = TupleType(Array<Type>());
-  GlobalVar result = WithFields(gv, /*opt_name_hint=*/{}, new_type);
-  ASSERT_FALSE(result.same_as(gv));
-  ASSERT_FALSE(gv->checked_type().same_as(new_type));
-  ASSERT_TRUE(result->checked_type().same_as(new_type));
-}
-
-TEST(WithFields, GlobalVar_VirtualDevice) {
-  GlobalVar gv = TestGlobalVar();
-  VirtualDevice new_virtual_device = TestVirtualDevice();
-  GlobalVar result = WithFields(gv, /*opt_name_hint=*/{}, /*opt_type=*/{}, new_virtual_device);
-  ASSERT_FALSE(result.same_as(gv));
-  ASSERT_FALSE(gv->virtual_device().same_as(new_virtual_device));
-  ASSERT_TRUE(result->virtual_device().same_as(new_virtual_device));
-}
-
-TEST(WithFields, GlobalVar_Span) {
-  GlobalVar gv = TestGlobalVar();
-  Span new_span = TestSpan();
-  GlobalVar result =
-      WithFields(gv, /*opt_name_hint=*/{}, /*opt_type=*/{}, /*opt_virtual_device=*/{}, new_span);
-  ASSERT_FALSE(result.same_as(gv));
-  ASSERT_FALSE(gv->span.same_as(new_span));
-  ASSERT_TRUE(result->span.same_as(new_span));
-}
-
-//
-// Constant
-//
-
-TEST(WithFields, Constant_Noop) {
-  Constant constant = TestConstant();
-  Constant result = WithFields(constant);
-  ASSERT_TRUE(result.same_as(constant));
-}
-
-TEST(WithFields, Constant_Data) {
-  Constant constant = TestConstant();
-  runtime::NDArray new_data = runtime::NDArray::Empty({}, DataType::Float(32), {kDLCPU, 0});
-  Constant result = WithFields(constant, new_data);
-  ASSERT_FALSE(result.same_as(constant));
-  ASSERT_FALSE(constant->data.same_as(new_data));
-  ASSERT_TRUE(result->data.same_as(new_data));
-}
-
-TEST(WithFields, Constant_VirtualDevice) {
-  Constant constant = TestConstant();
-  VirtualDevice new_virtual_device = TestVirtualDevice();
-  Constant result = WithFields(constant, /*opt_data=*/{}, new_virtual_device);
-  ASSERT_FALSE(result.same_as(constant));
-  ASSERT_FALSE(constant->virtual_device().same_as(new_virtual_device));
-  ASSERT_TRUE(result->virtual_device().same_as(new_virtual_device));
-}
-
-TEST(WithFields, Constant_Span) {
-  Constant constant = TestConstant();
-  Span new_span = TestSpan();
-  Constant result = WithFields(constant, /*opt_data=*/{}, /*opt_virtual_device=*/{}, new_span);
-  ASSERT_FALSE(result.same_as(constant));
-  ASSERT_FALSE(constant->span.same_as(new_span));
-  ASSERT_TRUE(result->span.same_as(new_span));
-}
-
-}  // namespace
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc
deleted file mode 100644
index 103989fc779d..000000000000
--- a/tests/cpp/relay_build_module_test.cc
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/driver/driver_api.h>
-#include <tvm/ir/memory_pools.h>
-#include <tvm/ir/module.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/executor.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/op_strategy.h>
-#include <tvm/relay/runtime.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/type.h>
-#include <tvm/runtime/executor_info.h>
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/broadcast.h>
-#include <tvm/topi/generic/injective.h>
-
-using namespace tvm;
-using namespace tvm::relay;
-
-TVM_REGISTER_GLOBAL("test.strategy")
-    .set_body_typed([](const Attrs& attrs, const Array<te::Tensor>& inputs, const Type& out_type,
-                       const Target& target) {
-      FTVMCompute fcompute = [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                                const Type& out_type) -> Array<te::Tensor> {
-        ICHECK_EQ(inputs.size(), 2U);
-        return {topi::add(inputs[0], inputs[1])};
-      };
-      FTVMSchedule fschedule = [](const Attrs& attrs, const Array<te::Tensor>& outs,
-                                  const Target& target) {
-        With<Target> target_scope(target);
-        return topi::generic::schedule_injective(target, outs);
-      };
-
-      auto n = make_object<OpStrategyNode>();
-      auto strategy = tvm::relay::OpStrategy(std::move(n));
-      strategy.AddImplementation(fcompute, fschedule, "test.strategy", 10);
-      return strategy;
-    });
-
-TVM_REGISTER_GLOBAL("relay.backend.lower_call")
-    .set_body_typed([](const relay::Call& call, const Array<te::Tensor>& inputs,
-                       const Target& target) {
-      static auto fstrategy = Op::GetAttrMap<relay::FTVMStrategy>("FTVMStrategy");
-      Op op = Downcast<Op>(call->op);
-      auto out_type = call->checked_type();
-      OpStrategy strategy = fstrategy[op](call->attrs, inputs, out_type, target);
-      auto impl = strategy->specializations[0]->implementations[0];
-      auto outs = impl.Compute(call->attrs, inputs, out_type);
-      auto f = tvm::runtime::Registry::Get("relay.backend._make_LoweredOutput");
-      if (!f) {
-        LOG(FATAL) << "relay.backend._make_LoweredOutput is not registered";
-      }
-      return (*f)(outs, impl);
-    });
-
-TEST(Relay, BuildModule) {
-  auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32));
-  auto a = relay::Var("a", tensor_type);
-  auto b = relay::Var("b", tensor_type);
-  auto add_op = relay::Op::Get("add");
-  auto x = relay::Call(add_op, {a, b}, tvm::Attrs(), {});
-  auto c = relay::Var("c", tensor_type);
-  auto y = relay::Call(add_op, {x, c}, tvm::Attrs(), {});
-  auto func = relay::Function(relay::FreeVars(y), y, relay::Type(), {});
-  auto A = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto B = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto C = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-
-  auto pA = static_cast<float*>(A->data);
-  auto pB = static_cast<float*>(B->data);
-  auto pC = static_cast<float*>(C->data);
-
-  for (int i = 0; i < 6; ++i) {
-    pA[i] = i;
-    pB[i] = i + 1;
-    pC[i] = i + 2;
-  }
-  // get schedule
-  auto reg = tvm::runtime::Registry::Get("ir.RegisterOpAttr");
-  if (!reg) {
-    LOG(FATAL) << "no _Register";
-  }
-  auto reset = tvm::runtime::Registry::Get("ir.OpResetAttr");
-  if (!reset) {
-    LOG(FATAL) << "Reset is not defined.";
-  }
-  auto fs = tvm::runtime::Registry::Get("test.strategy");
-  if (!fs) {
-    LOG(FATAL) << "No test_strategy registered.";
-  }
-  auto fgeneric = GenericFunc::Get("test.strategy_generic").set_default(*fs, true);
-  (*reset)(add_op, "FTVMStrategy");
-  (*reg)("add", "FTVMStrategy", fgeneric, 10);
-  Array<Integer> dep;
-  dep.push_back(0);
-  (*reset)(add_op, "TShapeDataDependent");
-  (*reg)("add", "TShapeDataDependent", dep, 10);
-  // build
-  auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule");
-  tvm::runtime::Module build_mod = (*pfb)();
-  auto build_f = build_mod.GetFunction("build", false);
-  auto json_f = build_mod.GetFunction("get_graph_json", false);
-  auto mod_f = build_mod.GetFunction("get_module", false);
-  Target llvm_tgt = Target("llvm");
-  Array<Target> targets = {llvm_tgt};
-  auto relay_mod = tvm::IRModule::FromExpr(func);
-  ICHECK(relay_mod.defined()) << "Module must be defined";
-  build_f(relay_mod, targets, llvm_tgt, Executor::Create("graph"), Runtime::Create("cpp"),
-          WorkspaceMemoryPools(), ConstantMemoryPools(), "");
-  std::string json = json_f();
-  tvm::runtime::Module mod = mod_f();
-  // run
-  auto dev = A->device;
-  auto pfr = tvm::runtime::Registry::Get("tvm.graph_executor.create");
-  ICHECK(mod.defined()) << "Module must be defined";
-  tvm::runtime::Module run_mod =
-      (*pfr)(json, mod, static_cast<int>(dev.device_type), dev.device_id);
-  auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false);
-  auto run_f = run_mod.GetFunction("run", false);
-  auto get_output_f = run_mod.GetFunction("get_output", false);
-  set_input_f("a", const_cast<DLTensor*>(A.operator->()));
-  set_input_f("b", const_cast<DLTensor*>(B.operator->()));
-  set_input_f("c", const_cast<DLTensor*>(C.operator->()));
-  run_f();
-  tvm::runtime::NDArray Y = get_output_f(0);
-  auto pY = static_cast<float*>(Y->data);
-  for (int i = 0; i < 6; ++i) {
-    ICHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4);
-  }
-  // mutate the input a bit and run it again
-  for (int i = 0; i < 6; ++i) {
-    pB[i] = i + 3;
-  }
-  run_f();
-  tvm::runtime::NDArray Y2 = get_output_f(0);
-  auto pY2 = static_cast<float*>(Y2->data);
-  for (int i = 0; i < 6; ++i) {
-    ICHECK_LT(fabs(pY2[i] - (i + (i + 3) + (i + 2))), 1e-4);
-  }
-  // attach a different input and run it again
-  auto C2 = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto pC2 = static_cast<float*>(C2->data);
-  for (int i = 0; i < 6; ++i) {
-    pC2[i] = i + 4;
-  }
-  set_input_f("c", const_cast<DLTensor*>(C2.operator->()));
-  run_f();
-  tvm::runtime::NDArray Y3 = get_output_f(0);
-  auto pY3 = static_cast<float*>(Y3->data);
-  for (int i = 0; i < 6; ++i) {
-    ICHECK_LT(fabs(pY3[i] - (i + (i + 3) + (i + 4))), 1e-4);
-  }
-}
-
-TEST(Relay, GetExprRefCount) {
-  auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32));
-  auto a = relay::Var("a", tensor_type);
-  auto add_op = relay::Op::Get("add");
-  auto relu_op = relay::Op::Get("nn.relu");
-  auto x = relay::Call(relu_op, {a}, tvm::Attrs(), {});
-  auto y = relay::Call(relu_op, {x}, tvm::Attrs(), {});
-  auto z = relay::Call(add_op, {y, x}, tvm::Attrs(), {});
-  auto ref_count = GetExprRefCount(z);
-  ICHECK(ref_count[a.get()] == 1);
-  ICHECK(ref_count[relu_op.get()] == 2);
-  ICHECK(ref_count[add_op.get()] == 1);
-  ICHECK(ref_count[x.get()] == 2);
-  ICHECK(ref_count[y.get()] == 1);
-  ICHECK(ref_count[z.get()] == 1);
-}
diff --git a/tests/cpp/relay_dismantler_test.cc b/tests/cpp/relay_dismantler_test.cc
deleted file mode 100644
index ae95185cb287..000000000000
--- a/tests/cpp/relay_dismantler_test.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#include <gtest/gtest.h>
-#include <tvm/ir/expr.h>
-#include <tvm/ir/type_functor.h>
-#include <tvm/node/functor.h>
-#include <tvm/node/structural_equal.h>
-#include <tvm/relay/adt.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/op.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/op_strategy.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/type.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/broadcast.h>
-#include <tvm/topi/generic/injective.h>
-
-#include <memory>
-
-using namespace tvm;
-using namespace tvm::relay;
-
-TEST(Relay, OutOfStack_add) {
-  auto foo = [] {
-    auto add_op = relay::Op::Get("add");
-    auto c_data = tvm::runtime::NDArray::Empty({1, 2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-    auto c1 = relay::Constant(c_data);
-    Call y1 = relay::Call(add_op, {c1, c1});
-    for (int i = 0; i < 1e6; i++) {
-      y1 = relay::Call(add_op, {c1, y1});
-    }
-    relay::Function func = relay::Function({}, y1, relay::Type(), {});
-  };
-  ASSERT_EXIT((foo(), exit(0)), ::testing::ExitedWithCode(0), ".*");
-}
-
-TEST(Relay, OutOfStack_cast) {
-  auto foo = [] {
-    auto cast_op = relay::Op::Get("cast");
-    auto c_data = tvm::runtime::NDArray::Empty({1, 2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-    auto c1 = relay::Constant(c_data);
-    Call y1 = relay::Call(cast_op, {c1});
-    for (int i = 0; i < 1e6; i++) {
-      y1 = relay::Call(cast_op, {y1});
-    }
-    relay::Function func = relay::Function({}, y1, relay::Type(), {});
-  };
-  ASSERT_EXIT((foo(), exit(0)), ::testing::ExitedWithCode(0), ".*");
-}
-
-TEST(Relay, OutOfStack_packed_func) {
-  constexpr int len = 1e6;
-  auto foo = [] {
-    auto x = relay::Var("x", relay::TensorType({3, 2}, DataType::Float(32)));
-    auto one = relay::Constant(tvm::runtime::NDArray::Empty({1}, {kDLFloat, 32, 1}, {kDLCPU, 0}));
-    auto add_func = tvm::runtime::Registry::Get("relay.op._make.add");
-    auto y = (*add_func)(x, one);
-    for (int i = 0; i < len; ++i) {
-      y = (*add_func)(y, one);
-    }
-
-    // check if still reachable
-    int k = 0;
-    Expr e = y;
-    while (e.defined() && e.as<CallNode>() != nullptr) {
-      e = e.as<CallNode>()->args[0];
-      ++k;
-    }
-    ASSERT_EQ(len + 1, k);
-  };
-  ASSERT_EXIT((foo(), exit(0)), ::testing::ExitedWithCode(0), ".*");
-}
-
-TEST(Relay, CallNodeSharedArgs) {
-  auto x = relay::Var("x", relay::TensorType({3, 2}, DataType::Float(32)));
-  auto one = relay::Constant(tvm::runtime::NDArray::Empty({1}, {kDLFloat, 32, 1}, {kDLCPU, 0}));
-  auto relu_op = relay::Op::Get("nn.relu");
-  Call y = relay::Call(relu_op, {x}, Attrs(), {});
-  y = relay::Call(relu_op, {y}, Attrs(), {});
-  ASSERT_EQ(1, y.get()->args[0].as<CallNode>()->args.size());
-  y = relay::Call(y.get()->op, y.get()->args, y.get()->attrs, y.get()->type_args);
-  ASSERT_EQ(1, y.get()->args[0].as<CallNode>()->args.size());
-}
-
-TEST(Relay, TupleSharedFields) {
-  auto x = relay::Var("x", relay::TensorType({3, 2}, DataType::Float(32)));
-  auto one = relay::Constant(tvm::runtime::NDArray::Empty({1}, {kDLFloat, 32, 1}, {kDLCPU, 0}));
-  auto relu_op = relay::Op::Get("nn.relu");
-  Expr y = relay::Call(relu_op, {x}, Attrs(), {});
-  y = relay::Call(relu_op, {y}, Attrs(), {});
-  {
-    Expr y1 = relay::Tuple(y.as<CallNode>()->args);
-    Expr y2 = relay::Tuple(y.as<CallNode>()->args);
-
-    y1 = relay::Call(relu_op, {y1});
-    y2 = relay::Call(relu_op, {y2});
-    y = y1;
-  }
-  ASSERT_EQ(1, y.as<CallNode>()->args[0].as<TupleNode>()->fields[0].as<CallNode>()->args.size());
-}
-
-TEST(Relay, TupleiGetItemSharedTuple) {
-  auto x = relay::Var("x", relay::TensorType({3, 2}, DataType::Float(32)));
-  auto one = relay::Constant(tvm::runtime::NDArray::Empty({1}, {kDLFloat, 32, 1}, {kDLCPU, 0}));
-  auto relu_op = relay::Op::Get("nn.relu");
-  Expr y = relay::Call(relu_op, {x}, Attrs(), {});
-  y = relay::Tuple({y});
-  {
-    Expr y1 = relay::TupleGetItem(y, 0);
-    Expr y2 = relay::TupleGetItem(y, 0);
-
-    y1 = relay::Call(relu_op, {y1});
-    y2 = relay::Call(relu_op, {y2});
-    y = y1;
-  }
-  ASSERT_EQ(1, y.as<CallNode>()
-                   ->args[0]
-                   .as<TupleGetItemNode>()
-                   ->tuple.as<TupleNode>()
-                   ->fields[0]
-                   .as<CallNode>()
-                   ->args.size());
-}
-
-TEST(Relay, OutOfStackLet) {
-  auto foo = [] {
-    auto add_op = relay::Op::Get("add");
-    auto p = relay::Var("p", relay::TensorType({3, 2}, DataType::Float(32)));
-    int size = 1e6 - 1;
-    std::vector<relay::Var> vars;
-    for (int i = 0; i < size; ++i) {
-      vars.emplace_back("x_" + std::to_string(i), relay::TensorType({3, 2}, DataType::Float(32)));
-    }
-    Expr body = vars[size - 1];
-    for (int i = size - 1; i >= 0; --i) {
-      Var v = i == 0 ? p : vars[i - 1];
-      body = relay::Let(vars[i], relay::Call(add_op, {v, v}), body);
-    }
-    relay::Function func = relay::Function({p}, body, relay::Type(), {});
-  };
-  ASSERT_EXIT((foo(), exit(0)), ::testing::ExitedWithCode(0), ".*");
-}
diff --git a/tests/cpp/relay_pass_type_infer_test.cc b/tests/cpp/relay_pass_type_infer_test.cc
deleted file mode 100644
index 6db595281813..000000000000
--- a/tests/cpp/relay_pass_type_infer_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/node/structural_equal.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/type.h>
-#include <tvm/te/operation.h>
-
-TEST(Relay, SelfReference) {
-  using namespace tvm;
-  auto tensor_type = relay::TensorType({}, DataType::Bool());
-  auto x = relay::Var("x", relay::Type());
-  auto f = relay::Function(tvm::Array<relay::Var>{x}, x, relay::Type(), {});
-  ICHECK(f->IsInstance<BaseFuncNode>());
-  auto y = relay::Var("y", tensor_type);
-  auto call = relay::Call(f, Array<relay::Expr>{y});
-  auto fx = relay::Function(tvm::Array<relay::Var>{y}, call, relay::Type(), {});
-  auto mod = IRModule::FromExpr(fx);
-  mod = relay::transform::InferType()(mod);
-  auto type_fx = mod->Lookup("main");
-
-  auto expected = relay::FuncType(tvm::Array<relay::Type>{tensor_type}, tensor_type, {}, {});
-  ICHECK(tvm::StructuralEqual()(type_fx->checked_type(), expected));
-}
diff --git a/tests/cpp/relay_text_printer_test.cc b/tests/cpp/relay_text_printer_test.cc
deleted file mode 100644
index 58fa228f8a46..000000000000
--- a/tests/cpp/relay_text_printer_test.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/ir/expr.h>
-#include <tvm/ir/type_functor.h>
-#include <tvm/node/functor.h>
-#include <tvm/node/structural_equal.h>
-#include <tvm/relay/adt.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/function.h>
-#include <tvm/relay/op.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/op_strategy.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/type.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/broadcast.h>
-#include <tvm/topi/generic/injective.h>
-
-using namespace tvm;
-using namespace tvm::relay;
-
-TEST(Relay, LargeGraphPrint) {
-  auto foo = [] {
-    auto add_op = relay::Op::Get("add");
-    auto c_data = tvm::runtime::NDArray::Empty({1, 2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-    auto c1 = relay::Constant(c_data);
-    Call y1 = relay::Call(add_op, {c1, c1});
-    for (int i = 0; i < 1e6; i++) {
-      y1 = relay::Call(add_op, {c1, y1});
-    }
-    relay::Function func = relay::Function({}, y1, relay::Type(), {});
-    std::string result = AsText(func);
-    ASSERT_GT(0, result.size());
-  };
-  ASSERT_EXIT((foo(), exit(0)), ::testing::ExitedWithCode(0), ".*");
-}
diff --git a/tests/cpp/relay_transform_sequential_test.cc b/tests/cpp/relay_transform_sequential_test.cc
deleted file mode 100644
index c4e5db89de07..000000000000
--- a/tests/cpp/relay_transform_sequential_test.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/driver/driver_api.h>
-#include <tvm/ir/module.h>
-#include <tvm/node/structural_equal.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/op_strategy.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/type.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/broadcast.h>
-#include <tvm/topi/generic/injective.h>
-
-using namespace tvm;
-
-TVM_REGISTER_GLOBAL("test.seq.strategy")
-    .set_body_typed([](const Attrs& attrs, const Array<te::Tensor>& inputs, const Type& out_type,
-                       const Target& target) {
-      relay::FTVMCompute fcompute = [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                                       const Type& out_type) -> Array<te::Tensor> {
-        ICHECK_EQ(inputs.size(), 2U);
-        return {topi::add(inputs[0], inputs[1])};
-      };
-      relay::FTVMSchedule fschedule = [](const Attrs& attrs, const Array<te::Tensor>& outs,
-                                         const Target& target) {
-        With<Target> target_scope(target);
-        return topi::generic::schedule_injective(target, outs);
-      };
-
-      auto n = make_object<relay::OpStrategyNode>();
-      auto strategy = relay::OpStrategy(std::move(n));
-      strategy.AddImplementation(fcompute, fschedule, "test.strategy", 10);
-      return strategy;
-    });
-
-TEST(Relay, Sequential) {
-  auto tensor_type = relay::TensorType({1, 2, 3}, DataType::Float(32));
-  auto c_data = tvm::runtime::NDArray::Empty({1, 2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-
-  // Create a function for optimization.
-  auto c = relay::Constant(c_data);
-  auto a = relay::Var("a", tensor_type);
-  auto x = relay::Var("x", tensor_type);
-  auto add_op = relay::Op::Get("add");
-  auto y = relay::Call(add_op, {c, c});
-  y = relay::Call(add_op, {x, y});
-  auto z = relay::Call(add_op, {y, c});
-  auto z1 = relay::Call(add_op, {y, c});
-  auto z2 = relay::Call(add_op, {z, z1});
-  // Let expression and varaible a should be dead-code eliminated.
-  auto z3 = relay::Let(a, c, z2);
-  relay::Function func = relay::Function(relay::FreeVars(z3), z3, relay::Type(), {});
-
-  auto reg = tvm::runtime::Registry::Get("ir.RegisterOpAttr");
-  if (!reg) {
-    LOG(FATAL) << "Register is not defined.";
-  }
-  auto reset = tvm::runtime::Registry::Get("ir.OpResetAttr");
-  if (!reset) {
-    LOG(FATAL) << "Reset is not defined.";
-  }
-  auto fs = tvm::runtime::Registry::Get("test.seq.strategy");
-  if (!fs) {
-    LOG(FATAL) << "Strategy is not defined.";
-  }
-  auto fgeneric = GenericFunc::Get("test.strategy_generic").set_default(*fs, true);
-  (*reset)(add_op, "FTVMStrategy");
-  (*reg)("add", "FTVMStrategy", fgeneric, 10);
-
-  // Run sequential passes.
-  tvm::Array<relay::transform::Pass> pass_seqs{
-      relay::transform::InferType(), relay::transform::DeadCodeElimination(),
-      relay::transform::EliminateCommonSubexpr(), relay::transform::AlterOpLayout()};
-  relay::transform::Pass seq = relay::transform::Sequential(pass_seqs);
-  auto mod = IRModule::FromExpr(func);
-  auto pass_ctx = relay::transform::PassContext::Create();
-  pass_ctx->opt_level = 3;
-  pass_ctx->config.Set("relay.fallback_device_type", Integer(1));
-  {
-    tvm::With<relay::transform::PassContext> ctx_scope(pass_ctx);
-    tvm::With<tvm::Target> tctx(tvm::Target("llvm"));
-    mod = seq(mod);
-  }
-
-  ICHECK(mod.defined());
-  auto entry_func = mod->GetGlobalVar("main");
-  ICHECK(entry_func.defined());
-  relay::Function f = Downcast<relay::Function>(mod->Lookup("main"));
-  ICHECK(f.defined());
-
-  // Expected function
-  auto c1 = relay::Constant(c_data);
-  auto x1 = relay::Var("x", tensor_type);
-  auto y1 = relay::Call(add_op, {c1, c1});
-  y1 = relay::Call(add_op, {x1, y1});
-  auto zz = relay::Call(add_op, {y1, c1});
-  zz = relay::Call(add_op, {zz, zz});
-  relay::Function expected_func = relay::Function(relay::FreeVars(zz), zz, relay::Type(), {});
-
-  // Infer type for the expected function.
-  auto mod1 = IRModule::FromExpr(expected_func);
-  mod1 = relay::transform::InferType()(mod1);
-  auto expected = mod1->Lookup("main");
-  ICHECK(tvm::StructuralEqual()(f, expected));
-}
-
-TEST(PassContextListConfigs, Basic) {
-  Map<String, Map<String, String>> configs = relay::transform::PassContext::ListConfigs();
-  ICHECK_EQ(configs.empty(), false);
-
-  auto config = configs["relay.backend.use_auto_scheduler"];
-  ICHECK_EQ(config["type"], "IntImm");
-}
diff --git a/tests/cpp/runtime_test.cc b/tests/cpp/runtime_test.cc
deleted file mode 100644
index be81ded5d78b..000000000000
--- a/tests/cpp/runtime_test.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <gtest/gtest.h>
-#include <tvm/driver/driver_api.h>
-#include <tvm/ir/memory_pools.h>
-#include <tvm/ir/module.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/executor.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/op_strategy.h>
-#include <tvm/relay/runtime.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/type.h>
-#include <tvm/runtime/executor_info.h>
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/broadcast.h>
-#include <tvm/topi/generic/injective.h>
-
-using namespace tvm;
-using namespace tvm::relay;
-
-TVM_REGISTER_GLOBAL("runtime_test.strategy")
-    .set_body_typed([](const Attrs& attrs, const Array<te::Tensor>& inputs, const Type& out_type,
-                       const Target& target) {
-      FTVMCompute fcompute = [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                                const Type& out_type) -> Array<te::Tensor> {
-        ICHECK_EQ(inputs.size(), 2U);
-        return {topi::add(inputs[0], inputs[1])};
-      };
-      FTVMSchedule fschedule = [](const Attrs& attrs, const Array<te::Tensor>& outs,
-                                  const Target& target) {
-        With<Target> target_scope(target);
-        return topi::generic::schedule_injective(target, outs);
-      };
-
-      auto n = make_object<OpStrategyNode>();
-      auto strategy = tvm::relay::OpStrategy(std::move(n));
-      strategy.AddImplementation(fcompute, fschedule, "runtime_test.strategy", 10);
-      return strategy;
-    });
-
-TEST(Runtime, ZeroCopy) {
-  auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32));
-  auto a = relay::Var("a", tensor_type);
-  auto b = relay::Var("b", tensor_type);
-  auto add_op = relay::Op::Get("add");
-  auto x = relay::Call(add_op, {a, b}, tvm::Attrs(), {});
-  auto c = relay::Var("c", tensor_type);
-  auto y = relay::Call(add_op, {x, c}, tvm::Attrs(), {});
-  auto func = relay::Function(relay::FreeVars(y), y, relay::Type(), {});
-  auto A = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto B = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto C = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto Y = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-
-  auto pA = static_cast<float*>(A->data);
-  auto pB = static_cast<float*>(B->data);
-  auto pC = static_cast<float*>(C->data);
-  auto pY = static_cast<float*>(Y->data);
-
-  for (int i = 0; i < 6; ++i) {
-    pA[i] = i;
-    pB[i] = i + 1;
-    pC[i] = i + 2;
-  }
-  // get schedule
-  auto reg = tvm::runtime::Registry::Get("ir.RegisterOpAttr");
-  if (!reg) {
-    LOG(FATAL) << "no _Register";
-  }
-  auto reset = tvm::runtime::Registry::Get("ir.OpResetAttr");
-  if (!reset) {
-    LOG(FATAL) << "Reset is not defined.";
-  }
-  auto fs = tvm::runtime::Registry::Get("runtime_test.strategy");
-  if (!fs) {
-    LOG(FATAL) << "No test_strategy registered.";
-  }
-  auto fgeneric = GenericFunc::Get("runtime_test.strategy_generic").set_default(*fs, true);
-  (*reset)(add_op, "FTVMStrategy");
-  (*reg)("add", "FTVMStrategy", fgeneric, 10);
-  Array<Integer> dep;
-  dep.push_back(0);
-  (*reset)(add_op, "TShapeDataDependent");
-  (*reg)("add", "TShapeDataDependent", dep, 10);
-  // build
-  auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule");
-  tvm::runtime::Module build_mod = (*pfb)();
-  auto build_f = build_mod.GetFunction("build", false);
-  auto json_f = build_mod.GetFunction("get_graph_json", false);
-  auto mod_f = build_mod.GetFunction("get_module", false);
-  Target llvm_tgt = Target("llvm");
-  Array<Target> targets = {llvm_tgt};
-  auto relay_mod = tvm::IRModule::FromExpr(func);
-  ICHECK(relay_mod.defined()) << "Module must be defined";
-  build_f(relay_mod, targets, llvm_tgt, Executor::Create("graph"), Runtime::Create("cpp"),
-          WorkspaceMemoryPools(), ConstantMemoryPools(), "");
-  // create graph executor
-  std::string json = json_f();
-  tvm::runtime::Module mod = mod_f();
-  auto dev = A->device;
-  auto pfr = tvm::runtime::Registry::Get("tvm.graph_executor.create");
-  ICHECK(mod.defined()) << "Module must be defined";
-  tvm::runtime::Module run_mod =
-      (*pfr)(json, mod, static_cast<int>(dev.device_type), dev.device_id);
-  // get function
-  auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false);
-  auto set_output_f = run_mod.GetFunction("set_output_zero_copy", false);
-  auto run_f = run_mod.GetFunction("run", false);
-  // set input zero copy
-  set_input_f("a", const_cast<DLTensor*>(A.operator->()));
-  set_input_f("b", const_cast<DLTensor*>(B.operator->()));
-  set_input_f("c", const_cast<DLTensor*>(C.operator->()));
-  // set output zero copy
-  set_output_f(0, const_cast<DLTensor*>(Y.operator->()));
-  run_f();
-  // check correctness
-  for (int i = 0; i < 6; ++i) {
-    ICHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4);
-  }
-  // mutate the input a bit and run it again
-  for (int i = 0; i < 6; ++i) {
-    pB[i] = i + 3;
-  }
-  run_f();
-  // check correctness
-  for (int i = 0; i < 6; ++i) {
-    ICHECK_LT(fabs(pY[i] - (i + (i + 3) + (i + 2))), 1e-4);
-  }
-  // attach a different input and run it again
-  auto C2 = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0});
-  auto pC2 = static_cast<float*>(C2->data);
-  for (int i = 0; i < 6; ++i) {
-    pC2[i] = i + 4;
-  }
-  set_input_f("c", const_cast<DLTensor*>(C2.operator->()));
-  run_f();
-  // check correctness
-  for (int i = 0; i < 6; ++i) {
-    ICHECK_LT(fabs(pY[i] - (i + (i + 3) + (i + 4))), 1e-4);
-  }
-}
diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 4d10b01485a0..fdc753ca13b6 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -18,28 +18,3 @@
 set -euxo pipefail
 
 python3 -m pylint python/tvm --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/tvmscript/test_tvmscript_type.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/conftest.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_cblas.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_tflite_runtime.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_thrust.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_util.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_sort.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_sparse.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_tedd.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_rpc_tracker.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_rpc_server_device.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_rpc_proxy.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_rocblas.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_random.py --rcfile="$(dirname "$0")"/pylintrc
-
-# tests/python/contrib/test_hexagon tests
-python3 -m pylint tests/python/contrib/test_hexagon/*.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/conv2d/*.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/topi/*.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/metaschedule_e2e/*.py --rcfile="$(dirname "$0")"/pylintrc
-
-# tests/python/contrib/test_msc tests
-python3 -m pylint tests/python/contrib/test_msc/*.py --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_compute_dag.py b/tests/python/auto_scheduler/test_auto_scheduler_compute_dag.py
deleted file mode 100644
index d3b618d67586..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_compute_dag.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test ComputeDAG (replay, infer bound)"""
-import json
-import pickle
-
-import tvm
-from tvm import topi
-from tvm import auto_scheduler, te
-
-from tvm.testing.auto_scheduler import (
-    get_tiled_matmul,
-    invalid_compute_definition,
-    matmul_auto_scheduler_test,
-    parallel_matmul_auto_scheduler_test,
-)
-
-
-def test_apply_steps():
-    dag, s = get_tiled_matmul()
-    dag.print_python_code_from_state(s)
-    sch, tensors = dag.apply_steps_from_state(s)
-    tvm.lower(sch, tensors, simple_mode=True)
-
-
-def test_infer_bound():
-    dag, s = get_tiled_matmul()
-    s = dag.infer_bound_from_state(s)
-
-
-def test_estimate_flop():
-    N = 512
-    A, B, C = matmul_auto_scheduler_test(N, N, N)
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    assert abs(dag.flop_ct - 2 * N**3) < 0.5
-
-    D = topi.nn.relu(C)
-    dag = auto_scheduler.ComputeDAG([A, B, D])
-    assert abs(dag.flop_ct - (2 * N**3 + N * N)) < 0.5
-
-    # should not count the comparison operations in padding
-    E = topi.nn.pad(C, [1, 1])
-    dag = auto_scheduler.ComputeDAG([A, B, E])
-    assert abs(dag.flop_ct - 2 * N**3) < 0.5
-
-    F = te.compute((N, N), lambda i, j: E[i, j], name="F", attrs={"FLOP": 1234})
-    dag = auto_scheduler.ComputeDAG([A, B, F])
-    assert abs(dag.flop_ct - (2 * N**3 + 1234)) < 0.5
-
-    A = te.placeholder((N, N), dtype="float32", name="A")
-    F = te.compute((N, N), lambda i, j: te.if_then_else(A[i, j] > 0, A[i, j], 0))
-    dag = auto_scheduler.ComputeDAG([A, F])
-    assert abs(dag.flop_ct - N**2) < 0.5
-
-
-def test_stage_order():
-    """Test if the stage order is preserved when recovering a DAG."""
-    N = 512
-    A, B, C, D, E = parallel_matmul_auto_scheduler_test(N)
-    sch = te.create_schedule([D.op, E.op])
-    (D_local,) = sch.cache_write([D], "local")
-    (E_local,) = sch.cache_write([E], "local")
-    sch.cache_read(A, "shared", [D_local])
-    sch.cache_read(B, "shared", [D_local])
-    sch.cache_read(A, "shared", [E_local])
-    sch.cache_read(C, "shared", [E_local])
-
-    dag = auto_scheduler.ComputeDAG(sch)
-    stage_ops_1 = dag.get_init_state().stage_ops
-
-    # 3 placeholder, 4 x.shared, 2 {D,E}.local, 2 {D,E} compute
-    assert len(stage_ops_1) == 11
-
-    # Cache read stage should follow the source stage
-    for idx, op in enumerate(stage_ops_1):
-        if op.name == "A":
-            assert (
-                stage_ops_1[idx + 1].name == "A.d.shared"
-                and stage_ops_1[idx + 2].name == "A.shared"
-            )
-        elif op.name in ["B", "C"]:
-            assert stage_ops_1[idx + 1].name == "%s.shared" % op.name
-
-    # Apply the same schedule to Ansor state and it should have the same stage order
-    dag = auto_scheduler.ComputeDAG([A, B, C, D, E])
-    state = dag.get_init_state()
-
-    D_local = state.cache_write(D, "local")
-    E_local = state.cache_write(E, "local")
-    state.cache_read(A, "shared", [D_local])
-    state.cache_read(B, "shared", [D_local])
-    state.cache_read(A, "shared", [E_local])
-    state.cache_read(C, "shared", [E_local])
-
-    stage_ops_2 = state.stage_ops
-    assert len(stage_ops_1) == len(stage_ops_2)
-
-    # Cache read stage should follow the source stage
-    for op1, op2 in zip(stage_ops_1, stage_ops_2):
-        assert op1.name == op2.name
-
-    # Serialize and deserialize the ComputeDAG constructed by a list of tensor ops.
-    loaded_dag = pickle.loads(pickle.dumps(dag))
-    assert str(loaded_dag.get_init_state()) == str(dag.get_init_state())
-    assert len(loaded_dag.get_init_state().stage_ops) == len(dag.get_init_state().stage_ops)
-
-    # Serialize and deserialize the search task. Note that we intentionally skip hardware_params
-    # to test if the default one is serialized along with other attributes as well.
-    task = auto_scheduler.SearchTask(
-        compute_dag=dag, workload_key=json.dumps(("test-key",)), target=tvm.target.Target("llvm")
-    )
-
-    task2 = pickle.loads(pickle.dumps(task))
-    assert '["test-key"]' in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY
-    assert str(task.compute_dag.get_init_state()) == str(task2.compute_dag.get_init_state())
-    assert len(task.compute_dag.get_init_state().stage_ops) == len(
-        task2.compute_dag.get_init_state().stage_ops
-    )
-    assert task.workload_key == task2.workload_key
-    assert str(task.target) == str(task2.target)
-    assert task.hardware_params.num_cores == task2.hardware_params.num_cores
-    assert task.hardware_params.vector_unit_bytes == task2.hardware_params.vector_unit_bytes
-    assert task.hardware_params.cache_line_bytes == task2.hardware_params.cache_line_bytes
-
-
-def test_invalid_compute_dag():
-    failed = False
-    try:
-        A, B = invalid_compute_definition()
-        auto_scheduler.ComputeDAG([A, B])
-    except tvm.TVMError:
-        failed = True
-
-    assert failed
-
-
-if __name__ == "__main__":
-    test_apply_steps()
-    test_infer_bound()
-    test_estimate_flop()
-    test_stage_order()
-    test_invalid_compute_dag()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_cost_model.py b/tests/python/auto_scheduler/test_auto_scheduler_cost_model.py
deleted file mode 100644
index 50e3ceb6f5fa..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_cost_model.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test cost models"""
-
-import tempfile
-
-import numpy as np
-
-import tvm
-from tvm import auto_scheduler
-
-from tvm.testing.auto_scheduler import matmul_auto_scheduler_test
-
-
-def get_sample_records(number):
-    """Generate a list of random MeasureInput and MeasureResult pairs"""
-    N = 128
-    task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target="llvm")
-    policy = auto_scheduler.SketchPolicy(task, verbose=0)
-    states = policy.sample_initial_population()[:number]
-
-    inputs = [auto_scheduler.MeasureInput(task, s) for s in states]
-    results = [
-        auto_scheduler.MeasureResult([np.random.uniform(0.5, 1.0)], 0, "", 0.1, 0)
-        for _ in range(len(inputs))
-    ]
-
-    return task, inputs, results
-
-
-def test_random_model():
-    task, inputs, results = get_sample_records(50)
-
-    model = auto_scheduler.RandomModel()
-    model.update(inputs, results)
-    scores = model.predict(task, [x.state for x in inputs])
-    assert len(scores) == len(inputs)
-
-
-def test_xgb_model():
-    task, inputs, results = get_sample_records(50)
-
-    model = auto_scheduler.XGBModel(num_warmup_sample=-1)
-    model.update(inputs, results)
-    preds = model.predict(task, [x.state for x in inputs])
-    assert len(preds) == len(inputs)
-
-    costs = [np.mean([x.value for x in res.costs]) for res in results]
-    throughputs = np.min(costs) / costs
-
-    # test regression quality
-    rmse = np.sqrt(np.mean([np.square(pred - label) for pred, label in zip(preds, throughputs)]))
-    assert rmse <= 0.3
-
-    # test loading a record file
-    tmpdir = tvm.contrib.utils.tempdir()
-    tmpfile = tmpdir.relpath("test1")
-    auto_scheduler.save_records(tmpfile, inputs, results)
-    model.update_from_file(tmpfile)
-
-    # test model serialization
-    tmpfile = tmpdir.relpath("test2")
-    model.save(tmpfile)
-    model.load(tmpfile)
-
-
-if __name__ == "__main__":
-    test_random_model()
-    test_xgb_model()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_evolutionary_search.py b/tests/python/auto_scheduler/test_auto_scheduler_evolutionary_search.py
deleted file mode 100644
index 93853b4e7c5e..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_evolutionary_search.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Test evolutionary search. """
-
-import tvm
-import pytest
-from tvm.testing.auto_scheduler import matmul_auto_scheduler_test
-from tvm import auto_scheduler, te
-from tvm.auto_scheduler.cost_model.cost_model import PythonBasedModel
-
-
-def test_mutate_tile_size():
-    """
-    The test case initializes evo search with a batch of "bad" states and check whether
-    the search algorithm can find "good" states by mutating the "bad" states.
-
-    This unit test has been tested with 1,000 runs with no failures, meaning that
-    the failure rate is less than 0.1%.
-    """
-
-    class MockCostModel(PythonBasedModel):
-        """A mock cost model that rates 1 only for the states with tile_k=2."""
-
-        @staticmethod
-        def is_good_state(state):
-            for line in str(state).split("\n"):
-                if line.find("k.1") != -1 and line.find("(0,2)") != -1:
-                    return True
-            return False
-
-        def predict(self, task, states):
-            scores = []
-            for state in states:
-                scores.append(1 if self.is_good_state(state) else 0)
-            return scores
-
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(10, 10, 4), target=tvm.target.Target("llvm")
-    )
-    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)
-    states = policy.sample_initial_population()[:50]
-
-    bad_states = []
-    for state in states:
-        if not MockCostModel.is_good_state(state):
-            bad_states.append(state)
-
-    new_states = policy.evolutionary_search(bad_states, 50)
-    found = False
-    for state in new_states:
-        if MockCostModel.is_good_state(state):
-            found = True
-            break
-    assert found
-
-
-@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11440")
-def test_mutate_parallel():
-    """
-    The test case initializes evo search with a batch of "bad" states and check whether
-    the search algorithm can find "good" states by mutating the "bad" states.
-    """
-
-    class MockCostModel(PythonBasedModel):
-        @staticmethod
-        def is_good_state(state):
-            for line in str(state).split("\n"):
-                if (
-                    line.find("parallel i.0@ (0") != -1
-                    or line.find("parallel i.0@j.0@ (0") != -1
-                    or line.find("parallel i.0@j.0@i.1@ (0") != -1
-                ):
-                    return True
-            return False
-
-        def predict(self, task, states):
-            scores = []
-            for state in states:
-                scores.append(1 if self.is_good_state(state) else 0)
-            return scores
-
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(1024, 1024, 1024), target="llvm"
-    )
-    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)
-
-    found = False
-    retry_ct = 0
-    while retry_ct < 10 and not found:
-        states = policy.sample_initial_population()[:100]
-        bad_states = []
-        for state in states:
-            if not MockCostModel.is_good_state(state):
-                bad_states.append(state)
-
-        new_states = policy.evolutionary_search(bad_states, 50)
-        for state in new_states:
-            if MockCostModel.is_good_state(state):
-                found = True
-                break
-        retry_ct += 1
-
-    assert found
-
-
-if __name__ == "__main__":
-    test_mutate_tile_size()
-    test_mutate_parallel()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_feature.py b/tests/python/auto_scheduler/test_auto_scheduler_feature.py
deleted file mode 100644
index c8edebfd3b87..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_feature.py
+++ /dev/null
@@ -1,305 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test feature extraction"""
-
-import math
-import tempfile
-
-import tvm
-from tvm import te, auto_scheduler, relay
-from tvm.script import tir as T
-
-from tvm.testing.auto_scheduler import matmul_auto_scheduler_test
-
-
-def fequal(a, b):
-    return math.fabs(a - b) < 1e-6
-
-
-def test_cpu_matmul():
-    dag = auto_scheduler.ComputeDAG(matmul_auto_scheduler_test(512, 512, 512))
-    s = dag.get_init_state()
-    C = s.stage_ops[2]
-
-    i, j, k = s[C].iters
-    io, ii = s.split(C, i, [16])
-    jo, ji = s.split(C, j, [8])
-    s.reorder(C, [io, jo, k, ji, ii])
-    s.vectorize(C, ji)
-    s.parallel(C, io)
-    s.parallel(C, jo)
-    s.unroll(C, k)
-
-    target = tvm.target.Target("llvm")
-    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)
-    names = auto_scheduler.feature.get_per_store_feature_names()
-    fea = auto_scheduler.feature.get_per_store_features_from_states([s], task)[0]
-
-    stage_0 = fea[0]
-    assert len(stage_0) == len(names), "%d vs %d" % (len(stage_0), len(names))
-    fea_dict = {}
-    for name, value in zip(names, stage_0):
-        fea_dict[name] = value
-
-    for name in ["B0", "B1", "B2"]:
-        if fequal(fea_dict[name + ".acc_type.kReadWrite"], 1.0):
-            c_name = name
-        if fequal(fea_dict[name + ".acc_type.kRead"], 1.0):
-            if fequal(fea_dict[name + ".stride"], 0.0):
-                b_name = name
-            else:
-                a_name = name
-
-    """
-    lowered IR:
-
-    Placeholder: A, B
-    parallel i.0 (0,32)
-      parallel j.0 (0,64)
-        unroll k (0,512)
-          vectorize j.1 (0,8)
-            for i.1 (0,16)
-              C...] = A[...] * B[...]
-    """
-
-    # check touched memory in bytes, touched unique memory in bytes, reuse distance, etc.
-    assert fequal(fea_dict[c_name + ".bytes"], math.log2(512**3 * 4 + 1))
-    assert fequal(fea_dict[b_name + ".unique_bytes"], math.log2(512**2 * 4 + 1))
-    assert fequal(fea_dict[c_name + ".reuse_dis_iter"], math.log2(8 * 16 + 1))
-    assert fequal(fea_dict[c_name + ".reuse_dis_bytes"], math.log2((8 * 16 + 8 + 16) * 4 + 1))
-    assert fequal(fea_dict[c_name + ".reuse_ct"], math.log2(512 + 1))
-
-    # check annotations
-    assert fequal(fea_dict["unroll_num"], math.log2(1 + 1))
-    # assert fequal(fea_dict["unroll_type.kPosInnerReduce"], 1.0)
-    assert fequal(fea_dict["vec_num"], math.log2(1 + 1))
-    assert fequal(fea_dict["parallel_num"], math.log2(2 + 1))
-    assert fequal(fea_dict["parallel_prod"], math.log2((512 * 512 / 16 / 8) + 1))
-
-
-def test_cpu_fusion():
-    def fusion_test(N, M):
-        A = te.placeholder((N, M), name="A")
-        B = te.compute((N, M), lambda i, j: A[i][j], name="B")
-        C = te.compute((N, M), lambda i, j: B[i][j], name="C")
-        return [A, B, C]
-
-    dag = auto_scheduler.ComputeDAG(fusion_test(64, 32))
-    s = dag.get_init_state()
-    s.compute_at(1, 2, s.stages[2].iters[1])
-
-    target = tvm.target.Target("llvm")
-    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)
-    names = auto_scheduler.feature.get_per_store_feature_names()
-    fea = auto_scheduler.feature.get_per_store_features_from_states([s], task)[0]
-
-    """
-    lowered IR:
-
-    Placeholder: A
-    for i (0,64)
-        for j (0,32)
-            for ii (1)
-                for jj (1)
-                    B[...] = A[...]
-            C[...] = B[...]
-    """
-
-    # check reuse distance and reuse type after fusion
-    found = False
-    for stage_fea in fea:
-        for i, (name, value) in enumerate(zip(names, stage_fea)):
-            if "reuse_type.kSerialMultipleReadWrite" in name and value > 0.5:
-                # reuse distance in #iter
-                assert fequal(stage_fea[i + 2], 1.0)
-                # reuse distance in bytes
-                assert fequal(stage_fea[i + 3], math.log2(16 + 1))
-                found = True
-    assert found
-
-
-def test_gpu_feature():
-    # Use records to build a complicated GPU program
-    json_records = "\n".join(
-        (
-            """{"i": [["[\\"matmul_auto_scheduler_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""",
-        )
-    )
-
-    # load states
-    with tempfile.NamedTemporaryFile(mode="w") as f:
-        f.write(json_records)
-        f.flush()
-        inputs, _ = auto_scheduler.RecordReader(f.name).read_lines()
-
-        inp = inputs[0]
-        task = auto_scheduler.SearchTask(
-            workload_key=inp.task.workload_key,
-            target=inp.task.target,
-            hardware_params=auto_scheduler.HardwareParams(
-                100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30
-            ),
-        )
-
-        state = task.compute_dag.infer_bound_from_state(inputs[0].state)
-        fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]
-        names = auto_scheduler.feature.get_per_store_feature_names()
-
-        # build feature dict
-        fea_dicts = []
-        for i in range(len(fea)):
-            tmp_dict = {}
-            for j in range(len(names)):
-                tmp_dict[names[j]] = fea[i][j]
-            fea_dicts.append(tmp_dict)
-
-        """
-        lowered IR:
-
-        Placeholder: A, B
-        blockIdx.x i.0@j.0@ (0,8)
-          vthread i.1@j.1@ (0,4)
-            threadIdx.x i.2@j.2@ (0,16)
-              C.local auto_unroll: 1024
-              for k.0 (0,256)
-                for ax0@ax1@.0 (0,8)
-                  threadIdx.x ax0@ax1@.1 (0,16)
-                    B.shared = ...
-                for ax0@ax1@.0 (0,64)
-                  threadIdx.x ax0@ax1@.1 (0,16)
-                    A.shared = ...
-                for i_c.3 (0,32)
-                  for k.2 (0,2)
-                    for j_c.4 (0,16)
-                      C.local = ...
-              for i.3 (0,32)
-                for j.3 (0,16)
-                  C = ...
-        """
-
-        # check gpu-related features
-        assert fequal(fea_dicts[0]["blockIdx_x_len"], math.log2(8 + 1))
-        assert fequal(fea_dicts[0]["vthread_len"], math.log2(4 + 1))
-        assert fequal(fea_dicts[1]["threadIdx_x_len"], math.log2(16 + 1))
-        assert fequal(fea_dicts[0]["threadIdx_y_len"], math.log2(1 + 1))
-        assert fequal(fea_dicts[2]["blockIdx_z_len"], math.log2(1 + 1))
-        assert fequal(fea_dicts[0]["is_gpu"], 1.0)
-
-
-@T.prim_func
-def tir_matmul(
-    A: T.Buffer((256, 256), "float32"),
-    B: T.Buffer((256, 256), "float32"),
-    C: T.Buffer((256, 256), "float32"),
-) -> None:
-    # function attr dict
-    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-    A_flat = T.Buffer([16384], dtype="float32", data=A.data)
-    B_flat = T.Buffer([16384], dtype="float32", data=B.data)
-    C_flat = T.Buffer([16384], dtype="float32", data=C.data)
-    # body
-    for x, y in T.grid(128, 128):
-        C_flat[x * 128 + y] = T.float32(0)
-        for k in T.serial(128):
-            C_flat[x * 128 + y] = C_flat[x * 128 + y] + A_flat[x * 128 + k] * B_flat[y * 128 + k]
-
-
-def test_primfunc_without_lowering():
-    features = auto_scheduler.feature.named_features_from_primfunc(tir_matmul)
-    assert features["float_mad"].shape == (1,)
-    # featurization does not handle multiple-add right now, so they are split out
-    assert abs(features["float_addsub"][0] - 128 * 128 * 128) < 10
-    assert abs(features["float_mul"][0] - 128 * 128 * 128) < 10
-    for i in range(0, 3):
-        assert abs(features[f"B{i}.unique_bytes"][0] - 128 * 128 * 4) < 10  # 4 bytes per float32
-
-
-def test_primfunc_lowered():
-    # Lower tir function so all passes get applied
-    f = tvm.lower(tir_matmul)
-    features = auto_scheduler.feature.named_features_from_primfunc(f["main"])
-    assert features["float_mad"].shape == (1,)
-    # featurization does not handle multiple-add right now, so they are split out
-    assert abs(features["float_addsub"][0] - 128 * 128 * 128) < 10
-    assert abs(features["float_mul"][0] - 128 * 128 * 128) < 10
-    for i in range(0, 3):
-        assert abs(features[f"B{i}.unique_bytes"][0] - 128 * 128 * 4) < 10  # 4 bytes per float32
-
-
-def test_dense_lowered():
-    a = relay.var("a", relay.TensorType((128, 128), "float32"))
-    b = relay.var("b", relay.TensorType((128, 128), "float32"))
-    c = relay.nn.dense(a, b)
-    mod = tvm.IRModule.from_expr(relay.Function([a, b], c))
-    target = "llvm"
-    comp = relay.vm.VMCompiler()
-    mod, params = comp.optimize(mod, params={}, target=target)
-    for name, func in mod.functions.items():
-        if name.name_hint != "main":
-            break
-    features = auto_scheduler.feature.named_features_from_primfunc(func)
-    # featurization does not handle multiple-add right now, so they are split out
-    assert features["float_addsub"].sum() >= 128 * 128 * 128
-    assert features["float_mul"].sum() >= 128 * 128 * 128
-    total_bytes_loaded = 0
-    for i in range(0, 4):
-        total_bytes_loaded += features[f"B{i}.unique_bytes"].sum()
-    assert total_bytes_loaded > 2 * 128 * 128 * 4  # 4 bytes per float32
-
-
-@T.prim_func
-def negative_extent(A: T.Buffer((1,), "float32")):
-    for j in range(0, -1):
-        A[j] = A[j] + 1.0
-
-
-def test_negative_extent():
-    features = auto_scheduler.feature.named_features_from_primfunc(negative_extent)
-    assert features["B0.unique_bytes"] == 0
-
-
-@T.prim_func
-def zero_dim(
-    p2: T.Buffer((), "float32"),
-    T_cast: T.Buffer((T.int64(1), T.int64(768)), "int8"),
-):
-    # function attr dict
-    T.func_attr(
-        {
-            "tir.noalias": True,
-            "Primitive": 1,
-        }
-    )
-    # buffer definition
-    T_cast_1 = T.buffer_decl([T.int64(768)], dtype="int8", data=T_cast.data)
-    p2_1 = T.buffer_decl([1], dtype="float32", data=p2.data)
-    # body
-    for i0_i1_fused in T.serial(768):
-        T_cast_1[i0_i1_fused] = p2_1[0]
-
-
-def test_zero_dim():
-    features = auto_scheduler.feature.named_features_from_primfunc(zero_dim)
-    assert features["B1.stride"] == 1
-    assert features["B0.stride"] == 1
-
-
-if __name__ == "__main__":
-    test_cpu_matmul()
-    test_cpu_fusion()
-    test_gpu_feature()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_layout_rewrite.py b/tests/python/auto_scheduler/test_auto_scheduler_layout_rewrite.py
deleted file mode 100644
index 39673fad2495..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_layout_rewrite.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test AutoScheduler Layout Rewrite"""
-import tempfile
-import numpy as np
-
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import auto_scheduler, te
-
-from tvm.testing.auto_scheduler import get_tiled_matmul, matmul_auto_scheduler_test
-
-
-def test_apply_steps_with_layout_rewrite():
-    dag, s = get_tiled_matmul()
-    _, bufs = dag.apply_steps_from_state(s)
-    assert bufs[1].shape[0] == 512
-    assert bufs[1].shape[1] == 512
-    _, bufs = dag.apply_steps_from_state(
-        s, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED
-    )
-    assert bufs[1].shape[0] == 4
-    assert bufs[1].shape[1] == 8
-    assert bufs[1].shape[2] == 4
-    assert bufs[1].shape[3] == 4
-    assert bufs[1].shape[4] == 512
-    _, bufs = dag.apply_steps_from_state(
-        s, layout_rewrite=auto_scheduler.LayoutRewriteOption.INSERT_TRANSFORM_STAGE
-    )
-    assert bufs[1].shape[0] == 512
-    assert bufs[1].shape[1] == 512
-
-
-def test_apply_steps_with_layout_rewrite_corner_case():
-    A, B, C = matmul_auto_scheduler_test(1, 1, 1)
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-
-    s = dag.get_init_state()
-
-    s.compute_root(C)
-    i_j_fused = s.fuse(C, [s[C].iters[0], s[C].iters[1]])
-    s.parallel(C, i_j_fused)
-
-    _, bufs = dag.apply_steps_from_state(
-        s, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED
-    )
-
-
-@tvm.testing.requires_llvm
-def test_correctness_layout_rewrite_rewrite_for_preTransformed():
-    N = 16
-    target = tvm.target.Target("llvm")
-    task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target=target)
-    dag = task.compute_dag
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        search_policy = auto_scheduler.SketchPolicy(task)
-
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-        tuning_options = auto_scheduler.TuningOptions(
-            num_measure_trials=100,
-            runner=measure_ctx.runner,
-            verbose=2,
-            early_stopping=1,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        )
-        task.tune(tuning_options, search_policy=search_policy)
-        inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key, target)
-        s, bufs = dag.apply_steps_from_state(
-            inp.state, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED
-        )
-        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
-        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
-        np_args_ref = [np.array(x) for x in np_args]
-
-        weight = np_args_ref[1]
-        # infer shape for the rewritten layout
-        if len(weight.shape) >= 6:
-            # For cpu tile structure SSRSRS
-            base = len(weight.shape) - 6
-            red_dim = weight.shape[2 + base] * weight.shape[4 + base]
-            out_dim = weight.shape[3 + base] * weight.shape[5 + base]
-            for i in range(base + 2):
-                out_dim *= weight.shape[i]
-            new_order = (
-                [
-                    2 + base,
-                    4 + base,
-                ]
-                + list(range(base + 2))
-                + [
-                    3 + base,
-                    5 + base,
-                ]
-            )
-            np_args_ref[1] = np_args_ref[1].transpose(new_order)
-            np_args_ref[1] = np_args_ref[1].reshape((red_dim, out_dim))
-
-        func = tvm.build(s, bufs, target=target)
-        func_ref = tvm.build(s_ref, bufs_ref, target=target)
-
-        dev = tvm.device(str(target))
-        dev_ref = tvm.cpu()
-
-        args = [tvm.nd.array(x, device=dev) for x in np_args]
-        args_ref = [tvm.nd.array(x, device=dev_ref) for x in np_args_ref]
-        dev.sync()
-
-        func(*args)
-        func_ref(*args_ref)
-        dev.sync()
-
-        tvm.testing.assert_allclose(args[0].numpy(), args_ref[0].numpy(), atol=1e-3, rtol=1e-3)
-        tvm.testing.assert_allclose(args[2].numpy(), args_ref[2].numpy(), atol=1e-3, rtol=1e-3)
-        del measure_ctx
-
-
-@tvm.testing.requires_llvm
-def test_correctness_layout_rewrite_insert_transform_stage():
-    N = 128
-    target = tvm.target.Target("llvm")
-    task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target=target)
-    dag = task.compute_dag
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        search_policy = auto_scheduler.SketchPolicy(task)
-
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-        tuning_options = auto_scheduler.TuningOptions(
-            num_measure_trials=2,
-            runner=measure_ctx.runner,
-            verbose=1,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        )
-        task.tune(tuning_options, search_policy=search_policy)
-        inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key, target)
-        s, bufs = dag.apply_steps_from_state(
-            inp.state, layout_rewrite=auto_scheduler.LayoutRewriteOption.INSERT_TRANSFORM_STAGE
-        )
-
-        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
-        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
-
-        func = tvm.build(s, bufs, target=target)
-        func_ref = tvm.build(s_ref, bufs_ref, target=target)
-
-        dev = tvm.device(str(target))
-        dev_ref = tvm.cpu()
-
-        args = [tvm.nd.array(x, device=dev) for x in np_args]
-        args_ref = [tvm.nd.array(x, device=dev_ref) for x in np_args]
-        dev.sync()
-
-        func(*args)
-        func_ref(*args_ref)
-        dev.sync()
-
-        tvm.testing.assert_allclose(args[0].numpy(), args_ref[0].numpy(), atol=1e-3, rtol=1e-3)
-        tvm.testing.assert_allclose(args[1].numpy(), args_ref[1].numpy(), atol=1e-3, rtol=1e-3)
-        tvm.testing.assert_allclose(args[2].numpy(), args_ref[2].numpy(), atol=1e-3, rtol=1e-3)
-        del measure_ctx
-
-
-if __name__ == "__main__":
-    test_apply_steps_with_layout_rewrite()
-    test_apply_steps_with_layout_rewrite_corner_case()
-    test_correctness_layout_rewrite_rewrite_for_preTransformed()
-    test_correctness_layout_rewrite_insert_transform_stage()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_loop_state.py b/tests/python/auto_scheduler/test_auto_scheduler_loop_state.py
deleted file mode 100644
index 0965ed9efbac..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_loop_state.py
+++ /dev/null
@@ -1,522 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test loop state and schedule primitives"""
-
-import numpy as np
-
-import tvm
-from tvm import auto_scheduler, te
-from tvm import topi
-
-from tvm.testing.auto_scheduler import (
-    matmul_auto_scheduler_test,
-    conv2d_nchw_bn_relu_auto_scheduler_test,
-)
-
-
-def test_split_fuse_reorder_annotation():
-    A, B, C = matmul_auto_scheduler_test(N=512, M=512, K=512)
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    s0 = dag.get_init_state()
-    i, j, k = s0[C].iters
-
-    assert i.range.extent == 512
-
-    io, ii = s0.split(C, i, [16])
-    assert s0[C].iters[0] == io
-    assert s0[C].iters[1] == ii
-    assert io.range.extent == 32
-    assert ii.range.extent == 16
-
-    jo, ji = s0.split(C, j, [8])
-    assert jo.range.extent == 64
-    assert ji.range.extent == 8
-
-    s0.reorder(C, [io, jo, k, ji, ii])
-    assert s0[C].iters[2].range.extent == 512
-
-    fused_it = s0.fuse(C, [io, jo])
-    assert fused_it.range.extent == 2048
-
-    s1 = dag.get_init_state()
-    i, j, _ = s1[C].iters
-    i1, i2, i3 = s1.split(C, i, [8, 2])
-    j1, j2, j3 = s1.split(C, j, [32, 8], False)
-    assert s1[C].iters[0].range.extent == 32
-    assert s1[C].iters[1].range.extent == 8
-    assert s1[C].iters[2].range.extent == 2
-    assert s1[C].iters[3].range.extent == 32
-    assert s1[C].iters[4].range.extent == 8
-    assert s1[C].iters[5].range.extent == 2
-
-    res = s1.bind(C, i1, "blockIdx.x")
-    assert res == s1[C].iters[0]
-    assert res.annotation == auto_scheduler.loop_state.State.ANNOTATION_TRANS_TABLE["blockIdx.x"]
-
-    res = s1.bind(C, i2, "vthread")
-    assert res == s1[C].iters[1]
-    assert res.annotation == auto_scheduler.loop_state.State.ANNOTATION_TRANS_TABLE["vthread"]
-
-    res = s1.bind(C, i3, "threadIdx.y")
-    assert res == s1[C].iters[2]
-    assert res.annotation == auto_scheduler.loop_state.State.ANNOTATION_TRANS_TABLE["threadIdx.y"]
-
-    res = s1.parallel(C, j1)
-    assert res == s1[C].iters[3]
-    assert res.annotation == auto_scheduler.loop_state.State.ANNOTATION_TRANS_TABLE["parallel"]
-
-    res = s1.unroll(C, j2)
-    assert res == s1[C].iters[4]
-    assert res.annotation == auto_scheduler.loop_state.State.ANNOTATION_TRANS_TABLE["unroll"]
-
-    res = s1.vectorize(C, j3)
-    assert res == s1[C].iters[5]
-    assert res.annotation == auto_scheduler.loop_state.State.ANNOTATION_TRANS_TABLE["vectorize"]
-
-
-def test_compute_at_root_inline():
-    dag = auto_scheduler.ComputeDAG(
-        conv2d_nchw_bn_relu_auto_scheduler_test(
-            N=1, H=224, W=224, CI=3, CO=64, kernel_size=7, strides=2, padding=3
-        )
-    )
-    s0 = dag.get_init_state()
-
-    # data, padding, kernel = 0, 1, 2
-    conv = s0.stage_ops[3]
-    # bias = 4
-    bias_add = s0.stage_ops[5]
-    # bn_scale = 6
-    bn_mul = s0.stage_ops[7]
-    # bn_offset = 8
-    bn_add = s0.stage_ops[9]
-    relu = s0.stage_ops[10]
-
-    s0.compute_inline(bn_add)
-    assert s0[bn_add].compute_at == 1
-
-    s0.compute_inline(bn_mul)
-    assert s0[bn_mul].compute_at == 1
-
-    s0.compute_inline(bias_add)
-    assert s0[bias_add].compute_at == 1
-
-    assert s0[conv].iters[0].range.extent == 1
-    assert s0[conv].iters[1].range.extent == 64
-    assert s0[conv].iters[2].range.extent == 112
-    assert s0[conv].iters[3].range.extent == 112
-    assert s0[conv].iters[4].range.extent == 3
-    assert s0[conv].iters[5].range.extent == 7
-    assert s0[conv].iters[6].range.extent == 7
-    s0.compute_at(conv, relu, s0[relu].iters[2])
-    assert s0[conv].compute_at == 2
-    s0 = dag.infer_bound_from_state(s0)
-    assert s0[conv].iters[0].range.extent == 1
-    assert s0[conv].iters[1].range.extent == 1
-    assert s0[conv].iters[2].range.extent == 1
-    assert s0[conv].iters[3].range.extent == 112
-    assert s0[conv].iters[4].range.extent == 3
-    assert s0[conv].iters[5].range.extent == 7
-    assert s0[conv].iters[6].range.extent == 7
-
-    s0.compute_root(bn_mul)
-    assert s0[bn_mul].compute_at == 0
-
-    s0.compute_root(conv)
-    assert s0[conv].compute_at == 0
-    s0 = dag.infer_bound_from_state(s0)
-    assert s0[conv].iters[0].range.extent == 1
-    assert s0[conv].iters[1].range.extent == 64
-    assert s0[conv].iters[2].range.extent == 112
-    assert s0[conv].iters[3].range.extent == 112
-    assert s0[conv].iters[4].range.extent == 3
-    assert s0[conv].iters[5].range.extent == 7
-    assert s0[conv].iters[6].range.extent == 7
-
-
-def test_cache_read_write():
-    N, H, W, CO, CI, KH, KW, strides, padding = 4, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
-
-    data = te.placeholder((N, CI, H, W), name="Data")
-    kernel_data = te.placeholder((CO, CI, KH, KW), name="Kernel_data")
-    k0, k1 = te.compute(
-        kernel_data.shape,
-        lambda *i: (kernel_data(*i) + 1, kernel_data(*i) / 2),
-        name="Kernel_split",
-    )
-    kernel = te.compute(kernel_data.shape, lambda *i: k0(*i) + k1(*i), name="Kernel")
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation=1)
-    relu = topi.nn.relu(conv)
-    add = topi.add(data, relu)
-
-    dag = auto_scheduler.ComputeDAG([data, kernel_data, add])
-    s0 = dag.get_init_state()
-
-    pad_temp = s0.stage_ops[1]
-    kernel_split = s0.stage_ops[3]
-
-    # 0: init state
-    ori_its = s0[add].iters
-    its = s0.split(add, s0[add].iters[0], [2])
-    s0.reorder(add, [its[0], ori_its[1], its[1], ori_its[2], ori_its[3]])
-    s0.compute_inline(relu)
-
-    # 1: simple cache_write with compute_at
-    conv_global = s0.cache_write(conv, "global")
-    s0.compute_at(conv_global, conv, s0[conv].iters[3])
-
-    # 2: simple cache_read with compute_at
-    kernel_global = s0.cache_read(kernel, "global", [conv_global])
-    s0.compute_at(kernel_global, conv_global, s0[conv_global].iters[4])
-    """
-        Placeholder: Data, Kernel_data
-        for i0 (0,4)
-          for i1 (0,512)
-            for i2 (0,9)
-              for i3 (0,9)
-                pad_temp = ...
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel_split = ...
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel = ...
-        for nn (0,4)
-          for ff (0,512)
-            for yy (0,7)
-              for xx (0,7)
-                for nn_c (None)
-                  for ff_c (None)
-                    for yy_c (None)
-                      for xx_c (None)
-                        for rc (None)
-                          for ax0 (None)
-                            for ax1 (None)
-                              for ax2 (None)
-                                for ax3 (None)
-                                  Kernel.global = ...
-                          for ry (None)
-                            for rx (None)
-                              compute.global = ...
-                compute = ...
-        for ax0.0 (0,2)
-          for ax1 (0,512)
-            for ax0.1 (0,2)
-              for ax2 (0,7)
-                for ax3 (0,7)
-                  T_add = ...
-    """
-    s1 = dag.infer_bound_from_state(s0)
-    assert s1[conv].iters[0].range.extent == 4
-    assert s1[conv].iters[1].range.extent == 512
-    assert s1[conv].iters[2].range.extent == 7
-    assert s1[conv].iters[3].range.extent == 7
-    assert s1[kernel_global].iters[0].range.extent == 1
-    assert s1[kernel_global].iters[1].range.extent == 1
-    assert s1[kernel_global].iters[2].range.extent == 3
-    assert s1[kernel_global].iters[3].range.extent == 3
-    assert s1[conv_global].iters[0].range.extent == 1
-    assert s1[conv_global].iters[1].range.extent == 1
-    assert s1[conv_global].iters[2].range.extent == 1
-    assert s1[conv_global].iters[3].range.extent == 1
-    assert s1[conv_global].iters[4].range.extent == 512
-    assert s1[conv_global].iters[5].range.extent == 3
-    assert s1[conv_global].iters[6].range.extent == 3
-
-    # 3: two level cache_read with compute_at
-    #    preparing for GPU's shared memory & local memory
-    pad_temp_global = s0.cache_read(pad_temp, "global", [conv_global])
-    pad_temp_shared = s0.cache_read(pad_temp_global, "shared", [conv_global])
-    s0.compute_at(pad_temp_global, conv_global, s0[conv_global].iters[2])
-    s0.compute_at(pad_temp_shared, conv_global, s0[conv_global].iters[4])
-
-    # 4: cache_read with multi readers
-    #    This stage cannot be compute at to its consumer
-    s0.cache_read(data, "global", [pad_temp, add])
-    """
-        Placeholder: Data, Kernel_data
-        for ax0 (0,4)
-          for ax1 (0,512)
-            for ax2 (0,7)
-              for ax3 (0,7)
-                Data.global = ...
-        for i0 (0,4)
-          for i1 (0,512)
-            for i2 (0,9)
-              for i3 (0,9)
-                pad_temp = ...
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel_split = ...
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel = ...
-        for nn (0,4)
-          for ff (0,512)
-            for yy (0,7)
-              for xx (0,7)
-                for nn_c (None)
-                  for ff_c (None)
-                    for yy_c (None)
-                      for ax0 (None)
-                        for ax1 (None)
-                          for ax2 (None)
-                            for ax3 (None)
-                              pad_temp.global = ...
-                      for xx_c (None)
-                        for rc (None)
-                          for ax0 (None)
-                            for ax1 (None)
-                              for ax2 (None)
-                                for ax3 (None)
-                                  Kernel.global = ...
-                          for ax0 (None)
-                            for ax1 (None)
-                              for ax2 (None)
-                                for ax3 (None)
-                                  pad_temp.global.shared = ...
-                          for ry (None)
-                            for rx (None)
-                              compute.global = ...
-                compute = ...
-        for ax0.0 (0,2)
-          for ax1 (0,512)
-            for ax0.1 (0,2)
-              for ax2 (0,7)
-                for ax3 (0,7)
-                  T_add = ...
-    """
-    s1 = dag.infer_bound_from_state(s0)
-    assert s1[conv].iters[0].range.extent == 4
-    assert s1[conv].iters[1].range.extent == 512
-    assert s1[conv].iters[2].range.extent == 7
-    assert s1[conv].iters[3].range.extent == 7
-    assert s1[kernel_global].iters[0].range.extent == 1
-    assert s1[kernel_global].iters[1].range.extent == 1
-    assert s1[kernel_global].iters[2].range.extent == 3
-    assert s1[kernel_global].iters[3].range.extent == 3
-    assert s1[conv_global].iters[0].range.extent == 1
-    assert s1[conv_global].iters[1].range.extent == 1
-    assert s1[conv_global].iters[2].range.extent == 1
-    assert s1[conv_global].iters[3].range.extent == 1
-    assert s1[conv_global].iters[4].range.extent == 512
-    assert s1[conv_global].iters[5].range.extent == 3
-    assert s1[conv_global].iters[6].range.extent == 3
-    assert s1[pad_temp_global].iters[0].range.extent == 1
-    assert s1[pad_temp_global].iters[1].range.extent == 512
-    assert s1[pad_temp_global].iters[2].range.extent == 3
-    assert s1[pad_temp_global].iters[3].range.extent == 3
-    assert s1[pad_temp_shared].iters[0].range.extent == 1
-    assert s1[pad_temp_shared].iters[1].range.extent == 1
-    assert s1[pad_temp_shared].iters[2].range.extent == 3
-    assert s1[pad_temp_shared].iters[3].range.extent == 3
-
-    # 5: cache_write with multi outputs
-    # TVM's cache_write actually has a bug with this case:
-    #
-    # After schedule.cache_write, TVM generate one new stage:
-    #   From: kernel_data -> kernel_split -> kernel
-    #   To:   kernel_data -> kernel_split_global -> kernel_split -> kernel
-    #
-    # But with topo sort analyse, we get:
-    #  //   kernel_data -> kernel_split_global -> kernel_split -> kernel
-    #         \                                                /
-    #          ----------------> kernel_split ---------------->
-    #
-    # TODO(jcf94): Seems there's bug with the input/output tensor. Such multi outputs case
-    # should be unusual, so we make some hack on DoCacheWrite. This should be fixed later.
-    kernel_split_global = s0.cache_write(kernel_split, "global")
-    """
-        Placeholder: Data, Kernel_data
-        for ax0 (0,4)
-          for ax1 (0,512)
-            for ax2 (0,7)
-              for ax3 (0,7)
-                Data.global = ...
-        for i0 (0,4)
-          for i1 (0,512)
-            for i2 (0,9)
-              for i3 (0,9)
-                pad_temp = ...
-        for i0_c (0,512)
-          for i1_c (0,512)
-            for i2_c (0,3)
-              for i3_c (0,3)
-                Kernel_split.global = ...
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel_split = ...
-        (******* Bug here, there should not be two kernel_split stage *******)
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel_split = ...
-        (******* Bug here, there should not be two kernel_split stage *******)
-        for i0 (0,512)
-          for i1 (0,512)
-            for i2 (0,3)
-              for i3 (0,3)
-                Kernel = ...
-        for nn (0,4)
-          for ff (0,512)
-            for yy (0,7)
-              for xx (0,7)
-                for nn_c (None)
-                  for ff_c (None)
-                    for yy_c (None)
-                      for ax0 (None)
-                        for ax1 (None)
-                          for ax2 (None)
-                            for ax3 (None)
-                              pad_temp.global = ...
-                      for xx_c (None)
-                        for rc (None)
-                          for ax0 (None)
-                            for ax1 (None)
-                              for ax2 (None)
-                                for ax3 (None)
-                                  Kernel.global = ...
-                          for ax0 (None)
-                            for ax1 (None)
-                              for ax2 (None)
-                                for ax3 (None)
-                                  pad_temp.global.shared = ...
-                          for ry (None)
-                            for rx (None)
-                              compute.global = ...
-                compute = ...
-        for ax0.0 (0,2)
-          for ax1 (0,512)
-            for ax0.1 (0,2)
-              for ax2 (0,7)
-                for ax3 (0,7)
-                  T_add = ...
-    """
-    assert len(s0[kernel_split].iters) == len(s0[kernel_split_global].iters)
-    for it0, it1 in zip(s0[kernel_split].iters, s0[kernel_split_global].iters):
-        assert it0.range == it1.range
-
-
-def test_follow_split_follow_fused_split():
-    A, B, C = matmul_auto_scheduler_test(512, 512, 512)
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    s0 = dag.get_init_state()
-
-    C_global = s0.cache_write(C, "global")
-    its0 = s0.split(C, s0[C].iters[0], [4, 2, 8, 4], True)
-    split_step0 = len(s0.transform_steps) - 1
-    for level in range(1, 6):
-        tmp = s0.copy()
-        tmp.follow_split(C_global, tmp[C_global].iters[0], split_step0, level)
-        for i in range(0, level):
-            assert tmp[C].iters[i].range.extent == tmp[C_global].iters[i].range.extent
-
-    its1 = s0.split(C, s0[C].iters[5], [2, 2, 4, 8])
-    split_step1 = len(s0.transform_steps) - 1
-    its = []
-    for i0, i1 in zip(its0, its1):
-        its.append(i0)
-        its.append(i1)
-    s0.reorder(C, its)
-    for i in range(0, 5):
-        s0.fuse(C, [s0[C].iters[i], s0[C].iters[i + 1]])
-
-    for level in range(0, 4):
-        tmp = s0.copy()
-        tmp.follow_fused_split(
-            C_global, tmp[C_global].iters[0], [split_step0, split_step1], level, False
-        )
-        assert tmp[C].iters[level + 1].range.extent == tmp[C_global].iters[0].range.extent
-
-    for level in range(0, 4):
-        tmp = s0.copy()
-        tmp.follow_fused_split(
-            C_global, tmp[C_global].iters[0], [split_step0, split_step1], level, True
-        )
-        assert tmp[C].iters[level + 1].range.extent == tmp[C_global].iters[1].range.extent
-
-
-def test_rfactor():
-    A, B, C = matmul_auto_scheduler_test(8, 8, 512)
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    s0 = dag.get_init_state()
-
-    ko, ki = s0.split(C, s0[C].iters[2], [16])
-
-    s1 = s0.copy()
-    C_r = s1.rfactor(C, ko, 2)
-    """
-        Placeholder: A, B
-        for i (0,8)
-          for j (0,8)
-            for k_o (0,32)
-              for k_i (0,16)
-                C.rf = ...
-        for ax0 (0,8)
-          for ax1 (0,8)
-            for k_o_v (0,32)
-              C.repl = ...
-    """
-    assert s1[C_r].iters[0].range.extent == 8
-    assert s1[C_r].iters[1].range.extent == 8
-    assert s1[C_r].iters[2].range.extent == 32
-    assert s1[C_r].iters[3].range.extent == 16
-    assert s1[C].iters[0].range.extent == 8
-    assert s1[C].iters[1].range.extent == 8
-    assert s1[C].iters[2].range.extent == 32
-
-    s2 = s0.copy()
-    C_r = s2.rfactor(C, ki, 2)
-    """
-        Placeholder: A, B
-        for i (0,8)
-          for j (0,8)
-            for k_i (0,16)
-              for k_o (0,32)
-                C.rf = ...
-        for ax0 (0,8)
-          for ax1 (0,8)
-            for k_i_v (0,16)
-              C.repl = ...
-    """
-    assert s2[C_r].iters[0].range.extent == 8
-    assert s2[C_r].iters[1].range.extent == 8
-    assert s2[C_r].iters[2].range.extent == 16
-    assert s2[C_r].iters[3].range.extent == 32
-    assert s2[C].iters[0].range.extent == 8
-    assert s2[C].iters[1].range.extent == 8
-    assert s2[C].iters[2].range.extent == 16
-
-
-if __name__ == "__main__":
-    test_split_fuse_reorder_annotation()
-    test_compute_at_root_inline()
-    test_cache_read_write()
-    test_follow_split_follow_fused_split()
-    test_rfactor()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_measure.py b/tests/python/auto_scheduler/test_auto_scheduler_measure.py
deleted file mode 100644
index 3fd5f97dd8a3..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_measure.py
+++ /dev/null
@@ -1,427 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Test measurement and log serialization. """
-import json
-
-import multiprocessing
-import numpy as np
-import tvm
-from tvm import topi
-from tvm import te, auto_scheduler
-import tempfile
-import tvm.testing
-import pickle
-from tvm.testing.auto_scheduler import matmul_auto_scheduler_test
-from tvm.auto_scheduler import workload_registry
-
-
-def record_common(dag, s):
-    target = tvm.target.Target("llvm")
-    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)
-
-    inp = auto_scheduler.measure.MeasureInput(task, s)
-    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
-
-    # Test in-memory record processing.
-    record_str = auto_scheduler.measure_record.dump_record_to_string(inp, res)
-    r_inp, r_res = auto_scheduler.measure_record.load_record_from_string(record_str)
-    # Only check the workload_key for simplification.
-    assert inp.task.workload_key == r_inp.task.workload_key
-    assert str(res) == str(r_res)
-
-    # Test file-based record processing.
-    with tempfile.NamedTemporaryFile() as fp:
-        auto_scheduler.save_records(fp.name, [inp], [res])
-
-        log_reader = auto_scheduler.RecordReader(fp.name)
-        inputs, _ = log_reader.read_lines()
-        assert len(inputs) == 1
-
-        s1 = dag.infer_bound_from_state(s)
-        s2 = dag.infer_bound_from_state(inputs[0].state)
-
-        assert s1 == s2
-        assert not (s1 == dag.get_init_state())
-
-
-def test_record_split_reorder_fuse_annotation():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    A = te.placeholder((512, 512), name="A")
-    B = te.placeholder((512, 512), name="B")
-    k = te.reduce_axis((0, 512), name="k")
-    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    s = dag.get_init_state()
-
-    # Split
-    its0 = s.split(C, s[C].iters[0], [4, 8, 8])
-    its1 = s.split(C, s[C].iters[4], [8, 4, 4])
-    # Reorder
-    s.reorder(
-        C, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], its0[3], s[C].iters[8], its1[3]]
-    )
-    # Fuse
-    s.fuse(C, [s[C].iters[0], s[C].iters[1], s[C].iters[2]])
-    # Parallel
-    s.parallel(C, s[C].iters[0])
-    # Thread bind(The blockIdx & threadIdx are used in GPU, just for record testing here)
-    s.bind(C, s[C].iters[1], "blockIdx.x")
-    s.bind(C, s[C].iters[2], "threadIdx.z")
-    s.bind(C, s[C].iters[3], "vthread")
-    # Unroll
-    s.unroll(C, s[C].iters[4])
-    # Vectorize
-    s.vectorize(C, s[C].iters[6])
-
-    record_common(dag, s)
-
-
-def test_record_compute_at_root_inline_cache_read_write():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    A = te.placeholder((512, 512), name="A")
-    AA = topi.nn.relu(A)
-    B = te.placeholder((512, 512), name="B")
-    k = te.reduce_axis((0, 512), name="k")
-    C = te.compute((512, 512), lambda i, j: te.sum(AA[i][k] * B[k][j], axis=[k]), name="C")
-
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    s = dag.get_init_state()
-
-    # Cache Write
-    C_shared = s.cache_write(C, "shared")
-    # Compute At
-    s.compute_at(C_shared, C, s[C].iters[0])
-    # Cache Read
-    B_global = s.cache_read(B, "global", [C_shared])
-    s.compute_at(B_global, C_shared, s[C_shared].iters[2])
-    # Compute Inline
-    s.compute_inline(AA)
-    # Compute Root
-    s.compute_root(C_shared)
-
-    record_common(dag, s)
-
-
-def test_record_follow_split_follow_fused_split():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    A = te.placeholder((512, 512), name="A")
-    B = te.placeholder((512, 512), name="B")
-    k = te.reduce_axis((0, 512), name="k")
-    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-    D = topi.nn.relu(C)
-    E = topi.nn.relu(D)
-
-    dag = auto_scheduler.ComputeDAG([A, B, E])
-    s = dag.get_init_state()
-
-    # Follow Split
-    s.split(C, s[C].iters[0], [4, 2, 8, 4], True)
-    split_step0 = len(s.transform_steps) - 1
-    s.follow_split(C, s[C].iters[5], split_step0, 4)
-    # Follow Fused Split
-    its0 = s.split(E, s[E].iters[0], [4, 2, 8, 4], True)
-    split_step1 = len(s.transform_steps) - 1
-    its1 = s.split(E, s[E].iters[5], [2, 4, 2, 4], True)
-    split_step2 = len(s.transform_steps) - 1
-    its = []
-    for i0, i1 in zip(its0, its1):
-        its.append(i0)
-        its.append(i1)
-    for i in range(0, 5):
-        s.fuse(E, [s[E].iters[i], s[E].iters[i + 1]])
-    s.follow_fused_split(D, s[D].iters[0], [split_step1, split_step2], 2, True)
-
-    record_common(dag, s)
-
-
-def test_record_pragma_storage_align_rfactor():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    A = te.placeholder((512, 512), name="A")
-    B = te.placeholder((512, 512), name="B")
-    k = te.reduce_axis((0, 512), name="k")
-    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-
-    dag = auto_scheduler.ComputeDAG([A, B, C])
-    s = dag.get_init_state()
-
-    # Rfactor
-    ko, _ = s.split(C, s[C].iters[2], [16])
-    s.rfactor(C, ko, 2)
-    # Pragma
-    s.pragma(C, s[C].iters[0], "auto_unroll_max_step$64")
-    # StorageAlign
-    s.storage_align(C, s[C].iters[-1], 8, 4)
-
-    record_common(dag, s)
-
-
-def test_recover_measure_input():
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
-    )
-
-    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
-    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
-
-    with tempfile.NamedTemporaryFile() as fp:
-        auto_scheduler.save_records(fp.name, [inp], [res])
-
-        log_reader = auto_scheduler.RecordReader(fp.name)
-        inputs, _ = log_reader.read_lines()
-        assert len(inputs) == 1
-
-        raw_inp = inputs[0]
-
-        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
-        assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag)
-
-        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp, rebuild_state=True)
-        assert str(correct_inp.state) == str(inp.state)
-
-
-def test_workload_dis_factor():
-    calc = auto_scheduler.utils.calc_workload_dis_factor
-    decode = auto_scheduler.utils.decode_workload_key
-
-    # Identical
-    target_wkl_key = json.dumps(
-        ["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"]
-    )
-    assert calc(decode(target_wkl_key), decode(target_wkl_key)) == 1
-
-    # Compatible with a factor
-    wkl_key = json.dumps(["func1", [1, 3, 112, 112], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
-    assert calc(decode(target_wkl_key), decode(wkl_key)) == 8 * 2 * 2
-
-    # Incompatible argument with zeros
-    wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [1, 1], [1, 1], "float32"])
-    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
-    wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [0, 0], "float32"])
-    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
-
-    # Incompatible non-integter argument
-    wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "int8"])
-    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
-
-    # Incompatible function
-    wkl_key = json.dumps(["func2", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
-    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
-
-    # Incompatible due to non-dividable factor
-    wkl_key = json.dumps(["func1", [8, 3, 223, 223], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
-    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
-
-
-def test_measure_local_builder_runner():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
-    )
-
-    for enable_cpu_cache_flush in [True, False]:
-        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
-        local_builder = auto_scheduler.LocalBuilder()
-        local_runner = auto_scheduler.LocalRunner(
-            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
-        )
-
-        bress = local_builder.build([minp])
-        assert bress[0].error_no == 0
-        mress = local_runner.run([minp], bress)
-        assert mress[0].error_no == 0
-
-
-def test_dag_measure_local_builder_runner():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    A = te.placeholder((512, 512), name="A")
-    B = te.placeholder((512, 512), name="B")
-    k = te.reduce_axis((0, 512), name="k")
-    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-    D = topi.nn.relu(C)
-    E = topi.nn.relu(D)
-
-    tensors = [A, B, E]
-    dag = auto_scheduler.ComputeDAG(tensors)
-    key = workload_registry.register_workload_tensors(dag.workload_key(), tensors)
-    transfer_data = workload_registry.serialize_workload_registry_entry(key)
-    f_data = pickle.dumps(transfer_data)
-    f_new = pickle.loads(f_data)
-    del workload_registry.WORKLOAD_FUNC_REGISTRY[key]
-    workload_registry.deserialize_workload_registry_entry(f_new)
-
-    target = tvm.target.Target("llvm")
-    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key=key, target=target)
-
-    for enable_cpu_cache_flush in [True, False]:
-        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
-        local_builder = auto_scheduler.LocalBuilder()
-        local_runner = auto_scheduler.LocalRunner(
-            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
-        )
-
-        bress = local_builder.build([minp])
-        assert bress[0].error_no == 0
-        mress = local_runner.run([minp], bress)
-        assert mress[0].error_no == 0
-
-
-def test_workload_serialization():
-    key = tvm.auto_scheduler.utils.get_func_name(matmul_auto_scheduler_test)
-    transfer_data = workload_registry.serialize_workload_registry_entry(key)
-    f_data = pickle.dumps(transfer_data)
-    f_new = pickle.loads(f_data)
-    del workload_registry.WORKLOAD_FUNC_REGISTRY[key]
-    workload_registry.deserialize_workload_registry_entry(f_new)
-
-
-def test_measure_local_builder_rpc_runner():
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
-    )
-
-    for enable_cpu_cache_flush in [True, False]:
-        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
-        local_builder = auto_scheduler.LocalBuilder()
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
-            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
-        )
-        rpc_runner = measure_ctx.runner
-
-        bress = local_builder.build([minp])
-        assert bress[0].error_no == 0
-        mress = rpc_runner.run([minp], bress)
-        assert mress[0].error_no == 0
-
-        del measure_ctx
-
-
-def measure_local_builder_rpc_runner_spawn():
-    assert multiprocessing.get_start_method(False) == "spawn"
-    test_measure_local_builder_rpc_runner()
-
-
-@tvm.testing.requires_llvm
-def test_measure_local_builder_rpc_runner_spawn():
-    ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=measure_local_builder_rpc_runner_spawn)
-    p.start()
-    p.join()
-
-
-@tvm.testing.requires_llvm
-def test_measure_target_host():
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test,
-        args=(512, 512, 512),
-        target=tvm.target.Target("llvm", "llvm -mtriple=aarch64-linux-gnu"),
-    )
-
-    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
-    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
-
-    with tempfile.NamedTemporaryFile() as fp:
-        auto_scheduler.save_records(fp.name, [inp], [res])
-
-        log_reader = auto_scheduler.RecordReader(fp.name)
-        inputs, _ = log_reader.read_lines()
-        assert len(inputs) == 1
-
-        raw_inp = inputs[0]
-
-        recovered_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
-        assert str(recovered_inp.task.target.host) == str(inp.task.target.host)
-
-
-@tvm.testing.requires_llvm
-def test_measure_special_inputs_map_by_name_local_runner():
-    @auto_scheduler.register_workload
-    def foo():
-        X = te.placeholder(shape=[10], dtype="int32")
-        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
-        Y = te.compute((1,), lambda i: X[Index[i]])
-        return [X, Index, Y]
-
-    # This workload cannot use random input for the `Index` input
-    task = auto_scheduler.SearchTask(
-        func=foo,
-        target="llvm",
-        task_inputs={
-            "Index": tvm.nd.array(np.array([5], dtype="int32")),
-        },
-    )
-
-    minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
-    local_builder = auto_scheduler.LocalBuilder()
-    local_runner = auto_scheduler.LocalRunner(timeout=10)
-
-    bress = local_builder.build([minp])
-    assert bress[0].error_no == 0
-    mress = local_runner.run([minp], bress)
-    assert mress[0].error_no == 0
-
-
-@tvm.testing.requires_llvm
-def test_measure_special_inputs_map_by_name_rpc_runner():
-    @auto_scheduler.register_workload
-    def foo():
-        X = te.placeholder(shape=[10], dtype="int32")
-        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
-        Y = te.compute((1,), lambda i: X[Index[i]])
-        return [X, Index, Y]
-
-    # This workload cannot use random input for the `Index` input
-    task = auto_scheduler.SearchTask(
-        func=foo,
-        target="llvm",
-        task_inputs={
-            "Index": tvm.nd.array(np.array([5], dtype="int32")),
-        },
-    )
-
-    for enable_cpu_cache_flush in [True, False]:
-        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
-        local_builder = auto_scheduler.LocalBuilder()
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
-            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
-        )
-        rpc_runner = measure_ctx.runner
-
-        bress = local_builder.build([minp])
-        assert bress[0].error_no == 0
-        mress = rpc_runner.run([minp], bress)
-        assert mress[0].error_no == 0
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_search_policy.py b/tests/python/auto_scheduler/test_auto_scheduler_search_policy.py
deleted file mode 100644
index a9f6596a8548..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_search_policy.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test search policy"""
-
-import random
-import multiprocessing
-import numpy as np
-import tempfile
-
-import tvm
-import tvm.testing
-from tvm import auto_scheduler
-from tvm.auto_scheduler.utils import get_const_tuple
-
-from tvm.testing.auto_scheduler import (
-    matmul_auto_scheduler_test,
-    zero_rank_compute_auto_scheduler_test,
-    zero_rank_reduce_auto_scheduler_test,
-)
-import multiprocessing
-
-
-class CustomMeasureCallback(auto_scheduler.measure.PythonBasedMeasureCallback):
-    """A simple Python-based callback for testing."""
-
-    def callback(self, policy, inputs, results):
-        assert isinstance(policy, auto_scheduler.search_policy.SearchPolicy)
-        for inp, res in zip(inputs, results):
-            assert isinstance(inp, auto_scheduler.MeasureInput)
-            assert isinstance(res, auto_scheduler.MeasureResult)
-
-
-def search_common(
-    task=None,
-    target="llvm",
-    search_policy="sketch",
-    runner="local",
-    num_measure_trials=100,
-    cost_model=auto_scheduler.RandomModel(),
-    init_search_callbacks=None,
-):
-    if task is None:
-        task = auto_scheduler.SearchTask(
-            func=matmul_auto_scheduler_test, args=(64, 64, 64), target=target
-        )
-    target = task.target
-
-    print("Test search policy '%s' for '%s'" % (search_policy, target))
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        init_search_callbacks = init_search_callbacks or []
-        init_search_callbacks.append(auto_scheduler.PreloadMeasuredStates(log_file))
-
-        if search_policy == "empty":
-            search_policy = auto_scheduler.EmptyPolicy(task)
-        elif search_policy == "sketch":
-            search_policy = auto_scheduler.SketchPolicy(
-                task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks
-            )
-        else:
-            raise ValueError("Invalid policy: " + search_policy)
-
-        # Tune
-        tuning_options = auto_scheduler.TuningOptions(
-            num_measure_trials=num_measure_trials,
-            num_measures_per_round=2,
-            early_stopping=1,
-            runner=runner,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file), CustomMeasureCallback()],
-        )
-        task.tune(tuning_options=tuning_options, search_policy=search_policy)
-
-        # Compile with the best schedule
-        sch, args = task.apply_best(log_file)
-        mod = tvm.build(sch, args, target)
-
-        # Compile with naive schedule for correctness check
-        sch, args = task.compute_dag.apply_steps_from_state(task.compute_dag.init_state)
-        mod_ref = tvm.build(sch, args, "llvm")
-
-        ctx = tvm.device(str(target), 0)
-        np_arrays = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args]
-
-        tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays]
-        mod(*tvm_arrays)
-        actual = [x.numpy() for x in tvm_arrays]
-
-        tvm_arrays = [tvm.nd.array(x) for x in np_arrays]
-        mod_ref(*tvm_arrays)
-        expected = [x.numpy() for x in tvm_arrays]
-
-        for x, y in zip(actual, expected):
-            tvm.testing.assert_allclose(x, y, rtol=1e-5)
-
-
-@tvm.testing.requires_llvm
-def test_workload_registry_empty_policy():
-    search_common(search_policy="empty", num_measure_trials=2)
-
-    N = 64
-    target = "llvm"
-    search_common(
-        task=auto_scheduler.SearchTask(
-            func="matmul_auto_scheduler_test", args=(N, N, N), target=target
-        ),
-        num_measure_trials=2,
-        search_policy="empty",
-    )
-    search_common(
-        task=auto_scheduler.SearchTask(
-            func="matmul_auto_scheduler_test_rename_1", args=(N, N, N), target=target
-        ),
-        num_measure_trials=2,
-        search_policy="empty",
-    )
-
-
-@tvm.testing.requires_llvm
-def test_sketch_search_policy_basic():
-    search_common()
-
-
-def sketch_search_policy_basic_spawn():
-    assert multiprocessing.get_start_method(False) == "spawn"
-    test_sketch_search_policy_basic()
-
-
-@tvm.testing.requires_llvm
-def test_sketch_search_policy_basic_spawn():
-    ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=sketch_search_policy_basic_spawn)
-    p.start()
-    p.join()
-
-
-@tvm.testing.requires_llvm
-def test_sketch_search_policy_xgbmodel():
-    search_common(cost_model=auto_scheduler.XGBModel())
-
-
-@tvm.testing.requires_cuda
-def test_sketch_search_policy_cuda_rpc_runner():
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-    search_common(target="cuda", runner=measure_ctx.runner)
-
-
-@tvm.testing.requires_cuda
-def test_sketch_search_policy_cuda_xgbmodel_rpc_runner():
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-    search_common(target="cuda", runner=measure_ctx.runner, cost_model=auto_scheduler.XGBModel())
-
-
-@tvm.testing.requires_llvm
-@tvm.testing.requires_cuda
-def test_sketch_search_policy_zero_rank():
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-    for target in ["llvm", "cuda"]:
-        task = auto_scheduler.SearchTask(
-            func=zero_rank_compute_auto_scheduler_test, args=(10,), target=target
-        )
-        search_common(task, runner=measure_ctx.runner)
-
-        task = auto_scheduler.SearchTask(
-            func=zero_rank_reduce_auto_scheduler_test, args=(10,), target=target
-        )
-        search_common(task, runner=measure_ctx.runner)
-
-
-@tvm.testing.requires_llvm
-def test_sketch_search_policy_custom_sketch():
-    def meet_condition_func(search_policy, state, stage_id):
-        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
-
-    def apply_func(search_policy, state, stage_id):
-        ret = []
-        state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
-        C = state.stage_ops[2]
-
-        ret.append([state.state_object, -1])
-
-        s1 = state.copy()
-        i, _, _ = s1[C].iters
-        s1.split(C, i, [8])
-        ret.append([s1.state_object, -1])
-        return ret
-
-    search_common(
-        cost_model=auto_scheduler.XGBModel(),
-        init_search_callbacks=[
-            auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func)
-        ],
-    )
-
-
-if __name__ == "__main__":
-    test_workload_registry_empty_policy()
-    test_sketch_search_policy_basic()
-    test_sketch_search_policy_basic_spawn()
-    test_sketch_search_policy_xgbmodel()
-    test_sketch_search_policy_cuda_rpc_runner()
-    test_sketch_search_policy_cuda_xgbmodel_rpc_runner()
-    test_sketch_search_policy_zero_rank()
-    test_sketch_search_policy_custom_sketch()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_search_task.py b/tests/python/auto_scheduler/test_auto_scheduler_search_task.py
deleted file mode 100644
index 7c5441e81839..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_search_task.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test search policy"""
-
-import numpy as np
-import tempfile
-
-import tvm
-import tvm.testing
-from tvm import auto_scheduler
-from tvm.auto_scheduler.utils import get_const_tuple
-from tvm.testing.auto_scheduler import (
-    matmul_auto_scheduler_test,
-    zero_rank_compute_auto_scheduler_test,
-    zero_rank_reduce_auto_scheduler_test,
-)
-
-
-def test_search_task_add_task_input():
-    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
-    N = 64
-    target = "llvm"
-    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
-    test_input_1 = tvm.runtime.ndarray.empty((10, 20))
-    test_input_2 = tvm.runtime.ndarray.empty((30, 40, 50))
-    task = auto_scheduler.SearchTask(
-        func="matmul_auto_scheduler_test",
-        args=(N, N, N),
-        target=target,
-        task_inputs={
-            "test_input_0": test_input_0,
-            "test_input_1": test_input_1,
-            "test_input_2": test_input_2,
-        },
-        task_inputs_overwrite=True,
-    )
-
-    assert len(task.task_input_names) == 3
-    assert task.task_input_names[0] == "test_input_0"
-    assert task.task_input_names[1] == "test_input_1"
-    assert task.task_input_names[2] == "test_input_2"
-
-
-def test_search_task_record():
-    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
-    N = 64
-    target = "llvm"
-
-    # Log with no task input
-    task = auto_scheduler.SearchTask(
-        func="matmul_auto_scheduler_test", args=(N, N, N), target=target
-    )
-    task_record = auto_scheduler._ffi_api.SerializeSearchTask(task)
-    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record)
-    # TODO(jcf94): Check the compute dag & hardware parameter
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-
-    # Log with 1 task input
-    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
-    task = auto_scheduler.SearchTask(
-        func="matmul_auto_scheduler_test",
-        args=(N, N, N),
-        target=target,
-        task_inputs={"test_input_0": test_input_0},
-        task_inputs_overwrite=True,
-    )
-    task_record = auto_scheduler._ffi_api.SerializeSearchTask(task)
-    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record)
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-    assert len(new_task.task_input_names) == 1
-    assert new_task.task_input_names[0] == "test_input_0"
-
-    # Log with multiple task inputs
-    test_input_1 = tvm.runtime.ndarray.empty((64, 64))
-    task = auto_scheduler.SearchTask(
-        func="matmul_auto_scheduler_test",
-        args=(N, N, N),
-        target=target,
-        task_inputs={
-            "test_input_0": test_input_0,
-            "test_input_1": test_input_1,
-        },
-        task_inputs_overwrite=True,
-    )
-    task_record = auto_scheduler._ffi_api.SerializeSearchTask(task)
-    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record)
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-    assert len(new_task.task_input_names) == 2
-    assert new_task.task_input_names[0] == "test_input_0"
-    assert new_task.task_input_names[1] == "test_input_1"
-
-    # Log with version 0.5
-    v5_log = (
-        """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", """
-        f'"{str(tvm.target.Target(target))}"'
-        """, [6, 64, 64, 0, 0, 0, 0, 0], "", 1]"""
-    )
-    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(v5_log)
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-    assert len(new_task.task_input_names) == 0
-
-
-def test_recover_measure_input_with_task_input():
-    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
-    target = "llvm"
-
-    # Since this file is tests for search_task, we only check the search_task here
-
-    # Log with no task input
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(512, 512, 512), target=target
-    )
-    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
-    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
-    measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res)
-    measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record)
-    new_task = measure_log[0].task
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-
-    # Log with 1 task input
-    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test,
-        args=(512, 512, 512),
-        target=target,
-        task_inputs={
-            "test_input_0": test_input_0,
-        },
-        task_inputs_overwrite=True,
-    )
-    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
-    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
-    measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res)
-    measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record)
-    new_task = measure_log[0].task
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-    assert len(new_task.task_input_names) == 1
-    assert new_task.task_input_names[0] == "test_input_0"
-
-    # Log with multiple task inputs
-    test_input_1 = tvm.runtime.ndarray.empty((64, 64))
-    task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test,
-        args=(512, 512, 512),
-        target=target,
-        task_inputs={
-            "test_input_0": test_input_0,
-            "test_input_1": test_input_1,
-        },
-        task_inputs_overwrite=True,
-    )
-    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
-    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
-    measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res)
-    measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record)
-    new_task = measure_log[0].task
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-    assert len(new_task.task_input_names) == 2
-    assert new_task.task_input_names[0] == "test_input_0"
-    assert new_task.task_input_names[1] == "test_input_1"
-
-    # Log with version 0.5
-    v5_log = (
-        """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", """
-        f'"{str(tvm.target.Target(target))}"'
-        """, [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}"""
-    )
-    measure_log = auto_scheduler.measure_record.load_record_from_string(v5_log)
-    new_task = measure_log[0].task
-    assert task.workload_key == new_task.workload_key
-    assert str(task.target) == str(new_task.target)
-    assert str(task.target.host) == str(new_task.target.host)
-    assert task.layout_rewrite_option == new_task.layout_rewrite_option
-    assert len(new_task.task_input_names) == 0
-
-
-if __name__ == "__main__":
-    test_search_task_add_task_input()
-    test_search_task_record()
-    test_recover_measure_input_with_task_input()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_sketch_generation.py b/tests/python/auto_scheduler/test_auto_scheduler_sketch_generation.py
deleted file mode 100644
index a3f63a38495c..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_sketch_generation.py
+++ /dev/null
@@ -1,454 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Test sketch generation. """
-
-import sys
-import tvm
-import tvm.testing
-
-import pytest
-
-from tvm import te, auto_scheduler
-from tvm.auto_scheduler import _ffi_api
-from tvm.auto_scheduler.loop_state import Stage
-
-from tvm.testing.auto_scheduler import (
-    matmul_auto_scheduler_test,
-    double_matmul_auto_scheduler_test,
-    conv2d_nchw_bn_relu_auto_scheduler_test,
-    max_pool2d_auto_scheduler_test,
-    min_nm_auto_scheduler_test,
-    softmax_nm_auto_scheduler_test,
-    softmax_abcd_auto_scheduler_test,
-    conv2d_winograd_nhwc_auto_scheduler_test,
-    zero_rank_reduce_auto_scheduler_test,
-)
-
-
-def generate_sketches(
-    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
-):
-    # NOTE: test_cpu_matmul_sketch and test_cpu_max_pool2d_sketch assume 4 cores to trigger all
-    # possible sketch generations.
-    task = auto_scheduler.SearchTask(
-        func=workload_func,
-        args=args,
-        target=target,
-        hardware_params=auto_scheduler.HardwareParams(num_cores=4, target=target),
-    )
-    policy = auto_scheduler.SketchPolicy(
-        task, verbose=0, init_search_callbacks=init_search_callbacks
-    )
-    return policy.generate_sketches(print_for_debug)
-
-
-def assert_compute_at_condition(stage, condition):
-    assert stage.compute_at == Stage.COMPUTE_AT_TRANS_TABLE[condition]
-
-
-def assert_is_tiled(stage):
-    assert _ffi_api.SearchPolicyUtilsIsTiled(stage)
-
-
-def assert_is_not_tiled(stage):
-    assert not _ffi_api.SearchPolicyUtilsIsTiled(stage)
-
-
-def assert_has_cache_write(state, stage_id):
-    assert _ffi_api.SearchPolicyUtilsHasCacheWriteStage(state, stage_id)
-
-
-def assert_has_cache_read(state, stage_id):
-    assert _ffi_api.SearchPolicyUtilsHasCacheReadStage(state, stage_id)
-
-
-def assert_has_rfactor(state, stage_id):
-    assert _ffi_api.SearchPolicyUtilsHasRfactorStage(state, stage_id)
-
-
-def assert_has_cross_thread_reduction(state, stage_id):
-    assert _ffi_api.SearchPolicyUtilsHasCrossThreadReduction(state, stage_id)
-
-
-def test_cpu_matmul_sketch():
-    sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "llvm")
-    """ 3 multi-level tiling sketches
-        No.0 : Multi-level tiling
-        No.1 : Multi-level tiling with cache write on position 0
-        No.2 : Multi-level tiling with cache write on position 1
-    """
-    assert len(sketches) == 3
-    # Sketch 0
-    assert_is_tiled(sketches[0].stages[2])
-    # Sketch 1
-    assert_is_tiled(sketches[1].stages[2])
-    assert_has_cache_write(sketches[1], 2)
-    assert_compute_at_condition(sketches[1].stages[2], "iter")
-    # Sketch 2
-    assert_is_tiled(sketches[2].stages[2])
-    assert_has_cache_write(sketches[2], 2)
-    assert_compute_at_condition(sketches[2].stages[2], "iter")
-    assert sketches[1] != sketches[2]
-
-    sketches = generate_sketches(matmul_auto_scheduler_test, (8, 8, 512), "llvm")
-    """ 2 rfactor sketches + 3 multi-level tiling sketches
-        No.0 : Rfactor with factor position 0
-        No.1 : Rfactor with factor position 1
-        No.2 : Multi-level tiling
-        No.3 : Multi-level tiling with cache write on position 0
-        No.4 : Multi-level tiling with cache write on position 1
-    """
-    assert len(sketches) == 5
-    # Sketch 0
-    assert_has_rfactor(sketches[0], 2)
-    # Sketch 1
-    assert_has_rfactor(sketches[1], 2)
-    assert sketches[0] != sketches[1]
-    # Sketch 2
-    assert_is_tiled(sketches[2].stages[2])
-    # Sketch 3
-    assert_is_tiled(sketches[3].stages[2])
-    assert_has_cache_write(sketches[3], 2)
-    assert_compute_at_condition(sketches[3].stages[2], "iter")
-    # Sketch 4
-    assert_is_tiled(sketches[4].stages[2])
-    assert_has_cache_write(sketches[4], 2)
-    assert_compute_at_condition(sketches[4].stages[2], "iter")
-    assert sketches[3] != sketches[4]
-
-    sketches = generate_sketches(double_matmul_auto_scheduler_test, (512,), "llvm")
-    """ 3 multi-level tiling sketches for one matmul, so 3 * 3 = 9 sketches in total """
-    assert len(sketches) == 9
-    assert_is_tiled(sketches[8].stages[5])
-
-
-def test_cpu_conv2d_bn_relu_sketch():
-    sketches = generate_sketches(
-        conv2d_nchw_bn_relu_auto_scheduler_test, (1, 56, 56, 512, 512, 3, 1, 1), "llvm"
-    )
-    """ 3 multi-level tiling sketches
-        No.0 : Conv2d multi-level tiling with fusion on position 0
-        No.1 : Conv2d multi-level tiling with fusion on position 1
-        No.2 : Conv2d multi-level tiling without fusion
-    """
-    assert len(sketches) == 3
-    # Sketch 0
-    assert_is_not_tiled(sketches[0].stages[1])
-    assert_is_tiled(sketches[0].stages[3])
-    assert_compute_at_condition(sketches[0].stages[3], "iter")
-    assert_compute_at_condition(sketches[0].stages[5], "inlined")
-    assert_compute_at_condition(sketches[0].stages[7], "inlined")
-    assert_compute_at_condition(sketches[0].stages[9], "inlined")
-    assert_is_tiled(sketches[0].stages[10])
-    # Sketch 1
-    assert_is_not_tiled(sketches[1].stages[1])
-    assert_is_tiled(sketches[1].stages[3])
-    assert_compute_at_condition(sketches[1].stages[3], "iter")
-    assert_compute_at_condition(sketches[1].stages[5], "inlined")
-    assert_compute_at_condition(sketches[1].stages[7], "inlined")
-    assert_compute_at_condition(sketches[1].stages[9], "inlined")
-    assert_is_tiled(sketches[1].stages[10])
-    # Sketch 2
-    assert_is_not_tiled(sketches[2].stages[1])
-    assert_is_tiled(sketches[2].stages[3])
-    assert_compute_at_condition(sketches[2].stages[3], "root")
-    assert_compute_at_condition(sketches[2].stages[5], "inlined")
-    assert_compute_at_condition(sketches[2].stages[7], "inlined")
-    assert_compute_at_condition(sketches[2].stages[9], "inlined")
-    assert_is_not_tiled(sketches[2].stages[10])
-
-
-def test_cpu_max_pool2d_sketch():
-    sketches = generate_sketches(max_pool2d_auto_scheduler_test, (1, 56, 56, 512, 1), "llvm")
-    """ 1 default sketch """
-    assert len(sketches) == 1
-    # Sketch 0
-    assert len(sketches[0].transform_steps) == 0
-
-
-def test_cpu_min_sketch():
-    sketches = generate_sketches(min_nm_auto_scheduler_test, (10, 1024), "llvm")
-    """ 2 rfactor sketches + 1 default sketch
-        No.0 : Rfactor with factor position 0
-        No.1 : Rfactor with factor position 1
-        No.2 : Default sketch
-    """
-    assert len(sketches) == 3
-    # Sketch 0
-    assert_has_rfactor(sketches[0], 1)
-    # Sketch 1
-    assert_has_rfactor(sketches[1], 1)
-    assert sketches[0] != sketches[1]
-    # Sketch 2
-    assert len(sketches[2].transform_steps) == 0
-
-
-def test_cpu_softmax_sketch():
-    sketches = generate_sketches(softmax_nm_auto_scheduler_test, (1, 1024), "llvm")
-    """ (2 rfactor sketches + 1 default sketch) * (2 rfactor sketches + 1 default sketch) """
-    assert len(sketches) == (3 * 3)
-    for i in range(0, 3):
-        for j in range(0, 3):
-            sketch = sketches[i * 3 + j]
-            if j in [0, 1]:
-                assert_has_rfactor(sketch, 1)
-            if i in [0, 1]:
-                assert_has_rfactor(sketch, 4 if j in [0, 1] else 3)
-    assert len(sketches[8].transform_steps) == 0
-
-    sketches = generate_sketches(softmax_abcd_auto_scheduler_test, (1, 12, 128, 128), "llvm")
-    """ (2 rfactor sketches + 1 default sketch) * (2 rfactor sketches + 1 default sketch) """
-    assert len(sketches) == (3 * 3)
-    for i in range(0, 3):
-        for j in range(0, 3):
-            sketch = sketches[i * 3 + j]
-            if j in [0, 1]:
-                assert_has_rfactor(sketch, 1)
-            if i in [0, 1]:
-                assert_has_rfactor(sketch, 4 if j in [0, 1] else 3)
-    assert len(sketches[8].transform_steps) == 0
-
-
-def test_cpu_conv2d_winograd_sketch():
-    sketches = generate_sketches(
-        conv2d_winograd_nhwc_auto_scheduler_test, (1, 28, 28, 128, 128, 3, 1, 1), "llvm"
-    )
-    """ 3 multi-level tiling sketches
-        No.0 : Bgemm multi-level tiling
-        No.1 : Bgemm multi-level tiling with cache write on position 0
-        No.2 : Bgemm multi-level tiling with cache write on position 1
-    """
-    assert len(sketches) == 3
-    # Sketch 0
-    assert_is_not_tiled(sketches[0].stages[1])
-    assert_is_not_tiled(sketches[0].stages[2])
-    assert_compute_at_condition(sketches[0].stages[3], "inlined")
-    assert_is_tiled(sketches[0].stages[4])
-    assert_is_tiled(sketches[0].stages[6])
-    assert_compute_at_condition(sketches[0].stages[7], "inlined")
-    assert_is_tiled(sketches[0].stages[8])
-    assert_is_not_tiled(sketches[0].stages[9])
-    # Sketch 1
-    assert_is_not_tiled(sketches[1].stages[1])
-    assert_is_not_tiled(sketches[1].stages[2])
-    assert_compute_at_condition(sketches[1].stages[3], "inlined")
-    assert_is_tiled(sketches[1].stages[4])
-    assert_is_tiled(sketches[1].stages[6])
-    assert_has_cache_write(sketches[1], 6)
-    assert_compute_at_condition(sketches[1].stages[6], "iter")
-    assert_compute_at_condition(sketches[1].stages[8], "inlined")
-    assert_is_tiled(sketches[1].stages[9])
-    assert_is_not_tiled(sketches[1].stages[10])
-    # Sketch 2
-    assert_is_not_tiled(sketches[2].stages[1])
-    assert_is_not_tiled(sketches[2].stages[2])
-    assert_compute_at_condition(sketches[2].stages[3], "inlined")
-    assert_is_tiled(sketches[2].stages[4])
-    assert_is_tiled(sketches[2].stages[6])
-    assert_has_cache_write(sketches[2], 6)
-    assert_compute_at_condition(sketches[2].stages[6], "iter")
-    assert_compute_at_condition(sketches[2].stages[8], "inlined")
-    assert_is_tiled(sketches[2].stages[9])
-    assert_is_not_tiled(sketches[2].stages[10])
-    assert sketches[1] != sketches[2]
-
-
-def test_cpu_zero_rank_sketch():
-    sketches = generate_sketches(zero_rank_reduce_auto_scheduler_test, (128,), "llvm")
-    """ 2 rfactor sketches + 1 multi-level tiling sketches """
-    assert len(sketches) == 3
-
-
-def test_cpu_custom_sketch():
-    def meet_condition_func(search_policy, state, stage_id):
-        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
-
-    def apply_func(search_policy, state, stage_id):
-        ret = []
-        state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
-        C = state.stage_ops[2]
-
-        ret.append([state.state_object, -1])
-
-        s1 = state.copy()
-        i, _, _ = s1[C].iters
-        s1.split(C, i, [8, 2])
-        ret.append([s1.state_object, -1])
-        return ret
-
-    sketches = generate_sketches(
-        matmul_auto_scheduler_test,
-        (512, 512, 512),
-        "llvm",
-        init_search_callbacks=[
-            auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func)
-        ],
-    )
-    assert len(sketches) == 2
-    assert sketches[0].stages[2].iters[0].range.extent == 512
-    assert sketches[0].stages[2].iters[1].range.extent == 512
-    assert sketches[0].stages[2].iters[2].range.extent == 512
-    assert sketches[1].stages[2].iters[0].range.extent == 32
-    assert sketches[1].stages[2].iters[1].range.extent == 8
-    assert sketches[1].stages[2].iters[2].range.extent == 2
-    assert sketches[1].stages[2].iters[3].range.extent == 512
-    assert sketches[1].stages[2].iters[4].range.extent == 512
-
-
-@tvm.testing.requires_cuda
-def test_cuda_matmul_sketch():
-    sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "cuda")
-    """ 1 multi-level tiling sketch """
-    assert len(sketches) == 1
-    assert_has_cache_read(sketches[0], 0)
-    assert_compute_at_condition(sketches[0].stages[1], "iter")
-    assert_has_cache_read(sketches[0], 2)
-    assert_compute_at_condition(sketches[0].stages[3], "iter")
-    assert_has_cache_write(sketches[0], 4)
-    assert_is_tiled(sketches[0].stages[4])
-    assert_compute_at_condition(sketches[0].stages[4], "iter")
-    assert_is_tiled(sketches[0].stages[5])
-
-    sketches = generate_sketches(matmul_auto_scheduler_test, (8, 8, 1024), "cuda")
-    """ 1 cross thread reuction sketch + 1 multi-level tiling sketch """
-    assert len(sketches) == 2
-    # Sketch 0
-    assert_has_cross_thread_reduction(sketches[0], 2)
-    # Sketch 1
-    assert_has_cache_read(sketches[1], 0)
-    assert_compute_at_condition(sketches[1].stages[1], "iter")
-    assert_has_cache_read(sketches[1], 2)
-    assert_compute_at_condition(sketches[1].stages[3], "iter")
-    assert_has_cache_write(sketches[1], 4)
-    assert_is_tiled(sketches[1].stages[4])
-    assert_compute_at_condition(sketches[1].stages[4], "iter")
-    assert_is_tiled(sketches[1].stages[5])
-
-    sketches = generate_sketches(double_matmul_auto_scheduler_test, (512,), "cuda")
-    """ 1 multi-level tiling sketch for one matmul, so 1 x 1 = 1 sketch in total """
-    assert len(sketches) == 1
-    assert_compute_at_condition(sketches[0].stages[5], "root")
-    assert_compute_at_condition(sketches[0].stages[6], "iter")
-
-
-@tvm.testing.requires_cuda
-def test_cuda_conv2d_bn_relu_sketch():
-    sketches = generate_sketches(
-        conv2d_nchw_bn_relu_auto_scheduler_test, (1, 56, 56, 512, 512, 3, 1, 1), "cuda"
-    )
-    """ 1 multi-level tiling sketch """
-    assert len(sketches) == 1
-    assert_has_cache_read(sketches[0], 1)
-    assert_compute_at_condition(sketches[0].stages[1], "inlined")
-    assert_compute_at_condition(sketches[0].stages[2], "iter")
-    assert_has_cache_read(sketches[0], 3)
-    assert_compute_at_condition(sketches[0].stages[4], "iter")
-    assert_is_tiled(sketches[0].stages[5])
-    assert_compute_at_condition(sketches[0].stages[5], "iter")
-    assert_compute_at_condition(sketches[0].stages[7], "inlined")
-    assert_compute_at_condition(sketches[0].stages[9], "inlined")
-    assert_compute_at_condition(sketches[0].stages[11], "inlined")
-    assert_is_tiled(sketches[0].stages[12])
-
-
-@tvm.testing.requires_cuda
-def test_cuda_max_pool2d_sketch():
-    sketches = generate_sketches(max_pool2d_auto_scheduler_test, (1, 56, 56, 512, 0), "cuda")
-    """ 1 default sketch """
-    assert len(sketches) == 1
-    assert len(sketches[0].transform_steps) == 0
-
-
-@tvm.testing.requires_cuda
-def test_cuda_min_sketch():
-    sketches = generate_sketches(min_nm_auto_scheduler_test, (10, 1024), "cuda")
-    """ 1 cross thread reuction sketch + 1 default sketch """
-    assert len(sketches) == 2
-    # Sketch 0
-    assert_has_cross_thread_reduction(sketches[0], 1)
-    # Sketch 1
-    assert len(sketches[1].transform_steps) == 0
-
-
-@tvm.testing.requires_cuda
-def test_cuda_softmax_sketch():
-    sketches = generate_sketches(softmax_nm_auto_scheduler_test, (2, 1024), "cuda")
-    """ (1 cross thread reuction sketch + 1 default sketch) * (1 cross thread reuction sketch + 1 default sketch) """
-    assert len(sketches) == (2 * 2)
-    # Sketch 0
-    assert_has_cross_thread_reduction(sketches[0], 1)
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-    assert_has_cross_thread_reduction(sketches[0], 3)
-    # Sketch 1
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-    assert_has_cross_thread_reduction(sketches[1], 3)
-    # Sketch 2
-    assert_has_cross_thread_reduction(sketches[2], 1)
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-    # Sketch 3
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-
-    sketches = generate_sketches(softmax_abcd_auto_scheduler_test, (1, 12, 128, 128), "cuda")
-    """ (1 cross thread reuction sketch + 1 default sketch) * (1 cross thread reuction sketch + 1 default sketch) """
-    assert len(sketches) == (2 * 2)
-    # Sketch 0
-    assert_has_cross_thread_reduction(sketches[0], 1)
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-    assert_has_cross_thread_reduction(sketches[0], 3)
-    # Sketch 1
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-    assert_has_cross_thread_reduction(sketches[1], 3)
-    # Sketch 2
-    assert_has_cross_thread_reduction(sketches[2], 1)
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-    # Sketch 3
-    assert_compute_at_condition(sketches[3].stages[2], "inlined")
-
-
-@tvm.testing.requires_cuda
-def test_cuda_conv2d_winograd_sketch():
-    sketches = generate_sketches(
-        conv2d_winograd_nhwc_auto_scheduler_test, (1, 28, 28, 128, 128, 3, 1, 1), "cuda"
-    )
-    """ 1 multi-level tiling sketch """
-    assert len(sketches) == 1
-    assert_compute_at_condition(sketches[0].stages[1], "inlined")
-    assert_compute_at_condition(sketches[0].stages[2], "iter")
-    assert_compute_at_condition(sketches[0].stages[3], "inlined")
-    assert_is_tiled(sketches[0].stages[4])
-    assert_has_cache_read(sketches[0], 4)
-    assert_compute_at_condition(sketches[0].stages[5], "iter")
-    assert_has_cache_read(sketches[0], 6)
-    assert_compute_at_condition(sketches[0].stages[7], "iter")
-    assert_is_tiled(sketches[0].stages[8])
-    assert_compute_at_condition(sketches[0].stages[8], "iter")
-    assert_has_cache_write(sketches[0], 8)
-    assert_compute_at_condition(sketches[0].stages[9], "root")
-    assert_is_tiled(sketches[0].stages[11])
-    assert_is_not_tiled(sketches[0].stages[12])
-
-
-@tvm.testing.requires_cuda
-def test_cuda_zero_rank_sketch():
-    sketches = generate_sketches(zero_rank_reduce_auto_scheduler_test, (128,), "cuda")
-    """ 1 cross thread reuction sketch + 1 multi-level tiling sketch """
-    assert len(sketches) == 2
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/auto_scheduler/test_auto_scheduler_task_scheduler.py b/tests/python/auto_scheduler/test_auto_scheduler_task_scheduler.py
deleted file mode 100644
index 66e1ba9d6802..000000000000
--- a/tests/python/auto_scheduler/test_auto_scheduler_task_scheduler.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Test task scheduler """
-
-import tempfile
-
-import multiprocessing
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import auto_scheduler
-
-from tvm.testing.auto_scheduler import matmul_auto_scheduler_test
-
-
-@tvm.testing.requires_llvm
-def test_task_scheduler_round_robin():
-    tasks = []
-    for n in [2, 4, 8]:
-        tasks.append(
-            auto_scheduler.SearchTask(
-                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
-            )
-        )
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-        num_trials_per_task = 2
-
-        # Tune all tasks
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-        tune_option = auto_scheduler.TuningOptions(
-            num_measure_trials=num_trials_per_task * len(tasks),
-            runner=measure_ctx.runner,
-            num_measures_per_round=1,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        )
-        task_scheduler = auto_scheduler.TaskScheduler(tasks, strategy="round-robin", callbacks=[])
-        task_scheduler.tune(tune_option, search_policy="sketch.random")
-
-        # Check the result of round robin
-        counters = {}
-        for task in tasks:
-            counters[task.workload_key] = 0
-
-        for inp, _ in auto_scheduler.load_records(log_file):
-            counters[inp.task.workload_key] += 1
-
-        for task in tasks:
-            assert counters[task.workload_key] == num_trials_per_task
-
-        # test continuous tuning (restoring the status)
-        task_scheduler = auto_scheduler.TaskScheduler(
-            tasks, strategy="round-robin", load_log_file=log_file, callbacks=[]
-        )
-        tune_option = auto_scheduler.TuningOptions(
-            num_measure_trials=len(tasks),
-            num_measures_per_round=1,
-        )
-        task_scheduler.tune(tune_option, search_policy="sketch.random")
-        del measure_ctx
-
-
-@tvm.testing.requires_llvm
-def task_scheduler_round_robin_spawn():
-    assert multiprocessing.get_start_method(False) == "spawn"
-    test_task_scheduler_round_robin()
-
-
-@tvm.testing.requires_llvm
-def test_task_scheduler_round_robin_spawn():
-    ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=task_scheduler_round_robin_spawn)
-    p.start()
-    p.join()
-
-
-@tvm.testing.requires_llvm
-def test_task_scheduler_gradient():
-    tasks = []
-    for n in [2, 4]:
-        tasks.append(
-            auto_scheduler.SearchTask(
-                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
-            )
-        )
-
-    def objective_func(costs):
-        return 1e5 * costs[0]
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        n_trials = 5
-
-        # Tune all tasks
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
-        tune_option = auto_scheduler.TuningOptions(
-            num_measure_trials=n_trials,
-            runner=measure_ctx.runner,
-            num_measures_per_round=1,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        )
-        task_scheduler = auto_scheduler.TaskScheduler(
-            tasks, objective_func=objective_func, callbacks=[]
-        )
-
-        # Forcely rewrite the initial values.
-        # This can make this test more stable on the slow CI machines
-        task_scheduler.best_costs = np.array([1e2, 1e-8])
-
-        task_scheduler.tune(tune_option, search_policy="sketch.random")
-
-        # Check the allocation results
-        counters = {}
-        for task in tasks:
-            counters[task.workload_key] = 0
-
-        for inp, _ in auto_scheduler.load_records(log_file):
-            counters[inp.task.workload_key] += 1
-
-        assert counters[tasks[0].workload_key] == n_trials - 1
-        assert counters[tasks[1].workload_key] == 1
-        del measure_ctx
-
-
-if __name__ == "__main__":
-    test_task_scheduler_round_robin()
-    test_task_scheduler_round_robin_spawn()
-    test_task_scheduler_gradient()
diff --git a/tests/python/autotvm/test_autotvm_database.py b/tests/python/autotvm/test_autotvm_database.py
deleted file mode 100644
index d5980022811f..000000000000
--- a/tests/python/autotvm/test_autotvm_database.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test database"""
-import copy
-import logging
-
-from tvm.autotvm import database
-from tvm.autotvm.record import encode, MeasureResult
-
-from tvm.testing.autotvm import get_sample_records
-
-
-def test_save_load():
-    logging.info("test basic db load/save ...")
-    records = get_sample_records(3)
-    inp1, res1 = records[0]
-    inp2, res2 = records[1]
-    inp3, _ = records[2]
-
-    _db = database.DummyDatabase()
-    _db.flush()
-    _db.save(inp1, res1)
-    _db.save(inp2, res2)
-
-    load1 = _db.load(inp1)
-    load2 = _db.load(inp2)
-    load3 = _db.load(inp3)
-    assert load1 == res1
-    assert load2 == res2
-    assert load3 is None
-    assert load1 != load2
-
-
-TRIAL_LIMIT = 2
-
-
-def test_db_hash():
-    logging.info("test db hash check ...")
-    inp1, res1 = get_sample_records(1)[0]
-    inp2 = copy.deepcopy(inp1)
-    inp1.config.code_hash = "cafecafe"
-    inp2.config.code_hash = "dbffdbff"
-    res2l = list(tuple(res1))
-
-    # set timestamp
-    res2l[-1] = -1
-    res2 = MeasureResult(*res2l)
-    _db = database.DummyDatabase()
-    _db.flush()
-    _db.save(inp1, res1, extend=True)
-    _db.save(inp2, res2, extend=True)
-
-    load1 = _db.load(inp1)
-    load2 = _db.load(inp2)
-    assert load1 != load2
-    assert load1.timestamp != -1
-    assert load2.timestamp == -1
-
-
-def test_db_latest_all():
-    logging.info("test db load w/ multiple results ...")
-    inp1, res1 = get_sample_records(1)[0]
-    lis1 = list(tuple(res1))
-    lis2 = list(tuple(res1))
-    lis3 = list(tuple(res1))
-
-    # set timestamp
-    lis1[-1] = 0.0
-    lis2[-1] = 1.1
-    lis3[-1] = 9999.9999
-    res1 = MeasureResult(*lis1)
-    res2 = MeasureResult(*lis2)
-    res3 = MeasureResult(*lis3)
-
-    _db = database.DummyDatabase()
-    _db.flush()
-    _db.save(inp1, res1, extend=True)
-    load1 = _db.load(inp1)
-    assert load1.timestamp == 0.0
-    _db.save(inp1, res2, extend=True)
-    load2 = _db.load(inp1)
-    assert load2.timestamp == 1.1
-    _db.save(inp1, res3, extend=True)
-    load3 = _db.load(inp1)
-    assert load3.timestamp == 9999.9999
-
-    load4 = _db.load(inp1, get_all=True)
-    assert encode(inp1, load4[0]) == encode(inp1, res1)
-    assert encode(inp1, load4[1]) == encode(inp1, res2)
-    assert encode(inp1, load4[2]) == encode(inp1, res3)
-
-
-def test_db_filter():
-    logging.info("test db filter ...")
-    records = get_sample_records(5)
-    _db = database.DummyDatabase()
-    _db.flush()
-    for inp, result in records:
-        _db.save(inp, result)
-
-    records = _db.filter(lambda inp, ress: any(r.costs[0] <= 2 for r in ress))
-    assert len(records) == 2
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    test_save_load()
-    test_db_hash()
-    test_db_latest_all()
-    test_db_filter()
diff --git a/tests/python/autotvm/test_autotvm_dispatch_context.py b/tests/python/autotvm/test_autotvm_dispatch_context.py
deleted file mode 100644
index ba75992128a8..000000000000
--- a/tests/python/autotvm/test_autotvm_dispatch_context.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test dispatcher.
-The dispatcher can choose which template to use according
-to the parameters of workload"""
-
-from tvm import autotvm
-import tvm
-
-
-@autotvm.template("testing/dispatch_fallback")
-def simple_template(a, b):
-    cfg = autotvm.get_config()
-    assert cfg.is_fallback
-
-
-def test_fallback():
-    simple_template(2, 3)
-
-
-def test_tophub_kinds_match():
-    def verify_arm_cpu(target):
-        best_by_targetkey = autotvm.tophub.context(target).best_by_targetkey
-        assert len(best_by_targetkey)
-        found_arm_cpu = False
-        for a, _ in best_by_targetkey:
-            if "arm_cpu" in a:
-                found_arm_cpu = True
-                break
-        assert found_arm_cpu
-
-    verify_arm_cpu("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod")
-    verify_arm_cpu("llvm -model=snapdragon835 -mtriple=arm64-linux-android -mattr=+neon")
-
-
-if __name__ == "__main__":
-    test_fallback()
diff --git a/tests/python/autotvm/test_autotvm_droplet_tuner.py b/tests/python/autotvm/test_autotvm_droplet_tuner.py
deleted file mode 100644
index 10ffa18784ce..000000000000
--- a/tests/python/autotvm/test_autotvm_droplet_tuner.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test droplet algorithm tuner"""
-
-from tvm.testing.autotvm import DummyRunner, get_sample_task, get_sample_records
-from tvm import autotvm
-
-
-def test_tuner():
-    """Test Droplet Tuner"""
-
-    task, _ = get_sample_task()
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-
-    # When no range index, range_length should be the length of config space
-    tuner = autotvm.tuner.DropletTuner(task)
-    assert len(tuner.best_choice) == 3
-    assert tuner.execution == 1
-    assert tuner.batch == 16
-    assert tuner.total_execution == max(tuner.dims)
-    assert tuner.step == 1
-
-
-def test_multi_filter():
-    # Test with multi-filter
-    task, _ = get_sample_task()
-    task.config_space.multi_filter(
-        filter=lambda entity: 0 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    tuner = autotvm.tuner.DropletTuner(task)
-    valid_indexes = list(
-        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
-    )
-    assert tuner.visited.issubset(valid_indexes)
-
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-    tuner.tune(n_trial=8, measure_option=measure_option)
-    assert tuner.visited.issubset(valid_indexes)
-
-
-if __name__ == "__main__":
-    test_tuner()
-    test_multi_filter()
diff --git a/tests/python/autotvm/test_autotvm_feature.py b/tests/python/autotvm/test_autotvm_feature.py
deleted file mode 100644
index 26268e55994b..000000000000
--- a/tests/python/autotvm/test_autotvm_feature.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test feature extraction"""
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.autotvm import feature
-
-
-def test_iter_feature_gemm():
-    N = 128
-
-    k = te.reduce_axis((0, N), "k")
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-    C = te.compute(A.shape, lambda y, x: te.sum(A[y, k] * B[k, x], axis=k), name="C")
-
-    s = te.create_schedule(C.op)
-
-    feas = feature.get_itervar_feature(s, [A, B, C], take_log=False)
-
-    expected = [
-        {
-            "_attr_": [128, 1, 128, 2097152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
-            "A_0": [128, -1, 16384, 128, 0, 0],
-            "B_0": [0, -1, 16384, 128, 0, 0],
-            "C_0": [128, -1, 16384, 128, 0, 0],
-            "C_1": [128, -1, 16384, 128, 0, 0],
-        },
-        {
-            "_attr_": [128, 2, 16384, 16384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
-            "A_0": [0, -1, 128, 128, 0, 0],
-            "B_0": [1, -1, 16384, 1, 0, 0],
-            "C_0": [1, -1, 128, 128, 0, 0],
-            "C_1": [1, -1, 128, 128, 0, 0],
-        },
-        {
-            "_attr_": [128, 3, 2097152, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
-            "A_0": [1, -1, 128, 1, 0, 0],
-            "B_0": [128, -1, 128, 1, 0, 0],
-            "C_1": [0, -1, 1, 128, 0, 0],
-            "C_2": [0, -1, 1, 128, 0, 0],
-        },
-    ]
-
-    for ans, row in zip(expected, feas):
-        for pair in row:
-            if pair[0] not in ans:
-                continue
-            assert ans[pair[0]] == pair[1:], "%s: %s vs %s" % (pair[0], ans[pair[0]], pair[1:])
-
-
-def test_curve_feature_gemm():
-    N = 128
-
-    k = te.reduce_axis((0, N), "k")
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-    C = te.compute(A.shape, lambda y, x: te.sum(A[y, k] * B[k, x], axis=k), name="C")
-
-    s = te.create_schedule(C.op)
-
-    feas = feature.get_buffer_curve_sample_flatten(s, [A, B, C], sample_n=30)
-    # sample_n * #buffers * #curves * 2 numbers per curve
-    assert len(feas) == 30 * 3 * 4 * 2
-
-
-def test_feature_shape():
-    """test the dimensions of flatten feature are the same"""
-
-    N = 1024
-    n_sample = 100
-
-    def get_gemm_feature(target):
-        k = te.reduce_axis((0, N), "k")
-        A = te.placeholder((N, N), name="A")
-        B = te.placeholder((N, N), name="B")
-        C = te.compute(A.shape, lambda y, x: te.sum(A[y, k] * B[k, x], axis=k), name="C")
-
-        s = te.create_schedule(C.op)
-
-        y, x = s[C].op.axis
-        axes = list(s[C].tile(y, x, 8, 8)) + [k]
-        perm = np.random.permutation(5)
-        axes = [axes[x] for x in perm]
-        s[C].reorder(*axes)
-
-        if "gpu" in target.keys:
-            pick = []
-            # filter out reduction axis
-            for i in range(len(perm)):
-                if perm[i] != 4:
-                    pick.append(axes[i])
-            s[C].bind(pick[0], te.thread_axis("blockIdx.x"))
-            s[C].bind(pick[1], te.thread_axis("vthread"))
-            s[C].bind(pick[2], te.thread_axis("threadIdx.y"))
-
-        with target:
-            feas = feature.get_itervar_feature(s, [A, B, C])
-            feas = feature.flatten_itervar_feature(feas)
-        return feas
-
-    targets = [
-        tvm.target.cuda(),
-        tvm.target.mali(),
-        tvm.target.arm_cpu(),
-    ]
-
-    for target in targets:
-        dim = len(get_gemm_feature(target))
-        for i in range(n_sample):
-            assert dim == len(get_gemm_feature(target)), (
-                "dimensions of feature do not match" " for different configurations"
-            )
-
-
-if __name__ == "__main__":
-    test_iter_feature_gemm()
-    test_curve_feature_gemm()
-    test_feature_shape()
diff --git a/tests/python/autotvm/test_autotvm_flop_calculator.py b/tests/python/autotvm/test_autotvm_flop_calculator.py
deleted file mode 100644
index e28beaf98709..000000000000
--- a/tests/python/autotvm/test_autotvm_flop_calculator.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test flop calculation"""
-
-import tvm
-from tvm import te
-import numpy as np
-
-from tvm.autotvm.task.task import compute_flop
-
-
-def random_dtypes():
-    """Return pair of (input, accumulator) dtypes"""
-    candidates = [("float32", "float32"), ("float16", "float32"), ("int8", "int32")]
-    return candidates[np.random.choice(len(candidates))]
-
-
-def test_conv():
-    for i in range(5):
-        N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)]
-        (input_dtype, acc_dtype) = random_dtypes()
-        D = te.placeholder((N, CI, H, W), dtype=input_dtype)
-        K = te.placeholder((CO, CI, KH, KW), dtype=input_dtype)
-
-        KH = min(H, KH)
-        KW = min(W, KW)
-
-        ci = te.reduce_axis((0, CI))
-        kh = te.reduce_axis((0, KH))
-        kw = te.reduce_axis((0, KW))
-
-        OH = (H - KH) + 1
-        OW = (W - KW) + 1
-
-        C = te.compute(
-            (N, CO, OH, OW),
-            lambda n, co, h, w: te.sum(
-                D[n][ci][h][w].astype(acc_dtype) * K[co][ci][h][w].astype(acc_dtype),
-                axis=[ci, kh, kw],
-            ),
-        )
-
-        s = te.create_schedule([C.op])
-
-        assert compute_flop(s) == 2 * N * CO * OH * OW * CI * KH * KW
-
-
-def test_pack_gemm():
-    for i in range(5):
-        N, L, M = [np.random.randint(10, 128) * 4 for _ in range(3)]
-        (input_dtype, acc_dtype) = random_dtypes()
-        A = te.placeholder((N, L), dtype=input_dtype)
-        B = te.placeholder((M, L), dtype=input_dtype)
-        k = te.reduce_axis((0, L))
-
-        bn = 4
-        idxd = tvm.tir.indexdiv
-        idxm = tvm.tir.indexmod
-
-        A_pack = te.compute((N // bn, L, bn), lambda i, j, k: A[i * bn + k][j])
-        B_pack = te.compute((M // bn, L, bn), lambda i, j, k: B[i * bn + k][j])
-        C_pack = te.compute(
-            (N // bn, M // bn, bn, bn),
-            lambda i, j, ii, jj: te.sum(
-                A_pack[i, k, ii].astype(acc_dtype) * B_pack[j, k, jj].astype(acc_dtype), axis=[k]
-            ),
-        )
-        C = te.compute(
-            (N, M), lambda i, j: C_pack[idxd(i, bn)][idxd(j, bn)][idxm(i, bn)][idxm(j, bn)]
-        )
-
-        s = te.create_schedule([C.op])
-        assert compute_flop(s) == 2 * N * L * M
-
-
-def test_outer_dot():
-    for i in range(5):
-        N, M = [np.random.randint(10, 128) * 4 for _ in range(2)]
-        (input_dtype, acc_dtype) = random_dtypes()
-        A = te.placeholder((N,), dtype=input_dtype)
-        B = te.placeholder((M,), dtype=input_dtype)
-
-        C = te.compute((N, M), lambda i, j: A[i].astype(acc_dtype) * B[j].astype(acc_dtype))
-
-        s = te.create_schedule([C.op])
-        assert compute_flop(s) == N * M
-
-
-def test_max_pool():
-    for i in range(5):
-        N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)]
-        (input_dtype, _) = random_dtypes()
-        D = te.placeholder((N, CI, H, W), dtype=input_dtype)
-
-        KH = min(H, KH)
-        KW = min(W, KW)
-
-        kh = te.reduce_axis((0, KH))
-        kw = te.reduce_axis((0, KW))
-
-        OH = (H - KH) + 1
-        OW = (W - KW) + 1
-
-        C = te.compute(
-            (N, CO, OH, OW), lambda n, co, h, w: tvm.te.max(D[n][co][h + kh][w + kw], axis=[kh, kw])
-        )
-
-        s = te.create_schedule([C.op])
-
-        assert compute_flop(s) == N * CO * OH * OW * KH * KW
-
-
-def test_average_pool():
-    for i in range(5):
-        N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)]
-        (input_dtype, acc_dtype) = random_dtypes()
-        D = te.placeholder((N, CI, H, W), dtype=input_dtype)
-
-        KH = min(H, KH)
-        KW = min(W, KW)
-
-        kh = te.reduce_axis((0, KH))
-        kw = te.reduce_axis((0, KW))
-
-        OH = (H - KH) + 1
-        OW = (W - KW) + 1
-
-        C = te.compute(
-            (N, CO, OH, OW),
-            lambda n, co, h, w: te.sum(
-                te.div(D[n][co][h + kh][w + kw].astype(acc_dtype), (KW * KH)), axis=[kh, kw]
-            ),
-        )
-
-        s = te.create_schedule([C.op])
-
-        assert compute_flop(s) == 2 * N * CO * OH * OW * KH * KW
-
-
-def test_move():
-    """No float number operation in simple move. So the estimator should raise an error"""
-    N = 1024
-
-    A = te.placeholder((N,))
-    C = te.compute((N,), lambda i: A[i])
-    s = te.create_schedule([C.op])
-
-    try:
-        compute_flop(s)
-        assert False
-    except RuntimeError:
-        pass
-
-
-if __name__ == "__main__":
-    test_conv()
-    test_pack_gemm()
-    test_outer_dot()
-    test_move()
diff --git a/tests/python/autotvm/test_autotvm_ga_tuner.py b/tests/python/autotvm/test_autotvm_ga_tuner.py
deleted file mode 100644
index 625c6c66b6f2..000000000000
--- a/tests/python/autotvm/test_autotvm_ga_tuner.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test genetic algorithm tuner"""
-
-from tvm.testing.autotvm import DummyRunner, get_sample_task
-from tvm import autotvm
-
-
-def test_ga_tuner():
-    """Test GATuner"""
-    # Test population size smaller than space size tuning configuration
-    task, _ = get_sample_task()
-    tuner = autotvm.tuner.GATuner(task, pop_size=32)
-    valid_indexes = list(
-        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
-    )
-    assert tuner.visited.issubset(valid_indexes)
-    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
-    assert len(tuner.space) == 64
-
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
-    assert tuner.visited.issubset(valid_indexes)
-
-    # Test population size bigger than space size tuning configuration
-    task, _ = get_sample_task()
-    tuner = autotvm.tuner.GATuner(task, pop_size=100)
-    valid_indexes = list(
-        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
-    )
-    assert tuner.visited.issubset(valid_indexes)
-    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
-    assert len(tuner.space) == 64
-
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
-    assert tuner.visited.issubset(valid_indexes)
-
-    # Test population size smaller than multi-filtered space size tuning configuration
-    task, _ = get_sample_task()
-    task.config_space.multi_filter(
-        filter=lambda entity: 8 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    tuner = autotvm.tuner.GATuner(task, pop_size=32)
-    valid_indexes = list(
-        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
-    )
-    assert tuner.visited.issubset(valid_indexes)
-    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
-    assert len(tuner.space) == 43
-
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
-    assert tuner.visited.issubset(valid_indexes)
-
-    # Test population size bigger than multi-filtered space size tuning configuration
-    task, _ = get_sample_task()
-    task.config_space.multi_filter(
-        filter=lambda entity: 8 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    tuner = autotvm.tuner.GATuner(task, pop_size=100)
-    valid_indexes = list(
-        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
-    )
-    assert tuner.visited.issubset(valid_indexes)
-    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
-    assert len(tuner.space) == 43
-
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
-    assert tuner.visited.issubset(valid_indexes)
-
-
-if __name__ == "__main__":
-    test_ga_tuner()
diff --git a/tests/python/autotvm/test_autotvm_graph_tuner_core.py b/tests/python/autotvm/test_autotvm_graph_tuner_core.py
deleted file mode 100644
index e1aff8724178..000000000000
--- a/tests/python/autotvm/test_autotvm_graph_tuner_core.py
+++ /dev/null
@@ -1,792 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# NOTE: We name this test file to start with test_graph_tuner
-# to make it execute after zero_rank tensor test cases. This
-# helps avoid topi arithmetic operator overloading issue:
-# https://github.com/apache/tvm/issues/3240.
-# TODO: restore the file name after this issue is resolved.
-import os
-import copy
-import numpy as np
-import tvm
-from tvm import te
-import tvm.relay.testing
-
-from tvm import autotvm
-from tvm import relay
-from tvm.autotvm.task import ConfigEntity
-from tvm.autotvm.measure import MeasureResult, MeasureInput
-from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
-
-
-def _create_args(dshape, kshape, strides, padding, dilation, layout, out_layout, dtype, out_dtype):
-    data = tvm.te.placeholder(dshape, dtype=dtype)
-    kernel = tvm.te.placeholder(kshape, dtype=dtype)
-    return autotvm.task.serialize_args(
-        [data, kernel, strides, padding, dilation, layout, layout, out_dtype]
-    )
-
-
-def _create_data(target, dshape, dtype, layout):
-    data = relay.var("data", shape=dshape, dtype=dtype)
-    w0 = relay.var("w0_weight")
-    conv0 = relay.nn.conv2d(data, w0, channels=16, kernel_size=(3, 3), padding=(1, 1))
-    w1 = relay.var("w1_weight")
-    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
-    w2 = relay.var("w2_weight")
-    conv2 = relay.nn.conv2d(conv1, w2, channels=32, kernel_size=(3, 3), padding=(1, 1))
-    out = relay.add(conv1, conv2)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    mod, params = relay.testing.create_workload(net)
-    tasks = autotvm.task.extract_from_program(
-        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
-    )
-    new_args = [
-        _create_args(
-            (1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype
-        ),
-        _create_args(
-            (1, 16, 8, 8),
-            (32, 16, 1, 1),
-            (1, 1),
-            (0, 0, 0, 0),
-            (1, 1),
-            layout,
-            layout,
-            dtype,
-            dtype,
-        ),
-        _create_args(
-            (1, 32, 8, 8),
-            (32, 32, 3, 3),
-            (1, 1),
-            (1, 1, 1, 1),
-            (1, 1),
-            layout,
-            layout,
-            dtype,
-            dtype,
-        ),
-    ]
-
-    costs = [0.04, 0.012, 0.03]
-    config_list = []
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [3, 1]],
-            ["tile_oc", "sp", [4, 4]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [2, 8]],
-            ["tile_oc", "sp", [1, 32]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [8, 4]],
-            ["tile_oc", "sp", [4, 8]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-
-    records = []
-    for args, cost, config, task in zip(new_args, costs, config_list, tasks):
-        task.args = args
-        ms_input = MeasureInput(target=target, task=task, config=config)
-        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-        records.append((ms_input, ms_output))
-
-    ltf_records = []
-    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
-    ltf_task = autotvm.task.create("layout_transform", ltf_arg, target)
-    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
-    ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
-    ltf_records.append((ms_input, ms_output))
-
-    ltf_keys = []
-    ltf_arg = [te.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"]
-    ltf_wkl = autotvm.task.args_to_workload(ltf_arg, "layout_transform")
-    ltf_keys.append(ltf_wkl)
-    ltf_arg = [te.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"]
-    ltf_wkl = autotvm.task.args_to_workload(ltf_arg, "layout_transform")
-    ltf_keys.append(ltf_wkl)
-    ltf_arg = [te.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"]
-    ltf_wkl = autotvm.task.args_to_workload(ltf_arg, "layout_transform")
-    ltf_keys.append(ltf_wkl)
-
-    return net, records, ltf_records, ltf_keys, tasks
-
-
-@tvm.testing.requires_x86
-def test_graph_tuner_layout_transform():
-    log_file = "%s/test_tuner.log" % (os.getcwd())
-    target = "llvm"
-    dshape = (1, 3, 8, 8)
-    dtype = "float32"
-    layout = "NCHW"
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    g, records, ltf_records, ltf_keys, _ = _create_data(target, dshape, dtype, layout)
-    executor = DPTuner(g, {"data": dshape}, records, target_ops, target=target, log_file=log_file)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    out = executor._layout_transform_perf_records
-
-    num_flops = 0
-    total_time = 0
-    for record in ltf_records:
-        ltf_wkl = record[0].task.workload
-        input_shape = ltf_wkl[1][1]
-        flops = np.prod(input_shape)
-        num_flops += flops
-        total_time += record[1].costs[0]
-    avg_time = total_time / num_flops
-
-    for ltf_workload in out:
-        input_shape = ltf_workload[1][1]
-        flops = 1
-        for i in input_shape:
-            flops *= i
-        expected_time = flops * avg_time
-        out_time = out[ltf_workload][1].costs[0]
-        assert (
-            expected_time == out_time
-        ), "Inferred layout transformation time mismatch for %s: " "expecting %f but got %f" % (
-            str(ltf_workload),
-            expected_time,
-            out_time,
-        )
-
-
-@tvm.testing.requires_x86
-def test_graph_tuner_layout_transform_runner():
-    log_file = "%s/test_tuner.log" % (os.getcwd())
-    target = "llvm"
-    dshape = (1, 3, 8, 8)
-    dtype = "float32"
-    layout = "NCHW"
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    g, records, ltf_records, ltf_keys, _ = _create_data(target, dshape, dtype, layout)
-    executor = DPTuner(g, {"data": dshape}, records, target_ops, target=target, log_file=log_file)
-    runner = autotvm.LocalRunner(number=100, repeat=1, timeout=10)
-    executor.benchmark_layout_transform(
-        layout_records=ltf_records, infer_layout=True, runner=runner
-    )
-    out = executor._layout_transform_perf_records
-
-    num_flops = 0
-    total_time = 0
-    for record in ltf_records:
-        ltf_wkl = record[0].task.workload
-        input_shape = ltf_wkl[1][1]
-        flops = np.prod(input_shape)
-        num_flops += flops
-        total_time += record[1].costs[0]
-    avg_time = total_time / num_flops
-
-    for ltf_workload in out:
-        input_shape = ltf_workload[1][1]
-        flops = 1
-        for i in input_shape:
-            flops *= i
-        expected_time = flops * avg_time
-        out_time = out[ltf_workload][1].costs[0]
-        assert (
-            expected_time == out_time
-        ), "Inferred layout transformation time mismatch for %s: " "expecting %f but got %f" % (
-            str(ltf_workload),
-            expected_time,
-            out_time,
-        )
-
-
-@tvm.testing.requires_x86
-def test_DPTuner_run():
-    log_file = "%s/test_tuner.log" % (os.getcwd())
-    target = "llvm"
-    dtype = "float32"
-    layout = "NCHW"
-    dshape = (1, 3, 8, 8)
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout)
-    mod = tvm.IRModule()
-    mod["main"] = g
-    costs = [0.02, 0.02, 0.045]
-    config_list = []
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 3]],
-            ["tile_oc", "sp", [2, 8]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [4, 4]],
-            ["tile_oc", "sp", [2, 16]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [16, 2]],
-            ["tile_oc", "sp", [8, 4]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    for cost, config, task in zip(costs, config_list, tasks):
-        ms_input = MeasureInput(target=target, task=task, config=config)
-        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-        records.append((ms_input, ms_output))
-
-    executor = DPTuner(mod, {"data": dshape}, records, target_ops, target, log_file=log_file)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-    assert os.path.isfile(log_file), "No log file with name %s exists." % log_file
-
-
-@tvm.testing.requires_x86
-def test_PBQPTuner_run():
-    target = "llvm"
-    dtype = "float32"
-    layout = "NCHW"
-    dshape = (1, 3, 8, 8)
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout)
-    costs = [0.02, 0.02, 0.045]
-    config_list = []
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 3]],
-            ["tile_oc", "sp", [2, 8]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [4, 4]],
-            ["tile_oc", "sp", [2, 16]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [16, 2]],
-            ["tile_oc", "sp", [8, 4]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    for cost, config, task in zip(costs, config_list, tasks):
-        ms_input = MeasureInput(target=target, task=task, config=config)
-        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-        records.append((ms_input, ms_output))
-
-    executor = PBQPTuner(g, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-
-@tvm.testing.requires_x86
-def test_many_sub_graphs():
-    target = "llvm"
-    dtype = "float32"
-    dshape = (1, 8, 8, 3)
-    layout = "NCHW"
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    data = relay.var("data", shape=dshape, dtype=dtype)
-    t0 = relay.transpose(data, (0, 3, 1, 2))
-    w0 = relay.var("w0_weight")
-    conv0 = relay.nn.conv2d(t0, w0, channels=16, kernel_size=(3, 3), padding=(1, 1))
-    t1 = relay.transpose(conv0, (0, 2, 3, 1))
-    w1 = relay.var("w1_weight")
-    t2 = relay.transpose(t1, (0, 3, 1, 2))
-    conv1 = relay.nn.conv2d(t2, w1, channels=32, kernel_size=(1, 1))
-    t3 = relay.transpose(conv1, (0, 2, 3, 1))
-    w2 = relay.var("w2_weight")
-    t4 = relay.transpose(t3, (0, 3, 1, 2))
-    conv2 = relay.nn.conv2d(t4, w2, channels=32, kernel_size=(3, 3), padding=(1, 1))
-    t5 = relay.transpose(conv2, (0, 2, 3, 1))
-    out = relay.add(t3, t5)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net, params = relay.testing.create_workload(net)
-
-    tasks = autotvm.task.extract_from_program(
-        net["main"], target=target, params=params, ops=(conv2d,)
-    )
-    new_args = [
-        _create_args(
-            (1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype
-        ),
-        _create_args(
-            (1, 16, 8, 8),
-            (32, 16, 1, 1),
-            (1, 1),
-            (0, 0, 0, 0),
-            (1, 1),
-            layout,
-            layout,
-            dtype,
-            dtype,
-        ),
-        _create_args(
-            (1, 32, 8, 8),
-            (32, 32, 3, 3),
-            (1, 1),
-            (1, 1, 1, 1),
-            (1, 1),
-            layout,
-            layout,
-            dtype,
-            dtype,
-        ),
-    ]
-
-    costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045]
-    config_list = []
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [3, 1]],
-            ["tile_oc", "sp", [4, 4]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [2, 8]],
-            ["tile_oc", "sp", [1, 32]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [8, 4]],
-            ["tile_oc", "sp", [4, 8]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 3]],
-            ["tile_oc", "sp", [2, 8]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [4, 4]],
-            ["tile_oc", "sp", [2, 16]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [16, 2]],
-            ["tile_oc", "sp", [8, 4]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-
-    records = []
-    new_args = new_args + new_args
-    tasks = tasks + tasks
-    for args, cost, config, task in zip(new_args, costs, config_list, tasks):
-        task.args = args
-        ms_input = MeasureInput(target=target, task=task, config=config)
-        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-        records.append((ms_input, ms_output))
-
-    ltf_records = []
-    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
-    ltf_task = autotvm.task.create("layout_transform", ltf_arg, target)
-    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
-    ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
-    ltf_records.append((ms_input, ms_output))
-
-    executor = DPTuner(net, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-    executor = PBQPTuner(net, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-
-@tvm.testing.requires_x86
-def test_tuple():
-    target = "llvm"
-    dtype = "float32"
-    dshape = (1, 5, 32, 32)
-    layout = "NCHW"
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    data = relay.var("data", shape=dshape, dtype=dtype)
-    w0 = relay.var("w0_weight")
-    conv0 = relay.nn.conv2d(data, w0, channels=2, kernel_size=(3, 3), padding=(1, 1))
-    w1 = relay.var("w1_weight")
-    conv1 = relay.nn.conv2d(data, w1, channels=3, kernel_size=(3, 3), padding=(1, 1))
-    out = relay.concatenate([conv0, conv1], axis=1)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net, params = relay.testing.create_workload(net)
-
-    tasks = autotvm.task.extract_from_program(
-        net["main"], target=target, params=params, ops=(conv2d,)
-    )
-    new_args = [
-        _create_args(
-            (1, 5, 32, 32), (2, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype
-        ),
-        _create_args(
-            (1, 5, 32, 32), (3, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype
-        ),
-    ]
-    costs = [0.01, 0.012, 0.03, 0.04]
-    config_list = []
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 5]],
-            ["tile_oc", "sp", [1, 2]],
-            ["tile_ow", "sp", [4, 8]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 5]],
-            ["tile_oc", "sp", [1, 3]],
-            ["tile_ow", "sp", [2, 16]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 5]],
-            ["tile_oc", "sp", [2, 1]],
-            ["tile_ow", "sp", [4, 8]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 5]],
-            ["tile_oc", "sp", [3, 1]],
-            ["tile_ow", "sp", [2, 16]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-
-    records = []
-    new_args = new_args + new_args
-    tasks = tasks + tasks
-    for args, cost, config, task in zip(new_args, costs, config_list, tasks):
-        task.args = args
-        ms_input = MeasureInput(target=target, task=task, config=config)
-        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-        records.append((ms_input, ms_output))
-
-    ltf_records = []
-    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
-    ltf_task = autotvm.task.create("layout_transform", ltf_arg, target)
-    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
-    ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
-    ltf_records.append((ms_input, ms_output))
-
-    executor = DPTuner(net, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[2][0].config, records[1][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-    executor = PBQPTuner(net, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[2][0].config, records[1][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-
-@tvm.testing.requires_x86
-def test_triangle_block():
-    target = "llvm"
-    dtype = "float32"
-    dshape = (1, 3, 8, 8)
-    layout = "NCHW"
-    conv2d = relay.op.get("nn.conv2d")
-    target_ops = [conv2d]
-
-    data = relay.var("data", shape=dshape, dtype=dtype)
-    w0 = relay.var("w0_weight")
-    conv0 = relay.nn.conv2d(data, w0, channels=16, kernel_size=(3, 3), padding=(1, 1))
-    w1 = relay.var("w1_weight")
-    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
-    w2 = relay.var("w2_weight")
-    conv2 = relay.nn.conv2d(data, w2, channels=32, kernel_size=(3, 3), padding=(1, 1))
-    out = relay.concatenate([conv0, conv1, conv2], axis=1)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net, params = relay.testing.create_workload(net)
-
-    tasks = autotvm.task.extract_from_program(
-        net["main"], target=target, params=params, ops=(conv2d,)
-    )
-    new_args = [
-        _create_args(
-            (1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype
-        ),
-        _create_args(
-            (1, 16, 8, 8),
-            (32, 16, 1, 1),
-            (1, 1),
-            (0, 0, 0, 0),
-            (1, 1),
-            layout,
-            layout,
-            dtype,
-            dtype,
-        ),
-        _create_args(
-            (1, 3, 8, 8), (32, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype
-        ),
-    ]
-    costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045]
-    config_list = []
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [3, 1]],
-            ["tile_oc", "sp", [4, 4]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [2, 8]],
-            ["tile_oc", "sp", [1, 32]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [8, 4]],
-            ["tile_oc", "sp", [4, 8]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [1, 3]],
-            ["tile_oc", "sp", [2, 8]],
-            ["tile_ow", "sp", [4, 2]],
-            ["unroll_kw", "ot", True],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [4, 4]],
-            ["tile_oc", "sp", [2, 16]],
-            ["tile_oh", "ot", 1],
-            ["tile_ow", "sp", [4, 2]],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-    cfg_dict = {
-        "index": -1,
-        "code_hash": None,
-        "entity": [
-            ["tile_ic", "sp", [16, 2]],
-            ["tile_oc", "sp", [8, 4]],
-            ["tile_ow", "sp", [2, 4]],
-            ["unroll_kw", "ot", False],
-        ],
-    }
-    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
-
-    records = []
-    new_args = new_args + new_args
-    tasks = tasks + tasks
-    for args, cost, config, task in zip(new_args, costs, config_list, tasks):
-        task.args = args
-        ms_input = MeasureInput(target=target, task=task, config=config)
-        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-        records.append((ms_input, ms_output))
-
-    ltf_records = []
-    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
-    ltf_task = autotvm.task.create("layout_transform", ltf_arg, target)
-    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
-    ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
-    ltf_records.append((ms_input, ms_output))
-
-    executor = DPTuner(net, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-    executor = PBQPTuner(net, {"data": dshape}, records, target_ops, target)
-    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
-    executor.run()
-    out = [record[0].config for record in executor.get_optimal_records()]
-    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
-    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
-        str(expected_out),
-        str(out),
-    )
-
-
-if __name__ == "__main__":
-    test_graph_tuner_layout_transform()
-    test_DPTuner_run()
-    test_PBQPTuner_run()
-    test_many_sub_graphs()
-    test_tuple()
-    test_triangle_block()
diff --git a/tests/python/autotvm/test_autotvm_graph_tuner_utils.py b/tests/python/autotvm/test_autotvm_graph_tuner_utils.py
deleted file mode 100644
index 583bd366847c..000000000000
--- a/tests/python/autotvm/test_autotvm_graph_tuner_utils.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# NOTE: We name this test file to start with test_graph_tuner
-# to make it execute after zero_rank tensor test cases. This
-# helps avoid topi arithmetic operator overloading issue:
-# https://github.com/apache/tvm/issues/3240
-# TODO: restore the file name after this issue is resolved.
-import pytest
-
-import tvm
-from tvm import te
-
-from tvm import autotvm, relay
-from tvm.relay.testing import synthetic
-from tvm.autotvm.graph_tuner.utils import (
-    has_multiple_inputs,
-    get_direct_ancestor,
-    get_in_nodes,
-    get_out_nodes,
-    expr2graph,
-    bind_inputs,
-)
-from tvm.autotvm.graph_tuner._base import OPT_OUT_OP
-from tvm.autotvm.graph_tuner.utils.traverse_graph import _replace_device_with_tracing
-from tvm.relay.expr import Call, TupleGetItem, Tuple, Var
-
-
-def verify_has_multiple_inputs(node_list, node_idx, input_names, expected_result):
-    out = has_multiple_inputs(node_list, node_idx, input_names, OPT_OUT_OP)
-    assert out == expected_result, "Output mismatch: expecting checking %s to be %s but got %s." % (
-        node_list[node_idx]["op"],
-        str(expected_result),
-        str(out),
-    )
-
-
-def test_has_multiple_inputs():
-    data = relay.var("data")
-    out1 = data * relay.expr.const(3.0)
-    w0 = relay.var("w0")
-    out2 = relay.nn.conv2d(data, w0)
-    out = relay.add(out1, out2)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1)})
-    target_ops = [relay.op.get("nn.conv2d")]
-    node_list = []
-    node_dict = {}
-    expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm"))
-    input_names = ["data"]
-    verify_has_multiple_inputs(node_list, 2, input_names, False)
-    verify_has_multiple_inputs(node_list, 4, input_names, False)
-    verify_has_multiple_inputs(node_list, 5, input_names, True)
-
-
-def test_expr2graph():
-    mod, _ = synthetic.get_workload()
-    node_dict = {}
-    node_list = []
-    target_ops = [relay.op.get("nn.conv2d")]
-    op_name_list = []
-
-    def _count_node(node):
-        if isinstance(node, Call):
-            op_name_list.append(node.op)
-        elif isinstance(node, (Var, TupleGetItem, Tuple)):
-            op_name_list.append(None)
-
-    relay.analysis.post_order_visit(mod["main"], _count_node)
-
-    expr2graph(mod["main"], target_ops, node_dict, node_list, tvm.target.Target("llvm"))
-    assert len(node_list) == len(op_name_list)
-    for i, item in enumerate(zip(op_name_list, node_list)):
-        op_name, node = item
-        assert op_name == node["op"], "%dth Node operator mismatch: expecting %s but got %s" % (
-            i,
-            str(op_name),
-            str(node["op"]),
-        )
-
-
-def test_get_direct_ancestor():
-    data = relay.var("data")
-    w0 = relay.var("w0")
-    out1 = relay.nn.conv2d(data, w0)
-    out2 = relay.add(out1, data * relay.expr.const(5.0))
-    out3 = out2 + relay.expr.const(2.5)
-    w1 = relay.var("w1")
-    out = relay.nn.conv2d(out3, w1)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)})
-    target_ops = [relay.op.get("nn.conv2d")]
-    node_list = []
-    node_dict = {}
-    expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm"))
-    visited_dict = {}
-    input_names = ["data"]
-    out = get_direct_ancestor(node_list, visited_dict, target_ops, 5, input_names)
-    assert out == [0], "Output mismatch: expecting [0] but got %s." % str(out)
-
-    # non-regression test
-    out = relay.add(relay.log(data), relay.sqrt(data))
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net = bind_inputs(net, {"data": (1, 16, 224, 224)})
-    node_list = []
-    node_dict = {}
-    expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm"))
-    out = get_direct_ancestor(node_list, visited_dict, target_ops, 3, input_names)
-    assert out == [0], "Output mismatch: expecting [0] but got %s." % str(out)
-
-
-def test_get_in_nodes():
-    data = relay.var("data")
-    w0 = relay.var("w0")
-    out1 = relay.nn.conv2d(data, w0)
-    out2 = relay.add(out1, data)
-    out3 = out2 + relay.expr.const(2.5)
-    w1 = relay.var("w1")
-    out = relay.nn.conv2d(out3, w1)
-    net = relay.Function(relay.analysis.free_vars(out), out)
-    net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)})
-    target_ops = [relay.op.get("nn.conv2d")]
-    input_names = ["data"]
-    node_list = []
-    node_dict = {}
-    expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm"))
-    out = get_in_nodes(node_list, target_ops, input_names)
-    expected_out = {3: [0], 4: [3, 0], 7: [4]}
-    diff_set = set(out) ^ set(expected_out)
-    if len(diff_set) != 0:
-        raise RuntimeError(
-            "Output mismatch: expecting %s but got %s." % (str(expected_out), str(out))
-        )
-
-
-def test_get_out_nodes():
-    in_nodes_dict = {8: [4], 4: [3, 0], 3: [0]}
-    expected_out = {0: [3, 4], 3: [4], 4: [8], 8: []}
-    out = get_out_nodes(in_nodes_dict)
-    diff_set = set(out) ^ set(expected_out)
-    if len(diff_set) != 0:
-        raise RuntimeError(
-            "Output mismatch: expecting %s but got %s." % (str(expected_out), str(out))
-        )
-
-
-def test_target_device_replacement():
-    assert _replace_device_with_tracing("cuda") == "cuda -device=tracing"
-    assert (
-        _replace_device_with_tracing("cuda -device=some_device -libs=cudnn")
-        == "cuda -device=tracing -libs=cudnn"
-    )
-    assert (
-        _replace_device_with_tracing("llvm -device=arm_cpu -arg=xxx")
-        == "llvm -device=tracing -arg=xxx"
-    )
-    assert _replace_device_with_tracing("llvm -device=arm_cpu") == "llvm -device=tracing"
-    assert _replace_device_with_tracing("llvm -device=abc, def") == "llvm -device=tracing"
-
-
-if __name__ == "__main__":
-    test_has_multiple_inputs()
-    test_expr2graph()
-    test_get_direct_ancestor()
-    test_get_in_nodes()
-    test_get_out_nodes()
diff --git a/tests/python/autotvm/test_autotvm_index_tuner.py b/tests/python/autotvm/test_autotvm_index_tuner.py
deleted file mode 100644
index 514577f1c986..000000000000
--- a/tests/python/autotvm/test_autotvm_index_tuner.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test index based tuners"""
-
-import multiprocessing
-from tvm.testing.autotvm import DummyRunner, get_sample_task
-from tvm import autotvm
-
-
-def test_grid_search_tuner():
-    """Test GridSearchTuner"""
-
-    task, _ = get_sample_task()
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-
-    # When no range index, range_length should be the length of config space
-    tuner = autotvm.tuner.GridSearchTuner(task)
-    assert tuner.begin_idx == 0
-    assert tuner.end_idx == 64
-    assert tuner.index == 0
-    assert tuner.range_length == 64
-    assert tuner.visited_max == 64
-
-    # With range index, range_length should be the length of the specified range
-    tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15))
-    assert tuner.begin_idx == 8
-    assert tuner.end_idx == 16
-    assert tuner.index == 8
-    assert tuner.range_length == 8
-    assert tuner.visited_max == 8
-
-    # Tuner should only focus on the specified range
-    tuner.tune(n_trial=8, measure_option=measure_option)
-    assert len(tuner.visited) == 8
-    assert not tuner.has_next()
-
-    # With multi-filter
-    task, _ = get_sample_task()
-    task.config_space.multi_filter(
-        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-
-    tuner = autotvm.tuner.GridSearchTuner(task)
-    assert tuner.begin_idx == 0
-    assert tuner.end_idx == 64
-    assert tuner.index == 5
-    assert tuner.range_length == 64
-    assert tuner.visited_max == 34
-
-    # With range index, range_length should be the length of the specified range
-    tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15))
-    assert tuner.begin_idx == 8
-    assert tuner.end_idx == 16
-    assert tuner.index == 12
-    assert tuner.range_length == 8
-    assert tuner.visited_max == 4
-
-    # Tuner should only focus on the specified range
-    tuner.tune(n_trial=8, measure_option=measure_option)
-    assert len(tuner.visited) == 4
-    assert not tuner.has_next()
-
-
-def grid_search_spawn():
-    assert multiprocessing.get_spawn_method(False) == "spawn"
-    test_grid_search_tuner()
-
-
-def test_grid_search_tuner_spawn():
-    ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=test_grid_search_tuner)
-    p.start()
-    p.join()
-
-
-def test_random_tuner():
-    """Test RandomTuner"""
-
-    task, _ = get_sample_task()
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-
-    tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15))
-    assert tuner.begin_idx == 8
-    assert tuner.end_idx == 16
-    assert tuner.range_length == 8
-    assert tuner.visited_max == 8
-
-    # Tuner should only focus on the specified range and should visit all indices
-    tuner.tune(n_trial=8, measure_option=measure_option)
-    assert len(tuner.visited) == 8
-    assert not tuner.has_next()
-    for idx in tuner.visited:
-        assert 8 <= idx <= 15
-
-    # With multi-filter
-    task, _ = get_sample_task()
-    task.config_space.multi_filter(
-        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15))
-    assert tuner.begin_idx == 8
-    assert tuner.end_idx == 16
-    assert tuner.range_length == 8
-    assert tuner.visited_max == 4
-
-    # Tuner should only focus on the specified range and should visit all indices
-    tuner.tune(n_trial=8, measure_option=measure_option)
-    assert len(tuner.visited) == 4
-    assert not tuner.has_next()
-    for idx in tuner.visited:
-        assert 8 <= idx <= 15
-
-
-if __name__ == "__main__":
-    test_grid_search_tuner()
-    test_grid_search_tuner_spawn()
-    test_random_tuner()
diff --git a/tests/python/autotvm/test_autotvm_measure.py b/tests/python/autotvm/test_autotvm_measure.py
deleted file mode 100644
index 3ef5cbdad635..000000000000
--- a/tests/python/autotvm/test_autotvm_measure.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test builder and runner"""
-import logging
-import multiprocessing
-import concurrent
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.autotvm.measure import executor
-from tvm.testing.autotvm import DummyRunner, bad_matmul, get_sample_task
-from tvm import autotvm
-from tvm.autotvm.measure.measure import MeasureErrorNo, MeasureResult
-from tvm.autotvm import measure
-from inspect import Signature
-
-
-def test_task_tuner_without_measurement():
-    """test task and tuner without measurement"""
-    task, _ = get_sample_task()
-
-    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
-
-    logging.info("%s", task.config_space)
-
-    for tuner_class in [
-        autotvm.tuner.RandomTuner,
-        autotvm.tuner.GridSearchTuner,
-        autotvm.tuner.GATuner,
-        autotvm.tuner.XGBTuner,
-    ]:
-        tuner = tuner_class(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
-        assert tuner.best_flops > 1
-
-
-def task_tuner_spawn():
-    assert multiprocessing.get_start_method(False) == "spawn"
-    test_task_tuner_without_measurement()
-
-
-def test_task_tuner_without_measurement_spawn():
-    # Subprocesses inherit the spawn method of their parents
-    ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=task_tuner_spawn)
-    p.start()
-    p.join()
-
-
-def test_task_runner_with_ref_input():
-    """test runner ref_input without measurement"""
-    refinp = [np.random.rand(128, 128) for i in range(3)]
-    runner = measure.LocalRunner()
-    runner.ref_input = refinp
-
-    class DummyExecutor(measure.executor.Executor):
-        def __init__(self):
-            self.ran_dummy_executor = False
-
-        def submit(self, func, *args, **kwargs):
-            self.ran_dummy_executor = True
-            sig = Signature.from_callable(func)
-            assert sig.bind(*args, **kwargs).arguments["ref_input"] == refinp
-            dummy_future = concurrent.futures.Future()
-            dummy_future.set_result(None)
-            return dummy_future
-
-    runner.executor = DummyExecutor()
-    runner.run([None], [None])
-    assert runner.executor.ran_dummy_executor
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-
-    test_task_tuner_without_measurement()
-    test_task_tuner_without_measurement_spawn()
-    test_task_runner_with_ref_input()
diff --git a/tests/python/autotvm/test_autotvm_record.py b/tests/python/autotvm/test_autotvm_record.py
deleted file mode 100644
index 693810d3f979..000000000000
--- a/tests/python/autotvm/test_autotvm_record.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""test the correctness of dump and load of data log"""
-from io import StringIO
-from os import PathLike
-import time
-
-from tvm.contrib import utils
-
-from tvm import autotvm
-from tvm.autotvm.measure import MeasureInput, MeasureResult, MeasureErrorNo
-from tvm.autotvm.record import encode, decode, ApplyHistoryBest, measure_str_key
-
-from tvm.testing.autotvm import get_sample_task
-
-
-def test_load_dump():
-    task, target = get_sample_task()
-
-    inp = MeasureInput(target, task, task.config_space.get(0))
-    result = MeasureResult(
-        (2.0, 2.23, 0.23, 0.123, 0.234, 0.123), MeasureErrorNo.NO_ERROR, 2.3, time.time()
-    )
-
-    for protocol in ["json", "pickle"]:
-        row = encode(inp, result, protocol=protocol)
-        inp_2, result_2 = decode(row, protocol=protocol)
-
-        assert measure_str_key(inp) == measure_str_key(inp_2), "%s vs %s" % (
-            measure_str_key(inp),
-            measure_str_key(inp_2),
-        )
-        assert result.costs == result_2.costs
-        assert result.error_no == result_2.error_no
-        assert result.timestamp == result_2.timestamp
-
-
-def test_file_io():
-    temp = utils.tempdir()
-    file_path = temp.relpath("temp.log")
-
-    tsk, target = get_sample_task()
-    inputs = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(0, 10)]
-    results = [MeasureResult((i,), 0, 0, 0) for i in range(0, 10)]
-
-    invalid_inp = MeasureInput(target, tsk, tsk.config_space.get(10))
-    invalid_res = MeasureResult((10,), 0, 0, 0)
-
-    # Erase the entity map to test if it will be ignored when loading back.
-    invalid_inp.config._entity_map = {}
-
-    with open(file_path, "w") as fo:
-        cb = autotvm.callback.log_to_file(fo)
-        cb(None, inputs, results)
-        cb(None, [invalid_inp], [invalid_res])
-
-    ref = zip(inputs, results)
-    for x, y in zip(ref, autotvm.record.load_from_file(file_path)):
-        assert x[1] == y[1]
-
-    # Confirm functionality of multiple file loads
-    hist_best = ApplyHistoryBest([file_path, file_path])
-    x = hist_best.query(target, tsk.workload)
-    assert str(x) == str(inputs[0][2])
-
-
-def test_apply_history_best(tmpdir):
-    tsk, target = get_sample_task()
-    best = str(tsk.config_space.get(2))
-
-    inputs_batch_1 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(3)]
-    results_batch_1 = [MeasureResult((i,), 0, 0, 0) for i in range(1, 3)]
-    results_batch_1.append(MeasureResult((0.5,), 0, 2.3, 0))
-
-    # Write data out to file
-    filepath_batch_1 = tmpdir / "batch_1.log"
-    with open(filepath_batch_1, "w") as file:
-        autotvm.callback.log_to_file(file)(None, inputs_batch_1, results_batch_1)
-
-    # Load best results from Path
-    assert isinstance(filepath_batch_1, PathLike)
-    hist_best = ApplyHistoryBest(filepath_batch_1)
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-    # Load best results from str(Path)
-    hist_best = ApplyHistoryBest(str(filepath_batch_1))
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-    # Write data into StringIO buffer
-    stringio_batch_1 = StringIO()
-    assert isinstance(filepath_batch_1, PathLike)
-    callback = autotvm.callback.log_to_file(stringio_batch_1)
-    callback(None, inputs_batch_1, results_batch_1)
-    stringio_batch_1.seek(0)
-
-    # Load best results from strIO
-    hist_best = ApplyHistoryBest(stringio_batch_1)
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-    # Load best result from list of tuples (MeasureInput, MeasureResult)
-    hist_best = ApplyHistoryBest(list(zip(inputs_batch_1, results_batch_1)))
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-    # Same thing, but iterable instead of list (i.e. no subscripting)
-    hist_best = ApplyHistoryBest(zip(inputs_batch_1, results_batch_1))
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-
-def test_apply_history_best_multiple_batches(tmpdir):
-    tsk, target = get_sample_task()
-    best = str(tsk.config_space.get(2))
-
-    inputs_batch_1 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(2)]
-    results_batch_1 = [MeasureResult((i,), 0, 0, 0) for i in range(1, 3)]
-    filepath_batch_1 = tmpdir / "batch_1.log"
-    with open(filepath_batch_1, "w") as file:
-        autotvm.callback.log_to_file(file)(None, inputs_batch_1, results_batch_1)
-
-    inputs_batch_2 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(2, 4)]
-    results_batch_2 = [MeasureResult((0.5,), 0, 0, 0), MeasureResult((3,), 0, 0, 0)]
-    filepath_batch_2 = tmpdir / "batch_2.log"
-    with open(filepath_batch_2, "w") as file:
-        autotvm.callback.log_to_file(file)(None, inputs_batch_2, results_batch_2)
-
-    # Check two Path filepaths works
-    hist_best = ApplyHistoryBest([filepath_batch_1, filepath_batch_2])
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-    # Check that an arbitrary Iterable of Paths works
-    # Calling zip() on a single list gives a non-subscriptable Iterable
-    hist_best = ApplyHistoryBest(zip([filepath_batch_1, filepath_batch_2]))
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-    # Check that Iterable of Iterable of tuples is correctly merged
-    hist_best = ApplyHistoryBest(
-        zip(
-            [
-                zip(inputs_batch_1, results_batch_1),
-                zip(inputs_batch_2, results_batch_2),
-            ]
-        )
-    )
-    assert str(hist_best.query(target, tsk.workload)) == best
-
-
-if __name__ == "__main__":
-    test_load_dump()
-    test_apply_history_best()
-    test_file_io()
diff --git a/tests/python/autotvm/test_autotvm_space.py b/tests/python/autotvm/test_autotvm_space.py
deleted file mode 100644
index eb783a9f8bcd..000000000000
--- a/tests/python/autotvm/test_autotvm_space.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test space definition primitives"""
-
-from tvm import te
-from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity
-
-
-def gemm_func(cfg, N, filter_y=None, filter_x=None):
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=[k]), name="C")
-
-    s = te.create_schedule([C.op])
-
-    y, x = s[C].op.axis
-
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=2, filter=filter_y)
-    cfg.define_split("tile_x", cfg.axis(x), num_outputs=2, filter=filter_x)
-
-    return s, [A, B, C]
-
-
-def test_split():
-    cfg = ConfigSpace()
-
-    gemm_func(cfg, 128)
-    assert cfg.range_length == 64
-    assert len(cfg.space_map["tile_y"]) == 8
-
-    # test policy
-    cfg = ConfigSpace()
-    cfg.define_split("tile_x", cfg.axis(256), policy="factors", num_outputs=3)
-    assert len(cfg.space_map["tile_x"]) == 45
-
-    cfg.define_split("tile_y", cfg.axis(256), policy="power2", num_outputs=3)
-    assert len(cfg.space_map["tile_y"]) == 45
-
-    cfg.define_split("tile_z", cfg.axis(256), policy="verbose", num_outputs=3)
-    assert len(cfg.space_map["tile_z"]) == 45
-
-    cfg.define_split("tile_a", cfg.axis(224), policy="factors", num_outputs=3)
-    assert len(cfg.space_map["tile_a"]) == 63
-
-    cfg.define_split("tile_b", cfg.axis(224), policy="power2", num_outputs=3)
-    assert len(cfg.space_map["tile_b"]) == 36
-
-    cfg.define_split("tile_c", cfg.axis(224), policy="verbose", num_outputs=3)
-    assert len(cfg.space_map["tile_c"]) == 84
-
-    # Count the number of non-negative integer solutions of a + b + c + d = n
-    def count4(n):
-        cnt = 0
-        for a in range(0, n + 1):
-            for b in range(0, n - a + 1):
-                cnt += n - a - b + 1
-        return cnt
-
-    # test overflow
-    n = 25
-    cfg = ConfigSpace()
-    cfg.define_split("x", cfg.axis(2**n), policy="factors", num_outputs=4)
-    # count4(25) is 3276.
-    assert len(cfg.space_map["x"]) == count4(n)
-
-    # test fallback
-    cfg = FallbackConfigEntity()
-    cfg.define_split("tile_n", cfg.axis(128), num_outputs=3)
-    cfg.fallback_split("tile_n", [-1, 8, 4])
-    # verify if define_split override previously manualy defined split params
-    cfg.define_split("tile_n", cfg.axis(128), num_outputs=3)
-    assert cfg["tile_n"].size == [4, 8, 4]
-
-    cfg = FallbackConfigEntity()
-    cfg.define_split("tile_n", cfg.axis(49), num_outputs=3)
-    cfg.fallback_split("tile_n", [-1, 8, 4])
-    assert cfg["tile_n"].size == [7, 7, 1]
-
-    cfg = FallbackConfigEntity()
-    cfg.define_split("tile_n", cfg.axis(49), num_outputs=3)
-    try:
-        cfg.fallback_split("tile_n", [-1, 1, 0])
-        assert False
-    except RuntimeError:
-        pass
-
-
-def _raises_exception(f):
-    try:
-        f()
-    except Exception:
-        return True
-    return False
-
-
-def test_multi_filter():
-    # create config without multi_filter
-    cfg = ConfigSpace()
-    gemm_func(cfg, 128)
-    # create config with multi_filter
-    cfg_mf = ConfigSpace()
-    gemm_func(cfg_mf, 128)
-    cfg_mf.multi_filter(
-        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    # test len
-    assert len(cfg) == 64
-    assert len(cfg_mf) == 34
-    # test range_length
-    assert cfg.range_length == 64
-    assert cfg_mf.range_length == 64
-    # test dims
-    assert cfg.dims == [8, 8]
-    assert cfg_mf.dims == [8, 8]
-    # test is_index_valid
-    assert cfg.is_index_valid(0) is True
-    assert cfg.is_index_valid(15) is True
-    assert cfg_mf.is_index_valid(0) is False
-    assert cfg_mf.is_index_valid(15) is True
-    # test get
-    assert _raises_exception(lambda: cfg.get(0)) is False
-    assert _raises_exception(lambda: cfg.get(15)) is False
-    assert _raises_exception(lambda: cfg_mf.get(0)) is True
-    assert _raises_exception(lambda: cfg_mf.get(15)) is False
-    # test subrange_length
-    assert cfg.subrange_length(0, 64) == 64
-    assert cfg.subrange_length(0, 32) == 32
-    assert cfg.subrange_length(16, 32) == 16
-    assert cfg.subrange_length(16, 16) == 0
-    assert _raises_exception(lambda: cfg.subrange_length(0, 128))
-    assert _raises_exception(lambda: cfg.subrange_length(-64, 64))
-    assert _raises_exception(lambda: cfg.subrange_length(64, 0))
-    assert cfg_mf.subrange_length(0, 64) == 34
-    assert cfg_mf.subrange_length(0, 32) == 17
-    assert cfg_mf.subrange_length(16, 32) == 10
-    assert cfg_mf.subrange_length(16, 16) == 0
-    assert _raises_exception(lambda: cfg_mf.subrange_length(0, 128))
-    assert _raises_exception(lambda: cfg_mf.subrange_length(-64, 64))
-    assert _raises_exception(lambda: cfg_mf.subrange_length(64, 0))
-    # test point2knob
-    assert cfg.point2knob(0) == [0, 0]
-    assert cfg.point2knob(4) == [4, 0]
-    assert cfg.point2knob(8) == [0, 1]
-    assert cfg.point2knob(12) == [4, 1]
-    assert cfg_mf.point2knob(0) == [0, 0]
-    assert cfg_mf.point2knob(4) == [4, 0]
-    assert cfg_mf.point2knob(8) == [0, 1]
-    assert cfg_mf.point2knob(12) == [4, 1]
-    # test knob2point
-    assert cfg.knob2point([0, 0]) == 0
-    assert cfg.knob2point([4, 0]) == 4
-    assert cfg.knob2point([0, 1]) == 8
-    assert cfg.knob2point([4, 1]) == 12
-    assert cfg_mf.knob2point([0, 0]) == 0
-    assert cfg_mf.knob2point([4, 0]) == 4
-    assert cfg_mf.knob2point([0, 1]) == 8
-    assert cfg_mf.knob2point([4, 1]) == 12
-    # get_rand_index
-    cfg_valid_indexes = list(filter(lambda idx: cfg.is_index_valid(idx), range(cfg.range_length)))
-    assert cfg.get_rand_index() in cfg_valid_indexes
-    assert cfg.get_rand_index(start=15, end=16) == 15
-    assert 10 <= cfg.get_rand_index(start=10, end=20) < 20
-    assert cfg.get_rand_index(to_exclude=cfg_valid_indexes[:-1]) == cfg_valid_indexes[-1:][0]
-    cfg_mf_valid_indexes = list(
-        filter(lambda idx: cfg_mf.is_index_valid(idx), range(cfg_mf.range_length))
-    )
-    assert cfg_mf.get_rand_index() in cfg_mf_valid_indexes
-    assert cfg_mf.get_rand_index(start=15, end=16) == 15
-    assert 10 <= cfg_mf.get_rand_index(start=10, end=20) < 20
-    assert (
-        cfg_mf.get_rand_index(to_exclude=cfg_mf_valid_indexes[:-1]) == cfg_mf_valid_indexes[-1:][0]
-    )
-    # get_next_index
-    assert cfg.get_next_index(0) == 1
-    assert cfg.get_next_index(0, 1) == 1
-    assert cfg.get_next_index(0, 2) == 2
-    assert cfg.get_next_index(0, -1) is None
-    assert cfg.get_next_index(0, -2) is None
-    assert cfg.get_next_index(63) is None
-    assert cfg.get_next_index(63, 1) is None
-    assert cfg.get_next_index(63, 2) is None
-    assert cfg.get_next_index(63, -1) == 62
-    assert cfg.get_next_index(63, -2) == 61
-    assert cfg.get_next_index(60, 1, end=63) == 61
-    assert cfg.get_next_index(63, -1, start=60) == 62
-    assert cfg_mf.get_next_index(0) == 5
-    assert cfg_mf.get_next_index(0, 1) == 5
-    assert cfg_mf.get_next_index(0, 2) == 6
-    assert cfg_mf.get_next_index(0, -1) is None
-    assert cfg_mf.get_next_index(0, -2) is None
-    assert cfg_mf.get_next_index(63) is None
-    assert cfg_mf.get_next_index(63, 1) is None
-    assert cfg_mf.get_next_index(63, 2) is None
-    assert cfg_mf.get_next_index(63, -1) == 58
-    assert cfg_mf.get_next_index(63, -2) == 57
-    assert cfg_mf.get_next_index(60, 1, end=63) is None
-    assert cfg_mf.get_next_index(63, -1, start=60) is None
-    # test sample_ints
-    cfg_ints = cfg.sample_ints(5)
-    assert len(cfg_ints) == 5
-    assert set(cfg_ints).issubset(cfg_valid_indexes)
-    cfg_mf_ints = cfg_mf.sample_ints(5)
-    assert len(cfg_mf_ints) == 5
-    assert set(cfg_mf_ints).issubset(cfg_mf_valid_indexes)
-    # test random_walk
-    cfg_walk = cfg.random_walk(15)
-    assert cfg_walk != 15
-    assert cfg_walk in cfg_valid_indexes
-    cfg_mf_walk = cfg_mf.random_walk(15)
-    assert cfg_mf_walk != 15
-    assert cfg_mf_walk in cfg_mf_valid_indexes
-
-
-def test_filter_and_multi_filter():
-    # test the order: filter -> multi_filter
-    cfg = ConfigSpace()
-    gemm_func(cfg, 128, filter_y=lambda y: y.size[-1] < 64)
-    # after adding filter
-    assert len(cfg) == 48
-    assert cfg.range_length == 48
-    cfg.multi_filter(
-        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    # after adding multi_filter
-    assert len(cfg) == 27
-    assert cfg.range_length == 48
-
-    # test the order: multi_filter -> filter
-    cfg = ConfigSpace()
-    s, (A, B, C) = gemm_func(cfg, 128, filter_y=None)
-    cfg.multi_filter(
-        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
-    )
-    # after adding multi_filter
-    assert len(cfg) == 34
-    assert cfg.range_length == 64
-    y, x = s[C].op.axis
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=2, filter=lambda y: y.size[-1] < 64)
-    # after adding filter
-    assert len(cfg) == 27
-    assert cfg.range_length == 48
-
-
-if __name__ == "__main__":
-    test_split()
-    test_multi_filter()
-    test_filter_and_multi_filter()
diff --git a/tests/python/autotvm/test_autotvm_xgboost_model.py b/tests/python/autotvm/test_autotvm_xgboost_model.py
deleted file mode 100644
index b9f157247eae..000000000000
--- a/tests/python/autotvm/test_autotvm_xgboost_model.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import time
-
-import multiprocessing
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm import MeasureInput, MeasureResult
-from tvm.autotvm.tuner.xgboost_cost_model import XGBoostCostModel
-
-from tvm.testing.autotvm import get_sample_task, get_sample_records
-
-
-def test_fit():
-    task, target = get_sample_task()
-    records = get_sample_records(n=500)
-
-    base_model = XGBoostCostModel(task, feature_type="itervar", loss_type="reg")
-    base_model.fit_log(records, plan_size=32)
-
-    upper_model = XGBoostCostModel(task, feature_type="itervar", loss_type="reg")
-    upper_model.load_basemodel(base_model)
-
-    xs = np.arange(10)
-    ys = np.arange(10)
-
-    upper_model.fit(xs, ys, plan_size=32)
-
-    # feature lengths are not guaranteed to always be the same
-    upper_model.predict(np.ones(12))
-    upper_model.predict(np.ones(8))
-
-
-def fit_spawn():
-    assert multiprocessing.get_start_method(False) == "spawn"
-    test_fit()
-
-
-def test_fit_spawn():
-    # Subprocesses inherit the spawn method of their parents
-    ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=test_fit)
-    p.start()
-    p.join()
-
-
-def test_tuner():
-    task, target = get_sample_task()
-    records = get_sample_records(n=10)
-
-    tuner = autotvm.tuner.XGBTuner(task)
-    tuner.load_history(records, min_seed_records=10)
-    # Confirm that loading history successfully loaded a
-    # base_model.
-    assert tuner.cost_model.base_model is not None
-
-    tuner = autotvm.tuner.XGBTuner(task)
-    tuner.load_history(records, min_seed_records=11)
-    # Confirm that loading history did not load base_model
-    # when not enough records according to `min_seed_records`
-    # are provided
-    assert tuner.cost_model.base_model is None
-
-
-def test_update():
-    task, target = get_sample_task()
-    tuner = autotvm.tuner.XGBTuner(task)
-    n_records = 5
-    records = get_sample_records(n=n_records)
-    tuner.update([inp for inp, _ in records], [res for _, res in records])
-    assert len(tuner.xs) == n_records
-    assert len(tuner.ys) == n_records
-    assert len(tuner.visited) == n_records
-    assert all(x in tuner.visited for x in tuner.xs)
-
-
-if __name__ == "__main__":
-    test_fit()
-    test_fit_spawn()
-    test_tuner()
-    test_update()
diff --git a/tests/python/codegen/test_target_codegen_aarch64.py b/tests/python/codegen/test_target_codegen_aarch64.py
index f596549a10d0..366198c7de6a 100644
--- a/tests/python/codegen/test_target_codegen_aarch64.py
+++ b/tests/python/codegen/test_target_codegen_aarch64.py
@@ -25,7 +25,6 @@
 import tvm
 from tvm import te
 from tvm.script import tir as T
-from tvm.topi.arm_cpu.pstate_attributes import SMEAttributes
 from tvm.target.codegen import llvm_version_major
 
 
@@ -498,65 +497,6 @@ def main(A: T.Buffer((5,), "int32")):
     assert re.findall(r"llvm.vscale.i32", llvm), "No vscale in generated LLVM."
 
 
-@pytest.mark.skipif(
-    llvm_version_major() < 16, reason="SME is not supported in earlier versions of LLVM"
-)
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-def test_matmul_sme(dtype):
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+v9a,+sme"
-
-    def check_correct_assembly(dtype):
-        A = te.placeholder((32, 32), dtype=dtype, name="A")
-        B = te.placeholder((32, 32), dtype=dtype, name="B")
-
-        with tvm.target.Target(target):
-            C = tvm.topi.arm_cpu.matmul.compute_matmul_sme(
-                A, B, None, "float32", False, dtype == "float16"
-            )
-            prim_func = te.create_prim_func([A, B, C])
-
-            sch = tvm.tir.Schedule(prim_func)
-            tvm.topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch)
-            prim_func = sch.mod
-
-            f = tvm.build(prim_func, target=target)
-
-        assembly = f.get_source("asm")
-        smstart = re.findall(r"smstart\t(sm|za)", assembly)
-        loads = re.findall(r"ld1[whdb]\t{\s?za", assembly)
-        mopa = re.findall(
-            r"fmopa\tza[0-9].[shdb],( p[0-9]/[zm],)?( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]",
-            assembly,
-        )
-        stores = re.findall(r"st1[whdb]\t{\s?za", assembly)
-        smstop = re.findall(r"smstop\t(sm|za)", assembly)
-        whilelo = re.findall(r"whilelo\tp[0-9].[shdb]", assembly)
-
-        assert len(smstart) > 0
-        assert len(loads) > 0
-        assert len(mopa) > 0
-        assert len(stores) > 0
-        assert len(smstop) > 0
-        assert len(whilelo) > 0
-
-    check_correct_assembly(dtype=dtype)
-
-
-def test_matmul_sme_no_reduction_block():
-    @T.prim_func
-    def prim_func(a: T.handle, b: T.handle):
-        A = T.match_buffer(a, (4,))
-        B = T.match_buffer(b, (4,))
-        for i in range(3):
-            with T.block("block"):
-                vi = T.axis.remap("S", [i])
-                B[vi] = A[vi]
-
-    sch = tvm.tir.Schedule(prim_func)
-    with pytest.raises(AssertionError, match="Expected a single gemm reduction block."):
-        tvm.topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch)
-
-
 @pytest.mark.skipif(
     llvm_version_major() < 11, reason="Vscale is not supported in earlier versions of LLVM"
 )
@@ -636,206 +576,7 @@ def test_vscale_range_function_attribute(mattr, expect_attr):
         ), f"Unexpected function attribute vscale_range() was found in generated LLVM IR"
 
 
-@pytest.mark.skipif(
-    llvm_version_major() < 16, reason="Test requires an LLVM version of at least 16 to target SME"
-)
-@pytest.mark.parametrize(
-    "attr_key,attr_value,expected",
-    [
-        (
-            SMEAttributes.STREAMING_MODE,
-            SMEAttributes.StreamingModeValues.ENABLED,
-            "aarch64_pstate_sm_enabled",
-        ),
-        (
-            SMEAttributes.STREAMING_MODE,
-            SMEAttributes.StreamingModeValues.COMPATIBLE,
-            "aarch64_pstate_sm_compatible",
-        ),
-        (SMEAttributes.ZA_STORAGE, SMEAttributes.ZAStorageValues.NEW, "aarch64_pstate_za_new"),
-        (
-            SMEAttributes.ZA_STORAGE,
-            SMEAttributes.ZAStorageValues.SHARED,
-            "aarch64_pstate_za_shared",
-        ),
-    ],
-)
-def test_function_attributes(attr_key, attr_value, expected):
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sme"
-
-    @T.prim_func
-    def prim_func(a: T.handle, c: T.handle):
-        T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)})
-        A = T.match_buffer(a, (16,), "float32")
-        C = T.match_buffer(c, (1,), "float32")
-
-        with T.block("extern"):
-            T.block_attr({attr_key: attr_value})
-            for i in range(16):
-                C[0] += A[i]
-
-    func = tvm.build(prim_func, target=target)
-    ll = func.get_source("ll")
-
-    # Check that the attribute exists
-    attr = re.findall(rf".*{expected}*.", ll)
-    assert attr, f"Function attribute {expected} was not found in generated LLVM IR"
-
-    # Check this attribute is used on the "compute" function
-    func_attr_label = attr[0].split(" ")[1]
-    found_compute_func = False
-    for match in re.findall(rf".*{func_attr_label}*.", ll):
-        if "_compute_" in match:
-            found_compute_func = True
-
-    assert found_compute_func, (
-        f"The attribute {expected} was found to be under the label {func_attr_label}, "
-        "but it was not used by the 'compute' scope function."
-    )
-
-
-def test_unsupported_function_attribute_type():
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sme"
-
-    @T.prim_func
-    def prim_func(a: T.handle, c: T.handle):
-        T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)})
-        A = T.match_buffer(a, (16,), "float32")
-        C = T.match_buffer(c, (1,), "float32")
-
-        with T.block("extern"):
-            T.block_attr({SMEAttributes.STREAMING_MODE: True})
-            with T.block("root"):
-                for i in range(16):
-                    C[0] += A[i]
-
-    err_msg = f"Expect {SMEAttributes.STREAMING_MODE} to have a String value but was IntImm"
-    with pytest.raises(tvm.error.TVMError, match=err_msg):
-        tvm.build(prim_func, target=target)
-
-
-@pytest.mark.parametrize(
-    "attr_key,attr_value",
-    [
-        (SMEAttributes.STREAMING_MODE, SMEAttributes.StreamingModeValues.ENABLED),
-        (SMEAttributes.ZA_STORAGE, SMEAttributes.ZAStorageValues.NEW),
-    ],
-)
-def test_unsupported_multiple_function_attributes(attr_key, attr_value):
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sme"
-
-    @T.prim_func
-    def prim_func(a: T.handle, c: T.handle):
-        A = T.match_buffer(a, (16,), "float32")
-        C = T.match_buffer(c, (1,), "float32")
-
-        with T.block("root"):
-            with T.block("extern"):
-                T.block_attr({attr_key: attr_value})
-                for i in range(16):
-                    C[0] += A[i] * 2
-            with T.block("extern2"):
-                T.block_attr({attr_key: attr_value})
-                for i in range(16):
-                    C[0] += A[i] * 3
-
-    err_msg = f"Multiple definitions of {attr_key} attribute found in the function default_function_compute_"
-    with pytest.raises(tvm.error.TVMError, match=err_msg):
-        tvm.build(prim_func, target=target)
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 15, reason="Test requires an LLVM version of at least 15 to target SVE"
-)
-@pytest.mark.parametrize("dtype", ["float16", "float32"])
-@pytest.mark.parametrize(
-    "conv2d_impl",
-    [
-        (
-            tvm.topi.arm_cpu.compute_conv2d_NHWC_hybrid_SVE,
-            tvm.topi.arm_cpu.schedule_conv2d_NHWC_hybrid_SVE,
-            False,
-        ),
-        (
-            tvm.topi.arm_cpu.compute_conv2d_NHWC_hybrid_SVE,
-            tvm.topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR,
-            True,
-        ),
-    ],
-)
-def test_conv2d_sve(dtype, conv2d_impl):
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
-
-    def check_correct_assembly(dtype, compute, schedule, use_tir_schedule):
-        A = te.placeholder((1, 32, 32, 3), dtype=dtype, name="A")
-        W = te.placeholder((3, 3, 3, 8), dtype=dtype, name="B")
-        stride = padding = dilation = 1
-        B = compute(A, W, stride, padding, dilation, dtype)
-        if use_tir_schedule:
-            func = te.create_prim_func([A, W, B])
-            sch = schedule(tvm.tir.Schedule(func))
-            f = tvm.build(sch.mod["main"], target)
-        else:
-            s = schedule([B])
-            f = tvm.build(s, [A, W, B], target)
-        assembly = f.get_source("asm")
-
-        loads = re.findall(r"ld1[r]?[q]?[whdb]\t{\s?z", assembly)
-        compute_ops = re.findall(
-            r"fm(la|ad)\tz\d+.[shdb], (p\d+\/[zm], )?z\d+.[shdb], z\d+.[shdb]",
-            assembly,
-        )
-        stores = re.findall(r"st1[whdb]\t{\s?z", assembly)
-
-        assert len(loads) > 0
-        assert len(compute_ops) > 0
-        assert len(stores) > 0
-
-    with tvm.target.Target(target):
-        check_correct_assembly(dtype, *conv2d_impl)
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 16, reason="Test requires an LLVM version of at least 16 to target SME"
-)
-@pytest.mark.parametrize("dtype", ["float32"])
-def test_conv2d_sme(dtype):
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+v9a,+sme"
-
-    def check_correct_assembly(dtype):
-        A = te.placeholder((1, 32, 32, 3), dtype=dtype, name="A")
-        W = te.placeholder((3, 3, 3, 8), dtype=dtype, name="B")
-        stride = padding = dilation = 1
-
-        B = tvm.topi.arm_cpu.compute_conv2d_NHWC_hybrid_SME(A, W, stride, padding, dilation, dtype)
-        func = te.create_prim_func([A, W, B])
-        sch = tvm.topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR(tvm.tir.Schedule(func))
-        f = tvm.build(sch.mod["main"], target)
-
-        assembly = f.get_source("asm")
-        smstart = re.findall(r"smstart\t(sm|za)", assembly)
-        loads = re.findall(r"ld1[whdb]\t{\s?za", assembly)
-        mopa = re.findall(
-            r"fmopa\tza[0-9].[shdb],( p[0-9]/[zm],)?( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]",
-            assembly,
-        )
-        stores = re.findall(r"st1[whdb]\t{\s?za", assembly)
-        smstop = re.findall(r"smstop\t(sm|za)", assembly)
-        whilelo = re.findall(r"whilelo\tp[0-9].[shdb]", assembly)
-
-        assert len(smstart) > 0
-        assert len(loads) > 0
-        assert len(mopa) > 0
-        assert len(stores) > 0
-        assert len(smstop) > 0
-        assert len(whilelo) > 0
-
-    with tvm.target.Target(target):
-        check_correct_assembly(dtype=dtype)
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 11,
+@pytest.mark.skip(
     reason="Vscale and get.active.lane.mask are not supported in earlier versions of LLVM",
 )
 def test_get_active_lane_mask():
@@ -854,8 +595,7 @@ def before(a: T.handle):
     assert "get.active.lane.mask" in ll
 
 
-@pytest.mark.skipif(
-    llvm_version_major() < 11,
+@pytest.mark.skip(
     reason="Vscale and get.active.lane.mask are not supported in earlier versions of LLVM",
 )
 def test_predicated_scalable_buffer():
diff --git a/tests/python/codegen/test_target_codegen_blob.py b/tests/python/codegen/test_target_codegen_blob.py
index 5266f481f556..a61e6e894c83 100644
--- a/tests/python/codegen/test_target_codegen_blob.py
+++ b/tests/python/codegen/test_target_codegen_blob.py
@@ -17,53 +17,12 @@
 
 import ctypes
 import numpy as np
-from tvm import relay
-import tvm.relay.testing
-from tvm.contrib import graph_executor, cc, utils, popen_pool, tar
+from tvm.contrib import cc, utils, popen_pool, tar
 import tvm
 import tvm.testing
 from tvm.script import ir as I, tir as T
 
 
-@tvm.testing.uses_gpu
-def test_synthetic():
-    for device in ["llvm", "cuda"]:
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled..." % device)
-            return
-
-    input_shape = (1, 5, 23, 61)
-
-    def verify(data):
-        mod, params = relay.testing.synthetic.get_workload(input_shape=input_shape)
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build_module.build(mod, "llvm", params=params)
-        dev = tvm.cpu()
-        module = graph_executor.GraphModule(lib["default"](dev))
-        module.set_input("data", data)
-        module.run()
-        out = module.get_output(0).numpy()
-        return out
-
-    synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload(input_shape=input_shape)
-    with tvm.transform.PassContext(opt_level=3):
-        synthetic_gpu_lib = relay.build_module.build(synthetic_mod, "cuda", params=synthetic_params)
-
-    temp = utils.tempdir()
-    path_lib = temp.relpath("deploy_lib.so")
-    synthetic_gpu_lib.export_library(path_lib)
-
-    loaded_lib = tvm.runtime.load_module(path_lib)
-    data = np.random.uniform(-1, 1, size=input_shape).astype("float32")
-    dev = tvm.cuda()
-    module = graph_executor.GraphModule(loaded_lib["default"](dev))
-    module.set_input("data", data)
-    module.run()
-    out = module.get_output(0).numpy()
-
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
 @tvm.testing.uses_gpu
 def test_cuda_multi_lib():
     # test combining two system lib together
diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py
index 112d1151febd..7b370f3e3211 100644
--- a/tests/python/codegen/test_target_codegen_cuda.py
+++ b/tests/python/codegen/test_target_codegen_cuda.py
@@ -432,7 +432,6 @@ def test_rfactor_predicates(target, dev):
 def test_cuda_const_float_to_half():
     # This import is required to use nvcc to perform code gen;
     # otherwise it is found that the code gen is done by nvrtc.
-    from tvm import autotvm
 
     shape = (2, 3, 4)
     a = te.placeholder(shape, dtype="float16", name="a")
@@ -455,68 +454,6 @@ def test_cuda_const_float_to_half():
     np.testing.assert_equal(c.numpy(), a_np > b.value)
 
 
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_cuda_reduction():
-    def check(device, dtype, m=32, n=32):
-        if not tvm.testing.device_enabled(device):
-            print("Skipping", device)
-            return
-        dev = tvm.device(device, 0)
-        a = te.placeholder((m, n), name="a", dtype=dtype)
-        b = te.placeholder((m, n), name="b", dtype=dtype)
-        c = a + b
-        d = a * b
-        e = topi.elemwise_sum([c, d])
-        g = topi.sum(e)
-        with tvm.target.Target(device):
-            sg = topi.cuda.schedule_reduce(g)
-            func = tvm.build(sg, [a, b, g], device)
-            a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
-            b_np = np.random.uniform(size=(m, n)).astype(b.dtype)
-            g_np = np.sum(np.add(a_np * b_np, a_np + b_np))
-            a_nd = tvm.nd.array(a_np, dev)
-            b_nd = tvm.nd.array(b_np, dev)
-            g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
-            func(a_nd, b_nd, g_nd)
-            tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-3)
-
-    check("cuda", "float32")
-    check("rocm", "float32")
-    check("cuda", "float16")
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_cuda_mix_threaded_and_normal_reduction():
-    def check(device, dtype, m=32, n=32):
-        if not tvm.testing.device_enabled(device):
-            print("Skipping", device)
-            return
-        dev = tvm.device(device, 0)
-        if dtype == "float16" and not have_fp16(dev.compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        a = tvm.te.placeholder((m, n), name="a", dtype=dtype)
-        b = topi.sum(a)
-        with tvm.target.Target(device):
-            sb = tvm.te.create_schedule(b.op)
-            i, _ = b.op.reduce_axis
-            sb[b].bind(i, tvm.te.thread_axis("threadIdx.x"))
-            func = tvm.build(sb, [a, b], device)
-            a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
-            b_np = np.sum(a_np)
-            a_nd = tvm.nd.array(a_np, dev)
-            b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev)
-            func(a_nd, b_nd)
-            tvm.testing.assert_allclose(b_nd.numpy(), b_np, rtol=1e-3)
-
-    check("cuda", "float32")
-    check("rocm", "float32")
-    check("cuda", "float16")
-
-
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_cuda_floordiv_with_vectorization():
diff --git a/tests/python/codegen/test_target_codegen_hexagon.py b/tests/python/codegen/test_target_codegen_hexagon.py
index 344c7a976248..c97637f927b7 100644
--- a/tests/python/codegen/test_target_codegen_hexagon.py
+++ b/tests/python/codegen/test_target_codegen_hexagon.py
@@ -21,7 +21,6 @@
 import re
 import sys
 import tvm
-import tvm.relay
 import tvm.testing
 import tvm.contrib.hexagon as hexagon
 
diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py
index e8036467ffb6..d629d93d365e 100644
--- a/tests/python/codegen/test_target_codegen_llvm.py
+++ b/tests/python/codegen/test_target_codegen_llvm.py
@@ -27,7 +27,6 @@
 import tvm.testing
 from tvm import te
 from tvm.contrib import clang, utils
-from tvm.relay.backend import Runtime
 from tvm.script import tir as T, ir as I
 from tvm.target.codegen import llvm_get_intrinsic_name, llvm_lookup_intrinsic_id
 
@@ -754,12 +753,10 @@ def test_llvm_crt_static_lib():
     A = te.placeholder((32,), dtype="bfloat16")
     B = te.placeholder((32,), dtype="bfloat16")
     d = te.compute((32,), lambda x: A[x] + B[x])
-    sch = te.create_schedule(d.op)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B, d]))
     module = tvm.build(
-        sch,
-        [A, B, d],
+        mod.with_attr("system_lib_prefix", ""),
         target=tvm.target.Target("llvm"),
-        runtime=Runtime("crt", {"system-lib": True}),
     )
     print(module.get_source())
     module.save("test.o")
diff --git a/tests/python/codegen/test_target_codegen_opencl.py b/tests/python/codegen/test_target_codegen_opencl.py
index 9222947ae47e..079553665ffb 100644
--- a/tests/python/codegen/test_target_codegen_opencl.py
+++ b/tests/python/codegen/test_target_codegen_opencl.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te, relay
+from tvm import te
 import tvm.testing
 import re
 import pytest
@@ -234,197 +234,5 @@ def get_kernel_args(source):
     return max_args
 
 
-def _validate_opencl_executors(executor_type, get_model, ref_impl):
-    from tvm.contrib import graph_executor
-    from tvm.runtime.vm import VirtualMachine
-
-    input_dict, model = get_model()
-    if executor_type == "ge":
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(model, target_host="llvm", target=target)
-        ocl_lib = lib.get_lib()
-    else:
-        module = tvm.IRModule({})
-        module["main"] = model
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.vm.compile(module, target=target, target_host="llvm")
-        ocl_lib = lib.module.imported_modules[0]
-    opencl_modules = list(filter(lambda mod: mod.type_key == "opencl", ocl_lib.imported_modules))
-    assembly = opencl_modules[0].get_source()
-    with tvm.target.Target(target):
-        limit = tvm.target.Target.current().max_function_args
-    max_num = _get_maximum_kernel_args(assembly)
-    assert max_num <= limit
-
-    dev = tvm.cl()
-    if executor_type == "ge":
-        module = graph_executor.GraphModule(lib["default"](dev))
-        module.set_input(**input_dict)
-        module.run()
-        tvm_out = module.get_output(0)
-    else:
-        vm = VirtualMachine(lib, dev, "naive")
-        data = {}
-        for k, v in input_dict.items():
-            data[k] = tvm.nd.array(v, dev)
-        vm.set_input("main", **data)
-        vm.invoke_stateful("main")
-        tvm_out = vm.get_outputs()[0]
-
-    np_result = ref_impl(list(input_dict.values()))
-    np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-2, atol=1e-2)
-
-
-shape_type = tvm.testing.parameter("dynamic", "static")
-executor_type = tvm.testing.parameter("ge", "vm")
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_opencl
-def test_opencl_args_split(executor_type, shape_type):
-    def _get_model():
-        if shape_type == "dynamic":
-            shape = (tvm.tir.Any(), 1, 1, 3)
-        else:
-            shape = (1, 1, 1, 3)
-        shape_np = (1, 1, 1, 3)
-        dtype = "float32"
-        axis = 1
-        tensors_num = 300
-        inputs = []
-        inputs_np = {}
-        for i in range(tensors_num):
-            inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
-            inputs_np[f"p{i}"] = np.random.uniform(size=shape_np).astype(dtype)
-
-        inp = relay.Tuple(inputs)
-        concat = relay.op.concatenate(inp, axis)
-        return inputs_np, relay.Function(inputs, concat)
-
-    def ref_impl(inputs):
-        axis = 1
-        return np.concatenate(tuple(inputs), axis=axis)
-
-    if executor_type == "ge" and shape_type == "dynamic":
-        pytest.skip()
-    _validate_opencl_executors(executor_type, _get_model, ref_impl)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_opencl
-def test_opencl_fuse_max_args(executor_type, shape_type):
-    if shape_type == "dynamic":
-        shape = (tvm.tir.Any(), 20)
-        ops_num = 80
-    else:
-        shape = (1, 20)
-        ops_num = 300
-    shape_np = (1, 20)
-    dtype = "float32"
-
-    def _base_func(name):
-        x = relay.var(name, shape=shape)
-        y = relay.add(x, relay.const(1, "float32"))
-        w = relay.exp(y)
-        return x, w
-
-    def _get_model():
-        inp = []
-        inputs_np = {}
-        out = []
-        for i in range(ops_num):
-            x, w = _base_func(f"x{i}")
-            inputs_np[f"x{i}"] = np.random.uniform(size=shape_np).astype(dtype)
-            inp.append(x)
-            out.append(w)
-        w = out[0]
-        for i in range(len(out) - 1):
-            w = relay.add(w, out[i + 1])
-        return inputs_np, relay.Function(inp, w)
-
-    def ref_impl(inputs):
-        w = np.exp(inputs[0] + 1)
-        for i in range(len(inputs) - 1):
-            w = w + np.exp(inputs[i + 1] + 1)
-        return w
-
-    if executor_type == "ge" and shape_type == "dynamic":
-        pytest.skip()
-    _validate_opencl_executors(executor_type, _get_model, ref_impl)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_opencl
-def test_fuse_concat_max_num_args(executor_type, shape_type):
-    """
-    In this test, we have an operation with 3 inputs before concat. In the
-    SplitArgs we cannot calculate these inputs as inputs to the concat layer,
-    because they will be added to the concat after the fusing operation. So
-    FuseOps pass should handle this case and stop fusing before the concat
-    layer.
-
-    The example:
-       x     y     z                  x     y     z
-       \     |     /                  \     |     /
-        \    |    /                    \    |    /
-           where            ...           where
-             |                              |
-            exp                            exp
-             \                              /
-              \                            /
-               \----->    concat    <-----/
-    """
-    if shape_type == "dynamic":
-        shape = (tvm.tir.Any(), 20)
-        ops_num = 80
-    else:
-        shape = (10, 20)
-        ops_num = 300
-    shape_np = (10, 20)
-    dtype = "float32"
-    axis = 1
-
-    def _base_func(name):
-        x = relay.var(name, shape=shape)
-        y = relay.var(f"y{name}", shape=shape)
-        z = relay.var(f"z{name}", shape=shape)
-        cond = relay.less(x, relay.const(1, "float32"))
-        l = relay.add(y, relay.const(1, "float32"))
-        r = relay.add(z, relay.const(5, "float32"))
-        w = relay.where(cond, l, r)
-        w = relay.exp(w)
-        return [x, y, z], w
-
-    def _get_model():
-        inp = []
-        out = []
-        inputs_np = {}
-        for i in range(ops_num):
-            inputs, w = _base_func(f"x{i}")
-            inputs_np[f"x{i}"] = np.random.uniform(size=shape_np).astype(dtype)
-            inputs_np[f"yx{i}"] = np.random.uniform(size=shape_np).astype(dtype)
-            inputs_np[f"zx{i}"] = np.random.uniform(size=shape_np).astype(dtype)
-            inp.extend(inputs)
-            out.append(w)
-        t = relay.Tuple(out)
-        w = relay.op.concatenate(t, axis)
-        return inputs_np, relay.Function(inp, w)
-
-    def ref_impl(inputs):
-        res = []
-        for i in range(0, len(inputs), 3):
-            x = inputs[i]
-            y = inputs[i + 1]
-            z = inputs[i + 2]
-            comp = np.where(x < 1, y + 1, z + 5)
-            comp = np.exp(comp)
-            res.append(comp)
-        return np.concatenate(tuple(res), axis=axis)
-
-    if executor_type == "ge" and shape_type == "dynamic":
-        pytest.skip()
-    _validate_opencl_executors(executor_type, _get_model, ref_impl)
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/codegen/test_target_codegen_vulkan.py b/tests/python/codegen/test_target_codegen_vulkan.py
index 6973040cb276..9d00f047cb69 100644
--- a/tests/python/codegen/test_target_codegen_vulkan.py
+++ b/tests/python/codegen/test_target_codegen_vulkan.py
@@ -26,7 +26,7 @@
 
 import tvm
 import tvm.testing
-from tvm import relay, te
+from tvm import te
 from tvm.topi.math import cast
 from tvm.script import tir as T, ir as I
 from tvm.tir import TensorIntrin, IntImm, Cast, Schedule
@@ -162,10 +162,10 @@ def build_f(f_ref):
             a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n,)))
             b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np.random.uniform(size=(n,)))
             cs = [tvm.nd.empty((n,), A.dtype, dev) for _ in fs]
-            for ((f, _), c) in zip(fs, cs):
+            for (f, _), c in zip(fs, cs):
                 f(a, b, c)
 
-            for ((_, ref), c) in zip(fs, cs):
+            for (_, ref), c in zip(fs, cs):
                 tvm.testing.assert_allclose(c.numpy(), ref(a.numpy(), b.numpy()))
 
         ts = [threading.Thread(target=worker) for _ in range(np.random.randint(1, 10))]
@@ -233,62 +233,10 @@ def do_copy(A, B, n):
     tvm.testing.assert_allclose(b.numpy(), ref)
 
 
-def check_mod(target, dev, mod, x_np, res_np):
-    res = relay.create_executor("vm", mod=mod, device=dev, target=target).evaluate()(x_np).numpy()
-    tvm.testing.assert_allclose(res, res_np, atol=1e-5)
-
-
-def test_sqrt(target, dev):
-    # Three 32 bit pushconstants: any_dim, stride, stride
-    dtype = "float32"
-    x = relay.var("x", shape=(relay.Any(),), dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], relay.sqrt(x))
-    x_np = np.random.uniform(size=(10,)).astype(dtype)
-    res_np = np.sqrt(x_np)
-
-    check_mod(target, dev, mod, x_np, res_np)
-
-
-def test_argsort(target, dev):
-    # One 64 bit and one 32 bit constants
-    dtype = "int32"
-    x = relay.var("x", shape=(relay.Any(),), dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], relay.argsort(x))
-    x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype)
-    res_np = np.argsort(x_np, kind="stable")
-
-    check_mod(target, dev, mod, x_np, res_np)
-
-
-def test_cumsum(target, dev):
-    # One 64 bit and one 32 bit constants
-    dtype = "int32"
-    x = relay.var("x", shape=(relay.Any(),), dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], relay.cumsum(x))
-    x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype)
-    res_np = np.cumsum(x_np)
-
-    check_mod(target, dev, mod, x_np, res_np)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_unique(target, dev):
-    dtype = "int32"
-    x = relay.var("x", shape=(relay.Any(),), dtype=dtype)
-    mod = tvm.IRModule()
-    [unique, _, _, num_unique] = relay.unique(x, is_sorted=True)
-    mod["main"] = relay.Function([x], relay.op.strided_slice(unique, begin=[0], end=num_unique))
-    x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype)
-    res_np = np.unique(x_np)
-    check_mod(target, dev, mod, x_np, res_np)
-
-
 vulkan_parameter_impl = tvm.testing.parameter("push_constants", "ubo")
 vulkan_parameter_dtype = tvm.testing.parameter("int32", "float32", "int64")
 
+
 # Only run on vulkan because extremely large numbers of input
 # parameters can crash cuda/llvm compiler.
 @tvm.testing.parametrize_targets("vulkan -from_device=0")
diff --git a/tests/python/codegen/test_target_codegen_x86.py b/tests/python/codegen/test_target_codegen_x86.py
index 8ff9dbb3ddc8..a276940050b1 100644
--- a/tests/python/codegen/test_target_codegen_x86.py
+++ b/tests/python/codegen/test_target_codegen_x86.py
@@ -64,48 +64,5 @@ def fp16_to_fp32(target, width, match=None, not_match=None):
 is_32bit = platform.architecture()[0] == "32bit"
 
 
-@tvm.testing.requires_llvm
-@pytest.mark.skipif(is_32bit, reason=f"Fails in CI due to architecture mismatch in JIT")
-@pytest.mark.parametrize("feature_string", ["-sse2", "+sse2"])
-def test_fp16_fp32_conversions(feature_string):
-    relay_model = textwrap.dedent(
-        """
-        #[version = "0.0.5"]
-        def @main(%inp : Tensor[(3), float32], %cst : Tensor[(3), float32]) {
-            %1 = cast(%inp, dtype="float16");
-            %2 = cast(%cst, dtype="float16");
-            %3 = add(%1, %2);
-            %4 = cast(%3, dtype="float32");
-            %4
-        }
-        """
-    )
-
-    ir_mod = tvm.relay.fromtext(relay_model)
-
-    arch = "i386" if machine == "i386" else "x86_64"
-    aot_factory = tvm.relay.build(
-        ir_mod,
-        params={"cst": np.array([1.0, 2.0, 3.0], dtype="float32")},
-        target=f"llvm --mtriple={arch} --mattr={feature_string}",
-        executor=tvm.relay.backend.Executor(
-            "aot", {"interface-api": "packed", "unpacked-api": False}
-        ),
-    )
-
-    mod_name = aot_factory["list_module_names"]()[0]
-    executor = aot_factory[mod_name]
-    mod = executor(tvm.cpu(0))
-
-    inp = tvm.nd.array(np.array([1.1, 2.1, 3.1], dtype="float32"), device=tvm.cpu(0))
-
-    mod.get_function("set_input")(0, inp)
-    mod.get_function("run")()
-    out = mod.get_function("get_output")(0)
-
-    expected = np.array([2.1, 4.1, 6.1], dtype="float32")
-    np.testing.assert_allclose(out.asnumpy(), expected, rtol=1e-3)
-
-
 if __name__ == "__main__":
     test_fp16_to_fp32()
diff --git a/tests/python/codegen/test_target_texture_codegen_opencl.py b/tests/python/codegen/test_target_texture_codegen_opencl.py
deleted file mode 100644
index 213ad8d7ba41..000000000000
--- a/tests/python/codegen/test_target_texture_codegen_opencl.py
+++ /dev/null
@@ -1,1778 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import autotvm
-from tvm import te
-from tvm.topi import testing
-from tvm.topi.utils import get_const_tuple, simplify
-from tvm.topi import nn
-
-
-def compute_plus_one_rank3(shape):
-    X = te.placeholder(shape, name="X", dtype="float32")
-    Y = te.compute(shape, lambda i, j, k: X[i, j, k] + 1, name="Compute_Y")
-    return X, Y
-
-
-def schedule_plus_one_rank3(X, Y):
-    s = te.create_schedule(Y.op)
-    # Xt = s.cache_read(X, "texture", [Y])
-    # Xt = s.cache_read(X, "global", [Y])
-    Xt = s.cache_read(X, "global.texture", [Y])
-
-    # copy to texture stage
-    x, y, c = s[Xt].op.axis
-    s[Xt].bind(x, te.thread_axis("blockIdx.x"))
-    s[Xt].bind(y, te.thread_axis("threadIdx.x"))
-    s[Xt].vectorize(c)
-
-    # the compute stage
-    x, y, c = s[Y].op.axis
-    xo, yo, xi, yi = s[Y].tile(x, y, 4, 4)
-    s[Y].bind(xo, te.thread_axis("blockIdx.x"))
-    s[Y].bind(yo, te.thread_axis("threadIdx.x"))
-    s[Y].vectorize(c)
-    return s
-
-
-def compute_plus_one_rank5(shape):
-    X = te.placeholder(shape, name="X", dtype="float32")
-    Y = te.compute(shape, lambda i, j, k, l, m: X[i, j, k, l, m] + 1, name="Compute_Y")
-    return X, Y
-
-
-def schedule_plus_one_rank5(X, Y):
-    s = te.create_schedule(Y.op)
-    Xt = s.cache_read(X, "global.texture", [Y])
-
-    # copy to texture stage
-    a, b, c, d, e = s[Xt].op.axis
-    abc = s[Xt].fuse(a, b, c)
-    s[Xt].bind(abc, te.thread_axis("blockIdx.x"))
-    s[Xt].bind(d, te.thread_axis("threadIdx.x"))
-    s[Xt].vectorize(e)
-
-    # the compute stage
-    a, b, c, d, e = s[Y].op.axis
-    abc = s[Y].fuse(a, b, c)
-    xo, yo, xi, yi = s[Y].tile(abc, d, 4, 4)
-    s[Y].bind(xo, te.thread_axis("blockIdx.x"))
-    s[Y].bind(yo, te.thread_axis("threadIdx.x"))
-    s[Y].vectorize(e)
-    return s
-
-
-def compute_matmul(shape):
-    A = te.placeholder(shape, name="A", dtype="float32")
-    B = te.placeholder(shape, name="B", dtype="float32")
-    k = te.reduce_axis((0, shape[1]), name="k")
-    C = te.compute(
-        (shape[0] * shape[2], shape[0] * shape[2]),
-        lambda i, j: te.sum(
-            A[i // shape[2], k, i % shape[2]].astype("float32")
-            * B[j // shape[2], k, j % shape[2]].astype("float32"),
-            axis=[k],
-        ),
-        name="Compute_MatMul",
-    )
-    return A, B, C
-
-
-def schedule_matmul(A, B, C, local=False):
-    s = te.create_schedule(C.op)
-    At = s.cache_read(A, "global.texture", [C])
-    Bt = s.cache_read(B, "global.texture", [C])
-    if local:
-        Al = s.cache_read(At, "local", [C])
-        Bl = s.cache_read(Bt, "local", [C])
-    Cl = s.cache_write(C, "local")
-
-    bx = te.thread_axis("blockIdx.x")
-    tx = te.thread_axis("threadIdx.x")
-
-    def copy_to_texture(stage):
-        _io, _k, _ii = s[stage].op.axis
-        s[stage].vectorize(_ii)
-        s[stage].bind(_io, bx)
-        s[stage].bind(_k, tx)
-
-    copy_to_texture(At)
-    copy_to_texture(Bt)
-
-    # copy to global stage
-    _i, _j = s[C].op.axis
-    xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4)
-    s[C].unroll(xi)
-    s[C].vectorize(yi)
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(yo, te.thread_axis("threadIdx.x"))
-
-    # the compute stage
-    s[Cl].compute_at(s[C], yo)
-    (_k,) = Cl.op.reduce_axis
-    _x, _y = s[Cl].op.axis
-    s[Cl].reorder(_k, _x, _y)
-    s[Cl].unroll(_x)
-    s[Cl].vectorize(_y)
-
-    if local:
-        s[Al].compute_at(s[Cl], _k)
-        s[Al].vectorize(s[Al].op.axis[-1])
-        s[Bl].compute_at(s[Cl], _k)
-        s[Bl].vectorize(s[Bl].op.axis[-1])
-
-    return s
-
-
-def compute_matmul_inner(shape):
-    A = te.placeholder(shape, name="A", dtype="float32")
-    B = te.placeholder(shape, name="B", dtype="float32")
-    k = te.reduce_axis((0, shape[1] * shape[2]), name="k")
-    # (M, K) x (N, K)
-    # (32, 256) x (32, 256)
-    # (32, 64, 4) x (32, 64, 4)
-    C = te.compute(
-        (shape[0], shape[0]),
-        lambda i, j: te.sum(
-            A[i, k // shape[2], k % shape[2]].astype("float32")
-            * B[j, k // shape[2], k % shape[2]].astype("float32"),
-            axis=[k],
-        ),
-        name="Compute_MatMul",
-    )
-    return A, B, C
-
-
-def schedule_matmul_inner(A, B, C, local=False):
-    s = te.create_schedule(C.op)
-    At = s.cache_read(A, "global.texture", [C])
-    Bt = s.cache_read(B, "global.texture", [C])
-    if local:
-        Al = s.cache_read(At, "local", [C])
-        Bl = s.cache_read(Bt, "local", [C])
-    Cl = s.cache_write(C, "local")
-
-    bx = te.thread_axis("blockIdx.x")
-    tx = te.thread_axis("threadIdx.x")
-
-    def copy_to_texture(stage):
-        _i, _ko, _ki = s[stage].op.axis
-        s[stage].vectorize(_ki)
-        s[stage].bind(_i, bx)
-        s[stage].bind(_ko, tx)
-
-    copy_to_texture(At)
-    copy_to_texture(Bt)
-
-    # copy to global stage
-    _i, _j = s[C].op.axis
-    xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4)
-    s[C].unroll(xi)
-    s[C].vectorize(yi)
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(yo, te.thread_axis("threadIdx.x"))
-
-    # the compute stage
-    s[Cl].compute_at(s[C], yo)
-    (_k,) = Cl.op.reduce_axis
-    _x, _y = s[Cl].op.axis
-    s[Cl].reorder(_x, _y, _k)
-    s[Cl].unroll(_x)
-    # TODO(csullivan): consider whether the below error is worth resolving
-    # s[Cl].vectorize(_y) # error
-
-    if local:
-        s[Al].compute_at(s[Cl], _x)
-        s[Al].vectorize(s[Al].op.axis[-1])
-        s[Bl].compute_at(s[Cl], _x)
-        s[Bl].vectorize(s[Bl].op.axis[-1])
-
-    return s
-
-
-def compute_matmul_vector_accumulator(shapeA, shapeB):
-    # A x B
-    # (K/4, M, K%4) x (K, N/4, N%4) = (M, N)
-    # (32, 64, 4) x (128, 16, 4) = (64, 64)
-    A = te.placeholder(shapeA, name="A", dtype="float32")
-    B = te.placeholder(shapeB, name="B", dtype="float32")
-    k = te.reduce_axis((0, shapeB[0]), name="k")
-    C = te.compute(
-        (shapeA[1], shapeB[1] * shapeB[2]),
-        lambda i, j: te.sum(
-            A[k // shapeA[-1], i, k % shapeA[-1]].astype("float32")
-            * B[k, j // shapeB[-1], j % shapeB[-1]].astype("float32"),
-            axis=[k],
-        ),
-        name="Compute_MatMul",
-    )
-    return A, B, C
-
-
-def schedule_matmul_vector_accumulator(A, B, C, local=False):
-    s = te.create_schedule(C.op)
-    At = s.cache_read(A, "global.texture", [C])
-    Bt = s.cache_read(B, "global.texture", [C])
-    if local:
-        Al = s.cache_read(At, "local", [C])
-        Bl = s.cache_read(Bt, "local", [C])
-    Cl = s.cache_write(C, "local")
-
-    def copy_to_texture(stage):
-        _y, _x, _v = s[stage].op.axis
-        # TODO(csullivan): removing this vectorize results in numerical errors, autovectorize
-        s[stage].vectorize(_v)
-        s[stage].bind(_y, te.thread_axis("blockIdx.x"))
-        s[stage].bind(_x, te.thread_axis("threadIdx.x"))
-
-    copy_to_texture(At)
-    copy_to_texture(Bt)
-
-    # copy to global stage
-    _i, _j = s[C].op.axis
-    xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4)
-    s[C].unroll(xi)
-    s[C].vectorize(yi)
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(yo, te.thread_axis("threadIdx.x"))
-
-    # the compute stage
-    s[Cl].compute_at(s[C], yo)
-    (_k,) = Cl.op.reduce_axis
-    _a, _b = s[Cl].op.axis
-    _ko, _ki = s[Cl].split(_k, factor=4)
-    s[Cl].reorder(_ko, _a, _ki, _b)
-    s[Cl].unroll(_ki)
-    s[Cl].unroll(_a)
-    s[Cl].vectorize(_b)
-
-    if local:
-        s[Al].compute_at(s[Cl], _a)
-        _aa, _ka, _ba = s[Al].op.axis
-        # TODO(csullivan)[BEFORE PR]: removing this vectorize command causes a crash. This needs to be autovectorized.
-        s[Al].vectorize(_ba)
-        s[Bl].compute_at(s[Cl], _ko)
-        _ab, _kb, _bb = s[Bl].op.axis
-        s[Bl].vectorize(_bb)
-        s[Bl].unroll(_ab)
-
-    return s
-
-
-def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape):
-    # conv2d( [N, C, H, W, c] , [1, 1, C, K, k]
-    data = te.placeholder(input_shape, name="data", dtype="float32")
-    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
-    c = te.reduce_axis((0, input_shape[1]), name="C")
-    c4 = te.reduce_axis((0, input_shape[-1]), name="c4")
-    kh = te.reduce_axis((0, filter_shape[0]), name="kh")
-    kw = te.reduce_axis((0, filter_shape[1]), name="kw")
-    conv = te.compute(
-        (input_shape[0], filter_shape[-2], input_shape[2], input_shape[3], filter_shape[-1]),
-        lambda n, ko, i, j, ki: te.sum(
-            data[n, c, i, j, c4].astype("float32")
-            * filt[kh, kw, c * input_shape[-1] + c4, ko, ki].astype("float32"),
-            axis=[kh, kw, c, c4],
-        ),
-        # name="Compute_conv2d_1x1_NCHWc_RSCKk",
-        name="conv2d_1x1",
-    )
-    return data, filt, conv
-
-
-def schedule_conv2d_1x1_NCHWc_RSCKk(data, filt, conv):
-    # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4)
-    # outputs:
-    s = te.create_schedule(conv.op)
-    A, B, C = data, filt, conv
-    At = s.cache_read(A, "global.texture", [C])
-    Bt = s.cache_read(B, "global.texture", [C])
-    Al = s.cache_read(At, "local", [C])
-    Bl = s.cache_read(Bt, "local", [C])
-    Cl = s.cache_write(C, "local")
-
-    def copy_to_texture(stage):
-        axes = s[stage].op.axis
-        fused = s[stage].fuse(*axes[:-1])
-        block, thread = s[stage].split(fused, factor=32)
-        s[stage].vectorize(axes[-1])
-        s[stage].bind(block, te.thread_axis("blockIdx.x"))
-        s[stage].bind(thread, te.thread_axis("threadIdx.x"))
-
-    copy_to_texture(At)
-    copy_to_texture(Bt)
-
-    _n, _ko, _h, _w, _ki = s[C].op.axis
-    s[C].vectorize(_ki)
-    s[C].bind(_n, te.thread_axis("blockIdx.x"))
-    s[C].bind(_ko, te.thread_axis("threadIdx.x"))
-
-    s[Cl].compute_at(s[C], _w)
-    _nl, _kol, _hl, _wl, _kil = s[Cl].op.axis
-    _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis
-    _clo, _cli = s[Cl].split(_cl, factor=4)
-    s[Cl].reorder(_clo, _cli, _cl4, _kil)
-    s[Cl].unroll(_cli)
-    s[Cl].unroll(_cl4)
-    s[Cl].vectorize(_kil)
-
-    s[Al].compute_at(s[Cl], _cli)
-    s[Al].vectorize(s[Al].op.axis[-1])
-    s[Bl].compute_at(s[Cl], _kwl)
-    s[Bl].vectorize(s[Bl].op.axis[-1])
-
-    return s
-
-
-def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape):
-    # input_shape = [W, C, H, N, c] -> [W, C, H*N, c]
-    # filter_shape = [C, R, S, K, k] -> [C, R*S*K, k]
-    # output_shape: [WK, HN, k] -> [W, K, H, N, k]
-    data = te.placeholder(input_shape, name="data", dtype="float32")
-    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
-
-    packed_data = te.compute(
-        (input_shape[0], input_shape[1], input_shape[2] * input_shape[3], input_shape[4]),
-        lambda i, j, k, l: data[i, j, k // input_shape[3], k % input_shape[3], l],
-        name="packed_data",
-    )
-
-    # Logical transformation of Nd -> 3d tensor
-    # CRSKk -> C|RSK|k
-    # r = rsk // SK
-    # sk = rsk % SK
-    # s = sk // K == (rsk % SK) // K == (rsk // K) % S
-    # k = sk % K == (rsk % SK) % K == rsk % K
-    packed_filter = te.compute(
-        (filter_shape[0], filter_shape[1] * filter_shape[2] * filter_shape[3], filter_shape[4]),
-        lambda i, j, k: filt[
-            i,
-            j // (filter_shape[3] * filter_shape[2]),
-            (j // filter_shape[3]) % filter_shape[2],
-            j % filter_shape[3],
-            k,
-        ],
-        name="packed_filter",
-    )
-
-    c = te.reduce_axis((0, input_shape[1]), name="C")
-    c4 = te.reduce_axis((0, input_shape[-1]), name="c4")
-    r = te.reduce_axis((0, filter_shape[1]), name="r")
-    s = te.reduce_axis((0, filter_shape[2]), name="s")
-
-    conv = te.compute(
-        (input_shape[0], filter_shape[3], input_shape[2], input_shape[3], filter_shape[4]),
-        lambda w, ko, h, n, ki: te.sum(
-            packed_data[w, c, h * input_shape[3] + n, c4].astype("float32")
-            * packed_filter[
-                c * input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki
-            ].astype("float32"),
-            axis=[r, s, c, c4],
-        ),
-        name="conv2d_1x1",
-    )
-    return data, filt, packed_data, packed_filter, conv
-
-
-def schedule_conv2d_1x1_WCHNc_CRSKk(data, filt, packed_data, packed_filter, conv):
-    # data: [W, C, H*N, c]
-    # filter: [C, R*S*K, k]
-    # output: [W, K, H, N, k]
-
-    # conv2d( [N, C, H, W, c] , [1, 1, C, K, k]
-    # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4)
-
-    # data: (56, 128//4, 56*1, 4) = (56, 32, 56, 4)
-    # filt: (128, 1*1*128//4, 4) = (128, 32, 4)
-    # conv: (56, 32, 56, 1, 4)
-
-    s = te.create_schedule(conv.op)
-    cfg = autotvm.get_config()
-
-    s[packed_data].compute_inline()
-    s[packed_filter].compute_inline()
-    A, B, C = packed_data, packed_filter, conv
-    At = s.cache_read(A, "global.texture", [C])
-    Bt = s.cache_read(B, "global.texture", [C])
-    Al = s.cache_read(At, "local", [C])
-    Bl = s.cache_read(Bt, "local", [C])
-    Cl = s.cache_write(C, "local")
-
-    def copy_to_texture(stage):
-        axes = s[stage].op.axis
-        fused = s[stage].fuse(*axes[:-1])
-        block, thread = s[stage].split(fused, factor=32)
-        s[stage].vectorize(axes[-1])
-        s[stage].bind(block, te.thread_axis("blockIdx.x"))
-        s[stage].bind(thread, te.thread_axis("threadIdx.x"))
-
-    copy_to_texture(At)
-    copy_to_texture(Bt)
-
-    _w, _ko, _h, _n, _ki = s[C].op.axis
-    kernel_scope, _n = s[C].split(_n, nparts=1)
-
-    cfg.define_split("tile_f", _ko, num_outputs=4)
-    cfg.define_split("tile_w", _w, num_outputs=4)
-    cfg.define_split("tile_h", _h, num_outputs=4)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    bk, vk, tk, ki = cfg["tile_f"].apply(s, C, _ko)
-    bw, vw, tw, wi = cfg["tile_w"].apply(s, C, _w)
-    bh, vh, th, hi = cfg["tile_h"].apply(s, C, _h)
-    s[C].reorder(bh, _n, vh, th, hi)
-    bhn = s[C].fuse(bh, _n)
-
-    s[C].bind(bk, te.thread_axis("blockIdx.z"))
-    s[C].bind(bhn, te.thread_axis("blockIdx.y"))
-    s[C].bind(bw, te.thread_axis("blockIdx.x"))
-    s[C].bind(vk, te.thread_axis("vthread"))
-    s[C].bind(vh, te.thread_axis("vthread"))
-    s[C].bind(vw, te.thread_axis("vthread"))
-    s[C].bind(tk, te.thread_axis("threadIdx.z"))
-    s[C].bind(th, te.thread_axis("threadIdx.y"))
-    s[C].bind(tw, te.thread_axis("threadIdx.x"))
-    s[C].reorder(bw, bk, bhn, vw, vk, vh, tw, tk, th, ki, hi, wi, _ki)
-    s[C].vectorize(_ki)
-
-    # TODO(csullivan): Try uneven workgroup split
-    # _wo, _wi = s[C].split(_w, factor=4)
-    # #_hno, _hni = s[C].split(_hn, factor=8)
-    # #s[C].reorder(_wo, _wi, _ko, _hno, _hni, _ki)
-    # s[C].reorder(_wo, _ko, _hn, _ki, _wi)
-    # s[C].unroll(_wi)
-
-    # # mace:
-    # # const int out_ch_blk = get_global_id(0);
-    # # const int out_w_blk = get_global_id(1);
-    # # const int out_hb = get_global_id(2);
-
-    # bx = te.thread_axis("blockIdx.x")
-    # by = te.thread_axis("blockIdx.y")
-    # bz = te.thread_axis("blockIdx.z")
-    # s[C].bind(_ko, bx)
-    # s[C].bind(_wo, by)
-    # s[C].bind(_hn, bz)
-
-    # s[Cl].compute_at(s[C], _hn)
-    s[Cl].compute_at(s[C], th)
-
-    _wl, _kol, _hl, _nl, _kil = s[Cl].op.axis
-    _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis
-
-    cfg.define_split("tile_c", _cl, num_outputs=2)
-    cfg.define_split("tile_kh", _khl, num_outputs=2)
-    cfg.define_split("tile_kw", _kwl, num_outputs=2)
-
-    _clo, _cli = cfg["tile_c"].apply(s, Cl, _cl)
-    _khlo, _khli = cfg["tile_kh"].apply(s, Cl, _khl)
-    _kwlo, _kwli = cfg["tile_kw"].apply(s, Cl, _kwl)
-    # s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
-    s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli, _kol, _hl, _nl, _kil, _wl)
-    # s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli)
-    # s[Cl].reorder(_cl, _cl4, _kil, _wl)
-    s[Cl].unroll(_cl4)
-    s[Cl].unroll(_wl)
-    s[Cl].vectorize(_kil)
-
-    _wla, _cla, _hnla, _cl4a = s[Al].op.axis
-    s[Al].compute_at(s[Cl], _cli)
-    s[Al].vectorize(_cl4a)
-    s[Al].unroll(_wla)
-
-    _clb, _rskolb, _kilb = s[Bl].op.axis
-    s[Bl].compute_at(s[Cl], _cli)
-    s[Bl].vectorize(_kilb)
-    s[Bl].unroll(_clb)
-
-    s[C].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-
-    WO, K, HO, N, K4 = get_const_tuple(C.shape)
-    RSC, _, _ = get_const_tuple(B.shape)
-    cfg.add_flop(2 * N * K * K4 * HO * WO * RSC)
-
-    return s
-
-
-def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None):
-    """Convolution operator in NCHWc layout."""
-
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
-    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    # compute graph
-    pad_before = [0, 0, pad_top, pad_left, 0]
-    pad_after = [0, 0, pad_down, pad_right, 0]
-    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")
-
-    rcc = te.reduce_axis((0, in_channel_chunk), name="rc")
-    rcb = te.reduce_axis((0, in_channel_block), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    # NCHWc x KCRSk
-    # texture: NCH|W|c
-    # texture: K|CRS|k
-    # c = crs//RS
-    # rs = crs % RS
-    # r = rs // W == (crs // S) % R
-    # s = rs % W == crs % S
-    Filter = te.compute(
-        (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
-        lambda ffc, crs, ffb: Filter[
-            ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb
-        ],
-        name="packed_filter",
-    )
-    return te.compute(
-        (batch, num_filter_chunk, out_height, out_width, num_filter_block),
-        lambda nn, ffc, yy, xx, ffb: te.sum(
-            temp[
-                nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb
-            ].astype(out_dtype)
-            * Filter[
-                ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb
-            ].astype(out_dtype),
-            axis=[rcc, rcb, ry, rx],
-        ),
-        tag="conv2d_nchwc_kcrsk_texture",
-    )
-
-
-def schedule_conv2d_NCHWc_KCRSk(cfg, s, conv):
-    """schedule optimized for batch size = 1"""
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_fc", fc, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rcc", rcc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    pad_data, flattened_kernel = s[conv].op.input_tensors
-    kernel = s[flattened_kernel].op.input_tensors[0]
-    s[flattened_kernel].compute_inline()
-
-    s[pad_data].compute_inline()
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-    kernel = flattened_kernel
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AT = s.cache_read(pad_data, "global.texture", [OL])
-    WT = s.cache_read(kernel, "global.texture", [OL])
-
-    def copy_to_texture(stage):
-        axes = s[stage].op.axis
-        fused = s[stage].fuse(*axes[:-1])
-        block, thread = s[stage].split(fused, factor=32)
-        s[stage].vectorize(axes[-1])
-        s[stage].bind(block, te.thread_axis("blockIdx.x"))
-        s[stage].bind(thread, te.thread_axis("threadIdx.x"))
-
-    copy_to_texture(AT)
-    copy_to_texture(WT)
-
-    AA = s.cache_read(AT, "shared", [OL])
-    WW = s.cache_read(WT, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[output].op.axis
-
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb)
-    s[output].vectorize(fb)
-    s[OL].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[OL].op.axis
-
-    rcc, rcb, ry, rx = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rcc"].apply(s, OL, rcc)
-    ryo, ryi = cfg["tile_ry"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, OL, rx)
-
-    # TODO(csullivan): check position of rcb
-    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
-    s[OL].vectorize(fb)
-    s[OL].unroll(rcb)
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-    # cooperative fetching
-    for load in [AA, WW]:
-        if load == WW:
-            n, fyx, v = s[load].op.axis
-            fused = s[load].fuse(n, fyx)
-        else:
-            n, f, y, x, v = s[load].op.axis
-            fused = s[load].fuse(n, f, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-        s[load].vectorize(v)
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-
-    N, OCC, OH, OW, OCB = get_const_tuple(output.shape)
-    _, ICKHKW, _ = get_const_tuple(kernel.shape)
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
-
-
-def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation, out_dtype=None):
-    """Convolution operator in NCHWc layout."""
-
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
-    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    # compute graph
-    pad_before = [0, 0, pad_top, pad_left, 0]
-    pad_after = [0, 0, pad_down, pad_right, 0]
-    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")
-
-    rcc = te.reduce_axis((0, in_channel_chunk), name="rc")
-    rcb = te.reduce_axis((0, in_channel_block), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    # NCHWc x KCRSk
-    # texture: NCH|W|c
-    # texture: K|CRS|k
-    # c = crs//RS
-    # rs = crs % RS
-    # r = rs // W == (crs // S) % R
-    # s = rs % W == crs % S
-    Filter = te.compute(
-        (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
-        lambda ffc, crs, ffb: Filter[
-            ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb
-        ],
-        name="packed_filter",
-    )
-    conv = te.compute(
-        (batch, num_filter_chunk, out_height, out_width, num_filter_block),
-        lambda nn, ffc, yy, xx, ffb: te.sum(
-            (
-                temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb]
-                * Filter[ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb]
-            ).astype(out_dtype),
-            axis=[rcc, rcb, ry, rx],
-        ),
-        tag="conv2d_nchwc_kcrsk_texture",
-    )
-    output = te.compute(conv.shape, lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32"))
-    return output
-
-
-def schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output):
-    """schedule optimized for batch size = 1"""
-
-    conv = output.op.input_tensors[0]
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    rcc, rcb, ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_fc", fc, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rcc", rcc, num_outputs=2)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    pad_data, flattened_kernel = s[conv].op.input_tensors
-    kernel = s[flattened_kernel].op.input_tensors[0]
-    s[flattened_kernel].compute_inline()
-
-    s[pad_data].compute_inline()
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-    kernel = flattened_kernel
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AT = s.cache_read(pad_data, "global.texture", [OL])
-    WT = s.cache_read(kernel, "global.texture", [OL])
-
-    def copy_to_texture(stage):
-        axes = s[stage].op.axis
-        fused = s[stage].fuse(*axes[:-1])
-        block, thread = s[stage].split(fused, factor=32)
-        s[stage].vectorize(axes[-1])
-        s[stage].bind(block, te.thread_axis("blockIdx.x"))
-        s[stage].bind(thread, te.thread_axis("threadIdx.x"))
-
-    copy_to_texture(AT)
-    copy_to_texture(WT)
-
-    AA = s.cache_read(AT, "shared", [OL])
-    WW = s.cache_read(WT, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[output].op.axis
-
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb)
-    s[output].vectorize(fb)
-
-    s[OL].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[OL].op.axis
-
-    rcc, rcb, ry, rx = s[OL].op.reduce_axis
-    rco, rci = cfg["tile_rcc"].apply(s, OL, rcc)
-    ryo, ryi = cfg["tile_ry"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, OL, rx)
-
-    # TODO(csullivan): check position of rcb
-    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
-    s[OL].vectorize(fb)
-    s[OL].unroll(rcb)
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-    # cooperative fetching
-    for load in [AA, WW]:
-        if load == WW:
-            n, fyx, v = s[load].op.axis
-            fused = s[load].fuse(n, fyx)
-        else:
-            n, f, y, x, v = s[load].op.axis
-            fused = s[load].fuse(n, f, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-        s[load].vectorize(v)
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-
-    N, OCC, OH, OW, OCB = get_const_tuple(output.shape)
-    _, ICKHKW, _ = get_const_tuple(kernel.shape)
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
-
-
-def compute_depthwise_conv2d_NCHWc_KCRSk_acc32(
-    Input, Filter, stride, padding, dilation, out_dtype=None
-):
-    """Depthwise convolution operator in NCHWc layout."""
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, channel_chunk, in_height, in_width, channel_block = Input.shape
-    _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    out_channel_chunk = simplify(channel_chunk * channel_multiplier)
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    # compute graph
-    pad_before = [0, 0, pad_top, pad_left, 0]
-    pad_after = [0, 0, pad_down, pad_right, 0]
-    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")
-
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-
-    # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c]
-    # NCHWc x CMRS
-    # texture: NCH|W|c
-    # texture: C|MRS|c
-    # output: N
-    # m = mrs//RS
-    # rs = mrs % RS
-    # r = rs // W == (mrs // S) % R
-    # s = rs % W == mrs % S
-    Filter = te.compute(
-        (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block),
-        lambda ffc, mrs, ffb: Filter[
-            ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb
-        ],
-        name="packed_filter",
-    )
-
-    conv = te.compute(
-        (batch, out_channel_chunk, out_height, out_width, channel_block),
-        lambda nn, ffc, yy, xx, ffb: te.sum(
-            (
-                temp[
-                    nn,
-                    ffc // channel_multiplier,
-                    yy * stride_h + ry * dilation_h,
-                    xx * stride_w + rx * dilation_w,
-                    ffb,
-                ]
-                * Filter[
-                    ffc // channel_multiplier,
-                    ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx,
-                    ffb,
-                ]
-            ).astype(out_dtype),
-            axis=[ry, rx],
-        ),
-        tag="depthwise_conv2d_nchwc_kcrsk_texture",
-    )
-    return te.compute(
-        conv.shape, lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype("float32")
-    )
-
-
-def schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output):
-    """schedule optimized for batch size = 1"""
-
-    conv = output.op.input_tensors[0]
-
-    ##### space definition begin #####
-    n, fc, y, x, fb = s[conv].op.axis
-    ry, rx = s[conv].op.reduce_axis
-    cfg.define_split("tile_fc", fc, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_ry", ry, num_outputs=2)
-    cfg.define_split("tile_rx", rx, num_outputs=2)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-
-    pad_data, flattened_kernel = s[conv].op.input_tensors
-    kernel = s[flattened_kernel].op.input_tensors[0]
-    s[flattened_kernel].compute_inline()
-
-    s[pad_data].compute_inline()
-    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-    kernel = flattened_kernel
-
-    if conv.op in s.outputs:
-        output = conv
-        OL = s.cache_write(conv, "local")
-    else:
-        output = s.outputs[0].output(0)
-        s[conv].set_scope("local")
-        OL = conv
-
-    # create cache stage
-    AT = s.cache_read(pad_data, "global.texture", [OL])
-    WT = s.cache_read(kernel, "global.texture", [OL])
-
-    def copy_to_texture(stage):
-        axes = s[stage].op.axis
-        fused = s[stage].fuse(*axes[:-1])
-        block, thread = s[stage].split(fused, factor=32)
-        s[stage].vectorize(axes[-1])
-        s[stage].bind(block, te.thread_axis("blockIdx.x"))
-        s[stage].bind(thread, te.thread_axis("threadIdx.x"))
-
-    copy_to_texture(AT)
-    copy_to_texture(WT)
-
-    AA = s.cache_read(AT, "shared", [OL])
-    WW = s.cache_read(WT, "shared", [OL])
-
-    # tile and bind spatial axes
-    n, fc, y, x, fb = s[output].op.axis
-
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-    bf = s[output].fuse(n, bf)
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb)
-    s[output].vectorize(fb)
-
-    s[OL].compute_at(s[output], tx)
-
-    # tile reduction axes
-    n, fc, y, x, fb = s[OL].op.axis
-
-    ry, rx = s[OL].op.reduce_axis
-    ryo, ryi = cfg["tile_ry"].apply(s, OL, ry)
-    rxo, rxi = cfg["tile_rx"].apply(s, OL, rx)
-
-    s[OL].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb)
-    s[OL].vectorize(fb)
-    # s[OL].unroll()
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-    # cooperative fetching
-    for load in [AA, WW]:
-        if load == WW:
-            n, fyx, v = s[load].op.axis
-            fused = s[load].fuse(n, fyx)
-        else:
-            n, f, y, x, v = s[load].op.axis
-            fused = s[load].fuse(n, f, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-        s[load].vectorize(v)
-
-    # unroll
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-
-    N, OCC, OH, OW, OCB = get_const_tuple(output.shape)
-    ICC, MKHKW, ICB = get_const_tuple(kernel.shape)
-    M = (OCC * OCB) // (ICC * ICB)
-    KHKW = MKHKW // M
-
-    if isinstance(N, int):
-        cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW)
-
-
-def scheduler(compute, schedule, *args, **kwargs):
-    placeholders = compute(*args)
-    s = schedule(*placeholders, **kwargs)
-    return s, placeholders
-
-
-def conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape):
-    placeholders = compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape)
-    s = schedule_conv2d_1x1_NCHWc_RSCKk(*placeholders)
-    return s, placeholders
-
-
-def conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape):
-    placeholders = compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape)
-    s = schedule_conv2d_1x1_WCHNc_CRSKk(*placeholders)
-    return s, (placeholders[0], placeholders[1], placeholders[-1])
-
-
-def conv2d_NCHWc_KCRSk(input_shape, filter_shape):
-    data = te.placeholder(input_shape, name="data", dtype="float32")
-    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
-    conv = compute_conv2d_NCHWc_KCRSk(data, filt, [1, 1], [0, 0], [1, 1], "float32")
-    cfg = autotvm.get_config()
-    s = te.create_schedule([x.op for x in [conv]])
-    schedule_conv2d_NCHWc_KCRSk(cfg, s, conv)
-    return s, (data, filt, conv)
-
-
-def conv2d_NCHWc_KCRSk_fp32_acc(input_shape, filter_shape):
-    data = te.placeholder(input_shape, name="data", dtype="float32")
-    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
-    output = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, [1, 1], [0, 0], [1, 1], "float32")
-    cfg = autotvm.get_config()
-    s = te.create_schedule([x.op for x in [output]])
-    schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output)
-    return s, (data, filt, output)
-
-
-def depthwise_conv2d_NCHWc_KCRSk_acc32(input_shape, filter_shape):
-    data = te.placeholder(input_shape, name="data", dtype="float32")
-    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
-    output = compute_depthwise_conv2d_NCHWc_KCRSk_acc32(
-        data, filt, [1, 1], [0, 0], [1, 1], "float32"
-    )
-    cfg = autotvm.get_config()
-    s = te.create_schedule([x.op for x in [output]])
-    schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output)
-    return s, (data, filt, output)
-
-
-def ref_convolution(data, kernel, stride, pad):
-    import mxnet as mx
-
-    groups = 1
-    kernel_size = (kernel.shape[2], kernel.shape[3])
-    num_filter = kernel.shape[0]
-    ref_res = mx.nd.Convolution(
-        data=mx.nd.array(data),
-        weight=mx.nd.array(kernel),
-        bias=None,
-        no_bias=True,
-        kernel=kernel_size,
-        stride=stride,
-        pad=pad,
-        num_filter=num_filter,
-        num_group=groups,
-    )
-    return ref_res.asnumpy()
-
-
-def ref_depthwise_convolution(data, kernel, stride, pad):
-    import mxnet as mx
-
-    groups = kernel.shape[0]
-    kernel_size = (kernel.shape[2], kernel.shape[3])
-    num_filter = kernel.shape[0]
-    multiplier = kernel.shape[1]
-    ref_res = mx.nd.Convolution(
-        data=mx.nd.array(data),
-        weight=mx.nd.array(kernel),
-        bias=None,
-        no_bias=True,
-        kernel=kernel_size,
-        stride=stride,
-        pad=pad,
-        num_filter=num_filter,
-        num_group=groups,
-    )
-    return ref_res.asnumpy()
-
-
-def validate(workload, target, dev, input_shapes, *args, **kwargs):
-    s, placeholders = workload(*input_shapes, *args, **kwargs)
-    func = tvm.driver.build(s, [*placeholders], target=target, name="TestFunction")
-
-    args_tvm = []
-    args_np = []
-    for var in placeholders[:-1]:
-        var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype)
-        args_np.append(var_np)
-        args_tvm.append(tvm.nd.array(var_np, dev))
-    args_tvm.append(
-        tvm.nd.array(
-            np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev
-        )
-    )
-    func(*args_tvm)
-
-    if "plus_one" in workload.__name__:
-        np_result = args_np[0] + 1.0
-    elif "matmul" in workload.__name__:
-        if "inner" in workload.__name__:
-            np_result = np.matmul(
-                args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0)
-            )
-        elif "accum" in workload.__name__:
-            np_result = np.matmul(
-                args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64)
-            )
-        else:
-            np_result = np.matmul(
-                args_np[0].transpose((0, 2, 1)).reshape(128, 64),
-                args_np[1].transpose(1, 0, 2).reshape(64, 128),
-            )
-    elif "conv2d_1x1_NCHWc_RSCKk" in workload.__name__:
-        vec_length = args_np[1].shape[-1]
-        # nchwc -> nchw
-        args_np[0] = (
-            args_np[0]
-            .transpose((0, 1, 4, 2, 3))
-            .reshape(
-                args_np[0].shape[0],
-                args_np[0].shape[1] * args_np[0].shape[-1],
-                args_np[0].shape[2],
-                args_np[0].shape[3],
-            )
-        )
-        # rsckk -> rsck -> kcrs
-        args_np[1] = (
-            args_np[1]
-            .reshape(
-                args_np[1].shape[0],
-                args_np[1].shape[1],
-                args_np[1].shape[2],
-                args_np[1].shape[3] * args_np[1].shape[4],
-            )
-            .transpose((3, 2, 0, 1))
-        )
-        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
-        # nkhw -> nkhwk
-        np_result = np_result.reshape(
-            np_result.shape[0],
-            np_result.shape[1] // vec_length,
-            vec_length,
-            np_result.shape[2],
-            np_result.shape[3],
-        ).transpose(0, 1, 3, 4, 2)
-    elif "conv2d_1x1_WCHNc_CRSKk" in workload.__name__:
-        vec_length = args_np[1].shape[-1]
-        # wchnc -> nchw
-        args_np[0] = (
-            args_np[0]
-            .transpose((3, 1, 4, 2, 0))
-            .reshape(
-                args_np[0].shape[3],
-                args_np[0].shape[1] * args_np[0].shape[-1],
-                args_np[0].shape[2],
-                args_np[0].shape[0],
-            )
-        )
-        # crskk -> crsk -> kcrs
-        args_np[1] = (
-            args_np[1]
-            .reshape(
-                args_np[1].shape[0],
-                args_np[1].shape[1],
-                args_np[1].shape[2],
-                args_np[1].shape[3] * args_np[1].shape[4],
-            )
-            .transpose((3, 0, 1, 2))
-        )
-        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
-        # nkhw -> nkkhw -> wkhnk
-        np_result = np_result.reshape(
-            np_result.shape[0],
-            np_result.shape[1] // vec_length,
-            vec_length,
-            np_result.shape[2],
-            np_result.shape[3],
-        ).transpose(4, 1, 3, 0, 2)
-    elif "NCHW_KCRS" in workload.__name__:
-        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
-    elif "NCHWc_KCRSk" in workload.__name__:
-        vec_length = args_np[1].shape[-1]
-        # nchwc -> nchw
-        args_np[0] = (
-            args_np[0]
-            .transpose((0, 1, 4, 2, 3))
-            .reshape(
-                args_np[0].shape[0],
-                args_np[0].shape[1] * args_np[0].shape[-1],
-                args_np[0].shape[2],
-                args_np[0].shape[3],
-            )
-        )
-        # kcrsk/cmrsc -> kcrs/cmrs
-        args_np[1] = (
-            args_np[1]
-            .transpose((0, 4, 1, 2, 3))
-            .reshape(
-                args_np[1].shape[0] * args_np[1].shape[4],
-                args_np[1].shape[1],
-                args_np[1].shape[2],
-                args_np[1].shape[3],
-            )
-        )
-        if "depthwise" in workload.__name__:
-            # np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID")
-            np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], [])
-        else:
-            # np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
-            np_result = ref_convolution(args_np[0], args_np[1], [], [])
-        # nkhw -> nkhwk
-        np_result = np_result.reshape(
-            np_result.shape[0],
-            np_result.shape[1] // vec_length,
-            vec_length,
-            np_result.shape[2],
-            np_result.shape[3],
-        ).transpose(0, 1, 3, 4, 2)
-    np.testing.assert_allclose(args_tvm[-1].asnumpy(), np_result, rtol=1e-2, atol=1e-2)
-
-
-class BaseSingleShapeValidator:
-    @tvm.testing.parametrize_targets("opencl")
-    def test_unary(self, test_func, input_shape, target, dev):
-        validate(test_func, target, dev, [input_shape])
-
-
-class TestPlusOneRank3(BaseSingleShapeValidator):
-    input_shape = tvm.testing.parameter((32, 32, 4))
-
-    def plus_one(input_shape):
-        return scheduler(compute_plus_one_rank3, schedule_plus_one_rank3, input_shape)
-
-    test_func = tvm.testing.parameter(plus_one)
-
-
-class TestPlusOneRank5(BaseSingleShapeValidator):
-    input_shape = tvm.testing.parameter((32, 2, 4, 4, 4))
-
-    def plus_one(input_shape):
-        return scheduler(compute_plus_one_rank5, schedule_plus_one_rank5, input_shape)
-
-    test_func = tvm.testing.parameter(plus_one)
-
-
-class TestMatmul:
-    input_shape = tvm.testing.parameter((32, 64, 4))
-    local = tvm.testing.parameter(False, True)
-
-    def matmul(input_shape, local):
-        return scheduler(compute_matmul, schedule_matmul, input_shape, local=local)
-
-    def matmul_inner(input_shape, local):
-        return scheduler(compute_matmul_inner, schedule_matmul_inner, input_shape, local=local)
-
-    test_func = tvm.testing.parameter(matmul, matmul_inner)
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_matmul(self, test_func, input_shape, local, target, dev):
-        validate(test_func, target, dev, [input_shape], local=local)
-
-
-class TestMatmulVectorAccumulator:
-    shapeA = tvm.testing.parameter((32, 64, 4))
-    shapeB = tvm.testing.parameter((128, 16, 4))
-    local = tvm.testing.parameter(False, True)
-
-    def matmul_vector_accumulator(shapeA, shapeB, local):
-        return scheduler(
-            compute_matmul_vector_accumulator,
-            schedule_matmul_vector_accumulator,
-            shapeA,
-            shapeB,
-            local=local,
-        )
-
-    test_func = tvm.testing.parameter(matmul_vector_accumulator)
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_matmul_vec_acc(self, test_func, shapeA, shapeB, local, target, dev):
-        validate(test_func, target, dev, [shapeA, shapeB], local=local)
-
-
-class BaseConv2DValidator:
-    @tvm.testing.parametrize_targets("opencl")
-    def test_conv2d(self, test_func, input_shapes, target, dev):
-        validate(test_func, target, dev, input_shapes)
-
-
-class TestConv2dNCHWcRSCKk(BaseConv2DValidator):
-    input_shapes = tvm.testing.parameter([(1, 32, 56, 56, 4), (1, 1, 128, 32, 4)])
-    test_func = tvm.testing.parameter(conv2d_1x1_NCHWc_RSCKk)
-
-
-class TestConv2dWCHNcCRSKk(BaseConv2DValidator):
-    input_shapes = tvm.testing.parameter([(56, 32, 56, 1, 4), (128, 1, 1, 32, 4)])
-    test_func = tvm.testing.parameter(conv2d_1x1_WCHNc_CRSKk)
-
-
-@pytest.mark.skip("AttributeError: module 'numpy' has no attribute 'bool' raised from mxnet")
-class TestConv2dNCHWcKCRSk(BaseConv2DValidator):
-    input_shapes = tvm.testing.parameter(
-        [(1, 32, 56, 56, 4), (32, 128, 1, 1, 4)], [(1, 32, 112, 112, 4), (32, 128, 3, 3, 4)]
-    )
-    test_func = tvm.testing.parameter(conv2d_NCHWc_KCRSk, conv2d_NCHWc_KCRSk_fp32_acc)
-
-
-@pytest.mark.skip("AttributeError: module 'numpy' has no attribute 'bool' raised from mxnet")
-class TestDepthwiseConv2dNCHWcKCRSk(BaseConv2DValidator):
-    input_shapes = tvm.testing.parameter([(1, 24, 257, 257, 4), (24, 1, 3, 3, 4)])
-    test_func = tvm.testing.parameter(depthwise_conv2d_NCHWc_KCRSk_acc32)
-
-
-def simple_texture_to_scalar_common(
-    target, input_info, output_info, find_patterns, dtype, cast_type
-):
-    def _compute():
-        p0 = te.placeholder(input_info[1], name="p0", dtype=dtype)
-        p0_comp = te.compute(input_info[1], lambda *i: p0(*i), name="p0_comp")
-        if len(output_info[1]) == 4 and len(input_info[1]) == 5:
-            out = te.compute(
-                output_info[1],
-                lambda n, c, h, w: p0_comp[n][c // 4][h][w][c % 4].astype(cast_type),
-                name="out",
-            )
-        elif len(output_info[1]) == 5 and len(input_info[1]) == 5:
-            out = te.compute(
-                output_info[1],
-                lambda n, c, h, w, cb: p0_comp[n][c][h][w][cb].astype(cast_type),
-                name="out",
-            )
-        else:
-            raise Exception("Impossible case")
-        dummy_out = te.compute(output_info[1], lambda *i: out(*i), name="dummy_out")
-        return p0, dummy_out
-
-    def _schedule(dummy_out):
-        from tvm.topi.adreno.utils import bind_data_copy
-
-        s = te.create_schedule(dummy_out.op)
-        out = s[dummy_out].op.input_tensors[0]
-        p0_comp = s[out].op.input_tensors[0]
-        s[p0_comp].set_scope(input_info[0])
-        bind_data_copy(s[p0_comp])
-        s[out].set_scope(output_info[0])
-        bind_data_copy(s[out])
-        bind_data_copy(s[dummy_out])
-        return s
-
-    p0, dummy_out = _compute()
-    s = _schedule(dummy_out)
-
-    fun = tvm.build(s, [p0, dummy_out], target)
-    dev = tvm.device(target, 0)
-    opencl_source = fun.imported_modules[0].get_source()
-    start_idx = 0
-    for pattern in find_patterns:
-        start_idx = opencl_source.find(pattern, start_idx)
-        assert start_idx > -1
-
-    input_np = np.random.uniform(size=[i for i in input_info[1]]).astype(dtype)
-    input_tvm = tvm.nd.array(input_np, dev)
-    c = tvm.nd.empty(output_info[1], dtype, dev)
-    # Doesn't run OpenCL code for FP16 because GPUs in CI don't support FP16 inference
-    if cast_type == "float32":
-        fun(input_tvm, c)
-    # For output len == 5 it makes no sense to check the accuracy
-    if cast_type == "float32" and len(output_info[1]) == 4:
-        np_result = input_np.transpose(0, 2, 3, 1, 4)  # NCHW4c -> NHWC4c
-        np_result = np.squeeze(np_result, axis=3)
-        np_result = np_result.transpose(0, 3, 1, 2)  # NHWC -> NCHW
-        np.testing.assert_allclose(c.asnumpy(), np_result, rtol=1e-2, atol=1e-2)
-
-
-class TestSimpleTextureToScalarFP16:
-    # (input [scope, shape], output [scope, shape], [find_patterns])
-    input_info, output_info, find_patterns = tvm.testing.parameters(
-        # 1. Texture (NCHW4c) -> Cast(FP16) -> Buffer (NCHW)
-        (
-            ["global.texture", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));",
-                "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));",
-            ],
-        ),
-        # 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW)
-        (
-            ["", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]));"
-            ],
-        ),
-        # 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c)
-        (
-            ["global.texture", (1, 1, 40, 40, 4)],
-            ["global.texture", (1, 1, 40, 40, 4)],
-            [
-                "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5))));",
-                "write_imageh(out, (int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), (convert_half4(v_)));",
-            ],
-        ),
-    )
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_simple_texture_to_scalar_fp16(
-        self, input_info, output_info, find_patterns, dtype, target
-    ):
-        simple_texture_to_scalar_common(
-            target, input_info, output_info, find_patterns, dtype, "float16"
-        )
-
-
-class TestSimpleTextureToScalarFP32:
-    # (input [scope, shape], output [scope, shape], [find_patterns])
-    input_info, output_info, find_patterns = tvm.testing.parameters(
-        # 1. Texture (NCHW4c) -> Buffer (NCHW)
-        (
-            ["global.texture", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));",
-                "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((float*)&v_)[((convert_int(get_group_id(0))) >> 1)];",
-            ],
-        ),
-        # 2. Buffer (NCHW4c) -> Buffer (NCHW)
-        (
-            ["", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))];"
-            ],
-        ),
-    )
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_simple_texture_to_scalar_fp32(
-        self, input_info, output_info, find_patterns, dtype, target
-    ):
-        simple_texture_to_scalar_common(
-            target, input_info, output_info, find_patterns, dtype, "float32"
-        )
-
-
-def texture_to_scalar_reuse_ssa_common(
-    target, input_info, output_info, find_patterns, dtype, cast_type
-):
-    def _compute():
-        p0 = te.placeholder(input_info[1], name="p0", dtype=dtype)
-        p0_comp = te.compute(input_info[1], lambda *i: p0(*i), name="p0_comp")
-        if len(output_info[1]) == 4 and len(input_info[1]) == 5:
-            out = te.compute(
-                output_info[1],
-                lambda n, c, h, w: p0_comp[n][c // 4][h][w][c % 4].astype(cast_type),
-                name="out",
-            )
-            out2 = te.compute(
-                output_info[1],
-                lambda n, c, h, w: out[n][c][h][w]
-                + p0_comp[n][c // 4][h][w][c % 4].astype(cast_type),
-                name="out",
-            )
-        elif len(output_info[1]) == 5 and len(input_info[1]) == 5:
-            out = te.compute(
-                output_info[1],
-                lambda n, c, h, w, cb: p0_comp[n][c][h][w][cb].astype(cast_type),
-                name="out",
-            )
-            out2 = te.compute(
-                output_info[1],
-                lambda n, c, h, w, cb: out[n][c][h][w][cb]
-                + p0_comp[n][c][h][w][cb].astype(cast_type),
-                name="out",
-            )
-        else:
-            raise Exception("Impossible case")
-        out_sum = te.compute(output_info[1], lambda *i: out(*i) + out2(*i), name="out_sum")
-        dummy_out = te.compute(output_info[1], lambda *i: out_sum(*i), name="dummy_out")
-        return p0, dummy_out
-
-    def _schedule(dummy_out):
-        from tvm.topi.adreno.utils import bind_data_copy
-
-        s = te.create_schedule(dummy_out.op)
-        out_sum = s[dummy_out].op.input_tensors[0]
-        out, out2 = s[out_sum].op.input_tensors
-        p0_comp = s[out].op.input_tensors[0]
-        s[p0_comp].set_scope(input_info[0])
-        bind_data_copy(s[p0_comp])
-        s[out].set_scope(output_info[0])
-        s[out2].set_scope(output_info[0])
-        s[out2].compute_inline()
-        s[out].compute_inline()
-        s[out_sum].set_scope(output_info[0])
-        bind_data_copy(s[out_sum])
-        bind_data_copy(s[dummy_out])
-        return s
-
-    p0, dummy_out = _compute()
-    s = _schedule(dummy_out)
-
-    fun = tvm.build(s, [p0, dummy_out], target)
-    dev = tvm.device(target, 0)
-    opencl_source = fun.imported_modules[0].get_source()
-    start_idx = 0
-    for pattern in find_patterns:
-        start_idx = opencl_source.find(pattern, start_idx)
-        assert start_idx > -1
-
-    input_np = np.random.uniform(size=[i for i in input_info[1]]).astype(dtype)
-    input_tvm = tvm.nd.array(input_np, dev)
-    c = tvm.nd.empty(output_info[1], dtype, dev)
-    # Doesn't run OpenCL code for FP16 because GPUs in CI don't support FP16 inference
-    if cast_type == "float32":
-        fun(input_tvm, c)
-    # For output len == 5 it makes no sense to check the accuracy
-    if cast_type == "float32" and len(output_info[1]) == 4:
-        np_result = input_np * 3
-        np_result = np_result.transpose(0, 2, 3, 1, 4)  # NCHW4c -> NHWC4c
-        np_result = np.squeeze(np_result, axis=3)
-        np_result = np_result.transpose(0, 3, 1, 2)  # NHWC -> NCHW
-        np.testing.assert_allclose(c.asnumpy(), np_result, rtol=1e-2, atol=1e-2)
-
-
-class TestTextureToScalarReuseSSAFP16:
-    # (input [scope, shape], output [scope, shape], [find_patterns])
-    input_info, output_info, find_patterns = tvm.testing.parameters(
-        # 1. Texture (NCHW4c) -> Cast(FP16) -> Buffer (NCHW)
-        (
-            ["global.texture", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));",
-                "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) + ((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) + (convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]))));",
-            ],
-        ),
-        # 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW)
-        (
-            ["", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                " out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))])) + ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))])) + (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]))));"
-            ],
-        ),
-        # 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c)
-        (
-            ["global.texture", (1, 1, 40, 40, 4)],
-            ["global.texture", (1, 1, 40, 40, 4)],
-            [
-                "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5))));",
-                "write_imageh(out_sum, (int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), ((convert_half4(v_)) + ((convert_half4(v_)) + (convert_half4(v_)))));",
-            ],
-        ),
-    )
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_texture_to_scalar_reuse_ssa_fp16(
-        self, input_info, output_info, find_patterns, dtype, target
-    ):
-        texture_to_scalar_reuse_ssa_common(
-            target, input_info, output_info, find_patterns, dtype, "float16"
-        )
-
-
-class TestTextureToScalarReuseSSAFP32:
-    # (input [scope, shape], output [scope, shape], [find_patterns])
-    input_info, output_info, find_patterns = tvm.testing.parameters(
-        # 1. Texture (NCHW4c) -> Buffer (NCHW)
-        (
-            ["global.texture", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));",
-                "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] + (((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] + ((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));",
-            ],
-        ),
-        # 2. Buffer (NCHW4c) -> Buffer (NCHW)
-        (
-            ["", (1, 1, 40, 40, 4)],
-            ["", (1, 4, 40, 40)],
-            [
-                "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))] + (p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))] + p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]));"
-            ],
-        ),
-    )
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_texture_to_scalar_reuse_ssa_fp32(
-        self, input_info, output_info, find_patterns, dtype, target
-    ):
-        texture_to_scalar_reuse_ssa_common(
-            target, input_info, output_info, find_patterns, dtype, "float32"
-        )
-
-
-class TestLocalArrayToTexture:
-    # 1. conv2d(Texture(NCHW4c), Texture(OIHW4o)) -> local_array[4] -> Texture (NCHW4c)
-    input_shape1, input_shape2, output_shape, find_patterns = tvm.testing.parameters(
-        (
-            (1, 1, 40, 40, 4),
-            (2, 4, 3, 3, 4),
-            (1, 2, 38, 38, 4),
-            [
-                "float out_local[4];",
-                "float4 v_ = READ_IMAGEF(p1_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 14) + (convert_int(get_local_id(0)))) % 38), (((((convert_int(get_group_id(0))) * 64) + ((convert_int(get_local_id(0))) >> 1)) % 722) / 19))));",
-                "float4 v__1 = READ_IMAGEF(p2_comp, image_sampler, ((int2)(rw, (((((((convert_int(get_group_id(0))) * 32) + ((convert_int(get_local_id(0))) >> 2)) / 361) * 12) + (rcb * 3)) + rh))));",
-                "out_local[cb_c] = (out_local[cb_c] + (((float*)&v_)[rcb] * ((float*)&v__1)[cb_c]));",
-                "write_imagef(out, (int2)(((((convert_int(get_group_id(0))) * 14) + (convert_int(get_local_id(0)))) % 38), ((((convert_int(get_group_id(0))) * 64) + ((convert_int(get_local_id(0))) >> 1)) / 19)), vload4(0, out_local + 0));",
-            ],
-        ),
-    )
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.parametrize_targets("opencl")
-    def test_local_array_to_texture(
-        self, input_shape1, input_shape2, output_shape, find_patterns, dtype, target
-    ):
-        def _compute():
-            p1 = te.placeholder(input_shape1, name="p1", dtype=dtype)
-            p1_comp = te.compute(input_shape1, lambda *i: p1(*i), name="p1_comp")
-            p2 = te.placeholder(input_shape2, name="p2", dtype=dtype)
-            p2_comp = te.compute(input_shape2, lambda *i: p2(*i), name="p2_comp")
-            KH, KW = input_shape2[2], input_shape2[3]
-            IC, ICB = input_shape1[1], input_shape1[4]
-            rh = te.reduce_axis((0, KH), name="rh")
-            rw = te.reduce_axis((0, KW), name="rw")
-            rc = te.reduce_axis((0, IC), name="rc")
-            rcb = te.reduce_axis((0, ICB), name="rcb")
-            out = te.compute(
-                output_shape,
-                lambda n, c, h, w, cb: te.sum(
-                    (p1_comp[n, rc, h, w, rcb] * p2_comp[c, rc * ICB + rcb, rh, rw, cb]).astype(
-                        dtype
-                    ),
-                    axis=[rh, rw, rc, rcb],
-                ),
-                name="out",
-            )
-            dummy_out = te.compute(output_shape, lambda *i: out(*i), name="dummy_out")
-            return p1, p2, dummy_out
-
-        def _schedule(dummy_out):
-            from tvm.topi.adreno.utils import bind_data_copy
-
-            s = te.create_schedule(dummy_out.op)
-            out = s[dummy_out].op.input_tensors[0]
-            p1_comp, p2_comp = s[out].op.input_tensors
-            bind_data_copy(s[p1_comp])
-            s[p1_comp].set_scope("global.texture")
-            bind_data_copy(s[p2_comp])
-            s[p2_comp].set_scope("global.texture")
-            OL = s.cache_write(out, "local")
-            n, c, h, w, cb = s[out].op.axis
-            fused = s[out].fuse(n, c, h, w)
-            bx, tx = s[out].split(fused, 128)
-            s[out].reorder(bx, tx, cb)
-            s[out].vectorize(cb)
-            s[out].set_scope("global.texture")
-            s[out].bind(bx, te.thread_axis("blockIdx.x"))
-            s[out].bind(tx, te.thread_axis("threadIdx.x"))
-            s[OL].compute_at(s[out], tx)
-            bind_data_copy(s[dummy_out])
-            return s
-
-        p1, p2, dummy_out = _compute()
-        s = _schedule(dummy_out)
-
-        fun = tvm.build(s, [p1, p2, dummy_out], target)
-        dev = tvm.device(target, 0)
-        opencl_source = fun.imported_modules[0].get_source()
-        start_idx = 0
-        for pattern in find_patterns:
-            start_idx = opencl_source.find(pattern, start_idx)
-            assert start_idx > -1
-
-        input_np1 = np.random.uniform(size=[i for i in input_shape1]).astype(dtype)
-        input_np2 = np.random.uniform(size=[i for i in input_shape2]).astype(dtype)
-        input_tvm1 = tvm.nd.array(input_np1, dev)
-        input_tvm2 = tvm.nd.array(input_np2, dev)
-        c = tvm.nd.empty(output_shape, dtype, dev)
-        fun(input_tvm1, input_tvm2, c)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_amx.py b/tests/python/contrib/test_amx.py
deleted file mode 100644
index cd4f62cd62f6..000000000000
--- a/tests/python/contrib/test_amx.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
-
-import tvm
-from tvm import relay
-
-from tvm import te
-import tvm.testing
-from tvm.topi.x86.tensor_intrin import dot_32x128x32_u8s8s32_sapphirerapids
-from tvm.topi.x86.tensor_intrin import acc_32x32_int32_sapphirerapids
-import numpy as np
-import pytest
-
-
-has_amx_runtime = pytest.mark.skipif(
-    not tvm.get_global_func("runtime.amx_init", True), reason="AMX runtime not available"
-)
-
-
-@has_amx_runtime
-@tvm.testing.requires_x86_amx
-def test_amx_u8s8s32_matmul_tensorize():
-    m = 1024
-    k = 1024
-    n = 1024
-
-    # --------------------------Config---------------------------
-    # Skip this test if "-mcpu=sapphirerapids" not supported by LLVM < 12.0
-    target = "llvm -mcpu=sapphirerapids"
-    dev = tvm.device(target, 0)
-    if not tvm.testing.device_enabled(target):
-        print("skip because %s is not enabled..." % target)
-        return
-
-    amx_init = tvm.get_global_func("runtime.amx_init")
-    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
-    assert amx_init()
-    assert amx_tileconfig(16, 64)  # config tile size to 16 rows by 64 columns.
-    # --------------------------Compute--------------------------
-    X = te.placeholder((m, k), name="X", dtype="uint8")
-    ak = te.reduce_axis((0, k), name="k")
-    packedW = te.placeholder((n // 16, k // 4, 16, 4), name="packedW", dtype="int8")
-
-    C = te.compute(
-        (m, n),
-        lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packedW[tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4), j % 16, ak % 4].astype(
-                "int32"
-            ),
-            axis=ak,
-        ),
-        name="F",
-    )
-
-    # --------------------------Schedule--------------------------
-    s = te.create_schedule(C.op)
-    a_x, a_y = C.op.axis
-    (a_k,) = C.op.reduce_axis
-
-    CF = s.cache_write(C, "amx.tmm")
-    a_xo, a_xi = s[C].split(a_x, factor=32)
-    a_yo, a_yi = s[C].split(a_y, factor=32)
-    s[C].reorder(a_xo, a_yo, a_xi, a_yi)
-
-    s[CF].compute_at(s[C], a_yo)
-    (a_k_f,) = CF.op.reduce_axis
-    a_x_f, a_y_f = CF.op.axis
-
-    a_xo_f, a_xi_f = s[CF].split(a_x_f, factor=32)
-    a_yo_f, a_yi_f = s[CF].split(a_y_f, factor=32)
-    a_ko_f, a_ki_f = s[CF].split(a_k_f, factor=128)
-    s[CF].reorder(a_ko_f, a_xo_f, a_yo_f, a_ki_f, a_xi_f, a_yi_f)
-
-    s[CF].tensorize(a_ki_f, dot_32x128x32_u8s8s32_sapphirerapids(LDA=k))
-    s[C].tensorize(a_xi, acc_32x32_int32_sapphirerapids(LDC=n))
-
-    lib = tvm.build(s, [X, packedW, C], target, name="intrinsic")
-    asm = lib.get_source("asm")
-    assert "tilezero" in asm
-    assert "tileloaddt1" in asm
-    assert "tdpbusd" in asm
-    assert "tilestored" in asm
-
-    # ----------------------- verify correctness --------------------------------
-    # generate the plain data
-    a = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
-    b = np.random.uniform(1, 10, size=(n, k)).astype("int8")
-    packW = np.random.uniform(1, 10, size=(n // 16, k // 4, 16, 4)).astype("int8")
-
-    # This should occurs in pre_pack (constant folding) stage,
-    # from plain data to blocked data(NC16n4c)
-    for i_n in range(n):
-        for i_k in range(k):
-            packW[i_n // 16][i_k // 4][i_n % 16][i_k % 4] = b[i_n][i_k]
-
-    x = tvm.nd.array(a, dev)
-    w = tvm.nd.array(packW, dev)
-    y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev)
-    t_evaluator = lib.time_evaluator(lib.entry_name, dev, number=100)
-    result = t_evaluator(x, w, y)
-    print(result)
-    tvm.testing.assert_allclose(y.numpy(), np.dot(a.astype("int32"), b.T.astype("int32")), rtol=0)
-
-
-@has_amx_runtime
-@tvm.testing.requires_x86_amx
-def test_amx_check_support():
-    amx_init = tvm.get_global_func("runtime.amx_init")
-    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
-    assert amx_init()
-    assert amx_tileconfig(16, 64)
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/tests/python/contrib/test_arm_compute_lib/__init__.py b/tests/python/contrib/test_arm_compute_lib/__init__.py
deleted file mode 100644
index fd14be1cc34d..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Infrastructure and tests for Arm Compute Library"""
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
deleted file mode 100644
index ae3acbb09607..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ /dev/null
@@ -1,341 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from itertools import zip_longest, combinations
-import json
-import os
-import warnings
-
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm import rpc
-from tvm.contrib import graph_executor
-from tvm.relay.op.contrib import arm_compute_lib
-from tvm.contrib import utils
-from tvm.autotvm.measure import request_remote
-
-
-QNN_DTYPES = ("uint8", "int8")
-
-
-class Device:
-    """
-    Configuration for Arm Compute Library tests.
-
-    Check tests/python/contrib/arm_compute_lib/ for the presence of an test_config.json file.
-    This file can be used to override the default configuration here which will attempt to run the Arm
-    Compute Library runtime tests locally if the runtime is available. Changing the configuration
-    will allow these runtime tests to be offloaded to a remote Arm device via a tracker for example.
-
-    Notes
-    -----
-        The test configuration will be loaded once when the class is created. If the configuration
-        changes between tests, any changes will not be picked up.
-
-    Parameters
-    ----------
-    device : RPCSession
-        Allows tests to connect to and use remote device.
-
-    Attributes
-    ----------
-    connection_type : str
-        Details the type of RPC connection to use. Options:
-        local - Use the local device,
-        tracker - Connect to a tracker to request a remote device,
-        remote - Connect to a remote device directly.
-    host : str
-        Specify IP address or hostname of remote target.
-    port : int
-        Specify port number of remote target.
-    target : str
-        The compilation target.
-    device_key : str
-        The device key of the remote target. Use when connecting to a remote device via a tracker.
-    cross_compile : str
-        Specify path to cross compiler to use when connecting a remote device from a non-arm platform.
-    """
-
-    connection_type = "local"
-    host = "127.0.0.1"
-    port = 9090
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon"
-    device_key = ""
-    cross_compile = ""
-
-    def __init__(self):
-        """Keep remote device for lifetime of object."""
-        self.device = self._get_remote()
-
-    @classmethod
-    def _get_remote(cls):
-        """Get a remote (or local) device to use for testing."""
-        if cls.connection_type == "tracker":
-            device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000)
-        elif cls.connection_type == "remote":
-            device = rpc.connect(cls.host, cls.port)
-        elif cls.connection_type == "local":
-            device = rpc.LocalSession()
-        else:
-            raise ValueError(
-                "connection_type in test_config.json should be one of: " "local, tracker, remote."
-            )
-
-        return device
-
-    @classmethod
-    def load(cls, file_name):
-        """Load test config
-
-        Load the test configuration by looking for file_name relative
-        to the test_arm_compute_lib directory.
-        """
-        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-        config_file = os.path.join(location, file_name)
-        if not os.path.exists(config_file):
-            warnings.warn(
-                "Config file doesn't exist, resuming Arm Compute Library tests with default config."
-            )
-            return
-        with open(config_file, mode="r") as config:
-            test_config = json.load(config)
-
-        cls.connection_type = test_config["connection_type"]
-        cls.host = test_config["host"]
-        cls.port = test_config["port"]
-        cls.target = test_config["target"]
-        cls.device_key = test_config.get("device_key") or ""
-        cls.cross_compile = test_config.get("cross_compile") or ""
-
-
-def get_low_high_atol_rtol(dtype):
-    """Returns a tuple with boundary values and tolerance for ACL tests."""
-
-    if dtype == "float32":
-        low, high, atol, rtol = (-127, 128, 0.001, 0.001)
-    elif dtype == "uint8":
-        low, high, atol, rtol = (0, 255, 1, 0)
-    elif dtype == "int8":
-        low, high, atol, rtol = (-127, 128, 1, 0)
-    else:
-        raise Exception(f"dtype not expected: {dtype}")
-
-    return low, high, atol, rtol
-
-
-def get_cpu_op_count(mod):
-    """Traverse graph counting ops offloaded to TVM."""
-
-    class Counter(tvm.relay.ExprVisitor):
-        def __init__(self):
-            super().__init__()
-            self.count = 0
-
-        def visit_call(self, call):
-            if isinstance(call.op, tvm.ir.Op):
-                self.count += 1
-
-            super().visit_call(call)
-
-    c = Counter()
-    c.visit(mod["main"])
-    return c.count
-
-
-def skip_runtime_test():
-    """Skip test if it requires the runtime and it's not present."""
-    # ACL codegen not present.
-    if not tvm.get_global_func("relay.ext.arm_compute_lib", True):
-        print("Skip because Arm Compute Library codegen is not available.")
-        return True
-
-    # Remote device is in use or ACL runtime not present
-    # Note: Ensure that the device config has been loaded before this check
-    if (
-        not Device.connection_type != "local"
-        and not arm_compute_lib.is_arm_compute_runtime_enabled()
-    ):
-        print("Skip because runtime isn't present or a remote device isn't being used.")
-        return True
-
-
-def skip_codegen_test():
-    """Skip test if it requires the ACL codegen and it's not present."""
-    if not tvm.get_global_func("relay.ext.arm_compute_lib", True):
-        print("Skip because Arm Compute Library codegen is not available.")
-        return True
-
-
-def build_module(
-    mod,
-    target,
-    params=None,
-    enable_acl=True,
-    tvm_ops=0,
-    acl_partitions=1,
-    disabled_ops=["concatenate"],
-):
-    """Build module with option to build for ACL."""
-    if isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        if enable_acl:
-            mod = arm_compute_lib.partition_for_arm_compute_lib(
-                mod, params, disabled_ops=disabled_ops
-            )
-            tvm_op_count = get_cpu_op_count(mod)
-            assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
-                tvm_op_count, tvm_ops
-            )
-            partition_count = 0
-            for global_var in mod.get_global_vars():
-                if "arm_compute_lib" in global_var.name_hint:
-                    partition_count += 1
-
-            assert (
-                acl_partitions == partition_count
-            ), "Got {} Arm Compute Library partitions, expected {}".format(
-                partition_count, acl_partitions
-            )
-        relay.backend.te_compiler.get().clear()
-        return relay.build(mod, target=target, params=params)
-
-
-def build_and_run(
-    mod,
-    inputs,
-    outputs,
-    params,
-    device,
-    enable_acl=True,
-    no_runs=1,
-    tvm_ops=0,
-    acl_partitions=1,
-    config=None,
-    disabled_ops=["concatenate"],
-):
-    """Build and run the relay module."""
-    if config is None:
-        config = {}
-
-    try:
-        lib = build_module(
-            mod, device.target, params, enable_acl, tvm_ops, acl_partitions, disabled_ops
-        )
-    except Exception as e:
-        err_msg = "The module could not be built.\n"
-        if config:
-            err_msg += f"The test failed with the following parameters: {config}\n"
-        err_msg += str(e)
-        raise Exception(err_msg)
-
-    lib = update_lib(lib, device.device, device.cross_compile)
-    gen_module = graph_executor.GraphModule(lib["default"](device.device.cpu(0)))
-    gen_module.set_input(**inputs)
-    out = []
-    for _ in range(no_runs):
-        gen_module.run()
-        out.append([gen_module.get_output(i) for i in range(outputs)])
-    return out
-
-
-def update_lib(lib, device, cross_compile):
-    """Export the library to the remote/local device."""
-    lib_name = "mod.so"
-    temp = utils.tempdir()
-    lib_path = temp.relpath(lib_name)
-    if cross_compile:
-        lib.export_library(lib_path, cc=cross_compile)
-    else:
-        lib.export_library(lib_path)
-    device.upload(lib_path)
-    lib = device.load_module(lib_name)
-    return lib
-
-
-def verify(answers, atol, rtol, verify_saturation=False, config=None):
-    """Compare the array of answers. Each entry is a list of outputs."""
-    if config is None:
-        config = {}
-
-    if len(answers) < 2:
-        raise RuntimeError(f"No results to compare: expected at least two, found {len(answers)}")
-    for answer in zip_longest(*answers):
-        for outs in combinations(answer, 2):
-            try:
-                if verify_saturation:
-                    assert (
-                        np.count_nonzero(outs[0].numpy() == 255) < 0.25 * outs[0].numpy().size
-                    ), "Output is saturated: {}".format(outs[0])
-                    assert (
-                        np.count_nonzero(outs[0].numpy() == 0) < 0.25 * outs[0].numpy().size
-                    ), "Output is saturated: {}".format(outs[0])
-                tvm.testing.assert_allclose(outs[0].numpy(), outs[1].numpy(), rtol=rtol, atol=atol)
-            except AssertionError as e:
-                err_msg = "Results not within the acceptable tolerance.\n"
-                if config:
-                    err_msg += f"The test failed with the following parameters: {config}\n"
-                err_msg += str(e)
-                raise AssertionError(err_msg)
-
-
-def extract_acl_modules(module):
-    """Get the ACL module(s) from llvm module."""
-    return list(
-        filter(lambda mod: mod.type_key == "arm_compute_lib", module.get_lib().imported_modules)
-    )
-
-
-def verify_codegen(
-    module,
-    known_good_codegen,
-    num_acl_modules=1,
-    tvm_ops=0,
-    target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
-    disabled_ops=["concatenate"],
-):
-    """Check acl codegen against a known good output."""
-    module = build_module(
-        module,
-        target,
-        tvm_ops=tvm_ops,
-        acl_partitions=num_acl_modules,
-        disabled_ops=disabled_ops,
-    )
-    acl_modules = extract_acl_modules(module)
-
-    assert len(acl_modules) == num_acl_modules, (
-        f"The number of Arm Compute Library modules produced ({len(acl_modules)}) does not "
-        f"match the expected value ({num_acl_modules})."
-    )
-
-    for mod in acl_modules:
-        source = mod.get_source("json")
-        codegen = json.loads(source)["nodes"]
-        # remove input and const names as these cannot be predetermined
-        for node in range(len(codegen)):
-            if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
-                codegen[node]["name"] = ""
-        codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
-        known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2)
-
-        assert codegen_str == known_good_codegen_str, (
-            f"The JSON produced by codegen does not match the expected result. \n"
-            f"Actual={codegen_str} \n"
-            f"Expected={known_good_codegen_str}"
-        )
diff --git a/tests/python/contrib/test_arm_compute_lib/test_add.py b/tests/python/contrib/test_arm_compute_lib/test_add.py
deleted file mode 100644
index 319105bb5fd9..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_add.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration reshape tests."""
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import relay
-
-from test_arm_compute_lib.infrastructure import (
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-from test_arm_compute_lib.infrastructure import Device
-
-_qnn_params = {
-    "lhs_scale": relay.const(0.0156863, "float32"),
-    "lhs_zero_point": relay.const(127, "int32"),
-    "rhs_scale": relay.const(0.0117647, "float32"),
-    "rhs_zero_point": relay.const(85, "int32"),
-    "output_scale": relay.const(0.0235294, "float32"),
-    "output_zero_point": relay.const(128, "int32"),
-}
-
-
-def _get_model(shape, dtype, var_names, op, op_params):
-    a = relay.var(next(var_names), shape=shape, dtype=dtype)
-    b = relay.var(next(var_names), shape=shape, dtype=dtype)
-    return op(a, b, **op_params)
-
-
-def _get_expected_codegen(shape, dtype, op_name, qnn_params):
-    input_a = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
-    input_b = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
-    input_qnn = [
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {
-                "shape": [[list(qnn_params[_].data.shape)]],
-                "dtype": [[qnn_params[_].data.dtype]],
-            },
-        }
-        for _ in qnn_params
-    ]
-    inputs = [input_a, input_b, *input_qnn]
-    node = {
-        "op": "kernel",
-        "name": op_name,
-        "inputs": [[_, 0, 0] for _ in range(len(inputs))],
-        "attrs": {
-            "num_inputs": str(len(inputs)),
-            "num_outputs": "1",
-            "shape": [[list(shape)]],
-            "dtype": [[dtype]],
-        },
-    }
-
-    if qnn_params:
-        node["attrs"]["lhs_axis"] = [["-1"]]
-        node["attrs"]["rhs_axis"] = [["-1"]]
-
-    return [*inputs, node]
-
-
-def test_runtime_add():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    for dtype, low, high, atol, rtol, op, op_params in [
-        ("float32", -127, 128, 1e-7, 1e-7, relay.add, {}),
-        ("uint8", 0, 255, 1.0, 0.0, relay.qnn.op.add, _qnn_params),
-        ("int8", -127, 128, 1.0, 0.0, relay.qnn.op.add, _qnn_params),
-    ]:
-        shape = (2, 2)
-        for inputs in [
-            {
-                "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-                "b": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-            }
-        ]:
-            outputs = []
-            func = _get_model(shape, dtype, iter(inputs), op, op_params)
-            for acl in [True, False]:
-                outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0])
-
-            config = {
-                "shape": shape,
-                "dtype": dtype,
-                "inputs": inputs,
-                "operation": op,
-                "op_params": op_params,
-            }
-
-            verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=False)
-
-
-def test_codegen_add():
-    if skip_codegen_test():
-        return
-
-    inputs = {"a", "b"}
-    for dtype, op_name, op, qnn_params in [
-        ("float32", "add", relay.add, {}),
-        ("uint8", "qnn.add", relay.qnn.op.add, _qnn_params),
-        ("int8", "qnn.add", relay.qnn.op.add, _qnn_params),
-    ]:
-        for shape in [(1, 1), (2, 2, 2), (3, 3, 3, 3)]:
-            func = _get_model(shape, dtype, iter(inputs), op, qnn_params)
-            exp_codegen = _get_expected_codegen(shape, dtype, op_name, qnn_params)
-            verify_codegen(func, exp_codegen, 1)
-
-
-@pytest.mark.parametrize(
-    "param, param_type",
-    [
-        ("lhs_scale", "float32"),
-        ("lhs_zero_point", "int32"),
-        ("rhs_scale", "float32"),
-        ("rhs_zero_point", "int32"),
-    ],
-)
-def test_codegen_add_per_channel_quantization(param, param_type):
-    if skip_codegen_test():
-        return
-
-    qnn_params = _qnn_params
-    qnn_params[param] = relay.const([1, 2], param_type)
-
-    dtype = "int8"
-    op_name = "qnn.add"
-    op = relay.qnn.op.add
-    inputs = {"a", "b"}
-
-    for shape in [(1, 3, 3, 2)]:
-        func = _get_model(shape, dtype, iter(inputs), op, qnn_params)
-        exp_codegen = _get_expected_codegen(shape, dtype, op_name, qnn_params)
-        verify_codegen(func, exp_codegen, num_acl_modules=0, tvm_ops=1)
-
-
-if __name__ == "__main__":
-    test_runtime_add()
-    test_codegen_add()
-    test_codegen_add_per_channel_quantization()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_concatenate.py b/tests/python/contrib/test_arm_compute_lib/test_concatenate.py
deleted file mode 100644
index 55072f37c2bf..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_concatenate.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration concatenate tests."""
-
-import numpy as np
-import pytest
-
-import tvm
-from tvm import relay
-from tvm import testing
-
-from test_arm_compute_lib.infrastructure import (
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-from test_arm_compute_lib.infrastructure import Device
-
-
-def _get_model(input_shape_a, input_shape_b, input_shape_c, axis, dtype, var_names):
-    """Return a model and any parameters it may have."""
-    a = relay.var(next(var_names), shape=input_shape_a, dtype=dtype)
-    b = relay.var(next(var_names), shape=input_shape_b, dtype=dtype)
-    c = relay.var(next(var_names), shape=input_shape_c, dtype=dtype)
-    out = relay.concatenate([a, b, c], axis)
-    return out
-
-
-def _get_expected_codegen(input_shape_a, input_shape_b, input_shape_c, axis, dtype):
-    node = {
-        "op": "kernel",
-        "name": "concatenate",
-        "inputs": [
-            [0, 0, 0],
-            [1, 0, 0],
-            [2, 0, 0],
-        ],
-        "attrs": {
-            "num_outputs": "1",
-            "num_inputs": "3",
-            "dtype": [[dtype]],
-            "axis": [[str(axis)]],
-            "shape": [[[6, 234, 234, 256]]],
-        },
-    }
-
-    input_a = {
-        "op": "input",
-        "name": "",
-        "attrs": {
-            "shape": [[input_shape_a]],
-            "dtype": [[dtype]],
-        },
-    }
-
-    input_b = {
-        "op": "input",
-        "name": "",
-        "attrs": {
-            "shape": [[input_shape_b]],
-            "dtype": [[dtype]],
-        },
-    }
-
-    input_c = {
-        "op": "input",
-        "name": "",
-        "attrs": {
-            "shape": [[input_shape_c]],
-            "dtype": [[dtype]],
-        },
-    }
-    return [input_a, input_b, input_c, node]
-
-
-@pytest.mark.parametrize(
-    "input_shape_a, input_shape_b, input_shape_c, axis, dtype",
-    [
-        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], 0, "float32"),
-        ([1, 1, 234, 256], [1, 2, 234, 256], [1, 3, 234, 256], 1, "float32"),
-        ([1, 234, 234, 1], [1, 234, 234, 2], [1, 234, 234, 3], -1, "float32"),
-        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], -4, "float32"),
-        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], 0, "uint8"),
-        ([1, 1, 234, 256], [1, 2, 234, 256], [1, 3, 234, 256], 1, "uint8"),
-        ([1, 234, 234, 1], [1, 234, 234, 2], [1, 234, 234, 3], -1, "uint8"),
-        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], -4, "uint8"),
-        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], 0, "int8"),
-        ([1, 1, 234, 256], [1, 2, 234, 256], [1, 3, 234, 256], 1, "int8"),
-        ([1, 234, 234, 1], [1, 234, 234, 2], [1, 234, 234, 3], -1, "int8"),
-        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], -4, "int8"),
-    ],
-)
-def test_concatenate(input_shape_a, input_shape_b, input_shape_c, axis, dtype):
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    outputs = []
-    inputs = {
-        "a": tvm.nd.array(np.random.randn(*input_shape_a).astype(dtype)),
-        "b": tvm.nd.array(np.random.randn(*input_shape_b).astype(dtype)),
-        "c": tvm.nd.array(np.random.randn(*input_shape_c).astype(dtype)),
-    }
-    func = _get_model(
-        inputs["a"].shape, inputs["b"].shape, inputs["c"].shape, axis, dtype, iter(inputs)
-    )
-    for acl in [False, True]:
-        outputs.append(
-            build_and_run(func, inputs, 1, None, device, enable_acl=acl, disabled_ops=[])[0]
-        )
-
-    config = {
-        "input_shape_a": input_shape_a,
-        "input_shape_b": input_shape_b,
-        "input_shape_c": input_shape_c,
-        "axis": axis,
-        "dtype": dtype,
-    }
-    verify(outputs, atol=1e-7, rtol=1e-7, config=config)
-
-
-def test_codegen_concatenate():
-    if skip_codegen_test():
-        return
-    shape_a = [1, 234, 234, 256]
-    shape_b = [2, 234, 234, 256]
-    shape_c = [3, 234, 234, 256]
-    axis = 0
-    inputs = {"a", "b", "c"}
-    for dtype in ["float32"]:
-        args = (shape_a, shape_b, shape_c, axis, dtype)
-        func = _get_model(*args, iter(inputs))
-        exp_codegen = _get_expected_codegen(*args)
-        verify_codegen(func, exp_codegen, 1, disabled_ops=[])
-
-
-if __name__ == "__main__":
-    test_concatenate()
-    test_codegen_concatenate()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_config.json b/tests/python/contrib/test_arm_compute_lib/test_config.json
deleted file mode 100644
index 5c75e659af77..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_config.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "connection_type": "local",
-  "host": "127.0.0.1",
-  "port": 9090,
-  "target": "llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
-  "device_key": "",
-  "cross_compile": ""
-}
diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
deleted file mode 100644
index b4fa49ffa288..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
+++ /dev/null
@@ -1,672 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration conv2d tests."""
-
-import numpy as np
-import pytest
-
-import tvm
-from tvm import relay
-
-from test_arm_compute_lib.infrastructure import (
-    QNN_DTYPES,
-    get_low_high_atol_rtol,
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-from test_arm_compute_lib.infrastructure import Device
-
-
-def _get_model(
-    shape,
-    kernel_h,
-    kernel_w,
-    padding,
-    strides,
-    dilation,
-    groups,
-    dtype,
-    channels,
-    var_names,
-    has_bias=False,
-    has_activation=False,
-    has_pad=False,
-):
-    """Return a model and any parameters it may have"""
-    a = relay.var(next(var_names), shape=shape, dtype=dtype)
-    if has_pad:
-        p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0))
-        a = relay.nn.pad(a, pad_width=p)
-        padding = (0, 0, 0, 0)
-    else:
-        if len(padding) == 2:
-            padding = (padding[0], padding[1], padding[0], padding[1])
-        shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3])
-    is_depthwise = shape[3] == channels == groups
-    weight_format = "HWOI" if is_depthwise else "HWIO"
-    if weight_format == "HWIO":
-        weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
-    else:
-        weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
-    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.nn.conv2d(
-        a,
-        weights,
-        kernel_size=(kernel_h, kernel_w),
-        data_layout="NHWC",
-        kernel_layout=weight_format,
-        dilation=dilation,
-        strides=strides,
-        padding=padding,
-        groups=groups,
-        channels=channels,
-        out_dtype=dtype,
-    )
-    params = {"w": w}
-    if has_bias:
-        bias_shape = weight_shape[2] if is_depthwise else weight_shape[3]
-        b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype(dtype))
-        biasc = relay.const(b, dtype)
-        out = relay.nn.bias_add(out, biasc, axis=3)
-        params["b"] = b
-    if has_activation:
-        out = relay.nn.relu(out)
-    return out, params
-
-
-def _get_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, channels):
-    """Get output qnn parameters given input and kernel parameters."""
-    input_max = input_sc * (255 - input_zp)
-    input_min = -input_sc * input_zp
-    kernel_max = kernel_sc * (255 - kernel_zp)
-    kernel_min = -kernel_sc * kernel_zp
-    output_limits = [
-        kernel_max * kernel_h * kernel_w * channels * input_max,
-        kernel_min * kernel_h * kernel_w * channels * input_max,
-        kernel_min * kernel_h * kernel_w * channels * input_min,
-        kernel_max * kernel_h * kernel_w * channels * input_min,
-    ]
-    output_max = max(output_limits)
-    output_min = min(output_limits)
-    output_sc = (output_max - output_min) / 255
-    output_zp = -int(output_min / output_sc)
-    return output_zp, output_sc
-
-
-def _get_qnn_model(
-    shape,
-    kernel_h,
-    kernel_w,
-    padding,
-    strides,
-    dilation,
-    groups,
-    dtype,
-    channels,
-    input_zp,
-    input_sc,
-    kernel_zp,
-    kernel_sc,
-    output_zp,
-    output_sc,
-    var_names,
-    has_bias=False,
-    has_activation=False,
-    has_pad=False,
-):
-    """Return a model and any parameters it may have."""
-    low, high, _, _ = get_low_high_atol_rtol(dtype)
-
-    a = relay.var(next(var_names), shape=shape, dtype=dtype)
-    if has_pad:
-        p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0))
-        a = relay.nn.pad(a, pad_width=p, pad_value=input_zp, pad_mode="constant")
-        padding = (0, 0, 0, 0)
-    else:
-        if len(padding) == 2:
-            padding = (padding[0], padding[1], padding[0], padding[1])
-        shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3])
-    is_depthwise = shape[3] == channels == groups
-    weight_format = "HWOI" if is_depthwise else "HWIO"
-    if weight_format == "HWIO":
-        weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
-    else:
-        weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
-    w = tvm.nd.array(np.random.uniform(low, high, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.qnn.op.conv2d(
-        a,
-        weights,
-        input_zero_point=relay.const(input_zp, "int32"),
-        kernel_zero_point=relay.const(kernel_zp, "int32"),
-        input_scale=relay.const(input_sc, "float32"),
-        kernel_scale=relay.const(kernel_sc, "float32"),
-        kernel_size=(kernel_h, kernel_w),
-        data_layout="NHWC",
-        kernel_layout=weight_format,
-        dilation=dilation,
-        strides=strides,
-        padding=padding,
-        groups=groups,
-        channels=channels,
-        out_dtype="int32",
-    )
-    params = {"w": w}
-    if has_bias:
-        bias_shape = weight_shape[2] if is_depthwise else weight_shape[3]
-        b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype("int32"))
-        biasc = relay.const(b, "int32")
-        out = relay.nn.bias_add(out, biasc, axis=3)
-        params["b"] = b
-    if has_activation:
-        out = relay.nn.relu(out)
-    req = relay.qnn.op.requantize(
-        out,
-        relay.const(input_sc * kernel_sc, "float32"),  # input scale
-        relay.const(0, "int32"),  # input zero point
-        relay.const(output_sc, "float32"),  # output scale
-        relay.const(output_zp, "int32"),  # output zero point
-        out_dtype=dtype,
-    )
-    return req, params
-
-
-def _get_expected_codegen(
-    shape,
-    kernel_h,
-    kernel_w,
-    padding,
-    strides,
-    dilation,
-    groups,
-    dtype,
-    channels,
-    has_bias=False,
-    has_activation=False,
-):
-    if len(padding) == 2:
-        padding = (padding[0], padding[1], padding[0], padding[1])
-    output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1
-    output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1
-    output_shape = (1, int(output_height), int(output_width), channels)
-    out_dtype = "int32" if dtype in QNN_DTYPES else "float32"
-    is_depthwise = shape[3] == channels == groups
-    weight_format = "IHWO" if is_depthwise else "OHWI"
-    if weight_format == "IHWO":
-        weight_shape = (shape[3] // groups, kernel_h, kernel_w, channels)
-    else:
-        weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups)
-    if is_depthwise:
-        name = "nn.depthwise_conv2d"
-    else:
-        name = "nn.conv2d"
-
-    node = {
-        "op": "kernel",
-        "name": name,
-        "inputs": [],
-        "attrs": {
-            "groups": [[str(groups)]],
-            "num_outputs": "1",
-            "data_layout": [["NHWC"]],
-            "kernel_layout": [[weight_format]],
-            "channels": [[str(channels)]],
-            "dilation": [[str(dilation[0]), str(dilation[1])]],
-            "out_layout": [[""]],
-            "out_dtype": [[out_dtype]],
-            "kernel_size": [[str(kernel_h), str(kernel_w)]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "padding": [[str(p) for p in padding]],
-            "strides": [[str(s) for s in strides]],
-        },
-    }
-
-    if has_activation:
-        node["attrs"]["activation_type"] = [["relu"]]
-
-    inputs = [
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]},
-        },
-    ]
-
-    # qnn.conv2d params, input and kernel
-    if dtype in QNN_DTYPES:
-        node["name"] = "qnn." + node["name"].split(".")[1]
-        for param_dtype in ["int32", "float32"]:
-            for _ in range(2):
-                inputs.append(
-                    {
-                        "op": "const",
-                        "name": "",
-                        "attrs": {"shape": [[[]]], "dtype": [[param_dtype]]},
-                    }
-                )
-
-    if has_bias:
-        bias_dtype = "int32" if dtype in QNN_DTYPES else "float32"
-        inputs.append(
-            {
-                "op": "const",
-                "name": "",
-                "attrs": {
-                    "shape": [[[1, 1, 1, weight_shape[3] if is_depthwise else weight_shape[0]]]],
-                    "dtype": [[bias_dtype]],
-                },
-            }
-        )
-
-    # qnn.conv2d params, output
-    if dtype in QNN_DTYPES:
-        for param_dtype in ["float32", "int32"]:
-            inputs.append(
-                {"op": "const", "name": "", "attrs": {"shape": [[[]]], "dtype": [[param_dtype]]}}
-            )
-
-    input_idx = 0
-    for _ in range(len(inputs)):
-        node["inputs"].append([input_idx, 0, 0])
-        input_idx += 1
-    node["attrs"]["num_inputs"] = str(len(inputs))
-    inputs.append(node)
-    return inputs
-
-
-@pytest.mark.parametrize(
-    "trial",
-    [
-        # Normal convolution
-        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
-        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
-        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
-        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
-        # Depth-wise convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
-        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
-        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
-        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
-        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
-    ],
-)
-def test_conv2d(trial):
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    dtype = "float32"
-
-    (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        is_depthwise,
-    ) = trial
-    shape = (1, *shape)
-    if is_depthwise:
-        groups = shape[3]
-    else:
-        groups = 1
-    outputs = []
-    inputs = {
-        "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)),
-    }
-
-    func, params = _get_model(
-        shape,
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        groups,
-        dtype,
-        out_channels,
-        iter(inputs),
-        has_pad=composite[0],
-        has_bias=composite[1],
-        has_activation=composite[2],
-    )
-    # Generate results for ACL conv2d and TVM native conv2d for comparison
-    for acl in [False, True]:
-        outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0])
-
-    config = {
-        "shape": shape,
-        "groups": groups,
-        "kernel size": (kernel_h, kernel_w),
-        "padding": pad,
-        "stride": stride,
-        "dilation": dilation,
-        "out channels": out_channels,
-        "composite operators (pad, bias, activation)": composite,
-    }
-    verify(outputs, atol=0.002, rtol=0.01, config=config)
-
-
-@pytest.mark.parametrize(
-    "trial",
-    [
-        # Normal convolution
-        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
-        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
-        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
-        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
-        # Depth-wise convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
-        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
-        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
-        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
-        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
-    ],
-)
-def test_codegen_conv2d(trial):
-    if skip_codegen_test():
-        return
-
-    dtype = "float32"
-
-    (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        is_depthwise,
-    ) = trial
-    shape = (1, *shape)
-    if is_depthwise:
-        groups = shape[3]
-    else:
-        groups = 1
-    inputs = {"a"}
-
-    args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
-
-    func, params = _get_model(
-        *args,
-        var_names=iter(inputs),
-        has_pad=composite[0],
-        has_bias=composite[1],
-        has_activation=composite[2],
-    )
-    exp_codegen = _get_expected_codegen(*args, has_bias=composite[1], has_activation=composite[2])
-    verify_codegen(func, exp_codegen, 1)
-
-
-@pytest.mark.parametrize(
-    "trial",
-    [
-        # Normal convolution
-        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
-        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
-        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
-        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
-        # Depth-wise convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
-        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
-        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
-        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
-        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
-    ],
-)
-@pytest.mark.parametrize("dtype", QNN_DTYPES)
-def test_qnn_conv2d(trial, dtype):
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        is_depthwise,
-    ) = trial
-    shape = (1, *shape)
-    if is_depthwise:
-        groups = shape[3]
-    else:
-        groups = 1
-    outputs = []
-    inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
-
-    input_zp = 100
-    input_sc = 0.5
-    kernel_zp = 25
-    kernel_sc = 0.03
-    output_zp, output_sc = _get_qnn_params(
-        input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, shape[3]
-    )
-
-    func, params = _get_qnn_model(
-        shape,
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        groups,
-        dtype,
-        out_channels,
-        input_zp,
-        input_sc,
-        kernel_zp,
-        kernel_sc,
-        output_zp,
-        output_sc,
-        iter(inputs),
-        has_pad=composite[0],
-        has_bias=composite[1],
-        has_activation=composite[2],
-    )
-    for acl in [False, True]:
-        outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0])
-
-    config = {
-        "shape": shape,
-        "groups": groups,
-        "kernel size": (kernel_h, kernel_w),
-        "padding": pad,
-        "stride": stride,
-        "dilation": dilation,
-        "out channels": out_channels,
-        "composite operators (pad, bias, activation)": composite,
-        "input scale": input_sc,
-        "input zero point": input_zp,
-        "kernel scale": kernel_sc,
-        "kernel zero point": kernel_zp,
-        "output scale": output_sc,
-        "output zero point": output_zp,
-    }
-
-    atol = 2 if is_depthwise else 1
-    verify(outputs, atol=atol, rtol=0, config=config, verify_saturation=True)
-
-
-@pytest.mark.parametrize("dtype", QNN_DTYPES)
-@pytest.mark.parametrize(
-    "trial",
-    [
-        # Normal convolution
-        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
-        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
-        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
-        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
-        # Depth-wise convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
-        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
-        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
-        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
-        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
-    ],
-)
-def test_codegen_qnn_conv2d(trial, dtype):
-    if skip_codegen_test():
-        return
-
-    (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        is_depthwise,
-    ) = trial
-    shape = (1, *shape)
-    if is_depthwise:
-        groups = shape[3]
-    else:
-        groups = 1
-    inputs = {"a"}
-
-    input_zp = 100
-    input_sc = 0.5
-    kernel_zp = 25
-    kernel_sc = 0.03
-    output_zp, output_sc = _get_qnn_params(
-        input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, shape[3]
-    )
-
-    args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
-
-    func, params = _get_qnn_model(
-        *args,
-        input_zp=input_zp,
-        input_sc=input_sc,
-        kernel_zp=kernel_zp,
-        kernel_sc=kernel_sc,
-        output_zp=output_zp,
-        output_sc=output_sc,
-        var_names=iter(inputs),
-        has_pad=composite[0],
-        has_bias=composite[1],
-        has_activation=composite[2],
-    )
-    exp_codegen = _get_expected_codegen(*args, has_bias=composite[1], has_activation=composite[2])
-    verify_codegen(func, exp_codegen, 1)
-
-
-@pytest.mark.parametrize(
-    "param",
-    ["kernel_sc", "kernel_zp"],
-)
-def test_codegen_qnn_conv2d_per_channel_quantization(param):
-    if skip_codegen_test():
-        return
-
-    dtype = "int8"
-    kernel_h = 2
-    kernel_w = 2
-    pad = (1, 1)
-    stride = (1, 1)
-    dilation = (1, 1)
-    out_channels = 4
-    shape = (1, 10, 10, 14)
-    composite = (False, False, False)
-    groups = 1
-    inputs = {"a"}
-
-    qnn_params = {
-        "input_zp": 1,
-        "input_sc": 1,
-        "kernel_zp": 1,
-        "kernel_sc": 1,
-        "output_zp": 1,
-        "output_sc": 1,
-    }
-    qnn_params[param] = [1, 1, 1, 1]
-
-    args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
-
-    func, params = _get_qnn_model(
-        *args,
-        input_zp=qnn_params["input_zp"],
-        input_sc=qnn_params["input_sc"],
-        kernel_zp=qnn_params["kernel_zp"],
-        kernel_sc=qnn_params["kernel_sc"],
-        output_zp=qnn_params["output_zp"],
-        output_sc=qnn_params["output_sc"],
-        var_names=iter(inputs),
-        has_pad=composite[0],
-        has_bias=composite[1],
-        has_activation=composite[2],
-    )
-    exp_codegen = _get_expected_codegen(*args, has_bias=composite[1], has_activation=composite[2])
-    verify_codegen(func, exp_codegen, num_acl_modules=0, tvm_ops=2)
-
-
-if __name__ == "__main__":
-    test_conv2d()
-    test_qnn_conv2d()
-    test_codegen_conv2d()
-    test_codegen_qnn_conv2d()
-    test_codegen_qnn_conv2d_per_channel_quantization()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py
deleted file mode 100644
index 411f790f347d..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_dense.py
+++ /dev/null
@@ -1,430 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration dense tests."""
-import numpy as np
-import pytest
-
-import tvm
-from tvm import relay
-from tvm import testing
-from test_arm_compute_lib.infrastructure import (
-    Device,
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-
-
-def _get_model(shape, weight_shape, units, dtype, var_names, has_bias=False):
-    """Return a model and any parameters it may have"""
-    a = relay.var(next(var_names), shape=shape, dtype=dtype)
-    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.nn.dense(a, weights, units=units, out_dtype=dtype)
-    params = {"w": w}
-    if has_bias:
-        b = tvm.nd.array(np.random.randint(-128, 127, weight_shape[0]).astype(dtype))
-        biasc = relay.const(b, dtype)
-        out = relay.nn.bias_add(out, biasc)
-        params["b"] = b
-    return out, params
-
-
-def _get_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w):
-    """Get output qnn parameters given input and kernel parameters."""
-    input_max = input_sc * (255 - input_zp)
-    input_min = -input_sc * input_zp
-    kernel_max = kernel_sc * (255 - kernel_zp)
-    kernel_min = -kernel_sc * kernel_zp
-    output_limits = [
-        kernel_max * kernel_h * kernel_w * input_max,
-        kernel_min * kernel_h * kernel_w * input_max,
-        kernel_min * kernel_h * kernel_w * input_min,
-        kernel_max * kernel_h * kernel_w * input_min,
-    ]
-    output_max = max(output_limits)
-    output_min = min(output_limits)
-    output_sc = (output_max - output_min) / 255
-    output_zp = -int(output_min / output_sc)
-    return output_zp, output_sc
-
-
-def _get_qnn_model(
-    shape,
-    weight_shape,
-    units,
-    dtype,
-    input_zp,
-    input_sc,
-    kernel_zp,
-    kernel_sc,
-    output_zp,
-    output_sc,
-    var_names,
-    has_bias=False,
-):
-    a = relay.var(next(var_names), shape=shape, dtype=dtype)
-    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.qnn.op.dense(
-        a,
-        weights,
-        units=units,
-        input_zero_point=relay.const(input_zp, "int32"),
-        kernel_zero_point=relay.const(kernel_zp, "int32"),
-        input_scale=relay.const(input_sc, "float32"),
-        kernel_scale=relay.const(kernel_sc, "float32"),
-        out_dtype="int32",
-    )
-    params = {"w": w}
-    if has_bias:
-        b = tvm.nd.array(np.random.randint(0, 255, weight_shape[0]).astype("int32"))
-        biasc = relay.const(b, "int32")
-        out = relay.nn.bias_add(out, biasc)
-        params["b"] = b
-    out = relay.qnn.op.requantize(
-        out,
-        relay.const(input_sc * kernel_sc, "float32"),  # input scale
-        relay.const(0, "int32"),  # input zero point
-        relay.const(output_sc, "float32"),  # output scale
-        relay.const(output_zp, "int32"),  # output zero point
-        out_dtype=dtype,
-    )
-    return out, params
-
-
-def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False):
-    output_shape = (shape[0], units)
-    qnn_dtypes = ("uint8", "int8")
-    out_dtype = "int32" if dtype in qnn_dtypes else "float32"
-
-    node = {
-        "op": "kernel",
-        "name": "nn.dense",
-        "inputs": [],
-        "attrs": {
-            "num_outputs": "1",
-            "out_dtype": [[out_dtype]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "units": [[str(units)]],
-        },
-    }
-
-    inputs = [
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]},
-        },
-    ]
-
-    # qnn.dense params, input and kernel
-    if dtype in qnn_dtypes:
-        node["name"] = "qnn.dense"
-        for param_dtype in ["int32", "float32"]:
-            for _ in range(2):
-                inputs.append(
-                    {
-                        "op": "const",
-                        "name": "",
-                        "attrs": {"shape": [[[]]], "dtype": [[param_dtype]]},
-                    }
-                )
-
-    if has_bias:
-        bias_dtype = "int32" if dtype in qnn_dtypes else "float32"
-        bias_shape = [1, weight_shape[0]] if weight_shape[0] != 1 else [weight_shape[0]]
-        inputs.append(
-            {
-                "op": "const",
-                "name": "",
-                "attrs": {"shape": [[bias_shape]], "dtype": [[bias_dtype]]},
-            }
-        )
-
-    # qnn.dense params, output
-    if dtype in qnn_dtypes:
-        for param_dtype in ["float32", "int32"]:
-            inputs.append(
-                {"op": "const", "name": "", "attrs": {"shape": [[[]]], "dtype": [[param_dtype]]}}
-            )
-
-    input_idx = 0
-    for _ in range(len(inputs)):
-        node["inputs"].append([input_idx, 0, 0])
-        input_idx += 1
-    node["attrs"]["num_inputs"] = str(len(inputs))
-    inputs.append(node)
-    return inputs
-
-
-def test_dense():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-    dtype = "float32"
-    trials = [
-        [(1, 128), (16, 128), 16, True],
-        [(1, 128), (16, 128), 16, False],
-        [(32, 32), (32, 32), 32, True],
-        [(32, 32), (32, 32), 32, False],
-        [(1, 64), (1, 64), 1, True],
-        [(1, 64), (1, 64), 1, False],
-        [(11, 2), (2, 2), 2, True],
-        [(11, 2), (2, 2), 2, False],
-    ]
-    for shape, weight_shape, units, composite in trials:
-        outputs = []
-        inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
-        func, params = _get_model(
-            shape, weight_shape, units, dtype, var_names=iter(inputs), has_bias=composite
-        )
-        for acl in [False, True]:
-            outputs.append(
-                build_and_run(
-                    func,
-                    inputs,
-                    1,
-                    params,
-                    device,
-                    enable_acl=acl,
-                )[0]
-            )
-        config = {
-            "shape": shape,
-            "weight_shape": weight_shape,
-            "units": units,
-            "dtype": dtype,
-            "composite operators (bias)": composite,
-        }
-        verify(outputs, atol=0.001, rtol=0.01, config=config)
-
-
-def test_codegen_dense():
-    if skip_codegen_test():
-        return
-
-    np.random.seed(0)
-    dtype = "float32"
-    trials = [
-        [(1, 128), (16, 128), 16, True],
-        [(1, 128), (16, 128), 16, False],
-        [(32, 32), (32, 32), 32, True],
-        [(32, 32), (32, 32), 32, False],
-        [(1, 64), (1, 64), 1, True],
-        [(1, 64), (1, 64), 1, False],
-        [(11, 2), (2, 2), 2, True],
-        [(11, 2), (2, 2), 2, False],
-    ]
-    for shape, weight_shape, units, composite in trials:
-        inputs = {"a"}
-
-        args = (shape, weight_shape, units, dtype)
-
-        func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite)
-        exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(func, exp_codegen)
-
-
-@pytest.mark.parametrize(
-    "dtype,min_range,max_range",
-    [
-        ("uint8", 0, 255),
-        ("int8", -127, 128),
-    ],
-)
-def test_qnn_dense(dtype, min_range, max_range):
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    trials = [
-        [(1, 2), (2, 2), 2, True],
-        [(1, 2), (2, 2), 2, False],
-        [(4, 4), (4, 4), 4, True],
-        [(4, 4), (4, 4), 4, False],
-        [(16, 16), (4, 16), 4, True],
-        [(16, 16), (4, 16), 4, False],
-        [(1, 128), (16, 128), 16, True],
-        [(1, 128), (16, 128), 16, False],
-        [(32, 32), (32, 32), 32, True],
-        [(32, 32), (32, 32), 32, False],
-        [(1, 64), (1, 64), 1, True],
-        [(1, 64), (1, 64), 1, False],
-    ]
-    for shape, weight_shape, units, composite in trials:
-        outputs = []
-        inputs = {"a": tvm.nd.array(np.random.uniform(min_range, max_range, shape).astype(dtype))}
-        input_zp = 100
-        input_sc = 0.5
-        kernel_zp = 50
-        kernel_sc = 0.03
-        output_zp, output_sc = _get_qnn_params(
-            input_zp, input_sc, kernel_zp, kernel_sc, weight_shape[0], weight_shape[1]
-        )
-
-        func, params = _get_qnn_model(
-            shape,
-            weight_shape,
-            units,
-            dtype,
-            input_zp,
-            input_sc,
-            kernel_zp,
-            kernel_sc,
-            output_zp,
-            output_sc,
-            var_names=iter(inputs),
-            has_bias=composite,
-        )
-
-        for acl in [False, True]:
-            outputs.append(
-                build_and_run(
-                    func,
-                    inputs,
-                    1,
-                    params,
-                    device,
-                    enable_acl=acl,
-                )[0]
-            )
-
-        config = {
-            "shape": shape,
-            "weight_shape": weight_shape,
-            "units": units,
-            "dtype": dtype,
-            "composite operators (bias)": composite,
-            "input scale": input_sc,
-            "input zero point": input_zp,
-            "kernel scale": kernel_sc,
-            "kernel zero point": kernel_zp,
-            "output scale": output_sc,
-            "output zero point": output_zp,
-        }
-        verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True)
-
-
-@pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_codegen_qnn_dense(dtype):
-    if skip_codegen_test():
-        return
-
-    np.random.seed(0)
-
-    trials = [
-        [(1, 2), (2, 2), 2, True],
-        [(1, 2), (2, 2), 2, False],
-        [(4, 4), (4, 4), 4, True],
-        [(4, 4), (4, 4), 4, False],
-        [(16, 16), (4, 16), 4, True],
-        [(16, 16), (4, 16), 4, False],
-        [(1, 128), (16, 128), 16, True],
-        [(1, 128), (16, 128), 16, False],
-        [(32, 32), (32, 32), 32, True],
-        [(32, 32), (32, 32), 32, False],
-        [(1, 64), (1, 64), 1, True],
-        [(1, 64), (1, 64), 1, False],
-    ]
-    for shape, weight_shape, units, composite in trials:
-        inputs = {"a"}
-        args = (shape, weight_shape, units, dtype)
-
-        input_zp = 100
-        input_sc = 0.5
-        kernel_zp = 25
-        kernel_sc = 0.03
-        output_zp, output_sc = _get_qnn_params(
-            input_zp, input_sc, kernel_zp, kernel_sc, weight_shape[0], weight_shape[1]
-        )
-
-        func, params = _get_qnn_model(
-            *args,
-            var_names=iter(inputs),
-            input_zp=input_zp,
-            input_sc=input_sc,
-            kernel_zp=kernel_zp,
-            kernel_sc=kernel_sc,
-            output_zp=output_zp,
-            output_sc=output_sc,
-            has_bias=composite,
-        )
-        exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(func, exp_codegen)
-
-
-@pytest.mark.parametrize(
-    "param",
-    ["kernel_sc", "kernel_zp"],
-)
-def test_codegen_qnn_dense_per_channel_quantization(param):
-    if skip_codegen_test():
-        return
-
-    np.random.seed(0)
-    dtype = "int8"
-    shape = (1, 2)
-    weight_shape = (2, 2)
-    units = 2
-    composite = True
-    inputs = {"a"}
-    args = (shape, weight_shape, units, dtype)
-
-    qnn_params = {
-        "input_zp": 1,
-        "input_sc": 1,
-        "kernel_zp": 1,
-        "kernel_sc": 1,
-        "output_zp": 1,
-        "output_sc": 1,
-    }
-    qnn_params[param] = [1, 1]
-
-    func, _ = _get_qnn_model(
-        *args,
-        var_names=iter(inputs),
-        input_zp=qnn_params["input_zp"],
-        input_sc=qnn_params["input_sc"],
-        kernel_zp=qnn_params["kernel_zp"],
-        kernel_sc=qnn_params["kernel_sc"],
-        output_zp=qnn_params["output_zp"],
-        output_sc=qnn_params["output_sc"],
-        has_bias=composite,
-    )
-    exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-    verify_codegen(func, exp_codegen, num_acl_modules=0, tvm_ops=3)
-
-
-if __name__ == "__main__":
-    test_dense()
-    test_qnn_dense()
-    test_codegen_dense()
-    test_codegen_qnn_dense()
-    test_codegen_qnn_dense_per_channel_quantization()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_maximum.py b/tests/python/contrib/test_arm_compute_lib/test_maximum.py
deleted file mode 100644
index 1942d1e213a5..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_maximum.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration reshape tests."""
-
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm import testing
-
-from .infrastructure import (
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-from .infrastructure import Device
-
-
-def _get_model(input_shape, dtype, var_names):
-    """Return a model and any parameters it may have."""
-    a = relay.var(next(var_names), shape=input_shape, dtype=dtype)
-    b = relay.var(next(var_names), shape=input_shape, dtype=dtype)
-    max = relay.maximum(a, b)
-    return max
-
-
-def _get_expected_codegen(shape, dtype):
-    node = {
-        "op": "kernel",
-        "name": "maximum",
-        "inputs": [[0, 0, 0], [1, 0, 0]],
-        "attrs": {
-            "num_inputs": "2",
-            "num_outputs": "1",
-            "shape": [[list(shape)]],
-            "dtype": [[dtype]],
-        },
-    }
-
-    inputs = [
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}},
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}},
-    ]
-    inputs.append(node)
-    return inputs
-
-
-def test_maximum():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    for dtype, low, high, atol, rtol in [
-        ("float32", -127, 128, 0.001, 0.001),
-        ("float32", -1, 1, 0.001, 0.001),
-    ]:
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(low, high, (100, 100)).astype(dtype)),
-            "b": tvm.nd.array(np.random.uniform(low, high, (100, 100)).astype(dtype)),
-        }
-        outputs = []
-        func = _get_model(inputs["a"].shape, dtype, iter(inputs))
-
-        for acl in [False, True]:
-            outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0])
-
-        verify(outputs, atol=1e-7, rtol=1e-7)
-
-
-def test_codegen_maximum():
-    if skip_codegen_test():
-        return
-
-    shape = (100, 100)
-    inputs = {"a", "b"}
-    for dtype in ["float32"]:
-        args = (shape, dtype)
-        func = _get_model(*args, iter(inputs))
-        exp_codegen = _get_expected_codegen(*args)
-        verify_codegen(func, exp_codegen, 1)
-
-
-if __name__ == "__main__":
-    test_maximum()
-    test_codegen_maximum()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
deleted file mode 100644
index 8c6302abf842..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library network tests."""
-
-from packaging.version import parse
-
-import numpy as np
-import pytest
-from tvm import relay
-
-from test_arm_compute_lib.infrastructure import Device, skip_runtime_test, build_and_run, verify
-
-
-def _build_and_run_network(mod, params, inputs, device, tvm_ops, acl_partitions, atol, rtol):
-    """Helper function to build and run a network."""
-    data = {}
-    np.random.seed(0)
-
-    for name, (shape, dtype) in inputs.items():
-        if dtype == "uint8":
-            low, high = 0, 255
-        else:
-            low, high = -127, 128
-        data[name] = np.random.uniform(low, high, shape).astype(dtype)
-
-    outputs = []
-    for acl in [False, True]:
-        outputs.append(
-            build_and_run(
-                mod,
-                data,
-                1,
-                params,
-                device,
-                enable_acl=acl,
-                tvm_ops=tvm_ops,
-                acl_partitions=acl_partitions,
-            )[0]
-        )
-    verify(outputs, atol=atol, rtol=rtol, verify_saturation=False)
-
-
-def _get_tflite_model(tflite_model_path, inputs_dict):
-    """Convert TFlite graph to relay."""
-    try:
-        import tflite.Model
-    except ImportError:
-        pytest.skip("Missing Tflite support")
-
-    with open(tflite_model_path, "rb") as f:
-        tflite_model_buffer = f.read()
-
-    try:
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0)
-    except AttributeError:
-        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0)
-    shape_dict = {}
-    dtype_dict = {}
-    for input in inputs_dict:
-        input_shape, input_dtype = inputs_dict[input]
-        shape_dict[input] = input_shape
-        dtype_dict[input] = input_dtype
-
-    return relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)
-
-
-def _get_keras_model(keras_model, inputs_dict):
-    """Convert Keras graph to relay."""
-    inputs = {}
-    for name, (shape, _) in inputs_dict.items():
-        inputs[keras_model.input_names[0]] = shape
-    return relay.frontend.from_keras(keras_model, inputs, layout="NHWC")
-
-
-def test_vgg16():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-
-    def get_model():
-        try:
-            from keras.applications import VGG16
-        except ImportError:
-            pytest.skip("Missing Keras Package")
-
-        vgg16 = VGG16(include_top=True, weights="imagenet", input_shape=(224, 224, 3), classes=1000)
-        inputs = {vgg16.input_names[0]: ((1, 224, 224, 3), "float32")}
-        mod, params = _get_keras_model(vgg16, inputs)
-        return mod, params, inputs
-
-    _build_and_run_network(
-        *get_model(),
-        device=device,
-        tvm_ops=4,
-        acl_partitions=21,
-        atol=0.002,
-        rtol=0.01,
-    )
-
-
-def test_mobilenet():
-    keras = pytest.importorskip("keras")
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-
-    def get_model():
-        try:
-            from keras.applications import MobileNet
-        except ImportError:
-            pytest.skip("Missing keras module")
-
-        mobilenet = MobileNet(
-            include_top=True, weights="imagenet", input_shape=(224, 224, 3), classes=1000
-        )
-        inputs = {mobilenet.input_names[0]: ((1, 224, 224, 3), "float32")}
-        mod, params = _get_keras_model(mobilenet, inputs)
-        return mod, params, inputs
-
-    if parse(keras.__version__) < parse("2.9"):
-        # This can be removed after we migrate to TF/Keras >= 2.9
-        expected_tvm_ops = 56
-        expected_acl_partitions = 31
-    else:
-        # In Keras >= 2.7, one reshape operator was removed
-        # from the MobileNet model, so it impacted this test
-        # which now needs to be reduce in by 1
-        # The change in Keras is `b6abfaed1326e3c`
-        expected_tvm_ops = 55
-        expected_acl_partitions = 30
-
-    _build_and_run_network(
-        *get_model(),
-        device=device,
-        tvm_ops=expected_tvm_ops,
-        acl_partitions=expected_acl_partitions,
-        atol=0.002,
-        rtol=0.01,
-    )
-
-
-def test_quantized_mobilenet():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    try:
-        import tvm.relay.testing.tf as tf_testing
-    except ImportError:
-        pytest.skip("Missing Tflite support")
-
-    device = Device()
-
-    def get_model():
-        model_path = tf_testing.get_workload_official(
-            "https://storage.googleapis.com/download.tensorflow.org/"
-            "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
-            "mobilenet_v1_1.0_224_quant.tflite",
-        )
-        inputs = {"input": ((1, 224, 224, 3), "uint8")}
-        mod, params = _get_tflite_model(model_path, inputs_dict=inputs)
-        return mod, params, inputs
-
-    _build_and_run_network(
-        *get_model(),
-        device=device,
-        tvm_ops=3,
-        acl_partitions=30,
-        atol=10,
-        rtol=0,
-    )
-
-
-def test_squeezenet():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    try:
-        import tvm.relay.testing.tf as tf_testing
-    except ImportError:
-        pytest.skip("Missing TF Support")
-
-    device = Device()
-
-    def get_model():
-        model_path = tf_testing.get_workload_official(
-            "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz",
-            "squeezenet.tflite",
-        )
-        inputs = {"Placeholder": ((1, 224, 224, 3), "float32")}
-        mod, params = _get_tflite_model(model_path, inputs_dict=inputs)
-        return mod, params, inputs
-
-    _build_and_run_network(
-        *get_model(),
-        device=device,
-        tvm_ops=9,
-        acl_partitions=31,
-        atol=8,
-        rtol=0,
-    )
-
-
-if __name__ == "__main__":
-    test_vgg16()
-    test_mobilenet()
-    test_quantized_mobilenet()
-    test_squeezenet()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
deleted file mode 100644
index f08fa0059ddc..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ /dev/null
@@ -1,419 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration pooling tests."""
-import numpy as np
-import pytest
-
-import tvm
-from tvm import relay, testing
-
-from test_arm_compute_lib.infrastructure import (
-    Device,
-    build_and_run,
-    skip_codegen_test,
-    skip_runtime_test,
-    verify,
-    verify_codegen,
-)
-
-
-def _calculate_output_shape(shape, sizes, padding, strides, dilation):
-    """Calculate pooling output shape."""
-    height_receptive_field = (sizes[0] - 1) * dilation[0] + 1
-    width_receptive_field = (sizes[1] - 1) * dilation[1] + 1
-    output_height = ((shape[1] - height_receptive_field + padding[0] + padding[2]) / strides[0]) + 1
-    output_width = ((shape[2] - width_receptive_field + padding[1] + padding[3]) / strides[1]) + 1
-    return 1, int(output_height), int(output_width), shape[3]
-
-
-def _get_pooling_model(
-    shape, dtype, typef, sizes, strides, dilation, padding, ceil_mode, count_include_pad, var_names
-):
-    """Return a model and any parameters it may have."""
-    if len(padding) == 2:
-        padding = (padding[0], padding[1], padding[0], padding[1])
-    out = relay.var(next(var_names), shape=shape, dtype=dtype)
-    qnn_dtypes = ("uint8", "int8")
-
-    if typef == "nn.max_pool2d":
-        out = relay.nn.max_pool2d(
-            out,
-            pool_size=sizes,
-            strides=strides,
-            dilation=dilation,
-            padding=padding,
-            ceil_mode=ceil_mode,
-            layout="NHWC",
-        )
-    elif typef == "nn.avg_pool2d":
-        if dtype in qnn_dtypes:
-            out = relay.cast(out, "int32")
-        out = relay.nn.avg_pool2d(
-            out,
-            pool_size=sizes,
-            strides=strides,
-            dilation=dilation,
-            padding=padding,
-            ceil_mode=ceil_mode,
-            count_include_pad=count_include_pad,
-            layout="NHWC",
-        )
-        if dtype in qnn_dtypes:
-            out = relay.cast(out, dtype)
-    elif typef == "nn.l2_pool2d":
-        out = relay.power(out, relay.const(2.0))
-        out = relay.nn.avg_pool2d(
-            out,
-            pool_size=sizes,
-            strides=strides,
-            padding=padding,
-            ceil_mode=ceil_mode,
-            count_include_pad=count_include_pad,
-            layout="NHWC",
-        )
-        out = relay.sqrt(out)
-    else:
-        raise ValueError("Function not supported")
-
-    return out
-
-
-def _get_global_pooling_model(shape, dtype, typef, var_names):
-    """Return a model and any parameters it may have."""
-    out = relay.var(next(var_names), shape=shape, dtype=dtype)
-    qnn_dtypes = ("uint8", "int8")
-
-    if typef == "nn.global_max_pool2d":
-        out = relay.nn.global_max_pool2d(out, layout="NHWC")
-    elif typef == "nn.global_avg_pool2d":
-        if dtype in qnn_dtypes:
-            out = relay.cast(out, "int32")
-        out = relay.nn.global_avg_pool2d(out, layout="NHWC")
-        if dtype in qnn_dtypes:
-            out = relay.cast(out, dtype)
-    else:
-        raise ValueError("Function not supported")
-
-    return out
-
-
-def _get_expected_pooling_codegen(
-    shape, dtype, typef, sizes, strides, dilation, padding, ceil_mode, count_include_pad
-):
-    if len(padding) == 2:
-        padding = (padding[0], padding[1], padding[0], padding[1])
-    output_shape = _calculate_output_shape(shape, sizes, padding, strides, dilation)
-
-    node = {
-        "op": "kernel",
-        "name": typef,
-        "inputs": [[0, 0, 0]],
-        "attrs": {
-            "num_inputs": "1",
-            "num_outputs": "1",
-            "layout": [["NHWC"]],
-            "out_layout": [[""]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "padding": [[str(p) for p in padding]],
-            "strides": [[str(s) for s in strides]],
-            "dilation": [[str(d) for d in dilation]],
-            "pool_size": [[str(s) for s in sizes]],
-            "ceil_mode": [[str(1 if ceil_mode else 0)]],
-        },
-    }
-
-    if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d":
-        node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]]
-
-    input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
-    return [input, node]
-
-
-def _get_expected_global_pooling_codegen(shape, dtype, typef):
-    node = {
-        "op": "kernel",
-        "name": typef,
-        "inputs": [[0, 0, 0]],
-        "attrs": {
-            "num_inputs": "1",
-            "num_outputs": "1",
-            "layout": [["NHWC"]],
-            "out_layout": [[""]],
-            "shape": [[[1, 1, 1, shape[3]]]],
-            "dtype": [[dtype]],
-        },
-    }
-
-    input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
-    return [input, node]
-
-
-def _get_low_high_atol_rtol(dtype):
-    if dtype == "float32":
-        low, high, atol, rtol = (-127, 128, 0.001, 0.001)
-    elif dtype == "uint8":
-        low, high, atol, rtol = (0, 255, 1, 0)
-    elif dtype == "int8":
-        low, high, atol, rtol = (-127, 128, 1, 0)
-    else:
-        pytest.fail(f"dtype not expected: {dtype}")
-
-    return low, high, atol, rtol
-
-
-# fmt: off
-@pytest.mark.parametrize(
-     "typef,dtype,size,stride,dilation,pad,ceil_mode,count_include_pad,input_shape,expected_ops",
-     [
-        ("nn.max_pool2d", "float32",  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (27, 27, 512), (0, 1),),
-        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False, True,  (16, 16, 16),  (0, 1),),
-        ("nn.max_pool2d", "float32",  (3, 3), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),),
-        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.max_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),),
-        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (3, 2), (1, 1), True,  True,  (15, 15, 16),  (1, 0),),
-        ("nn.max_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),),
-        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (3, 2), (1, 1), True,  True,  (15, 15, 16),  (1, 0),),
-        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False, True,  (16, 16, 16),  (0, 1),),
-        ("nn.avg_pool2d", "float32",  (3, 3), (2, 2), (3, 2), (0, 1), True,  False, (15, 15, 16),  (1, 0),),
-        # 20.05: "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"
-        # ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)],
-        ("nn.avg_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.avg_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (0, 1), True,  False, (16, 16, 16),  (0, 1),),
-        ("nn.l2_pool2d",  "float32",  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (16, 16, 16),  (0, 1),),
-        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, True,  (15, 15, 16),  (0, 1),),
-
-     ],
-)
-# fmt: on
-def test_pooling(
-    typef,
-    dtype,
-    size,
-    stride,
-    dilation,
-    pad,
-    ceil_mode,
-    count_include_pad,
-    input_shape,
-    expected_ops,
-):
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    low, high, atol, rtol = _get_low_high_atol_rtol(dtype)
-    tvm_ops, acl_partitions = expected_ops
-
-    shape = (1, *input_shape)
-    outputs = []
-    inputs = {
-        "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-    }
-
-    func = _get_pooling_model(
-        shape,
-        dtype,
-        typef,
-        size,
-        stride,
-        dilation,
-        pad,
-        ceil_mode,
-        count_include_pad,
-        iter(inputs),
-    )
-
-    config = {
-        "size": size,
-        "stride": stride,
-        "shape": shape,
-        "pooling type": typef,
-        "dtype": dtype,
-        "padding": pad,
-        "dilation": dilation,
-        "ceil_mode": ceil_mode,
-        "count_include_pad": count_include_pad,
-        "inputs": inputs,
-    }
-    verify_saturation = True if dtype == "uint8" else False
-    for acl in [False, True]:
-        outputs.append(
-            build_and_run(
-                func,
-                inputs,
-                1,
-                None,
-                device,
-                enable_acl=acl,
-                tvm_ops=tvm_ops,
-                acl_partitions=acl_partitions,
-                config=config,
-            )[0]
-        )
-
-    verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
-
-
-@pytest.mark.parametrize(
-    "typef,dtype,input_shape",
-    [
-        ["nn.global_max_pool2d", "float32", (8, 8, 16)],
-        ["nn.global_max_pool2d", "float32", (9, 9, 16)],
-        ["nn.global_max_pool2d", "uint8", (8, 8, 16)],
-        ["nn.global_max_pool2d", "uint8", (9, 9, 16)],
-        ["nn.global_max_pool2d", "int8", (8, 8, 16)],
-        ["nn.global_max_pool2d", "int8", (9, 9, 16)],
-        ["nn.global_avg_pool2d", "float32", (8, 8, 16)],
-        ["nn.global_avg_pool2d", "float32", (9, 9, 16)],
-        ["nn.global_avg_pool2d", "uint8", (8, 8, 16)],
-        ["nn.global_avg_pool2d", "uint8", (9, 9, 16)],
-        ["nn.global_avg_pool2d", "int8", (8, 8, 16)],
-        ["nn.global_avg_pool2d", "int8", (9, 9, 16)],
-    ],
-)
-def test_global_pooling(typef, dtype, input_shape):
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    low, high, rtol, atol = _get_low_high_atol_rtol(dtype)
-
-    shape = (1, *input_shape)
-    outputs = []
-    inputs = {
-        "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-    }
-
-    func = _get_global_pooling_model(shape, dtype, typef, iter(inputs))
-    config = {
-        "shape": shape,
-        "pooling type": typef,
-        "dtype": dtype,
-    }
-    verify_saturation = True if dtype in ("uint8", "int8") else False
-
-    for acl in [False, True]:
-        outputs.append(
-            build_and_run(func, inputs, 1, None, device, enable_acl=acl, config=config)[0]
-        )
-
-    verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
-
-
-# fmt: off
-@pytest.mark.parametrize(
-     "typef,dtype,size,stride,dilation,pad,ceil_mode,count_include_pad,input_shape,expected_ops",
-     [
-        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False,  True, (16, 16, 16), (0, 1),),
-        ("nn.max_pool2d", "float32",  (3, 3), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),),
-        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.max_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),),
-        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (3, 2), (1, 1),  True,  True, (15, 15, 16), (1, 0),),
-        ("nn.max_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),),
-        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (3, 2), (1, 1),  True,  True, (15, 15, 16), (1, 0),),
-        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False,  True, (16, 16, 16), (0, 1),),
-        ("nn.avg_pool2d", "float32",  (3, 3), (2, 2), (3, 2), (0, 1),  True, False, (15, 15, 16), (1, 0),),
-        ("nn.avg_pool2d", "uint8", (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (16, 16, 16), (0, 1),),
-        ("nn.avg_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.avg_pool2d", "int8", (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (16, 16, 16), (0, 1),),
-        ("nn.avg_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
-        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (0, 1),  True, False, (15, 15, 16), (0, 1),),
-        ("nn.l2_pool2d",  "float32",  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (16, 16, 16), (0, 1),),
-        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (15, 15, 16), (0, 1),),
-     ],
-)
-# fmt: on
-def test_codegen_pooling(
-    typef,
-    dtype,
-    size,
-    stride,
-    dilation,
-    pad,
-    ceil_mode,
-    count_include_pad,
-    input_shape,
-    expected_ops,
-):
-    if skip_codegen_test():
-        return
-
-    low, high, _, _ = _get_low_high_atol_rtol(dtype)
-    tvm_ops, acl_partitions = expected_ops
-
-    shape = (1, *input_shape)
-    inputs = {"a"}
-    args = (shape, dtype, typef, size, stride, dilation, pad, False, False)
-    func = _get_pooling_model(*args, iter(inputs))
-    exp_codegen = _get_expected_pooling_codegen(*args)
-
-    verify_codegen(func, exp_codegen, acl_partitions, tvm_ops)
-
-
-@pytest.mark.parametrize(
-    "typef,dtype,input_shape",
-    [
-        ("nn.global_max_pool2d", "float32", (8, 8, 16)),
-        ("nn.global_max_pool2d", "float32", (9, 9, 16)),
-        ("nn.global_max_pool2d", "uint8", (8, 8, 16)),
-        ("nn.global_max_pool2d", "uint8", (9, 9, 16)),
-        ("nn.global_max_pool2d", "int8", (8, 8, 16)),
-        ("nn.global_max_pool2d", "int8", (9, 9, 16)),
-        ("nn.global_avg_pool2d", "float32", (8, 8, 16)),
-        ("nn.global_avg_pool2d", "float32", (9, 9, 16)),
-        ("nn.global_avg_pool2d", "uint8", (8, 8, 16)),
-        ("nn.global_avg_pool2d", "uint8", (9, 9, 16)),
-        ("nn.global_avg_pool2d", "int8", (8, 8, 16)),
-        ("nn.global_avg_pool2d", "int8", (9, 9, 16)),
-    ],
-)
-def test_codegen_global_pooling(typef, dtype, input_shape):
-    if skip_codegen_test():
-        return
-
-    low, high, _, _ = _get_low_high_atol_rtol(dtype)
-
-    shape = (1, *input_shape)
-    inputs = {"a"}
-    args = (shape, dtype, typef)
-    func = _get_global_pooling_model(*args, iter(inputs))
-    exp_codegen = _get_expected_global_pooling_codegen(*args)
-    verify_codegen(func, exp_codegen, 1)
-
-
-if __name__ == "__main__":
-    test_pooling()
-    test_global_pooling()
-    test_codegen_pooling()
-    test_codegen_global_pooling()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py
deleted file mode 100644
index 611599154c8a..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library integration reshape tests."""
-
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm import testing
-
-from .infrastructure import (
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-from .infrastructure import Device
-
-
-def _get_model(input_shape, output_shape, dtype, var_names):
-    """Return a model and any parameters it may have."""
-    a = relay.var(next(var_names), shape=input_shape, dtype=dtype)
-    reshape = relay.reshape(a, output_shape)
-    return reshape
-
-
-def _get_expected_codegen(input_shape, output_shape, dtype):
-    node = {
-        "op": "kernel",
-        "name": "reshape",
-        "inputs": [[0, 0, 0]],
-        "attrs": {
-            "num_inputs": "1",
-            "num_outputs": "1",
-            "newshape": [[str(s) for s in output_shape]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "allowzero": [["0"]],
-        },
-    }
-
-    input = {
-        "op": "input",
-        "name": "",
-        "attrs": {"shape": [[list(input_shape)]], "dtype": [[dtype]]},
-    }
-
-    return [input, node]
-
-
-def test_reshape():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    for dtype, low, high, atol, rtol in [
-        ("float32", -127, 128, 0.001, 0.001),
-        ("uint8", 0, 255, 0, 0),
-    ]:
-        inputs = {"a": tvm.nd.array(np.random.uniform(low, high, (1, 1, 1, 1000)).astype(dtype))}
-
-        for new_shape in [(1, 1000), (10, 10, 10), (10, 100, 1), (1, 1000, 1)]:
-            outputs = []
-            func = _get_model(inputs["a"].shape, new_shape, dtype, iter(inputs))
-            for acl in [False, True]:
-                outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0])
-
-            config = {
-                "new shape": inputs["a"].shape,
-                "shape": new_shape,
-                "dtype": dtype,
-            }
-            verify(outputs, atol=1e-7, rtol=1e-7, config=config)
-
-
-def test_codegen_reshape():
-    if skip_codegen_test():
-        return
-
-    shape = (1, 1, 1, 1000)
-    inputs = {"a"}
-    for dtype in ["float32", "uint8"]:
-        for new_shape in [(1, 1000), (10, 10, 10), (10, 100, 1)]:
-            args = (shape, new_shape, dtype)
-            func = _get_model(*args, iter(inputs))
-            exp_codegen = _get_expected_codegen(*args)
-            verify_codegen(func, exp_codegen, 1)
-
-
-if __name__ == "__main__":
-    test_reshape()
-    test_codegen_reshape()
diff --git a/tests/python/contrib/test_arm_compute_lib/test_runtime.py b/tests/python/contrib/test_arm_compute_lib/test_runtime.py
deleted file mode 100644
index 316dfad9e78f..000000000000
--- a/tests/python/contrib/test_arm_compute_lib/test_runtime.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Arm Compute Library runtime tests."""
-
-import numpy as np
-
-import tvm
-from tvm import relay
-
-from .infrastructure import skip_runtime_test, build_and_run, verify
-from .infrastructure import Device
-
-
-def test_multiple_ops():
-    """
-    Test multiple operators destined for ACL.
-    The ACL runtime will expect these ops as 2 separate functions for
-    the time being.
-    """
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    def get_model(input_shape, var_names):
-        """Return a model and any parameters it may have."""
-        a = relay.var(next(var_names), shape=input_shape, dtype="float32")
-        out = relay.reshape(a, (1, 1, 1000))
-        out = relay.reshape(out, (1, 1000))
-        return out
-
-    inputs = {"a": tvm.nd.array(np.random.uniform(0, 1, (1, 1, 1, 1000)).astype("float32"))}
-
-    outputs = []
-    for acl in [False, True]:
-        func = get_model(inputs["a"].shape, iter(inputs))
-        outputs.append(
-            build_and_run(func, inputs, 1, None, device, enable_acl=acl, acl_partitions=2)[0]
-        )
-    verify(outputs, atol=0.002, rtol=0.01)
-
-
-def test_heterogeneous():
-    """
-    Test to check if offloading only supported operators works,
-    while leaving unsupported operators computed via tvm.
-    """
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    def get_model(input_shape, var_names):
-        """Return a model and any parameters it may have."""
-        a = relay.var(next(var_names), shape=input_shape, dtype="float32")
-        out = relay.reshape(a, (1, 1, 1000))
-        out = relay.sigmoid(out)
-        out = relay.reshape(out, (1, 1000))
-        return out
-
-    inputs = {"a": tvm.nd.array(np.random.uniform(-127, 128, (1, 1, 1, 1000)).astype("float32"))}
-
-    outputs = []
-    for acl in [False, True]:
-        func = get_model(inputs["a"].shape, iter(inputs))
-        outputs.append(
-            build_and_run(
-                func, inputs, 1, None, device, enable_acl=acl, tvm_ops=1, acl_partitions=2
-            )[0]
-        )
-    verify(outputs, atol=0.002, rtol=0.01)
-
-
-def test_multiple_runs():
-    """
-    Test that multiple runs of an operator work.
-    """
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-
-    def get_model():
-        a = relay.var("a", shape=(1, 28, 28, 512), dtype="float32")
-        w = tvm.nd.array(np.ones((256, 1, 1, 512), dtype="float32"))
-        weights = relay.const(w, "float32")
-        conv = relay.nn.conv2d(
-            a,
-            weights,
-            kernel_size=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-            strides=(1, 1),
-            padding=(0, 0),
-            dilation=(1, 1),
-        )
-        params = {"w": w}
-        return conv, params
-
-    inputs = {
-        "a": tvm.nd.array(np.random.uniform(-127, 128, (1, 28, 28, 512)).astype("float32")),
-    }
-
-    func, params = get_model()
-    outputs = build_and_run(func, inputs, 1, params, device, enable_acl=True, no_runs=3)
-    verify(outputs, atol=0.002, rtol=0.01)
-
-
-if __name__ == "__main__":
-    test_multiple_ops()
-    test_heterogeneous()
-    test_multiple_runs()
diff --git a/tests/python/contrib/test_bnns/__init__.py b/tests/python/contrib/test_bnns/__init__.py
deleted file mode 100644
index 724b23f1378b..000000000000
--- a/tests/python/contrib/test_bnns/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Infrastructure and tests for BNNS"""
diff --git a/tests/python/contrib/test_bnns/infrastructure.py b/tests/python/contrib/test_bnns/infrastructure.py
deleted file mode 100644
index 066829d18bd4..000000000000
--- a/tests/python/contrib/test_bnns/infrastructure.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from itertools import zip_longest, combinations
-import json
-import os
-import warnings
-
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm import rpc
-from tvm.contrib import graph_executor
-from tvm.relay.op.contrib.bnns import partition_for_bnns
-from tvm.contrib import utils
-from tvm.autotvm.measure import request_remote
-from tvm.relay.analysis import analysis
-
-
-class Device:
-    """
-    Common device configuration for python tests.
-
-    Check tests/python/contrib/arm_compute_lib/ for the presence of an test_config.json file.
-    This file can be used to override the default configuration here which will attempt to run the BNNS
-    runtime tests locally if the runtime is available. Changing the configuration will allow these
-    runtime tests to be offloaded to a remote device with BNNS via a tracker for example.
-
-    Notes
-    -----
-        The test configuration will be loaded once when the class is created. If the configuration
-        changes between tests, any changes will not be picked up.
-
-
-    Attributes
-    ----------
-    connection_type : str
-        Details the type of RPC connection to use. Options:
-        local - Use the local device,
-        tracker - Connect to a tracker to request a remote device,
-        remote - Connect to a remote device directly.
-    host : str
-        Specify IP address or hostname of remote target.
-    port : int
-        Specify port number of remote target.
-    target : str
-        The compilation target.
-    device_key : str
-        The device key of the remote target. Use when connecting to a remote device via a tracker.
-    cross_compile : str
-        Specify path to cross compiler to use when connecting a remote device from a non-arm platform.
-    """
-
-    connection_type = "local"
-    host = "127.0.0.1"
-    port = 9090
-    target = "llvm"
-    device_key = ""
-    cross_compile = ""
-
-    def __init__(self):
-        """Keep remote device for lifetime of object."""
-        self.device = self._get_remote()
-
-    @classmethod
-    def _get_remote(cls):
-        """Get a remote (or local) device to use for testing."""
-        if cls.connection_type == "tracker":
-            device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000)
-        elif cls.connection_type == "remote":
-            device = rpc.connect(cls.host, cls.port)
-        elif cls.connection_type == "local":
-            device = rpc.LocalSession()
-        else:
-            raise ValueError(
-                "connection_type in test_config.json should be one of: " "local, tracker, remote."
-            )
-
-        return device
-
-    @classmethod
-    def load(cls, file_name):
-        """Load test config
-
-        Load the test configuration by looking for file_name relative
-        to the test_bnns directory.
-        """
-        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-        config_file = os.path.join(location, file_name)
-        if not os.path.exists(config_file):
-            warnings.warn("Config file doesn't exist, resuming tests with default config.")
-            return
-        with open(config_file, mode="r") as config:
-            test_config = json.load(config)
-
-        cls.connection_type = test_config["connection_type"]
-        cls.host = test_config["host"]
-        cls.port = test_config["port"]
-        cls.target = test_config["target"]
-        cls.device_key = test_config.get("device_key") or ""
-        cls.cross_compile = test_config.get("cross_compile") or ""
-
-
-Device.target = "llvm"
-
-
-def skip_runtime_test():
-    """Skip test if it requires the runtime and it's not present."""
-    # BNNS codegen not present.
-    if not tvm.get_global_func("relay.ext.bnns", True):
-        print("Skip because BNNS codegen is not available.")
-        return True
-    return False
-
-
-def skip_codegen_test():
-    """Skip test if it requires the BNNS codegen and it's not present."""
-    if not tvm.get_global_func("relay.ext.bnns", True):
-        print("Skip because BNNS codegen is not available.")
-        return True
-
-
-def build_module(mod, target, params=None, enable_bnns=True, tvm_ops=0):
-    """Build module with option to build for BNNS."""
-    if isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3):
-        if enable_bnns:
-            mod = partition_for_bnns(mod)
-        relay.backend.te_compiler.get().clear()
-        return relay.build(mod, target=target, params=params)
-
-
-def build_and_run(
-    mod,
-    inputs,
-    outputs,
-    params,
-    device,
-    enable_bnns=True,
-    no_runs=1,
-    tvm_ops=0,
-    config=None,
-):
-    """Build and run the relay module."""
-    if config is None:
-        config = {}
-
-    try:
-        lib = build_module(mod, device.target, params, enable_bnns, tvm_ops)
-    except Exception as e:
-        err_msg = "The module could not be built.\n"
-        if config:
-            err_msg += f"The test failed with the following parameters: {config}\n"
-        err_msg += str(e)
-        raise Exception(err_msg)
-
-    lib = update_lib(lib, device.device, device.cross_compile)
-    gen_module = graph_executor.GraphModule(lib["default"](device.device.cpu(0)))
-    gen_module.set_input(**inputs)
-    out = []
-    for _ in range(no_runs):
-        gen_module.run()
-        out.append([gen_module.get_output(i) for i in range(outputs)])
-    return out
-
-
-def update_lib(lib, device, cross_compile):
-    """Export the library to the remote/local device."""
-    lib_name = "mod.so"
-    temp = utils.tempdir()
-    lib_path = temp.relpath(lib_name)
-    if cross_compile:
-        lib.export_library(lib_path, cc=cross_compile)
-    else:
-        lib.export_library(lib_path)
-    device.upload(lib_path)
-    lib = device.load_module(lib_name)
-    return lib
-
-
-def extract_bnns_modules(module):
-    """Get the BNNS module(s) from llvm module."""
-    return list(filter(lambda mod: mod.type_key == "bnns_json", module.get_lib().imported_modules))
-
-
-def verify(answers, atol, rtol, verify_saturation=False, config=None):
-    """Compare the array of answers. Each entry is a list of outputs."""
-    if config is None:
-        config = {}
-
-    if len(answers) < 2:
-        raise RuntimeError(f"No results to compare: expected at least two, found {len(answers)}")
-    for answer in zip_longest(*answers):
-        for outs in combinations(answer, 2):
-            try:
-                if verify_saturation:
-                    assert (
-                        np.count_nonzero(outs[0].numpy() == 255) < 0.25 * outs[0].numpy().size
-                    ), "Output is saturated: {}".format(outs[0])
-                    assert (
-                        np.count_nonzero(outs[0].numpy() == 0) < 0.25 * outs[0].numpy().size
-                    ), "Output is saturated: {}".format(outs[0])
-                tvm.testing.assert_allclose(outs[0].numpy(), outs[1].numpy(), rtol=rtol, atol=atol)
-            except AssertionError as e:
-                err_msg = "Results not within the acceptable tolerance.\n"
-                if config:
-                    err_msg += f"The test failed with the following parameters: {config}\n"
-                err_msg += str(e)
-                raise AssertionError(err_msg)
-
-
-def verify_codegen(
-    module,
-    known_good_codegen,
-    num_bnns_modules,
-    tvm_ops=0,
-    target=Device.target,
-):
-    """Check BNNS codegen against a known good output."""
-    module = build_module(module, target, tvm_ops=tvm_ops)
-    bnns_modules = extract_bnns_modules(module)
-
-    assert len(bnns_modules) == num_bnns_modules, (
-        f"The number of BNNS modules produced ({len(bnns_modules)}) does not "
-        f"match the expected value ({num_bnns_modules})."
-    )
-
-    for mod in bnns_modules:
-        source = mod.get_source("json")
-        codegen = json.loads(source)["nodes"]
-        # remove input and const names as these cannot be predetermined
-        for node in range(len(codegen)):
-            if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
-                codegen[node]["name"] = ""
-        codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
-        known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2)
-
-        assert codegen_str == known_good_codegen_str, (
-            f"The JSON produced by codegen does not match the expected result. \n"
-            f"Actual={codegen_str} \n"
-            f"Expected={known_good_codegen_str}"
-        )
-
-
-def compare_inference_with_ref(func, params, atol=0.002, rtol=0.007):
-    """Compare scoring results for compilation with and without BNNS.
-
-    Provided function will be compiled two times with and without BNNS.
-    The scoring results for both type of compilation will be compared
-    with provided atol and rtol. The input data will be automatically
-    generated based of shape and dtype info provided for var nodes.
-
-    """
-    # Generate input tensor values
-    inputs = {}
-    for free_param in analysis.free_vars(func):
-        name = free_param.name_hint
-        dtype = free_param.type_annotation.dtype
-        shape = [s.value for s in free_param.type_annotation.shape]
-        inputs[name] = tvm.nd.array(np.random.uniform(0, 127, shape).astype(dtype))
-
-    # Run for both type of compilation
-    device = Device()
-    outputs = []
-    for bnns in [False, True]:
-        outputs.append(build_and_run(func, inputs, 1, params, device, enable_bnns=bnns)[0])
-
-    # Compare result tensors
-    verify(outputs, atol=atol, rtol=rtol)
-
-
-def generate_trials(space, r_factor=3):
-    """Generates a series of trials.
-
-    This algorithm generates a series of non-deterministic trials given a
-    space of options to test. A trial is generated by pulling a value from
-    each option in the space. On some occasions the values are shuffled to
-    ensure a different trial on each r_factor iteration. The algorithm ensures
-    that each value from an option is used at least once. The total number of
-    trials is determined by the r_factor * the option with the largest number
-    of values.
-
-    Parameters
-    ----------
-    space: List[List[Any]]
-        A list of different options with varying values to test.
-    r_factor: Optional[int]
-        The repeat factor.
-
-    Returns
-    -------
-    result: List[Tuple]
-        A list of trials specifying values for each option.
-
-    """
-    np.random.seed(0)
-    max_len = 1
-    for option in space:
-        max_len = max(max_len, len(option))
-
-    num_trials = r_factor * max_len
-    trials = []
-    for i in range(num_trials):
-        trial = []
-        for option in space:
-            if i % len(option) == 0:
-                np.random.shuffle(option)
-            trial.append(option[i % len(option)])
-
-        trials.append(trial)
-
-    return trials
diff --git a/tests/python/contrib/test_bnns/test_conv2d.py b/tests/python/contrib/test_bnns/test_conv2d.py
deleted file mode 100644
index 886958cf3076..000000000000
--- a/tests/python/contrib/test_bnns/test_conv2d.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS integration conv2d tests."""
-
-import numpy as np
-import pytest
-import tvm
-from tvm import relay
-
-from .infrastructure import skip_runtime_test, compare_inference_with_ref, generate_trials
-
-# TODO: Missed cases
-#   1. Bias as add with 3d const tensor. Lead to additional unsqueeze op between
-#   2. Check unsupported cases of fusion. Like bias add with axis != 1, add with broadcast by spatial dims
-#   3. Check if bias/weights is not constants. Should fallback into LLVM or decompose it
-#   4. Check if bias/weights is constants expr. Should works somehow.
-
-
-def _get_model(
-    shape,
-    kernel=(3, 3),
-    padding=(1, 1),
-    strides=(1, 1),
-    dilation=(1, 1),
-    groups=1,
-    dtype="float32",
-    channels=-1,  # -1 means same as input channels
-    bias_type="none",
-    activation_type="none",
-):
-    """Return a model and any parameters it may have"""
-    if channels == -1:
-        channels = shape[1]
-
-    a = relay.var("a", shape=shape, dtype=dtype)
-    weight_shape = (channels, shape[1] // groups, *kernel)
-    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.nn.conv2d(
-        a,
-        weights,
-        kernel_size=kernel,
-        dilation=dilation,
-        strides=strides,
-        padding=padding,
-        groups=groups,
-        channels=channels,
-        out_dtype=dtype,
-    )
-    params = {"w": w}
-    if bias_type == "bias_add":
-        b = tvm.nd.array(np.random.uniform(-10, 10, weight_shape[0]).astype(dtype))
-        biasc = relay.const(b, dtype)
-        out = relay.nn.bias_add(out, biasc, axis=1)
-        params["b"] = b
-    elif bias_type == "add_3d" or bias_type == "add_4d":
-        bias_shape = (
-            (weight_shape[0], 1, 1) if bias_type == "add_3d" else (1, weight_shape[0], 1, 1)
-        )
-        b = tvm.nd.array(np.random.uniform(-10, 10, bias_shape).astype(dtype))
-        biasc = relay.const(b, dtype)
-        out = relay.add(out, biasc)
-        params["b"] = b
-
-    if activation_type == "relu":
-        out = relay.nn.relu(out)
-    elif activation_type == "sigmoid":
-        out = relay.op.sigmoid(out)
-    return out, params
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_conv2d():
-    np.random.seed(0)
-
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [1, 4, 8, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    batches = [1, 2]
-    groups = [1, 2]
-    bias_kind = ["none", "add_3d", "add_4d", "bias.add"]
-    activation_kind = ["none", "relu", "sigmoid"]
-    trials = generate_trials(
-        [
-            kernel_hs,
-            kernel_ws,
-            pad,
-            strides,
-            dilation,
-            out_channels,
-            input_shapes,
-            groups,
-            batches,
-            bias_kind,
-            activation_kind,
-        ],
-        3,
-    )
-
-    for (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        input_shapes,
-        group,
-        batch,
-        bias,
-        activation,
-    ) in trials:
-        if out_channels % group != 0:
-            continue
-        func, params = _get_model(
-            shape=(batch, *input_shapes),
-            kernel=(kernel_h, kernel_w),
-            padding=pad,
-            strides=stride,
-            dilation=dilation,
-            groups=group,
-            channels=out_channels,
-            bias_type=bias,
-            activation_type=activation,
-        )
-        compare_inference_with_ref(func, params)
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_conv2d_dw():
-    if skip_runtime_test():
-        return
-
-    np.random.seed(0)
-    shape = [4, 5, 5]
-
-    for batch in [1, 2]:
-        mod, params = _get_model(shape=(batch, *shape), groups=shape[0])
-        compare_inference_with_ref(mod, params)
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_conv2d_with_oc1():
-    if skip_runtime_test():
-        return
-
-    np.random.seed(0)
-    shape = [3, 5, 5]
-
-    for batch in [1, 2]:
-        for bias in ["none", "add_4d"]:
-            mod, params = _get_model(shape=(batch, *shape), channels=1, bias_type=bias)
-            compare_inference_with_ref(mod, params)
-
-
-if __name__ == "__main__":
-    test_conv2d()
-    test_conv2d_dw()
-    test_conv2d_with_oc1()
diff --git a/tests/python/contrib/test_bnns/test_conv2d_patterns.py b/tests/python/contrib/test_bnns/test_conv2d_patterns.py
deleted file mode 100644
index 5fc9e9522fbd..000000000000
--- a/tests/python/contrib/test_bnns/test_conv2d_patterns.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS pattern detection check"""
-
-import tvm
-from tvm import relay
-import numpy as np
-
-from tvm.relay.op.contrib.bnns import partition_for_bnns
-
-fp32 = "float32"
-
-
-def partition(exp):
-    """Apply BNNS specific partitioning transformation"""
-    mod = tvm.IRModule.from_expr(exp)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = partition_for_bnns(mod)
-    return mod
-
-
-def is_op_fused(func, op_name):
-    is_fused = False
-
-    def visit(op):
-        if (
-            isinstance(op, tvm.relay.function.Function)
-            and op_name in op.attrs["PartitionedFromPattern"]
-        ):
-            nonlocal is_fused
-            is_fused = True
-
-    tvm.relay.analysis.post_order_visit(func.body, visit)
-    return is_fused
-
-
-def test_pattern_conv2d_with_bias_add():
-    for axis in (1, 2):
-        a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32)
-        w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
-        res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
-        b = relay.const(np.random.uniform(-10, 10, 8).astype(fp32))
-        res = relay.nn.bias_add(res, b, axis=axis)
-
-        mod = partition(res)
-        bias_is_fused = is_op_fused(mod["tvmgen_default_bnns_main_0"], "nn.bias_add")
-
-        assert bias_is_fused if axis == 1 else not bias_is_fused
-
-
-def test_pattern_conv2d_with_add():
-    workloads = {8: False, (8, 1): False, (8, 1, 1): True, (1, 8, 1, 1): True}
-
-    for b_shape, should_be_fused in workloads.items():
-        a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32)
-        w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
-        res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
-        b = relay.const(np.random.uniform(-10, 10, b_shape).astype(fp32))
-        res = relay.add(res, b)
-
-        mod = partition(res)
-        bias_is_fused = is_op_fused(mod["tvmgen_default_bnns_main_0"], "add")
-
-        assert bias_is_fused == should_be_fused
-
-
-def test_pattern_conv2d_with_non_cons_weights():
-    for const_weights in (True, False):
-        a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32)
-        if const_weights:
-            w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
-        else:
-            w = relay.var("w", shape=(8, 7, 3, 3), dtype=fp32)
-
-        res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
-
-        mod = partition(res)
-        use_bnns = len(mod.get_global_vars()) == 2  # GlobalVar: "main" and "bnns_0"
-
-        assert use_bnns == const_weights
-
-
-def test_pattern_conv2d_with_non_cons_bias():
-    a = relay.var("a", shape=[2, 7, 8, 8], dtype=fp32)
-    w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
-    res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
-    b = relay.var("b", shape=[8], dtype=fp32)
-    res = relay.nn.bias_add(res, b, axis=1)
-
-    mod = partition(res)
-    bias_is_fused = is_op_fused(mod["tvmgen_default_bnns_main_0"], "nn.bias_add")
-
-    assert not bias_is_fused
diff --git a/tests/python/contrib/test_bnns/test_dense.py b/tests/python/contrib/test_bnns/test_dense.py
deleted file mode 100644
index c2cf9bf71373..000000000000
--- a/tests/python/contrib/test_bnns/test_dense.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS integration dense tests."""
-
-import numpy as np
-import math
-import pytest
-import tvm
-from tvm import relay
-from .infrastructure import (
-    Device,
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-    generate_trials,
-)
-
-
-def _get_model(shape, weight_shape, units, dtype, var_names, has_bias=False, has_gelu=False):
-    """Return a model and any parameters it may have"""
-    a = relay.var(next(var_names), shape=shape, dtype=dtype)
-    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.nn.dense(a, weights, units=units, out_dtype=dtype)
-    params = {"w": w}
-    if has_bias:
-        b = tvm.nd.array(np.random.randint(-128, 127, weight_shape[0]).astype(dtype))
-        biasc = relay.const(b, dtype)
-        out = relay.op.add(out, biasc)
-        params["b"] = b
-    if has_gelu:
-        const1 = relay.const(0.044715)
-        const2 = relay.const(math.sqrt(2 / math.pi))
-        bias = out
-        out = relay.op.power(bias, relay.const(3.0, "float32"))
-        out = relay.op.multiply(out, const1)
-        out = relay.op.add(out, bias)
-        out = relay.op.multiply(out, const2)
-        out = relay.op.tanh(out)
-        out = relay.op.add(out, relay.const(1, "float32"))
-        out = relay.op.multiply(out, relay.const(0.5))
-        out = relay.op.multiply(out, bias)
-    return out, params
-
-
-def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False, has_gelu=False):
-    output_shape = (shape[0], units)
-    name = "nn.dense"
-    if has_bias is True:
-        name = "bnns.dense_bias"
-    if has_bias is True and has_gelu is True:
-        name = "bnns.dense_bias_gelu"
-
-    node = {
-        "op": "kernel",
-        "name": name,
-        "inputs": [],
-        "attrs": {
-            "num_outputs": "1",
-            "out_dtype": [["float32"]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "units": [[str(units)]],
-        },
-    }
-
-    inputs = [
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]},
-        },
-    ]
-
-    if has_bias:
-        inputs.append(
-            {
-                "op": "const",
-                "name": "",
-                "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [["float32"]]},
-            }
-        )
-
-    input_idx = 0
-    for _ in range(len(inputs)):
-        node["inputs"].append([input_idx, 0, 0])
-        input_idx += 1
-    node["attrs"]["num_inputs"] = str(len(inputs))
-    inputs.append(node)
-    return inputs
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_dense():
-    device = Device()
-    np.random.seed(0)
-
-    dtype = ["float32"]
-    shape = [
-        ((1, 128), (16, 128), 16),
-        ((32, 32), (32, 32), 32),
-        ((1, 64), (1, 64), 1),
-        ((11, 2), (2, 2), 2),
-        ((2, 2), (1, 2), 1),
-    ]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite, composite], 3)
-
-    for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials:
-        outputs = []
-        inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
-        func, params = _get_model(
-            shape,
-            weight_shape,
-            units,
-            dtype,
-            var_names=iter(inputs),
-            has_bias=with_bias,
-            has_gelu=with_gelu,
-        )
-        for bnns in [False, True]:
-            outputs.append(
-                build_and_run(
-                    func,
-                    inputs,
-                    1,
-                    params,
-                    device,
-                    enable_bnns=bnns,
-                )[0]
-            )
-
-        config = {
-            "shape": shape,
-            "weight_shape": weight_shape,
-            "units": units,
-            "dtype": dtype,
-            "with_bias": with_bias,
-            "with_gelu": with_gelu,
-        }
-        verify(outputs, atol=0.001, rtol=0.01, config=config)
-
-
-@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
-def test_codegen_dense():
-    np.random.seed(0)
-
-    dtype = ["float32"]
-    shape = [
-        ((1, 128), (16, 128), 16),
-        ((32, 32), (32, 32), 32),
-        ((1, 64), (1, 64), 1),
-        ((11, 2), (2, 2), 2),
-        ((2, 2), (1, 2), 1),
-    ]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite, composite], 3)
-
-    for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials:
-        inputs = {"a"}
-
-        args = (shape, weight_shape, units, dtype)
-
-        func, params = _get_model(
-            *args, var_names=iter(inputs), has_bias=with_bias, has_gelu=with_gelu
-        )
-        exp_codegen = _get_expected_codegen(*args, has_bias=with_bias, has_gelu=with_gelu)
-        verify_codegen(func, exp_codegen, 1)
-
-
-if __name__ == "__main__":
-    test_dense()
-    test_codegen_dense()
diff --git a/tests/python/contrib/test_bnns/test_matmul.py b/tests/python/contrib/test_bnns/test_matmul.py
deleted file mode 100644
index 7bf4d48f8e88..000000000000
--- a/tests/python/contrib/test_bnns/test_matmul.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS integration dense tests."""
-
-import numpy as np
-import math
-import pytest
-import tvm
-from tvm import relay
-from tvm import testing
-from .infrastructure import (
-    Device,
-    skip_runtime_test,
-    skip_codegen_test,
-    verify_codegen,
-    build_and_run,
-    verify,
-    generate_trials,
-)
-
-
-def _get_model(a_shape, b_shape, dtype, var_names, is_a_constant=False, is_b_constant=False):
-    """Return a model and any parameters it may have"""
-    a = relay.var(next(var_names), shape=a_shape, dtype=dtype)
-    b = relay.var(next(var_names), shape=b_shape, dtype=dtype)
-    params = {}
-    if is_b_constant is True:
-        b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype))
-        params["b"] = b
-        b = relay.const(b, dtype)
-    if is_a_constant is True:
-        a = tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype))
-        params["a"] = a
-        a = relay.const(a, dtype)
-    out = relay.nn.batch_matmul(a, b)
-    return out, params
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_matmul():
-    device = Device()
-    np.random.seed(0)
-    dtype = "float32"
-
-    # C[N, I, J] = A[N, I, K] * B[N, J, K]
-    shapes_config = [
-        # B, I, J, K
-        [1, 4, 4, 3],
-        [1, 16, 32, 32],
-        [2, 1, 1, 3],
-        [2, 16, 32, 32],
-        [5, 1, 1, 3],
-    ]
-    data_config = [
-        # A_is_constant, B_is_constant
-        [False, True],
-        [True, False],
-        [False, False],
-    ]
-
-    for N, I, J, K in shapes_config:
-        a_shape = [N, I, K]
-        b_shape = [N, J, K]
-        for is_a_constant, is_b_constant in data_config:
-            outputs = []
-            inputs = {
-                "a": tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype)),
-                "b": tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)),
-            }
-            func, params = _get_model(
-                a_shape,
-                b_shape,
-                dtype,
-                var_names=iter(inputs),
-                is_a_constant=is_a_constant,
-                is_b_constant=is_b_constant,
-            )
-            for enable_bnns in [False, True]:
-                outputs.append(
-                    build_and_run(
-                        func,
-                        inputs,
-                        1,
-                        params,
-                        device,
-                        enable_bnns=enable_bnns,
-                    )[0]
-                )
-
-            config = {
-                "a_shape": a_shape,
-                "b_shape": b_shape,
-                "dtype": dtype,
-            }
-            verify(outputs, atol=0.001, rtol=0.01, config=config)
-
-
-if __name__ == "__main__":
-    test_matmul()
diff --git a/tests/python/contrib/test_bnns/test_normalization.py b/tests/python/contrib/test_bnns/test_normalization.py
deleted file mode 100644
index 094cfb041c3c..000000000000
--- a/tests/python/contrib/test_bnns/test_normalization.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS integration normalization tests."""
-
-import numpy as np
-import math
-import pytest
-import tvm
-from tvm import relay
-from tvm import testing
-from .infrastructure import (
-    Device,
-    skip_runtime_test,
-    skip_codegen_test,
-    verify_codegen,
-    build_and_run,
-    verify,
-    generate_trials,
-)
-
-
-def _get_model(
-    shape, b_shape, s_shape, dtype, var_names, axis=1, epsilon=1e-5, center=True, scale=True
-):
-    """Return a model and any parameters it may have"""
-    src = relay.var(next(var_names), shape=shape, dtype=dtype)
-    params = {}
-    b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype))
-    params["b"] = b
-    b = relay.const(b, dtype)
-    s = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype))
-    params["b"] = s
-    s = relay.const(s, dtype)
-    out = relay.nn.instance_norm(src, s, b, axis, epsilon, center, scale)
-
-    return out, params
-
-
-def _get_expected_codegen(shape, axis, center, scale, dtype, offload_on_bnns):
-    output_shape = shape
-    name = "nn.instance_norm"
-
-    node = {
-        "op": "kernel",
-        "name": name,
-        "inputs": [],
-        "attrs": {
-            "num_outputs": "1",
-            "axis": [[str(axis)]],
-            "center": [[str(int(center))]],
-            "scale": [[str(int(scale))]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "epsilon": [["1.0000000000000001e-05"]],
-        },
-    }
-
-    inputs = [
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]},
-        },
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]},
-        },
-    ]
-
-    input_idx = 0
-    for _ in range(len(inputs)):
-        node["inputs"].append([input_idx, 0, 0])
-        input_idx += 1
-    node["attrs"]["num_inputs"] = str(len(inputs))
-    inputs.append(node)
-    return inputs
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_normalization():
-    device = Device()
-    np.random.seed(0)
-    dtype = "float32"
-
-    shapes_config = [
-        [1, 2, 3, 4],
-        [3, 2, 3, 4],
-        [2, 2, 3],
-        [16, 32, 32],
-        [5, 3],
-    ]
-    axes = [-1, 0, 1, 2]
-
-    for shape in shapes_config:
-        for axis in axes:
-            if len(shape) == 2 and axis != 0:
-                continue
-            for center in [False, True]:
-                for scale in [False, True]:
-                    outputs = []
-                    inputs = {
-                        "src": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)),
-                    }
-                    func, params = _get_model(
-                        shape,
-                        [shape[axis]],
-                        [shape[axis]],
-                        dtype,
-                        var_names=iter(inputs),
-                        axis=axis,
-                        center=center,
-                        scale=scale,
-                    )
-                    for enable_bnns in [False, True]:
-                        outputs.append(
-                            build_and_run(
-                                func,
-                                inputs,
-                                1,
-                                params,
-                                device,
-                                enable_bnns=enable_bnns,
-                            )[0]
-                        )
-
-                    config = {
-                        "dtype": dtype,
-                    }
-                    verify(outputs, atol=0.001, rtol=0.01, config=config)
-
-
-@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
-def test_codegen_normalization():
-    np.random.seed(0)
-
-    dtype = "float32"
-    shapes_config = [
-        [1, 2, 3, 4],
-        [3, 2, 3, 4],
-        [2, 2, 3],
-        [16, 32, 32],
-        [5, 3],
-    ]
-    axes = [-1, 0, 1, 2]
-
-    def check_normalization(rank, axis):
-        if rank < 3 or rank > 4:
-            return False
-        if axis == 0 and rank == 3 or axis == 1 and rank == 4:
-            return True
-        return False
-
-    for shape in shapes_config:
-        for axis in axes:
-            if len(shape) == 2 and axis != 0:
-                continue
-            for center in [False, True]:
-                for scale in [False, True]:
-                    inputs = {"src"}
-
-                    args = (shape, axis, center, scale, dtype)
-
-                    func, params = _get_model(
-                        shape,
-                        [shape[axis]],
-                        [shape[axis]],
-                        dtype,
-                        var_names=iter(inputs),
-                        axis=axis,
-                        center=center,
-                        scale=scale,
-                    )
-
-                    offload_on_bnns = check_normalization(len(shape), axis)
-                    if offload_on_bnns is True:
-                        bnns_blocks = 1
-                    else:
-                        bnns_blocks = 0
-                    exp_codegen = _get_expected_codegen(*args, offload_on_bnns)
-                    verify_codegen(func, exp_codegen, bnns_blocks)
-
-
-if __name__ == "__main__":
-    test_normalization()
-    test_codegen_normalization()
diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py
deleted file mode 100644
index 447f48355620..000000000000
--- a/tests/python/contrib/test_bnns/test_onnx_topologies.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS pattern detection check"""
-
-import pytest
-
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.contrib import utils, graph_executor
-from tvm.contrib.download import download_testdata
-from tvm.relay.op.contrib.bnns import partition_for_bnns
-
-import numpy as np
-
-pytest.importorskip("onnx")
-
-bnns_is_absent = tvm.get_global_func("relay.ext.bnns", True) is None
-
-TARGET = "llvm"
-INPUT_SHAPE = [1, 3, 224, 224]
-
-BASE_MODEL_URL = "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/"
-MODEL_URL_COLLECTION = {
-    "BERT": "text/machine_comprehension/bert-squad/model/bertsquad-10.onnx",
-    "MobileNet-v2": "vision/classification/mobilenet/model/mobilenetv2-7.onnx",
-    "ResNet50-v1": "vision/classification/resnet/model/resnet50-v1-7.onnx",
-    "ResNet50-v2": "vision/classification/resnet/model/resnet50-v2-7.onnx",
-    "SqueezeNet-v1.1": "vision/classification/squeezenet/model/squeezenet1.1-7.onnx",
-    "SqueezeNet-v1.0": "vision/classification/squeezenet/model/squeezenet1.0-7.onnx",
-    "Inception-v1": "vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx",
-    "Inception-v2": "vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx",
-}
-
-
-def get_onnx_input_name(model):
-    inputs = [node.name for node in model.graph.input]
-    initializer = [node.name for node in model.graph.initializer]
-
-    inputs = list(set(inputs) - set(initializer))
-    return inputs
-
-
-def get_model_url(model_name):
-    return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]
-
-
-def get_name_from_url(url):
-    return url[url.rfind("/") + 1 :].strip()
-
-
-def find_of_download(model_name):
-    model_url = get_model_url(model_name)
-    model_file_name = get_name_from_url(model_url)
-    return download_testdata(model_url, model_file_name, module="models")
-
-
-def get_model(model_name):
-    model_path = find_of_download(model_name)
-    onnx_model = onnx.load(model_path)
-    input_names = get_onnx_input_name(onnx_model)
-    input_dict = {}
-    for name in input_names:
-        input_dict[name] = INPUT_SHAPE  # TODO: hardcode
-    mod, params = relay.frontend.from_onnx(onnx_model, input_dict, freeze_params=True)
-    return mod, params, input_dict
-
-
-def simplify_model(mod):
-    """
-    Simplify execution graph
-
-    At least merge BatchNorm into convolution. For this purpose decompose BN primitive
-    into simple operation which can be calculated as const expr and after that merged
-    into nearest conv/dense primitive.
-    """
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.FoldConstant(),
-            transform.SimplifyInference(),
-            transform.FoldScaleAxis(),
-        ]
-    )
-    return seq(mod)
-
-
-def process(model_name):
-    temp = utils.tempdir()
-    model, params, input_dict = get_model(model_name)
-
-    def run(mod, target, simplify=True, with_bnns=False):
-        with tvm.transform.PassContext(opt_level=3):
-            if simplify:
-                mod = simplify_model(mod)
-            if with_bnns:
-                mod = partition_for_bnns(mod)
-            graph_module = relay.build(mod, target=target, params=params)
-
-        lib_name = "deploy.tar"
-        path_dso = temp.relpath(lib_name)
-        graph_module.export_library(path_dso)
-
-        dev = tvm.cpu(0)
-        loaded_lib = tvm.runtime.load_module(path_dso)
-
-        module = graph_executor.GraphModule(loaded_lib["default"](dev))
-        module.run()
-        return module.get_output(0).numpy()
-
-    res_llvm = run(model, TARGET, simplify=True, with_bnns=False)
-    res_bnns = run(model, TARGET, simplify=True, with_bnns=True)
-
-    tvm.testing.assert_allclose(
-        res_llvm,
-        res_bnns,
-        atol=0.002,
-        rtol=0.007,
-    )
-
-
-@pytest.mark.skip(reason="Manually disabled because of huge complexity")
-@pytest.mark.skipif(bnns_is_absent, reason="BNNS runtime is absent")
-@pytest.mark.parametrize("model_name", MODEL_URL_COLLECTION.keys())
-def test_topology(model_name):
-    process(model_name)
diff --git a/tests/python/contrib/test_bnns/test_pooling.py b/tests/python/contrib/test_bnns/test_pooling.py
deleted file mode 100644
index 77a78d4bf7e1..000000000000
--- a/tests/python/contrib/test_bnns/test_pooling.py
+++ /dev/null
@@ -1,289 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BNNS integration pooling tests."""
-
-import numpy as np
-import pytest
-import tvm
-from tvm import relay
-from tvm import testing
-from .infrastructure import (
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    verify,
-    verify_codegen,
-)
-from .infrastructure import Device
-
-
-def _calculate_output_shape(shape, sizes, padding, strides):
-    """Calculate pooling output shape."""
-    output_height = ((shape[2] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1
-    output_width = ((shape[3] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1
-    return 1, shape[1], int(output_height), int(output_width)
-
-
-def _get_pooling_model(
-    shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad, var_names
-):
-    """Return a model and any parameters it may have."""
-    if len(padding) == 2:
-        padding = (padding[0], padding[1], padding[0], padding[1])
-    out = relay.var(next(var_names), shape=shape, dtype=dtype)
-
-    if typef == "nn.max_pool2d":
-        out = relay.nn.max_pool2d(
-            out,
-            pool_size=sizes,
-            strides=strides,
-            padding=padding,
-            ceil_mode=ceil_mode,
-        )
-    elif typef == "nn.avg_pool2d":
-        out = relay.nn.avg_pool2d(
-            out,
-            pool_size=sizes,
-            strides=strides,
-            padding=padding,
-            ceil_mode=ceil_mode,
-            count_include_pad=count_include_pad,
-        )
-    else:
-        raise ValueError("Function not supported")
-
-    return out
-
-
-def _get_global_pooling_model(shape, dtype, typef, var_names):
-    """Return a model and any parameters it may have."""
-    out = relay.var(next(var_names), shape=shape, dtype=dtype)
-
-    if typef == "nn.global_max_pool2d":
-        out = relay.nn.global_max_pool2d(out)
-    elif typef == "nn.global_avg_pool2d":
-        out = relay.nn.global_avg_pool2d(out)
-    else:
-        raise ValueError("Function not supported")
-
-    return out
-
-
-def _get_expected_pooling_codegen(
-    shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad
-):
-    if len(padding) == 2:
-        padding = (padding[0], padding[1], padding[0], padding[1])
-    output_shape = _calculate_output_shape(shape, sizes, padding, strides)
-
-    node = {
-        "op": "kernel",
-        "name": typef,
-        "inputs": [[0, 0, 0]],
-        "attrs": {
-            "num_inputs": "1",
-            "num_outputs": "1",
-            "layout": [["NCHW"]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "padding": [[str(p) for p in padding]],
-            "strides": [[str(s) for s in strides]],
-            "pool_size": [[str(s) for s in sizes]],
-            "ceil_mode": [[str(1 if ceil_mode else 0)]],
-        },
-    }
-
-    if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d":
-        node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]]
-
-    input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
-    return [input, node]
-
-
-def _get_expected_global_pooling_codegen(shape, dtype, typef):
-    node = {
-        "op": "kernel",
-        "name": typef,
-        "inputs": [[0, 0, 0]],
-        "attrs": {
-            "num_inputs": "1",
-            "num_outputs": "1",
-            "layout": [["NCHW"]],
-            "shape": [[[1, shape[1], 1, 1]]],
-            "dtype": [[dtype]],
-        },
-    }
-
-    input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
-    return [input, node]
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_pooling():
-    device = Device()
-    np.random.seed(0)
-
-    dtype = "float32"
-    trials = [
-        ["nn.max_pool2d", (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)],
-        ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
-        ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)],
-        ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)],
-        ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)],
-        ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
-        ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)],
-    ]
-
-    for (
-        typef,
-        size,
-        stride,
-        pad,
-        ceil_mode,
-        count_include_pad,
-        input_shape,
-    ) in trials:
-        shape = (1, *input_shape)
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)),
-        }
-
-        func = _get_pooling_model(
-            shape, dtype, typef, size, stride, pad, ceil_mode, count_include_pad, iter(inputs)
-        )
-
-        config = {
-            "size": size,
-            "stride": stride,
-            "shape": shape,
-            "pooling type": typef,
-            "dtype": dtype,
-            "padding": pad,
-            "ceil_mode": ceil_mode,
-            "count_include_pad": count_include_pad,
-            "inputs": inputs,
-        }
-
-        params = None
-        for enable_bnns in [False, True]:
-            outputs.append(
-                build_and_run(
-                    func, inputs, 1, params, device, enable_bnns=enable_bnns, config=config
-                )[0]
-            )
-
-        verify(outputs, atol=0.001, rtol=0.001, config=config)
-
-
-@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
-def test_global_pooling():
-    device = Device()
-    np.random.seed(0)
-
-    dtype = "float32"
-
-    trials = [
-        ["nn.global_max_pool2d", (8, 8, 16)],
-        ["nn.global_max_pool2d", (9, 9, 16)],
-        ["nn.global_max_pool2d", (8, 8, 16)],
-        ["nn.global_avg_pool2d", (8, 8, 16)],
-        ["nn.global_avg_pool2d", (8, 8, 16)],
-        ["nn.global_avg_pool2d", (9, 9, 16)],
-    ]
-
-    for typef, input_shape in trials:
-        shape = (1, *input_shape)
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)),
-        }
-
-        func = _get_global_pooling_model(shape, dtype, typef, iter(inputs))
-        config = {
-            "shape": shape,
-            "pooling type": typef,
-            "dtype": dtype,
-        }
-
-        for enable_bnns in [False, True]:
-            outputs.append(
-                build_and_run(
-                    func, inputs, 1, None, device, enable_bnns=enable_bnns, config=config
-                )[0]
-            )
-
-        verify(outputs, atol=0.001, rtol=0.001, config=config)
-
-
-@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
-def test_codegen_pooling():
-    dtype = "float32"
-
-    trials = [
-        ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
-        ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)],
-        ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)],
-        ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)],
-        ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
-        ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)],
-    ]
-
-    for (
-        typef,
-        size,
-        stride,
-        pad,
-        ceil_mode,
-        count_include_pad,
-        input_shape,
-    ) in trials:
-        shape = (1, *input_shape)
-        inputs = {"a"}
-        args = (shape, dtype, typef, size, stride, pad, False, False)
-        func = _get_pooling_model(*args, iter(inputs))
-        exp_codegen = _get_expected_pooling_codegen(*args)
-        verify_codegen(func, exp_codegen, 1)
-
-
-@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
-def test_codegen_global_pooling():
-    dtype = "float32"
-
-    trials = [
-        ["nn.global_max_pool2d", (8, 8, 16)],
-        ["nn.global_max_pool2d", (9, 9, 16)],
-        ["nn.global_max_pool2d", (8, 8, 16)],
-        ["nn.global_avg_pool2d", (8, 8, 16)],
-        ["nn.global_avg_pool2d", (8, 8, 16)],
-        ["nn.global_avg_pool2d", (9, 9, 16)],
-    ]
-
-    for typef, input_shape in trials:
-        shape = (1, *input_shape)
-        inputs = {"a"}
-        args = (shape, dtype, typef)
-        func = _get_global_pooling_model(*args, iter(inputs))
-        exp_codegen = _get_expected_global_pooling_codegen(*args)
-        verify_codegen(func, exp_codegen, 1)
-
-
-if __name__ == "__main__":
-    test_pooling()
-    test_global_pooling()
-    test_codegen_pooling()
-    test_codegen_global_pooling()
diff --git a/tests/python/contrib/test_clml/__init__.py b/tests/python/contrib/test_clml/__init__.py
deleted file mode 100644
index dfeb9ae5c88e..000000000000
--- a/tests/python/contrib/test_clml/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Infrastructure and tests for CLML"""
diff --git a/tests/python/contrib/test_clml/conftest.py b/tests/python/contrib/test_clml/conftest.py
deleted file mode 100644
index 6b9c91ec1067..000000000000
--- a/tests/python/contrib/test_clml/conftest.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import tvm
-from tvm import rpc
-import pytest
-
-
-@pytest.fixture(scope="session")
-def remote():
-    if (
-        "TVM_TRACKER_HOST" in os.environ
-        and "TVM_TRACKER_PORT" in os.environ
-        and "RPC_DEVICE_KEY" in os.environ
-    ):
-
-        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
-        rpc_tracker_port = int(os.environ["TVM_TRACKER_PORT"])
-        rpc_device_key = os.environ["RPC_DEVICE_KEY"]
-        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
-        remote = tracker.request(rpc_device_key, priority=0, session_timeout=600)
-        return remote
-    else:
-        return None
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
deleted file mode 100644
index b8ce236cdda9..000000000000
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from itertools import zip_longest, combinations
-import json
-import os
-import warnings
-
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm import rpc
-
-# from tvm.contrib.debugger import debug_runtime as graph_executor
-from tvm.contrib import graph_executor
-from tvm.relay.op.contrib import clml
-from tvm.contrib import utils
-from tvm import autotvm
-from tvm.autotvm.measure import request_remote
-from tvm.relay.expr_functor import ExprMutator, Call
-
-"""Utils for adreno compute/schedules"""
-
-import os
-import tvm
-import numpy as np
-from tvm import relay
-from tvm import autotvm
-from tvm import rpc
-from tvm.contrib import utils, ndk
-from tvm.relay import testing
-from tvm.relay.transform import recast
-from tvm.contrib import graph_runtime
-from tvm.runtime.vm import VirtualMachine
-import json
-
-
-NDK_CROSS_COMPILER = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
-
-
-def get_cpu_op_count(mod):
-    """Traverse graph counting ops offloaded to TVM."""
-
-    class Counter(tvm.relay.ExprVisitor):
-        def __init__(self):
-            super().__init__()
-            self.count = 0
-
-        def visit_call(self, call):
-            if isinstance(call.op, tvm.ir.Op):
-                self.count += 1
-
-            super().visit_call(call)
-
-    c = Counter()
-    c.visit(mod["main"])
-    return c.count
-
-
-def get_non_cpu_op_count(mod):
-    """Traverse graph counting ops not offloaded to TVM."""
-
-    class Counter(tvm.relay.ExprVisitor):
-        def __init__(self):
-            super().__init__()
-            self.count = 0
-
-        def visit_call(self, call):
-            if not isinstance(call.op, tvm.ir.Op):
-                self.count += 1
-
-            super().visit_call(call)
-
-    c = Counter()
-    c.visit(mod["main"])
-    return c.count
-
-
-# build module run with opencl or clml target with graph executor
-def build_and_run(
-    remote,
-    mod,
-    params1,
-    inputs,
-    target="llvm",
-    enable_clml=False,
-    stat_file=None,
-):
-    if remote is None:
-        target_host = "llvm"
-    else:
-        target_host = "llvm -mtriple=arm64-linux-android"
-
-    if isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-
-    with autotvm.apply_history_best(stat_file):
-        with tvm.transform.PassContext(opt_level=3):
-            if enable_clml:
-                mod = clml.partition_for_clml(mod, params1)
-            graph, lib, params = relay.build(
-                mod, target_host=target_host, target=target, params=params1
-            )
-
-    if remote is None:
-        ctx = tvm.opencl()
-        m = graph_runtime.create(graph, lib, ctx)
-    else:
-        temp = utils.tempdir()
-        dso_binary = "dev_lib_cl.so"
-        dso_binary_path = temp.relpath(dso_binary)
-        ctx = remote.cl(0)
-        lib.export_library(dso_binary_path, fcompile=ndk.create_shared)
-        remote.upload(dso_binary_path)
-        rlib = remote.load_module(dso_binary)
-        m = graph_runtime.create(graph, rlib, ctx)
-    m.set_input(**params)
-    m.set_input(**inputs)
-    m.run()
-    return m.get_output(0)
-
-
-# build module run with opencl or clml target with vm executor
-def build_and_run_vm(
-    remote,
-    mod,
-    params1,
-    inputs,
-    target="llvm",
-    enable_clml=False,
-    stat_file=None,
-):
-    if remote is None:
-        target_host = "llvm"
-    else:
-        target_host = "llvm -mtriple=arm64-linux-android"
-
-    target_host = tvm.target.Target(target_host)
-    target = tvm.target.Target(target, target_host)
-    if isinstance(mod, relay.Function):
-        module = tvm.IRModule({})
-        module["main"] = mod
-        mod = module
-    elif isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-
-    with autotvm.apply_history_best(stat_file):
-        with tvm.transform.PassContext(opt_level=3):
-            if enable_clml:
-                mod = clml.partition_for_clml(mod, params1)
-            vmc = relay.vm.compile(mod, target=target, params=params1)
-
-    if remote is None:
-        dev = tvm.opencl()
-        vm = VirtualMachine(vmc, dev, "naive")
-    else:
-        temp = utils.tempdir()
-        dso_binary = "dev_lib_cl.so"
-        dso_binary_path = temp.relpath(dso_binary)
-        dev = remote.cl(0)
-        vmc.mod.export_library(dso_binary_path, cc=NDK_CROSS_COMPILER)
-        remote.upload(dso_binary_path)
-        rlib = remote.load_module(dso_binary)
-        vm = VirtualMachine(rlib, dev, "naive")
-    inputs_data = {}
-    for key in inputs.keys():
-        inputs_data[key] = tvm.nd.array(inputs[key], dev)
-    for k, v in params1.items():
-        inputs_data[k] = tvm.nd.array(v, dev)
-    vm.set_input("main", **inputs_data)
-    vm.invoke_stateful("main")
-    out = vm.get_outputs()[0]
-
-    return out
-
-
-def extract_clml_modules(module):
-    """Get the CLML module(s) from llvm module."""
-    return list(filter(lambda mod: mod.type_key == "clml", module.get_lib().imported_modules))
-
-
-def verify_codegen(
-    remote,
-    mod,
-    params,
-    known_good_codegen,
-    target="llvm",
-    num_clml_modules=1,
-    tvm_ops=0,
-):
-    if remote is None:
-        target_host = "llvm"
-    else:
-        target_host = "llvm -mtriple=arm64-linux-android"
-
-    """Check clml codegen against a known good output."""
-    if isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = clml.partition_for_clml(mod, params)
-        tvm_op_count = get_cpu_op_count(mod)
-        assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
-            tvm_op_count, tvm_ops
-        )
-        partition_count = 0
-        for global_var in mod.get_global_vars():
-            if "clml" in global_var.name_hint:
-                partition_count += 1
-
-        assert (
-            num_clml_modules == partition_count
-        ), "Got {} Open CLML partitions, expected {}".format(partition_count, num_clml_modules)
-    relay.backend.te_compiler.get().clear()
-
-    module = relay.build(mod, target=target, target_host=target_host, params=params)
-    clml_modules = extract_clml_modules(module)
-    assert len(clml_modules) == num_clml_modules, (
-        f"The number of CLML modules produced ({len(clml_modules)}) does not "
-        f"match the expected value ({num_clml_modules})."
-    )
-
-    for mod in clml_modules:
-        source = mod.get_source("json")
-        codegen = json.loads(source)["nodes"]
-        # remove input and const names as these cannot be predetermined
-        for node in range(len(codegen)):
-            if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
-                codegen[node]["name"] = ""
-        codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
-        known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2)
-
-        assert codegen_str == known_good_codegen_str, (
-            f"The JSON produced by codegen does not match the expected result. \n"
-            f"Actual={codegen_str} \n"
-            f"Expected={known_good_codegen_str}"
-        )
diff --git a/tests/python/contrib/test_clml/test_adreno_collage_targets.py b/tests/python/contrib/test_clml/test_adreno_collage_targets.py
deleted file mode 100644
index 4cf86a0e058d..000000000000
--- a/tests/python/contrib/test_clml/test_adreno_collage_targets.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Compares Collage with various other baselines."""
-
-import tvm
-import logging
-import tempfile
-import os
-import shutil
-import numpy as np
-from tvm.relay import testing
-from tvm import rpc
-from tvm.contrib import utils, ndk
-from tvm.relay.build_module import bind_params_by_name
-
-# The following are necessary to force global functions or pattern tables to be registered
-from tvm.relay.collage.collage import *
-from tvm.relay.op.contrib import clml
-import pytest
-
-logging.basicConfig(level=logging.INFO)
-
-
-########### Configuration ###########
-
-###
-### TVM Opencl AutoTvm log file name
-###
-TUNING_LOG = ""
-
-###
-### If true, run all models
-###
-ALL_MODELS = False
-
-###
-### If true, run all configurations
-###
-ALL_CONFIGS = False
-
-###
-### How aggressively to look for candidates?
-###
-TVM_MAX_DEPTH = 8
-BYOC_MAX_DEPTH = 8
-
-###
-### AutoTVM tuning parameters.
-###
-AUTOTVM_NUM_TRIALS = 1024
-AUTOTVM_EARLY_STOPPING = 600
-TIMEOUT = 10
-MEASURE_NUMBER = tvm.relay.collage.MEASURE_NUMBER
-MEASURE_REPEAT = tvm.relay.collage.MEASURE_REPEAT
-WARMUP_MIN_REPEAT_MS = tvm.relay.collage.WARMUP_MIN_REPEAT_MS
-
-##
-## RPC Build configuration
-##
-HOST = tvm.target.Target("llvm -mtriple=arm64-linux-android")
-OPENCL = tvm.target.Target("opencl", HOST)
-RPC_TRACKER_HOST = os.getenv("TVM_TRACKER_HOST", "localhost")
-RPC_TRACKER_PORT = int(os.getenv("TVM_TRACKER_PORT", 9090))
-RPC_KEY = os.getenv("RPC_DEVICE_KEY", "android")
-NDK_CROSS_COMPILER = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
-
-
-########### AutoTVM tuning helpers ###########
-
-
-def extract_autotvm_tasks(mod, target):
-    """Returns TVM kernels to tune for mod and target."""
-    return tvm.autotvm.task.extract_from_program(mod, target=target, params=None)
-
-
-def optional_tuning_records(log_filename):
-    """Returns existing tuning records, if any."""
-    if log_filename == "" or not os.path.exists(log_filename):
-        return tvm.autotvm.task.FallbackContext()
-    else:
-        return tvm.autotvm.task.ApplyHistoryBest(log_filename)
-
-
-def is_already_tuned(task, log_filename):
-    """Returns True if we already have a tuning record for task in turning logs in log_filename"""
-    if not os.path.exists(log_filename):
-        return False
-
-    dispatch_context = tvm.autotvm.task.ApplyHistoryBest(log_filename)
-    return dispatch_context._query_inside(task.target, task.workload)
-
-
-def tune_autotvm_tasks(tasks, log_filename):
-    """Appends to log filename the best strategies for tasks"""
-    if len(tasks) == 0:
-        return
-
-    measure_option = tvm.autotvm.measure_option(
-        builder=tvm.autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=15),
-        runner=tvm.autotvm.RPCRunner(
-            RPC_KEY, host=RPC_TRACKER_HOST, port=RPC_TRACKER_PORT, number=100, timeout=15
-        ),
-    )
-
-    logging.info(
-        f"Using autotvm tuning for {len(tasks)} tasks with {AUTOTVM_NUM_TRIALS} trials, logging to {log_filename}"
-    )
-
-    # create tmp log file, starting with contents from existing log file
-    tmp_log_filename = log_filename + ".tmp"
-    if os.path.exists(tmp_log_filename):
-        os.remove(tmp_log_filename)
-    if os.path.exists(log_filename):
-        logging.info(f"Copying existing log {log_filename} to {tmp_log_filename}")
-        shutil.copy(log_filename, tmp_log_filename)
-
-    for i, task in enumerate(reversed(tasks)):
-        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
-        logging.info(f"Considering task {task.name} {prefix}")
-        if is_already_tuned(task, tmp_log_filename):
-            logging.info(f"Re-using existing record for {task.name}")
-            continue
-
-        logging.info(f"Using autotvm to tune {task.name}")
-        tuner_obj = tvm.autotvm.tuner.XGBTuner(task, loss_type="reg")
-        if os.path.exists(tmp_log_filename):
-            tuner_obj.load_history(tvm.autotvm.record.load_from_file(tmp_log_filename))
-
-        # do tuning
-        n_trial = min(AUTOTVM_NUM_TRIALS, len(task.config_space))
-        tuner_obj.tune(
-            n_trial=n_trial,
-            early_stopping=AUTOTVM_EARLY_STOPPING,
-            measure_option=measure_option,
-            callbacks=[
-                tvm.autotvm.callback.progress_bar(n_trial, prefix=prefix),
-                tvm.autotvm.callback.log_to_file(tmp_log_filename),
-            ],
-        )
-
-    # Pick best records and copy back to main log file
-    tvm.autotvm.record.pick_best(tmp_log_filename, log_filename)
-    os.remove(tmp_log_filename)
-
-    logging.info("Done with autotvm tuning")
-
-
-def autotvm_tune_module(mod, target, log_filename):
-    if log_filename == "":
-        logging.info("Not tuning with autotvm since disabled")
-        return
-    # Extract and tune any TVM kernels. BYOC partitions will have no tasks extracted.
-    logging.info("Extracting tasks from overall module")
-    tasks = extract_autotvm_tasks(mod, target)
-    logging.info(f"Auto-tuning {len(tasks)} tasks from overall module")
-    tune_autotvm_tasks(tasks, log_filename)
-
-
-########### Drivers ###########
-
-
-def compile_and_benchmark(label, model, targets, tmp_dir):
-    """Compile model for target and run it with profiling."""
-    logging.info(f"Compiling {model['name']} using {label} with {targets}...")
-    mod = model["mod"]
-    mod = clml.preprocess_for_clml(mod)
-    exe = tvm.relay.vm.compile(mod, target=targets, params=model["params"])
-    lib = exe.mod
-    lib_path = os.path.join(tmp_dir, "lib.so")
-    logging.info(f"Exporting library to {lib_path}...")
-    lib.export_library(lib_path, cc=NDK_CROSS_COMPILER)
-    tracker = rpc.connect_tracker(RPC_TRACKER_HOST, RPC_TRACKER_PORT)
-    remote = tracker.request(RPC_KEY, priority=0, session_timeout=600)
-    ctx = remote.cl(0)
-    remote_path = "lib.so"
-    remote.upload(lib_path, target=remote_path)
-    lib = remote.load_module(remote_path)
-    vm_factory = tvm.runtime.vm.VirtualMachine(lib, ctx)
-    args = {v.name_hint: arg_for(v.checked_type, ctx) for v in mod["main"].params}
-    logging.info(f"Benchmarking for {model['name']} generated by {label}...")
-    profile = vm_factory.benchmark(
-        ctx, repeat=MEASURE_REPEAT, number=MEASURE_NUMBER, min_repeat_ms=0, **args
-    )
-    logging.info(f"Benchmarked for {model['name']} generated by {label}: {profile}")
-    logging.info(f"RESULT: {label} | {model['name']} | {profile.median * 1e3}ms")
-
-
-# Custom cost function for Opencl RPC targets.
-@register_func("tvm.relay.collage.opencl_cost_estimator")
-def opencl_cost_estimator(mod, target):
-    mod = clml.preprocess_for_clml(mod) if "clml" == target.kind.name else mod
-    try:
-        # Build the module.
-        logging.info("Compiling module to estimate")
-        exe = tvm.relay.vm.compile(mod, target)
-    except RuntimeError as err:
-        # A build failure indicates the partition is not supported.
-        # eg trying to build an nn.batch_norm on GPU, which has no schedule since we assume it
-        # is only ever used with a tuple projection which is rewritten away.
-        logging.info("Assigning module infinite cost since unable to build: %s", err)
-        return math.inf
-
-    lib = exe.mod
-    tracker = rpc.connect_tracker(RPC_TRACKER_HOST, RPC_TRACKER_PORT)
-    remote = tracker.request(RPC_KEY, priority=0, session_timeout=600)
-    temp = utils.tempdir()
-    dso_binary = "dev_lib_cl.so"
-    dso_binary_path = temp.relpath(dso_binary)
-    ctx = remote.cl(0)
-    lib.export_library(dso_binary_path, cc=NDK_CROSS_COMPILER)
-    remote_path = dso_binary
-    remote.upload(dso_binary_path, target=remote_path)
-    lib = remote.load_module(remote_path)
-
-    vm_factory = tvm.runtime.vm.VirtualMachine(lib, ctx)
-    func_name = "main"
-    main_args = {v.name_hint: arg_for(v.checked_type, ctx) for v in mod[func_name].params}
-    cost = vm_factory.benchmark(
-        ctx, repeat=5, number=20, min_repeat_ms=0, func_name=func_name, **main_args
-    )
-    return cost.mean
-
-
-def collage(model):
-    """Run the Collage partitioner for a set of Opencl Adreno related targets and profile the result"""
-    logging.info(f"collage | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    autotvm_tune_module(model["mod"], OPENCL, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        targets = []
-        targets.append(OPENCL)
-        use_fp16 = model["main_dtype"] == "float16"
-        tmp_dir = tempfile.mkdtemp()
-        targets.append(tvm.target.Target("clml", HOST))
-
-        # Register byoc fusion style for compiler with available
-        # options [compiler.NoFusion | compiler.TVMFusion | compiler.MaxDepthFusion]
-        config = {
-            "relay.collage.tvm_max_depth": TVM_MAX_DEPTH,
-            "relay.collage.byoc_max_depth": BYOC_MAX_DEPTH,
-            "relay.collage.byoc_fusion_style": ["clml.NoFusion"],
-        }
-        logging.info(f"Using PassContext(config={config}")
-        ctxt = tvm.transform.PassContext(config=config)
-        config = tvm.target.make_compilation_config(ctxt, targets)
-        with ctxt:
-            mod = model["mod"]
-            mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod)
-            logging.info("-------------- BEGIN INDEXED --------------")
-            logging.info(mod)
-            logging.info("-------------- END INDEXED ----------------")
-            # Register python custom cost function for targets in
-            # custom cost estimator module.
-            cost_estimator = CustomCostEstimator(
-                py_fn_estimator="tvm.relay.collage.opencl_cost_estimator"
-            )
-            mod = tvm.relay.transform.CollagePartition(config, cost_estimator=cost_estimator)(mod)
-            partitioned_model = model.copy()
-            partitioned_model["mod"] = mod
-            logging.info("-------------- BEGIN PARTITIONED --------------")
-            logging.info(partitioned_model["mod"])
-            logging.info("-------------- END PARTITIONED ----------------")
-            compile_and_benchmark("collage", partitioned_model, targets, tmp_dir)
-
-
-def just_clml(model):
-    """Run partition_for_clml, complete the compilation with TVM, and profile the result."""
-    logging.info(f"just_clml | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    tmp_dir = tempfile.mkdtemp()
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        logging.info("Partitioning for CLML...")
-        mod = tvm.relay.op.contrib.clml.partition_for_clml(model["mod"], model["params"])
-        partitioned_model = model.copy()
-        partitioned_model["mod"] = mod
-        logging.info("-------------- BEGIN PARTITIONED --------------")
-        logging.info(partitioned_model["mod"])
-        logging.info("-------------- END PARTITIONED ----------------")
-        targets = []
-        targets.append(OPENCL)
-        targets.append(tvm.target.Target("clml", HOST))
-        compile_and_benchmark("just_clml", partitioned_model, targets, tmp_dir)
-
-
-def just_tvm(model):
-    """Compile and profile using vanilla TVM."""
-    logging.info(f"just_tvm | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    tmp_dir = tempfile.mkdtemp()
-    autotvm_tune_module(model["mod"], OPENCL, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            compile_and_benchmark("just_tvm", model, OPENCL, tmp_dir)
-
-
-def get_model(model_name, dtype):
-
-    if "mobilenet" in model_name:
-        mod, params = testing.mobilenet.get_workload(batch_size=1, dtype=dtype)
-    elif "resnet" in model_name:
-        mod, params = testing.resnet.get_workload(num_layers=50, batch_size=1, dtype=dtype)
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-        mod = tvm.relay.transform.FoldConstant()(mod)
-    return {
-        "name": model_name,
-        "input_shapes": {"data": [1, 3, 224, 224]},
-        "input_dtypes": {"data": dtype},
-        "mod": mod,
-        "params": params,
-        "main_dtype": dtype,
-    }
-
-
-########### Runners ###########
-@pytest.mark.parametrize("dtype", ["float32"])
-@tvm.testing.requires_openclml
-def run_resnet50(dtype):
-
-    just_clml(get_model("resnet-50", dtype))
-    just_tvm(get_model("resnet-50", dtype))
-    """Run Collage for tvm and clml compiler target."""
-    collage(get_model("resnet-50", dtype))
-
-
-@pytest.mark.parametrize("dtype", ["float32"])
-@tvm.testing.requires_openclml
-def run_mobilenetv1(dtype):
-
-    just_clml(get_model("mobilenet", dtype))
-    just_tvm(get_model("mobilenet", dtype))
-    """Run Collage for tvm and clml compiler target."""
-    collage(get_model("mobilenet", dtype))
diff --git a/tests/python/contrib/test_clml/test_compiler.py b/tests/python/contrib/test_clml/test_compiler.py
deleted file mode 100644
index 973fbbd345f0..000000000000
--- a/tests/python/contrib/test_clml/test_compiler.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""CLML compiler tests."""
-
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.relay.op.contrib import clml
-import pytest
-
-
-@tvm.testing.requires_openclml
-def test_device_annotation():
-    mod, params = relay.testing.mobilenet.get_workload(batch_size=1)
-    mod = clml.partition_for_clml(mod, params)
-    with tvm.transform.PassContext(opt_level=3):
-        relay.backend.te_compiler.get().clear()
-        lib = relay.build(
-            mod,
-            target="opencl -device=adreno",
-            target_host="llvm -mtriple=aarch64-linux-gnu",
-            params=params,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
deleted file mode 100644
index ec51510920a7..000000000000
--- a/tests/python/contrib/test_clml/test_network.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""OpenCL ML network tests."""
-
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.contrib import utils
-from test_clml.infrastructure import build_and_run, build_and_run_vm
-import pytest
-
-
-def _build_and_run_network(remote, mod, params, input_data, target, executor_type, tvm_log=""):
-    """Helper function to build and run a network."""
-
-    outputs = []
-    for clml in [True, False]:
-        if executor_type == "ge":
-            outputs.append(
-                build_and_run(
-                    remote,
-                    mod,
-                    params,
-                    input_data,
-                    target,
-                    enable_clml=clml,
-                    stat_file=tvm_log,
-                )
-            )
-        else:
-            outputs.append(
-                build_and_run_vm(
-                    remote,
-                    mod,
-                    params,
-                    input_data,
-                    target,
-                    enable_clml=clml,
-                    stat_file=tvm_log,
-                )
-            )
-    return outputs
-
-
-def get_network(name, batch_size, dtype="float32"):
-    """Get the symbol definition and random weight of a network
-
-    Parameters
-    ----------
-    name: str
-        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
-    batch_size: int
-        batch size
-    dtype: str
-        Data type
-
-    Returns
-    -------
-    net: tvm.IRModule
-        The relay function of network definition
-    params: dict
-        The random parameters for benchmark
-    input_shape: tuple
-        The shape of input tensor
-    output_shape: tuple
-        The shape of output tensor
-    """
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == "mobilenet":
-        net, params = testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == "inception_v3":
-        input_shape = (batch_size, 3, 299, 299)
-        net, params = testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif "resnet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.resnet.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "vgg" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.vgg.get_workload(
-            num_layers=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "densenet" in name:
-        n_layer = int(name.split("-")[1])
-        net, params = testing.densenet.get_workload(
-            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
-        )
-    elif "squeezenet" in name:
-        version = name.split("_v")[1]
-        net, params = testing.squeezenet.get_workload(
-            batch_size=batch_size, version=version, dtype=dtype
-        )
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    initializer = relay.testing.init.Xavier()
-    for param_name in list(params.keys()):
-        filter_data = np.zeros(params[param_name].shape).astype(params[param_name].dtype)
-        if len(filter_data.shape) > 1:
-            initializer("weight", filter_data)
-        else:
-            initializer("bias", filter_data)
-        params[param_name] = tvm.nd.array(filter_data)
-
-    return net, params, {"data": (input_shape, dtype)}, output_shape
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "name",
-    [
-        "resnet-18",
-        "resnet-34",
-        "resnet-50",
-        "inception_v3",
-        "mobilenet",
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_network(remote, name, dtype, target, executor_type):
-    print("Network evaluating .. " + name + " " + dtype)
-    np.random.seed(0)
-    mod, params, inputs, _ = get_network(name, 1, dtype=dtype)
-    input_data = {}
-
-    for name, (shape, dtype) in inputs.items():
-        input_data[name] = np.random.uniform(-1.0, 1.0, shape).astype(dtype)
-
-    outputs = _build_and_run_network(remote, mod, params, input_data, target, executor_type)
-    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
-    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
-    tvm.testing.assert_allclose(opencl_sort[-5:], clml_sort[-5:], rtol=0, atol=0)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
deleted file mode 100644
index 425990f79331..000000000000
--- a/tests/python/contrib/test_clml/test_ops.py
+++ /dev/null
@@ -1,1362 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""CLML integration operator tests."""
-
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay.op.contrib import clml
-from tvm.relay import testing
-from tvm.ir import IRModule
-from tvm.contrib import utils
-from test_clml.infrastructure import (
-    build_and_run,
-    build_and_run_vm,
-    verify_codegen,
-)
-import pytest
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-
-
-def _build_and_run_network(remote, mod, params, input_data, target, executor_type, tvm_log=""):
-    """Helper function to build and run a network."""
-
-    outputs = []
-    for clml in [True, False]:
-        if executor_type == "ge":
-            outputs.append(
-                build_and_run(
-                    remote,
-                    mod,
-                    params,
-                    input_data,
-                    target,
-                    enable_clml=clml,
-                    stat_file=tvm_log,
-                )
-            )
-        else:
-            outputs.append(
-                build_and_run_vm(
-                    remote,
-                    mod,
-                    params,
-                    input_data,
-                    target,
-                    enable_clml=clml,
-                    stat_file=tvm_log,
-                )
-            )
-    return outputs
-
-
-def _get_conv_model(
-    shape,
-    kernel_h,
-    kernel_w,
-    padding,
-    strides,
-    dilation,
-    groups,
-    dtype,
-    channels,
-    var,
-    has_bias=False,
-    has_activation=False,
-    has_pad=False,
-):
-    """Return a model and any parameters it may have"""
-    a = relay.var(next(iter(var)), shape=shape, dtype=dtype)
-    input_arr = var[next(iter(var))]
-    if has_pad:
-        p = ((0, 0), (0, 0), (padding[0], padding[0]), (padding[1], padding[1]))
-        a = relay.nn.pad(a, pad_width=p)
-        padding = (0, 0, 0, 0)
-    else:
-        if len(padding) == 2:
-            padding = (padding[0], padding[1], padding[0], padding[1])
-        shape = (shape[0], shape[1], shape[2] + padding[0] * 2, shape[3] + padding[1] * 2)
-    is_depthwise = shape[1] == channels == groups
-
-    weight_format = "OIHW"
-    weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w)
-
-    w = tvm.nd.array(np.random.uniform(-1, 1, weight_shape).astype(dtype))
-    weights = relay.const(w, dtype)
-    out = relay.nn.conv2d(
-        a,
-        weights,
-        kernel_size=(kernel_h, kernel_w),
-        data_layout="NCHW",
-        kernel_layout=weight_format,
-        dilation=dilation,
-        strides=strides,
-        padding=padding,
-        groups=groups,
-        channels=channels,
-        out_dtype=dtype,
-    )
-    params = {"w": w}
-    if has_bias:
-        bias_shape = (weight_shape[0],)
-        b = tvm.nd.array(np.random.uniform(-1, 1, bias_shape).astype(dtype))
-        biasc = relay.const(b, dtype)
-        out = relay.nn.bias_add(out, biasc, axis=1)
-        params["b"] = b
-
-    if has_activation:
-        out = relay.nn.relu(out)
-
-    return out, params
-
-
-def _get_conv_expected_codegen(
-    shape,
-    kernel_h,
-    kernel_w,
-    padding,
-    strides,
-    dilation,
-    groups,
-    dtype,
-    channels,
-    has_bias=False,
-    has_activation=False,
-):
-    if len(padding) == 2:
-        padding = (padding[0], padding[1], padding[0], padding[1])
-    output_height = ((shape[2] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1
-    output_width = ((shape[3] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1
-    output_shape = (1, channels, int(output_height), int(output_width))
-    out_dtype = dtype
-    is_depthwise = shape[1] == channels == groups
-
-    weight_format = "IOHW" if is_depthwise else "OIHW"
-    if weight_format == "OIHW":
-        weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w)
-    else:
-        weight_shape = (shape[1] // groups, channels, kernel_h, kernel_w)
-
-    if is_depthwise:
-        name = "nn.depthwise_conv2d"
-    else:
-        name = "nn.conv2d"
-
-    node = {
-        "op": "kernel",
-        "name": name,
-        "inputs": [],
-        "attrs": {
-            "groups": [[str(groups)]],
-            "num_outputs": "1",
-            "data_layout": [["NCHW"]],
-            "kernel_layout": [[weight_format]],
-            "channels": [[str(channels)]],
-            "dilation": [[str(dilation[0]), str(dilation[1])]],
-            "out_layout": [[""]],
-            "out_dtype": [[out_dtype]],
-            "kernel_size": [[str(kernel_h), str(kernel_w)]],
-            "shape": [[list(output_shape)]],
-            "dtype": [[dtype]],
-            "padding": [[str(p) for p in padding]],
-            "strides": [[str(s) for s in strides]],
-        },
-    }
-
-    if has_activation:
-        node["attrs"]["activation_type"] = [["relu"]]
-
-    inputs = [
-        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]},
-        },
-    ]
-
-    if has_bias:
-        bias_dtype = dtype
-        inputs.append(
-            {
-                "op": "const",
-                "name": "",
-                "attrs": {
-                    "shape": [[[1, weight_shape[1] if is_depthwise else weight_shape[0], 1, 1]]],
-                    "dtype": [[bias_dtype]],
-                },
-            }
-        )
-
-    input_idx = 0
-    for _ in range(len(inputs)):
-        node["inputs"].append([input_idx, 0, 0])
-        input_idx += 1
-    node["attrs"]["num_inputs"] = str(len(inputs))
-    inputs.append(node)
-    return inputs
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        # Normal convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False), False],
-        [2, 2, (1, 1), (1, 1), (1, 1), 4, (16, 10, 10), (False, False, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (16, 10, 10), (False, False, False), False],
-        [5, 5, (1, 1), (1, 1), (1, 1), 4, (6, 256, 256), (True, True, True), False],
-        [3, 3, (0, 0), (1, 1), (1, 1), 4, (4, 512, 512), (False, True, False), False],
-        [3, 3, (1, 1), (1, 1), (1, 1), 8, (6, 512, 512), (False, True, False), False],
-        [1, 3, (0, 0), (1, 1), (1, 1), 16, (16, 20, 20), (False, False, True), False],
-        [3, 1, (0, 0), (1, 1), (1, 1), 64, (64, 20, 20), (False, False, True), False],
-        # [3, 3, (1, 1), (1, 1), (1, 1), 128, (128, 16, 16), (False, True, False), False],
-        # [3, 3, (1, 1), (2, 2), (1, 1), 256, (128, 16, 16), (False, True, True), False],
-        # Depth-wise convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 11, (11, 20, 20), (False, False, True), True],
-        [5, 5, (2, 2), (1, 1), (1, 1), 32, (32, 20, 20), (False, True, False), True],
-        [3, 3, (2, 2), (2, 2), (1, 1), 128, (128, 8, 8), (False, False, False), True],
-        [5, 5, (0, 0), (1, 1), (1, 1), 64, (64, 32, 32), (False, False, False), True],
-        [3, 3, (1, 1), (2, 2), (1, 1), 16, (16, 256, 256), (False, True, True), True],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d(remote, dtype, target, trials, executor_type):
-    np.random.seed(0)
-
-    (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        is_depthwise,
-    ) = trials
-
-    shape = (1, *shape)
-    if is_depthwise:
-        groups = shape[1]
-    else:
-        groups = 1
-    outputs = []
-    inputs = {
-        "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype)),
-    }
-
-    func, params = _get_conv_model(
-        shape,
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        groups,
-        dtype,
-        out_channels,
-        inputs,
-        has_pad=composite[0],
-        has_bias=composite[1],
-        has_activation=composite[2],
-    )
-    outputs = _build_and_run_network(remote, func, params, inputs, target, executor_type)
-    out_tol = 1e-1 if dtype == "float16" else 1e-5
-    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-    )
-    args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
-    exp_codegen = _get_conv_expected_codegen(
-        *args, has_bias=composite[1], has_activation=composite[2]
-    )
-    verify_codegen(remote, func, params, exp_codegen, target)
-
-
-def _get_conv2d_transpose_expected_codegen(
-    dshape, kshape, channels, kernel_size, strides, padding, dilation, dtype, output_shape
-):
-    attrs = {
-        "channels": [[str(channels)]],
-        "data_layout": [["NCHW"]],
-        "kernel_layout": [["OIHW"]],
-        "groups": [["1"]],
-        "dilation": [[str(p) for p in dilation]],
-        "num_inputs": "2",
-        "num_outputs": "1",
-        "padding": [[str(p) for p in padding]],
-        "kernel_size": [[str(p) for p in kernel_size]],
-        "shape": [[list(output_shape)]],
-        "dtype": [[dtype]],
-        "strides": [[str(s) for s in strides]],
-        "out_dtype": [[""]],
-        "out_layout": [[""]],
-        "output_padding": [["0", "0"]],
-    }
-
-    kshape = [kshape[1], kshape[0], kshape[2], kshape[3]]
-
-    exp_codegen = [
-        {
-            "op": "input",
-            "name": "",
-            "attrs": {"shape": [[list(dshape)]], "dtype": [[str(dtype)]]},
-        },
-        {
-            "op": "const",
-            "name": "",
-            "attrs": {"shape": [[list(kshape)]], "dtype": [[str(dtype)]]},
-        },
-        {
-            "op": "kernel",
-            "name": "nn.conv2d_transpose",
-            "inputs": [[0, 0, 0], [1, 0, 0]],
-            "attrs": attrs,
-        },
-    ]
-    return exp_codegen
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1)],
-        [(1, 64, 400, 400), (64, 16, 4, 4), 16, (4, 4), (2, 2), (1, 1, 1, 1)],
-        [(1, 16, 32, 32), (16, 16, 3, 3), 16, (3, 3), (1, 1), (1, 1, 1, 1)],
-        # [(1, 256, 100, 100), (256, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1)],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_transpose(remote, dtype, target, trials, executor_type):
-    np.random.seed(0)
-    (dshape, kshape, channels, kernel_size, strides, padding) = trials
-    x = relay.var("input", shape=dshape, dtype=dtype)
-    input_arr = tvm.nd.array(np.random.uniform(-1, 1, dshape).astype(dtype))
-    w = relay.var("wt", shape=kshape, dtype=dtype)
-    weight_arr = tvm.nd.array(np.random.uniform(-1, 1, kshape).astype(dtype))
-    inputs = {
-        "input": input_arr,
-    }
-    params = {
-        "wt": weight_arr,
-    }
-    y = relay.nn.conv2d_transpose(
-        x,
-        w,
-        channels=channels,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        kernel_layout="IOHW",
-        data_layout="NCHW",
-    )
-    func = relay.Function([x, w], y)
-    mod = IRModule.from_expr(func)
-    outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-    out_tol = 1e-1 if dtype == "float16" else 1e-5
-    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-    )
-    args = (
-        dshape,
-        kshape,
-        channels,
-        kernel_size,
-        strides,
-        padding,
-        (1, 1),
-        dtype,
-        outputs[0].shape,
-    )
-    exp_codegen = _get_conv2d_transpose_expected_codegen(*args)
-    verify_codegen(remote, mod, params, exp_codegen, target)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize("trials", [[1, 64, 8, 8], [1, 16, 64, 64]])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_batchnorm(remote, dtype, target, trials, executor_type):
-    if clml.clml_sdk_version() < 3:
-        print("Skip due to unsupported CLML version:", clml.clml_sdk_version())
-        return
-    in_shape = trials
-    channels = in_shape[1]
-
-    np.random.seed(0)
-
-    input_arr = tvm.nd.array(np.random.uniform(-1, 1, in_shape).astype(dtype))
-    inp = relay.var("a", shape=in_shape, dtype=dtype)
-    gamma_arr = tvm.nd.array(np.random.uniform(-1, 1, (channels)).astype(dtype))
-    beta_arr = tvm.nd.array(np.random.uniform(-1, 1, (channels)).astype(dtype))
-    gamma = relay.const(gamma_arr, dtype)
-    beta = relay.const(beta_arr, dtype)
-
-    mean_arr = tvm.nd.array(np.mean(input_arr.asnumpy(), axis=(0, 2, 3), keepdims=False))
-    mean = relay.const(mean_arr)
-    variance_arr = tvm.nd.array(np.var(input_arr.asnumpy(), axis=(0, 2, 3), keepdims=False))
-    variance = relay.const(variance_arr)
-
-    params = {}
-
-    func = relay.nn.batch_norm(inp, gamma, beta, mean, variance, axis=1, epsilon=0.0003)[0]
-    mod = IRModule.from_expr(func)
-    inputs = {
-        "a": input_arr,
-    }
-    outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-    out_tol = 1e-3 if dtype == "float16" else 1e-5
-    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-    )
-    exp_codegen = [
-        {
-            "attrs": {"dtype": [[dtype]], "shape": [[list(inputs["a"].shape)]]},
-            "name": "",
-            "op": "input",
-        },
-        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", "op": "const"},
-        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", "op": "const"},
-        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", "op": "const"},
-        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", "op": "const"},
-        {
-            "attrs": {
-                "axis": [["1"]],
-                "center": [["1"]],
-                "dtype": [[dtype]],
-                "epsilon": [["0.00029999999999999997"]],
-                "num_inputs": "5",
-                "num_outputs": "1",
-                "scale": [["1"]],
-                "shape": [[list(outputs[0].shape)]],
-            },
-            "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0]],
-            "name": "nn.batch_norm",
-            "op": "kernel",
-        },
-    ]
-    verify_codegen(remote, mod, params, exp_codegen, target)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 64, 64, 40), (1, 64, 64, 40)],
-        [(1, 1280, 32, 32), (1, 640, 32, 32)],
-        [(1, 64), (1, 32)],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_concat(remote, dtype, target, trials, executor_type):
-    np.random.seed(0)
-    in_shape_1 = trials[0]
-    in_shape_2 = trials[1]
-    a = relay.var("input_1", shape=in_shape_1, dtype=dtype)
-    b = relay.var("input_2", shape=in_shape_2, dtype=dtype)
-    low, high = -1, 1
-    inputs = {
-        "input_1": tvm.nd.array(np.random.uniform(-1, 1, in_shape_1).astype(dtype)),
-        "input_2": tvm.nd.array(np.random.uniform(-1, 1, in_shape_2).astype(dtype)),
-    }
-
-    params = {}
-    func = relay.concatenate((a, b), axis=1)
-
-    outputs = _build_and_run_network(remote, func, params, inputs, target, executor_type)
-    out_tol = 1e-2 if dtype == "float16" else 1e-5
-    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-    )
-
-    exp_codegen = [
-        {
-            "attrs": {
-                "dtype": [[dtype]],
-                "shape": [[list(in_shape_1)]],
-            },
-            "name": "",
-            "op": "input",
-        },
-        {
-            "attrs": {
-                "dtype": [[dtype]],
-                "shape": [[list(in_shape_2)]],
-            },
-            "name": "",
-            "op": "input",
-        },
-        {
-            "attrs": {
-                "axis": [["1"]],
-                "dtype": [[dtype]],
-                "num_inputs": "2",
-                "num_outputs": "1",
-                "shape": [[list(outputs[0].shape)]],
-            },
-            "inputs": [[0, 0, 0], [1, 0, 0]],
-            "name": "concatenate",
-            "op": "kernel",
-        },
-    ]
-    verify_codegen(remote, func, params, exp_codegen, target)
-
-
-def _get_pool_expected_codegen(input_shape, pool_size, stride, padding, pool_type, dtype):
-    import math
-
-    pool_height = math.floor(((input_shape[2] + padding[2] - pool_size[0]) / stride[0]) + 1)
-    pool_width = math.floor(((input_shape[3] + padding[3] - pool_size[1]) / stride[1]) + 1)
-    output_shape = [input_shape[0], input_shape[1], pool_height, pool_width]
-    attrs = {
-        "ceil_mode": [["0"]],
-        "dilation": [["1", "1"]],
-        "layout": [["NCHW"]],
-        "num_inputs": "1",
-        "num_outputs": "1",
-        "out_layout": [[""]],
-        "padding": [[str(p) for p in padding]],
-        "pool_size": [[str(p) for p in pool_size]],
-        "shape": [[list(output_shape)]],
-        "dtype": [[dtype]],
-        "strides": [[str(s) for s in stride]],
-    }
-    if sum(padding):
-        attrs["count_include_pad"] = [["0"]]
-
-    exp_codegen = [
-        {
-            "op": "input",
-            "name": "",
-            "attrs": {"shape": [[list(input_shape)]], "dtype": [[str(dtype)]]},
-        },
-        {
-            "op": "kernel",
-            "name": "nn.avg_pool2d" if pool_type == "avg" else "nn.max_pool2d",
-            "inputs": [[0, 0, 0]],
-            "attrs": attrs,
-        },
-    ]
-    return exp_codegen
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        # input size         pool_size stride  paading
-        [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
-        [(1, 192, 71, 71), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
-        [(1, 288, 35, 35), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
-        [(1, 768, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
-        [(1, 2048, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
-        [(1, 192, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
-        [(1, 256, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
-        [(1, 288, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
-        [(1, 768, 17, 17), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
-        [(1, 1280, 8, 8), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_pool(remote, dtype, target, trials, executor_type):
-    np.random.seed(0)
-    params = {}
-    (
-        input_shape,
-        pool_size,
-        stride,
-        padding,
-        pooling_type,
-    ) = trials
-    a = relay.var("input_1", shape=input_shape, dtype=dtype)
-    input_arr = tvm.nd.array(np.random.uniform(-1, 1, input_shape).astype(dtype))
-    inputs = {
-        "input_1": input_arr,
-    }
-    if pooling_type == "max":
-        func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
-    else:
-        func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
-
-    outputs = _build_and_run_network(remote, func, params, inputs, target, executor_type)
-    out_tol = 1e-2 if dtype == "float16" else 1e-5
-    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-    )
-    args = (input_shape, pool_size, stride, padding, pooling_type, dtype)
-    exp_codegen = _get_pool_expected_codegen(*args)
-    verify_codegen(remote, func, params, exp_codegen, target)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(5, 16), (32, 16), False],
-        [(320, 64), (320, 64), False],
-        [(256, 256), (256, 256), False],
-        [(512, 512), (512, 512), False],
-        [(1, 256), (100, 256), False],
-        [(1, 16), (32, 16), True],
-        [(1, 512), (512, 512), True],
-        [(1, 5), (4, 5), True],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_dense(remote, dtype, target, trials, executor_type):
-    def _get_model(x_shape, k_shape, has_bias=False):
-        np.random.seed(0)
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
-        out = relay.nn.dense(x, kernel, units=k_shape[0])
-        params = {"kernel": tvm.nd.array(np.random.uniform(-1, 1, k_shape).astype(dtype))}
-        inputs = {"x": tvm.nd.array(np.random.uniform(-1, 1, x_shape).astype(dtype))}
-        exp_codegen = [
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(x_shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(k_shape)]],
-                },
-                "name": "",
-                "op": "const",
-            },
-        ]
-        input_nodes = [[0, 0, 0], [1, 0, 0]]
-        num_inputs = 2
-        if has_bias:
-            bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-            out = relay.nn.bias_add(out, bias)
-            bias_data_node = {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list((1, k_shape[0]))]],
-                },
-                "name": "",
-                "op": "const",
-            }
-            exp_codegen.append(bias_data_node)
-            input_nodes.append([2, 0, 0])
-            num_inputs += 1
-            params["bias"] = tvm.nd.array(np.random.uniform(-1, 1, (k_shape[0],)).astype(dtype))
-
-        dense_node = {
-            "attrs": {
-                "num_inputs": str(num_inputs),
-                "num_outputs": "1",
-                "dtype": [[dtype]],
-                "out_dtype": [[""]],
-                "shape": [[[x_shape[0], k_shape[0]]]],
-                "units": [[str(k_shape[0])]],
-            },
-            "inputs": input_nodes,
-            "name": "nn.dense",
-            "op": "kernel",
-        }
-        exp_codegen.append(dense_node)
-
-        return out, params, inputs, exp_codegen
-
-    def _verify(out, params, inputs, exp_codegen):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-1 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(*(_get_model(trials[0], trials[1], trials[2])))
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_binary_ops(remote, dtype, target, executor_type):
-    def _get_model(a_shape, b_shape, op_func):
-        np.random.seed(0)
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        b = relay.var("b", shape=(b_shape), dtype=dtype)
-        out = op_func(a, b)
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype)),
-            "b": tvm.nd.array(np.random.uniform(-1, 1, b_shape).astype(dtype)),
-        }
-        params = {}
-        return out, params, inputs
-
-    def _verify(out, params, inputs):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-2 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-        exp_codegen = [
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["a"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["b"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "num_inputs": "2",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[0, 0, 0], [1, 0, 0]],
-                "name": str(out.op.name),
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(*(_get_model((1, 16), (1, 16), relay.add)))
-    _verify(*(_get_model((1, 18), (1, 18), relay.subtract)))
-    _verify(*(_get_model((1, 256), (1, 256), relay.multiply)))
-    _verify(*(_get_model((1, 10), (1, 10), relay.divide)))
-    _verify(*(_get_model((1, 16), (1, 16), relay.minimum)))
-    _verify(*(_get_model((1, 512), (1, 512), relay.maximum)))
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_unary_ops(remote, dtype, target, executor_type):
-    def _get_model(a_shape, op):
-        np.random.seed(0)
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        out = op(a)
-        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))}
-        params = {}
-        return out, params, inputs
-
-    def _verify(out, params, inputs):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-2 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-
-        exp_codegen = [
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["a"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "nn.relu",
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(*(_get_model((1, 16), relay.nn.relu)))
-    _verify(*(_get_model((1, 256), relay.nn.relu)))
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize("input_shape", [(1, 64, 8, 8), (1, 64, 8, 8), (1, 512, 8, 8)])
-@pytest.mark.parametrize("block_size", [4, 8])
-@pytest.mark.parametrize("mode", ["DCR", "CRD"])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depth_to_space(remote, dtype, target, executor_type, input_shape, block_size, mode):
-    def _get_model(a_shape, block_size, mode):
-        np.random.seed(0)
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        out = relay.nn.depth_to_space(a, block_size, mode=mode)
-        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))}
-        params = {}
-        return out, params, inputs
-
-    def _verify(out, params, inputs):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-2 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-
-        exp_codegen = [
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["a"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "block_size": [[str(int(out.attrs.block_size))]],
-                    "layout": [["NCHW"]],
-                    "mode": [[out.attrs.mode]],
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "nn.depth_to_space",
-                "op": "kernel",
-            },
-        ]
-        num_clml_modules = 1
-        tvm_ops = 0
-        if out.attrs.mode != "DCR":
-            num_clml_modules = 0
-            tvm_ops = 1
-        verify_codegen(
-            remote,
-            mod,
-            params,
-            exp_codegen,
-            target,
-            num_clml_modules=num_clml_modules,
-            tvm_ops=tvm_ops,
-        )
-
-    _verify(*(_get_model(input_shape, block_size, mode)))
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_resize_bilinear(remote, dtype, target, executor_type):
-    def _get_model(a_shape, scale, align_corners):
-        np.random.seed(0)
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        out = relay.nn.upsampling(
-            a, scale_h=scale[0], scale_w=scale[1], method="bilinear", align_corners=align_corners
-        )
-        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))}
-        params = {}
-        return out, params, inputs
-
-    def _verify(out, params, inputs):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-2 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-
-        exp_codegen = [
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["a"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "scale_h": [[str(int(out.attrs.scale_h))]],
-                    "scale_w": [[str(int(out.attrs.scale_w))]],
-                    "layout": [["NCHW"]],
-                    "method": [[out.attrs.method]],
-                    "align_corners": [[str(out.attrs.align_corners)]],
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "nn.upsampling",
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(*(_get_model((1, 16, 8, 8), (2, 2), False)))
-    _verify(*(_get_model((1, 16, 7, 7), (2, 2), True)))
-    _verify(*(_get_model((1, 64, 8, 8), (2, 2), True)))
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 512, 32), (1, 512, 32), False, True],
-        [(1, 128, 32), (1, 128, 32), False, True],
-        [(1, 128, 128), (1, 32, 128), False, True],
-        [(1, 64, 40), (1, 64, 40), False, True],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_batch_matmul(remote, dtype, target, executor_type, trials):
-    def _get_model(a_shape, b_shape, a_transpose, b_transpose):
-        np.random.seed(0)
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        b = relay.var("b", shape=(b_shape), dtype=dtype)
-        out = relay.nn.batch_matmul(a, b, transpose_a=a_transpose, transpose_b=b_transpose)
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype)),
-            "b": tvm.nd.array(np.random.uniform(-1, 1, b_shape).astype(dtype)),
-        }
-        params = {}
-        return out, params, inputs
-
-    def _verify(out, params, inputs):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-1 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-
-        exp_codegen = [
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["a"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "shape": [[list(inputs["b"].shape)]],
-                },
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "transpose_a": [[str(int(out.attrs.transpose_a))]],
-                    "transpose_b": [[str(int(out.attrs.transpose_b))]],
-                    "out_dtype": [[""]],
-                    "dtype": [[dtype]],
-                    "num_inputs": "2",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[0, 0, 0], [1, 0, 0]],
-                "name": "nn.batch_matmul",
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(*(_get_model(trials[0], trials[1], trials[2], trials[3])))
-
-
-def _get_softmax_exp_codegen(inputs, dtype, output_shape, axis):
-
-    exp_codegen = [
-        {
-            "attrs": {
-                "dtype": [[dtype]],
-                "shape": [[list(inputs["a"].shape)]],
-            },
-            "name": "",
-            "op": "input",
-        },
-        {
-            "attrs": {
-                "axis": [[str(axis)]],
-                "dtype": [[dtype]],
-                "num_inputs": "1",
-                "num_outputs": "1",
-                "shape": [[list(output_shape)]],
-            },
-            "inputs": [[0, 0, 0]],
-            "name": "nn.softmax",
-            "op": "kernel",
-        },
-    ]
-    return exp_codegen
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_softmax(remote, dtype, target, executor_type):
-    def _get_model(a_shape, axis):
-        np.random.seed(0)
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))}
-        out = relay.nn.softmax(a, axis)
-        params = {}
-        return out, params, inputs, axis
-
-    def _verify(out, params, inputs, axis, out_tol):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].numpy(), rtol=out_tol, atol=out_tol
-        )
-        args = (inputs, dtype, outputs[0].shape, axis)
-        exp_codegen = _get_softmax_exp_codegen(*args)
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    # 2D Tensor  TEST CASES
-    _verify(*(_get_model((1, 5), 1)), 1e-3)
-    _verify(*(_get_model((1, 16), 1)), 1e-3)
-    _verify(*(_get_model((1, 1000), -1)), 1e-3)
-
-    # 4D Tensor  TEST CASES  layout = NCHW
-    _verify(*(_get_model((1, 100, 64, 100), 1)), 1e-3)
-    _verify(*(_get_model((1, 64, 64, 64), 1)), 1e-3)
-    _verify(*(_get_model((1, 5, 3, 4), 1)), 1e-3)
-
-    # 4D Tensor  TEST CASES  layout = NHWC
-    _verify(*(_get_model((1, 64, 100, 100), 3)), 1e-1)
-    _verify(*(_get_model((1, 100, 100, 100), 3)), 1e-1)
-    _verify(*(_get_model((1, 64, 5, 32), -1)), 1e-1)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 1, 2, 2), 2, 1],
-        [(1, 16, 2, 2), 4, 4],
-        [(1, 8, 4, 4), 3, 2],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_upsampling(remote, dtype, target, executor_type, trials):
-    def _verify(in_shape, scale_h, scale_w):
-        np.random.seed(0)
-        a = relay.var("a", shape=in_shape, dtype=dtype)
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(-1, 1, in_shape).astype(dtype)),
-        }
-        params = {}
-        func = relay.nn.upsampling(
-            a, scale_h, scale_w, layout="NCHW", method="bilinear", align_corners=False
-        )
-        mod = IRModule.from_expr(func)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-2 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-        exp_codegen = [
-            {
-                "attrs": {"dtype": [[dtype]], "shape": [[list(inputs["a"].shape)]]},
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "align_corners": [["0"]],
-                    "dtype": [[dtype]],
-                    "layout": [["NCHW"]],
-                    "method": [["bilinear"]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "scale_h": [[str(scale_h)]],
-                    "scale_w": [[str(scale_w)]],
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "nn.upsampling",
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(trials[0], trials[1], trials[2])
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 40, 64, 64), (1, 40, 4096)],
-        [(1, 77, 768), (1, 1, -1, 768)],
-        [(1, 80, 32, 32), (1, 80, 1024)],
-        [(1, 2, 3, 4), (1, 0, -1)],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_reshape(remote, dtype, target, executor_type, trials):
-    def _verify(shape, newshape):
-        np.random.seed(0)
-        x = relay.var("x", shape=(shape), dtype=dtype)
-        # Defined the test case with unary operator
-        # Single reshape op is failing in native OpenCL with vm executor type
-        # Empty TVM mod in VM doesn't pick appropriate cross compiler
-        out = relay.nn.relu(x)
-        out = relay.reshape(out, newshape)
-
-        inputs = {"x": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype))}
-        params = {}
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-3 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-        exp_codegen = [
-            {
-                "attrs": {"dtype": [[dtype]], "shape": [[list(inputs["x"].shape)]]},
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(inputs["x"].shape)]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "nn.relu",
-                "op": "kernel",
-            },
-            {
-                "attrs": {
-                    "allowzero": [["0"]],
-                    "dtype": [[dtype]],
-                    "newshape": [[str(ele) for ele in list(newshape)]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[1, 0, 0]],
-                "name": "reshape",
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(trials[0], trials[1])
-
-
-def _get_pool_global_expected_codegen(input_shape, pool_type, dtype, out_shape):
-
-    exp_codegen = [
-        {
-            "attrs": {
-                "dtype": [[str(dtype)]],
-                "shape": [[list(input_shape)]],
-            },
-            "name": "",
-            "op": "input",
-        },
-        {
-            "attrs": {
-                "dtype": [[str(dtype)]],
-                "layout": [["NCHW"]],
-                "num_inputs": "1",
-                "num_outputs": "1",
-                "out_layout": [[""]],
-                "shape": [[list(out_shape)]],
-            },
-            "inputs": [[0, 0, 0]],
-            "name": "nn.global_avg_pool2d" if pool_type == "avg" else "nn.global_max_pool2d",
-            "op": "kernel",
-        },
-    ]
-    return exp_codegen
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 3, 32, 32), "avg"],
-        [(1, 64, 147, 147), "max"],
-        [(1, 192, 71, 71), "max"],
-        [(1, 288, 35, 35), "max"],
-        [(1, 768, 17, 17), "max"],
-        [(1, 2048, 17, 17), "max"],
-        [(1, 192, 35, 35), "avg"],
-        [(1, 256, 35, 35), "avg"],
-        [(1, 288, 35, 35), "avg"],
-        [(1, 768, 17, 17), "avg"],
-        [(1, 1280, 8, 8), "avg"],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_pool_global(remote, dtype, target, executor_type, trials):
-    params = {}
-    (input_shape, pooling_type) = trials
-    np.random.seed(0)
-    a = relay.var("a", shape=input_shape, dtype=dtype)
-    inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, input_shape).astype(dtype))}
-    if pooling_type == "max":
-        func = relay.nn.global_max_pool2d(a)
-    else:
-        func = relay.nn.global_avg_pool2d(a)
-    mod = IRModule.from_expr(func)
-    outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-    out_tol = 1e-3 if dtype == "float16" else 1e-5
-    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-    )
-    args = (input_shape, pooling_type, dtype, outputs[0].shape)
-    exp_codegen = _get_pool_global_expected_codegen(*args)
-    verify_codegen(remote, mod, params, exp_codegen, target)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_batch_flatten(remote, dtype, target, executor_type):
-    def _get_model(a_shape):
-        a = relay.var("a", shape=(a_shape), dtype=dtype)
-        # Defined the test case with unary operator
-        # Single batch_flatten op is failing in native OpenCL
-        # Empty TVM mod in VM doesn't pick appropriate cross compiler
-        np.random.seed(0)
-        out = relay.nn.relu(a)
-        out = relay.nn.batch_flatten(out)
-        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))}
-        params = {}
-        return out, params, inputs
-
-    def _verify(out, params, inputs):
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-3 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-        exp_codegen = [
-            {
-                "attrs": {"dtype": [[dtype]], "shape": [[list(inputs["a"].shape)]]},
-                "name": "",
-                "op": "input",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(inputs["a"].shape)]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "nn.relu",
-                "op": "kernel",
-            },
-            {
-                "attrs": {
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[list(outputs[0].shape)]],
-                },
-                "inputs": [[1, 0, 0]],
-                "name": "nn.batch_flatten",
-                "op": "kernel",
-            },
-        ]
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(*(_get_model((1, 3, 2))))
-    _verify(*(_get_model((1, 4, 3, 2))))
-    _verify(*(_get_model((1, 64, 8, 8))))
-    _verify(*(_get_model((1, 128, 4, 4))))
-
-
-@pytest.mark.parametrize("dtype", ["float16", "float32"])
-@pytest.mark.parametrize(
-    "trials",
-    [
-        [(1, 32, 256, 256), -1, 1],
-        [(1, 8, 64, 64), 0, 1],
-    ],
-)
-@tvm.testing.requires_openclml
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_clip(remote, dtype, target, executor_type, trials):
-    def _verify(shape, a_min, a_max):
-        np.random.seed(0)
-        a = relay.var("a", shape=(shape), dtype=dtype)
-        out = relay.clip(a, a_min, a_max)
-        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype))}
-        params = {}
-        mod = IRModule.from_expr(out)
-        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_tol = 1e-3 if dtype == "float16" else 1e-5
-        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
-        )
-        exp_codegen = [
-            {"attrs": {"dtype": [[dtype]], "shape": [[shape]]}, "name": "", "op": "input"},
-            {
-                "attrs": {
-                    "a_max": [[str(a_max)]],
-                    "a_min": [[str(a_min)]],
-                    "dtype": [[dtype]],
-                    "num_inputs": "1",
-                    "num_outputs": "1",
-                    "shape": [[shape]],
-                },
-                "inputs": [[0, 0, 0]],
-                "name": "clip",
-                "op": "kernel",
-            },
-        ]
-
-        verify_codegen(remote, mod, params, exp_codegen, target)
-
-    _verify(trials[0], trials[1], trials[2])
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_coreml_codegen.py b/tests/python/contrib/test_coreml_codegen.py
deleted file mode 100644
index f4f84876fe13..000000000000
--- a/tests/python/contrib/test_coreml_codegen.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-from unittest import mock
-
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay import transform
-from tvm.contrib.target import coreml as _coreml
-
-requires_coremltools = tvm.testing.requires_package("coremltools")
-
-
-def _has_xcode():
-    try:
-        tvm.contrib.xcode.xcrun([])
-        return True
-    except FileNotFoundError:
-        pass
-
-    return False
-
-
-def _create_graph():
-    shape = (10, 10)
-    mod = tvm.IRModule()
-
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    z = x + x
-    p = y * y
-    func = relay.Function([x, y], p - z)
-    mod["main"] = func
-
-    return mod
-
-
-def _create_graph_annotated():
-    shape = (10, 10)
-    target = "coremlcompiler"
-    mod = tvm.IRModule()
-
-    # function 0
-    f0_i0 = relay.var(target + "_0_i0", shape=shape)
-    func0 = relay.Function([f0_i0], f0_i0 * f0_i0)
-
-    func0 = func0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func0 = func0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    func0 = func0.with_attr("Compiler", target)
-    func0 = func0.with_attr("global_symbol", target + "_0")
-    gv0 = relay.GlobalVar(target + "_0")
-    mod[gv0] = func0
-
-    # function 2
-    f2_i0 = relay.var(target + "_2_i0", shape=shape)
-    func2 = relay.Function([f2_i0], f2_i0 + f2_i0)
-
-    func2 = func2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func2 = func2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    func2 = func2.with_attr("Compiler", target)
-    func2 = func2.with_attr("global_symbol", target + "_2")
-    gv2 = relay.GlobalVar(target + "_2")
-    mod[gv2] = func2
-    mod = relay.transform.InferType()(mod)
-
-    # body
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    func = relay.Function([x, y], gv0(y) - gv2(x))
-    mod["main"] = func
-    mod = relay.transform.InferType()(mod)
-
-    return mod
-
-
-@pytest.mark.xfail(
-    reason="Currently failing test.  See tracking issue https://github.com/apache/tvm/issues/8901"
-)
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_annotate():
-    mod = _create_graph()
-    mod = transform.AnnotateTarget("coremlcompiler")(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    expected = _create_graph_annotated()
-    tvm.ir.assert_structural_equal(mod, expected, map_free_vars=True)
-
-
-@pytest.mark.skipif(not _has_xcode(), reason="Xcode is not available")
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_compile_and_run():
-    dev = tvm.cpu()
-    target = "llvm"
-    tol = 1e-3
-
-    with relay.build_config(opt_level=3):
-        lib = relay.build(_create_graph_annotated(), target=target)
-    m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    shape = (10, 10)
-    x_data = np.random.rand(*shape).astype("float32")
-    y_data = np.random.rand(*shape).astype("float32")
-
-    m.set_input("x", x_data)
-    m.set_input("y", y_data)
-    m.run()
-    out = tvm.nd.empty(shape, device=dev)
-    out = m.get_output(0, out)
-
-    expected = (y_data * y_data) - (x_data + x_data)
-    tvm.testing.assert_allclose(out.numpy(), expected, rtol=tol, atol=tol)
-
-
-@mock.patch("tvm.contrib.coreml_runtime.create")
-@mock.patch("tvm.contrib.xcode.compile_coreml")
-def _construct_model(func, m1, m2):
-    mod = tvm.IRModule()
-    mod["main"] = func
-    mod = transform.AnnotateTarget("coremlcompiler")(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    fcompile = tvm._ffi.get_global_func("relay.ext.coremlcompiler")
-
-    for var, func in mod.functions.items():
-        if "Compiler" in func.attrs and func.attrs["Compiler"] == "coremlcompiler":
-            fcompile(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_add():
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = x + x
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_multiply():
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = x * x
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_clip():
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.clip(x, a_min=0.0, a_max=1.0)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_batch_flatten():
-    shape = (10, 10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.batch_flatten(x)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_expand_dims():
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.expand_dims(x, axis=0)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-    y = relay.expand_dims(x, axis=-1)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_relu():
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.relu(x)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_softmax():
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.softmax(x, axis=1)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_conv2d():
-    x = relay.var("x", shape=(1, 3, 224, 224))
-    w = relay.const(np.zeros((16, 3, 3, 3), dtype="float32"))
-    y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-@tvm.testing.uses_gpu
-@requires_coremltools
-def test_global_avg_pool2d():
-    shape = (10, 10, 10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.global_avg_pool2d(x)
-    func = relay.Function([x], y)
-    _construct_model(func)
-
-
-if __name__ == "__main__":
-    test_annotate()
-    test_compile_and_run()
-    test_add()
-    test_multiply()
-    test_clip()
-    test_expand_dims()
-    test_relu()
-    test_batch_flatten()
-    test_softmax()
-    test_conv2d()
-    test_global_avg_pool2d()
diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py
deleted file mode 100644
index 99611bab4967..000000000000
--- a/tests/python/contrib/test_cublas.py
+++ /dev/null
@@ -1,385 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-
-import tvm
-from tvm import te
-from tvm import relay
-import numpy as np
-from tvm.contrib import cublas
-from tvm.contrib import cublaslt
-from tvm.contrib import graph_executor
-import tvm.testing
-from tvm.relay.op.contrib import get_pattern_table
-from tvm.relay.op.contrib.cublas import partition_for_cublas
-
-
-def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
-    n = 1024
-    l = 128
-    m = 236
-    A = te.placeholder((n, l), name="A", dtype=in_dtype)
-    B = te.placeholder((l, m), name="B", dtype=in_dtype)
-    C = cublas.matmul(A, B, dtype=out_dtype)
-    s = te.create_schedule(C.op)
-
-    def verify(target="cuda"):
-        if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
-            print("skip because extern function is not available")
-            return
-        dev = tvm.cuda(0)
-        f = tvm.build(s, [A, B, C], target)
-        a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
-        f(a, b, c)
-        tvm.testing.assert_allclose(
-            c.numpy(), np.dot(a.numpy().astype(C.dtype), b.numpy().astype(C.dtype)), rtol=rtol
-        )
-
-    verify()
-
-
-def roundoff(v, d):
-    return int(np.floor((v + d - 1) / d) * d)
-
-
-def verify_matmul_add_igemm(in_dtype, out_dtype, rtol=1e-5):
-    n = 1024
-    l = 1024
-    m = 1024
-    L = roundoff(l, 32)
-    N = roundoff(n, 8)
-    N_out = roundoff(n, 32)
-
-    A = te.placeholder((N, L), name="A", dtype=in_dtype)
-    B = te.placeholder((m, L), name="B", dtype=in_dtype)
-    # C has CUBLASLT_ORDER_COL32 layout, thus a different shape
-    C = cublaslt.matmul(A, B, False, True, m, N_out, dtype=out_dtype)
-    s = te.create_schedule(C.op)
-
-    def verify(target="cuda"):
-        if not tvm.get_global_func("tvm.contrib.cublaslt.matmul", True):
-            print("skip because extern function is not available")
-            return
-        dev = tvm.cuda(0)
-        f = tvm.build(s, [A, B, C], target)
-        a_old = np.random.uniform(0, 128, size=(n, l))
-        b_old = np.random.uniform(0, 128, size=(l, m))
-
-        # Transform a to become CUBLASLT_ORDER_COL4_4R2_8C layout
-        a_new = np.hstack([a_old.astype(A.dtype), np.zeros([n, L - l])])
-        a_new = np.vstack([a_new.astype(A.dtype), np.zeros([N - n, L])])
-        a_even = np.vsplit(a_new[::2], N / 8)
-        a_odd = np.vsplit(a_new[1::2], N / 8)
-        a_new = [None] * (len(a_even) + len(a_odd))
-        a_new[::2] = a_even
-        a_new[1::2] = a_odd
-        a_new = np.vstack(a_new)
-        a_new = np.vstack(
-            [
-                np.vstack(
-                    [np.vstack(np.hsplit(i, 8)).reshape([4, 32]) for i in np.vsplit(j, N / 4)]
-                )
-                for j in np.hsplit(a_new, L / 32)
-            ]
-        )
-        a_new = a_new.reshape([N, L])
-        # Transform b to become CUBLASLT_ORDER_COL32 layout
-        b_new = np.vstack(
-            np.hsplit(np.hstack([b_old.T.astype(B.dtype), np.zeros([m, L - l])]), L / 32)
-        )
-        b_new = b_new.reshape([m, L])
-
-        a = tvm.nd.array(a_new.astype(A.dtype), dev)
-        b = tvm.nd.array(b_new.astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), dev)
-        f(a, b, c)
-        # Transform output c from layout CUBLASLT_ORDER_COL32 to row major layout
-        c_out = c.numpy()
-        c_out = c_out.reshape([int(m * N_out / 32), 32])
-        c_out = np.hstack(np.vsplit(c_out, int(N_out / 32)))
-        c_out = c_out[:, :n]
-        c_out = c_out.T
-        tvm.testing.assert_allclose(
-            c_out, np.dot(a_old.astype(C.dtype), b_old.astype(C.dtype)), rtol=rtol
-        )
-
-    verify()
-
-
-def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5):
-    A = te.placeholder(Ashape, name="A", dtype=in_dtype)
-    B = te.placeholder(Bshape, name="B", dtype=in_dtype)
-    C = cublas.batch_matmul(A, B, dtype=out_dtype)
-    s = te.create_schedule(C.op)
-
-    dev = tvm.cuda(0)
-    f = tvm.build(s, [A, B, C], "cuda")
-
-    if "int" in in_dtype:
-        a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev)
-        b = tvm.nd.array(np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev)
-    else:
-        a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)
-
-    c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev)
-    f(a, b, c)
-    tvm.testing.assert_allclose(
-        c.numpy(),
-        np.matmul(a.numpy().astype(C.dtype), b.numpy().astype(C.dtype)).astype(C.dtype),
-        rtol=rtol,
-    )
-
-
-@tvm.testing.requires_cuda
-def test_matmul_add():
-    verify_matmul_add("float", "float", rtol=1e-3)
-    verify_matmul_add("float16", "float")
-    verify_matmul_add("float16", "float16", rtol=1e-2)
-    verify_matmul_add("int8", "int32")
-
-
-@tvm.testing.requires_cuda
-def test_matmul_add_igemm():
-    verify_matmul_add_igemm("int8", "int32")
-
-
-@tvm.testing.requires_cuda
-def test_batch_matmul():
-    if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
-        print("skip because extern function is not available")
-        return
-
-    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "float", "float")
-    verify_batch_matmul((16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float", "float")
-    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "float16", "float")
-    verify_batch_matmul((16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float")
-    verify_batch_matmul(
-        (16, 1024, 128), (16, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2
-    )
-    verify_batch_matmul(
-        (16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2
-    )
-
-    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "int8", "int32")
-
-
-def _verify_cublas_relay(expr):
-    np.random.seed(42)
-
-    mod = tvm.IRModule.from_expr(expr)
-    mod = relay.transform.InferType()(mod)
-    func = mod["main"]
-    cublas_mod = partition_for_cublas(mod)
-    assert len(cublas_mod.get_global_vars()) == 2
-
-    input_data = []
-    for param in func.params:
-        shape = [int(x) for x in param.checked_type.shape]
-        input_data.append(
-            (param.name_hint, np.random.uniform(0, 32, size=shape).astype(param.checked_type.dtype))
-        )
-
-    # Test against CPU reference
-    cuda_config = (tvm.target.cuda(), tvm.cuda(), cublas_mod)
-    cpu_config = (tvm.target.Target("llvm"), tvm.cpu(), mod)
-    outputs = []
-    for target, dev, test_mod in [cuda_config, cpu_config]:
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(test_mod, target=target, target_host=cpu_config[0])
-            module = graph_executor.GraphModule(lib["default"](dev))
-            for name, data in input_data:
-                module.set_input(name, tvm.nd.array(data, dev))
-
-            module.run()
-            out_type = func.body.checked_type
-            outputs.append(
-                module.get_output(0, tvm.nd.empty(out_type.shape, dtype=out_type.dtype)).numpy()
-            )
-
-    tvm.testing.assert_allclose(
-        outputs[0],
-        outputs[1],
-        rtol=1e-2,
-    )
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "n,m,k,transpose_a,transpose_b",
-    [
-        (64, 128, 32, False, False),
-        (17, 32, 16, True, False),
-        (24, 17, 12, False, True),
-        (96, 4, 17, True, True),
-    ],
-)
-@pytest.mark.parametrize(
-    "in_dtype,out_dtype",
-    [
-        ("float32", "float32"),
-        ("float16", "float16"),
-        ("float16", "float32"),
-        ("int8", "int32"),
-        ("float64", "float64"),
-        ("int8", "float32"),
-    ],
-)
-def test_relay_cublas_matmul(n, m, k, in_dtype, out_dtype, transpose_a, transpose_b):
-    unsupported_configs = [
-        (17, 32, 16, "int8", "float32", True, False),
-        (96, 4, 17, "int8", "float32", True, True),
-        (17, 32, 16, "int8", "int32", True, False),
-        (96, 4, 17, "int8", "int32", True, True),
-    ]
-    if (n, m, k, in_dtype, out_dtype, transpose_a, transpose_b) in unsupported_configs:
-        pytest.skip("Unsupported parameters.")
-
-    a_shape = (k, n) if transpose_a else (n, k)
-    b_shape = (m, k) if transpose_b else (k, m)
-    a = tvm.relay.var("A", tvm.relay.TensorType(a_shape, in_dtype))
-    b = tvm.relay.var("B", tvm.relay.TensorType(b_shape, in_dtype))
-    # Directly use matmul because nn.matmul sometimes defers to nn.dense
-    matmul = relay.op.nn._make.matmul(a, b, None, out_dtype, transpose_a, transpose_b)
-    _verify_cublas_relay(matmul)
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "n,m,k",
-    [
-        (64, 128, 32),
-        (17, 32, 16),
-        (24, 17, 12),
-        (96, 4, 17),
-    ],
-)
-@pytest.mark.parametrize(
-    "in_dtype,out_dtype",
-    [
-        ("float32", "float32"),
-        ("float16", "float16"),
-        ("float16", "float32"),
-        ("int8", "int32"),
-        ("float64", "float64"),
-        ("int8", "float32"),
-    ],
-)
-def test_relay_cublas_dense(n, m, k, in_dtype, out_dtype):
-    unsupported_configs = [
-        (96, 4, 17, "int8", "float32"),
-        (96, 4, 17, "int8", "int32"),
-    ]
-    if (n, m, k, in_dtype, out_dtype) in unsupported_configs:
-        pytest.skip("Unsupported parameters.")
-
-    data = tvm.relay.var("data", tvm.relay.TensorType((n, k), in_dtype))
-    weight = tvm.relay.var("weight", tvm.relay.TensorType((m, k), in_dtype))
-    dense = relay.op.nn.dense(data, weight, out_dtype=out_dtype)
-    _verify_cublas_relay(dense)
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "n,m,k,batch_a,batch_b,transpose_a,transpose_b",
-    [
-        (64, 128, 32, 16, 16, False, False),
-        (17, 32, 16, 16, 1, True, False),
-        (24, 17, 12, 17, 17, False, True),
-        (96, 4, 17, 53, 1, True, True),
-    ],
-)
-@pytest.mark.parametrize(
-    "in_dtype,out_dtype",
-    [
-        ("float32", "float32"),
-        ("float16", "float16"),
-        ("float16", "float32"),
-        ("int8", "int32"),
-        ("float64", "float64"),
-        ("int8", "float32"),
-    ],
-)
-def test_relay_cublas_batch_matmul(
-    n, m, k, batch_a, batch_b, in_dtype, out_dtype, transpose_a, transpose_b
-):
-    unsupported_configs = [
-        (17, 32, 16, 16, 1, "int8", "float32", True, False),
-        (96, 4, 17, 53, 1, "int8", "float32", True, True),
-        (17, 32, 16, 16, 1, "int8", "int32", True, False),
-        (96, 4, 17, 53, 1, "int8", "int32", True, True),
-    ]
-    if (
-        n,
-        m,
-        k,
-        batch_a,
-        batch_b,
-        in_dtype,
-        out_dtype,
-        transpose_a,
-        transpose_b,
-    ) in unsupported_configs:
-        pytest.skip("Unsupported parameters.")
-
-    a_shape = (batch_a, k, n) if transpose_a else (batch_a, n, k)
-    b_shape = (batch_b, m, k) if transpose_b else (batch_b, k, m)
-    a = tvm.relay.var("A", tvm.relay.TensorType(a_shape, in_dtype))
-    b = tvm.relay.var("B", tvm.relay.TensorType(b_shape, in_dtype))
-    batch_matmul = relay.op.nn.batch_matmul(a, b, out_dtype, transpose_a, transpose_b)
-    _verify_cublas_relay(batch_matmul)
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "n,m,k",
-    [
-        (64, 128, 32),
-        (17, 32, 16),
-        (24, 17, 12),
-        (96, 4, 17),
-    ],
-)
-@pytest.mark.parametrize(
-    "in_dtype,out_dtype",
-    [
-        ("float32", "float32"),
-        ("float16", "float16"),
-        ("float16", "float32"),
-        ("int8", "int32"),
-        ("float64", "float64"),
-        ("int8", "float32"),
-    ],
-)
-def test_relay_cublas_dense(n, m, k, in_dtype, out_dtype):
-    unsupported_configs = [
-        (96, 4, 17, "int8", "float32"),
-        (96, 4, 17, "int8", "int32"),
-    ]
-    if (n, m, k, in_dtype, out_dtype) in unsupported_configs:
-        pytest.skip("Unsupported parameters.")
-
-    data = tvm.relay.var("data", tvm.relay.TensorType((n, k), in_dtype))
-    weight = tvm.relay.var("weight", tvm.relay.TensorType((m, k), in_dtype))
-    dense = relay.op.nn.dense(data, weight, out_dtype=out_dtype)
-    _verify_cublas_relay(dense)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py
deleted file mode 100644
index 08e03d666047..000000000000
--- a/tests/python/contrib/test_cudnn.py
+++ /dev/null
@@ -1,627 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys
-
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import relay
-from tvm.contrib import cudnn
-from tvm.contrib.nvcc import have_fp16
-from tvm.contrib import graph_executor
-import numpy as np
-import tvm.topi.testing
-import tvm.testing
-from tvm.relay.op.contrib.cudnn import partition_for_cudnn
-
-
-requires_cudnn = pytest.mark.skipif(
-    tvm.get_global_func("tvm.contrib.cudnn.conv2d.forward", True) is None,
-    reason="CuDNN is not enabled",
-)
-
-
-def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1):
-    in_channel = 4
-    out_channel = 16
-    filter_h = 3
-    filter_w = 3
-    pad_h = 1
-    pad_w = 1
-    stride_h = 1
-    stride_w = 1
-    dilation_h = 1
-    dilation_w = 1
-    batch = 3
-    height = 32
-    width = 32
-
-    if data_dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-        print("Skip because gpu does not have fp16 support")
-        return
-
-    # schedule
-    if tensor_format == 0:
-        xshape = [batch, in_channel, height, width]
-        wshape = [out_channel, in_channel // groups, filter_h, filter_w]
-    else:
-        xshape = [batch, height, width, in_channel]
-        wshape = [out_channel, filter_h, filter_w, in_channel // groups]
-
-    X = te.placeholder(xshape, name="X", dtype=data_dtype)
-    W = te.placeholder(wshape, name="W", dtype=data_dtype)
-    Y = cudnn.conv_forward(
-        X,
-        W,
-        [pad_h, pad_w],
-        [stride_h, stride_w],
-        [dilation_h, dilation_w],
-        conv_mode=1,
-        tensor_format=tensor_format,
-        conv_dtype=conv_dtype,
-        algo=-1,
-        groups=groups,
-    )
-    yshape = [x.value for x in Y.shape]
-    s = te.create_schedule(Y.op)
-
-    # validation
-    dev = tvm.cuda(0)
-    f = tvm.build(s, [X, W, Y], "cuda --host=llvm", name="conv2d")
-    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
-    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
-    y_np = np.zeros(yshape).astype(data_dtype)
-    x = tvm.nd.array(x_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    y = tvm.nd.array(y_np, dev)
-    if tensor_format == 0:
-        c_np = tvm.topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1, groups=groups)
-    elif tensor_format == 1:
-        wt = w_np.transpose((1, 2, 3, 0))  # OHWI => HWIO
-        c_np = tvm.topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1, groups=groups)
-
-    f(x, w, y)
-    tvm.testing.assert_allclose(y.numpy(), c_np, atol=1e-2, rtol=1e-2)
-
-
-@tvm.testing.requires_gpu
-@requires_cudnn
-def test_conv2d():
-    verify_conv2d("float32", "float32", tensor_format=0)
-    verify_conv2d("float16", "float32", tensor_format=1)
-    verify_conv2d("float16", "float16", tensor_format=0)
-    verify_conv2d("float16", "float16", tensor_format=1)
-    verify_conv2d("int8", "int32", tensor_format=1)
-
-    verify_conv2d("float32", "float32", tensor_format=0, groups=2)
-    verify_conv2d("float16", "float32", tensor_format=1, groups=2)
-    verify_conv2d("float16", "float16", tensor_format=0, groups=2)
-    verify_conv2d("int8", "int32", tensor_format=1, groups=2)
-
-
-def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1):
-    in_channel = 4
-    out_channel = 16
-    filter_d = 3
-    filter_h = 3
-    filter_w = 3
-    pad_d = 1
-    pad_h = 1
-    pad_w = 1
-    stride_d = 1
-    stride_h = 1
-    stride_w = 1
-    dilation_d = 1
-    dilation_h = 1
-    dilation_w = 1
-    batch = 3
-    depth = 32
-    height = 32
-    width = 32
-
-    # schedule
-    xshape = [batch, in_channel, depth, height, width]
-    wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w]
-
-    X = te.placeholder(xshape, name="X", dtype=data_dtype)
-    W = te.placeholder(wshape, name="W", dtype=data_dtype)
-    Y = cudnn.conv_forward(
-        X,
-        W,
-        [pad_d, pad_h, pad_w],
-        [stride_d, stride_h, stride_w],
-        [dilation_d, dilation_h, dilation_w],
-        conv_mode=1,
-        tensor_format=tensor_format,
-        algo=-1,
-        conv_dtype=conv_dtype,
-        groups=groups,
-    )
-    yshape = [x.value for x in Y.shape]
-    s = te.create_schedule(Y.op)
-
-    # validation
-    dev = tvm.cuda(0)
-    f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d")
-    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
-    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
-    y_np = np.zeros(yshape).astype(data_dtype)
-    x = tvm.nd.array(x_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    y = tvm.nd.array(y_np, dev)
-    if tensor_format == 0:
-        c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups)
-    else:
-        raise AssertionError("For now, conv3d tensor format only support: 0(NCHW)")
-
-    f(x, w, y)
-    tvm.testing.assert_allclose(y.numpy(), c_np, atol=3e-5, rtol=1e-4)
-
-
-@tvm.testing.requires_gpu
-@requires_cudnn
-def test_conv3d():
-    verify_conv3d("float32", "float32", tensor_format=0)
-    verify_conv3d("float32", "float32", tensor_format=0, groups=2)
-
-
-def verify_softmax(shape, axis, dtype="float32", log_softmax=False):
-    cudnn_op = cudnn.log_softmax if log_softmax else cudnn.softmax
-    testing_op = (
-        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
-    )
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = cudnn_op(A, axis)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = testing_op(a_np)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax")
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
-
-
-def verify_softmax_4d(shape, dtype="float32", log_softmax=False):
-    cudnn_op = cudnn.log_softmax if log_softmax else cudnn.softmax
-    testing_op = (
-        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
-    )
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = cudnn_op(A, axis=1)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.cuda(0)
-    n, c, h, w = shape
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = testing_op(a_np.transpose(0, 2, 3, 1).reshape(h * w, c))
-    b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax")
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
-
-
-@tvm.testing.requires_gpu
-@requires_cudnn
-def test_softmax():
-    verify_softmax((32, 10), -1)
-    verify_softmax((3, 4), -1)
-    verify_softmax((1, 5), -1, "float64")
-    verify_softmax_4d((1, 16, 256, 256))
-    verify_softmax_4d((1, 16, 256, 256), "float64")
-
-    verify_softmax((32, 10), -1, log_softmax=True)
-    verify_softmax((3, 4), -1, log_softmax=True)
-    verify_softmax((1, 5), -1, "float64", log_softmax=True)
-    verify_softmax_4d((1, 16, 256, 256), log_softmax=True)
-    verify_softmax_4d((1, 16, 256, 256), "float64", log_softmax=True)
-
-
-def verify_conv2d_backward_data(data_dtype, conv_dtype, tensor_format=0, tol=1e-5):
-    batch = 3
-    in_channel = 4
-    out_channel = 16
-    filter_h, filter_w = 3, 3
-    pad_h, pad_w = 1, 1
-    stride_h, stride_w = 1, 1
-    height, width = 32, 32
-
-    if tensor_format == 0:
-        xshape = [batch, in_channel, height, width]
-        wshape = [out_channel, in_channel, filter_h, filter_w]
-        oshape = xshape
-        oshape[1] = out_channel
-        ref_func = tvm.topi.testing.conv2d_transpose_nchw_python
-    else:
-        xshape = [batch, height, width, in_channel]
-        wshape = [out_channel, filter_h, filter_w, in_channel]
-        oshape = xshape
-        oshape[3] = out_channel
-        ref_func = lambda dy_np, w_np, strides, padding, out_pad: tvm.topi.testing.conv2d_transpose_nhwc_python(
-            dy_np, np.transpose(w_np, [1, 2, 3, 0]), "HWOI", strides, padding, out_pad
-        )
-
-    dy_np = np.random.uniform(-1, 1, oshape).astype(data_dtype)
-    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
-
-    if data_dtype == "float16":
-        dx_np = ref_func(
-            dy_np.astype("float32"),
-            w_np.astype("float32"),
-            (stride_h, stride_w),
-            (pad_h, pad_w),
-            (0, 0),
-        )
-        dx_np = dx_np.astype("float16")
-    else:
-        dx_np = ref_func(dy_np, w_np, (stride_h, stride_w), (pad_h, pad_w), (0, 0))
-
-    dy = te.placeholder(oshape, name="dy", dtype=data_dtype)
-    w = te.placeholder(wshape, name="dw", dtype=data_dtype)
-    dx = cudnn.conv_backward_data(
-        dy,
-        w,
-        [pad_h, pad_w],
-        [stride_h, stride_w],
-        [1, 1],
-        conv_mode=1,
-        tensor_format=tensor_format,
-        conv_dtype=conv_dtype,
-        groups=1,
-    )
-
-    s = te.create_schedule(dx.op)
-
-    dev = tvm.cuda(0)
-    f = tvm.build(s, [dy, w, dx], "cuda --host=llvm", name="conv2d_backward_data")
-
-    dy = tvm.nd.array(dy_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    dx = tvm.nd.array(dx_np, dev)
-
-    f(dy, w, dx)
-    tvm.testing.assert_allclose(dx.numpy(), dx_np, atol=tol, rtol=tol)
-
-
-@tvm.testing.requires_gpu
-@requires_cudnn
-def test_conv2d_backward_data():
-    verify_conv2d_backward_data("float32", "float32", tensor_format=0, tol=1e-5)
-    verify_conv2d_backward_data("float32", "float32", tensor_format=1, tol=1e-2)
-    # The scipy convolve function does not support fp16, so the reference will be computed with
-    # fp32. Use larger tolerance to be on the safe side (1e-2 also seems mostly ok).
-    verify_conv2d_backward_data("float16", "float16", tensor_format=1, tol=1e-1)
-
-
-def verify_conv2d_backward_filter(data_dtype, conv_dtype, tensor_format=0, tol=1e-5):
-    batch = 3
-    in_channel = 4
-    out_channel = 16
-    filter_h, filter_w = 3, 3
-    pad_h, pad_w = 1, 1
-    stride_h, stride_w = 1, 1
-    height, width = 32, 32
-
-    if tensor_format == 0:
-        x_shape = [batch, in_channel, height, width]
-        dy_shape = [batch, out_channel, height, width]
-    else:
-        x_shape = [batch, height, width, in_channel]
-        dy_shape = [batch, height, width, out_channel]
-
-    x_np = np.random.uniform(-1, 1, x_shape).astype(data_dtype)
-    dy_np = np.random.uniform(-1, 1, dy_shape).astype(data_dtype)
-
-    dw_np = tvm.topi.testing.conv2d_backward_weight_python(
-        dy_np,
-        x_np,
-        (filter_h, filter_w),
-        (stride_h, stride_w),
-        (pad_h, pad_w),
-        "NCHW" if tensor_format == 0 else "NHWC",
-    )
-
-    x = te.placeholder(x_shape, name="x", dtype=data_dtype)
-    dy = te.placeholder(dy_shape, name="dy", dtype=data_dtype)
-    dw = cudnn.conv_backward_filter(
-        dy,
-        x,
-        (filter_h, filter_w),
-        [pad_h, pad_w],
-        [stride_h, stride_w],
-        [1, 1],
-        conv_mode=1,
-        tensor_format=tensor_format,
-        conv_dtype=conv_dtype,
-    )
-
-    s = te.create_schedule(dw.op)
-
-    dev = tvm.cuda(0)
-    f = tvm.build(s, [dy, x, dw], "cuda --host=llvm", name="conv2d_backward_filter")
-
-    x = tvm.nd.array(x_np, dev)
-    dy = tvm.nd.array(dy_np, dev)
-    dw = tvm.nd.array(dw_np, dev)
-
-    f(dy, x, dw)
-    tvm.testing.assert_allclose(dw.numpy(), dw_np, atol=tol, rtol=tol)
-
-
-@tvm.testing.requires_gpu
-@requires_cudnn
-def test_conv2d_backward_filter():
-    verify_conv2d_backward_filter("float32", "float32", tensor_format=0, tol=1e-2)
-    verify_conv2d_backward_filter("float32", "float32", tensor_format=1, tol=1e-2)
-
-
-test_kwargs_default_2d = {
-    "tensor_format": 0,
-    "pad": [1, 1],
-    "stride": [1, 1],
-    "dilation": [1, 1],
-    "x_shape": [16, 4, 32, 32],
-    "w_shape": [8, 4, 3, 3],
-    "groups": 1,
-    "conv_dtype": "float32",
-    "data_dtype": "float32",
-}
-test_kwargs_default_3d = {
-    "tensor_format": 0,
-    "pad": [1, 1, 1],
-    "stride": [1, 1, 1],
-    "dilation": [1, 1, 1],
-    "x_shape": [16, 4, 32, 32, 32],
-    "w_shape": [8, 4, 3, 3, 3],
-    "groups": 1,
-    "conv_dtype": "float32",
-    "data_dtype": "float32",
-}
-conv_output_shape_conditions = {
-    "2d_small": test_kwargs_default_2d,
-    "2d_large": {
-        **test_kwargs_default_2d,
-        "x_shape": [16, 32, 512, 1024],
-        "w_shape": [8, 32, 5, 5],
-    },
-    "2d_pad": {**test_kwargs_default_2d, "pad": [2, 3]},
-    "2d_stride": {**test_kwargs_default_2d, "stride": [2, 3]},
-    "2d_dilation": {**test_kwargs_default_2d, "dilation": [2, 3]},
-    "2d_groups": {**test_kwargs_default_2d, "groups": 4, "w_shape": [8, 1, 3, 3]},
-    "2d_NHWC": {
-        **test_kwargs_default_2d,
-        "tensor_format": 1,
-        "x_shape": [16, 32, 32, 4],
-        "w_shape": [8, 3, 3, 4],
-    },
-    "2d_NCHW_VECT_C": {
-        **test_kwargs_default_2d,
-        "tensor_format": 2,
-        "w_shape": [8, 16, 3, 3],
-        "data_dtype": "int8x4",
-    },
-    "3d_small": test_kwargs_default_3d,
-    "3d_large": {
-        **test_kwargs_default_3d,
-        "x_shape": [16, 32, 64, 128, 256],
-        "w_shape": [8, 32, 5, 5, 5],
-    },
-    "3d_pad": {**test_kwargs_default_3d, "pad": [2, 3, 4]},
-    "3d_stride": {**test_kwargs_default_3d, "stride": [2, 3, 4]},
-    "3d_dilation": {**test_kwargs_default_3d, "dilation": [2, 3, 4]},
-    "3d_groups": {**test_kwargs_default_3d, "groups": 4, "w_shape": [8, 1, 3, 3, 3]},
-    "3d_NCHW_VECT_C": {
-        **test_kwargs_default_3d,
-        "tensor_format": 2,
-        "w_shape": [8, 16, 3, 3, 3],
-        "data_dtype": "int8x4",
-    },
-}
-
-
-@pytest.fixture(
-    params=[pytest.param(kwargs, id=name) for name, kwargs in conv_output_shape_conditions.items()]
-)
-def conv_output_shape_kwargs(request):
-    return request.param
-
-
-def _verify_cudnn_relay(expr):
-    np.random.seed(42)
-
-    mod = tvm.IRModule.from_expr(expr)
-    mod = relay.transform.InferType()(mod)
-    func = mod["main"]
-    cudnn_mod = partition_for_cudnn(mod)
-    assert len(cudnn_mod.get_global_vars()) == 2
-
-    input_data = []
-    for param in func.params:
-        shape = [int(x) for x in param.checked_type.shape]
-        input_data.append(
-            (
-                param.name_hint,
-                np.random.uniform(-32, 32, size=shape).astype(param.checked_type.dtype),
-            )
-        )
-
-    cuda_config = (tvm.target.cuda(), tvm.cuda(), cudnn_mod)
-    cpu_config = (tvm.target.Target("llvm"), tvm.cpu(), mod)
-    outputs = []
-    for target, dev, test_mod in [cuda_config, cpu_config]:
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(test_mod, target=target, target_host=cpu_config[0])
-            module = graph_executor.GraphModule(lib["default"](dev))
-            for name, data in input_data:
-                module.set_input(name, tvm.nd.array(data, dev))
-
-            module.run()
-            out_type = func.body.checked_type
-            outputs.append(
-                module.get_output(0, tvm.nd.empty(out_type.shape, dtype=out_type.dtype)).numpy()
-            )
-
-    tvm.testing.assert_allclose(
-        outputs[0],
-        outputs[1],
-        rtol=1e-2,
-        atol=30,
-    )
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "shape,axis",
-    [
-        ((200,), 0),
-        ((13, 27), 0),
-        ((44, 12, 67), 1),
-        ((1, 16, 16, 8), 2),
-        ((2, 4, 6, 8, 10), 3),
-    ],
-)
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "float32",
-        "float16",
-        "float64",
-    ],
-)
-def test_relay_cudnn_softmax(shape, axis, dtype):
-    x = tvm.relay.var("x", tvm.relay.TensorType(shape, dtype))
-    softmax = relay.op.nn.softmax(x, axis=axis)
-    _verify_cudnn_relay(softmax)
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "shape,axis",
-    [
-        ((32, 16), -1),
-        ((13, 27), 1),
-    ],
-)
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "float32",
-        "float16",
-        "float64",
-    ],
-)
-def test_relay_cudnn_log_softmax(shape, axis, dtype):
-    x = tvm.relay.var("x", tvm.relay.TensorType(shape, dtype))
-    log_softmax = relay.op.nn.log_softmax(x, axis=axis)
-    _verify_cudnn_relay(log_softmax)
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "n,h,w,ci,co,groups",
-    [
-        (1, 16, 20, 8, 16, 1),
-        (10, 17, 19, 16, 8, 4),
-    ],
-)
-@pytest.mark.parametrize(
-    "kh,kw,padding",
-    [
-        (1, 1, (3, 1, 3, 1)),
-        (3, 3, (1, 2)),
-        (7, 2, (0, 0)),
-    ],
-)
-@pytest.mark.parametrize(
-    "strides,dilation,dtype",
-    [
-        ((1, 1), (1, 1), "float32"),
-        ((2, 1), (2, 2), "float16"),
-        ((3, 3), (1, 2), "float64"),
-    ],
-)
-def test_relay_cudnn_conv2d(n, h, w, ci, co, kh, kw, strides, dilation, padding, groups, dtype):
-    data = tvm.relay.var("data", tvm.relay.TensorType((n, ci, h, w), dtype))
-    weight = tvm.relay.var("weight", tvm.relay.TensorType((co, ci // groups, kh, kw), dtype))
-    conv2d = relay.op.nn.conv2d(
-        data,
-        weight,
-        groups=groups,
-        channels=co,
-        kernel_size=(kh, kw),
-        strides=strides,
-        dilation=dilation,
-        padding=padding,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-    )
-    _verify_cudnn_relay(conv2d)
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "n,h,w,ci,co,groups",
-    [
-        (1, 16, 20, 8, 16, 1),
-        (10, 17, 19, 16, 8, 4),
-    ],
-)
-@pytest.mark.parametrize(
-    "kh,kw,padding,strides,dilation,dtype",
-    [
-        (1, 1, (3, 1, 3, 1), (1, 1), (1, 1), "float32"),
-        (3, 3, (1, 2), (2, 1), (2, 2), "float16"),
-        (7, 2, (0, 0), (3, 3), (1, 2), "float64"),
-    ],
-)
-@pytest.mark.parametrize("activation", [True, False])
-def test_relay_cudnn_conv2d_bias_act(
-    n, h, w, ci, co, kh, kw, strides, dilation, padding, groups, dtype, activation
-):
-    data = tvm.relay.var("data", tvm.relay.TensorType((n, ci, h, w), dtype))
-    weight = tvm.relay.var("weight", tvm.relay.TensorType((co, ci // groups, kh, kw), dtype))
-    bias = relay.var("bias", relay.TensorType((co,), dtype))
-    conv2d = relay.op.nn.conv2d(
-        data,
-        weight,
-        groups=groups,
-        channels=co,
-        kernel_size=(kh, kw),
-        strides=strides,
-        dilation=dilation,
-        padding=padding,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-    )
-    out = relay.op.nn.bias_add(conv2d, bias)
-    if activation:
-        out = relay.op.nn.relu(out)
-
-    _verify_cudnn_relay(out)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py
deleted file mode 100644
index bc80323b753e..000000000000
--- a/tests/python/contrib/test_cutlass.py
+++ /dev/null
@@ -1,1288 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import logging
-import math
-import tempfile
-
-import ml_dtypes
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import auto_scheduler, relay
-from tvm.contrib.cudnn import conv_output_shape
-from tvm.contrib.cutlass import (
-    finalize_modules,
-    finalize_modules_vm,
-    has_cutlass,
-    num_cutlass_partitions,
-)
-from tvm.contrib.pickle_memoize import memoize
-from tvm.relay import op as _op
-from tvm.relay.op.contrib.cutlass import partition_for_cutlass
-from tvm.relay.transform import FirstOrderGradient, InferType, ToMixedPrecision
-from tvm.runtime.vm import VirtualMachine
-
-logging.basicConfig(level=logging.INFO)
-
-
-def has_cublas():
-    return tvm.get_global_func("tvm.contrib.cublas.matmul", True) != None
-
-
-def get_ref_rt_mod(mod, params, target="cuda"):
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target, params=params)
-    dev = tvm.device(target, 0)
-    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    return rt_mod, dev
-
-
-def get_ref_vm(mod, params, target="cuda"):
-    with tvm.transform.PassContext(opt_level=3):
-        vm_exec = relay.vm.compile(mod, target=target, params=params)
-        code, lib = vm_exec.save()
-    dev = tvm.device(target, 0)
-    vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
-    return VirtualMachine(vm_exec, dev), dev
-
-
-def get_output(rt_mod, names, inputs):
-    for name, inp in zip(names, inputs):
-        rt_mod.set_input(name, inp)
-    rt_mod.run()
-    return rt_mod.get_output(0).asnumpy()
-
-
-def get_output_vm(vm, names, inputs):
-    params = dict(zip(names, inputs))
-    return vm.invoke("main", **params).numpy()
-
-
-def get_dense_with_shape(
-    data_shape, weight_shape, out_dtype="float16", data_dtype="float16", weight_dtype="float16"
-):
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=weight_shape, dtype=weight_dtype)
-    return relay.nn.dense(data, weight, out_dtype=out_dtype)
-
-
-def get_dense(M, N, K, out_dtype="float16", data_dtype="float16", weight_dtype="float16"):
-    return get_dense_with_shape((M, K), (N, K), out_dtype, data_dtype, weight_dtype)
-
-
-def get_dense_bias(M, N, K, out_dtype="float16"):
-    dense = get_dense(M, N, K, out_dtype=out_dtype)
-    bias = relay.var("bias", shape=(N,), dtype=out_dtype)
-    return relay.nn.bias_add(dense, bias)
-
-
-def get_dense_bias_relu(M, N, K, out_dtype="float16"):
-    return relay.nn.relu(get_dense_bias(M, N, K, out_dtype=out_dtype))
-
-
-def get_dense_bias_gelu(M, N, K, out_dtype="float16"):
-    bias_add = get_dense_bias(M, N, K, out_dtype)
-    mul = bias_add * relay.const((1.0 / math.sqrt(2.0)), dtype=out_dtype)
-    if out_dtype == "float16":
-        erf = relay.cast(relay.op.erf(relay.cast(mul, "float32")), "float16")
-    else:
-        erf = relay.op.erf(mul)
-    mul_half = erf * relay.const(0.5, dtype=out_dtype)
-    add = mul_half + relay.const(0.5, dtype=out_dtype)
-    return add * bias_add
-
-
-def get_batch_matmul_with_shape(x_shape, y_shape, out_dtype="float16"):
-    x = relay.var("x", shape=x_shape, dtype="float16")
-    y = relay.var("y", shape=y_shape, dtype="float16")
-    return relay.nn.batch_matmul(x, y, out_dtype=out_dtype)
-
-
-def get_batch_matmul(batch, M, N, K, out_dtype="float16"):
-    return get_batch_matmul_with_shape((batch, M, K), (batch, N, K), out_dtype="float16")
-
-
-def get_conv2d_nchw(
-    d_shape,
-    w_shape,
-    padding,
-    strides=(1, 1),
-    out_dtype="float16",
-    data_dtype="float16",
-    weight_dtype="float16",
-):
-    data = relay.var("data", shape=d_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=w_shape, dtype=weight_dtype)
-    out_channel = w_shape[0]
-    return relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=w_shape[2:],
-        channels=out_channel,
-        padding=padding,
-        strides=strides,
-        out_dtype=out_dtype,
-    )
-
-
-def get_conv2d_nchw_bias(d_shape, w_shape, padding, out_dtype="float16"):
-    conv2d = get_conv2d_nchw(d_shape, w_shape, padding, out_dtype=out_dtype)
-    bias = relay.var("bias", shape=(w_shape[0],), dtype=out_dtype)
-    return relay.nn.bias_add(conv2d, bias)
-
-
-def silu(x):
-    return x * relay.sigmoid(x)
-
-
-def hardswish(x, out_dtype="float16"):
-    return x * (
-        relay.clip(x + relay.const(3, dtype=out_dtype), a_min=0, a_max=6)
-        / relay.const(6, dtype=out_dtype)
-    )
-
-
-def get_conv2d_nchw_bias_relu(d_shape, w_shape, padding, out_dtype="float16"):
-    return relay.nn.relu(get_conv2d_nchw_bias(d_shape, w_shape, padding, out_dtype=out_dtype))
-
-
-def get_conv2d_nchw_bias_sigmoid(d_shape, w_shape, padding, out_dtype="float16"):
-    return relay.sigmoid(get_conv2d_nchw_bias(d_shape, w_shape, padding, out_dtype=out_dtype))
-
-
-def get_conv2d_nchw_bias_silu(d_shape, w_shape, padding, out_dtype="float16"):
-    conv_out = get_conv2d_nchw_bias(d_shape, w_shape, padding, out_dtype=out_dtype)
-    return silu(conv_out)
-
-
-def get_conv2d_nchw_bias_hardswish(d_shape, w_shape, padding, out_dtype="float16"):
-    conv_out = get_conv2d_nchw_bias(d_shape, w_shape, padding, out_dtype=out_dtype)
-    return hardswish(conv_out, out_dtype)
-
-
-def get_conv2d_nchw_bias_residual(d_shape, w_shape, padding, out_dtype="float16"):
-    data = relay.var("data", shape=d_shape, dtype="float16")
-    weight = relay.var("weight", shape=w_shape, dtype="float16")
-    bias = relay.var("bias", shape=(w_shape[0],), dtype=out_dtype)
-    out_channel = w_shape[0]
-    conv2d = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=w_shape[2:],
-        channels=out_channel,
-        padding=padding,
-        out_dtype=out_dtype,
-    )
-    bias_add = relay.nn.bias_add(conv2d, bias)
-    return bias_add, data
-
-
-def get_conv2d_transpose_nchw(
-    d_shape,
-    w_shape,
-    padding,
-    output_padding,
-    strides,
-    out_dtype="float32",
-    data_dtype="float32",
-    weight_dtype="float32",
-):
-    data = relay.var("data", shape=d_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=w_shape, dtype=weight_dtype)
-    out_channel = w_shape[1]
-    return relay.nn.conv2d_transpose(
-        data=data,
-        weight=weight,
-        kernel_size=w_shape[2:],
-        channels=out_channel,
-        padding=padding,
-        output_padding=output_padding,
-        strides=strides,
-        out_dtype=out_dtype,
-    )
-
-
-def get_conv2d_backward_weight(
-    d_shape,
-    w_shape,
-    o_shape,
-    padding,
-    strides,
-    out_dtype="float32",
-    data_dtype="float32",
-    weight_dtype="float32",
-):
-    grad = relay.var("grad", shape=o_shape, dtype=weight_dtype)
-    data = relay.var("data", shape=d_shape, dtype=data_dtype)
-    out_channel = o_shape[1]
-    return relay.nn.conv2d_backward_weight(
-        grad=grad,
-        data=data,
-        kernel_size=w_shape[2:],
-        channels=out_channel,
-        padding=padding,
-        strides=strides,
-        out_dtype=out_dtype,
-    )
-
-
-def get_dense_transpose_dense(M, N, K, dtype="float16"):
-    """
-    output = nn.dense(_op.transpose(nn.dense(input, weight0), axes=(1, 0)), weight1)
-
-    dense0: [M, K] * [N, K] -> [M, N]
-    transpose: [M, N] -> [N, M]
-    dense1: [N, M] * [K, M] -> [N, K]
-
-    input: [M, K]
-    weight0: [N, K]
-    weight1: [K, M]
-    """
-    input_shape = (M, K)
-    weight0_shape = (N, K)
-    weight1_shape = (K, M)
-
-    input = relay.var("input", shape=input_shape, dtype=dtype)
-    weight0 = relay.var("weight0", shape=weight0_shape, dtype=dtype)
-    weight1 = relay.var("weight1", shape=weight1_shape, dtype=dtype)
-
-    output0 = relay.nn.dense(input, weight0, out_dtype=dtype)
-    input1 = _op.transpose(output0, axes=(1, 0))
-    output = relay.nn.dense(input1, weight1, out_dtype=dtype)
-    return output
-
-
-def convert_conv2d_layout(mod, desired_layouts):
-    with tvm.transform.PassContext(opt_level=3):
-        seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
-        return seq(mod)
-
-
-def get_random_ndarray(shape, dtype):
-    if dtype == "int8":
-        return np.random.randint(-128, 128, shape).astype(dtype)
-    elif dtype == "uint8":
-        return np.random.randint(0, 256, shape).astype(dtype)
-    return np.random.uniform(-1, 1, shape).astype(dtype)
-
-
-def profile_and_build(
-    mod,
-    params,
-    sm,
-    split_k_slices=[1],
-    tmp_dir="./tmp",
-    use_fast_math=False,
-    use_3xtf32=True,
-    use_ansor=False,
-    ansor_tuning=False,
-):
-    logging.info("before partitioning:\n%s", mod)
-    mod = partition_for_cutlass(mod)
-    logging.info("after partitioning:\n%s", mod)
-
-    num_cutlass_partition = num_cutlass_partitions(mod)
-    host = tvm.target.Target("llvm")
-    cuda = tvm.target.Target("cuda", host=host)
-    cutlass = tvm.target.Target(
-        {
-            "kind": "cutlass",
-            "sm": sm,
-            "use_3xtf32": use_3xtf32,
-            "split_k_slices": split_k_slices,
-            "profile_all_alignments": False,
-            "find_first_valid": True,
-            "use_multiprocessing": True,
-            "use_fast_math": use_fast_math,
-            "tmp_dir": tmp_dir,
-        },
-        host=host,
-    )
-
-    if use_ansor:
-        with tvm.transform.PassContext(
-            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
-        ):
-            tasks, task_weights = auto_scheduler.extract_tasks(
-                mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]
-            )
-        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
-            logging.info(
-                f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
-            )
-            logging.info(task.compute_dag)
-
-        with tempfile.NamedTemporaryFile() as fp:
-            log_file = fp.name
-
-            # auto-tuning is disabled by default
-            if ansor_tuning:
-                measure_ctx = auto_scheduler.LocalRPCMeasureContext(
-                    repeat=3, min_repeat_ms=200, timeout=10
-                )
-                tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-                tuner.tune(
-                    auto_scheduler.TuningOptions(
-                        num_measure_trials=100,
-                        runner=measure_ctx.runner,
-                        measure_callbacks=[
-                            auto_scheduler.RecordToFile(log_file),
-                        ],
-                    )
-                )
-
-            with auto_scheduler.ApplyHistoryBest(log_file):
-                with tvm.transform.PassContext(
-                    opt_level=3,
-                    config={"relay.backend.use_auto_scheduler": True},
-                ):
-                    lib = relay.build(
-                        mod,
-                        target=cuda,
-                        target_host=host,
-                        params=params,
-                    )
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=[cuda, cutlass], params=params)
-    lib = finalize_modules(lib, "compile.so", tmp_dir)
-    dev = tvm.device("cuda", 0)
-    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    return rt_mod, dev, num_cutlass_partition
-
-
-def profile_and_build_vm(
-    mod,
-    params,
-    sm,
-    split_k_slices=[1],
-    tmp_dir="./tmp",
-    use_fast_math=False,
-    use_3xtf32=True,
-):
-    mod = partition_for_cutlass(mod)
-    num_cutlass_partition = num_cutlass_partitions(mod)
-    host = tvm.target.Target("llvm")
-    cuda = tvm.target.Target("cuda", host=host)
-    cutlass = tvm.target.Target(
-        {
-            "kind": "cutlass",
-            "sm": sm,
-            "use_3xtf32": use_3xtf32,
-            "split_k_slices": split_k_slices,
-            "profile_all_alignments": False,
-            "find_first_valid": True,
-            "use_multiprocessing": True,
-            "use_fast_math": use_fast_math,
-            "tmp_dir": tmp_dir,
-        },
-        host=host,
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        vm_exec = relay.vm.compile(mod, target=[cuda, cutlass], params=params)
-    vm_exec = finalize_modules_vm(vm_exec, "compile.so", tmp_dir)
-    dev = tvm.device("cuda", 0)
-    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
-
-
-def verify_dense(
-    func,
-    M,
-    N,
-    K,
-    ref_target="cuda",
-    sm=80,
-    atol=1e-5,
-    rtol=1e-5,
-    run_benchmark=False,
-    data_dtype="float16",
-    weight_dtype="float16",
-    use_3xtf32=True,
-):
-    assert has_cutlass()
-    if sm < 80 and data_dtype == "float32":
-        return
-
-    mod = tvm.IRModule.from_expr(func)
-    typ = relay.transform.InferType()(mod)["main"].body.checked_type
-    out_dtype = typ.dtype
-    use_vm = any(isinstance(s, tvm.tir.Any) for s in typ.shape)
-    np_data = get_random_ndarray((M, K), data_dtype)
-    np_weight = get_random_ndarray((N, K), weight_dtype)
-    np_bias = get_random_ndarray((N,), out_dtype)
-
-    params = {"weight": np_weight, "bias": np_bias}
-
-    if use_vm:
-        if ref_target == "cuda" and out_dtype == "float16":
-            # Uncomment "return" below to see the accuracy difference of static vs dynamic TVM native fp16 dense
-            # The static one can use a tensorcore schedule, but the dynamic one cannot
-            rt_mod, dev = get_ref_vm(tvm.IRModule.from_expr(get_dense(M, N, K)), params)
-            num_partition = 1
-            logging.warning(
-                "The reference fp16 dense with dynamic shape using fp16 accumulation has accuracy issues."
-            )
-            return
-        else:
-            rt_mod, dev, num_partition = profile_and_build_vm(
-                mod, params, sm, use_3xtf32=use_3xtf32
-            )
-
-        rt_mod_ref, dev = get_ref_vm(mod, params, target=ref_target)
-        x = tvm.nd.array(np_data, device=dev)
-        out = get_output_vm(rt_mod, ["data"], [x])
-        ref_out = get_output_vm(rt_mod_ref, ["data"], [x])
-    else:
-        rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
-        rt_mod, dev, num_partition = profile_and_build(mod, params, sm, use_3xtf32=use_3xtf32)
-        x = tvm.nd.array(np_data, device=dev)
-        out = get_output(rt_mod, ["data"], [x])
-        ref_out = get_output(rt_mod_ref, ["data"], [x])
-
-    assert num_partition > 0
-    np.testing.assert_allclose(out, ref_out, atol=atol, rtol=rtol)
-
-    if run_benchmark:
-        print("CUTLASS:", rt_mod.benchmark(dev, number=1, repeat=600))
-        print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600))
-
-
-def verify_batch_matmul(
-    func, batch, M, N, K, ref_target="cuda", sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-):
-    assert has_cutlass()
-    mod = tvm.IRModule.from_expr(func)
-    typ = relay.transform.InferType()(mod)["main"].body.checked_type
-    use_vm = any(isinstance(s, tvm.tir.Any) for s in typ.shape)
-    x_np = np.random.uniform(-1, 1, (batch, M, K)).astype("float16")
-    y_np = np.random.uniform(-1, 1, (batch, N, K)).astype("float16")
-
-    if use_vm:
-        rt_mod, dev, num_partition = profile_and_build_vm(mod, {}, sm)
-        rt_mod_ref, dev = get_ref_vm(mod, {}, target=ref_target)
-        assert num_partition > 0
-        x = tvm.nd.array(x_np, device=dev)
-        y = tvm.nd.array(y_np, device=dev)
-        out = get_output_vm(rt_mod, ["x", "y"], [x, y])
-        ref_out = get_output_vm(rt_mod_ref, ["x", "y"], [x, y])
-    else:
-        rt_mod, dev, num_partition = profile_and_build(mod, {}, sm)
-        rt_mod_ref, dev = get_ref_rt_mod(mod, {})
-        assert num_partition > 0
-
-        x = tvm.nd.array(x_np, device=dev)
-        y = tvm.nd.array(y_np, device=dev)
-        out = get_output(rt_mod, ["x", "y"], [x, y])
-        ref_out = get_output(rt_mod_ref, ["x", "y"], [x, y])
-
-    np.testing.assert_allclose(out, ref_out, atol=atol, rtol=rtol)
-
-    if run_benchmark:
-        print("CUTLASS:", rt_mod.benchmark(dev, number=1, repeat=600))
-        print("TVM Tensorcore (no tuning):", rt_mod_ref.benchmark(dev, number=1, repeat=600))
-
-
-M = 96
-N = 64
-K = 64
-
-
-@tvm.testing.requires_cutlass
-def test_dense():
-    verify_dense(get_dense(M, N, K), M, N, K)
-    verify_dense(get_dense(M, N, K, out_dtype="float32"), M, N, K)
-    # Test align1 case
-    verify_dense(get_dense_bias(M, N + 1, K), M, N + 1, K)
-    # int8
-    verify_dense(
-        get_dense(M, N, K, "int32", "int8", "int8"), M, N, K, data_dtype="int8", weight_dtype="int8"
-    )
-
-    dense_fp32 = get_dense(M, N, K, "float32", "float32", "float32")
-    # fp32
-    verify_dense(
-        dense_fp32,
-        M,
-        N,
-        K,
-        data_dtype="float32",
-        weight_dtype="float32",
-        use_3xtf32=False,
-        sm=75,
-    )
-    # tf32
-    verify_dense(
-        dense_fp32,
-        M,
-        N,
-        K,
-        data_dtype="float32",
-        weight_dtype="float32",
-        use_3xtf32=False,
-        atol=1e-2,
-        rtol=1e-2,
-    )
-    # 3xtf32
-    verify_dense(
-        dense_fp32,
-        M,
-        N,
-        K,
-        data_dtype="float32",
-        weight_dtype="float32",
-    )
-
-
-@tvm.testing.requires_cutlass
-def test_dense_bias():
-    verify_dense(get_dense_bias(M, N, K), M, N, K)
-    verify_dense(get_dense_bias(M, N, K, out_dtype="float32"), M, N, K)
-
-
-@tvm.testing.requires_cutlass
-def test_dense_bias_relu():
-    verify_dense(get_dense_bias_relu(M, N, K), M, N, K)
-    verify_dense(get_dense_bias_relu(M, N, K, out_dtype="float32"), M, N, K)
-
-
-@tvm.testing.requires_cutlass
-def test_dense_bias_gelu():
-    verify_dense(get_dense_bias_gelu(M, N, K), M, N, K, atol=1e-3, rtol=1e-3)
-    verify_dense(get_dense_bias_gelu(M, N, K, out_dtype="float32"), M, N, K, atol=1e-3, rtol=1e-3)
-
-
-@tvm.testing.requires_cutlass
-def test_dense_dynamic():
-    data_shape = (relay.Any(), K)
-    weight_shape = (relay.Any(), K)
-
-    if has_cublas():
-        # TVM native fp16 dense (without tensorcore), using fp16 accum, seems to have accuracy issues
-        # Use cublas as a reference
-
-        verify_dense(
-            get_dense_with_shape(data_shape, weight_shape),
-            M,
-            N,
-            K,
-            ref_target="cuda -libs=cublas",
-        )
-
-    verify_dense(
-        get_dense_with_shape(data_shape, weight_shape, out_dtype="float32"),
-        M,
-        N,
-        K,
-        atol=1e-4,
-        rtol=1e-4,
-    )
-
-
-@tvm.testing.requires_cutlass
-def test_batch_matmul():
-    batch = 8
-    verify_batch_matmul(get_batch_matmul(batch, M, N, K), batch, M, N, K)
-    verify_batch_matmul(get_batch_matmul(batch, M, N, K, out_dtype="float32"), batch, M, N, K)
-
-    if has_cublas():
-        # Test dynamic shape batch_matmul
-        # AutoTVM does not seem to support it
-        x_shape = (relay.Any(), relay.Any(), K)
-        y_shape = (relay.Any(), relay.Any(), K)
-
-        verify_batch_matmul(
-            get_batch_matmul_with_shape(x_shape, y_shape),
-            batch,
-            M,
-            N,
-            K,
-            ref_target="cuda -libs=cublas",
-        )
-
-
-def verify_conv2d_common(
-    expr_nchw,  # can be dynamic batch
-    expr_ref,  # always static batch
-    input_names,
-    inputs,
-    params,
-    sm=80,
-    split_k_slices=[1],
-    atol=1e-5,
-    rtol=1e-5,
-    use_cudnn_ref=False,
-    run_benchmark=False,
-    use_fast_math=False,
-    ref_target="cuda",
-    use_vm=False,
-):
-    assert has_cutlass()
-    if sm < 80 and inputs[0].dtype == "float32":
-        return
-
-    mod_nchw = tvm.IRModule.from_expr(expr_nchw)
-    mod_ref = tvm.IRModule.from_expr(expr_ref)
-
-    if use_vm:
-        profile_and_build_func = profile_and_build_vm
-        get_output_func = get_output_vm
-        ref_build_func = get_ref_vm
-    else:
-        profile_and_build_func = profile_and_build
-        get_output_func = get_output
-        ref_build_func = get_ref_rt_mod
-
-    mod_weight_ohwi = convert_conv2d_layout(
-        mod_nchw,
-        {
-            "nn.conv2d": ["NHWC", "OHWI"],
-            "nn.conv2d_transpose": ["NHWC", "IHWO"],
-            "nn.conv2d_backward_weight": ["NHWC", "OHWI"],
-        },
-    )
-
-    rt_mod, _, num_cutlass_partition = profile_and_build_func(
-        mod_weight_ohwi, params, sm, split_k_slices, use_fast_math=use_fast_math
-    )
-    out = get_output_func(rt_mod, input_names, inputs)
-
-    assert num_cutlass_partition > 0
-
-    if use_cudnn_ref:
-        rt_mod_ref, dev = ref_build_func(
-            convert_conv2d_layout(mod_ref, {"nn.conv2d": ["NHWC", "OHWI"]}),
-            params,
-            target="cuda -libs=cudnn",
-        )
-    else:
-        rt_mod_ref, dev = ref_build_func(
-            convert_conv2d_layout(mod_ref, {"nn.conv2d": ["NHWC", "HWIO"]}),
-            params,
-            target=ref_target,
-        )
-
-    ref_out = get_output_func(rt_mod_ref, input_names, inputs)
-
-    if run_benchmark:
-        print("CUTLASS:", rt_mod.benchmark(dev, number=1, repeat=600))
-        print("TVM Tensorcore (no tuning):", rt_mod_ref.benchmark(dev, number=1, repeat=600))
-
-    np.testing.assert_allclose(out, ref_out, atol=atol, rtol=rtol)
-
-
-def verify_conv2d(
-    expr_nchw,  # can be dynamic batch
-    expr_ref,  # always static batch
-    d_shape,
-    w_shape,
-    sm=80,
-    atol=1e-5,
-    rtol=1e-5,
-    use_cudnn_ref=False,
-    run_benchmark=False,
-    use_fast_math=False,
-    data_dtype="float16",
-    weight_dtype="float16",
-    ref_target="cuda",
-    use_vm=False,
-):
-    mod_nchw = tvm.IRModule.from_expr(expr_nchw)
-    typ = relay.transform.InferType()(mod_nchw)["main"].body.checked_type
-
-    use_vm = use_vm or any(isinstance(s, tvm.tir.Any) for s in typ.shape)
-
-    np_data = get_random_ndarray(d_shape, data_dtype)
-    np_weight = get_random_ndarray(w_shape, weight_dtype)
-    np_bias = get_random_ndarray((w_shape[0],), typ.dtype)
-    params = {"weight": np_weight, "bias": np_bias}
-
-    split_k_slices = [1]
-
-    return verify_conv2d_common(
-        expr_nchw,
-        expr_ref,
-        ["data"],
-        [np_data],
-        params,
-        sm,
-        split_k_slices,
-        atol,
-        rtol,
-        use_cudnn_ref,
-        run_benchmark,
-        use_fast_math,
-        ref_target,
-        use_vm,
-    )
-
-
-def verify_conv2d_backward_weight(
-    expr_nchw,  # can be dynamic batch
-    expr_ref,  # always static batch
-    grad_shape,
-    data_shape,
-    sm=80,
-    split_k_slices=[1],
-    atol=1e-5,
-    rtol=1e-5,
-    use_cudnn_ref=False,
-    use_fast_math=False,
-    grad_dtype="float16",
-    data_dtype="float16",
-    ref_target="cuda",
-    use_vm=False,
-):
-    np_grad = get_random_ndarray(grad_shape, grad_dtype)
-    np_data = get_random_ndarray(data_shape, data_dtype)
-    params = {}
-    input_names = ["grad", "data"]
-    return verify_conv2d_common(
-        expr_nchw,
-        expr_ref,
-        input_names,
-        [np_grad, np_data],
-        params,
-        sm,
-        split_k_slices,
-        atol,
-        rtol,
-        use_cudnn_ref,
-        False,
-        use_fast_math,
-        ref_target,
-        use_vm,
-    )
-
-
-@tvm.testing.requires_cutlass
-def test_conv2d():
-    d_shape = (16, 16, 32, 32)
-    w_shape = (32, 16, 3, 3)
-    padding = (1, 1)
-
-    for IC in [3, 16]:
-        d_shape = (16, IC, 32, 32)
-        w_shape = (32, IC, 3, 3)
-        mod_nchw = get_conv2d_nchw(d_shape, w_shape, padding)
-
-        verify_conv2d(
-            mod_nchw,
-            mod_nchw,
-            d_shape,
-            w_shape,
-            sm=80,
-            atol=1e-5,
-            rtol=1e-5,
-            use_cudnn_ref=(IC == 3),  # The autotvm kernel has an accuracy issue with IC == 3 case
-            run_benchmark=False,
-        )
-
-    dyn_batch_shape = (relay.Any(),) + d_shape[1:]
-    mod_nchw = get_conv2d_nchw(d_shape, w_shape, padding)
-    mod_dyn = get_conv2d_nchw(dyn_batch_shape, w_shape, padding)
-
-    verify_conv2d(
-        mod_dyn, mod_nchw, d_shape, w_shape, sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-    )
-
-    for data_dtype, weight_dtype, out_dtype in [
-        ("float32", "float32", "float32"),  # 3xtf32
-        ("int8", "int8", "int32"),
-        ("uint8", "int8", "int32"),
-    ]:
-        expr = get_conv2d_nchw(
-            d_shape,
-            w_shape,
-            padding,
-            out_dtype=out_dtype,
-            data_dtype=data_dtype,
-            weight_dtype=weight_dtype,
-        )
-
-        verify_conv2d(
-            expr,
-            expr,
-            d_shape,
-            w_shape,
-            sm=80,
-            atol=1e-5,
-            rtol=1e-5,
-            run_benchmark=False,
-            data_dtype=data_dtype,
-            weight_dtype=weight_dtype,
-            ref_target="llvm",
-        )
-
-    # align1 + int8 case
-    d_shape = (16, 3, 32, 32)
-    w_shape = (32, 3, 3, 3)
-    mod_nchw = get_conv2d_nchw(
-        d_shape, w_shape, padding, out_dtype="int32", data_dtype="uint8", weight_dtype="int8"
-    )
-
-    verify_conv2d(
-        mod_nchw,
-        mod_nchw,
-        d_shape,
-        w_shape,
-        sm=80,
-        atol=1e-5,
-        rtol=1e-5,
-        ref_target="llvm",
-        data_dtype="uint8",
-        weight_dtype="int8",
-    )
-
-
-@tvm.testing.requires_cutlass
-def test_conv2d_fusion():
-    d_shape = (16, 16, 32, 32)
-    w_shape = (32, 16, 3, 3)
-    padding = (1, 1)
-
-    mod_nchw = get_conv2d_nchw_bias(d_shape, w_shape, padding)
-    verify_conv2d(
-        mod_nchw, mod_nchw, d_shape, w_shape, sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-    )
-
-    mod_nchw = get_conv2d_nchw_bias_relu(d_shape, w_shape, padding)
-    verify_conv2d(
-        mod_nchw, mod_nchw, d_shape, w_shape, sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-    )
-
-    mod_nchw = get_conv2d_nchw_bias_sigmoid(d_shape, w_shape, padding, out_dtype="float16")
-    verify_conv2d(
-        mod_nchw, mod_nchw, d_shape, w_shape, sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-    )
-    verify_conv2d(
-        mod_nchw,
-        mod_nchw,
-        d_shape,
-        w_shape,
-        sm=80,
-        atol=1e-3,
-        rtol=1e-3,
-        run_benchmark=False,
-        use_fast_math=True,
-    )
-
-    mod_nchw = get_conv2d_nchw_bias_sigmoid(d_shape, w_shape, padding, out_dtype="float32")
-    verify_conv2d(
-        mod_nchw, mod_nchw, d_shape, w_shape, sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-    )
-
-    mod_nchw = get_conv2d_nchw_bias_silu(d_shape, w_shape, padding, out_dtype="float32")
-    verify_conv2d(
-        mod_nchw, mod_nchw, d_shape, w_shape, sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False
-    )
-
-    mod_nchw = get_conv2d_nchw_bias_hardswish(d_shape, w_shape, padding, out_dtype="float16")
-    verify_conv2d(
-        mod_nchw, mod_nchw, d_shape, w_shape, sm=80, atol=5e-2, rtol=5e-2, run_benchmark=False
-    )
-
-
-@tvm.testing.requires_cutlass
-def test_conv2d_residual_block():
-    d_shape = (16, 16, 32, 32)
-    w_shape = (16, 16, 3, 3)
-    padding = (1, 1)
-
-    bias_add, residual_input = get_conv2d_nchw_bias_residual(d_shape, w_shape, padding)
-
-    for func, tol in [
-        (relay.nn.relu(bias_add + residual_input), 1e-5),
-        (relay.nn.relu(bias_add) + residual_input, 1e-5),
-        (relay.sigmoid(bias_add) * residual_input, 1e-5),
-        (relay.nn.relu(silu(bias_add) * residual_input), 1e-5),
-        # HardSwish requires higher tolerance since vectoring the residual block epilogue
-        # in cutlass.
-        # TODO(masahi): Invesitigate this issue
-        (relay.nn.relu(hardswish(bias_add) + residual_input), 5e-2),
-    ]:
-        verify_conv2d(func, func, d_shape, w_shape, sm=80, atol=tol, rtol=tol, run_benchmark=False)
-
-
-@tvm.testing.requires_cutlass
-def test_conv2d_transpose():
-    OC = 8
-    IC = 16
-    d_shape = (16, IC, 32, 32)
-    w_shape = (OC, IC, 3, 3)
-    padding = (1, 1)
-    dtype = "float32"
-
-    for strides in [(1, 1), (2, 2)]:
-        o_shape = conv_output_shape(
-            0, padding, strides, (1, 1), d_shape, (OC, IC, 3, 3), "float32", "float32"
-        )
-        output_padding = (1, 1) if strides[0] > 1 else (0, 0)
-        mod_nchw = get_conv2d_transpose_nchw(
-            o_shape,
-            w_shape,
-            padding,
-            output_padding,
-            strides,
-            out_dtype=dtype,
-            data_dtype=dtype,
-            weight_dtype=dtype,
-        )
-
-        verify_conv2d(
-            mod_nchw,
-            mod_nchw,
-            o_shape,
-            w_shape,
-            sm=80,
-            atol=1e-3,
-            rtol=1e-3,
-            use_cudnn_ref=False,
-            run_benchmark=False,
-            data_dtype=dtype,
-            weight_dtype=dtype,
-        )
-
-
-@tvm.testing.requires_cutlass
-def test_conv2d_backward_weight():
-    OC = 8
-    IC = 16
-    d_shape = (16, IC, 32, 32)
-    w_shape = (OC, IC, 3, 3)
-    dtype = "float16"
-
-    for strides in [(1, 1), (2, 2)]:
-        o_shape = (16, OC, 32 // strides[0], 32 // strides[1])
-        padding = (1, 1)
-
-        mod_nchw = get_conv2d_backward_weight(
-            d_shape,
-            w_shape,
-            o_shape,
-            padding,
-            strides,
-            out_dtype="float32",
-            data_dtype=dtype,
-            weight_dtype=dtype,
-        )
-
-        for split_k_slices in [1, 8]:
-            verify_conv2d_backward_weight(
-                mod_nchw,
-                mod_nchw,
-                o_shape,
-                d_shape,
-                sm=80,
-                split_k_slices=[split_k_slices],
-                atol=5e-3,
-                rtol=5e-3,
-                use_cudnn_ref=False,
-                grad_dtype=dtype,
-                data_dtype=dtype,
-            )
-
-
-@tvm.testing.requires_cutlass
-def test_conv2d_bwd():
-    IC = 16
-    OC = 8
-    dshape = (16, IC, 32, 32)
-    wshape = (OC, IC, 3, 3)
-    padding = (0, 0)
-    strides = (1, 1)
-
-    conv = get_conv2d_nchw(
-        dshape,
-        wshape,
-        padding,
-        strides=strides,
-        out_dtype="float32",
-        data_dtype="float32",
-        weight_dtype="float32",
-    )
-    fwd_mod = InferType()(tvm.IRModule.from_expr(conv))
-
-    # Note: large difference in tvm and cutlass Wgrad results if use fp16.
-    # Cutlass wgrad uses fp32 accumulation even if the output is fp16.
-    use_fp16 = False
-    verify_dgrad = False  # False to verify wgrad
-    tol = 1e-5 if verify_dgrad else 1e-4  # Wgrad slightly less accurate
-
-    if use_fp16:
-        fwd_mod = ToMixedPrecision("float16")(fwd_mod)
-
-    fwd_bwd_func = FirstOrderGradient()(fwd_mod)["main"]
-
-    bwd_func = relay.Function(
-        fwd_bwd_func.params,
-        relay.TupleGetItem(relay.TupleGetItem(fwd_bwd_func.body, 1), 0 if verify_dgrad else 1),
-    )
-
-    verify_conv2d(
-        bwd_func,
-        bwd_func,
-        dshape,
-        wshape,
-        sm=80,
-        atol=1e-2 if use_fp16 else tol,
-        rtol=1e-2 if use_fp16 else tol,
-        use_cudnn_ref=False,
-        data_dtype="float32",
-        weight_dtype="float32",
-        use_vm=True,
-    )
-
-
-def verify_dense_transpose_dense(
-    func,
-    M,
-    N,
-    K,
-    ref_target="cuda",
-    sm=80,
-    atol=1e-5,
-    rtol=1e-5,
-    run_benchmark=False,
-    dtype="float16",
-    use_3xtf32=True,
-):
-    assert has_cutlass()
-    if sm < 80 and dtype == "float32":
-        return
-
-    mod = tvm.IRModule.from_expr(func)
-    typ = relay.transform.InferType()(mod)["main"].body.checked_type
-    np_data = get_random_ndarray((M, K), dtype)
-    np_weight0 = get_random_ndarray((N, K), dtype)
-    np_weight1 = get_random_ndarray((K, M), dtype)
-
-    params = {"weight0": np_weight0, "weight1": np_weight1}
-
-    rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
-    cutlass_rt_mod, dev, num_partition = profile_and_build(
-        mod,
-        params,
-        sm,
-        use_3xtf32=use_3xtf32,
-        use_ansor=False,
-    )
-    cutlass_ansor_rt_mod, dev, num_partition = profile_and_build(
-        mod,
-        params,
-        sm,
-        use_3xtf32=use_3xtf32,
-        use_ansor=True,
-    )
-    x = tvm.nd.array(np_data, device=dev)
-    cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])
-    cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x])
-    ref_out = get_output(rt_mod_ref, ["input"], [x])
-
-    assert num_partition > 0
-    np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol)
-    np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol)
-
-    if run_benchmark:
-        print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600))
-        print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600))
-        print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600))
-
-
-@tvm.testing.requires_cutlass
-def test_dense_transpose_dense():
-    verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K)
-
-
-def verify_group_gemm(
-    func_name, M, N, K, num_groups, x_dtype, weight_dtype, out_dtype, use_scale, rtol, atol
-):
-    group_gemm_func = tvm.get_global_func(func_name, allow_missing=True)
-    if group_gemm_func is None:
-        print(f"Skipped as {func_name} is not available")
-        return
-
-    @memoize("tvm.contrib.cutlass.test_group_gemm_sm90")
-    def get_ref_data():
-        assert M % num_groups == 0
-        M_per_group = M // num_groups
-        a_np = get_random_ndarray((M, K), "float16")
-        b_np = get_random_ndarray((num_groups, N, K), "float16")
-        indptr_np = np.arange(1, num_groups + 1).astype("int64") * M_per_group
-        c_np = np.concatenate(
-            [a_np[i * M_per_group : (i + 1) * M_per_group] @ b_np[i].T for i in range(num_groups)],
-            axis=0,
-        )
-        return a_np, b_np, indptr_np, c_np
-
-    def to_numpy_dtype(dtype):
-        mapping = {"e5m2_float8": ml_dtypes.float8_e5m2, "e4m3_float8": ml_dtypes.float8_e4m3fn}
-        return mapping.get(dtype, dtype)
-
-    a_np, b_np, indptr_np, c_np = get_ref_data()
-    dev = tvm.cuda(0)
-    a_nd = tvm.nd.array(a_np.astype(to_numpy_dtype(x_dtype)), device=dev)
-    b_nd = tvm.nd.array(b_np.astype(to_numpy_dtype(weight_dtype)), device=dev)
-    c_nd = tvm.nd.empty(c_np.shape, dtype=out_dtype, device=dev)
-    indptr_nd = tvm.nd.array(indptr_np, device=dev)
-    workspace = tvm.nd.empty((4096 * 1024,), dtype="uint8", device=dev)
-    if use_scale:
-        scale = tvm.nd.array(np.array([1.0], dtype="float32"), device=dev)
-        group_gemm_func(a_nd, b_nd, indptr_nd, workspace, scale, c_nd)
-    else:
-        group_gemm_func(a_nd, b_nd, indptr_nd, workspace, c_nd)
-    tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=rtol, atol=atol)
-
-
-@tvm.testing.requires_cutlass
-def test_group_gemm_sm90():
-    verify_group_gemm(
-        "cutlass.group_gemm_fp16_sm90",
-        8,
-        128,
-        128,
-        4,
-        "float16",
-        "float16",
-        "float16",
-        False,
-        rtol=1e-3,
-        atol=1e-3,
-    )
-    verify_group_gemm(
-        "cutlass.group_gemm_e5m2_e5m2_fp16",
-        8,
-        16,
-        16,
-        4,
-        "e5m2_float8",
-        "e5m2_float8",
-        "float16",
-        True,
-        rtol=1e-1,
-        atol=1,
-    )
-    verify_group_gemm(
-        "cutlass.group_gemm_e4m3_e4m3_fp16",
-        8,
-        16,
-        16,
-        4,
-        "e4m3_float8",
-        "e4m3_float8",
-        "float16",
-        True,
-        rtol=1e-1,
-        atol=1,
-    )
-    verify_group_gemm(
-        "cutlass.group_gemm_e5m2_e4m3_fp16",
-        8,
-        16,
-        16,
-        4,
-        "e5m2_float8",
-        "e4m3_float8",
-        "float16",
-        True,
-        rtol=1e-1,
-        atol=1,
-    )
-
-
-def verify_gemm(func_name, M, N, K, x_dtype, weight_dtype, out_dtype, scale_value, rtol, atol):
-    gemm_func = tvm.get_global_func(func_name, allow_missing=True)
-    if gemm_func is None:
-        print(f"Skipped as {func_name} is not available")
-        return
-
-    @memoize("tvm.contrib.cutlass.test_fp8_gemm_sm90")
-    def get_ref_data():
-        a_np = get_random_ndarray((M, K), "float16")
-        b_np = get_random_ndarray((N, K), "float16")
-        c_np = a_np @ b_np.T * scale_value
-        return a_np, b_np, c_np
-
-    def to_numpy_dtype(dtype):
-        mapping = {"e5m2_float8": ml_dtypes.float8_e5m2, "e4m3_float8": ml_dtypes.float8_e4m3fn}
-        return mapping.get(dtype, dtype)
-
-    a_np, b_np, c_np = get_ref_data()
-    dev = tvm.cuda(0)
-    a_nd = tvm.nd.array(a_np.astype(to_numpy_dtype(x_dtype)), device=dev)
-    b_nd = tvm.nd.array(b_np.astype(to_numpy_dtype(weight_dtype)), device=dev)
-    c_nd = tvm.nd.empty(c_np.shape, dtype=out_dtype, device=dev)
-    workspace = tvm.nd.empty((4096 * 1024,), dtype="uint8", device=dev)
-    scale = tvm.nd.array(np.array([scale_value], dtype="float32"), device=dev)
-    gemm_func(a_nd, b_nd, workspace, scale, c_nd)
-    tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=rtol, atol=atol)
-
-
-@tvm.testing.requires_cutlass
-def test_fp8_gemm_sm90():
-    verify_gemm(
-        "cutlass.gemm_e5m2_e5m2_fp16",
-        8,
-        16,
-        16,
-        "e5m2_float8",
-        "e5m2_float8",
-        "float16",
-        1.5,
-        rtol=1e-1,
-        atol=1,
-    )
-    verify_gemm(
-        "cutlass.gemm_e4m3_e4m3_fp16",
-        8,
-        16,
-        16,
-        "e4m3_float8",
-        "e4m3_float8",
-        "float16",
-        1.5,
-        rtol=1e-1,
-        atol=1,
-    )
-    verify_gemm(
-        "cutlass.gemm_e4m3_e4m3_fp16",
-        32,
-        16,
-        16,
-        "e4m3_float8",
-        "e4m3_float8",
-        "float16",
-        1.5,
-        rtol=1e-1,
-        atol=1,
-    )
-    verify_gemm(
-        "cutlass.gemm_e5m2_e4m3_fp16",
-        8,
-        16,
-        16,
-        "e5m2_float8",
-        "e4m3_float8",
-        "float16",
-        1.5,
-        rtol=1e-1,
-        atol=1,
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py
deleted file mode 100644
index c45149fc5f1e..000000000000
--- a/tests/python/contrib/test_dnnl.py
+++ /dev/null
@@ -1,1829 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import itertools
-import numpy as np
-import sys
-import subprocess
-import math
-import collections
-
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-from tvm.relay.op.contrib import dnnl
-import tvm.testing
-
-
-has_dnnl_codegen = pytest.mark.skipif(
-    not tvm.get_global_func("relay.ext.dnnl", True), reason="DNNL codegen not available"
-)
-
-run_module = tvm.testing.parameter(
-    pytest.param(False, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]),
-    pytest.param(True, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]),
-    ids=["compile", "run"],
-)
-
-_bf16_supported = None
-
-
-def bf16_supported():
-    global _bf16_supported
-    if _bf16_supported is None:
-        _bf16_supported = False
-        if sys.platform.startswith("darwin"):
-            cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode()
-            for line in cpu_info.split("\n"):
-                if line.startswith("hw.optional.avx512f"):
-                    _bf16_supported = bool(int(line.split(":", 1)[1]))
-        elif sys.platform.startswith("linux"):
-            _bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read()
-    return _bf16_supported
-
-
-def partition_for_dnnl(mod, params=None, alter_layout=True, prune_subgraphs=True):
-    """Partition the graph greedily offloading supported operators to DNNL.
-
-    Parameters
-    ----------
-    mod : Module
-        The module to run passes on.
-    params : Optional[Dict[str, NDArray]]
-        Constant input parameters.
-    Returns
-    -------
-    mod : Module
-        Annotated and partitioned module.
-    """
-    if params:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-    with TempOpAttr("nn.conv2d", "FTVMLegalize", dnnl.legalize_group_conv):
-        with TempOpAttr("nn.conv2d_transpose", "FTVMLegalize", dnnl.legalize_group_conv):
-            seq = tvm.transform.Sequential(
-                [
-                    transform.CanonicalizeOps(),
-                    transform.InferType(),
-                    transform.SimplifyInference(),
-                    transform.FoldConstant(),
-                    transform.FoldScaleAxis(),
-                    # fold consecutive add ops to simplify pattern `conv2d-bias_add-bn-relu`
-                    transform.SimplifyExpr(),
-                    transform.FoldConstant(),
-                    # alter group conv /conv_transpose layout to `GOIHW` / `GIOHW`
-                    transform.Legalize(),
-                    transform.FoldConstant(),
-                ]
-            )
-            with tvm.transform.PassContext(opt_level=3):
-                mod = seq(mod)
-    if alter_layout:
-        with TempOpAttr("nn.conv1d", "FTVMAlterOpLayout", dnnl.alter_conv):
-            with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", dnnl.alter_conv):
-                with TempOpAttr("nn.conv3d", "FTVMAlterOpLayout", dnnl.alter_conv):
-                    with TempOpAttr(
-                        "nn.conv2d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose
-                    ):
-                        with TempOpAttr(
-                            "nn.conv3d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose
-                        ):
-                            alter_layout_seq = tvm.transform.Sequential(
-                                [
-                                    transform.AlterOpLayout(),
-                                    transform.FoldConstant(),
-                                ]
-                            )
-                            with tvm.transform.PassContext(opt_level=3):
-                                mod = alter_layout_seq(mod)
-
-    mod = dnnl.rewrite_layer_norm(mod)
-    mod = dnnl.rewrite_dense_bias_gelu_reshape_last(mod)
-    mod = dnnl.legalize_qnn_for_dnnl(mod)
-
-    byoc_seq = tvm.transform.Sequential(
-        [
-            transform.MergeComposite(dnnl.pattern_table()),
-            transform.AnnotateTarget("dnnl"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    with tvm.transform.PassContext(opt_level=3):
-        mod = byoc_seq(mod)
-        if prune_subgraphs:
-            mod = dnnl.prune_dnnl_subgraphs(mod)
-    return mod
-
-
-def vmobj_to_list(o):
-    if isinstance(o, tvm.nd.NDArray):
-        o_np = o.numpy()
-        if o_np.dtype == np.uint16:
-            o_np = np.left_shift(o_np.astype("uint32"), 16).view("<f4")
-        return [o_np]
-    elif isinstance(o, tvm.runtime.container.ADT) or isinstance(o, list):
-        return [vmobj_to_list(f) for f in o]
-    else:
-        raise RuntimeError("Unknown object type: %s" % type(o))
-
-
-def assert_result_dict_holds(result_dict):
-    for k1, k2 in itertools.combinations(result_dict, 2):
-        res1 = vmobj_to_list(result_dict[k1])
-        res2 = vmobj_to_list(result_dict[k2])
-        for r1, r2 in zip(res1, res2):
-            # ignore the accuracy checking if only one bf16 result presents
-            if ("bf16" in k1) == ("bf16" in k2):
-                tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3)
-
-
-def check_dnnl_used(mod, subgraph_num=None):
-    num_dnnl_subgraphs = sum([1 if "dnnl" in gv.name_hint else 0 for gv in mod.get_global_vars()])
-    if subgraph_num:
-        assert num_dnnl_subgraphs == subgraph_num
-    else:
-        assert num_dnnl_subgraphs >= 1
-
-
-def run_and_verify(mod, input, params, target, run_module, subgraph_num=None, test_bf16=True):
-    dev = tvm.cpu()
-    result_dict = dict()
-    for mode in ["graph", "vm"]:
-        configs = [
-            (False, False, False),
-            (True, False, False),
-            (True, True, False),
-        ]
-        if test_bf16 and bf16_supported():
-            configs += [(True, False, True), (True, True, True)]
-
-        for use_dnnl, alter_layout, use_bf16 in configs:
-            result_key = (
-                mode
-                + ("_dnnl" if use_dnnl else "")
-                + ("_layout" if alter_layout else "")
-                + ("_bf16" if use_bf16 else "_fp32")
-            )
-            processed_mod = mod
-            if use_bf16:
-                processed_mod = relay.transform.ToMixedPrecision("bfloat16")(processed_mod)
-                if tvm.ir.structural_equal(processed_mod, mod):
-                    print("can not convert to bfloat16, skipping...")
-                    continue
-            if use_dnnl:
-                processed_mod = partition_for_dnnl(processed_mod, params, alter_layout)
-                check_dnnl_used(processed_mod)
-            with tvm.transform.PassContext(opt_level=3):
-                func = relay.create_executor(
-                    mode, mod=processed_mod, device=dev, target=target
-                ).evaluate()
-            if run_module:
-                if isinstance(input, dict):
-                    result_dict[result_key] = func(**input, **params)
-                else:
-                    result_dict[result_key] = func(input, **params)
-
-    if run_module:
-        assert_result_dict_holds(result_dict)
-
-
-def run_and_verify_func(
-    config, run_module, subgraph_num=None, target="llvm", dtype="float32", test_bf16=True
-):
-    """Test a Relay func by compiling, running, and comparing TVM and DNNL outputs.
-    Parameters
-    ----------
-    config : Tuple[relay.Function, Dict[str, NDArray], List[str]]
-        A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and
-        3) A list of which vars should be considered params.
-    run_module: bool
-        If True, the built module will be run after being compiled.
-    """
-    f, input_shapes, is_param = config
-    params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype) for x in is_param}
-    input_dict = {
-        k: np.random.uniform(-1, 1, v).astype(dtype)
-        for k, v in input_shapes.items()
-        if k not in is_param
-    }
-    run_and_verify(
-        f,
-        input_dict,
-        params,
-        subgraph_num=subgraph_num,
-        target=target,
-        run_module=run_module,
-        test_bf16=test_bf16,
-    )
-
-
-def add_activation(activation, out, dic, param_lst):
-    if activation == "relu":
-        return relay.nn.relu(out), dic, param_lst
-    elif activation == "tanh":
-        return relay.tanh(out), dic, param_lst
-    elif activation == "sigmoid":
-        return relay.sigmoid(out), dic, param_lst
-    elif activation == "clip":
-        return relay.clip(out, 0.0, 6.0), dic, param_lst
-    elif activation == "swish":
-        sig_out = relay.sigmoid(out)
-        out = relay.multiply(out, sig_out)
-        return out, dic, param_lst
-    elif activation == "gelu":
-        out = gelu_helper(out)
-        return out, dic, param_lst
-    elif activation == "mish":
-        exp = relay.exp(out)
-        add = relay.add(exp, relay.const(1.0))
-        log = relay.log(add)
-        tanh = relay.tanh(log)
-        out = relay.multiply(out, tanh)
-        return out, dic, param_lst
-    else:
-        return out, dic, param_lst
-
-
-def get_conv1d(
-    x_shape=((1, 3, 224)),
-    k_shape=(16, 3, 3),
-    groups=1,
-    padding=(1, 1),
-    strides=(1),
-    dilation=(1),
-    channels=None,
-    activation=None,
-    dtype="float32",
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
-    out = relay.nn.conv1d(
-        x,
-        kernel,
-        kernel_size=k_shape[2:3],
-        groups=groups,
-        padding=padding,
-        strides=strides,
-        dilation=dilation,
-        channels=k_shape[0],
-    )
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv1d_bias(x_shape=(1, 3, 224), k_shape=(10, 3, 3), activation=None, dtype="float32"):
-    conv, dic, param_lst = get_conv1d(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-    bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-    out = relay.nn.bias_add(conv, bias)
-    dic["bias"] = (k_shape[0],)
-    param_lst += ["bias"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv1d_bias_bn_relu(x_shape=(1, 3, 224), k_shape=(10, 3, 3), dtype="float32"):
-    conv1d_bias, dic, param_lst = get_conv1d_bias(x_shape, k_shape, dtype=dtype)
-    beta = relay.const(np.zeros(k_shape[0]).astype(dtype))
-    gamma = relay.const(np.ones(k_shape[0]).astype(dtype))
-    moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype))
-    moving_var = relay.const(np.ones(k_shape[0]).astype(dtype))
-    conv1d_bias_bn, _, _ = relay.nn.batch_norm(
-        conv1d_bias,
-        gamma=gamma,
-        beta=beta,
-        moving_mean=moving_mean,
-        moving_var=moving_var,
-        axis=1,
-        center=True,
-        scale=True,
-        epsilon=1e-5,
-    )
-    return relay.nn.relu(conv1d_bias_bn), dic, param_lst
-
-
-def get_conv2d(
-    x_shape=(1, 32, 8, 8),
-    k_shape=(16, 32, 3, 3),
-    groups=1,
-    padding=(0, 0),
-    strides=(1, 1),
-    dilation=(1, 1),
-    activation=None,
-    dtype="float32",
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
-    out = relay.nn.conv2d(
-        x,
-        kernel,
-        kernel_size=k_shape[2:4],
-        groups=groups,
-        padding=padding,
-        strides=strides,
-        dilation=dilation,
-        channels=k_shape[0],
-    )
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv2d_transpose(
-    x_shape=(1, 32, 8, 8),
-    k_shape=(32, 16, 3, 3),
-    groups=1,
-    padding=(0, 0),
-    strides=(1, 1),
-    activation=None,
-    dtype="float32",
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
-    out = relay.nn.conv2d_transpose(
-        x,
-        kernel,
-        channels=k_shape[1] * groups,
-        kernel_size=k_shape[2:4],
-        groups=groups,
-        padding=padding,
-        strides=strides,
-    )
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv2d_weights_const(
-    x_shape=(1, 32, 8, 8),
-    k_shape=(16, 32, 3, 3),
-    groups=1,
-    padding=(0, 0),
-    strides=(1, 1),
-    dilation=(1, 1),
-    dtype="float32",
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype))
-    out = relay.nn.conv2d(
-        x,
-        kernel,
-        channels=k_shape[0],
-        kernel_size=k_shape[2:4],
-        groups=groups,
-        padding=padding,
-        strides=strides,
-        dilation=dilation,
-    )
-    dic = {"x": x_shape}
-    param_lst = []
-    return out, dic, param_lst
-
-
-def get_conv2d_bias(
-    x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), activation=None, dtype="float32"
-):
-    conv, dic, param_lst = get_conv2d_weights_const(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-    bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-    out = relay.nn.bias_add(conv, bias)
-    dic["bias"] = (k_shape[0],)
-    param_lst += ["bias"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv2d_transpose_bias(
-    x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), activation=None, dtype="float32"
-):
-    conv, dic, param_lst = get_conv2d_transpose(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-    bias = relay.var("bias", shape=(k_shape[1],), dtype=dtype)
-    out = relay.nn.bias_add(conv, bias)
-    dic["bias"] = (k_shape[1],)
-    param_lst += ["bias"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv2d_bias_bn_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype="float32"):
-    conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, dtype=dtype)
-    beta = relay.const(np.zeros(k_shape[0]).astype(dtype))
-    gamma = relay.const(np.ones(k_shape[0]).astype(dtype))
-    moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype))
-    moving_var = relay.const(np.ones(k_shape[0]).astype(dtype))
-    conv2d_bias_bn, _, _ = relay.nn.batch_norm(
-        conv2d_bias,
-        gamma=gamma,
-        beta=beta,
-        moving_mean=moving_mean,
-        moving_var=moving_var,
-        axis=1,
-        center=True,
-        scale=True,
-        epsilon=1e-5,
-    )
-    return relay.nn.relu(conv2d_bias_bn), dic, param_lst
-
-
-def get_layer_norm(x_shape=(1, 49, 64), dtype="float32"):
-    dic = {"input": x_shape}
-    param_lst = []
-    input = relay.var("input", shape=x_shape)
-    beta = relay.const(np.zeros(x_shape[2]).astype(dtype))
-    gamma = relay.const(np.ones(x_shape[2]).astype(dtype))
-    out = relay.nn.layer_norm(input, gamma=gamma, beta=beta)
-    return out, dic, param_lst
-
-
-def get_conv3d(
-    x_shape=(1, 32, 8, 8, 8),
-    k_shape=(16, 32, 3, 3, 3),
-    groups=1,
-    padding=(0, 0, 0),
-    strides=(1, 1, 1),
-    dilation=(1, 1, 1),
-    activation=None,
-    dtype="float32",
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype))
-    out = relay.nn.conv3d(
-        x,
-        kernel,
-        channels=k_shape[0],
-        kernel_size=k_shape[2:],
-        groups=groups,
-        padding=padding,
-        strides=strides,
-        dilation=dilation,
-    )
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv3d_transpose(
-    x_shape=(1, 32, 8, 8, 8),
-    k_shape=(32, 16, 3, 3, 3),
-    groups=1,
-    padding=(0, 0, 0),
-    strides=(1, 1, 1),
-    output_padding=(0, 0, 0),
-    activation=None,
-    dtype="float32",
-    data_layout="NCDHW",
-    kernel_layout="IODHW",
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype))
-    out = relay.nn.conv3d_transpose(
-        x,
-        kernel,
-        channels=k_shape[1],
-        kernel_size=k_shape[2:5],
-        groups=groups,
-        padding=padding,
-        strides=strides,
-        output_padding=output_padding,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv3d_bias(
-    x_shape=(1, 32, 8, 8, 8), k_shape=(16, 32, 3, 3, 3), activation=None, dtype="float32"
-):
-    conv, dic, param_lst = get_conv3d(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-    bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-    out = relay.nn.bias_add(conv, bias)
-    dic["bias"] = (k_shape[0],)
-    param_lst += ["bias"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def get_conv3d_transpose_bias(
-    x_shape=(1, 32, 8, 8, 8), k_shape=(32, 16, 3, 3, 3), activation=None, dtype="float32"
-):
-    conv, dic, param_lst = get_conv3d_transpose(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-    bias = relay.var("bias", shape=(k_shape[1],), dtype=dtype)
-    out = relay.nn.bias_add(conv, bias)
-    dic["bias"] = (k_shape[1],)
-    param_lst += ["bias"]
-    return add_activation(activation, out, dic, param_lst)
-
-
-def gelu_helper(data):
-    const1 = relay.const(math.sqrt(2.0))
-    const2 = relay.const(1.0)
-    const3 = relay.const(0.5)
-    divisor = relay.op.divide(data, const1)
-    val_erf = relay.op.erf(divisor)
-    added_erf = relay.op.add(val_erf, const2)
-    mul1 = relay.op.multiply(data, added_erf)
-    out = relay.op.multiply(mul1, const3)
-    return out
-
-
-def get_dense(
-    x_shape=(1, 16), k_shape=(32, 16), activation=None, has_reshape=False, dtype="float32"
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
-    out = relay.nn.dense(x, kernel, units=k_shape[0])
-    # out = relay.nn.dense(x, kernel, units=None)
-    if has_reshape:
-        out = relay.reshape(out, newshape=(1, x_shape[0], k_shape[0]))
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-
-    if activation == "gelu":
-        out = gelu_helper(out)
-    return out, dic, param_lst
-
-
-def get_bmm(
-    x_shape=(1, 16, 8), k_shape=(1, 4, 8), dtype="float32", transpose_a=False, transpose_b=True
-):
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
-    out = relay.nn.batch_matmul(
-        x, kernel, out_dtype=dtype, transpose_a=transpose_a, transpose_b=transpose_b
-    )
-    dic = {"x": x_shape, "kernel": k_shape}
-    param_lst = ["kernel"]
-    return out, dic, param_lst
-
-
-def test_bmm(run_module, dtype="float32"):
-    x_shape = (1, 2, 4)
-    k_shape = (1, 3, 4)
-
-    dense, dic, param_lst = get_bmm(x_shape, k_shape, dtype=dtype)
-    dense = tvm.IRModule.from_expr(dense)
-    config = dense, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    k_shape_t = (1, 4, 3)
-    dense, dic, param_lst = get_bmm(x_shape, k_shape_t, dtype=dtype, transpose_b=False)
-    dense = tvm.IRModule.from_expr(dense)
-    config = dense, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def get_dense_bias(
-    x_shape=(1, 16),
-    k_shape=(32, 16),
-    activation=None,
-    has_reshape=False,
-    use_add=False,
-    dtype="float32",
-):
-    dense, dic, param_lst = get_dense(
-        x_shape=x_shape, k_shape=k_shape, has_reshape=has_reshape, dtype=dtype
-    )
-    bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-    if use_add:
-        out = relay.add(dense, bias)
-    else:
-        out = relay.nn.bias_add(dense, bias)
-    dic["bias"] = (k_shape[0],)
-    param_lst += ["bias"]
-
-    if activation == "gelu":
-        out = gelu_helper(out)
-    return out, dic, param_lst
-
-
-def test_dnnl_not_compatible(run_module, target="llvm", dtype="float32"):
-    xshape = (1, 32, 14, 14)
-    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
-
-    x = relay.var("x", shape=(xshape), dtype=dtype)
-    y = relay.add(x, x)
-    z = relay.cast(relay.cast(y, "int32"), "float32")
-    out = relay.nn.relu(z)
-    f = relay.Function([x], out)
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = partition_for_dnnl(mod)
-    for mode in ["graph", "vm"]:
-        with tvm.transform.PassContext(opt_level=3):
-            func = relay.create_executor(mode, mod=mod, device=tvm.cpu(0), target=target).evaluate()
-            if run_module:
-                results = func(x_data)
-
-
-def test_multiple_outputs(run_module, dtype="float32"):
-    def get_graph():
-        x = relay.var("x", shape=(1, 3), dtype=dtype)
-        y = relay.var("y", shape=(1, 3), dtype=dtype)
-        z = relay.add(x, y)
-        w = relay.add(z, y)
-        out = relay.Tuple((z, w))
-        f = tvm.IRModule.from_expr(out)
-        return f, {"x": (1, 3), "y": (1, 3)}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module, dtype=dtype)
-
-
-def test_elementwise(run_module, dtype="float32"):
-    def get_graph(op, x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        out = op(x)
-        f = tvm.IRModule.from_expr(out)
-        return f, {"x": x_shape}, []
-
-    for op in [
-        relay.abs,
-        relay.exp,
-        relay.log,
-        relay.sqrt,
-        relay.nn.relu,
-        relay.tanh,
-        relay.sigmoid,
-    ]:
-        run_and_verify_func(get_graph(op), run_module=run_module)
-
-
-def test_clip(run_module, dtype="float32"):
-    def get_graph(x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        out = relay.clip(x, a_min=-0.2, a_max=0.4)
-        f = tvm.IRModule.from_expr(out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-
-
-def test_leaky_relu(run_module, dtype="float32"):
-    def get_graph(x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        out = relay.nn.leaky_relu(x, alpha=0.1)
-        f = tvm.IRModule.from_expr(out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-
-
-def test_softmax(run_module, dtype="float32"):
-    def get_graph(x_shape, axis):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        out = relay.nn.softmax(x, axis=axis)
-        f = tvm.IRModule.from_expr(out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module)
-    run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module)
-
-
-def test_conv1d(run_module, dtype="float32"):
-    conv1d, dic, param_lst = get_conv1d(channels=16, dtype=dtype)
-    conv1d = tvm.IRModule.from_expr(conv1d)
-    config = conv1d, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    x_shape = (1, 32, 224)
-    k_shape = (16, 32, 3)
-    conv1d_bias, dic, param_lst = get_conv1d(x_shape, k_shape, dtype=dtype)
-    conv1d_bias = tvm.IRModule.from_expr(conv1d_bias)
-    config = conv1d_bias, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv1d_pattern(run_module, dtype="float32"):
-    x_shape = (1, 3, 224)
-    k_shape = (16, 3, 3)
-    activation_lst = [None, "relu", "tanh", "sigmoid"]
-    for a in activation_lst:
-        conv1d, dic, param_lst = get_conv1d(x_shape, k_shape, activation=a, dtype=dtype)
-        conv1d = tvm.IRModule.from_expr(conv1d)
-        config = conv1d, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-        conv1d_bias, dic, param_lst = get_conv1d_bias(x_shape, k_shape, activation=a, dtype=dtype)
-        conv1d_bias = tvm.IRModule.from_expr(conv1d_bias)
-        config = conv1d_bias, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv2d(run_module, dtype="float32"):
-    x_shape = (1, 32, 8, 8)
-    for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 2, 3, 3), 16)]:
-        for padding in [(0, 0), (1, 1)]:
-            for strides in [(1, 1), (2, 2)]:
-                for dilation in [(1, 1), (2, 2)]:
-                    conv2d, dic, param_lst = get_conv2d(
-                        x_shape=x_shape,
-                        k_shape=k_shape,
-                        groups=groups,
-                        padding=padding,
-                        strides=strides,
-                        dilation=dilation,
-                        dtype=dtype,
-                    )
-                    conv2d = tvm.IRModule.from_expr(conv2d)
-                    config = conv2d, dic, param_lst
-                    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv2d_weights_const(run_module, dtype="float32"):
-    x_shape = (1, 32, 8, 8)
-    k_shape = (16, 32, 3, 3)
-    conv2d, dic, param_lst = get_conv2d_weights_const(x_shape, k_shape, dtype=dtype)
-    conv2d = tvm.IRModule.from_expr(conv2d)
-    config = conv2d, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    x_shape = (1, 3, 8, 8)
-    k_shape = (16, 3, 3, 3)
-    conv2d, dic, param_lst = get_conv2d_weights_const(x_shape, k_shape, dtype=dtype)
-    conv2d = tvm.IRModule.from_expr(conv2d)
-    config = conv2d, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv2d_pattern(run_module, dtype="float32"):
-    x_shape = (1, 32, 8, 8)
-    k_shape = (16, 32, 3, 3)
-    activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish", "gelu", "mish"]
-    for a in activation_lst:
-        conv2d, dic, param_lst = get_conv2d(x_shape, k_shape, activation=a, dtype=dtype)
-        conv2d = tvm.IRModule.from_expr(conv2d)
-        config = conv2d, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-        conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, activation=a, dtype=dtype)
-        conv2d_bias = tvm.IRModule.from_expr(conv2d_bias)
-        config = conv2d_bias, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    conv2d_bias_bn_relu, dic, param_lst = get_conv2d_bias_bn_relu(x_shape, k_shape, dtype=dtype)
-    conv2d_bias_bn_relu = tvm.IRModule.from_expr(conv2d_bias_bn_relu)
-    config = conv2d_bias_bn_relu, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    conv2d_bias_bn_relu, dic, param_lst = get_conv2d_bias_bn_relu(x_shape, k_shape, dtype=dtype)
-    conv2d_bias_bn_relu = tvm.IRModule.from_expr(conv2d_bias_bn_relu)
-    config = conv2d_bias_bn_relu, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv2d_bias_sum_relu(run_module, dtype="float32"):
-    x_shape = (1, 32, 8, 8)
-    k_shape = (16, 32, 3, 3)
-
-    def get_conv2d_bn_sum_relu(x_shape, k_shape, dtype="float32"):
-        out, dic, param_lst = get_conv2d_bias(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-        beta = relay.const(np.zeros(k_shape[0]).astype(dtype))
-        gamma = relay.const(np.ones(k_shape[0]).astype(dtype))
-        moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype))
-        moving_var = relay.const(np.ones(k_shape[0]).astype(dtype))
-        out, _, _ = relay.nn.batch_norm(
-            out,
-            gamma=gamma,
-            beta=beta,
-            moving_mean=moving_mean,
-            moving_var=moving_var,
-            axis=1,
-            center=True,
-            scale=True,
-            epsilon=1e-5,
-        )
-        sum_in = relay.var("sum_in", shape=x_shape, dtype=dtype)
-        kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype))
-        conv_sum = relay.nn.conv2d(
-            sum_in,
-            kernel,
-            channels=k_shape[0],
-            kernel_size=k_shape[2:4],
-            groups=1,
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-        )
-        # sum over two conv2d outputs to meet inplace condition
-        out = relay.add(out, conv_sum)
-        dic["sum_in"] = x_shape
-        return relay.nn.relu(out), dic, param_lst
-
-    conv2d_bn_sum_relu, dic, param_lst = get_conv2d_bn_sum_relu(x_shape, k_shape, dtype=dtype)
-    conv2d_bn_sum_relu = tvm.IRModule.from_expr(conv2d_bn_sum_relu)
-    config = conv2d_bn_sum_relu, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_dense_bias_sum(run_module, dtype="float32"):
-    x_shape = (4, 32)
-    k_shape = (16, 32)
-
-    def get_dense_bias_sum(x_shape, k_shape, dtype="float32"):
-        out, dic, param_lst = get_dense_bias(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
-
-        sum_in = relay.var("sum_in", shape=x_shape, dtype=dtype)
-        ker = relay.var("ker", shape=(k_shape), dtype=dtype)
-        dense_sum = relay.nn.dense(sum_in, ker, units=k_shape[0])
-
-        # sum over two dense outputs to meet inplace condition
-        out = relay.add(out, dense_sum)
-        dic["sum_in"] = x_shape
-        dic["ker"] = k_shape
-        param_lst += ["ker"]
-        return out, dic, param_lst
-
-    dense_bias_sum, dic, param_lst = get_dense_bias_sum(x_shape, k_shape, dtype=dtype)
-    dense_bias_sum = tvm.IRModule.from_expr(dense_bias_sum)
-    print("hebi-dbg:")
-    print(dense_bias_sum)
-    config = dense_bias_sum, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv2d_transpose(run_module, dtype="float32"):
-    x_shape = (1, 32, 8, 8)
-    for k_shape, groups in [((32, 16, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 4, 3, 3), 16)]:
-        for padding in [(0, 0), (1, 1)]:
-            for strides in [(1, 1), (2, 2)]:
-                conv2d_transpose, dic, param_lst = get_conv2d_transpose(
-                    x_shape=x_shape,
-                    k_shape=k_shape,
-                    groups=groups,
-                    padding=padding,
-                    strides=strides,
-                    dtype=dtype,
-                )
-                conv2d_transpose = tvm.IRModule.from_expr(conv2d_transpose)
-                config = conv2d_transpose, dic, param_lst
-                run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv2d_transpose_pattern(run_module, dtype="float32"):
-    activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish", "gelu", "mish"]
-    for a in activation_lst:
-        conv2d, dic, param_lst = get_conv2d_transpose(activation=a, dtype=dtype)
-        conv2d = tvm.IRModule.from_expr(conv2d)
-        config = conv2d, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-        conv2d_bias, dic, param_lst = get_conv2d_transpose_bias(activation=a, dtype=dtype)
-        conv2d_bias = tvm.IRModule.from_expr(conv2d_bias)
-        config = conv2d_bias, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv3d(run_module, dtype="float32"):
-    conv3d, dic, param_lst = get_conv3d(dtype=dtype)
-    conv3d = tvm.IRModule.from_expr(conv3d)
-    config = conv3d, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    conv3d, dic, param_lst = get_conv3d(padding=(0, 0, 0, 1, 1, 1), dtype=dtype)
-    conv3d = tvm.IRModule.from_expr(conv3d)
-    config = conv3d, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    conv3d, dic, param_lst = get_conv3d(
-        x_shape=(1, 3, 8, 8, 8), k_shape=(16, 3, 3, 3, 3), dtype=dtype
-    )
-    conv3d = tvm.IRModule.from_expr(conv3d)
-    config = conv3d, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv3d_pattern(run_module, dtype="float32"):
-    activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish", "gelu", "mish"]
-    for a in activation_lst:
-        conv3d, dic, param_lst = get_conv3d(activation=a, dtype=dtype)
-        conv3d = tvm.IRModule.from_expr(conv3d)
-        config = conv3d, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-        conv3d_bias, dic, param_lst = get_conv3d_bias(activation=a, dtype=dtype)
-        conv3d_bias = tvm.IRModule.from_expr(conv3d_bias)
-        config = conv3d_bias, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv3d_transpose(run_module, dtype="float32"):
-    conv3d_transpose, dic, param_lst = get_conv3d_transpose(dtype=dtype)
-    conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose)
-    config = conv3d_transpose, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    conv3d_transpose, dic, param_lst = get_conv3d_transpose(strides=(2, 2, 2), dtype=dtype)
-    conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose)
-    config = conv3d_transpose, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    conv3d_transpose, dic, param_lst = get_conv3d_transpose(
-        strides=(2, 2, 2), output_padding=(1, 1, 1), dtype=dtype
-    )
-    conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose)
-    config = conv3d_transpose, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_conv3d_transpose_pattern(run_module, dtype="float32"):
-    activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish", "gelu", "mish"]
-    for a in activation_lst:
-        conv3d, dic, param_lst = get_conv3d_transpose(activation=a, dtype=dtype)
-        conv3d = tvm.IRModule.from_expr(conv3d)
-        config = conv3d, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-        conv3d_bias, dic, param_lst = get_conv3d_transpose_bias(activation=a, dtype=dtype)
-        conv3d_bias = tvm.IRModule.from_expr(conv3d_bias)
-        config = conv3d_bias, dic, param_lst
-        run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_dense(run_module, dtype="float32"):
-    x_shape = (1, 16)
-    k_shape = (32, 16)
-
-    dense, dic, param_lst = get_dense(x_shape, k_shape, dtype=dtype)
-    dense = tvm.IRModule.from_expr(dense)
-    config = dense, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    dense, dic, param_lst = get_dense(x_shape, k_shape=(1, 16), dtype=dtype)
-    dense = tvm.IRModule.from_expr(dense)
-    config = dense, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    dense, dic, param_lst = get_dense(x_shape, k_shape, activation="gelu", dtype=dtype)
-    dense = tvm.IRModule.from_expr(dense)
-    config = dense, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_dense_pattern(run_module, dtype="float32"):
-    x_shape = (1, 16)
-    k_shape = (32, 16)
-
-    dense, dic, param_lst = get_dense(x_shape, k_shape, dtype=dtype)
-    dense = tvm.IRModule.from_expr(dense)
-    config = dense, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    dense_bias, dic, param_lst = get_dense_bias(x_shape, k_shape, dtype=dtype)
-    dense_bias = tvm.IRModule.from_expr(dense_bias)
-    config = dense_bias, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-    dense_bias, dic, param_lst = get_dense_bias(x_shape, k_shape, activation="gelu", dtype=dtype)
-    dense_bias = tvm.IRModule.from_expr(dense_bias)
-    config = dense_bias, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_pool2d(run_module, dtype="float32"):
-    def get_graph(
-        op,
-        x_shape=(1, 3, 32, 32),
-        pool_size=(2, 2),
-        strides=(2, 2),
-        padding=(0, 0),
-        ceil_mode=False,
-        count_include_pad=None,
-    ):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        if count_include_pad is not None:
-            out = op(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                ceil_mode=ceil_mode,
-                count_include_pad=count_include_pad,
-            )
-        else:
-            out = op(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                ceil_mode=ceil_mode,
-            )
-        out = tvm.IRModule.from_expr(out)
-        return out, {"x": x_shape}, []
-
-    for pool_size in [(2, 2), (3, 3)]:
-        for strides in [(1, 1), (2, 2)]:
-            for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]:
-                for ceil_mode in [False]:
-                    # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling"
-                    if pool_size == (2, 2) and padding == (0, 0, 1, 1):
-                        continue
-                    for count_include_pad in [False, True]:
-                        # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding"
-                        if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)):
-                            continue
-                        run_and_verify_func(
-                            get_graph(
-                                relay.nn.avg_pool2d,
-                                pool_size=pool_size,
-                                strides=strides,
-                                padding=padding,
-                                ceil_mode=ceil_mode,
-                                count_include_pad=count_include_pad,
-                            ),
-                            run_module=run_module,
-                        )
-                    run_and_verify_func(
-                        get_graph(
-                            relay.nn.max_pool2d,
-                            pool_size=pool_size,
-                            strides=strides,
-                            padding=padding,
-                            ceil_mode=ceil_mode,
-                        ),
-                        run_module=run_module,
-                    )
-
-
-def test_global_avg_pooling2d(run_module, dtype="float32"):
-    x_shape = (1, 3, 32, 32)
-    x = relay.var("x", shape=(x_shape), dtype=dtype)
-    out = relay.nn.global_avg_pool2d(x)
-    out = tvm.IRModule.from_expr(out)
-    config = out, {"x": x_shape}, []
-    run_and_verify_func(config, run_module=run_module)
-
-
-def test_pool3d(run_module, dtype="float32"):
-    def get_graph(
-        op,
-        x_shape=(1, 3, 8, 32, 32),
-        pool_size=(2, 2, 2),
-        strides=(2, 2, 2),
-        padding=(0, 0, 0),
-        ceil_mode=False,
-        count_include_pad=None,
-        dtype="float32",
-    ):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        if count_include_pad is not None:
-            out = op(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                ceil_mode=ceil_mode,
-                count_include_pad=count_include_pad,
-            )
-        else:
-            out = op(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                ceil_mode=ceil_mode,
-            )
-        out = tvm.IRModule.from_expr(out)
-        return out, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(relay.nn.avg_pool3d), run_module=run_module)
-    run_and_verify_func(get_graph(relay.nn.max_pool3d), run_module=run_module)
-    run_and_verify_func(
-        get_graph(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1)), run_module=run_module
-    )
-    run_and_verify_func(get_graph(relay.nn.max_pool3d, strides=(1, 1, 1)), run_module=run_module)
-
-
-def test_prune_dnnl_subgraph(run_module):
-    """In this test, OP "add" should be offloaded from dnnl codegen."""
-
-    def get_graph():
-        x1 = relay.var("x1", shape=(1, 32, 56, 56))
-        x2 = relay.var("x2", shape=(1, 32, 56, 56))
-        bias = relay.var("bias", shape=(32,))
-        weight = relay.var("weight", shape=(32, 32, 3, 3))
-        y = relay.nn.conv2d(
-            x1,
-            weight,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.nn.bias_add(y, bias)
-        y = relay.nn.relu(y)
-        y = relay.nn.global_max_pool2d(y)
-        y = relay.add(y, x2)
-        dic = {
-            "x1": (1, 32, 56, 56),
-            "x2": (1, 32, 56, 56),
-            "weight": (32, 32, 3, 3),
-            "bias": (32,),
-        }
-        param_lst = ["weight", "bias"]
-        out = tvm.IRModule.from_expr(y)
-        return out, dic, param_lst
-
-    run_and_verify_func(get_graph(), subgraph_num=1, run_module=run_module, test_bf16=False)
-
-
-def test_layer_norm(run_module, dtype="float32"):
-    x_shape = (1, 49, 64)
-
-    ln, dic, param_lst = get_layer_norm(x_shape, dtype=dtype)
-    ln = tvm.IRModule.from_expr(ln)
-    config = ln, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_rewrite_dense_bias_gelu_reshape_last(run_module, dtype="float32"):
-    def get_graph(act=None):
-        x_shape = (1, 16)
-        k_shape = (32, 16)
-
-        dense_bias, dic, param_lst = get_dense_bias(
-            x_shape, k_shape, activation=act, has_reshape=True, use_add=True, dtype=dtype
-        )
-        dense_bias = tvm.IRModule.from_expr(dense_bias)
-        processed_dense_bias = partition_for_dnnl(
-            dense_bias, params=None, alter_layout=False, prune_subgraphs=False
-        )
-        check_dnnl_used(processed_dense_bias, 1)
-
-        return dense_bias, dic, param_lst
-
-    run_and_verify_func(
-        get_graph("gelu"), subgraph_num=1, run_module=run_module, dtype=dtype, test_bf16=False
-    )
-    run_and_verify_func(
-        get_graph(), subgraph_num=1, run_module=run_module, dtype=dtype, test_bf16=False
-    )
-
-
-def test_resnetv1_rewrite(run_module, dtype="float32"):
-    def get_graph():
-        data_shape = (1, 256, 56, 56)
-        w_shapes = [
-            (64, 256, 1, 1),
-            (64, 64, 3, 3),
-            (256, 64, 1, 1),
-            (128, 256, 1, 1),
-            (128, 128, 3, 3),
-            (512, 128, 1, 1),
-            (512, 256, 1, 1),
-        ]
-        x = relay.var("x", shape=data_shape, dtype=dtype)
-        wights = [relay.const(np.random.randint(0, 1, w).astype(dtype)) for w in w_shapes]
-        biases = [relay.const(np.random.randint(0, 1, w[0]).astype(dtype)) for w in w_shapes]
-
-        conv1 = relay.nn.conv2d(
-            x,
-            wights[0],
-            channels=w_shapes[0][0],
-            kernel_size=w_shapes[0][2:4],
-            padding=(w_shapes[0][2] // 2, w_shapes[0][3] // 2),
-        )
-        conv1 = relay.nn.bias_add(conv1, biases[0])
-        conv1 = relay.nn.relu(conv1)
-
-        conv2 = relay.nn.conv2d(
-            conv1,
-            wights[1],
-            channels=w_shapes[1][0],
-            kernel_size=w_shapes[1][2:4],
-            padding=(w_shapes[1][2] // 2, w_shapes[1][3] // 2),
-        )
-        conv2 = relay.nn.bias_add(conv2, biases[1])
-        conv2 = relay.nn.relu(conv2)
-
-        conv3 = relay.nn.conv2d(
-            conv2,
-            wights[2],
-            channels=w_shapes[2][0],
-            kernel_size=w_shapes[2][2:4],
-            padding=(w_shapes[2][2] // 2, w_shapes[2][3] // 2),
-        )
-        conv3 = relay.nn.bias_add(conv3, biases[2])
-        conv3 = relay.add(conv3, x)
-        conv3 = relay.nn.relu(conv3)
-
-        left_conv4 = relay.nn.conv2d(
-            conv3,
-            wights[3],
-            channels=w_shapes[3][0],
-            strides=(2, 2),
-            kernel_size=w_shapes[3][2:4],
-            padding=(w_shapes[3][2] // 2, w_shapes[3][3] // 2),
-        )
-        left_conv4 = relay.nn.bias_add(left_conv4, biases[3])
-        left_conv4 = relay.nn.relu(left_conv4)
-
-        left_conv5 = relay.nn.conv2d(
-            left_conv4,
-            wights[4],
-            channels=w_shapes[4][0],
-            kernel_size=w_shapes[4][2:4],
-            padding=(w_shapes[4][2] // 2, w_shapes[4][3] // 2),
-        )
-        left_conv5 = relay.nn.bias_add(left_conv5, biases[4])
-        left_conv5 = relay.nn.relu(left_conv5)
-
-        left_conv6 = relay.nn.conv2d(
-            left_conv5,
-            wights[5],
-            channels=w_shapes[5][0],
-            kernel_size=w_shapes[5][2:4],
-            padding=(w_shapes[5][2] // 2, w_shapes[5][3] // 2),
-        )
-        left_conv6 = relay.nn.bias_add(left_conv6, biases[5])
-
-        right_conv7 = relay.nn.conv2d(
-            conv3,
-            wights[6],
-            channels=w_shapes[6][0],
-            strides=(2, 2),
-            kernel_size=w_shapes[6][2:4],
-            padding=(w_shapes[6][2] // 2, w_shapes[6][3] // 2),
-        )
-        right_conv7 = relay.nn.bias_add(right_conv7, biases[6])
-
-        out = relay.add(left_conv6, right_conv7)
-        out = relay.nn.relu(out)
-
-        dic = {"x": data_shape}
-        param_lst = []
-        return out, dic, param_lst
-
-    net, dic, param_lst = get_graph()
-    net = tvm.IRModule.from_expr(net)
-    config = net, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def test_fuse_pad_avg_pool(run_module, dtype="float32"):
-    def get_graph():
-        data_shape = (1, 768, 17, 17)
-        x = relay.var("x", shape=data_shape, dtype=dtype)
-        out = relay.nn.pad(x, pad_width=[[0, 0], [0, 0], [1, 1], [1, 1]])
-        out = relay.nn.avg_pool2d(out, pool_size=[3, 3])
-        dic = {"x": data_shape}
-        param_lst = []
-        return out, dic, param_lst
-
-    net, dic, param_lst = get_graph()
-    net = tvm.IRModule.from_expr(net)
-    config = net, dic, param_lst
-    run_and_verify_func(config, run_module=run_module, dtype=dtype)
-
-
-def permute_shape(shape, l_from="", l_to=""):
-    res_shape = []
-    for label in l_to:
-        pos = l_from.find(label)
-        res_shape.append(shape[pos])
-
-    return res_shape
-
-
-def expand_dim(shape, rank=0):
-    assert len(shape) == 1
-    return shape + [1] * (rank - 1)
-
-
-def filler_uni(low=0, high=1):
-    def filler_func(shape):
-        return np.random.uniform(low, high, shape)
-
-    return filler_func
-
-
-class QnnBuilder:
-    def __init__(self, qnn_profile=None):
-        self._args = {}
-        self._args_op = []
-        self._qp = qnn_profile
-
-    def arg(self, shape=[], dtype="float32", filler=filler_uni(), is_const=True):
-        if isinstance(filler, (int, float)):
-            value = np.full(shape, filler).astype(dtype)
-        else:
-            value = filler(shape).astype(dtype)
-
-        if is_const:
-            res = relay.const(value, dtype=dtype)
-        else:
-            name = f"in_{len(self._args)}"
-            res = relay.var(name, shape=shape, dtype=dtype)
-            self._args[name] = value
-            self._args_op.append(res)
-
-        return res
-
-    def make_zp(self, mean_val, num_ch=1, dispersion=0.2):
-        if num_ch == 1:
-            return self.arg(shape=[], dtype="int32", filler=mean_val)
-        else:
-            low = int(mean_val * (1 - dispersion))
-            high = int(mean_val * (1 + dispersion))
-            return self.arg(shape=[num_ch], dtype="int32", filler=filler_uni(low, high))
-
-    def make_scl(self, mean_val, num_ch=1, dispersion=0.2):
-        if num_ch == 1:
-            return self.arg(shape=[], dtype="float32", filler=mean_val)
-        else:
-            low = mean_val * (1 - dispersion)
-            high = mean_val * (1 + dispersion)
-            return self.arg(shape=[num_ch], dtype="float32", filler=filler_uni(low, high))
-
-    def make_zp_and_scl(self, name, num_ch=1, dispersion=0.2):
-        is_per_channel = getattr(self._qp, f"{name}_pc")
-        zp_val = getattr(self._qp, f"{name}_zp")
-        scl_val = getattr(self._qp, f"{name}_scl")
-
-        zp = self.make_zp(zp_val, num_ch if is_per_channel else 1, dispersion)
-        scl = self.make_scl(scl_val, num_ch if is_per_channel else 1, dispersion)
-        return zp, scl
-
-    def finalize(self, op):
-        func = relay.Function(self._args_op, op)
-        mod = tvm.IRModule.from_expr(func)
-        mod = relay.transform.InferType()(mod)
-        return mod, self._args
-
-
-def check_fully_annotated(mod, desired_compiler):
-    matched_ops = []
-    other_ops = []
-
-    def _visit(node):
-        if isinstance(node, tvm.relay.Call):
-            op = node.op
-            if isinstance(op, relay.GlobalVar):
-                func = mod[op]
-                if "Compiler" in func.attrs and func.attrs["Compiler"] == desired_compiler:
-                    matched_ops.append(op)
-                    return
-            else:
-                other_ops.append(op)
-
-    tvm.relay.analysis.post_order_visit(mod["main"].body, _visit)
-
-    assert len(other_ops) == 0 and len(matched_ops) != 0, "Model is not fully DNNL compiled"
-
-
-def check_result(
-    mod,
-    ref_mod,
-    map_inputs,
-    tol=1e-5,
-    target="llvm",
-    device=tvm.cpu(),
-    params=None,
-    ref_result=None,
-    atol=None,
-    desired_compiler="dnnl",
-):
-    if atol is None:
-        atol = tol
-
-    if desired_compiler is not None:
-        check_fully_annotated(mod, desired_compiler)
-
-    if ref_result is None:
-        # Run the reference result
-        relay.backend.te_compiler.get().clear()
-        with tvm.transform.PassContext(opt_level=3):
-            ref_lib = relay.build(ref_mod, target=target, params=params)
-        ref_rt_mod = tvm.contrib.graph_executor.GraphModule(ref_lib["default"](device))
-
-        for name, data in map_inputs.items():
-            ref_rt_mod.set_input(name, data)
-        ref_rt_mod.run()
-        out = ref_rt_mod.get_output(0)
-        ref_result = out.numpy()
-
-    def check_vm_result():
-        relay.backend.te_compiler.get().clear()
-        with tvm.transform.PassContext(opt_level=3):
-            exe = relay.vm.compile(mod, target=target, params=params)
-        code, lib = exe.save()
-        exe = tvm.runtime.vm.Executable.load_exec(code, lib)
-        vm = tvm.runtime.vm.VirtualMachine(exe, device)
-        output = vm.run(**map_inputs)
-        tvm.testing.assert_allclose(output.numpy(), ref_result, rtol=tol, atol=atol)
-
-    def check_graph_executor_result():
-        relay.backend.te_compiler.get().clear()
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target, params=params)
-        rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](device))
-
-        rt_mod.run(**map_inputs)
-        output = rt_mod.get_output(0)
-        tvm.testing.assert_allclose(output.numpy(), ref_result, rtol=tol, atol=atol)
-
-    check_vm_result()
-    check_graph_executor_result()
-
-
-ConvProfile = collections.namedtuple(
-    "ConvProfile",
-    [
-        "SHAPE",
-        "KER",
-        "STR",
-        "PAD",
-        "DEL",
-        "OC",
-        "GR",
-        "D_LAYOUT",
-        "K_LAYOUT",
-    ],
-)
-base_conv = ConvProfile(
-    SHAPE=[1, 8, 5, 5],
-    KER=[3, 3],
-    STR=[1, 1],
-    PAD=[1, 1],
-    DEL=[1, 1],
-    OC=16,
-    GR=1,
-    D_LAYOUT="NCHW",
-    K_LAYOUT="OIHW",
-)
-base_conv_nhwc = base_conv._replace(D_LAYOUT="NHWC", K_LAYOUT="HWIO")
-base_conv_dilated = base_conv._replace(PAD=[2, 2], DEL=[2, 2])
-base_conv_no_pad = base_conv._replace(PAD=[0, 0])
-base_conv_no_pad_nhwc = base_conv_no_pad._replace(D_LAYOUT="NHWC", K_LAYOUT="HWIO")
-base_conv_group_no_pad = base_conv_no_pad._replace(GR=2)
-base_conv_dw_no_pad = base_conv_no_pad._replace(SHAPE=[1, 16, 5, 5], GR=16)
-
-
-DenseProfile = collections.namedtuple("DenseProfile", ["N", "IC", "OC"])
-base_dense_profile = DenseProfile(N=2, IC=10, OC=16)
-
-ArgConstConfig = collections.namedtuple("ArgConstConfig", ["Data", "Weights", "Bias", "Sum"])
-acp_regular = ArgConstConfig(Data=False, Weights=True, Bias=True, Sum=None)
-acp_no_bias = ArgConstConfig(Data=False, Weights=True, Bias=None, Sum=None)
-acp_with_sum = ArgConstConfig(Data=False, Weights=True, Bias=True, Sum=False)
-acp_no_bias_with_sum = ArgConstConfig(Data=False, Weights=True, Bias=None, Sum=False)
-
-QuantizationConfig = collections.namedtuple(
-    "QuantizationConfig",
-    [
-        "d_zp",
-        "d_scl",
-        "d_pc",
-        "k_zp",
-        "k_scl",
-        "k_pc",
-        "rq_zp",
-        "rq_scl",
-        "rq_pc",
-        "sum_zp",
-        "sum_scl",
-        "sum_pc",
-        "o_zp",
-        "o_scl",
-        "o_pc",
-    ],
-)
-
-qp_regular = QuantizationConfig(
-    d_zp=0,
-    d_scl=0.2,
-    d_pc=False,
-    k_zp=0,
-    k_scl=0.1,
-    k_pc=False,
-    rq_zp=30,
-    rq_scl=0.2,
-    rq_pc=False,
-    sum_zp=15,
-    sum_scl=0.3,
-    sum_pc=False,
-    o_zp=5,
-    o_scl=0.2,
-    o_pc=False,
-)
-qp_asymmetric_data = qp_regular._replace(
-    d_zp=3, rq_zp=10, rq_scl=0.1, sum_zp=15, sum_scl=0.3, o_zp=4
-)
-
-qnn_conv_profiles = tvm.testing.parameter(
-    by_dict={
-        #  Pattern qnn.conv2d + qnn.requantize
-        "Base": (base_conv, acp_regular, qp_regular),
-        "NHWC": (base_conv_nhwc, acp_regular, qp_regular),
-        #  Asymmetric input. NOTE: No pad! Input ZP is not compatible with padding
-        "Group": (base_conv_group_no_pad, acp_regular, qp_asymmetric_data),
-        "DW": (base_conv_dw_no_pad, acp_regular, qp_asymmetric_data),
-        "NoBias": (base_conv, acp_no_bias, qp_regular),
-        "AsymmetricInput": (base_conv_no_pad, acp_regular, qp_asymmetric_data),
-        "AsymmetricInput_NHWC": (base_conv_no_pad_nhwc, acp_regular, qp_asymmetric_data),
-        #  Pattern Conv2d + Requantize + Sum
-        "WithSum": (base_conv_no_pad, acp_with_sum, qp_asymmetric_data),
-        "WithSum_NHWC": (base_conv_no_pad_nhwc, acp_with_sum, qp_asymmetric_data),
-        "WithSum_NoBias": (base_conv_no_pad, acp_no_bias_with_sum, qp_asymmetric_data),
-    }
-)
-
-
-@has_dnnl_codegen
-def test_qnn_conv2d(qnn_conv_profiles):
-    def generate_model(p, c, q):
-        np.random.seed(0)
-
-        N, IC, IH, IW = p.SHAPE
-        d_shape = p.SHAPE
-        w_shape = [p.OC, IC, *p.KER]
-        b_shape = [p.OC]
-        s_shape = [
-            p.SHAPE[0],
-            p.OC,
-            (IH + 2 * p.PAD[0] - (p.KER[0] - 1) * p.DEL[0] - 1) // p.STR[0] + 1,
-            (IW + 2 * p.PAD[1] - (p.KER[1] - 1) * p.DEL[1] - 1) // p.STR[1] + 1,
-        ]
-
-        if p.GR != 1:
-            w_shape[1] //= p.GR
-
-        d_shape = permute_shape(d_shape, l_from="NCHW", l_to=p.D_LAYOUT)
-        s_shape = permute_shape(s_shape, l_from="NCHW", l_to=p.D_LAYOUT)
-        w_shape = permute_shape(w_shape, l_from="OIHW", l_to=p.K_LAYOUT)
-
-        c_dim = p.D_LAYOUT.find("C")
-        b_shape = expand_dim(b_shape, rank=len(p.D_LAYOUT) - c_dim)
-
-        bld = QnnBuilder(qnn_profile=q)
-
-        # Start build a test graph
-        data = bld.arg(shape=d_shape, dtype="uint8", is_const=c.Data, filler=filler_uni(0, 20))
-        d_zp, d_scl = bld.make_zp_and_scl("d", IC)
-
-        # Convolution
-        wgh = bld.arg(shape=w_shape, dtype="int8", is_const=c.Weights, filler=filler_uni(-20, 20))
-        w_zp, w_scl = bld.make_zp_and_scl("k")
-
-        op = tvm.relay.qnn.op.conv2d(
-            data,
-            wgh,
-            d_zp,
-            w_zp,
-            d_scl,
-            w_scl,
-            kernel_size=p.KER,
-            padding=p.PAD,
-            strides=p.STR,
-            dilation=p.DEL,
-            groups=p.GR,
-            channels=p.OC,
-            out_dtype="int32",
-            data_layout=p.D_LAYOUT,
-            kernel_layout=p.K_LAYOUT,
-        )
-        # Optional bias
-        if c.Bias is not None:
-            bias = bld.arg(
-                shape=b_shape, dtype="int32", is_const=c.Bias, filler=filler_uni(-50, 50)
-            )
-            op = tvm.relay.add(op, bias)
-
-        # Re-quantization
-        rq_in_zp = bld.make_zp(0)
-        rq_in_scl = bld.make_scl(q.d_scl * q.k_scl)  # in real cases that should be a vector
-        rq_out_zp, rq_out_scl = bld.make_zp_and_scl("rq")
-
-        op = tvm.relay.qnn.op.requantize(
-            op, rq_in_scl, rq_in_zp, rq_out_scl, rq_out_zp, out_dtype="int32"
-        )
-        op = tvm.relay.clip(
-            op, a_min=0.0, a_max=255.0
-        )  # pytorch frontend specific, I guess it's redundant
-        op = tvm.relay.cast(op, dtype="uint8")
-
-        # Optional sum (ResNet like)
-        if c.Sum is not None:
-            sum_in = bld.arg(dtype="uint8", shape=s_shape, filler=filler_uni(0, 10), is_const=c.Sum)
-
-            lhs_zp, lhs_scl = bld.make_zp_and_scl("rq")
-            rhs_zp, rhs_scl = bld.make_zp_and_scl("sum")
-            out_zp, out_scl = bld.make_zp_and_scl("o")
-
-            op = tvm.relay.qnn.op.add(op, sum_in, lhs_scl, lhs_zp, rhs_scl, rhs_zp, out_scl, out_zp)
-            op = tvm.relay.clip(op, a_min=0.0, a_max=255.0)
-
-        return bld.finalize(op)
-
-    conv_p, arg_p, quant_p = qnn_conv_profiles
-    ref_mod, args = generate_model(conv_p, arg_p, quant_p)
-    mod = partition_for_dnnl(ref_mod)
-
-    # atol=1 means int values should match with +-1 quantum value tolerance
-    check_result(mod, ref_mod, args, tol=1e-10, atol=1, desired_compiler="dnnl")
-
-
-conv_profiles = tvm.testing.parameter(
-    by_dict={
-        "Base": (base_conv, acp_regular),
-        "NHWC": (base_conv_nhwc, acp_regular),
-        "Group": (base_conv_group_no_pad, acp_regular),
-        "DW": (base_conv_dw_no_pad, acp_regular),
-        "Dilated": (base_conv_dilated, acp_regular),
-    }
-)
-
-
-@has_dnnl_codegen
-def test_conv2d_plus(conv_profiles):
-    def generate_model(p, c):
-        np.random.seed(0)
-
-        N, IC, IH, IW = p.SHAPE
-        d_shape = p.SHAPE
-        w_shape = [p.OC, IC, *p.KER]
-        b_shape = [p.OC]
-        s_shape = [
-            p.SHAPE[0],
-            p.OC,
-            (IH + 2 * p.PAD[0] - (p.KER[0] - 1) * p.DEL[0] - 1) // p.STR[0] + 1,
-            (IW + 2 * p.PAD[1] - (p.KER[1] - 1) * p.DEL[1] - 1) // p.STR[1] + 1,
-        ]
-
-        if p.GR != 1:
-            w_shape[1] //= p.GR
-
-        d_shape = permute_shape(d_shape, l_from="NCHW", l_to=p.D_LAYOUT)
-        s_shape = permute_shape(s_shape, l_from="NCHW", l_to=p.D_LAYOUT)
-        w_shape = permute_shape(w_shape, l_from="OIHW", l_to=p.K_LAYOUT)
-
-        c_dim = p.D_LAYOUT.find("C")
-        # b_shape = expand_dim(b_shape, rank=len(p.D_LAYOUT) - c_dim)
-
-        bld = QnnBuilder()
-
-        op = bld.arg(shape=d_shape, dtype="float32", is_const=c.Data)
-        wgh = bld.arg(shape=w_shape, dtype="float32", is_const=c.Weights)
-        op = tvm.relay.nn.conv2d(
-            op,
-            wgh,
-            kernel_size=p.KER,
-            padding=p.PAD,
-            strides=p.STR,
-            dilation=p.DEL,
-            groups=p.GR,
-            channels=p.OC,
-            out_dtype="float32",
-            data_layout=p.D_LAYOUT,
-            kernel_layout=p.K_LAYOUT,
-        )
-
-        if c.Bias is not None:
-            bias = bld.arg(shape=b_shape, dtype="float32", is_const=c.Bias)
-            op = tvm.relay.nn.bias_add(op, bias, axis=c_dim)
-
-        if c.Sum is not None:
-            sum_in = bld.arg(shape=s_shape, dtype="float32", is_const=c.Sum)
-            op = tvm.relay.op.add(op, sum_in)
-
-        return bld.finalize(op)
-
-    conv_p, arg_p = conv_profiles
-    ref_mod, args = generate_model(conv_p, arg_p)
-    mod = partition_for_dnnl(ref_mod, alter_layout=False)
-    check_result(mod, ref_mod, args, tol=1e-5, desired_compiler="dnnl")
-
-
-qnn_dense_profiles = tvm.testing.parameter(
-    by_dict={
-        #  Pattern Dense + Requantize
-        "Base": (base_dense_profile, acp_regular, qp_regular),
-        "AsymmetricInput": (base_dense_profile, acp_regular, qp_asymmetric_data),
-        #  Pattern Dense + Requantize + Sum
-        "AsymmetricInput_Sum": (base_dense_profile, acp_with_sum, qp_asymmetric_data),
-    }
-)
-
-
-@has_dnnl_codegen
-def test_qnn_dense(qnn_dense_profiles):
-    def generate_model(p, c, q):
-        np.random.seed(0)
-
-        d_shape = [p.N, p.IC]
-        w_shape = [p.OC, p.IC]
-        b_shape = [p.OC]
-        s_shape = [p.N, p.OC]
-
-        bld = QnnBuilder(qnn_profile=q)
-
-        # Start build a test graph
-        data = bld.arg(shape=d_shape, dtype="uint8", is_const=c.Data, filler=filler_uni(0, 20))
-        d_zp, d_scl = bld.make_zp_and_scl("d", p.IC)
-
-        # Convolution
-        wgh = bld.arg(shape=w_shape, dtype="int8", is_const=c.Weights, filler=filler_uni(-20, 20))
-        w_zp, w_scl = bld.make_zp_and_scl("k")
-
-        op = tvm.relay.qnn.op.dense(
-            data, wgh, d_zp, w_zp, d_scl, w_scl, units=p.OC, out_dtype="int32"
-        )
-        # Optional bias
-        if c.Bias is not None:
-            bias = bld.arg(
-                shape=b_shape, dtype="int32", is_const=c.Bias, filler=filler_uni(-50, 50)
-            )
-            op = tvm.relay.add(op, bias)
-
-        # Re-quantization
-        rq_in_zp = bld.make_zp(0)
-        rq_in_scl = bld.make_scl(q.d_scl * q.k_scl)  # in real cases that should be a vector
-        rq_out_zp, rq_out_scl = bld.make_zp_and_scl("rq")
-
-        op = tvm.relay.qnn.op.requantize(
-            op, rq_in_scl, rq_in_zp, rq_out_scl, rq_out_zp, out_dtype="int32"
-        )
-        op = tvm.relay.clip(
-            op, a_min=0.0, a_max=255.0
-        )  # pytorch frontend specific, I guess it's redundant
-        op = tvm.relay.cast(op, dtype="uint8")
-
-        # Optional sum (ResNet like)
-        if c.Sum is not None:
-            sum_in = bld.arg(dtype="uint8", shape=s_shape, filler=filler_uni(0, 10), is_const=c.Sum)
-
-            lhs_zp, lhs_scl = bld.make_zp_and_scl("rq")
-            rhs_zp, rhs_scl = bld.make_zp_and_scl("sum")
-            out_zp, out_scl = bld.make_zp_and_scl("o")
-
-            op = tvm.relay.qnn.op.add(op, sum_in, lhs_scl, lhs_zp, rhs_scl, rhs_zp, out_scl, out_zp)
-            op = tvm.relay.clip(op, a_min=0.0, a_max=255.0)
-
-        return bld.finalize(op)
-
-    conv_p, arg_p, quant_p = qnn_dense_profiles
-    ref_mod, args = generate_model(conv_p, arg_p, quant_p)
-    mod = partition_for_dnnl(ref_mod)
-
-    # atol=1 means int values should match with +-1 quantum value tolerance
-    check_result(mod, ref_mod, args, tol=1e-10, atol=1, desired_compiler="dnnl")
-
-
-dense_profiles = tvm.testing.parameter(
-    by_dict={
-        "Base": (base_dense_profile, acp_regular),
-        "WithSum": (base_dense_profile, acp_with_sum),
-    }
-)
-
-
-@has_dnnl_codegen
-def test_dense_plus(dense_profiles):
-    def generate_model(p, c):
-        np.random.seed(0)
-
-        d_shape = [p.N, p.IC]
-        w_shape = [p.OC, p.IC]
-        b_shape = [p.OC]
-        s_shape = [p.N, p.OC]
-
-        c_dim = 1
-
-        bld = QnnBuilder()
-
-        op = bld.arg(shape=d_shape, dtype="float32", is_const=c.Data)
-        wgh = bld.arg(shape=w_shape, dtype="float32", is_const=c.Weights)
-        op = tvm.relay.nn.dense(op, wgh, out_dtype="float32")
-
-        if c.Bias is not None:
-            bias = bld.arg(shape=b_shape, dtype="float32", is_const=c.Bias)
-            op = tvm.relay.nn.bias_add(op, bias, axis=c_dim)
-
-        if c.Sum is not None:
-            sum_in = bld.arg(shape=s_shape, dtype="float32", is_const=c.Sum)
-            op = tvm.relay.op.add(op, sum_in)
-
-        return bld.finalize(op)
-
-    dense_p, arg_p = dense_profiles
-    ref_mod, args = generate_model(dense_p, arg_p)
-    mod = partition_for_dnnl(ref_mod)
-    check_result(mod, ref_mod, args, tol=1e-5, desired_compiler="dnnl")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py
deleted file mode 100644
index 9a0514058201..000000000000
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Demonstration of end-to-end MetaSchedule tuning."""
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
deleted file mode 100644
index 3e331cbf8ccb..000000000000
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hexagon MetaSchedule test helper functions."""
-
-import torch
-from torchvision.models import resnet
-from torchvision.models.quantization import resnet as qresnet
-
-import tvm
-from tvm import relay
-
-
-def export_resnet50_fp16():
-    """Export Resnet50 FP16."""
-    model = resnet.resnet50(pretrained=True).eval()
-
-    pt_inp = torch.randn(1, 3, 224, 224)
-
-    script_module = torch.jit.trace(model, pt_inp).eval()
-
-    input_name = "image"
-    input_shapes = [(input_name, pt_inp.shape)]
-    mod, params = relay.frontend.from_pytorch(script_module, input_shapes)
-    mod = relay.transform.ToMixedPrecision("float16")(mod)
-
-    with open("resnet50_fp16.json", "w") as file:
-        file.write(tvm.ir.save_json(mod))
-
-    with open("resnet50_fp16.params", "wb") as file:
-        file.write(relay.save_param_dict(params))
-
-
-def export_resnet50_int8():
-    """Export Resnet50 INT8."""
-
-    def quantize_model(model, inp):
-        model.fuse_model()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        torch.quantization.prepare(model, inplace=True)
-        model(inp)
-        torch.quantization.convert(model, inplace=True)
-
-    model = qresnet.resnet50(pretrained=True).eval()
-
-    pt_inp = torch.randn(1, 3, 224, 224)
-    quantize_model(model, pt_inp)
-
-    script_module = torch.jit.trace(model, pt_inp).eval()
-
-    input_name = "image"
-    input_shapes = [(input_name, pt_inp.shape)]
-    mod, params = relay.frontend.from_pytorch(
-        script_module, input_shapes, keep_quantized_weight=True
-    )
-
-    with open("resnet50_int8.json", "w") as file:
-        file.write(tvm.ir.save_json(mod))
-
-    with open("resnet50_int8.params", "wb") as file:
-        file.write(relay.save_param_dict(params))
-
-
-if __name__ == "__main__":
-    export_resnet50_fp16()
-    export_resnet50_int8()
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
deleted file mode 100644
index 52892c60ad22..000000000000
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test Resnet50 float16 with MetaSchedule"""
-
-import os
-import tempfile
-
-import pytest
-import numpy as np
-
-import tvm.testing
-from tvm import relay
-from tvm import meta_schedule as ms
-from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
-from tvm.relay.backend import Executor
-
-from ..infrastructure import get_hexagon_target
-
-
-def convert_conv2d_layout(mod, desired_layouts):
-    with tvm.transform.PassContext(opt_level=3):
-        seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
-        return seq(mod)
-
-
-@pytest.mark.skip("End-to-end tuning is skipped on CI.")
-@tvm.testing.requires_hexagon
-def test_resnet50(hexagon_launcher):
-    """Test Resnet50."""
-    model_json = "resnet50_fp16.json"
-    target_llvm = tvm.target.Target("llvm")
-    target_hexagon = get_hexagon_target("v69")
-    model_params = "resnet50_fp16.params"
-
-    if not os.path.exists(model_json):
-        pytest.skip("Run python export_models.py first.")
-
-    with open(model_json, "r") as file:
-        mod = tvm.ir.load_json(file.read())
-
-    with open(model_params, "rb") as file:
-        params = relay.load_param_dict(file.read())
-
-    mod = convert_conv2d_layout(mod, {"nn.conv2d": ["NHWC", "HWIO"]})
-
-    inp = np.random.randn(1, 3, 224, 224).astype("float32")
-    input_name = "image"
-
-    executor = Executor("graph", {"link-params": True})
-    # This line is necessary for link-params to take effect during
-    # task extraction and relay.build(...).
-    mod = mod.with_attr("executor", executor)
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=target_hexagon,
-            params=params,
-            work_dir=work_dir,
-            # for faster tuning
-            max_trials_global=20000,
-            max_trials_per_task=8,
-            num_trials_per_iter=8,
-            strategy="replay-trace",
-            # max_trials_global=20000,
-            # num_trials_per_iter=32,
-            # max_trials_per_task=128,
-            # strategy="evolutionary",
-            builder=get_hexagon_local_builder(),
-            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
-            # Without this, the same workloads with different constant weights
-            # are treated as distinct tuning tasks.
-            module_equality="ignore-ndarray",
-        )
-
-        hexagon_lowered = ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=target_hexagon,
-            params=params,
-        )
-
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
-            params=params,
-        )
-
-        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-        llvm_graph_mod.set_input(input_name, inp.copy())
-        llvm_graph_mod.run()
-        ref_result = llvm_graph_mod.get_output(0).numpy()
-
-    with hexagon_launcher.create_session() as session:
-        graph_mod = session.get_executor_from_factory(hexagon_lowered)
-        graph_mod.set_input(input_name, inp.copy())
-
-        graph_mod.run()
-        hexagon_output = graph_mod.get_output(0).numpy()
-
-        # Example output: max and mean abs difference with the reference: 0.1406 0.0126
-        print(
-            "max and mean abs difference with the reference:",
-            np.max(np.abs(ref_result - hexagon_output)),
-            np.mean(np.abs(ref_result - hexagon_output)),
-        )
-        tvm.testing.assert_allclose(ref_result, hexagon_output, atol=2e-1)
-
-        time_ms = graph_mod.benchmark(session.device, number=1, repeat=20).mean * 1e3
-
-        print("time elapsed: ", time_ms)
-
-        debug_ex = session.get_graph_debug_executor(
-            hexagon_lowered.get_graph_json(), hexagon_lowered.lib
-        )
-        print(debug_ex.profile(input_name=inp.copy()))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
deleted file mode 100644
index 84c796bee5dc..000000000000
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ /dev/null
@@ -1,539 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test Resnet50 int8 with MetaSchedule"""
-
-import os
-import tempfile
-from types import MappingProxyType
-from typing import Any, Mapping, Optional
-
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm._ffi import register_func
-from tvm.contrib.hexagon.meta_schedule import (
-    get_hexagon_local_builder,
-    get_hexagon_rpc_runner,
-)
-from tvm.meta_schedule import postproc, schedule_rule
-from tvm.meta_schedule.utils import cpu_count
-from tvm.tir.schedule import BlockRV, Schedule
-from tvm.tir.schedule.analysis import has_block
-from tvm.tir.tensor_intrin.hexagon import (
-    VRMPY_u8i8i32_INTRIN,
-    VRMPY_u8u8i32_INTRIN,
-    VRMPY_u8i8i32_VTCM_INTRIN,
-)
-
-from ..infrastructure import get_hexagon_target
-
-MODEL_JSON = "resnet50_int8.json"
-MODEL_PARAMS = "resnet50_int8.params"
-EXECUTOR = relay.backend.Executor("graph", {"link-params": True})
-TARGET_LLVM = tvm.target.Target("llvm")
-TARGET_HEXAGON = get_hexagon_target("v68")
-
-
-def load_model():
-    """Load renset50 model."""
-    if not os.path.exists(MODEL_JSON):
-        pytest.skip("Run python export_models.py first.")
-
-    with open(MODEL_JSON, "r") as file:
-        mod = tvm.ir.load_json(file.read())
-
-    with open(MODEL_PARAMS, "rb") as file:
-        params = relay.load_param_dict(file.read())
-
-    return mod, params
-
-
-def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
-    """Tune VRMPY with auto tensorization."""
-    sch_rules = [
-        schedule_rule.ApplyCustomRule(),
-        schedule_rule.AutoInline(
-            into_producer=False,
-            into_consumer=True,
-            inline_const_tensor=True,
-            disallow_if_then_else=True,
-            require_injective=True,
-            require_ordered=True,
-            disallow_op=["tir.exp"],
-        ),
-        # VRMPY_u8i8i32_INTRIN is used for conv2d. See topi/hexagon/conv2d_alter_op.py
-        # for why we use different intrins for conv2d and dense.
-        schedule_rule.MultiLevelTilingWithIntrin(
-            VRMPY_u8i8i32_INTRIN,
-            structure="SRSRS",
-            tile_binds=None,
-            max_innermost_factor=64,
-            vector_load_lens=None,
-            reuse_read=None,
-            reuse_write=schedule_rule.ReuseType(
-                req="may",
-                levels=[1, 2],
-                scope="global",
-            ),
-        ),
-        # VRMPY_u8u8i32_INTRIN is used for dense
-        schedule_rule.MultiLevelTilingWithIntrin(
-            VRMPY_u8u8i32_INTRIN,
-            structure="SRSRS",
-            tile_binds=None,
-            max_innermost_factor=64,
-            vector_load_lens=None,
-            reuse_read=None,
-            reuse_write=schedule_rule.ReuseType(
-                req="may",
-                levels=[1, 2],
-                scope="global",
-            ),
-        ),
-        schedule_rule.ParallelizeVectorizeUnroll(
-            max_jobs_per_core=16,
-            max_vectorize_extent=128,
-            unroll_max_steps=[0, 16, 64, 512],
-            unroll_explicit=True,
-        ),
-    ]
-
-    postprocs = [
-        postproc.RewriteParallelVectorizeUnroll(),
-        postproc.RewriteReductionBlock(),
-        postproc.RewriteTensorize(vectorize_init_loop=True),
-    ]
-
-    # This line is necessary for link-params to take effect during
-    # task extraction and relay.build(...).
-    mod = mod.with_attr("executor", EXECUTOR)
-
-    num_cores = cpu_count(logical=False)
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=TARGET_HEXAGON,
-            params=params,
-            work_dir=work_dir,
-            # for faster tuning
-            max_trials_global=20000,
-            max_trials_per_task=8,
-            num_trials_per_iter=8,
-            strategy="replay-trace",
-            # max_trials_global=20000,
-            # num_trials_per_iter=32,
-            # max_trials_per_task=128,
-            # strategy="evolutionary",
-            builder=get_hexagon_local_builder(max_workers=num_cores),
-            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20, max_workers=num_cores),
-            space=ms.space_generator.PostOrderApply(
-                sch_rules=sch_rules,
-                postprocs=postprocs,
-                mutator_probs={},
-            ),
-            # This enables anchor-block tuning, where different subgraphs
-            # with the same anchor block workload will be identified as equal.
-            # It reduces the number of conv2d tuning tasks in the int8 resnet50 model
-            # from 36 to 23, with negligible performance difference.
-            module_equality="anchor-block",
-            num_tuning_cores=num_cores,
-        )
-        return ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=TARGET_HEXAGON,
-            params=params,
-        )
-
-
-@tvm.testing.requires_hexagon
-def test_resnet50(hexagon_launcher):
-    """Test Resnet50."""
-
-    if tvm.testing.utils.IS_IN_CI:
-        pytest.skip("Skipping test since it takes too long in CI.")
-
-    if not os.path.exists(MODEL_JSON):
-        pytest.skip("Run python export_models.py first.")
-
-    mod, params = load_model()
-
-    inp = np.random.randn(1, 3, 224, 224).astype("float32")
-    input_name = "image"
-
-    do_tune = True
-
-    if do_tune:
-        hexagon_lowered = tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher)
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            hexagon_lowered = relay.build(
-                mod,
-                tvm.target.Target(TARGET_HEXAGON, host=TARGET_HEXAGON),
-                params=params,
-                executor=EXECUTOR,
-            )
-
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            mod,
-            tvm.target.Target(TARGET_LLVM, host=TARGET_LLVM),
-            params=params,
-        )
-
-    with hexagon_launcher.create_session() as session:
-        graph_mod = session.get_executor_from_factory(hexagon_lowered)
-        graph_mod.set_input(input_name, inp.copy())
-        graph_mod.run()
-        hexagon_output = graph_mod.get_output(0).numpy()
-
-        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-        llvm_graph_mod.set_input(input_name, inp.copy())
-        llvm_graph_mod.run()
-        ref_result = llvm_graph_mod.get_output(0).numpy()
-
-        np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
-
-        time_ms = graph_mod.benchmark(session.device, number=1, repeat=20).mean * 1e3
-
-        print("time elapsed: ", time_ms)
-
-        debug_ex = session.get_graph_debug_executor(
-            hexagon_lowered.get_graph_json(), hexagon_lowered.lib
-        )
-        print(debug_ex.profile(input_name=inp.copy()))
-
-
-def evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp, benchmark=False):
-    """Evaluate the Modules against llvm version."""
-    with hexagon_launcher.create_session() as session:
-        graph_mod = session.get_executor_from_factory(hexagon_lowered)
-        graph_mod.set_input(input_name, inp.copy())
-        graph_mod.run()
-        output = graph_mod.get_output(0).numpy()
-
-        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-        llvm_graph_mod.set_input(input_name, inp.copy())
-        llvm_graph_mod.run()
-        ref_result = llvm_graph_mod.get_output(0).numpy()
-
-        if benchmark:
-            time_ms = graph_mod.benchmark(session.device, number=1, repeat=1).mean * 1e3
-            print("hexagon time elapsed: ", time_ms)
-            debug_ex = session.get_graph_debug_executor(
-                hexagon_lowered.get_graph_json(), hexagon_lowered.lib
-            )
-            print(debug_ex.profile(input_name=inp.copy()))
-
-        np.testing.assert_allclose(ref_result, output, atol=1e-4, rtol=1e-5)
-
-
-def _schedule_packed_8x8x32_conv2d():
-    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
-    using 8x8x32 packed layout.
-    """
-
-    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
-        if conv2d_block is None:
-            if has_block(sch, "conv2d_NCHWc_int8"):
-                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
-            else:
-                return False
-
-        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
-
-        # Apply scheduling
-
-        post_blocks = sch.get_consumers(conv2d_block)
-        if len(post_blocks) > 0:
-            # Fuse all intermediate post ops into the last op.
-            # This is equivalent to the traverse_inline function used in TE schedules.
-            while True:
-                next_post_blocks = []
-                for post_block in post_blocks:
-                    next_consumers = sch.get_consumers(post_block)
-                    if len(next_consumers) > 0:
-                        sch.compute_inline(post_block)
-                    next_post_blocks += next_consumers
-                if len(next_post_blocks) == 0:
-                    assert len(post_blocks) == 1
-                    outer_block = post_blocks[0]
-                    break
-                post_blocks = next_post_blocks
-        else:
-            outer_block = conv2d_block
-
-        # Move the conv2d mma into the injective post mma compute block
-        if outer_block != conv2d_block:
-            loops = sch.get_loops(outer_block)
-            # TODO(csullivan): Currently does all post conv2d mma steps
-            # directly after accumulation for one spatial pixel. May
-            # be desirable to do this with coarser spatial granularity
-            sch.compute_at(conv2d_block, loops[4])
-
-        def index_map_nchw32c_nchw8h8w32c(n_batch, channel, height, width, channel_32):
-            return [n_batch, channel, height // 8, width // 8, height % 8, width % 8, channel_32]
-
-        # Add cache for input and output activation layout transform,
-        # note that weight is already in correct layout
-        # pylint: disable=unused-variable
-        input_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
-        output_cache = sch.cache_write(outer_block, 0, "global.vtcm")
-        # Transform the layout of the input
-        sch.transform_layout(
-            conv2d_block, ("read", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
-        )
-        # Transform the layout of the int32 accumulator
-        sch.transform_layout(
-            conv2d_block, ("write", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
-        )
-        # Transform the layout of the output
-        sch.transform_layout(
-            outer_block, ("write", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
-        )
-        return True
-
-    return schedule_fn
-
-
-def tune_conv2d_template(
-    mod,
-    scheduler,
-    schedule_tag,
-    params,
-    hexagon_launcher,
-    pass_config: Mapping[str, Any] = MappingProxyType({}),
-):
-    """Generate packed 8*8*32 template."""
-
-    def schedule_rule_conv2d(sch: Schedule, conv2d_block: BlockRV):
-        scheduler()(sch, conv2d_block)
-        return [sch]
-
-    register_func(
-        "meta_schedule.conv2d_NCHWc_int8.{}.hexagon".format(schedule_tag), schedule_rule_conv2d
-    )
-
-    def schedule_conv2d_for_tune(sch: Schedule):
-        scheduler()(sch)
-
-    # This line is necessary for link-params to take effect during
-    # task extraction and relay.build(...).
-    mod = mod.with_attr("executor", EXECUTOR)
-
-    pass_context = None
-    if len(pass_config.items()) > 0:
-        pass_context = (
-            tvm.transform.PassContext(opt_level=3, config=pass_config)
-            if pass_config is not None
-            else None
-        )
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=TARGET_HEXAGON,
-            params=params,
-            work_dir=work_dir,
-            max_trials_global=20000,
-            max_trials_per_task=1,
-            num_trials_per_iter=1,
-            strategy="replay-trace",
-            builder=get_hexagon_local_builder(pass_context),
-            runner=get_hexagon_rpc_runner(hexagon_launcher, number=1),
-            # Apply MS auto scheduling rules for all blocks, but utilize
-            # the custom block scheduling strategy registered above for
-            # blocks annotated as `schedule_rule:meta_schedule.conv2d_NCHWc_int8`
-            # space=ms.space_generator.PostOrderApply(
-            #     f_block_filter=None,
-            #     sch_rules="from-target",
-            #     postprocs=[],
-            #     mutator_probs="from-target",
-            # ),
-            # Constrain search space to only be the single
-            # schedule provided for all blocks. No auto
-            # scheduling will be possible.
-            space=ms.space_generator.ScheduleFn(
-                schedule_conv2d_for_tune,
-                sch_rules=[],
-                postprocs=[],
-                mutator_probs={},
-            ),
-            # Without this, the same workloads with different constant weights
-            # are treated as distinct tuning tasks.
-            module_equality="ignore-ndarray",
-        )
-
-        # Add default options so that it still uses the base config.
-        pass_config["relay.backend.use_meta_schedule"] = True
-        pass_config["relay.backend.tir_converter"] = "default"
-        return ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=TARGET_HEXAGON,
-            params=params,
-            pass_config=pass_config,
-        )
-
-
-@tvm.testing.requires_hexagon
-def test_packed_8x8x32_resnet50(hexagon_launcher):
-    """Test packed 8*8*32 Resnet50"""
-
-    if tvm.testing.utils.IS_IN_CI:
-        pytest.skip("Skipping test since it takes too long in CI.")
-
-    mod, params = load_model()
-
-    inp = np.random.randn(1, 3, 224, 224).astype("float32")
-    input_name = "image"
-
-    do_tune = True
-
-    if do_tune:
-        hexagon_lowered = tune_conv2d_template(
-            mod, _schedule_packed_8x8x32_conv2d, "packed_8x8x32", params, hexagon_launcher, {}
-        )
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            hexagon_lowered = relay.build(
-                mod,
-                tvm.target.Target(TARGET_HEXAGON, host=TARGET_HEXAGON),
-                params=params,
-                executor=EXECUTOR,
-            )
-
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            mod,
-            tvm.target.Target(TARGET_LLVM, host=TARGET_LLVM),
-            params=params,
-        )
-
-    evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp)
-
-
-def _schedule_async_dma_conv2d():
-    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
-    using 8x8x32 packed layout.
-    """
-
-    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
-        if conv2d_block is None:
-            if has_block(sch, "conv2d_NCHWc_int8"):
-                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
-            else:
-                return False
-
-        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
-
-        # Apply scheduling
-
-        post_blocks = sch.get_consumers(conv2d_block)
-        if len(post_blocks) > 0:
-            # Fuse all intermediate post ops into the last op.
-            # This is equivalent to the traverse_inline function used in TE schedules.
-            while True:
-                next_post_blocks = []
-                for post_block in post_blocks:
-                    next_consumers = sch.get_consumers(post_block)
-                    if len(next_consumers) > 0:
-                        sch.compute_inline(post_block)
-                    next_post_blocks += next_consumers
-                if len(next_post_blocks) == 0:
-                    assert len(post_blocks) == 1
-                    outer_block = post_blocks[0]
-                    break
-                post_blocks = next_post_blocks
-        else:
-            outer_block = conv2d_block
-
-        # Move the conv2d mma into the injective post mma compute block
-        if outer_block != conv2d_block:
-            loops = sch.get_loops(outer_block)
-            # Compute at the second loop for pipelining.
-            sch.compute_at(conv2d_block, loops[1], preserve_unit_loops=True)
-
-        # Add cache for input and output for copying data to vtcm.
-        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
-        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
-        sch.fuse(*sch.get_loops(input_a_cache)[2:])
-
-        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
-        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
-        sch.fuse(*sch.get_loops(input_b_cache)[2:])
-
-        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
-        sch.fuse(*sch.get_loops(output_cache_write)[2:])
-
-        conv2d_loops = sch.get_loops(block=conv2d_block)
-        o_c, k_h, k_w, x_0, x_1, i_c = conv2d_loops[-6:]
-        ic_o, ic_i = sch.split(loop=i_c, factors=[None, 4], preserve_unit_iters=True)
-        oc_o, oc_i = sch.split(loop=o_c, factors=[None, 32], preserve_unit_iters=True)
-        sch.reorder(oc_o, k_h, k_w, x_0, x_1, ic_o, oc_i, ic_i)
-        new_loops = sch.get_loops(block=conv2d_block)
-        sch.parallel(new_loops[4])
-        sch.unroll(new_loops[5])
-        # TODO(nverke): Add compute optimizations here.
-        sch.blockize(target=oc_i)
-
-        sch.tensorize(oc_i, VRMPY_u8i8i32_VTCM_INTRIN)
-
-        pipeline_loop = conv2d_loops[1]
-        sch.annotate(pipeline_loop, "software_pipeline_stage", [0, 0, 1, 2, 3])
-        sch.annotate(pipeline_loop, "software_pipeline_order", [0, 1, 2, 3, 4])
-        sch.annotate(pipeline_loop, "software_pipeline_async_stages", [0, 2])
-
-        return True
-
-    return schedule_fn
-
-
-@tvm.testing.requires_hexagon
-def test_async_dma_resnet50(hexagon_launcher):
-    """Test async dma Resnet50"""
-
-    if tvm.testing.utils.IS_IN_CI:
-        pytest.skip("Skipping test since it takes too long in CI.")
-
-    mod, params = load_model()
-
-    inp = np.random.randn(1, 3, 224, 224).astype("float32")
-    input_name = "image"
-
-    pass_config = {
-        "tir.use_async_copy": 1,
-        "relay.backend.use_meta_schedule": True,
-        "relay.backend.tir_converter": "default",
-    }
-
-    hexagon_lowered = tune_conv2d_template(
-        mod, _schedule_async_dma_conv2d, "async_dma", params, hexagon_launcher, pass_config
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            mod, tvm.target.Target(TARGET_LLVM, host=TARGET_LLVM), params=params
-        )
-    evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp, True)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_autotvm.py b/tests/python/contrib/test_hexagon/test_autotvm.py
deleted file mode 100644
index 74ea66ab248a..000000000000
--- a/tests/python/contrib/test_hexagon/test_autotvm.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Minimal example of tuning on hexagon. """
-
-import contextlib
-import os
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import autotvm, te
-from tvm.autotvm.tuner import GATuner, XGBTuner
-
-from .infrastructure import get_hexagon_target
-
-
-@autotvm.template("demo_template")
-def demo_template():
-    """Initial demo template"""
-    size_m, size_n, size_k = [1024] * 3
-    input1 = te.placeholder((size_m, size_k), dtype="float32")
-    input2 = te.placeholder((size_n, size_k), dtype="float32")
-    k = te.reduce_axis((0, 1024), name="k")
-    output = te.compute(
-        (size_m, size_n), lambda i, j: te.sum(input1[i, k] * input2[j, k], axis=[k])
-    )
-
-    s = te.create_schedule(output.op)
-    cfg = autotvm.get_config()
-
-    _, _ = s[output].op.axis
-    (k_iter,) = s[output].op.reduce_axis
-
-    cfg.define_split("k_split", k_iter, num_outputs=2)
-    _, _ = cfg["k_split"].apply(s, output, k_iter)
-
-    return s, [input1, input2, output]
-
-
-class HexagonModuleLoader:
-    """HexagonModuleLoader"""
-
-    def __init__(self, hexagon_session, pre_load_function=None) -> None:
-        self.pre_load_function = pre_load_function
-        self.hexagon_session = hexagon_session
-
-    @contextlib.contextmanager
-    def __call__(self, remote_kwargs, build_result):
-        remote = self.hexagon_session._rpc
-        if self.pre_load_function is not None:
-            self.pre_load_function(remote, build_result)
-
-        try:
-            yield remote, self.hexagon_session.load_module(build_result)
-        finally:
-            pass
-
-
-def tune_tasks(
-    tasks,
-    measure_option,
-    tuner="xgb",
-    n_trial=2048,
-    early_stopping=None,
-    log_filename="tuning.log",
-    use_transfer_learning=True,
-):
-    """Tune tasks with different tuners"""
-
-    tmp_log_file = log_filename + ".tmp"
-    if os.path.exists(tmp_log_file):
-        os.remove(tmp_log_file)
-
-    for i, tsk in enumerate(reversed(tasks)):
-        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
-        if tuner == "xgb":
-            tuner_obj = XGBTuner(tsk, loss_type="reg")
-        elif tuner == "xgb_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob")
-        elif tuner == "xgb_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar")
-        elif tuner == "xgb_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve")
-        elif tuner == "xgb_rank":
-            tuner_obj = XGBTuner(tsk, loss_type="rank")
-        elif tuner == "xgb_rank_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
-        elif tuner == "xgb_rank_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar")
-        elif tuner == "xgb_rank_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve")
-        elif tuner == "xgb_rank_binary":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary")
-        elif tuner == "xgb_rank_binary_knob":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob")
-        elif tuner == "xgb_rank_binary_itervar":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar")
-        elif tuner == "xgb_rank_binary_curve":
-            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve")
-        elif tuner == "ga":
-            tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == "random":
-            tuner_obj = RandomTuner(tsk)
-        elif tuner == "gridsearch":
-            tuner_obj = GridSearchTuner(tsk)
-        else:
-            raise ValueError("Invalid tuner: " + tuner)
-
-        if use_transfer_learning:
-            if os.path.isfile(tmp_log_file):
-                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
-
-        tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(
-            n_trial=tsk_trial,
-            early_stopping=early_stopping,
-            measure_option=measure_option,
-            callbacks=[
-                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                autotvm.callback.log_to_file(tmp_log_file),
-            ],
-        )
-
-    autotvm.record.pick_best(tmp_log_file, log_filename)
-    os.remove(tmp_log_file)
-
-
-@pytest.mark.skip(reason="AutoTVM tuning is not yet enabled on Hexagon")
-@tvm.testing.requires_hexagon
-def test_autotvm(hexagon_session):
-    """Top level test function for testing autotvm"""
-    logfilename = "./hexagon.autotvm.log"
-
-    options = {
-        "log_filename": logfilename,
-        "early_stopping": None,
-        "measure_option": autotvm.measure_option(
-            builder=autotvm.LocalBuilder(timeout=15),
-            runner=autotvm.RPCRunner(
-                module_loader=HexagonModuleLoader(hexagon_session),
-                key=hexagon_session._remote_kw["key"],
-                host=hexagon_session._remote_kw["host"],
-                port=hexagon_session._remote_kw["port"],
-                number=3,
-                timeout=15,
-                min_repeat_ms=150,
-                # cooldown_interval=150
-            ),
-        ),
-    }
-    task = autotvm.task.create(
-        "demo_template",
-        args=[],
-        target=get_hexagon_target("v68"),
-    )
-    tune_tasks([task], **options)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
deleted file mode 100644
index fdfe3ad2b76e..000000000000
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test Fixed Point Multiply on Hexagon."""
-
-import re
-import numpy as np
-
-import tvm.testing
-from tvm import relay
-from tvm import te
-from tvm.relay.backend import Executor
-from tvm.contrib.hexagon.session import Session
-from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
-
-from .infrastructure import get_hexagon_target
-
-
-@tvm.testing.requires_hexagon
-def test_vmpy_intrinsic_presence():
-    """
-    check intrinsic lowering for fixed_point_multiply operation.
-    GraphExecutor is used here since get_source("asm") is not supported with aot.
-    """
-    ishape = (1, 128)
-    a = relay.var("a", relay.TensorType(ishape, "int32"))
-
-    y = relay.fixed_point_multiply(a, 1395864320, 1)  # 1.3
-
-    relay_mod = tvm.IRModule.from_expr(y)
-
-    params = {}
-    executor = Executor("graph", {"link-params": True})
-
-    with tvm.transform.PassContext(opt_level=3):
-        hexagon_lowered = tvm.relay.build(
-            relay_mod,
-            get_hexagon_target("v68"),
-            executor=executor,
-            params=params,
-        )
-
-    asm = hexagon_lowered.lib.get_source("asm")
-
-    # Check that 'vmpye' instruction was generated in asm file.
-    vmpye_regex = re.compile(r"v\d{1,2}.w = vmpye\(v\d{1,2}.w,v\d{1,2}.uh\)")
-    assert vmpye_regex.search(asm) is not None
-
-    # Check that 'vmpyo' instruction was generated in asm file.
-    vmpyo_regex = re.compile(r"v\d{1,2}.w \+= vmpyo\(v\d{1,2}.w,v\d{1,2}.h\):<<1:rnd:sat:shift")
-    assert vmpyo_regex.search(asm) is not None
-
-
-def build_module(relay_mod, target):
-    params = {}
-    executor = Executor("aot", {"link-params": True})
-    lowered = tvm.relay.build(
-        relay_mod,
-        tvm.target.Target(target, host=target),
-        executor=executor,
-        params=params,
-    )
-    return lowered
-
-
-def run_module(mod, inputs):
-    mod.set_input(**inputs)
-    mod.run()
-    output = mod.get_output(0).numpy()
-    return output
-
-
-class TestFixedPointMultiply:
-    """Fixed point Multiply test class"""
-
-    in_scale_const, out_scale_const = tvm.testing.parameters(
-        (1.3, 30.0),
-        (1.37, 1.0),
-        (0.6, 1.0),
-        ((1.7, 0.6), 1.0),
-        ((0.007, 1.9), 1.0),
-    )
-
-    multiplier, shift = tvm.testing.parameters(
-        (1288490240, -2),  # 0.15
-        (1395864320, 1),  # 1.3
-        (1288490188, 0),  # 0.6
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_per_tensor(self, hexagon_session: Session, multiplier: int, shift: int):
-        """Fixed point multiply test."""
-        ishape = (6, 32)
-        a = relay.var("a", relay.TensorType(ishape, "int32"))
-        fpm = relay.fixed_point_multiply(a, multiplier, shift)
-        relay_mod = tvm.IRModule.from_expr(fpm)
-
-        with tvm.transform.PassContext(opt_level=3):
-            # Compile for Hexagon...
-            hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET)
-
-            # Compile for LLVM...
-            llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
-
-        data_in = np.arange(-96, 96).reshape(ishape)
-        inputs = {"a": data_in}
-
-        # Run hexagon...
-        hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-        hexagon_output = run_module(hexagon_mod, inputs)
-
-        # Run llvm...
-        llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
-        expected_output = run_module(llvm_mod, inputs)
-
-        tvm.testing.assert_allclose(hexagon_output, expected_output)
-
-    @tvm.testing.requires_hexagon
-    def test_per_channel(self, hexagon_session: Session, in_scale_const, out_scale_const):
-        """Per channel multiply test."""
-        ishape = [1, 128, 56, 56]
-        axis = 1
-        a = relay.var("a", shape=ishape, dtype="int32")
-
-        # Make list of input scales from in_scale_const parameter.
-        if isinstance(in_scale_const, tuple):
-            in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const))
-        else:
-            in_scale = [in_scale_const] * ishape[axis]
-        assert len(in_scale) == ishape[axis]
-
-        # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype.
-        iscale = relay.const(in_scale)
-        izero = relay.const(0)
-        oscale = relay.const(out_scale_const)
-        ozero = relay.const(0)
-        op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32")
-        mod = tvm.IRModule.from_expr(op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            # Compile for Hexagon...
-            hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET)
-
-            # Compile for LLVM...
-            llvm_lowered = build_module(mod, tvm.target.Target("llvm"))
-
-        a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape)
-        inputs = {"a": a_np}
-
-        # Run hexagon...
-        hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-        hexagon_output = run_module(hexagon_mod, inputs)
-
-        # Run llvm...
-        llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
-        expected_output = run_module(llvm_mod, inputs)
-
-        tvm.testing.assert_allclose(hexagon_output, expected_output)
-
-    vector_size = tvm.testing.parameter(32, 64, 128, 256)
-
-    def test_per_tensor_with_lanes(self, hexagon_session: Session, vector_size):
-        """Test fixed point multiply with vectorization.
-        Vectorization size is more than hw vector length"""
-        ishape = [2, 256, 16]
-
-        def q_mul_shift(shape):
-            x = te.placeholder(shape, name="X", dtype="int32")
-            out = te.compute(
-                shape,
-                lambda i, j, k: tvm.tir.q_multiply_shift(
-                    x[i, j, k],
-                    tvm.tir.const(1395864320, "int32"),
-                    tvm.tir.const(31, "int32"),
-                    tvm.tir.const(1, "int32"),
-                ),
-                name="compute",
-            )
-            return te.create_prim_func([x, out])
-
-        mod = q_mul_shift(ishape)
-
-        # Schedule with vectorization
-        sch = tvm.tir.Schedule(mod)
-        b00 = sch.get_block(name="compute", func_name="main")
-        fused = sch.fuse(*sch.get_loops(block=b00))
-        _, v = sch.split(loop=fused, factors=[None, vector_size])
-        sch.vectorize(v)
-
-        with tvm.transform.PassContext(opt_level=3):
-            hex_lib = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
-            host_lib = tvm.build(mod, target=tvm.target.Target("llvm"))
-
-        asm = hex_lib.get_source("asm")
-
-        # Check that 'vmpye' instruction was generated in asm file.
-        vmpye_regex = re.compile(r"v\d{1,2}.w = vmpye\(v\d{1,2}.w,v\d{1,2}.uh\)")
-        assert vmpye_regex.search(asm) is not None
-
-        # Check that 'vmpyo' instruction was generated in asm file.
-        vmpyo_regex = re.compile(r"v\d{1,2}.w \+= vmpyo\(v\d{1,2}.w,v\d{1,2}.h\):<<1:rnd:sat:shift")
-        assert vmpyo_regex.search(asm) is not None
-
-        # Verify accuracy
-        a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape).astype("int32")
-        b_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape).astype("int32")
-        hex_args = [
-            tvm.runtime.ndarray.array(arg, device=hexagon_session.device, mem_scope="global")
-            for arg in [a_np, b_np]
-        ]
-        host_args = [tvm.runtime.ndarray.array(arg) for arg in [a_np, b_np]]
-
-        hex_rt = hexagon_session.load_module(hex_lib)
-        hex_rt(*hex_args)
-        host_lib(*host_args)
-
-        assert np.allclose(hex_args[1].numpy(), host_args[1].numpy())
-
-    def test_per_channel_with_lanes(self, hexagon_session: Session, vector_size):
-        """Test fixed point multiply with vectorization.
-        Vectorization size is more than hw vector length"""
-        a_shape = [2, 256, 16]
-        b_shape = [256]
-
-        def q_mul_shift(shape):
-            shift_shape = [shape[1]]
-            x = te.placeholder(shape, name="X", dtype="int32")
-            y = te.placeholder(shift_shape, name="X", dtype="int32")
-            l_shift = te.placeholder(shift_shape, name="X", dtype="int32")
-            r_shift = te.placeholder(shift_shape, name="X", dtype="int32")
-
-            out = te.compute(
-                shape,
-                lambda i, j, k: tvm.tir.q_multiply_shift_per_axis(
-                    x[i, j, k],
-                    y[j],
-                    l_shift[j],
-                    r_shift[j],
-                    tvm.tir.const(31, "int32"),
-                    tvm.tir.const(1, "bool"),
-                    tvm.tir.const(0, "bool"),
-                ),
-                name="compute",
-            )
-            return te.create_prim_func([x, y, l_shift, r_shift, out])
-
-        mod = q_mul_shift(a_shape)
-
-        # Schedule with vectorization
-        sch = tvm.tir.Schedule(mod)
-        b00 = sch.get_block(name="compute", func_name="main")
-        fused = sch.fuse(*sch.get_loops(block=b00))
-        _, v = sch.split(loop=fused, factors=[None, vector_size])
-        sch.vectorize(v)
-
-        with tvm.transform.PassContext(opt_level=3):
-            hex_lib = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
-            host_lib = tvm.build(mod, target=tvm.target.Target("llvm"))
-
-        asm = hex_lib.get_source("asm")
-
-        # Check that 'vmpye' instruction was generated in asm file.
-        vmpye_regex = re.compile(r"v\d{1,2}.w = vmpye\(v\d{1,2}.w,v\d{1,2}.uh\)")
-        assert vmpye_regex.search(asm) is not None
-
-        # Check that 'vmpyo' instruction was generated in asm file.
-        vmpyo_regex = re.compile(r"v\d{1,2}.w \+= vmpyo\(v\d{1,2}.w,v\d{1,2}.h\):<<1:rnd:sat:shift")
-        assert vmpyo_regex.search(asm) is not None
-
-        # Verify accuracy
-        x_np = (
-            np.random.randint(-1000, 1000, size=np.prod(a_shape)).reshape(a_shape).astype("int32")
-        )
-        y_np = (
-            np.random.randint(-1000, 1000, size=np.prod(b_shape)).reshape(b_shape).astype("int32")
-        )
-        lsh_np = np.random.randint(0, 10, size=np.prod(b_shape)).reshape(b_shape).astype("int32")
-        rsh_np = np.random.randint(0, 10, size=np.prod(b_shape)).reshape(b_shape).astype("int32")
-        b_np = (
-            np.random.randint(-1000, 1000, size=np.prod(a_shape)).reshape(a_shape).astype("int32")
-        )
-        np_args = [x_np, y_np, lsh_np, rsh_np, b_np]
-        hex_args = [
-            tvm.runtime.ndarray.array(arg, device=hexagon_session.device, mem_scope="global")
-            for arg in np_args
-        ]
-        host_args = [tvm.runtime.ndarray.array(arg) for arg in np_args]
-
-        hex_rt = hexagon_session.load_module(hex_lib)
-        hex_rt(*hex_args)
-        host_lib(*host_args)
-
-        assert np.allclose(hex_args[4].numpy(), host_args[4].numpy())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py
deleted file mode 100644
index 2919e3f641de..000000000000
--- a/tests/python/contrib/test_hexagon/test_models.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test mobilenet model with aot executor"""
-
-import numpy as np
-import pytest
-
-import tvm.testing
-from tvm import relay
-from tvm.contrib.hexagon.session import Session
-from tvm.relay.backend import Executor, Runtime
-
-
-def get_mobilenet():
-    """Download and import mobilenet model with ONNX"""
-    onnx = pytest.importorskip("onnx")
-
-    model_url = "https://github.com/onnx/models/raw/131c99da401c757207a40189385410e238ed0934/vision/classification/mobilenet/model/mobilenetv2-7.onnx"  # pylint: disable=line-too-long
-    model_path = tvm.contrib.download.download_testdata(
-        model_url, "mobilenetv2-7.onnx", module="onnx"
-    )
-    return onnx.load(model_path)
-
-
-@pytest.mark.parametrize("enable_usmp", [False, True])
-@tvm.testing.requires_hexagon
-def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, enable_usmp):
-    """Test mobilenet with aot executor"""
-    dtype = "float32"
-    onnx_model = get_mobilenet()
-
-    data_in = np.random.rand(1, 3, 224, 224).astype(dtype=dtype)
-
-    input_name = "data"
-    shape_dict = {input_name: data_in.shape}
-    relay_mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)
-    inputs = {input_name: data_in}
-
-    target_llvm = tvm.target.Target("llvm")
-    config = {"tir.usmp.enable": enable_usmp}
-    with tvm.transform.PassContext(opt_level=3, config=config):
-        hexagon_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(aot_target, host=aot_host_target),
-            runtime=Runtime("cpp"),
-            executor=Executor("aot", {"unpacked-api": False, "interface-api": "packed"}),
-            params=params,
-        )
-
-    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_mod.set_input(**inputs)
-    hexagon_mod.run()
-    hexagon_output = hexagon_mod.get_output(0).numpy()
-
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
-            runtime=Runtime("cpp"),
-            executor=Executor("aot", {"interface-api": "packed"}),
-            params=params,
-        )
-
-    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
-    llvm_mod.set_input(**inputs)
-    llvm_mod.run()
-    expected_output = llvm_mod.get_output(0).numpy()
-    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_qnn_op_integration.py b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
deleted file mode 100644
index dbf217ce4e7c..000000000000
--- a/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
+++ /dev/null
@@ -1,576 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""Tests for QNN operations on Hexagon"""
-
-import numpy as np
-
-import tvm.testing
-import tvm.topi.testing
-from tvm import relay
-from tvm.contrib.hexagon.session import Session
-from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
-from tvm.relay.backend import Executor
-from tvm.relay.testing import run_opt_pass, run_infer_type
-
-from .infrastructure import quantize_np
-
-
-@tvm.testing.requires_hexagon
-def test_disable_qnn_legalize_pass():
-    """No QNN pass test."""
-    x = relay.var("x", shape=(4, 8), dtype="float32")
-    op0 = relay.qnn.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8")
-    op1 = relay.qnn.dequantize(op0, relay.const(0.5), relay.const(5))
-    relay_mod = tvm.IRModule.from_expr(op1)
-
-    target_hexagon = tvm.target.hexagon("v68")
-    # Default compilation flow
-    with tvm.transform.PassContext(opt_level=3):
-        opt_with_legalize, _ = relay.optimize(
-            relay_mod, tvm.target.Target(target_hexagon, host=target_hexagon)
-        )
-
-    # Disable QNN legalization and canonicalization passes
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
-        opt_without_legalize, _ = relay.optimize(
-            relay_mod, tvm.target.Target(target_hexagon, host=target_hexagon)
-        )
-
-    # Check that QNN ops are absent with default compilation flow.
-    text_with_legalize = opt_with_legalize.astext(show_meta_data=False)
-    assert "qnn.quantize" not in text_with_legalize and "qnn.dequantize" not in text_with_legalize
-
-    # Check that QNN ops are present without "qnn.Legalize" passes.
-    text_without_legalize = opt_without_legalize.astext(show_meta_data=False)
-    assert "qnn.quantize" in text_without_legalize and "qnn.dequantize" in text_without_legalize
-
-
-def build_hexagon_module(relay_mod):
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["QnnCanonicalize"]):
-        exe_mod = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
-            executor=Executor("aot"),
-        )
-
-    return exe_mod
-
-
-def build_ref_module(relay_mod):
-    target_llvm = tvm.target.Target("llvm")
-    with tvm.transform.PassContext(opt_level=3):
-        exe_mod = tvm.relay.build(
-            relay_mod, tvm.target.Target(target_llvm, host=target_llvm), executor=Executor("aot")
-        )
-    return exe_mod
-
-
-def execute(mod_executor, inputs: dict):
-    for input_name, input_data in inputs.items():
-        mod_executor.set_input(input_name, input_data)
-    mod_executor.run()
-    return [mod_executor.get_output(i).numpy() for i in range(mod_executor.get_num_outputs())]
-
-
-def execute_on_hexagon(hexagon_session, exe_mod, inputs: dict):
-    return execute(hexagon_session.get_executor_from_factory(exe_mod), inputs)
-
-
-def execute_on_cpu(exe_mod, inputs: dict):
-    return execute(tvm.runtime.executor.AotModule(exe_mod["default"](tvm.cpu(0))), inputs)
-
-
-def assert_allclose(actuals, desireds, rtol=1e-07, atol=0.01):
-    return [tvm.testing.assert_allclose(a, d, rtol, atol) for a, d in zip(actuals, desireds)]
-
-
-def run_and_compare(hexagon_session, relay_mod, inputs, rtol=None, atol=None):
-    """Compile and execute given relay module on CPU and Hexagon, and compare
-    results"""
-    hexagon_mod = build_hexagon_module(relay_mod)
-    cpu_mod = build_ref_module(relay_mod)
-
-    hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs)
-    cpu_outs = execute_on_cpu(cpu_mod, inputs)
-
-    # Do not pass rtol/atol if not present to use default values from assert_allclose
-    tolerances = dict()
-    if rtol is not None:
-        tolerances["rtol"] = rtol
-    if atol is not None:
-        tolerances["atol"] = atol
-
-    assert_allclose(hexagon_outs, cpu_outs, **tolerances)
-
-
-# First test basic QNN ops: quantize, dequantize, requantize
-#
-class TestQnnQuantize:
-    """QNN Quantize test class."""
-
-    input_shape = tvm.testing.parameter([1, 8, 8, 32], [1, 10, 10, 32], [1, 12, 12, 128])
-    odtype = tvm.testing.parameter("int8", "uint8")
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_quantize(self, hexagon_session: Session, odtype, input_shape):
-        """Test qnn.quantize"""
-
-        def gen_relay_expr_qnn(output_scale, output_zero_point):
-            data = relay.var("data", shape=input_shape, dtype="float32")
-            qnn_quantize = relay.qnn.quantize(
-                data,
-                output_scale=relay.const(output_scale),
-                output_zero_point=relay.const(output_zero_point),
-                axis=-1,
-                out_dtype=odtype,
-            )
-            return qnn_quantize
-
-        inputs = {"data": np.random.random(input_shape)}
-        # Use quantize_np to obtain reasonable quantization parameters.
-        ref_out, scale, zero_point = quantize_np(inputs["data"], odtype)
-
-        relay_mod = tvm.IRModule.from_expr(gen_relay_expr_qnn(scale, zero_point))
-
-        hexagon_mod = build_hexagon_module(relay_mod)
-        hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs)
-        assert_allclose(hexagon_outs, [ref_out], atol=1)
-
-
-class TestQnnDequantize:
-    """QNN Dequantize test class."""
-
-    input_shape = tvm.testing.parameter(
-        [1, 12, 32, 128], [1, 10, 10, 32], [1, 6, 6, 2048], [1, 1000]
-    )
-    idtype = tvm.testing.parameter("int8", "uint8")
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_dequantize(self, hexagon_session: Session, idtype, input_shape):
-        """Test qnn.dequantize"""
-
-        def gen_relay_expr_qnn(dtype, input_scale, input_zero_point):
-            data = relay.var("data", shape=input_shape, dtype=dtype)
-            qnn_dequantize = relay.qnn.dequantize(
-                data,
-                input_scale=relay.const(input_scale),
-                input_zero_point=relay.const(input_zero_point),
-            )
-            return qnn_dequantize
-
-        # Generate float data, then quantize it to produce input.
-        ref_out = np.random.random(input_shape)
-        data, scale, zero_point = quantize_np(ref_out, idtype)
-        inputs = {"data": data}
-
-        relay_mod = tvm.IRModule.from_expr(gen_relay_expr_qnn(idtype, scale, zero_point))
-
-        hexagon_mod = build_hexagon_module(relay_mod)
-        hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs)
-        # We do
-        #   original -[quantize]-> input -[dequantize]-> output
-        # then compare "original" with "output". Use rtol=1 because of the quantized
-        # format in the middle.
-        assert_allclose(hexagon_outs, [ref_out], rtol=1, atol=1e-2)  # rtol = 1
-
-
-class TestQnnRequantize:
-    """QNN requantize test class"""
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_requantize(self, hexagon_session: Session):
-        """Test qnn.requantize"""
-        data_shape = [256]
-        data = relay.var("data", shape=data_shape, dtype="int32")
-
-        op = relay.qnn.requantize(
-            data,
-            input_scale=relay.const(0.156),
-            input_zero_point=relay.const(2),
-            output_scale=relay.const(0.212),
-            output_zero_point=relay.const(1),
-            out_dtype="int8",
-        )
-        relay_mod = tvm.IRModule.from_expr(op)
-
-        inputs = {"data": np.arange(-256, 256, 2, dtype="int32")}
-
-        run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0)  # equal
-
-
-class TestQnnAvgPool2d:
-    """QNN AvgPool2d test class."""
-
-    _multitest_params = [
-        ([1, 12, 12, 32], "NHWC", [3, 3], [1, 1], [2, 3], [1, 2, 3, 4], False, False),
-        ([1, 18, 18, 32], "NCHW", [3, 3], [2, 2], [2, 1], [1, 2, 3, 4], False, True),
-    ]
-
-    (
-        input_shape,
-        layout,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        ceil_mode,
-        count_include_pad,
-    ) = tvm.testing.parameters(*_multitest_params)
-
-    idtype, odtype = tvm.testing.parameters(("uint8", "uint8"))
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_avg_pool2d(
-        self,
-        hexagon_session: Session,
-        idtype,
-        odtype,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ):
-        """Test qnn.avg_pool2d"""
-
-        def gen_relay_expr_qnn(
-            dtype, input_scale, input_zero_point, output_scale, output_zero_point
-        ):
-            data = relay.var("data", shape=input_shape, dtype=dtype)
-            qnn_avg_pool = relay.qnn.avg_pool2d(
-                data,
-                input_scale=relay.const(input_scale),
-                input_zero_point=relay.const(input_zero_point),
-                output_scale=relay.const(output_scale),
-                output_zero_point=relay.const(output_zero_point),
-                pool_size=kernel,
-                strides=stride,
-                dilation=dilation,
-                padding=padding,
-                ceil_mode=ceil_mode,
-                count_include_pad=count_include_pad,
-                layout=layout,
-            )
-
-            return qnn_avg_pool
-
-        # Generate inputs and reference data first.
-        fp_input = np.random.random(input_shape)
-        fp_output = tvm.topi.testing.poolnd_python(
-            fp_input,
-            kernel,
-            stride,
-            dilation,
-            padding_before=padding[:2],
-            padding_after=padding[2:],
-            pool_type="avg",
-            count_include_pad=count_include_pad,
-            ceil_mode=ceil_mode,
-            layout=layout,
-        )
-        input_data, input_scale, input_zero_point = quantize_np(fp_input, idtype)
-        ref_out, output_scale, output_zero_point = quantize_np(fp_output, odtype)
-        inputs = {"data": input_data}
-
-        relay_mod = tvm.IRModule.from_expr(
-            gen_relay_expr_qnn(
-                idtype, input_scale, input_zero_point, output_scale, output_zero_point
-            )
-        )
-
-        hexagon_mod = build_hexagon_module(relay_mod)
-        hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs)
-        assert_allclose(hexagon_outs, [ref_out], rtol=0, atol=2)
-
-
-class TestQnnBinaryOp:
-    """QNN binary op test class"""
-
-    operation = tvm.testing.parameter(relay.qnn.add, relay.qnn.subtract, relay.qnn.mul)
-    dtype = tvm.testing.parameter("uint8", "int8")
-    input_shape = tvm.testing.parameter([256], [4, 256])
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_binary_op(self, hexagon_session: Session, operation, dtype, input_shape):
-        """Test binary qnn ops"""
-        lhs_shape = [4, 256]
-        rhs_shape = input_shape
-        lhs = relay.var("lhs", shape=lhs_shape, dtype=dtype)
-        rhs = relay.var("rhs", shape=rhs_shape, dtype=dtype)
-        lhs_zp = 1
-        rhs_zp = 3
-
-        op = operation(
-            lhs,
-            rhs,
-            lhs_scale=relay.const(0.041, "float32"),
-            lhs_zero_point=relay.const(lhs_zp, "int32"),
-            rhs_scale=relay.const(0.017, "float32"),
-            rhs_zero_point=relay.const(rhs_zp, "int32"),
-            output_scale=relay.const(0.039, "float32"),
-            output_zero_point=relay.const(2, "int32"),
-        )
-        relay_mod = tvm.IRModule.from_expr(op)
-
-        inputs = {
-            "lhs": np.random.randint(np.iinfo(dtype).min + lhs_zp, np.iinfo(dtype).max, lhs_shape),
-            "rhs": np.random.randint(np.iinfo(dtype).min + rhs_zp, np.iinfo(dtype).max, rhs_shape),
-        }
-
-        run_and_compare(hexagon_session, relay_mod, inputs, atol=1)  # diff by 1 is ok
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_binary_op_broadcasting(self, hexagon_session: Session, operation):
-        """Test binary qnn ops (with argument broadcast)"""
-        lhs_shape = [4, 256]
-        lhs = relay.var("lhs", shape=lhs_shape, dtype="uint8")
-        rhs = relay.const(11, dtype="uint8")
-
-        op = operation(
-            lhs,
-            rhs,
-            lhs_scale=relay.const(0.049, "float32"),
-            lhs_zero_point=relay.const(1, "int32"),
-            rhs_scale=relay.const(0.067, "float32"),
-            rhs_zero_point=relay.const(3, "int32"),
-            output_scale=relay.const(0.041, "float32"),
-            output_zero_point=relay.const(2, "int32"),
-        )
-        relay_mod = tvm.IRModule.from_expr(op)
-
-        inputs = {"lhs": np.random.randint(1, 255, size=lhs_shape)}
-
-        run_and_compare(hexagon_session, relay_mod, inputs, atol=1)  # diff by 1 is ok
-
-
-class TestQnnConcatenate:
-    """QNN concatenate test class"""
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_concatenate(self, hexagon_session: Session):
-        """Test qnn.concatenate"""
-        x_shape = [1, 64]
-        y_shape = [2, 64]
-        z_shape = [3, 64]
-        input_x = relay.var("x", shape=x_shape, dtype="uint8")
-        input_y = relay.var("y", shape=y_shape, dtype="uint8")
-        input_z = relay.var("z", shape=z_shape, dtype="uint8")
-
-        op = relay.qnn.concatenate(
-            (input_x, input_y, input_z),
-            input_scales=(relay.const(0.3), relay.const(0.7), relay.const(1.3)),
-            input_zero_points=(relay.const(0), relay.const(1), relay.const(2)),
-            output_scale=relay.const(0.8),
-            output_zero_point=relay.const(5),
-            axis=0,
-        )
-        relay_mod = tvm.IRModule.from_expr(op)
-
-        inputs = {
-            "x": np.arange(0, 64, 1, dtype="uint8").reshape(x_shape),
-            "y": np.arange(0, 128, 1, dtype="uint8").reshape(y_shape),
-            "z": np.arange(0, 192, 1, dtype="uint8").reshape(z_shape),
-        }
-
-        run_and_compare(hexagon_session, relay_mod, inputs, atol=1)  # diff by 1 is ok
-
-
-class TestQnnConv2D:
-    """QNN conv2d op test class."""
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_quantize_conv2d_requantize(self, hexagon_session: Session):
-        """Tast qnn.conv2d"""
-        data_shape = [1, 8, 32, 32]
-        weight_shape = [16, 8, 3, 3]
-        data = relay.var("data", shape=data_shape, dtype="float32")
-        weight = relay.var("weight", shape=weight_shape, dtype="float32")
-        op0 = relay.qnn.quantize(data, relay.const(0.078), relay.const(0), out_dtype="uint8")
-        op1 = relay.qnn.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8")
-        op2 = relay.qnn.conv2d(
-            op0,
-            op1,
-            input_zero_point=relay.const(0),
-            kernel_zero_point=relay.const(0),
-            input_scale=relay.const(0.078),
-            kernel_scale=relay.const(0.07),
-            padding=[0, 0, 0, 0],
-            channels=16,
-            kernel_size=[3, 3],
-        )
-        op5 = relay.qnn.requantize(
-            op2,
-            input_scale=relay.const(0.05),
-            input_zero_point=relay.const(0),
-            output_scale=relay.const(0.21),
-            output_zero_point=relay.const(61),
-            out_dtype="int8",
-        )
-        relay_mod = tvm.IRModule.from_expr(op5)
-
-        inputs = {
-            "data": np.random.rand(*data_shape),
-            "weight": np.random.rand(*weight_shape) - 0.5,
-        }
-
-        run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0)  # equal
-
-
-class TestQnnDense:
-    """QNN dense op test class."""
-
-    @tvm.testing.requires_hexagon
-    def test_alter_layout_qnn_dense(self):
-        """Test weights layout transformation of qnn.dense with int8 weights"""
-        data = relay.var("data", shape=(128, 16), dtype="uint8")
-        weight = relay.var("weight", shape=(64, 16), dtype="int8")
-        zero = relay.const(0)
-        iscale = relay.const(0.15)
-        wscale = relay.const(0.37)
-
-        def before():
-            return relay.qnn.dense(data, weight, zero, zero, iscale, wscale, units=None)
-
-        def expected():
-            op0 = relay.layout_transform(weight, src_layout="NC", dst_layout="NC32n4c")
-            return relay.qnn.contrib_dense_pack(data, op0, zero, zero, iscale, wscale, "NC32n4c")
-
-        target = tvm.target.hexagon("v68")
-        with tvm.target.Target(target):
-            a = run_opt_pass(before(), tvm.relay.transform.AlterOpLayout())
-            b = run_infer_type(expected())
-            tvm.ir.assert_structural_equal(a, b)
-
-    # Dense + bias_add + requantize
-    #
-    dtype = tvm.testing.parameter("uint8", "int8")
-    n_dim = tvm.testing.parameter(64, 60)
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_dense_biasadd_requantize(self, hexagon_session: Session, dtype, n_dim):
-        """Check lowering of qnn.dense + bias_add + qnn.requantize
-        dtype: type of weights
-        n_dim: N dimension of weights, need to check cases when it is multiple of 32 and not.
-        """
-        data_shape = [128, 32]
-        weight_shape = [n_dim, 32]
-        bias_shape = [n_dim]
-        data = relay.var("data", shape=data_shape, dtype="uint8")
-        weight = relay.var("weight", shape=weight_shape, dtype=dtype)
-        bias = relay.var("bias", shape=bias_shape, dtype="int32")
-
-        op0 = relay.qnn.dense(
-            data,
-            weight,
-            input_zero_point=relay.const(2),
-            kernel_zero_point=relay.const(0),
-            input_scale=relay.const(0.08),
-            kernel_scale=relay.const(0.07),
-            units=None,
-        )
-        op1 = relay.nn.bias_add(op0, bias)
-        op2 = relay.qnn.requantize(
-            op1,
-            input_scale=relay.const(1.3),
-            input_zero_point=relay.const(4),
-            output_scale=relay.const(3.7),
-            output_zero_point=relay.const(1),
-            out_dtype="uint8",
-        )
-        relay_mod = tvm.IRModule.from_expr(op2)
-
-        np.random.seed(0)
-
-        inputs = {
-            "data": np.random.randint(2, 8, size=data_shape, dtype="uint8"),
-            "weight": np.random.randint(0, 8, size=weight_shape, dtype=dtype),
-            "bias": np.random.randint(-10, 10, size=bias_shape, dtype="int32"),
-        }
-
-        run_and_compare(hexagon_session, relay_mod, inputs, atol=1)  # diff by 1 is ok
-
-    # Dense + requantize
-    #
-    @tvm.testing.requires_hexagon
-    def test_qnn_dense_requantize(self, hexagon_session: Session):
-        """Check lowering of qnn.dense + qnn.requantize
-        Checkint the case: data type = "uint8", weight type = "int8", input zp = 0 and kernel zp = 0
-        """
-        data_shape = [128, 32]
-        weight_shape = [64, 32]
-        data = relay.var("data", shape=data_shape, dtype="uint8")
-        weight = relay.var("weight", shape=weight_shape, dtype="int8")
-
-        op0 = relay.qnn.dense(
-            data,
-            weight,
-            input_zero_point=relay.const(0),
-            kernel_zero_point=relay.const(0),
-            input_scale=relay.const(0.06),
-            kernel_scale=relay.const(0.19),
-            units=64,
-        )
-        op1 = relay.qnn.requantize(
-            op0,
-            input_scale=relay.const(0.1),
-            input_zero_point=relay.const(0),
-            output_scale=relay.const(0.24),
-            output_zero_point=relay.const(64),
-            out_dtype="uint8",
-        )
-        relay_mod = tvm.IRModule.from_expr(op1)
-
-        np.random.seed(0)
-
-        inputs = {
-            "data": np.random.randint(0, 8, size=data_shape, dtype="uint8"),
-            "weight": np.random.randint(-4, 4, size=weight_shape, dtype="int8"),
-        }
-
-        run_and_compare(hexagon_session, relay_mod, inputs, atol=1)  # diff by 1 is ok
-
-
-class TestQnnTanh:
-    """QNN tanh test class"""
-
-    @tvm.testing.requires_hexagon
-    def test_qnn_tanh(self, hexagon_session: Session):
-        """Test qnn.tanh"""
-        data_shape = [256]
-        data = relay.var("data", shape=data_shape, dtype="uint8")
-
-        op = relay.qnn.tanh(
-            data,
-            scale=relay.const(0.518),
-            zero_point=relay.const(137),
-            output_scale=relay.const(0.207),
-            output_zero_point=relay.const(128),
-        )
-        relay_mod = tvm.IRModule.from_expr(op)
-
-        inputs = {"data": np.arange(0, 256, 1, dtype="uint8")}
-
-        run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0)  # equal
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_relay_simplify_conv_pat.py b/tests/python/contrib/test_hexagon/test_relay_simplify_conv_pat.py
deleted file mode 100644
index 0f8a9a739559..000000000000
--- a/tests/python/contrib/test_hexagon/test_relay_simplify_conv_pat.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-wildcard-import, invalid-name
-
-"""
-Test hexagon relay transform - qnn.concat optimization
-"""
-import numpy as np
-import tvm
-from tvm.runtime import ndarray as nd
-from tvm import relay, testing
-from tvm.contrib.hexagon.transform import simplify_conv_pat
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.hexagon.session import Session
-from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
-from .infrastructure import build_module, run_module
-
-
-def get_test_module_relay_exprs(isConstScalarMultiplier=True):
-    """
-    Creates relay expressions that can be used both by
-    test module and expected output module
-    """
-
-    act_shape = (1, 32, 32, 3)
-    data_in = np.random.rand(*get_const_tuple(act_shape))
-    data_in_float32 = np.full(data_in.shape, data_in, dtype="float32")
-    kernel_shape = (16, 3, 3, 3)
-    weights = np.random.rand(*get_const_tuple(kernel_shape))
-
-    bias = np.random.rand(get_const_tuple(kernel_shape)[0])
-    relay_act = relay.var("q1", shape=act_shape, dtype="float32")
-    if isConstScalarMultiplier:
-        relay_mul_factor = relay.const(0.00392151, dtype="float32")
-    else:
-        relay_mul_factor = np.random.rand(*get_const_tuple(act_shape))
-        relay_mul_factor = relay.Constant(
-            nd.array(np.full(relay_mul_factor.shape, relay_mul_factor, dtype="float32"))
-        )
-    relay_sub_term = relay.const(0.5, dtype="float32")
-    relay_weights = relay.Constant(nd.array(np.full(weights.shape, weights, dtype="float32")))
-    relay_bias = relay.Constant(nd.array(np.full(bias.shape, bias, dtype="float32")))
-    return (relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias, data_in_float32)
-
-
-def get_test_module_graph(relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias):
-    """Creates a test relay graph with the specified relay expressions"""
-    v1 = relay.multiply(relay_act, relay_mul_factor)
-    v2 = relay.subtract(v1, relay_sub_term)
-    v3 = relay.transpose(v2, axes=[0, 3, 1, 2])
-    weights_type_info = tvm.relay.transform.InferTypeLocal(relay_weights)
-    v4 = relay.nn.conv2d(
-        v3,
-        relay_weights,
-        padding=[1, 1, 1, 1],
-        channels=weights_type_info.shape[0],
-        kernel_size=[3, 3],
-    )
-    graph = relay.nn.bias_add(v4, relay_bias)
-    return graph
-
-
-def get_test_module(relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias):
-    """Creates a test relay module and returns it."""
-    graph = get_test_module_graph(
-        relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias
-    )
-
-    func = relay.Function(relay.analysis.free_vars(graph), graph)
-    mod = tvm.IRModule.from_expr(func)
-    return mod
-
-
-def get_expected_output_module_graph(
-    relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias
-):
-    """Creates the relay graph for expected output"""
-    v1 = relay.transpose(relay_act, axes=[0, 3, 1, 2])
-    v2 = relay.multiply(relay_mul_factor, relay_weights)
-    weights_type_info = tvm.relay.transform.InferTypeLocal(relay_weights)
-    v3 = relay.nn.conv2d(
-        v1, v2, padding=[1, 1, 1, 1], channels=weights_type_info.shape[0], kernel_size=[3, 3]
-    )
-    type_info = tvm.relay.transform.InferTypeLocal(v1)
-    relay_zero_act = relay.Constant(
-        nd.array(np.zeros(get_const_tuple(type_info.shape), dtype="float32"))
-    )
-    v4 = relay.subtract(relay_zero_act, relay_sub_term)
-    v5 = relay.nn.bias_add(v3, relay_bias)
-    v6 = relay.nn.conv2d(
-        v4,
-        relay_weights,
-        padding=[1, 1, 1, 1],
-        channels=weights_type_info.shape[0],
-        kernel_size=[3, 3],
-    )
-    return relay.add(v5, v6)
-
-
-def get_expected_output_module(
-    relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias
-):
-    """Returns manually created expected output module."""
-    graph = get_expected_output_module_graph(
-        relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias
-    )
-
-    out_func = relay.Function(relay.analysis.free_vars(graph), graph)
-    return tvm.IRModule.from_expr(out_func)
-
-
-def get_test_modules():
-    """generates test, expected modules and their inputs"""
-    (
-        relay_act,
-        relay_mul_factor,
-        relay_sub_term,
-        relay_weights,
-        relay_bias,
-        data_in_float32,
-    ) = get_test_module_relay_exprs()
-    mod = get_test_module(relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias)
-    exp_relay_mod = get_expected_output_module(
-        relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias
-    )
-
-    return mod, exp_relay_mod, {"q1": data_in_float32}
-
-
-@tvm.testing.requires_hexagon
-def test_simplify_conv_pat(hexagon_session: Session):
-    """A positive test case"""
-
-    (mod, exp_relay_mod, inputs) = get_test_modules()
-
-    with tvm.transform.PassContext(opt_level=3):
-        mod = tvm.relay.transform.InferType()(mod)
-        hexagon_lowered = build_module(
-            mod, tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET)
-        )
-
-    with tvm.transform.PassContext(opt_level=3):
-        mod = simplify_conv_pat(mod)
-        mod = tvm.relay.transform.InferType()(mod)
-        exp_relay_mod = tvm.relay.transform.InferType()(exp_relay_mod)
-        tvm.ir.assert_structural_equal(mod["main"], exp_relay_mod["main"], map_free_vars=True)
-        mod = tvm.relay.transform.FoldConstant()(mod)
-        hexagon_lowered_opt = build_module(
-            mod, tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET)
-        )
-
-    # Run unoptimized llvm module
-    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    expected_output = run_module(hexagon_mod, inputs)
-
-    # Run optimized llvm module
-    hexagon_mod_opt = hexagon_session.get_executor_from_factory(hexagon_lowered_opt)
-    actual_output = run_module(hexagon_mod_opt, inputs)
-
-    tvm.testing.assert_allclose(actual_output, expected_output, rtol=0.00001)
-
-
-def get_negative_test_module():
-    """generates a negative test module with non-const multiplier"""
-    (
-        relay_act,
-        relay_mul_factor,
-        relay_sub_term,
-        relay_weights,
-        relay_bias,
-        _,
-    ) = get_test_module_relay_exprs(False)
-    mod = get_test_module(relay_act, relay_mul_factor, relay_sub_term, relay_weights, relay_bias)
-
-    return mod
-
-
-def test_negative():
-    """A negative test case"""
-    orig_mod = get_negative_test_module()
-    with tvm.transform.PassContext(opt_level=3):
-        orig_mod = tvm.relay.transform.InferType()(orig_mod)
-        opt_mod = simplify_conv_pat(orig_mod)
-        opt_mod = tvm.relay.transform.InferType()(opt_mod)
-        tvm.ir.assert_structural_equal(orig_mod["main"], opt_mod["main"], map_free_vars=True)
-
-
-if __name__ == "__main__":
-    testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_relay_simplify_qnn_concat.py b/tests/python/contrib/test_hexagon/test_relay_simplify_qnn_concat.py
deleted file mode 100644
index 4eda615a1dd5..000000000000
--- a/tests/python/contrib/test_hexagon/test_relay_simplify_qnn_concat.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-wildcard-import, invalid-name
-
-"""
-Test hexagon relay transform - qnn.concat optimization
-"""
-import tvm
-from tvm import relay, testing
-from tvm.contrib.hexagon.transform import simplify_qnn_concat
-
-
-def get_test_module():
-    """Creates a test relay module and returns it."""
-    q1 = relay.var("q1", shape=(1, 64, 35, 35), dtype="uint8")
-    q2 = relay.var("q2", shape=(1, 64, 35, 35), dtype="uint8")
-    q3 = relay.var("q3", shape=(1, 32, 35, 35), dtype="uint8")
-    s2 = relay.const(0.000109401, dtype="float32")
-    s3 = relay.const(0.0486874, dtype="float32")
-    s4 = relay.const(0.0425042, dtype="float32")
-    s5 = relay.const(0.00345, dtype="float32")
-    z1 = relay.const(0, dtype="int32")
-    r1 = relay.op.nn.max_pool2d(
-        q1,
-        pool_size=[3, 3],
-        strides=[1, 1],
-        padding=[1, 1],
-        dilation=[1, 1],
-        ceil_mode=False,
-        layout="NHWC",
-    )
-    r2 = relay.qnn.requantize(q2, s2, z1, s5, z1, axis=1, out_dtype="uint8")
-    q_tuple = relay.expr.Tuple([r1, r2, q3])
-    s_tuple = relay.expr.Tuple([s4, s5, s3])
-    z_tuple = relay.expr.Tuple([z1, z1, z1])
-    graph = relay.qnn.concatenate(q_tuple, s_tuple, z_tuple, s3, z1, axis=1)
-
-    func = relay.Function(relay.analysis.free_vars(graph), graph)
-    mod = tvm.IRModule.from_expr(func)
-    return mod
-
-
-def get_expected_output_module():
-    """Returns manually created expected output module."""
-    out_q1 = relay.var("q1", shape=(1, 64, 35, 35), dtype="uint8")
-    out_q2 = relay.var("q2", shape=(1, 64, 35, 35), dtype="uint8")
-    out_q3 = relay.var("q3", shape=(1, 32, 35, 35), dtype="uint8")
-    out_s2 = relay.const(0.000109401, dtype="float32")
-    out_s3 = relay.const(0.0486874, dtype="float32")
-    out_s4 = relay.const(0.0425042, dtype="float32")
-    out_z1 = relay.const(0, dtype="int32")
-    nn_max_pool = relay.op.nn.max_pool2d(
-        out_q1,
-        pool_size=[3, 3],
-        strides=[1, 1],
-        padding=[1, 1],
-        dilation=[1, 1],
-        ceil_mode=False,
-        layout="NHWC",
-    )
-    out_r1 = relay.qnn.requantize(
-        nn_max_pool, out_s4, out_z1, out_s3, out_z1, axis=1, out_dtype="uint8"
-    )
-    out_r2 = relay.qnn.requantize(out_q2, out_s2, out_z1, out_s3, out_z1, axis=1, out_dtype="uint8")
-    out_q_tuple = relay.expr.Tuple([out_r1, out_r2, out_q3])
-    out_graph = relay.op.concatenate(out_q_tuple, axis=1)
-
-    out_func = relay.Function(relay.analysis.free_vars(out_graph), out_graph)
-    out_mod = tvm.IRModule.from_expr(out_func)
-    return out_mod
-
-
-def test_simplify_qnn_concat():
-    mod = get_test_module()
-    mod = tvm.relay.transform.InferType()(mod)
-    mod = simplify_qnn_concat(mod)
-
-    out_mod = get_expected_output_module()
-    out_mod = tvm.relay.transform.InferType()(out_mod)
-
-    tvm.ir.assert_structural_equal(mod["main"], out_mod["main"])
-
-
-if __name__ == "__main__":
-    testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_relay_transforms.py b/tests/python/contrib/test_hexagon/test_relay_transforms.py
deleted file mode 100644
index 32c8ff126544..000000000000
--- a/tests/python/contrib/test_hexagon/test_relay_transforms.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-wildcard-import, invalid-name
-
-"""
-Test hexagon relay transforms
-"""
-import tvm
-from tvm import relay
-from tvm.contrib.hexagon.transform import rewrite_qdistilbert, remove_empty_pad
-from tvm import testing
-
-
-def test_rewrite_qdistilbert():
-    """Test case for rewrite_qdistilbert"""
-    A = relay.var("A", shape=(12, 128, 64), dtype="int8")
-    B = relay.var("B", shape=(12, 64, 128), dtype="int8")
-
-    z = tvm.tir.IntImm("int64", 0)
-    s1 = tvm.tir.IntImm("int64", 1)
-    tx = tvm.tir.IntImm("int64", 128)
-    ty = tvm.tir.IntImm("int64", 64)
-    expand_dims = []
-    for i in range(12):
-        d1 = relay.const(13, dtype="int32")
-        d2 = relay.const(1, dtype="int32")
-        d3 = relay.const(0.0541715, dtype="float32")
-        d4 = relay.const(0.0489368, dtype="float32")
-
-        q1 = relay.const(0.00265098, dtype="float32")
-        q2 = relay.const(0, dtype="int32")
-        q3 = relay.const(0.728874, dtype="float32")
-        q4 = relay.const(-14, dtype="int32")
-
-        x = tvm.tir.IntImm("int64", i)
-        y = tvm.tir.IntImm("int64", i + 1)
-
-        SA = relay.op.strided_slice(
-            A, begin=[x, z, z], end=[y, tx, ty], strides=[s1, s1, s1], axes=None
-        )
-        RA = relay.op.reshape(SA, [128, 64])
-        SB = relay.op.strided_slice(
-            B, begin=[x, z, z], end=[y, ty, tx], strides=[s1, s1, s1], axes=None
-        )
-        RB = relay.op.reshape(SB, [64, 128])
-        TB = relay.op.transpose(RB, [1, 0])
-        dense = relay.qnn.op.dense(RA, TB, d1, d2, d3, d4, units=None, out_dtype="int32")
-        requantize = relay.qnn.op.requantize(dense, q1, q2, q3, q4)
-        expand_dims.append(relay.op.expand_dims(requantize, axis=0))
-
-    t = relay.expr.Tuple(expand_dims)
-    graph = relay.op.concatenate(t, axis=0)
-
-    func = relay.Function(relay.analysis.free_vars(graph), graph)
-    mod = tvm.IRModule.from_expr(func)
-    mod = rewrite_qdistilbert(mod)
-
-    d1 = relay.const(13, dtype="int32")
-    d2 = relay.const(1, dtype="int32")
-    d3 = relay.const(0.0541715, dtype="float32")
-    d4 = relay.const(0.0489368, dtype="float32")
-
-    q1 = relay.const(0.00265098, dtype="float32")
-    q2 = relay.const(0, dtype="int32")
-    q3 = relay.const(0.728874, dtype="float32")
-    q4 = relay.const(-14, dtype="int32")
-
-    ref = relay.op.transpose(B, [0, 2, 1])
-    ref = relay.qnn.op.batch_matmul(A, ref, d1, d2, d3, d4, out_dtype="int32")
-    ref = relay.qnn.op.requantize(ref, q1, q2, q3, q4, out_dtype="int8")
-    ref_func = relay.Function(relay.analysis.free_vars(ref), ref)
-    ref_mod = tvm.IRModule.from_expr(ref_func)
-
-    tvm.ir.assert_structural_equal(mod["main"], ref_mod["main"])
-
-    # If the pattern does not match, should return the original.
-    func = relay.expr.Tuple(expand_dims)  # omitting concatenate
-    mod = tvm.IRModule.from_expr(func)
-    out_mod = rewrite_qdistilbert(mod)  # out does not return ref_mod but the original mod
-
-    tvm.ir.assert_structural_equal(mod["main"], out_mod["main"])
-
-
-def test_remove_empty_pad():
-    """Test case for remove_empty_pad"""
-    A = relay.var("A", shape=(32, 32), dtype="float16")
-    B = relay.var("B", shape=(32, 32), dtype="float16")
-
-    p0 = relay.cast(relay.const(0, dtype="float32"), dtype="float16")
-    p1 = relay.nn.pad(A, pad_value=p0, pad_width=((0, 0), (0, 0)))
-    graph = relay.nn.matmul(p1, B)
-
-    func = relay.Function(relay.analysis.free_vars(graph), graph)
-    mod = tvm.IRModule.from_expr(func)
-
-    mod = remove_empty_pad(mod)
-
-    ref = relay.nn.matmul(A, B)
-    ref_func = relay.Function(relay.analysis.free_vars(ref), ref)
-    ref_mod = tvm.IRModule.from_expr(ref_func)
-
-    tvm.ir.assert_structural_equal(mod["main"], ref_mod["main"])
-
-
-if __name__ == "__main__":
-    testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/__init__.py b/tests/python/contrib/test_hexagon/topi/__init__.py
deleted file mode 100644
index dce5413e66e2..000000000000
--- a/tests/python/contrib/test_hexagon/topi/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Hexagon TOPI tests """
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/__init__.py b/tests/python/contrib/test_hexagon/topi/slice_op/__init__.py
deleted file mode 100644
index baf28ad93323..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Hexagon TOPI Slice OP tests """
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
deleted file mode 100644
index 92a951106765..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Tests for Hexagon slice argmax op """
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-import tvm.topi.hexagon.slice_ops as sl
-import tvm.contrib.hexagon
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-
-
-class TestArgMaxSlice:
-    """Argmax Slice Op Tests"""
-
-    (
-        input_shape,
-        input_layout,
-        output_layout,
-        dtype,
-        in_axis,
-        in_axis_sep,
-        out_axis_sep,
-    ) = tvm.testing.parameters(
-        ((1, 64, 64, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]),
-        ((3, 32, 16, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]),
-        ((1, 32, 32, 64), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]),
-        ((1, 64, 64, 32), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]),
-        ((3, 32, 16, 32), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]),
-        ((1, 32, 32, 64), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]),
-    )
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        return np.random.uniform(size=input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, input_layout):
-        return transform_numpy(input_np, "nhwc", input_layout)
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, in_axis):
-        ref_np = np.argmax(input_np, *in_axis).astype("int32")
-        return ref_np
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, output_layout):
-        return transform_numpy(expected_output_np, "nhw", output_layout)
-
-    @tvm.testing.requires_hexagon
-    def test_argmax_slice(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        in_axis,
-        transformed_input_np,
-        transformed_expected_output_np,
-        in_axis_sep,
-        out_axis_sep,
-        hexagon_session,
-        working_scope,
-    ):
-        """Top level testing function for argmax"""
-        argmax_input = te.placeholder(input_shape, name="A", dtype=dtype)
-        output = sl.argmax.argmax_compute(argmax_input, in_axis)
-        argmax_func = te.create_prim_func([argmax_input, output])
-        tir_s = sl.argmax_schedule(argmax_func, input_layout, output_layout)
-        input_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=in_axis_sep,
-            mem_scope=working_scope,
-        )
-        output_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=transformed_expected_output_np.dtype,
-            axis_separators=out_axis_sep,
-            mem_scope=working_scope,
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            tir_irm = tvm.lower(tir_s.mod, [argmax_input, output], name="argmax")
-            runtime_module = tvm.build(
-                tir_irm, [argmax_input, output], target=get_hexagon_target("v69"), name="argmax"
-            )
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(input_data, output_data)
-        output_np = output_data.numpy()
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_expected_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
deleted file mode 100644
index 712d5b303eeb..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
+++ /dev/null
@@ -1,478 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-from typing import *
-
-from tvm import te
-import tvm.testing
-from tvm.topi.testing import poolnd_python
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.hexagon.slice_ops as sl
-import tvm.topi.hexagon.qnn as qn
-from tvm.contrib.hexagon import allocate_hexagon_array
-import pytest
-from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target
-from ...pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom
-
-
-dtype = tvm.testing.parameter("uint8", "float16")
-
-
-@tvm.testing.fixture
-def output_layout(output_shape, op_layout, dtype):
-    if op_layout == "NHWC":
-        o_b, o_h, o_w, o_c = output_shape
-        if dtype == "float16":
-            if o_h == 1 and o_w == 1:
-                return "n11c-1024c-2d"
-            else:
-                return "nhwc-8h2w32c2w-2d"
-        elif dtype == "int8" or "uint8":
-            if o_h == 1 and o_w == 1:
-                return "n11c-2048c-2d"
-            else:
-                return "nhwc-8h8w32c-2d"
-        else:
-            raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    elif op_layout == "NCHW":
-        o_b, o_c, o_h, o_w = output_shape
-        if dtype == "float16":
-            if o_h == 1 and o_w == 1:
-                return "nc11-1024c-2d"
-            else:
-                return "nchw-8h2w32c2w-2d"
-        elif dtype == "int8" or "uint8":
-            if o_h == 1 and o_w == 1:
-                return "nc11-2048c-2d"
-            else:
-                return "nchw-8h8w32c-2d"
-        else:
-            raise RuntimeError(f"Unsupported data type '{dtype}'")
-    else:
-        raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'")
-
-
-@tvm.testing.fixture
-def input_layout(op_layout, dtype):
-    in_layout = op_layout.lower()
-    if dtype == "float16":
-        return in_layout + "-8h2w32c2w-2d"
-    elif dtype == "int8" or "uint8":
-        return in_layout + "-8h8w32c-2d"
-    else:
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def input_np(input_shape, dtype: str, input_tensor_populator):
-    if dtype == "uint8":
-        dtype = "float32"  # Use "float32" input which will be quantized later
-    return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator)
-
-
-class TestAvgPool2dSlice:
-    _param_descs = [
-        "out_shape",  # output_shape
-        "kernel",  # kernel
-        "stride",  # stride
-        "dil",  # dilation
-        "pad",  # padding
-        "ceil",  # ceil_mode
-        "cnt_padded",  # count_include_pad
-        "op_layout",  # input output 4D layout
-        None,  # input_tensor_populator
-    ]
-    _multitest_params = [
-        (
-            [1, 7, 11, 32],
-            [3, 3],
-            [3, 2],
-            [2, 3],
-            [1, 2, 3, 4],
-            False,
-            False,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [4, 4],
-            [2, 2],
-            [2, 3],
-            [0, 2, 1, 4],
-            False,
-            False,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        # Test default stride,dilation, and padding with different layouts
-        (
-            [1, 10, 10, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 12, 12, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 32, 14, 14],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NCHW",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 32, 15, 15],
-            [8, 8],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NCHW",
-            TensorContentRandom(),
-        ),
-        # Test non-one stride and dilation with different layouts
-        (
-            [1, 18, 24, 32],
-            [3, 3],
-            [2, 3],
-            [2, 2],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 32, 18, 18],
-            [5, 5],
-            [2, 2],
-            [2, 3],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NCHW",
-            TensorContentRandom(),
-        ),
-        # Test non-zero padding with count include and exclude pad and different layouts
-        (
-            [1, 6, 6, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [1, 1, 1, 1],
-            False,
-            False,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [1, 2],
-            [2, 3],
-            [2, 2, 3, 3],
-            False,
-            False,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 32, 6, 6],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [1, 2, 3, 4],
-            False,
-            False,
-            "NCHW",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 32, 15, 22],
-            [3, 3],
-            [3, 2],
-            [2, 3],
-            [1, 2, 3, 4],
-            False,
-            False,
-            "NCHW",
-            TensorContentRandom(),
-        ),
-        # Test n11c-1024c-2d layout which will require input and output to have different layout
-        (
-            [1, 1, 1, 2048],
-            [8, 8],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [6, 6],
-            [1, 1],
-            [1, 1],
-            [2, 2, 2, 2],
-            False,
-            False,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [4, 4],
-            [2, 2],
-            [2, 3],
-            [0, 2, 1, 4],
-            False,
-            False,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [3, 3],
-            [2, 2],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NHWC",
-            TensorContentRandom(),
-        ),
-        (
-            [1, 2048, 1, 1],
-            [4, 4],
-            [2, 2],
-            [2, 3],
-            [0, 0, 0, 0],
-            False,
-            True,
-            "NCHW",
-            TensorContentRandom(),
-        ),
-    ]
-
-    _param_ids = get_multitest_ids(_multitest_params, _param_descs)
-
-    (
-        output_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        ceil_mode,
-        count_include_pad,
-        op_layout,
-        input_tensor_populator,
-    ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids)
-
-    @tvm.testing.fixture
-    def expected_output_np(
-        self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, op_layout
-    ):
-        pad_before = padding[:2]
-        pad_after = padding[2:]
-        ref_np = poolnd_python(
-            input_np,
-            kernel,
-            stride,
-            dilation,
-            pad_before,
-            pad_after,
-            "avg",  # pool_type
-            count_include_pad,
-            False,  # ceil_mode,
-            layout=op_layout,
-        )
-        return ref_np
-
-    @tvm.testing.fixture
-    def input_shape(
-        self, output_shape, kernel, padding, stride, dilation, op_layout, output_layout
-    ):
-        # Input shape without any padding; 'ceil' is being ignored from calculation:
-        if op_layout == "NHWC":
-            o_b, o_h, o_w, o_c = output_shape
-        else:
-            o_b, o_c, o_h, o_w = output_shape
-        d_h, d_w = dilation
-        s_h, s_w = stride
-        k_h, k_w = kernel
-        pad_before_h, pad_before_w = padding[:2]
-        pad_after_h, pad_after_w = padding[2:]
-
-        if (
-            output_layout == "n11c-2048c-2d"
-            or output_layout == "nc11-2048c-2d"
-            or output_layout == "n11c-1024c-2d"
-            or output_layout == "nc11-1024c-2d"
-        ):
-            assert o_h == 1 and o_w == 1, "Output height and width must be 1"
-
-        in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h
-        in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w
-
-        if op_layout == "NHWC":
-            return [o_b, in_h, in_w, o_c]
-        else:
-            return [o_b, o_c, in_h, in_w]
-
-    @tvm.testing.fixture
-    def schedule_args(
-        self,
-        kernel,
-        stride,
-        padding,
-        dilation,
-        count_include_pad,
-        output_layout,
-        output_shape,
-        input_np,
-        input_shape,
-        input_layout,
-        expected_output_np,
-        dtype,
-        op_layout,
-    ):
-        """Construct schedule args based on dtype"""
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
-        if dtype == "float16":
-            if op_layout == "NHWC":
-                M = sl.avg_pool2d_NHWC(
-                    A, kernel, stride, padding, dilation, count_include_pad, output_shape
-                )
-            elif op_layout == "NCHW":
-                M = sl.avg_pool2d_NCHW(
-                    A, kernel, stride, padding, dilation, count_include_pad, output_shape
-                )
-            else:
-                raise RuntimeError(f"Unsupported layout for slice_op.avg_pool2d '{op_layout}'")
-            tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout)
-        elif dtype in ("uint8", "int8"):
-            _, in_scale, in_zero_point = quantize_np(input_np, dtype)
-            _, out_scale, out_zero_point = quantize_np(expected_output_np, dtype)
-            if op_layout == "NHWC":
-                M = qn.qnn_avg_pool2d_NHWC(
-                    A,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    count_include_pad,
-                    output_shape,
-                    dtype,
-                    in_scale,
-                    in_zero_point,
-                    out_scale,
-                    out_zero_point,
-                )
-            elif op_layout == "NCHW":
-                M = qn.qnn_avg_pool2d_NCHW(
-                    A,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    count_include_pad,
-                    output_shape,
-                    dtype,
-                    in_scale,
-                    in_zero_point,
-                    out_scale,
-                    out_zero_point,
-                )
-            else:
-                raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'")
-
-            tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout)
-
-        return [tir_schedule.mod, [A, M]]
-
-    @tvm.testing.requires_hexagon
-    def test_avg_pool2d_slice(
-        self, dtype, input_np, expected_output_np, schedule_args, hexagon_session: Session
-    ):
-        print("schedule_args : ", schedule_args)
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(*schedule_args, get_hexagon_target("v69"), name="avg_pool2d")
-
-        input_axis_separator = []
-        output_axis_separator = []
-
-        if dtype == "float16":
-            in_data_np = input_np
-            out_data_np = expected_output_np
-        elif dtype in ("uint8", "int8"):
-            in_data_np, _, _ = quantize_np(input_np, dtype)
-            out_data_np, _, _ = quantize_np(expected_output_np, dtype)
-        else:
-            raise RuntimeError(f"Unsupport dtype '{dtype}'")
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=in_data_np,
-            axis_separators=input_axis_separator,
-            mem_scope="global.ddr",
-        )
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            out_data_np.shape,
-            dtype,
-            axis_separators=output_axis_separator,
-            mem_scope="global.ddr",
-        )
-
-        mod = hexagon_session.load_module(func)
-
-        mod(input_arr, output_arr)
-
-        output_np = output_arr.numpy()
-        if dtype == "float16":
-            np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3)
-        else:
-            output_np = output_arr.numpy()
-            np.testing.assert_allclose(output_np, out_data_np, rtol=0, atol=2)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
deleted file mode 100644
index aa1a53c224d5..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Tests for Hexagon slice cast ops """
-import pytest
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-import tvm.topi.hexagon.slice_ops as sl
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-
-
-class TestCastF16F32Slice2d:
-    """
-    For testing Cast F16  to F32 Slice ops
-    """
-
-    input_shape, orig_layout, input_layout, output_layout, axis_sep = tvm.testing.parameters(
-        ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-4h2w32c2w-2d", [4]),
-        ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-4h2w32c2w-2d", [4]),
-        ((1, 1024), "nc", "nc-1024c-2d", "nc-1024c-2d", [2]),
-        ((1, 1024), "nc", "nc-1024c-2d", "nc-512c-2d", [2]),
-    )
-    dtype = tvm.testing.parameter("float16")
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        return np.random.uniform(size=input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, orig_layout, input_layout):
-        return transform_numpy(input_np, orig_layout, input_layout)
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np):
-        ref_np = input_np.astype("float32")
-        return ref_np
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout):
-        return transform_numpy(expected_output_np, orig_layout, output_layout)
-
-    @tvm.testing.requires_hexagon
-    def test_cast_fp16_fp32_slice(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        transformed_input_np,
-        transformed_expected_output_np,
-        axis_sep,
-        hexagon_session,
-        working_scope,
-    ):
-        """
-        Top level testing function for cast fp16 to fp32
-        """
-        if hexagon_session.is_simulator():
-            pytest.skip("Due to https://github.com/apache/tvm/issues/11957")
-
-        cast_input = te.placeholder(input_shape, name="A", dtype=dtype)
-        cast_output = sl.cast_f16_f32_compute(cast_input)
-        cast_func = te.create_prim_func([cast_input, cast_output])
-        tir_s = sl.cast_f16_f32_schedule(cast_func, input_layout, output_layout)
-        input_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        output_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=transformed_expected_output_np.dtype,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            tir_irm = tvm.lower(tir_s.mod, [cast_input, cast_output], name="cast_f16_f32")
-            runtime_module = tvm.build(
-                tir_irm, target=get_hexagon_target("v69"), name="cast_f16_f32"
-            )
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(input_data, output_data)
-        output_np = output_data.numpy()
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_expected_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-class TestCastF32F16Slice2d:
-    """
-    For testing Cast F32 to F16 Slice ops
-    """
-
-    (input_shape, orig_layout, input_layout, output_layout, axis_sep,) = tvm.testing.parameters(
-        ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 16, 12, 64), "nhwc", "nhwc-4h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 64, 64, 32), "nhwc", "nhwc-4h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 1024), "nc", "nc-1024c-2d", "nc-1024c-2d", [2]),
-        ((1, 1024), "nc", "nc-512c-2d", "nc-1024c-2d", [2]),
-    )
-    dtype = tvm.testing.parameter("float32")
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        return np.random.uniform(size=input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, orig_layout, input_layout):
-        return transform_numpy(input_np, orig_layout, input_layout)
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np):
-        ref_np = input_np.astype("float16")
-        return ref_np
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout):
-        return transform_numpy(expected_output_np, orig_layout, output_layout)
-
-    @tvm.testing.requires_hexagon
-    def test_cast_fp32_fp16_slice(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        transformed_input_np,
-        transformed_expected_output_np,
-        axis_sep,
-        hexagon_session,
-        working_scope,
-    ):
-        """
-        Top level testing function for cast fp32 to fp16
-        """
-        if hexagon_session.is_simulator():
-            pytest.skip("Due to https://github.com/apache/tvm/issues/11957")
-
-        cast_input = te.placeholder(input_shape, name="A", dtype=dtype)
-        cast_output = sl.cast_f32_f16_compute(cast_input)
-        cast_func = te.create_prim_func([cast_input, cast_output])
-        tir_s = sl.cast_f32_f16_schedule(cast_func, input_layout, output_layout)
-        input_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        output_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=transformed_expected_output_np.dtype,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            tir_irm = tvm.lower(tir_s.mod, [cast_input, cast_output], name="cast_f32_f16")
-            runtime_module = tvm.build(
-                tir_irm, target=get_hexagon_target("v69"), name="cast_f32_f16"
-            )
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(input_data, output_data)
-        output_np = output_data.numpy()
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_expected_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
deleted file mode 100644
index d3f9804cd6c3..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name
-
-import numpy as np
-
-from tvm import te
-import tvm.testing
-import tvm.topi.hexagon.slice_ops as sl
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-
-input_layout = tvm.testing.parameter(
-    "nhwc-8h2w32c2w-2d",
-)
-
-
-@tvm.testing.fixture
-def input_np(input_shape, dtype):
-    return np.random.random(input_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout):
-    return transform_numpy(expected_output_np, "nhwc", output_layout)
-
-
-@tvm.testing.fixture
-def transformed_input_np(input_np, input_layout):
-    return transform_numpy(input_np, "nhwc", input_layout)
-
-
-class TestClipSlice:
-    input_shape, output_shape, A_min, A_max, output_layout, dtype = tvm.testing.parameters(
-        ([1, 8, 4, 32], [1, 8, 4, 32], 0.1, 0.5, "nhwc-8h2w32c2w-2d", "float16")
-    )
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, A_min, A_max):
-        ref_np = np.clip(input_np, A_min, A_max)
-        return ref_np
-
-    @tvm.testing.requires_hexagon
-    def test_clip_slice(
-        self,
-        input_shape,
-        output_shape,
-        input_np,
-        input_layout,
-        output_layout,
-        dtype,
-        A_min,
-        A_max,
-        transformed_input_np,
-        transformed_expected_output_np,
-        hexagon_session,
-    ):
-        # establish target and input placeholder
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
-
-        # get the compute function and schedule
-        M = sl.clip_compute(A, A_min, A_max)
-
-        # Assume layout is nhwc-8h2w32c2w-2d
-        tir_schedule = sl.clip_schedule(M, A, output_layout, input_layout)
-
-        # build the function
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                tir_schedule.mod,
-                target=get_hexagon_target("v69"),
-                name="clip",
-            )
-
-        # allocate input and output nd arrays
-        axis_separators = [4]
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            dtype=dtype,
-            axis_separators=axis_separators,
-            mem_scope="global.vtcm",
-        )
-
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            transformed_expected_output_np.shape,
-            dtype=dtype,
-            axis_separators=axis_separators,
-            mem_scope="global.vtcm",
-        )
-
-        # execute
-        mod = hexagon_session.load_module(func)
-        mod(input_arr, output_arr)
-
-        # convert output nd array to numpy array
-        output_np = output_arr.numpy()
-        b, h, w, c = output_shape
-        reshaped_output_np = np.reshape(output_np, [b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
-
-        # test results
-        np.testing.assert_allclose(
-            reshaped_output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
deleted file mode 100644
index dcc926addcab..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=line-too-long, redefined-outer-name
-
-"""Test conv2d slice op for hexagon"""
-
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm.topi.hexagon.slice_ops.conv2d import conv2d_compute, conv2d_schedule
-from tvm.topi.testing import conv2d_nhwc_python
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-
-input_layout = tvm.testing.parameter(
-    "nhwc-8h2w32c2w-2d",
-)
-
-output_layout = tvm.testing.parameter(
-    "nhwc-8h2w32c2w-2d",
-)
-
-weights_layout = tvm.testing.parameter("iohw-16i32o2i-1d")
-
-
-@tvm.testing.fixture
-def input_np(in_shape, dtype):
-    return np.random.uniform(size=in_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def weights_np(filt_shape, dtype):
-    return (np.random.uniform(size=filt_shape)).astype(dtype)
-
-
-@tvm.testing.fixture
-def dilated_filt_shape(filt_shape, dilation):
-    """Compute the dilated filter shape when dilation > 1"""
-    filt_height, filt_width, in_channel, out_channel = filt_shape
-    dilation_height, dilation_width = dilation
-    if dilation_height == 1 and dilation_width == 1:
-        return filt_shape
-    dilated_height, dilated_width = (
-        dilation_height * (filt_height - 1) + 1,
-        dilation_width * (filt_width - 1) + 1,
-    )
-    return dilated_height, dilated_width, in_channel, out_channel
-
-
-@tvm.testing.fixture
-def dilated_weights_np(weights_np, dilation, dilated_filt_shape):
-    """Get dilated weights from original weights for testing"""
-    filt_height, filt_width, in_channels, out_channels = weights_np.shape
-    dilation_height, dilation_width = dilation
-    if dilation_height == 1 and dilation_width == 1:
-        return weights_np
-    dilated_height, dilated_width = dilated_filt_shape[0], dilated_filt_shape[1]
-    dilated_weights = np.zeros(dilated_filt_shape, dtype="float16")
-    for in_channel in range(in_channels):
-        for out_channel in range(out_channels):
-            for dilation_i, height_i in zip(
-                range(0, dilated_height, dilation_height), range(filt_height)
-            ):
-                for dilation_j, width_j in zip(
-                    range(0, dilated_width, dilation_width), range(filt_width)
-                ):
-                    dilated_weights[dilation_i, dilation_j, in_channel, out_channel] = weights_np[
-                        height_i, width_j, in_channel, out_channel
-                    ]
-
-    return dilated_weights
-
-
-@tvm.testing.fixture
-def input_np_padded(input_np, in_shape, padded_in_shape):
-    pad_height = padded_in_shape[1] - in_shape[1]
-    pad_width = padded_in_shape[2] - in_shape[2]
-    pad_channel = padded_in_shape[3] - in_shape[3]
-    input_padded = np.pad(
-        input_np, ((0, 0), (0, pad_height), (0, pad_width), (0, pad_channel)), "constant"
-    )
-    return input_padded
-
-
-@tvm.testing.fixture
-def padded_filt_shape(filt_shape):
-    filt_height, filt_width, in_channels, out_channels = filt_shape
-    in_channels = ((in_channels + 31) // 32) * 32
-    out_channels = ((out_channels + 31) // 32) * 32
-    return filt_height, filt_width, in_channels, out_channels
-
-
-@tvm.testing.fixture
-def weights_np_padded(weights_np, filt_shape, padded_filt_shape):
-    pad_in_channels = padded_filt_shape[2] - filt_shape[2]
-    pad_out_channels = padded_filt_shape[3] - filt_shape[3]
-    filt_padded = np.pad(
-        weights_np, ((0, 0), (0, 0), (0, pad_in_channels), (0, pad_out_channels)), "constant"
-    )
-    return filt_padded
-
-
-@tvm.testing.fixture
-def weights_np_transformed(weights_np_padded):
-    height, width, in_channel, out_channel = weights_np_padded.shape
-    weights_np_reverse_width = weights_np_padded[:, ::-1, :, :]
-    transformed_weights_np = weights_np_reverse_width.reshape(
-        [height, width, in_channel // 32, 16, 2, out_channel // 32, 32]
-    ).transpose(2, 5, 0, 1, 3, 6, 4)
-    return transformed_weights_np
-
-
-def generate_test_config(test_params):
-    """Utility function to generate test config with meaningful ids"""
-    test_config = {}
-
-    dims = lambda vals: "x".join(map(str, vals))
-
-    for param in test_params:
-        in_shape, filt_shape, stride, dilation = param
-        test_name = f"nhwc{dims(in_shape)}-hwio{dims(filt_shape)}-stride{dims(stride)}-dilation{dims(dilation)}"
-        test_config[test_name] = param
-
-    return test_config
-
-
-class TestConv2dSlice:
-    """Test class that defines the conv2d slice test"""
-
-    test_params = [
-        [
-            (1, 10, 6, 32),
-            (3, 3, 32, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 18, 10, 32),
-            (3, 3, 32, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 10, 6, 64),
-            (3, 3, 64, 64),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 12, 8, 4),
-            (3, 3, 4, 32),
-            (1, 1),
-            (2, 2),
-        ],
-        [
-            (1, 12, 8, 32),
-            (5, 5, 32, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 16, 12, 32),
-            (5, 5, 32, 32),
-            (1, 1),
-            (2, 2),
-        ],
-        [
-            (1, 13, 9, 32),
-            (6, 6, 32, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 18, 10, 32),
-            (3, 3, 32, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 20, 12, 32),
-            (5, 5, 32, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 22, 14, 32),
-            (7, 7, 32, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 28, 20, 32),
-            (7, 7, 32, 32),
-            (2, 2),
-            (2, 2),
-        ],
-        [
-            (1, 10, 4, 4),
-            (3, 1, 4, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 18, 8, 4),
-            (3, 1, 4, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 20, 8, 4),
-            (3, 1, 4, 32),
-            (2, 2),
-            (2, 2),
-        ],
-    ]
-
-    test_config = generate_test_config(test_params)
-
-    in_shape, filt_shape, stride, dilation = tvm.testing.parameters(
-        *test_config.values(), ids=test_config.keys()
-    )
-    dtype = tvm.testing.parameter("float16")
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def padded_in_shape(self, in_shape):
-        in_batch, in_height, in_width, in_channel = in_shape
-        in_height = ((in_height + 7) // 8) * 8
-        in_width = ((in_width + 3) // 4) * 4
-        in_channel = ((in_channel + 31) // 32) * 32
-        return in_batch, in_height, in_width, in_channel
-
-    @tvm.testing.fixture
-    def out_shape(self, in_shape, dilated_filt_shape, stride):
-        in_batch, in_height, in_width, _ = in_shape
-        filt_height, filt_width, _, num_filt = dilated_filt_shape
-        out_height = (in_height - filt_height) // stride[0] + 1
-        out_width = (in_width - filt_width) // stride[1] + 1
-        out_channel = num_filt
-        return in_batch, out_height, out_width, out_channel
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, dilated_weights_np, stride):
-        ref_np = conv2d_nhwc_python(
-            input_np.astype("float32"), dilated_weights_np.astype("float32"), stride, padding=0
-        ).astype("float16")
-        return ref_np
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d(
-        self,
-        padded_in_shape,
-        padded_filt_shape,
-        stride,
-        dilation,
-        dtype,
-        out_shape,
-        input_layout,
-        weights_layout,
-        output_layout,
-        input_np_padded,
-        weights_np_transformed,
-        expected_output_np,
-        working_scope,
-        hexagon_session,
-    ):
-        """Main test function that tests the conv2d slice op"""
-        input_tensor = tvm.te.placeholder(padded_in_shape, name="InputTensor", dtype=dtype)
-        weights = tvm.te.placeholder(padded_filt_shape, name="Weights", dtype=dtype)
-        output_name = "output"
-
-        output_tensor = conv2d_compute(
-            input_tensor, weights, out_shape, stride, dilation, dtype, output_name
-        )
-
-        tir_schedule = conv2d_schedule(
-            output_tensor,
-            [input_tensor, weights],
-            input_layout,
-            weights_layout,
-            output_layout,
-            output_name,
-        )
-
-        func_name = f"fconv2d_{dtype}"
-        with tvm.transform.PassContext(opt_level=3):
-            runtime_module = tvm.build(
-                tir_schedule.mod,
-                target=get_hexagon_target("v69"),
-                name=func_name,
-            )
-
-        input_np_transformed = transform_numpy(input_np_padded, "nhwc", input_layout)
-        output_np_transformed = transform_numpy(expected_output_np, "nhwc", output_layout)
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=input_np_transformed,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        weights_arr = allocate_hexagon_array(
-            hexagon_session.device, data=weights_np_transformed, mem_scope=working_scope
-        )
-
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=output_np_transformed.shape,
-            dtype=output_np_transformed.dtype,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        mod = hexagon_session.load_module(runtime_module)
-        mod(input_arr, weights_arr, output_arr)
-        output_np = output_arr.numpy()
-        np.testing.assert_allclose(output_np, output_np_transformed, atol=1.0, rtol=0.05)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py
deleted file mode 100644
index e616c384fb40..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import numpy as np
-
-from tvm import te, topi
-
-import tvm.testing
-from tvm.topi import testing
-from tvm.contrib.hexagon.build import HexagonLauncher
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.hexagon.qnn as qnn
-import tvm.topi.hexagon.slice_ops as sl
-from ...infrastructure import transform_numpy, quantize_np
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-
-@tvm.testing.fixture
-def input_np(input_shape, dtype):
-    if "int" in dtype:
-        data = np.random.random(input_shape).astype("float32")
-    elif "float" in dtype:
-        data = np.random.random(input_shape).astype(dtype)
-    return data
-
-
-@tvm.testing.fixture
-def weight_np(weight_shape, dtype):
-    if "int" in dtype:
-        weight = np.random.random(weight_shape).astype("float32")
-    elif "float" in dtype:
-        weight = np.random.random(weight_shape).astype(dtype)
-    return weight
-
-
-@tvm.testing.fixture
-def input_quant(input_np, dtype):
-    if "float" in dtype:
-        return None
-    quant, scale, zp = quantize_np(input_np, dtype)
-    return {"zero": zp, "scale": scale, "data": quant}
-
-
-@tvm.testing.fixture
-def weight_quant(weight_np, dtype):
-    if "float" in dtype:
-        return None
-    quant, scale, zp = quantize_np(weight_np, "int8")
-    return {"zero": zp, "scale": scale, "data": quant}
-
-
-@tvm.testing.fixture
-def bias_np(bias_shape, bias, dtype):
-    if bias:
-        if "int" in dtype:
-            data = np.random.randint(-128, 127, size=bias_shape).astype("int32")
-        elif "float" in dtype:
-            data = np.random.random(bias_shape).astype(dtype)
-        return data
-    else:
-        return None
-
-
-@tvm.testing.fixture
-def quant_arr(input_quant, weight_quant):
-    if input_quant is None:
-        return None
-    arr = np.empty((6,), dtype="float32")
-    arr[0] = input_quant["zero"]
-    arr[1] = input_quant["scale"]
-    arr[2] = weight_quant["zero"]
-    arr[3] = weight_quant["scale"]
-    return arr
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, layout):
-    return transform_numpy(expected_output_np, "nc", layout)
-
-
-@tvm.testing.fixture
-def transformed_input_np(input_np, layout):
-    return transform_numpy(input_np, "nc", layout)
-
-
-@tvm.testing.fixture
-def transformed_input_quant(input_quant, layout):
-    if input_quant is None:
-        return None
-    input_quant["data"] = transform_numpy(input_quant["data"], "nc", layout)
-    return input_quant
-
-
-class TestDenseSlice:
-    (input_shape, output_shape, layout, bias, dtype,) = tvm.testing.parameters(
-        (  # Float 16
-            [1, 1024],
-            [1, 1024],
-            "nc-1024c-2d",
-            False,
-            "float16",
-        ),
-        (
-            [1, 2048],
-            [1, 2048],
-            "nc-1024c-2d",
-            True,
-            "float16",
-        ),
-        (  # Uint 8
-            [1, 2048],
-            [1, 2048],
-            "nc-2048c-2d",
-            False,
-            "uint8",
-        ),
-        (
-            [1, 4096],
-            [1, 4096],
-            "nc-2048c-2d",
-            True,
-            "uint8",
-        ),
-    )
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, weight_np, bias_np, bias):
-        ref_np = tvm.topi.testing.dense(
-            np.reshape(input_np, (input_np.shape[0], input_np.shape[-1])),
-            weight_np.T,  # Function expects [in_dim, out_dim]
-            bias_np,
-            use_bias=bias,
-            out_dtype="float32" if "int" in str(input_np.dtype) else input_np.dtype,
-        )
-        return ref_np
-
-    @tvm.testing.fixture
-    def weight_shape(self, input_shape, output_shape):
-        return (output_shape[-1], input_shape[-1])
-
-    @tvm.testing.fixture
-    def bias_shape(self, output_shape):
-        return (output_shape[-1],)
-
-    @tvm.testing.requires_hexagon
-    def test_dense_slice(
-        self,
-        dtype,
-        bias_np,
-        layout,
-        output_shape,
-        input_shape,
-        input_np,
-        input_quant,
-        transformed_input_np,
-        transformed_input_quant,
-        weight_np,
-        # transformed_weight_np,
-        weight_quant,
-        # transformed_weight_quant,
-        transformed_expected_output_np,
-        expected_output_np,
-        quant_arr,
-        hexagon_session: Session,
-    ):
-
-        target_hexagon = tvm.target.hexagon("v69")
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
-        W = te.placeholder(
-            (output_shape[-1], input_shape[-1]),
-            name="W",
-            dtype="int8" if dtype == "uint8" else dtype,
-        )
-        args = [A, W]
-        tensors = [A, W]
-
-        # If quantized, append the quantization params
-        if "int" in dtype:
-            args.append(quant_arr[0].astype("int32"))
-            args.append(quant_arr[1])
-            args.append(quant_arr[2].astype("int32"))
-            args.append(quant_arr[3])
-
-        if bias_np is not None:
-            B = te.placeholder((output_shape[-1],), name="B", dtype=str(bias_np.dtype))
-            args.append(B)
-            tensors.append(B)
-        else:
-            B = None
-
-        # Different compute and schedule for quant and float
-        if "float" in dtype:
-            M = sl.dense_compute(*args)
-            tir_schedule = sl.dense_schedule([M], tensors, layout, layout)
-        elif "int" in dtype:
-            M = qnn.qdense_compute(*args, bias=B)
-            tir_schedule = qnn.qdense_schedule([M], tensors, layout, layout)
-        else:
-            print("Unsupported dtype {}".format(dtype))
-            exit(-1)
-
-        sch = tir_schedule.mod
-
-        input_axis_separator = [2]
-        output_axis_separator = [2]
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                args,
-                target=tvm.target.Target(target_hexagon, host=target_hexagon),
-                name="dense",
-            )
-            func.save("dense.s" if bias_np is None else "dense_bias.s")
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np if "float" in dtype else transformed_input_quant["data"],
-            axis_separators=input_axis_separator,
-            mem_scope="global.vtcm",
-        )
-        weight_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=weight_np if "float" in dtype else weight_quant["data"],
-            axis_separators=None,
-            mem_scope="global",
-        )
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            transformed_expected_output_np.shape,
-            "float32" if "int" in dtype else dtype,
-            axis_separators=output_axis_separator,
-            mem_scope="global.vtcm",
-        )
-        arrs = [input_arr, weight_arr]
-
-        if bias_np is not None:
-            bias_arr = allocate_hexagon_array(
-                hexagon_session.device,
-                data=bias_np,
-                axis_separators=None,
-                mem_scope="global.vtcm",
-            )
-            arrs.append(bias_arr)
-
-        arrs.append(output_arr)
-
-        mod = hexagon_session.load_module(func)
-        mod(*arrs)
-
-        # Reshape for comparison
-        b, c = output_shape
-        if layout == "nc-1024c-2d":
-            output_np = output_arr.numpy().reshape([b, c // 1024, 1024])
-        elif layout == "nc-2048c-2d":
-            output_np = output_arr.numpy().reshape([b, c // 2048, 2048])
-        else:
-            raise RuntimeError(f"Unexpected layout '{layout}'")
-
-        if "int" in dtype:
-            np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-2, atol=0)
-        elif "float" in dtype:
-            np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-1, atol=0)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
deleted file mode 100644
index e5a22e8879b5..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, disable=line-too-long, redefined-outer-name
-
-"""Test depthwise_conv2d slice op for hexagon."""
-
-import numpy as np
-
-import tvm
-import tvm.testing
-import tvm.topi.hexagon.qnn as qn
-from tvm.topi.testing import depthwise_conv2d_python_nhwc
-from tvm.topi.hexagon.slice_ops.dwconv2d import dwconv2d_compute, dwconv2d_schedule
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, quantize_np
-
-
-@tvm.testing.fixture
-def input_np(in_shape, dtype, low, high):
-    if dtype in ("uint8"):
-        return np.random.uniform(low=low, high=high, size=in_shape).astype("float32")
-    if dtype in ("int8"):
-        return np.random.uniform(low=-low, high=high, size=in_shape).astype("float32")
-    return np.random.uniform(size=in_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def input_np_padded(input_np, in_shape, padded_in_shape):
-    pad_height = padded_in_shape[1] - in_shape[1]
-    pad_width = padded_in_shape[2] - in_shape[2]
-    pad_channel = padded_in_shape[3] - in_shape[3]
-    input_padded = np.pad(
-        input_np, ((0, 0), (0, pad_height), (0, pad_width), (0, pad_channel)), "constant"
-    )
-    return input_padded
-
-
-@tvm.testing.fixture
-def in_out_layout(dtype):
-    if dtype == "float16":
-        return "nhwc-8h2w32c2w-2d"
-    elif dtype in ("uint8", "int8"):
-        return "nhwc-8h8w32c-2d"
-    else:
-        raise RuntimeError(f"Unsupported quantized data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def expected_output_np(input_np, dilated_weights_np, stride, dtype):
-    dilated_weights_np_t = dilated_weights_np.transpose(0, 1, 3, 2)
-    ref_type = dtype
-    if dtype in ("uint8", "int8"):
-        # for quantized versions, return float32 output
-        ref_type = "float32"
-    ref_np = depthwise_conv2d_python_nhwc(
-        input_np.astype("float32"), dilated_weights_np_t.astype("float32"), stride, padding=0
-    ).astype(ref_type)
-    return ref_np
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, in_out_layout, dtype):
-    if dtype == "float16":
-        return transform_numpy(expected_output_np, "nhwc", in_out_layout)
-    elif dtype in ("uint8", "int8"):
-        quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype)
-        return [transform_numpy(quant_arr, "nhwc", in_out_layout), scale, zero_point]
-    else:
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def transformed_input_np_padded(input_np_padded, in_out_layout, dtype):
-    if dtype == "float16":
-        return transform_numpy(input_np_padded, "nhwc", in_out_layout)
-    if dtype in ("uint8", "int8"):
-        quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype)
-        return [transform_numpy(quant_arr, "nhwc", in_out_layout), scale, zero_point]
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def weights_np(filt_shape, dtype):
-    if dtype == "float16":
-        return np.random.uniform(size=filt_shape).astype(dtype)
-    elif dtype in ("uint8", "int8"):
-        weight_arr = np.random.uniform(low=-5, high=5, size=filt_shape).astype("float32")
-        return weight_arr
-    else:
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def dilated_filt_shape(filt_shape, dilation):
-    """Compute the dilated filter shape when dilation > 1"""
-    filt_height, filt_width, in_channel, out_channel = filt_shape
-    dilation_height, dilation_width = dilation
-    if dilation_height == 1 and dilation_width == 1:
-        return filt_shape
-    dilated_height = dilation_height * (filt_height - 1) + 1
-    dilated_width = dilation_width * (filt_width - 1) + 1
-    return dilated_height, dilated_width, in_channel, out_channel
-
-
-@tvm.testing.fixture
-def dilated_weights_np(weights_np, dilation, dilated_filt_shape, dtype):
-    """Get dilated weights from original weights for testing"""
-    if dtype in ["int8", "uint8"]:
-        dtype = "float32"
-    filt_height, filt_width, in_channels, out_channels = weights_np.shape
-    dilated_weights = np.zeros(dilated_filt_shape)
-    dilation_height, dilation_width = dilation
-    if dilation_height == 1 and dilation_width == 1:
-        return weights_np
-    dilated_height, dilated_width = dilated_filt_shape[0], dilated_filt_shape[1]
-    for in_channel in range(in_channels):
-        for out_channel in range(out_channels):
-            for dilation_i, height_i in zip(
-                range(0, dilated_height, dilation_height), range(filt_height)
-            ):
-                for dilation_j, width_j in zip(
-                    range(0, dilated_width, dilation_width), range(filt_width)
-                ):
-                    dilated_weights[dilation_i, dilation_j, in_channel, out_channel] = weights_np[
-                        height_i, width_j, in_channel, out_channel
-                    ]
-    return dilated_weights
-
-
-@tvm.testing.fixture
-def transformed_weights_np(weights_np, dtype):
-    height, width, in_channel, out_channel = weights_np.shape
-    t = weights_np.reshape([height, width, in_channel, out_channel // 32, 32]).transpose(
-        3, 0, 1, 2, 4
-    )
-    if dtype == "float16":
-        return t
-    if dtype in ("uint8", "int8"):
-        quant_arr, scale, zero_point = quantize_np(t, dtype)
-        return [quant_arr, scale, zero_point]
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-def generate_test_config(test_params):
-    """Utility function to generate test config with meaningful ids"""
-    test_config = {}
-
-    dims = lambda vals: "x".join(map(str, vals))
-
-    for param in test_params:
-        in_shape, filt_shape, stride, dilation = param[:4]
-        test_name = f"nhwc{dims(in_shape)}-hwio{dims(filt_shape)}-stride{dims(stride)}-dilation{dims(dilation)}"
-        test_config[test_name] = param
-
-    return test_config
-
-
-class Testdwconv2dSlice:
-    """Test class that defines the dwconv2d slice test"""
-
-    test_params = [
-        [(1, 10, 10, 32), (3, 3, 1, 32), (1, 1), (1, 1), 0.0, 10.0],
-        [(1, 10, 10, 64), (3, 3, 1, 64), (1, 1), (1, 1), 0.0, 10.0],
-        [(1, 12, 12, 32), (5, 5, 1, 32), (1, 1), (1, 1), 0.0, 20.0],
-        [(1, 16, 16, 32), (5, 5, 1, 32), (1, 1), (2, 2), 0.0, 1.0],
-        [(1, 18, 10, 32), (3, 3, 1, 32), (1, 1), (1, 1), 0.0, 10.0],
-        [(1, 18, 18, 32), (3, 3, 1, 32), (2, 2), (1, 1), 0.0, 10.0],
-        [(1, 18, 10, 96), (3, 3, 1, 96), (1, 1), (1, 1), 0.0, 10.0],
-        [(1, 21, 21, 32), (7, 7, 1, 32), (2, 2), (1, 1), 0.0, 10.0],
-        [(1, 28, 28, 32), (7, 7, 1, 32), (2, 2), (2, 2), 0.0, 10.0],
-        [(1, 28, 28, 96), (7, 7, 1, 96), (2, 2), (2, 2), 0.0, 10.0],
-        [(1, 10, 16, 32), (3, 1, 1, 32), (1, 1), (1, 1), 0.0, 10.0],
-    ]
-
-    test_config = generate_test_config(test_params)
-
-    in_shape, filt_shape, stride, dilation, low, high = tvm.testing.parameters(
-        *test_config.values(), ids=test_config.keys()
-    )
-    dtype = tvm.testing.parameter("float16", "uint8")
-    working_scope = tvm.testing.parameter("global.vtcm")
-    weights_layout = tvm.testing.parameter("ohwi32o-1d")
-
-    @tvm.testing.fixture
-    def padded_in_shape(self, in_shape, dtype):
-        """Padding the input shape according to layout"""
-        # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and
-        # for int8/uint8, it's nhwc-8h8w32c-2d.
-        # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple
-        # of 8. However, the width should be a multiple of 4 for the first case and 8 for
-        # the second case.
-        in_batch, in_height, in_width, in_channel = in_shape
-        in_height = ((in_height + 7) // 8) * 8
-
-        if dtype == "float16":
-            in_width = ((in_width + 3) // 4) * 4
-        elif dtype in ("uint8", "int8"):
-            in_width = ((in_width + 7) // 8) * 8
-
-        in_channel = ((in_channel + 31) // 32) * 32
-
-        return in_batch, in_height, in_width, in_channel
-
-    @tvm.testing.fixture
-    def out_shape(self, in_shape, dilated_filt_shape, stride):
-        in_batch, in_height, in_width, _ = in_shape
-        filt_height, filt_width, _, num_filt = dilated_filt_shape
-        out_height = (in_height - filt_height) // stride[0] + 1
-        out_width = (in_width - filt_width) // stride[1] + 1
-        out_channel = num_filt
-        return in_batch, out_height, out_width, out_channel
-
-    @tvm.testing.requires_hexagon
-    def test_dwconv2d(
-        self,
-        dtype,
-        in_out_layout,
-        weights_layout,
-        padded_in_shape,
-        weights_np,
-        filt_shape,
-        stride,
-        dilation,
-        out_shape,
-        input_np,
-        input_np_padded,
-        transformed_weights_np,
-        expected_output_np,
-        target,
-        working_scope,
-        transformed_input_np_padded,
-        transformed_expected_output_np,
-        hexagon_session,
-    ):
-        """Main test function that tests the dwconv2d slice op"""
-        input_tensor = tvm.te.placeholder(padded_in_shape, name="InputTensor", dtype=dtype)
-        weights = tvm.te.placeholder(filt_shape, name="Weights", dtype=dtype)
-
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-        # Construct compute and schedule based on dtype
-        if dtype in ("uint8", "int8"):
-            in_data_np, activation_scale, activation_zero_point = transformed_input_np_padded
-            (
-                weights_data_np,
-                weight_scale,
-                weight_zero_point,
-            ) = transformed_weights_np
-            out_data_np, output_scale, output_zero_point = transformed_expected_output_np
-
-            output_tensor = qn.qdepthwise_conv2d_compute(
-                input_tensor,
-                weights,
-                out_shape,
-                stride,
-                dilation,
-                dtype,
-                activation_zero_point,
-                activation_scale,
-                weight_zero_point,
-                weight_scale,
-                output_zero_point,
-                output_scale,
-            )
-
-            tir_schedule = qn.qdepthwise_conv2d_schedule(
-                output_tensor, [input_tensor, weights], in_out_layout, weights_layout
-            )
-
-        elif dtype == "float16":
-            in_data_np = transformed_input_np_padded
-            out_data_np = transformed_expected_output_np
-            weights_data_np = transformed_weights_np
-            output_tensor = dwconv2d_compute(
-                input_tensor, weights, out_shape, stride, dilation, dtype
-            )
-
-            tir_schedule = dwconv2d_schedule(
-                output_tensor, [input_tensor, weights], in_out_layout, weights_layout
-            )
-        else:
-            raise RuntimeError(f"Unsupport dtype '{dtype}'")
-
-        func_name = "depthwise_conv2d_slice"
-        with tvm.transform.PassContext(opt_level=3):
-            runtime_module = tvm.build(
-                tir_schedule.mod,
-                [input_tensor, output_tensor],
-                target=target,
-                name=func_name,
-            )
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=in_data_np,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        weights_arr = allocate_hexagon_array(
-            hexagon_session.device, data=weights_data_np, mem_scope=working_scope
-        )
-
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            out_data_np.shape,
-            dtype=dtype,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        mod = hexagon_session.load_module(runtime_module)
-        mod(input_arr, weights_arr, output_arr)
-        n, h, w, c = out_shape
-
-        if dtype in ("uint8", "int8"):
-            output_np = output_arr.numpy().reshape([n, h // 8, w // 8, c // 32, 8, 8, 32])
-            np.testing.assert_allclose(output_np, out_data_np, atol=3, rtol=0.02)
-        elif dtype == "float16":
-            output_np = output_arr.numpy()
-            np.testing.assert_allclose(output_np, out_data_np, atol=0.01, rtol=0.01)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
deleted file mode 100644
index 8b9f49458df2..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-""" Tests for Hexagon dequantize """
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.topi.hexagon import qnn
-from tvm.contrib.hexagon import allocate_hexagon_array
-from ...infrastructure import (
-    transform_numpy,
-    quantize_np,
-    get_hexagon_target,
-)
-
-
-class TestDequantizeSlice2d:
-    """
-    For testing Dequantize Slice ops
-    """
-
-    input_shape, orig_layout, input_layout, output_layout, axis_sep, dtype = tvm.testing.parameters(
-        ((1, 16, 64, 128), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "int8"),
-        ((1, 16, 64, 128), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "uint8"),
-        ((1, 8, 8, 32), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "int8"),
-        ((1, 8, 8, 32), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "uint8"),
-        ((1, 2048), "nc", "nc-2048c-2d", "nc-512c-2d", [2], "int8"),
-        ((1, 2048), "nc", "nc-2048c-2d", "nc-512c-2d", [2], "uint8"),
-    )
-
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape):
-        arr_np = np.random.random(size=input_shape).astype("float32")
-        return arr_np
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, orig_layout, input_layout, dtype):
-        quant_arr, scale, zero_point = quantize_np(input_np, dtype)
-        return [transform_numpy(quant_arr, orig_layout, input_layout), scale, zero_point]
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, dtype):
-        quant_np, scale, zero_point = quantize_np(input_np, dtype)
-        ref_np = (scale * (quant_np.astype("int32") - zero_point)).astype("float32")
-        return ref_np
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout):
-        return transform_numpy(expected_output_np, orig_layout, output_layout)
-
-    @tvm.testing.requires_hexagon
-    def test_dequant_qnn(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        transformed_input_np,
-        transformed_expected_output_np,
-        axis_sep,
-        hexagon_session,
-        working_scope,
-    ):
-        """
-        Top level testing function for dequantize
-        """
-
-        dequant_input = te.placeholder(input_shape, name="A", dtype=dtype)
-
-        in_data_np, in_scale, in_zero_pt = transformed_input_np
-
-        dequant_output = qnn.dequantize_compute(dequant_input, in_scale, in_zero_pt)
-
-        tir_s = qnn.dequantize_schedule(dequant_input, dequant_output, input_layout, output_layout)
-
-        input_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=in_data_np,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        output_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=transformed_expected_output_np.dtype,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            tir_irm = tvm.lower(tir_s.mod, [dequant_input, dequant_output], name="dequantize")
-            runtime_module = tvm.build(tir_irm, target=get_hexagon_target("v69"), name="dequantize")
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(input_data, output_data)
-        output_np = output_data.numpy()
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_expected_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py
deleted file mode 100644
index 7cde83e0cb77..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test code for float16 and uint8 global_avg_pool2d."""
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.topi.testing import adaptive_pool
-import tvm.topi.hexagon.qnn as qn
-import tvm.topi.hexagon.slice_ops as sl
-from tvm.contrib.hexagon import allocate_hexagon_array
-from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target
-
-
-SCALE_M_VAL = None
-ZERO_POINT_M_VAL = None
-SCALE_VAL = None
-ZERO_POINT_VAL = None
-
-
-class TestGlobalPool2D:
-    (input_shape,) = tvm.testing.parameters(
-        ([1, 32, 8, 8],),
-        ([1, 1056, 16, 16],),
-    )
-
-    # Fixed chunk layout is set as nchw-32c8h8w-2d for uint8 and nchw-32c8h4w-2d for float16.
-    # For optimization, it might get changed later.
-    # Since output shape will be NxCx1x1 which is not a
-    # multiple of fixed-chunk, output_layout is NCHW.
-    input_layout, output_layout, pool_type, layout, dtype = tvm.testing.parameters(
-        ("nchw-32c8h8w-2d", "nchw", "avg", "NCHW", "uint8"),
-        ("nchw-32c8h4w-2d", "nchw", "avg", "NCHW", "float16"),
-    )
-
-    @tvm.testing.fixture
-    def expected_output_np(
-        self,
-        input_np,
-        pool_type,
-        layout,
-    ):
-        """Generate expected output."""
-        ref_np = tvm.topi.testing.adaptive_pool(
-            input_np,
-            (1, 1),
-            pool_type,
-            layout,
-        )
-        return ref_np
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        if dtype in ("uint8", "int8"):
-            dtype = "float32"
-        return np.random.random(input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def quantize_input_np(self, input_np, dtype):
-        if dtype in ("uint8", "int8"):
-            global ZERO_POINT_VAL, SCALE_VAL
-            input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype)
-            return input_np_quantized
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, quantize_input_np, input_layout, layout, dtype):
-        if dtype == "float16":
-            return transform_numpy(input_np, layout.lower(), input_layout)
-        if dtype in ("uint8", "int8"):
-            return transform_numpy(quantize_input_np, layout.lower(), input_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def quantize_expected_output_np(self, expected_output_np, dtype):
-        if dtype in ("uint8", "int8"):
-            global ZERO_POINT_M_VAL, SCALE_M_VAL
-            out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np(
-                expected_output_np, dtype
-            )
-
-            # Since output_layout is nchw, no transformation is needed.
-            return out_ref_quantized
-
-    @tvm.testing.requires_hexagon
-    def test_global_pool2d(
-        self,
-        dtype,
-        input_shape,
-        input_layout,
-        transformed_input_np,
-        expected_output_np,
-        quantize_expected_output_np,
-        hexagon_session,
-    ):
-        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype)
-
-        if dtype == "float16":
-            m_tensor = sl.global_avg_pool2d(a_tensor)
-            tir_schedule = sl.stir_global_avg_pool2d_schedule(m_tensor, a_tensor, input_layout)
-        elif dtype in ["uint8", "int8"]:
-            m_tensor = qn.global_avg_pool2d_u8(
-                a_tensor,
-                dtype,
-                ZERO_POINT_VAL,
-                SCALE_VAL,
-                ZERO_POINT_M_VAL,
-                SCALE_M_VAL,
-            )
-            tir_schedule = qn.stir_global_avg_pool2d_u8_schedule(m_tensor, a_tensor, input_layout)
-
-        sch = tir_schedule.mod
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                [a_tensor, m_tensor],
-                get_hexagon_target("v69"),
-                name="global_pool2d",
-            )
-
-        input_axis_separator = [4]
-
-        a_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            dtype=dtype,
-            axis_separators=input_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        m_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            expected_output_np.shape,
-            dtype=dtype,
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(a_data_nd, m_data_nd)
-
-        # Convert nd to np
-        m_data_np = m_data_nd.numpy()
-
-        if dtype == "float16":
-            np.testing.assert_allclose(expected_output_np, m_data_np, rtol=1e-3, atol=1e-3)
-        elif dtype in ["int8", "uint8"]:
-            np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=1)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
deleted file mode 100644
index 4cd92f4dd27d..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-from typing import *
-
-from tvm import te
-import tvm.testing
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.hexagon.slice_ops as sl
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-from ...pytest_util import (
-    get_multitest_ids,
-    create_populated_numpy_ndarray,
-    TensorContentRandom,
-)
-
-
-@tvm.testing.fixture
-def input_np(input_shape, dtype: str, input_tensor_populator):
-    return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator)
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout):
-    return transform_numpy(expected_output_np, "nhwc", output_layout)
-
-
-@tvm.testing.fixture
-def transformed_input_np_padded(input_np_padded, input_layout):
-    return transform_numpy(input_np_padded, "nhwc", input_layout)
-
-
-(input_layout, dtype) = tvm.testing.parameters(
-    ("nhwc-8h2w32c2w-2d", "float16"),
-    ("nhwc-8h8w32c-2d", "uint8"),
-)
-
-
-@tvm.testing.fixture
-def output_layout(output_shape, dtype):
-    o_b, o_h, o_w, o_c = output_shape
-    if dtype == "float16":
-        if o_h == 1 and o_w == 1:
-            return "n11c-1024c-2d"
-        else:
-            assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape"
-            return "nhwc-8h2w32c2w-2d"
-    elif dtype == "int8" or "uint8":
-        if o_h == 1 and o_w == 1:
-            return "n11c-2048c-2d"
-        else:
-            assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape"
-            return "nhwc-8h8w32c-2d"
-    else:
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-class TestmaxPool2dSlice:
-    _param_descs = [
-        "out_shape",  # output_shape
-        "kernel",  # kernel
-        "stride",  # stride
-        "dil",  # dilation
-        "pad",  # padding
-        "ceil",  # ceil_mode
-        "cnt_padded",  # count_include_pad
-        None,  # input_tensor_populator
-    ]
-
-    _multitest_params = [
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 16, 16, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 8, 8, 32],
-            [8, 8],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        # Test non-one stride and dilation
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [2, 3],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [2, 2],
-            [2, 2],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [2, 2],
-            [2, 3],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        # Test non-zero padding
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [1, 1, 1, 1],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 8, 8, 32],
-            [3, 3],
-            [1, 1],
-            [1, 1],
-            [1, 2, 3, 4],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        # Test n11c-1024c-2d layout which will require input and output to have different layout
-        (
-            [1, 1, 1, 2048],
-            [8, 8],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [6, 6],
-            [1, 1],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [3, 3],
-            [2, 2],
-            [1, 1],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-        (
-            [1, 1, 1, 2048],
-            [4, 4],
-            [2, 2],
-            [2, 3],
-            [0, 0, 0, 0],
-            False,
-            True,
-            TensorContentRandom(),
-        ),
-    ]
-
-    _param_ids = get_multitest_ids(_multitest_params, _param_descs)
-
-    # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" for float16
-    # and "nhwc-8h8w32c-2d" for uint8
-    (
-        output_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        ceil_mode,
-        count_include_pad,
-        input_tensor_populator,
-    ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids)
-
-    @tvm.testing.fixture
-    def expected_output_np(
-        self,
-        input_np,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        ceil_mode,
-        count_include_pad,
-    ):
-        pad_before = padding[:2]
-        pad_after = padding[2:]
-        ref_np = tvm.topi.testing.poolnd_python(
-            input_np,
-            kernel,
-            stride,
-            dilation,
-            pad_before,
-            pad_after,
-            "max",  # pool_type
-            count_include_pad,
-            False,  # ceil_mode,
-            layout="NHWC",
-        )
-        return ref_np
-
-    @tvm.testing.fixture
-    def input_shape(self, output_shape, kernel, padding, stride, dilation, output_layout):
-        # Input shape without any padding; 'ceil' is being ignored from calculation:
-        o_b, o_h, o_w, o_c = output_shape
-        d_h, d_w = dilation
-        s_h, s_w = stride
-        k_h, k_w = kernel
-        pad_before_h, pad_before_w = padding[:2]
-        pad_after_h, pad_after_w = padding[2:]
-
-        if output_layout == "n11c-1024c-2d":
-            assert (
-                pad_before_w == 0 and pad_after_w == 0 and pad_before_h == 0 and pad_after_h == 0
-            ), "Padding must be zero for n11c-1024c-2d layout"
-            assert o_h == 1 and o_w == 1, "Output height and width must be 1"
-
-        in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h
-        in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w
-
-        return [o_b, in_h, in_w, o_c]
-
-    @tvm.testing.fixture
-    def input_shape_padded(self, dtype, input_shape, padding, output_layout):
-        # Input shape is adjusted to account for 'padding'. Also, due to the physical
-        # layout of the buffer, height and width are adjusted so that they are a
-        # multiple of 8 and 4 respectively.
-        # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and
-        # for int8/uint8, it's nhwc-8h8w32c-2d.
-        # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple
-        # of 8. However, the width should be a multiple of 4 for the first case and 8 for
-        # the second case.
-
-        height_mult = 8
-        if dtype == "float16":
-            width_mult = 4  # input layout : nhwc-8h2w32c2w-2d
-        elif dtype in ("uint8", "int8"):
-            width_mult = 8  # input layout : nhwc-8h8w32c-2d
-        else:
-            raise RuntimeError(f"Unsupport dtype '{dtype}'")
-
-        pad_before_h, pad_before_w = padding[:2]
-        pad_after_h, pad_after_w = padding[2:]
-        padded_input_height = (
-            (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult
-        ) * height_mult
-        padded_input_width = (
-            (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult
-        ) * width_mult
-        return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]]
-
-    @tvm.testing.fixture
-    def input_np_padded(self, input_np, input_shape, input_shape_padded, padding):
-        pad_before_h, pad_before_w = padding[:2]
-        pad_after_h = input_shape_padded[1] - input_shape[1] - pad_before_h
-        pad_after_w = input_shape_padded[2] - input_shape[2] - pad_before_w
-
-        input_padded = np.pad(
-            input_np,
-            ((0, 0), (pad_before_h, pad_after_h), (pad_before_w, pad_after_w), (0, 0)),
-            "constant",
-        )
-        return input_padded
-
-    @tvm.testing.requires_hexagon
-    def test_max_pool2d_slice(
-        self,
-        stride,
-        kernel,
-        dtype,
-        dilation,
-        padding,
-        ceil_mode,  # only needed for manually obtaining the test id string
-        input_tensor_populator,  # only needed for manually obtaining the test id string
-        count_include_pad,
-        input_layout,
-        output_layout,
-        output_shape,
-        input_shape,
-        input_shape_padded,
-        input_np,
-        input_np_padded,
-        transformed_input_np_padded,
-        transformed_expected_output_np,
-        expected_output_np,
-        hexagon_session: Session,
-    ):
-        A = te.placeholder(input_shape_padded, name="A", dtype=dtype)
-
-        M = sl.max_pool2d_compute(A, output_shape, kernel, stride, dilation)
-
-        # tir schedule
-        tir_schedule = sl.max_pool2d_STIR_schedule(M, A, output_layout, input_layout)
-        sch = tir_schedule.mod
-
-        input_axis_separator = [4]
-        if output_layout in (
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h8w32c-2d",
-            "n11c-1024c-2d",
-            "n11c-2048c-2d",
-        ):
-            output_axis_separator = [4]
-        else:
-            raise RuntimeError(f"Unexpected layout '{output_layout}'")
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                [A, M],
-                get_hexagon_target("v69"),
-                name="max_pool2d",
-            )
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np_padded,
-            axis_separators=input_axis_separator,
-            mem_scope="global.vtcm",
-        )
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            transformed_expected_output_np.shape,
-            dtype,
-            axis_separators=output_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(input_arr, output_arr)
-        b, h, w, c = output_shape
-        if output_layout == "nhwc-8h2w32c2w-2d":
-            output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
-        elif output_layout == "nhwc-8h8w32c-2d":
-            output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
-        elif output_layout == "n11c-2048c-2d":
-            output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048])
-        elif output_layout == "n11c-1024c-2d":
-            output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024])
-        else:
-            raise RuntimeError(f"Unexpected layout '{output_layout}'")
-
-        if dtype == "float16":
-            np.testing.assert_allclose(
-                output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3
-            )
-        elif dtype == "uint8":
-            np.testing.assert_allclose(output_np, transformed_expected_output_np, atol=1)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
deleted file mode 100644
index 1430551df719..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm.topi.hexagon.slice_ops.relu import relu_compute, relu_stir_schedule
-from tvm import te
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-
-
-@tvm.testing.fixture
-def input_np(in_shape, dtype):
-    return np.random.uniform(size=in_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def ref_output_np(input_np):
-    output_np = input_np * (input_np > 0)
-    return output_np
-
-
-@tvm.testing.fixture
-def transformed_input_np(input_np, input_layout):
-    return transform_numpy(input_np, "nhwc", input_layout)
-
-
-@tvm.testing.fixture
-def transformed_ref_output_np(ref_output_np, output_layout):
-    return transform_numpy(ref_output_np, "nhwc", output_layout)
-
-
-class BaseRelu:
-    in_shape = tvm.testing.parameter(
-        (1, 8, 4, 32),
-        (1, 16, 4, 32),
-        (1, 16, 8, 32),
-        (1, 16, 8, 64),
-        (2, 8, 4, 32),
-        (2, 16, 4, 32),
-        (2, 16, 8, 32),
-        (2, 16, 8, 64),
-    )
-    dtype = tvm.testing.parameter("float16")
-    working_scope = tvm.testing.parameter("global.vtcm")
-    input_layout = tvm.testing.parameter("nhwc-8h2w32c2w-2d")
-    output_layout = tvm.testing.parameter("nhwc-8h2w32c2w-2d")
-
-
-class TestReluSlice(BaseRelu):
-    @tvm.testing.requires_hexagon
-    def test_relu(
-        self,
-        in_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        transformed_input_np,
-        transformed_ref_output_np,
-        working_scope,
-        hexagon_session,
-    ):
-        InputTensor = te.placeholder(in_shape, name="InputTensor", dtype=dtype)
-
-        OutputTensor = relu_compute(InputTensor)
-
-        tir_s = relu_stir_schedule(InputTensor, OutputTensor, input_layout, output_layout)
-
-        input_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-        output_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_ref_output_np.shape,
-            dtype=transformed_ref_output_np.dtype,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        func_name = "relu"
-        with tvm.transform.PassContext(opt_level=3):
-            runtime_module = tvm.build(tir_s.mod, target=get_hexagon_target("v69"), name=func_name)
-
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(input_data, output_data)
-        output_np = output_data.numpy()
-
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_ref_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
deleted file mode 100644
index 2707ed3a5af1..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.topi.testing import softmax_python
-import tvm.topi.hexagon.slice_ops as sl
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-
-def transform_numpy(arr_np, layout):
-
-    if layout in ["nc-512c-2d"]:
-        N, C = arr_np.shape
-        return arr_np.reshape([N, C // 512, 512])
-    raise RuntimeError(f"Unexpected layout '{layout}'")
-
-
-@tvm.testing.fixture
-def input_np(input_shape, dtype):
-    return (np.random.uniform(size=input_shape)).astype(dtype)
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout):
-    return transform_numpy(expected_output_np, output_layout)
-
-
-@tvm.testing.fixture
-def transformed_input_np(input_np, input_layout):
-    return transform_numpy(input_np, input_layout)
-
-
-class Basesoftmax2d:
-
-    input_shape, input_layout, output_layout, axis_sep = tvm.testing.parameters(
-        ((1, 1024), "nc-512c-2d", "nc-512c-2d", [2])
-    )
-    dtype = tvm.testing.parameter("float32")
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-
-class TestSoftmax2d(Basesoftmax2d):
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np):
-        if len(input_np.shape) == 2:
-            ref_np_2d = softmax_python(input_np)
-            return ref_np_2d
-        raise RuntimeError(f"Unexpected input shape '{input_np.shape}'")
-
-    @tvm.testing.requires_hexagon
-    def test_softmax_f32(
-        self,
-        dtype,
-        input_layout,
-        output_layout,
-        input_shape,
-        input_np,
-        transformed_input_np,
-        transformed_expected_output_np,
-        expected_output_np,
-        working_scope,
-        axis_sep,
-        hexagon_session,
-    ):
-        target_hexagon = tvm.target.hexagon(
-            "v69",
-            llvm_options="--disable-loop-unrolling-pass",
-        )
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
-
-        O = sl.softmax_compute(A)
-
-        if input_layout == "nc-512c-2d":
-            tir_s = sl.softmax_stir_schedule(O, A, output_layout, input_layout)
-            sch = tir_s.mod
-        else:
-            raise RuntimeError(f"Unexpected input layout '{input_layout}'")
-
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={
-                "tir.LoopPartition": {"partition_const_loop": True},
-            },
-        ):
-
-            func = tvm.build(
-                sch,
-                [A, O],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
-                name="softmax_slice",
-            )
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=transformed_expected_output_np.dtype,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(input_arr, output_arr)
-
-        n, c = input_np.shape
-        output_np = output_arr.numpy().reshape(1, c // 512, 512)
-
-        np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-4, atol=1e-4)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
deleted file mode 100644
index 6297ef2c1e6e..000000000000
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Test for Hexagon slice tanh op """
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-import tvm.topi.hexagon.slice_ops as sl
-import tvm.contrib.hexagon
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ...infrastructure import transform_numpy, get_hexagon_target
-
-# pylint: disable=invalid-name
-
-
-class TestTanhSlice:
-    """For Testing Tanh fp16 op"""
-
-    input_shape, orig_layout, input_layout, output_layout, axis_sep = tvm.testing.parameters(
-        ((1, 8, 4, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-        ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
-    )
-    dtype = tvm.testing.parameter("float16")
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        return np.random.uniform(size=input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, orig_layout, input_layout):
-        return transform_numpy(input_np, orig_layout, input_layout)
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np):
-        ref_np = np.tanh(input_np)
-        return ref_np
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout):
-        return transform_numpy(expected_output_np, orig_layout, output_layout)
-
-    @tvm.testing.requires_hexagon
-    def test_tanh(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        transformed_input_np,
-        transformed_expected_output_np,
-        axis_sep,
-        hexagon_session,
-        working_scope,
-    ):
-        """Top Level testing function for tanh fp16 op"""
-
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
-        M = sl.tanh_te_compute(A)
-        tanhf16_func = te.create_prim_func([A, M])
-        tir_s = sl.tanhf16_schedule(tanhf16_func, input_layout, output_layout)
-        A_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        M_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=transformed_expected_output_np.dtype,
-            axis_separators=axis_sep,
-            mem_scope=working_scope,
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            tir_irm = tvm.lower(tir_s.mod, [A, M], name="tanhf16")
-            runtime_module = tvm.build(tir_irm, target=get_hexagon_target("v69"), name="tanhf16")
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(A_data, M_data)
-        output_np = M_data.numpy()
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_expected_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py b/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py
deleted file mode 100644
index e5b6c4d79065..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test code for specialized case of adaptive_avg_pool1d."""
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.topi.testing import adaptive_pool
-import tvm.topi.hexagon.qnn as s1
-from tvm.contrib.hexagon import allocate_hexagon_array
-from ..infrastructure import transform_numpy, quantize_np
-
-
-SCALE_M_VAL = None
-ZERO_POINT_M_VAL = None
-SCALE_VAL = None
-ZERO_POINT_VAL = None
-
-
-class TestAdaptivePool1D:
-    """Test specialized case of adaptive_avg_pool1d."""
-
-    (input_shape,) = tvm.testing.parameters(
-        ([1, 128, 128],),
-        ([1, 64, 64],),
-        ([1, 64, 128],),
-        ([1, 32, 64],),
-        ([1, 128, 768],),
-    )
-
-    # Fixed chunk layout is set as ncw-32c64w-2d for now.
-    # The adaptive_avg_pool1d implementation only handles specialized case
-    # where output_size is 1 as it appears on quantized distilbert model.
-    # Since output size won't be a multiple of fixed-chunk,
-    # output_layout is ncw.
-    # For optimization, it might get changed later.
-    input_layout, output_layout, pool_type, layout, output_size, dtype, = tvm.testing.parameters(
-        (
-            "ncw-32c64w-2d",
-            "ncw",
-            "avg",
-            "NCW",
-            [1],
-            "uint8",
-        )
-    )
-
-    @tvm.testing.fixture
-    def expected_output_np(
-        self,
-        input_np,
-        output_size,
-        pool_type,
-        layout,
-    ):
-        """Generate expected output."""
-        out_width = output_size[0]
-
-        ref_np = adaptive_pool(
-            input_np,
-            out_width,
-            pool_type,
-            layout,
-        )
-        return ref_np
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        if dtype in ("uint8", "int8"):
-            dtype = "float32"
-        return np.random.random(input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def quantize_input_np(self, input_np, dtype):
-        if dtype in ("uint8", "int8"):
-            global ZERO_POINT_VAL, SCALE_VAL
-            input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype)
-            return input_np_quantized
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, quantize_input_np, input_layout, layout, dtype):
-        if dtype in ("uint8", "int8"):
-            return transform_numpy(quantize_input_np, layout.lower(), input_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def quantize_expected_output_np(self, expected_output_np, dtype):
-        """Generate expected output."""
-        if dtype in ("uint8", "int8"):
-            global ZERO_POINT_M_VAL, SCALE_M_VAL
-            out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np(
-                expected_output_np, dtype
-            )
-
-            # Since output_layout is ncw, no transformation is needed.
-            return out_ref_quantized
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.requires_hexagon
-    def test_pool1d(
-        self,
-        dtype,
-        output_size,
-        input_layout,
-        output_layout,
-        input_shape,
-        transformed_input_np,
-        quantize_expected_output_np,
-        hexagon_session,
-    ):
-        """Test adaptive_avg_pool1d."""
-        target_hexagon = tvm.target.hexagon("v69")
-        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype)
-
-        m_tensor = s1.adaptive_avg_pool1d(
-            a_tensor,
-            output_size,
-            dtype,
-            ZERO_POINT_VAL,
-            SCALE_VAL,
-            ZERO_POINT_M_VAL,
-            SCALE_M_VAL,
-        )
-
-        tir_schedule = s1.tir_adaptive_avg_pool1d_schedule(
-            m_tensor, a_tensor, output_layout, input_layout
-        )
-
-        sch = tir_schedule.mod
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                [a_tensor, m_tensor],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
-                name="adaptive_pool1d",
-            )
-
-        input_axis_separator = [3]
-
-        a_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            dtype=dtype,
-            axis_separators=input_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        m_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            quantize_expected_output_np.shape,
-            dtype=dtype,
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(a_data_nd, m_data_nd)
-
-        # Convert nd to np
-        m_data_np = m_data_nd.numpy()
-
-        np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=2)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
deleted file mode 100644
index 94cb5ffca543..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
+++ /dev/null
@@ -1,411 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for Add, Subtract and Multiply."""
-import numpy as np
-
-import tvm
-from tvm import te
-import tvm.topi.hexagon.slice_ops as sl
-import tvm.topi.hexagon.qnn as qn
-from tvm.contrib.hexagon import allocate_hexagon_array
-from ..infrastructure import (
-    transform_numpy,
-    quantize_np,
-    get_hexagon_target,
-)
-
-ZERO_POINT_A_VAL = None
-SCALE_A_VAL = None
-
-ZERO_POINT_B_VAL = None
-SCALE_B_VAL = None
-
-ZERO_POINT_M_VAL = None
-SCALE_M_VAL = None
-
-
-def hexagon_wrapper_allocation(
-    device,
-    layout,
-    axis_separators,
-    tensor_shape=None,
-    data_original=None,
-    transformed_data=None,
-    dtype=None,
-):
-    """Input layout can either be nhwc-8h2w32c2w-2d or nhwc"""
-    if layout in ["nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"]:
-        data_nd = allocate_hexagon_array(
-            device,
-            tensor_shape=tensor_shape,
-            data=transformed_data,
-            dtype=dtype,
-            axis_separators=axis_separators,
-            mem_scope="global.vtcm",
-        )
-    elif layout == "nhwc":
-        data_nd = allocate_hexagon_array(
-            device,
-            data=data_original,
-        )
-    return data_nd
-
-
-class TestAddSubtractMultiplyBroadcast2d:
-    """Test Add, Subtract and Multiply class."""
-
-    (
-        input_shape_a,
-        input_shape_b,
-        input_a_layout,
-        input_b_layout,
-        output_layout,
-        dtype,
-    ) = tvm.testing.parameters(
-        # no broadcast needed - short input
-        (
-            [1, 8, 4, 32],
-            [1, 8, 4, 32],
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h2w32c2w-2d",
-            "float16",
-        ),
-        # no broadcast needed - large input
-        (
-            [1, 56, 64, 128],
-            [1, 56, 64, 128],
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h2w32c2w-2d",
-            "float16",
-        ),
-        # one input needs broadcast
-        (
-            [1, 56, 64, 128],
-            [1, 1, 64, 1],
-            "nhwc-8h2w32c2w-2d",
-            "nhwc",
-            "nhwc-8h2w32c2w-2d",
-            "float16",
-        ),
-        # Both input needs broadcast
-        (
-            [1, 56, 1, 128],
-            [1, 1, 64, 1],
-            "nhwc",
-            "nhwc",
-            "nhwc-8h2w32c2w-2d",
-            "float16",
-        ),
-        # One axis in one input needs broadcast
-        (
-            [1, 56, 20, 128],
-            [1, 56, 20, 1],
-            "nhwc-8h2w32c2w-2d",
-            "nhwc",
-            "nhwc-8h2w32c2w-2d",
-            "float16",
-        ),
-        # broadcast all axes in one input
-        (
-            [1, 48, 56, 32],
-            [1, 1, 1, 1],
-            "nhwc-8h2w32c2w-2d",
-            "nhwc",
-            "nhwc-8h2w32c2w-2d",
-            "float16",
-        ),
-        (
-            [1, 48, 32, 64],
-            [1, 48, 32, 64],
-            "nhwc-8h8w32c-2d",
-            "nhwc-8h8w32c-2d",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast axis 2 in one input
-        (
-            [1, 48, 32, 64],
-            [1, 48, 1, 64],
-            "nhwc-8h8w32c-2d",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast axis 1 in one input
-        (
-            [1, 48, 32, 64],
-            [1, 1, 32, 64],
-            "nhwc-8h8w32c-2d",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast axis 3 in one input
-        (
-            [1, 8, 8, 32],
-            [1, 8, 8, 1],
-            "nhwc-8h8w32c-2d",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast both inputs
-        (
-            [1, 56, 1, 128],
-            [1, 1, 64, 1],
-            "nhwc",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast both inputs
-        (
-            [1, 48, 1, 1],
-            [1, 1, 32, 32],
-            "nhwc",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast both inputs
-        (
-            [1, 48, 1, 32],
-            [1, 1, 32, 1],
-            "nhwc",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-        # broadcast all axes in one input
-        (
-            [1, 48, 56, 32],
-            [1, 1, 1, 1],
-            "nhwc-8h8w32c-2d",
-            "nhwc",
-            "nhwc-8h8w32c-2d",
-            "uint8",
-        ),
-    )
-
-    op_name = tvm.testing.parameter("add", "subtract", "multiply")
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np_a, input_np_b, op_name):
-        """Generate expected output."""
-        if op_name == "add":
-            out_ref = np.add(input_np_a, input_np_b)
-        elif op_name == "subtract":
-            out_ref = np.subtract(input_np_a, input_np_b)
-        elif op_name == "multiply":
-            out_ref = np.multiply(input_np_a, input_np_b)
-        return out_ref
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, output_layout, dtype):
-        """Generate expected output."""
-        if dtype == "float16":
-            return transform_numpy(expected_output_np, "nhwc", output_layout)
-        if dtype in ["uint8", "int8"]:
-            global ZERO_POINT_M_VAL, SCALE_M_VAL
-            out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np(
-                expected_output_np, dtype
-            )
-            return transform_numpy(out_ref_quantized, "nhwc", output_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def input_np_a(self, input_shape_a, dtype):
-        """Generate numpy input for variable a."""
-        if dtype in ["uint8", "int8"]:
-            dtype = "float32"
-        return np.random.random(input_shape_a).astype(dtype)
-
-    @tvm.testing.fixture
-    def input_np_b(self, input_shape_b, dtype):
-        """Generate numpy input for variable b."""
-        if dtype in ["uint8", "int8"]:
-            dtype = "float32"
-        return np.random.random(input_shape_b).astype(dtype)
-
-    @tvm.testing.fixture
-    def quantize_input_np_a(self, input_np_a, dtype):
-        if dtype in ["uint8", "int8"]:
-            global ZERO_POINT_A_VAL, SCALE_A_VAL
-            input_np_a_quantized, SCALE_A_VAL, ZERO_POINT_A_VAL = quantize_np(input_np_a, dtype)
-            return input_np_a_quantized
-        return None
-
-    @tvm.testing.fixture
-    def quantize_input_np_b(self, input_np_b, dtype):
-        if dtype in ["uint8", "int8"]:
-            global ZERO_POINT_B_VAL, SCALE_B_VAL
-            input_np_b_quantized, SCALE_B_VAL, ZERO_POINT_B_VAL = quantize_np(input_np_b, dtype)
-            return input_np_b_quantized
-        return None
-
-    @tvm.testing.fixture
-    def transformed_input_np_a(self, input_np_a, quantize_input_np_a, input_a_layout, dtype):
-        if dtype == "float16":
-            return transform_numpy(input_np_a, "nhwc", input_a_layout)
-        if dtype in ["uint8", "int8"]:
-            return transform_numpy(quantize_input_np_a, "nhwc", input_a_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def transformed_input_np_b(self, input_np_b, quantize_input_np_b, input_b_layout, dtype):
-        if dtype == "float16":
-            return transform_numpy(input_np_b, "nhwc", input_b_layout)
-        if dtype in ["uint8", "int8"]:
-            return transform_numpy(quantize_input_np_b, "nhwc", input_b_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.requires_hexagon
-    def test_transform(
-        self,
-        dtype,
-        input_shape_a,
-        input_shape_b,
-        input_np_a,
-        input_np_b,
-        quantize_input_np_a,
-        quantize_input_np_b,
-        transformed_input_np_a,
-        transformed_input_np_b,
-        expected_output_np,
-        transformed_expected_output_np,
-        hexagon_session,
-        output_layout,
-        input_a_layout,
-        input_b_layout,
-        op_name,
-    ):
-        """Test transform."""
-        output_shape = expected_output_np.shape
-        a_tensor = te.placeholder(input_shape_a, name="a_tensor", dtype=dtype)
-        b_tensor = te.placeholder(input_shape_b, name="b_tensor", dtype=dtype)
-        if dtype == "float16":
-            if op_name == "add":
-                m_tensor = sl.add_broadcast_compute(a_tensor, b_tensor)
-            elif op_name == "subtract":
-                m_tensor = sl.subtract_broadcast_compute(a_tensor, b_tensor)
-            elif op_name == "multiply":
-                m_tensor = sl.multiply_broadcast_compute(a_tensor, b_tensor)
-            tir_schedule = sl.tir_broadcast_schedule(
-                m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout, op_name
-            )
-        elif dtype in ["uint8", "int8"]:
-            args = [
-                a_tensor,
-                b_tensor,
-                output_shape,
-                ZERO_POINT_A_VAL,
-                SCALE_A_VAL,
-                ZERO_POINT_B_VAL,
-                SCALE_B_VAL,
-                ZERO_POINT_M_VAL,
-                SCALE_M_VAL,
-                dtype,
-            ]
-            if op_name == "add":
-                m_tensor = qn.qadd_broadcast_compute(*args)
-            elif op_name == "subtract":
-                m_tensor = qn.qsubtract_broadcast_compute(*args)
-            elif op_name == "multiply":
-                m_tensor = qn.qmultiply_broadcast_compute(*args)
-            tir_schedule = qn.tir_schedule_quant(
-                m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout
-            )
-
-        sch = tir_schedule.mod
-
-        input_axis_separator = [4]
-        if output_layout in (
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h8w32c-2d",
-        ):
-            output_axis_separator = [4]
-        else:
-            raise RuntimeError(f"Unexpected layout '{output_layout}'")
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                [a_tensor, b_tensor, m_tensor],
-                get_hexagon_target("v69"),
-                name="slice_op_with_transform",
-            )
-
-        if dtype == "float16":
-            in_data_np_a = input_np_a
-            in_data_np_b = input_np_b
-        elif dtype in ["int8", "uint8"]:
-            in_data_np_a = quantize_input_np_a
-            in_data_np_b = quantize_input_np_b
-        else:
-            raise RuntimeError(f"Unsupport dtype '{dtype}'")
-
-        a_data_nd = hexagon_wrapper_allocation(
-            hexagon_session.device,
-            layout=input_a_layout,
-            data_original=in_data_np_a,
-            transformed_data=transformed_input_np_a,
-            axis_separators=input_axis_separator,
-        )
-        b_data_nd = hexagon_wrapper_allocation(
-            hexagon_session.device,
-            layout=input_b_layout,
-            data_original=in_data_np_b,
-            transformed_data=transformed_input_np_b,
-            axis_separators=input_axis_separator,
-        )
-        m_data_nd = hexagon_wrapper_allocation(
-            hexagon_session.device,
-            layout=output_layout,
-            tensor_shape=transformed_expected_output_np.shape,
-            axis_separators=output_axis_separator,
-            dtype=dtype,
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(a_data_nd, b_data_nd, m_data_nd)
-
-        batch, height, width, channel = output_shape
-        # convert nd to np and reshape to fixed chunk size layout
-        if output_layout == "nhwc-8h2w32c2w-2d":
-            m_data_np = m_data_nd.numpy().reshape(
-                [batch, height // 8, width // 4, channel // 32, 8, 2, 32, 2]
-            )
-        elif output_layout == "nhwc-8h8w32c-2d":
-            m_data_np = m_data_nd.numpy().reshape(
-                [batch, height // 8, width // 8, channel // 32, 8, 8, 32]
-            )
-
-        if dtype == "float16":
-            np.testing.assert_allclose(
-                transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3
-            )
-        elif dtype in ["int8", "uint8"]:
-            np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
deleted file mode 100644
index 22fd96254ca7..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for matmul"""
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-class TestMatMulFloat:
-    """Test MatMul Float class."""
-
-    x_batch, y_batch, m_size, n_size, k_size = tvm.testing.parameters(
-        (1, 1, 16, 16, 32),
-        (5, 5, 16, 16, 32),
-        (5, 5, 16, 20, 32),
-        (30, 30, 16, 20, 32),
-        # Test batch broadcasting.
-        (1, 5, 16, 16, 32),
-        (5, 1, 16, 16, 32),
-    )
-
-    dtype = tvm.testing.parameter(
-        "float32",
-        "float16",
-    )
-
-    # TODO(mehrdadh): add dynamic testing
-    @tvm.testing.requires_hexagon
-    def test_batch_matmul(
-        self, hexagon_session: Session, x_batch, y_batch, m_size, n_size, k_size, dtype
-    ):
-        """Test batch MatMul."""
-        if dtype == "float16":
-            pytest.xfail("float16 is not supported.")
-
-        x = te.placeholder((x_batch, m_size, k_size), name="x")
-        y = te.placeholder((y_batch, n_size, k_size), name="y")
-
-        def get_ref_data():
-            a_np = np.random.uniform(size=(x_batch, m_size, k_size)).astype(dtype)
-            b_np = np.random.uniform(size=(y_batch, n_size, k_size)).astype(dtype)
-            c_np = tvm.topi.testing.batch_matmul(a_np, b_np)
-            return (a_np, b_np, c_np)
-
-        # get the test data
-        a_np, b_np, c_np = get_ref_data()
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fcompute = topi.nn.batch_matmul
-            fschedule = topi.hexagon.schedule_batch_matmul
-            out = fcompute(x, y)
-            s = fschedule([out])
-            out_shape = out.shape
-
-        func = tvm.build(
-            s,
-            [x, y, out],
-            get_hexagon_target("v68"),
-            name="batch_matmul",
-        )
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev)
-        mod["batch_matmul"](a, b, c)
-
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-class TestMatMulInt8:
-    """Test MatMul INT8 class."""
-
-    x_batch, y_batch, m_size, n_size, k_size = tvm.testing.parameters(
-        (1, 1, 2, 3, 1),
-        (1, 1, 16, 24, 32),
-        (5, 5, 24, 16, 32),
-        (30, 30, 16, 20, 32),
-        (1, 5, 16, 16, 32),
-        (5, 1, 16, 16, 32),
-    )
-
-    dtype = tvm.testing.parameter(
-        "float32",
-        "float16",
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_batch_matmul_int8(
-        self, hexagon_session: Session, x_batch, y_batch, m_size, n_size, k_size
-    ):
-        """Test batch matmul INT8."""
-        dtype = "int8"
-        out_dtype = "int8"
-        assert x_batch == y_batch or x_batch == 1 or y_batch == 1
-        x = te.placeholder((x_batch, m_size, k_size), name="x", dtype=dtype)
-        y = te.placeholder((y_batch, n_size, k_size), name="y", dtype=dtype)
-
-        def get_ref_data():
-            a_np = np.random.randint(low=-128, high=127, size=(x_batch, m_size, k_size)).astype(
-                dtype
-            )
-            b_np = np.random.randint(low=-128, high=127, size=(y_batch, n_size, k_size)).astype(
-                dtype
-            )
-            c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype=out_dtype)
-            return (a_np, b_np, c_np)
-
-        # get the test data
-        a_np, b_np, c_np = get_ref_data()
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fcompute = topi.nn.batch_matmul
-            fschedule = topi.hexagon.schedule_batch_matmul
-            out = fcompute(x, y)
-            s = fschedule([out])
-
-        func = tvm.build(
-            s,
-            [x, y, out],
-            get_hexagon_target("v68"),
-            name="batch_matmul_int8",
-        )
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev)
-        mod["batch_matmul_int8"](a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
deleted file mode 100644
index 41fe310d8484..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
+++ /dev/null
@@ -1,246 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Test conv2d HVX intrinsic implementation"""
-
-import numpy as np
-
-import tvm
-import tvm.contrib.hexagon
-from tvm.topi.testing import conv2d_nhwc_python
-
-from ..infrastructure import get_hexagon_target
-
-
-def build_conv2d(target):
-    """Build and the return the conv2d module that calls the intrinsic implementation"""
-    act_n, act_h, act_w, act_c = (
-        tvm.te.var("act_n"),
-        tvm.te.var("act_h"),
-        tvm.te.var("act_w"),
-        tvm.te.var("act_c"),
-    )
-    filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("fw"), tvm.te.var("filt_o")
-    off_l, off_t = tvm.te.var("off_l"), tvm.te.var("off_t")
-    stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w")
-
-    act_flat = tvm.te.placeholder(
-        shape=(act_n, act_h, act_w, act_c), dtype="float16", name="act_flat"
-    )
-    wgt_flat = tvm.te.placeholder(
-        shape=(filt_h, filt_w, act_c, filt_o), dtype="float16", name="wgt_flat"
-    )
-
-    out_flat = tvm.te.extern(
-        shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o),
-        inputs=[act_flat, wgt_flat],
-        fcompute=lambda ins, outs: tvm.tir.call_cpacked(
-            "conv2d_packed_fp16",  # Function from TVM runtime
-            ins[0],
-            ins[1],
-            off_t,
-            off_l,
-            stride_h,
-            stride_w,
-            outs[0],
-            tvm.runtime.const(0),  # resource_handle (unused)
-        ),
-        dtype="float16",
-    )
-
-    s = tvm.te.create_schedule(out_flat.op)
-
-    func_name = "extern_conv"
-    with tvm.transform.PassContext(opt_level=3):
-        module = tvm.build(
-            s,
-            [act_flat, wgt_flat, off_t, off_l, stride_h, stride_w, out_flat],
-            target=target,
-            name=func_name,
-        )
-
-    return module
-
-
-def gen_config(params):
-    """Utility function to generate useful ids for shape_parameters"""
-
-    dims = lambda vals: "x".join(map(str, vals))
-
-    config = {}
-    for param in params:
-        act_shape, wgt_shape, inp_stride = param
-        name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}"
-        config[name] = param
-
-    return config
-
-
-class TestConv2dIntrin:
-    """Test Conv2d Intrin class"""
-
-    shape_parameters = [
-        (
-            (1, 8, 4, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 10, 14, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 14, 6, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 14, 6, 3),
-            (3, 3, 3, 64),
-            (1, 1),
-        ),
-        (
-            (1, 14, 6, 3),
-            (5, 5, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 8, 8, 3),
-            (2, 2, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 14, 6, 64),
-            (3, 3, 64, 3),
-            (1, 1),
-        ),
-        (
-            (1, 4, 4, 40),
-            (3, 3, 40, 3),
-            (1, 1),
-        ),
-        (
-            (1, 4, 4, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 5, 5, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 6, 6, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 7, 7, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 8, 8, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 8, 8, 3),
-            (5, 5, 3, 3),
-            (1, 1),
-        ),
-        (
-            (1, 8, 8, 64),
-            (2, 2, 64, 64),
-            (1, 1),
-        ),
-        (
-            (1, 8, 4, 3),
-            (3, 3, 3, 3),
-            (2, 2),
-        ),
-        (
-            (1, 14, 6, 3),
-            (3, 3, 3, 64),
-            (2, 2),
-        ),
-        (
-            (1, 14, 6, 3),
-            (5, 5, 3, 3),
-            (2, 2),
-        ),
-        (
-            (1, 8, 8, 3),
-            (2, 2, 3, 3),
-            (2, 2),
-        ),
-    ]
-
-    config = gen_config(shape_parameters)
-    act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys())
-    inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session):
-        """Test conv2d intrinsic implementation"""
-        assert act_shape[3] == wgt_shape[2]
-
-        # Currently, input offset does not affect the output shape
-        def get_out_shape(ash, wsh, inp_stride):
-            assert ash[3] == wsh[2]
-            osh = (
-                ash[0],
-                (ash[1] - wsh[0]) // inp_stride[0] + 1,
-                (ash[2] - wsh[1]) // inp_stride[1] + 1,
-                wsh[3],
-            )
-            assert tvm.tir.all([x > 0 for x in osh])
-            return osh
-
-        act = np.random.rand(*act_shape).astype("float16")
-        wgt = np.random.rand(*wgt_shape).astype("float16")
-
-        module = build_conv2d(get_hexagon_target("v68"))
-
-        mod = hexagon_session.load_module(module)
-        output = tvm.nd.array(
-            np.zeros(get_out_shape(act_shape, wgt_shape, inp_stride), dtype="float16"),
-            device=hexagon_session.device,
-        )
-        mod(
-            tvm.nd.array(act, device=hexagon_session.device),
-            tvm.nd.array(wgt, device=hexagon_session.device),
-            inp_offset[0],  # off_t
-            inp_offset[1],  # off_l
-            inp_stride[0],  # stride_height
-            inp_stride[1],  # stride_width
-            output,
-        )
-
-        out = output.numpy()
-
-        # Generate reference output and compare:
-        ref_out = conv2d_nhwc_python(
-            act.astype("float32"), wgt.astype("float32"), stride=inp_stride, padding="VALID"
-        ).astype("float16")
-
-        tvm.testing.assert_allclose(out, ref_out, rtol=5e-2, atol=5e-2)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
deleted file mode 100644
index 9c89427e1b01..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for convolution."""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-class BaseConv2DTests:
-    """Conv2D test class."""
-
-    add_bias = tvm.testing.parameter(False)
-    apply_relu = tvm.testing.parameter(False)
-    dilation = tvm.testing.parameter(1)
-    batch = tvm.testing.parameter(1)
-    dtype = tvm.testing.parameter("float32")
-
-    random_seed = tvm.testing.parameter(0)
-
-    @tvm.testing.fixture
-    def input_shape(self, batch, in_channel, in_size):
-        return (batch, in_channel, in_size, in_size)
-
-    @tvm.testing.fixture
-    def weight_shape(self, num_filter, in_channel, kernel):
-        return (num_filter, in_channel, kernel, kernel)
-
-    @tvm.testing.fixture
-    def bias_shape(self, num_filter):
-        return (num_filter, 1, 1)
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(
-        self,
-        random_seed,
-        input_shape,
-        weight_shape,
-        bias_shape,
-        dtype,
-        stride,
-        padding,
-        dilation,
-        add_bias,
-        apply_relu,
-    ):
-        """Generate reference data."""
-        np.random.seed(random_seed)
-
-        # scipy.signal.convolve2d does not support float16 data types, and
-        # the python fallback is too slow for general use.  Computing
-        # ref_data in float32 will have fewer rounding errors than the TVM
-        # float16 compute, but those vary based on schedule anyways.
-        conv_dtype = "float32" if dtype == "float16" else dtype
-
-        a_np = np.random.uniform(size=input_shape).astype(dtype)
-        w_np = np.random.uniform(size=weight_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(
-            a_np.astype(conv_dtype), dw_np.astype(conv_dtype), stride, padding
-        ).astype(dtype)
-
-        if add_bias:
-            c_np = c_np + b_np
-        if apply_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d_nchw(
-        self,
-        hexagon_session: Session,
-        batch,
-        in_channel,
-        in_size,
-        num_filter,
-        kernel,
-        stride,
-        padding,
-        dtype,
-        ref_data,
-        dilation,
-        add_bias,
-        apply_relu,
-    ):
-        """Test Conv2d NCHW."""
-
-        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-        padding_sum = pad_top + pad_left + pad_bottom + pad_right
-
-        a_np, w_np, b_np, c_np = ref_data
-
-        a_tensor = te.placeholder(a_np.shape, name="a_tensor", dtype=dtype)
-        w_tensor = te.placeholder(w_np.shape, name="w_tensor", dtype=dtype)
-        bias = te.placeholder(b_np.shape, name="bias", dtype=dtype)
-
-        if "int" in dtype:
-            tol = {"atol": 0, "rtol": 0}
-        elif dtype == "float32":
-            tol = {"rtol": 1e-4, "atol": 2e-4}
-        elif dtype == "float16":
-            # a_tensor summation in float16 with a single accumulator very
-            # quickly runs into large rounding errors.  At some point,
-            # this tolerance should be schedule-dependent for to avoid
-            # false negatives.
-            num_values_summed = in_channel * kernel * kernel
-            gap_size = np.nextafter(c_np.max(), np.inf, dtype=c_np.dtype) - c_np.max()
-            tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fcompute = topi.nn.conv2d_nchw
-            fschedule = topi.hexagon.schedule_conv2d_nchw
-            c_tensor = fcompute(
-                a_tensor, w_tensor, (stride, stride), padding, (dilation, dilation), dtype
-            )
-            if add_bias:
-                c_tensor = topi.add(c_tensor, bias)
-            if apply_relu:
-                c_tensor = topi.nn.relu(c_tensor)
-            s = fschedule([c_tensor])
-
-        func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
-            dtype,
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding_sum,
-            dilation,
-        )
-        func = tvm.build(
-            s,
-            [a_tensor, w_tensor, bias, c_tensor],
-            get_hexagon_target("v68"),
-            name=func_name,
-        )
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a_data = tvm.nd.array(a_np, dev)
-        weight = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-
-        c = tvm.nd.array(np.zeros(get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype), dev)
-        mod[func_name](a_data, weight, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
-
-
-class TestBatchSize(BaseConv2DTests):
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (32, 28, 32, 3, 1, 1),
-    )
-    batch = tvm.testing.parameter(1, 4, 9)
-
-
-class TestBiasRelu(BaseConv2DTests):
-    apply_relu = tvm.testing.parameter(True, False, ids=["relu", "no_relu"])
-    add_bias = tvm.testing.parameter(True, False, ids=["bias", "no_bias"])
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (64, 56, 64, 3, 1, 1),
-        (64, 8, 64, 3, 1, (1, 2, 2, 1)),
-        (64, 8, 64, 5, 2, (1, 3)),
-        (64, 8, 64, 3, 1, "VALID"),
-        (32, 8, 32, 24, 1, "SAME"),
-    )
-
-
-class TestResNet18Workloads(BaseConv2DTests):
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (3, 224, 64, 7, 2, 3),
-        (64, 56, 64, 3, 1, 1),
-        (64, 56, 64, 1, 1, 0),
-        (64, 56, 32, 3, 2, 1),
-        (64, 56, 32, 1, 2, 0),
-        (64, 28, 32, 3, 1, 1),
-    )
-
-
-class TestMobilenet(BaseConv2DTests):
-    batch, in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (1, 32, 112, 32, 3, 1, 1),
-    )
-
-
-class TestWeirdWorkloads(BaseConv2DTests):
-    batch, in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (2, 2, 2, 2, 2, 2, 2),
-        (3, 3, 3, 3, 3, 3, 3),
-        (4, 4, 4, 4, 4, 4, 4),
-        (5, 5, 5, 5, 5, 5, 5),
-        (6, 6, 6, 6, 6, 6, 6),
-        (1, 1, 1, 1, 1, 1, 1),
-        (2, 13, 71, 59, 3, 1, 1),
-    )
-
-
-class TestAsymmetricPadding(BaseConv2DTests):
-    dilation = tvm.testing.parameter(1, 2)
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (3, 35, 64, 7, 2, (0, 0, 1, 1)),
-        (64, 8, 128, 3, 1, (3, 3, 2, 2)),
-        (64, 8, 64, 1, 1, (1, 2, 2, 1)),
-        (64, 17, 48, 1, 1, (1, 2)),
-        (64, 8, 64, 3, 1, (3, 1)),
-        (128, 8, 96, 3, 1, (0, 2)),
-        (64, 35, 64, 3, 1, (1, 2)),
-        (64, 8, 64, 1, 1, "VALID"),
-        (388, 8, 64, 3, 1, "VALID"),
-        (64, 10, 48, 3, 1, "VALID"),
-        (64, 19, 64, 1, 1, "SAME"),
-        (64, 5, 32, 2, 1, "SAME"),
-        (32, 8, 32, 3, 1, "SAME"),
-        (64, 8, 64, 3, 1, (1, 2, 2, 1)),
-        (64, 8, 64, 5, 2, (1, 3)),
-        (64, 8, 64, 3, 1, "VALID"),
-        (32, 8, 32, 24, 1, "SAME"),
-        (32, 35, 64, 7, 2, (0, 0, 2, 2)),
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
deleted file mode 100644
index 9edc04db4398..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for convolution."""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-class BaseConv2DTests:
-    """Test Conv2D base class."""
-
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(
-        self, dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation
-    ):
-        """Generate reference data."""
-        in_height = in_width = in_size
-        a_shape = (batch, in_height, in_width, in_channel)
-        w_shape = (kernel, kernel, in_channel, num_filter)
-
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-        b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-        return a_np, w_np, b_np
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d_nhwc(
-        self,
-        hexagon_session: Session,
-        ref_data,
-        batch,
-        in_channel,
-        in_size,
-        num_filter,
-        kernel,
-        dtype,
-        stride,
-        padding,
-        dilation,
-    ):
-        """Test Conv2D NHWC."""
-        a_np, w_np, b_np = ref_data
-
-        a_tensor = te.placeholder(a_np.shape, name="a_tensor", dtype=dtype)
-        w_tensor = te.placeholder(w_np.shape, name="w_tensor", dtype=dtype)
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fcompute = topi.nn.conv2d_nhwc
-            fschedule = topi.hexagon.schedule_conv2d_nhwc
-            b_tensor = fcompute(a_tensor, w_tensor, stride, padding, dilation, dtype)
-            s = fschedule([b_tensor])
-
-        func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
-            dtype,
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding,
-            dilation,
-        )
-        func = tvm.build(
-            s, [a_tensor, w_tensor, b_tensor], get_hexagon_target("v68"), name=func_name
-        )
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a_data = tvm.nd.array(a_np, dev)
-        weight = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=b_tensor.dtype), dev)
-
-        mod[func_name](a_data, weight, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-class TestConv2dNHWC(BaseConv2DTests):
-    (
-        batch,
-        in_channel,
-        in_size,
-        num_filter,
-        kernel,
-        stride,
-        padding,
-        dilation,
-    ) = tvm.testing.parameters(
-        (1, 64, 32, 64, 3, 1, "SAME", 1),
-        (4, 32, 16, 32, 5, 2, "SAME", 1),
-        (1, 64, 32, 64, 3, 1, "VALID", 1),
-        (4, 32, 16, 32, 5, 2, "VALID", 1),
-        (1, 32, 16, 64, 3, 2, (0, 0, 1, 1), 1),
-        (1, 32, 16, 64, 3, 2, (1, 1, 2, 2), 1),
-        (1, 32, 16, 32, 5, 2, (3, 3, 2, 2), 1),
-        (1, 32, 16, 64, 3, 2, (0, 1, 2, 3), 1),
-        (1, 64, 32, 64, 3, 1, "SAME", 2),
-        (1, 64, 32, 64, 3, 1, (1, 1, 2, 2), 2),
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py
deleted file mode 100644
index c26e6142ba5c..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py
+++ /dev/null
@@ -1,261 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Test quantized conv2d HVX intrinsic implementation"""
-
-import numpy as np
-
-import tvm
-import tvm.contrib.hexagon
-from tvm.topi.hexagon.utils import get_fixed_point_value
-from tvm.topi.testing import conv2d_nhwc_python
-
-from ..infrastructure import get_hexagon_target, quantize_np
-
-
-def build_conv2d(target):
-    """Build and return the conv2d IRModule that calls the intrinsic implementation"""
-    act_n, act_h, act_w, act_c = (
-        tvm.te.var("an"),
-        tvm.te.var("ah"),
-        tvm.te.var("aw"),
-        tvm.te.var("ac"),
-    )
-    filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("filt_w"), tvm.te.var("filt_o")
-    act_scale, act_zp = tvm.te.var("act_scale", dtype="float32"), tvm.te.var("act_zp")
-    wgt_scale, wgt_zp = tvm.te.var("wgt_scale", dtype="float32"), tvm.te.var("wgt_zp")
-    out_scale, out_zp = tvm.te.var("out_scale", dtype="float32"), tvm.te.var("out_zp")
-    fixed_final_scale, scale_factor = tvm.te.var("fixed_final_scale", dtype="int32"), tvm.te.var(
-        "scale_factor"
-    )
-    stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w")
-
-    act_flat = tvm.te.placeholder(
-        shape=(act_n, act_h, act_w, act_c), dtype="uint8", name="act_flat"
-    )
-    wgt_flat = tvm.te.placeholder(
-        shape=(filt_h, filt_w, act_c, filt_o), dtype="int8", name="wgt_flat"
-    )
-
-    out_flat = tvm.te.extern(
-        shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o),
-        inputs=[act_flat, wgt_flat],
-        fcompute=lambda ins, outs: tvm.tir.call_cpacked(
-            "conv2d_packed_quant",  # Function from TVM runtime
-            ins[0],
-            ins[1],
-            act_scale,
-            act_zp,
-            wgt_scale,
-            wgt_zp,
-            out_scale,
-            out_zp,
-            stride_h,
-            stride_w,
-            fixed_final_scale,
-            scale_factor,
-            outs[0],
-            tvm.runtime.const(0),  # resource_handle (unused)
-        ),
-        dtype="uint8",
-    )
-
-    s = tvm.te.create_schedule(out_flat.op)
-
-    func_name = "conv2d_quant_hvx"
-    module = tvm.build(
-        s,
-        [
-            act_flat,
-            wgt_flat,
-            act_scale,
-            act_zp,
-            wgt_scale,
-            wgt_zp,
-            out_scale,
-            out_zp,
-            stride_h,
-            stride_w,
-            fixed_final_scale,
-            scale_factor,
-            out_flat,
-        ],
-        target=target,
-        name=func_name,
-    )
-
-    return module
-
-
-def gen_config(params):
-    """Utility function to generate useful ids for shape_parameters"""
-
-    dims = lambda vals: "x".join(map(str, vals))
-
-    config = {}
-    for param in params:
-        act_shape, wgt_shape, inp_stride = param
-        name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}"
-        config[name] = param
-
-    return config
-
-
-class TestQuantConv2dIntrin:
-    """Test Quantized Conv2d Intrin class"""
-
-    shape_parameters = [
-        [
-            (1, 5, 5, 33),
-            (3, 3, 33, 33),
-            (1, 1),
-        ],
-        [
-            (1, 9, 8, 64),
-            (3, 3, 64, 64),
-            (1, 1),
-        ],
-        [
-            (1, 11, 16, 64),
-            (3, 3, 64, 32),
-            (1, 1),
-        ],
-        [
-            (1, 24, 8, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 4, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 5, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 6, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 7, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 8, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 9, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 10, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 11, 3),
-            (3, 3, 3, 3),
-            (1, 1),
-        ],
-        [
-            (1, 4, 4, 5),
-            (3, 3, 5, 3),
-            (1, 1),
-        ],
-    ]
-
-    config = gen_config(shape_parameters)
-    act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys())
-    inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, hexagon_session):
-        """Test quantized conv2d intrinsic implementation"""
-        assert act_shape[3] == wgt_shape[2]
-
-        # Currently, input offset does not affect the output shape
-        def get_out_shape(ash, wsh, inp_stride):
-            assert ash[3] == wsh[2]
-            osh = (
-                ash[0],
-                (ash[1] - wsh[0]) // inp_stride[0] + 1,
-                (ash[2] - wsh[1]) // inp_stride[1] + 1,
-                wsh[3],
-            )
-            assert tvm.tir.all([x > 0 for x in osh])
-            return osh
-
-        act_f = np.random.uniform(-1.5, 1.0, size=act_shape).astype("float32")
-        wgt_f = np.random.uniform(-1.5, 1.0, size=wgt_shape).astype("float32")
-
-        # Quanize activations using onnxruntime
-        act_q, act_scale, act_zp = quantize_np(act_f, dtype="uint8")
-        act_q = act_q.reshape(act_f.shape)
-
-        # Quanize weights using onnxruntime
-        wgt_q, wgt_scale, wgt_zp = quantize_np(wgt_f, dtype="int8")
-        wgt_q = wgt_q.reshape(wgt_f.shape)
-
-        # Generate reference output
-        ref_out = conv2d_nhwc_python(act_f, wgt_f, stride=inp_stride, padding="VALID")
-
-        ref_out_q, out_scale, out_zp = quantize_np(ref_out, dtype="uint8")
-        ref_out_q = ref_out_q.reshape(ref_out.shape)
-
-        final_scale = act_scale * wgt_scale / out_scale
-        fixed_final_scale, scale_factor = get_fixed_point_value(final_scale)
-
-        module = build_conv2d(get_hexagon_target("v69"))
-        mod = hexagon_session.load_module(module)
-
-        output_shape = get_out_shape(act_shape, wgt_shape, inp_stride)
-
-        output = tvm.nd.array(
-            np.zeros(output_shape, dtype="uint8"),
-            device=hexagon_session.device,
-        )
-        mod(
-            tvm.nd.array(act_q, device=hexagon_session.device),
-            tvm.nd.array(wgt_q, device=hexagon_session.device),
-            act_scale,
-            act_zp,
-            wgt_scale,
-            wgt_zp,
-            out_scale,
-            out_zp,
-            inp_stride[0],  # stride_height
-            inp_stride[1],  # stride_width
-            fixed_final_scale,
-            scale_factor,
-            output,
-        )
-
-        out_q = output.numpy()
-
-        tvm.testing.assert_allclose(out_q, ref_out_q, rtol=0, atol=2)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
deleted file mode 100644
index d19223a42d74..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for transposed convolution."""
-import numpy as np
-
-import tvm
-from tvm.contrib.hexagon.session import Session
-import tvm.testing
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-# TODO Should add kernal to tvm.testing.fixture
-
-
-class BaseConv2DTransposeTests:
-    """Conv2D transpose base class."""
-
-    random_seed = tvm.testing.parameter(0)
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d(
-        self,
-        hexagon_session: Session,
-        batch,
-        in_channel,
-        in_size,
-        num_filter,
-        stride,
-        padding,
-        output_padding,
-        random_seed,
-    ):
-        """Test conv2D."""
-        in_height, in_width = in_size
-        kernel_height, kernel_width = (1, 1)
-        stride_height, stride_width = stride
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-        a_tensor = te.placeholder((batch, in_channel, in_height, in_width), name="a_tensor")
-        w_tensor = te.placeholder(
-            (in_channel, num_filter, kernel_height, kernel_width), name="w_tensor"
-        )
-
-        a_shape = get_const_tuple(a_tensor.shape)
-        w_shape = get_const_tuple(w_tensor.shape)
-        dtype = a_tensor.dtype
-
-        def get_ref_data():
-
-            np.random.seed(random_seed)
-            a_np = np.random.uniform(size=a_shape).astype(dtype)
-            w_np = np.random.uniform(size=w_shape).astype(dtype)
-            b_np = tvm.topi.testing.conv2d_transpose_nchw_python(
-                a_np, w_np, stride, padding, output_padding
-            )
-            c_np = np.maximum(b_np, 0)
-            return a_np, w_np, b_np, c_np
-
-        a_np, w_np, b_np, c_np = get_ref_data()
-
-        fcompute_args = (
-            a_tensor,
-            w_tensor,
-            [stride_height, stride_width],
-            [pad_top, pad_left, pad_bottom, pad_right],
-            a_tensor.dtype,
-            output_padding,
-        )
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fcompute = topi.nn.conv2d_transpose_nchw
-            fschedule = topi.hexagon.schedule_conv2d_transpose_nchw
-            b_tensor = fcompute(*fcompute_args)
-            c_tensor = topi.nn.relu(b_tensor)
-            schedule_1 = fschedule([b_tensor])
-            schedule_2 = fschedule([c_tensor])
-
-            dev = hexagon_session.device
-
-            a_data = tvm.nd.array(a_np, dev)
-            weight = tvm.nd.array(w_np, dev)
-            b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=b_tensor.dtype), dev)
-            c = tvm.nd.array(np.zeros(get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype), dev)
-
-            func1 = tvm.build(schedule_1, [a_tensor, w_tensor, b_tensor], get_hexagon_target("v68"))
-            func2 = tvm.build(schedule_2, [a_tensor, w_tensor, c_tensor], get_hexagon_target("v68"))
-
-            mod1 = hexagon_session.load_module(func1)
-            mod2 = hexagon_session.load_module(func2)
-
-            mod1(a_data, weight, b)
-            mod2(a_data, weight, c)
-            tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-class TestConv2DTranspose(BaseConv2DTransposeTests):
-    """Test Conv2D transpose class."""
-
-    (batch, in_channel, in_size, num_filter, stride) = tvm.testing.parameters(
-        (1, 3, (224, 224), 1, (1, 1)),
-        (1, 8, (224, 224), 1, (1, 1)),
-        (1, 512, (8, 1), 128, (31, 1)),
-        (1, 32, (8192, 1), 1, (1, 1)),
-    )
-
-    padding = tvm.testing.parameter((0, 0, 0, 0))
-    output_padding = tvm.testing.parameter((0, 0))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_dense.py b/tests/python/contrib/test_hexagon/topi/test_dense.py
deleted file mode 100644
index fff4fd989f6d..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_dense.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for dense"""
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-class TestDense:
-    """Dense test class."""
-
-    random_seed = tvm.testing.parameter(0)
-
-    use_bias = tvm.testing.parameter(True, False)
-
-    # batch_size more than 8 would break
-    batch_size = tvm.testing.parameter(1, 2, 8)
-
-    in_dim, out_dim = tvm.testing.parameters((1024, 1000))
-
-    in_dtype, out_dtype = tvm.testing.parameters(
-        ("float32", "float32"),
-        ("float16", "float32"),
-        ("int8", "int32"),
-    )
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def dense_ref_data(
-        self, random_seed, batch_size, in_dim, out_dim, use_bias, in_dtype, out_dtype
-    ):
-        """Generate reference data."""
-        np.random.seed(random_seed)
-
-        if "float" in in_dtype:
-            a_np = np.random.uniform(size=(batch_size, in_dim)).astype(in_dtype)
-            b_np = np.random.uniform(size=(out_dim, in_dim)).astype(in_dtype)
-            c_np = np.random.uniform(size=(out_dim,)).astype(out_dtype)
-        elif in_dtype == "int8":
-            a_np = np.random.randint(low=-128, high=127, size=(batch_size, in_dim)).astype(in_dtype)
-            b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(in_dtype)
-            c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(out_dtype)
-        else:
-            raise ValueError("No method to generate test data for data type '{}'".format(in_dtype))
-
-        matmul = np.dot(a_np.astype(out_dtype), b_np.T.astype(out_dtype))
-
-        if use_bias:
-            matmul += c_np
-
-        d_np = np.maximum(matmul, 0)
-        return (a_np, b_np, c_np, d_np)
-
-    @tvm.testing.requires_hexagon
-    def test_dense(
-        self,
-        hexagon_session: Session,
-        batch_size,
-        in_dim,
-        out_dim,
-        use_bias,
-        in_dtype,
-        out_dtype,
-        dense_ref_data,
-    ):
-        """Test dense."""
-        if in_dtype == "float16":
-            pytest.xfail("float16 is not supported.")
-
-        if "int" in in_dtype:
-            tol = {"atol": 0, "rtol": 0}
-        elif in_dtype == "float32":
-            tol = {"rtol": 1e-5, "atol": 1e-5}
-
-        a_tensor = te.placeholder((batch_size, in_dim), name="a_tensor", dtype=in_dtype)
-        b_tensor = te.placeholder((out_dim, in_dim), name="b_tensor", dtype=in_dtype)
-        c_tensor = te.placeholder((out_dim,), name="c_tensor", dtype=out_dtype)
-
-        a_np, b_np, c_np, d_np = dense_ref_data
-
-        fcompute = topi.nn.dense
-        fschedule = topi.hexagon.schedule_dense
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            d_tensor = fcompute(a_tensor, b_tensor, c_tensor if use_bias else None, out_dtype)
-            d_tensor = topi.nn.relu(d_tensor)
-            schedule = fschedule([d_tensor])
-
-        func = tvm.build(
-            schedule,
-            [a_tensor, b_tensor, c_tensor, d_tensor],
-            get_hexagon_target("v68"),
-            name="dense",
-        )
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a_data = tvm.nd.array(a_np, dev)
-        b_data = tvm.nd.array(b_np, dev)
-        c_data = tvm.nd.array(c_np, dev)
-        d_data = tvm.nd.array(np.zeros(get_const_tuple(d_tensor.shape), dtype=out_dtype), dev)
-        mod["dense"](a_data, b_data, c_data, d_data)
-        tvm.testing.assert_allclose(d_data.numpy(), d_np, **tol)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
deleted file mode 100644
index 7d4afb953a50..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=line-too-long, redefined-outer-name
-
-"""Test depth_to_space slice op for hexagon"""
-
-import numpy as np
-
-import tvm
-from tvm import te
-import tvm.testing
-from tvm.topi.hexagon.slice_ops.depth_to_space import d2s_compute, d2s_schedule
-from tvm.topi.testing import depth_to_space_python
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ..infrastructure import transform_numpy, get_hexagon_target
-
-
-class TestD2SSlice:
-    """Test class that defines the Depth to Space slice test"""
-
-    d2s_fp16_tests = (
-        ((1, 8, 8, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 8, 8, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 16, 16, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 16, 16, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 8, 8, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 8, 8, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 16, 16, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-        ((1, 16, 16, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    )
-
-    d2s_uint8_tests = (
-        ((1, 8, 8, 256), 2, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-        ((1, 8, 8, 1024), 4, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-        ((1, 8, 8, 256), 2, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-        ((1, 8, 8, 1024), 4, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-    )
-
-    (input_shape, block_size, mode, dtype, input_layout, output_layout,) = tvm.testing.parameters(
-        *d2s_fp16_tests,
-        *d2s_uint8_tests,
-    )
-
-    working_scope = tvm.testing.parameter("global.vtcm")
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        return np.random.uniform(size=input_shape).astype(dtype)
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, input_layout):
-        return transform_numpy(input_np, "nhwc", input_layout)
-
-    @tvm.testing.fixture
-    def ref_output_np(self, input_np, block_size, mode):
-        a_np = np.transpose(input_np, axes=[0, 3, 1, 2])
-        ref_np = depth_to_space_python(a_np, block_size, mode=mode)
-        ref_np = np.transpose(ref_np, axes=[0, 2, 3, 1])
-        return ref_np
-
-    @tvm.testing.fixture
-    def transformed_ref_output_np(self, ref_output_np, output_layout):
-        return transform_numpy(ref_output_np, "nhwc", output_layout)
-
-    @tvm.testing.requires_hexagon
-    def test_d2s_slice(
-        self,
-        input_shape,
-        block_size,
-        mode,
-        dtype,
-        input_layout,
-        output_layout,
-        hexagon_session,
-        working_scope,
-        transformed_input_np,
-        transformed_ref_output_np,
-    ):
-        """Top level testing function for depth to space"""
-        input_tensor = te.placeholder(input_shape, name="input_tensor", dtype=dtype)
-
-        output = d2s_compute(input_tensor, block_size, "NHWC", mode)
-
-        tir_s = d2s_schedule(input_tensor, output, input_layout, output_layout)
-
-        input_data = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-        output_data = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_ref_output_np.shape,
-            dtype=transformed_ref_output_np.dtype,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            runtime_module = tvm.build(
-                tir_s.mod,
-                [input_tensor, output],
-                target=get_hexagon_target("v69"),
-                name="depth_to_space",
-            )
-        mod = hexagon_session.load_module(runtime_module)
-
-        mod(input_data, output_data)
-        output_np = output_data.numpy()
-
-        tvm.testing.assert_allclose(
-            output_np,
-            transformed_ref_output_np,
-            1e-3,
-            1e-3,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
deleted file mode 100644
index f95d41093043..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Depthwise Conv2D Tests."""
-
-import numpy as np
-
-import tvm
-from tvm.contrib.hexagon.session import Session
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-class BaseDepthwiseConv2D:
-    """Provides the test_conv2d test function, to be used by other test classes.
-
-    Test parameter sets are split out into different classes for
-    readability (e.g. used for mobilenet), and for restrictions
-    (e.g. implemented only for llvm).
-    """
-
-    random_seed = tvm.testing.parameter(0)
-
-    in_dtype, out_dtype = tvm.testing.parameters(
-        ("float32", "float32"),
-    )
-
-    @tvm.testing.fixture
-    def input_shape(self, layout, batch, in_channel, in_size, filter_shape):
-        """Returns input shape."""
-        if layout == "NCHW":
-            return (batch, in_channel, in_size, in_size)
-        elif layout == "NHWC":
-            return (batch, in_size, in_size, in_channel)
-        elif layout == "NCHWc":
-            oc_block = filter_shape[-1]
-            ic_block = next(bn for bn in range(oc_block, 0, -1) if in_channel % bn == 0)
-            return (batch, in_channel // ic_block, in_size, in_size, ic_block)
-        else:
-            raise RuntimeError(f"Not supported layout {layout}")
-
-    @tvm.testing.fixture
-    def filter_shape(self, layout, in_channel, channel_multiplier, kernel):
-        """Returns filter shape."""
-        filter_channel = in_channel
-        if layout == "NCHW":
-            return (filter_channel, channel_multiplier, kernel, kernel)
-        elif layout == "NHWC":
-            return (kernel, kernel, filter_channel, channel_multiplier)
-        elif layout == "NCHWc":
-            out_channel = in_channel * channel_multiplier
-            # For testing the functionality, we choose an arbitrary block
-            # size that can divide out_channel, regardless of the
-            # performance.
-            oc_block = next(bn for bn in range(16, 0, -1) if out_channel % bn == 0)
-            return (out_channel // oc_block, 1, kernel, kernel, 1, oc_block)
-        else:
-            raise RuntimeError(f"Not supported layout {layout}")
-
-    @tvm.testing.fixture
-    def scale_shape(self, layout, in_channel, channel_multiplier, filter_shape):
-        """Returns scale shape."""
-        out_channel = in_channel * channel_multiplier
-
-        if layout in ("NCHW", "NHWC"):
-            return (out_channel,)
-
-        if layout == "NCHWc":
-            oc_block = filter_shape[-1]
-            return (out_channel // oc_block, oc_block)
-
-        raise ValueError("Unknown layout {}".format(layout))
-
-    @tvm.testing.fixture
-    def shift_shape(self, scale_shape):
-        """Returns shift shape."""
-        return scale_shape
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(
-        self,
-        random_seed,
-        in_dtype,
-        out_dtype,
-        layout,
-        input_shape,
-        filter_shape,
-        dilation,
-        stride,
-        padding,
-        scale_shape,
-        shift_shape,
-        use_scale_shift,
-        apply_relu,
-    ):
-        """Generate reference data."""
-        np.random.seed(random_seed)
-
-        # scipy.signal.convolve2d does not support float16 data types, and
-        # the python fallback is too slow for general use.  Computing
-        # ref_data in float32 will have fewer rounding errors than the TVM
-        # float16 compute, but those vary based on schedule anyways.
-        conv_dtype = "float32" if in_dtype == "float16" else in_dtype
-
-        input_np = np.random.uniform(size=input_shape).astype(in_dtype)
-        filter_np = np.random.uniform(size=filter_shape).astype(in_dtype)
-        scale_np = np.random.uniform(size=scale_shape).astype(out_dtype)
-        shift_np = np.random.uniform(size=shift_shape).astype(out_dtype)
-        if layout == "NCHW":
-            np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchw
-            dilation = (1, 1, dilation, dilation)
-            reshape = (1, -1, 1, 1)
-        elif layout == "NHWC":
-            np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nhwc
-            dilation = (dilation, dilation, 1, 1)
-            reshape = (1, 1, 1, -1)
-        elif layout == "NCHWc":
-            np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchwc
-            dilation = (1, 1, dilation, dilation, 1, 1)
-            reshape = (1, scale_shape[0], 1, 1, scale_shape[1])
-
-        dilated_filter_np = tvm.topi.testing.dilate_python(filter_np, dilation)
-        output_np = np_depthwise_conv2d(
-            input_np.astype(conv_dtype), dilated_filter_np.astype(conv_dtype), stride, padding
-        ).astype(out_dtype)
-
-        if use_scale_shift:
-            output_np = output_np * scale_np.reshape(reshape) + shift_np.reshape(reshape)
-        if apply_relu:
-            output_np = np.maximum(output_np, 0)
-
-        return (
-            input_np,
-            filter_np,
-            scale_np,
-            shift_np,
-            output_np,
-        )
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d(
-        self,
-        hexagon_session: Session,
-        in_dtype,
-        out_dtype,
-        layout,
-        input_shape,
-        filter_shape,
-        scale_shape,
-        shift_shape,
-        use_scale_shift,
-        apply_relu,
-        kernel,
-        stride,
-        padding,
-        dilation,
-        ref_data,
-    ):
-        """Test conv2D."""
-        # Transform the padding argument from 'str' to 'tuple' to
-        # match the "workload" tuple in TopHub.  Which padding_args to
-        # use for each layout chosen to reproduce previous behavior.
-        if dilation == 1:
-            padding_args = get_pad_tuple(padding, (kernel, kernel))
-            padding_args_i = [0, 1, 2, 3] if layout == "NCHW" else [0, 1]
-            padding_args = [padding_args[i] for i in padding_args_i]
-        else:
-            padding_args = padding
-
-        # placeholder
-        input_tensor = te.placeholder(input_shape, name="input_tensor", dtype=in_dtype)
-        filter_tensor = te.placeholder(filter_shape, name="filter_tensor", dtype=in_dtype)
-        scale = te.placeholder(scale_shape, name="scale", dtype=out_dtype)
-        shift = te.placeholder(shift_shape, name="shift", dtype=out_dtype)
-
-        if layout == "NCHW":
-            topi_scale_shift = topi.nn.scale_shift_nchw
-            fcompute_args = (input_tensor, filter_tensor, stride, padding_args, dilation, out_dtype)
-
-        elif layout == "NHWC":
-            topi_scale_shift = topi.nn.scale_shift_nhwc
-            fcompute_args = (input_tensor, filter_tensor, stride, padding_args, dilation, out_dtype)
-
-        elif layout == "NCHWc":
-            topi_scale_shift = topi.nn.scale_shift_nchwc
-            in_layout = "NCHW{}c".format(input_shape[-1])
-            out_layout = "NCHW{}c".format(filter_shape[-1])
-            fcompute_args = (
-                input_tensor,
-                filter_tensor,
-                stride,
-                padding,
-                dilation,
-                in_layout,
-                out_layout,
-                out_dtype,
-            )
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            # Declare, build schedule
-            if layout == "NCHW":
-                fcompute = topi.nn.depthwise_conv2d_nchw
-                fschedule = topi.hexagon.schedule_depthwise_conv2d_nchw
-            elif layout == "NHWC":
-                fcompute = topi.nn.depthwise_conv2d_nhwc
-                fschedule = topi.hexagon.schedule_depthwise_conv2d_nhwc
-            c_tensor = fcompute(*fcompute_args)
-            if use_scale_shift:
-                c_tensor = topi_scale_shift(c_tensor, scale, shift)
-            if apply_relu:
-                c_tensor = topi.nn.relu(c_tensor)
-
-            schedule = fschedule([c_tensor])
-
-            # Build and run
-            f = tvm.build(
-                schedule,
-                [input_tensor, filter_tensor, scale, shift, c_tensor],
-                get_hexagon_target("v68"),
-            )
-            mod = hexagon_session.load_module(f)
-
-            input_np, filter_np, scale_np, shift_np, output_np = ref_data
-
-            dev = hexagon_session.device
-            input_tvm = tvm.nd.array(input_np, dev)
-            filter_tvm = tvm.nd.array(filter_np, dev)
-            scale_tvm = tvm.nd.array(scale_np, dev)
-            shift_tvm = tvm.nd.array(shift_np, dev)
-            output_tvm = tvm.nd.array(
-                np.zeros(shape=get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype),
-                dev,
-            )
-
-            mod(input_tvm, filter_tvm, scale_tvm, shift_tvm, output_tvm)
-
-            tol = {"rtol": 1e-4, "atol": 1e-5}
-            tvm.testing.assert_allclose(output_np, output_tvm.numpy(), **tol)
-
-
-class TestDepthwiseConv2DMobilenetWorkloads(BaseDepthwiseConv2D):
-    """Extra tests to verify functionality for workloads used by mobilenet."""
-
-    layout = tvm.testing.parameter("NCHW", "NHWC")
-    use_scale_shift = tvm.testing.parameter(False, ids=["no_scale_shift"])
-    apply_relu = tvm.testing.parameter(False, ids=["no_relu"])
-
-    batch = tvm.testing.parameter(1)
-    channel_multiplier = tvm.testing.parameter(1)
-    kernel = tvm.testing.parameter(3)
-    padding = tvm.testing.parameter("SAME")
-    dilation = tvm.testing.parameter(1)
-
-    in_channel, in_size, stride = tvm.testing.parameters(
-        (32, 112, 1),
-        (64, 112, 2),
-        (128, 56, 1),
-        (128, 56, 2),
-        (256, 28, 1),
-    )
-
-
-class TestDepthwiseConv2D(BaseDepthwiseConv2D):
-    """Test depthwise conv2D class."""
-
-    layout = tvm.testing.parameter("NCHW", "NHWC")
-    use_scale_shift = tvm.testing.parameter(True, False, ids=["with_scale_shift", "no_scale_shift"])
-    apply_relu = tvm.testing.parameter(True, False, ids=["with_relu", "no_relu"])
-
-    (batch, in_channel, in_size, channel_multiplier, kernel, stride) = tvm.testing.parameters(
-        (1, 64, 32, 1, 3, 1),
-        (1, 128, 64, 2, 5, 2),
-    )
-    padding = tvm.testing.parameter("VALID")
-    dilation = tvm.testing.parameter(1)
-
-
-# TODO(hexagon-team): add TestDepthwiseConv2D_NCHWc test.
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_pad.py b/tests/python/contrib/test_hexagon/topi/test_pad.py
deleted file mode 100644
index f44f228a01d3..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_pad.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for reduce"""
-import numpy as np
-
-import tvm
-from tvm import te, topi
-from tvm.contrib.hexagon.session import Session
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-@tvm.testing.requires_hexagon
-def test_nn_pad(hexagon_session: Session):
-    """Test nn pad."""
-    dtype = "uint8"
-    in_shape = (1, 56, 56, 32)
-
-    data_in = np.ones(in_shape).astype(dtype)
-
-    a_tensor = te.placeholder(shape=in_shape, name="a_tensor", dtype=dtype)
-
-    c_tensor = topi.nn.pad(a_tensor, [0, 1, 1, 0], [0, 1, 1, 0], pad_value=0)
-
-    with tvm.target.Target(get_hexagon_target("v68")):
-        fschedule = topi.hexagon.schedule_pad
-        s = fschedule(c_tensor)
-
-    func = tvm.build(s, [a_tensor, c_tensor], get_hexagon_target("v68"), name="pad")
-    mod = hexagon_session.load_module(func)
-
-    dev = hexagon_session.device
-    a = tvm.nd.array(data_in, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype), dev)
-    mod["pad"](a, b)
-
-    # Reference numpy pad output
-    ref_out = np.pad(data_in, pad_width=((0, 0), (1, 1), (1, 1), (0, 0)), mode="constant")
-
-    tvm.testing.assert_allclose(b.numpy(), ref_out)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_pooling.py b/tests/python/contrib/test_hexagon/topi/test_pooling.py
deleted file mode 100644
index 5ae857c2dca5..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_pooling.py
+++ /dev/null
@@ -1,751 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for pooling"""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-
-class TestAdaptivePool:
-    """Adaptive pool test class."""
-
-    dshape, out_size, pool_type, layout = tvm.testing.parameters(
-        ((1, 3, 112, 112), (1, 1), "max", "NCHW"),
-        ((1, 3, 112, 112), (1, 1), "avg", "NCHW"),
-        ((1, 14, 56, 78), (34, 13), "max", "NCHW"),
-        ((1, 5, 46, 97), (4, 96), "avg", "NCHW"),
-        ((1, 112, 112, 3), (1, 1), "max", "NHWC"),
-        ((1, 5, 46, 97), (4, 96), "avg", "NHWC"),
-        ((1, 16, 32, 32, 32), (1, 1, 1), "max", "NCDHW"),
-        ((1, 16, 32, 32, 32), (1, 1, 1), "avg", "NCDHW"),
-        ((1, 16, 32, 32, 32), (2, 2, 2), "avg", "NCDHW"),
-        (
-            (1, 16, 64, 32, 32),
-            (7, 8, 9),
-            "avg",
-            "NCDHW",
-        ),
-        (
-            (1, 16, 64, 32, 32),
-            (8, 16, 16),
-            "avg",
-            "NCDHW",
-        ),
-        ((1, 16, 32, 32, 32), (1, 1, 1), "avg", "NDHWC"),
-        ((1, 16, 32, 32, 32), (2, 2, 2), "max", "NDHWC"),
-        ((1, 16, 32, 32, 32), (2, 4, 4), "max", "NDHWC"),
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_adaptive_pool(self, hexagon_session: Session, dshape, out_size, pool_type, layout):
-        """Test adaptive pool."""
-        dtype = "float32"
-        np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
-        np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout)
-        oshape = np_out.shape
-
-        data = te.placeholder(dshape, name="data", dtype=dtype)
-        if len(out_size) == 2:
-            out = topi.nn.adaptive_pool(data, out_size, pool_type, layout)
-        else:
-            assert len(out_size) == 3
-            out = topi.nn.adaptive_pool3d(data, out_size, pool_type, layout)
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fschedule = topi.hexagon.schedule_adaptive_pool
-            s = fschedule(out)
-
-        func = tvm.build(
-            s,
-            [data, out],
-            get_hexagon_target("v68"),
-            name="adaptive-pool",
-        )
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a = tvm.nd.array(np_data, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), dev)
-        mod["adaptive-pool"](a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), np_out, rtol=4e-5, atol=1e-6)
-
-
-def verify_poolnd(
-    hexagon_session,
-    n,
-    input_shape,
-    kernel,
-    stride,
-    dilation,
-    padding,
-    pool_type,
-    ceil_mode,
-    count_include_pad=True,
-    layout="NCW",
-):
-    """Pool test verification."""
-    a_tensor = te.placeholder(input_shape, name="a_tensor")
-
-    if n == 1:
-        b_tensor = topi.nn.pool1d(
-            a_tensor,
-            kernel=kernel,
-            stride=stride,
-            dilation=dilation,
-            padding=padding,
-            pool_type=pool_type,
-            ceil_mode=ceil_mode,
-            layout=layout,
-            count_include_pad=count_include_pad,
-        )
-    elif n == 2:
-        b_tensor = topi.nn.pool2d(
-            a_tensor,
-            kernel=kernel,
-            stride=stride,
-            dilation=dilation,
-            padding=padding,
-            pool_type=pool_type,
-            ceil_mode=ceil_mode,
-            layout=layout,
-            count_include_pad=count_include_pad,
-        )
-    elif n == 3:
-        b_tensor = topi.nn.pool3d(
-            a_tensor,
-            kernel=kernel,
-            stride=stride,
-            dilation=dilation,
-            padding=padding,
-            pool_type=pool_type,
-            ceil_mode=ceil_mode,
-            layout=layout,
-            count_include_pad=count_include_pad,
-        )
-    else:
-        raise ValueError(f"PoolND only supports n=1, 2, 3 got n={n}")
-
-    b_tensor = topi.nn.relu(b_tensor)
-    dtype = a_tensor.dtype
-    output_shape = [int(i) for i in b_tensor.shape]
-
-    input_np = np.random.uniform(low=0.001, size=input_shape).astype(dtype)
-
-    padding_before = padding[:n]
-    padding_after = padding[n:]
-    ref_np = tvm.topi.testing.poolnd_python(
-        input_np,
-        kernel,
-        stride,
-        dilation,
-        padding_before,
-        padding_after,
-        pool_type,
-        count_include_pad,
-        ceil_mode,
-        layout=layout,
-    )
-
-    np.testing.assert_equal(tuple(output_shape), tuple(ref_np.shape))
-
-    with tvm.target.Target(get_hexagon_target("v68")):
-        fschedule = topi.hexagon.schedule_pool
-        s = fschedule(b_tensor, layout)
-
-    func = tvm.build(s, [a_tensor, b_tensor], get_hexagon_target("v68"), name="pool")
-    mod = hexagon_session.load_module(func)
-
-    dev = hexagon_session.device
-    a = tvm.nd.array(input_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=dtype), dev)
-    mod["pool"](a, b)
-
-    tvm.testing.assert_allclose(b.numpy(), ref_np, rtol=1e-5)
-
-
-class TestPool1D:
-    """Pool1D test class."""
-
-    (
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ) = tvm.testing.parameters(
-        ([1, 16, 32], [2], [2], [1], [0, 0], "avg", False, True, "NCW"),
-        ([1, 16, 31], [3], [3], [1], [1, 2], "avg", False, True, "NCW"),
-        ([1, 16, 32], [2], [2], [1], [1, 2], "avg", False, False, "NCW"),
-        ([1, 16, 31], [4], [4], [1], [3, 3], "avg", False, False, "NCW"),
-        ([1, 16, 31], [4], [4], [1], [0, 0], "avg", False, False, "NCW"),
-        ([1, 16, 32], [2], [2], [1], [0, 0], "max", False, True, "NCW"),
-        ([1, 16, 31], [3], [3], [1], [2, 1], "max", False, True, "NCW"),
-        ([1, 16, 31], [3], [3], [1], [2, 1], "max", True, True, "NCW"),
-        ([1, 16, 31], [3], [3], [1], [2, 5], "avg", False, True, "NCW"),
-        ([1, 16, 32], [2], [2], [1], [0, 3], "avg", False, False, "NCW"),
-        ([1, 16, 31], [3], [3], [1], [1, 4], "max", False, True, "NCW"),
-        ([1, 16, 31], [3], [3], [1], [3, 0], "max", True, True, "NCW"),
-        # Test non-1 dilations
-        ([1, 16, 31], [3], [3], [2], [2, 5], "avg", False, True, "NCW"),
-        ([1, 16, 32], [2], [2], [3], [0, 3], "avg", False, False, "NCW"),
-        ([1, 16, 31], [3], [3], [2], [1, 4], "max", False, True, "NCW"),
-        ([1, 16, 31], [3], [3], [3], [3, 0], "max", True, True, "NCW"),
-        # Test Channel last
-        ([1, 32, 16], [2], [2], [1], [0, 0], "avg", False, True, "NWC"),
-        ([1, 31, 16], [3], [3], [1], [1, 2], "avg", False, True, "NWC"),
-        ([1, 32, 16], [2], [2], [1], [1, 2], "avg", False, False, "NWC"),
-        ([1, 31, 16], [4], [4], [1], [3, 3], "avg", False, False, "NWC"),
-        ([1, 31, 16], [4], [4], [1], [0, 0], "avg", False, False, "NWC"),
-        ([1, 32, 16], [2], [2], [1], [0, 0], "max", False, True, "NWC"),
-        ([1, 31, 16], [3], [3], [1], [2, 1], "max", False, True, "NWC"),
-        ([1, 31, 16], [3], [3], [1], [2, 1], "max", True, True, "NWC"),
-        ([1, 31, 16], [3], [3], [1], [2, 5], "avg", False, True, "NWC"),
-        ([1, 31, 16], [2], [2], [1], [0, 3], "avg", False, False, "NWC"),
-        ([1, 31, 16], [3], [3], [1], [1, 4], "max", False, True, "NWC"),
-        ([1, 31, 16], [3], [3], [1], [3, 0], "max", True, True, "NWC"),
-        ([1, 31, 16], [3], [3], [2], [2, 5], "avg", False, True, "NWC"),
-        ([1, 32, 16], [2], [2], [3], [0, 3], "avg", False, False, "NWC"),
-        ([1, 31, 16], [3], [3], [2], [1, 4], "max", False, True, "NWC"),
-        ([1, 31, 16], [3], [3], [3], [3, 0], "max", True, True, "NWC"),
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_pool1d(
-        self,
-        hexagon_session: Session,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ):
-        """Test Pool1D."""
-        verify_poolnd(
-            hexagon_session,
-            1,
-            input_shape,
-            kernel,
-            stride,
-            dilation,
-            padding,
-            pool_type,
-            ceil_mode,
-            count_include_pad,
-            layout,
-        )
-
-
-class TestPool2D:
-    """Pool2D test class."""
-
-    (
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ) = tvm.testing.parameters(
-        ([1, 16, 32, 32], [2, 2], [2, 2], [1, 1], [0, 0, 0, 0], "avg", False, True, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [1, 2, 1, 2], "avg", False, True, "NCHW"),
-        ([1, 16, 32, 32], [2, 2], [2, 2], [1, 1], [1, 2, 1, 2], "avg", False, False, "NCHW"),
-        ([1, 16, 31, 31], [4, 4], [4, 4], [1, 1], [3, 3, 3, 3], "avg", False, False, "NCHW"),
-        ([1, 16, 31, 31], [4, 4], [4, 4], [1, 1], [0, 0, 0, 0], "avg", False, False, "NCHW"),
-        ([1, 16, 32, 32], [2, 3], [2, 2], [1, 1], [0, 0, 0, 0], "max", False, True, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", False, True, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", True, True, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [2, 1, 0, 3], "avg", False, True, "NCHW"),
-        ([1, 16, 32, 32], [2, 3], [2, 2], [1, 1], [0, 3, 2, 1], "avg", False, False, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [1, 0, 3, 2], "max", False, True, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [3, 2, 1, 0], "max", True, True, "NCHW"),
-        # Test non-1 dilations
-        ([1, 16, 31, 31], [3, 3], [3, 3], [2, 1], [2, 1, 0, 3], "avg", False, True, "NCHW"),
-        ([1, 16, 32, 32], [2, 3], [2, 2], [2, 3], [0, 3, 2, 1], "avg", False, False, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [3, 3], [1, 0, 3, 2], "max", False, True, "NCHW"),
-        ([1, 16, 31, 31], [3, 3], [3, 3], [2, 2], [3, 2, 1, 0], "max", True, True, "NCHW"),
-        # Test channel last
-        ([1, 32, 32, 16], [2, 2], [2, 2], [1, 1], [0, 0, 0, 0], "avg", False, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [1, 2, 1, 2], "avg", False, True, "NHWC"),
-        ([1, 32, 32, 16], [2, 2], [2, 2], [1, 1], [1, 2, 1, 2], "avg", False, False, "NHWC"),
-        ([1, 31, 31, 16], [4, 4], [4, 4], [1, 1], [3, 3, 3, 3], "avg", False, False, "NHWC"),
-        ([1, 31, 31, 16], [4, 4], [4, 4], [1, 1], [0, 0, 0, 0], "avg", False, False, "NHWC"),
-        ([1, 32, 32, 16], [2, 3], [2, 2], [1, 1], [0, 0, 0, 0], "max", False, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", False, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", True, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [2, 1, 0, 3], "avg", False, True, "NHWC"),
-        ([1, 32, 32, 16], [2, 3], [2, 2], [1, 1], [0, 3, 2, 1], "avg", False, False, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [1, 0, 3, 2], "max", False, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [3, 2, 1, 0], "max", True, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [2, 1], [2, 1, 0, 3], "avg", False, True, "NHWC"),
-        ([1, 32, 32, 16], [2, 3], [2, 2], [2, 3], [0, 3, 2, 1], "avg", False, False, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [3, 3], [1, 0, 3, 2], "max", False, True, "NHWC"),
-        ([1, 31, 31, 16], [3, 3], [3, 3], [2, 2], [3, 2, 1, 0], "max", True, True, "NHWC"),
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_pool2d(
-        self,
-        hexagon_session: Session,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ):
-        """Test Pool2D."""
-        verify_poolnd(
-            hexagon_session,
-            2,
-            input_shape,
-            kernel,
-            stride,
-            dilation,
-            padding,
-            pool_type,
-            ceil_mode,
-            count_include_pad,
-            layout,
-        )
-
-
-class TestPool3D:
-    """Pool3D test class."""
-
-    (
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ) = tvm.testing.parameters(
-        (
-            [1, 16, 32, 32, 32],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [0, 0, 0, 0, 0, 0],
-            "avg",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [1, 1, 2, 2, 2, 1],
-            "avg",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 32, 32, 32],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [1, 1, 2, 2, 2, 1],
-            "avg",
-            False,
-            False,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [4, 4, 4],
-            [4, 4, 4],
-            [1, 1, 1],
-            [3, 3, 3, 3, 3, 3],
-            "avg",
-            False,
-            False,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [4, 4, 4],
-            [4, 4, 4],
-            [1, 1, 1],
-            [0, 0, 0, 0, 0, 0],
-            "avg",
-            False,
-            False,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 32, 32, 32],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [0, 0, 0, 0, 0, 0],
-            "max",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [2, 2, 1, 1, 1, 2],
-            "max",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [2, 2, 1, 1, 1, 2],
-            "max",
-            True,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [2, 1, 0, 5, 4, 3],
-            "avg",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 32, 32, 32],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [0, 5, 4, 3, 2, 1],
-            "avg",
-            False,
-            False,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [1, 0, 5, 4, 3, 2],
-            "max",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [3, 2, 1, 0, 5, 4],
-            "max",
-            True,
-            True,
-            "NCDHW",
-        ),
-        # Test non-1 dilation
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [3, 3, 3],
-            [2, 1, 0, 5, 4, 3],
-            "avg",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 32, 32, 32],
-            [2, 2, 2],
-            [2, 2, 2],
-            [2, 2, 2],
-            [0, 5, 4, 3, 2, 1],
-            "avg",
-            False,
-            False,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [2, 1, 3],
-            [1, 0, 5, 4, 3, 2],
-            "max",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [2, 2, 3],
-            [3, 2, 1, 0, 5, 4],
-            "max",
-            True,
-            True,
-            "NCDHW",
-        ),
-        # Test channel last layouts
-        (
-            [1, 32, 32, 32, 16],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [0, 0, 0, 0, 0, 0],
-            "avg",
-            False,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [1, 1, 2, 2, 2, 1],
-            "avg",
-            False,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 32, 32, 32, 16],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [1, 1, 2, 2, 2, 1],
-            "avg",
-            False,
-            False,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [4, 4, 4],
-            [4, 4, 4],
-            [1, 1, 1],
-            [3, 3, 3, 3, 3, 3],
-            "avg",
-            False,
-            False,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [4, 4, 4],
-            [4, 4, 4],
-            [1, 1, 1],
-            [0, 0, 0, 0, 0, 0],
-            "avg",
-            False,
-            False,
-            "NDHWC",
-        ),
-        (
-            [1, 32, 32, 32, 16],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [0, 0, 0, 0, 0, 0],
-            "max",
-            False,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [2, 2, 1, 1, 1, 2],
-            "max",
-            False,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [2, 2, 1, 1, 1, 2],
-            "max",
-            True,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [2, 1, 0, 5, 4, 3],
-            "avg",
-            False,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 32, 32, 32, 16],
-            [2, 2, 2],
-            [2, 2, 2],
-            [1, 1, 1],
-            [0, 5, 4, 3, 2, 1],
-            "avg",
-            False,
-            False,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [1, 0, 5, 4, 3, 2],
-            "max",
-            False,
-            True,
-            "NDHWC",
-        ),
-        (
-            [1, 31, 31, 31, 16],
-            [3, 3, 3],
-            [3, 3, 3],
-            [1, 1, 1],
-            [3, 2, 1, 0, 5, 4],
-            "max",
-            True,
-            True,
-            "NDHWC",
-        ),
-        # Test non-1 dilation
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [3, 3, 3],
-            [2, 1, 0, 5, 4, 3],
-            "avg",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 32, 32, 32],
-            [2, 2, 2],
-            [2, 2, 2],
-            [2, 2, 2],
-            [0, 5, 4, 3, 2, 1],
-            "avg",
-            False,
-            False,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [2, 1, 3],
-            [1, 0, 5, 4, 3, 2],
-            "max",
-            False,
-            True,
-            "NCDHW",
-        ),
-        (
-            [1, 16, 31, 31, 31],
-            [3, 3, 3],
-            [3, 3, 3],
-            [2, 2, 3],
-            [3, 2, 1, 0, 5, 4],
-            "max",
-            True,
-            True,
-            "NCDHW",
-        ),
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_pool3d(
-        self,
-        hexagon_session: Session,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        count_include_pad,
-        layout,
-    ):
-        """Test Pool3D."""
-        verify_poolnd(
-            hexagon_session,
-            3,
-            input_shape,
-            kernel,
-            stride,
-            dilation,
-            padding,
-            pool_type,
-            ceil_mode,
-            count_include_pad,
-            layout,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py
deleted file mode 100644
index 2c54b12ab98e..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_quantize.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""TIR quantize schedule tests."""
-import numpy as np
-
-import tvm
-from tvm import te
-import tvm.topi.hexagon.qnn as s1
-from tvm.contrib.hexagon import allocate_hexagon_array
-from ..infrastructure import (
-    transform_numpy,
-    quantize_np,
-    get_hexagon_target,
-)
-
-QUANTIZE_SCALE = None
-QUANTIZE_ZERO_POINT = None
-
-
-class TestQuantize:
-    """Test quantize class."""
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, output_dtype):
-        global QUANTIZE_SCALE, QUANTIZE_ZERO_POINT
-        quant_np, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT = quantize_np(input_np, output_dtype)
-        return quant_np
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, input_dtype):
-        return np.random.random(input_shape).astype(input_dtype)
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, input_crouton_layout):
-        return transform_numpy(input_np, "nhwc", input_crouton_layout)
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, output_layout):
-        return transform_numpy(expected_output_np, "nhwc", output_layout)
-
-    input_crouton_layout, output_layout, input_dtype = tvm.testing.parameters(
-        ("nhwc-4h2w32c2w-2d", "nhwc-8h8w32c-2d", "float32"),
-    )
-
-    output_dtype = tvm.testing.parameter("uint8", "int8")
-
-    input_shape = tvm.testing.parameter(
-        (1, 8, 8, 32), (1, 16, 16, 32), (1, 16, 16, 128), (1, 64, 64, 64)
-    )
-
-    @tvm.testing.requires_hexagon
-    def test_quantize(
-        self,
-        input_dtype,
-        output_dtype,
-        transformed_input_np,
-        input_shape,
-        expected_output_np,
-        transformed_expected_output_np,
-        input_crouton_layout,
-        output_layout,
-        hexagon_session,
-    ):
-        """Test quantize."""
-        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=input_dtype)
-
-        m_tensor = s1.quantize_compute(a_tensor, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT, output_dtype)
-
-        tir_schedule = s1.tir_quantize_schedule(
-            m_tensor, a_tensor, input_crouton_layout, output_layout
-        )
-
-        sch = tir_schedule.mod
-
-        input_axis_separator = [4]
-        output_axis_separator = [4]
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                [a_tensor, m_tensor],
-                get_hexagon_target("v69"),
-                name="quantize",
-            )
-
-        a_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            dtype=input_dtype,
-            axis_separators=input_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        m_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=transformed_expected_output_np.shape,
-            dtype=output_dtype,
-            axis_separators=output_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(a_data_nd, m_data_nd)
-
-        b, h, weight, c = expected_output_np.shape
-
-        # convert nd to np and reshape to fixed chunk size layout
-        m_data_np = m_data_nd.numpy().reshape([b, h // 8, weight // 8, c // 32, 8, 8, 32])
-
-        np.testing.assert_allclose(transformed_expected_output_np, m_data_np, atol=1)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_reduce.py b/tests/python/contrib/test_hexagon/topi/test_reduce.py
deleted file mode 100644
index eb798db1dd2b..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_reduce.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for reduce"""
-import numpy as np
-
-import tvm
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-
-from ..infrastructure import get_hexagon_target
-
-
-def _my_npy_argmax(arr, axis, keepdims):
-    if not keepdims:
-        return arr.argmax(axis=axis)
-    else:
-        if axis is None:
-            out_shape = [1 for _ in arr.shape]
-        else:
-            out_shape = list(arr.shape)
-            out_shape[axis] = 1
-
-        return arr.argmax(axis=axis).reshape(out_shape)
-
-
-def _my_npy_argmin(arr, axis, keepdims):
-    if not keepdims:
-        return arr.argmin(axis=axis)
-    else:
-        if axis is None:
-            out_shape = [1 for _ in arr.shape]
-        else:
-            out_shape = list(arr.shape)
-            out_shape[axis] = 1
-        return arr.argmin(axis=axis).reshape(out_shape)
-
-
-class TestReduce:
-    """Test reduce class."""
-
-    in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters(
-        ((32,), 0, False, "argmax", "float32"),
-        ((32, 24, 32, 24), (1, 2, 3), True, "sum", "float32"),
-        ((2, 3), None, True, "all", "bool"),
-        ((32, 24 * 32 * 24), (1,), False, "max", "float32"),
-        ((32, 128, 24), None, True, "sum", "float32"),
-        ((32, 128, 24), None, True, "all", "bool"),
-        ((32, 24, 32, 24), (0, 2), False, "min", "float32"),
-        ((32, 128), 1, True, "argmax", "float32"),
-        ((32, 24, 32, 24), 2, False, "argmin", "float32"),
-        ((31, 21, 15), None, True, "argmax", "float32"),
-        ((31, 21, 15), None, False, "sum", "float32"),
-        ((2, 3), None, True, "any", "bool"),
-        ((32, 128, 24), None, True, "any", "bool"),
-        ((1, 4, 7), 1, True, "any", "bool"),
-        ((32, 24, 32, 24), 2, False, "any", "bool"),
-    )
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(self, in_shape, axis, keepdims, reduce_type, dtype):
-        """Generate test reference data."""
-        if dtype == "bool":
-            in_npy_map = in_npy = np.random.choice([True, False], size=in_shape)
-        else:
-            in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
-            in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
-
-        if reduce_type == "sum":
-            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
-        elif reduce_type == "all" and dtype == "bool":
-            out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
-        elif reduce_type == "any" and dtype == "bool":
-            out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
-        elif reduce_type == "max":
-            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
-        elif reduce_type == "min":
-            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
-        elif reduce_type == "argmax":
-            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
-        elif reduce_type == "argmin":
-            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
-        else:
-            raise NotImplementedError
-
-        return in_npy, in_npy_map, out_npy
-
-    @tvm.testing.requires_hexagon
-    def test_reduce_map(
-        self, hexagon_session: Session, ref_data, in_shape, axis, keepdims, reduce_type, dtype
-    ):
-        """Test reduce map."""
-        in_npy, in_npy_map, out_npy = ref_data
-
-        # Build the logic and compile the function
-        a_tensor = te.placeholder(shape=in_shape, name="a_tensor", dtype=dtype)
-        a1_tensor = topi.sqrt(topi.exp(a_tensor))
-        out_dtype = dtype
-        if reduce_type == "sum":
-            b_tensor = topi.sum(a1_tensor, axis=axis, keepdims=keepdims)
-        elif reduce_type == "all":
-            b_tensor = topi.all(a_tensor, axis=axis, keepdims=keepdims)
-        elif reduce_type == "any":
-            b_tensor = topi.any(a_tensor, axis=axis, keepdims=keepdims)
-        elif reduce_type == "max":
-            b_tensor = topi.max(a1_tensor, axis=axis, keepdims=keepdims)
-        elif reduce_type == "min":
-            b_tensor = topi.min(a1_tensor, axis=axis, keepdims=keepdims)
-        elif reduce_type == "argmax":
-            b_tensor = topi.argmax(a1_tensor, axis=axis, keepdims=keepdims)
-            out_dtype = "int32"
-        elif reduce_type == "argmin":
-            b_tensor = topi.argmin(a1_tensor, axis=axis, keepdims=keepdims)
-            out_dtype = "int32"
-        else:
-            raise NotImplementedError
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fschedule = topi.hexagon.schedule_reduce
-            s = fschedule(b_tensor)
-
-        func = tvm.build(s, [a_tensor, b_tensor], get_hexagon_target("v68"), name=reduce_type)
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        data_tvm = tvm.nd.array(in_npy, device=dev)
-        out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype)
-
-        mod[reduce_type](data_tvm, out_tvm)
-
-        if reduce_type in ["argmax", "argmin"]:
-            out_tvm_indices = out_tvm.numpy()
-            if keepdims:
-                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
-            if axis is None:
-                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
-            else:
-                other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1) :]))
-                sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
-                out_tvm_val = in_npy_map[sel_indices]
-            if reduce_type == "argmax":
-                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3)
-            elif reduce_type == "argmin":
-                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3)
-        else:
-            tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_reshape.py b/tests/python/contrib/test_hexagon/topi/test_reshape.py
deleted file mode 100644
index 51ac12506023..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_reshape.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test reshape class."""
-import numpy as np
-
-import tvm
-import tvm.testing
-import tvm.topi.hexagon.slice_ops as sl
-from tvm import te
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ..infrastructure import transform_numpy, get_hexagon_target
-
-BATCH_FLATTEN_FP16_TESTS = (
-    ([1, 1, 1, 2048], [1, 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-    ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-    ([1, 8, 8, 1024], [1, 8 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-    ([2, 4, 8, 1024], [2, 4 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-)
-
-BATCH_FLATTEN_UINT8_TESTS = (
-    ([1, 1, 1, 2048], [1, 2048], "nhwc-2048c-2d", "nc-2048-2d", "uint8"),
-    ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-2048c-2d", "nc-2048-2d", "uint8"),
-)
-
-
-def reshape_helper(
-    func,
-    fcompute,
-    fschedule,
-    data_type,
-    input_shape,
-    input_layout,
-    output_shape,
-    output_layout,
-    hexagon_session,
-):
-    """Reshape helper function."""
-
-    a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=data_type)
-    if func == "reshape":
-        d_tesnsor = fcompute(a_tensor, output_shape)
-    elif func == "batch_flatten":
-        d_tesnsor = fcompute(a_tensor)
-    else:
-        raise RuntimeError(f"Unexpected func'{func}'")
-    tir_s = fschedule(
-        d_tesnsor,
-        a_tensor,
-        output_layout,
-        input_layout,
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        runtime_module = tvm.build(tir_s.mod, target=get_hexagon_target("v69"), name=func)
-
-    mod = hexagon_session.load_module(runtime_module)
-
-    a_numpy = (np.random.uniform(-10, 10, input_shape)).astype(data_type)
-    ref = np.reshape(a_numpy, output_shape)
-
-    input_np_transformed = transform_numpy(a_numpy, "nhwc", input_layout)
-    ref_np_transformed = transform_numpy(ref, "nhwc", output_layout)
-    input_axis_sep = [4]
-    if output_layout in ["nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"]:
-        output_axis_sep = [4]
-    elif output_layout in ["nc-1024-2d", "nc-2048-2d"]:
-        output_axis_sep = [2]
-    else:
-        raise RuntimeError(f"Unexpected layout '{output_layout}'")
-
-    a_tvm = allocate_hexagon_array(
-        hexagon_session.device,
-        data=input_np_transformed,
-        axis_separators=input_axis_sep,
-        mem_scope="global.vtcm",
-    )
-    output = allocate_hexagon_array(
-        hexagon_session.device,
-        ref_np_transformed.shape,
-        data_type,
-        axis_separators=output_axis_sep,
-        mem_scope="global.vtcm",
-    )
-
-    mod(a_tvm, output)
-    np.testing.assert_allclose(output.numpy(), ref_np_transformed, atol=1e-07, rtol=0)
-
-
-class BaseTestBatchFlatten:
-    """Test batch flatten class."""
-
-    (input_shape, output_shape, input_layout, output_layout, data_type,) = tvm.testing.parameters(
-        *BATCH_FLATTEN_FP16_TESTS,
-        *BATCH_FLATTEN_UINT8_TESTS,
-    )
-
-
-class TestBatchFlatten(BaseTestBatchFlatten):
-    """Test batch flatten class."""
-
-    @tvm.testing.requires_hexagon
-    def test_batch_flatten(
-        self,
-        data_type,
-        input_shape,
-        input_layout,
-        output_shape,
-        output_layout,
-        hexagon_session,
-    ):
-        """Test batch flatten."""
-        reshape_helper(
-            "batch_flatten",
-            sl.batch_flatten_compute,
-            sl.batch_flatten_stir_schedule,
-            data_type,
-            input_shape,
-            input_layout,
-            output_shape,
-            output_layout,
-            hexagon_session,
-        )
-
-
-class BaseTestReshape(BaseTestBatchFlatten):
-    """Test reshape base class."""
-
-    reshape_fp16_tests = (
-        ([1, 8, 4, 64], [1, 8, 8, 32], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
-        ([1, 16, 8, 128], [1, 16, 16, 64], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
-    )
-
-    reshape_uint8_tests = (
-        ([1, 8, 8, 128], [1, 8, 16, 64], "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
-        ([1, 16, 64, 128], [1, 16, 128, 64], "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
-    )
-
-    (input_shape, output_shape, input_layout, output_layout, data_type,) = tvm.testing.parameters(
-        *BATCH_FLATTEN_FP16_TESTS,
-        *BATCH_FLATTEN_UINT8_TESTS,
-        *reshape_fp16_tests,
-        *reshape_uint8_tests,
-    )
-
-
-class TestReshape(BaseTestReshape):
-    """Test reshape class."""
-
-    @tvm.testing.requires_hexagon
-    def test_reshape(
-        self,
-        data_type,
-        input_shape,
-        input_layout,
-        output_shape,
-        output_layout,
-        hexagon_session,
-    ):
-        """Test reshape."""
-        reshape_helper(
-            "reshape",
-            sl.reshape_compute,
-            sl.reshape_stir_schedule,
-            data_type,
-            input_shape,
-            input_layout,
-            output_shape,
-            output_layout,
-            hexagon_session,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
deleted file mode 100644
index 4adb7c6768e7..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-"""Resize 2D tesst.
-"""
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.topi.testing import resize2d_python
-import tvm.topi.hexagon as s1
-from tvm.contrib.hexagon import allocate_hexagon_array
-
-from ..infrastructure import transform_numpy, get_hexagon_target
-
-
-class TestResize2d:
-    """Test resize 2D class."""
-
-    (batch, channel, in_height, in_width, out_height, out_width,) = tvm.testing.parameters(
-        (
-            1,
-            32,
-            8,
-            8,
-            16,
-            16,
-        ),
-        (
-            1,
-            32,
-            48,
-            48,
-            8,
-            8,
-        ),
-    )
-
-    (layout, input_crouton_layout, output_layout, dtype,) = tvm.testing.parameters(
-        ("NHWC", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
-        ("NHWC", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
-    )
-
-    coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel")
-    method = tvm.testing.parameter("nearest_neighbor", "linear")
-
-    @tvm.testing.fixture
-    def expected_output_np(
-        self,
-        input_np,
-        in_height,
-        in_width,
-        out_height,
-        out_width,
-        layout,
-        method,
-        coord_trans,
-    ):
-        """Generate expected output."""
-        scale_h = out_height / in_height
-        scale_w = out_width / in_width
-
-        return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans)
-
-    @tvm.testing.fixture
-    def input_np(self, input_shape, dtype):
-        if dtype == "float16":
-            return np.random.random(input_shape).astype(dtype)
-        if dtype == "uint8":
-            return np.random.randint(0, 255, input_shape).astype(dtype)
-        if dtype == "int8":
-            return np.random.randint(-128, 127, input_shape).astype(dtype)
-        raise RuntimeError(f"dtype {dtype} is not valid.")
-
-    @tvm.testing.fixture
-    def transformed_input_np(self, input_np, layout, input_crouton_layout, dtype):
-        if dtype in ["float16", "uint8", "int8"]:
-            return transform_numpy(input_np, layout.lower(), input_crouton_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def transformed_expected_output_np(self, expected_output_np, layout, output_layout, dtype):
-        if dtype in ["float16", "uint8", "int8"]:
-            return transform_numpy(expected_output_np, layout.lower(), output_layout)
-
-        raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-    @tvm.testing.fixture
-    def input_shape(self, batch, channel, in_height, in_width):
-        return (batch, in_height, in_width, channel)
-
-    @tvm.testing.fixture
-    def output_shape(self, batch, channel, out_height, out_width):
-        return (batch, out_height, out_width, channel)
-
-    @tvm.testing.requires_hexagon
-    def test_resize2d(
-        self,
-        dtype,
-        input_np,
-        transformed_input_np,
-        input_shape,
-        output_shape,
-        expected_output_np,
-        transformed_expected_output_np,
-        layout,
-        input_crouton_layout,
-        output_layout,
-        coord_trans,
-        method,
-        hexagon_session,
-    ):
-        """Test resize 2D."""
-        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype)
-
-        m_tensor = s1.resize2d_compute(
-            a_tensor,
-            [0.0] * 4,
-            (output_shape[1], output_shape[2]),
-            layout=layout,
-            coordinate_transformation_mode=coord_trans,
-            method=method,
-            out_dtype=dtype,
-        )
-
-        tir_schedule = s1.tir_resize2d_schedule(
-            m_tensor, a_tensor, input_crouton_layout, output_layout
-        )
-
-        sch = tir_schedule.mod
-
-        input_axis_separator = [4]
-        if output_layout in (
-            "nhwc-8h2w32c2w-2d",
-            "nhwc-8h8w32c-2d",
-        ):
-            output_axis_separator = [4]
-        else:
-            raise RuntimeError(f"Unexpected layout '{output_layout}'")
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(
-                sch,
-                [a_tensor, m_tensor],
-                get_hexagon_target("v69"),
-                name="resize2d",
-            )
-
-        a_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            data=transformed_input_np,
-            dtype=dtype,
-            axis_separators=input_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        m_data_nd = allocate_hexagon_array(
-            hexagon_session.device,
-            transformed_expected_output_np.shape,
-            dtype=dtype,
-            axis_separators=output_axis_separator,
-            mem_scope="global.vtcm",
-        )
-
-        mod = hexagon_session.load_module(func)
-        mod(a_data_nd, m_data_nd)
-
-        batch_size, height, width, channel = output_shape
-        # convert nd to np and reshape to fixed chunk size layout
-        if output_layout == "nhwc-8h2w32c2w-2d":
-            m_data_np = m_data_nd.numpy().reshape(
-                [batch_size, height // 8, width // 4, channel // 32, 8, 2, 32, 2]
-            )
-        elif output_layout == "nhwc-8h8w32c-2d":
-            m_data_np = m_data_nd.numpy().reshape(
-                [batch_size, height // 8, width // 8, channel // 32, 8, 8, 32]
-            )
-
-        if dtype == "float16":
-            np.testing.assert_allclose(
-                transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3
-            )
-        elif dtype in ["int8", "uint8"]:
-            np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax.py b/tests/python/contrib/test_hexagon/topi/test_softmax.py
deleted file mode 100644
index e1b4d97bc171..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_softmax.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for softmax"""
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import topi
-from tvm import te
-from tvm.contrib.hexagon.session import Session
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-from ..infrastructure import get_hexagon_target
-
-# TODO(mehrdadh): add log_softmax to config
-OPERATOR_CONFIGS = {
-    "softmax": {
-        "topi": topi.nn.softmax,
-        "ref": tvm.topi.testing.softmax_python,
-        "dimensions": [2, 4],
-    },
-}
-
-
-class TestSoftmax:
-    """Softmax test class."""
-
-    dtype = tvm.testing.parameter(
-        "float16",
-        "float32",
-    )
-
-    # TODO(mehrdadh): larger size like (1, 16, 256, 256)
-    # would fail due to TVM_HEXAGON_RPC_BUFF_SIZE_BYTES
-    shape = tvm.testing.parameter((32, 10), (3, 4), (1, 16, 32, 32))
-
-    @tvm.testing.fixture
-    def softmax_operation(self, shape) -> tuple:
-        """Returns the operation name and shape."""
-        for name, config in OPERATOR_CONFIGS.items():
-            if len(shape) in config["dimensions"]:
-                return name
-            else:
-                raise ValueError(f"Shape {shape} is not supported.")
-
-    @tvm.testing.requires_hexagon
-    def test_softmax(self, hexagon_session: Session, dtype, shape, softmax_operation):
-        """Test softmax."""
-        if dtype == "float16":
-            pytest.xfail("float16 is not supported.")
-
-        a_tensor = te.placeholder(shape, dtype=dtype, name="a_tensor")
-
-        topi_op = OPERATOR_CONFIGS[softmax_operation]["topi"]
-        b_tensor = topi_op(a_tensor, axis=1)
-
-        def get_ref_data(shape):
-            ref_func = tvm.topi.testing.softmax_python
-            a_np = np.random.uniform(size=shape).astype(dtype)
-
-            if len(shape) == 2:
-                b_np = ref_func(a_np)
-            elif len(shape) == 4:
-                _, c, height, width = a_np.shape
-                a_np_2d = a_np.transpose(0, 2, 3, 1).reshape(height * width, c)
-                b_np_2d = tvm.topi.testing.softmax_python(a_np_2d)
-                b_np = b_np_2d.reshape(1, height, width, c).transpose(0, 3, 1, 2)
-
-            return a_np, b_np
-
-        # get the test data
-        a_np, b_np = get_ref_data(shape)
-
-        with tvm.target.Target(get_hexagon_target("v68")):
-            fschedule = topi.hexagon.schedule_softmax
-            s = fschedule(b_tensor)
-
-        func = tvm.build(s, [a_tensor, b_tensor], get_hexagon_target("v68"), name="softmax")
-        mod = hexagon_session.load_module(func)
-
-        dev = hexagon_session.device
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=b_tensor.dtype), dev)
-        mod["softmax"](a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_libtorch_ops.py b/tests/python/contrib/test_libtorch_ops.py
deleted file mode 100644
index 153232a2f531..000000000000
--- a/tests/python/contrib/test_libtorch_ops.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-
-import tvm.relay
-from tvm.relay.op.contrib import torchop
-from tvm.testing import requires_libtorch
-
-import_torch_error = None
-
-try:
-    import torch
-except ImportError as e:
-    torch = None
-    import_torch_error = str(e)
-
-
-@pytest.mark.skipif(torch is None, reason=f"PyTorch is not available: {import_torch_error}")
-@requires_libtorch
-def test_backend():
-    @torch.jit.script
-    def script_fn(x, y):
-        res = x * y
-        return res
-
-    for torch_dt, dt in (
-        (torch.int32, "int32"),
-        (torch.float32, "float32"),
-        (torch.float64, "float64"),
-    ):
-        x2 = tvm.relay.var("x", shape=[1, 2], dtype=dt)
-        y2 = tvm.relay.var("y", shape=[2, 2], dtype=dt)
-
-        x3 = tvm.relay.var("x", shape=[1, 3], dtype=dt)
-        y3 = tvm.relay.var("y", shape=[3, 3], dtype=dt)
-
-        test_body = tvm.relay.sum(torchop(script_fn, x2, y2)) + tvm.relay.sum(
-            torchop(script_fn, x3, y3)
-        )
-        test_fn = tvm.relay.Function([x2, y2, x3, y3], test_body)
-        mod = tvm.IRModule({"main": test_fn})
-
-        tvm.relay.transform.InferType()(mod)
-
-        # mod = tvm.relay.transform.AnnotateTarget("target.torch")(mod)
-        mod = tvm.relay.transform.MergeCompilerRegions()(mod)
-        mod = tvm.relay.transform.PartitionGraph()(mod)
-        mod = tvm.relay.transform.InferType()(mod)
-
-        target = "llvm"
-        with tvm.transform.PassContext(opt_level=3):
-            lib = tvm.relay.build(mod, target, params={})
-
-        ctx = tvm.cpu(0)
-        rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](ctx))
-
-        # int does not have randn, so we cast...
-        x2t = torch.randn(1, 2).to(dtype=torch_dt)
-        y2t = torch.randn(2, 2).to(dtype=torch_dt)
-        x3t = torch.randn(1, 3).to(dtype=torch_dt)
-        y3t = torch.randn(3, 3).to(dtype=torch_dt)
-        # Set inputs
-        rt_mod.set_input(0, x2t)
-        rt_mod.set_input(1, y2t)
-        rt_mod.set_input(2, x3t)
-        rt_mod.set_input(3, y3t)
-        # Execute
-        rt_mod.run()
-        # Get outputs
-        tvm_output = rt_mod.get_output(0).numpy()
-        expected = (script_fn(x2t, y2t).sum() + script_fn(x3t, y3t).sum()).numpy()
-        print(tvm_output.dtype)
-        print(expected.dtype)
-        tvm.testing.assert_allclose(tvm_output, expected)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_mrvl/__init__.py b/tests/python/contrib/test_mrvl/__init__.py
deleted file mode 100644
index 736bad937051..000000000000
--- a/tests/python/contrib/test_mrvl/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Infrastructure and tests for Marvell"""
diff --git a/tests/python/contrib/test_mrvl/infrastructure.py b/tests/python/contrib/test_mrvl/infrastructure.py
deleted file mode 100644
index c4c56edfead5..000000000000
--- a/tests/python/contrib/test_mrvl/infrastructure.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-
-"""Infrastructure to Test Marvell Code Generation"""
-import json
-
-import tvm
-from tvm import relay
-from tvm.relay.op.contrib import mrvl
-import numpy as np
-from tvm.contrib import graph_executor
-from tvm.relay.build_module import build
-from tvm.relay.op.contrib.mrvl import partition_for_mrvl
-
-
-def get_cpu_op_count(mod):
-    """Traverse graph counting ops offloaded to TVM."""
-
-    class Counter(tvm.relay.ExprVisitor):
-        def __init__(self):
-            super().__init__()
-            self.count = 0
-
-        def visit_call(self, call):
-            if isinstance(call.op, tvm.ir.Op):
-                self.count += 1
-
-            super().visit_call(call)
-
-    c = Counter()
-    c.visit(mod["main"])
-    return c.count
-
-
-def build_module(
-    mod,
-    target,
-    params=None,
-    enable_mrvl=True,
-    tvm_ops=0,
-    mrvl_partitions=1,
-):
-    """Partition and build module for mrvl codegen."""
-    if isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-    if params is None:
-        params = {}
-
-    with tvm.transform.PassContext(opt_level=3):
-        if enable_mrvl:
-            mod = mrvl.partition_for_mrvl(mod, params)
-            tvm_op_count = get_cpu_op_count(mod)
-            assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
-                tvm_op_count, tvm_ops
-            )
-            partition_count = 0
-            for global_var in mod.get_global_vars():
-                if "mrvl" in global_var.name_hint:
-                    partition_count += 1
-
-            assert mrvl_partitions == partition_count, "Got {} mrvl partitions, expected {}".format(
-                partition_count, mrvl_partitions
-            )
-        return relay.build(mod, target, params=params)
-
-
-def extract_mrvl_modules(module):
-    """Get a list of all built mrvl runtime modules."""
-    return list(filter(lambda mod: mod.type_key == "mrvl_sim", module.get_lib().imported_modules))
-
-
-def verify_codegen(
-    module, num_mrvl_modules=1, params=None, target="llvm", tvm_ops=0, contains=None
-):
-    """Check mrvl codegen against a known good output."""
-    module = build_module(
-        module,
-        target,
-        params=params,
-        tvm_ops=tvm_ops,
-        mrvl_partitions=num_mrvl_modules,
-    )
-
-    mrvl_modules = extract_mrvl_modules(module)
-    assert len(mrvl_modules) == num_mrvl_modules, (
-        f"The number of mrvl modules produced ({len(mrvl_modules)}) does not "
-        f"match the expected value ({num_mrvl_modules})."
-    )
-
-    # Check if expected string is found inside actual string
-    if contains is not None:
-        actual_str = json.dumps(json.loads(mrvl_modules[0].get_source()))
-        assert actual_str.find(contains)
-
-
-def run_and_verify_func(config, data_type="float32"):
-
-    np.random.seed(0)
-    tvm_target = "llvm"
-
-    func, input_shapes, is_param, option_dict = config
-    params = {
-        x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype=data_type) for x in is_param
-    }
-    inputs_dict = {
-        k: np.random.uniform(-1, 1, v).astype(dtype=data_type)
-        for k, v in input_shapes.items()
-        if k not in is_param
-    }
-
-    dev = tvm.cpu()
-    for use_mrvl in [True, False]:
-        mod = tvm.IRModule()
-        mod["main"] = func
-        if use_mrvl:
-            mod = partition_for_mrvl(mod, params, **option_dict)
-            with tvm.transform.PassContext(
-                opt_level=3, config={"relay.ext.mrvl.options": option_dict}
-            ):
-                model_lib = relay.build(mod, tvm_target, params=params)
-
-            model_rt_graph = graph_executor.GraphModule(model_lib["default"](dev))
-            model_rt_graph.set_input(**inputs_dict)
-            model_rt_graph.run()
-            output_tensor1 = model_rt_graph.get_output(0).numpy()
-
-        else:
-            with tvm.transform.PassContext(
-                opt_level=3, config={"relay.ext.mrvl.options": option_dict}
-            ):
-                model_lib = relay.build(mod, tvm_target, params=params)
-
-            model_rt_graph = graph_executor.GraphModule(model_lib["default"](dev))
-            model_rt_graph.set_input(**inputs_dict)
-            model_rt_graph.run()
-            output_tensor2 = model_rt_graph.get_output(0).numpy()
-
-    tvm.testing.assert_allclose(output_tensor1, output_tensor2, rtol=1e-2, atol=1e-2)
diff --git a/tests/python/contrib/test_mrvl/test_mrvl.py b/tests/python/contrib/test_mrvl/test_mrvl.py
deleted file mode 100644
index cd3f343c2d03..000000000000
--- a/tests/python/contrib/test_mrvl/test_mrvl.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-
-"""Test Marvell BYOC partitioning, code generation and runtime"""
-
-import numpy as np
-
-import tvm
-from tvm import relay
-import tvm.relay.testing
-from tvm.testing.utils import requires_mrvl
-from tvm.relay.op.contrib.mrvl import partition_for_mrvl
-from .infrastructure import verify_codegen
-from .infrastructure import run_and_verify_func
-from tvm.testing import requires_mrvl
-
-
-@requires_mrvl
-def test_mrvl_fuse():
-    def get_blocks(
-        prefix,
-        data,
-        in_channel,
-        out_channel,
-        include_bias_add=True,
-        include_bn=True,
-        include_sigmoid=False,
-    ):
-        weight = relay.var(prefix + "weight")
-        bias = relay.var(prefix + "bias")
-        bn_gamma = relay.var(prefix + "bn_gamma")
-        bn_beta = relay.var(prefix + "bn_beta")
-        bn_mmean = relay.var(prefix + "bn_mean")
-        bn_mvar = relay.var(prefix + "bn_var")
-
-        layer = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=out_channel, padding=(1, 1)
-        )
-        if include_bias_add:
-            layer = relay.nn.bias_add(layer, bias)
-        if include_bn:
-            bn_output = relay.nn.batch_norm(layer, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-            layer = bn_output[0]
-        if include_sigmoid:
-            layer = relay.sigmoid(layer)
-        layer = relay.nn.relu(layer)
-        return layer
-
-    def get_net(include_bias_add=True, include_bn=True, include_sigmoid=False):
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        block1 = get_blocks("block1_", data, 3, 8, include_bias_add, include_bn, include_sigmoid)
-        block2 = get_blocks("block2_", block1, 8, 8, False, False, include_sigmoid)
-        return relay.Function(relay.analysis.free_vars(block2), block2)
-
-    def test_detect_pattern(include_bias_add, include_bn, include_sigmoid, num_expected_partition):
-        net = get_net(include_bias_add, include_bn, include_sigmoid)
-        mod, params = tvm.relay.testing.create_workload(net)
-        mod = partition_for_mrvl(mod, params)
-        assert len(mod.functions) - 1 == num_expected_partition
-
-    def test_sum_pattern(num_expected_partition):
-        def get_conv2d_bn_sum_relu(
-            x_shape=(1, 32, 8, 8),
-            k_shape=(16, 32, 3, 3),
-            sum_shape=(1, 16, 6, 6),
-            dtype="float32",
-        ):
-            x = relay.var("x", shape=(x_shape), dtype=dtype)
-            kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype))
-            bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-            beta = relay.const(np.zeros(k_shape[0]).astype(dtype))
-            gamma = relay.const(np.ones(k_shape[0]).astype(dtype))
-            moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype))
-            moving_var = relay.const(np.ones(k_shape[0]).astype(dtype))
-            sum_data = relay.var("data1", shape=sum_shape, dtype=dtype)
-
-            dic = {"x": x_shape, "bias": (k_shape[0],), "sum_data": sum_shape}
-            param_lst = ["bias", "sum_data"]
-
-            conv = relay.nn.conv2d(
-                x,
-                kernel,
-                channels=k_shape[0],
-                kernel_size=k_shape[2:4],
-            )
-            conv_bias = relay.nn.bias_add(conv, bias)
-            conv_bias_bn, _, _ = relay.nn.batch_norm(
-                conv_bias,
-                gamma=gamma,
-                beta=beta,
-                moving_mean=moving_mean,
-                moving_var=moving_var,
-                axis=1,
-                center=True,
-                scale=True,
-                epsilon=1e-5,
-            )
-            conv_bias_bn_sum = relay.add(conv_bias_bn, sum_data)
-            return relay.nn.relu(conv_bias_bn_sum), dic, param_lst
-
-        net, dic, param_lst = get_conv2d_bn_sum_relu()
-        net = tvm.IRModule.from_expr(net)
-        params = {x: np.random.uniform(-1, 1, dic[x]).astype("float32") for x in param_lst}
-        mod = partition_for_mrvl(net, params)
-        assert len(mod.functions) - 1 == num_expected_partition
-
-    def test_partition():
-        test_detect_pattern(True, False, False, 1)
-        test_detect_pattern(False, True, False, 1)
-        test_detect_pattern(False, False, True, 2)
-        test_detect_pattern(True, True, False, 1)
-        test_detect_pattern(True, False, True, 2)
-        test_detect_pattern(False, True, True, 2)
-        test_detect_pattern(False, False, False, 1)
-        test_detect_pattern(True, True, True, 2)
-        test_sum_pattern(1)
-
-    def test_partition_mobilenet(num_expected_partition):
-        mod, params = relay.testing.mobilenet.get_workload()
-        mod = partition_for_mrvl(mod, params)
-        assert len(mod.functions) - 1 == num_expected_partition
-
-    test_partition()
-    test_partition_mobilenet(1)
-
-
-@requires_mrvl
-def test_conv2d():
-    """Test conv2d operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 3, 224, 224))
-        arr = np.random.rand(16, 3, 3, 3).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-        func = relay.Function([x], y)
-        params = {}
-        params["w"] = arr
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params=params, tvm_ops=1, contains="mrvl.conv2d_nhwc2nhwc")
-        return func, {"x": (1, 3, 224, 224), "w": (16, 3, 3, 3)}, ["w"], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-@requires_mrvl
-def test_dense():
-    """Test dense operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 16))
-        arr = np.random.rand(16, 16).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.dense(x, w)
-        func = relay.Function([x], y)
-        params = {}
-        params["w"] = arr
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params=params, tvm_ops=0, contains="mrvl.fc_ni2no")
-        return func, {"x": (1, 16), "w": (16, 16)}, ["w"], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-@requires_mrvl
-def test_maxpool2d():
-    """Test maxpool2d operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 3, 224, 224))
-        arr = np.random.rand(16, 3, 3, 3).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-        y = relay.nn.max_pool2d(y)
-        func = relay.Function([x], y)
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params={}, tvm_ops=1, contains="mrvl.maxpool2d_nhwc2nhwc")
-        return func, {"x": (1, 3, 224, 224)}, [], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-@requires_mrvl
-def test_avgpool2d():
-    """Test avgpool2d operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 3, 224, 224))
-        arr = np.random.rand(16, 3, 3, 3).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-        y = relay.nn.avg_pool2d(y)
-        func = relay.Function([x], y)
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params={}, tvm_ops=1, contains="mrvl.avgpool2d_nhwc2nhwc")
-        return func, {"x": (1, 3, 224, 224)}, [], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-@requires_mrvl
-def test_globalavgpool2d():
-    """Test globalavgpool2d operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 3, 224, 224))
-        arr = np.random.rand(16, 3, 3, 3).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-        y = relay.nn.global_avg_pool2d(y)
-        func = relay.Function([x], y)
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params={}, tvm_ops=1, contains="mrvl.globalavgpool2d_nhwc2nhwc")
-        return func, {"x": (1, 3, 224, 224)}, [], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-@requires_mrvl
-def test_globalmaxpool2d():
-    """Test globalmaxpool2d operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 3, 224, 224))
-        arr = np.random.rand(16, 3, 3, 3).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-        y = relay.nn.global_max_pool2d(y)
-        func = relay.Function([x], y)
-        params = {}
-        params["w"] = arr
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params=params, tvm_ops=2, contains="mrvl.globalmaxpool2d_nhwc2nhwc")
-        return func, {"x": (1, 3, 224, 224), "w": (16, 3, 3, 3)}, ["w"], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-@requires_mrvl
-def test_squeeze():
-    """Test squeeze operator for "mrvl" targets"""
-
-    def get_graph():
-        x = relay.var("x", shape=(1, 3, 224, 224))
-        arr = np.random.rand(16, 3, 3, 3).astype("float32")
-        w = relay.const(arr)
-        y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-        y = relay.reshape(y, newshape=(1, 1, 16, 112, 112))
-        y = relay.squeeze(y, axis=[0, 1])
-        func = relay.Function([x], y)
-        mod = tvm.IRModule()
-        mod["main"] = func
-        option_dict = {"num_tiles": 1}
-        verify_codegen(mod, params={}, tvm_ops=3, contains="mrvl.squeeze")
-        return func, {"x": (1, 3, 224, 224)}, [], option_dict
-
-    run_and_verify_func(get_graph())
-
-
-if __name__ == "__main__":
-    test_mrvl_fuse()
-    test_conv2d()
-    test_dense()
-    test_maxpool2d()
-    test_avgpool2d()
-    test_globalavgpool2d()
-    test_globalmaxpool2d()
-    test_squeeze()
diff --git a/tests/python/contrib/test_msc/test_translate_relay.py b/tests/python/contrib/test_msc/test_translate_relay.py
deleted file mode 100644
index 801893e9debd..000000000000
--- a/tests/python/contrib/test_msc/test_translate_relay.py
+++ /dev/null
@@ -1,1143 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-argument
-
-""" Test translate from relay. """
-
-import torch
-from torch import fx
-from torch.nn import Module
-
-import tvm.testing
-from tvm.relax.frontend.torch import from_fx
-from tvm.relay.frontend import from_pytorch
-from tvm import relay
-from tvm.ir.module import IRModule
-from tvm.contrib.msc.core.frontend import translate
-from tvm.contrib.msc.framework.tvm import codegen as tvm_codegen
-from tvm.contrib.msc.core import utils as msc_utils
-
-
-def _valid_target(target):
-    if not target:
-        return target
-    if target == "ignore":
-        return None
-    if target == "cuda" and not tvm.cuda().exist:
-        return None
-    if isinstance(target, str):
-        target = tvm.target.Target(target)
-    return target
-
-
-def _run_relax(relax_mod, target, datas):
-    relax_mod = tvm.relax.transform.LegalizeOps()(relax_mod)
-    with tvm.transform.PassContext(opt_level=3):
-        relax_exec = tvm.relax.build(relax_mod, target)
-        runnable = tvm.relax.VirtualMachine(relax_exec, tvm.cpu())
-    res = runnable["main"](*datas)
-    if isinstance(res, tvm.runtime.NDArray):
-        return [res.asnumpy()]
-    return [e.asnumpy() for e in res]
-
-
-def verify_model(torch_model, input_info, opt_config=None, codegen_config=None, build_target=None):
-    """Compare relax with relay"""
-
-    graph_model = fx.symbolic_trace(torch_model)
-    with torch.no_grad():
-        expected = from_fx(graph_model, input_info)
-    expected = tvm.relax.transform.CanonicalizeBindings()(expected)
-
-    # graph from relay
-    datas = [msc_utils.random_data(i) for i in input_info]
-    torch_datas = [torch.from_numpy(i) for i in datas]
-    with torch.no_grad():
-        scripted_model = torch.jit.trace(torch_model, tuple(torch_datas)).eval()  # type: ignore
-    shape_list = [("input" + str(idx), i) for idx, i in enumerate(input_info)]
-    relay_mod, params = from_pytorch(scripted_model, shape_list)
-    graph, weights = translate.from_relay(relay_mod, params, opt_config=opt_config)
-    # to relax
-    codegen_config = codegen_config or {}
-    codegen_config.update({"explicit_name": False, "from_relay": True})
-    mod = tvm_codegen.to_relax(graph, weights, codegen_config)
-    if build_target:
-        build_target = _valid_target(build_target)
-        if not build_target:
-            return
-        tvm_datas = [tvm.nd.array(i) for i in datas]
-        expected_res = _run_relax(expected, build_target, tvm_datas)
-        if not graph.get_inputs():
-            tvm_datas = []
-        res = _run_relax(mod, build_target, tvm_datas)
-        for exp_r, new_r in zip(expected_res, res):
-            tvm.testing.assert_allclose(exp_r, new_r, atol=1e-5, rtol=1e-5)
-    else:
-        tvm.ir.assert_structural_equal(mod, expected)
-
-
-def test_conv1d():
-    """test relay to relax for conv1d"""
-
-    class Conv1D1(Module):
-        def __init__(self):
-            super().__init__()
-            self.conv = torch.nn.Conv1d(3, 6, 7, bias=True)
-
-        def forward(self, data):
-            return self.conv(data)
-
-    class Conv1D2(Module):
-        def __init__(self):
-            super().__init__()
-            self.conv = torch.nn.Conv1d(3, 6, 7, bias=False)
-
-        def forward(self, data):
-            return self.conv(data)
-
-    input_info = [([1, 3, 10], "float32")]
-    verify_model(Conv1D1(), input_info)
-    verify_model(Conv1D2(), input_info)
-
-
-def test_conv2d():
-    """test relay to relax for conv2d"""
-
-    class Conv2D1(Module):
-        def __init__(self):
-            super().__init__()
-            self.conv = torch.nn.Conv2d(3, 6, 7, bias=True)
-
-        def forward(self, data):
-            return self.conv(data)
-
-    class Conv2D2(Module):
-        def __init__(self):
-            super().__init__()
-            self.conv = torch.nn.Conv2d(3, 6, 7, bias=False)
-
-        def forward(self, data):
-            return self.conv(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Conv2D1(), input_info)
-    verify_model(Conv2D2(), input_info)
-
-
-def test_linear():
-    """test relay to relax for linear"""
-
-    class Dense1(Module):
-        def __init__(self):
-            super().__init__()
-            self.linear = torch.nn.Linear(10, 7, bias=True)
-
-        def forward(self, data):
-            return self.linear(data)
-
-    class Dense2(Module):
-        def __init__(self):
-            super().__init__()
-            self.linear = torch.nn.Linear(10, 7, bias=False)
-
-        def forward(self, data):
-            return self.linear(data)
-
-    class MatMul1(Module):
-        def forward(self, x, y):
-            return torch.matmul(x, y)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Dense1(), input_info, build_target="llvm")
-    verify_model(Dense2(), input_info, build_target="llvm")
-    verify_model(MatMul1(), [([10, 10], "float32"), ([10, 10], "float32")], build_target="llvm")
-
-
-def test_bmm():
-    """test relay to relax for bmm"""
-
-    class BMM(Module):
-        def forward(self, x, y):
-            return torch.bmm(x, y)
-
-    input_info = [((4, 128, 256), "float32"), ((4, 256, 512), "float32")]
-    verify_model(BMM(), input_info, opt_config={"opt_level": 3})
-
-
-def test_baddbmm():
-    """test relay to relax for baddbmm"""
-
-    class BAddBMM1(Module):
-        def forward(self, c, x, y):
-            return torch.baddbmm(c, x, y)
-
-    class BAddBMM2(Module):
-        def forward(self, c, x, y):
-            return torch.baddbmm(c, x, y, alpha=2, beta=0)
-
-    input_info = [
-        ((4, 128, 512), "float32"),
-        ((4, 128, 256), "float32"),
-        ((4, 256, 512), "float32"),
-    ]
-    verify_model(BAddBMM1(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-    verify_model(BAddBMM2(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-
-
-def test_relu():
-    """test relay to relax for relu"""
-
-    class ReLU(Module):
-        def __init__(self):
-            super().__init__()
-            self.relu = torch.nn.ReLU()
-
-        def forward(self, data):
-            return self.relu(data)
-
-    class ReLU1(Module):
-        def forward(self, data):
-            return torch.nn.functional.relu(data)
-
-    input_info = [([10, 10], "float32")]
-    verify_model(ReLU(), input_info)
-    verify_model(ReLU1(), input_info)
-
-
-def test_relu6():
-    """test relay to relax for relu6"""
-
-    class ReLU6(Module):
-        def __init__(self):
-            super().__init__()
-            self.relu6 = torch.nn.ReLU6()
-
-        def forward(self, data):
-            return self.relu6(data)
-
-    input_info = [([10, 10], "float32")]
-    verify_model(ReLU6(), input_info)
-
-
-def test_maxpool2d():
-    """test relay to relax for maxpool2d"""
-
-    class MaxPool2d(Module):
-        def __init__(self):
-            super().__init__()
-            self.pool = torch.nn.MaxPool2d(kernel_size=[1, 1])
-
-        def forward(self, data):
-            return self.pool(data)
-
-    class MaxPool2d2(Module):
-        def __init__(self):
-            super().__init__()
-            self.pool = torch.nn.MaxPool2d(kernel_size=[2, 2], dilation=[2, 3])
-
-        def forward(self, data):
-            return self.pool(data)
-
-    class MaxPool2d3(Module):
-        def __init__(self):
-            super().__init__()
-            self.pool = torch.nn.MaxPool2d(kernel_size=[4, 4], padding=2, stride=2)
-
-        def forward(self, data):
-            return self.pool(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(MaxPool2d(), input_info)
-    verify_model(MaxPool2d2(), input_info)
-    verify_model(MaxPool2d3(), input_info)
-
-
-def test_avgpool2d():
-    """test relay to relax for avgpool2d"""
-
-    class AvgPool2d(Module):
-        def __init__(self):
-            super().__init__()
-            self.pool = torch.nn.AvgPool2d(kernel_size=[1, 1])
-
-        def forward(self, data):
-            return self.pool(data)
-
-    class AvgPool2d2(Module):
-        def __init__(self):
-            super().__init__()
-            self.pool = torch.nn.AvgPool2d(kernel_size=[4, 4], stride=2, padding=2, ceil_mode=True)
-
-        def forward(self, data):
-            return self.pool(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(AvgPool2d(), input_info)
-    verify_model(AvgPool2d2(), input_info)
-
-
-def test_adaptive_avgpool2d():
-    """test relay to relax for adaptive_avgpool2d"""
-
-    class AdaptiveAvgPool2d0(Module):
-        def __init__(self):
-            super().__init__()
-            self.pool = torch.nn.AdaptiveAvgPool2d([10, 10])
-
-        def forward(self, data):
-            return self.pool(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(AdaptiveAvgPool2d0(), input_info)
-
-
-def test_flatten():
-    """test relay to relax for flatten"""
-
-    class Flatten(Module):
-        def __init__(self):
-            super().__init__()
-            self.f = torch.nn.Flatten(2, -1)
-
-        def forward(self, data):
-            return self.f(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Flatten(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-    verify_model(
-        torch.nn.Flatten(2, -1), input_info, opt_config={"opt_level": 3}, build_target="llvm"
-    )
-
-
-def test_batchnorm2d():
-    """test relay to relax for batchnorm2d"""
-
-    class BatchNorm2d(Module):
-        def __init__(self):
-            super().__init__()
-            self.batchnorm = torch.nn.BatchNorm2d(3)
-
-        def forward(self, data):
-            return self.batchnorm(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(BatchNorm2d(), input_info, build_target="llvm")
-
-
-def test_embedding():
-    """test relay to relax for embedding"""
-
-    class Embedding(Module):
-        def __init__(self):
-            super().__init__()
-            self.embedding = torch.nn.Embedding(10, 3)
-
-        def forward(self, data):
-            return self.embedding(data)
-
-    verify_model(Embedding(), [([4], "int64")])
-    verify_model(Embedding(), [([4, 5], "int64")])
-
-
-def test_layernorm():
-    """test relay to relax for layernorm"""
-
-    class LayerNorm(Module):
-        def __init__(self):
-            super().__init__()
-            self.layernorm = torch.nn.LayerNorm(10)
-
-        def forward(self, data):
-            return self.layernorm(data)
-
-    input_info = [([1, 10, 10], "float32")]
-    verify_model(LayerNorm(), input_info)
-
-
-def test_functional_layernorm():
-    """test relay to relax for functional_layernorm"""
-
-    class LayerNorm(Module):
-        def __init__(self, shape):
-            super().__init__()
-            self.weight = torch.nn.Parameter(torch.ones(shape))
-            self.bias = torch.nn.Parameter(torch.zeros(shape))
-
-        def forward(self, data):
-            return torch.nn.functional.layer_norm(
-                data, self.weight.shape, self.weight, self.bias, 1e-5
-            )
-
-    input_info = [([1, 10, 10], "float32")]
-    verify_model(LayerNorm((10)), input_info)
-
-
-def test_cross_entropy():
-    """test relay to relax for cross_entropy"""
-
-    class CrossEntropy1(Module):
-        def __init__(self):
-            super().__init__()
-            self.loss = torch.nn.CrossEntropyLoss()
-
-        def forward(self, logits, targets):
-            return self.loss(logits, targets)
-
-    class CrossEntropy2(Module):
-        def __init__(self):
-            super().__init__()
-            self.weight = torch.nn.Parameter(torch.ones((2,)))
-            self.loss = torch.nn.CrossEntropyLoss(weight=self.weight)
-
-        def forward(self, logits, targets):
-            return self.loss(logits, targets)
-
-    class CrossEntropy3(Module):
-        def __init__(self):
-            super().__init__()
-            self.loss = torch.nn.CrossEntropyLoss(ignore_index=1, reduction="sum")
-
-        def forward(self, logits, targets):
-            return self.loss(logits, targets)
-
-    input_info = [([3, 2], "float32"), ([3], "int64")]
-    verify_model(CrossEntropy1(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-    verify_model(CrossEntropy2(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-    verify_model(CrossEntropy3(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-
-
-def test_functional_cross_entropy():
-    """test relay to relax for functional_cross_entropy"""
-
-    class CrossEntropy(Module):
-        def forward(self, logits, targets):
-            return torch.nn.functional.cross_entropy(logits, targets)
-
-    input_info = [([3, 10], "float32"), ([3], "int64")]
-    verify_model(CrossEntropy(), input_info, opt_config={"opt_level": 3}, build_target="llvm")
-
-
-def test_silu():
-    """test relay to relax for silu"""
-
-    class SiLU(Module):
-        def __init__(self):
-            super().__init__()
-            self.silu = torch.nn.SiLU()
-
-        def forward(self, data):
-            return self.silu(data)
-
-    class SiLU2(Module):
-        def forward(self, data):
-            return torch.nn.functional.silu(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(SiLU(), input_info, build_target="llvm")
-    verify_model(SiLU2(), input_info, build_target="llvm")
-
-
-def test_groupnorm():
-    """test relay to relax for groupnorm"""
-
-    class GroupNorm(Module):
-        def __init__(self):
-            super().__init__()
-            self.groupnorm = torch.nn.GroupNorm(3, 3)
-
-        def forward(self, data):
-            return self.groupnorm(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(GroupNorm(), input_info)
-
-
-def test_softmax():
-    """test relay to relax for softmax"""
-
-    class Softmax(Module):
-        def __init__(self):
-            super().__init__()
-            self.softmax = torch.nn.Softmax(dim=1)
-
-        def forward(self, data):
-            return self.softmax(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Softmax(), input_info)
-
-
-def test_binary():
-    """test relay to relax for binary"""
-
-    input_info1 = [([1, 3, 10, 10], "float32"), ([1, 3, 10, 10], "float32")]
-    input_info2 = [([1, 3, 10, 10], "float32")]
-
-    # Add
-    class Add1(Module):
-        def forward(self, lhs, rhs):
-            return lhs + rhs
-
-    class Add2(Module):
-        def forward(self, lhs):
-            return lhs + 1.0
-
-    verify_model(Add1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(Add2(), input_info2, opt_config={"opt_level": 3})
-
-    # Sub
-    class Sub1(Module):
-        def forward(self, lhs, rhs):
-            return lhs - rhs
-
-    class Sub2(Module):
-        def forward(self, lhs):
-            return lhs - 1.0
-
-    verify_model(Sub1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(Sub2(), input_info2, opt_config={"opt_level": 3})
-
-    # Mul
-    class Mul1(Module):
-        def forward(self, lhs, rhs):
-            return lhs * rhs
-
-    class Mul2(Module):
-        def forward(self, lhs):
-            return lhs * 1.0
-
-    verify_model(Mul1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(Mul2(), input_info2)
-
-    # True div
-    class TrueDiv1(Module):
-        def forward(self, lhs, rhs):
-            return lhs / rhs
-
-    class TrueDiv2(Module):
-        def forward(self, lhs):
-            return lhs / 1.0
-
-    verify_model(TrueDiv1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(TrueDiv2(), input_info2)
-
-    # Floor div
-    class FloorDiv1(Module):
-        def forward(self, lhs, rhs):
-            return lhs // rhs
-
-    class FloorDiv2(Module):
-        def forward(self, lhs):
-            return lhs // 1.0
-
-    verify_model(FloorDiv1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(FloorDiv2(), input_info2, opt_config={"opt_level": 3})
-
-    # Power
-    class Power1(Module):
-        def forward(self, lhs, rhs):
-            return lhs**rhs
-
-    class Power2(Module):
-        def forward(self, lhs):
-            return lhs**1.0
-
-    verify_model(Power1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(Power2(), input_info2, opt_config={"opt_level": 3})
-
-    # LT
-    class LT1(Module):
-        def forward(self, lhs, rhs):
-            return lhs < rhs
-
-    class LT2(Module):
-        def forward(self, lhs):
-            return lhs < 1.0
-
-    verify_model(LT1(), input_info1, opt_config={"opt_level": 3})
-    verify_model(LT2(), input_info2, opt_config={"opt_level": 3})
-
-
-def test_squeeze():
-    """test relay to relax for squeeze"""
-
-    class Squeeze1(Module):
-        def forward(self, data):
-            return data.squeeze(1)
-
-    class Squeeze2(Module):
-        def forward(self, data):
-            return data.squeeze()
-
-    input_info = [([3, 1, 4, 1], "float32")]
-    verify_model(Squeeze1(), input_info)
-    verify_model(Squeeze2(), input_info)
-
-
-def test_unsqueeze():
-    """test relay to relax for unsqueeze"""
-
-    class Unsqueeze1(Module):
-        def forward(self, data):
-            return data.unsqueeze(1)
-
-    class Unsqueeze2(Module):
-        def forward(self, data):
-            return data.unsqueeze(-1)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Unsqueeze1(), input_info)
-    verify_model(Unsqueeze2(), input_info)
-
-
-def test_getitem():
-    """test relay to relax for getitem"""
-
-    class Slice1(Module):
-        def forward(self, x):
-            return x[0, 1::2, :, :3]
-
-    class Slice2(Module):
-        def forward(self, x):
-            return x[:, None, None, :, None]
-
-    verify_model(Slice1(), [([1, 3, 10, 10], "float32")], build_target="ignore")
-    verify_model(Slice2(), [([8, 16], "float32")], build_target="llvm")
-
-
-def test_unary():
-    """test relay to relax for unary"""
-
-    input_info = [([1, 3, 10, 10], "float32")]
-
-    # sin
-    class Sin(Module):
-        def forward(self, data):
-            return torch.sin(data)
-
-    verify_model(Sin(), input_info)
-
-    # cos
-    class Cos(Module):
-        def forward(self, data):
-            return torch.cos(data)
-
-    verify_model(Cos(), input_info)
-
-    # exp
-    class Exp(Module):
-        def forward(self, data):
-            return torch.exp(data)
-
-    verify_model(Exp(), input_info)
-
-    # sqrt
-    class Sqrt(Module):
-        def forward(self, data):
-            return torch.sqrt(data)
-
-    verify_model(Sqrt(), input_info)
-
-    # sigmoid
-    class Sigmoid(Module):
-        def forward(self, data):
-            return torch.sigmoid(data)
-
-    verify_model(Sigmoid(), input_info)
-
-    # round
-    class Round(Module):
-        def forward(self, data):
-            return torch.round(data)
-
-    verify_model(Round(), input_info)
-
-
-def test_gelu():
-    """test relay to relax for gelu"""
-
-    class Gelu(Module):
-        def forward(self, data):
-            return torch.nn.functional.gelu(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Gelu(), input_info)
-
-
-def test_tanh():
-    """test relay to relax for tanh"""
-
-    class Tanh(Module):
-        def forward(self, data):
-            return torch.tanh(data)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Tanh(), input_info)
-
-
-def test_clamp():
-    """test relay to relax for clamp"""
-
-    class Clamp(Module):
-        def forward(self, data):
-            return torch.clamp(data, min=0.1, max=0.5)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Clamp(), input_info)
-
-
-def test_interpolate():
-    """test relay to relax for interpolate"""
-
-    class Interpolate(Module):
-        def forward(self, data):
-            return torch.nn.functional.interpolate(data, (5, 5))
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Interpolate(), input_info, build_target="llvm")
-
-
-def test_addmm():
-    """test relay to relax for addmm"""
-
-    class Addmm(Module):
-        def forward(self, x_1, x_2, x_3):
-            return torch.addmm(x_1, x_2, x_3)
-
-    input_info = [
-        ([10, 10], "float32"),
-        ([10, 10], "float32"),
-        ([10, 10], "float32"),
-    ]
-    verify_model(Addmm(), input_info, build_target="llvm")
-
-
-def test_split():
-    """test relay to relax for split"""
-
-    class Split1(Module):
-        def forward(self, data):
-            return torch.split(data, 1, dim=1)
-
-    class Split2(Module):
-        def forward(self, data):
-            return torch.split(data, [1, 2], dim=1)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Split1(), input_info, build_target="llvm")
-    verify_model(Split2(), input_info, build_target="llvm")
-
-
-def test_unbind():
-    """test relay to relax for unbind"""
-
-    class Unbind1(Module):
-        def forward(self, data):
-            return torch.unbind(data)
-
-    class Unbind2(Module):
-        def forward(self, data):
-            return torch.unbind(data, dim=1)
-
-    input_info = [([3, 3, 10, 10], "float32")]
-    verify_model(Unbind1(), input_info, build_target="llvm")
-    verify_model(Unbind2(), input_info, build_target="llvm")
-
-
-def test_cumsum():
-    """test relay to relax for cumsum"""
-
-    class Cumsum(Module):
-        def forward(self, data):
-            return torch.cumsum(data, dim=1, dtype=torch.int32)
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(Cumsum(), input_info)
-
-
-def test_chunk():
-    """test relay to relax for chunk"""
-
-    class Chunk(Module):
-        def forward(self, data):
-            return torch.chunk(data, 3, dim=1)
-
-    input_info = [([1, 3, 10, 10], "float32")]
-    verify_model(Chunk(), input_info, build_target="llvm")
-
-
-def test_inplace_fill():
-    """test relay to relax for inplace_fill"""
-
-    class InplaceFill(Module):
-        def forward(self, data):
-            data.fill_(1.5)
-            return data
-
-    verify_model(InplaceFill(), [([10, 10], "float32")], build_target="llvm")
-
-
-def test_arange():
-    """test relay to relax for arange"""
-
-    class Arange(Module):
-        def forward(self, data):
-            return torch.arange(0, 20, dtype=torch.int32)
-
-    verify_model(
-        Arange(), [([10, 10], "float32")], opt_config={"opt_level": 3}, build_target="llvm"
-    )
-
-
-def test_empty():
-    """test relay to relax for empty"""
-
-    class Empty(Module):
-        def forward(self, data):
-            return torch.empty((10, 10), dtype=torch.float32)
-
-    verify_model(
-        Empty(), [([10, 10], "float32")], opt_config={"opt_level": 3}, build_target="ignore"
-    )
-
-
-def test_tensor():
-    """test relay to relax for tensor"""
-
-    class Empty1(Module):
-        def forward(self, data):
-            return torch.tensor(3, dtype=torch.float32)
-
-    class Empty2(Module):
-        def forward(self, data):
-            return torch.tensor(3)
-
-    verify_model(Empty1(), [([10, 10], "float32")], build_target="llvm")
-    verify_model(Empty2(), [([10, 10], "float32")], build_target="llvm")
-
-
-def test_tril():
-    """test relay to relax for tril"""
-
-    class Tril(Module):
-        def forward(self, data):
-            return torch.tril(data, 1)
-
-    class InplaceTril(Module):
-        def forward(self, data):
-            data.tril_(1)
-            return data
-
-    input_info = [([10, 10], "float32")]
-    verify_model(Tril(), input_info)
-    verify_model(InplaceTril(), input_info)
-
-
-def test_triu():
-    """test relay to relax for triu"""
-
-    class Triu(Module):
-        def forward(self, data):
-            return torch.triu(data, 1)
-
-    class InplaceTriu(Module):
-        def forward(self, data):
-            data.triu_(1)
-            return data
-
-    input_info = [([10, 10], "float32")]
-    verify_model(Triu(), input_info)
-    verify_model(InplaceTriu(), input_info)
-
-
-def test_new_ones():
-    """test relay to relax for new_ones"""
-
-    class NewOnes(Module):
-        def forward(self, x):
-            return x.new_ones(1, 2, 3)
-
-    input_info = [([1, 2, 3], "float32")]
-    verify_model(NewOnes(), input_info, build_target="llvm")
-
-
-def test_expand():
-    """test relay to relax for expand"""
-
-    class Expand1(Module):
-        def forward(self, x):
-            return x.expand(4, 2, 3, 4)
-
-    class Expand2(Module):
-        def forward(self, x):
-            return x.expand(4, -1, -1, 4)
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(Expand1(), input_info, build_target="llvm")
-    verify_model(Expand2(), input_info, build_target="llvm")
-
-
-def test_reduce():
-    """test relay to relax for reduce"""
-
-    # sum
-    class Sum(Module):
-        def forward(self, x):
-            return torch.sum(x, (2, 1))
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(Sum(), input_info)
-
-
-def test_datatype():
-    """test relay to relax for datatype"""
-
-    input_info = [([1, 2, 3, 4], "float32")]
-
-    # float
-    class ToFloat(Module):
-        def forward(self, x):
-            return x.float()
-
-    verify_model(ToFloat(), input_info, build_target="llvm")
-
-    # half
-    class ToHalf(Module):
-        def forward(self, x):
-            return x.half()
-
-    verify_model(ToHalf(), input_info)
-
-    # type
-    class Type(Module):
-        def forward(self, x):
-            return x.type(torch.float32)
-
-    verify_model(Type(), input_info, build_target="llvm")
-
-
-def test_permute():
-    """test relay to relax for permute"""
-
-    class Permute(Module):
-        def forward(self, x):
-            return x.permute(0, 3, 2, 1)
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(Permute(), input_info)
-
-
-def test_reshape():
-    """test relay to relax for reshape"""
-
-    class Reshape(Module):
-        def forward(self, x):
-            return x.reshape(2, 12)
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(Reshape(), input_info)
-
-
-def test_transpose():
-    """test relay to relax for transpose"""
-
-    class Transpose(Module):
-        def forward(self, x):
-            return x.transpose(1, 3)
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(Transpose(), input_info)
-
-
-def test_view():
-    """test relay to relax for view"""
-
-    class View(Module):
-        def forward(self, x):
-            return x.view(2, 12)
-
-    input_info = [([1, 2, 3, 4], "float32")]
-    verify_model(View(), input_info)
-
-
-def test_keep_params():
-    """test relay to relax for keep_params"""
-
-    class Conv2D1(Module):
-        def __init__(self):
-            super().__init__()
-            self.conv = torch.nn.Conv2d(3, 6, 7, bias=True)
-
-        def forward(self, data):
-            return self.conv(data)
-
-    verify_model(Conv2D1(), [([1, 3, 10, 10], "float32")])
-
-
-def test_unwrap_unit_return_tuple():
-    """test relay to relax for unwrap_unit_return_tuple"""
-
-    class Identity(Module):
-        def forward(self, x):
-            return (x,)
-
-    verify_model(Identity(), [([256, 256], "float32")], build_target="llvm")
-
-
-def test_no_bind_return_tuple():
-    """test relay to relax for no_bind_return_tuple"""
-
-    class Identity(Module):
-        def forward(self, x, y):
-            return (x, y)
-
-    input_info = [([256, 256], "float32"), ([256, 256], "float32")]
-    verify_model(Identity(), input_info)
-
-
-def test_argmax():
-    """test relay to relax for argmax"""
-
-    class Argmax1(Module):
-        def forward(self, data):
-            return torch.argmax(data, dim=-1)
-
-    class Argmax2(Module):
-        def forward(self, data):
-            return torch.argmax(data, dim=-1, keepdim=True)
-
-    verify_model(Argmax1(), [([256, 256], "float32")])
-    verify_model(Argmax2(), [([256, 256], "float32")])
-
-
-def test_to():
-    """test relay to relax for to"""
-
-    class To1(Module):
-        def forward(self, data):
-            return data.to(torch.float16)
-
-    class To2(Module):
-        def forward(self, data):
-            return data.to("cpu")
-
-    verify_model(To1(), [([256, 256], "float32")])
-    verify_model(To2(), [([256, 256], "float32")])
-
-
-def test_mean():
-    """test relay to relax for mean"""
-
-    class Mean(Module):
-        def forward(self, data):
-            return data.mean(-1)
-
-    class MeanKeepDim(Module):
-        def forward(self, data):
-            return data.mean(-1, keepdim=True)
-
-    verify_model(Mean(), [([256, 256], "float32")])
-    verify_model(MeanKeepDim(), [([256, 256], "float32")])
-
-
-def test_rsqrt():
-    """test relay to relax for rsqrt"""
-
-    class Rsqrt(Module):
-        def forward(self, data):
-            return torch.rsqrt(data)
-
-    verify_model(Rsqrt(), [([256, 256], "float32")])
-
-
-def test_neg():
-    """test relay to relax for neg"""
-
-    class Neg(Module):
-        def forward(self, data):
-            return -data
-
-    verify_model(Neg(), [([256, 256], "float32")])
-
-
-def test_max():
-    """test relay to relax for max"""
-
-    class Max(Module):
-        def forward(self, x, y):
-            return torch.max(x, y)
-
-    verify_model(Max(), [([256, 256], "float32"), ([256, 256], "float32")])
-
-
-def test_cat():
-    """test relay to relax for cat"""
-
-    class Cat1(Module):
-        def forward(self, data, data1, data2):
-            return torch.cat((data, data1, data2), dim=1)
-
-    class Cat2(Module):
-        def forward(self, data):
-            const1 = torch.ones((1, 3, 10, 10), dtype=torch.float32)
-            const2 = torch.ones((1, 3, 10, 10), dtype=torch.float32)
-            return torch.cat((data, const1, const2), dim=1)
-
-    input_info = [
-        ([1, 3, 10, 10], "float32"),
-        ([1, 3, 10, 10], "float32"),
-        ([1, 3, 10, 10], "float32"),
-    ]
-    verify_model(Cat1(), input_info, build_target="llvm")
-    verify_model(Cat2(), [([1, 3, 10, 10], "float32")], build_target="llvm")
-
-
-def test_name_string_with_colon():
-    """test name string with colons,
-    e.g., TFLite default input name 'serving_default_input:0'
-    """
-
-    dtype = "float32"
-    x_var = relay.var("input_0:0", shape=(3, 5), dtype=dtype)
-    y_var = relay.var("input_1:0", shape=(3, 5), dtype=dtype)
-    z_add = relay.add(x_var, y_var)
-    func = relay.Function([x_var, y_var], z_add)
-    mod = IRModule()
-    mod["main"] = func
-
-    try:
-        graph, _ = translate.from_relay(mod)
-    except Exception as err:
-        raise RuntimeError(f"Translation from relay to graph failed: {err}")
-    inspect = graph.inspect()
-
-    expected = {
-        "inputs": [
-            {"name": "input_0:0", "shape": [3, 5], "dtype": dtype, "layout": ""},
-            {"name": "input_1:0", "shape": [3, 5], "dtype": dtype, "layout": ""},
-        ],
-        "outputs": [{"name": "add", "shape": [3, 5], "dtype": dtype, "layout": ""}],
-        "nodes": {"total": 3, "input": 2, "add": 1},
-    }
-
-    assert msc_utils.dict_equal(inspect, expected), "Inspect {} mismatch with expected {}".format(
-        inspect, expected
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
deleted file mode 100644
index 881226725ac3..000000000000
--- a/tests/python/contrib/test_nnpack.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-import numpy as np
-import scipy.signal
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.contrib import nnpack
-import pytest
-
-
-@tvm.testing.requires_llvm
-def test_fully_connected_inference():
-    n = 1024
-    l = 128
-    m = 235
-    bias = te.var("bias", dtype="float32")
-    A = te.placeholder((l,), name="A")
-    B = te.placeholder((m, l), name="B")
-    C = nnpack.fully_connected_inference(A, B)
-    D = te.compute(C.shape, lambda i: C[i] + bias, name="D")
-    s = te.create_schedule(D.op)
-
-    def verify(target="llvm"):
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            pytest.skip("extern function is not available")
-        if not nnpack.is_available():
-            pytest.skip("nnpack is not available")
-
-        dev = tvm.cpu(0)
-        f = tvm.build(s, [A, B, D, bias], target)
-        a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(m, l)).astype(B.dtype), dev)
-        d = tvm.nd.array(np.zeros((m,), dtype=D.dtype), dev)
-        bb = 10.0
-        f(a, b, d, bb)
-        tvm.testing.assert_allclose(d.numpy(), np.dot(a.numpy(), b.numpy().T) + bb, rtol=1e-5)
-
-    verify()
-
-
-def np_conv(na, nw, padding, stride=1):
-    batch, in_channel, in_height, in_width = na.shape
-    _, num_filter, kernel_h, kernel_w = nw.shape
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
-    pad_h = pad_top + pad_bottom
-    pad_w = pad_left + pad_right
-
-    out_channel = num_filter
-    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
-    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
-    nb = np.zeros((batch, out_channel, out_height, out_width))
-    for n in range(batch):
-        for f in range(out_channel):
-            for c in range(in_channel):
-                if pad_h > 0 or pad_w > 0:
-                    apad = np.zeros((in_height + pad_h, in_width + pad_w))
-                    apad[pad_top : pad_top + in_height, pad_left : pad_left + in_width] = na[n, c]
-                else:
-                    apad = na[n, c]
-                out = scipy.signal.convolve2d(apad, np.rot90(np.rot90(nw[f, c])), mode="valid")
-                nb[n, f] += out[::stride, ::stride]
-    return nb
-
-
-@tvm.testing.requires_llvm
-def test_convolution_inference():
-    BATCH = 8
-    IH = 48
-    IW = 48
-    IC = 16
-    OC = 16
-    K = 3
-    PAD = 1
-    STRIDE = 1
-
-    OH = (IH + 2 * PAD - K) + 1
-    OW = (IW + 2 * PAD - K) + 1
-    dshape = (BATCH, IC, IH, IW)
-    kshape = (OC, IC, K, K)
-    bshape = (OC,)
-    oshape = (BATCH, OC, OH, OW)
-
-    data = te.placeholder(dshape, name="data")
-    kernel = te.placeholder(kshape, name="kernel")
-    bias = te.placeholder(bshape, name="bias")
-
-    def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True):
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            pytest.skip("extern function is not available")
-        if not nnpack.is_available():
-            pytest.skip("nnpack is not available")
-
-        dev = tvm.cpu(0)
-        output = nnpack.convolution_inference(
-            data,
-            kernel,
-            bias if with_bias else None,
-            [PAD, PAD, PAD, PAD],
-            [STRIDE, STRIDE],
-            algorithm=algorithm,
-        )
-        s = te.create_schedule(output.op)
-
-        f = tvm.build(s, [data, kernel, bias, output], target)
-
-        na = np.random.uniform(size=dshape).astype(data.dtype)
-        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
-        nc = np.zeros(bshape, dtype=bias.dtype)
-        ta = tvm.nd.array(na, dev)
-        tb = tvm.nd.array(nb, dev)
-        tc = tvm.nd.array(nc, dev)
-        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), dev)
-        f(ta, tb, tc, td)
-        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(
-            1, bshape[0], 1, 1
-        )
-        tvm.testing.assert_allclose(td.numpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
-
-    for algorithm in [
-        nnpack.ConvolutionAlgorithm.AUTO,
-        nnpack.ConvolutionAlgorithm.FFT_8x8,
-        nnpack.ConvolutionAlgorithm.FFT_16x16,
-        nnpack.ConvolutionAlgorithm.WT_8x8,
-        nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM,
-        nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
-    ]:
-        for with_bias in [True, False]:
-            verify(algorithm=algorithm, with_bias=with_bias)
-
-
-@tvm.testing.requires_llvm
-def test_convolution_inference_without_weight_transform():
-    BATCH = 6
-    IH = 48
-    IW = 48
-    IC = 16
-    OC = 16
-    K = 3
-    PAD = 1
-    STRIDE = 1
-
-    OH = (IH + 2 * PAD - K) + 1
-    OW = (IW + 2 * PAD - K) + 1
-    dshape = (BATCH, IC, IH, IW)
-    kshape = (OC, IC, K, K)
-    bshape = (OC,)
-    oshape = (BATCH, OC, OH, OW)
-
-    data = te.placeholder(dshape, name="data")
-    kernel = te.placeholder(kshape, name="kernel")
-    bias = te.placeholder(bshape, name="bias")
-
-    def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True):
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            pytest.skip("extern function is not available")
-        if not nnpack.is_available():
-            pytest.skip("nnpack is not available")
-
-        dev = tvm.cpu(0)
-        transformed_kernel = nnpack.convolution_inference_weight_transform(
-            kernel, algorithm=algorithm
-        )
-        output = nnpack.convolution_inference_without_weight_transform(
-            data,
-            transformed_kernel,
-            bias if with_bias else None,
-            [PAD, PAD, PAD, PAD],
-            [STRIDE, STRIDE],
-            algorithm=algorithm,
-        )
-
-        s = te.create_schedule(output.op)
-
-        f = tvm.build(s, [data, kernel, bias, output], target)
-
-        na = np.random.uniform(size=dshape).astype(data.dtype)
-        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
-        nc = (
-            np.random.uniform(size=bshape).astype(bias.dtype)
-            if with_bias
-            else np.zeros(bshape, dtype=bias.dtype)
-        )
-        ta = tvm.nd.array(na, dev)
-        tb = tvm.nd.array(nb, dev)
-        tc = tvm.nd.array(nc, dev)
-        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), dev)
-        f(ta, tb, tc, td)
-        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(
-            1, bshape[0], 1, 1
-        )
-        tvm.testing.assert_allclose(td.numpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
-
-    for algorithm in [nnpack.ConvolutionAlgorithm.WT_8x8]:
-        for with_bias in [True, False]:
-            verify(algorithm=algorithm, with_bias=with_bias)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_onnx.py b/tests/python/contrib/test_onnx.py
deleted file mode 100644
index afebc2295a68..000000000000
--- a/tests/python/contrib/test_onnx.py
+++ /dev/null
@@ -1,754 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Relay to ONNX serialization test cases"""
-import pytest
-
-pytest.importorskip("onnx")
-pytest.importorskip("onnxruntime")
-
-import numpy as np
-import onnxruntime as rt
-
-import tvm
-from tvm import relay
-from tvm.contrib.target.onnx import to_onnx
-from tvm.relay.testing import run_infer_type
-
-
-def func_to_onnx(func, name):
-    mod = tvm.IRModule()
-    mod["main"] = func
-    onnx_model = to_onnx(mod, {}, name, path=None)
-    return onnx_model.SerializeToString()
-
-
-def run_onnx(onnx_model, input_data):
-    sess = rt.InferenceSession(onnx_model)
-    input_names = {}
-    for input, data in zip(sess.get_inputs(), input_data):
-        input_names[input.name] = data
-    output_names = [out.name for out in sess.get_outputs()]
-    res = sess.run(output_names, input_names)
-    return res
-
-
-def run_relay(func, data_tuple, is_dyn=False):
-    target = "llvm"
-    dev = tvm.device("llvm", 0)
-    kind = "graph" if not is_dyn else "vm"
-    relay_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(*data_tuple)
-
-    result = []
-    relay_res = relay_res if isinstance(relay_res, list) else [relay_res]
-    for res in relay_res:
-        result.append(res.numpy())
-
-    return result
-
-
-def verify_results(relay_func, indata, test_name, rtol=1e-7, atol=0, is_dyn=False):
-    relay_results = run_relay(relay_func, indata, is_dyn)
-    onnx_results = run_onnx(func_to_onnx(relay_func, test_name), indata)
-
-    for relay_res, onnx_res in zip(relay_results, onnx_results):
-        np.testing.assert_allclose(relay_res, onnx_res, rtol=rtol, atol=atol)
-
-
-def test_add():
-    dtype = "float32"
-    t1 = relay.TensorType((5, 10, 5))
-    t2 = relay.TensorType((5, 10, 5))
-    x = relay.var("x", t1, dtype=dtype)
-    y = relay.var("y", t2, dtype=dtype)
-    z = relay.add(x, y)
-    func = relay.Function([x, y], z)
-
-    x_data = np.random.rand(5, 10, 5).astype(dtype)
-    y_data = np.random.rand(5, 10, 5).astype(dtype)
-
-    verify_results(func, [x_data, y_data], "test_add")
-
-
-def test_bias_add():
-    for dtype in ["float16", "float32"]:
-        xshape = (10, 2, 3, 4)
-        bshape = (2,)
-        rtol = 1e-2 if dtype == "float16" else 1e-5
-        x = relay.var("x", shape=xshape, dtype=dtype)
-        bias = relay.var("bias", shape=bshape, dtype=dtype)
-        z = relay.nn.bias_add(x, bias)
-        func = relay.Function([x, bias], z)
-
-        x_data = np.random.uniform(size=xshape).astype(dtype)
-        y_data = np.random.uniform(size=bshape).astype(dtype)
-
-        verify_results(func, [x_data, y_data], "test_bias_add", rtol=rtol)
-
-
-def test_conv2d():
-    def verify_conv2d(
-        dtype, scale, dshape, kshape, padding=(1, 1), groups=1, dilation=(1, 1), **attrs
-    ):
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", shape=kshape, dtype=dtype)
-        y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs)
-        func = relay.Function([x, w], y)
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        verify_results(func, [data, kernel], "test_conv2d", rtol=1e-5, atol=1e-5, is_dyn=True)
-
-    dshape = (1, 32, 18, 18)
-    kshape = (32, 1, 3, 3)
-    verify_conv2d(
-        "float32", 1, dshape, kshape, padding=(1, 1), channels=32, groups=32, kernel_size=(3, 3)
-    )
-
-    dshape = (1, 32, 18, 18)
-    kshape = (32, 4, 3, 3)
-    verify_conv2d(
-        "float32", 1, dshape, kshape, padding=(1, 1), channels=32, groups=8, kernel_size=(3, 3)
-    )
-
-    # also group conv2d
-    dshape = (1, 32, 18, 18)
-    kshape = (64, 1, 3, 3)
-    verify_conv2d(
-        "float32", 1, dshape, kshape, padding=(1, 1), channels=64, groups=32, kernel_size=(3, 3)
-    )
-
-    # normal conv2d
-    dshape = (1, 3, 224, 224)
-    kshape = (10, 3, 3, 3)
-    verify_conv2d("float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(3, 3))
-
-    dshape = (1, 3, 224, 224)
-    kshape = (10, 3, 3, 3)
-    verify_conv2d("float32", 1, dshape, kshape, padding=(2, 2), channels=10, kernel_size=(3, 3))
-
-    dshape = (1, 3, 18, 18)
-    kshape = (10, 3, 3, 3)
-    verify_conv2d(
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        channels=10,
-        kernel_size=(3, 3),
-        dilation=(3, 3),
-    )
-
-    dshape = (1, 3, 18, 18)
-    kshape = (10, 3, 2, 2)
-    verify_conv2d(
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(2, 2),
-        channels=10,
-        kernel_size=(2, 2),
-        dilation=(1, 1),
-    )
-
-    dshape = (1, 3, 18, 18)
-    kshape = (10, 3, 4, 4)
-    verify_conv2d("float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(4, 4))
-
-    dshape = (1, 3, 18, 18)
-    kshape = (10, 3, 4, 4)
-    verify_conv2d("float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(4, 4))
-
-
-def test_conv2d_transpose():
-    """Conv2d_Transpose unit tests."""
-
-    def verify_conv2d_transpose(
-        dtype, scale, dshape, kshape, padding=(1, 1), groups=1, dilation=(1, 1), **attrs
-    ):
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", shape=kshape, dtype=dtype)
-        y = relay.nn.conv2d_transpose(
-            x, w, padding=padding, dilation=dilation, groups=groups, **attrs
-        )
-        func = relay.Function([x, w], y)
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        verify_results(func, [data, kernel], "test_conv2d_transpose", rtol=1e-5, atol=1e-5)
-
-    dshape = (1, 3, 224, 224)
-    kshape = (3, 10, 3, 3)
-    verify_conv2d_transpose(
-        "float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(3, 3)
-    )
-
-    dshape = (1, 3, 224, 224)
-    kshape = (3, 10, 3, 3)
-    verify_conv2d_transpose(
-        "float32", 1, dshape, kshape, padding=(2, 2), channels=10, kernel_size=(3, 3)
-    )
-
-    dshape = (1, 3, 18, 18)
-    kshape = (3, 10, 2, 2)
-    verify_conv2d_transpose(
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(2, 2),
-        channels=10,
-        kernel_size=(2, 2),
-        dilation=(1, 1),
-    )
-
-    dshape = (1, 3, 18, 18)
-    kshape = (3, 10, 4, 4)
-    verify_conv2d_transpose(
-        "float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(4, 4)
-    )
-
-    dshape = (1, 3, 18, 18)
-    kshape = (3, 10, 4, 4)
-    verify_conv2d_transpose(
-        "float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(4, 4)
-    )
-
-
-def test_reshape():
-    def verify_reshape(shape, newshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        z = relay.reshape(x, newshape=newshape)
-
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        verify_results(func, [x_data], "test_reshape", rtol=1e-5, atol=1e-5)
-
-    verify_reshape((2, 3, 4), tuple(np.array([4, 2, 3], dtype=np.int64)))
-    verify_reshape((2, 3, 4), tuple(np.array([2, 0, 0], dtype=np.int64)))
-    verify_reshape((2, 3, 4), tuple(np.array([0, -1], dtype=np.int64)))
-    verify_reshape((2, 3, 4), tuple(np.array([-1, 0], dtype=np.int64)))
-
-
-def test_transpose():
-    def verify_reshape(shape, newshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        z = relay.transpose(x, newshape)
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        verify_results(func, [x_data], "test_transpose", rtol=1e-5, atol=1e-5)
-
-    verify_reshape((1, 2, 3, 4), (0, 2, 3, 1))
-    verify_reshape((1, 2, 3, 4), (0, 3, 2, 1))
-
-
-def test_dense():
-    def verify_dense(d_shape, w_shape):
-        data = relay.var("data", relay.TensorType(d_shape, "float32"))
-        weight = relay.var("weight", relay.TensorType(w_shape, "float32"))
-        func = relay.Function([data, weight], relay.nn.dense(data, weight))
-        x_data = np.random.uniform(size=d_shape).astype("float32")
-        w_data = np.random.uniform(size=w_shape).astype("float32")
-        verify_results(func, [x_data, w_data], "test_dense", rtol=1e-5, atol=1e-5)
-
-    verify_dense((1, 8), (16, 8))
-    verify_dense((1, 4), (3, 4))
-
-
-def test_max_pool():
-    def verify_max_pool(x_shape, pool_size, strides, padding, ceil_mode):
-        x = relay.var("x", relay.TensorType(x_shape, "float32"))
-        y = tvm.relay.nn.max_pool2d(
-            x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode
-        )
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=x_shape).astype("float32")
-        verify_results(func, [x_data], "test_max_pool", rtol=1e-5, atol=1e-5)
-
-    verify_max_pool(
-        (1, 4, 16, 16), pool_size=(2, 2), strides=(2, 2), padding=(0, 0), ceil_mode=False
-    )
-
-
-def test_batch_flatten():
-    def verify_test_batch_flatten(d_shape):
-        data = relay.var("data", relay.TensorType(d_shape, "float32"))
-        func = relay.Function([data], relay.nn.batch_flatten(data))
-        x_data = np.random.uniform(size=d_shape).astype("float32")
-        verify_results(func, [x_data], "test_batch_flatten", rtol=1e-5, atol=1e-5)
-
-    verify_test_batch_flatten((1, 2, 3, 4))
-    verify_test_batch_flatten((1, 8))
-
-
-def test_batch_norm():
-    def verify_batch_norm(axis=1):
-        for dtype in ["float16", "float32"]:
-            data = relay.var("data", relay.TensorType((2, 4, 4, 1), dtype))
-            gamma_shape = (data.type_annotation.shape[axis].value,)
-            beta = relay.var("beta", relay.TensorType(gamma_shape, dtype))
-            gamma = relay.var("gamma", relay.TensorType(gamma_shape, dtype))
-            moving_mean = relay.var("moving_mean", relay.TensorType(gamma_shape, dtype))
-            moving_var = relay.var("moving_var", relay.TensorType(gamma_shape, dtype))
-            y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var, axis=axis)
-            func = relay.Function([data, gamma, beta, moving_mean, moving_var], y[0])
-
-            x_data = np.random.uniform(size=(2, 4, 4, 1)).astype(dtype)
-            beta = np.random.uniform(size=gamma_shape).astype(dtype)
-            gamma = np.random.uniform(size=gamma_shape).astype(dtype)
-            moving_mean = np.random.uniform(size=gamma_shape).astype(dtype)
-            moving_var = np.random.uniform(size=gamma_shape).astype(dtype)
-            verify_results(
-                func,
-                [x_data, gamma, beta, moving_mean, moving_var],
-                "test_batch_norm",
-                rtol=1e-1,
-                atol=1e-1,
-            )
-
-    verify_batch_norm(axis=1)
-    verify_batch_norm(axis=3)
-
-
-def test_pad():
-    """Pad unit test."""
-
-    def verify_pad():
-        dshape = (4, 10, 7, 7)
-        x = relay.var("x", shape=dshape, dtype="int32")
-        y = relay.nn.pad(x, ((1, 1), (2, 2), (3, 3), (4, 4)))
-        func = relay.Function([x], y)
-        func = run_infer_type(func)
-        x_data = np.random.randint(low=-255, high=255, size=dshape).astype(np.int32)
-        verify_results(func, [x_data], "test_pad", rtol=1e-5, atol=1e-5)
-
-    verify_pad()
-
-
-def test_sofmax():
-    def verify_sofmax():
-        for dtype in ["float32"]:
-            shape = (10, 4)
-            x = relay.var("x", shape=shape, dtype=dtype)
-            y = relay.nn.softmax(x, axis=1)
-            func = relay.Function([x], y)
-            x_data = np.random.uniform(size=shape).astype(dtype)
-            verify_results(func, [x_data], "test_softmax", rtol=1e-5, atol=1e-5)
-
-    verify_sofmax()
-
-
-def test_squeeze():
-    def verify_squeeze(shape, dtype, axis):
-        x = relay.var("x", relay.TensorType(shape, dtype))
-        z = relay.squeeze(x, axis=axis)
-        func = relay.Function([x], z)
-        x_data = np.random.random_sample(shape).astype(dtype)
-        verify_results(func, [x_data], "test_squeeze", rtol=1e-5, atol=1e-5)
-
-    verify_squeeze((1, 3, 2, 5), "float32", None)
-    verify_squeeze(
-        (1, 3, 1),
-        "float32",
-        [
-            2,
-        ],
-    )
-    verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2])
-
-
-def test_mean():
-    def verify_mean(data_shape, axis, exclude, keepdims):
-        dtype = "float32"
-        x = relay.var("x", shape=data_shape, dtype=dtype)
-        y = relay.mean(x, axis, keepdims, exclude)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=data_shape).astype(dtype)
-        verify_results(func, [x_data], "test_mean", rtol=1e-5, atol=1e-5)
-
-    verify_mean((1, 2), 0, False, False)
-    verify_mean((1, 2), 0, True, False)
-    verify_mean((1, 2), 0, True, True)
-    verify_mean((1, 2), 1, True, True)
-    verify_mean((3, 2, 1), 1, False, True)
-
-
-def test_split():
-    def verify_split(dshape, indices_or_sections, axis=None):
-        dtype = "float32"
-        x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
-        y = relay.split(x, indices_or_sections, axis=axis)
-        func = relay.Function([x], y.astuple())
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-
-        verify_results(func, [x_data], "test_split", rtol=1e-5, atol=1e-5)
-
-    verify_split((5, 5, 2, 2), 5, axis=1)
-    verify_split((5, 5, 2, 2), 5, axis=0)
-    verify_split((5, 5, 2, 2), [1, 3, 4], axis=0)
-    verify_split((5, 5, 2, 2), [1, 3, 4], axis=1)
-
-
-def test_concatenate():
-    def verify_concatenate(shapes, axis, dtype="float32"):
-        in_vars = []
-        in_data = []
-        for i, shape in enumerate(shapes):
-            in_vars.append(relay.var("x" + str(i), relay.ty.TensorType(shape, dtype)))
-            in_data.append(np.random.uniform(size=shape).astype(dtype))
-
-        out_tensor = relay.concatenate(in_vars, axis)
-        func = relay.Function(in_vars, out_tensor)
-        verify_results(func, in_data, "test_concatenate", rtol=1e-5, atol=1e-5)
-
-    verify_concatenate([(2,), (2,), (2,)], -1)
-    verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1)
-    verify_concatenate([(1, 2, 4), (1, 2, 3), (1, 2, 7), (1, 2, 8), (1, 2, 1)], -1)
-    verify_concatenate([(5, 6, 7, 3), (16, 6, 7, 3), (12, 6, 7, 3), (8, 6, 7, 3), (2, 6, 7, 3)], 0)
-    verify_concatenate([(1, 14400), (1, 2400), (1, 640), (1, 240)], 1)
-
-
-def test_strided_slice():
-    def verify_strided_slice(dshape, begin, end, strides, mode):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        if mode == "size":
-            strides = None
-        z = relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=mode)
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        verify_results(func, [x_data], "test_strided_slice", rtol=1e-5, atol=1e-5)
-
-    for mode in ["end", "size"]:
-        verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 2, 3], None, mode)
-        verify_strided_slice((3, 4, 3), [1, -1, 0], [4, -1, 3], [1, 2], mode)
-        verify_strided_slice(
-            (3, 4, 3),
-            [
-                1,
-            ],
-            [4, -3],
-            None,
-            mode,
-        )
-        verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], mode)
-        verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, -3], [2, 1, 1], mode)
-        verify_strided_slice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], mode)
-        verify_strided_slice((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], mode)
-        verify_strided_slice((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], mode)
-
-        verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, mode)
-        verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4], None, mode)
-        verify_strided_slice((3, 4, 3), [1, 1], [4, 4, 3], None, mode)
-        verify_strided_slice((3, 4, 3), [1, 1], [4, 4, 3], [1, 1, 2], mode)
-
-
-def test_cmp_type():
-    for op, ref in ((relay.greater, np.greater), (relay.less, np.less), (relay.equal, np.equal)):
-        x_shape = (10, 4)
-        y_shape = (5, 10, 1)
-        t1 = relay.TensorType(x_shape)
-        t2 = relay.TensorType(y_shape)
-        x = relay.var("x", t1)
-        y = relay.var("y", t2)
-        z = op(x, y)
-        x_data = np.random.rand(*x_shape).astype(t1.dtype)
-        y_data = np.random.rand(*y_shape).astype(t2.dtype)
-        func = relay.Function([x, y], z)
-        verify_results(func, [x_data, y_data], "test_cmp_type", rtol=1e-5, atol=1e-5)
-
-
-def test_unary_identity():
-    for dtype in ["int16", "float32", "float64"]:
-        for op, ref in [(relay.zeros_like, np.zeros_like), (relay.ones_like, np.ones_like)]:
-            shape = (8, 9, 4)
-            x = relay.var("x", relay.TensorType(shape, dtype))
-            y = op(x)
-            func = relay.Function(
-                [
-                    x,
-                ],
-                y,
-            )
-            x_data = np.random.rand(*shape).astype(dtype)
-            verify_results(func, [x_data], "test_cmp_type", rtol=1e-5, atol=1e-5)
-
-
-def test_binary_op():
-    def check_binary_op(opfunc, dtype):
-        t1 = relay.TensorType((5, 10, 5))
-        t2 = relay.TensorType((5, 10, 5))
-        x = relay.var("x", t1, dtype=dtype)
-        y = relay.var("y", t2, dtype=dtype)
-        z = opfunc(x, y)
-        x_data = np.random.rand(5, 10, 5).astype(dtype)
-        y_data = np.random.rand(5, 10, 5).astype(dtype)
-        func = relay.Function([x, y], z)
-        verify_results(func, [x_data, y_data], "test_binary_op", rtol=1e-5, atol=1e-5)
-
-    for opfunc, ref in [
-        (relay.add, np.add),
-        (relay.subtract, np.subtract),
-        (relay.multiply, np.multiply),
-        (relay.divide, np.divide),
-    ]:
-        for dtype in ["float32"]:
-            check_binary_op(opfunc, dtype)
-
-
-def test_tuple_types():
-    def verify_tuple_types(dshape, indices_or_sections, axis=None, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.split(x, indices_or_sections, axis=axis)
-        z = relay.concatenate(y, axis=axis)
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_tuple_types", rtol=1e-5, atol=1e-5)
-
-        split_z = relay.split(z, indices_or_sections, axis=axis)
-        func = relay.Function([x], split_z.astuple())
-        verify_results(func, [x_data], "test_tuple_types", rtol=1e-5, atol=1e-5)
-
-        out = relay.Tuple([y[0] + y[1], y[0] - y[1]])
-        func = relay.Function([x], out)
-        verify_results(func, [x_data], "test_tuple_types", rtol=1e-5, atol=1e-5)
-
-        z = relay.concatenate(out, axis=axis)
-        func = relay.Function([x], z)
-        verify_results(func, [x_data], "test_tuple_types", rtol=1e-5, atol=1e-5)
-
-    verify_tuple_types((5, 5, 2, 2), 5, axis=1)
-    verify_tuple_types((5, 5, 2, 2), 5, axis=0)
-    verify_tuple_types((5, 5, 2, 2), [1, 3, 4], axis=0)
-    verify_tuple_types((5, 5, 2, 2), [1, 3, 4], axis=1)
-
-
-def test_layout_transform():
-    def verify_layout_transform(dshape, src_layout, dst_layout, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.layout_transform(x, src_layout, dst_layout)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_layout_transform", rtol=1e-5, atol=1e-5)
-
-    verify_layout_transform((1, 3, 8, 8), "NCHW", "NHWC")
-    verify_layout_transform((1, 8, 8, 3), "NHWC", "NCHW")
-
-
-def test_clip():
-    def verify_clip(dshape, a_min, a_max, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.clip(x, a_min, a_max)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_clip", rtol=1e-5, atol=1e-5)
-
-    verify_clip((5, 5, 2, 5), 0, 0.2)
-    verify_clip((5, 5, 2, 5), 0.2, 0.5)
-
-
-def test_expand_dims():
-    def verify_expand_dims(dshape, axis, num_newaxis, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.expand_dims(x, axis, num_newaxis)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_expand_dims", rtol=1e-5, atol=1e-5)
-
-    verify_expand_dims((1, 1001), 0, 2)
-    verify_expand_dims((1, 1, 1001), 2, 2)
-
-
-def test_lrn():
-    """LRN unit test."""
-
-    def verify_lrn(xshape, size, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(xshape, dtype))
-        y = relay.nn.lrn(x, size=size, axis=1, alpha=1.0, beta=1.0, bias=1.0)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=xshape).astype(dtype)
-        verify_results(func, [x_data], "test_lrn", rtol=1e-5, atol=1e-5)
-
-    isize = [(1, 1, 480, 640), (1, 3, 224, 224)]
-    sizes = [1, 3]
-    for i in isize:
-        for s in sizes:
-            verify_lrn(i, s)
-
-
-def test_sigmoid():
-    """Sigmoid unit test."""
-
-    def verify_sigmoid(dshape, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.sigmoid(x)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_sigmoid", rtol=1e-4, atol=1e-4)
-
-    isize = [(1, 3, 480, 640), (1, 3, 224, 224)]
-
-    for i in isize:
-        verify_sigmoid(i)
-
-
-def test_copy():
-    """Copy unit test."""
-
-    def verify_copy(dshape, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.copy(x)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_copy", rtol=1e-4, atol=1e-4)
-
-    isize = [(1, 3, 480, 640), (1, 3, 224, 224)]
-
-    for i in isize:
-        verify_copy(i)
-
-
-def test_round():
-    """Round unit test."""
-
-    def verify_round(dshape, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.round(x)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_round", rtol=1e-4, atol=1e-4)
-
-    isize = [(1, 3, 480, 640), (1, 3, 224, 224)]
-
-    for i in isize:
-        verify_round(i)
-
-
-def test_cast():
-    """Cast unit test."""
-
-    def verify_cast(dshape, dtype):
-        x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
-        y = relay.cast(x, dtype)
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        verify_results(func, [x_data], "test_cast", rtol=1e-4, atol=1e-4)
-
-    isize = [(1, 3, 480, 640), (1, 3, 224, 224)]
-    out_dtypes = ["int8", "int16", "uint8", "uint16"]
-
-    for i in isize:
-        for o_dtype in out_dtypes:
-            verify_cast(i, o_dtype)
-
-
-@pytest.mark.xfail(reason="Known failing test. See issue #12567.")
-def test_resize():
-    """Resize unit test."""
-
-    def verify_resize(dshape, outsize, method, coord_trans, rounding_method, dtype="float32"):
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        y = relay.image.resize2d(
-            x,
-            outsize,
-            None,
-            layout="NCHW",
-            method=method,
-            coordinate_transformation_mode=coord_trans,
-            rounding_method=rounding_method,
-        )
-        func = relay.Function([x], y)
-        x_data = np.random.uniform(size=dshape).astype(dtype)
-        verify_results(func, [x_data], "test_resize", rtol=1e-4, atol=1e-4)
-
-    method = ["nearest_neighbor", "linear", "cubic"]
-    coord_trans = ["half_pixel", "align_corners", "asymmetric"]
-    rounding_method = ["round", "floor", "ceil"]
-
-    isize = (1, 3, 480, 640)
-
-    # Downsample
-    osize = (240, 320)
-    for i in method:
-        for j in coord_trans:
-            for k in rounding_method:
-                if (i == "nearest_neighbor" and j == "align_corners") or (
-                    i == "cubic" and j in ["half_pixel", "align_corners"]
-                ):
-                    continue
-                verify_resize(isize, osize, method=i, coord_trans=j, rounding_method=k)
-
-    # Upsample
-    osize = (960, 1280)
-    for i in method:
-        for j in coord_trans:
-            for k in rounding_method:
-                if (i == "nearest_neighbor" and j == "align_corners") or (i == "cubic"):
-                    continue
-                verify_resize(isize, osize, method=i, coord_trans=j, rounding_method=k)
-
-
-def test_dyn():
-    """Dynamic unit test."""
-
-    def verify_dyn_bcast(lhs_shape, rhs_shape, dtype):
-        lhs_dyn_shape = tuple(relay.Any() for i in range(len(lhs_shape)))
-        rhs_dyn_shape = tuple(relay.Any() for i in range(len(rhs_shape)))
-        x = relay.var("x", shape=lhs_dyn_shape, dtype=dtype)
-        y = relay.var("y", shape=rhs_dyn_shape, dtype=dtype)
-        z = relay.add(x, y)
-        func = relay.Function([x, y], z)
-        lhs_data = np.random.uniform(size=lhs_shape).astype(dtype)
-        rhs_data = np.random.uniform(size=rhs_shape).astype(dtype)
-        verify_results(
-            func, [lhs_data, rhs_data], "test_dyn_bcast", rtol=1e-5, atol=1e-5, is_dyn=True
-        )
-
-    verify_dyn_bcast((1, 3, 32, 1), (1, 3, 1, 3), "float32")
-    verify_dyn_bcast((1, 13), (4, 3, 5, 1), "float32")
-
-
-if __name__ == "__main__":
-    test_add()
-    test_bias_add()
-    test_conv2d()
-    test_conv2d_transpose()
-    test_reshape()
-    test_transpose()
-    test_dense()
-    test_max_pool()
-    test_batch_flatten()
-    test_batch_norm()
-    test_pad()
-    test_mean()
-    test_split()
-    test_concatenate()
-    test_sofmax()
-    test_squeeze()
-    test_strided_slice()
-    test_cmp_type()
-    test_binary_op()
-    test_tuple_types()
-    test_layout_transform()
-    test_clip()
-    test_expand_dims()
-    test_lrn()
-    test_sigmoid()
-    test_copy()
-    test_round()
-    test_cast()
-    test_resize()
-    test_dyn()
diff --git a/tests/python/contrib/test_onnx_model.py b/tests/python/contrib/test_onnx_model.py
deleted file mode 100644
index 075085ff8806..000000000000
--- a/tests/python/contrib/test_onnx_model.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Relay to ONNX target test cases"""
-import pytest
-
-pytest.importorskip("onnx")
-pytest.importorskip("onnxruntime")
-
-from collections import OrderedDict
-import numpy as np
-import onnxruntime as rt
-import tvm
-from tvm import relay
-from tvm.contrib.target.onnx import to_onnx
-import tvm.relay.testing
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-from tvm.ir import IRModule
-from tvm.relay import transform
-
-
-def func_to_onnx(mod, params, name):
-    onnx_model = to_onnx(mod, params, name, path=None)
-    return onnx_model.SerializeToString()
-
-
-def run_onnx(mod, params, name, input_data):
-    onnx_model = func_to_onnx(mod, params, name)
-    sess = rt.InferenceSession(onnx_model)
-    input_names = {}
-    for input, data in zip(sess.get_inputs(), input_data):
-        input_names[input.name] = data
-    output_names = [output.name for output in sess.get_outputs()]
-    res = sess.run(output_names, input_names)
-    return res[0]
-
-
-def get_data(in_data_shapes, dtype="float32"):
-    in_data = OrderedDict()
-    for name, shape in in_data_shapes.items():
-        in_data[name] = np.random.uniform(size=shape).astype(dtype)
-    return in_data
-
-
-def run_relay(mod, params, in_data):
-    target = "llvm"
-    dev = tvm.device("llvm", 0)
-    in_data = [tvm.nd.array(value) for value in in_data.values()]
-    return (
-        relay.create_executor("graph", mod, device=dev, target=target)
-        .evaluate()(*in_data, **params)
-        .numpy()
-    )
-
-
-def _verify_results(mod, params, in_data):
-    a = run_relay(mod, params, in_data)
-    b = run_onnx(mod, params, "test_resent", in_data.values())
-    np.testing.assert_allclose(a, b, rtol=1e-7, atol=1e-7)
-
-
-def test_resnet():
-    num_class = 1000
-    in_data_shapes = OrderedDict({"data": (1, 3, 224, 224)})
-    in_data = get_data(in_data_shapes, dtype="float32")
-    for n in [18, 34, 50, 101]:
-        mod, params = tvm.relay.testing.resnet.get_workload(1, num_class, num_layers=n)
-        _verify_results(mod, params, in_data)
-
-
-def test_squeezenet():
-    in_data_shapes = OrderedDict({"data": (1, 3, 224, 224)})
-    in_data = get_data(in_data_shapes, dtype="float32")
-    for version in ["1.0", "1.1"]:
-        mod, params = tvm.relay.testing.squeezenet.get_workload(1, version=version)
-        _verify_results(mod, params, in_data)
-
-
-@pytest.mark.skip("USE_TARGET_ONNX should be ON")
-def test_partition():
-    in_1 = relay.var("in_1", shape=(10, 10), dtype="float32")
-    in_2 = relay.var("in_2", shape=(10, 10), dtype="float32")
-    in_3 = relay.var("in_3", shape=(10, 10), dtype="float32")
-    in_4 = relay.var("in_4", shape=(10, 10), dtype="float32")
-    in_5 = relay.var("in_5", shape=(10, 10), dtype="float32")
-    in_6 = relay.var("in_6", shape=(10, 10), dtype="float32")
-    in_7 = relay.var("in_7", shape=(10, 10), dtype="float32")
-    in_8 = relay.var("in_8", shape=(10, 10), dtype="float32")
-    in_9 = relay.var("in_9", shape=(10, 10), dtype="float32")
-    in_10 = relay.var("in_10", shape=(10, 10), dtype="float32")
-
-    begin0 = compiler_begin(in_1, "onnx")
-    begin1 = compiler_begin(in_2, "onnx")
-    begin2 = compiler_begin(in_3, "onnx")
-    begin3 = compiler_begin(in_4, "onnx")
-    node0 = relay.add(begin0, begin1)
-    node1 = relay.add(begin2, begin3)
-    end0 = compiler_end(node0, "onnx")
-    end1 = compiler_end(node1, "onnx")
-    begin4 = compiler_begin(end0, "onnx")
-    begin5 = compiler_begin(end1, "onnx")
-    node2 = relay.add(begin4, begin5)
-    end2 = compiler_end(node2, "onnx")
-
-    dbegin0 = compiler_begin(in_5, "default")
-    dbegin1 = compiler_begin(in_6, "default")
-    node3 = relay.subtract(dbegin0, dbegin1)
-    dbegin2 = compiler_begin(in_7, "default")
-    dend1 = compiler_end(node3, "default")
-    dbegin3 = compiler_begin(dend1, "default")
-    node4 = relay.subtract(dbegin2, dbegin3)
-    dend2 = compiler_end(node4, "default")
-
-    begin6 = compiler_begin(end2, "onnx")
-    begin7 = compiler_begin(dend2, "onnx")
-    node5 = relay.add(begin6, begin7)
-    end3 = compiler_end(node5, "onnx")
-    end4 = compiler_end(node5, "onnx")
-    dbegin4 = compiler_begin(in_8, "default")
-    dbegin5 = compiler_begin(end3, "default")
-    node6 = relay.subtract(dbegin4, dbegin5)
-    begin8 = compiler_begin(in_9, "onnx")
-    begin9 = compiler_begin(end4, "onnx")
-    node7 = relay.multiply(begin8, begin9)
-    end5 = compiler_end(node7, "onnx")
-
-    dend3 = compiler_end(node6, "default")
-    begin10 = compiler_begin(dend3, "onnx")
-    begin11 = compiler_begin(end5, "onnx")
-    node8 = relay.add(begin10, begin11)
-    end6 = compiler_end(node8, "onnx")
-    begin12 = compiler_begin(in_10, "onnx")
-    begin13 = compiler_begin(end6, "onnx")
-    node9 = relay.add(begin12, begin13)
-    end7 = compiler_end(node9, "onnx")
-
-    func = relay.Function([in_1, in_2, in_3, in_4, in_5, in_6, in_7, in_8, in_9, in_10], end7)
-
-    target = "llvm"
-    mod = IRModule.from_expr(func)
-    mod = transform.PartitionGraph()(mod)
-
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["FuseOps"]):
-        graph_json, mod1, params = relay.build(mod, target)
-
-    assert mod1.type_key == "metadata"
-    assert mod1.imported_modules[0].type_key == "llvm"
-    assert mod1.imported_modules[0].get_source()
-    assert mod1.imported_modules[1].type_key == "onnx"
-    assert mod1.imported_modules[1].get_source()
-
-
-if __name__ == "__main__":
-    test_resnet()
-    test_squeezenet()
-    # test_partition needs USE_TARGET_ONNX to be ON
-    test_partition()
diff --git a/tests/python/contrib/test_rpc_server_device.py b/tests/python/contrib/test_rpc_server_device.py
deleted file mode 100644
index f3d6a6a5eb05..000000000000
--- a/tests/python/contrib/test_rpc_server_device.py
+++ /dev/null
@@ -1,442 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""iOS RPC Server tests."""
-# pylint: disable=invalid-name, no-value-for-parameter, missing-function-docstring, import-error
-import multiprocessing
-import pytest
-import numpy as np
-
-import tvm.testing
-import tvm.relay.testing
-from tvm import te
-from tvm import rpc
-from tvm import relay, auto_scheduler
-from tvm.contrib import utils, xcode, graph_executor
-from tvm.autotvm.measure import request_remote
-from tvm.auto_scheduler.measure_record import load_records
-from tvm.auto_scheduler.measure import MeasureErrorNo
-from tvm.auto_scheduler.utils import call_func_with_timeout
-from tvm.contrib.popen_pool import PopenWorker, StatusKind
-from tvm.rpc import tracker, proxy, server_ios_launcher
-
-
-HOST_URL = "0.0.0.0"
-HOST_PORT = 9190
-DEVICE_KEY = "ios_mobile_device"
-
-
-TEMPORARY_DIRECTORY = utils.tempdir()
-ARCH = "x86_64"
-SDK = "iphonesimulator"
-DSO_NAME = "lib.dylib"
-DTYPE = "float32"
-
-
-np.random.seed(0)
-
-
-ios_rpc_bundle_description_required = pytest.mark.skipif(
-    not server_ios_launcher.ServerIOSLauncher.is_compatible_environment(),
-    reason="To run this test, you need to set environment variables required in ServerIOSLauncher.",
-)
-
-
-@pytest.fixture(scope="session", autouse=True)
-def setup_and_teardown_actions():
-    """Setup and teardown actions for pytest."""
-
-    # No setup actions
-    yield
-    # Teardown actions:
-    server_ios_launcher.ServerIOSLauncher.shutdown_booted_devices()
-
-
-def setup_rpc_standalone_configuration(f):
-    """
-    Host  --  RPC server
-    """
-
-    def wrapper():
-        with server_ios_launcher.ServerIOSContextManager(
-            mode=server_ios_launcher.RPCServerMode.standalone.value,
-            host=HOST_URL,
-            port=HOST_PORT,
-            key=DEVICE_KEY,
-        ) as ios_server:
-            f(host=ios_server.host, port=ios_server.port)
-
-    return wrapper
-
-
-def setup_rpc_proxy_configuration(f):
-    """
-    Host -- Proxy -- RPC server
-    """
-
-    def wrapper():
-        proxy_server = proxy.Proxy(host=HOST_URL, port=HOST_PORT)
-        with server_ios_launcher.ServerIOSContextManager(
-            mode=server_ios_launcher.RPCServerMode.proxy.value,
-            host=proxy_server.host,
-            port=proxy_server.port,
-            key=DEVICE_KEY,
-        ):
-            f(host=proxy_server.host, port=proxy_server.port)
-        proxy_server.terminate()
-
-    return wrapper
-
-
-def setup_rpc_tracker_configuration(f):
-    """
-         tracker
-         /     \
-    Host   --   RPC server
-    """
-
-    def wrapper():
-        tracker_server = tracker.Tracker(host=HOST_URL, port=HOST_PORT, silent=True)
-        with server_ios_launcher.ServerIOSContextManager(
-            mode=server_ios_launcher.RPCServerMode.tracker.value,
-            host=tracker_server.host,
-            port=tracker_server.port,
-            key=DEVICE_KEY,
-        ):
-            f(host=tracker_server.host, port=tracker_server.port)
-        tracker_server.terminate()
-
-    return wrapper
-
-
-def setup_rpc_tracker_via_proxy_configuration(f):
-    """
-         tracker
-         /     \
-    Host   --   Proxy -- RPC server
-    """
-
-    def wrapper():
-        tracker_server = tracker.Tracker(host=HOST_URL, port=HOST_PORT, silent=True)
-        proxy_server_tracker = proxy.Proxy(
-            host=HOST_URL, port=8888, tracker_addr=(tracker_server.host, tracker_server.port)
-        )
-        with server_ios_launcher.ServerIOSContextManager(
-            mode=server_ios_launcher.RPCServerMode.proxy.value,
-            host=proxy_server_tracker.host,
-            port=proxy_server_tracker.port,
-            key=DEVICE_KEY,
-        ):
-            f(host=tracker_server.host, port=tracker_server.port)
-        proxy_server_tracker.terminate()
-        tracker_server.terminate()
-
-    return wrapper
-
-
-def wrapper_for_call_function_with_timeout(timeout, func, args=(), kwargs=None):
-    """Wrapper for call_func_with_timeout."""
-
-    def wrapper(*_args, **_kwargs):
-        """
-        This wrapper is needed because the cloudpicle
-        cannot serialize objects that contain pointers (RPCSession)
-        """
-        func(*_args, **_kwargs)
-        return StatusKind.COMPLETE
-
-    worker = PopenWorker()
-    ret = call_func_with_timeout(worker, timeout=timeout, func=wrapper, args=args, kwargs=kwargs)
-    if isinstance(ret, Exception):
-        raise ret
-    return ret
-
-
-def try_create_remote_session(session_factory, args=(), kwargs=None):
-    """Deadlock-safe RPC Session creation."""
-
-    try:
-        successful_attempt = True
-        results = []
-        for _ in range(2):
-            ret = wrapper_for_call_function_with_timeout(
-                timeout=10, func=session_factory, args=args, kwargs=kwargs
-            )
-            results.append(ret)
-        if not np.all(np.array(results) == StatusKind.COMPLETE):
-            raise ValueError("One or more sessions ended incorrectly.")
-    except Exception as e:  # pylint: disable=broad-except
-        successful_attempt = False
-        print(e)
-    return successful_attempt
-
-
-def ios_create_dylib(output, objects, **kwargs):  # pylint: disable=unused-argument
-    xcode.create_dylib(output, objects, arch=ARCH, sdk=SDK)
-
-
-ios_create_dylib.output_format = "dylib"
-
-
-def export_lib(lib):
-    """Export lib to temporary directory."""
-
-    path_dso = TEMPORARY_DIRECTORY.relpath(DSO_NAME)
-    lib.export_library(path_dso, fcompile=ios_create_dylib)
-    return path_dso
-
-
-def get_add_relay_module(a_numpy, b_numpy):
-    """Get simple relay module that add two tensors."""
-
-    a = relay.var("a", shape=a_numpy.shape, dtype=DTYPE)
-    b = relay.var("b", shape=b_numpy.shape, dtype=DTYPE)
-    params = {}
-    out = tvm.IRModule.from_expr(relay.add(a, b))
-    return out, params
-
-
-def get_add_module(target):
-    """Get simple module that add two tensors."""
-
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    return tvm.build(s, [A, B, C], target=target, target_host=target, name="simple_add")
-
-
-@pytest.mark.dependency()
-@ios_rpc_bundle_description_required
-@setup_rpc_standalone_configuration
-def test_rpc_standalone(host, port):
-    status_ok = try_create_remote_session(session_factory=rpc.connect, args=(host, port))
-    assert status_ok
-
-
-@pytest.mark.dependency()
-@ios_rpc_bundle_description_required
-@setup_rpc_proxy_configuration
-def test_rpc_proxy(host, port):
-    status_ok = try_create_remote_session(
-        session_factory=rpc.connect, args=(host, port, DEVICE_KEY)
-    )
-    assert status_ok
-
-
-@pytest.mark.dependency()
-@ios_rpc_bundle_description_required
-@setup_rpc_tracker_configuration
-def test_rpc_tracker(host, port):
-    status_ok = try_create_remote_session(
-        session_factory=request_remote, args=(DEVICE_KEY, host, port)
-    )
-    assert status_ok
-
-
-@pytest.mark.dependency()
-@ios_rpc_bundle_description_required
-@setup_rpc_tracker_via_proxy_configuration
-def test_rpc_tracker_via_proxy(host, port):
-    status_ok = try_create_remote_session(
-        session_factory=request_remote, args=(DEVICE_KEY, host, port)
-    )
-    assert status_ok
-
-
-@pytest.mark.dependency(depends=["test_rpc_standalone"])
-@ios_rpc_bundle_description_required
-@setup_rpc_standalone_configuration
-def test_can_call_remote_function_with_rpc_standalone(host, port):
-    remote_session = rpc.connect(host, port)
-    f = remote_session.get_function("runtime.GetFFIString")
-    assert f("hello") == "hello"
-
-
-@pytest.mark.dependency(depends=["test_rpc_proxy"])
-@ios_rpc_bundle_description_required
-@setup_rpc_proxy_configuration
-def test_can_call_remote_function_with_rpc_proxy(host, port):
-    remote_session = rpc.connect(host, port, key=DEVICE_KEY)
-    f = remote_session.get_function("runtime.GetFFIString")
-    assert f("hello") == "hello"
-
-
-@pytest.mark.dependency(depends=["test_rpc_tracker"])
-@ios_rpc_bundle_description_required
-@setup_rpc_tracker_configuration
-def test_can_call_remote_function_with_rpc_tracker(host, port):
-    remote_session = request_remote(DEVICE_KEY, host, port)
-    f = remote_session.get_function("runtime.GetFFIString")
-    assert f("hello") == "hello"
-
-
-@pytest.mark.dependency(depends=["test_rpc_tracker_via_proxy"])
-@ios_rpc_bundle_description_required
-@setup_rpc_tracker_via_proxy_configuration
-def test_can_call_remote_function_with_rpc_tracker_via_proxy(host, port):
-    remote_session = request_remote(DEVICE_KEY, host, port)
-    f = remote_session.get_function("runtime.GetFFIString")
-    assert f("hello") == "hello"
-
-
-@pytest.mark.dependency(depends=["test_rpc_standalone"])
-@ios_rpc_bundle_description_required
-@setup_rpc_standalone_configuration
-def test_basic_functionality_of_rpc_session(host, port):
-    remote_session = rpc.connect(host, port)
-    device = remote_session.cpu(0)
-
-    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
-    lib = get_add_module(target)
-    path_dso = export_lib(lib)
-
-    # Check correct upload
-    remote_session.upload(path_dso)
-
-    # Check correct download
-    downloaded_lib = remote_session.download(DSO_NAME)
-    with open(path_dso, "rb") as source_lib_file:
-        assert downloaded_lib == bytearray(
-            source_lib_file.read()
-        ), "The downloaded module does not match the loaded module"
-
-    # Check correct remote computing
-    lib = remote_session.load_module(DSO_NAME)
-    n = 100
-    a = tvm.nd.array(np.random.uniform(size=n).astype(DTYPE), device)
-    b = tvm.nd.array(np.random.uniform(size=n).astype(DTYPE), device)
-    c = tvm.nd.array(np.zeros(n, dtype=DTYPE), device)
-    lib(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    # Check correct remove
-    remote_session.remove(DSO_NAME)
-
-
-@pytest.mark.dependency(depends=["test_rpc_standalone"])
-@pytest.mark.xfail(reason="Not implemented functionality")
-@ios_rpc_bundle_description_required
-@setup_rpc_standalone_configuration
-def test_cleanup_workspace_after_session_end(host, port):
-    # Arrange
-    remote_session = rpc.connect(host, port)
-    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
-    lib = get_add_module(target)
-    path_dso = export_lib(lib)
-    remote_session.upload(path_dso)
-
-    # Act
-    del remote_session
-    remote_session = rpc.connect(host, port)
-    try:
-        remote_session.download(DSO_NAME)
-        status_ok = False
-    except Exception as _:  # pylint: disable=broad-except
-        status_ok = True
-
-    # Assert
-    assert status_ok, "Workspace not cleared after RPC Session termination."
-
-
-@pytest.mark.dependency(depends=["test_rpc_standalone"])
-@ios_rpc_bundle_description_required
-@setup_rpc_standalone_configuration
-def test_graph_executor_remote_run(host, port):
-    remote_session = rpc.connect(host, port)
-    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
-    device = remote_session.cpu(0)
-
-    size = 100
-    a = np.random.uniform(size=size).astype(DTYPE)
-    b = np.random.uniform(size=size).astype(DTYPE)
-    mod, params = get_add_relay_module(a, b)
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target, target_host=target, params=params)
-
-    path_dso = export_lib(lib)
-    remote_session.upload(path_dso)
-    lib = remote_session.load_module(DSO_NAME)
-
-    gen_module = graph_executor.GraphModule(lib["default"](device))
-
-    # Check set input
-    gen_module.set_input("a", tvm.nd.array(a))
-    gen_module.set_input("b", tvm.nd.array(b))
-    tvm.testing.assert_allclose(gen_module.get_input(0).numpy(), a)
-    tvm.testing.assert_allclose(gen_module.get_input(1).numpy(), b)
-
-    # Check run
-    gen_module.run()
-    out = gen_module.get_output(0)
-    tvm.testing.assert_allclose(out.numpy(), a + b)
-
-
-@pytest.mark.xfail(
-    strict=False, reason="flaky test (see https://github.com/apache/tvm/issues/9824)"
-)
-@pytest.mark.dependency(depends=["test_rpc_tracker"])
-@ios_rpc_bundle_description_required
-@setup_rpc_tracker_configuration
-def test_check_auto_schedule_tuning(host, port):  # pylint: disable=too-many-locals
-    log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log")
-    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
-    mod, params = relay.testing.mlp.get_workload(batch_size=4, image_shape=(1, 4, 4))
-
-    try:
-        status_ok = True
-        measure_runner = auto_scheduler.RPCRunner(
-            DEVICE_KEY,
-            host,
-            port,
-            min_repeat_ms=1,
-            timeout=10,
-            n_parallel=multiprocessing.cpu_count(),
-        )
-        builder = auto_scheduler.LocalBuilder(timeout=10, build_func=ios_create_dylib)
-        tune_option = auto_scheduler.TuningOptions(
-            builder=builder,
-            num_measure_trials=2,
-            num_measures_per_round=1,
-            runner=measure_runner,
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-            verbose=0,
-        )
-
-        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
-        tasks, task_weights = tasks[:2], task_weights[:2]
-        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-        tuner.tune(tune_option, search_policy="sketch.random")
-
-        # Check tuning log
-        tuning_statistic = list(load_records(log_file))
-        for _, measure_result in tuning_statistic:
-            if measure_result.error_no != MeasureErrorNo.NO_ERROR:
-                raise ValueError(
-                    f"Error for MeasureResult. Error code: {measure_result.error_no},"
-                    f" for details see MeasureErrorNO."
-                )
-
-    except Exception as e:  # pylint: disable=broad-except
-        status_ok = False
-        print(e)
-
-    assert status_ok, "Tuning failed, see logs."
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_tedd.py b/tests/python/contrib/test_tedd.py
deleted file mode 100644
index de0a65799c7c..000000000000
--- a/tests/python/contrib/test_tedd.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Configure pytest of Tensor Expression Debug Display"""
-# pylint: disable=invalid-name
-import re
-import tvm
-from tvm import te
-from tvm import topi
-from tvm import relay
-from tvm.relay import testing
-from tvm.relay.backend import Runtime, Executor
-
-
-def findany(pattern, _str):
-    matches = re.findall(pattern, _str)
-    assert len(matches) > 0, "Pattern not found.\nPattern: " + pattern + "\nString:  " + _str
-
-
-def checkdependency():
-    # pylint: disable=import-outside-toplevel
-    import pkg_resources
-
-    # pylint: disable=E1133
-    return not {"graphviz", "ipython"} - {pkg.key for pkg in pkg_resources.working_set}
-
-
-def test_dfg():
-    """Tests dataflow graph"""
-    A = te.placeholder((1024, 4096), dtype="float32", name="A")
-    B = topi.nn.softmax(A)
-    # confirm lower works
-    s = te.create_schedule([B.op])
-
-    def verify():
-        # pylint: disable=import-outside-toplevel
-        from tvm.contrib import tedd
-
-        _str = tedd.viz_dataflow_graph(s, False, "", True)
-        # Check all edges are available
-        findany(r"digraph \"Dataflow Graph\"", str)
-        findany(r"Stage_0:O_0 -> Tensor_0_0", str)
-        findany(r"Tensor_0_0 -> Stage_1:I_0", str)
-        findany(r"Stage_1:O_0 -> Tensor_1_0", str)
-        findany(r"Tensor_0_0 -> Stage_2:I_0", str)
-        findany(r"Tensor_1_0 -> Stage_2:I_1", str)
-        findany(r"Stage_2:O_0 -> Tensor_2_0", str)
-        findany(r"Tensor_2_0 -> Stage_3:I_0", str)
-        findany(r"Stage_3:O_0 -> Tensor_3_0", str)
-        findany(r"Tensor_2_0 -> Stage_4:I_0", str)
-        findany(r"Tensor_3_0 -> Stage_4:I_1", str)
-        findany(r"Stage_4:O_0 -> Tensor_4_0", str)
-
-    if checkdependency():
-        verify()
-
-
-def test_itervar_relationship_graph():
-    """Tests itervars relationship graph"""
-    n = te.var("n")
-    m = te.var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-
-    s = te.create_schedule(B.op)
-    s[B].split(B.op.reduce_axis[0], factor=16)
-
-    def verify():
-        # pylint: disable=import-outside-toplevel
-        from tvm.contrib import tedd
-
-        _str = tedd.viz_itervar_relationship_graph(s, False, "", True)
-        findany(r"digraph \"IterVar Relationship Graph\"", str)
-        findany(r"subgraph cluster_legend", str)
-        # Check subgraphs for stages
-        findany(r"subgraph cluster_Stage_0", str)
-        findany(r"subgraph cluster_Stage_1", str)
-        # Check itervars and their types
-        findany(r"\(kDataPar\)\<br/\>T.Range\(0, n\)", str)
-        findany(r"\(kCommReduce\)\<br/\>T.Range\(0, m\)", str)
-        # Check the split node
-        findany(r"Split_Relation_1_0 +.+\>Split", str)
-        # Check all edges to/from the split node
-        findany(r"IterVar_1_1:itervar -> Split_Relation_1_0:Input", str)
-        findany(r"Split_Relation_1_0:Outer -> IterVar_1_2:itervar", str)
-        findany(r"Split_Relation_1_0:Inner -> IterVar_1_3:itervar", str)
-
-    if checkdependency():
-        verify()
-
-
-def test_schedule_tree():
-    """Tests schedule tree"""
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis("threadIdx.x")
-    n = te.var("n")
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((n, m, l), name="A")
-    B = te.compute((n, m, l), lambda bi, bj, bk: A[bi, bj, bk] + 1, name="B")
-    r = te.reduce_axis((0, m), "r")
-    C = te.compute(
-        (
-            n,
-            m,
-        ),
-        lambda ci, cj: te.sum(B[ci, cj, r], axis=r),
-        name="C",
-    )
-    s = te.create_schedule(C.op)
-    s.cache_read(A, "shared", [B])
-    s[B].vectorize(B.op.axis[-1])
-    s[C].reorder(C.op.reduce_axis[0], C.op.axis[0])
-    _, ki = s[C].split(C.op.reduce_axis[0], factor=16)
-    Cr = s.rfactor(C, ki)
-    s[Cr].compute_at(s[C], s[C].op.axis[-1])
-    s[C].bind(s[C].op.axis[0], block_x)
-    s[C].bind(s[C].op.axis[1], thread_x)
-
-    def verify():
-        # pylint: disable=import-outside-toplevel
-        from tvm.contrib import tedd
-
-        _str = tedd.viz_schedule_tree(s, False, "", True)
-        findany(r"digraph \"Schedule Tree\"", str)
-        findany(r"subgraph cluster_legend", str)
-        # Check the A_shared stage, including memory scope, itervars,
-        # and compute
-        findany(
-            r"Stage_1.*A\.shared<br/>Scope: shared.+>0.+>"
-            r"ax0.*\(kDataPar\).+>1.+ax1.*\(kDataPar\).+>2.+>ax2.*\(kDataPar\).+>"
-            r"\[A[\[\(]ax0, ax1, ax2[\)\]]\]",
-            str,
-        )
-        # Check itervars of types different from KDataPar
-        findany(r"bk.*\(kVectorized\)", str)
-        findany(r"r.outer.*\(kCommReduce\)", str)
-        findany(r"label=ROOT", str)
-        # Check the compute_at edge
-        findany(r"Stage_1.*\[color\=\"\#000000\"\]", str)
-
-    if checkdependency():
-        verify()
-
-
-@tvm.testing.requires_llvm
-def test_tedd_with_schedule_record():
-    """Test to build a nn model and check if all schedules could be generated"""
-
-    def check_schedule(executor):
-        # pylint: disable=import-outside-toplevel
-        from tvm.contrib import tedd
-
-        error = {}
-        for func_name, func_meta in executor.function_metadata.items():
-            # check converted op only
-            if "main" not in func_name:
-                primfunc = list(func_meta.relay_primfuncs.values())[0]
-                schs = primfunc.attrs["schedule"].schedule_record
-                for index in range(len(schs)):
-                    try:
-                        sch = schs[index].normalize()
-                        tedd.viz_dataflow_graph(sch, False, "", True)
-                        tedd.viz_itervar_relationship_graph(sch, False, "", True)
-                        tedd.viz_schedule_tree(sch, False, "", True)
-                    except:  # pylint: disable=W0702
-                        if func_name not in error:
-                            error[func_name] = []
-                        error[func_name].append(index)
-
-        assert not error, str(error)
-
-    if checkdependency():
-        relay_mod, params = testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-        target_llvm = tvm.target.Target("llvm")
-        config = {"te.keep_schedule_record": True}
-
-        with tvm.transform.PassContext(opt_level=3, config=config):
-            aot_executor_factory = relay.build(
-                relay_mod,
-                target_llvm,
-                runtime=Runtime("cpp"),
-                executor=Executor("aot"),
-                params=params,
-            )
-            graph_executor_factory = relay.build(
-                relay_mod,
-                target_llvm,
-                params=params,
-            )
-
-        check_schedule(aot_executor_factory)
-        check_schedule(graph_executor_factory)
-
-
-if __name__ == "__main__":
-    test_dfg()
-    test_itervar_relationship_graph()
-    test_schedule_tree()
-    test_tedd_with_schedule_record()
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
deleted file mode 100644
index ce84f804ae78..000000000000
--- a/tests/python/contrib/test_tensorrt.py
+++ /dev/null
@@ -1,1410 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import itertools
-import logging
-from typing import Tuple
-
-import numpy as np
-import pytest
-
-try:
-    # See issue #9362.
-    import torch
-except:
-    pass
-
-import tvm
-import tvm.relay.testing
-import tvm.testing
-from tvm import relay
-from tvm.contrib.download import download
-from tvm.relay import Any, GlobalVar
-from tvm.relay.expr_functor import ExprVisitor
-from tvm.relay.op.contrib import tensorrt
-
-SUPPORTED_DTYPES = ["float16", "float32"]
-
-has_tensorrt_codegen = pytest.mark.skipif(
-    not tensorrt.is_tensorrt_compiler_enabled(), reason="TensorRT codegen not available"
-)
-
-# CAUTION: Currently always false in CI since adds tens of minutes to test time and depends
-# on TensorRT installation. See https://github.com/apache/tvm/issues/11765
-has_tensorrt_runtime = pytest.mark.skipif(
-    not tensorrt.is_tensorrt_runtime_enabled(), reason="TensorRT runtime not available"
-)
-
-run_module = tvm.testing.parameter(
-    pytest.param(False, marks=[has_tensorrt_codegen, *tvm.testing.requires_cuda.marks()]),
-    pytest.param(
-        True, marks=[has_tensorrt_runtime, has_tensorrt_codegen, *tvm.testing.requires_cuda.marks()]
-    ),
-    ids=["compile", "run"],
-)
-
-
-def vmobj_to_list(o):
-    if isinstance(o, tvm.nd.NDArray):
-        return [o.numpy()]
-    elif isinstance(o, tvm.runtime.container.ADT) or isinstance(o, list):
-        return [vmobj_to_list(f) for f in o]
-    else:
-        raise RuntimeError("Unknown object type: %s" % type(o))
-
-
-def assert_result_dict_holds(result_dict, dtype="float16"):
-    for k1, k2 in itertools.combinations(result_dict, 2):
-        res1 = vmobj_to_list(result_dict[k1])
-        res2 = vmobj_to_list(result_dict[k2])
-        for r1, r2 in zip(res1, res2):
-            if dtype == "float16":
-                tvm.testing.assert_allclose(r1, r2, rtol=1e-1, atol=1e-1)
-            else:
-                tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=5e-3)
-
-
-def set_outer_func_attr(func, compile_name, symbol_name):
-    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Compiler", compile_name)
-    func = func.with_attr("global_symbol", symbol_name)
-    return func
-
-
-def set_inner_func_attr(func, pattern_name, composite_name):
-    func = func.with_attr("PartitionedFromPattern", pattern_name)
-    func = func.with_attr("Composite", composite_name)
-    return func
-
-
-def run_and_verify_func(config, target="cuda", run_module=True, data_type="float32"):
-    """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.
-
-    Parameters
-    ----------
-    config : Tuple[relay.Function, Dict[str, NDArray], List[str]]
-        A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and
-        3) A list of which vars should be considered params.
-
-    run_module: bool
-
-        If True, the built module will be run after being compiled.
-
-    data_type: str
-        Check between single and double floating precision
-    """
-    np.random.seed(42)
-    f, input_shapes, is_param = config
-    params = {
-        x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype=data_type) for x in is_param
-    }
-    input_dict = {
-        k: np.random.uniform(-1, 1, v).astype(dtype=data_type)
-        for k, v in input_shapes.items()
-        if k not in is_param
-    }
-    dev = tvm.device(target)
-
-    result_dict = dict()
-    for mode in ["vm", "graph"]:
-        for use_trt in [True, False]:
-            mod = tvm.IRModule()
-            mod["main"] = f
-            result_key = mode + ("_trt" if use_trt else "")
-            if use_trt:
-                use_fp16 = data_type == "float16"
-                trt_target = tvm.target.Target(f"tensorrt -use_fp16={use_fp16}")
-                mod = relay.transform.InferType()(mod)
-                mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target)
-                with tvm.transform.PassContext(opt_level=3):
-                    func = relay.create_executor(
-                        mode, mod=mod, device=dev, target=[target, trt_target]
-                    ).evaluate()
-            else:
-                mod = relay.transform.InferType()(mod)
-                with tvm.transform.PassContext(opt_level=3):
-                    func = relay.create_executor(
-                        mode, mod=mod, device=dev, target=target
-                    ).evaluate()
-
-            if run_module:
-                result_dict[result_key] = func(**input_dict, **params)
-
-            if run_module:
-                assert_result_dict_holds(result_dict, data_type)
-
-
-def test_tensorrt_simple(run_module):
-    for dtype in SUPPORTED_DTYPES:
-        xshape = (1, 3, 2, 2)
-        yshape = (1, 3, 1, 1)
-        zshape = (1, 1, 1, 1)
-        x = relay.var("x", shape=(xshape), dtype=dtype)
-        y = relay.var("y", shape=(yshape), dtype=dtype)
-        z = relay.var("z", shape=(zshape), dtype=dtype)
-        w = z * (x + y)
-        out = relay.nn.relu(w)
-        f = relay.Function([x, y, z], out)
-        x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
-        y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
-        z_data = np.random.uniform(-1, 1, zshape).astype(dtype)
-
-        result_dict = dict()
-        for mode in ["vm", "graph"]:
-            for use_trt in [False, True]:
-                mod = tvm.IRModule()
-                mod["main"] = f
-                result_key = mode + ("_trt" if use_trt else "")
-                if use_trt:
-                    mod = relay.transform.InferType()(mod)
-                    mod = tensorrt.partition_for_tensorrt(mod)
-                    with tvm.transform.PassContext(opt_level=3):
-                        func = relay.create_executor(
-                            mode, mod=mod, device=tvm.cuda(0), target="cuda"
-                        ).evaluate()
-                else:
-                    mod = relay.transform.InferType()(mod)
-                    with tvm.transform.PassContext(opt_level=3):
-                        func = relay.create_executor(
-                            mode, mod=mod, device=tvm.cuda(0), target="cuda"
-                        ).evaluate()
-                if run_module:
-                    result_dict[result_key] = func(x_data, y_data, z_data)
-
-        if run_module:
-            assert_result_dict_holds(result_dict)
-
-
-def test_tensorrt_simple_cpu_io(run_module):
-    def get_graph():
-        dtype = "float32"
-        x_shape = (1, 3, 2, 2)
-        y_shape = (1, 3, 1, 1)
-        z_shape = (1, 1, 1, 1)
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        y = relay.var("y", shape=(y_shape), dtype=dtype)
-        z = relay.var("z", shape=(z_shape), dtype=dtype)
-        w = z * (x + y)
-        out = relay.nn.relu(w)
-        f = relay.Function([x, y, z], out)
-        return f, {"x": x_shape, "y": y_shape, "z": z_shape}, ["y"]
-
-    run_and_verify_func(get_graph(), target="llvm", run_module=run_module)
-
-
-def test_tensorrt_not_compatible(run_module):
-    dtype = "float32"
-    xshape = (1, 32, 14, 14)
-    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
-
-    x = relay.var("x", shape=(xshape), dtype=dtype)
-    y = relay.add(x, x)
-    z = relay.cast(relay.cast(y, "int32"), "float32")
-    out = relay.nn.relu(z)
-    f = relay.Function([x], out)
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = tensorrt.partition_for_tensorrt(mod)
-    for mode in ["graph", "vm"]:
-        with tvm.transform.PassContext(opt_level=3):
-            func = relay.create_executor(
-                mode, mod=mod, device=tvm.cuda(0), target="cuda"
-            ).evaluate()
-            if run_module:
-                results = func(x_data)
-
-
-def test_conv1d(run_module):
-    def get_graph(
-        x_shape=((1, 3, 224)),
-        k_shape=(10, 3, 3),
-        groups=1,
-        padding=(1, 1),
-        strides=(1),
-        dilation=(1),
-        channels=None,
-        d_type="float16",
-    ):
-        x = relay.var("x", shape=(x_shape), dtype=d_type)
-        kernel = relay.var("kernel", shape=(k_shape), dtype=d_type)
-        out = relay.nn.conv1d(
-            x,
-            kernel,
-            kernel_size=k_shape[2:3],
-            groups=groups,
-            padding=padding,
-            strides=strides,
-            dilation=dilation,
-            channels=channels,
-            out_dtype="float16",
-        )
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    for d_type in ["float16"]:
-        run_and_verify_func(
-            get_graph(channels=10, d_type=d_type), run_module=run_module, data_type=d_type
-        )
-
-
-def test_conv2d(run_module):
-    def get_graph(
-        x_shape=(1, 32, 8, 8),
-        k_shape=(16, 32, 3, 3),
-        groups=1,
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        channels=None,
-        data_type="float16",
-    ):
-        x = relay.var("x", shape=(x_shape), dtype=data_type)
-        kernel = relay.var("kernel", shape=(k_shape), dtype=data_type)
-        out = relay.nn.conv2d(
-            x,
-            kernel,
-            kernel_size=k_shape[2:4],
-            groups=groups,
-            padding=padding,
-            strides=strides,
-            dilation=dilation,
-            channels=channels,
-            out_dtype=data_type,
-        )
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32)]:
-        for padding in [(0, 0), (1, 1)]:
-            for strides in [(1, 1), (2, 2)]:
-                for dilation in [(1, 1), (2, 2)]:
-                    run_and_verify_func(
-                        get_graph(
-                            k_shape=k_shape,
-                            groups=groups,
-                            padding=padding,
-                            strides=strides,
-                            dilation=dilation,
-                        ),
-                        run_module=run_module,
-                        data_type="float16",
-                    )
-    run_and_verify_func(
-        get_graph(
-            (1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24, data_type="float16"
-        ),
-        run_module=run_module,
-        data_type="float16",
-    )
-
-    run_and_verify_func(
-        get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1, data_type="float32"),
-        run_module=run_module,
-        data_type="float32",
-    )
-
-
-def test_conv2d_nhwc(run_module):
-    def get_graph(x_shape=(1, 8, 8, 32), k_shape=(3, 3, 32, 16)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        out = relay.nn.conv2d(
-            x, kernel, channels=16, kernel_size=(3, 3), data_layout="NHWC", kernel_layout="HWIO"
-        )
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-
-
-def test_conv2d_weights_const(run_module):
-    def get_graph(
-        x_shape=(1, 32, 8, 8),
-        k_shape=(16, 32, 3, 3),
-        groups=1,
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_type="float16",
-    ):
-        x = relay.var("x", shape=(x_shape), dtype=data_type)
-        kernel = relay.const(np.ones(k_shape).astype(dtype=data_type))
-        out = relay.nn.conv2d(
-            x,
-            kernel,
-            channels=k_shape[0],
-            kernel_size=k_shape[2:4],
-            groups=groups,
-            padding=padding,
-            strides=strides,
-            dilation=dilation,
-        )
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for tp in ["float16"]:
-        run_and_verify_func(get_graph(data_type=tp), run_module=run_module, data_type=tp)
-
-
-def test_conv2d_weights_transposed(run_module):
-    def get_graph(x_shape=(1, 32, 9, 9), k_shape=(3, 3, 32, 16), order=(3, 2, 0, 1)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        kernel_t = relay.transpose(kernel, order)
-        # Conv2d requires constant weights in TensorRT, so the weights should be transposed by
-        # FoldConstant.
-        out = relay.nn.conv2d(x, kernel_t, channels=k_shape[order[0]], kernel_size=(3, 3))
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-
-
-def test_dense(run_module):
-    def get_graph(x_shape=(1, 16), k_shape=(32, 16), dtp="float16"):
-        x = relay.var("x", shape=(x_shape), dtype=dtp)
-        kernel = relay.var("kernel", shape=(k_shape), dtype=dtp)
-        # Dense requires constant weights in TensorRT, so the weights are transposed by us.
-        out = relay.nn.dense(x, kernel, units=k_shape[0])
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    for tp in ["float32"]:
-        run_and_verify_func(get_graph(dtp=tp), run_module=run_module, data_type=tp)
-        run_and_verify_func(get_graph(k_shape=(1, 16), dtp=tp), run_module=run_module, data_type=tp)
-
-
-def test_batch_matmul(run_module):
-    def get_graph(x_shape=(12, 128, 64), y_shape=(12, 128, 64), transa=False, transb=True):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        y = relay.var("y", shape=(y_shape), dtype="float32")
-        out = relay.nn.batch_matmul(x, y, transpose_a=transa, transpose_b=transb)
-        f = relay.Function([x, y], out)
-        return f, {"x": x_shape, "y": y_shape}, []
-
-    run_and_verify_func(
-        get_graph(x_shape=(12, 64, 128), y_shape=(12, 128, 64), transa=True, transb=True),
-        run_module=run_module,
-    )
-    run_and_verify_func(
-        get_graph(x_shape=(12, 64, 128), y_shape=(12, 64, 128), transa=True, transb=False),
-        run_module=run_module,
-    )
-    run_and_verify_func(
-        get_graph(x_shape=(12, 128, 64), y_shape=(12, 128, 64), transa=False, transb=True),
-        run_module=run_module,
-    )
-    run_and_verify_func(
-        get_graph(x_shape=(12, 128, 64), y_shape=(12, 64, 128), transa=False, transb=False),
-        run_module=run_module,
-    )
-
-
-def test_bias_add(run_module):
-    def get_graph(x_shape=(1, 16), channels=16, axis=1):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        bias = relay.var("bias", shape=(channels,), dtype="float32")
-        out = relay.nn.bias_add(x, bias, axis)
-        f = relay.Function([x, bias], out)
-        return f, {"x": x_shape, "bias": (channels,)}, ["bias"]
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-    run_and_verify_func(get_graph((1, 6, 3, 4), 6), run_module=run_module)
-    run_and_verify_func(get_graph((1, 6, 3, 4), 4, -1), run_module=run_module)
-
-
-def test_pool2d(run_module):
-    def get_graph(
-        op,
-        x_shape=(1, 3, 32, 32),
-        pool_size=(2, 2),
-        strides=(2, 2),
-        padding=(0, 0),
-        ceil_mode=False,
-        count_include_pad=None,
-    ):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        if count_include_pad is not None:
-            out = op(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                ceil_mode=ceil_mode,
-                count_include_pad=count_include_pad,
-            )
-        else:
-            out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for pool_size in [(2, 2), (3, 3)]:
-        for strides in [(1, 1), (2, 2)]:
-            for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]:
-                for ceil_mode in [False, True]:
-                    # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling"
-                    if pool_size == (2, 2) and padding == (0, 0, 1, 1):
-                        continue
-                    for count_include_pad in [False, True]:
-                        # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding"
-                        if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)):
-                            continue
-                        run_and_verify_func(
-                            get_graph(
-                                relay.nn.avg_pool2d,
-                                pool_size=pool_size,
-                                strides=strides,
-                                padding=padding,
-                                ceil_mode=ceil_mode,
-                                count_include_pad=count_include_pad,
-                            ),
-                            run_module=run_module,
-                        )
-                    run_and_verify_func(
-                        get_graph(
-                            relay.nn.max_pool2d,
-                            pool_size=pool_size,
-                            strides=strides,
-                            padding=padding,
-                            ceil_mode=ceil_mode,
-                        ),
-                        run_module=run_module,
-                    )
-
-
-def test_global_pool2d(run_module):
-    def get_graph(op, x_shape=(1, 3, 32, 32)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = op(x)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(relay.nn.global_max_pool2d), run_module=run_module)
-    run_and_verify_func(get_graph(relay.nn.global_avg_pool2d), run_module=run_module)
-
-
-def test_batch_flatten(run_module):
-    def get_graph(x_shape=(1, 3, 4, 6), data_type="float16"):
-        x = relay.var("x", shape=(x_shape), dtype=data_type)
-        out = relay.nn.batch_flatten(x)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for dtp in ["float16", "float32"]:
-        run_and_verify_func(get_graph(data_type=dtp), run_module=run_module, data_type=dtp)
-
-
-def test_expand_dims(run_module):
-    def get_graph(x_shape=(1, 3), axis=1, num_newaxis=1):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.expand_dims(x, axis, num_newaxis)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-
-
-def test_squeeze(run_module):
-    def get_graph(x_shape, axis, dtype):
-        x = relay.var("x", shape=(x_shape), dtype=dtype)
-        out = relay.squeeze(x, axis=axis)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for dtype in SUPPORTED_DTYPES:
-        run_and_verify_func(
-            get_graph((1, 5, 1, 1), (2, 3), dtype=dtype), run_module=run_module, data_type=dtype
-        )
-        run_and_verify_func(
-            get_graph((1, 3, 1), (-1,), dtype=dtype), run_module=run_module, data_type=dtype
-        )
-
-
-def test_concatenate(run_module):
-    def get_graph(input_shapes, axis):
-        concat_inputs = []
-        shapes_dict = {}
-        for i in range(len(input_shapes)):
-            name = "input_{}".format(i)
-            concat_inputs.append(relay.var(name, shape=(input_shapes[i]), dtype="float32"))
-            shapes_dict[name] = input_shapes[i]
-        out = relay.concatenate(concat_inputs, axis)
-        f = relay.Function(concat_inputs, out)
-        return f, shapes_dict, []
-
-    run_and_verify_func(get_graph([(1, 2, 6, 6), (1, 3, 6, 6)], axis=1), run_module=run_module)
-
-
-def test_split(run_module):
-    def get_graph(x_shape, indices_or_sections, axis):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.split(x, indices_or_sections=indices_or_sections, axis=axis)
-        f = relay.Function([x], out.astuple())
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph((1, 16), indices_or_sections=2, axis=1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 16), indices_or_sections=4, axis=1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 16), indices_or_sections=[8], axis=1), run_module=run_module)
-    run_and_verify_func(
-        get_graph((1, 16), indices_or_sections=[2, 3, 6, 10, 14], axis=1), run_module=run_module
-    )
-
-
-def test_conv2d_transpose(run_module):
-    def get_graph(
-        x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), groups=1, padding=(0, 0), strides=(1, 1)
-    ):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        out = relay.nn.conv2d_transpose(
-            x,
-            kernel,
-            channels=k_shape[1],
-            kernel_size=k_shape[2:4],
-            groups=groups,
-            padding=padding,
-            strides=strides,
-        )
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    for padding in [(0, 0), (1, 1)]:
-        for strides in [(1, 1), (2, 2)]:
-            run_and_verify_func(get_graph(padding=padding, strides=strides), run_module=run_module)
-
-
-def test_reshape(run_module):
-    def get_graph(x_shape, new_shape):
-        x = relay.var("x", shape=(x_shape), dtype="float16")
-        out = relay.reshape(x, new_shape)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(
-        get_graph((1, 1, 1, 10), (-1, 10)), run_module=run_module, data_type="float16"
-    )
-    run_and_verify_func(
-        get_graph((1, 10, 2, 3), (1, -1)), run_module=run_module, data_type="float16"
-    )
-    run_and_verify_func(get_graph((1, 1, 2, 3), (1, 6)), run_module=run_module, data_type="float16")
-
-
-class AreOpsOnGraph(ExprVisitor):
-    """
-    Visits the Graph recursively and checks if it contains ops in the op_list
-    """
-
-    def __init__(self, op_list):
-        ExprVisitor.__init__(self)
-        self.op_list = op_list
-        self.on_graph = False
-
-    def visit_call(self, call):
-        if isinstance(call.op, tvm.tir.op.Op):
-            if str(call.op.name) in self.op_list:
-                self.on_graph = True
-
-        return super().visit_call(call)
-
-    def are_ops_on_graph(self, subgraph) -> bool:
-        """
-        This function recursively visits the graph and checks if op_list ops are ongraph"
-        """
-        self.visit(subgraph)
-        return self.on_graph
-
-
-def are_ops_on_trt(mod, op_list):
-    op_on_trt = False
-    op_on_tvm = False
-    for subgraph in mod.get_global_vars():
-        name = subgraph.name_hint
-        if mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt":
-            op_on_trt |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body)
-        else:
-            op_on_tvm |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body)
-
-    return op_on_trt and not op_on_tvm
-
-
-def test_dynamic_reshape(run_module):
-    def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt):
-        result_arr = [{} for _ in range(len(x_data_list))]
-        for use_trt in [True, False]:
-            x = relay.var("x", shape=x_shape, dtype="float32")
-            out = relay.reshape(x, new_shape)
-            f = relay.Function([x], out)
-            mod = tvm.IRModule()
-            mod["main"] = f
-            if use_trt:
-                logging.info("Before partitioning:\n%s", mod)
-                mod = tensorrt.partition_for_tensorrt(mod)
-                logging.info("After partitioning:\n%s", mod)
-                assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt
-            if run_module:
-                with relay.build_config(opt_level=3):
-                    func = relay.create_executor(
-                        "vm", mod=mod, device=tvm.cpu(0), target="llvm"
-                    ).evaluate()
-
-                for i, x_data in enumerate(x_data_list):
-                    result_arr[i][use_trt] = func(x_data)
-
-        if run_module:
-            for i in range(len(x_data_list)):
-                assert_result_dict_holds(result_arr[i])
-
-    dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2]
-    x_shape = (relay.Any(), 3, 2, 3)
-    x_data_list = [
-        np.ones([dim_value] + list(x_shape)[1:]).astype("float32") for dim_value in dim_values
-    ]
-    new_shape = (-1, 3, 2, 3)
-    should_offload_to_trt = True
-    test_run(x_data_list, x_shape, new_shape, should_offload_to_trt)
-
-    dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2]
-    x_shape = (relay.Any(), 3, 2, 3)
-    x_data_list = [
-        np.ones([dim_value] + list(x_shape)[1:]).astype("float32") for dim_value in dim_values
-    ]
-    new_shape = (-1, 1, 2, 3)
-    should_offload_to_trt = False
-    test_run(x_data_list, x_shape, new_shape, should_offload_to_trt)
-
-    dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2]
-    x_shape = (1, relay.Any(), 2, 3)
-    x_data_list = [
-        np.ones(list(x_shape[:1]) + [dim_value] + list(x_shape)[2:]).astype("float32")
-        for dim_value in dim_values
-    ]
-    new_shape = (1, -1, 2, 3)
-    should_offload_to_trt = False
-    test_run(x_data_list, x_shape, new_shape, should_offload_to_trt)
-
-
-def test_transpose(run_module):
-    def get_graph(x_shape, order):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.transpose(x, order)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph((1, 16, 7, 7), [0, 2, 3, 1]), run_module=run_module)
-    run_and_verify_func(get_graph((1, 7, 7, 16), [0, 3, 1, 2]), run_module=run_module)
-
-
-def test_float_const(run_module):
-    def get_graph(x_shape=(1, 16)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        beta = relay.const(1, dtype="float32")
-        out = relay.multiply(x, beta)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module, data_type="float32")
-
-
-def test_float_const16(run_module):
-    def get_graph(x_shape=(1, 16)):
-        x = relay.var("x", shape=(x_shape), dtype="float16")
-        beta = relay.const(1, dtype="float16")
-        out = relay.multiply(x, beta)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module, data_type="float16")
-
-
-def test_pad(run_module):
-    def get_graph(x_shape, pad_width):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.nn.pad(x, pad_width=pad_width)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(
-        get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [0, 0], [0, 0]]), run_module=run_module
-    )
-    run_and_verify_func(
-        get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [1, 1], [1, 1]]), run_module=run_module
-    )
-    run_and_verify_func(
-        get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [0, 1], [2, 0]]), run_module=run_module
-    )
-    run_and_verify_func(
-        get_graph((1, 8, 3, 16, 16), [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]),
-        run_module=run_module,
-    )
-
-
-def test_add(run_module):
-    def get_graph(x_shape):
-        x = relay.var("x", shape=(x_shape), dtype="float16")
-        y = relay.var("y", shape=(x_shape), dtype="float16")
-        out = relay.add(x, y)
-        f = relay.Function([x, y], out)
-        return f, {"x": x_shape, "y": x_shape}, []
-
-    run_and_verify_func(get_graph((1, 1000)), run_module=run_module, data_type="float16")
-
-
-def test_softmax(run_module):
-    def get_graph(x_shape, axis, data_type="float32"):
-        x = relay.var("x", shape=(x_shape), dtype=data_type)
-        out = relay.nn.softmax(x, axis=axis)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(
-        get_graph((1, 1000), axis=1, data_type="float32"),
-        run_module=run_module,
-        data_type="float32",
-    )
-    run_and_verify_func(
-        get_graph((1, 1000), axis=-1, data_type="float32"),
-        run_module=run_module,
-        data_type="float32",
-    )
-    run_and_verify_func(
-        get_graph((1, 3, 4), axis=-2, data_type="float16"),
-        run_module=run_module,
-        data_type="float16",
-    )
-    run_and_verify_func(
-        get_graph((1, 3, 4), axis=1, data_type="float16"),
-        run_module=run_module,
-        data_type="float16",
-    )
-
-
-def test_batch_norm(run_module):
-    def get_graph(x_shape, param_shape, axis=1, epsilon=1e-5):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        beta = relay.var("beta", shape=(param_shape), dtype="float32")
-        gamma = relay.var("gamma", shape=(param_shape), dtype="float32")
-        moving_mean = relay.var("moving_mean", shape=(param_shape), dtype="float32")
-        moving_var = relay.var("moving_var", shape=(param_shape), dtype="float32")
-        out, _, _ = relay.nn.batch_norm(
-            x,
-            gamma=gamma,
-            beta=beta,
-            moving_mean=moving_mean,
-            moving_var=moving_var,
-            axis=axis,
-            center=True,
-            scale=True,
-            epsilon=epsilon,
-        )
-        f = relay.Function([x, gamma, beta, moving_mean, moving_var], out)
-        return (
-            f,
-            {
-                "x": x_shape,
-                "beta": param_shape,
-                "gamma": param_shape,
-                "moving_mean": param_shape,
-                "moving_var": param_shape,
-            },
-            ["beta", "gamma", "moving_mean", "moving_var"],
-        )
-
-    run_and_verify_func(get_graph((1, 64, 56, 56), (64,)), run_module=run_module)
-    run_and_verify_func(
-        get_graph((1, 56, 56, 64), (64,), axis=3, epsilon=1.001e-05), run_module=run_module
-    )
-    run_and_verify_func(get_graph((1, 4, 8, 4), (8,), axis=2), run_module=run_module)
-    run_and_verify_func(get_graph((1, 8, 4, 4, 4), (8,), axis=1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 4, 8, 4, 4), (8,), axis=2), run_module=run_module)
-    run_and_verify_func(get_graph((1, 4, 4, 4, 8), (8,), axis=4), run_module=run_module)
-    run_and_verify_func(get_graph((1, 8), (8,), axis=1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 3, 8), (8,), axis=2), run_module=run_module)
-
-
-def test_layer_norm(run_module):
-    def get_graph(x_shape, param_shape, axis=1, epsilon=1e-5):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        gamma = relay.var("gamma", shape=(param_shape), dtype="float32")
-        beta = relay.var("beta", shape=(param_shape), dtype="float32")
-        out = relay.nn.layer_norm(
-            x, gamma=gamma, beta=beta, axis=axis, epsilon=epsilon, center=True, scale=True
-        )
-        f = relay.Function([x, gamma, beta], out)
-        return (f, {"x": x_shape, "beta": param_shape, "gamma": param_shape}, ["beta", "gamma"])
-
-    run_and_verify_func(get_graph((1, 32, 8, 8), (32,)), run_module=run_module)
-    run_and_verify_func(
-        get_graph((1, 8, 8, 32), (32,), axis=3, epsilon=1.001e-05), run_module=run_module
-    )
-    run_and_verify_func(get_graph((1, 8), (8,), axis=1), run_module=run_module)
-
-
-def test_unary(run_module):
-    def get_graph(op, x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = op(x)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for op in [
-        relay.nn.relu,
-        relay.sigmoid,
-        relay.tanh,
-        relay.exp,
-        relay.log,
-        relay.sqrt,
-        relay.abs,
-        relay.negative,
-        relay.sin,
-        relay.cos,
-        relay.atan,
-        relay.ceil,
-        relay.floor,
-        relay.erf,
-    ]:
-        run_and_verify_func(get_graph(op), run_module=run_module)
-
-
-def test_clip(run_module):
-    def get_graph(x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype="float16")
-        out = relay.clip(x, a_min=-0.2, a_max=0.4)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module, data_type="float16")
-
-
-def test_relu(run_module):
-    def get_graph(x_shape=(1, 8, 3, 4)):
-        x = relay.var("x", shape=(x_shape), dtype="float16")
-        out = relay.nn.relu(x)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module, data_type="float16")
-
-
-def test_leaky_relu(run_module):
-    def get_graph(x_shape=(1, 8, 3, 4)):
-        x = relay.var("x", shape=(x_shape), dtype="float16")
-        out = relay.nn.leaky_relu(x, alpha=0.1)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(), run_module=run_module, data_type="float16")
-
-
-def test_binary(run_module):
-    def get_graph(op, x_shape, y_shape, y_is_const=False, d_type="float16"):
-        x = relay.var("x", shape=(x_shape), dtype=d_type)
-        if y_is_const:
-            y = relay.const(np.ones(y_shape).astype(d_type))
-            out = op(x, y)
-            f = relay.Function([x], out)
-            return f, {"x": x_shape}, []
-        y = relay.var("y", shape=(y_shape), dtype=d_type)
-        out = op(x, y)
-        f = relay.Function([x, y], out)
-        return f, {"x": x_shape, "y": y_shape}, []
-
-    for op in [relay.add, relay.subtract, relay.multiply, relay.divide, relay.power]:
-        for d_type in SUPPORTED_DTYPES:
-            for y_is_const in [True, False]:
-                run_and_verify_func(
-                    get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const, d_type),
-                    run_module=run_module,
-                    data_type=d_type,
-                )
-                run_and_verify_func(
-                    get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const, d_type),
-                    run_module=run_module,
-                    data_type=d_type,
-                )
-                run_and_verify_func(
-                    get_graph(op, (1, 10), (10,), y_is_const, d_type),
-                    run_module=run_module,
-                    data_type=d_type,
-                )
-                run_and_verify_func(
-                    get_graph(op, (1, 1, 1, 10), (10,), y_is_const, d_type),
-                    run_module=run_module,
-                    data_type=d_type,
-                )
-                run_and_verify_func(
-                    get_graph(op, (1, 1, 1), (3,), y_is_const, d_type),
-                    run_module=run_module,
-                    data_type=d_type,
-                )
-
-
-def test_reduce(run_module):
-    def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False, d_type="float32"):
-        x = relay.var("x", shape=(x_shape), dtype=d_type)
-        out = op(x, axis=axis, keepdims=keepdims)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for type in SUPPORTED_DTYPES:
-        for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]:
-            for keepdims in [True, False]:
-                run_and_verify_func(
-                    get_graph(op, axis=(1), keepdims=keepdims, d_type=type),
-                    run_module=run_module,
-                    data_type=type,
-                )
-                run_and_verify_func(
-                    get_graph(op, axis=(2, 3), keepdims=keepdims, d_type=type),
-                    run_module=run_module,
-                    data_type=type,
-                )
-                run_and_verify_func(
-                    get_graph(op, axis=(1, 2), keepdims=keepdims, d_type=type),
-                    run_module=run_module,
-                    data_type=type,
-                )
-                run_and_verify_func(
-                    get_graph(op, axis=(1, 2, 3), keepdims=keepdims, d_type=type),
-                    run_module=run_module,
-                    data_type=type,
-                )
-
-
-def test_strided_slice(run_module):
-    def get_graph(x_shape, begin, end, strides=None, slice_mode="size"):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        if strides:
-            out = relay.strided_slice(x, begin, end, strides, slice_mode=slice_mode)
-        else:
-            out = relay.strided_slice(x, begin, end, slice_mode=slice_mode)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for slice_mode in ["size", "end"]:
-        run_and_verify_func(
-            get_graph((1, 3, 6, 7), (0, 0, 0, 0), (1, 1, 6, 7), slice_mode=slice_mode),
-            run_module=run_module,
-        )
-        run_and_verify_func(
-            get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6], slice_mode=slice_mode),
-            run_module=run_module,
-        )
-        run_and_verify_func(
-            get_graph((2, 3, 6, 7), [0, 0, 0, 0], [-1, -1, -1, -1], slice_mode=slice_mode),
-            run_module=run_module,
-        )
-        run_and_verify_func(
-            get_graph((2, 3, 6, 7), [0, 1, 0, 0], [-1, -1, -1, -1], slice_mode=slice_mode),
-            run_module=run_module,
-        )
-        run_and_verify_func(
-            get_graph((1, 6), [0, 1], [1, 3], slice_mode=slice_mode), run_module=run_module
-        )
-
-
-def test_adaptive_pool2d(run_module):
-    def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1), data_type="float16"):
-        x = relay.var("x", shape=(x_shape), dtype=data_type)
-        out = op(x, out_size)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    for type in SUPPORTED_DTYPES:
-        run_and_verify_func(
-            get_graph(relay.nn.adaptive_max_pool2d, data_type=type),
-            run_module=run_module,
-            data_type=type,
-        )
-        run_and_verify_func(
-            get_graph(relay.nn.adaptive_avg_pool2d, data_type=type),
-            run_module=run_module,
-            data_type=type,
-        )
-
-
-def test_multiple_outputs(run_module):
-    def get_graph(d_type="float16"):
-        x = relay.var("x", shape=(1, 3), dtype=d_type)
-        y = relay.var("y", shape=(1, 3), dtype=d_type)
-        z = relay.add(x, y)
-        w = relay.add(z, y)
-        out = relay.Tuple((z, w))
-        f = relay.Function([x, y], out)
-        return f, {"x": (1, 3), "y": (1, 3)}, []
-
-    for type in SUPPORTED_DTYPES:
-        run_and_verify_func(get_graph(d_type=type), run_module=run_module, data_type=type)
-
-
-@pytest.mark.skip(reason=("Fails assert_allclose. See https://github.com/apache/tvm/issues/11765"))
-def test_conv3d(run_module):
-    def get_graph(
-        x_shape=(1, 24, 8, 8, 8),
-        k_shape=(16, 24, 3, 3, 3),
-        groups=1,
-        padding=(0, 0, 0),
-        strides=(1, 1, 1),
-        dilation=(1, 1, 1),
-    ):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        out = relay.nn.conv3d(
-            x,
-            kernel,
-            channels=k_shape[0],
-            kernel_size=k_shape[2:],
-            groups=groups,
-            padding=padding,
-            strides=strides,
-            dilation=dilation,
-        )
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-    run_and_verify_func(get_graph(padding=(0, 0, 0, 1, 1, 1)), run_module=run_module)
-
-
-def test_pool3d(run_module):
-    def get_graph(
-        op,
-        x_shape=(1, 3, 8, 32, 32),
-        pool_size=(2, 2, 2),
-        strides=(2, 2, 2),
-        padding=(0, 0, 0),
-        ceil_mode=False,
-        count_include_pad=None,
-    ):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        if count_include_pad is not None:
-            out = op(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                ceil_mode=ceil_mode,
-                count_include_pad=count_include_pad,
-            )
-        else:
-            out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    run_and_verify_func(get_graph(relay.nn.avg_pool3d), run_module=run_module)
-    run_and_verify_func(get_graph(relay.nn.max_pool3d), run_module=run_module)
-    run_and_verify_func(
-        get_graph(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1)), run_module=run_module
-    )
-    run_and_verify_func(get_graph(relay.nn.max_pool3d, strides=(1, 1, 1)), run_module=run_module)
-
-
-def test_conv3d_transpose(run_module):
-    def get_graph(
-        x_shape=(1, 32, 8, 8, 8),
-        k_shape=(32, 16, 3, 3, 3),
-        groups=1,
-        padding=(0, 0, 0),
-        strides=(1, 1, 1),
-        output_padding=(0, 0, 0),
-    ):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        out = relay.nn.conv3d_transpose(
-            x,
-            kernel,
-            channels=k_shape[1],
-            kernel_size=k_shape[2:5],
-            groups=groups,
-            padding=padding,
-            strides=strides,
-            output_padding=output_padding,
-        )
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    run_and_verify_func(get_graph(), run_module=run_module)
-    run_and_verify_func(get_graph(strides=(2, 2, 2)), run_module=run_module)
-    run_and_verify_func(
-        get_graph(strides=(2, 2, 2), output_padding=(1, 1, 1)), run_module=run_module
-    )
-
-
-@has_tensorrt_codegen
-def test_dynamic_offload():
-    """
-    This test checks for proper dynamic offloading of relay graphs. An addition between
-    the outputs of two conv2d's is performed, one of them having all static args whereas
-    the other has a arg with dynamic shape. It is expected for the TRT partitioner to
-    offload the conv2d with dynamic arg to TVM while running the other in TRT.
-    """
-
-    data_shape = (1, 32, 8, 8)
-    k_shape = (1, 32, 3, 3)
-
-    x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32")
-    y = relay.var("y", shape=(data_shape), dtype="float32")
-    kernel = relay.const(np.random.rand(*k_shape).astype("float32"))
-
-    def get_expected():
-        # Create a nested TRT function that matches the expected output
-        mod = tvm.IRModule()
-        outer_var = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32")
-        inner_var = relay.var("FunctionVar_0_0", shape=(data_shape), dtype="float32")
-        inner_body = relay.nn.conv2d(
-            inner_var, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]
-        )
-        inner_func = relay.Function([inner_var], inner_body)
-        inner_func = set_inner_func_attr(inner_func, "nn.conv2d_", "tensorrt.nn.conv2d")
-        outer_body = inner_func(outer_var)
-        outer_func = relay.Function([outer_var], outer_body)
-        outer_func = set_outer_func_attr(outer_func, "tensorrt", "tvmgen_default_tensorrt_main_0")
-        gv = GlobalVar("tvmgen_default_tensorrt_main_0")
-        mod[gv] = outer_func
-        mod = relay.transform.InferType()(mod)
-
-        # Create the main function
-        out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
-        out = relay.add(out1, gv(y))
-        f = relay.Function([x, y], out)
-        mod["main"] = f
-        mod = relay.transform.InferType()(mod)
-        return mod
-
-    # Create relay function that will be offloaded to TRT
-    out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
-    out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
-    out = relay.add(out1, out2)
-    f = relay.Function([x, y], out)
-
-    # Pass the function to TRT compilation
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = relay.transform.InferType()(mod)
-    mod_trt = tensorrt.partition_for_tensorrt(mod)
-
-    # Get the expected relay graph and compare
-    mod_exp = get_expected()
-    tvm.ir.assert_structural_equal(mod_trt, mod_exp, map_free_vars=True)
-
-
-def test_tensorrt_dynamic_batch(run_module):
-    batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2]
-    x_shape = (relay.Any(), 1, 8, 8)
-    x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32")
-    result_arr = [{} for _ in range(len(batches_to_test))]
-    for use_trt in [True, False]:
-        x = relay.var("x", shape=x_shape, dtype="float32")
-        out = relay.nn.relu(x)
-        f = relay.Function([x], out)
-        mod = tvm.IRModule()
-        mod["main"] = f
-        if use_trt:
-            mod = tensorrt.partition_for_tensorrt(mod)
-
-        if run_module:
-            with relay.build_config(opt_level=3):
-                func = relay.create_executor(
-                    "vm", mod=mod, device=tvm.cpu(0), target="llvm"
-                ).evaluate()
-            for i, batch_size in enumerate(batches_to_test):
-                result_arr[i][use_trt] = func(x_data[:batch_size, ...])
-
-    if run_module:
-        for i in range(len(batches_to_test)):
-            assert_result_dict_holds(result_arr[i])
-
-
-def test_tensorrt_dynamic_batch_conv(run_module):
-    batches_to_test = [1, 5, 1, 0, 2, 3, 0, 1, 3, 2]
-    x_shape = (relay.Any(), 32, 8, 8)
-    x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32")
-    k_shape = (16, 32, 3, 3)
-    params = {"kernel": np.random.uniform(-1, 1, k_shape).astype("float32")}
-    for use_implicit_batch in [True, False]:
-        result_arr = [{"cuda": {}, "llvm": {}} for _ in range(len(batches_to_test))]
-        for use_trt in [True, False]:
-            x = relay.var("x", shape=x_shape, dtype="float32")
-            kernel = relay.var("kernel", shape=k_shape, dtype="float32")
-            out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3), groups=1)
-            f = relay.Function([x, kernel], out)
-            mod = tvm.IRModule()
-            mod["main"] = f
-            trt_target = tvm.target.Target(f"tensorrt -use_implicit_batch={use_implicit_batch}")
-            if use_trt:
-                mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target)
-            if run_module:
-                for target in ["llvm", "cuda"]:
-                    targets = [target]
-                    if use_trt:
-                        targets.append(trt_target)
-                    with tvm.transform.PassContext(opt_level=3):
-                        func = relay.create_executor(
-                            "vm", mod=mod, device=tvm.device(target), target=targets
-                        ).evaluate()
-                    for i, batch_size in enumerate(batches_to_test):
-                        result_arr[i][target][use_trt] = func(x_data[:batch_size, ...], **params)
-        if run_module:
-            for i in range(len(batches_to_test)):
-                for target in ["llvm", "cuda"]:
-                    assert_result_dict_holds(result_arr[i][target])
-
-
-def test_maskrcnn_resnet50(run_module) -> None:
-    """
-    This function tests the working of pytorch maskrcnn with resnet50 as backbone with
-    VM and VM + TRT. Since the order of compiled model outputs is a bit different from
-    original pytorch model, it uses a custom logic for comparison check.
-    """
-    import torch
-    import torchvision
-
-    def convert_traced_model_to_vm_trt(
-        traced_module: torch.jit.TopLevelTracedModule, np_sample_input: np.ndarray, target: str
-    ) -> tvm.runtime.vm.Executable:
-        """
-        This function converts a traced pytorch model to VM + TRT.
-        """
-        input_shape = np_sample_input.shape
-        input_name = "input0"
-        shape_list = [(input_name, input_shape)]
-        mod, params = relay.frontend.from_pytorch(traced_module, shape_list)
-        trt_target = tvm.target.Target("tensorrt -remove_no_mac_subgraphs=True")
-        mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target)
-        targets = [target, trt_target]
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
-            vm_trt_exec = relay.vm.compile(mod, target=targets, params=params)
-
-        return vm_trt_exec
-
-    class TraceWrapper(torch.nn.Module):
-        """
-        This class is a wrapper over the torch module to convert the outputs into traceable form
-        """
-
-        def __init__(self, model: torch.nn.Module) -> None:
-            super().__init__()
-            self.model = model
-
-        def forward(
-            self, inp: torch.Tensor
-        ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-            out = self.model(inp)
-            return out[0]["boxes"], out[0]["scores"], out[0]["labels"], out[0]["masks"]
-
-    def get_traced_maskrcnn_model(np_sample_input: np.ndarray) -> torch.jit.TopLevelTracedModule:
-        """
-        This function takes a sample input and returns the traced maskrcnn model
-        """
-        model_func = torchvision.models.detection.maskrcnn_resnet50_fpn
-        model = TraceWrapper(model_func(pretrained=True))
-        model.eval()
-        inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=np_sample_input.shape))
-
-        with torch.no_grad():
-            out = model(inp)
-            traced_module = torch.jit.trace(model, inp)
-            traced_module.eval()
-
-        return traced_module
-
-    def get_maskrcnn_input(in_size: int) -> np.ndarray:
-        """
-        This function gets a real image with multiple objects of interest and returns it.
-        """
-        input_shape = (1, 3, in_size, in_size)
-        img_path = "test_street_small.jpg"
-        img_url = "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/detection/street_small.jpg"
-        download(img_url, img_path)
-        import cv2
-
-        img = cv2.imread(img_path).astype("float32")
-        img = cv2.resize(img, (in_size, in_size))
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-        img = np.transpose(img / 255.0, [2, 0, 1])
-        img = np.expand_dims(img, axis=0)
-
-        return img
-
-    in_size = 300
-    np_sample_input = get_maskrcnn_input(in_size)
-    traced_module = get_traced_maskrcnn_model(np_sample_input)
-    vm_trt_exec = convert_traced_model_to_vm_trt(traced_module, np_sample_input, target="llvm")
-
-    if run_module:
-        dev = tvm.cpu()
-        vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, dev)
-        vm.set_input("main", **{"input0": np_sample_input})
-        tvm_res = vm.run()
-
-        # Descending sort by scores and get the high confidence indices. In this example 9 is chosen,
-        # because this image has 9 boxes over 0.9 confidence
-        num_high_confidence_boxes = 9
-        tvm_indices = np.argsort(-1 * tvm_res[1].numpy())[:num_high_confidence_boxes]
-
-        with torch.no_grad():
-            out = traced_module(torch.Tensor(np_sample_input))
-            # Descending sort by scores and get the high confidence indices
-            pt_indices = np.argsort(-1 * out[1].numpy())[:num_high_confidence_boxes]
-
-        # [Box Tol, Score Tol, Label Tol, Mask Tol]
-        tol = [1e-1, 5e-3, 1e-5, 4e-1]
-        # Because of certain ops, there are certain minor differences in TVM outputs and PT outputs,
-        # This means that the tolerance can't be 1e-4 or 1e-5 throughout. The ideal way to get around
-        # this is to test it on an entire dataset and compare mAP with the original model.
-        # However, since that is not practically possible on CI, the following compromise is made.
-        # These tolerances are chosen based on their impact or lack thereof to the mAP score, e.g:
-        # 0.1 pixel difference of a box in a 300X300 image wont make any change.
-        for i, tol_val in zip(range(4), tol):
-            np.testing.assert_allclose(
-                tvm_res[i].numpy()[tvm_indices],
-                out[i].numpy()[pt_indices],
-                rtol=tol_val,
-                atol=tol_val,
-            )
-
-
-def test_empty_subgraph(run_module):
-    x_shape = (1, 3, 5)
-    mod = tvm.IRModule()
-    # Empty tensorrt subgraph.
-    var1 = relay.var("tensorrt_0_i0", shape=(x_shape), dtype="float32")
-    f1 = GlobalVar("tensorrt_0")
-    func = relay.Function([var1], var1)
-    func = set_outer_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0")
-    mod[f1] = func
-    mod = relay.transform.InferType()(mod)
-
-    # Create the main function
-    x = relay.var("x", shape=x_shape, dtype="float32")
-    out = f1(relay.nn.relu(x))
-    f = relay.Function([x], out)
-    mod["main"] = f
-
-    x_data = np.random.uniform(-1, 1, x_shape).astype("float32")
-    for mode in ["graph", "vm"]:
-        with tvm.transform.PassContext(opt_level=3):
-            func = relay.create_executor(
-                mode, mod=mod, device=tvm.cuda(0), target="cuda"
-            ).evaluate()
-            if run_module:
-                results = func(x_data)
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_tensorrt_int8_exp.py b/tests/python/contrib/test_tensorrt_int8_exp.py
deleted file mode 100644
index 8e0664d30153..000000000000
--- a/tests/python/contrib/test_tensorrt_int8_exp.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import os
-import numpy as np
-
-try:
-    # See issue #9362.
-    import torch
-except:
-    pass
-
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.contrib.download import download_testdata
-from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt
-from tvm.relay.op.contrib import tensorrt
-
-
-def skip_codegen_test():
-    """Skip test if TensorRT and CUDA codegen are not present"""
-    if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist:
-        print("Skip because CUDA is not enabled.")
-        return True
-    if not tensorrt.is_tensorrt_compiler_enabled():
-        print("Skip because TensorRT compiler is not available.")
-        return True
-    print("TensorRT compiler is available!")
-    return False
-
-
-def skip_runtime_test():
-    if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist:
-        print("Skip because CUDA is not enabled.")
-        return True
-    if not tensorrt.is_tensorrt_runtime_enabled():
-        print("Skip because TensorRT runtime is not available.")
-        return True
-    print("TensorRT runtime is available!")
-    return False
-
-
-def test_trt_int8():
-    """
-    This Function is used to use tensorrt int8 to compile a resnet34 model,
-    and compare cosine distance between the output of the original model and trt int8 tvm output
-
-    """
-    if skip_codegen_test() or skip_runtime_test():
-        return
-
-    try:
-        from PIL import Image
-        from scipy.spatial import distance
-    except:
-        print("please install scipy and Image python packages")
-        return
-
-    try:
-        import torch
-        import torchvision
-        from torchvision import transforms
-    except:
-        print("please install pytorch python package")
-        return
-
-    os.environ["TVM_TENSORRT_USE_INT8"] = "1"
-    os.environ["TENSORRT_NUM_CALI_INT8"] = "10"
-    model_name = "resnet34"
-    model = getattr(torchvision.models, model_name)(pretrained=True)
-    model = model.eval()
-
-    # We grab the TorchScripted model via tracing
-    input_shape = [1, 3, 224, 224]
-    input_data = torch.randn(input_shape)
-    scripted_model = torch.jit.trace(model, input_data).eval()
-
-    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
-    img_path = download_testdata(img_url, "cat.png", module="data")
-    img = Image.open(img_path).resize((224, 224))
-    my_preprocess = transforms.Compose(
-        [
-            transforms.Resize(256),
-            transforms.CenterCrop(224),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-        ]
-    )
-    img = my_preprocess(img)
-    img = np.expand_dims(img, 0)
-
-    input_name = "input0"
-    shape_list = [(input_name, img.shape)]
-    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
-
-    # compile the model
-    target = "cuda"
-    dev = tvm.cuda()
-    mod = partition_for_tensorrt(mod, params)
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target, params=params)
-
-    gen_module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    num_cali_int8 = int(os.environ["TENSORRT_NUM_CALI_INT8"])
-    if num_cali_int8 != 0:
-        print("start calibrating data ... ")
-        for i in range(num_cali_int8):
-            tvm_data = tvm.nd.array(img)
-            gen_module.set_input(input_name, tvm_data)
-            gen_module.run(data=tvm_data)
-        print("finished calibrating data ... ")
-
-    # get output of tvm model
-    print("rebuild engine and test to run ... ")
-    tvm_data = tvm.nd.array(img)
-    gen_module.set_input(input_name, tvm_data)
-    gen_module.run(data=tvm_data)
-    out = gen_module.get_output(0)
-
-    # check output of tvm and output of pytorch model are equal
-    torch_data = torch.from_numpy(img)
-    model = scripted_model.eval()
-    torch_output = model(torch_data)
-
-    cosine_distance_res = distance.cosine(out.numpy(), torch_output.detach().cpu().numpy())
-    assert cosine_distance_res <= 0.01
-
-    # Evaluate
-    print("Evaluate inference time cost...")
-    ftimer = gen_module.module.time_evaluator("run", dev, repeat=10, min_repeat_ms=500)
-    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
-    message = "Mean inference time (std dev): %.2f ms (%.2f ms)" % (
-        np.mean(prof_res),
-        np.std(prof_res),
-    )
-    print(message)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py
deleted file mode 100644
index 27ad70a9c894..000000000000
--- a/tests/python/contrib/test_thrust.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Configure pytest"""
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.topi.cuda import stable_sort_by_key_thrust
-from tvm.topi.cuda.scan import exclusive_scan, scan_thrust, schedule_scan
-from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
-
-
-thrust_check_func = {"cuda": can_use_thrust, "rocm": can_use_rocthrust}
-
-
-def test_stable_sort_by_key():
-    """Tests function test_stable_sort_by_key"""
-    size = 6
-    keys = te.placeholder((size,), name="keys", dtype="int32")
-    values = te.placeholder((size,), name="values", dtype="int32")
-
-    keys_out, values_out = stable_sort_by_key_thrust(keys, values)
-
-    for target in ["cuda", "rocm"]:
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            continue
-
-        with tvm.target.Target(target + " -libs=thrust") as tgt:
-            if not thrust_check_func[target](tgt, "tvm.contrib.thrust.stable_sort_by_key"):
-                print("skip because thrust is not enabled...")
-                return
-
-            dev = tvm.device(target, 0)
-            s = te.create_schedule([keys_out.op, values_out.op])
-            f = tvm.build(s, [keys, values, keys_out, values_out], target)
-
-            keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
-            values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
-            keys_np_out = np.zeros(keys_np.shape, np.int32)
-            values_np_out = np.zeros(values_np.shape, np.int32)
-            keys_in = tvm.nd.array(keys_np, dev)
-            values_in = tvm.nd.array(values_np, dev)
-            keys_out = tvm.nd.array(keys_np_out, dev)
-            values_out = tvm.nd.array(values_np_out, dev)
-            f(keys_in, values_in, keys_out, values_out)
-
-            ref_keys_out = np.sort(keys_np)
-            ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
-            tvm.testing.assert_allclose(keys_out.numpy(), ref_keys_out, rtol=1e-5)
-            tvm.testing.assert_allclose(values_out.numpy(), ref_values_out, rtol=1e-5)
-
-
-def test_exclusive_scan():
-    """Tests function test_exclusive_scan"""
-    for target in ["cuda", "rocm"]:
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            continue
-
-        with tvm.target.Target(target + " -libs=thrust") as tgt:
-            if not thrust_check_func[target](tgt, "tvm.contrib.thrust.sum_scan"):
-                print("skip because thrust is not enabled...")
-                return
-
-            for ishape in [(10,), (10, 10), (10, 10, 10)]:
-                values = te.placeholder(ishape, name="values", dtype="int32")
-
-                scan, reduction = exclusive_scan(values, return_reduction=True)
-                s = schedule_scan([scan, reduction])
-
-                dev = tvm.device(target, 0)
-                f = tvm.build(s, [values, scan, reduction], target)
-
-                values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
-                values_np_out = np.zeros(values_np.shape, np.int32)
-
-                if len(ishape) == 1:
-                    reduction_shape = ()
-                else:
-                    reduction_shape = ishape[:-1]
-
-                reduction_np_out = np.zeros(reduction_shape, np.int32)
-
-                values_in = tvm.nd.array(values_np, dev)
-                values_out = tvm.nd.array(values_np_out, dev)
-                reduction_out = tvm.nd.array(reduction_np_out, dev)
-                f(values_in, values_out, reduction_out)
-
-                ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np
-                tvm.testing.assert_allclose(values_out.numpy(), ref_values_out, rtol=1e-5)
-                ref_reduction_out = np.sum(values_np, axis=-1)
-                tvm.testing.assert_allclose(reduction_out.numpy(), ref_reduction_out, rtol=1e-5)
-
-
-def test_inclusive_scan():
-    """Tests function test_inclusive_scan"""
-    out_dtype = "int64"
-
-    for target in ["cuda", "rocm"]:
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            continue
-
-        with tvm.target.Target(target + " -libs=thrust") as tgt:
-            if not thrust_check_func[target](tgt, "tvm.contrib.thrust.sum_scan"):
-                print("skip because thrust is not enabled...")
-                return
-
-            for ishape in [(10,), (10, 10)]:
-                values = te.placeholder(ishape, name="values", dtype="int32")
-
-                scan = scan_thrust(values, out_dtype, exclusive=False)
-                s = tvm.te.create_schedule([scan.op])
-
-                dev = tvm.device(target, 0)
-                f = tvm.build(s, [values, scan], target)
-
-                values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
-                values_np_out = np.zeros(values_np.shape, out_dtype)
-                values_in = tvm.nd.array(values_np, dev)
-                values_out = tvm.nd.array(values_np_out, dev)
-                f(values_in, values_out)
-
-                ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype)
-                tvm.testing.assert_allclose(values_out.numpy(), ref_values_out, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_stable_sort_by_key()
-    test_exclusive_scan()
-    test_inclusive_scan()
diff --git a/tests/python/contrib/test_uma/test_partition.py b/tests/python/contrib/test_uma/test_partition.py
deleted file mode 100644
index d02903610933..000000000000
--- a/tests/python/contrib/test_uma/test_partition.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import tvm
-import tvm.relay as relay
-from tvm.relay.backend.contrib.uma import uma_available
-from tvm.relay.backend.contrib.uma.api import UMAPartitioner
-from tvm.relay.op.contrib.register import get_pattern_table
-from tvm.relay.testing import mlp, resnet
-
-pytestmark = pytest.mark.skipif(not uma_available(), reason="UMA not available")
-
-
-def test_partition_table():
-    partitioner = UMAPartitioner("test_partition")
-    assert get_pattern_table("test_partition") is None
-
-    partitioner.register()
-
-    assert get_pattern_table("test_partition") is not None
-
-
-@pytest.mark.parametrize(
-    "workload,backend,merge",
-    [
-        ("resnet", "dnnl", False),
-        ("resnet", "dnnl", True),
-        ("mlp", "dnnl", False),
-        ("mlp", "dnnl", True),
-        ("resnet", "cutlass", False),
-        ("resnet", "cutlass", True),
-        ("mlp", "cutlass", False),
-        ("mlp", "cutlass", True),
-    ],
-)
-def test_existing_pattern_tables(workload, backend, merge):
-    """Tests that uma partitioner creates the same partitions than default BYOC partitioning"""
-    partitioner = UMAPartitioner(backend, merge)
-    pattern_table = get_pattern_table(backend)
-
-    for entry in pattern_table:
-        partitioner.add_pattern(*entry)
-
-    if workload == "resnet":
-        net = resnet.get_net(1, 10)
-    elif workload == "mlp":
-        net = mlp.get_net(1, 10)
-    else:
-        assert False, f"don't know how to find workload for {workload}"
-
-    mod = tvm.ir.IRModule()
-    mod["main"] = net
-
-    partitioner.register()
-    partitioned_mod = partitioner.partition(mod)
-
-    def partition_default(mod):
-        """partitions using default BYOC flow"""
-
-        sequence = [
-            relay.transform.MergeComposite(pattern_table),
-            relay.transform.AnnotateTarget(backend),
-        ]
-
-        if merge:
-            sequence.append(relay.transform.MergeCompilerRegions())
-
-        sequence.append(relay.transform.PartitionGraph())
-        sequential = tvm.transform.Sequential(sequence)
-
-        return sequential(mod)
-
-    default_partitioned_mod = partition_default(mod)
-
-    assert len(partitioned_mod.functions) == len(default_partitioned_mod.functions)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_uma/test_target.py b/tests/python/contrib/test_uma/test_target.py
deleted file mode 100644
index 1662becf088d..000000000000
--- a/tests/python/contrib/test_uma/test_target.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from typing import Union
-
-import pytest
-import tvm
-from tests.python.contrib.test_uma.test_uma_vanilla_accelerator import VanillaAcceleratorBackend
-from tvm.relay.backend.contrib.uma import uma_available
-
-pytestmark = pytest.mark.skipif(not uma_available(), reason="UMA not available")
-
-
-@pytest.mark.parametrize(
-    "target_name,target_attrs,target_args",
-    [
-        ("my_hwa", {}, {}),
-        (
-            "my_hwa2",
-            {
-                "local_memory_size": 128 * 1024,
-                "variant": "version1",
-            },
-            {"local_memory_size": 256 * 1024, "variant": "version2"},
-        ),
-    ],
-)
-def test_uma_target(target_name, target_attrs, target_args):
-    registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget")
-    registration_func(target_name, target_attrs)
-
-    # Test Defaults
-    my_target = tvm.target.Target(target_name)
-
-    assert str(my_target.kind) == target_name
-
-    for attr in target_attrs.keys():
-        assert my_target.attrs[attr] == target_attrs[attr]
-
-    # Test with parameters overwritten
-    args = " ".join((f"--{k}={v}" for k, v in target_args.items()))
-    my_target = tvm.target.Target(f"{target_name} {args}")
-
-    for attr in target_args.keys():
-        assert my_target.attrs[attr] == target_args[attr]
-
-
-@pytest.mark.parametrize(
-    "attr_name, target_attr",
-    [
-        ("float_attr", 3.14),
-        ("none_attr", None),
-        ("model", "my_model"),
-    ],
-)
-def test_invalid_attr_option(attr_name: str, target_attr: Union[str, int, bool, float, None]):
-    registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget")
-    if target_attr is None:
-        # None cannot be caught as TVMError, as it causes a SIGKILL, therefore it must be prevented to be
-        # entered into relay.backend.contrib.uma.RegisterTarget at Python level.
-        with pytest.raises(ValueError, match=r"Target attribute None is not supported."):
-            uma_backend = VanillaAcceleratorBackend()
-            uma_backend._target_attrs = {attr_name: target_attr}
-            uma_backend.register()
-    elif "model" in attr_name:
-        target_name = f"{attr_name}_{target_attr}"
-        target_attr = {attr_name: target_attr}
-        with pytest.raises(tvm.TVMError, match=r"Attribute is already registered: .*"):
-            registration_func(target_name, target_attr)
-    else:
-        target_name = f"{attr_name}_{target_attr}"
-        target_attr = {attr_name: target_attr}
-        with pytest.raises(TypeError, match=r"Only String, Integer, or Bool are supported. .*"):
-            registration_func(target_name, target_attr)
-
-
-@pytest.mark.parametrize(
-    "target_name",
-    [
-        "llvm",
-        "c",
-    ],
-)
-def test_target_duplication(target_name: str):
-    with pytest.raises(tvm.TVMError, match=r"TVM UMA Error: Target is already registered: .*"):
-        registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget")
-        registration_func(target_name, {})
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_uma/test_uma_lowering_with_umalower.py b/tests/python/contrib/test_uma/test_uma_lowering_with_umalower.py
deleted file mode 100644
index d2e0af05e3ee..000000000000
--- a/tests/python/contrib/test_uma/test_uma_lowering_with_umalower.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import pathlib
-
-import tvm
-from tests.python.contrib.test_uma.test_uma_utils import _create_schedule, _generate_io_arrays
-from tvm import topi
-from apps.uma._template.passes import MyAiHwConv2dPass
-import tvm.testing
-from tvm import te
-from tvm.relay.backend.contrib.uma.api.lower import UMALower
-from tvm.relay.backend.contrib.uma.api.utils import PassPhase
-from tvm.relay.backend.contrib.uma import uma_available
-
-
-pytestmark = pytest.mark.skipif(not uma_available(), reason="UMA not available")
-
-
-def _conv2d_te_definition(shapes: dict) -> list:
-    n, w, h, ci, kw, kh, co = (
-        shapes["n"],
-        shapes["w"],
-        shapes["h"],
-        shapes["ci"],
-        shapes["kw"],
-        shapes["kh"],
-        shapes["co"],
-    )
-    ifmap = te.placeholder((n, ci, w, h), dtype="float32", name="ifmap")
-    weights = te.placeholder((co, ci, kw, kh), dtype="float32", name="weights")
-    result = topi.nn.conv2d_nchw(ifmap, weights, stride=1, padding=[kw // 2, kh // 2], dilation=1)
-    return [ifmap, weights, result]
-
-
-def _pepare_conv2d_schedule(shapes, use_external_conv2d_impl=True):
-    placeholders = _conv2d_te_definition(shapes)
-
-    apps_path = (
-        pathlib.Path(str(__file__)).parent.parent.parent.parent.parent.joinpath("apps").absolute()
-    )
-    conv2d_file = apps_path / "uma" / "_template" / "conv2dnchw.cc"
-
-    with conv2d_file.open() as f:
-        sch_tir = _create_schedule(
-            placeholders, f, use_external_conv2d_impl=use_external_conv2d_impl
-        )
-    return placeholders, sch_tir
-
-
-def _run_external_conv2d(dut_io_arrays, conv2d_shapes, target):
-    # Run conv2d with external function
-    placeholders, schedule = _pepare_conv2d_schedule(conv2d_shapes)
-
-    uma_lower = UMALower("lower_test")
-    uma_lower._tir_passes.append((PassPhase.TIR_PHASE_0, MyAiHwConv2dPass()))
-    with tvm.transform.PassContext():
-        tir_mod = uma_lower._lower_stir_to_nstir(schedule.mod["main"])
-
-    ifmap_data, weight_data, result_data = dut_io_arrays
-
-    llvm_conv2d_mod = tvm.build(tir_mod, placeholders, target=target, name="test_external_conv2d")
-    llvm_conv2d_mod(ifmap_data, weight_data, result_data)
-
-
-def _run_reference_conv2d(reference_io_arrays, conv2d_shapes, target):
-    placeholders, schedule = _pepare_conv2d_schedule(conv2d_shapes)
-    ref_mod = tvm.build(schedule.mod, placeholders, target=target, name="test_reference_conv2d")
-    ifmap, weights, result = reference_io_arrays
-    ref_mod(ifmap, weights, result)
-
-
-def _prepare_io_arrays(conv2d_shapes, dev):
-    dut_io_arrays = _generate_io_arrays(conv2d_shapes, dev)
-    _, _, ref_result = _generate_io_arrays(conv2d_shapes, dev)
-    reference_io_arrays = [dut_io_arrays[0], dut_io_arrays[1], ref_result]
-    return dut_io_arrays, reference_io_arrays
-
-
-@pytest.mark.parametrize(
-    "n, w, h, ci, kw, kh, co",
-    [
-        (1, 224, 224, 3, 3, 3, 4),
-        (1, 224, 224, 3, 5, 5, 4),
-        (1, 224, 224, 3, 7, 7, 4),
-        (1, 224, 320, 3, 7, 7, 4),
-        (1, 224, 224, 3, 7, 7, 4),
-    ],
-)
-def test_lower_with_uma(n, w, h, ci, kw, kh, co):
-    target = tvm.target.Target(target="llvm", host="llvm")
-    dev = tvm.device(target.kind.name, 0)
-    conv2d_shapes = dict(n=n, w=w, h=h, ci=ci, kw=kw, kh=kh, co=co)
-
-    dut_io_arrays, reference_io_arrays = _prepare_io_arrays(conv2d_shapes, dev)
-
-    _run_external_conv2d(dut_io_arrays, conv2d_shapes, target)
-    _run_reference_conv2d(reference_io_arrays, conv2d_shapes, target)
-
-    # compare results
-    dut_results = dut_io_arrays[2].numpy()
-    ref_results = reference_io_arrays[2].numpy()
-    tvm.testing.assert_allclose(dut_results, ref_results, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_uma/test_uma_utils.py b/tests/python/contrib/test_uma/test_uma_utils.py
deleted file mode 100644
index 933602806f0e..000000000000
--- a/tests/python/contrib/test_uma/test_uma_utils.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import io
-
-import tvm
-from tvm import topi, IRModule
-import numpy as np
-from tvm.contrib import utils, clang
-import tvm.testing
-from tvm import te
-from typing import Union
-
-
-def _create_schedule(
-    placeholder: list,
-    c_code: Union[str, io.TextIOWrapper] = "",
-    use_external_conv2d_impl: bool = True,
-):
-    # How to do the same with TE
-    # Add pragma TE
-    # s = te.create_schedule(result.op)
-    # axis = result.op.axis
-    # s[result].pragma(axis[0], "import_llvm", c_to_llvm())
-    # with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, my_ai_hw_conv2d_pass)]}):
-    #     mod = tvm.lower(s, [ifmap, weights, result], simple_mode=True)
-    #
-    # llvm_mod = tvm.build(mod, [ifmap, weights, result], target=target, name="test_external_conv2d")
-    # llvm_mod(ifmap_data, weight_data, result_data)
-    if isinstance(c_code, io.TextIOWrapper):
-        c_code_str = c_code.read()
-    elif isinstance(c_code, str):
-        c_code_str = c_code
-    else:
-        raise TypeError()
-
-    assert (
-        use_external_conv2d_impl
-        and c_code_str != ""
-        or not use_external_conv2d_impl
-        and c_code_str == ""
-    )
-
-    def _c_to_llvm(c_code: str) -> str:
-        temp = utils.tempdir()
-        ll_path = temp.relpath("conv2d.ll")
-        ll_code = clang.create_llvm([c_code], output=ll_path)
-        return ll_code
-
-    func_tir = te.create_prim_func(placeholder)
-    ir_module_from_te = IRModule({"main": func_tir})
-    sch_tir = tvm.tir.Schedule(ir_module_from_te)
-    if use_external_conv2d_impl:
-        conv2d_b = sch_tir.get_block("conv2d_nchw")
-        conv2d_l = sch_tir.get_loops(conv2d_b)
-        sch_tir.annotate(conv2d_l[0], "pragma_import_llvm", _c_to_llvm(c_code_str))
-    return sch_tir
-
-
-def _generate_io_arrays(shapes: dict, dev):
-    n, w, h, ci, kw, kh, co = (
-        shapes["n"],
-        shapes["w"],
-        shapes["h"],
-        shapes["ci"],
-        shapes["kw"],
-        shapes["kh"],
-        shapes["co"],
-    )
-
-    ifmap_data = tvm.nd.array(np.random.uniform(size=(n, ci, w, h)).astype("float32"), dev)
-    weight_data = tvm.nd.array(np.random.uniform(size=(co, ci, kh, kw)).astype("float32"), dev)
-    result_data = tvm.nd.array(np.zeros((n, co, w, h)).astype("float32"), dev)
-    return ifmap_data, weight_data, result_data
diff --git a/tests/python/contrib/test_uma/test_uma_vanilla_accelerator.py b/tests/python/contrib/test_uma/test_uma_vanilla_accelerator.py
deleted file mode 100644
index 043203e22a99..000000000000
--- a/tests/python/contrib/test_uma/test_uma_vanilla_accelerator.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""UMA testcase for the vanilla_accelerator accelerator"""
-import pytest
-
-from tvm.relay.backend.contrib.uma.api.utils import PassPhase
-from tvm.relay.backend.contrib.uma.backend import UMABackend
-from apps.uma._template.passes import (
-    MyAiHwConv2dPass as VanillaAcceleratorConv2dPass,
-)
-from apps.uma._template.codegen import gen_includes
-
-from apps.uma._template.patterns import conv2d_pattern, dense_pattern
-from tvm.relay.backend.contrib.uma import uma_available
-
-pytestmark = pytest.mark.skipif(not uma_available(), reason="UMA not available")
-
-
-class VanillaAcceleratorBackend(UMABackend):
-    """UMA backend for the VanillaAccelerator accelerator."""
-
-    def __init__(self):
-        super().__init__()
-
-        #######################################################################
-        # Relay to Relay function registration
-        #######################################################################
-        self._register_pattern("conv2d", conv2d_pattern())
-        self._register_pattern("dense", dense_pattern())
-
-        #######################################################################
-        # Relay to TIR function registration
-        #######################################################################
-        self._register_tir_pass(PassPhase.TIR_PHASE_0, VanillaAcceleratorConv2dPass())
-
-        #######################################################################
-        # TIR to runtime function registration
-        #######################################################################
-        self._register_codegen(fmt="c", includes=gen_includes)
-
-    @property
-    def target_name(self):
-        return "vanilla_accelerator"
diff --git a/tests/python/contrib/test_vitis_ai/__init__.py b/tests/python/contrib/test_vitis_ai/__init__.py
deleted file mode 100644
index c5fe1539b059..000000000000
--- a/tests/python/contrib/test_vitis_ai/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Infrastructure and tests for Vitis-AI codegen """
diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py
deleted file mode 100644
index aaeb1e5e0702..000000000000
--- a/tests/python/contrib/test_vitis_ai/infrastructure.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, W0611, C0413
-
-"""Expose Vitis-AI test functions to the Python frontend"""
-
-import sys
-import numpy as np
-
-import pytest
-
-pytest.importorskip("pyxir")
-import pyxir.contrib.target.DPUCZDX8G
-
-import tvm
-from tvm import relay
-from tvm import runtime
-from tvm.relay import transform
-from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai
-from tvm.relay.build_module import bind_params_by_name
-from tvm.contrib.target import vitis_ai
-from tvm.contrib import graph_executor
-from tvm.contrib import utils
-
-
-def get_cpu_op_count(mod):
-    """Traverse graph counting ops offloaded to TVM."""
-
-    class Counter(tvm.relay.ExprVisitor):
-        def __init__(self):
-            super().__init__()
-            self.count = 0
-
-        def visit_call(self, call):
-            if isinstance(call.op, tvm.ir.Op):
-                self.count += 1
-
-            super().visit_call(call)
-
-    c = Counter()
-    c.visit(mod["main"])
-    return c.count
-
-
-def build_module(
-    mod,
-    target,
-    dpu_target="DPUCADF8H",
-    params=None,
-    enable_vitis_ai=True,
-    tvm_ops=0,
-    vitis_ai_partitions=1,
-):
-    """Build module for Vitis-AI codegen."""
-    if isinstance(mod, tvm.relay.expr.Call):
-        mod = tvm.IRModule.from_expr(mod)
-    if params is None:
-        params = {}
-
-    with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.vitis_ai.options.target": dpu_target}
-    ):
-        if enable_vitis_ai:
-            mod = partition_for_vitis_ai(mod, params, dpu_target)
-            tvm_op_count = get_cpu_op_count(mod)
-            assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
-                tvm_op_count, tvm_ops
-            )
-            partition_count = 0
-            for global_var in mod.get_global_vars():
-                if "vitis_ai" in global_var.name_hint:
-                    partition_count += 1
-
-            assert (
-                vitis_ai_partitions == partition_count
-            ), "Got {} Vitis-AI partitions, expected {}".format(
-                partition_count, vitis_ai_partitions
-            )
-        relay.backend.te_compiler.get().clear()
-        return relay.build(mod, target, params=params)
-
-
-def update_lib(lib, cross_compile=None):
-    tmp_path = utils.tempdir()
-    lib_name = "lib.so"
-    lib_path = tmp_path.relpath(lib_name)
-    if cross_compile:
-        lib.export_library(lib_path, cc=cross_compile)
-    else:
-        lib.export_library(lib_path)
-    lib = runtime.load_module(lib_path)
-    return lib
-
-
-def extract_vitis_ai_modules(module):
-    """Get the Vits-AI runtime module from llvm module."""
-    return list(
-        filter(lambda mod: mod.type_key == "VitisAIRuntime", module.get_lib().imported_modules)
-    )
-
-
-def verify_codegen(
-    module, num_vitis_ai_modules=1, params=None, target="llvm", tvm_ops=0, dpu_target="DPUCADX8G"
-):
-    """Check Vitis-AI codegen against a known good output."""
-    module = build_module(
-        module,
-        target,
-        params=params,
-        dpu_target=dpu_target,
-        tvm_ops=tvm_ops,
-        vitis_ai_partitions=num_vitis_ai_modules,
-    )
-    vitis_ai_modules = extract_vitis_ai_modules(module)
-
-    assert len(vitis_ai_modules) == num_vitis_ai_modules, (
-        f"The number of Vitis-AI modules produced ({len(vitis_ai_modules)}) does not "
-        f"match the expected value ({num_vitis_ai_modules})."
-    )
-
-
-def verify_result(
-    mod,
-    map_inputs,
-    out_shape,
-    result,
-    tol=1e-5,
-    target="llvm",
-    device=tvm.cpu(),
-    params=None,
-    dpu_target="DPUCADX8G",
-    tvm_ops=0,
-):
-    """To check the result between reference and byoc vitis-ai flow"""
-
-    lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops)
-    lib = update_lib(lib)
-    rt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu()))
-
-    for name, data in map_inputs.items():
-        rt_mod.set_input(name, data)
-    rt_mod.set_input(**params)
-    rt_mod.run()
-
-    out_shapes = out_shape if isinstance(out_shape, list) else [out_shape]
-    results = result if isinstance(result, list) else [result]
-
-    for idx, shape in enumerate(out_shapes):
-        out = tvm.nd.empty(shape, device=device)
-        out = rt_mod.get_output(idx, out)
-        tvm.testing.assert_allclose(out.numpy(), results[idx], rtol=tol, atol=tol)
diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
deleted file mode 100644
index b4d12cf62ced..000000000000
--- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, W0611, C0413
-
-"""Vitis-AI codegen tests"""
-
-import sys
-
-import numpy as np
-import pytest
-
-pytest.importorskip("pyxir")
-import pyxir.contrib.target.DPUCADF8H
-import pyxir.contrib.target.DPUCAHX8H
-import pyxir.contrib.target.DPUCAHX8L
-import pyxir.contrib.target.DPUCVDX8G
-import pyxir.contrib.target.DPUCVDX8H
-import pyxir.contrib.target.DPUCZDX8G
-import tvm
-from tvm import relay
-from tvm.testing import requires_vitis_ai
-from tvm.contrib.target import vitis_ai
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.op.contrib.vitis_ai import annotation
-
-from .infrastructure import verify_codegen
-
-
-def set_func_attr(func, compile_name, symbol_name):
-    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Compiler", compile_name)
-    func = func.with_attr("global_symbol", symbol_name)
-    return func
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_conv2d(dpu_target):
-    """Test conv2d operator for Vitis AI DPU targets"""
-
-    x = relay.var("x", shape=(1, 3, 224, 224))
-    w = relay.const(np.zeros((16, 3, 3, 3), dtype="float32"))
-    y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3])
-    func = relay.Function([x], y)
-    params = {}
-    params["w"] = np.random.rand(16, 3, 3, 3).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, params=params, dpu_target=dpu_target, tvm_ops=2)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize("dpu_target", ["DPUCAHX8L", "DPUCZDX8G-zcu104"])
-def test_depthwise_conv(dpu_target):
-    """Test depthwise_conv operator for Vitis-AI DPUCZDX8G-zcu104 target"""
-
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    wshape = (32, 1, 3, 3)
-    data = relay.var("data", shape=(ishape), dtype=dtype)
-    weights = relay.var("weights", shape=(wshape), dtype=dtype)
-    depthwise_conv2d = relay.nn.conv2d(data, weights, kernel_size=(3, 3), padding=(1, 1), groups=32)
-    func = relay.Function([data, weights], depthwise_conv2d)
-    params = {}
-    params["weights"] = np.random.randn(32, 1, 3, 3).astype(dtype)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, params=params, dpu_target=dpu_target, tvm_ops=2)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_bias_add(dpu_target):
-    """Test bias_add operator for Vitis AI DPU targets"""
-
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    data = relay.var("data", shape=(ishape), dtype=dtype)
-    bias = relay.var("bias", relay.TensorType((32,), dtype))
-    out = relay.nn.bias_add(data, bias)
-    func = relay.Function([data, bias], out)
-    params = {}
-    params["bias"] = np.random.randn(32).astype(dtype)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, params=params, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_relu(dpu_target):
-    """Test relu operator for Vitis AI DPU targets"""
-
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.relu(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target, num_vitis_ai_modules=0, tvm_ops=1)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_batchnorm(dpu_target):
-    """Test batchnorm operator for Vitis AI DPU targets"""
-
-    data = relay.var("data", shape=(1, 16, 112, 112))
-    bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-    bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-    bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-    bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))
-    bn_output = relay.nn.batch_norm(data, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-    func = relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output[0])
-    params = {}
-    params["bn_gamma"] = np.random.rand(16).astype("float32")
-    params["bn_beta"] = np.random.rand(16).astype("float32")
-    params["bn_mean"] = np.random.rand(16).astype("float32")
-    params["bn_var"] = np.random.rand(16).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, params=params, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_add(dpu_target):
-    """Test add operator for Vitis AI DPU targets"""
-
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = x + x
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_global_avg_pool2d(dpu_target):
-    """Test global_avg_pool2d operator for Vitis AI DPU targets"""
-
-    shape = (10, 10, 7, 7)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.global_avg_pool2d(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_avg_pool2d(dpu_target):
-    """Test avg_pool2d for operator Vitis AI DPU targets"""
-
-    shape = (10, 10, 10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.avg_pool2d(x, pool_size=(3, 3))
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_max_pool2d(dpu_target):
-    """Test max_pool2d for operator Vitis AI DPU targets"""
-
-    shape = (64, 512, 10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.max_pool2d(x, pool_size=(3, 3))
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_global_max_pool2d(dpu_target):
-    """Test global_maxpool2d operator for Vitis AI DPU targets"""
-
-    shape = (1, 512, 7, 7)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.global_max_pool2d(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_upsampling(dpu_target):
-    """Test upsampling operator for Vitis AI DPU targets"""
-
-    shape = (64, 512, 10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.upsampling(x, scale_h=2, scale_w=2)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target=dpu_target)
-
-
-@pytest.mark.skip(
-    reason="I and O used to be mixed up in kernel layouts in TVM."
-    "This is fixed, but vitis needs to adopt the new convention."
-    "To change, simply remove this line:"
-    "https://github.com/Xilinx/pyxir/blob/bef661d6d77adcdbd2cf4163f2cf3a1d31d40406/"
-    "python/pyxir/frontend/tvm/relay_tools/relay_l2_convolution.py#L380"
-)
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_conv2d_transpose(dpu_target):
-    """Test conv2d_transpose operator for Vitis AI DPU targets"""
-
-    dshape = (1, 3, 18, 18)
-    kshape = (3, 10, 3, 3)
-    x = relay.var("x", shape=dshape)
-    w = relay.const(np.zeros(kshape, dtype="float32"))
-    y = relay.nn.conv2d_transpose(
-        x,
-        w,
-        channels=10,
-        kernel_size=(3, 3),
-        strides=(1, 1),
-        padding=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-    )
-    func = relay.Function([x], y)
-    params = {}
-    dtype = "float32"
-    params["w"] = np.random.uniform(size=kshape).astype(dtype)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, params=params, dpu_target=dpu_target)
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize(
-    "dpu_target",
-    ["DPUCADF8H", "DPUCAHX8H-u50", "DPUCAHX8L", "DPUCVDX8H", "DPUCVDX8G", "DPUCZDX8G-zcu104"],
-)
-def test_annotate(dpu_target):
-    """Test annotation operator for Vitis AI DPU targets"""
-
-    def partition(dpu_target):
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        conv = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-        bn_output = relay.nn.batch_norm(conv, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-
-        func = relay.Function(
-            [data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output.astuple()
-        )
-        mod = tvm.IRModule()
-        mod["main"] = func
-        params = {}
-        params["weight"] = np.random.rand(16, 3, 3, 3).astype("float32")
-        params["bn_gamma"] = np.random.rand(16).astype("float32")
-        params["bn_beta"] = np.random.rand(16).astype("float32")
-        params["bn_mean"] = np.random.rand(16).astype("float32")
-        params["bn_var"] = np.random.rand(16).astype("float32")
-        mod = annotation(mod, params, dpu_target)
-
-        opt_pass = tvm.transform.Sequential(
-            [
-                transform.MergeCompilerRegions(),
-                transform.PartitionGraph(),
-            ]
-        )
-
-        with tvm.transform.PassContext(opt_level=3):
-            mod = opt_pass(mod)
-        return mod
-
-    def expected():
-        # function variables for conv2d
-        data0 = relay.var("data0", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight0 = relay.var("weight0", relay.TensorType((16, 3, 3, 3), "float32"))
-        conv = relay.nn.conv2d(
-            data=data0, weight=weight0, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-
-        # function variables for batch_norm
-        bn_gamma0 = relay.var("bn_gamma0", relay.TensorType((16,), "float32"))
-        bn_beta0 = relay.var("bn_beta0", relay.TensorType((16,), "float32"))
-        bn_mmean0 = relay.var("bn_mean0", relay.TensorType((16,), "float32"))
-        bn_mvar0 = relay.var("bn_var0", relay.TensorType((16,), "float32"))
-        bn = relay.nn.batch_norm(conv, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0)
-        func0 = relay.Function(
-            [data0, weight0, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0], bn.astuple()
-        )
-        func0 = set_func_attr(func0, "vitis_ai", "tvmgen_default_vitis_ai_main_0")
-        gv0 = relay.GlobalVar("tvmgen_default_vitis_ai_main_0")
-        mod = tvm.IRModule()
-        mod[gv0] = func0
-        mod = relay.transform.InferType()(mod)
-
-        # main function
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))
-        call0 = gv0(data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-        mod["main"] = relay.Function([data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar], call0)
-        mod = relay.transform.InferType()(mod)
-        return mod
-
-    partitioned_mod = partition(dpu_target)
-
-    ref_mod = expected()
-
-    tvm.ir.assert_structural_equal(partitioned_mod, ref_mod, map_free_vars=True)
-
-
-if __name__ == "__main__":
-    if sys.platform == "win32":
-        print("Skip test on Windows for now")
-        sys.exit(0)
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py
deleted file mode 100644
index 4273f5fa34d5..000000000000
--- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, W0611, C0413
-
-"""Vitis-AI runtime test for CPU only part
-
-This test verifies as much as possible whether the a model can be correctly offloaded
-and executed for Vitis-AI acceleration. This entails:
-    - Annotating and partitioning model for Vitis-AI acceleration
-    - Building a Vitis-AI PyXIR runtime module with on-the-fly quantization enabled
-    - Run first iteration of on-the-fly quantization flow. This will always be run
-      on CPU as the first N (parameter) will be used for collecting calibration data
-      for quantization.
-
-NOTE This is not a full end-to-end test as we need the full Vitis-AI docker environment
-and access to an FPGA instance for that. This test verifies the Vitis-AI flow as much as
-possible without requiring access to dedicated docker environment and/or hardware setup.
-NOTE Quantization is not being tested (we need to be inside Vitis-AI docker environment
-for that) buth the internal representation used for quantization is being generated and
-functionally tested (CPU).
-"""
-
-import sys
-import numpy as np
-
-import pytest
-
-pytest.importorskip("pyxir")
-import pyxir.contrib.target.DPUCADF8H
-import pyxir.contrib.target.DPUCVDX8H
-import pyxir.contrib.target.DPUCZDX8G
-
-import tvm
-import tvm.relay.testing
-from tvm import relay
-from tvm.testing import requires_vitis_ai
-
-from .infrastructure import verify_result
-
-
-@requires_vitis_ai
-@pytest.mark.parametrize("dpu_target", ["DPUCADF8H", "DPUCVDX8H", "DPUCZDX8G-zcu104"])
-def test_extern_vitis_ai_resnet18(dpu_target):
-    """Test first part of Vitis AI on-the-fly quantization runtime with ResNet 18 model"""
-
-    dtype = "float32"
-    ishape = (1, 3, 224, 224)
-    mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1)
-    ref_mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()(
-        i_data, **params
-    )
-
-    verify_result(
-        mod,
-        {"data": i_data},
-        (1, 1000),
-        ref_res.numpy(),
-        tol=1e-5,
-        params=params,
-        dpu_target=dpu_target,
-        tvm_ops=7,
-    )
-
-
-if __name__ == "__main__":
-    if sys.platform == "win32":
-        print("Skip test on Windows for now")
-        sys.exit(0)
-    tvm.testing.main()
diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py
deleted file mode 100644
index 5f10ebb24aab..000000000000
--- a/tests/python/driver/tvmc/conftest.py
+++ /dev/null
@@ -1,302 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import pytest
-import tarfile
-import textwrap
-
-import numpy as np
-
-from PIL import Image
-
-import tvm
-from tvm import relay
-from tvm.driver import tvmc
-
-from tvm.contrib.download import download_testdata
-
-# Support functions
-
-
-def download_and_untar(model_url, model_sub_path, temp_dir):
-    model_tar_name = os.path.basename(model_url)
-    model_path = download_testdata(model_url, model_tar_name, module=["tvmc"])
-
-    if model_path.endswith("tgz") or model_path.endswith("gz") or model_path.endswith("tar"):
-        tar = tarfile.open(model_path)
-        tar.extractall(path=temp_dir)
-        tar.close()
-
-    return os.path.join(temp_dir, model_sub_path)
-
-
-# PyTest fixtures
-
-
-@pytest.fixture(scope="session")
-def tflite_mobilenet_v1_1_quant(tmpdir_factory):
-    base_url = "https://storage.googleapis.com/download.tensorflow.org/models"
-    model_url = "mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
-    model_file = download_and_untar(
-        "{}/{}".format(base_url, model_url),
-        "mobilenet_v1_1.0_224_quant.tflite",
-        temp_dir=tmpdir_factory.mktemp("data"),
-    )
-
-    return model_file
-
-
-@pytest.fixture(scope="session")
-def pb_mobilenet_v1_1_quant(tmpdir_factory):
-    base_url = "https://storage.googleapis.com/download.tensorflow.org/models"
-    model_url = "mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz"
-    model_file = download_and_untar(
-        "{}/{}".format(base_url, model_url),
-        "mobilenet_v1_1.0_224_frozen.pb",
-        temp_dir=tmpdir_factory.mktemp("data"),
-    )
-
-    return model_file
-
-
-@pytest.fixture(scope="session")
-def keras_resnet50(tmpdir_factory):
-    try:
-        from tensorflow.keras.applications.resnet50 import ResNet50
-    except ImportError:
-        # not all environments provide TensorFlow, so skip this fixture
-        # if that is that case.
-        return ""
-
-    model_file_name = "{}/{}".format(tmpdir_factory.mktemp("data"), "resnet50.h5")
-    model = ResNet50(include_top=True, weights="imagenet", input_shape=(224, 224, 3), classes=1000)
-    model.save(model_file_name)
-
-    return model_file_name
-
-
-@pytest.fixture(scope="session")
-def keras_simple(tmpdir_factory):
-    try:
-        from tensorflow import keras
-    except ImportError:
-        # not all environments provide TensorFlow, so skip this fixture
-        # if that is that case.
-        return ""
-
-    model_file_name = "{}/{}".format(tmpdir_factory.mktemp("data"), "simple_conv.h5")
-    model = keras.Sequential(
-        [
-            keras.layers.InputLayer(input_shape=[32, 32, 3], batch_size=1),
-            keras.layers.Conv2D(8, kernel_size=(3, 3)),
-            keras.layers.Flatten(),
-            keras.layers.Dense(64),
-        ]
-    )
-    model.save(model_file_name)
-
-    return model_file_name
-
-
-@pytest.fixture(scope="session")
-def pytorch_resnet18(tmpdir_factory):
-    try:
-        import torch
-        import torchvision.models as models
-    except ImportError:
-        # Not all environments provide Pytorch, so skip if that's the case.
-        return ""
-    model = models.resnet18()
-    model_file_name = "{}/{}".format(tmpdir_factory.mktemp("data"), "resnet18.pth")
-    # Trace model into torchscript.
-    traced_cpu = torch.jit.trace(model, torch.randn(1, 3, 224, 224))
-    torch.jit.save(traced_cpu, model_file_name)
-
-    return model_file_name
-
-
-@pytest.fixture(scope="session")
-def pytorch_mobilenetv2_quantized(tmpdir_factory):
-    try:
-        import torch
-        import torchvision.models as models
-    except ImportError:
-        # Not all environments provide Pytorch, so skip if that's the case.
-        return ""
-    model = models.quantization.mobilenet_v2(quantize=True)
-    model_file_name = "{}/{}".format(tmpdir_factory.mktemp("data"), "mobilenet_v2_quantized.pth")
-    # Trace model into torchscript.
-    traced_cpu = torch.jit.trace(model, torch.randn(1, 3, 224, 224))
-    torch.jit.save(traced_cpu, model_file_name)
-
-    return model_file_name
-
-
-@pytest.fixture(scope="session")
-def onnx_resnet50():
-    base_url = "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/resnet/model"
-    file_to_download = "resnet50-v2-7.onnx"
-    model_file = download_testdata(
-        "{}/{}".format(base_url, file_to_download), file_to_download, module=["tvmc"]
-    )
-
-    return model_file
-
-
-@pytest.fixture(scope="session")
-def paddle_resnet50(tmpdir_factory):
-    base_url = "https://bj.bcebos.com/x2paddle/models"
-    model_url = "paddle_resnet50.tar"
-    model_file = download_and_untar(
-        "{}/{}".format(base_url, model_url),
-        "paddle_resnet50/model.pdmodel",
-        temp_dir=tmpdir_factory.mktemp("data"),
-    )
-    return model_file
-
-
-@pytest.fixture(scope="session")
-def onnx_mnist():
-    base_url = "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/mnist/model"
-    file_to_download = "mnist-1.onnx"
-    model_file = download_testdata(
-        "{}/{}".format(base_url, file_to_download), file_to_download, module=["tvmc"]
-    )
-
-    return model_file
-
-
-@pytest.fixture
-def tflite_compile_model(tmpdir_factory):
-    """Support function that returns a TFLite compiled module"""
-
-    def model_compiler(model_file, **overrides):
-        package_path = tmpdir_factory.mktemp("data").join("mock.tar")
-        tvmc_model = tvmc.frontends.load_model(model_file)
-        args = {"target": "llvm", **overrides}
-        return tvmc.compiler.compile_model(tvmc_model, package_path=package_path, **args)
-
-    # Returns a TVMCPackage
-    return model_compiler
-
-
-@pytest.fixture
-def relay_compile_model(tmpdir_factory):
-    """Support function that returns a TFLite compiled module"""
-
-    def model_compiler(model_file, shape_dict, **overrides):
-        package_path = tmpdir_factory.mktemp("data").join("mock.tar")
-        tvmc_model = tvmc.frontends.load_model(
-            model_file, model_format="relay", shape_dict=shape_dict
-        )
-        args = {"target": "llvm", **overrides}
-        return tvmc.compiler.compile_model(tvmc_model, package_path=package_path, **args)
-
-    # Returns a TVMCPackage
-    return model_compiler
-
-
-@pytest.fixture(scope="session")
-def imagenet_cat(tmpdir_factory):
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    cat_file_name = "imagenet_cat.npz"
-
-    cat_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
-    image_path = download_testdata(cat_url, "inputs", module=["tvmc"])
-    resized_image = Image.open(image_path).resize((224, 224))
-    image_data = np.asarray(resized_image).astype("float32")
-    image_data = np.expand_dims(image_data, axis=0)
-
-    cat_file_full_path = os.path.join(tmpdir_name, cat_file_name)
-    np.savez(cat_file_full_path, input=image_data)
-
-    return cat_file_full_path
-
-
-@pytest.fixture(scope="session")
-def tflite_mobilenet_v1_0_25_128(tmpdir_factory):
-    base_url = "https://storage.googleapis.com/download.tensorflow.org/models"
-    model_url = "mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz"
-    model_file = download_and_untar(
-        "{}/{}".format(base_url, model_url),
-        "mobilenet_v1_0.25_128.tflite",
-        temp_dir=tmpdir_factory.mktemp("data"),
-    )
-
-    return model_file
-
-
-@pytest.fixture(scope="session")
-def tflite_cnn_s_quantized(tmpdir_factory):
-    base_url = "https://github.com/ARM-software/ML-zoo/raw/48a22ee22325d15d2371a6df24eb7d67e21dcc97/models/keyword_spotting/cnn_small/tflite_int8"
-    file_to_download = "cnn_s_quantized.tflite"
-    model_file = download_testdata(
-        "{}/{}".format(base_url, file_to_download), file_to_download, module=["tvmc"]
-    )
-    return model_file
-
-
-@pytest.fixture(scope="session")
-def relay_text_conv2d(tmpdir_factory):
-    file_path = os.path.join(tmpdir_factory.mktemp("model"), "relay.txt")
-
-    RELAY_MODEL = textwrap.dedent(
-        """\
-        #[version = "0.0.5"]
-        def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(3, 3, 5, 5), int8]) {
-            %1 = nn.conv2d(
-                 %data,
-                 %weight,
-                 padding=[2, 2],
-                 channels=3,
-                 kernel_size=[5, 5],
-                 data_layout="NCHW",
-                 kernel_layout="OIHW",
-                 out_dtype="int32");
-            %2 = cast(nn.max_pool2d(%1, pool_size=[3, 3]), dtype="int8");
-            %3 = nn.conv2d(
-                 %2,
-                 %weight,
-                 padding=[2, 2],
-                 channels=3,
-                 kernel_size=[5, 5],
-                 data_layout="NCHW",
-                 kernel_layout="OIHW",
-                 out_dtype="int32");
-            %4 = nn.max_pool2d(%3, pool_size=[3, 3]);
-            %4
-        }
-    """
-    )
-
-    with open(file_path, "w") as relay_text:
-        relay_text.write(RELAY_MODEL)
-    return file_path
-
-
-@pytest.fixture(scope="session")
-def relay_conv2d():
-    """
-    Simple conv2d Relay implementation.
-    """
-    dtype = "float32"
-
-    x = relay.var("x", shape=(1, 4, 2, 2), dtype=dtype)
-    weight = relay.const(np.random.uniform(size=(2, 4, 2, 2)), dtype=dtype)
-    x = relay.nn.conv2d(x, weight)
-    func = relay.Function(relay.analysis.free_vars(x), x)
-    return tvm.IRModule.from_expr(func)
diff --git a/tests/python/driver/tvmc/test_autoscheduler.py b/tests/python/driver/tvmc/test_autoscheduler.py
deleted file mode 100644
index a3a3f52d39ac..000000000000
--- a/tests/python/driver/tvmc/test_autoscheduler.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import platform
-import pytest
-import os
-
-from os import path
-
-from tvm import auto_scheduler
-from tvm.driver import tvmc
-
-
-def _get_tasks(model):
-    tvmc_model = tvmc.frontends.load_model(model)
-    tasks, weights = tvmc.autotuner.autoscheduler_get_tuning_tasks(
-        tvmc_model.mod, tvmc_model.params, "llvm"
-    )
-    return (tasks, weights)
-
-
-def _autoscheduler_test_helper(model, tmpdir_name, early_stopping=1, prior_records=None):
-    tvmc_model = tvmc.frontends.load_model(model)
-    log_file = os.path.join(tmpdir_name, "autoscheduler.json")
-
-    hardware_params = auto_scheduler.HardwareParams(num_cores=4, target="llvm")
-
-    tvmc.tune(
-        tvmc_model,
-        target="llvm",
-        tuning_records=log_file,
-        prior_records=prior_records,
-        early_stopping=early_stopping,
-        enable_autoscheduler=True,
-        trials=2,
-        hardware_params=hardware_params,
-    )
-
-    # testing whether the log file was produced
-    assert path.exists(log_file), "autoscheduler log file should exist"
-
-    with auto_scheduler.ApplyHistoryBest(log_file) as best:
-        assert isinstance(
-            best, auto_scheduler.dispatcher.ApplyHistoryBest
-        ), "unable to load the best results of tuning"
-
-    return log_file
-
-
-def test_get_tuning_tasks(keras_simple):
-    pytest.importorskip("tensorflow")
-
-    tasks, weights = _get_tasks(keras_simple)
-    expected_task_type = auto_scheduler.SearchTask
-
-    assert type(tasks) is list
-    assert len(tasks) > 0
-    assert all([type(x) is expected_task_type for x in tasks]) is True
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tune_tasks(keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _autoscheduler_test_helper(keras_simple, tmpdir_name)
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tune_tasks__tuning_records(keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    output_log_phase_1 = _autoscheduler_test_helper(keras_simple, tmpdir_name)
-
-    # Exercises transfer learning by making sure a previous log exists
-    _autoscheduler_test_helper(keras_simple, tmpdir_name, prior_records=output_log_phase_1)
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tune_tasks__no_early_stopping(keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _autoscheduler_test_helper(keras_simple, tmpdir_name, early_stopping=None)
diff --git a/tests/python/driver/tvmc/test_autotuner.py b/tests/python/driver/tvmc/test_autotuner.py
deleted file mode 100644
index ce5b888f25dc..000000000000
--- a/tests/python/driver/tvmc/test_autotuner.py
+++ /dev/null
@@ -1,309 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import platform
-import pytest
-import os
-
-from unittest import mock
-
-from os import path
-from pathlib import Path
-
-import tvm
-import tvm.testing
-from tvm import autotvm, auto_scheduler
-from tvm.driver import tvmc
-from tvm.driver.tvmc.autotuner import filter_tasks, gen_task_list
-
-
-def _get_tasks(model):
-    tvmc_model = tvmc.frontends.load_model(model)
-    return tvmc.autotuner.autotvm_get_tuning_tasks(tvmc_model.mod, tvmc_model.params, "llvm")
-
-
-def _get_measure_options():
-    return autotvm.measure_option(
-        builder=autotvm.LocalBuilder(build_func="default"), runner="local"
-    )
-
-
-def _tuner_test_helper(model, tuner_name, tmpdir_name, early_stopping=1, prior_records=None):
-    tvmc_model = tvmc.frontends.load_model(model)
-    log_file = os.path.join(tmpdir_name, "log_{}.txt".format(tuner_name))
-
-    tvmc.tune(
-        tvmc_model,
-        target="llvm",
-        tuning_records=log_file,
-        prior_records=prior_records,
-        tuner=tuner_name,
-        trials=4,
-        early_stopping=early_stopping,
-    )
-
-    # testing whether the log file was produced
-    assert path.exists(log_file), "tuning log file should exist"
-
-    with autotvm.apply_history_best(log_file) as best:
-        assert isinstance(
-            best, autotvm.task.dispatcher.ApplyHistoryBest
-        ), "unable to load the best results of tuning"
-
-    return log_file
-
-
-def test_get_tuning_tasks(onnx_mnist):
-    pytest.importorskip("onnx")
-
-    sut = _get_tasks(onnx_mnist)
-    expected_task_type = autotvm.task.Task
-
-    assert type(sut) is list
-    assert len(sut) > 0
-    assert all([type(x) is expected_task_type for x in sut]) is True
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tune_tasks__tuner__xgb(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "xgb", tmpdir_name)
-
-
-def test_tune_tasks__tuner__xgb_knob(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "xgb_knob", tmpdir_name)
-
-
-def test_tune_tasks__tuner__ga(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "ga", tmpdir_name)
-
-
-def test_tune_tasks__tuner__random(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "random", tmpdir_name)
-
-
-def test_tune_tasks__tuner__gridsearch(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "gridsearch", tmpdir_name)
-
-
-def test_tune_tasks__tuner__gridsearch__tuning_records(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    output_log_phase_1 = _tuner_test_helper(onnx_mnist, "gridsearch", tmpdir_name)
-
-    # Exercises transfer learning by making sure a previous log exists
-    _tuner_test_helper(onnx_mnist, "gridsearch", tmpdir_name, prior_records=output_log_phase_1)
-
-
-def test_tune_tasks__tuner__ga__empty_tasks(tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    log_file = os.path.join(tmpdir_name, "log_{}.txt".format("ga"))
-
-    tvmc.autotuner.tune_tasks(
-        tasks=[],
-        log_file=log_file,
-        measure_option=_get_measure_options(),
-        tuner="ga",
-        trials=1,
-        early_stopping=1,
-    )
-
-
-def test_tune_tasks__tuner__xgb__no_early_stopping(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "xgb", tmpdir_name, early_stopping=None)
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tune_tasks__tuner__xgb__no_tuning_records(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "xgb", tmpdir_name, prior_records=None)
-
-
-def test_tune_tasks__invalid_tuner(onnx_mnist, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tasks = _get_tasks(onnx_mnist)
-    log_file = os.path.join(tmpdir_factory.mktemp("data"), "log2.txt")
-
-    with pytest.raises(tvmc.TVMCException):
-        tvmc.autotuner.tune_tasks(tasks, log_file, _get_measure_options(), "invalid_tuner", 1, 1)
-
-
-@mock.patch("tvm.driver.tvmc.autotuner.auto_scheduler.HardwareParams", return_value=None)
-@mock.patch("tvm.driver.tvmc.autotuner.tune_model", return_value=None)
-@mock.patch("tvm.driver.tvmc.frontends.load_model", return_value=None)
-def test_tune_rpc_tracker_parsing(mock_load_model, mock_tune_model, mock_auto_scheduler):
-    cli_args = mock.MagicMock()
-    cli_args.rpc_tracker = "10.0.0.1:9999"
-    # FILE is not used but it's set to a valid value here to avoid it being set
-    # by mock to a MagicMock class, which won't pass the checks for valid FILE.
-    fake_input_file = "./fake_input_file.tflite"
-    Path(fake_input_file).touch()
-    cli_args.FILE = fake_input_file
-
-    tvmc.autotuner.drive_tune(cli_args)
-
-    os.remove(fake_input_file)
-
-    mock_tune_model.assert_called_once()
-
-    # inspect the mock call, to search for specific arguments
-    _, _, kwargs = mock_tune_model.mock_calls[0]
-    assert "hostname" in kwargs
-    assert "10.0.0.1" == kwargs["hostname"]
-    assert "port" in kwargs
-    assert 9999 == kwargs["port"]
-
-
-@mock.patch("tvm.transform.PassContext", return_value=tvm.transform.PassContext())
-def test_autotune_pass_context(mock_pc, onnx_mnist, tmpdir_factory):
-    """
-    Check that the pass context while tuning is as expected.
-    """
-    pytest.importorskip("onnx")
-
-    tmpdir_name = tmpdir_factory.mktemp("data")
-    _tuner_test_helper(onnx_mnist, "gridsearch", tmpdir_name)
-
-    # AutoTVM overrides the pass context later in the pipeline to disable AlterOpLayout
-    assert mock_pc.call_count == 2
-    assert mock_pc.call_args_list[0][1]["opt_level"] == 3
-
-
-def test_filter_tasks_valid():
-    filter_tasks(list(range(10)), "list") == ([], True)
-    filter_tasks(list(range(10)), "help") == ([], True)
-    filter_tasks(list(range(10)), "all") == ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], False)
-    filter_tasks(list(range(10)), "5") == ([5], False)
-    filter_tasks(list(range(10)), "1-5") == ([1, 2, 3, 4, 5], False)
-    filter_tasks(list(range(10)), "-5") == ([0, 1, 2, 3, 4, 5], False)
-    filter_tasks(list(range(10)), "6-") == ([6, 7, 8, 9], False)
-    filter_tasks(list(range(10)), "0,1-3,all") == ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], False)
-    filter_tasks(list(range(10)), "0,4-5,9,list") == ([0, 4, 5, 9], True)
-
-
-@pytest.mark.parametrize(
-    "value,err_msg",
-    [
-        ("10", "Task index out of range"),
-        ("5,10", "Task index out of range"),
-        ("1-10", "Right-hand side expression out of range"),
-        ("-10", "Right-hand side expression out of range"),
-        ("-", "Missing lhs or rhs for range expression"),
-        ("-10-", "Malformed range expression"),
-        ("--", "Malformed range expression"),
-    ],
-)
-def test_filter_tasks_invalid(value, err_msg):
-    with pytest.raises(AssertionError, match=err_msg):
-        filter_tasks(list(range(10)), value)
-
-
-@pytest.mark.parametrize(
-    "enable_autoscheduler,expected",
-    [
-        (
-            False,
-            """Available Tasks for tuning:
-  0. Task(func_name=taskA, args=[], kwargs={}, workload=('taskA',)) (len=?)
-  1. Task(func_name=taskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBta... (len=?)
-  2. Task(func_name=taskC, args=[], kwargs={}, workload=('taskC',)) (len=?)""",
-        ),
-        (
-            True,
-            """Available Tasks for tuning:
-  0. taskA
-  1. taskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBtaskBta...
-  2. Unnamed""",
-        ),
-    ],
-)
-def test_print_task_list(enable_autoscheduler, expected):
-    if enable_autoscheduler:
-        auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
-        N = 64
-        target = "llvm"
-        test_input_0 = tvm.runtime.ndarray.empty((64, 64))
-        test_input_1 = tvm.runtime.ndarray.empty((10, 20))
-        test_input_2 = tvm.runtime.ndarray.empty((30, 40, 50))
-        task_inputs = {
-            "test_input_0": test_input_0,
-            "test_input_1": test_input_1,
-            "test_input_2": test_input_2,
-        }
-        task1 = auto_scheduler.SearchTask(
-            func="matmul_auto_scheduler_test",
-            args=(N, N, N),
-            target=target,
-            task_inputs=task_inputs,
-            task_inputs_overwrite=True,
-            desc="taskA",
-        )
-        task2 = auto_scheduler.SearchTask(
-            func="matmul_auto_scheduler_test",
-            args=(N, N, N),
-            target=target,
-            task_inputs=task_inputs,
-            task_inputs_overwrite=True,
-            desc="taskB" * 20,  # very long name
-        )
-        task3 = auto_scheduler.SearchTask(
-            func="matmul_auto_scheduler_test",
-            args=(N, N, N),
-            target=target,
-            task_inputs=task_inputs,
-            task_inputs_overwrite=True,
-            # missing description
-        )
-    else:
-        task1 = autotvm.task.Task("taskA", [])
-        task2 = autotvm.task.Task("taskB" * 20, [])  # very long name
-        task3 = autotvm.task.Task("taskC", [])
-    tasks = [task1, task2, task3]
-    out = gen_task_list(tasks, enable_autoscheduler)
-    assert out == expected
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/driver/tvmc/test_command_line.py b/tests/python/driver/tvmc/test_command_line.py
deleted file mode 100644
index af6cf8a26f73..000000000000
--- a/tests/python/driver/tvmc/test_command_line.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import platform
-import pytest
-import shutil
-import logging
-import sys
-
-from unittest import mock
-
-import tvm
-from tvm.driver.tvmc.main import _main
-from tvm.driver.tvmc import compiler
-from unittest.mock import MagicMock
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tvmc_cl_workflow(keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-
-    tmpdir = tmpdir_factory.mktemp("data")
-
-    # Test model tuning
-    log_path = os.path.join(tmpdir, "keras-autotuner_records.json")
-    tuning_str = (
-        f"tvmc tune --target llvm --output {log_path} "
-        f"--trials 2 --enable-autoscheduler {keras_simple}"
-    )
-    tuning_args = tuning_str.split(" ")[1:]
-    _main(tuning_args)
-    assert os.path.exists(log_path)
-
-    # Test model compilation
-    package_path = os.path.join(tmpdir, "keras-tvm.tar")
-    compile_str = (
-        f"tvmc compile --target llvm --tuning-records {log_path} "
-        f"--output {package_path} {keras_simple}"
-    )
-    compile_args = compile_str.split(" ")[1:]
-    _main(compile_args)
-    assert os.path.exists(package_path)
-
-    # Test running the model
-    output_path = os.path.join(tmpdir, "predictions.npz")
-    run_str = f"tvmc run --end-to-end --outputs {output_path} {package_path}"
-    run_args = run_str.split(" ")[1:]
-    _main(run_args)
-    assert os.path.exists(output_path)
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_tvmc_cl_workflow_json_config(keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-    tune_config_file = "tune_config_test"
-    tmpdir = tmpdir_factory.mktemp("data")
-
-    # Test model tuning
-    log_path = os.path.join(tmpdir, "keras-autotuner_records.json")
-    tuning_str = (
-        f"tvmc tune --config {tune_config_file} --output {log_path} "
-        f"--enable-autoscheduler {keras_simple}"
-    )
-    tuning_args = tuning_str.split(" ")[1:]
-    _main(tuning_args)
-    assert os.path.exists(log_path)
-
-    # Test model compilation
-    package_path = os.path.join(tmpdir, "keras-tvm.tar")
-    compile_str = (
-        f"tvmc compile --tuning-records {log_path} " f"--output {package_path} {keras_simple}"
-    )
-    compile_args = compile_str.split(" ")[1:]
-    _main(compile_args)
-    assert os.path.exists(package_path)
-
-    # Test running the model
-    output_path = os.path.join(tmpdir, "predictions.npz")
-    run_str = f"tvmc run --outputs {output_path} {package_path}"
-    run_args = run_str.split(" ")[1:]
-    _main(run_args)
-    assert os.path.exists(output_path)
-
-
-@pytest.fixture
-def missing_file():
-    missing_file_name = "missing_file_as_invalid_input.tfite"
-    return missing_file_name
-
-
-@pytest.fixture
-def broken_symlink(tmp_path):
-    broken_symlink = "broken_symlink_as_invalid_input.tflite"
-    os.symlink("non_existing_file", tmp_path / broken_symlink)
-    yield broken_symlink
-    os.unlink(tmp_path / broken_symlink)
-
-
-@pytest.fixture
-def fake_directory(tmp_path):
-    dir_as_invalid = "dir_as_invalid_input.tflite"
-    os.mkdir(tmp_path / dir_as_invalid)
-    yield dir_as_invalid
-    shutil.rmtree(tmp_path / dir_as_invalid)
-
-
-@pytest.mark.parametrize(
-    "invalid_input",
-    ["missing_file", "broken_symlink", "fake_directory"],
-)
-def test_tvmc_compile_file_check(capsys, invalid_input, request):
-    invalid_input = request.getfixturevalue(invalid_input)
-    compile_cmd = f"tvmc compile --target 'c' {invalid_input}"
-    run_arg = compile_cmd.split(" ")[1:]
-
-    _main(run_arg)
-
-    captured = capsys.readouterr()
-    expected_err = (
-        f"Error: Input file '{invalid_input}' doesn't exist, "
-        "is a broken symbolic link, or a directory.\n"
-    )
-    on_assert_error = f"'tvmc compile' failed to check invalid FILE: {invalid_input}"
-    assert captured.err == expected_err, on_assert_error
-
-
-@pytest.mark.parametrize(
-    "invalid_input",
-    ["missing_file", "broken_symlink", "fake_directory"],
-)
-def test_tvmc_tune_file_check(capsys, invalid_input, request):
-    invalid_input = request.getfixturevalue(invalid_input)
-    tune_cmd = f"tvmc tune --target 'llvm' --output output.json {invalid_input}"
-    run_arg = tune_cmd.split(" ")[1:]
-
-    _main(run_arg)
-
-    captured = capsys.readouterr()
-    expected_err = (
-        f"Error: Input file '{invalid_input}' doesn't exist, "
-        "is a broken symbolic link, or a directory.\n"
-    )
-    on_assert_error = f"'tvmc tune' failed to check invalid FILE: {invalid_input}"
-    assert captured.err == expected_err, on_assert_error
-
-
-@mock.patch("tvm.relay.build", side_effect=tvm.relay.build)
-@mock.patch("tvm.driver.tvmc.model.TVMCPackage.__init__", return_value=None)
-def test_tvmc_workspace_pools_check(mock_pkg, mock_relay, keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-    tmpdir = tmpdir_factory.mktemp("data")
-
-    # Test model compilation
-    package_path = os.path.join(tmpdir, "keras-tvm.tar")
-    compile_str = (
-        f"tvmc compile --target=llvm --workspace-pools=sram "
-        f"--workspace-pools-targets=sram:llvm "
-        f"--output={package_path} {keras_simple}"
-    )
-    compile_args = compile_str.split(" ")[1:]
-    _main(compile_args)
-    assert os.path.exists(package_path)
-    assert mock_relay.call_count == 1
-    assert mock_relay.call_args_list[0][1]["workspace_memory_pools"].pools[0].pool_name == "sram"
-
-
-@pytest.fixture
-def paddle_model(paddle_resnet50):
-    # If we can't import "paddle" module, skip testing paddle as the input model.
-    if pytest.importorskip("paddle", reason="'paddle' module not installed"):
-        return paddle_resnet50
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        "paddle_model",
-    ],
-)
-# compile_model() can take too long and is tested elsewhere, hence it's mocked below
-@mock.patch.object(compiler, "compile_model")
-# @mock.patch.object(compiler, "compile_model")
-def test_tvmc_compile_input_model(mock_compile_model, tmpdir_factory, model, request):
-
-    model = request.getfixturevalue(model)
-    output_dir = tmpdir_factory.mktemp("output")
-    output_file = output_dir / "model.tar"
-
-    compile_cmd = (
-        f"tvmc compile --target 'llvm' {model} --model-format paddle --output {output_file}"
-    )
-    run_arg = compile_cmd.split(" ")[1:]
-
-    _main(run_arg)
-
-    mock_compile_model.assert_called_once()
-
-
-def test_tvmc_logger(caplog, tmpdir_factory, keras_simple):
-    pytest.importorskip("tensorflow")
-    tmpdir = tmpdir_factory.mktemp("out")
-
-    # TUNE
-    log_path = os.path.join(tmpdir, "records.json")
-    tune_cmd = f"tvmc tune --target llvm -vvvv --output {log_path} " f"--trials 2 {keras_simple}"
-
-    tuning_args = tune_cmd.split(" ")[1:]
-    _main(tuning_args)
-
-    # Check that we log during tvmc tune
-    for log_str in ("DEBUG", "INFO", "WARNING", "TVMC"):
-        assert log_str in caplog.text
-
-    caplog.clear()
-
-    # COMPILE
-    module_file = os.path.join(tmpdir, "m.tar")
-    compile_cmd = f"tvmc compile --target 'llvm' {keras_simple} -vvvv --output {module_file}"
-
-    compile_args = compile_cmd.split(" ")[1:]
-    _main(compile_args)
-
-    # Check that we log during tvmc compile
-    for log_str in ("DEBUG", "WARNING", "TVMC"):
-        assert log_str in caplog.text
-
-    caplog.clear()
-
-    # RUN
-    run_cmd = f"tvmc run -vvvv {module_file}"
-
-    run_args = run_cmd.split(" ")[1:]
-    _main(run_args)
-
-    # Check that we log during tvmc run
-    for log_str in ("DEBUG", "TVMC"):
-        assert log_str in caplog.text
-
-
-# Unfortunately pytest seems to intercept the logging output, so we can't test whether it
-# actually writes the logging output to sys.stdout, but we can test that we call
-# logging.basicConfig with the correct arguments
-def test_tvmc_logger_set_basicConfig(monkeypatch, tmpdir_factory, keras_simple):
-    pytest.importorskip("tensorflow")
-    mock_basicConfig = MagicMock()
-    monkeypatch.setattr(logging, "basicConfig", mock_basicConfig)
-
-    # Run a random tvmc command
-    tmpdir = tmpdir_factory.mktemp("out")
-    module_file = os.path.join(tmpdir, "m.tar")
-    compile_cmd = f"tvmc compile --target 'llvm' {keras_simple} -vvvv --output {module_file}"
-    compile_args = compile_cmd.split(" ")[1:]
-    _main(compile_args)
-
-    mock_basicConfig.assert_called_with(stream=sys.stdout)
-
-
-def test_tvmc_print_pass_times(capsys, keras_simple, tmpdir_factory):
-    pytest.importorskip("tensorflow")
-    tmpdir = tmpdir_factory.mktemp("out")
-    print_cmd = "--print-pass-times"
-
-    # Compile model
-    module_file = os.path.join(tmpdir, "keras-tvm.tar")
-    compile_cmd = f"tvmc compile --target 'llvm' {keras_simple} --output {module_file} {print_cmd}"
-    compile_args = compile_cmd.split(" ")[1:]
-    _main(compile_args)
-
-    # Check for timing results output
-    captured_out = capsys.readouterr().out
-    for exp_str in ("Compilation time breakdown by pass:", "sequential:", "us]"):
-        assert exp_str in captured_out
-
-
-@pytest.mark.parametrize(
-    "print_cmd, out_str",
-    [
-        (
-            "--print-ir-after=[tir.SplitHostDevice]",
-            (
-                "Print IR after: tir.SplitHostDevice\n# from tvm.script import ir as I\n",
-                "@I.ir_module",
-            ),
-        ),
-        (
-            "--print-ir-before=[tir.SplitHostDevice]",
-            ("Print IR before: tir.SplitHostDevice\n# from tvm.script import ir as I\n"),
-        ),
-        (
-            "--print-ir-after=[tir.ThreadSync,tir.SplitHostDevice]",
-            ("tir.ThreadSync,tir.SplitHostDevice"),
-        ),
-        (
-            "--print-ir-before=[tir.SplitHostDevice] --print-ir-after=[tir.SplitHostDevice]",
-            ("Print IR before: tir.SplitHostDevice\n", "Print IR after: tir.SplitHostDevice\n"),
-        ),
-    ],
-)
-def test_tvmc_print_ir_before_after(capsys, keras_simple, tmpdir_factory, print_cmd, out_str):
-    pytest.importorskip("tensorflow")
-    tmpdir = tmpdir_factory.mktemp("out")
-
-    # Compile model
-    module_file = os.path.join(tmpdir, "keras-tvm.tar")
-    compile_cmd = f"tvmc compile --target 'llvm' {keras_simple} --output {module_file} {print_cmd}"
-    compile_args = compile_cmd.split(" ")[1:]
-    _main(compile_args)
-
-    # Check for printing IR before or IR after
-    captured_out = capsys.readouterr().out
-    for exp_str in out_str:
-        assert exp_str in captured_out
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
deleted file mode 100644
index 0019bb366b8d..000000000000
--- a/tests/python/driver/tvmc/test_compiler.py
+++ /dev/null
@@ -1,527 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import re
-import numpy as np
-import shutil
-import tarfile
-from os import path
-
-from unittest import mock
-import pytest
-
-import tvm
-from tvm.ir.memory_pools import WorkspacePoolInfo, WorkspaceMemoryPools
-from tvm.target import Target
-import tvm.testing
-from tvm.relay.backend import Runtime, Executor
-from tvm import relay
-
-from tvm.contrib.target.vitis_ai import vitis_ai_available
-
-from tvm.driver import tvmc
-from tvm.driver.tvmc.model import TVMCPackage
-
-from tvm.contrib import utils
-
-
-def test_save_dumps(tmpdir_factory):
-    tmpdir = tmpdir_factory.mktemp("data")
-    dump_formats = {"relay": "fake relay", "tir": "fake tir", "ll": "fake llvm", "asm": "fake asm"}
-    tvmc.compiler.save_dumps("fake_module", dump_formats, dump_root=tmpdir)
-
-    assert path.exists("{}/{}".format(tmpdir, "fake_module.ll"))
-    assert path.exists("{}/{}".format(tmpdir, "fake_module.asm"))
-    assert path.exists("{}/{}".format(tmpdir, "fake_module.tir"))
-    assert path.exists("{}/{}".format(tmpdir, "fake_module.relay"))
-
-
-# End to end tests for compilation
-
-
-def verify_tvmc_package(tvmc_package, dumps_path, use_vm=False):
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert os.path.exists(dumps_path)
-    assert type(tvmc_package.lib_path) is str
-
-    if use_vm:
-        assert tvmc_package.graph is None
-        assert tvmc_package.params is None
-    else:
-        assert type(tvmc_package.graph) is str
-        assert type(tvmc_package.params) is bytearray
-
-
-def verify_compile_tflite_module(model, shape_dict=None, use_vm=False):
-    pytest.importorskip("tflite")
-    tvmc_model = tvmc.load(model, shape_dict=shape_dict)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm",
-        dump_code="ll",
-        desired_layout="NCHW",
-        use_vm=use_vm,
-    )
-    dumps_path = tvmc_package.package_path + ".ll"
-    verify_tvmc_package(tvmc_package, dumps_path, use_vm=use_vm)
-
-
-@pytest.mark.parametrize("use_vm", [True, False])
-def test_compile_tflite_module(use_vm, tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer tflite, so skip in case it is not present
-    pytest.importorskip("tflite")
-    # Check default compilation.
-    verify_compile_tflite_module(tflite_mobilenet_v1_1_quant)
-    # Check with manual shape override
-    shape_string = "input:[1,224,224,3]"
-    shape_dict = tvmc.shape_parser.parse_shape_string(shape_string)
-    verify_compile_tflite_module(tflite_mobilenet_v1_1_quant, shape_dict, use_vm=use_vm)
-
-
-def test_single_tir_dump(tflite_mobilenet_v1_1_quant):
-    pytest.importorskip("tflite")
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    tvmc_package = tvmc.compile(tvmc_model, target="llvm", dump_code="tir")
-    dumps_path = tvmc_package.package_path + ".tir"
-    assert os.path.exists(dumps_path)
-    with open(dumps_path) as f:
-        assert "tir" in f.read()
-
-
-def test_code_dumps(tflite_mobilenet_v1_1_quant):
-    pytest.importorskip("tflite")
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    dump_code = ["asm", "ll", "tir", "relay"]
-    tvmc_package = tvmc.compile(tvmc_model, target="llvm", dump_code=dump_code)
-    for ext in dump_code:
-        dumps_path = tvmc_package.package_path + "." + ext
-        assert os.path.exists(dumps_path)
-        with open(dumps_path) as f:
-            assert len(f.read()) > 0
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_aarch64_tflite_module(tflite_mobilenet_v1_1_quant):
-    pytest.importorskip("tflite")
-
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_options_aarch64_tflite_module(tflite_mobilenet_v1_1_quant):
-    pytest.importorskip("tflite")
-
-    fake_sysroot_dir = utils.tempdir().relpath("")
-
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-        cross_options="--sysroot=" + fake_sysroot_dir,
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-def test_compile_keras__save_module(keras_resnet50, tmpdir_factory):
-    # some CI environments wont offer tensorflow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    expected_temp_dir = tmpdir_factory.mktemp("saved_output")
-    expected_file_name = "saved.tar"
-    module_file = os.path.join(expected_temp_dir, expected_file_name)
-
-    tvmc_model = tvmc.load(keras_resnet50)
-    tvmc.compile(tvmc_model, target="llvm", dump_code="ll", package_path=module_file)
-
-    assert os.path.exists(module_file), "output file {0} should exist".format(module_file)
-
-    # Test that we can load back in a module.
-    tvmc_package = TVMCPackage(package_path=module_file)
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.params) is bytearray
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_aarch64_keras_module(keras_resnet50):
-    # some CI environments wont offer tensorflow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    tvmc_model = tvmc.load(keras_resnet50)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_options_aarch64_keras_module(keras_resnet50):
-    # some CI environments wont offer tensorflow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    fake_sysroot_dir = utils.tempdir().relpath("")
-
-    tvmc_model = tvmc.load(keras_resnet50)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-        cross_options="--sysroot=" + fake_sysroot_dir,
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-def verify_compile_onnx_module(model, shape_dict=None, use_vm=False):
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-    tvmc_model = tvmc.load(model, shape_dict=shape_dict)
-    tvmc_package = tvmc.compile(tvmc_model, target="llvm", dump_code="ll", use_vm=use_vm)
-    dumps_path = tvmc_package.package_path + ".ll"
-    verify_tvmc_package(tvmc_package, dumps_path, use_vm=use_vm)
-
-
-@pytest.mark.parametrize("use_vm", [True, False])
-def test_compile_onnx_module(use_vm, onnx_resnet50):
-    # Test default compilation
-    verify_compile_onnx_module(onnx_resnet50)
-    # Test with manual shape dict
-    shape_string = "data:[1,3,200,200]"
-    shape_dict = tvmc.shape_parser.parse_shape_string(shape_string)
-    verify_compile_onnx_module(onnx_resnet50, shape_dict, use_vm=use_vm)
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_aarch64_onnx_module(onnx_resnet50):
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    tvmc_model = tvmc.load(onnx_resnet50)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_options_aarch64_onnx_module(onnx_resnet50):
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    fake_sysroot_dir = utils.tempdir().relpath("")
-
-    tvmc_model = tvmc.load(onnx_resnet50)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-        cross_options="--sysroot=" + fake_sysroot_dir,
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-def verify_compile_paddle_module(model, shape_dict=None):
-    pytest.importorskip("paddle")
-    tvmc_model = tvmc.load(model, "paddle", shape_dict=shape_dict)
-    tvmc_package = tvmc.compile(tvmc_model, target="llvm", dump_code="ll", desired_layout="NCHW")
-    dumps_path = tvmc_package.package_path + ".ll"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-def test_compile_paddle_module(paddle_resnet50):
-    # some CI environments wont offer Paddle, so skip in case it is not present
-    pytest.importorskip("paddle")
-    # Check default compilation.
-    verify_compile_paddle_module(paddle_resnet50)
-    # Check with manual shape override
-    shape_string = "inputs:[1,3,224,224]"
-    shape_dict = tvmc.shape_parser.parse_shape_string(shape_string)
-    verify_compile_paddle_module(paddle_resnet50, shape_dict)
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_aarch64_paddle_module(paddle_resnet50):
-    # some CI environments wont offer paddle, so skip in case it is not present
-    pytest.importorskip("paddle")
-
-    tvmc_model = tvmc.load(paddle_resnet50, "paddle")
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-# This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
-@pytest.mark.skipif(
-    not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
-)
-def test_cross_compile_options_aarch64_paddle_module(paddle_resnet50):
-    # some CI environments wont offer paddle, so skip in case it is not present
-    pytest.importorskip("paddle")
-
-    fake_sysroot_dir = utils.tempdir().relpath("")
-
-    tvmc_model = tvmc.load(paddle_resnet50, "paddle")
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-        dump_code="asm",
-        cross="aarch64-linux-gnu-gcc",
-        cross_options="--sysroot=" + fake_sysroot_dir,
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-@tvm.testing.requires_opencl
-def test_compile_opencl(tflite_mobilenet_v1_0_25_128):
-    pytest.importorskip("tflite")
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_0_25_128)
-    tvmc_package = tvmc.compile(
-        tvmc_model,
-        target="opencl -host=llvm",
-        desired_layout="NCHW",
-        dump_code="asm",
-    )
-    dumps_path = tvmc_package.package_path + ".asm"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-    assert path.exists("{}.{}".format(tvmc_package.package_path, "opencl"))
-
-
-@tvm.testing.requires_vitis_ai
-def test_compile_tflite_module_with_external_codegen_vitis_ai(tflite_mobilenet_v1_1_quant):
-    pytest.importorskip("tflite")
-
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    tvmc_package = tvmc.compiler.compile_model(
-        tvmc_model,
-        target="vitis-ai -dpu=DPUCZDX8G-zcu104 -export_runtime_module=vitis_ai.rtmod, llvm",
-        dump_code="relay",
-    )
-    dumps_path = tvmc_package.package_path + ".relay"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-@tvm.testing.requires_mrvl
-def test_compile_pytorch_module_with_external_codegen_mrvl(pytorch_resnet18):
-    tvmc_model = tvmc.load(pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]})
-    tvmc_package = tvmc.compiler.compile_model(
-        tvmc_model,
-        target="mrvl, llvm",
-        dump_code="relay",
-    )
-    dumps_path = tvmc_package.package_path + ".relay"
-
-    # check for output types
-    assert type(tvmc_package) is TVMCPackage
-    assert type(tvmc_package.graph) is str
-    assert type(tvmc_package.lib_path) is str
-    assert type(tvmc_package.params) is bytearray
-    assert os.path.exists(dumps_path)
-
-
-@mock.patch("tvm.relay.build")
-@mock.patch("tvm.driver.tvmc.composite_target.get_codegen_by_target")
-@mock.patch("tvm.driver.tvmc.load")
-@mock.patch("tvm.transform.PassContext")
-@mock.patch("tvm.driver.tvmc.model.TVMCPackage.__init__", return_value=None)
-def test_compile_check_configs_composite_target(mock_pkg, mock_pc, mock_fe, mock_ct, mock_relay):
-    mock_codegen = {}
-    mock_codegen["config_key"] = "relay.ext.mock.options"
-    mock_codegen["pass_pipeline"] = lambda *args, **kwargs: None
-
-    mock_fe.return_value = mock.MagicMock()
-    mock_ct.return_value = mock_codegen
-    mock_relay.return_value = mock.MagicMock()
-
-    tvmc_model = tvmc.load("no_file_needed")
-    tvmc.compile(tvmc_model, target="mockcodegen -testopt=value, llvm")
-
-    assert mock_pc.call_count == 1
-    codegen_compile_context = mock.call(
-        config={"relay.ext.mock.options": {"testopt": "value"}},
-        opt_level=3,
-        disabled_pass=None,
-        instruments=None,
-    )
-    mock_pc.assert_has_calls(
-        [
-            codegen_compile_context,
-            codegen_compile_context.__enter__(),
-            codegen_compile_context.__exit__(None, None, None),
-        ]
-    )
-
-
-@mock.patch("tvm.relay.build")
-@mock.patch("tvm.driver.tvmc.load")
-@mock.patch("tvm.driver.tvmc.model.TVMCPackage.__init__", return_value=None)
-def test_compile_check_workspace_pools(mock_pkg, mock_fe, mock_relay):
-    mock_fe.return_value = mock.MagicMock()
-    mock_relay.return_value = mock.MagicMock()
-    memory_pools = WorkspaceMemoryPools(
-        [WorkspacePoolInfo(pool_name="sram", targets=[Target("llvm")])]
-    )
-    tvmc_model = tvmc.load("no_file_needed")
-    tvmc.compile(
-        tvmc_model,
-        target="llvm,c",
-        workspace_pools=memory_pools,
-    )
-
-    assert mock_relay.call_count == 1
-    assert mock_relay.call_args_list[0][1]["workspace_memory_pools"] == memory_pools
-
-
-def test_compile_check_pass_instrument(keras_resnet50):
-    pytest.importorskip("tensorflow")
-
-    @tvm.instrument.pass_instrument
-    class PassesCounter:
-        def __init__(self):
-            self.run_before_count = 0
-            self.run_after_count = 0
-
-        def run_before_pass(self, mod, info):
-            self.run_before_count = self.run_before_count + 1
-
-        def run_after_pass(self, mod, info):
-            self.run_after_count = self.run_after_count + 1
-
-    passes_counter = PassesCounter()
-    tvmc_model = tvmc.load(keras_resnet50)
-    tvmc.compile(tvmc_model, target="llvm", instruments=[passes_counter])
-    assert passes_counter.run_after_count > 0
-    assert passes_counter.run_after_count == passes_counter.run_before_count
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/driver/tvmc/test_composite_target.py b/tests/python/driver/tvmc/test_composite_target.py
deleted file mode 100644
index 0fb89d524be7..000000000000
--- a/tests/python/driver/tvmc/test_composite_target.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import argparse
-import os
-import shutil
-
-from inspect import isfunction
-from os import path
-
-import pytest
-
-import tvm
-
-from tvm.driver import tvmc
-
-from tvm.driver.tvmc import TVMCException
-
-
-def test_get_codegen_names():
-    names = tvmc.composite_target.get_codegen_names()
-
-    assert "vitis-ai" in names
-    assert "mrvl" in names
-    assert len(names) > 0
-
-
-def test_valid_codegen():
-    codegen = tvmc.composite_target.get_codegen_by_target("compute-library")
-
-    assert codegen is not None
-    assert codegen["pass_pipeline"] is not None
-
-
-def test_invalid_codegen():
-    with pytest.raises(TVMCException):
-        _ = tvmc.composite_target.get_codegen_by_target("invalid")
-
-
-def test_all_codegens_contain_pass_pipeline():
-    for name in tvmc.composite_target.get_codegen_names():
-        codegen = tvmc.composite_target.get_codegen_by_target(name)
-        assert "pass_pipeline" in codegen, f"{name} does not contain a pass_pipeline"
-        assert isfunction(codegen["pass_pipeline"])
-
-
-def test_all_pass_pipelines_are_functions():
-    for name in tvmc.composite_target.get_codegen_names():
-        codegen = tvmc.composite_target.get_codegen_by_target(name)
-        assert isfunction(codegen["pass_pipeline"]), f"pass_pipeline for {name} is not a function"
diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
deleted file mode 100644
index e7d0a1d4dfb2..000000000000
--- a/tests/python/driver/tvmc/test_frontends.py
+++ /dev/null
@@ -1,459 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import platform
-import pytest
-import builtins
-import importlib
-
-import tvm
-from unittest import mock
-from tvm.ir.module import IRModule
-
-from tvm.driver import tvmc
-from tvm.driver.tvmc import TVMCException, TVMCImportError
-from tvm.driver.tvmc.model import TVMCModel
-
-
-orig_import = importlib.import_module
-
-
-def mock_error_on_name(name):
-    def mock_imports(module_name, package=None):
-        if module_name == name:
-            raise ImportError()
-        return orig_import(module_name, package)
-
-    return mock_imports
-
-
-def test_get_frontends_contains_only_strings():
-    sut = tvmc.frontends.get_frontend_names()
-    assert all([type(x) is str for x in sut]) is True
-
-
-def test_get_frontend_by_name_valid():
-    # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    sut = tvmc.frontends.get_frontend_by_name("keras")
-    assert type(sut) is tvmc.frontends.KerasFrontend
-
-
-def test_get_frontend_by_name_invalid():
-    with pytest.raises(TVMCException):
-        tvmc.frontends.get_frontend_by_name("unsupported_thing")
-
-
-def test_guess_frontend_tflite():
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    sut = tvmc.frontends.guess_frontend("a_model.tflite")
-    assert type(sut) is tvmc.frontends.TFLiteFrontend
-
-
-def test_guess_frontend_onnx():
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    sut = tvmc.frontends.guess_frontend("a_model.onnx")
-    assert type(sut) is tvmc.frontends.OnnxFrontend
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_guess_frontend_pytorch():
-    # some CI environments wont offer pytorch, so skip in case it is not present
-    pytest.importorskip("torch")
-
-    sut = tvmc.frontends.guess_frontend("a_model.pth")
-    assert type(sut) is tvmc.frontends.PyTorchFrontend
-
-
-def test_guess_frontend_keras():
-    # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    sut = tvmc.frontends.guess_frontend("a_model.h5")
-    assert type(sut) is tvmc.frontends.KerasFrontend
-
-
-def test_guess_frontend_tensorflow():
-    # some CI environments wont offer TensorFlow, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    sut = tvmc.frontends.guess_frontend("a_model.pb")
-    assert type(sut) is tvmc.frontends.TensorflowFrontend
-
-
-def test_guess_frontend_paddle():
-    # some CI environments wont offer Paddle, so skip in case it is not present
-    pytest.importorskip("paddle")
-
-    sut = tvmc.frontends.guess_frontend("a_model.pdmodel")
-    assert type(sut) is tvmc.frontends.PaddleFrontend
-
-
-def test_guess_frontend_relay():
-
-    sut = tvmc.frontends.guess_frontend("relay.relay")
-    assert type(sut) is tvmc.frontends.RelayFrontend
-
-
-def test_guess_frontend_invalid():
-    with pytest.raises(TVMCException):
-        tvmc.frontends.guess_frontend("not/a/file.txt")
-
-
-def test_load_model__invalid_path__no_language():
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    with pytest.raises(FileNotFoundError):
-        tvmc.load("not/a/file.tflite")
-
-
-def test_load_model__invalid_path__with_language():
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    with pytest.raises(FileNotFoundError):
-        tvmc.load("not/a/file.txt", model_format="onnx")
-
-
-def test_load_model__tflite(tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-    # check whether one known value is part of the params dict
-    assert "_param_1" in tvmc_model.params.keys()
-
-
-@pytest.mark.parametrize("load_model_kwargs", [{}, {"layout": "NCHW"}])
-def test_load_model__keras(keras_resnet50, load_model_kwargs):
-    # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    tvmc_model = tvmc.frontends.load_model(keras_resnet50, **load_model_kwargs)
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-    ## check whether one known value is part of the params dict
-    assert "_param_1" in tvmc_model.params.keys()
-
-
-def verify_load_model__onnx(model, **kwargs):
-    tvmc_model = tvmc.frontends.load_model(model, **kwargs)
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-    return tvmc_model
-
-
-def test_load_model__onnx(onnx_resnet50):
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-    tvmc_model = verify_load_model__onnx(onnx_resnet50, freeze_params=False)
-    # check whether one known value is part of the params dict
-    assert "resnetv24_batchnorm0_gamma" in tvmc_model.params.keys()
-    tvmc_model = verify_load_model__onnx(onnx_resnet50, freeze_params=True)
-    # check that the parameter dict is empty, implying that they have been folded into constants
-    assert tvmc_model.params == {}
-
-
-def test_load_model__pb(pb_mobilenet_v1_1_quant):
-    # some CI environments wont offer TensorFlow, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    tvmc_model = tvmc.load(pb_mobilenet_v1_1_quant)
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-    # check whether one known value is part of the params dict
-    assert "MobilenetV1/Conv2d_0/weights" in tvmc_model.params.keys()
-
-
-def test_load_model__paddle(paddle_resnet50):
-    # some CI environments wont offer Paddle, so skip in case it is not present
-    pytest.importorskip("paddle")
-
-    tvmc_model = tvmc.load(paddle_resnet50, model_format="paddle")
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-
-
-def test_load_model__relay(relay_text_conv2d):
-    tvmc_model = tvmc.load(relay_text_conv2d, model_format="relay")
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-
-
-def test_load_model___wrong_language__to_keras(tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present
-    pytest.importorskip("tensorflow")
-
-    with pytest.raises(OSError):
-        tvmc.load(tflite_mobilenet_v1_1_quant, model_format="keras")
-
-
-def test_load_model___wrong_language__to_tflite(keras_resnet50):
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    with pytest.raises(TVMCException):
-        tvmc.frontends.load_model(keras_resnet50, model_format="tflite")
-
-
-def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer onnx, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    from google.protobuf.message import DecodeError
-
-    with pytest.raises(DecodeError):
-        tvmc.load(tflite_mobilenet_v1_1_quant, model_format="onnx")
-
-
-@pytest.mark.skip(
-    reason="free(): invalid pointer error despite using llvm-config --link-static and -DHIDE_PRIVATE_SYMBOLS=ON",
-)
-def test_load_model__pth(pytorch_resnet18):
-    # some CI environments wont offer torch, so skip in case it is not present
-    pytest.importorskip("torch")
-    pytest.importorskip("torchvision")
-
-    tvmc_model = tvmc.load(pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]})
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-    # check whether one known value is part of the params dict
-    assert "layer1.0.conv1.weight" in tvmc_model.params.keys()
-
-
-@pytest.mark.skip(
-    reason="free(): invalid pointer error despite using llvm-config --link-static and -DHIDE_PRIVATE_SYMBOLS=ON",
-)
-def test_load_quantized_model__pth(pytorch_mobilenetv2_quantized):
-    # some CI environments wont offer torch, so skip in case it is not present
-    pytest.importorskip("torch")
-    pytest.importorskip("torchvision")
-
-    tvmc_model = tvmc.load(pytorch_mobilenetv2_quantized, shape_dict={"input": [1, 3, 224, 224]})
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_model.mod) is IRModule
-    assert type(tvmc_model.params) is dict
-
-    # checking weights remain quantized and are not float32
-    for p in tvmc_model.params.values():
-        assert p.dtype in ["int8", "uint8", "int32"]  # int32 for bias
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer pytorch, so skip in case it is not present
-    pytest.importorskip("torch")
-
-    with pytest.raises(RuntimeError) as e:
-        tvmc.load(
-            tflite_mobilenet_v1_1_quant,
-            model_format="pytorch",
-            shape_dict={"input": [1, 3, 224, 224]},
-        )
-
-
-def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    tvmc_model = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant)
-    before = tvmc_model.mod
-
-    expected_layout = "NCHW"
-    with tvm.transform.PassContext(opt_level=3):
-        after = tvmc.transform.convert_graph_layout(before, expected_layout)
-
-    layout_transform_calls = []
-
-    def _is_layout_transform(node):
-        if isinstance(node, tvm.relay.expr.Call):
-            layout_transform_calls.append(
-                node.op.name == "layout_transform"
-                and node.attrs.src_layout == "NHWC"
-                and node.attrs.dst_layout == "NCHW"
-            )
-
-    tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform)
-
-    assert any(layout_transform_calls), "Expected 'layout_transform NHWC->NCHW' not found"
-
-
-def test_compile_onnx_module_nchw_to_nhwc(onnx_resnet50):
-    # some CI environments wont offer ONNX, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    tvmc_model = tvmc.frontends.load_model(onnx_resnet50)
-    before = tvmc_model.mod
-
-    expected_layout = "NHWC"
-    with tvm.transform.PassContext(opt_level=3):
-        after = tvmc.transform.convert_graph_layout(before, expected_layout)
-
-    layout_transform_calls = []
-
-    def _is_layout_transform(node):
-        if isinstance(node, tvm.relay.expr.Call):
-            layout_transform_calls.append(
-                node.op.name == "layout_transform"
-                and node.attrs.src_layout == "NCHW"
-                and node.attrs.dst_layout == "NHWC"
-            )
-
-    tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform)
-
-    assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found"
-
-
-def test_compile_paddle_module_nchw_to_nhwc(paddle_resnet50):
-    # some CI environments wont offer Paddle, so skip in case it is not present
-    pytest.importorskip("paddle")
-
-    tvmc_model = tvmc.frontends.load_model(paddle_resnet50, "paddle")
-    before = tvmc_model.mod
-
-    expected_layout = "NHWC"
-    with tvm.transform.PassContext(opt_level=3):
-        after = tvmc.transform.convert_graph_layout(before, expected_layout)
-
-    layout_transform_calls = []
-
-    def _is_layout_transform(node):
-        if isinstance(node, tvm.relay.expr.Call):
-            layout_transform_calls.append(
-                node.op.name == "layout_transform"
-                and node.attrs.src_layout == "NCHW"
-                and node.attrs.dst_layout == "NHWC"
-            )
-
-    tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform)
-
-    assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found"
-
-
-def test_compile_tflite_module__same_layout__nhwc_to_nhwc(tflite_mobilenet_v1_1_quant):
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    tvmc_model = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant)
-    before = tvmc_model.mod
-
-    expected_layout = "NHWC"
-
-    with tvm.transform.PassContext(opt_level=3):
-        after = tvmc.transform.convert_graph_layout(before, expected_layout)
-
-    layout_transform_calls = []
-
-    def _is_layout_transform(node):
-        if isinstance(node, tvm.relay.expr.Call):
-            layout_transform_calls.append(
-                node.op.name == "layout_transform"
-                and node.attrs.src_layout == "NHWC"
-                and node.attrs.dst_layout == "NHWC"
-            )
-
-    tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform)
-
-    assert not any(layout_transform_calls), "Unexpected 'layout_transform' call"
-
-
-def test_compile_onnx_module__same_layout__nchw_to_nchw(onnx_resnet50):
-    # some CI environments wont offer ONNX, so skip in case it is not present
-    pytest.importorskip("onnx")
-
-    tvmc_model = tvmc.frontends.load_model(onnx_resnet50)
-    before = tvmc_model.mod
-
-    expected_layout = "NCHW"
-
-    with tvm.transform.PassContext(opt_level=3):
-        after = tvmc.transform.convert_graph_layout(before, expected_layout)
-
-    layout_transform_calls = []
-
-    def _is_layout_transform(node):
-        if isinstance(node, tvm.relay.expr.Call):
-            layout_transform_calls.append(
-                node.op.name == "layout_transform"
-                and node.attrs.src_layout == "NCHW"
-                and node.attrs.dst_layout == "NCHW"
-            )
-
-    tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform)
-
-    assert not any(layout_transform_calls), "Unexpected 'layout_transform' call"
-
-
-def test_import_keras_friendly_message(keras_resnet50, monkeypatch):
-    # keras is part of tensorflow
-    monkeypatch.setattr("importlib.import_module", mock_error_on_name("tensorflow"))
-
-    with pytest.raises(TVMCImportError, match="tensorflow") as e:
-        _ = tvmc.frontends.load_model(keras_resnet50, model_format="keras")
-
-
-def test_import_onnx_friendly_message(onnx_resnet50, monkeypatch):
-    monkeypatch.setattr("importlib.import_module", mock_error_on_name("onnx"))
-
-    with pytest.raises(TVMCImportError, match="onnx") as e:
-        _ = tvmc.frontends.load_model(onnx_resnet50, model_format="onnx")
-
-
-def test_import_tensorflow_friendly_message(pb_mobilenet_v1_1_quant, monkeypatch):
-    monkeypatch.setattr("importlib.import_module", mock_error_on_name("tensorflow"))
-
-    with pytest.raises(TVMCImportError, match="tensorflow") as e:
-        _ = tvmc.frontends.load_model(pb_mobilenet_v1_1_quant, model_format="pb")
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-def test_import_torch_friendly_message(pytorch_resnet18, monkeypatch):
-    monkeypatch.setattr("importlib.import_module", mock_error_on_name("torch"))
-
-    with pytest.raises(TVMCImportError, match="torch") as e:
-        _ = tvmc.frontends.load_model(pytorch_resnet18, model_format="pytorch")
-
-
-def test_import_tflite_friendly_message(tflite_mobilenet_v1_1_quant, monkeypatch):
-    monkeypatch.setattr("importlib.import_module", mock_error_on_name("tflite.Model"))
-
-    with pytest.raises(TVMCImportError, match="tflite.Model") as e:
-        _ = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="tflite")
diff --git a/tests/python/driver/tvmc/test_model.py b/tests/python/driver/tvmc/test_model.py
deleted file mode 100644
index 4d937212e9cc..000000000000
--- a/tests/python/driver/tvmc/test_model.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import platform
-import pytest
-import os
-import numpy as np
-
-from os import path
-
-from tvm.driver import tvmc
-from tvm.driver.tvmc.model import TVMCModel, TVMCPackage, TVMCResult
-from tvm.runtime.module import BenchmarkResult
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-@pytest.mark.parametrize("use_vm", [True, False])
-def test_tvmc_workflow(use_vm, keras_simple):
-    pytest.importorskip("tensorflow")
-    import tensorflow as tf
-
-    # Reset so the input name remains consistent across unit test runs
-    tf.keras.backend.clear_session()
-
-    tvmc_model = tvmc.load(keras_simple)
-    tuning_records = tvmc.tune(tvmc_model, target="llvm", enable_autoscheduler=True, trials=2)
-    tvmc_package = tvmc.compile(
-        tvmc_model, tuning_records=tuning_records, target="llvm", use_vm=use_vm
-    )
-    input_dict = {"input_1": np.random.uniform(size=(1, 32, 32, 3)).astype("float32")}
-
-    result = tvmc.run(
-        tvmc_package, device="cpu", end_to_end=True, benchmark=True, inputs=input_dict
-    )
-    assert type(tvmc_model) is TVMCModel
-    assert type(tvmc_package) is TVMCPackage
-    assert type(result) is TVMCResult
-    assert path.exists(tuning_records)
-    assert type(result.outputs) is dict
-    assert type(result.times) is BenchmarkResult
-    assert "output_0" in result.outputs.keys()
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-)
-@pytest.mark.parametrize("use_vm", [True, False])
-def test_save_load_model(use_vm, keras_simple, tmpdir_factory):
-    pytest.importorskip("onnx")
-
-    tmpdir = tmpdir_factory.mktemp("data")
-    tvmc_model = tvmc.load(keras_simple)
-
-    # Create tuning artifacts
-    tvmc.tune(tvmc_model, target="llvm", trials=2)
-
-    # Create package artifacts
-    tvmc.compile(tvmc_model, target="llvm", use_vm=use_vm)
-
-    # Save the model to disk
-    model_path = os.path.join(tmpdir, "saved_model.tar")
-    tvmc_model.save(model_path)
-
-    # Load the model into a new TVMCModel
-    new_tvmc_model = TVMCModel(model_path=model_path)
-
-    # Check that the two models match.
-    assert str(new_tvmc_model.mod) == str(tvmc_model.mod)
-    # Check that tuning records and the compiled package are recoverable.
-    assert path.exists(new_tvmc_model.default_package_path())
-    assert path.exists(new_tvmc_model.default_tuning_records_path())
diff --git a/tests/python/driver/tvmc/test_parse_config_file.py b/tests/python/driver/tvmc/test_parse_config_file.py
deleted file mode 100644
index cc822ed640a9..000000000000
--- a/tests/python/driver/tvmc/test_parse_config_file.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import os
-import shlex
-
-import tvm
-from tvm.driver.tvmc.main import _main
-from tvm.driver.tvmc.config_options import convert_config_json_to_cli, get_configs_json_dir
-
-
-def test_parse_json_config_file_one_target():
-    tokens = convert_config_json_to_cli(
-        {"targets": [{"kind": "llvm"}], "output": "resnet50-v2-7-autotuner_records.json"}
-    )
-    expected_tokens = [{"target": "llvm"}, {"output": "resnet50-v2-7-autotuner_records.json"}]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_parse_json_config_file_multipile_targets():
-    tokens = convert_config_json_to_cli(
-        {
-            "targets": [{"kind": "llvm"}, {"kind": "c", "mcpu": "cortex-m55"}],
-            "tuning-records": "resnet50-v2-7-autotuner_records.json",
-            "pass-config": {"tir.disable_vectorizer": "1"},
-        }
-    )
-    expected_tokens = [
-        {"target_c_mcpu": "cortex-m55"},
-        {"target": "llvm, c"},
-        {"tuning_records": "resnet50-v2-7-autotuner_records.json"},
-        {"pass_config": ["tir.disable_vectorizer=1"]},
-    ]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_parse_json_config_file_executor():
-    tokens = convert_config_json_to_cli(
-        {
-            "executor": {"kind": "aot", "interface-api": "c"},
-            "inputs": "imagenet_cat.npz",
-            "max-local-memory-per-block": "4",
-            "repeat": "100",
-        }
-    )
-    expected_tokens = [
-        {"executor": "aot"},
-        {"executor_aot_interface_api": "c"},
-        {"inputs": "imagenet_cat.npz"},
-        {"max_local_memory_per_block": "4"},
-        {"repeat": "100"},
-    ]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_parse_json_config_file_target_and_executor():
-    tokens = convert_config_json_to_cli(
-        {
-            "targets": [
-                {"kind": "ethos-u -accelerator_config=ethos-u55-256"},
-                {"kind": "c", "mcpu": "cortex-m55"},
-                {"kind": "cmsis-nn"},
-            ],
-            "executor": {"kind": "aot", "interface-api": "c", "unpacked-api": "1"},
-            "inputs": "imagenet_cat.npz",
-            "max-local-memory-per-block": "4",
-            "repeat": "100",
-        }
-    )
-    expected_tokens = [
-        {"target_c_mcpu": "cortex-m55"},
-        {"target": "ethos-u -accelerator_config=ethos-u55-256, c, cmsis-nn"},
-        {"executor": "aot"},
-        {"executor_aot_interface_api": "c"},
-        {"executor_aot_unpacked_api": "1"},
-        {"inputs": "imagenet_cat.npz"},
-        {"max_local_memory_per_block": "4"},
-        {"repeat": "100"},
-    ]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_parse_json_config_file_runtime():
-    tokens = convert_config_json_to_cli(
-        {
-            "targets": [
-                {"kind": "cmsis-nn", "from_device": "1"},
-                {"kind": "c", "mcpu": "cortex-m55"},
-            ],
-            "runtime": {"kind": "crt"},
-            "inputs": "imagenet_cat.npz",
-            "output": "predictions.npz",
-            "pass-config": {"tir.disable_vectorize": "1", "relay.backend.use_auto_scheduler": "0"},
-        }
-    )
-    expected_tokens = [
-        {"target_cmsis-nn_from_device": "1"},
-        {"target_c_mcpu": "cortex-m55"},
-        {"target": "cmsis-nn, c"},
-        {"runtime": "crt"},
-        {"inputs": "imagenet_cat.npz"},
-        {"output": "predictions.npz"},
-        {"pass_config": ["tir.disable_vectorize=1", "relay.backend.use_auto_scheduler=0"]},
-    ]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_tvmc_get_configs_json_dir(tmpdir_factory, monkeypatch):
-    # Reset global state
-    monkeypatch.setattr(tvm.driver.tvmc.config_options, "CONFIGS_JSON_DIR", None)
-
-    # Get default directory for reference
-    default_dir = get_configs_json_dir()
-
-    # Set custom dir which does not exist -> ignore
-    monkeypatch.setattr(tvm.driver.tvmc.config_options, "CONFIGS_JSON_DIR", None)
-    monkeypatch.setenv("TVM_CONFIGS_JSON_DIR", "not_a_directory")
-    result = get_configs_json_dir()
-    assert_msg = "Non-existent directory passed via TVM_CONFIGS_JSON_DIR should be ignored."
-    assert result == default_dir, assert_msg
-
-    # Set custom dir which does exist
-    monkeypatch.setattr(tvm.driver.tvmc.config_options, "CONFIGS_JSON_DIR", None)
-    configs_dir = tmpdir_factory.mktemp("configs")
-    monkeypatch.setenv("TVM_CONFIGS_JSON_DIR", str(configs_dir))
-    result = get_configs_json_dir()
-    assert_msg = (
-        "Custom value passed via TVM_CONFIGS_JSON_DIR should be used instead of default one."
-    )
-    assert result != default_dir and result is not None, assert_msg
diff --git a/tests/python/driver/tvmc/test_pass_config.py b/tests/python/driver/tvmc/test_pass_config.py
deleted file mode 100644
index 034f761f1d6b..000000000000
--- a/tests/python/driver/tvmc/test_pass_config.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-from unittest import mock
-
-from tvm.contrib.target.vitis_ai import vitis_ai_available
-
-from tvm.driver.tvmc import TVMCException
-from tvm.driver.tvmc.pass_config import parse_configs
-from tvm.tir.transform import PrimFuncPass
-from tvm.ir.transform import Sequential
-
-
-def test_config_invalid_format():
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["relay.backend.use_auto_scheduler.missing.value"])
-
-
-def test_config_missing_from_tvm():
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["relay.backend.use_auto_scheduler.missing.value=1234"])
-
-
-def test_config_unsupported_tvmc_config():
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["tir.LoopPartition=value"])
-
-
-def test_config_empty():
-    with pytest.raises(TVMCException):
-        _ = parse_configs([""])
-
-
-def test_config_valid_config_bool():
-    configs = parse_configs(["relay.backend.use_auto_scheduler=true"])
-
-    assert len(configs) == 1
-    assert "relay.backend.use_auto_scheduler" in configs.keys()
-    assert configs["relay.backend.use_auto_scheduler"] == True
-
-
-@pytest.mark.skipif(
-    not vitis_ai_available(),
-    reason="--target vitis-ai is not available. TVM built with 'USE_VITIS_AI OFF'",
-)
-def test_config_valid_multiple_configs():
-    configs = parse_configs(
-        [
-            "relay.backend.use_auto_scheduler=false",
-            "tir.detect_global_barrier=10",
-            "relay.ext.vitis_ai.options.build_dir=mystring",
-        ]
-    )
-
-    assert len(configs) == 3
-    assert "relay.backend.use_auto_scheduler" in configs.keys()
-    assert configs["relay.backend.use_auto_scheduler"] == False
-    assert "tir.detect_global_barrier" in configs.keys()
-    assert configs["tir.detect_global_barrier"] == 10
-    assert "relay.ext.vitis_ai.options.build_dir" in configs.keys()
-    assert configs["relay.ext.vitis_ai.options.build_dir"] == "mystring"
-
-
-def test_add_lower_pass_multi_built_in_pass():
-    configs = parse_configs(
-        [
-            "tir.add_lower_pass=1,tir.transform.UnrollLoop",
-            "tir.add_lower_pass=1,tir.transform.HoistIfThenElse,2,tir.transform.LoopPartition",
-        ]
-    )
-
-    assert len(configs["tir.add_lower_pass"]) == 3
-    # opt_level: 1, pass: tir.transform.UnrollLoop
-    assert configs["tir.add_lower_pass"][0][0] == 1
-    assert isinstance(configs["tir.add_lower_pass"][0][1], PrimFuncPass)
-    # opt_level: 1, pass: tir.transform.HoistIfThenElse
-    assert configs["tir.add_lower_pass"][1][0] == 1
-    assert isinstance(configs["tir.add_lower_pass"][1][1], Sequential)
-    assert configs["tir.add_lower_pass"][1][1].pass_info.name == "tir.HoistIfThenElse"
-    # opt_level: 2, pass: tir.transform.LoopPartition
-    assert configs["tir.add_lower_pass"][2][0] == 2
-    assert isinstance(configs["tir.add_lower_pass"][2][1], PrimFuncPass)
-
-
-def test_add_lower_pass_multi_external_pass():
-    fake_pass_1 = mock.MagicMock()
-    fake_pass_2 = mock.MagicMock()
-    fake_pass_3 = mock.MagicMock()
-    with mock.patch.dict(
-        "sys.modules",
-        {"fake_module": fake_pass_1, "fake_module": fake_pass_2, "fake_module": fake_pass_3},
-    ):
-        configs = parse_configs(
-            [
-                "tir.add_lower_pass=1,fake_module.fake_pass_1,2,fake_module.fake_pass2",
-                "tir.add_lower_pass=3,fake_module.fake_pass_3",
-            ]
-        )
-        assert len(configs["tir.add_lower_pass"]) == 3
-        # opt_level: 1, pass: fake_module.fake_pass_1
-        assert configs["tir.add_lower_pass"][0][0] == 1
-        # opt_level: 2, pass: fake_module.fake_pass_2
-        assert configs["tir.add_lower_pass"][1][0] == 2
-        # opt_level: 3, pass: fake_module.fake_pass_3
-        assert configs["tir.add_lower_pass"][2][0] == 3
-
-
-def test_add_lower_pass_multi_mix_pass():
-    fake_pass_1 = mock.MagicMock()
-    fake_pass_2 = mock.MagicMock()
-    with mock.patch.dict("sys.modules", {"fake_module": fake_pass_1, "fake_module": fake_pass_2}):
-        configs = parse_configs(
-            [
-                "tir.add_lower_pass=1,fake_module.fake_pass_1,1,tir.transform.UnrollLoop",
-                "tir.add_lower_pass=2,fake_module.fake_pass_2,2,tir.transform.LoopPartition",
-            ]
-        )
-        assert len(configs["tir.add_lower_pass"]) == 4
-        # opt_level: 1, pass: fake_module.fake_pass_1
-        assert configs["tir.add_lower_pass"][0][0] == 1
-        # opt_level: 1, pass: tir.transform.UnrollLoop
-        assert configs["tir.add_lower_pass"][1][0] == 1
-        assert isinstance(configs["tir.add_lower_pass"][1][1], PrimFuncPass)
-        # opt_level: 2, pass: fake_module.fake_pass_2
-        assert configs["tir.add_lower_pass"][2][0] == 2
-        # opt_level: 2, pass: tir.transform.LoopPartition
-        assert configs["tir.add_lower_pass"][3][0] == 2
-        assert isinstance(configs["tir.add_lower_pass"][3][1], PrimFuncPass)
-
-
-def test_add_lower_pass_invalid_format():
-    # wrong format
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["tir.add_lower_pass=tir.transform.UnrollLoop,1"])
-    # missing pass name
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["tir.add_lower_pass=1,tir.transform.UnrollLoop,3"])
-    # wrong opt level
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["tir.add_lower_pass=a,tir.transform.UnrollLoop"])
-    # fake module
-    with pytest.raises(ModuleNotFoundError):
-        _ = parse_configs(
-            ["tir.add_lower_pass=1,tir.transform.UnrollLoop,2,path.to.module.fake_func"]
-        )
-    # real module and fake func
-    with pytest.raises(TVMCException):
-        _ = parse_configs(["tir.add_lower_pass=1,tir.transform.UnrollLoop,2,tvm.tir.fake_func"])
diff --git a/tests/python/driver/tvmc/test_pass_list.py b/tests/python/driver/tvmc/test_pass_list.py
deleted file mode 100644
index 5b6c6710158d..000000000000
--- a/tests/python/driver/tvmc/test_pass_list.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import pytest
-from tvm.driver.tvmc.pass_list import parse_pass_list_str
-
-
-def test_parse_pass_list_str():
-    assert [""] == parse_pass_list_str("")
-    assert ["FoldScaleAxis", "FuseOps"] == parse_pass_list_str("FoldScaleAxis,FuseOps")
-
-    with pytest.raises(argparse.ArgumentTypeError) as ate:
-        parse_pass_list_str("MyYobaPass,MySuperYobaPass,FuseOps")
-
-    assert "MyYobaPass" in str(ate.value)
-    assert "MySuperYobaPass" in str(ate.value)
-    assert "FuseOps" in str(ate.value)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/driver/tvmc/test_registry_options.py b/tests/python/driver/tvmc/test_registry_options.py
deleted file mode 100644
index dbd7cc050091..000000000000
--- a/tests/python/driver/tvmc/test_registry_options.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-
-import pytest
-
-from tvm.driver.tvmc import TVMCException
-from tvm.driver.tvmc.registry import generate_registry_args, reconstruct_registry_entity
-from tvm.relay.backend import Executor
-
-
-def test_registry_to_argparse():
-    parser = argparse.ArgumentParser()
-    generate_registry_args(parser, Executor)
-    parsed, _ = parser.parse_known_args(["--executor=aot", "--executor-aot-interface-api=c"])
-
-    assert parsed.executor == "aot"
-    assert parsed.executor_aot_interface_api == "c"
-
-
-def test_registry_to_argparse_default():
-    parser = argparse.ArgumentParser()
-    generate_registry_args(parser, Executor, "aot")
-    parsed, _ = parser.parse_known_args([])
-
-    assert parsed.executor == "aot"
-
-
-def test_mapping_registered_args():
-    parser = argparse.ArgumentParser()
-    generate_registry_args(parser, Executor)
-    parsed, _ = parser.parse_known_args(["--executor=aot", "--executor-aot-interface-api=c"])
-    entity = reconstruct_registry_entity(parsed, Executor)
-
-    assert isinstance(entity, Executor)
-    assert "interface-api" in entity
-    assert entity["interface-api"] == "c"
-
-
-def test_mapping_registered_args_no_match_for_name():
-    parser = argparse.ArgumentParser()
-    generate_registry_args(parser, Executor)
-    parsed, _ = parser.parse_known_args(["--executor=woof"])
-
-    with pytest.raises(TVMCException, match='Executor "woof" is not defined'):
-        reconstruct_registry_entity(parsed, Executor)
-
-
-def test_mapping_registered_args_no_arg():
-    parser = argparse.ArgumentParser()
-    generate_registry_args(parser, Executor)
-    parsed, _ = parser.parse_known_args([])
-
-    assert reconstruct_registry_entity(parsed, Executor) == None
-
-
-def test_mapping_registered_args_mismatch_for_arg():
-    parser = argparse.ArgumentParser()
-    generate_registry_args(parser, Executor)
-    parsed, _ = parser.parse_known_args(["--executor=aot", "--executor-graph-link-params=1"])
-
-    with pytest.raises(
-        TVMCException,
-        match="Passed --executor-graph-link-params but did not specify graph executor",
-    ):
-        reconstruct_registry_entity(parsed, Executor)
diff --git a/tests/python/driver/tvmc/test_runner.py b/tests/python/driver/tvmc/test_runner.py
deleted file mode 100644
index 5e6386614b1c..000000000000
--- a/tests/python/driver/tvmc/test_runner.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import numpy as np
-
-from tvm import rpc
-from tvm.driver import tvmc
-from tvm.driver.tvmc.model import TVMCResult
-from tvm.driver.tvmc.result_utils import get_top_results
-from tvm.runtime.module import BenchmarkResult
-
-
-def test_generate_tensor_data_zeros():
-    expected_shape = (2, 3)
-    expected_dtype = "uint8"
-    sut = tvmc.runner.generate_tensor_data(expected_shape, expected_dtype, "zeros")
-
-    assert sut.shape == (2, 3)
-
-
-def test_generate_tensor_data_ones():
-    expected_shape = (224, 224)
-    expected_dtype = "uint8"
-    sut = tvmc.runner.generate_tensor_data(expected_shape, expected_dtype, "ones")
-
-    assert sut.shape == (224, 224)
-
-
-def test_generate_tensor_data_random():
-    expected_shape = (2, 3)
-    expected_dtype = "uint8"
-    sut = tvmc.runner.generate_tensor_data(expected_shape, expected_dtype, "random")
-
-    assert sut.shape == (2, 3)
-
-
-def test_generate_tensor_data__type_unknown():
-    with pytest.raises(tvmc.TVMCException) as e:
-        tvmc.runner.generate_tensor_data((2, 3), "float32", "whatever")
-
-
-def test_format_times__contains_header():
-    fake_result = TVMCResult(outputs=None, times=BenchmarkResult([0.6, 1.2, 0.12, 0.42]))
-    sut = fake_result.format_times()
-    assert "std (ms)" in sut
-
-
-def test_get_top_results_keep_results():
-    fake_outputs = {"output_0": np.array([[1, 2, 3, 4], [5, 6, 7, 8]])}
-    fake_result = TVMCResult(outputs=fake_outputs, times=None)
-    number_of_results_wanted = 3
-    sut = get_top_results(fake_result, number_of_results_wanted)
-
-    expected_number_of_lines = 2
-    assert len(sut) == expected_number_of_lines
-
-    expected_number_of_results_per_line = 3
-    assert len(sut[0]) == expected_number_of_results_per_line
-    assert len(sut[1]) == expected_number_of_results_per_line
-
-
-@pytest.mark.parametrize("use_vm", [True, False])
-def test_run_tflite_module__with_profile__valid_input(
-    use_vm, tflite_mobilenet_v1_1_quant, tflite_compile_model, imagenet_cat
-):
-    # some CI environments wont offer TFLite, so skip in case it is not present
-    pytest.importorskip("tflite")
-
-    inputs = np.load(imagenet_cat)
-    input_dict = {"input": inputs["input"].astype("uint8")}
-
-    tflite_compiled_model = tflite_compile_model(tflite_mobilenet_v1_1_quant, use_vm=use_vm)
-    result = tvmc.run(
-        tflite_compiled_model,
-        inputs=input_dict,
-        benchmark=True,
-        hostname=None,
-        device="cpu",
-        profile=True,
-    )
-
-    # collect the top 5 results
-    top_5_results = get_top_results(result, 5)
-    top_5_ids = top_5_results[0]
-
-    # IDs were collected from this reference:
-    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/
-    # java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
-    tiger_cat_mobilenet_id = 283
-
-    assert (
-        tiger_cat_mobilenet_id in top_5_ids
-    ), "tiger cat is expected in the top-5 for mobilenet v1"
-    assert isinstance(result.outputs, dict)
-    assert isinstance(result.times, BenchmarkResult)
-    assert "output_0" in result.outputs.keys()
-
-
-def test_run_tflite_module_with_rpc(
-    tflite_mobilenet_v1_1_quant, tflite_compile_model, imagenet_cat
-):
-    """
-    Test to check that TVMC run is functional when it is being used in
-    conjunction with an RPC server.
-    """
-    pytest.importorskip("tflite")
-
-    inputs = np.load(imagenet_cat)
-    input_dict = {"input": inputs["input"].astype("uint8")}
-
-    tflite_compiled_model = tflite_compile_model(tflite_mobilenet_v1_1_quant)
-
-    server = rpc.Server("127.0.0.1", 9099)
-    result = tvmc.run(
-        tflite_compiled_model,
-        inputs=input_dict,
-        hostname=server.host,
-        port=server.port,
-        device="cpu",
-    )
-
-    top_5_results = get_top_results(result, 5)
-    top_5_ids = top_5_results[0]
-
-    # IDs were collected from this reference:
-    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/
-    # java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
-    tiger_cat_mobilenet_id = 283
-
-    assert (
-        tiger_cat_mobilenet_id in top_5_ids
-    ), "tiger cat is expected in the top-5 for mobilenet v1"
-    assert isinstance(result.outputs, dict)
-    assert "output_0" in result.outputs.keys()
-
-
-@pytest.mark.parametrize("use_vm", [True, False])
-@pytest.mark.parametrize(
-    "benchmark,repeat,number,expected_len", [(False, 1, 1, 0), (True, 1, 1, 1), (True, 3, 2, 3)]
-)
-def test_run_relay_module__benchmarking(
-    use_vm,
-    benchmark,
-    repeat,
-    number,
-    expected_len,
-    relay_text_conv2d,
-    relay_compile_model,
-):
-    """Check the length of the results from benchmarking is what is expected by expected_len."""
-    shape_dict = {"data": (1, 3, 64, 64), "weight": (3, 3, 5, 5)}
-    input_dict = {
-        "data": np.random.randint(low=0, high=10, size=shape_dict["data"], dtype="uint8"),
-        "weight": np.random.randint(low=0, high=10, size=shape_dict["weight"], dtype="int8"),
-    }
-
-    tflite_compiled_model = relay_compile_model(
-        relay_text_conv2d, shape_dict=shape_dict, use_vm=use_vm
-    )
-    result = tvmc.run(
-        tflite_compiled_model,
-        inputs=input_dict,
-        hostname=None,
-        device="cpu",
-        benchmark=benchmark,
-        repeat=repeat,
-        number=number,
-    )
-
-    # When no benchmarking is used, an empty list is used to
-    # represent an absence of results.
-    if isinstance(result.times, list):
-        assert len(result.times) == expected_len
-    else:
-        assert len(result.times.results) == expected_len
diff --git a/tests/python/driver/tvmc/test_shape_parser.py b/tests/python/driver/tvmc/test_shape_parser.py
deleted file mode 100644
index b7b96ae4efa9..000000000000
--- a/tests/python/driver/tvmc/test_shape_parser.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-
-import pytest
-from tvm.driver.tvmc.shape_parser import parse_shape_string
-
-
-def test_shape_parser():
-    # Check that a valid input is parsed correctly
-    shape_string = "input:[10,10,10]"
-    shape_dict = parse_shape_string(shape_string)
-    assert shape_dict == {"input": [10, 10, 10]}
-
-
-def test_alternate_syntax():
-    shape_string = "input:0:[10,10,10] input2:[20,20,20,20]"
-    shape_dict = parse_shape_string(shape_string)
-    assert shape_dict == {"input:0": [10, 10, 10], "input2": [20, 20, 20, 20]}
-
-
-@pytest.mark.parametrize(
-    "shape_string",
-    [
-        "input:[10,10,10] input2:[20,20,20,20]",
-        "input: [10, 10, 10] input2: [20, 20, 20, 20]",
-        "input:[10,10,10],input2:[20,20,20,20]",
-    ],
-)
-def test_alternate_syntaxes(shape_string):
-    shape_dict = parse_shape_string(shape_string)
-    assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]}
-
-
-def test_negative_dimensions():
-    # Check that negative dimensions parse to Any correctly.
-    shape_string = "input:[-1,3,224,224]"
-    shape_dict = parse_shape_string(shape_string)
-    # Convert to strings to allow comparison with Any.
-    assert str(shape_dict) == "{'input': [T.Any(), 3, 224, 224]}"
-
-
-def test_multiple_valid_gpu_inputs():
-    # Check that multiple valid gpu inputs are parsed correctly.
-    shape_string = "gpu_0/data_0:[1, -1,224,224] gpu_1/data_1:[7, 7]"
-    shape_dict = parse_shape_string(shape_string)
-    expected = "{'gpu_0/data_0': [1, T.Any(), 224, 224], 'gpu_1/data_1': [7, 7]}"
-    assert str(shape_dict) == expected
-
-
-def test_invalid_pattern():
-    shape_string = "input:[a,10]"
-    with pytest.raises(argparse.ArgumentTypeError):
-        parse_shape_string(shape_string)
-
-
-def test_invalid_separators():
-    shape_string = "input:5,10 input2:10,10"
-    with pytest.raises(argparse.ArgumentTypeError):
-        parse_shape_string(shape_string)
-
-
-def test_invalid_colon():
-    shape_string = "gpu_0/data_0:5,10 :test:10,10"
-    with pytest.raises(argparse.ArgumentTypeError):
-        parse_shape_string(shape_string)
-
-
-@pytest.mark.parametrize(
-    "shape_string",
-    [
-        "gpu_0/data_0:5,10 /:10,10",
-        "gpu_0/data_0:5,10 data/:10,10",
-        "gpu_0/data_0:5,10 /data:10,10",
-        "gpu_0/invalid/data_0:5,10 data_1:10,10",
-    ],
-)
-def test_invalid_slashes(shape_string):
-    with pytest.raises(argparse.ArgumentTypeError):
-        parse_shape_string(shape_string)
-
-
-def test_dot():
-    # Check dot in input name
-    shape_string = "input.1:[10,10,10]"
-    shape_dict = parse_shape_string(shape_string)
-    assert shape_dict == {"input.1": [10, 10, 10]}
diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py
deleted file mode 100644
index 7ce8ee9eae2c..000000000000
--- a/tests/python/driver/tvmc/test_target.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import tvm.testing
-from tvm.driver.tvmc import TVMCException
-from tvm.driver.tvmc.target import target_from_cli, tokenize_target, parse_target
-
-
-def test_target_from_cli__error_duplicate():
-    with pytest.raises(TVMCException):
-        _ = target_from_cli("llvm, llvm")
-
-
-def test_target_invalid_more_than_two_tvm_targets():
-    with pytest.raises(TVMCException):
-        _ = target_from_cli("cuda, opencl, llvm")
-
-
-def test_target_from_cli__error_target_not_found():
-    with pytest.raises(TVMCException):
-        _ = target_from_cli("invalidtarget")
-
-
-def test_target_two_tvm_targets():
-    tvm_target, extra_targets = target_from_cli(
-        "opencl -device=mali, llvm -mtriple=aarch64-linux-gnu"
-    )
-
-    assert "opencl" in str(tvm_target)
-    assert "llvm" in str(tvm_target.host)
-
-    # No extra targets
-    assert 0 == len(extra_targets)
-
-
-def test_tokenize_target_with_opts():
-    tokens = tokenize_target("foo -opt1=value1 --flag, bar -opt2=value2")
-    expected_tokens = ["foo", "-opt1=value1", "--flag", ",", "bar", "-opt2=value2"]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_tokenize_target_with_plus_sign():
-    tokens = tokenize_target("foo -opt1=+value1 --flag, bar -opt2=test,+v")
-    expected_tokens = ["foo", "-opt1=+value1", "--flag", ",", "bar", "-opt2=test,+v"]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_tokenize_target_with_commas():
-    tokens = tokenize_target("foo -opt1=v,a,l,u,e,1 --flag")
-    expected_tokens = ["foo", "-opt1=v,a,l,u,e,1", "--flag"]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_tokenize_target_with_commas_and_single_quotes():
-    tokens = tokenize_target("foo -opt1='v, a, l, u, e', bar")
-    expected_tokens = ["foo", "-opt1='v, a, l, u, e'", ",", "bar"]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_tokenize_target_with_commas_and_double_quotes():
-    tokens = tokenize_target('foo -opt1="v, a, l, u, e", bar')
-    expected_tokens = ["foo", '-opt1="v, a, l, u, e"', ",", "bar"]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_tokenize_target_with_dashes():
-    tokens = tokenize_target("foo-bar1 -opt-1=t-e-s-t, baz")
-    expected_tokens = ["foo-bar1", "-opt-1=t-e-s-t", ",", "baz"]
-
-    assert len(tokens) == len(expected_tokens)
-    assert tokens == expected_tokens
-
-
-def test_parse_single_target_with_opts():
-    targets = parse_target("llvm -device=arm_cpu -mattr=+fp")
-
-    assert len(targets) == 1
-    assert "device" in targets[0]["opts"]
-    assert "mattr" in targets[0]["opts"]
-
-
-def test_parse_multiple_target():
-    targets = parse_target("compute-library, llvm -device=arm_cpu")
-
-    assert len(targets) == 2
-    assert "compute-library" == targets[0]["name"]
-    assert "llvm" == targets[1]["name"]
-
-
-def test_parse_quotes_and_separators_on_options():
-    targets_no_quote = parse_target("foo -option1=+v1.0x,+value,+bar")
-    targets_single_quote = parse_target("foo -option1='+v1.0x,+value'")
-    targets_double_quote = parse_target('foo -option1="+v1.0x,+value"')
-
-    assert len(targets_no_quote) == 1
-    assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"]
-
-    assert len(targets_single_quote) == 1
-    assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"]
-
-    assert len(targets_double_quote) == 1
-    assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"]
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/driver/tvmc/test_target_options.py b/tests/python/driver/tvmc/test_target_options.py
deleted file mode 100644
index 352bfac7940d..000000000000
--- a/tests/python/driver/tvmc/test_target_options.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-
-import pytest
-
-import tvm
-from tvm.driver.tvmc import TVMCException
-from tvm.driver.tvmc.target import generate_target_args, reconstruct_target_args, target_from_cli
-
-
-def test_target_to_argparse():
-    parser = argparse.ArgumentParser()
-    generate_target_args(parser)
-    parsed, _ = parser.parse_known_args(
-        ["--target=llvm", "--target-llvm-mattr=+fp,+mve", "--target-llvm-mcpu=cortex-m3"]
-    )
-    assert parsed.target == "llvm"
-    assert parsed.target_llvm_mcpu == "cortex-m3"
-    assert parsed.target_llvm_mattr == "+fp,+mve"
-
-
-@tvm.testing.requires_mrvl
-def test_target_to_argparse_for_mrvl_hybrid():
-    parser = argparse.ArgumentParser()
-    generate_target_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--target=mrvl, llvm",
-            "--target-mrvl-mattr=wb_pin_ocm=1,quantize=fp16",
-            "--target-mrvl-num_tiles=2",
-            "--target-mrvl-mcpu=cnf10kb",
-        ]
-    )
-
-    assert parsed.target == "mrvl, llvm"
-    assert parsed.target_mrvl_mattr == "wb_pin_ocm=1,quantize=fp16"
-    assert parsed.target_mrvl_num_tiles == 2
-    assert parsed.target_mrvl_mcpu == "cnf10kb"
-
-
-@tvm.testing.requires_mrvl
-def test_default_arg_for_mrvl_hybrid():
-    parser = argparse.ArgumentParser()
-    generate_target_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--target=mrvl, llvm",
-        ]
-    )
-    assert parsed.target == "mrvl, llvm"
-    assert parsed.target_mrvl_mcpu == "cn10ka"
-    assert parsed.target_mrvl_num_tiles == 8
-
-
-@tvm.testing.requires_mrvl
-# Test for default(LLVM) target, when built with USE_MRVL=ON
-def test_mrvl_build_with_llvm_only_target():
-    parser = argparse.ArgumentParser()
-    generate_target_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--target=llvm",
-        ]
-    )
-    assert parsed.target == "llvm"
-
-
-@tvm.testing.requires_vitis_ai
-def test_composite_target_cmd_line_help():
-    parser = argparse.ArgumentParser()
-    generate_target_args(parser)
-    assert parser._option_string_actions["--target-vitis-ai-dpu"].help == "Vitis AI DPU identifier"
-    assert (
-        parser._option_string_actions["--target-vitis-ai-build_dir"].help
-        == "Build directory to be used (optional, debug)"
-    )
-    assert (
-        parser._option_string_actions["--target-vitis-ai-work_dir"].help
-        == "Work directory to be used (optional, debug)"
-    )
-    assert (
-        parser._option_string_actions["--target-vitis-ai-export_runtime_module"].help
-        == "Export the Vitis AI runtime module to this file"
-    )
-    assert (
-        parser._option_string_actions["--target-vitis-ai-load_runtime_module"].help
-        == "Load the Vitis AI runtime module to this file"
-    )
-
-
-def test_target_recombobulation_single():
-    tvm_target, _ = target_from_cli("llvm", {"llvm": {"mcpu": "cortex-m3"}})
-
-    assert str(tvm_target) == "llvm -keys=arm_cpu,cpu -mcpu=cortex-m3"
-
-
-def test_target_recombobulation_many():
-    tvm_target, _ = target_from_cli(
-        "opencl -device=mali, llvm -mtriple=aarch64-linux-gnu",
-        {"llvm": {"mcpu": "cortex-m3"}, "opencl": {"max_num_threads": 404}},
-    )
-
-    assert "-max_num_threads=404" in str(tvm_target)
-    assert "-device=mali" in str(tvm_target)
-    assert "-mtriple=aarch64-linux-gnu" in str(tvm_target.host)
-    assert "-mcpu=cortex-m3" in str(tvm_target.host)
-
-
-def test_target_recombobulation_codegen():
-    tvm_target, extras = target_from_cli(
-        "cmsis-nn, c -mcpu=cortex-m55",
-        {"cmsis-nn": {"mcpu": "cortex-m55"}},
-    )
-
-    assert "-mcpu=cortex-m55" in str(tvm_target)
-    assert len(extras) == 1
-    assert extras[0]["name"] == "cmsis-nn"
-    assert extras[0]["opts"] == {"mcpu": "cortex-m55"}
-
-
-def test_error_if_target_missing():
-    with pytest.raises(
-        TVMCException,
-        match="Passed --target-opencl-max_num_threads but did not specify opencl target",
-    ):
-        target_from_cli(
-            "llvm",
-            {"opencl": {"max_num_threads": 404}},
-        )
diff --git a/tests/python/driver/tvmc/test_tracker.py b/tests/python/driver/tvmc/test_tracker.py
deleted file mode 100644
index 8734ad5c421f..000000000000
--- a/tests/python/driver/tvmc/test_tracker.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from tvm.driver.tvmc.tracker import tracker_host_port_from_cli
-
-
-def test_tracker_host_port_from_cli__hostname_port():
-    input_str = "1.2.3.4:9090"
-    expected_host = "1.2.3.4"
-    expected_port = 9090
-
-    actual_host, actual_port = tracker_host_port_from_cli(input_str)
-
-    assert expected_host == actual_host
-    assert expected_port == actual_port
-
-
-def test_tracker_host_port_from_cli__hostname_port__empty():
-    input_str = ""
-
-    actual_host, actual_port = tracker_host_port_from_cli(input_str)
-
-    assert actual_host is None
-    assert actual_port is None
-
-
-def test_tracker_host_port_from_cli__only_hostname__default_port_is_9090():
-    input_str = "1.2.3.4"
-    expected_host = "1.2.3.4"
-    expected_port = 9090
-
-    actual_host, actual_port = tracker_host_port_from_cli(input_str)
-
-    assert expected_host == actual_host
-    assert expected_port == actual_port
diff --git a/tests/python/driver/tvmc/test_transform.py b/tests/python/driver/tvmc/test_transform.py
deleted file mode 100644
index ebf067990d0f..000000000000
--- a/tests/python/driver/tvmc/test_transform.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-from unittest.mock import MagicMock
-
-import tvm
-from tvm import relay
-from tvm.relay import testing
-from tvm.relay.expr_functor import ExprMutator
-from tvm.ir.instrument import pass_instrument
-from tvm.driver.tvmc.transform import apply_graph_transforms
-from tvm.driver.tvmc.model import TVMCException
-
-
-def test_layout_transform_fold_constant(relay_conv2d):
-    """
-    Test layout is correctly transformed and constant folding is applied.
-    """
-    desired_layout = "NHWC"
-
-    @pass_instrument
-    class CollectPassNames:
-        def __init__(self):
-            self.names = []
-
-        def run_after_pass(self, _, info):
-            self.names.append(info.name)
-
-    pass_names = CollectPassNames()
-    with tvm.transform.PassContext(opt_level=3, instruments=[pass_names]):
-        apply_graph_transforms(relay_conv2d, {"desired_layout": [desired_layout]})
-
-    names = pass_names.names
-    assert "ConvertLayout" in names
-    assert "FoldConstant" in names
-    assert names.index("ConvertLayout") < names.index("FoldConstant")
-
-
-def test_layout_transform_convert_layout_pass_args(relay_conv2d, monkeypatch):
-    """
-    Check the convert layout desired layouts arugment is what is expected when
-    a desired layout is provided.
-    """
-    desired_layout = "NHWC"
-
-    mock_convert_layout = MagicMock()
-    mock_convert_layout.return_value = relay.transform.ConvertLayout({})
-    monkeypatch.setattr(relay.transform, "ConvertLayout", mock_convert_layout)
-
-    with tvm.transform.PassContext(opt_level=3):
-        apply_graph_transforms(relay_conv2d, {"desired_layout": [desired_layout]})
-
-    mock_convert_layout.assert_called_once_with(
-        {
-            "nn.conv2d": ["NHWC", "default"],
-            "nn.conv2d_transpose": ["NHWC", "default"],
-            "qnn.conv2d": ["NHWC", "default"],
-        }
-    )
-
-
-def test_layout_transform_convert_kernel_layout_pass_args(relay_conv2d, monkeypatch):
-    """
-    Check the convert layout desired layouts arugment is what is expected when
-    a non-default kernel layout is provided.
-    """
-    desired_layout = "NHWC:HWIO"
-    desired_layout_ops = ["nn.conv2d"]
-
-    mock_convert_layout = MagicMock()
-    mock_convert_layout.return_value = relay.transform.ConvertLayout({})
-    monkeypatch.setattr(relay.transform, "ConvertLayout", mock_convert_layout)
-
-    with tvm.transform.PassContext(opt_level=3):
-        apply_graph_transforms(
-            relay_conv2d,
-            {"desired_layout": [desired_layout], "desired_layout_ops": desired_layout_ops},
-        )
-
-    mock_convert_layout.assert_called_once_with(
-        {
-            "nn.conv2d": ["NHWC", "HWIO"],
-        }
-    )
-
-
-def test_layout_transform_convert_layout_pass_args_multiple(relay_conv2d, monkeypatch):
-    """
-    Check the convert layout desired layouts arugment is what is expected when
-    a multiple desired layouts are provided.
-    """
-    desired_layout = ["NHWC", "NCHW"]
-    desired_layout_ops = ["nn.max_pool2d", "qnn.conv2d"]
-
-    mock_convert_layout = MagicMock()
-    mock_convert_layout.return_value = relay.transform.ConvertLayout({})
-    monkeypatch.setattr(relay.transform, "ConvertLayout", mock_convert_layout)
-
-    with tvm.transform.PassContext(opt_level=3):
-        apply_graph_transforms(
-            relay_conv2d,
-            {"desired_layout": desired_layout, "desired_layout_ops": desired_layout_ops},
-        )
-
-    mock_convert_layout.assert_called_once_with(
-        {
-            "nn.max_pool2d": ["NHWC", "default"],
-            "qnn.conv2d": ["NCHW", "default"],
-        }
-    )
-
-
-@pytest.mark.parametrize(
-    "desired",
-    [
-        (["NHWC", "NCHW"], ["nn.max_pool2d"]),
-        (["NHWC", "NCHW"], None),
-    ],
-)
-def test_layout_transform_convert_layout_pass_args_multiple_invalid(
-    relay_conv2d,
-    monkeypatch,
-    desired,
-):
-    """
-    Check invalid cases when passing multiple values to the desired layouts argument.
-    """
-    desired_layout, desired_layout_ops = desired
-
-    mock_convert_layout = MagicMock()
-    mock_convert_layout.return_value = relay.transform.ConvertLayout({})
-    monkeypatch.setattr(relay.transform, "ConvertLayout", mock_convert_layout)
-
-    with pytest.raises(TVMCException):
-        with tvm.transform.PassContext(opt_level=3):
-            apply_graph_transforms(
-                relay_conv2d,
-                {"desired_layout": desired_layout, "desired_layout_ops": desired_layout_ops},
-            )
-
-
-def test_layout_transform_to_mixed_precision_pass_args_mock(relay_conv2d, monkeypatch):
-    """
-    Check the mixed precision arugments which are expected when
-    mixed precision arguments are provided.
-    """
-    mock_mixed_precision = MagicMock()
-    mock_mixed_precision.return_value = tvm.driver.tvmc.transform.MixedPrecision([], "")
-    monkeypatch.setattr(tvm.driver.tvmc.transform, "MixedPrecision", mock_mixed_precision)
-
-    with tvm.transform.PassContext(opt_level=3):
-        apply_graph_transforms(
-            relay_conv2d,
-            {
-                "mixed_precision": True,
-                "mixed_precision_ops": ["nn.conv2d"],
-                "mixed_precision_calculation_type": "float16",
-                "mixed_precision_acc_type": "float16",
-            },
-        )
-        mock_mixed_precision.assert_called_with(["nn.conv2d"], "float16")
-
-        apply_graph_transforms(
-            relay_conv2d,
-            {
-                "mixed_precision": True,
-                "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
-                "mixed_precision_calculation_type": "float16",
-                "mixed_precision_acc_type": "float32",
-            },
-        )
-        mock_mixed_precision.assert_called_with(["nn.conv2d", "nn.dense"], "float32")
-
-
-def test_layout_transform_to_mixed_precision_pass_args_graph():
-    """
-    Check the mixed precision arugments application with in a graph.
-    """
-
-    mod, params = testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-
-    class CheckOpMutator(ExprMutator):
-        """Inspect Ops According to expected types."""
-
-        def __init__(self, calculation_type, acc_type, op):
-            self.calculation_type = calculation_type
-            self.acc_type = acc_type
-            self.op = op
-            self.is_expected = True
-            super().__init__()
-
-        def visit_call(self, call):
-            visit = super().visit(call.args[0])
-            if call.op == relay.op.get(self.op):
-                if self.is_expected:
-                    self.is_expected = (
-                        call.checked_type.dtype == self.acc_type
-                        or call.args[0].checked_type.dtype == self.calculation_type
-                    )
-            return call
-
-        def check(self, func):
-            self.visit(func)
-            return self.is_expected
-
-    mod = apply_graph_transforms(
-        mod,
-        {
-            "mixed_precision": True,
-            "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
-            "mixed_precision_calculation_type": "float16",
-            "mixed_precision_acc_type": "float16",
-        },
-        params,
-    )
-    ret = CheckOpMutator("float16", "float16", "nn.conv2d").check(mod["main"])
-    assert ret
-    ret = CheckOpMutator("float16", "float16", "nn.dense").check(mod["main"])
-    assert ret
-
-    mod = apply_graph_transforms(
-        mod,
-        {
-            "mixed_precision": True,
-            "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
-            "mixed_precision_calculation_type": "float16",
-            "mixed_precision_acc_type": "float32",
-        },
-        params,
-    )
-    ret = CheckOpMutator("float16", "float32", "nn.conv2d").check(mod["main"])
-    assert ret
-    ret = CheckOpMutator("float16", "float32", "nn.dense").check(mod["main"])
-    assert ret
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/driver/tvmc/test_workspace_pools.py b/tests/python/driver/tvmc/test_workspace_pools.py
deleted file mode 100644
index 5d5e0851b2cc..000000000000
--- a/tests/python/driver/tvmc/test_workspace_pools.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import argparse
-
-import tvm
-from tvm.driver.tvmc.workspace_pools import (
-    generate_workspace_pools_args,
-    workspace_pools_recombobulate,
-)
-from tvm.target import Target
-from tvm.driver.tvmc import TVMCException
-
-
-def test_workspace_pools_argparse():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, unparsed = parser.parse_known_args(
-        [
-            "--workspace-pools=sram,flash",
-            "--workspace-pools-targets=sram:c,llvm",
-            "--workspace-pools-targets=flash:c",
-            "--workspace-pools-size-hint-bytes=sram:400",
-            "--workspace-pools-size-hint-bytes=sram:500",
-            "--workspace-pools-clock-frequency-hz=sram:500",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=sram:200",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=sram:100",
-            "--workspace-pools-read-latency-cycles=sram:50",
-            "--workspace-pools-read-latency-cycles=flash:30",
-            "--workspace-pools-write-latency-cycles=sram:9001",
-            "--workspace-pools-target-burst-bytes=sram:c:2",
-            "--workspace-pools-is-internal=sram:0",
-        ]
-    )
-
-    assert parsed.workspace_pools == "sram,flash"
-    assert parsed.workspace_pools_targets == ["sram:c,llvm", "flash:c"]
-    assert parsed.workspace_pools_size_hint_bytes == ["sram:400", "sram:500"]
-    assert parsed.workspace_pools_clock_frequency_hz == ["sram:500"]
-    assert parsed.workspace_pools_read_bandwidth_bytes_per_cycle == ["sram:200"]
-    assert parsed.workspace_pools_write_bandwidth_bytes_per_cycle == ["sram:100"]
-    assert parsed.workspace_pools_read_latency_cycles == ["sram:50", "flash:30"]
-    assert parsed.workspace_pools_write_latency_cycles == ["sram:9001"]
-    assert parsed.workspace_pools_target_burst_bytes == ["sram:c:2"]
-
-    assert unparsed == ["--workspace-pools-is-internal=sram:0"]
-
-
-def test_workspace_pools_recombobulate_empty():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args([])
-
-    targets = [Target("llvm")]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert memory_pools is None
-
-
-def test_workspace_pools_recombobulate():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:llvm",
-            "--workspace-pools-size-hint-bytes=sram:400",
-            "--workspace-pools-clock-frequency-hz=sram:500",
-        ]
-    )
-
-    targets = [Target("llvm")]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert len(memory_pools.pools) == 1
-    assert memory_pools.pools[0].pool_name == "sram"
-    assert memory_pools.pools[0].size_hint_bytes == 400
-    assert memory_pools.pools[0].clock_frequency_hz == 500
-
-
-def test_workspace_pools_defaults():
-    parser = argparse.ArgumentParser()
-    targets = [Target("llvm")]
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:llvm",
-        ]
-    )
-
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert len(memory_pools.pools) == 1
-    assert memory_pools.pools[0].pool_name == "sram"
-    assert memory_pools.pools[0].size_hint_bytes == -1
-    assert memory_pools.pools[0].clock_frequency_hz == -1
-    assert memory_pools.pools[0].read_bandwidth_bytes_per_cycle == -1
-    assert memory_pools.pools[0].write_bandwidth_bytes_per_cycle == -1
-    assert memory_pools.pools[0].read_latency_cycles == 0
-    assert memory_pools.pools[0].write_latency_cycles == 0
-    assert len(memory_pools.pools[0].target_burst_bytes) == 0
-
-
-def test_workspace_pools_recombobulate_multi_fields():
-    parser = argparse.ArgumentParser()
-    targets = [Target("c")]
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:c",
-            "--workspace-pools-size-hint-bytes=sram:400",
-            "--workspace-pools-clock-frequency-hz=sram:500",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=sram:200",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=sram:100",
-            "--workspace-pools-read-latency-cycles=sram:50",
-            "--workspace-pools-write-latency-cycles=sram:9001",
-            "--workspace-pools-target-burst-bytes=sram:c:2",
-        ]
-    )
-
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert len(memory_pools.pools) == 1
-    assert memory_pools.pools[0].pool_name == "sram"
-    assert memory_pools.pools[0].size_hint_bytes == 400
-    assert memory_pools.pools[0].clock_frequency_hz == 500
-    assert memory_pools.pools[0].read_bandwidth_bytes_per_cycle == 200
-    assert memory_pools.pools[0].write_bandwidth_bytes_per_cycle == 100
-    assert memory_pools.pools[0].read_latency_cycles == 50
-    assert memory_pools.pools[0].write_latency_cycles == 9001
-    assert len(memory_pools.pools[0].target_burst_bytes) == 1
-    assert memory_pools.pools[0].target_burst_bytes[targets[0]] == 2
-
-
-def test_workspace_pools_recombobulate_multi_fields_variant():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=flash",
-            "--workspace-pools-targets=flash:c",
-            "--workspace-pools-size-hint-bytes=flash:2048",
-            "--workspace-pools-clock-frequency-hz=flash:2000000",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=flash:4",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=flash:1",
-            "--workspace-pools-read-latency-cycles=flash:2000",
-            "--workspace-pools-write-latency-cycles=flash:1000",
-            "--workspace-pools-target-burst-bytes=flash:c:4",
-        ]
-    )
-
-    targets = [Target("c")]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert len(memory_pools.pools) == 1
-    assert memory_pools.pools[0].pool_name == "flash"
-    assert memory_pools.pools[0].size_hint_bytes == 2048
-    assert memory_pools.pools[0].clock_frequency_hz == 2000000
-    assert memory_pools.pools[0].read_bandwidth_bytes_per_cycle == 4
-    assert memory_pools.pools[0].write_bandwidth_bytes_per_cycle == 1
-    assert memory_pools.pools[0].read_latency_cycles == 2000
-    assert memory_pools.pools[0].write_latency_cycles == 1000
-    assert len(memory_pools.pools[0].target_burst_bytes) == 1
-    assert memory_pools.pools[0].target_burst_bytes[targets[0]] == 4
-
-
-def test_workspace_pools_recombobulate_multi_fields_multi_pools():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram,flash",
-            "--workspace-pools-targets=sram:c",
-            "--workspace-pools-targets=flash:c",
-            "--workspace-pools-size-hint-bytes=sram:1024",
-            "--workspace-pools-size-hint-bytes=flash:2048",
-            "--workspace-pools-clock-frequency-hz=sram:4000000",
-            "--workspace-pools-clock-frequency-hz=flash:2000000",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=sram:8",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=flash:4",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=sram:4",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=flash:1",
-            "--workspace-pools-read-latency-cycles=sram:250",
-            "--workspace-pools-read-latency-cycles=flash:2000",
-            "--workspace-pools-write-latency-cycles=sram:500",
-            "--workspace-pools-write-latency-cycles=flash:1000",
-            "--workspace-pools-target-burst-bytes=sram:c:8",
-            "--workspace-pools-target-burst-bytes=flash:c:4",
-        ]
-    )
-
-    targets = [Target("c")]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert len(memory_pools.pools) == 2
-
-    assert memory_pools.pools[0].pool_name == "sram"
-    assert memory_pools.pools[0].size_hint_bytes == 1024
-    assert memory_pools.pools[0].clock_frequency_hz == 4000000
-    assert memory_pools.pools[0].read_bandwidth_bytes_per_cycle == 8
-    assert memory_pools.pools[0].write_bandwidth_bytes_per_cycle == 4
-    assert memory_pools.pools[0].read_latency_cycles == 250
-    assert memory_pools.pools[0].write_latency_cycles == 500
-    assert len(memory_pools.pools[0].target_burst_bytes) == 1
-    assert memory_pools.pools[0].target_burst_bytes[targets[0]] == 8
-
-    assert memory_pools.pools[1].pool_name == "flash"
-    assert memory_pools.pools[1].size_hint_bytes == 2048
-    assert memory_pools.pools[1].clock_frequency_hz == 2000000
-    assert memory_pools.pools[1].read_bandwidth_bytes_per_cycle == 4
-    assert memory_pools.pools[1].write_bandwidth_bytes_per_cycle == 1
-    assert memory_pools.pools[1].read_latency_cycles == 2000
-    assert memory_pools.pools[1].write_latency_cycles == 1000
-    assert len(memory_pools.pools[1].target_burst_bytes) == 1
-    assert memory_pools.pools[1].target_burst_bytes[targets[0]] == 4
-
-
-def test_workspace_pools_recombobulate_multi_fields_ordering():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram,flash",
-            "--workspace-pools-targets=flash:c",
-            "--workspace-pools-targets=sram:c",
-            "--workspace-pools-size-hint-bytes=flash:2048",
-            "--workspace-pools-size-hint-bytes=sram:1024",
-            "--workspace-pools-clock-frequency-hz=sram:4000000",
-            "--workspace-pools-clock-frequency-hz=flash:2000000",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=sram:8",
-            "--workspace-pools-read-bandwidth-bytes-per-cycle=flash:4",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=sram:4",
-            "--workspace-pools-write-bandwidth-bytes-per-cycle=flash:1",
-            "--workspace-pools-read-latency-cycles=sram:250",
-            "--workspace-pools-read-latency-cycles=flash:2000",
-            "--workspace-pools-write-latency-cycles=flash:1000",
-            "--workspace-pools-write-latency-cycles=sram:500",
-            "--workspace-pools-target-burst-bytes=sram:c:8",
-            "--workspace-pools-target-burst-bytes=flash:c:4",
-        ]
-    )
-
-    targets = [Target("c")]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-    assert len(memory_pools.pools) == 2
-
-    assert memory_pools.pools[0].pool_name == "sram"
-    assert memory_pools.pools[0].size_hint_bytes == 1024
-    assert memory_pools.pools[0].write_latency_cycles == 500
-
-    assert memory_pools.pools[1].pool_name == "flash"
-    assert memory_pools.pools[1].size_hint_bytes == 2048
-    assert memory_pools.pools[1].write_latency_cycles == 1000
-
-
-def test_workspace_pools_recombobulate_multi_target():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:c,llvm",
-            "--workspace-pools-target-burst-bytes=sram:c:8",
-            "--workspace-pools-target-burst-bytes=sram:llvm:4",
-        ]
-    )
-
-    c_target = Target("c")
-    llvm_target = Target("llvm")
-    extra_targets = []
-
-    targets = [c_target, llvm_target]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, extra_targets)
-
-    assert len(memory_pools.pools) == 1
-
-    assert len(memory_pools.pools[0].target_burst_bytes) == 2
-    assert memory_pools.pools[0].target_burst_bytes[c_target] == 8
-    assert memory_pools.pools[0].target_burst_bytes[llvm_target] == 4
-
-
-def test_workspace_pools_recombobulate_no_target_burst_bytes():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:c",
-            "--workspace-pools-target-burst-bytes=sram:c:8",
-        ]
-    )
-
-    c_target = Target("c")
-    targets = [c_target]
-
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-
-    assert len(memory_pools.pools) == 1
-    assert len(memory_pools.pools[0].target_burst_bytes) == 1
-    assert memory_pools.pools[0].target_burst_bytes[c_target] == 8
-
-
-def test_workspace_pools_recombobulate_missing_target():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-        ]
-    )
-
-    c_target = Target("c")
-    with pytest.raises(TVMCException):
-        workspace_pools_recombobulate(parsed, [c_target], _)
-
-
-def test_workspace_pools_recombobulate_multi_target_multi_pool():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:c,llvm",
-            "--workspace-pools-target-burst-bytes=sram:c:8",
-            "--workspace-pools-target-burst-bytes=sram:llvm:4",
-        ]
-    )
-
-    c_target = Target("c")
-    llvm_target = Target("llvm")
-
-    targets = [c_target, llvm_target]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-
-    assert len(memory_pools.pools) == 1
-
-    assert len(memory_pools.pools[0].target_burst_bytes) == 2
-    assert memory_pools.pools[0].target_burst_bytes[llvm_target] == 4
-    assert memory_pools.pools[0].target_burst_bytes[c_target] == 8
-
-
-def test_workspace_pools_recombobulate_parameter_overrides():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram",
-            "--workspace-pools-targets=sram:c",
-            "--workspace-pools-size-hint-bytes=sram:800",
-            "--workspace-pools-size-hint-bytes=sram:400",
-            "--workspace-pools-clock-frequency-hz=sram:4000000",
-            "--workspace-pools-clock-frequency-hz=sram:3600000",
-        ]
-    )
-
-    c_target = Target("c")
-
-    targets = [c_target]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-
-    assert len(memory_pools.pools) == 1
-
-    assert memory_pools.pools[0].size_hint_bytes == 400
-    assert memory_pools.pools[0].clock_frequency_hz == 3600000
-
-
-def test_workspace_pools_recombobulate_single_pool_overrides():
-    parser = argparse.ArgumentParser()
-    generate_workspace_pools_args(parser)
-    parsed, _ = parser.parse_known_args(
-        [
-            "--workspace-pools=sram,flash",
-            "--workspace-pools-targets=sram:c",
-            "--workspace-pools-targets=flash:c",
-            "--workspace-pools-targets=sram:c,llvm",  # Override on one pool
-            "--workspace-pools-size-hint-bytes=sram:800",
-            "--workspace-pools-size-hint-bytes=flash:1200",
-            "--workspace-pools-size-hint-bytes=sram:400",  # Override on one pool
-        ]
-    )
-
-    c_target = Target("c")
-    llvm_target = Target("llvm")
-
-    targets = [c_target, llvm_target]
-    memory_pools = workspace_pools_recombobulate(parsed, targets, _)
-
-    assert len(memory_pools.pools) == 2
-
-    assert memory_pools.pools[0].size_hint_bytes == 400
-    assert memory_pools.pools[1].size_hint_bytes == 1200
-
-    assert len(memory_pools.pools[0].targets) == 2
-    assert len(memory_pools.pools[1].targets) == 1
diff --git a/tests/python/integration/__init__.py b/tests/python/integration/__init__.py
deleted file mode 100644
index 56984ac61535..000000000000
--- a/tests/python/integration/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Infrastructure and tests for e2e integration tests."""
diff --git a/tests/python/integration/test_arm_aprofile.py b/tests/python/integration/test_arm_aprofile.py
deleted file mode 100644
index d32fed00afe8..000000000000
--- a/tests/python/integration/test_arm_aprofile.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for Arm(R) A-Profile Architecture."""
-import os
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.transform import ToMixedPrecision, FoldConstant
-from tvm.relay.build_module import bind_params_by_name
-
-
-def get_mattr(dtype):
-    mattr = "+v8.2a,+neon"
-    if dtype == "float16":
-        mattr += ",+fullfp16"
-    elif dtype == "bfloat16":
-        mattr += ",+bf16"
-    return mattr
-
-
-@tvm.testing.skip_if_32bit(reason="skipping test for i386.")
-@pytest.mark.parametrize("dtype", ["float32", "float16", "bfloat16"])
-def test_conv2d(dtype):
-    """Test if Conv2d cross compiles with TVM schedules."""
-    dtype = "float32"
-    ishape = [1, 28, 28, 3]  # NHWC
-    kernel_size = (3, 3)
-    wshape = (kernel_size[0], kernel_size[1], ishape[-1], 2)  # HWIO
-    weight_data = np.random.uniform(-128, 127, wshape).astype(dtype)
-    invar = relay.var("data", relay.TensorType(ishape, dtype))
-    weight = relay.const(weight_data, dtype)
-    out = relay.op.nn.conv2d(
-        invar,
-        weight,
-        kernel_size=kernel_size,
-        channels=2,
-        strides=(1, 1),
-        padding=(0, 0),
-        dilation=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype=dtype,
-        out_layout="NHWC",
-    )
-    mod = tvm.IRModule.from_expr(relay.Function([invar], out))
-    params = {}
-
-    prefixed_network_name = dtype + ".conv2d"
-    lib_path = os.getcwd() + "/" + prefixed_network_name + ".mod.so"
-    target = "llvm -mtriple=aarch64-linux-gnu -mattr=" + get_mattr(dtype)
-
-    mod["main"] = bind_params_by_name(mod["main"], params)
-    if dtype in ["float16", "bfloat16"]:
-        mod = ToMixedPrecision(dtype)(mod)
-        mod = FoldConstant()(mod)
-
-    with tvm.transform.PassContext(opt_level=3):
-        lib = tvm.relay.build(mod, target=target, params=params)
-        lib.export_library(lib_path, cc="aarch64-linux-gnu-gcc")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py
deleted file mode 100644
index 8b5dd63fc859..000000000000
--- a/tests/python/integration/test_auto_tensorize.py
+++ /dev/null
@@ -1,462 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Integration test for MetaSchedule's auto tensorization."""
-import tempfile
-
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.meta_schedule.testing import relay_workload
-from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
-from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN
-from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN
-from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
-from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN
-
-
-CASCADELAKE_VNNI_TARGET = "llvm -mcpu=cascadelake -num-cores 4"
-SKYLAKE_AVX512_TARGET = "llvm -mcpu=skylake-avx512 -num-cores 4"
-
-
-def _get_schedule_rules_for_x86(intrin):
-    return [
-        ms.schedule_rule.ApplyCustomRule(),
-        ms.schedule_rule.AutoInline(
-            into_producer=False,
-            into_consumer=True,
-            inline_const_tensor=True,
-            disallow_if_then_else=True,
-            require_injective=True,
-            require_ordered=True,
-            disallow_op=["tir.exp"],
-        ),
-        ms.schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64),
-        ms.schedule_rule.MultiLevelTilingWithIntrin(
-            intrin,
-            structure="SSRSRS",
-            tile_binds=None,
-            max_innermost_factor=64,
-            vector_load_lens=None,
-            reuse_read=None,
-            reuse_write=ms.schedule_rule.ReuseType(
-                req="may",
-                levels=[1, 2],
-                scope="global",
-            ),
-        ),
-        ms.schedule_rule.MultiLevelTiling(
-            structure="SSRSRS",
-            tile_binds=None,
-            max_innermost_factor=64,
-            vector_load_lens=None,
-            reuse_read=None,
-            reuse_write=ms.schedule_rule.ReuseType(
-                req="may",
-                levels=[1, 2],
-                scope="global",
-            ),
-        ),
-        ms.schedule_rule.ParallelizeVectorizeUnroll(
-            max_jobs_per_core=16,
-            max_vectorize_extent=64,
-            unroll_max_steps=[0, 16, 64, 512],
-            unroll_explicit=True,
-        ),
-        ms.schedule_rule.RandomComputeLocation(),
-    ]
-
-
-SCH_RULES_FOR_VNNI = _get_schedule_rules_for_x86(VNNI_INTRIN)
-SCH_RULES_FOR_AVX512 = _get_schedule_rules_for_x86(AVX512_INTRIN)
-
-
-def _get_sch_rules_for_dp4a(intrin):
-    return [
-        ms.schedule_rule.MultiLevelTilingWithIntrin(
-            intrin,
-            structure="SSSRRSRS",
-            tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
-            max_innermost_factor=64,
-            vector_load_lens=[1, 2, 3, 4],
-            reuse_read=ms.schedule_rule.ReuseType(
-                req="must",
-                levels=[4],
-                scope="shared",
-            ),
-            reuse_write=ms.schedule_rule.ReuseType(
-                req="must",
-                levels=[3],
-                scope="local",
-            ),
-        ),
-        ms.schedule_rule.AutoInline(
-            into_producer=True,
-            into_consumer=True,
-            inline_const_tensor=True,
-            disallow_if_then_else=False,
-            require_injective=False,
-            require_ordered=False,
-            disallow_op=None,
-        ),
-        ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]),
-        ms.schedule_rule.ParallelizeVectorizeUnroll(
-            max_jobs_per_core=-1,  # disable parallelize
-            max_vectorize_extent=-1,  # disable vectorize
-            unroll_max_steps=[0, 16, 64, 512, 1024],
-            unroll_explicit=True,
-        ),
-    ]
-
-
-SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_S8S8S32_INTRIN)
-SCH_RULES_FOR_SDOT4 = _get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN)
-
-POSTPROCS_FOR_VNNI = [
-    ms.postproc.DisallowDynamicLoop(),
-    ms.postproc.RewriteParallelVectorizeUnroll(),
-    ms.postproc.RewriteReductionBlock(),
-    ms.postproc.RewriteTensorize(vectorize_init_loop=True),
-]
-
-POSTPROCS_FOR_DP4A = [
-    ms.postproc.DisallowDynamicLoop(),
-    ms.postproc.RewriteCooperativeFetch(),
-    ms.postproc.RewriteUnboundBlock(),
-    ms.postproc.RewriteParallelVectorizeUnroll(),
-    ms.postproc.RewriteReductionBlock(),
-    ms.postproc.RewriteTensorize(),
-    ms.postproc.VerifyGPUCode(),
-]
-
-
-def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, postprocs):
-    """Test tuning."""
-    tgt = "cuda" if "nvidia" in target else target
-    dev = tvm.device(tgt, 0)
-    ref = (
-        relay.create_executor("vm", mod=relay_mod, device=dev, target=tgt)
-        .evaluate()(*[data_np, weight_np])
-        .numpy()
-    )
-    params = {"weight": weight_np}
-    tune_tasks = list(
-        filter(
-            lambda task: op_name in task.task_name,
-            ms.relay_integration.extract_tasks(relay_mod, target, params),
-        )
-    )
-    with tempfile.TemporaryDirectory() as work_dir:
-        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
-            extracted_tasks=tune_tasks,
-            work_dir=work_dir,
-            space=ms.space_generator.PostOrderApply(
-                sch_rules=sch_rules,
-                postprocs=postprocs,
-            ),
-        )
-        database = ms.tune.tune_tasks(
-            tasks=tasks,
-            task_weights=task_weights,
-            work_dir=work_dir,
-            max_trials_global=32,
-        )
-    with database, tvm.transform.PassContext(
-        opt_level=3,
-        config={"relay.backend.use_meta_schedule": True},
-    ):
-        lib = relay.build(relay_mod, target=target, params=params)
-
-    if "cascadelake" in target:
-        asm = lib.lib.get_source("asm")
-        assert "vpdpbusd" in asm
-
-    if "skylake-avx512" in target:
-        asm = lib.lib.get_source("asm")
-        assert "pmaddubs" in asm
-        assert "pmaddw" in asm
-
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    runtime.set_input("data", data_np)
-    runtime.run()
-    out = runtime.get_output(0).numpy()
-    np.testing.assert_equal(out, ref)
-
-
-def _test_dense(data_dtype, sch_rules, postprocs, target):
-    dim_m, dim_n, dim_k = 1024, 1024, 1024
-    data_shape = (dim_m, dim_k)
-    weight_shape = (dim_n, dim_k)
-
-    weight_dtype = "int8"
-    out_dtype = "int32"
-
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=weight_shape, dtype=weight_dtype)
-    dense = relay.nn.dense(data, weight, out_dtype=out_dtype)
-
-    relay_mod = tvm.IRModule.from_expr(dense)
-
-    data_np = np.random.uniform(1, 10, size=data_shape).astype(data_dtype)
-    weight_np = np.random.uniform(1, 10, size=weight_shape).astype(weight_dtype)
-
-    tune_and_test(relay_mod, data_np, weight_np, "dense", target, sch_rules, postprocs)
-
-
-def _test_conv2d(data_dtype, sch_rules, postprocs, target):
-    d_shape = (1, 64, 56, 56)
-    w_shape = (64, 64, 3, 3)
-
-    weight_dtype = "int8"
-    out_dtype = "int32"
-
-    data = relay.var("data", shape=d_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=w_shape, dtype=weight_dtype)
-    out_channel = w_shape[0]
-    conv2d = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=w_shape[2:],
-        channels=out_channel,
-        padding=(1, 1),
-        strides=(1, 1),
-        out_dtype=out_dtype,
-    )
-
-    relay_mod = tvm.IRModule.from_expr(conv2d)
-
-    data_np = np.random.uniform(1, 10, d_shape).astype(data_dtype)
-    weight_np = np.random.uniform(1, 10, size=w_shape).astype("int8")
-
-    tune_and_test(relay_mod, data_np, weight_np, "conv2d", target, sch_rules, postprocs)
-
-
-def _test_bert_int8(relay_mod, params, input_info, target, sch_rules, postprocs):
-    relay_mod = relay.transform.FastMath()(relay_mod)
-    tune_tasks = [
-        task
-        for task in ms.relay_integration.extract_tasks(relay_mod, target, params)
-        if "dense" in task.task_name or "batch_matmul" in task.task_name
-    ]
-    with tempfile.TemporaryDirectory() as work_dir:
-        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
-            extracted_tasks=tune_tasks,
-            work_dir=work_dir,
-            space=ms.space_generator.PostOrderApply(
-                sch_rules=sch_rules,
-                postprocs=postprocs,
-            ),
-        )
-        database = ms.tune.tune_tasks(
-            tasks=tasks,
-            task_weights=task_weights,
-            work_dir=work_dir,
-            max_trials_per_task=32,
-            max_trials_global=20000,
-        )
-    with database, tvm.transform.PassContext(
-        opt_level=3,
-        config={"relay.backend.use_meta_schedule": True},
-    ):
-        lib = relay.build(relay_mod, target=target, params=params)
-
-    dev = tvm.device("cuda" if "nvidia" in target else target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    inputs = []
-    for name, shape in input_info:
-        arr = np.random.uniform(1, 10, size=shape).astype("int64")
-        runtime.set_input(name, arr)
-        inputs.append(arr)
-    print(runtime.benchmark(dev, number=1, repeat=50).mean)
-
-
-@tvm.testing.requires_x86_vnni
-def test_vnni_dense():
-    _test_dense("uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET)
-
-
-@tvm.testing.requires_x86_avx512
-def test_avx512_dense():
-    _test_dense("uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, SKYLAKE_AVX512_TARGET)
-
-
-@pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI")
-@tvm.testing.requires_gpu
-def test_dp4a_dense():
-    _test_dense("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070")
-    # Uncomment to test on vulkan or rocm target
-    # _test_dense(
-    #     "int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "vulkan -from_device=0"
-    # )
-    # _test_dense(
-    #     "int8", SCH_RULES_FOR_SDOT4, POSTPROCS_FOR_DP4A, "rocm"
-    # )
-
-
-@tvm.testing.requires_x86_vnni
-def test_vnni_conv2d():
-    _test_conv2d("uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET)
-
-
-@tvm.testing.requires_x86_avx512
-def test_avx512_conv2d():
-    _test_conv2d("uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, SKYLAKE_AVX512_TARGET)
-
-
-@pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI")
-@tvm.testing.requires_gpu
-def test_dp4a_conv2d():
-    _test_conv2d("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070")
-    # Uncomment to test on vulkan or rocm target
-    # _test_conv2d(
-    #     "int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "vulkan -from_device=0"
-    # )
-    # _test_conv2d(
-    #     "int8", SCH_RULES_FOR_SDOT4, POSTPROCS_FOR_DP4A, "rocm"
-    # )
-
-
-@tvm.testing.requires_x86_vnni
-@pytest.mark.skipif(tvm.testing.IS_IN_CI, reason="Slow on CI")
-def test_vnni_bert_int8():
-    pytest.importorskip("onnx")
-    relay_mod, params, input_info = load_quantized_bert_base()
-    _test_bert_int8(
-        relay_mod,
-        params,
-        input_info,
-        CASCADELAKE_VNNI_TARGET,
-        SCH_RULES_FOR_VNNI,
-        POSTPROCS_FOR_VNNI,
-    )
-
-
-@tvm.testing.requires_x86_avx512
-@pytest.mark.skip("Due to quantized BERT download issue")
-def test_avx512_bert_int8():
-    relay_mod, params, input_info = load_quantized_bert_base()
-    _test_bert_int8(
-        relay_mod,
-        params,
-        input_info,
-        SKYLAKE_AVX512_TARGET,
-        SCH_RULES_FOR_AVX512,
-        POSTPROCS_FOR_VNNI,
-    )
-
-
-@tvm.testing.requires_gpu
-@pytest.mark.skip("Slow on CI")
-def test_dp4a_bert_int8():
-    relay_mod, params, input_info = load_quantized_bert_base()
-    _test_bert_int8(
-        relay_mod,
-        params,
-        input_info,
-        "nvidia/geforce-rtx-3070",
-        SCH_RULES_FOR_DP4A,
-        POSTPROCS_FOR_DP4A,
-    )
-    # Uncomment to test on vulkan or rocm target
-    # _test_bert_int8(
-    #     relay_mod,
-    #     params,
-    #     input_info,
-    #     "vulkan -from_device=0",
-    #     SCH_RULES_FOR_DP4A,
-    #     POSTPROCS_FOR_DP4A,
-    # )
-    # _test_bert_int8(
-    #     relay_mod,
-    #     params,
-    #     input_info,
-    #     "rocm",
-    #     SCH_RULES_FOR_SDOT4
-    #     POSTPROCS_FOR_DP4A,
-    # )
-
-
-@tvm.testing.requires_gpu
-@pytest.mark.skip("Slow on CI")
-@pytest.mark.parametrize(
-    ["model_name", "input_shape"],
-    [("bert_base", (8, 128)), ("resnet_18", (16, 3, 224, 224)), ("resnet_50", (16, 3, 224, 224))],
-)
-def test_cuda_tensor_core(model_name, input_shape):
-    """Integration tests of auto tensorization with CUDA tensor core"""
-    target = tvm.target.Target("nvidia/geforce-rtx-3070")
-    dev = tvm.cuda()
-    if model_name.startswith("bert"):
-        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
-    else:
-        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
-    mod, params, (input_name, _, _) = relay_workload.get_network(model_name, input_shape)
-    seq = tvm.transform.Sequential(
-        [
-            relay.transform.ToMixedPrecision(),
-        ]
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-
-    def convert_layout(mod):
-        seq = tvm.transform.Sequential(
-            [relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]})]
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            mod = seq(mod)
-        return mod
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        with ms.Profiler() as profiler:
-            converted_mod = convert_layout(mod)
-            database = ms.relay_integration.tune_relay(
-                mod=converted_mod,
-                target=target,
-                work_dir=work_dir,
-                max_trials_global=3000,
-                params=params,
-            )
-            rt_mod1 = ms.relay_integration.compile_relay(
-                database=database,
-                mod=converted_mod,
-                target=target,
-                params=params,
-            )
-        print(profiler.table())
-
-        # Compile without MetaSchedule for correctness check
-        with tvm.transform.PassContext(opt_level=0):
-            rt_mod2 = relay.build(mod, target=target, params=params)
-
-        def get_output(data, lib):
-            module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-            module.set_input(input_name, data)
-            module.run()
-            return module.get_output(0).numpy()
-
-        # Check correctness
-        actual_output = get_output(data, rt_mod1)
-        expected_output = get_output(data, rt_mod2)
-        assert np.allclose(actual_output, expected_output, rtol=1e-2, atol=2e-2)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py
deleted file mode 100644
index 20e628c8c14b..000000000000
--- a/tests/python/integration/test_dot.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test scheduling and running a dot product."""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-
-
-@tvm.testing.requires_llvm
-def test_dot():
-    """Test dot product."""
-    arr_length = 12
-    arr_length_tvm = tvm.runtime.convert(arr_length)
-    placeholder_a = te.placeholder((arr_length_tvm,), name="A")
-    placeholder_b = te.placeholder((arr_length_tvm,), name="B")
-    reduce_axis_k = te.reduce_axis((0, arr_length_tvm), "k")
-    result_c = te.compute(
-        (),
-        lambda: te.sum(
-            placeholder_a[reduce_axis_k] * placeholder_b[reduce_axis_k], axis=reduce_axis_k
-        ),
-        name="C",
-    )
-    schedule = te.create_schedule(result_c.op)
-
-    def verify(target):
-        f = tvm.driver.build(schedule, [placeholder_a, placeholder_b, result_c], target)
-        # verify
-        dev = tvm.cpu(0)
-        buff_a = tvm.nd.array(
-            np.random.uniform(size=(arr_length,)).astype(placeholder_a.dtype), dev
-        )
-        buff_b = tvm.nd.array(
-            np.random.uniform(size=(arr_length,)).astype(placeholder_b.dtype), dev
-        )
-        buff_c = tvm.nd.array(np.zeros((), dtype=result_c.dtype), dev)
-        f(buff_a, buff_b, buff_c)
-        tvm.testing.assert_allclose(
-            buff_c.numpy(), np.dot(buff_a.numpy(), buff_b.numpy()), rtol=1e-4
-        )
-
-    verify("llvm")
-
-
-if __name__ == "__main__":
-    test_dot()
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
deleted file mode 100644
index d1d2b9902c60..000000000000
--- a/tests/python/integration/test_ewise.py
+++ /dev/null
@@ -1,356 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test elementwise integration."""
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.contrib import nvcc
-
-
-@tvm.testing.requires_gpu
-def test_exp():
-    """Test scheduling and running exponent."""
-    # graph
-    arr_length = 1024
-    arr_length_tvm = tvm.runtime.convert(arr_length)
-    placeholder_a = te.placeholder((arr_length_tvm,), name="A")
-    placeholder_b = te.compute(placeholder_a.shape, lambda *i: te.exp(placeholder_a(*i)), name="B")
-    schedule = te.create_schedule(placeholder_b.op)
-    # create iter var and assign them tags.
-    num_thread = 8
-    axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread)
-    schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x"))
-    schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x"))
-
-    # one line to build the function.
-    def check_device(device, host="stackvm"):
-        if not tvm.testing.device_enabled(host):
-            return
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-        fexp = tvm.build(schedule, [placeholder_a, placeholder_b], device, host, name="myexp")
-        dev = tvm.device(device, 0)
-        # launch the kernel.
-        buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev)
-        buff_b = tvm.nd.array(np.zeros(arr_length, dtype=placeholder_b.dtype), dev)
-        fexp(buff_a, buff_b)
-        tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5)
-
-    check_device("opencl -device=intel_graphics")
-    check_device("cuda", "llvm")
-    check_device("vulkan")
-
-
-@tvm.testing.requires_gpu
-def test_fmod():
-    """Test scheduling and running fmod."""
-
-    # graph
-    def run(dtype):
-        size_var_n = te.size_var("n")
-        placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype)
-        placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype)
-        result_c = te.compute(
-            placeholder_a.shape, lambda *i: te.fmod(placeholder_a(*i), placeholder_b(*i)), name="C"
-        )
-        schedule = te.create_schedule(result_c.op)
-        # create iter var and assign them tags.
-        num_thread = 8
-        axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread)
-
-        def check_device(device):
-            dev = tvm.device(device, 0)
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled.." % device)
-                return
-            target = tvm.target.Target(device)
-            if "cpu" not in target.keys:
-                schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x"))
-                schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x"))
-            fmod = tvm.build(
-                schedule, [placeholder_a, placeholder_b, result_c], device, name="myfmod"
-            )
-
-            # launch the kernel.
-            value_n = 1024
-            a_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_a.dtype)
-            b_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_b.dtype)
-
-            # "fix" the values in a and b to avoid the result being too small
-            b_np += (b_np < 2.0) * 2
-            a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1
-
-            buff_a = tvm.nd.array(a_np, dev)
-            buff_b = tvm.nd.array(b_np, dev)
-            buff_c = tvm.nd.array(np.zeros(value_n, dtype=result_c.dtype), dev)
-            ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1)
-            _ = ftimer(buff_a, buff_b, buff_c).mean
-            np.testing.assert_allclose(
-                buff_c.numpy(), np.mod(buff_a.numpy(), buff_b.numpy()), rtol=1e-5
-            )
-
-        check_device("cuda")
-        check_device("opencl -device=intel_graphics")
-        check_device("metal")
-
-    run("float32")
-
-
-@tvm.testing.requires_gpu
-def test_multiple_cache_write():
-    """Test multiple cache writes."""
-    # graph
-    arr_length = 1024
-    arr_length_tvm = tvm.runtime.convert(arr_length)
-    placeholder_a0 = te.placeholder((arr_length_tvm,), name="A0", dtype="float32")
-    placeholder_a1 = te.placeholder((arr_length_tvm,), name="A1", dtype="float32")
-    result_b0, result_b1 = te.compute(
-        (arr_length_tvm,),
-        lambda *i: (
-            placeholder_a0(*i) + placeholder_a1(*i),
-            placeholder_a0(*i) * placeholder_a1(*i),
-        ),
-        name="B",
-    )
-    result_c = te.compute((arr_length_tvm,), lambda *i: result_b0(*i) + result_b1(*i), name="C")
-    schedule = te.create_schedule(result_c.op)
-    # create iter var and assign them tags.
-    num_thread = 8
-    cache_b0, _ = schedule.cache_write([result_b0, result_b1], "local")
-    axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread)
-    schedule[result_b0].compute_at(schedule[result_c], axis0)
-    schedule[cache_b0].compute_at(schedule[result_c], axis0)
-    schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x"))
-    schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x"))
-
-    # one line to build the function.
-    def check_device(device, host="stackvm"):
-        if not tvm.testing.device_enabled(host):
-            return
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            return
-        func = tvm.build(
-            schedule,
-            [placeholder_a0, placeholder_a1, result_c],
-            device,
-            host,
-            name="multiple_cache_write",
-        )
-        dev = tvm.device(device, 0)
-        # launch the kernel.
-        buff_a0 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a0.dtype), dev)
-        buff_a1 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a1.dtype), dev)
-        buff_c = tvm.nd.array(np.zeros(arr_length, dtype=result_c.dtype), dev)
-        func(buff_a0, buff_a1, buff_c)
-        tvm.testing.assert_allclose(
-            buff_c.numpy(),
-            buff_a0.numpy() + buff_a1.numpy() + (buff_a0.numpy() * buff_a1.numpy()),
-            rtol=1e-5,
-        )
-
-    check_device("cuda", "llvm")
-    check_device("vulkan")
-    check_device("opencl")
-
-
-def test_log_pow_llvm():
-    """Test log pow using llvm to lower."""
-    # graph
-    size_var_n = te.size_var("n")
-    placeholder_a = te.placeholder((size_var_n,), name="A")
-    result_b = te.compute(
-        placeholder_a.shape, lambda *i: te.power(te.log(placeholder_a(*i)), 2.0), name="B"
-    )
-    schedule = te.create_schedule(result_b.op)
-    # create iter var and assign them tags.
-    schedule[result_b].split(result_b.op.axis[0], factor=32)
-    # one line to build the function.
-    if not tvm.testing.device_enabled("llvm"):
-        return
-
-    flog = tvm.build(schedule, [placeholder_a, result_b], "llvm", name="mylog")
-    dev = tvm.cpu(0)
-    # launch the kernel.
-    size_var_n = 1028
-    buff_a = tvm.nd.array(np.random.uniform(size=size_var_n).astype(placeholder_a.dtype), dev)
-    buff_b = tvm.nd.array(np.zeros(size_var_n, dtype=result_b.dtype), dev)
-    repeat = 10
-    ftimer = flog.time_evaluator(flog.entry_name, dev, number=1, repeat=repeat)
-    res = ftimer(buff_a, buff_b)
-    assert len(res.results) == repeat
-    tvm.testing.assert_allclose(buff_b.numpy(), np.power(np.log(buff_a.numpy()), 2.0), rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_popcount():
-    """Test popcount."""
-
-    def run(dtype):
-        # graph
-        arr_length = 1024
-        arr_length_tvm = tvm.runtime.convert(1024)
-        placeholder_a = te.placeholder((arr_length_tvm,), name="A", dtype=dtype)
-        placeholder_b = te.compute(
-            placeholder_a.shape, lambda *i: tvm.tir.popcount(placeholder_a(*i)), name="B"
-        )
-        schedule = te.create_schedule(placeholder_b.op)
-        # simple schedule
-        num_thread = 8
-        axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread)
-
-        def check_device(device):
-            dev = tvm.device(device, 0)
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled.." % device)
-                return
-            target = tvm.target.Target(device)
-            if "cpu" not in target.keys:
-                schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x"))
-                schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x"))
-            func = tvm.build(schedule, [placeholder_a, placeholder_b], device)
-            # launch the kernel.
-            buff_a = tvm.nd.array(
-                np.random.randint(low=0, high=1000, size=arr_length, dtype=placeholder_a.dtype), dev
-            )
-            buff_b = tvm.nd.array(np.zeros(shape=arr_length, dtype=placeholder_b.dtype), dev)
-            func(buff_a, buff_b)
-            tvm.testing.assert_allclose(
-                buff_b.numpy(), list(map(lambda x: bin(x).count("1"), buff_a.numpy())), rtol=1e-5
-            )
-
-        check_device("llvm")
-        check_device("cuda")
-        check_device("opencl")
-        if dtype == "uint32":
-            check_device("metal")
-            check_device("vulkan")
-
-    run("uint32")
-    run("uint64")
-
-
-@tvm.testing.requires_gpu
-def test_add():
-    """Test addition."""
-
-    def run(dtype):
-        # graph
-        size_var_n = te.size_var("n")
-        placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype)
-        placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype)
-        result_c = te.compute(
-            placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C"
-        )
-        # schedule
-        schedule = te.create_schedule(result_c.op)
-        # create iter var and assign them tags.
-        num_thread = 16
-        axis_bx, axis_x = schedule[result_c].split(result_c.op.axis[0], factor=num_thread * 4)
-        axis_tx, axis_x = schedule[result_c].split(axis_x, nparts=num_thread)
-        _, axis_x = schedule[result_c].split(axis_x, factor=4)
-        schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x"))
-        schedule[result_c].bind(axis_tx, te.thread_axis("threadIdx.x"))
-        schedule[result_c].vectorize(axis_x)
-
-        # one line to build the function.
-        def check_device(device):
-            dev = tvm.device(device, 0)
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled.." % device)
-                return
-            fadd = tvm.build(
-                schedule, [placeholder_a, placeholder_b, result_c], device, name="myadd"
-            )
-
-            # launch the kernel.
-            n = 1024
-            buff_a = tvm.nd.array(
-                (np.random.uniform(size=n) * 256).astype(placeholder_a.dtype), dev
-            )
-            buff_b = tvm.nd.array(
-                (np.random.uniform(size=n) * 256).astype(placeholder_b.dtype), dev
-            )
-            buff_c = tvm.nd.array(np.zeros(n, dtype=result_c.dtype), dev)
-            ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1)
-            _ = ftimer(buff_a, buff_b, buff_c).mean
-            tvm.testing.assert_allclose(buff_c.numpy(), buff_a.numpy() + buff_b.numpy(), rtol=1e-6)
-
-        check_device("opencl")
-        check_device("cuda")
-        if dtype == "float32":
-            check_device("metal")
-            check_device("vulkan")
-
-    run("float32")
-    run("int32")
-    run("int64")
-    run("uint64")
-
-
-@tvm.testing.requires_gpu
-def try_warp_memory():
-    """Test using warp memory
-    skip this in default test because it require higher arch"""
-    arr_size = 128
-    placeholder_a = te.placeholder((arr_size,), name="A")
-    result_b = te.compute((arr_size,), lambda i: placeholder_a[i] + 3, name="B")
-    warp_size = 32
-    schedule = te.create_schedule(result_b.op)
-    cache_read_aa = schedule.cache_read(placeholder_a, "warp", [result_b])
-    axis_x0, axis_xi = schedule[result_b].split(result_b.op.axis[0], warp_size * 2)
-    _, axis_xi1 = schedule[result_b].split(axis_xi, factor=warp_size)
-    thread_axis_tx = te.thread_axis("threadIdx.x")
-    schedule[result_b].bind(axis_xi1, thread_axis_tx)
-    schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x"))
-    schedule[cache_read_aa].compute_at(schedule[result_b], axis_x0)
-    axis_x0, axis_xi = schedule[cache_read_aa].split(schedule[cache_read_aa].op.axis[0], warp_size)
-    schedule[cache_read_aa].bind(axis_xi, thread_axis_tx)
-
-    @tvm.register_func("tvm_callback_cuda_compile", override=True)
-    def tvm_callback_cuda_compile(code, _):  # pylint: disable=unused-variable
-        ptx = nvcc.compile_cuda(code)
-        return ptx
-
-    # one line to build the function.
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-        myfunc = tvm.build(schedule, [placeholder_a, result_b], device)
-        buff_a = tvm.nd.array(
-            (np.random.uniform(size=arr_size) * 256).astype(placeholder_a.dtype), dev
-        )
-        buff_b = tvm.nd.array(np.zeros(arr_size, dtype=result_b.dtype), dev)
-        myfunc(buff_a, buff_b)
-        tvm.testing.assert_allclose(buff_b.numpy(), buff_a.numpy() + 3, rtol=1e-6)
-
-    check_device("cuda")
-
-
-if __name__ == "__main__":
-    test_exp()
-    try_warp_memory()
-    test_multiple_cache_write()
-    test_add()
-    test_log_pow_llvm()
-    test_popcount()
-    test_fmod()
diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py
deleted file mode 100644
index cae6364c2248..000000000000
--- a/tests/python/integration/test_ewise_fpga.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test elementwise ops on fpga."""
-import os
-
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-
-os.environ["XCL_EMULATION_MODE"] = "1"
-os.environ["CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA"] = "1"
-
-
-@tvm.register_func
-def tvm_callback_vhls_postproc(code, _):
-    """Hook to inspect the Vivado HLS code before actually run it"""
-    print(code)
-    return code
-
-
-def test_exp():
-    """Test scheduling and running exp function."""
-    # graph
-    arr_length = 1024
-    arr_length_tvm = tvm.runtime.convert(arr_length)
-    placeholder_b = te.placeholder((arr_length_tvm,), name="A")
-    result_b = te.compute(placeholder_b.shape, lambda *i: te.exp(placeholder_b(*i)), name="B")
-    schedule = te.create_schedule(result_b.op)
-    # create iter var and assign them tags.
-    axis1, _ = schedule[result_b].split(result_b.op.axis[0], nparts=1)
-    schedule[result_b].bind(axis1, te.thread_axis("pipeline"))
-
-    # one line to build the function.
-    def check_device(device, host="llvm"):
-        if not tvm.testing.device_enabled(device):
-            return
-        dev = tvm.device(device, 0)
-        fexp = tvm.build(schedule, [placeholder_b, result_b], device, host, name="myexp")
-        dev = tvm.device(device, 0)
-        # launch the kernel.
-        buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev)
-        buff_b = tvm.nd.array(np.zeros(arr_length, dtype=result_b.dtype), dev)
-        fexp(buff_a, buff_b)
-        tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5)
-
-    check_device("sdaccel")
-    if "AWS_PLATFORM" in os.environ:
-        check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))
-
-    check_device("aocl_sw_emu")
-
-
-def test_multi_kernel():
-    """Test scheduling with multiple computes."""
-    # graph
-    arr_length = 1024
-    arr_length_tvm = tvm.runtime.convert(arr_length)
-    placeholder_a = te.placeholder((arr_length_tvm,), name="A")
-    placeholder_b = te.placeholder((arr_length_tvm,), name="B")
-    result_c = te.compute(
-        placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C"
-    )
-    result_d = te.compute(
-        placeholder_a.shape, lambda *i: placeholder_a(*i) + result_c(*i), name="D"
-    )
-    schedule = te.create_schedule(result_d.op)
-    # create iter var and assign them tags.
-    axis1, _ = schedule[result_c].split(result_c.op.axis[0], nparts=1)
-    schedule[result_c].bind(axis1, te.thread_axis("pipeline"))
-    axis1, _ = schedule[result_d].split(result_d.op.axis[0], nparts=1)
-    schedule[result_d].bind(axis1, te.thread_axis("pipeline"))
-
-    # one line to build the function.
-    def check_device(device, host="llvm"):
-        if not tvm.testing.device_enabled(device):
-            return
-        dev = tvm.device(device, 0)
-        fadd = tvm.build(
-            schedule, [placeholder_a, placeholder_b, result_c, result_d], device, host, name="myadd"
-        )
-        dev = tvm.device(device, 0)
-        # launch the kernel.
-        buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev)
-        buff_b = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev)
-        buff_c = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_c.dtype), dev)
-        buff_d = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_d.dtype), dev)
-        fadd(buff_a, buff_b, buff_c, buff_d)
-        tvm.testing.assert_allclose(buff_d.numpy(), buff_a.numpy() * 2 + buff_b.numpy(), rtol=1e-5)
-
-    check_device("sdaccel")
-    check_device("aocl_sw_emu")
-
-
-if __name__ == "__main__":
-    test_exp()
-    test_multi_kernel()
diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py
deleted file mode 100644
index 66d777989d8c..000000000000
--- a/tests/python/integration/test_gemm.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test scheduling and running a gemm!"""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-
-
-@tvm.testing.requires_gpu
-def test_gemm():
-    """Test the gemm!"""
-    # graph
-    dim1_length = 1024
-    dim_n = tvm.runtime.convert(dim1_length)
-    dim_m = dim_n
-    dim_l = dim_n
-    placeholder_a = te.placeholder((dim_n, dim_l), name="A")
-    placeholder_b = te.placeholder((dim_m, dim_l), name="B")
-    axis_k = te.reduce_axis((0, dim_l), name="k")
-    result_c = te.compute(
-        (dim_n, dim_m),
-        lambda ii, jj: te.sum(placeholder_a[ii, axis_k] * placeholder_b[jj, axis_k], axis=axis_k),
-        name="CC",
-    )
-    # schedule
-    schedule = te.create_schedule(result_c.op)
-    scale = 8
-    num_thread = 8
-    block_factor = scale * num_thread
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis("threadIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    thread_y = te.thread_axis("threadIdx.y")
-
-    cache_write = schedule.cache_write(result_c, "local")
-    cache_read_a = schedule.cache_read(placeholder_a, "shared", [cache_write])
-    cache_read_b = schedule.cache_read(placeholder_b, "shared", [cache_write])
-    axis_by, axis_yi = schedule[result_c].split(result_c.op.axis[0], factor=block_factor)
-    axis_bx, axis_xi = schedule[result_c].split(result_c.op.axis[1], factor=block_factor)
-    schedule[result_c].reorder(axis_by, axis_bx, axis_yi, axis_xi)
-    schedule[result_c].bind(axis_by, block_y)
-    schedule[result_c].bind(axis_bx, block_x)
-    axis_ty, axis_yi = schedule[result_c].split(axis_yi, nparts=num_thread)
-    axis_tx, axis_xi = schedule[result_c].split(axis_xi, nparts=num_thread)
-    schedule[result_c].reorder(axis_ty, axis_tx, axis_yi, axis_xi)
-    schedule[result_c].bind(axis_ty, thread_y)
-    schedule[result_c].bind(axis_tx, thread_x)
-    axis_yo, axis_xo = cache_write.op.axis
-    schedule[cache_write].reorder(axis_k, axis_yo, axis_xo)
-
-    schedule[cache_write].compute_at(schedule[result_c], axis_tx)
-    schedule[cache_read_a].compute_at(schedule[cache_write], axis_k)
-    schedule[cache_read_b].compute_at(schedule[cache_write], axis_k)
-    schedule[cache_read_a].double_buffer()
-    schedule[cache_read_b].double_buffer()
-    axis_ty, axis_xi = schedule[cache_read_a].split(
-        schedule[cache_read_a].op.axis[0], nparts=num_thread
-    )
-    axis_tx, axis_xi = schedule[cache_read_a].split(axis_xi, nparts=num_thread)
-    schedule[cache_read_a].bind(axis_ty, thread_y)
-    schedule[cache_read_a].bind(axis_tx, thread_x)
-
-    axis_ty, axis_xi = schedule[cache_read_b].split(
-        schedule[cache_read_b].op.axis[0], nparts=num_thread
-    )
-    axis_tx, axis_xi = schedule[cache_read_b].split(axis_xi, nparts=num_thread)
-    schedule[cache_read_b].bind(axis_ty, thread_y)
-    schedule[cache_read_b].bind(axis_tx, thread_x)
-
-    # lowering test
-    schedule = schedule.normalize()
-
-    # one line to build the function.
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-
-        with tvm.target.Target(device):
-            f = tvm.build(schedule, [placeholder_a, placeholder_b, result_c])
-
-        # launch the kernel.
-        num_n = dim1_length
-        num_m = num_n
-        num_l = num_n
-        a_np = np.random.uniform(size=(num_n, num_l)).astype(placeholder_a.dtype)
-        b_np = np.random.uniform(size=(num_m, num_l)).astype(placeholder_b.dtype)
-        buff_a = tvm.nd.array(a_np, dev)
-        buff_b = tvm.nd.array(b_np, dev)
-        buff_c = tvm.nd.array(np.zeros((num_n, num_m), dtype=result_c.dtype), dev)
-        ftimer = f.time_evaluator(f.entry_name, dev, number=1)
-        tcost = ftimer(buff_a, buff_b, buff_c).mean
-        print("%s: exec=%g sec/op" % (dev, tcost))
-        tvm.testing.assert_allclose(buff_c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5)
-
-    check_device("vulkan")
-    check_device("nvptx -mcpu=sm_20")
-    check_device("rocm")
-    check_device("metal")
-    check_device("opencl")
-    check_device("cuda")
-
-
-if __name__ == "__main__":
-    test_gemm()
diff --git a/tests/python/integration/test_legacy_tuning.py b/tests/python/integration/test_legacy_tuning.py
deleted file mode 100644
index 41f7b99996bb..000000000000
--- a/tests/python/integration/test_legacy_tuning.py
+++ /dev/null
@@ -1,380 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Test the tuner
-"""
-import logging
-import multiprocessing as mp
-import textwrap
-
-import tvm
-import tvm.relay
-import tvm.testing
-from tvm import autotvm, te
-from tvm.autotvm.measure import measure_methods
-from tvm.autotvm.tuner import RandomTuner
-from tvm.contrib import tar
-from tvm.ir.instrument import pass_instrument
-from tvm.ir.transform import PassContext
-from tvm.target import Target
-from tvm.tir.analysis import _ffi_api as _analysis_ffi_api
-
-
-def setup_module():
-    """Setup the module used for testing."""
-
-    @autotvm.template("testing/conv2d_no_batching")
-    def conv2d_no_batching(  # pylint: disable=unused-variable
-        batch_size, input_h, input_w, channels_in, channels_out, kernel_h, kernel_w
-    ):
-        """An example template for testing"""
-        assert batch_size == 1, "Only consider batch_size = 1 in this template"
-
-        data = te.placeholder((batch_size, channels_in, input_h, input_w), name="data")
-        kernel = te.placeholder((channels_out, channels_in, kernel_h, kernel_w), name="kernel")
-
-        axis_rc = te.reduce_axis((0, channels_in), name="rc")
-        axis_ry = te.reduce_axis((0, kernel_h), name="ry")
-        axis_rx = te.reduce_axis((0, kernel_w), name="rx")
-
-        conv = te.compute(
-            (batch_size, channels_out, input_h - kernel_h + 1, input_w - kernel_w + 1),
-            lambda nn, ff, yy, xx: te.sum(
-                data[nn, axis_rc, yy + axis_ry, xx + axis_rx]
-                * kernel[ff, axis_rc, axis_ry, axis_rx],
-                axis=[axis_rc, axis_ry, axis_rx],
-            ),
-            tag="conv2d_nchw",
-        )
-
-        schedule = te.create_schedule([conv.op])
-
-        output = conv
-        cache_write_ol = schedule.cache_write(conv, "local")
-
-        # create cache stage
-        cache_read_aa = schedule.cache_read(data, "shared", [cache_write_ol])
-        cache_read_ww = schedule.cache_read(kernel, "shared", [cache_write_ol])
-        cache_read_al = schedule.cache_read(cache_read_aa, "local", [cache_write_ol])
-        cache_read_wl = schedule.cache_read(cache_read_ww, "local", [cache_write_ol])
-
-        # tile and bind spatial axes
-        axis_n, axis_f, axis_y, axis_x = schedule[output].op.axis
-        cfg = autotvm.get_config()
-        cfg.define_split("tile_f", cfg.axis(axis_f), num_outputs=4)
-        cfg.define_split("tile_y", cfg.axis(axis_y), num_outputs=4)
-        cfg.define_split("tile_x", cfg.axis(axis_x), num_outputs=4)
-        axis_bf, axis_vf, axis_tf, axis_fi = cfg["tile_f"].apply(schedule, output, axis_f)
-        axis_by, axis_vy, axis_ty, axis_yi = cfg["tile_y"].apply(schedule, output, axis_y)
-        axis_bx, axis_vx, axis_tx, axis_xi = cfg["tile_x"].apply(schedule, output, axis_x)
-        kernel_scope = axis_n  # this is the scope to attach global config inside this kernel
-
-        schedule[output].bind(axis_bf, te.thread_axis("blockIdx.z"))
-        schedule[output].bind(axis_by, te.thread_axis("blockIdx.y"))
-        schedule[output].bind(axis_bx, te.thread_axis("blockIdx.x"))
-        schedule[output].bind(axis_vf, te.thread_axis("vthread"))
-        schedule[output].bind(axis_vy, te.thread_axis("vthread"))
-        schedule[output].bind(axis_vx, te.thread_axis("vthread"))
-        schedule[output].bind(axis_tf, te.thread_axis("threadIdx.z"))
-        schedule[output].bind(axis_ty, te.thread_axis("threadIdx.y"))
-        schedule[output].bind(axis_tx, te.thread_axis("threadIdx.x"))
-        schedule[output].reorder(
-            axis_n,
-            axis_bf,
-            axis_by,
-            axis_bx,
-            axis_vf,
-            axis_vy,
-            axis_vx,
-            axis_tf,
-            axis_ty,
-            axis_tx,
-            axis_fi,
-            axis_yi,
-            axis_xi,
-        )
-        schedule[cache_write_ol].compute_at(schedule[output], axis_tx)
-
-        # tile and bind reduction axes
-        axis_n, axis_f, axis_y, axis_x = schedule[cache_write_ol].op.axis
-        axis_rc, axis_ry, axis_rx = schedule[cache_write_ol].op.reduce_axis
-        cfg.define_split("tile_rc", cfg.axis(axis_rc), num_outputs=3)
-        cfg.define_split("tile_ry", cfg.axis(axis_ry), num_outputs=3)
-        cfg.define_split("tile_rx", cfg.axis(axis_rx), num_outputs=3)
-        axis_rco, axis_rcm, axis_rci = cfg["tile_rc"].apply(schedule, cache_write_ol, axis_rc)
-        axis_ryo, axis_rym, axis_ryi = cfg["tile_rx"].apply(schedule, cache_write_ol, axis_ry)
-        axis_rxo, axis_rxm, axis_rxi = cfg["tile_ry"].apply(schedule, cache_write_ol, axis_rx)
-        schedule[cache_write_ol].reorder(
-            axis_rco,
-            axis_ryo,
-            axis_rxo,
-            axis_rcm,
-            axis_rym,
-            axis_rxm,
-            axis_rci,
-            axis_ryi,
-            axis_rxi,
-            axis_n,
-            axis_f,
-            axis_y,
-            axis_x,
-        )
-
-        schedule[cache_read_aa].compute_at(schedule[cache_write_ol], axis_rxo)
-        schedule[cache_read_ww].compute_at(schedule[cache_write_ol], axis_rxo)
-        schedule[cache_read_al].compute_at(schedule[cache_write_ol], axis_rxm)
-        schedule[cache_read_wl].compute_at(schedule[cache_write_ol], axis_rxm)
-
-        # cooperative fetching
-        for load in [cache_read_aa, cache_read_ww]:
-            axis_n, axis_f, axis_y, axis_x = schedule[load].op.axis
-            fused = schedule[load].fuse(axis_n, axis_f, axis_y, axis_x)
-            axis_tz, fused = schedule[load].split(fused, nparts=cfg["tile_f"].size[2])
-            axis_ty, fused = schedule[load].split(fused, nparts=cfg["tile_y"].size[2])
-            axis_tx, fused = schedule[load].split(fused, nparts=cfg["tile_x"].size[2])
-            schedule[load].bind(axis_tz, te.thread_axis("threadIdx.z"))
-            schedule[load].bind(axis_ty, te.thread_axis("threadIdx.y"))
-            schedule[load].bind(axis_tx, te.thread_axis("threadIdx.x"))
-
-        # tune unroll
-        cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-        cfg.define_knob("unroll_explicit", [0, 1])
-        schedule[output].pragma(
-            kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val
-        )
-        schedule[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-        return schedule, [data, kernel, conv]
-
-
-def teardown_module():
-    """Remove the module from the autotvm task tables."""
-    # TODO(areusch): Tasks should not be registered into a global.
-    del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"]
-
-
-def get_sample_task(target=tvm.target.cuda(), target_host=None):
-    """return a sample task for testing"""
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    task = autotvm.task.create(
-        "testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target
-    )
-    return task, target
-
-
-def run_test_with_all_multiprocessing(func, *args, **kwargs):
-    """Check all multiprocessing methods work for the tuning test.
-
-    In the past fork() had the most support at detriment to spawn() and forkserver().
-    As fork() is unavailable or unsafe on some platforms it is good to check all
-    available methods.
-    """
-    for multiprocessing_method in mp.get_all_start_methods():
-        old_start_method = mp.get_start_method()
-        try:
-            mp.set_start_method(multiprocessing_method, force=True)
-            func(*args, **kwargs)
-        finally:
-            mp.set_start_method(old_start_method, force=True)
-
-
-@tvm.testing.parametrize_targets("cuda", "opencl")
-def test_tuning_gpu(target):
-    """Test gpu tuning."""
-
-    def runner(target):
-        # init task
-        task, target = get_sample_task(target, None)
-        logging.info("task config space: %s", task.config_space)
-
-        measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
-
-        results = []
-
-        tuner = RandomTuner(task)
-        tuner.tune(
-            n_trial=20,
-            measure_option=measure_option,
-            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
-        )
-
-        assert len(results) == 20
-
-        successful_results = [
-            r
-            for r in results
-            if r.error_no == autotvm.MeasureErrorNo.NO_ERROR
-            # We filter records before building if we know they won't work ahead of time.
-            # We can't guarantee we get one good record so we count these as success too
-            or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR
-        ]
-        assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
-
-    run_test_with_all_multiprocessing(runner, target)
-
-
-@tvm.testing.parametrize_targets("cuda", "opencl")
-def test_tuning_gpu_inherits_pass_context(target):
-    """Autotvm tuner inherits PassContexts but also adds a gpu verification pass by default.
-
-    Test that using PassContext inherits passes properly but also runs gpu verification pass.
-    """
-
-    @pass_instrument
-    class PassInstrumentChecker:
-        """Pass Instrument that simply sees if it's been run."""
-
-        def __init__(self):
-            self.has_been_run = False
-
-        def run_after_pass(self, *_):
-            self.has_been_run = True
-
-    class GPUVerifyPassMocked:
-        """Context manager that mocks tir.analysis.verify_gpu_code meant
-        to verify the pass has been run. This is done by patching the ffi func handles."""
-
-        FFI_FUNC_HANDLE = "tir.analysis.verify_gpu_code"
-        FUNC_NAME = "verify_gpu_code"
-
-        def __init__(self) -> None:
-            self.old_impl = tvm._ffi.get_global_func(self.FFI_FUNC_HANDLE)
-            self.has_been_run = False
-
-        def gpu_verify_pass_mocked(self):
-            """Get the replacement for the gpu verification pass."""
-
-            def _gpu_verify_pass_mocked(*args, **kwargs):
-                self.has_been_run = True
-                return self.old_impl(*args, **kwargs)
-
-            return _gpu_verify_pass_mocked
-
-        def __enter__(self):
-            tvm._ffi.register_func(
-                self.FFI_FUNC_HANDLE, self.gpu_verify_pass_mocked(), override=True
-            )
-
-            # Also overwrite the python bindings
-            setattr(
-                _analysis_ffi_api, self.FUNC_NAME, tvm._ffi.get_global_func(self.FFI_FUNC_HANDLE)
-            )
-
-        def __exit__(self, *args, **kwargs):
-            # Restore FFI status back to normal
-            tvm._ffi.register_func(self.FFI_FUNC_HANDLE, self.old_impl, override=True)
-            setattr(_analysis_ffi_api, self.FUNC_NAME, self.old_impl)
-
-    class OverwrittenBuildFunc(measure_methods._WrappedBuildFunc):
-        """BuildFunc that mocks and patches as necessary to test proper passes are run."""
-
-        def __call__(self, measure_input, tmp_dir, **kwargs):
-            instrument = PassInstrumentChecker()
-            mocked_pass_checker = GPUVerifyPassMocked()
-            with mocked_pass_checker:
-                with PassContext(instruments=[instrument]):
-                    regular_result = super().__call__(measure_input, tmp_dir, **kwargs)
-
-                    # Check instrument has been run, meaning context was inherited by builder
-                    assert instrument.has_been_run
-
-                    # But also check the gpu verification pass has been run
-                    # (which was not in the inherited ctx)
-                    assert mocked_pass_checker.has_been_run
-
-                    return regular_result
-
-    class MockedLocalBuilder(measure_methods.LocalBuilder):
-        """As measure_methods.LocalBuilder but overwrites the PassContext for testing."""
-
-        def __init__(
-            self,
-            timeout=10,
-            n_parallel=None,
-            build_kwargs=None,
-            build_func="default",
-            do_fork=False,
-            runtime=None,
-        ):
-            # pylint: disable=too-many-function-args
-            super().__init__(timeout, n_parallel, build_kwargs, build_func, do_fork, runtime)
-
-            self.build_func = OverwrittenBuildFunc(tar.tar, runtime)
-
-    def runner(target):
-        task, target = get_sample_task(target, None)
-        logging.info("task config space: %s", task.config_space)
-
-        # Note: we use the MockedLocalBuilder here instead of autotvm.LocalBuilder()
-        measure_option = autotvm.measure_option(MockedLocalBuilder(), autotvm.LocalRunner())
-
-        results = []
-
-        tuner = RandomTuner(task)
-        tuner.tune(
-            n_trial=1,
-            measure_option=measure_option,
-            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
-        )
-
-        assert len(results) == 1
-
-    run_test_with_all_multiprocessing(runner, target)
-
-
-def test_tuning_cpu():
-    """Test tuning on cpu."""
-
-    def runner():
-        ir_mod = tvm.relay.fromtext(
-            textwrap.dedent(
-                """
-            #[version = "0.0.5"]
-            def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float32]) {
-                nn.conv2d(%a, %b, data_layout="NCHW", kernel_layout="OIHW")
-            }
-            """
-            )
-        )
-        tasks = autotvm.task.relay_integration.extract_from_program(
-            ir_mod, {}, tvm.target.create("llvm")
-        )
-        assert len(tasks) >= 1, f"Extracted no tasks from program: {tasks!r}"
-
-        task = tasks[0]
-
-        measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
-
-        results = []
-
-        tuner = RandomTuner(task)
-        tuner.tune(
-            n_trial=20,
-            measure_option=measure_option,
-            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
-        )
-
-        assert len(results) == 20
-
-        successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR]
-        assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
-
-    run_test_with_all_multiprocessing(runner)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/integration/test_lower.py b/tests/python/integration/test_lower.py
deleted file mode 100644
index 1d042610ac07..000000000000
--- a/tests/python/integration/test_lower.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test workload for lowering and build."""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm.script import tir as T
-
-
-# complains that index_i is defined outside of a block
-@T.prim_func(check_well_formed=False)
-def tensorcore_gemm(handle_a: T.handle, handle_b: T.handle, handle_c: T.handle) -> None:
-    # pylint: disable=missing-function-docstring
-    # match buffer
-    match_buffer_a = T.match_buffer(handle_a, [1024, 1024], "float16")
-    match_buffer_b = T.match_buffer(handle_b, [1024, 1024], "float16")
-    match_buffer_c = T.match_buffer(handle_c, [1024, 1024], "float32")
-
-    # body
-    for block_idx_x in T.thread_binding(0, 16, "blockIdx.x"):
-        for block_idx_y in T.thread_binding(0, 8, "blockIdx.y"):
-            with T.block():
-                axis_bx, axis_by = T.axis.remap("SS", [block_idx_x, block_idx_y])
-                shared_a = T.alloc_buffer([1024, 1024], "float16", scope="shared")
-                shared_b = T.alloc_buffer([1024, 1024], "float16", scope="shared")
-                wmma_a = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a")
-                wmma_b = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b")
-                wmma_c = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator")
-
-                # pylint: disable=too-many-nested-blocks
-                for thread_ty in T.thread_binding(0, 2, "threadIdx.y"):
-                    for thread_tz in T.thread_binding(0, 2, "threadIdx.z"):
-                        for index_i, index_jj in T.grid(2, 4):
-                            with T.block():
-                                new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i)
-                                new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj)
-                                T.reads([])
-                                T.writes(
-                                    wmma_c[
-                                        new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                        new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                    ]
-                                )
-                                match_buffer_c0 = T.match_buffer(
-                                    wmma_c[
-                                        new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                        new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                    ],
-                                    (16, 16),
-                                    "float32",
-                                    strides=[16 * 4, 1],
-                                    scope="wmma.accumulator",
-                                    offset_factor=1,
-                                )
-                                T.evaluate(
-                                    T.tvm_fill_fragment(
-                                        match_buffer_c0.data,
-                                        16,
-                                        16,
-                                        16,
-                                        index_i * 4 + index_jj,
-                                        T.float32(0),  # pylint: disable=not-callable
-                                        dtype="handle",
-                                    )
-                                )
-
-                        for k_o in range(0, 32):
-                            # copy data from global to shared
-                            for thread_tx in T.thread_binding(0, 32, "threadIdx.x"):
-                                for index_i0, index_j0 in T.grid(1, 4):
-                                    for index_j1 in T.vectorized(0, 4):
-                                        with T.block():
-                                            new_axis_vi = T.axis.S(
-                                                1024,
-                                                axis_bx * 64
-                                                + thread_ty * 32
-                                                + thread_tx
-                                                + index_i0,
-                                            )
-                                            new_axis_vj = T.axis.S(
-                                                1024,
-                                                k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1,
-                                            )
-                                            shared_a[new_axis_vi, new_axis_vj + 8] = match_buffer_a[
-                                                new_axis_vi, new_axis_vj
-                                            ]
-
-                                for index_i0, index_j0 in T.grid(2, 4):
-                                    for index_j1 in T.vectorized(0, 4):
-                                        with T.block():
-                                            new_axis_vi = T.axis.S(
-                                                1024,
-                                                axis_by * 128
-                                                + thread_ty * 64
-                                                + thread_tx * 2
-                                                + index_i0,
-                                            )
-                                            new_axis_vj = T.axis.S(
-                                                1024,
-                                                k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1,
-                                            )
-                                            shared_b[new_axis_vi, new_axis_vj + 8] = match_buffer_b[
-                                                new_axis_vi, new_axis_vj
-                                            ]
-
-                            for k_i in range(0, 2):
-                                for index_i in range(0, 2):
-                                    with T.block():
-                                        new_axis_vi = T.axis.S(
-                                            64, axis_bx * 4 + thread_ty * 2 + index_i
-                                        )
-                                        axis_vk = T.axis.S(64, k_o * 2 + k_i)
-                                        T.reads(
-                                            shared_a[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16 + 8,
-                                            ]
-                                        )
-                                        T.writes(
-                                            wmma_a[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16,
-                                            ]
-                                        )
-                                        stride0 = T.int32()
-                                        stride1 = T.int32()
-                                        match_buffer_a0 = T.match_buffer(
-                                            shared_a[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16 + 8,
-                                            ],
-                                            (16, 16 + 8),
-                                            "float16",
-                                            strides=[stride0, stride1],
-                                            scope="shared",
-                                            offset_factor=1,
-                                        )
-                                        wmma_a0 = T.match_buffer(
-                                            wmma_a[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16,
-                                            ],
-                                            (16, 16),
-                                            "float16",
-                                            strides=[16, 1],
-                                            scope="wmma.matrix_a",
-                                            offset_factor=1,
-                                        )
-                                        T.evaluate(
-                                            T.tvm_load_matrix_sync(
-                                                wmma_a0.data,
-                                                16,
-                                                16,
-                                                16,
-                                                index_i,
-                                                T.tvm_access_ptr(
-                                                    T.type_annotation(dtype="float16"),
-                                                    match_buffer_a0.data,
-                                                    match_buffer_a0.elem_offset + 8,
-                                                    match_buffer_a0.strides[0],
-                                                    1,
-                                                    dtype="handle",
-                                                ),
-                                                match_buffer_a0.strides[0],
-                                                "row_major",
-                                                dtype="handle",
-                                            )
-                                        )
-                                for index_jj in range(0, 4):
-                                    with T.block():
-                                        new_axis_vj = T.axis.S(
-                                            64, axis_by * 8 + thread_tz * 4 + index_jj
-                                        )
-                                        axis_vk = T.axis.S(64, k_o * 2 + k_i)
-                                        T.reads(
-                                            shared_b[
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16 + 8,
-                                            ]
-                                        )
-                                        T.writes(
-                                            wmma_b[
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16,
-                                            ]
-                                        )
-                                        stride0 = T.int32()
-                                        stride1 = T.int32()
-                                        match_buffer_b0 = T.match_buffer(
-                                            shared_b[
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16 + 8,
-                                            ],
-                                            (16, 16 + 8),
-                                            "float16",
-                                            strides=[stride0, stride1],
-                                            scope="shared",
-                                            offset_factor=1,
-                                        )
-                                        wmma_b0 = T.match_buffer(
-                                            wmma_b[
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16,
-                                            ],
-                                            (16, 16),
-                                            "float16",
-                                            strides=[16, 1],
-                                            scope="wmma.matrix_b",
-                                            offset_factor=1,
-                                        )
-                                        T.evaluate(
-                                            T.tvm_load_matrix_sync(
-                                                wmma_b0.data,
-                                                16,
-                                                16,
-                                                16,
-                                                index_jj,
-                                                T.tvm_access_ptr(
-                                                    T.type_annotation(dtype="float16"),
-                                                    match_buffer_b0.data,
-                                                    match_buffer_b0.elem_offset + 8,
-                                                    match_buffer_b0.strides[0],
-                                                    1,
-                                                    dtype="handle",
-                                                ),
-                                                match_buffer_b0.strides[0],
-                                                "col_major",
-                                                dtype="handle",
-                                            )
-                                        )
-                                for index_i, index_jj in T.grid(2, 4):
-                                    with T.block():
-                                        new_axis_vi = T.axis.S(
-                                            64, axis_bx * 4 + thread_ty * 2 + index_i
-                                        )
-                                        new_axis_vj = T.axis.S(
-                                            64, axis_by * 8 + thread_tz * 4 + index_jj
-                                        )
-                                        axis_vk = T.axis.R(64, k_o * 2 + k_i)
-                                        T.reads(
-                                            [
-                                                wmma_a[
-                                                    new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                    axis_vk * 16 : axis_vk * 16 + 16,
-                                                ],
-                                                wmma_b[
-                                                    new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                    axis_vk * 16 : axis_vk * 16 + 16,
-                                                ],
-                                                wmma_c[
-                                                    new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                    new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                ],
-                                            ]
-                                        )
-                                        T.writes(
-                                            wmma_c[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                            ]
-                                        )
-                                        wmma_a1 = T.match_buffer(
-                                            wmma_a[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16,
-                                            ],
-                                            (16, 16),
-                                            "float16",
-                                            strides=[16, 1],
-                                            scope="wmma.matrix_a",
-                                            offset_factor=1,
-                                        )
-                                        wmma_b1 = T.match_buffer(
-                                            wmma_b[
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                                axis_vk * 16 : axis_vk * 16 + 16,
-                                            ],
-                                            (16, 16),
-                                            "float16",
-                                            strides=[16, 1],
-                                            scope="wmma.matrix_b",
-                                            offset_factor=1,
-                                        )
-                                        wmma_c1 = T.match_buffer(
-                                            wmma_c[
-                                                new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                                new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                            ],
-                                            (16, 16),
-                                            "float32",
-                                            strides=[16 * 4, 1],
-                                            scope="wmma.accumulator",
-                                            offset_factor=1,
-                                        )
-                                        T.evaluate(
-                                            T.tvm_mma_sync(
-                                                wmma_c1.data,
-                                                index_i * 4 + index_jj,
-                                                wmma_a1.data,
-                                                index_i,
-                                                wmma_b1.data,
-                                                index_jj,
-                                                wmma_c1.data,
-                                                index_i * 4 + index_jj,
-                                                dtype="handle",
-                                            )
-                                        )
-                        for index_i, index_jj in T.grid(2, 4):
-                            with T.block():
-                                new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i)
-                                new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj)
-                                T.reads(
-                                    wmma_c[
-                                        new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                        new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                    ]
-                                )
-                                T.writes(
-                                    match_buffer_c[
-                                        new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                        new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                    ]
-                                )
-                                stride0 = T.int32()
-                                stride1 = T.int32()
-                                wmma_c2 = T.match_buffer(
-                                    wmma_c[
-                                        new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                        new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                    ],
-                                    (16, 16),
-                                    "float32",
-                                    strides=[16 * 4, 1],
-                                    scope="wmma.accumulator",
-                                    offset_factor=1,
-                                )
-                                match_buffer_c1 = T.match_buffer(
-                                    match_buffer_c[
-                                        new_axis_vi * 16 : new_axis_vi * 16 + 16,
-                                        new_axis_vj * 16 : new_axis_vj * 16 + 16,
-                                    ],
-                                    (16, 16),
-                                    "float32",
-                                    strides=[stride0, stride1],
-                                    offset_factor=1,
-                                )
-                                T.evaluate(
-                                    T.tvm_store_matrix_sync(
-                                        wmma_c2.data,
-                                        16,
-                                        16,
-                                        16,
-                                        index_i * 4 + index_jj,
-                                        T.tvm_access_ptr(
-                                            T.type_annotation(dtype="float32"),
-                                            match_buffer_c1.data,
-                                            match_buffer_c1.elem_offset,
-                                            match_buffer_c1.strides[0],
-                                            1,
-                                            dtype="handle",
-                                        ),
-                                        match_buffer_c1.strides[0],
-                                        "row_major",
-                                        dtype="handle",
-                                    )
-                                )
-
-
-@tvm.testing.requires_cuda
-def test_gemm_tensorcore():
-    """Test running gemm on tensorcore."""
-    dev = tvm.device("cuda", 0)
-    a_np = np.random.uniform(size=(1024, 1024)).astype("float16")
-    b_np = np.random.uniform(size=(1024, 1024)).astype("float16")
-    c_np = np.dot(a_np.astype("float32"), b_np.T.astype("float32"))
-    buff_a = tvm.nd.array(a_np, dev)
-    buff_b = tvm.nd.array(b_np, dev)
-    buff_c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev)
-    myfunc = tvm.build(tensorcore_gemm, target="cuda", name="dense")
-    myfunc(buff_a, buff_b, buff_c)
-    tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-3)
-
-    evaluator = myfunc.time_evaluator(myfunc.entry_name, dev, number=100)
-    time_elapsed = evaluator(buff_a, buff_b, buff_c).mean
-    num_flops = 2 * 1024 * 1024 * 1024
-    gflops = num_flops / (time_elapsed * 1e3) / 1e6
-    print("gemm with tensor core: %f ms" % (time_elapsed * 1e3))
-    print("GFLOPS: %f" % gflops)
-
-
-if __name__ == "__main__":
-    test_gemm_tensorcore()
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
deleted file mode 100644
index f173e69cb94e..000000000000
--- a/tests/python/integration/test_reduce.py
+++ /dev/null
@@ -1,677 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test scheduling of reduction operations."""
-import numpy as np
-
-import tvm
-from tvm import te, topi
-from tvm.driver.build_module import schedule_to_module
-import tvm.testing
-import tvm.topi.testing
-
-
-@tvm.testing.requires_gpu
-def test_reduce_prims():
-    """Test reduction operations."""
-
-    def test_prim(reducer, np_reducer):
-        # graph
-        size_var_n = tvm.te.size_var("n")
-        size_var_m = tvm.te.size_var("m")
-        placeholder_a = te.placeholder((size_var_n, size_var_m), name="A")
-        result_r = te.compute((size_var_n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R")
-        axis_k = te.reduce_axis((0, size_var_m))
-        result_b = te.compute(
-            (size_var_n,),
-            lambda i: reducer(placeholder_a[i, axis_k], axis=axis_k, where=(result_r[i] == 1)),
-            name="B",
-        )
-        # schedule
-        schedule = te.create_schedule(result_b.op)
-        # create iter var and assign them tags.
-        num_thread = 1
-        axis_x0, axis_x1 = schedule[result_b].split(result_b.op.axis[0], factor=num_thread)
-        schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x"))
-        schedule[result_b].bind(axis_x1, te.thread_axis("threadIdx.x"))
-        schedule[result_r].compute_inline()
-
-        # one line to build the function.
-        def check_device(device, host="llvm"):
-            dev = tvm.device(device, 0)
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled.." % device)
-                return
-            freduce = tvm.build(
-                schedule,
-                args=[placeholder_a, result_b],
-                target=tvm.target.Target(device, host),
-                name="myreduce",
-            )
-            # launch the kernel.
-            num_n = 1028
-            num_m = 129
-            buff_x = tvm.nd.array(
-                np.random.uniform(size=(num_n, num_m)).astype(placeholder_a.dtype), dev
-            )
-            buff_y = tvm.nd.array(np.zeros(num_n, dtype=result_b.dtype), dev)
-            freduce(buff_x, buff_y)
-            npy = buff_y.numpy()
-            npy[:2] = 0
-            res = np_reducer(buff_x.numpy(), axis=1)
-            res[:2] = 0
-            tvm.testing.assert_allclose(npy, res, rtol=1e-4)
-
-        check_device("metal")
-        check_device("vulkan")
-        check_device("cuda")
-        check_device("opencl")
-        check_device("rocm")
-
-    test_prim(te.sum, np.sum)
-    test_prim(tvm.te.min, np.amin)
-    test_prim(tvm.te.max, np.amax)
-
-
-def test_init_imm():
-    """Test initial values which are immutable in reduction ops."""
-    num_n = 1027
-    arr_length = tvm.runtime.convert(num_n)
-    placeholder_a = te.placeholder((arr_length,), name="A")
-    axis_k = te.reduce_axis((0, arr_length))
-    result_b = te.compute(
-        (), lambda: te.sum(placeholder_a[axis_k], axis=axis_k, init=10.0), name="B"
-    )
-    # schedule
-    schedule_s = te.create_schedule(result_b.op)
-    # one line to build the function.
-    def check_target(target="llvm"):
-        if not tvm.runtime.enabled(target):
-            return
-        dev = tvm.cpu(0)
-        fapi = tvm.lower(schedule_s, args=[placeholder_a, result_b])
-        fsum = tvm.build(fapi, target=target, name="mysum")
-        # launch the kernel.
-        buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev)
-        buff_b = tvm.nd.array(np.zeros((), dtype=result_b.dtype), dev)
-        fsum(buff_a, buff_b)
-        res = 10.0 + np.sum(buff_a.numpy(), axis=0)
-        tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4)
-
-    check_target()
-
-
-def test_init():
-    """Test initializer which is non-const."""
-    num_n = 1027
-    arr_length = tvm.runtime.convert(num_n)
-    placeholder_a = te.placeholder((arr_length, arr_length), name="A")
-    placeholder_c = te.placeholder((arr_length, arr_length), name="C")
-    placeholder_i = te.placeholder((arr_length, arr_length), name="I")
-    axis_k = te.reduce_axis((0, arr_length))
-    result_b = te.compute(
-        (arr_length, arr_length),
-        lambda i, j: te.sum(
-            placeholder_a[i, axis_k] * placeholder_c[axis_k, j],
-            axis=axis_k,
-            init=placeholder_i[i, j],
-        ),
-        name="B",
-    )
-
-    # schedule
-    schedule = te.create_schedule(result_b.op)
-    # one line to build the function.
-    def check_target(target="llvm"):
-        if not tvm.runtime.enabled(target):
-            return
-        dev = tvm.cpu(0)
-        fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b])
-        print(fapi)
-        mmult = tvm.build(fapi, target=target, name="mmult")
-        # launch the kernel.
-        buff_a = tvm.nd.array(
-            np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev
-        )
-        buff_c = tvm.nd.array(
-            np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev
-        )
-        buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev)
-        buf_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev)
-        mmult(buff_a, buff_c, buff_i, buf_b)
-        res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy())
-        tvm.testing.assert_allclose(buf_b.numpy(), res, rtol=1e-4)
-
-    check_target()
-
-
-def test_rfactor():
-    """Test rfactors."""
-    num_n = 1027
-    arr_length = tvm.runtime.convert(num_n)
-    placeholder_a = te.placeholder((arr_length,), name="A")
-    axis_k = te.reduce_axis((0, arr_length))
-    placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B")
-    # schedule
-    schedule = te.create_schedule(placeholder_b.op)
-    axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4)
-    rfactor_bf = schedule.rfactor(placeholder_b, axis_kf)
-    schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0])
-    # one line to build the function.
-    def check_target(target="llvm"):
-        if not tvm.testing.device_enabled(target):
-            return
-        dev = tvm.cpu(0)
-        fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b])
-        fsum = tvm.build(fapi, target=target, name="mysum")
-        # launch the kernel.
-        buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev)
-        buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev)
-        fsum(buff_a, buff_b)
-        res = np.sum(buff_a.numpy(), axis=0)
-        tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4)
-
-    check_target()
-
-
-def test_rfactor_init():
-    """Test rfactors with constant inits."""
-    num_n = 1027
-    arr_length = tvm.runtime.convert(num_n)
-    placeholder_a = te.placeholder((arr_length, arr_length), name="A")
-    placeholder_c = te.placeholder((arr_length, arr_length), name="C")
-    placeholder_i = te.placeholder((arr_length, arr_length), name="I")
-    axis_k = te.reduce_axis((0, arr_length))
-    result_b = te.compute(
-        (arr_length, arr_length),
-        lambda i, j: te.sum(
-            placeholder_a[i, axis_k] * placeholder_c[axis_k, j],
-            axis=axis_k,
-            init=placeholder_i[i, j],
-        ),
-        name="B",
-    )
-
-    # schedule
-    schedule = te.create_schedule(result_b.op)
-    axis_kf, _ = schedule[result_b].split(axis_k, nparts=4)
-    rfactor_bf = schedule.rfactor(result_b, axis_kf, 1)
-    schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0])
-    # one line to build the function.
-    def check_target(target="llvm"):
-        if not tvm.runtime.enabled(target):
-            return
-        dev = tvm.cpu(0)
-        fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b])
-        print(fapi)
-        mmult = tvm.build(fapi, target=target, name="mmult")
-        # launch the kernel.
-        buff_a = tvm.nd.array(
-            np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev
-        )
-        buff_c = tvm.nd.array(
-            np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev
-        )
-        buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev)
-        buff_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev)
-        mmult(buff_a, buff_c, buff_i, buff_b)
-        res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy())
-        tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4)
-
-    check_target()
-
-
-def test_rfactor_factor_axis():
-    """Test rfactors across axis."""
-    num_n = 1027
-    arr_length = tvm.runtime.convert(num_n)
-    placeholder_a = te.placeholder((arr_length,), name="A")
-    axis_k = te.reduce_axis((0, arr_length))
-    placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B")
-    # schedule
-    schedule = te.create_schedule(placeholder_b.op)
-    axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4)
-    rfactor_bf = schedule.rfactor(placeholder_b, axis_kf, 0)
-    schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0])
-    # one line to build the function.
-    def check_target(target="llvm"):
-        if not tvm.testing.device_enabled(target):
-            return
-        dev = tvm.cpu(0)
-        fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b])
-        fsum = tvm.build(fapi, target=target, name="mysum")
-        # launch the kernel.
-        buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev)
-        buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev)
-        fsum(buff_a, buff_b)
-        res = np.sum(buff_a.numpy(), axis=0)
-        tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4)
-
-    check_target()
-
-
-@tvm.testing.requires_gpu
-def test_rfactor_threads():
-    """Test rfactors across threads."""
-    num_n = 1027
-    num_m = 10
-    length_n = tvm.runtime.convert(num_n)
-    length_m = tvm.runtime.convert(num_m)
-    placeholder_a = te.placeholder((length_m, length_n), name="A")
-    axis_k = te.reduce_axis((0, length_n))
-    nthread = 16
-    result_b = te.compute(
-        (length_m,),
-        lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k, where=(i > 1)),
-        name="B",
-    )
-    # schedule
-    schedule = te.create_schedule(result_b.op)
-    _, axis_kf = schedule[result_b].split(axis_k, factor=nthread)
-    rfactor_bf = schedule.rfactor(result_b, axis_kf)
-    axis_bx, axis_ty = schedule[result_b].split(schedule[result_b].op.axis[0], factor=nthread)
-    schedule[result_b].bind(axis_bx, te.thread_axis("blockIdx.x"))
-    schedule[result_b].bind(axis_ty, te.thread_axis("threadIdx.y"))
-    axis_tx = schedule[result_b].op.reduce_axis[0]
-    thread_x = te.thread_axis("threadIdx.x")
-    schedule[result_b].bind(axis_tx, thread_x)
-    schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx)
-    schedule[result_b].set_store_predicate(thread_x.var.equal(0))
-
-    # one line to build the function.
-    def check_target(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-
-        fapi = tvm.lower(schedule, args=[placeholder_a, result_b])
-        fsum = tvm.build(fapi, target=device, name="mysum")
-        # launch the kernel.
-        buff_a = tvm.nd.array(
-            np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev
-        )
-        buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev)
-        fsum(buff_a, buff_b)
-        res = np.sum(buff_a.numpy(), axis=1)
-        res[:2] = 0
-        tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4)
-
-    check_target("vulkan")
-    check_target("cuda")
-    check_target("metal")
-    check_target("opencl")
-    check_target("rocm")
-
-
-@tvm.testing.requires_gpu
-def test_rfactor_elemwise_threads():
-    """Test rfactor elemwise threads."""
-    num_n = 1025
-    num_m = 10
-    placeholder_a = te.placeholder((num_m, num_n), name="A")
-    axis_k = te.reduce_axis((0, num_n))
-    nthread = 16
-    result_b = te.compute(
-        (num_m,), lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k), name="B"
-    )
-    result_bb = te.compute((num_m,), lambda i: result_b[i] + 1, name="BB")
-    result_c = te.compute((num_m,), lambda i: result_bb[i] + 1, name="C")
-    # schedule
-    schedule = te.create_schedule(result_c.op)
-    schedule[result_bb].compute_inline()
-    axis_bx, axis_ty = schedule[result_c].split(schedule[result_c].op.axis[0], factor=nthread)
-    _, axis_kf = schedule[result_b].split(axis_k, factor=nthread)
-    rfactor_bf = schedule.rfactor(result_b, axis_kf)
-    schedule[result_b].compute_at(schedule[result_c], axis_ty)
-    schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x"))
-    schedule[result_c].bind(axis_ty, te.thread_axis("threadIdx.y"))
-    axis_tx = schedule[result_b].op.reduce_axis[0]
-    thread_x = te.thread_axis("threadIdx.x")
-    schedule[result_b].bind(axis_tx, thread_x)
-    schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx)
-    # Since thread_x is shared across reductions
-    # only one of them need to do write back
-    schedule[result_b].set_store_predicate(thread_x.var.equal(0))
-    schedule[result_c].set_store_predicate(thread_x.var.equal(0))
-
-    # one line to build the function.
-    def check_target(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-        fapi = tvm.lower(schedule, args=[placeholder_a, result_c])
-        fsum = tvm.build(fapi, target=device, name="mysum")
-        # launch the kernel.
-        buff_a = tvm.nd.array(
-            np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev
-        )
-        buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev)
-        fsum(buff_a, buff_b)
-        res = np.sum(buff_a.numpy(), axis=1) + 2
-        tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4)
-
-    check_target("vulkan")
-    check_target("cuda")
-    check_target("metal")
-    check_target("opencl")
-    check_target("rocm")
-
-
-def test_argmax():
-    """Test argmax."""
-
-    def fcombine(tensor_x, tensor_y):
-        lhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[0], tensor_y[0])
-        rhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[1], tensor_y[1])
-        return lhs, rhs
-
-    def fidentity(tensor1, tensor2):
-        return tvm.tir.const(-1, tensor1), tvm.te.min_value(tensor2)
-
-    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
-    size_var_m = te.size_var("m")
-    size_var_n = te.size_var("n")
-    idx = te.placeholder((size_var_m, size_var_n), name="idx", dtype="int32")
-    val = te.placeholder((size_var_m, size_var_n), name="val", dtype="float32")
-    axis_k = te.reduce_axis((0, size_var_n), "k")
-    result_t0, result_t1 = te.compute(
-        (size_var_m,), lambda i: argmax((idx[i, axis_k], val[i, axis_k]), axis=axis_k), name="T"
-    )
-    schedule = te.create_schedule(result_t0.op)
-
-    def check_target():
-        device = "cpu"
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-        dev = tvm.device(device, 0)
-        fapi = tvm.lower(schedule, args=[idx, val, result_t0, result_t1])
-        fargmax = tvm.build(fapi, target="llvm", name="argmax")
-
-        height = 12
-        width = 16
-        np_idx = np.repeat(np.arange(width, dtype="int32").reshape(1, width), height, axis=0)
-        np_val = np.random.uniform(size=(height, width)).astype("float32")
-        np_res = np.argmax(np_val, axis=1)
-
-        nd_idx = tvm.nd.array(np_idx, dev)
-        nd_val = tvm.nd.array(np_val, dev)
-        nd_res0 = tvm.nd.array(np.zeros(height, dtype="int32"), dev)
-        nd_res1 = tvm.nd.array(np.zeros(height, dtype="float32"), dev)
-        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
-        tvm.testing.assert_allclose(np_res, nd_res0.numpy())
-
-    check_target()
-
-
-@tvm.testing.requires_gpu
-def test_rfactor_argmax():
-    """Test rfactor argmax"""
-
-    def fcombine(tensor0, tensor1):
-        lhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[0], tensor1[0])
-        rhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[1], tensor1[1])
-        return lhs, rhs
-
-    def fidentity(tensor0, tensor1):
-        return tvm.tir.const(-1, tensor0), tvm.te.min_value(tensor1)
-
-    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
-
-    num_width = 1027
-    num_height = 10
-    width = tvm.runtime.convert(num_width)
-    height = tvm.runtime.convert(num_height)
-    placeholder_a0 = te.placeholder((height, width), name="A0", dtype="int32")
-    placeholder_a1 = te.placeholder((height, width), name="A1", dtype="float32")
-    axis_k = te.reduce_axis((0, width))
-    result_b0, result_b1 = te.compute(
-        (height,),
-        lambda i: argmax((placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k),
-        name="B",
-    )
-
-    # schedule
-    schedule = te.create_schedule(result_b0.op)
-    nthread = 16
-    _, axis_kf = schedule[result_b0].split(axis_k, factor=nthread)
-    rfactor_bf0, _ = schedule.rfactor(result_b0, axis_kf)
-    axis_bx, axis_ty = schedule[result_b0].split(schedule[result_b0].op.axis[0], factor=nthread)
-    schedule[result_b0].bind(axis_bx, te.thread_axis("blockIdx.x"))
-    schedule[result_b0].bind(axis_ty, te.thread_axis("threadIdx.y"))
-    axis_tx = schedule[result_b0].op.reduce_axis[0]
-    thread_x = te.thread_axis("threadIdx.x")
-    schedule[result_b0].bind(axis_tx, thread_x)
-    schedule[rfactor_bf0.op].compute_at(schedule[result_b0], axis_tx)
-    schedule[result_b0].set_store_predicate(thread_x.var.equal(0))
-
-    def check_target(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-        fapi = tvm.lower(schedule, args=[placeholder_a0, placeholder_a1, result_b0, result_b1])
-        fargmax = tvm.build(fapi, target=device, name="argmax")
-
-        np_idx = np.repeat(
-            np.arange(num_width, dtype="int32").reshape(1, num_width), num_height, axis=0
-        )
-        np_val = np.random.uniform(size=(num_height, num_width)).astype("float32")
-        np_res = np.argmax(np_val, axis=1)
-
-        nd_idx = tvm.nd.array(np_idx, dev)
-        nd_val = tvm.nd.array(np_val, dev)
-        nd_res0 = tvm.nd.array(np.zeros(num_height, dtype="int32"), dev)
-        nd_res1 = tvm.nd.array(np.zeros(num_height, dtype="float32"), dev)
-        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
-        tvm.testing.assert_allclose(np_res, nd_res0.numpy())
-
-    check_target("cuda")
-    check_target("vulkan")
-    check_target("rocm")
-
-
-@tvm.testing.requires_gpu
-def test_warp_reduction1():
-    """Test warp reductions."""
-    nthx = 32
-    nthy = 4
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis((0, nthx), "threadIdx.x")
-    thread_y = te.thread_axis((0, nthy), "threadIdx.y")
-
-    def check_target(device, m, n):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-
-        # compute
-        placeholder_a = te.placeholder((m, n), name="A")
-        axis_k = te.reduce_axis((0, n))
-        placeholder_b = te.compute(
-            (m,), lambda i: te.max(placeholder_a[i][axis_k], axis=axis_k), name="B"
-        )
-        schedule = te.create_schedule(placeholder_b.op)
-
-        # schedule
-        axis_k = schedule[placeholder_b].op.reduce_axis[0]
-        axis_ko, _ = schedule[placeholder_b].split(axis_k, nparts=nthx)
-        schedule[placeholder_b].bind(axis_ko, thread_x)
-        axis_xo, axis_xi = schedule[placeholder_b].split(
-            schedule[placeholder_b].op.axis[0], factor=nthy
-        )
-        schedule[placeholder_b].bind(axis_xi, thread_y)
-        schedule[placeholder_b].bind(axis_xo, block_x)
-
-        tvm.lower(schedule, [placeholder_a, placeholder_b], simple_mode=True)
-
-        # validation
-        func = tvm.build(schedule, [placeholder_a, placeholder_b], device, name="warp_reduction")
-        a_np = np.random.uniform(size=(m, n)).astype(placeholder_a.dtype)
-        b_np = np.zeros((m,), dtype=placeholder_a.dtype)
-        buff_a = tvm.nd.array(a_np, dev)
-        buff_b = tvm.nd.array(b_np, dev)
-        b_np = np.max(a_np, axis=1)
-        func(buff_a, buff_b)
-        tvm.testing.assert_allclose(buff_b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    check_target("cuda", m=32, n=256)
-    check_target("cuda", m=10, n=20)
-    check_target("rocm", m=32, n=256)
-    check_target("rocm", m=10, n=20)
-    # This is a bug in normal reduction.
-    # check_target("cuda", m=10, n=37)
-
-
-@tvm.testing.requires_gpu
-def test_warp_reduction2():
-    """Test warp reductions."""
-
-    def fcombine(tensor1, tensor2):
-        return tensor1[0] + tensor2[0], tensor1[1] * tensor2[1]
-
-    def fidentity(tensor1, tensor2):
-        return tvm.tir.const(0, tensor1), tvm.tir.const(1, tensor2)
-
-    add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer")
-
-    # compute
-    num_m = 16
-    num_n = 256
-    placeholder_a0 = te.placeholder((num_m, num_n), name="A0", dtype="float32")
-    placeholder_a1 = te.placeholder((num_m, num_n), name="Al", dtype="float32")
-    axis_k = te.reduce_axis((0, num_n), "k")
-    result0, result1 = te.compute(
-        (num_m,),
-        lambda i: add_mul_reducer(
-            (placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k
-        ),
-        name="T",
-    )
-
-    nthdx, nthdy = 32, 2
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis((0, nthdx), "threadIdx.x")
-    thread_y = te.thread_axis((0, nthdy), "threadIdx.y")
-
-    def check_target(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-
-        # schedule
-        schedule = te.create_schedule(result0.op)
-        axis_ko, _ = schedule[result0].split(axis_k, nparts=nthdx)
-        axis_xo, axis_xi = schedule[result0].split(schedule[result0].op.axis[0], factor=nthdy)
-        schedule[result0].bind(axis_ko, thread_x)
-        schedule[result0].bind(axis_xi, thread_y)
-        schedule[result0].bind(axis_xo, block_x)
-
-        # validation
-        dev = tvm.device(device, 0)
-        a0_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a0.dtype)
-        a1_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a1.dtype)
-        t0_np = np.zeros((num_m,), dtype=placeholder_a0.dtype)
-        t1_np = np.zeros((num_m,), dtype=placeholder_a1.dtype)
-        buff_a0 = tvm.nd.array(a0_np, dev)
-        buff_a1 = tvm.nd.array(a1_np, dev)
-        buff_t0 = tvm.nd.array(t0_np, dev)
-        buff_t1 = tvm.nd.array(t1_np, dev)
-        func = tvm.build(
-            schedule, [placeholder_a0, placeholder_a1, result0, result1], device, name="reduction"
-        )
-        func(buff_a0, buff_a1, buff_t0, buff_t1)
-        t0_np = np.sum(a0_np, axis=1)
-        t1_np = np.product(a1_np, axis=1)
-        tvm.testing.assert_allclose(buff_t0.numpy(), t0_np, rtol=1e-3, atol=1e-3)
-        tvm.testing.assert_allclose(buff_t1.numpy(), t1_np, rtol=1e-3, atol=1e-3)
-
-    check_target("cuda")
-    check_target("rocm")
-
-
-@tvm.testing.requires_cuda
-def test_reduce_storage_reuse():
-    """Test reduction reuses storage."""
-    target = tvm.target.Target("cuda")
-
-    def run_passes(sch, args):
-        mod = schedule_to_module(sch, args)
-        mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod)
-        return tvm.transform.Sequential(
-            [
-                tvm.tir.transform.StorageFlatten(64),
-                tvm.tir.transform.Simplify(),
-                tvm.tir.transform.StorageRewrite(),
-                tvm.tir.transform.LowerThreadAllreduce(),
-            ]
-        )(mod)
-
-    dev = tvm.device(target.kind.name, 0)
-    shape = (16, 16)
-
-    placeholder_a = te.placeholder(shape, dtype="float32", name="A")
-    placeholder_b = topi.nn.softmax(placeholder_a, axis=1) + 1.0
-
-    with tvm.target.Target(target):
-        schedule = topi.cuda.schedule_softmax(placeholder_b)
-
-    mod = run_passes(schedule, [placeholder_a, placeholder_b])
-
-    # Due to the storage rewrite pass, the reduction output storage reduce_temp0 can be reused as
-    # the storage of the next compute.
-
-    # Example:
-    # ...
-    # tir.tvm_thread_allreduce((uint32)1, normal_reduce_temp0[0], 1, reduce_temp0, threadIdx.x)
-    # if ((threadIdx.x < 16)) {
-    #   reduce_temp0[0] = (T_softmax_exp[threadIdx.x]/reduce_temp0[0])
-    # }
-    # ...
-
-    # The LowerThreadAllreduce pass should remap reduce_temp0 on the left hand side of the store
-    # above, as well as the load on the right hand side.
-
-    # Expected output:
-    # ...
-    # red_buf0[0] = tir.tvm_warp_shuffle(mask[0], red_buf0[0], 0, 32, 32)
-    # if ((threadIdx.x < 16)) {
-    #   red_buf0[0] = (T_softmax_exp[threadIdx.x]/red_buf0[0])
-    # }
-    # ...
-
-    def check_store_dst_remapped(op):
-        if isinstance(op, tvm.tir.BufferStore):
-            assert op.buffer.data.name != "reduce_temp0"
-
-    tvm.tir.stmt_functor.post_order_visit(mod["main"].body, check_store_dst_remapped)
-
-    inp = np.random.uniform(size=shape).astype("float32")
-    ref = tvm.topi.testing.softmax_python(inp) + 1.0
-
-    func = tvm.build(schedule, [placeholder_a, placeholder_b], target)
-    buff_a = tvm.nd.array(inp, dev)
-    buff_b = tvm.nd.array(np.zeros(shape, dtype=placeholder_b.dtype), dev)
-    func(buff_a, buff_b)
-    tvm.testing.assert_allclose(buff_b.numpy(), ref, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
deleted file mode 100644
index fa920e513502..000000000000
--- a/tests/python/integration/test_scan.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test scheduling adn running scan operators."""
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-
-
-@tvm.testing.requires_gpu
-def test_scan():
-    """Test scan operators."""
-    size_var_m = te.size_var("m")
-    size_var_n = te.size_var("n")
-    placeholder_x = te.placeholder((size_var_m, size_var_n), name="X")
-    s_state = te.placeholder((size_var_m, size_var_n))
-    s_init = te.compute((1, size_var_n), lambda _, i: placeholder_x[0, i])
-    s_update = te.compute(
-        (size_var_m, size_var_n), lambda t, i: s_state[t - 1, i] + placeholder_x[t, i]
-    )
-    scan = tvm.te.scan(s_init, s_update, s_state)
-    # test scan + compute case
-    res = te.compute((size_var_m, size_var_n), lambda i, j: scan[i, j])
-
-    # schedule
-    schedule = te.create_schedule(res.op)
-    num_thread = 256
-    block_x = te.thread_axis(None, "blockIdx.x")
-    thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-    axis_xo, axis_xi = schedule[s_init].split(s_init.op.axis[1], factor=num_thread)
-    schedule[s_init].bind(axis_xo, block_x)
-    schedule[s_init].bind(axis_xi, thread_x)
-    axis_xo, axis_xi = schedule[s_update].split(s_update.op.axis[1], factor=num_thread)
-    schedule[s_update].bind(axis_xo, block_x)
-    schedule[s_update].bind(axis_xi, thread_x)
-    axis_xo, axis_xi = schedule[res].split(res.op.axis[1], factor=num_thread)
-    schedule[res].bind(axis_xo, block_x)
-    schedule[res].bind(axis_xi, thread_x)
-
-    # one line to build the function.
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("skip because %s is not enabled.." % device)
-            return
-        fscan = tvm.build(schedule, [placeholder_x, res], device, name="myscan")
-        # launch the kernel.
-        num_n = 1024
-        num_m = 10
-        a_np = np.random.uniform(size=(num_m, num_n)).astype(res.dtype)
-        buff_a = tvm.nd.array(a_np, dev)
-        buff_b = tvm.nd.array(np.zeros((num_m, num_n), dtype=res.dtype), dev)
-        fscan(buff_a, buff_b)
-        tvm.testing.assert_allclose(buff_b.numpy(), np.cumsum(a_np, axis=0))
-
-    check_device("vulkan")
-    check_device("cuda")
-    check_device("metal")
-    check_device("opencl")
-
-
-if __name__ == "__main__":
-    test_scan()
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
deleted file mode 100644
index 589845b13f7f..000000000000
--- a/tests/python/integration/test_tuning.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import logging
-import tempfile
-from typing import List, Optional
-
-import numpy as np  # type: ignore
-import pytest
-import tvm
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tune_utils import generate_input_data
-from tvm.target.target import Target
-
-logging.basicConfig(
-    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-
-
-@pytest.mark.skip("Integration test")
-@pytest.mark.parametrize(
-    "model_name, input_shape, data_type, target, layout",
-    [
-        ("resnet_18", [1, 3, 224, 224], "float32", "llvm --num-cores=12", "NHWC"),
-        ("resnet_18", [1, 3, 224, 224], "float32", "nvidia/geforce-rtx-3070", "NHWC"),
-    ],
-)
-def test_meta_schedule_tune_relay(
-    model_name: str,
-    input_shape: List[int],
-    data_type: str,
-    target: str,
-    layout: Optional[str],
-):
-    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
-    data = generate_input_data(input_shape, data_type)
-
-    mod, params, (input_name, _, _) = get_network(
-        name=model_name,
-        input_shape=input_shape,
-        layout=layout,
-    )
-
-    target = Target(target)
-    with tempfile.TemporaryDirectory() as work_dir:
-        with ms.Profiler() as profiler:
-            database = ms.relay_integration.tune_relay(
-                mod=mod,
-                target=target,
-                params=params,
-                work_dir=work_dir,
-                max_trials_global=2048,
-            )
-            rt_mod1 = ms.relay_integration.compile_relay(
-                database=database,
-                mod=mod,
-                target=target,
-                params=params,
-            )
-        print(profiler.table())
-
-    def get_output(data, lib, dev):
-        module = graph_executor.GraphModule(lib["default"](dev))
-        module.set_input(input_name, tvm.nd.array(data, device=dev))
-        module.run()
-        return module.get_output(0).numpy()
-
-    # Check correctness
-    actual_output = get_output(data, rt_mod1, dev)
-    print(
-        f"{model_name} finished tuning and running on {Target(target).kind.name}. "
-        "Running baseline...",
-        flush=True,
-    )
-
-    # Compile without meta-schedule for correctness check
-    baseline_target = "llvm -num-cores=1"
-    with tvm.transform.PassContext(opt_level=0):
-        rt_mod2 = relay.build(mod, target=baseline_target, params=params)
-
-    expected_output = get_output(data, rt_mod2, tvm.cpu())
-    print(
-        f"Basline finished running on {Target(baseline_target).kind.name}. "
-        "Verifying correctness...",
-        flush=True,
-    )
-
-    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
-    print(
-        f"Correctness verified for {model_name} on {Target(target).kind.name}.",
-        flush=True,
-    )
-
-
-if __name__ == """__main__""":
-    test_meta_schedule_tune_relay(
-        "resnet_18", [1, 3, 224, 224], "float32", "llvm --num-cores=12", "NHWC"
-    )
-    test_meta_schedule_tune_relay(
-        "resnet_18", [1, 3, 224, 224], "float32", "nvidia/geforce-rtx-3070", None
-    )
diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py
deleted file mode 100644
index d53dc21d6328..000000000000
--- a/tests/python/integration/test_winograd_nnpack.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test winograd convolution using nnpack impl."""
-import numpy as np
-from pytest import skip
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import autotvm, te, topi
-from tvm.autotvm.task.space import FallbackConfigEntity
-from tvm.contrib import nnpack
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-
-def verify_conv2d_nchw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    devices,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-):
-    """Verify conv2d nchw workload."""
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)
-    )
-
-    in_height = in_width = in_size
-
-    placholder_a = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-    placeholder_w = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
-    bias = te.placeholder((num_filter, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(placholder_a.shape)
-    w_shape = get_const_tuple(placeholder_w.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = placholder_a.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skipping %s becuase it is not enabled" % device)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            result_c = topi.nn.conv2d(
-                placholder_a,
-                placeholder_w,
-                stride,
-                padding,
-                dilation,
-                data_layout="NCHW",
-                out_dtype=dtype,
-            )
-            if add_bias:
-                result_c = topi.add(result_c, bias)
-            if add_relu:
-                result_c = topi.nn.relu(result_c)
-            schedule = topi.generic.schedule_conv2d_nchw([result_c])
-
-        buff_a = tvm.nd.array(a_np, dev)
-        buff_w = tvm.nd.array(w_np, dev)
-        buff_b = tvm.nd.array(b_np, dev)
-        buff_c = tvm.nd.array(np.zeros(get_const_tuple(result_c.shape), dtype=result_c.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                schedule,
-                [placholder_a, placeholder_w, bias, result_c],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation),
-            )
-            func(buff_a, buff_w, buff_b, buff_c)
-        else:
-            func = tvm.build(
-                schedule,
-                [placholder_a, placeholder_w, result_c],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation),
-            )
-            func(buff_a, buff_w, buff_c)
-        tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-4)
-
-    for device in devices:
-        check_device(device)
-
-
-class WinogradFallback(autotvm.FallbackContext):
-    """Winograd fallbacks."""
-
-    def _query_inside(self, target, workload):
-        key = (target, workload)
-        if key in self.memory:
-            return self.memory[key]
-        cfg = FallbackConfigEntity()
-        cfg.template_key = "winograd_nnpack_fp32"
-        self.memory[key] = cfg
-        return cfg
-
-
-def test_conv2d_nchw():
-    """Verify conv2d nchw winograd works."""
-
-    if not tvm.get_global_func(
-        "tvm.contrib.nnpack.convolution_inference_without_weight_transform", True
-    ):
-        skip("extern function is not available")
-
-    if not nnpack.is_available():
-        skip("nnpack is not available")
-
-    devices = ["llvm -device=arm_cpu"]
-    autotvm.GLOBAL_SCOPE.silent = True
-    with WinogradFallback():
-        # resnet 18 workloads
-        verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, devices=devices)
-        verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1, devices=devices)
-        verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1, devices=devices)
-        verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1, devices=devices)
-
-        # unet workloads
-        verify_conv2d_nchw(1, 3, 192, 12, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 4, 192, 12, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 12, 96, 24, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 24, 48, 48, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 48, 24, 96, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 96, 12, 180, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 180, 6, 220, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 220, 6, 180, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 180, 12, 96, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 96, 24, 48, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 48, 48, 24, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 24, 96, 12, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 12, 192, 1, 3, 1, 1, add_bias=True, devices=devices)
-
-        # relu, bias
-        verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, devices=devices)
-        verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True, devices=devices)
-        verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True, devices=devices)
-
-        # werid workloads
-        verify_conv2d_nchw(1, 3, 3, 3, 3, 1, 1, devices=devices)
-        verify_conv2d_nchw(1, 13, 71, 59, 3, 1, 1, devices=devices)
-    autotvm.GLOBAL_SCOPE.silent = False
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/ir/test_ir_type.py b/tests/python/ir/test_ir_type.py
index b70406c1bb7a..238cb5965ba3 100644
--- a/tests/python/ir/test_ir_type.py
+++ b/tests/python/ir/test_ir_type.py
@@ -38,15 +38,6 @@ def test_tensor_type_bad_constructor():
         pass
 
 
-def test_tensor_type():
-    tt = tvm.ir.TensorType([1, 2, 3], "float32")
-    assert tt.dtype == "float32"
-    assert list(tt.shape) == [T.int32(1), T.int32(2), T.int32(3)]
-    assert tt.span == None
-    str(tt)
-    check_json_roundtrip(tt)
-
-
 def test_type_param():
     tp = tvm.ir.TypeVar("name", tvm.ir.TypeKind.Type)
     assert tp.kind == tvm.ir.TypeKind.Type
diff --git a/tests/python/ir/test_roundtrip_runtime_module.py b/tests/python/ir/test_roundtrip_runtime_module.py
index 96deb35fb6d8..3723cc6c112c 100644
--- a/tests/python/ir/test_roundtrip_runtime_module.py
+++ b/tests/python/ir/test_roundtrip_runtime_module.py
@@ -21,7 +21,6 @@
 import tvm
 import tvm.testing
 from tvm import TVMError
-from tvm import relay
 
 
 def test_csource_module():
@@ -33,80 +32,5 @@ def test_csource_module():
     assert new_mod.is_binary_serializable
 
 
-def get_test_mod():
-    x = relay.var("x", shape=(1, 10), dtype="float32")
-    y = relay.var("y", shape=(1, 10), dtype="float32")
-    z = relay.add(x, y)
-    func = relay.Function([x, y], z)
-    return relay.build_module._build_module_no_factory(func, target="cuda")
-
-
-def get_cuda_mod():
-    # Get Cuda module which is binary serializable
-    return get_test_mod().imported_modules[0].imported_modules[0]
-
-
-@tvm.testing.requires_cuda
-def test_cuda_module():
-    mod = get_cuda_mod()
-    assert mod.type_key == "cuda"
-    assert mod.is_binary_serializable
-    new_mod = tvm.ir.load_json(tvm.ir.save_json(mod))
-    assert new_mod.type_key == "cuda"
-    assert new_mod.is_binary_serializable
-
-
-@tvm.testing.requires_cuda
-def test_valid_submodules():
-    mod, mod2, mod3, mod4 = get_cuda_mod(), get_cuda_mod(), get_cuda_mod(), get_cuda_mod()
-
-    # Create the nested cuda module
-    mod.import_module(mod2)
-    mod2.import_module(mod3)
-    mod2.import_module(mod4)
-
-    # Root module and all submodules should be binary serializable since they are cuda module
-    assert mod.type_key == "cuda"
-    assert mod.is_binary_serializable
-    assert mod.imported_modules[0].type_key == "cuda"
-    assert mod.imported_modules[0].is_binary_serializable
-    assert mod.imported_modules[0].imported_modules[0].type_key == "cuda"
-    assert mod.imported_modules[0].imported_modules[1].type_key == "cuda"
-    assert mod.imported_modules[0].imported_modules[0].is_binary_serializable
-    assert mod.imported_modules[0].imported_modules[1].is_binary_serializable
-
-    # The roundtripped mod should have the same structure
-    new_mod = tvm.ir.load_json(tvm.ir.save_json(mod))
-    assert new_mod.type_key == "cuda"
-    assert new_mod.is_binary_serializable
-    assert new_mod.imported_modules[0].type_key == "cuda"
-    assert new_mod.imported_modules[0].is_binary_serializable
-    assert new_mod.imported_modules[0].imported_modules[0].type_key == "cuda"
-    assert new_mod.imported_modules[0].imported_modules[1].type_key == "cuda"
-    assert new_mod.imported_modules[0].imported_modules[0].is_binary_serializable
-    assert new_mod.imported_modules[0].imported_modules[1].is_binary_serializable
-
-
-@tvm.testing.requires_cuda
-def test_invalid_submodules():
-    mod, mod2, mod3 = get_cuda_mod(), get_cuda_mod(), get_cuda_mod()
-    mod4 = tvm.get_global_func("relay.build_module._AOTExecutorCodegen")()
-
-    # Create the nested cuda module
-    mod.import_module(mod2)
-    mod2.import_module(mod3)
-    mod2.import_module(mod4)
-
-    # One of submodules is not binary serializable.
-    assert mod.is_binary_serializable
-    assert mod.imported_modules[0].is_binary_serializable
-    assert mod.imported_modules[0].imported_modules[0].is_binary_serializable
-    assert not mod.imported_modules[0].imported_modules[1].is_binary_serializable
-
-    # Therefore, we cannot roundtrip.
-    with pytest.raises(TVMError):
-        tvm.ir.load_json(tvm.ir.save_json(mod))
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/meta_schedule/test_meta_schedule_byoc_tensorrt.py b/tests/python/meta_schedule/test_meta_schedule_byoc_tensorrt.py
deleted file mode 100644
index 21f56cc912ef..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_byoc_tensorrt.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Test Meta Schedule Builder """
-# pylint: disable=missing-docstring
-
-import sys
-from typing import List
-
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.meta_schedule.arg_info import TensorInfo
-from tvm.meta_schedule.builder import BuilderInput, LocalBuilder
-from tvm.meta_schedule.runner import EvaluatorConfig, LocalRunner, RunnerInput
-from tvm.meta_schedule.testing.custom_builder_runner import (
-    build_relay,
-    build_relay_with_tensorrt,
-    run_with_graph_executor,
-)
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.relay import testing
-from tvm.relay.op.contrib import tensorrt
-from tvm.target import Target
-from tvm.tir import FloatImm
-
-has_tensorrt_codegen = pytest.mark.skipif(
-    not tvm.get_global_func("relay.ext.tensorrt", True),
-    reason="TensorRT codegen not available",
-)
-has_tensorrt_runtime = pytest.mark.skipif(
-    not tensorrt.is_tensorrt_runtime_enabled(),
-    reason="TensorRT runtime not available",
-)
-
-# conv2d+relu network
-def get_conv2d_relu(
-    data_shape,
-    out_channels,
-    kernel_size,
-    strides,
-    padding,
-    dilation,
-    groups,
-    data_layout,
-    kernel_layout,
-    dtype,
-):
-
-    data = relay.var("data", relay.TensorType(data_shape, dtype))
-    weight = relay.var("weight")
-
-    net = relay.nn.conv2d(
-        data=data,
-        weight=weight,  # conv kernel
-        strides=strides,
-        padding=padding,
-        dilation=dilation,
-        groups=groups,
-        channels=out_channels,
-        kernel_size=kernel_size,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-    net = relay.add(net, net)
-    net = relay.nn.relu(net)
-
-    inputs = relay.analysis.free_vars(net)
-    return relay.Function(inputs, net)
-
-
-def verify_meta_schedule_with_tensorrt(
-    mod,
-    params,
-    data_shape,
-    use_trt: bool = True,
-):
-    # Build
-    builder = LocalBuilder(
-        f_build=build_relay_with_tensorrt if use_trt else build_relay,
-        timeout_sec=1000,
-    )
-    builder_input = BuilderInput(mod, Target("cuda"), params)
-    builder_result = builder.build([builder_input])[0]
-    assert builder_result.error_msg is None, builder_result.error_msg
-    assert builder_result.artifact_path is not None
-
-    # Run
-    runner_input = RunnerInput(
-        builder_result.artifact_path,
-        device_type="cuda",
-        args_info=[TensorInfo("float32", data_shape)],
-    )
-    runner = LocalRunner(
-        evaluator_config=EvaluatorConfig(
-            number=5,
-            repeat=2,
-            min_repeat_ms=0,
-            enable_cpu_cache_flush=False,
-        ),
-        f_run_evaluator=run_with_graph_executor,
-    )
-
-    # Run the module
-    runner_future = runner.run([runner_input])[0]
-    runner_result = runner_future.result()
-    assert runner_result is not None
-    assert runner_result.error_msg is None, runner_result.error_msg
-    assert runner_result.run_secs is not None
-
-    for result in runner_result.run_secs:
-        if isinstance(result, FloatImm):
-            result = result.value
-        assert isinstance(result, float)
-        assert result >= 0.0
-
-
-@has_tensorrt_codegen
-def test_conv2d_relu():
-    data_shape = (1, 1280, 14, 14)
-    out_channels = 256
-    kernel_size, strides, padding, dilation, groups = (1, 1), (1, 1), (0, 0, 0, 0), (1, 1), 1
-    data_layout, kernel_layout = "NCHW", "OIHW"
-    dtype = "float32"
-
-    f = get_conv2d_relu(
-        data_shape,
-        out_channels,
-        kernel_size,
-        strides,
-        padding,
-        dilation,
-        groups,
-        data_layout,
-        kernel_layout,
-        dtype,
-    )
-
-    mod, params = testing.create_workload(f)
-    verify_meta_schedule_with_tensorrt(mod, params, data_shape)
-
-
-@has_tensorrt_codegen
-@pytest.mark.parametrize("model_name", ["resnet_50"])
-@pytest.mark.parametrize("input_shape", [[1, 3, 224, 224]])
-@pytest.mark.parametrize("use_trt", [True, False])
-def test_relay_model(model_name: str, input_shape: List[int], use_trt: bool):
-    mod, params, _ = get_network(model_name, input_shape)
-    verify_meta_schedule_with_tensorrt(
-        mod,
-        params,
-        input_shape,
-        use_trt,
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/meta_schedule/test_meta_schedule_cpu_dot_product.py b/tests/python/meta_schedule/test_meta_schedule_cpu_dot_product.py
deleted file mode 100644
index cc2731ff5974..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_cpu_dot_product.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import logging
-import tempfile
-from typing import Optional
-
-import numpy as np  # type: ignore
-import tvm
-import tvm.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm._ffi import register_func
-from tvm.tir.schedule import BlockRV, Schedule
-from tvm.tir.schedule.analysis import has_block
-from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
-from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN
-
-logging.basicConfig(
-    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-
-
-def _schedule_dense(m: Optional[int], do_tune: bool, intrin=VNNI_INTRIN):
-    """Manually schedule a dense block, created from TE compute op via CreatePrimFunc,
-    using VNNI or AVX512 instructions.
-    """
-
-    def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool:
-        if "dense" not in sch.mod.attrs["task_name"]:
-            return False
-        if dense_block is None:
-            assert has_block(sch, "compute")
-            dense_block = sch.get_block("compute")
-            assert "dense_int8" in sch.get(dense_block).annotations["schedule_rule"]
-
-        post_blocks = sch.get_consumers(dense_block)
-        if len(post_blocks) > 0:
-            # Fuse all intermediate post ops into the last op.
-            # This is equivalent to the traverse_inline function used in TE schedules.
-            while True:
-                next_post_blocks = []
-                for post_block in post_blocks:
-                    next_consumers = sch.get_consumers(post_block)
-                    if len(next_consumers) > 0:
-                        sch.compute_inline(post_block)
-                    next_post_blocks += next_consumers
-                if len(next_post_blocks) == 0:
-                    assert len(post_blocks) == 1
-                    outer_block = post_blocks[0]
-                    a_y, a_x = sch.get_loops(outer_block)[-2:]
-                    break
-                post_blocks = next_post_blocks
-        else:
-            a_y, a_x, _ = sch.get_loops(dense_block)[-3:]
-            outer_block = dense_block
-        if do_tune:
-            y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
-            a_yo, a_yi = sch.split(a_y, factors=y_factors)
-        else:
-            a_yo, a_yi = sch.split(a_y, factors=[None, min(m, 64)])
-        a_xo, a_xi = sch.split(a_x, factors=[None, 16])
-        sch.reorder(a_yo, a_xo, a_yi, a_xi)
-        fused = sch.fuse(a_yo, a_xo)
-        if outer_block != dense_block:
-            # Handle the case when dense is fused with post ops.
-            sch.vectorize(a_xi)
-            sch.compute_at(dense_block, a_yi)
-        a_xi, a_k = sch.get_loops(dense_block)[-2:]
-        a_ko, a_ki = sch.split(a_k, factors=[None, 4])
-        sch.reorder(a_ko, a_xi, a_ki)
-        # We need to parallelize before decompose_reduction, otherwise the so-called "Compact dataflow"
-        # condition is violated.
-        sch.parallel(fused)
-        dec = sch.decompose_reduction(dense_block, a_ko)
-        init_loop = sch.get_loops(dec)[-1]
-        sch.vectorize(init_loop)
-        sch.tensorize(a_xi, intrin)
-        return True
-
-    return schedule_fn
-
-
-def _relay_dense(m, n, k):
-    data = relay.var("data", shape=(m, k), dtype="uint8")
-    weight = relay.var("weight", shape=(n, k), dtype="int8")
-    bias = relay.var("bias", shape=(n,), dtype="int32")
-    # dense is tuned by the TIR schedule above, bmm is scheduled by TE (topi/x86/batch_matmul.py)
-    dense = relay.nn.dense(data, weight, out_dtype="int32")
-    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
-    out = relay.nn.batch_matmul(
-        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
-        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
-        out_dtype="int32",
-    )
-    relay_mod = tvm.IRModule.from_expr(out)
-    data = np.random.randint(0, 5, size=(m, k), dtype="uint8")
-    params = {
-        "weight": np.random.randint(0, 5, size=(n, k), dtype="int8"),
-        "bias": np.random.randint(0, 5, size=(n,), dtype="int32"),
-    }
-
-    def f_check(lib, dev):
-        ref = (
-            relay.create_executor(
-                "vm",
-                mod=relay_mod,
-                device=dev,
-                target="llvm",
-            )
-            .evaluate()(data, params["weight"], params["bias"])
-            .numpy()
-        )
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-        runtime.set_input("data", data)
-        runtime.run()
-        out = runtime.get_output(0).numpy()
-        np.testing.assert_equal(out, ref)
-
-    return relay_mod, params, f_check
-
-
-def schedule_16x4_dense_fn_database(target, intrin, m=1024, n=1024, k=1024):
-    dev = tvm.cpu(0)
-    relay_mod, params, f_check = _relay_dense(m, n, k)
-
-    with ms.database.ScheduleFnDatabase(
-        _schedule_dense(
-            m=m,
-            do_tune=False,
-            intrin=intrin,
-        )
-    ), tvm.transform.PassContext(
-        opt_level=3,
-        config={"relay.backend.use_meta_schedule": True},
-    ):
-        # pylint: disable=W0105
-        """The log should say
-        Warning: Cannot find workload: tvmgen_default_fused_expand_dims
-        Warning: Cannot find workload: tvmgen_default_fused_cast
-        Warning: Cannot find workload: tvmgen_default_fused_cast_1
-        Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
-
-        This means batch matmul and others are scheduled by TE, and dense (the one not warned)
-        is found in the meta schedule tuning database during compilation
-        """
-        # pylint: enable=W0105
-        lib = relay.build(relay_mod, target=target, params=params)
-    f_check(lib, dev)
-
-
-@tvm.testing.requires_x86_vnni
-def test_vnni_schedule_fn_database():
-    target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4")
-    schedule_16x4_dense_fn_database(target, VNNI_INTRIN)
-
-
-@tvm.testing.requires_x86_avx512
-def test_avx512_schedule_fn_database():
-    target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4")
-    schedule_16x4_dense_fn_database(target, AVX512_INTRIN, 16, 16, 16)
-
-
-def schedule_16x4_dense_fn_tune(target, intrin, m=1024, n=1024, k=1024):
-    # pylint: disable=W0105
-    """
-    We can inject and apply a custom TIR scheduling to a TE compute of interest, using
-    the "schedule_rule" annotation. For example, in topi/x86/dense.py we have the following
-    declaration for int8 dense targeting the VNNI or AVX512 instructions.
-
-    C = te.compute(
-        ...
-        attrs={"schedule_rule": "meta_schedule.x86.dense_int8"},
-    )
-
-    When the MetaSchedule encounters a TensorIR block with the "schedule_rule" annotation,
-    it looks up the packed func registry for a function that is associated with the given schedule
-    rule key ("meta_schedule.x86.dense_int8" in this example). The signature of such custom
-    schedule functions must be
-
-       (tir.schedule.Schedule, tir.schedule.BlockRV) -> [tir.schedule.Schedule].
-
-    The BlockRV argument corresponds to the TE compute annotated with "schedule_rule".
-
-    The relevant code is in `src/meta_schedule/space_generator/apply_custom_rule.cc`.
-    """
-
-    def schedule_rule_dense_16x4(sch: Schedule, dense_block: BlockRV):
-        _schedule_dense(m=None, do_tune=True, intrin=intrin)(sch, dense_block)
-        return [sch]
-
-    register_func("meta_schedule.x86.dense_int8", schedule_rule_dense_16x4, override=True)
-
-    dev = tvm.cpu(0)
-    relay_mod, params, f_check = _relay_dense(m, n, k)
-
-    extracted_tasks = ms.relay_integration.extract_tasks(relay_mod, target, params)
-    with tempfile.TemporaryDirectory() as work_dir:
-        # postprocs=lambda: [] is important to prevent default post processors from
-        # tampering with the manual schedule.
-        tasks, weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
-            list(
-                filter(
-                    lambda task: "dense" in task.task_name,
-                    extracted_tasks,
-                )
-            ),
-            work_dir=work_dir,
-            space=ms.space_generator.PostOrderApply(
-                f_block_filter=None,
-                sch_rules="from-target",
-                postprocs=[],
-                mutator_probs="from-target",
-            ),
-        )
-        database = ms.relay_integration.tune_tasks(
-            tasks=tasks,
-            task_weights=weights,
-            work_dir=work_dir,
-            max_trials_per_task=32,
-            max_trials_global=20000,
-        )
-    with database, tvm.transform.PassContext(
-        opt_level=3,
-        config={"relay.backend.use_meta_schedule": True},
-    ):
-        # pylint: disable=W0105
-        """The log should say
-        Warning: Cannot find workload: tvmgen_default_fused_expand_dims
-        Warning: Cannot find workload: tvmgen_default_fused_cast
-        Warning: Cannot find workload: tvmgen_default_fused_cast_1
-        Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
-
-        This means batch matmul and others are scheduled by TE, and dense (the one not warned)
-        is found in the meta schedule tuning database during compilation
-        """
-        # pylint: enable=W0105
-        lib = relay.build(relay_mod, target=target, params=params)
-    f_check(lib, dev)
-
-
-@tvm.testing.requires_x86_vnni
-def test_vnni_schedule_fn_tune():
-    target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4")
-    schedule_16x4_dense_fn_tune(target, VNNI_INTRIN)
-
-
-@tvm.testing.requires_x86_avx512
-def test_avx512_schedule_fn_tune():
-    target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4")
-    schedule_16x4_dense_fn_tune(target, AVX512_INTRIN, 16, 16, 16)
-
-
-if __name__ == """__main__""":
-    test_vnni_schedule_fn_database()
-    test_avx512_schedule_fn_database()
-    test_vnni_schedule_fn_tune()
-    test_avx512_schedule_fn_tune()
diff --git a/tests/python/meta_schedule/test_meta_schedule_database.py b/tests/python/meta_schedule/test_meta_schedule_database.py
index f87c8753f8f7..0f33a09a8428 100644
--- a/tests/python/meta_schedule/test_meta_schedule_database.py
+++ b/tests/python/meta_schedule/test_meta_schedule_database.py
@@ -24,7 +24,7 @@
 import tvm
 import tvm.testing
 from tvm import meta_schedule as ms
-from tvm import relay, tir
+from tvm import tir
 from tvm.ir.module import IRModule
 from tvm.meta_schedule.database import TuningRecord, Workload
 from tvm.script import tir as T
@@ -452,7 +452,6 @@ def commit_record(db, run_sec):  # pylint: disable=invalid-name
 
 
 def test_meta_schedule_pydatabase_default_query():
-
     mod: IRModule = Matmul
     target = tvm.target.Target("llvm")
     arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"])
@@ -492,7 +491,6 @@ def commit_record(trace, db, run_sec):  # pylint: disable=invalid-name
 
 
 def test_meta_schedule_pydatabase_override_query():
-
     mod: IRModule = Matmul
     target = tvm.target.Target("llvm")
     arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"])
@@ -584,18 +582,11 @@ def test_json_database_get_top_k(k, expected):
     assert result == expected
 
 
-def MatmulFunc() -> IRModule:
-    a = relay.var("a", relay.TensorType((1024, 1024), "float32"))
-    b = relay.var("b", relay.TensorType((1024, 1024), "float32"))
-    func = relay.Function([a, b], relay.nn.matmul(a, b))
-    return tvm.IRModule.from_expr(func)
-
-
 def MatmulPrimFunc() -> IRModule:
     return Matmul
 
 
-@pytest.mark.parametrize("f_mod", [MatmulPrimFunc, MatmulFunc])
+@pytest.mark.parametrize("f_mod", [MatmulPrimFunc])
 @pytest.mark.parametrize("mod_eq", ["structural", "ignore-ndarray", "anchor-block"])
 def test_json_database_commit_workload(f_mod, mod_eq):
     mod: IRModule = f_mod()
@@ -604,7 +595,7 @@ def test_json_database_commit_workload(f_mod, mod_eq):
         database.commit_workload(mod)
 
 
-@pytest.mark.parametrize("f_mod", [MatmulPrimFunc, MatmulFunc])
+@pytest.mark.parametrize("f_mod", [MatmulPrimFunc])
 @pytest.mark.parametrize("mod_eq", ["structural", "ignore-ndarray", "anchor-block"])
 def test_memory_database_commit_workload(f_mod, mod_eq):
     mod: IRModule = f_mod()
diff --git a/tests/python/meta_schedule/test_meta_schedule_mma_m16n8k8_auto_tensorization.py b/tests/python/meta_schedule/test_meta_schedule_mma_m16n8k8_auto_tensorization.py
deleted file mode 100644
index ea8fee672461..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_mma_m16n8k8_auto_tensorization.py
+++ /dev/null
@@ -1,1239 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for MMA m16n8k8 Auto Tensorization"""
-
-import tempfile
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import meta_schedule as ms
-from tvm._ffi import register_func
-from tvm.meta_schedule.testing.space_generation import (
-    check_sketches,
-    generate_design_space,
-)
-from tvm.meta_schedule.builder import LocalBuilder
-from tvm.script import ir as I
-from tvm.script import tir as T
-from tvm.target import Target
-from tvm.tir import Schedule
-from tvm.tir.schedule import Trace
-
-# get tensor intrin
-from tvm.tir.tensor_intrin import cuda  # pylint: disable=unused-import
-
-import tvm.testing
-
-
-@I.ir_module
-class MmaModule:
-    @T.prim_func
-    def main(
-        X: T.Buffer((4096, 4096), "float16"),
-        Y: T.Buffer((4096, 4096), "float16"),
-        C: T.Buffer((4096, 4096), "float16"),
-    ):
-        T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)})
-        # with T.block("root"):
-        C_reindex_m16n8k8_matrixC = T.alloc_buffer((4096, 4096), "float16", scope="m16n8k8.matrixC")
-        X_reindex_shared_dyn = T.alloc_buffer((4096, 4096), "float16", scope="shared.dyn")
-        Y_reindex_shared_dyn = T.alloc_buffer((4096, 4096), "float16", scope="shared.dyn")
-        X_reindex_shared_dyn_m16n8k8_matrixA = T.alloc_buffer(
-            (4096, 4096), "float16", scope="m16n8k8.matrixA"
-        )
-        Y_reindex_shared_dyn_m16n8k8_matrixB = T.alloc_buffer(
-            (4096, 4096), "float16", scope="m16n8k8.matrixB"
-        )
-        for ax0_0_0_ax1_0_0_fused in T.thread_binding(4, thread="blockIdx.x"):
-            for ax0_0_1_ax1_0_1_fused in T.thread_binding(256, thread="blockIdx.y"):
-                for ax0_0_2_ax1_0_2_fused in T.thread_binding(4, thread="threadIdx.y"):
-                    for ax2_0_0 in T.serial(
-                        128,
-                        annotations={
-                            "software_pipeline_async_stages": [0],
-                            "software_pipeline_order": [0, 1, 3, 2, 4],
-                            "software_pipeline_stage": [0, 0, 1, 2, 2],
-                        },
-                    ):
-                        with T.block("X_reindex_shared.dyn"):
-                            v0, v1 = T.axis.remap("SS", [ax0_0_1_ax1_0_1_fused, ax2_0_0])
-                            T.reads(X[v0 // 8 * 128 : v0 // 8 * 128 + 128, v1 * 32 : v1 * 32 + 32])
-                            T.writes(
-                                X_reindex_shared_dyn[
-                                    v0 // 8 * 128 : v0 // 8 * 128 + 128, v1 * 32 : v1 * 32 + 32
-                                ]
-                            )
-                            T.block_attr(
-                                {
-                                    "auto_copy": 1,
-                                    "buffer_dim_align": [[0, 0, 32, 8]],
-                                    "permuted_layout": "g2s_A",
-                                    "vector_bytes": 16,
-                                }
-                            )
-                            for ax0, ax1 in T.grid(128, 32):
-                                X_reindex_shared_dyn[v0 // 8 * 128 + ax0, v1 * 32 + ax1] = X[
-                                    v0 // 8 * 128 + ax0, v1 * 32 + ax1
-                                ]
-                        with T.block("Y_reindex_shared.dyn"):
-                            v0, v1, v2 = T.axis.remap(
-                                "SSS", [ax2_0_0, ax0_0_0_ax1_0_0_fused, ax0_0_1_ax1_0_1_fused]
-                            )
-                            T.reads(
-                                Y[
-                                    v0 * 32 : v0 * 32 + 32,
-                                    v1 * 1024 + v2 % 8 * 128 : v1 * 1024 + v2 % 8 * 128 + 128,
-                                ]
-                            )
-                            T.writes(
-                                Y_reindex_shared_dyn[
-                                    v0 * 32 : v0 * 32 + 32,
-                                    v1 * 1024 + v2 % 8 * 128 : v1 * 1024 + v2 % 8 * 128 + 128,
-                                ]
-                            )
-                            T.block_attr(
-                                {
-                                    "auto_copy": 1,
-                                    "buffer_dim_align": [[0, 0, 32, 8]],
-                                    "permuted_layout": "g2s_B",
-                                    "vector_bytes": 16,
-                                }
-                            )
-                            for ax0, ax1 in T.grid(32, 128):
-                                Y_reindex_shared_dyn[
-                                    v0 * 32 + ax0, v1 * 1024 + v2 % 8 * 128 + ax1
-                                ] = Y[v0 * 32 + ax0, v1 * 1024 + v2 % 8 * 128 + ax1]
-                        for ax2_0_1 in T.serial(
-                            4,
-                            annotations={
-                                "software_pipeline_order": [0, 1, 2],
-                                "software_pipeline_stage": [0, 0, 1],
-                            },
-                        ):
-                            for ax0_0, ax1_0 in T.grid(2, 1):
-                                with T.block("X_reindex_shared.dyn_m16n8k8.matrixA_o"):
-                                    v0_o = T.axis.spatial(
-                                        128,
-                                        ax0_0_1_ax1_0_1_fused // 8 * 4
-                                        + ax0_0_2_ax1_0_2_fused // 2 * 2
-                                        + ax0_0,
-                                    )
-                                    v1_o = T.axis.spatial(512, ax2_0_0 * 4 + ax2_0_1 + ax1_0)
-                                    T.reads(
-                                        X_reindex_shared_dyn[
-                                            v0_o * 32 : v0_o * 32 + 32, v1_o * 8 : v1_o * 8 + 8
-                                        ]
-                                    )
-                                    T.writes(
-                                        X_reindex_shared_dyn_m16n8k8_matrixA[
-                                            v0_o * 32 : v0_o * 32 + 32, v1_o * 8 : v1_o * 8 + 8
-                                        ]
-                                    )
-                                    T.block_attr(
-                                        {
-                                            "meta_schedule.auto_tensorize": "mma_load_m16n8k8_f16_A_shared_dyn",
-                                            "permuted_layout": "s2l_A",
-                                        }
-                                    )
-                                    for ax0_1, ax1_1 in T.grid(32, 8):
-                                        with T.block("X_reindex_shared.dyn_m16n8k8.matrixA"):
-                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
-                                            T.reads(
-                                                X_reindex_shared_dyn[
-                                                    v0_o * 32 + v0_i, v1_o * 8 + v1_i
-                                                ]
-                                            )
-                                            T.writes(
-                                                X_reindex_shared_dyn_m16n8k8_matrixA[
-                                                    v0_o * 32 + v0_i, v1_o * 8 + v1_i
-                                                ]
-                                            )
-                                            X_reindex_shared_dyn_m16n8k8_matrixA[
-                                                v0_o * 32 + v0_i, v1_o * 8 + v1_i
-                                            ] = X_reindex_shared_dyn[
-                                                v0_o * 32 + v0_i, v1_o * 8 + v1_i
-                                            ]
-                            for ax0_0, ax1_0 in T.grid(1, 2):
-                                with T.block("Y_reindex_shared.dyn_m16n8k8.matrixB_o"):
-                                    v0_o = T.axis.spatial(512, ax2_0_0 * 4 + ax2_0_1 + ax0_0)
-                                    v1_o = T.axis.spatial(
-                                        128,
-                                        ax0_0_0_ax1_0_0_fused * 32
-                                        + ax0_0_1_ax1_0_1_fused % 8 * 4
-                                        + ax0_0_2_ax1_0_2_fused % 2 * 2
-                                        + ax1_0,
-                                    )
-                                    T.reads(
-                                        Y_reindex_shared_dyn[
-                                            v0_o * 8 : v0_o * 8 + 8, v1_o * 32 : v1_o * 32 + 32
-                                        ]
-                                    )
-                                    T.writes(
-                                        Y_reindex_shared_dyn_m16n8k8_matrixB[
-                                            v0_o * 8 : v0_o * 8 + 8, v1_o * 32 : v1_o * 32 + 32
-                                        ]
-                                    )
-                                    T.block_attr(
-                                        {
-                                            "meta_schedule.auto_tensorize": "mma_load_m16n8k8_f16_B_shared_dyn",
-                                            "permuted_layout": "s2l_B",
-                                        }
-                                    )
-                                    for ax0_1, ax1_1 in T.grid(8, 32):
-                                        with T.block("Y_reindex_shared.dyn_m16n8k8.matrixB"):
-                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
-                                            T.reads(
-                                                Y_reindex_shared_dyn[
-                                                    v0_o * 8 + v0_i, v1_o * 32 + v1_i
-                                                ]
-                                            )
-                                            T.writes(
-                                                Y_reindex_shared_dyn_m16n8k8_matrixB[
-                                                    v0_o * 8 + v0_i, v1_o * 32 + v1_i
-                                                ]
-                                            )
-                                            Y_reindex_shared_dyn_m16n8k8_matrixB[
-                                                v0_o * 8 + v0_i, v1_o * 32 + v1_i
-                                            ] = Y_reindex_shared_dyn[
-                                                v0_o * 8 + v0_i, v1_o * 32 + v1_i
-                                            ]
-                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(
-                                1, 1, 1, 4, 8
-                            ):
-                                with T.block("C_o"):
-                                    v0_o = T.axis.spatial(
-                                        256,
-                                        ax0_0_1_ax1_0_1_fused // 8 * 8
-                                        + ax0_0_2_ax1_0_2_fused // 2 * 4
-                                        + ax0_0_3 * 4
-                                        + ax0_0_4,
-                                    )
-                                    v1_o = T.axis.spatial(
-                                        512,
-                                        ax0_0_0_ax1_0_0_fused * 128
-                                        + ax0_0_1_ax1_0_1_fused % 8 * 16
-                                        + ax0_0_2_ax1_0_2_fused % 2 * 8
-                                        + ax1_0_3 * 8
-                                        + ax1_0_4,
-                                    )
-                                    v2_o = T.axis.reduce(512, ax2_0_0 * 4 + ax2_0_1 + ax2_0_2)
-                                    T.reads(
-                                        X_reindex_shared_dyn_m16n8k8_matrixA[
-                                            v0_o * 16 : v0_o * 16 + 16, v2_o * 8 : v2_o * 8 + 8
-                                        ],
-                                        Y_reindex_shared_dyn_m16n8k8_matrixB[
-                                            v2_o * 8 : v2_o * 8 + 8, v1_o * 8 : v1_o * 8 + 8
-                                        ],
-                                    )
-                                    T.writes(
-                                        C_reindex_m16n8k8_matrixC[
-                                            v0_o * 16 : v0_o * 16 + 16, v1_o * 8 : v1_o * 8 + 8
-                                        ]
-                                    )
-                                    T.block_attr(
-                                        {
-                                            "meta_schedule.auto_tensorize": "mma_sync_m16n8k8_f16f16f16",
-                                            "meta_schedule.auto_tensorize_init": "mma_init_m16n8k8_f16",
-                                            "meta_schedule.thread_extent_high_inclusive": 1024,
-                                            "meta_schedule.thread_extent_low_inclusive": 32,
-                                            "warp_execution": 1,
-                                        }
-                                    )
-                                    with T.init():
-                                        for ax0_1, ax1_1 in T.grid(16, 8):
-                                            with T.block("C_init"):
-                                                v0_i_init, v1_i_init = T.axis.remap(
-                                                    "SS", [ax0_1, ax1_1]
-                                                )
-                                                T.reads()
-                                                T.writes(
-                                                    C_reindex_m16n8k8_matrixC[
-                                                        v0_o * 16 + v0_i_init, v1_o * 8 + v1_i_init
-                                                    ]
-                                                )
-                                                C_reindex_m16n8k8_matrixC[
-                                                    v0_o * 16 + v0_i_init, v1_o * 8 + v1_i_init
-                                                ] = T.float16(0)
-                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 8, 8):
-                                        with T.block("C"):
-                                            v0_i, v1_i, v2_i = T.axis.remap(
-                                                "SSR", [ax0_1, ax1_1, ax2_1]
-                                            )
-                                            T.reads(
-                                                C_reindex_m16n8k8_matrixC[
-                                                    v0_o * 16 + v0_i, v1_o * 8 + v1_i
-                                                ],
-                                                X_reindex_shared_dyn_m16n8k8_matrixA[
-                                                    v0_o * 16 + v0_i, v2_o * 8 + v2_i
-                                                ],
-                                                Y_reindex_shared_dyn_m16n8k8_matrixB[
-                                                    v2_o * 8 + v2_i, v1_o * 8 + v1_i
-                                                ],
-                                            )
-                                            T.writes(
-                                                C_reindex_m16n8k8_matrixC[
-                                                    v0_o * 16 + v0_i, v1_o * 8 + v1_i
-                                                ]
-                                            )
-                                            T.block_attr(
-                                                {"meta_schedule.tiling_structure": "SSSRRSRS"}
-                                            )
-                                            C_reindex_m16n8k8_matrixC[
-                                                v0_o * 16 + v0_i, v1_o * 8 + v1_i
-                                            ] = (
-                                                C_reindex_m16n8k8_matrixC[
-                                                    v0_o * 16 + v0_i, v1_o * 8 + v1_i
-                                                ]
-                                                + X_reindex_shared_dyn_m16n8k8_matrixA[
-                                                    v0_o * 16 + v0_i, v2_o * 8 + v2_i
-                                                ]
-                                                * Y_reindex_shared_dyn_m16n8k8_matrixB[
-                                                    v2_o * 8 + v2_i, v1_o * 8 + v1_i
-                                                ]
-                                            )
-                    with T.block("C_reindex_m16n8k8.matrixC"):
-                        v0, v1, v2 = T.axis.remap(
-                            "SSS",
-                            [ax0_0_1_ax1_0_1_fused, ax0_0_2_ax1_0_2_fused, ax0_0_0_ax1_0_0_fused],
-                        )
-                        T.reads(
-                            C_reindex_m16n8k8_matrixC[
-                                v0 // 8 * 128 + v1 // 2 * 64 : v0 // 8 * 128 + v1 // 2 * 64 + 64,
-                                v2 * 1024
-                                + v0 % 8 * 128
-                                + v1 % 2 * 64 : v2 * 1024
-                                + v0 % 8 * 128
-                                + v1 % 2 * 64
-                                + 64,
-                            ]
-                        )
-                        T.writes(
-                            C[
-                                v0 // 8 * 128 + v1 // 2 * 64 : v0 // 8 * 128 + v1 // 2 * 64 + 64,
-                                v2 * 1024
-                                + v0 % 8 * 128
-                                + v1 % 2 * 64 : v2 * 1024
-                                + v0 % 8 * 128
-                                + v1 % 2 * 64
-                                + 64,
-                            ]
-                        )
-                        T.block_attr({"auto_copy": 1})
-                        for ax0, ax1 in T.grid(64, 64):
-                            C[
-                                v0 // 8 * 128 + v1 // 2 * 64 + ax0,
-                                v2 * 1024 + v0 % 8 * 128 + v1 % 2 * 64 + ax1,
-                            ] = C_reindex_m16n8k8_matrixC[
-                                v0 // 8 * 128 + v1 // 2 * 64 + ax0,
-                                v2 * 1024 + v0 % 8 * 128 + v1 % 2 * 64 + ax1,
-                            ]
-
-
-def matmul_fp16(N: int, M: int, K: int, out_dtype: str):
-    x = te.placeholder((N, K), name="X", dtype="float16")
-    y = te.placeholder((K, M), name="Y", dtype="float16")
-    k = te.reduce_axis((0, K), name="k")
-    c = te.compute(  # pylint: disable=invalid-name
-        (N, M),
-        lambda i, j: te.sum(x[i][k].astype(out_dtype) * y[k][j].astype(out_dtype), axis=[k]),
-        name="C",
-    )
-    return (x, y, c)
-
-
-def multi_level_tiling_mma(out_dtype):
-    simplify_dict = {"float32": "f32", "float16": "f16"}
-    out_dtype = simplify_dict[out_dtype]
-    return ms.schedule_rule.MultiLevelTilingTensorCore(
-        intrin_groups=[
-            {
-                "init": f"mma_init_m16n8k8_{out_dtype}",
-                "load_a": "mma_load_m16n8k8_f16_A_shared_dyn",
-                "load_b": "mma_load_m16n8k8_f16_B_shared_dyn",
-                "compute": f"mma_sync_m16n8k8_f16f16{out_dtype}",
-                "store": f"mma_store_m16n8k8_{out_dtype}_global",
-            },
-        ],
-        structure="SSSRRSRS",
-        tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
-        max_innermost_factor=4,  # 64 // tensor intrin size
-        vector_load_lens=[1, 2, 3, 4, 8, 16],
-        reuse_read=ms.schedule_rule.ReuseType(
-            req="must",
-            levels=[4],
-            scope="shared.dyn",
-        ),
-        reuse_write=ms.schedule_rule.ReuseType(
-            req="no",
-            levels=[2],
-            scope="shared.dyn",
-        ),
-        use_software_pipeline=True,
-    )
-
-
-def _design_space(mod, out_dtype):
-    return generate_design_space(
-        kind="cuda-tensorcore",
-        mod=mod,
-        target=Target("nvidia/geforce-rtx-3080"),
-        types=None,
-        sch_rules=[multi_level_tiling_mma(out_dtype)],
-    )
-
-
-gemm_decision = [
-    ("SamplePartitionedTile", [1, 32, 2, 1, 4]),
-    ("SamplePartitionedTile", [4, 8, 2, 1, 8]),
-    ("SamplePerfectTile", [128, 4, 1]),
-]
-
-
-def test_mma_auto_tensorization():
-    mod = te.create_prim_func(matmul_fp16(M=4096, N=4096, K=4096, out_dtype="float16"))
-    actual = _design_space(mod, "float16")
-    check_sketches(
-        mod,
-        sketches=actual,
-        expected_mods=[MmaModule],
-        expected_decisions=[gemm_decision],
-    )
-
-
-expected_cuda_script = r"""#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-#include <cuda_fp16.h>
-__device__ half max(half a, half b)
-{
-  return __hgt(__half(a), __half(b)) ? a : b;
-}
-__device__ half min(half a, half b)
-{
-  return __hlt(__half(a), __half(b)) ? a : b;
-}
-#else
-
-typedef unsigned short uint16_t;
-typedef unsigned char uint8_t;
-typedef signed char int8_t;
-typedef int int32_t;
-typedef unsigned long long uint64_t;
-typedef unsigned int uint32_t;
-
-#define TVM_FORCE_INLINE inline __attribute__((always_inline))
-#define TVM_XINLINE TVM_FORCE_INLINE __device__ __host__
-#define TVM_ALIGNED(x) __attribute__ ((aligned(x)))
-#define TVM_HALF_OPERATOR(RTYPE, OP)                              \
-  TVM_XINLINE RTYPE operator OP (half a, half b) {                \
-    return RTYPE(float(a) OP float(b));                           \
-  }                                                               \
-  template<typename T>                                            \
-  TVM_XINLINE RTYPE operator OP (half a, T b) {                   \
-    return RTYPE(float(a) OP float(b));                           \
-  }                                                               \
-  template<typename T>                                            \
-  TVM_XINLINE RTYPE operator OP (T a, half b) {                   \
-    return RTYPE(float(a) OP float(b));                           \
-  }
-
-#define TVM_HALF_ASSIGNOP(AOP, OP)                                \
-  template<typename T>                                            \
-  TVM_XINLINE half operator AOP (const T& a) {                    \
-    return *this = half(float(*this) OP float(a));                \
-  }                                                               \
-  template<typename T>                                            \
-  TVM_XINLINE half operator AOP (const volatile T& a) volatile {  \
-    return *this = half(float(*this) OP float(a));                \
-  }
-
-class TVM_ALIGNED(2) half {
- public:
-  uint16_t half_;
-
-  static TVM_XINLINE half Binary(uint16_t value) {
-    half res;
-    res.half_ = value;
-    return res;
-  }
-
-  TVM_XINLINE half() {}
-
-  TVM_XINLINE half(const float& value) { constructor(value); }
-  TVM_XINLINE explicit half(const double& value) { constructor(value); }
-  TVM_XINLINE explicit half(const int8_t& value) { constructor(value); }
-  TVM_XINLINE explicit half(const uint8_t& value) { constructor(value); }
-  TVM_XINLINE explicit half(const int32_t& value) { constructor(value); }
-  TVM_XINLINE explicit half(const uint32_t& value) { constructor(value); }
-  TVM_XINLINE explicit half(const long long& value) { constructor(value); }
-  TVM_XINLINE explicit half(const uint64_t& value) { constructor(value); }
-
-  TVM_XINLINE operator float() const {                          \
-    return float(half2float(half_));                            \
-  }                                                             \
-  TVM_XINLINE operator float() const volatile {                 \
-    return float(half2float(half_));                            \
-  }
-
-
-  TVM_HALF_ASSIGNOP(+=, +)
-  TVM_HALF_ASSIGNOP(-=, -)
-  TVM_HALF_ASSIGNOP(*=, *)
-  TVM_HALF_ASSIGNOP(/=, /)
-
-  TVM_XINLINE half operator+() {
-    return *this;
-  }
-
-  TVM_XINLINE half operator-() {
-    return half(-float(*this));
-  }
-
-  TVM_XINLINE half operator=(const half& a) {
-    half_ = a.half_;
-    return a;
-  }
-
-  template<typename T>
-  TVM_XINLINE half operator=(const T& a) {
-    return *this = half(a);
-  }
-
-  TVM_XINLINE half operator=(const half& a) volatile {
-    half_ = a.half_;
-    return a;
-  }
-
-  template<typename T>
-  TVM_XINLINE half operator=(const T& a) volatile {
-    return *this = half(a);
-  }
-
- private:
-  union Bits {
-    float f;
-    int32_t si;
-    uint32_t ui;
-  };
-
-  static int const fp16FractionBits = 10;
-  static int const fp32FractionBits = 23;
-  static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits);   // == 0x7fffff
-  static int32_t const fp32HiddenBit = 1 << fp32FractionBits;   // == 0x800000
-  static int const shift = fp32FractionBits - fp16FractionBits;   // == 13
-  static int const shiftSign = 16;
-  static int32_t const expAdjust = 127 - 15;   // exp32-127 = exp16-15, so exp16 = exp32 - (127-15)
-
-  static int32_t const infN = 0x7F800000;   // flt32 infinity
-  static int32_t const maxN = 0x477FFFFF;   // max flt32 that's a flt16 normal after >> by shift
-  static int32_t const minN = 0x38800000;   // min flt16 normal as a flt32
-  static int32_t const maxZ = 0x33000000;   // max fp32 number that's still rounded to zero in fp16
-  static int32_t const signN = 0x80000000;  // flt32 sign bit
-
-  static int32_t const infC = infN >> shift;
-  static int32_t const nanN = (infC + 1) << shift;   // minimum flt16 nan as a flt32
-  static int32_t const maxC = maxN >> shift;
-  static int32_t const minC = minN >> shift;
-  static int32_t const signC = signN >> shiftSign;  // flt16 sign bit
-
-  static int32_t const mulN = 0x52000000;  // (1 << 23) / minN
-  static int32_t const mulC = 0x33800000;  // minN / (1 << (23 - shift))
-
-  static int32_t const subC = 0x003FF;  // max flt32 subnormal down shifted
-  static int32_t const norC = 0x00400;  // min flt32 normal down shifted
-
-  static int32_t const maxD = infC - maxC - 1;
-  static int32_t const minD = minC - subC - 1;
-
-  TVM_XINLINE uint16_t float2half(const float& value) const {
-    Bits v;
-    v.f = value;
-    uint32_t sign = v.si & signN;    // grab sign bit
-    v.si ^= sign;                    // clear sign bit from v
-    sign >>= shiftSign;              // logical shift sign to fp16 position
-
-    if (v.si <= maxZ) {
-      // Handle eventual zeros here to ensure
-      // vshift will not exceed 32 below.
-      v.ui = 0;
-    } else if (v.si < minN) {
-      // Handle denorms
-      uint32_t exp32 = v.ui >> fp32FractionBits;
-      int32_t exp16 = exp32 - expAdjust;
-      // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1.
-      // Smaller (so negative) exp16 values should result in greater right shifts.
-      uint32_t vshift = 1 - exp16;
-      uint32_t significand = fp32HiddenBit | (v.ui & fp32FractionMask);
-      v.ui = significand >> vshift;
-      v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0;
-    } else if (v.si <= maxN) {
-      // Handle norms
-      v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0;
-      v.ui -= expAdjust << fp32FractionBits;
-    } else if (v.si <= infN) {
-      v.si = infN;
-    } else if (v.si < nanN) {
-      v.si = nanN;
-    }
-
-    v.ui >>= shift;
-    return sign | (v.ui & 0x7fff);
-  }
-
-  // Same as above routine, except for addition of volatile keyword
-  TVM_XINLINE uint16_t float2half(
-    const volatile float& value) const volatile {
-    Bits v;
-    v.f = value;
-    uint32_t sign = v.si & signN;    // grab sign bit
-    v.si ^= sign;                    // clear sign bit from v
-    sign >>= shiftSign;              // logical shift sign to fp16 position
-
-    if (v.si <= maxZ) {
-      // Handle eventual zeros here to ensure
-      // vshift will not exceed 32 below.
-      v.ui = 0;
-    } else if (v.si < minN) {
-      // Handle denorms
-      uint32_t exp32 = v.ui >> fp32FractionBits;
-      int32_t exp16 = exp32 - expAdjust;
-      // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1.
-      // Smaller (so negative) exp16 values should result in greater right shifts.
-      uint32_t vshift = 1 - exp16;
-      uint32_t significand = fp32HiddenBit | (v.ui & fp32FractionMask);
-      v.ui = significand >> vshift;
-      v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0;
-    } else if (v.si <= maxN) {
-      // Handle norms
-      v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0;
-      v.ui -= expAdjust << fp32FractionBits;
-    } else if (v.si <= infN) {
-      v.si = infN;
-    } else if (v.si < nanN) {
-      v.si = nanN;
-    }
-
-    v.ui >>= shift;
-    return sign | (v.ui & 0x7fff);
-  }
-
-  TVM_XINLINE float half2float(const uint16_t& value) const {
-    Bits v;
-    v.ui = value;
-    int32_t sign = v.si & signC;
-    v.si ^= sign;
-    sign <<= shiftSign;
-    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
-    Bits s;
-    s.si = mulC;
-    s.f *= v.si;
-    int32_t mask = -(norC > v.si);
-    v.si <<= shift;
-    v.si ^= (s.si ^ v.si) & mask;
-    v.si |= sign;
-    return v.f;
-  }
-
-  TVM_XINLINE float half2float(
-    const volatile uint16_t& value) const volatile {
-    Bits v;
-    v.ui = value;
-    int32_t sign = v.si & signC;
-    v.si ^= sign;
-    sign <<= shiftSign;
-    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
-    Bits s;
-    s.si = mulC;
-    s.f *= v.si;
-    int32_t mask = -(norC > v.si);
-    v.si <<= shift;
-    v.si ^= (s.si ^ v.si) & mask;
-    v.si |= sign;
-    return v.f;
-  }
-
-  template<typename T>
-  TVM_XINLINE void constructor(const T& value) {
-    half_ = float2half(float(value));
-  }
-};
-
-TVM_HALF_OPERATOR(half, +)
-TVM_HALF_OPERATOR(half, -)
-TVM_HALF_OPERATOR(half, *)
-TVM_HALF_OPERATOR(half, /)
-TVM_HALF_OPERATOR(bool, >)
-TVM_HALF_OPERATOR(bool, <)
-TVM_HALF_OPERATOR(bool, >=)
-TVM_HALF_OPERATOR(bool, <=)
-
-TVM_XINLINE half __float2half_rn(const float a) {
-  return half(a);
-}
-#endif
-
-
-// Pack two half values.
-static inline __device__ __host__ unsigned
-__pack_half2(const half x, const half y) {
-  unsigned v0 = *((unsigned short *)&x);
-  unsigned v1 = *((unsigned short *)&y);
-  return (v1 << 16) | v0;
-}
-
-#define CUDA_UNSUPPORTED_HALF_MATH_BINARY(HALF_MATH_NAME, FP32_MATH_NAME) \
-static inline __device__ __host__ half HALF_MATH_NAME(half x, half y) {   \
-  float tmp_x = __half2float(x);                                          \
-  float tmp_y = __half2float(y);                                          \
-  float result = FP32_MATH_NAME(tmp_x, tmp_y);                            \
-  return __float2half(result);                                            \
-}
-
-#define CUDA_UNSUPPORTED_HALF_MATH_UNARY(HALF_MATH_NAME, FP32_MATH_NAME) \
-static inline __device__ __host__ half HALF_MATH_NAME(half x) {          \
-  float tmp_x = __half2float(x);                                         \
-  float result = FP32_MATH_NAME(tmp_x);                                  \
-  return __float2half(result);                                           \
-}
-
-// Some fp16 math functions are not supported in cuda_fp16.h,
-// so we define them here to make sure the generated CUDA code
-// is valid.
-#if defined(__CUDA_ARCH__)
-#if (__CUDA_ARCH__ >= 530)
-CUDA_UNSUPPORTED_HALF_MATH_BINARY(hpow, powf)
-#if ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 8)))
-CUDA_UNSUPPORTED_HALF_MATH_UNARY(htanh, tanhf)
-#endif
-CUDA_UNSUPPORTED_HALF_MATH_UNARY(htan, tanf)
-CUDA_UNSUPPORTED_HALF_MATH_UNARY(hatan, atanf)
-CUDA_UNSUPPORTED_HALF_MATH_UNARY(herf, erf)
-#else
-CUDA_UNSUPPORTED_HALF_MATH_UNARY(hexp, exp)
-#endif
-#endif
-
-#undef CUDA_UNSUPPORTED_HALF_MATH_BINARY
-#undef CUDA_UNSUPPORTED_HALF_MATH_UNARY
-__forceinline__ __device__ unsigned int
-cast_smem_ptr_to_int(const void* const smem_ptr)
-{
-  unsigned int smem_int;
-  asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }"
-    : "=r"(smem_int) : "l"(smem_ptr));
-  return smem_int;
-}
-
-#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
-     (__CUDACC_VER_MAJOR__ > 11))
-#define TVM_ENABLE_L2_PREFETCH 1
-#else
-#define TVM_ENABLE_L2_PREFETCH 0
-#endif
-
-#ifdef _WIN32
-  using uint = unsigned int;
-  using uchar = unsigned char;
-  using ushort = unsigned short;
-  using int64_t = long long;
-  using uint64_t = unsigned long long;
-#else
-  #define uint unsigned int
-  #define uchar unsigned char
-  #define ushort unsigned short
-  #define int64_t long long
-  #define uint64_t unsigned long long
-#endif
-extern "C" __global__ void __launch_bounds__(128) main_kernel(half* __restrict__ C, half* __restrict__ X, half* __restrict__ Y);
-extern "C" __global__ void __launch_bounds__(128) main_kernel(half* __restrict__ C, half* __restrict__ X, half* __restrict__ Y) {
-  extern __shared__ uchar buf_dyn_shmem[];
-  uint1 C_reindex_m16n8k8_matrixC[64];
-  half X_reindex_shared_dyn_m16n8k8_matrixA[32];
-  half Y_reindex_shared_dyn_m16n8k8_matrixB[32];
-  for (int ax0_0_4_init = 0; ax0_0_4_init < 4; ++ax0_0_4_init) {
-    for (int ax1_0_4_init = 0; ax1_0_4_init < 8; ++ax1_0_4_init) {
-      for (int b = 0; b < 2; ++b) {
-        C_reindex_m16n8k8_matrixC[(((ax0_0_4_init * 16) + (ax1_0_4_init * 2)) + b)] = make_uint1(__pack_half2(__float2half_rn(0.000000e+00f), __float2half_rn(0.000000e+00f)));
-      }
-    }
-  }
-  for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 4; ++ax0_ax1_fused_0) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int(buf_dyn_shmem + ((((ax0_ax1_fused_0 * 2048) + (((int)threadIdx.y) * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 16)));
-    __asm__ __volatile__(
-      #if TVM_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;"
-      #else
-        "cp.async.cg.shared.global [%0], [%1], %2;"
-      #endif
-        :: "r"(addr), "l"((void*)(X + ((((((((int)blockIdx.y) >> 3) * 524288) + (ax0_ax1_fused_0 * 131072)) + (((int)threadIdx.y) * 32768)) + ((((int)threadIdx.x) >> 2) * 4096)) + ((((int)threadIdx.x) & 3) * 8)))), "n"(16)
-    );
-  }
-  }
-  for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int(buf_dyn_shmem + (((((ax0_ax1_fused_0_1 * 2048) + (((int)threadIdx.y) * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((((int)threadIdx.y) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 24576));
-    __asm__ __volatile__(
-      #if TVM_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;"
-      #else
-        "cp.async.cg.shared.global [%0], [%1], %2;"
-      #endif
-        :: "r"(addr), "l"((void*)(Y + ((((((ax0_ax1_fused_0_1 * 32768) + (((int)threadIdx.y) * 8192)) + ((((int)threadIdx.x) >> 4) * 4096)) + (((int)blockIdx.x) * 1024)) + ((((int)blockIdx.y) & 7) * 128)) + ((((int)threadIdx.x) & 15) * 8)))), "n"(16)
-    );
-  }
-  }
-__asm__ __volatile__("cp.async.commit_group;");
-
-  for (int ax0_ax1_fused_0_2 = 0; ax0_ax1_fused_0_2 < 4; ++ax0_ax1_fused_0_2) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int(buf_dyn_shmem + (((((ax0_ax1_fused_0_2 * 2048) + (((int)threadIdx.y) * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 16)) + 8192));
-    __asm__ __volatile__(
-      #if TVM_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;"
-      #else
-        "cp.async.cg.shared.global [%0], [%1], %2;"
-      #endif
-        :: "r"(addr), "l"((void*)(X + (((((((((int)blockIdx.y) >> 3) * 524288) + (ax0_ax1_fused_0_2 * 131072)) + (((int)threadIdx.y) * 32768)) + ((((int)threadIdx.x) >> 2) * 4096)) + ((((int)threadIdx.x) & 3) * 8)) + 32))), "n"(16)
-    );
-  }
-  }
-  for (int ax0_ax1_fused_0_3 = 0; ax0_ax1_fused_0_3 < 4; ++ax0_ax1_fused_0_3) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int(buf_dyn_shmem + (((((ax0_ax1_fused_0_3 * 2048) + (((int)threadIdx.y) * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((((int)threadIdx.y) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 32768));
-    __asm__ __volatile__(
-      #if TVM_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;"
-      #else
-        "cp.async.cg.shared.global [%0], [%1], %2;"
-      #endif
-        :: "r"(addr), "l"((void*)(Y + (((((((ax0_ax1_fused_0_3 * 32768) + (((int)threadIdx.y) * 8192)) + ((((int)threadIdx.x) >> 4) * 4096)) + (((int)blockIdx.x) * 1024)) + ((((int)blockIdx.y) & 7) * 128)) + ((((int)threadIdx.x) & 15) * 8)) + 131072))), "n"(16)
-    );
-  }
-  }
-__asm__ __volatile__("cp.async.commit_group;");
-
-__asm__ __volatile__("cp.async.wait_group 1;");
-
-  __syncthreads();
-  for (int ax0_0 = 0; ax0_0 < 2; ++ax0_0) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[(((((((int)threadIdx.y) >> 1) * 2048) + (ax0_0 * 1024)) + (((int)threadIdx.x) * 32)) + ((0 ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0 * 8)))[0]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0 * 8)))[1]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0 * 8)))[2]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0 * 8)))[3])
-      : "r"(addr)
-    );
-  }
-  }
-  for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((int)threadIdx.x) & 7) * 128) + ((((((((int)threadIdx.y) & 1) * 8) + (ax1_0 * 4)) + (((int)threadIdx.x) >> 3)) ^ (((int)threadIdx.x) & 7)) * 8)) + 12288)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0 * 8)))[3])
-      : "r"(addr)
-    );
-  }
-  }
-  for (int ax2_0_0 = 0; ax2_0_0 < 126; ++ax2_0_0) {
-    __syncthreads();
-    for (int ax0_ax1_fused_0_4 = 0; ax0_ax1_fused_0_4 < 4; ++ax0_ax1_fused_0_4) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int(buf_dyn_shmem + (((((((ax2_0_0 + 2) % 3) * 8192) + (ax0_ax1_fused_0_4 * 2048)) + (((int)threadIdx.y) * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 16)));
-    __asm__ __volatile__(
-      #if TVM_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;"
-      #else
-        "cp.async.cg.shared.global [%0], [%1], %2;"
-      #endif
-        :: "r"(addr), "l"((void*)(X + ((((((((((int)blockIdx.y) >> 3) * 524288) + (ax0_ax1_fused_0_4 * 131072)) + (((int)threadIdx.y) * 32768)) + ((((int)threadIdx.x) >> 2) * 4096)) + (ax2_0_0 * 32)) + ((((int)threadIdx.x) & 3) * 8)) + 64))), "n"(16)
-    );
-  }
-    }
-    for (int ax0_ax1_fused_0_5 = 0; ax0_ax1_fused_0_5 < 4; ++ax0_ax1_fused_0_5) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int(buf_dyn_shmem + ((((((((ax2_0_0 + 2) % 3) * 8192) + (ax0_ax1_fused_0_5 * 2048)) + (((int)threadIdx.y) * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((((int)threadIdx.y) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 24576));
-    __asm__ __volatile__(
-      #if TVM_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;"
-      #else
-        "cp.async.cg.shared.global [%0], [%1], %2;"
-      #endif
-        :: "r"(addr), "l"((void*)(Y + ((((((((ax2_0_0 * 131072) + (ax0_ax1_fused_0_5 * 32768)) + (((int)threadIdx.y) * 8192)) + ((((int)threadIdx.x) >> 4) * 4096)) + (((int)blockIdx.x) * 1024)) + ((((int)blockIdx.y) & 7) * 128)) + ((((int)threadIdx.x) & 15) * 8)) + 262144))), "n"(16)
-    );
-  }
-    }
-__asm__ __volatile__("cp.async.commit_group;");
-
-__asm__ __volatile__("cp.async.wait_group 1;");
-
-    __syncthreads();
-    for (int ax2_0_1 = 0; ax2_0_1 < 3; ++ax2_0_1) {
-      for (int ax0_0_1 = 0; ax0_0_1 < 2; ++ax0_0_1) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((ax2_0_0 % 3) * 4096) + ((((int)threadIdx.y) >> 1) * 2048)) + (ax0_0_1 * 1024)) + (((int)threadIdx.x) * 32)) + (((ax2_0_1 + 1) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1 + 1) & 1) * 16) + (ax0_0_1 * 8))))[0]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1 + 1) & 1) * 16) + (ax0_0_1 * 8))))[1]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1 + 1) & 1) * 16) + (ax0_0_1 * 8))))[2]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1 + 1) & 1) * 16) + (ax0_0_1 * 8))))[3])
-      : "r"(addr)
-    );
-  }
-      }
-      for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((ax2_0_0 % 3) * 4096) + (ax2_0_1 * 1024)) + ((((int)threadIdx.x) & 7) * 128)) + ((((((((int)threadIdx.y) & 1) * 8) + (ax1_0_1 * 4)) + (((int)threadIdx.x) >> 3)) ^ (((int)threadIdx.x) & 7)) * 8)) + 13312)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1 + 1) & 1) * 16) + (ax1_0_1 * 8))))[0]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1 + 1) & 1) * 16) + (ax1_0_1 * 8))))[1]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1 + 1) & 1) * 16) + (ax1_0_1 * 8))))[2]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1 + 1) & 1) * 16) + (ax1_0_1 * 8))))[3])
-      : "r"(addr)
-    );
-  }
-      }
-      for (int ax0_0_4 = 0; ax0_0_4 < 4; ++ax0_0_4) {
-        for (int ax1_0_4 = 0; ax1_0_4 < 8; ++ax1_0_4) {
-
-  {
-    __asm__ __volatile__(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16"
-      "{%0, %1}, {%2, %3}, {%4}, {%5, %6};\n"
-      :  "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4 * 16) + (ax1_0_4 * 2))))[0]), "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4 * 16) + (ax1_0_4 * 2))))[1])
-      : "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (((ax2_0_1 & 1) * 16) + (ax0_0_4 * 4))))[0]), "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (((ax2_0_1 & 1) * 16) + (ax0_0_4 * 4))))[1]), "r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (((ax2_0_1 & 1) * 16) + (ax1_0_4 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4 * 16) + (ax1_0_4 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4 * 16) + (ax1_0_4 * 2))))[1]));
-  }
-        }
-      }
-    }
-    for (int ax0_0_2 = 0; ax0_0_2 < 2; ++ax0_0_2) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[(((((((ax2_0_0 + 1) % 3) * 4096) + ((((int)threadIdx.y) >> 1) * 2048)) + (ax0_0_2 * 1024)) + (((int)threadIdx.x) * 32)) + ((0 ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_2 * 8)))[0]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_2 * 8)))[1]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_2 * 8)))[2]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_2 * 8)))[3])
-      : "r"(addr)
-    );
-  }
-    }
-    for (int ax1_0_2 = 0; ax1_0_2 < 2; ++ax1_0_2) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((ax2_0_0 + 1) % 3) * 4096) + ((((int)threadIdx.x) & 7) * 128)) + ((((((((int)threadIdx.y) & 1) * 8) + (ax1_0_2 * 4)) + (((int)threadIdx.x) >> 3)) ^ (((int)threadIdx.x) & 7)) * 8)) + 12288)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_2 * 8)))[0]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_2 * 8)))[1]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_2 * 8)))[2]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_2 * 8)))[3])
-      : "r"(addr)
-    );
-  }
-    }
-    for (int ax0_0_4_1 = 0; ax0_0_4_1 < 4; ++ax0_0_4_1) {
-      for (int ax1_0_4_1 = 0; ax1_0_4_1 < 8; ++ax1_0_4_1) {
-
-  {
-    __asm__ __volatile__(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16"
-      "{%0, %1}, {%2, %3}, {%4}, {%5, %6};\n"
-      :  "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_1 * 16) + (ax1_0_4_1 * 2))))[0]), "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_1 * 16) + (ax1_0_4_1 * 2))))[1])
-      : "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((ax0_0_4_1 * 4) + 16)))[0]), "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((ax0_0_4_1 * 4) + 16)))[1]), "r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((ax1_0_4_1 * 2) + 16)))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_1 * 16) + (ax1_0_4_1 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_1 * 16) + (ax1_0_4_1 * 2))))[1]));
-  }
-      }
-    }
-  }
-__asm__ __volatile__("cp.async.wait_group 0;");
-
-  __syncthreads();
-  for (int ax2_0_1_1 = 0; ax2_0_1_1 < 3; ++ax2_0_1_1) {
-    for (int ax0_0_3 = 0; ax0_0_3 < 2; ++ax0_0_3) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[(((((((int)threadIdx.y) >> 1) * 2048) + (ax0_0_3 * 1024)) + (((int)threadIdx.x) * 32)) + (((ax2_0_1_1 + 1) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax0_0_3 * 8))))[0]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax0_0_3 * 8))))[1]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax0_0_3 * 8))))[2]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax0_0_3 * 8))))[3])
-      : "r"(addr)
-    );
-  }
-    }
-    for (int ax1_0_3 = 0; ax1_0_3 < 2; ++ax1_0_3) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((ax2_0_1_1 * 1024) + ((((int)threadIdx.x) & 7) * 128)) + ((((((((int)threadIdx.y) & 1) * 8) + (ax1_0_3 * 4)) + (((int)threadIdx.x) >> 3)) ^ (((int)threadIdx.x) & 7)) * 8)) + 13312)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax1_0_3 * 8))))[0]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax1_0_3 * 8))))[1]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax1_0_3 * 8))))[2]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_1 + 1) & 1) * 16) + (ax1_0_3 * 8))))[3])
-      : "r"(addr)
-    );
-  }
-    }
-    for (int ax0_0_4_2 = 0; ax0_0_4_2 < 4; ++ax0_0_4_2) {
-      for (int ax1_0_4_2 = 0; ax1_0_4_2 < 8; ++ax1_0_4_2) {
-
-  {
-    __asm__ __volatile__(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16"
-      "{%0, %1}, {%2, %3}, {%4}, {%5, %6};\n"
-      :  "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_2 * 16) + (ax1_0_4_2 * 2))))[0]), "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_2 * 16) + (ax1_0_4_2 * 2))))[1])
-      : "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (((ax2_0_1_1 & 1) * 16) + (ax0_0_4_2 * 4))))[0]), "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (((ax2_0_1_1 & 1) * 16) + (ax0_0_4_2 * 4))))[1]), "r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (((ax2_0_1_1 & 1) * 16) + (ax1_0_4_2 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_2 * 16) + (ax1_0_4_2 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_2 * 16) + (ax1_0_4_2 * 2))))[1]));
-  }
-      }
-    }
-  }
-  for (int ax0_0_5 = 0; ax0_0_5 < 2; ++ax0_0_5) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((((int)threadIdx.y) >> 1) * 2048) + (ax0_0_5 * 1024)) + (((int)threadIdx.x) * 32)) + ((0 ^ ((((int)threadIdx.x) & 7) >> 1)) * 8)) + 4096)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_5 * 8)))[0]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_5 * 8)))[1]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_5 * 8)))[2]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (ax0_0_5 * 8)))[3])
-      : "r"(addr)
-    );
-  }
-  }
-  for (int ax1_0_5 = 0; ax1_0_5 < 2; ++ax1_0_5) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((int)threadIdx.x) & 7) * 128) + ((((((((int)threadIdx.y) & 1) * 8) + (ax1_0_5 * 4)) + (((int)threadIdx.x) >> 3)) ^ (((int)threadIdx.x) & 7)) * 8)) + 16384)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_5 * 8)))[0]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_5 * 8)))[1]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_5 * 8)))[2]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (ax1_0_5 * 8)))[3])
-      : "r"(addr)
-    );
-  }
-  }
-  for (int ax0_0_4_3 = 0; ax0_0_4_3 < 4; ++ax0_0_4_3) {
-    for (int ax1_0_4_3 = 0; ax1_0_4_3 < 8; ++ax1_0_4_3) {
-
-  {
-    __asm__ __volatile__(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16"
-      "{%0, %1}, {%2, %3}, {%4}, {%5, %6};\n"
-      :  "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_3 * 16) + (ax1_0_4_3 * 2))))[0]), "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_3 * 16) + (ax1_0_4_3 * 2))))[1])
-      : "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((ax0_0_4_3 * 4) + 16)))[0]), "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((ax0_0_4_3 * 4) + 16)))[1]), "r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((ax1_0_4_3 * 2) + 16)))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_3 * 16) + (ax1_0_4_3 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_3 * 16) + (ax1_0_4_3 * 2))))[1]));
-  }
-    }
-  }
-  for (int ax2_0_1_2 = 0; ax2_0_1_2 < 3; ++ax2_0_1_2) {
-    for (int ax0_0_6 = 0; ax0_0_6 < 2; ++ax0_0_6) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((((((int)threadIdx.y) >> 1) * 2048) + (ax0_0_6 * 1024)) + (((int)threadIdx.x) * 32)) + (((ax2_0_1_2 + 1) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8)) + 4096)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax0_0_6 * 8))))[0]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax0_0_6 * 8))))[1]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax0_0_6 * 8))))[2]), "=r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax0_0_6 * 8))))[3])
-      : "r"(addr)
-    );
-  }
-    }
-    for (int ax1_0_6 = 0; ax1_0_6 < 2; ++ax1_0_6) {
-
-  {
-    unsigned int addr = cast_smem_ptr_to_int((&(((half*)buf_dyn_shmem)[((((ax2_0_1_2 * 1024) + ((((int)threadIdx.x) & 7) * 128)) + ((((((((int)threadIdx.y) & 1) * 8) + (ax1_0_6 * 4)) + (((int)threadIdx.x) >> 3)) ^ (((int)threadIdx.x) & 7)) * 8)) + 17408)])) + 0);
-    __asm__ __volatile__(
-      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
-      "{%0, %1, %2, %3}, [%4];\n"
-      : "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax1_0_6 * 8))))[0]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax1_0_6 * 8))))[1]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax1_0_6 * 8))))[2]), "=r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((((ax2_0_1_2 + 1) & 1) * 16) + (ax1_0_6 * 8))))[3])
-      : "r"(addr)
-    );
-  }
-    }
-    for (int ax0_0_4_4 = 0; ax0_0_4_4 < 4; ++ax0_0_4_4) {
-      for (int ax1_0_4_4 = 0; ax1_0_4_4 < 8; ++ax1_0_4_4) {
-
-  {
-    __asm__ __volatile__(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16"
-      "{%0, %1}, {%2, %3}, {%4}, {%5, %6};\n"
-      :  "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_4 * 16) + (ax1_0_4_4 * 2))))[0]), "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_4 * 16) + (ax1_0_4_4 * 2))))[1])
-      : "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (((ax2_0_1_2 & 1) * 16) + (ax0_0_4_4 * 4))))[0]), "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + (((ax2_0_1_2 & 1) * 16) + (ax0_0_4_4 * 4))))[1]), "r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + (((ax2_0_1_2 & 1) * 16) + (ax1_0_4_4 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_4 * 16) + (ax1_0_4_4 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_4 * 16) + (ax1_0_4_4 * 2))))[1]));
-  }
-      }
-    }
-  }
-  for (int ax0_0_4_5 = 0; ax0_0_4_5 < 4; ++ax0_0_4_5) {
-    for (int ax1_0_4_5 = 0; ax1_0_4_5 < 8; ++ax1_0_4_5) {
-
-  {
-    __asm__ __volatile__(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16"
-      "{%0, %1}, {%2, %3}, {%4}, {%5, %6};\n"
-      :  "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_5 * 16) + (ax1_0_4_5 * 2))))[0]), "=r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_5 * 16) + (ax1_0_4_5 * 2))))[1])
-      : "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((ax0_0_4_5 * 4) + 16)))[0]), "r"(((unsigned *)(X_reindex_shared_dyn_m16n8k8_matrixA + ((ax0_0_4_5 * 4) + 16)))[1]), "r"(((unsigned *)(Y_reindex_shared_dyn_m16n8k8_matrixB + ((ax1_0_4_5 * 2) + 16)))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_5 * 16) + (ax1_0_4_5 * 2))))[0]), "r"(((unsigned *)(C_reindex_m16n8k8_matrixC + ((ax0_0_4_5 * 16) + (ax1_0_4_5 * 2))))[1]));
-  }
-    }
-  }
-  for (int ax0_0_7 = 0; ax0_0_7 < 8; ++ax0_0_7) {
-    __syncthreads();
-    for (int ax1_0_7 = 0; ax1_0_7 < 8; ++ax1_0_7) {
-      *(uint1*)(((half*)buf_dyn_shmem) + ((((((int)threadIdx.x) * 2050) + (((int)threadIdx.y) * 512)) + (ax1_0_7 * 64)) + 12288)) = C_reindex_m16n8k8_matrixC[((((ax0_0_7 >> 1) * 16) + (ax1_0_7 * 2)) + (ax0_0_7 & 1))];
-    }
-    __syncthreads();
-    for (int threadIdx_x_cache_threadIdx_y_cache_ax1_0_cache_ax0_1_cache_ax1_1_cache_fused_0 = 0; threadIdx_x_cache_threadIdx_y_cache_ax1_0_cache_ax0_1_cache_ax1_1_cache_fused_0 < 512; ++threadIdx_x_cache_threadIdx_y_cache_ax1_0_cache_ax0_1_cache_ax1_1_cache_fused_0) {
-      C[(((((((((((((int)blockIdx.y) >> 3) * 524288) + (((threadIdx_x_cache_threadIdx_y_cache_ax1_0_cache_ax0_1_cache_ax1_1_cache_fused_0 & 15) >> 3) * 262144)) + (ax0_0_7 * 32768)) + ((((int)threadIdx.y) & 1) * 16384)) + ((((int)threadIdx.x) >> 3) * 4096)) + (((int)blockIdx.x) * 1024)) + ((((int)blockIdx.y) & 7) * 128)) + ((threadIdx_x_cache_threadIdx_y_cache_ax1_0_cache_ax0_1_cache_ax1_1_cache_fused_0 & 7) * 16)) + ((((int)threadIdx.y) >> 1) * 8)) + (((int)threadIdx.x) & 7))] = ((half*)buf_dyn_shmem)[((((threadIdx_x_cache_threadIdx_y_cache_ax1_0_cache_ax0_1_cache_ax1_1_cache_fused_0 * 128) + (((int)threadIdx.y) * 32)) + ((int)threadIdx.x)) + 12288)];
-    }
-  }
-}
-
-"""
-
-
-@tvm.testing.requires_tensorcore
-def test_mma_script_after_build():
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        return
-
-    mod = te.create_prim_func(matmul_fp16(M=4096, N=4096, K=4096, out_dtype="float16"))
-    actual = _design_space(mod, "float16")
-    assert len(actual) == 1
-    sketch = actual[0]
-
-    i = 0
-    new_decisions = {}
-    for inst in sketch.trace.insts:
-        if not inst.kind.name.startswith("Sample"):
-            continue
-        assert i < len(gemm_decision)
-        if inst.kind.name == gemm_decision[i][0]:
-            new_decisions[inst] = gemm_decision[i][1]
-            i += 1
-    assert len(new_decisions) == len(gemm_decision)
-    sch = Schedule(mod)
-    Trace(
-        insts=sketch.trace.insts,
-        decisions=new_decisions,
-    ).apply_to_schedule(sch, remove_postproc=True)
-
-    sch.enter_postproc()
-    # DefaultCUDATensorCore
-    ms.postproc.DisallowDynamicLoop().apply(sch)
-    ms.postproc.RewriteCooperativeFetch().apply(sch)
-    # Disable RewriteUnboundBlock here since max_threads_per_block_ is not set
-    # ms.postproc.RewriteUnboundBlock(256).apply(sch)
-    ms.postproc.RewriteParallelVectorizeUnroll().apply(sch)
-    ms.postproc.RewriteReductionBlock().apply(sch)
-    ms.postproc.VerifyGPUCode().apply(sch)
-    ms.postproc.RewriteTensorize(False).apply(sch)
-
-    with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        rt_mod = tvm.build(sch.mod, target="cuda")
-    print(rt_mod.imported_modules[0].get_source())
-    assert rt_mod.imported_modules[0].get_source() == expected_cuda_script
-
-
-def initializer():
-    @register_func("meta_schedule.builder.async_build")
-    def async_build(mod, target, _params):  # pylint: disable=unused-variable, unused-argument
-        # pylint: disable=import-outside-toplevel
-        from tvm.driver import build as tvm_build
-        from tvm.tir.transform import RemoveWeightLayoutRewriteBlock
-
-        # re-import here for local builder to register index_map_m16n8k8_matrixC
-        # pylint: disable=import-outside-toplevel, unused-import
-        from tvm.tir.tensor_intrin import cuda
-
-        mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
-        with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-            rt_mod = tvm_build(mod, target=target)
-        return rt_mod
-
-
-@tvm.testing.requires_tensorcore
-@tvm.testing.requires_cublas
-def test_mma_tune():
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        return
-
-    # pylint: disable=import-outside-toplevel
-    from tvm.contrib import cublas
-
-    def tune(out_dtype):
-        M, N, K = 1024, 1024, 1024
-        target = Target("nvidia/geforce-rtx-3080")
-        func = te.create_prim_func(matmul_fp16(N=N, M=M, K=K, out_dtype=out_dtype)).with_attr(
-            {"global_symbol": "main"}
-        )
-        mod = tvm.IRModule({"main": func})
-
-        with tempfile.TemporaryDirectory() as work_dir:
-            db = ms.tir_integration.tune_tir(
-                mod=mod,
-                target=target,
-                work_dir=work_dir,
-                max_trials_global=8,
-                builder=LocalBuilder(
-                    f_build="meta_schedule.builder.async_build", initializer=initializer
-                ),
-                space=ms.space_generator.PostOrderApply(
-                    sch_rules=[multi_level_tiling_mma(out_dtype=out_dtype)],
-                ),
-            )
-            sch = db.query_schedule(mod, target=target, workload_name="main")
-            with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-                rt_mod = tvm.build(sch.mod, target=target)
-
-        a_np = np.random.uniform(0, 1, size=(M, K)).astype("float16")
-        b_np = np.random.uniform(0, 1, size=(K, N)).astype("float16")
-        A_cublas = te.placeholder((M, K), name="A", dtype="float16")
-        B_cublas = te.placeholder((K, N), name="B", dtype="float16")
-        C_cublas = cublas.matmul(A_cublas, B_cublas, dtype=out_dtype)
-        s = te.create_schedule(C_cublas.op)
-        dev = tvm.cuda(0)
-        f_cublas = tvm.build(s, [A_cublas, B_cublas, C_cublas], target)
-        a_cublas = tvm.nd.array(a_np.astype("float16"), dev)
-        b_cublas = tvm.nd.array(b_np.astype("float16"), dev)
-        c_cublas = tvm.nd.array(np.zeros((M, N), dtype=C_cublas.dtype), dev)
-        f_cublas(a_cublas, b_cublas, c_cublas)
-        a_tvm = tvm.nd.array(a_np, device=tvm.cuda(0))
-        b_tvm = tvm.nd.array(b_np, device=tvm.cuda(0))
-        c_tvm = tvm.nd.array(np.empty((M, N)).astype(out_dtype), device=tvm.cuda(0))
-        rt_mod(a_tvm, b_tvm, c_tvm)
-        assert np.allclose(c_tvm.numpy(), c_cublas.numpy(), rtol=1e-2)
-
-    tune("float16")
-    tune("float32")
-
-
-if __name__ == "__main__":
-    test_mma_auto_tensorization()
-    test_mma_script_after_build()
-    test_mma_tune()
diff --git a/tests/python/meta_schedule/test_meta_schedule_multi_anchor.py b/tests/python/meta_schedule/test_meta_schedule_multi_anchor.py
deleted file mode 100644
index cb6f59c6e5d5..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_multi_anchor.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-
-
-def get_dense_dense(data_shape, weight_shape):
-    def multi_dense():
-        p_data = relay.var("p_data", shape=data_shape, dtype="float32")
-        p_weight1 = relay.var("p_weight1", shape=weight_shape, dtype="float32")
-        p_weight2 = relay.var("p_weight2", shape=weight_shape, dtype="float32")
-        dense1 = relay.nn.dense(p_data, p_weight1)
-        dense2 = relay.nn.dense(dense1, p_weight2)
-        f = relay.Function([p_data, p_weight1, p_weight2], dense2)
-        f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        return f
-
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight1 = relay.var("weight1", shape=weight_shape, dtype="float32")
-    weight2 = relay.var("weight2", shape=weight_shape, dtype="float32")
-    out = relay.Call(multi_dense(), [data, weight1, weight2])
-    return relay.Function([data, weight1, weight2], out)
-
-
-def get_ref(data_np, weight1_np, weight2_np):
-    dense1 = np.dot(data_np, np.transpose(weight1_np))
-    return np.dot(dense1, np.transpose(weight2_np))
-
-
-def schedule_dense_dense(sch):
-    dense1 = sch.get_block("T_matmul_NT")
-    dense2 = sch.get_block("T_matmul_NT_1")
-    _y1, _x1, _k1 = sch.get_loops(dense1)
-    _y2, _x2, _k2 = sch.get_loops(dense2)
-
-
-def test_dense_dense():
-    M, N, K = 128, 128, 128
-    data_shape = (M, K)
-    weight_shape = (N, K)
-    relay_mod = tvm.IRModule.from_expr(get_dense_dense(data_shape, weight_shape))
-    data_np = np.random.randn(*data_shape).astype("float32")
-    weight1_np = np.random.randn(*weight_shape).astype("float32")
-    weight2_np = np.random.randn(*weight_shape).astype("float32")
-    target = "llvm"
-    params = {"weight1": weight1_np, "weight2": weight2_np}
-
-    def schedule_fn(sch):
-        if "nn_dense_nn_dense" in sch.mod.attrs["task_name"]:
-            schedule_dense_dense(sch)
-            return True
-        return False
-
-    with ms.database.ScheduleFnDatabase(schedule_fn):
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            lib = relay.build(relay_mod, target=target, params=params)
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    runtime.set_input("data", data_np)
-    runtime.run()
-    out = runtime.get_output(0).numpy()
-    ref = get_ref(data_np, weight1_np, weight2_np)
-    tvm.testing.assert_allclose(out, ref, atol=1e-4, rtol=1e-4)
-
-
-if __name__ == "__main__":
-    test_dense_dense()
diff --git a/tests/python/meta_schedule/test_meta_schedule_relay_integration.py b/tests/python/meta_schedule/test_meta_schedule_relay_integration.py
deleted file mode 100644
index 3a2ca69cba7b..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_relay_integration.py
+++ /dev/null
@@ -1,1002 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Integration test for MetaSchedule"""
-import platform
-import tempfile
-from typing import List
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import IRModule
-from tvm import meta_schedule as ms
-from tvm import relay, te, tir
-from tvm._ffi import register_func
-from tvm.contrib import graph_executor
-from tvm.ir.transform import PassContext
-from tvm.meta_schedule.database import TuningRecord, Workload
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
-from tvm.meta_schedule.tune_context import _normalize_mod
-from tvm.script import tir as T
-from tvm.target import Target
-
-# pylint: disable=no-member,line-too-long,too-many-nested-blocks,unbalanced-tuple-unpacking,no-self-argument,missing-docstring,invalid-name
-
-
-@tvm.script.ir_module
-class MockModule:
-    @T.prim_func
-    def main(a: T.handle, b: T.handle) -> None:  # type: ignore
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, (16,), "float32")
-        B = T.match_buffer(b, (16,), "float32")
-        for i in T.serial(0, 16):
-            with T.block("matmul"):
-                vi = T.axis.remap("S", [i])
-                B[vi] = A[vi]
-
-
-# pylint: enable=no-member,line-too-long,too-many-nested-blocks,unbalanced-tuple-unpacking,no-self-argument
-
-
-@pytest.mark.skip("Integration tests")
-def test_meta_schedule_dynamic_loop_extent():
-    a = relay.var("a", shape=(1, 8, 8, 512), dtype="float32")
-    b = relay.nn.adaptive_avg_pool2d(a, (7, 7), "NHWC")
-    mod = IRModule({"main": relay.Function([a], b)})
-    extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params={})
-    assert not extracted_tasks
-
-
-@pytest.mark.skip("Integration tests")
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently torch.jit.trace fails on AArch64",
-)
-@tvm.testing.requires_package("torch")
-def test_meta_schedule_integration_extract_from_resnet():
-    mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params=params)
-    expected_task_names = [
-        "fused_" + s
-        for s in [
-            "nn_max_pool2d",
-            "nn_adaptive_avg_pool2d",
-            "nn_dense_add",
-            "nn_conv2d_add",
-            "nn_conv2d_add_1",
-            "nn_conv2d_add_2",
-            "nn_conv2d_add_add_nn_relu",
-            "nn_conv2d_add_add_nn_relu_1",
-            "nn_conv2d_add_nn_relu",
-            "nn_conv2d_add_nn_relu_1",
-            "nn_conv2d_add_nn_relu_2",
-            "nn_conv2d_add_nn_relu_3",
-            "nn_conv2d_add_nn_relu_4",
-            "nn_conv2d_add_nn_relu_5",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1",
-            # The two tasks below are purely spatial and are ruled out by AutoScheduler
-            "layout_transform",
-            "layout_transform_reshape_squeeze",
-        ]
-    ]
-
-    assert len(extracted_tasks) == len(expected_task_names)
-    for t in extracted_tasks:
-        assert t.task_name in expected_task_names, t.task_name
-
-
-@pytest.mark.skip("Integration tests")
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently torch.jit.trace fails on AArch64",
-)
-@tvm.testing.requires_package("torch")
-def test_task_extraction_winograd_tensorcore():
-    mod, params, _ = get_network(name="resnet_50", input_shape=[16, 3, 224, 224])
-    seq = tvm.transform.Sequential(
-        [
-            relay.transform.ToMixedPrecision("float16"),
-            relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "HWIO"]}),
-        ]
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-
-    target = tvm.target.Target("nvidia/geforce-rtx-3070")
-    extracted_tasks = ms.relay_integration.extract_tasks(mod, target=target, params=params)
-
-    assert len([t for t in extracted_tasks if "winograd" in t.task_name]) == 4
-
-
-@pytest.mark.skip("Integration tests")
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently torch.jit.trace fails on AArch64",
-)
-@tvm.testing.requires_package("torch")
-def test_task_extraction_anchor_block():
-    mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    extracted_tasks = ms.relay_integration.extract_tasks(
-        mod, target="llvm", params=params, module_equality="anchor-block"
-    )
-
-    # Note that there is no task from residual blocks
-    expected_task_names = [
-        "fused_" + s
-        for s in [
-            "nn_max_pool2d",
-            "nn_adaptive_avg_pool2d",
-            "nn_dense_add",
-            "nn_conv2d_add",
-            "nn_conv2d_add_1",
-            "nn_conv2d_add_2",
-            "nn_conv2d_add_nn_relu",
-            "nn_conv2d_add_nn_relu_1",
-            "nn_conv2d_add_nn_relu_2",
-            "nn_conv2d_add_nn_relu_3",
-            "nn_conv2d_add_nn_relu_4",
-            "nn_conv2d_add_nn_relu_5",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1",
-            "layout_transform",
-            "layout_transform_reshape_squeeze",
-        ]
-    ]
-
-    assert len(extracted_tasks) == len(expected_task_names)
-    for t in extracted_tasks:
-        assert t.task_name in expected_task_names, t.task_name
-
-
-@pytest.mark.skip("Integration tests")
-@tvm.testing.requires_package("torch")
-def test_meta_schedule_integration_extract_from_bert_base():
-    pytest.importorskip(
-        "transformers", reason="transformers package is required to import bert_base"
-    )
-
-    expected = {
-        "fused_nn_dense_2": (
-            12,
-            [[64, 3072], [768, 3072], [64, 768]],
-        ),
-        "fused_nn_dense": (
-            48,
-            [[64, 768], [768, 768], [64, 768]],
-        ),
-        "fused_nn_dense_1": (
-            12,
-            [[64, 768], [3072, 768], [64, 3072]],
-        ),
-        "fused_subtract_add_rsqrt_multiply_multiply_add": (
-            25,
-            [[1, 64, 768], [1, 64, 1], [1, 64, 1], [768], [768], [1, 64, 768]],
-        ),
-        "fused_nn_batch_matmul": (
-            24,
-            [[12, 64, 64], [12, 64, 64], [12, 64, 64]],
-        ),
-        "fused_reshape_add_add": (
-            24,
-            [[64, 768], [768], [1, 64, 768], [1, 64, 768]],
-        ),
-        "fused_variance": (
-            25,
-            [[1, 64, 768], [1, 64, 1], [1, 64, 1]],
-        ),
-        "fused_mean": (
-            25,
-            [[1, 64, 768], [1, 64, 1]],
-        ),
-        "fused_reshape_add_reshape_transpose_reshape": (
-            12,
-            [[64, 768], [768], [12, 64, 64]],
-        ),
-        "fused_reshape_add_multiply_fast_erf_multiply_add_multiply_reshape": (
-            12,
-            [[64, 3072], [3072], [64, 3072]],
-        ),
-        "fused_nn_fast_softmax": (
-            12,
-            [[1, 12, 64, 64], [1, 12, 64, 64]],
-        ),
-        "fused_reshape_add_reshape_transpose_reshape_1": (
-            24,
-            [[64, 768], [768], [12, 64, 64]],
-        ),
-        "fused_reshape_divide_add": (
-            12,
-            [[12, 64, 64], [1, 1, 1, 64], [1, 12, 64, 64]],
-        ),
-        "fused_reshape_transpose_reshape": (
-            12,
-            [[12, 64, 64], [64, 768]],
-        ),
-        "fused_nn_dense_add_fast_tanh": (
-            1,
-            [[1, 768], [768, 768], [1, 768], [1, 768]],
-        ),
-        "fused_cast_take_add": (
-            1,
-            [[1, 64], [30522, 768], [1, 64, 768], [1, 64, 768]],
-        ),
-        "fused_take": (
-            1,
-            [[1, 64, 768], [1, 768]],
-        ),
-        "fused_reshape": (
-            12,
-            [[1, 12, 64, 64], [12, 64, 64]],
-        ),
-        "fused_reshape_1": (
-            24,
-            [[1, 64, 768], [64, 768]],
-        ),
-    }
-    mod, params, _ = get_network(name="bert_base", input_shape=[1, 64])
-    extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params=params)
-    assert len(extracted_tasks) == len(expected)
-    for t in extracted_tasks:
-        prim_func = None
-        for _, v in t.dispatched[0].functions.items():
-            prim_func = v
-        shape = [[int(x) for x in prim_func.buffer_map[b].shape] for b in prim_func.params]
-        assert t.task_name in expected
-        expected_weight, expected_shape = expected[t.task_name]
-        assert expected_weight == t.weight, t.task_name
-        assert expected_shape == shape, t.task_name
-
-
-@pytest.mark.skip("Integration tests")
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Currently torch.jit.trace fails on AArch64",
-)
-@tvm.testing.requires_package("torch")
-def test_meta_schedule_integration_extract_from_resnet_with_filter_func():
-    @register_func("relay.backend.tir_converter.remove_purely_spatial", override=True)
-    def filter_func(args, _) -> bool:
-        from tvm.te import create_prim_func  # pylint: disable=import-outside-toplevel
-
-        has_complex_op = False
-        visited = set()
-
-        def traverse(t):
-            nonlocal has_complex_op
-            assert t.handle is not None
-            if t.handle.value in visited:
-                return
-            if isinstance(t.op, te.PlaceholderOp):
-                pass
-            elif isinstance(t.op, te.ComputeOp):
-                has_complex_op = has_complex_op or any(isinstance(e, tir.Reduce) for e in t.op.body)
-                for x in t.op.input_tensors:
-                    traverse(x)
-            visited.add(t.handle.value)
-
-        for t in args:
-            traverse(t)
-        if not has_complex_op:
-            return None
-        return create_prim_func(args)
-
-    mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    extracted_tasks = ms.relay_integration.extract_tasks(
-        mod,
-        target="llvm",
-        params=params,
-        pass_config={
-            "relay.backend.use_meta_schedule": True,
-            "relay.backend.tir_converter": "remove_purely_spatial",
-        },
-    )
-    expected_task_names = [
-        "fused_" + s
-        for s in [
-            "nn_max_pool2d",
-            "nn_adaptive_avg_pool2d",
-            "nn_dense_add",
-            "nn_conv2d_add",
-            "nn_conv2d_add_1",
-            "nn_conv2d_add_2",
-            "nn_conv2d_add_add_nn_relu",
-            "nn_conv2d_add_add_nn_relu_1",
-            "nn_conv2d_add_nn_relu",
-            "nn_conv2d_add_nn_relu_1",
-            "nn_conv2d_add_nn_relu_2",
-            "nn_conv2d_add_nn_relu_3",
-            "nn_conv2d_add_nn_relu_4",
-            "nn_conv2d_add_nn_relu_5",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu",
-            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1",
-        ]
-    ]
-
-    assert len(extracted_tasks) == len(expected_task_names)
-    for t in extracted_tasks:
-        assert t.task_name in expected_task_names, t.task_name
-
-
-def extract_task_qbert(target, sch_rule_tag):
-    def _test(mod, params, target, sch_rule_tag):
-        extracted_tasks = ms.relay_integration.extract_tasks(mod, target, params)
-        tune_tasks = list(
-            filter(
-                lambda task: "dense" in task.task_name or "batch_matmul" in task.task_name,
-                extracted_tasks,
-            )
-        )
-        # three int8 dense, two int8 bmm, and one fp32 dense
-        assert len(tune_tasks) == 6
-
-        for task in tune_tasks:
-            relay_func = list(task.mod.functions.values())[0]
-            out_type = relay_func.body.checked_type
-
-            if out_type.dtype == "float32":
-                continue
-
-            sch = tvm.tir.Schedule(_normalize_mod(task.dispatched[0]))
-            block = sch.get_block("compute")
-            annotations = sch.get(block).annotations
-
-            assert "schedule_rule" in annotations
-            assert sch_rule_tag in annotations["schedule_rule"]
-
-    mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128)
-    _test(mod, params, target=target, sch_rule_tag=sch_rule_tag)
-
-
-@pytest.mark.skip("Too slow on CI")
-def extract_task_qbert_vnni():
-    extract_task_qbert("llvm -mcpu=cascadelake", "vnni")
-
-
-@pytest.mark.skip("Too slow on CI")
-def extract_task_qbert_avx512():
-    extract_task_qbert("llvm -mcpu=skylake-avx512", "avx512")
-
-
-@pytest.mark.skip("Integration tests")
-@tvm.testing.skip_if_32bit(reason="Apparently the LLVM version on i386 image is too old")
-def test_extract_task_arm_conv2d_nchwc():
-    data_shape = (1, 64, 128, 128)
-    weight_shape = (32, 64, 1, 1)
-    bias_shape = (weight_shape[0],)
-    padding = (1, 1)
-
-    data = relay.var("data", shape=data_shape, dtype="int8")
-    weight = relay.var("weight", shape=weight_shape, dtype="int8")
-    bias = relay.var("bias", shape=bias_shape, dtype="int32")
-    conv2d = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=weight_shape[2:],
-        channels=weight_shape[0],
-        padding=padding,
-        strides=(1, 1),
-        out_dtype="int32",
-    )
-    bias_add = relay.nn.bias_add(conv2d, bias)
-    relay_mod = tvm.IRModule.from_expr(bias_add)
-
-    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
-    bias_np = np.random.uniform(1, 10, size=bias_shape).astype("int32")
-
-    params = {"weight": weight_np, "bias": bias_np}
-
-    target = "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon"
-    extracted_tasks = ms.relay_integration.extract_tasks(relay_mod, target, params)
-    tune_tasks = list(
-        filter(
-            lambda task: "conv2d" in task.task_name,
-            extracted_tasks,
-        )
-    )
-
-    assert len(tune_tasks) == 1
-
-    relay_func = list(tune_tasks[0].mod.functions.values())[0]
-    out_type = relay_func.body.checked_type
-
-    # Check that the output is in NCHWc layout
-    assert list(out_type.shape) == [1, 8, 130, 130, 4]
-
-
-@pytest.mark.skip("Integration tests")
-def test_meta_schedule_te2primfunc_argument_order_and_lowering():
-    # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
-    # fmt: off
-    @tvm.script.ir_module
-    class _fused_layout_transform:
-        @T.prim_func
-        def main( # type: ignore
-            placeholder: T.Buffer((T.int64(1), T.int64(3), T.int64(16), T.int64(16)), "float32"), # type: ignore
-            T_layout_trans: T.Buffer((T.int64(1), T.int64(1), T.int64(16), T.int64(16), T.int64(3)), "float32"), # type: ignore
-        ) -> None: # type: ignore
-            # function attr dict
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            # body
-            # with T.block("root")
-            for i0, i1, i2, i3, i4 in T.grid(T.int64(1), T.int64(1), T.int64(16), T.int64(16), T.int64(3)):
-                with T.block("T_layout_trans"):
-                    ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
-                    T.reads(placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3])
-                    T.writes(T_layout_trans[ax0, ax1, ax2, ax3, ax4])
-                    T.block_attr({"dst_layout": "NCHW3c", "input_shape": [1, 3, 16, 16], "schedule_rule": "None", "src_layout": "NCHW"})
-                    T_layout_trans[ax0, ax1, ax2, ax3, ax4] = T.if_then_else(
-                        ax0 < T.int64(1) and ax1 * T.int64(3) + ax4 < T.int64(3) and ax2 < T.int64(16) and ax3 < T.int64(16), # type: ignore
-                        placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3],
-                        T.float32(0),
-                        dtype="float32",
-                    )
-
-    @tvm.script.ir_module
-    class _fused_layout_transform_1:
-        @T.prim_func
-        def main(placeholder: T.Buffer((T.int64(1), T.int64(2), T.int64(16), T.int64(16), T.int64(4)), "float32"), T_layout_trans: T.Buffer((T.int64(1), T.int64(8), T.int64(16), T.int64(16)), "float32")) -> None: # type: ignore
-            # function attr dict
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            # body
-            # with T.block("root")
-            for i0, i1, i2, i3 in T.grid(T.int64(1), T.int64(8), T.int64(16), T.int64(16)):
-                with T.block("T_layout_trans"):
-                    ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                    T.reads(placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)]) # type: ignore
-                    T.writes(T_layout_trans[ax0, ax1, ax2, ax3])
-                    T.block_attr({"dst_layout": "NCHW", "input_shape": [1, 2, 16, 16, 4], "schedule_rule": "None", "src_layout": "NCHW4c"})
-                    T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < T.int64(1) and ax1 < T.int64(8) and ax2 < T.int64(16) and ax3 < T.int64(16), placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)], T.float32(0), dtype="float32") # type: ignore
-
-    @tvm.script.ir_module
-    class _fused_nn_contrib_conv2d_NCHWc:
-        @T.prim_func
-        def main(placeholder: T.Buffer((T.int64(1), T.int64(1), T.int64(16), T.int64(16), T.int64(3)), "float32"), placeholder_1: T.Buffer((T.int64(2), T.int64(1), T.int64(5), T.int64(5), T.int64(3), T.int64(4)), "float32"), conv2d_NCHWc: T.Buffer((T.int64(1), T.int64(2), T.int64(16), T.int64(16), T.int64(4)), "float32")) -> None: # type: ignore
-            # function attr dict
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            # body
-            # with T.block("root")
-            data_pad = T.alloc_buffer([T.int64(1), T.int64(1), T.int64(20), T.int64(20), T.int64(3)], dtype="float32")
-            for i0, i1, i2, i3, i4 in T.grid(T.int64(1), T.int64(1), T.int64(20), T.int64(20), T.int64(3)):
-                with T.block("data_pad"):
-                    i0_1, i1_1, i2_1, i3_1, i4_1 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
-                    T.reads(placeholder[i0_1, i1_1, i2_1 - T.int64(2), i3_1 - T.int64(2), i4_1])
-                    T.writes(data_pad[i0_1, i1_1, i2_1, i3_1, i4_1])
-                    data_pad[i0_1, i1_1, i2_1, i3_1, i4_1] = T.if_then_else(T.int64(2) <= i2_1 and i2_1 < T.int64(18) and T.int64(2) <= i3_1 and i3_1 < T.int64(18), placeholder[i0_1, i1_1, i2_1 - T.int64(2), i3_1 - T.int64(2), i4_1], T.float32(0), dtype="float32") # type: ignore # pylint: disable=R1716
-            for i0, i1, i2, i3, i4, i5, i6, i7 in T.grid(T.int64(1), T.int64(2), T.int64(16), T.int64(16), T.int64(4), T.int64(3), T.int64(5), T.int64(5)):
-                with T.block("conv2d_NCHWc"):
-                    n, oc_chunk, oh, ow, oc_block, ic, kh, kw = T.axis.remap("SSSSSRRR", [i0, i1, i2, i3, i4, i5, i6, i7])
-                    T.reads(data_pad[n, ic // T.int64(3), oh + kh, ow + kw, ic % T.int64(3)], placeholder_1[oc_chunk, ic // T.int64(3), kh, kw, ic % T.int64(3), oc_block]) # type: ignore
-                    T.writes(conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block])
-                    with T.init():
-                        conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = T.float32(0)
-                    conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] + data_pad[n, ic // T.int64(3), oh + kh, ow + kw, ic % T.int64(3)] * placeholder_1[oc_chunk, ic // T.int64(3), kh, kw, ic % T.int64(3), oc_block] # type: ignore
-
-    # fmt: on
-    # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
-
-    def _create_verification_database():
-        @ms.derived_object
-        class VerificationDatabase(ms.database.PyDatabase):
-            def __init__(self):
-                super().__init__()
-                self.tuning_records_: List[TuningRecord] = []
-                self.workloads_: List[Workload] = []
-
-            def has_workload(self, mod: IRModule) -> bool:
-                for workload in self.workloads_:
-                    if tvm.ir.structural_equal(mod, workload.mod):
-                        return True
-                # Note: The database has already put in all correct workloads
-                # This is where we can check if the workload is correct
-                raise ValueError(
-                    "The workload searched for is not in given database!"
-                    + " Incorrect TIR was generated from TE subgraph."
-                )
-
-            def commit_workload(self, mod: IRModule) -> ms.database.Workload:
-                # No need to deduplicate workload because they are specified
-                workload = ms.database.Workload(mod)
-                self.workloads_.append(workload)
-                return workload
-
-            def commit_tuning_record(self, record: TuningRecord) -> None:
-                self.tuning_records_.append(record)
-
-            def get_all_tuning_records(self) -> List[TuningRecord]:
-                return self.tuning_records_
-
-            def get_top_k(self, workload: ms.database.Workload, top_k: int) -> List[TuningRecord]:
-                return sorted(
-                    list(
-                        filter(
-                            lambda x: tvm.ir.structural_equal(workload.mod, x.workload.mod),
-                            self.tuning_records_,
-                        )
-                    ),
-                    key=lambda x: sum(x.run_secs) / len(x.run_secs) if x.run_secs else 1e9,
-                )[:top_k]
-
-            def __len__(self) -> int:
-                return len(self.tuning_records_)
-
-        database = VerificationDatabase()
-
-        def _commit(mod):
-            workload = database.commit_workload(mod)
-            database.commit_tuning_record(
-                ms.database.TuningRecord(
-                    tir.schedule.Trace([], {}),
-                    workload=workload,
-                    run_secs=[0.1],
-                )
-            )
-
-        _commit(_fused_layout_transform)
-        _commit(_fused_layout_transform_1)
-        _commit(_fused_nn_contrib_conv2d_NCHWc)
-        return database
-
-    data_shape = (1, 3, 16, 16)
-    weight_shape = (8, 3, 5, 5)
-
-    def _create_relay_mod():
-        data = relay.var("data", relay.TensorType(data_shape, "float32"))
-        weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
-        y = relay.nn.conv2d(
-            data,
-            weight,
-            padding=(2, 2),
-            kernel_size=(5, 5),
-            kernel_layout="OIHW",
-            out_dtype="float32",
-        )
-        f = relay.Function([data, weight], y)
-        mod = tvm.IRModule.from_expr(f)
-        mod = relay.transform.InferType()(mod)
-        return mod
-
-    mod = _create_relay_mod()
-    dev = tvm.cpu()
-    target = Target("llvm --num-cores=16")
-    params = {
-        "weight": np.random.rand(*weight_shape).astype("float32"),
-    }
-    data = tvm.nd.array(
-        np.random.rand(*data_shape).astype("float32"),
-        dev,
-    )
-
-    with (
-        target
-    ), _create_verification_database(), PassContext(  # pylint: disable=not-context-manager
-        opt_level=3,
-        config={
-            "relay.backend.use_meta_schedule": True,
-            "relay.backend.use_meta_schedule_dispatch": 7,
-            "relay.backend.tir_converter": "default",
-        },
-    ):
-        rt_mod1 = relay.build(mod, target=target, params=params)
-
-    # Compile without meta-schedule for correctness check
-    with tvm.transform.PassContext(opt_level=0):
-        rt_mod2 = relay.build(mod, target=target, params=params)
-
-    def get_output(data, lib):
-        module = graph_executor.GraphModule(lib["default"](dev))
-        module.set_input("data", data)
-        module.run()
-        return module.get_output(0).numpy()
-
-    # Check correctness
-    actual_output = get_output(data, rt_mod1)
-    expected_output = get_output(data, rt_mod2)
-    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
-
-
-@pytest.mark.skip("Integration tests")
-def test_rewrite_layout_link_params():
-    I, O, H, W = 64, 64, 56, 56
-    kH = kW = 3
-
-    strides = (1, 1)
-    padding = (1, 1)
-
-    data_shape = (1, H, W, I)
-    w_shape = (kH, kW, I, O)
-    bias_shape = (1, 1, 1, O)
-
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight1", shape=w_shape, dtype="float32")
-    bias = relay.var("bias", shape=bias_shape, dtype="float32")
-
-    conv = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=(kH, kW),
-        channels=O,
-        padding=padding,
-        strides=strides,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype="float32",
-    )
-
-    mod = tvm.IRModule.from_expr(conv + bias)
-
-    weight_np = np.random.randn(*w_shape).astype("float32")
-    bias_np = np.random.randn(*bias_shape).astype("float32")
-
-    params = {"weight1": weight_np, "bias": bias_np}
-
-    data_np = np.random.randn(*data_shape).astype("float32")
-
-    ref = (
-        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
-        .evaluate()(*[data_np, weight_np, bias_np])
-        .numpy()
-    )
-
-    link_params = True
-
-    target = "llvm --num-cores=4"
-
-    executor = relay.backend.Executor("graph", {"link-params": link_params})
-    mod = mod.with_attr("executor", executor)
-
-    for strategy in ["replay-trace", "evolutionary"]:
-        with tempfile.TemporaryDirectory() as work_dir:
-            database = ms.relay_integration.tune_relay(
-                mod=mod,
-                target=target,
-                params=params,
-                work_dir=work_dir,
-                max_trials_global=4,
-                strategy=strategy,
-            )
-
-            lib = ms.relay_integration.compile_relay(
-                database=database,
-                mod=mod,
-                target=target,
-                params=params,
-            )
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        runtime.set_input("data", data_np)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-
-        np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
-
-
-@pytest.mark.skip("Integration tests")
-def test_module_equality_ignore_ndarray():
-    target = "llvm --num-cores=4"
-
-    data_shape = (128, 128)
-    weight_shape1 = (128, 128)
-    weight_shape2 = (128, 128)
-
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight1 = relay.var("weight1", shape=weight_shape1, dtype="float32")
-    weight2 = relay.var("weight2", shape=weight_shape2, dtype="float32")
-    dense1 = relay.nn.dense(data, weight1)
-    dense2 = relay.nn.dense(dense1, weight2)
-    mod = tvm.IRModule.from_expr(dense2)
-
-    weight1_np = np.random.randn(*weight_shape1).astype("float32")
-    weight2_np = np.random.randn(*weight_shape2).astype("float32")
-
-    params = {"weight1": weight1_np, "weight2": weight2_np}
-
-    executor = relay.backend.Executor("graph", {"link-params": True})
-    mod = mod.with_attr("executor", executor)
-
-    # Without using ignore-ndarray for module equality, we get duplicated tasks
-    assert len(ms.relay_integration.extract_tasks(mod, target, params)) == 2
-
-    module_eqality = "ignore-ndarray"
-    extracted_tasks = ms.relay_integration.extract_tasks(
-        mod, target, params, module_equality=module_eqality
-    )
-
-    assert len(extracted_tasks) == 1
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
-            extracted_tasks, work_dir, strategy="replay-trace"
-        )
-        database = ms.tune.tune_tasks(
-            tasks=tasks,
-            task_weights=task_weights,
-            work_dir=work_dir,
-            max_trials_global=4,
-            module_equality=module_eqality,
-        )
-        lib = ms.relay_integration.compile_relay(database, mod, target, params)
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    data_np = np.random.randn(*data_shape).astype("float32")
-
-    runtime.set_input("data", data_np)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-
-    ref = np.dot(np.dot(data_np, weight1_np.transpose()), weight2_np.transpose())
-    np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
-
-
-def _test_anchor_tuning(target, space):
-    data_shape = (128, 128)
-    weight_shape1 = (128, 128)
-    weight_shape2 = (128, 128)
-
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight1 = relay.var("weight1", shape=weight_shape1, dtype="float32")
-    weight2 = relay.var("weight2", shape=weight_shape2, dtype="float32")
-    dense1 = relay.nn.dense(data, weight1)
-    dense2 = relay.nn.dense(dense1 + relay.const(1.0, dtype="float32"), weight2)
-    mod = tvm.IRModule.from_expr(dense2 - data + relay.const(1.0, dtype="float32"))
-
-    weight1_np = np.random.randn(*weight_shape1).astype("float32")
-    weight2_np = np.random.randn(*weight_shape2).astype("float32")
-
-    data_np = np.random.randn(*data_shape).astype("float32")
-    params = {"weight1": weight1_np, "weight2": weight2_np}
-
-    module_equality = "anchor-block"
-
-    extracted_tasks = ms.relay_integration.extract_tasks(
-        mod, target, params, module_equality=module_equality
-    )
-
-    assert len(extracted_tasks) == 1
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=target,
-            params=params,
-            work_dir=work_dir,
-            space=space,
-            max_trials_global=4,
-            strategy="replay-trace",
-            module_equality=module_equality,
-            num_tuning_cores=4,
-        )
-        lib = ms.relay_integration.compile_relay(database, mod, target, params)
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    runtime.set_input("data", data_np)
-    runtime.run()
-    out = runtime.get_output(0).numpy()
-
-    ref = (
-        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
-        .evaluate()(*[data_np, weight1_np, weight2_np])
-        .numpy()
-    )
-
-    np.testing.assert_allclose(ref, out, atol=1e-3)
-
-
-@pytest.mark.skip("Integration tests")
-@pytest.mark.parametrize(
-    "space",
-    [
-        ms.space_generator.PostOrderApply(),
-        ms.space_generator.PostOrderApply(sch_rules=[], postprocs=[], mutator_probs={}),
-    ],
-)
-def test_anchor_tuning_cpu(space):
-    _test_anchor_tuning("llvm --num-cores=4", space)
-
-
-@pytest.mark.skip("Integration tests")
-def test_anchor_tuning_cpu_link_params():
-    data_shape = (128, 128)
-    weight_shape1 = (128, 128)
-    weight_shape2 = (128, 128)
-
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight1 = relay.var("weight1", shape=weight_shape1, dtype="float32")
-    weight2 = relay.var("weight2", shape=weight_shape2, dtype="float32")
-    dense1 = relay.nn.dense(data, weight1)
-    dense2 = relay.nn.dense(dense1, weight2)
-    mod = tvm.IRModule.from_expr(dense2 + relay.const(1.0, dtype="float32"))
-
-    weight1_np = np.random.randn(*weight_shape1).astype("float32")
-    weight2_np = np.random.randn(*weight_shape2).astype("float32")
-
-    data_np = np.random.randn(*data_shape).astype("float32")
-    params = {"weight1": weight1_np, "weight2": weight2_np}
-
-    module_equality = "anchor-block"
-    target = "llvm --num-cores=4"
-
-    executor = relay.backend.Executor("graph", {"link-params": True})
-    mod = mod.with_attr("executor", executor)
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=target,
-            params=params,
-            work_dir=work_dir,
-            max_trials_global=4,
-            strategy="replay-trace",
-            module_equality=module_equality,
-        )
-        lib = ms.relay_integration.compile_relay(database, mod, target, params)
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    runtime.set_input("data", data_np)
-    runtime.run()
-    out = runtime.get_output(0).numpy()
-
-    ref = (
-        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
-        .evaluate()(*[data_np, weight1_np, weight2_np])
-        .numpy()
-    )
-
-    np.testing.assert_allclose(ref, out, atol=1e-3)
-
-
-@pytest.mark.skip("Integration tests")
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_disabled_pass_param():
-    """
-    Check 'disabled_pass' parameter in tune_relay. Should throw exception in
-    case of correct work.
-    """
-    data_shape = [1, 4, 16, 16]
-    weight_shape = [32, 4, 2, 2]
-
-    data = relay.var("data", shape=data_shape, dtype="uint8")
-    weight = relay.var("weight", shape=weight_shape, dtype="int8")
-
-    op = relay.qnn.op.conv2d(
-        data,
-        weight,
-        input_zero_point=relay.const(0),
-        kernel_zero_point=relay.const(0),
-        input_scale=relay.const(0.7),
-        kernel_scale=relay.const(0.3),
-        kernel_size=[2, 2],
-        channels=32,
-    )
-    mod = tvm.IRModule.from_expr(op)
-
-    weight_np = np.random.randint(-10, 10, size=weight_shape).astype("int8")
-    params = {"weight": weight_np}
-
-    executor = relay.backend.Executor("graph", {"link-params": True})
-    mod = mod.with_attr("executor", executor)
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target="llvm --num-cores=4",
-            params=params,
-            work_dir=work_dir,
-            max_trials_global=4,
-            strategy="replay-trace",
-            disabled_pass=["qnn.Legalize"],
-        )
-
-    # Test failed, otherwise we can not reach this point.
-    pytest.fail("'disabled_pass' argument does not work")
-
-
-@pytest.mark.skip("Integration tests")
-def test_rewrite_layout_link_params_1x1_conv2d():
-    I, O, H, W = 32, 16, 256, 256
-    kH = kW = 1
-
-    strides = (1, 1)
-    padding = (0, 0)
-
-    data_shape = (1, H, W, I)
-    w_shape = (kH, kW, I, O)
-
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=w_shape, dtype="float32")
-
-    conv = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=(kH, kW),
-        channels=O,
-        padding=padding,
-        strides=strides,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype="float32",
-    )
-
-    mod = tvm.IRModule.from_expr(conv)
-
-    weight_np = np.random.randn(*w_shape).astype("float32")
-
-    params = {"weight": weight_np}
-
-    data_np = np.random.randn(*data_shape).astype("float32")
-
-    ref = (
-        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
-        .evaluate()(*[data_np, weight_np])
-        .numpy()
-    )
-
-    link_params = True
-
-    target = "llvm --num-cores=4"
-
-    executor = relay.backend.Executor("graph", {"link-params": link_params})
-    mod = mod.with_attr("executor", executor)
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=target,
-            params=params,
-            work_dir=work_dir,
-            max_trials_global=8,
-            strategy="replay-trace",
-        )
-
-        lib = ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=target,
-            params=params,
-        )
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    runtime.set_input("data", data_np)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-
-    np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/meta_schedule/test_meta_schedule_relay_tir_compute.py b/tests/python/meta_schedule/test_meta_schedule_relay_tir_compute.py
deleted file mode 100644
index b37333803603..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_relay_tir_compute.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import autotvm
-from tvm import meta_schedule as ms
-from tvm import relay, te
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-from tvm.script import tir as T
-
-
-def compute_tir_conv2d_nchw_oihw(data_shape, weight_shape, dtype):
-    assert dtype == "float32"
-    OC, IC, FH, FW = weight_shape
-
-    padding = (0, 0, 0, 0)
-    strides = (1, 1)
-    dilation = (1, 1)
-    output_shape = (
-        data_shape[0],
-        weight_shape[0],
-        (data_shape[2] - ((weight_shape[2] - 1) * dilation[0] + 1) + padding[0] + padding[1])
-        // strides[0]
-        + 1,
-        (data_shape[3] - ((weight_shape[3] - 1) * dilation[1] + 1) + padding[2] + padding[3])
-        // strides[1]
-        + 1,
-    )
-    N, K, BH, BW = output_shape
-
-    # fmt: off
-    @T.prim_func
-    def conv2d(a: T.handle, filt: T.handle, b: T.handle) -> None:
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, data_shape, dtype=dtype)
-        Filter = T.match_buffer(filt, weight_shape, dtype=dtype)
-        B = T.match_buffer(b, output_shape, dtype=dtype)
-        for n, k, bh, bw in T.grid(N, K, BH, BW):
-            with T.block("init"):
-                vn, vk, vbh, vbw = T.axis.remap("SSSS", [n, k, bh, bw])
-                B[vn, vk, vbh, vbw] = T.float32(0)
-            for ic, fh, fw in T.grid(IC, FH, FW):
-                with T.block("update"):
-                    vn, vk, vbh, vbw, vc, vfh, vfw = T.axis.remap("SSSSRRR", [n, k, bh, bw, ic, fh, fw])
-                    B[vn, vk, vbh, vbw] = B[vn, vk, vbh, vbw] + A[vn, vc, vbh + vfh, vbw + vfw] * Filter[vk, vc, vfh, vfw]
-    # fmt: on
-
-    return conv2d
-
-
-def schedule_tir_conv2d_nchw_oihw(sch):
-    update_block = sch.get_block("update")
-    vn, vk, vbh, vbw, vc, vfh, vfw = sch.get_loops(update_block)
-    sch.split(vk, factors=(None, 32))
-
-
-@autotvm.register_topi_compute("test/conv2d_1")
-def _compute_conv2d_1(cfg, input, filter, strides, padding, dilation, out_dtype):
-    prim_func = compute_tir_conv2d_nchw_oihw(input.shape, filter.shape, input.dtype)
-    output = te.extern_primfunc([input, filter], prim_func, name="tir")
-    return output
-
-
-@autotvm.register_topi_schedule("test/conv2d_1")
-def _schedule_conv2d_1(cfg, outs):
-    s = te.create_schedule([x.op for x in outs])
-    return s
-
-
-@tvm.target.override_native_generic_func("test_conv2d_strategy")
-def _tmp_strategy(attrs, inputs, out_type, target):
-    strategy = relay.op.OpStrategy()
-    if attrs.groups == 1 and attrs.data_layout == "NCHW" and attrs.kernel_layout == "OIHW":
-        strategy.add_implementation(
-            relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_1),
-            relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_1),
-            name="conv2d_2",
-            plevel=15,
-        )
-    else:
-        raise ValueError("No valid strategy found")
-    return strategy
-
-
-def get_conv2d(data_shape, weight_shape, **kwargs):
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    conv2d = relay.nn.conv2d(
-        data,
-        weight,
-        **kwargs,
-    )
-    return relay.Function([data, weight], conv2d)
-
-
-def get_ref(data, weight, stride, padding):
-    return tvm.topi.testing.conv2d_nchw_python(data, weight, stride, padding)
-
-
-def test_conv2d():
-    N, IC, H, W = 1, 64, 56, 56
-    OC, IC, FH, FW = 128, 64, 3, 3
-    data_shape = (N, IC, H, W)
-    weight_shape = (OC, IC, FH, FW)
-    padding = (0, 0)
-    strides = (1, 1)
-
-    relay_mod = tvm.IRModule.from_expr(
-        get_conv2d(
-            data_shape,
-            weight_shape,
-            padding=padding,
-            strides=strides,
-            channels=OC,
-            kernel_size=(FH, FW),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-    )
-
-    data_np = np.random.randn(*data_shape).astype("float32")
-    weight_np = np.random.randn(*weight_shape).astype("float32")
-
-    target = "llvm"
-    params = {"weight": weight_np}
-
-    def schedule_fn(sch):
-        if "nn_conv2d" in sch.mod.attrs["task_name"]:
-            schedule_tir_conv2d_nchw_oihw(sch)
-            return True
-        return False
-
-    with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy):
-        with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext(
-            opt_level=3,
-            config={
-                "relay.backend.use_meta_schedule": True,
-                "relay.backend.tir_converter": "allow_extern",
-            },
-        ):
-            lib = relay.build(relay_mod, target=target, params=params)
-
-    dev = tvm.device(target, 0)
-
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    runtime.set_input("data", data_np)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-
-    ref = get_ref(data_np, weight_np, strides, padding)
-
-    tvm.testing.assert_allclose(out, ref, atol=1e-4, rtol=1e-4)
-
-
-if __name__ == "__main__":
-    test_conv2d()
diff --git a/tests/python/meta_schedule/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/meta_schedule/test_meta_schedule_schedule_cuda_layout_transform.py
deleted file mode 100644
index ce6e9e101d22..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_schedule_cuda_layout_transform.py
+++ /dev/null
@@ -1,471 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import itertools
-import random
-import tempfile
-from typing import Callable, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import meta_schedule, relay
-from tvm.meta_schedule.schedule.cuda.layout_transform import (
-    cuda_layout_transform_schedule_rule,
-)
-from tvm.relay.op import OpPattern
-from tvm.script import ir as I
-from tvm.script import tir as T
-from tvm.tir.schedule import BlockRV
-
-# fmt: off
-# Small gpu parameters which should work for nearly every (modern-ish) gpu.
-TARGET = tvm.target.Target(
-    "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024"
-)
-
-
-class PatchCustomLayoutTransformScheduleRule:
-    """Patch the custom layout transform schedule to test only specific tile sizes.
-
-    If tile_sizes = [], then returns the default (non-tiled) schedule, otherwise
-    returns only the schedule with the given tiles.
-    """
-
-    FUNC_NAME = "meta_schedule.cuda.layout_transform"
-
-    def __init__(self, tile_sizes: List[int]) -> None:
-        self.tile_sizes = tile_sizes
-        self.old_func = None
-
-    def __enter__(self, *args, **kwargs) -> None:
-        self.old_func = tvm.get_global_func(self.FUNC_NAME)
-
-        def new_layout_rule(
-            sch: tvm.tir.Schedule,
-            block: BlockRV,
-            tile_sizes: Optional[List[int]] = self.tile_sizes,
-        ) -> List[tvm.tir.Schedule]:
-            return cuda_layout_transform_schedule_rule(sch, block, tile_sizes)
-
-        tvm.register_func(self.FUNC_NAME, new_layout_rule, override=True)
-
-    def __exit__(self, *args, **kwargs) -> None:
-        tvm.register_func(self.FUNC_NAME, self.old_func, override=True)
-
-
-# Create unary functions which apply ops with compatible fusion levels to layout transform
-def get_random_axis(data: relay.Expr):
-    rank = len(relay.transform.InferTypeLocal(data).shape)
-    return random.randint(0, rank - 1)
-
-
-def apply_elemwise_clip(data: relay.Expr, min=0, max=10):
-    assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE
-    return relay.clip(data, min, max)
-
-
-def apply_broadcast_add(data: relay.Expr, val_to_add=5):
-    assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST
-    type_info = relay.transform.InferTypeLocal(data)
-    return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype))
-
-
-def apply_injective_concatenate(data: relay.Expr, axis=None):
-    if axis is None:
-        axis = get_random_axis(data)
-    assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE
-    return relay.concatenate([data, data], axis)
-
-
-def apply_comm_reduce_max(data: relay.Expr, axis=None):
-    if axis is None:
-        axis = get_random_axis(data)
-    assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE
-
-    # Do this to maintain dimensions
-    return relay.add(data, relay.max(data, axis, keepdims=True))
-
-
-pattern_level_to_op = {
-    OpPattern.ELEMWISE: apply_elemwise_clip,
-    OpPattern.BROADCAST: apply_broadcast_add,
-    OpPattern.INJECTIVE: apply_injective_concatenate,
-    OpPattern.COMM_REDUCE: apply_comm_reduce_max,
-}
-
-
-def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str):
-    assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE
-    return relay.layout_transform(data, src_layout, dst_layout)
-
-
-def create_relay_module(
-    input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]]
-) -> tvm.IRModule:
-    """Create a relay module with the given string of ops.
-
-    ops:
-        Applies the associated operators in order. If an integer, refers to applying
-        the unary operator from `extra_pattern_level_to_op` map. If a tuple, applies
-        a layout transform with the given (src_layout, dst_layout)
-    """
-    input_data = relay.var("input", shape=input_shape, dtype=dtype)
-
-    cur_data = input_data
-    for op_info in ops:
-        # Progressively build type info
-        relay.transform.InferTypeLocal(cur_data)
-        if isinstance(op_info, tuple):
-            # layout transform case
-            src_layout, dst_layout = op_info
-            cur_data = apply_layout_transform(cur_data, src_layout, dst_layout)
-        else:
-            cur_data = pattern_level_to_op[op_info](cur_data)
-
-    relay.transform.InferTypeLocal(cur_data)
-    return tvm.IRModule.from_expr(cur_data)
-
-
-def extract_layout_transform_task(
-    mod: tvm.IRModule, target: tvm.target.Target
-) -> meta_schedule.ExtractedTask:
-    """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task."""
-    extracted_tasks = meta_schedule.relay_integration.extract_tasks(
-        mod,
-        target,
-        {},
-        pass_config={"relay.backend.use_meta_schedule": True},
-    )
-    task_of_interest = None
-    for task in extracted_tasks:
-        if "layout_transform" in task.task_name:
-            task_of_interest = task
-            break
-    assert task_of_interest is not None
-    return task_of_interest
-
-
-def run_primfunc(
-    primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray]
-):
-    """Compile and run the primfunc with the given input tensors."""
-    with tvm.transform.PassContext(
-        config={"relay.backend.use_meta_schedule": True},
-        opt_level=3,
-    ):
-        lib = tvm.build(primfunc_mod, target=target)
-    lib(*input_tensors)
-
-
-@pytest.mark.skip("Integration test")
-class TestRandomRelayE2ECorrectness:
-    """Tests E2E correctness of layout transform schedule.
-
-    Randomly generates relay mod with layout transform and fusable ops. Checks the
-    layout transform task for correctness by comparing against its unscheduled result.
-    """
-
-    @staticmethod
-    def generate_test_case(
-        input_shape: List[int],
-        implicit_reshape_info: Optional[Tuple[int, int]],
-        dtype: str,
-        num_additional_ops: int,
-    ) -> tvm.IRModule:
-        """Creates a random layout transform module with up to num_additional_ops fused."""
-        # Create layout transforms
-        rank = len(input_shape)
-
-        # src_layout is a string like ABCDEFG... with length as rank
-        src_layout = "".join([chr(i + ord("A")) for i in range(rank)])
-
-        # dst_layout is randomly shuffled src_layout, potentially after adding split axis
-        dst_layout = list(src_layout)
-        if implicit_reshape_info:
-            axis_to_reshape, size_new_dim = implicit_reshape_info
-            cur_dim = dst_layout[axis_to_reshape]
-            dst_layout[axis_to_reshape] = f"{cur_dim}"
-            dst_layout.append(f"{size_new_dim}{cur_dim.lower()}")
-
-        random.shuffle(dst_layout)
-        while "".join(dst_layout) == src_layout:
-            random.shuffle(dst_layout)
-        dst_layout = "".join(dst_layout)
-
-        # Randomly sample a list of potentially fusable ops to layout transform
-        op_order = random.choices(
-            list(pattern_level_to_op.keys()),
-            k=num_additional_ops,
-        )
-
-        # Append tuple, representing layout transfomr from src --> dst layout
-        op_order.append((src_layout, dst_layout))
-
-        random.shuffle(op_order)
-        return create_relay_module(input_shape, dtype, op_order)
-
-    @staticmethod
-    def get_primfunc(extracted_task: meta_schedule.ExtractedTask, tile_size: Optional[int]):
-        with PatchCustomLayoutTransformScheduleRule(
-            tile_sizes=[] if tile_size is None else [tile_size]
-        ):
-            with tempfile.TemporaryDirectory() as tmpdir:
-                (
-                    tune_contexts,
-                    _,
-                ) = meta_schedule.relay_integration.extracted_tasks_to_tune_contexts(
-                    [extracted_task],
-                    tmpdir,
-                )
-                tune_contexts[0].pre_tuning(1)
-                candidates = tune_contexts[0].generate_measure_candidates()
-                primfunc = candidates[0].sch.mod["main"]
-                return primfunc
-
-    @staticmethod
-    def verify_layout_transform_task(
-        extracted_task: meta_schedule.ExtractedTask,
-        target: tvm.target.Target,
-        tile_sizes: List[int],
-    ):
-        """Given a layout transform task, tests the given tile_sizes and verifies output matches."""
-        device = tvm.cuda(0)
-        relay_mod = extracted_task.mod
-
-        # Create and cache inputs
-        func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]])
-        input_tensors = []
-        for input_type in func_type.arg_types:
-            orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype(
-                input_type.dtype
-            )
-            orig_input_np = np.arange(0, orig_input_np.size, dtype=input_type.dtype).reshape(
-                orig_input_np.shape
-            )
-            input_tensors.append(tvm.nd.array(orig_input_np, device))
-        ret_type = func_type.ret_type
-
-        def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]:
-            numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype(
-                ret_type.dtype
-            )
-            return tvm.nd.array(numpy_init, device)
-
-        def run_and_get_output(tile_size: Optional[int]) -> np.ndarray:
-            returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc(
-                extracted_task, tile_size
-            )
-            output_tensor = get_output_tensor()
-            run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor])
-            return output_tensor.numpy()
-
-        # Passing None, we basically do not apply the custom rule we have created
-        # and instead use the old default schedule which is the ground truth.
-        ground_truth_np = run_and_get_output(None)
-
-        for tile_size in tile_sizes:
-            experimental_np = run_and_get_output(tile_size)
-            np.testing.assert_allclose(ground_truth_np, experimental_np)
-
-    (
-        input_shape,
-        implicit_reshape_info,
-        dtype,
-        tile_sizes,
-        num_additional_ops,
-    ) = tvm.testing.parameters(
-        *itertools.product(
-            # input_shape: Each has ~10k elements, should take single microseconds on modern gpu
-            [
-                [12, 48, 18],
-                [890, 14],
-                [10, 12, 2, 5, 3, 3],
-            ],
-            # implicit_reshape_info: Implicit reshape conditions.
-            # None is do no implicit reshape, (0, 2) means divide axis 0 in half, e.g. AB --> A2aB
-            [None, (0, 2), (1, 2)],
-            # dtype: dtypes to test, should not matter that much
-            ["float16"],
-            # tile_sizes: Tile sizes to try
-            [[8, 7]],
-            # num_additional_ops: number of non-layout transform ops to include and may be fused
-            [5],
-        )
-    )
-
-    @tvm.testing.requires_gpu
-    def test_all_test_case(
-        self,
-        input_shape,
-        implicit_reshape_info,
-        dtype,
-        tile_sizes,
-        num_additional_ops,
-    ):
-        """Tests the product of all conditions `repeat_per_condition` times."""
-        # Generate random module of fusable ops + layout transform and extract fused layout transform task
-        full_mod = self.generate_test_case(
-            input_shape, implicit_reshape_info, dtype, num_additional_ops
-        )
-
-        # Fused layout transform task
-        extracted_task = extract_layout_transform_task(full_mod, TARGET)
-        self.verify_layout_transform_task(extracted_task, TARGET, tile_sizes)
-
-
-@tvm.testing.requires_gpu
-class TestManualCases:
-    def assert_extracted_equals_expected(
-        self, relay_mod: tvm.IRModule, expected_mod: tvm.IRModule, tile_size: int
-    ):
-        extracted_task = extract_layout_transform_task(relay_mod, TARGET)
-        dispatched_mod = extracted_task.dispatched[0]
-        sch = tvm.tir.Schedule(dispatched_mod)
-        block = sch.get_block("T_layout_trans")
-        output_sch = cuda_layout_transform_schedule_rule(sch, block, [tile_size])[0]
-        assert output_sch.mod.script() == expected_mod.script()
-
-    def test_simple_tiling(self):
-        mod = create_relay_module([1, 32, 32, 32], "float16", [("NCHW", "NHWC")])
-
-        # Main things to notice:
-        # - two blocks each with 16, 16 extents which write/read shared mem
-        # - coalesced accesses in inner loop of global memory buffer for both
-        # fmt: off
-        @I.ir_module
-        class ExpectedModule:
-            @T.prim_func
-            def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16")):
-                T.func_attr({"global_symbol": "main", "tir.noalias": True})
-                # with T.block("root"):
-                p0_shared = T.alloc_buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16", scope="shared")
-                for ax0_ax2_ax1_0_ax3_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"):
-                    for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"):
-                        for ax1_1_fused_0_ax1_1_fused_1_fused in range(T.int64(16)):
-                            with T.block("p0_shared"):
-                                v0 = T.axis.spatial(T.int64(1), T.int64(0))
-                                v1 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(4) // T.int64(2) * T.int64(16) + ax1_1_fused_0_ax1_1_fused_1_fused)
-                                v2 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused // T.int64(4))
-                                v3 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(2) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused)
-                                T.reads(p0[v0, v1, v2, v3])
-                                T.writes(p0_shared[v0, v1, v2, v3])
-                                p0_shared[v0, v1, v2, v3] = p0[v0, v1, v2, v3]
-                    for ax0_ax1_fused_0 in range(T.int64(16)):
-                        for ax0_ax1_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"):
-                            with T.block("T_layout_trans"):
-                                v_ax0 = T.axis.spatial(T.int64(1), T.int64(0))
-                                v_ax1 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused // T.int64(4))
-                                v_ax2 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(2) * T.int64(16) + (ax0_ax1_fused_0 * T.int64(16) + ax0_ax1_fused_1) // T.int64(16))
-                                v_ax3 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(4) // T.int64(2) * T.int64(16) + (ax0_ax1_fused_0 * T.int64(16) + ax0_ax1_fused_1) % T.int64(16))
-                                T.reads(p0_shared[v_ax0, v_ax3, v_ax1, v_ax2])
-                                T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3])
-                                T.block_attr({"dst_layout": "NHWC", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"})
-                                T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3] = T.if_then_else(v_ax0 < T.int64(1) and v_ax3 < T.int64(32) and v_ax1 < T.int64(32) and v_ax2 < T.int64(32), p0_shared[v_ax0, v_ax3, v_ax1, v_ax2], T.float16(0))
-
-        self.assert_extracted_equals_expected(mod, ExpectedModule, 16)
-
-    def test_simple_implicit_reshape(self):
-        mod = create_relay_module([1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")])
-
-        # Main things to notice:
-        # - two blocks each with 16, 16 extents which write/read shared mem
-        # - coalesced accesses in inner loop of global memory buffer for both
-        # - an implicit reshape is done (see p0_shared)
-        # fmt: off
-        @I.ir_module
-        class ExpectedModule:
-            @T.prim_func
-            def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")):
-                T.func_attr({"global_symbol": "main", "tir.noalias": True})
-                # with T.block("root"):
-                p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(4), T.int64(32), T.int64(32)), "float16", scope="shared")
-                for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"):
-                    for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"):
-                        for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)):
-                            with T.block("p0_shared"):
-                                v_ax0 = T.axis.spatial(T.int64(1), T.int64(0))
-                                v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16))
-                                v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused // T.int64(8))
-                                v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused)
-                                v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4))
-                                T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3])
-                                T.writes(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3])
-                                p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]
-                    for ax0_ax1_ax2_fused_0 in range(T.int64(16)):
-                        for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"):
-                            with T.block("T_layout_trans"):
-                                v_ax0 = T.axis.spatial(T.int64(1), T.int64(0))
-                                v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16))
-                                v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128))
-                                v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4))
-                                v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4))
-                                T.reads(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3])
-                                T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4])
-                                T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"})
-                                T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], T.float16(0))
-        self.assert_extracted_equals_expected(mod, ExpectedModule, 16)
-
-    def test_expected_fusion_post(self):
-        mod = create_relay_module(
-            [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c"), OpPattern.BROADCAST]
-        )
-
-        # Main things to notice:
-        # - two blocks each with 16, 16 extents which write/read shared mem
-        # - coalesced accesses in inner loop of global memory buffer for both
-        # - an implicit reshape is done (see p0_shared)
-        # - an addition is inlined in the final block (p1 input)
-        # fmt: off
-        @I.ir_module
-        class ExpectedModule:
-            @T.prim_func
-            def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), p1: T.Buffer((), "float16"), T_add: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")):
-                T.func_attr({"global_symbol": "main", "tir.noalias": True})
-                # with T.block("root"):
-                p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(4), T.int64(32), T.int64(32)), "float16", scope="shared")
-                for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"):
-                    for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"):
-                        for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)):
-                            with T.block("p0_shared"):
-                                v_ax0 = T.axis.spatial(T.int64(1), T.int64(0))
-                                v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16))
-                                v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused // T.int64(8))
-                                v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused)
-                                v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4))
-                                T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3])
-                                T.writes(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3])
-                                p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]
-                    for ax0_ax1_ax2_fused_0 in range(T.int64(16)):
-                        for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"):
-                            with T.block("T_layout_trans"):
-                                v_ax0 = T.axis.spatial(T.int64(1), T.int64(0))
-                                v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16))
-                                v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128))
-                                v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4))
-                                v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4))
-                                T.reads(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], p1[()])
-                                T.writes(T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4])
-                                T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"})
-                                T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], T.float16(0)) + p1[()]
-        self.assert_extracted_equals_expected(mod, ExpectedModule, 16)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/meta_schedule/test_meta_schedule_space_cpu_winograd.py b/tests/python/meta_schedule/test_meta_schedule_space_cpu_winograd.py
deleted file mode 100644
index 1da2794a4cc6..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_space_cpu_winograd.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for MetaSchedule search space on CPU"""
-from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.space_generation import (
-    check_sketches,
-    generate_design_space,
-)
-from tvm.meta_schedule.testing.te_workload import create_te_workload
-from tvm.script import tir as T
-from tvm.target import Target
-
-
-def _target():
-    return Target("aws/cpu/c5.9xlarge")
-
-
-def _design_space(mod):
-    return generate_design_space(
-        kind="llvm",
-        mod=mod,
-        target=_target(),
-        types=ms.ScheduleRule,
-    )
-
-
-def test_cpu_nhwc():
-    # fmt: off
-    @T.prim_func
-    def cpu_nhwc_0(X: T.Buffer((1, 14, 14, 128), "float32"), W: T.Buffer((6, 6, 128, 128), "float32"), conv2d_winograd: T.Buffer((1, 12, 12, 128), "float32")) -> None:
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
-        # body
-        with T.block("root"):
-            T.reads()
-            T.writes()
-            T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":64, "meta_schedule.vectorize":64})
-            data_pad = T.alloc_buffer([1, 16, 16, 128], dtype="float32")
-            input_tile = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
-            data_pack = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
-            bgemm = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
-            inverse = T.alloc_buffer([4, 4, 9, 128], dtype="float32")
-            bgemm_global = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
-            for i2_0 in T.serial(9):
-                for ax0, ax1, ax2, ax3 in T.grid(1, 6, 6, 128):
-                    with T.block("data_pad"):
-                        i0 = T.axis.spatial(1, ax0)
-                        i1 = T.axis.spatial(16, i2_0 // 3 * 4 + ax1)
-                        i2 = T.axis.spatial(16, i2_0 % 3 * 4 + ax2)
-                        i3 = T.axis.spatial(128, ax3)
-                        T.reads(X[i0, i1, i2, i3])
-                        T.writes(data_pad[i0, i1, i2, i3])
-                        T.block_attr({"schedule_rule":"None"})
-                        data_pad[i0, i1, i2, i3] = T.if_then_else(0 <= i1 and i1 < 14 and 0 <= i2 and i2 < 14, X[i0, i1, i2, i3], T.float32(0), dtype="float32")
-                for i3_0 in T.serial(2):
-                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 64):
-                        with T.block("input_tile"):
-                            eps, nu = T.axis.remap("SS", [ax0, ax1])
-                            p = T.axis.spatial(9, i2_0 + ax2)
-                            ci = T.axis.spatial(128, i3_0 * 64 + ax3)
-                            T.reads(data_pad[p // 9, p % 9 // 3 * 4 + eps, p % 3 * 4 + nu, ci])
-                            T.writes(input_tile[eps, nu, p, ci])
-                            T.block_attr({"schedule_rule":"None"})
-                            input_tile[eps, nu, p, ci] = data_pad[p // 9, p % 9 // 3 * 4 + eps, p % 3 * 4 + nu, ci]
-                    for i2_1, i3_1 in T.grid(1, 64):
-                        for i0 in T.unroll(6):
-                            for i1 in T.unroll(6):
-                                for i4 in T.unroll(6):
-                                    for i5 in T.unroll(6):
-                                        with T.block("data_pack"):
-                                            eps, nu = T.axis.remap("SS", [i0, i1])
-                                            p = T.axis.spatial(9, i2_0 + i2_1)
-                                            ci = T.axis.spatial(128, i3_0 * 64 + i3_1)
-                                            r_a, r_b = T.axis.remap("RR", [i4, i5])
-                                            T.reads(input_tile[r_a, r_b, p, ci])
-                                            T.writes(data_pack[eps, nu, p, ci])
-                                            T.block_attr({"schedule_rule":"conv2d_nhwc_winograd_data_pack"})
-                                            with T.init():
-                                                data_pack[eps, nu, p, ci] = T.float32(0)
-                                            data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), 
T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), 
T.Select(r_b % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
-            for i0_0, i1_0, i2_0, i3_0, i0_1, i1_1, i2_1, i3_1 in T.grid(3, 2, 3, 1, 1, 1, 1, 1):
-                for i4_0, i0_2, i1_2, i2_2, i3_2, i4_1, i0_3, i1_3, i2_3, i3_3 in T.grid(32, 1, 1, 1, 2, 4, 2, 3, 3, 64):
-                    with T.block("bgemm"):
-                        eps = T.axis.spatial(6, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3)
-                        nu = T.axis.spatial(6, i1_0 * 3 + i1_1 * 3 + i1_2 * 3 + i1_3)
-                        p = T.axis.spatial(9, i2_0 * 3 + i2_1 * 3 + i2_2 * 3 + i2_3)
-                        co = T.axis.spatial(128, i3_0 * 128 + i3_1 * 128 + i3_2 * 64 + i3_3)
-                        ci = T.axis.reduce(128, i4_0 * 4 + i4_1)
-                        T.reads(data_pack[eps, nu, p, ci], W[eps, nu, co, ci])
-                        T.writes(bgemm_global[eps, nu, p, co])
-                        T.block_attr({"meta_schedule.tiling_structure":"SSRSRS", "meta_schedule.write_cache_level":[2]})
-                        with T.init():
-                            bgemm_global[eps, nu, p, co] = T.float32(0)
-                        bgemm_global[eps, nu, p, co] = bgemm_global[eps, nu, p, co] + data_pack[eps, nu, p, ci] * W[eps, nu, co, ci]
-                for ax0, ax1, ax2, ax3 in T.grid(2, 3, 3, 128):
-                    with T.block("bgemm_global"):
-                        v0 = T.axis.spatial(6, i0_0 * 2 + ax0)
-                        v1 = T.axis.spatial(6, i1_0 * 3 + ax1)
-                        v2 = T.axis.spatial(9, i2_0 * 3 + ax2)
-                        v3 = T.axis.spatial(128, ax3)
-                        T.reads(bgemm_global[v0, v1, v2, v3])
-                        T.writes(bgemm[v0, v1, v2, v3])
-                        bgemm[v0, v1, v2, v3] = bgemm_global[v0, v1, v2, v3]
-            for i2_0, i3_0, i2_1, i3_1 in T.grid(3, 8, 3, 16):
-                for i0 in T.unroll(4):
-                    for i1 in T.unroll(4):
-                        for i4 in T.unroll(6):
-                            for i5 in T.unroll(6):
-                                with T.block("inverse"):
-                                    vh, vw = T.axis.remap("SS", [i0, i1])
-                                    p = T.axis.spatial(9, i2_0 * 3 + i2_1)
-                                    co = T.axis.spatial(128, i3_0 * 16 + i3_1)
-                                    r_a, r_b = T.axis.remap("RR", [i4, i5])
-                                    T.reads(bgemm[r_a, r_b, p, co])
-                                    T.writes(inverse[vh, vw, p, co])
-                                    T.block_attr({"schedule_rule":"conv2d_nhwc_winograd_inverse"})
-                                    with T.init():
-                                        inverse[vh, vw, p, co] = T.float32(0)
-                                    inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 
and vw % 4 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
-            for i0, i1, i2, i3 in T.grid(1, 12, 12, 128):
-                with T.block("conv2d_winograd"):
-                    n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                    T.reads(inverse[h % 4, w % 4, n * 9 + h // 4 * 3 + w // 4, co])
-                    T.writes(conv2d_winograd[n, h, w, co])
-                    conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 9 + h // 4 * 3 + w // 4, co]
-    # fmt: on
-    decision_0 = [
-        ("SamplePerfectTile", [3, 3]),
-        ("SamplePerfectTile", [8, 16]),
-        ("SamplePerfectTile", [9, 1]),
-        ("SamplePerfectTile", [2, 64]),
-        ("SampleComputeLocation", 1),
-        ("SampleComputeLocation", 0),
-        ("SamplePerfectTile", [3, 1, 1, 2]),
-        ("SamplePerfectTile", [2, 1, 1, 3]),
-        ("SamplePerfectTile", [3, 1, 1, 3]),
-        ("SamplePerfectTile", [1, 1, 2, 64]),
-        ("SamplePerfectTile", [32, 4]),
-        ("SampleCategorical", 2),
-    ]
-    with _target():
-        mod = create_te_workload("C2D_WIN_NHWC", 0)
-    actual = _design_space(mod)
-    check_sketches(
-        mod,
-        sketches=actual,
-        expected_mods=[cpu_nhwc_0],
-        expected_decisions=[decision_0],
-    )
-
-
-if __name__ == "__main__":
-    test_cpu_nhwc()
diff --git a/tests/python/meta_schedule/test_meta_schedule_space_cuda_winograd.py b/tests/python/meta_schedule/test_meta_schedule_space_cuda_winograd.py
deleted file mode 100644
index 844f0216bd03..000000000000
--- a/tests/python/meta_schedule/test_meta_schedule_space_cuda_winograd.py
+++ /dev/null
@@ -1,593 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for MetaSchedule search space on CUDA"""
-from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.space_generation import (
-    check_sketches,
-    generate_design_space,
-    print_sketches,
-)
-from tvm.meta_schedule.testing.te_workload import create_te_workload
-from tvm.script import tir as T
-from tvm.target import Target
-
-
-def _target():
-    return Target("nvidia/geforce-rtx-2080")  # disable async trace using sm75
-
-
-def _design_space(mod):
-    return generate_design_space(
-        kind="cuda",
-        mod=mod,
-        target=_target(),
-        types=ms.ScheduleRule,
-    )
-
-
-def test_cuda_nhwc():
-    # fmt: off
-    @T.prim_func
-    def cuda_nhwc_0(data: T.Buffer((1, 14, 14, 128), "float32"), weight: T.Buffer((6, 6, 128, 128), "float32"), conv2d_winograd: T.Buffer((1, 12, 12, 128), "float32")) -> None:
-        T.func_attr({"global_symbol": "main", "layout_free_buffers": [1], "tir.noalias": T.bool(True)})
-        with T.block("root"):
-            T.reads()
-            T.writes()
-            T.block_attr({"meta_schedule.unroll_explicit": 16})
-            input_tile_local = T.alloc_buffer((6, 6, 9, 128), scope="local")
-            data_pack = T.alloc_buffer((6, 6, 9, 128))
-            bgemm = T.alloc_buffer((6, 6, 9, 128))
-            inverse = T.alloc_buffer((4, 4, 9, 128))
-            data_pack_local = T.alloc_buffer((6, 6, 9, 128), scope="local")
-            bgemm_local = T.alloc_buffer((6, 6, 9, 128), scope="local")
-            data_pack_shared = T.alloc_buffer((6, 6, 9, 128), scope="shared")
-            weight_shared = T.alloc_buffer((6, 6, 128, 128), scope="shared")
-            for p_0_ci_0_p_1_ci_1_fused_0 in T.thread_binding(2, thread="blockIdx.x"):
-                for p_0_ci_0_p_1_ci_1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"):
-                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
-                        with T.block("input_tile"):
-                            v_eps, v_nu = T.axis.remap("SS", [ax0, ax1])
-                            v_p = T.axis.spatial(9, (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) // 384 * 3 + (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 24 // 8 + ax2)
-                            v_ci = T.axis.spatial(128, (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 384 // 24 * 8 + (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 8 + ax3)
-                            T.where(p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1 < 1152)
-                            T.reads(data[v_p // 9, v_p % 9 // 3 * 4 + v_eps, v_p % 3 * 4 + v_nu, v_ci])
-                            T.writes(input_tile_local[v_eps, v_nu, v_p, v_ci])
-                            T.block_attr({"schedule_rule": "None"})
-                            input_tile_local[v_eps, v_nu, v_p, v_ci] = T.if_then_else(0 <= v_p % 9 // 3 * 4 + v_eps and v_p % 9 // 3 * 4 + v_eps < 14 and 0 <= v_p % 3 * 4 + v_nu and v_p % 3 * 4 + v_nu < 14, data[v_p // 9, v_p % 9 // 3 * 4 + v_eps, v_p % 3 * 4 + v_nu, v_ci], T.float32(0))
-                    for eps in T.unroll(6):
-                        for nu in T.unroll(6):
-                            for r_a in T.unroll(6):
-                                for r_b in T.unroll(6):
-                                    with T.block("data_pack"):
-                                        v_eps, v_nu = T.axis.remap("SS", [eps, nu])
-                                        v_p = T.axis.spatial(9, (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) // 384 * 3 + (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 24 // 8)
-                                        v_ci = T.axis.spatial(128, (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 384 // 24 * 8 + (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 8)
-                                        v_r_a, v_r_b = T.axis.remap("RR", [r_a, r_b])
-                                        T.where(p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1 < 1152)
-                                        T.reads(input_tile_local[v_r_a, v_r_b, v_p, v_ci])
-                                        T.writes(data_pack_local[v_eps, v_nu, v_p, v_ci])
-                                        T.block_attr({"schedule_rule": "conv2d_nhwc_winograd_data_pack"})
-                                        with T.init():
-                                            data_pack_local[v_eps, v_nu, v_p, v_ci] = T.float32(0)
-                                        data_pack_local[v_eps, v_nu, v_p, v_ci] = data_pack_local[v_eps, v_nu, v_p, v_ci] + input_tile_local[v_r_a, v_r_b, v_p, v_ci] * T.Select(v_r_a % 6 == 5 and v_eps % 6 == 5, T.float32(1), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 4, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 3, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 2, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 1, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 0, T.float32(0), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 5, T.float32(1.5), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 4, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 3, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 2, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 1, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 0, T.float32(1), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 5, T.float32(-2), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 4, T.float32(-0.5), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 3, T.float32(2), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 2, T.float32(2.5), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 1, T.float32(0.5), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 0, T.float32(1.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 5, T.float32(-1.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 4, T.float32(-1), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 3, T.float32(-1), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 2, T.float32(0.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 1, T.float32(-2.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 0, T.float32(-2), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 5, T.float32(1), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 4, T.float32(0.5), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 3, T.float32(-2), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 2, T.float32(-1), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 1, T.float32(1), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 0, T.float32(-1.5), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 5, T.float32(0), 
T.Select(v_r_a % 6 == 0 and v_eps % 6 == 4, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 3, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 2, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 1, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(v_r_b % 6 == 5 and v_nu % 6 == 5, T.float32(1), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 4, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 3, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 2, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 1, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 0, T.float32(0), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 5, T.float32(1.5), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 4, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 3, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 2, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 1, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 0, T.float32(1), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 5, T.float32(-2), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 4, T.float32(-0.5), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 3, T.float32(2), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 2, T.float32(2.5), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 1, T.float32(0.5), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 0, T.float32(1.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 5, T.float32(-1.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 4, T.float32(-1), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 3, T.float32(-1), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 2, T.float32(0.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 1, T.float32(-2.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 0, T.float32(-2), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 5, T.float32(1), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 4, T.float32(0.5), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 3, T.float32(-2), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 2, T.float32(-1), T.Select(v_r_b % 6 == 1 and v_nu % 
6 == 1, T.float32(1), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 0, T.float32(-1.5), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 5, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 4, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 3, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 2, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 1, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
-                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
-                        with T.block("data_pack_local"):
-                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
-                            v2 = T.axis.spatial(9, (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) // 384 * 3 + (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 24 // 8 + ax2)
-                            v3 = T.axis.spatial(128, (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 384 // 24 * 8 + (p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1) % 8 + ax3)
-                            T.where(p_0_ci_0_p_1_ci_1_fused_0 * 1024 + p_0_ci_0_p_1_ci_1_fused_1 < 1152)
-                            T.reads(data_pack_local[v0, v1, v2, v3])
-                            T.writes(data_pack[v0, v1, v2, v3])
-                            data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
-            for eps_0_nu_0_p_0_co_0_fused in T.thread_binding(96, thread="blockIdx.x"):
-                for eps_1_nu_1_p_1_co_1_fused in T.thread_binding(4, thread="vthread.x"):
-                    for eps_2_nu_2_p_2_co_2_fused in T.thread_binding(27, thread="threadIdx.x"):
-                        for ci_0 in range(8):
-                            for ax0_ax1_ax2_ax3_fused in range(1728):
-                                with T.block("data_pack_shared"):
-                                    v0 = T.axis.spatial(6, eps_0_nu_0_p_0_co_0_fused // 32 * 2 + ax0_ax1_ax2_ax3_fused // 864)
-                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 864 // 144)
-                                    v2 = T.axis.spatial(9, ax0_ax1_ax2_ax3_fused % 144 // 16)
-                                    v3 = T.axis.spatial(128, ci_0 * 16 + ax0_ax1_ax2_ax3_fused % 16)
-                                    T.reads(data_pack[v0, v1, v2, v3])
-                                    T.writes(data_pack_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch": 1})
-                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
-                            for ax0_ax1_ax2_ax3_fused in range(768):
-                                with T.block("weight_shared"):
-                                    v0 = T.axis.spatial(6, eps_0_nu_0_p_0_co_0_fused // 32 * 2 + ax0_ax1_ax2_ax3_fused // 384)
-                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 384 // 64)
-                                    v2 = T.axis.spatial(128, eps_0_nu_0_p_0_co_0_fused % 32 * 4 + ax0_ax1_ax2_ax3_fused % 64 // 16)
-                                    v3 = T.axis.spatial(128, ci_0 * 16 + ax0_ax1_ax2_ax3_fused % 16)
-                                    T.reads(weight[v0, v1, v2, v3])
-                                    T.writes(weight_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch": 3})
-                                    weight_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3]
-                            for ci_1, eps_3, nu_3, p_3, co_3, ci_2, eps_4, nu_4, p_4, co_4 in T.grid(1, 2, 1, 1, 2, 16, 1, 1, 1, 1):
-                                with T.block("bgemm"):
-                                    v_eps = T.axis.spatial(6, eps_0_nu_0_p_0_co_0_fused // 32 * 2 + eps_3 + eps_4)
-                                    v_nu = T.axis.spatial(6, eps_1_nu_1_p_1_co_1_fused // 2 * 3 + eps_2_nu_2_p_2_co_2_fused // 9 + nu_3 + nu_4)
-                                    v_p = T.axis.spatial(9, eps_2_nu_2_p_2_co_2_fused % 9 + p_3 + p_4)
-                                    v_co = T.axis.spatial(128, eps_0_nu_0_p_0_co_0_fused % 32 * 4 + eps_1_nu_1_p_1_co_1_fused % 2 * 2 + co_3 + co_4)
-                                    v_ci = T.axis.reduce(128, ci_0 * 16 + ci_1 * 16 + ci_2)
-                                    T.reads(data_pack_shared[v_eps, v_nu, v_p, v_ci], weight_shared[v_eps, v_nu, v_co, v_ci])
-                                    T.writes(bgemm_local[v_eps, v_nu, v_p, v_co])
-                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 32, "meta_schedule.tiling_structure": "SSSRRSRS", "meta_schedule.write_cache_level": [3]})
-                                    with T.init():
-                                        bgemm_local[v_eps, v_nu, v_p, v_co] = T.float32(0)
-                                    bgemm_local[v_eps, v_nu, v_p, v_co] = bgemm_local[v_eps, v_nu, v_p, v_co] + data_pack_shared[v_eps, v_nu, v_p, v_ci] * weight_shared[v_eps, v_nu, v_co, v_ci]
-                        for ax0, ax1, ax2, ax3 in T.grid(2, 1, 1, 2):
-                            with T.block("bgemm_local"):
-                                v0 = T.axis.spatial(6, eps_0_nu_0_p_0_co_0_fused // 32 * 2 + ax0)
-                                v1 = T.axis.spatial(6, eps_1_nu_1_p_1_co_1_fused // 2 * 3 + eps_2_nu_2_p_2_co_2_fused // 9 + ax1)
-                                v2 = T.axis.spatial(9, eps_2_nu_2_p_2_co_2_fused % 9 + ax2)
-                                v3 = T.axis.spatial(128, eps_0_nu_0_p_0_co_0_fused % 32 * 4 + eps_1_nu_1_p_1_co_1_fused % 2 * 2 + ax3)
-                                T.reads(bgemm_local[v0, v1, v2, v3])
-                                T.writes(bgemm[v0, v1, v2, v3])
-                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
-            for p_0_co_0_p_1_co_1_fused_0 in T.thread_binding(18, thread="blockIdx.x"):
-                for p_0_co_0_p_1_co_1_fused_1 in T.thread_binding(64, thread="threadIdx.x"):
-                    for vh in T.unroll(4):
-                        for vw in T.unroll(4):
-                            for r_a in T.unroll(6):
-                                for r_b in T.unroll(6):
-                                    with T.block("inverse"):
-                                        v_vh, v_vw = T.axis.remap("SS", [vh, vw])
-                                        v_p = T.axis.spatial(9, (p_0_co_0_p_1_co_1_fused_0 * 64 + p_0_co_0_p_1_co_1_fused_1) // 384 * 3 + (p_0_co_0_p_1_co_1_fused_0 * 64 + p_0_co_0_p_1_co_1_fused_1) % 24 // 8)
-                                        v_co = T.axis.spatial(128, (p_0_co_0_p_1_co_1_fused_0 * 64 + p_0_co_0_p_1_co_1_fused_1) % 384 // 24 * 8 + (p_0_co_0_p_1_co_1_fused_0 * 64 + p_0_co_0_p_1_co_1_fused_1) % 8)
-                                        v_r_a, v_r_b = T.axis.remap("RR", [r_a, r_b])
-                                        T.reads(bgemm[v_r_a, v_r_b, v_p, v_co])
-                                        T.writes(inverse[v_vh, v_vw, v_p, v_co])
-                                        T.block_attr({"schedule_rule": "conv2d_nhwc_winograd_inverse"})
-                                        with T.init():
-                                            inverse[v_vh, v_vw, v_p, v_co] = T.float32(0)
-                                        inverse[v_vh, v_vw, v_p, v_co] = inverse[v_vh, v_vw, v_p, v_co] + bgemm[v_r_a, v_r_b, v_p, v_co] * T.Select(v_r_a % 6 == 5 and v_vh % 4 == 3, T.float32(1), T.Select(v_r_a % 6 == 5 and v_vh % 4 == 2, T.float32(0), T.Select(v_r_a % 6 == 5 and v_vh % 4 == 1, T.float32(0), T.Select(v_r_a % 6 == 5 and v_vh % 4 == 0, T.float32(0), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 3, T.float32(-8), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 2, T.float32(4), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 1, T.float32(-2), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 3, T.float32(0.125), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 2, T.float32(0.25), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 1, T.float32(0.5), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 3, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 2, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 1, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 3, T.float32(-1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 2, T.float32(1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 1, T.float32(-1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 3, T.float32(0), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 2, T.float32(0), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 1, T.float32(0), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(v_r_b % 6 == 5 and v_vw % 4 == 3, T.float32(1), T.Select(v_r_b % 6 == 5 and v_vw % 4 == 2, T.float32(0), T.Select(v_r_b % 6 == 5 and v_vw % 4 == 1, T.float32(0), T.Select(v_r_b % 6 == 5 and v_vw % 4 == 0, T.float32(0), T.Select(v_r_b % 6 == 4 and v_vw % 4 == 3, T.float32(-8), T.Select(v_r_b % 6 == 4 and v_vw % 4 == 2, T.float32(4), T.Select(v_r_b % 6 == 4 and v_vw % 4 == 1, T.float32(-2), T.Select(v_r_b % 6 == 4 and v_vw % 4 
== 0, T.float32(1), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 3, T.float32(0.125), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 2, T.float32(0.25), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 1, T.float32(0.5), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 3, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 2, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 1, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 3, T.float32(-1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 2, T.float32(1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 1, T.float32(-1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 3, T.float32(0), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 2, T.float32(0), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 1, T.float32(0), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
-            for n_h_w_co_fused_0 in T.thread_binding(144, thread="blockIdx.x"):
-                for n_h_w_co_fused_1 in T.thread_binding(128, thread="threadIdx.x"):
-                    with T.block("conv2d_winograd"):
-                        v_n = T.axis.spatial(1, 0)
-                        v_h = T.axis.spatial(12, (n_h_w_co_fused_0 * 128 + n_h_w_co_fused_1) // 1536)
-                        v_w = T.axis.spatial(12, (n_h_w_co_fused_0 * 128 + n_h_w_co_fused_1) % 1536 // 128)
-                        v_co = T.axis.spatial(128, (n_h_w_co_fused_0 * 128 + n_h_w_co_fused_1) % 128)
-                        T.reads(inverse[v_h % 4, v_w % 4, v_n * 9 + v_h // 4 * 3 + v_w // 4, v_co])
-                        T.writes(conv2d_winograd[v_n, v_h, v_w, v_co])
-                        conv2d_winograd[v_n, v_h, v_w, v_co] = inverse[v_h % 4, v_w % 4, v_n * 9 + v_h // 4 * 3 + v_w // 4, v_co]
-    # fmt: on
-    decision_0 = [
-        ("SamplePerfectTile", [3, 3]),
-        ("SamplePerfectTile", [16, 8]),
-        ("SampleCategorical", 1),
-        ("SamplePerfectTile", [3, 3]),
-        ("SamplePerfectTile", [16, 8]),
-        ("SampleCategorical", 5),
-        ("SamplePerfectTile", [3, 1, 1, 2, 1]),
-        ("SamplePerfectTile", [1, 2, 3, 1, 1]),
-        ("SamplePerfectTile", [1, 1, 9, 1, 1]),
-        ("SamplePerfectTile", [32, 2, 1, 2, 1]),
-        ("SamplePerfectTile", [8, 1, 16]),
-        ("SampleCategorical", 0),
-        ("SampleCategorical", 2),
-        ("SampleCategorical", 1),
-        ("SampleCategorical", 2),
-    ]
-    with _target():
-        mod = create_te_workload("C2D_WIN_NHWC", 0)
-    actual = _design_space(mod)
-    check_sketches(
-        mod,
-        sketches=actual,
-        expected_mods=[cuda_nhwc_0],
-        expected_decisions=[decision_0],
-    )
-
-
-def test_cuda_nchw():
-    # fmt: off
-    @T.prim_func
-    def cuda_nchw_0(data: T.Buffer((1, 64, 56, 56), "float32"), weight: T.Buffer((6, 6, 64, 64), "float32"), conv2d_winograd: T.Buffer((1, 64, 56, 56), "float32")) -> None:
-        T.func_attr({"global_symbol": "main", "layout_free_buffers": [1], "tir.noalias": T.bool(True)})
-        with T.block("root"):
-            T.reads()
-            T.writes()
-            T.block_attr({"meta_schedule.unroll_explicit": 16})
-            input_tile_local = T.alloc_buffer((64, 196, 6, 6), scope="local")
-            data_pack = T.alloc_buffer((6, 6, 64, 196))
-            bgemm = T.alloc_buffer((6, 6, 64, 196))
-            inverse_local = T.alloc_buffer((64, 196, 4, 4), scope="local")
-            data_pack_local = T.alloc_buffer((6, 6, 64, 196), scope="local")
-            bgemm_local = T.alloc_buffer((6, 6, 64, 196), scope="local")
-            data_pack_shared = T.alloc_buffer((6, 6, 64, 196), scope="shared")
-            weight_shared = T.alloc_buffer((6, 6, 64, 64), scope="shared")
-            for ci_p_fused_0 in T.thread_binding(25, thread="blockIdx.x"):
-                for ci_p_fused_1 in T.thread_binding(512, thread="threadIdx.x"):
-                    for ax0, ax1, ax2, ax3 in T.grid(1, 1, 6, 6):
-                        with T.block("input_tile"):
-                            v_ci = T.axis.spatial(64, (ci_p_fused_0 * 512 + ci_p_fused_1) // 196 + ax0)
-                            v_p = T.axis.spatial(196, (ci_p_fused_0 * 120 + ci_p_fused_1) % 196 + ax1)
-                            v_eps, v_nu = T.axis.remap("SS", [ax2, ax3])
-                            T.where(ci_p_fused_0 * 512 + ci_p_fused_1 < 12544)
-                            T.reads(data[v_p // 196, v_ci, v_p % 196 // 14 * 4 + v_eps - 1, v_p % 14 * 4 + v_nu - 1])
-                            T.writes(input_tile_local[v_ci, v_p, v_eps, v_nu])
-                            T.block_attr({"schedule_rule": "None"})
-                            input_tile_local[v_ci, v_p, v_eps, v_nu] = T.if_then_else(1 <= v_p % 196 // 14 * 4 + v_eps and v_p % 196 // 14 * 4 + v_eps < 57 and 1 <= v_p % 14 * 4 + v_nu and v_p % 14 * 4 + v_nu < 57, data[v_p // 196, v_ci, v_p % 196 // 14 * 4 + v_eps - 1, v_p % 14 * 4 + v_nu - 1], T.float32(0))
-                    for eps in T.unroll(6):
-                        for nu in T.unroll(6):
-                            for r_a in T.unroll(6):
-                                for r_b in T.unroll(6):
-                                    with T.block("data_pack"):
-                                        v_eps, v_nu = T.axis.remap("SS", [eps, nu])
-                                        v_ci = T.axis.spatial(64, (ci_p_fused_0 * 512 + ci_p_fused_1) // 196)
-                                        v_p = T.axis.spatial(196, (ci_p_fused_0 * 512 + ci_p_fused_1) % 196)
-                                        v_r_a, v_r_b = T.axis.remap("RR", [r_a, r_b])
-                                        T.where(ci_p_fused_0 * 512 + ci_p_fused_1 < 12544)
-                                        T.reads(input_tile_local[v_ci, v_p, v_r_a, v_r_b])
-                                        T.writes(data_pack_local[v_eps, v_nu, v_ci, v_p])
-                                        T.block_attr({"schedule_rule": "conv2d_nchw_winograd_data_pack"})
-                                        with T.init():
-                                            data_pack_local[v_eps, v_nu, v_ci, v_p] = T.float32(0)
-                                        data_pack_local[v_eps, v_nu, v_ci, v_p] = data_pack_local[v_eps, v_nu, v_ci, v_p] + input_tile_local[v_ci, v_p, v_r_a, v_r_b] * T.Select(v_r_a % 6 == 5 and v_eps % 6 == 5, T.float32(1), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 4, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 3, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 2, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 1, T.float32(0), T.Select(v_r_a % 6 == 5 and v_eps % 6 == 0, T.float32(0), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 5, T.float32(1.5), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 4, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 3, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 2, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 1, T.float32(1), T.Select(v_r_a % 6 == 4 and v_eps % 6 == 0, T.float32(1), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 5, T.float32(-2), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 4, T.float32(-0.5), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 3, T.float32(2), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 2, T.float32(2.5), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 1, T.float32(0.5), T.Select(v_r_a % 6 == 3 and v_eps % 6 == 0, T.float32(1.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 5, T.float32(-1.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 4, T.float32(-1), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 3, T.float32(-1), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 2, T.float32(0.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 1, T.float32(-2.5), T.Select(v_r_a % 6 == 2 and v_eps % 6 == 0, T.float32(-2), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 5, T.float32(1), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 4, T.float32(0.5), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 3, T.float32(-2), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 2, T.float32(-1), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 1, T.float32(1), T.Select(v_r_a % 6 == 1 and v_eps % 6 == 0, T.float32(-1.5), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 5, T.float32(0), 
T.Select(v_r_a % 6 == 0 and v_eps % 6 == 4, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 3, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 2, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 1, T.float32(0), T.Select(v_r_a % 6 == 0 and v_eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(v_r_b % 6 == 5 and v_nu % 6 == 5, T.float32(1), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 4, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 3, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 2, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 1, T.float32(0), T.Select(v_r_b % 6 == 5 and v_nu % 6 == 0, T.float32(0), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 5, T.float32(1.5), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 4, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 3, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 2, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 1, T.float32(1), T.Select(v_r_b % 6 == 4 and v_nu % 6 == 0, T.float32(1), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 5, T.float32(-2), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 4, T.float32(-0.5), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 3, T.float32(2), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 2, T.float32(2.5), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 1, T.float32(0.5), T.Select(v_r_b % 6 == 3 and v_nu % 6 == 0, T.float32(1.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 5, T.float32(-1.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 4, T.float32(-1), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 3, T.float32(-1), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 2, T.float32(0.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 1, T.float32(-2.5), T.Select(v_r_b % 6 == 2 and v_nu % 6 == 0, T.float32(-2), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 5, T.float32(1), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 4, T.float32(0.5), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 3, T.float32(-2), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 2, T.float32(-1), T.Select(v_r_b % 6 == 1 and v_nu % 
6 == 1, T.float32(1), T.Select(v_r_b % 6 == 1 and v_nu % 6 == 0, T.float32(-1.5), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 5, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 4, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 3, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 2, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 1, T.float32(0), T.Select(v_r_b % 6 == 0 and v_nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
-                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
-                        with T.block("data_pack_local"):
-                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
-                            v2 = T.axis.spatial(64, (ci_p_fused_0 * 512 + ci_p_fused_1) // 196 + ax2)
-                            v3 = T.axis.spatial(196, (ci_p_fused_0 * 120 + ci_p_fused_1) % 196 + ax3)
-                            T.where(ci_p_fused_0 * 512 + ci_p_fused_1 < 12544)
-                            T.reads(data_pack_local[v0, v1, v2, v3])
-                            T.writes(data_pack[v0, v1, v2, v3])
-                            data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
-            for eps_0_nu_0_co_0_p_0_fused in T.thread_binding(14, thread="blockIdx.x"):
-                for eps_1_nu_1_co_1_p_1_fused in T.thread_binding(224, thread="vthread.x"):
-                    for eps_2_nu_2_co_2_p_2_fused in T.thread_binding(2, thread="threadIdx.x"):
-                        for ci_0 in range(2):
-                            for ax0_ax1_ax2_ax3_fused in range(32256):
-                                with T.block("data_pack_shared"):
-                                    v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 5376)
-                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 5376 // 896)
-                                    v2 = T.axis.spatial(64, ci_0 * 32 + ax0_ax1_ax2_ax3_fused % 896 // 28)
-                                    v3 = T.axis.spatial(196, eps_0_nu_0_co_0_p_0_fused % 7 * 28 + ax0_ax1_ax2_ax3_fused % 28)
-                                    T.reads(data_pack[v0, v1, v2, v3])
-                                    T.writes(data_pack_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch": 4})
-                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
-                            for ax0_ax1_ax2_ax3_fused in range(36864):
-                                with T.block("weight_shared"):
-                                    v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 6144)
-                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 6144 // 1024)
-                                    v2 = T.axis.spatial(64, ci_0 * 32 + ax0_ax1_ax2_ax3_fused % 1024 // 32)
-                                    v3 = T.axis.spatial(64, eps_0_nu_0_co_0_p_0_fused // 7 * 32 + ax0_ax1_ax2_ax3_fused % 32)
-                                    T.reads(weight[v0, v1, v2, v3])
-                                    T.writes(weight_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch": 3})
-                                    weight_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3]
-                            for ci_1, eps_3, nu_3, co_3, p_3, ci_2, eps_4, nu_4, co_4, p_4 in T.grid(16, 2, 3, 1, 4, 2, 3, 1, 1, 1):
-                                with T.block("bgemm"):
-                                    v_eps = T.axis.spatial(6, eps_3 * 3 + eps_4)
-                                    v_nu = T.axis.spatial(6, eps_1_nu_1_co_1_p_1_fused // 112 * 3 + nu_3 + nu_4)
-                                    v_co = T.axis.spatial(64, eps_0_nu_0_co_0_p_0_fused // 7 * 32 + eps_1_nu_1_co_1_p_1_fused % 112 // 7 * 2 + eps_2_nu_2_co_2_p_2_fused + co_3 + co_4)
-                                    v_p = T.axis.spatial(196, eps_0_nu_0_co_0_p_0_fused % 7 * 28 + eps_1_nu_1_co_1_p_1_fused % 7 * 4 + p_3 + p_4)
-                                    v_ci = T.axis.reduce(64, ci_0 * 32 + ci_1 * 2 + ci_2)
-                                    T.reads(data_pack_shared[v_eps, v_nu, v_ci, v_p], weight_shared[v_eps, v_nu, v_ci, v_co])
-                                    T.writes(bgemm_local[v_eps, v_nu, v_co, v_p])
-                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 32, "meta_schedule.tiling_structure": "SSSRRSRS"})
-                                    with T.init():
-                                        bgemm_local[v_eps, v_nu, v_co, v_p] = T.float32(0)
-                                    bgemm_local[v_eps, v_nu, v_co, v_p] = bgemm_local[v_eps, v_nu, v_co, v_p] + data_pack_shared[v_eps, v_nu, v_ci, v_p] * weight_shared[v_eps, v_nu, v_ci, v_co]
-                        for ax0, ax1, ax2, ax3 in T.grid(6, 3, 1, 4):
-                            with T.block("bgemm_local"):
-                                v0 = T.axis.spatial(6, ax0)
-                                v1 = T.axis.spatial(6, eps_1_nu_1_co_1_p_1_fused // 112 * 3 + ax1)
-                                v2 = T.axis.spatial(64, eps_0_nu_0_co_0_p_0_fused // 7 * 32 + eps_1_nu_1_co_1_p_1_fused % 112 // 7 * 2 + eps_2_nu_2_co_2_p_2_fused + ax2)
-                                v3 = T.axis.spatial(196, eps_0_nu_0_co_0_p_0_fused % 7 * 28 + eps_1_nu_1_co_1_p_1_fused % 7 * 4 + ax3)
-                                T.reads(bgemm_local[v0, v1, v2, v3])
-                                T.writes(bgemm[v0, v1, v2, v3])
-                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
-            for n_co_h_0_w_0_fused_0 in T.thread_binding(196, thread="blockIdx.x"):
-                for n_co_h_0_w_0_fused_1 in T.thread_binding(64, thread="threadIdx.x"):
-                    for ax0, ax1 in T.grid(1, 1):
-                        for ax2 in T.unroll(4):
-                            for ax3 in T.unroll(4):
-                                for ax4 in T.unroll(6):
-                                    for ax5 in T.unroll(6):
-                                        with T.block("inverse"):
-                                            v_co = T.axis.spatial(64, (n_co_h_0_w_0_fused_0 * 64 + n_co_h_0_w_0_fused_1) // 196 + ax0)
-                                            v_p = T.axis.spatial(196, (n_co_h_0_w_0_fused_0 * 64 + n_co_h_0_w_0_fused_1) % 196 + ax1)
-                                            v_vh, v_vw, v_r_a, v_r_b = T.axis.remap("SSRR", [ax2, ax3, ax4, ax5])
-                                            T.reads(bgemm[v_r_a, v_r_b, v_co, v_p])
-                                            T.writes(inverse_local[v_co, v_p, v_vh, v_vw])
-                                            T.block_attr({"schedule_rule": "conv2d_nchw_winograd_inverse"})
-                                            with T.init():
-                                                inverse_local[v_co, v_p, v_vh, v_vw] = T.float32(0)
-                                            inverse_local[v_co, v_p, v_vh, v_vw] = inverse_local[v_co, v_p, v_vh, v_vw] + bgemm[v_r_a, v_r_b, v_co, v_p] * T.Select(v_r_a % 6 == 5 and v_vh % 4 == 3, T.float32(1), T.Select(v_r_a % 6 == 5 and v_vh % 4 == 2, T.float32(0), T.Select(v_r_a % 6 == 5 and v_vh % 4 == 1, T.float32(0), T.Select(v_r_a % 6 == 5 and v_vh % 4 == 0, T.float32(0), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 3, T.float32(-8), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 2, T.float32(4), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 1, T.float32(-2), T.Select(v_r_a % 6 == 4 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 3, T.float32(0.125), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 2, T.float32(0.25), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 1, T.float32(0.5), T.Select(v_r_a % 6 == 3 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 3, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 2, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 1, T.float32(1), T.Select(v_r_a % 6 == 2 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 3, T.float32(-1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 2, T.float32(1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 1, T.float32(-1), T.Select(v_r_a % 6 == 1 and v_vh % 4 == 0, T.float32(1), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 3, T.float32(0), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 2, T.float32(0), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 1, T.float32(0), T.Select(v_r_a % 6 == 0 and v_vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(v_r_b % 6 == 5 and v_vw % 4 == 3, T.float32(1), T.Select(v_r_b % 6 == 5 and v_vw % 4 == 2, T.float32(0), T.Select(v_r_b % 6 == 5 and v_vw % 4 == 1, T.float32(0), T.Select(v_r_b % 6 == 5 and v_vw % 4 == 0, T.float32(0), T.Select(v_r_b % 6 == 4 and v_vw % 4 == 3, T.float32(-8), T.Select(v_r_b % 6 == 4 and v_vw % 4 == 2, T.float32(4), T.Select(v_r_b % 6 == 4 and v_vw % 4 == 1, T.float32(-2), T.Select(v_r_b % 6 == 
4 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 3, T.float32(0.125), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 2, T.float32(0.25), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 1, T.float32(0.5), T.Select(v_r_b % 6 == 3 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 3, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 2, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 1, T.float32(1), T.Select(v_r_b % 6 == 2 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 3, T.float32(-1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 2, T.float32(1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 1, T.float32(-1), T.Select(v_r_b % 6 == 1 and v_vw % 4 == 0, T.float32(1), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 3, T.float32(0), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 2, T.float32(0), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 1, T.float32(0), T.Select(v_r_b % 6 == 0 and v_vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
-                    for h_1, w_1 in T.grid(4, 4):
-                        with T.block("conv2d_winograd"):
-                            v_n = T.axis.spatial(1, 0)
-                            v_co = T.axis.spatial(64, (n_co_h_0_w_0_fused_0 * 64 + n_co_h_0_w_0_fused_1) // 196)
-                            v_h = T.axis.spatial(56, (n_co_h_0_w_0_fused_0 * 64 + n_co_h_0_w_0_fused_1) % 196 // 14 * 4 + h_1)
-                            v_w = T.axis.spatial(56, (n_co_h_0_w_0_fused_0 * 64 + n_co_h_0_w_0_fused_1) % 14 * 4 + w_1)
-                            T.reads(inverse_local[v_co, v_n * 196 + v_h // 4 * 14 + v_w // 4, v_h % 4, v_w % 4])
-                            T.writes(conv2d_winograd[v_n, v_co, v_h, v_w])
-                            conv2d_winograd[v_n, v_co, v_h, v_w] = inverse_local[v_co, v_n * 196 + v_h // 4 * 14 + v_w // 4, v_h % 4, v_w % 4]
-    # fmt: on
-    decision_0 = [
-        ("SampleCategorical", 4),
-        ("SamplePerfectTile", [1, 1, 1, 2, 3]),
-        ("SamplePerfectTile", [1, 2, 1, 3, 1]),
-        ("SamplePerfectTile", [2, 16, 2, 1, 1]),
-        ("SamplePerfectTile", [7, 7, 1, 4, 1]),
-        ("SamplePerfectTile", [2, 16, 2]),
-        ("SampleCategorical", 3),
-        ("SampleCategorical", 2),
-        ("SampleCategorical", 1),
-        ("SampleCategorical", 1),
-    ]
-    with _target():
-        mod = create_te_workload("C2D_WIN_NCHW", 0)
-    actual = _design_space(mod)
-    check_sketches(
-        mod,
-        sketches=actual,
-        expected_mods=[cuda_nchw_0],
-        expected_decisions=[decision_0],
-        debug_mask=0,
-    )
-
-
-def test_cuda_nchw_add_relu():
-    # fmt: off
-    @T.prim_func
-    def nchw_add_relu(p0: T.Buffer((2, 2048, 50, 75), "float32"), p1: T.Buffer((4, 4, 2048, 2048), "float32"), p2: T.Buffer((1, 2048, 1, 1), "float32"), T_relu: T.Buffer((2, 2048, 50, 75), "float32")):
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
-        # body
-        # with T.block("root")
-        data_pad = T.alloc_buffer([2, 2048, 52, 77], dtype="float32")
-        input_tile = T.alloc_buffer([2048, 1900, 4, 4], dtype="float32")
-        B = T.alloc_buffer([4, 4], dtype="float32")
-        data_pack = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32")
-        bgemm = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32")
-        A = T.alloc_buffer([4, 2], dtype="float32")
-        inverse = T.alloc_buffer([2048, 1900, 2, 2], dtype="float32")
-        conv2d_winograd = T.alloc_buffer([2, 2048, 50, 75], dtype="float32")
-        T_add = T.alloc_buffer([2, 2048, 50, 75], dtype="float32")
-        for i0, i1, i2, i3 in T.grid(2, 2048, 52, 77):
-            with T.block("data_pad"):
-                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1])
-                T.writes(data_pad[i0_1, i1_1, i2_1, i3_1])
-                data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i2_1 and i2_1 < 51 and 1 <= i3_1 and i3_1 < 76, p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1], T.float32(0), dtype="float32")
-        for i0, i1, i2, i3 in T.grid(2048, 1900, 4, 4):
-            with T.block("input_tile"):
-                ci, p, eps, nu = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(data_pad[p // 950, ci, p % 950 // 38 * 2 + eps, p % 38 * 2 + nu])
-                T.writes(input_tile[ci, p, eps, nu])
-                T.block_attr({"schedule_rule":"None"})
-                input_tile[ci, p, eps, nu] = data_pad[p // 950, ci, p % 950 // 38 * 2 + eps, p % 38 * 2 + nu]
-        for i0, i1 in T.grid(4, 4):
-            with T.block("B"):
-                i, j = T.axis.remap("SS", [i0, i1])
-                T.reads()
-                T.writes(B[i, j])
-                T.block_attr({"schedule_rule":"None"})
-                B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))
-        for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 2048, 1900, 4, 4):
-            with T.block("data_pack"):
-                eps, nu, ci, p, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
-                T.reads(input_tile[ci, p, r_a, r_b], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1])
-                T.writes(data_pack[eps, nu, ci, p])
-                T.block_attr({"schedule_rule":"conv2d_nchw_winograd_data_pack"})
-                with T.init():
-                    data_pack[eps, nu, ci, p] = T.float32(0)
-                data_pack[eps, nu, ci, p] = data_pack[eps, nu, ci, p] + input_tile[ci, p, r_a, r_b] * B[r_a, eps] * B[r_b, nu]
-        for i0, i1, i2, i3, i4 in T.grid(4, 4, 2048, 1900, 2048):
-            with T.block("bgemm"):
-                eps, nu, co, p, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4])
-                T.reads(data_pack[eps, nu, ci, p], p1[eps, nu, ci, co])
-                T.writes(bgemm[eps, nu, co, p])
-                with T.init():
-                    bgemm[eps, nu, co, p] = T.float32(0)
-                bgemm[eps, nu, co, p] = bgemm[eps, nu, co, p] + data_pack[eps, nu, ci, p] * p1[eps, nu, ci, co]
-        for i0, i1 in T.grid(4, 2):
-            with T.block("A"):
-                i, j = T.axis.remap("SS", [i0, i1])
-                T.reads()
-                T.writes(A[i, j])
-                T.block_attr({"schedule_rule":"None"})
-                A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0)))))))))
-        for i0, i1, i2, i3, i4, i5 in T.grid(2048, 1900, 2, 2, 4, 4):
-            with T.block("inverse"):
-                co, p, vh, vw, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
-                T.reads(bgemm[r_a, r_b, co, p], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1])
-                T.writes(inverse[co, p, vh, vw])
-                T.block_attr({"schedule_rule":"conv2d_nchw_winograd_inverse"})
-                with T.init():
-                    inverse[co, p, vh, vw] = T.float32(0)
-                inverse[co, p, vh, vw] = inverse[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * A[r_a, vh] * A[r_b, vw]
-        for i0, i1, i2, i3 in T.grid(2, 2048, 50, 75):
-            with T.block("conv2d_winograd"):
-                n, co, h, w = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(inverse[co, n * 950 + h // 2 * 38 + w // 2, h % 2, w % 2])
-                T.writes(conv2d_winograd[n, co, h, w])
-                conv2d_winograd[n, co, h, w] = inverse[co, n * 950 + h // 2 * 38 + w // 2, h % 2, w % 2]
-        for i0, i1, i2, i3 in T.grid(2, 2048, 50, 75):
-            with T.block("T_add"):
-                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], p2[0, ax1, 0, 0])
-                T.writes(T_add[ax0, ax1, ax2, ax3])
-                T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + p2[0, ax1, 0, 0]
-        for i0, i1, i2, i3 in T.grid(2, 2048, 50, 75):
-            with T.block("T_relu"):
-                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(T_add[ax0, ax1, ax2, ax3])
-                T.writes(T_relu[ax0, ax1, ax2, ax3])
-                T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0))
-
-    @T.prim_func
-    def nchw_add_relu_scheduled(p0: T.Buffer((2, 2048, 50, 75), "float32"), p1: T.Buffer((4, 4, 2048, 2048), "float32"), p2: T.Buffer((1, 2048, 1, 1), "float32"), T_relu: T.Buffer((2, 2048, 50, 75), "float32")):
-        T.func_attr({"global_symbol": "main", "layout_free_buffers": [1], "tir.noalias": T.bool(True)})
-        with T.block("root"):
-            T.reads()
-            T.writes()
-            T.block_attr({"meta_schedule.unroll_explicit": 1024})
-            input_tile_local = T.alloc_buffer((2048, 1900, 4, 4), scope="local")
-            data_pack = T.alloc_buffer((4, 4, 2048, 1900))
-            bgemm = T.alloc_buffer((4, 4, 2048, 1900))
-            inverse_local = T.alloc_buffer((2048, 1900, 2, 2), scope="local")
-            data_pack_local = T.alloc_buffer((4, 4, 2048, 1900), scope="local")
-            bgemm_local = T.alloc_buffer((4, 4, 2048, 1900), scope="local")
-            data_pack_shared = T.alloc_buffer((4, 4, 2048, 1900), scope="shared")
-            p1_shared = T.alloc_buffer((4, 4, 2048, 2048), scope="shared")
-            for i2_i3_fused_1 in T.thread_binding(256, thread="blockIdx.x"):
-                for i2_i3_fused_2 in T.thread_binding(1024, thread="threadIdx.x"):
-                    for i2_i3_fused_0 in range(15):
-                        for ax0, ax1, ax2, ax3 in T.grid(1, 1, 4, 4):
-                            with T.block("input_tile"):
-                                ci = T.axis.spatial(2048, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) // 1900 + ax0)
-                                p = T.axis.spatial(1900, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) % 1900 + ax1)
-                                eps, nu = T.axis.remap("SS", [ax2, ax3])
-                                T.where(i2_i3_fused_0 * 256 + i2_i3_fused_1 < 3800)
-                                T.reads(p0[p // 950, ci, p % 950 // 38 * 2 + eps - 1, p % 38 * 2 + nu - 1])
-                                T.writes(input_tile_local[ci, p, eps, nu])
-                                T.block_attr({"schedule_rule": "None"})
-                                input_tile_local[ci, p, eps, nu] = T.if_then_else(1 <= p % 950 // 38 * 2 + eps and p % 950 // 38 * 2 + eps < 51 and 1 <= p % 38 * 2 + nu and p % 38 * 2 + nu < 76, p0[p // 950, ci, p % 950 // 38 * 2 + eps - 1, p % 38 * 2 + nu - 1], T.float32(0))
-                        for i0 in T.unroll(4):
-                            for i1 in T.unroll(4):
-                                for i4 in T.unroll(4):
-                                    for i5 in T.unroll(4):
-                                        with T.block("data_pack"):
-                                            eps, nu = T.axis.remap("SS", [i0, i1])
-                                            ci = T.axis.spatial(2048, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) // 1900)
-                                            p = T.axis.spatial(1900, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) % 1900)
-                                            r_a, r_b = T.axis.remap("RR", [i4, i5])
-                                            T.where((i2_i3_fused_0 * 256 + i2_i3_fused_1) * 1024 + i2_i3_fused_2 < 3891200)
-                                            T.reads(input_tile_local[ci, p, r_a, r_b])
-                                            T.writes(data_pack_local[eps, nu, ci, p])
-                                            T.block_attr({"schedule_rule": "conv2d_nchw_winograd_data_pack"})
-                                            with T.init():
-                                                data_pack_local[eps, nu, ci, p] = T.float32(0)
-                                            data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + input_tile_local[ci, p, r_a, r_b] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))
-                        for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1):
-                            with T.block("data_pack_local"):
-                                v0, v1 = T.axis.remap("SS", [ax0, ax1])
-                                v2 = T.axis.spatial(2048, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) // 1900 + ax2)
-                                v3 = T.axis.spatial(1900, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) % 1900 + ax3)
-                                T.where(i2_i3_fused_0 * 256 + i2_i3_fused_1 < 3800)
-                                T.reads(data_pack_local[v0, v1, v2, v3])
-                                T.writes(data_pack[v0, v1, v2, v3])
-                                data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
-            for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(24320, thread="blockIdx.x"):
-                for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"):
-                    for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(64, thread="threadIdx.x"):
-                        for i4_0 in range(256):
-                            for ax0_ax1_ax2_ax3_fused in range(640):
-                                with T.block("data_pack_shared"):
-                                    v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + ax0_ax1_ax2_ax3_fused // 320)
-                                    v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + ax0_ax1_ax2_ax3_fused % 320 // 160)
-                                    v2 = T.axis.spatial(2048, i4_0 * 8 + ax0_ax1_ax2_ax3_fused % 160 // 20)
-                                    v3 = T.axis.spatial(1900, i0_0_i1_0_i2_0_i3_0_fused % 95 * 20 + ax0_ax1_ax2_ax3_fused % 20)
-                                    T.reads(data_pack[v0, v1, v2, v3])
-                                    T.writes(data_pack_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch": 1})
-                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
-                            for ax0_ax1_ax2_ax3_fused in range(1024):
-                                with T.block("p1_shared"):
-                                    v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + ax0_ax1_ax2_ax3_fused // 512)
-                                    v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + ax0_ax1_ax2_ax3_fused % 512 // 256)
-                                    v2 = T.axis.spatial(2048, i4_0 * 8 + ax0_ax1_ax2_ax3_fused % 256 // 32)
-                                    v3 = T.axis.spatial(2048, i0_0_i1_0_i2_0_i3_0_fused % 6080 // 95 * 32 + ax0_ax1_ax2_ax3_fused % 32)
-                                    T.reads(p1[v0, v1, v2, v3])
-                                    T.writes(p1_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch": 4})
-                                    p1_shared[v0, v1, v2, v3] = p1[v0, v1, v2, v3]
-                            for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 2, 1, 1, 8, 1, 1, 2, 5):
-                                with T.block("bgemm"):
-                                    eps = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 32 + i0_3 + i0_4)
-                                    nu = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + i1_3 + i1_4)
-                                    co = T.axis.spatial(2048, i0_0_i1_0_i2_0_i3_0_fused % 6080 // 95 * 32 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 4 * 2 + i2_3 * 2 + i2_4)
-                                    p = T.axis.spatial(1900, i0_0_i1_0_i2_0_i3_0_fused % 95 * 20 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 5 + i3_3 * 5 + i3_4)
-                                    ci = T.axis.reduce(2048, i4_0 * 8 + i4_1 * 8 + i4_2)
-                                    T.reads(data_pack_shared[eps, nu, ci, p], p1_shared[eps, nu, ci, co])
-                                    T.writes(bgemm_local[eps, nu, co, p])
-                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 32, "meta_schedule.tiling_structure": "SSSRRSRS"})
-                                    with T.init():
-                                        bgemm_local[eps, nu, co, p] = T.float32(0)
-                                    bgemm_local[eps, nu, co, p] = bgemm_local[eps, nu, co, p] + data_pack_shared[eps, nu, ci, p] * p1_shared[eps, nu, ci, co]
-                        for ax0, ax1, ax2, ax3 in T.grid(1, 2, 2, 5):
-                            with T.block("bgemm_local"):
-                                v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 32 + ax0)
-                                v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + ax1)
-                                v2 = T.axis.spatial(2048, i0_0_i1_0_i2_0_i3_0_fused % 6080 // 95 * 32 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 4 * 2 + ax2)
-                                v3 = T.axis.spatial(1900, i0_0_i1_0_i2_0_i3_0_fused % 95 * 20 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 5 + ax3)
-                                T.reads(bgemm_local[v0, v1, v2, v3])
-                                T.writes(bgemm[v0, v1, v2, v3])
-                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
-            for i0_i1_i2_0_i3_0_fused_1 in T.thread_binding(256, thread="blockIdx.x"):
-                for i0_i1_i2_0_i3_0_fused_2 in T.thread_binding(1024, thread="threadIdx.x"):
-                    for i0_i1_i2_0_i3_0_fused_0 in range(15):
-                        for ax0, ax1 in T.grid(1, 1):
-                            for ax2 in T.unroll(2):
-                                for ax3 in T.unroll(2):
-                                    for ax4 in T.unroll(4):
-                                        for ax5 in T.unroll(4):
-                                            with T.block("inverse"):
-                                                co = T.axis.spatial(2048, (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) % 1945600 // 950 + ax0)
-                                                p = T.axis.spatial(1900, (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) // 1945600 * 950 + (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) % 950 + ax1)
-                                                vh, vw, r_a, r_b = T.axis.remap("SSRR", [ax2, ax3, ax4, ax5])
-                                                T.where((i0_i1_i2_0_i3_0_fused_0 * 256 + i0_i1_i2_0_i3_0_fused_1) * 1024 + i0_i1_i2_0_i3_0_fused_2 < 3891200)
-                                                T.reads(bgemm[r_a, r_b, co, p])
-                                                T.writes(inverse_local[co, p, vh, vw])
-                                                T.block_attr({"schedule_rule": "conv2d_nchw_winograd_inverse"})
-                                                with T.init():
-                                                    inverse_local[co, p, vh, vw] = T.float32(0)
-                                                inverse_local[co, p, vh, vw] = inverse_local[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0)))))))))
-                        for i2_1, i3_1 in T.grid(2, 2):
-                            with T.block("conv2d_winograd"):
-                                n = T.axis.spatial(2, (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) // 1945600)
-                                co = T.axis.spatial(2048, (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) % 1945600 // 950)
-                                h = T.axis.spatial(50, (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) % 950 // 38 * 2 + i2_1)
-                                w = T.axis.spatial(75, (i0_i1_i2_0_i3_0_fused_0 * 262144 + i0_i1_i2_0_i3_0_fused_1 * 1024 + i0_i1_i2_0_i3_0_fused_2) % 38 * 2 + i3_1)
-                                T.where(((i0_i1_i2_0_i3_0_fused_0 * 256 + i0_i1_i2_0_i3_0_fused_1) * 1024 + i0_i1_i2_0_i3_0_fused_2) % 38 * 2 + i3_1 < 75 and (i0_i1_i2_0_i3_0_fused_0 * 256 + i0_i1_i2_0_i3_0_fused_1) * 1024 + i0_i1_i2_0_i3_0_fused_2 < 3891200)
-                                T.reads(inverse_local[co, n * 950 + h // 2 * 38 + w // 2, h % 2, w % 2], p2[0, co, 0, 0])
-                                T.writes(T_relu[n, co, h, w])
-                                T_relu[n, co, h, w] = T.max(inverse_local[co, n * 950 + h // 2 * 38 + w // 2, h % 2, w % 2] + p2[0, co, 0, 0], T.float32(0))
-
-    # fmt: on
-    decision_0 = [
-        ("SamplePerfectTile", [2, 1, 2, 1, 1]),
-        ("SamplePerfectTile", [2, 1, 1, 2, 1]),
-        ("SamplePerfectTile", [64, 2, 8, 1, 2]),
-        ("SamplePerfectTile", [95, 1, 4, 1, 5]),
-        ("SamplePerfectTile", [256, 1, 8]),
-        ("SampleCategorical", 0),
-        ("SampleCategorical", 3),
-        ("SampleCategorical", 4),
-    ]
-    with _target():
-        mod = nchw_add_relu
-    actual = _design_space(mod)
-    check_sketches(
-        mod,
-        sketches=actual,
-        expected_mods=[nchw_add_relu_scheduled],
-        expected_decisions=[decision_0],
-        debug_mask=0,
-    )
-
-
-if __name__ == "__main__":
-    test_cuda_nhwc()
-    test_cuda_nchw()
-    test_cuda_nchw_add_relu()
diff --git a/tests/python/nightly/quantization/test_quantization_accuracy.py b/tests/python/nightly/quantization/test_quantization_accuracy.py
deleted file mode 100644
index 7eebdd17f2eb..000000000000
--- a/tests/python/nightly/quantization/test_quantization_accuracy.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from collections import namedtuple
-import logging
-import os
-
-import mxnet as mx
-from mxnet import gluon
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay import quantize as qtz
-
-logging.basicConfig(level=logging.INFO)
-
-Config = namedtuple(
-    "Config",
-    [
-        "model",
-        "nbit_input",
-        "dtype_input",
-        "nbit_output",
-        "dtype_output",
-        "global_scale",
-        "expected_acc",
-    ],
-)
-
-
-def get_val_data(model_name, rec_val, batch_size, num_workers=4):
-    rec_val = os.path.expanduser(rec_val)
-    mean_rgb = [123.68, 116.779, 103.939]
-    std_rgb = [58.393, 57.12, 57.375]
-
-    def batch_fn(batch, ctx):
-        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-        return data, label
-
-    img_size = 299 if model_name == "inceptionv3" else 224
-    val_data = mx.io.ImageRecordIter(
-        path_imgrec=rec_val,
-        preprocess_threads=num_workers,
-        shuffle=False,
-        batch_size=batch_size,
-        resize=256,
-        data_shape=(3, img_size, img_size),
-        mean_r=mean_rgb[0],
-        mean_g=mean_rgb[1],
-        mean_b=mean_rgb[2],
-        std_r=std_rgb[0],
-        std_g=std_rgb[1],
-        std_b=std_rgb[2],
-    )
-    return val_data, batch_fn
-
-
-def get_model(model_name, batch_size, qconfig, original=False):
-    try:
-        gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
-    except RuntimeError:
-        pytest.skip(reason="mxnet downloads no longer supported")
-    img_size = 299 if model_name == "inceptionv3" else 224
-    data_shape = (batch_size, 3, img_size, img_size)
-    mod, params = relay.frontend.from_mxnet(gluon_model, {"data": data_shape})
-
-    logging.debug("original")
-    logging.debug(mod.astext(show_meta_data=False))
-    if original:
-        return mod, params
-
-    with qconfig:
-        logging.debug("current quantize config")
-        logging.debug(qtz.current_qconfig())
-        qfunc = qtz.quantize(mod, params)
-        logging.debug("after quantize")
-        logging.debug(qfunc.astext(show_meta_data=False))
-    return qfunc, params
-
-
-def eval_acc(
-    model, params, dataset, batch_fn, target=tvm.target.cuda(), device=tvm.cuda(), log_interval=500
-):
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(model, target, params=params)
-    # create runtime module
-    m = tvm.contrib.graph_executor.GraphModule(lib["default"](device))
-
-    # setup evaluaiton metric
-    dataset.reset()
-    batch_size = dataset.batch_size
-    acc_top1 = mx.metric.Accuracy()
-    acc_top5 = mx.metric.TopKAccuracy(5)
-    acc_top1.reset()
-    acc_top5.reset()
-    # Execute
-    for i, batch in enumerate(dataset):
-        data, label = batch_fn(batch, [mx.cpu(0)])
-        m.set_input("data", tvm.nd.array(data[0].asnumpy()))
-        m.run()
-        out_arr = m.get_output(0)
-        acc_top1.update(label, [mx.nd.array(out_arr.numpy())])
-        acc_top5.update(label, [mx.nd.array(out_arr.numpy())])
-
-        if not (i + 1) % log_interval:
-            _, top1 = acc_top1.get()
-            _, top5 = acc_top5.get()
-            nsamples = (i + 1) * batch_size
-            logging.info("[%d samples] validation: acc-top1=%f acc-top5=%f", nsamples, top1, top5)
-    logging.info("[final] validation: acc-top1=%f acc-top5=%f", top1, top5)
-    return top1
-
-
-@tvm.testing.requires_gpu
-def test_quantize_acc(cfg, rec_val):
-    qconfig = qtz.qconfig(
-        skip_conv_layers=[0],
-        nbit_input=cfg.nbit_input,
-        nbit_weight=cfg.nbit_input,
-        global_scale=cfg.global_scale,
-        dtype_input=cfg.dtype_input,
-        dtype_weight=cfg.dtype_input,
-        dtype_activation=cfg.dtype_output,
-        debug_enabled_ops=None,
-    )
-
-    batch_size = 1
-    model, params = get_model(cfg.model, batch_size, qconfig)
-    val_data, batch_fn = get_val_data(cfg.model, rec_val=rec_val, batch_size=batch_size)
-
-    acc = eval_acc(model, params, val_data, batch_fn)
-    assert acc > cfg.expected_acc
-    return acc
-
-
-if __name__ == "__main__":
-    # TODO(for user): replace the line with the path to imagenet validation dataset
-    rec_val = "/scratch/tqchen/imagenet/val.rec"
-
-    results = []
-    configs = [
-        # TODO: need to fix accuracy and add AutoTVM log
-        Config(
-            "mobilenetv2_1.0",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=32,
-            dtype_output="int32",
-            global_scale=4.0,
-            expected_acc=0.666,
-        ),
-        Config(
-            "mobilenetv2_1.0",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=16,
-            dtype_output="int16",
-            global_scale=4.0,
-            expected_acc=0.666,
-        ),
-        Config(
-            "resnet18_v1",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=16,
-            dtype_output="int16",
-            global_scale=8.0,
-            expected_acc=0.692,
-        ),
-        Config(
-            "resnet18_v1",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=32,
-            dtype_output="int32",
-            global_scale=8.0,
-            expected_acc=0.692,
-        ),
-        Config(
-            "resnet34_v1",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=32,
-            dtype_output="int32",
-            global_scale=8.0,
-            expected_acc=0.733,
-        ),
-        Config(
-            "resnet50_v1",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=32,
-            dtype_output="int32",
-            global_scale=8.0,
-            expected_acc=0.747,
-        ),
-        Config(
-            "resnet101_v1",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=32,
-            dtype_output="int32",
-            global_scale=8.0,
-            expected_acc=0.756,
-        ),
-    ]
-
-    for config in configs:
-        acc = test_quantize_acc(config, rec_val)
-        results.append((config, acc))
-    for res in results:
-        print(res)
diff --git a/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py b/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py
deleted file mode 100644
index 6b5891c62630..000000000000
--- a/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import sys
-import logging
-
-import pytest
-
-pytest.importorskip("onnx")
-
-import onnx
-
-import tvm
-from tvm import relay
-from tvm.relay import quantize as qtz
-import tvm.testing
-from test_quantization_accuracy import Config, get_val_data, eval_acc
-
-logging.basicConfig(level=logging.INFO)
-
-
-def calibrate_dataset(model_name, rec_val, batch_size, calibration_samples):
-    val_data, _ = get_val_data(model_name, rec_val=rec_val, batch_size=batch_size)
-    val_data.reset()
-    for i, batch in enumerate(val_data):
-        if i * batch_size >= calibration_samples:
-            break
-        data = batch.data[0].asnumpy()
-        yield {"data": data}
-
-
-def download_file(url_base, file_name):
-    if not os.path.exists(file_name) or not os.path.isfile(file_name):
-        import urllib.request as urllib2
-
-        url = "{}/{}".format(url_base, file_name)
-        try:
-            print("download from {}".format(url))
-            if sys.version_info >= (3,):
-                urllib2.urlretrieve(url, file_name)
-            else:
-                f = urllib2.urlopen(url)
-                data = f.read()
-                with open(file_name, "wb") as code:
-                    code.write(data)
-        except Exception as err:
-            if os.path.exists(file_name):
-                os.remove(file_name)
-            raise Exception("download {} failed due to {}!".format(file_name, repr(err)))
-
-
-def get_onnx_model(model_name, batch_size, qconfig, original=False, dataset=None):
-    assert model_name == "vit32", "Only support vit32 model!"
-    base = "https://github.com/TheGreatCold/tvm-vit/raw/d2aa1e60eef42e2fdedbd1e13aa85ac5faf0a7fc"
-    logfile = "gtx1660_vit_B32_224.log"
-    onnx_path = "vit_B32_224.onnx"
-
-    download_file(base, logfile)
-    download_file(base, onnx_path)
-
-    onnx_graph = onnx.load(open(onnx_path, "rb"))
-    data_shape = (batch_size, 3, 224, 224)
-    mod, params = relay.frontend.from_onnx(onnx_graph, {"data": data_shape})
-
-    with tvm.transform.PassContext(opt_level=3):
-        qfunc = relay.quantize.prerequisite_optimize(mod, params=params)
-    logging.debug("original")
-    logging.debug(qfunc.astext(show_meta_data=False))
-    if original:
-        return qfunc, params, logfile
-
-    with qconfig:
-        logging.debug("current quantize config")
-        logging.debug(qtz.current_qconfig())
-
-        if dataset is not None:
-            with tvm.target.cuda():
-                with tvm.autotvm.apply_history_best(logfile):
-                    qfunc = qtz.quantize(qfunc, params, dataset=dataset)
-        else:
-            qfunc = qtz.quantize(qfunc, params)
-
-        logging.debug("after quantize")
-        logging.debug(qfunc.astext(show_meta_data=False))
-    return qfunc, params, logfile
-
-
-@tvm.testing.requires_gpu
-def test_onnx_quantize_acc(cfg, rec_val, batch_size=1, original=False):
-    qconfig = qtz.qconfig(
-        skip_conv_layers=[0],
-        skip_dense_layer=False,
-        nbit_input=cfg.nbit_input,
-        nbit_weight=cfg.nbit_input,
-        dtype_input=cfg.dtype_input,
-        dtype_weight=cfg.dtype_input,
-        dtype_activation=cfg.dtype_output,
-        debug_enabled_ops=None,
-        calibrate_mode="percentile",
-        calibrate_chunk_by=8,
-    )
-
-    dataset = list(calibrate_dataset(cfg.model, rec_val, batch_size, 64))
-    model, params, logfile = get_onnx_model(
-        cfg.model, batch_size, qconfig, original=original, dataset=dataset
-    )
-    val_data, batch_fn = get_val_data(cfg.model, rec_val=rec_val, batch_size=batch_size)
-
-    with tvm.autotvm.apply_history_best(logfile):
-        acc = eval_acc(model, params, val_data, batch_fn, log_interval=1000)
-    assert acc > cfg.expected_acc
-    return acc
-
-
-if __name__ == "__main__":
-    # TODO(for user): replace the line with the path to imagenet validation dataset
-    rec_val = "/scratch/tqchen/imagenet/val.rec"
-
-    configs = [
-        Config(
-            "vit32",
-            nbit_input=8,
-            dtype_input="int8",
-            nbit_output=32,
-            dtype_output="int32",
-            global_scale=8.0,
-            expected_acc=0.727,
-        ),
-    ]
-
-    for config in configs:
-
-        # float32 model
-        acc = test_onnx_quantize_acc(config, rec_val, batch_size=1, original=True)
-        print("{}-float32: {}".format(config.model, acc))
-
-        # int8 model
-        acc = test_onnx_quantize_acc(config, rec_val, batch_size=1, original=False)
-        print("{}-int8: {}".format(config.model, acc))
diff --git a/tests/python/relax/test_backend_dispatch_sort_scan.py b/tests/python/relax/test_backend_dispatch_sort_scan.py
index 1efbd690f034..4fe6de9e09c6 100644
--- a/tests/python/relax/test_backend_dispatch_sort_scan.py
+++ b/tests/python/relax/test_backend_dispatch_sort_scan.py
@@ -93,13 +93,13 @@ def main(x: R.Tensor(("m", 3), "float32", "cuda")):
         with bb.function("main", (x,), {"global_symbol": "main"}):
             with bb.dataflow():
                 lv = bb.emit_te(
-                    topi.cuda.cumsum,
+                    topi.gpu.cumsum,
                     x,
                     axis=1,
                     exclusive=True,
                 )
                 out = bb.emit_te(
-                    topi.cuda.cumprod,
+                    topi.gpu.cumprod,
                     lv,
                     axis=1,
                 )
@@ -178,7 +178,7 @@ def foo2(y: R.Tensor((2, 3), "float32")):
         with bb.function("foo", (x,), {"global_symbol": "foo"}):
             with bb.dataflow():
                 out = bb.emit_te(
-                    topi.cuda.sort,
+                    topi.gpu.sort,
                     x,
                     axis=1,
                 )
@@ -193,14 +193,14 @@ def foo2(y: R.Tensor((2, 3), "float32")):
                         )
                     )
                     out = bb.emit_te(
-                        topi.cuda.sort_thrust,
+                        topi.gpu.sort_thrust,
                         y,
                         axis=0,
                         is_ascend=False,
                         workspace=workspace,
                     )
                 else:
-                    out = bb.emit_te(topi.cuda.sort, y, axis=0, is_ascend=False)
+                    out = bb.emit_te(topi.gpu.sort, y, axis=0, is_ascend=False)
                 out = bb.emit_output(out)
             bb.emit_func_output(out)
     expected_mod = bb.finalize()
@@ -273,7 +273,7 @@ def foo2(y: R.Tensor((2, 3), "float32")):
     with target:
         with bb.function("foo", (x,), {"global_symbol": "foo"}):
             with bb.dataflow():
-                out = bb.emit_te(topi.cuda.argsort, x, axis=1, is_ascend=True, dtype="int32")
+                out = bb.emit_te(topi.gpu.argsort, x, axis=1, is_ascend=True, dtype="int32")
                 out = bb.emit_output(out)
             bb.emit_func_output(out)
         with bb.function("foo2", (y,), {"global_symbol": "foo2"}):
@@ -285,7 +285,7 @@ def foo2(y: R.Tensor((2, 3), "float32")):
                         )
                     )
                     out = bb.emit_te(
-                        topi.cuda.argsort_thrust,
+                        topi.gpu.argsort_thrust,
                         y,
                         axis=0,
                         is_ascend=False,
@@ -293,7 +293,7 @@ def foo2(y: R.Tensor((2, 3), "float32")):
                         workspace=workspace,
                     )
                 else:
-                    out = bb.emit_te(topi.cuda.argsort, y, axis=0, is_ascend=False, dtype="int64")
+                    out = bb.emit_te(topi.gpu.argsort, y, axis=0, is_ascend=False, dtype="int64")
                 out = bb.emit_output(out)
             bb.emit_func_output(out)
     expected_mod = bb.finalize()
@@ -357,7 +357,7 @@ def foo(x: R.Tensor((2, 3), "float32", "cuda")):
     with target:
         with bb.function("foo", (x,), {"global_symbol": "foo"}):
             with bb.dataflow():
-                out = bb.emit_te(topi.cuda.topk, x, k=2, axis=1, is_ascend=False, dtype="int32")
+                out = bb.emit_te(topi.gpu.topk, x, k=2, axis=1, is_ascend=False, dtype="int32")
                 out = bb.emit_output(out)
             bb.emit_func_output(out)
     expected_mod = bb.finalize()
@@ -393,8 +393,8 @@ def foo(x: R.Tensor((2, 3), "float32", "vulkan")):
     with target:
         with bb.function("foo", (x,), {"global_symbol": "foo"}):
             with bb.dataflow():
-                lv0 = bb.emit_te(topi.cuda.topk, x, k=2, axis=1, is_ascend=False, dtype="int32")
-                lv1 = bb.emit_te(topi.cuda.topk, x, k=2, axis=1, is_ascend=False, dtype="int32")
+                lv0 = bb.emit_te(topi.gpu.topk, x, k=2, axis=1, is_ascend=False, dtype="int32")
+                lv1 = bb.emit_te(topi.gpu.topk, x, k=2, axis=1, is_ascend=False, dtype="int32")
                 out = (lv0, lv1)
                 out = bb.emit_output(out)
             bb.emit_func_output(out)
diff --git a/tests/python/relax/test_blockbuilder_core.py b/tests/python/relax/test_blockbuilder_core.py
index 02cf7f14c155..da030935c323 100644
--- a/tests/python/relax/test_blockbuilder_core.py
+++ b/tests/python/relax/test_blockbuilder_core.py
@@ -19,9 +19,10 @@
 import pytest
 import tvm
 import tvm.testing
+import tvm.contrib.cblas
 
 from tvm import te, tir, topi
-from tvm import relax as rx, relay
+from tvm import relax as rx
 from tvm.ir.base import assert_structural_equal
 from tvm.relax import ExternFunc
 from tvm.script import ir as I, relax as R, tir as T
@@ -443,7 +444,6 @@ def get_tir_func():
 
     call_node = rx_func.body.blocks[0].bindings[0].value
     assert isinstance(call_node, rx.Call)
-    assert call_node.op == relay.op.get("relax.call_tir")
     assert len(call_node.args) == 2
     assert call_node.args[0].name_hint == "te_func"
     assert call_node.args[1][0] == x
@@ -502,7 +502,6 @@ def te_func(A):
     # check call tir output shape is a Tuple of ShapeExpr
     assert rx_func.params[0] == x
     call_node = rx_func.body.blocks[0].bindings[0].value
-    assert call_node.op == relay.op.get("relax.call_tir")
     assert call_node.args[0].name_hint == "te_func"
     assert isinstance(call_node.sinfo_args[0], rx.TupleStructInfo)
     assert len(call_node.sinfo_args[0].fields) == 2
@@ -529,7 +528,6 @@ def test_emit_te_extern():
     assert len(rx_func.body.blocks) == 1
     call_node = rx_func.body.blocks[0].bindings[0].value
     assert isinstance(call_node, rx.Call)
-    assert call_node.op == relay.op.get("relax.call_tir")
     assert len(call_node.args) == 2
     assert call_node.args[0].name_hint == "matmul"
     assert call_node.args[1][0] == x
@@ -556,7 +554,6 @@ def test_emit_te_prim_value():
     assert len(rx_func.body.blocks) == 1
     call_node = rx_func.body.blocks[0].bindings[0].value
     assert isinstance(call_node, rx.Call)
-    assert call_node.op == relay.op.get("relax.call_tir")
     assert len(call_node.args) == 2
     assert call_node.args[1][0] == x
 
diff --git a/tests/python/relax/test_codegen_coreml.py b/tests/python/relax/test_codegen_coreml.py
index 0be6f4731635..f33f165e7d13 100644
--- a/tests/python/relax/test_codegen_coreml.py
+++ b/tests/python/relax/test_codegen_coreml.py
@@ -27,6 +27,8 @@
 
 def _has_xcode():
     try:
+        import tvm.contrib.xcode
+
         tvm.contrib.xcode.xcrun([])
         return True
     except FileNotFoundError:
diff --git a/tests/python/relax/test_dataflow_pattern.py b/tests/python/relax/test_dataflow_pattern.py
index 7a3b65cea10e..4b5da0d9e608 100644
--- a/tests/python/relax/test_dataflow_pattern.py
+++ b/tests/python/relax/test_dataflow_pattern.py
@@ -22,7 +22,7 @@
 
 import tvm.testing
 from tvm import relax as rx
-from tvm import relay, tir
+from tvm import tir
 from tvm.relax.analysis import get_var2val
 from tvm.relax.dpl import *
 from tvm.script import relax as R
@@ -278,13 +278,13 @@ def test_extern_fn_pattern():
 def test_op_attr():
     x = rx.Var("x", R.Tensor("float32"))
     y = rx.Var("y", R.Tensor("float32"))
-    conv2d = relay.nn.conv2d(x, y, kernel_size=(3, 3))
+    conv2d = rx.op.nn.conv2d(x, y, strides=(3, 3))
     xp = is_var("x")
     yp = is_var("y")
     # TODO(@yuchen): reenable the assert after figuring out why it fails
-    # assert is_op("nn.conv2d")(xp, yp).has_attr({"kernel_size": [3, 3]}).match(conv2d)
-    assert not is_op("nn.conv2d")(xp, yp).has_attr({"kernel_size": [4, 3]}).match(conv2d)
-    assert not is_op("nn.conv2d")(xp, yp).has_attr({"kernel_size_": [3, 3]}).match(conv2d)
+    # assert is_op("nn.conv2d")(xp, yp).has_attr({"strides": [3, 3]}).match(conv2d)
+    assert not is_op("nn.conv2d")(xp, yp).has_attr({"strides": [4, 3]}).match(conv2d)
+    assert not is_op("nn.conv2d")(xp, yp).has_attr({"strides": [3, 3]}).match(conv2d)
 
 
 def test_match_call_attr():
diff --git a/tests/python/relax/test_relay_translator.py b/tests/python/relax/test_relay_translator.py
deleted file mode 100644
index c32382dbb0a8..000000000000
--- a/tests/python/relax/test_relay_translator.py
+++ /dev/null
@@ -1,346 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tempfile
-
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import meta_schedule as ms
-from tvm import relax, relay, tir, topi
-from tvm.ir.base import assert_structural_equal
-from tvm.relax.testing import relay_translator
-from tvm.relay import testing
-from tvm.runtime import vm
-from tvm.script import tir as T
-from tvm.target import Target
-
-
-def get_resnet(batch_size, dtype, layout, image_shape):
-    relay_mod, params = testing.resnet.get_workload(
-        num_layers=18,
-        batch_size=batch_size,
-        dtype=dtype,
-        layout=layout,
-        image_shape=image_shape,
-    )
-
-    return relay_mod, params
-
-
-def relay_build_and_run(mod, target, dev, params, data):
-    with tempfile.TemporaryDirectory() as work_dir:
-        db = ms.relay_integration.tune_relay(
-            mod=mod,
-            params=params,
-            target=target,
-            num_trials_per_iter=32,
-            max_trials_per_task=32,
-            max_trials_global=1024,
-            task_scheduler="round-robin",
-            work_dir=work_dir,
-        )
-        ex = ms.relay_integration.compile_relay(
-            db,
-            mod=mod,
-            target=target,
-            params=params,
-        )
-    rt_mod = tvm.contrib.graph_executor.GraphModule(ex["default"](dev))
-    rt_mod.set_input("data", data)
-    rt_mod.run()
-    out = rt_mod.get_output(0).numpy()
-    return ex, rt_mod, out
-
-
-def relax_build_and_run(mod, target, dev, params, data):
-    mod = relax.transform.BindParams("main", params)(mod)
-    with tempfile.TemporaryDirectory() as work_dir:
-        db = ms.relax_integration.tune_relax(
-            mod=mod,
-            target=target,
-            params=params,
-            task_scheduler="round-robin",
-            num_trials_per_iter=32,
-            max_trials_per_task=32,
-            max_trials_global=1024,
-            work_dir=work_dir,
-        )
-        ex = ms.relax_integration.compile_relax(
-            db,
-            mod=mod,
-            target=target,
-            params=params,
-        )
-    vm = relax.VirtualMachine(ex, dev)
-    res = vm["main"](data)
-    out = res.numpy()
-    return ex, vm, out
-
-
-def verify_e2e_translation(target_str, layout, batch_size, image_shape):
-    target = Target(target_str)
-    dev = tvm.device(str(target), dev_id=0)
-    relay_mod, params = get_resnet(batch_size, "float32", layout, image_shape)
-    input_shape = (1, *image_shape)
-    data = tvm.nd.array(np.random.rand(*input_shape).astype(np.float32), dev)
-    relax_mod = relay_translator.from_relay(relay_mod["main"], target, params)
-
-    _, _, relay_out = relay_build_and_run(relay_mod, target, dev, params, data)
-    _, _, relax_out = relax_build_and_run(relax_mod, target, dev, params, data)
-    tvm.testing.assert_allclose(relay_out, relax_out, atol=1e-5, rtol=1e-5)
-
-
-@pytest.mark.skip(reason="take too much time")
-@pytest.mark.parametrize(
-    "layout, batch_size, image_shape", [("NCHW", 1, (3, 224, 224)), ("NHWC", 1, (224, 224, 3))]
-)
-def test_verify_e2e_translation_cpu(layout, batch_size, image_shape):
-    verify_e2e_translation("llvm --num-cores=16", layout, batch_size, image_shape)
-
-
-@pytest.mark.skip(reason="take too much time")
-@tvm.testing.requires_gpu
-@pytest.mark.parametrize(
-    "layout, batch_size, image_shape", [("NCHW", 1, (3, 224, 224)), ("NHWC", 1, (224, 224, 3))]
-)
-def test_verify_e2e_translation_gpu(layout, batch_size, image_shape):
-    verify_e2e_translation("cuda", layout, batch_size, image_shape)
-
-
-def verify_extracted_tasks(target_str, layout, batch_size, image_shape, module_equality):
-    target = Target(target_str)
-    relay_mod, params = get_resnet(batch_size, "float32", layout, image_shape)
-    # Parameters can be bound either as part of the `from_relay`
-    # conversion, or as part of the `extract_tasks` method.  However,
-    # they shouldn't be used in both locations, because
-    # `relax.BindParams` validates that there exists an unbound
-    # parameter of the specified name.
-    relax_mod = relay_translator.from_relay(
-        relay_mod["main"],
-        target,
-        pass_config={
-            "relay.backend.use_meta_schedule": True,
-            "relay.FuseOps.max_depth": 1,  # Disable relay fusion
-        },
-    )
-    relay_tasks = ms.relay_integration.extract_tasks(
-        relay_mod,
-        target=target,
-        params=params,
-        pass_config={
-            "relay.backend.use_meta_schedule": True,
-            "relay.FuseOps.max_depth": 1,  # Disable relay fusion
-        },
-        module_equality=module_equality,
-    )
-    relax_tasks = ms.relax_integration.extract_tasks(
-        relax_mod,
-        target=target,
-        params=params,
-        module_equality=module_equality,
-    )
-    # TODO (yongwww, yuchen): tophub guides relay passes, which causes inconsistent tasks
-    # assert len(relay_tasks) == len(relax_tasks)
-    # TODO: Can we compare extracted tasks as well?
-
-
-@pytest.mark.parametrize("module_equality", ["structural", "ignore-ndarray", "anchor-block"])
-@pytest.mark.parametrize(
-    "layout, batch_size, image_shape",
-    [
-        ("NCHW", 1, (3, 224, 224)),
-        ("NHWC", 1, (224, 224, 3)),
-    ],
-)
-def test_verify_extracted_tasks_cpu(layout, batch_size, image_shape, module_equality):
-    verify_extracted_tasks("llvm --num-cores=16", layout, batch_size, image_shape, module_equality)
-
-
-@tvm.testing.requires_gpu
-@pytest.mark.parametrize("module_equality", ["structural", "ignore-ndarray", "anchor-block"])
-@pytest.mark.parametrize(
-    "layout, batch_size, image_shape", [("NCHW", 1, (3, 224, 224)), ("NHWC", 1, (224, 224, 3))]
-)
-def test_verify_extracted_tasks_gpu(layout, batch_size, image_shape, module_equality):
-    verify_extracted_tasks("cuda", layout, batch_size, image_shape, module_equality)
-
-
-def translate_and_build_vms(relay_mod, target_str="llvm", translate_op_with_tir=None):
-    target = tvm.target.Target(target_str)
-
-    # build the relay IRModule and create relay vm
-    relay_ex = relay.vm.compile(relay_mod, target)
-    relay_vm = vm.VirtualMachine(relay_ex, tvm.cpu())
-
-    # build the relax IRModule and create relax vm
-    relax_mod = relay_translator.from_relay(
-        relay_mod["main"], target, translate_op_with_tir=translate_op_with_tir
-    )
-    relax_ex = relax.build(relax_mod, target)
-    relax_vm = relax.VirtualMachine(relax_ex, tvm.cpu())
-
-    return relay_vm, relax_vm, relax_mod
-
-
-def verify_vm_outputs(
-    input_shape,
-    relay_vm,
-    relax_vm,
-    extra_args=[],
-):
-    input = tvm.nd.array(np.random.rand(*input_shape).astype(np.float32))
-
-    # check correctness by comparing relax and relay result
-    args = [input] + extra_args
-    relax_output = relax_vm["main"](*args)
-    relay_output = relay_vm.run(*args)
-    tvm.testing.assert_allclose(relay_output.numpy(), relax_output.numpy())
-
-
-def test_single_dynamic_dim():
-    wx, wy = 64, 128
-    # create relay module: y = data * weights + bias with dynamic batch dimension
-    data = relay.var("data", shape=(relay.Any(), wx))
-    weights = relay.var("weights", shape=(wx, wy))
-    bias = relay.var("bias", shape=(wy,))
-    y = relay.nn.matmul(data, weights)
-    relay_mod = tvm.IRModule.from_expr(relay.Function([data, weights, bias], y + bias))
-
-    relay_vm, relax_vm, _ = translate_and_build_vms(relay_mod)
-    weights = tvm.nd.array(np.random.rand(wx, wy).astype(np.float32))
-    bias = tvm.nd.array(np.random.rand(wy).astype(np.float32))
-    # verify for different batch sizes
-    verify_vm_outputs([10, wx], relay_vm, relax_vm, [weights, bias])
-    verify_vm_outputs([32, wx], relay_vm, relax_vm, [weights, bias])
-
-
-def test_multiple_dynamic_dims():
-    # create relay module: y = a + a, where a has shape = (?, 5, ?)
-    shape = (relay.Any(), 5, relay.Any())
-    a = relay.var("a", shape=shape)
-
-    relay_mod = tvm.IRModule.from_expr(relay.Function([a], a + a))
-    relay_vm, relax_vm, _ = translate_and_build_vms(relay_mod)
-    # verify for different shapes
-    verify_vm_outputs([2, 5, 10], relay_vm, relax_vm)
-    verify_vm_outputs([12, 5, 24], relay_vm, relax_vm)
-
-
-def test_layout_transform():
-    shape = (1, 3, 224, 224)
-    a = relay.var("a", shape=shape)
-    b = relay.layout_transform(a, "NCHW", "NHWC")
-    relay_mod = tvm.IRModule.from_expr(relay.Function([a], b))
-
-    relay_vm, relax_vm, _ = translate_and_build_vms(relay_mod)
-    verify_vm_outputs([1, 3, 224, 224], relay_vm, relax_vm)
-
-
-def test_translate_op_with_tir():
-    @T.prim_func
-    def tir_matmul(
-        A: T.Buffer((512, 512), "float32"),
-        B: T.Buffer((512, 512), "float32"),
-        C: T.Buffer((512, 512), "float32"),
-    ) -> None:
-        # function attr dict
-        T.func_attr({"global_symbol": "multiply", "tir.noalias": True})
-        # body
-        # with T.block("root")
-        for i0, i1, i2 in T.grid(512, 512, 512):
-            with T.block("C"):
-                i, j, k = T.axis.remap("SSR", [i0, i1, i2])
-                T.reads(C[i, j], A[i, k], B[k, j])
-                T.writes(C[i, j])
-                with T.init():
-                    C[i, j] = T.float32(0)
-                C[i, j] = C[i, j] + A[i, k] * B[k, j]
-
-    shape = (512, 512)
-    a = relay.var("a", shape=shape)
-
-    relay_mod = tvm.IRModule.from_expr(relay.Function([a], a * a))
-    _, _, relax_mod = translate_and_build_vms(
-        relay_mod, translate_op_with_tir={"multiply": tir_matmul}
-    )
-    assert_structural_equal(relax_mod["multiply"], tir_matmul)
-
-
-def test_translate_tuple_arg():
-    x = relay.var("x", shape=(10, 16))
-    y = relay.var("y", shape=(10, 16))
-    relay_mod = tvm.IRModule.from_expr(relay.Function([x, y], relay.concatenate((x, y), axis=-1)))
-    relax_mod = relay_translator.from_relay(relay_mod["main"], target="llvm")
-
-    # Construct the expected module
-    bb = relax.BlockBuilder()
-    x_relax = relax.Var("x", relax.TensorStructInfo([10, 16], "float32"))
-    y_relax = relax.Var("y", relax.TensorStructInfo([10, 16], "float32"))
-    with bb.function("main", [x_relax, y_relax]):
-        with bb.dataflow():
-            _ = bb.emit(relax.Tuple((x_relax, y_relax)))
-            lv1 = bb.emit(x_relax)
-            lv2 = bb.emit(y_relax)
-            lv3 = bb.emit_te(topi.x86.concatenate, (lv1, lv2), axis=-1)
-            gv = bb.emit_output(lv3)
-        bb.emit_func_output(gv)
-
-    assert_structural_equal(relax_mod, bb.get())
-
-
-def test_append_op_attrs():
-    x = relay.var("x", shape=(10, 16))
-    y = relay.var("y", shape=(10, 16))
-    relay_mod = tvm.IRModule.from_expr(relay.Function([x, y], relay.concatenate((x, y), axis=-1)))
-    relax_mod_wo_attrs = relay_translator.from_relay(relay_mod["main"], target="llvm")
-    relax_mod_with_attrs = relay_translator.from_relay(
-        relay_mod["main"], target="llvm", append_op_attrs=True
-    )
-    assert "op_attrs" in relax_mod_with_attrs["concatenate"].attrs
-    assert "op_attrs" not in relax_mod_wo_attrs["concatenate"].attrs
-
-
-def test_instruments_support():
-    x = relay.var("x", shape=(10, 16))
-    y = relay.var("y", shape=(10, 16))
-    out = relay.add(x, y)
-    mod = tvm.IRModule.from_expr(out)
-
-    @tvm.instrument.pass_instrument
-    class SampleRunBeforeAfterInstrument:
-        def __init__(self):
-            self.events = []
-
-        def run_before_pass(self, mod, info):
-            self.events.append("run before " + info.name)
-
-        def run_after_pass(self, mod, info):
-            self.events.append("run after " + info.name)
-
-    my_test = SampleRunBeforeAfterInstrument()
-    relax_mod_with_attrs = relay_translator.from_relay(
-        mod["main"], target="llvm", instruments=[my_test]
-    )
-
-    assert "run after " in "".join(my_test.events)
-    assert "run before " in "".join(my_test.events)
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/tests/python/relax/test_transform_gradient_numeric.py b/tests/python/relax/test_transform_gradient_numeric.py
index 27c0ffb5651c..22241a2fbd65 100644
--- a/tests/python/relax/test_transform_gradient_numeric.py
+++ b/tests/python/relax/test_transform_gradient_numeric.py
@@ -18,12 +18,15 @@
 import tvm
 import tvm.testing
 from tvm import relax
-from tvm.relay.testing import rand
 from tvm.testing import assert_allclose
 from tvm.testing.utils import check_numerical_grads
 from tvm.script.parser import ir as I, relax as R
 
 
+def rand(dtype, *shape):
+    return tvm.nd.array(np.random.rand(*shape).astype(dtype))
+
+
 def _legalize_and_build(mod, target, dev):
     ex = relax.build(mod, target)
     vm = relax.VirtualMachine(ex, dev)
diff --git a/tests/python/relax/test_vm_build.py b/tests/python/relax/test_vm_build.py
index ecf33aa9da1e..9acd1b86292c 100644
--- a/tests/python/relax/test_vm_build.py
+++ b/tests/python/relax/test_vm_build.py
@@ -1135,7 +1135,6 @@ def run_on_rpc(
     # Use local rpc server for testing.
     # Server must use popen so it doesn't inherit the current process state. It
     # will crash otherwise.
-    # Adapted from relay/test_vm.py
     def check_remote(server):
         remote = rpc.connect(server.host, server.port, session_timeout=10)
 
diff --git a/tests/python/relay/backend/test_pass_lower_te.py b/tests/python/relay/backend/test_pass_lower_te.py
deleted file mode 100644
index 89bd62fe5aa8..000000000000
--- a/tests/python/relay/backend/test_pass_lower_te.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Exercises the LowerTE pass.
-
-import tvm
-import tvm.testing
-import logging
-
-logging.basicConfig()
-logger = logging.getLogger("test_pass_lower_te")
-logger.setLevel(logging.INFO)
-
-# Since the TE compiler needs a good refactor it has not been exposed as a 'standard' pass
-# in relay.transform. For testing grab it directly.
-LowerTE = tvm._ffi.get_global_func("relay.tec.LowerTE")
-
-
-def transform(mod):
-    logger.info("Starting module:\n%s", mod)
-    host_target = tvm.target.Target("llvm")
-    prim_target = tvm.target.Target("llvm", host=host_target)
-    ctxt = tvm.transform.PassContext()
-    config = tvm.target.make_compilation_config(ctxt, prim_target)
-    mod = tvm.relay.transform.PlanDevices(config)(mod)
-    mod = tvm.relay.transform.InferType()(mod)
-    mod = LowerTE("test", config)(mod)
-    mod = tvm.relay.transform.InferType()(mod)
-    logger.info("After LowerTE:\n%s", mod)
-    return mod
-
-
-# All attempts to use structural equalty tests against an expected IRModule parsed from
-# Relay text were thwarted by the difficulty of setting up the expected call_lower attributes
-# with the right GlobalVar instances. So the following assert structural correctness the hard way.
-
-
-def test_lower_primitive():
-    input_mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-          %0 = fn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Primitive=1) -> Tensor[(5, 7), float32] {
-            add(%x, %y)
-          };
-          %0(%a, %a)
-        }
-        """,
-        "from_string",
-        None,
-        None,
-    )
-
-    actual_mod = transform(input_mod)
-
-    # Expected:
-    #   def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-    #     %0 = (%a, %a);
-    #     call_lowered(@test_fused_add, %0, metadata={relay_attrs={Primitive=1},all_prim_fn_vars=[@test_fused_add]})
-    #   }
-    #   def @test_fused_add = <lowered PrimFunc>
-
-    main = actual_mod["main"]
-    call = main.body
-    assert call.op.name == "call_lowered"
-    assert len(call.args) == 2
-    assert call.args[0].name_hint == "test_fused_add"
-    assert len(call.args[1].fields) == 2
-    assert call.args[1].fields[0].name_hint == "a"
-    assert call.args[1].fields[1].name_hint == "a"
-    assert call.attrs.metadata["relay_attrs"].Primitive == 1
-    assert len(call.attrs.metadata["all_prim_fn_vars"]) == 1
-    assert call.attrs.metadata["all_prim_fn_vars"][0].name_hint == "test_fused_add"
-
-    test_fused_add = actual_mod["test_fused_add"]
-    assert isinstance(test_fused_add, tvm.tir.PrimFunc)
-
-
-def test_lower_compiler():
-    @tvm._ffi.register_func("relay.ext.test_pass_lower_te")
-    def relay_ext_test_pass_lower_te(func):
-        return None
-
-    input_mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-          %0 = fn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Primitive=1, Compiler="test_pass_lower_te", global_symbol="test_add") -> Tensor[(5, 7), float32] {
-            add(%x, %y)
-          };
-          %0(%a, %a)
-        }
-        """,
-        "from_string",
-        None,
-        None,
-    )
-
-    actual_mod = transform(input_mod)
-
-    # Expected:
-    #   def @main(%a : Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-    #     %0 = (%a, %a)
-    #     call_lowered(@test_add , %0, metadata={relay_attrs={Primitive=1, Compiler="test_pass_lower_te", global_symbol="test_add"}}, all_prim_fn_vars=[]})
-    #   }
-    #   def @test_add(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], Extern=1) -> Tensor[(5, 7), float32] {
-    #     add(%x, %y)
-    #   }
-
-    main = actual_mod["main"]
-    call = main.body
-    assert call.op.name == "call_lowered"
-    assert len(call.args) == 2
-    assert call.args[0].name_hint == "test_add"
-    assert len(call.args[1].fields) == 2
-    assert call.args[1].fields[0].name_hint == "a"
-    assert call.args[1].fields[1].name_hint == "a"
-    assert call.attrs.metadata["relay_attrs"].Primitive == 1
-    assert call.attrs.metadata["relay_attrs"].Compiler == "test_pass_lower_te"
-    assert call.attrs.metadata["relay_attrs"].global_symbol == "test_add"
-    assert len(call.attrs.metadata["all_prim_fn_vars"]) == 0
-
-    test_add = actual_mod["test_add"]
-    assert isinstance(test_add, tvm.relay.Function)
-    assert test_add.attrs["Extern"] == 1
-
-
-def test_lower_extern():
-    input_mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-          @my_add(%a, %a)
-        }
-        def @my_add(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Extern=1) -> Tensor[(5, 7), float32] {
-          add(%x, %y)
-        }
-        """,
-        "from_string",
-        None,
-        None,
-    )
-
-    actual_mod = transform(input_mod)
-
-    # Expected:
-    #   def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-    #     %0 = (%a, %a);
-    #     call_lowered(@my_add, %0, metadata={relay_attrs={Extern=1}}, all_prim_fn_vars=[]})
-    #   }
-    #   def @my_add(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], Extern=1) -> Tensor[(5, 7), float32] {
-    #     add(%x, %y)
-    #   }
-
-    main = actual_mod["main"]
-    call = main.body
-    assert call.op.name == "call_lowered"
-    assert len(call.args) == 2
-    assert call.args[0].name_hint == "my_add"
-    assert len(call.args[1].fields) == 2
-    assert call.args[1].fields[0].name_hint == "a"
-    assert call.args[1].fields[1].name_hint == "a"
-    assert call.attrs.metadata["relay_attrs"].Extern == 1
-    assert len(call.attrs.metadata["all_prim_fn_vars"]) == 0
-
-    test_add = actual_mod["my_add"]
-    assert isinstance(test_add, tvm.relay.Function)
-    assert test_add.attrs["Extern"] == 1
-
-
-def test_lower_extern_with_dynamic_shape():
-    input_mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(?, ?), float32] {
-          @my_dyn(%a, %a)
-        }
-        def @my_dyn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Extern=1) -> Tensor[(?, ?), float32] {
-          add(%x, %y)
-        }
-        """,
-        "from_string",
-        None,
-        None,
-    )
-
-    actual_mod = transform(input_mod)
-
-    # Expected:
-    # def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(?, ?), float32] {
-    #   %0 = (%a, %a);
-    #   call_lowered(@my_dyn, %0, metadata={prim_shape_fn_var='test_shape_func_add', relay_attrs={Extern=1}, prim_shape_fn_states=[2, 2], prim_shape_fn_num_inputs=2, all_prim_shape_fn_vars=['shape_func_add'], prim_shape_fn_num_outputs=1, all_prim_fn_vars=[]})
-    # }
-    # def @my_dyn(%x: Tensor[(5, 7), float32] , %y: Tensor[(5, 7), float32] , Extern=1) -> Tensor[(?, ?), float32] {
-    #   add(%x, %y)
-    # }
-    # def @test_shape_func_add = <shape PrimFunc>
-
-    main = actual_mod["main"]
-    call = main.body
-    assert call.op.name == "call_lowered"
-    assert len(call.args) == 2
-    assert call.args[0].name_hint == "my_dyn"
-    assert len(call.args[1].fields) == 2
-    assert call.args[1].fields[0].name_hint == "a"
-    assert call.args[1].fields[1].name_hint == "a"
-    assert call.attrs.metadata["prim_shape_fn_var"].name_hint == "test_shape_func_add"
-    assert call.attrs.metadata["relay_attrs"].Extern == 1
-    assert len(call.attrs.metadata["prim_shape_fn_states"]) == 2
-    assert call.attrs.metadata["prim_shape_fn_states"][0] == 2
-    assert call.attrs.metadata["prim_shape_fn_states"][1] == 2
-    assert call.attrs.metadata["prim_shape_fn_num_inputs"] == 2
-    assert len(call.attrs.metadata["all_prim_shape_fn_vars"]) == 1
-    assert call.attrs.metadata["all_prim_shape_fn_vars"][0].name_hint == "test_shape_func_add"
-    assert call.attrs.metadata["prim_shape_fn_num_outputs"] == 1
-    assert len(call.attrs.metadata["all_prim_fn_vars"]) == 0
-
-    my_dyn = actual_mod["my_dyn"]
-    assert isinstance(my_dyn, tvm.relay.Function)
-    assert my_dyn.attrs["Extern"] == 1
-
-    shape_func_add = actual_mod["test_shape_func_add"]
-    assert isinstance(shape_func_add, tvm.tir.PrimFunc)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/backend/test_pass_remove_standalone_reshapes.py b/tests/python/relay/backend/test_pass_remove_standalone_reshapes.py
deleted file mode 100644
index 8b1b10d68e16..000000000000
--- a/tests/python/relay/backend/test_pass_remove_standalone_reshapes.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Exercises the RemoveStandaloneReshapes pass.
-
-import tvm
-from tvm import relay
-from tvm.relay.expr_functor import ExprMutator
-import tvm.testing
-from tvm.script import tir as T
-
-
-HOST_DEVICE = tvm.device("cpu")
-HOST_TARGET = tvm.target.Target("llvm")
-
-CPU_DEVICE = tvm.device("cpu")
-CPU_TARGET = tvm.target.Target("llvm").with_host(HOST_TARGET)
-
-CPU = tvm.target.VirtualDevice(CPU_DEVICE, CPU_TARGET)  # device_type=1
-
-
-RemoveStandaloneReshapes = tvm._ffi.get_global_func("relay._transform.RemoveStandaloneReshapes")
-
-
-class MarkReshapeOnlyMutator(ExprMutator):
-    """A pass for marking call_lowered as ReshapeOnly where reshapes exist unfused"""
-
-    def __init__(self):
-        ExprMutator.__init__(self)
-
-    def visit_call(self, call):
-        if isinstance(call.args[0], tvm.ir.GlobalVar) and "reshape" in call.args[0].name_hint:
-            # attrs = {"relay_attrs" : {"relay.reshape_only" : 1}}
-            dict_attrs = tvm.ir.make_node("DictAttrs", **{"relay.reshape_only": 1})
-            attrs = tvm.ir.make_node(
-                "relay.attrs.CallLoweredAttrs", **{"metadata": {"relay_attrs": dict_attrs}}
-            )
-            return relay.Call(call.op, call.args, attrs)
-        return super().visit_call(call)
-
-
-# Reshape should not be removed if its the first layer in the network
-def test_first_reshape():
-    mod = tvm.ir.IRModule()
-
-    @T.prim_func
-    def reshape_primfunc(a: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128])
-        D = T.match_buffer(d, [128, 128])
-
-        for i, j in T.grid(128, 128):
-            D[i, j] = A[i, j]
-
-    metatable = {"VirtualDevice": [CPU]}
-    reshape_ty = relay.FuncType(
-        [
-            relay.TensorType((128, 128), "float32"),
-        ],
-        relay.TensorType((128, 128), "float32"),
-    )
-
-    reshape_gv = relay.GlobalVar("reshape", type_annot=reshape_ty)
-    mod[reshape_gv] = reshape_primfunc
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  virtual_device=meta[VirtualDevice][0]) {
-          %1 = call_lowered(@reshape, (%x,) );
-          let %x_14: Tensor[(128, 128), float32] = on_device(%1, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-          %x_14
-        }
-        """,
-        "from_string",
-        mod,
-        metatable,
-    )
-
-    mod["main"] = MarkReshapeOnlyMutator().visit(mod["main"])
-    mod = RemoveStandaloneReshapes()(mod)
-    reshapes_present = any(["reshape" in gv.name_hint for gv in mod.get_global_vars()])
-    assert reshapes_present, "Reshape should have been removed."
-    return
-
-
-# When reshape layer is the last one in the network
-def test_last_reshape():
-    mod = tvm.ir.IRModule()
-
-    @T.prim_func
-    def mul_primfunc(a: T.handle, b: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128])
-        B = T.match_buffer(b, [128, 128])
-        D = T.match_buffer(d, [128, 128])
-
-        for i, j, k in T.grid(128, 128, 128):
-            with T.block("update"):
-                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-                D[vi, vj] = A[vi, vk] * B[vj, vk]
-
-    @T.prim_func
-    def reshape_primfunc(a: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128])
-        D = T.match_buffer(d, [128, 128])
-
-        for i, j in T.grid(128, 128):
-            D[i, j] = A[i, j]
-
-    metatable = {"VirtualDevice": [CPU]}
-    mul_ty = relay.FuncType(
-        [
-            relay.TensorType((128, 128), "float32"),
-            relay.TensorType((128, 128), "float32"),
-            relay.TensorType((128, 128), "float32"),
-        ],
-        relay.TensorType((128, 128), "float32"),
-    )
-
-    mul_gv = relay.GlobalVar("multiply", type_annot=mul_ty)
-    mod[mul_gv] = mul_primfunc
-    reshape_ty = relay.FuncType(
-        [
-            relay.TensorType((128, 128), "float32"),
-        ],
-        relay.TensorType((128, 128), "float32"),
-    )
-
-    reshape_gv = relay.GlobalVar("reshape", type_annot=reshape_ty)
-    mod[reshape_gv] = reshape_primfunc
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  %z {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  virtual_device=meta[VirtualDevice][0]) {
-          %0 = call_lowered(@multiply, (%x, %y, %z));
-          let %x_12: Tensor[(128, 128), float32] = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-          %1 = call_lowered(@reshape, (%x_12,) );
-          let %x_14: Tensor[(128, 128), float32] = on_device(%1, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-          %x_14
-        }
-        """,
-        "from_string",
-        mod,
-        metatable,
-    )
-
-    # Expected main:
-    ##[version = "0.0.5"]
-    # def @main(%x /* ty=Tensor[(128, 128), float32] */) -> Tensor[(128, 128), float32] {
-    #  %0 = (%x, %y, %z);
-    #  %1 = call_lowered(@multiply, %0);
-    #  let %x_12: Tensor[(128, 128), float32] = on_device(%1, constrain_result=True);
-    #  let %x_14: Tensor[(128, 128), float32] = on_device(%1, constrain_result=True);
-    #  %x_14
-    # }
-
-    mod["main"] = MarkReshapeOnlyMutator().visit(mod["main"])
-    mod = RemoveStandaloneReshapes()(mod)
-    reshapes_present = any(["reshape" in gv.name_hint for gv in mod.get_global_vars()])
-    assert not reshapes_present, "Reshape should have been removed."
-    return
-
-
-# When reshape layer is not marked as reshape_only
-def test_fused_reshape():
-    mod = tvm.ir.IRModule()
-
-    @T.prim_func
-    def mul_primfunc(a: T.handle, b: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128])
-        B = T.match_buffer(b, [128, 128])
-        D = T.match_buffer(d, [128, 128])
-
-        for i, j, k in T.grid(128, 128, 128):
-            with T.block("update"):
-                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-                D[vi, vj] = A[vi, vk] * B[vj, vk]
-
-    @T.prim_func
-    def fused_reshape_primfunc(a: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128])
-        D = T.match_buffer(d, [128, 128])
-
-        for i, j in T.grid(128, 128):
-            D[i, j] = A[i, j]
-
-    metatable = {"VirtualDevice": [CPU]}
-    mul_ty = relay.FuncType(
-        [
-            relay.TensorType((128, 128), "float32"),
-            relay.TensorType((128, 128), "float32"),
-            relay.TensorType((128, 128), "float32"),
-        ],
-        relay.TensorType((128, 128), "float32"),
-    )
-
-    mul_gv = relay.GlobalVar("multiply", type_annot=mul_ty)
-    mod[mul_gv] = mul_primfunc
-    reshape_ty = relay.FuncType(
-        [
-            relay.TensorType((128, 128), "float32"),
-        ],
-        relay.TensorType((128, 128), "float32"),
-    )
-
-    reshape_gv = relay.GlobalVar("fused_reshape", type_annot=reshape_ty)
-    mod[reshape_gv] = fused_reshape_primfunc
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  %z {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                  virtual_device=meta[VirtualDevice][0]) {
-          %0 = call_lowered(@multiply, (%x, %y, %z));
-          let %x_12: Tensor[(128, 128), float32] = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-          %1 = call_lowered(@fused_reshape, (%x_12,) );
-          let %x_14: Tensor[(128, 128), float32] = on_device(%1, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-          %x_14
-        }
-        """,
-        "from_string",
-        mod,
-        metatable,
-    )
-
-    # Expected main:
-    ##[version = "0.0.5"]
-    # def @main(%x /* ty=Tensor[(128, 128), float32] */) -> Tensor[(128, 128), float32] {
-    #  %0 = (%x, %y, %z);
-    #  %1 = call_lowered(@multiply, %0);
-    #  let %x_12: Tensor[(128, 128), float32] = on_device(%1, constrain_result=True);
-    #  let %x_14: Tensor[(128, 128), float32] = on_device(%1, constrain_result=True);
-    #  %x_14
-    # }
-
-    mod = RemoveStandaloneReshapes()(mod)
-    reshapes_present = any(["reshape" in gv.name_hint for gv in mod.get_global_vars()])
-    assert reshapes_present, "Reshape should have been removed."
-    return
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/benchmarking/benchmark_vm.py b/tests/python/relay/benchmarking/benchmark_vm.py
deleted file mode 100644
index 12edbdac5f23..000000000000
--- a/tests/python/relay/benchmarking/benchmark_vm.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Benchmarking Relay VM using models from MXNet."""
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.contrib import graph_executor
-from tvm import relay
-from tvm.runtime import container
-from tvm.runtime import vm as vm_rt
-from tvm.relay import testing
-from tvm.relay import vm
-
-
-def benchmark_execution(
-    mod,
-    params,
-    measure=True,
-    data_shape=(1, 3, 224, 224),
-    out_shape=(1, 1000),
-    dtype="float32",
-    model="unknown",
-):
-    def get_graph_executor_output(
-        mod, data, params, target, dev, dtype="float32", number=2, repeat=20
-    ):
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target, params=params)
-
-        m = graph_executor.GraphModule(lib["default"](dev))
-        # set inputs
-        m.set_input("data", data)
-        m.run()
-        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
-
-        if measure:
-            print("Evaluate graph executor inference cost of {} on " "{}".format(model, repr(dev)))
-            ftimer = m.module.time_evaluator("run", dev, number=1, repeat=20)
-            # Measure in millisecond.
-            prof_res = np.array(ftimer().results) * 1000
-            print(
-                "Mean graph executor inference time (std dev): %.2f ms (%.2f ms)"
-                % (np.mean(prof_res), np.std(prof_res))
-            )
-
-        return out.numpy()
-
-    def get_vm_output(mod, data, params, target, dev, dtype="float32", number=2, repeat=20):
-        with tvm.transform.PassContext(opt_level=3):
-            exe = vm.compile(mod, target, params=params)
-            rly_vm = vm_rt.VirtualMachine(exe, dev)
-            result = rly_vm.run(data)
-
-        if measure:
-            print("Evaluate vm inference cost of {} on {}".format(model, repr(dev)))
-            ftimer = rly_vm.module.time_evaluator("invoke", dev, number=number, repeat=repeat)
-            # Measure in millisecond.
-            prof_res = np.array(ftimer("main", data).results) * 1000
-            print(
-                "Mean vm inference time (std dev): %.2f ms (%.2f ms)"
-                % (np.mean(prof_res), np.std(prof_res))
-            )
-
-        return result.numpy().astype(dtype)
-
-    # random input
-    data = np.random.uniform(size=data_shape).astype(dtype)
-
-    for target, dev in testing.enabled_targets():
-        tvm_out = get_graph_executor_output(
-            mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype
-        )
-        vm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype)
-        tvm.testing.assert_allclose(vm_out, tvm_out, rtol=1e-5, atol=1e-5)
-
-
-def test_mlp():
-    image_shape = (1, 1, 28, 28)
-    mod, params = testing.mlp.get_workload(1)
-    benchmark_execution(mod, params, data_shape=image_shape, out_shape=(1, 10), model="mlp")
-
-
-def test_vgg():
-    for n in [11, 16]:
-        mod, params = testing.vgg.get_workload(1, num_layers=n)
-        model = "vgg" + str(n)
-        benchmark_execution(mod, params, model=model)
-
-
-def test_resnet():
-    for n in [18, 50]:
-        mod, params = testing.resnet.get_workload(batch_size=1, num_layers=n)
-        model = "resnet" + str(n)
-        benchmark_execution(mod, params, model=model)
-
-
-def test_squeezenet():
-    for version in ["1.0", "1.1"]:
-        mod, params = testing.squeezenet.get_workload(version=version)
-        model = "squeezenet" + version
-        benchmark_execution(mod, params, model=model)
-
-
-def test_inception_v3():
-    image_shape = (3, 299, 299)
-    mod, params = testing.inception_v3.get_workload(image_shape=image_shape)
-    benchmark_execution(mod, params, data_shape=(1, 3, 299, 299), model="inception_v3")
-
-
-def test_dqn():
-    image_shape = (1, 4, 84, 84)
-    mod, params = testing.dqn.get_workload(batch_size=1, image_shape=image_shape)
-    benchmark_execution(mod, params, data_shape=image_shape, out_shape=(1, 18))
-
-
-def test_dcgan():
-    image_shape = (1, 100)
-    mod, params = testing.dcgan.get_workload(batch_size=1)
-    benchmark_execution(mod, params, data_shape=image_shape, out_shape=(1, 3, 64, 64))
-
-
-def test_mobilenet():
-    mod, params = testing.mobilenet.get_workload(batch_size=1)
-    benchmark_execution(mod, params, model="mobilenet")
-
-
-# TODO: enable when the low building performance (several minutes) fixed.
-def test_mobilenet_nhwc():
-    image_shape = (1, 224, 224, 3)
-    mod, params = testing.mobilenet.get_workload(
-        batch_size=1, image_shape=image_shape[1:], layout="NHWC"
-    )
-    benchmark_execution(mod, params, measure=False, data_shape=image_shape)
-
-
-def test_densenet():
-    mod, params = testing.densenet.get_workload(batch_size=1)
-    benchmark_execution(mod, params, model="densenet")
-
-
-if __name__ == "__main__":
-    test_resnet()
-    test_vgg()
-    test_squeezenet()
-    test_mobilenet()
-    test_densenet()
-    test_inception_v3()
-    test_mlp()
-    test_dqn()
-    test_dcgan()
diff --git a/tests/python/relay/collage/demo_collage_partitioner.py b/tests/python/relay/collage/demo_collage_partitioner.py
deleted file mode 100644
index 0b7c815a8806..000000000000
--- a/tests/python/relay/collage/demo_collage_partitioner.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Compares Collage with various other baselines."""
-
-# CAUTION: Requires some changes in python/tvm/autotvm/task/dispatcher.py
-# so that AutoTVM tuning records can be cached between runs and between
-# models. See https://github.com/mbs-octoml/mbs-tvm/tree/mbs-collage-hacks.
-
-import tvm
-import logging
-import tempfile
-import os
-import shutil
-
-import menangerie
-
-# The following are necessary to force global functions or pattern tables to be registered
-from tvm.relay.op.contrib.cutlass import partition_for_cutlass
-from tvm.contrib.cutlass import num_cutlass_partitions
-from tvm.relay.op.contrib.cublas import partition_for_cublas
-from tvm.relay.op.contrib.cudnn import partition_for_cudnn
-
-logging.basicConfig(level=logging.INFO)
-
-
-########### Configuration ###########
-
-###
-### Rename to match your hardware, eg ..._vt100...
-###
-TUNING_LOG = "/home/mbs/collage_autotvm_rtx3070.tuninglog"
-
-###
-### If true, runs final model under nvprof
-###
-PROFILE = True
-
-###
-### If true, run all models
-###
-ALL_MODELS = False
-
-###
-### If true, run all configurations
-###
-ALL_CONFIGS = False
-
-###
-### How aggressively to look for candidates?
-###
-TVM_MAX_DEPTH = 8
-BYOC_MAX_DEPTH = 8
-
-###
-### AutoTVM tuning parameters.
-###
-AUTOTVM_NUM_TRIALS = 2000
-AUTOTVM_EARLY_STOPPING = 600
-TIMEOUT = 10
-MEASURE_NUMBER = tvm.relay.collage.MEASURE_NUMBER
-MEASURE_REPEAT = tvm.relay.collage.MEASURE_REPEAT
-WARMUP_MIN_REPEAT_MS = tvm.relay.collage.WARMUP_MIN_REPEAT_MS
-
-HOST = tvm.target.Target("llvm")
-CUDA = tvm.target.Target("cuda", HOST)
-
-########### Runtime ###########
-
-# Code to run a model. The actual call to 'run' is appended at compile time.
-# We invoke the model as a sub-process so that we can wrap profiling tools around it.
-runner_template = f"""
-import tvm
-import tvm.runtime.vm
-import numpy as np
-import logging
-
-logging.basicConfig(level=logging.INFO)
-
-MEASURE_NUMBER = {MEASURE_NUMBER}
-MEASURE_REPEAT = {MEASURE_REPEAT}
-WARMUP_MIN_REPEAT_MS = {WARMUP_MIN_REPEAT_MS}
-
-def arg_for(shape, dtype, device):
-    return tvm.nd.array(
-        np.random.rand(*shape).astype(dtype), device=device)
-
-def vm_estimate_seconds(device, vm, args):
-    vm.benchmark(device, repeat=1, number=1, min_repeat_ms=WARMUP_MIN_REPEAT_MS, **args)
-    return vm.benchmark(device, repeat=MEASURE_REPEAT, number=MEASURE_NUMBER, min_repeat_ms=0,
-                        **args)
-
-
-def run(label, name, device, lib_path, code_path, input_shapes, input_dtypes):
-    logging.info(f"Loading compiled code for {{name}} generated by {{label}} from {{lib_path}} and {{code_path}}...")
-    loaded_lib = tvm.runtime.load_module(lib_path)
-    loaded_code = bytearray(open(code_path, "rb").read())
-    loaded_exe = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib)
-    vm = tvm.runtime.vm.VirtualMachine(loaded_exe, device)
-    args = {{
-        input_name: arg_for(input_shapes[input_name], input_dtypes[input_name], device)
-        for input_name in input_shapes.keys()
-    }}
-    logging.info(f"Benchmarking for {{name}} generated by {{label}}...")
-    profile = vm_estimate_seconds(device, vm, args)
-    logging.info(f"Benchmarked for {{name}} generated by {{label}}: {{profile}}")
-    logging.info(f"RESULT: {{label}} | {{name}} | {{profile.median * 1e3}}ms")
-
-if __name__ == "__main__":
-"""
-
-########### AutoTVM tuning helpers ###########
-
-
-def extract_autotvm_tasks(mod, target):
-    """Returns TVM kernels to tune for mod and target."""
-    return tvm.autotvm.task.extract_from_program(mod, target=target, params=None)
-
-
-def optional_tuning_records(log_filename):
-    """Returns existing tuning records, if any."""
-    if log_filename == "" or not os.path.exists(log_filename):
-        return tvm.autotvm.task.FallbackContext()
-    else:
-        return tvm.autotvm.task.ApplyHistoryBest(log_filename)
-
-
-def is_already_tuned(task, log_filename):
-    """Returns True if we already have a tuning record for task in turning logs in log_filename"""
-    if not os.path.exists(log_filename):
-        return False
-
-    dispatch_context = tvm.autotvm.task.ApplyHistoryBest(log_filename)
-    return dispatch_context.contains(task.target, task.workload)
-
-
-def tune_autotvm_tasks(tasks, log_filename):
-    """Appends to log_filename the best strategies for tasks"""
-    if len(tasks) == 0:
-        return
-
-    measure_option = tvm.autotvm.measure_option(
-        builder=tvm.autotvm.LocalBuilder(timeout=TIMEOUT),
-        runner=tvm.autotvm.LocalRunner(
-            number=MEASURE_NUMBER, repeat=MEASURE_REPEAT, timeout=TIMEOUT, min_repeat_ms=0
-        ),
-    )
-
-    logging.info(
-        f"Using autotvm tuning for {len(tasks)} tasks with {AUTOTVM_NUM_TRIALS} trials, logging to {log_filename}"
-    )
-
-    # create tmp log file, starting with contents from existing log file
-    tmp_log_filename = log_filename + ".tmp"
-    if os.path.exists(tmp_log_filename):
-        os.remove(tmp_log_filename)
-    if os.path.exists(log_filename):
-        logging.info(f"Copying existing log {log_filename} to {tmp_log_filename}")
-        shutil.copy(log_filename, tmp_log_filename)
-
-    for i, task in enumerate(reversed(tasks)):
-        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
-        logging.info(f"Considering task {task.name} {prefix}")
-        if is_already_tuned(task, tmp_log_filename):
-            logging.info(f"Re-using existing record for {task.name}")
-            continue
-
-        logging.info(f"Using autotvm to tune {task.name}")
-        tuner_obj = tvm.autotvm.tuner.XGBTuner(task, loss_type="reg")
-        if os.path.exists(tmp_log_filename):
-            tuner_obj.load_history(tvm.autotvm.record.load_from_file(tmp_log_filename))
-
-        # do tuning
-        n_trial = min(AUTOTVM_NUM_TRIALS, len(task.config_space))
-        tuner_obj.tune(
-            n_trial=n_trial,
-            early_stopping=AUTOTVM_EARLY_STOPPING,
-            measure_option=measure_option,
-            callbacks=[
-                tvm.autotvm.callback.progress_bar(n_trial, prefix=prefix),
-                tvm.autotvm.callback.log_to_file(tmp_log_filename),
-            ],
-        )
-
-    # pick best records and copy back to main log file
-    tvm.autotvm.record.pick_best(tmp_log_filename, log_filename)
-    os.remove(tmp_log_filename)
-
-    logging.info("Done with autotvm tuning")
-
-
-def autotvm_tune_module(mod, target, log_filename):
-    if log_filename == "":
-        logging.info("Not tuning with autotvm since disabled")
-        return
-    # Extract and tune any TVM kernels. BYOC partitions will have no tasks extracted.
-    logging.info("Extracting tasks from overall module")
-    tasks = extract_autotvm_tasks(mod, target)
-    logging.info(f"Auto-tuning {len(tasks)} tasks from overall module")
-    tune_autotvm_tasks(tasks, log_filename)
-
-
-########### Drivers ###########
-
-
-def compile_and_benchmark(label, model, targets, dev, tmp_dir):
-    """Compile model for target and run it with profiling."""
-    logging.info(f"Compiling {model['name']} using {label} with {targets}...")
-    exe = tvm.relay.vm.compile(model["mod"], target=targets, params=model["params"])
-    lib_path = os.path.join(tmp_dir, "lib.so")
-    code_path = os.path.join(tmp_dir, "code.ro")
-    code, lib = exe.save()
-    logging.info(f"Saving VM code to {code_path}...")
-    with open(code_path, "wb") as fo:
-        fo.write(code)
-    logging.info(f"Exporting library to {lib_path}...")
-    lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc")
-    runner = f"{runner_template}    run('{label}', '{model['name']}', tvm.device({dev.device_type}), '{lib_path}', '{code_path}', {model['input_shapes']}, {model['input_dtypes']})\n"
-    runner_path = os.path.join(tmp_dir, "runner.py")
-    logging.info(f"Saving runner to {runner_path}...")
-    with open(runner_path, "w") as fo:
-        fo.write(runner)
-
-    logging.info(f"Invoking runner...")
-    if PROFILE:
-        profile_path = os.path.join(tmp_dir, "profile.txt")
-        os.system(f"nsys nvprof -o {profile_path} python3 {runner_path}")
-    else:
-        os.system(f"python3 {runner_path}")
-
-
-def collage(model):
-    """Run the Collage partitioner for a set of CUDA-related targets and profile the result"""
-    logging.info(f"collage | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    autotvm_tune_module(model["mod"], CUDA, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        targets = []
-        targets.append(CUDA)
-        use_fp16 = model["main_dtype"] == "float16"
-        targets.append(
-            tvm.target.Target(f"tensorrt -use_implicit_batch=False -use_fp16={use_fp16}", HOST)
-        )
-        tmp_dir = tempfile.mkdtemp()
-        targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST))
-        targets.append(tvm.target.Target("cublas", HOST))
-        targets.append(tvm.target.Target("cudnn", HOST))
-        config = {
-            "relay.collage.tvm_max_depth": TVM_MAX_DEPTH,
-            "relay.collage.byoc_max_depth": BYOC_MAX_DEPTH,
-            "relay.collage.byoc_fusion_style": [
-                "cutlass.NoFusion",
-                "cublas.NoFusion",
-                "cudnn.NoFusion",
-                "tensorrt.TVMFusion",
-            ],
-        }
-        logging.info(f"Using PassContext(config={config}")
-        ctxt = tvm.transform.PassContext(config=config)
-        config = tvm.target.make_compilation_config(ctxt, targets)
-        with ctxt:
-            mod = model["mod"]
-            mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod)
-            logging.info("-------------- BEGIN INDEXED --------------")
-            logging.info(mod)
-            logging.info("-------------- END INDEXED ----------------")
-            mod = tvm.relay.transform.CollagePartition(config)(mod)
-            partitioned_model = model.copy()
-            partitioned_model["mod"] = mod
-            logging.info("-------------- BEGIN PARTITIONED --------------")
-            logging.info(partitioned_model["mod"])
-            logging.info("-------------- END PARTITIONED ----------------")
-            dev = tvm.device(CUDA.get_target_device_type())
-            compile_and_benchmark("collage", partitioned_model, targets, dev, tmp_dir)
-
-
-def just_tensorrt(model):
-    """Run partition_for_tensorrt, complete the compilation with TVM, and profile the result."""
-    logging.info(f"just_tensorrt | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    tmp_dir = tempfile.mkdtemp()
-    autotvm_tune_module(model["mod"], CUDA, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        logging.info("Partitioning for TensorRT...")
-        use_fp16 = model["main_dtype"] == "float16"
-        trt_target = tvm.target.Target(
-            f"tensorrt -use_implicit_batch=False -use_fp16={use_fp16}", HOST
-        )
-        mod = tvm.relay.op.contrib.partition_for_tensorrt(
-            mod=model["mod"], params=model["params"], target=trt_target
-        )
-        partitioned_model = model.copy()
-        partitioned_model["mod"] = mod
-        logging.info("-------------- BEGIN PARTITIONED --------------")
-        logging.info(partitioned_model["mod"])
-        logging.info("-------------- END PARTITIONED ----------------")
-        targets = []
-        targets.append(CUDA)
-        targets.append(trt_target)
-        dev = tvm.device(CUDA.get_target_device_type())
-        compile_and_benchmark("just_tensorrt", partitioned_model, targets, dev, tmp_dir)
-
-
-def just_cutlass(model):
-    """Run partition_for_cutlass, complete the compilation with TVM, and profile the result."""
-    logging.info(f"just_cutlass | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    tmp_dir = tempfile.mkdtemp()
-    autotvm_tune_module(model["mod"], CUDA, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            logging.info("Partitioning for CUTLASS...")
-            mod = tvm.relay.op.contrib.partition_for_cutlass(model["mod"], model["params"])
-            partitioned_model = model.copy()
-            partitioned_model["mod"] = mod
-            logging.info("-------------- BEGIN PARTITIONED --------------")
-            logging.info(partitioned_model["mod"])
-            logging.info("-------------- END PARTITIONED ----------------")
-            targets = []
-            targets.append(CUDA)
-            targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST))
-            dev = tvm.device(CUDA.get_target_device_type())
-            compile_and_benchmark("just_cutlass", partitioned_model, targets, dev, tmp_dir)
-
-
-def just_tvm(model):
-    """Compile and profile using vanilla TVM."""
-    logging.info(f"just_tvm | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    tmp_dir = tempfile.mkdtemp()
-    autotvm_tune_module(model["mod"], CUDA, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        dev = tvm.device(CUDA.get_target_device_type())
-        compile_and_benchmark("just_tvm", model, CUDA, dev, tmp_dir)
-
-
-def tvm_with_libs(model):
-    """As for just_tvm, but use the existing -libs mechanism to enable standard CUDA libs."""
-    logging.info(f"tvm_with_libs | {model['name']}")
-    logging.info("-------------- BEGIN ORIGINAL --------------")
-    logging.info(model["mod"])
-    logging.info("-------------- END ORIGINAL ----------------")
-    tmp_dir = tempfile.mkdtemp()
-    cuda_target = tvm.target.Target("cuda -libs=cudnn,cublas", HOST)
-    autotvm_tune_module(model["mod"], cuda_target, TUNING_LOG)
-    with optional_tuning_records(TUNING_LOG):
-        dev = tvm.device(cuda_target.get_target_device_type())
-        compile_and_benchmark("tvm_with_libs", model, cuda_target, dev, tmp_dir)
-
-
-########### Runners ###########
-
-
-def run_all():
-    """Run the whole test suite."""
-    make_models = []
-    make_models.append(menangerie.resnext50_32x4d)
-    if ALL_MODELS:
-        make_models.append(menangerie.resnext50_32x4d_16)
-        make_models.append(menangerie.gpt2_16)
-        make_models.append(menangerie.gpt2)
-        make_models.append(menangerie.mobilenet_16)
-        make_models.append(menangerie.mobilenet)
-        make_models.append(menangerie.resnet50_16)
-        make_models.append(menangerie.resnet50)
-    run_models = []
-    if ALL_CONFIGS:
-        run_models.append(just_tensorrt)
-        run_models.append(just_tvm)
-        run_models.append(tvm_with_libs)
-    run_models.append(collage)
-    for make_model in make_models:
-        model = make_model()
-        for run_model in run_models:
-            run_model(model)
-
-
-def run_mini():
-    """Run Collage on a tiny GPT2 extract."""
-    collage(menangerie.gpt2_16_for_cutlass_extract())
-
-
-if __name__ == "__main__":
-    # run_all()
-    run_mini()
diff --git a/tests/python/relay/collage/menangerie.py b/tests/python/relay/collage/menangerie.py
deleted file mode 100644
index e74059282e3e..000000000000
--- a/tests/python/relay/collage/menangerie.py
+++ /dev/null
@@ -1,4288 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""A collection of Relay models for exercising Collage."""
-
-import tvm
-import onnx
-import numpy as np
-import logging
-import tvm.contrib.target.onnx
-
-MODEL_PREFIX = "/home/mbs/gauntlet/models/"
-MNIST = {
-    "name": "mnist",
-    "filename": "mnist-8.onnx",
-    "input_shapes": {"Input3": [1, 1, 28, 28]},
-    "input_dtypes": {"Input3": "float32"},
-    "main_dtype": "float32",
-}
-GPT2 = {
-    "name": "gpt2",
-    "filename": "gpt2.onnx",
-    "input_shapes": {"input1": [1, 50, 32]},
-    "input_dtypes": {"input1": "int64"},
-    "main_dtype": "float32",
-}
-RESNET50V2 = {
-    "name": "resnet50",
-    "filename": "resnet50-v2-7.onnx",
-    "input_shapes": {"data": [1, 3, 224, 224]},
-    "input_dtypes": {"data": "float32"},
-    "main_dtype": "float32",
-}
-MOBILENETV2 = {
-    "name": "mobilenet",
-    "filename": "mobilenetv2-1.0.onnx",
-    "input_shapes": {"data": [1, 3, 224, 224]},
-    "input_dtypes": {"data": "float32"},
-    "main_dtype": "float32",
-}
-# Note that resnext50_32_4d below was extracted directly from the pytorch model and not from any onnx file.
-RESNEXT50_32_4d = {
-    "name": "resnext50_32_4d",
-    "filename": "resnext50_32x4d.onnx",
-    "input_shapes": {"x": [1, 64, 56, 56]},
-    "input_dtypes": {"x": "float32"},
-    "main_dtype": "float32",
-}
-
-
-def make_const(dtype, shape):
-    return tvm.relay.const(np.random.rand(*shape).astype(dtype))
-
-
-def make_consts(dtype, shapes):
-    return [make_const(dtype, shape) for shape in shapes]
-
-
-def mnist_consts(dtype):
-    return make_consts(
-        dtype,
-        [
-            (8, 1, 5, 5),  # 0
-            (8, 1, 1),  # 1
-            (16, 8, 5, 5),  # 2
-            (16, 1, 1),  # 3
-            (10, 256),  # 4
-            (1, 10),  # 5
-        ],
-    )
-
-
-def mnist():
-    metatable = {"relay.Constant": mnist_consts("float32")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1, 1, 28, 28), float32]) -> Tensor[(1, 10), float32] {
-          %0 = nn.pad(%x, 0f, pad_width=[[0, 0], [0, 0], [2, 2], [2, 2]]);
-          %1 = nn.conv2d(%0, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=8, kernel_size=[5, 5]);
-          %2 = add(%1, meta[relay.Constant][1]);
-          %3 = nn.relu(%2);
-          %4 = nn.max_pool2d(%3, pool_size=[2, 2], strides=[2, 2], padding=[0, 0, 0, 0]);
-          %5 = nn.pad(%4, 0f, pad_width=[[0, 0], [0, 0], [2, 2], [2, 2]]);
-          %6 = nn.conv2d(%5, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=16, kernel_size=[5, 5]);
-          %7 = add(%6, meta[relay.Constant][3]);
-          %8 = nn.relu(%7);
-          %9 = nn.max_pool2d(%8, pool_size=[3, 3], strides=[3, 3], padding=[0, 0, 0, 0]);
-          %10 = reshape(%9, newshape=[1, 256]);
-          %11 = nn.dense(%10, meta[relay.Constant][4], units=None, out_dtype="float32");
-          add(%11, meta[relay.Constant][5])
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "mnist",
-        "input_shapes": {"x": [1, 1, 28, 28]},
-        "input_dtypes": {"x": "float32"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def gpt2_consts(dtype):
-    return make_consts(
-        dtype,
-        [
-            (50257, 768),  # 0
-            (1, 32, 768),  # 1
-            (768,),  # 2
-            (768,),  # 3
-            (2304, 768),  # 4
-            (2304,),  # 5
-            (1, 1, 32, 32),  # 6
-            (1, 1, 32, 32),  # 7
-            (768, 768),  # 8
-            (768,),  # 9
-            (768,),  # 10
-            (768,),  # 11
-            (3072, 768),  # 12
-            (3072,),  # 13
-            (768, 3072),  # 14
-            (768,),  # 15
-            (768,),  # 16
-            (768,),  # 17
-            (2304, 768),  # 18
-            (2304,),  # 19
-            (1, 1, 32, 32),  # 20
-            (1, 1, 32, 32),  # 21
-            (768, 768),  # 22
-            (768,),  # 23
-            (768,),  # 24
-            (768,),  # 25
-            (3072, 768),  # 26
-            (3072,),  # 27
-            (768, 3072),  # 28
-            (768,),  # 29
-            (768,),  # 30
-            (768,),  # 31
-            (2304, 768),  # 32
-            (2304,),  # 33
-            (1, 1, 32, 32),  # 34
-            (1, 1, 32, 32),  # 35
-            (768, 768),  # 36
-            (768,),  # 37
-            (768,),  # 38
-            (768,),  # 39
-            (3072, 768),  # 40
-            (3072,),  # 41
-            (768, 3072),  # 42
-            (768,),  # 43
-            (768,),  # 44
-            (768,),  # 45
-            (2304, 768),  # 46
-            (2304,),  # 47
-            (1, 1, 32, 32),  # 48
-            (1, 1, 32, 32),  # 49
-            (768, 768),  # 50
-            (768,),  # 51
-            (768,),  # 52
-            (768,),  # 53
-            (3072, 768),  # 54
-            (3072,),  # 55
-            (768, 3072),  # 56
-            (768,),  # 57
-            (768,),  # 58
-            (768,),  # 59
-            (2304, 768),  # 60
-            (2304,),  # 61
-            (1, 1, 32, 32),  # 62
-            (1, 1, 32, 32),  # 63
-            (768, 768),  # 64
-            (768,),  # 65
-            (768,),  # 66
-            (768,),  # 67
-            (3072, 768),  # 68
-            (3072,),  # 69
-            (768, 3072),  # 70
-            (768,),  # 71
-            (768,),  # 72
-            (768,),  # 73
-            (2304, 768),  # 74
-            (2304,),  # 75
-            (1, 1, 32, 32),  # 76
-            (1, 1, 32, 32),  # 77
-            (768, 768),  # 78
-            (768,),  # 79
-            (768,),  # 80
-            (768,),  # 81
-            (3072, 768),  # 82
-            (3072,),  # 83
-            (768, 3072),  # 84
-            (768,),  # 85
-            (768,),  # 86
-            (768,),  # 87
-            (2304, 768),  # 88
-            (2304,),  # 89
-            (1, 1, 32, 32),  # 90
-            (1, 1, 32, 32),  # 91
-            (768, 768),  # 92
-            (768,),  # 93
-            (768,),  # 94
-            (768,),  # 95
-            (3072, 768),  # 96
-            (3072,),  # 97
-            (768, 3072),  # 98
-            (768,),  # 99
-            (768,),  # 100
-            (768,),  # 101
-            (2304, 768),  # 102
-            (2304,),  # 103
-            (1, 1, 32, 32),  # 104
-            (1, 1, 32, 32),  # 105
-            (768, 768),  # 106
-            (768,),  # 107
-            (768,),  # 108
-            (768,),  # 109
-            (3072, 768),  # 110
-            (3072,),  # 111
-            (768, 3072),  # 112
-            (768,),  # 113
-            (768,),  # 114
-            (768,),  # 115
-            (2304, 768),  # 116
-            (2304,),  # 117
-            (1, 1, 32, 32),  # 118
-            (1, 1, 32, 32),  # 119
-            (768, 768),  # 120
-            (768,),  # 121
-            (768,),  # 122
-            (768,),  # 123
-            (3072, 768),  # 124
-            (3072,),  # 125
-            (768, 3072),  # 126
-            (768,),  # 127
-            (768,),  # 128
-            (768,),  # 129
-            (2304, 768),  # 130
-            (2304,),  # 131
-            (1, 1, 32, 32),  # 132
-            (1, 1, 32, 32),  # 133
-            (768, 768),  # 134
-            (768,),  # 135
-            (768,),  # 136
-            (768,),  # 137
-            (3072, 768),  # 138
-            (3072,),  # 139
-            (768, 3072),  # 140
-            (768,),  # 141
-            (768,),  # 142
-            (768,),  # 143
-            (2304, 768),  # 144
-            (2304,),  # 145
-            (1, 1, 32, 32),  # 146
-            (1, 1, 32, 32),  # 147
-            (768, 768),  # 148
-            (768,),  # 149
-            (768,),  # 150
-            (768,),  # 151
-            (3072, 768),  # 152
-            (3072,),  # 153
-            (768, 3072),  # 154
-            (768,),  # 155
-            (768,),  # 156
-            (768,),  # 157
-            (2304, 768),  # 158
-            (2304,),  # 159
-            (1, 1, 32, 32),  # 160
-            (1, 1, 32, 32),  # 161
-            (768, 768),  # 162
-            (768,),  # 163
-            (768,),  # 164
-            (768,),  # 165
-            (3072, 768),  # 166
-            (3072,),  # 167
-            (768, 3072),  # 168
-            (768,),  # 169
-            (768,),  # 170
-            (768,),  # 171
-        ],
-    )
-
-
-def gpt2():
-    metatable = {"relay.Constant": gpt2_consts("float32")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1, 50, 32), int64]) -> (Tensor[(1, 50, 32, 768), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32],
-                                                      Tensor[(2, 50, 12, 32, 64), float32]) {
-          %0 = reshape(%x, newshape=[-1, 32]);
-          %1 = less(%0, 0i64);
-          %2 = add(%0, 50257i64);
-          %3 = where(%1, %2, %0);
-          %4 = take(meta[relay.Constant][0], %3, axis=0);
-          %5 = add(%4, meta[relay.Constant][1]);
-          %6 = mean(%5, axis=[-1], keepdims=True);
-          %7 = subtract(%5, %6);
-          %8 = power(%7, 2f);
-          %9 = mean(%8, axis=[-1], keepdims=True);
-          %10 = add(%9, 1e-05f);
-          %11 = sqrt(%10);
-          %12 = divide(%7, %11);
-          %13 = multiply(%12, meta[relay.Constant][2]);
-          %14 = add(%13, meta[relay.Constant][3]);
-          %15 = reshape(%14, newshape=[-1, 768]);
-          %16 = nn.dense(%15, meta[relay.Constant][4], units=2304);
-          %17 = add(%16, meta[relay.Constant][5]);
-          %18 = reshape(%17, newshape=[50, 32, 2304]);
-          %19 = split(%18, indices_or_sections=[768, 1536], axis=2);
-          %20 = %19.0;
-          %21 = reshape(%20, newshape=[50, 32, 12, 64]);
-          %22 = transpose(%21, axes=[0, 2, 1, 3]);
-          %23 = %19.1;
-          %24 = reshape(%23, newshape=[50, 32, 12, 64]);
-          %25 = transpose(%24, axes=[0, 2, 3, 1]);
-          %26 = reshape(%25, newshape=[-1, 64, 32]);
-          %27 = reshape(%22, newshape=[-1, 32, 64]);
-          %28 = transpose(%26, axes=[0, 2, 1]);
-          %29 = nn.batch_matmul(%27, %28, out_dtype="float32", transpose_b=True);
-          %30 = reshape(%29, newshape=[50, 12, 32, 32]);
-          %31 = divide(%30, 8f);
-          %32 = multiply(%31, meta[relay.Constant][6]);
-          %33 = subtract(%32, meta[relay.Constant][7]);
-          %34 = nn.softmax(%33, axis=3);
-          %35 = %19.2;
-          %36 = reshape(%35, newshape=[50, 32, 12, 64]);
-          %37 = transpose(%36, axes=[0, 2, 1, 3]);
-          %38 = reshape(%37, newshape=[-1, 32, 64]);
-          %39 = reshape(%34, newshape=[-1, 32, 32]);
-          %40 = transpose(%38, axes=[0, 2, 1]);
-          %41 = nn.batch_matmul(%39, %40, out_dtype="float32", transpose_b=True);
-          %42 = reshape(%41, newshape=[50, 12, 32, 64]);
-          %43 = transpose(%42, axes=[0, 2, 1, 3]);
-          %44 = reshape(%43, newshape=[50, 32, 768]);
-          %45 = reshape(%44, newshape=[-1, 768]);
-          %46 = nn.dense(%45, meta[relay.Constant][8], units=768);
-          %47 = add(%46, meta[relay.Constant][9]);
-          %48 = reshape(%47, newshape=[50, 32, 768]);
-          %49 = add(%5, %48);
-          %50 = mean(%49, axis=[-1], keepdims=True);
-          %51 = subtract(%49, %50);
-          %52 = power(%51, 2f);
-          %53 = mean(%52, axis=[-1], keepdims=True);
-          %54 = add(%53, 1e-05f);
-          %55 = sqrt(%54);
-          %56 = divide(%51, %55);
-          %57 = multiply(%56, meta[relay.Constant][10]);
-          %58 = add(%57, meta[relay.Constant][11]);
-          %59 = reshape(%58, newshape=[-1, 768]);
-          %60 = nn.dense(%59, meta[relay.Constant][12], units=3072);
-          %61 = add(%60, meta[relay.Constant][13]);
-          %62 = reshape(%61, newshape=[50, 32, 3072]);
-          %63 = power(%62, 3f);
-          %64 = multiply(%63, 0.044715f);
-          %65 = add(%62, %64);
-          %66 = multiply(%65, 0.797885f);
-          %67 = tanh(%66);
-          %68 = multiply(%62, 0.5f);
-          %69 = add(%67, 1f);
-          %70 = multiply(%68, %69);
-          %71 = reshape(%70, newshape=[-1, 3072]);
-          %72 = nn.dense(%71, meta[relay.Constant][14], units=768);
-          %73 = add(%72, meta[relay.Constant][15]);
-          %74 = reshape(%73, newshape=[50, 32, 768]);
-          %75 = add(%49, %74);
-          %76 = mean(%75, axis=[-1], keepdims=True);
-          %77 = subtract(%75, %76);
-          %78 = power(%77, 2f);
-          %79 = mean(%78, axis=[-1], keepdims=True);
-          %80 = add(%79, 1e-05f);
-          %81 = sqrt(%80);
-          %82 = divide(%77, %81);
-          %83 = multiply(%82, meta[relay.Constant][16]);
-          %84 = add(%83, meta[relay.Constant][17]);
-          %85 = reshape(%84, newshape=[-1, 768]);
-          %86 = nn.dense(%85, meta[relay.Constant][18], units=2304);
-          %87 = add(%86, meta[relay.Constant][19]);
-          %88 = reshape(%87, newshape=[50, 32, 2304]);
-          %89 = split(%88, indices_or_sections=[768, 1536], axis=2);
-          %90 = %89.0;
-          %91 = reshape(%90, newshape=[50, 32, 12, 64]);
-          %92 = transpose(%91, axes=[0, 2, 1, 3]);
-          %93 = %89.1;
-          %94 = reshape(%93, newshape=[50, 32, 12, 64]);
-          %95 = transpose(%94, axes=[0, 2, 3, 1]);
-          %96 = reshape(%95, newshape=[-1, 64, 32]);
-          %97 = reshape(%92, newshape=[-1, 32, 64]);
-          %98 = transpose(%96, axes=[0, 2, 1]);
-          %99 = nn.batch_matmul(%97, %98, out_dtype="float32", transpose_b=True);
-          %100 = reshape(%99, newshape=[50, 12, 32, 32]);
-          %101 = divide(%100, 8f);
-          %102 = multiply(%101, meta[relay.Constant][20]);
-          %103 = subtract(%102, meta[relay.Constant][21]);
-          %104 = nn.softmax(%103, axis=3);
-          %105 = %89.2;
-          %106 = reshape(%105, newshape=[50, 32, 12, 64]);
-          %107 = transpose(%106, axes=[0, 2, 1, 3]);
-          %108 = reshape(%107, newshape=[-1, 32, 64]);
-          %109 = reshape(%104, newshape=[-1, 32, 32]);
-          %110 = transpose(%108, axes=[0, 2, 1]);
-          %111 = nn.batch_matmul(%109, %110, out_dtype="float32", transpose_b=True);
-          %112 = reshape(%111, newshape=[50, 12, 32, 64]);
-          %113 = transpose(%112, axes=[0, 2, 1, 3]);
-          %114 = reshape(%113, newshape=[50, 32, 768]);
-          %115 = reshape(%114, newshape=[-1, 768]);
-          %116 = nn.dense(%115, meta[relay.Constant][22], units=768);
-          %117 = add(%116, meta[relay.Constant][23]);
-          %118 = reshape(%117, newshape=[50, 32, 768]);
-          %119 = add(%75, %118);
-          %120 = mean(%119, axis=[-1], keepdims=True);
-          %121 = subtract(%119, %120);
-          %122 = power(%121, 2f);
-          %123 = mean(%122, axis=[-1], keepdims=True);
-          %124 = add(%123, 1e-05f);
-          %125 = sqrt(%124);
-          %126 = divide(%121, %125);
-          %127 = multiply(%126, meta[relay.Constant][24]);
-          %128 = add(%127, meta[relay.Constant][25]);
-          %129 = reshape(%128, newshape=[-1, 768]);
-          %130 = nn.dense(%129, meta[relay.Constant][26], units=3072);
-          %131 = add(%130, meta[relay.Constant][27]);
-          %132 = reshape(%131, newshape=[50, 32, 3072]);
-          %133 = power(%132, 3f);
-          %134 = multiply(%133, 0.044715f);
-          %135 = add(%132, %134);
-          %136 = multiply(%135, 0.797885f);
-          %137 = tanh(%136);
-          %138 = multiply(%132, 0.5f);
-          %139 = add(%137, 1f);
-          %140 = multiply(%138, %139);
-          %141 = reshape(%140, newshape=[-1, 3072]);
-          %142 = nn.dense(%141, meta[relay.Constant][28], units=768);
-          %143 = add(%142, meta[relay.Constant][29]);
-          %144 = reshape(%143, newshape=[50, 32, 768]);
-          %145 = add(%119, %144);
-          %146 = mean(%145, axis=[-1], keepdims=True);
-          %147 = subtract(%145, %146);
-          %148 = power(%147, 2f);
-          %149 = mean(%148, axis=[-1], keepdims=True);
-          %150 = add(%149, 1e-05f);
-          %151 = sqrt(%150);
-          %152 = divide(%147, %151);
-          %153 = multiply(%152, meta[relay.Constant][30]);
-          %154 = add(%153, meta[relay.Constant][31]);
-          %155 = reshape(%154, newshape=[-1, 768]);
-          %156 = nn.dense(%155, meta[relay.Constant][32], units=2304);
-          %157 = add(%156, meta[relay.Constant][33]);
-          %158 = reshape(%157, newshape=[50, 32, 2304]);
-          %159 = split(%158, indices_or_sections=[768, 1536], axis=2);
-          %160 = %159.0;
-          %161 = reshape(%160, newshape=[50, 32, 12, 64]);
-          %162 = transpose(%161, axes=[0, 2, 1, 3]);
-          %163 = %159.1;
-          %164 = reshape(%163, newshape=[50, 32, 12, 64]);
-          %165 = transpose(%164, axes=[0, 2, 3, 1]);
-          %166 = reshape(%165, newshape=[-1, 64, 32]);
-          %167 = reshape(%162, newshape=[-1, 32, 64]);
-          %168 = transpose(%166, axes=[0, 2, 1]);
-          %169 = nn.batch_matmul(%167, %168, out_dtype="float32", transpose_b=True);
-          %170 = reshape(%169, newshape=[50, 12, 32, 32]);
-          %171 = divide(%170, 8f);
-          %172 = multiply(%171, meta[relay.Constant][34]);
-          %173 = subtract(%172, meta[relay.Constant][35]);
-          %174 = nn.softmax(%173, axis=3);
-          %175 = %159.2;
-          %176 = reshape(%175, newshape=[50, 32, 12, 64]);
-          %177 = transpose(%176, axes=[0, 2, 1, 3]);
-          %178 = reshape(%177, newshape=[-1, 32, 64]);
-          %179 = reshape(%174, newshape=[-1, 32, 32]);
-          %180 = transpose(%178, axes=[0, 2, 1]);
-          %181 = nn.batch_matmul(%179, %180, out_dtype="float32", transpose_b=True);
-          %182 = reshape(%181, newshape=[50, 12, 32, 64]);
-          %183 = transpose(%182, axes=[0, 2, 1, 3]);
-          %184 = reshape(%183, newshape=[50, 32, 768]);
-          %185 = reshape(%184, newshape=[-1, 768]);
-          %186 = nn.dense(%185, meta[relay.Constant][36], units=768);
-          %187 = add(%186, meta[relay.Constant][37]);
-          %188 = reshape(%187, newshape=[50, 32, 768]);
-          %189 = add(%145, %188);
-          %190 = mean(%189, axis=[-1], keepdims=True);
-          %191 = subtract(%189, %190);
-          %192 = power(%191, 2f);
-          %193 = mean(%192, axis=[-1], keepdims=True);
-          %194 = add(%193, 1e-05f);
-          %195 = sqrt(%194);
-          %196 = divide(%191, %195);
-          %197 = multiply(%196, meta[relay.Constant][38]);
-          %198 = add(%197, meta[relay.Constant][39]);
-          %199 = reshape(%198, newshape=[-1, 768]);
-          %200 = nn.dense(%199, meta[relay.Constant][40], units=3072);
-          %201 = add(%200, meta[relay.Constant][41]);
-          %202 = reshape(%201, newshape=[50, 32, 3072]);
-          %203 = power(%202, 3f);
-          %204 = multiply(%203, 0.044715f);
-          %205 = add(%202, %204);
-          %206 = multiply(%205, 0.797885f);
-          %207 = tanh(%206);
-          %208 = multiply(%202, 0.5f);
-          %209 = add(%207, 1f);
-          %210 = multiply(%208, %209);
-          %211 = reshape(%210, newshape=[-1, 3072]);
-          %212 = nn.dense(%211, meta[relay.Constant][42], units=768);
-          %213 = add(%212, meta[relay.Constant][43]);
-          %214 = reshape(%213, newshape=[50, 32, 768]);
-          %215 = add(%189, %214);
-          %216 = mean(%215, axis=[-1], keepdims=True);
-          %217 = subtract(%215, %216);
-          %218 = power(%217, 2f);
-          %219 = mean(%218, axis=[-1], keepdims=True);
-          %220 = add(%219, 1e-05f);
-          %221 = sqrt(%220);
-          %222 = divide(%217, %221);
-          %223 = multiply(%222, meta[relay.Constant][44]);
-          %224 = add(%223, meta[relay.Constant][45]);
-          %225 = reshape(%224, newshape=[-1, 768]);
-          %226 = nn.dense(%225, meta[relay.Constant][46], units=2304);
-          %227 = add(%226, meta[relay.Constant][47]);
-          %228 = reshape(%227, newshape=[50, 32, 2304]);
-          %229 = split(%228, indices_or_sections=[768, 1536], axis=2);
-          %230 = %229.0;
-          %231 = reshape(%230, newshape=[50, 32, 12, 64]);
-          %232 = transpose(%231, axes=[0, 2, 1, 3]);
-          %233 = %229.1;
-          %234 = reshape(%233, newshape=[50, 32, 12, 64]);
-          %235 = transpose(%234, axes=[0, 2, 3, 1]);
-          %236 = reshape(%235, newshape=[-1, 64, 32]);
-          %237 = reshape(%232, newshape=[-1, 32, 64]);
-          %238 = transpose(%236, axes=[0, 2, 1]);
-          %239 = nn.batch_matmul(%237, %238, out_dtype="float32", transpose_b=True);
-          %240 = reshape(%239, newshape=[50, 12, 32, 32]);
-          %241 = divide(%240, 8f);
-          %242 = multiply(%241, meta[relay.Constant][48]);
-          %243 = subtract(%242, meta[relay.Constant][49]);
-          %244 = nn.softmax(%243, axis=3);
-          %245 = %229.2;
-          %246 = reshape(%245, newshape=[50, 32, 12, 64]);
-          %247 = transpose(%246, axes=[0, 2, 1, 3]);
-          %248 = reshape(%247, newshape=[-1, 32, 64]);
-          %249 = reshape(%244, newshape=[-1, 32, 32]);
-          %250 = transpose(%248, axes=[0, 2, 1]);
-          %251 = nn.batch_matmul(%249, %250, out_dtype="float32", transpose_b=True);
-          %252 = reshape(%251, newshape=[50, 12, 32, 64]);
-          %253 = transpose(%252, axes=[0, 2, 1, 3]);
-          %254 = reshape(%253, newshape=[50, 32, 768]);
-          %255 = reshape(%254, newshape=[-1, 768]);
-          %256 = nn.dense(%255, meta[relay.Constant][50], units=768);
-          %257 = add(%256, meta[relay.Constant][51]);
-          %258 = reshape(%257, newshape=[50, 32, 768]);
-          %259 = add(%215, %258);
-          %260 = mean(%259, axis=[-1], keepdims=True);
-          %261 = subtract(%259, %260);
-          %262 = power(%261, 2f);
-          %263 = mean(%262, axis=[-1], keepdims=True);
-          %264 = add(%263, 1e-05f);
-          %265 = sqrt(%264);
-          %266 = divide(%261, %265);
-          %267 = multiply(%266, meta[relay.Constant][52]);
-          %268 = add(%267, meta[relay.Constant][53]);
-          %269 = reshape(%268, newshape=[-1, 768]);
-          %270 = nn.dense(%269, meta[relay.Constant][54], units=3072);
-          %271 = add(%270, meta[relay.Constant][55]);
-          %272 = reshape(%271, newshape=[50, 32, 3072]);
-          %273 = power(%272, 3f);
-          %274 = multiply(%273, 0.044715f);
-          %275 = add(%272, %274);
-          %276 = multiply(%275, 0.797885f);
-          %277 = tanh(%276);
-          %278 = multiply(%272, 0.5f);
-          %279 = add(%277, 1f);
-          %280 = multiply(%278, %279);
-          %281 = reshape(%280, newshape=[-1, 3072]);
-          %282 = nn.dense(%281, meta[relay.Constant][56], units=768);
-          %283 = add(%282, meta[relay.Constant][57]);
-          %284 = reshape(%283, newshape=[50, 32, 768]);
-          %285 = add(%259, %284);
-          %286 = mean(%285, axis=[-1], keepdims=True);
-          %287 = subtract(%285, %286);
-          %288 = power(%287, 2f);
-          %289 = mean(%288, axis=[-1], keepdims=True);
-          %290 = add(%289, 1e-05f);
-          %291 = sqrt(%290);
-          %292 = divide(%287, %291);
-          %293 = multiply(%292, meta[relay.Constant][58]);
-          %294 = add(%293, meta[relay.Constant][59]);
-          %295 = reshape(%294, newshape=[-1, 768]);
-          %296 = nn.dense(%295, meta[relay.Constant][60], units=2304);
-          %297 = add(%296, meta[relay.Constant][61]);
-          %298 = reshape(%297, newshape=[50, 32, 2304]);
-          %299 = split(%298, indices_or_sections=[768, 1536], axis=2);
-          %300 = %299.0;
-          %301 = reshape(%300, newshape=[50, 32, 12, 64]);
-          %302 = transpose(%301, axes=[0, 2, 1, 3]);
-          %303 = %299.1;
-          %304 = reshape(%303, newshape=[50, 32, 12, 64]);
-          %305 = transpose(%304, axes=[0, 2, 3, 1]);
-          %306 = reshape(%305, newshape=[-1, 64, 32]);
-          %307 = reshape(%302, newshape=[-1, 32, 64]);
-          %308 = transpose(%306, axes=[0, 2, 1]);
-          %309 = nn.batch_matmul(%307, %308, out_dtype="float32", transpose_b=True);
-          %310 = reshape(%309, newshape=[50, 12, 32, 32]);
-          %311 = divide(%310, 8f);
-          %312 = multiply(%311, meta[relay.Constant][62]);
-          %313 = subtract(%312, meta[relay.Constant][63]);
-          %314 = nn.softmax(%313, axis=3);
-          %315 = %299.2;
-          %316 = reshape(%315, newshape=[50, 32, 12, 64]);
-          %317 = transpose(%316, axes=[0, 2, 1, 3]);
-          %318 = reshape(%317, newshape=[-1, 32, 64]);
-          %319 = reshape(%314, newshape=[-1, 32, 32]);
-          %320 = transpose(%318, axes=[0, 2, 1]);
-          %321 = nn.batch_matmul(%319, %320, out_dtype="float32", transpose_b=True);
-          %322 = reshape(%321, newshape=[50, 12, 32, 64]);
-          %323 = transpose(%322, axes=[0, 2, 1, 3]);
-          %324 = reshape(%323, newshape=[50, 32, 768]);
-          %325 = reshape(%324, newshape=[-1, 768]);
-          %326 = nn.dense(%325, meta[relay.Constant][64], units=768);
-          %327 = add(%326, meta[relay.Constant][65]);
-          %328 = reshape(%327, newshape=[50, 32, 768]);
-          %329 = add(%285, %328);
-          %330 = mean(%329, axis=[-1], keepdims=True);
-          %331 = subtract(%329, %330);
-          %332 = power(%331, 2f);
-          %333 = mean(%332, axis=[-1], keepdims=True);
-          %334 = add(%333, 1e-05f);
-          %335 = sqrt(%334);
-          %336 = divide(%331, %335);
-          %337 = multiply(%336, meta[relay.Constant][66]);
-          %338 = add(%337, meta[relay.Constant][67]);
-          %339 = reshape(%338, newshape=[-1, 768]);
-          %340 = nn.dense(%339, meta[relay.Constant][68], units=3072);
-          %341 = add(%340, meta[relay.Constant][69]);
-          %342 = reshape(%341, newshape=[50, 32, 3072]);
-          %343 = power(%342, 3f);
-          %344 = multiply(%343, 0.044715f);
-          %345 = add(%342, %344);
-          %346 = multiply(%345, 0.797885f);
-          %347 = tanh(%346);
-          %348 = multiply(%342, 0.5f);
-          %349 = add(%347, 1f);
-          %350 = multiply(%348, %349);
-          %351 = reshape(%350, newshape=[-1, 3072]);
-          %352 = nn.dense(%351, meta[relay.Constant][70], units=768);
-          %353 = add(%352, meta[relay.Constant][71]);
-          %354 = reshape(%353, newshape=[50, 32, 768]);
-          %355 = add(%329, %354);
-          %356 = mean(%355, axis=[-1], keepdims=True);
-          %357 = subtract(%355, %356);
-          %358 = power(%357, 2f);
-          %359 = mean(%358, axis=[-1], keepdims=True);
-          %360 = add(%359, 1e-05f);
-          %361 = sqrt(%360);
-          %362 = divide(%357, %361);
-          %363 = multiply(%362, meta[relay.Constant][72]);
-          %364 = add(%363, meta[relay.Constant][73]);
-          %365 = reshape(%364, newshape=[-1, 768]);
-          %366 = nn.dense(%365, meta[relay.Constant][74], units=2304);
-          %367 = add(%366, meta[relay.Constant][75]);
-          %368 = reshape(%367, newshape=[50, 32, 2304]);
-          %369 = split(%368, indices_or_sections=[768, 1536], axis=2);
-          %370 = %369.0;
-          %371 = reshape(%370, newshape=[50, 32, 12, 64]);
-          %372 = transpose(%371, axes=[0, 2, 1, 3]);
-          %373 = %369.1;
-          %374 = reshape(%373, newshape=[50, 32, 12, 64]);
-          %375 = transpose(%374, axes=[0, 2, 3, 1]);
-          %376 = reshape(%375, newshape=[-1, 64, 32]);
-          %377 = reshape(%372, newshape=[-1, 32, 64]);
-          %378 = transpose(%376, axes=[0, 2, 1]);
-          %379 = nn.batch_matmul(%377, %378, out_dtype="float32", transpose_b=True);
-          %380 = reshape(%379, newshape=[50, 12, 32, 32]);
-          %381 = divide(%380, 8f);
-          %382 = multiply(%381, meta[relay.Constant][76]);
-          %383 = subtract(%382, meta[relay.Constant][77]);
-          %384 = nn.softmax(%383, axis=3);
-          %385 = %369.2;
-          %386 = reshape(%385, newshape=[50, 32, 12, 64]);
-          %387 = transpose(%386, axes=[0, 2, 1, 3]);
-          %388 = reshape(%387, newshape=[-1, 32, 64]);
-          %389 = reshape(%384, newshape=[-1, 32, 32]);
-          %390 = transpose(%388, axes=[0, 2, 1]);
-          %391 = nn.batch_matmul(%389, %390, out_dtype="float32", transpose_b=True);
-          %392 = reshape(%391, newshape=[50, 12, 32, 64]);
-          %393 = transpose(%392, axes=[0, 2, 1, 3]);
-          %394 = reshape(%393, newshape=[50, 32, 768]);
-          %395 = reshape(%394, newshape=[-1, 768]);
-          %396 = nn.dense(%395, meta[relay.Constant][78], units=768);
-          %397 = add(%396, meta[relay.Constant][79]);
-          %398 = reshape(%397, newshape=[50, 32, 768]);
-          %399 = add(%355, %398);
-          %400 = mean(%399, axis=[-1], keepdims=True);
-          %401 = subtract(%399, %400);
-          %402 = power(%401, 2f);
-          %403 = mean(%402, axis=[-1], keepdims=True);
-          %404 = add(%403, 1e-05f);
-          %405 = sqrt(%404);
-          %406 = divide(%401, %405);
-          %407 = multiply(%406, meta[relay.Constant][80]);
-          %408 = add(%407, meta[relay.Constant][81]);
-          %409 = reshape(%408, newshape=[-1, 768]);
-          %410 = nn.dense(%409, meta[relay.Constant][82], units=3072);
-          %411 = add(%410, meta[relay.Constant][83]);
-          %412 = reshape(%411, newshape=[50, 32, 3072]);
-          %413 = power(%412, 3f);
-          %414 = multiply(%413, 0.044715f);
-          %415 = add(%412, %414);
-          %416 = multiply(%415, 0.797885f);
-          %417 = tanh(%416);
-          %418 = multiply(%412, 0.5f);
-          %419 = add(%417, 1f);
-          %420 = multiply(%418, %419);
-          %421 = reshape(%420, newshape=[-1, 3072]);
-          %422 = nn.dense(%421, meta[relay.Constant][84], units=768);
-          %423 = add(%422, meta[relay.Constant][85]);
-          %424 = reshape(%423, newshape=[50, 32, 768]);
-          %425 = add(%399, %424);
-          %426 = mean(%425, axis=[-1], keepdims=True);
-          %427 = subtract(%425, %426);
-          %428 = power(%427, 2f);
-          %429 = mean(%428, axis=[-1], keepdims=True);
-          %430 = add(%429, 1e-05f);
-          %431 = sqrt(%430);
-          %432 = divide(%427, %431);
-          %433 = multiply(%432, meta[relay.Constant][86]);
-          %434 = add(%433, meta[relay.Constant][87]);
-          %435 = reshape(%434, newshape=[-1, 768]);
-          %436 = nn.dense(%435, meta[relay.Constant][88], units=2304);
-          %437 = add(%436, meta[relay.Constant][89]);
-          %438 = reshape(%437, newshape=[50, 32, 2304]);
-          %439 = split(%438, indices_or_sections=[768, 1536], axis=2);
-          %440 = %439.0;
-          %441 = reshape(%440, newshape=[50, 32, 12, 64]);
-          %442 = transpose(%441, axes=[0, 2, 1, 3]);
-          %443 = %439.1;
-          %444 = reshape(%443, newshape=[50, 32, 12, 64]);
-          %445 = transpose(%444, axes=[0, 2, 3, 1]);
-          %446 = reshape(%445, newshape=[-1, 64, 32]);
-          %447 = reshape(%442, newshape=[-1, 32, 64]);
-          %448 = transpose(%446, axes=[0, 2, 1]);
-          %449 = nn.batch_matmul(%447, %448, out_dtype="float32", transpose_b=True);
-          %450 = reshape(%449, newshape=[50, 12, 32, 32]);
-          %451 = divide(%450, 8f);
-          %452 = multiply(%451, meta[relay.Constant][90]);
-          %453 = subtract(%452, meta[relay.Constant][91]);
-          %454 = nn.softmax(%453, axis=3);
-          %455 = %439.2;
-          %456 = reshape(%455, newshape=[50, 32, 12, 64]);
-          %457 = transpose(%456, axes=[0, 2, 1, 3]);
-          %458 = reshape(%457, newshape=[-1, 32, 64]);
-          %459 = reshape(%454, newshape=[-1, 32, 32]);
-          %460 = transpose(%458, axes=[0, 2, 1]);
-          %461 = nn.batch_matmul(%459, %460, out_dtype="float32", transpose_b=True);
-          %462 = reshape(%461, newshape=[50, 12, 32, 64]);
-          %463 = transpose(%462, axes=[0, 2, 1, 3]);
-          %464 = reshape(%463, newshape=[50, 32, 768]);
-          %465 = reshape(%464, newshape=[-1, 768]);
-          %466 = nn.dense(%465, meta[relay.Constant][92], units=768);
-          %467 = add(%466, meta[relay.Constant][93]);
-          %468 = reshape(%467, newshape=[50, 32, 768]);
-          %469 = add(%425, %468);
-          %470 = mean(%469, axis=[-1], keepdims=True);
-          %471 = subtract(%469, %470);
-          %472 = power(%471, 2f);
-          %473 = mean(%472, axis=[-1], keepdims=True);
-          %474 = add(%473, 1e-05f);
-          %475 = sqrt(%474);
-          %476 = divide(%471, %475);
-          %477 = multiply(%476, meta[relay.Constant][94]);
-          %478 = add(%477, meta[relay.Constant][95]);
-          %479 = reshape(%478, newshape=[-1, 768]);
-          %480 = nn.dense(%479, meta[relay.Constant][96], units=3072);
-          %481 = add(%480, meta[relay.Constant][97]);
-          %482 = reshape(%481, newshape=[50, 32, 3072]);
-          %483 = power(%482, 3f);
-          %484 = multiply(%483, 0.044715f);
-          %485 = add(%482, %484);
-          %486 = multiply(%485, 0.797885f);
-          %487 = tanh(%486);
-          %488 = multiply(%482, 0.5f);
-          %489 = add(%487, 1f);
-          %490 = multiply(%488, %489);
-          %491 = reshape(%490, newshape=[-1, 3072]);
-          %492 = nn.dense(%491, meta[relay.Constant][98], units=768);
-          %493 = add(%492, meta[relay.Constant][99]);
-          %494 = reshape(%493, newshape=[50, 32, 768]);
-          %495 = add(%469, %494);
-          %496 = mean(%495, axis=[-1], keepdims=True);
-          %497 = subtract(%495, %496);
-          %498 = power(%497, 2f);
-          %499 = mean(%498, axis=[-1], keepdims=True);
-          %500 = add(%499, 1e-05f);
-          %501 = sqrt(%500);
-          %502 = divide(%497, %501);
-          %503 = multiply(%502, meta[relay.Constant][100]);
-          %504 = add(%503, meta[relay.Constant][101]);
-          %505 = reshape(%504, newshape=[-1, 768]);
-          %506 = nn.dense(%505, meta[relay.Constant][102], units=2304);
-          %507 = add(%506, meta[relay.Constant][103]);
-          %508 = reshape(%507, newshape=[50, 32, 2304]);
-          %509 = split(%508, indices_or_sections=[768, 1536], axis=2);
-          %510 = %509.0;
-          %511 = reshape(%510, newshape=[50, 32, 12, 64]);
-          %512 = transpose(%511, axes=[0, 2, 1, 3]);
-          %513 = %509.1;
-          %514 = reshape(%513, newshape=[50, 32, 12, 64]);
-          %515 = transpose(%514, axes=[0, 2, 3, 1]);
-          %516 = reshape(%515, newshape=[-1, 64, 32]);
-          %517 = reshape(%512, newshape=[-1, 32, 64]);
-          %518 = transpose(%516, axes=[0, 2, 1]);
-          %519 = nn.batch_matmul(%517, %518, out_dtype="float32", transpose_b=True);
-          %520 = reshape(%519, newshape=[50, 12, 32, 32]);
-          %521 = divide(%520, 8f);
-          %522 = multiply(%521, meta[relay.Constant][104]);
-          %523 = subtract(%522, meta[relay.Constant][105]);
-          %524 = nn.softmax(%523, axis=3);
-          %525 = %509.2;
-          %526 = reshape(%525, newshape=[50, 32, 12, 64]);
-          %527 = transpose(%526, axes=[0, 2, 1, 3]);
-          %528 = reshape(%527, newshape=[-1, 32, 64]);
-          %529 = reshape(%524, newshape=[-1, 32, 32]);
-          %530 = transpose(%528, axes=[0, 2, 1]);
-          %531 = nn.batch_matmul(%529, %530, out_dtype="float32", transpose_b=True);
-          %532 = reshape(%531, newshape=[50, 12, 32, 64]);
-          %533 = transpose(%532, axes=[0, 2, 1, 3]);
-          %534 = reshape(%533, newshape=[50, 32, 768]);
-          %535 = reshape(%534, newshape=[-1, 768]);
-          %536 = nn.dense(%535, meta[relay.Constant][106], units=768);
-          %537 = add(%536, meta[relay.Constant][107]);
-          %538 = reshape(%537, newshape=[50, 32, 768]);
-          %539 = add(%495, %538);
-          %540 = mean(%539, axis=[-1], keepdims=True);
-          %541 = subtract(%539, %540);
-          %542 = power(%541, 2f);
-          %543 = mean(%542, axis=[-1], keepdims=True);
-          %544 = add(%543, 1e-05f);
-          %545 = sqrt(%544);
-          %546 = divide(%541, %545);
-          %547 = multiply(%546, meta[relay.Constant][108]);
-          %548 = add(%547, meta[relay.Constant][109]);
-          %549 = reshape(%548, newshape=[-1, 768]);
-          %550 = nn.dense(%549, meta[relay.Constant][110], units=3072);
-          %551 = add(%550, meta[relay.Constant][111]);
-          %552 = reshape(%551, newshape=[50, 32, 3072]);
-          %553 = power(%552, 3f);
-          %554 = multiply(%553, 0.044715f);
-          %555 = add(%552, %554);
-          %556 = multiply(%555, 0.797885f);
-          %557 = tanh(%556);
-          %558 = multiply(%552, 0.5f);
-          %559 = add(%557, 1f);
-          %560 = multiply(%558, %559);
-          %561 = reshape(%560, newshape=[-1, 3072]);
-          %562 = nn.dense(%561, meta[relay.Constant][112], units=768);
-          %563 = add(%562, meta[relay.Constant][113]);
-          %564 = reshape(%563, newshape=[50, 32, 768]);
-          %565 = add(%539, %564);
-          %566 = mean(%565, axis=[-1], keepdims=True);
-          %567 = subtract(%565, %566);
-          %568 = power(%567, 2f);
-          %569 = mean(%568, axis=[-1], keepdims=True);
-          %570 = add(%569, 1e-05f);
-          %571 = sqrt(%570);
-          %572 = divide(%567, %571);
-          %573 = multiply(%572, meta[relay.Constant][114]);
-          %574 = add(%573, meta[relay.Constant][115]);
-          %575 = reshape(%574, newshape=[-1, 768]);
-          %576 = nn.dense(%575, meta[relay.Constant][116], units=2304);
-          %577 = add(%576, meta[relay.Constant][117]);
-          %578 = reshape(%577, newshape=[50, 32, 2304]);
-          %579 = split(%578, indices_or_sections=[768, 1536], axis=2);
-          %580 = %579.0;
-          %581 = reshape(%580, newshape=[50, 32, 12, 64]);
-          %582 = transpose(%581, axes=[0, 2, 1, 3]);
-          %583 = %579.1;
-          %584 = reshape(%583, newshape=[50, 32, 12, 64]);
-          %585 = transpose(%584, axes=[0, 2, 3, 1]);
-          %586 = reshape(%585, newshape=[-1, 64, 32]);
-          %587 = reshape(%582, newshape=[-1, 32, 64]);
-          %588 = transpose(%586, axes=[0, 2, 1]);
-          %589 = nn.batch_matmul(%587, %588, out_dtype="float32", transpose_b=True);
-          %590 = reshape(%589, newshape=[50, 12, 32, 32]);
-          %591 = divide(%590, 8f);
-          %592 = multiply(%591, meta[relay.Constant][118]);
-          %593 = subtract(%592, meta[relay.Constant][119]);
-          %594 = nn.softmax(%593, axis=3);
-          %595 = %579.2;
-          %596 = reshape(%595, newshape=[50, 32, 12, 64]);
-          %597 = transpose(%596, axes=[0, 2, 1, 3]);
-          %598 = reshape(%597, newshape=[-1, 32, 64]);
-          %599 = reshape(%594, newshape=[-1, 32, 32]);
-          %600 = transpose(%598, axes=[0, 2, 1]);
-          %601 = nn.batch_matmul(%599, %600, out_dtype="float32", transpose_b=True);
-          %602 = reshape(%601, newshape=[50, 12, 32, 64]);
-          %603 = transpose(%602, axes=[0, 2, 1, 3]);
-          %604 = reshape(%603, newshape=[50, 32, 768]);
-          %605 = reshape(%604, newshape=[-1, 768]);
-          %606 = nn.dense(%605, meta[relay.Constant][120], units=768);
-          %607 = add(%606, meta[relay.Constant][121]);
-          %608 = reshape(%607, newshape=[50, 32, 768]);
-          %609 = add(%565, %608);
-          %610 = mean(%609, axis=[-1], keepdims=True);
-          %611 = subtract(%609, %610);
-          %612 = power(%611, 2f);
-          %613 = mean(%612, axis=[-1], keepdims=True);
-          %614 = add(%613, 1e-05f);
-          %615 = sqrt(%614);
-          %616 = divide(%611, %615);
-          %617 = multiply(%616, meta[relay.Constant][122]);
-          %618 = add(%617, meta[relay.Constant][123]);
-          %619 = reshape(%618, newshape=[-1, 768]);
-          %620 = nn.dense(%619, meta[relay.Constant][124], units=3072);
-          %621 = add(%620, meta[relay.Constant][125]);
-          %622 = reshape(%621, newshape=[50, 32, 3072]);
-          %623 = power(%622, 3f);
-          %624 = multiply(%623, 0.044715f);
-          %625 = add(%622, %624);
-          %626 = multiply(%625, 0.797885f);
-          %627 = tanh(%626);
-          %628 = multiply(%622, 0.5f);
-          %629 = add(%627, 1f);
-          %630 = multiply(%628, %629);
-          %631 = reshape(%630, newshape=[-1, 3072]);
-          %632 = nn.dense(%631, meta[relay.Constant][126], units=768);
-          %633 = add(%632, meta[relay.Constant][127]);
-          %634 = reshape(%633, newshape=[50, 32, 768]);
-          %635 = add(%609, %634);
-          %636 = mean(%635, axis=[-1], keepdims=True);
-          %637 = subtract(%635, %636);
-          %638 = power(%637, 2f);
-          %639 = mean(%638, axis=[-1], keepdims=True);
-          %640 = add(%639, 1e-05f);
-          %641 = sqrt(%640);
-          %642 = divide(%637, %641);
-          %643 = multiply(%642, meta[relay.Constant][128]);
-          %644 = add(%643, meta[relay.Constant][129]);
-          %645 = reshape(%644, newshape=[-1, 768]);
-          %646 = nn.dense(%645, meta[relay.Constant][130], units=2304);
-          %647 = add(%646, meta[relay.Constant][131]);
-          %648 = reshape(%647, newshape=[50, 32, 2304]);
-          %649 = split(%648, indices_or_sections=[768, 1536], axis=2);
-          %650 = %649.0;
-          %651 = reshape(%650, newshape=[50, 32, 12, 64]);
-          %652 = transpose(%651, axes=[0, 2, 1, 3]);
-          %653 = %649.1;
-          %654 = reshape(%653, newshape=[50, 32, 12, 64]);
-          %655 = transpose(%654, axes=[0, 2, 3, 1]);
-          %656 = reshape(%655, newshape=[-1, 64, 32]);
-          %657 = reshape(%652, newshape=[-1, 32, 64]);
-          %658 = transpose(%656, axes=[0, 2, 1]);
-          %659 = nn.batch_matmul(%657, %658, out_dtype="float32", transpose_b=True);
-          %660 = reshape(%659, newshape=[50, 12, 32, 32]);
-          %661 = divide(%660, 8f);
-          %662 = multiply(%661, meta[relay.Constant][132]);
-          %663 = subtract(%662, meta[relay.Constant][133]);
-          %664 = nn.softmax(%663, axis=3);
-          %665 = %649.2;
-          %666 = reshape(%665, newshape=[50, 32, 12, 64]);
-          %667 = transpose(%666, axes=[0, 2, 1, 3]);
-          %668 = reshape(%667, newshape=[-1, 32, 64]);
-          %669 = reshape(%664, newshape=[-1, 32, 32]);
-          %670 = transpose(%668, axes=[0, 2, 1]);
-          %671 = nn.batch_matmul(%669, %670, out_dtype="float32", transpose_b=True);
-          %672 = reshape(%671, newshape=[50, 12, 32, 64]);
-          %673 = transpose(%672, axes=[0, 2, 1, 3]);
-          %674 = reshape(%673, newshape=[50, 32, 768]);
-          %675 = reshape(%674, newshape=[-1, 768]);
-          %676 = nn.dense(%675, meta[relay.Constant][134], units=768);
-          %677 = add(%676, meta[relay.Constant][135]);
-          %678 = reshape(%677, newshape=[50, 32, 768]);
-          %679 = add(%635, %678);
-          %680 = mean(%679, axis=[-1], keepdims=True);
-          %681 = subtract(%679, %680);
-          %682 = power(%681, 2f);
-          %683 = mean(%682, axis=[-1], keepdims=True);
-          %684 = add(%683, 1e-05f);
-          %685 = sqrt(%684);
-          %686 = divide(%681, %685);
-          %687 = multiply(%686, meta[relay.Constant][136]);
-          %688 = add(%687, meta[relay.Constant][137]);
-          %689 = reshape(%688, newshape=[-1, 768]);
-          %690 = nn.dense(%689, meta[relay.Constant][138], units=3072);
-          %691 = add(%690, meta[relay.Constant][139]);
-          %692 = reshape(%691, newshape=[50, 32, 3072]);
-          %693 = power(%692, 3f);
-          %694 = multiply(%693, 0.044715f);
-          %695 = add(%692, %694);
-          %696 = multiply(%695, 0.797885f);
-          %697 = tanh(%696);
-          %698 = multiply(%692, 0.5f);
-          %699 = add(%697, 1f);
-          %700 = multiply(%698, %699);
-          %701 = reshape(%700, newshape=[-1, 3072]);
-          %702 = nn.dense(%701, meta[relay.Constant][140], units=768);
-          %703 = add(%702, meta[relay.Constant][141]);
-          %704 = reshape(%703, newshape=[50, 32, 768]);
-          %705 = add(%679, %704);
-          %706 = mean(%705, axis=[-1], keepdims=True);
-          %707 = subtract(%705, %706);
-          %708 = power(%707, 2f);
-          %709 = mean(%708, axis=[-1], keepdims=True);
-          %710 = add(%709, 1e-05f);
-          %711 = sqrt(%710);
-          %712 = divide(%707, %711);
-          %713 = multiply(%712, meta[relay.Constant][142]);
-          %714 = add(%713, meta[relay.Constant][143]);
-          %715 = reshape(%714, newshape=[-1, 768]);
-          %716 = nn.dense(%715, meta[relay.Constant][144], units=2304);
-          %717 = add(%716, meta[relay.Constant][145]);
-          %718 = reshape(%717, newshape=[50, 32, 2304]);
-          %719 = split(%718, indices_or_sections=[768, 1536], axis=2);
-          %720 = %719.0;
-          %721 = reshape(%720, newshape=[50, 32, 12, 64]);
-          %722 = transpose(%721, axes=[0, 2, 1, 3]);
-          %723 = %719.1;
-          %724 = reshape(%723, newshape=[50, 32, 12, 64]);
-          %725 = transpose(%724, axes=[0, 2, 3, 1]);
-          %726 = reshape(%725, newshape=[-1, 64, 32]);
-          %727 = reshape(%722, newshape=[-1, 32, 64]);
-          %728 = transpose(%726, axes=[0, 2, 1]);
-          %729 = nn.batch_matmul(%727, %728, out_dtype="float32", transpose_b=True);
-          %730 = reshape(%729, newshape=[50, 12, 32, 32]);
-          %731 = divide(%730, 8f);
-          %732 = multiply(%731, meta[relay.Constant][146]);
-          %733 = subtract(%732, meta[relay.Constant][147]);
-          %734 = nn.softmax(%733, axis=3);
-          %735 = %719.2;
-          %736 = reshape(%735, newshape=[50, 32, 12, 64]);
-          %737 = transpose(%736, axes=[0, 2, 1, 3]);
-          %738 = reshape(%737, newshape=[-1, 32, 64]);
-          %739 = reshape(%734, newshape=[-1, 32, 32]);
-          %740 = transpose(%738, axes=[0, 2, 1]);
-          %741 = nn.batch_matmul(%739, %740, out_dtype="float32", transpose_b=True);
-          %742 = reshape(%741, newshape=[50, 12, 32, 64]);
-          %743 = transpose(%742, axes=[0, 2, 1, 3]);
-          %744 = reshape(%743, newshape=[50, 32, 768]);
-          %745 = reshape(%744, newshape=[-1, 768]);
-          %746 = nn.dense(%745, meta[relay.Constant][148], units=768);
-          %747 = add(%746, meta[relay.Constant][149]);
-          %748 = reshape(%747, newshape=[50, 32, 768]);
-          %749 = add(%705, %748);
-          %750 = mean(%749, axis=[-1], keepdims=True);
-          %751 = subtract(%749, %750);
-          %752 = power(%751, 2f);
-          %753 = mean(%752, axis=[-1], keepdims=True);
-          %754 = add(%753, 1e-05f);
-          %755 = sqrt(%754);
-          %756 = divide(%751, %755);
-          %757 = multiply(%756, meta[relay.Constant][150]);
-          %758 = add(%757, meta[relay.Constant][151]);
-          %759 = reshape(%758, newshape=[-1, 768]);
-          %760 = nn.dense(%759, meta[relay.Constant][152], units=3072);
-          %761 = add(%760, meta[relay.Constant][153]);
-          %762 = reshape(%761, newshape=[50, 32, 3072]);
-          %763 = power(%762, 3f);
-          %764 = multiply(%763, 0.044715f);
-          %765 = add(%762, %764);
-          %766 = multiply(%765, 0.797885f);
-          %767 = tanh(%766);
-          %768 = multiply(%762, 0.5f);
-          %769 = add(%767, 1f);
-          %770 = multiply(%768, %769);
-          %771 = reshape(%770, newshape=[-1, 3072]);
-          %772 = nn.dense(%771, meta[relay.Constant][154], units=768);
-          %773 = add(%772, meta[relay.Constant][155]);
-          %774 = reshape(%773, newshape=[50, 32, 768]);
-          %775 = add(%749, %774);
-          %776 = mean(%775, axis=[-1], keepdims=True);
-          %777 = subtract(%775, %776);
-          %778 = power(%777, 2f);
-          %779 = mean(%778, axis=[-1], keepdims=True);
-          %780 = add(%779, 1e-05f);
-          %781 = sqrt(%780);
-          %782 = divide(%777, %781);
-          %783 = multiply(%782, meta[relay.Constant][156]);
-          %784 = add(%783, meta[relay.Constant][157]);
-          %785 = reshape(%784, newshape=[-1, 768]);
-          %786 = nn.dense(%785, meta[relay.Constant][158], units=2304);
-          %787 = add(%786, meta[relay.Constant][159]);
-          %788 = reshape(%787, newshape=[50, 32, 2304]);
-          %789 = split(%788, indices_or_sections=[768, 1536], axis=2);
-          %790 = %789.0;
-          %791 = reshape(%790, newshape=[50, 32, 12, 64]);
-          %792 = transpose(%791, axes=[0, 2, 1, 3]);
-          %793 = %789.1;
-          %794 = reshape(%793, newshape=[50, 32, 12, 64]);
-          %795 = transpose(%794, axes=[0, 2, 3, 1]);
-          %796 = reshape(%795, newshape=[-1, 64, 32]);
-          %797 = reshape(%792, newshape=[-1, 32, 64]);
-          %798 = transpose(%796, axes=[0, 2, 1]);
-          %799 = nn.batch_matmul(%797, %798, out_dtype="float32", transpose_b=True);
-          %800 = reshape(%799, newshape=[50, 12, 32, 32]);
-          %801 = divide(%800, 8f);
-          %802 = multiply(%801, meta[relay.Constant][160]);
-          %803 = subtract(%802, meta[relay.Constant][161]);
-          %804 = nn.softmax(%803, axis=3);
-          %805 = %789.2;
-          %806 = reshape(%805, newshape=[50, 32, 12, 64]);
-          %807 = transpose(%806, axes=[0, 2, 1, 3]);
-          %808 = reshape(%807, newshape=[-1, 32, 64]);
-          %809 = reshape(%804, newshape=[-1, 32, 32]);
-          %810 = transpose(%808, axes=[0, 2, 1]);
-          %811 = nn.batch_matmul(%809, %810, out_dtype="float32", transpose_b=True);
-          %812 = reshape(%811, newshape=[50, 12, 32, 64]);
-          %813 = transpose(%812, axes=[0, 2, 1, 3]);
-          %814 = reshape(%813, newshape=[50, 32, 768]);
-          %815 = reshape(%814, newshape=[-1, 768]);
-          %816 = nn.dense(%815, meta[relay.Constant][162], units=768);
-          %817 = add(%816, meta[relay.Constant][163]);
-          %818 = reshape(%817, newshape=[50, 32, 768]);
-          %819 = add(%775, %818);
-          %820 = mean(%819, axis=[-1], keepdims=True);
-          %821 = subtract(%819, %820);
-          %822 = power(%821, 2f);
-          %823 = mean(%822, axis=[-1], keepdims=True);
-          %824 = add(%823, 1e-05f);
-          %825 = sqrt(%824);
-          %826 = divide(%821, %825);
-          %827 = multiply(%826, meta[relay.Constant][164]);
-          %828 = add(%827, meta[relay.Constant][165]);
-          %829 = reshape(%828, newshape=[-1, 768]);
-          %830 = nn.dense(%829, meta[relay.Constant][166], units=3072);
-          %831 = add(%830, meta[relay.Constant][167]);
-          %832 = reshape(%831, newshape=[50, 32, 3072]);
-          %833 = power(%832, 3f);
-          %834 = multiply(%833, 0.044715f);
-          %835 = add(%832, %834);
-          %836 = multiply(%835, 0.797885f);
-          %837 = tanh(%836);
-          %838 = multiply(%832, 0.5f);
-          %839 = add(%837, 1f);
-          %840 = multiply(%838, %839);
-          %841 = reshape(%840, newshape=[-1, 3072]);
-          %842 = nn.dense(%841, meta[relay.Constant][168], units=768);
-          %843 = add(%842, meta[relay.Constant][169]);
-          %844 = reshape(%843, newshape=[50, 32, 768]);
-          %845 = add(%819, %844);
-          %846 = mean(%845, axis=[-1], keepdims=True);
-          %847 = subtract(%845, %846);
-          %848 = power(%847, 2f);
-          %849 = mean(%848, axis=[-1], keepdims=True);
-          %850 = add(%849, 1e-05f);
-          %851 = sqrt(%850);
-          %852 = divide(%847, %851);
-          %853 = multiply(%852, meta[relay.Constant][170]);
-          %854 = add(%853, meta[relay.Constant][171]);
-          %855 = transpose(%24, axes=[0, 2, 1, 3]);
-          %856 = expand_dims(%855, axis=0);
-          %857 = expand_dims(%37, axis=0);
-          %858 = (%856, %857);
-          %859 = transpose(%94, axes=[0, 2, 1, 3]);
-          %860 = expand_dims(%859, axis=0);
-          %861 = expand_dims(%107, axis=0);
-          %862 = (%860, %861);
-          %863 = transpose(%164, axes=[0, 2, 1, 3]);
-          %864 = expand_dims(%863, axis=0);
-          %865 = expand_dims(%177, axis=0);
-          %866 = (%864, %865);
-          %867 = transpose(%234, axes=[0, 2, 1, 3]);
-          %868 = expand_dims(%867, axis=0);
-          %869 = expand_dims(%247, axis=0);
-          %870 = (%868, %869);
-          %871 = transpose(%304, axes=[0, 2, 1, 3]);
-          %872 = expand_dims(%871, axis=0);
-          %873 = expand_dims(%317, axis=0);
-          %874 = (%872, %873);
-          %875 = transpose(%374, axes=[0, 2, 1, 3]);
-          %876 = expand_dims(%875, axis=0);
-          %877 = expand_dims(%387, axis=0);
-          %878 = (%876, %877);
-          %879 = transpose(%444, axes=[0, 2, 1, 3]);
-          %880 = expand_dims(%879, axis=0);
-          %881 = expand_dims(%457, axis=0);
-          %882 = (%880, %881);
-          %883 = transpose(%514, axes=[0, 2, 1, 3]);
-          %884 = expand_dims(%883, axis=0);
-          %885 = expand_dims(%527, axis=0);
-          %886 = (%884, %885);
-          %887 = transpose(%584, axes=[0, 2, 1, 3]);
-          %888 = expand_dims(%887, axis=0);
-          %889 = expand_dims(%597, axis=0);
-          %890 = (%888, %889);
-          %891 = transpose(%654, axes=[0, 2, 1, 3]);
-          %892 = expand_dims(%891, axis=0);
-          %893 = expand_dims(%667, axis=0);
-          %894 = (%892, %893);
-          %895 = transpose(%724, axes=[0, 2, 1, 3]);
-          %896 = expand_dims(%895, axis=0);
-          %897 = expand_dims(%737, axis=0);
-          %898 = (%896, %897);
-          %899 = transpose(%794, axes=[0, 2, 1, 3]);
-          %900 = expand_dims(%899, axis=0);
-          %901 = expand_dims(%807, axis=0);
-          %902 = (%900, %901);
-          %903 = reshape(%854, newshape=[1, 50, 32, 768]);
-          %904 = concatenate(%858);
-          %905 = concatenate(%862);
-          %906 = concatenate(%866);
-          %907 = concatenate(%870);
-          %908 = concatenate(%874);
-          %909 = concatenate(%878);
-          %910 = concatenate(%882);
-          %911 = concatenate(%886);
-          %912 = concatenate(%890);
-          %913 = concatenate(%894);
-          %914 = concatenate(%898);
-          %915 = concatenate(%902);
-          (%903, %904, %905, %906, %907, %908, %909, %910, %911, %912, %913, %914, %915)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "gpt2",
-        "input_shapes": {"x": [1, 50, 32]},
-        "input_dtypes": {"x": "int64"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def gpt2_16():
-    metatable = {"relay.Constant": gpt2_consts("float16")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1, 50, 32), int64]) -> (Tensor[(1, 50, 32, 768), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16],
-                                                      Tensor[(2, 50, 12, 32, 64), float16]) {
-          %0 = reshape(%x, newshape=[-1, 32]);
-          %1 = less(%0, 0i64);
-          %2 = add(%0, 50257i64);
-          %3 = where(%1, %2, %0);
-          %4 = take(meta[relay.Constant][0], %3, axis=0);
-          %5 = add(%4, meta[relay.Constant][1]);
-          %6 = mean(%5, axis=[-1], keepdims=True);
-          %7 = subtract(%5, %6);
-          %8 = power(%7, 2f16);
-          %9 = mean(%8, axis=[-1], keepdims=True);
-          %10 = add(%9, 1e-05f16);
-          %11 = sqrt(%10);
-          %12 = divide(%7, %11);
-          %13 = multiply(%12, meta[relay.Constant][2]);
-          %14 = add(%13, meta[relay.Constant][3]);
-          %15 = reshape(%14, newshape=[-1, 768]);
-          %16 = nn.dense(%15, meta[relay.Constant][4], units=2304);
-          %17 = add(%16, meta[relay.Constant][5]);
-          %18 = reshape(%17, newshape=[50, 32, 2304]);
-          %19 = split(%18, indices_or_sections=[768, 1536], axis=2);
-          %20 = %19.0;
-          %21 = reshape(%20, newshape=[50, 32, 12, 64]);
-          %22 = transpose(%21, axes=[0, 2, 1, 3]);
-          %23 = %19.1;
-          %24 = reshape(%23, newshape=[50, 32, 12, 64]);
-          %25 = transpose(%24, axes=[0, 2, 3, 1]);
-          %26 = reshape(%25, newshape=[-1, 64, 32]);
-          %27 = reshape(%22, newshape=[-1, 32, 64]);
-          %28 = transpose(%26, axes=[0, 2, 1]);
-          %29 = nn.batch_matmul(%27, %28, out_dtype="float16", transpose_b=True);
-          %30 = reshape(%29, newshape=[50, 12, 32, 32]);
-          %31 = divide(%30, 8f16);
-          %32 = multiply(%31, meta[relay.Constant][6]);
-          %33 = subtract(%32, meta[relay.Constant][7]);
-          %34 = nn.softmax(%33, axis=3);
-          %35 = %19.2;
-          %36 = reshape(%35, newshape=[50, 32, 12, 64]);
-          %37 = transpose(%36, axes=[0, 2, 1, 3]);
-          %38 = reshape(%37, newshape=[-1, 32, 64]);
-          %39 = reshape(%34, newshape=[-1, 32, 32]);
-          %40 = transpose(%38, axes=[0, 2, 1]);
-          %41 = nn.batch_matmul(%39, %40, out_dtype="float16", transpose_b=True);
-          %42 = reshape(%41, newshape=[50, 12, 32, 64]);
-          %43 = transpose(%42, axes=[0, 2, 1, 3]);
-          %44 = reshape(%43, newshape=[50, 32, 768]);
-          %45 = reshape(%44, newshape=[-1, 768]);
-          %46 = nn.dense(%45, meta[relay.Constant][8], units=768);
-          %47 = add(%46, meta[relay.Constant][9]);
-          %48 = reshape(%47, newshape=[50, 32, 768]);
-          %49 = add(%5, %48);
-          %50 = mean(%49, axis=[-1], keepdims=True);
-          %51 = subtract(%49, %50);
-          %52 = power(%51, 2f16);
-          %53 = mean(%52, axis=[-1], keepdims=True);
-          %54 = add(%53, 1e-05f16);
-          %55 = sqrt(%54);
-          %56 = divide(%51, %55);
-          %57 = multiply(%56, meta[relay.Constant][10]);
-          %58 = add(%57, meta[relay.Constant][11]);
-          %59 = reshape(%58, newshape=[-1, 768]);
-          %60 = nn.dense(%59, meta[relay.Constant][12], units=3072);
-          %61 = add(%60, meta[relay.Constant][13]);
-          %62 = reshape(%61, newshape=[50, 32, 3072]);
-          %63 = power(%62, 3f16);
-          %64 = multiply(%63, 0.044715f16);
-          %65 = add(%62, %64);
-          %66 = multiply(%65, 0.797885f16);
-          %67 = tanh(%66);
-          %68 = multiply(%62, 0.5f16);
-          %69 = add(%67, 1f16);
-          %70 = multiply(%68, %69);
-          %71 = reshape(%70, newshape=[-1, 3072]);
-          %72 = nn.dense(%71, meta[relay.Constant][14], units=768);
-          %73 = add(%72, meta[relay.Constant][15]);
-          %74 = reshape(%73, newshape=[50, 32, 768]);
-          %75 = add(%49, %74);
-          %76 = mean(%75, axis=[-1], keepdims=True);
-          %77 = subtract(%75, %76);
-          %78 = power(%77, 2f16);
-          %79 = mean(%78, axis=[-1], keepdims=True);
-          %80 = add(%79, 1e-05f16);
-          %81 = sqrt(%80);
-          %82 = divide(%77, %81);
-          %83 = multiply(%82, meta[relay.Constant][16]);
-          %84 = add(%83, meta[relay.Constant][17]);
-          %85 = reshape(%84, newshape=[-1, 768]);
-          %86 = nn.dense(%85, meta[relay.Constant][18], units=2304);
-          %87 = add(%86, meta[relay.Constant][19]);
-          %88 = reshape(%87, newshape=[50, 32, 2304]);
-          %89 = split(%88, indices_or_sections=[768, 1536], axis=2);
-          %90 = %89.0;
-          %91 = reshape(%90, newshape=[50, 32, 12, 64]);
-          %92 = transpose(%91, axes=[0, 2, 1, 3]);
-          %93 = %89.1;
-          %94 = reshape(%93, newshape=[50, 32, 12, 64]);
-          %95 = transpose(%94, axes=[0, 2, 3, 1]);
-          %96 = reshape(%95, newshape=[-1, 64, 32]);
-          %97 = reshape(%92, newshape=[-1, 32, 64]);
-          %98 = transpose(%96, axes=[0, 2, 1]);
-          %99 = nn.batch_matmul(%97, %98, out_dtype="float16", transpose_b=True);
-          %100 = reshape(%99, newshape=[50, 12, 32, 32]);
-          %101 = divide(%100, 8f16);
-          %102 = multiply(%101, meta[relay.Constant][20]);
-          %103 = subtract(%102, meta[relay.Constant][21]);
-          %104 = nn.softmax(%103, axis=3);
-          %105 = %89.2;
-          %106 = reshape(%105, newshape=[50, 32, 12, 64]);
-          %107 = transpose(%106, axes=[0, 2, 1, 3]);
-          %108 = reshape(%107, newshape=[-1, 32, 64]);
-          %109 = reshape(%104, newshape=[-1, 32, 32]);
-          %110 = transpose(%108, axes=[0, 2, 1]);
-          %111 = nn.batch_matmul(%109, %110, out_dtype="float16", transpose_b=True);
-          %112 = reshape(%111, newshape=[50, 12, 32, 64]);
-          %113 = transpose(%112, axes=[0, 2, 1, 3]);
-          %114 = reshape(%113, newshape=[50, 32, 768]);
-          %115 = reshape(%114, newshape=[-1, 768]);
-          %116 = nn.dense(%115, meta[relay.Constant][22], units=768);
-          %117 = add(%116, meta[relay.Constant][23]);
-          %118 = reshape(%117, newshape=[50, 32, 768]);
-          %119 = add(%75, %118);
-          %120 = mean(%119, axis=[-1], keepdims=True);
-          %121 = subtract(%119, %120);
-          %122 = power(%121, 2f16);
-          %123 = mean(%122, axis=[-1], keepdims=True);
-          %124 = add(%123, 1e-05f16);
-          %125 = sqrt(%124);
-          %126 = divide(%121, %125);
-          %127 = multiply(%126, meta[relay.Constant][24]);
-          %128 = add(%127, meta[relay.Constant][25]);
-          %129 = reshape(%128, newshape=[-1, 768]);
-          %130 = nn.dense(%129, meta[relay.Constant][26], units=3072);
-          %131 = add(%130, meta[relay.Constant][27]);
-          %132 = reshape(%131, newshape=[50, 32, 3072]);
-          %133 = power(%132, 3f16);
-          %134 = multiply(%133, 0.044715f16);
-          %135 = add(%132, %134);
-          %136 = multiply(%135, 0.797885f16);
-          %137 = tanh(%136);
-          %138 = multiply(%132, 0.5f16);
-          %139 = add(%137, 1f16);
-          %140 = multiply(%138, %139);
-          %141 = reshape(%140, newshape=[-1, 3072]);
-          %142 = nn.dense(%141, meta[relay.Constant][28], units=768);
-          %143 = add(%142, meta[relay.Constant][29]);
-          %144 = reshape(%143, newshape=[50, 32, 768]);
-          %145 = add(%119, %144);
-          %146 = mean(%145, axis=[-1], keepdims=True);
-          %147 = subtract(%145, %146);
-          %148 = power(%147, 2f16);
-          %149 = mean(%148, axis=[-1], keepdims=True);
-          %150 = add(%149, 1e-05f16);
-          %151 = sqrt(%150);
-          %152 = divide(%147, %151);
-          %153 = multiply(%152, meta[relay.Constant][30]);
-          %154 = add(%153, meta[relay.Constant][31]);
-          %155 = reshape(%154, newshape=[-1, 768]);
-          %156 = nn.dense(%155, meta[relay.Constant][32], units=2304);
-          %157 = add(%156, meta[relay.Constant][33]);
-          %158 = reshape(%157, newshape=[50, 32, 2304]);
-          %159 = split(%158, indices_or_sections=[768, 1536], axis=2);
-          %160 = %159.0;
-          %161 = reshape(%160, newshape=[50, 32, 12, 64]);
-          %162 = transpose(%161, axes=[0, 2, 1, 3]);
-          %163 = %159.1;
-          %164 = reshape(%163, newshape=[50, 32, 12, 64]);
-          %165 = transpose(%164, axes=[0, 2, 3, 1]);
-          %166 = reshape(%165, newshape=[-1, 64, 32]);
-          %167 = reshape(%162, newshape=[-1, 32, 64]);
-          %168 = transpose(%166, axes=[0, 2, 1]);
-          %169 = nn.batch_matmul(%167, %168, out_dtype="float16", transpose_b=True);
-          %170 = reshape(%169, newshape=[50, 12, 32, 32]);
-          %171 = divide(%170, 8f16);
-          %172 = multiply(%171, meta[relay.Constant][34]);
-          %173 = subtract(%172, meta[relay.Constant][35]);
-          %174 = nn.softmax(%173, axis=3);
-          %175 = %159.2;
-          %176 = reshape(%175, newshape=[50, 32, 12, 64]);
-          %177 = transpose(%176, axes=[0, 2, 1, 3]);
-          %178 = reshape(%177, newshape=[-1, 32, 64]);
-          %179 = reshape(%174, newshape=[-1, 32, 32]);
-          %180 = transpose(%178, axes=[0, 2, 1]);
-          %181 = nn.batch_matmul(%179, %180, out_dtype="float16", transpose_b=True);
-          %182 = reshape(%181, newshape=[50, 12, 32, 64]);
-          %183 = transpose(%182, axes=[0, 2, 1, 3]);
-          %184 = reshape(%183, newshape=[50, 32, 768]);
-          %185 = reshape(%184, newshape=[-1, 768]);
-          %186 = nn.dense(%185, meta[relay.Constant][36], units=768);
-          %187 = add(%186, meta[relay.Constant][37]);
-          %188 = reshape(%187, newshape=[50, 32, 768]);
-          %189 = add(%145, %188);
-          %190 = mean(%189, axis=[-1], keepdims=True);
-          %191 = subtract(%189, %190);
-          %192 = power(%191, 2f16);
-          %193 = mean(%192, axis=[-1], keepdims=True);
-          %194 = add(%193, 1e-05f16);
-          %195 = sqrt(%194);
-          %196 = divide(%191, %195);
-          %197 = multiply(%196, meta[relay.Constant][38]);
-          %198 = add(%197, meta[relay.Constant][39]);
-          %199 = reshape(%198, newshape=[-1, 768]);
-          %200 = nn.dense(%199, meta[relay.Constant][40], units=3072);
-          %201 = add(%200, meta[relay.Constant][41]);
-          %202 = reshape(%201, newshape=[50, 32, 3072]);
-          %203 = power(%202, 3f16);
-          %204 = multiply(%203, 0.044715f16);
-          %205 = add(%202, %204);
-          %206 = multiply(%205, 0.797885f16);
-          %207 = tanh(%206);
-          %208 = multiply(%202, 0.5f16);
-          %209 = add(%207, 1f16);
-          %210 = multiply(%208, %209);
-          %211 = reshape(%210, newshape=[-1, 3072]);
-          %212 = nn.dense(%211, meta[relay.Constant][42], units=768);
-          %213 = add(%212, meta[relay.Constant][43]);
-          %214 = reshape(%213, newshape=[50, 32, 768]);
-          %215 = add(%189, %214);
-          %216 = mean(%215, axis=[-1], keepdims=True);
-          %217 = subtract(%215, %216);
-          %218 = power(%217, 2f16);
-          %219 = mean(%218, axis=[-1], keepdims=True);
-          %220 = add(%219, 1e-05f16);
-          %221 = sqrt(%220);
-          %222 = divide(%217, %221);
-          %223 = multiply(%222, meta[relay.Constant][44]);
-          %224 = add(%223, meta[relay.Constant][45]);
-          %225 = reshape(%224, newshape=[-1, 768]);
-          %226 = nn.dense(%225, meta[relay.Constant][46], units=2304);
-          %227 = add(%226, meta[relay.Constant][47]);
-          %228 = reshape(%227, newshape=[50, 32, 2304]);
-          %229 = split(%228, indices_or_sections=[768, 1536], axis=2);
-          %230 = %229.0;
-          %231 = reshape(%230, newshape=[50, 32, 12, 64]);
-          %232 = transpose(%231, axes=[0, 2, 1, 3]);
-          %233 = %229.1;
-          %234 = reshape(%233, newshape=[50, 32, 12, 64]);
-          %235 = transpose(%234, axes=[0, 2, 3, 1]);
-          %236 = reshape(%235, newshape=[-1, 64, 32]);
-          %237 = reshape(%232, newshape=[-1, 32, 64]);
-          %238 = transpose(%236, axes=[0, 2, 1]);
-          %239 = nn.batch_matmul(%237, %238, out_dtype="float16", transpose_b=True);
-          %240 = reshape(%239, newshape=[50, 12, 32, 32]);
-          %241 = divide(%240, 8f16);
-          %242 = multiply(%241, meta[relay.Constant][48]);
-          %243 = subtract(%242, meta[relay.Constant][49]);
-          %244 = nn.softmax(%243, axis=3);
-          %245 = %229.2;
-          %246 = reshape(%245, newshape=[50, 32, 12, 64]);
-          %247 = transpose(%246, axes=[0, 2, 1, 3]);
-          %248 = reshape(%247, newshape=[-1, 32, 64]);
-          %249 = reshape(%244, newshape=[-1, 32, 32]);
-          %250 = transpose(%248, axes=[0, 2, 1]);
-          %251 = nn.batch_matmul(%249, %250, out_dtype="float16", transpose_b=True);
-          %252 = reshape(%251, newshape=[50, 12, 32, 64]);
-          %253 = transpose(%252, axes=[0, 2, 1, 3]);
-          %254 = reshape(%253, newshape=[50, 32, 768]);
-          %255 = reshape(%254, newshape=[-1, 768]);
-          %256 = nn.dense(%255, meta[relay.Constant][50], units=768);
-          %257 = add(%256, meta[relay.Constant][51]);
-          %258 = reshape(%257, newshape=[50, 32, 768]);
-          %259 = add(%215, %258);
-          %260 = mean(%259, axis=[-1], keepdims=True);
-          %261 = subtract(%259, %260);
-          %262 = power(%261, 2f16);
-          %263 = mean(%262, axis=[-1], keepdims=True);
-          %264 = add(%263, 1e-05f16);
-          %265 = sqrt(%264);
-          %266 = divide(%261, %265);
-          %267 = multiply(%266, meta[relay.Constant][52]);
-          %268 = add(%267, meta[relay.Constant][53]);
-          %269 = reshape(%268, newshape=[-1, 768]);
-          %270 = nn.dense(%269, meta[relay.Constant][54], units=3072);
-          %271 = add(%270, meta[relay.Constant][55]);
-          %272 = reshape(%271, newshape=[50, 32, 3072]);
-          %273 = power(%272, 3f16);
-          %274 = multiply(%273, 0.044715f16);
-          %275 = add(%272, %274);
-          %276 = multiply(%275, 0.797885f16);
-          %277 = tanh(%276);
-          %278 = multiply(%272, 0.5f16);
-          %279 = add(%277, 1f16);
-          %280 = multiply(%278, %279);
-          %281 = reshape(%280, newshape=[-1, 3072]);
-          %282 = nn.dense(%281, meta[relay.Constant][56], units=768);
-          %283 = add(%282, meta[relay.Constant][57]);
-          %284 = reshape(%283, newshape=[50, 32, 768]);
-          %285 = add(%259, %284);
-          %286 = mean(%285, axis=[-1], keepdims=True);
-          %287 = subtract(%285, %286);
-          %288 = power(%287, 2f16);
-          %289 = mean(%288, axis=[-1], keepdims=True);
-          %290 = add(%289, 1e-05f16);
-          %291 = sqrt(%290);
-          %292 = divide(%287, %291);
-          %293 = multiply(%292, meta[relay.Constant][58]);
-          %294 = add(%293, meta[relay.Constant][59]);
-          %295 = reshape(%294, newshape=[-1, 768]);
-          %296 = nn.dense(%295, meta[relay.Constant][60], units=2304);
-          %297 = add(%296, meta[relay.Constant][61]);
-          %298 = reshape(%297, newshape=[50, 32, 2304]);
-          %299 = split(%298, indices_or_sections=[768, 1536], axis=2);
-          %300 = %299.0;
-          %301 = reshape(%300, newshape=[50, 32, 12, 64]);
-          %302 = transpose(%301, axes=[0, 2, 1, 3]);
-          %303 = %299.1;
-          %304 = reshape(%303, newshape=[50, 32, 12, 64]);
-          %305 = transpose(%304, axes=[0, 2, 3, 1]);
-          %306 = reshape(%305, newshape=[-1, 64, 32]);
-          %307 = reshape(%302, newshape=[-1, 32, 64]);
-          %308 = transpose(%306, axes=[0, 2, 1]);
-          %309 = nn.batch_matmul(%307, %308, out_dtype="float16", transpose_b=True);
-          %310 = reshape(%309, newshape=[50, 12, 32, 32]);
-          %311 = divide(%310, 8f16);
-          %312 = multiply(%311, meta[relay.Constant][62]);
-          %313 = subtract(%312, meta[relay.Constant][63]);
-          %314 = nn.softmax(%313, axis=3);
-          %315 = %299.2;
-          %316 = reshape(%315, newshape=[50, 32, 12, 64]);
-          %317 = transpose(%316, axes=[0, 2, 1, 3]);
-          %318 = reshape(%317, newshape=[-1, 32, 64]);
-          %319 = reshape(%314, newshape=[-1, 32, 32]);
-          %320 = transpose(%318, axes=[0, 2, 1]);
-          %321 = nn.batch_matmul(%319, %320, out_dtype="float16", transpose_b=True);
-          %322 = reshape(%321, newshape=[50, 12, 32, 64]);
-          %323 = transpose(%322, axes=[0, 2, 1, 3]);
-          %324 = reshape(%323, newshape=[50, 32, 768]);
-          %325 = reshape(%324, newshape=[-1, 768]);
-          %326 = nn.dense(%325, meta[relay.Constant][64], units=768);
-          %327 = add(%326, meta[relay.Constant][65]);
-          %328 = reshape(%327, newshape=[50, 32, 768]);
-          %329 = add(%285, %328);
-          %330 = mean(%329, axis=[-1], keepdims=True);
-          %331 = subtract(%329, %330);
-          %332 = power(%331, 2f16);
-          %333 = mean(%332, axis=[-1], keepdims=True);
-          %334 = add(%333, 1e-05f16);
-          %335 = sqrt(%334);
-          %336 = divide(%331, %335);
-          %337 = multiply(%336, meta[relay.Constant][66]);
-          %338 = add(%337, meta[relay.Constant][67]);
-          %339 = reshape(%338, newshape=[-1, 768]);
-          %340 = nn.dense(%339, meta[relay.Constant][68], units=3072);
-          %341 = add(%340, meta[relay.Constant][69]);
-          %342 = reshape(%341, newshape=[50, 32, 3072]);
-          %343 = power(%342, 3f16);
-          %344 = multiply(%343, 0.044715f16);
-          %345 = add(%342, %344);
-          %346 = multiply(%345, 0.797885f16);
-          %347 = tanh(%346);
-          %348 = multiply(%342, 0.5f16);
-          %349 = add(%347, 1f16);
-          %350 = multiply(%348, %349);
-          %351 = reshape(%350, newshape=[-1, 3072]);
-          %352 = nn.dense(%351, meta[relay.Constant][70], units=768);
-          %353 = add(%352, meta[relay.Constant][71]);
-          %354 = reshape(%353, newshape=[50, 32, 768]);
-          %355 = add(%329, %354);
-          %356 = mean(%355, axis=[-1], keepdims=True);
-          %357 = subtract(%355, %356);
-          %358 = power(%357, 2f16);
-          %359 = mean(%358, axis=[-1], keepdims=True);
-          %360 = add(%359, 1e-05f16);
-          %361 = sqrt(%360);
-          %362 = divide(%357, %361);
-          %363 = multiply(%362, meta[relay.Constant][72]);
-          %364 = add(%363, meta[relay.Constant][73]);
-          %365 = reshape(%364, newshape=[-1, 768]);
-          %366 = nn.dense(%365, meta[relay.Constant][74], units=2304);
-          %367 = add(%366, meta[relay.Constant][75]);
-          %368 = reshape(%367, newshape=[50, 32, 2304]);
-          %369 = split(%368, indices_or_sections=[768, 1536], axis=2);
-          %370 = %369.0;
-          %371 = reshape(%370, newshape=[50, 32, 12, 64]);
-          %372 = transpose(%371, axes=[0, 2, 1, 3]);
-          %373 = %369.1;
-          %374 = reshape(%373, newshape=[50, 32, 12, 64]);
-          %375 = transpose(%374, axes=[0, 2, 3, 1]);
-          %376 = reshape(%375, newshape=[-1, 64, 32]);
-          %377 = reshape(%372, newshape=[-1, 32, 64]);
-          %378 = transpose(%376, axes=[0, 2, 1]);
-          %379 = nn.batch_matmul(%377, %378, out_dtype="float16", transpose_b=True);
-          %380 = reshape(%379, newshape=[50, 12, 32, 32]);
-          %381 = divide(%380, 8f16);
-          %382 = multiply(%381, meta[relay.Constant][76]);
-          %383 = subtract(%382, meta[relay.Constant][77]);
-          %384 = nn.softmax(%383, axis=3);
-          %385 = %369.2;
-          %386 = reshape(%385, newshape=[50, 32, 12, 64]);
-          %387 = transpose(%386, axes=[0, 2, 1, 3]);
-          %388 = reshape(%387, newshape=[-1, 32, 64]);
-          %389 = reshape(%384, newshape=[-1, 32, 32]);
-          %390 = transpose(%388, axes=[0, 2, 1]);
-          %391 = nn.batch_matmul(%389, %390, out_dtype="float16", transpose_b=True);
-          %392 = reshape(%391, newshape=[50, 12, 32, 64]);
-          %393 = transpose(%392, axes=[0, 2, 1, 3]);
-          %394 = reshape(%393, newshape=[50, 32, 768]);
-          %395 = reshape(%394, newshape=[-1, 768]);
-          %396 = nn.dense(%395, meta[relay.Constant][78], units=768);
-          %397 = add(%396, meta[relay.Constant][79]);
-          %398 = reshape(%397, newshape=[50, 32, 768]);
-          %399 = add(%355, %398);
-          %400 = mean(%399, axis=[-1], keepdims=True);
-          %401 = subtract(%399, %400);
-          %402 = power(%401, 2f16);
-          %403 = mean(%402, axis=[-1], keepdims=True);
-          %404 = add(%403, 1e-05f16);
-          %405 = sqrt(%404);
-          %406 = divide(%401, %405);
-          %407 = multiply(%406, meta[relay.Constant][80]);
-          %408 = add(%407, meta[relay.Constant][81]);
-          %409 = reshape(%408, newshape=[-1, 768]);
-          %410 = nn.dense(%409, meta[relay.Constant][82], units=3072);
-          %411 = add(%410, meta[relay.Constant][83]);
-          %412 = reshape(%411, newshape=[50, 32, 3072]);
-          %413 = power(%412, 3f16);
-          %414 = multiply(%413, 0.044715f16);
-          %415 = add(%412, %414);
-          %416 = multiply(%415, 0.797885f16);
-          %417 = tanh(%416);
-          %418 = multiply(%412, 0.5f16);
-          %419 = add(%417, 1f16);
-          %420 = multiply(%418, %419);
-          %421 = reshape(%420, newshape=[-1, 3072]);
-          %422 = nn.dense(%421, meta[relay.Constant][84], units=768);
-          %423 = add(%422, meta[relay.Constant][85]);
-          %424 = reshape(%423, newshape=[50, 32, 768]);
-          %425 = add(%399, %424);
-          %426 = mean(%425, axis=[-1], keepdims=True);
-          %427 = subtract(%425, %426);
-          %428 = power(%427, 2f16);
-          %429 = mean(%428, axis=[-1], keepdims=True);
-          %430 = add(%429, 1e-05f16);
-          %431 = sqrt(%430);
-          %432 = divide(%427, %431);
-          %433 = multiply(%432, meta[relay.Constant][86]);
-          %434 = add(%433, meta[relay.Constant][87]);
-          %435 = reshape(%434, newshape=[-1, 768]);
-          %436 = nn.dense(%435, meta[relay.Constant][88], units=2304);
-          %437 = add(%436, meta[relay.Constant][89]);
-          %438 = reshape(%437, newshape=[50, 32, 2304]);
-          %439 = split(%438, indices_or_sections=[768, 1536], axis=2);
-          %440 = %439.0;
-          %441 = reshape(%440, newshape=[50, 32, 12, 64]);
-          %442 = transpose(%441, axes=[0, 2, 1, 3]);
-          %443 = %439.1;
-          %444 = reshape(%443, newshape=[50, 32, 12, 64]);
-          %445 = transpose(%444, axes=[0, 2, 3, 1]);
-          %446 = reshape(%445, newshape=[-1, 64, 32]);
-          %447 = reshape(%442, newshape=[-1, 32, 64]);
-          %448 = transpose(%446, axes=[0, 2, 1]);
-          %449 = nn.batch_matmul(%447, %448, out_dtype="float16", transpose_b=True);
-          %450 = reshape(%449, newshape=[50, 12, 32, 32]);
-          %451 = divide(%450, 8f16);
-          %452 = multiply(%451, meta[relay.Constant][90]);
-          %453 = subtract(%452, meta[relay.Constant][91]);
-          %454 = nn.softmax(%453, axis=3);
-          %455 = %439.2;
-          %456 = reshape(%455, newshape=[50, 32, 12, 64]);
-          %457 = transpose(%456, axes=[0, 2, 1, 3]);
-          %458 = reshape(%457, newshape=[-1, 32, 64]);
-          %459 = reshape(%454, newshape=[-1, 32, 32]);
-          %460 = transpose(%458, axes=[0, 2, 1]);
-          %461 = nn.batch_matmul(%459, %460, out_dtype="float16", transpose_b=True);
-          %462 = reshape(%461, newshape=[50, 12, 32, 64]);
-          %463 = transpose(%462, axes=[0, 2, 1, 3]);
-          %464 = reshape(%463, newshape=[50, 32, 768]);
-          %465 = reshape(%464, newshape=[-1, 768]);
-          %466 = nn.dense(%465, meta[relay.Constant][92], units=768);
-          %467 = add(%466, meta[relay.Constant][93]);
-          %468 = reshape(%467, newshape=[50, 32, 768]);
-          %469 = add(%425, %468);
-          %470 = mean(%469, axis=[-1], keepdims=True);
-          %471 = subtract(%469, %470);
-          %472 = power(%471, 2f16);
-          %473 = mean(%472, axis=[-1], keepdims=True);
-          %474 = add(%473, 1e-05f16);
-          %475 = sqrt(%474);
-          %476 = divide(%471, %475);
-          %477 = multiply(%476, meta[relay.Constant][94]);
-          %478 = add(%477, meta[relay.Constant][95]);
-          %479 = reshape(%478, newshape=[-1, 768]);
-          %480 = nn.dense(%479, meta[relay.Constant][96], units=3072);
-          %481 = add(%480, meta[relay.Constant][97]);
-          %482 = reshape(%481, newshape=[50, 32, 3072]);
-          %483 = power(%482, 3f16);
-          %484 = multiply(%483, 0.044715f16);
-          %485 = add(%482, %484);
-          %486 = multiply(%485, 0.797885f16);
-          %487 = tanh(%486);
-          %488 = multiply(%482, 0.5f16);
-          %489 = add(%487, 1f16);
-          %490 = multiply(%488, %489);
-          %491 = reshape(%490, newshape=[-1, 3072]);
-          %492 = nn.dense(%491, meta[relay.Constant][98], units=768);
-          %493 = add(%492, meta[relay.Constant][99]);
-          %494 = reshape(%493, newshape=[50, 32, 768]);
-          %495 = add(%469, %494);
-          %496 = mean(%495, axis=[-1], keepdims=True);
-          %497 = subtract(%495, %496);
-          %498 = power(%497, 2f16);
-          %499 = mean(%498, axis=[-1], keepdims=True);
-          %500 = add(%499, 1e-05f16);
-          %501 = sqrt(%500);
-          %502 = divide(%497, %501);
-          %503 = multiply(%502, meta[relay.Constant][100]);
-          %504 = add(%503, meta[relay.Constant][101]);
-          %505 = reshape(%504, newshape=[-1, 768]);
-          %506 = nn.dense(%505, meta[relay.Constant][102], units=2304);
-          %507 = add(%506, meta[relay.Constant][103]);
-          %508 = reshape(%507, newshape=[50, 32, 2304]);
-          %509 = split(%508, indices_or_sections=[768, 1536], axis=2);
-          %510 = %509.0;
-          %511 = reshape(%510, newshape=[50, 32, 12, 64]);
-          %512 = transpose(%511, axes=[0, 2, 1, 3]);
-          %513 = %509.1;
-          %514 = reshape(%513, newshape=[50, 32, 12, 64]);
-          %515 = transpose(%514, axes=[0, 2, 3, 1]);
-          %516 = reshape(%515, newshape=[-1, 64, 32]);
-          %517 = reshape(%512, newshape=[-1, 32, 64]);
-          %518 = transpose(%516, axes=[0, 2, 1]);
-          %519 = nn.batch_matmul(%517, %518, out_dtype="float16", transpose_b=True);
-          %520 = reshape(%519, newshape=[50, 12, 32, 32]);
-          %521 = divide(%520, 8f16);
-          %522 = multiply(%521, meta[relay.Constant][104]);
-          %523 = subtract(%522, meta[relay.Constant][105]);
-          %524 = nn.softmax(%523, axis=3);
-          %525 = %509.2;
-          %526 = reshape(%525, newshape=[50, 32, 12, 64]);
-          %527 = transpose(%526, axes=[0, 2, 1, 3]);
-          %528 = reshape(%527, newshape=[-1, 32, 64]);
-          %529 = reshape(%524, newshape=[-1, 32, 32]);
-          %530 = transpose(%528, axes=[0, 2, 1]);
-          %531 = nn.batch_matmul(%529, %530, out_dtype="float16", transpose_b=True);
-          %532 = reshape(%531, newshape=[50, 12, 32, 64]);
-          %533 = transpose(%532, axes=[0, 2, 1, 3]);
-          %534 = reshape(%533, newshape=[50, 32, 768]);
-          %535 = reshape(%534, newshape=[-1, 768]);
-          %536 = nn.dense(%535, meta[relay.Constant][106], units=768);
-          %537 = add(%536, meta[relay.Constant][107]);
-          %538 = reshape(%537, newshape=[50, 32, 768]);
-          %539 = add(%495, %538);
-          %540 = mean(%539, axis=[-1], keepdims=True);
-          %541 = subtract(%539, %540);
-          %542 = power(%541, 2f16);
-          %543 = mean(%542, axis=[-1], keepdims=True);
-          %544 = add(%543, 1e-05f16);
-          %545 = sqrt(%544);
-          %546 = divide(%541, %545);
-          %547 = multiply(%546, meta[relay.Constant][108]);
-          %548 = add(%547, meta[relay.Constant][109]);
-          %549 = reshape(%548, newshape=[-1, 768]);
-          %550 = nn.dense(%549, meta[relay.Constant][110], units=3072);
-          %551 = add(%550, meta[relay.Constant][111]);
-          %552 = reshape(%551, newshape=[50, 32, 3072]);
-          %553 = power(%552, 3f16);
-          %554 = multiply(%553, 0.044715f16);
-          %555 = add(%552, %554);
-          %556 = multiply(%555, 0.797885f16);
-          %557 = tanh(%556);
-          %558 = multiply(%552, 0.5f16);
-          %559 = add(%557, 1f16);
-          %560 = multiply(%558, %559);
-          %561 = reshape(%560, newshape=[-1, 3072]);
-          %562 = nn.dense(%561, meta[relay.Constant][112], units=768);
-          %563 = add(%562, meta[relay.Constant][113]);
-          %564 = reshape(%563, newshape=[50, 32, 768]);
-          %565 = add(%539, %564);
-          %566 = mean(%565, axis=[-1], keepdims=True);
-          %567 = subtract(%565, %566);
-          %568 = power(%567, 2f16);
-          %569 = mean(%568, axis=[-1], keepdims=True);
-          %570 = add(%569, 1e-05f16);
-          %571 = sqrt(%570);
-          %572 = divide(%567, %571);
-          %573 = multiply(%572, meta[relay.Constant][114]);
-          %574 = add(%573, meta[relay.Constant][115]);
-          %575 = reshape(%574, newshape=[-1, 768]);
-          %576 = nn.dense(%575, meta[relay.Constant][116], units=2304);
-          %577 = add(%576, meta[relay.Constant][117]);
-          %578 = reshape(%577, newshape=[50, 32, 2304]);
-          %579 = split(%578, indices_or_sections=[768, 1536], axis=2);
-          %580 = %579.0;
-          %581 = reshape(%580, newshape=[50, 32, 12, 64]);
-          %582 = transpose(%581, axes=[0, 2, 1, 3]);
-          %583 = %579.1;
-          %584 = reshape(%583, newshape=[50, 32, 12, 64]);
-          %585 = transpose(%584, axes=[0, 2, 3, 1]);
-          %586 = reshape(%585, newshape=[-1, 64, 32]);
-          %587 = reshape(%582, newshape=[-1, 32, 64]);
-          %588 = transpose(%586, axes=[0, 2, 1]);
-          %589 = nn.batch_matmul(%587, %588, out_dtype="float16", transpose_b=True);
-          %590 = reshape(%589, newshape=[50, 12, 32, 32]);
-          %591 = divide(%590, 8f16);
-          %592 = multiply(%591, meta[relay.Constant][118]);
-          %593 = subtract(%592, meta[relay.Constant][119]);
-          %594 = nn.softmax(%593, axis=3);
-          %595 = %579.2;
-          %596 = reshape(%595, newshape=[50, 32, 12, 64]);
-          %597 = transpose(%596, axes=[0, 2, 1, 3]);
-          %598 = reshape(%597, newshape=[-1, 32, 64]);
-          %599 = reshape(%594, newshape=[-1, 32, 32]);
-          %600 = transpose(%598, axes=[0, 2, 1]);
-          %601 = nn.batch_matmul(%599, %600, out_dtype="float16", transpose_b=True);
-          %602 = reshape(%601, newshape=[50, 12, 32, 64]);
-          %603 = transpose(%602, axes=[0, 2, 1, 3]);
-          %604 = reshape(%603, newshape=[50, 32, 768]);
-          %605 = reshape(%604, newshape=[-1, 768]);
-          %606 = nn.dense(%605, meta[relay.Constant][120], units=768);
-          %607 = add(%606, meta[relay.Constant][121]);
-          %608 = reshape(%607, newshape=[50, 32, 768]);
-          %609 = add(%565, %608);
-          %610 = mean(%609, axis=[-1], keepdims=True);
-          %611 = subtract(%609, %610);
-          %612 = power(%611, 2f16);
-          %613 = mean(%612, axis=[-1], keepdims=True);
-          %614 = add(%613, 1e-05f16);
-          %615 = sqrt(%614);
-          %616 = divide(%611, %615);
-          %617 = multiply(%616, meta[relay.Constant][122]);
-          %618 = add(%617, meta[relay.Constant][123]);
-          %619 = reshape(%618, newshape=[-1, 768]);
-          %620 = nn.dense(%619, meta[relay.Constant][124], units=3072);
-          %621 = add(%620, meta[relay.Constant][125]);
-          %622 = reshape(%621, newshape=[50, 32, 3072]);
-          %623 = power(%622, 3f16);
-          %624 = multiply(%623, 0.044715f16);
-          %625 = add(%622, %624);
-          %626 = multiply(%625, 0.797885f16);
-          %627 = tanh(%626);
-          %628 = multiply(%622, 0.5f16);
-          %629 = add(%627, 1f16);
-          %630 = multiply(%628, %629);
-          %631 = reshape(%630, newshape=[-1, 3072]);
-          %632 = nn.dense(%631, meta[relay.Constant][126], units=768);
-          %633 = add(%632, meta[relay.Constant][127]);
-          %634 = reshape(%633, newshape=[50, 32, 768]);
-          %635 = add(%609, %634);
-          %636 = mean(%635, axis=[-1], keepdims=True);
-          %637 = subtract(%635, %636);
-          %638 = power(%637, 2f16);
-          %639 = mean(%638, axis=[-1], keepdims=True);
-          %640 = add(%639, 1e-05f16);
-          %641 = sqrt(%640);
-          %642 = divide(%637, %641);
-          %643 = multiply(%642, meta[relay.Constant][128]);
-          %644 = add(%643, meta[relay.Constant][129]);
-          %645 = reshape(%644, newshape=[-1, 768]);
-          %646 = nn.dense(%645, meta[relay.Constant][130], units=2304);
-          %647 = add(%646, meta[relay.Constant][131]);
-          %648 = reshape(%647, newshape=[50, 32, 2304]);
-          %649 = split(%648, indices_or_sections=[768, 1536], axis=2);
-          %650 = %649.0;
-          %651 = reshape(%650, newshape=[50, 32, 12, 64]);
-          %652 = transpose(%651, axes=[0, 2, 1, 3]);
-          %653 = %649.1;
-          %654 = reshape(%653, newshape=[50, 32, 12, 64]);
-          %655 = transpose(%654, axes=[0, 2, 3, 1]);
-          %656 = reshape(%655, newshape=[-1, 64, 32]);
-          %657 = reshape(%652, newshape=[-1, 32, 64]);
-          %658 = transpose(%656, axes=[0, 2, 1]);
-          %659 = nn.batch_matmul(%657, %658, out_dtype="float16", transpose_b=True);
-          %660 = reshape(%659, newshape=[50, 12, 32, 32]);
-          %661 = divide(%660, 8f16);
-          %662 = multiply(%661, meta[relay.Constant][132]);
-          %663 = subtract(%662, meta[relay.Constant][133]);
-          %664 = nn.softmax(%663, axis=3);
-          %665 = %649.2;
-          %666 = reshape(%665, newshape=[50, 32, 12, 64]);
-          %667 = transpose(%666, axes=[0, 2, 1, 3]);
-          %668 = reshape(%667, newshape=[-1, 32, 64]);
-          %669 = reshape(%664, newshape=[-1, 32, 32]);
-          %670 = transpose(%668, axes=[0, 2, 1]);
-          %671 = nn.batch_matmul(%669, %670, out_dtype="float16", transpose_b=True);
-          %672 = reshape(%671, newshape=[50, 12, 32, 64]);
-          %673 = transpose(%672, axes=[0, 2, 1, 3]);
-          %674 = reshape(%673, newshape=[50, 32, 768]);
-          %675 = reshape(%674, newshape=[-1, 768]);
-          %676 = nn.dense(%675, meta[relay.Constant][134], units=768);
-          %677 = add(%676, meta[relay.Constant][135]);
-          %678 = reshape(%677, newshape=[50, 32, 768]);
-          %679 = add(%635, %678);
-          %680 = mean(%679, axis=[-1], keepdims=True);
-          %681 = subtract(%679, %680);
-          %682 = power(%681, 2f16);
-          %683 = mean(%682, axis=[-1], keepdims=True);
-          %684 = add(%683, 1e-05f16);
-          %685 = sqrt(%684);
-          %686 = divide(%681, %685);
-          %687 = multiply(%686, meta[relay.Constant][136]);
-          %688 = add(%687, meta[relay.Constant][137]);
-          %689 = reshape(%688, newshape=[-1, 768]);
-          %690 = nn.dense(%689, meta[relay.Constant][138], units=3072);
-          %691 = add(%690, meta[relay.Constant][139]);
-          %692 = reshape(%691, newshape=[50, 32, 3072]);
-          %693 = power(%692, 3f16);
-          %694 = multiply(%693, 0.044715f16);
-          %695 = add(%692, %694);
-          %696 = multiply(%695, 0.797885f16);
-          %697 = tanh(%696);
-          %698 = multiply(%692, 0.5f16);
-          %699 = add(%697, 1f16);
-          %700 = multiply(%698, %699);
-          %701 = reshape(%700, newshape=[-1, 3072]);
-          %702 = nn.dense(%701, meta[relay.Constant][140], units=768);
-          %703 = add(%702, meta[relay.Constant][141]);
-          %704 = reshape(%703, newshape=[50, 32, 768]);
-          %705 = add(%679, %704);
-          %706 = mean(%705, axis=[-1], keepdims=True);
-          %707 = subtract(%705, %706);
-          %708 = power(%707, 2f16);
-          %709 = mean(%708, axis=[-1], keepdims=True);
-          %710 = add(%709, 1e-05f16);
-          %711 = sqrt(%710);
-          %712 = divide(%707, %711);
-          %713 = multiply(%712, meta[relay.Constant][142]);
-          %714 = add(%713, meta[relay.Constant][143]);
-          %715 = reshape(%714, newshape=[-1, 768]);
-          %716 = nn.dense(%715, meta[relay.Constant][144], units=2304);
-          %717 = add(%716, meta[relay.Constant][145]);
-          %718 = reshape(%717, newshape=[50, 32, 2304]);
-          %719 = split(%718, indices_or_sections=[768, 1536], axis=2);
-          %720 = %719.0;
-          %721 = reshape(%720, newshape=[50, 32, 12, 64]);
-          %722 = transpose(%721, axes=[0, 2, 1, 3]);
-          %723 = %719.1;
-          %724 = reshape(%723, newshape=[50, 32, 12, 64]);
-          %725 = transpose(%724, axes=[0, 2, 3, 1]);
-          %726 = reshape(%725, newshape=[-1, 64, 32]);
-          %727 = reshape(%722, newshape=[-1, 32, 64]);
-          %728 = transpose(%726, axes=[0, 2, 1]);
-          %729 = nn.batch_matmul(%727, %728, out_dtype="float16", transpose_b=True);
-          %730 = reshape(%729, newshape=[50, 12, 32, 32]);
-          %731 = divide(%730, 8f16);
-          %732 = multiply(%731, meta[relay.Constant][146]);
-          %733 = subtract(%732, meta[relay.Constant][147]);
-          %734 = nn.softmax(%733, axis=3);
-          %735 = %719.2;
-          %736 = reshape(%735, newshape=[50, 32, 12, 64]);
-          %737 = transpose(%736, axes=[0, 2, 1, 3]);
-          %738 = reshape(%737, newshape=[-1, 32, 64]);
-          %739 = reshape(%734, newshape=[-1, 32, 32]);
-          %740 = transpose(%738, axes=[0, 2, 1]);
-          %741 = nn.batch_matmul(%739, %740, out_dtype="float16", transpose_b=True);
-          %742 = reshape(%741, newshape=[50, 12, 32, 64]);
-          %743 = transpose(%742, axes=[0, 2, 1, 3]);
-          %744 = reshape(%743, newshape=[50, 32, 768]);
-          %745 = reshape(%744, newshape=[-1, 768]);
-          %746 = nn.dense(%745, meta[relay.Constant][148], units=768);
-          %747 = add(%746, meta[relay.Constant][149]);
-          %748 = reshape(%747, newshape=[50, 32, 768]);
-          %749 = add(%705, %748);
-          %750 = mean(%749, axis=[-1], keepdims=True);
-          %751 = subtract(%749, %750);
-          %752 = power(%751, 2f16);
-          %753 = mean(%752, axis=[-1], keepdims=True);
-          %754 = add(%753, 1e-05f16);
-          %755 = sqrt(%754);
-          %756 = divide(%751, %755);
-          %757 = multiply(%756, meta[relay.Constant][150]);
-          %758 = add(%757, meta[relay.Constant][151]);
-          %759 = reshape(%758, newshape=[-1, 768]);
-          %760 = nn.dense(%759, meta[relay.Constant][152], units=3072);
-          %761 = add(%760, meta[relay.Constant][153]);
-          %762 = reshape(%761, newshape=[50, 32, 3072]);
-          %763 = power(%762, 3f16);
-          %764 = multiply(%763, 0.044715f16);
-          %765 = add(%762, %764);
-          %766 = multiply(%765, 0.797885f16);
-          %767 = tanh(%766);
-          %768 = multiply(%762, 0.5f16);
-          %769 = add(%767, 1f16);
-          %770 = multiply(%768, %769);
-          %771 = reshape(%770, newshape=[-1, 3072]);
-          %772 = nn.dense(%771, meta[relay.Constant][154], units=768);
-          %773 = add(%772, meta[relay.Constant][155]);
-          %774 = reshape(%773, newshape=[50, 32, 768]);
-          %775 = add(%749, %774);
-          %776 = mean(%775, axis=[-1], keepdims=True);
-          %777 = subtract(%775, %776);
-          %778 = power(%777, 2f16);
-          %779 = mean(%778, axis=[-1], keepdims=True);
-          %780 = add(%779, 1e-05f16);
-          %781 = sqrt(%780);
-          %782 = divide(%777, %781);
-          %783 = multiply(%782, meta[relay.Constant][156]);
-          %784 = add(%783, meta[relay.Constant][157]);
-          %785 = reshape(%784, newshape=[-1, 768]);
-          %786 = nn.dense(%785, meta[relay.Constant][158], units=2304);
-          %787 = add(%786, meta[relay.Constant][159]);
-          %788 = reshape(%787, newshape=[50, 32, 2304]);
-          %789 = split(%788, indices_or_sections=[768, 1536], axis=2);
-          %790 = %789.0;
-          %791 = reshape(%790, newshape=[50, 32, 12, 64]);
-          %792 = transpose(%791, axes=[0, 2, 1, 3]);
-          %793 = %789.1;
-          %794 = reshape(%793, newshape=[50, 32, 12, 64]);
-          %795 = transpose(%794, axes=[0, 2, 3, 1]);
-          %796 = reshape(%795, newshape=[-1, 64, 32]);
-          %797 = reshape(%792, newshape=[-1, 32, 64]);
-          %798 = transpose(%796, axes=[0, 2, 1]);
-          %799 = nn.batch_matmul(%797, %798, out_dtype="float16", transpose_b=True);
-          %800 = reshape(%799, newshape=[50, 12, 32, 32]);
-          %801 = divide(%800, 8f16);
-          %802 = multiply(%801, meta[relay.Constant][160]);
-          %803 = subtract(%802, meta[relay.Constant][161]);
-          %804 = nn.softmax(%803, axis=3);
-          %805 = %789.2;
-          %806 = reshape(%805, newshape=[50, 32, 12, 64]);
-          %807 = transpose(%806, axes=[0, 2, 1, 3]);
-          %808 = reshape(%807, newshape=[-1, 32, 64]);
-          %809 = reshape(%804, newshape=[-1, 32, 32]);
-          %810 = transpose(%808, axes=[0, 2, 1]);
-          %811 = nn.batch_matmul(%809, %810, out_dtype="float16", transpose_b=True);
-          %812 = reshape(%811, newshape=[50, 12, 32, 64]);
-          %813 = transpose(%812, axes=[0, 2, 1, 3]);
-          %814 = reshape(%813, newshape=[50, 32, 768]);
-          %815 = reshape(%814, newshape=[-1, 768]);
-          %816 = nn.dense(%815, meta[relay.Constant][162], units=768);
-          %817 = add(%816, meta[relay.Constant][163]);
-          %818 = reshape(%817, newshape=[50, 32, 768]);
-          %819 = add(%775, %818);
-          %820 = mean(%819, axis=[-1], keepdims=True);
-          %821 = subtract(%819, %820);
-          %822 = power(%821, 2f16);
-          %823 = mean(%822, axis=[-1], keepdims=True);
-          %824 = add(%823, 1e-05f16);
-          %825 = sqrt(%824);
-          %826 = divide(%821, %825);
-          %827 = multiply(%826, meta[relay.Constant][164]);
-          %828 = add(%827, meta[relay.Constant][165]);
-          %829 = reshape(%828, newshape=[-1, 768]);
-          %830 = nn.dense(%829, meta[relay.Constant][166], units=3072);
-          %831 = add(%830, meta[relay.Constant][167]);
-          %832 = reshape(%831, newshape=[50, 32, 3072]);
-          %833 = power(%832, 3f16);
-          %834 = multiply(%833, 0.044715f16);
-          %835 = add(%832, %834);
-          %836 = multiply(%835, 0.797885f16);
-          %837 = tanh(%836);
-          %838 = multiply(%832, 0.5f16);
-          %839 = add(%837, 1f16);
-          %840 = multiply(%838, %839);
-          %841 = reshape(%840, newshape=[-1, 3072]);
-          %842 = nn.dense(%841, meta[relay.Constant][168], units=768);
-          %843 = add(%842, meta[relay.Constant][169]);
-          %844 = reshape(%843, newshape=[50, 32, 768]);
-          %845 = add(%819, %844);
-          %846 = mean(%845, axis=[-1], keepdims=True);
-          %847 = subtract(%845, %846);
-          %848 = power(%847, 2f16);
-          %849 = mean(%848, axis=[-1], keepdims=True);
-          %850 = add(%849, 1e-05f16);
-          %851 = sqrt(%850);
-          %852 = divide(%847, %851);
-          %853 = multiply(%852, meta[relay.Constant][170]);
-          %854 = add(%853, meta[relay.Constant][171]);
-          %855 = transpose(%24, axes=[0, 2, 1, 3]);
-          %856 = expand_dims(%855, axis=0);
-          %857 = expand_dims(%37, axis=0);
-          %858 = (%856, %857);
-          %859 = transpose(%94, axes=[0, 2, 1, 3]);
-          %860 = expand_dims(%859, axis=0);
-          %861 = expand_dims(%107, axis=0);
-          %862 = (%860, %861);
-          %863 = transpose(%164, axes=[0, 2, 1, 3]);
-          %864 = expand_dims(%863, axis=0);
-          %865 = expand_dims(%177, axis=0);
-          %866 = (%864, %865);
-          %867 = transpose(%234, axes=[0, 2, 1, 3]);
-          %868 = expand_dims(%867, axis=0);
-          %869 = expand_dims(%247, axis=0);
-          %870 = (%868, %869);
-          %871 = transpose(%304, axes=[0, 2, 1, 3]);
-          %872 = expand_dims(%871, axis=0);
-          %873 = expand_dims(%317, axis=0);
-          %874 = (%872, %873);
-          %875 = transpose(%374, axes=[0, 2, 1, 3]);
-          %876 = expand_dims(%875, axis=0);
-          %877 = expand_dims(%387, axis=0);
-          %878 = (%876, %877);
-          %879 = transpose(%444, axes=[0, 2, 1, 3]);
-          %880 = expand_dims(%879, axis=0);
-          %881 = expand_dims(%457, axis=0);
-          %882 = (%880, %881);
-          %883 = transpose(%514, axes=[0, 2, 1, 3]);
-          %884 = expand_dims(%883, axis=0);
-          %885 = expand_dims(%527, axis=0);
-          %886 = (%884, %885);
-          %887 = transpose(%584, axes=[0, 2, 1, 3]);
-          %888 = expand_dims(%887, axis=0);
-          %889 = expand_dims(%597, axis=0);
-          %890 = (%888, %889);
-          %891 = transpose(%654, axes=[0, 2, 1, 3]);
-          %892 = expand_dims(%891, axis=0);
-          %893 = expand_dims(%667, axis=0);
-          %894 = (%892, %893);
-          %895 = transpose(%724, axes=[0, 2, 1, 3]);
-          %896 = expand_dims(%895, axis=0);
-          %897 = expand_dims(%737, axis=0);
-          %898 = (%896, %897);
-          %899 = transpose(%794, axes=[0, 2, 1, 3]);
-          %900 = expand_dims(%899, axis=0);
-          %901 = expand_dims(%807, axis=0);
-          %902 = (%900, %901);
-          %903 = reshape(%854, newshape=[1, 50, 32, 768]);
-          %904 = concatenate(%858);
-          %905 = concatenate(%862);
-          %906 = concatenate(%866);
-          %907 = concatenate(%870);
-          %908 = concatenate(%874);
-          %909 = concatenate(%878);
-          %910 = concatenate(%882);
-          %911 = concatenate(%886);
-          %912 = concatenate(%890);
-          %913 = concatenate(%894);
-          %914 = concatenate(%898);
-          %915 = concatenate(%902);
-          (%903, %904, %905, %906, %907, %908, %909, %910, %911, %912, %913, %914, %915)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "gpt2_16",
-        "input_shapes": {"x": [1, 50, 32]},
-        "input_dtypes": {"x": "int64"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float16",
-    }
-
-
-def gpt2_extract_consts(dtype):
-    return make_consts(
-        dtype,
-        [
-            (768, 768),  # 0
-            (768,),  # 1
-            (768,),  # 2
-            (768,),  # 3
-            (3072, 768),  # 4
-            (3072,),  # 5
-            (1, 32, 768),  # 6
-        ],
-    )
-
-
-def gpt2_extract():
-    metatable = {"relay.Constant": gpt2_extract_consts("float32")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1600, 768), float32]) -> Tensor[(50, 32, 3072), float32] {
-            %46 = nn.dense(%x, meta[relay.Constant][0], units=768);
-            %47 = add(%46, meta[relay.Constant][1]);
-            %48 = reshape(%47, newshape=[50, 32, 768]);
-            %49 = add(meta[relay.Constant][6], %48);
-            %50 = mean(%49, axis=[-1], keepdims=True);
-            %51 = subtract(%49, %50);
-            %52 = power(%51, 2f);
-            %53 = mean(%52, axis=[-1], keepdims=True);
-            %54 = add(%53, 1e-05f);
-            %55 = sqrt(%54);
-            %56 = divide(%51, %55);
-            %57 = multiply(%56, meta[relay.Constant][2]);
-            %58 = add(%57, meta[relay.Constant][3]);
-            %59 = reshape(%58, newshape=[-1, 768]);
-            %60 = nn.dense(%59, meta[relay.Constant][4], units=3072);
-            %61 = add(%60, meta[relay.Constant][5]);
-            %62 = reshape(%61, newshape=[50, 32, 3072]);
-            %63 = power(%62, 3f);
-            %64 = multiply(%63, 0.044715f);
-            %65 = add(%62, %64);
-            %66 = multiply(%65, 0.797885f);
-            %67 = tanh(%66);
-            %68 = multiply(%62, 0.5f);
-            %69 = add(%67, 1f);
-            %70 = multiply(%68, %69);
-            %70
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "input_shapes": {"x": [1600, 768]},
-        "input_dtypes": {"x": "float32"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def gpt2_extract_16():
-    metatable = {"relay.Constant": gpt2_extract_consts("float16")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1600, 768), float16]) -> Tensor[(50, 32, 3072), float16] {
-            %46 = nn.dense(%x, meta[relay.Constant][0], units=768);
-            %47 = add(%46, meta[relay.Constant][1]);
-            %48 = reshape(%47, newshape=[50, 32, 768]);
-            %49 = add(meta[relay.Constant][6], %48);
-            %50 = mean(%49, axis=[-1], keepdims=True);
-            %51 = subtract(%49, %50);
-            %52 = power(%51, 2f16);
-            %53 = mean(%52, axis=[-1], keepdims=True);
-            %54 = add(%53, 1e-05f16);
-            %55 = sqrt(%54);
-            %56 = divide(%51, %55);
-            %57 = multiply(%56, meta[relay.Constant][2]);
-            %58 = add(%57, meta[relay.Constant][3]);
-            %59 = reshape(%58, newshape=[-1, 768]);
-            %60 = nn.dense(%59, meta[relay.Constant][4], units=3072);
-            %61 = add(%60, meta[relay.Constant][5]);
-            %62 = reshape(%61, newshape=[50, 32, 3072]);
-            %63 = power(%62, 3f16);
-            %64 = multiply(%63, 0.044715f16);
-            %65 = add(%62, %64);
-            %66 = multiply(%65, 0.797885f16);
-            %67 = tanh(%66);
-            %68 = multiply(%62, 0.5f16);
-            %69 = add(%67, 1f16);
-            %70 = multiply(%68, %69);
-            %70
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "gpt2_extract_16",
-        "input_shapes": {"x": [1600, 768]},
-        "input_dtypes": {"x": "float16"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float16",
-    }
-
-
-def gpt2_16_for_cutlass_extract_consts(dtype):
-    return make_consts(
-        "float16",
-        [
-            (2304, 768),  # 0
-            (2304,),  # 1
-            (600, 32, 64),  # 2
-            (600, 32, 32),  # 3
-        ],
-    )
-
-
-def gpt2_16_for_cutlass_extract():
-    metatable = {"relay.Constant": gpt2_16_for_cutlass_extract_consts("float16")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0: Tensor[(1600, 768), float16],
-                  %x3: Tensor[(600, 32, 64), float16])
-            -> (Tensor[(1600, 2304), float16], Tensor[(1200, 32, 32), float16]) {
-          %0 = nn.dense(%x0, meta[relay.Constant][0], units=2304);
-          %1 = add(%0, meta[relay.Constant][1]);
-          %2 = nn.batch_matmul(%x3, meta[relay.Constant][2], out_dtype="float16", transpose_b=True);
-          %3 = (%2, meta[relay.Constant][3]);
-          %4 = concatenate(%3);
-          (%1, %4)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "gpt2_16_for_cutlass_extract",
-        "input_shapes": {"x0": (1600, 768), "x3": (600, 32, 64)},
-        "input_dtypes": {"x0": "float16", "x3": "float16"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float16",
-    }
-
-
-def resnet50_consts(dtype):
-    return make_consts(
-        dtype,
-        [
-            (3,),  # 0
-            (3,),  # 1
-            (3,),  # 2
-            (3,),  # 3
-            (64, 3, 7, 7),  # 4
-            (64,),  # 5
-            (64,),  # 6
-            (64,),  # 7
-            (64,),  # 8
-            (64,),  # 9
-            (64,),  # 10
-            (64,),  # 11
-            (64,),  # 12
-            (64, 64, 1, 1),  # 13
-            (64,),  # 14
-            (64,),  # 15
-            (64,),  # 16
-            (64,),  # 17
-            (64, 64, 3, 3),  # 18
-            (64,),  # 19
-            (64,),  # 20
-            (64,),  # 21
-            (64,),  # 22
-            (256, 64, 1, 1),  # 23
-            (256, 64, 1, 1),  # 24
-            (256,),  # 25
-            (256,),  # 26
-            (256,),  # 27
-            (256,),  # 28
-            (64, 256, 1, 1),  # 29
-            (64,),  # 30
-            (64,),  # 31
-            (64,),  # 32
-            (64,),  # 33
-            (64, 64, 3, 3),  # 34
-            (64,),  # 35
-            (64,),  # 36
-            (64,),  # 37
-            (64,),  # 38
-            (256, 64, 1, 1),  # 39
-            (256,),  # 40
-            (256,),  # 41
-            (256,),  # 42
-            (256,),  # 43
-            (64, 256, 1, 1),  # 44
-            (64,),  # 45
-            (64,),  # 46
-            (64,),  # 47
-            (64,),  # 48
-            (64, 64, 3, 3),  # 49
-            (64,),  # 50
-            (64,),  # 51
-            (64,),  # 52
-            (64,),  # 53
-            (256, 64, 1, 1),  # 54
-            (256,),  # 55
-            (256,),  # 56
-            (256,),  # 57
-            (256,),  # 58
-            (128, 256, 1, 1),  # 59
-            (128,),  # 60
-            (128,),  # 61
-            (128,),  # 62
-            (128,),  # 63
-            (128, 128, 3, 3),  # 64
-            (128,),  # 65
-            (128,),  # 66
-            (128,),  # 67
-            (128,),  # 68
-            (512, 128, 1, 1),  # 69
-            (512, 256, 1, 1),  # 70
-            (512,),  # 71
-            (512,),  # 72
-            (512,),  # 73
-            (512,),  # 74
-            (128, 512, 1, 1),  # 75
-            (128,),  # 76
-            (128,),  # 77
-            (128,),  # 78
-            (128,),  # 79
-            (128, 128, 3, 3),  # 80
-            (128,),  # 81
-            (128,),  # 82
-            (128,),  # 83
-            (128,),  # 84
-            (512, 128, 1, 1),  # 85
-            (512,),  # 86
-            (512,),  # 87
-            (512,),  # 88
-            (512,),  # 89
-            (128, 512, 1, 1),  # 90
-            (128,),  # 91
-            (128,),  # 92
-            (128,),  # 93
-            (128,),  # 94
-            (128, 128, 3, 3),  # 95
-            (128,),  # 96
-            (128,),  # 97
-            (128,),  # 98
-            (128,),  # 99
-            (512, 128, 1, 1),  # 100
-            (512,),  # 101
-            (512,),  # 102
-            (512,),  # 103
-            (512,),  # 104
-            (128, 512, 1, 1),  # 105
-            (128,),  # 106
-            (128,),  # 107
-            (128,),  # 108
-            (128,),  # 109
-            (128, 128, 3, 3),  # 110
-            (128,),  # 111
-            (128,),  # 112
-            (128,),  # 113
-            (128,),  # 114
-            (512, 128, 1, 1),  # 115
-            (512,),  # 116
-            (512,),  # 117
-            (512,),  # 118
-            (512,),  # 119
-            (256, 512, 1, 1),  # 120
-            (256,),  # 121
-            (256,),  # 122
-            (256,),  # 123
-            (256,),  # 124
-            (256, 256, 3, 3),  # 125
-            (256,),  # 126
-            (256,),  # 127
-            (256,),  # 128
-            (256,),  # 129
-            (1024, 256, 1, 1),  # 130
-            (1024, 512, 1, 1),  # 131
-            (1024,),  # 132
-            (1024,),  # 133
-            (1024,),  # 134
-            (1024,),  # 135
-            (256, 1024, 1, 1),  # 136
-            (256,),  # 137
-            (256,),  # 138
-            (256,),  # 139
-            (256,),  # 140
-            (256, 256, 3, 3),  # 141
-            (256,),  # 142
-            (256,),  # 143
-            (256,),  # 144
-            (256,),  # 145
-            (1024, 256, 1, 1),  # 146
-            (1024,),  # 147
-            (1024,),  # 148
-            (1024,),  # 149
-            (1024,),  # 150
-            (256, 1024, 1, 1),  # 151
-            (256,),  # 152
-            (256,),  # 153
-            (256,),  # 154
-            (256,),  # 155
-            (256, 256, 3, 3),  # 156
-            (256,),  # 157
-            (256,),  # 158
-            (256,),  # 159
-            (256,),  # 160
-            (1024, 256, 1, 1),  # 161
-            (1024,),  # 162
-            (1024,),  # 163
-            (1024,),  # 164
-            (1024,),  # 165
-            (256, 1024, 1, 1),  # 166
-            (256,),  # 167
-            (256,),  # 168
-            (256,),  # 169
-            (256,),  # 170
-            (256, 256, 3, 3),  # 171
-            (256,),  # 172
-            (256,),  # 173
-            (256,),  # 174
-            (256,),  # 175
-            (1024, 256, 1, 1),  # 176
-            (1024,),  # 177
-            (1024,),  # 178
-            (1024,),  # 179
-            (1024,),  # 180
-            (256, 1024, 1, 1),  # 181
-            (256,),  # 182
-            (256,),  # 183
-            (256,),  # 184
-            (256,),  # 185
-            (256, 256, 3, 3),  # 186
-            (256,),  # 187
-            (256,),  # 188
-            (256,),  # 189
-            (256,),  # 190
-            (1024, 256, 1, 1),  # 191
-            (1024,),  # 192
-            (1024,),  # 193
-            (1024,),  # 194
-            (1024,),  # 195
-            (256, 1024, 1, 1),  # 196
-            (256,),  # 197
-            (256,),  # 198
-            (256,),  # 199
-            (256,),  # 200
-            (256, 256, 3, 3),  # 201
-            (256,),  # 202
-            (256,),  # 203
-            (256,),  # 204
-            (256,),  # 205
-            (1024, 256, 1, 1),  # 206
-            (1024,),  # 207
-            (1024,),  # 208
-            (1024,),  # 209
-            (1024,),  # 210
-            (512, 1024, 1, 1),  # 211
-            (512,),  # 212
-            (512,),  # 213
-            (512,),  # 214
-            (512,),  # 215
-            (512, 512, 3, 3),  # 216
-            (512,),  # 217
-            (512,),  # 218
-            (512,),  # 219
-            (512,),  # 220
-            (2048, 512, 1, 1),  # 221
-            (2048, 1024, 1, 1),  # 222
-            (2048,),  # 223
-            (2048,),  # 224
-            (2048,),  # 225
-            (2048,),  # 226
-            (512, 2048, 1, 1),  # 227
-            (512,),  # 228
-            (512,),  # 229
-            (512,),  # 230
-            (512,),  # 231
-            (512, 512, 3, 3),  # 232
-            (512,),  # 233
-            (512,),  # 234
-            (512,),  # 235
-            (512,),  # 236
-            (2048, 512, 1, 1),  # 237
-            (2048,),  # 238
-            (2048,),  # 239
-            (2048,),  # 240
-            (2048,),  # 241
-            (512, 2048, 1, 1),  # 242
-            (512,),  # 243
-            (512,),  # 244
-            (512,),  # 245
-            (512,),  # 246
-            (512, 512, 3, 3),  # 247
-            (512,),  # 248
-            (512,),  # 249
-            (512,),  # 250
-            (512,),  # 251
-            (2048, 512, 1, 1),  # 252
-            (2048,),  # 253
-            (2048,),  # 254
-            (2048,),  # 255
-            (2048,),  # 256
-            (1000, 2048),  # 257
-            (1000,),  # 258
-        ],
-    )
-
-
-def resnet50():
-    metatable = {"relay.Constant": resnet50_consts("float32")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] {
-          %0 = nn.batch_norm(%data, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]);
-          %1 = %0.0;
-          %2 = nn.conv2d(%1, meta[relay.Constant][4], strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7]);
-          %3 = nn.batch_norm(%2, meta[relay.Constant][5], meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8]);
-          %4 = %3.0;
-          %5 = nn.relu(%4);
-          %6 = nn.max_pool2d(%5, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]);
-          %7 = nn.batch_norm(%6, meta[relay.Constant][9], meta[relay.Constant][10], meta[relay.Constant][11], meta[relay.Constant][12]);
-          %8 = %7.0;
-          %9 = nn.relu(%8);
-          %10 = nn.conv2d(%9, meta[relay.Constant][13], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %11 = nn.batch_norm(%10, meta[relay.Constant][14], meta[relay.Constant][15], meta[relay.Constant][16], meta[relay.Constant][17]);
-          %12 = %11.0;
-          %13 = nn.relu(%12);
-          %14 = nn.conv2d(%13, meta[relay.Constant][18], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-          %15 = nn.batch_norm(%14, meta[relay.Constant][19], meta[relay.Constant][20], meta[relay.Constant][21], meta[relay.Constant][22]);
-          %16 = %15.0;
-          %17 = nn.relu(%16);
-          %18 = nn.conv2d(%17, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %19 = nn.conv2d(%9, meta[relay.Constant][24], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %20 = add(%18, %19);
-          %21 = nn.batch_norm(%20, meta[relay.Constant][25], meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28]);
-          %22 = %21.0;
-          %23 = nn.relu(%22);
-          %24 = nn.conv2d(%23, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %25 = nn.batch_norm(%24, meta[relay.Constant][30], meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33]);
-          %26 = %25.0;
-          %27 = nn.relu(%26);
-          %28 = nn.conv2d(%27, meta[relay.Constant][34], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-          %29 = nn.batch_norm(%28, meta[relay.Constant][35], meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38]);
-          %30 = %29.0;
-          %31 = nn.relu(%30);
-          %32 = nn.conv2d(%31, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %33 = add(%32, %20);
-          %34 = nn.batch_norm(%33, meta[relay.Constant][40], meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43]);
-          %35 = %34.0;
-          %36 = nn.relu(%35);
-          %37 = nn.conv2d(%36, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %38 = nn.batch_norm(%37, meta[relay.Constant][45], meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48]);
-          %39 = %38.0;
-          %40 = nn.relu(%39);
-          %41 = nn.conv2d(%40, meta[relay.Constant][49], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-          %42 = nn.batch_norm(%41, meta[relay.Constant][50], meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53]);
-          %43 = %42.0;
-          %44 = nn.relu(%43);
-          %45 = nn.conv2d(%44, meta[relay.Constant][54], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %46 = add(%45, %33);
-          %47 = nn.batch_norm(%46, meta[relay.Constant][55], meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58]);
-          %48 = %47.0;
-          %49 = nn.relu(%48);
-          %50 = nn.conv2d(%49, meta[relay.Constant][59], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %51 = nn.batch_norm(%50, meta[relay.Constant][60], meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63]);
-          %52 = %51.0;
-          %53 = nn.relu(%52);
-          %54 = nn.conv2d(%53, meta[relay.Constant][64], strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %55 = nn.batch_norm(%54, meta[relay.Constant][65], meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68]);
-          %56 = %55.0;
-          %57 = nn.relu(%56);
-          %58 = nn.conv2d(%57, meta[relay.Constant][69], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %59 = nn.conv2d(%49, meta[relay.Constant][70], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %60 = add(%58, %59);
-          %61 = nn.batch_norm(%60, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]);
-          %62 = %61.0;
-          %63 = nn.relu(%62);
-          %64 = nn.conv2d(%63, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %65 = nn.batch_norm(%64, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]);
-          %66 = %65.0;
-          %67 = nn.relu(%66);
-          %68 = nn.conv2d(%67, meta[relay.Constant][80], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %69 = nn.batch_norm(%68, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]);
-          %70 = %69.0;
-          %71 = nn.relu(%70);
-          %72 = nn.conv2d(%71, meta[relay.Constant][85], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %73 = add(%72, %60);
-          %74 = nn.batch_norm(%73, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]);
-          %75 = %74.0;
-          %76 = nn.relu(%75);
-          %77 = nn.conv2d(%76, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %78 = nn.batch_norm(%77, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]);
-          %79 = %78.0;
-          %80 = nn.relu(%79);
-          %81 = nn.conv2d(%80, meta[relay.Constant][95], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %82 = nn.batch_norm(%81, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]);
-          %83 = %82.0;
-          %84 = nn.relu(%83);
-          %85 = nn.conv2d(%84, meta[relay.Constant][100], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %86 = add(%85, %73);
-          %87 = nn.batch_norm(%86, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]);
-          %88 = %87.0;
-          %89 = nn.relu(%88);
-          %90 = nn.conv2d(%89, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %91 = nn.batch_norm(%90, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]);
-          %92 = %91.0;
-          %93 = nn.relu(%92);
-          %94 = nn.conv2d(%93, meta[relay.Constant][110], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %95 = nn.batch_norm(%94, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]);
-          %96 = %95.0;
-          %97 = nn.relu(%96);
-          %98 = nn.conv2d(%97, meta[relay.Constant][115], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %99 = add(%98, %86);
-          %100 = nn.batch_norm(%99, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]);
-          %101 = %100.0;
-          %102 = nn.relu(%101);
-          %103 = nn.conv2d(%102, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %104 = nn.batch_norm(%103, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]);
-          %105 = %104.0;
-          %106 = nn.relu(%105);
-          %107 = nn.conv2d(%106, meta[relay.Constant][125], strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %108 = nn.batch_norm(%107, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]);
-          %109 = %108.0;
-          %110 = nn.relu(%109);
-          %111 = nn.conv2d(%110, meta[relay.Constant][130], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %112 = nn.conv2d(%102, meta[relay.Constant][131], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %113 = add(%111, %112);
-          %114 = nn.batch_norm(%113, meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134], meta[relay.Constant][135]);
-          %115 = %114.0;
-          %116 = nn.relu(%115);
-          %117 = nn.conv2d(%116, meta[relay.Constant][136], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %118 = nn.batch_norm(%117, meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139], meta[relay.Constant][140]);
-          %119 = %118.0;
-          %120 = nn.relu(%119);
-          %121 = nn.conv2d(%120, meta[relay.Constant][141], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %122 = nn.batch_norm(%121, meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144], meta[relay.Constant][145]);
-          %123 = %122.0;
-          %124 = nn.relu(%123);
-          %125 = nn.conv2d(%124, meta[relay.Constant][146], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %126 = add(%125, %113);
-          %127 = nn.batch_norm(%126, meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149], meta[relay.Constant][150]);
-          %128 = %127.0;
-          %129 = nn.relu(%128);
-          %130 = nn.conv2d(%129, meta[relay.Constant][151], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %131 = nn.batch_norm(%130, meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154], meta[relay.Constant][155]);
-          %132 = %131.0;
-          %133 = nn.relu(%132);
-          %134 = nn.conv2d(%133, meta[relay.Constant][156], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %135 = nn.batch_norm(%134, meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159], meta[relay.Constant][160]);
-          %136 = %135.0;
-          %137 = nn.relu(%136);
-          %138 = nn.conv2d(%137, meta[relay.Constant][161], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %139 = add(%138, %126);
-          %140 = nn.batch_norm(%139, meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164], meta[relay.Constant][165]);
-          %141 = %140.0;
-          %142 = nn.relu(%141);
-          %143 = nn.conv2d(%142, meta[relay.Constant][166], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %144 = nn.batch_norm(%143, meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169], meta[relay.Constant][170]);
-          %145 = %144.0;
-          %146 = nn.relu(%145);
-          %147 = nn.conv2d(%146, meta[relay.Constant][171], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %148 = nn.batch_norm(%147, meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174], meta[relay.Constant][175]);
-          %149 = %148.0;
-          %150 = nn.relu(%149);
-          %151 = nn.conv2d(%150, meta[relay.Constant][176], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %152 = add(%151, %139);
-          %153 = nn.batch_norm(%152, meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179], meta[relay.Constant][180]);
-          %154 = %153.0;
-          %155 = nn.relu(%154);
-          %156 = nn.conv2d(%155, meta[relay.Constant][181], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %157 = nn.batch_norm(%156, meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184], meta[relay.Constant][185]);
-          %158 = %157.0;
-          %159 = nn.relu(%158);
-          %160 = nn.conv2d(%159, meta[relay.Constant][186], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %161 = nn.batch_norm(%160, meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189], meta[relay.Constant][190]);
-          %162 = %161.0;
-          %163 = nn.relu(%162);
-          %164 = nn.conv2d(%163, meta[relay.Constant][191], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %165 = add(%164, %152);
-          %166 = nn.batch_norm(%165, meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194], meta[relay.Constant][195]);
-          %167 = %166.0;
-          %168 = nn.relu(%167);
-          %169 = nn.conv2d(%168, meta[relay.Constant][196], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %170 = nn.batch_norm(%169, meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199], meta[relay.Constant][200]);
-          %171 = %170.0;
-          %172 = nn.relu(%171);
-          %173 = nn.conv2d(%172, meta[relay.Constant][201], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %174 = nn.batch_norm(%173, meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204], meta[relay.Constant][205]);
-          %175 = %174.0;
-          %176 = nn.relu(%175);
-          %177 = nn.conv2d(%176, meta[relay.Constant][206], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %178 = add(%177, %165);
-          %179 = nn.batch_norm(%178, meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209], meta[relay.Constant][210]);
-          %180 = %179.0;
-          %181 = nn.relu(%180);
-          %182 = nn.conv2d(%181, meta[relay.Constant][211], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %183 = nn.batch_norm(%182, meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214], meta[relay.Constant][215]);
-          %184 = %183.0;
-          %185 = nn.relu(%184);
-          %186 = nn.conv2d(%185, meta[relay.Constant][216], strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]);
-          %187 = nn.batch_norm(%186, meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219], meta[relay.Constant][220]);
-          %188 = %187.0;
-          %189 = nn.relu(%188);
-          %190 = nn.conv2d(%189, meta[relay.Constant][221], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %191 = nn.conv2d(%181, meta[relay.Constant][222], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %192 = add(%190, %191);
-          %193 = nn.batch_norm(%192, meta[relay.Constant][223], meta[relay.Constant][224], meta[relay.Constant][225], meta[relay.Constant][226]);
-          %194 = %193.0;
-          %195 = nn.relu(%194);
-          %196 = nn.conv2d(%195, meta[relay.Constant][227], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %197 = nn.batch_norm(%196, meta[relay.Constant][228], meta[relay.Constant][229], meta[relay.Constant][230], meta[relay.Constant][231]);
-          %198 = %197.0;
-          %199 = nn.relu(%198);
-          %200 = nn.conv2d(%199, meta[relay.Constant][232], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]);
-          %201 = nn.batch_norm(%200, meta[relay.Constant][233], meta[relay.Constant][234], meta[relay.Constant][235], meta[relay.Constant][236]);
-          %202 = %201.0;
-          %203 = nn.relu(%202);
-          %204 = nn.conv2d(%203, meta[relay.Constant][237], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %205 = add(%204, %192);
-          %206 = nn.batch_norm(%205, meta[relay.Constant][238], meta[relay.Constant][239], meta[relay.Constant][240], meta[relay.Constant][241]);
-          %207 = %206.0;
-          %208 = nn.relu(%207);
-          %209 = nn.conv2d(%208, meta[relay.Constant][242], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %210 = nn.batch_norm(%209, meta[relay.Constant][243], meta[relay.Constant][244], meta[relay.Constant][245], meta[relay.Constant][246]);
-          %211 = %210.0;
-          %212 = nn.relu(%211);
-          %213 = nn.conv2d(%212, meta[relay.Constant][247], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]);
-          %214 = nn.batch_norm(%213, meta[relay.Constant][248], meta[relay.Constant][249], meta[relay.Constant][250], meta[relay.Constant][251]);
-          %215 = %214.0;
-          %216 = nn.relu(%215);
-          %217 = nn.conv2d(%216, meta[relay.Constant][252], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %218 = add(%217, %205);
-          %219 = nn.batch_norm(%218, meta[relay.Constant][253], meta[relay.Constant][254], meta[relay.Constant][255], meta[relay.Constant][256]);
-          %220 = %219.0;
-          %221 = nn.relu(%220);
-          %222 = nn.global_avg_pool2d(%221);
-          %223 = reshape(%222, newshape=[0, -1]);
-          %224 = nn.dense(%223, meta[relay.Constant][257], units=1000);
-          add(%224, meta[relay.Constant][258])
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "resnet50",
-        "input_shapes": {"data": [1, 3, 224, 224]},
-        "input_dtypes": {"data": "float32"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def resnet50_16():
-    metatable = {"relay.Constant": resnet50_consts("float16")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16] {
-          %0 = nn.batch_norm(%data, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]);
-          %1 = %0.0;
-          %2 = nn.conv2d(%1, meta[relay.Constant][4], strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7]);
-          %3 = nn.batch_norm(%2, meta[relay.Constant][5], meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8]);
-          %4 = %3.0;
-          %5 = nn.relu(%4);
-          %6 = nn.max_pool2d(%5, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]);
-          %7 = nn.batch_norm(%6, meta[relay.Constant][9], meta[relay.Constant][10], meta[relay.Constant][11], meta[relay.Constant][12]);
-          %8 = %7.0;
-          %9 = nn.relu(%8);
-          %10 = nn.conv2d(%9, meta[relay.Constant][13], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %11 = nn.batch_norm(%10, meta[relay.Constant][14], meta[relay.Constant][15], meta[relay.Constant][16], meta[relay.Constant][17]);
-          %12 = %11.0;
-          %13 = nn.relu(%12);
-          %14 = nn.conv2d(%13, meta[relay.Constant][18], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-          %15 = nn.batch_norm(%14, meta[relay.Constant][19], meta[relay.Constant][20], meta[relay.Constant][21], meta[relay.Constant][22]);
-          %16 = %15.0;
-          %17 = nn.relu(%16);
-          %18 = nn.conv2d(%17, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %19 = nn.conv2d(%9, meta[relay.Constant][24], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %20 = add(%18, %19);
-          %21 = nn.batch_norm(%20, meta[relay.Constant][25], meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28]);
-          %22 = %21.0;
-          %23 = nn.relu(%22);
-          %24 = nn.conv2d(%23, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %25 = nn.batch_norm(%24, meta[relay.Constant][30], meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33]);
-          %26 = %25.0;
-          %27 = nn.relu(%26);
-          %28 = nn.conv2d(%27, meta[relay.Constant][34], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-          %29 = nn.batch_norm(%28, meta[relay.Constant][35], meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38]);
-          %30 = %29.0;
-          %31 = nn.relu(%30);
-          %32 = nn.conv2d(%31, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %33 = add(%32, %20);
-          %34 = nn.batch_norm(%33, meta[relay.Constant][40], meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43]);
-          %35 = %34.0;
-          %36 = nn.relu(%35);
-          %37 = nn.conv2d(%36, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %38 = nn.batch_norm(%37, meta[relay.Constant][45], meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48]);
-          %39 = %38.0;
-          %40 = nn.relu(%39);
-          %41 = nn.conv2d(%40, meta[relay.Constant][49], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-          %42 = nn.batch_norm(%41, meta[relay.Constant][50], meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53]);
-          %43 = %42.0;
-          %44 = nn.relu(%43);
-          %45 = nn.conv2d(%44, meta[relay.Constant][54], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %46 = add(%45, %33);
-          %47 = nn.batch_norm(%46, meta[relay.Constant][55], meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58]);
-          %48 = %47.0;
-          %49 = nn.relu(%48);
-          %50 = nn.conv2d(%49, meta[relay.Constant][59], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %51 = nn.batch_norm(%50, meta[relay.Constant][60], meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63]);
-          %52 = %51.0;
-          %53 = nn.relu(%52);
-          %54 = nn.conv2d(%53, meta[relay.Constant][64], strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %55 = nn.batch_norm(%54, meta[relay.Constant][65], meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68]);
-          %56 = %55.0;
-          %57 = nn.relu(%56);
-          %58 = nn.conv2d(%57, meta[relay.Constant][69], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %59 = nn.conv2d(%49, meta[relay.Constant][70], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %60 = add(%58, %59);
-          %61 = nn.batch_norm(%60, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]);
-          %62 = %61.0;
-          %63 = nn.relu(%62);
-          %64 = nn.conv2d(%63, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %65 = nn.batch_norm(%64, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]);
-          %66 = %65.0;
-          %67 = nn.relu(%66);
-          %68 = nn.conv2d(%67, meta[relay.Constant][80], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %69 = nn.batch_norm(%68, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]);
-          %70 = %69.0;
-          %71 = nn.relu(%70);
-          %72 = nn.conv2d(%71, meta[relay.Constant][85], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %73 = add(%72, %60);
-          %74 = nn.batch_norm(%73, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]);
-          %75 = %74.0;
-          %76 = nn.relu(%75);
-          %77 = nn.conv2d(%76, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %78 = nn.batch_norm(%77, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]);
-          %79 = %78.0;
-          %80 = nn.relu(%79);
-          %81 = nn.conv2d(%80, meta[relay.Constant][95], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %82 = nn.batch_norm(%81, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]);
-          %83 = %82.0;
-          %84 = nn.relu(%83);
-          %85 = nn.conv2d(%84, meta[relay.Constant][100], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %86 = add(%85, %73);
-          %87 = nn.batch_norm(%86, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]);
-          %88 = %87.0;
-          %89 = nn.relu(%88);
-          %90 = nn.conv2d(%89, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %91 = nn.batch_norm(%90, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]);
-          %92 = %91.0;
-          %93 = nn.relu(%92);
-          %94 = nn.conv2d(%93, meta[relay.Constant][110], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]);
-          %95 = nn.batch_norm(%94, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]);
-          %96 = %95.0;
-          %97 = nn.relu(%96);
-          %98 = nn.conv2d(%97, meta[relay.Constant][115], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %99 = add(%98, %86);
-          %100 = nn.batch_norm(%99, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]);
-          %101 = %100.0;
-          %102 = nn.relu(%101);
-          %103 = nn.conv2d(%102, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %104 = nn.batch_norm(%103, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]);
-          %105 = %104.0;
-          %106 = nn.relu(%105);
-          %107 = nn.conv2d(%106, meta[relay.Constant][125], strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %108 = nn.batch_norm(%107, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]);
-          %109 = %108.0;
-          %110 = nn.relu(%109);
-          %111 = nn.conv2d(%110, meta[relay.Constant][130], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %112 = nn.conv2d(%102, meta[relay.Constant][131], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %113 = add(%111, %112);
-          %114 = nn.batch_norm(%113, meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134], meta[relay.Constant][135]);
-          %115 = %114.0;
-          %116 = nn.relu(%115);
-          %117 = nn.conv2d(%116, meta[relay.Constant][136], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %118 = nn.batch_norm(%117, meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139], meta[relay.Constant][140]);
-          %119 = %118.0;
-          %120 = nn.relu(%119);
-          %121 = nn.conv2d(%120, meta[relay.Constant][141], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %122 = nn.batch_norm(%121, meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144], meta[relay.Constant][145]);
-          %123 = %122.0;
-          %124 = nn.relu(%123);
-          %125 = nn.conv2d(%124, meta[relay.Constant][146], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %126 = add(%125, %113);
-          %127 = nn.batch_norm(%126, meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149], meta[relay.Constant][150]);
-          %128 = %127.0;
-          %129 = nn.relu(%128);
-          %130 = nn.conv2d(%129, meta[relay.Constant][151], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %131 = nn.batch_norm(%130, meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154], meta[relay.Constant][155]);
-          %132 = %131.0;
-          %133 = nn.relu(%132);
-          %134 = nn.conv2d(%133, meta[relay.Constant][156], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %135 = nn.batch_norm(%134, meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159], meta[relay.Constant][160]);
-          %136 = %135.0;
-          %137 = nn.relu(%136);
-          %138 = nn.conv2d(%137, meta[relay.Constant][161], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %139 = add(%138, %126);
-          %140 = nn.batch_norm(%139, meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164], meta[relay.Constant][165]);
-          %141 = %140.0;
-          %142 = nn.relu(%141);
-          %143 = nn.conv2d(%142, meta[relay.Constant][166], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %144 = nn.batch_norm(%143, meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169], meta[relay.Constant][170]);
-          %145 = %144.0;
-          %146 = nn.relu(%145);
-          %147 = nn.conv2d(%146, meta[relay.Constant][171], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %148 = nn.batch_norm(%147, meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174], meta[relay.Constant][175]);
-          %149 = %148.0;
-          %150 = nn.relu(%149);
-          %151 = nn.conv2d(%150, meta[relay.Constant][176], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %152 = add(%151, %139);
-          %153 = nn.batch_norm(%152, meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179], meta[relay.Constant][180]);
-          %154 = %153.0;
-          %155 = nn.relu(%154);
-          %156 = nn.conv2d(%155, meta[relay.Constant][181], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %157 = nn.batch_norm(%156, meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184], meta[relay.Constant][185]);
-          %158 = %157.0;
-          %159 = nn.relu(%158);
-          %160 = nn.conv2d(%159, meta[relay.Constant][186], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %161 = nn.batch_norm(%160, meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189], meta[relay.Constant][190]);
-          %162 = %161.0;
-          %163 = nn.relu(%162);
-          %164 = nn.conv2d(%163, meta[relay.Constant][191], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %165 = add(%164, %152);
-          %166 = nn.batch_norm(%165, meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194], meta[relay.Constant][195]);
-          %167 = %166.0;
-          %168 = nn.relu(%167);
-          %169 = nn.conv2d(%168, meta[relay.Constant][196], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %170 = nn.batch_norm(%169, meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199], meta[relay.Constant][200]);
-          %171 = %170.0;
-          %172 = nn.relu(%171);
-          %173 = nn.conv2d(%172, meta[relay.Constant][201], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]);
-          %174 = nn.batch_norm(%173, meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204], meta[relay.Constant][205]);
-          %175 = %174.0;
-          %176 = nn.relu(%175);
-          %177 = nn.conv2d(%176, meta[relay.Constant][206], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %178 = add(%177, %165);
-          %179 = nn.batch_norm(%178, meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209], meta[relay.Constant][210]);
-          %180 = %179.0;
-          %181 = nn.relu(%180);
-          %182 = nn.conv2d(%181, meta[relay.Constant][211], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %183 = nn.batch_norm(%182, meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214], meta[relay.Constant][215]);
-          %184 = %183.0;
-          %185 = nn.relu(%184);
-          %186 = nn.conv2d(%185, meta[relay.Constant][216], strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]);
-          %187 = nn.batch_norm(%186, meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219], meta[relay.Constant][220]);
-          %188 = %187.0;
-          %189 = nn.relu(%188);
-          %190 = nn.conv2d(%189, meta[relay.Constant][221], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %191 = nn.conv2d(%181, meta[relay.Constant][222], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %192 = add(%190, %191);
-          %193 = nn.batch_norm(%192, meta[relay.Constant][223], meta[relay.Constant][224], meta[relay.Constant][225], meta[relay.Constant][226]);
-          %194 = %193.0;
-          %195 = nn.relu(%194);
-          %196 = nn.conv2d(%195, meta[relay.Constant][227], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %197 = nn.batch_norm(%196, meta[relay.Constant][228], meta[relay.Constant][229], meta[relay.Constant][230], meta[relay.Constant][231]);
-          %198 = %197.0;
-          %199 = nn.relu(%198);
-          %200 = nn.conv2d(%199, meta[relay.Constant][232], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]);
-          %201 = nn.batch_norm(%200, meta[relay.Constant][233], meta[relay.Constant][234], meta[relay.Constant][235], meta[relay.Constant][236]);
-          %202 = %201.0;
-          %203 = nn.relu(%202);
-          %204 = nn.conv2d(%203, meta[relay.Constant][237], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %205 = add(%204, %192);
-          %206 = nn.batch_norm(%205, meta[relay.Constant][238], meta[relay.Constant][239], meta[relay.Constant][240], meta[relay.Constant][241]);
-          %207 = %206.0;
-          %208 = nn.relu(%207);
-          %209 = nn.conv2d(%208, meta[relay.Constant][242], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %210 = nn.batch_norm(%209, meta[relay.Constant][243], meta[relay.Constant][244], meta[relay.Constant][245], meta[relay.Constant][246]);
-          %211 = %210.0;
-          %212 = nn.relu(%211);
-          %213 = nn.conv2d(%212, meta[relay.Constant][247], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]);
-          %214 = nn.batch_norm(%213, meta[relay.Constant][248], meta[relay.Constant][249], meta[relay.Constant][250], meta[relay.Constant][251]);
-          %215 = %214.0;
-          %216 = nn.relu(%215);
-          %217 = nn.conv2d(%216, meta[relay.Constant][252], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %218 = add(%217, %205);
-          %219 = nn.batch_norm(%218, meta[relay.Constant][253], meta[relay.Constant][254], meta[relay.Constant][255], meta[relay.Constant][256]);
-          %220 = %219.0;
-          %221 = nn.relu(%220);
-          %222 = nn.global_avg_pool2d(%221);
-          %223 = reshape(%222, newshape=[0, -1]);
-          %224 = nn.dense(%223, meta[relay.Constant][257], units=1000);
-          add(%224, meta[relay.Constant][258])
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "resnet50_16",
-        "input_shapes": {"data": [1, 3, 224, 224]},
-        "input_dtypes": {"data": "float16"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float16",
-    }
-
-
-def mobilenet_consts(dtype):
-    return make_consts(
-        dtype,
-        [
-            (32, 3, 3, 3),  # 0
-            (32,),  # 1
-            (32,),  # 2
-            (32,),  # 3
-            (32,),  # 4
-            (32, 32, 1, 1),  # 5
-            (32,),  # 6
-            (32,),  # 7
-            (32,),  # 8
-            (32,),  # 9
-            (32, 1, 3, 3),  # 10
-            (32,),  # 11
-            (32,),  # 12
-            (32,),  # 13
-            (32,),  # 14
-            (16, 32, 1, 1),  # 15
-            (16,),  # 16
-            (16,),  # 17
-            (16,),  # 18
-            (16,),  # 19
-            (96, 16, 1, 1),  # 20
-            (96,),  # 21
-            (96,),  # 22
-            (96,),  # 23
-            (96,),  # 24
-            (96, 1, 3, 3),  # 25
-            (96,),  # 26
-            (96,),  # 27
-            (96,),  # 28
-            (96,),  # 29
-            (24, 96, 1, 1),  # 30
-            (24,),  # 31
-            (24,),  # 32
-            (24,),  # 33
-            (24,),  # 34
-            (144, 24, 1, 1),  # 35
-            (144,),  # 36
-            (144,),  # 37
-            (144,),  # 38
-            (144,),  # 39
-            (144, 1, 3, 3),  # 40
-            (144,),  # 41
-            (144,),  # 42
-            (144,),  # 43
-            (144,),  # 44
-            (24, 144, 1, 1),  # 45
-            (24,),  # 46
-            (24,),  # 47
-            (24,),  # 48
-            (24,),  # 49
-            (144, 24, 1, 1),  # 50
-            (144,),  # 51
-            (144,),  # 52
-            (144,),  # 53
-            (144,),  # 54
-            (144, 1, 3, 3),  # 55
-            (144,),  # 56
-            (144,),  # 57
-            (144,),  # 58
-            (144,),  # 59
-            (32, 144, 1, 1),  # 60
-            (32,),  # 61
-            (32,),  # 62
-            (32,),  # 63
-            (32,),  # 64
-            (192, 32, 1, 1),  # 65
-            (192,),  # 66
-            (192,),  # 67
-            (192,),  # 68
-            (192,),  # 69
-            (192, 1, 3, 3),  # 70
-            (192,),  # 71
-            (192,),  # 72
-            (192,),  # 73
-            (192,),  # 74
-            (32, 192, 1, 1),  # 75
-            (32,),  # 76
-            (32,),  # 77
-            (32,),  # 78
-            (32,),  # 79
-            (192, 32, 1, 1),  # 80
-            (192,),  # 81
-            (192,),  # 82
-            (192,),  # 83
-            (192,),  # 84
-            (192, 1, 3, 3),  # 85
-            (192,),  # 86
-            (192,),  # 87
-            (192,),  # 88
-            (192,),  # 89
-            (32, 192, 1, 1),  # 90
-            (32,),  # 91
-            (32,),  # 92
-            (32,),  # 93
-            (32,),  # 94
-            (192, 32, 1, 1),  # 95
-            (192,),  # 96
-            (192,),  # 97
-            (192,),  # 98
-            (192,),  # 99
-            (192, 1, 3, 3),  # 100
-            (192,),  # 101
-            (192,),  # 102
-            (192,),  # 103
-            (192,),  # 104
-            (64, 192, 1, 1),  # 105
-            (64,),  # 106
-            (64,),  # 107
-            (64,),  # 108
-            (64,),  # 109
-            (384, 64, 1, 1),  # 110
-            (384,),  # 111
-            (384,),  # 112
-            (384,),  # 113
-            (384,),  # 114
-            (384, 1, 3, 3),  # 115
-            (384,),  # 116
-            (384,),  # 117
-            (384,),  # 118
-            (384,),  # 119
-            (64, 384, 1, 1),  # 120
-            (64,),  # 121
-            (64,),  # 122
-            (64,),  # 123
-            (64,),  # 124
-            (384, 64, 1, 1),  # 125
-            (384,),  # 126
-            (384,),  # 127
-            (384,),  # 128
-            (384,),  # 129
-            (384, 1, 3, 3),  # 130
-            (384,),  # 131
-            (384,),  # 132
-            (384,),  # 133
-            (384,),  # 134
-            (64, 384, 1, 1),  # 135
-            (64,),  # 136
-            (64,),  # 137
-            (64,),  # 138
-            (64,),  # 139
-            (384, 64, 1, 1),  # 140
-            (384,),  # 141
-            (384,),  # 142
-            (384,),  # 143
-            (384,),  # 144
-            (384, 1, 3, 3),  # 145
-            (384,),  # 146
-            (384,),  # 147
-            (384,),  # 148
-            (384,),  # 149
-            (64, 384, 1, 1),  # 150
-            (64,),  # 151
-            (64,),  # 152
-            (64,),  # 153
-            (64,),  # 154
-            (384, 64, 1, 1),  # 155
-            (384,),  # 156
-            (384,),  # 157
-            (384,),  # 158
-            (384,),  # 159
-            (384, 1, 3, 3),  # 160
-            (384,),  # 161
-            (384,),  # 162
-            (384,),  # 163
-            (384,),  # 164
-            (96, 384, 1, 1),  # 165
-            (96,),  # 166
-            (96,),  # 167
-            (96,),  # 168
-            (96,),  # 169
-            (576, 96, 1, 1),  # 170
-            (576,),  # 171
-            (576,),  # 172
-            (576,),  # 173
-            (576,),  # 174
-            (576, 1, 3, 3),  # 175
-            (576,),  # 176
-            (576,),  # 177
-            (576,),  # 178
-            (576,),  # 179
-            (96, 576, 1, 1),  # 180
-            (96,),  # 181
-            (96,),  # 182
-            (96,),  # 183
-            (96,),  # 184
-            (576, 96, 1, 1),  # 185
-            (576,),  # 186
-            (576,),  # 187
-            (576,),  # 188
-            (576,),  # 189
-            (576, 1, 3, 3),  # 190
-            (576,),  # 191
-            (576,),  # 192
-            (576,),  # 193
-            (576,),  # 194
-            (96, 576, 1, 1),  # 195
-            (96,),  # 196
-            (96,),  # 197
-            (96,),  # 198
-            (96,),  # 199
-            (576, 96, 1, 1),  # 200
-            (576,),  # 201
-            (576,),  # 202
-            (576,),  # 203
-            (576,),  # 204
-            (576, 1, 3, 3),  # 205
-            (576,),  # 206
-            (576,),  # 207
-            (576,),  # 208
-            (576,),  # 209
-            (160, 576, 1, 1),  # 210
-            (160,),  # 211
-            (160,),  # 212
-            (160,),  # 213
-            (160,),  # 214
-            (960, 160, 1, 1),  # 215
-            (960,),  # 216
-            (960,),  # 217
-            (960,),  # 218
-            (960,),  # 219
-            (960, 1, 3, 3),  # 220
-            (960,),  # 221
-            (960,),  # 222
-            (960,),  # 223
-            (960,),  # 224
-            (160, 960, 1, 1),  # 225
-            (160,),  # 226
-            (160,),  # 227
-            (160,),  # 228
-            (160,),  # 229
-            (960, 160, 1, 1),  # 230
-            (960,),  # 231
-            (960,),  # 232
-            (960,),  # 233
-            (960,),  # 234
-            (960, 1, 3, 3),  # 235
-            (960,),  # 236
-            (960,),  # 237
-            (960,),  # 238
-            (960,),  # 239
-            (160, 960, 1, 1),  # 240
-            (160,),  # 241
-            (160,),  # 242
-            (160,),  # 243
-            (160,),  # 244
-            (960, 160, 1, 1),  # 245
-            (960,),  # 246
-            (960,),  # 247
-            (960,),  # 248
-            (960,),  # 249
-            (960, 1, 3, 3),  # 250
-            (960,),  # 251
-            (960,),  # 252
-            (960,),  # 253
-            (960,),  # 254
-            (320, 960, 1, 1),  # 255
-            (320,),  # 256
-            (320,),  # 257
-            (320,),  # 258
-            (320,),  # 259
-            (1280, 320, 1, 1),  # 260
-            (1280,),  # 261
-            (1280,),  # 262
-            (1280,),  # 263
-            (1280,),  # 264
-            (1000, 1280, 1, 1),  # 265
-        ],
-    )
-
-
-def mobilenet():
-    metatable = {"relay.Constant": mobilenet_consts("float32")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] {
-          %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3]);
-          %1 = nn.batch_norm(%0, meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4]);
-          %2 = %1.0;
-          %3 = nn.relu(%2);
-          %4 = nn.conv2d(%3, meta[relay.Constant][5], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %5 = nn.batch_norm(%4, meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8], meta[relay.Constant][9]);
-          %6 = %5.0;
-          %7 = nn.relu(%6);
-          %8 = nn.conv2d(%7, meta[relay.Constant][10], padding=[1, 1, 1, 1], groups=32, channels=32, kernel_size=[3, 3]);
-          %9 = nn.batch_norm(%8, meta[relay.Constant][11], meta[relay.Constant][12], meta[relay.Constant][13], meta[relay.Constant][14]);
-          %10 = %9.0;
-          %11 = nn.relu(%10);
-          %12 = nn.conv2d(%11, meta[relay.Constant][15], padding=[0, 0, 0, 0], channels=16, kernel_size=[1, 1]);
-          %13 = nn.batch_norm(%12, meta[relay.Constant][16], meta[relay.Constant][17], meta[relay.Constant][18], meta[relay.Constant][19]);
-          %14 = %13.0;
-          %15 = nn.conv2d(%14, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %16 = nn.batch_norm(%15, meta[relay.Constant][21], meta[relay.Constant][22], meta[relay.Constant][23], meta[relay.Constant][24]);
-          %17 = %16.0;
-          %18 = nn.relu(%17);
-          %19 = nn.conv2d(%18, meta[relay.Constant][25], strides=[2, 2], padding=[1, 1, 1, 1], groups=96, channels=96, kernel_size=[3, 3]);
-          %20 = nn.batch_norm(%19, meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28], meta[relay.Constant][29]);
-          %21 = %20.0;
-          %22 = nn.relu(%21);
-          %23 = nn.conv2d(%22, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]);
-          %24 = nn.batch_norm(%23, meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33], meta[relay.Constant][34]);
-          %25 = %24.0;
-          %26 = nn.conv2d(%25, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]);
-          %27 = nn.batch_norm(%26, meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38], meta[relay.Constant][39]);
-          %28 = %27.0;
-          %29 = nn.relu(%28);
-          %30 = nn.conv2d(%29, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]);
-          %31 = nn.batch_norm(%30, meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43], meta[relay.Constant][44]);
-          %32 = %31.0;
-          %33 = nn.relu(%32);
-          %34 = nn.conv2d(%33, meta[relay.Constant][45], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]);
-          %35 = nn.batch_norm(%34, meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48], meta[relay.Constant][49]);
-          %36 = %35.0;
-          %37 = add(%36, %25);
-          %38 = nn.conv2d(%37, meta[relay.Constant][50], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]);
-          %39 = nn.batch_norm(%38, meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53], meta[relay.Constant][54]);
-          %40 = %39.0;
-          %41 = nn.relu(%40);
-          %42 = nn.conv2d(%41, meta[relay.Constant][55], strides=[2, 2], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]);
-          %43 = nn.batch_norm(%42, meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58], meta[relay.Constant][59]);
-          %44 = %43.0;
-          %45 = nn.relu(%44);
-          %46 = nn.conv2d(%45, meta[relay.Constant][60], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %47 = nn.batch_norm(%46, meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63], meta[relay.Constant][64]);
-          %48 = %47.0;
-          %49 = nn.conv2d(%48, meta[relay.Constant][65], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]);
-          %50 = nn.batch_norm(%49, meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68], meta[relay.Constant][69]);
-          %51 = %50.0;
-          %52 = nn.relu(%51);
-          %53 = nn.conv2d(%52, meta[relay.Constant][70], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]);
-          %54 = nn.batch_norm(%53, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]);
-          %55 = %54.0;
-          %56 = nn.relu(%55);
-          %57 = nn.conv2d(%56, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %58 = nn.batch_norm(%57, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]);
-          %59 = %58.0;
-          %60 = add(%59, %48);
-          %61 = nn.conv2d(%60, meta[relay.Constant][80], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]);
-          %62 = nn.batch_norm(%61, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]);
-          %63 = %62.0;
-          %64 = nn.relu(%63);
-          %65 = nn.conv2d(%64, meta[relay.Constant][85], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]);
-          %66 = nn.batch_norm(%65, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]);
-          %67 = %66.0;
-          %68 = nn.relu(%67);
-          %69 = nn.conv2d(%68, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %70 = nn.batch_norm(%69, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]);
-          %71 = %70.0;
-          %72 = add(%71, %60);
-          %73 = nn.conv2d(%72, meta[relay.Constant][95], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]);
-          %74 = nn.batch_norm(%73, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]);
-          %75 = %74.0;
-          %76 = nn.relu(%75);
-          %77 = nn.conv2d(%76, meta[relay.Constant][100], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]);
-          %78 = nn.batch_norm(%77, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]);
-          %79 = %78.0;
-          %80 = nn.relu(%79);
-          %81 = nn.conv2d(%80, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %82 = nn.batch_norm(%81, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]);
-          %83 = %82.0;
-          %84 = nn.conv2d(%83, meta[relay.Constant][110], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %85 = nn.batch_norm(%84, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]);
-          %86 = %85.0;
-          %87 = nn.relu(%86);
-          %88 = nn.conv2d(%87, meta[relay.Constant][115], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %89 = nn.batch_norm(%88, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]);
-          %90 = %89.0;
-          %91 = nn.relu(%90);
-          %92 = nn.conv2d(%91, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %93 = nn.batch_norm(%92, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]);
-          %94 = %93.0;
-          %95 = add(%94, %83);
-          %96 = nn.conv2d(%95, meta[relay.Constant][125], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %97 = nn.batch_norm(%96, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]);
-          %98 = %97.0;
-          %99 = nn.relu(%98);
-          %100 = nn.conv2d(%99, meta[relay.Constant][130], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %101 = nn.batch_norm(%100, meta[relay.Constant][131], meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134]);
-          %102 = %101.0;
-          %103 = nn.relu(%102);
-          %104 = nn.conv2d(%103, meta[relay.Constant][135], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %105 = nn.batch_norm(%104, meta[relay.Constant][136], meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139]);
-          %106 = %105.0;
-          %107 = add(%106, %95);
-          %108 = nn.conv2d(%107, meta[relay.Constant][140], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %109 = nn.batch_norm(%108, meta[relay.Constant][141], meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144]);
-          %110 = %109.0;
-          %111 = nn.relu(%110);
-          %112 = nn.conv2d(%111, meta[relay.Constant][145], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %113 = nn.batch_norm(%112, meta[relay.Constant][146], meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149]);
-          %114 = %113.0;
-          %115 = nn.relu(%114);
-          %116 = nn.conv2d(%115, meta[relay.Constant][150], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %117 = nn.batch_norm(%116, meta[relay.Constant][151], meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154]);
-          %118 = %117.0;
-          %119 = add(%118, %107);
-          %120 = nn.conv2d(%119, meta[relay.Constant][155], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %121 = nn.batch_norm(%120, meta[relay.Constant][156], meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159]);
-          %122 = %121.0;
-          %123 = nn.relu(%122);
-          %124 = nn.conv2d(%123, meta[relay.Constant][160], strides=[2, 2], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %125 = nn.batch_norm(%124, meta[relay.Constant][161], meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164]);
-          %126 = %125.0;
-          %127 = nn.relu(%126);
-          %128 = nn.conv2d(%127, meta[relay.Constant][165], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %129 = nn.batch_norm(%128, meta[relay.Constant][166], meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169]);
-          %130 = %129.0;
-          %131 = nn.conv2d(%130, meta[relay.Constant][170], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]);
-          %132 = nn.batch_norm(%131, meta[relay.Constant][171], meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174]);
-          %133 = %132.0;
-          %134 = nn.relu(%133);
-          %135 = nn.conv2d(%134, meta[relay.Constant][175], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]);
-          %136 = nn.batch_norm(%135, meta[relay.Constant][176], meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179]);
-          %137 = %136.0;
-          %138 = nn.relu(%137);
-          %139 = nn.conv2d(%138, meta[relay.Constant][180], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %140 = nn.batch_norm(%139, meta[relay.Constant][181], meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184]);
-          %141 = %140.0;
-          %142 = add(%141, %130);
-          %143 = nn.conv2d(%142, meta[relay.Constant][185], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]);
-          %144 = nn.batch_norm(%143, meta[relay.Constant][186], meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189]);
-          %145 = %144.0;
-          %146 = nn.relu(%145);
-          %147 = nn.conv2d(%146, meta[relay.Constant][190], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]);
-          %148 = nn.batch_norm(%147, meta[relay.Constant][191], meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194]);
-          %149 = %148.0;
-          %150 = nn.relu(%149);
-          %151 = nn.conv2d(%150, meta[relay.Constant][195], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %152 = nn.batch_norm(%151, meta[relay.Constant][196], meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199]);
-          %153 = %152.0;
-          %154 = add(%153, %142);
-          %155 = nn.conv2d(%154, meta[relay.Constant][200], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]);
-          %156 = nn.batch_norm(%155, meta[relay.Constant][201], meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204]);
-          %157 = %156.0;
-          %158 = nn.relu(%157);
-          %159 = nn.conv2d(%158, meta[relay.Constant][205], strides=[2, 2], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]);
-          %160 = nn.batch_norm(%159, meta[relay.Constant][206], meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209]);
-          %161 = %160.0;
-          %162 = nn.relu(%161);
-          %163 = nn.conv2d(%162, meta[relay.Constant][210], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]);
-          %164 = nn.batch_norm(%163, meta[relay.Constant][211], meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214]);
-          %165 = %164.0;
-          %166 = nn.conv2d(%165, meta[relay.Constant][215], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]);
-          %167 = nn.batch_norm(%166, meta[relay.Constant][216], meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219]);
-          %168 = %167.0;
-          %169 = nn.relu(%168);
-          %170 = nn.conv2d(%169, meta[relay.Constant][220], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]);
-          %171 = nn.batch_norm(%170, meta[relay.Constant][221], meta[relay.Constant][222], meta[relay.Constant][223], meta[relay.Constant][224]);
-          %172 = %171.0;
-          %173 = nn.relu(%172);
-          %174 = nn.conv2d(%173, meta[relay.Constant][225], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]);
-          %175 = nn.batch_norm(%174, meta[relay.Constant][226], meta[relay.Constant][227], meta[relay.Constant][228], meta[relay.Constant][229]);
-          %176 = %175.0;
-          %177 = add(%176, %165);
-          %178 = nn.conv2d(%177, meta[relay.Constant][230], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]);
-          %179 = nn.batch_norm(%178, meta[relay.Constant][231], meta[relay.Constant][232], meta[relay.Constant][233], meta[relay.Constant][234]);
-          %180 = %179.0;
-          %181 = nn.relu(%180);
-          %182 = nn.conv2d(%181, meta[relay.Constant][235], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]);
-          %183 = nn.batch_norm(%182, meta[relay.Constant][236], meta[relay.Constant][237], meta[relay.Constant][238], meta[relay.Constant][239]);
-          %184 = %183.0;
-          %185 = nn.relu(%184);
-          %186 = nn.conv2d(%185, meta[relay.Constant][240], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]);
-          %187 = nn.batch_norm(%186, meta[relay.Constant][241], meta[relay.Constant][242], meta[relay.Constant][243], meta[relay.Constant][244]);
-          %188 = %187.0;
-          %189 = add(%188, %177);
-          %190 = nn.conv2d(%189, meta[relay.Constant][245], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]);
-          %191 = nn.batch_norm(%190, meta[relay.Constant][246], meta[relay.Constant][247], meta[relay.Constant][248], meta[relay.Constant][249]);
-          %192 = %191.0;
-          %193 = nn.relu(%192);
-          %194 = nn.conv2d(%193, meta[relay.Constant][250], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]);
-          %195 = nn.batch_norm(%194, meta[relay.Constant][251], meta[relay.Constant][252], meta[relay.Constant][253], meta[relay.Constant][254]);
-          %196 = %195.0;
-          %197 = nn.relu(%196);
-          %198 = nn.conv2d(%197, meta[relay.Constant][255], padding=[0, 0, 0, 0], channels=320, kernel_size=[1, 1]);
-          %199 = nn.batch_norm(%198, meta[relay.Constant][256], meta[relay.Constant][257], meta[relay.Constant][258], meta[relay.Constant][259]);
-          %200 = %199.0;
-          %201 = nn.conv2d(%200, meta[relay.Constant][260], padding=[0, 0, 0, 0], channels=1280, kernel_size=[1, 1]);
-          %202 = nn.batch_norm(%201, meta[relay.Constant][261], meta[relay.Constant][262], meta[relay.Constant][263], meta[relay.Constant][264]);
-          %203 = %202.0;
-          %204 = nn.relu(%203);
-          %205 = nn.global_avg_pool2d(%204);
-          %206 = nn.conv2d(%205, meta[relay.Constant][265], padding=[0, 0, 0, 0], channels=1000, kernel_size=[1, 1]);
-          reshape(%206, newshape=[0, -1])
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "mobilenet",
-        "input_shapes": {"data": [1, 3, 224, 224]},
-        "input_dtypes": {"data": "float32"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def mobilenet_16():
-    metatable = {"relay.Constant": mobilenet_consts("float16")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16] {
-          %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3]);
-          %1 = nn.batch_norm(%0, meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4]);
-          %2 = %1.0;
-          %3 = nn.relu(%2);
-          %4 = nn.conv2d(%3, meta[relay.Constant][5], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %5 = nn.batch_norm(%4, meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8], meta[relay.Constant][9]);
-          %6 = %5.0;
-          %7 = nn.relu(%6);
-          %8 = nn.conv2d(%7, meta[relay.Constant][10], padding=[1, 1, 1, 1], groups=32, channels=32, kernel_size=[3, 3]);
-          %9 = nn.batch_norm(%8, meta[relay.Constant][11], meta[relay.Constant][12], meta[relay.Constant][13], meta[relay.Constant][14]);
-          %10 = %9.0;
-          %11 = nn.relu(%10);
-          %12 = nn.conv2d(%11, meta[relay.Constant][15], padding=[0, 0, 0, 0], channels=16, kernel_size=[1, 1]);
-          %13 = nn.batch_norm(%12, meta[relay.Constant][16], meta[relay.Constant][17], meta[relay.Constant][18], meta[relay.Constant][19]);
-          %14 = %13.0;
-          %15 = nn.conv2d(%14, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %16 = nn.batch_norm(%15, meta[relay.Constant][21], meta[relay.Constant][22], meta[relay.Constant][23], meta[relay.Constant][24]);
-          %17 = %16.0;
-          %18 = nn.relu(%17);
-          %19 = nn.conv2d(%18, meta[relay.Constant][25], strides=[2, 2], padding=[1, 1, 1, 1], groups=96, channels=96, kernel_size=[3, 3]);
-          %20 = nn.batch_norm(%19, meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28], meta[relay.Constant][29]);
-          %21 = %20.0;
-          %22 = nn.relu(%21);
-          %23 = nn.conv2d(%22, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]);
-          %24 = nn.batch_norm(%23, meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33], meta[relay.Constant][34]);
-          %25 = %24.0;
-          %26 = nn.conv2d(%25, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]);
-          %27 = nn.batch_norm(%26, meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38], meta[relay.Constant][39]);
-          %28 = %27.0;
-          %29 = nn.relu(%28);
-          %30 = nn.conv2d(%29, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]);
-          %31 = nn.batch_norm(%30, meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43], meta[relay.Constant][44]);
-          %32 = %31.0;
-          %33 = nn.relu(%32);
-          %34 = nn.conv2d(%33, meta[relay.Constant][45], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]);
-          %35 = nn.batch_norm(%34, meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48], meta[relay.Constant][49]);
-          %36 = %35.0;
-          %37 = add(%36, %25);
-          %38 = nn.conv2d(%37, meta[relay.Constant][50], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]);
-          %39 = nn.batch_norm(%38, meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53], meta[relay.Constant][54]);
-          %40 = %39.0;
-          %41 = nn.relu(%40);
-          %42 = nn.conv2d(%41, meta[relay.Constant][55], strides=[2, 2], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]);
-          %43 = nn.batch_norm(%42, meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58], meta[relay.Constant][59]);
-          %44 = %43.0;
-          %45 = nn.relu(%44);
-          %46 = nn.conv2d(%45, meta[relay.Constant][60], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %47 = nn.batch_norm(%46, meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63], meta[relay.Constant][64]);
-          %48 = %47.0;
-          %49 = nn.conv2d(%48, meta[relay.Constant][65], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]);
-          %50 = nn.batch_norm(%49, meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68], meta[relay.Constant][69]);
-          %51 = %50.0;
-          %52 = nn.relu(%51);
-          %53 = nn.conv2d(%52, meta[relay.Constant][70], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]);
-          %54 = nn.batch_norm(%53, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]);
-          %55 = %54.0;
-          %56 = nn.relu(%55);
-          %57 = nn.conv2d(%56, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %58 = nn.batch_norm(%57, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]);
-          %59 = %58.0;
-          %60 = add(%59, %48);
-          %61 = nn.conv2d(%60, meta[relay.Constant][80], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]);
-          %62 = nn.batch_norm(%61, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]);
-          %63 = %62.0;
-          %64 = nn.relu(%63);
-          %65 = nn.conv2d(%64, meta[relay.Constant][85], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]);
-          %66 = nn.batch_norm(%65, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]);
-          %67 = %66.0;
-          %68 = nn.relu(%67);
-          %69 = nn.conv2d(%68, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]);
-          %70 = nn.batch_norm(%69, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]);
-          %71 = %70.0;
-          %72 = add(%71, %60);
-          %73 = nn.conv2d(%72, meta[relay.Constant][95], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]);
-          %74 = nn.batch_norm(%73, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]);
-          %75 = %74.0;
-          %76 = nn.relu(%75);
-          %77 = nn.conv2d(%76, meta[relay.Constant][100], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]);
-          %78 = nn.batch_norm(%77, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]);
-          %79 = %78.0;
-          %80 = nn.relu(%79);
-          %81 = nn.conv2d(%80, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %82 = nn.batch_norm(%81, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]);
-          %83 = %82.0;
-          %84 = nn.conv2d(%83, meta[relay.Constant][110], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %85 = nn.batch_norm(%84, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]);
-          %86 = %85.0;
-          %87 = nn.relu(%86);
-          %88 = nn.conv2d(%87, meta[relay.Constant][115], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %89 = nn.batch_norm(%88, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]);
-          %90 = %89.0;
-          %91 = nn.relu(%90);
-          %92 = nn.conv2d(%91, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %93 = nn.batch_norm(%92, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]);
-          %94 = %93.0;
-          %95 = add(%94, %83);
-          %96 = nn.conv2d(%95, meta[relay.Constant][125], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %97 = nn.batch_norm(%96, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]);
-          %98 = %97.0;
-          %99 = nn.relu(%98);
-          %100 = nn.conv2d(%99, meta[relay.Constant][130], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %101 = nn.batch_norm(%100, meta[relay.Constant][131], meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134]);
-          %102 = %101.0;
-          %103 = nn.relu(%102);
-          %104 = nn.conv2d(%103, meta[relay.Constant][135], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %105 = nn.batch_norm(%104, meta[relay.Constant][136], meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139]);
-          %106 = %105.0;
-          %107 = add(%106, %95);
-          %108 = nn.conv2d(%107, meta[relay.Constant][140], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %109 = nn.batch_norm(%108, meta[relay.Constant][141], meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144]);
-          %110 = %109.0;
-          %111 = nn.relu(%110);
-          %112 = nn.conv2d(%111, meta[relay.Constant][145], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %113 = nn.batch_norm(%112, meta[relay.Constant][146], meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149]);
-          %114 = %113.0;
-          %115 = nn.relu(%114);
-          %116 = nn.conv2d(%115, meta[relay.Constant][150], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]);
-          %117 = nn.batch_norm(%116, meta[relay.Constant][151], meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154]);
-          %118 = %117.0;
-          %119 = add(%118, %107);
-          %120 = nn.conv2d(%119, meta[relay.Constant][155], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]);
-          %121 = nn.batch_norm(%120, meta[relay.Constant][156], meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159]);
-          %122 = %121.0;
-          %123 = nn.relu(%122);
-          %124 = nn.conv2d(%123, meta[relay.Constant][160], strides=[2, 2], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]);
-          %125 = nn.batch_norm(%124, meta[relay.Constant][161], meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164]);
-          %126 = %125.0;
-          %127 = nn.relu(%126);
-          %128 = nn.conv2d(%127, meta[relay.Constant][165], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %129 = nn.batch_norm(%128, meta[relay.Constant][166], meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169]);
-          %130 = %129.0;
-          %131 = nn.conv2d(%130, meta[relay.Constant][170], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]);
-          %132 = nn.batch_norm(%131, meta[relay.Constant][171], meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174]);
-          %133 = %132.0;
-          %134 = nn.relu(%133);
-          %135 = nn.conv2d(%134, meta[relay.Constant][175], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]);
-          %136 = nn.batch_norm(%135, meta[relay.Constant][176], meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179]);
-          %137 = %136.0;
-          %138 = nn.relu(%137);
-          %139 = nn.conv2d(%138, meta[relay.Constant][180], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %140 = nn.batch_norm(%139, meta[relay.Constant][181], meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184]);
-          %141 = %140.0;
-          %142 = add(%141, %130);
-          %143 = nn.conv2d(%142, meta[relay.Constant][185], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]);
-          %144 = nn.batch_norm(%143, meta[relay.Constant][186], meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189]);
-          %145 = %144.0;
-          %146 = nn.relu(%145);
-          %147 = nn.conv2d(%146, meta[relay.Constant][190], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]);
-          %148 = nn.batch_norm(%147, meta[relay.Constant][191], meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194]);
-          %149 = %148.0;
-          %150 = nn.relu(%149);
-          %151 = nn.conv2d(%150, meta[relay.Constant][195], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]);
-          %152 = nn.batch_norm(%151, meta[relay.Constant][196], meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199]);
-          %153 = %152.0;
-          %154 = add(%153, %142);
-          %155 = nn.conv2d(%154, meta[relay.Constant][200], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]);
-          %156 = nn.batch_norm(%155, meta[relay.Constant][201], meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204]);
-          %157 = %156.0;
-          %158 = nn.relu(%157);
-          %159 = nn.conv2d(%158, meta[relay.Constant][205], strides=[2, 2], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]);
-          %160 = nn.batch_norm(%159, meta[relay.Constant][206], meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209]);
-          %161 = %160.0;
-          %162 = nn.relu(%161);
-          %163 = nn.conv2d(%162, meta[relay.Constant][210], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]);
-          %164 = nn.batch_norm(%163, meta[relay.Constant][211], meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214]);
-          %165 = %164.0;
-          %166 = nn.conv2d(%165, meta[relay.Constant][215], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]);
-          %167 = nn.batch_norm(%166, meta[relay.Constant][216], meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219]);
-          %168 = %167.0;
-          %169 = nn.relu(%168);
-          %170 = nn.conv2d(%169, meta[relay.Constant][220], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]);
-          %171 = nn.batch_norm(%170, meta[relay.Constant][221], meta[relay.Constant][222], meta[relay.Constant][223], meta[relay.Constant][224]);
-          %172 = %171.0;
-          %173 = nn.relu(%172);
-          %174 = nn.conv2d(%173, meta[relay.Constant][225], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]);
-          %175 = nn.batch_norm(%174, meta[relay.Constant][226], meta[relay.Constant][227], meta[relay.Constant][228], meta[relay.Constant][229]);
-          %176 = %175.0;
-          %177 = add(%176, %165);
-          %178 = nn.conv2d(%177, meta[relay.Constant][230], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]);
-          %179 = nn.batch_norm(%178, meta[relay.Constant][231], meta[relay.Constant][232], meta[relay.Constant][233], meta[relay.Constant][234]);
-          %180 = %179.0;
-          %181 = nn.relu(%180);
-          %182 = nn.conv2d(%181, meta[relay.Constant][235], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]);
-          %183 = nn.batch_norm(%182, meta[relay.Constant][236], meta[relay.Constant][237], meta[relay.Constant][238], meta[relay.Constant][239]);
-          %184 = %183.0;
-          %185 = nn.relu(%184);
-          %186 = nn.conv2d(%185, meta[relay.Constant][240], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]);
-          %187 = nn.batch_norm(%186, meta[relay.Constant][241], meta[relay.Constant][242], meta[relay.Constant][243], meta[relay.Constant][244]);
-          %188 = %187.0;
-          %189 = add(%188, %177);
-          %190 = nn.conv2d(%189, meta[relay.Constant][245], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]);
-          %191 = nn.batch_norm(%190, meta[relay.Constant][246], meta[relay.Constant][247], meta[relay.Constant][248], meta[relay.Constant][249]);
-          %192 = %191.0;
-          %193 = nn.relu(%192);
-          %194 = nn.conv2d(%193, meta[relay.Constant][250], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]);
-          %195 = nn.batch_norm(%194, meta[relay.Constant][251], meta[relay.Constant][252], meta[relay.Constant][253], meta[relay.Constant][254]);
-          %196 = %195.0;
-          %197 = nn.relu(%196);
-          %198 = nn.conv2d(%197, meta[relay.Constant][255], padding=[0, 0, 0, 0], channels=320, kernel_size=[1, 1]);
-          %199 = nn.batch_norm(%198, meta[relay.Constant][256], meta[relay.Constant][257], meta[relay.Constant][258], meta[relay.Constant][259]);
-          %200 = %199.0;
-          %201 = nn.conv2d(%200, meta[relay.Constant][260], padding=[0, 0, 0, 0], channels=1280, kernel_size=[1, 1]);
-          %202 = nn.batch_norm(%201, meta[relay.Constant][261], meta[relay.Constant][262], meta[relay.Constant][263], meta[relay.Constant][264]);
-          %203 = %202.0;
-          %204 = nn.relu(%203);
-          %205 = nn.global_avg_pool2d(%204);
-          %206 = nn.conv2d(%205, meta[relay.Constant][265], padding=[0, 0, 0, 0], channels=1000, kernel_size=[1, 1]);
-          reshape(%206, newshape=[0, -1])
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "mobilenet_16",
-        "input_shapes": {"data": [1, 3, 224, 224]},
-        "input_dtypes": {"data": "float16"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float16",
-    }
-
-
-def batch_norm_extract():
-    consts = make_consts(
-        "float32",
-        [
-            (32,),  # 0
-            (32,),  # 1
-            (32,),  # 2
-            (32,),  # 3
-        ],
-    )
-    metatable = {"relay.Constant": consts}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%FunctionVar_0: Tensor[(1, 32, 112, 112), float32]) -> Tensor[(1, 32, 112, 112), float32] {
-          %3 = nn.batch_norm(%FunctionVar_0, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]);
-          %3.0
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "batch_norm_extract",
-        "input_shapes": {"FunctionVar_0": [1, 32, 112, 112]},
-        "input_dtypes": {"FunctionVar_0": "float32"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def resnext50_32x4d_consts(dtype):
-    return make_consts(
-        dtype,
-        [
-            (128, 64, 1, 1),  # 0
-            (128, 4, 3, 3),  # 1
-            (256, 128, 1, 1),  # 2
-            (256, 64, 1, 1),  # 3
-            (128, 256, 1, 1),  # 4
-            (128, 4, 3, 3),  # 5
-            (256, 128, 1, 1),  # 6
-            (128, 256, 1, 1),  # 7
-            (128, 4, 3, 3),  # 8
-            (256, 128, 1, 1),  # 9
-            (256, 256, 1, 1),  # 10
-            (256, 8, 3, 3),  # 11
-            (512, 256, 1, 1),  # 12
-            (512, 256, 1, 1),  # 13
-            (256, 512, 1, 1),  # 14
-            (256, 8, 3, 3),  # 15
-            (512, 256, 1, 1),  # 16
-            (256, 512, 1, 1),  # 17
-            (256, 8, 3, 3),  # 18
-            (512, 256, 1, 1),  # 19
-            (256, 512, 1, 1),  # 20
-            (256, 8, 3, 3),  # 21
-            (512, 256, 1, 1),  # 22
-            (512, 512, 1, 1),  # 23
-            (512, 16, 3, 3),  # 24
-            (1024, 512, 1, 1),  # 25
-            (1024, 512, 1, 1),  # 26
-            (512, 1024, 1, 1),  # 27
-            (512, 16, 3, 3),  # 28
-            (1024, 512, 1, 1),  # 29
-            (512, 1024, 1, 1),  # 30
-            (512, 16, 3, 3),  # 31
-            (1024, 512, 1, 1),  # 32
-            (512, 1024, 1, 1),  # 33
-            (512, 16, 3, 3),  # 34
-            (1024, 512, 1, 1),  # 35
-            (512, 1024, 1, 1),  # 36
-            (512, 16, 3, 3),  # 37
-            (1024, 512, 1, 1),  # 38
-            (512, 1024, 1, 1),  # 39
-            (512, 16, 3, 3),  # 40
-            (1024, 512, 1, 1),  # 41
-            (1024, 1024, 1, 1),  # 42
-            (1024, 32, 3, 3),  # 43
-            (2048, 1024, 1, 1),  # 44
-            (2048, 1024, 1, 1),  # 45
-            (1024, 2048, 1, 1),  # 46
-            (1024, 32, 3, 3),  # 47
-            (2048, 1024, 1, 1),  # 48
-            (1024, 2048, 1, 1),  # 49
-            (1024, 32, 3, 3),  # 50
-            (2048, 1024, 1, 1),  # 51
-        ],
-    )
-
-
-def resnext50_32x4d():
-    metatable = {"relay.Constant": resnext50_32x4d_consts("float32")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1, 64, 56, 56), float32]) {
-          %0 = nn.conv2d(%x, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %1 = nn.relu(%0);
-          %2 = nn.conv2d(%1, meta[relay.Constant][1], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]);
-          %3 = nn.relu(%2);
-          %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %5 = nn.conv2d(%x, meta[relay.Constant][3], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %6 = add(%4, %5);
-          %7 = nn.relu(%6);
-          %8 = nn.conv2d(%7, meta[relay.Constant][4], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %9 = nn.relu(%8);
-          %10 = nn.conv2d(%9, meta[relay.Constant][5], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]);
-          %11 = nn.relu(%10);
-          %12 = nn.conv2d(%11, meta[relay.Constant][6], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %13 = add(%12, %7);
-          %14 = nn.relu(%13);
-          %15 = nn.conv2d(%14, meta[relay.Constant][7], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %16 = nn.relu(%15);
-          %17 = nn.conv2d(%16, meta[relay.Constant][8], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]);
-          %18 = nn.relu(%17);
-          %19 = nn.conv2d(%18, meta[relay.Constant][9], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %20 = add(%19, %14);
-          %21 = nn.relu(%20);
-          %22 = nn.conv2d(%21, meta[relay.Constant][10], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %23 = nn.relu(%22);
-          %24 = nn.conv2d(%23, meta[relay.Constant][11], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %25 = nn.relu(%24);
-          %26 = nn.conv2d(%25, meta[relay.Constant][12], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %27 = nn.conv2d(%21, meta[relay.Constant][13], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %28 = add(%26, %27);
-          %29 = nn.relu(%28);
-          %30 = nn.conv2d(%29, meta[relay.Constant][14], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %31 = nn.relu(%30);
-          %32 = nn.conv2d(%31, meta[relay.Constant][15], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %33 = nn.relu(%32);
-          %34 = nn.conv2d(%33, meta[relay.Constant][16], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %35 = add(%34, %29);
-          %36 = nn.relu(%35);
-          %37 = nn.conv2d(%36, meta[relay.Constant][17], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %38 = nn.relu(%37);
-          %39 = nn.conv2d(%38, meta[relay.Constant][18], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %40 = nn.relu(%39);
-          %41 = nn.conv2d(%40, meta[relay.Constant][19], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %42 = add(%41, %36);
-          %43 = nn.relu(%42);
-          %44 = nn.conv2d(%43, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %45 = nn.relu(%44);
-          %46 = nn.conv2d(%45, meta[relay.Constant][21], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %47 = nn.relu(%46);
-          %48 = nn.conv2d(%47, meta[relay.Constant][22], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %49 = add(%48, %43);
-          %50 = nn.relu(%49);
-          %51 = nn.conv2d(%50, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %52 = nn.relu(%51);
-          %53 = nn.conv2d(%52, meta[relay.Constant][24], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %54 = nn.relu(%53);
-          %55 = nn.conv2d(%54, meta[relay.Constant][25], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %56 = nn.conv2d(%50, meta[relay.Constant][26], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %57 = add(%55, %56);
-          %58 = nn.relu(%57);
-          %59 = nn.conv2d(%58, meta[relay.Constant][27], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %60 = nn.relu(%59);
-          %61 = nn.conv2d(%60, meta[relay.Constant][28], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %62 = nn.relu(%61);
-          %63 = nn.conv2d(%62, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %64 = add(%63, %58);
-          %65 = nn.relu(%64);
-          %66 = nn.conv2d(%65, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %67 = nn.relu(%66);
-          %68 = nn.conv2d(%67, meta[relay.Constant][31], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %69 = nn.relu(%68);
-          %70 = nn.conv2d(%69, meta[relay.Constant][32], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %71 = add(%70, %65);
-          %72 = nn.relu(%71);
-          %73 = nn.conv2d(%72, meta[relay.Constant][33], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %74 = nn.relu(%73);
-          %75 = nn.conv2d(%74, meta[relay.Constant][34], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %76 = nn.relu(%75);
-          %77 = nn.conv2d(%76, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %78 = add(%77, %72);
-          %79 = nn.relu(%78);
-          %80 = nn.conv2d(%79, meta[relay.Constant][36], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %81 = nn.relu(%80);
-          %82 = nn.conv2d(%81, meta[relay.Constant][37], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %83 = nn.relu(%82);
-          %84 = nn.conv2d(%83, meta[relay.Constant][38], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %85 = add(%84, %79);
-          %86 = nn.relu(%85);
-          %87 = nn.conv2d(%86, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %88 = nn.relu(%87);
-          %89 = nn.conv2d(%88, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %90 = nn.relu(%89);
-          %91 = nn.conv2d(%90, meta[relay.Constant][41], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %92 = add(%91, %86);
-          %93 = nn.relu(%92);
-          %94 = nn.conv2d(%93, meta[relay.Constant][42], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %95 = nn.relu(%94);
-          %96 = nn.conv2d(%95, meta[relay.Constant][43], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]);
-          %97 = nn.relu(%96);
-          %98 = nn.conv2d(%97, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %99 = nn.conv2d(%93, meta[relay.Constant][45], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %100 = add(%98, %99);
-          %101 = nn.relu(%100);
-          %102 = nn.conv2d(%101, meta[relay.Constant][46], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %103 = nn.relu(%102);
-          %104 = nn.conv2d(%103, meta[relay.Constant][47], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]);
-          %105 = nn.relu(%104);
-          %106 = nn.conv2d(%105, meta[relay.Constant][48], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %107 = add(%106, %101);
-          %108 = nn.relu(%107);
-          %109 = nn.conv2d(%108, meta[relay.Constant][49], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %110 = nn.relu(%109);
-          %111 = nn.conv2d(%110, meta[relay.Constant][50], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]);
-          %112 = nn.relu(%111);
-          %113 = nn.conv2d(%112, meta[relay.Constant][51], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %114 = add(%113, %108);
-          nn.relu(%114)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "resnext50_32x4d",
-        "input_shapes": {"x": [1, 64, 56, 56]},
-        "input_dtypes": {"x": "float32"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float32",
-    }
-
-
-def resnext50_32x4d_16():
-    metatable = {"relay.Constant": resnext50_32x4d_consts("float16")}
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(1, 64, 56, 56), float16]) {
-          %0 = nn.conv2d(%x, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %1 = nn.relu(%0);
-          %2 = nn.conv2d(%1, meta[relay.Constant][1], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]);
-          %3 = nn.relu(%2);
-          %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %5 = nn.conv2d(%x, meta[relay.Constant][3], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %6 = add(%4, %5);
-          %7 = nn.relu(%6);
-          %8 = nn.conv2d(%7, meta[relay.Constant][4], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %9 = nn.relu(%8);
-          %10 = nn.conv2d(%9, meta[relay.Constant][5], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]);
-          %11 = nn.relu(%10);
-          %12 = nn.conv2d(%11, meta[relay.Constant][6], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %13 = add(%12, %7);
-          %14 = nn.relu(%13);
-          %15 = nn.conv2d(%14, meta[relay.Constant][7], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]);
-          %16 = nn.relu(%15);
-          %17 = nn.conv2d(%16, meta[relay.Constant][8], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]);
-          %18 = nn.relu(%17);
-          %19 = nn.conv2d(%18, meta[relay.Constant][9], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %20 = add(%19, %14);
-          %21 = nn.relu(%20);
-          %22 = nn.conv2d(%21, meta[relay.Constant][10], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %23 = nn.relu(%22);
-          %24 = nn.conv2d(%23, meta[relay.Constant][11], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %25 = nn.relu(%24);
-          %26 = nn.conv2d(%25, meta[relay.Constant][12], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %27 = nn.conv2d(%21, meta[relay.Constant][13], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %28 = add(%26, %27);
-          %29 = nn.relu(%28);
-          %30 = nn.conv2d(%29, meta[relay.Constant][14], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %31 = nn.relu(%30);
-          %32 = nn.conv2d(%31, meta[relay.Constant][15], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %33 = nn.relu(%32);
-          %34 = nn.conv2d(%33, meta[relay.Constant][16], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %35 = add(%34, %29);
-          %36 = nn.relu(%35);
-          %37 = nn.conv2d(%36, meta[relay.Constant][17], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %38 = nn.relu(%37);
-          %39 = nn.conv2d(%38, meta[relay.Constant][18], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %40 = nn.relu(%39);
-          %41 = nn.conv2d(%40, meta[relay.Constant][19], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %42 = add(%41, %36);
-          %43 = nn.relu(%42);
-          %44 = nn.conv2d(%43, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]);
-          %45 = nn.relu(%44);
-          %46 = nn.conv2d(%45, meta[relay.Constant][21], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]);
-          %47 = nn.relu(%46);
-          %48 = nn.conv2d(%47, meta[relay.Constant][22], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %49 = add(%48, %43);
-          %50 = nn.relu(%49);
-          %51 = nn.conv2d(%50, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %52 = nn.relu(%51);
-          %53 = nn.conv2d(%52, meta[relay.Constant][24], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %54 = nn.relu(%53);
-          %55 = nn.conv2d(%54, meta[relay.Constant][25], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %56 = nn.conv2d(%50, meta[relay.Constant][26], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %57 = add(%55, %56);
-          %58 = nn.relu(%57);
-          %59 = nn.conv2d(%58, meta[relay.Constant][27], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %60 = nn.relu(%59);
-          %61 = nn.conv2d(%60, meta[relay.Constant][28], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %62 = nn.relu(%61);
-          %63 = nn.conv2d(%62, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %64 = add(%63, %58);
-          %65 = nn.relu(%64);
-          %66 = nn.conv2d(%65, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %67 = nn.relu(%66);
-          %68 = nn.conv2d(%67, meta[relay.Constant][31], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %69 = nn.relu(%68);
-          %70 = nn.conv2d(%69, meta[relay.Constant][32], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %71 = add(%70, %65);
-          %72 = nn.relu(%71);
-          %73 = nn.conv2d(%72, meta[relay.Constant][33], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %74 = nn.relu(%73);
-          %75 = nn.conv2d(%74, meta[relay.Constant][34], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %76 = nn.relu(%75);
-          %77 = nn.conv2d(%76, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %78 = add(%77, %72);
-          %79 = nn.relu(%78);
-          %80 = nn.conv2d(%79, meta[relay.Constant][36], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %81 = nn.relu(%80);
-          %82 = nn.conv2d(%81, meta[relay.Constant][37], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %83 = nn.relu(%82);
-          %84 = nn.conv2d(%83, meta[relay.Constant][38], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %85 = add(%84, %79);
-          %86 = nn.relu(%85);
-          %87 = nn.conv2d(%86, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]);
-          %88 = nn.relu(%87);
-          %89 = nn.conv2d(%88, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]);
-          %90 = nn.relu(%89);
-          %91 = nn.conv2d(%90, meta[relay.Constant][41], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %92 = add(%91, %86);
-          %93 = nn.relu(%92);
-          %94 = nn.conv2d(%93, meta[relay.Constant][42], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %95 = nn.relu(%94);
-          %96 = nn.conv2d(%95, meta[relay.Constant][43], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]);
-          %97 = nn.relu(%96);
-          %98 = nn.conv2d(%97, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %99 = nn.conv2d(%93, meta[relay.Constant][45], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %100 = add(%98, %99);
-          %101 = nn.relu(%100);
-          %102 = nn.conv2d(%101, meta[relay.Constant][46], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %103 = nn.relu(%102);
-          %104 = nn.conv2d(%103, meta[relay.Constant][47], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]);
-          %105 = nn.relu(%104);
-          %106 = nn.conv2d(%105, meta[relay.Constant][48], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %107 = add(%106, %101);
-          %108 = nn.relu(%107);
-          %109 = nn.conv2d(%108, meta[relay.Constant][49], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]);
-          %110 = nn.relu(%109);
-          %111 = nn.conv2d(%110, meta[relay.Constant][50], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]);
-          %112 = nn.relu(%111);
-          %113 = nn.conv2d(%112, meta[relay.Constant][51], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]);
-          %114 = add(%113, %108);
-          nn.relu(%114)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    return {
-        "name": "resnext50_32x4d_16",
-        "input_shapes": {"x": [1, 64, 56, 56]},
-        "input_dtypes": {"x": "float16"},
-        "mod": mod,
-        "params": None,
-        "main_dtype": "float16",
-    }
-
-
-def describe_onnx(name, filename):
-    """Returns the description of the ONNX model at filename, which can be passed to from_onnx to actually load
-    the model. Note that ? (ie unknown) shape dimensions must be manually changed to concrete dimensions
-    which are consistent with the overall model."""
-    onnx_model = onnx.load(MODEL_PREFIX + filename)
-    input_shapes = {}
-    input_dtypes = {}
-    initializer_names = [n.name for n in onnx_model.graph.initializer]
-    for input_info in onnx_model.graph.input:
-        if input_info.name not in initializer_names:
-            _, shape, dtype, _ = tvm.relay.frontend.onnx.get_info(input_info)
-            if dtype is None:
-                raise ValueError(f"Unknown dtype on input '{input_info.name}' is not supported.")
-            input_shapes.update({input_info.name: shape})
-            input_dtypes.update({input_info.name: dtype})
-    print(
-        f"{{'name': '{name}', 'filename': '{filename}', 'input_shapes': {input_shapes}, 'input_dtypes': {input_dtypes}, 'main_dtype': 'float32'}}"
-    )
-
-
-def from_onnx(model):
-    logging.info("-------------------- BEGIN ONNX IMPORT --------------------")
-
-    filename = MODEL_PREFIX + model["filename"]
-    logging.info(f"Loading ONNX model from {filename}")
-
-    onnx_model = onnx.load(filename)
-    logging.info(f"Loaded model from {filename}")
-
-    mod, params = tvm.relay.frontend.from_onnx(
-        onnx_model, model["input_shapes"], freeze_params=True
-    )
-    mod = tvm.relay.transform.InferType()(mod)
-    logging.info("-------------------- END ONNX IMPORT --------------------")
-
-    logging.info(f"Imported model:\n{mod}")
-    logging.info(f"Params:\n{params}")
-
-    return {
-        "name": model["name"],
-        "input_shapes": model["input_shapes"],
-        "input_dtypes": model["input_dtypes"],
-        "mod": mod,
-        "params": params,
-        "main_dtype": model["main_dtype"],
-    }
-
-
-def to_onnx(model):
-    logging.info("-------------------- BEGIN ONNX EXPORT --------------------")
-    short_filename = model["name"] + ".onnx"
-    filename = MODEL_PREFIX + short_filename
-    logging.info(f"Saving ONNX model to {filename}")
-
-    params = model["params"]
-    if params is None:
-        params = {}
-    tvm.contrib.target.onnx.to_onnx(model["mod"], params, model["name"], path=filename)
-    logging.info("-------------------- END ONNX EXPORT --------------------")
-
-    return {
-        "name": model["name"],
-        "filename": short_filename,
-        "input_shapes": model["input_shapes"],
-        "input_dtypes": model["input_dtypes"],
-        "main_dtype": model["main_dtype"],
-    }
diff --git a/tests/python/relay/collage/test_sub_graph.py b/tests/python/relay/collage/test_sub_graph.py
deleted file mode 100644
index 785bdf750169..000000000000
--- a/tests/python/relay/collage/test_sub_graph.py
+++ /dev/null
@@ -1,387 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import logging
-import tvm.testing
-
-logging.basicConfig(level=logging.INFO)
-
-partition_for_testing = tvm._ffi.get_global_func("relay.collage.PartitionForTesting")
-
-
-def print_with_indexes(mod):
-    mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod)
-    print(mod)
-
-
-def run(in_mod, expected_mod, max_outputs, allow_taps, compiler, map):
-    expected_mod = tvm.relay.transform.InferType()(expected_mod)
-
-    in_mod = tvm.relay.transform.InferType()(in_mod)
-    in_mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(in_mod)
-
-    indexes = [i for l, iss in map.items() for i in iss]
-    labels = [l for l, iss in map.items() for i in iss]
-    actual_mod = partition_for_testing(max_outputs, allow_taps, compiler, indexes, labels)(in_mod)
-
-    if not tvm.ir.structural_equal(actual_mod, expected_mod, True):
-        # Print everything in full so we can see what's going on when things fail.
-        print("Input module:")
-        print(in_mod)
-        print("Expected module:")
-        print(expected_mod)
-        print("Actual module:")
-        print(actual_mod)
-        # Assert again so as to see the actual disagreeing sub-expressions.
-        tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True)
-
-
-def test_single_op():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = add(%c, %d);   // node 7
-              subtract(%0, %1)
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = (fn(%x, %y, Compiler="foo") { add(%x, %y) })(%c, %d);
-              subtract(%0, %1)
-            }
-        """
-        )
-
-    run(input(), expected(), 1, False, "foo", {"": [7]})
-
-
-def test_multi_output():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);   // node 6
-              %1 = add(%c, %d);   // node 7
-              subtract(%0, %1)
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = (fn(%w, %x, %y, %z, Compiler="foo") { (add(%y, %z), add(%w, %x)) })(%c, %d, %a, %b);
-              %1 = %0.0;
-              %2 = %0.1;
-              subtract(%1, %2)
-            }
-        """
-        )
-
-    # No rewrite since 2 outputs
-    run(input(), input(), 1, False, "foo", {"": [6, 7]})
-    # Rewrite
-    run(input(), expected(), 2, False, "foo", {"": [6, 7]})
-
-
-def test_classic_conv2d_add_relu():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32],
-                      %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) {
-              %0 = nn.conv2d(%a, %b); // node 8
-              %1 = add(%0, %c);       // node 9
-              %2 = nn.relu(%1);       // node 10
-              subtract(%2, %d)
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32],
-                      %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) {
-              %2 = (fn(%x, %y, %z, Compiler="foo") {
-                %0 = nn.conv2d(%x, %y);
-                %1 = add(%0, %z);
-                nn.relu(%1)
-              })(%a, %b, %c);
-              subtract(%2, %d)
-            }
-        """
-        )
-
-    run(input(), expected(), 1, False, "foo", {"": [8, 9, 10]})
-
-
-def test_diamond_single_output():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) {
-              %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5
-              %1 = nn.relu(%0);                             // node 6
-              %2 = nn.relu(%1);                             // node 7
-              %3 = nn.leaky_relu(%0, alpha=0f);             // node 9
-              add(%2, %3)                                   // node 10
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) {
-              (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Compiler="foo") {
-                %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]);
-                %1 = nn.relu(%0);
-                %2 = nn.relu(%1);
-                %3 = nn.leaky_relu(%0, alpha=0f);
-                add(%2, %3)
-              })(%a, %b)
-            }
-        """
-        )
-
-    run(input(), expected(), 1, False, "foo", {"": [5, 6, 7, 9, 10]})
-
-
-def test_diamond_multi_output():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) {
-              %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5
-              %1 = nn.relu(%0);                             // node 6
-              %2 = nn.relu(%1);                             // node 7
-              %3 = nn.leaky_relu(%0, alpha=0f);             // node 9
-              add(%2, %3)
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) {
-              %4 = (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Compiler="foo") {
-                %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]);
-                %1 = nn.relu(%0);
-                %2 = nn.relu(%1);
-                %3 = nn.leaky_relu(%0, alpha=0f);
-                (%2, %3)
-              })(%a, %b);
-              %5 = %4.0;
-              %6 = %4.1;
-              add(%5, %6)
-            }
-        """
-        )
-
-    run(input(), expected(), 2, False, "foo", {"": [5, 6, 7, 9]})
-
-
-def test_with_tap():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) {
-              %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5
-              %1 = nn.relu(%0);                             // node 6
-              add(%1, %0)
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) {
-              %2 = (fn (%x, %y, Compiler="foo") {
-                %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]);
-                %1 = nn.relu(%0);
-                (%0, %1)
-              })(%a, %b);
-              %3 = %2.1;
-              %4 = %2.0;
-              add(%3, %4)
-            }
-        """
-        )
-
-    # No rewrite since has tap
-    run(input(), input(), 2, False, "foo", {"": [5, 6]})
-    # Rewrite
-    run(input(), expected(), 2, True, "foo", {"": [5, 6]})
-
-
-def test_no_cycles():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b); // node 3
-              %1 = add(%0, %b);
-              add(%1, %b)       // node 5
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
-              (fn(%x, %y, Compiler="foo") {
-                %0 = add(%x, %y);
-                %1 = add(%0, %y);
-                add(%1, %y)
-              })(%a, %b)
-            }
-        """
-        )
-
-    # No rewrite since would create cycle
-    run(input(), input(), 2, False, "foo", {"": [3, 5]})
-    # No cycle
-    run(input(), expected(), 2, False, "foo", {"": [3, 4, 5]})
-
-
-def test_labels_direct_connection():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32]) {
-              %0 = nn.relu(%a);  // node 3
-              %1 = nn.relu(%0);  // node 4
-              %2 = nn.relu(%1);  // node 5
-              %3 = nn.relu(%1);  // node 6
-              %4 = add(%2, %3);  // node 7
-              %5 = nn.relu(%4);  // node 8
-              %6 = nn.relu(%4);  // node 9
-              %7 = add(%5, %6);  // node 10
-              nn.relu(%7)        // node 11
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32]) {
-              (fn(%aa: Tensor[(5, 7), float32], Compiler="foo") {
-                %0 = nn.relu(%aa);
-                %4 = (fn(%y, Composite="a") {
-                  %1 = nn.relu(%y);
-                  %2 = nn.relu(%1);
-                  %3 = nn.relu(%1);
-                  add(%2, %3)
-                })(%0);
-                %7 = (fn(%z, Composite="b") {
-                  %5 = nn.relu(%z);
-                  %6 = nn.relu(%z);
-                  add(%5, %6)
-                })(%4);
-                nn.relu(%7)
-              })(%a)
-            }
-        """
-        )
-
-    run(input(), expected(), 1, False, "foo", {"": [3, 11], "a": [4, 5, 6, 7], "b": [8, 9, 10]})
-
-
-def test_labels_nested_tap():
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32]) {
-              %0 = nn.relu(%a);  // node 3
-              %1 = nn.relu(%0);  // node 4
-              %2 = nn.relu(%1);  // node 5
-              %3 = nn.relu(%1);  // node 6
-              %4 = add(%2, %3);  // node 7
-              %5 = nn.relu(%4);  // node 8
-              %6 = nn.relu(%4);  // node 9
-              %7 = add(%5, %6);  // node 10
-              add(%2, %7)        // node 11
-            }
-        """
-        )
-
-    def expected():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32]) {
-              %0 = nn.relu(%a);
-              %9 = (fn(%x: Tensor[(5, 7), float32], Compiler="foo") {
-                %5 = (fn(%y, Composite="a") {
-                  %1 = nn.relu(%y);
-                  %2 = nn.relu(%1);
-                  %3 = nn.relu(%1);
-                  %4 = add(%2, %3);
-                  (%2, %4)
-                })(%x);
-                %8 = (fn(%z, Composite="b") {
-                  %6 = nn.relu(%z);
-                  %7 = nn.relu(%z);
-                  add(%6, %7)
-                })(%5.1);
-                (%5.0, %8)
-              })(%0);
-              add(%9.0, %9.1)
-            }
-        """
-        )
-
-    run(input(), expected(), 2, True, "foo", {"a": [4, 5, 6, 7], "b": [8, 9, 10]})
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py
deleted file mode 100644
index db17fc3efe94..000000000000
--- a/tests/python/relay/dyn/test_dynamic_op_level10.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Support level10 operator test cases.
-
-"""
-
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.relay.testing import run_infer_type
-import tvm.topi.testing
-import random
-import tvm.testing
-
-executor_kind = tvm.testing.parameter("debug", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_broadcast_to(executor_kind):
-    def verify_more_dynamic_broadcast_to(x_shape, out_shape):
-        rank = len(out_shape)
-        dtype = "float32"
-        shape_type = "int64"
-        reshape_shape = relay.Var("shape", relay.ty.TensorType((len(x_shape),), shape_type))
-        broadcast_shape = relay.Var("shape", relay.ty.TensorType((rank,), shape_type))
-        x = relay.Var("x", relay.ty.TensorType((np.prod(x_shape),), dtype))
-        r = relay.reshape(x, reshape_shape)
-        z = relay.broadcast_to(r, broadcast_shape)
-
-        func = relay.Function([x, reshape_shape, broadcast_shape], z)
-
-        x = np.random.uniform(size=np.prod(x_shape)).astype(dtype)
-        ref_res = np.broadcast_to(np.reshape(x, x_shape), out_shape)
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate(func)(
-                x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type)
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_more_dynamic_broadcast_to((4, 3), (3, 4, 3))
-
-    def verify_broadcast_to(x_shape, out_shape):
-        rank = len(out_shape)
-        dtype = "float32"
-        shape_type = "int64"
-        dyn_shape = relay.Var("shape", relay.ty.TensorType((rank,), shape_type))
-        x = relay.Var("x", relay.ty.TensorType(x_shape, dtype))
-        z = relay.broadcast_to(x, dyn_shape)
-        zz = run_infer_type(z)
-
-        assert zz.checked_type == relay.ty.TensorType((relay.Any(),) * rank, dtype)
-
-        func = relay.Function([x, dyn_shape], z)
-
-        x = np.random.uniform(size=x_shape).astype(dtype)
-        ref_res = np.broadcast_to(x, out_shape)
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate(func)(x, np.array(out_shape).astype(shape_type))
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_broadcast_to((1,), (1, 1, 1))
-    verify_broadcast_to((1, 1), (4, 1, 1))
-    verify_broadcast_to((4, 1), (1, 4, 3))
-
-
-@tvm.testing.uses_gpu
-def test_dyn_broadcast_to(executor_kind):
-    dtype = "uint8"
-    rank = 3
-    shape_type = "int64"
-    dyn_shape = relay.Var("shape", relay.ty.TensorType((rank,), shape_type))
-    x_shape = (1,)
-    x = relay.Var("x", relay.ty.TensorType(x_shape, dtype))
-    z = relay.broadcast_to(x, dyn_shape)
-    zz = run_infer_type(z)
-
-    assert zz.checked_type == relay.ty.TensorType((relay.Any(),) * rank, dtype)
-
-    func = relay.Function([x, dyn_shape], z)
-
-    x = np.random.uniform(size=x_shape).astype(dtype)
-    dyn_shape = (1,) * rank
-    ref_res = np.broadcast_to(x, dyn_shape)
-    for target, dev in tvm.testing.enabled_targets():
-        mod = tvm.ir.IRModule.from_expr(func)
-        op_res = relay.create_executor(executor_kind, mod=mod, device=dev, target=target).evaluate(
-            func
-        )(x, np.array(dyn_shape).astype(shape_type))
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_dyn_one_hot(executor_kind):
-    def _get_oshape(indices_shape, depth, axis):
-        oshape = []
-        true_axis = len(indices_shape) if axis == -1 else axis
-        ndim = len(indices_shape) + 1
-        indices_index = 0
-        for i in range(0, ndim):
-            if i == true_axis:
-                oshape.append(depth)
-            else:
-                oshape.append(indices_shape[indices_index])
-                indices_index += 1
-
-        return oshape
-
-    def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
-        indices = relay.var("indices", relay.TensorType(indices_shape, "int32"))
-        depth_var = relay.var("depth", relay.TensorType((), "int32"))
-        on_value_const = relay.const(on_value)
-        off_value_const = relay.const(off_value)
-        out = relay.one_hot(indices, on_value_const, off_value_const, depth_var, axis, dtype)
-        func = relay.Function([indices, depth_var], out)
-        indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32")
-        out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype)
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            out_relay = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate()(indices_np, np.array(depth).astype("int32"))
-            tvm.testing.assert_allclose(out_relay.numpy(), out_np)
-
-    _verify((3,), 3, 1, 0, -1, "int32")
-    _verify((3,), 3, 1.0, 0.0, -1, "float32")
-    _verify((2, 2), 5, 2, -2, 0, "int32")
-    _verify((2, 2), 5, 0.5, -0.5, 1, "float32")
-    _verify((3, 2, 4, 5), 6, 1, 0, 1, "int32")
-    _verify((3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py
deleted file mode 100644
index 690ddcac8d51..000000000000
--- a/tests/python/relay/dyn/test_dynamic_op_level2.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level2 dynamic operator test cases.
-"""
-
-import numpy as np
-import tvm
-from tvm import relay
-from tvm import te
-from tvm.relay.testing import enabled_targets
-import random
-from test_dynamic_op_level3 import verify_func
-import tvm.topi.testing
-from tvm.relay.testing import run_infer_type
-
-executor_kind = tvm.testing.parameter("debug", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_dyn_upsampling_run(executor_kind):
-    def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=False):
-
-        if layout == "NCHW":
-            (n, c, h, w) = dshape
-            x_data = np.random.uniform(size=(n, c, h, w)).astype("float32")
-
-        elif layout == "NHWC":
-            (n, h, w, c) = dshape
-            x_data = np.random.uniform(size=(n, h, w, c)).astype("float32")
-
-        ref_res = tvm.topi.testing.resize2d_python(
-            x_data,
-            (scale_h, scale_w),
-            layout,
-            method[2:] if method[0:2] == "bi" else method,
-            "align_corners" if align_corners else "asymmetric",
-        )
-        x = relay.Var("x", relay.TensorType(dshape, "float32"))
-        scale_h_var = relay.var("scale_h", relay.TensorType((), "float32"))
-        scale_w_var = relay.var("scale_h", relay.TensorType((), "float32"))
-
-        z = relay.nn.upsampling(
-            x, scale_h_var, scale_w_var, method=method, layout=layout, align_corners=align_corners
-        )
-        zz = run_infer_type(z)
-        func = relay.Function([x, scale_h_var, scale_w_var], z)
-
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate()(
-                x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32")
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
-
-    verify_upsampling((1, 16, 32, 32), 3, 2.0, "NCHW", "nearest_neighbor")
-    verify_upsampling((1, 16, 32, 32), 5, 2.0, "NCHW", "bilinear", True)
-    verify_upsampling((1, 16, 32, 32), 2.0, 6, "NHWC", "nearest_neighbor")
-    verify_upsampling((1, 16, 32, 32), 2.0, 2.0, "NHWC", "bilinear", True)
-
-
-# tests upsampling type inference with scale_h passed in as a constant and scale_w as a variable
-@tvm.testing.uses_gpu
-def test_dyn_upsampling_infer_type_const():
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-
-    data = relay.var("data", relay.TensorType((n, c, h, w), "int8"))
-    scale_w = relay.Var("scale_w", relay.TensorType((), "float32"))
-
-    z = relay.nn.upsampling(data, 2.0, scale_w)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, relay.Any(), relay.Any()), "int8")
-
-
-@tvm.testing.uses_gpu
-def test_dyn_upsampling3d_run(executor_kind):
-    def verify_upsampling3d(
-        dshape, scale_d, scale_h, scale_w, layout, method, coord_trans="asymmetric"
-    ):
-
-        if layout == "NCDHW":
-            (n, c, d, h, w) = dshape
-            x_data = np.random.uniform(size=(n, c, d, h, w)).astype("float32")
-
-        elif layout == "NDHWC":
-            (n, d, h, w, c) = dshape
-            x_data = np.random.uniform(size=(n, d, h, w, c)).astype("float32")
-
-        ref_res = tvm.topi.testing.resize3d_python(
-            x_data,
-            (scale_d, scale_h, scale_w),
-            layout,
-            method[3:] if method[0:3] == "tri" else method,
-            coord_trans,
-        )
-
-        x = relay.Var("x", relay.TensorType(dshape, "float32"))
-        scale_d_var = relay.var("scale_d", relay.TensorType((), "float32"))
-        scale_h_var = relay.var("scale_h", relay.TensorType((), "float32"))
-        scale_w_var = relay.var("scale_h", relay.TensorType((), "float32"))
-
-        z = relay.nn.upsampling3d(
-            x,
-            scale_d_var,
-            scale_h_var,
-            scale_w_var,
-            method=method,
-            layout=layout,
-            coordinate_transformation_mode=coord_trans,
-        )
-        zz = run_infer_type(z)
-        func = relay.Function([x, scale_d_var, scale_h_var, scale_w_var], z)
-
-        for target, dev in enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate()(
-                x_data,
-                np.array(scale_d).astype("float32"),
-                np.array(scale_h).astype("float32"),
-                np.array(scale_w).astype("float32"),
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
-
-    verify_upsampling3d((1, 1, 1, 1, 1), 2, 3, 4, "NCDHW", "nearest_neighbor")
-    verify_upsampling3d((1, 8, 16, 16, 16), 2.0, 3.0, 4.0, "NCDHW", "nearest_neighbor")
-    verify_upsampling3d((1, 8, 16, 16, 16), 2.0, 5.0, 1.0, "NCDHW", "trilinear", "align_corners")
-    verify_upsampling3d((1, 20, 3, 4, 16), 2.0, 2.0, 2.0, "NDHWC", "nearest_neighbor")
-    verify_upsampling3d((1, 8, 4, 16, 15), 2.0, 2.0, 2.0, "NDHWC", "trilinear", "align_corners")
-
-
-# tests upsampling type inference with scale_h passed in as a constant and scale_w as a variable
-def test_dyn_upsampling3d_infer_type_const():
-    n, c, d, h, w = (
-        te.size_var("n"),
-        te.size_var("c"),
-        te.size_var("d"),
-        te.size_var("h"),
-        te.size_var("w"),
-    )
-
-    data = relay.var("data", relay.TensorType((n, c, d, h, w), "int8"))
-    scale_d = relay.Var("scale_h", relay.TensorType((), "float32"))
-    scale_w = relay.Var("scale_w", relay.TensorType((), "float32"))
-
-    z = relay.nn.upsampling3d(data, scale_d, 2.0, scale_w, layout="NCDHW", method="trilinear")
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType(
-        (n, c, relay.Any(), relay.Any(), relay.Any()), "int8"
-    )
-
-
-@tvm.testing.uses_gpu
-def test_dyn_pad(executor_kind):
-    def verify_pad(dshape, pad_width, pad_val, dtype):
-        x = relay.var("x", relay.TensorType(dshape, dtype))
-        ndim = len(dshape)
-        pad_width_var = relay.var("pad_width_var", relay.TensorType((ndim, 2), "int64"))
-        pad_val_var = relay.var("pad_val_var", relay.TensorType((), dtype))
-        y = relay.nn.pad(x, pad_width_var, pad_val_var)
-        yy = run_infer_type(y)
-
-        assert yy.checked_type == relay.ty.TensorType((relay.Any(),) * ndim, dtype)
-        func = relay.Function([x, pad_width_var, pad_val_var], y)
-        data = np.random.uniform(size=dshape).astype(dtype)
-        ref_res = np.pad(data, pad_width, "constant", constant_values=(((pad_val,) * 2),) * ndim)
-        pad_width = np.array(pad_width).astype("int64")
-
-        verify_func(
-            executor_kind, func, [data, pad_width, np.array(pad_val).astype(dtype)], ref_res
-        )
-
-    def verify_pad_default_fill(dshape, pad_width, dtype):
-        x = relay.var("x", relay.TensorType(dshape, dtype))
-        ndim = len(dshape)
-        pad_width_var = relay.var("pad_width_var", relay.TensorType((ndim, 2), "int64"))
-        y = relay.nn.pad(x, pad_width_var)
-        yy = run_infer_type(y)
-
-        assert yy.checked_type == relay.ty.TensorType((relay.Any(),) * ndim, dtype)
-        func = relay.Function([x, pad_width_var], y)
-        data = np.random.uniform(size=dshape).astype(dtype)
-        ref_res = np.pad(data, pad_width)
-        pad_width = np.array(pad_width).astype("int64")
-
-        verify_func(executor_kind, func, [data, pad_width], ref_res)
-
-    verify_pad((4, 10, 7, 7), ((1, 1), (2, 2), (3, 3), (4, 4)), 2.0, "int32")
-    verify_pad((2, 7), ((1, 4), (2, 2)), 4.0, "float64")
-    verify_pad_default_fill((4, 10, 7, 7), ((1, 1), (2, 2), (3, 3), (4, 4)), "float64")
-    verify_pad_default_fill((2, 7), ((1, 4), (2, 2)), "int32")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py
deleted file mode 100644
index afc42c778a72..000000000000
--- a/tests/python/relay/dyn/test_dynamic_op_level3.py
+++ /dev/null
@@ -1,481 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level3 operator test cases.
-"""
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay, te
-from tvm.relay.testing import check_grad, run_infer_type
-
-executor_kind = tvm.testing.parameter("debug", "vm")
-
-
-def verify_func(executor_kind, func, data, ref_res, target_device=tvm.testing.enabled_targets()):
-    assert isinstance(data, list)
-    for target, dev in target_device:
-        mod = tvm.ir.IRModule.from_expr(func)
-        op_res = relay.create_executor(
-            executor_kind, mod=mod, device=dev, target=target
-        ).evaluate()(*data)
-        if isinstance(op_res, tvm.runtime.container.ADT):
-            assert len(op_res) == len(
-                ref_res
-            ), "Outputs from TVM and Python implementation must be equal "
-            for op_result, ref_result in zip(op_res, ref_res):
-                tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5)
-        else:
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-        relay.backend.te_compiler.get().clear()
-
-
-def check_on_vm(target, dev, args, expected_result, mod):
-    """
-    Check that evaluating `expr` applied to the arguments produces
-    `result` on Relay VM.
-    """
-    rts_result = relay.create_executor("vm", device=dev, target=target, mod=mod).evaluate()(*args)
-    tvm.testing.assert_allclose(expected_result, rts_result.numpy())
-
-
-@tvm.testing.uses_gpu
-def test_dyn_reshape(executor_kind):
-    def verify_reshape(shape, newshape, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType((len(newshape),), "int64"))
-        z = relay.reshape(x, y)
-
-        func = relay.Function([x, y], z)
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        x_data = np.ones(shape).astype("float32")
-        ref_res = np.reshape(x_data, oshape)
-        check_grad(
-            run_infer_type(func),
-            inputs=[x_data, np.array(newshape).astype("int64")],
-            test_inputs=[x_data],
-            eps=1e-3,
-        )
-        verify_func(executor_kind, func, [x_data, np.array(newshape).astype("int64")], ref_res)
-
-    verify_reshape((2, 3, 4), (8, 3), (8, 3))
-    verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
-    verify_reshape((2, 3, 4), (4, 0, 2), (4, 3, 2))
-    verify_reshape((2, 3, 4), (2, 0, 0), (2, 3, 4))
-    verify_reshape((2, 3, 4), (0, -1), (2, 12))
-    verify_reshape((2, 3, 4), (-1, 0), (8, 3))
-    verify_reshape((2, 3, 4), (-3, 4), (6, 4))
-    verify_reshape((2, 3, 4, 5), (-3, -3), (6, 20))
-    verify_reshape((2, 3, 4), (0, -3), (2, 12))
-
-
-@tvm.testing.uses_gpu
-def test_dyn_shape_reshape(executor_kind):
-    def verify_reshape(shape, newshape, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(newshape, "float32"))
-        z = relay.reshape(x, relay.shape_of(y))
-
-        func = relay.Function([x, y], z)
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=newshape).astype("float32")
-        ref_res = np.reshape(x_data, oshape)
-        check_grad(run_infer_type(func), inputs=[x_data, y_data], eps=1e-3)
-        verify_func(executor_kind, func, [x_data, y_data], ref_res)
-
-    verify_reshape((2, 3, 4), (8, 3), (8, 3))
-    verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
-
-
-def test_squeeze(executor_kind):
-    def verify_squeeze(shape, dtype, axis):
-        x = relay.var("x", relay.TensorType(shape, dtype))
-        assert axis is not None
-        np_axis = tuple(axis)
-        axis = relay.var("axis", relay.TensorType([len(axis)], "int64"))
-        squeeze = relay.squeeze(x, axis=axis)
-        func = relay.Function([x, axis], squeeze)
-        x_data = np.random.random_sample(shape).astype(dtype)
-        ref_res = np.squeeze(x_data, axis=np_axis)
-        verify_func(executor_kind, func, [x_data, np.array(np_axis).astype("int64")], ref_res)
-
-    verify_squeeze((1, 3, 1), "float32", [0])
-    verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2])
-
-
-@tvm.testing.uses_gpu
-def test_dyn_expand_dims(executor_kind):
-    def verify_expand_dims(
-        dshape, dtype, oshape, axis, num_newaxis, target_device=tvm.testing.enabled_targets()
-    ):
-        # Use 1 to avoid issues with invalid buffer sizes
-        x = relay.Var("x", relay.TensorType(dshape, dtype))
-        y = relay.var("axis", shape=[], dtype="int64")
-        z = relay.expand_dims(x, axis=y, num_newaxis=num_newaxis)
-        func = relay.Function([x, y], z)
-
-        data_np = np.random.uniform(size=dshape).astype(dtype)
-        axis_np = np.array(axis).astype("int64")
-        ref_res = data_np.reshape(oshape)
-        verify_func(executor_kind, func, [data_np, axis_np], ref_res, target_device=target_device)
-
-    for dtype in ["float16", "float32"]:
-        verify_expand_dims((2, 2), dtype, (2, 2, 1), 2, 1)
-        verify_expand_dims((2, 2), dtype, (2, 1, 2), 1, 1)
-        verify_expand_dims((2, 2), dtype, (1, 2, 2), 0, 1)
-
-        # TODO (AndrewZhaoLuo): investigate why runtimes in non-llvm are extremely slow
-        # for multiple new axis
-        llvm_target_only = [x for x in tvm.testing.enabled_targets() if "llvm" in x]
-        verify_expand_dims((2, 2), dtype, (2, 2, 1, 1), 2, 2, target_device=llvm_target_only)
-        verify_expand_dims((2, 2), dtype, (2, 1, 1, 1, 2), 1, 3, target_device=llvm_target_only)
-        verify_expand_dims((2, 2), dtype, (1, 1, 1, 1, 2, 2), 0, 4, target_device=llvm_target_only)
-
-
-@tvm.testing.uses_gpu
-def test_dyn_tile(executor_kind):
-    def verify_tile(dshape, reps):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        r = relay.var("reps", relay.TensorType((len(reps),), "float32"))
-        z = relay.tile(x, r)
-
-        func = relay.Function([x, r], z)
-        x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
-        ref_res = np.tile(x_data, reps=reps)
-        reps_data = np.array(reps).astype("float32")
-        verify_func(executor_kind, func, [x_data, np.array(reps).astype("float32")], ref_res)
-
-    verify_tile((2, 3, 4), (3, 2, 1))
-    verify_tile((2, 3, 4), (1, 2))
-    verify_tile((2, 3), (3, 2, 1))
-
-
-@tvm.testing.uses_gpu
-def test_dyn_zeros_ones(executor_kind):
-    def verify_zeros_ones(shape, dtype):
-        for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
-            rank = len(shape)
-            dyn_shape = relay.Var("shape", relay.ty.TensorType((rank,), "int64"))
-            y = op(dyn_shape, dtype)
-            yy = run_infer_type(y)
-            assert yy.checked_type == relay.ty.TensorType((relay.Any(),) * rank, dtype)
-
-            func = relay.Function([dyn_shape], y)
-            ref_res = ref(shape, dtype)
-            verify_func(
-                executor_kind, func, [np.array(shape).astype("int64")], ref_res.astype("int64")
-            )
-
-    verify_zeros_ones((1, 3), "int64")
-    verify_zeros_ones((8, 9, 1, 2), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_dyn_full(executor_kind):
-    def verify_full(fill_value, src_shape, dtype):
-        x = relay.var("x", relay.scalar_type(dtype))
-        rank = len(src_shape)
-        dyn_src_shape = relay.var("dyn_scr_shape", relay.ty.TensorType((rank,), "int64"))
-        z = relay.full(x, dyn_src_shape, dtype)
-        func = relay.Function([x, dyn_src_shape], z)
-        ref_res = np.full(src_shape, fill_value).astype(dtype)
-
-        verify_func(
-            executor_kind,
-            func,
-            [np.array(fill_value).astype(dtype), np.array(src_shape).astype("int64")],
-            ref_res,
-        )
-
-    verify_full(4, (1, 3, 4, 4), "int32")
-    verify_full(4, (1, 3, 4, 4), "int64")
-    verify_full(4.0, (2, 50), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_dyn_sparse_to_dense(executor_kind):
-    def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
-        sparse_indices_data = np.array(sparse_indices)
-        sparse_values_data = np.array(sparse_values)
-        default_value_data = np.array(default_value)
-        output_shape_data = np.array(output_shape)
-
-        a = relay.var(
-            "a", relay.TensorType(sparse_indices_data.shape, str(sparse_indices_data.dtype))
-        )
-        b = relay.var(
-            "b", relay.TensorType(sparse_values_data.shape, str(sparse_values_data.dtype))
-        )
-        output_shape_var = relay.var(
-            "output_shape", relay.TensorType(output_shape_data.shape, str(output_shape_data.dtype))
-        )
-        if default_value is None:
-            args = [a, b, output_shape_var]
-            d = relay.sparse_to_dense(a, output_shape_var, b)
-        else:
-            c = relay.var(
-                "c", relay.TensorType(default_value_data.shape, str(default_value_data.dtype))
-            )
-            args = [a, b, c, output_shape_var]
-            d = relay.sparse_to_dense(a, output_shape_var, b, c)
-
-        zz = run_infer_type(d)
-        assert len(zz.checked_type.shape) == len(output_shape)
-
-        func = relay.Function(args, d)
-
-        if default_value is None:
-            arguments = [sparse_indices_data, sparse_values_data, output_shape_data]
-        else:
-            arguments = [
-                sparse_indices_data,
-                sparse_values_data,
-                default_value_data,
-                output_shape_data,
-            ]
-
-        verify_func(executor_kind, func, arguments, xpected)
-
-    verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0])  # scalar
-    verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3])  # vector
-    verify_sparse_to_dense(
-        [[0, 0], [1, 2]], [1, 2], 0, [3, 4], [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]]
-    )  # nXd
-    verify_sparse_to_dense(
-        [[0, 0, 0], [1, 2, 3]],
-        [1, 2],
-        4,
-        [2, 3, 4],
-        [[[1, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]], [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 2]]],
-    )  # nXd
-    verify_sparse_to_dense(
-        [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]
-    )  # floats
-    # default value not specified
-    verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])
-
-
-@pytest.mark.parametrize(
-    "sparse_indices, sparse_values, dense_shape, default_value",
-    [
-        (
-            np.array([[0, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64),
-            np.array([1, 2, 3, 4], dtype=np.int64),
-            np.array([5, 6], dtype=np.int64),
-            np.array([10], dtype=np.int64),
-        ),
-        (
-            np.array([[1, 1, 1], [1, 3, 1], [2, 0, 5], [3, 1, 6]], dtype=np.int64),
-            np.array([1, 2, 3, 4], dtype=np.int64),
-            np.array([7, 7, 7], dtype=np.int64),
-            np.array([5], dtype=np.int64),
-        ),
-        (
-            np.array([[1], [2]], dtype=np.int64),
-            np.array([7, 8], dtype=np.int64),
-            np.array([5], dtype=np.int64),
-            np.array([4], dtype=np.int64),
-        ),
-        (
-            np.ones((0, 1), dtype=np.int64),
-            np.array([], dtype=np.int64),
-            np.array([5], dtype=np.int64),
-            np.array([4], dtype=np.int64),
-        ),
-        (
-            np.ones((0, 3), dtype=np.int64),
-            np.array([], dtype=np.int64),
-            np.array([9, 3, 7], dtype=np.int64),
-            np.array([100], dtype=np.int64),
-        ),
-    ],
-)
-@pytest.mark.parametrize("dtype", [np.int64, np.int32])
-@pytest.mark.parametrize("use_dyn", [True, False])
-def test_sparse_fill_empty_rows(
-    sparse_indices, sparse_values, dense_shape, default_value, dtype, use_dyn, executor_kind
-):
-    def ref_sparse_fill_empty_rows(
-        sparse_indices: np.ndarray,
-        sparse_values: np.ndarray,
-        dense_shape: np.ndarray,
-        default_value: np.ndarray,
-    ) -> None:
-        """
-        This function calculates the expected output of sparse_fill_empty_rows operator given the
-        inputs.
-        """
-
-        def check_add_rows(current_idx, limit_idx):
-            while current_idx < limit_idx:
-                new_sparse_indices.append([current_idx] + [0] * (num_cols - 1))
-                new_sparse_values.append(default_value[0])
-                empty_row_indicator[current_idx] = True
-                current_idx += 1
-
-            return current_idx
-
-        current_idx = 0
-        new_sparse_indices = []
-        new_sparse_values = []
-        empty_row_indicator = [False for _ in range(dense_shape[0])]
-        num_cols = sparse_indices.shape[1]
-        for sparse_row, sparse_value in zip(sparse_indices, sparse_values):
-            limit_idx = sparse_row[0]
-            current_idx = check_add_rows(current_idx, limit_idx)
-            new_sparse_indices.append(list(sparse_row))
-            new_sparse_values.append(sparse_value)
-            current_idx = limit_idx + 1
-
-        check_add_rows(current_idx, dense_shape[0])
-        return new_sparse_indices, new_sparse_values, empty_row_indicator
-
-    def verify_sparse_fill_empty_rows(
-        sparse_indices_np: np.ndarray,
-        sparse_values_np: np.ndarray,
-        dense_shape_np: np.ndarray,
-        default_value_np: np.ndarray,
-    ) -> None:
-        """
-        This function verifies the relay output of sparse_fill_empty_rows with its expected output.
-        """
-        if use_dyn:
-            sparse_indices = relay.var(
-                "sparse_indices",
-                shape=[relay.Any(), relay.Any()],
-                dtype=str(sparse_indices_np.dtype),
-            )
-            sparse_values = relay.var(
-                "sparse_values",
-                shape=[relay.Any()],
-                dtype=str(sparse_values_np.dtype),
-            )
-            dense_shape = relay.var(
-                "dense_shape",
-                shape=[relay.Any()],
-                dtype=str(dense_shape_np.dtype),
-            )
-            default_value = relay.var(
-                "default_value",
-                shape=[relay.Any()],
-                dtype=str(default_value_np.dtype),
-            )
-        else:
-            sparse_indices = relay.var(
-                "sparse_indices",
-                relay.TensorType(sparse_indices_np.shape, str(sparse_indices_np.dtype)),
-            )
-            sparse_values = relay.var(
-                "sparse_values",
-                relay.TensorType(sparse_values_np.shape, str(sparse_values_np.dtype)),
-            )
-            dense_shape = relay.var(
-                "dense_shape",
-                relay.TensorType(dense_shape_np.shape, str(dense_shape_np.dtype)),
-            )
-            default_value = relay.var(
-                "default_value",
-                relay.TensorType(default_value_np.shape, str(default_value_np.dtype)),
-            )
-        z = relay.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value)
-        func = relay.Function([sparse_indices, sparse_values, dense_shape, default_value], z)
-        ref_res = ref_sparse_fill_empty_rows(
-            sparse_indices_np,
-            sparse_values_np,
-            dense_shape_np,
-            default_value_np,
-        )
-        (
-            new_sparse_indices_infer_type,
-            new_sparse_values_infer_type,
-            empty_row_indicator_infer_type,
-        ) = run_infer_type(z)
-
-        assert new_sparse_indices_infer_type.checked_type.dtype == sparse_indices_np.dtype
-        assert new_sparse_values_infer_type.checked_type.dtype == sparse_indices_np.dtype
-        assert empty_row_indicator_infer_type.checked_type.dtype == "bool"
-
-        verify_func(
-            executor_kind,
-            func,
-            [sparse_indices_np, sparse_values_np, dense_shape_np, default_value_np],
-            ref_res,
-            [("llvm", tvm.cpu())],
-        )
-
-    verify_sparse_fill_empty_rows(
-        sparse_indices.astype(dtype),
-        sparse_values.astype(dtype),
-        dense_shape.astype(dtype),
-        default_value.astype(dtype),
-    )
-
-
-def test_dyn_copy():
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-    mod = tvm.relay.fromtext(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(?, 3), int64]) -> Tensor[(?, 3), int64] {
-          copy(%x)
-        }
-        """
-    )
-    x_data = np.random.rand(15, 3).astype("int64")
-    expected = x_data
-    check_on_vm(target, dev, [x_data], expected, mod)
-
-
-def test_dyn_copy_scalar():
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-    mod = tvm.relay.fromtext(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: int32, %y: Tensor[(?), int32]) -> Tensor[(?), int32] {
-          %0 = copy(%x);
-          %1 = expand_dims(%0, axis=0);
-          %2 = (%y, %1);
-          concatenate(%2)
-        }
-        """
-    )
-    x_data = 3
-    y_data = np.random.rand(7).astype("int32")
-    expected = np.concatenate((y_data, np.expand_dims(x_data, axis=0)))
-    check_on_vm(target, dev, [x_data, y_data], expected, mod)
-
-
-def test_dyn_cast():
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-    mod = tvm.relay.fromtext(
-        """
-        #[version = "0.0.5"]
-        def @main(%x: Tensor[(?, 3), int64]) -> Tensor[(?, 3), int32] {
-          cast(%x, dtype="int32")
-        }
-        """
-    )
-    x_data = np.random.rand(15, 3).astype("int64")
-    expected = x_data.astype("int32")
-    check_on_vm(target, dev, [x_data], expected, mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/dyn/test_dynamic_op_level4.py b/tests/python/relay/dyn/test_dynamic_op_level4.py
deleted file mode 100644
index 2a4606fcf93f..000000000000
--- a/tests/python/relay/dyn/test_dynamic_op_level4.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-import tvm.topi.testing
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_strided_slice():
-    def verify(dshape, begin, end, strides, slice_mode="end", test_ref=True, dtype="int32"):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        ndim = len(dshape)
-        slice_dim = len(begin)
-        begin = begin if begin else [0] * ndim
-        end = end if end else list(dshape)[:slice_dim]
-        if strides:
-            if len(strides) == 1:
-                strides = strides * slice_dim
-        else:
-            strides = [1] * slice_dim
-
-        num_static_axes = len(dshape) - len(begin)
-
-        # target numpy result
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        ref_res = tvm.topi.testing.strided_slice_python(x_data, begin, end, strides, slice_mode)
-        data = [x_data, np.array(begin, dtype=dtype), np.array(end, dtype=dtype)]
-
-        begin = relay.var("begin", shape=[len(begin)], dtype=dtype)
-        end = relay.var("end", shape=[len(end)], dtype=dtype)
-        inputs = [x, begin, end]
-        if strides:
-            data.append(np.array(strides, dtype=dtype))
-            strides = relay.var("strides", shape=[len(strides)], dtype=dtype)
-            inputs.append(strides)
-            z = relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=slice_mode)
-        else:
-            z = relay.strided_slice(x, begin=begin, end=end, slice_mode=slice_mode)
-        func = relay.Function(inputs, z)
-
-        func = run_infer_type(func)
-
-        if num_static_axes > 0:
-            oshape = run_infer_type(z).checked_type.shape
-            assert tuple(oshape[-num_static_axes:]) == dshape[-num_static_axes:]
-
-        if not test_ref:
-            return
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor("vm", mod=mod, device=dev, target=target).evaluate()(
-                *data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    verify(
-        (1, 224, 224, 3),
-        [0, 20, 20, 0],
-        [1, 140, 140, 3],
-        [1, 1, 1, 1],
-        dtype="int64",
-    )
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], dtype="int16")
-    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None)
-    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None)
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 4], None)
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None)
-    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1])
-    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1])
-    verify((20, 10, 5), [20, 10, 4], [0, 0, 1], [-1, -3, -2])
-    verify((3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], slice_mode="size", test_ref=False)
-    verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], slice_mode="size", test_ref=True)
-
-    # Slicing along first few axes, where the rest of axes remain static
-    verify((3, 4, 3), [0], [2], None)
-    verify((3, 4, 3), [1], [4], [2])
-    verify((3, 4, 3), [1, 0], [4, 2], [2, 1])
-
-
-if __name__ == "__main__":
-    test_dynamic_strided_slice()
diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py
deleted file mode 100644
index 5222516dca6d..000000000000
--- a/tests/python/relay/dyn/test_dynamic_op_level5.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level5 operator test cases.
-"""
-import math
-import numpy as np
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-import tvm.topi.testing
-import tvm.testing
-
-executor_kind = tvm.testing.parameter("debug", "vm")
-
-
-def test_resize2d_infer_type():
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
-    size = relay.var("size", relay.TensorType((2,), "int8"))
-    z = relay.image.resize2d(x, size)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, relay.Any(), relay.Any()), "int8")
-
-
-@tvm.testing.uses_gpu
-def test_resize2d(executor_kind):
-    def verify_resize2d(dshape, scale, method, layout):
-        if layout == "NHWC":
-            size = (dshape[1] * scale, dshape[2] * scale)
-        else:
-            size = (dshape[2] * scale, dshape[3] * scale)
-        size = np.array(size).astype("int64")
-        x_data = np.random.uniform(size=dshape).astype("float32")
-
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        size_var = relay.var("size", relay.TensorType((2,), "int64"))
-
-        coord_trans = "asymmetric" if method == "nearest_neighbor" else "align_corners"
-        z = relay.image.resize2d(
-            x, size_var, None, layout, method, coordinate_transformation_mode=coord_trans
-        )
-
-        zz = run_infer_type(z)
-        func = relay.Function([x, size_var], z)
-
-        ref_res = tvm.topi.testing.resize2d_python(
-            x_data, (scale, scale), layout, method, coord_trans
-        )
-
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate()(x_data, size)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
-
-    for method in ["linear", "nearest_neighbor"]:
-        for layout in ["NCHW", "NHWC"]:
-            verify_resize2d((1, 4, 4, 4), 2, method, layout)
-            verify_resize2d((2, 8, 17, 20), 7, method, layout)
-
-
-if __name__ == "__main__":
-    test_resize2d_infer_type()
-    test_resize2d()
diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py
deleted file mode 100644
index ebf9c36263be..000000000000
--- a/tests/python/relay/dyn/test_dynamic_op_level6.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level6 operator test cases.
-"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import relay
-import tvm.testing
-
-executor_kind = tvm.testing.parameter("debug", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_topk(executor_kind):
-    def verify_topk(k, axis, ret_type, is_ascend, dtype):
-        shape = (20, 100)
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        k_var = relay.var("x", relay.TensorType((1,), "float32"))
-        out = relay.topk(x, k_var, axis, ret_type, is_ascend, dtype)
-        if isinstance(out, relay.expr.TupleWrapper):
-            out = out.astuple()
-        func = relay.Function([x, k_var], out)
-
-        np_data = np.random.uniform(size=shape).astype("float32")
-        if is_ascend:
-            np_indices = np.argsort(np_data, axis=axis)
-        else:
-            np_indices = np.argsort(-np_data, axis=axis)
-        kk = k if k >= 1 else shape[axis]
-        if axis == 0:
-            np_indices = np_indices[:kk, :]
-            np_values = np.zeros(np_indices.shape).astype("float32")
-            for i in range(shape[1]):
-                np_values[:, i] = np_data[np_indices[:, i], i]
-        else:
-            np_indices = np_indices[:, :kk]
-            np_values = np.zeros(np_indices.shape).astype("float32")
-            for i in range(shape[0]):
-                np_values[i, :] = np_data[i, np_indices[i, :]]
-        np_indices = np_indices.astype(dtype)
-
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(
-                executor_kind, mod=mod, device=dev, target=target
-            ).evaluate()(np_data, np.array([k]).astype("float32"))
-            if ret_type == "both":
-                tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
-                tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
-            elif ret_type == "values":
-                tvm.testing.assert_allclose(op_res.numpy(), np_values)
-            else:
-                tvm.testing.assert_allclose(op_res.numpy(), np_indices)
-
-    np.random.seed(0)
-    for k in [0, 1, 5]:
-        for axis in [0, -1, 1]:
-            for ret_type in ["both", "values", "indices"]:
-                verify_topk(k, axis, ret_type, True, "int64")
-                verify_topk(k, axis, ret_type, False, "float32")
-
-
-if __name__ == "__main__":
-    test_dynamic_topk()
diff --git a/tests/python/relay/op/annotation/test_annotation.py b/tests/python/relay/op/annotation/test_annotation.py
deleted file mode 100644
index 502d88ff55b6..000000000000
--- a/tests/python/relay/op/annotation/test_annotation.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for annotations."""
-import tvm
-import tvm.testing
-from tvm import relay
-import pytest
-
-
-def test_on_device_via_string():
-    x = relay.Var("x")
-    call = relay.annotation.on_device(x, "cuda")
-    assert isinstance(call, relay.Call)
-    assert len(call.args) == 1
-    assert call.args[0] == x
-    assert call.attrs.virtual_device.device_type_int == 2  # ie kDLCUDA
-    assert call.attrs.virtual_device.virtual_device_id == 0
-    assert call.attrs.virtual_device.target is None
-    assert call.attrs.virtual_device.memory_scope == ""
-    assert call.attrs.constrain_body
-    assert not call.attrs.constrain_result
-
-
-def test_on_device_via_device():
-    x = relay.Var("x")
-    call = relay.annotation.on_device(x, tvm.device("cpu"))
-    assert call.attrs.virtual_device.device_type_int == 1  # ie kDLCPU
-
-
-def test_on_device_invalid_device():
-    x = relay.Var("x")
-    pytest.raises(ValueError, lambda: relay.annotation.on_device(x, "bogus"))
-
-
-def test_on_device_fixed():
-    x = relay.Var("x")
-    call = relay.annotation.on_device(x, "cuda", constrain_result=True)
-    assert call.attrs.virtual_device.device_type_int == 2  # ie kDLCUDA
-    assert call.attrs.constrain_body
-    assert call.attrs.constrain_result
-
-
-def test_on_device_free():
-    x = relay.Var("x")
-    call = relay.annotation.on_device(x, "cuda", constrain_result=False, constrain_body=False)
-    assert call.attrs.virtual_device.device_type_int == -1  # ie kInvalidDeviceType
-    assert not call.attrs.constrain_body
-    assert not call.attrs.constrain_result
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/op/test_tensor.py b/tests/python/relay/op/test_tensor.py
deleted file mode 100644
index ceee27161cda..000000000000
--- a/tests/python/relay/op/test_tensor.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for tensor helpers."""
-import tvm
-import tvm.testing
-from tvm import relay
-import pytest
-
-
-def test_device_copy_via_string():
-    x = relay.var("x")
-    call = relay.op.device_copy(x, "cuda", "cpu")
-    assert isinstance(call, relay.Call)
-    assert len(call.args) == 1
-    assert call.args[0] == x
-    assert call.attrs.src_virtual_device.device_type_int == 2  # ie kDLCUDA
-    assert call.attrs.src_virtual_device.virtual_device_id == 0
-    assert call.attrs.src_virtual_device.target is None
-    assert call.attrs.src_virtual_device.memory_scope == ""
-    assert call.attrs.dst_virtual_device.device_type_int == 1  # ie kDLCPU
-    assert call.attrs.dst_virtual_device.virtual_device_id == 0
-    assert call.attrs.dst_virtual_device.target is None
-    assert call.attrs.dst_virtual_device.memory_scope == ""
-
-
-def test_device_copy_via_device():
-    x = relay.var("x")
-    call = relay.op.device_copy(x, tvm.device("cuda"), tvm.device("cpu"))
-    assert isinstance(call, relay.Call)
-    assert len(call.args) == 1
-    assert call.args[0] == x
-    assert call.attrs.src_virtual_device.device_type_int == 2  # ie kDLCUDA
-    assert call.attrs.dst_virtual_device.device_type_int == 1  # ie kDLCPU
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/conftest.py b/tests/python/relay/opencl_texture/conftest.py
deleted file mode 100644
index 6b9c91ec1067..000000000000
--- a/tests/python/relay/opencl_texture/conftest.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import tvm
-from tvm import rpc
-import pytest
-
-
-@pytest.fixture(scope="session")
-def remote():
-    if (
-        "TVM_TRACKER_HOST" in os.environ
-        and "TVM_TRACKER_PORT" in os.environ
-        and "RPC_DEVICE_KEY" in os.environ
-    ):
-
-        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
-        rpc_tracker_port = int(os.environ["TVM_TRACKER_PORT"])
-        rpc_device_key = os.environ["RPC_DEVICE_KEY"]
-        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
-        remote = tracker.request(rpc_device_key, priority=0, session_timeout=600)
-        return remote
-    else:
-        return None
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
deleted file mode 100644
index 1dd5ca2abd00..000000000000
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ /dev/null
@@ -1,1593 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.contrib import utils
-from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm
-import pytest
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, executor_type, dtype):
-    input_shape = (1, 32, 42, 42)
-    filter_shape = (96, 32, 3, 3)
-    bias_shape = (1, 96, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=96,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, executor_type, dtype):
-    input_shape = (1, 32, 40, 40)
-    filter_shape = (96, 32, 2, 2)
-    bias_shape = (1, 96, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=96,
-        kernel_size=(2, 2),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_35_35_strides(remote, target, executor_type, dtype):
-    input_shape = (1, 48, 35, 35)
-    filter_shape = (64, 48, 5, 5)
-    bias_shape = (1, 64, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[2, 2, 2, 2],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=64,
-        kernel_size=(5, 5),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_resnet50_v2_nchw_3c(remote, target, executor_type, dtype):
-    input_shape = (1, 3, 224, 224)
-    filter_shape = (64, 3, 7, 7)
-    bias_shape = (1, 64, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[3, 3, 3, 3],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        kernel_size=(7, 7),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_nchw_3c(remote, target, executor_type, dtype):
-    input_shape = (1, 3, 299, 299)
-    filter_shape = (64, 3, 3, 3)
-    bias_shape = (1, 64, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_1x1_16c16spatial(remote, target, executor_type, dtype):
-    input_shape = (1, 16, 256, 256)
-    filter_shape = (32, 16, 4, 4)
-    bias_shape = (1, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(4, 4),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4_16c16pad(remote, target, executor_type, dtype):
-    input_shape = (1, 32, 256, 256)
-    filter_shape = (32, 32, 4, 4)
-    bias_shape = (1, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[3, 3, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(4, 4),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4x4_16c16pad(remote, target, executor_type, dtype):
-    input_shape = (1, 32, 256, 256)
-    filter_shape = (4, 32, 4, 4)
-    bias_shape = (1, 4, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[3, 3, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=4,
-        kernel_size=(4, 4),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_yolov3_v2_nchw_3c(remote, target, executor_type, dtype):
-    input_shape = (1, 1024, 13, 13)
-    filter_shape = (255, 1024, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=255,
-        kernel_size=(1, 1),
-    )
-
-    mod = relay.Function([A, B], conv)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_vgg16_winograd_4d(remote, target, executor_type, dtype):
-    input_shape = (1, 512, 28, 28)
-    filter_shape = (512, 512, 3, 3)
-    bias_shape = (1, 512, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        channels=512,
-        kernel_size=[3, 3],
-        out_dtype=dtype,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "{dtype}"], ["TENSOR", [512, 512, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_conv(remote, target, executor_type, dtype):
-    input_shape = (1, 4, 3, 3)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    filter_shape3 = (8, 4, 3, 3)
-    bias_shape3 = (8,)
-    B3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
-    D = relay.nn.conv2d(
-        A, B3, padding=[1, 1, 1, 1], channels=8, kernel_size=[3, 3], out_dtype=dtype
-    )
-
-    filter_shape4 = (8, 8, 3, 3)
-    bias_shape4 = (8,)
-    B4 = relay.var("weight4", shape=filter_shape4, dtype=dtype)
-    D = relay.nn.conv2d(
-        D, B4, padding=[1, 1, 1, 1], channels=8, kernel_size=[3, 3], out_dtype=dtype
-    )
-    mod = relay.Function([A, B3, B4], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data3 = np.zeros(filter_shape3).astype(dtype)
-    bias_data3 = np.zeros(bias_shape3).astype(dtype)
-    filter_data4 = np.zeros(filter_shape4).astype(dtype)
-    bias_data4 = np.zeros(bias_shape4).astype(dtype)
-    initializer("weight", filter_data3)
-    initializer("bias", bias_data3)
-    initializer("weight", filter_data4)
-    initializer("bias", bias_data4)
-    params1 = {
-        "weight3": tvm.nd.array(filter_data3),
-        "weight4": tvm.nd.array(filter_data4),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "{dtype}"], ["TENSOR", [8, 4, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_residual_block(remote, target, executor_type, dtype):
-    """
-    - some kind of residual block followed by convolution to have texture after residual block
-    - scalar data type verification which should be mapped to global memory scope
-        layout_transform (NCHW->NCHW4c)
-                  |                      <- buffer
-                conv2d (1)                  <- to get textures as output
-               /         \
-            conv2d (2)    |
-                 \       /
-                    add                     <- add should be fused into conv2d (2)
-                multiply to scalar          <- buffer to the input of multiply scalar value
-                    relu
-                     |                      <- texture in intermediate tensor
-                  conv2d (3)
-                   relu
-                     |                      <- buffer
-               layout_transform (NCHW4c->NCHW)
-    """
-    input_shape = (1, 32, 40, 40)
-    filter_shape1 = (32, 32, 2, 2)
-    filter_shape2 = (32, 32, 1, 1)
-    filter_shape3 = (32, 32, 2, 2)
-    bias_shape1 = (1, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
-    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
-    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
-
-    conv1 = relay.nn.conv2d(
-        A,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    D = relay.op.add(conv1, B1)
-    D = relay.op.nn.relu(D)
-
-    conv2 = relay.nn.conv2d(
-        D,
-        W2,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-    D = relay.op.add(conv2, D)
-    D = D * relay.const(0.15, dtype)
-    D = relay.op.nn.relu(D)
-
-    conv3 = relay.nn.conv2d(
-        D,
-        W3,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    D = relay.op.nn.relu(conv3)
-
-    mod = relay.Function([A, W1, B1, W2, W3], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    bias_data1 = np.zeros(bias_shape1).astype(dtype)
-    initializer("weight", filter_data1)
-    initializer("bias", bias_data1)
-    filter_data2 = np.zeros(filter_shape2).astype(dtype)
-    initializer("weight", filter_data2)
-    filter_data3 = np.zeros(filter_shape3).astype(dtype)
-    initializer("weight", filter_data3)
-    params1 = {
-        "weight1": tvm.nd.array(filter_data1),
-        "bias1": tvm.nd.array(bias_data1),
-        "weight2": tvm.nd.array(filter_data2),
-        "weight3": tvm.nd.array(filter_data3),
-    }
-    if dtype == "float16":
-        static_memory_scope = [
-            "",
-            "global.texture",
-            "global.texture-weight",
-            "global.texture-weight",
-            "global.texture",
-            "global.texture-weight",
-            "global",
-            "global.texture",
-            "global.texture-weight",
-            "",
-            "",
-        ]
-    else:
-        static_memory_scope = [
-            "",
-            "global.texture",
-            "global",
-            "global.texture-weight",
-            "global.texture",
-            "global.texture-weight",
-            "global.texture",
-            "global",
-            "",
-            "",
-        ]
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_concat(remote, target, executor_type, dtype):
-    """
-        layout_transform (NCHW->NCHW4c)
-                  |                      <- buffer
-                conv2d (1)               <- to get textures as output
-               /         \
-            conv2d (2)    conv2d (3)
-                 \       /               <- concat does not support textures, there we should have buffers
-                concatenation
-                     |                   <- buffer
-               layout_transform (NCHW4c->NCHW)
-    """
-    input_shape = (1, 32, 40, 40)
-    filter_shape1 = (96, 32, 2, 2)
-    filter_shape2 = (32, 96, 2, 2)
-    filter_shape3 = (5, 96, 2, 2)
-    bias_shape1 = (1, 96, 1, 1)
-    bias_shape2 = (1, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
-    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
-    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
-    B2 = relay.var("bias2", shape=bias_shape2, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv1 = relay.nn.conv2d(
-        A,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=96,
-        kernel_size=(2, 2),
-    )
-    D = relay.op.add(conv1, B1)
-    D = relay.op.nn.relu(D)
-
-    conv2 = relay.nn.conv2d(
-        D,
-        W2,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    conv2 = relay.op.add(conv2, B2)
-    conv2 = relay.op.nn.relu(conv2)
-
-    conv3 = relay.nn.conv2d(
-        D,
-        W3,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=5,
-        kernel_size=(2, 2),
-    )
-
-    t = relay.Tuple([conv2, conv3])
-    c = relay.op.concatenate(t, axis=1)
-
-    mod = relay.Function([A, W1, B1, W2, B2, W3], c)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    bias_data1 = np.zeros(bias_shape1).astype(dtype)
-    initializer("weight", filter_data1)
-    initializer("bias", bias_data1)
-    filter_data2 = np.zeros(filter_shape2).astype(dtype)
-    bias_data2 = np.zeros(bias_shape2).astype(dtype)
-    initializer("weight", filter_data2)
-    initializer("bias", bias_data2)
-    filter_data3 = np.zeros(filter_shape3).astype(dtype)
-    initializer("weight", filter_data3)
-    params1 = {
-        "weight1": tvm.nd.array(filter_data1),
-        "bias1": tvm.nd.array(bias_data1),
-        "weight2": tvm.nd.array(filter_data2),
-        "bias2": tvm.nd.array(bias_data2),
-        "weight3": tvm.nd.array(filter_data3),
-    }
-
-    static_memory_scope = [
-        "",
-        "global.texture",
-        "global",
-        "global.texture-weight",
-        "global",
-        "global.texture-nhwc",
-        "global",
-        "global.texture-weight",
-        "",
-        "",
-        "",
-        "",
-        "",
-    ]
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_pooling_branching_texture_params(remote, target, executor_type, dtype):
-    """
-    Verification of the pooling and many branches having textures
-                layout_transform (NCHW->NCHW4c)
-                         |                        <- buffer
-                      conv2d (0)                  <- to get textures
-                         |                        <- textures
-                     pooling
-               /           \           \          <- textures
-            conv2d (1)    conv2d (2)    conv2d (3)
-                \             /           |
-                     add                  |       <- to have  the only one output, will be fused
-                      \                  /
-                            add                  <- to have  the only one output, will be fused
-                             |                   <- buffer
-                    layout_transform (NCHW4c->NCHW)
-    """
-    input_shape = (1, 32, 40, 40)
-    filter_shape0 = (32, 32, 1, 1)
-    filter_shape1 = (32, 32, 2, 2)
-    filter_shape2 = (32, 32, 1, 1)
-    filter_shape3 = (32, 32, 2, 2)
-    bias_shape1 = (1, 32, 1, 1)
-    # bias_shape2 = (1, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W0 = relay.var("weight0", shape=filter_shape0, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
-    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
-    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
-
-    conv0 = relay.nn.conv2d(
-        A,
-        W0,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    pool = relay.nn.avg_pool2d(conv0, pool_size=(2, 2), strides=(2, 2))
-    conv1 = relay.nn.conv2d(
-        pool,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    conv1 = relay.op.add(conv1, B1)
-    conv1 = relay.op.nn.relu(conv1)
-
-    conv2 = relay.nn.conv2d(
-        pool,
-        W2,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    conv3 = relay.nn.conv2d(
-        pool,
-        W3,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 1, 1, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    conv3 = relay.op.nn.relu(conv3)
-    res = relay.op.add(conv1, conv2)
-    res = relay.op.add(res, conv3)
-
-    mod = relay.Function([A, W0, W1, B1, W2, W3], res)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data0 = np.zeros(filter_shape0).astype(dtype)
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    bias_data1 = np.zeros(bias_shape1).astype(dtype)
-    initializer("weight", filter_data1)
-    initializer("bias", bias_data1)
-    filter_data2 = np.zeros(filter_shape2).astype(dtype)
-    initializer("weight", filter_data2)
-    filter_data3 = np.zeros(filter_shape3).astype(dtype)
-    initializer("weight", filter_data3)
-    params1 = {
-        "weight0": tvm.nd.array(filter_data0),
-        "weight1": tvm.nd.array(filter_data1),
-        "bias1": tvm.nd.array(bias_data1),
-        "weight2": tvm.nd.array(filter_data2),
-        "weight3": tvm.nd.array(filter_data3),
-    }
-
-    static_memory_scope = [
-        "",
-        "global.texture",
-        "global.texture-weight",
-        "global.texture",
-        "global.texture",
-        "global",
-        "global.texture-weight",
-        "global",
-        "global.texture-weight",
-        "global.texture",
-        "global.texture",
-        "",
-        "",
-    ]
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_branching_texture_params(remote, target, executor_type, dtype):
-    """
-    Verification of passing texture to several consumers markup of relay variables in
-    primary functions + on_device
-
-                layout_transform (NCHW->NCHW4c)
-                         |                      <- buffer
-                      conv2d (0)                <- to get textures
-             /           \           \          <- here should be textures and textures in params
-          conv2d (1)    conv2d (2)    conv2d (3)
-            \             /           |
-                  add                 |         <- to have  the only one output
-                    \                /
-                           add                  <- to have  the only one output
-                            |                   <- buffer
-                    layout_transform (NCHW4c->NCHW)
-    """
-    input_shape = (1, 32, 40, 40)
-    filter_shape0 = (32, 32, 1, 1)
-    filter_shape1 = (32, 32, 2, 2)
-    filter_shape2 = (32, 32, 1, 1)
-    filter_shape3 = (32, 32, 2, 2)
-    bias_shape1 = (1, 32, 1, 1)
-    # bias_shape2 = (1, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W0 = relay.var("weight0", shape=filter_shape0, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
-    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
-    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
-
-    conv0 = relay.nn.conv2d(
-        A,
-        W0,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    conv1 = relay.nn.conv2d(
-        conv0,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    conv1 = relay.op.add(conv1, B1)
-    conv1 = relay.op.nn.relu(conv1)
-
-    conv2 = relay.nn.conv2d(
-        conv0,
-        W2,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    conv3 = relay.nn.conv2d(
-        conv0,
-        W3,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 1, 1, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(2, 2),
-    )
-    conv3 = relay.op.nn.relu(conv3)
-    res = relay.op.add(conv1, conv2)
-    res = relay.op.add(res, conv3)
-
-    mod = relay.Function([A, W0, W1, B1, W2, W3], res)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data0 = np.zeros(filter_shape0).astype(dtype)
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    bias_data1 = np.zeros(bias_shape1).astype(dtype)
-    initializer("weight", filter_data1)
-    initializer("bias", bias_data1)
-    filter_data2 = np.zeros(filter_shape2).astype(dtype)
-    initializer("weight", filter_data2)
-    filter_data3 = np.zeros(filter_shape3).astype(dtype)
-    initializer("weight", filter_data3)
-    params1 = {
-        "weight0": tvm.nd.array(filter_data0),
-        "weight1": tvm.nd.array(filter_data1),
-        "bias1": tvm.nd.array(bias_data1),
-        "weight2": tvm.nd.array(filter_data2),
-        "weight3": tvm.nd.array(filter_data3),
-    }
-
-    static_memory_scope = [
-        "",
-        "global.texture",
-        "global.texture-weight",
-        "global.texture",
-        "global",
-        "global.texture-weight",
-        "global",
-        "global.texture-weight",
-        "global.texture",
-        "global.texture",
-        "",
-        "",
-    ]
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-# function repeat, params scope are different in reused functions
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_different_lowering_same_op(remote, target, executor_type, dtype):
-    """
-    Use case for verification of caching compiled functions
-    Three convolutions following by each other in this case should be
-    compiled in three different entities and lowered differently because
-    they are differ in input param memory scopes and in output memory scope
-
-                layout_transform (NCHW->NCHW4c)
-                         |                      <- buffer
-                      conv2d (1)                <- buffer as input tensor and texture as output
-                         |                      <- texture
-                      conv2d (2)                <- texture as input and texture as output
-                         |                      <- texture
-                      conv2d (3)                <- texture as input and buffer as output
-                         |                      <- buffer
-                    layout_transform (NCHW4c->NCHW)
-    """
-    input_shape = (1, 32, 40, 40)
-    filter_shape1 = (32, 32, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-
-    conv1 = relay.nn.conv2d(
-        A,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    conv2 = relay.nn.conv2d(
-        conv1,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    conv3 = relay.nn.conv2d(
-        conv2,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(1, 1),
-    )
-
-    mod = relay.Function([A, W1], conv3)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    params1 = {
-        "weight1": tvm.nd.array(filter_data1),
-    }
-
-    static_memory_scope = [
-        "",
-        "global.texture",
-        "global.texture-weight",
-        "global.texture",
-        "global.texture",
-        "",
-        "",
-    ]
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_non_rect(remote, target, executor_type, dtype):
-    input_shape = (1, 771, 36, 64)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    filter_shape = (128, 771, 3, 3)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    D = relay.nn.conv2d(
-        A, B, padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3], out_dtype=dtype
-    )
-
-    mod = relay.Function([A, B], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 771, 36, 64], "{dtype}"], ["TENSOR", [128, 771, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-# function repeat, params scope are different in reused functions
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_injective_nwo_inputs1(remote, target, executor_type, dtype):
-    """
-    Use case for verification of stability of annotation primary functions
-    having several ops accepting data outside of Primary function
-    The visiting of ops during traversing of graph inside primary function
-    can depend on order of relay graph creation. Thus the annotation mechanism
-    should be reliable for graph traversal order
-    The current decision if Prim Function support textures or not depend on
-    *any* op accepting input of the function and if op support textures
-                                     Input
-                               /                   \
-                layout_transform (NCHW->NCHW4c)    |
-                         |                        /
-                      conv2d (1)                 /
-                         |                      /
-                      conv2d (2)       mean    /
-                  /         \                 /   <- Primary function several head ops
-             (1)add    (2)layout_transform    |
-                 |        (NCHW4c->NCHW)      |
-                 |           |      \        /
-                 |           |       (3) add
-                 |           |         |
-    layout_transform          \       /
-     (NCHW4c->NCHW)             \    /
-                 \                mul
-                  \            /
-                        add
-
-    This test verifies a case when the latest op which is visited is (3) and does not
-    support textures, but there is (1) supporting textures, thus the whole func will
-    support textures
-    """
-    input_shape = (1, 4, 40, 40)
-    filter_shape1 = (4, 4, 3, 3)
-    filter_shape2 = (4, 4, 3, 3)
-    filter_shape3 = (4, 4, 3, 3)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
-    mean = relay.mean(A, axis=1, keepdims=True)
-    conv1 = relay.nn.conv2d(
-        A,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=4,
-        kernel_size=(3, 3),
-    )
-
-    conv2 = relay.nn.conv2d(
-        conv1,
-        W2,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=4,
-        kernel_size=(3, 3),
-    )
-
-    ad3 = relay.op.add(conv1, conv2)
-    ad1 = relay.op.add(mean, conv1)
-    ad2 = relay.op.multiply(ad1, conv2)
-    ad4 = relay.op.add(ad3, ad2)
-
-    mod = relay.Function([A, W1, W2], ad4)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    filter_data2 = np.zeros(filter_shape2).astype(dtype)
-    initializer("weight", filter_data1)
-    initializer("weight", filter_data2)
-    params1 = {
-        "weight1": tvm.nd.array(filter_data1),
-        "weight2": tvm.nd.array(filter_data2),
-    }
-
-    static_memory_scope = [
-        "",
-        "global.texture",
-        "global",
-        "global.texture",
-        "global",
-        "global.texture",
-        "global",
-        "global",
-    ]
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-# function repeat, params scope are different in reused functions
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_injective_nwo_inputs2(remote, target, executor_type, dtype):
-    """
-    Use case for verification of stability of annotation primary functions
-    having several ops accepting data outside of Primary function
-    The visiting of ops during traversing of graph inside primary function
-    can depend on order of relay graph creation. Thus the annotation mechanism
-    should be reliable for graph traversal order
-    The current decision if Prim Function support textures or not depend on
-    *any* op accepting input of the function and if op support textures
-                                     Input
-                               /                   \
-                layout_transform (NCHW->NCHW4c)    |
-                         |                        /
-                      conv2d (1)                 /
-                         |                      /
-                      conv2d (2)       mean    /
-                  /         \                 /   <- Primary function several head ops
-             (1)add    (2)layout_transform    |
-                 |        (NCHW4c->NCHW)      |
-                 |           |      \        /
-                 |           |       (3) add
-                 |           |         |
-    layout_transform          \       /
-     (NCHW4c->NCHW)             \    /
-                 \                mul
-                  \            /
-                        add
-
-    This test verifies a case when the latest op which is (1), it supports textures
-    an whole prim function is considered as a func working with textures
-    """
-    input_shape = (1, 4, 40, 40)
-    filter_shape1 = (4, 4, 3, 3)
-    filter_shape2 = (4, 4, 3, 3)
-    filter_shape3 = (4, 4, 3, 3)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
-    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
-    mean = relay.mean(A, axis=1, keepdims=True)
-    conv1 = relay.nn.conv2d(
-        A,
-        W1,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=4,
-        kernel_size=(3, 3),
-    )
-
-    conv2 = relay.nn.conv2d(
-        conv1,
-        W2,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=4,
-        kernel_size=(3, 3),
-    )
-
-    ad3 = relay.op.add(conv1, conv2)
-    ad1 = relay.op.add(mean, conv1)
-    ad2 = relay.op.multiply(ad1, conv2)
-    ad4 = relay.op.add(ad2, ad3)
-
-    mod = relay.Function([A, W1, W2], ad4)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data1 = np.zeros(filter_shape1).astype(dtype)
-    filter_data2 = np.zeros(filter_shape2).astype(dtype)
-    initializer("weight", filter_data1)
-    initializer("weight", filter_data2)
-    params1 = {
-        "weight1": tvm.nd.array(filter_data1),
-        "weight2": tvm.nd.array(filter_data2),
-    }
-
-    static_memory_scope = [
-        "",
-        "global.texture",
-        "global",
-        "global.texture",
-        "global",
-        "global",
-        "global.texture",
-        "global",
-    ]
-    if executor_type == "ge":
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_to_3_channels(remote, target, executor_type, dtype):
-    input_shape = (1, 256, 200, 200)
-    filter_shape = (3, 256, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    D = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        out_dtype=dtype,
-        channels=3,
-        kernel_size=(1, 1),
-    )
-    mod = relay.Function([A, B], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [])
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_weight_on_buffers(remote, target, executor_type, dtype):
-    target = "opencl -device=adreno"
-    input_shape = (1, 64, 75, 75)
-    filter_shape = (64, 64, 3, 3)
-    bias_shape = (64,)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    W = relay.var("weight", shape=filter_shape, dtype=dtype)
-    BS = relay.var("bias", shape=bias_shape, dtype=dtype)
-    conv = relay.nn.conv2d(A, W, padding=[1, 1, 1, 1], channels=64, kernel_size=(3, 3))
-    conv = relay.nn.bias_add(conv, BS)
-    conv = relay.op.nn.relu(conv)
-
-    mod = relay.Function([A, W, BS], conv)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        static_memory_scope = [
-            "",
-            "global.texture",
-            "global",
-            "global.texture-weight",
-            "",
-            "",
-        ]
-        build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-    else:
-        static_memory_scope = """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global
-        VM VirtualDevice[4]: device type 4, id 0 and mem_scope global.texture-weight
-        """
-        build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            static_memory_scope,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
deleted file mode 100644
index dc86a231877f..000000000000
--- a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
+++ /dev/null
@@ -1,898 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import re
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.contrib import utils
-from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm
-import pytest
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(remote, target, executor_type, dtype):
-    input_shape = (1, 257, 257, 32)
-    filter_shape = (1, 1, 32, 16)
-    bias_shape = (filter_shape[-1],)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype=dtype,
-        channels=filter_shape[-1],
-        kernel_size=(1, 1),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(remote, target, executor_type, dtype):
-    input_shape = (1, 257, 257, 32)
-    filter_shape = (1, 1, 32, 16)
-    bias_shape = (filter_shape[-1],)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[3, 3, 3, 3],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=filter_shape[-1],
-        kernel_size=(1, 1),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4_35_35_32x3_3_144_16(remote, target, executor_type, dtype):
-    input_shape = (4, 35, 35, 32)
-    filter_shape = (3, 3, 32, 16)
-    bias_shape = (filter_shape[-1],)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype=dtype,
-        channels=filter_shape[-1],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(remote, target, executor_type, dtype):
-    input_shape = (1, 513, 513, 3)
-    filter_shape = (3, 3, 3, 32)
-    bias_shape = (filter_shape[-1],)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype=dtype,
-        channels=filter_shape[-1],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.ones(filter_shape).astype(dtype)
-    bias_data = np.ones(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, executor_type, dtype):
-    input_shape = (1, 42, 42, 32)
-    filter_shape = (3, 3, 32, 96)
-    bias_shape = (1, 1, 1, 96)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=96,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, executor_type, dtype):
-    input_shape = (1, 40, 40, 32)
-    filter_shape = (2, 2, 32, 96)
-    bias_shape = (1, 1, 1, 96)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=96,
-        kernel_size=(2, 2),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_35_35_strides(remote, target, executor_type, dtype):
-    input_shape = (1, 35, 35, 48)
-    filter_shape = (5, 5, 48, 64)
-    bias_shape = (1, 1, 1, 64)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[2, 2, 2, 2],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=64,
-        kernel_size=(5, 5),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_resnet50_v2_nhwc_3c(remote, target, executor_type, dtype):
-    input_shape = (1, 224, 224, 3)
-    filter_shape = (7, 7, 3, 64)
-    bias_shape = (1, 1, 1, 64)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[3, 3, 3, 3],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        kernel_size=(7, 7),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_nhwc_3c(remote, target, executor_type, dtype):
-    input_shape = (1, 299, 299, 3)
-    filter_shape = (3, 3, 3, 64)
-    bias_shape = (1, 1, 1, 64)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_1x1_16c16spatial(remote, target, executor_type, dtype):
-    input_shape = (1, 128, 128, 16)
-    filter_shape = (4, 4, 16, 32)
-    bias_shape = (1, 1, 1, 32)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(4, 4),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4_16c16pad(remote, target, executor_type, dtype):
-    input_shape = (1, 256, 256, 32)
-    filter_shape = (4, 4, 32, 32)
-    bias_shape = (1, 1, 1, 32)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[3, 3, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=32,
-        kernel_size=(4, 4),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4x4_16c16pad(remote, target, executor_type, dtype):
-    input_shape = (1, 256, 256, 32)
-    filter_shape = (4, 4, 32, 4)
-    bias_shape = (1, 1, 1, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[3, 3, 0, 0],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=4,
-        kernel_size=(4, 4),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_yolov3_v2_nhwc_3c(remote, target, executor_type, dtype):
-    input_shape = (1, 13, 13, 1024)
-    filter_shape = (1, 1, 1024, 255)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=255,
-        kernel_size=(1, 1),
-    )
-
-    mod = relay.Function([A, B], conv)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_vgg16_winograd_4d(remote, target, executor_type, dtype):
-    input_shape = (1, 28, 28, 512)
-    filter_shape = (3, 3, 512, 512)
-    bias_shape = (1, 1, 1, 512)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[1, 1, 1, 1],
-        channels=512,
-        kernel_size=[3, 3],
-        out_dtype=dtype,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "{dtype}"], ["TENSOR", [3, 3, 512, 512], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_vgg16_winograd_4d_expand_spatial_dims(remote, target, executor_type, dtype):
-    input_shape = (1, 28, 28, 1)
-    filter_shape = (3, 3, 1, 64)
-    bias_shape = (1, 1, 1, 64)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        kernel_size=[3, 3],
-        out_dtype=dtype,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 1], "{dtype}"], ["TENSOR", [3, 3, 1, 64], "{dtype}"], [1, 1], [0, 0, 0, 0], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_conv(remote, target, executor_type, dtype):
-    input_shape = (1, 3, 3, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    filter_shape3 = (3, 3, 4, 8)
-    bias_shape3 = (1, 1, 1, 8)
-    B3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
-    D = relay.nn.conv2d(
-        A,
-        B3,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[1, 1, 1, 1],
-        channels=8,
-        kernel_size=[3, 3],
-        out_dtype=dtype,
-    )
-
-    filter_shape4 = (3, 3, 8, 8)
-    bias_shape4 = (1, 1, 1, 8)
-    B4 = relay.var("weight4", shape=filter_shape4, dtype=dtype)
-    D = relay.nn.conv2d(
-        D,
-        B4,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[1, 1, 1, 1],
-        channels=8,
-        kernel_size=[3, 3],
-        out_dtype=dtype,
-    )
-    mod = relay.Function([A, B3, B4], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data3 = np.zeros(filter_shape3).astype(dtype)
-    bias_data3 = np.zeros(bias_shape3).astype(dtype)
-    filter_data4 = np.zeros(filter_shape4).astype(dtype)
-    bias_data4 = np.zeros(bias_shape4).astype(dtype)
-    initializer("weight", filter_data3)
-    initializer("bias", bias_data3)
-    initializer("weight", filter_data4)
-    initializer("bias", bias_data4)
-    params1 = {
-        "weight3": tvm.nd.array(filter_data3),
-        "weight4": tvm.nd.array(filter_data4),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 3, 3, 4], "{dtype}"], ["TENSOR", [3, 3, 4, 8], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_non_rect(remote, target, executor_type, dtype):
-    input_shape = (1, 36, 64, 771)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    filter_shape = (3, 3, 771, 128)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    D = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[1, 1, 1, 1],
-        channels=128,
-        kernel_size=[3, 3],
-        out_dtype=dtype,
-    )
-
-    mod = relay.Function([A, B], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    temp = utils.tempdir()
-    stat_file = temp.relpath("stat.log")
-    with open(stat_file, "w") as f:
-        f.write(
-            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 36, 64, 771], "{dtype}"], ["TENSOR", [3, 3, 771, 128], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
-        )
-    if executor_type == "ge":
-        graph = build_run_compare(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", graph)
-        assert len(matches) > 0
-    else:
-        vmc = build_run_compare_vm(
-            remote,
-            mod,
-            params1,
-            {"data": input_shape},
-            {"data": dtype},
-            target,
-            stat_file=stat_file,
-        )
-        matches = re.findall("winograd", vmc.primitives)
-        assert len(matches) > 0
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_to_3_channels(remote, target, executor_type, dtype):
-    input_shape = (1, 200, 200, 256)
-    filter_shape = (1, 1, 256, 3)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    D = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        padding=[0, 0, 0, 0],
-        out_dtype=dtype,
-        channels=3,
-        kernel_size=(1, 1),
-    )
-    mod = relay.Function([A, B], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [])
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py
deleted file mode 100644
index d110c8329fd1..000000000000
--- a/tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py
+++ /dev/null
@@ -1,325 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.contrib import utils
-from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm
-import pytest
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_transpose_adreno(remote, target, executor_type, dtype):
-    # Conv2d transpose test cases lists
-    trials = [
-        [4, 4, (1, 1), (2, 2), (1, 1), 64, (256, 100, 100), (False, False), gpu_preprocess],
-        [4, 4, (0, 0), (2, 2), (1, 1), 256, (32, 64, 64), (False, False), None],
-        [3, 3, (0, 0), (2, 2), (1, 1), 64, (256, 100, 100), (True, True), None],
-        [4, 4, (1, 1), (1, 1), (1, 1), 512, (16, 100, 100), (False, False), gpu_preprocess],
-        [5, 5, (2, 2), (2, 2), (1, 1), 4, (16, 100, 100), (True, False), gpu_preprocess],
-        [7, 7, (3, 3), (2, 2), (1, 1), 8, (4, 100, 100), (False, True), None],
-        [7, 7, (3, 3), (2, 2), (1, 1), 64, (3, 100, 100), (True, True), None],
-        [3, 3, (1, 1), (1, 1), (1, 1), 3, (16, 8, 8), (True, True), None],
-    ]
-    # Tensors memory scope with graph executor build
-    ge_texture_scopes = [
-        ["", "global.texture", "global.texture-weight", "", ""],
-        ["", "global.texture", "global.texture-weight", "", ""],
-        ["", "global.texture", "global.texture-weight", "global.texture-weight", "", ""],
-        ["", "global.texture", "global.texture-weight", "", ""],
-        ["", "global.texture", "global.texture-weight", "global.texture-weight", "", ""],
-        ["", "global.texture", "global.texture-nhwc", "", ""],
-        [],
-        [],
-    ]
-    # Tensors memory scope with vm executor build
-    vm_texture_scopes = [
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[4]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[4]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-nhwc
-        """,
-        [],
-        [],
-    ]
-
-    for i, (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        _gpu_preprocess,
-    ) in enumerate(trials):
-        shape = (1, *shape)
-        has_bias = composite[0]
-        has_activation = composite[1]
-        input_shape = shape
-        filter_shape = (shape[1], out_channels, kernel_w, kernel_h)
-        x = relay.var("data", shape=input_shape, dtype=dtype)
-        w = relay.var("weight", shape=filter_shape, dtype=dtype)
-        inputs = [x, w]
-        y = relay.nn.conv2d_transpose(
-            x,
-            w,
-            channels=out_channels,
-            kernel_size=(kernel_w, kernel_h),
-            strides=stride,
-            padding=pad,
-            kernel_layout="IOHW",
-            data_layout="NCHW",
-            dilation=dilation,
-        )
-
-        np.random.seed(0)
-        initializer = relay.testing.init.Xavier()
-        filter_data = np.zeros(filter_shape).astype(dtype)
-        initializer("weight", filter_data)
-        params1 = {
-            "weight": tvm.nd.array(filter_data),
-        }
-
-        if has_bias:
-            b = relay.var("bias", shape=(out_channels,), dtype=dtype)
-            y = relay.nn.bias_add(y, b, axis=1)
-            inputs.append(b)
-            bias_data = np.zeros((out_channels,)).astype(dtype)
-            initializer("bias", bias_data)
-            params1["bias"] = tvm.nd.array(bias_data)
-        if has_activation:
-            y = relay.nn.relu(y)
-
-        mod = relay.Function(inputs, y)
-        if executor_type == "ge":
-            build_run_compare(
-                remote,
-                mod,
-                params1,
-                {"data": input_shape},
-                {"data": dtype},
-                target,
-                ge_texture_scopes[i],
-                _gpu_preprocess,
-            )
-        else:
-            build_run_compare_vm(
-                remote,
-                mod,
-                params1,
-                {"data": input_shape},
-                {"data": dtype},
-                target,
-                vm_texture_scopes[i],
-                _gpu_preprocess,
-            )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_transpose_three_layer_block(remote, target, executor_type, dtype):
-    # Conv2d transpose test cases lists
-    trials = [
-        [4, 4, (1, 1), (2, 2), (1, 1), 64, (256, 100, 100), (False, False), None],
-        [3, 3, (0, 0), (1, 1), (1, 1), 64, (256, 12, 12), (True, True), gpu_preprocess],
-    ]
-    ge_texture_scopes = [
-        [
-            "",
-            "global.texture",
-            "global.texture-weight",
-            "global.texture",
-            "global.texture-weight",
-            "global.texture",
-            "global.texture-weight",
-            "",
-            "",
-        ],
-        [
-            "",
-            "global.texture-nhwc",
-            "global.texture-weight",
-            "global.texture-nhwc",
-            "global.texture-weight",
-            "global.texture-weight",
-            "global.texture-nhwc",
-            "global.texture-weight",
-            "",
-            "",
-        ],
-    ]
-    vm_texture_scopes = [
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[4]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[5]: device type 4, id 0 and mem_scope global.texture
-        VM VirtualDevice[6]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[7]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-        """
-        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
-        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
-        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture-nhwc
-        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture-nhwc
-        VM VirtualDevice[4]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[5]: device type 4, id 0 and mem_scope global.texture-nhwc
-        VM VirtualDevice[6]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[7]: device type 4, id 0 and mem_scope global.texture-weight
-        VM VirtualDevice[8]: device type 4, id 0 and mem_scope global.texture-weight
-        """,
-    ]
-
-    for i, (
-        kernel_h,
-        kernel_w,
-        pad,
-        stride,
-        dilation,
-        out_channels,
-        shape,
-        composite,
-        _gpu_preprocess,
-    ) in enumerate(trials):
-        shape = (1, *shape)
-        has_bias = composite[0]
-        has_activation = composite[1]
-        input_shape = shape
-        filter_shape = (shape[1], out_channels, kernel_w, kernel_h)
-        x = relay.var("data", shape=input_shape, dtype=dtype)
-        w = relay.var("weight", shape=filter_shape, dtype=dtype)
-        inputs = [x, w]
-        W1 = relay.var("weight1", shape=(shape[1], shape[1], 1, 1), dtype=dtype)
-        conv = relay.nn.conv2d(x, W1, padding=[0, 0, 0, 0], channels=shape[1], kernel_size=(1, 1))
-        inputs.append(W1)
-        conv = relay.op.nn.relu(conv)
-        y = relay.nn.conv2d_transpose(
-            conv,
-            w,
-            channels=out_channels,
-            kernel_size=(kernel_w, kernel_h),
-            strides=stride,
-            padding=pad,
-            kernel_layout="IOHW",
-            data_layout="NCHW",
-            dilation=dilation,
-        )
-
-        if has_bias:
-            b = relay.var("bias", shape=(out_channels,), dtype=dtype)
-            y = relay.nn.bias_add(y, b, axis=1)
-            inputs.append(b)
-
-        if has_activation:
-            y = relay.nn.relu(y)
-        W2 = relay.var("weight2", shape=(out_channels, out_channels, 1, 1), dtype=dtype)
-        out = relay.nn.conv2d(
-            y, W2, padding=[0, 0, 0, 0], channels=out_channels, kernel_size=(1, 1)
-        )
-        out = relay.op.nn.relu(out)
-        np.random.seed(0)
-        inputs.append(W2)
-        initializer = relay.testing.init.Xavier()
-        filter_data = np.zeros(filter_shape).astype(dtype)
-        initializer("weight", filter_data)
-        filter_data1 = np.zeros((shape[1], shape[1], 1, 1)).astype(dtype)
-        initializer("weight", filter_data1)
-        filter_data2 = np.zeros((out_channels, out_channels, 1, 1)).astype(dtype)
-        initializer("weight", filter_data2)
-        params1 = {
-            "weight": tvm.nd.array(filter_data),
-            "weight1": tvm.nd.array(filter_data1),
-            "weight2": tvm.nd.array(filter_data2),
-        }
-        if has_bias:
-            bias_data = np.zeros((out_channels,)).astype(dtype)
-            initializer("bias", bias_data)
-            params1["bias"] = tvm.nd.array(bias_data)
-
-        mod = relay.Function(inputs, out)
-
-        if executor_type == "ge":
-            build_run_compare(
-                remote,
-                mod,
-                params1,
-                {"data": input_shape},
-                {"data": dtype},
-                target,
-                ge_texture_scopes[i],
-                _gpu_preprocess,
-            )
-        else:
-            build_run_compare_vm(
-                remote,
-                mod,
-                params1,
-                {"data": input_shape},
-                {"data": dtype},
-                target,
-                vm_texture_scopes[i],
-                _gpu_preprocess,
-            )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
deleted file mode 100644
index 87e9542140d1..000000000000
--- a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_bias_nchwc(remote, target, executor_type, dtype):
-    input_shape = (1, 64, 112, 112)
-    filter_shape = (64, 1, 3, 3)
-    bias_shape = (1, 64, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        groups=64,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_nchwc(remote, target, executor_type, dtype):
-    input_shape = (1, 64, 112, 112)
-    filter_shape = (64, 1, 3, 3)
-    bias_shape = (1, 64, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        groups=64,
-        kernel_size=(3, 3),
-    )
-
-    mod = relay.Function([A, B], conv)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
-        )
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_bias_nchw(remote, target, executor_type, dtype):
-    input_shape = (1, 64, 112, 112)
-    filter_shape = (64, 1, 3, 3)
-    bias_shape = (1, 64, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=64,
-        groups=64,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_repack_bias_nchw(remote, target, executor_type, dtype):
-    input_shape = (1, 63, 112, 112)
-    filter_shape = (63, 1, 3, 3)
-    bias_shape = (1, 63, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=63,
-        groups=63,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_to_3_channels(remote, target, executor_type, dtype):
-    input_shape = (1, 3, 200, 200)
-    filter_shape = (3, 1, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    D = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[0, 0, 0, 0],
-        out_dtype=dtype,
-        channels=3,
-        groups=3,
-        kernel_size=(1, 1),
-    )
-    mod = relay.Function([A, B], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [])
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
deleted file mode 100644
index 782c99a96a8f..000000000000
--- a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
+++ /dev/null
@@ -1,284 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from utils.adreno_utils import build_run_compare, build_run_compare_vm
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(remote, target, executor_type, dtype):
-    input_shape = (1, 129, 129, 144)
-    filter_shape = (3, 3, 144, 1)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    bias_shape = (filter_shape[2],)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype=dtype,
-        groups=filter_shape[2],
-        channels=filter_shape[2],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    mod = relay.Function([A, B, bias], conv)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(remote, target, executor_type, dtype):
-    input_shape = (4, 35, 35, 576)
-    filter_shape = (3, 3, 576, 1)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    bias_shape = (filter_shape[2],)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype=dtype,
-        groups=filter_shape[2],
-        channels=filter_shape[2],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    mod = relay.Function([A, B, bias], conv)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(
-    remote, target, executor_type, dtype
-):
-    input_shape = (1, 129, 129, 144)
-    filter_shape = (3, 3, 144, 1)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    bias_shape = (filter_shape[2],)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        padding=[3, 3, 3, 3],
-        strides=[2, 2],
-        out_dtype=dtype,
-        groups=filter_shape[2],
-        channels=filter_shape[2],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    # mod, params = relay.testing.init.create_workload(func)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_1_513_513_7x3_3_7_1(remote, target, executor_type, dtype):
-    input_shape = (1, 513, 513, 7)
-    filter_shape = (3, 3, 7, 1)
-    bias_shape = (filter_shape[2],)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype=dtype,
-        channels=filter_shape[2],
-        groups=filter_shape[2],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.ones(filter_shape).astype(dtype)
-    bias_data = np.ones(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_1_513_513_3x3_3_3_1(remote, target, executor_type, dtype):
-    input_shape = (1, 513, 513, 3)
-    filter_shape = (3, 3, 3, 1)
-    bias_shape = (filter_shape[2],)
-    kernel_size = (filter_shape[0], filter_shape[1])
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype=dtype,
-        channels=filter_shape[2],
-        groups=filter_shape[2],
-        kernel_size=kernel_size,
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.ones(filter_shape).astype(dtype)
-    bias_data = np.ones(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_to_3_channels(remote, target, executor_type, dtype):
-    input_shape = (1, 200, 200, 3)
-    filter_shape = (1, 1, 3, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-
-    D = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        padding=[0, 0, 0, 0],
-        out_dtype=dtype,
-        channels=3,
-        groups=3,
-        kernel_size=(1, 1),
-    )
-    mod = relay.Function([A, B], D)
-    np.random.seed(0)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    initializer("weight", filter_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [])
-    else:
-        build_run_compare_vm(
-            remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_group_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_group_conv2d_nchw_texture.py
deleted file mode 100644
index bd05610e92b7..000000000000
--- a/tests/python/relay/opencl_texture/test_group_conv2d_nchw_texture.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import re
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from utils.adreno_utils import build_run_compare, build_run_compare_vm
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_group_conv2d_nchwc_adreno_encoder1(remote, target, executor_type, dtype):
-    input_shape = (1, 512, 56, 100)
-    filter_shape = (512, 64, 3, 3)
-    bias_shape = (1, 512, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=512,
-        groups=8,
-        dilation=1,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_group_conv2d_nchwc_adreno_encoder2(remote, target, executor_type, dtype):
-    input_shape = (1, 1024, 56, 100)
-    filter_shape = (512, 128, 3, 3)
-    bias_shape = (1, 512, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[3, 3, 3, 3],
-        strides=[2, 2],
-        out_dtype=dtype,
-        channels=512,
-        groups=8,
-        dilation=2,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_group_conv2d_nchwc_adreno_nontrivial(remote, target, executor_type, dtype):
-    input_shape = (1, 56, 56, 100)
-    filter_shape = (112, 8, 7, 3)
-    bias_shape = (1, 112, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[3, 3, 3, 3],
-        strides=[1, 2],
-        out_dtype=dtype,
-        channels=112,
-        groups=7,
-        dilation=2,
-        kernel_size=(7, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_group_conv2d_nchwc_default(remote, target, executor_type, dtype):
-    input_shape = (1, 49, 56, 100)
-    filter_shape = (343, 7, 3, 3)
-    bias_shape = (1, 343, 1, 1)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    B = relay.var("weight", shape=filter_shape, dtype=dtype)
-    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-    # C = relay.nn.relu(A)
-    conv = relay.nn.conv2d(
-        A,
-        B,
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        padding=[1, 1, 1, 1],
-        strides=[1, 1],
-        out_dtype=dtype,
-        channels=343,
-        groups=7,
-        dilation=1,
-        kernel_size=(3, 3),
-    )
-    D = relay.op.add(conv, bias)
-    D = relay.op.nn.relu(D)
-
-    mod = relay.Function([A, B, bias], D)
-    np.random.seed(1)
-    initializer = relay.testing.init.Xavier()
-    filter_data = np.zeros(filter_shape).astype(dtype)
-    bias_data = np.zeros(bias_shape).astype(dtype)
-    initializer("weight", filter_data)
-    initializer("bias", bias_data)
-    params1 = {
-        "weight": tvm.nd.array(filter_data),
-        "bias": tvm.nd.array(bias_data),
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_injection_texture.py b/tests/python/relay/opencl_texture/test_injection_texture.py
deleted file mode 100644
index 31c082c99496..000000000000
--- a/tests/python/relay/opencl_texture/test_injection_texture.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import pytest
-import tvm
-import numpy as np
-from tvm import relay
-from utils.adreno_utils import build_run_compare, build_run_compare_vm
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_layout_transform_to_block_nchw4c(remote, target, executor_type, dtype):
-    """Verification of the case NCHW->NCHW4c"""
-    input_shape = (1, 32, 720, 1280)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    lt = relay.layout_transform(A, "NCHW", "NCHW4c")
-    mod = relay.Function([A], lt)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_layout_transform_to_block_nchw(remote, target, executor_type, dtype):
-    """Verification of the case NCHW4c->NCHW"""
-    input_shape = (1, 36, 1, 1, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    lt = relay.layout_transform(A, "NCHW4c", "NCHW")
-    mod = relay.Function([A], lt)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_layout_transform_to_block_nhwc4c(remote, target, executor_type, dtype):
-    """Verification of the case NHWC->NHWC4c"""
-    input_shape = (1, 1, 1, 144)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    lt = relay.layout_transform(A, "NHWC", "NHWC4c")
-    mod = relay.Function([A], lt)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@pytest.mark.skipif(
-    tvm.testing.utils.IS_IN_CI, reason="Skip because GPU in CI doesn't support FP16"
-)
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_layout_transform_to_block_nhwc(remote, target, executor_type, dtype):
-    """Verification of the case NHWC4c->NHWC"""
-    input_shape = (1, 80, 80, 36, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.mean(A, axis=[1, 2], keepdims=True)
-    cast = relay.cast(mean, "float16")
-    lt = relay.layout_transform(cast, "NHWC4c", "NHWC")
-    mod = relay.Function([A], lt)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-if __name__ == "__main__":
-    test_layout_transform_to_block_nhwc(None, "opencl -device=adreno", "float16")
diff --git a/tests/python/relay/opencl_texture/test_network.py b/tests/python/relay/opencl_texture/test_network.py
deleted file mode 100644
index 66c88ebbe294..000000000000
--- a/tests/python/relay/opencl_texture/test_network.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-
-import numpy as np
-import pytest
-import tvm
-from tvm import relay
-from tvm.contrib import utils
-from tvm.relay import testing
-from tvm.relay.op import register_mixed_precision_conversion
-from utils.adreno_utils import build_run_compare, build_run_compare_vm, get_model, gpu_preprocess
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-
-
-def _test_mobilenet_v1(remote, target, calc_dtype, executor_type, acc_dtype):
-    mod, params, inputs, dtypes = get_model(
-        "https://github.com/mlcommons/mobile_models/raw/main/v0_7/tflite/mobilenet_edgetpu_224_1.0_float.tflite",
-        "mobilenet_edgetpu_224_1.0_float.tflite",
-        "tflite",
-    )
-    if calc_dtype == "float16":
-        from tvm.driver.tvmc.transform import apply_graph_transforms
-
-        mod = apply_graph_transforms(
-            mod,
-            {
-                "mixed_precision": True,
-                "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
-                "mixed_precision_calculation_type": calc_dtype,
-                "mixed_precision_acc_type": acc_dtype,
-            },
-            params,
-        )
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, params, inputs, dtypes, target, [])
-    else:
-        build_run_compare_vm(remote, mod, params, inputs, dtypes, target, [])
-
-
-@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443")
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="CI doesn't support fp16(half datatypes)")
-def test_mobilenet_v1_fp16(remote, target, executor_type):
-    _test_mobilenet_v1(remote, target, "float16", executor_type, "float16")
-
-
-@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443")
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mobilenet_v1_fp32(remote, target, executor_type):
-    _test_mobilenet_v1(remote, target, "float32", executor_type, "float32")
-
-
-@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443")
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mobilenet_v1_fp16_acc32(remote, target, executor_type):
-    _test_mobilenet_v1(remote, target, "float16", executor_type, "float32")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_pool_texture.py b/tests/python/relay/opencl_texture/test_pool_texture.py
deleted file mode 100644
index 6190790a3dd6..000000000000
--- a/tests/python/relay/opencl_texture/test_pool_texture.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import relay
-from utils.adreno_utils import build_run_compare, build_run_compare_vm
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_pool2d_nchw_wide(remote, target, executor_type, dtype):
-    """
-    Use case of NCHW global pooling with big spatial valies
-    """
-    input_shape = (1, 32, 160, 160)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_avg_pool2d(A)
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_pool2d_nchw4c_wide(remote, target, executor_type, dtype):
-    """
-    Use case of blocked NCHW4c global pooling with big spatial valies
-    """
-    input_shape = (1, 8, 160, 160, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_avg_pool2d(A, layout="NCHW4c")
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_pool2d_nchw_deep(remote, target, executor_type, dtype):
-    """
-    Use case of NCHW deep global pooling
-    """
-    input_shape = (1, 2048, 20, 20)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_avg_pool2d(A)
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_pool2d_nchw4c_deep(remote, target, executor_type, dtype):
-    """
-    Use case of blocked NCHW4c deep global pooling
-    """
-    input_shape = (1, 512, 20, 20, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_avg_pool2d(A, layout="NCHW4c")
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_pool2d_nhwc(remote, target, executor_type, dtype):
-    """
-    Use case of NHWC global pooling with big spatial valies
-    """
-    input_shape = (1, 160, 160, 32)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_avg_pool2d(A, layout="NHWC")
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_pool2d_nhwc4c(remote, target, executor_type, dtype):
-    """
-    Use case of NHWC deep global pooling
-    """
-    input_shape = (1, 160, 160, 8, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_avg_pool2d(A, layout="NHWC4c")
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_max_pool2d_nchw_wide(remote, target, executor_type, dtype):
-    """
-    Use case of NCHW global pooling with big spatial valies
-    """
-    input_shape = (1, 32, 160, 160)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_max_pool2d(A)
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_global_max_pool2d_nchw4c_wide(remote, target, executor_type, dtype):
-    """
-    Use case of blocked NCHW4c global pooling with big spatial valies
-    """
-    input_shape = (1, 8, 160, 160, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    C = relay.nn.global_max_pool2d(A, layout="NCHW4c")
-    mod = relay.Function([A], C)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_reduction_texture.py b/tests/python/relay/opencl_texture/test_reduction_texture.py
deleted file mode 100644
index 1016a7c88ec6..000000000000
--- a/tests/python/relay/opencl_texture/test_reduction_texture.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.contrib import utils
-from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mean(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 720, 1280)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.mean(A, axis=1, keepdims=True)
-    mod = relay.Function([A], mean)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_argmax(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 720, 1280)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    argmax = relay.op.argmax(A, axis=[1])
-    mod = relay.Function([A], argmax)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_reduction_max(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 720, 1280)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    argmax = relay.op.max(A, axis=[1])
-    mod = relay.Function([A], argmax)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mean_nd4(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 729, 729)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.mean(A, axis=1, keepdims=True)
-    mod = relay.Function([A], mean)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_argmax_nd4(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 729, 729)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    argmax = relay.op.argmax(A, axis=[1])
-    mod = relay.Function([A], argmax)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_reduction_max_nd4(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 729, 729)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    argmax = relay.op.max(A, axis=[1])
-    mod = relay.Function([A], argmax)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mean_b4(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 720, 320, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.mean(A, axis=1, keepdims=True)
-    mod = relay.Function([A], mean)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_argmax_b4(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 720, 320, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    argmax = relay.op.argmax(A, axis=[1])
-    mod = relay.Function([A], argmax)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_reduction_max_b4(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 3, 720, 320, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    argmax = relay.op.max(A, axis=[1])
-    mod = relay.Function([A], argmax)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mean_global_pooling(remote, target, executor_type, dtype):
-    """
-    Use case of blocked NCHW4c global pooling with big spatial valies
-    """
-    input_shape = (1, 160, 160, 32)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.mean(A, axis=[1, 2], keepdims=True)
-    mod = relay.Function([A], mean)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mean_global_pooling_block4(remote, target, executor_type, dtype):
-    """
-    Use case of blocked NCHW4c global pooling with big spatial valies
-    """
-    input_shape = (1, 160, 160, 8, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.mean(A, axis=[1, 2], keepdims=True)
-    mod = relay.Function([A], mean)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_max_global_pooling_block4(remote, target, executor_type, dtype):
-    """
-    Use case of blocked NCHW4c global pooling with big spatial valies
-    """
-    input_shape = (1, 160, 160, 8, 4)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    mean = relay.max(A, axis=[1, 2], keepdims=True)
-    mod = relay.Function([A], mean)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_sum_cast(remote, target, dtype):
-    shape = (10,)
-    A = relay.var("A", shape=shape)
-    w = relay.op.sum(A)
-    w = relay.cast(w, "int32")
-    mod = relay.Function([A], w)
-
-    shape_dict = {
-        "A": shape,
-    }
-    dtype_dict = {
-        "A": dtype,
-    }
-    build_run_compare(remote, mod, {}, shape_dict, dtype_dict, target)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_relay_ops.py b/tests/python/relay/opencl_texture/test_relay_ops.py
deleted file mode 100644
index 686a9a9b9e89..000000000000
--- a/tests/python/relay/opencl_texture/test_relay_ops.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay import testing
-from tvm.contrib import utils
-from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm
-
-
-executor_type = tvm.testing.parameter("ge", "vm")
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mod(remote, target, executor_type, dtype):
-    # NCHW
-    input_shape = (1, 25, 38, 64)
-    A = relay.var("data", shape=input_shape, dtype=dtype)
-    scale = relay.const(2.0, dtype=dtype)
-    op = relay.mod(A, scale)
-    mod = relay.Function([A], op)
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
-
-
-@tvm.testing.requires_opencl
-@tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_scatter_nd_add(remote, target, executor_type, dtype):
-    # NCHW
-
-    A = relay.var("data", shape=(6, 30, 30, 256), dtype=dtype)
-    indices = relay.const(tvm.nd.array(np.random.randint(0, 1, (2, 6, 30, 30))), dtype="int64")
-    update = relay.const(
-        tvm.nd.array(np.random.uniform(-1, 1, size=(50, 50, 256)).astype(dtype)), dtype=dtype
-    )
-    op = relay.scatter_nd(update, indices, A, mode="add")
-    mod = relay.Function([A], op)
-    shape_dict = {
-        "data": (6, 30, 30, 256),
-    }
-    dtype_dict = {
-        "data": dtype,
-    }
-
-    if executor_type == "ge":
-        build_run_compare(remote, mod, {}, shape_dict, dtype_dict, target)
-    else:
-        build_run_compare_vm(remote, mod, {}, shape_dict, dtype_dict, target)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py
deleted file mode 100644
index 21bdfbdee3cb..000000000000
--- a/tests/python/relay/opencl_texture/utils/adreno_utils.py
+++ /dev/null
@@ -1,304 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utils for adreno compute/schedules"""
-
-import os
-import tvm
-import numpy as np
-from tvm import relay
-from tvm import autotvm
-from tvm import rpc
-from tvm.contrib import utils, ndk
-from tvm.relay import testing
-from tvm.relay.transform import recast
-from tvm.contrib import graph_runtime
-from tvm.runtime.vm import VirtualMachine
-import json
-
-
-def get_cpu_reference(mod, params1, input_shape, inputs):
-    mod_fp32 = recast(mod, "float32", "float32", ops=["nn.conv2d", "add", "nn.relu"])
-    with relay.build_config(opt_level=3):
-        graph, lib, params = relay.build(mod_fp32, "llvm", params=params1)
-    ctx = tvm.cpu()
-    m = graph_runtime.create(graph, lib, ctx)
-    if isinstance(input_shape, dict):
-        for key in input_shape:
-            m.set_input(key, inputs[-1])
-    else:
-        m.set_input("data", inputs[-1])
-    m.set_input(**params)
-    m.run()
-    return [
-        m.get_output(0).asnumpy(),
-    ]
-
-
-# build module run with opencl and cpu, compare results
-def build_run_compare(
-    remote,
-    tvm_mod,
-    params1,
-    input_shape,
-    dtypes,
-    target="llvm",
-    static_mem_scopes=[],
-    gpu_preprocess=None,
-    stat_file=None,
-):
-    if remote is None:
-        target_host = "llvm"
-    else:
-        target_host = "llvm -mtriple=arm64-linux-android"
-
-    if gpu_preprocess:
-        tvm_mod_nchwc = gpu_preprocess(tvm_mod)
-    else:
-        tvm_mod_nchwc = tvm_mod
-
-    if stat_file is not None:
-        with autotvm.apply_history_best(stat_file):
-            with tvm.transform.PassContext(opt_level=3):
-                graph, lib, params = relay.build(
-                    tvm_mod_nchwc, target_host=target_host, target=target, params=params1
-                )
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build(
-                tvm_mod_nchwc, target_host=target_host, target=target, params=params1
-            )
-
-    # verification that storage_scope has expected textures scopes
-    graph_json = json.loads(graph)
-    if "storage_scope" in graph_json["attrs"]:
-        assert (
-            len(static_mem_scopes) == len(graph_json["attrs"]["storage_scope"][1])
-            or len(static_mem_scopes) == 0
-        )
-    else:
-        assert len(static_mem_scopes) == 0
-
-    for i in range(0, len(static_mem_scopes)):
-        assert static_mem_scopes[i] == graph_json["attrs"]["storage_scope"][1][i]
-
-    if remote is None:
-        ctx = tvm.opencl()
-        m = graph_runtime.create(graph, lib, ctx)
-    else:
-        temp = utils.tempdir()
-        dso_binary = "dev_lib_cl.so"
-        dso_binary_path = temp.relpath(dso_binary)
-        ctx = remote.cl(0)
-        lib.export_library(dso_binary_path, fcompile=ndk.create_shared)
-        remote.upload(dso_binary_path)
-        rlib = remote.load_module(dso_binary)
-        m = graph_runtime.create(graph, rlib, ctx)
-    m.set_input(**params)
-    inputs = []
-    for key in input_shape:
-        inputs.append(np.random.normal(size=input_shape[key]).astype(dtypes[key]))
-        m.set_input(key, inputs[-1])
-    m.run()
-
-    ref_outputs = get_cpu_reference(tvm_mod, params1, input_shape, inputs)
-    for i, ref_output in enumerate(ref_outputs):
-        tvm_output = m.get_output(i)
-        output = tvm_output.asnumpy()
-
-        np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1)
-    return graph
-
-
-def build_run_compare_vm(
-    remote,
-    tvm_mod,
-    params1,
-    input_shape,
-    dtypes,
-    target="llvm",
-    static_mem_scopes=[],
-    gpu_preprocess=None,
-    stat_file=None,
-):
-    if remote is None:
-        target_host = "llvm"
-    else:
-        target_host = "llvm -mtriple=arm64-linux-android"
-
-    if gpu_preprocess:
-        tvm_mod_nchwc = gpu_preprocess(tvm_mod)
-    else:
-        tvm_mod_nchwc = tvm_mod
-
-    if isinstance(tvm_mod_nchwc, relay.Function):
-        module = tvm.IRModule({})
-        module["main"] = tvm_mod_nchwc
-        tvm_mod_nchwc = module
-
-    if stat_file is not None:
-        with autotvm.apply_history_best(stat_file):
-            with tvm.transform.PassContext(opt_level=3):
-                vmc = relay.vm.compile(
-                    tvm_mod_nchwc, target=target, target_host=target_host, params=params1
-                )
-    else:
-        with tvm.transform.PassContext(opt_level=3):
-            vmc = relay.vm.compile(
-                tvm_mod_nchwc, target=target, target_host=target_host, params=params1
-            )
-
-    if len(static_mem_scopes) > 0:
-        mem_scopes_lines = static_mem_scopes.strip().split("\n")
-        vm_lines = vmc._get_virtual_devices().strip().split("\n")
-        for i in range(0, len(mem_scopes_lines)):
-            assert mem_scopes_lines[i].strip() == vm_lines[i].strip()
-
-    if remote is None:
-        dev = tvm.opencl()
-        vm = VirtualMachine(vmc, dev, "naive")
-    else:
-        temp = utils.tempdir()
-        dso_binary = "dev_lib_cl.so"
-        dso_binary_path = temp.relpath(dso_binary)
-        dev = remote.cl(0)
-        vmc.mod.export_library(dso_binary_path, fcompile=ndk.create_shared)
-        remote.upload(dso_binary_path)
-        rlib = remote.load_module(dso_binary)
-        vm = VirtualMachine(rlib, dev, "naive")
-    data = {}
-    inputs = []
-    for key in input_shape:
-        inputs.append(np.random.normal(size=input_shape[key]).astype(dtypes[key]))
-        data[key] = tvm.nd.array(inputs[-1], dev)
-    for k, v in params1.items():
-        data[k] = tvm.nd.array(v, dev)
-    vm.set_input("main", **data)
-    vm.invoke_stateful("main")
-
-    ref_outputs = get_cpu_reference(tvm_mod, params1, input_shape, inputs)
-    for i, ref_output in enumerate(ref_outputs):
-        tvm_output = vm.get_outputs()[i]
-        output = tvm_output.asnumpy()
-
-        np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1)
-    return vmc
-
-
-def gpu_preprocess(tvm_mod):
-    layout_config = relay.transform.LayoutConfig()
-    desired_layouts = {
-        "nn.conv2d": ["NCHW4c", "OIHW4o"],
-        "nn.conv2d_transpose": ["NCHW4c", "IOHW4o"],
-    }
-    with layout_config:
-        seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
-        with tvm.transform.PassContext(opt_level=3):
-            mod = tvm.IRModule.from_expr(tvm_mod)
-            tvm_mod_nchwc = seq(mod)
-            return tvm_mod_nchwc
-
-
-def get_model(url, local_file, module):
-    def get_tensor_type_str(tensor_type):
-        """Get tensor type string representation when given TFLite tensor type"""
-        try:
-            from tflite.TensorType import TensorType
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-
-        if tensor_type == TensorType.INT8:
-            return "int8"
-        if tensor_type == TensorType.INT16:
-            return "int16"
-        if tensor_type == TensorType.UINT8:
-            return "uint8"
-        if tensor_type == TensorType.FLOAT16:
-            return "float16"
-        if tensor_type == TensorType.FLOAT32:
-            return "float32"
-        if tensor_type == TensorType.INT32:
-            return "int32"
-        if tensor_type == TensorType.INT64:
-            return "int64"
-        if tensor_type == TensorType.BOOL:
-            return "bool"
-        raise NotImplementedError(
-            "Tensor type {} is currently not supported".format(str(tensor_type))
-        )
-
-    if url is None:
-        model_path = local_file
-    else:
-        model_path = tvm.contrib.download.download_testdata(url, local_file, module=module)
-
-    with open(model_path, "rb") as f:
-        tflite_model_buf = f.read()
-
-    try:
-        import tflite.Model
-
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-    except AttributeError:
-        import tflite
-
-        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
-    except ImportError:
-        raise ImportError("The tflite package must be installed")
-
-    # keep the same as tflite
-    assert tflite_model.SubgraphsLength() == 1, "only support one subgraph (main subgraph)"
-    subgraph = tflite_model.Subgraphs(0)
-
-    # model inputs
-    model_inputs = subgraph.InputsAsNumpy()
-    shape_dict = {}
-    dtype_dict = {}
-    for model_input in model_inputs:
-        model_input_name = subgraph.Tensors(model_input).Name().decode("utf-8")
-        model_shape_length = subgraph.Tensors(model_input).ShapeLength()
-        model_input_shape = [
-            subgraph.Tensors(model_input).Shape(i) for i in range(model_shape_length)
-        ]
-        shape_dict[model_input_name] = model_input_shape
-        dtype_dict[model_input_name] = get_tensor_type_str(subgraph.Tensors(model_input).Type())
-
-    # model Outputs
-    model_outputs = subgraph.OutputsAsNumpy()
-    shape_dict_out = {}
-    dtype_dict_out = {}
-    for model_output in model_outputs:
-        model_output_name = subgraph.Tensors(model_output).Name().decode("utf-8")
-        model_shape_length = subgraph.Tensors(model_output).ShapeLength()
-        model_output_shape = [
-            subgraph.Tensors(model_output).Shape(i) for i in range(model_shape_length)
-        ]
-        shape_dict_out[model_output_name] = model_output_shape
-        dtype_dict_out[model_output_name] = get_tensor_type_str(
-            subgraph.Tensors(model_input).Type()
-        )
-
-    mod, params = relay.frontend.from_tflite(
-        tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
-    )
-
-    layout_config = relay.transform.LayoutConfig(skip_layers=[])
-    desired_layouts = {"nn.conv2d": ["NCHW", "default"]}
-    seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-
-    return mod, params, shape_dict, dtype_dict
diff --git a/tests/python/relay/qnn/test_canonicalizations.py b/tests/python/relay/qnn/test_canonicalizations.py
deleted file mode 100644
index 0505a88c07bd..000000000000
--- a/tests/python/relay/qnn/test_canonicalizations.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from typing import Callable
-
-import numpy as np
-from tvm import relay
-from tvm.relay.qnn.op import canonicalizations
-
-
-class TestIntegerTableLookupTable:
-    """Consists of tests testing functionality of creating lookup tables for integer operations."""
-
-    def fake_identity_func_numpy(self, arr: np.ndarray):
-        return arr.astype("float32")
-
-    def fake_identity_func_relay(
-        self,
-        floating_point_func: Callable[[np.ndarray], np.ndarray],
-        input_arg=None,
-        in_scale=relay.const(1.0, dtype="float32"),
-        in_zero_point=relay.const(0, dtype="int32"),
-        out_scale=relay.const(1.0, dtype="float32"),
-        out_zero_point=relay.const(0, dtype="int32"),
-        in_axis=-1,
-        out_axis=-1,
-        in_dtype="uint8",
-        out_dtype="uint8",
-    ):
-        if input_arg is None:
-            input_arg = relay.const(np.arange(0, 256, dtype="uint8").view(in_dtype))
-
-        return (
-            canonicalizations.create_integer_lookup_op(
-                input_arg=input_arg,
-                floating_point_func=floating_point_func,
-                in_scale=in_scale,
-                in_zero_point=in_zero_point,
-                out_scale=out_scale,
-                out_zero_point=out_zero_point,
-                in_axis=in_axis,
-                out_axis=out_axis,
-                in_dtype=in_dtype,
-                out_dtype=out_dtype,
-            ),
-            input_arg.data.numpy(),
-        )
-
-    def dequantize_numpy(self, np_arr, np_scale=1.0, np_zero_point=0):
-        return (np_arr.astype("int32") - np_zero_point) * np_scale
-
-    def run_function_test(
-        self,
-        in_scale: float,
-        in_zero_point: int,
-        out_scale: float,
-        out_zero_point: int,
-        in_dtype: str,
-        out_dtype: str,
-        floating_point_func: Callable[[np.ndarray], np.ndarray],
-        input_arg: relay.Expr = None,
-        rtol=1e-7,
-        atol=0,
-    ):
-        relay_lookup, input_arg = self.fake_identity_func_relay(
-            input_arg=input_arg,
-            floating_point_func=floating_point_func,
-            in_scale=relay.const(in_scale, "float32"),
-            in_zero_point=relay.const(in_zero_point, "int32"),
-            out_scale=relay.const(out_scale, "float32"),
-            out_zero_point=relay.const(out_zero_point, "int32"),
-            in_dtype=in_dtype,
-            out_dtype=out_dtype,
-        )
-        result = canonicalizations.run_const_expr(relay_lookup)
-        np.testing.assert_allclose(
-            floating_point_func(
-                self.dequantize_numpy(input_arg, np_scale=in_scale, np_zero_point=in_zero_point)
-            ),
-            self.dequantize_numpy(result, np_scale=out_scale, np_zero_point=out_zero_point),
-            atol=atol,
-            rtol=rtol,
-        )
-
-    """Test mapping between different input/output dtypes"""
-
-    def test_int8_to_int8(self):
-        self.run_function_test(
-            in_scale=1.0,
-            in_zero_point=0,
-            out_scale=1.0,
-            out_zero_point=0,
-            in_dtype="int8",
-            out_dtype="int8",
-            floating_point_func=self.fake_identity_func_numpy,
-        )
-
-    def test_uint8_to_uint8(self):
-        self.run_function_test(
-            in_scale=1.0,
-            in_zero_point=128,
-            out_scale=1.0,
-            out_zero_point=128,
-            in_dtype="uint8",
-            out_dtype="uint8",
-            floating_point_func=self.fake_identity_func_numpy,
-        )
-
-    def test_int8_to_uint8(self):
-        self.run_function_test(
-            in_scale=1.0,
-            in_zero_point=0,
-            out_scale=1.0,
-            out_zero_point=128,
-            in_dtype="int8",
-            out_dtype="uint8",
-            floating_point_func=self.fake_identity_func_numpy,
-        )
-
-    def test_uint8_to_int8(self):
-        self.run_function_test(
-            in_scale=1.0,
-            in_zero_point=128,
-            out_scale=1.0,
-            out_zero_point=0,
-            in_dtype="uint8",
-            out_dtype="int8",
-            floating_point_func=self.fake_identity_func_numpy,
-        )
-
-    """Test different input shapes"""
-
-    def test_keep_input_shapes(self):
-        # input in floating point ~[-2, 2], final output ~[0, 8]
-        self.run_function_test(
-            input_arg=relay.const(np.arange(-128, 128).astype("int8").reshape([2, 2, 8, 8])),
-            in_scale=0.015,
-            in_zero_point=0,
-            out_scale=16 / 256,
-            out_zero_point=0,
-            in_dtype="int8",
-            out_dtype="int8",
-            floating_point_func=self.fake_identity_func_numpy,
-            atol=0.03,
-            rtol=0.01,
-        )
-        self.run_function_test(
-            input_arg=relay.const(np.arange(-128, 128).astype("int8").reshape([2, 2, 64])),
-            in_scale=0.015,
-            in_zero_point=0,
-            out_scale=16 / 256,
-            out_zero_point=0,
-            in_dtype="int8",
-            out_dtype="int8",
-            floating_point_func=self.fake_identity_func_numpy,
-            atol=0.03,
-            rtol=0.01,
-        )
-        self.run_function_test(
-            input_arg=relay.const(np.arange(-128, 128).astype("int8").reshape([2, 128])),
-            in_scale=0.015,
-            in_zero_point=0,
-            out_scale=16 / 256,
-            out_zero_point=0,
-            in_dtype="int8",
-            out_dtype="int8",
-            floating_point_func=self.fake_identity_func_numpy,
-            atol=0.03,
-            rtol=0.01,
-        )
-
-    """Test mapping with different in/out qparams works."""
-
-    def test_different_in_out_qparams(self):
-        self.run_function_test(
-            in_scale=1.0,
-            in_zero_point=128,
-            out_scale=1.0,
-            out_zero_point=128,
-            in_dtype="uint8",
-            out_dtype="uint8",
-            floating_point_func=self.fake_identity_func_numpy,
-            atol=1,  # numbers range from -128 -> 128 so not that big error
-            rtol=0,
-        )
-
-    """Test some simple functions"""
-
-    def test_tanh(self):
-        # 1 / 64 in scale -- input range is ~ (-2, 2), tanh(+-2) ~= +-1
-        # 1 / 128 out_scale -- output range is ~(-1, 1)
-        self.run_function_test(
-            input_arg=relay.const(np.arange(-128, 128).astype("int8")),
-            in_scale=1 / 64,
-            in_zero_point=0,
-            out_scale=1 / 128,
-            out_zero_point=0,
-            in_dtype="int8",
-            out_dtype="int8",
-            floating_point_func=np.tanh,
-            atol=0.01,
-            rtol=0.01,
-        )
-
-    def test_exp(self):
-        # input in floating point ~[-2, 2], final output ~[0, 8]
-        self.run_function_test(
-            input_arg=relay.const(np.arange(-128, 128).astype("int8")),
-            in_scale=0.015,
-            in_zero_point=0,
-            out_scale=16 / 256,
-            out_zero_point=0,
-            in_dtype="int8",
-            out_dtype="int8",
-            floating_point_func=np.exp,
-            atol=0.03,
-            rtol=0.01,
-        )
diff --git a/tests/python/relay/qnn/test_clip_legalization.py b/tests/python/relay/qnn/test_clip_legalization.py
deleted file mode 100644
index b7ccaccd98a7..000000000000
--- a/tests/python/relay/qnn/test_clip_legalization.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test that do-nothing requantize -> clip operators are removed during legalization."""
-
-import numpy as np
-import pytest
-
-import tvm
-from tvm import nd, relay
-from tvm.relay import transform
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def tvm_const(obj):
-    return relay.Constant(nd.array(obj))
-
-
-@pytest.mark.parametrize(
-    "dtype,min_val,max_val,is_redundant",
-    [
-        ("int8", -128, 127, True),
-        ("int8", -127, 127, False),
-        ("int16", -128, 127, False),
-        ("int32", -2147483648, 2147483647, True),
-    ],
-)
-def test_removes_redundant_requantize_clip_ops(dtype, min_val, max_val, is_redundant):
-    """Test that qnn.requantize -> clip sequences are removed during legalization if the bounds of
-    the clip operator match the min and max values of the data type."""
-
-    input_var = relay.var("input", shape=(1, 3, 3, 4), dtype="int32")
-    out = relay.qnn.requantize(
-        input_var,
-        tvm_const(np.float32(1.0)),
-        tvm_const(np.int32(0)),
-        tvm_const(np.float32(1.0)),
-        tvm_const(np.int32(-128)),
-        axis=3,
-        out_dtype=dtype,
-    )
-    out = relay.clip(out, a_min=min_val, a_max=max_val)
-    func = relay.Function([input_var], out)
-    unmodified = run_opt_pass(func, transform.InferType())
-    legalized = run_opt_pass(func, transform.Legalize())
-
-    # Check that the clip op was removed if and only if `is_redundant` is True.
-    if is_redundant:
-        assert legalized.body.op.name == "qnn.requantize"
-        assert not tvm.ir.structural_equal(unmodified, legalized)
-    else:
-        assert legalized.body.op.name == "clip"
-        tvm.ir.assert_structural_equal(unmodified, legalized)
-
-
-def test_ignores_standalone_clip_ops():
-    """The legalization pass should only affect qnn.requantize -> clip sequences, and should leave
-    standalone clip operators untouched."""
-
-    input_var = relay.var("x", shape=(1, 3, 3, 4), dtype="int8")
-    out = relay.clip(input_var, a_min=-128, a_max=127)
-    func = relay.Function([input_var], out)
-    unmodified = run_opt_pass(func, transform.InferType())
-    legalized = run_opt_pass(func, transform.Legalize())
-    tvm.ir.assert_structural_equal(unmodified, legalized)
diff --git a/tests/python/relay/qnn/test_qnn_channel_stripping.py b/tests/python/relay/qnn/test_qnn_channel_stripping.py
deleted file mode 100644
index 18b2da1a90ec..000000000000
--- a/tests/python/relay/qnn/test_qnn_channel_stripping.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test QNN channel stripping legalization pass."""
-
-import numpy as np
-import tvm
-from tvm import nd, relay
-
-from tvm.relay import transform
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-from tvm.testing.utils import generate_ref_data
-
-from tvm.topi.arm_cpu.qnn_legalize import legalize_bias_add
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def execute_relay_func(relay_func, in_data):
-    ref_module = tvm.IRModule.from_expr(relay_func)
-    return generate_ref_data(ref_module, {"input": in_data})["output"]
-
-
-def tvm_const(obj):
-    return relay.Constant(nd.array(obj))
-
-
-def make_test_conv_depthwise_conv():
-    """Generates a convolution -> depthwise_convolution -> convolution pattern that can have
-    channels stripped. The structure here mirrors MobileNetV1's layers 8-10."""
-
-    input_var = relay.var("input", shape=(1, 12, 12, 4), dtype="int8")
-
-    kernel_1 = np.array(
-        [[0, 1, 0, -2], [0, 3, 0, 5], [0, 5, 0, -9], [0, 2, 0, 21]], dtype="int8"
-    ).reshape((1, 1, 4, 4))
-    input_scale_1 = np.float32(0.5)
-    output_scale_1 = np.array([0.5, 2.0, 0.25, 4.0], dtype="float32")
-
-    out = relay.qnn.conv2d(
-        input_var,
-        tvm_const(kernel_1),
-        tvm_const(np.int32(-128)),
-        tvm_const(np.int32(0)),
-        tvm_const(input_scale_1),
-        tvm_const(output_scale_1),
-        channels=4,
-        kernel_size=(1, 1),
-        padding=(0, 0),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-
-    bias_1 = np.array([198, -2, 19, 10], dtype="int32")
-    out = relay.nn.bias_add(
-        out,
-        tvm_const(bias_1),
-        axis=3,
-    )
-
-    input_scale_2 = np.float32(0.25)
-    out = relay.qnn.requantize(
-        out,
-        tvm_const(input_scale_1 * output_scale_1),
-        tvm_const(np.int32(0)),
-        tvm_const(input_scale_2),
-        tvm_const(np.int32(-128)),
-        axis=3,
-        out_dtype="int8",
-    )
-    # Outputs here will be fixed to {0: 70, 2: -118}
-
-    kernel_2 = np.array(
-        [
-            [0, 6, 4, 2],
-            [8, 6, -3, -1],
-            [-2, -5, 3, -8],
-            [-7, 5, 1, 9],
-            [-4, -9, -8, -2],
-            [-1, 4, -5, 3],
-            [-4, -9, 2, 6],
-            [9, -6, 0, 5],
-            [-3, 8, 1, -7],
-        ],
-        dtype="int8",
-    ).reshape((3, 3, 4, 1))
-    output_scale_2 = np.array([0.25, 0.125, 2.0, 0.125], dtype="float32")
-    out = relay.qnn.conv2d(
-        out,
-        tvm_const(kernel_2),
-        tvm_const(np.int32(-128)),
-        tvm_const(np.int32(0)),
-        tvm_const(input_scale_2),
-        tvm_const(output_scale_2),
-        channels=4,
-        groups=4,
-        kernel_size=(3, 3),
-        padding=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    bias_2 = np.array([4582, 4, -12, 15], dtype="int32")
-    out = relay.nn.bias_add(
-        out,
-        tvm_const(bias_2),
-        axis=3,
-    )
-
-    input_scale_3 = np.float32(0.125)
-    out = relay.qnn.requantize(
-        out,
-        tvm_const(input_scale_2 * output_scale_2),
-        tvm_const(np.int32(0)),
-        tvm_const(input_scale_3),
-        tvm_const(np.int32(-128)),
-        axis=3,
-        out_dtype="int8",
-    )
-    # Outputs here will be fixed to {0: 127, 2: -128}
-
-    kernel_3 = np.array(
-        [[4, -2, 9, 9], [0, 0, 0, 0], [0, 0, 0, 0], [-1, 1, -1, 1]], dtype="int8"
-    ).reshape((1, 1, 4, 4))
-    output_scale_3 = np.array([0.25, 0.125, 1.0, 0.5], dtype="float32")
-
-    out = relay.qnn.conv2d(
-        out,
-        tvm_const(kernel_3),
-        tvm_const(np.int32(-128)),
-        tvm_const(np.int32(0)),
-        tvm_const(input_scale_3),
-        tvm_const(output_scale_3),
-        channels=4,
-        kernel_size=(1, 1),
-        padding=(0, 0),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-
-    bias_3 = np.array([1, -1, 4, 6], dtype="int32")
-    out = relay.nn.bias_add(
-        out,
-        tvm_const(bias_3),
-        axis=3,
-    )
-
-    return relay.Function([input_var], out)
-
-
-def make_test_conv_pool_dense():
-    """Generates a convolution -> pool -> dense pattern that can have channels stripped. The
-    structure here mirrors MobileNetV1's final few layers."""
-
-    input_var = relay.var("input", shape=(1, 3, 3, 4), dtype="int8")
-
-    kernel = np.array(
-        [[0, 1, 0, -2], [0, 3, 0, 5], [0, 5, 0, -9], [0, 2, 0, 21]], dtype="int8"
-    ).reshape((1, 1, 4, 4))
-    input_scale = np.float32(0.029626124)
-    output_scale = np.array([0.5, 2.0, 0.25, 4.0], dtype="float32")
-
-    out = relay.qnn.conv2d(
-        input_var,
-        tvm_const(kernel),
-        tvm_const(np.int32(-128)),
-        tvm_const(np.int32(0)),
-        tvm_const(input_scale),
-        tvm_const(output_scale),
-        channels=4,
-        kernel_size=(1, 1),
-        padding=(0, 0),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-
-    bias_1 = np.array([198, -2, 19, 10], dtype="int32")
-    out = relay.nn.bias_add(
-        out,
-        tvm_const(bias_1),
-        axis=3,
-    )
-
-    out = relay.qnn.requantize(
-        out,
-        tvm_const(input_scale * output_scale),
-        tvm_const(np.int32(0)),
-        tvm_const(np.float32(0.015656913)),
-        tvm_const(np.int32(-128)),
-        axis=3,
-        out_dtype="int8",
-    )
-
-    out = relay.cast(out, dtype="int32")
-    out = relay.nn.avg_pool2d(
-        out,
-        pool_size=[3, 3],
-        strides=[3, 3],
-        layout="NHWC",
-    )
-
-    out = relay.cast(out, dtype="int8")
-    # The channel stripping logic expects two reshape operators
-    out = relay.reshape(out, newshape=[-1, 4])
-    out = relay.reshape(out, newshape=[-1, 4])
-
-    dense_weights = np.array([[15, -2, -3, 11], [12, -10, 13, -10]], dtype="int8")
-    out = relay.qnn.dense(
-        out,
-        tvm_const(dense_weights),
-        tvm_const(np.int32(-128)),
-        tvm_const(np.int32(0)),
-        tvm_const(np.float32(0.015656913)),
-        tvm_const(np.float32(0.0047202893)),
-        units=2,
-        out_dtype="int32",
-    )
-
-    dense_bias = np.array([1463, -1463], dtype="int32")
-    out = relay.nn.bias_add(
-        out,
-        tvm_const(dense_bias),
-        axis=1,
-    )
-
-    return relay.Function([input_var], out)
-
-
-def test_conv_depthwise_conv():
-    """Make sure that qnn_legalize.py is able to detect and remove empty output channels from a
-    convolution -> depthwise convolution -> convolution pattern by folding into a bias_add op."""
-
-    original = make_test_conv_depthwise_conv()
-
-    with TempOpAttr("nn.bias_add", "FTVMLegalize", legalize_bias_add):
-        unoptimized = run_opt_pass(original, transform.InferType())
-        optimized = run_opt_pass(original, transform.Legalize())
-
-    # Inputs and outputs should be unmodified by channel stripping
-    assert unoptimized.checked_type == optimized.checked_type
-
-    # Make sure 2/4 channels were removed by channel stripping
-    assert tuple(unoptimized.body.args[0].args[0].checked_type.shape) == (1, 12, 12, 4)
-    assert tuple(optimized.body.args[0].args[0].checked_type.shape) == (1, 12, 12, 2)
-
-    # Make sure optimized and unoptimized versions behave identically
-    np.random.seed(12402)  # Fix seed for repeatability
-    input_data = np.random.randint(-128, 128, size=(1, 12, 12, 4), dtype="int8")
-
-    unoptimized_output = execute_relay_func(unoptimized, np.copy(input_data))
-    optimized_output = execute_relay_func(optimized, np.copy(input_data))
-    np.testing.assert_array_equal(unoptimized_output, optimized_output)
-
-
-def test_conv_pool_dense():
-    """Make sure that qnn_legalize.py is able to detect and remove empty output channels from a
-    convolution -> avg_pool2d -> dense pattern by folding them into a bias_add op."""
-
-    original = make_test_conv_pool_dense()
-
-    with TempOpAttr("nn.bias_add", "FTVMLegalize", legalize_bias_add):
-        unoptimized = run_opt_pass(original, transform.InferType())
-        optimized = run_opt_pass(original, transform.Legalize())
-
-    # Inputs and outputs should be unmodified by channel stripping
-    assert unoptimized.checked_type == optimized.checked_type
-
-    # Make sure 2/4 channels were removed by channel stripping
-    assert tuple(unoptimized.body.args[0].args[0].checked_type.shape) == (1, 4)
-    assert tuple(optimized.body.args[0].args[0].checked_type.shape) == (1, 2)
-
-    # Make sure optimized and unoptimized versions behave identically
-    np.random.seed(12402)  # Fix seed for repeatability
-    input_data = np.random.randint(-128, 128, size=(1, 3, 3, 4), dtype="int8")
-
-    unoptimized_output = execute_relay_func(unoptimized, np.copy(input_data))
-    optimized_output = execute_relay_func(optimized, np.copy(input_data))
-    np.testing.assert_array_equal(unoptimized_output, optimized_output)
diff --git a/tests/python/relay/strategy/test_select_implementation.py b/tests/python/relay/strategy/test_select_implementation.py
deleted file mode 100644
index 03e5030d09f9..000000000000
--- a/tests/python/relay/strategy/test_select_implementation.py
+++ /dev/null
@@ -1,425 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Tests strategy selection for Relay ops """
-
-import pytest
-import numpy as np
-from unittest.mock import MagicMock
-
-import tvm
-from tvm import relay
-from tvm import te
-from tvm.relay.testing import run_infer_type, run_opt_pass
-import tvm.testing
-from tvm import topi
-from tvm.target.codegen import llvm_version_major
-
-
-@pytest.mark.parametrize(
-    "target, expected_implementation",
-    [("llvm -device=arm_cpu", "concatenate.arm_cpu")],
-)
-def test_concatenate(target, expected_implementation):
-    target = tvm.target.Target(target)
-
-    shape = (1, 1, 1, 3)
-    dtype = "float32"
-    axis = 1
-    inputs = []
-    inputs.append(relay.var("var0", shape=shape, dtype=dtype))
-    inputs.append(relay.var("var1", shape=shape, dtype=dtype))
-    input_tuple = relay.Tuple(inputs)
-    out = relay.op.concatenate(input_tuple, axis)
-    out = run_infer_type(out)
-
-    impl, xx = relay.backend.te_compiler.select_implementation(
-        relay.op.get("concatenate"),
-        out.attrs,
-        [te.placeholder(shape)],
-        out.checked_type,
-        target,
-        use_autotvm=False,
-    )
-    assert impl.name == expected_implementation
-
-
-def _get_conv2d_impl(in_dtype, out_dtype, target):
-    """Returns selected conv2d implementation for a given datatype and target"""
-    data_shape = (1, 1, 1, 4)
-    weight_shape = (1, 1, 4, 4)
-    data_layout = "NHWC"
-    kernel_layout = "HWIO"
-    channels = 4
-    kernel_size = (1, 1)
-
-    out = relay.nn.conv2d(
-        relay.var("data", shape=data_shape, dtype=in_dtype),
-        relay.var("weight", shape=weight_shape, dtype=in_dtype),
-        kernel_size=kernel_size,
-        channels=channels,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-        out_dtype=out_dtype,
-    )
-
-    with target:
-        out = run_opt_pass(out, relay.transform.AlterOpLayout())
-        data_shape = out.type_args[0].shape
-        weight_shape = out.type_args[1].shape
-
-        impl, _ = relay.backend.te_compiler.select_implementation(
-            out.op,
-            out.attrs,
-            [te.placeholder(data_shape, in_dtype), te.placeholder(weight_shape, in_dtype)],
-            out.checked_type,
-            target,
-            use_autotvm=False,
-        )
-
-    return impl.name
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 15, reason=f"Requires LLVM 15+, got {llvm_version_major()}"
-)
-@pytest.mark.parametrize(
-    "target,expected_impl",
-    [
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-            "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
-            "conv2d_nhwc_spatial_pack.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu",
-            "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
-            "conv2d_NHWC_quantized_native_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v8.2a,+i8mm",
-            "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod,+i8mm",
-            "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v9a",
-            "conv2d_NHWC_quantized_native_without_transform.arm_cpu",
-        ),
-    ],
-)
-def test_int8_conv2d(target, expected_impl):
-    target = tvm.target.Target(target)
-    dtype = "int8"
-
-    selected_impl = _get_conv2d_impl(dtype, dtype, target)
-    assert selected_impl == expected_impl
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 15, reason=f"Requires LLVM 15+, got {llvm_version_major()}"
-)
-@pytest.mark.parametrize(
-    "target,expected_impl",
-    [
-        (
-            "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
-            "conv2d_nhwc_spatial_pack.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v8.2a,+neon",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v9a",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme",
-            "conv2d_NHWC_hybrid_SME.arm_cpu",
-        ),
-    ],
-)
-def test_fp32_conv2d(target, expected_impl):
-    target = tvm.target.Target(target)
-    dtype = "float32"
-
-    selected_impl = _get_conv2d_impl(dtype, dtype, target)
-    assert selected_impl == expected_impl
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 15, reason=f"Requires LLVM 15+, got {llvm_version_major()}"
-)
-@pytest.mark.parametrize(
-    "target,expected_impl",
-    [
-        (
-            "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
-            "conv2d_nhwc_spatial_pack.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+neon",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v9a",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-    ],
-)
-def test_fp16_conv2d(target, expected_impl):
-    target = tvm.target.Target(target)
-    dtype = "float16"
-
-    selected_impl = _get_conv2d_impl(dtype, dtype, target)
-    assert selected_impl == expected_impl
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 15, reason=f"Requires LLVM 15+, got {llvm_version_major()}"
-)
-@pytest.mark.parametrize(
-    "target,expected_impl",
-    [
-        (
-            "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
-            "conv2d_nhwc_spatial_pack.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+neon",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v9a",
-            "conv2d_NHWC_hybrid_without_transform.arm_cpu",
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme",
-            "conv2d_NHWC_hybrid_SME_transposed_B.arm_cpu",
-        ),
-    ],
-)
-def test_fp16_to_fp32_conv2d(target, expected_impl):
-    target = tvm.target.Target(target)
-    in_dtype = "float16"
-    out_dtype = "float32"
-
-    selected_impl = _get_conv2d_impl(in_dtype, out_dtype, target)
-    assert selected_impl == expected_impl
-
-
-@pytest.mark.parametrize(
-    "target,expected_impl",
-    [
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-            "depthwise_conv2d_nhwc.arm_cpu",
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
-            "depthwise_conv2d_nhwc.generic",
-        ),
-        ("c -device=arm_cpu -mcpu=cortex-m55", "depthwise_conv2d_nhwc_dsp.arm_cpu"),
-    ],
-)
-def test_int8_depthwise_conv2d(target, expected_impl):
-    target = tvm.target.Target(target)
-
-    dtype = "int8"
-    out_dtype = "int32"
-    data_shape = (2, 2, 4, 8)
-    weight_shape = (2, 2, 8, 1)
-    data_layout = "NHWC"
-    kernel_layout = "HWOI"
-    groups = 8
-    kernel_size = (2, 2)
-
-    out = relay.nn.conv2d(
-        relay.var("data", shape=data_shape, dtype=dtype),
-        relay.var("weight", shape=weight_shape, dtype=dtype),
-        kernel_size=kernel_size,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-        groups=groups,
-        out_dtype=out_dtype,
-    )
-    out = run_infer_type(out)
-
-    with target:
-        impl, _ = relay.backend.te_compiler.select_implementation(
-            out.op,
-            out.attrs,
-            [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
-            out.checked_type,
-            target,
-        )
-
-    assert impl.name == expected_impl
-
-
-@pytest.mark.parametrize(
-    "target,expected_valid_impl,expected_impl",
-    [
-        (
-            "llvm -mtriple=aarch64-linux-gnu -device=arm_cpu -mattr=+neon",
-            ["dense_gemm.arm_cpu", "dense_pack.x86", "dense_nopack.x86"],
-            "dense_gemm.arm_cpu",
-        ),
-    ],
-)
-def test_dense(target, expected_valid_impl, expected_impl):
-    target = tvm.target.Target(target)
-    data_shape = (30, 40)
-    weight_shape = (30, 40)
-    dtype = "float32"
-
-    out = relay.nn.dense(
-        relay.var("data", shape=data_shape, dtype=dtype),
-        relay.const(np.zeros((weight_shape)).astype(dtype)),
-        out_dtype=dtype,
-    )
-    out = run_infer_type(out)
-
-    with target:
-        args = [
-            out.op,
-            out.attrs,
-            [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
-            out.checked_type,
-            target,
-        ]
-        valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
-        selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
-    assert len(valid_impl) == len(expected_valid_impl)
-    for impl in valid_impl:
-        assert impl.name in expected_valid_impl
-    assert selected_impl.name == expected_impl
-
-
-@pytest.mark.skipif(llvm_version_major() < 15, reason="Older versions of LLVM don't support SME.")
-@pytest.mark.parametrize(
-    "shape,expected_valid_impl,expected_impl",
-    [
-        (
-            (30, 40),
-            ["matmul.arm_cpu.sme", "dense_gemm.arm_cpu", "dense_pack.x86", "dense_nopack.x86"],
-            "matmul.arm_cpu.sme",
-        ),
-        (
-            (5, 1),
-            ["dense_gemm.arm_cpu", "dense_pack.x86", "dense_nopack.x86"],
-            "dense_gemm.arm_cpu",
-        ),
-    ],
-)
-def test_dense_with_sme_target(shape, expected_valid_impl, expected_impl):
-    target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme")
-    data_shape = shape
-    weight_shape = shape
-    dtype = "float32"
-
-    out = relay.nn.dense(
-        relay.var("data", shape=data_shape, dtype=dtype),
-        relay.const(np.zeros((weight_shape)).astype(dtype)),
-        out_dtype=dtype,
-    )
-    out = run_infer_type(out)
-
-    with target:
-        args = [
-            out.op,
-            out.attrs,
-            [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
-            out.checked_type,
-            target,
-        ]
-        valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
-        selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
-    assert len(valid_impl) == len(expected_valid_impl)
-    for impl in valid_impl:
-        assert impl.name in expected_valid_impl
-    assert selected_impl.name == expected_impl
-
-
-@pytest.mark.parametrize(
-    "target,schedule_func",
-    [
-        ("llvm -device=arm_cpu", topi.x86),
-        ("c -device=arm_cpu -mcpu=cortex-m55", topi.arm_cpu),
-    ],
-)
-def test_pool2d(target, schedule_func, monkeypatch):
-    target = tvm.target.Target(target)
-
-    data_shape = (1, 2, 2, 4)
-    dtype = "float32"
-
-    out = relay.nn.avg_pool2d(relay.var("data", shape=data_shape, dtype=dtype))
-    placeholders = [te.placeholder(data_shape, dtype)]
-
-    mock_schedule = MagicMock()
-    monkeypatch.setattr(schedule_func, "schedule_pool", mock_schedule)
-
-    # Since pool does not use OpStrategy to determine the relevant schedule,
-    # we cannot simply check the schedule name that was selected with
-    # `select_implementation`. With this implementation of schedule selection,
-    # "pool.arm_cpu" will always be the schedule name, regardless of what schedule
-    # was selected. Instead, this test checks that the relevant schedule function
-    # is called when selecting the pooling from schedule from arm_cpu.
-    relay.op.strategy.arm_cpu.schedule_pool_arm_cpu(out.attrs, placeholders, target)
-    mock_schedule.assert_called()
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py
deleted file mode 100644
index 655ab11ee0a0..000000000000
--- a/tests/python/relay/test_adt.py
+++ /dev/null
@@ -1,824 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay import testing
-from tvm.relay.backend.interpreter import ConstructorValue
-from tvm.relay import create_executor
-from tvm.relay.prelude import Prelude, StaticTensorArrayOps
-from tvm.relay.testing import count as count_, make_nat_value, make_nat_expr
-
-import numpy as np
-
-prelude = p = Prelude(tvm.IRModule({}))
-p.mod.import_from_std("nat.rly")
-
-
-def count(e):
-    return count_(p, e)
-
-
-dev = tvm.device("llvm", 0)
-
-
-def eval(expr):
-    # CAUTION: These tests re-process the entire prelude for each test expression.
-    # Hoisting the create_executor won't improve that since preprocessing won't begin
-    # until the evaluate.
-    return create_executor(mod=prelude.mod, device=dev, target="llvm").evaluate(expr)
-
-
-nat, z, s = prelude.mod.get_type("nat")
-
-double = p.mod.get_global_var("nat_double")
-add = p.mod.get_global_var("nat_add")
-
-optional, some, none = prelude.mod.get_type("Option")
-rlist, cons, nil = prelude.mod.get_type("List")
-
-hd = p.hd
-tl = p.tl
-nth = p.nth
-update = p.update
-length = p.length
-map = p.map
-foldl = p.foldl
-foldr = p.foldr
-foldr1 = p.foldr1
-sum = p.sum
-
-concat = p.concat
-filter = p.filter
-zip = p.zip
-rev = p.rev
-unfoldl = p.unfoldl
-unfoldr = p.unfoldr
-map_accumr = p.map_accumr
-map_accuml = p.map_accuml
-
-tree, rose = prelude.mod.get_type("Tree")
-
-tmap = p.tmap
-size = p.size
-
-compose = p.compose
-iterate = p.iterate
-
-
-def to_list(l):
-    assert isinstance(l, ConstructorValue)
-    val = l
-    ret = []
-    while True:
-        if val.tag == cons.tag:
-            ret.append(val.fields[0])
-            val = val.fields[1]
-        else:
-            assert val.tag == nil.tag
-            break
-    return ret
-
-
-def tree_to_dict(t):
-    assert isinstance(t, ConstructorValue)
-    ret = {}
-    assert t.tag == rose.tag
-    ret["member"] = t.fields[0]
-    ret["children"] = []
-    for subtree in to_list(t.fields[1]):
-        l = tree_to_dict(subtree)
-        ret["children"].append(l)
-    return ret
-
-
-def vmobj_to_list(o, dtype="float32"):
-    if isinstance(o, tvm.nd.NDArray):
-        return [o.numpy().tolist()]
-    elif isinstance(o, tvm.runtime.container.ADT):
-        if len(o) == 0:
-            tensor_nil = p.get_var("tensor_nil", dtype=dtype)
-            if tensor_nil.tag == o.tag:
-                return [0]
-            return []
-
-        result = []
-        for f in o:
-            result.extend(vmobj_to_list(f, dtype))
-        return result
-    elif isinstance(o, tvm.relay.backend.interpreter.ConstructorValue):
-        if o.constructor.name_hint == "Cons":
-            tl = vmobj_to_list(o.fields[1], dtype)
-            hd = vmobj_to_list(o.fields[0], dtype)
-            hd.extend(tl)
-            return hd
-        elif o.constructor.name_hint == "Nil":
-            return []
-        elif "tensor_nil" in o.constructor.name_hint:
-            return [0]
-        elif "tensor" in o.constructor.name_hint:
-            return [o.fields[0].numpy()]
-        else:
-            raise RuntimeError("Unknown object type: %s" % o.constructor.name_hint)
-    else:
-        raise RuntimeError("Unknown object type: %s" % type(o))
-
-
-# turns a scalar-valued relay tensor value into a python number
-def get_scalar(tv):
-    return tv.numpy().item()
-
-
-# @tvm.testing.uses_gpu
-def test_nat_value():
-    assert count(make_nat_value(p, 10)) == 10
-    assert count(eval(s(s(z())))) == 2
-
-
-@tvm.testing.uses_gpu
-def test_nat_constructor():
-    func = relay.Function([], z())
-    test_z = relay.GlobalVar("test_z")
-    test_sz = relay.GlobalVar("test_sz")
-    prelude.mod[test_z] = func
-    func = relay.Function([], s(z()))
-    prelude.mod[test_sz] = func
-    ck_mod = relay.transform.InferType()(prelude.mod)
-    assert ck_mod[test_z].body.checked_type == nat()
-    assert ck_mod[test_sz].body.checked_type == nat()
-
-
-@tvm.testing.uses_gpu
-def test_double():
-    assert prelude.mod[double].checked_type == relay.FuncType([nat()], nat())
-    res = eval(double(s(z())))
-    assert count(res) == 2
-
-
-@tvm.testing.uses_gpu
-def test_add():
-    assert prelude.mod[add].checked_type == relay.FuncType([nat(), nat()], nat())
-    res = eval(add(s(z()), s(z())))
-    assert count(res) == 2
-
-
-@tvm.testing.uses_gpu
-def test_list_constructor():
-    test_consz = relay.GlobalVar("test_consz")
-    func = relay.Function([], cons(z(), nil()))
-    prelude.mod[test_consz] = func
-    ck_mod = relay.transform.InferType()(prelude.mod)
-    assert ck_mod[test_consz].body.checked_type == rlist(nat())
-
-
-@tvm.testing.uses_gpu
-def test_hd_tl():
-    expected = list(range(10))
-    l = nil()
-    for i in reversed(expected):
-        l = cons(make_nat_expr(prelude, i), l)
-
-    got = []
-    for i in range(len(expected)):
-        got.append(count(eval(hd(l))))
-        l = tl(l)
-
-    assert got == expected
-
-
-@tvm.testing.uses_gpu
-def test_nth():
-    expected = list(range(10))
-    l = nil()
-    for i in reversed(expected):
-        l = cons(relay.const(i), l)
-
-    for i in range(len(expected)):
-        nth = prelude.mod.get_global_var("nth")
-        item = eval(nth(l, relay.const(i)))
-        assert get_scalar(item) == i
-
-
-@tvm.testing.uses_gpu
-def test_update():
-    expected = list(range(10))
-    l = nil()
-    # create zero initialized list
-    for i in range(len(expected)):
-        l = cons(make_nat_expr(prelude, 0), l)
-
-    # set value
-    for i, v in enumerate(expected):
-        l = update(l, relay.const(i), make_nat_expr(prelude, v))
-
-    got = []
-    for i in range(len(expected)):
-        got.append(count(eval(nth(l, relay.const(i)))))
-
-    assert got == expected
-
-
-@tvm.testing.uses_gpu
-def test_length():
-    a = relay.TypeVar("a")
-    assert prelude.mod[length].checked_type == relay.FuncType(
-        [rlist(a)], relay.scalar_type("int32"), [a]
-    )
-    res = eval(length(cons(z(), cons(z(), cons(z(), nil())))))
-    assert get_scalar(res) == 3
-
-
-@tvm.testing.uses_gpu
-def test_map():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    lhs = prelude.mod[map].checked_type
-    rhs = relay.FuncType([relay.FuncType([a], b), rlist(a)], rlist(b), [a, b])
-    assert lhs == rhs
-
-    x = relay.Var("x")
-    add_one = relay.Function([x], s(x))
-    res = eval(map(add_one, cons(z(), cons(z(), nil()))))
-    ones = to_list(res)
-    assert len(ones) == 2
-    assert count(ones[0]) == 1 and count(ones[1]) == 1
-
-
-@tvm.testing.uses_gpu
-def test_foldl():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-
-    lhs = prelude.mod[foldl].checked_type
-    rhs = relay.FuncType([relay.FuncType([a, b], a), a, rlist(b)], a, [a, b])
-    assert lhs == rhs
-
-    x = relay.Var("x")
-    y = relay.Var("y")
-    rev_dup = relay.Function([y, x], cons(x, cons(x, y)))
-    res = eval(
-        foldl(
-            rev_dup,
-            nil(),
-            cons(
-                make_nat_expr(prelude, 1),
-                cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-            ),
-        )
-    )
-    reversed = to_list(res)
-    assert len(reversed) == 6
-    assert count(reversed[0]) == 3 and count(reversed[1]) == 3
-    assert count(reversed[2]) == 2 and count(reversed[3]) == 2
-    assert count(reversed[4]) == 1 and count(reversed[5]) == 1
-
-
-@tvm.testing.uses_gpu
-def test_foldr():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    lhs = prelude.mod[foldr].checked_type
-    rhs = relay.FuncType([relay.FuncType([a, b], b), b, rlist(a)], b, [a, b])
-    assert lhs == rhs
-
-    x = relay.Var("x")
-    y = relay.Var("y")
-    identity = relay.Function([x, y], cons(x, y))
-    res = eval(
-        foldr(
-            identity,
-            nil(),
-            cons(
-                make_nat_expr(prelude, 1),
-                cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-            ),
-        )
-    )
-    same = to_list(res)
-    assert len(same) == 3
-    assert count(same[0]) == 1 and count(same[1]) == 2 and count(same[2]) == 3
-
-
-@tvm.testing.uses_gpu
-def test_foldr1():
-    a = relay.TypeVar("a")
-    lhs = prelude.mod[foldr1].checked_type
-    rhs = relay.FuncType([relay.FuncType([a, a], a), rlist(a)], a, [a])
-    assert lhs == rhs
-
-    x = relay.Var("x")
-    y = relay.Var("y")
-    f = relay.Function([x, y], add(x, y))
-    res = eval(
-        foldr1(
-            f,
-            cons(
-                make_nat_expr(prelude, 1),
-                cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-            ),
-        )
-    )
-
-    assert count(res) == 6
-
-
-@tvm.testing.uses_gpu
-def test_sum():
-    assert prelude.mod[sum].checked_type == relay.FuncType(
-        [rlist(relay.scalar_type("int32"))], relay.scalar_type("int32")
-    )
-    res = eval(sum(cons(relay.const(1), cons(relay.const(2), nil()))))
-    assert get_scalar(res) == 3
-
-
-@tvm.testing.uses_gpu
-def test_concat():
-    a = relay.TypeVar("a")
-    assert prelude.mod[concat].checked_type == relay.FuncType([rlist(a), rlist(a)], rlist(a), [a])
-
-    l1 = cons(make_nat_expr(prelude, 1), cons(make_nat_expr(prelude, 2), nil()))
-    l2 = cons(make_nat_expr(prelude, 3), cons(make_nat_expr(prelude, 4), nil()))
-    res = eval(concat(l1, l2))
-
-    catted = to_list(res)
-    assert len(catted) == 4
-    assert count(catted[0]) == 1
-    assert count(catted[1]) == 2
-    assert count(catted[2]) == 3
-    assert count(catted[3]) == 4
-
-
-@tvm.testing.uses_gpu
-def test_filter():
-    a = relay.TypeVar("a")
-    expected_type = relay.FuncType(
-        [relay.FuncType([a], relay.scalar_type("bool")), rlist(a)], rlist(a), [a]
-    )
-    assert prelude.mod[filter].checked_type == expected_type
-
-    x = relay.Var("x", nat())
-    greater_than_one = relay.Function(
-        [x],
-        relay.Match(
-            x,
-            [
-                relay.Clause(
-                    relay.PatternConstructor(
-                        s, [relay.PatternConstructor(s, [relay.PatternWildcard()])]
-                    ),
-                    relay.const(True),
-                ),
-                relay.Clause(relay.PatternWildcard(), relay.const(False)),
-            ],
-        ),
-    )
-    res = eval(
-        filter(
-            greater_than_one,
-            cons(
-                make_nat_expr(prelude, 1),
-                cons(
-                    make_nat_expr(prelude, 1),
-                    cons(
-                        make_nat_expr(prelude, 3),
-                        cons(
-                            make_nat_expr(prelude, 1),
-                            cons(make_nat_expr(prelude, 5), cons(make_nat_expr(prelude, 1), nil())),
-                        ),
-                    ),
-                ),
-            ),
-        )
-    )
-    filtered = to_list(res)
-    assert len(filtered) == 2
-    assert count(filtered[0]) == 3
-    assert count(filtered[1]) == 5
-
-
-@tvm.testing.uses_gpu
-def test_zip():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    expected_type = relay.FuncType([rlist(a), rlist(b)], rlist(relay.TupleType([a, b])), [a, b])
-    assert prelude.mod[zip].checked_type == expected_type
-
-    l1 = cons(
-        make_nat_expr(prelude, 1),
-        cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-    )
-    l2 = cons(nil(), cons(cons(nil(), nil()), cons(cons(nil(), cons(nil(), nil())), nil())))
-
-    res = eval(zip(l1, l2))
-    zipped = to_list(res)
-    assert len(zipped) == 3
-    assert count(zipped[0][0]) == 1
-    assert len(to_list(zipped[0][1])) == 0
-    assert count(zipped[1][0]) == 2
-    assert len(to_list(zipped[1][1])) == 1
-    assert count(zipped[2][0]) == 3
-    assert len(to_list(zipped[2][1])) == 2
-
-    # test truncation
-    l3 = cons(make_nat_expr(prelude, 4), cons(make_nat_expr(prelude, 5), nil()))
-    shorter_res = eval(zip(l3, l2))
-    truncated = to_list(shorter_res)
-    assert len(truncated) == 2
-    assert count(truncated[0][0]) == 4
-    assert len(to_list(truncated[0][1])) == 0
-    assert count(truncated[1][0]) == 5
-    assert len(to_list(truncated[1][1])) == 1
-
-    l4 = cons(nil(), nil())
-    shortest_res = eval(zip(l3, l4))
-    singleton = to_list(shortest_res)
-    assert len(singleton) == 1
-    assert count(singleton[0][0]) == 4
-    assert len(to_list(singleton[0][1])) == 0
-
-
-@tvm.testing.uses_gpu
-def test_rev():
-    a = relay.TypeVar("a")
-    assert prelude.mod[rev].checked_type == relay.FuncType([rlist(a)], rlist(a), [a])
-
-    res = eval(
-        rev(
-            cons(
-                make_nat_expr(prelude, 1),
-                cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-            )
-        )
-    )
-    reversed = to_list(res)
-
-    assert len(reversed) == 3
-    assert count(reversed[0]) == 3
-    assert count(reversed[1]) == 2
-    assert count(reversed[2]) == 1
-
-
-@tvm.testing.uses_gpu
-def test_unfoldr():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    expected_type = relay.FuncType(
-        [relay.FuncType([a], optional(relay.TupleType([a, b]))), a], rlist(b), [a, b]
-    )
-
-    x = relay.Var("x", nat())
-    n = relay.Var("n", nat())
-    count_down = relay.Function(
-        [x],
-        relay.Match(
-            x,
-            [
-                relay.Clause(
-                    relay.PatternConstructor(s, [relay.PatternVar(n)]), some(relay.Tuple([n, x]))
-                ),
-                relay.Clause(relay.PatternConstructor(z, []), none()),
-            ],
-        ),
-    )
-
-    res = eval(unfoldr(count_down, make_nat_expr(prelude, 3)))
-    unfolded = to_list(res)
-
-    assert len(unfolded) == 3
-    assert count(unfolded[0]) == 3
-    assert count(unfolded[1]) == 2
-    assert count(unfolded[2]) == 1
-
-
-@tvm.testing.uses_gpu
-def test_unfoldl():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    expected_type = relay.FuncType(
-        [relay.FuncType([a], optional(relay.TupleType([a, b]))), a], rlist(b), [a, b]
-    )
-
-    x = relay.Var("x", nat())
-    n = relay.Var("n", nat())
-    count_down = relay.Function(
-        [x],
-        relay.Match(
-            x,
-            [
-                relay.Clause(
-                    relay.PatternConstructor(s, [relay.PatternVar(n)]), some(relay.Tuple([n, x]))
-                ),
-                relay.Clause(relay.PatternConstructor(z, []), none()),
-            ],
-        ),
-    )
-
-    res = eval(unfoldl(count_down, make_nat_expr(prelude, 3)))
-    unfolded = to_list(res)
-
-    assert len(unfolded) == 3
-    assert count(unfolded[0]) == 1
-    assert count(unfolded[1]) == 2
-    assert count(unfolded[2]) == 3
-
-
-@tvm.testing.uses_gpu
-def test_map_accumr():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    c = relay.TypeVar("c")
-    expected_type = relay.FuncType(
-        [relay.FuncType([a, b], relay.TupleType([a, c])), a, rlist(b)],
-        relay.TupleType([a, rlist(c)]),
-        [a, b, c],
-    )
-    assert prelude.mod[map_accumr].checked_type == expected_type
-
-    acc = relay.Var("acc", nat())
-    x = relay.Var("x", nat())
-    add_acc_to_each = relay.Function([acc, x], relay.Tuple([add(x, acc), add(x, acc)]))
-
-    vals = cons(
-        make_nat_expr(prelude, 1),
-        cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-    )
-    res = eval(map_accumr(add_acc_to_each, z(), vals))
-
-    sum = count(res[0])
-    new_vals = to_list(res[1])
-
-    assert sum == 6
-    assert len(new_vals) == 3
-    assert count(new_vals[0]) == 6
-    assert count(new_vals[1]) == 5
-    assert count(new_vals[2]) == 3
-
-
-@tvm.testing.uses_gpu
-def test_map_accuml():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    c = relay.TypeVar("c")
-    expected_type = relay.FuncType(
-        [relay.FuncType([a, b], relay.TupleType([a, c])), a, rlist(b)],
-        relay.TupleType([a, rlist(c)]),
-        [a, b, c],
-    )
-    assert prelude.mod[map_accuml].checked_type == expected_type
-
-    acc = relay.Var("acc", nat())
-    x = relay.Var("x", nat())
-    add_to_acc = relay.Function([acc, x], relay.Tuple([add(x, acc), x]))
-
-    vals = cons(
-        make_nat_expr(prelude, 1),
-        cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-    )
-    res = eval(map_accuml(add_to_acc, z(), vals))
-
-    sum = count(res[0])
-    new_vals = to_list(res[1])
-
-    assert sum == 6
-    assert len(new_vals) == 3
-    assert count(new_vals[0]) == 3
-    assert count(new_vals[1]) == 2
-    assert count(new_vals[2]) == 1
-
-
-@tvm.testing.uses_gpu
-def test_optional_matching():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    v = relay.Var("v")
-    condense = relay.Function(
-        [x, y],
-        relay.Match(
-            x,
-            [
-                relay.Clause(relay.PatternConstructor(some, [relay.PatternVar(v)]), cons(v, y)),
-                relay.Clause(relay.PatternConstructor(none), y),
-            ],
-        ),
-    )
-
-    res = eval(
-        foldr(
-            condense,
-            nil(),
-            cons(
-                some(make_nat_expr(prelude, 3)),
-                cons(none(), cons(some(make_nat_expr(prelude, 1)), nil())),
-            ),
-        )
-    )
-
-    reduced = to_list(res)
-    assert len(reduced) == 2
-    assert count(reduced[0]) == 3
-    assert count(reduced[1]) == 1
-
-
-@tvm.testing.uses_gpu
-def test_tmap():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    lhs = prelude.mod[tmap].checked_type
-    rhs = relay.FuncType([relay.FuncType([a], b), tree(a)], tree(b), [a, b])
-    assert lhs == rhs
-
-    x = relay.Var("x")
-    add_one = relay.Function([x], s(x))
-    res = eval(tmap(add_one, rose(z(), cons(rose(z(), nil()), cons(rose(z(), nil()), nil())))))
-
-    tree_dict = tree_to_dict(res)
-    assert count(tree_dict["member"]) == 1
-    assert len(tree_dict["children"]) == 2
-    for subtree in tree_dict["children"]:
-        assert count(subtree["member"]) == 1
-        assert len(subtree["children"]) == 0
-
-
-@tvm.testing.uses_gpu
-def test_size():
-    a = relay.TypeVar("a")
-    lhs = prelude.mod[size].checked_type
-    rhs = relay.FuncType([tree(a)], relay.scalar_type("int32"), [a])
-    assert lhs == rhs
-
-    root = rose(z(), cons(rose(z(), nil()), cons(rose(z(), nil()), nil())))
-    t = rose(z(), cons(root, cons(root, cons(root, nil()))))
-    res = eval(size(t))
-    assert get_scalar(res) == 10
-
-
-@tvm.testing.uses_gpu
-def test_wildcard_match_solo():
-    x = relay.Var("x", nat())
-    copy = relay.Function([x], relay.Match(x, [relay.Clause(relay.PatternWildcard(), x)]), nat())
-
-    res = eval(copy(s(s(s(z())))))
-    assert count(res) == 3
-
-
-@tvm.testing.uses_gpu
-def test_wildcard_match_order():
-    x = relay.Var("x", rlist(nat()))
-    y = relay.Var("y")
-    a = relay.Var("a")
-    return_zero = relay.Function(
-        [x],
-        relay.Match(
-            x,
-            [
-                relay.Clause(relay.PatternWildcard(), z()),
-                relay.Clause(
-                    relay.PatternConstructor(cons, [relay.PatternVar(y), relay.PatternVar(a)]), y
-                ),
-                relay.Clause(relay.PatternConstructor(nil), s(z())),
-            ],
-        ),
-        nat(),
-    )
-
-    res = eval(return_zero(cons(s(z()), nil())))
-    # wildcard pattern is evaluated first
-    assert count(res) == 0
-
-
-@tvm.testing.uses_gpu
-def test_nested_matches():
-    a = relay.TypeVar("a")
-    # TODO(@jroesch): inference should be able to handle this one
-    x = relay.Var("x", type_annotation=rlist(rlist(a)))
-    y = relay.Var("y")
-    w = relay.Var("w")
-    h = relay.Var("h")
-    t = relay.Var("t")
-    flatten = relay.GlobalVar("flatten")
-
-    # flatten could be written using a fold, but this way has nested matches
-    inner_match = relay.Match(
-        y,
-        [
-            relay.Clause(relay.PatternConstructor(nil), flatten(w)),
-            relay.Clause(
-                relay.PatternConstructor(cons, [relay.PatternVar(h), relay.PatternVar(t)]),
-                cons(h, flatten(cons(t, w))),
-            ),
-        ],
-    )
-
-    prelude.mod[flatten] = relay.Function(
-        [x],
-        relay.Match(
-            x,
-            [
-                relay.Clause(relay.PatternConstructor(nil), nil()),
-                relay.Clause(
-                    relay.PatternConstructor(cons, [relay.PatternVar(y), relay.PatternVar(w)]),
-                    inner_match,
-                ),
-            ],
-        ),
-        rlist(a),
-        [a],
-    )
-
-    first_list = cons(
-        make_nat_expr(prelude, 1),
-        cons(make_nat_expr(prelude, 2), cons(make_nat_expr(prelude, 3), nil())),
-    )
-    second_list = cons(
-        make_nat_expr(prelude, 4),
-        cons(make_nat_expr(prelude, 5), cons(make_nat_expr(prelude, 6), nil())),
-    )
-    final_list = cons(first_list, cons(second_list, nil()))
-
-    res = eval(flatten(final_list))
-
-    flat = to_list(res)
-    assert len(flat) == 6
-    for i in range(6):
-        assert count(flat[i]) == i + 1
-
-
-@tvm.testing.uses_gpu
-def test_match_full_var():
-    x = relay.Var("x")
-    v = relay.Var("v")
-    id_func = relay.Function([x], relay.Match(x, [relay.Clause(relay.PatternVar(v), v)]))
-
-    res1 = eval(id_func(nil()))
-    res2 = eval(id_func(cons(z(), cons(z(), nil()))))
-
-    empty = to_list(res1)
-    assert len(empty) == 0
-
-    zeroes = to_list(res2)
-    assert len(zeroes) == 2
-    assert count(zeroes[0]) == 0
-    assert count(zeroes[1]) == 0
-
-
-@tvm.testing.uses_gpu
-def test_nested_pattern_match():
-    x = relay.Var("x", rlist(nat()))
-    h1 = relay.Var("h1")
-    h2 = relay.Var("h2")
-    t = relay.Var("t")
-    match = relay.Match(
-        x,
-        [
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternVar(h1),
-                        relay.PatternConstructor(cons, [relay.PatternVar(h2), relay.PatternVar(t)]),
-                    ],
-                ),
-                h2,
-            ),
-            relay.Clause(relay.PatternWildcard(), z()),
-        ],
-    )
-    get_second = relay.Function([x], match)
-
-    res = eval(get_second(cons(s(z()), cons(s(s(z())), nil()))))
-
-    assert count(res) == 2
-
-
-@tvm.testing.uses_gpu
-def test_compose():
-    n = relay.Var("n")
-    inc = relay.Function([n], s(n))
-    x = relay.Var("x")
-    res = eval(relay.Call(compose(inc, double), [s(s(z()))]))
-    assert count(res) == 5
-
-
-@tvm.testing.uses_gpu
-def test_iterate():
-    expr = relay.Call(iterate(double, relay.const(2)), [make_nat_expr(prelude, 3)])
-    res = eval(relay.Function([], expr)())
-    assert count(res) == 12
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_analysis_basic_block_normal_form.py b/tests/python/relay/test_analysis_basic_block_normal_form.py
deleted file mode 100644
index 558f55ef40d5..000000000000
--- a/tests/python/relay/test_analysis_basic_block_normal_form.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay.analysis import check_basic_block_normal_form
-
-
-def test_one_block():
-    x = relay.var("x")
-    y = relay.add(x, x)
-    z = relay.add(x, y)
-    check_basic_block_normal_form(z)
-
-
-def test_let():
-    x = relay.var("x")
-    y = relay.var("y")
-    body = relay.Let(y, x, y)
-    check_basic_block_normal_form(body)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_invalid_if():
-    cond = relay.var("cond", dtype="bool", shape=())
-    shared = relay.var("shared")
-    true_branch = shared
-    false_branch = relay.add(shared, shared)
-    body = relay.If(cond, true_branch, false_branch)
-    """
-    The program below violates basic block normal form, as the scope of %shared
-    is ambiguous and should not be in that of true branch.
-
-    free_var %cond: bool
-    if (%cond) {
-      free_var %shared
-      %shared
-    } else {
-      add(%shared, %shared)
-    }
-    """
-    check_basic_block_normal_form(body)
-
-
-def test_valid_if():
-    cond = relay.var("cond", dtype="bool", shape=())
-    shared = relay.var("shared")
-    true_branch = shared
-    false_branch = relay.add(shared, shared)
-    body = relay.If(cond, true_branch, false_branch)
-    shared_bound = relay.var("shared_bound", shape=(1,), dtype="float32")
-    body = relay.Let(shared, shared_bound, body)
-    """
-    The program below uses let binding to control the scope of %shared, which
-    follows the basic block normal form.
-
-    free_var %shared_bound: Tensor[(1), float32]
-    let %shared = %shared_bound;
-    free_var %cond: bool
-    if (%cond) {
-      %shared
-    } else {
-      add(%shared, %shared)
-    }
-    """
-    check_basic_block_normal_form(body)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_invalid_if2():
-    """
-    fn (%x: float32) {
-      %0 = equal(%x, 2f);
-      if (%0) {
-        %1 = add(%x, 1f);
-        multiply(%1, 2f)
-      } else {
-        multiply(%1, 1f)
-      }
-    }
-    """
-    x = relay.var("x", shape=(), dtype="float32")
-    one = relay.const(1, dtype="float32")
-    two = relay.const(2, dtype="float32")
-    v1 = relay.add(x, one)
-    v2 = relay.equal(x, two)
-    true_branch = relay.multiply(v1, two)
-    false_branch = relay.multiply(v1, one)
-    body = relay.If(v2, true_branch, false_branch)
-    func = relay.Function([x], body)
-    check_basic_block_normal_form(func)
-
-
-def test_valid_if2():
-    """
-    fn (%x: float32) {
-      let %v1 = add(%x, 1f);
-      %0 = equal(%x, 2f);
-      if (%0) {
-        multiply(%v1, 2f)
-      } else {
-        multiply(%v1, 1f)
-      }
-    }
-    """
-    x = relay.var("x", shape=(), dtype="float32")
-    one = relay.const(1, dtype="float32")
-    two = relay.const(2, dtype="float32")
-    v1 = relay.var("v1")
-    v2 = relay.equal(x, two)
-    true_branch = relay.multiply(v1, two)
-    false_branch = relay.multiply(v1, one)
-    body = relay.If(v2, true_branch, false_branch)
-    body = relay.Let(v1, relay.add(x, one), body)
-    func = relay.Function([x], body)
-    check_basic_block_normal_form(func)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_func():
-    x = relay.var("x", shape=(1,), dtype="float32")  # , a)
-    y = relay.var("y", shape=(1,), dtype="float32")  # , a)
-    z = relay.var("z", shape=(1,), dtype="float32")  # , a)
-    x2 = relay.add(x, x)
-    func_a = relay.Function([y], relay.add(x2, y))  # , a, [a])
-    func_b = relay.Function([z], relay.add(x2, z))  # , a, [a])
-    body = relay.Tuple([func_a, func_b])
-    body = relay.Function([x], body)
-    """
-    fn (%x: Tensor[(1), float32]) {
-      %1 = fn (%y: Tensor[(1), float32]) {
-        %0 = add(%x, %x);
-        add(%0, %y)
-      };
-      %2 = fn (%z: Tensor[(1), float32]) {
-        add(%0, %z)
-      };
-      (%1, %2)
-    }
-    """
-    check_basic_block_normal_form(body)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_higher_order_return():
-    x = relay.var("x", shape=(1,), dtype="float32")  # , a)
-    y = relay.var("y", shape=(1,), dtype="float32")  # , a)
-    z = relay.var("z", shape=(1,), dtype="float32")  # , a)
-    x2 = relay.add(x, x)
-    func_a = relay.Function([y], relay.add(x2, y))  # , a, [a])
-    func_b = relay.Function([z], relay.add(x2, z))  # , a, [a])
-    body = relay.Tuple([func_a, func_b])
-    body = relay.Function([x], body)
-    """
-    fn (%x: Tensor[(1), float32]) {
-      %1 = fn (%y: Tensor[(1), float32]) {
-        %0 = add(%x, %x);
-        add(%0, %y)
-      };
-      %2 = fn (%z: Tensor[(1), float32]) {
-        add(%0, %z)
-      };
-      (%1, %2)
-    }
-    """
-    check_basic_block_normal_form(body)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_higher_order_nested():
-    x = relay.var("x", dtype="float32", shape=(1,))
-    s = relay.var("s", dtype="float32", shape=(1,))
-    shared = relay.add(s, s)
-    func_true = relay.Function([x], relay.add(x, shared))
-    choice_t = relay.FuncType([], relay.scalar_type("bool"))
-    f = relay.Var("f", choice_t)
-    z = relay.Var("z")
-    body = relay.If(f(), func_true, relay.Function([z], relay.add(z, shared)))
-    top = relay.Function([f, s], body)
-    """
-    fn (%f: fn () -> bool, %s: Tensor[(1), float32]) {
-      %0 = %f();
-      if (%0) {
-        fn (%x: Tensor[(1), float32]) {
-          %1 = add(%s, %s);
-          add(%x, %1)
-        }
-      } else {
-        fn (%z) {
-          add(%z, %1)
-        }
-      }
-    }
-    """
-    check_basic_block_normal_form(top)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_analysis_extract_fake_quantized_ops.py b/tests/python/relay/test_analysis_extract_fake_quantized_ops.py
deleted file mode 100644
index 54594a2ddc01..000000000000
--- a/tests/python/relay/test_analysis_extract_fake_quantized_ops.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test function extraction"""
-import tvm
-from tvm import relay
-
-
-def test_fake_quantize_conv():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    zero = relay.const(0)
-
-    op = relay.op.nn.conv2d(
-        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-        relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-        kernel_size=[5, 5],
-    )
-    op = relay.qnn.op.quantize(op, relay.const(1.0), zero, out_dtype="int8")
-
-    mod = tvm.IRModule.from_expr(op)
-    fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)
-
-    assert dict(fake_quantized_op_freqs) == {"nn.conv2d": 1}
-
-
-def test_fake_quantize_dense():
-    x = relay.var("x", shape=[128, 64], dtype="int8")
-    w = relay.var("w", shape=[256, 64], dtype="int8")
-    zero = relay.const(0)
-
-    op = relay.op.nn.dense(
-        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-        relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-    )
-    op = relay.qnn.op.quantize(op, relay.const(1.0), zero, out_dtype="int8")
-
-    mod = tvm.IRModule.from_expr(op)
-    fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)
-
-    assert dict(fake_quantized_op_freqs) == {"nn.dense": 1}
-
-
-def test_fake_quantize_multiple_regions():
-    x = relay.var("x", shape=[128, 64], dtype="int8")
-    w = relay.var("w", shape=[256, 64], dtype="int8")
-    zero = relay.const(0)
-
-    op = relay.op.nn.dense(
-        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-        relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-    )
-    op = relay.qnn.op.quantize(op, relay.const(1.0), zero, out_dtype="int8")
-
-    op = relay.qnn.op.dequantize(op, relay.const(2.0), relay.const(114))
-    op = relay.op.nn.relu(op)
-    op = relay.qnn.op.quantize(op, relay.const(1.0), zero, out_dtype="int8")
-
-    w2 = relay.var("w2", shape=[64, 256], dtype="int8")
-    op = relay.op.nn.dense(
-        relay.qnn.op.dequantize(op, relay.const(1.0), zero),
-        relay.qnn.op.dequantize(w2, relay.const(0.5), zero),
-    )
-    op = relay.qnn.op.quantize(op, relay.const(1.0), zero, out_dtype="int8")
-
-    # We expect to ignore this sigmoid op since it's just outside a fake
-    # quantized region
-    op = relay.op.sigmoid(op)
-
-    mod = tvm.IRModule.from_expr(op)
-    fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)
-
-    assert dict(fake_quantized_op_freqs) == {"nn.dense": 2, "nn.relu": 1}
-
-
-def test_fake_quantize_maxpool():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.nn.max_pool2d(x, [3, 3])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    mod = tvm.IRModule.from_expr(op)
-    fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)
-
-    assert dict(fake_quantized_op_freqs) == {"nn.max_pool2d": 1}
-
-
-def test_fake_quantize_transpose_reshape():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.transpose(x, [1, 0, 2, 3])
-    op = relay.op.reshape(op, [3, -1])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    mod = tvm.IRModule.from_expr(op)
-    fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)
-
-    assert dict(fake_quantized_op_freqs) == {"transpose": 1, "reshape": 1}
-
-
-def test_fake_quantize_concat():
-    zero = relay.const(0)
-    inputs = []
-    for i in range(4):
-        inputs.append(
-            relay.qnn.op.dequantize(
-                relay.var("x%d" % i, shape=[1, 4], dtype="int8"), relay.const(i + 0.5), zero
-            )
-        )
-    concat = relay.op.concatenate(inputs, axis=1)
-    op = relay.qnn.op.quantize(concat, relay.const(3.5), zero)
-
-    mod = tvm.IRModule.from_expr(op)
-    fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)
-
-    assert dict(fake_quantized_op_freqs) == {"concatenate": 1}
diff --git a/tests/python/relay/test_analysis_extract_fused_functions.py b/tests/python/relay/test_analysis_extract_fused_functions.py
deleted file mode 100644
index 9317a4de7887..000000000000
--- a/tests/python/relay/test_analysis_extract_fused_functions.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test function extraction"""
-import tvm
-from tvm import relay
-from tvm.relay.testing.synthetic import get_workload
-
-
-def get_conv_net():
-    """This gets the net for a case described in fuse_ops.cc:
-
-            conv2d
-            /  |  \
-           /   |   \
-         op    op   op
-          \    |    /
-           \   |   /
-          elemwise add
-               |
-    """
-    dshape = (1, 1, 5, 1)
-    x = relay.var("x", shape=dshape)
-    y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-
-    x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-    x2 = relay.nn.conv2d(y, relay.var("w3"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-    x3 = relay.nn.conv2d(y, relay.var("w4"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-
-    z = relay.add(x1, x2)
-    z = relay.add(x3, z)
-
-    return tvm.IRModule.from_expr(z)
-
-
-def get_conv2d():
-    x = relay.var("x", shape=(1, 56, 56, 64))
-    weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-    y = relay.nn.conv2d(
-        x,
-        weight1,
-        channels=32,
-        kernel_size=(3, 3),
-        padding=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-    return tvm.IRModule.from_expr(y)
-
-
-def test_extract_identity():
-    mod = get_conv2d()
-    items = relay.analysis.extract_fused_functions(mod)
-    assert len(items) == 1
-
-    mod["main"] = mod["main"].with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    tvm.ir.structural_equal(list(items.values())[0], mod["main"])
-
-
-def test_extract_conv_net():
-    mod = get_conv_net()
-    items = relay.analysis.extract_fused_functions(mod)
-    functions = list(items.values())
-    assert len(functions) == 2
-    x = functions[0]
-    y = functions[1]
-
-    def is_conv(func):
-        conv2d = relay.op.op.get("nn.conv2d")
-        call_node = func.body
-        return call_node.op == conv2d
-
-    def is_conv_add(func):
-        add = relay.op.op.get("add")
-        call_node = func.body
-        maybe_conv_module = tvm.IRModule.from_expr(call_node.args[0])
-        return call_node.op == add and is_conv(maybe_conv_module["main"])
-
-    # Function traversal order isn't obvious, so checking both orders is more consistent
-    assert (is_conv(x) and is_conv_add(y)) or (is_conv_add(x) and is_conv(y))
-
-
-def test_extract_resnet():
-    mod, _params = get_workload()
-    items = relay.analysis.extract_fused_functions(mod)
-    assert len(items) == 7
-
-
-if __name__ == "__main__":
-    test_extract_identity()
-    test_extract_conv_net()
-    test_extract_resnet()
diff --git a/tests/python/relay/test_analysis_extract_intermediate_expr.py b/tests/python/relay/test_analysis_extract_intermediate_expr.py
deleted file mode 100644
index f0267ebc7951..000000000000
--- a/tests/python/relay/test_analysis_extract_intermediate_expr.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test function extraction"""
-import pytest
-import tvm
-from tvm import relay
-
-
-def get_conv_net():
-    """This gets the net for:
-          conv2d
-          /  |
-         /   |
-    conv2d   |
-        \    |
-         \   |
-        elemwise add
-             |
-             |
-             |
-           split
-             |
-             |
-             |
-        elemwise add
-    """
-    dshape = (1, 1, 5, 1)
-    x = relay.var("x", shape=dshape)
-    y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-    x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-
-    z = relay.add(y, x1)
-
-    tuple_out = relay.op.split(z, indices_or_sections=1, axis=0)
-
-    tuple_0_add = relay.add(tuple_out[0], relay.const(1, dtype="float32"))
-
-    return tvm.IRModule.from_expr(tuple_0_add)
-
-
-def get_conv2d():
-    x = relay.var("x", shape=(1, 56, 56, 64))
-    weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-    y = relay.nn.conv2d(
-        x,
-        weight1,
-        channels=32,
-        kernel_size=(3, 3),
-        padding=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-    return tvm.IRModule.from_expr(y)
-
-
-def test_extract():
-    dshape = (1, 1, 5, 1)
-
-    def before():
-        return get_conv_net()
-
-    def expected_0():
-        x = relay.var("x", shape=dshape)
-        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        return tvm.IRModule.from_expr(y)
-
-    def expected_1():
-        x = relay.var("x", shape=dshape)
-        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        return tvm.IRModule.from_expr(x1)
-
-    def expected_2():
-        x = relay.var("x", shape=dshape)
-        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        z = relay.add(y, x1)
-        return tvm.IRModule.from_expr(z)
-
-    def expected_3():
-        x = relay.var("x", shape=dshape)
-        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        z = relay.add(y, x1)
-        tuple_out = relay.op.split(z, indices_or_sections=1, axis=0)
-        return tvm.IRModule.from_expr(tuple_out.astuple())
-
-    def expected_4():
-        # check tuple node
-        x = relay.var("x", shape=dshape)
-        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-        z = relay.add(y, x1)
-        tuple_out = relay.op.split(z, indices_or_sections=1, axis=0)
-        return tvm.IRModule.from_expr(tuple_out[0])
-
-    tvm.ir.assert_structural_equal(
-        relay.analysis.extract_intermdeiate_expr(before(), 0), expected_0()
-    )
-    tvm.ir.assert_structural_equal(
-        relay.analysis.extract_intermdeiate_expr(before(), 1), expected_1()
-    )
-    tvm.ir.assert_structural_equal(
-        relay.analysis.extract_intermdeiate_expr(before(), 2), expected_2()
-    )
-    tvm.ir.assert_structural_equal(
-        (relay.analysis.extract_intermdeiate_expr(before(), 3)), expected_3()
-    )
-    tvm.ir.assert_structural_equal(
-        relay.analysis.extract_intermdeiate_expr(before(), 4), expected_4()
-    )
-    tvm.ir.assert_structural_equal(relay.analysis.extract_intermdeiate_expr(before(), 5), before())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_analysis_extract_operators.py b/tests/python/relay/test_analysis_extract_operators.py
deleted file mode 100644
index 5218fbf7003e..000000000000
--- a/tests/python/relay/test_analysis_extract_operators.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test function extraction"""
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay.testing.resnet import get_workload
-from tvm.relay.testing import run_opt_pass
-
-
-def get_conv_net():
-    """This gets the net for:
-          conv2d
-          /  |
-         /   |
-    conv2d   |
-        \    |
-         \   |
-        elemwise add
-             |
-    """
-    dshape = (1, 1, 5, 1)
-    x = relay.var("x", shape=dshape)
-    y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-    x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
-
-    z = relay.add(y, x1)
-
-    return tvm.IRModule.from_expr(z)
-
-
-def get_conv2d():
-    x = relay.var("x", shape=(1, 56, 56, 64))
-    weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-    y = relay.nn.conv2d(
-        x,
-        weight1,
-        channels=32,
-        kernel_size=(3, 3),
-        padding=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-    return tvm.IRModule.from_expr(y)
-
-
-def test_extract_identity():
-    mod = get_conv2d()
-    op_freqs = relay.analysis.list_op_freqs(mod)
-    assert len(op_freqs) == 1
-    assert op_freqs["nn.conv2d"] == 1
-
-
-def test_extract_conv_net():
-    mod = get_conv_net()
-    op_freqs = relay.analysis.list_op_freqs(mod)
-    assert len(op_freqs) == 2
-    assert op_freqs["add"] == 1
-    assert op_freqs["nn.conv2d"] == 2
-
-
-def test_extract_fused():
-    mod = get_conv_net()
-    mod = relay.transform.InferType()(mod)
-    mod = relay.transform.FuseOps(3)(mod)
-
-    op_freqs = relay.analysis.list_op_freqs(mod)
-    assert len(op_freqs) == 2
-    assert op_freqs["add"] == 1
-    assert op_freqs["nn.conv2d"] == 2
-
-
-def test_extract_resnet():
-    mod, _params = get_workload()
-    expected_op_freqs = {
-        "nn.batch_norm": 19,
-        "nn.conv2d": 21,
-        "nn.relu": 18,
-        "nn.max_pool2d": 1,
-        "add": 8,
-        "nn.global_avg_pool2d": 1,
-        "nn.batch_flatten": 1,
-        "nn.dense": 1,
-        "nn.bias_add": 1,
-        "nn.softmax": 1,
-    }
-    op_freqs = relay.analysis.list_op_freqs(mod)
-    assert len(op_freqs) == len(expected_op_freqs)
-    assert all([op_freqs[op] == expected_op_freqs[op] for op in expected_op_freqs])
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_analysis_feature.py b/tests/python/relay/test_analysis_feature.py
deleted file mode 100644
index 6ac7085fa0a9..000000000000
--- a/tests/python/relay/test_analysis_feature.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.analysis import detect_feature, Feature
-from tvm.relay.transform import gradient
-from tvm.relay.prelude import Prelude
-from tvm.relay.testing import run_infer_type
-
-
-def test_prelude():
-    p = Prelude()
-    feats = detect_feature(p.mod)
-    assert feats == set(
-        [
-            Feature.fVar,
-            Feature.fGlobalVar,
-            Feature.fConstant,
-            Feature.fTuple,
-            Feature.fTupleGetItem,
-            Feature.fFunction,
-            Feature.fOp,
-            Feature.fCall,
-            Feature.fLet,
-            Feature.fIf,
-            Feature.fConstructor,
-            Feature.fMatch,
-        ]
-    )
-
-
-def test_ad():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], x + x)
-    func = run_infer_type(func)
-    mod = tvm.IRModule.from_expr(gradient(func))
-    mod = relay.transform.InferType()(mod)
-    back_func = mod["main"]
-    feats = detect_feature(back_func)
-    assert feats == set(
-        [
-            Feature.fVar,
-            Feature.fTuple,
-            Feature.fTupleGetItem,
-            Feature.fFunction,
-            Feature.fOp,
-            Feature.fCall,
-            Feature.fLet,
-            Feature.fRefCreate,
-            Feature.fRefRead,
-            Feature.fRefWrite,
-        ]
-    )
-
-
-if __name__ == "__main__":
-    test_prelude()
-    test_ad()
diff --git a/tests/python/relay/test_analysis_get_calibration_data.py b/tests/python/relay/test_analysis_get_calibration_data.py
deleted file mode 100644
index 8ac36f7bebaf..000000000000
--- a/tests/python/relay/test_analysis_get_calibration_data.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-
-import tvm
-import tvm.relay.testing
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.analysis import get_calibration_data
-
-
-def check_data_size(mod, data):
-    assert len(data) == len(mod.functions) - 1
-    for key, value in mod.functions.items():
-        if key.name_hint != "main":
-            assert len(data[key]["inputs"]) == len(value.params)
-            if isinstance(value.body, relay.Tuple):
-                assert len(data[key]["outputs"]) == len(value.body.fields)
-            else:
-                assert len(data[key]["outputs"]) == 1
-
-
-def test_simple_graph():
-    # A module with two subgraphs
-    mod = tvm.IRModule()
-
-    x0 = relay.var("x0", shape=(8, 8))
-    y0 = relay.var("y0", shape=(8, 8))
-    z0 = x0 + y0
-    z1 = x0 - y0
-    z2 = relay.Tuple((z0, z1))
-    f0 = relay.Function([x0, y0], z2)
-    f0 = f0.with_attr("Compiler", "test_graph")
-    g0 = relay.GlobalVar("g0")
-    mod[g0] = f0
-    mod = relay.transform.InferType()(mod)
-
-    x1 = relay.var("x1", shape=(8, 8))
-    y1 = relay.var("y1", shape=(8, 8))
-    z1 = x1 - y1
-    f1 = relay.Function([x1, y1], z1)
-    f1 = f1.with_attr("Compiler", "test_graph")
-    g1 = relay.GlobalVar("g1")
-    mod[g1] = f1
-    mod = relay.transform.InferType()(mod)
-
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-    z = relay.var("z", shape=(8, 8))
-    c0 = relay.Call(g0, [x, y])
-    c1 = relay.Call(g1, [relay.TupleGetItem(c0, 0), z])
-    fm = relay.Function([x, y, z], c1)
-    mod["main"] = fm
-    mod = relay.transform.InferType()(mod)
-
-    x_data = np.random.rand(8, 8).astype("float32")
-    y_data = np.random.rand(8, 8).astype("float32")
-    z_data = np.random.rand(8, 8).astype("float32")
-    data = get_calibration_data(mod, {"x": x_data, "y": y_data, "z": z_data})
-
-    # Check the number and orders
-    check_data_size(mod, data)
-    tvm.testing.assert_allclose(data[g0]["inputs"][0].numpy(), x_data)
-    tvm.testing.assert_allclose(data[g0]["inputs"][1].numpy(), y_data)
-    tvm.testing.assert_allclose(data[g0]["outputs"][0].numpy(), x_data + y_data)
-    tvm.testing.assert_allclose(data[g0]["outputs"][1].numpy(), x_data - y_data)
-    tvm.testing.assert_allclose(data[g1]["inputs"][0].numpy(), x_data + y_data)
-    tvm.testing.assert_allclose(data[g1]["inputs"][1].numpy(), z_data)
-    tvm.testing.assert_allclose(data[g1]["outputs"][0].numpy(), x_data + y_data - z_data)
-
-
-def test_mobilenet_dnnl():
-    if not tvm.get_global_func("relay.ext.dnnl", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (1, 3, 224, 224)
-    mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-
-    mod = transform.AnnotateTarget(["dnnl"])(mod)
-    mod = transform.MergeCompilerRegions()(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    data = get_calibration_data(mod, {"data": i_data, **params})
-
-    # Check the number and orders
-    check_data_size(mod, data)
-
-
-if __name__ == "__main__":
-    test_simple_graph()
-    test_mobilenet_dnnl()
diff --git a/tests/python/relay/test_annotated_regions.py b/tests/python/relay/test_annotated_regions.py
deleted file mode 100644
index 17c4ff2ba572..000000000000
--- a/tests/python/relay/test_annotated_regions.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-import tvm
-from tvm import relay
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-
-
-def check_region(region_set, target, args, nodes, rets):
-    region = region_set.get_region(args[0])
-    assert region
-    assert target == region.target
-    assert set(args) == set(region.args)
-    assert set(nodes) == set(region.nodes)
-    assert set(rets) == set(region.rets)
-
-
-def test_region_set_creator_diamond():
-    data = relay.var("data", shape=(10, 10))
-    cb_1 = compiler_begin(data, "test_target")
-    O_1 = relay.abs(cb_1)
-    ce_1 = compiler_end(O_1, "test_target")
-    ce_2 = compiler_end(O_1, "test_target")
-    cb_2 = compiler_begin(ce_1, "test_target")
-    O_2 = relay.nn.relu(cb_2)
-    ce_3 = compiler_end(O_2, "test_target")
-    cb_d = compiler_begin(ce_2, "default")
-    X = relay.tanh(cb_d)
-    ce_d = compiler_end(X, "default")
-    cb_3 = compiler_begin(ce_3, "test_target")
-    cb_4 = compiler_begin(ce_d, "test_target")
-    O_3 = relay.add(cb_3, cb_4)
-    ce_4 = compiler_end(O_3, "test_target")
-    diamond = relay.Function([data], ce_4)
-
-    region_set = relay.analysis.AnnotatedRegionSet(
-        diamond, relay.op.get("annotation.compiler_begin"), relay.op.get("annotation.compiler_end")
-    )
-    assert len(region_set) == 4
-    check_region(
-        region_set,
-        "test_target",
-        [cb_1],
-        [cb_1, O_1, ce_1, ce_2],
-        [ce_1, ce_2],
-    )
-    check_region(
-        region_set,
-        "test_target",
-        [cb_2],
-        [cb_2, O_2, ce_3],
-        [ce_3],
-    )
-    check_region(
-        region_set,
-        "default",
-        [cb_d],
-        [cb_d, X, ce_d],
-        [ce_d],
-    )
-    check_region(
-        region_set,
-        "test_target",
-        [cb_3, cb_4],
-        [cb_3, cb_4, O_3, ce_4],
-        [ce_4],
-    )
-
-
-def test_region_set_creator_merged():
-    data = relay.var("data", shape=(10, 10))
-    cb_1 = compiler_begin(data, "test_target")
-    O_1 = relay.abs(cb_1)
-    ce_2 = compiler_end(O_1, "test_target")
-    O_2 = relay.nn.relu(O_1)
-    ce_3 = compiler_end(O_2, "test_target")
-    cb_d = compiler_begin(ce_2, "default")
-    X = relay.tanh(cb_d)
-    ce_d = compiler_end(X, "default")
-    cb_3 = compiler_begin(ce_3, "test_target")
-    cb_4 = compiler_begin(ce_d, "test_target")
-    O_3 = relay.add(cb_3, cb_4)
-    O_4 = relay.add(cb_3, cb_4)
-    O_5 = relay.Tuple([O_3, O_4])
-    ce_4 = compiler_end(O_5, "test_target")
-    merged = relay.Function([data], ce_4)
-
-    region_set = relay.analysis.AnnotatedRegionSet(
-        merged, relay.op.get("annotation.compiler_begin"), relay.op.get("annotation.compiler_end")
-    )
-    assert len(region_set) == 3
-    check_region(
-        region_set,
-        "test_target",
-        [cb_1],
-        [cb_1, O_1, O_2, ce_2, ce_3],
-        [ce_2, ce_3],
-    )
-    check_region(
-        region_set,
-        "default",
-        [cb_d],
-        [cb_d, X, ce_d],
-        [ce_d],
-    )
-    check_region(
-        region_set,
-        "test_target",
-        [cb_3, cb_4],
-        [cb_3, cb_4, O_3, O_4, O_5, ce_4],
-        [ce_4],
-    )
-
-
-if __name__ == "__main__":
-    test_region_set_creator_diamond()
-    test_region_set_creator_merged()
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
deleted file mode 100644
index 336c08ab7ca2..000000000000
--- a/tests/python/relay/test_any.py
+++ /dev/null
@@ -1,2242 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import platform
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import relay, te
-from tvm.relay.loops import while_loop
-from tvm.relay.testing import run_infer_type as infer_type
-from tvm.topi.testing import searchsorted_ref
-
-from utils import ref_funcs
-from utils.assert_diagnostic import DiagnosticTesting
-
-
-def int32(val):
-    return relay.const(val, "int32")
-
-
-def any_dims(ndim):
-    shape = []
-    for _ in range(ndim):
-        shape.append(relay.Any())
-    return tuple(shape)
-
-
-def check_result(
-    args,
-    mod,
-    expected,
-    flatten=False,
-    assert_shape=False,
-    only_vm=False,
-    targets=None,
-    disable_targets=None,
-):
-    if not isinstance(expected, list):
-        expected = [expected]
-    for kind in ["debug", "vm"]:
-        targets = targets or tvm.testing.enabled_targets()
-        for tgt, dev in targets:
-            if disable_targets and tgt in disable_targets:
-                continue
-            if kind == "debug" and (only_vm or dev.device_type != tvm.cpu().device_type):
-                continue
-            result = relay.create_executor(kind, mod=mod, device=dev, target=tgt).evaluate()(*args)
-            if isinstance(result, tvm.runtime.container.ADT):
-                result = [r.numpy() for r in result]
-            else:
-                result = [result.numpy()]
-
-            for r, e in zip(result, expected):
-                if assert_shape:
-                    assert r.shape == e, "Shape mismatch: expect %s but got %s." % (
-                        str(e),
-                        str(r),
-                    )
-                else:
-                    if flatten:
-                        r = r.flatten()
-                        e = e.flatten()
-                    tvm.testing.assert_allclose(r, e, atol=2e-6)
-
-
-def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op):
-    dtype = "float32"
-    x = relay.var("x", shape=x_shape, dtype=dtype)
-    y = relay.var("y", shape=y_shape, dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], op(x, y))
-    x_np = np.random.uniform(size=x_np_shape).astype(dtype)
-    y_np = np.random.uniform(size=y_np_shape).astype(dtype)
-    res_np = np_op(x_np, y_np)
-    check_result([x_np, y_np], mod, res_np)
-
-
-@tvm.testing.uses_gpu
-def test_any_broadcast():
-    # Test broadcast with 1s
-    verify_any_broadcast((relay.Any(),), (3, 2), (1,), (3, 2), relay.add, np.add)
-    verify_any_broadcast((relay.Any(), 2), (1, 2), (1, 2), (1, 2), relay.add, np.add)
-    verify_any_broadcast((relay.Any(), 2), (1, 2), (3, 2), (1, 2), relay.add, np.add)
-    verify_any_broadcast((relay.Any(), 2), (3, 2), (1, 2), (3, 2), relay.add, np.add)
-    verify_any_broadcast((relay.Any(), 2), (3, relay.Any()), (1, 2), (3, 1), relay.add, np.add)
-
-    # Test broadcast with values other than 1
-    verify_any_broadcast((relay.Any(),), (3, 2), (2,), (3, 2), relay.add, np.add)
-    verify_any_broadcast((relay.Any(), 2), (3, 2), (3, 2), (3, 2), relay.add, np.add)
-
-
-def verify_any_elemwise(x_shape, x_np_shape, op, np_op):
-    dtype = "float32"
-    x = relay.var("x", shape=x_shape, dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], op(x))
-    x_np = np.random.uniform(size=x_np_shape).astype(dtype)
-    res_np = np_op(x_np)
-    check_result([x_np], mod, res_np)
-
-
-@tvm.testing.uses_gpu
-def test_any_elemwise():
-    verify_any_elemwise((relay.Any(),), (3,), relay.sqrt, np.sqrt)
-    verify_any_elemwise((relay.Any(), 2), (5, 2), relay.negative, np.negative)
-    verify_any_elemwise((relay.Any(), relay.Any()), (5, 4), relay.exp, np.exp)
-    verify_any_elemwise((relay.Any(),), (3,), relay.round, np.round)
-
-
-@tvm.testing.uses_gpu
-def test_any_broadcast_fail():
-    # Test broadcast with incompatible values at runtime
-    def check_fail(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op):
-        try:
-            verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op)
-        except tvm._ffi.base.TVMError:
-            pass
-        else:
-            assert False
-
-    check_fail((relay.Any(),), (3, 2), (1,), (4, 2), relay.add, np.add)
-    check_fail((relay.Any(), 2), (3, 2), (4, 2), (4, 2), relay.add, np.add)
-    check_fail((relay.Any(), 2), (3, relay.Any()), (1, 2), (4, 1), relay.add, np.add)
-    check_fail((relay.Any(), 2), (3, 3), (1, 3), (3, 3), relay.add, np.add)
-    check_fail((relay.Any(),), (3, 2), (2), (4, 2), relay.add, np.add)
-
-
-def verify_any_full_like(x_shape, x_np_shape, relay_op, np_op, dtype="float32"):
-    x = relay.var("x", shape=x_shape, dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], relay_op(x))
-    x_np = np.random.uniform(size=x_np_shape).astype(dtype)
-    res_np = np_op(x_np)
-    check_result([x_np], mod, res_np)
-
-
-@tvm.testing.uses_gpu
-def test_any_full_like():
-    # zeros_like, ones_like
-    verify_any_full_like(any_dims(3), (2, 3, 5), relay.zeros_like, np.zeros_like, "float32")
-    verify_any_full_like(any_dims(3), (225, 115, 15), relay.zeros_like, np.zeros_like, "float32")
-    verify_any_full_like(
-        any_dims(5), (10, 11, 12, 13, 14), relay.zeros_like, np.zeros_like, "int32"
-    )
-    verify_any_full_like(any_dims(3), (2, 3, 5), relay.ones_like, np.ones_like, "float32")
-    verify_any_full_like(any_dims(3), (225, 115, 15), relay.ones_like, np.ones_like, "float32")
-    verify_any_full_like(any_dims(5), (10, 11, 12, 13, 14), relay.ones_like, np.ones_like, "int32")
-
-
-def verify_any_full(x_np_shape, relay_op, np_op, dtype="float32", value=None):
-    x = relay.var("x", shape=(len(x_np_shape),), dtype="int32")
-    mod = tvm.IRModule()
-    out = relay_op(x, dtype) if value is None else relay_op(relay.expr.const(value), x, dtype)
-    mod["main"] = relay.Function([x], out)
-    res_np = np_op(x_np_shape) if value is None else np_op(x_np_shape, value)
-    x_np = np.array(x_np_shape).astype("int32")
-    check_result([x_np], mod, res_np)
-
-
-@tvm.testing.uses_gpu
-def test_any_full():
-    # zeros, ones, full
-    verify_any_full((2, 3, 5), relay.zeros, np.zeros, "float32")
-    verify_any_full((225, 115, 15), relay.zeros, np.zeros, "float32")
-    verify_any_full((10, 11, 12, 13, 14), relay.zeros, np.zeros, "int32")
-    verify_any_full((2, 3, 5), relay.ones, np.ones, "float32")
-    verify_any_full((225, 115, 15), relay.ones, np.ones, "float32")
-    verify_any_full((10, 11, 12, 13, 14), relay.ones, np.ones, "int32")
-    verify_any_full((10, 11, 12, 13, 14), relay.full, np.full, "float32", 2.0)
-    verify_any_full((1, 2, 3, 4), relay.full, np.full, "int32", -2)
-
-
-@tvm.testing.uses_gpu
-def test_any_concat():
-    x = relay.var("x", shape=(relay.Any(), 2), dtype="float32")
-    y = relay.var("y", shape=(1, 2), dtype="float32")
-    xx = x - relay.expr.const(3.0)
-    yy = y * relay.expr.const(5.0)
-    z = relay.op.concatenate([xx, yy], axis=0)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], z)
-    x_np = np.random.uniform(size=(3, 2)).astype("float32")
-    y_np = np.random.uniform(size=(1, 2)).astype("float32")
-    ref = np.concatenate([x_np - 3.0, y_np * 5.0], axis=0)
-    check_result([x_np, y_np], mod, ref)
-
-    num_inputs = 25
-    x = [relay.var("x", shape=(relay.Any(),), dtype="float32") for _ in range(num_inputs)]
-    z = relay.op.concatenate(x, axis=0)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function(x, z)
-    x_np = [np.random.uniform(size=(1,)).astype("float32") for _ in range(num_inputs)]
-    ref = np.concatenate(x_np, axis=0)
-    check_result(x_np, mod, ref)
-
-    def test_oshape(in_vars, axis, oshape):
-        z = relay.op.concatenate(in_vars, axis=axis)
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function(in_vars, z)
-        typed_mod = relay.transform.InferType()(mod)
-        assert typed_mod["main"].body.checked_type == relay.TensorType(oshape, dtype="float32")
-
-    x = [relay.var("x", shape=(relay.Any(), 3), dtype="float32") for _ in range(3)]
-    x.append(relay.var("x", shape=(relay.Any(), relay.Any()), dtype="float32"))
-
-    test_oshape(x, 0, (relay.Any(), 3))
-    test_oshape(x, 1, (relay.Any(), relay.Any()))
-
-    # [(1, 3), (1, ?)] -> (2, ?)
-    x = [
-        relay.var("x", shape=(1, 3), dtype="float32"),
-        relay.var("x", shape=(1, relay.Any()), dtype="float32"),
-    ]
-    test_oshape(x, 0, (2, relay.Any()))
-    test_oshape(x, 1, (1, relay.Any()))
-
-
-def verify_any_reshape(x_shape, newshape, x_np_shape, out_shape, variable_newshape=False):
-    x = relay.var("x", shape=x_shape, dtype="float32")
-    relu_x = relay.nn.relu(x)
-    data = np.random.uniform(size=x_np_shape).astype("float32")
-    expected = data.reshape(out_shape)
-    params = [x]
-    args = [data]
-
-    if variable_newshape:
-        newshape_var = relay.var("newshape", shape=(len(newshape),), dtype="int64")
-        params.append(newshape_var)
-        args.append(np.array(newshape, dtype="int64"))
-        newshape = newshape_var
-
-    y = relay.reshape(relu_x, newshape=newshape)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function(params, y)
-    check_result(args, mod, expected)
-
-
-@tvm.testing.uses_gpu
-def test_any_reshape():
-    for variable_newshape in [False, True]:
-        # Variable newshape only supports that output rank is the same as newshape
-        verify_any_reshape(any_dims(3), (1, -1), (2, 3, 4), (1, 24), variable_newshape)
-        verify_any_reshape(any_dims(3), (0, -1), (2, 3, 4), (2, 12), variable_newshape)
-    verify_any_reshape(any_dims(3), (0, -2), (2, 3, 4), (2, 3, 4))
-    verify_any_reshape(any_dims(3), (-4, -1, 2, -3), (6, 3, 4), (3, 2, 12))
-    verify_any_reshape(any_dims(3), (-4, 2, -1, -2), (6, 3, 4), (2, 3, 3, 4))
-    verify_any_reshape(any_dims(3), (1, -1, 0), (2, 3, 4), (1, 6, 4))
-    verify_any_reshape(any_dims(3), (-1, 1, 0), (2, 3, 4), (6, 1, 4))
-
-
-def verify_any_one_hot(indices_shape, indices_np_shape, depth, on_value, off_value, axis, dtype):
-    indices = relay.var("indices", shape=indices_shape, dtype="int32")
-    on_value_const = relay.const(on_value, dtype)
-    off_value_const = relay.const(off_value, dtype)
-    y = relay.one_hot(indices, on_value_const, off_value_const, depth, axis=axis, dtype=dtype)
-    params = [indices]
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function(params, y)
-
-    indices_npy = np.random.randint(0, depth, size=indices_np_shape).astype("int32")
-    out_npy = tvm.topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype)
-    args = [indices_npy]
-    check_result(args, mod, out_npy)
-
-
-@tvm.testing.uses_gpu
-def test_any_one_hot():
-    verify_any_one_hot(any_dims(1), (3,), 3, 1, 0, -1, "int32")
-    verify_any_one_hot(any_dims(2), (2, 2), 5, 0.5, -0.5, 1, "float32")
-    verify_any_one_hot(any_dims(4), (3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32")
-
-
-def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
-    x = relay.var("x", shape=x_shape, dtype=dtype)
-    y = relay.argwhere(x)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y)
-    data = np.random.choice([0, 1, 2, 3], size=x_np_shape).astype(dtype)
-    expected = np.argwhere(data)
-    check_result([data], mod, expected, flatten=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_argwhere():
-    verify_any_argwhere(any_dims(1), (5,))
-    verify_any_argwhere(any_dims(2), (5, 5))
-    verify_any_argwhere(any_dims(2), (5, 5), "int32")
-    verify_any_argwhere(any_dims(2), (5, 5), "int8")
-    verify_any_argwhere(any_dims(3), (5, 5, 5))
-    verify_any_argwhere(any_dims(4), (5, 5, 5, 5))
-    verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5))
-    verify_any_argwhere(any_dims(1), (5,), "int32")
-    verify_any_argwhere(any_dims(3), (5, 5, 5), "int32")
-    verify_any_argwhere(any_dims(4), (5, 5, 5, 5), "int32")
-    verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5), "int32")
-    verify_any_argwhere(any_dims(1), (5,), "int8")
-    verify_any_argwhere(any_dims(3), (5, 5, 5), "int8")
-    verify_any_argwhere(any_dims(4), (5, 5, 5, 5), "int8")
-    verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5), "int8")
-
-
-def verify_any_take(data_shape, indices_shape, axis, data_np_shape, indices_np_shape):
-    mod = tvm.IRModule()
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    indices = relay.var("indices", shape=indices_shape, dtype="int32")
-    y = relay.take(data, indices, axis=axis)
-    mod["main"] = relay.Function([data, indices], y)
-    data_np = np.random.uniform(size=data_np_shape).astype("float32")
-    if axis is None:
-        max_index = data_np.size
-    else:
-        max_index = data_np.shape[axis]
-    indices_np = np.random.randint(max_index, size=indices_np_shape).astype("int32")
-    ref = np.take(data_np, indices_np, axis=axis)
-    check_result([data_np, indices_np], mod, ref)
-
-
-@tvm.testing.uses_gpu
-def test_any_take():
-    verify_any_take(any_dims(2), (1,), 0, (4, 5), (1,))
-    verify_any_take(any_dims(2), (), 0, (4, 5), ())
-    verify_any_take(any_dims(2), (), None, (4, 5), ())
-    verify_any_take(any_dims(3), any_dims(2), 1, (3, 4, 5), (2, 3))
-    verify_any_take(any_dims(2), any_dims(3), None, (4, 5), (2, 3, 4))
-    verify_any_take(any_dims(2), any_dims(4), -1, (4, 5), (2, 3, 4, 5))
-
-
-def verify_any_tile(dshape, reps, np_dshape, np_reps):
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=dshape, dtype="float32")
-    y = relay.tile(x, reps=reps)
-    mod["main"] = relay.Function([x], y)
-    x_data = np.random.uniform(size=np_dshape).astype("float32")
-    ref_res = np.tile(x_data, reps=np_reps)
-    check_result([x_data], mod, ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_any_tile():
-    verify_any_tile(any_dims(3), (3, 2, 1), (2, 3, 4), (3, 2, 1))
-    verify_any_tile(any_dims(3), (1, 2), (2, 3, 4), (1, 2))
-    verify_any_tile(any_dims(2), (3, 2, 1), (2, 3), (3, 2, 1))
-    verify_any_tile(any_dims(3), (1,), (2, 3, 4), (1,))
-
-
-@tvm.testing.uses_gpu
-def test_any_shape_of():
-    x = relay.var("x", shape=any_dims(2), dtype="float32")
-    y = relay.shape_of(x)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y)
-    data = np.random.uniform(size=(3, 4)).astype("float32")
-    check_result([data], mod, np.array([3, 4]).astype("int64"))
-
-    x = relay.var("x", shape=any_dims(3), dtype="float32")
-    y0 = relay.shape_of(x)
-    y1 = relay.take(y0, relay.const(1, "int32"))
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y1)
-    data = np.random.uniform(size=(2, 3, 4)).astype("float32")
-    check_result([data], mod, np.array(3).astype("int64"))
-
-
-class TestAnyReduce:
-    config = {
-        "argmax": (relay.argmax, any_dims(3), None, False, False, (3, 4, 5), ()),
-        "argmin": (relay.argmin, any_dims(4), 1, False, True, (3, 4, 5, 6), (3, 1, 5, 6)),
-        "all": (relay.all, any_dims(3), (1, 2), True, False, (3, 4, 5), (4, 5)),
-        "max": (relay.max, any_dims(4), -1, True, True, (3, 4, 5, 6), (1, 1, 1, 6)),
-        "min": (relay.min, any_dims(3), (0, 1), False, False, (4, 5, 6), (6,)),
-        "prod": (relay.prod, any_dims(4), 2, True, True, (3, 4, 5, 6), (1, 1, 5, 1)),
-        "mean": (relay.mean, any_dims(2), 0, False, False, (1, 2), (2,)),
-        "variance": (relay.variance, any_dims(5), (2, 4), False, False, (3, 4, 5, 6, 7), (3, 4, 6)),
-    }
-
-    (
-        reduce_op,
-        data_shape,
-        axis,
-        exclude,
-        keepdims,
-        static_data_shape,
-        ref_out_shape,
-    ) = tvm.testing.parameters(*config.values(), ids=config.keys())
-
-    def test_any_reduce(
-        self,
-        target,
-        dev,
-        reduce_op,
-        data_shape,
-        axis,
-        exclude,
-        keepdims,
-        static_data_shape,
-        ref_out_shape,
-    ):
-        target = tvm.target.Target(target)
-        if target.kind.name == "vulkan" and reduce_op == relay.all:
-            pytest.xfail("Known failing test case for vulkan runtime")
-
-        mod = tvm.IRModule()
-        dtype = "bool" if reduce_op == relay.all else "float32"
-        data = relay.var("data", shape=data_shape, dtype=dtype)
-        y = reduce_op(data, axis, keepdims, exclude)
-        mod["main"] = relay.Function([data], y)
-        data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-        check_result([data_np], mod, ref_out_shape, assert_shape=True, targets=[(target, dev)])
-
-
-def verify_any_layout_transform(
-    data_shape, src_layout, dst_layout, static_data_shape, ref_out_shape
-):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.layout_transform(data, src_layout, dst_layout)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_layout_transform():
-    verify_any_layout_transform(any_dims(4), "NCHW", "NHWC", (3, 4, 5, 6), (3, 5, 6, 4))
-    verify_any_layout_transform(
-        any_dims(5), "NCHW16c", "NCHW2c", (1, 2, 8, 8, 16), (1, 16, 8, 8, 2)
-    )
-    verify_any_layout_transform(any_dims(5), "NCHW6n", "NHWC", (3, 4, 5, 6, 6), (18, 5, 6, 4))
-    verify_any_layout_transform(any_dims(4), "NCHW", "NCHW4c", (3, 4, 5, 6), (3, 1, 5, 6, 4))
-    verify_any_layout_transform((16, 1), "CH", "C4cH", (16, 1), (4, 4, 1))
-
-
-def test_bilayout_with_any():
-    bilayout = tvm.tir.bijective_layout("NCHW", "NHWC")
-    assert isinstance(bilayout, tvm.tir.BijectiveLayout)
-    dst_shape = bilayout.forward_shape((relay.Any(), 32, 7, relay.Any()))
-    assert dst_shape[3] == 32
-    src_shape = bilayout.backward_shape(dst_shape)
-    assert src_shape[1] == 32
-
-
-def verify_any_expand_dims(data_shape, axis, num_newaxis, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.expand_dims(data, axis=axis, num_newaxis=num_newaxis)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_expand_dims():
-    verify_any_expand_dims(any_dims(3), 1, 2, (1, 2, 3), (1, 1, 1, 2, 3))
-    verify_any_expand_dims(any_dims(3), -1, 2, (1, 2, 3), (1, 2, 3, 1, 1))
-
-
-def verify_any_transpose(data_shape, axes, static_data_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.transpose(data, axes=axes)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_out = np.transpose(data_np, axes)
-    check_result([data_np], mod, ref_out)
-
-
-@tvm.testing.uses_gpu
-def test_any_transpose():
-    verify_any_transpose(any_dims(3), (1, 0, 2), (10, 3, 2))
-    verify_any_transpose(any_dims(3), None, (2, 3, 4))
-    verify_any_transpose(any_dims(6), (0, 1, 3, 2, 5, 4), (11, 12, 2, 1, 9, 17))
-    verify_any_transpose(any_dims(2), (-1, 0), (3, 2))
-
-
-def verify_any_squeeze(data_shape, axis, static_data_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.squeeze(data, axis=axis)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_out = np.squeeze(data_np, axis)
-    check_result([data_np], mod, ref_out)
-
-
-def verify_any_squeeze_sqrt(data_shape, axis, static_data_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.squeeze(data, axis=axis)
-    y = relay.sqrt(y)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_out = np.sqrt(np.squeeze(data_np, axis))
-    check_result([data_np], mod, ref_out)
-
-
-@tvm.testing.uses_gpu
-def test_any_squeeze():
-    verify_any_squeeze((relay.Any(), relay.Any(), relay.Any()), (0,), (1, 9, 8))
-    verify_any_squeeze((1, relay.Any(), relay.Any()), (0,), (1, 9, 8))
-    verify_any_squeeze(
-        (1, relay.Any(), relay.Any(), 1, relay.Any(), relay.Any()), (0, 3), (1, 12, 2, 1, 9, 17)
-    )
-    verify_any_squeeze_sqrt((1, relay.Any(), 12, 32, 1), (-1,), (1, 100, 12, 32, 1))
-    verify_any_squeeze_sqrt((relay.Any(), relay.Any(), relay.Any(), 1), (-1,), (1, 9, 8, 1))
-
-
-@tvm.testing.uses_gpu
-def test_any_reshape_like():
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=(relay.Any(), 3, 10), dtype=dtype)
-    shape_like = relay.var("data", shape=(relay.Any(), 5, 6), dtype=dtype)
-    y = relay.reshape_like(data, shape_like)
-    mod["main"] = relay.Function([data, shape_like], y)
-    data_np = np.random.uniform(size=(3, 3, 10)).astype(dtype)
-    shape_like_np = np.random.uniform(size=(3, 5, 6)).astype(dtype)
-    check_result([data_np, shape_like_np], mod, shape_like_np.shape, assert_shape=True)
-
-
-def verify_any_conv2d(
-    data_shape,
-    kernel_shape,
-    strides,
-    padding,
-    dilation,
-    static_data_shape,
-    ref_out_shape,
-    data_layout="NCHW",
-    kernel_layout="OIHW",
-    use_cudnn=False,
-    targets=None,
-    disable_targets=None,
-):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype)
-    y = relay.nn.conv2d(
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        kernel_size=kernel_shape[2:4] if kernel_layout == "OIHW" else kernel_shape[0:2],
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-    mod["main"] = relay.Function([data, kernel], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
-
-    if use_cudnn and tvm.get_global_func("tvm.contrib.cudnn.conv2d.forward", True):
-        targets = [("cuda -libs=cudnn", tvm.cuda(0))]
-
-    check_result(
-        [data_np, kernel_np],
-        mod,
-        ref_out_shape,
-        assert_shape=True,
-        targets=targets,
-        disable_targets=disable_targets,
-    )
-
-
-# TODO(@kevinthesun): Support dynamic input height and width.
-@tvm.testing.uses_gpu
-def test_any_conv2d():
-    verify_any_conv2d(
-        (relay.Any(), 64, 224, 224),
-        (64, 64, 3, 3),
-        (1, 1),
-        (1, 1),
-        (1, 1),
-        (1, 64, 224, 224),
-        (1, 64, 224, 224),
-    )
-    verify_any_conv2d(
-        (relay.Any(), 64, 224, 224),
-        (64, 64, 3, 3),
-        (1, 1),
-        (1, 1),
-        (2, 2),
-        (2, 64, 224, 224),
-        (2, 64, 222, 222),
-    )
-    verify_any_conv2d(
-        (relay.Any(), 64, 224, 224),
-        (64, 64, 3, 3),
-        (1, 1),
-        (1, 1),
-        (1, 1),
-        (1, 64, 224, 224),
-        (1, 64, 224, 224),
-        use_cudnn=True,
-    )
-    verify_any_conv2d(
-        (relay.Any(), 224, 224, 64),
-        (3, 3, 64, 64),
-        (1, 1),
-        (1, 1),
-        (1, 1),
-        (1, 224, 224, 64),
-        (1, 224, 224, 64),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-    verify_any_conv2d(
-        (relay.Any(), 224, 224, 64),
-        (3, 3, 64, 64),
-        (1, 1),
-        (1, 1),
-        (2, 2),
-        (2, 224, 224, 64),
-        (2, 222, 222, 64),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-    )
-
-    if platform.machine() == "aarch64":
-        pytest.skip(
-            reason="Dynamic height and width not supported in arm_cpu. See https://github.com/apache/tvm/issues/16536"
-        )
-
-    verify_any_conv2d(
-        (relay.Any(), 64, relay.Any(), relay.Any()),
-        (64, 64, 3, 3),
-        (1, 1),
-        (1, 1),
-        (1, 1),
-        (1, 64, 224, 224),
-        (1, 64, 224, 224),
-        targets=[("llvm", tvm.cpu(0))],
-    )
-    verify_any_conv2d(
-        (relay.Any(), 64, relay.Any(), relay.Any()),
-        (64, 64, 1, 1),
-        (1, 1),
-        (0, 0),
-        (1, 1),
-        (1, 64, 224, 224),
-        (1, 64, 224, 224),
-        targets=[("llvm", tvm.cpu(0))],
-    )
-
-
-class TestAnyConv2dNCHWc:
-    data_shape = tvm.testing.parameter((relay.Any(), 8, 224, 224, 8))
-    kernel_shape = tvm.testing.parameter((8, 8, 3, 3, 8, 8))
-    strides = tvm.testing.parameter((1, 1))
-    padding = tvm.testing.parameter((1, 1))
-    data_layout = tvm.testing.parameter("NCHW8c")
-    kernel_layout = tvm.testing.parameter("OIHW8i8o")
-    out_layout = tvm.testing.parameter("NCHW8c")
-
-    dilation, static_data_shape, ref_out_shape = tvm.testing.parameters(
-        ((1, 1), (1, 8, 224, 224, 8), (1, 8, 224, 224, 8)),
-        ((2, 2), (2, 8, 224, 224, 8), (2, 8, 222, 222, 8)),
-    )
-
-    @tvm.testing.known_failing_targets("cuda", "vulkan")
-    def test_any_conv2d_NCHWc(
-        self,
-        target,
-        dev,
-        data_shape,
-        kernel_shape,
-        strides,
-        padding,
-        dilation,
-        data_layout,
-        kernel_layout,
-        out_layout,
-        static_data_shape,
-        ref_out_shape,
-    ):
-        mod = tvm.IRModule()
-        dtype = "float32"
-        data = relay.var("data", shape=data_shape, dtype=dtype)
-        kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype)
-        y = relay.nn.contrib_conv2d_nchwc(
-            data,
-            kernel,
-            strides,
-            padding,
-            dilation,
-            kernel_size=kernel_shape[2:4],
-            channels=kernel_shape[0] * kernel_shape[-1],
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_layout=out_layout,
-        )
-        mod["main"] = relay.Function([data, kernel], y)
-        data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-        kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
-        check_result(
-            [data_np, kernel_np], mod, ref_out_shape, assert_shape=True, targets=[(target, dev)]
-        )
-
-
-def verify_any_conv1d_transpose_ncw(
-    data_shape,
-    kernel_shape,
-    strides,
-    padding,
-    dilation,
-    groups,
-    static_data_shape,
-    ref_out_shape,
-    output_padding,
-):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype)
-    y = relay.nn.conv1d_transpose(
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        groups,
-        kernel_size=kernel_shape[2:],
-        output_padding=output_padding,
-    )
-    mod["main"] = relay.Function([data, kernel], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
-    check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_conv1d_transpose_ncw():
-    verify_any_conv1d_transpose_ncw(
-        (relay.Any(), 64, 224),
-        (64, 192, 3),
-        (1,),
-        (1,),
-        (1,),
-        1,
-        (2, 64, 224),
-        (2, 192, 224),
-        (0, 0),
-    )
-    verify_any_conv1d_transpose_ncw(
-        (relay.Any(), 32, 224),
-        (32, 64, 3),
-        (2,),
-        (1,),
-        (1,),
-        1,
-        (1, 32, 224),
-        (1, 64, 448),
-        (1, 1),
-    )
-
-
-def verify_any_conv2d_transpose_nchw(
-    data_shape,
-    kernel_shape,
-    strides,
-    padding,
-    dilation,
-    groups,
-    static_data_shape,
-    ref_out_shape,
-    output_padding,
-):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype)
-    y = relay.nn.conv2d_transpose(
-        data,
-        kernel,
-        strides,
-        padding,
-        dilation,
-        groups,
-        kernel_size=kernel_shape[2:4],
-        output_padding=output_padding,
-    )
-    mod["main"] = relay.Function([data, kernel], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
-    check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True)
-
-
-# TODO(@kevinthesun): Support dynamic input height and width.
-@tvm.testing.uses_gpu
-def test_any_conv2d_transpose_nchw():
-    verify_any_conv2d_transpose_nchw(
-        (relay.Any(), 64, 224, 224),
-        (64, 192, 3, 3),
-        (1, 1),
-        (1, 1),
-        (1, 1),
-        1,
-        (2, 64, 224, 224),
-        (2, 192, 224, 224),
-        (0, 0),
-    )
-    verify_any_conv2d_transpose_nchw(
-        (relay.Any(), 32, 224, 224),
-        (32, 64, 3, 3),
-        (2, 2),
-        (1, 1),
-        (1, 1),
-        1,
-        (1, 32, 224, 224),
-        (1, 64, 448, 448),
-        (1, 1),
-    )
-
-
-def verify_any_pool2d(
-    pool_type,
-    data_shape,
-    pool_size,
-    strides,
-    dilation,
-    padding,
-    layout,
-    static_data_shape,
-    ref_out_shape,
-):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    pool_func = relay.nn.max_pool2d if pool_type == "max" else relay.nn.avg_pool2d
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = pool_func(data, pool_size, strides, dilation, padding, layout)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_pool2d():
-    verify_any_pool2d(
-        "max",
-        (relay.Any(), 3, relay.Any(), relay.Any()),
-        (3, 3),
-        (1, 1),
-        (1, 1),
-        (1, 1),
-        "NCHW",
-        (2, 3, 220, 220),
-        (2, 3, 220, 220),
-    )
-    verify_any_pool2d(
-        "avg",
-        (relay.Any(), relay.Any(), relay.Any(), 4),
-        (1, 1),
-        (2, 2),
-        (1, 1),
-        (0, 0),
-        "NHWC",
-        (3, 220, 220, 4),
-        (3, 110, 110, 4),
-    )
-    verify_any_pool2d(
-        "max",
-        (relay.Any(), 3, relay.Any(), relay.Any(), 4),
-        (3, 3),
-        (2, 2),
-        (1, 1),
-        (1, 1),
-        "NCHW4c",
-        (2, 3, 220, 220, 4),
-        (2, 3, 110, 110, 4),
-    )
-
-
-def verify_any_global_pool2d(pool_type, data_shape, layout, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    pool_func = relay.nn.global_max_pool2d if pool_type == "max" else relay.nn.global_avg_pool2d
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = pool_func(data, layout)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_global_pool2d():
-    verify_any_global_pool2d(
-        "max", (relay.Any(), 3, relay.Any(), relay.Any()), "NCHW", (2, 3, 220, 220), (2, 3, 1, 1)
-    )
-    verify_any_global_pool2d(
-        "avg", (relay.Any(), relay.Any(), relay.Any(), 4), "NHWC", (3, 220, 220, 4), (3, 1, 1, 4)
-    )
-    verify_any_global_pool2d(
-        "max",
-        (relay.Any(), 3, relay.Any(), relay.Any(), 4),
-        "NCHW4c",
-        (2, 3, 220, 220, 4),
-        (2, 3, 1, 1, 4),
-    )
-
-
-def verify_any_split(data_shape, indices_or_sections, axis, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.split(data, indices_or_sections, axis)
-    mod["main"] = relay.Function([data], y.astuple())
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    for kind in ["vm"]:
-        result = relay.create_executor(kind, mod=mod, device=tvm.cpu(), target="llvm").evaluate()(
-            data_np
-        )
-        for ret, ref_ret in zip(result, ref_out_shape):
-            assert ret.numpy().shape == ref_ret, "Shape mismatch: expect %s but got %s." % (
-                str(ref_ret),
-                str(ret.numpy().shape),
-            )
-
-
-@tvm.testing.uses_gpu
-def test_any_split():
-    verify_any_split((relay.Any(), 4), 2, -1, (9, 4), [(9, 2), (9, 2)])
-    verify_any_split((relay.Any(), 4), 2, 1, (9, 4), [(9, 2), (9, 2)])
-    verify_any_split((relay.Any(), relay.Any()), 2, 1, (9, 4), [(9, 2), (9, 2)])
-    verify_any_split((relay.Any(), 12), (1, 4, 8), 1, (7, 12), [(7, 1), (7, 3), (7, 4)])
-    verify_any_split((relay.Any(), relay.Any()), (1, 4, 8), 1, (7, 12), [(7, 1), (7, 3), (7, 4)])
-    verify_any_split((relay.Any(), 12), (8,), 1, (7, 12), [(7, 8), (7, 4)])
-    verify_any_split((relay.Any(), relay.Any()), (8,), 1, (7, 12), [(7, 8), (7, 4)])
-
-
-@tvm.testing.uses_gpu
-def test_any_batch_flatten():
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=any_dims(3), dtype=dtype)
-    y = relay.nn.batch_flatten(data)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=(3, 3, 10)).astype(dtype)
-    ref_out_shape = (3, 30)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-# TODO(tvm-team) Fix dense schedule
-@tvm.testing.known_failing_targets("cuda", "vulkan")
-class TestAnyDense:
-    (
-        data_shape,
-        weight_shape,
-        units,
-        static_data_shape,
-        static_weight_shape,
-        ref_out_shape,
-    ) = tvm.testing.parameters(
-        (any_dims(2), any_dims(2), None, (4, 16), (8, 16), (4, 8)),
-        (any_dims(2), (50, relay.Any()), 50, (4, 40), (50, 40), (4, 50)),
-    )
-
-    @tvm.testing.known_failing_targets("cuda", "vulkan")
-    def test_any_dense(
-        self,
-        target,
-        dev,
-        data_shape,
-        weight_shape,
-        units,
-        static_data_shape,
-        static_weight_shape,
-        ref_out_shape,
-    ):
-
-        if platform.machine() == "aarch64":
-            pytest.skip(
-                reason="Dynamic height and width not supported in arm_cpu. See https://github.com/apache/tvm/issues/16536"
-            )
-
-        mod = tvm.IRModule()
-        dtype = "float32"
-        data = relay.var("data", shape=data_shape, dtype=dtype)
-        weight = relay.var("weight", shape=weight_shape, dtype=dtype)
-        y = relay.nn.dense(data, weight, units)
-        mod["main"] = relay.Function([data, weight], y)
-        data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-        weight_np = np.random.uniform(size=static_weight_shape).astype(dtype)
-
-        check_result(
-            [data_np, weight_np], mod, ref_out_shape, assert_shape=True, targets=[(target, dev)]
-        )
-
-    @tvm.testing.parametrize_targets("cuda -libs=cublas")
-    @tvm.testing.known_failing_targets("cuda", "vulkan")
-    def test_any_dense_cublas(
-        self,
-        target,
-        dev,
-        data_shape,
-        weight_shape,
-        units,
-        static_data_shape,
-        static_weight_shape,
-        ref_out_shape,
-    ):
-
-        self.test_any_dense(
-            target,
-            dev,
-            data_shape,
-            weight_shape,
-            units,
-            static_data_shape,
-            static_weight_shape,
-            ref_out_shape,
-        )
-
-
-class TestAnyBatchMatmul:
-    dtype = tvm.testing.parameter("float32")
-    executor_kind = tvm.testing.parameter("vm", "debug")
-
-    (x_shape, y_shape) = tvm.testing.parameters(
-        ((1, 16, 32), (1, 32, 16)),
-        ((5, 16, 32), (5, 32, 16)),
-        ((5, 16, 32), (5, 32, 20)),
-        ((30, 16, 32), (30, 32, 20)),
-    )
-
-    # any_x = tvm.testing.parameter("none", "batch")
-    # any_y = tvm.testing.parameter("none", "batch", "all")
-
-    any_x, any_y = tvm.testing.parameters(
-        ("none", "batch"), ("none", "all"), ("batch", "none"), ("batch", "batch"), ("batch", "all")
-    )
-
-    transpose_x = tvm.testing.parameter(True, False)
-    transpose_y = tvm.testing.parameter(True, False)
-
-    @tvm.testing.fixture
-    def x_var_shape(self, x_shape, any_x):
-        if any_x == "none":
-            return x_shape
-        elif any_x == "batch":
-            return tuple(relay.Any() if i == 0 else size for i, size in enumerate(x_shape))
-        elif any_x == "all":
-            return tuple(relay.Any() for _ in x_shape)
-
-    @tvm.testing.fixture
-    def y_var_shape(self, y_shape, any_y):
-        if any_y == "none":
-            return y_shape
-        elif any_y == "batch":
-            return tuple(relay.Any() if i == 0 else size for i, size in enumerate(y_shape))
-        elif any_y == "all":
-            return tuple(relay.Any() for _ in y_shape)
-
-    @tvm.testing.known_failing_targets("cuda", "vulkan")
-    def test_any_batch_matmul(
-        self,
-        target,
-        dev,
-        x_shape,
-        y_shape,
-        any_x,
-        any_y,
-        x_var_shape,
-        y_var_shape,
-        transpose_x,
-        transpose_y,
-        executor_kind,
-        dtype,
-    ):
-        if transpose_x:
-            x_shape = (x_shape[0], x_shape[2], x_shape[1])
-            x_var_shape = (x_var_shape[0], x_var_shape[2], x_var_shape[1])
-
-        if transpose_y:
-            y_shape = (y_shape[0], y_shape[2], y_shape[1])
-            y_var_shape = (y_var_shape[0], y_var_shape[2], y_var_shape[1])
-
-        x = relay.var("x", relay.TensorType(x_var_shape, dtype))
-        y = relay.var("y", relay.TensorType(y_var_shape, dtype))
-        z = relay.nn.batch_matmul(x, y, transpose_a=transpose_x, transpose_b=transpose_y)
-
-        func = relay.Function([x, y], z)
-        x_np = np.random.uniform(size=x_shape).astype(dtype)
-        y_np = np.random.uniform(size=y_shape).astype(dtype)
-        z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=transpose_x, trans_y=transpose_y)
-
-        mod = tvm.ir.IRModule.from_expr(func)
-        z = relay.create_executor(executor_kind, mod=mod, device=dev, target=target).evaluate()(
-            x_np, y_np
-        )
-        tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def verify_any_pad(data_shape, pad_width, static_data_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.nn.pad(data, pad_width)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_out = np.pad(data_np, pad_width)
-    check_result([data_np], mod, ref_out)
-
-
-@tvm.testing.uses_gpu
-def test_any_pad():
-    verify_any_pad(any_dims(3), ((0, 0), (1, 1), (2, 2)), (1, 2, 3))
-    verify_any_pad(any_dims(4), ((1, 0), (1, 3), (0, 2), (9, 0)), (13, 11, 3, 1))
-
-
-def verify_any_dilate(data_shape, strides, static_data_shape, dilation_value=None):
-    assert len(data_shape) == len(strides)
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    if dilation_value is None:
-        y = relay.nn.dilate(data, strides)
-    else:
-        y = relay.nn.dilate(data, strides, dilation_value)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_shape = tuple(
-        (static_data_shape[i] - 1) * strides[i] + 1 for i in range(len(static_data_shape))
-    )
-    if dilation_value is None:
-        dilation_value = 0.0
-    ref_out = np.ones(shape=ref_shape, dtype=dtype)
-    ref_out = dilation_value * ref_out
-    ref_out[tuple(slice(None, None, strides[i]) for i in range(len(data_shape)))] = data_np
-    check_result([data_np], mod, ref_out)
-
-
-@tvm.testing.uses_gpu
-def test_any_dilate():
-    verify_any_dilate(any_dims(1), (1,), (1,))
-    verify_any_dilate(any_dims(1), (1,), (5,))
-    verify_any_dilate(any_dims(1), (5,), (5,))
-    verify_any_dilate(any_dims(3), (1, 1, 1), (1, 2, 3))
-    verify_any_dilate(any_dims(3), (1, 1, 2), (1, 2, 3))
-    verify_any_dilate(any_dims(3), (1, 1, 5), (1, 2, 3))
-    verify_any_dilate(any_dims(3), (3, 7, 5), (1, 2, 3))
-    verify_any_dilate(any_dims(4), (3, 7, 1, 5), (1, 2, 3, 4))
-    verify_any_dilate(any_dims(4), (3, 7, 1, 5), (1, 2, 3, 4), 1.0)
-
-
-def verify_any_softmax(data_shape, axis, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.nn.softmax(data, axis)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_softmax():
-    verify_any_softmax(any_dims(3), -1, (1, 2, 3), (1, 2, 3))
-    verify_any_softmax(any_dims(4), 2, (13, 11, 3, 1), (13, 11, 3, 1))
-
-
-def verify_any_relu(data_shape, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.nn.relu(data)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_relu():
-    verify_any_relu(any_dims(3), (1, 2, 3), (1, 2, 3))
-    verify_any_relu(any_dims(4), (13, 11, 3, 1), (13, 11, 3, 1))
-
-
-def verify_any_prelu(data_shape, alpha, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    alpha = relay.const(np.array([alpha]), dtype=dtype)
-    y = relay.nn.prelu(data, alpha)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_prelu():
-    verify_any_prelu(any_dims(3), 1, (1, 2, 3), (1, 2, 3))
-    verify_any_prelu(any_dims(4), 2, (13, 11, 3, 1), (13, 11, 3, 1))
-
-
-def verify_any_leaky_relu(data_shape, alpha, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.nn.leaky_relu(data, alpha)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_leaky_relu():
-    verify_any_leaky_relu(any_dims(3), 0.1, (1, 2, 3), (1, 2, 3))
-    verify_any_leaky_relu(any_dims(4), 0.2, (13, 11, 3, 1), (13, 11, 3, 1))
-
-
-def verify_any_bias_add(data_shape, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    bias = relay.const(np.random.randn(1), dtype=dtype)
-    y = relay.nn.bias_add(data, bias)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_bias_add():
-    verify_any_bias_add(any_dims(3), (1, 2, 3), (1, 2, 3))
-    verify_any_bias_add(any_dims(4), (13, 11, 3, 1), (13, 11, 3, 1))
-
-
-def verify_any_topk(data_shape, kval, np_dshape, dtype, ret_type="indices", const_k=False):
-    mod = tvm.IRModule()
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    np_data = np.random.uniform(size=np_dshape).astype(dtype)
-    if const_k:
-        k = relay.const(kval)
-        args = [data]
-        in_vals = [np_data]
-    else:
-        k = relay.var("k", shape=(), dtype="int32")
-        args = [data, k]
-        in_vals = [np_data, kval]
-    out = relay.topk(data, k, ret_type=ret_type)
-    if ret_type == "both":
-        out = out[0]
-    mod["main"] = relay.Function(args, out)
-
-    sorted = np.argsort(-np_data)
-    if len(np_dshape) == 2:
-        ref_out = sorted[:, 0:kval]
-    else:
-        ref_out = sorted[0:kval]
-
-    check_result(in_vals, mod, ref_out)
-
-
-@tvm.testing.uses_gpu
-def test_any_topk():
-    verify_any_topk(any_dims(1), 5, (10,), "float32")
-    verify_any_topk(any_dims(2), 2, (6, 3), "int32")
-    verify_any_topk(any_dims(2), 3, (6, 3), "float32", const_k=True)
-    verify_any_topk(any_dims(1), 0, (0,), "float32", ret_type="both")
-
-
-def verify_any_get_valid_counts(num_anchor_real, dtype, targets=None):
-    mod = tvm.IRModule()
-    batch_size = 1
-    num_anchor = relay.Any()
-    data = relay.var("data", shape=(batch_size, num_anchor, 5), dtype=dtype)
-    np_data = np.random.uniform(size=(batch_size, num_anchor_real, 5)).astype(dtype)
-
-    np_out1 = np.zeros(shape=(batch_size,))
-    np_out2 = np.zeros(shape=np_data.shape).astype(dtype)
-    np_out3 = np.zeros(shape=(batch_size, num_anchor_real))
-    score_threshold = 0.95
-
-    for i in range(batch_size):
-        np_out1[i] = 0
-        inter_idx = 0
-        for j in range(num_anchor_real):
-            score = np_data[i, j, 0]
-            if score > score_threshold:
-                for k in range(5):
-                    np_out2[i, inter_idx, k] = np_data[i, j, k]
-                np_out1[i] += 1
-                np_out3[i, inter_idx] = j
-                inter_idx += 1
-            if j >= np_out1[i]:
-                for k in range(5):
-                    np_out2[i, j, k] = -1.0
-                np_out3[i, j] = -1
-
-    z = relay.vision.get_valid_counts(data, score_threshold, 0, score_index=0)
-
-    mod["main"] = relay.Function([data], z.astuple())
-
-    check_result([np_data], mod, [np_out1, np_out2, np_out3], targets=targets)
-
-
-@tvm.testing.uses_gpu
-def test_any_get_valid_counts():
-    verify_any_get_valid_counts(10, "float32")
-    # opencl seems to have issues with empty size buffer
-    # Check failed: err_code == CL_SUCCESS == false: OpenCL Error,
-    # code=-61: CL_INVALID_BUFFER_SIZE
-    targets = []
-    for tgt, dev in tvm.testing.enabled_targets():
-        if "opencl" not in tgt:
-            targets.append((tgt, dev))
-    verify_any_get_valid_counts(0, "float32", targets=targets)
-
-
-@tvm.testing.uses_gpu
-def test_fused_ops():
-    x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype="float32")
-    y0 = x + relay.const(1.0, "float32")
-    y1 = y0 * relay.const(2.0, "float32")
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y1)
-    data = np.random.uniform(size=(5, 4)).astype("float32")
-    check_result([data], mod, (data + 1) * 2)
-
-
-@tvm.testing.uses_gpu
-def test_arange_with_dynamic_shape():
-    # m, n, k = relay.ShapeVar('m'), relay.ShapeVar('n'), relay.ShapeVar('k')
-    m, n, k = relay.Any(), relay.Any(), relay.Any()
-    x = relay.var("x", shape=(m, n, k), dtype="float32")
-    y0 = relay.shape_of(x)
-    y1 = relay.take(y0, relay.const(0, "int32"))
-    y2 = relay.op.arange(y1, dtype="int32")
-    y3 = y2 + relay.const(1, dtype="int32")
-    data = np.random.rand(10, 5, 3).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y3)
-    check_result([data], mod, np.array(range(10)).astype("int32") + 1)
-
-
-def verify_any_random_strided_slice(
-    begin_shape,
-    end_shape,
-    strides_shape,
-    data_shape,
-    slice_mode="end",
-    const_attrs=False,
-):
-    # Generate random numpy input data
-    np_begin = np.random.randint(2, size=begin_shape, dtype="int32")
-    np_end = np.random.randint(5, 10, size=end_shape, dtype="int32")
-    np_strides = np.random.randint(
-        1, 2 if slice_mode == "size" else 3, size=strides_shape, dtype="int32"
-    )
-
-    verify_any_strided_slice(
-        np_begin, np_end, np_strides, data_shape, slice_mode=slice_mode, const_attrs=const_attrs
-    )
-
-
-def verify_any_strided_slice(
-    np_begin,
-    np_end,
-    np_strides,
-    data_shape,
-    axes=None,
-    slice_mode="end",
-    const_attrs=False,
-):
-    np_data = np.random.uniform(size=data_shape).astype("float32")
-    # target numpy result
-    ref_res = tvm.topi.testing.strided_slice_python(
-        np_data, np_begin, np_end, np_strides, slice_mode, axes
-    )
-
-    # Relay Module
-    mod = tvm.IRModule()
-    data = relay.var("data", shape=any_dims(len(data_shape)), dtype="float32")
-    if const_attrs:
-        begin = relay.const(np_begin)
-        end = relay.const(np_end)
-        strides = relay.const(np_strides)
-        args = [data]
-        np_inputs = [np_data]
-    else:
-        begin = relay.var("begin", shape=np_begin.shape, dtype="int32")
-        end = relay.var("end", shape=np_end.shape, dtype="int32")
-        strides = relay.var("strides", shape=np_strides.shape, dtype="int32")
-        args = [data, begin, end, strides]
-        np_inputs = [np_data, np_begin, np_end, np_strides]
-
-    y = relay.strided_slice(
-        data, begin=begin, end=end, strides=strides, axes=axes, slice_mode=slice_mode
-    )
-    mod["main"] = relay.Function(args, y)
-
-    check_result(np_inputs, mod, ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_any_strided_slice():
-    verify_any_random_strided_slice((2,), (2,), (2,), (15, 21))
-    verify_any_random_strided_slice((3,), (3,), (3,), (15, 17, 21))
-    verify_any_random_strided_slice((3,), (3,), (3,), (23, 29, 41))
-    verify_any_random_strided_slice((4,), (4,), (4,), (40, 50, 60, 70))
-    verify_any_random_strided_slice((3,), (3,), (3,), (15, 17, 21), slice_mode="size")
-    verify_any_random_strided_slice((2,), (2,), (2,), (15, 21), const_attrs=True)
-
-    begin = np.array([0, 1000000]).astype("int32")
-    end = np.array([1000000, -1000000]).astype("int32")
-    strides = np.array([1, -1]).astype("int32")
-    verify_any_strided_slice(begin, end, strides, (15, 21), const_attrs=False)
-    verify_any_strided_slice(begin, end, strides, (15, 21), const_attrs=True)
-    verify_any_strided_slice(begin, end, strides, (15, 17, 21), axes=[0, 2], const_attrs=True)
-
-
-@tvm.testing.uses_gpu
-def test_recursive_concat():
-    """
-    fn @concat_loop(%i: int32, %st: (any, 1)) -> (any, 1) {
-        if (%i < 10) {
-            let %i = reshape(cast(i, "float32"), newshape=(1, ))
-            let %new_st = concatenate((st, i), axis=0)
-            concat_loop(%i + 1, )
-        } else {
-            st
-        }
-    }
-    """
-    # Initial Values.
-    i = relay.var("i", shape=(), dtype="int32")
-    st = relay.var("st", shape=(relay.Any(), 1), dtype="int32")
-
-    def _cond(i, st):
-        return relay.op.min(relay.op.less(i, int32(10)))
-
-    def _body(i, st):
-        i_vec = relay.op.reshape(i, (1, 1))
-        ret = relay.op.concatenate([st, i_vec], axis=0)
-        return i + int32(1), ret
-
-    loop = while_loop(_cond, [i, st], _body)
-    start = relay.var("start", shape=(), dtype="int32")
-    body = loop(start, relay.op.reshape(relay.const(0), newshape=(1, 1)))
-    func = relay.Function([start], relay.TupleGetItem(body, 1))
-    mod = tvm.IRModule()
-    mod["main"] = func
-    data = np.array(0.0, dtype="int32")
-    ref = np.array([0] + list(range(10))).reshape((11, 1)).astype("int32")
-    check_result([data], mod, ref)
-
-
-@tvm.testing.uses_gpu
-def test_recursive_concat_with_wrong_annotation():
-    """
-    v0.0.1
-    fn (%start: int32) {
-        %7 = {
-            let %while_loop = fn (%i: int32, %st: Tensor[(1, 1), int32]) {
-            %0 = less(%i, 10)
-            %1 = min(%0)
-            if (%1) {
-                %2 = add(%i, 1)
-                %3 = reshape(%i, newshape=[1, 1])
-                %4 = (%st, %3)
-                /* The result of concat should be 1,1 but it is 2, 1. */
-                %5 = concatenate(%4)
-                %while_loop(%2, %5)
-            } else {
-                (%i, %st)
-            }
-        }
-        %6 = reshape(0, newshape=[1, 1])
-        %while_loop(%start, %6)
-    }
-    %7.1
-    }
-    """
-    # Initial Values.
-    i = relay.var("i", shape=(), dtype="int32")
-    st = relay.var("st", shape=(1, 1), dtype="int32")
-
-    def _cond(i, st):
-        return relay.op.min(relay.op.less(i, int32(10)))
-
-    def _body(i, st):
-        i_vec = relay.op.reshape(i, (1, 1))
-        ret = relay.op.concatenate([st, i_vec], axis=0)
-        return i + int32(1), ret
-
-    loop = while_loop(_cond, [i, st], _body)
-    start = relay.var("start", shape=(), dtype="int32")
-    body = loop(start, relay.op.reshape(relay.const(0), newshape=(1, 1)))
-    func = relay.Function([start], relay.TupleGetItem(body, 1))
-
-    with DiagnosticTesting() as diagnostics:
-        diagnostics.assert_message(
-            "The Relay type checker is unable to show the following types match:\n"
-            "  Tensor[(2, 1), int32]\n"
-            "  Tensor[(1, 1), int32]\n"
-            "In particular:\n"
-            "  dimension 0 conflicts: 2 does not match 1."
-        )
-        func = infer_type(func)
-
-
-@tvm.testing.uses_gpu
-def test_tuple_get_item():
-    mod = tvm.IRModule()
-    dtype = "float32"
-    static_data_shape = (9, 4)
-    data_shape = (relay.Any(), 4)
-    indices_or_sections = 2
-    axis = 1
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.split(data, indices_or_sections, axis)
-    y = relay.expr.TupleGetItem(y.astuple(), 0)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_out_shape = (9, 2)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_mixed_input_type():
-    mod = tvm.IRModule()
-    dtype = "float32"
-    static_data_shape = (9, 4)
-    data_shape = (relay.Any(), 4)
-    tensor_type = relay.TensorType(data_shape, dtype)
-    tuple_type = relay.TupleType([tensor_type, tensor_type])
-    data0 = relay.var("d0", type_annotation=relay.TupleType([tuple_type, tensor_type]))
-    data1 = relay.var("d1", shape=(relay.Any(), 4), dtype=dtype)
-    data_tuple = relay.expr.TupleWrapper(data0, 2)
-    nested_data_tuple = relay.expr.TupleWrapper(data_tuple[0], 2)
-    y = nested_data_tuple[1] * data_tuple[1] + data1
-    mod["main"] = relay.Function([data0, data1], y)
-    data_np0 = np.random.uniform(size=static_data_shape).astype(dtype)
-    data_np1 = np.random.uniform(size=static_data_shape).astype(dtype)
-    ref_out_shape = (9, 4)
-    check_result(
-        [[[data_np0, data_np0], data_np0], data_np1],
-        mod,
-        ref_out_shape,
-        assert_shape=True,
-        only_vm=True,
-    )
-
-
-def verify_any_crop_and_resize(
-    data_shape,
-    boxes_shape,
-    box_indices_shape,
-    crop_size,
-    layout,
-    static_boxes,
-    static_box_indices_shape,
-    ref_out_shape,
-):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    indices_dtype = "int32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    boxes = relay.var("boxes", shape=boxes_shape, dtype=dtype)
-    box_indices = relay.var("box_indices", shape=box_indices_shape, dtype=indices_dtype)
-    y = relay.image.crop_and_resize(data, boxes, box_indices, crop_size, layout)
-    mod["main"] = relay.Function([data, boxes, box_indices], y)
-    data_np = np.random.uniform(size=data_shape).astype(dtype)
-    boxes_np = np.random.uniform(size=static_boxes).astype(dtype)
-    box_indices_np = np.random.uniform(size=static_box_indices_shape).astype(indices_dtype)
-    check_result([data_np, boxes_np, box_indices_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_crop_and_resize():
-    verify_any_crop_and_resize(
-        data_shape=(1, 234, 234, 256),
-        boxes_shape=(relay.Any(), 4),
-        box_indices_shape=(relay.Any(),),
-        crop_size=(14, 14),
-        layout="NHWC",
-        static_boxes=(128, 4),
-        static_box_indices_shape=(128,),
-        ref_out_shape=(128, 14, 14, 256),
-    )
-    verify_any_crop_and_resize(
-        data_shape=(1, 256, 234, 234),
-        boxes_shape=(relay.Any(), 4),
-        box_indices_shape=(relay.Any(),),
-        crop_size=(14, 14),
-        layout="NCHW",
-        static_boxes=(128, 4),
-        static_box_indices_shape=(128,),
-        ref_out_shape=(128, 256, 14, 14),
-    )
-
-
-def verify_any_mirror_pad(data_shape, pad_width, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.nn.mirror_pad(data, pad_width)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_mirror_pad():
-    verify_any_mirror_pad(
-        data_shape=(1, 256, 232, 232),
-        pad_width=((0, 0), (0, 0), (1, 1), (1, 1)),
-        static_data_shape=(1, 256, 232, 232),
-        ref_out_shape=(1, 256, 234, 234),
-    )
-
-
-def verify_any_ndarray_size(data_np_shape):
-    v = relay.var("v", shape=any_dims(len(data_np_shape)), dtype="float32")
-    n = relay.ndarray_size(v, dtype="int32")
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([v], n)
-    np_data = np.zeros(data_np_shape, dtype="float32")
-    ref_res = np.size(np_data)
-    check_result([np_data], mod, ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_any_ndarray_size():
-    verify_any_ndarray_size((2,))
-    verify_any_ndarray_size((2, 2))
-    verify_any_ndarray_size((1, 2, 3, 4))
-
-
-def verify_any_resize2d(data_shape, scale, layout, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    if layout == "NHWC":
-        size = (data_shape[1] * scale, data_shape[2] * scale)
-    else:
-        size = (data_shape[2] * scale, data_shape[3] * scale)
-    y = relay.image.resize2d(data, size, None, layout)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_resize():
-    verify_any_resize2d(
-        data_shape=(relay.Any(), 4, 4, 4),
-        scale=2,
-        layout="NHWC",
-        static_data_shape=(1, 4, 4, 4),
-        ref_out_shape=(1, 8, 8, 4),
-    )
-    verify_any_resize2d(
-        data_shape=(relay.Any(), 8, 17, 20),
-        scale=3,
-        layout="NCHW",
-        static_data_shape=(2, 8, 17, 20),
-        ref_out_shape=(2, 8, 51, 60),
-    )
-
-
-def verify_any_grid_sample(data_shape, grid_shape, static_data_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    grid = relay.var("grid", shape=grid_shape, dtype=dtype)
-    y = relay.image.grid_sample(data, grid)
-    mod["main"] = relay.Function([data, grid], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    grid_np = np.random.uniform(size=grid_shape).astype(dtype)
-    check_result([data_np, grid_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_grid_sample():
-    verify_any_grid_sample(
-        data_shape=(relay.Any(), 4, 16, 32),
-        grid_shape=(4, 2, 8, 8),
-        static_data_shape=(4, 4, 16, 32),
-        ref_out_shape=(4, 4, 8, 8),
-    )
-    verify_any_grid_sample(
-        data_shape=(relay.Any(), 4, 16, 32),
-        grid_shape=(4, 2, 32, 32),
-        static_data_shape=(4, 4, 16, 32),
-        ref_out_shape=(4, 4, 32, 32),
-    )
-
-
-def verify_any_affine_grid(num_batch, static_num_batch, target_shape, ref_out_shape):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data_shape = (num_batch, 2, 3)
-    static_data_shape = (static_num_batch, 2, 3)
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.image.affine_grid(data, target_shape)
-    mod["main"] = relay.Function([data], y)
-    data_np = np.random.uniform(size=static_data_shape).astype(dtype)
-    check_result([data_np], mod, ref_out_shape, assert_shape=True)
-
-
-@tvm.testing.uses_gpu
-def test_any_affine_grid():
-    verify_any_affine_grid(
-        num_batch=relay.Any(),
-        static_num_batch=1,
-        target_shape=(16, 32),
-        ref_out_shape=(1, 2, 16, 32),
-    )
-    verify_any_affine_grid(
-        num_batch=relay.Any(),
-        static_num_batch=8,
-        target_shape=(32, 32),
-        ref_out_shape=(8, 2, 32, 32),
-    )
-
-
-def test_any_consecutive_broadcast():
-    dtype = "float32"
-    data0 = relay.var("data0", shape=any_dims(2), dtype=dtype)
-    data1 = relay.var("data1", shape=any_dims(2), dtype=dtype)
-    data2 = relay.var("data2", shape=any_dims(2), dtype=dtype)
-    data3 = relay.var("data3", shape=any_dims(2), dtype=dtype)
-
-    out0 = data0 + data1
-    out1 = data0 * data1
-    out2 = out0 - out1
-
-    out3 = data2 + data3
-    out4 = data2 * data3
-    out5 = out3 - out4
-
-    out6 = out2 * out5
-
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([data0, data1, data2, data3], out6)
-
-    np_data0 = np.random.uniform(size=(1, 4)).astype(dtype)
-    np_data1 = np.random.uniform(size=(2, 4)).astype(dtype)
-    np_data2 = np.random.uniform(size=(1, 4)).astype(dtype)
-    np_data3 = np.random.uniform(size=(2, 4)).astype(dtype)
-    ref_res = ((np_data0 + np_data1) - (np_data0 * np_data1)) * (
-        (np_data2 + np_data3) - (np_data2 * np_data3)
-    )
-    check_result([np_data0, np_data1, np_data2, np_data3], mod, ref_res)
-
-
-def test_reshape_concat():
-    dtype = "float32"
-    d0 = relay.var("d0", shape=any_dims(2), dtype=dtype)
-    d1 = relay.var("d1", shape=any_dims(3), dtype=dtype)
-    out = relay.op.concatenate([relay.op.reshape(d0, [-1]), relay.op.reshape(d1, [-1])], axis=0)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([d0, d1], out)
-    np_data0 = np.random.uniform(size=(4, 5)).astype(dtype)
-    np_data1 = np.random.uniform(size=(2, 5, 2)).astype(dtype)
-    ref_res = np.concatenate([np.reshape(np_data0, [-1]), np.reshape(np_data1, [-1])], axis=0)
-    check_result([np_data0, np_data1], mod, ref_res)
-
-    d0 = relay.var("d0", shape=any_dims(2), dtype=dtype)
-    d1 = relay.var("d1", shape=any_dims(2), dtype=dtype)
-    s0 = relay.var("s0", shape=any_dims(3), dtype=dtype)
-    s1 = relay.var("s1", shape=any_dims(3), dtype=dtype)
-    out = relay.op.concatenate(
-        [relay.op.reshape_like(d0, s0), relay.op.reshape_like(d1, s1)], axis=0
-    )
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([d0, d1, s0, s1], out)
-    np_data0 = np.random.uniform(size=(4, 5)).astype(dtype)
-    np_data1 = np.random.uniform(size=(8, 5)).astype(dtype)
-    np_shape_like0 = np.random.uniform(size=(2, 2, 5)).astype(dtype)
-    np_shape_like1 = np.random.uniform(size=(4, 2, 5)).astype(dtype)
-    ref_res = np.concatenate(
-        [np.reshape(np_data0, np_shape_like0.shape), np.reshape(np_data1, np_shape_like1.shape)],
-        axis=0,
-    )
-    check_result([np_data0, np_data1, np_shape_like0, np_shape_like1], mod, ref_res)
-
-
-def test_any_adv_index():
-    data = relay.var("data", shape=(5, relay.Any(), relay.Any()), dtype="float32")
-    index0 = relay.var("index0", shape=(1, relay.Any()), dtype="int64")
-    index1 = relay.var("index1", shape=(relay.Any(), 1), dtype="int64")
-    out = relay.adv_index([data, index0, index1])
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([data, index0, index1], out)
-    np_data_shape = (5, 5, 10)
-    np_index0_shape = (1, 4)
-    np_index1_shape = (4, 1)
-    np_data = np.random.uniform(size=np_data_shape).astype("float32")
-    np_index0 = np.random.uniform(0, np_data_shape[0], size=np_index0_shape).astype("int64")
-    np_index1 = np.random.uniform(0, np_data_shape[0], size=np_index1_shape).astype("int64")
-    ref_res = np_data[tuple([np_index0, np_index1])]
-    print(ref_res.shape)
-    check_result([np_data, np_index0, np_index1], mod, ref_res)
-
-
-def verify_any_repeat(data_shape, np_dshape, repeats, axis):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    data = relay.var("data", shape=data_shape, dtype=dtype)
-    y = relay.repeat(data, repeats, axis)
-    mod["main"] = relay.Function([data], y)
-    np_data = np.random.uniform(size=np_dshape).astype(dtype)
-    ref_res = np.repeat(np_data, repeats, axis)
-    check_result([np_data], mod, ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_any_repeat():
-    verify_any_repeat(any_dims(2), (1, 2), 2, 0)
-    verify_any_repeat(any_dims(1), (3,), 3, -1)
-    verify_any_repeat(any_dims(4), (2, 1, 1, 4), 4, 2)
-
-
-def verify_any_stack(data_shape, np_dshape, num_data, axis):
-    mod = tvm.IRModule()
-    dtype = "float32"
-    inputs = []
-    for i in range(num_data):
-        inputs.append(relay.var("data{}".format(i), shape=data_shape, dtype=dtype))
-    y = relay.stack(inputs, axis)
-    mod["main"] = relay.Function(inputs, y)
-    np_inputs = []
-    for _ in range(num_data):
-        np_inputs.append(np.random.uniform(size=np_dshape).astype(dtype))
-    ref_res = np.stack(np_inputs, axis)
-    check_result(np_inputs, mod, ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_any_stack():
-    verify_any_stack(any_dims(2), (1, 2), 3, 0)
-    verify_any_stack(any_dims(1), (3,), 4, -1)
-    verify_any_stack(any_dims(4), (2, 1, 1, 4), 2, 2)
-
-
-def verify_any_where(
-    cond_shape, x_shape, y_shape, cond_np_shape, x_np_shape, y_np_shape, y_np_shape_invalid=None
-):
-    dtype = "float32"
-    cond = relay.var("cond", shape=cond_shape, dtype="bool")
-    x = relay.var("x", shape=x_shape, dtype=dtype)
-    y = relay.var("y", shape=y_shape, dtype=dtype)
-    z = relay.where(cond, x, y)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([cond, x, y], z)
-
-    cond_np = np.random.randn(*cond_np_shape) > 0
-    x_np = np.random.randn(*x_np_shape).astype(dtype)
-    y_np = np.random.randn(*y_np_shape).astype(dtype)
-    expected = np.where(cond_np, x_np, y_np)
-
-    check_result([cond_np, x_np, y_np], mod, expected)
-
-    # verify invalid broadcasting check
-    if y_np_shape_invalid:
-        y_np_bad = np.random.randn(*y_np_shape_invalid).astype(dtype)
-        try:
-            check_result([cond_np, x_np, y_np_bad], mod, expected)
-        except tvm.error.TVMError as e:
-            error_msg = str(e).split("\n")[-1]
-            assert "Invalid broadcast shapes" in error_msg
-
-
-@tvm.testing.uses_gpu
-def test_any_where():
-    verify_any_where(any_dims(1), (5,), (5,), (5,), (5,), (5,))
-    verify_any_where(any_dims(1), any_dims(1), (5,), (5,), (5,), (5,))
-    verify_any_where(any_dims(1), any_dims(1), any_dims(1), (5,), (5,), (5,))
-    verify_any_where((5,), any_dims(1), any_dims(1), (5,), (5,), (5,))
-
-    # where with broadcast
-    verify_any_where(any_dims(1), any_dims(1), any_dims(1), (5,), (1,), (5,))
-    verify_any_where(any_dims(1), any_dims(2), any_dims(2), (5,), (5, 5), (5, 5))
-    verify_any_where(any_dims(1), any_dims(1), any_dims(2), (5,), (5,), (5, 5))
-    verify_any_where(
-        any_dims(2), any_dims(2), any_dims(2), (3, 4), (3, 1), (1, 4), y_np_shape_invalid=(2, 4)
-    )
-
-    # Test scalar where in a dynamically shaped graph
-    x = relay.var("x", shape=any_dims(1), dtype="int64")
-    y = relay.var("y", shape=any_dims(2), dtype="float32")
-
-    left = relay.take(x, relay.const(1, dtype="int32")) + relay.const(4, "int64")
-    right = relay.const(4, "int64")
-    where = relay.where(relay.const(False, "bool"), left, right)
-    z = relay.take(y, where, axis=1)
-
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], z)
-
-    x_np = np.random.randn(2).astype("int64")
-    y_np = np.random.randn(2, 6).astype("float32")
-    expected = y_np[:, 4]
-
-    check_result([x_np, y_np], mod, expected)
-
-
-@tvm.testing.uses_gpu
-def test_non_max_suppression():
-    x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 6), "float32"))
-    x1 = relay.var("x1", relay.ty.TensorType((1,), "int32"))
-    x2 = relay.var("x2", relay.ty.TensorType((1, relay.Any()), "int32"))
-    x3 = relay.var("x3", relay.ty.TensorType((), "int32"))
-    z = relay.vision.non_max_suppression(
-        x0,
-        x1,
-        x2,
-        x3,
-        iou_threshold=0.5,
-        force_suppress=True,
-        top_k=2,
-        return_indices=True,
-        invalid_to_bottom=False,
-    )
-    z = z.astuple()
-    func = relay.Function([x0, x1, x2, x3], z)
-    mod = tvm.IRModule()
-    mod["main"] = func
-
-    np_data = np.array(
-        [
-            [
-                [0, 0.8, 1, 20, 25, 45],
-                [1, 0.7, 30, 60, 50, 80],
-                [0, 0.4, 4, 21, 19, 40],
-                [2, 0.9, 35, 61, 52, 79],
-                [1, 0.5, 100, 60, 70, 110],
-            ]
-        ]
-    ).astype("float32")
-    np_valid_count = np.array([4]).astype("int32")
-    np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32")
-    np_max_output_size = -1
-    np_indices_result = np.array([[4, 0, -1, -1, -1]])
-    np_valid_box_count = np.array([[2]]).astype("int32")
-
-    check_result(
-        [np_data, np_valid_count, np_indices, np_max_output_size],
-        mod,
-        [np_indices_result, np_valid_box_count],
-        only_vm=False,
-    )
-
-    np_data = np.zeros((1, 0, 6)).astype("float32")
-    np_valid_count = np.array([0]).astype("int32")
-    np_indices = np.zeros((1, 0)).astype("int32")
-    np_max_output_size = -1
-    np_indices_result = np.zeros((1, 0))
-    np_valid_box_count = np.array([[0]]).astype("int32")
-
-    check_result(
-        [np_data, np_valid_count, np_indices, np_max_output_size],
-        mod,
-        [np_indices_result, np_valid_box_count],
-        only_vm=False,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_all_class_non_max_suppression():
-    def verify_all_class_non_max_suppression(
-        boxes_np,
-        scores_np,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        expected,
-        output_format="onnx",
-    ):
-        batch_size = boxes_np.shape[0]
-        num_classes = scores_np.shape[1]
-        num_boxes = relay.Any()
-        boxes = relay.var("boxes", relay.ty.TensorType((batch_size, num_boxes, 4), "float32"))
-        scores = relay.var(
-            "scores", relay.ty.TensorType((batch_size, num_classes, num_boxes), "float32")
-        )
-
-        nms_out = relay.vision.all_class_non_max_suppression(
-            boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, output_format
-        )
-
-        if output_format == "onnx":
-            three = relay.const(np.array([3]), dtype="int64")
-            begin = relay.const(np.array([0, 0]), dtype="int64")
-            end = relay.op.concatenate([nms_out[1], three], axis=0)
-            strides = relay.const(np.array([1, 1]), dtype="int64")
-            out = relay.op.strided_slice(nms_out[0], begin, end, strides)
-            mod = tvm.IRModule()
-            mod["main"] = relay.Function([boxes, scores], out)
-            check_result([boxes_np, scores_np], mod, [expected])
-        else:
-            out = nms_out.tuple_value
-            mod = tvm.IRModule()
-            mod["main"] = relay.Function([boxes, scores], out)
-            check_result([boxes_np, scores_np], mod, expected)
-
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 0.3, 0.3],
-                [0.5, 0.5, 0.4, 0.4],
-                [0.0, 0.0, 0.5, 0.5],
-                [0.5, 0.5, 0.9, 0.9],
-                [0.5, 0.5, 1.0, 1.0],
-            ],
-        ]
-    ).astype("float32")
-
-    scores = np.array(
-        [
-            [[0.1, 0.2, 0.6, 0.3, 0.9], [0.8, 0.2, 0.6, 0.3, 0.9]],
-        ]
-    ).astype("float32")
-
-    max_output_boxes_per_class = 2
-    iou_threshold = 0.8
-    score_threshold = 0.4
-
-    expected = np.array([[0, 0, 4], [0, 0, 2], [0, 1, 4], [0, 1, 0]])
-
-    verify_all_class_non_max_suppression(
-        boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected
-    )
-
-    expected = [
-        np.array(
-            [[[0, 4], [0, 2], [1, 4], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]]
-        ),
-        np.array(
-            [
-                [
-                    0.9,
-                    0.6,
-                    0.9,
-                    0.8,
-                    0.0,
-                    0.0,
-                    0.0,
-                    0.0,
-                    0.0,
-                    0.0,
-                ]
-            ]
-        ),
-        np.array([4]),
-    ]
-
-    verify_all_class_non_max_suppression(
-        boxes,
-        scores,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        expected,
-        output_format="tensorflow",
-    )
-
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 1.0, 1.0],
-                [0.0, 0.1, 0.9, 1.2],
-            ]
-        ]
-    ).astype(np.float32)
-    scores = np.array([[[0.2, 0.3], [0.3, 0.2]]]).astype(np.float32)
-    iou_threshold = 0.3
-    score_threshold = 0.15
-
-    expected = np.array([[0, 0, 1], [0, 1, 0]])
-
-    verify_all_class_non_max_suppression(
-        boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected
-    )
-
-    # zero box detection case
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 1.0, 1.0],
-            ]
-        ]
-    ).astype(np.float32)
-    scores = np.array([[[0.2]]]).astype(np.float32)
-    score_threshold = 0.4
-
-    expected = np.zeros((0, 3))
-
-    verify_all_class_non_max_suppression(
-        boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected
-    )
-
-
-@tvm.testing.uses_gpu
-def test_gather_nd():
-    def verify_gather_nd(data_shape, indices_shape, data_shape_np, indices_shape_np, batch_dims=0):
-        x = relay.var("x", relay.TensorType(data_shape, "float32"))
-        y = relay.var("y", relay.TensorType(indices_shape, "int32"))
-        z = relay.gather_nd(x, y, batch_dims=batch_dims, index_rank=indices_shape[0])
-
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([x, y], z)
-
-        data_np = np.random.uniform(size=data_shape_np).astype("float32")
-        indices_np = np.random.randint(low=0, high=2, size=indices_shape_np, dtype="int32")
-
-        ref_res = ref_funcs.gather_nd(data_np, indices_np, batch_dims)
-        check_result([data_np, indices_np], mod, [ref_res])
-
-    verify_gather_nd((2, 2), (2, relay.Any()), (2, 2), (2, 3))
-    verify_gather_nd((relay.Any(), 2), (2, relay.Any()), (2, 2), (2, 3))
-    verify_gather_nd((relay.Any(), 2), (1, relay.Any()), (10, 2), (1, 10), 1)
-    verify_gather_nd(
-        (relay.Any(), 2, 2, 3, 4), (3, relay.Any(), relay.Any()), (3, 2, 2, 3, 4), (3, 3, 2), 2
-    )
-
-
-@tvm.testing.uses_gpu
-def test_scatter_nd():
-    def verify_scatter_nd(data_np, indices_np, updates_np, ref_res):
-        indices_shape = (2, relay.Any())
-        updates_shape = (relay.Any(),)
-        data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
-        indices = relay.var("indices", relay.TensorType(indices_shape, str(indices_np.dtype)))
-        updates = relay.var("updates", relay.TensorType(updates_shape, str(updates_np.dtype)))
-
-        out = relay.op.scatter_nd(data, indices, updates, "add")
-
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([data, indices, updates], out)
-
-        check_result([data_np, indices_np, updates_np], mod, [ref_res])
-
-    data = np.zeros((2, 2)).astype("int64")
-    indices = np.array([[1, 1, 0], [0, 1, 0]])
-    updates = np.array([2, 3, 0])
-    out = np.array([[0, 0], [2, 3]])
-    verify_scatter_nd(data, indices, updates, out)
-
-
-@tvm.testing.uses_gpu
-def test_scatter_nd_any_updates():
-    def verify_scatter_nd_any_updates(data_np, indices_np, updates_np, ref_res):
-        indices_shape = (2, relay.Any())
-        updates_shape = (2, relay.Any())
-        data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
-        indices = relay.var("indices", relay.TensorType(indices_shape, str(indices_np.dtype)))
-        updates = relay.var("updates", relay.TensorType(updates_shape, str(updates_np.dtype)))
-
-        out = relay.op.scatter_nd(data, indices, updates, "add")
-
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([data, indices, updates], out)
-
-        check_result([data_np, indices_np, updates_np], mod, [ref_res], only_vm=True)
-
-    data = np.zeros((3, 3)).astype("int64")
-    indices = np.array([[1, 1], [0, 1]])
-    updates = np.array([[2, 2], [1, 1]])
-    out = np.array([[0, 0, 0], [0, 0, 0], [2, 2, 1]])
-    verify_scatter_nd_any_updates(data, indices, updates, out)
-
-
-@tvm.testing.uses_gpu
-def test_gather():
-    def verify_gather(data_shape, indices_shape, data_shape_np, indices_shape_np, axis):
-        x = relay.var("x", relay.TensorType(data_shape, "float32"))
-        y = relay.var("y", relay.TensorType(indices_shape, "int32"))
-        z = relay.gather(x, axis, y)
-
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([x, y], z)
-
-        data_np = np.random.uniform(size=data_shape_np).astype("float32")
-        indices_np = np.random.randint(low=0, high=2, size=indices_shape_np, dtype="int32")
-
-        ref_res = tvm.topi.testing.gather_python(data_np, axis, indices_np)
-        check_result([data_np, indices_np], mod, [ref_res])
-
-    verify_gather((relay.Any(),), (relay.Any(),), (10,), (10,), 0)
-    verify_gather((2, 2), (2, relay.Any()), (2, 2), (2, 3), 1)
-    verify_gather((relay.Any(), 2), (2, relay.Any()), (2, 2), (2, 3), 1)
-    verify_gather((relay.Any(), relay.Any()), (relay.Any(), relay.Any()), (2, 3), (1, 3), 0)
-
-
-@tvm.testing.uses_gpu
-def test_searchsorted():
-    def verify_searchsorted(
-        sorted_sequence_shape, values_shape, sorted_sequence_shape_np, values_shape_np
-    ):
-        x = relay.var("x", relay.TensorType(sorted_sequence_shape, "float32"))
-        y = relay.var("y", relay.TensorType(values_shape, "float32"))
-        z = relay.searchsorted(x, y)
-
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([x, y], z)
-
-        x_np = np.sort(np.random.uniform(size=sorted_sequence_shape_np).astype("float32"), axis=-1)
-        y_np = np.random.uniform(size=values_shape_np).astype("float32")
-
-        ref_res = searchsorted_ref(x_np, y_np, False, "int32")
-        check_result([x_np, y_np], mod, [ref_res])
-
-    for shape_np, values_shape_np in zip([(8, 9, 10), (10,), (11,)], [(8, 9, 20), (5,), (8, 9, 7)]):
-        sorted_sequence_shape = (relay.Any(),) * len(shape_np)
-        values_shape = (relay.Any(),) * len(values_shape_np)
-
-        verify_searchsorted(
-            sorted_sequence_shape,
-            values_shape,
-            shape_np,
-            values_shape_np,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py
deleted file mode 100644
index 54099e45a769..000000000000
--- a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test layout rewrite support for whole neural networks"""
-import sys
-import tempfile
-import pytest
-
-import numpy as np
-
-import tvm
-from tvm import relay, auto_scheduler
-from tvm.contrib import graph_executor
-import tvm.testing
-
-
-def get_np_array(var, dtype):
-    return np.random.randn(*[int(x) for x in var.type_annotation.shape]).astype(dtype)
-
-
-def get_relay_conv2d(
-    outc=32,
-    inc=32,
-    height=14,
-    width=14,
-    kh=3,
-    kw=3,
-    batch=1,
-    pad=0,
-    stride=1,
-    dilation=1,
-    layout="NHWC",
-):
-    dtype = "float32"
-    if layout == "NHWC":
-        kernel_layout = "HWIO"
-        d = relay.var("data", shape=(batch, height, width, inc), dtype=dtype)
-        w = relay.var("weight", shape=(kh, kw, inc, outc), dtype=dtype)
-    elif layout == "NCHW":
-        kernel_layout = "OIHW"
-        d = relay.var("data", shape=(batch, inc, height, width), dtype=dtype)
-        w = relay.var("weight", shape=(outc, inc, kh, kw), dtype=dtype)
-
-    y = relay.nn.conv2d(
-        d,
-        w,
-        padding=pad,
-        kernel_size=(kh, kw),
-        strides=(stride, stride),
-        dilation=(dilation, dilation),
-        channels=outc,
-        groups=1,
-        data_layout=layout,
-        kernel_layout=kernel_layout,
-    )
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([d, w], y)
-    data, weight = get_np_array(d, dtype), get_np_array(w, dtype)
-    return mod, data, weight
-
-
-def get_relay_conv3d(
-    outc=8,
-    inc=8,
-    depth=8,
-    height=7,
-    width=7,
-    kd=1,
-    kh=1,
-    kw=1,
-    batch=1,
-    pad=0,
-    stride=1,
-    dilation=1,
-    layout="NDHWC",
-):
-    dtype = "float32"
-    if layout == "NDHWC":
-        kernel_layout = "DHWIO"
-        d = relay.var("data", shape=(batch, depth, height, width, inc), dtype=dtype)
-        w = relay.var("weight", shape=(kd, kh, kw, inc, outc), dtype=dtype)
-    elif layout == "NCDHW":
-        kernel_layout = "OIDHW"
-        d = relay.var("data", shape=(batch, inc, depth, height, width), dtype=dtype)
-        w = relay.var("weight", shape=(outc, inc, kd, kh, kw), dtype=dtype)
-
-    y = relay.nn.conv3d(
-        d,
-        w,
-        padding=pad,
-        kernel_size=(kd, kh, kw),
-        strides=(stride, stride, stride),
-        dilation=(dilation, dilation, dilation),
-        channels=outc,
-        groups=1,
-        data_layout=layout,
-        kernel_layout=kernel_layout,
-    )
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([d, w], y)
-    data, weight = get_np_array(d, dtype), get_np_array(w, dtype)
-    return mod, data, weight
-
-
-def get_relay_dense(m=128, n=128, k=128):
-    dtype = "float32"
-    d = relay.var("data", shape=(m, k), dtype=dtype)
-    w = relay.var("weight", shape=(n, k), dtype=dtype)
-    y = relay.nn.dense(d, w)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([d, w], y)
-    data, weight = get_np_array(d, dtype), get_np_array(w, dtype)
-    return mod, data, weight
-
-
-def get_relay_batchmm(batch=4, m=128, n=128, k=128):
-    dtype = "float32"
-    d = relay.var("data", shape=(batch, m, k), dtype=dtype)
-    w = relay.var("weight", shape=(batch, n, k), dtype=dtype)
-    y = relay.nn.batch_matmul(d, w)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([d, w], y)
-    data, weight = get_np_array(d, dtype), get_np_array(w, dtype)
-    return mod, data, weight
-
-
-def tune_and_check(mod, data, weight, target, dev):
-    # Extract tasks from a relay program
-    tasks, task_weights = auto_scheduler.extract_tasks(
-        mod, target=target, params={"weight": weight}
-    )
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        # Tune tasks
-        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
-        tune_option = auto_scheduler.TuningOptions(
-            num_measure_trials=1,
-            num_measures_per_round=1,
-            builder=auto_scheduler.LocalBuilder(timeout=60),
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        )
-        tuner.tune(tune_option, search_policy="sketch.random")
-
-        # Compile
-        with auto_scheduler.ApplyHistoryBest(log_file):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_auto_scheduler": True},
-            ):
-                lib = relay.build(mod, target=target, params={"weight": weight})
-
-        # Compile without auto-scheduler for correctness check
-        with tvm.transform.PassContext(opt_level=0):
-            lib2 = relay.build(mod, target=target, params={"weight": weight})
-
-        def get_output(data, lib):
-            module = graph_executor.GraphModule(lib["default"](dev))
-            module.set_input("data", data)
-            module.run()
-
-            return module.get_output(0).numpy()
-
-        # Check correctness
-        actual_output = get_output(data, lib)
-        expected_output = get_output(data, lib2)
-
-        tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
-
-
-# layout rewriting only works on CPU targets
-@tvm.testing.parametrize_targets("llvm", "llvm -device=arm_cpu")
-def test_conv2d(target, dev):
-    mod, data, weight = get_relay_conv2d(kh=1, kw=1)
-    tune_and_check(mod, data, weight, target, dev)
-
-
-@tvm.testing.parametrize_targets("llvm", "llvm -device=arm_cpu")
-def test_conv2d_winograd(target, dev):
-    mod, data, weight = get_relay_conv2d(outc=128, kh=3, kw=3)
-    tune_and_check(mod, data, weight, target, dev)
-
-
-@tvm.testing.parametrize_targets("llvm", "llvm -device=arm_cpu")
-def test_conv3d(target, dev):
-    mod, data, weight = get_relay_conv3d()
-    tune_and_check(mod, data, weight, target, dev)
-
-
-@tvm.testing.parametrize_targets("llvm", "llvm -device=arm_cpu")
-def test_dense(target, dev):
-    mod, data, weight = get_relay_dense()
-    tune_and_check(mod, data, weight, target, dev)
-
-
-@tvm.testing.parametrize_targets("llvm", "llvm -device=arm_cpu")
-def test_batch_matmul(target, dev):
-    mod, data, weight = get_relay_batchmm()
-    tune_and_check(mod, data, weight, target, dev)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py
deleted file mode 100644
index 9dbc653da23f..000000000000
--- a/tests/python/relay/test_auto_scheduler_task_extraction.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test task extraction for auto-scheduler"""
-import json
-import tempfile
-
-import pytest
-import tvm.relay.testing
-import tvm.testing
-from tvm import _ffi as _ffi_api
-from tvm import auto_scheduler, relay
-
-
-def get_network(name, batch_size=1, layout="NHWC"):
-    """Get the symbol definition and random weight of a network"""
-
-    # auto-scheduler prefer NHWC layout
-    if layout == "NHWC":
-        image_shape = (224, 224, 3)
-    elif layout == "NCHW":
-        image_shape = (3, 224, 224)
-    elif layout == "NCDHW":
-        image_shape = (3, 16, 224, 224)
-    elif layout == "NDHWC":
-        image_shape = (3, 224, 224, 16)
-    else:
-        raise ValueError("Invalid layout: " + layout)
-
-    if name == "resnet-18":
-        mod, params = relay.testing.resnet.get_workload(
-            num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape
-        )
-    elif name == "resnet-50":
-        mod, params = relay.testing.resnet.get_workload(
-            num_layers=50, batch_size=batch_size, layout=layout, image_shape=image_shape
-        )
-    elif name == "winograd-test":
-        input_shape = [1, 23, 40, 32]
-
-        data = relay.var("data", shape=input_shape, dtype="float32")
-        net = relay.testing.layers.conv2d(
-            data=data,
-            channels=128,
-            kernel_size=3,
-            strides=1,
-            padding=1,
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            name="",
-        )
-        bias = relay.var("conv1_bias")
-        net = relay.nn.bias_add(net, bias, 3)
-        net = relay.nn.relu(net)
-        mod, params = relay.testing.create_workload(net)
-    elif name == "resnet3d-18":
-        mod, params = relay.testing.resnet_3d.get_workload(
-            num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape
-        )
-    elif name == "mobilenet":
-        mod, params = relay.testing.mobilenet.get_workload(
-            batch_size=batch_size, layout=layout, image_shape=image_shape
-        )
-    elif name == "resnet3d-18":
-        mod, params = relay.testing.resnet_3d.get_workload(
-            num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape
-        )
-    elif name == "dcgan":
-        mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size, layout=layout)
-    elif name == "mlp":
-        data = relay.var("data", shape=(batch_size, 32))
-        fc1 = relay.nn.dense(data, relay.var("fc1_weight"), units=32)
-        fc1 = relay.nn.bias_add(fc1, relay.var("fc1_bias"), axis=-1)
-        act1 = relay.nn.relu(fc1)
-        fc2 = relay.nn.dense(act1, relay.var("fc2_weight"), units=32)
-        fc2 = relay.nn.bias_add(fc2, relay.var("fc2_bias"), axis=-1)
-        act2 = relay.nn.relu(fc2)
-        mlp = act2
-        args = relay.analysis.free_vars(act2)
-        mlp = relay.Function(args, mlp)
-        mod, params = relay.testing.init.create_workload(mlp)
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return mod, params
-
-
-@tvm.testing.requires_cuda
-@pytest.mark.parametrize(
-    "params",
-    [
-        ("mlp", "NHWC", 1, 2),
-        ("resnet-18", "NHWC", 24, 25),
-        ("resnet-18", "NCHW", 24, 25),
-        ("mobilenet", "NHWC", 22, 30),
-        ("mobilenet", "NCHW", 22, 30),
-        ("resnet3d-18", "NCDHW", 23, 24),
-        ("resnet3d-18", "NDHWC", 23, 24),
-    ],
-)
-def test_task_extraction_cuda(params):
-    target = tvm.target.Target("cuda")
-    network, layout, expected_task, expected_weights = params
-
-    mod, params = get_network(network, layout=layout)
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
-    for task, weight in zip(tasks, task_weights):
-        print(task.desc, task.workload_key, weight)
-
-    assert len(tasks) == expected_task
-    assert sum(task_weights) == expected_weights
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        # Relay FuseOps puts two conv2ds to separate functions and results in two tasks.
-        ("basic_func", 2, False),
-        # Relay FuseOps will not break the primitive function and result in one task.
-        ("fused_func", 1, False),
-        # The Relay function without complex ops will not form a task by default.
-        ("simple_func", 0, False),
-        # Every Relay function becomes a task regardless what ops in its body.
-        ("simple_func", 1, True),
-        # The Relay function without any reduce op is considered as a simple task.
-        ("shape_of_func", 0, False),
-        ("shape_of_func", 1, True),
-        # The Relay function with dynamic shape inputs/outputs will not be extracted.
-        ("dyn_shape_func", 0, False),
-        # The Conv2D in the Relay function with control flow could still be a task.
-        # Also, two identical Conv2D should only be one task with weight=2.
-        ("control_flow_func", 1, False),
-        # The first function with unsupported op (NMS) will not be extracted.
-        ("func_w_unsupported_op", 1, True),
-    ],
-)
-def test_task_extraction_cpu(params):
-    ishape = (1, 3, 224, 224)
-    w1shape = (32, 3, 3, 3)
-    w2shape = (32, 32, 3, 3)
-    dtype = "float32"
-    target = tvm.target.Target("llvm")
-
-    def get_func():
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
-        weight2 = relay.var("weight2", shape=(w2shape), dtype=dtype)
-
-        conv2d = relay.nn.conv2d(data, weight1, kernel_size=(3, 3), padding=(1, 1))
-        relu = relay.nn.relu(conv2d)
-        conv2d = relay.nn.conv2d(relu, weight2, kernel_size=(3, 3), padding=(1, 1))
-        out = relay.nn.relu(conv2d)
-        return relay.Function([data, weight1, weight2], out)
-
-    def get_fused_func():
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
-        weight2 = relay.var("weight2", shape=(w2shape), dtype=dtype)
-
-        fused_func = get_func()
-
-        # Set to primitive to keep fuse_ops untouch.
-        fused_func = fused_func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        call = relay.Call(fused_func, [data, weight1, weight2])
-        return relay.Function([data, weight1, weight2], call)
-
-    def get_simple_func():
-        data = relay.var("data", relay.TensorType((1, 2, 3), "float32"))
-        out = relay.image.affine_grid(data, (150, 150))
-        return relay.Function([data], out)
-
-    def get_shape_of_func():
-        data = relay.var("data", shape=(relay.Any(), 28, 28), dtype="float32")
-        out = relay.shape_of(data)
-        return relay.Function([data], out)
-
-    def get_func_with_dynamic_shape():
-        data = relay.var("data", shape=(relay.Any(), 32), dtype="float32")
-        out = relay.max(data)
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def get_func_with_control_flow():
-        data = relay.var("data", shape=(1, 3, 224, 224))
-        weight = relay.var("weight", shape=(3, 3, 3, 3))
-        eq1 = relay.var("e1", shape=[], dtype="float32")
-        eq2 = relay.var("e2", shape=[], dtype="float32")
-        eq = relay.equal(eq1, eq2)
-
-        true_branch = relay.zeros(shape=(1, 3, 224, 224), dtype="float32")
-        false_branch = relay.nn.conv2d(data, weight, kernel_size=(3, 3), channels=3, padding=(1, 1))
-        false_branch = relay.nn.conv2d(
-            false_branch, weight, kernel_size=(3, 3), channels=3, padding=(1, 1)
-        )
-        ife = relay.If(eq, true_branch, false_branch)
-        out = relay.erf(ife)
-        return relay.Function([data, weight, eq1, eq2], out)
-
-    def get_func_with_unsupported_op():
-        def get_postproc_func():
-            data = relay.var("data", shape=((1, 3, 6)), dtype=dtype)
-            out = relay.nn.relu(data)
-            func = relay.Function([data], out)
-            func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-            return func
-
-        cls_prob = relay.var("cls_prob", relay.ty.TensorType((1, 3, 3), "float32"))
-        loc_pred = relay.var("loc_pred", relay.ty.TensorType((1, 3 * 4), "float32"))
-        anchors = relay.var("anchors", relay.ty.TensorType((1, 3, 4), "float32"))
-
-        mtl = relay.vision.multibox_transform_loc(
-            cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors
-        )
-        nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False)
-        out = relay.Call(get_postproc_func(), [nms])
-        return relay.Function([cls_prob, loc_pred, anchors], out)
-
-    func_map = {
-        "basic_func": get_func,
-        "fused_func": get_fused_func,
-        "simple_func": get_simple_func,
-        "shape_of_func": get_shape_of_func,
-        "dyn_shape_func": get_func_with_dynamic_shape,
-        "control_flow_func": get_func_with_control_flow,
-        "func_w_unsupported_op": get_func_with_unsupported_op,
-    }
-
-    def verify_task_extraction(func_name, expected_task, include_simple_tasks=False):
-        func = func_map[func_name]()
-        mod = tvm.IRModule.from_expr(func)
-        tasks, task_weights = auto_scheduler.extract_tasks(
-            mod["main"], None, target, include_simple_tasks=include_simple_tasks
-        )
-
-        assert len(tasks) == expected_task
-        assert len(task_weights) == expected_task
-
-    verify_task_extraction(*params)
-
-
-def test_dump_workload_to_dag_extract_tasks():
-    mod, _ = get_network("mobilenet", layout="NHWC")
-    with tempfile.NamedTemporaryFile() as f:
-        tasks, _ = auto_scheduler.extract_tasks(
-            mod["main"], None, "llvm", include_simple_tasks=True, dump_workload_to_dag_log=f.name
-        )
-        expected = {task.workload_key: str(task.compute_dag) for task in tasks}
-        actual = json.load(f)
-        assert expected == actual
-
-
-def test_custom_hash_func_extract_tasks():
-    @_ffi_api.register_func("auto_scheduler.compute_dag.hash_func")
-    def counting_unique_hash(str_dag):
-        ret = counting_unique_hash.i
-        counting_unique_hash.i += 1
-        return ret
-
-    counting_unique_hash.i = 0
-
-    mod, _ = get_network("mobilenet", layout="NHWC")
-    tasks, _ = auto_scheduler.extract_tasks(mod["main"], None, "llvm", include_simple_tasks=True)
-
-    hash_values = []
-    for task in tasks:
-        # task.workload_key should look like
-        # [43, [3, 3, 1024, 1], [1024], [3, 3, 1024, 1]] where the first int is the result of the hash
-        # Extract the hash and keep track of every hash
-        hash_value = int(task.workload_key[1:].split(",")[0])
-        hash_values.append(hash_value)
-
-    # All values are unique, and we know the min and max
-    # This is a sufficient condition to know that hashes in hash_values are an increasing list
-    # of hashes up to counting_unique_hash.i - 1
-    assert len(hash_values) == len(set(hash_values))
-    assert min(hash_values) == 0
-    assert max(hash_values) == counting_unique_hash.i - 1
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py
deleted file mode 100644
index e2f754aaf4e0..000000000000
--- a/tests/python/relay/test_auto_scheduler_tuning.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test end-to-end network tuning with auto-scheduler"""
-import tempfile
-
-import numpy as np
-import pytest
-
-from tvm import auto_scheduler, relay
-from tvm.contrib import graph_executor
-import tvm.testing
-
-from test_auto_scheduler_task_extraction import get_network
-
-
-network = tvm.testing.parameter(
-    "mlp",
-    pytest.param("winograd-test", marks=pytest.mark.xfail(reason="Flaky unit test")),
-)
-
-
-@tvm.testing.requires_cuda
-def test_tuning_cuda(network):
-    target = "cuda"
-
-    # Extract tasks
-    mod, params = get_network(network)
-    target = tvm.target.Target(target)
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        # Tuning
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60, device=0)
-        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
-        tune_option = auto_scheduler.TuningOptions(
-            num_measure_trials=100,
-            num_measures_per_round=2,
-            early_stopping=1,
-            runner=measure_ctx.runner,
-            builder=auto_scheduler.LocalBuilder(timeout=60),
-            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        )
-        tuner.tune(tune_option, search_policy="sketch.random")
-        del measure_ctx
-
-        # Compile with the history best
-        with auto_scheduler.ApplyHistoryBest(log_file):
-            with tvm.transform.PassContext(
-                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
-            ):
-                lib = relay.build(mod, target=target, params=params)
-
-        # Also test that multiple log files can be loaded.
-        with auto_scheduler.ApplyHistoryBest([log_file, log_file]) as best:
-            assert isinstance(
-                best, auto_scheduler.dispatcher.ApplyHistoryBest
-            ), "Unable to load multiple log files jointly."
-
-        # Confirm iterables can be directly loaded.
-        loaded_recs = auto_scheduler.dispatcher.load_records(log_file)
-        with auto_scheduler.ApplyHistoryBest(iter(loaded_recs)) as best:
-            assert isinstance(
-                best, auto_scheduler.dispatcher.ApplyHistoryBest
-            ), "Unable to ingest logs from an interator."
-
-        # Sample a schedule when missing
-        with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2):
-            with tvm.transform.PassContext(
-                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
-            ):
-                lib2 = relay.build(mod, target=target, params=params)
-
-        # Compile without auto-scheduler and any other optimization for correctness check
-        with tvm.transform.PassContext(opt_level=0):
-            ref_lib = relay.build(mod, target=target, params=params)
-
-        # Check the correctness
-        def get_output(data, lib):
-            dev = tvm.cuda()
-            module = graph_executor.GraphModule(lib["default"](dev))
-            module.set_input("data", data)
-            module.run()
-            return module.get_output(0).numpy()
-
-        np.random.seed(0)
-        if network == "mlp":
-            data = np.random.uniform(size=(1, 32))
-        elif network == "winograd-test":
-            data = np.random.uniform(size=(1, 23, 40, 32))
-        else:
-            raise ValueError("Unknown network: " + network)
-
-        actual_output1 = get_output(data, lib)
-        actual_output2 = get_output(data, lib2)
-        expected_output = get_output(data, ref_lib)
-
-        tvm.testing.assert_allclose(actual_output1, expected_output, rtol=1e-4, atol=1e-4)
-        tvm.testing.assert_allclose(actual_output2, expected_output, rtol=1e-4, atol=1e-4)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py
deleted file mode 100644
index b2d0bcedf9e1..000000000000
--- a/tests/python/relay/test_autotvm_task_extraction.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test task extraction for autotvm"""
-import tvm.relay.testing
-from tvm import relay
-from tvm import autotvm
-
-
-def get_network(name, batch_size):
-    """Get the symbol definition and random weight of a network"""
-    input_shape = (batch_size, 3, 224, 224)
-
-    if name == "resnet-18":
-        mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
-    elif name == "resnet3d-18":
-        mod, params = relay.testing.resnet_3d.get_workload(num_layers=18, batch_size=batch_size)
-    elif name == "mobilenet":
-        mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name == "dcgan":
-        mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size)
-        input_shape = (batch_size, 100)
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return mod, params, input_shape
-
-
-@tvm.testing.requires_x86
-def test_task_extraction():
-    target = "llvm"
-    mod_list = []
-    params_list = []
-    conv2d = relay.op.get("nn.conv2d")
-    conv3d = relay.op.get("nn.conv3d")
-    conv2d_transpose = relay.op.get("nn.conv2d_transpose")
-    dense = relay.op.get("nn.dense")
-
-    mod, params, _ = get_network("resnet-18", batch_size=1)
-    tasks = autotvm.task.extract_from_program(
-        mod["main"], target=target, params=params, ops=(conv2d,)
-    )
-    assert len(tasks) == 12
-    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(conv2d,))
-    assert len(tasks) == 12
-
-    mod, params, _ = get_network("resnet-18", batch_size=1)
-    tasks = autotvm.task.extract_from_program(
-        mod["main"], target=target, params=params, ops=(dense,)
-    )
-    assert len(tasks) == 2
-    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,))
-    assert len(tasks) == 2
-
-    mod, params, _ = get_network("resnet-18", batch_size=1)
-    mod_list.append(mod)
-    params_list.append(params)
-    tasks = autotvm.task.extract_from_program(
-        mod["main"], target=target, params=params, ops=(conv2d, dense)
-    )
-    assert len(tasks) == 14
-    tasks = autotvm.task.extract_from_program(
-        mod, target=target, params=params, ops=(conv2d, dense)
-    )
-    assert len(tasks) == 14
-    tasks = autotvm.task.extract_from_program(mod, target=target, params=params)
-    assert len(tasks) == 14
-
-    mod, params, _ = get_network("resnet3d-18", batch_size=1)
-    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(conv3d,))
-    assert len(tasks) == 12
-
-    mod, params, _ = get_network("mobilenet", batch_size=1)
-    mod_list.append(mod)
-    params_list.append(params)
-    tasks = autotvm.task.extract_from_program(
-        mod, target=target, params=params, ops=(conv2d, dense)
-    )
-    assert len(tasks) == 21
-
-    mod, params, _ = get_network("dcgan", batch_size=1)
-    tasks = autotvm.task.extract_from_program(
-        mod, target=target, params=params, ops=(conv2d_transpose,)
-    )
-    assert len(tasks) == 4
-
-    tasks = autotvm.task.extract_from_multiple_program(
-        mod_list, params_list, target=target, ops=(conv2d,)
-    )
-    assert len(tasks) == 31
-
-
-def test_task_extraction_for_dense_int8_cuda():
-    target = "cuda"
-    dense = relay.op.get("nn.dense")
-
-    def get_net(batch, in_dim, out_dim, dtype, out_dtype):
-        data = tvm.relay.var("data", shape=[batch, in_dim], dtype=dtype)
-        weight = tvm.relay.var("weight", shape=[out_dim, in_dim], dtype=dtype)
-        out = relay.nn.dense(data, weight, out_dtype=out_dtype)
-        mod, params = relay.testing.create_workload(out)
-        return mod, params
-
-    mod, params = get_net(1, 16, 32, "float32", "float32")
-    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,))
-    assert len(tasks) == 1 and tasks[0].name == "dense_small_batch.gpu"
-
-    mod, params = get_net(1, 16, 32, "int8", "int32")
-    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,))
-    assert len(tasks) == 1 and tasks[0].name == "dense_int8.cuda"
-
-
-if __name__ == "__main__":
-    test_task_extraction()
-    test_task_extraction_for_dense_int8_cuda()
diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py
deleted file mode 100644
index 133fcd191961..000000000000
--- a/tests/python/relay/test_backend_graph_executor.py
+++ /dev/null
@@ -1,536 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-from unittest.mock import patch
-
-import tvm
-import json
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay.op import add
-import tvm.testing
-from tvm.relay.testing import mlp
-from tvm import rpc
-from tvm.contrib import utils
-
-# @tq, @jr should we put this in testing ns?
-def check_rts(expr, args, expected_result, mod=None):
-    """
-    Check that evaluating `expr` applied to the arguments produces
-    `result` on both the evaluator and TVM runtime.
-
-    Parameters
-    ----------
-    expr:
-        The expression to evaluate
-
-    args: list of Expr
-        The arguments to supply the expr.
-
-    expected_result:
-        The expected result of running the expression.
-    """
-    eval_result = relay.create_executor("debug", mod=mod).evaluate(expr)(*args)
-    rts_result = relay.create_executor("graph", mod=mod).evaluate(expr)(*args)
-    tvm.testing.assert_allclose(eval_result.numpy(), rts_result.numpy())
-    tvm.testing.assert_allclose(eval_result.numpy(), expected_result)
-
-
-def test_add_op_scalar():
-    """
-    test_add_op_scalar:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    x = relay.var("x", shape=())  # Default to float32
-    y = relay.var("y", shape=())  # Default to float32
-    func = relay.Function([x, y], add(x, y))
-    x_y_data = [
-        (np.array(10.0, dtype="float32"), np.array(1.0, dtype="float32")),
-        (np.float32(10.0), np.float32(1.0)),
-        (10.0, 1.0),
-    ]
-    for (x_data, y_data) in x_y_data:
-        check_rts(func, [x_data, y_data], x_data + y_data)
-
-
-def test_add_op_scalar_int():
-    """
-    test_add_op_scalar_int:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    x = relay.var("x", shape=(), dtype="int32")
-    y = relay.var("y", shape=(), dtype="int32")
-    func = relay.Function([x, y], add(x, y))
-    x_y_data = [
-        (np.array(10.0, dtype="int32"), np.array(1.0, dtype="int32")),
-        (np.int32(10), np.int32(1)),
-        (10, 1),
-    ]
-    for (x_data, y_data) in x_y_data:
-        check_rts(func, [x_data, y_data], x_data + y_data)
-
-
-def test_add_op_tensor():
-    """
-    Program:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    x = relay.var("x", shape=(10, 5))
-    y = relay.var("y", shape=(10, 5))
-    func = relay.Function([x, y], add(x, y))
-    x_data = np.random.rand(10, 5).astype("float32")
-    y_data = np.random.rand(10, 5).astype("float32")
-    check_rts(func, [x_data, y_data], x_data + y_data)
-
-
-def test_add_op_broadcast():
-    """
-    Program:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    x = relay.var("x", shape=(10, 5))
-    y = relay.var("y", shape=(1, 5))
-    func = relay.Function([x, y], add(x, y))
-    x_data = np.random.rand(10, 5).astype("float32")
-    y_data = np.random.rand(1, 5).astype("float32")
-    check_rts(func, [x_data, y_data], x_data + y_data)
-
-
-def test_with_params():
-    x = relay.var("x", shape=(10, 5))
-    y = relay.var("y", shape=(1, 5))
-    z = relay.add(x, y)
-    z = relay.exp(z)
-    func = relay.Function([x, y], z)
-    x_data = np.random.rand(10, 5).astype("float32")
-    y_data = np.random.rand(1, 5).astype("float32")
-    params = {"y": y_data}
-    graph, lib, params = relay.build(tvm.IRModule.from_expr(func), "llvm", params=params)
-    mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-    mod.set_input(**params)
-    mod.set_input(x=x_data)
-    mod.run()
-    res = mod.get_output(0).numpy()
-    ref_res = np.exp(y_data + x_data)
-    tvm.testing.assert_allclose(res, ref_res, atol=1e-5, rtol=1e-5)
-
-
-def test_plan_memory():
-    # it is sufficient to cycle through two memories.
-
-    x = relay.var("x", shape=(10,))
-    y = relay.var("x", shape=(1,))
-    y2 = relay.exp(y)
-    z = relay.add(x, y2)
-    z = relay.exp(z)
-    z = relay.exp(z)
-    z = relay.exp(z)
-    z = relay.exp(z)
-    z = relay.exp(z)
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.transform.FuseOps(0)(mod)
-    func = mod["main"]
-    mod = relay.transform.InferType()(mod)
-    memory_plan = relay.backend._backend.GraphPlanMemory(func)
-    storage_ids = set()
-    device_types = set()
-    storage_sizes = {}
-
-    for k, v in memory_plan.expr_to_storage_info.items():
-        for x in v.storage_ids:
-            storage_ids.add(x)
-            storage_sizes[x] = v.storage_sizes
-        for x in v.device_types:
-            device_types.add(x)
-
-    # Current rule requires vars have unique storage id
-    # because we don't do inplace, we will need another
-    # two alternating temporary space.
-    assert len(storage_ids) == 4, f"found storage_ids: {storage_ids}"
-    assert len(device_types) == 1
-    assert len(storage_sizes) == 4
-
-    # Check the specific size of each sid
-    assert (
-        storage_sizes[0][0] == 40
-        and storage_sizes[1][0] == 4
-        and storage_sizes[2][0] == 40
-        and storage_sizes[3][0] == 40
-    )
-
-
-def test_plan_2d_memory():
-    """Verification if GraphPlanMemory manages 2d memory reffered as
-    global.texture* memory scopes in json file."""
-    global_virtual_device = tvm.target.VirtualDevice(memory_scope="global")
-    texture_virtual_device = tvm.target.VirtualDevice(memory_scope="global.texture")
-    metatable = {
-        "VirtualDevice": [
-            global_virtual_device,
-            texture_virtual_device,
-        ]
-    }
-
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data1: Tensor[(1, 32, 40, 40), float32],
-                  %data2: Tensor[(1, 32, 40, 40), float32]) {
-          %0 = fn (%a, Primitive=1) {
-            layout_transform(%a, src_layout="NCHW", dst_layout="NCHW4c")
-          };
-          %1 = %0(%data1);
-          %3 = %0(%data2);
-          %5 = fn (%a {virtual_device=meta[VirtualDevice][0]},  // global
-                   %b {virtual_device=meta[VirtualDevice][0]},  // global
-                   virtual_device=meta[VirtualDevice][1],       // texture
-                   Primitive=1) {
-            add(%a, %b)
-          };
-          %6 = %5(%1, %3);
-          %7 = fn (%a {virtual_device=meta[VirtualDevice][1]},  // texture
-                   %b {virtual_device=meta[VirtualDevice][0]},  // global
-                   virtual_device=meta[VirtualDevice][1],       // texture
-                   Primitive=1) {
-            add(%a, %b)
-          };
-          %8 = %7(%6, %3);
-          %9 = fn (%a {virtual_device=meta[VirtualDevice][1]},  // texture
-                   %b {virtual_device=meta[VirtualDevice][1]},  // texture
-                   virtual_device=meta[VirtualDevice][1],       // texture
-                   Primitive=1) {
-            add(%a, %b)
-          };
-          %10 = %9(%8, %6);
-          %11 = fn (%a,
-                    virtual_device=meta[VirtualDevice][0],      // global
-                    Primitive=1) {
-            layout_transform(%a, src_layout="NCHW4c", dst_layout="NCHW")
-          };
-          %11(%10)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-    GPU_DEVICE = tvm.device("cuda")
-    HOST_TARGET = tvm.target.Target("llvm")
-    GPU_TARGET = tvm.target.Target("cuda").with_host(HOST_TARGET)
-    GPU = tvm.target.VirtualDevice(GPU_DEVICE, GPU_TARGET)  # device_type=2
-    CTXT = tvm.transform.PassContext(config={"relay.fallback_device_type": GPU.device_type_int})
-    config = tvm.target.make_compilation_config(CTXT, GPU_TARGET)
-    mod = relay.transform.InferType()(mod)
-    # PlanDevices should succeed.
-    mod = relay.transform.PlanDevices(config)(mod)
-
-    func = mod["main"]
-    memory_plan = relay.backend._backend.GraphPlanMemory(func)
-    virtual_devices = {}
-
-    # We do not have execution ordered information, the only order that we can stick
-    # in this place - storage_id
-    # for above graph we know that
-    # We have
-    #  - 8 manageable storages for above graph
-    #  - 5 of them are buffers
-    #  - 3 of them are textures (2d storages)
-    #  - 1 of buffer will be reused, since we have storage id maped data, we will have 4th
-    #      storage id reuesed and hidden in virtual_devices map
-    #  - no textures are reused so far
-    for k, v in memory_plan.expr_to_storage_info.items():
-        virtual_devices[v.storage_ids[0]] = v.virtual_devices[0].memory_scope
-
-    # Check the scopes according to abvoce expectaions
-    assert (
-        virtual_devices[0] == "global"
-        and virtual_devices[1] == "global"
-        and virtual_devices[2] == "global"
-        and virtual_devices[3] == "global"
-        and virtual_devices[4] == "global.texture"
-        and virtual_devices[5] == "global.texture"
-        and virtual_devices[6] == "global.texture"
-    )
-
-
-def test_reshape_nop():
-    # test that reshape can be turned into nop
-    x = relay.var("x", shape=(10, 4))
-    xx = relay.abs(x)
-    y = relay.expand_dims(xx, axis=1)
-    t0 = relay.reshape(y, (1, 40))
-    t1 = relay.abs(y)
-
-    z0 = relay.reshape(t0, (2, 20))
-    z1 = relay.sqrt(t1)
-    z2 = relay.reshape(t1, (1, 40))
-
-    func = relay.Function([x], relay.Tuple([z0, z1, z2]))
-    x_data = np.random.rand(10, 4).astype("float32")
-    graph = relay.build(tvm.IRModule.from_expr(func), "llvm")
-    graph_json_str = graph.get_graph_json()
-
-    graph_json = json.loads(graph_json_str)
-
-    # reshape must force sharing memory
-    storage_ids = graph_json["attrs"]["storage_id"][1]
-    assert tuple(storage_ids) == (0, 1, 1, 2, 3, 2)
-    assert graph_json["nodes"][2]["attrs"]["func_name"] == "__nop"
-    assert graph_json["nodes"][5]["attrs"]["func_name"] == "__nop"
-
-    gmod = graph_executor.GraphModule(graph["default"](tvm.cpu(0)))
-
-    gmod.set_input(x=x_data)
-    gmod.run()
-    z0_np = x_data.reshape(2, 20)
-    z1_np = np.sqrt(
-        np.abs(
-            x_data.reshape(
-                10,
-                1,
-                4,
-            )
-        )
-    )
-    z2_np = np.abs(x_data).reshape(1, 40)
-    tvm.testing.assert_allclose(gmod.get_output(0).numpy(), z0_np)
-    tvm.testing.assert_allclose(gmod.get_output(1).numpy(), z1_np)
-    tvm.testing.assert_allclose(gmod.get_output(2).numpy(), z2_np)
-
-
-@tvm.testing.uses_gpu
-def test_gru_like():
-    def unit(rnn_dim):
-        X = relay.var("X", shape=(1, rnn_dim))
-        W = relay.var("y", shape=(3 * rnn_dim, rnn_dim))
-        matmul = relay.nn.dense(X, W)
-        splitted = relay.split(matmul, indices_or_sections=3, axis=1)
-        out = relay.sigmoid(splitted[0]) + relay.tanh(splitted[1]) * relay.exp(splitted[2])
-        return relay.Function([X, W], out)
-
-    def sigmoid(x):
-        return 1 / (1 + np.exp(-x))
-
-    def unit_numpy(X, W):
-        prod = np.dot(X, W.transpose())
-        splits = np.split(prod, indices_or_sections=3, axis=1)
-        return sigmoid(splits[0]) + np.tanh(splits[1]) * np.exp(splits[2])
-
-    dtype = "float32"
-    rnn_dim = 1000
-    x = np.random.rand(1, rnn_dim).astype(dtype)
-    y = np.random.rand(3 * rnn_dim, rnn_dim).astype(dtype) * 0.01 - 0.005
-    out_shape = (1, rnn_dim)
-    z = unit(rnn_dim)
-
-    for target, dev in tvm.testing.enabled_targets():
-        with tvm.transform.PassContext(opt_level=2):
-            graph, lib, params = relay.build(tvm.IRModule.from_expr(z), target)
-            m = graph_executor.create(graph, lib, dev)
-            m.set_input("X", tvm.nd.array(x.astype(dtype)))
-            m.set_input("y", tvm.nd.array(y.astype(dtype)))
-            m.set_input(**params)
-            m.run()
-            out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).numpy()
-            ref = unit_numpy(x, y)
-            tvm.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
-
-
-def test_compile_nested_tuples():
-    x = relay.var("x", shape=(10,))
-    x1 = x + relay.const(1.0)
-    x2 = x1 + relay.const(1.0)
-    x3 = x2 + relay.const(1.0)
-    x4 = x3 + relay.const(1.0)
-    out = relay.Tuple([x1, relay.Tuple([relay.Tuple([x2, x3]), x4])])
-    func = relay.Function([x], out)
-
-    graph, lib, _ = relay.build(tvm.IRModule.from_expr(func), "llvm")
-    mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-
-    x_data = np.random.uniform(size=(10,)).astype(np.float32)
-    mod.set_input(x=x_data)
-    mod.run()
-
-    assert mod.get_num_outputs() == 4
-
-    ref = x_data + 1
-    for i in range(mod.get_num_outputs()):
-        out = mod.get_output(i).numpy()
-        tvm.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
-        ref = ref + 1
-
-
-def test_compile_return_empty_tuple():
-    x = relay.var("x", shape=[16], dtype="float32")
-    mod = tvm.IRModule.from_expr(relay.Function([x], relay.Tuple([])))
-    graph, lib, _ = relay.build(mod, "llvm")
-    mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-    mod.run()
-
-
-@tvm.testing.uses_gpu
-def test_compile_fused_identity_cast():
-    # a fused function that would optimized to identity
-    x = relay.var("x", shape=[16], dtype="float32")
-    y = relay.cast(x, "float32")
-    func1 = relay.Function([x], y).with_attr("Primitive", 1)
-
-    # a fused function with param pass-through
-    x = relay.var("x", shape=[16], dtype="float32")
-    y = relay.add(x, relay.const(3.14, "float32"))
-    func2 = relay.Function([x], relay.Tuple([x, y])).with_attr("Primitive", 1)
-
-    x_global = relay.var("xx", shape=[16], dtype="float32")
-    tup = func2(x_global)
-    y_global = func1(relay.TupleGetItem(tup, 0) + relay.TupleGetItem(tup, 1))
-
-    mod = tvm.IRModule.from_expr(relay.Function([x_global], y_global))
-    for target, device in tvm.testing.enabled_targets():
-        with tvm.transform.PassContext(opt_level=2):
-            graph, lib, _ = relay.build(mod, target=target)
-            executor = graph_executor.create(graph, lib, device=device)
-            executor.run()
-
-
-def test_graph_executor_nested_tuples():
-    x, y, z, w = [relay.var(c, shape=(2, 3), dtype="float32") for c in "xyzw"]
-    out = relay.Tuple([x, relay.Tuple([y, relay.Tuple([z, w])])])
-    func = relay.Function([x, y, z, w], out)
-
-    f = relay.create_executor(
-        kind="graph", mod=tvm.IRModule.from_expr(func), device=tvm.cpu(0), target="llvm"
-    ).evaluate()
-
-    data = [np.random.uniform(size=(2, 3)).astype("float32") for _ in "xyzw"]
-    out = f(*data)
-    assert len(out) == 2
-    tvm.testing.assert_allclose(out[0].numpy(), data[0])
-    assert len(out[1]) == 2
-    tvm.testing.assert_allclose(out[1][0].numpy(), data[1])
-    assert len(out[1][1]) == 2
-    tvm.testing.assert_allclose(out[1][1][0].numpy(), data[2])
-    tvm.testing.assert_allclose(out[1][1][1].numpy(), data[3])
-
-
-def test_graph_executor_api():
-    dname_0, dname_1 = "data_0", "data_1"
-    data_0, data_1 = [relay.var(c, shape=(1, 1), dtype="float32") for c in [dname_0, dname_1]]
-    net = relay.add(data_0, data_1)
-    func = relay.Function((data_0, data_1), net)
-
-    lib = relay.build(tvm.IRModule.from_expr(func), "llvm")
-    mod = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
-
-    assert mod.get_input_index(dname_1) == 1
-    assert mod.get_input_index(dname_0) == 0
-    assert mod.get_input_index("Invalid") == -1
-
-    shape_dict, dtype_dict = mod.get_input_info()
-    assert isinstance(shape_dict, tvm.container.Map)
-    assert isinstance(dtype_dict, tvm.container.Map)
-    for data in [data_0, data_1]:
-        name = data.name_hint
-        ty = data.type_annotation
-        # verify shape
-        assert name in shape_dict
-        assert isinstance(shape_dict[name], tvm.runtime.container.ShapeTuple)
-        assert shape_dict[name] == tvm.runtime.container.ShapeTuple([i.value for i in ty.shape])
-        # verify dtype
-        assert name in dtype_dict
-        assert isinstance(dtype_dict[name], tvm.runtime.container.String)
-        assert dtype_dict[name] == ty.dtype
-
-    shape_dict, dtype_dict = mod.get_output_info()
-    assert isinstance(shape_dict, tvm.container.Map)
-    assert isinstance(dtype_dict, tvm.container.Map)
-    for i, key in enumerate(shape_dict):
-        assert mod.get_output_index(key) == i
-
-
-@tvm.testing.requires_llvm
-def test_benchmark():
-    mod, params = mlp.get_workload(1)
-    lib = relay.build(mod, target="llvm", params=params)
-    exe = graph_executor.create(lib.get_graph_json(), lib.lib, tvm.cpu())
-    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
-    result = exe.benchmark(tvm.cpu(), data=data, func_name="run", repeat=2, number=1)
-    assert result.mean == result.median
-    assert result.mean > 0
-    assert len(result.results) == 2
-
-    with patch.object(
-        tvm.runtime.module.Module,
-        "time_evaluator",
-        return_value=lambda: tvm.runtime.module.BenchmarkResult([1, 2, 2, 5]),
-    ) as method:
-        result = exe.benchmark(tvm.cpu(), data=data, func_name="run", repeat=2, number=1)
-        assert result.mean == 2.5
-        assert result.median == 2.0
-        assert result.max == 5
-        assert result.min == 1
-        assert result.std == 1.5
-
-
-@tvm.testing.parametrize_targets("cuda", "llvm")
-def test_benchmark_end_to_end(dev, target):
-    mod, params = mlp.get_workload(1)
-    lib = relay.build(mod, target=target, params=params)
-    exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)
-    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
-    result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
-    assert result.mean > 0
-    assert len(result.results) == 2
-
-
-@tvm.testing.requires_cuda
-def test_benchmark_end_to_end_rpc():
-    server = rpc.Server("127.0.0.1")
-    remote = rpc.connect(server.host, server.port)
-
-    mod, params = mlp.get_workload(1)
-    lib = relay.build(mod, target="cuda", params=params)
-
-    temp = utils.tempdir()
-    path = temp.relpath("library.so")
-    lib.export_library(path)
-    remote.upload(path)
-    rlib = remote.load_module("library.so")
-
-    dev = remote.device("cuda")
-    exe = graph_executor.create(lib.get_graph_json(), rlib, dev)
-
-    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
-    result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
-    assert result.mean > 0
-    assert len(result.results) == 2
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py
deleted file mode 100644
index 3c94452311de..000000000000
--- a/tests/python/relay/test_backend_interpreter.py
+++ /dev/null
@@ -1,289 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-from tvm import testing
-from tvm import nd
-from tvm import relay
-from tvm.runtime import container
-from tvm.relay.backend.interpreter import RefValue, ConstructorValue
-from tvm.relay.scope_builder import ScopeBuilder
-
-
-def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
-    # TODO(tqchen) add more types once the schedule register is fixed.
-    for target in ["llvm"]:
-        dev = tvm.device(target, 0)
-        if not testing.device_enabled(target):
-            return
-        func = relay.create_executor(mod=mod, device=dev, target=target).evaluate(expr)
-        result = func if args is None else func(*args)
-        # use testing which also set atol
-        testing.assert_allclose(result.numpy(), expected_result, rtol=rtol)
-
-
-def test_tuple_value():
-    tv = container.tuple_object([relay.const(1), relay.const(2), relay.const(3)])
-    np.testing.assert_allclose(tv[0].data.numpy(), 1)
-    np.testing.assert_allclose(tv[1].data.numpy(), 2)
-    np.testing.assert_allclose(tv[2].data.numpy(), 3)
-
-
-def test_tuple_getitem():
-    two = relay.add(relay.const(1), relay.const(1))
-    func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0))
-    check_eval(func, [], 1)
-
-
-def test_id():
-    x = relay.var("x", "float32")
-    ident = relay.Function([x], x)
-    one = np.array(1.0, "float32")
-    check_eval(ident, [one], one)
-
-
-def test_add_const():
-    two = relay.add(relay.const(1), relay.const(1))
-    func = relay.Function([], two)
-    check_eval(func, [], 2)
-
-
-def test_mul_param():
-    x = relay.var("x", shape=(10, 10))
-    y = relay.var("y", shape=(1, 10))
-    func = relay.Function([x, y], relay.multiply(x, y))
-    x_data = np.random.rand(10, 10).astype("float32")
-    y_data = np.random.rand(1, 10).astype("float32")
-    check_eval(func, [x_data, y_data], x_data * y_data)
-
-
-def test_equal():
-    i = relay.var("i", shape=[], dtype="int32")
-    j = relay.var("i", shape=[], dtype="int32")
-    z = relay.equal(i, j)
-    func = relay.Function([i, j], z, ret_type=relay.TensorType([], "bool"))
-    i_data = relay.const(0, "int32")
-    j_data = relay.const(0, "int32")
-    check_eval(func, [i_data, j_data], True)
-
-
-def test_subtract():
-    i = relay.var("i", shape=[], dtype="int32")
-    sub = relay.subtract(i, relay.const(1, dtype="int32"))
-    func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32"))
-    i_data = np.array(1, dtype="int32")
-    check_eval(func, [i_data], 0)
-
-
-def test_simple_loop():
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    sb = ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))):
-        sb.ret(i)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, dtype="int32"))
-        rec_call = relay.Call(sum_up, [one_less])
-        sb.ret(relay.add(rec_call, i))
-    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-    mod[sum_up] = func
-    i_data = np.array(10, dtype="int32")
-    check_eval(sum_up, [i_data], sum(range(1, 11)), mod=mod)
-
-
-def test_loop():
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    accum = relay.var("accum", shape=[], dtype="int32")
-    sb = ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, "int32"))):
-        sb.ret(accum)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, "int32"))
-        new_accum = relay.add(accum, i)
-        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
-    func = relay.Function([i, accum], sb.get())
-    mod[sum_up] = func
-    i_data = np.array(10, dtype="int32")
-    accum_data = np.array(0, dtype="int32")
-    check_eval(sum_up, [i_data, accum_data], sum(range(1, 11)), mod=mod)
-
-
-def test_ref():
-    mod = tvm.IRModule()
-    three_with_ref = relay.GlobalVar("three_with_ref")
-    i = relay.Var("i")
-    iv = relay.Var("iv")
-    u = relay.Var("u")
-    uv = relay.Var("uv")
-    body = relay.add(iv, uv)
-    body = relay.Let(uv, relay.RefRead(i), body)
-    body = relay.Let(u, relay.RefWrite(i, relay.const(2)), body)
-    body = relay.Let(iv, relay.RefRead(i), body)
-    body = relay.Let(i, relay.RefCreate(relay.const(1)), body)
-    mod[three_with_ref] = relay.Function([], body)
-    check_eval(three_with_ref, [], 3, mod=mod)
-
-
-def test_binds():
-    x = relay.var("x")
-    y = relay.add(x, x)
-    xx = np.ones((10, 20))
-    res = relay.create_executor().evaluate(y, binds={x: xx}).numpy()
-    testing.assert_allclose(xx + xx, res)
-
-
-def test_kwargs_params():
-    x = relay.var("x", shape=(1, 10))
-    y = relay.var("y", shape=(1, 10))
-    z = relay.var("z", shape=(1, 10))
-    f = relay.Function([x, y, z], x + y + z)
-    x_data = np.random.rand(1, 10).astype("float32")
-    y_data = np.random.rand(1, 10).astype("float32")
-    z_data = np.random.rand(1, 10).astype("float32")
-    params = {"y": y_data, "z": z_data}
-    res = relay.create_executor().evaluate(f)(x_data, **params)
-    testing.assert_allclose(res.numpy(), x_data + y_data + z_data)
-
-
-def test_function_taking_adt_ref_tuple():
-    mod = tvm.IRModule()
-    prelude = relay.prelude.Prelude(mod)
-    _, cons, nil = prelude.mod.get_type("List")
-
-    nil_value = ConstructorValue(nil.tag, [], nil)
-    cons_value = ConstructorValue(
-        cons.tag,
-        [nd.array(np.random.rand(1, 10).astype("float32")), nil_value],
-        cons,
-    )
-
-    ref_value = RefValue(nd.array(np.random.rand(1, 10).astype("float32")))
-    tuple_value = container.tuple_object(
-        [nd.array(np.random.rand(1, 10).astype("float32")) for _ in range(10)]
-    )
-
-    id_func = relay.create_executor(mod=mod).evaluate(prelude.id)
-
-    res_nil = id_func(nil_value)
-    assert res_nil.tag == nil_value.tag
-    assert len(res_nil.fields) == 0
-
-    res_cons = id_func(cons_value)
-    assert res_cons.tag == cons_value.tag
-    assert len(res_cons.fields) == len(cons_value.fields)
-    testing.assert_allclose(res_cons.fields[0].numpy(), cons_value.fields[0].numpy())
-    assert isinstance(res_cons.fields[1], ConstructorValue)
-    assert res_cons.fields[1].tag == nil.tag
-    assert len(res_cons.fields[1].fields) == 0
-
-    res_ref = id_func(ref_value)
-    testing.assert_allclose(res_ref.value.numpy(), ref_value.value.numpy())
-
-    res_tuple = id_func(tuple_value)
-    for i in range(10):
-        testing.assert_allclose(res_tuple[i].numpy(), tuple_value[i].numpy())
-
-
-def test_tuple_passing():
-    x = relay.var(
-        "x",
-        type_annotation=relay.ty.TupleType(
-            [relay.ty.TensorType((), "int64"), relay.ty.TensorType((), "int64")]
-        ),
-    )
-
-    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
-    mod = tvm.IRModule({})
-    gv = relay.GlobalVar("main")
-    mod[gv] = fn
-    mod = relay.transform.InferType()(mod)
-
-    dev = tvm.cpu()
-    target = tvm.target.Target("llvm")
-    f = relay.create_executor(mod=mod, device=dev, target=target).evaluate(gv)
-    # First use a Python tuple.
-    out = f((10, 8))
-    testing.assert_allclose(out.numpy(), np.array(10))
-    # Second use a tuple value.
-    value_tuple = container.tuple_object([nd.array(np.array(11)), nd.array(np.array(12))])
-    out = f(value_tuple)
-    testing.assert_allclose(out.numpy(), np.array(11))
-
-
-def test_dynamic():
-    n = 3
-    m = 2
-    x = relay.Var("x", relay.TensorType([relay.Any(), m], "float32"))
-    y = relay.Var("y", relay.TensorType([relay.Any(), m], "float32"))
-    xx = x - relay.expr.const(3.0)
-    yy = y * relay.expr.const(5.0)
-    z = relay.op.concatenate([xx, yy], axis=0)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], z)
-    x_np = np.random.uniform(size=(n, m)).astype("float32")
-    y_np = np.random.uniform(size=(n, m)).astype("float32")
-    expected = np.concatenate([x_np - 3.0, y_np * 5.0], axis=0)
-    check_eval(None, [x_np, y_np], expected, mod)
-
-
-def test_ref_global_from_expr():
-    n = 3
-    x = relay.Var("x", relay.TensorType([n], "float32"))
-    y = relay.Var("y", relay.TensorType([n], "float32"))
-    mod = tvm.IRModule()
-    mod["add"] = relay.Function([x, y], relay.add(x, y))
-    x_np = np.random.uniform(size=(n,)).astype("float32")
-    y_np = np.random.uniform(size=(n,)).astype("float32")
-    expected = np.add(x_np, y_np)
-    expr = relay.Call(mod.get_global_var("add"), [relay.const(x_np), relay.const(y_np)])
-    check_eval(expr, None, expected, mod)
-
-
-def test_keyword_args():
-    n = 3
-    x = relay.Var("x", relay.TensorType([n], "float32"))
-    y = relay.Var("y", relay.TensorType([n], "float32"))
-    z = relay.add(x, y)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], z)
-    x_np = np.random.uniform(size=(n,)).astype("float32")
-    y_np = np.random.uniform(size=(n,)).astype("float32")
-    expected = np.add(x_np, y_np)
-    actual = relay.create_executor(mod=mod).evaluate()(y=y_np, x=x_np)
-    testing.assert_allclose(actual.numpy(), expected)
-
-
-# TODO(mbs): Support? Would help reduce wasted work when we need to prepare
-# multiple functions w.r.t. the same module.
-@pytest.mark.skip(reason="closures are currently not directly Python callable")
-def test_functional_returns():
-    n = 3
-    x = relay.Var("x", relay.TensorType([n], "float32"))
-    f = relay.Function([x], x)
-    t = relay.Tuple([f, f])
-    c = np.random.rand(n).astype("float32")
-    result1, result2 = relay.create_executor().evaluate(t)
-    testing.assert_allclose(result1(c).numpy(), c)
-    testing.assert_allclose(result2(c).numpy(), c)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_build_module.py b/tests/python/relay/test_build_module.py
deleted file mode 100644
index b1146743eeb8..000000000000
--- a/tests/python/relay/test_build_module.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.target.target import Target
-from tvm.relay import testing
-from tvm.relay.backend import Runtime, Executor, graph_executor_codegen
-
-
-@pytest.mark.parametrize(
-    "test_target,unsupported_config",
-    [
-        ["c", "-runtime=c"],
-        ["c", "-system-lib=1"],
-        ["c", "-executor=aot"],
-        ["c", "-interface-api=c"],
-        ["c", "-unpacked-api=1"],
-        ["c", "-link-params=1"],
-    ],
-)
-def test_deprecated_target_parameters(test_target, unsupported_config):
-    with pytest.raises(ValueError) as e_info:
-        Target(f"{test_target} {unsupported_config}")
-        assert f"Cannot recognize '{unsupported_config}" in str(e_info.execption)
-
-
-def test_build_relay_graph_():
-    """Test to build a simple relay graph by using APIs directly"""
-
-    def build_graph(mod, target):
-        target, target_host = tvm.target.Target.canon_target_and_host(target)
-        mod, _ = relay.optimize(mod, target)
-        grc = graph_executor_codegen.GraphExecutorCodegen(None, target)
-        _, lowered_funcs, _ = grc.codegen(mod, mod["main"])
-        _ = relay.backend._backend.build(lowered_funcs, target)
-
-    def add(shape, dtype):
-        lhs = relay.var("A", shape=shape, dtype=dtype)
-        rhs = relay.var("B", shape=shape, dtype=dtype)
-        out = relay.add(lhs, rhs)
-        expr = relay.Function((lhs, rhs), out)
-        mod = tvm.IRModule.from_expr(expr)
-        return mod
-
-    build_graph(add((1, 8), "float32"), tvm.target.Target("llvm"))
-
-
-@tvm.testing.requires_llvm
-def test_schedule_record():
-    """Test to build a nn model and get schedule_record from build_module"""
-
-    def check_schedule(executor):
-        for func_name, func_meta in executor.function_metadata.items():
-            # check converted op only
-            if "main" not in func_name:
-                primfunc = list(func_meta.relay_primfuncs.values())[0]
-                # make sure schedule is well-stored in function metadata
-                assert "schedule" in primfunc.attrs
-                sch = primfunc.attrs["schedule"]
-                assert len(sch.schedule_record) == len(sch.primitive_record)
-
-    relay_mod, params = testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-    target_llvm = tvm.target.Target("llvm")
-    config = {"te.keep_schedule_record": True}
-
-    with tvm.transform.PassContext(opt_level=3, config=config):
-        aot_executor_factory = relay.build(
-            relay_mod,
-            target_llvm,
-            runtime=Runtime("cpp"),
-            executor=Executor("aot"),
-            params=params,
-        )
-        graph_executor_factory = relay.build(
-            relay_mod,
-            target_llvm,
-            params=params,
-        )
-
-    check_schedule(aot_executor_factory)
-    check_schedule(graph_executor_factory)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_call_graph.py b/tests/python/relay/test_call_graph.py
deleted file mode 100644
index be4d52f8812a..000000000000
--- a/tests/python/relay/test_call_graph.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
-import pytest
-import tvm
-from tvm import relay
-
-
-def test_callgraph_construct():
-    mod = tvm.IRModule({})
-    x = relay.var("x", shape=(2, 3))
-    y = relay.var("y", shape=(2, 3))
-    mod["g1"] = relay.Function([x, y], x + y)
-    call_graph = relay.analysis.CallGraph(mod)
-    assert "g1" in str(call_graph)
-    tvm.ir.assert_structural_equal(mod, call_graph.module)
-
-
-def test_print_element():
-    mod = tvm.IRModule({})
-    x0 = relay.var("x0", shape=(2, 3))
-    y0 = relay.var("y0", shape=(2, 3))
-    mod["g0"] = relay.Function([x0, y0], x0 + y0)
-    x1 = relay.var("x1", shape=(2, 3))
-    y1 = relay.var("y1", shape=(2, 3))
-    mod["g1"] = relay.Function([x1, y1], x1 - y1)
-    call_graph = relay.analysis.CallGraph(mod)
-
-    assert "#refs = 0" in str(call_graph.print_var("g0"))
-    assert "#refs = 0" in str(call_graph.print_var("g1"))
-
-
-def test_global_call_count():
-    mod = tvm.IRModule({})
-    x0 = relay.var("x0", shape=(2, 3))
-    y0 = relay.var("y0", shape=(2, 3))
-    g0 = relay.GlobalVar("g0")
-    mod[g0] = relay.Function([x0, y0], x0 + y0)
-    x1 = relay.var("x1", shape=(2, 3))
-    y1 = relay.var("y1", shape=(2, 3))
-    g1 = relay.GlobalVar("g1")
-    mod[g1] = relay.Function([x1, y1], g0(x1, y1))
-    call_graph = relay.analysis.CallGraph(mod)
-
-    p0 = relay.var("p0", shape=(2, 3))
-    p1 = relay.var("p1", shape=(2, 3))
-    func = relay.Function([p0, p1], g0(p0, p1) * g1(p0, p1))
-    mod["main"] = func
-    call_graph = relay.analysis.CallGraph(mod)
-
-    assert call_graph.global_call_count(g0) == 0
-    assert call_graph.global_call_count(g1) == 1
-    assert call_graph.global_call_count("main") == 2
-
-
-def test_ref_count():
-    mod = tvm.IRModule({})
-    x0 = relay.var("x0", shape=(2, 3))
-    y0 = relay.var("y0", shape=(2, 3))
-    g0 = relay.GlobalVar("g0")
-    mod[g0] = relay.Function([x0, y0], x0 + y0)
-    x1 = relay.var("x1", shape=(2, 3))
-    y1 = relay.var("y1", shape=(2, 3))
-    g1 = relay.GlobalVar("g1")
-    mod[g1] = relay.Function([x1, y1], x1 - y1)
-    call_graph = relay.analysis.CallGraph(mod)
-
-    p0 = relay.var("p0", shape=(2, 3))
-    p1 = relay.var("p1", shape=(2, 3))
-    func = relay.Function([p0, p1], g0(p0, p1) * g1(p0, p1))
-    mod["main"] = func
-    call_graph = relay.analysis.CallGraph(mod)
-
-    assert call_graph.ref_count(g0) == 1
-    assert call_graph.ref_count(g1) == 1
-    assert call_graph.ref_count("main") == 0
-
-
-def test_nested_ref():
-    mod = tvm.IRModule({})
-    x0 = relay.var("x0", shape=(2, 3))
-    y0 = relay.var("y0", shape=(2, 3))
-    g0 = relay.GlobalVar("g0")
-    mod[g0] = relay.Function([x0, y0], x0 + y0)
-    x1 = relay.var("x1", shape=(2, 3))
-    y1 = relay.var("y1", shape=(2, 3))
-    g1 = relay.GlobalVar("g1")
-    mod[g1] = relay.Function([x1, y1], g0(x1, y1))
-    call_graph = relay.analysis.CallGraph(mod)
-
-    p0 = relay.var("p0", shape=(2, 3))
-    p1 = relay.var("p1", shape=(2, 3))
-    func = relay.Function([p0, p1], g0(p0, p1) * g1(p0, p1))
-    mod["main"] = func
-    call_graph = relay.analysis.CallGraph(mod)
-
-    assert call_graph.ref_count(g0) == 2
-    assert call_graph.ref_count(g1) == 1
-    assert call_graph.ref_count("main") == 0
-
-
-def test_recursive_func():
-    mod = tvm.IRModule({})
-
-    x = relay.var("x", shape=[], dtype="int32")
-    fn0 = relay.Function([x], x)
-    gx = relay.GlobalVar("gx")
-    mod[gx] = fn0
-
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    sb = relay.ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))):
-        sb.ret(i)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, dtype="int32"))
-        global_call = gx(i)
-        rec_call = relay.Call(sum_up, [one_less]) + global_call
-        sb.ret(relay.add(rec_call, i))
-    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-    func = func.with_attr("Compiler", "a")
-    mod[sum_up] = func
-    iarg = relay.var("i", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg], sum_up(iarg))
-    call_graph = relay.analysis.CallGraph(mod)
-
-    assert call_graph.is_recursive(sum_up)
-    assert call_graph.ref_count(sum_up) == 2
-    assert call_graph.ref_count(gx) == 1
-    assert call_graph.ref_count("main") == 0
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_change_batch.py b/tests/python/relay/test_change_batch.py
deleted file mode 100644
index 8b4c6bab97fd..000000000000
--- a/tests/python/relay/test_change_batch.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.testing import synthetic
-from tvm.relay import transform
-
-
-def test_change_batch_synthetic():
-    net, params = synthetic.get_workload()
-    new_net = transform.ChangeBatch({net["main"].params[0]: 0}, batch_size=123)(net)
-    assert new_net["main"].checked_type.ret_type.shape[0] == 123
-
-
-if __name__ == "__main__":
-    test_change_batch_synthetic()
diff --git a/tests/python/relay/test_cmp_op.py b/tests/python/relay/test_cmp_op.py
deleted file mode 100644
index b82646620584..000000000000
--- a/tests/python/relay/test_cmp_op.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from tvm import relay
-
-a = relay.Var("a")
-b = relay.expr.const(1.0, dtype="float32")
-
-c = a < b
-d = relay.less(a, b)
-assert c.astext() == d.astext()
-
-c = a > b
-d = relay.greater(a, b)
-assert c.astext() == d.astext()
-
-c = a >= b
-d = relay.greater_equal(a, b)
-assert c.astext() == d.astext()
-
-c = a <= b
-d = relay.less_equal(a, b)
-assert c.astext() == d.astext()
diff --git a/tests/python/relay/test_const.py b/tests/python/relay/test_const.py
deleted file mode 100644
index c815f6bd4fa4..000000000000
--- a/tests/python/relay/test_const.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.relay.frontend.common import infer_type
-from tvm.relay import op as _op
-
-
-def test_const_dtype():
-    strides = (1, 1)
-    np_array = np.array(strides).astype("int32")
-    strides = _op.const(np_array, dtype="int64")
-
-    # strides needs to be autoconverted to int64 on Windows
-    assert infer_type(strides).checked_type.dtype == np.dtype(np.int64)
-
-    a = tvm.nd.array(np.random.randint(0, high=255, size=(2, 3), dtype="uint8"))
-    a = _op.const(a, dtype="uint8")
-    aa = a.data.numpy()
-    assert aa.dtype == np.dtype(np.uint8)
-
-    b = _op.const(1, dtype="int8")
-    bb = b.data.numpy()
-    assert bb.dtype == np.dtype(np.int8)
-
-    kshape = (3, 10, 3, 3)
-    w = relay.const(np.zeros(kshape, dtype="float32"))
-    assert w.data.numpy().dtype == np.dtype(np.float32)
diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py
deleted file mode 100644
index e8e6676863a6..000000000000
--- a/tests/python/relay/test_cpp_build_module.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import relay, runtime
-from tvm.contrib.nvcc import have_fp16
-import tvm.testing
-
-
-def test_basic_build():
-    tgt = "llvm"
-    dev = tvm.cpu()
-    # func
-    a = relay.var("a", dtype="float32", shape=(16, 8))
-    b = relay.var("b", dtype="float32", shape=(8, 8))
-    c = relay.var("c", dtype="float32", shape=(16, 8))
-    x = relay.nn.dense(a, b)
-    y = relay.nn.relu(x)
-    z = y + c
-    func = relay.Function([a, b, c], z)
-    A = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), device=dev)
-    B = tvm.nd.array(np.random.uniform(-1, 1, (8, 8)).astype("float32"), device=dev)
-    C = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), device=dev)
-    params = {"b": B, "c": C}
-    # build
-    targets = {tvm.tir.IntImm("int32", dev.device_type): tgt}
-    mod = tvm.IRModule.from_expr(func)
-    func_in_mod = mod["main"]
-    assert mod["main"] == func_in_mod, "cannot compare function to itself"
-
-    lib = relay.build(mod, targets, "llvm", params=params)
-    assert mod["main"] == func_in_mod, "relay.build changed module in-place"
-
-    # test
-    rt = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    rt.set_input("a", A)
-    rt.run()
-    out = rt.get_output(0)
-
-    np.testing.assert_allclose(
-        out.numpy(),
-        np.maximum(np.dot(A.numpy(), B.numpy().T), 0) + C.numpy(),
-        atol=1e-5,
-        rtol=1e-5,
-    )
-
-
-@tvm.testing.requires_cuda
-def test_fp16_build():
-    dtype = "float16"
-
-    dev = tvm.cuda(0)
-    if dtype == "float16" and not have_fp16(dev.compute_version):
-        print("skip because gpu does not support fp16")
-        return
-
-    x = relay.var("x", dtype=dtype, shape=(4, 4))
-    y = relay.var("y", dtype=dtype, shape=(4, 4))
-    z = x + y
-    func = relay.Function([x, y], z)
-    X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
-    Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
-    params = {
-        "x": X,
-        "y": Y,
-    }
-
-    # build
-    g_json, mmod, params = relay.build(func, "cuda", params=params)
-
-    # test
-    rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
-    rt.load_params(runtime.save_param_dict(params))
-    rt.run()
-    out = rt.get_output(0)
-
-    np.testing.assert_allclose(out.numpy(), X.numpy() + Y.numpy(), atol=1e-5, rtol=1e-5)
-
-
-@tvm.testing.requires_llvm
-def test_bf16_build():
-    data = relay.var("data", shape=(1, 3, 224, 224), dtype="float32")
-    weight = relay.var("weight", shape=(64, 3, 7, 7), dtype="float32")
-    bn_gamma = relay.var("gamma", shape=(64,), dtype="float32")
-    bn_beta = relay.var("beta", shape=(64,), dtype="float32")
-    bn_mean = relay.var("mean", shape=(64,), dtype="float32")
-    bn_var = relay.var("var", shape=(64,), dtype="float32")
-    params = {
-        "weight": np.random.uniform(-1, 1, size=(64, 3, 7, 7)).astype("float32"),
-        "gamma": np.random.uniform(-1, 1, size=(64,)).astype("float32"),
-        "beta": np.random.uniform(-1, 1, size=(64,)).astype("float32"),
-        "mean": np.random.uniform(-1, 1, size=(64,)).astype("float32"),
-        "var": np.random.uniform(-1, 1, size=(64,)).astype("float32"),
-    }
-    conv_bf16 = relay.nn.conv2d(
-        relay.cast(data, "bfloat16"),
-        relay.cast(weight, "bfloat16"),
-        strides=(2, 2),
-        padding=(3, 3, 3, 3),
-        channels=64,
-        kernel_size=(7, 7),
-        out_dtype="bfloat16",
-    )
-    bn_bf16 = relay.nn.batch_norm(
-        conv_bf16,
-        relay.cast(bn_gamma, "bfloat16"),
-        relay.cast(bn_beta, "bfloat16"),
-        relay.cast(bn_mean, "bfloat16"),
-        relay.cast(bn_var, "bfloat16"),
-    )
-    relu_bf16 = relay.nn.relu(bn_bf16[0])
-    maxpool_bf16 = relay.nn.max_pool2d(relu_bf16, pool_size=(2, 2), strides=(2, 2))
-    avgpool_bf16 = relay.nn.avg_pool2d(maxpool_bf16, pool_size=(2, 2), strides=(2, 2))
-    flattened_bf16 = relay.nn.batch_flatten(avgpool_bf16)
-    softmax_bf16 = relay.nn.softmax(flattened_bf16)
-    mod_bf16 = tvm.IRModule.from_expr(softmax_bf16)
-    with tvm.transform.PassContext(opt_level=3):
-        relay.build(mod_bf16, target="llvm", params=params)
-
-
-@tvm.testing.parametrize_targets("llvm", "cuda")
-def test_fp16_conversion(target, dev):
-    if target == "cuda" and not have_fp16(dev.compute_version):
-        print("skip because gpu does not support fp16")
-        return
-
-    n = 10
-
-    for (src, dst) in [("float32", "float16"), ("float16", "float32")]:
-        x = relay.var("x", relay.TensorType((n,), src))
-        y = x.astype(dst)
-        func = relay.Function([x], y)
-
-        # init input
-        X = tvm.nd.array(n * np.random.randn(n).astype(src) - n / 2)
-
-        # build
-        with tvm.transform.PassContext(opt_level=1):
-            g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), target)
-
-        # test
-        rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
-        rt.set_input("x", X)
-        rt.run()
-        out = rt.get_output(0)
-
-        np.testing.assert_allclose(out.numpy(), X.numpy().astype(dst), atol=1e-5, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_basic_build()
-    test_fp16_build()
-    test_fp16_conversion()
-    test_bf16_build()
diff --git a/tests/python/relay/test_custom_datatypes.py b/tests/python/relay/test_custom_datatypes.py
deleted file mode 100644
index b0f01e62a059..000000000000
--- a/tests/python/relay/test_custom_datatypes.py
+++ /dev/null
@@ -1,662 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for the Bring Your Own Datatype framework.
-
-TODO(@gussmith23 @hypercubestart) link to documentation"""
-import platform
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.topi.testing
-import tvm.testing
-from tvm import relay
-from tvm.relay.testing.layers import batch_norm_infer
-from tvm.target.datatype import (
-    create_lower_func,
-    create_min_lower_func,
-    lower_call_pure_extern,
-    lower_ite,
-    register,
-    register_min_func,
-    register_op,
-)
-from tvm.tir.op import call_pure_extern
-from tvm.script import tir as T
-
-
-# note: we can't use relay.testing models because params are randomly initialized,
-# which lead the output to have the same values
-# get mobilenet model from Gluon CV
-# because: https://discuss.tvm.apache.org/t/mobilenet-intermediate-values-are-0/7812
-def get_mobilenet():
-    dshape = (1, 3, 224, 224)
-    from mxnet.gluon.model_zoo.vision import get_model
-
-    block = get_model("mobilenet0.25", pretrained=True)
-    shape_dict = {"data": dshape}
-    return relay.frontend.from_mxnet(block, shape_dict)
-
-
-# use real image instead of random data for end-to-end model training
-# or else output would all be around the same value
-def get_cat_image(dimensions):
-    from PIL import Image
-    from tvm.contrib.download import download_testdata
-
-    url = "https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png"
-    dst = "cat.png"
-    real_dst = download_testdata(url, dst, module="data")
-    img = Image.open(real_dst).resize(dimensions)
-    # CoreML's standard model image format is BGR
-    img_bgr = np.array(img)[:, :, ::-1]
-    img = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :]
-    return np.asarray(img, dtype="float32")
-
-
-# we use a random seed to generate input_data
-# to guarantee stable tests
-np.random.seed(0)
-
-
-def convert_ndarray(dst_dtype, array):
-    """Converts NDArray(s) into the specified datatype"""
-    x = relay.var("x", shape=array.shape, dtype=str(array.dtype))
-    cast = relay.Function([x], x.astype(dst_dtype))
-    with tvm.transform.PassContext(config={"tir.disable_vectorize": True}):
-        return relay.create_executor("graph").evaluate(cast)(array)
-
-
-def change_dtype(src, dst, module, params):
-    """Convert constants and functions in module from src type to dst type.
-    Returns changed module and converted params of type dst_type.
-    """
-    module = relay.frontend.ChangeDatatype(src, dst)(module)
-    module = relay.transform.InferType()(module)
-    params = {k: convert_ndarray(dst, v) for k, v in params.items()}
-    return module, params
-
-
-def compare(module, input, src_dtype, dst_dtype, rtol, atol, params={}, target="llvm"):
-    module = relay.transform.InferType()(module)
-    module = relay.transform.SimplifyInference()(module)
-
-    correct = relay.create_executor("graph", mod=module).evaluate()(*input, **params)
-    module, converted_params = change_dtype(src_dtype, dst_dtype, module, params)
-    # converts all inputs to dst_dtype
-    x_converted = [convert_ndarray(dst_dtype, arr) for arr in input]
-
-    # Vectorization is not implemented with custom datatypes
-    with tvm.transform.PassContext(config={"tir.disable_vectorize": True}):
-        maybe_correct = relay.create_executor("graph", mod=module, target=target).evaluate()(
-            *x_converted, **converted_params
-        )
-        # currently this only works for comparing single output
-        maybe_correct_converted = convert_ndarray(src_dtype, maybe_correct)
-    np.testing.assert_allclose(
-        maybe_correct_converted.numpy(), correct.numpy(), rtol=rtol, atol=atol
-    )
-
-
-def setup_myfloat():
-    """Set up tests for myfloat (a custom datatype that under the hood is float)
-
-    Currently, this registers some custom datatypes using the Bring Your
-    Own Datatypes framework.
-    """
-
-    def _setup_myfloat_inner():
-        # To use datatype operations in an external library, you should first load
-        # the library containing the datatype implementation:
-        # CDLL("libposit.so", RTLD_GLOBAL)
-        # In this case, the datatype library we are using is built right into TVM,
-        # so we do not need to explicitly load any library.
-
-        # You can pick a code for your datatype arbitrarily, as long as it is
-        # greater than 128 and has not already been chosen.
-        register("myfloat", 131)
-
-        register_op(
-            create_lower_func({(32, 32): "FloatToCustom32"}), "Cast", "llvm", "float", "myfloat"
-        )
-        register_op(
-            create_lower_func({(32, 32): "Custom32ToFloat"}), "Cast", "llvm", "myfloat", "float"
-        )
-        register_op(create_lower_func({32: "Custom32Add"}), "Add", "llvm", "myfloat")
-        register_op(
-            create_lower_func(
-                {
-                    32: "Custom32Sub",
-                }
-            ),
-            "Sub",
-            "llvm",
-            "myfloat",
-        )
-        register_op(create_lower_func({32: "Custom32Mul"}), "Mul", "llvm", "myfloat")
-        register_op(
-            create_lower_func(
-                {
-                    32: "FloatToCustom32",
-                }
-            ),
-            "FloatImm",
-            "llvm",
-            "myfloat",
-        )
-        register_op(
-            create_lower_func(
-                {
-                    32: "Custom32Div",
-                }
-            ),
-            "Div",
-            "llvm",
-            "myfloat",
-        )
-        register_op(create_lower_func({32: "Custom32Max"}), "Max", "llvm", "myfloat")
-        register_op(
-            create_lower_func({32: "Custom32Sqrt"}),
-            "Call",
-            "llvm",
-            "myfloat",
-            intrinsic_name="tir.sqrt",
-        )
-        register_op(
-            create_lower_func({32: "Custom32Exp"}),
-            "Call",
-            "llvm",
-            "myfloat",
-            intrinsic_name="tir.exp",
-        )
-        register_op(
-            create_lower_func({32: "Custom32Log"}),
-            "Call",
-            "llvm",
-            "myfloat",
-            intrinsic_name="tir.log",
-        )
-        register_op(
-            create_lower_func({32: "Custom32Sigmoid"}),
-            "Call",
-            "llvm",
-            "myfloat",
-            intrinsic_name="tir.sigmoid",
-        )
-        register_op(
-            create_lower_func({32: "Custom32Tanh"}),
-            "Call",
-            "llvm",
-            "myfloat",
-            intrinsic_name="tir.tanh",
-        )
-        register_op(lower_ite, "Call", "llvm", "myfloat", intrinsic_name="tir.if_then_else")
-        register_op(
-            lower_call_pure_extern, "Call", "llvm", "myfloat", intrinsic_name="tir.call_pure_extern"
-        )
-
-        register_min_func(create_min_lower_func({32: "MinCustom32"}, "myfloat"), "myfloat")
-
-    try:
-        _setup_myfloat_inner()
-    except tvm._ffi.base.TVMError as e:
-        # Ignore this specific error which can happen if another test
-        # that uses "myfloat" has already run.
-        if "float is already registered" not in str(e):
-            raise e
-
-
-def setup_posites2():
-    """Set up tests for posites2
-    Currently, this registers some custom datatypes using the Bring Your
-    Own Datatypes framework.
-    """
-
-    # To use datatype operations in an external library, you should first load
-    # the library containing the datatype implementation:
-    # CDLL("libposit.so", RTLD_GLOBAL)
-    # In this case, the datatype library we are using is built right into TVM,
-    # so we do not need to explicitly load any library.
-
-    # You can pick a code for your datatype arbitrarily, as long as it is
-    # greater than 128 and has not already been chosen.
-
-    register("posites2", 132)
-
-    register_op(
-        create_lower_func(
-            {
-                (32, 32): "FloatToPosit32es2",
-                (32, 16): "FloatToPosit16es2",
-                (32, 8): "FloatToPosit8es2",
-            }
-        ),
-        "Cast",
-        "llvm",
-        "float",
-        "posites2",
-    )
-    register_op(
-        create_lower_func(
-            {
-                (32, 32): "Posit32es2ToFloat",
-                (16, 32): "Posit16es2ToFloat",
-                (8, 32): "Posit8es2ToFloat",
-            }
-        ),
-        "Cast",
-        "llvm",
-        "posites2",
-        "float",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Add", 16: "Posit16es2Add", 8: "Posit8es2Add"}),
-        "Add",
-        "llvm",
-        "posites2",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Sub", 16: "Posit16es2Sub", 8: "Posit8es2Sub"}),
-        "Sub",
-        "llvm",
-        "posites2",
-    )
-    register_op(
-        create_lower_func(
-            {32: "FloatToPosit32es2", 16: "FloatToPosit16es2", 8: "FloatToPosit8es2"}
-        ),
-        "FloatImm",
-        "llvm",
-        "posites2",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Mul", 16: "Posit16es2Mul", 8: "Posit8es2Mul"}),
-        "Mul",
-        "llvm",
-        "posites2",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Div", 16: "Posit16es2Div", 8: "Posit8es2Div"}),
-        "Div",
-        "llvm",
-        "posites2",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Max", 16: "Posit16es2Max", 8: "Posit8es2Max"}),
-        "Max",
-        "llvm",
-        "posites2",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Sqrt", 16: "Posit16es2Sqrt", 8: "Posit8es2Sqrt"}),
-        "Call",
-        "llvm",
-        "posites2",
-        intrinsic_name="tir.sqrt",
-    )
-    register_op(lower_ite, "Call", "llvm", "posites2", intrinsic_name="tir.if_then_else")
-    register_op(
-        lower_call_pure_extern, "Call", "llvm", "posites2", intrinsic_name="tir.call_pure_extern"
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Exp", 16: "Posit16es2Exp", 8: "Posit8es2Exp"}),
-        "Call",
-        "llvm",
-        "posites2",
-        intrinsic_name="tir.exp",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Log", 16: "Posit16es2Log", 8: "Posit8es2Log"}),
-        "Call",
-        "llvm",
-        "posites2",
-        intrinsic_name="tir.log",
-    )
-    register_op(
-        create_lower_func(
-            {32: "Posit32es2Sigmoid", 16: "Posit16es2Sigmoid", 8: "Posit8es2Sigmoid"}
-        ),
-        "Call",
-        "llvm",
-        "posites2",
-        intrinsic_name="tir.sigmoid",
-    )
-    register_op(
-        create_lower_func({32: "Posit32es2Tanh", 16: "Posit16es2Tanh", 8: "Posit8es2Tanh"}),
-        "Call",
-        "llvm",
-        "posites2",
-        intrinsic_name="tir.tanh",
-    )
-
-    register_min_func(
-        create_min_lower_func(
-            {32: "MinPosit32es2", 16: "MinPosit16es2", 8: "MinPosit8es2"}, "posites2"
-        ),
-        "posites2",
-    )
-
-
-def run_ops(src_dtype, dst_dtype, rtol=1e-7, atol=1e-7):
-    """Run the same op, but with two different datatypes"""
-    # used for unary ops, first shape in binary ops
-    shape1 = (5, 10, 5)
-    # second shape for binary ops
-    shape2 = (5,)
-
-    def check_unary_op(op, src_dtype, dst_dtype, shape):
-        t1 = relay.TensorType(shape, src_dtype)
-        x = relay.var("x", t1)
-        z = op(x)
-        x_data = np.random.rand(*shape).astype(t1.dtype)
-
-        module = tvm.IRModule.from_expr(relay.Function([x], z))
-
-        compare(module, (x_data,), src_dtype, dst_dtype, rtol, atol)
-
-    # test unary ops
-    for op in [
-        relay.nn.softmax,
-        tvm.relay.log,
-        tvm.relay.exp,
-        tvm.relay.sqrt,
-        tvm.relay.rsqrt,
-        tvm.relay.sigmoid,
-        tvm.relay.tanh,
-        relay.nn.relu,
-        relay.nn.batch_flatten,
-    ]:
-        check_unary_op(op, src_dtype, dst_dtype, shape1)
-
-    # test unary ops over 4d data
-    for op in [relay.nn.max_pool2d, relay.nn.avg_pool2d, relay.nn.global_avg_pool2d]:
-        shape_2d = (3, 32, 32, 32)
-        check_unary_op(op, src_dtype, dst_dtype, shape_2d)
-
-    def check_binary_op(opfunc, src_dtype, dst_dtype):
-        t1 = relay.TensorType(shape1, src_dtype)
-        t2 = relay.TensorType(shape2, src_dtype)
-        x = relay.var("x", t1)
-        y = relay.var("y", t2)
-        z = opfunc(x, y)
-        x_data = np.random.rand(*shape1).astype(t1.dtype)
-        y_data = np.random.rand(*shape2).astype(t2.dtype)
-        module = tvm.IRModule.from_expr(relay.Function([x, y], z))
-
-        compare(module, (x_data, y_data), src_dtype, dst_dtype, rtol, atol)
-
-    for op in [
-        relay.add,
-        relay.subtract,
-        relay.divide,
-        relay.multiply,
-    ]:
-        check_binary_op(op, src_dtype, dst_dtype)
-
-    # we would like to test tvm_if_then_else
-    # but Relay.IfNode is not lowered to this intrinsic,
-    # so to keep our tests consistent with relay, we decide to not unit test
-    # Note: tvm_if_then_else is tested as part of the mobile_net model
-
-
-def run_model(get_workload, input, src_dtype, dst_dtype, rtol=1e-4, atol=1e-4):
-    module, params = get_workload()
-
-    # we don't generate random data here
-    # because then the output data would all be around the same value
-    compare(module, input, src_dtype, dst_dtype, rtol, atol, params)
-
-
-def run_conv2d(src_dtype, dst_dtype, rtol=1e-7, atol=1e-4):
-    def run_test_conv2d(
-        src_dtype,
-        dst_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        groups=1,
-        dilation=(1, 1),
-        **attrs,
-    ):
-        x = relay.var("x", shape=dshape, dtype=src_dtype)
-        w = relay.var("w", shape=kshape, dtype=src_dtype)
-        y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs)
-        module = tvm.IRModule.from_expr(relay.Function([x, w], y))
-        data = np.random.uniform(-scale, scale, size=dshape).astype(src_dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(src_dtype)
-
-        compare(module, (data, kernel), src_dtype, dst_dtype, rtol, atol)
-
-    # depthwise conv2d
-    dshape = (1, 32, 18, 18)
-    kshape = (32, 1, 3, 3)
-    run_test_conv2d(
-        src_dtype,
-        dst_dtype,
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        channels=32,
-        groups=32,
-        kernel_size=(3, 3),
-    )
-
-    # CUDA is disabled for 'direct' schedule:
-    # https://github.com/dmlc/tvm/pull/3070#issuecomment-486597553
-    # group conv2d
-    dshape = (1, 32, 18, 18)
-    kshape = (32, 4, 3, 3)
-    run_test_conv2d(
-        src_dtype,
-        dst_dtype,
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        channels=32,
-        groups=8,
-        kernel_size=(3, 3),
-    )
-    # also group conv2d
-    dshape = (1, 32, 18, 18)
-    kshape = (64, 1, 3, 3)
-    run_test_conv2d(
-        src_dtype,
-        dst_dtype,
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        channels=64,
-        groups=32,
-        kernel_size=(3, 3),
-    )
-
-    # normal conv2d
-    dshape = (1, 3, 224, 224)
-    kshape = (10, 3, 3, 3)
-    run_test_conv2d(
-        src_dtype, dst_dtype, 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(3, 3)
-    )
-
-    # dilated conv2d
-    dshape = (1, 3, 18, 18)
-    kshape = (10, 3, 3, 3)
-    run_test_conv2d(
-        src_dtype,
-        dst_dtype,
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        channels=10,
-        kernel_size=(3, 3),
-        dilation=(3, 3),
-    )
-
-
-def run_batchnorm(src_dtype, dst_dtype, rtol=1e-6, atol=1e-6):
-    shape = (3, 32, 32)
-    t = relay.TensorType(shape, src_dtype)
-    x = relay.var("x", t)
-    bn = batch_norm_infer(data=x, epsilon=2e-5, scale=False, name="bn_x")
-    f = relay.Function(relay.analysis.free_vars(bn), bn)
-
-    x_data = np.random.rand(*shape).astype(t.dtype)
-    module = tvm.IRModule.from_expr(f)
-
-    zero_data = np.zeros((32), "float32")
-    compare(
-        module,
-        (x_data, zero_data, zero_data, zero_data, zero_data),
-        src_dtype,
-        dst_dtype,
-        rtol,
-        atol,
-    )
-
-
-def test_myfloat():
-    setup_myfloat()
-
-    run_ops("float32", "custom[myfloat]32", rtol=1e-6, atol=1e-6)
-    run_conv2d("float32", "custom[myfloat]32", rtol=1e-6, atol=1e-6)
-    run_batchnorm("float32", "custom[myfloat]32", rtol=1e-6, atol=1e-6)
-
-    # mxnet python package not available
-    # run_model(get_mobilenet, (get_cat_image((224, 224)), ),
-    #           'float32',
-    #           'custom[myfloat]32')
-
-
-class TestMyfloatLowering(tvm.testing.CompareBeforeAfter):
-    setup_myfloat()
-
-    transform = tvm.tir.transform.LowerCustomDatatypes()
-
-    def before(self):
-        dtype = "custom[myfloat]32"
-
-        @T.prim_func
-        def func(A_data: T.handle(dtype)):
-            T.func_attr({"target": T.target("llvm")})
-            A = T.Buffer(16, dtype=dtype, data=A_data)
-            B_data = T.allocate([16], dtype=dtype)
-            B = T.Buffer(16, dtype=dtype, data=B_data)
-            for i in range(16):
-                B[i] = A[i] + 1.0
-
-        return func
-
-    def expected(self):
-        dtype = "custom[myfloat]32"
-
-        @T.prim_func
-        def func(A_data: T.handle(dtype)):
-            T.func_attr({"target": T.target("llvm")})
-            A_uint32 = T.Buffer(16, "uint32", data=A_data)
-            B_data = T.allocate([16], dtype="uint32")
-            B_uint32 = T.Buffer(16, "uint32", data=B_data)
-            for i in range(16):
-                B_uint32[i] = T.call_pure_extern(
-                    "uint32",
-                    "FloatToCustom32",
-                    T.call_pure_extern("float32", "Custom32ToFloat", A_uint32[i]) + T.float32(1),
-                )
-
-        return func
-
-
-class TestMyfloatLoweringDeclBuffer(tvm.testing.CompareBeforeAfter):
-    """Like TestMyfloatLoweringDeclBuffer, but using DeclBuffer"""
-
-    setup_myfloat()
-
-    transform = tvm.tir.transform.LowerCustomDatatypes()
-
-    def before(self):
-        dtype = "custom[myfloat]32"
-
-        @T.prim_func
-        def func(A_data: T.handle(dtype)):
-            T.func_attr({"target": T.target("llvm")})
-            A = T.decl_buffer(16, dtype=dtype, data=A_data)
-            B = T.decl_buffer(16, dtype=dtype)
-            for i in range(16):
-                B[i] = A[i] + 1.0
-
-        return func
-
-    def expected(self):
-        dtype = "custom[myfloat]32"
-
-        @T.prim_func
-        def func(A_data: T.handle(dtype)):
-            T.func_attr({"target": T.target("llvm")})
-            A_uint32 = T.decl_buffer(16, "uint32", data=A_data)
-            B_uint32 = T.decl_buffer(16, dtype="uint32")
-            for i in range(16):
-                B_uint32[i] = T.call_pure_extern(
-                    "uint32",
-                    "FloatToCustom32",
-                    T.call_pure_extern("float32", "Custom32ToFloat", A_uint32[i]) + T.float32(1),
-                )
-
-        return func
-
-
-def _has_posit():
-    return tvm.support.libinfo()["USE_BYODT_POSIT"] == "ON"
-
-
-@pytest.mark.skipif(not _has_posit(), reason="compiled with USE_BYODT_POSIT flag OFF")
-def test_posites2():
-    setup_posites2()
-    run_ops("float32", "custom[posites2]8", rtol=1, atol=1)
-    run_ops("float32", "custom[posites2]16", rtol=0.01, atol=1)
-    run_ops("float32", "custom[posites2]32", rtol=1e-6, atol=1e-6)
-
-    run_conv2d("float32", "custom[posites2]8", rtol=1, atol=1)
-    run_conv2d("float32", "custom[posites2]16", rtol=0.01, atol=1)
-    run_conv2d("float32", "custom[posites2]32")
-
-    run_batchnorm("float32", "custom[posites2]8", rtol=1, atol=1)
-    run_batchnorm("float32", "custom[posites2]16", rtol=0.01, atol=1)
-    run_batchnorm("float32", "custom[posites2]32", rtol=1e-4, atol=1e-4)
-    # Expected posit8 might be faster, but it's not.
-    # run_model(get_mobilenet, (get_cat_image((224, 224)), ), 'float32', 'custom[posit8]8')
-    # run_model(get_mobilenet, (get_cat_image((224, 224)), ), 'float32', 'custom[posit32]32')
-    # run_model(get_inception, (get_cat_image((229, 229)), ), 'float32', 'custom[posit32]32')
-    # run_model(get_resnet, (get_cat_image((224, 224)), ), 'float32', 'custom[posit32]32')
-
-    # can't run cifar-10 sizes because dimensions
-    # don't match pretrained weights
-
-    # runs on the order of minutes...
-    # run_model(get_inception, (get_cat_image((229, 229)), ),
-    #           'float32',
-    #           'custom[posites2]32')
-    # run_model(get_resnet, (get_cat_image((224, 224)), ),
-    #           'float32',
-    #           'custom[posites2]32')
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
deleted file mode 100644
index b79713e05ed3..000000000000
--- a/tests/python/relay/test_dataflow_pattern.py
+++ /dev/null
@@ -1,2078 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-wildcard-import
-import numpy as np
-
-import tvm
-from tvm.script import tir as T
-from tvm import relay
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.dataflow_pattern import *
-from tvm.relay.testing import run_opt_pass
-
-# NB: 1 corresponds to the C++ enum that specicfies this
-# we loose the type safety due to the Python/C++ calling
-# convention.
-K_ELEMWISE = 0
-K_BROADCAST = 1
-K_INJECTIVE = 2
-
-## NODE TESTS
-def test_expr_pattern():
-    ep = is_expr(relay.var("x", shape=(4, 1)))
-    assert isinstance(ep, ExprPattern)
-    assert isinstance(ep.expr, relay.Var)
-
-
-def test_var_pattern():
-    v = is_var("x")
-    assert isinstance(v, VarPattern)
-    assert v.name == "x"
-
-
-def test_constant_pattern():
-    c = is_constant()
-    assert isinstance(c, ConstantPattern)
-
-
-def test_wildcard_pattern():
-    wc = wildcard()
-    assert isinstance(wc, WildcardPattern)
-
-
-def test_CallPattern():
-    wc1 = wildcard()
-    wc2 = wildcard()
-    c = is_op("add")(wc1, wc2)
-    assert isinstance(c, CallPattern)
-    assert isinstance(c.args[0], WildcardPattern)
-    assert isinstance(c.args[1], WildcardPattern)
-
-
-def test_FunctionPattern():
-    wc1 = wildcard()
-    wc2 = wildcard()
-    c = is_op("add")(wc1, wc2)
-    f = FunctionPattern([wc1, wc2], c)
-    assert isinstance(f, FunctionPattern)
-    assert isinstance(f.params[0], WildcardPattern)
-    assert isinstance(f.params[1], WildcardPattern)
-    assert isinstance(f.body, CallPattern)
-    assert isinstance(f.body.args[0], WildcardPattern)
-    assert isinstance(f.body.args[1], WildcardPattern)
-
-
-def test_TuplePattern():
-    wc1 = wildcard()
-    wc2 = wildcard()
-    t = is_tuple([wc1, wc2])
-    assert isinstance(t, TuplePattern)
-    assert isinstance(t.fields[0], WildcardPattern)
-    assert isinstance(t.fields[1], WildcardPattern)
-
-
-def test_TupleGetItemPattern():
-    wc1 = wildcard()
-    wc2 = wildcard()
-    t = is_tuple([wc1, wc2])
-    tgi = is_tuple_get_item(t, 1)
-    assert isinstance(tgi, TupleGetItemPattern)
-    assert isinstance(tgi.tuple, TuplePattern)
-    assert isinstance(tgi.tuple.fields[0], WildcardPattern)
-    assert isinstance(tgi.tuple.fields[1], WildcardPattern)
-
-
-def test_AltPattern():
-    is_add_or_sub = is_op("add") | is_op("subtract")
-    assert isinstance(is_add_or_sub, AltPattern)
-
-
-def test_TypePattern():
-    ttype = relay.TensorType((10, 10), "float32")
-    ty_pat = has_type(ttype)
-    assert isinstance(ty_pat, TypePattern)
-    assert ty_pat.type == ttype
-
-
-def test_DataTypePattern():
-    dtype = "float16"
-    pattern = has_dtype(dtype)
-    assert isinstance(pattern, DataTypePattern)
-    assert pattern.dtype == dtype
-
-
-def test_ShapePattern():
-    shape = [T.int32(10), T.int32(10)]
-    pattern = has_shape(shape)
-    assert isinstance(pattern, ShapePattern)
-    tvm.ir.assert_structural_equal(pattern.shape, shape)
-
-
-def test_AttrPattern():
-    op = is_op("add").has_attr({"TOpPattern": K_ELEMWISE})
-    assert isinstance(op, AttrPattern)
-    assert op.attrs["TOpPattern"] == K_ELEMWISE
-
-
-def test_IfPattern():
-    x = is_var("x")
-    y = is_var("y")
-    pat = is_if(is_op("less")(x, y), x, y)
-
-    assert isinstance(pat, IfPattern)
-    assert isinstance(pat.cond, CallPattern)
-    assert isinstance(pat.true_branch, VarPattern)
-    assert isinstance(pat.false_branch, VarPattern)
-
-
-def test_LetPattern():
-    x = is_var("x")
-    y = is_var("y")
-    let_var = is_var("let")
-    pat = is_let(let_var, is_op("less")(x, y), let_var)
-
-    assert isinstance(pat, LetPattern)
-    assert isinstance(pat.var, VarPattern)
-    assert isinstance(pat.value, CallPattern)
-    assert isinstance(pat.body, VarPattern)
-
-
-## MATCHER TESTS
-
-
-def test_match_op():
-    assert is_op("add").match(relay.op.op.get("add"))
-
-
-def test_no_match_op():
-    assert not is_op("add").match(relay.op.op.get("subtract"))
-
-
-def test_match_op_or():
-    is_add_or_sub = is_op("add") | is_op("subtract")
-    assert is_add_or_sub.match(relay.op.op.get("add"))
-    assert is_add_or_sub.match(relay.op.op.get("subtract"))
-
-
-def test_match_call_commutive():
-    x = relay.var("x")
-    y = relay.var("y")
-    add_pattern = is_op("add")(is_var("x"), is_var("y"))
-    assert add_pattern.match(x + y)
-    assert add_pattern.match(y + x)
-    mul_pattern = is_op("multiply")(is_var("x"), is_var("y"))
-    assert mul_pattern.match(x * y)
-    assert mul_pattern.match(y * x)
-
-
-def test_no_match_call_commutive():
-    x = relay.var("x")
-    y = relay.var("y")
-    add_pattern = is_op("subtract")(is_var("x"), is_var("y"))
-    assert add_pattern.match(x - y)
-    assert not add_pattern.match(y - x)
-    add_pattern = is_op("divide")(is_var("x"), is_var("y"))
-    assert add_pattern.match(x / y)
-    assert not add_pattern.match(y / x)
-
-
-def test_match_call():
-    x = relay.var("x")
-    y = relay.var("y")
-    add_pattern = is_op("add")(wildcard(), wildcard())
-    assert add_pattern.match(x + y)
-
-    # Match call with any number of inputs
-    call_pattern = wildcard()(None)
-    assert call_pattern.match(relay.op.nn.relu(x))
-    assert call_pattern.match(relay.op.add(x, y))
-
-
-def test_no_match_call():
-    x = relay.var("x")
-    y = relay.var("y")
-    add_pattern = is_op("add")(wildcard(), wildcard())
-    assert not add_pattern.match(x - y)
-
-
-def test_match_func():
-    x = relay.var("x")
-    y = relay.var("y")
-    wc1 = wildcard()
-    wc2 = wildcard()
-    func_pattern = FunctionPattern([wc1, wc2], wc1 + wc2)
-    assert func_pattern.match(relay.Function([x, y], x + y))
-
-    # Match Function with any number of inputs
-    func_pattern = FunctionPattern(None, wildcard())
-    assert func_pattern.match(relay.Function([x], x))
-    assert func_pattern.match(relay.Function([x, y], x + y))
-
-
-def test_no_match_func():
-    x = relay.var("x")
-    y = relay.var("y")
-    wc1 = wildcard()
-    wc2 = wildcard()
-    func_pattern = FunctionPattern([wc1, wc2], wc1 + wc2)
-    assert not func_pattern.match(relay.Function([x, y], x - y))
-
-
-def test_match_if():
-    x = is_var("x")
-    y = is_var("y")
-    pat = is_if(is_op("less")(x, y), x, y)
-
-    x = relay.var("x")
-    y = relay.var("y")
-    cond = x < y
-
-    assert pat.match(relay.expr.If(cond, x, y))
-
-
-def test_no_match_if():
-    x = is_var("x")
-    y = is_var("y")
-    pat = is_if(is_op("less")(x, y), x, y)
-
-    x = relay.var("x")
-    y = relay.var("y")
-
-    assert not pat.match(relay.expr.If(x > y, x, y))
-    assert not pat.match(relay.expr.If(x < y, y, x))
-
-
-def test_match_let():
-    x = is_var("x")
-    y = is_var("y")
-    let_var = is_var("let")
-    pat = is_let(let_var, is_op("less")(x, y), let_var)
-
-    x = relay.var("x")
-    y = relay.var("y")
-    lv = relay.var("let")
-    cond = x < y
-    assert pat.match(relay.expr.Let(lv, cond, lv))
-
-
-def test_no_match_let():
-    x = is_var("x")
-    y = is_var("y")
-    let_var = is_var("let")
-    pat = is_let(let_var, is_op("less")(x, y), let_var)
-
-    x = relay.var("x")
-    y = relay.var("y")
-    lv = relay.var("let")
-
-    assert not pat.match(relay.expr.Let(lv, x > y, lv))
-    assert not pat.match(relay.expr.Let(lv, x < y, lv * x))
-
-
-def test_match_option():
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-    pattern = is_op("nn.relu")(
-        is_op("nn.conv2d")(wildcard(), wildcard()).optional(
-            lambda x: is_op("nn.bias_add")(x, wildcard())
-        )
-    )
-
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.nn.relu(conv2d)
-    assert pattern.match(relu)
-
-    conv2d = relay.op.nn.conv2d(x, w)
-    bias_add = relay.op.nn.bias_add(conv2d, b)
-    relu = relay.op.nn.relu(bias_add)
-    assert pattern.match(relu)
-
-    pattern = is_op("nn.conv2d")(wildcard(), wildcard())
-    pattern = pattern.optional(is_op("nn.relu")).optional(is_op("tanh"))
-
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.nn.relu(conv2d)
-    tanh = relay.op.tanh(conv2d)
-    tanh2 = relay.op.tanh(relu)
-    relu2 = relay.op.nn.relu(tanh)
-    assert pattern.match(conv2d)
-    assert pattern.match(relu)
-    assert pattern.match(tanh)
-    assert pattern.match(tanh2)
-    assert not pattern.match(relu2)
-
-
-def test_no_match_option():
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-    pattern = is_op("nn.relu")(
-        is_op("nn.conv2d")(wildcard(), wildcard()).optional(
-            lambda x: is_op("nn.bias_add")(x, wildcard())
-        )
-    )
-
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.tanh(conv2d)
-    assert not pattern.match(relu)
-
-    conv2d = relay.op.nn.dense(x, w)
-    relu = relay.op.tanh(conv2d)
-    assert not pattern.match(relu)
-
-    conv2d = relay.op.nn.dense(x, w)
-    bias_add = relay.op.nn.bias_add(conv2d, b)
-    relu = relay.op.nn.relu(bias_add)
-    assert not pattern.match(relu)
-
-    conv2d = relay.op.nn.conv2d(x, w)
-    bias_add = conv2d + w
-    relu = relay.op.nn.relu(bias_add)
-    assert not pattern.match(relu)
-
-
-def test_match_const():
-    conv2d = is_op("nn.conv2d")(wildcard(), is_constant())
-    pattern = is_op("nn.bias_add")(conv2d, wildcard())
-
-    x = relay.var("x", shape=(1, 3, 224, 224))
-    w = relay.var("w", shape=(3, 3, 3, 3))
-    b = relay.var("b", shape=(3,))
-    conv2d = relay.op.nn.conv2d(x, w)
-    out = relay.op.nn.bias_add(conv2d, b)
-    func = relay.Function([x, w, b], out)
-    mod = tvm.IRModule.from_expr(func)
-
-    assert not pattern.match(mod["main"].body)
-    mod["main"] = bind_params_by_name(mod["main"], {"w": tvm.nd.array(np.ones(shape=(3, 3, 3, 3)))})
-    assert pattern.match(mod["main"].body)
-
-
-def test_match_tuple():
-    x = relay.var("x")
-    y = relay.var("y")
-    z = relay.op.op.get("add")
-    tuple_pattern = is_tuple((is_var("x"), wildcard(), is_op("add")))
-    assert tuple_pattern.match(relay.expr.Tuple((x, y, z)))
-
-    tuple_pattern = is_tuple((is_var("x"), wildcard(), is_op("add")))
-    tuple_get_item_pattern = is_tuple_get_item(tuple_pattern, 1)
-    assert tuple_get_item_pattern.match(relay.expr.TupleGetItem(relay.expr.Tuple((x, y, z)), 1))
-
-    tuple_get_item_pattern = is_tuple_get_item(tuple_pattern)  # Match any index
-    assert tuple_get_item_pattern.match(relay.expr.TupleGetItem(relay.expr.Tuple((x, y, z)), 0))
-    assert tuple_get_item_pattern.match(relay.expr.TupleGetItem(relay.expr.Tuple((x, y, z)), 1))
-    assert tuple_get_item_pattern.match(relay.expr.TupleGetItem(relay.expr.Tuple((x, y, z)), 2))
-
-    # Match tuple with any inputs
-    tuple_pattern = is_tuple(None)
-    concat_pattern = is_op("concatenate")(tuple_pattern)
-    assert concat_pattern.match(relay.op.concatenate(relay.expr.Tuple((x,)), axis=0))
-    assert concat_pattern.match(relay.op.concatenate(relay.expr.Tuple((x, y)), axis=0))
-    assert concat_pattern.match(relay.op.concatenate(relay.expr.Tuple((x, y, z)), axis=0))
-
-
-def test_no_match_tuple():
-    x = relay.var("x")
-    y = relay.var("y")
-    z = relay.op.op.get("add")
-    tuple_pattern = is_tuple((is_var("x"), wildcard(), is_op("add"), wildcard()))
-    assert not tuple_pattern.match(relay.expr.Tuple((x, y, z)))
-
-    tuple_pattern = is_tuple((is_var("x"), wildcard(), is_op("add")))
-    tuple_get_item_pattern = is_tuple_get_item(tuple_pattern, 1)
-    assert not tuple_get_item_pattern.match(relay.expr.TupleGetItem(relay.expr.Tuple((x, y, z)), 2))
-
-
-def test_match_type():
-    x = relay.var("x", shape=(10, 10), dtype="float32")
-    ty_pat = has_type(relay.TensorType((10, 10), "float32"))
-    assert ty_pat.match(x)
-
-
-def test_no_match_type():
-    x = relay.var("x", shape=(10, 10), dtype="int32")
-    ty_pat = has_type(relay.TensorType((10, 10), "float32"))
-    assert not ty_pat.match(x)
-
-
-def test_match_dtype():
-    x = relay.var("x", shape=(10, 10), dtype="float32")
-    ty_pat = has_dtype("float32")
-    assert ty_pat.match(x)
-
-
-def test_no_match_dtype():
-    x = relay.var("x", shape=(10, 10), dtype="int32")
-    ty_pat = has_dtype("float32")
-    assert not ty_pat.match(x)
-
-
-def test_match_shape():
-    x = relay.var("x", shape=(10, 10), dtype="float32")
-    ty_pat = has_shape((10, 10))
-    assert ty_pat.match(x)
-
-
-def test_no_match_shape():
-    x = relay.var("x", shape=(10, 10), dtype="int32")
-    ty_pat = has_shape((10, 5))
-    assert not ty_pat.match(x)
-
-
-def test_match_op_attr():
-    op = is_op("add").has_attr({"TOpPattern": K_BROADCAST})
-    op_pat = op(wildcard(), wildcard())
-    x = relay.var("x")
-    y = relay.var("y")
-    assert op_pat.match(x + y)
-
-
-def test_no_match_op_attr():
-    op = is_op("nn.dense").has_attr({"TOpPattern": K_ELEMWISE})
-    op_pat = op(wildcard(), wildcard())
-    x = relay.var("x")
-    y = relay.var("y")
-    assert not op_pat.match(relay.op.nn.dense(x, y))
-    op = is_op("add").has_attr({"TOpPattern": K_BROADCAST})
-    op_pat = op(wildcard(), wildcard())
-    x = relay.var("x")
-    y = relay.var("y")
-    assert not op_pat.match(x - y)
-    z = relay.var("z")
-    assert not op_pat.match(relay.Let(z, x + y, z))
-
-
-def test_match_func_attr():
-    pattern = wildcard().has_attr({"Composite": "add"})
-    x = relay.var("x")
-    y = relay.var("y")
-    f = relay.Function([x, y], x + y).with_attr("Composite", "add")
-    assert pattern.match(f)
-
-
-def test_no_match_func_attr():
-    pattern = wildcard().has_attr({"Composite": "add"})
-    x = relay.var("x")
-    y = relay.var("y")
-
-    f = relay.Function([x, y], x + y).with_attr("RandomTest", "add")
-    assert not pattern.match(f)
-    f = relay.Function([x, y], x + y).with_attr("Composite", "conv_bias")
-    assert not pattern.match(f)
-
-
-def test_match_call_attr():
-    # String attr
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"data_layout": "NCHW"})
-    x = relay.var("x")
-    y = relay.var("y")
-    assert is_conv2d.match(relay.op.nn.conv2d(x, y))
-
-    # Array attr
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"kernel_size": [3, 3]})
-    out = relay.op.nn.conv2d(x, y, kernel_size=[3, 3])
-    assert is_conv2d.match(out)
-
-    # non-operator call
-    attr_dict = {"call_attr": "attr"}
-    call_has_attr = wildcard()(wildcard()).has_attr(attr_dict)
-    call_attr = tvm.ir.make_node("DictAttrs", **attr_dict)
-    a = relay.Var("a")
-    b = relay.Var("b")
-    assert call_has_attr.match(relay.Call(a, [b], attrs=call_attr))
-
-    # empty attrs should match anything
-    empty_attrs = tvm.ir.make_node("DictAttrs", **{})
-    call_has_empty_attrs = wildcard()(wildcard()).has_attr({})
-    assert call_has_empty_attrs.match(relay.Call(a, [b], attrs=empty_attrs))
-    assert call_has_empty_attrs.match(relay.Call(a, [b], attrs=call_attr))
-
-
-def test_no_match_call_attr():
-    x = relay.var("x")
-    y = relay.var("y")
-
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"data_layout": "NHWC"})
-    assert not is_conv2d.match(relay.op.nn.conv2d(x, y))
-
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"RandomAttr": "NCHW"})
-    assert not is_conv2d.match(relay.op.nn.conv2d(x, y))
-
-    # Array attr
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"kernel_size": [3, 3]})
-    out = relay.op.nn.conv2d(x, y, kernel_size=[2, 1])
-    assert not is_conv2d.match(out)
-
-    # non-operator calls
-    call_has_attr = wildcard()(wildcard()).has_attr({"call_attr": "attr"})
-    wrong_key = tvm.ir.make_node("DictAttrs", **{"wrong": "attr"})
-    wrong_value = tvm.ir.make_node("DictAttrs", **{"call_attr": "wrong"})
-    empty_attrs = tvm.ir.make_node("DictAttrs", **{})
-
-    a = relay.Var("a")
-    b = relay.Var("b")
-    # attrs left undefined
-    assert not call_has_attr.match(relay.Call(a, [b]))
-    # wrong attrs
-    assert not call_has_attr.match(relay.Call(a, [b], attrs=wrong_key))
-    assert not call_has_attr.match(relay.Call(a, [b], attrs=wrong_value))
-    assert not call_has_attr.match(relay.Call(a, [b], attrs=empty_attrs))
-
-
-def test_match_call_attr_dtype():
-    is_cast = is_op("cast")(wildcard()).has_attr({"dtype": "float32"})
-    x = relay.var("x")
-    assert is_cast.match(relay.op.cast(x, "float32"))
-
-
-def test_match_diamond():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    path1 = is_op("nn.relu")(is_conv2d)
-    path2 = is_op("nn.leaky_relu")(is_conv2d)
-    diamond = is_op("add")(path1, path2)
-
-    # Expr
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert diamond.match(out)
-
-
-def test_no_match_diamond():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    path1 = is_op("nn.relu")(is_conv2d)
-    path2 = is_op("nn.leaky_relu")(is_conv2d)
-    diamond = is_op("add")(path1, path2)
-
-    # Expr
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-
-    # Check
-    assert not diamond.match(leaky_relu)
-    assert not diamond.match(relu)
-
-
-def test_match_fake_diamond():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    path1 = is_op("nn.relu")(is_conv2d)
-    path2 = is_op("nn.leaky_relu")(is_conv2d)
-    diamond = is_op("add")(path1, path2)
-
-    # Expr
-    input1 = relay.var("input1")
-    weight1 = relay.var("weight1")
-    conv2d1 = relay.op.nn.conv2d(input1, weight1)
-    inp2 = relay.var("input2")
-    weight2 = relay.var("weight2")
-    conv2d2 = relay.op.nn.conv2d(inp2, weight2)
-    relu = relay.op.nn.relu(conv2d1)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d2, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert not diamond.match(out)
-
-
-def test_at_most_one_parent():
-    # Pattern
-    P = is_op("nn.conv2d")(wildcard(), wildcard())  # 'parent'
-    I = is_op("nn.relu")(wildcard())  # 'intermediate' ('path' in the code)
-    C = is_op("add")(wildcard(), wildcard())  # 'child'
-    pattern = dominates(P, I, C)
-
-    #       n6(P)
-    #      /  \
-    #     n7   \
-    #    /      \
-    #    n8(P)  n10(I)
-    #    \      /
-    #    n9(I) /
-    #      \  /
-    #      n11(C)
-
-    x = relay.var("x")
-    w = relay.var("w")
-    n6 = relay.op.nn.conv2d(x, w)  # matches P
-    n7 = relay.op.tanh(n6)  # does not match I
-    n8 = relay.op.nn.conv2d(n7, w)  # matches P
-    n9 = relay.op.nn.relu(n8)  # matches I
-    n10 = relay.op.nn.relu(n6)  # matches I
-    n11 = relay.add(n9, n10)  # matches C
-
-    # Does not match: Can't match the parent pattern P at both 8 and 6.
-    # Note that if we did allow P to be used twice the implementation would
-    # need to be changed to not 'jump over' n7.
-    assert not pattern.match(n11)
-
-
-def test_match_dominator():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard())
-    reduction = is_op("add")(wildcard(), wildcard())
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    # Classic Diamond
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert diamond.match(out)
-
-    # Deeper Branch
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    relu = relay.op.tanh(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert diamond.match(out)
-
-    # Single Branch
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    tanh = relay.op.tanh(relu)
-    out = relu + tanh
-
-    # Check
-    assert diamond.match(out)
-
-    # Fuzzy path/nested Diamond
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard()) | is_op(
-        "add"
-    )(wildcard(), wildcard())
-    reduction = is_op("add")(wildcard(), wildcard())
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relu + relu
-    tanh = relay.op.tanh(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = tanh + leaky_relu
-
-    assert diamond.match(out)
-
-
-def test_match_dominator2():
-    # Pattern
-    conv2d_pat = is_op("nn.conv2d")(wildcard(), wildcard())
-    eltwise_pat = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(None)
-    broadcast_pat = (wildcard().has_attr({"TOpPattern": K_BROADCAST}))(None)
-    path_pat = eltwise_pat | broadcast_pat
-    injective_pat = (wildcard().has_attr({"TOpPattern": K_INJECTIVE}))(wildcard())
-    pattern = injective_pat.dominates(conv2d_pat, path_pat)
-
-    # Graph
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    bias = relay.var("bias")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    bias_add = relay.op.nn.bias_add(conv2d, bias)
-    relu = relay.op.nn.relu(bias_add)
-    reshape = relay.op.reshape(relu, newshape=[-1, 2, 8])
-
-    # Check
-    assert pattern.match(reshape)
-
-
-def test_not_match_dominator():
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard())
-    reduction = is_op("add")(wildcard(), wildcard())
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    # Fake Diamond
-    input1 = relay.var("input1")
-    weight1 = relay.var("weight1")
-    conv2d1 = relay.op.nn.conv2d(input1, weight1)
-    inp2 = relay.var("input2")
-    weight2 = relay.var("weight2")
-    conv2d2 = relay.op.nn.conv2d(inp2, weight2)
-    relu = relay.op.nn.relu(conv2d1)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d2, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert not diamond.match(out)
-
-    # Add op that doesn't match K_ELEMWISE
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relu + relu
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert not diamond.match(out)
-
-    # Relu on the input instead of the conv
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(inp)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert not diamond.match(out)
-
-    # No conv
-    inp = relay.var("input")
-    relu = relay.op.nn.relu(inp)
-    relu = relay.op.nn.relu(relu)
-    tanh = relay.op.tanh(relu)
-    out = relu + tanh
-
-    # Check
-    assert not diamond.match(out)
-
-
-def test_not_match_dominator2():
-    # Pattern
-    P = is_op("nn.conv2d")(wildcard(), wildcard())  # 'parent'
-    I = is_op("nn.relu")(wildcard())  # 'intermediate' ('path' in the code)
-    C = is_op("add")(wildcard(), wildcard())  # 'child'
-    pattern = dominates(P, I, C)
-
-    #       n6(P)
-    #      /  \
-    #     n7   \
-    #    /      \
-    #    n8(P)  n9(I)
-    #    \      /
-    #     \    /
-    #      \  /
-    #      n10(C)
-
-    x = relay.var("x")
-    w = relay.var("w")
-    n6 = relay.op.nn.conv2d(x, w)  # matches P
-    n7 = relay.op.tanh(n6)  # does not match I
-    n8 = relay.op.nn.conv2d(n7, w)  # matches P
-    n9 = relay.op.nn.relu(n6)  # matches I
-    n10 = relay.add(n8, n9)  # matches C
-
-    # Does not match: Can't match the parent pattern P at both 8 and 6.
-    # Note that if we did allow P to be used twice the implementation would
-    # need to be changed to not 'jump over' n7.
-    assert not pattern.match(n10)
-
-
-def test_match_typed_dominator():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard()).has_dtype(
-        "float32"
-    )
-    reduction = is_op("add")(wildcard(), wildcard()).has_shape([1, 3, 10, 10])
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    # Classic Diamond
-    inp = relay.var("input", relay.TensorType((1, 3, 12, 12), "float32"))
-    weight = relay.var("weight", relay.TensorType((3, 3, 3, 3), "float32"))
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Check
-    assert diamond.match(out)
-
-
-def test_no_match_typed_dominator():
-    # Classic Diamond
-    inp = relay.var("input", relay.TensorType((1, 3, 12, 12), "float32"))
-    weight = relay.var("weight", relay.TensorType((3, 3, 3, 3), "float32"))
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard()).has_dtype(
-        "float32"
-    )
-    reduction = is_op("add")(wildcard(), wildcard()).has_shape([1, 1, 10, 10])
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    # Check
-    assert not diamond.match(out)
-
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard()).has_dtype(
-        "float16"
-    )
-    reduction = is_op("add")(wildcard(), wildcard()).has_shape([1, 3, 10, 10])
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    # Check
-    assert not diamond.match(out)
-
-
-def test_rewrite():
-    x = relay.var("x")
-    y = relay.var("y")
-    add_pattern = is_op("add")(wildcard(), wildcard())
-    sub_pattern = is_op("subtract")(wildcard(), wildcard())
-
-    class TestRewrite(DFPatternCallback):
-        def __init__(self):
-            super(TestRewrite, self).__init__()
-            self.pattern = add_pattern
-
-        def callback(self, pre, post, node_map):
-            return post.args[0] - post.args[1]
-
-    out = rewrite(TestRewrite(), x + y)
-    assert sub_pattern.match(out)
-
-
-def test_rewrite_func():
-    x = relay.var("x")
-    w = relay.var("w")
-    y = relay.var("y")
-    add_pattern = is_op("add")(wildcard(), wildcard())
-    sub_pattern = is_op("subtract")(wildcard(), wildcard())
-
-    class TestRewrite(DFPatternCallback):
-        def __init__(self):
-            super(TestRewrite, self).__init__()
-            self.pattern = add_pattern
-
-        def callback(self, pre, post, node_map):
-            return post.args[0] - post.args[1]
-
-    inpf = relay.var("input")
-    weightf = relay.var("weight")
-    func = relay.Function(
-        [inpf, weightf], relay.op.nn.relu(relay.op.nn.conv2d(inpf, weightf)), attrs=None
-    )
-    out = rewrite(TestRewrite(), func(x, w) + y)
-    assert sub_pattern.match(out)
-
-
-def test_rewrite_func_with_attr():
-    x = relay.var("x")
-    y = relay.var("y")
-    f = relay.Function([x, y], x + y).with_attr("Composite", "add")
-
-    a = relay.var("a")
-    b = relay.var("b")
-    c = relay.Call(f, [a, b])
-    c_abs = relay.abs(c)
-
-    class TestRewrite(DFPatternCallback):
-        def __init__(self):
-            super(TestRewrite, self).__init__()
-            self.pattern = wildcard().has_attr({"Composite": "add"})(wildcard(), wildcard())
-
-        def callback(self, pre, post, node_map):
-            return post.args[0] + post.args[1]
-
-    out = rewrite(TestRewrite(), c_abs)
-    inlined_add_pattern = is_op("abs")(is_op("add")(wildcard(), wildcard()))
-    assert inlined_add_pattern.match(out)
-
-
-def test_nested_rewrite():
-    class PatternCallback(DFPatternCallback):
-        def __init__(self, pattern):
-            super(PatternCallback, self).__init__()
-            self.pattern = pattern
-
-        def callback(self, pre, post, node_map):
-            return post
-
-    def gen():
-        x = relay.var("x")
-        y = relay.var("y")
-        y_add = relay.add(y, y)
-        n0 = relay.add(x, y_add)
-        n1 = relay.add(x, n0)
-        return relay.add(n1, n0)
-
-    def pattern():
-        a = wildcard()
-        b = wildcard()
-        n0 = is_op("add")(a, b)
-        n1 = is_op("add")(n0, a)
-        return is_op("add")(n0, n1)
-
-    out = gen()
-    pat = pattern()
-    new_out = rewrite(PatternCallback(pat), out)
-
-    tvm.ir.assert_structural_equal(out, new_out)
-
-
-def test_not_fuse_multi_diamond():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    path1 = is_op("nn.relu")(is_conv2d)
-    path2 = is_op("nn.leaky_relu")(is_conv2d)
-    diamond = is_op("add")(path1, path2)
-
-    # Expr
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-    out = out + conv2d
-    # Check
-    assert not diamond.match(out)
-
-
-class BatchnormCallback(DFPatternCallback):
-    def __init__(self):
-        super(BatchnormCallback, self).__init__()
-        self.x = wildcard()
-        self.var = wildcard()
-        self.mean = wildcard()
-        self.beta = wildcard()
-        self.gamma = wildcard()
-        self.eps = is_constant()
-
-        self.pattern = (
-            self.gamma * (self.x - self.mean) / is_op("sqrt")(self.var + self.eps) + self.beta
-        )
-
-    def callback(self, pre, post, node_map):
-        x = node_map[self.x][0]
-        var = node_map[self.var][0]
-        mean = node_map[self.mean][0]
-        beta = node_map[self.beta][0]
-        gamma = node_map[self.gamma][0]
-        eps = node_map[self.eps][0]
-        return relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=eps.data.numpy().item())[0]
-
-
-def test_fuse_batchnorm():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-
-    BN = gamma * (x - mean) / relay.op.sqrt(var + relay.const(1e-5)) + beta
-
-    out = rewrite(BatchnormCallback(), BN)
-    tvm.ir.assert_structural_equal(
-        out, relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)[0]
-    )
-
-
-def test_no_fuse_batchnorm():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-
-    fake_BN = gamma * (x - mean) / relay.op.sqrt(var + relay.const(1e-5)) - beta
-
-    out = rewrite(BatchnormCallback(), fake_BN)
-    tvm.ir.assert_structural_equal(out, fake_BN)
-
-
-def test_fuse_double_batchnorm():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-
-    BN = gamma * (x - mean) / relay.op.sqrt(var + relay.const(1e-5)) + beta
-    BN2 = gamma * (BN - mean) / relay.op.sqrt(var + relay.const(1e-5)) + beta
-
-    out = rewrite(BatchnormCallback(), BN2)
-
-    bn = relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)[0]
-    bn2 = relay.op.nn.batch_norm(bn, gamma, beta, mean, var, epsilon=1e-5)[0]
-
-    tvm.ir.assert_structural_equal(out, bn2)
-
-
-def test_partial_fuse_double_batchnorm():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-
-    BN = gamma * (x - mean) / relay.op.sqrt(var + relay.const(1e-5)) - beta
-    BN2 = gamma * (BN - mean) / relay.op.sqrt(var + relay.const(1e-5)) + beta
-
-    out = rewrite(BatchnormCallback(), BN2)
-
-    bn2 = relay.op.nn.batch_norm(BN, gamma, beta, mean, var, epsilon=1e-5)[0]
-
-    tvm.ir.assert_structural_equal(out, bn2)
-
-
-def test_fuse_batchnorm_commutation():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-
-    # commute add
-    BN = beta + gamma * (x - mean) / relay.op.sqrt(var + relay.const(1e-5))
-    out = rewrite(BatchnormCallback(), BN)
-    tvm.ir.assert_structural_equal(
-        out, relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)[0]
-    )
-
-    # associate divide/multiply
-    BN = (gamma * (x - mean)) / relay.op.sqrt(var + relay.const(1e-5)) + beta
-    out = rewrite(BatchnormCallback(), BN)
-    tvm.ir.assert_structural_equal(
-        out, relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)[0]
-    )
-
-    # associate multiply/divide
-    BN = gamma * ((x - mean) / relay.op.sqrt(var + relay.const(1e-5))) + beta
-    out = rewrite(BatchnormCallback(), BN)
-    tvm.ir.assert_structural_equal(
-        out, relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)[0]
-    )
-
-
-def test_quadruple_rewrite_dominator():
-    class DominatorRemovalCallback(DFPatternCallback):
-        def __init__(self):
-            super(DominatorRemovalCallback, self).__init__()
-            self.inp = wildcard()
-            self.weight = wildcard()
-            is_conv2d = is_op("nn.conv2d")(self.inp, self.weight)
-            is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(
-                wildcard()
-            ) | is_op("add")(wildcard(), wildcard())
-            reduction = is_op("add")(wildcard(), wildcard())
-            self.pattern = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-        def callback(self, pre, post, node_map):
-            inp = node_map[self.inp][0]
-            weight = node_map[self.weight][0]
-            return relay.op.nn.conv2d(inp, weight)
-
-    inp = relay.var("input")
-    weight = relay.var("weight")
-    # Classic Diamond
-    conv2d = relay.op.nn.conv2d(inp, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Deeper Branch
-    conv2d = relay.op.nn.conv2d(out, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    relu = relay.op.tanh(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = relu + leaky_relu
-
-    # Single Branch
-    conv2d = relay.op.nn.conv2d(out, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relay.op.nn.relu(relu)
-    tanh = relay.op.tanh(relu)
-    out = relu + tanh
-
-    # Fuzzy path/nested Diamond
-    conv2d = relay.op.nn.conv2d(out, weight)
-    relu = relay.op.nn.relu(conv2d)
-    relu = relu + relu
-    tanh = relay.op.tanh(relu)
-    leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-    out = tanh + leaky_relu
-    one = relay.op.nn.conv2d(inp, weight)
-    two = relay.op.nn.conv2d(one, weight)
-    three = relay.op.nn.conv2d(two, weight)
-    four = relay.op.nn.conv2d(three, weight)
-
-    tvm.ir.assert_structural_equal(DominatorRemovalCallback().rewrite(out), four)
-
-
-def algebraic_simplify(expr):
-    zero = is_expr(relay.const(0)) | is_expr(relay.const(0.0))
-    one = is_expr(relay.const(1)) | is_expr(relay.const(1.0))
-
-    class ElwiseNullCallback(DFPatternCallback):
-        def callback(self, pre, post, node_map):
-            return node_map[self.x][0]  # pylint: disable=no-member
-
-    class AddCallback(ElwiseNullCallback):
-        def __init__(self):
-            super(AddCallback, self).__init__()
-            self.x = wildcard()
-            self.pattern = self.x + zero
-
-    class SubCallback(ElwiseNullCallback):
-        def __init__(self):
-            super(SubCallback, self).__init__()
-            self.x = wildcard()
-            self.pattern = self.x - zero
-
-    class MulCallback(ElwiseNullCallback):
-        def __init__(self):
-            super(MulCallback, self).__init__()
-            self.x = wildcard()
-            self.pattern = self.x * one
-
-    class DivCallback(ElwiseNullCallback):
-        def __init__(self):
-            super(DivCallback, self).__init__()
-            self.x = wildcard()
-            self.pattern = self.x / one
-
-    class MulZeroCallback(ElwiseNullCallback):
-        def __init__(self):
-            super(MulZeroCallback, self).__init__()
-            self.x = zero
-            self.pattern = self.x * wildcard()
-
-    class ZeroDivCallback(ElwiseNullCallback):
-        def __init__(self):
-            super(ZeroDivCallback, self).__init__()
-            self.x = zero
-            self.pattern = self.x / wildcard()
-
-    return rewrite(
-        [
-            AddCallback(),
-            SubCallback(),
-            MulCallback(),
-            DivCallback(),
-            MulZeroCallback(),
-            ZeroDivCallback(),
-        ],
-        expr,
-    )
-
-
-def test_algebraic_simplify():
-    x = relay.Var("x")
-    y = relay.Var("y")
-
-    one = relay.const(1)
-    zero = relay.const(0)
-    onef = relay.const(1.0)
-    zerof = relay.const(0.0)
-
-    assert algebraic_simplify(x + zero) == x
-    assert algebraic_simplify(x + zerof) == x
-    assert algebraic_simplify(zero + x) == x
-    assert algebraic_simplify(zerof + x) == x
-
-    assert algebraic_simplify(x - zero) == x
-    assert algebraic_simplify(x - zerof) == x
-
-    assert algebraic_simplify(x * one) == x
-    assert algebraic_simplify(x * onef) == x
-    assert algebraic_simplify(one * x) == x
-    assert algebraic_simplify(onef * x) == x
-    assert algebraic_simplify(x * zero) == zero
-    assert algebraic_simplify(x * zerof) == zerof
-
-    assert algebraic_simplify(x / one) == x
-    assert algebraic_simplify(x / onef) == x
-    assert algebraic_simplify(zero / x) == zero
-    assert algebraic_simplify(zerof / x) == zerof
-
-    tvm.ir.assert_structural_equal(
-        algebraic_simplify((x + zero * y) / one + (y * one) - zero / x), x + y
-    )
-
-
-def test_double_partition():
-    # Pattern 1
-    conv2d_p = is_op("nn.conv2d")(wildcard(), wildcard())
-    bias_add_p = is_op("nn.bias_add")(conv2d_p, wildcard())
-    relu_p = is_op("nn.relu")(bias_add_p)
-
-    # Graph
-    x = relay.var("input")
-    w = relay.var("weight")
-    b = relay.var("bias")
-    w2 = relay.var("weight")
-    b2 = relay.var("bias")
-    conv2d = relay.op.nn.conv2d(x, w)
-    bias_add = relay.op.nn.bias_add(conv2d, b)
-    relu = relay.op.nn.relu(bias_add)
-    conv2d2 = relay.op.nn.conv2d(relu, w2)
-    bias_add2 = relay.op.nn.bias_add(conv2d2, b2)
-
-    partitioned = bias_add2
-    for pat, label in [(relu_p, "conv_bias_relu"), (bias_add_p, "conv_bias")]:
-        partitioned = pat.partition(partitioned, {"Composite": label})
-
-    inpf = relay.var("input")
-    weightf = relay.var("weight")
-    biasf = relay.var("bias")
-    func0 = (
-        relay.Function(
-            [inpf, weightf, biasf],
-            relay.op.nn.relu(relay.op.nn.bias_add(relay.op.nn.conv2d(inpf, weightf), biasf)),
-        )
-        .with_attr("Composite", "conv_bias_relu")
-        .with_attr("PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_")
-    )
-    inpf = relay.var("input")
-    weightf = relay.var("weight")
-    biasf = relay.var("bias")
-    func1 = (
-        relay.Function(
-            [inpf, weightf, biasf], relay.op.nn.bias_add(relay.op.nn.conv2d(inpf, weightf), biasf)
-        )
-        .with_attr("Composite", "conv_bias")
-        .with_attr("PartitionedFromPattern", "nn.conv2d_nn.bias_add_")
-    )
-
-    expected = func1(func0(x, w, b), w2, b2)
-    tvm.ir.assert_structural_equal(partitioned, expected)
-
-
-def test_partition_dominator():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard())
-    reduction = is_op("add")(wildcard(), wildcard())
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    # Classic Diamond
-    inp = relay.var("input")
-    weight = relay.var("weight")
-
-    def generate_diamond(inp, weight):
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        relu = relay.op.nn.relu(relu)
-        leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-        return relu + leaky_relu
-
-    out = generate_diamond(inp * inp, weight * weight)
-    # Check
-    partitioned = diamond.partition(out)
-
-    i = relay.Var("input")
-    w = relay.Var("weight")
-    f = relay.Function([i, w], generate_diamond(i, w)).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_nn.relu_nn.relu_nn.leaky_relu_add_"
-    )
-    tvm.ir.assert_structural_equal(partitioned, f(inp * inp, weight * weight))
-
-
-def test_quadruple_partition_dominator():
-    # Pattern
-    is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    is_unary_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard()) | is_op(
-        "add"
-    )(wildcard(), wildcard())
-    reduction = is_op("add")(wildcard(), wildcard())
-    diamond = dominates(is_conv2d, is_unary_elemwise, reduction)
-
-    inp = relay.var("input")
-    weight = relay.var("weight")
-
-    # Classic Diamond
-    def classic_diamond(inp, weight):
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        relu = relay.op.nn.relu(relu)
-        leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-        return relu + leaky_relu
-
-    # Deeper Branch
-    def deeper_diamond(inp, weight):
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        relu = relay.op.nn.relu(relu)
-        relu = relay.op.tanh(relu)
-        leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-        return relu + leaky_relu
-
-    # Single Branch
-    def single_branch(inp, weight):
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        relu = relay.op.nn.relu(relu)
-        tanh = relay.op.tanh(relu)
-        return relu + tanh
-
-    # Fuzzy path/nested Diamond
-    def nested_diamond(inp, weight):
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        relu = relu + relu
-        tanh = relay.op.tanh(relu)
-        leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-        return tanh + leaky_relu
-
-    partitioned = diamond.partition(
-        nested_diamond(
-            single_branch(deeper_diamond(classic_diamond(inp, weight), weight), weight), weight
-        )
-    )
-
-    functions = []
-    partition_names = [
-        "nn.conv2d_nn.relu_nn.relu_nn.leaky_relu_add_",
-        "nn.conv2d_nn.relu_nn.relu_tanh_nn.leaky_relu_add_",
-        "nn.conv2d_nn.relu_nn.relu_tanh_add_",
-        "nn.conv2d_nn.relu_add_tanh_nn.leaky_relu_add_",
-    ]
-    for i, f in enumerate([classic_diamond, deeper_diamond, single_branch, nested_diamond]):
-        inpf = relay.var("input")
-        weightf = relay.var("weight")
-        functions.append(
-            relay.Function([inpf, weightf], f(inpf, weightf)).with_attr(
-                "PartitionedFromPattern", partition_names[i]
-            )
-        )
-
-    reference = functions[3](
-        functions[2](functions[1](functions[0](inp, weight), weight), weight), weight
-    )
-    tvm.ir.assert_structural_equal(partitioned, reference)
-
-
-def get_BN(x, var, mean, beta, gamma, eps):
-    return gamma * (x - mean) / relay.op.sqrt(var + eps) + beta
-
-
-def test_partition_batchnorm():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-    eps = relay.const(1e-5)
-    BN = get_BN(x, var, mean, beta, gamma, eps)
-
-    xf = relay.var("xf")
-    varf = relay.var("varf")
-    meanf = relay.var("meanf")
-    betaf = relay.var("betaf")
-    gammaf = relay.var("gammaf")
-    # Put the arguments in toplogological order for the reference
-    f = relay.Function(
-        [gammaf, xf, meanf, varf, betaf], get_BN(xf, varf, meanf, betaf, gammaf, eps)
-    ).with_attr("PartitionedFromPattern", "subtract_multiply_add_sqrt_divide_add_")
-
-    partitioned = BatchnormCallback().pattern.partition(BN)
-    reference = f(gamma, x, mean, var, beta)
-    tvm.ir.assert_structural_equal(partitioned, reference)
-
-
-def test_partition_double_batchnorm():
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-    eps = relay.const(1e-5)
-
-    BN = gamma * (x - mean) / relay.op.sqrt(var + eps) + beta
-    BN2 = gamma * (BN - mean) / relay.op.sqrt(var + eps) + beta
-
-    xf = relay.var("xf")
-    varf = relay.var("varf")
-    meanf = relay.var("meanf")
-    betaf = relay.var("betaf")
-    gammaf = relay.var("gammaf")
-    f1 = relay.Function(
-        [gammaf, xf, meanf, varf, betaf], get_BN(xf, varf, meanf, betaf, gammaf, eps)
-    ).with_attr("PartitionedFromPattern", "subtract_multiply_add_sqrt_divide_add_")
-    # The partitioner doesn't replace duplicates, so we use two copies of the function
-    xf2 = relay.var("xf2")
-    varf2 = relay.var("varf2")
-    meanf2 = relay.var("meanf2")
-    betaf2 = relay.var("betaf2")
-    gammaf2 = relay.var("gammaf2")
-    f2 = relay.Function(
-        [gammaf2, xf2, meanf2, varf2, betaf2], get_BN(xf2, varf2, meanf2, betaf2, gammaf2, eps)
-    ).with_attr("PartitionedFromPattern", "subtract_multiply_add_sqrt_divide_add_")
-
-    partitioned = BatchnormCallback().pattern.partition(BN2)
-    reference = f2(gamma, f1(gamma, x, mean, var, beta), mean, var, beta)
-    tvm.ir.assert_structural_equal(partitioned, reference)
-
-
-def test_overlappting_partitions():
-    x = wildcard()
-    gamma = wildcard()
-    beta = wildcard()
-    moving_mean = wildcard()
-    moving_var = wildcard()
-    bn_node = is_op("nn.batch_norm")(x, gamma, beta, moving_mean, moving_var)
-    tuple_get_item_node = TupleGetItemPattern(bn_node, 0)
-
-    x = relay.var("x")
-    var = relay.var("var")
-    mean = relay.var("mean")
-    beta = relay.var("beta")
-    gamma = relay.var("gamma")
-    BN = relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)
-    T1 = BN[0]
-    T2 = BN[0]
-    add = T1 + T2
-
-    assert tuple_get_item_node.partition(add) == add
-
-
-def test_partition_overused():
-    pattern = is_op("nn.relu")(is_op("nn.conv2d")(wildcard(), wildcard()))
-
-    x = relay.var("input")
-    w = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.nn.relu(conv2d)
-    out = relu + conv2d
-
-    assert pattern.partition(out) == out
-
-
-def test_partition_fuzzy_tuple():
-    x = relay.var("x")
-    y = relay.var("y")
-    z = x + y
-    tuple_pattern = is_tuple(None)
-    concat_pattern = is_op("concatenate")(tuple_pattern)
-
-    xp = relay.var("xp")
-    yp = relay.var("yp")
-    zp = relay.var("zp")
-
-    def create_func(args, body):
-        return relay.Function(args, body).with_attr("PartitionedFromPattern", "Tuple_concatenate_")
-
-    def concat(*args):
-        return relay.op.concatenate(relay.expr.Tuple(args), axis=0)
-
-    one = concat_pattern.partition(concat(x))
-    tvm.ir.assert_structural_equal(one, create_func([xp], concat(xp))(x))
-    two = concat_pattern.partition(concat(x, y))
-    tvm.ir.assert_structural_equal(two, create_func([xp, yp], concat(xp, yp))(x, y))
-    three = concat_pattern.partition(concat(x, y, z))
-    tvm.ir.assert_structural_equal(three, create_func([xp, yp, zp], concat(xp, yp, zp))(x, y, z))
-
-
-def test_partition_fuzzy_function_args():
-    func_pattern = FunctionPattern(None, wildcard() + wildcard())(None) + wildcard()
-    x = relay.var("x")
-    y = relay.var("y")
-    z = relay.var("z")
-    b = relay.var("b")
-    xp = relay.var("xp")
-    yp = relay.var("yp")
-    zp = relay.var("zp")
-
-    def create_func(call):
-        N = len(call.op.params)
-        new_params = [relay.var(str(i)) for i in range(N + 1)]
-        label = "add_FunctionCall_add_"
-        if N == 3:
-            label = "add_" + label
-        return relay.Function(
-            new_params, relay.Call(call.op, (new_params[0:-1])) + new_params[-1]
-        ).with_attr("PartitionedFromPattern", label)(*([x, y, z][0:N] + [b]))
-
-    f1 = relay.Function([xp], xp + xp)(x)
-    one = func_pattern.partition(f1 + b)
-    tvm.ir.assert_structural_equal(one, create_func(f1))
-    f2 = relay.Function([xp, yp], xp + yp)(x, y)
-    two = func_pattern.partition(f2 + b)
-    tvm.ir.assert_structural_equal(two, create_func(f2))
-    f3 = relay.Function([xp, yp, zp], xp + yp + zp)(x, y, z)
-    three = func_pattern.partition(f3 + b)
-    tvm.ir.assert_structural_equal(three, create_func(f3))
-
-
-def test_partition_check():
-    pattern = is_op("nn.relu")(is_op("nn.conv2d")(is_var("input"), wildcard()))
-
-    def check(pre):
-        return pre.args[0].attrs.data_layout == "NCHW"
-
-    x = relay.var("input")
-    w = relay.var("weight")
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.nn.relu(conv2d)
-
-    xf = relay.var("input")
-    wf = relay.var("weight")
-    conv2df = relay.op.nn.conv2d(xf, wf)
-    reluf = relay.op.nn.relu(conv2df)
-    func = relay.Function([xf, wf], reluf).with_attr("PartitionedFromPattern", "nn.conv2d_nn.relu_")
-
-    reference = func(x, w)
-    partitioned = pattern.partition(relu, check=check)
-    tvm.ir.assert_structural_equal(partitioned, reference)
-
-    conv2d = relay.op.nn.conv2d(x, w, data_layout="NHWC")
-    relu = relay.op.nn.relu(conv2d)
-    assert relu == pattern.partition(relu, check=check)
-
-
-def test_partition_check_types():
-    pattern = is_op("nn.relu")(is_op("nn.conv2d")(wildcard(), wildcard()))
-
-    def check(pre):
-        conv = pre.args[0]
-        return (conv.attrs.data_layout == "NCHW") and bool(conv.checked_type.shape[0] == 1)
-
-    x = relay.var("input", shape=(1, 10, 10, 10))
-    w = relay.var("weight", shape=(10, 10, 3, 3))
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.nn.relu(conv2d)
-    relu = run_opt_pass(relu, relay.transform.InferType())
-
-    partitioned = pattern.partition(relu, check=check)
-    assert partitioned.op.attrs["PartitionedFromPattern"] == "nn.conv2d_nn.relu_"
-
-    conv2d = relay.op.nn.conv2d(x, w, data_layout="NHWC")
-    relu = relay.op.nn.relu(conv2d)
-    relu = run_opt_pass(relu, relay.transform.InferType())
-    assert relu == pattern.partition(relu, check=check)
-
-    x = relay.var("input", shape=(2, 10, 10, 10))
-    w = relay.var("weight", shape=(10, 10, 3, 3))
-    conv2d = relay.op.nn.conv2d(x, w)
-    relu = relay.op.nn.relu(conv2d)
-    relu = run_opt_pass(relu, relay.transform.InferType())
-    assert relu == pattern.partition(relu, check=check)
-
-
-def conv_bias_relu(x, w, b):
-    conv2d = relay.op.nn.conv2d(x, w)
-    bias_add = relay.op.nn.bias_add(conv2d, b)
-    relu = relay.op.nn.relu(bias_add)
-    return relu
-
-
-def test_partition_option():
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-
-    conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    bias = conv2d.optional(lambda x: is_op("nn.bias_add")(x, wildcard()))
-    pattern1 = is_op("nn.relu")(bias)
-
-    conv2d = is_op("nn.conv2d")(wildcard(), wildcard())
-    bias = is_op("nn.bias_add")(conv2d, wildcard())
-    pattern2 = bias.optional(lambda x: is_op("nn.relu")(x))
-
-    relu = conv_bias_relu(x, w, b)
-
-    xf = relay.var("x")
-    wf = relay.var("w")
-    bf = relay.var("b")
-    func = relay.Function([xf, wf, bf], conv_bias_relu(xf, wf, bf)).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_"
-    )
-
-    assert pattern1.match(relu)
-    tvm.ir.assert_structural_equal(func(x, w, b), pattern1.partition(relu))
-
-    assert pattern2.match(relu)
-    tvm.ir.assert_structural_equal(func(x, w, b), pattern2.partition(relu))
-
-
-def test_partition_function():
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-
-    x1 = relay.var("x1")
-    w1 = relay.var("w1")
-
-    wc_x = wildcard()
-    wc_w = wildcard()
-    wc_b = wildcard()
-    wc_x1 = wildcard()
-    wc_w1 = wildcard()
-
-    func_pattern = FunctionPattern([wc_x1, wc_w1], is_op("nn.conv2d")(wc_x1, wc_w1))
-    pattern = func_pattern(wc_x, wc_w) + wc_b
-
-    func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1))
-    expr = func(x, w) + b + b
-
-    x2 = relay.var("x2")
-    w2 = relay.var("w2")
-    b2 = relay.var("b2")
-    func2 = relay.Function([x2, w2, b2], func(x2, w2) + b2).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_FunctionCall_add_"
-    )
-    expr2 = func2(x, w, b) + b
-    tvm.ir.assert_structural_equal(pattern.partition(expr), expr2)
-
-
-def test_partition_optional_function():
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-
-    x1 = relay.var("x1")
-    w1 = relay.var("w1")
-
-    wc_x = wildcard()
-    wc_w = wildcard()
-    wc_x1 = wildcard()
-    wc_w1 = wildcard()
-
-    func_pattern0 = FunctionPattern(
-        [wc_x1, wc_w1], is_op("sigmoid")(is_op("nn.conv2d")(wc_x1, wc_w1))
-    )
-    func_pattern1 = FunctionPattern(
-        [wc_x1, wc_w1], is_op("nn.relu")(is_op("nn.conv2d")(wc_x1, wc_w1))
-    )
-    pattern = func_pattern0(wc_x, wc_w) | func_pattern1(wc_x, wc_w)
-
-    func = relay.Function([x1, w1], relay.nn.relu(relay.nn.conv2d(x1, w1)))
-    expr = func(x, w) + b
-
-    x2 = relay.var("x2")
-    w2 = relay.var("w2")
-    func2 = relay.Function([x2, w2], func(x2, w2)).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_nn.relu_FunctionCall_"
-    )
-    expr2 = func2(x, w) + b
-    tvm.ir.assert_structural_equal(pattern.partition(expr), expr2)
-
-
-def test_rewrite_function_with_fuzzy_body():
-    """Allow Rewriting a function with a fuzzy body via dominator analysis"""
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-
-    x1 = relay.var("x1")
-    w1 = relay.var("w1")
-
-    wc_x = wildcard()
-    wc_w = wildcard()
-    wc_b = wildcard()
-    wc_x1 = wildcard()
-    wc_w1 = wildcard()
-
-    func_pattern = FunctionPattern([wc_x1, wc_w1], wildcard())
-    pattern = func_pattern(wc_x, wc_w) + wc_b
-
-    func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1))
-    expr = func(x, w) + b + b
-
-    class TestRewrite(DFPatternCallback):
-        def __init__(self):
-            super(TestRewrite, self).__init__()
-            self.pattern = pattern
-
-        def callback(self, pre, post, node_map):
-            return x + w
-
-    out = rewrite(TestRewrite(), expr)
-    tvm.ir.assert_structural_equal(out, x + w + b)
-
-
-def test_partition_function_with_fuzzy_body():
-    """
-    Allow Rewriting a function with a fuzzy body via dominator analysis
-    """
-    x = relay.var("x")
-    w = relay.var("w")
-    b = relay.var("b")
-
-    x1 = relay.var("x1")
-    w1 = relay.var("w1")
-
-    wc_x = wildcard()
-    wc_w = wildcard()
-    wc_b = wildcard()
-    wc_x1 = wildcard()
-    wc_w1 = wildcard()
-
-    func_pattern = FunctionPattern([wc_x1, wc_w1], wildcard())
-    pattern = func_pattern(wc_x, wc_w) + wc_b
-
-    func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1))
-    expr = func(x, w) + b + b
-
-    x2 = relay.var("x2")
-    w2 = relay.var("w2")
-    b2 = relay.var("b2")
-    func2 = relay.Function([x2, w2, b2], func(x2, w2) + b2).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_FunctionCall_add_"
-    )
-    expr2 = func2(x, w, b) + b
-    tvm.ir.assert_structural_equal(pattern.partition(expr), expr2)
-
-
-def test_match_match():
-    add_pattern = is_op("add")(wildcard(), wildcard())
-
-    class TestRewrite(DFPatternCallback):
-        def __init__(self):
-            super(TestRewrite, self).__init__()
-            self.pattern = add_pattern
-
-        def callback(self, pre, post, node_map):
-            return post.args[0] - post.args[1]
-
-    mod = tvm.IRModule({})
-    tvm.relay.prelude.Prelude(mod)
-    # Apply rewrite on IR including relay.Match
-    out = rewrite(TestRewrite(), mod["tensor_concatenate_int64"])
-    tvm.ir.assert_structural_equal(mod["tensor_concatenate_int64"], out)
-
-
-def test_partition_constant_embedding():
-    x = relay.var("x")
-    w = relay.var("w")
-    wc = relay.const(1)
-    b = relay.var("b")
-
-    xf = relay.var("x")
-    wf = relay.var("w")
-    bf = relay.var("b")
-    embeded_func = relay.Function([xf, bf], conv_bias_relu(xf, wc, bf)).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_"
-    )
-    xf = relay.var("x")
-    wf = relay.var("w")
-    bf = relay.var("b")
-    lifted_func = relay.Function([xf, wf, bf], conv_bias_relu(xf, wf, bf)).with_attr(
-        "PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_"
-    )
-    relu = conv_bias_relu(x, w, b)
-    reluc = conv_bias_relu(x, wc, b)
-
-    # Check lifting of wildcard matches
-    pattern = is_op("nn.relu")(
-        is_op("nn.bias_add")(is_op("nn.conv2d")(wildcard(), wildcard()), wildcard())
-    )
-    tvm.ir.assert_structural_equal(lifted_func(x, w, b), pattern.partition(relu))
-    tvm.ir.assert_structural_equal(lifted_func(x, wc, b), pattern.partition(reluc))
-
-    # Check lifting of input matches
-    pattern = is_op("nn.relu")(
-        is_op("nn.bias_add")(is_op("nn.conv2d")(wildcard(), is_var()), wildcard())
-    )
-    tvm.ir.assert_structural_equal(lifted_func(x, w, b), pattern.partition(relu))
-    tvm.ir.assert_structural_equal(reluc, pattern.partition(reluc))  # Constants are not Inputs
-
-    # Check embedding of constant matches
-    pattern = is_op("nn.relu")(
-        is_op("nn.bias_add")(is_op("nn.conv2d")(wildcard(), is_constant()), wildcard())
-    )
-    tvm.ir.assert_structural_equal(relu, pattern.partition(relu))
-    tvm.ir.assert_structural_equal(embeded_func(x, b), pattern.partition(reluc))
-
-    # Check embedding of constant ExprPatterns
-    pattern = is_op("nn.relu")(
-        is_op("nn.bias_add")(is_op("nn.conv2d")(wildcard(), is_expr(wc)), wildcard())
-    )
-    tvm.ir.assert_structural_equal(relu, pattern.partition(relu))
-    tvm.ir.assert_structural_equal(embeded_func(x, b), pattern.partition(reluc))
-
-    # Check lifting/embedding of Alt matches
-    pattern = is_op("nn.relu")(
-        is_op("nn.bias_add")(is_op("nn.conv2d")(wildcard(), is_var() | is_constant()), wildcard())
-    )
-    tvm.ir.assert_structural_equal(lifted_func(x, w, b), pattern.partition(relu))
-    tvm.ir.assert_structural_equal(embeded_func(x, b), pattern.partition(reluc))
-
-    # Check lifting/embedding of Alt matches with the other ordering
-    pattern = is_op("nn.relu")(
-        is_op("nn.bias_add")(is_op("nn.conv2d")(wildcard(), is_constant() | is_var()), wildcard())
-    )
-    tvm.ir.assert_structural_equal(lifted_func(x, w, b), pattern.partition(relu))
-    tvm.ir.assert_structural_equal(embeded_func(x, b), pattern.partition(reluc))
-
-
-def test_rewrite_once():
-    # This class recursively removes the arguments to concat until there is nothing left to concatenate.
-    class ConcatRewriter(DFPatternCallback):
-        def __init__(self, rewrite_once):
-            super().__init__(rewrite_once=rewrite_once)
-            self.pattern = is_op("concatenate")(None)
-
-        def callback(self, pre, post, node_map):
-            concat_args = post.args[0]
-            # Remove the last argument
-            new_args = [concat_args[i] for i in range(len(concat_args) - 1)]
-            if new_args:
-                return relay.op.concatenate(relay.expr.Tuple(new_args), axis=0)
-            else:
-                return concat_args[0]
-
-    x = relay.var("x")
-    y = relay.var("y")
-    z = relay.var("z")
-    concat = relay.op.concatenate(relay.expr.Tuple([x, y, z]), axis=0)
-
-    def test_one_callback():
-        # Let the rewriter run recursively
-        out = rewrite(ConcatRewriter(False), concat)
-        expected = x
-        tvm.ir.assert_structural_equal(out, expected)
-
-        # Run the rewriter once
-        out = rewrite(ConcatRewriter(True), concat)
-        expected = relay.op.concatenate(relay.expr.Tuple([x, y]), axis=0)
-        tvm.ir.assert_structural_equal(out, expected)
-
-    def test_multi_callbacks():
-        # This class recursively add a nn.relu operator after nn.softmax
-        class OneMoreReluRewriter(DFPatternCallback):
-            def __init__(self, rewrite_once):
-                super().__init__(rewrite_once=rewrite_once)
-                self.pattern = is_op("nn.softmax")(None)
-
-            def callback(self, pre, post, node_map):
-                return relay.nn.relu(post)
-
-        def before():
-            # Before:
-            #    x    y    z
-            #    |    |    |
-            #       concat
-            #         |
-            #      softmax
-            return relay.nn.softmax(concat)
-
-        def once_concat():
-            # ConcatRewrite once, OneMoreReluRewrite once
-            # Expected:
-            #   x    y
-            #   |    |
-            #   concat
-            #      |
-            #   softmax
-            #      |
-            #    relu
-            return relay.nn.relu(
-                relay.nn.softmax(relay.op.concatenate(relay.expr.Tuple([x, y]), axis=0))
-            )
-
-        def recursive_concat():
-            # ConcatRewrite recursively, OneMoreReluRewrite once
-            # Expected:
-            #      x
-            #      |
-            #   softmax
-            #      |
-            #    relu
-            return relay.nn.relu(relay.nn.softmax(x))
-
-        # Run ConcatRewriter once, OneMoreReluRewriter once
-        out = rewrite(
-            [OneMoreReluRewriter(True), ConcatRewriter(True)],
-            before(),
-        )
-        tvm.ir.assert_structural_equal(out, once_concat())
-
-        # Run ConcatRewriter recursively, OneMoreReluRewriter once
-        out = rewrite(
-            [OneMoreReluRewriter(True), ConcatRewriter(False)],
-            before(),
-        )
-        tvm.ir.assert_structural_equal(out, recursive_concat())
-
-    test_one_callback()
-    test_multi_callbacks()
-
-
-def test_matched_outside_but_dominated():
-    """In this example the pattern matches the nn.conv2d/add/multiply flow. Even though the
-    add output is consumed by the sigmoid, the sigmoid itself is dominated by the multiply.
-    So partitioning can proceed, all be it with a duplication of the add."""
-    in_mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data: Tensor[(16, 16, 32, 32), float16], %weight: Tensor[(32, 16, 3, 3), float16], %bias: Tensor[(32), float32]) -> Tensor[(16, 32, 32, 32), float32] {
-          %0 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC");
-          %1 = layout_transform(%weight, src_layout="OIHW", dst_layout="OHWI");
-          %2 = expand_dims(%bias, axis=1, num_newaxis=2);
-          %3 = expand_dims(%2, axis=0);
-          %4 = nn.conv2d(%0, %1, padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="OHWI", out_dtype="float32");
-          %5 = layout_transform(%3, src_layout="NCHW", dst_layout="NHWC");
-          %6 = add(%4, %5);
-          %7 = sigmoid(%6);
-          %8 = multiply(%6, %7);
-          layout_transform(%8, src_layout="NHWC", dst_layout="NCHW")
-        }
-        """
-    )
-    expected_mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data: Tensor[(16, 16, 32, 32), float16], %weight: Tensor[(32, 16, 3, 3), float16], %bias: Tensor[(32), float32]) -> Tensor[(16, 32, 32, 32), float32] {
-          %2 = expand_dims(%bias, axis=1, num_newaxis=2);
-          %3 = expand_dims(%2, axis=0);
-          %4 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC");
-          %5 = layout_transform(%weight, src_layout="OIHW", dst_layout="OHWI");
-          %6 = nn.conv2d(%4, %5, padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="OHWI", out_dtype="float32");
-          %7 = layout_transform(%3, src_layout="NCHW", dst_layout="NHWC");
-          %8 = add(%6, %7);
-          %9 = sigmoid(%8);
-          %10 = fn (%FunctionVar_0_0, %FunctionVar_0_1, %FunctionVar_0_2, %FunctionVar_0_3, PartitionedFromPattern="nn.conv2d_add_multiply_") {
-            %0 = nn.conv2d(%FunctionVar_0_0, %FunctionVar_0_1, padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="OHWI", out_dtype="float32");
-            %1 = add(%0, %FunctionVar_0_2);
-            multiply(%1, %FunctionVar_0_3)
-          };
-          %11 = %10(%4, %5, %7, %9);
-          layout_transform(%11, src_layout="NHWC", dst_layout="NCHW")
-        }
-        """
-    )
-    pattern = is_op("multiply")(
-        is_op("add")(is_op("nn.conv2d")(wildcard(), wildcard()), wildcard()), wildcard()
-    )
-    actual_mod = tvm.IRModule.from_expr(pattern.partition(in_mod["main"]))
-    actual_mod = relay.transform.InferType()(actual_mod)
-    tvm.ir.assert_structural_equal(actual_mod, expected_mod)
-
-
-def test_partition_parallel_branch_with_same_input():
-    """In this example, conv2d's two consumer(add and multiply) on two different branches are
-    merged into one partition, make sure that the partitioned function has no redundant parameters"""
-    # Pattern
-    path1 = is_op("multiply")(wildcard(), wildcard())
-    path2 = is_op("add")(wildcard(), wildcard())
-    pattern = is_op("add")(path1, path2)
-
-    i = relay.Var("input")
-    w = relay.Var("weight")
-    l = relay.Var("left")
-    r = relay.Var("right")
-
-    conv2d = relay.op.nn.conv2d(i, w)
-    branch1 = relay.multiply(l, conv2d)
-    branch2 = relay.add(conv2d, r)
-    add = relay.add(branch1, branch2)
-
-    lf = relay.Var("leftf")
-    mf = relay.Var("midf")
-    rf = relay.Var("rightf")
-    f = relay.Function([lf, mf, rf], (lf * mf) + (mf + rf)).with_attr(
-        "PartitionedFromPattern", "multiply_add_add_"
-    )
-
-    partitioned = pattern.partition(add)
-    reference = f(l, conv2d, r)
-    tvm.ir.assert_structural_equal(partitioned, reference)
-
-
-def test_rewrite_with_pattern_recursion():
-    data = relay.var("data", relay.TensorType((2, 8), "float32"))
-    dense_weight = relay.const(np.zeros((4, 8)))
-    feat = relay.nn.dense(data, dense_weight)
-    feat = relay.cast(feat, "float32")
-    feat = relay.cast(feat, "float32")
-    feat = relay.cast(feat, "float32")
-    feat = relay.cast(feat, "float32")
-    feat = relay.cast(feat, "float32")
-    oup = relay.cast(feat, "float32")
-
-    expected = relay.nn.relu(oup)
-
-    class TheRewrite(DFPatternCallback):
-        def __init__(self, pattern):
-            super(TheRewrite, self).__init__(rewrite_once=True)
-            self.pattern = pattern
-
-        def callback(self, pre, post, node_map):
-            return relay.nn.relu(post)
-
-    def test_reset_call_args():
-        dense_pattern = is_op("nn.dense")(wildcard(), wildcard())
-        wildcard_redirect = wildcard()
-        the_pattern = is_op("cast")(wildcard_redirect)
-        the_pattern2 = the_pattern | dense_pattern
-        wildcard_redirect.redirect_to(the_pattern2)
-
-        actual = rewrite(TheRewrite(the_pattern), oup)
-        tvm.ir.assert_structural_equal(actual, expected)
-
-    def test_reset_alt_left():
-        dense_pattern = is_op("nn.dense")(wildcard(), wildcard())
-        wildcard_redirect = wildcard()
-        or_pattern = wildcard_redirect | dense_pattern
-        the_pattern = is_op("cast")(or_pattern)
-        wildcard_redirect.redirect_to(the_pattern)
-
-        actual = rewrite(TheRewrite(the_pattern), oup)
-        tvm.ir.assert_structural_equal(actual, expected)
-
-    def test_reset_alt_right():
-        dense_pattern = is_op("nn.dense")(wildcard(), wildcard())
-        wildcard_redirect = wildcard()
-        or_pattern = dense_pattern | wildcard_redirect
-        the_pattern = is_op("cast")(or_pattern)
-        wildcard_redirect.redirect_to(the_pattern)
-
-        actual = rewrite(TheRewrite(the_pattern), oup)
-        tvm.ir.assert_structural_equal(actual, expected)
-
-    test_reset_call_args()
-    test_reset_alt_left()
-    test_reset_alt_right()
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_debug.py b/tests/python/relay/test_debug.py
deleted file mode 100644
index 61557867f070..000000000000
--- a/tests/python/relay/test_debug.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from tvm.relay import var, const, create_executor
-from tvm.relay.op import debug
-
-
-_test_debug_hit = False
-
-
-def test_debug():
-    global _test_debug_hit
-    x = var("x", shape=(), dtype="int32")
-    _test_debug_hit = False
-
-    def did_exec(x):
-        global _test_debug_hit
-        _test_debug_hit = True
-
-    prog = debug(x, debug_func=did_exec)
-    result = create_executor().evaluate(prog, {x: const(1, "int32")})
-    assert _test_debug_hit
-    assert result.numpy() == 1
-
-
-def test_debug_with_expr():
-    global _test_debug_hit
-    _test_debug_hit = False
-    x = var("x", shape=(), dtype="int32")
-    _test_debug_hit = False
-
-    def did_exec(x):
-        global _test_debug_hit
-        _test_debug_hit = True
-
-    prog = debug(x + x * x, debug_func=did_exec)
-    result = create_executor().evaluate(prog, {x: const(2, "int32")})
-    assert _test_debug_hit
-    assert result.numpy() == 6
diff --git a/tests/python/relay/test_executor.py b/tests/python/relay/test_executor.py
deleted file mode 100644
index 04662f21ae9e..000000000000
--- a/tests/python/relay/test_executor.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-
-from tvm import TVMError
-from tvm.relay.backend import Executor
-
-
-def test_create_executor():
-    executor = Executor("aot")
-    assert executor.name == "aot"
-
-
-def test_create_executor_with_options():
-    executor = Executor("aot", {"interface-api": "c"})
-    assert executor.name == "aot"
-    assert executor["interface-api"] == "c"
-
-
-def test_create_executor_with_default():
-    executor = Executor("graph")
-    assert not executor["link-params"]
-
-
-def test_attr_check():
-    executor = Executor("aot", {"interface-api": "c"})
-    assert "woof" not in executor
-    assert "interface-api" in executor
-
-
-def test_create_executor_not_found():
-    with pytest.raises(TVMError, match='Executor "woof" is not defined'):
-        Executor("woof", {})
-
-
-def test_create_executor_attr_not_found():
-    with pytest.raises(TVMError, match='Attribute "woof" is not available on this Executor'):
-        Executor("aot", {"woof": "bark"})
-
-
-def test_create_executor_attr_type_incorrect():
-    with pytest.raises(
-        TVMError,
-        match='Attribute "interface-api" should have type "runtime.String"'
-        ' but instead found "runtime.BoxBool"',
-    ):
-        Executor("aot", {"interface-api": True})
-
-
-def test_list_executors():
-    assert "aot" in Executor.list_registered()
-
-
-@pytest.mark.parametrize("executor", [Executor("aot").name, "aot"])
-def test_list_executor_options(executor):
-    aot_options = Executor.list_registered_options(executor)
-    assert "interface-api" in aot_options
-    assert aot_options["interface-api"] == "runtime.String"
-
-
-def test_list_executor_options_not_found():
-    with pytest.raises(TVMError, match='Executor "woof" is not defined'):
-        Executor.list_registered_options("woof")
diff --git a/tests/python/relay/test_expr_functor.py b/tests/python/relay/test_expr_functor.py
deleted file mode 100644
index 930cbd926080..000000000000
--- a/tests/python/relay/test_expr_functor.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import ExprFunctor, ExprMutator, ExprVisitor
-
-
-def check_visit(expr):
-    try:
-        ef = ExprFunctor()
-        ef.visit(expr)
-        assert False
-    except NotImplementedError:
-        pass
-
-    ev = ExprVisitor()
-    ev.visit(expr)
-
-    em = ExprMutator()
-    assert expr == em.visit(expr)
-
-
-def test_constant():
-    check_visit(relay.const(1.0))
-
-
-def test_tuple():
-    t = relay.Tuple([relay.var("x", shape=())])
-    check_visit(t)
-
-
-def test_var():
-    v = relay.var("x", shape=())
-    check_visit(v)
-
-
-def test_global():
-    v = relay.GlobalVar("f")
-    check_visit(v)
-
-
-def test_function():
-    x = relay.var("x", shape=())
-    y = relay.var("y", shape=())
-    params = [x, y]
-    body = x + y
-    ret_type = relay.TensorType(())
-    type_params = []
-    attrs = None  # How to build?
-    f = relay.Function(params, body, ret_type, type_params, attrs)
-    check_visit(f)
-
-
-def test_call():
-    x = relay.var("x", shape=())
-    y = relay.var("y", shape=())
-    call = relay.op.add(x, y)
-    check_visit(call)
-
-
-def test_let():
-    x = relay.var("x", shape=())
-    value = relay.const(2.0)
-    body = x + x
-    l = relay.Let(x, value, body)
-    check_visit(l)
-
-
-def test_ite():
-    cond = relay.var("x", shape=(), dtype="bool")
-    ite = relay.If(cond, cond, cond)
-    check_visit(ite)
-
-
-def test_get_item():
-    t = relay.Tuple([relay.var("x", shape=())])
-    t = relay.TupleGetItem(t, 0)
-    check_visit(t)
-
-
-def test_ref_create():
-    r = relay.expr.RefCreate(relay.const(1.0))
-    check_visit(r)
-
-
-def test_ref_read():
-    ref = relay.expr.RefCreate(relay.const(1.0))
-    r = relay.expr.RefRead(ref)
-    check_visit(r)
-
-
-def test_ref_write():
-    ref = relay.expr.RefCreate(relay.const(1.0))
-    r = relay.expr.RefWrite(ref, relay.const(2.0))
-    check_visit(r)
-
-
-def test_memo():
-    expr = relay.const(1)
-    for _ in range(100):
-        expr = expr + expr
-    check_visit(expr)
-
-
-def test_match():
-    p = relay.prelude.Prelude()
-    check_visit(p.mod[p.map])
-
-
-def test_match_completeness():
-    p = relay.prelude.Prelude()
-    _, _, nil = p.mod.get_type("List")
-    for completeness in [True, False]:
-        match_expr = relay.adt.Match(nil, [], complete=completeness)
-        result_expr = ExprMutator().visit(match_expr)
-        # ensure the mutator doesn't mangle the completeness flag
-        assert result_expr.complete == completeness
-
-
-if __name__ == "__main__":
-    test_constant()
-    test_tuple()
-    test_var()
-    test_global()
-    test_function()
-    test_call()
-    test_let()
-    test_ite()
-    test_ref_create()
-    test_ref_read()
-    test_ref_write()
-    test_memo()
-    test_match()
-    test_match_completeness()
diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py
deleted file mode 100644
index 873475ac1ce7..000000000000
--- a/tests/python/relay/test_external_codegen.py
+++ /dev/null
@@ -1,401 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for graph partitioning."""
-
-import sys
-from collections import OrderedDict
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import relay, runtime
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-from utils.external_codegen import (
-    update_lib,
-    set_external_func_attr,
-    parametrize_external_codegen_checks,
-    parametrize_external_json_codegen_checks,
-    check_graph_executor_result,
-    check_vm_result,
-)
-
-
-@parametrize_external_codegen_checks
-def test_multi_node_subgraph(check_result):
-    x = relay.var("x", shape=(10, 10))
-    w0 = relay.var("w0", shape=(10, 10))
-    w1 = relay.var("w1", shape=(10, 10))
-    w2 = relay.var("w2", shape=(10, 10))
-    w3 = relay.var("w3", shape=(10, 10))
-    w4 = relay.var("w4", shape=(10, 10))
-    w5 = relay.var("w5", shape=(10, 10))
-    w6 = relay.var("w6", shape=(10, 10))
-    w7 = relay.var("w7", shape=(10, 10))
-
-    # subgraph0
-    x0 = relay.var("x0", shape=(10, 10))
-    w00 = relay.var("w00", shape=(10, 10))
-    w01 = relay.var("w01", shape=(10, 10))
-    w02 = relay.var("w02", shape=(10, 10))
-    z00 = relay.add(x0, w00)
-    p00 = relay.subtract(z00, w01)
-    q00 = relay.multiply(p00, w02)
-    subgraph0 = relay.Function([x0, w00, w01, w02], q00)
-    subgraph0 = set_external_func_attr(subgraph0, "ccompiler", "ccompiler_0")
-    call0 = relay.Call(subgraph0, [x, w0, w1, w2])
-
-    # subgraph1
-    x1 = relay.var("x1", shape=(10, 10))
-    w10 = relay.var("w10", shape=(10, 10))
-    w11 = relay.var("w11", shape=(10, 10))
-    w12 = relay.var("w12", shape=(10, 10))
-    z10 = relay.add(x1, w10)
-    p10 = relay.subtract(z10, w11)
-    q10 = relay.multiply(p10, w12)
-    subgraph1 = relay.Function([x1, w10, w11, w12], q10)
-    subgraph1 = set_external_func_attr(subgraph1, "ccompiler", "ccompiler_1")
-    call1 = relay.Call(subgraph1, [x, w3, w4, w5])
-
-    # Other parts on TVM
-    z2 = relay.add(x, w6)
-    q2 = relay.subtract(z2, w7)
-
-    r = relay.concatenate((call0, call1, q2), axis=0)
-    f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r)
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = relay.transform.InferType()(mod)
-
-    x_data = np.random.rand(10, 10).astype("float32")
-    w_data = []
-    for _ in range(8):
-        w_data.append(np.random.rand(10, 10).astype("float32"))
-
-    map_inputs = OrderedDict([("x", x_data)] + [("w{}".format(i), w_data[i]) for i in range(8)])
-    check_result(
-        mod,
-        map_inputs,
-        (30, 10),
-        np.concatenate(
-            (
-                ((x_data + w_data[0]) - w_data[1]) * w_data[2],
-                ((x_data + w_data[3]) - w_data[4]) * w_data[5],
-                x_data + w_data[6] - w_data[7],
-            ),
-            axis=0,
-        ),
-    )
-
-
-@parametrize_external_codegen_checks
-def test_extern_gcc_single_op(check_result):
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-
-    x0 = relay.var("x0", shape=(8, 8))
-    y0 = relay.var("y0", shape=(8, 8))
-    z = x0 + y0
-    f = relay.Function([x0, y0], z)
-    f = set_external_func_attr(f, "ccompiler", "ccompiler_0")
-    call = relay.Call(f, [x, y])
-    mod = tvm.IRModule.from_expr(call)
-    x_data = np.random.rand(8, 8).astype("float32")
-    y_data = np.random.rand(8, 8).astype("float32")
-
-    check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data)
-
-
-@parametrize_external_codegen_checks
-def test_extern_gcc_single_op_int(check_result):
-    x = relay.var("x", shape=(8, 8), dtype="int32")
-    y = relay.var("y", shape=(8, 8), dtype="int32")
-
-    x0 = relay.var("x0", shape=(8, 8), dtype="int32")
-    y0 = relay.var("y0", shape=(8, 8), dtype="int32")
-    z = x0 + y0
-    f = relay.Function([x0, y0], z)
-    f = set_external_func_attr(f, "ccompiler", "ccompiler_0")
-    call = relay.Call(f, [x, y])
-    mod = tvm.IRModule.from_expr(call)
-    x_data = np.random.rand(8, 8).astype("int32")
-    y_data = np.random.rand(8, 8).astype("int32")
-
-    check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data)
-
-
-@parametrize_external_codegen_checks
-def test_extern_gcc(check_result):
-    x = relay.var("x", shape=(2, 2))
-    y = relay.var("y", shape=(2, 2))
-
-    # subgraph for mul
-    x0 = relay.var("x0", shape=(2, 2))
-    y0 = relay.var("y0", shape=(2, 2))
-    mul = x0 * y0
-    mul = relay.Function([x0, y0], mul)
-    mul = set_external_func_attr(mul, "ccompiler", "ccompiler_2")
-    call_mul = relay.Call(mul, [y, y])
-
-    # subgraph for add
-    x1 = relay.var("x1", shape=(2, 2))
-    y1 = relay.var("y1", shape=(2, 2))
-    add = x1 + y1
-    add = relay.Function([x1, y1], add)
-    add = set_external_func_attr(add, "ccompiler", "ccompiler_1")
-    call_add = relay.Call(add, [x, x])
-
-    # subgraph for sub
-    x2 = relay.var("x2", shape=(2, 2))
-    y2 = relay.var("y2", shape=(2, 2))
-    sub = x2 - y2
-    sub = relay.Function([x2, y2], sub)
-    sub = set_external_func_attr(sub, "ccompiler", "ccompiler_0")
-    call_sub = relay.Call(sub, [call_mul, call_add])
-    mod = tvm.IRModule.from_expr(call_sub)
-
-    x_data = np.random.rand(2, 2).astype("float32")
-    y_data = np.random.rand(2, 2).astype("float32")
-
-    inputs = OrderedDict(
-        [
-            ("y", y_data),
-            ("x", x_data),
-        ]
-    )
-
-    check_result(mod, inputs, (2, 2), (y_data * y_data) - (x_data + x_data))
-
-
-# TODO(mbs): The check_aot_executor_result does not support the list-of-targets, mostly because
-# tvm.testing.aot.compile_and_run requires the target to be a kind name string, and
-# tvm.testing.aot.compile_models requires a single Target object. However, code outside of
-# tvm.testing.aot is ready for this more general form.
-@pytest.mark.parametrize("check_result", [check_graph_executor_result, check_vm_result])
-def test_extern_gcc_with_target_instance(check_result):
-    shape = (8, 8)
-    dtype = "int32"
-
-    def make_mod():
-        x0 = relay.var("x0", shape=shape, dtype=dtype)
-        y0 = relay.var("y0", shape=shape, dtype=dtype)
-        z = x0 + y0
-        f = relay.Function([x0, y0], z)
-        f = set_external_func_attr(f, "ccompiler", "ccompiler_0")
-        x = relay.var("x", shape=shape, dtype=dtype)
-        y = relay.var("y", shape=shape, dtype=dtype)
-        call = relay.Call(f, [x, y])
-        return tvm.IRModule.from_expr(call)
-
-    host_target = tvm.target.Target("llvm")
-    generic_target = tvm.target.Target("llvm", host=host_target)
-    # The header attribute is just whitespace, so compilation is as usual.
-    good_extern_codegen_target = tvm.target.Target(
-        {"kind": "ccompiler", "header": "// Good"}, host=host_target
-    )
-    # The header attribute is ill-formed, so compilation is expected to fail.
-    bogus_extern_codegen_target = tvm.target.Target(
-        {"kind": "ccompiler", "header": "Bogus"}, host=host_target
-    )
-
-    mod = make_mod()
-
-    x_data = np.random.rand(*shape).astype(dtype)
-    y_data = np.random.rand(*shape).astype(dtype)
-    expected_result = x_data + y_data
-    inputs = {"x": x_data, "y": y_data}
-
-    check_result(
-        mod, inputs, shape, expected_result, target=[generic_target, good_extern_codegen_target]
-    )
-
-    with pytest.raises(RuntimeError):
-        check_result(
-            mod,
-            inputs,
-            shape,
-            expected_result,
-            target=[generic_target, bogus_extern_codegen_target],
-        )
-
-
-@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now")
-@pytest.mark.parametrize("check_result", [check_graph_executor_result, check_vm_result])
-def test_extern_gcc_consts(check_result):
-    shape = (8, 8)
-    dtype = "float32"
-    x = relay.var("x", shape=shape)
-    y0_data = np.random.uniform(0, 1, shape).astype(dtype)
-
-    x0 = relay.var("x0", shape=shape)
-    y0_const = relay.const(y0_data, dtype)
-    z = x0 + y0_const
-    f = relay.Function([x0], z)
-    f = set_external_func_attr(f, "ccompiler", "ccompiler_0")
-    call = relay.Call(f, [x])
-    mod = tvm.IRModule.from_expr(call)
-
-    # Note that while the VMCompiler get_params() will return all 'parameters' from both
-    # TVM and external codegen compiled code, the GraphExecutor.get_params() will return only
-    # those from non-external modules. So in the following we'll test by execution rather than
-    # test by inspection.
-    x_data = np.random.rand(*shape).astype(dtype)
-    inputs = {"x": x_data}
-    expected_result = x_data + y0_data
-    check_result(mod, inputs, shape, expected_result, target="llvm")
-
-
-@pytest.mark.skipif(
-    not tvm.get_global_func("relay.ext.dnnl", True),
-    reason="skip because DNNL codegen is not available",
-)
-@parametrize_external_json_codegen_checks
-def test_extern_dnnl_padding(check_result):
-    dtype = "float32"
-    ishape = (1, 1, 99, 12)
-    w1shape = (54, 1, 3, 3)
-    data0 = relay.var("data0", shape=(ishape), dtype=dtype)
-    weight0 = relay.var("weight0", shape=(w1shape), dtype=dtype)
-    out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), strides=(2, 2), padding=(1, 0, 1, 1))
-    f = relay.Function([data0, weight0], out)
-    ref_mod = tvm.IRModule()
-    ref_mod["main"] = f
-
-    data1 = relay.var("data0", shape=(ishape), dtype=dtype)
-    weight1 = relay.var("weight0", shape=(w1shape), dtype=dtype)
-    f = set_external_func_attr(f, "dnnl", "dnnl_0")
-    call = relay.Call(f, [data1, weight1])
-    mod = tvm.IRModule.from_expr(call)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    w_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()).evaluate()(
-        i_data, w_data
-    )
-    check_result(
-        mod, {"data0": i_data, "weight0": w_data}, (1, 54, 50, 6), ref_res.numpy(), tol=1e-5
-    )
-
-
-@pytest.mark.skipif(
-    not tvm.get_global_func("relay.ext.dnnl", True),
-    reason="skip because DNNL codegen is not available",
-)
-@parametrize_external_json_codegen_checks
-def test_extern_dnnl(check_result):
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    w1shape = (32, 1, 3, 3)
-    data0 = relay.var("data0", shape=(ishape), dtype=dtype)
-    weight0 = relay.var("weight0", shape=(w1shape), dtype=dtype)
-
-    data1 = relay.var("data0", shape=(ishape), dtype=dtype)
-    weight1 = relay.var("weight0", shape=(w1shape), dtype=dtype)
-    weight2 = relay.var("weight1", shape=(w1shape), dtype=dtype)
-    depthwise_conv2d_1 = relay.nn.conv2d(
-        data1, weight1, kernel_size=(3, 3), padding=(1, 1), groups=32
-    )
-    depthwise_conv2d_2 = relay.nn.conv2d(
-        depthwise_conv2d_1, weight2, kernel_size=(3, 3), padding=(1, 1), groups=32
-    )
-    out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
-
-    f = relay.Function([data1, weight1, weight2], out)
-    ref_mod = tvm.IRModule()
-    ref_mod["main"] = f
-
-    f = set_external_func_attr(f, "dnnl", "dnnl_0")
-    call = relay.Call(f, [data0, weight0, weight0])
-    mod = tvm.IRModule.from_expr(call)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    w_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()).evaluate()(
-        i_data, w_data, w_data
-    )
-    check_result(
-        mod, {"data0": i_data, "weight0": w_data}, (1, 32, 14, 14), ref_res.numpy(), tol=1e-5
-    )
-
-
-@pytest.mark.skipif(
-    not tvm.get_global_func("relay.ext.dnnl", True),
-    reason="skip because DNNL codegen is not available",
-)
-@parametrize_external_json_codegen_checks
-def test_extern_dnnl_const(check_result):
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    w1shape = (32, 1, 3, 3)
-    data0 = relay.var("data0", shape=(ishape), dtype=dtype)
-    w_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-    data1 = relay.var("data0", shape=(ishape), dtype=dtype)
-    weight1 = relay.const(w_data, dtype=dtype)
-    weight2 = relay.const(w_data, dtype=dtype)
-    depthwise_conv2d_1 = relay.nn.conv2d(
-        data1, weight1, kernel_size=(3, 3), padding=(1, 1), groups=32
-    )
-    depthwise_conv2d_2 = relay.nn.conv2d(
-        depthwise_conv2d_1, weight2, kernel_size=(3, 3), padding=(1, 1), groups=32
-    )
-    out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
-
-    f = relay.Function([data1], out)
-    ref_mod = tvm.IRModule()
-    ref_mod["main"] = f
-
-    f = set_external_func_attr(f, "dnnl", "dnnl_0")
-    call = relay.Call(f, [data0])
-    mod = tvm.IRModule.from_expr(call)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()).evaluate()(i_data)
-    check_result(mod, {"data0": i_data}, (1, 32, 14, 14), ref_res.numpy(), tol=1e-5)
-
-
-def test_load_params_with_constants_in_ext_codegen():
-    # After binding params and partitioning graph_module.get_params()
-    # might contain parameters that are not an graph executor input but
-    # for example constants in external function.
-    y_in = np.ones((1,)).astype("float32")
-    params = {"y": y_in}
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=(1, 10))
-    y = relay.var("y", shape=(1,))
-    xcb = compiler_begin(x, "ccompiler")
-    ycb = compiler_begin(y, "ccompiler")
-    z = relay.add(xcb, ycb)
-    zce = compiler_end(z, "ccompiler")
-    mod["main"] = relay.Function([x, y], zce)
-    mod["main"] = bind_params_by_name(mod["main"], params)
-    mod = relay.transform.PartitionGraph()(mod)
-
-    graph_module = relay.build(mod, target="llvm", params=params)
-    # Params will be stored in metadata module.
-    assert len(graph_module.get_params()) == 0
-    lib = update_lib(graph_module.get_lib())
-    rt_mod = tvm.contrib.graph_executor.create(graph_module.get_graph_json(), lib, tvm.cpu(0))
-    rt_mod.load_params(runtime.save_param_dict(graph_module.get_params()))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_ir_bind.py b/tests/python/relay/test_ir_bind.py
deleted file mode 100644
index 1e5ab92cf2c5..000000000000
--- a/tests/python/relay/test_ir_bind.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" test bind function."""
-import pytest
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import TVMError
-
-
-def test_bind_params():
-    x = relay.var("x")
-    y = relay.var("y")
-    z = relay.add(x, y)
-    f = relay.Function([x, y], z)
-    fbinded = relay.bind(f, {x: relay.const(1, "float32")})
-    fexpected = relay.Function([y], relay.add(relay.const(1, "float32"), y))
-    tvm.ir.assert_structural_equal(fbinded, fexpected)
-
-    zbinded = relay.bind(z, {y: x})
-    zexpected = relay.add(x, x)
-    tvm.ir.assert_structural_equal(zbinded, zexpected)
-
-
-def test_bind_duplicated_params():
-    a = relay.var("a", shape=(1,))
-    aa = relay.var("a", shape=(1,))
-    s = a + aa
-    func = relay.Function([a, aa], s)
-
-    with pytest.raises(TVMError):
-        relay.build_module.bind_params_by_name(func, {"a": [1.0]})
-
-
-if __name__ == "__main__":
-    test_bind_params()
-    test_bind_duplicated_params()
diff --git a/tests/python/relay/test_ir_module.py b/tests/python/relay/test_ir_module.py
deleted file mode 100644
index c87ca19f117f..000000000000
--- a/tests/python/relay/test_ir_module.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for module functionality."""
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.prelude import Prelude
-
-
-def constructor_list(p):
-    list_ctors = p.mod.get_type("List")
-    optional_ctors = p.mod.get_type("Option")
-    nat_ctors = p.mod.get_type("nat")
-    rose_ctors = p.mod.get_type("Tree")
-    return list_ctors[1:] + optional_ctors[1:] + nat_ctors[1:] + rose_ctors[1:]
-
-
-def adt_list(p):
-    list_ctors = p.mod.get_type("List")
-    optional_ctors = p.mod.get_type("Option")
-    nat_ctors = p.mod.get_type("nat")
-    rose_ctors = p.mod.get_type("Tree")
-    return list_ctors[:1] + optional_ctors[:1] + nat_ctors[:1] + rose_ctors[:1]
-
-
-def test_constructor_tag_round_trip():
-    mod1 = tvm.IRModule()
-    p1 = Prelude(mod1)
-    p1.mod.import_from_std("nat.rly")
-
-    mod2 = tvm.IRModule()
-    p2 = Prelude(mod2)
-    p2.mod.import_from_std("nat.rly")
-
-    # ensure hashes match across modules
-    ctors1 = constructor_list(p1)
-    ctors2 = constructor_list(p2)
-
-    for i in range(len(ctors1)):
-        tag = ctors1[i].tag
-        ctor = mod2.get_constructor(tag)
-        assert ctor == ctors2[i]
-        assert ctor.name_hint == ctors1[i].name_hint
-
-
-def test_constructor_tag_differences():
-    # ensure that if we have the type data for a given ADT, the tags
-    # for the constructors of the *same ADT* are simple offsets from
-    # each other
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-
-    adts = adt_list(p)
-    for adt in adts:
-        data = mod[adt]
-        for i in range(len(data.constructors) - 1):
-            ctor1 = data.constructors[i]
-            ctor2 = data.constructors[i + 1]
-            assert ctor2.tag - ctor1.tag == 1
-            # make sure there is something present at the MSB
-            assert ctor1.tag - i != 0
-            assert ctor2.tag - (i + 1) != 0
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
deleted file mode 100644
index 8716acfa7a9b..000000000000
--- a/tests/python/relay/test_ir_nodes.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" test ir"""
-import pytest
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.tir.expr import *
-from tvm.relay import op
-import numpy as np
-
-
-def check_json_roundtrip(node):
-    json_str = tvm.ir.save_json(node)
-    back = tvm.ir.load_json(json_str)
-    assert tvm.ir.structural_equal(back, node, map_free_vars=True)
-
-
-# Span
-def test_span():
-    span = relay.Span(None, 1, 2, 3, 4)
-    assert span.source_name == None
-    assert span.line == 1
-    assert span.end_line == 2
-    assert span.column == 3
-    assert span.end_column == 4
-    assert span.same_as(span)
-    assert span == span
-    assert isinstance(span, relay.base.Span)
-    str(span)
-
-    # span is not a node so we can't use graph_equal
-    # to test the round trip
-    back = tvm.ir.load_json(tvm.ir.save_json(span))
-    assert back.source_name == span.source_name
-    assert back.line == span.line
-    assert back.end_line == span.end_line
-    assert back.column == span.column
-    assert back.end_column == span.end_column
-
-
-def test_constant():
-    arr = tvm.nd.array(10)
-    const = relay.Constant(arr)
-    assert const.data == arr
-    assert const.span == None
-    str(const)
-    check_json_roundtrip(const)
-
-
-def test_tuple():
-    fields = tvm.runtime.convert([])
-    tup = relay.Tuple(fields)
-    assert tup.fields == fields
-    assert tup.span == None
-    str(tup)
-    check_json_roundtrip(tup)
-
-
-def test_local_var():
-    name_hint = "s"
-    lv = relay.Var(name_hint)
-    assert lv.name_hint == name_hint
-    assert lv.type_annotation is None
-    # assert lv.span == None todo(@jroesch): what do we do about spans
-    str(lv)
-    check_json_roundtrip(lv)
-
-    t1 = relay.ty.TensorType((), "float")
-    lv = relay.Var(name_hint, t1)
-    assert lv.name_hint == name_hint
-    assert lv.type_annotation == t1
-
-
-def test_global_var():
-    name_hint = "g"
-    gv = relay.GlobalVar(name_hint)
-    gv.name_hint == name_hint
-    # assert lv.span == None todo(@jroesch): what do we do about spans
-    str(gv)
-    check_json_roundtrip(gv)
-
-
-def test_function():
-    param_names = ["a", "b", "c", "d"]
-    params = tvm.runtime.convert([relay.Var(n) for n in param_names])
-    ret_type = relay.TupleType(tvm.runtime.convert([]))
-    body = relay.Tuple(tvm.runtime.convert([]))
-    type_params = tvm.runtime.convert([])
-    fn = relay.Function(params, body, ret_type, type_params)
-    fn = fn.with_attr("test_attribute", "value")
-    fn = fn.with_attr("test_attribute1", "value1")
-    assert fn.params == params
-    assert fn.body == body
-    assert fn.type_params == type_params
-    assert fn.span == None
-    assert fn.attrs["test_attribute"] == "value"
-    assert fn.attrs["test_attribute1"] == "value1"
-    str(fn)
-    check_json_roundtrip(fn)
-
-
-def test_function_attrs():
-    param_names = ["a", "b", "c", "d"]
-    params = tvm.runtime.convert([relay.var(n, shape=(5, 2)) for n in param_names])
-    ret_type = relay.TupleType(tvm.runtime.convert([]))
-    body = relay.Tuple(tvm.runtime.convert([]))
-    type_params = tvm.runtime.convert([])
-    fn = relay.Function(params, body, ret_type, type_params)
-    model_params = {}
-    for param in params[:1]:
-        cty = param.type_annotation
-        tensor = np.random.rand(*[int(sh) for sh in cty.shape]).astype(cty.dtype)
-        model_params[param] = relay.Constant(tvm.nd.array(tensor))
-
-    fn = fn.with_attr("__params__", model_params)
-
-    assert fn.params == params
-    assert fn.body == body
-    assert fn.type_params == type_params
-    assert fn.span == None
-    str(fn)
-    check_json_roundtrip(fn)
-    json_str = tvm.ir.save_json(fn)
-    fn_after = tvm.ir.load_json(json_str)
-    model_params_after = fn_after.attrs["__params__"]
-    after_keys = [item[0] for item in model_params_after.items()]
-    for key1, key2 in zip(model_params, after_keys):
-        assert key1.name_hint == key2.name_hint
-        p1 = model_params[key1]
-        p2 = model_params_after[key2]
-        np.testing.assert_allclose(p1.data.numpy(), p2.data.numpy())
-
-
-def test_call():
-    op = relay.Var("f")
-    arg_names = ["a", "b", "c", "d"]
-    args = tvm.runtime.convert([relay.Var(n) for n in arg_names])
-    call = relay.Call(op, args, None, None)
-    assert call.op == op
-    assert call.args == args
-    assert call.span == None
-    str(call)
-    check_json_roundtrip(call)
-
-
-def test_let():
-    lv = relay.Var("x")
-    ty = None
-    arr = tvm.nd.array(10)
-    value = relay.Constant(arr)
-    # I would prefer that the order of arguments
-    # matches syntax let x: t = v in b
-    let = relay.Let(lv, value, lv)
-    assert let.var == lv
-    assert let.value == value
-    assert let.body == lv
-    assert let.span == None
-    str(let)
-    check_json_roundtrip(let)
-
-
-def test_if():
-    cond = relay.Var("cond")
-    left = relay.Var("left")
-    right = relay.Var("right")
-    ife = relay.If(cond, left, right)
-    assert ife.cond == cond
-    assert ife.true_branch == left
-    assert ife.false_branch == right
-    assert ife.span == None
-    str(ife)
-    check_json_roundtrip(ife)
-
-
-def test_tuple_get_item():
-    tup = relay.Var("tuple")
-    get = relay.TupleGetItem(tup, 1)
-    assert get.tuple_value == tup
-    assert get.index == 1
-    str(get)
-    check_json_roundtrip(get)
-
-
-def test_op():
-    add = op.op.get("add")
-    check_json_roundtrip(add)
-
-
-def test_conv2d_attrs():
-    data = relay.var("data", shape=(1, 3, 224, 224))
-    param = relay.var("param", shape=(64, 3, 7, 7))
-    out = op.nn.conv2d(data, param, strides=(2, 2), padding=(3, 3), channels=64, kernel_size=(7, 7))
-    check_json_roundtrip(out)
-
-
-# Commented due to weird memory allocation issue
-# def test_large_grpah():
-# Test large graphs to avoid stack overflow in serialize/deserialize
-#    size = int(1e5)
-#    var = [relay.var("var_" + str(i), shape=(2, 3)) for i in range(size)]
-#    body = var[-1]
-#    for i in range(size, 1, -1):
-#        body = relay.Let(var[i - 1], op.add(var[i - 2], var[i - 2]), body)
-#    func = relay.Function([var[0]], body)
-#    check_json_roundtrip(func)
-
-
-if __name__ == "__main__":
-    test_span()
-    test_constant()
-    test_tuple()
-    test_local_var()
-    test_global_var()
-    test_function()
-    test_function_attrs()
-    test_call()
-    test_let()
-    test_if()
-    test_tuple_get_item()
-    test_op()
-    test_conv2d_attrs()
-    # Commented due to weird memory allocation issue
-    # test_large_grpah()
diff --git a/tests/python/relay/test_ir_op.py b/tests/python/relay/test_ir_op.py
deleted file mode 100644
index edb8086dd426..000000000000
--- a/tests/python/relay/test_ir_op.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-from tvm.relay.op import op as _op
-
-
-def test_op_attr():
-    log_op = relay.op.get("log")
-
-    @tvm.ir.register_op_attr("exp", "ftest")
-    def test(x):
-        return x + 1
-
-    assert log_op.num_inputs == 1
-    assert log_op.get_attr("ftest") is None
-    assert relay.op.get("exp").get_attr("ftest")(1) == 2
-
-
-def test_op_reset_attr():
-    """Tests reset_attr functionality."""
-
-    def add1(x):
-        return x + 1
-
-    def add2(x):
-        return x + 2
-
-    # Register fadd1 and fadd2 attributes.
-    tvm.ir.register_op_attr("exp", "fadd1", add1)
-    tvm.ir.register_op_attr("log", "fadd1", add1)
-    tvm.ir.register_op_attr("log", "fadd2", add2)
-
-    # Reset log fadd1 attr.
-    log_op = relay.op.get("log")
-    log_op.reset_attr("fadd1")
-
-    # Check that fadd1 attr is resetted.
-    assert log_op.get_attr("fadd1") is None
-
-    # Check that fadd1 attr of other ops are intact.
-    assert relay.op.get("exp").get_attr("fadd1")(1) == 2
-
-    # Check that other attrs of the log op are intact.
-    assert relay.op.get("log").get_attr("fadd2")(1) == 3
-
-
-def test_op_temp_attr():
-    """Tests reset_attr functionality."""
-
-    def add1(x):
-        return x + 1
-
-    def add2(x):
-        return x + 2
-
-    # Set original attr value is add1.
-    tvm.ir.register_op_attr("sqrt", "ftest", add1)
-
-    with TempOpAttr("sqrt", "ftest", add2):
-        # Check that the attr value is updated to add2.
-        assert relay.op.get("sqrt").get_attr("ftest")(1) == 3
-
-    # Check that the attr value is recovered to add1.
-    assert relay.op.get("sqrt").get_attr("ftest")(1) == 2
-
-
-def test_op_level1():
-    x = relay.Var("x")
-
-    for op_name in ["log", "exp", "sqrt", "rsqrt", "tanh"]:
-        y = getattr(relay, op_name)(x)
-        assert y.op.name == op_name
-        assert y.op.support_level == 1
-        assert y.args[0] == x
-
-
-def test_op_level3():
-    x = relay.Var("x")
-
-    for op_name in ["ceil", "floor", "trunc", "round", "abs", "negative"]:
-        y = getattr(relay, op_name)(x)
-        assert y.op.name == op_name
-        assert y.op.support_level == 3
-        assert y.args[0] == x
-
-
-def test_op_register():
-    """Tests register_op functionality."""
-    op_name = "custom_op"
-
-    _op.register(op_name, r"code(Add two tensor with inner broadcasting.)code")
-    _op.get(op_name).set_num_inputs(2)
-    _op.get(op_name).add_argument("data_0", "Tensor", "The input data tensor.")
-    _op.get(op_name).add_argument("data_1", "Tensor", "The input data tensor.")
-    # call default relation functions
-    _op.get(op_name).add_type_rel("Identity")
-    _op.get(op_name).set_support_level(1)
-    _op.register_pattern(op_name, _op.OpPattern.ELEMWISE)
-    _op.register_stateful(op_name, False)
-
-    assert _op.get(op_name).name == op_name
-    assert _op.get(op_name).num_inputs == 2
-    assert _op.get(op_name).get_attr("TOpPattern") == _op.OpPattern.ELEMWISE
-    assert _op.get(op_name).get_attr("TOpIsStateful") == False
-
-
-if __name__ == "__main__":
-    test_op_attr()
-    test_op_reset_attr()
-    test_op_temp_attr()
-    test_op_level1()
-    test_op_level3()
-    test_op_register()
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
deleted file mode 100644
index 7e8f8c54f486..000000000000
--- a/tests/python/relay/test_ir_parser.py
+++ /dev/null
@@ -1,1032 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from typing import Union
-
-import numpy as np
-import pytest
-import tvm
-import tvm.relay.testing
-import tvm.testing
-from numpy import isclose
-from tvm import relay
-
-SEMVER = '#[version = "0.0.5"]\n'
-
-BINARY_OPS = {
-    "*": relay.multiply,
-    "/": relay.divide,
-    "+": relay.add,
-    "-": relay.subtract,
-    "<": relay.less,
-    ">": relay.greater,
-    "<=": relay.less_equal,
-    ">=": relay.greater_equal,
-    "==": relay.equal,
-    "!=": relay.not_equal,
-}
-
-TYPES = {
-    "int8",
-    "int16",
-    "int32",
-    "int64",
-    "uint8",
-    "uint16",
-    "uint32",
-    "uint64",
-    "float16",
-    "float32",
-    "float64",
-    "bool",
-    "int8x4",
-    "uint1x4",
-    "float16x4",
-}
-
-LIST_DEFN = """
-type List[A] {
-    Cons(A, List[A]),
-    Nil,
-}
-"""
-
-
-def assert_graph_equal(lhs, rhs):
-    tvm.ir.assert_structural_equal(lhs, rhs, map_free_vars=True)
-
-
-def graph_equal(lhs, rhs):
-    return tvm.ir.structural_equal(lhs, rhs, map_free_vars=True)
-
-
-def roundtrip_expr(expr):
-    text = expr.astext()
-    x = tvm.relay.parse_expr(text)
-    assert_graph_equal(x, expr)
-
-
-# Testing Utilities for expressions.
-def roundtrip(expr):
-    x = tvm.relay.fromtext(expr.astext())
-    assert_graph_equal(x, expr)
-
-
-def parse_text(code):
-    expr = tvm.relay.parse_expr(code)
-    roundtrip_expr(expr)
-    return expr
-
-
-def parses_as(code, expr):
-    # type: (str, relay.Expr) -> bool
-    parsed = parse_text(code)
-    result = graph_equal(parsed, expr)
-    return result
-
-
-# Testing Utilities for full modules.
-def parse_module(code):
-    mod = tvm.relay.parse(SEMVER + code)
-    roundtrip(mod)
-    return mod
-
-
-def assert_parses_as(code, expr):
-    parsed = parse_text(code)
-    assert_graph_equal(parsed, expr)
-
-
-def assert_parse_module_as(code, mod):
-    mod = tvm.relay.transform.InferType()(mod)
-    parsed = parse_module(code)
-    assert_graph_equal(parsed, mod)
-
-
-def get_scalar(x):
-    # type: (relay.Constant) -> (Union[float, int, bool])
-    return x.data.numpy().item()
-
-
-int32 = relay.scalar_type("int32")
-
-_ = relay.Var("_")
-X = relay.Var("x")
-Y = relay.Var("y")
-X_ANNO = relay.Var("x", int32)
-Y_ANNO = relay.Var("y", int32)
-
-UNIT = relay.Tuple([])
-
-
-def test_comments():
-    assert_parses_as(
-        """
-        // This is a line comment!
-        ()
-        """,
-        UNIT,
-    )
-
-    assert_parses_as(
-        """
-        /* This is a block comment!
-            This is still a block comment!
-        */
-        ()
-        """,
-        UNIT,
-    )
-
-    assert_parses_as(
-        """
-        /* This is a block comment!
-           /*Block comment is recursive!*/
-        */
-        ()
-        """,
-        UNIT,
-    )
-
-
-def test_int_literal():
-    assert isinstance(parse_text("1"), relay.Constant)
-    assert isinstance(parse_text("1").data, tvm.nd.NDArray)
-
-    assert get_scalar(parse_text("1")) == 1
-    assert get_scalar(parse_text("10")) == 10
-    assert get_scalar(parse_text("0")) == 0
-    assert get_scalar(parse_text("-100")) == -100
-    assert get_scalar(parse_text("-05")) == -5
-    assert get_scalar(parse_text("9223372036854775807")) == 9223372036854775807
-
-    assert get_scalar(parse_text("-42i")) == -42
-    assert get_scalar(parse_text("-42i16")) == -42
-    assert get_scalar(parse_text("-42i32")) == -42
-    assert get_scalar(parse_text("-42i64")) == -42
-
-    assert_parses_as("-42i16", relay.const(-42, "int16"))
-    assert_parses_as("-42i32", relay.const(-42, "int32"))
-    assert_parses_as("-42i", relay.const(-42, "int32"))
-    assert_parses_as("-42", relay.const(-42, "int32"))
-    assert_parses_as("-42i64", relay.const(-42, "int64"))
-    assert_parses_as("2147483647", relay.const(2147483647, "int32"))
-    assert_parses_as("2147483648", relay.const(2147483648, "int64"))
-
-    with pytest.raises(tvm.error.DiagnosticError):
-        # Unrepresentable
-        parse_text("2147483648i32")
-    with pytest.raises(tvm.error.DiagnosticError):
-        # Unrepresentable
-        parse_text("32768i16")
-
-
-def test_float_literal():
-    assert get_scalar(parse_text("1.0f")) == 1.0
-    assert isclose(get_scalar(parse_text("1.56667f")), 1.56667)
-    assert get_scalar(parse_text("0.0f")) == 0.0
-    assert get_scalar(parse_text("-10.0f")) == -10.0
-
-    # scientific notation
-    assert isclose(get_scalar(parse_text("1e-1f")), 1e-1)
-    assert get_scalar(parse_text("1e+1f")) == 1e1
-    assert isclose(get_scalar(parse_text("1E-1f")), 1e-1)
-    assert get_scalar(parse_text("1E+1f")) == 1e1
-    assert isclose(get_scalar(parse_text("1.0e-1f")), 1.0e-1)
-    assert get_scalar(parse_text("1.0e+1f")) == 1.0e1
-    assert isclose(get_scalar(parse_text("1.0E-1f")), 1.0e-1)
-    assert get_scalar(parse_text("1.0E+1f")) == 1.0e1
-
-    assert get_scalar(parse_text("3f16")) == 3.0
-    assert get_scalar(parse_text("3f32")) == 3.0
-
-    assert_parses_as("3f16", relay.const(3.0, "float16"))
-    assert_parses_as("3f32", relay.const(3.0, "float32"))
-    assert_parses_as("3f", relay.const(3.0, "float32"))
-    assert_parses_as("3f64", relay.const(3.0, "float64"))
-
-    with pytest.raises(tvm.error.DiagnosticError):
-        # Unrepresentable
-        parse_text("3.40283e+38f32")
-    with pytest.raises(tvm.error.DiagnosticError):
-        # Unrepresentable
-        parse_text("65505f16")
-
-
-def test_bool_literal():
-    assert get_scalar(parse_text("True")) == True
-    assert get_scalar(parse_text("False")) == False
-
-    assert_parses_as("True", relay.const(True, "bool"))
-
-
-def test_negative():
-    # need to handle parsing non-literal operations
-    # assert isinstance(parse_text("let %x = 1; -%x").body, relay.Call)
-    assert get_scalar(parse_text("--10")) == 10
-    assert get_scalar(parse_text("---10")) == -10
-
-
-def test_bin_op():
-    for bin_op in BINARY_OPS.keys():
-        assert_parses_as(
-            "1 {} 1".format(bin_op), BINARY_OPS.get(bin_op)(relay.const(1), relay.const(1))
-        )
-
-
-def test_parens():
-    assert graph_equal(parse_text("1 * 1 + 1"), parse_text("(1 * 1) + 1"))
-    assert not graph_equal(parse_text("1 * 1 + 1"), parse_text("1 * (1 + 1)"))
-
-
-def test_op_assoc():
-    assert graph_equal(parse_text("1 * 1 + 1 < 1 == 1"), parse_text("(((1 * 1) + 1) < 1) == 1"))
-    assert graph_equal(parse_text("1 == 1 < 1 + 1 * 1"), parse_text("1 == (1 < (1 + (1 * 1)))"))
-
-
-def test_vars():
-    # var
-    var = parse_text("let %foo = (); %foo")
-    assert isinstance(var.body, relay.Var)
-    assert var.body.name_hint == "foo"
-
-    # global var
-    global_var = parse_text("@foo")
-    assert isinstance(global_var, relay.GlobalVar)
-    assert global_var.name_hint == "foo"
-
-    # operator id
-    op = parse_text("add")
-    assert isinstance(op, tvm.ir.Op)
-    assert op.name == "add"
-
-    # operator id with prefix
-    op = parse_text("nn.global_avg_pool2d")
-    assert isinstance(op, tvm.ir.Op)
-    assert op.name == "nn.global_avg_pool2d"
-
-
-def test_meta_ref():
-    with pytest.raises(tvm.error.DiagnosticError):
-        meta_op = parse_text("meta[type_key][1337]")
-        assert meta_op.attrs.node_type_key == "type_key"
-        assert meta_op.attrs.node_index == 1337
-
-
-def test_let():
-    assert_parses_as("let %x = 1; ()", relay.Let(X, relay.const(1), UNIT))
-
-    assert_parses_as(
-        """
-        let %x = 1;
-        let %y = 2;
-        ()
-        """,
-        relay.Let(X, relay.const(1), relay.Let(Y, relay.const(2), UNIT)),
-    )
-
-
-def test_seq():
-    assert_parses_as("(); ()", relay.Let(_, UNIT, UNIT))
-
-    assert_parses_as("let %_ = 1; ()", relay.Let(X, relay.const(1), UNIT))
-
-
-def test_graph():
-    code = "%0 = (); %1 = 1; (%0, %0, %1)"
-    assert_parses_as(code, relay.Tuple([UNIT, UNIT, relay.const(1)]))
-
-
-def test_graph_single():
-    assert_parses_as("%1 = (); %1", relay.Tuple([]))
-
-
-def test_let_global_var():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_text("let @x = 1; ()")
-
-
-def test_let_op():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_text("let x = 1; ()")
-
-
-def test_tuple():
-    assert_parses_as("()", relay.Tuple([]))
-
-    assert_parses_as("(0,)", relay.Tuple([relay.const(0)]))
-
-    assert_parses_as("(0, 1)", relay.Tuple([relay.const(0), relay.const(1)]))
-
-    assert_parses_as("(0, 1, 2)", relay.Tuple([relay.const(0), relay.const(1), relay.const(2)]))
-
-
-def test_tuple_proj():
-    x = relay.var("x", shape=())
-    assert_parses_as(
-        "free_var %x: float32; %x((%x,).0, %x)",
-        relay.Call(x, [relay.TupleGetItem(relay.Tuple([x]), 0), x]),
-    )
-
-
-def test_func():
-    # 0 args
-    assert_parses_as("fn () { 0 }", relay.Function([], relay.const(0), None, []))
-
-    # 1 arg
-    assert_parses_as("fn (%x) { %x }", relay.Function([X], X, None, []))
-
-    # 2 args
-    assert_parses_as("fn (%x, %y) { %x + %y }", relay.Function([X, Y], relay.add(X, Y), None, []))
-
-    # annotations
-    assert_parses_as("fn (%x: int32) -> int32 { %x }", relay.Function([X_ANNO], X_ANNO, int32, []))
-
-    # Refactor the attribute syntax and printing.
-    #
-    # # attributes
-    # assert_parses_as(
-    #     "fn (n=5) { () }",
-    #     relay.Function([], UNIT, None, None, tvm.ir.make_node("DictAttrs", n=relay.const(5)))
-    # )
-
-
-# TODO(@jmp): Crashes if %x isn't annnotated.
-def test_defn():
-    id_defn = parse_module(
-        """
-        def @id(%x: int32) -> int32 {
-            %x
-        }
-        """
-    )
-    assert isinstance(id_defn, tvm.IRModule)
-
-
-def test_recursive_call():
-    id_defn = parse_module(
-        """
-        def @id(%x: int32) -> int32 {
-            @id(%x)
-        }
-        """
-    )
-    assert isinstance(id_defn, tvm.IRModule)
-
-
-def test_ifelse():
-    assert_parses_as(
-        """
-        if (True) {
-            0
-        } else {
-            1
-        }
-        """,
-        relay.If(relay.const(True), relay.const(0), relay.const(1)),
-    )
-
-
-def test_ifelse_scope():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_text(
-            """
-            if (True) {
-                let %x = ();
-                ()
-            } else {
-                %x
-            }
-            """
-        )
-
-
-def test_ref():
-    program = """
-    #[version = "0.0.5"]
-    def @main(%x: float32) {
-        %0 = ref(%x);
-        ref_write(%0, 1f);
-        ref_read(%0)
-    }
-    """
-    tvm.relay.parse(program)
-
-
-def test_call():
-    # select right function to call: simple ident case
-    id_func = relay.Var("id")
-    assert_parses_as(
-        """
-        let %id = fn (%x) { %x };
-        10 * %id(10)
-        """,
-        relay.Let(
-            id_func,
-            relay.Function([X], X, None, []),
-            relay.multiply(relay.const(10), relay.Call(id_func, [relay.const(10)])),
-        ),
-    )
-
-    # 0 args
-    constant = relay.Var("constant")
-    assert_parses_as(
-        """
-        let %constant = fn () { 0 };
-        %constant()
-        """,
-        relay.Let(
-            constant,
-            relay.Function([], relay.const(0), None, []),
-            relay.Call(constant, [], None, None),
-        ),
-    )
-
-    # 1 arg
-    id_var = relay.Var("id")
-    assert_parses_as(
-        """
-        let %id = fn (%x) { %x };
-        %id(1)
-        """,
-        relay.Let(
-            id_var,
-            relay.Function([X], X, None, []),
-            relay.Call(id_var, [relay.const(1)], None, None),
-        ),
-    )
-
-    # 2 args
-    multiply = relay.Var("multiply")
-    assert_parses_as(
-        """
-        let %multiply = fn (%x, %y) { %x * %y };
-        %multiply(0, 0)
-        """,
-        relay.Let(
-            multiply,
-            relay.Function([X, Y], relay.multiply(X, Y), None, []),
-            relay.Call(multiply, [relay.const(0), relay.const(0)], None, None),
-        ),
-    )
-
-    # anonymous function
-    assert_parses_as(
-        """
-        (fn (%x) { %x })(0)
-        """,
-        relay.Call(relay.Function([X], X, None, []), [relay.const(0)], None, None),
-    )
-
-    # curried function
-    curried_mult = relay.Var("curried_mult")
-    assert_parses_as(
-        """
-        let %curried_mult =
-            fn (%x) {
-            fn (%y) {
-                %x * %y
-            }
-            };
-            %curried_mult(0);
-            %curried_mult(0)(0)
-        """,
-        relay.Let(
-            curried_mult,
-            relay.Function([X], relay.Function([Y], relay.multiply(X, Y), None, []), None, []),
-            relay.Let(
-                _,
-                relay.Call(curried_mult, [relay.const(0)], None, None),
-                relay.Call(
-                    relay.Call(curried_mult, [relay.const(0)], None, None),
-                    [relay.const(0)],
-                    None,
-                    None,
-                ),
-            ),
-        ),
-    )
-
-    # op
-    assert_parses_as("abs(1)", relay.Call(relay.op.get("abs"), [relay.const(1)], None, None))
-
-
-# Types
-
-
-def test_incomplete_type():
-    assert_parses_as("let %_ : _ = (); ()", relay.Let(_, UNIT, UNIT))
-
-
-def test_builtin_types():
-    for builtin_type in TYPES:
-        parse_text("let %_ : {} = (); ()".format(builtin_type))
-
-
-def test_tensor_type():
-    assert_parses_as(
-        "let %_ : Tensor[(), float32] = (); ()",
-        relay.Let(relay.Var("_", relay.TensorType((), "float32")), UNIT, UNIT),
-    )
-
-    assert_parses_as(
-        "let %_ : Tensor[(1), float32] = (); ()",
-        relay.Let(relay.Var("_", relay.TensorType((1,), "float32")), UNIT, UNIT),
-    )
-
-    assert_parses_as(
-        "let %_ : Tensor[(1, 1), float32] = (); ()",
-        relay.Let(relay.Var("_", relay.TensorType((1, 1), "float32")), UNIT, UNIT),
-    )
-
-    assert_parses_as(
-        "let %_ : Tensor[(?, 1), float32] = (); ()",
-        relay.Let(relay.Var("_", relay.TensorType((tvm.tir.Any(), 1), "float32")), UNIT, UNIT),
-    )
-
-
-def test_function_type():
-    assert_parses_as(
-        """
-        let %_: fn () -> int32 = fn () -> int32 { 0 }; ()
-        """,
-        relay.Let(
-            relay.Var("_", relay.FuncType([], int32, [], [])),
-            relay.Function([], relay.const(0), int32, []),
-            UNIT,
-        ),
-    )
-
-    assert_parses_as(
-        """
-        let %_: fn (int32) -> int32 = fn (%x: int32) -> int32 { 0 }; ()
-        """,
-        relay.Let(
-            relay.Var("_", relay.FuncType([int32], int32, [], [])),
-            relay.Function([relay.Var("x", int32)], relay.const(0), int32, []),
-            UNIT,
-        ),
-    )
-
-    assert_parses_as(
-        """
-        let %_: fn (int32, int32) -> int32 = fn (%x: int32, %y: int32) -> int32 { 0 }; ()
-        """,
-        relay.Let(
-            relay.Var("_", relay.FuncType([int32, int32], int32, [], [])),
-            relay.Function(
-                [relay.Var("x", int32), relay.Var("y", int32)], relay.const(0), int32, []
-            ),
-            UNIT,
-        ),
-    )
-
-
-def test_tuple_type():
-    assert_parses_as(
-        """
-        let %_: () = (); ()
-        """,
-        relay.Let(relay.Var("_", relay.TupleType([])), UNIT, UNIT),
-    )
-
-    assert_parses_as(
-        """
-        let %_: (int32,) = (0,); ()
-        """,
-        relay.Let(relay.Var("_", relay.TupleType([int32])), relay.Tuple([relay.const(0)]), UNIT),
-    )
-
-    assert_parses_as(
-        """
-        let %_: (int32, int32) = (0, 1); ()
-        """,
-        relay.Let(
-            relay.Var("_", relay.TupleType([int32, int32])),
-            relay.Tuple([relay.const(0), relay.const(1)]),
-            UNIT,
-        ),
-    )
-
-
-def test_adt_defn():
-    mod = tvm.IRModule()
-
-    glob_typ_var = relay.GlobalTypeVar("Ayy")
-    prog = relay.TypeData(glob_typ_var, [], [relay.Constructor("Nil", [], glob_typ_var)])
-    mod[glob_typ_var] = prog
-    assert_parse_module_as(
-        """
-        type Ayy { Nil }
-        """,
-        mod,
-    )
-
-
-def test_adt_any():
-    code = """
-    type my_dtype {
-        my_cons(Tensor[(?, 1), uint16]),
-    }
-    """
-    mod = parse_module(code)
-    items = mod.type_definitions.items()
-    global_type_var, type_data = items[0]
-    assert global_type_var.name_hint == "my_dtype"
-    ctors = type_data.constructors
-    assert len(ctors) == 1
-    my_cons = ctors[0]
-    assert my_cons.name_hint == "my_cons"
-    ty_shape = my_cons.inputs[0].shape
-    assert isinstance(ty_shape[0], tvm.tir.Any)
-    assert ty_shape[1] == 1
-
-
-def test_empty_adt_defn():
-    mod = tvm.IRModule()
-
-    glob_typ_var = relay.GlobalTypeVar("Ayy")
-    prog = relay.TypeData(glob_typ_var, [], [])
-    mod[glob_typ_var] = prog
-    assert_parse_module_as(
-        """
-        type Ayy { }
-        """,
-        mod,
-    )
-
-
-def test_multiple_cons_defn():
-    mod = tvm.IRModule()
-
-    list_var = relay.GlobalTypeVar("List")
-    typ_var = relay.TypeVar("A")
-    prog = relay.TypeData(
-        list_var,
-        [typ_var],
-        [
-            relay.Constructor("Cons", [typ_var, list_var(typ_var)], list_var),
-            relay.Constructor("Nil", [], list_var),
-        ],
-    )
-    mod[list_var] = prog
-    assert_parse_module_as(LIST_DEFN, mod)
-
-
-def test_multiple_type_param_defn():
-    glob_typ_var = relay.GlobalTypeVar("Either")
-    typ_var_a = relay.TypeVar("A")
-    typ_var_b = relay.TypeVar("B")
-    prog = relay.TypeData(
-        glob_typ_var,
-        [typ_var_a, typ_var_b],
-        [
-            relay.Constructor("Left", [typ_var_a], glob_typ_var),
-            relay.Constructor("Right", [typ_var_b], glob_typ_var),
-        ],
-    )
-    mod = tvm.IRModule()
-    mod[glob_typ_var] = prog
-    assert_parse_module_as(
-        """
-        type Either[A, B] {
-          Left(A),
-          Right(B),
-        }
-        """,
-        mod,
-    )
-
-
-def test_match():
-    # pair each match keyword with whether it specifies a complete match or not
-    match_keywords = [("match", True), ("match?", False)]
-    for (match_keyword, is_complete) in match_keywords:
-        mod = tvm.IRModule()
-
-        list_var = relay.GlobalTypeVar("List")
-        typ_var = relay.TypeVar("A")
-        cons_constructor = relay.Constructor("Cons", [typ_var, list_var(typ_var)], list_var)
-        nil_constructor = relay.Constructor("Nil", [], list_var)
-        list_def = relay.TypeData(list_var, [typ_var], [cons_constructor, nil_constructor])
-        mod[list_var] = list_def
-
-        length_var = relay.GlobalVar("length")
-        typ_var = relay.TypeVar("A")
-        input_type = list_var(typ_var)
-        input_var = relay.Var("xs", input_type)
-        rest_var = relay.Var("rest")
-        cons_case = relay.Let(
-            relay.var("", type_annotation=None),
-            UNIT,
-            relay.add(relay.const(1), relay.Call(length_var, [rest_var])),
-        )
-        body = relay.Match(
-            input_var,
-            [
-                relay.Clause(
-                    relay.PatternConstructor(
-                        cons_constructor, [relay.PatternWildcard(), relay.PatternVar(rest_var)]
-                    ),
-                    cons_case,
-                ),
-                relay.Clause(relay.PatternConstructor(nil_constructor, []), relay.const(0)),
-            ],
-            complete=is_complete,
-        )
-        length_func = relay.Function([input_var], body, int32, [typ_var])
-        mod[length_var] = length_func
-
-        assert_parse_module_as(
-            """
-            %s
-
-            def @length[A](%%xs: List[A]) -> int32 {
-              %s (%%xs) {
-                Cons(_, %%rest : List[A]) => {
-                  ();
-                  1 + @length(%%rest)
-                },
-                Nil => 0,
-              }
-            }
-            """
-            % (LIST_DEFN, match_keyword),
-            mod,
-        )
-
-
-def test_adt_cons_expr():
-    mod = tvm.IRModule()
-
-    list_var = relay.GlobalTypeVar("List")
-    typ_var = relay.TypeVar("A")
-    cons_constructor = relay.Constructor("Cons", [typ_var, list_var(typ_var)], list_var)
-    nil_constructor = relay.Constructor("Nil", [], list_var)
-    list_def = relay.TypeData(list_var, [typ_var], [cons_constructor, nil_constructor])
-    mod[list_var] = list_def
-
-    make_singleton_var = relay.GlobalVar("make_singleton")
-    input_var = relay.Var("x", int32)
-    make_singleton_func = relay.Function(
-        [input_var], cons_constructor(input_var, nil_constructor()), list_var(int32)
-    )
-    mod[make_singleton_var] = make_singleton_func
-
-    assert_parse_module_as(
-        """
-        %s
-
-        def @make_singleton(%%x: int32) -> List[int32] {
-          Cons(%%x, Nil)
-        }
-        """
-        % LIST_DEFN,
-        mod,
-    )
-
-
-def test_duplicate_adt_defn():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_module(
-            """
-            %s
-
-            type List[A] {
-            Cons(A, List[A]),
-            Nil,
-            }
-            """
-            % LIST_DEFN
-        )
-
-
-def test_duplicate_adt_cons():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_text(
-            """
-            type Ayy { Lmao }
-            type Haha { Lmao }
-            """
-        )
-
-
-def test_duplicate_adt_cons_defn():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_text(
-            """
-            type Ayy { Lmao }
-            type Lmao { Ayy }
-            """
-        )
-
-
-def test_duplicate_global_var():
-    with pytest.raises(tvm.error.DiagnosticError):
-        parse_text(
-            """
-            def @id[A](%x: A) -> A { x }
-            def @id[A](%x: A) -> A { x }
-            """
-        )
-
-
-def test_extern_adt_defn():
-    mod = tvm.IRModule()
-
-    extern_var = relay.GlobalTypeVar("T")
-    typ_var = relay.TypeVar("A")
-    extern_def = relay.TypeData(extern_var, [typ_var], [])
-    mod[extern_var] = extern_def
-
-    assert_parse_module_as(
-        """
-        extern type T[A]
-        """,
-        mod,
-    )
-
-
-def test_import_grad():
-    mod = tvm.IRModule()
-    mod.import_from_std("gradient.rly")
-
-
-def test_mlp():
-    mod, _ = relay.testing.mlp.get_workload(1)
-    text = mod.astext()
-    parsed_mod = tvm.relay.parse(text)
-    tvm.ir.assert_structural_equal(mod, parsed_mod)
-
-
-def inline_params(mod, params):
-    main_fn = mod["main"]
-    str_to_var = {}
-    for param in main_fn.params:
-        str_to_var[param.name_hint] = param
-
-    bind_map = {}
-    for param in params:
-        bind_map[str_to_var[param]] = relay.const(params[param])
-
-    body = relay.bind(main_fn.body, bind_map)
-    main_fn = relay.Function(relay.analysis.free_vars(body), body)
-    mod._add("main", main_fn, True)
-    return mod
-
-
-def test_mlp_inlined_params():
-    mod, params = relay.testing.mlp.get_workload(1)
-    mod = inline_params(mod, params)
-    mod = relay.transform.InferType()(mod)
-    text = mod.astext()
-    parsed_mod = tvm.relay.parse(text)
-    tvm.ir.assert_structural_equal(mod, parsed_mod)
-
-
-def test_tuple_return_value():
-    program = """
-    type Box[T] {
-        constructor(T)
-    }
-
-    def @example() {
-        %0 = ();
-        %1 = constructor(%0);
-        %2 = constructor(0f);
-        (%1, %2,)
-    }
-    """
-    parse_module(program)
-
-
-def test_parse_if_in_binding():
-    program = """
-    def @example(%b: bool) {
-        %0 = if (%b) {
-            1
-        } else {
-            0
-        };
-        %0
-    }
-    """
-    parse_module(program)
-
-
-def test_op_string_attr():
-    call = parse_text(
-        """
-        free_var %x: Tensor[(1, 32, 32, 3), float32];
-        free_var %y: Tensor[(1, 1, 3, 3), float32];
-        nn.conv2d(%x, %y, data_layout="NHWC", kernel_layout="HWIO")
-        """
-    )
-
-    assert isinstance(call.op, tvm.ir.Op)
-    assert call.op.name == "nn.conv2d"
-    assert call.attrs.data_layout == "NHWC"
-    assert call.attrs.kernel_layout == "HWIO"
-
-
-def test_load_prelude():
-    mod = tvm.IRModule()
-    mod.import_from_std("prelude.rly")
-    tvm.relay.parse(mod.astext())
-
-
-def test_call_attrs():
-    def get_func(shape, dtype):
-        x0 = relay.var("data", shape=shape, dtype=dtype)
-        w0 = relay.var("weight", shape=shape, dtype=dtype)
-        a = relay.nn.dense(x0, w0)
-        b = relay.nn.relu(a)
-        d = relay.add(b, relay.const(1.0, dtype=dtype))
-        return relay.Function([x0, w0], d)
-
-    # build relay graph
-    shape = (2, 4)
-    dtype = "float32"
-    sub_func = get_func(shape, dtype)
-    p0 = relay.var("p0", shape=shape, dtype=dtype)
-    p1 = relay.var("p1", shape=shape, dtype=dtype)
-    attr = tvm.ir.make_node("attrs.TestAttrs", name="func_call_attrs")
-    call = relay.Call(sub_func, [p0, p1], attrs=attr)
-    func = relay.Function([p0, p1], call)
-
-    # build relay module
-    mod = tvm.IRModule()
-    mod["main"] = func
-    mod = tvm.relay.transform.InferType()(mod)
-
-    # assert equal
-    program = """
-    def @main(%p0: Tensor[(2, 4), float32], %p1: Tensor[(2, 4), float32]) {
-    %2 = fn (%data: Tensor[(2, 4), float32], %weight: Tensor[(2, 4), float32]) {
-        %0 = nn.dense(%data, %weight, units=None);
-        %1 = nn.relu(%0);
-        add(%1, 1f)
-    };
-    %2(%p0, %p1, name="func_call_attrs", attrs_type_key="attrs.TestAttrs")
-    }
-    """
-    parsed = parse_module(program)
-    assert_graph_equal(parsed, mod)
-
-
-def test_tokenize_inf():
-    x = relay.var("x", shape=(3, 4), dtype="float32")
-    y = relay.clip(x, -np.inf, np.inf)
-
-    f = relay.Function([x], y)
-    mod = tvm.IRModule.from_expr(f)
-
-    mod = relay.transform.AnnotateSpans()(mod)
-
-
-def test_func_attrs():
-    attrs = tvm.ir.make_node("DictAttrs", **{"Primitive": 1, "relay.reshape_only": 1})
-    x = relay.var("x", shape=(2, 3))
-    func = relay.Function([x], relay.reshape(x, (-1,)), attrs=attrs)
-    assert_parses_as(func.astext(), func)
-
-
-def test_init_module_and_metatable():
-    init_metatable = {"relay.Constant": [relay.const(np.random.rand(2, 3), dtype="float32")]}
-    init_module = tvm.relay.fromtext(
-        SEMVER
-        + """
-            def @f(%y : Tensor[(2, 3), float32]) -> Tensor[(2, 3), float32] {
-              negative(%y)
-            }
-        """,
-    )
-    mod = tvm.relay.parse(
-        SEMVER
-        + """
-            def @main(%x: Tensor[(2, 3), float32]) {
-              add(@f(%x), meta[relay.Constant][0])
-            }
-        """,
-        "from_string",
-        init_module,
-        init_metatable,
-    )
-    roundtrip(mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_ir_structural_equal_hash.py b/tests/python/relay/test_ir_structural_equal_hash.py
deleted file mode 100644
index 97b631a22518..000000000000
--- a/tests/python/relay/test_ir_structural_equal_hash.py
+++ /dev/null
@@ -1,800 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.testing import run_opt_pass
-
-
-def consistent_equal(x, y, map_free_vars=False):
-    struct_equal0 = tvm.ir.structural_equal(x, y, map_free_vars)
-    struct_equal1 = tvm.ir.structural_equal(y, x, map_free_vars)
-
-    xhash = tvm.ir.structural_hash(x, map_free_vars)
-    yhash = tvm.ir.structural_hash(y, map_free_vars)
-
-    if struct_equal0 != struct_equal1:
-        raise ValueError(
-            "Non-communicative {} vs {}, sequal0={}, sequal1={}".format(
-                x, y, struct_equal0, struct_equal1
-            )
-        )
-
-    # NOTE: hash colision can happen but should be rare.
-    # we can confirm that hash colison doesn't happen for our testcases
-    if struct_equal0 != (xhash == yhash):
-        raise ValueError(
-            "Inconsistent {} vs {}, sequal={}, xhash={}, yhash={}".format(
-                x, y, struct_equal0, xhash, yhash
-            )
-        )
-    return struct_equal0
-
-
-def test_tensor_type_sequal():
-    t1 = relay.TensorType((3, 4), "float32")
-    t2 = relay.TensorType((3, 4), "float32")
-    t3 = relay.TensorType((3, 4, 5), "float32")
-    assert t1 == t2
-    assert t1 != t3
-
-    t1 = relay.TensorType((), "float32")
-    t2 = relay.TensorType((), "float32")
-    assert t1 == t2
-
-
-def test_incomplete_type_sequal():
-    t1 = relay.IncompleteType(relay.TypeKind.ShapeVar)
-    t2 = relay.IncompleteType(relay.TypeKind.Type)
-    t3 = relay.IncompleteType(relay.TypeKind.Type)
-
-    # only equal when there is pointer equality
-    assert t2 == t2
-    assert t1 == t1
-    assert t1 != t2
-    assert t2 != t3
-
-
-def test_type_param_sequal():
-    t1 = relay.TypeVar("v1", relay.TypeKind.Type)
-    t2 = relay.TypeVar("v2", relay.TypeKind.ShapeVar)
-    t3 = relay.TypeVar("v3", relay.TypeKind.Type)
-
-    # only pointer equality and eq_map allow equal params
-    assert t1 == t1
-    assert t2 == t2
-    assert t1 != t2  # different kind
-    assert t1 != t3  # not in eq_map
-
-    # function types are the only way to put type params
-    # in eq map
-    ft1 = relay.FuncType(
-        tvm.runtime.convert([]), t1, tvm.runtime.convert([t1]), tvm.runtime.convert([])
-    )
-    ft2 = relay.FuncType(
-        tvm.runtime.convert([]), t3, tvm.runtime.convert([t3]), tvm.runtime.convert([])
-    )
-    # actually an invalid type because t2 is wrong kind
-    ft3 = relay.FuncType(
-        tvm.runtime.convert([]), t2, tvm.runtime.convert([t2]), tvm.runtime.convert([])
-    )
-
-    assert ft1 == ft2
-    assert ft1 != ft3  # kinds still do not match
-
-
-def test_func_type_sequal():
-    t1 = relay.TensorType((1, 2), "float32")
-    t2 = relay.TensorType((1, 2, 3), "float32")
-
-    tp1 = relay.TypeVar("v1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("v2", relay.TypeKind.Type)
-    tp3 = relay.TypeVar("v3", relay.TypeKind.ShapeVar)
-    tp4 = relay.TypeVar("v3", relay.TypeKind.ShapeVar)
-
-    broadcast = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast")
-    identity = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Identity")
-
-    tr1 = relay.TypeRelation(broadcast, tvm.runtime.convert([tp1, tp3]), 1, None)
-    tr2 = relay.TypeRelation(broadcast, tvm.runtime.convert([tp2, tp4]), 1, None)
-    tr3 = relay.TypeRelation(identity, tvm.runtime.convert([tp1, tp3]), 1, None)
-
-    ft = relay.FuncType(
-        tvm.runtime.convert([t1, t2]),
-        tp1,
-        tvm.runtime.convert([tp1, tp3]),
-        tvm.runtime.convert([tr1]),
-    )
-    translate_vars = relay.FuncType(
-        tvm.runtime.convert([t1, t2]),
-        tp2,
-        tvm.runtime.convert([tp2, tp4]),
-        tvm.runtime.convert([tr2]),
-    )
-    assert ft == translate_vars
-
-    different_args = relay.FuncType(
-        tvm.runtime.convert([t1]), tp1, tvm.runtime.convert([tp1, tp3]), tvm.runtime.convert([tr1])
-    )
-    assert ft != different_args
-
-    different_order = relay.FuncType(
-        tvm.runtime.convert([t2, t1]),
-        tp1,
-        tvm.runtime.convert([tp1, tp3]),
-        tvm.runtime.convert([tr1]),
-    )
-    assert ft != different_order
-
-    no_rel = relay.FuncType(
-        tvm.runtime.convert([t1, t2]), tp1, tvm.runtime.convert([tp1, tp3]), tvm.runtime.convert([])
-    )
-    assert ft != no_rel
-
-    more_vars = relay.FuncType(
-        tvm.runtime.convert([t1, t2]),
-        tp2,
-        tvm.runtime.convert([tp1, tp2, tp3]),
-        tvm.runtime.convert([tr1]),
-    )
-    assert ft != more_vars
-
-    all_the_vars = relay.FuncType(
-        tvm.runtime.convert([t1, t2]),
-        tp1,
-        tvm.runtime.convert([tp1, tp2, tp3, tp4]),
-        tvm.runtime.convert([tr1, tr2]),
-    )
-    assert ft != all_the_vars
-
-    different_rel = relay.FuncType(
-        tvm.runtime.convert([t1, t2]),
-        tp1,
-        tvm.runtime.convert([tp1, tp3]),
-        tvm.runtime.convert([tr3]),
-    )
-    assert ft != different_rel
-
-    more_rels = relay.FuncType(
-        tvm.runtime.convert([t1, t2]),
-        tp1,
-        tvm.runtime.convert([tp1, tp3]),
-        tvm.runtime.convert([tr1, tr3]),
-    )
-    assert ft != more_rels
-
-
-def test_tuple_type_sequal():
-    t1 = relay.TensorType((1, 2, 3), "float32")
-    t2 = relay.TensorType((1, 2, 3, 4), "float32")
-    tp1 = relay.TypeVar("v1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("v2", relay.TypeKind.Type)
-
-    tup1 = relay.TupleType(tvm.runtime.convert([t1, t2, tp1]))
-    tup2 = relay.TupleType(tvm.runtime.convert([t1, t2, tp1]))
-    tup3 = relay.TupleType(tvm.runtime.convert([t2, t1, tp1]))
-    tup4 = relay.TupleType(tvm.runtime.convert([t1, t2, tp2]))
-
-    # as long as types are alpha-equal and in same order,
-    # tuples should be alpha-equal
-    assert tup1 == tup2
-    assert tup1 != tup3
-    assert tup1 != tup4
-
-
-def test_type_relation_sequal():
-    t1 = relay.TensorType((1, 2), "float32")
-    t2 = relay.TensorType((1, 2, 3), "float32")
-    t3 = relay.TensorType((1, 2, 3, 4), "float32")
-
-    # functions are compared only by pointer equality so
-    # we need to be sure to use the same pointers
-    broadcast = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast")
-    identity = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Identity")
-
-    attr1 = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4))
-    attr1_same = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4))
-    attr2 = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4, 4))
-
-    tr = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr1)
-    same = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr1)
-    diff_func = relay.TypeRelation(identity, tvm.runtime.convert([t1, t2]), 1, attr1)
-    diff_order = relay.TypeRelation(broadcast, tvm.runtime.convert([t2, t1]), 1, attr1)
-    diff_args = relay.TypeRelation(broadcast, tvm.runtime.convert([t2, t3]), 1, attr1)
-    diff_attr = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr2)
-    same_attr = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr1_same)
-
-    bigger = relay.TypeRelation(identity, tvm.runtime.convert([t1, t3, t2]), 2, attr1)
-    diff_num_inputs = relay.TypeRelation(identity, tvm.runtime.convert([t1, t3, t2]), 1, attr2)
-
-    # func, number of args, input count, and order should be the same
-    assert tr == same
-    assert tr != diff_func
-    assert tr != diff_order
-    assert tr != diff_args
-    assert tr != diff_attr
-    assert tr == same_attr
-    assert tr != bigger
-
-    assert bigger != diff_num_inputs
-
-
-def test_type_call_sequal():
-    h1 = relay.GlobalTypeVar("h1")
-    h2 = relay.GlobalTypeVar("h2")
-    t1 = relay.TensorType((1, 2), "float32")
-    t2 = relay.TensorType((1, 2, 3), "float32")
-    t3 = relay.TensorType((1, 2, 3, 4), "float32")
-    t4 = relay.TensorType((), "float32")
-
-    tc = relay.TypeCall(h1, [t1, t2, t3])
-    same = relay.TypeCall(h1, [t1, t2, t3])
-
-    different_func = relay.TypeCall(h2, [t1, t2, t3])
-    different_arg = relay.TypeCall(h1, [t1, t2, t4])
-    fewer_args = relay.TypeCall(h1, [t1, t2])
-    more_args = relay.TypeCall(h1, [t1, t2, t3, t4])
-    different_order_args = relay.TypeCall(h1, [t3, t2, t1])
-
-    assert tc == same
-    assert tc != different_func
-    assert tc != fewer_args
-    assert tc != more_args
-    assert tc != different_order_args
-
-
-def test_constant_sequal():
-    x = relay.const(1)
-    y = relay.const(2)
-    assert consistent_equal(x, x)
-    assert not consistent_equal(x, y)
-    assert consistent_equal(x, relay.const(1))
-
-
-def test_type_node_sequal():
-    v1 = relay.TypeVar("v1", 6)
-    v2 = relay.TypeVar("v2", 6)
-    assert not consistent_equal(v1, v2)
-
-    v1 = relay.TypeVar("v1", 0)
-    v2 = relay.TypeVar("v2", 6)
-    assert not consistent_equal(v1, v2)
-
-
-def test_type_node_incompatible_sequal():
-    v1 = relay.TypeVar("v1", 6)
-    v2 = relay.Var("v2")
-    assert not consistent_equal(v1, v2)
-
-
-def test_expr_node_incompatible_sequal():
-    v1 = relay.Var("v1")
-    v2 = relay.PatternVar(relay.Var("v2"))
-    assert not consistent_equal(v1, v2)
-
-
-def test_var_sequal():
-    v1 = relay.Var("v1")
-    v2 = relay.Var("v2")
-
-    # normally only pointer equality
-    assert consistent_equal(v1, v1)
-    assert not consistent_equal(v1, v2)
-
-    # let node allows for setting the eq_map
-    l1 = relay.Let(v1, relay.const(1), v1)
-    l2 = relay.Let(v2, relay.const(1), v2)
-    l3 = relay.Let(v1, relay.const(1), v2)
-
-    assert consistent_equal(l1, l2)
-    assert not consistent_equal(l1, l3)
-
-    # type annotations
-    tt1 = relay.TensorType([], "int32")
-    tt2 = relay.TensorType([], "int32")
-    tt3 = relay.TensorType([], "int64")
-    v3 = relay.Var("v3", tt1)
-    v4 = relay.Var("v4", tt2)
-    v5 = relay.Var("v5", tt3)
-
-    l4 = relay.Let(v3, relay.const(1), v3)
-    l5 = relay.Let(v4, relay.const(1), v4)
-    l6 = relay.Let(v5, relay.const(1), v5)
-
-    # same annotations
-    assert consistent_equal(l4, l5)
-    # different annotations
-    assert not consistent_equal(l4, l6)
-    # one null annotation
-    assert not consistent_equal(l1, l4)
-
-
-def test_global_var_sequal():
-    v1 = relay.GlobalVar("v1")
-    v2 = relay.GlobalVar("v2")
-
-    # only pointer equality suffices (smoke test)
-    assert consistent_equal(v1, v1)
-    assert not consistent_equal(v1, v2)
-
-
-def test_tuple_sequal():
-    v0 = relay.Var("v0")
-    v1 = relay.Var("v1")
-    v2 = relay.Var("v2")
-
-    # unit value is a valid tuple
-    assert consistent_equal(relay.Tuple([]), relay.Tuple([]))
-
-    tup = relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
-    same = relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
-
-    assert consistent_equal(tup, same)
-
-    # use the eq_map
-
-    let_tup = relay.Let(v1, tup, v1)
-    let_mapped = relay.Let(
-        v2, relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])]), v2
-    )
-
-    assert consistent_equal(let_tup, let_mapped)
-
-    more_fields = relay.Tuple(
-        [v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)]), v2]
-    )
-    assert not consistent_equal(tup, more_fields)
-
-    fewer_fields = relay.Tuple([v1, relay.const(2), relay.const(3)])
-    assert not consistent_equal(tup, fewer_fields)
-
-    different_end = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(5)])])
-    assert not consistent_equal(tup, different_end)
-
-    different_start = relay.Tuple(
-        [v2, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])]
-    )
-    assert not consistent_equal(tup, different_start)
-
-    longer_at_end = relay.Tuple(
-        [v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4), relay.const(5)])]
-    )
-    assert not consistent_equal(tup, longer_at_end)
-
-
-def test_tuple_get_item_sequal():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    assert not consistent_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(y, 1))
-    assert not consistent_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 2))
-    assert consistent_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 1))
-
-
-def test_function_attr():
-    x0 = relay.var("x0", shape=(10, 10))
-    w00 = relay.var("w00", shape=(10, 10))
-    w01 = relay.var("w01", shape=(10, 10))
-    w02 = relay.var("w02", shape=(10, 10))
-    z00 = relay.add(x0, w00)
-    p00 = relay.subtract(z00, w01)
-    q00 = relay.multiply(p00, w02)
-    func0 = relay.Function([x0, w00, w01, w02], q00)
-    func0 = func0.with_attr("FuncName", "a")
-
-    x1 = relay.var("x1", shape=(10, 10))
-    w10 = relay.var("w10", shape=(10, 10))
-    w11 = relay.var("w11", shape=(10, 10))
-    w12 = relay.var("w12", shape=(10, 10))
-    z10 = relay.add(x1, w10)
-    p10 = relay.subtract(z10, w11)
-    q10 = relay.multiply(p10, w12)
-    func1 = relay.Function([x1, w10, w11, w12], q10)
-    func1 = func1.with_attr("FuncName", "b")
-    assert not consistent_equal(func0, func1)
-
-
-def test_function_sequal():
-    tt1 = relay.TensorType((1, 2, 3), "float32")
-    tt2 = relay.TensorType((4, 5, 6), "int8")
-    tt3 = relay.TupleType([tt1, tt2])
-
-    v1 = relay.Var("v1", tt1)
-    v2 = relay.Var("v2", tt2)
-    v3 = relay.Var("v3", tt3)
-    v4 = relay.Var("v4", tt2)
-    vret = relay.Constant(tvm.nd.array(np.ones(1)))
-
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.Type)
-    tp3 = relay.TypeVar("tp3", relay.TypeKind.ShapeVar)
-    tp4 = relay.TypeVar("tp4", relay.TypeKind.ShapeVar)
-
-    basic_args = [relay.Var("v3", tt1), relay.Var("v4", tt2)]
-    basic_tps = [tp1, tp2]
-
-    func = relay.Function([v1, v2], v1, tt2, basic_tps)
-    mapped = relay.Function(basic_args, basic_args[0], tt2, basic_tps)
-    assert consistent_equal(func, mapped)
-
-    fewer_params = relay.Function([relay.Var("v4", tt2)], v4, tt2, basic_tps)
-    assert not consistent_equal(func, fewer_params)
-
-    more_params = relay.Function(
-        [relay.Var("v3", tt1), relay.Var("v4", tt2), relay.Var("v2", tt2)], v4, tt2, basic_tps
-    )
-    assert not consistent_equal(func, more_params)
-
-    params_unordered = relay.Function([v2, v1], v1, tt2, basic_tps)
-    assert not consistent_equal(func, params_unordered)
-
-    params_mismatch = relay.Function([v1, v3], v1, tt2, basic_tps)
-    assert not consistent_equal(func, params_mismatch)
-
-    # also would not typecheck
-    ret_type_mismatch = relay.Function(basic_args, v4, tt1, basic_tps)
-    assert not consistent_equal(func, ret_type_mismatch)
-
-    # also mis-typed
-    different_body = relay.Function(basic_args, v3, tt2, basic_tps)
-    assert not consistent_equal(func, different_body)
-
-    fewer_type_params = relay.Function(basic_args, v4, tt2, [tp1])
-    assert not consistent_equal(func, fewer_type_params)
-
-    more_type_params = relay.Function(basic_args, v4, tt2, [tp1, tp2, tp3])
-    assert not consistent_equal(func, more_type_params)
-
-    type_params_unordered = relay.Function(basic_args, v4, tt2, [tp2, tp1])
-    assert not consistent_equal(func, type_params_unordered)
-
-    different_type_params = relay.Function(basic_args, v4, tt2, [tp3, tp4])
-    assert not consistent_equal(func, different_type_params)
-
-    # a well-typed example that also differs in body, ret type, and type params
-    tupled_example = relay.Function(basic_args, relay.Tuple([v3, v4]), tt3)
-    assert not consistent_equal(func, tupled_example)
-
-    # nullable
-    no_ret_type = relay.Function(basic_args, v4, None, [tp1, tp2])
-    # both null
-    assert consistent_equal(no_ret_type, no_ret_type)
-    # one null
-    assert not consistent_equal(func, no_ret_type)
-    assert not consistent_equal(no_ret_type, func)
-
-
-def test_call_sequal():
-    v1 = relay.Var("v1")
-    v2 = relay.Var("v2")
-
-    attr1 = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4))
-    attr1_same = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4))
-    attr2 = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4, 4))
-
-    tt1 = relay.TensorType((1, 2, 3), "float32")
-    tt2 = relay.TensorType((), "int8")
-
-    basic_args = [relay.const(1), relay.const(2), v2, relay.Tuple([])]
-
-    # manually writing out args to ensure that args does not rely on
-    # pointer equality
-    call = relay.Call(v1, [relay.const(1), relay.const(2), v2, relay.Tuple([])], attr1, [tt1])
-    same = relay.Call(v1, basic_args, attr1, [tt1])
-    assert consistent_equal(call, same)
-
-    different_fn = relay.Call(v2, basic_args, attr1, [tt1])
-    assert not consistent_equal(call, different_fn)
-
-    fewer_args = relay.Call(v1, [relay.const(1), relay.const(2), v2], attr1, [tt1])
-    assert not consistent_equal(call, fewer_args)
-
-    reordered_args = relay.Call(
-        v1, [relay.const(2), relay.const(1), relay.Tuple([]), v2], attr1, [tt1]
-    )
-    assert not consistent_equal(call, reordered_args)
-
-    different_args = relay.Call(v1, [relay.const(1), relay.const(2), relay.const(3)], attr1, [tt1])
-    assert not consistent_equal(call, different_args)
-
-    more_args = relay.Call(
-        v1,
-        [relay.const(1), relay.const(2), v2, relay.Tuple([]), relay.const(3), relay.const(4)],
-        attr1,
-        [tt1],
-    )
-    assert not consistent_equal(call, more_args)
-
-    different_attrs = relay.Call(v1, basic_args, attr2, [tt1])
-    assert not consistent_equal(call, different_attrs)
-
-    same_attrs = relay.Call(v1, basic_args, attr1_same, [tt1])
-    assert consistent_equal(call, same_attrs)
-
-    no_type_args = relay.Call(v1, basic_args, attr1)
-    assert not consistent_equal(call, no_type_args)
-
-    more_type_args = relay.Call(v1, basic_args, attr1, [tt1, tt2])
-    assert not consistent_equal(call, more_type_args)
-
-    different_type_arg = relay.Call(v1, basic_args, attr1, [tt2])
-    assert not consistent_equal(call, different_type_arg)
-
-
-def test_let_sequal():
-    tt1 = relay.TensorType((), "float32")
-    tt2 = relay.TensorType((), "int8")
-    v1 = relay.Var("v1")
-    v1_wtype = relay.Var("v1", tt1)
-    v2 = relay.Var("v2")
-    v3 = relay.Var("v3")
-
-    let = relay.Let(v1, relay.const(2), v1)
-    mapped = relay.Let(v2, relay.const(2), v2)
-    assert consistent_equal(let, mapped)
-
-    mismatched_var = relay.Let(v2, relay.const(2), v3)
-    assert not consistent_equal(let, mismatched_var)
-
-    different_value = relay.Let(v2, relay.const(3), v2)
-    assert not consistent_equal(let, different_value)
-
-    different_body = relay.Let(v2, relay.const(3), relay.const(12))
-    assert not consistent_equal(let, different_body)
-
-    # specified types must match
-
-    let_with_type = relay.Let(v1_wtype, relay.const(2), v1_wtype)
-    same_type = relay.Let(v1_wtype, relay.const(2), v1_wtype)
-    assert consistent_equal(let_with_type, same_type)
-    assert not consistent_equal(let, let_with_type)
-    v2 = relay.Var("v1", tt2)
-    different_type = relay.Let(v2, relay.const(2), v2)
-    assert not consistent_equal(let_with_type, different_type)
-
-
-def test_if_sequal():
-    v1 = relay.Var("v1")
-    v2 = relay.Var("v2")
-
-    if_sample = relay.If(v1, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
-    same = relay.If(v1, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
-    assert consistent_equal(if_sample, same)
-
-    different_cond = relay.If(v2, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
-    assert not consistent_equal(if_sample, different_cond)
-
-    different_true = relay.If(v1, relay.const(2), relay.Tuple([relay.const(2), relay.const(3)]))
-    assert not consistent_equal(if_sample, different_true)
-
-    different_false = relay.If(v1, relay.const(1), relay.Tuple([]))
-    assert not consistent_equal(if_sample, different_false)
-
-
-def test_constructor_sequal():
-    # smoke test: it should be pointer equality
-    mod = tvm.IRModule()
-    p = relay.prelude.Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-
-    assert consistent_equal(nil, nil)
-    assert consistent_equal(cons, cons)
-    assert not consistent_equal(nil, cons)
-
-
-def test_match_sequal():
-    mod = tvm.IRModule()
-    p = relay.prelude.Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-    _, none, some = p.mod.get_type("Option")
-
-    x = relay.Var("x")
-    y = relay.Var("y")
-    nil_case = relay.Clause(relay.PatternConstructor(nil), nil())
-    cons_case = relay.Clause(
-        relay.PatternConstructor(cons, [relay.PatternVar(x), relay.PatternVar(y)]), cons(x, y)
-    )
-
-    z = relay.Var("z")
-    a = relay.Var("a")
-    equivalent_cons = relay.Clause(
-        relay.PatternConstructor(cons, [relay.PatternVar(z), relay.PatternVar(a)]), cons(z, a)
-    )
-
-    data = cons(relay.const(1), cons(relay.const(2), nil()))
-
-    match = relay.Match(data, [nil_case, cons_case])
-    equivalent = relay.Match(data, [nil_case, equivalent_cons])
-    empty = relay.Match(data, [])
-    no_cons = relay.Match(data, [nil_case])
-    no_nil = relay.Match(data, [cons_case])
-    different_data = relay.Match(nil(), [nil_case, cons_case])
-    different_order = relay.Match(data, [cons_case, nil_case])
-    different_nil = relay.Match(
-        data, [relay.Clause(relay.PatternConstructor(nil), cons(nil(), nil())), cons_case]
-    )
-    different_cons = relay.Match(
-        data,
-        [
-            nil_case,
-            relay.Clause(
-                relay.PatternConstructor(cons, [relay.PatternWildcard(), relay.PatternWildcard()]),
-                nil(),
-            ),
-        ],
-    )
-    another_case = relay.Match(
-        data, [nil_case, cons_case, relay.Clause(relay.PatternWildcard(), nil())]
-    )
-    wrong_constructors = relay.Match(
-        data,
-        [
-            relay.Clause(relay.PatternConstructor(none), nil()),
-            relay.Clause(relay.PatternConstructor(some, [relay.PatternVar(x)]), cons(x, nil())),
-        ],
-    )
-
-    tvm.ir.assert_structural_equal(match, match)
-    assert consistent_equal(match, match)
-    assert consistent_equal(match, equivalent)
-    assert not consistent_equal(match, no_cons)
-    assert not consistent_equal(match, no_nil)
-    assert not consistent_equal(match, empty)
-    assert not consistent_equal(match, different_data)
-    assert not consistent_equal(match, different_order)
-    assert not consistent_equal(match, different_nil)
-    assert not consistent_equal(match, different_cons)
-    assert not consistent_equal(match, another_case)
-    assert not consistent_equal(match, wrong_constructors)
-
-
-def test_op_sequal():
-    # only checks names
-    op1 = relay.op.get("add")
-    op2 = relay.op.get("add")
-    assert consistent_equal(op1, op2)
-
-    op3 = relay.op.get("take")
-    assert not consistent_equal(op1, op3)
-
-
-def test_graph_equal():
-    x = relay.var("x")
-
-    y0 = relay.add(x, x)
-    z0 = relay.add(y0, y0)
-
-    y1 = relay.add(x, x)
-    z1 = relay.add(y1, y1)
-
-    z3 = relay.add(relay.add(x, x), relay.add(x, x))
-
-    assert consistent_equal(z0, z1)
-    assert consistent_equal(z0, z1)
-
-    # z3's dataflow format is different from z0
-    # z0 is computed from a common y0 node
-    # Relay view them as different programs
-    # Check the difference in the text format.
-    assert not consistent_equal(z0, z3)
-
-
-def test_hash_unequal():
-    x1 = relay.var("x1", shape=(10, 10), dtype="float32")
-    y1 = relay.var("y1", shape=(10, 10), dtype="float32")
-    func1 = relay.Function([x1, y1], relay.add(x1, y1))
-
-    # func2 is exactly same structure with same variables shapes and dtypes
-    x2 = relay.var("x2", shape=(10, 10), dtype="float32")
-    y2 = relay.var("y2", shape=(10, 10), dtype="float32")
-    func2 = relay.Function([x2, y2], relay.add(x2, y2))
-
-    assert consistent_equal(func1, func2)
-
-    # func3 is same as func1 but with different var shapes
-    x3 = relay.var("x3", shape=(20, 10), dtype="float32")
-    y3 = relay.var("y3", shape=(20, 10), dtype="float32")
-    func3 = relay.Function([x3, y3], relay.add(x3, y3))
-
-    assert not consistent_equal(func1, func3)
-
-
-def test_tuple_match():
-    a = relay.Var("a")
-    b = relay.Var("b")
-    clause = relay.Clause(relay.PatternTuple([relay.PatternVar(a), relay.PatternVar(b)]), a + b)
-    x = relay.Match(relay.Tuple([relay.const(1), relay.const(1)]), [clause])
-
-    a = relay.Var("a")
-    b = relay.Var("b")
-    clause = relay.Clause(relay.PatternTuple([relay.PatternVar(a), relay.PatternVar(b)]), a + b)
-    y = relay.Match(relay.Tuple([relay.const(1), relay.const(1)]), [clause])
-    assert consistent_equal(x, y)
-
-
-def test_fn_attribute():
-    # create function that performs add
-    a = relay.var("a", shape=(10, 10))
-    b = relay.var("b", shape=(10, 10))
-    add = relay.add(a, b)
-    add_fn = relay.Function([a, b], add)
-    add_fn = run_opt_pass(add_fn, relay.transform.InferType())
-
-    # create function that performs add with test attribute
-    c = relay.var("c", shape=(10, 10))
-    d = relay.var("d", shape=(10, 10))
-    add_1 = relay.add(c, d)
-    add_1_fn = relay.Function([c, d], add_1)
-    add_1_fn = add_1_fn.with_attr("TestAttribute", "test")
-    add_1_fn = run_opt_pass(add_1_fn, relay.transform.InferType())
-
-    assert not consistent_equal(add_1_fn, add_fn)
-    assert not consistent_equal(add_fn, add_1_fn)
-
-
-def test_fn_vid_map():
-    def get_fn(with_vid):
-        x = relay.var("x", shape=(10,), dtype="float32")
-        f = relay.Function([x], x).with_attr("dict", {x.vid: 1} if with_vid else {x: 1})
-        return f
-
-    assert consistent_equal(get_fn(True), get_fn(True))
-    assert consistent_equal(get_fn(False), get_fn(False))
-
-
-def test_lets():
-    shape = (5, 5)
-
-    def func1():
-        sb = relay.ScopeBuilder()
-        p0 = relay.var("p0", shape=shape)
-        p1 = relay.var("p1", shape=shape)
-        a0 = sb.let("a0", relay.add(p0, relay.const(1)))
-        a1 = sb.let("a1", relay.add(p1, relay.const(1)))
-        a2 = sb.let("a2", relay.add(a0, a1))
-        sb.ret(a2)
-        return relay.Function([p0, p1], sb.get())
-
-    def func2():
-        # Alpha conversion is structurally equal
-        sb = relay.ScopeBuilder()
-        p0 = relay.var("p0", shape=shape)
-        p1 = relay.var("p1", shape=shape)
-        a1 = sb.let("a1", relay.add(p0, relay.const(1)))
-        a0 = sb.let("a0", relay.add(p1, relay.const(1)))
-        a2 = sb.let("a2", relay.add(a1, a0))
-        sb.ret(a2)
-        return relay.Function([p0, p1], sb.get())
-
-    def func3():
-        # But changing the order of bindings is not structurally equal
-        # (even though algebraically equal)
-        sb = relay.ScopeBuilder()
-        p0 = relay.var("p0", shape=shape)
-        p1 = relay.var("p1", shape=shape)
-        a1 = sb.let("a1", relay.add(p1, relay.const(1)))
-        a0 = sb.let("a0", relay.add(p0, relay.const(1)))
-        a2 = sb.let("a2", relay.add(a1, a0))
-        sb.ret(a2)
-        return relay.Function([p0, p1], sb.get())
-
-    tvm.ir.assert_structural_equal(func1(), func2())
-    assert not tvm.ir.structural_equal(func1(), func3())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
deleted file mode 100644
index b1599c1b919f..000000000000
--- a/tests/python/relay/test_ir_text_printer.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import relay
-from tvm.relay import testing
-import numpy as np
-from tvm.relay import Expr
-from tvm.relay.analysis import free_vars
-import pytest
-
-DEBUG_PRINT = False
-
-SEMVER = '#[version = "0.0.5"]\n'
-
-
-def astext(program, unify_free_vars=False):
-    text = program.astext()
-
-    if isinstance(program, Expr):
-        roundtrip_program = tvm.relay.parse_expr(text)
-    else:
-        roundtrip_program = tvm.relay.fromtext(text)
-
-    tvm.ir.assert_structural_equal(roundtrip_program, program, map_free_vars=True)
-
-    return text
-
-
-def show(text):
-    if DEBUG_PRINT:
-        print("---------------------------")
-        print(text)
-
-
-def assert_prints_as(expr, str):
-    assert astext(expr) == SEMVER + str
-
-
-def test_scalars():
-    assert_prints_as(relay.const(42, "int16"), "42i16")
-    assert_prints_as(relay.const(42, "int32"), "42")
-    assert_prints_as(relay.const(42, "int64"), "42i64")
-    assert_prints_as(relay.const(3.0, "float16"), "3f16")
-    assert_prints_as(relay.const(3.0, "float32"), "3f")
-    assert_prints_as(relay.const(3.0, "float64"), "3f64")
-
-
-def test_large_graph():
-    x = relay.var("x", shape=(3, 2))
-    y = relay.var("y")
-    one = relay.const(10e10, dtype="float32")
-    z = relay.add(x, one)
-    for i in range(int(9e4)):
-        z = relay.add(z, one)
-    f = relay.Function([x, y], z)
-    show(astext(f))
-
-
-def test_func():
-    x = relay.var("x", shape=(3, 2))
-    y = relay.var("y")
-    one = relay.const(10e10, dtype="float32")
-    z = relay.add(x, one)
-    z = relay.add(z, z)
-    f = relay.Function([x, y], z)
-    show(astext(z))
-    show(astext(f))
-
-
-def test_mod():
-    x = relay.var("x", "float32")
-    y = relay.var("y", "float32")
-    z = relay.add(x, y)
-    z = relay.add(z, z)
-    f = relay.Function([x, y], z)
-    mod = tvm.IRModule()
-    mod["myf"] = f
-    mod = relay.transform.InferType()(mod)
-    text = astext(mod)
-    assert "def @myf" in text
-    assert "def @myf" in str(mod)
-    assert "add(%0, %0) /* ty=float32 */" in text
-    assert "add(%0, %0) /* ty=float32 */" in str(mod)
-    show(mod.astext(annotate=lambda x: str(x.checked_type.dtype) if type(x) == relay.Call else ""))
-    show(text)
-
-
-def test_meta_data():
-    n, c, h, w = te.size_var("n"), 10, 224, 224
-    x = relay.var("x", shape=(n, c, h, w))
-    w = relay.var("w")
-    z = relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1), channels=2)
-    f = relay.Function([x, w], z)
-    text = astext(f, unify_free_vars=True)
-    text_no_meta = str(f)
-    assert "channels=2" in text
-    assert "channels=2" in text_no_meta
-    assert "meta[tir.SizeVar][0]" in text
-    assert "meta[tir.SizeVar][0]" in text_no_meta
-    assert "type_key" in text
-    assert "type_key" not in text_no_meta
-
-    text = astext(relay.const([1, 2, 3]))
-    assert "meta[relay.Constant][0]" in text
-
-
-def test_call_attrs():
-    x = relay.var("x")
-    # non default args
-    z = relay.nn.softmax(x, axis=2)
-    assert "axis=2" in astext(z)
-    # default args
-    z = relay.nn.softmax(x)
-    assert "softmax(%x)" in astext(z)
-    # non default args
-    z = relay.expand_dims(x, axis=2, num_newaxis=2)
-    assert "num_newaxis=2" in astext(z)
-
-
-def test_let_if_scope():
-    x = relay.var("x", "float32")
-    y = relay.var("y", "float32")
-    cond = relay.var("cond", "bool")
-
-    sb = relay.ScopeBuilder()
-    with sb.if_scope(cond):
-        v1 = sb.let("v", relay.const(1, "float32"))
-        v2 = sb.let("v", x)
-        sb.ret(relay.subtract(v1, v2))
-    with sb.else_scope():
-        v3 = relay.var("v")
-        let2 = relay.Let(v3, y, v3)
-        sb.ret(relay.add(let2, let2))
-    result = sb.get()
-
-    f = relay.Function([x, y, cond], result)
-    text = astext(f)
-    assert text.count("{") == 3
-    assert "%cond: bool" in text
-    show(astext(f))
-
-
-def test_variable_name():
-    # avoid pure number even if the namehint is pure number
-    v1 = relay.var("1")
-    assert "%v1" in astext(v1)
-
-
-def test_mlp():
-    net, _ = tvm.relay.testing.mlp.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_resnet():
-    net, _ = tvm.relay.testing.resnet.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_mobilenet():
-    net, _ = tvm.relay.testing.mobilenet.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_dqn():
-    net, _ = tvm.relay.testing.dqn.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_dcgan():
-    net, _ = tvm.relay.testing.dcgan.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_lstm():
-    net, _ = tvm.relay.testing.lstm.get_workload(1, 1)
-    astext(net)
-
-    net, _ = tvm.relay.testing.lstm.get_workload(4, 4)
-    astext(net)
-
-
-def test_inception_v3():
-    net, _ = tvm.relay.testing.inception_v3.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_squeezenet():
-    for version in ["1.0", "1.1"]:
-        net, _ = tvm.relay.testing.squeezenet.get_workload(batch_size=1, version=version)
-        astext(net)
-
-
-def test_densenet():
-    net, _ = tvm.relay.testing.densenet.get_workload(batch_size=1)
-    astext(net)
-
-
-def test_call_node_order():
-    x = relay.var("x")
-    y = relay.var("y")
-    prog = relay.Call(
-        relay.Function([x], x), [relay.Call(relay.Function([y], y), [relay.const(1)])]
-    )
-    assert astext(prog) == SEMVER + (
-        "%0 = fn (%y) {\n"
-        "  %y\n"
-        "};\n"
-        "%1 = %0(1);\n"
-        "%2 = fn (%x) {\n"
-        "  %x\n"
-        "};\n"
-        "%2(%1)"
-    )
-
-
-def test_let_inlining():
-    tup = relay.Tuple([relay.const(0), relay.const(0)])
-    x = relay.var("x")
-    assert astext(relay.Let(x, tup, tup)) == SEMVER + ("%0 = (0, 0);\n" "let %x = %0;\n" "%0")
-
-    assert astext(relay.Let(x, tup, x)) == SEMVER + ("let %x = (0, 0);\n" "%x")
-
-
-def test_zeros():
-    x = relay.op.zeros([], "float32")
-    astext(x)
-
-
-def test_unapplied_constructor():
-    type_def_str = r"""
-type List[A] {
-  Cons(A, List[A]),
-  Nil,
-}
-    """
-    main_def_str = r"""
-def @main[A]() -> fn (A, List[A]) -> List[A] {
-  Cons
-}
-    """
-    mod = tvm.relay.parse(SEMVER + type_def_str + main_def_str)
-    mod_str = str(mod)
-    # ensure constructors are printed correctly in type definitions (with their
-    # signature) and as exprs (without their signature)
-    assert type_def_str.strip() in mod_str
-    assert main_def_str.strip() in mod_str
-
-
-def test_null_attribute():
-    x = relay.var("x")
-    y = relay.var("y")
-    z = relay.Function([x], y)
-    z = z.with_attr("TestAttribute", None)
-    txt = astext(z)
-    assert "TestAttribute=None" in txt
-
-
-def test_span():
-    x = relay.var("x", shape=(3, 2))
-    y = relay.var("y")
-    one = relay.const(10e10, dtype="float32")
-    z = relay.add(x, one)
-    z = relay.Call(
-        z.op, z.args, z.attrs, z.type_args, relay.Span(relay.SourceName("Add0"), 0, 0, 0, 0)
-    )
-    z = relay.add(z, z)
-    z = relay.Call(
-        z.op, z.args, z.attrs, z.type_args, relay.Span(relay.SourceName("Add1"), 0, 0, 0, 0)
-    )
-    f = relay.Function([x, y], z)
-    txt = astext(f)
-    assert "Add0" in txt
-    assert "Add1" in txt
-
-
-def test_optional_info():
-    c = relay.const(1)
-    call = relay.add(c, c)
-    m = tvm.IRModule.from_expr(call)
-    m = relay.transform.InferType()(m)
-    txt = astext(m)
-    assert txt.count("/* ty=int32 */") == 3
-
-
-def test_slash_in_identifier():
-    x = relay.var("base/x")
-    y = relay.var("base/y")
-    z = x + y
-    txt = astext(z)
-    assert "base/x" in txt
-    assert "base/y" in txt
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py
deleted file mode 100644
index 44750ad0643e..000000000000
--- a/tests/python/relay/test_ir_well_formed.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.analysis import well_formed
-from tvm.relay.prelude import Prelude
-
-
-def test_let():
-    x = relay.Var("x")
-    assert well_formed(x)
-    v = relay.Constant(tvm.nd.array(10))
-    ty = None
-    let = relay.Let(x, v, x)
-    assert well_formed(let)
-    assert not well_formed(relay.Let(x, v, let))
-    f = relay.Function([x], x, ty)
-    assert well_formed(f)
-    assert well_formed(relay.Let(relay.Var("y"), f, relay.Let(relay.Var("z"), f, v)))
-
-
-def test_tuple():
-    x = relay.Var("x")
-    assert well_formed(x)
-    v = relay.Constant(tvm.nd.array(10))
-    let = relay.Let(x, v, x)
-    assert well_formed(let)
-    assert well_formed(relay.Tuple([v, v]))
-    assert not well_formed(relay.Tuple([let, relay.Let(x, v, x)]))
-
-
-def test_tuple_get_item():
-    t = relay.Var("t")
-    assert well_formed(relay.TupleGetItem(t, 2))
-
-
-def test_adt():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, none, some = p.mod.get_type("Option")
-    x = relay.Var("x")
-    some_case = relay.Clause(relay.PatternConstructor(some, [relay.PatternVar(x)]), x)
-    default_case = relay.Clause(relay.PatternVar(x), x)
-    m0 = relay.Match(none(), [default_case])
-    m1 = relay.Match(none(), [some_case, default_case])
-    assert well_formed(m0)
-    assert not well_formed(m1)
-
-
-if __name__ == "__main__":
-    test_let()
-    test_tuple()
-    test_tuple_get_item()
-    test_adt()
diff --git a/tests/python/relay/test_json_compact.py b/tests/python/relay/test_json_compact.py
deleted file mode 100644
index 65381a0eb9ee..000000000000
--- a/tests/python/relay/test_json_compact.py
+++ /dev/null
@@ -1,446 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import json
-
-import tvm
-import tvm.testing
-from tvm import relay
-
-# 0.6 BACKWARDS COMPATIBILITY TESTS
-
-
-def test_type_var():
-    # type var in 0.6
-    nodes = [
-        {"type_key": ""},
-        {"type_key": "relay.TypeVar", "attrs": {"kind": "0", "span": "0", "var": "2"}},
-        {"type_key": "Variable", "attrs": {"dtype": "int32", "name": "in0"}},
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, tvm.ir.TypeVar)
-    assert tvar.name_hint == "in0"
-    nodes[1]["type_key"] = "relay.GlobalTypeVar"
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, tvm.ir.GlobalTypeVar)
-    assert tvar.name_hint == "in0"
-
-
-def test_var():
-    # type var in 0.6
-    nodes = [
-        {"type_key": ""},
-        {
-            "type_key": "relay.Var",
-            "attrs": {
-                "_checked_type_": "0",
-                "span": "0",
-                "type_annotation": "0",
-                "vid": "2",
-            },
-        },
-        {"type_key": "relay.Id", "attrs": {"name_hint": "a3"}},
-        {"type_key": "relay.TensorType", "attrs": {"dtype": "float32", "shape": "4", "span": "0"}},
-        {"type_key": "Array", "data": [5, 6]},
-        {"type_key": "IntImm", "attrs": {"dtype": "int32", "value": "16", "span": "0"}},
-        {"type_key": "IntImm", "attrs": {"dtype": "int32", "value": "8", "span": "0"}},
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, relay.Var)
-    assert tvar.name_hint == "a3"
-
-
-def test_incomplete_type():
-    nodes = [
-        {"type_key": ""},
-        {"type_key": "relay.IncompleteType", "attrs": {"kind": "0", "span": "0"}},
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, tvm.ir.IncompleteType)
-
-
-def test_func_tuple_type():
-    nodes = [
-        {"type_key": ""},
-        {
-            "type_key": "relay.FuncType",
-            "attrs": {
-                "arg_types": "2",
-                "ret_type": "3",
-                "span": "0",
-                "type_constraints": "6",
-                "type_params": "5",
-            },
-        },
-        {"type_key": "Array"},
-        {"type_key": "relay.TupleType", "attrs": {"fields": "4", "span": "0"}},
-        {"type_key": "Array"},
-        {"type_key": "Array"},
-        {"type_key": "Array"},
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, tvm.ir.FuncType)
-
-
-def test_global_var():
-    nodes = [
-        {"type_key": ""},
-        {
-            "type_key": "relay.GlobalVar",
-            "attrs": {"_checked_type_": "0", "name_hint": "x", "span": "0", "struct_info_": "0"},
-        },
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, tvm.ir.GlobalVar)
-    nodes = [
-        {"type_key": ""},
-        {
-            "type_key": "GlobalVar",
-            "attrs": {"_checked_type_": "0", "name_hint": "x", "span": "0", "struct_info_": "0"},
-        },
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    tvar = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(tvar, tvm.ir.GlobalVar)
-
-
-def test_op():
-    nodes = [{"type_key": ""}, {"type_key": "relay.Op", "global_key": "nn.conv2d"}]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    op = tvm.ir.load_json(json.dumps(data))
-    assert op == relay.op.get("nn.conv2d")
-
-
-def test_tir_var():
-    nodes = [
-        {"type_key": ""},
-        {"type_key": "Variable", "attrs": {"dtype": "int32", "name": "x", "span": "0"}},
-        {"type_key": "SizeVar", "attrs": {"dtype": "int32", "name": "y", "span": "0"}},
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    x = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(x, tvm.tir.Var)
-    assert x.name == "x"
-    data["root"] = 2
-    y = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(y, tvm.tir.SizeVar)
-    assert y.name == "y"
-
-
-def test_str_map():
-    nodes = [
-        {"type_key": ""},
-        {"type_key": "StrMap", "keys": ["z", "x"], "data": [2, 3]},
-        {"type_key": "IntImm", "attrs": {"dtype": "int32", "value": "2", "span": "0"}},
-        {"type_key": "Max", "attrs": {"a": "4", "b": "10", "dtype": "int32", "span": "0"}},
-        {"type_key": "Add", "attrs": {"a": "5", "b": "9", "dtype": "int32", "span": "0"}},
-        {"type_key": "Add", "attrs": {"a": "6", "b": "8", "dtype": "int32", "span": "0"}},
-        {
-            "type_key": "tir.Var",
-            "attrs": {"dtype": "int32", "name": "7", "type_annotation": "0", "span": "0"},
-        },
-        {"type_key": "runtime.String", "repr_str": "x"},
-        {"type_key": "IntImm", "attrs": {"dtype": "int32", "value": "1", "span": "0"}},
-        {"type_key": "IntImm", "attrs": {"dtype": "int32", "value": "2", "span": "0"}},
-        {"type_key": "IntImm", "attrs": {"dtype": "int32", "value": "100", "span": "0"}},
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.6.0"},
-        "b64ndarrays": [],
-    }
-    x = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(x, tvm.ir.container.Map)
-    assert len(x) == 2
-    assert "x" in x
-    assert "z" in x
-    assert bool(x["z"] == 2)
-
-
-# 0.7 BACKWARDS COMPATIBILITY TESTS
-
-
-def test_irmodule_attributes():
-    nodes = [
-        {"type_key": ""},
-        {
-            "type_key": "IRModule",
-            "attrs": {
-                "functions": "0",
-                "global_type_var_map_": "0",
-                "global_var_map_": "0",
-                "source_map": "0",
-                "type_definitions": "0",
-                "global_infos": "0",
-            },
-        },
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.7.0"},
-        "b64ndarrays": [],
-    }
-    mod = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(mod, tvm.ir.IRModule)
-    # IRModule attributes should defualt to null
-    assert not mod.attrs
-
-
-# 0.8 BACKWARDS COMPATIBILITY TESTS
-
-
-def test_virtual_device():
-    nodes = [
-        {"type_key": ""},
-        {
-            "type_key": "relay.Function",
-            "attrs": {
-                "_checked_type_": "0",
-                "attrs": "0",
-                "body": "0",
-                "params": "0",
-                "ret_type": "0",
-                "span": "0",
-                "type_params": "0",
-            },
-        },
-    ]
-    data = {
-        "root": 1,
-        "nodes": nodes,
-        "attrs": {"tvm_version": "0.8.0"},
-        "b64ndarrays": [],
-    }
-    func = tvm.ir.load_json(json.dumps(data))
-    assert isinstance(func, relay.Function)
-    assert not func.virtual_device_
-
-
-def test_v0_16_ramp_broadcast_lanes():
-    json_graph_v0_15 = {
-        "root": 1,
-        "nodes": [
-            {"type_key": ""},
-            {
-                "type_key": "tir.BufferStore",
-                "attrs": {"buffer": "2", "indices": "16", "span": "0", "value": "14"},
-            },
-            {
-                "type_key": "tir.Buffer",
-                "attrs": {
-                    "axis_separators": "11",
-                    "buffer_type": "1",
-                    "data": "3",
-                    "data_alignment": "64",
-                    "dtype": "int32",
-                    "elem_offset": "12",
-                    "name": "13",
-                    "offset_factor": "1",
-                    "shape": "8",
-                    "span": "0",
-                    "strides": "10",
-                },
-            },
-            {
-                "type_key": "tir.Var",
-                "attrs": {"dtype": "handle", "name": "4", "span": "0", "type_annotation": "5"},
-            },
-            {"type_key": "runtime.String", "repr_str": "buffer"},
-            {"type_key": "PointerType", "attrs": {"element_type": "6", "storage_scope": "7"}},
-            {"type_key": "PrimType", "attrs": {"dtype": "int32"}},
-            {"type_key": "runtime.String"},
-            {"type_key": "Array", "data": [9]},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "50"}},
-            {"type_key": "Array"},
-            {"type_key": "Array"},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "0"}},
-            {"type_key": "runtime.String", "repr_str": "buffer"},
-            {
-                "type_key": "tir.Broadcast",
-                "attrs": {"dtype": "int32x12", "lanes": "12", "span": "0", "value": "15"},
-            },
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "3"}},
-            {"type_key": "Array", "data": [17]},
-            {
-                "type_key": "tir.Ramp",
-                "attrs": {
-                    "base": "18",
-                    "dtype": "int32x12",
-                    "lanes": "12",
-                    "span": "0",
-                    "stride": "19",
-                },
-            },
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "11"}},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "1"}},
-        ],
-        "b64ndarrays": [],
-        "attrs": {"tvm_version": "0.15.dev0"},
-    }
-    graph = tvm.ir.load_json(json.dumps(json_graph_v0_15))
-
-    # Ramp
-    assert graph.indices[0].base == 11
-    assert graph.indices[0].lanes == 12
-    # Broadcast
-    assert graph.value.value == 3
-    assert graph.value.lanes == 12
-
-
-def test_v0_17_load_store_predicate():
-    json_graph_v0_16 = {
-        "root": 1,
-        "nodes": [
-            {"type_key": ""},
-            {
-                "type_key": "tir.BufferStore",
-                "attrs": {
-                    "buffer": "2",
-                    "indices": "19",
-                    "predicate": "0",
-                    "span": "0",
-                    "value": "13",
-                },
-            },
-            {
-                "type_key": "tir.Buffer",
-                "attrs": {
-                    "axis_separators": "11",
-                    "buffer_type": "1",
-                    "data": "3",
-                    "data_alignment": "64",
-                    "dtype": "float32",
-                    "elem_offset": "12",
-                    "name": "4",
-                    "offset_factor": "1",
-                    "shape": "8",
-                    "span": "0",
-                    "strides": "10",
-                },
-            },
-            {
-                "type_key": "tir.Var",
-                "attrs": {"dtype": "handle", "name": "4", "span": "0", "type_annotation": "5"},
-            },
-            {"type_key": "runtime.String"},
-            {"type_key": "PointerType", "attrs": {"element_type": "6", "storage_scope": "7"}},
-            {"type_key": "PrimType", "attrs": {"dtype": "float32"}},
-            {"type_key": "runtime.String", "repr_str": "global"},
-            {"type_key": "Array", "data": [9]},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "8"}},
-            {"type_key": "Array"},
-            {"type_key": "Array"},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "0"}},
-            {
-                "type_key": "tir.BufferLoad",
-                "attrs": {
-                    "buffer": "2",
-                    "dtype": "float32x4",
-                    "indices": "14",
-                    "predicate": "0",
-                    "span": "0",
-                },
-            },
-            {"type_key": "Array", "data": [15]},
-            {
-                "type_key": "tir.Ramp",
-                "attrs": {
-                    "base": "16",
-                    "dtype": "int32x4",
-                    "lanes": "18",
-                    "span": "0",
-                    "stride": "17",
-                },
-            },
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "0"}},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "1"}},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "4"}},
-            {"type_key": "Array", "data": [20]},
-            {
-                "type_key": "tir.Ramp",
-                "attrs": {
-                    "base": "21",
-                    "dtype": "int32x4",
-                    "lanes": "23",
-                    "span": "0",
-                    "stride": "22",
-                },
-            },
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "4"}},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "1"}},
-            {"type_key": "IntImm", "attrs": {"dtype": "int32", "span": "0", "value": "4"}},
-        ],
-        "b64ndarrays": [],
-        "attrs": {"tvm_version": "0.16.0"},
-    }
-
-    expr = tvm.ir.load_json(json.dumps(json_graph_v0_16))
-    buffer_store = expr
-    buffer_load = buffer_store.value
-    assert not buffer_store.predicate
-    assert not buffer_load.predicate
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py
deleted file mode 100644
index c6eb7531f635..000000000000
--- a/tests/python/relay/test_json_runtime.py
+++ /dev/null
@@ -1,726 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for JSON codegen and runtime."""
-import os
-import sys
-
-import numpy as np
-
-import tvm
-import tvm.relay.op as reg
-import tvm.relay.testing
-from tvm import relay, runtime
-from tvm.contrib import utils
-from tvm.relay import transform
-from tvm.relay.backend import te_compiler
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.op.contrib.register import get_pattern_table
-
-
-def set_func_attr(func, compile_name, symbol_name):
-    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Compiler", compile_name)
-    func = func.with_attr("global_symbol", symbol_name)
-    return func
-
-
-def check_result(
-    mod, ref_mod, map_inputs, out_shape, tol=1e-5, target="llvm", device=tvm.cpu(), params=None
-):
-    if sys.platform == "win32":
-        print("Skip test on Windows for now")
-        return
-
-    # Run the reference result
-    te_compiler.get().clear()
-    with tvm.transform.PassContext(opt_level=3):
-        json, lib, param = relay.build(ref_mod, target=target, params=params)
-    rt_mod = tvm.contrib.graph_executor.create(json, lib, device)
-
-    for name, data in map_inputs.items():
-        rt_mod.set_input(name, data)
-    rt_mod.set_input(**param)
-    rt_mod.run()
-    out = tvm.nd.empty(out_shape, device=device)
-    out = rt_mod.get_output(0, out)
-    ref_result = out.numpy()
-
-    def check_vm_result():
-        te_compiler.get().clear()
-        with relay.build_config(opt_level=3):
-            exe = relay.vm.compile(mod, target=target, params=params)
-        code, lib = exe.save()
-        exe = runtime.vm.Executable.load_exec(code, lib)
-        vm = runtime.vm.VirtualMachine(exe, device)
-        out = vm.run(**map_inputs)
-        tvm.testing.assert_allclose(out.numpy(), ref_result, rtol=tol, atol=tol)
-
-    def check_graph_executor_result():
-        te_compiler.get().clear()
-        with relay.build_config(opt_level=3):
-            json, lib, param = relay.build(mod, target=target, params=params)
-        rt_mod = tvm.contrib.graph_executor.create(json, lib, device)
-
-        for name, data in map_inputs.items():
-            rt_mod.set_input(name, data)
-        rt_mod.set_input(**param)
-        rt_mod.run()
-        out = tvm.nd.empty(out_shape, device=device)
-        out = rt_mod.get_output(0, out)
-        tvm.testing.assert_allclose(out.numpy(), ref_result, rtol=tol, atol=tol)
-
-    check_vm_result()
-    check_graph_executor_result()
-
-
-def test_conv2d():
-    """Test a subgraph with a single conv2d operator."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    def conv2d_direct():
-        dtype = "float32"
-        ishape = (1, 1, 99, 12)
-        w1shape = (54, 1, 3, 3)
-
-        data0 = relay.var("data", shape=ishape, dtype=dtype)
-        weight0 = relay.var("weight", shape=w1shape, dtype=dtype)
-        out = relay.nn.conv2d(
-            data0, weight0, kernel_size=(3, 3), strides=(2, 2), padding=(1, 0, 1, 1)
-        )
-
-        func = relay.Function([data0, weight0], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight = relay.var("weight", shape=(w1shape), dtype=dtype)
-        main_f = relay.Function([data, weight], glb_var(data, weight))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data", shape=ishape, dtype=dtype)
-        weight0 = relay.var("weight", shape=w1shape, dtype=dtype)
-        out = relay.nn.conv2d(
-            data0, weight0, kernel_size=(3, 3), strides=(2, 2), padding=(1, 0, 1, 1)
-        )
-        main_f = relay.Function([data0, weight0], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-        w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-        return mod, ref_mod, {"data": i_data, "weight": w1_data}, (1, 54, 50, 6)
-
-    def group_conv2d():
-        dtype = "float32"
-        ishape = (1, 32, 14, 14)
-        w2shape = (32, 1, 3, 3)
-
-        data0 = relay.var("data", shape=(ishape), dtype=dtype)
-        weight0 = relay.var("weight", shape=(w2shape), dtype=dtype)
-        out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=32)
-
-        func = relay.Function([data0, weight0], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight = relay.var("weight", shape=(w2shape), dtype=dtype)
-        main_f = relay.Function([data, weight], glb_var(data, weight))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data", shape=(ishape), dtype=dtype)
-        weight0 = relay.var("weight", shape=(w2shape), dtype=dtype)
-        out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=32)
-        main_f = relay.Function([data0, weight0], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-        w_data = np.random.uniform(0, 1, w2shape).astype(dtype)
-
-        return mod, ref_mod, {"data": i_data, "weight": w_data}, (1, 32, 14, 14)
-
-    for mod, ref_mod, map_inputs, out_shape in [conv2d_direct(), group_conv2d()]:
-        check_result(mod, ref_mod, map_inputs, out_shape, tol=1e-5)
-
-
-def test_add():
-    """Test a subgraph with a single add operator."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    shape = (10, 10)
-
-    def gen_add():
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        data1 = relay.var("data1", shape=shape, dtype=dtype)
-        out = relay.add(data0, data1)
-
-        func = relay.Function([data0, data1], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        data1 = relay.var("data1", shape=shape, dtype=dtype)
-        main_f = relay.Function([data0, data1], glb_var(data0, data1))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        data1 = relay.var("data1", shape=shape, dtype=dtype)
-        out = relay.add(data0, data1)
-        main_f = relay.Function([data0, data1], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        return mod, ref_mod
-
-    mod, ref_mod = gen_add()
-
-    data0 = np.random.uniform(0, 1, shape).astype(dtype)
-    data1 = np.random.uniform(0, 1, shape).astype(dtype)
-    check_result(mod, ref_mod, {"data0": data0, "data1": data1}, shape, tol=1e-5)
-
-
-def test_multiply():
-    """Test a subgraph with a single add operator."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    shape = (10, 10)
-
-    def gen_multiply():
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        data1 = relay.var("data1", shape=shape, dtype=dtype)
-        out = relay.multiply(data0, data1)
-
-        func = relay.Function([data0, data1], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        data1 = relay.var("data1", shape=shape, dtype=dtype)
-        main_f = relay.Function([data0, data1], glb_var(data0, data1))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        data1 = relay.var("data1", shape=shape, dtype=dtype)
-        out = relay.multiply(data0, data1)
-        main_f = relay.Function([data0, data1], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        return mod, ref_mod
-
-    mod, ref_mod = gen_multiply()
-
-    data0 = np.random.uniform(0, 1, shape).astype(dtype)
-    data1 = np.random.uniform(0, 1, shape).astype(dtype)
-    check_result(mod, ref_mod, {"data0": data0, "data1": data1}, shape, tol=1e-5)
-
-
-def test_relu():
-    """Test a subgraph with a single ReLU operator."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    shape = (1, 32, 14, 14)
-
-    def gen_relu(shape):
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        out = relay.nn.relu(data0)
-
-        func = relay.Function([data0], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        main_f = relay.Function([data0], glb_var(data0))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        data0 = relay.var("data0", shape=shape, dtype=dtype)
-        out = relay.nn.relu(data0)
-        main_f = relay.Function([data0], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        return mod, ref_mod
-
-    def check(shape):
-        mod, ref_mod = gen_relu(shape)
-
-        data0 = np.random.uniform(-1, 1, shape).astype(dtype)
-        check_result(
-            mod,
-            ref_mod,
-            {
-                "data0": data0,
-            },
-            shape,
-            tol=1e-5,
-        )
-
-    check(shape=(1, 32, 14, 14))
-    check(shape=(1, 32))
-
-
-def test_dense():
-    """Test a subgraph with a single dense operator."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    a_shape = (1, 512)
-    b_shape = (1024, 512)
-
-    def gen_dense():
-        a = relay.var("A", shape=a_shape, dtype=dtype)
-        b = relay.var("B", shape=b_shape, dtype=dtype)
-        out = relay.nn.dense(a, b)
-
-        func = relay.Function([a, b], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        a = relay.var("A", shape=a_shape, dtype=dtype)
-        b = relay.var("B", shape=b_shape, dtype=dtype)
-        main_f = relay.Function([a, b], glb_var(a, b))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        a = relay.var("A", shape=a_shape, dtype=dtype)
-        b = relay.var("B", shape=b_shape, dtype=dtype)
-        out = relay.nn.dense(a, b)
-        main_f = relay.Function([a, b], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        return mod, ref_mod
-
-    mod, ref_mod = gen_dense()
-
-    data_a = np.random.uniform(0, 1, a_shape).astype(dtype)
-    data_b = np.random.uniform(0, 1, b_shape).astype(dtype)
-    check_result(mod, ref_mod, {"A": data_a, "B": data_b}, (1, 1024), tol=1e-5)
-
-
-def test_bn():
-    """Test a subgraph with a single batch_norm operator."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    d_shape = (1, 8)
-    c_shape = (8,)
-
-    def gen_bn():
-        data = relay.var("data", shape=d_shape)
-        gamma = relay.var("gamma", shape=c_shape)
-        beta = relay.var("beta", shape=c_shape)
-        moving_mean = relay.var("moving_mean", shape=c_shape)
-        moving_var = relay.var("moving_var", shape=c_shape)
-        bn = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var)
-        out = bn[0]
-
-        func = relay.Function([data, gamma, beta, moving_mean, moving_var], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data = relay.var("data", shape=d_shape)
-        gamma = relay.var("gamma", shape=c_shape)
-        beta = relay.var("beta", shape=c_shape)
-        moving_mean = relay.var("moving_mean", shape=c_shape)
-        moving_var = relay.var("moving_var", shape=c_shape)
-        main_f = relay.Function(
-            [data, gamma, beta, moving_mean, moving_var],
-            glb_var(data, gamma, beta, moving_mean, moving_var),
-        )
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        data = relay.var("data", shape=d_shape)
-        gamma = relay.var("gamma", shape=c_shape)
-        beta = relay.var("beta", shape=c_shape)
-        moving_mean = relay.var("moving_mean", shape=c_shape)
-        moving_var = relay.var("moving_var", shape=c_shape)
-        bn = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var)
-        out = bn[0]
-        main_f = relay.Function([data, gamma, beta, moving_mean, moving_var], out)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_f
-        ref_mod = transform.InferType()(ref_mod)
-
-        return mod, ref_mod
-
-    mod, ref_mod = gen_bn()
-
-    data = np.random.uniform(-1, 1, d_shape).astype(dtype)
-    gamma = np.random.uniform(-1, 1, c_shape).astype(dtype)
-    beta = np.random.uniform(-1, 1, c_shape).astype(dtype)
-    moving_mean = np.random.uniform(-1, 1, c_shape).astype(dtype)
-    moving_var = np.random.uniform(-1, 1, c_shape).astype(dtype)
-    check_result(
-        mod,
-        ref_mod,
-        {
-            "data": data,
-            "gamma": gamma,
-            "beta": beta,
-            "moving_mean": moving_mean,
-            "moving_var": moving_var,
-        },
-        d_shape,
-        tol=1e-5,
-    )
-
-
-def test_multiple_ops():
-    """Test a subgraph with multiple operators."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    w1shape = (32, 32, 3, 3)
-    w2shape = (64, 32, 5, 5)
-
-    def get_net():
-        data = relay.var("data", relay.TensorType(ishape, dtype))
-        w1 = relay.var("w1", relay.TensorType(w1shape, dtype))
-        w2 = relay.var("w2", relay.TensorType(w2shape, dtype))
-
-        layer = relay.nn.conv2d(data=data, weight=w1, kernel_size=(3, 3), padding=(1, 1))
-        layer = relay.nn.relu(layer)
-        layer = relay.nn.conv2d(data=layer, weight=w2, kernel_size=(5, 5), padding=(2, 2))
-        layer = relay.nn.relu(layer)
-
-        main_f = relay.Function([data, w1, w2], layer)
-        mod = tvm.IRModule()
-        mod["main"] = main_f
-        return mod
-
-    def get_partitoned_mod(mod):
-        remove_bn_pass = tvm.transform.Sequential(
-            [
-                transform.InferType(),
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.FoldScaleAxis(),
-            ]
-        )
-        byoc_pass = tvm.transform.Sequential(
-            [
-                remove_bn_pass,
-                transform.AnnotateTarget("dnnl"),
-                transform.MergeCompilerRegions(),
-                transform.PartitionGraph(),
-            ]
-        )
-
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            return byoc_pass(mod)
-
-    ref_mod = get_net()
-    mod = get_partitoned_mod(ref_mod)
-
-    data = np.random.uniform(0, 1, ishape).astype(dtype)
-    w1 = np.random.uniform(0, 1, w1shape).astype(dtype)
-    w2 = np.random.uniform(0, 1, w2shape).astype(dtype)
-    check_result(
-        mod,
-        ref_mod,
-        {
-            "data": data,
-            "w1": w1,
-            "w2": w2,
-        },
-        (1, 64, 14, 14),
-        tol=1e-5,
-    )
-
-
-def test_composite():
-    """Test DNNL patterns and there composite functions."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-
-    def conv2d_relu():
-        ishape = (1, 32, 14, 14)
-        w1shape = (32, 32, 3, 3)
-
-        # Composite function
-        in_1 = relay.var("in_1", shape=ishape, dtype=dtype)
-        in_2 = relay.var("in_2", shape=w1shape, dtype=dtype)
-        conv2d = relay.nn.conv2d(in_1, in_2, kernel_size=(3, 3), padding=(1, 1))
-        relu = relay.nn.relu(conv2d)
-        func = relay.Function([in_1, in_2], relu)
-        func = func.with_attr("Composite", "dnnl.conv2d_relu")
-        func = func.with_attr("PartitionedFromPattern", "nn.conv2d_nn.relu_")
-
-        # Partition function
-        arg_1 = relay.var("arg_1", shape=ishape, dtype=dtype)
-        arg_2 = relay.var("arg_2", shape=w1shape, dtype=dtype)
-        call = relay.Call(func, [arg_1, arg_2])
-        p_func = relay.Function([arg_1, arg_2], call)
-        p_func = set_func_attr(p_func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = p_func
-        mod = transform.InferType()(mod)
-
-        # Main function
-        data = relay.var("data", shape=ishape, dtype=dtype)
-        weight = relay.var("weight", shape=w1shape, dtype=dtype)
-        main_func = relay.Function([data, weight], glb_var(data, weight))
-        mod["main"] = main_func
-        mod = transform.InferType()(mod)
-
-        # Reference module
-        data = relay.var("data", shape=ishape, dtype=dtype)
-        weight = relay.var("weight", shape=w1shape, dtype=dtype)
-        conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1))
-        relu = relay.nn.relu(conv2d)
-        main_func = relay.Function([data, weight], relu)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_func
-        ref_mod = transform.InferType()(ref_mod)
-
-        i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-        w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-        return mod, ref_mod, {"data": i_data, "weight": w1_data}, (1, 32, 14, 14)
-
-    def conv2d_bias_relu():
-        ishape = (1, 32, 14, 14)
-        w1shape = (32, 32, 3, 3)
-        bshape = (32, 1, 1)
-
-        # Composite function
-        in_1 = relay.var("in_1", shape=ishape, dtype=dtype)
-        in_2 = relay.var("in_2", shape=w1shape, dtype=dtype)
-        in_3 = relay.var("in_3", shape=bshape, dtype=dtype)
-        conv2d = relay.nn.conv2d(in_1, in_2, kernel_size=(3, 3), padding=(1, 1))
-        add = relay.add(conv2d, in_3)
-        relu = relay.nn.relu(add)
-        func = relay.Function([in_1, in_2, in_3], relu)
-        func = func.with_attr("Composite", "dnnl.conv2d_bias_relu")
-        func = func.with_attr("PartitionedFromPattern", "nn.conv2d_add_nn.relu_")
-
-        # Partition function
-        arg_1 = relay.var("arg_1", shape=ishape, dtype=dtype)
-        arg_2 = relay.var("arg_2", shape=w1shape, dtype=dtype)
-        arg_3 = relay.var("arg_3", shape=bshape, dtype=dtype)
-        call = relay.Call(func, [arg_1, arg_2, arg_3])
-        p_func = relay.Function([arg_1, arg_2, arg_3], call)
-        p_func = set_func_attr(p_func, "dnnl", "tvmgen_default_dnnl_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = p_func
-        mod = transform.InferType()(mod)
-
-        # Main function
-        data = relay.var("data", shape=ishape, dtype=dtype)
-        weight = relay.var("weight", shape=w1shape, dtype=dtype)
-        bias = relay.var("bias", shape=bshape, dtype=dtype)
-        main_func = relay.Function([data, weight, bias], glb_var(data, weight, bias))
-        mod["main"] = main_func
-        mod = transform.InferType()(mod)
-
-        # Reference module
-        data = relay.var("data", shape=ishape, dtype=dtype)
-        weight = relay.var("weight", shape=w1shape, dtype=dtype)
-        bias = relay.var("bias", shape=bshape, dtype=dtype)
-        conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1))
-        add = relay.add(conv2d, bias)
-        relu = relay.nn.relu(add)
-        main_func = relay.Function([data, weight, bias], relu)
-        ref_mod = tvm.IRModule()
-        ref_mod["main"] = main_func
-        ref_mod = transform.InferType()(ref_mod)
-
-        i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-        w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-        b_data = np.random.uniform(0, 1, bshape).astype(dtype)
-
-        return mod, ref_mod, {"data": i_data, "weight": w1_data, "bias": b_data}, (1, 32, 14, 14)
-
-    for mod, ref_mod, input_maps, out_shape in [conv2d_relu(), conv2d_bias_relu()]:
-        check_result(mod, ref_mod, input_maps, out_shape, tol=1e-5)
-
-
-def test_constant():
-    """Test the subgraph with (var, const, ...) arguments."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    wshape = (32, 32, 3, 3)
-
-    data = relay.var("data", shape=ishape, dtype=dtype)
-    weight = relay.var("weight", shape=wshape, dtype=dtype)
-    bn_gamma = relay.var("bn_gamma")
-    bn_beta = relay.var("bn_beta")
-    bn_mmean = relay.var("bn_mean")
-    bn_mvar = relay.var("bn_var")
-
-    layer = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3, 3), padding=(1, 1))
-    bn_output = relay.nn.batch_norm(layer, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-    out = bn_output[0]
-    out = relay.nn.relu(out)
-
-    func = relay.Function(relay.analysis.free_vars(out), out)
-    ref_mod, params = tvm.relay.testing.create_workload(func)
-    ref_mod["main"] = bind_params_by_name(ref_mod["main"], params)
-
-    remove_bn_pass = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.SimplifyInference(),
-            transform.FoldConstant(),
-            transform.FoldScaleAxis(),
-        ]
-    )
-
-    dnnl_patterns = get_pattern_table("dnnl")
-    composite_partition = tvm.transform.Sequential(
-        [
-            transform.MergeComposite(dnnl_patterns),
-            transform.AnnotateTarget("dnnl"),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        ref_mod = remove_bn_pass(ref_mod)
-        mod = composite_partition(ref_mod)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    check_result(mod, ref_mod, {"data": i_data}, (1, 32, 14, 14), tol=1e-5)
-
-
-def test_partial_constant():
-    """Test the subgraph with (const, var, const, var) arguments."""
-    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (10, 10)
-
-    in_1 = relay.var("in_1", shape=ishape, dtype=dtype)
-    in_2 = relay.var("in_2", shape=ishape, dtype=dtype)
-    in_3 = relay.var("in_3", shape=ishape, dtype=dtype)
-    in_4 = relay.var("in_4", shape=ishape, dtype=dtype)
-
-    add1 = relay.add(in_1, in_2)
-    add2 = relay.add(add1, in_3)
-    add3 = relay.add(add2, in_3)
-    add4 = relay.add(add3, in_3)
-
-    func = relay.Function([in_1, in_2, in_3, in_4], add4)
-    ref_mod = tvm.IRModule.from_expr(func)
-    ref_mod = relay.transform.InferType()(ref_mod)
-
-    data1 = np.random.uniform(0, 1, ishape).astype(dtype)
-    data3 = np.random.uniform(0, 1, ishape).astype(dtype)
-
-    params = {
-        "in_1": tvm.nd.array(data1, device=tvm.cpu(0)),
-        "in_3": tvm.nd.array(data3, device=tvm.cpu(0)),
-    }
-    ref_mod["main"] = bind_params_by_name(ref_mod["main"], params)
-
-    opt_pass = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.SimplifyInference(),
-            transform.FoldConstant(),
-            transform.FoldScaleAxis(),
-            transform.AnnotateTarget("dnnl"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        mod = opt_pass(ref_mod)
-
-    data2 = np.random.uniform(0, 1, ishape).astype(dtype)
-    data4 = np.random.uniform(0, 1, ishape).astype(dtype)
-    check_result(mod, ref_mod, {"in_2": data2, "in_4": data4}, (10, 10), tol=1e-5)
-
-
-if __name__ == "__main__":
-    test_conv2d()
-    test_add()
-    test_multiply()
-    test_relu()
-    test_dense()
-    test_bn()
-    test_multiple_ops()
-    test_composite()
-    test_constant()
-    test_partial_constant()
diff --git a/tests/python/relay/test_layer_count.py b/tests/python/relay/test_layer_count.py
deleted file mode 100644
index f680bb2725f2..000000000000
--- a/tests/python/relay/test_layer_count.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from tvm.relay.testing import resnet
-from tvm.relay.analysis import count_layers
-
-
-def test_layer_count():
-    def verify(num_layers):
-        # Load a resnet with a known number of layers.
-        mod, _ = resnet.get_workload(num_layers=num_layers)
-        # Count the number of conv and dense layers.
-        count = count_layers(mod, valid_ops=["nn.conv2d", "nn.dense"])
-        assert count == num_layers
-
-    verify(18)
-    verify(50)
-
-
-if __name__ == "__main__":
-    test_layer_count()
diff --git a/tests/python/relay/test_link_params.py b/tests/python/relay/test_link_params.py
deleted file mode 100644
index 35ca74d6f8e7..000000000000
--- a/tests/python/relay/test_link_params.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import collections
-import ctypes
-import json
-import os
-import re
-from contextlib import redirect_stderr
-from io import StringIO
-
-import numpy as np
-import tvm
-import tvm.relay
-import tvm.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm.contrib import utils
-from tvm.relay.backend import Executor, Runtime
-
-INPUT_SHAPE = (1, 3, 16, 16)
-
-KERNEL_SHAPE = (3, 3, 3, 3)
-
-
-# The data types that are linkable.
-linkable_dtype = tvm.testing.parameter(
-    *(
-        [f"uint{b}" for b in (8, 16, 32, 64)]
-        + [f"int{b}" for b in (8, 16, 32, 64)]
-        + ["float32", "float64"]
-    )
-)
-
-
-def dtype_info(dtype):
-    """Lookup numpy type info for the given string dtype (of linkable_dtype params above)."""
-    if "int" in dtype:
-        return np.iinfo(getattr(np, dtype))
-    else:
-        return np.finfo(getattr(np, dtype))
-
-
-# Note: for debugging, set this to an integer (i.e. 1.0). Then all "random" tensors will become
-# predictable
-RANDOM_TENSOR_START = None
-
-
-def _make_random_tensor(dtype, shape):
-    """Create a random test tensor with given shape and dtype."""
-    global RAND_SEED
-    if RANDOM_TENSOR_START is not None:
-        to_return = np.arange(
-            RANDOM_TENSOR_START, RANDOM_TENSOR_START + np.prod(shape), dtype=dtype
-        ).reshape(shape)
-        RAND_SEED += np.prod(shape)
-        return to_return
-
-    dinfo = dtype_info(dtype)
-    if "int" in dtype:
-        return np.random.randint(dinfo.min, dinfo.max, shape, dtype=dtype)
-    else:
-        to_return = np.random.uniform(0, dinfo.max, shape).astype(dtype)
-        np.reshape(to_return, np.prod(shape))[::2] *= -1
-        return to_return
-
-
-def _lookup_sid(graph, name):
-    """Lookup the storage id of a named parameter.
-
-    Arguments
-    ---------
-    graph : dict
-        Parsed JSON graph.
-
-    name : str
-        Name of the tensor parameter to lookup.
-
-    Returns
-    -------
-    int :
-        The storage_id of the parameter.
-    """
-    num_outputs_seen = 0
-    for i, n in enumerate(graph["nodes"]):
-        if n["name"] == name:
-            print("sid", name, graph["attrs"]["storage_id"][1], num_outputs_seen)
-            return graph["attrs"]["storage_id"][1][num_outputs_seen]
-        else:
-            if "attrs" in n and "num_outputs" in n["attrs"]:
-                num_outputs_seen += int(n["attrs"]["num_outputs"])
-            else:
-                num_outputs_seen += 1
-
-    raise KeyError(f"no such param: {name}")
-
-
-def _get_ctypes_dtype(dt):
-    """Return a ctypes c_* datatype given a string data type."""
-    if "int" in dt:
-        return getattr(ctypes, f"c_{dt}")
-    elif dt == "float32":
-        return ctypes.c_float
-    elif dt == "float64":
-        return ctypes.c_double
-    else:
-        assert False, f"unknown dtype: {dt}"
-
-
-def _verify_linked_param(dtype, lib, mod, graph, name):
-    """Directly read memory from the linked library to verify the linked parameter is correct."""
-    sid = _lookup_sid(graph, name)
-    # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend),
-    # a GraphExecutorFactory module is created instead of the module itself.
-    param_ptr = mod.get_function("_lookup_linked_param", True)(sid)
-    gen_param = lib.params[name]
-    arr_data = (_get_ctypes_dtype(dtype) * np.prod(gen_param.shape)).from_address(param_ptr.value)
-    arr = np.ndarray(shape=gen_param.shape, dtype=gen_param.dtype, buffer=arr_data, order="C")
-    if "int" in gen_param.dtype:
-        np.testing.assert_equal(gen_param.numpy(), arr)
-    else:
-        np.testing.assert_allclose(gen_param.numpy(), arr)
-    return dtype == gen_param.dtype
-
-
-def _make_mod_and_params(dtype):
-    """Create a Relay module and parameters to test the given datatype."""
-    param_decls = collections.OrderedDict()
-    param_init = {}
-
-    def _add_decl(name, dtype):
-        param_decls[name] = f"%{name} : Tensor[{KERNEL_SHAPE}, {dtype}]"
-        param_init[name] = _make_random_tensor(dtype, KERNEL_SHAPE)
-
-    # Add several parameters so that the number of parameters
-    _add_decl(f"{dtype}_a", dtype)
-    _add_decl(f"{dtype}_b", dtype)
-
-    mod_lines = [
-        '#[version = "0.0.5"]',
-        f"def @main(%rand_input : Tensor[{INPUT_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } )  {{",
-        # This program ensures that GraphPlanMemory alternates between the same two storage IDs for a
-        # while. In doing this, it ensures that param %{dtype}_b will be placed into the graph at an
-        # index unequal to its storage_id. This ensures that GraphExecutorCodegen encodes the storage_id
-        # and not the parameter index into the graph.
-        (
-            f'    %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", '
-            f'kernel_size=[3, 3], out_dtype="{dtype}");'
-        ),
-        (
-            f'    %1 = nn.conv2d(%0, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", '
-            f'kernel_size=[3, 3], out_dtype="{dtype}");'
-        ),
-        (
-            f'    %2 = nn.conv2d(%1, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", '
-            f'kernel_size=[3, 3], out_dtype="{dtype}");'
-        ),
-        (
-            f'    %3 = nn.conv2d(%2, %{dtype}_b, data_layout="NCHW", kernel_layout="OIHW", '
-            f'kernel_size=[3, 3], out_dtype="{dtype}");'
-        ),
-        "    %3",
-        "}",
-    ]
-
-    mod = tvm.relay.fromtext("\n".join(mod_lines))
-    return mod, param_init
-
-
-@tvm.testing.requires_llvm
-def test_llvm_link_params(linkable_dtype):
-    ir_mod, param_init = _make_mod_and_params(linkable_dtype)
-    rand_input = _make_random_tensor(linkable_dtype, INPUT_SHAPE)
-    main_func = ir_mod["main"]
-    target = "llvm"
-    runtime = Runtime("crt", {"system-lib": True})
-    executor = Executor("graph", {"link-params": True})
-    with tvm.transform.PassContext(opt_level=3):
-        lib = tvm.relay.build(ir_mod, target, runtime=runtime, executor=executor, params=param_init)
-
-        # NOTE: Need to export_library() and load_library() to link all the Module(llvm, ...)
-        # against one another.
-        temp_dir = utils.TempDirectory()
-        export_file = temp_dir / "lib.so"
-        lib.lib.export_library(export_file)
-        mod = tvm.runtime.load_module(export_file)
-        assert len(lib.params.keys()) == 0  # NOTE: params became tir.constants
-        assert mod.get_function("TVMSystemLibEntryPoint") != None
-
-        graph = json.loads(lib.graph_json)
-        for p in lib.params:
-            _verify_linked_param(linkable_dtype, lib, mod, graph, p) or found_one
-
-        # Wrap in function to explicitly deallocate the runtime.
-        def _run_linked(lib, mod):
-            graph_json, _, _ = lib
-            graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0))
-            graph_rt.set_input("rand_input", rand_input)  # NOTE: params not required.
-            graph_rt.run()
-            return graph_rt.get_output(0)
-
-        linked_output = _run_linked(lib, mod)
-
-    runtime = Runtime("cpp", {"system-lib": True})
-    with tvm.transform.PassContext(opt_level=3):
-        lib = tvm.relay.build(ir_mod, "llvm", runtime=runtime, params=param_init)
-
-        def _run_unlinked(lib):
-            graph_json, mod, lowered_params = lib
-            graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0))
-            graph_rt.set_input("rand_input", rand_input, **lowered_params)
-            graph_rt.run()
-            return graph_rt.get_output(0)
-
-        unlinked_output = _run_unlinked(lib)
-
-    if "int" in linkable_dtype:
-        np.testing.assert_equal(unlinked_output.numpy(), linked_output.numpy())
-    else:
-        np.testing.assert_allclose(unlinked_output.numpy(), linked_output.numpy())
-
-
-def _get_c_datatype(dtype):
-    """Translate LINKABLE_DTYPES element to c datatype."""
-    if "int" in dtype:
-        return f"{dtype}_t"
-    elif dtype == "float32":
-        return "float"
-    elif dtype == "float64":
-        return "double"
-    else:
-        assert False, f"unknown dtype {dtype}"
-
-
-HEX_NUM_RE = re.compile(r"[+\-]?(?:(?:0x[0-9A-Fa-f.p+-]+)|(?:INFINITY)|(?:NAN))")
-
-
-def test_c_link_params(linkable_dtype):
-    temp_dir = utils.tempdir()
-    mod, param_init = _make_mod_and_params(linkable_dtype)
-    rand_input = _make_random_tensor(linkable_dtype, INPUT_SHAPE)
-    main_func = mod["main"]
-    target = "c"
-    executor = Executor("graph", {"link-params": True})
-    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
-        lib = tvm.relay.build(mod, target, executor=executor, params=param_init)
-        assert len(lib.params.keys()) == 0  # NOTE: params became tir.constants
-
-        src = lib.lib.get_source()
-        lib.lib.save(temp_dir.relpath("test.c"), "c")
-        c_dtype = _get_c_datatype(linkable_dtype)
-        src_lines = src.split("\n")
-        param = param_init[f"{linkable_dtype}_a"].reshape(np.prod(KERNEL_SHAPE))
-        param_def = rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), aligned\(16\)\)\) [a-zA-Z_0-9]*constant_\d+\[{np.prod(param.shape)}\] = {{$"
-
-        for i, line in enumerate(src_lines):
-            if re.match(param_def, line):
-                i += 1
-                break
-        else:
-            assert False, f'did not find parameter definition "{param_def}":\n{src}'
-
-        cursor = 0
-        width = dtype_info(linkable_dtype).bits // 4 + 2
-        if linkable_dtype.startswith("int"):
-            width += 1  # Account for sign
-
-        while "};" not in src_lines[i]:
-            for match in HEX_NUM_RE.finditer(src_lines[i]):
-                cursor += 1
-            i += 1
-
-        assert cursor == np.prod(param.shape)
-
-        # Need a unique name per library to avoid dlopen caching the lib load.
-        lib_path = temp_dir.relpath(f"test-{linkable_dtype}-linked.so")
-        lib["remove_params"]().export_library(lib_path)
-        lib_mod = tvm.runtime.load_module(lib_path)
-
-        #            lib_mod = lib_factory['default']()
-        graph = json.loads(lib.graph_json)
-        for p in lib.params:
-            _verify_linked_param(linkable_dtype, lib, lib_mod, graph, p)
-
-        # Wrap in function to explicitly deallocate the runtime.
-        def _run_linked(lib_mod):
-            graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0)))
-            graph_rt.set_input("rand_input", rand_input)  # NOTE: params not required.
-            graph_rt.run()
-
-            return graph_rt.get_output(0)
-
-        linked_output = _run_linked(lib_mod)
-
-    linked_params = lib.params
-    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
-        lib = tvm.relay.build(mod, "c", params=param_init)
-        _, _, params = lib
-        # Need a unique name per library to avoid dlopen caching the lib load.
-        lib_path = temp_dir.relpath(f"test-{linkable_dtype}-unlinked.so")
-        lib.export_library(lib_path)
-        lib_mod = tvm.runtime.load_module(lib_path)
-
-        def _run_unlinked(lib_mod):
-            graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0)))
-            graph_rt.set_input("rand_input", rand_input, **params)
-            graph_rt.run()
-            return graph_rt.get_output(0)
-
-        unlinked_output = _run_unlinked(lib_mod)
-
-    if "int" in linkable_dtype:
-        np.testing.assert_equal(unlinked_output.numpy(), linked_output.numpy())
-    else:
-        np.testing.assert_allclose(unlinked_output.numpy(), linked_output.numpy())
-
-
-def test_tir_link_params():
-    def get_dense(data_shape, weight_shape):
-        data = relay.var("data", shape=data_shape, dtype="float32")
-        weight = relay.var("weight", shape=weight_shape, dtype="float32")
-        dense = relay.nn.dense(data, weight)
-        return relay.Function([data, weight], dense)
-
-    def get_ref_dense(data_np, weight_np):
-        return np.dot(data_np, np.transpose(weight_np))
-
-    def schedule_dense(sch):
-        dense = sch.get_block("T_matmul_NT")
-        _y, _x, _k = sch.get_loops(dense)
-
-    M, N, K = 128, 128, 128
-    data_shape = (M, K)
-    weight_shape = (N, K)
-    relay_mod = tvm.IRModule.from_expr(get_dense(data_shape, weight_shape))
-    relay_mod = relay.transform.InferType()(relay_mod)
-    data_np = np.random.randn(*data_shape).astype("float32")
-    weight_np = np.random.randn(*weight_shape).astype("float32")
-    target = "llvm"
-    params = {"weight": weight_np}
-
-    def schedule_fn(sch):
-        if "nn_dense" in sch.mod.attrs["task_name"]:
-            schedule_dense(sch)
-            return True
-        return False
-
-    with StringIO() as stderr_buf, redirect_stderr(stderr_buf):
-        with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            executor = Executor("graph", {"link-params": True})
-            lib = relay.build(relay_mod, target=target, executor=executor)
-
-        # Workload look up should succeed. This does not work when the test is invoked from pytest.
-        assert not "Cannot find workload" in stderr_buf.getvalue()
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    runtime.set_input(**params)
-    runtime.set_input("data", data_np)
-    runtime.run()
-    out = runtime.get_output(0).numpy()
-    ref = get_ref_dense(data_np, weight_np)
-    tvm.testing.assert_allclose(out, ref, atol=1e-4, rtol=1e-4)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_memory_passes.py b/tests/python/relay/test_memory_passes.py
deleted file mode 100644
index bed17dbbd830..000000000000
--- a/tests/python/relay/test_memory_passes.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-
-
-def check_memory_plan(func, check_fn):
-    # Build Module
-    mod = tvm.IRModule().from_expr(func)
-
-    # Convert arguments.
-    args = []
-    for param in func.params:
-        param = param.type_annotation
-        sh = [int(sh) for sh in param.shape]
-        data = np.random.rand(*sh).astype(param.dtype)
-        args.append(tvm.nd.array(data))
-
-    # TODO(mbs): Why does the executor need to be shared? Seems wrong.
-    ex = relay.create_executor("vm", mod)
-
-    # Compute without memory planning.
-    no_plan_result = ex.evaluate()(*args)
-
-    # Compute with memory planning.
-    with tvm.transform.PassContext(opt_level=1, disabled_pass=["MemoryPlan"]):
-        plan_result = ex.evaluate()(*args)
-
-    # Compute Python result.
-    py_res = check_fn(*[arg.numpy() for arg in args])
-
-    # First check that the two VM results agree.
-    np.testing.assert_allclose(no_plan_result.numpy(), plan_result.numpy())
-
-    # Finally check that the results match the Python result.
-    np.testing.assert_allclose(plan_result.numpy(), py_res)
-
-
-def storage_type(mod):
-    return relay.TypeCall(mod.get_global_type_var("Storage"), [])
-
-
-def test_tyck_alloc_storage():
-    mod = tvm.IRModule()
-    mod.import_from_std("core.rly")
-
-
-def test_tyck_alloc_tensor():
-    mod = tvm.IRModule()
-    mod.import_from_std("core.rly")
-    sto = relay.Var("x", storage_type(mod))
-    sh = relay.const(np.array([1, 2]), dtype="int64")
-    at = relay.op.memory.alloc_tensor(sto, relay.const(0, dtype="int64"), sh)
-    mod["main"] = relay.Function([sto], at)
-    relay.transform.InferType()(mod)
-
-
-def check_add(x):
-    return x + x
-
-
-def test_add():
-    x = relay.var("x", shape=(2,))
-    z = x + x
-    func = relay.Function(
-        [
-            x,
-        ],
-        z,
-    )
-    check_memory_plan(func, check_add)
-
-
-def check_add_sub(x, y):
-    z = x + x
-    return z - y
-
-
-def test_add_sub():
-    x = relay.var("x", shape=(10,))
-    y = relay.var("y", shape=(10,))
-    z = x + x
-    z = z - y
-    func = relay.Function([x, y], z)
-    check_memory_plan(func, check_add_sub)
-
-
-def check_no_fuse(x, y, w):
-    z = x + y
-    return np.matmul(z, np.transpose(w))
-
-
-def test_no_fuse():
-    x = relay.var("x", shape=(5, 1))
-    y = relay.var("y", shape=(5, 1))
-    w = relay.var("w", shape=(5, 1))
-    z = x + y
-    out = relay.op.nn.dense(z, w)
-    func = relay.Function([x, y, w], out)
-    check_memory_plan(func, check_no_fuse)
-
-
-if __name__ == "__main__":
-    test_tyck_alloc_tensor()
-    test_add()
-    test_add_sub()
diff --git a/tests/python/relay/test_name_mangling.py b/tests/python/relay/test_name_mangling.py
deleted file mode 100644
index 46195d1fa215..000000000000
--- a/tests/python/relay/test_name_mangling.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-
-import tvm
-import tvm.testing
-import tvm.relay as relay
-import tvm.relay.backend.utils as utils
-import pytest
-
-
-def test_mangle_mod_name():
-    assert utils.mangle_module_name("default") == "tvmgen_default"
-    assert utils.mangle_module_name("ccompiler") == "tvmgen_ccompiler"
-    assert utils.mangle_module_name("1234"), "tvmgen_1234"
-    assert utils.mangle_module_name(""), "tvmgen"
-    assert utils.mangle_module_name(None), "tvmgen"
-
-    with pytest.raises(ValueError):
-        utils.mangle_module_name("\u018e")
-        utils.mangle_module_name("\xf1")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_name_supply.py b/tests/python/relay/test_name_supply.py
deleted file mode 100644
index f48fe0a47485..000000000000
--- a/tests/python/relay/test_name_supply.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-
-from tvm import relay
-from tvm.ir import GlobalVar, structural_equal, assert_structural_equal
-from tvm.ir.supply import NameSupply
-from tvm.ir.supply import GlobalVarSupply
-
-
-def test_name_supply():
-    name_supply = NameSupply("prefix")
-    name_supply.reserve_name("test")
-
-    assert name_supply.contains_name("test")
-    assert name_supply.fresh_name("test") == "prefix_test_1"
-    assert name_supply.contains_name("test_1")
-    assert not name_supply.contains_name("test_1", False)
-    assert not name_supply.contains_name("test_2")
-
-
-def test_global_var_supply_from_none():
-    var_supply = GlobalVarSupply()
-    global_var = GlobalVar("test")
-    var_supply.reserve_global(global_var)
-
-    assert_structural_equal(var_supply.unique_global_for("test"), global_var)
-    assert not structural_equal(var_supply.fresh_global("test"), global_var)
-
-
-def test_global_var_supply_from_name_supply():
-    name_supply = NameSupply("prefix")
-    var_supply = GlobalVarSupply(name_supply)
-    global_var = GlobalVar("test")
-    var_supply.reserve_global(global_var)
-
-    assert_structural_equal(var_supply.unique_global_for("test", False), global_var)
-    assert not structural_equal(var_supply.unique_global_for("test"), global_var)
-
-
-def test_global_var_supply_from_ir_mod():
-    x = relay.var("x")
-    y = relay.var("y")
-    mod = tvm.IRModule()
-    global_var = GlobalVar("test")
-    mod[global_var] = relay.Function([x, y], relay.add(x, y))
-    var_supply = GlobalVarSupply(mod)
-
-    second_global_var = var_supply.fresh_global("test", False)
-
-    assert_structural_equal(var_supply.unique_global_for("test", False), global_var)
-    assert not structural_equal(var_supply.unique_global_for("test"), global_var)
-    assert not structural_equal(second_global_var, global_var)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_name_transforms.py b/tests/python/relay/test_name_transforms.py
deleted file mode 100644
index 72976dc19c21..000000000000
--- a/tests/python/relay/test_name_transforms.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License" you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-
-from tvm import TVMError
-from tvm.relay.backend.name_transforms import (
-    to_c_function_style,
-    to_c_variable_style,
-    to_c_constant_style,
-    prefix_name,
-    prefix_generated_name,
-)
-from tvm.runtime.name_transforms import sanitize_name
-
-
-def test_to_c_function_style():
-    assert to_c_function_style("TVM_Woof") == "TVMWoof"
-    assert to_c_function_style("TVM_woof") == "TVMWoof"
-    assert to_c_function_style("TVM_woof_woof") == "TVMWoofWoof"
-    assert to_c_function_style("TVMGen_woof_woof") == "TVMGenWoofWoof"
-
-    # Incorrect prefix
-    with pytest.raises(TVMError, match="Function not TVM prefixed"):
-        to_c_function_style("Cake_Bakery")
-    with pytest.raises(TVMError, match="Function name is empty"):
-        to_c_function_style("")
-
-
-def test_to_c_variable_style():
-    assert to_c_variable_style("TVM_Woof") == "tvm_woof"
-    assert to_c_variable_style("TVM_woof") == "tvm_woof"
-    assert to_c_variable_style("TVM_woof_Woof") == "tvm_woof_woof"
-
-    # Incorrect prefix
-    with pytest.raises(TVMError, match="Variable not TVM prefixed"):
-        to_c_variable_style("Cake_Bakery")
-    with pytest.raises(TVMError, match="Variable name is empty"):
-        to_c_variable_style("")
-
-
-def test_to_c_constant_style():
-    assert to_c_constant_style("TVM_Woof") == "TVM_WOOF"
-    assert to_c_constant_style("TVM_woof") == "TVM_WOOF"
-    assert to_c_constant_style("TVM_woof_Woof") == "TVM_WOOF_WOOF"
-
-    with pytest.raises(TVMError, match="Constant not TVM prefixed"):
-        to_c_constant_style("Cake_Bakery")
-    with pytest.raises(TVMError):
-        to_c_constant_style("")
-
-
-def test_prefix_name():
-    assert prefix_name("Woof") == "TVM_Woof"
-    assert prefix_name(["Woof"]) == "TVM_Woof"
-    assert prefix_name(["woof"]) == "TVM_woof"
-    assert prefix_name(["woof", "moo"]) == "TVM_woof_moo"
-
-    with pytest.raises(TVMError, match="Name is empty"):
-        prefix_name("")
-    with pytest.raises(TVMError, match="Name segments empty"):
-        prefix_name([])
-    with pytest.raises(TVMError, match="Name segment is empty"):
-        prefix_name([""])
-
-
-def test_prefix_generated_name():
-    assert prefix_generated_name("Woof") == "TVMGen_Woof"
-    assert prefix_generated_name(["Woof"]) == "TVMGen_Woof"
-    assert prefix_generated_name(["Woof"]) == "TVMGen_Woof"
-    assert prefix_generated_name(["woof"]) == "TVMGen_woof"
-    assert prefix_generated_name(["woof", "moo"]) == "TVMGen_woof_moo"
-
-    with pytest.raises(TVMError, match="Name is empty"):
-        prefix_generated_name("")
-    with pytest.raises(TVMError, match="Name segments empty"):
-        prefix_generated_name([])
-    with pytest.raises(TVMError, match="Name segment is empty"):
-        prefix_generated_name([""])
-
-
-def test_sanitize_name():
-    assert sanitize_name("+_+ ") == "____"
-    assert sanitize_name("input+") == "input_"
-    assert sanitize_name("input-") == "input_"
-    assert sanitize_name("input++") == "input__"
-    assert sanitize_name("woof:1") == "woof_1"
-
-    with pytest.raises(TVMError, match="Name is empty"):
-        sanitize_name("")
-
-
-def test_combined_logic():
-    assert (
-        to_c_function_style(prefix_name(["Device", "target", "Invoke"])) == "TVMDeviceTargetInvoke"
-    )
-    assert to_c_function_style(prefix_generated_name(["model", "Run"])) == "TVMGenModelRun"
-    assert to_c_variable_style(prefix_name(["Device", "target", "t"])) == "tvm_device_target_t"
-    assert (
-        to_c_variable_style(prefix_generated_name(["model", "Devices"])) == "tvmgen_model_devices"
-    )
diff --git a/tests/python/relay/test_op_fast_math.py b/tests/python/relay/test_op_fast_math.py
deleted file mode 100644
index 20ccefed8513..000000000000
--- a/tests/python/relay/test_op_fast_math.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import scipy
-from scipy import special
-import tvm
-import tvm.testing
-import tvm.relay as relay
-from tvm import topi
-from tvm import te
-from tvm.contrib import graph_executor
-from tvm.topi import testing
-
-
-@tvm.testing.parametrize_targets("llvm", "cuda")
-def test_fastmath(target, dev):
-    def test_apply(relay_op, name, f_numpy, low, high, step, dtype="float32"):
-        a_np = np.arange(low, high, step).astype(dtype).reshape((1, -1))
-        b_np = f_numpy(a_np)
-
-        x = relay.var("x", shape=a_np.shape, dtype="float32")
-        y = relay_op(x)
-        func = relay.Function([x], y)
-        mod = tvm.IRModule.from_expr(func)
-
-        with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
-            graph, lib, params = relay.build(mod, target=target, params=None)
-
-        # Check that the op related to fast math have been convered to function in lib
-        func_name = "tvmgen_default_fused_" + name
-        # When there're multiple targets in tvm.testing.parametrize_targets, the function
-        # built will have a "_1" in function name
-        assert func_name in graph
-
-        m = graph_executor.create(graph, lib, dev)
-        # Set inputs
-        m.set_input("x", tvm.nd.array(a_np, dev))
-        m.set_input(**params)
-        # Execute
-        m.run()
-        # Get outputs
-        tvm_output = m.get_output(0)
-        tvm.testing.assert_allclose(tvm_output.numpy(), b_np, rtol=1e-5, atol=1e-5)
-
-    test_apply(relay.exp, "fast_exp", np.exp, low=-88, high=88, step=0.01)
-    test_apply(relay.erf, "fast_erf", scipy.special.erf, low=-10, high=10, step=0.01)
-    test_apply(relay.tanh, "fast_tanh", np.tanh, low=-10, high=10, step=0.01)
-    test_apply(
-        relay.nn.fast_softmax,
-        "nn_fast_softmax",
-        tvm.topi.testing.softmax_python,
-        low=-10,
-        high=10,
-        step=0.01,
-    )
-
-
-if __name__ == "__main__":
-    test_fastmath()
diff --git a/tests/python/relay/test_op_floordiv.py b/tests/python/relay/test_op_floordiv.py
deleted file mode 100644
index 8828a0155c89..000000000000
--- a/tests/python/relay/test_op_floordiv.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-import tvm.testing
-from tvm.script import tir
-
-
-def test_floor_div_op():
-    target = "llvm"
-    dev = tvm.device(target)
-    N = 100
-    divisor = 5
-
-    @tir.prim_func
-    def func_64(
-        A: tir.Buffer((N + 100, 2), "int64"),
-        B: tir.Buffer((N), "int64"),
-        C: tir.Buffer((N), "int64"),
-    ):
-        for i in tir.serial(N):
-            with tir.block("A"):
-                v_i = tir.axis.spatial(N, i)
-                A[v_i, 0] = tir.floordiv(C[v_i] - tir.max_value("int64"), divisor)
-                A[v_i, 1] = tir.floormod(C[v_i] - tir.max_value("int64"), divisor)
-                A[v_i + 100, 0] = tir.floordiv(B[v_i], divisor)
-                A[v_i + 100, 1] = tir.floormod(B[v_i], divisor)
-
-    @tir.prim_func
-    def func_32(
-        A: tir.Buffer((N + 100, 2), "int32"),
-        B: tir.Buffer((N), "int32"),
-        C: tir.Buffer((N), "int32"),
-    ):
-        for i in tir.serial(N):
-            with tir.block("A"):
-                v_i = tir.axis.spatial(N, i)
-                A[v_i, 0] = tir.floordiv(C[v_i] - tir.max_value("int32"), divisor)
-                A[v_i, 1] = tir.floormod(C[v_i] - tir.max_value("int32"), divisor)
-                A[v_i + 100, 0] = tir.floordiv(B[v_i], divisor)
-                A[v_i + 100, 1] = tir.floormod(B[v_i], divisor)
-
-    @tir.prim_func
-    def func_16(
-        A: tir.Buffer((N + 100, 2), "int16"),
-        B: tir.Buffer((N), "int16"),
-        C: tir.Buffer((N), "int16"),
-    ):
-        for i in tir.serial(N):
-            with tir.block("A"):
-                v_i = tir.axis.spatial(N, i)
-                A[v_i, 0] = tir.floordiv(C[v_i] - tir.max_value("int16"), divisor)
-                A[v_i, 1] = tir.floormod(C[v_i] - tir.max_value("int16"), divisor)
-                A[v_i + 100, 0] = tir.floordiv(B[v_i], divisor)
-                A[v_i + 100, 1] = tir.floormod(B[v_i], divisor)
-
-    @tir.prim_func
-    def func_8(
-        A: tir.Buffer((N + 100, 2), "int8"), B: tir.Buffer((N), "int8"), C: tir.Buffer((N), "int8")
-    ):
-        for i in tir.serial(N):
-            with tir.block("A"):
-                v_i = tir.axis.spatial(N, i)
-                A[v_i, 0] = tir.floordiv(C[v_i] - tir.max_value("int8"), divisor)
-                A[v_i, 1] = tir.floormod(C[v_i] - tir.max_value("int8"), divisor)
-                A[v_i + 100, 0] = tir.floordiv(B[v_i], divisor)
-                A[v_i + 100, 1] = tir.floormod(B[v_i], divisor)
-
-    for opfunc, type in [
-        (func_8, "int8"),
-        (func_16, "int16"),
-        (func_32, "int32"),
-        (func_64, "int64"),
-    ]:
-        built = tvm.build(opfunc, target=target)
-        x_data = np.random.randint(te.min_value(type), te.max_value(type), size=(100), dtype=type)
-        y_data = np.asarray([i for i in range(N)], dtype=type)
-
-        a_dev = tvm.nd.empty([N + 100, 2], type, dev)
-        b_dev = tvm.nd.array(x_data, dev)
-        c_dev = tvm.nd.array(y_data, dev)
-
-        built(a_dev, b_dev, c_dev)
-
-        a = a_dev.numpy()
-        b = b_dev.numpy()
-        c = c_dev.numpy()
-
-        # python modulo behaves a bit different to tvm floormod for negative numbers
-        for i in range(N + 100):
-            if a[i, 1] < 0:
-                a[i, 1] = divisor + a[i, 1]
-
-        np.testing.assert_array_equal(a[:100, 0], (c - te.max_value(type)) // divisor)
-        np.testing.assert_array_equal(a[:100, 1], (c - te.max_value(type)) % divisor)
-        np.testing.assert_array_equal(a[100 : N + 100, 0], b // divisor)
-        np.testing.assert_array_equal(a[100 : N + 100, 1], b % divisor)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py
deleted file mode 100644
index d8e374393012..000000000000
--- a/tests/python/relay/test_op_grad_level1.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-
-from tvm import te, relay
-from tvm.relay.testing import check_grad, run_infer_type
-from tvm.relay.transform import gradient
-
-executor_kind = tvm.testing.parameter("debug")
-
-
-def sigmoid(x):
-    one = np.ones_like(x)
-    return one / (one + np.exp(-x))
-
-
-def relu(x):
-    x_copy = np.copy(x)
-    np.maximum(x_copy, 0, x_copy)
-    return x_copy
-
-
-class TestUnaryOp:
-    config = {
-        "log": (tvm.relay.log, lambda x, g: g * (1 / x)),
-        "exp": (tvm.relay.exp, lambda x, g: g * np.exp(x)),
-        "sigmoid": (tvm.relay.sigmoid, lambda x, g: g * sigmoid(x) * (1 - sigmoid(x))),
-        "tanh": (tvm.relay.tanh, lambda x, g: g * (1 - np.tanh(x) * np.tanh(x))),
-        "sqrt": (tvm.relay.sqrt, lambda x, g: g * 0.5 * np.power(x, -0.5)),
-        "abs": (tvm.relay.abs, lambda x, g: np.where(x < 0, -g, g)),
-        "relu": (relay.nn.relu, lambda x, g: np.where(x < 0, np.zeros_like(x), g)),
-        "erf": (tvm.relay.erf, lambda x, g: g * (2.0 / (np.pi ** (0.5)) * np.exp(-x * x))),
-        "cos": (tvm.relay.cos, lambda x, g: g * -1.0 * np.sin(x)),
-        "sin": (tvm.relay.sin, lambda x, g: g * np.cos(x)),
-        "tan": (tvm.relay.tan, lambda x, g: g * (1.0 / (np.cos(x) ** 2))),
-        "atan": (tvm.relay.atan, lambda x, g: g * (1 / (1 + np.power(x, 2.0)))),
-        "log2": (tvm.relay.log2, lambda x, g: g * (1 / (np.log(2) * x))),
-        "log10": (tvm.relay.log10, lambda x, g: g * (1 / (np.log(10) * x))),
-        "cosh": (tvm.relay.cosh, lambda x, g: g * (np.sinh(x))),
-        "sinh": (tvm.relay.sinh, lambda x, g: g * (np.cosh(x))),
-        "asin": (tvm.relay.asin, lambda x, g: g * (1.0 / (1.0 - x**2) ** (1.0 / 2.0))),
-        "acos": (tvm.relay.acos, lambda x, g: g * (-1.0 / (1.0 - x**2.0) ** (1.0 / 2.0))),
-        "acosh": (tvm.relay.acosh, lambda x, g: g * (1.0 / (x**2 - 1.0) ** (1.0 / 2.0))),
-        "asinh": (tvm.relay.asinh, lambda x, g: g * (1.0 / (x**2 + 1.0) ** (1.0 / 2.0))),
-        "atanh": (tvm.relay.atanh, lambda x, g: g * (-1.0 / (x**2 - 1.0))),
-    }
-
-    relay_op, ref_func = tvm.testing.parameters(*config.values(), ids=config.keys())
-    dtype = tvm.testing.parameter("float32", "float64")
-    shape = tvm.testing.parameter((10, 4))
-
-    def test_op(self, target, dev, executor_kind, relay_op, ref_func, shape, dtype):
-
-        target = tvm.target.Target(target)
-        if target.kind.name == "vulkan":
-
-            known_breaks = {
-                "float32": [
-                    tvm.relay.erf,
-                    tvm.relay.tan,
-                    tvm.relay.atan,
-                    tvm.relay.log10,
-                    tvm.relay.cosh,
-                    tvm.relay.sinh,
-                    tvm.relay.asin,
-                    tvm.relay.acos,
-                    tvm.relay.acosh,
-                    tvm.relay.asinh,
-                    tvm.relay.atanh,
-                ],
-                "float64": [
-                    tvm.relay.log,
-                    tvm.relay.exp,
-                    tvm.relay.sigmoid,
-                    tvm.relay.tanh,
-                    tvm.relay.sqrt,
-                    tvm.relay.erf,
-                    tvm.relay.cos,
-                    tvm.relay.sin,
-                    tvm.relay.tan,
-                    tvm.relay.atan,
-                    tvm.relay.log2,
-                    tvm.relay.log10,
-                    tvm.relay.cosh,
-                    tvm.relay.sinh,
-                    tvm.relay.asin,
-                    tvm.relay.acos,
-                    tvm.relay.acosh,
-                    tvm.relay.asinh,
-                    tvm.relay.atanh,
-                ],
-            }
-
-            if relay_op in known_breaks[dtype]:
-                pytest.xfail(f"{dtype} {relay_op.__name__} not yet supported on Vulkan runtime")
-
-        tp = relay.TensorType(shape, dtype)
-        x = relay.var("x", tp)
-        g = relay.var("g", tp)
-        y = relay_op(x) * g
-
-        fwd_func = relay.Function([x, g], y)
-        fwd_func = run_infer_type(fwd_func)
-        bwd_func = run_infer_type(gradient(fwd_func))
-
-        data_in = np.random.rand(*shape).astype(dtype)
-        grad_in = np.random.rand(*shape).astype(dtype)
-        ref_grad_out = ref_func(data_in, grad_in)
-
-        op_res, (op_grad, _) = relay.create_executor(
-            executor_kind, device=dev, target=target
-        ).evaluate(bwd_func)(data_in, grad_in)
-        np.testing.assert_allclose(op_grad.numpy(), ref_grad_out, rtol=0.01)
-
-
-class TestBinaryOp:
-    config = {
-        "add": (relay.add, lambda x, y: [np.ones_like(x), np.ones_like(y)]),
-        "subtract": (relay.subtract, lambda x, y: [np.ones_like(x), -np.ones_like(y)]),
-        "multiply": (relay.multiply, lambda x, y: [y, x]),
-        "divide": (relay.divide, lambda x, y: [1 / y, -x / (y**2)]),
-    }
-
-    relay_op, ref_func = tvm.testing.parameters(*config.values(), ids=config.keys())
-    dtype = tvm.testing.parameter("float32", "float64")
-    shape = tvm.testing.parameter((5, 10, 5))
-
-    def test_binary_op(self, target, dev, executor_kind, relay_op, ref_func, shape, dtype):
-        t = relay.TensorType(shape, dtype=dtype)
-        x = relay.var("x", t)
-        y = relay.var("y", t)
-        z = relay_op(x, y)
-
-        x_data = np.random.rand(*shape).astype(t.dtype)
-        y_data = np.random.rand(*shape).astype(t.dtype)
-        ref_grad0, ref_grad1 = ref_func(x_data, y_data)
-        fwd_func = relay.Function([x, y], z)
-        fwd_func = run_infer_type(fwd_func)
-        bwd_func = run_infer_type(gradient(fwd_func))
-
-        op_res, (op_grad0, op_grad1) = relay.create_executor(
-            executor_kind, device=dev, target=target
-        ).evaluate(bwd_func)(x_data, y_data)
-        np.testing.assert_allclose(op_grad0.numpy(), ref_grad0, rtol=0.01)
-        np.testing.assert_allclose(op_grad1.numpy(), ref_grad1, rtol=0.01)
-
-
-def test_softmax_grad(executor_kind, target, dev):
-    target = tvm.target.Target(target)
-    if target.kind.name == "vulkan":
-        pytest.xfail("Known failure on vulkan")
-
-    data = relay.var("data", relay.TensorType((1, 16), "float64"))
-    fwd_func = relay.Function([data], relay.nn.softmax(data))
-    check_grad(fwd_func, scale=1, target_devices=[(target, dev)], executor_kind=executor_kind)
-
-
-def test_log_softmax_grad(executor_kind, target, dev):
-    target = tvm.target.Target(target)
-    if target.kind.name == "vulkan":
-        pytest.xfail("Known failure on vulkan")
-
-    data = relay.var("data", relay.TensorType((2, 16), "float64"))
-    fwd_func = relay.Function([data], relay.nn.log_softmax(data))
-    check_grad(fwd_func, scale=1, target_devices=[(target, dev)], executor_kind=executor_kind)
-
-
-class TestBiasAddGrad:
-    d_shape, b_shape, axis = tvm.testing.parameters(
-        ((1, 16), (16,), 1),
-        ((1, 8, 2, 2), (8,), 1),
-        ((1, 2, 2, 8), (8,), 3),
-        ((4, 8), (8,), 1),
-    )
-
-    def test_bias_add(self, executor_kind, target, dev, d_shape, b_shape, axis):
-        data = relay.var("data", relay.TensorType(d_shape, "float32"))
-        bias = relay.var("bias", relay.TensorType(b_shape, "float32"))
-        fwd_func = relay.Function([data, bias], relay.nn.bias_add(data, bias, axis=axis))
-        check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind)
-
-
-def test_expand_dims_grad(executor_kind, target, dev):
-    data = relay.var("data", shape=(2, 3), dtype="float64")
-    fwd_func = relay.Function([data], relay.expand_dims(data, axis=1, num_newaxis=2))
-    check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind)
-
-
-def test_concatenate_grad(executor_kind, target, dev):
-    x = relay.var("x", shape=(2, 2, 5))
-    y = relay.var("y", shape=(2, 1, 5))
-    z = relay.var("z", shape=(2, 4, 5))
-    fwd_func = relay.Function([x, y, z], relay.concatenate([x, y, z], axis=1))
-    check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_grad_level10.py b/tests/python/relay/test_op_grad_level10.py
deleted file mode 100644
index 08add13f8072..000000000000
--- a/tests/python/relay/test_op_grad_level10.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-
-from tvm import relay
-from tvm.relay.testing import check_grad
-
-
-index_dtype = tvm.testing.parameter("int32", "int64")
-val_dtype = tvm.testing.parameter("float32", "float64")
-executor_kind = tvm.testing.parameter("debug")
-
-
-def test_cross_entropy_grad(executor_kind, target, dev, val_dtype):
-    target = tvm.target.Target(target)
-    if target.kind.name == "vulkan" and val_dtype == "float64":
-        # GLSL.std.450's Log implementation only takes 16/32-bit floats.
-        pytest.xfail("Known failing test case for vulkan runtime")
-
-    x = relay.var("x", shape=(2, 5), dtype=val_dtype)
-    y = relay.var("y", shape=(2, 5), dtype=val_dtype)
-    check_grad(
-        relay.Function([x, y], relay.op.nn.cross_entropy(x, y)),
-        eps=0.01,
-        scale=0.1,
-        mean=1,
-        target_devices=[(target, dev)],
-        executor_kind=executor_kind,
-    )
-
-
-def test_cross_entropy_with_logits_grad(executor_kind, target, dev, val_dtype):
-    x = relay.var("x", shape=(2, 5), dtype=val_dtype)
-    y = relay.var("y", shape=(2, 5), dtype=val_dtype)
-    check_grad(
-        relay.Function([x, y], relay.op.nn.cross_entropy_with_logits(x, y)),
-        eps=0.01,
-        scale=0.1,
-        mean=1,
-        target_devices=[(target, dev)],
-        executor_kind=executor_kind,
-    )
-
-
-def test_checkpoint(executor_kind, target, dev):
-    inputs = [relay.var("x{}".format(i), shape=(1,)) for i in range(4)]
-    output = relay.multiply(relay.add(inputs[0], inputs[1]), relay.add(inputs[2], inputs[3]))
-    check_grad(
-        relay.Function(inputs, relay.annotation.checkpoint(output)), executor_kind=executor_kind
-    )
-
-    scope = relay.ScopeBuilder()
-    out_tuple = scope.let(
-        "out_tuple",
-        relay.Tuple([relay.add(inputs[0], inputs[1]), relay.multiply(inputs[2], inputs[3])]),
-    )
-    scope.ret(
-        relay.subtract(
-            relay.annotation.checkpoint(relay.TupleGetItem(out_tuple, 0)),
-            relay.TupleGetItem(out_tuple, 1),
-        )
-    )
-    out_single = scope.get()
-    check_grad(
-        relay.Function(inputs, out_single),
-        target_devices=[(target, dev)],
-        executor_kind=executor_kind,
-    )
-
-
-class TestBatchMatmulGrad:
-    a_shape, b_shape, transpose_a, transpose_b = tvm.testing.parameters(
-        ((2, 3, 5), (2, 5, 4), False, False),
-        ((2, 3, 5), (2, 4, 5), False, True),
-        ((2, 5, 3), (2, 5, 4), True, False),
-        ((2, 5, 3), (2, 4, 5), True, True),
-    )
-
-    def test_batch_matmul_grad(
-        self, executor_kind, target, dev, a_shape, b_shape, transpose_a, transpose_b
-    ):
-        tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32"))
-        tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32"))
-        check_grad(
-            relay.Function(
-                [tensor_a, tensor_b],
-                relay.op.nn.batch_matmul(
-                    tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b
-                ),
-            ),
-            target_devices=[(target, dev)],
-            executor_kind=executor_kind,
-        )
-
-
-def test_reverse_reshape_grad(executor_kind, target, dev):
-    x = relay.var("x", shape=(3, 4, 5), dtype="float64")
-    check_grad(
-        relay.Function([x], relay.op.reverse_reshape(x, (-1, 0))),
-        target_devices=[(target, dev)],
-        executor_kind=executor_kind,
-    )
-
-
-def test_one_hot_grad(executor_kind, target, dev, index_dtype, val_dtype):
-    indices_shape = (3, 4)
-    depth = 5
-    axis = -1
-
-    inputs = [
-        np.random.randint(depth, size=indices_shape, dtype=index_dtype),
-        np.array(np.random.randn() * 1e-5).astype(val_dtype),
-        np.array(np.random.randn() * 1e-5).astype(val_dtype),
-    ]
-    test_inputs = inputs[1:]
-
-    indices = relay.var("indices", shape=indices_shape, dtype=index_dtype)
-    on_val = relay.var("on_val", shape=tuple(), dtype=val_dtype)
-    off_val = relay.var("off_val", shape=tuple(), dtype=val_dtype)
-    y = relay.one_hot(indices, on_val, off_val, depth, axis, val_dtype)
-    f = relay.Function([indices, on_val, off_val], y)
-
-    check_grad(
-        f,
-        inputs=inputs,
-        test_inputs=test_inputs,
-        target_devices=[(target, dev)],
-        executor_kind=executor_kind,
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py
deleted file mode 100644
index 7a40a58ee852..000000000000
--- a/tests/python/relay/test_op_grad_level2.py
+++ /dev/null
@@ -1,367 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-from tvm import topi
-import tvm.topi.testing
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.testing import check_grad, run_infer_type, run_opt_pass
-from tvm.relay.transform import gradient
-import tvm.testing
-
-executor_kind = tvm.testing.parameter("debug")
-
-
-def verify_max_pool2d_grad(executor_kind, x_shape, pool_size, strides, padding, ceil_mode):
-    x = relay.var("x", relay.TensorType(x_shape, "float32"))
-    y = tvm.relay.nn.max_pool2d(
-        x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode
-    )
-
-    fwd_func = relay.Function([x], y)
-    fwd_func = run_infer_type(fwd_func)
-    bwd_func = run_infer_type(gradient(fwd_func))
-
-    data = np.random.rand(*x_shape).astype("float32")
-    ph, pw = padding
-    y_shape = topi.utils.get_const_tuple(fwd_func.ret_type.shape)
-    out_grad = np.ones(shape=y_shape)
-    ref_grad = tvm.topi.testing.pool_grad_nchw(
-        data,
-        out_grad,
-        pool_size=pool_size,
-        strides=strides,
-        padding=[ph, pw, ph, pw],
-        pool_type="max",
-        ceil_mode=ceil_mode,
-    )
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res, (op_grad,) = relay.create_executor(
-            executor_kind, device=dev, target=target
-        ).evaluate(bwd_func)(data)
-        np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
-
-
-@tvm.testing.uses_gpu
-def test_max_pool2d_grad(executor_kind):
-    verify_max_pool2d_grad(
-        executor_kind,
-        (1, 4, 16, 16),
-        pool_size=(2, 2),
-        strides=(2, 2),
-        padding=(0, 0),
-        ceil_mode=False,
-    )
-    verify_max_pool2d_grad(
-        executor_kind,
-        (1, 4, 16, 16),
-        pool_size=(1, 1),
-        strides=(1, 1),
-        padding=(1, 1),
-        ceil_mode=False,
-    )
-
-
-def verify_avg_pool2d_grad(
-    x_shape,
-    pool_size,
-    strides,
-    padding,
-    ceil_mode,
-    count_include_pad,
-    executor_kind,
-    dtype="float32",
-):
-
-    for shape_dtype in ["int32", "int64"]:
-        x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in x_shape], dtype=dtype)
-        y = tvm.relay.nn.avg_pool2d(
-            x,
-            pool_size=pool_size,
-            strides=strides,
-            padding=padding,
-            ceil_mode=ceil_mode,
-            count_include_pad=count_include_pad,
-        )
-
-        fwd_func = relay.Function([x], y)
-        fwd_func = run_infer_type(fwd_func)
-        bwd_func = run_infer_type(gradient(fwd_func))
-
-        data = np.random.rand(*x_shape).astype(dtype)
-        ph, pw = padding
-        y_shape = topi.utils.get_const_tuple(fwd_func.ret_type.shape)
-        out_grad = np.ones(shape=y_shape)
-        ref_grad = tvm.topi.testing.pool_grad_nchw(
-            data,
-            out_grad,
-            pool_size=pool_size,
-            strides=strides,
-            padding=[ph, pw, ph, pw],
-            pool_type="avg",
-            ceil_mode=ceil_mode,
-        )
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res, (op_grad,) = relay.create_executor(
-                executor_kind, device=dev, target=target
-            ).evaluate(bwd_func)(data)
-            np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
-
-
-@tvm.testing.uses_gpu
-def test_avg_pool2d_grad(executor_kind):
-    verify_avg_pool2d_grad(
-        (1, 4, 16, 16),
-        pool_size=(2, 2),
-        strides=(2, 2),
-        padding=(0, 0),
-        ceil_mode=False,
-        count_include_pad=True,
-        executor_kind=executor_kind,
-    )
-    verify_avg_pool2d_grad(
-        (1, 4, 16, 16),
-        pool_size=(1, 1),
-        strides=(1, 1),
-        padding=(1, 1),
-        ceil_mode=False,
-        count_include_pad=False,
-        executor_kind=executor_kind,
-    )
-    verify_avg_pool2d_grad(
-        (1, 4, 16, 16),
-        pool_size=(1, 1),
-        strides=(1, 1),
-        padding=(1, 1),
-        ceil_mode=False,
-        count_include_pad=False,
-        executor_kind=executor_kind,
-        dtype="float16",
-    )
-
-
-def verify_global_avg_pool2d_grad(executor_kind, x_shape):
-    x = relay.var("x", relay.TensorType(x_shape, "float32"))
-    y = tvm.relay.nn.global_avg_pool2d(x)
-
-    fwd_func = relay.Function([x], y)
-    fwd_func = run_infer_type(fwd_func)
-    bwd_func = run_infer_type(gradient(fwd_func))
-
-    data = np.random.rand(*x_shape).astype("float32")
-    y_shape = topi.utils.get_const_tuple(fwd_func.ret_type.shape)
-    out_grad = np.ones(shape=y_shape)
-    ref_grad = tvm.topi.testing.pool_grad_nchw(
-        data,
-        out_grad,
-        pool_size=(x_shape[2], x_shape[3]),
-        strides=(1, 1),
-        padding=[0, 0, 0, 0],
-        pool_type="avg",
-        ceil_mode=False,
-    )
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res, (op_grad,) = relay.create_executor(
-            executor_kind, device=dev, target=target
-        ).evaluate(bwd_func)(data)
-        np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
-
-
-@tvm.testing.uses_gpu
-def test_global_avg_pool2d_grad(executor_kind):
-    verify_global_avg_pool2d_grad(executor_kind, (1, 4, 16, 16))
-    verify_global_avg_pool2d_grad(executor_kind, (1, 8, 8, 24))
-
-
-def verify_conv2d_grad(
-    dshape, wshape, strides, padding, dilation, groups=1, mode="higher_order", executor_kind="vm"
-):
-    dtype = "float32"
-    data = relay.var("data", shape=dshape, dtype=dtype)
-    weight = relay.var("weight", shape=wshape, dtype=dtype)
-    conv = relay.nn.conv2d(
-        data,
-        weight,
-        strides=strides,
-        padding=padding,
-        dilation=dilation,
-        groups=groups,
-        out_dtype=dtype,
-    )
-    fwd_func = relay.Function([data, weight], conv)
-    check_grad(fwd_func, mode=mode, executor_kind=executor_kind)
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_grad(executor_kind):
-    verify_conv2d_grad(
-        (1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1], executor_kind=executor_kind
-    )
-    verify_conv2d_grad(
-        (1, 4, 16, 16), (16, 4, 1, 1), [1, 1], [0, 0], [1, 1], executor_kind=executor_kind
-    )
-    verify_conv2d_grad(
-        (1, 4, 16, 16), (16, 4, 1, 1), [2, 2], [0, 0], [1, 1], executor_kind=executor_kind
-    )
-    verify_conv2d_grad(
-        (1, 4, 16, 16),
-        (16, 4, 3, 3),
-        [1, 1],
-        [1, 1],
-        [1, 1],
-        mode="first_order",
-        executor_kind=executor_kind,
-    )
-
-
-def verify_dense_grad(d_shape, w_shape, executor_kind):
-    data = relay.var("data", relay.TensorType(d_shape, "float32"))
-    weight = relay.var("weight", relay.TensorType(w_shape, "float32"))
-    fwd_func = relay.Function([data, weight], relay.nn.dense(data, weight))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_dense_grad(executor_kind):
-    verify_dense_grad((1, 8), (16, 8), executor_kind)
-    verify_dense_grad((1, 4), (3, 4), executor_kind)
-    verify_dense_grad((5, 4), (3, 4), executor_kind)
-
-
-def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b, executor_kind):
-    tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32"))
-    tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32"))
-    fwd_func = relay.Function(
-        [tensor_a, tensor_b],
-        relay.nn.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b),
-    )
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_matmul_grad(executor_kind):
-    verify_matmul_grad((1, 8), (8, 16), False, False, executor_kind)
-    verify_matmul_grad((4, 1), (4, 3), True, False, executor_kind)
-    verify_matmul_grad((4, 5), (3, 4), True, True, executor_kind)
-
-
-def verify_batch_flatten_grad(d_shape, executor_kind):
-    data = relay.var("data", relay.TensorType(d_shape, "float32"))
-    fwd_func = relay.Function([data], relay.nn.batch_flatten(data))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_batch_flatten_grad(executor_kind):
-    verify_batch_flatten_grad((1, 2, 3, 4), executor_kind)
-    verify_batch_flatten_grad((1, 8), executor_kind)
-
-
-def verify_conv2d_backward_weight(
-    executor_kind, dy_shape, x_shape, kernel_size, stride, padding, groups=1, out_channels=None
-):
-    dtype = "float32"
-    dy = relay.var("dy", shape=dy_shape, dtype=dtype)
-    x = relay.var("x", shape=x_shape, dtype=dtype)
-    dw_func = relay.Function(
-        [dy, x],
-        relay.nn.conv2d_backward_weight(
-            dy,
-            x,
-            strides=stride,
-            padding=padding,
-            kernel_size=kernel_size,
-            groups=groups,
-            channels=out_channels,
-            out_dtype=dtype,
-        ),
-    )
-
-    dw_func_legalized = run_opt_pass(dw_func, relay.transform.Legalize())
-
-    for dw, target in [(dw_func_legalized, "llvm"), (dw_func, "cuda -libs=cudnn")]:
-        if "cudnn" in target and not tvm.contrib.cudnn.exists():
-            continue
-
-        dev = tvm.device(target, 0)
-        dy_np = np.random.randn(*dy_shape).astype(dtype)
-        x_np = np.random.randn(*x_shape).astype(dtype)
-
-        dw_np = (
-            relay.create_executor(executor_kind, device=dev, target=target)
-            .evaluate(dw)(dy_np, x_np)
-            .numpy()
-        )
-        ref_dw_np = tvm.topi.testing.conv2d_backward_weight_python(
-            dy_np, x_np, kernel_size, stride, padding, groups=groups, channels=out_channels
-        )
-
-        np.testing.assert_allclose(dw_np, ref_dw_np, rtol=1e-4, atol=1e-4)
-
-
-def test_conv2d_backward_weight(executor_kind):
-    verify_conv2d_backward_weight(
-        executor_kind, (2, 8, 32, 32), (2, 4, 32, 32), (3, 3), (1, 1), (1, 1)
-    )
-    verify_conv2d_backward_weight(
-        executor_kind, (2, 16, 15, 15), (2, 3, 32, 32), (3, 3), (2, 2), (0, 0)
-    )
-    verify_conv2d_backward_weight(
-        executor_kind,
-        (1, 16, 32, 32),
-        (1, 16, 32, 32),
-        (3, 3),
-        (1, 1),
-        (1, 1),
-        groups=16,
-        out_channels=16,
-    )
-
-
-def test_conv2d_backward_weight_infer_type():
-    # From https://github.com/apache/tvm/pull/10439
-    depthwise_conv_code = """
-    fn (%input0: Tensor[(1, 3, 32, 32), float32], %v0_weight: Tensor[(3, 1, 3, 3), float32], %v0_bias: Tensor[(3), float32]) {
-      %0 = nn.conv2d(%input0, %v0_weight, padding=[1, 1, 1, 1], groups=3, channels=3, kernel_size=[3, 3]);
-      nn.bias_add(%0, %v0_bias)
-    }
-    """
-
-    normal_conv_code = """
-    fn (%input0: Tensor[(1, 3, 32, 32), float32], %v0_weight: Tensor[(3, 3, 3, 3), float32], %v0_bias: Tensor[(3), float32]) {
-      %0 = nn.conv2d(%input0, %v0_weight, padding=[1, 1, 1, 1], groups=1, channels=3, kernel_size=[3, 3]);
-      nn.bias_add(%0, %v0_bias)
-    }
-    """
-
-    SEMVER = '#[version = "0.0.5"]\n'
-
-    for code in [normal_conv_code, depthwise_conv_code]:
-        expr = tvm.relay.parse_expr(SEMVER + code)
-        fmod = tvm.IRModule.from_expr(expr)
-
-        mod = relay.transform.InferType()(fmod)
-        bwd_expr = relay.transform.gradient(mod["main"], mode="first_order")
-
-        bwd_mod = tvm.IRModule.from_expr(bwd_expr)
-        bwd_mod = relay.transform.InferType()(bwd_mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py
deleted file mode 100644
index 4ca7cb9ce07f..000000000000
--- a/tests/python/relay/test_op_grad_level3.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.testing import check_grad, run_infer_type, run_opt_pass, _np_randn_from_type
-from tvm.relay.transform import gradient
-import tvm.testing
-
-executor_kind = tvm.testing.parameter("debug")
-
-
-@tvm.testing.uses_gpu
-def test_clip(executor_kind):
-    for dtype in ("float32", "float64"):
-        ref = lambda x: np.where(
-            x > 10.0, np.zeros_like(x), np.where(x < 1.0, np.zeros_like(x), np.ones_like(x))
-        )
-        x = relay.var("x", relay.TensorType((10, 4), dtype))
-        y = tvm.relay.clip(x, 1.0, 10.0)
-
-        data = np.random.rand(10, 4).astype(dtype) * 11.0
-        ref_grad = ref(data)
-        fwd_func = relay.Function([x], y)
-        fwd_func = run_infer_type(fwd_func)
-        bwd_func = run_infer_type(gradient(fwd_func))
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res, (op_grad,) = relay.create_executor(
-                executor_kind, device=dev, target=target
-            ).evaluate(bwd_func)(data)
-            np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
-
-
-def verify_transpose_grad(d_shape, axes=None, executor_kind="vm"):
-    data = relay.var("data", relay.TensorType(d_shape, "float32"))
-    fwd_func = relay.Function([data], relay.transpose(data, axes=axes))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_transpose_grad(executor_kind):
-    verify_transpose_grad((1, 2, 3, 4), executor_kind=executor_kind)
-    verify_transpose_grad((1, 2, 3, 4), axes=(0, 2, 3, 1), executor_kind=executor_kind)
-
-
-def test_negative_grad(executor_kind):
-    data = relay.var("data", relay.TensorType((10, 4), "float32"))
-    fwd_func = relay.Function([data], relay.negative(data))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_cast_grad(executor_kind):
-    data = relay.var("data", relay.TensorType((10, 4), "float32"))
-    fwd_func = relay.Function([data], relay.cast(data, "float64"))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_cast_like_grad(executor_kind):
-    data = relay.var("data", shape=(10, 4), dtype="float32")
-    like = relay.var("like", shape=(1,), dtype="float64")
-    fwd_func = relay.Function([data, like], relay.cast_like(data, like))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_copy_grad(executor_kind):
-    data = relay.var("data", relay.TensorType((10, 4), "float64"))
-    fwd_func = relay.Function([data], relay.copy(data))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_take_grad(executor_kind):
-    data_dtype = relay.TensorType((3, 4, 5), "float64")
-    data = relay.var("data", data_dtype)
-    indices = relay.var("indices", relay.TensorType((relay.Any(),), "int32"))
-    inputs = [_np_randn_from_type(data_dtype, scale=1e-5), np.array([1, 2], dtype="int32")]
-    test_inputs = [inputs[0]]
-
-    # take on axis
-    fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=1))
-    check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs, executor_kind=executor_kind)
-
-    # take on flattened
-    fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=None))
-    check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs, executor_kind=executor_kind)
-
-
-def test_stack_grad(executor_kind):
-    args = [relay.var(c, shape=(2, 3, 4), dtype="float64") for c in "xyz"]
-    fwd_func = relay.Function(args, relay.stack(args, axis=0))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_squeeze_grad(executor_kind):
-    data = relay.var("data", shape=(2, 1, 1, 3, 4, 1), dtype="float64")
-    fwd_func = relay.Function([data], relay.squeeze(data))
-    fwd_func_subset = relay.Function([data], relay.squeeze(data, axis=[1, -1]))
-    check_grad(fwd_func, executor_kind=executor_kind)
-    check_grad(fwd_func_subset, executor_kind=executor_kind)
-
-
-def test_arange_grad(executor_kind):
-    # TODO: testing arange numerically is strange because two-sided approx can
-    #       produce different output shapes
-    dtype = "float64"
-    start = relay.var("start", relay.TensorType((), dtype))
-    stop = relay.var("stop", relay.TensorType((), dtype))
-    step = relay.var("step", relay.TensorType((), dtype))
-    values = [np.array(v, dtype=dtype) for v in [2.5, 9.5, 1.8]]
-    fwd_func = relay.Function([start, stop, step], relay.arange(start, stop, step, dtype))
-    check_grad(fwd_func, inputs=values, executor_kind=executor_kind)
-
-
-def test_gather_nd_grad(executor_kind):
-    data = relay.var("data", relay.TensorType((2, 3), "float64"))
-    indices = relay.var("indices", relay.TensorType((2, 4), "int64"))
-    fwd = relay.Function([data, indices], relay.gather_nd(data, indices))
-    data_np = np.random.rand(2, 3).astype("float64")
-    indices_np = np.array([[0, 1, 1, 0], [0, 1, 0, 0]], dtype="int64")
-    check_grad(
-        fwd, inputs=[data_np, indices_np], test_inputs=[data_np], executor_kind=executor_kind
-    )
-
-
-def test_reshape_like_grad(executor_kind):
-    data = relay.var("data", shape=(2, 3, 4), dtype="float32")
-    shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32")
-    fwd_func = relay.Function([data, shape_like], relay.reshape_like(data, shape_like))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_zeros_ones_grad_const_ints():
-    # when shape is static (i.e. not an input), there is no gradient at all
-    static_ty = relay.TensorType([2, 3, 4], dtype="float32")
-    expected_ty = relay.TupleType([static_ty, relay.TupleType([])])
-
-    for op in [relay.zeros, relay.ones]:
-        fwd_func = relay.Function([], op(static_ty.concrete_shape, static_ty.dtype))
-        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
-        tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty)
-
-
-def test_zeros_ones_grad_const_expr():
-    # when shape is static (i.e. not an input), there is no gradient at all
-    shape_const = relay.const(np.array([2, 3, 4]), dtype="int32") * relay.const(1, dtype="int32")
-    static_ty = relay.TensorType([2, 3, 4], dtype="float32")
-    dyn_ty = relay.TensorType([relay.Any(), relay.Any(), relay.Any()], dtype="float32")
-    expected_ty_static = relay.TupleType([static_ty, relay.TupleType([])])
-    expected_ty_dyn = relay.TupleType([dyn_ty, relay.TupleType([])])
-
-    for op in [relay.zeros, relay.ones]:
-        # with DynamicToStatic, the shape should be concretized
-        fwd_func = relay.Function([], op(shape_const, static_ty.dtype))
-        fwd_func = run_opt_pass(fwd_func, relay.transform.DynamicToStatic())
-        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
-        tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_static)
-
-        fwd_func = relay.Function([], op(shape_const, static_ty.dtype))
-        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
-        tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_dyn)
-
-
-def test_zeros_ones_grad_dynamic(executor_kind):
-    rank = np.random.randint(low=1, high=5, dtype="int32")
-    dyn_shape = np.random.randint(low=1, high=4, size=(rank,), dtype="int32")
-    shape_data = relay.var("shape_data", shape=(rank,), dtype="int32")
-
-    for op, op_ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
-        fwd_func = relay.Function([shape_data], op(shape_data, dtype="float32"))
-        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
-
-        for target, dev in tvm.testing.enabled_targets():
-            res, (grad,) = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
-                bwd_func
-            )(dyn_shape)
-            tvm.testing.assert_allclose(res.numpy(), op_ref(dyn_shape, dtype="float32"))
-            tvm.testing.assert_allclose(grad.numpy(), np.zeros((rank,), dtype="int32"))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_grad_level4.py b/tests/python/relay/test_op_grad_level4.py
deleted file mode 100644
index b85e692c5fe2..000000000000
--- a/tests/python/relay/test_op_grad_level4.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import numpy as np
-import tvm.testing
-from tvm import relay
-from tvm.relay.testing import check_grad, _np_randn_from_type
-
-executor_kind = tvm.testing.parameter("debug")
-
-
-def verify_reduction_grad(executor_kind, red_fn, d_shape, axis=None, keepdims=False, exclude=False):
-    data = relay.var("data", relay.TensorType(d_shape, "float32"))
-    fwd_func = relay.Function([data], red_fn(data, axis=axis, keepdims=keepdims, exclude=exclude))
-    check_grad(fwd_func, executor_kind=executor_kind)
-
-
-def test_reduction_grad(executor_kind):
-    def _unbiased_variance(x, axis=None, keepdims=False, exclude=False):
-        return relay.variance(x, axis=axis, keepdims=keepdims, exclude=exclude, unbiased=True)
-
-    for op in (relay.sum, relay.variance, _unbiased_variance, relay.mean):
-        verify_reduction_grad(executor_kind, op, (4, 2))
-        verify_reduction_grad(executor_kind, op, (4, 2), axis=-1, keepdims=True)
-        verify_reduction_grad(executor_kind, op, (4, 2, 1), axis=(1, 2), exclude=True)
-        verify_reduction_grad(executor_kind, op, (4, 2, 1), axis=1)
-
-
-def verify_max_grad(executor_kind, d_shape, axis=None, keepdims=False, exclude=False):
-    data = relay.var("data", relay.TensorType(d_shape, "float32"))
-    fwd_func = relay.Function(
-        [data], relay.max(data, axis=axis, keepdims=keepdims, exclude=exclude)
-    )
-    check_grad(fwd_func, scale=1e-3, executor_kind=executor_kind)
-
-
-def test_max_grad(executor_kind):
-    verify_max_grad(executor_kind, (10, 10), axis=None)
-    verify_max_grad(executor_kind, (10, 10), axis=-1)
-    verify_max_grad(executor_kind, (6, 3, 2), axis=(1, 2), keepdims=True)
-    verify_max_grad(executor_kind, (5, 4, 3), axis=(0, 2), exclude=True)
-
-
-def test_where_grad(executor_kind):
-    cond_type = relay.TensorType((2, 3, 4), "int32")
-    lhs_type = relay.TensorType((1, 3, 4), "float32")
-    rhs_type = relay.TensorType((2, 1, 4), "float32")
-    inputs = [
-        np.random.randint(2, size=cond_type.concrete_shape, dtype=cond_type.dtype),
-        _np_randn_from_type(lhs_type, scale=1e-5),
-        _np_randn_from_type(rhs_type, scale=1e-5),
-    ]
-
-    cond = relay.var("cond", type_annotation=cond_type)
-    lhs = relay.var("lhs", type_annotation=lhs_type)
-    rhs = relay.var("rhs", type_annotation=rhs_type)
-    fwd_func = relay.Function([cond, lhs, rhs], relay.where(cond, lhs, rhs))
-    check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:], executor_kind=executor_kind)
-
-
-def test_less_equal_grad(executor_kind):
-    x_type = relay.TensorType((2, 3, 4), "float32")
-    y_type = relay.TensorType((3, 1), "float32")
-    # We need to generate inputs far apart to get correct numerical gradients
-    # (otherwise adding epsilon may change comparison result). The gradient
-    # should always be zero for both inputs.
-    inputs = [
-        np.random.choice([-1, 1], size=x_type.concrete_shape).astype(x_type.dtype),
-        np.random.choice([-2, 2], size=y_type.concrete_shape).astype(y_type.dtype),
-    ]
-
-    x = relay.var("x", type_annotation=x_type)
-    y = relay.var("y", type_annotation=y_type)
-    fwd_func = relay.Function([x, y], relay.less_equal(x, y))
-    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6, executor_kind=executor_kind)
-
-
-def test_not_equal_grad(executor_kind):
-    x_type = relay.TensorType((2, 3, 4), "float32")
-    y_type = relay.TensorType((3, 1), "float32")
-    # We need to generate inputs far apart to get correct numerical gradients
-    # (otherwise adding epsilon may change comparison result). The gradient
-    # should always be zero for both inputs.
-    inputs = [
-        np.random.choice([-1, 1], size=x_type.concrete_shape).astype(x_type.dtype),
-        np.random.choice([-2, 2], size=y_type.concrete_shape).astype(y_type.dtype),
-    ]
-
-    x = relay.var("x", type_annotation=x_type)
-    y = relay.var("y", type_annotation=y_type)
-    fwd_func = relay.Function([x, y], relay.not_equal(x, y))
-    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6, executor_kind=executor_kind)
-
-
-def test_strided_slice_grad(executor_kind):
-    def check(sh, dtype, begin, end, strides, slice_mode):
-        x = relay.var("x", shape=sh, dtype=dtype)
-        f = relay.Function(
-            [x],
-            relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=slice_mode),
-        )
-        check_grad(f, executor_kind=executor_kind)
-
-    check((2, 3, 4), "float32", (0, 1, 0), (-1, -1, 1), (1, 1, 1), "size")
-    check((2, 3, 4), "float32", (0, 1, 0), (2, 3, 1), (1, 1, 1), "end")
-    # check that strides are properly ignored when using "size" mode
-    check((2, 3, 4), "float32", (0, 0, 0), (-1, -1, -1), (1, 1, 2), "size")
-    check((2, 3, 4), "float32", (0, 0, 0), (2, 3, 4), (1, 1, 2), "end")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
deleted file mode 100644
index ca8ffda9ba59..000000000000
--- a/tests/python/relay/test_op_level1.py
+++ /dev/null
@@ -1,926 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-import scipy
-from tvm import relay
-import pytest
-from tvm.relay.testing import run_infer_type
-import tvm.topi.testing
-from tvm.contrib.nvcc import have_fp16
-import tvm.testing
-from tvm.topi.utils import get_const_tuple
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-def sigmoid(x):
-    one = np.ones_like(x)
-    return one / (one + np.exp(-x))
-
-
-def relu(x):
-    x_copy = np.copy(x)
-    np.maximum(x_copy, 0, x_copy)
-    return x_copy
-
-
-def rsqrt(x):
-    one = np.ones_like(x)
-    return one / np.sqrt(x)
-
-
-class TestUnaryOp:
-    # Tuple of (operator, reference op, supports fp16)
-    op_list = {
-        "log": (tvm.relay.log, np.log, True),
-        "exp": (tvm.relay.exp, np.exp, True),
-        "erf": (tvm.relay.erf, scipy.special.erf, True),
-        "sqrt": (tvm.relay.sqrt, np.sqrt, True),
-        "rqsrt": (tvm.relay.rsqrt, rsqrt, True),
-        "sigmoid": (tvm.relay.sigmoid, sigmoid, True),
-        "tanh": (tvm.relay.tanh, np.tanh, False),
-        "relu": (relay.nn.relu, relu, True),
-        "cos": (tvm.relay.cos, np.cos, True),
-        "sin": (tvm.relay.sin, np.sin, True),
-        "tan": (tvm.relay.tan, np.tan, False),
-        "atan": (tvm.relay.atan, np.arctan, False),
-        "ceil": (tvm.relay.ceil, np.ceil, True),
-        "floor": (tvm.relay.floor, np.floor, True),
-        "trunc": (tvm.relay.trunc, np.trunc, True),
-        "round": (tvm.relay.round, np.round, False),
-    }
-
-    dtype = tvm.testing.parameter("float16", "float32")
-
-    relay_op, ref_func, supports_fp16 = tvm.testing.parameters(
-        *op_list.values(), ids=op_list.keys()
-    )
-
-    def test_unary_op(self, target, dev, relay_op, ref_func, supports_fp16, dtype):
-        target = tvm.target.Target(target)
-        if dtype == "float16":
-            if target.kind.name == "cuda":
-                if not have_fp16(tvm.cuda(0).compute_version):
-                    pytest.xfail(
-                        "No float16 support on local cuda device (compute_version != 5.3 and < 6.0)"
-                    )
-            elif target.kind.name == "vulkan" and not target.attrs.get("supports_float16", False):
-                pytest.xfail("No float16 support on vulkan target (supports_float16=False)")
-            elif not supports_fp16:
-                pytest.xfail(f"No float16 support on {target.kind.name} target")
-
-        if target.kind.name == "vulkan" and relay_op in [
-            tvm.relay.erf,
-            tvm.relay.tan,
-            tvm.relay.atan,
-        ]:
-            pytest.xfail(f"Vulkan runtime doesn't yet support {relay_op}")
-
-        shape = (10, 4)
-        dtype = dtype
-        tp = relay.TensorType(shape, dtype=dtype)
-        x = relay.var("x", type_annotation=tp)
-        y = relay_op(x)
-        # test printer
-        assert ("{}(%x)".format(y.op.name)) in y.astext()
-        # test type inference
-        yy = run_infer_type(y)
-        assert yy.checked_type == tp
-
-        if ref_func is not None:
-            data = np.random.rand(*shape).astype(dtype)
-            ref_res = ref_func(data).astype(dtype)
-            func = relay.Function([x], y)
-            # use graph by execuor default for testing, as we need
-            # create function explicitly to avoid constant-folding.
-            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-            tolerance = 1e-2 if dtype == "float16" else 1e-5
-            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=tolerance)
-
-
-@tvm.testing.uses_gpu
-def test_binary_op():
-    def inst(vars, sh):
-        return [vars.get(s, s) for s in sh]
-
-    def check_binary_op(opfunc, ref, dtype):
-        # TODO(@jroesch): this piece of code improperly uses type variables.
-        n = te.var("n")
-        s1 = (5, n, 5)
-        s2 = (n, 1)
-        t1 = relay.TensorType(s1)
-        t2 = relay.TensorType(s2)
-        x = relay.var("x", t1, dtype=dtype)
-        y = relay.var("y", t2, dtype=dtype)
-        z = opfunc(x, y)
-        # test printer
-        assert ("{}(%x, %y)".format(z.op.name)) in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == t1
-
-        if ref is not None:
-            t1 = relay.TensorType((5, 10, 5))
-            t2 = relay.TensorType((5, 10, 5))
-            x = relay.var("x", t1, dtype=dtype)
-            y = relay.var("y", t2, dtype=dtype)
-            z = opfunc(x, y)
-            x_data = np.random.rand(5, 10, 5).astype(dtype)
-            y_data = np.random.rand(5, 10, 5).astype(dtype)
-            ref_res = ref(x_data, y_data)
-            func = relay.Function([x, y], z)
-
-            for target, dev in tvm.testing.enabled_targets():
-                # use graph by execuor default for testing, as we need
-                # create function explicitly to avoid constant-folding.
-                if (
-                    dtype == "float16"
-                    and target == "cuda"
-                    and not have_fp16(tvm.cuda(0).compute_version)
-                ):
-                    continue
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data, y_data
-                )
-                np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01, atol=1e-3)
-
-    for opfunc, ref in [
-        (relay.add, np.add),
-        (relay.subtract, np.subtract),
-        (relay.multiply, np.multiply),
-        (relay.divide, np.divide),
-        (relay.floor_divide, np.floor_divide),
-        (relay.floor_mod, np.fmod),
-    ]:
-        for dtype in ["float16", "float32"]:
-            check_binary_op(opfunc, ref, dtype)
-
-
-@tvm.testing.uses_gpu
-def test_expand_dims():
-    # based on topi test
-    def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis):
-        x = relay.Var("x", relay.TensorType(dshape, dtype))
-        func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis))
-        for target, dev in tvm.testing.enabled_targets():
-            if (
-                dtype == "float16"
-                and target == "cuda"
-                and not have_fp16(tvm.cuda(0).compute_version)
-            ):
-                continue
-            data = np.random.uniform(size=dshape).astype(dtype)
-            ref_res = data.reshape(oshape)
-            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-    for dtype in ["float16", "float32"]:
-        verify_expand_dims((3, 10), dtype, (3, 10, 1, 1), 2, 2)
-        verify_expand_dims((3, 10), dtype, (1, 3, 10), -3, 1)
-
-
-@tvm.testing.uses_gpu
-def test_bias_add():
-    for dtype in ["float16", "float32"]:
-        xshape = (10, 2, 3, 4)
-        bshape = (2,)
-        rtol = 1e-2 if dtype == "float16" else 1e-5
-        x = relay.var("x", shape=xshape, dtype=dtype)
-        bias = relay.var("bias", dtype=dtype)
-        z = relay.nn.bias_add(x, bias)
-        zz = run_infer_type(z)
-        assert "axis=" not in zz.astext()
-        assert zz.args[1].checked_type == relay.TensorType(bshape, dtype)
-
-        func = relay.Function([x, bias], z)
-        x_data = np.random.uniform(size=xshape).astype(dtype)
-        y_data = np.random.uniform(size=bshape).astype(dtype)
-        ref_res = x_data + y_data.reshape((2, 1, 1))
-        for target, dev in tvm.testing.enabled_targets():
-            if (
-                dtype == "float16"
-                and target == "cuda"
-                and not have_fp16(tvm.cuda(0).compute_version)
-            ):
-                continue
-            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                x_data, y_data
-            )
-            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol)
-
-
-def test_bias_add_type_failure():
-    def assert_failure(expr):
-        try:
-            run_infer_type(expr)
-        except tvm._ffi.base.TVMError:
-            return
-        else:
-            assert False
-
-    for axis in (0, -1, -3, 1):
-        assert_failure(relay.nn.bias_add(relay.const(1), relay.const(2), axis=axis))
-
-
-def test_expand_dims_infer_type():
-    for dtype in ["float16", "float32"]:
-        n, t, d = te.size_var("n"), te.size_var("t"), 100
-        x = relay.var("x", shape=(n, t, d), dtype=dtype)
-        y = relay.expand_dims(x, axis=2)
-        assert "axis=2" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, t, 1, 100), dtype)
-
-
-@tvm.testing.uses_gpu
-def test_softmax():
-    for shape in [(10, 4), (10, 5, 4)]:
-        for dtype in ["float16", "float32"]:
-            # Softmax accuracy for float16 is poor
-            if dtype == "float16":
-                continue
-            x = relay.var("x", shape=shape, dtype=dtype)
-            y = relay.nn.softmax(x, axis=1)
-            assert "nn.softmax" in y.astext()
-            yy = run_infer_type(y)
-            assert yy.checked_type == relay.TensorType(shape, dtype)
-            func = relay.Function([x], y)
-            x_data = np.random.uniform(size=shape).astype(dtype)
-            ref_res = tvm.topi.testing.softmax_python(x_data, axis=1)
-            for target, dev in tvm.testing.enabled_targets():
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_log_softmax():
-    for shape in [(10, 4), (10, 5, 4)]:
-        for dtype in ["float16", "float32"]:
-            # Softmax accuracy for float16 is poor
-            if dtype == "float16":
-                continue
-            x = relay.var("x", shape=shape, dtype=dtype)
-            y = relay.nn.log_softmax(x, axis=1)
-            assert "nn.log_softmax" in y.astext()
-            yy = run_infer_type(y)
-            assert yy.checked_type == relay.TensorType(shape, dtype)
-            func = relay.Function([x], y)
-            x_data = np.random.uniform(size=shape).astype(dtype)
-            ref_res = tvm.topi.testing.log_softmax_python(x_data, axis=1)
-            for target, dev in tvm.testing.enabled_targets():
-                if target == "nvptx":
-                    continue
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_concatenate(executor_kind):
-    for dtype in ["float16", "float32"]:
-        n, t, d = te.size_var("n"), te.size_var("t"), 100
-        x = relay.var("x", shape=(n, t, d))
-        y = relay.var("y", shape=(n, t, d))
-        z = relay.concatenate((x, y), axis=-1)
-        assert "axis=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType((n, t, 200))
-
-        x = relay.exp(x)
-        z = relay.concatenate((x, y), axis=2)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType((n, t, 200))
-
-        z = relay.concatenate((x, y), axis=1)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType((n, t + t, 100))
-
-        # check shape mismatches (the following case is expected to raise tvm._ffi.base.TVMError.
-        try:
-            x = relay.var("p1", shape=(2, 5))
-            y = relay.var("p2", shape=(2, 3))
-            c = relay.concatenate([x, y], axis=0)
-            func = relay.Function([x, y], c)
-            zz = run_infer_type(func)
-        except tvm._ffi.base.TVMError:
-            pass
-        else:
-            assert False
-
-        x = relay.var("x", shape=(10, 5), dtype=dtype)
-        y = relay.var("y", shape=(10, 5), dtype=dtype)
-        t = relay.var("z", shape=(), dtype=dtype)
-        z = relay.concatenate((x, y), axis=1)
-        z = relay.add(z, t)
-        # Check result.
-        func = relay.Function([x, y, t], z)
-        x_data = np.random.rand(10, 5).astype(dtype)
-        y_data = np.random.rand(10, 5).astype(dtype)
-        t_data = np.random.uniform(size=()).astype(dtype)
-        ref_res = np.concatenate((x_data, y_data), axis=1) + t_data
-
-        for target, dev in tvm.testing.enabled_targets():
-            if (
-                dtype == "float16"
-                and target == "cuda"
-                and not have_fp16(tvm.cuda(0).compute_version)
-            ):
-                continue
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data, y_data, t_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-
-def test_dropout(executor_kind):
-    for dtype in ["float16", "float32"]:
-        n, t, d = te.size_var("n"), te.size_var("t"), te.size_var("d")
-        input_ty = relay.TensorType((n, t, d), dtype)
-        x = relay.var("x", input_ty)
-        y = relay.nn.dropout(x, rate=0.75)
-        assert "rate=" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == input_ty
-
-    in_np = np.random.random([4, 5, 6]).astype("float32")
-    x = relay.const(in_np)
-    y = relay.nn.dropout(x, rate=0.5)
-    func = relay.Function([], y)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)()
-        tvm.testing.assert_allclose(op_res.numpy(), in_np, rtol=0.01)
-
-
-def test_batch_norm():
-    for dtype in ["float16", "float32"]:
-        # beta and gamma ignored
-        data = relay.var("data", relay.TensorType((3, 2, 1), dtype))
-        beta = relay.var("beta", relay.TensorType((2,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((2,), dtype))
-        moving_mean = relay.var("moving_mean", relay.TensorType((2,), dtype))
-        moving_var = relay.var("moving_var", relay.TensorType((2,), dtype))
-        y = relay.nn.batch_norm(
-            data, gamma, beta, moving_mean, moving_var, center=False, scale=False
-        )
-        yy = run_infer_type(y.astuple())
-        assert "center=" in yy.astext()
-        assert yy.checked_type == relay.ty.TupleType(
-            tvm.runtime.convert(
-                [
-                    relay.TensorType((3, 2, 1), dtype),
-                    relay.TensorType((2,), dtype),
-                    relay.TensorType((2,), dtype),
-                ]
-            )
-        )
-
-        # axis=1
-        beta = relay.var("beta", relay.TensorType((3,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((3,), dtype))
-        moving_mean = relay.var("moving_mean", relay.TensorType((3,), dtype))
-        moving_var = relay.var("moving_var", relay.TensorType((3,), dtype))
-
-        y = relay.nn.batch_norm(
-            data, gamma, beta, moving_mean, moving_var, axis=0, center=False, scale=False
-        )
-        yy = run_infer_type(y.astuple())
-        assert yy.checked_type == relay.ty.TupleType(
-            tvm.runtime.convert(
-                [
-                    relay.ty.TensorType((3, 2, 1), dtype),
-                    relay.ty.TensorType((3,), dtype),
-                    relay.ty.TensorType((3,), dtype),
-                ]
-            )
-        )
-
-        # axis=-1
-        data = relay.var("data", relay.TensorType((1, 2, 3), dtype))
-        beta = relay.var("beta", relay.TensorType((3,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((3,), dtype))
-        moving_mean = relay.var("moving_mean", relay.TensorType((3,), dtype))
-        moving_var = relay.var("moving_var", relay.TensorType((3,), dtype))
-        y = relay.nn.batch_norm(
-            data, gamma, beta, moving_mean, moving_var, axis=-1, center=False, scale=False
-        )
-        yy = run_infer_type(y.astuple())
-        assert yy.checked_type == relay.ty.TupleType(
-            tvm.runtime.convert(
-                [
-                    relay.ty.TensorType((1, 2, 3), dtype),
-                    relay.ty.TensorType((3,), dtype),
-                    relay.ty.TensorType((3,), dtype),
-                ]
-            )
-        )
-
-
-def do_concat_test(shapes, t_shape, dtype, axis, dev, target):
-    varsToConcat = []
-    inputData = []
-    pos = 0
-    for s in shapes:
-        varsToConcat.append(relay.var("x{}".format(pos), shape=s))
-        inputData.append(np.random.rand(*s).astype(dtype))
-        pos += 1
-    t = relay.var("z", shape=t_shape, dtype=dtype)
-    z = relay.concatenate(varsToConcat, axis=axis)
-    z = relay.add(z, t)
-    params = varsToConcat
-    params.append(t)
-    func = relay.Function(params, z)
-    t_data = np.random.uniform(low=-10, high=10, size=t_shape).astype(dtype)
-    ref_res = np.concatenate((tuple(inputData)), axis=axis) + t_data
-    mod = tvm.IRModule.from_expr(func)
-
-    executor = relay.create_executor("graph", mod=mod, device=dev, target=target)
-    op_res1 = executor.evaluate()(*inputData, t_data)
-
-    tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.000001)
-    op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
-        *inputData, t_data
-    )
-    tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.000001)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_concatenate1(target, dev):
-    np.random.seed(471)
-    maxNumDimensions = 6
-    shape = [4, 32, 16, 1, 31, 20, 21, 8, 28, 7]  # just randomly selected 10 numbers
-    for dtype in ["float32"]:
-        for dimsNum in range(1, maxNumDimensions):
-            np.random.shuffle(shape)
-            for axis in range(0, dimsNum):  # range should be (-dimsNum + 1, dimsNum)
-                numToConcat = np.random.uniform(low=2, high=10, size=(1)).astype("int64")[0]
-                shapes = []
-                # the code below to normalize axes index. For some reasons tvm notifies about error if the axis is negative
-                normalizedAxis = axis
-                if axis < 0:
-                    normalizedAxis += dimsNum
-                finalSize = 0
-                for i in range(0, numToConcat):
-                    shp = tuple(shape[:dimsNum])
-                    finalSize += shape[(i % len(shape))]
-                    shapes.append(
-                        shp[:normalizedAxis]
-                        + tuple([shape[(i % len(shape))]])
-                        + shp[normalizedAxis + 1 :]
-                    )
-                t_shape = shp[:normalizedAxis] + tuple([finalSize]) + shp[normalizedAxis + 1 :]
-                do_concat_test(shapes, t_shape, dtype, axis, dev, target)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_concatenate2(target, dev):
-    # test to cover cases (1, .. , x, 1, .. , 1)
-    np.random.seed(13)
-    maxNumDimensions = 6
-    shape = [8, 3, 25, 33, 12, 29, 5, 11, 29, 11]  # just randomly selected 10 numbers
-    ind = 0
-    for dtype in ["float32"]:
-        for dimsNum in range(2, maxNumDimensions):
-            np.random.shuffle(shape)
-            for axis in range(-dimsNum + 1, dimsNum):  # range should be (-dimsNum + 1, dimsNum)
-                numToConcat = np.random.uniform(low=2, high=10, size=(1)).astype("int64")[0]
-                shapes = []
-                # the code below to normalize axes index. For some reasons tvm notifies about error if the axis is negative
-                normalizedAxis = axis
-                if axis < 0:
-                    normalizedAxis += dimsNum
-                finalSize = 0
-                for i in range(0, numToConcat):
-                    axisVal = [1] * dimsNum
-                    axisVal[axis] = shape[(ind % len(shape))]
-                    ind += 1
-                    finalSize += axisVal[axis]
-                    shapes.append(tuple(axisVal))
-                temp = [1] * dimsNum
-                temp[axis] = finalSize
-                t_shape = tuple(temp)
-                do_concat_test(shapes, t_shape, dtype, axis, dev, target)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_concatenate3(target, dev):
-    np.random.seed(477)
-    for dtype in ["float32"]:
-        axis = -2
-        ending = 1
-        shapes = [[3, 2, 1, ending], [3, 2, 1, ending]]
-        t_shape = [3, 2, 2, ending]
-        do_concat_test(shapes, t_shape, dtype, axis, dev, target)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_concatenate4(target, dev):
-    np.random.seed(7)
-    x_shape = (2, 1)
-    x = relay.var("x", shape=x_shape, dtype="int64")
-    concat = relay.concatenate([x], axis=1)
-    f = relay.Function([x], concat)
-    x_val = np.array([[33], [13]], dtype="int64")
-    graph = relay.create_executor("graph", device=tvm.cpu(), target="llvm")
-    op_res = graph.evaluate(f)(x_val)
-    ref_res = np.concatenate([x_val], axis=1)
-    tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.000001)
-
-
-def test_batch_norm_fold_const():
-    axis = 1
-    dtype = "float32"
-    shape = [4, 5, 6]
-
-    data_np = np.random.random(shape).astype(dtype)
-    beta_np = np.random.random(shape[axis]).astype(dtype)
-    gamma_np = np.random.random(shape[axis]).astype(dtype)
-    moving_mean_np = np.random.random(shape[axis]).astype(dtype)
-    moving_var_np = np.random.random(shape[axis]).astype(dtype)
-
-    data = relay.var("data", relay.TensorType(shape, dtype))
-    beta = relay.var("beta", relay.TensorType((shape[1],), dtype))
-    gamma = relay.var("gamma", relay.TensorType((shape[1],), dtype))
-    moving_mean = relay.var("moving_mean", relay.TensorType((shape[1],), dtype))
-    moving_var = relay.var("moving_var", relay.TensorType((shape[1],), dtype))
-    out = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var, axis=axis).astuple()
-    func = relay.Function([data, gamma, beta, moving_mean, moving_var], out)
-
-    out_const = relay.nn.batch_norm(
-        relay.const(data_np),
-        relay.const(gamma_np),
-        relay.const(beta_np),
-        relay.const(moving_mean_np),
-        relay.const(moving_var_np),
-        axis=axis,
-    ).astuple()
-    func_const = relay.Function([], out_const)
-
-    # Build the module with constants to have FoldConstant transform batch_norm.
-    mod_const = tvm.IRModule.from_expr(func_const)
-    mod_const = relay.transform.FoldConstant()(mod_const)
-
-    const_data_out = mod_const["main"].body[0].data
-    const_moving_mean_out = mod_const["main"].body[1].data
-    const_moving_var_out = mod_const["main"].body[2].data
-
-    # Run the Relay func without constants. This will use SimplyInference instead.
-    vm_data_out, vm_moving_mean_out, vm_moving_var_out = relay.create_executor(
-        "vm", device=tvm.device("llvm"), target="llvm"
-    ).evaluate(func)(data_np, gamma_np, beta_np, moving_mean_np, moving_var_np)
-
-    tvm.testing.assert_allclose(const_data_out.numpy(), vm_data_out.numpy())
-    tvm.testing.assert_allclose(const_moving_mean_out.numpy(), vm_moving_mean_out.numpy())
-    tvm.testing.assert_allclose(const_moving_var_out.numpy(), vm_moving_var_out.numpy())
-
-
-@pytest.mark.xfail
-def test_matmul_type_check():
-    dtype = "float16"
-    n, c, h, w = 2, 2, 2, 2
-    x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-    # it should fail since it does not match with m(2)
-    mismatch_w = 3
-    w = relay.var("w", relay.TensorType((mismatch_w, 2), dtype))
-    y = relay.nn.matmul(x, w)
-    yy = run_infer_type(y)
-
-    i0 = relay.var("i0", shape=(1, 1), dtype="float32")
-    i1 = relay.var("i1", shape=(1,), dtype="float32")
-    with pytest.raises(tvm.TVMError):
-        run_infer_type(relay.nn.matmul(i0, i1))
-
-
-@tvm.testing.uses_gpu
-def test_matmul(executor_kind):
-    for dtype in ["float16", "float32"]:
-        # Matmul accuracy for float16 is poor
-        if dtype == "float16":
-            continue
-        n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-        x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-        w = relay.var("w", relay.TensorType((2, w), dtype))
-        y = relay.nn.matmul(x, w, units=2, transpose_b=True)
-        assert "units=2" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype)
-
-        n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2
-        x = relay.var("x", relay.TensorType((n, c, w, h), dtype))
-        wh, ww = te.size_var("wh"), te.size_var("ww")
-        w = relay.var("w", relay.TensorType((wh, ww), dtype))
-        y = relay.nn.matmul(x, w, transpose_a=True)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype)
-
-        n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2
-        x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-        w = relay.var("w", relay.IncompleteType())
-        y = relay.nn.matmul(x, w, units=2)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype)
-
-        x = relay.var("x", shape=(5, 10), dtype=dtype)
-        w = relay.var("w", shape=(5, 2), dtype=dtype)
-        z = relay.nn.matmul(x, w, transpose_a=True)
-
-        # Check result.
-        func = relay.Function([x, w], z)
-        x_data = np.random.rand(5, 10).astype(dtype)
-        w_data = np.random.rand(5, 2).astype(dtype)
-        ref_res = np.dot(x_data.transpose(), w_data)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data, w_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@pytest.mark.xfail
-def test_dense_type_check():
-    dtype = "float16"
-    n, c, h, w = 2, 2, 2, 2
-    x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-    # it should fail since it does not match with m(2)
-    mismatch_w = 3
-    w = relay.var("w", relay.TensorType((2, mismatch_w), dtype))
-    y = relay.nn.dense(x, w)
-    yy = run_infer_type(y)
-
-
-@tvm.testing.uses_gpu
-def test_dense(executor_kind):
-    for dtype in ["float16", "float32"]:
-        # Dense accuracy for float16 is poor
-        if dtype == "float16":
-            continue
-        n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-        x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-        w = relay.var("w", relay.TensorType((2, w), dtype))
-        y = relay.nn.dense(x, w, units=2)
-        assert "units=2" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype)
-
-        n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2
-        x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-        wh, ww = te.size_var("wh"), te.size_var("ww")
-        w = relay.var("w", relay.TensorType((ww, wh), dtype))
-        y = relay.nn.dense(x, w)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype)
-
-        # test dynamic shape in inner
-        m, k = 4, 2
-        x = relay.var("x", relay.TensorType((m, k), dtype))
-        k, nw = relay.Any(), 6
-        w = relay.var("w", relay.TensorType((k, n), dtype))
-        y = relay.nn.dense(x, w)
-        yy = run_infer_type(y)
-        # Confirm that input shape has not been rewritten to become dynamic.
-        assert get_const_tuple(yy.type_args[0].shape) == (4, 2)
-
-        n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2
-        x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-        w = relay.var("w", relay.IncompleteType())
-        y = relay.nn.dense(x, w, units=2)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype)
-
-        x = relay.var("x", shape=(10, 5), dtype=dtype)
-        w = relay.var("w", shape=(2, 5), dtype=dtype)
-        z = relay.nn.dense(x, w)
-
-        # Check result.
-        func = relay.Function([x, w], z)
-        x_data = np.random.rand(10, 5).astype(dtype)
-        w_data = np.random.rand(2, 5).astype(dtype)
-        ref_res = np.dot(x_data, w_data.T)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data, w_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_dense_same_args_compile():
-    for dtype in ["float32", "int8"]:
-        x = relay.var("x", shape=(32, 64), dtype=dtype)
-        out_dtype = "int32" if dtype == "int8" else "float32"
-        f = relay.Function([x], relay.nn.dense(x, x, out_dtype=out_dtype))
-        m = tvm.IRModule.from_expr(f)
-
-        for target, _ in tvm.testing.enabled_targets():
-            tvm.relay.build(m, target=target)
-
-
-def test_dense_dtype():
-    data_dtype = "uint8"
-    weight_dtype = "int8"
-    out_dtype = "uint8"
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), data_dtype))
-    w = relay.var("w", relay.TensorType((2, w), weight_dtype))
-    y = relay.nn.dense(x, w, units=2, out_dtype=out_dtype)
-    assert "units=2" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, 2), out_dtype)
-    assert run_infer_type(yy.args[0]).checked_type.dtype == "uint8"
-    assert run_infer_type(yy.args[1]).checked_type.dtype == "int8"
-
-
-def test_bitserial_dense():
-    m, k = te.size_var("m"), te.size_var("k")
-    x = relay.var("x", relay.TensorType((m, k), "int16"))
-    w = relay.var("w", relay.TensorType((k, 32), "int16"))
-    y = relay.nn.bitserial_dense(x, w, units=32)
-    "units=8" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((m, 32), "int16")
-
-
-def dense_x86_test(m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd"]):
-    data_shape = (m, k)
-    weight_shape = (n, k)
-
-    for data_dtype in ["uint8", "int8"]:
-        data = relay.var("data", shape=data_shape, dtype=data_dtype)
-        weight = relay.var("weight", shape=weight_shape, dtype="int8")
-        bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")
-        dense = relay.nn.dense(data, weight, out_dtype="int32")
-        out = relay.nn.bias_add(dense, bias)
-        mod = tvm.IRModule.from_expr(out)
-
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target)
-
-        # TODO(vvchernov): needs for avx512 arch, can be extended
-        if n % 16 == 0 and k % 4 == 0:
-            asm = lib.lib.get_source("asm")
-            for intrin in intrins:
-                assert intrin in asm
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype)
-        b = np.random.uniform(1, 10, size=weight_shape).astype("int8")
-        c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")
-
-        runtime.set_input("data", a)
-        runtime.set_input("weight", b)
-        runtime.set_input("bias", c)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-        ref = np.dot(a.astype("int32"), b.transpose().astype("int32")) + c
-
-        np.testing.assert_equal(out, ref)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.skip("skip due to AMX feature not avaliable yet")
-def test_dense_amx_int8():
-    data_shape = (32, 128)
-    weight_shape = (32, 128)
-
-    amx_init = tvm.get_global_func("runtime.amx_init")
-    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
-    assert amx_init()
-    assert amx_tileconfig(16, 64)  # config tile size to 16 rows by 64 columns.
-
-    for data_dtype in ["uint8", "int8"]:
-        data = relay.var("data", shape=data_shape, dtype=data_dtype)
-        weight = relay.var("weight", shape=weight_shape, dtype="int8")
-        bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")
-        dense = relay.nn.dense(data, weight, out_dtype="int32")
-        out = relay.nn.bias_add(dense, bias)
-        mod = tvm.IRModule.from_expr(out)
-
-        target = "llvm -mcpu=sapphirerapids"
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target)
-
-        asm = lib.lib.get_source("asm")
-        assert "tilezero" in asm
-        assert "tileloaddt1" in asm
-        assert "tdpbusd" in asm
-        assert "tilestored" in asm
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype)
-        b = np.random.uniform(1, 10, size=weight_shape).astype("int8")
-        c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")
-
-        runtime.set_input("data", a)
-        runtime.set_input("weight", b)
-        runtime.set_input("bias", c)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-        ref = np.dot(a.astype("int32"), b.transpose().astype("int32")) + c
-
-        np.testing.assert_equal(out, ref)
-
-
-@tvm.testing.requires_x86_vnni
-@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)])
-def test_dense_vnni(m, n, k):
-    dense_x86_test(m, n, k)
-
-
-@tvm.testing.requires_x86_avx512
-@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)])
-def test_dense_skylake_avx512(m, n, k):
-    dense_x86_test(m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"])
-
-
-@pytest.mark.skip("Requires GFX10 AMDGPU")
-def test_dense_rocm_sdot4():
-    data_shape = (32, 96)
-    weight_shape = (128, 96)
-
-    data_dtype = "int8"
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=weight_shape, dtype="int8")
-    bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")
-    dense = relay.nn.dense(data, weight, out_dtype="int32")
-    out = relay.nn.bias_add(dense, bias)
-    mod = tvm.IRModule.from_expr(out)
-
-    target = "rocm -mattr=+dotprod"
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target)
-
-    asm = lib.lib.imported_modules[0].get_source("asm")
-    assert "v_dot4_i32_i8" in asm
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype)
-    b = np.random.uniform(1, 10, size=weight_shape).astype("int8")
-    c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")
-
-    runtime.set_input("data", a)
-    runtime.set_input("weight", b)
-    runtime.set_input("bias", c)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-    ref = np.dot(a.astype("int32"), b.transpose().astype("int32")) + c
-
-    np.testing.assert_equal(out, ref)
-
-
-def test_extern_concat_injective_fuse():
-    # This is a subgraph from MobileBERT, which crashes compilation if buffers created in te.extern(...)
-    # do not have their elem_offset explicitly set as a variable.
-
-    # fmt: off
-    mod = tvm.relay.fromtext(
-        """
-       #[version = "0.0.5"]
-       def @main(%p0844: Tensor[(1, 384), int64], %p1652: Tensor[(2016, 128), float16]) {
-        %1331 = cast(%p0844, dtype="int32");
-        %1332 = take(%p1652, %1331, axis=0);
-        %1333 = strided_slice(%1332, begin=[0, 1, 0], end=[1, 384, 128], strides=[1, 1, 1], axes=None);
-        %1334 = strided_slice(%1332, begin=[0, 0, 0], end=[1, -1, 128], strides=[1, 1, 1], axes=None);
-        %1335 = nn.pad(%1333, 0, pad_width=[[0, 0], [0, 1], [0, 0]]);
-        %1336 = nn.pad(%1334, 0, pad_width=[[0, 0], [1, 0], [0, 0]]);
-        %1337 = (%1335, %1332, %1336);
-        %1338 = concatenate(%1337, axis=2);
-        reshape(%1338, newshape=[-1, 384])
-      }
-    """
-    )
-    # fmt: on
-
-    relay.build(mod, params={}, target="llvm")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
deleted file mode 100644
index 6036f707126b..000000000000
--- a/tests/python/relay/test_op_level10.py
+++ /dev/null
@@ -1,862 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level10 operator test cases.
-"""
-import sys
-import pytest
-
-import numpy as np
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import relay, te, topi
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_checkpoint(executor_kind):
-    dtype = "float32"
-    xs = [relay.var("x{}".format(i), dtype) for i in range(4)]
-    f = relay.multiply(relay.add(xs[0], xs[1]), relay.add(xs[2], xs[3]))
-    f_checkpoint = relay.annotation.checkpoint(f)
-
-    func, func_checkpoint = relay.Function(xs, f), relay.Function(xs, f_checkpoint)
-    f, f_checkpoint = run_infer_type(func), run_infer_type(func_checkpoint)
-    assert f.checked_type == f_checkpoint.checked_type
-
-    inputs = [np.random.uniform() for _ in range(len(xs))]
-    for target, dev in tvm.testing.enabled_targets():
-        f_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(*inputs)
-        f_checkpoint_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
-            f_checkpoint
-        )(*inputs)
-        tvm.testing.assert_allclose(f_res.numpy(), f_checkpoint_res.numpy(), 0, 0)
-
-
-def test_checkpoint_alpha_equal():
-    xs = [relay.var("x{}".format(i), relay.TensorType((1,), "float32")) for i in range(4)]
-    f = relay.Function(
-        xs,
-        relay.annotation.checkpoint(
-            relay.multiply(relay.add(xs[0], xs[1]), relay.add(xs[2], xs[3]))
-        ),
-    )
-    df = transform.gradient(run_infer_type(f))
-
-    # run PE and DCE
-    with tvm.transform.PassContext(opt_level=3):
-        # The expected output assumes DCE can elide 'dead writes' to references. At the time this unit test was
-        # written DCE would elide all writes, which though unsound in general happens to work for this case. Preserve
-        # that legacy behaviour here using 'ignore_impurity=True'.
-        # TODO(mbs): Revisit once DCE supports dead reference writes.
-        passes = [
-            transform.PartialEvaluate(),
-            transform.DeadCodeElimination(inline_once=True, ignore_impurity=True),
-        ]
-        mod = tvm.transform.Sequential(passes)(tvm.IRModule.from_expr(df))
-        df = mod["main"]
-
-    df_parsed = tvm.relay.parse_expr(
-        """
-        #[version = "0.0.5"]
-        fn (%x: Tensor[(1), float32], %y: Tensor[(1), float32],
-            %z: Tensor[(1), float32], %w: Tensor[(1), float32])
-            ->  (Tensor[(1), float32],
-                (Tensor[(1), float32], Tensor[(1), float32],
-                 Tensor[(1), float32], Tensor[(1), float32])) {
-            %0 = add(%x, %y);
-            %1 = add(%z, %w);
-            let %x1: Tensor[(1), float32] = multiply(%0, %1);
-            let %x2: Tensor[(1), float32] = ones_like(%x1);
-            let %x3: Tensor[(1), float32] = add(%x, %y);
-            let %x4: Tensor[(1), float32] = add(%z, %w);
-            %2 = zeros_like(%x3);
-            %3 = multiply(%x2, %x4);
-            %4 = collapse_sum_like(%3, %x3);
-            let %x5: Tensor[(1), float32] = add(%2, %4);
-            %5 = zeros_like(%x4);
-            %6 = multiply(%x2, %x3);
-            %7 = collapse_sum_like(%6, %x4);
-            let %x6: Tensor[(1), float32] = add(%5, %7);
-            %8 = zeros_like(%x);
-            %9 = collapse_sum_like(%x5, %x);
-            %10 = add(%8, %9);
-            %11 = zeros_like(%y);
-            %12 = collapse_sum_like(%x5, %y);
-            %13 = add(%11, %12);
-            %14 = zeros_like(%z);
-            %15 = collapse_sum_like(%x6, %z);
-            %16 = add(%14, %15);
-            %17 = zeros_like(%w);
-            %18 = collapse_sum_like(%x6, %w);
-            %19 = add(%17, %18);
-            %20 = (%10, %13, %16, %19);
-            (%x1, %20)
-        }
-        """
-    )
-
-    tvm.ir.assert_structural_equal(df, df_parsed)
-
-
-def test_checkpoint_alpha_equal_tuple():
-    xs = [relay.var("x{}".format(i), relay.TensorType((1,), "float32")) for i in range(4)]
-    f = relay.Function(
-        xs,
-        relay.annotation.checkpoint(
-            relay.Tuple([relay.add(xs[0], xs[1]), relay.add(xs[2], xs[3])])
-        ),
-    )
-    df = transform.gradient(run_infer_type(f))
-
-    # run PE and DCE
-    with tvm.transform.PassContext(opt_level=3):
-        # See comment in test_checkpoint_alpha_equal above.
-        # TODO(mbs): Revisit once DCE supports dead reference writes.
-        passes = [
-            transform.PartialEvaluate(),
-            transform.DeadCodeElimination(inline_once=True, ignore_impurity=True),
-        ]
-        mod = tvm.transform.Sequential(passes)(tvm.IRModule.from_expr(df))
-        df = mod["main"]
-
-    df_parsed = tvm.relay.parse_expr(
-        """
-        #[version = "0.0.5"]
-        fn (%x: Tensor[(1), float32], %y: Tensor[(1), float32],
-            %z: Tensor[(1), float32], %w: Tensor[(1), float32])
-            -> ((Tensor[(1), float32], Tensor[(1), float32]),
-                (Tensor[(1), float32], Tensor[(1), float32],
-                 Tensor[(1), float32], Tensor[(1), float32])) {
-        let %x1: Tensor[(1), float32] = add(%x, %y) /* ty=Tensor[(1), float32] */;
-        let %x2: Tensor[(1), float32] = add(%z, %w) /* ty=Tensor[(1), float32] */;
-        let %x3: Tensor[(1), float32] = zeros_like(%x2) /* ty=Tensor[(1), float32] */;
-        let %x4: Tensor[(1), float32] = ones_like(%x1) /* ty=Tensor[(1), float32] */;
-        %0 = (%x1, %x2);
-        %1 = zeros_like(%x) /* ty=Tensor[(1), float32] */;
-        %2 = collapse_sum_like(%x4, %x) /* ty=Tensor[(1), float32] */;
-        %3 = add(%1, %2) /* ty=Tensor[(1), float32] */;
-        %4 = zeros_like(%y) /* ty=Tensor[(1), float32] */;
-        %5 = collapse_sum_like(%x4, %y) /* ty=Tensor[(1), float32] */;
-        %6 = add(%4, %5) /* ty=Tensor[(1), float32] */;
-        %7 = zeros_like(%z) /* ty=Tensor[(1), float32] */;
-        %8 = collapse_sum_like(%x3, %z) /* ty=Tensor[(1), float32] */;
-        %9 = add(%7, %8) /* ty=Tensor[(1), float32] */;
-        %10 = zeros_like(%w) /* ty=Tensor[(1), float32] */;
-        %11 = collapse_sum_like(%x3, %w) /* ty=Tensor[(1), float32] */;
-        %12 = add(%10, %11) /* ty=Tensor[(1), float32] */;
-        %13 = (%3, %6, %9, %12);
-        (%0, %13)
-        }
-        """
-    )
-
-    tvm.ir.assert_structural_equal(df, df_parsed)
-
-
-@tvm.testing.uses_gpu
-def test_collapse_sum_like(executor_kind):
-    shape = (3, 4, 5, 6)
-    shape_like = (4, 5, 6)
-    dtype = "float32"
-    x = relay.Var("x", relay.ty.TensorType(shape, dtype))
-    y = relay.Var("y", relay.ty.TensorType(shape_like, dtype))
-    z = relay.collapse_sum_like(x, y)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
-
-    func = relay.Function([x, y], z)
-    x = np.random.uniform(size=shape).astype(dtype)
-    y = np.random.uniform(size=shape_like).astype(dtype)
-    ref_res = np.sum(x, 0)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x, y
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_collapse_sum_to(executor_kind):
-    shape = (3, 4, 5, 6)
-    shape_to = (4, 5, 6)
-    dtype = "float32"
-    x = relay.Var("x", relay.ty.TensorType(shape, dtype))
-    z = relay.collapse_sum_to(x, shape_to)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(shape_to, dtype)
-
-    func = relay.Function([x], z)
-    x = np.random.uniform(size=shape).astype(dtype)
-    ref_res = np.sum(x, 0)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x)
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_broadcast_to(executor_kind):
-    shape = (4, 1, 6)
-    shape_like = (3, 4, 5, 6)
-    dtype = "float32"
-    x = relay.Var("x", relay.ty.TensorType(shape, dtype))
-    z = relay.broadcast_to(x, shape=shape_like)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
-
-    func = relay.Function([x], z)
-    x = np.random.uniform(size=shape).astype(dtype)
-    ref_res = np.broadcast_to(x, shape_like)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x)
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_broadcast_to_const_shape_int64(executor_kind):
-    shape_like = relay.const(np.array([1, 5]), dtype="int64")
-    x = relay.var("x", shape=(1,), dtype="int64")
-    z = relay.broadcast_to(x, shape=shape_like)
-    z = relay.sum(z, axis=0)
-
-    f = relay.Function([x], z)
-
-    x = np.random.randint(10, size=(1,), dtype="int64")
-    ref_res = np.broadcast_to(x, (5,))
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(x)
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-
-def test_broadcast_concat_shape_int64(executor_kind):
-    x_shape = (1, 2, 1, 1)
-    broadcast_shape = [1, 2, 2, 1]
-    x = relay.var("data", relay.TensorType(x_shape, "float32"))
-    broadcast_to = relay.op.broadcast_to(x, relay.const(broadcast_shape, dtype="int64"))
-    concate = relay.op.concatenate((broadcast_to,), axis=0)
-
-    f = relay.Function([x], concate)
-
-    x = np.zeros(x_shape).astype("float32")
-    ref_res = np.concatenate((np.broadcast_to(x, broadcast_shape),), axis=0)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(x)
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-
-def test_broadcast_pool2d_shape_int64(executor_kind):
-    x_shape = (1, 3, 32, 32)
-    out_shape = (2, 3, 32, 32)
-    x = relay.var("data", shape=x_shape, dtype="float32")
-    broadcast_to = relay.broadcast_to(x, shape=relay.const([2, 3, 32, 32], dtype="int64"))
-    pool2d = relay.nn.max_pool2d(broadcast_to, pool_size=(3, 3), padding=(1, 1, 1, 1))
-    sub = relay.subtract(broadcast_to, pool2d)
-
-    f = relay.Function([x], sub)
-    x = np.ones(x_shape).astype("float32")
-    ref_res = np.zeros(out_shape).astype("float32")
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(x)
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_broadcast_to_like(executor_kind):
-    shape = (4, 1, 6)
-    shape_like = (3, 4, 5, 6)
-    dtype = "float32"
-    x = relay.Var("x", relay.ty.TensorType(shape, dtype))
-    y = relay.Var("y", relay.ty.TensorType(shape_like, dtype))
-    z = relay.broadcast_to_like(x, y)
-
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
-
-    func = relay.Function([x, y], z)
-    x = np.random.uniform(size=shape).astype(dtype)
-    y = np.random.uniform(size=shape_like).astype(dtype)
-    ref_res = np.broadcast_to(x, shape_like)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x, y
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-def np_slice_like(np_data, np_shape_like, axis=None):
-    begin_idx = [0 for _ in np_data.shape]
-    end_idx = list(np_data.shape)
-    if axis:
-        for i in axis:
-            if i < 0:
-                i = len(np_data.shape) + i
-            end_idx[i] = np_shape_like.shape[i]
-    else:
-        for i in range(len(np_data.shape)):
-            if i < len(np_shape_like.shape):
-                end_idx[i] = np_shape_like.shape[i]
-    slice_idx = []
-    for b, e in zip(begin_idx, end_idx):
-        slice_idx.append(slice(b, e))
-    np_result = np_data[tuple(slice_idx)]
-    return np_result
-
-
-def verify_slice_like(executor_kind, data, slice_like, axes, output, dtype="float32"):
-    x = relay.var("data", relay.TensorType(data, dtype))
-    y = relay.var("slice_like", relay.TensorType(slice_like, dtype))
-    z = relay.slice_like(x, y, axes)
-    zz = run_infer_type(z)
-    if axes:
-        assert "axes" in z.astext()
-    assert zz.checked_type == relay.ty.TensorType(output, dtype)
-
-    if all(isinstance(v, int) == 0 for v in data) or all(
-        isinstance(v, int) == 0 for v in slice_like
-    ):
-        return
-
-    func = relay.Function([x, y], z)
-    x_data = np.random.uniform(size=data).astype(dtype)
-    y_data = np.random.uniform(size=slice_like).astype(dtype)
-    ref_res = np_slice_like(x_data, y_data, axes)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, y_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_slice_like(executor_kind):
-    d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4")
-    verify_slice_like(
-        executor_kind, data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)
-    )
-    verify_slice_like(
-        executor_kind, data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3)
-    )
-    verify_slice_like(
-        executor_kind, data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1, 2), output=(d2, d2, d3)
-    )
-    verify_slice_like(
-        executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)
-    )
-    verify_slice_like(executor_kind, data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5))
-    verify_slice_like(
-        executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3)
-    )
-    verify_slice_like(
-        executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3)
-    )
-    verify_slice_like(
-        executor_kind,
-        data=(1, 3, 224, 224),
-        slice_like=(1, 3, 112, 112),
-        axes=(2, 3),
-        output=(1, 3, 112, 112),
-    )
-
-
-@tvm.testing.uses_gpu
-def test_reverse_reshape(executor_kind):
-    def verify_reverse_reshape(executor_kind, shape, newshape, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        z = relay.reverse_reshape(x, newshape=newshape)
-        zz = run_infer_type(z)
-        assert "newshape=" in z.astext()
-        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
-
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        ref_res = np.reshape(x_data, oshape)
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_reverse_reshape(executor_kind, (2, 3, 4), (4, 0, 2), (4, 3, 2))
-    verify_reverse_reshape(executor_kind, (2, 3, 4), (2, 0, 0), (2, 3, 4))
-    verify_reverse_reshape(executor_kind, (2, 3, 4), (0, -1), (3, 8))
-    verify_reverse_reshape(executor_kind, (2, 3, 4), (-1, 0), (6, 4))
-    verify_reverse_reshape(executor_kind, (2, 3, 4), (0, -3), (2, 12))
-
-
-def verify_batch_matmul_with_inputs(
-    executor_kind, x, y, x_np, y_np, out_shape, dtype="float32", trans_x=False, trans_y=True
-):
-    z = relay.nn.batch_matmul(x, y, transpose_a=trans_x, transpose_b=trans_y)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(out_shape, dtype)
-
-    input_vars = relay.analysis.free_vars(z)
-    func = relay.Function(input_vars, z)
-    z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=trans_x, trans_y=trans_y)
-
-    for target, dev in tvm.testing.enabled_targets():
-        if len(input_vars) == 2:
-            z = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_np, y_np
-            )
-        else:
-            z = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_np)
-        tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5, atol=1e-5)
-
-
-def verify_batch_matmul(
-    executor_kind, x_shape, y_shape, out_shape, dtype="float32", trans_x=False, trans_y=True
-):
-    x = relay.var("x", relay.TensorType(x_shape, dtype))
-    y = relay.var("y", relay.TensorType(y_shape, dtype))
-    x_np = np.random.uniform(size=x_shape).astype(dtype)
-    y_np = np.random.uniform(size=y_shape).astype(dtype)
-    verify_batch_matmul_with_inputs(
-        executor_kind, x, y, x_np, y_np, out_shape, dtype, trans_x, trans_y
-    )
-
-
-@tvm.testing.uses_gpu
-def test_batch_matmul(executor_kind):
-    b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k")
-    x = relay.var("x", relay.TensorType((b, m, k), "float32"))
-    y = relay.var("y", relay.TensorType((b, n, k), "float32"))
-    z = relay.nn.batch_matmul(x, y)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((b, m, n), "float32")
-
-    verify_batch_matmul(
-        executor_kind, (1, 16, 32), (1, 16, 32), (1, 16, 16), trans_x=False, trans_y=True
-    )
-    verify_batch_matmul(
-        executor_kind, (5, 16, 32), (5, 16, 32), (5, 16, 16), trans_x=False, trans_y=True
-    )
-    verify_batch_matmul(
-        executor_kind, (5, 16, 32), (5, 20, 32), (5, 16, 20), trans_x=False, trans_y=True
-    )
-    verify_batch_matmul(
-        executor_kind, (30, 16, 32), (30, 20, 32), (30, 16, 20), trans_x=False, trans_y=True
-    )
-    verify_batch_matmul(
-        executor_kind, (1, 32, 16), (1, 16, 32), (1, 16, 16), trans_x=True, trans_y=True
-    )
-    verify_batch_matmul(
-        executor_kind, (5, 16, 32), (5, 32, 16), (5, 16, 16), trans_x=False, trans_y=False
-    )
-    verify_batch_matmul(
-        executor_kind, (5, 32, 16), (5, 32, 20), (5, 16, 20), trans_x=True, trans_y=False
-    )
-
-    x_np = np.random.randn(10, 27, 64).astype("float32")
-    x = relay.var("x", shape=x_np.shape)
-    verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27))
-
-
-def batch_matmul_x86_test(b, m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd"]):
-    x_shape = (b, m, k)
-    y_shape = (b, n, k)
-    z_shape = (b, m, n)
-
-    for lhs_dtype in ["uint8", "int8"]:
-        x = relay.var("x", shape=x_shape, dtype=lhs_dtype)
-        y = relay.var("y", shape=y_shape, dtype="int8")
-        z = relay.var("z", shape=z_shape, dtype="int32")
-        bmm = relay.nn.batch_matmul(x, y, out_dtype="int32")
-        out = bmm + z
-        mod = tvm.IRModule.from_expr(out)
-
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target)
-
-        # TODO(vvchernov): needs for avx512 arch, can be extended
-        if n % 16 == 0 and k % 4 == 0:
-            asm = lib.lib.get_source("asm")
-            for intrin in intrins:
-                assert intrin in asm
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype)
-        y_np = np.random.uniform(1, 10, size=y_shape).astype("int8")
-        z_np = np.random.uniform(1, 10, size=z_shape).astype("int32")
-
-        runtime.set_input("x", x_np)
-        runtime.set_input("y", y_np)
-        runtime.set_input("z", z_np)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-        ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32") + z_np
-
-        np.testing.assert_equal(out, ref)
-
-
-@pytest.mark.skip("skip due to AMX feature not avaliable yet")
-@pytest.mark.parametrize(
-    "b,m,n,k",
-    [
-        (16, 32, 32, 128),
-        (16, 32, 32, 127),
-        (16, 32, 31, 128),
-    ],
-)
-def test_batch_matmul_amx(b, m, n, k):
-    amx_init = tvm.get_global_func("runtime.amx_init")
-    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
-    assert amx_init()
-    assert amx_tileconfig(16, 64)  # config tile size to 16 rows by 64 columns.
-
-    x_shape = (b, m, k)
-    y_shape = (b, n, k)
-    z_shape = (b, m, n)
-
-    for lhs_dtype in ["uint8", "int8"]:
-        x = relay.var("x", shape=x_shape, dtype=lhs_dtype)
-        y = relay.var("y", shape=y_shape, dtype="int8")
-        z = relay.var("z", shape=z_shape, dtype="int32")
-        bmm = relay.nn.batch_matmul(x, y, out_dtype="int32")
-        out = bmm + z
-        mod = tvm.IRModule.from_expr(out)
-
-        target = "llvm -mcpu=sapphirerapids"
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target)
-
-        asm = lib.lib.get_source("asm")
-        assert "tilezero" in asm
-        assert "tileloaddt1" in asm
-        assert "tdpbusd" in asm
-        assert "tilestored" in asm
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype)
-        y_np = np.random.uniform(1, 10, size=y_shape).astype("int8")
-        z_np = np.random.uniform(1, 10, size=z_shape).astype("int32")
-
-        runtime.set_input("x", x_np)
-        runtime.set_input("y", y_np)
-        runtime.set_input("z", z_np)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-        ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32") + z_np
-
-        np.testing.assert_equal(out, ref)
-
-
-@tvm.testing.requires_x86_vnni
-@pytest.mark.parametrize(
-    "b,m,n,k",
-    [
-        (16, 32, 128, 96),
-        (16, 32, 128, 97),
-        (16, 32, 129, 96),
-    ],
-)
-def test_batch_matmul_vnni(b, m, n, k):
-    batch_matmul_x86_test(b, m, n, k)
-
-
-@tvm.testing.requires_x86_avx512
-@pytest.mark.parametrize(
-    "b,m,n,k",
-    [
-        (16, 32, 128, 96),
-        (16, 32, 128, 97),
-        (16, 32, 129, 96),
-    ],
-)
-def test_batch_matmul_skylake_avx512(b, m, n, k):
-    batch_matmul_x86_test(b, m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"])
-
-
-@pytest.mark.skip("Requires GFX10 AMDGPU")
-def test_batch_matmul_rocm_sdot4():
-    x_shape = (16, 32, 96)
-    y_shape = (16, 128, 96)
-
-    lhs_dtype = "int8"
-    x = relay.var("x", shape=x_shape, dtype=lhs_dtype)
-    y = relay.var("y", shape=y_shape, dtype="int8")
-    bmm = relay.nn.batch_matmul(x, y, out_dtype="int32")
-
-    mod = tvm.IRModule.from_expr(bmm)
-
-    target = "rocm -mattr=+dotprod"
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target)
-
-    asm = lib.lib.imported_modules[0].get_source("asm")
-    assert "v_dot4_i32_i8" in asm
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype)
-    y_np = np.random.uniform(1, 10, size=y_shape).astype("int8")
-
-    runtime.set_input("x", x_np)
-    runtime.set_input("y", y_np)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-    ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32")
-
-    np.testing.assert_equal(out, ref)
-
-
-@tvm.testing.uses_gpu
-def test_shape_of():
-    shape = (10, 5, 12)
-    x = relay.var("x", shape=shape)
-    func = relay.Function([x], relay.op.shape_of(x))
-    func = run_infer_type(func)
-    x_data = np.random.rand(*shape).astype("float32")
-    for target, dev in tvm.testing.enabled_targets():
-        # Because using graph executor, this op will be optimized after
-        # constant folding pass, here we only test with interpreter
-        for kind in ["vm"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x_data)
-            tvm.testing.assert_allclose(op_res.numpy(), np.array(shape).astype("int32"))
-
-
-@tvm.testing.uses_gpu
-def test_ndarray_size(executor_kind):
-    def verify_ndarray_size(shape):
-        x = relay.var("x", shape=shape)
-        func = relay.Function([x], relay.op.ndarray_size(x))
-        func = run_infer_type(func)
-
-        x_data = np.random.uniform(size=shape).astype("float32")
-        ref_res = np.size(x_data)
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    verify_ndarray_size((2, 3, 5))
-    verify_ndarray_size((2, 3, 5, 7))
-
-
-def verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc):
-    for shape_dtype in ["int32", "int64"]:
-        x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype)
-        y = opfunc(x, out_size, layout)
-        func = relay.Function([x], y)
-
-        np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
-        np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout)
-
-        for target, dev in tvm.testing.enabled_targets():
-            relay_out = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                np_data
-            )
-            tvm.testing.assert_allclose(relay_out.numpy(), np_out, rtol=1e-5, atol=1e-5)
-
-
-def verify_adaptive_pool1d(dshape, out_size, pool_type, layout="NCW", dtype="float32"):
-    opfunc = relay.nn.adaptive_avg_pool1d if pool_type == "avg" else relay.nn.adaptive_max_pool1d
-    verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc)
-
-
-def verify_adaptive_pool2d(dshape, out_size, pool_type, layout="NCHW", dtype="float32"):
-    opfunc = relay.nn.adaptive_avg_pool2d if pool_type == "avg" else relay.nn.adaptive_max_pool2d
-    verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc)
-
-
-def verify_adaptive_pool3d(dshape, out_size, pool_type, layout="NCDHW", dtype="float32"):
-    opfunc = relay.nn.adaptive_avg_pool3d if pool_type == "avg" else relay.nn.adaptive_max_pool3d
-    verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc)
-
-
-@tvm.testing.uses_gpu
-def test_adaptive_pool():
-    verify_adaptive_pool1d((1, 9, 224), (1), "max")
-    verify_adaptive_pool1d((1, 3, 224), (3), "avg")
-    verify_adaptive_pool1d((1, 3, 224), (3), "avg", dtype="int32")
-    verify_adaptive_pool1d((1, 14, 78), (13), "max")
-    verify_adaptive_pool1d((1, 5, 97), (96), "avg")
-    verify_adaptive_pool1d((1, 224, 3), (1), "max", layout="NWC")
-    verify_adaptive_pool1d((1, 3, 224), (3), "avg", layout="NWC")
-    verify_adaptive_pool2d((1, 9, 224, 224), (1, 1), "max")
-    verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg")
-    verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg", dtype="int32")
-    verify_adaptive_pool2d((1, 14, 56, 78), (34, 13), "max")
-    verify_adaptive_pool2d((1, 5, 46, 97), (4, 96), "avg")
-    verify_adaptive_pool2d((1, 224, 224, 3), (1, 1), "max", layout="NHWC")
-    verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg", layout="NHWC")
-    verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "max", layout="NCDHW")
-    verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NCDHW")
-    verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NDHWC")
-    verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NCDHW", dtype="int32")
-    verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NDHWC", dtype="int32")
-    verify_adaptive_pool3d((1, 16, 32, 32, 32), (2, 4, 4), "max", layout="NDHWC")
-
-
-@tvm.testing.uses_gpu
-def test_sequence_mask(executor_kind):
-    def _verify(data_shape, mask_value, axis, dtype, itype):
-        max_length = data_shape[axis]
-        nbatch = data_shape[1 - axis]
-        data = relay.var("data", relay.TensorType(data_shape, dtype))
-        valid_length = relay.var("valid_length", relay.TensorType((nbatch,), itype))
-        out = relay.sequence_mask(data, valid_length, mask_value, axis)
-        checked = run_infer_type(out)
-        assert checked.checked_type == relay.ty.TensorType(data_shape, dtype)
-        func = relay.Function([data, valid_length], out)
-        data_np = np.random.uniform(size=data_shape).astype(dtype)
-        valid_length_np = np.random.randint(0, max_length, size=nbatch).astype(itype)
-        gt_out_np = tvm.topi.testing.sequence_mask(data_np, valid_length_np, mask_value, axis)
-
-        for target, dev in tvm.testing.enabled_targets():
-            out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
-                func
-            )(data_np, valid_length_np)
-            tvm.testing.assert_allclose(out_relay.numpy(), gt_out_np)
-
-    _verify((5, 10), 0.0, 1, "float32", "int32")
-    _verify((2, 3, 5, 3), 0.0, 0, "float32", "int64")
-    _verify((5, 8, 3), 0.1, 1, "float64", "float32")
-
-
-@tvm.testing.uses_gpu
-def test_one_hot(executor_kind):
-    def _get_oshape(indices_shape, depth, axis):
-        oshape = []
-        true_axis = len(indices_shape) if axis == -1 else axis
-        ndim = len(indices_shape) + 1
-        indices_index = 0
-        for i in range(0, ndim):
-            if i == true_axis:
-                oshape.append(depth)
-            else:
-                oshape.append(indices_shape[indices_index])
-                indices_index += 1
-
-        return oshape
-
-    def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
-        indices = relay.var("indices", relay.TensorType(indices_shape, "int32"))
-        on_value_const = relay.const(on_value)
-        off_value_const = relay.const(off_value)
-        out = relay.one_hot(indices, on_value_const, off_value_const, depth, axis, dtype)
-        checked = run_infer_type(out)
-        assert checked.checked_type == relay.ty.TensorType(
-            _get_oshape(indices_shape, depth, axis), dtype
-        )
-        func = relay.Function([indices], out)
-        indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32")
-        out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype)
-
-        for target, dev in tvm.testing.enabled_targets():
-            out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
-                func
-            )(indices_np)
-            tvm.testing.assert_allclose(out_relay.numpy(), out_np)
-
-    _verify((3,), 3, 1, 0, -1, "int32")
-    _verify((3,), 3, 1.0, 0.0, -1, "float32")
-    _verify((2, 2), 5, 2, -2, 0, "int32")
-    _verify((2, 2), 5, 0.5, -0.5, 1, "float32")
-    _verify((3, 2, 4, 5), 6, 1, 0, 1, "int32")
-    _verify((3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32")
-
-
-@tvm.testing.uses_gpu
-def test_matrix_set_diag(executor_kind):
-    def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
-        input = relay.var("input", relay.TensorType(input_shape, dtype))
-        diagonal = relay.var("diagonal", relay.TensorType(diagonal_shape, dtype))
-        out = relay.matrix_set_diag(input, diagonal, k, align)
-
-        in_type = run_infer_type(input)
-        out_type = run_infer_type(out)
-        assert in_type.checked_type == out_type.checked_type
-
-        func = relay.Function([input, diagonal], out)
-        input_np = np.random.randint(-100, 100, size=input_shape).astype(dtype)
-        diagonal_np = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype)
-        out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align)
-
-        for target, dev in tvm.testing.enabled_targets():
-            out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
-                func
-            )(input_np, diagonal_np)
-            tvm.testing.assert_allclose(out_relay.numpy(), out_np)
-
-    _verify((2, 2), (2,), "float32")
-    _verify((4, 3, 3), (4, 3), "int32")
-    _verify((2, 3, 4), (2, 3), "float32", 1)
-    _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "LEFT_RIGHT")
-    _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "LEFT_LEFT")
-    _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "RIGHT_RIGHT")
-
-
-@tvm.testing.parametrize_targets
-def test_nll_loss(executor_kind, dev, target):
-    def _get_oshape(target_shape, reduction):
-        if reduction == "none":
-            return target_shape
-        else:
-            return []
-
-    def _verify(prediction_shape, reduction="mean", ignore_index=-100, dtype="float32"):
-        C = prediction_shape[1]
-        target_shape = prediction_shape[:1] + prediction_shape[2:]
-
-        predictions = relay.var("predictions", relay.TensorType(prediction_shape, dtype))
-        targets = relay.var("targets", relay.TensorType(target_shape, "int32"))
-        weights = relay.var("weights", relay.TensorType((C,), dtype))
-        out = relay.nn.nll_loss(predictions, targets, weights, reduction, ignore_index)
-        checked = run_infer_type(out)
-        assert checked.checked_type == relay.ty.TensorType(
-            _get_oshape(target_shape, reduction), dtype
-        )
-        func = relay.Function([predictions, targets, weights], out)
-        predictions_np = np.random.uniform(size=prediction_shape).astype(dtype)
-        targets_np = np.random.randint(0, C, target_shape).astype("int32")
-        weights_np = np.random.uniform(size=(C,)).astype(dtype)
-        out_np = tvm.topi.testing.nll_loss(
-            predictions_np, targets_np, weights_np, reduction, ignore_index
-        )
-
-        out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            predictions_np, targets_np, weights_np
-        )
-        tvm.testing.assert_allclose(out_relay.numpy(), out_np, rtol=1e-6, atol=1e-6)
-
-    _verify((10, 5))
-    _verify((10, 5, 2, 2))
-    _verify((10, 5), reduction="sum")
-    _verify((10, 5), reduction="none")
-    _verify((10, 5), ignore_index=3)
-    _verify((10, 5), dtype="float64")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
deleted file mode 100644
index 78da144e54bf..000000000000
--- a/tests/python/relay/test_op_level2.py
+++ /dev/null
@@ -1,2287 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level2 operator test cases.
-"""
-import sys
-
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import autotvm, relay, te
-from tvm.contrib import utils, cudnn
-from tvm.ir.module import IRModule
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-from tvm.topi.cuda.conv3d_winograd import _infer_tile_size
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_conv1d_infer_type():
-    # symbolic in batch dimension
-    n, c, w = te.var("n"), 10, 224
-    x = relay.var("x", relay.ty.TensorType((n, c, w), "float32"))
-    w = relay.var("w")
-    y = relay.nn.conv1d(x, w, kernel_size=3, padding=(1, 1), channels=2)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 224), "float32")
-    assert yy.args[1].checked_type == relay.TensorType((2, 10, 3), "float32")
-
-    # infer by shape of w, mixed precision
-    n, c, w = te.var("n"), 10, 224
-    x = relay.var("x", relay.TensorType((n, c, w), "int8"))
-    w = relay.var("w", relay.TensorType((2, 10, 3), "int8"))
-    y = relay.nn.conv1d(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 222), "int32")
-
-    # infer shape in case of different dtypes for input and weight.
-    n, c, w = te.var("n"), 10, 224
-    x = relay.var("x", relay.TensorType((n, c, w), "uint8"))
-    w = relay.var("w", relay.TensorType((2, 10, 3), "int8"))
-    y = relay.nn.conv1d(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 222), "int32")
-
-    # Infer with NWC
-    n, c, w = 4, 32, 224
-    x = relay.var("x", relay.TensorType((n, w, c), "int8"))
-    wt = relay.var("w")
-    y = relay.nn.conv1d(
-        x, wt, kernel_size=3, padding=(1, 1), channels=16, data_layout="NWC", out_dtype="int32"
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, w, 16), "int32")
-
-
-@tvm.testing.uses_gpu
-def test_conv1d_run():
-    def run_test_conv1d(
-        dtype,
-        out_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        fref=None,
-        dilation=1,
-        except_targets=None,
-        **attrs,
-    ):
-        if except_targets is None:
-            except_targets = []
-
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", dtype=dtype)
-        y = relay.nn.conv1d(x, w, padding=padding, dilation=dilation, **attrs)
-        func = relay.Function([x, w], y)
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        ref_res = tvm.topi.testing.conv1d_ncw_python(
-            data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, dilation
-        )
-
-        for target, dev in tvm.testing.enabled_targets():
-            if target in except_targets:
-                continue
-            dev = tvm.device(target, 0)
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                data, kernel
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    # normal conv1d
-    dshape = (1, 3, 224)
-    kshape = (10, 3, 3)
-    run_test_conv1d(
-        "float32", "float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=3
-    )
-    # mixed precision
-    run_test_conv1d("int8", "int32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=3)
-    # dilated conv2d
-    dshape = (1, 3, 18)
-    kshape = (10, 3, 3)
-    run_test_conv1d(
-        "float32",
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1),
-        channels=10,
-        kernel_size=3,
-        dilation=3,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_infer_type():
-    # symbolic in batch dimension
-    n, c, h, w = te.size_var("n"), 10, 224, 224
-    x = relay.var("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    w = relay.var("w")
-    y = relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1), channels=2)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 224, 224), "float32")
-    assert yy.args[1].checked_type == relay.TensorType((2, 10, 3, 3), "float32")
-
-    # infer by shape of w, mixed precision
-    n, c, h, w = te.size_var("n"), 10, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
-    w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8"))
-    y = relay.nn.conv2d(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 222, 222), "int32")
-
-    # infer shape in case of different dtypes for input and weight.
-    n, c, h, w = te.size_var("n"), 10, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, h, w), "uint8"))
-    w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8"))
-    y = relay.nn.conv2d(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 222, 222), "int32")
-
-    # Infer with a different layout
-    n, c, h, w = 4, 32, 224, 224
-    x = relay.var("x", relay.TensorType((n // 4, c // 4, h, w, 4, 4), "int8"))
-    wt = relay.var("w")
-    y = relay.nn.conv2d(
-        x,
-        wt,
-        kernel_size=(3, 3),
-        padding=(1, 1),
-        channels=16,
-        data_layout="NCHW4n4c",
-        kernel_layout="OIHW4o4i",
-        out_dtype="int32",
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1, 4, 224, 224, 4, 4), "int32")
-    assert yy.args[1].checked_type == relay.TensorType((4, 8, 3, 3, 4, 4), "int8")
-
-    # Infer with NHWC
-    n, c, h, w = 4, 32, 224, 224
-    x = relay.var("x", relay.TensorType((n, h, w, c), "int8"))
-    wt = relay.var("w")
-    y = relay.nn.conv2d(
-        x,
-        wt,
-        kernel_size=(3, 3),
-        padding=(1, 1),
-        channels=16,
-        data_layout="NHWC",
-        out_dtype="int32",
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, h, w, 16), "int32")
-
-
-class TestConv2D:
-    config = {
-        "group1": dict(
-            dtype="float32",
-            out_dtype="float32",
-            scale=1,
-            dshape=(1, 32, 18, 18),
-            kshape=(32, 4, 3, 3),
-            padding=(1, 1),
-            channels=32,
-            groups=8,
-            kernel_size=(3, 3),
-            dilation=(1, 1),
-        ),
-        "group2": dict(
-            dtype="float32",
-            out_dtype="float32",
-            scale=1,
-            dshape=(1, 32, 18, 18),
-            kshape=(64, 1, 3, 3),
-            padding=(1, 1),
-            channels=64,
-            groups=32,
-            kernel_size=(3, 3),
-            dilation=(1, 1),
-        ),
-        "normal": dict(
-            dtype="float32",
-            out_dtype="float32",
-            scale=1,
-            dshape=(1, 3, 224, 224),
-            kshape=(10, 3, 3, 3),
-            padding=(1, 1),
-            channels=10,
-            groups=1,
-            kernel_size=(3, 3),
-            dilation=(1, 1),
-        ),
-        "mixed_precision_int8_int32_case1": dict(
-            dtype="int8",
-            out_dtype="int32",
-            scale=1,
-            dshape=(1, 3, 224, 224),
-            kshape=(10, 3, 3, 3),
-            padding=(1, 1),
-            channels=10,
-            groups=1,
-            kernel_size=(3, 3),
-            dilation=(1, 1),
-        ),
-        "mixed_precision_int8_int32_case2": dict(
-            dtype="int8",
-            out_dtype="int32",
-            scale=1,
-            dshape=(1, 3, 224, 224),
-            kshape=(10, 3, 1, 3),
-            padding=(0, 1),
-            channels=10,
-            groups=1,
-            kernel_size=(1, 3),
-            dilation=(1, 1),
-        ),
-        "dilated": dict(
-            dtype="float32",
-            out_dtype="float32",
-            scale=1,
-            dshape=(1, 3, 18, 18),
-            kshape=(10, 3, 3, 3),
-            padding=(1, 1),
-            channels=10,
-            groups=1,
-            kernel_size=(3, 3),
-            dilation=(3, 3),
-        ),
-    }
-
-    # TODO(Lunderberg): Make a cleaner utility for this type of
-    # parametrization.  It would be much nicer to have the fixture
-    # name come from the dictionaries themselves, rather than needing
-    # to be re-packed into tuples.
-    (
-        dtype,
-        out_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding,
-        channels,
-        groups,
-        kernel_size,
-        dilation,
-    ) = tvm.testing.parameters(
-        *[
-            [
-                d[p]
-                for p in [
-                    "dtype",
-                    "out_dtype",
-                    "scale",
-                    "dshape",
-                    "kshape",
-                    "padding",
-                    "channels",
-                    "groups",
-                    "kernel_size",
-                    "dilation",
-                ]
-            ]
-            for d in config.values()
-        ],
-        ids=config.keys(),
-    )
-
-    def test_run(
-        self,
-        target,
-        dev,
-        dtype,
-        out_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding,
-        groups,
-        dilation,
-        channels,
-        kernel_size,
-    ):
-        target = tvm.target.Target(target)
-
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", shape=kshape, dtype=dtype)
-        y = relay.nn.conv2d(
-            x,
-            w,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            channels=channels,
-            kernel_size=kernel_size,
-        )
-        func = relay.Function([x, w], y)
-
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        dkernel = tvm.topi.testing.dilate_python(kernel, (1, 1) + dilation)
-
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        ref_res = tvm.topi.testing.conv2d_nchw_python(
-            data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding, groups=groups
-        )
-
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-            data, kernel
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-4, atol=1e-4)
-
-
-def test_compile_depthwise_conv2d_arm_cpu():
-    dtype = "float32"
-    out_dtype = "float32"
-    scale = 1
-    dshape = (1, 512, 32, 32)
-    kshape = (512, 1, 3, 3)
-    padding = (1, 1)
-    channels = 512
-    groups = 512
-    kernel_size = (3, 3)
-    dilation = (1, 1)
-
-    x = relay.var("x", shape=dshape, dtype=dtype)
-    w = relay.var("w", shape=kshape, dtype=dtype)
-    y = relay.nn.conv2d(
-        x,
-        w,
-        padding=padding,
-        dilation=dilation,
-        groups=groups,
-        channels=channels,
-        kernel_size=kernel_size,
-    )
-    func = relay.Function([x, w], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-
-    test_schedule = '{"i": ["llvm -device=arm_cpu", "depthwise_conv2d_nchw_spatial_pack.arm_cpu", \
-                    [["TENSOR", [1, 512, 32, 32], "float32"], \
-                    ["TENSOR", [512, 1, 3, 3], "float32"], \
-                    [1, 1], [1, 1], [1, 1], "float32"], {}, \
-                    ["depthwise_conv2d_nchw_spatial_pack.arm_cpu", [1, 512, 32, 32, "float32"], \
-                    [512, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], \
-                    {"i": 743640, "t": "", "c": null, \
-                    "e": [["tile_co", "sp", [32, 16]], ["tile_oh", "sp", [8, 1]], \
-                    ["tile_ow", "sp", [1, 8]], \
-                    ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 8, 6, 7]], \
-                    ["reorder_1", "re", [0, 1, 2, 3, 6, 4, 5]], \
-                    ["ann_reduce", "an", ["unroll", "none"]], \
-                    ["ann_spatial", "an", ["unroll", "unroll", "vec"]], \
-                    ["data_pad_inline", "ot", 4], ["data_vec_inline", "ot", 1], \
-                    ["conv_inline", "ot", 0]]}], "r": [[0.0002933163], \
-                    0, 3.1976189613342285, 1570811630.6058347], "v": 0.1}'
-    temp = utils.tempdir()
-    with open(temp.relpath("temp.log"), "w") as log_file:
-        log_file.write(test_schedule)
-    with autotvm.apply_history_best(temp.relpath("temp.log")):
-        with tvm.transform.PassContext(opt_level=3):
-            print("Compiling...")
-            graph_json, mod, params = tvm.relay.build(mod, target="llvm -device=arm_cpu")
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_winograd():
-    class WinogradFallback(autotvm.FallbackContext):
-        def _query_inside(self, target, workload):
-            key = (target, workload)
-            if key in self.memory:
-                return self.memory[key]
-            cfg = autotvm.task.space.FallbackConfigEntity()
-            cfg.is_fallback = False
-            cfg.cost = 0.1 if "winograd" in workload[0] else 1
-            cfg["tile_b"] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
-            cfg["tile_y"] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
-            cfg["tile_x"] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
-            cfg["tile_rc"] = autotvm.task.space.SplitEntity([-1, 1])
-            cfg["auto_unroll_max_step"] = autotvm.task.space.OtherOptionEntity(1500)
-            cfg["unroll_explicit"] = autotvm.task.space.OtherOptionEntity(1)
-            self.memory[key] = cfg
-            return cfg
-
-    def run_test_conv2d_cuda(
-        dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), groups=1, dilation=(1, 1), **attrs
-    ):
-
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", shape=kshape, dtype=dtype)
-        y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs)
-        func = relay.Function([x, w], y)
-        mod = tvm.IRModule()
-        mod["main"] = func
-        mod = relay.transform.InferType()(mod)
-
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        ref_res = tvm.topi.testing.conv2d_nchw_python(
-            data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, groups=groups
-        )
-
-        with WinogradFallback(), tvm.transform.PassContext(opt_level=3):
-            for target, dev in tvm.testing.enabled_targets():
-                if target != "cuda":
-                    continue
-                dev = tvm.device(target, 0)
-                params = {"w": tvm.nd.array(kernel)}
-                graph, lib, params = relay.build_module.build(mod, target=target, params=params)
-                module = tvm.contrib.graph_executor.create(graph, lib, dev)
-                module.set_input("x", tvm.nd.array(data))
-                module.set_input(**params)
-                module.run()
-                op_res1 = module.get_output(0)
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-3, atol=1e-3)
-
-    # normal winograd: stride 1, padding 1, kernel 3x3
-    dshape = (1, 80, 73, 73)
-    kshape = (192, 80, 3, 3)
-    run_test_conv2d_cuda(
-        "float32", "float32", 1, dshape, kshape, padding=(1, 1), channels=192, kernel_size=(3, 3)
-    )
-    # extended winograd: stride 1, padding N, kernel 3x3
-    run_test_conv2d_cuda(
-        "float32", "float32", 1, dshape, kshape, padding=(0, 0), channels=192, kernel_size=(3, 3)
-    )
-    run_test_conv2d_cuda(
-        "float32", "float32", 1, dshape, kshape, padding=(2, 2), channels=192, kernel_size=(3, 3)
-    )
-    # extended winograd: stride 1, padding N, kernel NxN
-    kshape = (192, 80, 7, 7)
-    run_test_conv2d_cuda(
-        "float32", "float32", 1, dshape, kshape, padding=(2, 2), channels=192, kernel_size=(7, 7)
-    )
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_infer_type():
-    # symbolic in batch dimension
-    n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224
-    x = relay.var("x", relay.ty.TensorType((n, c, d, h, w), "float32"))
-    w = relay.var("w")
-    y = relay.nn.conv3d(x, w, kernel_size=(3, 3, 3), padding=(1, 1, 1), channels=2)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 224, 224, 224), "float32")
-    assert yy.args[1].checked_type == relay.TensorType((2, 10, 3, 3, 3), "float32")
-
-    # infer by shape of w, mixed precision
-    n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "int8"))
-    w = relay.var("w", relay.TensorType((2, 10, 3, 3, 3), "int8"))
-    y = relay.nn.conv3d(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 222, 222, 222), "int32")
-
-    # infer shape in case of different dtypes for input and weight.
-    n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "uint8"))
-    w = relay.var("w", relay.TensorType((2, 10, 3, 3, 3), "int8"))
-    y = relay.nn.conv3d(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 222, 222, 222), "int32")
-
-    # Infer with NDHWC
-    n, c, d, h, w = 4, 32, 224, 224, 224
-    x = relay.var("x", relay.TensorType((n, d, h, w, c), "int8"))
-    wt = relay.var("w")
-    y = relay.nn.conv3d(
-        x,
-        wt,
-        kernel_size=(3, 3, 3),
-        padding=(1, 1, 1),
-        channels=16,
-        data_layout="NDHWC",
-        out_dtype="int32",
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, d, h, w, 16), "int32")
-
-    # Infer with groups
-    x = relay.var("x", relay.TensorType((1, 16, 224, 224, 224), "float32"))
-    w = relay.var("w", relay.TensorType((4, 4, 1, 1, 1), "float32"))
-    y = relay.nn.conv3d(x, w, groups=4, kernel_size=(1, 1, 1), channels=4)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1, 4, 224, 224, 224), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_run():
-    def run_test_conv3d(
-        dtype,
-        out_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding=(1, 1, 1),
-        fref=None,
-        groups=1,
-        dilation=(1, 1, 1),
-        except_targets=None,
-        **attrs,
-    ):
-        if except_targets is None:
-            except_targets = []
-
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", dtype=dtype)
-        y = relay.nn.conv3d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs)
-        func = relay.Function([x, w], y)
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        dkernel = tvm.topi.testing.dilate_python(kernel, (1, 1) + dilation)
-        if fref is None:
-            ref_res = tvm.topi.testing.conv3d_ncdhw_python(
-                data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding, groups=groups
-            )
-        else:
-            ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
-
-        for target, dev in tvm.testing.enabled_targets():
-            if target in except_targets:
-                continue
-            dev = tvm.device(target, 0)
-
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                data, kernel
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    # normal conv3d
-    dshape = (1, 3, 5, 224, 224)
-    kshape = (10, 3, 3, 3, 3)
-    run_test_conv3d(
-        "float32",
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1, 1),
-        channels=10,
-        kernel_size=(3, 3, 3),
-    )
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_ndhwc_run():
-    def run_test_conv3d(
-        dtype,
-        out_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding=(1, 1, 1),
-        fref=None,
-        groups=1,
-        dilation=(1, 1, 1),
-        except_targets=None,
-        **attrs,
-    ):
-        if except_targets is None:
-            except_targets = []
-
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", dtype=dtype)
-        y = relay.nn.conv3d(
-            x,
-            w,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            data_layout="NDHWC",
-            kernel_layout="DHWIO",
-            **attrs,
-        )
-        func = relay.Function([x, w], y)
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        dkernel = tvm.topi.testing.dilate_python(kernel, (1, 1) + dilation)
-        if fref is None:
-            ref_res = tvm.topi.testing.conv3d_ndhwc_python(
-                data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding
-            )
-        else:
-            ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
-
-        for target, dev in tvm.testing.enabled_targets():
-            if target in except_targets:
-                continue
-            dev = tvm.device(target, 0)
-
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                data, kernel
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    # normal conv3d
-    dshape = (1, 5, 224, 224, 6)
-    kshape = (3, 3, 3, 6, 10)
-    run_test_conv3d(
-        "float32",
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(1, 1, 1),
-        channels=10,
-        kernel_size=(3, 3, 3),
-        except_targets=["cuda"],
-    )
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_winograd():
-    class WinogradFallback(autotvm.FallbackContext):
-        def _query_inside(self, target, workload):
-            key = (target, workload)
-            if key in self.memory:
-                return self.memory[key]
-            cfg = autotvm.task.space.FallbackConfigEntity()
-            cfg.is_fallback = False
-            cfg.cost = 0.1 if "winograd" in workload[0] else 1
-            cfg["tile_b"] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
-            cfg["tile_y"] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
-            cfg["tile_x"] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
-            cfg["tile_rc"] = autotvm.task.space.SplitEntity([-1, 1])
-            cfg["auto_unroll_max_step"] = autotvm.task.space.OtherOptionEntity(0)
-            cfg["unroll_explicit"] = autotvm.task.space.OtherOptionEntity(1)
-            self.memory[key] = cfg
-            return cfg
-
-    def run_test_conv3d_cuda(
-        dtype,
-        out_dtype,
-        scale,
-        dshape,
-        kshape,
-        padding=(1, 1, 1),
-        groups=1,
-        dilation=(1, 1, 1),
-        prepack=False,
-        **attrs,
-    ):
-
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        w = relay.var("w", shape=kshape, dtype=dtype)
-        if prepack:
-            tile_size = _infer_tile_size(np.zeros(shape=dshape), np.zeros(shape=kshape))
-            w_packed = relay.nn.contrib_conv3d_winograd_weight_transform(w, tile_size)
-
-            y = relay.nn.contrib_conv3d_winograd_without_weight_transform(
-                x,
-                w_packed,
-                tile_size,
-                padding=padding,
-                dilation=dilation,
-                groups=groups,
-                channels=kshape[0],
-                **attrs,
-            )
-        else:
-            y = relay.nn.conv3d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs)
-        func = relay.Function([x, w], y)
-        mod = tvm.IRModule()
-        mod["main"] = func
-        mod = relay.transform.InferType()(mod)
-
-        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-        ref_res = tvm.topi.testing.conv3d_ncdhw_python(
-            data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, groups=groups
-        )
-
-        with WinogradFallback(), tvm.transform.PassContext(opt_level=3):
-            for target, dev in tvm.testing.enabled_targets():
-                if target != "cuda":
-                    continue
-                dev = tvm.device(target, 0)
-                params = {"w": tvm.nd.array(kernel)}
-                graph, lib, params = relay.build_module.build(mod, target=target, params=params)
-                module = tvm.contrib.graph_executor.create(graph, lib, dev)
-                module.set_input("x", tvm.nd.array(data))
-                module.set_input(**params)
-                module.run()
-                op_res1 = module.get_output(0)
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-3, atol=1e-3)
-
-    # normal winograd: stride 1, padding 1, kernel 3x3x3
-    dshape = (1, 32, 16, 16, 16)
-    kshape = (64, 32, 3, 3, 3)
-    run_test_conv3d_cuda(
-        "float32", "float32", 1, dshape, kshape, padding=(1, 1, 1), kernel_size=(3, 3, 3)
-    )
-    # Without depth transform using 1x3x3 kernel.
-    kshape = (64, 32, 1, 3, 3)
-    run_test_conv3d_cuda(
-        "float32", "float32", 1, dshape, kshape, padding=(0, 1, 1), kernel_size=(1, 3, 3)
-    )
-
-    # extended winograd: stride 1, padding N, kernel NxNxN
-    dshape = (1, 61, 20, 20, 20)
-    kshape = (120, 61, 5, 5, 5)
-    run_test_conv3d_cuda(
-        "float32",
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(2, 2, 2),
-        channels=120,
-        kernel_size=(5, 5, 5),
-    )
-    # Without depth transform
-    kshape = (120, 61, 1, 5, 5)
-    run_test_conv3d_cuda(
-        "float32",
-        "float32",
-        1,
-        dshape,
-        kshape,
-        padding=(0, 2, 2),
-        channels=120,
-        kernel_size=(1, 5, 5),
-    )
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_transpose_infer_type():
-    # symbolic in batch dimension
-    n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224
-    x = relay.var("x", relay.ty.TensorType((n, c, d, h, w), "float32"))
-    w = relay.var("w")
-    y = relay.nn.conv3d_transpose(x, w, kernel_size=(3, 3, 3), padding=(1, 1, 1), channels=2)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 2, 224, 224, 224), "float32")
-
-    assert yy.args[1].checked_type == relay.TensorType((10, 2, 3, 3, 3), "float32")
-
-    # infer by shape of w, mixed precision
-    n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "int8"))
-    w = relay.var("w", relay.TensorType((10, 12, 3, 3, 3), "int8"))
-    y = relay.nn.conv3d_transpose(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 12, 226, 226, 226), "int32")
-
-    # infer shape in case of different dtypes for input and weight.
-    n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "uint8"))
-    w = relay.var("w", relay.TensorType((10, 12, 3, 3, 3), "int8"))
-    y = relay.nn.conv3d_transpose(x, w, out_dtype="int32")
-    assert 'out_dtype="int32"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 12, 226, 226, 226), "int32")
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_transpose_ncdhw_run():
-    dshape = (1, 3, 24, 24, 24)
-    kshape = (3, 4, 2, 2, 2)
-
-    x = relay.var("x", shape=dshape)
-    w = relay.var("w")
-    y = relay.nn.conv3d_transpose(
-        x, w, channels=4, kernel_size=(2, 2, 2), strides=(1, 1, 1), padding=(1, 1, 1)
-    )
-    func = relay.Function([x, w], y)
-    dtype = "float32"
-
-    data = np.random.uniform(size=dshape).astype(dtype)
-    kernel = np.random.uniform(size=kshape).astype(dtype)
-    ref_res = tvm.topi.testing.conv3d_transpose_ncdhw_python(data, kernel, 1, 1, 0)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-            data, kernel
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-def test_compile_depthwise_conv3d():
-    dshape = [1, 16, 10, 10, 10]
-    wshape = [16, 2, 1, 1, 1]
-    params = {}
-    data = relay.var("data", shape=dshape, dtype="float32")
-    kernel = relay.const(tvm.nd.array(np.ones(shape=wshape).astype(dtype="float32")))
-    mod = tvm.IRModule()
-    res = relay.nn.conv3d(
-        data,
-        kernel,
-        kernel_size=[1, 1, 1],
-        padding=[0] * 3,
-        channels=32,
-        groups=16,
-        data_layout="NCDHW",
-        kernel_layout="OIDHW",
-    )
-    func = relay.Function([data], res)
-    mod = tvm.IRModule.from_expr(func)
-
-    target = "llvm"
-    _ = relay.build(mod, tvm.target.Target(target, host=target))
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_transpose_infer_type():
-    # symbolic in batch dimension
-    n, c, h, w = te.size_var("n"), 10, 10, 12
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    w = relay.var("w", relay.IncompleteType())
-    y = relay.nn.conv2d_transpose(x, w, kernel_size=(3, 3), padding=(1, 1), channels=15)
-    assert "channels=15" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 15, 10, 12), "float32")
-    assert yy.args[1].checked_type == relay.TensorType((10, 15, 3, 3), "float32")
-
-    # infer by shape of w, mixed precision
-    n, h, w, c = te.size_var("n"), 10, 10, 12
-    x = relay.var("x", relay.TensorType((n, h, w, c), "float32"))
-    w = relay.var("w", relay.TensorType((12, 11, 5, 5), "float32"))
-    y = relay.nn.conv2d_transpose(x, w, output_padding=(1, 1), channels=11, data_layout="NHWC")
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 15, 15, 11), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_transpose_nchw_run():
-    k_layouts = {"OIHW": (10, 3, 3, 3), "IOHW": (3, 10, 3, 3)}
-    output_padding = (1, 1)
-
-    for k_layout, kshape in k_layouts.items():
-        dshape = (1, 3, 18, 18)
-        x = relay.var("x", shape=dshape)
-        w = relay.var("w")
-        y = relay.nn.conv2d_transpose(
-            x,
-            w,
-            channels=10,
-            kernel_size=(3, 3),
-            strides=(2, 2),
-            padding=(1, 1),
-            output_padding=output_padding,
-            kernel_layout=k_layout,
-            data_layout="NCHW",
-        )
-        func = relay.Function([x, w], y)
-        dtype = "float32"
-        data = np.random.uniform(size=dshape).astype(dtype)
-        kernel = np.random.uniform(size=kshape).astype(dtype)
-
-        if k_layout != "IOHW":
-            # Must be OIHW so switch
-            kernel_iohw = np.transpose(kernel, [1, 0, 2, 3])
-        else:
-            kernel_iohw = kernel
-
-        ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(
-            data, kernel_iohw, 2, 1, output_padding
-        )
-
-        enabled_targets = tvm.testing.enabled_targets()
-
-        if cudnn.exists() and k_layout == "IOHW":
-            enabled_targets.append(("cuda -libs=cudnn", tvm.cuda(0)))
-
-        for target, dev in enabled_targets:
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                data, kernel
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_transpose_nhwc_run():
-    dshape_nhwc = (1, 18, 18, 3)
-    kshape_hwoi = (3, 3, 10, 3)
-    x = relay.var("x", shape=dshape_nhwc)
-    w = relay.var("w")
-
-    y = relay.nn.conv2d_transpose(
-        x,
-        w,
-        channels=10,
-        kernel_size=(3, 3),
-        strides=(2, 2),
-        padding=(1, 1),
-        output_padding=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    func = relay.Function([x, w], y)
-    dtype = "float32"
-    data = np.random.uniform(size=dshape_nhwc).astype(dtype)
-    kernel = np.random.uniform(size=kshape_hwoi).astype(dtype)
-
-    ref_res = tvm.topi.testing.conv2d_transpose_nhwc_python(
-        data, kernel, "HWOI", 2, 1, output_padding=(1, 1)
-    )
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-            data, kernel
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_transpose_nhwc_cudnn():
-    if not cudnn.exists():
-        return
-
-    dshape_nhwc = (1, 18, 18, 3)
-    kshape_ihwo = (3, 3, 3, 10)
-    x = relay.var("x", shape=dshape_nhwc)
-    w = relay.var("w", shape=kshape_ihwo)
-
-    y = relay.nn.conv2d_transpose(
-        x,
-        w,
-        channels=10,
-        kernel_size=(3, 3),
-        strides=(2, 2),
-        padding=(1, 1),
-        output_padding=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="IHWO",
-    )
-    func = relay.Function([x, w], y)
-    dtype = "float32"
-    data = np.random.uniform(size=dshape_nhwc).astype(dtype)
-    kernel = np.random.uniform(size=kshape_ihwo).astype(dtype)
-
-    ref_res = tvm.topi.testing.conv2d_transpose_nhwc_python(
-        data, np.transpose(kernel, [1, 2, 3, 0]), "HWOI", 2, 1, output_padding=(1, 1)
-    )
-
-    target = "cuda -libs=cudnn"
-    dev = tvm.cuda(0)
-
-    op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data, kernel)
-    tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_conv1d_transpose_ncw_run():
-    dshape = (1, 3, 18)
-    kshape = (3, 10, 3)
-    oshape = (1, 10, 36)
-    x = relay.var("x", shape=dshape)
-    w = relay.var("w")
-    y = relay.nn.conv1d_transpose(
-        x, w, channels=10, kernel_size=(3,), strides=(2,), padding=(1,), output_padding=(1,)
-    )
-    func = relay.Function([x, w], y)
-    dtype = "float32"
-    data = np.random.uniform(size=dshape).astype(dtype)
-    kernel = np.random.uniform(size=kshape).astype(dtype)
-    ref_res = tvm.topi.testing.conv1d_transpose_ncw_python(data, kernel, 2, 1, output_padding=(1,))
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-            data, kernel
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_upsampling_infer_type():
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    scale = tvm.tir.const(2.0, "float64")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.upsampling(x, scale_h=2, scale_w=2, layout="NCHW", method="bilinear")
-    'method="BINLINEAR"' in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType(
-        (
-            n,
-            c,
-            tvm.tir.Cast("int32", te.round(h * scale)),
-            tvm.tir.Cast("int32", te.round(w * scale)),
-        ),
-        "float32",
-    )
-    n, c = te.size_var("n"), te.size_var("c")
-    x = relay.var("x", relay.TensorType((n, c, 100, 200), "float32"))
-    y = relay.nn.upsampling(x, scale_h=2, scale_w=2, layout="NCHW", method="bilinear")
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, 200, 400), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_upsampling3d_infer_type():
-    n, c, d, h, w = (
-        te.size_var("n"),
-        te.size_var("c"),
-        te.size_var("d"),
-        te.size_var("h"),
-        te.size_var("w"),
-    )
-    scale = tvm.tir.const(2.0, "float64")
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "float32"))
-    y = relay.nn.upsampling3d(
-        x, scale_d=2, scale_h=2, scale_w=2, layout="NCDHW", method="trilinear"
-    )
-
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType(
-        (
-            n,
-            c,
-            tvm.tir.Cast("int32", te.round(d * scale)),
-            tvm.tir.Cast("int32", te.round(h * scale)),
-            tvm.tir.Cast("int32", te.round(w * scale)),
-        ),
-        "float32",
-    )
-    n, c = te.size_var("n"), te.size_var("c")
-    x = relay.var("x", relay.TensorType((n, c, 100, 100, 200), "float32"))
-    y = relay.nn.upsampling3d(
-        x, scale_d=2, scale_h=2, scale_w=2, layout="NCDHW", method="trilinear"
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, 200, 200, 400), "float32")
-
-
-def _test_global_pool2d(opfunc, reffunc):
-    n, c, h, w = te.size_var("n"), te.size_var("c"), 224, 224
-    x = relay.var("x", relay.TensorType((n, h, w, c), "float32"))
-    y = opfunc(x, layout="NHWC")
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 1, 1, c), "float32")
-
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    y = opfunc(x)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, 1, 1), "float32")
-    # test execution
-    dtype = "float32"
-    dshape = (1, 1024, 7, 7)
-    x = relay.var("x", shape=dshape)
-    y = opfunc(x)
-    func = relay.Function([x], y)
-    data = np.random.uniform(size=dshape).astype(dtype)
-    ref_res = reffunc(data, axis=(2, 3), keepdims=True)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_pool2d():
-    def _test_pool2d(opfunc, pool_type, pool_size=2, strides=2, dilation=1, padding=0):
-        n, c, h, w = te.size_var("n"), 10, 224, 224
-        x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-        y = opfunc(x, pool_size=(1, 1))
-        assert "pool_size=" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, 10, 224, 224), "float32")
-        # test execution
-        dtype = "float32"
-        dshape = (1, 3, 28, 28)
-        x = relay.var("x", shape=dshape)
-        y = opfunc(x, pool_size=pool_size, strides=strides, dilation=dilation, padding=padding)
-        func = relay.Function([x], y)
-        data = np.random.uniform(size=dshape).astype(dtype)
-        ref_res = tvm.topi.testing.poolnd_python(
-            data,
-            [pool_size, pool_size],
-            [strides, strides],
-            [dilation, dilation],
-            [padding, padding],
-            [padding, padding],
-            pool_type,
-            count_include_pad=False,
-            ceil_mode=False,
-        )
-        for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    def _test_pool2d_int(opfunc, reffunc, dtype):
-        n, c, h, w = te.size_var("n"), 10, 224, 224
-        x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
-        y = opfunc(x, pool_size=(1, 1))
-        assert "pool_size=" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, 10, 224, 224), dtype)
-        # test execution
-        dshape = (1, 3, 28, 28)
-        for shape_dtype in ["int32", "int64"]:
-            x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype)
-            y = opfunc(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-            func = relay.Function([x], y)
-            data = np.random.randint(low=-128, high=128, size=dshape)
-            ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)).astype(dtype)
-            for target, dev in tvm.testing.enabled_targets():
-                op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    data
-                )
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    _test_pool2d(relay.nn.max_pool2d, "max")
-    _test_pool2d(relay.nn.max_pool2d, "max", pool_size=2, strides=2, padding=0)
-    _test_pool2d(relay.nn.max_pool2d, "max", pool_size=2, strides=2, padding=0, dilation=2)
-    _test_pool2d(relay.nn.avg_pool2d, "avg")
-    _test_pool2d(relay.nn.avg_pool2d, "avg", pool_size=2, strides=2, padding=0)
-    _test_pool2d(relay.nn.avg_pool2d, "avg", pool_size=2, strides=2, padding=0, dilation=2)
-
-    _test_pool2d_int(relay.nn.avg_pool2d, np.mean, "int64")
-    _test_pool2d_int(relay.nn.avg_pool2d, np.mean, "float16")
-    _test_global_pool2d(relay.nn.global_max_pool2d, np.max)
-    _test_global_pool2d(relay.nn.global_avg_pool2d, np.mean)
-
-
-def _test_global_pool1d(opfunc, reffunc):
-    n, c, w = te.size_var("n"), te.size_var("c"), 224
-    x = relay.var("x", relay.TensorType((n, w, c), "float32"))
-    y = opfunc(x, layout="NWC")
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 1, c), "float32")
-
-    n, c, w = te.size_var("n"), te.size_var("c"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, w), "float32"))
-    y = opfunc(x)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, 1), "float32")
-    # test execution
-    dtype = "float32"
-    dshape = (1, 1024, 7)
-    x = relay.var("x", shape=dshape)
-    y = opfunc(x)
-    func = relay.Function([x], y)
-    data = np.random.uniform(size=dshape).astype(dtype)
-    ref_res = reffunc(data, axis=(2,), keepdims=True)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_pool1d():
-    def _test_pool1d(
-        opfunc, pool_type, pool_size=2, strides=2, dilation=1, padding=0, dtype="float32"
-    ):
-        n, c, w = te.var("n"), 10, 224
-        x = relay.var("x", relay.TensorType((n, c, w), "float32"))
-        y = opfunc(x, pool_size=(1,))
-        assert "pool_size=" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, 10, 224), "float32")
-        # test execution
-        dshape = (1, 3, 32)
-        for shape_dtype in ["int32", "int64"]:
-            x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype)
-            pool_type = "max" if "max" in str(opfunc) else "avg"
-            y = opfunc(x, pool_size=pool_size, strides=strides, dilation=dilation, padding=padding)
-            func = relay.Function([x], y)
-            data = np.random.uniform(size=dshape).astype(dtype)
-            ref_res = tvm.topi.testing.poolnd_python(
-                data,
-                [pool_size],
-                [strides],
-                [dilation],
-                [padding],
-                [padding],
-                pool_type,
-                count_include_pad=False,
-                ceil_mode=False,
-            )
-            for target, dev in tvm.testing.enabled_targets():
-                op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    data
-                )
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    _test_pool1d(relay.nn.max_pool1d, "max")
-    _test_pool1d(relay.nn.max_pool1d, "max", dtype="int32")
-    _test_pool1d(relay.nn.max_pool1d, "max", pool_size=2, strides=2, padding=0)
-    _test_pool1d(relay.nn.max_pool1d, "max", pool_size=2, strides=2, padding=0, dilation=2)
-    _test_pool1d(relay.nn.avg_pool1d, "avg")
-    _test_pool1d(relay.nn.avg_pool1d, "avg", dtype="int64")
-    _test_pool1d(relay.nn.avg_pool1d, "avg", pool_size=2, strides=2, padding=0)
-    _test_pool1d(relay.nn.avg_pool1d, "avg", pool_size=2, strides=2, padding=0, dilation=2)
-    _test_global_pool1d(relay.nn.global_max_pool1d, np.max)
-    _test_global_pool1d(relay.nn.global_avg_pool1d, np.mean)
-
-
-@tvm.testing.uses_gpu
-def test_pool3d():
-    def _test_pool3d(
-        opfunc,
-        pool_type,
-        pool_size=2,
-        strides=2,
-        dilation=1,
-        padding=[0, 0, 0, 0, 0, 0],
-        dtype="float32",
-    ):
-        n, c, d, h, w = te.size_var("n"), 10, 5, 224, 224
-        x = relay.var("x", relay.TensorType((n, c, d, h, w), "float32"))
-        y = opfunc(x, pool_size=(1, 1, 1))
-        assert "pool_size=" in y.astext()
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((n, 10, 5, 224, 224), "float32")
-        # test execution
-        dtype = "float32"
-        dshape = (1, 3, 32, 32, 32)
-        for shape_dtype in ["int32", "int64"]:
-            x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype)
-            pool_type = "max" if "max" in str(opfunc) else "avg"
-            y = opfunc(
-                x,
-                pool_size=pool_size,
-                strides=strides,
-                padding=padding,
-                dilation=dilation,
-            )
-            func = relay.Function([x], y)
-            data = np.random.uniform(size=dshape).astype(dtype)
-            ref_res = tvm.topi.testing.poolnd_python(
-                data,
-                [pool_size, pool_size, pool_size],
-                [strides, strides, strides],
-                [dilation, dilation, dilation],
-                padding[:3],
-                padding[3:],
-                pool_type,
-                count_include_pad=False,
-                ceil_mode=False,
-            )
-            for target, dev in tvm.testing.enabled_targets():
-                op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    data
-                )
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    _test_pool3d(relay.nn.max_pool3d, "max")
-    _test_pool3d(relay.nn.max_pool3d, "max", dtype="int32")
-    _test_pool3d(relay.nn.max_pool3d, "max", padding=(2, 0, 0, 2, 0, 0))
-    _test_pool3d(relay.nn.max_pool3d, "max", padding=(0, 3, 0, 0, 3, 0))
-    _test_pool3d(relay.nn.max_pool3d, "max", padding=(0, 0, 4, 0, 0, 4))
-    _test_pool3d(relay.nn.max_pool3d, "max", pool_size=2, strides=2)
-    _test_pool3d(relay.nn.max_pool3d, "max", pool_size=2, strides=2, dilation=2)
-    _test_pool3d(relay.nn.avg_pool3d, "avg")
-    _test_pool3d(relay.nn.avg_pool3d, "avg", dtype="int32")
-    _test_pool3d(relay.nn.avg_pool3d, "avg", padding=(2, 0, 0, 2, 0, 0))
-    _test_pool3d(relay.nn.avg_pool3d, "avg", padding=(0, 3, 0, 0, 3, 0))
-    _test_pool3d(relay.nn.avg_pool3d, "avg", padding=(0, 0, 4, 0, 0, 4))
-    _test_pool3d(relay.nn.avg_pool3d, "avg", pool_size=2, strides=2)
-    _test_pool3d(relay.nn.avg_pool3d, "avg", pool_size=2, strides=2, dilation=2)
-
-
-@tvm.testing.uses_gpu
-def test_avg_pool2d_no_count_pad():
-    kh, kw = (4, 4)
-    sh, sw = (2, 2)
-    ph, pw = (2, 2)
-    n = 1
-    (ic, ih, iw) = (3, 28, 28)
-    (oc, oh, ow) = (3, 15, 15)
-    dshape = (n, ic, ih, iw)
-    x = relay.var("x", shape=dshape)
-    y = relay.nn.avg_pool2d(
-        x, pool_size=(kh, kw), strides=(sw, sw), padding=(ph, pw), count_include_pad=False
-    )
-    func = relay.Function([x], y)
-    dtype = "float32"
-    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
-    pad_np = np.zeros(shape=(n, ic, ih + 2 * ph, iw + 2 * pw)).astype(dtype)
-    no_zero = (range(n), range(ic), (range(ph, ih + ph)), (range(pw, iw + pw)))
-    pad_np[np.ix_(*no_zero)] = a_np
-    b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)
-    for i in range(oh):
-        for j in range(ow):
-            pad_count = np.sum(
-                pad_np[:, :, i * sh : i * sh + kh, j * sw : j * sw + kw] > 0, axis=(2, 3)
-            )
-            b_np[:, :, i, j] = np.sum(
-                pad_np[:, :, i * sh : i * sh + kh, j * sw : j * sw + kw], axis=(2, 3)
-            ) / np.maximum(pad_count, 1)
-    ref_res = np.maximum(b_np, 0.0)
-    data = a_np
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_flatten_infer_type(executor_kind):
-    d1, d2, d3, d4 = te.size_var("d1"), te.size_var("d2"), te.size_var("d3"), te.size_var("d4")
-    x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32"))
-    y = relay.nn.batch_flatten(x)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((d1, ((d2 * d3) * d4)), "float32")
-
-    x = relay.var("x", relay.TensorType((3, 2, 4, 3), "float32"))
-    y = relay.nn.batch_flatten(x)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((3, 24), "float32")
-
-    x = relay.var("x", relay.TensorType((d1, 2, d3, 3), "float32"))
-    y = relay.nn.batch_flatten(x)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((d1, ((2 * d3) * 3)), "float32")
-
-    shape = (1, 5, 10, 10)
-    o_shape = (1, 500)
-    dtype = "float32"
-    x = relay.var("x", relay.TensorType(shape, dtype))
-    z = relay.nn.batch_flatten(x)
-    yy = run_infer_type(z)
-    assert yy.checked_type == relay.TensorType(o_shape, dtype)
-    func = relay.Function([x], z)
-    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
-    ref_res = x_data.flatten().reshape(o_shape)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_pad_infer_type():
-    # entirely concrete cases
-    n, c, h, w = 1, 2, 3, 4
-    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((3, 6, 9, 12), "float32")
-
-    n, c, h, w = 4, 6, 3, 5
-    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.pad(t, ((-1, -1), (2, -2), (0, -3), (4, 4)), pad_mode="reflect")
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((2, 6, 0, 13), "float32")
-
-    # some symbolic values
-    n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
-    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
-
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.pad(t, ((-1, -1), (-2, -2), (1, -3), (4, 4)))
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n + (-2), c + (-4), h + (-2), w + 8), "float32")
-
-    # dealing with dynamic vals
-    n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
-    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.pad(
-        t, ((1, 1), (2, 2), (3, 3), (4, 4)), pad_value=relay.var("pad_value", "float32")
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
-
-
-def _get_numpy_pad(dshape, data, pad, pad_value=0):
-    mod_pad = []
-    for axis, (pad_x, pad_y) in enumerate(pad):
-        indices = range(dshape[axis])
-        if pad_x < 0:
-            indices = indices[abs(pad_x) :]
-            pad_x = 0
-        if pad_y < 0:
-            indices = indices[:pad_y]
-            pad_y = 0
-        data = np.take(data, indices, axis)
-        mod_pad.append((pad_x, pad_y))
-    return np.pad(data, tuple(mod_pad), "constant", constant_values=pad_value)
-
-
-@tvm.testing.uses_gpu
-def test_pad_run():
-    def _test_run(dtype):
-        dshape_list = [(4, 10, 7, 7), (4, 6, 3, 5)]
-        pad_list = [((1, 1), (2, 2), (3, 3), (4, 4)), ((-1, -1), (2, -2), (0, -2), (4, 4))]
-
-        for dshape, pad in zip(dshape_list, pad_list):
-            x = relay.var("x", shape=dshape)
-            y = relay.nn.pad(x, pad)
-            func = relay.Function([x], y)
-            data = np.random.uniform(size=dshape).astype(dtype)
-            ref_res = _get_numpy_pad(dshape, data, pad)
-            for target, dev in tvm.testing.enabled_targets():
-                op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    data
-                )
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    _test_run("float32")
-    _test_run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_pad_run_dynamic_pad_value():
-    def _test_run(dtype):
-        dshape = (4, 6, 3, 5)
-        pad = ((-1, -1), (2, -2), (0, -2), (4, 4))
-
-        data = relay.var("data", shape=dshape, dtype=dtype)
-        pad_value = relay.var("pad_value", dtype)
-        pad_data = relay.nn.pad(data, pad, pad_value=pad_value)
-        f = relay.Function([data, pad_value], pad_data)
-
-        data_arr = np.random.uniform(-10, 10, size=dshape).astype(dtype)
-        pad_value_arr = 2.0
-        ref_res = _get_numpy_pad(dshape, data_arr, pad, pad_value=pad_value_arr)
-
-        for target, dev in tvm.testing.enabled_targets():
-            result = relay.create_executor(kind="graph", device=dev, target=target).evaluate(f)(
-                data_arr, pad_value_arr
-            )
-            tvm.testing.assert_allclose(result.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    _test_run("float32")
-    _test_run("int32")
-
-
-def test_pad_value_in_array():
-    A = relay.var("A", shape=(32, 32), dtype="int8")
-
-    # Extract pad value from an array
-    p0 = relay.Constant(tvm.nd.array(np.array([2], dtype="int8")))
-    p1 = relay.nn.pad(A, pad_value=p0, pad_width=((1, 1), (1, 1)))
-
-    func = relay.Function(relay.analysis.free_vars(p1), p1)
-    mod = tvm.IRModule.from_expr(func)
-
-    target = "llvm"
-    lib = relay.build(
-        mod,
-        tvm.target.Target(target, host=target),
-        runtime=relay.backend.Runtime("cpp"),
-        executor=relay.backend.Executor("aot", {"unpacked-api": False, "interface-api": "packed"}),
-    )
-
-
-@tvm.testing.uses_gpu
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-def test_lrn(executor_kind, dtype):
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", shape=(n, c, h, w), dtype=dtype)
-    y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=0.00001, beta=0.75)
-    "alpha=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, w), dtype)
-
-    shape = (1, 5, 10, 10)
-    x = relay.var("x", relay.TensorType(shape, dtype))
-    size = 5
-    axis = 1
-    bias = 0.5
-    alpha = 0.00001
-    beta = 0.75
-    z = relay.nn.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
-    yy = run_infer_type(z)
-    assert yy.checked_type == relay.TensorType(shape, dtype)
-    func = relay.Function([x], z)
-    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
-    ref_res = tvm.topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_l2_normalize(executor_kind):
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", shape=(n, c, h, w))
-    y = relay.nn.l2_normalize(x, eps=0.001, axis=[1])
-    "axis=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, w))
-
-    shape = (1, 5, 10, 10)
-    dtype = "float32"
-    x = relay.var("x", relay.TensorType(shape, dtype))
-    eps = 0.001
-    axis = 1
-    z = relay.nn.l2_normalize(x, eps=0.001, axis=[axis])
-    yy = run_infer_type(z)
-    assert yy.checked_type == relay.TensorType(shape, dtype)
-    func = relay.Function([x], z)
-    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
-    ref_res = tvm.topi.testing.l2_normalize_python(x_data, eps, axis)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-def batch_flatten(data):
-    shape = data.shape
-    target_dim = 1
-    for i in range(len(shape) - 1):
-        target_dim = target_dim * shape[i + 1]
-    return np.reshape(data, (shape[0], target_dim))
-
-
-@tvm.testing.uses_gpu
-def test_batch_flatten():
-    t1 = relay.TensorType((5, 10, 5))
-    x = relay.Var("x", t1)
-    func = relay.Function([x], relay.nn.batch_flatten(x))
-
-    data = np.random.rand(5, 10, 5).astype(t1.dtype)
-    ref_res = batch_flatten(data)
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-        np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-
-def _test_upsampling(layout, method, align_corners=False):
-    n, c, h, w = te.size_var("n"), 16, 32, 32
-    scale_h = 2.0
-    scale_w = 2.0
-    dtype = "float32"
-
-    def get_shape():
-        if layout == "NCHW":
-            return (c, h, w), (c, int(round(h * scale_h)), int(round(w * scale_w)))
-        else:
-            return (h, w, c), (int(round(h * scale_h)), int(round(w * scale_w)), c)
-
-    ishape, oshape = get_shape()
-    x = relay.var("x", relay.TensorType((n,) + ishape, dtype))
-    y = relay.nn.upsampling(
-        x,
-        scale_h=scale_h,
-        scale_w=scale_w,
-        layout=layout,
-        method=method,
-        align_corners=align_corners,
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n,) + oshape, dtype)
-    dshape = (1,) + ishape
-    x = relay.var("x", shape=dshape)
-    y = relay.nn.upsampling(
-        x,
-        scale_h=scale_h,
-        scale_w=scale_w,
-        layout=layout,
-        method=method,
-        align_corners=align_corners,
-    )
-    func = relay.Function([x], y)
-
-    data = np.random.uniform(size=dshape).astype(dtype)
-    ref = tvm.topi.testing.resize2d_python(
-        data,
-        (scale_h, scale_w),
-        layout,
-        method[2:] if method[0:2] == "bi" else method,
-        "align_corners" if align_corners else "asymmetric",
-    )
-    for target, dev in tvm.testing.enabled_targets():
-        out = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-        tvm.testing.assert_allclose(out.numpy(), ref, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_upsampling():
-    _test_upsampling("NCHW", "nearest_neighbor")
-    _test_upsampling("NCHW", "bilinear", True)
-    _test_upsampling("NHWC", "nearest_neighbor")
-    _test_upsampling("NHWC", "bilinear", True)
-
-
-def _test_upsampling3d(layout, method, coordinate_transformation_mode="half_pixel"):
-    n, c, d, h, w = te.size_var("n"), 8, 16, 16, 16
-    scale_d = 2.0
-    scale_h = 2.0
-    scale_w = 2.0
-    dtype = "float32"
-
-    def get_shape():
-        if layout == "NCDHW":
-            return (c, d, h, w), (
-                c,
-                int(round(d * scale_d)),
-                int(round(h * scale_h)),
-                int(round(w * scale_w)),
-            )
-        else:
-            return (d, h, w, c), (
-                int(round(d * scale_d)),
-                int(round(h * scale_h)),
-                int(round(w * scale_w)),
-                c,
-            )
-
-    ishape, oshape = get_shape()
-    x = relay.var("x", relay.TensorType((n,) + ishape, dtype))
-    y = relay.nn.upsampling3d(
-        x,
-        scale_d=scale_d,
-        scale_h=scale_h,
-        scale_w=scale_w,
-        layout=layout,
-        method=method,
-        coordinate_transformation_mode=coordinate_transformation_mode,
-    )
-
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n,) + oshape, dtype)
-    dshape = (1,) + ishape
-    x = relay.var("x", shape=dshape)
-    y = relay.nn.upsampling3d(
-        x,
-        scale_d=scale_d,
-        scale_h=scale_h,
-        scale_w=scale_w,
-        layout=layout,
-        method=method,
-        coordinate_transformation_mode=coordinate_transformation_mode,
-    )
-    func = relay.Function([x], y)
-
-    data = np.random.uniform(size=dshape).astype(dtype)
-    ref = tvm.topi.testing.resize3d_python(
-        data,
-        (scale_d, scale_h, scale_w),
-        layout,
-        method[3:] if method[0:3] == "tri" else method,
-        coordinate_transformation_mode,
-    )
-    for target, dev in tvm.testing.enabled_targets():
-        out = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-        tvm.testing.assert_allclose(out.numpy(), ref, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_upsampling3d():
-    _test_upsampling3d("NCDHW", "nearest_neighbor", "asymmetric")
-    _test_upsampling3d("NCDHW", "trilinear", "align_corners")
-    _test_upsampling3d("NDHWC", "nearest_neighbor", "asymmetric")
-    _test_upsampling3d("NDHWC", "trilinear", "align_corners")
-
-
-@tvm.testing.requires_x86
-@pytest.mark.skipif(tvm.target.codegen.llvm_version_major() < 8, reason="Requires LLVM 8")
-class TestConv2DInt8Intrinsics:
-    supported_targets = [
-        "llvm -mcpu=nehalem",
-        "llvm -mcpu=core-avx2",
-        "llvm -mcpu=skylake-avx512",
-        "llvm -mcpu=cascadelake",
-    ]
-
-    unsupported_targets = [
-        "llvm -mcpu=x86-64",
-    ]
-
-    data_layout, kernel_layout = tvm.testing.parameters(
-        ("NCHW", "OIHW"),
-        # TODO(@anijain2305, @icemelon9): disable conv2d_int8 for NHWC data layout.
-        #   Re-enable this after adding conv2d_NCHWc_int8 support for NHWC.
-        # ("NHWC", "HWIO"),
-    )
-
-    input_channels, output_channels = tvm.testing.parameters(
-        # Sweep the input channels to check int8 robustness
-        # Input channels should be a multiple of 4 internally.
-        (1, 16),
-        (4, 16),
-        (6, 16),
-        # Sweep the output channels to check int8 robustness
-        # Output channels should be a multiple of 16 internally.
-        (8, 4),
-        (8, 16),
-        (8, 20),
-        # Check that both non-divisible oc and ic work
-        (17, 29),
-    )
-
-    @tvm.testing.fixture
-    def fast_int8_intrinsic(self, target):
-        if "nehalem" in target or "core-avx2" in target or "skylake-avx512" in target:
-            return "pmaddubs"
-        elif "cascadelake" in target:
-            return "vpdpbusd"
-        else:
-            assert False, "Target should be Nehalem or core-avx2 or Skylake or Cascadelake"
-
-    @tvm.testing.fixture
-    def assembly(
-        self,
-        target,
-        dtypes,
-        input_channels,
-        output_channels,
-        data_layout,
-        kernel_layout,
-    ):
-        if (
-            input_channels == 17
-            and output_channels == 29
-            and target == "llvm -mcpu=x86-64"
-            and tvm.target.codegen.llvm_version_major() in [16, 17]
-        ):
-            pytest.skip(
-                "Non divisible dims does not produce vectorized code when 15 < LLVM Version < 18."
-            )
-
-        input_dtype, weight_dtype, output_dtype = dtypes
-
-        image_size = (64, 64)
-        kernel_size = (3, 3)
-        batch_size = 1
-
-        h, w = image_size
-
-        if data_layout == "NCHW":
-            data_shape = (batch_size, input_channels, *image_size)
-        elif data_layout == "NHWC":
-            data_shape = (batch_size, *image_size, input_channels)
-        else:
-            raise ValueError(f"Unsupported data layout: {data_layout}")
-        x = relay.var("x", relay.TensorType(data_shape, input_dtype))
-
-        if kernel_layout == "OIHW":
-            kernel_shape = (output_channels, input_channels, *kernel_size)
-        elif kernel_layout == "HWIO":
-            kernel_shape = (*kernel_size, input_channels, output_channels)
-        else:
-            raise ValueError("Not supported")
-        weight = relay.var("weight", relay.TensorType(kernel_shape, weight_dtype))
-
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            kernel_size=kernel_size,
-            channels=output_channels,
-            padding=(0, 0, 0, 1),
-            dilation=(1, 1),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_dtype=output_dtype,
-        )
-
-        func = relay.Function([x, weight], y)
-
-        wdata = np.random.rand(*kernel_shape) * 10
-        parameters = {"weight": tvm.nd.array(wdata.astype(weight_dtype))}
-
-        with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build(func, target, params=parameters)
-
-        return lib.get_source("asm")
-
-    # Ensure that code uses the fast int8 instructions when available.
-    @tvm.testing.parametrize_targets(*supported_targets)
-    @pytest.mark.parametrize(
-        "dtypes",
-        [
-            # compile conv2d for x86 (skylake, cascadelake) and test
-            # assembly contains *pmadd* instructions
-            ("uint8", "int8", "int32"),
-            # Check that int8 x int8 goes through legalization so that
-            # fast instructions can be picked up.
-            ("int8", "int8", "int32"),
-        ],
-    )
-    def test_uses_intrinsic(
-        self,
-        fast_int8_intrinsic,
-        assembly,
-    ):
-        assert fast_int8_intrinsic in assembly
-
-    # For datatypes that don't have HW support, ensure that code is
-    # generated without the fast int8 intrinsic.
-    @tvm.testing.parametrize_targets(*supported_targets)
-    @pytest.mark.parametrize("dtypes", [("uint8", "uint8", "int32")])
-    def test_no_intrinsic(
-        self,
-        fast_int8_intrinsic,
-        assembly,
-    ):
-        assert fast_int8_intrinsic not in assembly
-
-    # Check that a vectorized instruction is generated for older Intel
-    # generations, because we default to NCHWc layout.
-    @tvm.testing.parametrize_targets(*unsupported_targets)
-    @pytest.mark.parametrize("dtypes", [("uint8", "int8", "int32")])
-    def test_uses_vectorized_instruction(self, assembly):
-        assert "pmulhw" in assembly or "pmaddwd" in assembly
-        assert "paddd" in assembly
-
-
-@tvm.testing.uses_gpu
-def test_depthwise_conv2d_int8():
-    input_dtype = "uint8"
-    weight_dtype = "int8"
-    output_dtype = "int32"
-
-    data_shape = (1, 64, 56, 56)
-    x = relay.var("x", relay.TensorType(data_shape, input_dtype))
-
-    kernel_shape = (64, 1, 3, 3)
-    weight = relay.var("weight", relay.TensorType(kernel_shape, weight_dtype))
-
-    y = relay.nn.conv2d(
-        x,
-        weight,
-        kernel_size=(3, 3),
-        groups=64,
-        padding=(1, 1),
-        dilation=(1, 1),
-        out_dtype=output_dtype,
-    )
-    func = relay.Function([x, weight], y)
-    wdata = np.random.rand(*kernel_shape) * 10
-    parameters = {"weight": tvm.nd.array(wdata.astype(weight_dtype))}
-
-    targets = [
-        "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512",
-        "llvm -mtriple=x86_64-linux-gnu -mcpu=cascadelake",
-    ]
-    llvm_version = tvm.target.codegen.llvm_version_major()
-    for target in targets:
-        if llvm_version >= 8:
-            with tvm.transform.PassContext(opt_level=3):
-                graph, lib, params = relay.build(func, target, params=parameters)
-
-
-@tvm.testing.uses_gpu
-def test_bitserial_conv2d_infer_type():
-    # Basic shape test with ambiguous batch.
-    n, c, h, w = te.size_var("n"), 32, 224, 224
-    x = relay.var("x", relay.ty.TensorType((n, c, h, w), "int16"))
-    w = relay.var("w", relay.ty.TensorType((32, 32, 3, 3), "int16"))
-    y = relay.nn.bitserial_conv2d(x, w, kernel_size=(3, 3), padding=(0, 0), channels=32)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 32, 222, 222), "int16")
-
-
-@tvm.testing.uses_gpu
-def test_bitpack_infer_type():
-    # Test axis packing shape inference.
-    o, i, h, w = 32, 32, 128, 128
-    x = relay.var("x", relay.ty.TensorType((o, i, h, w), "int16"))
-    y = relay.nn.bitpack(x, bit_axis=4, pack_axis=1, pack_type="uint16", bits=1)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((32, 2, 128, 128, 1), "uint16")
-
-
-# TODO(@jwfromm): Need to add bitserial_conv2d & bitpack run test cases
-
-
-@tvm.testing.uses_gpu
-def test_correlation():
-    def _test_correlation(
-        data_shape,
-        kernel_size,
-        max_displacement,
-        stride1,
-        stride2,
-        padding,
-        is_multiply,
-        dtype="float32",
-    ):
-        data1 = relay.var("data1", relay.ty.TensorType(data_shape, dtype))
-        data2 = relay.var("data2", relay.ty.TensorType(data_shape, dtype))
-        y = relay.nn.correlation(
-            data1,
-            data2,
-            kernel_size,
-            max_displacement,
-            stride1,
-            stride2,
-            padding,
-            is_multiply,
-            "NCHW",
-        )
-        yy = run_infer_type(y)
-        padded_height = data_shape[2] + 2 * padding
-        padded_width = data_shape[3] + 2 * padding
-        border_size = (kernel_size - 1) // 2 + max_displacement
-        displacement_radius = max_displacement // stride2
-        out_channel = ((2 * displacement_radius) + 1) ** 2
-        out_height = (padded_height - 2 * border_size + stride1 - 1) // stride1
-        out_width = (padded_width - 2 * border_size + stride1 - 1) // stride1
-        assert yy.checked_type == relay.TensorType(
-            (data_shape[0], out_channel, out_height, out_width), dtype
-        )
-        func = relay.Function([data1, data2], y)
-        data1_np = np.random.uniform(size=data_shape).astype(dtype)
-        data2_np = np.random.uniform(size=data_shape).astype(dtype)
-        ref_res = tvm.topi.testing.correlation_nchw_python(
-            data1_np,
-            data2_np,
-            kernel_size,
-            max_displacement,
-            stride1,
-            stride2,
-            padding,
-            is_multiply,
-        )
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                data1_np, data2_np
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    _test_correlation(
-        (1, 3, 10, 10),
-        kernel_size=1,
-        max_displacement=4,
-        stride1=1,
-        stride2=1,
-        padding=4,
-        is_multiply=True,
-    )
-    _test_correlation(
-        (1, 3, 10, 10),
-        kernel_size=1,
-        max_displacement=5,
-        stride1=1,
-        stride2=1,
-        padding=5,
-        is_multiply=True,
-    )
-    _test_correlation(
-        (5, 1, 4, 4),
-        kernel_size=3,
-        max_displacement=1,
-        stride1=2,
-        stride2=1,
-        padding=2,
-        is_multiply=True,
-    )
-    _test_correlation(
-        (5, 1, 6, 4),
-        kernel_size=3,
-        max_displacement=1,
-        stride1=2,
-        stride2=2,
-        padding=2,
-        is_multiply=False,
-    )
-    _test_correlation(
-        (5, 1, 11, 11),
-        kernel_size=5,
-        max_displacement=1,
-        stride1=1,
-        stride2=1,
-        padding=2,
-        is_multiply=False,
-    )
-
-
-@pytest.mark.skip("Requires GFX10 AMDGPU")
-def test_conv2d_rocm_sdot4():
-    d_shape = (1, 64, 56, 56)
-    w_shape = (64, 64, 3, 3)
-    padding = (1, 1)
-    strides = (1, 1)
-    data_dtype = "int8"
-    weight_dtype = "int8"
-    out_dtype = "int32"
-
-    data = relay.var("data", shape=d_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=w_shape, dtype=weight_dtype)
-    out_channel = w_shape[0]
-    conv2d = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=w_shape[2:],
-        channels=out_channel,
-        padding=padding,
-        strides=strides,
-        out_dtype=out_dtype,
-    )
-
-    mod = tvm.IRModule.from_expr(conv2d)
-
-    data_np = np.random.uniform(1, 10, d_shape).astype("int8")
-    weight_np = np.random.uniform(1, 10, size=w_shape).astype("int8")
-
-    target = "rocm -mattr=+dotprod"
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target, params={"weight": weight_np})
-
-    asm = lib.lib.imported_modules[0].get_source("asm")
-    assert "v_dot4_i32_i8" in asm
-
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    runtime.set_input("data", data_np)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-
-    ref = tvm.topi.testing.conv2d_nchw_python(
-        data_np.astype("int32"), weight_np.astype("int32"), strides, padding
-    )
-
-    np.testing.assert_equal(out, ref)
-
-
-def np_float2tvm_bf16(arr):
-    """Convert a numpy array of float to a TVM array
-    of bf16"""
-    orig = arr.view("<u4")
-    bias = np.bitwise_and(np.right_shift(orig, 16), 1) + 0x7FFF
-    nparr = np.right_shift(orig + bias, 16).astype("uint16")
-    return tvm.nd.empty(nparr.shape, "bfloat16").copyfrom(nparr)
-
-
-def np_bf162np_float(arr):
-    """Convert a numpy array of bf16 (uint16) to a numpy array
-    of float"""
-    u32 = np.left_shift(arr.astype("uint32"), 16)
-    return u32.view("<f4")
-
-
-@tvm.testing.requires_x86
-def test_conv2d_nchw_dnnl():
-    if not tvm.get_global_func("tvm.contrib.dnnl.conv2d", allow_missing=True):
-        print(
-            "skip because extern dnnl function is not available, \
-                built with dnnl=ON"
-        )
-        return
-    d_shape = (1, 64, 56, 56)
-    w_shape = (64, 64, 3, 3)
-    padding = (1, 1)
-    strides = (1, 1)
-
-    def get_subgraph(dtype):
-        data = relay.var("data", shape=d_shape, dtype=dtype)
-        weight = relay.var("weight", shape=w_shape, dtype=dtype)
-        out_channel = w_shape[0]
-        conv2d = relay.nn.conv2d(
-            data=data,
-            weight=weight,
-            kernel_size=w_shape[2:],
-            channels=out_channel,
-            padding=padding,
-            strides=strides,
-            out_dtype=dtype,
-        )
-        return conv2d
-
-    for t in ["float32", "bfloat16"]:
-        mod = tvm.IRModule.from_expr(get_subgraph(t))
-
-        data_np = np.random.uniform(1, 10, d_shape).astype("float32")
-        weight_np = np.random.uniform(1, 10, size=w_shape).astype("float32")
-        ref = tvm.topi.testing.conv2d_nchw_python(data_np, weight_np, strides, padding)
-
-        if t == "bfloat16":
-            data_np = np_float2tvm_bf16(data_np)
-            weight_np = np_float2tvm_bf16(weight_np)
-
-        target = "llvm -mcpu=skylake-avx512 -libs=dnnl"
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target, params={"weight": weight_np})
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        runtime.set_input("data", data_np)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-
-        if t == "bfloat16":
-            out = np_bf162np_float(out)
-            np.testing.assert_allclose(out, ref, rtol=1e-2)
-        else:
-            np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.requires_x86
-def test_conv2d_nhwc_dnnl():
-    if not tvm.get_global_func("tvm.contrib.dnnl.conv2d", allow_missing=True):
-        print(
-            "skip because extern dnnl function is not available, \
-                built with dnnl=ON"
-        )
-        return
-    d_shape = (1, 56, 56, 64)
-    w_shape = (3, 3, 64, 64)
-    padding = (1, 1)
-    strides = (1, 1)
-
-    def get_subgraph(dtype):
-        data = relay.var("data", shape=d_shape, dtype=dtype)
-        weight = relay.var("weight", shape=w_shape, dtype=dtype)
-        out_channel = w_shape[3]
-        conv2d = relay.nn.conv2d(
-            data=data,
-            weight=weight,
-            kernel_size=w_shape[:2],
-            channels=out_channel,
-            padding=padding,
-            strides=strides,
-            out_dtype=dtype,
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        return conv2d
-
-    for t in ["float32", "bfloat16"]:
-        mod = tvm.IRModule.from_expr(get_subgraph(t))
-
-        data_np = np.random.uniform(1, 10, d_shape).astype("float32")
-        weight_np = np.random.uniform(1, 10, size=w_shape).astype("float32")
-        ref = tvm.topi.testing.conv2d_nhwc_python(data_np, weight_np, strides, padding)
-
-        if t == "bfloat16":
-            data_np = np_float2tvm_bf16(data_np)
-            weight_np = np_float2tvm_bf16(weight_np)
-
-        target = "llvm -mcpu=skylake-avx512 -libs=dnnl"
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target=target, params={"weight": weight_np})
-
-        dev = tvm.device(target, 0)
-        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-        runtime.set_input("data", data_np)
-        runtime.run()
-
-        out = runtime.get_output(0).numpy()
-
-        if t == "bfloat16":
-            out = np_bf162np_float(out)
-            np.testing.assert_allclose(out, ref, rtol=1e-2)
-        else:
-            np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
-
-
-def _test_conv2d_int8_alter_dtype(data_dtype, target, dot_product_instrs):
-    def get_conv2d_nchw(
-        d_shape,
-        w_shape,
-        data_dtype,
-    ):
-        out_dtype = "int32"
-        strides = (1, 1)
-        padding = (1, 1)
-        data = relay.var("data", shape=d_shape, dtype=data_dtype)
-        weight = relay.var("weight", shape=w_shape, dtype="int8")
-        out_channel = w_shape[0]
-        return relay.nn.conv2d(
-            data=data,
-            weight=weight,
-            kernel_size=w_shape[2:],
-            channels=out_channel,
-            padding=padding,
-            strides=strides,
-            out_dtype=out_dtype,
-        )
-
-    I, O, H, W = 64, 64, 56, 56
-    kH = kW = 3
-
-    data_shape = (1, I, H, W)
-    weight_shape = (O, I, kH, kW)
-    bias_shape = (1, weight_shape[0], 1, 1)
-
-    bias = relay.var("bias", shape=bias_shape, dtype="int32")
-    bias_np = np.random.randint(low=-127, high=128, size=bias_shape).astype("int32")
-    weight_np = np.random.uniform(-32, 32, size=weight_shape).astype("int8")
-
-    conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype)
-    bias_add = relay.add(conv2d, bias)
-    mod = tvm.IRModule.from_expr(bias_add)
-
-    if data_dtype == "uint8":
-        data_np = np.random.uniform(0, 64, size=data_shape).astype("uint8")
-    else:
-        data_np = np.random.uniform(-32, 32, size=data_shape).astype("int8")
-
-    params = {"weight": weight_np, "bias": bias_np}
-
-    ref = (
-        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
-        .evaluate()(*[data_np, weight_np, bias_np])
-        .numpy()
-    )
-
-    dev = tvm.cpu(0)
-
-    with tvm.transform.PassContext(
-        opt_level=3,
-    ):
-        lib = relay.build(mod, target=target, params=params)
-
-    for dot_product_instr in dot_product_instrs:
-        assert dot_product_instr in lib.lib.get_source("asm")
-
-    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    rt_mod.set_input("data", data_np)
-
-    rt_mod.run()
-
-    out = rt_mod.get_output(0).numpy()
-
-    np.testing.assert_equal(out, ref)
-
-
-@tvm.testing.requires_arm_dot
-def test_conv2d_int8_alter_dtype_arm():
-    _test_conv2d_int8_alter_dtype(
-        "uint8", "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod", ["sdot"]
-    )
-
-
-@tvm.testing.requires_x86_vnni
-def test_conv2d_int8_alter_dtype_vnni():
-    _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", ["vpdpbusd"])
-
-
-@tvm.testing.requires_x86_avx512
-def test_conv2d_int8_alter_dtype_avx512():
-    _test_conv2d_int8_alter_dtype(
-        "int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"]
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
deleted file mode 100644
index df60393776f6..000000000000
--- a/tests/python/relay/test_op_level3.py
+++ /dev/null
@@ -1,2341 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level3 operator test cases.
-"""
-import sys
-from typing import Callable, Optional
-
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay, te
-from tvm.error import TVMError
-from tvm.relay import create_executor, transform
-from tvm.relay.testing import check_grad, run_infer_type
-
-from utils import ref_funcs
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-class TestZerosOnes:
-    config = {"zeros": (relay.zeros, np.zeros), "ones": (relay.ones, np.ones)}
-    op, ref = tvm.testing.parameters(*config.values(), ids=config.keys())
-
-    def test_zeros_ones(self, op, ref):
-        y = op(shape=(124, 50), dtype="float64")
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType((124, 50), "float64")
-        intrp_res = create_executor().evaluate(y).numpy()
-        np.testing.assert_allclose(intrp_res, ref((124, 50), "float64"))
-
-
-class TestUnaryIdentity:
-    config = {
-        "zeros_like": (relay.zeros_like, np.zeros_like),
-        "ones_like": (relay.ones_like, np.ones_like),
-        "ceil": (relay.ceil, np.ceil),
-        "floor": (relay.floor, np.floor),
-        "trunc": (relay.trunc, np.trunc),
-        "round": (relay.round, np.round),
-        "abs": (relay.abs, np.abs),
-        "copy": (relay.copy, None),  # np.copy
-        "negative": (relay.negative, np.negative),
-        "sign": (relay.sign, np.sign),
-    }
-    op, ref = tvm.testing.parameters(*config.values(), ids=config.keys())
-
-    def test_unary_identity(self, op, ref):
-        shape = (8, 9, 4)
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = op(x)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType(shape, "float32")
-
-        if ref is not None:
-            data = np.random.rand(*shape).astype("float32")
-            op_res = create_executor().evaluate(y, {x: relay.const(data)})
-            ref_res = ref(data)
-            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-
-def test_cast():
-    x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
-    y = x.astype("int32")
-    yy = run_infer_type(y)
-    assert "dtype=" in yy.astext()
-    assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
-
-    x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
-    y = relay.cast(x, "int32")
-    yy = run_infer_type(y)
-    assert "dtype=" in yy.astext()
-    assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
-
-
-def test_sliding_window():
-    # Slide a window of shape (3, 4, 5) over the x tensor, beginning with
-    # dimension 1, which slides the window over the two subtensors of shape (3,
-    # 32, 32).
-    x = relay.var("x", relay.TensorType((2, 3, 32, 32), "float32"))
-    y = relay.sliding_window(x, 1, [3, 4, 5], [1, 2, 3])
-
-    # The resulting shape still has batch size 2. Each dimension in (1, 15, 10)
-    # represents the locations where we were able to form a window; that is, we
-    # were able to place the window in one place along the dimension of length
-    # 3, 15 places along the dimension of length 32 (when striding by 2), and 10
-    # places along the second dimension of length 32 (when striding by 3). The
-    # remaining dimensions (3, 4, 5) represent the formed windows.
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((2, 1, 15, 10, 3, 4, 5), "float32")
-
-    data = np.random.rand(2, 3, 32, 32).astype("float32")
-    intrp = create_executor()
-    result = intrp.evaluate(y, {x: relay.const(data)})
-    result_np = result.numpy()
-    assert result_np.shape == (2, 1, 15, 10, 3, 4, 5)
-    assert np.array_equal(result_np[0, 0, 0, 0, :, :, :], data[0, :, 0:4, 0:5])
-    assert np.array_equal(result_np[1, 0, 7, 3, :, :, :], data[1, :, 14:18, 9:14])
-    assert np.array_equal(result_np[1, 0, 14, 9, :, :, :], data[1, :, 28:32, 27:32])
-
-
-def test_clip():
-    a = relay.var("a", relay.TensorType((10, 4), "float32"))
-    y = relay.clip(a, 1.0, 4.0)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((10, 4), "float32")
-
-    data = np.random.rand(10, 4).astype("float32")
-    op_res = create_executor().evaluate(y, {a: relay.const(data)})
-    ref_res = np.clip(data, 1.0, 4.0)
-    np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-
-def test_fixed_point_multiply():
-    # Test 23 * 1/16
-    # [m,s] = [0.5, -3] = frexp(1/16)
-    # M = 0.5*2^31 = 1073741824
-    # so M = 1073741824 and s = -3
-
-    a = relay.var("a", relay.TensorType((10, 4), "int32"))
-    y = relay.fixed_point_multiply(a, 1073741824, -3)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((10, 4), "int32")
-
-    data = 23 * np.ones((10, 4)).astype("int32")
-    op_res = create_executor().evaluate(y, {a: relay.const(data)})
-    ref_res = np.ones((10, 4)).astype("int32")
-    np.testing.assert_allclose(op_res.numpy(), ref_res, atol=1)
-
-
-def test_reinterpret():
-    a = relay.var("a", relay.TensorType((1000, 4), "float32"))
-    y = relay.reinterpret(a, "int32")
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1000, 4), "int32")
-
-    data = np.random.randn(1000, 4).astype("float32") * 1000
-    op_res = create_executor().evaluate(y, {a: relay.const(data)})
-    ref_res = data.view("int32")
-    np.testing.assert_equal(op_res.numpy(), ref_res)
-
-
-def test_approximate_transcendental():
-    def C(x):
-        return relay.expr.const(x, "float32")
-
-    def approx_exp(x):
-        # An approximation derived from Opus,
-        # https://github.com/xiph/opus/blob/c1c247/celt/mathops.h#L147-L165
-        x = relay.minimum(relay.maximum(x, C(-88.0)), C(88.0))
-        x = C(127.0) + x * C(1.44269504)
-        xf = relay.floor(x)
-        i = relay.cast(xf, "int32")
-        x = x - xf
-        Y = C(0.99992522) + x * (C(0.69583354) + x * (C(0.22606716) + x * C(0.078024523)))
-        exponent = relay.left_shift(i, relay.expr.const(23, "int32"))
-        exponent = relay.reinterpret(exponent, "float32")
-        return exponent * Y
-
-    def approximate_sigmoid(x):
-        y = approx_exp(x)
-        return y / (y + C(1.0))
-
-    def approximate_tanh(x):
-        x = x * C(2.0)
-        y = approx_exp(x)
-        return (y - C(1.0)) / (y + C(1.0))
-
-    a = relay.var("a", relay.TensorType((1000,), "float32"))
-    y = approximate_sigmoid(a)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1000,), "float32")
-    data = np.linspace(-5, 5, 1000).astype("float32")
-    op_res = create_executor().evaluate(y, {a: relay.const(data)})
-
-    def reference_sigmoid(x):
-        return np.exp(-np.logaddexp(0, -x))
-
-    np.testing.assert_allclose(op_res.numpy(), reference_sigmoid(data), atol=2e-5, rtol=1e-9)
-
-    y = approximate_tanh(a)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1000,), "float32")
-    data = np.linspace(-5, 5, 1000).astype("float32")
-    op_res = create_executor().evaluate(y, {a: relay.const(data)})
-
-    def reference_tanh(x):
-        return np.tanh(x)
-
-    np.testing.assert_allclose(op_res.numpy(), reference_tanh(data), atol=4e-5, rtol=1e-9)
-
-
-class TestSqueeze:
-    shape, dtype, axis = tvm.testing.parameters(
-        ((1, 3, 2, 5), "float32", None),
-        ((1, 3, 1), "float32", [0]),
-        ((1, 2, 1, 2, 1), "float32", [0, 2]),
-        ((1, 3, 1), "float32", 2),
-        ((1, 3, 1), "float32", []),
-    )
-
-    def test_squeeze(self, shape, dtype, axis):
-        x = relay.var("x", relay.TensorType(shape, dtype))
-        squeeze = relay.squeeze(x, axis=axis)
-
-        if isinstance(axis, int):
-            np_axis = (axis,)
-        else:
-            np_axis = tuple(axis) if axis is not None else None
-
-        data = np.random.random_sample(shape).astype(dtype)
-        op_res = create_executor().evaluate(squeeze, {x: relay.const(data)})
-        ref_res = np.squeeze(data, axis=np_axis)
-        np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-
-def test_transpose_infer_type():
-    n, t, d = te.size_var("n"), te.size_var("t"), 100
-    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
-    y = relay.transpose(x, axes=(1, 0, 2))
-    assert "axes=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((t, n, 100), "float32")
-
-    y = relay.transpose(x)
-    assert "axes=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((100, t, n), "float32")
-
-
-def test_transpose(target, dev, executor_kind):
-    dshape = (2, 3, 4)
-    axes = (0, 2, 1)
-
-    x = relay.var("x", relay.TensorType(dshape, "float32"))
-    z = relay.transpose(x, axes=axes)
-
-    func = relay.Function([x], z)
-    x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
-    ref_res = np.transpose(x_data, axes=axes)
-
-    op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_data)
-    tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-def test_squeeze_infer_type():
-    n, t, d = 1, 4, 1
-    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
-    y = relay.squeeze(x, axis=(2,))
-    assert "axis=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1, 4), "float32")
-
-    n, t, d = 1, 4, 1
-    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
-    y = relay.squeeze(x)
-    assert "axis=" not in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((4,), "float32")
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_squeeze_bad_axes_infer_type():
-    n, t, d = 1, 4, 1
-    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
-    y = relay.squeeze(x, axis=(1,))
-    yy = run_infer_type(y)
-
-
-def test_reshape_infer_type():
-    n, t, d1, d2 = 10, 20, 100, 20
-    x = relay.var("x", relay.TensorType((n, t, d1, d2), "float32"))
-    y = relay.reshape(x, newshape=(n, t, 2000))
-    assert "newshape=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, t, 2000), "float32")
-
-
-class TestReshape:
-    shape, newshape, oshape = tvm.testing.parameters(
-        ((2, 3, 4), (8, 3), (8, 3)),
-        ((4, 7), (2, 7, 2), (2, 7, 2)),
-        ((2, 3, 4), (4, 0, 2), (4, 3, 2)),
-        ((2, 3, 4), (2, 0, 0), (2, 3, 4)),
-        ((2, 3, 4), (0, -1), (2, 12)),
-        ((2, 3, 4), (-1, 0), (8, 3)),
-        ((2, 3, 4), (2, -2), (2, 3, 4)),
-        ((2, 3, 4), (-2, 1, 1), (2, 3, 4, 1, 1)),
-        ((2, 3, 4), (-3, 4), (6, 4)),
-        ((2, 3, 4, 5), (-3, -3), (6, 20)),
-        ((2, 3, 4), (0, -3), (2, 12)),
-        ((2, 3, 4), (-3, -2), (6, 4)),
-        ((2, 3, 4), (-4, 1, 2, -2), (1, 2, 3, 4)),
-        ((2, 3, 4), (2, -4, -1, 3, -2), (2, 1, 3, 4)),
-        ((1,), (), ()),
-    )
-
-    def test_reshape(self, target, dev, executor_kind, shape, newshape, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        z = relay.reshape(x, newshape=newshape)
-        zz = run_infer_type(z)
-        assert "newshape=" in z.astext()
-        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
-
-        func = relay.Function([x], z)
-        check_grad(func)
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        ref_res = np.reshape(x_data, oshape)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-def test_reshape_fail():
-    with pytest.raises(TVMError) as reshape_err:
-        x = relay.var("x", relay.TensorType([2, 3], "float32"))
-        z = relay.reshape(x, [7])
-        zz = run_infer_type(z)
-
-
-def test_reshape_like_infer_type():
-    # concrete shape
-    x = relay.var("x", relay.TensorType((1, 2, 3), "float32"))
-    y = relay.var("y", relay.TensorType((1, 6), "float32"))
-    z = relay.reshape_like(x, y)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((1, 6), "float32")
-
-    # symbolic shape
-    n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.var("y", relay.TensorType((1, 8, 8), "float32"))
-    z = relay.reshape_like(x, y)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((1, 8, 8), "float32")
-
-    # partial reshaping
-    x = relay.var("x", relay.TensorType((1, 2, 3, 4), "float32"))
-    y = relay.var("y", relay.TensorType((1, 6, 5), "float32"))
-    z = relay.reshape_like(x, y, lhs_begin=1, lhs_end=3, rhs_begin=1, rhs_end=2)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((1, 6, 4), "float32")
-
-    x = relay.var("x", relay.TensorType((1, 2, 3, 4), "float32"))
-    y = relay.var("y", relay.TensorType((2, 3, 4, 1, 6), "float32"))
-    z = relay.reshape_like(x, y, rhs_end=3)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((2, 3, 4), "float32")
-    z = relay.reshape_like(x, y, rhs_begin=2)
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((4, 1, 6), "float32")
-
-    # symbolic partial reshaping
-    n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.var("y", relay.TensorType((5, 6), "float32"))
-    z = relay.var("z", relay.TensorType((4,), "float32"))
-    w = relay.reshape_like(x, y, lhs_end=3)
-    w = relay.reshape_like(w, z, lhs_begin=2)
-    w = run_infer_type(w)
-    assert w.checked_type == relay.TensorType((5, 6, 4), "float32")
-
-
-class TestReshapeLike:
-    shape, oshape, shape_like, reshape_like_kwargs = tvm.testing.parameters(
-        ((2, 3, 4), (1, 8, 3), None, {}),
-        ((4, 7), (2, 7, 2), None, {}),
-        ((1, 2, 3, 4), (1, 6, 4), (1, 6, 5), dict(lhs_begin=1, lhs_end=3, rhs_begin=1, rhs_end=2)),
-    )
-
-    def test_reshape_like(
-        self, target, dev, executor_kind, shape, oshape, shape_like=None, reshape_like_kwargs={}
-    ):
-        if shape_like is None:
-            shape_like = oshape
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=shape_like).astype("float32")
-        ref_res = np.reshape(x_data, oshape)
-
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("x", relay.TensorType(shape_like, "float32"))
-        z = relay.reshape_like(x, y, **reshape_like_kwargs)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32")
-
-        func = relay.Function([x, y], z)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, y_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestTakeInferType:
-    d1, d2, d3 = te.var("d1"), te.var("d2"), te.var("d3")
-    d4, d5, d6 = te.var("d4"), te.var("d5"), te.var("d6")
-    dshape, indices_shape, oshape, axis = tvm.testing.parameters(
-        ((d1,), (1,), (1,), 0),
-        ((4,), (d1, d2), (d1, d2), None),
-        ((3, 3, 3), (1, d2), (1, d2), None),
-        ((d1, d2), (d3, d4, d5), (d3, d4, d5, d2), 0),
-        ((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1),
-        ((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2),
-    )
-
-    def test_take(self, dshape, indices_shape, oshape, axis):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        indices = relay.var("indices", relay.TensorType(indices_shape, "int32"))
-        y = relay.take(x, indices, axis=axis)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType(oshape, "float32")
-
-
-class TestTake:
-    src_shape, indices_src, axis, mode, indices_dtype = tvm.testing.parameters(
-        ((4,), [1], None, "clip", "int32"),
-        ((4,), [[0, 1, 2, 3]], None, "clip", "int32"),
-        ((3, 3, 3), [[11, 25]], None, "clip", "int32"),
-        ((4,), [[0, 1], [2, 3]], None, "clip", "int32"),
-        ((4,), [1], 0, "clip", "int32"),
-        ((2, 2), [[[1, 0], [0, 1]]], 0, "clip", "int32"),
-        ((2, 2), [[[1, 0], [0, 1]]], 1, "clip", "int32"),
-        ((4, 3, 5, 6), [[2, 1, 0, 0]], -2, "clip", "int32"),
-        ((3, 4), [-5, 20], None, "clip", "int32"),
-        ((3, 4), [-5, 20], None, "wrap", "int32"),
-        ((3, 4), [-1, 2], 0, "clip", "int32"),
-        ((3, 4), [-1, 2], 0, "wrap", "int32"),
-        ((3, 4), [-1, 2], 1, "clip", "int32"),
-        ((3, 4), [-1, 2], 1, "wrap", "int32"),
-        ((3, 3, 3), [[11, 25]], None, "fast", "int32"),
-        ((3, 4), [0, 2], 0, "fast", "int32"),
-        ((3, 4), [0, 2], 1, "fast", "int32"),
-        ((3, 4), [1, 2], 1, "clip", "uint32"),
-        ((3, 4), [1, 2], 1, "wrap", "uint16"),
-        ((3, 3, 3), [1, 2], None, "fast", "uint16"),
-        ((3, 4), [0, 2], 0, "fast", "uint8"),
-    )
-
-    # Incorrect numeric output in some cases on vulkan
-    @tvm.testing.known_failing_targets("vulkan")
-    def test_take(
-        self, target, dev, executor_kind, src_shape, indices_src, axis, mode, indices_dtype
-    ):
-        src_dtype = "float32"
-        indices_src = np.array(indices_src, dtype=indices_dtype)
-        x = relay.var("x", relay.TensorType(src_shape, src_dtype))
-        indices = relay.var("indices", relay.TensorType(indices_src.shape, indices_dtype))
-        z = relay.take(x, indices, axis=axis, mode=mode)
-
-        func = relay.Function([x, indices], z)
-        x_data = np.random.uniform(low=-1, high=1, size=src_shape).astype(src_dtype)
-        np_mode = "raise" if mode == "fast" else mode
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, indices_src
-        )
-
-        # Old versions of numpy has take internally cast inside take which may violate
-        # safety rules. We have such version in i386 CI image.
-        indices_src = indices_src.astype("int32")
-        ref_res = np.take(x_data, indices=indices_src, axis=axis, mode=np_mode)
-
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestSplitInferType:
-    idxd = tvm.tir.indexdiv
-
-    d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4")
-    axis = te.var("axis")
-
-    dshape, indices_or_sections, ret_type, axis = tvm.testing.parameters(
-        (
-            (5, 5, 2, 2),
-            5,
-            relay.ty.TupleType(
-                tvm.runtime.convert(
-                    [
-                        relay.ty.TensorType((5, 1, 2, 2), "float32"),
-                        relay.ty.TensorType((5, 1, 2, 2), "float32"),
-                        relay.ty.TensorType((5, 1, 2, 2), "float32"),
-                        relay.ty.TensorType((5, 1, 2, 2), "float32"),
-                        relay.ty.TensorType((5, 1, 2, 2), "float32"),
-                    ]
-                )
-            ),
-            1,
-        ),
-        (
-            (5, 5, 2, 2),
-            5,
-            relay.ty.TupleType(
-                tvm.runtime.convert(
-                    [
-                        relay.ty.TensorType((1, 5, 2, 2), "float32"),
-                        relay.ty.TensorType((1, 5, 2, 2), "float32"),
-                        relay.ty.TensorType((1, 5, 2, 2), "float32"),
-                        relay.ty.TensorType((1, 5, 2, 2), "float32"),
-                        relay.ty.TensorType((1, 5, 2, 2), "float32"),
-                    ]
-                )
-            ),
-            0,
-        ),
-        (
-            (d1, d2, d3, d4),
-            4,
-            relay.ty.TupleType(
-                tvm.runtime.convert(
-                    [
-                        relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"),
-                        relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"),
-                        relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"),
-                        relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"),
-                    ]
-                )
-            ),
-            2,
-        ),
-        (
-            (d1, d2, d3, d4),
-            2,
-            relay.ty.TupleType(
-                tvm.runtime.convert(
-                    [
-                        relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"),
-                        relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"),
-                    ]
-                )
-            ),
-            0,
-        ),
-        (
-            (d1, d2, d3, d4),
-            (2, 4, 7),
-            relay.ty.TupleType(
-                tvm.runtime.convert(
-                    [
-                        relay.ty.TensorType((d1, 2, d3, d4), "float32"),
-                        relay.ty.TensorType((d1, 2, d3, d4), "float32"),
-                        relay.ty.TensorType((d1, 3, d3, d4), "float32"),
-                        relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32"),
-                    ]
-                )
-            ),
-            1,
-        ),
-        (
-            (d1, d2, d3, d4),
-            tuple(np.array([2, 4, 7]).astype(np.int64)),
-            relay.ty.TupleType(
-                tvm.runtime.convert(
-                    [
-                        relay.ty.TensorType((d1, 2, d3, d4), "float32"),
-                        relay.ty.TensorType((d1, 2, d3, d4), "float32"),
-                        relay.ty.TensorType((d1, 3, d3, d4), "float32"),
-                        relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32"),
-                    ]
-                )
-            ),
-            1,
-        ),
-    )
-
-    def test_split(self, dshape, indices_or_sections, ret_type, axis):
-        x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
-        y = relay.split(x, indices_or_sections, axis=axis)
-        yy = run_infer_type(y.astuple())
-        assert yy.checked_type == ret_type
-
-
-def test_full_infer_type():
-    # default settings: match input dtype
-    x = relay.var("x", relay.TensorType((), "int8"))
-    y = relay.full(x, ())
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((), "int8")
-
-    # change the shape and dtype
-    x = relay.var("x", relay.TensorType((), "float32"))
-    y = relay.full(x, (1, 2), "int8")
-    assert "shape=" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1, 2), "int8")
-
-
-class TestFull:
-    fill_value, arr_shape, dtype = tvm.testing.parameters(
-        (4, (1, 3, 4, 4), "int32"),
-        (4, (1, 3, 4, 4), "int64"),
-        (4.0, (1, 4), "float32"),
-    )
-
-    def test_full(self, target, dev, executor_kind, fill_value, arr_shape, dtype):
-        x = relay.var("x", relay.scalar_type(dtype))
-        z = relay.full(x, arr_shape, dtype)
-        func = relay.Function([x], z)
-        ref_res = np.full(arr_shape, fill_value, dtype=dtype)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            np.array(fill_value, dtype)
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    def test_full_like(self, target, dev, executor_kind, arr_shape, fill_value, dtype):
-        x_data = np.random.uniform(low=-1, high=1, size=arr_shape).astype(dtype)
-        x = relay.var("x", relay.TensorType(arr_shape, dtype))
-        y = relay.var("y", relay.scalar_type(dtype))
-        z = relay.full_like(x, y)
-
-        func = relay.Function([x, y], z)
-        ref_res = np.full_like(x_data, fill_value)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, np.array(fill_value, dtype)
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-def test_full_like_infer_type():
-    # concrete shape
-    base = relay.var("base", relay.TensorType((1, 2, 3), "float32"))
-    fill = relay.var("fill", relay.TensorType((), "float32"))
-    y = relay.full_like(base, fill)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((1, 2, 3), "float32")
-
-    # symbolic shape
-    n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
-    base = relay.var("base", relay.TensorType((n, c, h, w), "float32"))
-    fill = relay.var("fill", relay.TensorType((), "float32"))
-    y = relay.full_like(base, fill)
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
-
-
-def test_infer_type_leaky_relu(target, dev, executor_kind):
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    y = relay.nn.leaky_relu(x, alpha=0.1)
-    "alpha=0.1" in y.astext()
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
-
-    shape = (1, 5, 10, 10)
-    dtype = "float32"
-    x = relay.var("x", relay.TensorType(shape, dtype))
-    z = relay.nn.leaky_relu(x, alpha=0.1)
-    assert "alpha=0.1" in z.astext()
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType(shape, dtype)
-    func = relay.Function([x], z)
-    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
-    ref_res = np.where(x_data > 0, x_data, x_data * 0.1)
-
-    op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_data)
-    tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestInferTypePrelu:
-    dtype = tvm.testing.parameter("float32")
-
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    data, alpha, axis, output = tvm.testing.parameters(
-        ((n, c, h, w), (c,), 1, (n, c, h, w)),
-        ((n, h, w, c), (c,), 3, (n, h, w, c)),
-        ((n, c, h, w), None, 1, (n, c, h, w)),
-        ((n, h, w, c), None, 3, (n, h, w, c)),
-        ((1, 3, 2, 2), (3,), 1, (1, 3, 2, 2)),
-        ((1, 2, 2, 3), (3,), 3, (1, 2, 2, 3)),
-        ((1, 3, 2, 2), None, 1, (1, 3, 2, 2)),
-        ((1, 2, 2, 3), None, 3, (1, 2, 2, 3)),
-    )
-
-    def test_infer_type_prelu(self, target, dev, executor_kind, data, alpha, axis, output, dtype):
-        x = relay.var("data", relay.TensorType(data, dtype))
-        if alpha:
-            y = relay.var("alpha", relay.TensorType(alpha, dtype))
-        else:
-            y = relay.var("alpha", relay.IncompleteType())
-        z = relay.nn.prelu(x, y, axis=axis)
-        zz = run_infer_type(z)
-        if axis != 1:
-            assert "axis" in z.astext()
-        assert zz.checked_type == relay.ty.TensorType(output, dtype)
-        if not alpha:
-            axis = axis if axis else 1
-            alpha_shape = (data[axis],)
-            assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32")
-
-        if all(isinstance(v, tvm.tir.Var) == 1 for v in data) or not alpha:
-            return
-
-        func = relay.Function([x, y], z)
-        x_data = np.random.uniform(low=-1, high=1, size=data).astype(dtype)
-        a_data = np.random.uniform(low=-1, high=1, size=alpha).astype(dtype)
-
-        if axis == 1:
-            ref_res = (x_data < 0) * (x_data * a_data.reshape(3, 1, 1)) + (x_data >= 0) * x_data
-        else:
-            ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, a_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestArange:
-    dtype = tvm.testing.parameter("float32")
-
-    start, stop, step = tvm.testing.parameters(
-        (None, 20, None),
-        (None, 20, 2),
-        (1, 20, None),
-        (1, 20, 2),
-        # arange doesnt' support floating point right now, see type relation
-        # (1, 20, 1.5),
-        (1, 20.5, None),
-        (1, 20, 3),
-        (20, 1, -1),
-        # arange doesnt' support floating point right now, see type relation
-        # (20, 1, -1.5),
-    )
-
-    def test_arange(self, target, dev, executor_kind, start, stop, step, dtype):
-        if start is None and step is None:
-            x = relay.arange(relay.const(stop, dtype=dtype))
-            ref_res = np.arange(stop).astype(dtype)
-        elif start is None:
-            x = relay.arange(relay.const(stop, dtype=dtype), step=relay.const(step, dtype=dtype))
-            ref_res = np.arange(stop, step=step).astype(dtype)
-        elif step is None:
-            x = relay.arange(relay.const(start, dtype=dtype), relay.const(stop, dtype=dtype))
-            ref_res = np.arange(start, stop).astype(dtype)
-        else:
-            x = relay.arange(
-                relay.const(start, dtype=dtype),
-                relay.const(stop, dtype=dtype),
-                relay.const(step, dtype=dtype),
-            )
-            ref_res = np.arange(start, stop, step).astype(dtype)
-
-        func = relay.Function([], x)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)()
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestMeshgrid:
-    lengths, indexing = tvm.testing.parameters(
-        ([3, 5], "ij"),
-        ([4, 2], "xy"),
-        ([3, 5, 2], "ij"),
-        ([3, 1, 5], "xy"),
-        # Length 0 signifies scalar.
-        ([3, 5, 0], "ij"),
-    )
-
-    def test_meshgrid(self, target, dev, executor_kind, lengths, indexing="ij"):
-        input_vars = []
-        input_data = []
-        for i, length in enumerate(lengths):
-            input_name = "x_{}".format(i)
-            if length == 0:
-                # Scalar
-                input_vars.append(relay.var(input_name, relay.scalar_type("float32")))
-                input_data.append(np.array(1, "float32"))
-            else:
-                input_vars.append(relay.var(input_name, relay.TensorType((length,), "float32")))
-                input_data.append(np.arange(length).astype("float32"))
-
-        z = relay.meshgrid(input_vars, indexing=indexing).astuple()
-        func = relay.Function(input_vars, z)
-        # Get ref
-        ref_res = np.meshgrid(*input_data, indexing=indexing)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            *input_data
-        )
-        assert len(op_res) == len(ref_res)
-        for i in range(len(op_res)):
-            tvm.testing.assert_allclose(op_res[i].numpy(), ref_res[i], rtol=1e-5)
-
-
-class TestTile:
-    dshape, reps = tvm.testing.parameters(
-        ((2, 3, 4), (3, 2, 1)),
-        ((2, 3, 4), (1, 2)),
-        ((2, 3), (3, 2, 1)),
-    )
-
-    def test_tile(self, target, dev, executor_kind, dshape, reps):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.tile(x, reps=reps)
-
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
-        ref_res = np.tile(x_data, reps=reps)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestRepeat:
-    dshape, repeats, axis = tvm.testing.parameters(
-        ((3,), 2, 0),
-        ((3, 10), 2, -1),
-        ((3, 2, 4), 3, 1),
-    )
-
-    def test_repeat(self, target, dev, executor_kind, dshape, repeats, axis):
-        x = relay.Var("x", relay.TensorType(dshape, "float32"))
-        func = relay.Function([x], relay.repeat(x, repeats, axis))
-        data = np.random.uniform(size=dshape).astype("float32")
-        ref_res = np.repeat(data, repeats, axis)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestStack:
-    dshapes, axis = tvm.testing.parameters(
-        ([(2,), (2,), (2,)], -1),
-        ([(2,), (2,), (2,)], 0),
-        ([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1),
-        ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1),
-        ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4),
-    )
-
-    expr_type = tvm.testing.parameter("tuple", "list", "tuple_expr")
-
-    @tvm.testing.fixture
-    def ref_data(self, dshapes, axis):
-        np_in = [np.random.normal(size=shape).astype("float32") for shape in dshapes]
-        np_out = np.stack(np_in, axis=axis)
-        return np_in, np_out
-
-    @tvm.testing.fixture
-    def input_expr(self, dshapes, axis, expr_type, ref_data):
-        input_vars = [relay.var("input", relay.TensorType(shape, "float32")) for shape in dshapes]
-
-        if expr_type == "tuple":
-            input_expr = relay.Tuple(input_vars)
-
-        elif expr_type == "list":
-            input_expr = input_vars
-
-        elif expr_type == "tuple_expr":
-            # expression that evaluates to a tuple
-            # but is not a tuple literal
-            np_in, np_out = ref_data
-            x = relay.Var("x")
-            input_expr = relay.Let(x, relay.Tuple([relay.const(inp) for inp in np_in]), x)
-
-        else:
-            raise ValueError(f"Unknown expr_type '{expr_type}'")
-
-        return input_expr
-
-    def test_stack(self, target, dev, executor_kind, input_expr, ref_data, axis):
-        z = relay.stack(input_expr, axis=axis)
-        inp_vars = relay.analysis.free_vars(z)
-        func = relay.Function(inp_vars, z)
-
-        np_in, np_out = ref_data
-        relay_args = np_in if inp_vars else []
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            *relay_args
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-5)
-
-
-class TestReverse:
-    dshape, axis = tvm.testing.parameters(
-        ((2, 3, 4), 1),
-        ((4, 7), 0),
-        ((2, 3, 4), -1),
-    )
-
-    def test_reverse(self, target, dev, executor_kind, dshape, axis):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.reverse(x, axis=axis)
-        zz = run_infer_type(z)
-
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
-        ref_res = np.flip(x_data, axis)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-def test_reverse_sequence(target, dev, executor_kind):
-    def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res):
-        seq_lengths_data = np.array(seq_lengths).astype("int32")
-        x = relay.var("x", relay.TensorType(x_data.shape, str(x_data.dtype)))
-        z = relay.reverse_sequence(x, relay.const(seq_lengths_data), seq_axis, batch_axis)
-        zz = run_infer_type(z)
-        assert zz.checked_type == x.type_annotation
-        func = relay.Function([x], z)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = [[0, 5, 10, 15], [4, 1, 6, 11], [8, 9, 2, 7], [12, 13, 14, 3]]
-    verify_reverse_sequence(indata, [1, 2, 3, 4], 1, 0, np.array(result))
-    verify_reverse_sequence(indata, [1, 2, 3, 4], -1, 0, np.array(result))
-    verify_reverse_sequence(
-        indata.astype("float32"), [1, 2, 3, 4], 1, 0, np.array(result).astype("float32")
-    )
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = [[0, 1, 2, 3], [5, 4, 6, 7], [10, 9, 8, 11], [15, 14, 13, 12]]
-    verify_reverse_sequence(indata, [1, 2, 3, 4], 0, 1, np.array(result))
-    verify_reverse_sequence(indata, [1, 2, 3, 4], 0, -1, np.array(result))
-    verify_reverse_sequence(
-        indata.astype("float32"), [1, 2, 3, 4], 0, 1, np.array(result).astype("float32")
-    )
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [15, 14, 13, 12]]
-    verify_reverse_sequence(indata, [-1, 0, 1, 5], 0, 1, np.array(result))
-
-    indata = np.array(np.arange(0, 54)).reshape([2, 3, 3, 3]).astype("int32")
-    result = [
-        [
-            [[18, 19, 20], [21, 22, 23], [24, 25, 26]],
-            [[9, 10, 11], [12, 13, 14], [15, 16, 17]],
-            [[0, 1, 2], [3, 4, 5], [6, 7, 8]],
-        ],
-        [
-            [[45, 46, 47], [48, 49, 50], [51, 52, 53]],
-            [[36, 37, 38], [39, 40, 41], [42, 43, 44]],
-            [[27, 28, 29], [30, 31, 32], [33, 34, 35]],
-        ],
-    ]
-    verify_reverse_sequence(indata, [3, 3], 0, 1, np.array(result))
-
-    indata = np.array(np.arange(0, 54)).reshape([2, 3, 3, 3]).astype("int32")
-    result = [
-        [
-            [[9, 10, 11], [21, 22, 23], [15, 16, 17]],
-            [[0, 1, 2], [12, 13, 14], [6, 7, 8]],
-            [[18, 19, 20], [3, 4, 5], [24, 25, 26]],
-        ],
-        [
-            [[36, 37, 38], [48, 49, 50], [42, 43, 44]],
-            [[27, 28, 29], [39, 40, 41], [33, 34, 35]],
-            [[45, 46, 47], [30, 31, 32], [51, 52, 53]],
-        ],
-    ]
-    verify_reverse_sequence(indata, [2, 3, 2], 2, 1, np.array(result))
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = []
-    with pytest.raises(Exception) as execinfo:
-        verify_reverse_sequence(indata, [2, 3, 2, 4, 5], 1, 0, np.array(result))
-
-    assert (
-        "For reverse_sequnece seq_lengths size should match with dimension of batch axis,"
-        " but got dimension of batch_axis = 4, and seq_length size = 5" in execinfo.value.args[0]
-    )
-
-
-def ref_scatter(data, indices, updates, axis=0):
-    idx = np.indices(indices.shape).reshape(indices.ndim, -1)
-
-    updated_idx = np.copy(idx)
-    indices = indices.reshape(-1)
-    for i in range(len(indices)):
-        updated_idx[axis, i] = indices[i]
-    scattered = np.copy(data)
-    scattered[tuple(updated_idx)] = updates[tuple(idx)]
-    return scattered
-
-
-def test_scatter(target, dev, executor_kind):
-    def verify_scatter(dshape, ishape, axis=0, indices_dtype="int64"):
-        d = relay.var("d", relay.TensorType(dshape, "float32"))
-        i = relay.var("i", relay.TensorType(ishape, indices_dtype))
-        u = relay.var("u", relay.TensorType(ishape, "float32"))
-        z = relay.op.scatter_elements(d, i, u, axis)
-
-        func = relay.Function([d, i, u], z)
-
-        data_np = np.random.uniform(size=dshape).astype("float32")
-        updates_np = np.random.uniform(size=ishape).astype("float32")
-        indices_np = np.random.randint(0, dshape[axis] - 1, ishape).astype(indices_dtype)
-
-        ref_res = ref_scatter(data_np, indices_np, updates_np, axis)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            data_np, indices_np, updates_np
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_scatter((10,), (10,), 0)
-    verify_scatter((10, 5), (10, 5), -2)
-    verify_scatter((10, 5), (10, 5), -1)
-    verify_scatter((10, 5), (3, 5), 0)
-    verify_scatter((12, 4), (7, 2), 1)
-    verify_scatter((2, 3, 4), (1, 3, 4), 0)
-    verify_scatter((2, 3, 4), (2, 1, 4), 1)
-    verify_scatter((2, 3, 4), (2, 3, 1), 2)
-    verify_scatter((4, 2, 1), (1, 1, 1), 0)
-    verify_scatter((2, 3, 4, 5), (1, 3, 4, 5), 0)
-    verify_scatter((6, 3, 4, 5), (2, 3, 4, 5), 1)
-    verify_scatter((2, 3, 8, 5), (2, 3, 1, 1), 2)
-    verify_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3)
-    verify_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3, indices_dtype="uint32")
-
-
-class TestDynamicScatter:
-    dshape, ishape, axis = tvm.testing.parameters(
-        ((10,), (10,), 0),
-        ((10, 5), (10, 5), -2),
-        ((10, 5), (10, 5), -1),
-        ((10, 5), (3, 5), 0),
-        ((12, 4), (7, 2), 1),
-        ((2, 3, 4), (1, 3, 4), 0),
-        ((2, 3, 4), (2, 1, 4), 1),
-        ((2, 3, 4), (2, 3, 1), 2),
-        ((4, 2, 1), (1, 1, 1), 0),
-        ((2, 3, 4, 5), (1, 3, 4, 5), 0),
-        ((6, 3, 4, 5), (2, 3, 4, 5), 1),
-        ((2, 3, 8, 5), (2, 3, 1, 1), 2),
-        ((16, 16, 4, 5), (16, 16, 4, 5), 3),
-    )
-
-    @pytest.mark.parametrize("executor_kind", ["vm"])
-    def test_dynamic_scatter(self, target, dev, executor_kind, dshape, ishape, axis):
-        d = relay.var("d", relay.TensorType([relay.Any() for i in range(len(dshape))], "float32"))
-        i = relay.var("i", relay.TensorType([relay.Any() for i in range(len(ishape))], "int64"))
-        u = relay.var("u", relay.TensorType([relay.Any() for i in range(len(ishape))], "float32"))
-        z = relay.op.scatter_elements(d, i, u, axis)
-
-        func = relay.Function([d, i, u], z)
-
-        data_np = np.random.uniform(size=dshape).astype("float32")
-        updates_np = np.random.uniform(size=ishape).astype("float32")
-        indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64")
-
-        ref_res = ref_scatter(data_np, indices_np, updates_np, axis)
-
-        mod = tvm.ir.IRModule.from_expr(func)
-        op_res = relay.create_executor(
-            executor_kind, mod=mod, device=dev, target=target
-        ).evaluate()(data_np, indices_np, updates_np)
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-
-class TestScatterAdd:
-    dshape, ishape, axis, dtype, indice_dtype = tvm.testing.parameters(
-        ((10,), (10,), 0, "int32", "int64"),
-        ((1000,), (1000,), 0, "int32", "int64"),
-        ((10, 5), (10, 5), -2, "float32", "int64"),
-        ((10, 5), (10, 5), -1, "float32", "int64"),
-        ((10, 5), (3, 5), 0, "float32", "int64"),
-        ((12, 4), (7, 2), 1, "float32", "int64"),
-        ((2, 3, 4), (1, 3, 4), 0, "float32", "int64"),
-        ((2, 3, 4), (2, 1, 4), 1, "float32", "int64"),
-        ((2, 3, 4), (2, 3, 1), 2, "float32", "int64"),
-        ((2, 3, 4, 5), (1, 3, 4, 5), 0, "float32", "int64"),
-        ((6, 3, 4, 5), (2, 3, 4, 5), 1, "float32", "int64"),
-        ((2, 3, 8, 5), (2, 3, 1, 1), 2, "float32", "int64"),
-        ((16, 16, 4, 5), (16, 16, 4, 5), 3, "float32", "int64"),
-        ((16, 16, 4, 5), (16, 16, 4, 5), 3, "float32", "uint32"),
-    )
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(self, dshape, ishape, axis, dtype, indice_dtype):
-        data_np = np.random.uniform(size=dshape).astype(dtype)
-        updates_np = np.random.uniform(size=ishape).astype(dtype)
-        indices_np = np.random.randint(0, dshape[axis] - 1, ishape).astype(indice_dtype)
-
-        out_np = np.copy(data_np)
-        for index in np.ndindex(*indices_np.shape):
-            new_index = list(index)
-            new_index[axis] = indices_np[index]
-            out_np[tuple(new_index)] += updates_np[index]
-        return data_np, updates_np, indices_np, out_np
-
-    # Optimization can produce tir.atomic_add, not currently supported
-    # on vulkan runtime.
-    @tvm.testing.known_failing_targets("vulkan")
-    def test_scatter_add(self, target, dev, ref_data, dshape, ishape, axis, dtype, indice_dtype):
-        d = relay.var("d", relay.TensorType(shape=[relay.Any() for _ in dshape], dtype=dtype))
-        i = relay.var(
-            "i", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype=indice_dtype)
-        )
-        u = relay.var("u", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype=dtype))
-        z = relay.op.scatter_elements(d, i, u, axis, "add")
-
-        func = relay.Function([d, i, u], z)
-
-        data_np, updates_np, indices_np, out_np = ref_data
-
-        verify_func(target, dev, func, [data_np, indices_np, updates_np], out_np)
-
-
-@pytest.mark.parametrize(
-    "data, axis, indices, ref_res",
-    [
-        ([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]),
-        ([[1, 2], [3, 4]], -1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]),
-        (
-            [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]],
-            0,
-            [[[1, 0, 1], [1, 1, 0]]],
-            [[[6, 1, 8], [9, 10, 5]]],
-        ),
-        (
-            [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]],
-            -3,
-            [[[1, 0, 1], [1, 1, 0]]],
-            [[[6, 1, 8], [9, 10, 5]]],
-        ),
-        (
-            [
-                [
-                    [-0.2321, -0.2024, -1.7624],
-                    [-0.3829, -0.4246, 0.2448],
-                    [0.1822, 0.2360, -0.8965],
-                    [0.4497, -0.2224, 0.6103],
-                ],
-                [
-                    [0.0408, -0.7667, -0.4303],
-                    [-0.3216, 0.7489, -0.1502],
-                    [0.0144, -0.4699, -0.0064],
-                    [-0.0768, -1.6064, 1.3390],
-                ],
-            ],
-            1,
-            [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
-            [
-                [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
-                [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
-            ],
-        ),
-        (
-            [
-                [
-                    [-0.2321, -0.2024, -1.7624],
-                    [-0.3829, -0.4246, 0.2448],
-                    [0.1822, 0.2360, -0.8965],
-                    [0.4497, -0.2224, 0.6103],
-                ],
-                [
-                    [0.0408, -0.7667, -0.4303],
-                    [-0.3216, 0.7489, -0.1502],
-                    [0.0144, -0.4699, -0.0064],
-                    [-0.0768, -1.6064, 1.3390],
-                ],
-            ],
-            -2,
-            [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
-            [
-                [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
-                [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
-            ],
-        ),
-        (
-            [
-                [
-                    [-0.2321, -0.2024, -1.7624],
-                    [-0.3829, -0.4246, 0.2448],
-                    [0.1822, 0.2360, -0.8965],
-                    [0.4497, -0.2224, 0.6103],
-                ],
-                [
-                    [0.0408, -0.7667, -0.4303],
-                    [-0.3216, 0.7489, -0.1502],
-                    [0.0144, -0.4699, -0.0064],
-                    [-0.0768, -1.6064, 1.3390],
-                ],
-            ],
-            -2,
-            [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
-            [
-                [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
-                [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
-            ],
-        ),
-        (
-            [
-                [
-                    [0.3050, 1.6986, 1.1034],
-                    [0.7020, -0.6960, -2.1818],
-                    [0.3116, -0.5773, -0.9912],
-                    [0.0835, -1.3915, -1.0720],
-                ],
-                [
-                    [0.1694, -0.6091, -0.6539],
-                    [-0.5234, -0.1218, 0.5084],
-                    [0.2374, -1.9537, -2.0078],
-                    [-0.5700, -1.0302, 0.1558],
-                ],
-            ],
-            2,
-            [
-                [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]],
-                [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]],
-            ],
-            [
-                [
-                    [1.6986, 1.6986, 0.3050, 1.6986],
-                    [0.7020, 0.7020, -2.1818, -2.1818],
-                    [-0.5773, -0.9912, -0.5773, -0.9912],
-                    [-1.0720, -1.0720, -1.3915, 0.0835],
-                ],
-                [
-                    [0.1694, 0.1694, -0.6091, -0.6539],
-                    [0.5084, 0.5084, -0.1218, -0.5234],
-                    [-1.9537, -2.0078, 0.2374, 0.2374],
-                    [-0.5700, 0.1558, -0.5700, 0.1558],
-                ],
-            ],
-        ),
-        (
-            [
-                [
-                    [0.3050, 1.6986, 1.1034],
-                    [0.7020, -0.6960, -2.1818],
-                    [0.3116, -0.5773, -0.9912],
-                    [0.0835, -1.3915, -1.0720],
-                ],
-                [
-                    [0.1694, -0.6091, -0.6539],
-                    [-0.5234, -0.1218, 0.5084],
-                    [0.2374, -1.9537, -2.0078],
-                    [-0.5700, -1.0302, 0.1558],
-                ],
-            ],
-            -1,
-            [
-                [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]],
-                [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]],
-            ],
-            [
-                [
-                    [1.6986, 1.6986, 0.3050, 1.6986],
-                    [0.7020, 0.7020, -2.1818, -2.1818],
-                    [-0.5773, -0.9912, -0.5773, -0.9912],
-                    [-1.0720, -1.0720, -1.3915, 0.0835],
-                ],
-                [
-                    [0.1694, 0.1694, -0.6091, -0.6539],
-                    [0.5084, 0.5084, -0.1218, -0.5234],
-                    [-1.9537, -2.0078, 0.2374, 0.2374],
-                    [-0.5700, 0.1558, -0.5700, 0.1558],
-                ],
-            ],
-        ),
-    ],
-)
-def test_gather(target, dev, executor_kind, data, axis, indices, ref_res):
-    def verify_gather(data, axis, indices, ref_res):
-        data = np.asarray(data, dtype="float32")
-        indices = np.asarray(indices, dtype="int32")
-        ref_res = np.asarray(ref_res)
-        d = relay.var("x", relay.TensorType(data.shape, "float32"))
-        i = relay.var("y", relay.TensorType(indices.shape, "int32"))
-        z = relay.gather(d, axis, i)
-
-        func = relay.Function([d, i], z)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            data, indices
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_gather(data, axis, indices, ref_res)
-
-
-def test_gather_nd(target, dev, executor_kind):
-    def verify_gather_nd(xshape, yshape, y_data, batch_dims=0, indices_dtype="int32"):
-        x = relay.var("x", relay.TensorType(xshape, "float32"))
-        y = relay.var("y", relay.TensorType(yshape, indices_dtype))
-        z = relay.gather_nd(x, y, batch_dims)
-
-        func = relay.Function([x, y], z)
-
-        x_data = np.random.uniform(size=xshape).astype("float32")
-
-        if y_data:
-            y_data = np.array(y_data, dtype=indices_dtype)
-        else:
-            y_data = np.random.randint(low=0, high=2, size=yshape, dtype=indices_dtype)
-
-        ref_res = ref_funcs.gather_nd(x_data, y_data, batch_dims)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, y_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_gather_nd((2, 2), (2, 3), [[1, 1, 0], [0, 1, 0]])
-    verify_gather_nd((2, 2, 2), (2, 2), [[0, 1], [1, 0]])
-    verify_gather_nd((3, 2, 2), (2, 2), [[0, 1], [1, 0]])
-    verify_gather_nd((3, 2), (2, 2, 3), [[[0, 1, 2], [2, 0, 1]], [[0, 0, 0], [1, 1, 1]]])
-
-    # Examples from tensorflow gather_nd doc
-    # https://www.tensorflow.org/api_docs/python/tf/gather_nd
-    verify_gather_nd((2, 2, 2), (1, 2), [[1, 0]], 1)
-    verify_gather_nd((2, 2, 2), (1, 2, 1), [[[1], [0]]], 1)
-    verify_gather_nd((2, 2, 2), (2, 2, 1), [[[1], [0]], [[0], [1]]], 1)
-
-    # Test cases from tensorflow gather_nd tests kernel_tests/array_ops_test.py
-    verify_gather_nd((2, 2, 2), (1, 2), None, 1)
-    verify_gather_nd((2, 2, 2), (2, 2), None, 1)
-    verify_gather_nd((2, 2, 3, 2), (3, 2), None, 1)
-    verify_gather_nd((2, 2, 3, 2), (2, 2), None, 1)
-    verify_gather_nd((2, 2, 3, 2), (1, 2), None, 1)
-    verify_gather_nd((2, 2, 3, 2), (3, 2, 1), None, 1)
-    verify_gather_nd((2, 2, 3, 2), (2, 2, 2), None, 1)
-    verify_gather_nd((2, 2, 3, 2), (1, 2, 3), None, 1)
-
-    verify_gather_nd((3, 2, 2, 3, 4), (3, 3, 2), None, 2)
-    verify_gather_nd((3, 2, 2, 3, 4), (2, 3, 2), None, 2)
-    verify_gather_nd((3, 2, 2, 3, 4), (1, 3, 2), None, 2)
-    verify_gather_nd((3, 2, 2, 3, 4), (3, 3, 2, 1), None, 2)
-    verify_gather_nd((3, 2, 2, 3, 4), (2, 3, 2, 2), None, 2)
-    verify_gather_nd((3, 2, 2, 3, 4), (1, 3, 2, 3), None, 2)
-
-    verify_gather_nd((3, 2, 2, 3, 4), (1, 3, 2, 3), None, 2, indices_dtype="uint8")
-    verify_gather_nd((2, 2, 2), (2, 2, 1), [[[1], [0]], [[0], [1]]], 1, indices_dtype="uint32")
-
-
-def _verify_infiniteness_ops(relay_op, ref_op, target="llvm", dev=None):
-    for dtype in ["float32", "float16", "float16", "int32", "int16"]:
-        shape = (2, 8, 8)
-        x = relay.var("x", relay.TensorType(shape, dtype))
-        y = relay_op(x)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.TensorType(shape, "bool")
-
-        data = np.random.uniform(size=shape).astype(dtype)
-        if dtype.startswith("float"):
-            data.ravel()[np.random.choice(data.size, int(data.size * 0.5), replace=False)] = np.inf
-            data.ravel()[np.random.choice(data.size, int(data.size * 0.5), replace=False)] = np.nan
-
-        op_res = create_executor(target=target, device=dev).evaluate(y, {x: data})
-        ref_res = ref_op(data)
-        np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-
-@tvm.testing.requires_gpu
-def test_isfinite():
-    for target, dev in tvm.testing.enabled_targets():
-        if target not in ["llvm", "cuda"]:
-            continue
-        _verify_infiniteness_ops(relay.isfinite, np.isfinite, target=target, dev=dev)
-
-
-@tvm.testing.requires_gpu
-def test_isinf():
-    for target, dev in tvm.testing.enabled_targets():
-        if target not in ["llvm", "cuda"]:
-            continue
-        _verify_infiniteness_ops(relay.isinf, np.isinf, target=target, dev=dev)
-
-
-def test_unravel_index(target, dev, executor_kind):
-    def verify_unravel_index(indices, shape, dtype):
-        x_data = np.array(indices).astype(dtype)
-        y_data = np.array(shape).astype(dtype)
-        x = relay.var("x", relay.TensorType(x_data.shape, dtype))
-        y = relay.var("y", relay.TensorType(y_data.shape, dtype))
-
-        z = relay.unravel_index(x, y)
-        zz = run_infer_type(z)
-
-        if len(x_data.shape) == 1:
-            out_shape = [y_data.shape[0], x_data.shape[0]]
-        else:
-            out_shape = [y_data.shape[0]]
-        assert zz.checked_type == relay.ty.TensorType(out_shape, dtype)
-
-        func = relay.Function([x, y], z)
-        ref_res = np.unravel_index(x_data, y_data)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data, y_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    for dtype in ["int64", "int32"]:
-        verify_unravel_index([0, 1, 2, 3], [2, 2], dtype)
-        verify_unravel_index([144], [5, 5, 5, 2], dtype)
-        verify_unravel_index(144, [5, 5, 5, 2], dtype)
-        verify_unravel_index([100, 13, 5], [5, 5, 5, 2], dtype)
-
-        # In below example, 5 is out of bound for array of size 4.
-        # Numpy implementation throws error for it
-        # TVM implementation does not throw error instead it produces
-        # output which is inline with Tensorflow
-        # verify_unravel_index([0, 1, 2, 5], [2, 2], dtype)
-
-
-def test_sparse_to_dense(target, dev, executor_kind):
-    def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
-        sparse_indices_data = np.array(sparse_indices)
-        sparse_values_data = np.array(sparse_values)
-        default_value_data = np.array(default_value)
-
-        a = relay.var(
-            "a", relay.TensorType(sparse_indices_data.shape, str(sparse_indices_data.dtype))
-        )
-        b = relay.var(
-            "b", relay.TensorType(sparse_values_data.shape, str(sparse_values_data.dtype))
-        )
-        if default_value is None:
-            args = [a, b]
-            d = relay.sparse_to_dense(a, output_shape, b)
-        else:
-            c = relay.var(
-                "c", relay.TensorType(default_value_data.shape, str(default_value_data.dtype))
-            )
-            args = [a, b, c]
-            d = relay.sparse_to_dense(a, output_shape, b, c)
-
-        zz = run_infer_type(d)
-        assert zz.checked_type == relay.ty.TensorType(output_shape, str(sparse_values_data.dtype))
-
-        func = relay.Function(args, d)
-        f = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)
-        if default_value is None:
-            op_res = f(sparse_indices_data, sparse_values_data)
-        else:
-            op_res = f(sparse_indices_data, sparse_values_data, default_value_data)
-        tvm.testing.assert_allclose(op_res.numpy(), xpected, rtol=1e-5)
-
-    verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0])  # scalar
-    verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3])  # vector
-    verify_sparse_to_dense(
-        [[0, 0], [1, 2]], [1, 2], 0, [3, 4], [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]]
-    )  # nXd
-    verify_sparse_to_dense(
-        [[0, 0, 0], [1, 2, 3]],
-        [1, 2],
-        4,
-        [2, 3, 4],
-        [[[1, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]], [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 2]]],
-    )  # nXd
-    verify_sparse_to_dense(
-        [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]
-    )  # floats
-    # default value not specified
-    verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])
-
-    # negative test cases
-    # sparse indices should be ints
-    # verify_sparse_to_dense([[0.1, 1.1, 4.1], [0,2,4]], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
-    # sparse_values should be 0d or 1d only
-    # verify_sparse_to_dense([[0, 1, 4], [0, 2, 4]], [[[3.1, 3.1, 3.1]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
-    # sparse_indices should not be > 2d tensor
-    # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[[3.1, 3.1, 3.1]]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
-
-
-class TestSparseReshape:
-
-    sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np = tvm.testing.parameters(
-        (
-            np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 2, 3]], dtype=np.int32),
-            np.array([7, 5, 6, 3, 9], dtype=np.int32),
-            np.array([2, 3, 6], dtype=np.int32),
-            np.array([9, -1], dtype=np.int32),
-        ),
-        (
-            np.array(
-                [[0, 0, 0, 0], [0, 0, 1, 2], [0, 1, 0, 3], [1, 0, 0, 4], [1, 2, 3, 6]],
-                dtype=np.int64,
-            ),
-            np.array([7, 5, 6, 3, 9], dtype=np.int64),
-            np.array([2, 3, 6, 7], dtype=np.int64),
-            np.array([9, -1, 7], dtype=np.int64),
-        ),
-        (
-            np.array(
-                [
-                    [0, 0, 0, 0, 0],
-                    [0, 0, 1, 2, 3],
-                    [0, 1, 0, 3, 5],
-                    [1, 0, 0, 4, 6],
-                    [1, 2, 3, 6, 8],
-                ],
-                dtype=np.int64,
-            ),
-            np.array([7, 5, 6, 3, 9], dtype=np.int64),
-            np.array([2, 3, 6, 7, 9], dtype=np.int64),
-            np.array([9, -1, 7], dtype=np.int64),
-        ),
-        (
-            np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int32),
-            np.array([7, 5, 6, 3, 9], dtype=np.int32),
-            np.array([9, 4], dtype=np.int32),
-            np.array([2, -1, 6], dtype=np.int32),
-        ),
-        (
-            np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int64),
-            np.array([7, 5, 6, 3, 9], dtype=np.int64),
-            np.array([9, 4], dtype=np.int64),
-            np.array([-1], dtype=np.int64),
-        ),
-        (
-            np.array([[0], [5], [10], [20], [24]], dtype=np.int32),
-            np.array([7, 5, 6, 3, 9], dtype=np.int32),
-            np.array([25], dtype=np.int32),
-            np.array([5, 5], dtype=np.int32),
-        ),
-        (
-            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
-            np.array([7, 5, 6, 3, 9], dtype=np.int64),
-            np.array([500, 20], dtype=np.int64),
-            np.array([500, 20], dtype=np.int64),
-        ),
-        (
-            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int32),
-            np.array([7, 5, 6, 3, 9], dtype=np.int32),
-            np.array([500, 20], dtype=np.int32),
-            np.array([500, -1], dtype=np.int32),
-        ),
-        (
-            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
-            np.array([7, 5, 6, 3, 9], dtype=np.int64),
-            np.array([500, 20], dtype=np.int64),
-            np.array([250, 40], dtype=np.int64),
-        ),
-        (
-            np.ones((0, 1), dtype=np.int32),
-            np.array([], dtype=np.int32),
-            np.array([4], dtype=np.int32),
-            np.array([2, -1], dtype=np.int32),
-        ),
-        (
-            np.ones((0, 1), dtype=np.int64),
-            np.array([], dtype=np.int64),
-            np.array([4], dtype=np.int64),
-            np.array([2, 2], dtype=np.int64),
-        ),
-        (
-            np.ones((0, 2), dtype=np.int32),
-            np.array([], dtype=np.int32),
-            np.array([3, 6], dtype=np.int32),
-            np.array([-1, 2], dtype=np.int32),
-        ),
-    )
-
-    use_dyn = tvm.testing.parameter(True, False, ids=["dyn", "static"])
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_res(
-        self,
-        sparse_indices_np: np.ndarray,
-        prev_shape_np: np.ndarray,
-        new_shape_np: np.ndarray,
-    ):
-        """
-        This function calculates the expected output of sparse_reshape operator given the inputs.
-        """
-
-        new_sparse_indices = np.ones(
-            (sparse_indices_np.shape[0], new_shape_np.shape[0]), dtype=sparse_indices_np.dtype
-        )
-        multipliers = np.ones(prev_shape_np.shape[0])
-        dividers = np.ones(new_shape_np.shape[0])
-        total_ele = np.prod(prev_shape_np)
-        division_total_ele = 1
-        for i in range(new_shape_np.shape[0]):
-            if new_shape_np[i] == -1:
-                continue
-            division_total_ele *= new_shape_np[i]
-        for i in range(prev_shape_np.shape[0] - 2, -1, -1):
-            multipliers[i] = prev_shape_np[i + 1] * multipliers[i + 1]
-
-        for i in range(len(new_shape_np)):
-            if new_shape_np[i] == -1:
-                new_shape_np[i] = total_ele // division_total_ele
-
-        if np.array_equal(prev_shape_np, new_shape_np):
-            return sparse_indices_np, prev_shape_np
-
-        for i in range(new_shape_np.shape[0] - 2, -1, -1):
-            dividers[i] = new_shape_np[i + 1] * dividers[i + 1]
-
-        for row_num, sparse_row in enumerate(sparse_indices_np):
-            flat_idx = 0
-            if len(sparse_indices_np.shape) != 1:
-                for i, ele in enumerate(sparse_row):
-                    flat_idx += sparse_row[i] * multipliers[i]
-            else:
-                flat_idx += sparse_row
-            if len(new_sparse_indices.shape) != 1:
-                for i in range(new_sparse_indices.shape[1]):
-                    new_sparse_indices[row_num][i] = flat_idx // dividers[i]
-                    flat_idx = flat_idx % dividers[i]
-            else:
-                new_sparse_indices[row_num] = flat_idx
-
-        return new_sparse_indices, new_shape_np
-
-    @tvm.testing.known_failing_targets("vulkan")
-    def test_sparse_reshape(
-        self,
-        target,
-        dev,
-        ref_res,
-        sparse_indices_np,
-        sparse_values_np,
-        prev_shape_np,
-        new_shape_np,
-        use_dyn,
-    ):
-        if use_dyn:
-            sparse_indices = relay.var(
-                "sparse_indices",
-                shape=[relay.Any(), relay.Any()],
-                dtype=str(sparse_indices_np.dtype),
-            )
-            prev_shape = relay.var(
-                "prev_shape",
-                shape=[relay.Any()],
-                dtype=str(prev_shape_np.dtype),
-            )
-            new_shape = relay.var(
-                "new_shape",
-                shape=[relay.Any()],
-                dtype=str(new_shape_np.dtype),
-            )
-        else:
-            sparse_indices = relay.var(
-                "sparse_indices",
-                relay.TensorType(sparse_indices_np.shape, str(sparse_indices_np.dtype)),
-            )
-            prev_shape = relay.var(
-                "prev_shape", relay.TensorType(prev_shape_np.shape, str(prev_shape_np.dtype))
-            )
-            new_shape = relay.var(
-                "new_shape", relay.TensorType(new_shape_np.shape, str(new_shape_np.dtype))
-            )
-        z = relay.op.sparse_reshape(sparse_indices, prev_shape, new_shape).astuple()
-
-        func = relay.Function([sparse_indices, prev_shape, new_shape], z)
-
-        outputs = run_infer_type(z)
-        new_sparse_indices_infer_type, new_shape_infer_type = (
-            outputs.checked_type.fields[0].dtype,
-            outputs.checked_type.fields[1].dtype,
-        )
-
-        assert new_sparse_indices_infer_type == sparse_indices_np.dtype
-        assert new_shape_infer_type == new_shape_np.dtype
-        verify_func(
-            target,
-            dev,
-            func,
-            [sparse_indices_np, prev_shape_np, new_shape_np],
-            ref_res,
-        )
-
-
-class TestSegmentSum:
-    data_np, segment_ids_np, num_segments = tvm.testing.parameters(
-        (
-            np.array([5, 1, 7, 2, 3, 4], dtype=np.float32),
-            np.array([0, 0, 1, 1, 0, 1], dtype=np.int32),
-            None,
-        ),
-        (
-            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
-            np.array([0, 0, 1], dtype=np.int32),
-            None,
-        ),
-        (
-            np.random.random((6, 4, 5)),
-            np.array([2, 0, 1, 0, 3, 2], dtype=np.int64),
-            None,
-        ),
-        (
-            np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float32),
-            np.array([0, 0, 1], dtype=np.int32),
-            None,
-        ),
-        (
-            np.random.random((9, 4, 5, 7)),
-            np.array([5, 0, 1, 0, 3, 6, 8, 7, 7], dtype=np.int64),
-            9,
-        ),
-        (
-            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
-            np.array([0, 2], dtype=np.int32),
-            4,
-        ),
-        (
-            np.random.random((6, 4, 5)),
-            np.array([0, 0, 1, 5, 5], dtype=np.int32),
-            100,
-        ),
-    )
-
-    use_dyn = tvm.testing.parameter(True, False, ids=["dyn", "static"])
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_res(
-        self,
-        data_np: np.ndarray,
-        segment_ids_np: np.ndarray,
-        num_segments: Optional[int],
-    ):
-        """
-        This function calculates the expected output of segment_sum operator given the inputs.
-        """
-        if not num_segments:
-            num_segments = np.unique(segment_ids_np).shape[0]
-
-        result = np.zeros((num_segments,) + data_np.shape[1:], data_np.dtype)
-        for i, index in enumerate(segment_ids_np):
-            result[index] += data_np[i]
-        return result
-
-    # Optimization can produce tir.atomic_add, not currently supported
-    # on vulkan runtime.
-    @tvm.testing.known_failing_targets("vulkan")
-    def test_segment_sum(
-        self,
-        target,
-        dev,
-        ref_res: np.ndarray,
-        data_np: np.ndarray,
-        segment_ids_np: np.ndarray,
-        num_segments: Optional[int],
-        use_dyn: bool,
-    ):
-        """
-        This function verifies the relay output of segment_sum with its expected output.
-        """
-        if use_dyn:
-            data = relay.var(
-                "data",
-                shape=[relay.Any() for _ in data_np.shape],
-                dtype=str(data_np.dtype),
-            )
-            segment_ids = relay.var(
-                "segment_ids",
-                shape=[relay.Any()],
-                dtype=str(segment_ids_np.dtype),
-            )
-        else:
-            data = relay.var(
-                "data",
-                relay.TensorType(data_np.shape, str(data_np.dtype)),
-            )
-            segment_ids = relay.var(
-                "segment_ids", relay.TensorType(segment_ids_np.shape, str(segment_ids_np.dtype))
-            )
-        z = relay.op.segment_sum(data, segment_ids, num_segments)
-
-        func = relay.Function([data, segment_ids], z)
-        segment_sum_result = run_infer_type(z)
-        assert segment_sum_result.checked_type.dtype == data_np.dtype
-        verify_func(
-            target,
-            dev,
-            func,
-            [data_np, segment_ids_np],
-            ref_res,
-        )
-
-
-def verify_func(target, dev, func, data, ref_res, rtol=1e-5, atol=1e-7, kinds=["vm"]):
-    assert isinstance(data, list)
-    for kind in kinds:
-        mod = tvm.ir.IRModule.from_expr(func)
-        op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(*data)
-        if isinstance(op_res, tvm.runtime.container.ADT):
-            assert len(op_res) == len(
-                ref_res
-            ), "Outputs from TVM and Python implementation must be equal "
-            for op_result, ref_result in zip(op_res, ref_res):
-                tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=rtol, atol=atol)
-        else:
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol)
-        relay.backend.te_compiler.get().clear()
-
-
-def test_adv_index(target, dev, executor_kind):
-    def verify_adv_index(data_shape, index_shapes):
-        dtype = "float32"
-        inputs = [relay.var("data", relay.TensorType(data_shape, dtype))]
-        np_data = np.random.uniform(size=data_shape).astype(dtype)
-        np_indices = []
-        for i, index_shape in enumerate(index_shapes):
-            limit = data_shape[i]
-            np_indices.append(np.random.uniform(0, limit - 1, size=index_shape).astype("int64"))
-            inputs.append(relay.var("index_{}".format(i), relay.TensorType(index_shape, "int64")))
-        np_out = np_data[tuple(np_indices)]
-        np_args = [np_data] + np_indices
-        out = relay.op.adv_index(inputs)
-
-        func = relay.Function(inputs, out)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            *np_args
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-5)
-
-    verify_adv_index((10, 5), [(3, 4), (3, 1)])
-    verify_adv_index((10, 5), [(1, 4), (3, 1)])
-    verify_adv_index(
-        (10, 5),
-        [
-            (2,),
-        ],
-    )
-    verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)])
-
-
-# Helper for testing binop functions
-scanops_supported = {"cumsum": relay.op.cumsum, "cumprod": relay.op.cumprod}
-
-
-def run_binop_tests(
-    target,
-    dev,
-    executor_kind,
-    binop_type: str,
-    gt_func: Callable[..., np.array],
-    identity_value: int,
-):
-    def assert_relay_scanop(
-        data_np: np.array,
-        np_out: np.array,
-        axis: int = None,
-        out_dtype: str = None,
-        rtol: float = 1e-5,
-        atol: float = 1e-5,
-        exclusive: bool = False,
-    ):
-        inp = relay.var("data", relay.TensorType(data_np.shape, str(data_np.dtype)))
-
-        if binop_type not in scanops_supported.keys():
-            raise ValueError(f"Unknown function {binop_type}. Options: {scanops_supported.keys()}")
-        out = scanops_supported[binop_type](inp, axis, out_dtype, exclusive=exclusive)
-        func = relay.Function([inp], out)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            data_np
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=rtol, atol=atol)
-
-    data = np.array([2, 3, 0])
-    assert_relay_scanop(data, gt_func(data))
-    assert_relay_scanop(data, gt_func(data), out_dtype="int64")
-
-    data = np.random.randn(10, 10)
-    assert_relay_scanop(data, gt_func(data))
-    assert_relay_scanop(data, gt_func(data, axis=0), axis=0)
-    assert_relay_scanop(data, gt_func(data, axis=1), axis=1)
-
-    data = np.random.randn(10, 5, 10).astype("float32")
-    assert_relay_scanop(data, gt_func(data), rtol=1e-4, atol=1e-4)
-    assert_relay_scanop(data, gt_func(data, axis=0), axis=0, rtol=1e-4, atol=1e-4)
-    assert_relay_scanop(data, gt_func(data, axis=1), axis=1, rtol=1e-4, atol=1e-4)
-    assert_relay_scanop(data, gt_func(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4)
-
-    data = np.random.rand(10) > 0.5
-    data = data.astype(np.int32)
-    assert_relay_scanop(data, gt_func(data, dtype=np.int32))
-    assert_relay_scanop(data, gt_func(data, dtype="int64"), out_dtype="int64")
-
-    # Test exclusivity operations
-    data = np.random.randint(-100, 100, size=(10, 10)).astype("int64")
-    expected_result = np.roll(gt_func(data), 1)
-    expected_result[0] = identity_value
-    assert_relay_scanop(data, expected_result, exclusive=True)
-
-    expected_result = np.roll(gt_func(data, axis=0), 1, axis=0)
-    expected_result[0, :] = identity_value
-    assert_relay_scanop(data, expected_result, exclusive=True, axis=0)
-
-    expected_result = np.roll(gt_func(data, axis=1), 1, axis=1)
-    expected_result[:, 0] = identity_value
-    assert_relay_scanop(data, expected_result, exclusive=True, axis=1)
-
-
-@tvm.testing.parametrize_targets
-def test_cumsum(target, dev, executor_kind):
-    run_binop_tests(
-        target, dev, executor_kind, binop_type="cumsum", gt_func=np.cumsum, identity_value=0
-    )
-
-
-@tvm.testing.parametrize_targets
-def test_cumprod(target, dev, executor_kind):
-    run_binop_tests(
-        target, dev, executor_kind, binop_type="cumprod", gt_func=np.cumprod, identity_value=1
-    )
-
-
-@tvm.testing.parametrize_targets
-def test_scatter_nd(target, dev, executor_kind):
-    def test_scatter_nd_large_shape():
-        def before():
-            data = relay.const(np.zeros((1, 900, 300), dtype="float32"), dtype="float32")
-            indices = relay.const(np.ones((3, 1, 900, 300), dtype="int64"), dtype="int64")
-            update = relay.const(np.ones((1, 900, 300), dtype="float32"), dtype="float32")
-            b = relay.op.scatter_nd(data, indices, update)
-            return relay.Function(relay.analysis.free_vars(b), b)
-
-        passes = tvm.transform.Sequential(
-            [
-                relay.transform.InferType(),
-                relay.transform.FoldConstant(),
-            ]
-        )
-        before_mod = tvm.IRModule.from_expr(before())
-        with tvm.transform.PassContext(opt_level=3):
-            after_mod = passes(before_mod)
-
-    test_scatter_nd_large_shape()
-
-    def test_scatter_nd_inequal_m_k():
-        def before():
-            data = relay.const(np.zeros((1, 1, 10), dtype="float32"), dtype="float32")
-            indices = relay.const(np.zeros((2, 1, 1, 1), dtype="float32"), dtype="int64")
-            update = relay.const(np.ones((1, 1, 1, 10), dtype="float32"), dtype="float32")
-            b = relay.op.scatter_nd(data, indices, update)
-            return relay.Function(relay.analysis.free_vars(b), b)
-
-        passes = tvm.transform.Sequential(
-            [
-                relay.transform.InferType(),
-                relay.transform.FoldConstant(),
-            ]
-        )
-        before_mod = tvm.IRModule.from_expr(before())
-        with tvm.transform.PassContext(opt_level=3):
-            after_mod = passes(before_mod)
-
-    test_scatter_nd_inequal_m_k()
-
-    def verify_scatter_nd(
-        data_np, indices_np, updates_np, ref_res, mode="add", rtol=1e-5, atol=1e-5
-    ):
-        data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
-        indices = relay.var("indices", shape=indices_np.shape, dtype=str(indices_np.dtype))
-        updates = relay.var("updates", shape=updates_np.shape, dtype=str(updates_np.dtype))
-
-        out = relay.op.scatter_nd(data, indices, updates, mode)
-        func = relay.Function([data, indices, updates], out)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            data_np, indices_np, updates_np
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol)
-
-    def verify_scatter_nd_with_stack(
-        data_np, indices_np, updates_np, ref_res, mode="add", rtol=1e-5, atol=1e-5
-    ):
-        data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
-        indices_vars = [
-            relay.var("ind%d" % i, shape=v.shape, dtype=str(v.dtype))
-            for i, v in enumerate(indices_np)
-        ]
-        updates = relay.var("updates", shape=updates_np.shape, dtype=str(updates_np.dtype))
-
-        # test if scatter_nd works in case indices are prepared by another Relay operator
-        indices = relay.op.stack(indices_vars, axis=0)
-        out = relay.op.scatter_nd(data, indices, updates, mode)
-        func = relay.Function(
-            [data, updates] + indices_vars,
-            out,
-        )
-
-        fargs = [data_np, updates_np]
-        for a in indices_np:
-            fargs.append(a)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            *fargs
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol)
-
-    # TODO(vcchernov): check frameworks' int type requirements. ONNX expects int64 only
-    for indice_dtype in ["uint8", "uint16", "uint32"]:
-        data = np.zeros((2, 2)).astype("int64")
-        indices = np.array([[1, 1, 0], [0, 1, 0]]).astype(indice_dtype)
-        updates = np.array([2, 3, 0])
-        out = np.array([[0, 0], [2, 3]])
-        verify_scatter_nd(data, indices, updates, out)
-        verify_scatter_nd_with_stack(data, indices, updates, out)
-
-        data = np.zeros((2, 2, 2, 2)).astype("int64")
-        indices = np.array([[0, 1], [1, 1]]).astype(indice_dtype)
-        updates = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
-        out = np.array([[[[0, 0], [0, 0]], [[1, 2], [3, 4]]], [[[0, 0], [0, 0]], [[5, 6], [7, 8]]]])
-        verify_scatter_nd(data, indices, updates, out)
-        verify_scatter_nd_with_stack(data, indices, updates, out)
-
-        indices = np.array([[1, 0, 0]]).astype(indice_dtype)
-        updates = np.reshape(np.arange(1560 * 3), (3, 1560)).astype("float32")
-        shape = (2, 1560)
-        data = np.zeros(shape).astype("float32")
-        out = data.copy()
-        out[1, :] += updates[0, :]
-        out[0, :] += updates[1, :]
-        out[0, :] += updates[2, :]
-        verify_scatter_nd(data, indices, updates, out, mode="add")
-        verify_scatter_nd_with_stack(data, indices, updates, out)
-
-        for mode in ["update", "add", "mul", "min", "max"]:
-            indices = np.stack((np.random.randint(2, size=5), np.random.randint(7, size=5))).astype(
-                indice_dtype
-            )
-            updates = np.ones((5, 3)).astype("float64")
-            shape = (2, 7, 3)
-            data = np.random.random(shape).astype("float64")
-            out = data.copy()
-            for i in range(indices.shape[1]):
-                for j in range(updates.shape[1]):
-                    if mode == "update":
-                        out[indices[0, i], indices[1, i], j] = updates[i, j]
-                    elif mode == "add":
-                        out[indices[0, i], indices[1, i], j] += updates[i, j]
-                    elif mode == "mul":
-                        out[indices[0, i], indices[1, i], j] *= updates[i, j]
-                    elif mode == "min":
-                        out[indices[0, i], indices[1, i], j] = min(
-                            out[indices[0, i], indices[1, i], j], updates[i, j]
-                        )
-                    elif mode == "max":
-                        out[indices[0, i], indices[1, i], j] = max(
-                            out[indices[0, i], indices[1, i], j], updates[i, j]
-                        )
-            verify_scatter_nd(data, indices, updates, out, mode)
-            verify_scatter_nd_with_stack(data, indices, updates, out, mode)
-
-
-def test_unique(target, dev):
-    def calc_numpy_unique(data, is_sorted=False):
-        uniq, index, inverse, counts = np.unique(
-            data, return_index=True, return_inverse=True, return_counts=True
-        )
-        num_uniq = np.array([len(uniq)]).astype("int32")
-        if not is_sorted:
-            order = np.argsort(index)
-            reverse_order = np.argsort(order)
-            uniq = uniq[order].astype(data.dtype)
-            inverse = np.array([reverse_order[i] for i in inverse]).astype("int32")
-            counts = counts[order].astype("int32")
-            # In unsorted case, need to sort the index of first occurence
-            index = np.sort(index)
-        return [
-            uniq.astype(data.dtype),
-            index.astype("int32"),
-            inverse.astype("int32"),
-            num_uniq,
-            counts,
-        ]
-
-    def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False):
-        if is_dyn:
-            x = relay.var("x", relay.TensorType([relay.Any()], dtype))
-        else:
-            x = relay.var("x", relay.TensorType([n], dtype))
-        outs = relay.unique(x, is_sorted, return_counts)
-        outs = outs.astuple()
-        func = relay.Function([x], outs)
-        x_data = np.random.randint(50, size=n).astype(dtype)
-
-        if is_dyn:
-            backend = "vm"
-        else:
-            backend = "graph"
-
-        mod = tvm.ir.IRModule.from_expr(func)
-        tvm_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()(
-            x_data
-        )  # unique, indices, inverse_indices, num_unique, (counts)
-        np_res = calc_numpy_unique(
-            x_data, is_sorted
-        )  # unique, indices, inverse_indices, num_unique, counts
-        num_unique = np_res[3][0]
-
-        # num_unique
-        assert num_unique == tvm_res[3].numpy()[0]
-        # unique
-        tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5)
-        # indices
-        tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5)
-        # inverse_indices
-        tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5)
-        # counts
-        if return_counts:
-            tvm.testing.assert_allclose(tvm_res[4].numpy()[:num_unique], np_res[4], rtol=1e-5)
-
-    for dtype in ["int32", "int64"]:
-        for i in range(8):
-            is_dyn, is_sorted, return_counts = bool(i & 1), bool(i & 2), bool(i & 4)
-            verify_unique(10, dtype, is_dyn, is_sorted, return_counts)
-
-
-class TestSTFT:
-    (
-        data_np,
-        n_fft,
-        hop_length,
-        win_length,
-        window_np,
-        normalized,
-        onesided,
-    ) = tvm.testing.parameters(
-        (
-            np.array([[1, 2, 3, 4, 5, 6]], dtype=np.float32),
-            3,
-            3,
-            3,
-            np.array([4, 3, 2], dtype=np.int32),
-            False,
-            True,
-        ),
-        (
-            np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9], [2, 5, 7, 8, 5, 6, 7, 3, 2]], dtype=np.float32),
-            2,
-            1,
-            2,
-            np.array([1, 3], dtype=np.int32),
-            False,
-            True,
-        ),
-        (
-            np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9], [2, 5, 7, 8, 5, 6, 7, 3, 2]], dtype=np.float32),
-            2,
-            1,
-            2,
-            np.array([1, 3], dtype=np.int32),
-            True,
-            True,
-        ),
-        (
-            np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9], [2, 5, 7, 8, 5, 6, 7, 3, 2]], dtype=np.float32),
-            2,
-            1,
-            2,
-            np.array([1, 3], dtype=np.int32),
-            False,
-            False,
-        ),
-    )
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_res(
-        self,
-        data_np: np.ndarray,
-        n_fft: int,
-        hop_length: int,
-        win_length: int,
-        window_np,
-        normalized,
-        onesided,
-    ):
-        """
-        This function calculates the expected output of segment_sum operator given the inputs.
-        """
-
-        def pad_window(window_np, n_fft):
-            shape = window_np.shape[-1]
-            lpad = int((n_fft - shape) // 2)
-            lengths = [(0, 0)] * len(window_np.shape)
-            lengths[-1] = (lpad, int(n_fft - shape - lpad))
-            if lpad < 0:
-                print("ERROR Padding")
-            return np.pad(window_np, lengths, mode="constant")
-
-        import math
-
-        if not onesided:
-            n_rows = n_fft
-        else:
-            n_rows = n_fft // 2 + 1
-        if window_np is None:
-            window_np = np.ones(win_length, dtype=np.int32)
-        window_np = pad_window(window_np, n_fft)
-
-        n_cols = (data_np.shape[-1] - n_fft) // hop_length + 1
-        np_result = np.zeros((data_np.shape[0], n_rows, n_cols, 2))
-
-        for batch in range(data_np.shape[0]):
-            for w in range(n_rows):
-                for m in range(n_cols):
-                    for k in range(n_fft):
-                        np_result[batch][w][m][0] += (
-                            window_np[k]
-                            * data_np[batch][m * hop_length + k]
-                            * math.cos(2 * math.pi * w * k / n_fft)
-                        )
-                        np_result[batch][w][m][1] -= (
-                            window_np[k]
-                            * data_np[batch][m * hop_length + k]
-                            * math.sin(2 * math.pi * w * k / n_fft)
-                        )
-                    if normalized:
-                        np_result[batch][w][m][0] /= math.sqrt(n_fft)
-                        np_result[batch][w][m][1] /= math.sqrt(n_fft)
-        return np_result
-
-    use_dyn = tvm.testing.parameter(True, False, ids=["dyn", "static"])
-
-    @tvm.testing.parametrize_targets("llvm", "cuda")
-    def test_stft(
-        self,
-        target,
-        dev,
-        ref_res: np.ndarray,
-        data_np: np.ndarray,
-        n_fft: int,
-        hop_length: int,
-        win_length: int,
-        window_np: np.ndarray,
-        normalized: bool,
-        onesided: bool,
-        use_dyn,
-    ):
-        if use_dyn:
-            data = relay.var(
-                "data",
-                relay.TensorType([relay.Any(), relay.Any()], str(data_np.dtype)),
-            )
-            window = relay.var(
-                "window",
-                relay.TensorType([relay.Any()], str(window_np.dtype)),
-            )
-            backends = ["vm"]
-        else:
-            data = relay.var(
-                "data",
-                relay.TensorType(data_np.shape, str(data_np.dtype)),
-            )
-            window = relay.var(
-                "window",
-                relay.TensorType(window_np.shape, str(window_np.dtype)),
-            )
-            backends = ["graph", "vm"]
-
-        z = relay.op.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
-        func = relay.Function([data, window], z)
-        verify_func(
-            target, dev, func, [data_np, window_np], ref_res, rtol=1e-3, atol=1e-3, kinds=backends
-        )
-
-
-def test_trilu(target="llvm", dev=tvm.cpu()):
-    def verify_trilu(data_shape, upper=True, k=0):
-        data = relay.var("data", relay.TensorType(data_shape, "float32"))
-        y = relay.trilu(data, k, upper)
-        mod = tvm.ir.IRModule.from_expr(y)
-
-        data_np = np.random.normal(size=data_shape).astype("float32")
-        tvm_res = (
-            relay.create_executor("graph", mod=mod, device=dev, target=target)
-            .evaluate()(data_np)
-            .numpy()
-        )
-        if upper:
-            np_res = np.triu(data_np, k)
-        else:
-            np_res = np.tril(data_np, k)
-        tvm.testing.assert_allclose(tvm_res, np_res)
-
-    # Test upper and lower triangle
-    verify_trilu((3, 3), True, 0)
-    verify_trilu((3, 3), False, 0)
-    # Test larger matrices with offset.
-    verify_trilu((6, 6), True, 1)
-    verify_trilu((6, 6), False, 2)
-    verify_trilu((6, 6), False, -2)
-    # Test batch size
-    verify_trilu((8, 6, 6), False, -2)
-
-
-def test_trilu_shape_i64():
-    data_x = np.ones((2, 1), dtype="int32")
-
-    x = relay.var("x", shape=[2, 1], dtype="float32")
-    v0 = relay.broadcast_to(x, shape=relay.const([2, 1], dtype="int64"))
-    v2 = relay.add(relay.const([[1.0]]), v0)
-    v3 = relay.trilu(v0, k=0)
-
-    f = relay.Function([x], relay.Tuple([v2, v3]))
-    tvm_res = relay.create_executor("graph", device=tvm.cpu(), target="llvm").evaluate(f)(data_x)
-
-    np_res = (
-        np.array([[2.0], [2.0]], dtype=np.float32),
-        np.array([[1.0], [0.0]], dtype=np.float32),
-    )
-
-    tvm.testing.assert_allclose(tvm_res[0].numpy(), np_res[0])
-    tvm.testing.assert_allclose(tvm_res[1].numpy(), np_res[1])
-
-
-def test_trilu_reduce():
-    data_i0 = np.ones((2, 2), dtype="int32")
-    k = 0
-
-    i0 = relay.var("i0", shape=[2, 2], dtype="int32")
-    i1 = relay.var("i1", shape=(), dtype="int64")
-    v0 = relay.trilu(i0, i1)
-    v1 = relay.argmin(v0, axis=[0])
-    f = relay.Function([i0, i1], v1)
-    tvm_res = (
-        relay.create_executor("graph", device=tvm.cpu(), target="llvm")
-        .evaluate(f)(data_i0, k)
-        .numpy()
-    )
-
-    np_res = np.triu(data_i0, k).argmin(axis=0)
-    tvm.testing.assert_allclose(tvm_res, np_res)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
deleted file mode 100644
index c2877c5cda55..000000000000
--- a/tests/python/relay/test_op_level4.py
+++ /dev/null
@@ -1,701 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import sys
-
-import numpy as np
-import numpy.random
-import pytest
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import relay, te
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_binary_op():
-    def check_binary_op(opfunc, ref):
-        n = te.size_var("n")
-        t1 = relay.TensorType((5, n, 5))
-        t2 = relay.TensorType((n, 1))
-        x = relay.var("x", t1)
-        y = relay.var("y", t2)
-        z = opfunc(x, y)
-        # test printer
-        assert ("{}(%x, %y)".format(z.op.name)) in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == t1
-
-        if ref is not None:
-            t1 = relay.TensorType((5, 10, 5))
-            t2 = relay.TensorType((5, 10, 5))
-            x = relay.var("x", t1)
-            y = relay.var("y", t2)
-            z = opfunc(x, y)
-            x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
-            y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
-            ref_res = ref(x_data, y_data)
-            func = relay.Function([x, y], z)
-
-            for target, dev in tvm.testing.enabled_targets():
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data, y_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    for opfunc, ref in [(relay.power, np.power)]:
-        check_binary_op(opfunc, ref)
-
-
-@tvm.testing.uses_gpu
-def test_cmp_type():
-    for op, ref in (
-        (relay.greater, np.greater),
-        (relay.greater_equal, np.greater_equal),
-        (relay.less, np.less),
-        (relay.less_equal, np.less_equal),
-        (relay.equal, np.equal),
-        (relay.not_equal, np.not_equal),
-    ):
-        x = relay.var("x", relay.TensorType((10, 4), "float32"))
-        y = relay.var("y", relay.TensorType((5, 10, 1), "float32"))
-        z = op(x, y)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType((5, 10, 4), "bool")
-
-        if ref is not None:
-            x_shape = (10, 4)
-            y_shape = (5, 10, 1)
-            t1 = relay.TensorType(x_shape)
-            t2 = relay.TensorType(y_shape)
-            x = relay.var("x", t1)
-            y = relay.var("y", t2)
-            z = op(x, y)
-            x_data = np.random.rand(*x_shape).astype(t1.dtype)
-            y_data = np.random.rand(*y_shape).astype(t2.dtype)
-            ref_res = ref(x_data, y_data)
-            func = relay.Function([x, y], z)
-
-            for target, dev in tvm.testing.enabled_targets():
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data, y_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_binary_int_broadcast_1():
-    for op, ref in [(relay.right_shift, np.right_shift), (relay.left_shift, np.left_shift)]:
-        x = relay.var("x", relay.TensorType((10, 4), "int32"))
-        y = relay.var("y", relay.TensorType((5, 10, 1), "int32"))
-        z = op(x, y)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType((5, 10, 4), "int32")
-
-        if ref is not None:
-            x_shape = (10, 4)
-            y_shape = (5, 10, 1)
-            t1 = relay.TensorType(x_shape, "int32")
-            t2 = relay.TensorType(y_shape, "int32")
-            x_data = np.random.randint(1, 10000, size=(x_shape)).astype(t1.dtype)
-            y_data = np.random.randint(1, 31, size=(y_shape)).astype(t2.dtype)
-            func = relay.Function([x, y], z)
-            ref_res = ref(x_data, y_data)
-
-            for target, dev in tvm.testing.enabled_targets():
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data, y_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_binary_int_broadcast_2():
-    for op, ref in [(relay.maximum, np.maximum), (relay.minimum, np.minimum), (relay.mod, np.mod)]:
-        x = relay.var("x", relay.TensorType((10, 4), "int32"))
-        y = relay.var("y", relay.TensorType((5, 10, 1), "int32"))
-        z = op(x, y)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType((5, 10, 4), "int32")
-
-        if ref is not None:
-            x_shape = (10, 4)
-            y_shape = (5, 10, 1)
-            t1 = relay.TensorType(x_shape, "int32")
-            t2 = relay.TensorType(y_shape, "int32")
-            x_data = np.random.randint(1, 10000, size=(x_shape)).astype(t1.dtype)
-            y_data = np.random.randint(1, 10000, size=(y_shape)).astype(t2.dtype)
-            func = relay.Function([x, y], z)
-            ref_res = ref(x_data, y_data)
-
-            for target, dev in tvm.testing.enabled_targets():
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    x_data, y_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-
-@tvm.testing.uses_gpu
-def test_where(executor_kind):
-    def run(func, inputs, ref_res):
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                *inputs
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    def verify(x_np, y_np, cond_np):
-        ref_res = np.where(cond_np, x_np, y_np)
-
-        args = []
-        args_np = []
-        vs = []
-
-        cond = relay.var("cond", relay.TensorType(cond_np.shape, "bool"))
-
-        args.append(cond)
-        args_np.append(cond_np)
-
-        for v_name, v_np in [("x", x_np), ("y", y_np)]:
-            if len(v_np.shape) == 0:
-                v = relay.const(v_np.item())
-            else:
-                v = relay.var(v_name, relay.TensorType(v_np.shape, dtype))
-                args.append(v)
-                args_np.append(v_np)
-            vs.append(v)
-
-        z = relay.where(cond, vs[0], vs[1])
-
-        func = relay.Function(args, z)
-
-        run(func, args_np, ref_res)
-
-    dtype = "float32"
-
-    x_np = np.random.uniform(size=(3, 4)).astype(dtype)
-    y_np = np.random.uniform(size=(3, 4)).astype(dtype)
-    cond_np = np.random.uniform(low=-1, high=1, size=(3, 4)) > 0
-
-    verify(x_np, y_np, cond_np)
-
-    x_np = np.array(1.0, dtype)
-    y_np = np.array(-1.0, dtype)
-    cond_np = np.array([1, 0, 1], dtype=bool)
-
-    verify(x_np, y_np, cond_np)
-
-    x_np = np.arange(10).astype(dtype)
-    y_np = 10 * x_np
-    cond_np = x_np < 5
-
-    verify(x_np, y_np, cond_np)
-
-    x_np = np.array([[1, 2], [3, 4]], dtype)
-    y_np = np.array([[5, 6], [7, 8]], dtype)
-    cond_np = np.array([[1], [0]], dtype=bool)
-
-    verify(x_np, y_np, cond_np)
-    verify(x_np, y_np, cond_np.T)
-
-    x_np = np.random.randn(1, 12, 8, 8).astype(dtype)
-    y_np = np.array(-1.0, dtype)
-    cond_np = np.random.randn(1, 1, 8, 8) > 0
-
-    verify(x_np, y_np, cond_np)
-
-    x_np, y_np = np.ogrid[:3, :4]
-    cond_np = np.where(x_np < y_np, x_np, 10 + y_np).astype(bool)
-
-    verify(x_np.astype(dtype), y_np.astype(dtype), cond_np)
-
-
-def _with_keepdims(func):
-    def _wrapper(data, axis=None, keepdims=False):
-        if not keepdims:
-            return func(data, axis=axis)
-        else:
-            if axis is not None:
-                axis = axis if isinstance(axis, int) else axis[0]
-                out_shape = list(data.shape)
-                out_shape[axis] = 1
-            else:
-                out_shape = [1 for _ in range(len(data.shape))]
-            return func(data, axis=axis).reshape(out_shape)
-
-    return _wrapper
-
-
-def _np_log_sum_exp(x, axis, keepdims=False):
-    max_x = np.max(x, axis=axis, keepdims=True)
-    x = np.log(np.sum(np.exp(x - max_x), axis=axis, keepdims=True))
-    x = x + max_x
-    if not keepdims:
-        x = np.squeeze(x, axis=axis)
-    return x
-
-
-def _unbiased_relay_wrapper(f):
-    def _unbiased_func(x, axis=None, keepdims=False, exclude=False):
-        return f(x, axis=axis, keepdims=keepdims, exclude=exclude, unbiased=True)
-
-    return _unbiased_func
-
-
-def _unbiased_np_wrapper(f):
-    def _unbiased_func(a, axis=None, dtype=None, keepdims=None):
-        return f(a, axis=axis, dtype=dtype, ddof=1, keepdims=keepdims)
-
-    return _unbiased_func
-
-
-class TestReduceFunctions:
-    funcs = {
-        "sum": (relay.sum, np.sum),
-        "max": (relay.max, np.max),
-        "min": (relay.min, np.min),
-        "mean": (relay.mean, np.mean),
-        "var": (relay.variance, np.var),
-        "unbiased_var": (_unbiased_relay_wrapper(relay.variance), _unbiased_np_wrapper(np.var)),
-        "std": (relay.std, np.std),
-        "unbiased_std": (_unbiased_relay_wrapper(relay.std), _unbiased_np_wrapper(np.std)),
-        "prod": (relay.prod, np.prod),
-        "all": (relay.all, np.all),
-        "any": (relay.any, np.any),
-        "logsumexp": (relay.logsumexp, _np_log_sum_exp),
-        "argmin": (relay.argmin, _with_keepdims(np.argmin)),
-        "argmax": (relay.argmax, _with_keepdims(np.argmax)),
-    }
-    relay_func, ref_func = tvm.testing.parameters(
-        *funcs.values(),
-        ids=list(funcs),
-    )
-
-    d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4")
-
-    data, axis, keepdims, exclude, output = tvm.testing.parameters(
-        ((d1, d2, d3, d4), None, False, False, ()),
-        ((d1, d2, d3, d4), 2, True, False, (d1, d2, 1, d4)),
-        ((d1, d2, d3, d4), 0, True, False, (1, d2, d3, d4)),
-        ((d1, d2, d3), 1, True, False, (d1, 1, d3)),
-        ((d1, d2, d3), 0, True, False, (1, d2, d3)),
-        ((d1, d2, d3), None, True, False, (1, 1, 1)),
-        ((d1, d2, d3), (0, 1), True, False, (1, 1, d3)),
-        ((2, 3, 4), 1, True, False, (2, 1, 4)),
-        ((2, 3, 4), (1,), True, False, (2, 1, 4)),
-        ((2, 3, 4), -1, True, False, (2, 3, 1)),
-        ((2, 3, 4), (0, 1, 2), False, False, ()),
-        ((4, 4, 3), None, False, False, ()),
-        ((4, 4, 3), (0, 2), False, False, (4,)),
-        ((128, 24, 128), (0, 1), False, False, (128,)),
-        ((128, 24, 128), (0, 2), False, False, (24,)),
-        ((128, 24, 128), (0, 1), True, False, (1, 1, 128)),
-        ((128, 24, 128), (0, 2), True, False, (1, 24, 1)),
-    )
-
-    def test_reduce(
-        self,
-        target,
-        dev,
-        relay_func,
-        ref_func,
-        executor_kind,
-        data,
-        axis,
-        keepdims,
-        exclude,
-        output,
-    ):
-        dtype = "bool" if ref_func in [np.all, np.any] else "float32"
-        out_type = "int32" if relay_func in [relay.argmin, relay.argmax] else dtype
-
-        target = tvm.target.Target(target)
-        if target.kind.name == "vulkan" and dtype == "bool":
-            pytest.xfail("Known failing test on vulkan runtime")
-
-        x = relay.var("x", relay.TensorType(data, dtype))
-        if relay_func == relay.logsumexp:
-            z = relay_func(x, axis, keepdims)
-        else:
-            z = relay_func(x, axis, keepdims, exclude)
-        zz = run_infer_type(z)
-        if axis:
-            assert "axis=" in z.astext()
-        if keepdims:
-            assert "keepdims=" in z.astext()
-        if exclude:
-            assert "exclude=" in z.astext()
-        assert zz.checked_type == relay.ty.TensorType(output, out_type)
-
-        if all(isinstance(v, tvm.tir.Var) == 1 for v in data):
-            return
-
-        func = relay.Function([x], z)
-        x_data = (
-            np.random.choice([True, False], size=data)
-            if ref_func in [np.all]
-            else np.random.uniform(size=data).astype(dtype)
-        )
-
-        if ref_func in [np.sum]:
-            ref_res = ref_func(x_data + 0, axis=axis, dtype=dtype, keepdims=keepdims)
-        elif ref_func in [np.max, np.min, np.mean, np.prod]:
-            ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
-        else:  # argmin/argmax
-            if axis and not isinstance(axis, int) and len(axis) > 1:
-                return
-            ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
-
-        op_res1 = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_sum_with_bool_input():
-    def verify(dshape, axis, keepdims, exclude):
-        x = relay.var("x", relay.TensorType(dshape, "bool"))
-
-        y = relay.sum(x, axis, keepdims, exclude)
-
-        func = relay.Function([x], y)
-        func = run_infer_type(func)
-
-        text = func.astext()
-        assert "sum" in text
-
-        data = np.random.choice([False, True], size=dshape)
-
-        if exclude and axis is not None:
-            axis = tuple(set(range(len(dshape))) - set(axis))
-
-        ref_res = np.sum(data, axis, keepdims=keepdims, dtype="bool")
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    verify((3, 5, 7, 9), None, False, False)
-    verify((3, 5, 7, 9), None, True, False)
-    verify((3, 5, 7, 9), (0,), False, False)
-    verify((3, 5, 7, 9), (1,), True, False)
-    verify((3, 5, 7, 9), (2, 3), False, True)
-    verify((3, 5, 7, 9), (0, 2), True, True)
-    verify((3, 5, 7, 9), (0, 1, 2, 3), False, False)
-    verify((3, 5, 7, 9), (0, 1, 2, 3), False, True)
-    verify((3, 5, 7, 9), (0, 1, 2, 3), True, False)
-    verify((3, 5, 7, 9), (0, 1, 2, 3), True, True)
-
-
-@tvm.testing.uses_gpu
-def test_argmin_argmax_get_last_elements():
-    def get_test_case(shape, gt_func, test_argmin=False):
-        total_ele = np.product(shape)
-        arr = np.zeros(total_ele)
-        target_value = -1 if test_argmin else 1
-        arr[: total_ele // 3] = target_value
-        np.random.shuffle(arr)
-        arr = arr.reshape(shape)
-        ans = gt_func(np.flip(arr))
-        return arr, len(arr) - ans - 1
-
-    funcs_and_gt_funcs = [(relay.argmax, np.argmax), (relay.argmin, np.argmin)]
-    lengths = [5, 10, 15]
-    for func, gt_func in funcs_and_gt_funcs:
-        for shape in lengths:
-            x_in = relay.var("x_in", shape=[shape])
-            output = func(x_in, select_last_index=True)
-            arr, ans = get_test_case(shape, gt_func, test_argmin=func == relay.argmin)
-
-            mod = tvm.IRModule.from_expr(output)
-            for target, dev in tvm.testing.enabled_targets():
-                op_res = relay.create_executor(
-                    "graph", mod=mod, device=dev, target=target
-                ).evaluate()(arr)
-                assert op_res.numpy().item() == ans
-
-
-def verify_mean_var_std(executor_kind, funcs, shape, axis, keepdims, dtype="float32"):
-    test_func = funcs[0]
-    ref_func = funcs[1]
-
-    x = relay.var("x", relay.TensorType(shape, dtype))
-    z = test_func(x, axis, keepdims)
-    func = relay.Function([x], z.astuple())
-    x_data = np.random.uniform(size=shape).astype("float32")
-    ref_mean = np.mean(x_data, axis=axis, dtype="float32", keepdims=keepdims).astype(dtype)
-    ref_res = ref_func(x_data, axis=axis, dtype="float32", keepdims=keepdims).astype(dtype)
-
-    for target, dev in tvm.testing.enabled_targets():
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data.astype(dtype)
-        )
-        # FP16 is always a little less accurate.
-        if dtype == "float16":
-            rtol, atol = (1e-2, 1e-2)
-        else:
-            rtol, atol = (1e-5, 1e-5)
-        tvm.testing.assert_allclose(op_res[0].numpy(), ref_mean, rtol=rtol, atol=atol)
-        tvm.testing.assert_allclose(op_res[1].numpy(), ref_res, rtol=rtol, atol=atol)
-
-
-@tvm.testing.uses_gpu
-def test_mean_var_std(executor_kind):
-    for func in [[relay.mean_variance, np.var], [relay.mean_std, np.std]]:
-        verify_mean_var_std(executor_kind, func, (2, 3, 4), 1, True)
-        verify_mean_var_std(executor_kind, func, (2, 3, 4), (1,), True)
-        verify_mean_var_std(executor_kind, func, (2, 3, 4), -1, True)
-        verify_mean_var_std(executor_kind, func, (2, 3, 4), (0, 1, 2), False)
-        verify_mean_var_std(executor_kind, func, (4, 4, 3), None, False)
-        verify_mean_var_std(executor_kind, func, (4, 4, 3), (0, 2), False)
-        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), False)
-        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), False)
-        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), True)
-        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), True)
-        # Test FP16 reduction with large indices.
-        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), True, "float16")
-        verify_mean_var_std(executor_kind, func, (128, 24, 128), None, False, "float16")
-
-
-@tvm.testing.uses_gpu
-def test_strided_slice():
-    def verify(
-        dshape,
-        begin,
-        end,
-        strides,
-        output,
-        axes=None,
-        slice_mode="end",
-        test_ref=True,
-        dtype="int32",
-        unknown_dim_value=10,
-    ):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        ndim = len(dshape)
-        begin = begin if begin else [0] * ndim
-        end = end if end else list(dshape)
-
-        # Resolve unknown dimensions to create test case:
-        dshape = list(dshape)
-        for i, d in enumerate(dshape):
-            if not isinstance(d, int):
-                dshape[i] = unknown_dim_value
-        x_data = np.random.uniform(size=dshape).astype("float32")
-
-        ref_res = tvm.topi.testing.strided_slice_python(
-            x_data,
-            begin,
-            end,
-            strides,
-            slice_mode,
-            axes=axes,
-        )
-
-        if strides:
-            z = relay.strided_slice(
-                x, begin=begin, end=end, strides=strides, axes=axes, slice_mode=slice_mode
-            )
-        else:
-            z = relay.strided_slice(x, begin=begin, end=end, axes=axes, slice_mode=slice_mode)
-        func = relay.Function([x], z)
-
-        func = run_infer_type(func)
-        text = func.astext()
-        assert "begin=" in text
-        assert "end=" in text
-
-        if output:
-            assert func.body.checked_type == relay.ty.TensorType(output, "float32")
-
-        if not test_ref:
-            return
-        for target, dev in tvm.testing.enabled_targets():
-            # Need VM to run tests with non-static dimensions
-            op_res = relay.create_executor("vm", device=dev, target=target).evaluate(func)(x_data)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    verify((1, 3, 10, 10), [0, 0, 0, 0], [-1, 3, 10, 10], [1], (0, 3, 10, 10), dtype="int64")
-    verify(
-        (1, 224, 224, 3),
-        [0, 20, 20, 0],
-        [1, 140, 140, 3],
-        [1, 1, 1, 1],
-        (1, 120, 120, 3),
-        dtype="int64",
-    )
-
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16")
-    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
-    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
-
-    # Test backwards slicing.
-    verify((3, 4, 3), [-1, -1, -1], [-5, -5, -5], [-1, -1, -1], (3, 4, 3))
-    # Test slicing with overlarge indices.
-    verify((3, 4, 3), [0, 0, 0], [np.iinfo(np.int32).max] * 3, [1, 1, 1], (3, 4, 3))
-    # Test slice mode.
-    verify(
-        (3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], (2, 4, 3), slice_mode="size", test_ref=False
-    )
-
-    verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], (2, 2, 3), slice_mode="size", test_ref=True)
-    verify((3, 4, 3), [1], [4], None, None, axes=[1])
-
-    # Test Any dims for simple cases
-    verify((3, relay.Any()), [0], [1], [1], None, axes=[1], unknown_dim_value=10)
-    verify((relay.Any(), 3), [0], [1], [1], None, axes=[1], unknown_dim_value=10)
-    verify(
-        (relay.Any(), relay.Any(), relay.Any()),
-        [0, 1, 2],
-        [5, 5, 5],
-        [1, 2, 1],
-        None,
-        unknown_dim_value=10,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_dyn_strided_slice():
-    def verify(
-        dshape,
-        begin,
-        end,
-        strides,
-        output,
-        axes=None,
-        ishape=None,
-        slice_mode="end",
-        test_ref=True,
-        dtype="int32",
-    ):
-        ndim = len(dshape)
-        begin = begin if begin else [0] * ndim
-        end = end if end else list(dshape)
-
-        # target numpy result
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        ref_res = tvm.topi.testing.strided_slice_python(
-            x_data, begin, end, strides, slice_mode, axes=axes
-        )
-
-        if ishape is None:
-            ishape = (relay.Any(),) * ndim
-
-        x = relay.var("x", relay.TensorType(ishape, "float32"))
-        if strides:
-            z = relay.strided_slice(
-                x, begin=begin, end=end, strides=strides, axes=axes, slice_mode=slice_mode
-            )
-        else:
-            z = relay.strided_slice(x, begin=begin, end=end, axes=axes, slice_mode=slice_mode)
-        func = relay.Function([x], z)
-
-        func = run_infer_type(func)
-        text = func.astext()
-        assert "begin=" in text
-        assert "end=" in text
-
-        if not test_ref:
-            return
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor("vm", mod=mod, device=dev, target=target).evaluate()(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    verify(
-        (1, 224, 224, 3),
-        [0, 20, 20, 0],
-        [1, 140, 140, 3],
-        [1, 1, 1, 1],
-        (1, 120, 120, 3),
-        dtype="int64",
-    )
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16")
-    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 4], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
-    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
-    verify(
-        (3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], (2, 4, 3), slice_mode="size", test_ref=False
-    )
-    verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], (2, 2, 3), slice_mode="size", test_ref=True)
-    verify(
-        (3, 4, 3, 2),
-        [1, 0],
-        [3, 1],
-        [1, 1],
-        None,
-        axes=[1, 3],
-        ishape=(relay.Any(), 4, relay.Any(), 2),
-    )
-
-
-@tvm.testing.uses_gpu
-def test_strided_set():
-    def verify(dshape, begin, end, strides, vshape, test_ref=True):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        v = relay.var("v", relay.TensorType(vshape, "float32"))
-        begin_c = relay.const(begin, dtype="int32")
-        end_c = relay.const(end, dtype="int32")
-        if strides:
-            strides_c = relay.const(strides, dtype="int32")
-            z = relay.strided_set(x, v, begin=begin_c, end=end_c, strides=strides_c)
-        else:
-            z = relay.strided_set(x, v, begin=begin_c, end=end_c)
-        func = relay.Function([x, v], z)
-        func = run_infer_type(func)
-        text = func.astext()
-        assert "strided_set" in text
-        assert func.body.checked_type == relay.ty.TensorType(dshape, "float32")
-        if not test_ref:
-            return
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        v_data = np.random.uniform(size=vshape).astype("float32")
-        ref_res = tvm.topi.testing.strided_set_python(x_data, v_data, begin, end, strides)
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                x_data, v_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
-
-    verify((3, 4, 16), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
-    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3))
-    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
-    verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2))
-    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
deleted file mode 100644
index 4f68256ab5d8..000000000000
--- a/tests/python/relay/test_op_level5.py
+++ /dev/null
@@ -1,1601 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level5 operator test cases.
-"""
-import math
-import platform
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import relay, te
-from tvm.relay.testing import run_infer_type
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-def test_resize1d_infer_type():
-    n, c, w = te.size_var("n"), te.size_var("c"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, w), "int8"))
-    tw = te.var("tw")
-    z = relay.image.resize1d(x, (tw,))
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, tw), "int8")
-
-    x = relay.var("x", relay.TensorType((n, c, w), "int8"))
-    z = relay.image.resize1d(x, (200,), None, "NCW", "linear", "align_corners")
-    assert "size=" in z.astext()
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, 200), "int8")
-
-
-class TestResize1D:
-    interpolate_method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic")
-    coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel")
-
-    layout = tvm.testing.parameter("NWC", "NCW")
-    dshape, scale = tvm.testing.parameters(
-        ((1, 4, 4), 2),
-        ((2, 8, 17), 3),
-        ((2, 8, 17), 3),
-        ((3, 4, 5), 5),
-    )
-
-    def test_resize(
-        self, target, dev, executor_kind, dshape, scale, interpolate_method, layout, coord_trans
-    ):
-        target_kind = tvm.target.Target(target).kind.name
-        if (
-            target_kind == "vulkan"
-            and dshape == (3, 4, 5)
-            and scale == 5
-            and interpolate_method == "nearest_neighbor"
-            and coord_trans == "align_corners"
-        ):
-            pytest.xfail("Known failing case for these parameters")
-
-        if layout == "NWC":
-            size = (dshape[1] * scale,)
-        else:
-            size = (dshape[2] * scale,)
-
-        x_data = np.random.uniform(size=dshape).astype("float32")
-
-        ref_res = tvm.topi.testing.resize1d_python(
-            x_data, (scale,), layout, interpolate_method, coord_trans
-        )
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.image.resize1d(
-            x, size, None, layout, interpolate_method, coordinate_transformation_mode=coord_trans
-        )
-        assert "size=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-4)
-
-
-def test_resize2d_infer_type():
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
-    th, tw = te.var("th"), te.var("tw")
-    z = relay.image.resize2d(x, (th, tw))
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, th, tw), "int8")
-
-    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
-    z = relay.image.resize2d(x, (100, 200), None, "NCHW", "linear", "align_corners")
-    assert "size=" in z.astext()
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8")
-
-
-class TestResize2D:
-    interpolate_method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic")
-    coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel")
-
-    layout = tvm.testing.parameter("NHWC", "NCHW")
-
-    dshape, scale = tvm.testing.parameters(
-        ((1, 4, 4, 4), 2),
-        ((2, 8, 17, 20), 3),
-        ((2, 8, 17, 20), 3),
-        ((3, 4, 5, 6), 5),
-    )
-
-    def test_resize(
-        self, target, dev, executor_kind, dshape, scale, interpolate_method, layout, coord_trans
-    ):
-        target_kind = tvm.target.Target(target).kind.name
-        if (
-            target_kind == "vulkan"
-            and dshape == (3, 4, 5, 6)
-            and scale == 5
-            and interpolate_method == "nearest_neighbor"
-            and coord_trans == "align_corners"
-        ):
-            pytest.xfail("Known failing case for these parameters")
-
-        if layout == "NHWC":
-            size = (dshape[1] * scale, dshape[2] * scale)
-        else:
-            size = (dshape[2] * scale, dshape[3] * scale)
-
-        x_data = np.random.uniform(size=dshape).astype("float32")
-
-        ref_res = tvm.topi.testing.resize2d_python(
-            x_data, (scale, scale), layout, interpolate_method, coord_trans
-        )
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.image.resize2d(
-            x, size, None, layout, interpolate_method, coordinate_transformation_mode=coord_trans
-        )
-        assert "size=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-4)
-
-
-def test_resize3d_infer_type():
-    n, c, d, h, w = (
-        te.size_var("n"),
-        te.size_var("c"),
-        te.size_var("d"),
-        te.size_var("h"),
-        te.size_var("w"),
-    )
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "int8"))
-    td, th, tw = te.var("td"), te.var("th"), te.var("tw")
-    z = relay.image.resize3d(x, (td, th, tw))
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, td, th, tw), "int8")
-
-    x = relay.var("x", relay.TensorType((n, c, d, h, w), "int8"))
-    z = relay.image.resize3d(x, (10, 10, 20), None, "NCDHW", "linear", "align_corners")
-    assert "size=" in z.astext()
-    zz = run_infer_type(z)
-    assert zz.checked_type == relay.TensorType((n, c, 10, 10, 20), "int8")
-
-
-class TestResize3D:
-    interpolate_method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic")
-    coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel")
-
-    layout = tvm.testing.parameter("NDHWC", "NCDHW")
-
-    dshape, scale = tvm.testing.parameters(
-        ((1, 4, 4, 4, 4), 2),
-    )
-
-    def test_resize(
-        self, target, dev, executor_kind, dshape, scale, interpolate_method, layout, coord_trans
-    ):
-        if layout == "NDHWC":
-            size = (dshape[1] * scale, dshape[2] * scale, dshape[3] * scale)
-        else:
-            size = (dshape[2] * scale, dshape[3] * scale, dshape[4] * scale)
-
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        ref_res = tvm.topi.testing.resize3d_python(
-            x_data, (scale, scale, scale), layout, interpolate_method, coord_trans
-        )
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.image.resize3d(x, size, None, layout, interpolate_method, coord_trans)
-        assert "size=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
-
-
-class TestCropAndResize:
-    interpolate_method = tvm.testing.parameter("bilinear", "nearest_neighbor")
-    layout = tvm.testing.parameter("NHWC", "NCHW")
-
-    @pytest.mark.skipif(
-        platform.machine() == "aarch64",
-        reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
-    )
-    def test_crop_and_resize(self, target, dev, executor_kind, layout, interpolate_method):
-        target_kind = tvm.target.Target(target).kind.name
-        if (
-            target_kind == "vulkan"
-            and layout == "NHWC"
-            and interpolate_method == "nearest_neighbor"
-        ):
-            pytest.xfail("Known failing case for these parameters")
-
-        extrapolation_value = 0.0
-
-        np.random.seed(0)
-
-        eps = 1e-4
-
-        if layout == "NHWC":
-            img_shape = (10, 224, 224, 3)
-            boxes = np.random.uniform(size=(2, 4)).astype("float32")
-            box_indices = np.array([1, 0]).astype("int32")
-            crop_size = np.array([20, 30]).astype("int32")
-        elif layout == "NCHW":
-            img_shape = (5, 3, 255, 255)
-            boxes = np.random.uniform(size=(2, 4)).astype("float32")
-            box_indices = np.array([0, 1]).astype("int32")
-            crop_size = np.array([30, 30]).astype("int32")
-        else:
-            raise ValueError(f"Unknown layout: {layout}")
-
-        image_data = np.random.uniform(size=img_shape).astype("float32")
-
-        ref_res = tvm.topi.testing.crop_and_resize_python(
-            image_data,
-            boxes,
-            box_indices,
-            crop_size,
-            layout,
-            interpolate_method,
-            extrapolation_value,
-        )
-
-        img = relay.var("img", relay.TensorType(img_shape, "float32"))
-        bx = relay.var("bx", relay.TensorType(boxes.shape, "float32"))
-        bx_idx = relay.var("bx_idx", relay.TensorType(box_indices.shape, "int32"))
-
-        z = relay.image.crop_and_resize(
-            img, bx, bx_idx, list(crop_size), layout, interpolate_method, extrapolation_value
-        )
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([img, bx, bx_idx], z)
-
-        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            image_data, boxes, box_indices
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-04)
-
-
-@tvm.testing.uses_gpu
-def test_multibox_prior(executor_kind):
-    def get_ref_result(
-        dshape, sizes=(1.0,), ratios=(1.0,), steps=(-1.0, -1.0), offsets=(0.5, 0.5), clip=True
-    ):
-        in_height = dshape[2]
-        in_width = dshape[3]
-        num_sizes = len(sizes)
-        num_ratios = len(ratios)
-        size_ratio_concat = sizes + ratios
-        steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
-        steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
-        offset_h = offsets[0]
-        offset_w = offsets[1]
-
-        oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4)
-        dtype = "float32"
-        np_out = np.zeros(oshape).astype(dtype)
-
-        for i in range(in_height):
-            center_h = (i + offset_h) * steps_h
-            for j in range(in_width):
-                center_w = (j + offset_w) * steps_w
-                for k in range(num_sizes + num_ratios - 1):
-                    w = (
-                        size_ratio_concat[k] * in_height / in_width / 2.0
-                        if k < num_sizes
-                        else size_ratio_concat[0]
-                        * in_height
-                        / in_width
-                        * math.sqrt(size_ratio_concat[k + 1])
-                        / 2.0
-                    )
-                    h = (
-                        size_ratio_concat[k] / 2.0
-                        if k < num_sizes
-                        else size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0
-                    )
-                    count = (
-                        i * in_width * (num_sizes + num_ratios - 1)
-                        + j * (num_sizes + num_ratios - 1)
-                        + k
-                    )
-                    np_out[0][count][0] = center_w - w
-                    np_out[0][count][1] = center_h - h
-                    np_out[0][count][2] = center_w + w
-                    np_out[0][count][3] = center_h + h
-        if clip:
-            np_out = np.clip(np_out, 0, 1)
-
-        return np_out
-
-    def verify_multibox_prior(
-        x,
-        dshape,
-        ref_res,
-        sizes=(1.0,),
-        ratios=(1.0,),
-        steps=(-1.0, -1.0),
-        offsets=(0.5, 0.5),
-        clip=True,
-        check_size=False,
-        check_type_only=False,
-    ):
-
-        z = relay.vision.multibox_prior(x, sizes, ratios, steps, offsets, clip)
-        zz = run_infer_type(z)
-        if check_size:
-            assert "sizes=" in z.astext()
-        assert zz.checked_type == relay.TensorType(
-            (1, dshape[2] * dshape[3] * (len(sizes) + len(ratios) - 1), 4), "float32"
-        )
-
-        if check_type_only:
-            return
-
-        data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
-        func = relay.Function([x], z)
-        func = run_infer_type(func)
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    sizes = (0.3, 1.5, 0.7)
-    ratios = (1.3, 2.4)
-    steps = (2.0, 1.5)
-    offsets = (0.2, 0.3)
-    dshape = (1, 3, 56, 56)
-    ref_res = get_ref_result(dshape, sizes, ratios, steps, offsets)
-    x = relay.var("x", relay.TensorType(dshape, "float32"))
-    verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets, check_size=True)
-    y = relay.var("y", relay.TensorType((te.size_var("n"), 3, 56, 56), "float32"))
-    verify_multibox_prior(
-        x, dshape, ref_res, sizes, ratios, steps, offsets, check_size=True, check_type_only=True
-    )
-
-    dshape = (1, 24, 32, 32)
-    ref_res = get_ref_result(dshape, clip=False)
-    x = relay.var("x", relay.TensorType(dshape, "float32"))
-    verify_multibox_prior(x, dshape, ref_res, clip=False)
-    y = relay.var("y", relay.TensorType((te.size_var("n"), 24, 32, 32), "float32"))
-    verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True)
-
-
-@tvm.testing.uses_gpu
-def test_get_valid_counts():
-    def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
-        dtype = "float32"
-        batch_size, num_anchor, elem_length = dshape
-        np_data = np.random.uniform(low=-2, high=2, size=dshape).astype(dtype)
-        np_out1 = np.zeros(shape=(batch_size,))
-        np_out2 = np.zeros(shape=dshape).astype(dtype)
-        np_out3 = np.zeros(shape=(batch_size, num_anchor))
-        for i in range(batch_size):
-            np_out1[i] = 0
-            inter_idx = 0
-            for j in range(num_anchor):
-                score = np_data[i, j, score_index]
-                if score > score_threshold and (id_index < 0 or np_data[i, j, id_index] >= 0):
-                    for k in range(elem_length):
-                        np_out2[i, inter_idx, k] = np_data[i, j, k]
-                    np_out1[i] += 1
-                    np_out3[i, inter_idx] = j
-                    inter_idx += 1
-                if j >= np_out1[i]:
-                    for k in range(elem_length):
-                        np_out2[i, j, k] = -1.0
-                    np_out3[i, j] = -1
-
-        x = relay.var("x", relay.ty.TensorType(dshape, dtype))
-        z = relay.vision.get_valid_counts(x, score_threshold, id_index, score_index)
-        assert "score_threshold" in z.astext()
-        func = relay.Function([x], z.astuple())
-        func = run_infer_type(func)
-        for target, dev in tvm.testing.enabled_targets():
-            out = relay.create_executor("vm", device=dev, target=target).evaluate(func)(np_data)
-
-            tvm.testing.assert_allclose(out[0].numpy(), np_out1, rtol=1e-3, atol=1e-04)
-            tvm.testing.assert_allclose(out[1].numpy(), np_out2, rtol=1e-3, atol=1e-04)
-            tvm.testing.assert_allclose(out[2].numpy(), np_out3, rtol=1e-3, atol=1e-04)
-
-    verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
-    verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
-    verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
-    verify_get_valid_counts((16, 500, 5), 0.95, -1, 0)
-
-
-@tvm.testing.uses_gpu
-def test_non_max_suppression(executor_kind):
-    def verify_nms(
-        x0_data,
-        x1_data,
-        x2_data,
-        x3_data,
-        dshape,
-        ref_res,
-        ref_indices_res,
-        iou_threshold=0.5,
-        force_suppress=False,
-        top_k=-1,
-        check_type_only=False,
-    ):
-        x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
-        x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32"))
-        x2 = relay.var("x2", relay.ty.TensorType((dshape[0], dshape[1]), "int32"))
-        x3 = relay.var("x3", relay.ty.TensorType((), "int32"))
-        z = relay.vision.non_max_suppression(
-            x0,
-            x1,
-            x2,
-            x3,
-            iou_threshold=iou_threshold,
-            force_suppress=force_suppress,
-            top_k=top_k,
-            return_indices=False,
-        )
-        z_indices = relay.vision.non_max_suppression(
-            x0,
-            x1,
-            x2,
-            x3,
-            iou_threshold=iou_threshold,
-            force_suppress=force_suppress,
-            top_k=top_k,
-            return_indices=True,
-        )
-        if isinstance(z_indices, relay.expr.TupleWrapper):
-            z_indices = z_indices.astuple()
-        zz = run_infer_type(z)
-        zz_indices = run_infer_type(z_indices)
-        assert zz.checked_type == relay.ty.TensorType(dshape, "float32")
-        assert zz_indices.checked_type == relay.ty.TupleType(
-            [
-                relay.ty.TensorType((dshape[0], dshape[1]), "int32"),
-                relay.ty.TensorType((dshape[0], 1), "int32"),
-            ]
-        )
-
-        if check_type_only:
-            return
-
-        func = relay.Function([x0, x1, x2, x3], z)
-        func = run_infer_type(func)
-        func_indices = relay.Function([x0, x1, x2, x3], z_indices)
-        func_indices = run_infer_type(func_indices)
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x0_data, x1_data, x2_data, x3_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-            op_indices_res = relay.create_executor(
-                executor_kind, device=dev, target=target
-            ).evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
-            tvm.testing.assert_allclose(op_indices_res[0].numpy(), ref_indices_res, rtol=1e-5)
-
-    np_data = np.array(
-        [
-            [
-                [0, 0.8, 1, 20, 25, 45],
-                [1, 0.7, 30, 60, 50, 80],
-                [0, 0.4, 4, 21, 19, 40],
-                [2, 0.9, 35, 61, 52, 79],
-                [1, 0.5, 100, 60, 70, 110],
-            ]
-        ]
-    ).astype("float32")
-    np_valid_count = np.array([4]).astype("int32")
-    np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32")
-    np_max_output_size = -1
-
-    np_result = np.array(
-        [
-            [
-                [2, 0.9, 35, 61, 52, 79],
-                [0, 0.8, 1, 20, 25, 45],
-                [-1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1],
-            ]
-        ]
-    )
-    np_indices_result = np.array([[4, 0, -1, -1, -1]])
-    num_anchors = 5
-
-    dshape = (te.size_var("n"), num_anchors, 6)
-    verify_nms(
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_max_output_size,
-        dshape,
-        np_result,
-        np_indices_result,
-        force_suppress=True,
-        top_k=2,
-        check_type_only=True,
-    )
-    dshape = (1, num_anchors, 6)
-    verify_nms(
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_max_output_size,
-        dshape,
-        np_result,
-        np_indices_result,
-        force_suppress=True,
-        top_k=2,
-        check_type_only=False,
-    )
-
-    np_result = np.array(
-        [
-            [
-                [2, 0.9, 35, 61, 52, 79],
-                [0, 0.8, 1, 20, 25, 45],
-                [-1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1],
-            ]
-        ]
-    )
-    np_indices_result = np.array([[4, 0, -1, -1, -1]])
-    np_max_output_size = 2
-    dshape = (te.size_var("n"), num_anchors, 6)
-    verify_nms(
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_max_output_size,
-        dshape,
-        np_result,
-        np_indices_result,
-        check_type_only=True,
-    )
-    dshape = (1, num_anchors, 6)
-    verify_nms(
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_max_output_size,
-        dshape,
-        np_result,
-        np_indices_result,
-        top_k=2,
-    )
-
-    np_data = np.array(
-        [
-            [
-                [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
-                [1, 0.7, 30, 60, 50, 80, 5, 6, 7, 8],
-                [0, 0.4, 4, 21, 19, 40, 9, 10, 11, 12],
-                [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
-                [1, 0.5, 100, 60, 70, 110, 17, 18, 19, 20],
-            ]
-        ]
-    ).astype("float32")
-    np_result = np.array(
-        [
-            [
-                [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
-                [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
-                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
-            ]
-        ]
-    )
-    dshape = (1, 5, 10)
-    verify_nms(
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_max_output_size,
-        dshape,
-        np_result,
-        np_indices_result,
-        force_suppress=True,
-        top_k=2,
-        check_type_only=False,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_multibox_transform_loc(executor_kind):
-    def test_default_value(keep_background):
-        num_anchors = 3
-        num_classes = 3
-
-        np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]).astype(
-            "float32"
-        )
-        np_loc_preds = np.array(
-            [[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4, -0.8]]
-        ).astype("float32")
-        np_anchors = np.array(
-            [[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2], [1.2, 1.2, 1.5, 1.5]]]
-        ).astype("float32")
-
-        expected_np_out = (
-            np.array(
-                [
-                    [
-                        [2, 0.69999999, 0, 0, 0.10818365, 0.10008108],
-                        [0, 0.49999999, 0, 0, 0.22903419, 0.20435292],
-                        [1, 0.44999999, 1, 1, 1, 1],
-                    ]
-                ]
-            )
-            if keep_background
-            else np.array(
-                [
-                    [
-                        [1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
-                        [0, 0.44999999, 1, 1, 1, 1],
-                        [0, 0.30000001, 0, 0, 0.22903419, 0.20435292],
-                    ]
-                ]
-            )
-        )
-
-        cls_prob = relay.var(
-            "cls_prob", relay.ty.TensorType((1, num_anchors, num_classes), "float32")
-        )
-        loc_pred = relay.var("loc_pred", relay.ty.TensorType((1, num_anchors * 4), "float32"))
-        anchors = relay.var("anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
-
-        mtl = relay.vision.multibox_transform_loc(
-            cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors, keep_background=keep_background
-        )
-        ret = run_infer_type(mtl.astuple())
-        ref_type = relay.ty.TupleType(
-            tvm.runtime.convert(
-                [
-                    relay.ty.TensorType((1, num_anchors, 6), "float32"),
-                    relay.ty.TensorType((1,), "int"),
-                ]
-            )
-        )
-
-        assert ret.checked_type == ref_type
-
-        nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False)
-        func = relay.Function([cls_prob, loc_pred, anchors], nms)
-        func = run_infer_type(func)
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                np_cls_prob, np_loc_preds, np_anchors
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), expected_np_out, rtol=1e-5)
-
-    def test_threshold():
-        num_anchors = 5
-        num_classes = 5
-        n = te.size_var("n")
-        cls_prob = relay.var(
-            "cls_prob", relay.ty.TensorType((n, num_anchors, num_classes), "float32")
-        )
-        loc_pred = relay.var("loc_pred", relay.ty.TensorType((n, num_anchors * 4), "float32"))
-        anchors = relay.var("anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
-        threshold = 0.02
-        variances = (0.2, 0.2, 0.3, 0.3)
-
-        ret = relay.vision.multibox_transform_loc(
-            cls_prob=cls_prob,
-            loc_pred=loc_pred,
-            anchor=anchors,
-            threshold=threshold,
-            variances=variances,
-        )
-        ret = run_infer_type(ret.astuple())
-        ref_type = relay.ty.TupleType(
-            tvm.runtime.convert(
-                [
-                    relay.ty.TensorType((n, num_anchors, 6), "float32"),
-                    relay.ty.TensorType((n,), "int"),
-                ]
-            )
-        )
-        assert ret.checked_type == ref_type
-
-    test_default_value(keep_background=False)
-    test_default_value(keep_background=True)
-    test_threshold()
-
-
-@tvm.testing.uses_gpu
-def test_roi_align(executor_kind):
-    def verify_roi_align(
-        data_shape,
-        rois_shape,
-        channel,
-        in_size,
-        pooled_size,
-        spatial_scale,
-        sample_ratio,
-        mode,
-        layout,
-        ref_func,
-    ):
-        data = relay.var("data", relay.ty.TensorType(data_shape, "float32"))
-        rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32"))
-        z = relay.vision.roi_align(
-            data,
-            rois,
-            pooled_size=(pooled_size, pooled_size),
-            spatial_scale=spatial_scale,
-            sample_ratio=sample_ratio,
-            mode=mode,
-            layout=layout,
-        )
-        zz = run_infer_type(z)
-
-        num_roi = rois_shape[0]
-
-        if layout == "NCHW":
-            assert zz.checked_type == relay.ty.TensorType(
-                (num_roi, channel, pooled_size, pooled_size), "float32"
-            )
-        else:
-            assert zz.checked_type == relay.ty.TensorType(
-                (num_roi, pooled_size, pooled_size, channel), "float32"
-            )
-
-        func = relay.Function([data, rois], z)
-        func = run_infer_type(func)
-        np_data = np.random.uniform(size=data_shape).astype("float32")
-        np_rois = np.random.uniform(size=rois_shape).astype("float32") * in_size
-        np_rois[:, 0] = np.random.randint(low=0, high=data_shape[0], size=num_roi)
-        ref_res = ref_func(
-            np_data,
-            np_rois,
-            pooled_size=pooled_size,
-            spatial_scale=spatial_scale,
-            sample_ratio=sample_ratio,
-            mode=mode,
-        )
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                np_data, np_rois
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, atol=1e-6, rtol=1e-3)
-
-    def verify_roi_align_nchw(
-        data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode
-    ):
-        _, channel, in_size, _ = data_shape
-        return verify_roi_align(
-            data_shape,
-            rois_shape,
-            channel,
-            in_size,
-            pooled_size,
-            spatial_scale,
-            sample_ratio,
-            mode,
-            "NCHW",
-            tvm.topi.testing.roi_align_nchw_python,
-        )
-
-    def verify_roi_align_nhwc(
-        data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode
-    ):
-        _, in_size, _, channel = data_shape
-        return verify_roi_align(
-            data_shape,
-            rois_shape,
-            channel,
-            in_size,
-            pooled_size,
-            spatial_scale,
-            sample_ratio,
-            mode,
-            "NHWC",
-            tvm.topi.testing.roi_align_nhwc_python,
-        )
-
-    verify_roi_align_nchw(
-        (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg"
-    )
-    verify_roi_align_nchw(
-        (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg"
-    )
-    verify_roi_align_nchw(
-        (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max"
-    )
-    verify_roi_align_nchw(
-        (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max"
-    )
-    verify_roi_align_nhwc(
-        (1, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg"
-    )
-    verify_roi_align_nhwc(
-        (4, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg"
-    )
-    verify_roi_align_nhwc(
-        (1, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max"
-    )
-    verify_roi_align_nhwc(
-        (4, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max"
-    )
-
-
-@tvm.testing.uses_gpu
-def test_roi_pool(executor_kind):
-    def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale):
-        data = relay.var("data", relay.ty.TensorType(data_shape, "float32"))
-        rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32"))
-        z = relay.vision.roi_pool(
-            data,
-            rois,
-            pooled_size=(pooled_size, pooled_size),
-            spatial_scale=spatial_scale,
-            layout="NCHW",
-        )
-        zz = run_infer_type(z)
-        batch, channel, in_size, _ = data_shape
-        num_roi = rois_shape[0]
-        assert zz.checked_type == relay.ty.TensorType(
-            (num_roi, channel, pooled_size, pooled_size), "float32"
-        )
-
-        func = relay.Function([data, rois], z)
-        func = run_infer_type(func)
-        np_data = np.random.uniform(size=data_shape).astype("float32")
-        np_rois = np.random.uniform(size=rois_shape).astype("float32") * in_size
-        np_rois[:, 0] = np.random.randint(low=0, high=batch, size=num_roi).astype("float32")
-        ref_res = tvm.topi.testing.roi_pool_nchw_python(
-            np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale
-        )
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                np_data, np_rois
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
-
-    verify_roi_pool((1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0)
-    verify_roi_pool((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5)
-
-
-@tvm.testing.uses_gpu
-def test_proposal(executor_kind):
-    def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs):
-        cls_prob = relay.var("cls_prob", relay.ty.TensorType(np_cls_prob.shape, "float32"))
-        bbox_pred = relay.var("bbox_pred", relay.ty.TensorType(np_bbox_pred.shape, "float32"))
-        im_info = relay.var("im_info", relay.ty.TensorType(np_im_info.shape, "float32"))
-        z = relay.vision.proposal(cls_prob, bbox_pred, im_info, **attrs)
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.ty.TensorType(np_out.shape, "float32")
-
-        func = relay.Function([cls_prob, bbox_pred, im_info], z)
-        func = run_infer_type(func)
-        for target in ["llvm", "cuda"]:
-            if not tvm.testing.device_enabled(target):
-                print("Skip test because %s is not enabled." % target)
-                continue
-            dev = tvm.device(target, 0)
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                np_cls_prob, np_bbox_pred, np_im_info
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-4)
-
-    attrs = {
-        "scales": (0.5,),
-        "ratios": (0.5,),
-        "feature_stride": 16,
-        "iou_loss": False,
-        "rpn_min_size": 16,
-        "threshold": 0.7,
-        "rpn_pre_nms_top_n": 200,
-        "rpn_post_nms_top_n": 4,
-    }
-
-    np_cls_prob = np.array(
-        [
-            [
-                [[0.3, 0.6, 0.2], [0.4, 0.7, 0.5], [0.1, 0.4, 0.3]],
-                [[0.7, 0.5, 0.3], [0.6, 0.4, 0.8], [0.9, 0.2, 0.5]],
-            ]
-        ],
-        dtype="float32",
-    )
-    np_bbox_pred = np.array(
-        [
-            [
-                [[0.5, 1.0, 0.6], [0.8, 1.2, 2.0], [0.9, 1.0, 0.8]],
-                [[0.5, 1.0, 0.7], [0.8, 1.2, 1.6], [2.1, 1.5, 0.7]],
-                [[1.0, 0.5, 0.7], [1.5, 0.9, 1.6], [1.4, 1.5, 0.8]],
-                [[1.0, 0.5, 0.6], [1.5, 0.9, 2.0], [1.8, 1.0, 0.9]],
-            ]
-        ],
-        dtype="float32",
-    )
-    np_im_info = np.array([[48.0, 48.0, 1.0]], dtype="float32")
-    np_out = np.array(
-        [
-            [0.0, 0.0, 2.8451548, 28.38012, 18.154846],
-            [0.0, 0.0, 15.354933, 41.96971, 41.245064],
-            [0.0, 18.019852, 1.0538368, 51.98015, 25.946163],
-            [0.0, 27.320923, -1.266357, 55.0, 24.666357],
-        ],
-        dtype="float32",
-    )
-
-    verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs)
-
-    np_out = np.array(
-        [
-            [0.0, -5.25, -2.5, 21.75, 19.0],
-            [0.0, 11.25, -2.0, 37.25, 18.5],
-            [0.0, 26.849998, -2.3000002, 53.45, 18.6],
-            [0.0, -4.95, 13.799999, 22.25, 35.5],
-        ],
-        dtype="float32",
-    )
-    attrs["iou_loss"] = True
-    verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs)
-
-
-def test_yolo_reorg_infer_shape():
-    def verify_yolo_reorg(shape, stride, out_shape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        z = relay.vision.yolo_reorg(x, stride=stride)
-        zz = run_infer_type(z)
-        assert "stride=" in z.astext()
-        assert zz.checked_type == relay.ty.TensorType(out_shape, "float32")
-
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    idxd = tvm.tir.indexdiv
-    verify_yolo_reorg((n, c, 20, 20), 10, (n, c * 10 * 10, 2, 2))
-    verify_yolo_reorg((n, c, h, w), 2, (n, c * 2 * 2, idxd(h, 2), idxd(w, 2)))
-
-
-@tvm.testing.uses_gpu
-def test_yolo_reorg(executor_kind):
-    def verify_yolo_reorg(shape, stride):
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        ref_res = tvm.topi.testing.reorg_python(x_data, stride)
-
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        z = relay.vision.yolo_reorg(x, stride=stride)
-        zz = run_infer_type(z)
-        assert "stride=" in z.astext()
-        assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32")
-
-        func = relay.Function([x], z)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    verify_yolo_reorg((1, 100, 20, 20), 10)
-    verify_yolo_reorg((1, 4, 6, 6), 2)
-
-
-class TestDeformableConv2D:
-    batch, in_channel, size, out_channel, deformable_groups = tvm.testing.parameters(
-        (1, 4, 16, 4, 4),
-        (2, 4, 16, 4, 1),
-    )
-    kernel_size = tvm.testing.parameter((3, 3))
-    groups = tvm.testing.parameter(1, 2)
-    layout = tvm.testing.parameter("NCHW", "NHWC")
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.fixture
-    def data_shape(self, layout, batch, in_channel, size):
-        if layout == "NCHW":
-            return (batch, in_channel, size, size)
-        elif layout == "NHWC":
-            return (batch, size, size, in_channel)
-
-    @tvm.testing.fixture
-    def kernel_shape(self, layout, in_channel, out_channel, groups, kernel_size):
-        if layout == "NCHW":
-            return (out_channel, in_channel // groups, kernel_size[0], kernel_size[1])
-        elif layout == "NHWC":
-            return (kernel_size[0], kernel_size[1], in_channel // groups, out_channel)
-
-    @tvm.testing.fixture
-    def out_shape(self, layout, batch, out_channel, size):
-        if layout == "NCHW":
-            return (batch, out_channel, size, size)
-        elif layout == "NHWC":
-            return (batch, size, size, out_channel)
-
-    @tvm.testing.fixture
-    def offset_shape(self, layout, batch, kernel_size, deformable_groups, out_shape):
-        if layout == "NCHW":
-            return (
-                batch,
-                2 * kernel_size[0] * kernel_size[1] * deformable_groups,
-                out_shape[2],
-                out_shape[3],
-            )
-        elif layout == "NHWC":
-            return (
-                batch,
-                out_shape[1],
-                out_shape[2],
-                2 * kernel_size[0] * kernel_size[1] * deformable_groups,
-            )
-
-    @tvm.testing.fixture
-    def kernel_layout(self, layout):
-        return {"NCHW": "OIHW", "NHWC": "HWIO"}[layout]
-
-    @tvm.testing.fixture
-    def relay_setup(
-        self,
-        dtype,
-        data_shape,
-        layout,
-        kernel_layout,
-        kernel_size,
-        deformable_groups,
-        groups,
-        out_channel,
-    ):
-        data = relay.var("data", shape=data_shape, dtype=dtype)
-        offset = relay.var("offset", dtype=dtype)
-        kernel = relay.var("kernel", dtype=dtype)
-        expr = relay.nn.deformable_conv2d(
-            data,
-            offset,
-            kernel,
-            strides=(1, 1),
-            padding=(1, 1),
-            dilation=(1, 1),
-            data_layout=layout,
-            kernel_layout=kernel_layout,
-            kernel_size=kernel_size,
-            deformable_groups=deformable_groups,
-            groups=groups,
-            channels=out_channel,
-        )
-        func = relay.Function([data, offset, kernel], expr)
-        return expr, func
-
-    def test_infer_type(self, relay_setup, out_shape, offset_shape, kernel_shape):
-        expr, func = relay_setup
-        yy = run_infer_type(expr)
-        assert yy.checked_type == relay.TensorType(out_shape), yy.checked_type
-        assert yy.args[1].checked_type == relay.TensorType(offset_shape), yy.args[1].checked_type
-        assert yy.args[2].checked_type == relay.TensorType(kernel_shape), yy.args[2].checked_type
-
-    # The reference python implementation only supports groups==1.
-    @pytest.mark.parametrize("groups", [1])
-    def test_run(
-        self,
-        target,
-        dev,
-        dtype,
-        executor_kind,
-        data_shape,
-        offset_shape,
-        kernel_shape,
-        relay_setup,
-        deformable_groups,
-        groups,
-        layout,
-    ):
-        target = tvm.target.Target(target)
-        if layout == "NHWC" and target.kind.name != "llvm":
-            pytest.xfail("Can only run NHWC layout on llvm")
-
-        expr, func = relay_setup
-        data = np.random.uniform(size=data_shape).astype(dtype)
-        offset = np.random.uniform(size=offset_shape).astype(dtype)
-        kernel = np.random.uniform(size=kernel_shape).astype(dtype)
-        if layout == "NCHW":
-            ref_res = tvm.topi.testing.deformable_conv2d_nchw_python(
-                data,
-                offset,
-                kernel,
-                stride=(1, 1),
-                padding=(1, 1),
-                dilation=(1, 1),
-                deformable_groups=deformable_groups,
-                groups=groups,
-            )
-        else:
-            ref_res = tvm.topi.testing.deformable_conv2d_nhwc_python(
-                data,
-                offset,
-                kernel,
-                stride=(1, 1),
-                padding=(1, 1),
-                dilation=(1, 1),
-                deformable_groups=deformable_groups,
-                groups=groups,
-            )
-
-        op_res1 = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            data, offset, kernel
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_depth_to_space(executor_kind):
-    def verify_depth_to_space(dshape, block_size, layout, mode):
-        if layout == "NHWC":
-            out_shape = [
-                dshape[0],
-                dshape[1] * block_size,
-                dshape[2] * block_size,
-                dshape[3] / (block_size * block_size),
-            ]
-        else:
-            out_shape = [
-                dshape[0],
-                dshape[1] / (block_size * block_size),
-                dshape[2] * block_size,
-                dshape[3] * block_size,
-            ]
-
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        if layout == "NHWC":
-            x_data = np.transpose(x_data, axes=[0, 3, 1, 2])
-        ref_res = tvm.topi.testing.depth_to_space_python(x_data, block_size, mode=mode)
-        if layout == "NHWC":
-            x_data = np.transpose(x_data, axes=[0, 2, 3, 1])
-            ref_res = np.transpose(ref_res, axes=[0, 2, 3, 1])
-
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.nn.depth_to_space(x, block_size, layout, mode)
-        assert "block_size=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
-
-    for layout in ["NHWC", "NCHW"]:
-        for mode in ["DCR", "CDR"]:
-            verify_depth_to_space((1, 4, 4, 4), 2, layout, mode)
-
-
-@tvm.testing.uses_gpu
-def test_space_to_depth(executor_kind):
-    def verify_space_to_depth(dshape, block_size, layout):
-        if layout == "NHWC":
-            out_shape = [
-                dshape[0],
-                dshape[1] / block_size,
-                dshape[2] / block_size,
-                dshape[3] * (block_size * block_size),
-            ]
-        else:
-            out_shape = [
-                dshape[0],
-                dshape[1] * (block_size * block_size),
-                dshape[2] / block_size,
-                dshape[3] / block_size,
-            ]
-
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        if layout == "NHWC":
-            x_data = np.transpose(x_data, axes=[0, 3, 1, 2])
-        ref_res = tvm.topi.testing.space_to_depth_python(x_data, block_size)
-        if layout == "NHWC":
-            x_data = np.transpose(x_data, axes=[0, 2, 3, 1])
-            ref_res = np.transpose(ref_res, axes=[0, 2, 3, 1])
-
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.nn.space_to_depth(x, block_size, layout)
-        assert "block_size=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
-
-    for layout in ["NHWC", "NCHW"]:
-        verify_space_to_depth((1, 4, 4, 4), 2, layout)
-
-
-def test_dilation2d_infer_type():
-    # symbolic in batch dimension
-    n, h, w, c = te.var("n"), 224, 224, 10
-    x = relay.var("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    kc, kh, kw = 10, 8, 8
-    w = relay.var("w", relay.ty.TensorType((kc, kw, kh), "float32"))
-    y = relay.image.dilation2d(
-        x,
-        w,
-        # kernel_size=(3, 3),
-        strides=[1, 1, 1, 1],
-        dilations=[1, 1, 1, 1],
-        padding=[0, 0, 0, 0],
-    )
-    yy = run_infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, 10, 217, 217), "float32")
-
-
-class TestDilation2DRun:
-    data_layout, kernel_layout = tvm.testing.parameters(("NCHW", "IHW"), ("NHWC", "HWI"))
-    dtype = tvm.testing.parameter("float32")
-
-    config = tvm.testing.parameter(
-        dict(
-            image=[[[[0.1], [0.2]], [[0.3], [0.4]]]],
-            kernel=[[[0.4], [0.3]], [[0.1], [0.0]]],
-            out=[[[[0.5]]]],
-        ),
-        dict(
-            image=[[[[0.1], [0.2]], [[0.3], [0.4]]]],
-            kernel=[[[0.4], [0.3]], [[0.1], [0.0]]],
-            out=[[[[0.5], [0.6]], [[0.7], [0.8]]]],
-            padding=[0, 0, 1, 1],
-        ),
-        dict(
-            image=[[[[0.1, 0.2, 0.0], [0.2, 0.3, 0.1]], [[0.3, 0.4, 0.2], [0.4, 0.5, 0.3]]]],
-            kernel=[[[0.4, 0.5, 0.3], [0.3, 0.4, 0.2]], [[0.1, 0.2, 0.0], [0.0, 0.1, -0.1]]],
-            out=[[[[0.5, 0.7, 0.3], [0.6, 0.8, 0.4]], [[0.7, 0.9, 0.5], [0.8, 1.0, 0.6]]]],
-            padding=[0, 0, 1, 1],
-        ),
-        dict(
-            image=[[[[0.1], [0.2]], [[0.3], [0.4]]], [[[0.2], [0.3]], [[0.4], [0.5]]]],
-            kernel=[[[0.4], [0.3]], [[0.1], [0.0]]],
-            out=[[[[0.5], [0.6]], [[0.7], [0.8]]], [[[0.6], [0.7]], [[0.8], [0.9]]]],
-            padding=[0, 0, 1, 1],
-        ),
-        dict(
-            image=[[[[0.1], [0.2]], [[0.3], [0.4]]]],
-            kernel=[[[0.4], [0.3]]],
-            out=[[[[0.5]], [[0.7]]]],
-        ),
-        dict(
-            image=[[[[0.1], [0.2], [0.3]], [[0.4], [0.5], [0.6]], [[0.7], [0.8], [0.9]]]],
-            kernel=[[[0.4], [0.3]], [[0.1], [0.2]]],
-            out=[[[[0.7], [0.8], [0.6]], [[1.0], [1.1], [0.9]], [[0.8], [0.9], [0.9]]]],
-            padding=[1, 1],
-            dilations=[2, 2],
-        ),
-        dict(
-            image=[
-                [
-                    [[0.1], [0.2], [0.3], [0.4]],
-                    [[0.5], [0.6], [0.7], [0.8]],
-                    [[0.9], [1.0], [1.1], [1.2]],
-                ]
-            ],
-            kernel=[[[0.4], [0.3]], [[0.1], [0.2]]],
-            out=[[[[0.8], [1.0]], [[1.2], [1.4]]]],
-            strides=[1, 2],
-        ),
-    )
-
-    @tvm.testing.fixture
-    def test_case(self, config, data_layout, dtype):
-        indata = np.array(config["image"], dtype=dtype)
-        kernel = np.array(config["kernel"], dtype=dtype)
-        out = np.array(config["out"], dtype=dtype)
-
-        if data_layout == "NHWC":
-            pass
-        elif data_layout == "NCHW":
-            indata = indata.transpose([0, 3, 1, 2])
-            kernel = kernel.transpose([2, 0, 1])
-            out = out.transpose([0, 3, 1, 2])
-        else:
-            raise ValueError(f"Unsupported layout '{data_layout}'")
-
-        return indata, kernel, out
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_dilation2d(
-        self,
-        target,
-        dev,
-        test_case,
-        dtype,
-        config,
-        data_layout,
-        kernel_layout,
-    ):
-        strides = config.get("strides", [1, 1])
-        padding = config.get("padding", [0, 0])
-        dilations = config.get("dilations", [1, 1])
-
-        indata, kernel, out = test_case
-
-        x = relay.var("x", shape=indata.shape, dtype=dtype)
-        w = relay.var("w", shape=kernel.shape, dtype=dtype)
-        y = relay.image.dilation2d(
-            x,
-            w,
-            strides=strides,
-            dilations=dilations,
-            padding=padding,
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        func = relay.Function([x, w], y)
-
-        op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-            indata, kernel
-        )
-        tvm.testing.assert_allclose(op_res.numpy(), out, rtol=1e-5, atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_affine_grid(executor_kind):
-    def verify_affine_grid(num_batch, target_shape):
-        dtype = "float32"
-        data_shape = (num_batch, 2, 3)
-        data = relay.var("data", relay.ty.TensorType(data_shape, dtype))
-        y = relay.image.affine_grid(data, target_shape)
-        yy = run_infer_type(y)
-        assert yy.checked_type == relay.ty.TensorType(
-            (num_batch, len(target_shape), *target_shape), dtype
-        )
-
-        func = relay.Function([data], y)
-        data_np = np.random.uniform(size=data_shape).astype(dtype)
-        ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                data_np
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    verify_affine_grid(1, (16, 32))
-    verify_affine_grid(4, (16, 32))
-
-
-@tvm.testing.uses_gpu
-def test_grid_sample(executor_kind):
-    def verify_grid_sample(
-        data_shape, grid_shape, method="bilinear", padding_mode="zeros", align_corners=True
-    ):
-        dtype = "float32"
-        data = relay.var("data", relay.ty.TensorType(data_shape, dtype))
-        grid = relay.var("grid", relay.ty.TensorType(grid_shape, dtype))
-
-        if len(data_shape) == 4:
-            layout = "NCHW"
-            batch, channel, _, _ = data_shape
-            _, _, out_height, out_width = grid_shape
-            tensor_type = relay.TensorType((batch, channel, out_height, out_width), dtype)
-        else:  # len(data_shape) == 5:
-            layout = "NCDHW"
-            batch, channel, _, _, _ = data_shape
-            _, _, out_depth, out_height, out_width = grid_shape
-            tensor_type = relay.TensorType(
-                (batch, channel, out_depth, out_height, out_width), dtype
-            )
-
-        y = relay.image.grid_sample(
-            data,
-            grid,
-            method=method,
-            layout=layout,
-            padding_mode=padding_mode,
-            align_corners=align_corners,
-        )
-        yy = run_infer_type(y)
-        assert yy.checked_type == tensor_type
-        func = relay.Function([data, grid], y)
-
-        data_np = np.random.uniform(size=data_shape).astype(dtype)
-        grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype)
-        ref_res = tvm.topi.testing.grid_sample_python(
-            data_np, grid_np, method, layout, padding_mode, align_corners
-        )
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                data_np, grid_np
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5, atol=1e-5)
-
-    methods = ["nearest", "bilinear", "bicubic"]
-    padding_modes = ["zeros", "border", "reflection"]
-    align_corners = [True, False]
-
-    data_2D_shape = (4, 4, 8, 8)
-    grid_2D_shape = (4, 2, 16, 16)
-    # choosing smaller sizes to be testable on weaker GPUs
-    data_3D_shape = (4, 4, 4, 4, 4)
-    grid_3D_shape = (4, 3, 8, 8, 8)
-
-    for _method in methods:
-        for _padding in padding_modes:
-            for _align in align_corners:
-                verify_grid_sample(data_2D_shape, grid_2D_shape, _method, _padding, _align)
-
-                # 3D "bicubic"(tricubic) is not supported in pytorch
-                if _method != "bicubic":
-                    verify_grid_sample(data_3D_shape, grid_3D_shape, _method, _padding, _align)
-
-
-@tvm.testing.uses_gpu
-def test_space_to_batch_nd(executor_kind):
-    def verify_space_to_batch_nd(dshape, block_shape, paddings):
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        pad_before, pad_after = map(list, zip(*paddings))
-        ref_res = tvm.topi.testing.space_to_batch_nd_python(
-            x_data, block_shape, pad_before, pad_after
-        )
-
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.nn.space_to_batch_nd(x, block_shape, paddings)
-        assert "block_shape=" in z.astext()
-        assert "paddings=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
-
-    verify_space_to_batch_nd([3, 3, 2, 1], [3], [[0, 0]])
-    verify_space_to_batch_nd([2, 2, 4, 1], [2, 2], [[0, 0], [2, 0]])
-
-
-@tvm.testing.uses_gpu
-def test_batch_to_space_nd(executor_kind):
-    def verify_batch_to_space_nd(dshape, block_shape, crops):
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        crop_begin_list, crop_end_list = map(list, zip(*crops))
-        ref_res = tvm.topi.testing.batch_to_space_nd_python(
-            x_data, block_shape, crop_begin_list, crop_end_list
-        )
-
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.nn.batch_to_space_nd(x, block_shape, crops)
-        assert "block_shape=" in z.astext()
-        assert "crops=" in z.astext()
-        zz = run_infer_type(z)
-        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
-        func = relay.Function([x], z)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
-
-    verify_batch_to_space_nd([4, 1, 1, 3], [2, 2], [[0, 0], [0, 0]])
-    verify_batch_to_space_nd([8, 1, 3, 1], [2, 2], [[0, 0], [2, 0]])
-
-
-@tvm.testing.uses_gpu
-def test_all_class_non_max_suppression(executor_kind):
-    def verify_all_class_non_max_suppression(
-        boxes_np,
-        scores_np,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        expected_indices,
-    ):
-        boxes = relay.var("boxes", relay.ty.TensorType(boxes_np.shape, "float32"))
-        scores = relay.var("scores", relay.ty.TensorType(scores_np.shape, "float32"))
-
-        out = relay.vision.all_class_non_max_suppression(
-            boxes,
-            scores,
-            max_output_boxes_per_class,
-            iou_threshold,
-            score_threshold,
-        )
-
-        func = relay.Function([boxes, scores], out.astuple())
-        func = run_infer_type(func)
-
-        for target, dev in tvm.testing.enabled_targets():
-            selected_indices, num_detections = relay.create_executor(
-                executor_kind, device=dev, target=target
-            ).evaluate(func)(boxes_np, scores_np)
-            tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]]
-            np.testing.assert_equal(tvm_res, expected_indices)
-
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 0.3, 0.3],
-                [0.0, 0.0, 0.4, 0.4],
-                [0.0, 0.0, 0.5, 0.5],
-                [0.5, 0.5, 0.9, 0.9],
-                [0.5, 0.5, 1.0, 1.0],
-            ],
-            [
-                [0.0, 0.0, 0.3, 0.3],
-                [0.0, 0.0, 0.4, 0.4],
-                [0.5, 0.5, 0.95, 0.95],
-                [0.5, 0.5, 0.96, 0.96],
-                [0.5, 0.5, 1.0, 1.0],
-            ],
-        ]
-    ).astype("float32")
-
-    scores = np.array(
-        [
-            [[0.1, 0.2, 0.6, 0.3, 0.9], [0.1, 0.2, 0.6, 0.3, 0.9]],
-            [[0.1, 0.2, 0.6, 0.3, 0.9], [0.1, 0.2, 0.6, 0.3, 0.9]],
-        ]
-    ).astype("float32")
-
-    max_output_boxes_per_class = 2
-    iou_threshold = 0.8
-    score_threshold = 0.0
-
-    expected = np.array(
-        [[0, 0, 4], [0, 0, 2], [0, 1, 4], [0, 1, 2], [1, 0, 4], [1, 0, 1], [1, 1, 4], [1, 1, 1]]
-    )
-
-    verify_all_class_non_max_suppression(
-        boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected
-    )
-
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 1.0, 1.0],
-                [0.0, 0.1, 1.0, 1.1],
-                [0.0, -0.1, 1.0, 0.9],
-                [0.0, 10.0, 1.0, 11.0],
-                [0.0, 10.1, 1.0, 11.1],
-                [0.0, 100.0, 1.0, 101.0],
-            ]
-        ]
-    ).astype(np.float32)
-    scores = np.array([[[0.9, 0.75, 0.6, 0.95, 0.5, 0.3]]]).astype(np.float32)
-    max_output_boxes_per_class = 3
-    iou_threshold = 0.5
-    score_threshold = 0.4
-
-    expected = np.array([[0, 0, 3], [0, 0, 0]])
-
-    verify_all_class_non_max_suppression(
-        boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py
deleted file mode 100644
index 47cf73d6915d..000000000000
--- a/tests/python/relay/test_op_level6.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Support level6 operator test cases.
-"""
-import pytest
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.topi.testing import searchsorted_ref
-import tvm.testing
-
-executor_kind = tvm.testing.parameter("graph", "vm")
-
-
-@tvm.testing.uses_gpu
-def test_sort():
-    def verify_sort(shape, axis, is_ascend, is_dyn=False, in_dtype="float32"):
-        if is_dyn:
-            x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), in_dtype))
-        else:
-            x = relay.var("x", relay.TensorType(shape, in_dtype))
-        z = relay.sort(x, axis=axis, is_ascend=is_ascend)
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(size=shape).astype(in_dtype)
-        if is_ascend:
-            ref_res = np.sort(x_data, axis=axis)
-        else:
-            ref_res = -np.sort(-x_data, axis=axis)
-
-        if is_dyn:
-            backend = "vm"
-        else:
-            backend = "graph"
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-
-    for is_dyn in [False, True]:
-        verify_sort((2, 3, 4), axis=0, is_ascend=False, is_dyn=is_dyn)
-        verify_sort((1, 4, 6), axis=1, is_ascend=True, is_dyn=is_dyn)
-        verify_sort((3, 5, 6), axis=-1, is_ascend=False, is_dyn=is_dyn)
-        verify_sort((3, 2000, 6), axis=1, is_ascend=False, is_dyn=is_dyn)
-        verify_sort((1, 122640), axis=1, is_ascend=False, is_dyn=is_dyn)
-        verify_sort((1, 122640), axis=1, is_ascend=False, is_dyn=is_dyn, in_dtype="float16")
-
-
-@tvm.testing.uses_gpu
-def test_argsort():
-    def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False, in_dtype="float32"):
-        if is_dyn:
-            x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), in_dtype))
-        else:
-            x = relay.var("x", relay.TensorType(shape, in_dtype))
-        z = relay.argsort(x, axis=axis, is_ascend=is_ascend, dtype=dtype)
-        func = relay.Function([x], z)
-        x_data = np.random.uniform(size=shape).astype(in_dtype)
-        if is_ascend:
-            ref_res = np.argsort(x_data, axis=axis, kind="stable")
-        else:
-            ref_res = np.argsort(-x_data, axis=axis, kind="stable")
-
-        if is_dyn:
-            backend = "vm"
-        else:
-            backend = "graph"
-        for target, dev in tvm.testing.enabled_targets():
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()(
-                x_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res.astype(dtype), rtol=1e-5)
-
-    for is_dyn in [False, True]:
-        for dtype in ["int32", "int64", "float32", "float64"]:
-            verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-            verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype, is_dyn=is_dyn)
-        dtype = "int32"
-        verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-        verify_argsort((3, 6000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-        verify_argsort((1000, 1, 1), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-        verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-        verify_argsort(
-            (1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn, in_dtype="float16"
-        )
-
-
-@tvm.testing.uses_gpu
-def test_topk(executor_kind):
-    def verify_topk(k, axis, ret_type, is_ascend, dtype, in_dtype="float32"):
-        shape = (20, 100)
-        x = relay.var("x", relay.TensorType(shape, in_dtype))
-        out = relay.topk(x, k, axis, ret_type, is_ascend, dtype)
-        if isinstance(out, relay.expr.TupleWrapper):
-            out = out.astuple()
-        func = relay.Function([x], out)
-        np_data = np.random.uniform(size=shape).astype(in_dtype)
-        if is_ascend:
-            np_indices = np.argsort(np_data, axis=axis, kind="stable")
-        else:
-            np_indices = np.argsort(-np_data, axis=axis, kind="stable")
-        kk = k if k >= 1 else shape[axis]
-        if axis == 0:
-            np_indices = np_indices[:kk, :]
-            np_values = np.zeros(np_indices.shape).astype(in_dtype)
-            for i in range(shape[1]):
-                np_values[:, i] = np_data[np_indices[:, i], i]
-        else:
-            np_indices = np_indices[:, :kk]
-            np_values = np.zeros(np_indices.shape).astype(in_dtype)
-            for i in range(shape[0]):
-                np_values[i, :] = np_data[i, np_indices[i, :]]
-        np_indices = np_indices.astype(dtype)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-                np_data
-            )
-            if ret_type == "both":
-                tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
-                tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
-            elif ret_type == "values":
-                tvm.testing.assert_allclose(op_res.numpy(), np_values)
-            else:
-                tvm.testing.assert_allclose(op_res.numpy(), np_indices)
-
-    np.random.seed(0)
-    for k in [0, 1, 5]:
-        for axis in [0, -1, 1]:
-            for ret_type in ["both", "values", "indices"]:
-                verify_topk(k, axis, ret_type, True, "int64")
-                verify_topk(k, axis, ret_type, False, "float32")
-                verify_topk(k, axis, ret_type, False, "int64", "float16")
-
-
-@tvm.testing.uses_gpu
-def test_searchsorted():
-    def verify_searchsorted(right, dtype):
-        shape = (8, 9, 10)
-        values_shape = shape[:-1] + (10,)
-        sorted_sequence = relay.var("sorted_sequence", relay.TensorType(shape, "float32"))
-        values = relay.var("sorted_sequence", relay.TensorType(values_shape, "float32"))
-        out = relay.searchsorted(sorted_sequence, values, right, dtype)
-        func = relay.Function([sorted_sequence, values], out)
-        sorted_sequence_np = np.sort(np.random.randn(*shape).astype("float32"), axis=-1)
-        values_np = np.random.randn(*values_shape).astype("float32")
-        np_indices = searchsorted_ref(sorted_sequence_np, values_np, right, dtype)
-
-        for target, dev in tvm.testing.enabled_targets():
-            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                sorted_sequence_np, values_np
-            )
-            np.testing.assert_equal(op_res.numpy(), np_indices)
-
-    verify_searchsorted(False, "int32")
-    verify_searchsorted(True, "int64")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_op_qnn_add.py b/tests/python/relay/test_op_qnn_add.py
deleted file mode 100644
index ed2b1723bb8e..000000000000
--- a/tests/python/relay/test_op_qnn_add.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import numpy as np
-from tvm import relay
-
-
-def test_tflite_same_io_qnn_params():
-    data_dtype = "uint8"
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.00784314, "float32"),
-        lhs_zero_point=relay.const(127, "int32"),
-        rhs_scale=relay.const(0.00784314, "float32"),
-        rhs_zero_point=relay.const(127, "int32"),
-        output_scale=relay.const(0.00784314, "float32"),
-        output_zero_point=relay.const(127, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_datas = [
-        np.array((140, 153, 165, 178)).reshape((1, 4)),
-        np.array((25, 153, 178, 216)).reshape((1, 4)),
-        np.array((25, 153, 216, 165)).reshape((1, 4)),
-    ]
-    y_datas = [
-        np.array((204, 178, 165, 140)).reshape((1, 4)),
-        np.array((204, 178, 191, 25)).reshape((1, 4)),
-        np.array((204, 178, 25, 191)).reshape((1, 4)),
-    ]
-    golden_outputs = [
-        np.array((217, 204, 203, 191)).reshape((1, 4)),
-        np.array((102, 204, 242, 114)).reshape((1, 4)),
-        np.array((102, 204, 114, 229)).reshape((1, 4)),
-    ]
-
-    for i in range(0, 3):
-        x_data = x_datas[i]
-        y_data = y_datas[i]
-        golden_output = golden_outputs[i]
-
-        op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-            x_data, y_data
-        )
-        np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_tflite_different_io_qnn_params():
-    data_dtype = "uint8"
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.0156863, "float32"),
-        lhs_zero_point=relay.const(127, "int32"),
-        rhs_scale=relay.const(0.0117647, "float32"),
-        rhs_zero_point=relay.const(85, "int32"),
-        output_scale=relay.const(0.0235294, "float32"),
-        output_zero_point=relay.const(128, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_datas = [
-        np.array((76, 140, 153, 172)).reshape((1, 4)),
-        np.array((133, 140, 146, 153)).reshape((1, 4)),
-        np.array((76, 140, 172, 146)).reshape((1, 4)),
-    ]
-    y_datas = [
-        np.array((136, 119, 128, 17)).reshape((1, 4)),
-        np.array((136, 119, 111, 94)).reshape((1, 4)),
-        np.array((136, 119, 17, 128)).reshape((1, 4)),
-    ]
-    golden_outputs = [
-        np.array((120, 154, 167, 124)).reshape((1, 4)),
-        np.array((158, 154, 154, 150)).reshape((1, 4)),
-        np.array((120, 154, 124, 163)).reshape((1, 4)),
-    ]
-
-    for i in range(0, 3):
-        x_data = x_datas[i]
-        y_data = y_datas[i]
-        golden_output = golden_outputs[i]
-
-        op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-            x_data, y_data
-        )
-        np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_saturation():
-    # Same params
-    data_dtype = "uint8"
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.125, "float32"),
-        lhs_zero_point=relay.const(0, "int32"),
-        rhs_scale=relay.const(0.125, "float32"),
-        rhs_zero_point=relay.const(0, "int32"),
-        output_scale=relay.const(0.125, "float32"),
-        output_zero_point=relay.const(0, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-    mod = relay.transform.InferType()(mod)
-
-    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
-    y_data = np.array((255, 255, 128, 0)).reshape((1, 4))
-    golden_output = np.array((255, 255, 129, 0)).reshape((1, 4))
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-    # Same params, different scale
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.125, "float32"),
-        lhs_zero_point=relay.const(0, "int32"),
-        rhs_scale=relay.const(0.125, "float32"),
-        rhs_zero_point=relay.const(0, "int32"),
-        output_scale=relay.const(0.25, "float32"),
-        output_zero_point=relay.const(0, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
-    y_data = np.array((255, 255, 127, 0)).reshape((1, 4))
-    golden_output = np.array((255, 129, 65, 0)).reshape((1, 4))
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-    # Same io params, different output scale
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.125, "float32"),
-        lhs_zero_point=relay.const(0, "int32"),
-        rhs_scale=relay.const(0.125, "float32"),
-        rhs_zero_point=relay.const(0, "int32"),
-        output_scale=relay.const(0.25, "float32"),
-        output_zero_point=relay.const(0, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
-    y_data = np.array((255, 255, 127, 0)).reshape((1, 4))
-    golden_output = np.array((255, 129, 65, 0)).reshape((1, 4))
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-    # All params different
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.5, "float32"),
-        lhs_zero_point=relay.const(0, "int32"),
-        rhs_scale=relay.const(0.25, "float32"),
-        rhs_zero_point=relay.const(0, "int32"),
-        output_scale=relay.const(0.125, "float32"),
-        output_zero_point=relay.const(0, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 0, 1, 0)).reshape((1, 4))
-    y_data = np.array((0, 128, 64, 0)).reshape((1, 4))
-    golden_output = np.array((255, 255, 132, 0)).reshape((1, 4))
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_ignore_channel_axis():
-    data_dtype = "uint8"
-
-    x = relay.var("x", shape=(4,), dtype=data_dtype)
-    y = relay.var("y", shape=(4,), dtype=data_dtype)
-    z = relay.qnn.add(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(0.00784314, "float32"),
-        lhs_zero_point=relay.const(127, "int32"),
-        rhs_scale=relay.const(0.00784314, "float32"),
-        rhs_zero_point=relay.const(127, "int32"),
-        output_scale=relay.const(0.00784314, "float32"),
-        output_zero_point=relay.const(127, "int32"),
-        lhs_axis=1,
-        rhs_axis=1,
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-
-
-if __name__ == "__main__":
-    test_tflite_same_io_qnn_params()
-    test_tflite_different_io_qnn_params()
-    test_saturation()
-    test_ignore_channel_axis()
diff --git a/tests/python/relay/test_op_qnn_batch_matmul.py b/tests/python/relay/test_op_qnn_batch_matmul.py
deleted file mode 100644
index 278b6f725399..000000000000
--- a/tests/python/relay/test_op_qnn_batch_matmul.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-# We use llvm target for testing functionality. `llvm` points to an older Intel
-# generation machine, that legalizes to a simple lowering. Therefore, the
-# legalization is overwritten such that it can be skipped and we use the
-# QNNCanonicalizeOps lowering for the testing.
-def legalize_qnn_batch_matmul(attrs, inputs, types):
-    return None
-
-
-def make_requantize_params(input_scale, output_scale, output_zero_point, out_dtype):
-    config = {
-        "input_scale": input_scale,
-        "output_scale": output_scale,
-        "output_zero_point": output_zero_point,
-        "out_dtype": out_dtype,
-    }
-    return config
-
-
-def make_configuration(
-    quantized_x,
-    quantized_y,
-    dtype,
-    x_shape,
-    y_shape,
-    x_zero_point,
-    y_zero_point,
-    x_scale,
-    y_scale,
-    output,
-    out_dtype="int32",
-    requantize=None,
-):
-    config = {
-        "quantized_x": quantized_x,
-        "quantized_y": quantized_y,
-        "dtype": dtype,
-        "x_shape": x_shape,
-        "y_shape": y_shape,
-        "x_zero_point": x_zero_point,
-        "y_zero_point": y_zero_point,
-        "x_scale": x_scale,
-        "y_scale": y_scale,
-        "output": output,
-        "out_dtype": out_dtype,
-        "requantize": requantize,
-    }
-    return config
-
-
-def make_int_configuration(
-    xzero_point_zero=True,
-    yzero_point_zero=True,
-    requantize_output=False,
-    per_channel=False,
-    batch_size=1,
-):
-    x_shape, y_shape, output_shape = (batch_size, 4, 5), (batch_size, 3, 5), (batch_size, 4, 3)
-    if xzero_point_zero == True:
-        x_zero_point = 0
-    else:
-        x_zero_point = -123
-
-    if yzero_point_zero == True:
-        y_zero_point = 0
-    else:
-        y_zero_point = -123
-
-    in_dtype = "int8"
-    out_dtype = "int32" if not requantize_output else "int8"
-
-    quantized_x_np = (
-        np.array(
-            [
-                1,
-                3,
-                5,
-                7,
-                9,  # sum = 25
-                11,
-                13,
-                15,
-                -19,
-                -21,  # sum = -1
-                1,
-                3,
-                5,
-                7,
-                9,  # sum = 25
-                11,
-                13,
-                -17,
-                17,
-                -21,
-            ]
-        )[  # sum = 3
-            np.newaxis, np.newaxis, :
-        ]
-        .repeat(batch_size, axis=1)
-        .astype(in_dtype)
-        .reshape(x_shape)
-    )
-    quantized_y_np = (
-        np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9])[np.newaxis, np.newaxis, :]
-        .repeat(batch_size, axis=1)
-        .astype(in_dtype)
-        .reshape(y_shape)
-    )
-    x_scale = 0.5
-    y_scale = 0.5
-    output_scale = 2.0
-
-    if requantize_output:
-        assert xzero_point_zero is True
-        assert yzero_point_zero is True
-        output = np.array([20, 51, 20, -26, -27, -26, 20, 51, 20, -14, -10, -14])
-    elif xzero_point_zero is False and yzero_point_zero is False:
-        output = np.array(
-            [81960, 88360, 81960, 78400, 84540, 78400, 81960, 88360, 81960, 78984, 85164, 78984]
-        )
-    elif xzero_point_zero is True and yzero_point_zero is False:
-        output = np.array([3240, 3490, 3240, -320, -330, -320, 3240, 3490, 3240, 264, 294, 264])
-    elif xzero_point_zero is False and yzero_point_zero is True:
-        output = np.array([3240, 9640, 3240, 2878, 9018, 2878, 3240, 9640, 3240, 2970, 9150, 2970])
-    else:
-        output = np.array([165, 415, 165, -197, -207, -197, 165, 415, 165, -105, -75, -105])
-
-    requant_params = (
-        make_requantize_params(x_scale * y_scale, output_scale, -1, "int8")
-        if requantize_output
-        else None
-    )
-    # Outputs are for batch size 1, make batch size n version
-    output = (
-        output[np.newaxis, np.newaxis, :]
-        .repeat(batch_size, axis=1)
-        .astype(out_dtype)
-        .reshape(output_shape)
-    )
-    return make_configuration(
-        quantized_x=quantized_x_np,
-        quantized_y=quantized_y_np,
-        dtype=in_dtype,
-        x_shape=x_shape,
-        y_shape=y_shape,
-        x_zero_point=x_zero_point,
-        y_zero_point=y_zero_point,
-        x_scale=x_scale,
-        y_scale=y_scale,
-        output=output,
-        requantize=requant_params,
-    )
-
-
-def qnn_batch_matmul_driver(test_configuration):
-    in_dtype = test_configuration["dtype"]
-    out_dtype = test_configuration["out_dtype"]
-    quantized_x_name = "quantized_x"
-    quantized_y_name = "quantized_y"
-    expected_out_dtype = test_configuration["out_dtype"]
-    quantized_x = relay.var(quantized_x_name, shape=test_configuration["x_shape"], dtype=in_dtype)
-    quantized_y = relay.var(quantized_y_name, shape=test_configuration["y_shape"], dtype=in_dtype)
-    mod = relay.qnn.batch_matmul(
-        quantized_x,
-        quantized_y,
-        relay.const(test_configuration["x_zero_point"], "int32"),
-        relay.const(test_configuration["y_zero_point"], "int32"),
-        relay.const(test_configuration["x_scale"], "float32"),
-        relay.const(test_configuration["y_scale"], "float32"),
-    )
-    if test_configuration["requantize"] is not None:
-        requantize_config = test_configuration["requantize"]
-        mod = relay.qnn.requantize(
-            mod,
-            input_scale=relay.const(requantize_config["input_scale"], "float32"),
-            input_zero_point=relay.const(0, "int32"),
-            output_scale=relay.const(requantize_config["output_scale"], "float32"),
-            output_zero_point=relay.const(requantize_config["output_zero_point"], "int32"),
-            out_dtype=requantize_config["out_dtype"],
-        )
-        expected_out_dtype = requantize_config["out_dtype"]
-
-    mod = relay.Function(relay.analysis.free_vars(mod), mod)
-    mod = tvm.IRModule.from_expr(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    with tvm.transform.PassContext(opt_level=2):
-        graph, lib, params = relay.build(mod, "llvm", params=None)
-        mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-        mod.set_input(quantized_x_name, test_configuration[quantized_x_name])
-        mod.set_input(quantized_y_name, test_configuration[quantized_y_name])
-        mod.set_input(**params)
-        mod.run()
-        res = mod.get_output(0).numpy()
-        np.testing.assert_equal(res, test_configuration["output"])
-        assert res.dtype == expected_out_dtype
-
-
-def test_qnn_batch_matmul_xzp0_yzp0():
-    with TempOpAttr("qnn.batch_matmul", "FTVMQnnLegalize", legalize_qnn_batch_matmul):
-        for batch_size in [1, 4, 7]:
-            int32_output_params = make_int_configuration(
-                xzero_point_zero=True, yzero_point_zero=True, batch_size=batch_size
-            )
-            qnn_batch_matmul_driver(int32_output_params)
-
-
-def test_qnn_batch_matmul_xzp0():
-    with TempOpAttr("qnn.batch_matmul", "FTVMQnnLegalize", legalize_qnn_batch_matmul):
-        for batch_size in [1, 4, 7]:
-            int32_output_params = make_int_configuration(
-                xzero_point_zero=True, yzero_point_zero=False, batch_size=batch_size
-            )
-            qnn_batch_matmul_driver(int32_output_params)
-
-
-def test_qnn_batch_matmul_yzp0():
-    with TempOpAttr("qnn.batch_matmul", "FTVMQnnLegalize", legalize_qnn_batch_matmul):
-
-        for batch_size in [1, 4, 7]:
-            int32_output_params = make_int_configuration(
-                xzero_point_zero=False, yzero_point_zero=True, batch_size=batch_size
-            )
-            qnn_batch_matmul_driver(int32_output_params)
-
-
-def test_qnn_batch_matmul():
-    with TempOpAttr("qnn.batch_matmul", "FTVMQnnLegalize", legalize_qnn_batch_matmul):
-        for batch_size in [1, 4, 7]:
-
-            int32_output_params = make_int_configuration(
-                xzero_point_zero=False, yzero_point_zero=False, batch_size=batch_size
-            )
-            qnn_batch_matmul_driver(int32_output_params)
-
-
-def test_qnn_batch_matmul_with_requantized_output():
-    with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_batch_matmul):
-        for batch_size in [1, 4, 7]:
-            int8_requantized_output_params = make_int_configuration(
-                requantize_output=True, batch_size=batch_size
-            )
-            qnn_batch_matmul_driver(int8_requantized_output_params)
-
-
-if __name__ == "__main__":
-    test_qnn_batch_matmul_xzp0_yzp0()
-    test_qnn_batch_matmul_xzp0()
-    test_qnn_batch_matmul_yzp0()
-    test_qnn_batch_matmul()
-    test_qnn_batch_matmul_with_requantized_output()
diff --git a/tests/python/relay/test_op_qnn_concatenate.py b/tests/python/relay/test_op_qnn_concatenate.py
deleted file mode 100644
index 7ad6318ae4e6..000000000000
--- a/tests/python/relay/test_op_qnn_concatenate.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-import tvm.topi.testing
-
-
-def test_same_io_qnn_params():
-    data_dtype = "int32"
-    axis = 0
-    x_data = np.arange(-32, 32, 1).reshape(1, 64).astype(data_dtype)
-    y_data = np.arange(-64, 64, 2).reshape(1, 64).astype(data_dtype)
-    x_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    y_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    zero = relay.const(0, "int32")
-
-    x = relay.var("x", shape=(1, 64), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 64), dtype=data_dtype)
-    z = relay.qnn.concatenate(
-        (x, y),
-        input_scales=(x_scale, y_scale),
-        input_zero_points=(zero, zero),
-        output_scale=y_scale,
-        output_zero_point=zero,
-        axis=axis,
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    golden_output = np.concatenate((x_data, y_data), axis=axis)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_different_io_qnn_params():
-    data_dtype = "int32"
-    axis = 0
-    x_data = np.arange(-32, 32, 1).reshape(1, 64).astype(data_dtype)
-    y_data = np.arange(-64, 64, 2).reshape(1, 64).astype(data_dtype)
-
-    x_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    y_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    x_zero_point = relay.const(3, "int32")
-    y_zero_point = relay.const(4, "int32")
-
-    x = relay.var("x", shape=(1, 64), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 64), dtype=data_dtype)
-    z = relay.qnn.concatenate(
-        (x, y),
-        input_scales=(x_scale, y_scale),
-        input_zero_points=(x_zero_point, y_zero_point),
-        output_scale=y_scale,
-        output_zero_point=relay.const(1, "int32"),
-        axis=axis,
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    golden_output = np.concatenate((x_data - 2, y_data - 3), axis=axis)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_few_same_io_qnn_params():
-    data_dtype = "int32"
-    axis = 0
-    x_data = np.arange(-32, 32, 1).reshape(1, 64).astype(data_dtype)
-    y_data = np.arange(-64, 64, 2).reshape(1, 64).astype(data_dtype)
-
-    x_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    y_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    x_zero_point = relay.const(0, "int32")
-    y_zero_point = relay.const(1, "int32")
-
-    x = relay.var("x", shape=(1, 64), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 64), dtype=data_dtype)
-    z = relay.qnn.concatenate(
-        (x, y),
-        input_scales=(x_scale, y_scale),
-        input_zero_points=(x_zero_point, y_zero_point),
-        output_scale=y_scale,
-        output_zero_point=relay.const(1, "int32"),
-        axis=axis,
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    golden_output = np.concatenate((x_data + 1, y_data), axis=axis)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_same_i_qnn_params():
-    data_dtype = "int32"
-    axis = 0
-    x_data = np.arange(-32, 32, 1).reshape(1, 64).astype(data_dtype)
-    y_data = np.arange(-64, 64, 2).reshape(1, 64).astype(data_dtype)
-
-    x_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    y_scale = relay.const((62 + 64) / (np.power(2, 32) - 1.0), "float32")
-    x_zero_point = relay.const(0, "int32")
-    y_zero_point = relay.const(0, "int32")
-
-    x = relay.var("x", shape=(1, 64), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 64), dtype=data_dtype)
-    z = relay.qnn.concatenate(
-        (x, y),
-        input_scales=(x_scale, y_scale),
-        input_zero_points=(x_zero_point, y_zero_point),
-        output_scale=y_scale,
-        output_zero_point=relay.const(1, "int32"),
-        axis=axis,
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    golden_output = np.concatenate((x_data + 1, y_data + 1), axis=axis)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_call_input():
-    # This tests the case where the input to concatenate is not explicitly a
-    # tuple node but is instead a call node.
-    x_data = np.ones(shape=(64,)).astype("uint8")
-
-    x = relay.var("x", shape=(64,), dtype="uint8")
-    x_scale = relay.const(1, "float32")
-    y_scale = relay.const(1, "float32")
-    x_zero_point = relay.const(0, "int32")
-    y_zero_point = relay.const(0, "int32")
-
-    tup = relay.split(x, 2, axis=0)
-    z = relay.qnn.concatenate(
-        tup,
-        input_scales=(x_scale, y_scale),
-        input_zero_points=(x_zero_point, y_zero_point),
-        output_scale=y_scale,
-        output_zero_point=relay.const(0, "int32"),
-        axis=0,
-    )
-    func = relay.Function([x], z)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(x_data)
-    np.testing.assert_equal(op_res.numpy(), x_data)
-
-
-if __name__ == "__main__":
-    test_call_input()
-    test_same_io_qnn_params()
-    test_different_io_qnn_params()
-    test_few_same_io_qnn_params()
-    test_same_i_qnn_params()
diff --git a/tests/python/relay/test_op_qnn_conv2_transpose.py b/tests/python/relay/test_op_qnn_conv2_transpose.py
deleted file mode 100644
index b226d0a33a18..000000000000
--- a/tests/python/relay/test_op_qnn_conv2_transpose.py
+++ /dev/null
@@ -1,715 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import tvm
-from tvm import relay, te
-from tvm.contrib import graph_executor
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-
-def get_ref_func(
-    data,
-    kernel,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    padding,
-    strides,
-    dilation,
-    data_layout,
-    kernel_layout,
-    out_dtype,
-    groups,
-    channels=None,
-):
-    casted_data = relay.op.cast(data, "int32")
-    casted_kernel = relay.op.cast(kernel, "int32")
-    shifted_data = relay.op.subtract(casted_data, relay.const(input_zero_point, "int32"))
-    shifted_kernel = relay.op.subtract(casted_kernel, relay.const(kernel_zero_point, "int32"))
-    func = relay.op.nn.conv2d_transpose(
-        shifted_data,
-        shifted_kernel,
-        padding=padding,
-        strides=strides,
-        dilation=dilation,
-        groups=groups,
-        channels=channels,
-        kernel_size=kernel_size,
-        out_dtype=out_dtype,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-
-    func = relay.Function(relay.analysis.free_vars(func), func)
-    return func
-
-
-def get_qnn_func(
-    data,
-    kernel,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    padding,
-    strides,
-    dilation,
-    data_layout,
-    kernel_layout,
-    out_dtype,
-    channels,
-    groups,
-):
-    func = relay.qnn.conv2d_transpose(
-        data,
-        kernel,
-        input_zero_point=relay.const(input_zero_point, "int32"),
-        kernel_zero_point=relay.const(kernel_zero_point, "int32"),
-        input_scale=relay.const(input_scale, "float32"),
-        kernel_scale=relay.const(kernel_scale, "float32"),
-        kernel_size=kernel_size,
-        strides=strides,
-        dilation=dilation,
-        padding=padding,
-        out_dtype=out_dtype,
-        groups=groups,
-        channels=channels,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-
-    mod = relay.Function(relay.analysis.free_vars(func), func)
-    mod = tvm.IRModule.from_expr(mod)
-    return mod
-
-
-def get_funcs(
-    data_shape,
-    data_dtype,
-    kernel_shape,
-    kernel_dtype,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    padding,
-    strides,
-    dilation,
-    data_layout,
-    kernel_layout,
-    out_dtype,
-    groups=1,
-    channels=None,
-):
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype)
-
-    ref_func = get_ref_func(
-        data,
-        kernel,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        kernel_size,
-        padding,
-        strides,
-        dilation,
-        data_layout,
-        kernel_layout,
-        out_dtype,
-        groups,
-        channels,
-    )
-    ref_func = run_infer_type(ref_func)
-    ref_func = tvm.IRModule.from_expr(ref_func)
-    qnn_func = get_qnn_func(
-        data,
-        kernel,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        kernel_size,
-        padding,
-        strides,
-        dilation,
-        data_layout,
-        kernel_layout,
-        out_dtype,
-        channels,
-        groups,
-    )
-
-    return (ref_func, qnn_func)
-
-
-def verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype):
-    def get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype):
-        # Keeping inputs multiple of 4 because of a bug in Average Pool2d
-        # https://discuss.tvm.apache.org/t/pool2d-gives-bad-output-for-integer-inputs/3377
-        low = -128
-        high = 127
-        if data_dtype == "uint8":
-            low = 0
-            high = 255
-        golden_data = np.random.randint(low=low, high=high, size=data_shape).astype(data_dtype)
-        low = -128
-        high = 127
-        if kernel_dtype == "uint8":
-            low = 0
-            high = 255
-        golden_weight = np.random.randint(low=low, high=high, size=kernel_shape).astype(
-            kernel_dtype
-        )
-        return (golden_data, golden_weight)
-
-    def get_output(func, golden_inputs):
-        with tvm.transform.PassContext(opt_level=2):
-            golden_data, golden_weight = golden_inputs
-            params = {"kernel": golden_weight}
-            libs = relay.build(func, "llvm", params=params)
-            mod = graph_executor.create(libs.graph_json, libs.lib, device=tvm.cpu(0))
-            mod.set_input("data", golden_data)
-            mod.set_input(**libs.params)
-            mod.run()
-            res = mod.get_output(0).numpy()
-            return res
-
-    golden_inputs = get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype)
-    golden_output = get_output(ref_func, golden_inputs)
-    qnn_output = get_output(qnn_func, golden_inputs)
-    np.testing.assert_equal(qnn_output, golden_output)
-
-
-def test_no_zero_point():
-    # uint8 input
-    data_shape = (2, 1, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (1, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=0,
-        kernel_zero_point=0,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # int8 input
-    data_shape = (2, 1, 2, 4)
-    data_dtype = "int8"
-    kernel_shape = (1, 3, 2, 2)
-    kernel_dtype = "int8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=0,
-        kernel_zero_point=0,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_kernel_zero_point():
-    # uint8 input
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=0,
-        kernel_zero_point=1,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # int8 input
-    data_shape = (2, 1, 2, 4)
-    data_dtype = "int8"
-    kernel_shape = (1, 3, 2, 2)
-    kernel_dtype = "int8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=0,
-        kernel_zero_point=5,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_input_zero_point():
-    # uint8 input
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=0,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # int8 input
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "int8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "int8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=0,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_both_zero_point():
-    # uint8 input
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # int8 input
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "int8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "int8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_different_dtype():
-    # uint8 input and int8 weight
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "int8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-        channels=kernel_shape[1],
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # int8 input and uint8 weight
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "int8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-        channels=kernel_shape[1],
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_layout():
-    # uint8 input
-    data_shape = (2, 2, 4, 4)  # NHWC
-    data_dtype = "uint8"
-    kernel_shape = (2, 2, 3, 4)  # HWOI
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    data_shape = (2, 2, 4, 3)  # NHWC
-    data_dtype = "uint8"
-    kernel_shape = (2, 2, 1, 3)  # HWOI
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=5,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_padding():
-    # uint8 input
-    data_shape = (1, 4, 2, 2)
-    data_dtype = "uint8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=8,
-        kernel_zero_point=5,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(1, 1),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # Try different layout
-    data_shape = (2, 2, 4, 4)  # NHWC
-    data_dtype = "uint8"
-    kernel_shape = (2, 2, 3, 4)  # HWOI
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=8,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(1, 1),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-    # Try asymmetric padding
-    data_shape = (2, 8, 6, 4)  # NHWC
-    data_dtype = "uint8"
-    kernel_shape = (2, 2, 3, 4)  # HWOI
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=8,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(1, 1, 2, 2),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_const_folding():
-    data_shape = (2, 4, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (4, 3, 2, 2)
-    kernel_dtype = "uint8"
-
-    golden_weight = np.random.randint(low=0, high=255, size=kernel_shape).astype(kernel_dtype)
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    kernel = relay.const(golden_weight)
-    qnn_func = get_qnn_func(
-        data,
-        kernel,
-        input_zero_point=8,
-        kernel_zero_point=3,
-        kernel_size=(2, 2),
-        input_scale=1.0,
-        kernel_scale=1.0,
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-        channels=kernel_shape[1],
-        groups=1,
-    )
-    folded_mod = transform.FoldConstant()(qnn_func)
-    folded_func = folded_mod["main"]
-    assert "reshape" not in folded_func.astext()
-
-
-def test_broadcast_layout():
-    # Test broadcast support for NHWC layout.
-    data_shape = (1, 229, 229, 3)  # NHWC
-    data_dtype = "uint8"
-    kernel_shape = (7, 7, 64, 3)  # HWOI
-    kernel_dtype = "int8"
-    _, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=8,
-        kernel_zero_point=3,
-        input_scale=1.0,
-        kernel_scale=1.0,
-        kernel_size=(7, 7),
-        padding=(1, 1),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-        out_dtype="int32",
-    )
-    func = qnn_func["main"].body
-    bias = relay.var("bias", shape=(64,), dtype="int32")
-    bias2 = relay.var("bias2", shape=(1, 233, 233, 64), dtype="int32")
-
-    # Check broadcast support on both lhs and rhs
-    func = relay.add(func, bias2)
-    func = relay.add(bias2, func)
-    func = relay.add(bias, func)
-    func = relay.add(func, bias)
-    func = relay.Function(relay.analysis.free_vars(func), func)
-    mod = tvm.IRModule.from_expr(func)
-    with tvm.transform.PassContext(opt_level=3):
-        libs = relay.build(mod, "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512")
-
-
-def test_non_scalar_input_scale_zp():
-    data_shape = (2, 1, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (1, 3, 2, 2)
-    kernel_dtype = "uint8"
-    ref_func, qnn_func = get_funcs(
-        data_shape=data_shape,
-        data_dtype=data_dtype,
-        kernel_shape=kernel_shape,
-        kernel_dtype=kernel_dtype,
-        input_zero_point=[0],
-        kernel_zero_point=0,
-        input_scale=[1.0],
-        kernel_scale=1.0,
-        kernel_size=(2, 2),
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-    verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_per_channel_kernel_scale():
-    data_shape = (2, 1, 2, 4)
-    data_dtype = "uint8"
-    kernel_shape = (1, 3, 2, 2)
-    kernel_dtype = "uint8"
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype)
-    kernel_scales = [2, 2, 2]
-    kernel_scales = relay.const(np.array(kernel_scales).astype("float32"))
-    func = relay.qnn.conv2d_transpose(
-        data,
-        kernel,
-        input_zero_point=relay.const(0, "int32"),
-        kernel_zero_point=relay.const(0, "int32"),
-        input_scale=relay.const(2.0, "float32"),
-        kernel_scale=kernel_scales,
-        kernel_size=(2, 2),
-        channels=kernel_shape[0],
-        padding=(0, 0),
-        strides=(1, 1),
-        dilation=(1, 1),
-        data_layout="NCHW",
-        kernel_layout="IOHW",
-        out_dtype="int32",
-    )
-
-    mod = relay.Function(relay.analysis.free_vars(func), func)
-    mod = tvm.IRModule.from_expr(mod)
-
-
-if __name__ == "__main__":
-    test_no_zero_point()
-    test_input_zero_point()
-    test_kernel_zero_point()
-    test_both_zero_point()
-    test_different_dtype()
-    test_layout()
-    test_padding()
-    test_const_folding()
-    test_broadcast_layout()
-    test_per_channel_kernel_scale()
diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py
deleted file mode 100644
index 7bf1a3dbaf54..000000000000
--- a/tests/python/relay/test_op_qnn_conv2d.py
+++ /dev/null
@@ -1,1124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import platform
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_infer_type
-from tvm.contrib import graph_executor
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-# We use llvm target for testing functionality. `llvm` points to an older Intel
-# generation machine, that legalizes to a simple lowering. Therefore, the
-# legalization is overwritten such that it can be skipped and we use the
-# QNNCanonicalizeOps lowering for the testing.
-def legalize_qnn_conv2d(attrs, inputs, types):
-    return None
-
-
-def get_ref_func(
-    data,
-    kernel,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    padding,
-    strides,
-    dilation,
-    data_layout,
-    kernel_layout,
-    out_dtype,
-    groups,
-    channels=None,
-):
-    if isinstance(input_zero_point, (int, float)):
-        input_zero_point = relay.const(input_zero_point, "int32")
-    if isinstance(kernel_zero_point, (int, float)):
-        kernel_zero_point = relay.const(kernel_zero_point, "int32")
-    else:
-        # Kernel zero point expression requires manual broadcasting for some layouts.
-        if kernel_layout == "OIHW":
-            kernel_zero_point = relay.reshape(kernel_zero_point, [-1, 1, 1, 1])
-        elif kernel_layout == "HWOI":
-            kernel_zero_point = relay.reshape(kernel_zero_point, [1, 1, -1, 1])
-
-    casted_data = relay.op.cast(data, "int32")
-    casted_kernel = relay.op.cast(kernel, "int32")
-    shifted_data = relay.op.subtract(casted_data, input_zero_point)
-    shifted_kernel = relay.op.subtract(casted_kernel, kernel_zero_point)
-    func = relay.op.nn.conv2d(
-        shifted_data,
-        shifted_kernel,
-        padding=padding,
-        strides=strides,
-        dilation=dilation,
-        groups=groups,
-        channels=channels,
-        kernel_size=kernel_size,
-        out_dtype=out_dtype,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-
-    func = relay.Function(relay.analysis.free_vars(func), func)
-    return func
-
-
-def get_qnn_func(
-    data,
-    kernel,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    padding,
-    strides,
-    dilation,
-    data_layout,
-    kernel_layout,
-    out_dtype,
-    channels,
-    groups,
-):
-    if isinstance(input_zero_point, (int, float)):
-        input_zero_point = relay.const(input_zero_point, "int32")
-    if isinstance(kernel_zero_point, (int, float)):
-        kernel_zero_point = relay.const(kernel_zero_point, "int32")
-
-    func = relay.qnn.conv2d(
-        data,
-        kernel,
-        input_zero_point=input_zero_point,
-        kernel_zero_point=kernel_zero_point,
-        input_scale=relay.const(input_scale, "float32"),
-        kernel_scale=relay.const(kernel_scale, "float32"),
-        kernel_size=kernel_size,
-        strides=strides,
-        dilation=dilation,
-        padding=padding,
-        out_dtype=out_dtype,
-        groups=groups,
-        channels=channels,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-    )
-
-    mod = relay.Function(relay.analysis.free_vars(func), func)
-    mod = tvm.IRModule.from_expr(mod)
-    return mod
-
-
-def get_funcs(
-    data_shape,
-    data_dtype,
-    kernel_shape,
-    kernel_dtype,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    kernel_size,
-    padding,
-    strides,
-    dilation,
-    data_layout,
-    kernel_layout,
-    out_dtype,
-    groups=1,
-    channels=None,
-):
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype)
-
-    ref_func = get_ref_func(
-        data,
-        kernel,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        kernel_size,
-        padding,
-        strides,
-        dilation,
-        data_layout,
-        kernel_layout,
-        out_dtype,
-        groups,
-        channels,
-    )
-    ref_func = run_infer_type(ref_func)
-    ref_func = tvm.IRModule.from_expr(ref_func)
-    qnn_func = get_qnn_func(
-        data,
-        kernel,
-        input_zero_point,
-        kernel_zero_point,
-        input_scale,
-        kernel_scale,
-        kernel_size,
-        padding,
-        strides,
-        dilation,
-        data_layout,
-        kernel_layout,
-        out_dtype,
-        channels,
-        groups,
-    )
-
-    return (ref_func, qnn_func)
-
-
-def verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype):
-    def get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype):
-        # Keeping inputs multiple of 4 because of a bug in Average Pool2d
-        # https://discuss.tvm.apache.org/t/pool2d-gives-bad-output-for-integer-inputs/3377
-        low = -128
-        high = 127
-        if data_dtype == "uint8":
-            low = 0
-            high = 255
-        golden_data = np.random.randint(low=low, high=high, size=data_shape).astype(data_dtype)
-        low = -128
-        high = 127
-        if kernel_dtype == "uint8":
-            low = 0
-            high = 255
-        golden_weight = np.random.randint(low=low, high=high, size=kernel_shape).astype(
-            kernel_dtype
-        )
-        return (golden_data, golden_weight)
-
-    def get_output(func, golden_inputs):
-        with tvm.transform.PassContext(opt_level=2):
-            golden_data, golden_weight = golden_inputs
-            params = {"kernel": golden_weight}
-            graph, lib, params = relay.build(func, "llvm", params=params)
-            mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-            mod.set_input("data", golden_data)
-            mod.set_input(**params)
-            mod.run()
-            res = mod.get_output(0).numpy()
-            return res
-
-    golden_inputs = get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype)
-    golden_output = get_output(ref_func, golden_inputs)
-    qnn_output = get_output(qnn_func, golden_inputs)
-    np.testing.assert_equal(qnn_output, golden_output)
-
-
-def test_no_zero_point():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 1, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 1, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=0,
-            kernel_zero_point=0,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # int8 input
-        data_shape = (2, 1, 2, 4)
-        data_dtype = "int8"
-        kernel_shape = (3, 1, 2, 2)
-        kernel_dtype = "int8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=0,
-            kernel_zero_point=0,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_kernel_zero_point():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=0,
-            kernel_zero_point=1,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # int8 input
-        data_shape = (2, 1, 2, 4)
-        data_dtype = "int8"
-        kernel_shape = (3, 1, 2, 2)
-        kernel_dtype = "int8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=0,
-            kernel_zero_point=5,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_input_zero_point():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=0,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # int8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "int8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "int8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=0,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_both_zero_point():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # int8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "int8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "int8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_dynamic_zero_point():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input with non static zero points.
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        input_zero_point = relay.op.multiply(
-            relay.const(2, dtype="int32"), relay.const(2, dtype="int32")
-        )
-        kernel_zero_point = relay.const(np.random.randint(10, size=[3]), "int32")
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=input_zero_point,
-            kernel_zero_point=kernel_zero_point,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # int8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "int8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "int8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=input_zero_point,
-            kernel_zero_point=kernel_zero_point,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_layout():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 2, 4, 4)  # NHWC
-        data_dtype = "uint8"
-        kernel_shape = (2, 2, 4, 3)  # HWIO
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # NHWC and HWOI layout. Used in depthwise conv.
-        data_shape = (2, 2, 4, 3)  # NHWC
-        data_dtype = "uint8"
-        kernel_shape = (2, 2, 3, 1)  # HWOI
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            groups=3,
-            data_layout="NHWC",
-            kernel_layout="HWOI",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_padding():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (1, 4, 2, 2)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=8,
-            kernel_zero_point=5,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(1, 1),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # Try different layout
-        data_shape = (2, 2, 4, 4)  # NHWC
-        data_dtype = "uint8"
-        kernel_shape = (2, 2, 4, 3)  # HWIO
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=8,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(1, 1),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # Try asymmetric padding
-        data_shape = (2, 2, 4, 4)  # NHWC
-        data_dtype = "uint8"
-        kernel_shape = (2, 2, 4, 3)  # HWIO
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=8,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(1, 1, 2, 2),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_dilation():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # Non-zero kernel point - fall back to simpler lowering.
-        data_shape = (2, 4, 4, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(2, 2),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # Zero kernel point
-        data_shape = (2, 4, 4, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=0,
-            kernel_zero_point=0,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(2, 2),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_const_folding():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 2, 2)
-        kernel_dtype = "uint8"
-
-        golden_weight = np.random.randint(low=0, high=255, size=kernel_shape).astype(kernel_dtype)
-        data = relay.var("data", shape=data_shape, dtype=data_dtype)
-        kernel = relay.const(golden_weight)
-        qnn_func = get_qnn_func(
-            data,
-            kernel,
-            input_zero_point=8,
-            kernel_zero_point=3,
-            kernel_size=(2, 2),
-            input_scale=1.0,
-            kernel_scale=1.0,
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-            channels=kernel_shape[0],
-            groups=1,
-        )
-        folded_mod = transform.FoldConstant()(qnn_func)
-        folded_func = folded_mod["main"]
-        assert "reshape" not in folded_func.astext()
-
-
-def test_kernel_size_1x1():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 1, 1)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        assert "avg_pool2d" not in qnn_func.astext()
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_kernel_size_1x1_strides_2():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 4, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 4, 1, 1)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=5,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            strides=(2, 2),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        assert "avg_pool2d" not in qnn_func.astext()
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Fails due to encountering none type in autotvm. See https://github.com/apache/tvm/issues/16538",
-)
-def test_tflite_large_irregular():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (1, 1024, 1, 1)
-        data_dtype = "uint8"
-        kernel_shape = (1001, 1024, 1, 1)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=127,
-            kernel_zero_point=127,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        golden_data = np.full(data_shape, 127).astype("uint8")
-        golden_weight = np.full(kernel_shape, 127).astype("uint8")
-
-        with tvm.transform.PassContext(opt_level=2):
-            params = {"kernel": golden_weight}
-            graph, lib, params = relay.build(qnn_func, "llvm", params=params)
-            mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-            mod.set_input("data", golden_data)
-            mod.set_input(**params)
-            mod.run()
-            qnn_output = mod.get_output(0).numpy()
-        golden_output = np.full((1, 1001, 1, 1), 0).astype("uint8")
-        np.testing.assert_equal(qnn_output, golden_output)
-
-
-def test_tflite_output_multiplier_greater_than_one():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (2, 1, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 1, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            input_zero_point=128,
-            kernel_zero_point=128,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(2, 2),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        golden_data = 128 + np.array((1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4)).reshape(
-            data_shape
-        ).astype("uint8")
-        golden_weight = 128 + np.array((1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1)).reshape(
-            kernel_shape
-        )
-        golden_weight = golden_weight.astype("uint8")
-
-        with tvm.transform.PassContext(opt_level=2):
-            params = {"kernel": golden_weight}
-            graph, lib, params = relay.build(qnn_func, "llvm", params=params)
-            mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-            mod.set_input("data", golden_data)
-            mod.set_input(**params)
-            mod.run()
-            qnn_output = mod.get_output(0).numpy()
-        golden_output = np.array((17, 17, 0, 0, 2, 2, 16, 36, 2, 2, 0, 0)).reshape(2, 3, 1, 2)
-        np.testing.assert_equal(qnn_output, golden_output)
-
-
-def test_tflite_anistropic_strides():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input
-        data_shape = (1, 1, 3, 6)
-        data_dtype = "uint8"
-        kernel_shape = (1, 1, 2, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=127,
-            kernel_zero_point=127,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-            strides=(1, 3),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-        golden_data = np.array(
-            (
-                133,
-                131,
-                129,
-                125,
-                123,
-                121,
-                135,
-                133,
-                131,
-                123,
-                121,
-                119,
-                137,
-                135,
-                133,
-                121,
-                119,
-                117,
-            )
-        ).reshape(data_shape)
-        golden_data = golden_data.astype("uint8")
-        golden_weight = np.array((129, 131, 133, 135)).reshape(kernel_shape)
-        golden_weight = golden_weight.astype("uint8")
-
-        with tvm.transform.PassContext(opt_level=2):
-            params = {"kernel": golden_weight}
-            graph, lib, params = relay.build(qnn_func, "llvm", params=params)
-            mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-            mod.set_input("data", golden_data)
-            mod.set_input(**params)
-            mod.run()
-            qnn_output = mod.get_output(0).numpy()
-        golden_output = np.array((124, -92, 164, -132)).reshape(1, 1, 2, 2)
-        np.testing.assert_equal(qnn_output, golden_output)
-
-
-def test_broadcast_layout():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # Test broadcast support for NHWC layout.
-        data_shape = (1, 229, 229, 3)  # NHWC
-        data_dtype = "uint8"
-        kernel_shape = (7, 7, 3, 64)  # HWIO
-        kernel_dtype = "int8"
-        _, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=8,
-            kernel_zero_point=3,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(7, 7),
-            padding=(1, 1),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        func = qnn_func["main"].body
-        bias = relay.var("bias", shape=(64,), dtype="int32")
-        bias2 = relay.var("bias2", shape=(1, 225, 225, 1), dtype="int32")
-
-        # Check broadcast support on both lhs and rhs
-        func = relay.add(func, bias2)
-        func = relay.add(bias2, func)
-        func = relay.add(bias, func)
-        func = relay.add(func, bias)
-        func = relay.Function(relay.analysis.free_vars(func), func)
-        mod = tvm.IRModule.from_expr(func)
-        with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build(
-                mod, "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512"
-            )
-
-
-def test_depthwise_depth_multiplier():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-
-        # uint8 input, NCHW and OIHW
-        # Depthwise multiplier = 1
-        data_shape = (2, 4, 16, 16)
-        data_dtype = "uint8"
-        kernel_shape = (4, 1, 3, 3)
-        kernel_dtype = "uint8"
-        input_zero_point = relay.op.multiply(
-            relay.const(2, dtype="int32"), relay.const(2, dtype="int32")
-        )
-        kernel_zero_point = relay.const(np.random.randint(10, size=[4]), "int32")
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=input_zero_point,
-            kernel_zero_point=kernel_zero_point,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(3, 3),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-            groups=4,
-            channels=4,
-        )
-
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # Depthwise multiplier = 2
-        data_shape = (10, 4, 16, 16)
-        data_dtype = "uint8"
-        kernel_shape = (4, 2, 3, 3)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=input_zero_point,
-            kernel_zero_point=kernel_zero_point,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(3, 3),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-            groups=4,
-            channels=8,
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # uint8 input, NHWC and HWOI
-        # Depthwise multiplier = 1
-        data_shape = (2, 16, 16, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 3, 4, 1)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=input_zero_point,
-            kernel_zero_point=kernel_zero_point,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(3, 3),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWOI",
-            out_dtype="int32",
-            groups=4,
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-        # Depthwise multiplier = 2
-        data_shape = (2, 16, 16, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 3, 4, 2)
-        kernel_dtype = "uint8"
-        ref_func, qnn_func = get_funcs(
-            data_shape=data_shape,
-            data_dtype=data_dtype,
-            kernel_shape=kernel_shape,
-            kernel_dtype=kernel_dtype,
-            input_zero_point=input_zero_point,
-            kernel_zero_point=kernel_zero_point,
-            input_scale=1.0,
-            kernel_scale=1.0,
-            kernel_size=(3, 3),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWOI",
-            out_dtype="int32",
-            groups=4,
-            channels=8,
-        )
-        verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype)
-
-
-def test_per_channel_kernel_scale():
-    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
-        data_shape = (2, 1, 2, 4)
-        data_dtype = "uint8"
-        kernel_shape = (3, 1, 2, 2)
-        kernel_dtype = "uint8"
-        data = relay.var("data", shape=data_shape, dtype=data_dtype)
-        kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype)
-        kernel_scales = [2, 2, 2]
-        kernel_scales = relay.const(np.array(kernel_scales).astype("float32"))
-        func = relay.qnn.conv2d(
-            data,
-            kernel,
-            input_zero_point=relay.const(0, "int32"),
-            kernel_zero_point=relay.const(0, "int32"),
-            input_scale=relay.const(2.0, "float32"),
-            kernel_scale=kernel_scales,
-            kernel_size=(2, 2),
-            channels=kernel_shape[0],
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-        )
-
-        mod = relay.Function(relay.analysis.free_vars(func), func)
-        mod = tvm.IRModule.from_expr(mod)
-
-
-if __name__ == "__main__":
-    test_no_zero_point()
-    test_input_zero_point()
-    test_kernel_zero_point()
-    test_both_zero_point()
-    test_layout()
-    test_padding()
-    test_dilation()
-    test_const_folding()
-    test_kernel_size_1x1()
-    test_kernel_size_1x1_strides_2()
-    test_tflite_large_irregular()
-    test_broadcast_layout()
-    test_tflite_output_multiplier_greater_than_one()
-    test_tflite_anistropic_strides()
-    test_depthwise_depth_multiplier()
-    test_per_channel_kernel_scale()
diff --git a/tests/python/relay/test_op_qnn_dense.py b/tests/python/relay/test_op_qnn_dense.py
deleted file mode 100644
index d28742ddf06d..000000000000
--- a/tests/python/relay/test_op_qnn_dense.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-
-# We use llvm target for testing functionality. `llvm` points to an older Intel
-# generation machine, that legalizes to a simple lowering. Therefore, the
-# legalization is overwritten such that it can be skipped and we use the
-# QNNCanonicalizeOps lowering for the testing.
-def legalize_qnn_dense(attrs, inputs, types):
-    return None
-
-
-def make_requantize_params(input_scale, output_scale, output_zero_point, out_dtype):
-    config = {
-        "input_scale": input_scale,
-        "output_scale": output_scale,
-        "output_zero_point": output_zero_point,
-        "out_dtype": out_dtype,
-    }
-    return config
-
-
-def make_configuration(
-    quantized_data,
-    quantized_kernel,
-    dtype,
-    input_shape,
-    kernel_shape,
-    input_zero_point,
-    kernel_zero_point,
-    input_scale,
-    kernel_scale,
-    units,
-    output,
-    out_dtype="int32",
-    bias=None,
-    requantize=None,
-):
-    if requantize is not None:
-        assert bias is not None
-    config = {
-        "quantized_data": quantized_data,
-        "quantized_kernel": quantized_kernel,
-        "dtype": dtype,
-        "input_shape": input_shape,
-        "kernel_shape": kernel_shape,
-        "input_zero_point": input_zero_point,
-        "kernel_zero_point": kernel_zero_point,
-        "input_scale": input_scale,
-        "kernel_scale": kernel_scale,
-        "units": units,
-        "output": output,
-        "out_dtype": out_dtype,
-        "bias": bias,
-        "requantize": requantize,
-    }
-    return config
-
-
-def make_int_configuration(use_bias=False, requantize_output=False, per_channel=False):
-    input_shape, kernel_shape, output_shape = (2, 10), (3, 10), (2, 3)
-    input_zero_point, kernel_zero_point = -1, -1
-    in_dtype = "int8"
-    out_dtype = "int32" if not requantize_output else "int8"
-    units = 3
-    quantized_data_np = (
-        np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21, 1, 3, 5, 7, 9, 11, 13, -17, 17, -21])
-        .astype(in_dtype)
-        .reshape(input_shape)
-    )
-    quantized_kernel_np = (
-        np.array(
-            [
-                1,
-                3,
-                5,
-                7,
-                9,
-                11,
-                13,
-                15,
-                17,
-                19,
-                1,
-                3,
-                5,
-                7,
-                9,
-                11,
-                13,
-                15,
-                17,
-                19,
-                1,
-                3,
-                5,
-                7,
-                9,
-                11,
-                13,
-                15,
-                17,
-                19,
-            ]
-        )
-        .astype(in_dtype)
-        .reshape(kernel_shape)
-    )
-    input_scale = 0.5
-    kernel_scale = 0.5
-    output_scale = 1.0
-    bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units,)) if use_bias else None
-
-    if per_channel:
-        assert use_bias and requantize_output
-        kernel_scale = np.array([0.5, 0.3, 0.4], dtype=np.float32)
-        output = np.array([23, 14, 20, 57, 34, 47])
-    elif requantize_output:
-        assert use_bias
-        output = np.array([23, 24, 25, 57, 58, 59])
-    elif use_bias:
-        output = np.array([96, 100, 104, 232, 236, 240])
-    else:
-        output = np.array([92, 92, 92, 228, 228, 228])
-
-    requant_params = (
-        make_requantize_params(input_scale * kernel_scale, output_scale, -1, "int8")
-        if requantize_output
-        else None
-    )
-
-    output = output.astype(out_dtype).reshape(output_shape)
-    return make_configuration(
-        quantized_data=quantized_data_np,
-        quantized_kernel=quantized_kernel_np,
-        dtype=in_dtype,
-        input_shape=input_shape,
-        kernel_shape=kernel_shape,
-        input_zero_point=input_zero_point,
-        kernel_zero_point=kernel_zero_point,
-        input_scale=input_scale,
-        kernel_scale=kernel_scale,
-        units=units,
-        output=output,
-        bias=bias,
-        requantize=requant_params,
-    )
-
-
-def qnn_dense_driver(test_configuration):
-    in_dtype = test_configuration["dtype"]
-    out_dtype = test_configuration["out_dtype"]
-    quantized_data_name = "quantized_data"
-    quantized_kernel_name = "quantized_kernel"
-    expected_out_dtype = test_configuration["out_dtype"]
-    bias_name = "bias"
-    quantized_data = relay.var(
-        quantized_data_name, shape=test_configuration["input_shape"], dtype=in_dtype
-    )
-    quantized_kernel = relay.var(
-        quantized_kernel_name, shape=test_configuration["kernel_shape"], dtype=in_dtype
-    )
-    mod = relay.qnn.dense(
-        quantized_data,
-        quantized_kernel,
-        relay.const(test_configuration["input_zero_point"], "int32"),
-        relay.const(test_configuration["kernel_zero_point"], "int32"),
-        relay.const(test_configuration["input_scale"], "float32"),
-        relay.const(test_configuration["kernel_scale"], "float32"),
-        test_configuration["units"],
-    )
-    if test_configuration[bias_name] is not None:
-        bias = relay.var(bias_name, shape=test_configuration["bias"].shape, dtype=out_dtype)
-        mod = relay.nn.bias_add(mod, bias)
-    if test_configuration["requantize"] is not None:
-        requantize_config = test_configuration["requantize"]
-        mod = relay.qnn.requantize(
-            mod,
-            input_scale=relay.const(requantize_config["input_scale"], "float32"),
-            input_zero_point=relay.const(0, "int32"),
-            output_scale=relay.const(requantize_config["output_scale"], "float32"),
-            output_zero_point=relay.const(requantize_config["output_zero_point"], "int32"),
-            out_dtype=requantize_config["out_dtype"],
-        )
-        expected_out_dtype = requantize_config["out_dtype"]
-
-    mod = relay.Function(relay.analysis.free_vars(mod), mod)
-    mod = tvm.IRModule.from_expr(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    with tvm.transform.PassContext(opt_level=2):
-        graph, lib, params = relay.build(mod, "llvm", params=None)
-        mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-        mod.set_input(quantized_data_name, test_configuration[quantized_data_name])
-        mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name])
-        if test_configuration[bias_name] is not None:
-            mod.set_input(bias_name, test_configuration[bias_name])
-        mod.set_input(**params)
-        mod.run()
-        res = mod.get_output(0).numpy()
-        np.testing.assert_equal(res, test_configuration["output"])
-        assert res.dtype == expected_out_dtype
-
-
-def test_qnn_dense_without_bias():
-    with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense):
-
-        int32_output_without_bias_params = make_int_configuration(use_bias=False)
-        qnn_dense_driver(int32_output_without_bias_params)
-
-
-def test_qnn_dense_with_bias():
-    with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense):
-
-        int32_output_with_bias_params = make_int_configuration(use_bias=True)
-        qnn_dense_driver(int32_output_with_bias_params)
-
-
-def test_qnn_dense_with_requantized_output():
-    with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense):
-
-        int8_requantized_output_with_bias_params = make_int_configuration(
-            use_bias=True, requantize_output=True
-        )
-        qnn_dense_driver(int8_requantized_output_with_bias_params)
-
-
-def test_per_channel_weight_scale():
-    with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense):
-        config = make_int_configuration(use_bias=True, requantize_output=True, per_channel=True)
-        qnn_dense_driver(config)
-
-
-if __name__ == "__main__":
-    test_qnn_dense_without_bias()
-    test_qnn_dense_with_bias()
-    test_qnn_dense_with_requantized_output()
-    test_per_channel_weight_scale()
diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py
deleted file mode 100644
index 68908b4d7e43..000000000000
--- a/tests/python/relay/test_op_qnn_dequantize.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay.testing import run_infer_type
-
-
-def dequantize_test_driver(
-    in_dtype, quant_args, in_data, verify_output_data, axis, out_dtype="float32"
-):
-    shape = in_data.shape
-    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
-    input_zero_point = relay.const(quant_args["in_zero_point"], "int32")
-    input_scale = relay.const(quant_args["in_scale"], "float32")
-    quantized_output = relay.qnn.dequantize(
-        input_data,
-        input_scale=input_scale,
-        input_zero_point=input_zero_point,
-        axis=axis,
-        out_dtype=out_dtype,
-    )
-    mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output)
-    mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(mod, "llvm", params=None)
-        rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-        rt_mod.set_input(input_data=in_data)
-        rt_mod.set_input(**params)
-        rt_mod.run()
-        res = rt_mod.get_output(0).numpy()
-        np.testing.assert_equal(res, verify_output_data)
-        assert res.dtype == out_dtype
-
-
-def test_uint8_to_float32():
-    data = np.array([0, 1, 2, 3, 4, 251, 252, 253, 254, 255]).astype("uint8").reshape((2, 5))
-    output = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    quant_args = {"in_zero_point": 127, "in_scale": 0.5}
-    dequantize_test_driver(
-        in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
-    )
-
-
-def test_int8_to_float32():
-    data = (
-        np.array([-128, -127, -126, -125, -124, 123, 124, 125, 126, 127])
-        .astype("int8")
-        .reshape((2, 5))
-    )
-    output = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    quant_args = {"in_zero_point": -1, "in_scale": 0.5}
-    dequantize_test_driver(
-        in_dtype="int8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
-    )
-
-
-def test_int8_to_float16():
-    data = (
-        np.array([-128, -127, -126, -125, -124, 123, 124, 125, 126, 127])
-        .astype("int8")
-        .reshape((2, 5))
-    )
-    output = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64])
-        .astype("float16")
-        .reshape((2, 5))
-    )
-    quant_args = {"in_zero_point": -1, "in_scale": 0.5}
-    dequantize_test_driver(
-        in_dtype="int8",
-        quant_args=quant_args,
-        in_data=data,
-        verify_output_data=output,
-        axis=-1,
-        out_dtype="float16",
-    )
-
-
-def test_scalar_int8_to_float32():
-    data = np.array(-128).astype("int8")
-    output = np.array(-63.5).astype("float32")
-    quant_args = {"in_zero_point": -1, "in_scale": 0.5}
-    dequantize_test_driver(
-        in_dtype="int8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
-    )
-
-
-def test_int32_to_float32():
-    data = np.array([113, 29, -1052]).astype("int32")
-    output = np.array([0.6550452, 0.16810896, -6.098297]).astype("float32")
-    quant_args = {"in_zero_point": 0, "in_scale": 0.0057968604}
-    dequantize_test_driver(
-        in_dtype="int32", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
-    )
-
-
-def test_channelwise_axis_1():
-    data = np.transpose(
-        np.array([0, 1, 2, 3, 4, 243, 247, 249, 250, 251]).astype("uint8").reshape((2, 5))
-    )
-    output = np.transpose(
-        np.array([-63.5, -63, -62.5, -62, -61.5, 30, 31, 31.5, 31.75, 32])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    quant_args = {
-        "in_zero_point": np.array([127, 123]).astype("int32"),
-        "in_scale": np.array([0.5, 0.25]).astype("float32"),
-    }
-
-    dequantize_test_driver(
-        in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
-    )
-
-
-def test_channelwise_axis_0():
-    data = np.array([0, 1, 2, 3, 4, 243, 247, 249, 250, 251]).astype("uint8").reshape((2, 5))
-    output = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 30, 31, 31.5, 31.75, 32])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    quant_args = {
-        "in_zero_point": np.array([127, 123]).astype("int32"),
-        "in_scale": np.array([0.5, 0.25]).astype("float32"),
-    }
-
-    dequantize_test_driver(
-        in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=0
-    )
-
-
-def test_per_tensor_vector_args():
-    data = np.array([0, 1, 2, 3, 4, 251, 252, 253, 254, 255]).astype("uint8")
-    output = np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64]).astype("float32")
-
-    quant_args = {
-        "in_zero_point": np.array([127]).astype("int32"),
-        "in_scale": np.array([0.5]).astype("float32"),
-    }
-
-    dequantize_test_driver(
-        in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
-    )
-
-
-def test_dynamic_dequantize():
-    x = relay.var("x", shape=(1, 2, 3, 4), dtype="int8")
-    scale_var = relay.var("scale", shape=(), dtype="float32")
-    zp_var = relay.var("zp", shape=(), dtype="int32")
-
-    deq_x = relay.qnn.dequantize(x, scale_var * scale_var, zp_var + zp_var)
-    tt = run_infer_type(deq_x)
-
-    assert tt.checked_type == relay.TensorType((1, 2, 3, 4), "float32")
-    func = relay.Function([x, scale_var, zp_var], deq_x)
-    data = np.random.uniform(size=(1, 2, 3, 4)).astype("int8")
-    scale = np.array(1).astype("float32")
-    zp = np.array(0).astype("int32")
-
-    mod = tvm.ir.IRModule.from_expr(func)
-
-    for target, dev in tvm.testing.enabled_targets():
-        # TODO: (electriclilies) enable AlterOpLayout when it is fixed
-        with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            lib = relay.build(mod, target=target)
-
-    module = graph_executor.GraphModule(lib["default"](dev))
-    module.set_input(**{"x": data, "scale": scale, "zp": zp})
-    module.run()
-
-
-if __name__ == "__main__":
-    test_uint8_to_float32()
-    test_int8_to_float32()
-    test_int8_to_float16()
-    test_scalar_int8_to_float32()
-    test_int32_to_float32()
-    test_channelwise_axis_1()
-    test_channelwise_axis_0()
-    test_dynamic_dequantize()
diff --git a/tests/python/relay/test_op_qnn_leaky_relu.py b/tests/python/relay/test_op_qnn_leaky_relu.py
deleted file mode 100644
index 21e42d8d27fb..000000000000
--- a/tests/python/relay/test_op_qnn_leaky_relu.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import numpy as np
-from tvm import relay
-
-
-def dequantize(data, scale, zp):
-    return scale * (np.asarray(data) - zp)
-
-
-def generate_golden_output(x_data, dequantized_x, alpha, o_scale, o_zero_point, i_zero_point):
-    prod = np.multiply(dequantized_x, alpha)
-    prod = np.around(prod / o_scale + o_zero_point)
-
-    q_min = np.iinfo(np.uint8).min
-    q_max = np.iinfo(np.uint8).max
-    prod = np.clip(prod, q_min, q_max)
-
-    requantized = np.clip(np.round(dequantized_x / o_scale + o_zero_point), q_min, q_max)
-
-    output = np.where(x_data < i_zero_point, prod, requantized)
-    return output
-
-
-def test_qnn_leaky_relu():
-    data_dtype = "uint8"
-    input_scale = 0.125
-    input_zero_point = 60
-    output_scale = 0.6
-    output_zero_point = 17
-    alpha = 0.9
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.qnn.leaky_relu(
-        x=x,
-        alpha=alpha,
-        input_scale=relay.const(input_scale, "float32"),
-        input_zero_point=relay.const(input_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-
-    func = relay.Function([x], y)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 133, 0, 9)).reshape((1, 4))
-    x_dequantized = dequantize(x_data, input_scale, input_zero_point)
-    golden_output = generate_golden_output(
-        x_data, x_dequantized, alpha, output_scale, output_zero_point, input_zero_point
-    )
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(x_data)
-
-    np.testing.assert_allclose(op_res.numpy(), golden_output, atol=1)
-
-
-if __name__ == "__main__":
-    test_qnn_leaky_relu()
diff --git a/tests/python/relay/test_op_qnn_mul.py b/tests/python/relay/test_op_qnn_mul.py
deleted file mode 100644
index bbc1bfd2ae57..000000000000
--- a/tests/python/relay/test_op_qnn_mul.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-import tvm.topi.testing
-
-# "unquantize" a quantized tensor
-def recover(data, scale, zp):
-    return scale * (np.asarray(data) - zp)
-
-
-def generate_golden_output(x_recovered, y_recovered, scale, zp):
-    mul = x_recovered * y_recovered
-    output = np.around(mul / scale + zp)
-
-    q_min = np.iinfo(np.uint8).min
-    q_max = np.iinfo(np.uint8).max
-    return np.clip(output, q_min, q_max)
-
-
-def test_tflite_same_io_qnn_params():
-    data_dtype = "uint8"
-
-    lhs_scale = rhs_scale = output_scale = 0.00784314
-    lhs_zero_point = rhs_zero_point = output_zero_point = 127
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    z = relay.qnn.mul(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(lhs_scale, "float32"),
-        lhs_zero_point=relay.const(lhs_zero_point, "int32"),
-        rhs_scale=relay.const(rhs_scale, "float32"),
-        rhs_zero_point=relay.const(rhs_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_datas = [
-        np.array((1, 153, 2, 178)).reshape((1, 4)),
-        np.array((25, 1, 178, 216)).reshape((1, 4)),
-        np.array((25, 153, 1, 165)).reshape((1, 4)),
-    ]
-    y_datas = [
-        np.array((204, 178, 1, 8)).reshape((1, 4)),
-        np.array((204, 178, 191, 1)).reshape((1, 4)),
-        np.array((204, 178, 1, 191)).reshape((1, 4)),
-    ]
-
-    for i in range(0, 3):
-        x_data = x_datas[i]
-        y_data = y_datas[i]
-
-        x_rec = recover(x_data, lhs_scale, lhs_zero_point)
-        y_rec = recover(y_data, rhs_scale, rhs_zero_point)
-        golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-
-        op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-            x_data, y_data
-        )
-
-        np.testing.assert_equal(op_res.numpy(), np.uint8(golden))
-
-
-def test_tflite_different_io_qnn_params():
-    data_dtype = "uint8"
-
-    lhs_scale = 0.0156863
-    lhs_zero_point = 127
-    rhs_scale = 0.0117647
-    rhs_zero_point = 85
-    output_scale = 0.0235294
-    output_zero_point = 128
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    z = relay.qnn.mul(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(lhs_scale, "float32"),
-        lhs_zero_point=relay.const(lhs_zero_point, "int32"),
-        rhs_scale=relay.const(rhs_scale, "float32"),
-        rhs_zero_point=relay.const(rhs_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_datas = [
-        np.array((76, 140, 153, 172)).reshape((1, 4)),
-        np.array((133, 140, 146, 153)).reshape((1, 4)),
-        np.array((76, 140, 172, 146)).reshape((1, 4)),
-    ]
-    y_datas = [
-        np.array((136, 119, 128, 17)).reshape((1, 4)),
-        np.array((136, 119, 111, 94)).reshape((1, 4)),
-        np.array((136, 119, 17, 128)).reshape((1, 4)),
-    ]
-
-    for i in range(0, 3):
-        x_data = x_datas[i]
-        y_data = y_datas[i]
-
-        x_rec = recover(x_data, lhs_scale, lhs_zero_point)
-        y_rec = recover(y_data, rhs_scale, rhs_zero_point)
-        golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-
-        op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-            x_data, y_data
-        )
-        np.testing.assert_equal(op_res.numpy(), np.uint8(golden))
-
-
-def test_saturation():
-    # Same params
-    data_dtype = "uint8"
-    lhs_scale = rhs_scale = output_scale = 0.125
-    lhs_zero_point = rhs_zero_point = output_zero_point = 0
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    z = relay.qnn.mul(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(lhs_scale, "float32"),
-        lhs_zero_point=relay.const(lhs_zero_point, "int32"),
-        rhs_scale=relay.const(rhs_scale, "float32"),
-        rhs_zero_point=relay.const(rhs_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
-    y_data = np.array((255, 255, 128, 0)).reshape((1, 4))
-
-    x_rec = recover(x_data, lhs_scale, lhs_zero_point)
-    y_rec = recover(y_data, rhs_scale, rhs_zero_point)
-
-    golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), np.uint8(golden))
-
-    # Same params, different scale
-
-    lhs_scale = rhs_scale = 0.125
-    output_scale = 0.25
-
-    z = relay.qnn.mul(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(lhs_scale, "float32"),
-        lhs_zero_point=relay.const(lhs_zero_point, "int32"),
-        rhs_scale=relay.const(rhs_scale, "float32"),
-        rhs_zero_point=relay.const(rhs_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
-    y_data = np.array((255, 255, 127, 0)).reshape((1, 4))
-
-    x_rec = recover(x_data, lhs_scale, lhs_zero_point)
-    y_rec = recover(y_data, rhs_scale, rhs_zero_point)
-
-    golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), np.uint8(golden))
-
-    # All params different
-
-    lhs_scale = 0.5
-    rhs_scale = 0.25
-    output_scale = 0.125
-
-    z = relay.qnn.mul(
-        lhs=x,
-        rhs=y,
-        lhs_scale=relay.const(lhs_scale, "float32"),
-        lhs_zero_point=relay.const(lhs_zero_point, "int32"),
-        rhs_scale=relay.const(rhs_scale, "float32"),
-        rhs_zero_point=relay.const(rhs_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    x_data = np.array((255, 0, 1, 0)).reshape((1, 4))
-    y_data = np.array((0, 128, 64, 0)).reshape((1, 4))
-
-    x_rec = recover(x_data, lhs_scale, lhs_zero_point)
-    y_rec = recover(y_data, rhs_scale, rhs_zero_point)
-
-    golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-        x_data, y_data
-    )
-    np.testing.assert_equal(op_res.numpy(), np.uint8(golden))
-
-
-if __name__ == "__main__":
-    test_tflite_same_io_qnn_params()
-    test_tflite_different_io_qnn_params()
-    test_saturation()
diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py
deleted file mode 100644
index 89f8904698e3..000000000000
--- a/tests/python/relay/test_op_qnn_quantize.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay.testing import run_infer_type
-
-
-def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data, verify_output_data):
-    shape = in_data.shape
-    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
-    output_zero_point = relay.const(quant_args["out_zero_point"])
-    output_scale = relay.const(quant_args["out_scale"])
-    quantized_output = relay.qnn.quantize(
-        input_data,
-        output_scale=output_scale,
-        output_zero_point=output_zero_point,
-        axis=axis,
-        out_dtype=out_dtype,
-    )
-    mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output)
-    mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(mod, "llvm", params=None)
-        rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-        rt_mod.set_input(input_data=in_data)
-        rt_mod.set_input(**params)
-        rt_mod.run()
-        res = rt_mod.get_output(0).numpy()
-        np.testing.assert_equal(res, verify_output_data)
-        assert res.dtype == out_dtype
-
-
-def test_float32_to_uint8():
-    data = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    output = np.array([0, 1, 2, 3, 4, 251, 252, 253, 254, 255]).astype("uint8").reshape((2, 5))
-    quant_args = {"out_zero_point": np.int32(127), "out_scale": np.float32(0.5)}
-    quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="uint8",
-        in_data=data,
-        verify_output_data=output,
-    )
-
-
-def test_float32_to_int8():
-    data = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    output = (
-        np.array([-128, -127, -126, -125, -124, 123, 124, 125, 126, 127])
-        .astype("int8")
-        .reshape((2, 5))
-    )
-    quant_args = {"out_zero_point": np.int32(-1), "out_scale": np.float32(0.5)}
-    quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="int8",
-        in_data=data,
-        verify_output_data=output,
-    )
-
-
-def test_float32_to_uint16():
-    data = (
-        np.array([-6553, -6552.8, -6552.6, -6552.4, -6552.2, 6553.2, 6553.4, 6553.6, 6553.8, 6554])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    output = (
-        np.array([0, 1, 2, 3, 4, 65531, 65532, 65533, 65534, 65535])
-        .astype("uint16")
-        .reshape((2, 5))
-    )
-    quant_args = {"out_zero_point": np.int32(32765), "out_scale": np.float32(0.2)}
-    quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="uint16",
-        in_data=data,
-        verify_output_data=output,
-    )
-
-
-def test_scalar_float32_to_int8():
-    data = np.array(-63.5).astype("float32")
-    output = np.array(-128).astype("int8")
-    quant_args = {"out_zero_point": np.int32(-1), "out_scale": np.float32(0.5)}
-    quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="int8",
-        in_data=data,
-        verify_output_data=output,
-    )
-
-
-def test_channelwise_axis_0():
-    data = (
-        np.array([-63.5, -63, -62.5, -62, -61.5, 30, 31, 31.5, 31.75, 32])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    output = np.array([0, 1, 2, 3, 4, 243, 247, 249, 250, 251]).astype("uint8").reshape((2, 5))
-    quant_args = {
-        "out_zero_point": np.array([127, 123]).astype("int32"),
-        "out_scale": np.array([0.5, 0.25]).astype("float32"),
-    }
-
-    quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=0,
-        out_dtype="uint8",
-        in_data=data,
-        verify_output_data=output,
-    )
-
-
-def test_channelwise_axis_1():
-    data = np.transpose(
-        np.array([-63.5, -63, -62.5, -62, -61.5, 30, 31, 31.5, 31.75, 32])
-        .astype("float32")
-        .reshape((2, 5))
-    )
-    output = np.transpose(
-        np.array([0, 1, 2, 3, 4, 243, 247, 249, 250, 251]).astype("uint8").reshape((2, 5))
-    )
-    quant_args = {
-        "out_zero_point": np.array([127, 123]).astype("int32"),
-        "out_scale": np.array([0.5, 0.25]).astype("float32"),
-    }
-
-    quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="uint8",
-        in_data=data,
-        verify_output_data=output,
-    )
-
-
-def test_dynamic_quantize():
-    x = relay.var("x", shape=(1, 2, 3, 4), dtype="float32")
-    scale_var = relay.var("scale", shape=(), dtype="float32")
-    zp_var = relay.var("zp", shape=(), dtype="int32")
-
-    q_x = relay.qnn.quantize(x, scale_var * scale_var, zp_var + zp_var)
-    tt = run_infer_type(q_x)
-
-    assert tt.checked_type == relay.TensorType((1, 2, 3, 4), "int8")
-    func = relay.Function([x, scale_var, zp_var], q_x)
-    data = np.random.uniform(size=(1, 2, 3, 4)).astype("float32")
-    scale = np.array(1).astype("float32")
-    zp = np.array(0).astype("int32")
-
-    mod = tvm.ir.IRModule.from_expr(func)
-
-    for target, dev in tvm.testing.enabled_targets():
-        # TODO: (electriclilies) enable AlterOpLayout when it is fixed
-        with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            lib = relay.build(mod, target=target)
-
-    module = graph_executor.GraphModule(lib["default"](dev))
-    module.set_input(**{"x": data, "scale": scale, "zp": zp})
-    module.run()
-
-
-if __name__ == "__main__":
-    test_float32_to_uint8()
-    test_float32_to_int8()
-    test_float32_to_uint16()
-    test_scalar_float32_to_int8()
-    test_channelwise_axis_0()
-    test_channelwise_axis_1()
-    test_dynamic_quantize()
diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py
deleted file mode 100644
index 4c0f2c7ee7fd..000000000000
--- a/tests/python/relay/test_op_qnn_requantize.py
+++ /dev/null
@@ -1,648 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-
-roundings = ["UPWARD", "TONEAREST"]
-compute_dtypes = ["float32", "float64", "int64"]
-out_dtypes = ["int8", "int16"]
-
-
-def verify(mod, goldens, target="llvm"):
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(mod, target, params=None)
-        golden_data, golden_output = goldens
-        rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-        rt_mod.set_input("input_data", golden_data)
-        rt_mod.set_input(**params)
-        rt_mod.run()
-        res = rt_mod.get_output(0).numpy()
-        np.testing.assert_equal(res, golden_output)
-
-
-def get_mod(
-    data_shape,
-    data_dtype,
-    out_dtype,
-    input_scale,
-    output_scale,
-    input_zero_point=0,
-    output_zero_point=0,
-    rounding="None",
-    compute_dtype="None",
-    axis=0,
-):
-    input_data = relay.var("input_data", shape=data_shape, dtype=data_dtype)
-    if isinstance(input_scale, float):
-        input_scale_expr = relay.const(input_scale, "float32")
-    else:
-        input_scale_expr = relay.const(np.array(input_scale).astype("float32"))
-
-    if isinstance(input_zero_point, float):
-        input_zero_point_expr = relay.const(input_zero_point, "int32")
-    else:
-        input_zero_point_expr = relay.const(np.array(input_zero_point).astype("int32"))
-
-    mod = relay.qnn.requantize(
-        input_data,
-        input_scale=input_scale_expr,
-        input_zero_point=input_zero_point_expr,
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-        axis=axis,
-        rounding=rounding,
-        compute_dtype=compute_dtype,
-        out_dtype=out_dtype,
-    )
-
-    mod = relay.Function(relay.analysis.free_vars(mod), mod)
-    mod = tvm.IRModule.from_expr(mod)
-    return mod
-
-
-def test_same_scale():
-    # Have same scales, everything within range
-    golden_data = np.arange(-100, 100, 1).astype("int32")
-    golden_output = golden_data
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(200,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=0.5,
-                    output_scale=0.5,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-                assert "right_shift" not in mod.astext()
-                verify(mod, (golden_data, golden_output))
-
-
-def test_scalar_same_scale():
-    # Have same scales, everything within range
-    golden_data = np.array(-10).astype("int32")
-    golden_output = golden_data
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=0.5,
-                    output_scale=0.5,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-                assert "right_shift" not in mod.astext()
-                verify(mod, (golden_data, golden_output))
-
-
-def test_downscale():
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=1,
-                    output_scale=16,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-
-                # Try positive values
-                # 8 corresponds to 0.5, resulting in 1
-                golden_data = np.arange(0, 32, 1).astype("int32")
-                golden_output = np.repeat([0, 1, 2], [8, 16, 8])
-                verify(mod, (golden_data, golden_output))
-
-                # Try negative values
-                # -8 corresponds to -0.5. For UPWARD, this is 0
-                golden_data = np.arange(0, -32, -1).astype("int32")
-                if rounding == "UPWARD":
-                    golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-                else:
-                    golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-                verify(mod, (golden_data, golden_output))
-
-                # Try a different scale
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=1,
-                    output_scale=4,
-                    rounding=rounding,
-                )
-
-                # Try positive values
-                # 2I corresponds to 0.5, resulting in 1
-                golden_data = np.arange(0, 32, 1).astype("int32")
-                golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], [2, 4, 4, 4, 4, 4, 4, 4, 2])
-                verify(mod, (golden_data, golden_output))
-
-                # Try negative values
-                # -8 corresponds to -0.5. For UPWARD, this is 0
-                golden_data = np.arange(0, -32, -1).astype("int32")
-                if rounding == "UPWARD":
-                    golden_output = np.repeat(
-                        [0, -1, -2, -3, -4, -5, -6, -7, -8], [3, 4, 4, 4, 4, 4, 4, 4, 1]
-                    )
-                else:
-                    golden_output = np.repeat(
-                        [0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]
-                    )
-                verify(mod, (golden_data, golden_output))
-
-            # Try uint8 out_dtype
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="uint8",
-                input_scale=1,
-                output_scale=16,
-                rounding=rounding,
-            )
-
-            # Try positive values
-            # 8 corresponds to 0.5, resulting in 1
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.repeat([0, 1, 2], [8, 16, 8])
-            verify(mod, (golden_data, golden_output))
-
-            # Try uint8 in_dtyope and uint8 out_dtype
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="uint8",
-                out_dtype="uint8",
-                input_scale=1,
-                output_scale=16,
-                rounding=rounding,
-            )
-
-            # Try positive values
-            # 8 corresponds to 0.5, resulting in 1
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.repeat([0, 1, 2], [8, 16, 8])
-            verify(mod, (golden_data, golden_output))
-
-
-def test_upscale():
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=2,
-                    output_scale=1,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-
-                # Try positive values
-                # 8 corresponds to 0.5, resulting in 1
-                golden_data = np.arange(0, 32, 1).astype("int32")
-                golden_output = np.multiply(2, golden_data)
-                verify(mod, (golden_data, golden_output))
-
-                # Try negative values
-                # -8 corresponds to -0.5. For UPWARD, this is 0
-                golden_data = np.arange(0, -32, -1).astype("int32")
-                golden_output = np.multiply(2, golden_data)
-                verify(mod, (golden_data, golden_output))
-
-
-def test_non_power_of_two():
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=1,
-                    output_scale=3,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-
-                # Try positive values
-                golden_data = np.multiply(np.arange(0, 32, 1).astype("int32"), 3)
-                golden_output = np.arange(0, 32, 1)
-                verify(mod, (golden_data, golden_output))
-
-                # Try negative values
-                golden_data = np.multiply(np.arange(0, -32, -1).astype("int32"), 3)
-                golden_output = np.arange(0, -32, -1)
-                verify(mod, (golden_data, golden_output))
-
-                # Try a different scale
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=3,
-                    output_scale=1,
-                    rounding=rounding,
-                )
-
-                # Try positive values
-                golden_data = np.arange(0, 32, 1).astype("int32")
-                golden_output = np.multiply(golden_data, 3)
-                verify(mod, (golden_data, golden_output))
-
-                # Try negative values
-                golden_data = np.arange(0, -32, -1).astype("int32")
-                golden_output = np.multiply(golden_data, 3)
-                verify(mod, (golden_data, golden_output))
-
-
-def test_saturation_int8():
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(16,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=0.5,
-                output_scale=0.5,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            golden_data = np.arange(0, 16, 1).astype("int32")
-            golden_data = np.add(120, golden_data)
-            output = np.array(
-                [120, 121, 122, 123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127]
-            )
-            golden_output = output
-            verify(mod, (golden_data, golden_output))
-
-            # Try negative numbers
-            golden_data = np.arange(0, -16, -1).astype("int32")
-            golden_data = np.add(-120, golden_data)
-            output = np.array(
-                [
-                    -120,
-                    -121,
-                    -122,
-                    -123,
-                    -124,
-                    -125,
-                    -126,
-                    -127,
-                    -128,
-                    -128,
-                    -128,
-                    -128,
-                    -128,
-                    -128,
-                    -128,
-                    -128,
-                ]
-            )
-            golden_output = output
-            verify(mod, (golden_data, golden_output))
-
-
-def test_saturation_int16():
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(16,),
-                data_dtype="int32",
-                out_dtype="int16",
-                input_scale=0.5,
-                output_scale=0.5,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            golden_data = np.arange(0, 16, 1).astype("int32")
-            golden_data = np.add(32760, golden_data)
-            output = np.array(
-                [
-                    32760,
-                    32761,
-                    32762,
-                    32763,
-                    32764,
-                    32765,
-                    32766,
-                    32767,
-                    32767,
-                    32767,
-                    32767,
-                    32767,
-                    32767,
-                    32767,
-                    32767,
-                    32767,
-                ]
-            )
-            golden_output = output
-            verify(mod, (golden_data, golden_output))
-
-            # Try negative numbers
-            golden_data = np.arange(0, -16, -1).astype("int32")
-            golden_data = np.add(-32760, golden_data)
-            output = np.array(
-                [
-                    -32760,
-                    -32761,
-                    -32762,
-                    -32763,
-                    -32764,
-                    -32765,
-                    -32766,
-                    -32767,
-                    -32768,
-                    -32768,
-                    -32768,
-                    -32768,
-                    -32768,
-                    -32768,
-                    -32768,
-                    -32768,
-                ]
-            )
-            golden_output = output
-            verify(mod, (golden_data, golden_output))
-
-
-def test_zero_point():
-    # Output zero point
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=16,
-                output_zero_point=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-
-            # Try positive values
-            # 8 corresponds to 0.5, resulting in 1
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.repeat([0, 1, 2], [8, 16, 8])
-            golden_output = np.add(1, golden_output)
-            verify(mod, (golden_data, golden_output))
-
-            # Try negative values
-            # -8 corresponds to -0.5. For UPWARD, this is 0
-            golden_data = np.arange(-32, -64, -1).astype("int32")
-            if rounding == "UPWARD":
-                golden_output = np.repeat([-2, -3, -4], [9, 16, 7])
-            else:
-                golden_output = np.repeat([-2, -3, -4], [8, 16, 8])
-            golden_output = np.add(1, golden_output)
-            verify(mod, (golden_data, golden_output))
-
-    # Input zero point
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=1,
-                    output_scale=16,
-                    input_zero_point=16,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-
-                # Try positive values
-                golden_data = np.arange(32, 64, 1).astype("int32")
-                golden_output = np.repeat([2, 3, 4], [8, 16, 8])
-                golden_output = np.subtract(golden_output, 1)
-                verify(mod, (golden_data, golden_output))
-
-                # Try negative values
-                golden_data = np.arange(-32, -64, -1).astype("int32")
-                if rounding == "UPWARD":
-                    golden_output = np.repeat([-2, -3, -4], [9, 16, 7])
-                else:
-                    golden_output = np.repeat([-2, -3, -4], [8, 16, 8])
-                golden_output = np.subtract(golden_output, 1)
-                verify(mod, (golden_data, golden_output))
-
-
-def test_per_channel_same_scale():
-    # Have same scales, everything within range
-    golden_data = np.arange(-5, 5, 1).astype("int32").reshape((5, 2))
-    golden_output = golden_data
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            for qnn_out_dtype in out_dtypes:
-                mod = get_mod(
-                    data_shape=(5, 2),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=[0.5, 0.5],
-                    output_scale=0.5,
-                    axis=1,
-                    rounding=rounding,
-                    compute_dtype=compute_dtype,
-                )
-                verify(mod, (golden_data, golden_output))
-
-    # Change axis
-    golden_data = np.arange(-10, 10, 1).astype("int32").reshape((2, 2, 5))
-    golden_output = golden_data
-
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(2, 2, 5),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=[0.5, 0.5],
-                output_scale=0.5,
-                axis=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            verify(mod, (golden_data, golden_output))
-
-
-def test_per_channel_different_scale():
-    # Have same scales, everything within range
-    golden_data = np.arange(-5, 5, 1).astype("int32").reshape((5, 2))
-    golden_output = np.array([-5, -2, -3, -1, -1, 0, 1, 1, 3, 2]).reshape((5, 2))
-
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(5, 2),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=[0.5, 0.25],
-                output_scale=0.5,
-                axis=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            verify(mod, (golden_data, golden_output))
-
-    # Change axis
-    golden_data = np.arange(-20, 20, 2).astype("int32").reshape((2, 2, 5))
-    golden_output = np.array(
-        [-20, -18, -16, -14, -12, -5, -4, -3, -2, -1, 0, 2, 4, 6, 8, 5, 6, 7, 8, 9]
-    ).reshape((2, 2, 5))
-
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(2, 2, 5),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=[0.5, 0.25],
-                output_scale=0.5,
-                axis=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            verify(mod, (golden_data, golden_output))
-
-    # Have input scale > output scale
-    golden_data = np.arange(-5, 5, 1).astype("int32").reshape((5, 2))
-    golden_output = np.array([-10, -2, -6, -1, -2, 0, 2, 1, 6, 2]).reshape((5, 2))
-
-    for compute_dtype in compute_dtypes:
-        for rounding in roundings:
-            mod = get_mod(
-                data_shape=(5, 2),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=[1.0, 0.25],
-                output_scale=0.5,
-                axis=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            verify(mod, (golden_data, golden_output))
-
-
-def test_default_cfg_and_no_args():
-    for qnn_out_dtype in out_dtypes:
-        mod = get_mod(
-            data_shape=(32,),
-            data_dtype="int32",
-            out_dtype=qnn_out_dtype,
-            input_scale=1,
-            output_scale=16,
-        )
-        golden_data = np.arange(0, -32, -1).astype("int32")
-        golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-        verify(mod, (golden_data, golden_output))
-
-
-def test_non_default_cfg_and_no_args():
-    for rounding_cfg in roundings:
-        for qnn_out_dtype in out_dtypes:
-            with relay.qnn.requantize_config(rounding=rounding_cfg):
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=1,
-                    output_scale=16,
-                )
-
-                golden_data = np.arange(0, -32, -1).astype("int32")
-
-                if rounding_cfg == "UPWARD":
-                    golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-                else:
-                    golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-                verify(mod, (golden_data, golden_output))
-
-
-def test_default_cfg_and_args():
-    for rounding in roundings:
-        for qnn_out_dtype in out_dtypes:
-            with relay.qnn.requantize_config(rounding="UPWARD"):
-                mod = get_mod(
-                    data_shape=(32,),
-                    data_dtype="int32",
-                    out_dtype=qnn_out_dtype,
-                    input_scale=1,
-                    output_scale=16,
-                    rounding=rounding,
-                )
-
-                golden_data = np.arange(0, -32, -1).astype("int32")
-
-                if rounding == "UPWARD":
-                    golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-                else:
-                    golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-                verify(mod, (golden_data, golden_output))
-
-
-def test_non_default_cfg_and_args():
-    for rounding_arg in roundings:
-        for rounding_cfg in roundings:
-            for qnn_out_dtype in out_dtypes:
-                with relay.qnn.requantize_config(rounding=rounding_cfg):
-                    mod = get_mod(
-                        data_shape=(32,),
-                        data_dtype="int32",
-                        out_dtype=qnn_out_dtype,
-                        input_scale=1,
-                        output_scale=16,
-                        rounding=rounding_arg,
-                    )
-
-                    golden_data = np.arange(0, -32, -1).astype("int32")
-
-                    if rounding_arg == "UPWARD":
-                        golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-                    else:
-                        golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-                    verify(mod, (golden_data, golden_output))
-
-
-if __name__ == "__main__":
-    test_same_scale()
-    test_scalar_same_scale()
-    test_downscale()
-    test_upscale()
-    test_non_power_of_two()
-    test_saturation_int8()
-    test_saturation_int16()
-    test_zero_point()
-    test_per_channel_same_scale()
-    test_per_channel_different_scale()
-    test_default_cfg_and_no_args()
-    test_non_default_cfg_and_no_args()
-    test_default_cfg_and_args()
-    test_non_default_cfg_and_args()
diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py
deleted file mode 100644
index 75d7f9727af9..000000000000
--- a/tests/python/relay/test_op_qnn_simulated_dequantize.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.runtime.vm import VirtualMachine
-from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
-
-
-def dequantize_test_driver(in_dtype, quant_args, axis, in_data):
-    shape = in_data.shape
-    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
-    input_zero_point = relay.const(quant_args["in_zero_point"])
-    input_scale = relay.const(quant_args["in_scale"])
-    dequantized_output = relay.qnn.dequantize(
-        input_data,
-        input_scale=input_scale,
-        input_zero_point=input_zero_point,
-        axis=axis,
-    )
-    mod = relay.Function(relay.analysis.free_vars(dequantized_output), dequantized_output)
-    mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(mod, "llvm", params=None)
-    rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-    rt_mod.set_input(input_data=in_data)
-    rt_mod.set_input(**params)
-    rt_mod.run()
-    res = rt_mod.get_output(0).numpy()
-    return res
-
-
-def build_simulated_dequantize(input_data, scale, zp, dtype, axis=-1):
-    sim_q = relay.qnn.simulated_dequantize(
-        input_data,
-        scale,
-        zp,
-        axis=axis,
-        in_dtype=dtype,
-    )
-    mod = tvm.IRModule.from_expr(sim_q)
-    with tvm.transform.PassContext(opt_level=3):
-        vm_exec = relay.vm.compile(mod, "llvm", params=None)
-    vm = VirtualMachine(vm_exec, tvm.cpu(0))
-    return vm
-
-
-def verify_simulated_dequantize_simple(dtype):
-    data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype(dtype)
-    data_fp = data.astype("float32")
-    scale_np = np.float32(0.5)
-    zp_np = np.int32(127)
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype])
-    quant_args = {"in_zero_point": zp_np, "in_scale": scale_np}
-    dq_out = dequantize_test_driver(
-        in_dtype=dtype,
-        quant_args=quant_args,
-        axis=-1,
-        in_data=data,
-    )
-    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
-    scale = relay.var("scale", shape=[])
-    zp = relay.var("zp", shape=[], dtype="int32")
-    dtype = relay.var("dtype", shape=[], dtype="int32")
-    vm = build_simulated_dequantize(input_data, scale, zp, dtype)
-    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_allclose(sim_dq_out.numpy(), dq_out, rtol=1e-5)
-
-
-def test_simulated_dequantize():
-    verify_simulated_dequantize_simple("uint8")
-    verify_simulated_dequantize_simple("int8")
-    verify_simulated_dequantize_simple("int32")
-
-
-def test_dynamic_channels():
-    # Compile simulated quantize once but support either per-channel or scalar params.
-    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("int8")
-    data_fp = data.astype("float32")
-    # Test scalar qnn params.
-    scale_np = np.asarray([0.5]).astype("float32")
-    zp_np = np.asarray([0]).astype("int32")
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"])
-    quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]}
-    dq_out = dequantize_test_driver(
-        in_dtype="int8",
-        quant_args=quant_args,
-        axis=0,
-        in_data=data,
-    )
-    # Create variables with undefined shape and run with scalar inputs.
-    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
-    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
-    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
-    dtype = relay.var("dtype", shape=[], dtype="int32")
-    vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0)
-    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_allclose(sim_dq_out.numpy(), dq_out, rtol=1e-5)
-
-    # Now get the perchannel quantize output and compare without recompiling.
-    scale_np = np.array([0.5, 0.25]).astype("float32")
-    zp_np = np.array([127, 123]).astype("int32")
-
-    # Get the reference quantize output.
-    quant_args = {"in_zero_point": zp_np, "in_scale": scale_np}
-    dq_out = dequantize_test_driver(
-        in_dtype="int8",
-        quant_args=quant_args,
-        axis=0,
-        in_data=data,
-    )
-    # Run the simulated quantize without recompiling and confirm results match.
-    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_allclose(sim_dq_out.numpy(), dq_out, rtol=1e-5)
-
-
-def test_dynamic_dtype():
-    # Compile simulated quantize once but support any type of quantization.
-    data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("uint8")
-    data_fp = data.astype("float32")
-    # Test scalar uint8 to fp32.
-    scale_np = np.asarray([0.5]).astype("float32")
-    zp_np = np.asarray([127]).astype("int32")
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
-    quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]}
-    dq_out = dequantize_test_driver(
-        in_dtype="uint8",
-        quant_args=quant_args,
-        axis=-1,
-        in_data=data,
-    )
-    # Create variables with undefined shape and run with scalar inputs.
-    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
-    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
-    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
-    dtype = relay.var("dtype", shape=[], dtype="int32")
-    vm = build_simulated_dequantize(input_data, scale, zp, dtype)
-    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_allclose(sim_dq_out.numpy(), dq_out, rtol=1e-5)
-
-    # Now test int8 to float32 compilation.
-    data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8")
-    data_fp = data.astype("float32")
-    # Get the reference quantize output.
-    dq_out = dequantize_test_driver(
-        in_dtype="int8",
-        quant_args=quant_args,
-        axis=-1,
-        in_data=data,
-    )
-    # Run the simulated quantize without recompiling and confirm results match.
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"])
-    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_allclose(sim_dq_out.numpy(), dq_out, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_simulated_dequantize()
-    test_dynamic_channels()
-    test_dynamic_dtype()
diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py
deleted file mode 100644
index c0f45837e4b5..000000000000
--- a/tests/python/relay/test_op_qnn_simulated_quantize.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.runtime.vm import VirtualMachine
-from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
-
-
-def allclose_with_rounding(a, b):
-    # Find number of mismatches in inputs.
-    mismatch = a != b
-    # Allow some rounding errors due to GPU fp32 arithmetic.
-    assert np.sum(mismatch) <= 3
-
-
-def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data):
-    shape = in_data.shape
-    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
-    output_zero_point = relay.const(quant_args["out_zero_point"])
-    output_scale = relay.const(quant_args["out_scale"])
-    quantized_output = relay.qnn.quantize(
-        input_data,
-        output_scale=output_scale,
-        output_zero_point=output_zero_point,
-        axis=axis,
-        out_dtype=out_dtype,
-    )
-    mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output)
-    mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(mod, "llvm", params=None)
-    rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0))
-    rt_mod.set_input(input_data=in_data)
-    rt_mod.set_input(**params)
-    rt_mod.run()
-    res = rt_mod.get_output(0).numpy()
-    return res
-
-
-def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1):
-    sim_q = relay.qnn.simulated_quantize(
-        input_data,
-        scale,
-        zp,
-        axis=axis,
-        out_dtype=dtype,
-    )
-    mod = tvm.IRModule.from_expr(sim_q)
-    with tvm.transform.PassContext(opt_level=3):
-        vm_exec = relay.vm.compile(mod, "llvm", params=None)
-    vm = VirtualMachine(vm_exec, tvm.cpu(0))
-    return vm
-
-
-def verify_simulated_quantize_simple(dtype):
-    data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype("float32")
-    scale_np = np.float32(0.5)
-    zp_np = np.int32(127)
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype])
-    quant_args = {"out_zero_point": zp_np, "out_scale": scale_np}
-    q_out = quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype=dtype,
-        in_data=data,
-    )
-    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
-    scale = relay.var("scale", shape=[])
-    zp = relay.var("zp", shape=[], dtype="int32")
-    dtype = relay.var("dtype", shape=[], dtype="int32")
-    vm = build_simulated_quantize(input_data, scale, zp, dtype)
-    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    allclose_with_rounding(sim_q_out.numpy(), q_out)
-
-
-def test_simulated_quantize():
-    verify_simulated_quantize_simple("uint8")
-    verify_simulated_quantize_simple("int8")
-    verify_simulated_quantize_simple("int32")
-
-
-def test_dynamic_channels():
-    # Compile simulated quantize once but support either per-channel or scalar params.
-    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32")
-    # Test scalar qnn params.
-    scale_np = np.asarray([0.5]).astype("float32")
-    zp_np = np.asarray([127]).astype("int32")
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
-    quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]}
-    q_out = quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=0,
-        out_dtype="uint8",
-        in_data=data,
-    )
-    # Create variables with undefined shape and run with scalar inputs.
-    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
-    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
-    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
-    dtype = relay.var("dtype", shape=[], dtype="int32")
-    vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0)
-    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    allclose_with_rounding(sim_q_out.numpy(), q_out)
-
-    # Now get the perchannel quantize output and compare without recompiling.
-    scale_np = np.array([0.5, 0.25]).astype("float32")
-    zp_np = np.array([127, 123]).astype("int32")
-
-    # Get the reference quantize output.
-    quant_args = {"out_zero_point": zp_np, "out_scale": scale_np}
-    q_out = quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=0,
-        out_dtype="uint8",
-        in_data=data,
-    )
-    # Run the simulated quantize without recompiling and confirm results match.
-    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    allclose_with_rounding(sim_q_out.numpy(), q_out)
-
-
-def test_dynamic_dtype():
-    # Compile simulated quantize once but support any type of quantization.
-    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32")
-    # Test scalar float32 to uint8.
-    scale_np = np.asarray([0.5]).astype("float32")
-    zp_np = np.asarray([127]).astype("int32")
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
-    quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]}
-    q_out = quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="uint8",
-        in_data=data,
-    )
-    # Create variables with undefined shape and run with scalar inputs.
-    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
-    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
-    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
-    dtype = relay.var("dtype", shape=[], dtype="int32")
-    vm = build_simulated_quantize(input_data, scale, zp, dtype)
-    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    allclose_with_rounding(sim_q_out.numpy(), q_out)
-
-    # Now test float32 to int32 compilation.
-    # Get the reference quantize output.
-    q_out = quantize_test_driver(
-        in_dtype="float32",
-        quant_args=quant_args,
-        axis=-1,
-        out_dtype="int32",
-        in_data=data,
-    )
-    # Run the simulated quantize without recompiling and confirm results match.
-    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"])
-    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    allclose_with_rounding(sim_q_out.numpy(), q_out)
-
-
-if __name__ == "__main__":
-    test_simulated_quantize()
-    test_dynamic_channels()
-    test_dynamic_dtype()
diff --git a/tests/python/relay/test_op_qnn_subtract.py b/tests/python/relay/test_op_qnn_subtract.py
deleted file mode 100644
index 16f26e77e768..000000000000
--- a/tests/python/relay/test_op_qnn_subtract.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import numpy as np
-from tvm import relay
-
-
-def qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp, data_dtype="uint8"):
-    # all x, y and golden outputs should be of the same length
-    assert len(x_datas) == len(y_datas)
-    assert len(y_datas) == len(golden_outputs)
-
-    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
-    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
-    lhs_scale = relay.const(scale_and_zp["lhs_scale"], "float32")
-    lhs_zp = relay.const(scale_and_zp["lhs_zp"], "int32")
-    rhs_scale = relay.const(scale_and_zp["rhs_scale"], "float32")
-    rhs_zp = relay.const(scale_and_zp["rhs_zp"], "int32")
-    output_scale = relay.const(scale_and_zp["output_scale"], "float32")
-    output_zp = relay.const(scale_and_zp["output_zp"], "int32")
-    z = relay.qnn.subtract(
-        lhs=x,
-        rhs=y,
-        lhs_scale=lhs_scale,
-        lhs_zero_point=lhs_zp,
-        rhs_scale=rhs_scale,
-        rhs_zero_point=rhs_zp,
-        output_scale=output_scale,
-        output_zero_point=output_zp,
-    )
-    func = relay.Function([x, y], z)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-    for i in range(0, len(x_datas)):
-        x_data = x_datas[i]
-        y_data = y_datas[i]
-        golden_output = golden_outputs[i]
-        op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(
-            x_data, y_data
-        )
-        np.testing.assert_equal(op_res.numpy(), golden_output)
-
-
-def test_tflite_same_io_qnn_params():
-    scale_and_zp = {
-        "lhs_scale": 0.00784314,
-        "lhs_zp": 127,
-        "rhs_scale": 0.00784314,
-        "rhs_zp": 127,
-        "output_scale": 0.00784314,
-        "output_zp": 127,
-    }
-    x_datas = [
-        np.array((140, 153, 165, 178)).reshape((1, 4)),
-        np.array((25, 153, 178, 216)).reshape((1, 4)),
-        np.array((25, 153, 216, 165)).reshape((1, 4)),
-    ]
-    y_datas = [
-        np.array((204, 178, 165, 140)).reshape((1, 4)),
-        np.array((204, 178, 191, 25)).reshape((1, 4)),
-        np.array((204, 178, 25, 191)).reshape((1, 4)),
-    ]
-    golden_outputs = [
-        np.array((63, 102, 127, 165)).reshape((1, 4)),
-        np.array((0, 102, 114, 255)).reshape((1, 4)),
-        np.array((0, 102, 255, 101)).reshape((1, 4)),
-    ]
-    qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp)
-
-
-def test_tflite_different_io_qnn_params():
-    scale_and_zp = {
-        "lhs_scale": 0.0156863,
-        "lhs_zp": 127,
-        "rhs_scale": 0.0117647,
-        "rhs_zp": 85,
-        "output_scale": 0.0235294,
-        "output_zp": 128,
-    }
-    x_datas = [
-        np.array((76, 140, 153, 172)).reshape((1, 4)),
-        np.array((133, 140, 146, 153)).reshape((1, 4)),
-        np.array((76, 140, 172, 146)).reshape((1, 4)),
-    ]
-    y_datas = [
-        np.array((136, 119, 128, 17)).reshape((1, 4)),
-        np.array((136, 119, 111, 94)).reshape((1, 4)),
-        np.array((136, 119, 17, 128)).reshape((1, 4)),
-    ]
-    golden_outputs = [
-        np.array((68, 120, 123, 192)).reshape((1, 4)),
-        np.array((106, 120, 128, 140)).reshape((1, 4)),
-        np.array((68, 120, 192, 119)).reshape((1, 4)),
-    ]
-    qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp)
-
-
-def test_saturation():
-    # Same params
-    scale_and_zp = {
-        "lhs_scale": 0.125,
-        "lhs_zp": 0,
-        "rhs_scale": 0.125,
-        "rhs_zp": 0,
-        "output_scale": 0.125,
-        "output_zp": 0,
-    }
-    x_data = [np.array((255, 1, 1, 0)).reshape((1, 4))]
-    y_data = [np.array((255, 255, 128, 0)).reshape((1, 4))]
-    golden_output = [np.array((0, 0, 0, 0)).reshape((1, 4))]
-    qnn_subtract_driver(x_data, y_data, golden_output, scale_and_zp)
-
-    # Same params, different scale
-    scale_and_zp = {
-        "lhs_scale": 0.125,
-        "lhs_zp": 0,
-        "rhs_scale": 0.125,
-        "rhs_zp": 0,
-        "output_scale": 0.25,
-        "output_zp": 0,
-    }
-    x_data = [np.array((255, 1, 200, 0)).reshape((1, 4))]
-    y_data = [np.array((255, 255, 127, 0)).reshape((1, 4))]
-    golden_output = [np.array((0, 0, 36, 0)).reshape((1, 4))]
-    qnn_subtract_driver(x_data, y_data, golden_output, scale_and_zp)
-
-    # All params different
-    scale_and_zp = {
-        "lhs_scale": 0.5,
-        "lhs_zp": 0,
-        "rhs_scale": 0.25,
-        "rhs_zp": 0,
-        "output_scale": 0.125,
-        "output_zp": 0,
-    }
-    x_data = [np.array((255, 0, 1, 0)).reshape((1, 4))]
-    y_data = [np.array((0, 128, 64, 0)).reshape((1, 4))]
-    golden_output = [np.array((255, 0, 0, 0)).reshape((1, 4))]
-    qnn_subtract_driver(x_data, y_data, golden_output, scale_and_zp)
-
-
-if __name__ == "__main__":
-    test_tflite_same_io_qnn_params()
-    test_tflite_different_io_qnn_params()
-    test_saturation()
diff --git a/tests/python/relay/test_op_qnn_unary_elementwise.py b/tests/python/relay/test_op_qnn_unary_elementwise.py
deleted file mode 100644
index 01a7374a0b3d..000000000000
--- a/tests/python/relay/test_op_qnn_unary_elementwise.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import Callable, List
-
-import numpy as np
-import pytest
-import scipy.special
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.qnn.op.legalizations import hardswish_func
-
-
-def dequantize(data, scale, zp):
-    return scale * (np.asarray(data) - zp)
-
-
-def generate_golden_output(
-    floating_point_golden_func, dequantized_x, output_scale, output_zero_point, dtype
-):
-    output = floating_point_golden_func(dequantized_x)
-    output = np.around(output / output_scale + output_zero_point)
-
-    np_dtype = {"int8": np.int8, "uint8": np.uint8}[dtype]
-
-    q_min = np.iinfo(np_dtype).min
-    q_max = np.iinfo(np_dtype).max
-    return np.clip(output, q_min, q_max)
-
-
-def run_qnn_func(func: relay.Function, args: List[relay.Expr]):
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.qnn.transform.Legalize()(mod)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    func = mod["main"]
-
-    op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(*args)
-    return op_res.numpy()
-
-
-def create_qnn_func(
-    qnn_op: Callable[[relay.Expr, relay.Expr, relay.Expr, relay.Expr, relay.Expr], relay.Call],
-    x_data: np.ndarray,
-    input_scale: float,
-    input_zero_point: int,
-    output_scale: float,
-    output_zero_point: int,
-    input_dtype: str = "uint8",
-):
-    x = relay.var("x", shape=x_data.shape, dtype=input_dtype)
-    y = qnn_op(
-        x=x,
-        scale=relay.const(input_scale, "float32"),
-        zero_point=relay.const(input_zero_point, "int32"),
-        output_scale=relay.const(output_scale, "float32"),
-        output_zero_point=relay.const(output_zero_point, "int32"),
-    )
-    return relay.Function([x], y)
-
-
-def run_condition(
-    qnn_op: Callable[[relay.Expr, relay.Expr, relay.Expr, relay.Expr, relay.Expr], relay.Call],
-    floating_point_golden_func: Callable[[np.ndarray], np.ndarray],
-    x_data: np.ndarray,
-    input_scale: float,
-    input_zero_point: int,
-    output_scale: float,
-    output_zero_point: int,
-    input_dtype: str = "uint8",
-):
-    func = create_qnn_func(
-        qnn_op,
-        x_data,
-        input_scale=input_scale,
-        input_zero_point=input_zero_point,
-        output_scale=output_scale,
-        output_zero_point=output_zero_point,
-        input_dtype=input_dtype,
-    )
-
-    x_dequantized = dequantize(x_data, input_scale, input_zero_point)
-    golden_output = generate_golden_output(
-        floating_point_golden_func,
-        x_dequantized,
-        output_scale,
-        output_zero_point,
-        dtype=input_dtype,
-    )
-
-    op_res = run_qnn_func(func, [x_data])
-    np.testing.assert_equal(op_res, golden_output.astype(input_dtype))
-
-
-def generic_test(
-    qnn_op: Callable[[relay.Expr, relay.Expr, relay.Expr, relay.Expr, relay.Expr], relay.Call],
-    floating_point_golden_func: Callable[[np.ndarray], np.ndarray],
-    input_dtype: str = "uint8",
-    x_data: np.ndarray = np.arange(0, 256, dtype="uint8"),
-):
-    x_data = x_data.view(input_dtype)
-    return run_condition(
-        qnn_op,
-        floating_point_golden_func,
-        x_data,
-        input_scale=0.125,
-        input_zero_point=0,
-        output_scale=0.125,
-        output_zero_point=0,
-        input_dtype=input_dtype,
-    )
-
-
-class TestRSqrt:
-    def test_saturation(self):
-        # Same qparams in and out
-        x_data = np.array((255, 133, 0, 9)).reshape((1, 4))
-        run_condition(
-            relay.qnn.rsqrt,
-            lambda x: 1 / np.sqrt(x),
-            x_data,
-            input_scale=0.125,
-            input_zero_point=0,
-            output_scale=0.125,
-            output_zero_point=0,
-            input_dtype="uint8",
-        )
-
-        # Different scale
-        run_condition(
-            relay.qnn.rsqrt,
-            lambda x: 1 / np.sqrt(x),
-            x_data,
-            input_scale=0.125,
-            input_zero_point=0,
-            output_scale=0.25,
-            output_zero_point=0,
-            input_dtype="uint8",
-        )
-
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.rsqrt, lambda x: 1 / np.sqrt(x), input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(
-            relay.qnn.rsqrt,
-            lambda x: 1 / np.sqrt(x),
-            input_dtype="int8",
-            x_data=np.arange(1, 128, dtype="int8"),
-        )
-
-
-class Sqrt:
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.sqrt, np.sqrt, input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(
-            relay.qnn.sqrt,
-            np.sqrt,
-            input_dtype="int8",
-            x_data=np.arange(1, 128, dtype="int8"),
-        )
-
-
-class TestExp:
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.exp, np.exp, input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(relay.qnn.exp, np.exp, input_dtype="int8")
-
-
-class TestTanh:
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.tanh, np.tanh, input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(relay.qnn.tanh, np.tanh, input_dtype="int8")
-
-
-class TestErf:
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.erf, scipy.special.erf, input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(relay.qnn.erf, scipy.special.erf, input_dtype="int8")
-
-
-class TestSigmoid:
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.sigmoid, lambda x: 1 / (1 + np.exp(-x)), input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(relay.qnn.sigmoid, lambda x: 1 / (1 + np.exp(-x)), input_dtype="int8")
-
-
-class TestHardswish:
-    def test_all_numbers_uint8(self):
-        generic_test(relay.qnn.hardswish, hardswish_func, input_dtype="uint8")
-
-    def test_all_numbers_int8(self):
-        generic_test(relay.qnn.hardswish, hardswish_func, input_dtype="int8")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py
deleted file mode 100644
index 5471460c71f2..000000000000
--- a/tests/python/relay/test_param_dict.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import numpy as np
-import tvm
-from tvm import te, runtime
-import json
-import base64
-from tvm._ffi.base import py_str
-from tvm.relay.op import add
-from tvm import relay
-from tvm import rpc
-from tvm.contrib import utils, graph_executor
-
-
-def test_save_load():
-    x = np.ones((10, 2)).astype("float32")
-    y = np.ones((1, 2, 3)).astype("float32")
-    params = {"x": x, "y": y}
-    param_bytes = runtime.save_param_dict(params)
-    assert isinstance(param_bytes, bytearray)
-    param2 = relay.load_param_dict(param_bytes)
-    assert len(param2) == 2
-    np.testing.assert_equal(param2["x"].numpy(), x)
-    np.testing.assert_equal(param2["y"].numpy(), y)
-
-
-def test_ndarray_reflection():
-    # Make two `NDArrayWrapper`s that point to the same underlying array.
-    np_array = np.random.uniform(size=(10, 2)).astype("float32")
-    tvm_array = tvm.nd.array(np_array)
-    param_dict = {"x": tvm_array, "y": tvm_array}
-    assert param_dict["x"].same_as(param_dict["y"])
-    # Serialize then deserialize `param_dict`.
-    deser_param_dict = relay.load_param_dict(runtime.save_param_dict(param_dict))
-    # Make sure the data matches the original data and `x` and `y` contain the same data.
-    np.testing.assert_equal(deser_param_dict["x"].numpy(), tvm_array.numpy())
-    # Make sure `x` and `y` contain the same data.
-    np.testing.assert_equal(deser_param_dict["x"].numpy(), deser_param_dict["y"].numpy())
-
-
-def test_bigendian_rpc_param():
-    """Test big endian rpc when there is a PowerPC RPC server available"""
-    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
-    port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090)
-    if host is None:
-        return
-
-    def verify_graph_executor(remote, target, shape, dtype):
-        x = relay.var("x")
-        y = relay.const(1)
-        z = relay.add(x, y)
-        func = relay.Function([x], z)
-
-        x_in = np.ones(shape).astype(dtype)
-        params = {"x": x_in}
-        graph, lib, params = relay.build(func, target=target, params=params)
-
-        temp = utils.tempdir()
-        path_dso = temp.relpath("dev_lib.o")
-        lib.save(path_dso)
-        remote.upload(path_dso)
-        lib = remote.load_module("dev_lib.o")
-        dev = remote.cpu(0)
-        mod = graph_executor.create(graph, lib, dev)
-        mod.load_params(runtime.save_param_dict(params))
-        mod.run()
-        out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, device=dev))
-        tvm.testing.assert_allclose(x_in + 1, out.numpy())
-
-    print("Test RPC connection to PowerPC...")
-    remote = rpc.connect(host, port)
-    target = "llvm -mtriple=powerpc-linux-gnu"
-    for dtype in ["float32", "float64", "int32", "int8"]:
-        verify_graph_executor(remote, target, (10,), dtype)
-
-
-if __name__ == "__main__":
-    test_save_load()
-    test_ndarray_reflection()
-    test_bigendian_rpc_param()
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
deleted file mode 100644
index 527848b143a2..000000000000
--- a/tests/python/relay/test_pass_alter_op_layout.py
+++ /dev/null
@@ -1,2085 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test alter op layout pass"""
-import platform
-import pytest
-
-import tvm
-from tvm import relay, topi
-from tvm.relay import transform, analysis
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-from tvm.relay.testing import run_infer_type
-from tvm.target.codegen import llvm_version_major
-import numpy as np
-import tvm.testing
-from tvm.relay import testing
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_alter_op():
-    """Test directly replacing an operator with a new one"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        weight = relay.multiply(weight, relay.const(2.0, "float32"))
-        return relay.nn.conv2d(data, weight, **attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            relay.multiply(weight, relay.const(2.0, "float32")),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_return_none():
-    """Test doing nothing by returning 'None'"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        y = relay.nn.global_max_pool2d(x)
-        y = relay.Function([x], y)
-        return y
-
-    called = [False]
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        called[0] = True
-        return None
-
-    with TempOpAttr("nn.global_max_pool2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(before(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-    assert called[0]
-
-
-def test_alter_layout():
-    """Test alternating the layout of a conv2d.
-    The layout of broadcast operators and the weight should be changed accordingly.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        bias = relay.var("bias")
-        weight = relay.var("weight")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.bias_add(y, bias)
-        # a useless tuple, which will be eliminated
-        y = relay.Tuple([y])[0]
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2))
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        new_attrs["kernel_layout"] = "OIHW16i"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        bias = relay.var("bias", shape=(64,))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        w = relay.layout_transform(weight, "OIHW", "OIHW16i")
-        y = relay.nn.conv2d(
-            y,
-            w,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            kernel_layout="OIHW16i",
-            data_layout="NCHW16c",
-        )
-        b = relay.expand_dims(bias, axis=1, num_newaxis=2)
-        b = relay.expand_dims(b, axis=0, num_newaxis=1)
-        b = relay.layout_transform(b, "NCHW", "NCHW16c")
-        y = relay.add(y, b)
-
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NCHW16c")
-        y = relay.cast(y, "int32")
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_multi():
-    """Test alternating the layout of a conv2d.
-    The layout of broadcast operators and the weight should be changed accordingly.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight")
-        y = relay.nn.conv2d(x, weight, channels=128, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        new_attrs["kernel_layout"] = "OHWI16i64o2i"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(128, 64, 3, 3))
-
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        w = relay.layout_transform(weight, "OIHW", "OHWI16i64o2i")
-        y = relay.nn.conv2d(
-            y,
-            w,
-            channels=128,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            kernel_layout="OHWI16i64o2i",
-            data_layout="NCHW16c",
-        )
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_lrn():
-    """Test alternating the layout of a conv2d.
-    The layout of broadcast operators and the weight should be changed accordingly.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        bias = relay.var("bias")
-        weight = relay.var("weight")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2))
-        y = relay.nn.lrn(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        new_attrs["kernel_layout"] = "OIHW16i"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        bias = relay.var("bias", shape=(64,))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        w = relay.layout_transform(weight, "OIHW", "OIHW16i")
-        y = relay.nn.conv2d(
-            y,
-            w,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            kernel_layout="OIHW16i",
-            data_layout="NCHW16c",
-        )
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NCHW16c")
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.nn.lrn(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_dual_path():
-    """
-    Test alternating the layout with two outputs.
-    One path continues to use the new layout while one path fall backs to old layout.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y1 = relay.nn.conv2d(y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.batch_flatten(y)
-        ret = relay.Tuple([y1, y2])
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.nn.relu(y)
-        y1 = relay.nn.conv2d(
-            y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y1 = relay.nn.relu(y1)
-        y1 = relay.layout_transform(y1, "NCHW16c", "NCHW")
-        y2 = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y2 = relay.nn.batch_flatten(y2)
-        ret = relay.Tuple([y1, y2])
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_resnet():
-    """Test alternating the layout of a residual block
-    This also tests the elimination of duplicated transformation.
-    If a same transformation applies to a same node twice, only one transformation will be created.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y2 = relay.nn.conv2d(x, weight2, channels=32, kernel_size=(1, 1))
-        y2 = relay.nn.relu(y2)
-        y = y + y2
-        y = relay.nn.global_max_pool2d(y)
-        return relay.Function(analysis.free_vars(y), y)
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        x = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.nn.relu(y)
-        y2 = relay.nn.conv2d(x, weight2, channels=32, kernel_size=(1, 1), data_layout="NCHW16c")
-        y2 = relay.nn.relu(y2)
-        y = y + y2
-        y = relay.nn.global_max_pool2d(y, layout="NCHW16c")
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        return relay.Function(analysis.free_vars(y), y)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_broadcast_op():
-    """Test boradcast operators"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        bias = relay.var("bias", shape=(64,))
-        scale = relay.var("scale", shape=(64, 1, 1))
-        weight = relay.var("weight")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.bias_add(y, bias)  # test broadcasting to lhs
-        y = relay.multiply(scale, y)  # test broadcasting to rhs
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        bias = relay.var("bias", shape=(64,))
-        scale = relay.var("scale", shape=(64, 1, 1))
-        weight = relay.var("weight")
-        x = relay.layout_transform(x, "NCHW", "NCHW16c")
-        bias = relay.expand_dims(bias, 1, 2)
-        bias = relay.expand_dims(bias, 0, 1)
-        bias = relay.layout_transform(bias, "NCHW", "NCHW16c")
-        scale = relay.expand_dims(scale, 0, 1)
-        scale = relay.layout_transform(scale, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.add(y, bias)  # test broadcasting to lhs
-        y = relay.multiply(scale, y)  # test broadcasting to rhs
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_broadcast_scalar_op():
-    """Test alternating the layout of a conv2d.
-    The layout of broadcast operators and the weight should be changed accordingly.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 500, 500, 64))
-        kernel = relay.var("kernel", shape=(3, 3, 64, 64), dtype="float32")
-        bias = relay.var("bias", shape=(64,))
-        multiplier1 = relay.var("multiplier1", shape=(1,), dtype="float32")
-        multiplier2 = relay.var("multiplier2", shape=(1, 1), dtype="float32")
-
-        y = relay.nn.conv2d(x, kernel, data_layout="NHWC", kernel_layout="HWIO", kernel_size=(3, 3))
-        y = relay.add(bias, y)
-        y = relay.nn.relu(y)
-
-        y = relay.multiply(multiplier1, y)
-        y = relay.multiply(y, multiplier2)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 500, 500, 64))
-        kernel = relay.var("kernel", shape=(3, 3, 64, 64), dtype="float32")
-        bias = relay.var("bias", shape=(64,))
-        multiplier1 = relay.var("multiplier1", shape=(1,), dtype="float32")
-        multiplier2 = relay.var("multiplier2", shape=(1, 1), dtype="float32")
-
-        b = relay.expand_dims(bias, axis=0, num_newaxis=3)
-        b = relay.layout_transform(b, "NHWC", "NCHW16c")
-
-        y = relay.layout_transform(x, "NHWC", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, kernel, data_layout="NCHW16c", kernel_layout="HWIO", kernel_size=(3, 3)
-        )
-
-        y = relay.add(b, y)
-        y = relay.nn.relu(y)
-
-        y = relay.multiply(multiplier1, y)
-        y = relay.multiply(y, multiplier2)
-        y = relay.layout_transform(y, "NCHW16c", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_scalar():
-    """Test alternating the layout of a conv2d.
-    The layout of broadcast operators and the weight should be changed accordingly.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.add(y, relay.const(1, "float32"))
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        w = relay.var("weight")
-
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, w, channels=64, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.add(y, relay.const(1.0, "float32"))
-
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_scalar_regression():
-    """regression test where scalar fails"""
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 16))
-        bias = relay.var("bias", shape=(1, 1, 1, 16))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=16,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.add(y, bias)
-        mean = relay.mean(y, axis=3, exclude=True)
-        var = relay.variance(y, axis=3, exclude=True)
-        gamma = relay.var("gamma")
-        beta = relay.var("beta")
-        y = relay.nn.batch_norm(y, gamma, beta, mean, var, axis=3)
-        y = y[0]
-        return relay.Function(analysis.free_vars(y), y)
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 16))
-        bias = relay.var("bias", shape=(1, 1, 1, 16))
-        x = relay.layout_transform(x, src_layout="NHWC", dst_layout="NCHW")
-        x = relay.layout_transform(x, src_layout="NCHW", dst_layout="NCHW16c")
-        weight = relay.layout_transform(weight, src_layout="HWIO", dst_layout="OIHW")
-        y = relay.nn.conv2d(
-            x, weight, channels=16, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        bias = relay.layout_transform(bias, src_layout="NHWC", dst_layout="NCHW")
-        bias = relay.layout_transform(bias, src_layout="NCHW", dst_layout="NCHW16c")
-        add = relay.add(y, bias)
-        mean = relay.mean(add, axis=[1, 4], exclude=True)
-        var = relay.variance(add, axis=[1, 4], exclude=True)
-        denom = relay.const(1.0) / relay.sqrt(var + relay.const(1e-05))
-        gamma = relay.var("gamma", shape=(16,))
-        denom_c16c = denom * relay.layout_transform(gamma, src_layout="C", dst_layout="C16c")
-        denom = relay.layout_transform(denom_c16c, src_layout="C16c", dst_layout="C")
-        denom_expand1 = relay.expand_dims(denom, axis=1, num_newaxis=2)
-        denom_expand2 = relay.expand_dims(denom_expand1, axis=0)
-        denom_nchwc16 = relay.layout_transform(
-            denom_expand2, src_layout="NCHW", dst_layout="NCHW16c"
-        )
-        out = add * denom_nchwc16
-        beta = relay.var("beta", shape=(16,))
-        numerator_c16c = (-mean) * denom_c16c + relay.layout_transform(
-            beta, src_layout="C", dst_layout="C16c"
-        )
-        numerator = relay.layout_transform(numerator_c16c, src_layout="C16c", dst_layout="C")
-        numerator_expand1 = relay.expand_dims(numerator, axis=1, num_newaxis=2)
-        numerator_expand2 = relay.expand_dims(numerator_expand1, axis=0)
-        numerator_nchwc16 = relay.layout_transform(
-            numerator_expand2, src_layout="NCHW", dst_layout="NCHW16c"
-        )
-        out = out + numerator_nchwc16
-        out = relay.layout_transform(out, src_layout="NCHW16c", dst_layout="NCHW")
-        y = relay.layout_transform(out, src_layout="NCHW", dst_layout="NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        desired_layouts = {"nn.conv2d": ["NCHW", "default"], "nn.batch_norm": ["NHWC", "default"]}
-        a = run_opt_pass(
-            a,
-            [
-                transform.InferType(),
-                relay.transform.ConvertLayout(desired_layouts),
-                transform.SimplifyInference(),
-                transform.CanonicalizeOps(),
-                transform.AlterOpLayout(),
-            ],
-        )
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_concatenate():
-    """NCHW, NHWC and corner case concatenate layout transform."""
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    # NCHW layout transformation.
-    def before_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y1 = relay.nn.conv2d(y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        ret = relay.concatenate([y, y1], axis=1)
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y1 = relay.nn.conv2d(
-            y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.concatenate([y, y1], axis=1)
-        ret = relay.layout_transform(ret, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nchw()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nchw(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-    # NHWC layout transformation.
-    def before_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.nn.conv2d(
-            x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC"
-        )
-        y1 = relay.nn.conv2d(
-            y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC"
-        )
-        ret = relay.concatenate([y, y1], axis=3)
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        weight2 = relay.var("weight2")
-        y = relay.layout_transform(x, "NHWC", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y1 = relay.nn.conv2d(
-            y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.concatenate([y, y1], axis=1)
-        ret = relay.layout_transform(ret, "NCHW16c", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nhwc()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nhwc(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_nchw_upsamping_op():
-    """Test upsamping operators"""
-
-    def before():
-        x = relay.var("x", shape=(1, 32, 28, 28))
-        weight = relay.var("weight", shape=(32, 32, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.upsampling(y, scale_h=2, scale_w=2)
-        y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2))
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 32, 28, 28))
-        weight = relay.var("weight")
-        x = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.nn.upsampling(y, scale_h=2, scale_w=2, layout="NCHW16c")
-        y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2), layout="NCHW16c")
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_nchw_dyn_upsamping_op():
-    """Test upsamping operators"""
-
-    def before():
-        x = relay.var("x", shape=(1, 32, 28, 28))
-        weight = relay.var("weight", shape=(32, 32, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.upsampling(y, scale_h=relay.const(2), scale_w=relay.const(2))
-        y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2))
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 32, 28, 28))
-        weight = relay.var("weight")
-        x = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.nn.upsampling(y, scale_h=relay.const(2), scale_w=relay.const(2), layout="NCHW16c")
-        y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2), layout="NCHW16c")
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_alter_layout_strided_slice(target, dev):
-    """Test rewriting strided_slice during alter_iop_layout"""
-
-    def before():
-        x = relay.var("x", shape=(1, 32, 28, 28))
-        weight = relay.var("weight", shape=(32, 32, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.strided_slice(y, begin=[0, 16], end=[1, 33], strides=[1, 1])
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW4c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 32, 28, 28))
-        weight = relay.var("weight", shape=(32, 32, 3, 3))
-        weight = relay.layout_transform(weight, "OIHW", "OIHW4i4o")
-        x = relay.layout_transform(x, "NCHW", "NCHW4c")
-        y = relay.op.nn.contrib_conv2d_nchwc(
-            x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW4c"
-        )
-
-        y = relay.strided_slice(y, begin=[0, 4], end=[1, 21], strides=[1, 1])
-
-        y = relay.layout_transform(y, "NCHW4c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        b = run_opt_pass(expected(), transform.InferType())
-
-    # Verify inference result
-    mod_before = tvm.IRModule()
-    mod_new = tvm.IRModule()
-    mod_before["main"] = a
-    mod_new["main"] = b
-    mod_before = transform.InferType()(mod_before)
-    mod_new = transform.InferType()(mod_new)
-    with relay.build_config(opt_level=3):
-        for kind in ["graph", "debug", "vm"]:
-            np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32")
-            np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32")
-            f_before = relay.create_executor(
-                kind, mod=mod_before, device=dev, target=target
-            ).evaluate()
-            result_before = f_before(np_data, np_weight)
-            f_new = relay.create_executor(kind, mod=mod_new, device=dev, target=target).evaluate()
-            result_new = f_new(np_data, np_weight)
-            tvm.testing.assert_allclose(
-                result_before.numpy(), result_new.numpy(), rtol=1e-5, atol=1e-5
-            )
-
-
-def test_alter_layout_strided_slice_axes_nhwc():
-    """Test rewriting strided_slice with axes during alter_iop_layout"""
-
-    def before():
-        x = relay.var("x", shape=(1, 28, 28, 32))
-        weight = relay.var("weight", shape=(3, 3, 32, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.strided_slice(y, begin=[0, 16], end=[1, 32], strides=[1, 1], axes=[0, 3])
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NHWC4c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 28, 28, 32))
-        weight = relay.var("weight", shape=(3, 3, 32, 32))
-        x = relay.layout_transform(x, "NHWC", "NHWC4c")
-        y = relay.op.nn.conv2d(
-            x,
-            weight,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC4c",
-            kernel_layout="HWIO",
-        )
-        y = relay.strided_slice(y, begin=[0, 4], end=[1, 8], strides=[1, 1], axes=[0, 3])
-        y = relay.layout_transform(y, "NHWC4c", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    mod_before = tvm.IRModule()
-    mod_new = tvm.IRModule()
-    mod_before["main"] = a
-    mod_new["main"] = b
-    tvm.ir.assert_structural_equal(mod_before, mod_new)
-
-
-def test_alter_layout_depthwise_conv2d():
-    """Test depthwise_conv2d operator"""
-
-    def before():
-        x = relay.var("x", shape=(1, 32, 56, 56))
-        w = relay.var("w", shape=(32, 1, 3, 3))
-        y = relay.nn.conv2d(x, w, padding=(1, 1), channels=32, kernel_size=(3, 3), groups=32)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    from tvm import topi
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        with tvm.target.Target("llvm -mtriple=x86_64-linux-gnu -mcpu=core-avx2"):
-            return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type)
-
-    def expected():
-        x = relay.var("x", shape=(1, 32, 56, 56))
-        w = relay.var("w", shape=(32, 1, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NCHW8c")
-        w = relay.layout_transform(w, "OIHW", "OIHW1i8o")
-        y = relay.nn.contrib_depthwise_conv2d_nchwc(
-            x,
-            w,
-            padding=(1, 1, 1, 1),
-            channels=32,
-            kernel_size=(3, 3),
-            groups=32,
-            data_layout="NCHW8c",
-            kernel_layout="OIHW1i8o",
-            out_layout="NCHW8c",
-        )
-        y = relay.layout_transform(y, "NCHW8c", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_prelu():
-    """Test PRelu operator"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight")
-        alpha = relay.var("alpha", relay.IncompleteType())
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.prelu(y, alpha)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        w = relay.var("weight")
-        alpha = relay.var("alpha", relay.IncompleteType())
-
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, w, channels=64, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        y = relay.layout_transform(y, "NCHW16c", "NCHW")
-        y = relay.nn.prelu(y, alpha)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, [transform.CanonicalizeOps(), transform.AlterOpLayout()])
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_pad():
-    """Check NCHW, NHWC and corner case for pad layout conversion"""
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    # Check NCHW conversion.
-    def before_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        ret = relay.nn.pad(y, pad_width=((0, 0), (0, 0), (1, 1), (1, 1)))
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.nn.pad(y, pad_width=((0, 0), (0, 0), (1, 1), (1, 1), (0, 0)))
-        ret = relay.layout_transform(ret, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nchw()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nchw(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Check NHWC conversion.
-    def before_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(
-            x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC"
-        )
-        ret = relay.nn.pad(y, pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NHWC", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.nn.pad(y, pad_width=((0, 0), (0, 0), (1, 1), (1, 1), (0, 0)))
-        ret = relay.layout_transform(ret, "NCHW16c", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nhwc()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nhwc(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Check that conversion does not happen when padding along split axis.
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        ret = relay.nn.pad(y, pad_width=((0, 0), (1, 1), (1, 1), (1, 1)))
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.layout_transform(y, "NCHW16c", "NCHW")
-        ret = relay.nn.pad(ret, pad_width=((0, 0), (1, 1), (1, 1), (1, 1)))
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_pool():
-    """Check NCHW, NHWC pool layout conversion"""
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    # Check NCHW conversion.
-    def before_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        ret = relay.nn.avg_pool2d(y, pool_size=(1, 1))
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.nn.avg_pool2d(y, pool_size=(1, 1), layout="NCHW16c")
-        ret = relay.layout_transform(ret, "NCHW16c", "NCHW")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nchw()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nchw(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Check NHWC conversion.
-    def before_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(
-            x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC"
-        )
-        ret = relay.nn.avg_pool2d(y, pool_size=(1, 1), layout="NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NHWC", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.nn.avg_pool2d(y, pool_size=(1, 1), layout="NCHW16c")
-        ret = relay.layout_transform(ret, "NCHW16c", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nhwc()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nhwc(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_sum():
-    """Check NCHW, NHWC sum layout conversion"""
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    # Check NCHW conversion.
-    def before_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        ret = relay.sum(y, axis=1, keepdims=True)
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NCHW", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.sum(y, axis=[1, 4], keepdims=True)
-        ret = relay.layout_transform(ret, "NCHW1c", "NCHW")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nchw()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nchw(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Check NHWC conversion.
-    def before_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        y = relay.nn.conv2d(
-            x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC"
-        )
-        ret = relay.sum(y, axis=3, keepdims=True)
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1")
-        y = relay.layout_transform(x, "NHWC", "NCHW16c")
-        y = relay.nn.conv2d(
-            y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
-        )
-        ret = relay.sum(y, axis=[1, 4], keepdims=True)
-        ret = relay.layout_transform(ret, "NCHW1c", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nhwc()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nhwc(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_nhwc_arm():
-    """Check that AlterOplayout does not alter NHWC data layout."""
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        from tvm import topi
-
-        with tvm.target.Target("llvm -mtriple=arm-linux-gnu -device=arm_cpu"):
-            return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type)
-
-    # Check NHWC conversion.
-    def before_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64))
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x, weight1, channels=64, kernel_size=(3, 3), data_layout="NHWC", kernel_layout="HWIO"
-        )
-        y = relay.nn.relu(y)
-        y = relay.nn.avg_pool2d(y, pool_size=(1, 1), layout="NHWC")
-        y = relay.nn.conv2d(
-            y, weight2, channels=64, kernel_size=(3, 3), data_layout="NHWC", kernel_layout="HWIO"
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected_nhwc():
-        return before_nhwc()
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nhwc()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nhwc(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_layout_nhwc_int8_aarch64():
-    """Check that AlterOplayout does not alter NHWC data layout."""
-    from tvm import autotvm
-
-    expected_workload_shape = (20, 44, 4, 16)
-
-    # We use Int8Fallback  to disable the fallback flag
-    # and to test the new workload produced during the pass
-    class Int8Fallback(autotvm.FallbackContext):
-        def _query_inside(self, target, workload):
-            key = (target, workload)
-            if key in self.memory:
-                return self.memory[key]
-            cfg = autotvm.task.space.FallbackConfigEntity()
-            cfg.is_fallback = False
-            cfg.cost = 0
-            self.memory[key] = cfg
-            return cfg
-
-        def update(self, target, workload, cfg):
-            key = (str(target), workload)
-            assert workload[2][1] == expected_workload_shape
-            assert workload[0] == "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu"
-            self.memory[key] = cfg
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        from tvm import topi
-
-        with tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"):
-            with Int8Fallback():
-                tmp = topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type)
-                return tmp
-
-    # Check NHWC conversion.
-    def before_nhwc_int8():
-        x = relay.var("x", shape=(1, 56, 56, 73), dtype="int8")
-        weight = relay.var("weight1", shape=(3, 3, 73, 79), dtype="int8")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=79,
-            kernel_size=(3, 3),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected_nhwc_int8():
-        x = relay.var("x", shape=(1, 56, 56, 73), dtype="int8")
-        weight = relay.var("weight1", shape=(3, 3, 73, 79), dtype="int8")
-        tile_rows = 4
-        tile_cols = 16
-        weight_transformed = relay.nn.contrib_conv2d_gemm_weight_transform(
-            weight, tile_rows, tile_cols
-        )
-        y = relay.nn.contrib_conv2d_gemm_without_weight_transform(
-            x,
-            weight_transformed,
-            channels=79,
-            kernel_size=(3, 3),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before_nhwc_int8()
-        a = run_opt_pass(a, transform.AlterOpLayout())
-        b = run_opt_pass(expected_nhwc_int8(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_op_with_global_var():
-    """Test directly replacing an operator with a new one"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        mod = tvm.IRModule()
-        foo = relay.GlobalVar("foo")
-        mod[foo] = relay.Function([x, weight], y)
-        mod = transform.InferType()(mod)
-        mod["main"] = relay.Function([x, weight], foo(x, weight))
-        mod = transform.InferType()(mod)
-        return mod
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        weight = relay.multiply(weight, relay.const(2.0, "float32"))
-        return relay.nn.conv2d(data, weight, **attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            relay.multiply(weight, relay.const(2.0, "float32")),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.nn.relu(y)
-        mod = tvm.IRModule()
-        foo = relay.GlobalVar("foo")
-        mod[foo] = relay.Function([x, weight], y)
-        mod = transform.InferType()(mod)
-        mod["main"] = relay.Function([x, weight], foo(x, weight))
-        return mod
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = before()
-        a = transform.AlterOpLayout()(a)
-        b = transform.InferType()(expected())
-
-    tvm.ir.assert_structural_equal(a, b, map_free_vars=True)
-
-
-def test_alter_op_dense():
-    def before():
-        x = relay.var("x", shape=(32, 1, 128))
-        weight = relay.var("weight", shape=(48, 64))
-        avg1d = relay.nn.adaptive_avg_pool1d(x, [64])
-        squeeze = relay.squeeze(avg1d, axis=[1])
-        y = relay.nn.dense(squeeze, weight)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(32, 1, 128))
-        weight = relay.var("weight", shape=(48, 64))
-        target_layout = "NC16n"
-        weight_transform = relay.layout_transform(weight, "NC", target_layout)
-        avg1d = relay.nn.adaptive_avg_pool1d(x, [64])
-        squeeze = relay.squeeze(avg1d, axis=[1])
-        y = relay.nn.contrib_dense_pack(
-            squeeze, weight_transform, target_layout, units=None, out_dtype="float32"
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    target = "llvm -mtriple=x86_64-linux-gnu -mcpu=core-avx2"
-    with tvm.target.Target(target):
-        with TempOpAttr(
-            "nn.dense", "FTVMAlterOpLayout", topi.x86.dense_alter_op._alter_dense_layout
-        ):
-            a = before()
-            a = run_opt_pass(a, transform.AlterOpLayout())
-            b = run_opt_pass(expected(), transform.InferType())
-            tvm.ir.assert_structural_equal(a, b)
-
-
-def test_not_inplace_modify():
-    def func():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=[2, 2], strides=[2, 2], padding=[0, 0, 0, 0])
-        y = relay.Function([x, weight], y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW16c"
-        new_attrs["kernel_layout"] = "OIHW16i"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        before = func()
-        run_opt_pass(before, [transform.AlterOpLayout()])
-        assert before.body.attrs.layout == "NCHW"
-
-
-def test_alter_op_dense_packed_data():
-    def before():
-        x = relay.var("x", shape=(1, 32, 8, 8))
-        weight = relay.var("conv2d_weight", shape=(32, 32, 3, 3))
-        conv = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        pool = relay.nn.avg_pool2d(conv, pool_size=[8, 8], padding=[0, 0, 0, 0])
-        squeeze = relay.squeeze(pool, axis=[2, 3])
-        dense = relay.nn.dense(squeeze, relay.var("dense_weight", shape=(16, 32)))
-        return relay.Function(analysis.free_vars(dense), dense)
-
-    def expected():
-        x = relay.var("x", shape=(1, 32, 8, 8))
-        conv_weight = relay.var("conv2d_weight", shape=(32, 32, 3, 3))
-        dense_weight = relay.var("dense_weight", shape=(16, 32))
-        conv = relay.nn.contrib_conv2d_nchwc(
-            relay.layout_transform(x, "NCHW", "NCHW8c"),
-            relay.layout_transform(conv_weight, "OIHW", "OIHW8i8o"),
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW8c",
-            kernel_layout="OIHW8i8o",
-            out_layout="NCHW8c",
-        )
-        pool = relay.nn.avg_pool2d(conv, pool_size=[8, 8], padding=[0, 0, 0, 0], layout="NCHW8c")
-        squeeze = relay.squeeze(pool, axis=[2, 3])
-        dense = relay.nn.contrib_dense_pack(
-            relay.layout_transform(squeeze, "NC8c", "NC"),
-            relay.layout_transform(dense_weight, "NC", "NC16n"),
-            "NC16n",
-            out_dtype="float32",
-        )
-        return relay.Function(analysis.free_vars(dense), dense)
-
-    with tvm.target.Target("llvm -mtriple=x86_64-linux-gnu -mcpu=core-avx2"):
-        with TempOpAttr(
-            "nn.dense", "FTVMAlterOpLayout", topi.x86.dense_alter_op._alter_dense_layout
-        ):
-            a = run_opt_pass(before(), transform.AlterOpLayout())
-            b = run_opt_pass(expected(), transform.InferType())
-            tvm.ir.assert_structural_equal(a, b)
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM"
-)
-def test_alter_op_dense_arm_cpu_sme_float32():
-    np.random.seed(0)
-    y_data = np.random.uniform(size=(64, 32)).astype("float32")
-
-    def before():
-        x = relay.var("x", shape=(32, 32), dtype="float32")
-        y = relay.const(y_data, dtype="float32")
-        dense = relay.nn.dense(x, y)
-        return relay.Function(analysis.free_vars(dense), dense)
-
-    def expected():
-        x = relay.var("x", shape=(32, 32), dtype="float32")
-        y = relay.transpose(relay.const(y_data, dtype="float32"))
-        matmul = relay.nn.matmul(x, y)
-        return relay.Function(analysis.free_vars(matmul), matmul)
-
-    with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme"):
-        with TempOpAttr("nn.dense", "FTVMAlterOpLayout", topi.arm_cpu.dense_alter_op._alter_dense):
-            a = run_opt_pass(before(), transform.AlterOpLayout())
-            b = run_opt_pass(expected(), transform.InferType())
-            tvm.ir.assert_structural_equal(a, b)
-
-
-def test_alter_op_dense_arm_cpu_neon():
-    np.random.seed(0)
-    y_data = np.random.uniform(size=(64, 32)).astype("float32")
-
-    def before():
-        x = relay.var("x", shape=(32, 32), dtype="float32")
-        y = relay.const(y_data, dtype="float32")
-        dense = relay.nn.dense(x, y)
-        return relay.Function(analysis.free_vars(dense), dense)
-
-    def expected():
-        x = relay.var("x", shape=(32, 32), dtype="float32")
-        y = relay.transpose(relay.const(y_data, dtype="float32"))
-        matmul = relay.nn.matmul(x, y)
-        return relay.Function(analysis.free_vars(matmul), matmul)
-
-    with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v8.6a,+neon"):
-        with TempOpAttr("nn.dense", "FTVMAlterOpLayout", topi.arm_cpu.dense_alter_op._alter_dense):
-            a = run_opt_pass(before(), transform.AlterOpLayout())
-            b = run_opt_pass(expected(), transform.InferType())
-            assert tvm.ir.structural_equal(a, b)
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM"
-)
-def test_alter_op_dense_arm_cpu_sme_float16_float32():
-    from tvm.relay.op.nn import _make  # pylint: disable-top-level-import
-
-    np.random.seed(0)
-    y_data = np.random.uniform(size=(64, 32)).astype("float16")
-
-    def before():
-        x = relay.var("x", shape=(32, 32), dtype="float16")
-        y = relay.const(y_data, dtype="float16")
-        dense = relay.nn.dense(x, y, out_dtype="float32")
-        return relay.Function(analysis.free_vars(dense), dense)
-
-    def expected():
-        x = relay.var("x", shape=(32, 32), dtype="float16")
-        y = relay.const(y_data, dtype="float16")
-        # Cannot make using the public API (relay.nn.matmul) since it will
-        # create an nn.dense op instead
-        matmul = _make.matmul(x, y, None, "float32", False, True)
-        return relay.Function(analysis.free_vars(matmul), matmul)
-
-    with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme"):
-        with TempOpAttr("nn.dense", "FTVMAlterOpLayout", topi.arm_cpu.dense_alter_op._alter_dense):
-            a = run_opt_pass(before(), transform.AlterOpLayout())
-            b = run_opt_pass(expected(), transform.InferType())
-            tvm.ir.assert_structural_equal(a, b)
-
-
-@pytest.mark.skipif(
-    llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM"
-)
-@pytest.mark.parametrize("transpose_b", [False, True])
-def test_alter_op_matmul_arm_cpu_sme(transpose_b):
-    np.random.seed(0)
-    y_data = np.random.uniform(size=(64, 32)).astype("float32")
-
-    def before():
-        x = relay.var("x", shape=(96, 32), dtype="float32")
-        y = relay.const(y_data, dtype="float32")
-        dense = relay.nn.matmul(x, y, transpose_a=False, transpose_b=transpose_b)
-        return relay.Function(analysis.free_vars(dense), dense)
-
-    def expected():
-        x = relay.var("x", shape=(96, 32), dtype="float32")
-        y = relay.const(y_data, dtype="float32")
-        if transpose_b:
-            y = relay.transpose(y)
-        matmul = relay.nn.matmul(x, y)
-        return relay.Function(analysis.free_vars(matmul), matmul)
-
-    with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme"):
-        with TempOpAttr("nn.dense", "FTVMAlterOpLayout", topi.arm_cpu.dense_alter_op._alter_dense):
-            a = run_opt_pass(before(), transform.AlterOpLayout())
-            b = run_opt_pass(expected(), transform.InferType())
-            tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv2d_strided_slice_packed_to_unpacked():
-    """We do not support propagating through packed to unpacked layout"""
-    x_shape = (1, 1, 1, 1, 4)
-    w_shape = (9, 1, 3, 3, 4, 4)
-
-    def before():
-        x = relay.var("x", shape=x_shape)
-        weight = relay.var("weight", shape=w_shape)
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW4c",
-            kernel_layout="OIHW4i4o",
-        )
-        y = relay.strided_slice(y, begin=[0, 0], end=[1, -1], strides=[1, 8])
-        return relay.Function([x, weight], y)
-
-    def expected():
-        x = relay.var("x", shape=x_shape)
-        weight = relay.var("weight", shape=w_shape)
-        x_nchw = relay.layout_transform(x, src_layout="NCHW4c", dst_layout="NCHW")
-        weight_oihw = relay.layout_transform(weight, src_layout="OIHW4i4o", dst_layout="OIHW")
-        y = relay.nn.conv2d(
-            x_nchw,
-            weight_oihw,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.layout_transform(y, src_layout="NCHW", dst_layout="NCHW4c")
-        y = relay.strided_slice(y, begin=[0, 0], end=[1, -1], strides=[1, 8])
-        return relay.Function([x, weight], y)
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW"
-        new_attrs["kernel_layout"] = "OIHW"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv2d_strided_slice_arbitrary_stride():
-    """Test rewriting strided_slice with arbitrary stride"""
-
-    def before():
-        x = relay.var("x", shape=(4, 12, 1, 1))
-        weight = relay.var("weight", shape=(9, 12, 1, 1))
-        y = relay.nn.conv2d(x, weight, channels=9, kernel_size=(1, 1), padding=(0, 0))
-        y = relay.strided_slice(y, begin=[3], end=[6], strides=[3], axes=[1])
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW3c"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        run_opt_pass(before(), transform.AlterOpLayout())
-
-
-def test_conv2d_reduce_channels():
-    x = relay.var("data", shape=(1, 8, 48, 48))
-    y = relay.nn.conv2d(
-        data=x,
-        weight=relay.var("weight"),
-        kernel_size=(1, 1),
-        channels=8,
-        dilation=1,
-        strides=(47, 47),
-    )
-    z = relay.argmin(y, axis=1)
-
-    mod, params = testing.create_workload(z)
-
-    with tvm.transform.PassContext(opt_level=3):
-        relay.build(mod, params=params, target="llvm")
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Layout NCHW4c unsupported in `arm_cpu`. See https://github.com/apache/tvm/issues/16537",
-)
-def test_alter_layout_nonscalar_broadcast():
-    """Test boradcast operators"""
-
-    def before():
-        x = relay.var("x", shape=(1, 16, 3, 3))
-        weight = relay.var("weight", shape=(16, 16, 1, 1))
-        y = relay.nn.conv2d(
-            x, weight, channels=16, kernel_size=(1, 1), padding=(0, 0), data_layout="NCHW"
-        )
-        z = relay.var("z", shape=(1, 3, 3))
-        y = y + z
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 16, 3, 3))
-        weight = relay.var("weight", shape=(16, 16, 1, 1))
-        x = relay.layout_transform(x, src_layout="NCHW", dst_layout="NCHW4c")
-        weight = relay.layout_transform(weight, src_layout="OIHW", dst_layout="OIHW4i4o")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=16,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            data_layout="NCHW4c",
-            kernel_layout="OIHW4i4o",
-        )
-        z = relay.var("z", shape=(1, 3, 3))
-        z = relay.expand_dims(z, 0)
-        z = relay.layout_transform(z, src_layout="NCHW", dst_layout="NCHW1c")
-        y = y + z
-        y = relay.layout_transform(y, src_layout="NCHW4c", dst_layout="NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW4c"
-        new_attrs["kernel_layout"] = "OIHW4i4o"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    inp = np.random.uniform(size=(1, 16, 3, 3)).astype(np.float32)
-    weight = np.random.uniform(size=(16, 16, 1, 1)).astype(np.float32)
-    z = np.random.uniform(size=(1, 3, 3)).astype(np.float32)
-    mod = tvm.IRModule.from_expr(before())
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        with tvm.transform.PassContext(opt_level=4):
-            res = relay.build_module.create_executor(
-                "graph", mod, target="llvm", device=tvm.cpu()
-            ).evaluate()(inp, weight, z)
-    with tvm.transform.PassContext(opt_level=0):
-        res1 = relay.build_module.create_executor(
-            "debug", mod, target="llvm", device=tvm.cpu()
-        ).evaluate()(inp, weight, z)
-    np.testing.assert_allclose(res.numpy(), res1.numpy())
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Layout NCHW4c unsupported in `arm_cpu`. See https://github.com/apache/tvm/issues/16537",
-)
-def test_alter_layout_blocked_no_broadcast():
-    """Test boradcast operators working on already blocked layout"""
-
-    def before():
-        dtype = "float32"
-        input_shape = (1, 8, 16, 16, 4)
-        filter_shape = (1, 8, 4, 4, 4, 4)
-        bias_shape = (1, 1, 1, 1, 4)
-        A = relay.var("data", shape=input_shape, dtype=dtype)
-        B = relay.var("weight", shape=filter_shape, dtype=dtype)
-        C = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-        conv = relay.nn.conv2d(
-            A,
-            B,
-            data_layout="NCHW4c",
-            kernel_layout="OIHW4i4o",
-            padding=[3, 3, 0, 0],
-            strides=[2, 2],
-            out_dtype=dtype,
-            channels=4,
-            kernel_size=(4, 4),
-        )
-        bias = relay.op.add(conv, C)
-        bias = relay.Function(analysis.free_vars(bias), bias)
-        return bias
-
-    def expected():
-        return before()
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW4c"
-        new_attrs["kernel_layout"] = "OIHW4i4o"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32)
-    weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32)
-    z = np.random.uniform(size=(1, 1, 1, 1, 4)).astype(np.float32)
-    mod = tvm.IRModule.from_expr(before())
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        with tvm.transform.PassContext(opt_level=4):
-            res = relay.build_module.create_executor(
-                "graph", mod, target="llvm", device=tvm.cpu()
-            ).evaluate()(inp, weight, z)
-    with tvm.transform.PassContext(opt_level=0):
-        res1 = relay.build_module.create_executor(
-            "debug", mod, target="llvm", device=tvm.cpu()
-        ).evaluate()(inp, weight, z)
-    np.testing.assert_allclose(res.numpy(), res1.numpy())
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Layout NCHW4c unsupported in `arm_cpu`. See https://github.com/apache/tvm/issues/16537",
-)
-def test_alter_layout_blocked_broadcast():
-    """Test boradcast operators working on already blocked layout"""
-
-    def before():
-        dtype = "float32"
-        input_shape = (1, 8, 16, 16, 4)
-        filter_shape = (1, 8, 4, 4, 4, 4)
-        bias_shape = (1, 1, 1, 1, 1)
-        A = relay.var("data", shape=input_shape, dtype=dtype)
-        B = relay.var("weight", shape=filter_shape, dtype=dtype)
-        C = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-        conv = relay.nn.conv2d(
-            A,
-            B,
-            data_layout="NCHW4c",
-            kernel_layout="OIHW4i4o",
-            padding=[3, 3, 0, 0],
-            strides=[2, 2],
-            out_dtype=dtype,
-            channels=4,
-            kernel_size=(4, 4),
-        )
-        bias = relay.op.add(conv, C)
-        bias = relay.Function(analysis.free_vars(bias), bias)
-        return bias
-
-    def expected():
-        return before()
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW4c"
-        new_attrs["kernel_layout"] = "OIHW4i4o"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32)
-    weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32)
-    z = np.random.uniform(size=(1, 1, 1, 1, 1)).astype(np.float32)
-    mod = tvm.IRModule.from_expr(before())
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        with tvm.transform.PassContext(opt_level=4):
-            res = relay.build_module.create_executor(
-                "graph", mod, target="llvm", device=tvm.cpu()
-            ).evaluate()(inp, weight, z)
-    with tvm.transform.PassContext(opt_level=0):
-        res1 = relay.build_module.create_executor(
-            "debug", mod, target="llvm", device=tvm.cpu()
-        ).evaluate()(inp, weight, z)
-    np.testing.assert_allclose(res.numpy(), res1.numpy())
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Layout NCHW4c unsupported in `arm_cpu`. See https://github.com/apache/tvm/issues/16537",
-)
-def test_alter_layout_re_blocking_broadcast():
-    """Test of re-blocking shapes with boradcast operators"""
-
-    def before():
-        dtype = "float32"
-        input_shape = (1, 8, 16, 16, 4)
-        filter_shape = (1, 8, 4, 4, 4, 4)
-        bias_shape = (1, 1, 1, 1, 4)
-        A = relay.var("data", shape=input_shape, dtype=dtype)
-        B = relay.var("weight", shape=filter_shape, dtype=dtype)
-        C = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-        conv = relay.nn.conv2d(
-            A,
-            B,
-            data_layout="NCHW4c",
-            kernel_layout="OIHW4i4o",
-            padding=[3, 3, 0, 0],
-            strides=[2, 2],
-            out_dtype=dtype,
-            channels=4,
-            kernel_size=(4, 4),
-        )
-        bias = relay.op.add(conv, C)
-        bias = relay.Function(analysis.free_vars(bias), bias)
-        return bias
-
-    def expected():
-        dtype = "float32"
-        input_shape = (1, 8, 16, 16, 4)
-        filter_shape = (1, 8, 4, 4, 4, 4)
-        bias_shape = (1, 1, 1, 1, 4)
-        A = relay.var("data", shape=input_shape, dtype=dtype)
-        B = relay.var("weight", shape=filter_shape, dtype=dtype)
-        C = relay.var("bias", shape=bias_shape, dtype=dtype)
-
-        A = relay.layout_transform(A, src_layout="NCHW4c", dst_layout="NCHW2c")
-        B = relay.layout_transform(B, src_layout="OIHW4i4o", dst_layout="OIHW2i2o")
-
-        conv = relay.nn.conv2d(
-            A,
-            B,
-            data_layout="NCHW2c",
-            kernel_layout="OIHW2i2o",
-            padding=[3, 3, 0, 0],
-            strides=[2, 2],
-            out_dtype=dtype,
-            channels=4,
-            kernel_size=(4, 4),
-        )
-        C = relay.layout_transform(C, src_layout="NCHW4c", dst_layout="NCHW2c")
-        bias = relay.op.add(conv, C)
-        bias = relay.layout_transform(bias, src_layout="NCHW2c", dst_layout="NCHW4c")
-        bias = relay.Function(analysis.free_vars(bias), bias)
-        return bias
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW2c"
-        new_attrs["kernel_layout"] = "OIHW2i2o"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32)
-    weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32)
-    z = np.random.uniform(size=(1, 1, 1, 1, 4)).astype(np.float32)
-    mod = tvm.IRModule.from_expr(before())
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        with tvm.transform.PassContext(opt_level=4):
-            res = relay.build_module.create_executor(
-                "graph", mod, target="llvm", device=tvm.cpu()
-            ).evaluate()(inp, weight, z)
-    with tvm.transform.PassContext(opt_level=0):
-        res1 = relay.build_module.create_executor(
-            "debug", mod, target="llvm", device=tvm.cpu()
-        ).evaluate()(inp, weight, z)
-    np.testing.assert_allclose(res.numpy(), res1.numpy(), rtol=1e-5, atol=1e-5)
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Layout NCHW4c unsupported in `arm_cpu`. See https://github.com/apache/tvm/issues/16537",
-)
-def test_broadcast_non_adaptable():
-    """NCHW4c + [x, x, 4] and NCHW4c is being altered to NCHW"""
-
-    def before():
-        x = relay.var("x", shape=(1, 4, 3, 3, 4))
-        weight = relay.var("weight", shape=(4, 4, 1, 1, 4, 4))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=16,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            data_layout="NCHW4c",
-            kernel_layout="OIHW4i4o",
-        )
-        z = relay.var("z", shape=(3, 3, 4))
-        y = y + z
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 4, 3, 3, 4))
-        weight = relay.var("weight", shape=(4, 4, 1, 1, 4, 4))
-        x = relay.layout_transform(x, src_layout="NCHW4c", dst_layout="NCHW")
-        weight = relay.layout_transform(weight, src_layout="OIHW4i4o", dst_layout="OIHW")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=16,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        z = relay.var("z", shape=(3, 3, 4))
-        y = relay.layout_transform(y, src_layout="NCHW", dst_layout="NCHW4c")
-        y = y + z
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW"
-        new_attrs["kernel_layout"] = "OIHW"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        a = run_opt_pass(before(), transform.AlterOpLayout())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    inp = np.random.uniform(size=(1, 4, 3, 3, 4)).astype(np.float32)
-    weight = np.random.uniform(size=(4, 4, 1, 1, 4, 4)).astype(np.float32)
-    z = np.random.uniform(size=(3, 3, 4)).astype(np.float32)
-    mod = tvm.IRModule.from_expr(before())
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        with tvm.transform.PassContext(opt_level=4):
-            res = relay.build_module.create_executor(
-                "graph", mod, target="llvm", device=tvm.cpu()
-            ).evaluate()(inp, weight, z)
-    with tvm.transform.PassContext(opt_level=0):
-        res1 = relay.build_module.create_executor(
-            "debug", mod, target="llvm", device=tvm.cpu()
-        ).evaluate()(inp, weight, z)
-    np.testing.assert_allclose(res.numpy(), res1.numpy())
-
-
-@pytest.mark.skipif(
-    platform.machine() == "aarch64",
-    reason="Layout NCHW4c unsupported in `arm_cpu`. See https://github.com/apache/tvm/issues/16537",
-)
-def test_broadcast_respect_input_layouts():
-    def before():
-        x = relay.var("x", shape=(1, 16, 1, 1))
-        w = relay.var("w", shape=(16, 16, 1, 1))
-        x = relay.nn.conv2d(
-            x,
-            w,
-            kernel_size=(1, 1),
-            padding=(0, 0),
-            channels=16,
-        )
-        y1 = relay.min(x, axis=[2])
-        y2 = relay.min(x, axis=[3])
-        z = y1 + y2
-        z = relay.Function(analysis.free_vars(z), z)
-        return z
-
-    def alter_conv2d(attrs, inputs, tinfos, out_type):
-        data, weight = inputs
-        new_attrs = dict(attrs)
-        new_attrs["data_layout"] = "NCHW4c"
-        new_attrs["kernel_layout"] = "OIHW4i4o"
-        return relay.nn.conv2d(data, weight, **new_attrs)
-
-    inp = np.random.uniform(size=(1, 16, 1, 1)).astype(np.float32)
-    weight = np.random.uniform(size=(16, 16, 1, 1)).astype(np.float32)
-    mod = tvm.IRModule.from_expr(before())
-    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
-        with tvm.transform.PassContext(opt_level=4):
-            res = relay.build_module.create_executor(
-                "graph", mod, target="llvm", device=tvm.cpu()
-            ).evaluate()(inp, weight)
-    with tvm.transform.PassContext(opt_level=0):
-        res1 = relay.build_module.create_executor(
-            "debug", mod, target="llvm", device=tvm.cpu()
-        ).evaluate()(inp, weight)
-    np.testing.assert_allclose(res.numpy(), res1.numpy())
-
-
-def test_axis_semantic_change():
-    x = relay.var("x", shape=(1, 1, 24, 48))
-    w1 = relay.const(np.random.uniform(size=(1, 1, 1, 1)))
-    w2 = relay.const(np.random.uniform(size=(1, 1, 1, 1)))
-    y = relay.nn.conv2d(x, w1, kernel_size=(1, 1), padding=(0, 0), channels=1)
-    y = relay.transpose(y, (0, 1, 3, 2))
-    z = relay.nn.conv2d(y, w2, kernel_size=(1, 1), padding=(0, 0), channels=1)
-    func = relay.Function([x], z)
-    mod = tvm.IRModule.from_expr(func)
-    with tvm.transform.PassContext(opt_level=3):
-        relay.build(mod, target="llvm")
-
-
-def test_alter_with_subfunc():
-    v1 = relay.var("v", shape=[1, 256, 10, 10], dtype="float32")
-    v2 = relay.image.resize2d(v1, size=[16, 16], roi=[0.0, 0.0, 0.0, 0.0], rounding_method="")
-    sub_func = relay.Function([v1], v2)
-    x1 = relay.var("x", shape=[1, 256, 10, 10], dtype="float32")
-    x2 = sub_func(x1)
-    x3 = relay.image.resize2d(x2, size=[8, 8], roi=[0.0, 0.0, 0.0, 0.0], rounding_method="")
-    func = relay.Function([x1], x3)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    tvm.ir.assert_structural_equal(relay.transform.AlterOpLayout()(mod), mod)
-
-
-def test_alter_with_reduce():
-    x = relay.var("x", shape=(1, 1, 1, 1))
-    y = relay.image.resize2d(x, (2, 4))
-    z = relay.mean(y, axis=0)
-    a = relay.image.resize1d(z, (1,))
-    func = relay.Function((x,), a)
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    with tvm.transform.PassContext(opt_level=4):
-        relay.build(mod, target="llvm")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_annotate_spans_defuse.py b/tests/python/relay/test_pass_annotate_spans_defuse.py
deleted file mode 100644
index c513c592d611..000000000000
--- a/tests/python/relay/test_pass_annotate_spans_defuse.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for annotating spans."""
-
-
-import tvm
-import tvm.relay as relay
-from tvm.relay import testing
-import tvm.testing
-
-
-def test_annotate_spans_compatibility():
-    data = relay.var("data", relay.TensorType((1, 3, 64, 64), "float32"))
-    weight = relay.var("weight")
-
-    bn_gamma = relay.var("bn_gamma")
-    bn_beta = relay.var("bn_beta")
-    bn_mmean = relay.var("bn_mean")
-    bn_mvar = relay.var("bn_var")
-
-    simple_net = relay.nn.conv2d(
-        data=data, weight=weight, kernel_size=(3, 3), channels=3, padding=(1, 1)
-    )
-    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
-    simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
-
-    module, params = testing.create_workload(simple_net)
-
-    # Apply some simple passes to legalize the IR.
-    with tvm.transform.PassContext(opt_level=0):
-        module, params = relay.optimize(
-            module, target=tvm.testing.enabled_targets()[0][0], params=params
-        )
-
-    seq = tvm.transform.Sequential([relay.transform.AnnotateSpans(), relay.transform.DefuseOps()])
-    with tvm.transform.PassContext(opt_level=3):
-        module = seq(module)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py
deleted file mode 100644
index a32f7d7f6190..000000000000
--- a/tests/python/relay/test_pass_annotate_target.py
+++ /dev/null
@@ -1,806 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for annotating external targets."""
-import os
-import sys
-import numpy as np
-import pytest
-
-import tvm
-import tvm.relay.testing
-import tvm.relay.transform as transform
-from tvm import relay
-from tvm import runtime
-from tvm.contrib import utils
-
-
-def check_result(
-    mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu(), params=None
-):
-    if sys.platform == "win32":
-        print("Skip test on Windows for now")
-        return
-
-    def update_lib(lib):
-        test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-        source_dir = os.path.join(test_dir, "..", "..", "..")
-        contrib_path = os.path.join(source_dir, "src", "runtime", "contrib")
-
-        kwargs = {}
-        kwargs["options"] = ["-O2", "-std=c++17", "-I" + contrib_path]
-        tmp_path = utils.tempdir()
-        lib_name = "lib.so"
-        lib_path = tmp_path.relpath(lib_name)
-        lib.export_library(lib_path, fcompile=False, **kwargs)
-        lib = runtime.load_module(lib_path)
-
-        return lib
-
-    def check_vm_result():
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            exe = relay.vm.compile(mod, target=target, params=params)
-        code, lib = exe.save()
-        lib = update_lib(lib)
-        exe = runtime.vm.Executable.load_exec(code, lib)
-        vm = runtime.vm.VirtualMachine(exe, device)
-        out = vm.run(**map_inputs)
-        tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol)
-
-    def check_graph_executor_result():
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            json, lib, param = relay.build(mod, target=target, params=params)
-        lib = update_lib(lib)
-        rt_mod = tvm.contrib.graph_executor.create(json, lib, device)
-
-        for name, data in map_inputs.items():
-            rt_mod.set_input(name, data)
-        rt_mod.set_input(**param)
-        rt_mod.run()
-        out = tvm.nd.empty(out_shape, device=device)
-        out = rt_mod.get_output(0, out)
-
-        tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol)
-
-    check_vm_result()
-    check_graph_executor_result()
-
-
-def test_extern_dnnl():
-    def annotated(dtype, ishape, w1shape):
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
-        depthwise_conv2d_1 = relay.nn.conv2d(
-            data, weight1, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        depthwise_conv2d_2 = relay.nn.conv2d(
-            depthwise_conv2d_1, weight1, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
-
-        f = relay.Function([data, weight1], out)
-
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    def expected(dtype, ishape, w1shape):
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
-        begin0 = relay.annotation.compiler_begin(data, "dnnl")
-        begin1 = relay.annotation.compiler_begin(weight1, "dnnl")
-        depthwise_conv2d_1 = relay.nn.conv2d(
-            begin0, begin1, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        end0 = relay.annotation.compiler_end(depthwise_conv2d_1, "dnnl")
-        end1 = relay.annotation.compiler_end(depthwise_conv2d_1, "dnnl")
-        begin2 = relay.annotation.compiler_begin(end1, "dnnl")
-        begin3 = relay.annotation.compiler_begin(end0, "dnnl")
-        begin4 = relay.annotation.compiler_begin(weight1, "dnnl")
-        depthwise_conv2d_2 = relay.nn.conv2d(
-            begin3, begin4, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        end2 = relay.annotation.compiler_end(depthwise_conv2d_2, "dnnl")
-        begin5 = relay.annotation.compiler_begin(end2, "dnnl")
-        out = relay.add(begin2, begin5)
-        end3 = relay.annotation.compiler_end(out, "dnnl")
-        f = relay.Function([data, weight1], end3)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    w1shape = (32, 1, 3, 3)
-
-    def test_annotate():
-        mod = annotated(dtype, ishape, w1shape)
-        mod = transform.AnnotateTarget("dnnl")(mod)
-        mod = relay.transform.InferType()(mod)
-        ref_mod = expected(dtype, ishape, w1shape)
-        ref_mod = relay.transform.InferType()(ref_mod)
-        tvm.ir.assert_structural_equal(mod, ref_mod)
-
-    def test_run():
-        if not tvm.get_global_func("relay.ext.dnnl", True):
-            print("skip because DNNL codegen is not available")
-            return
-
-        ref_mod = annotated(dtype, ishape, w1shape)
-        mod = annotated(dtype, ishape, w1shape)
-        mod = transform.PartitionGraph()(mod)
-
-        i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-        w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-        ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()).evaluate()(
-            i_data, w1_data
-        )
-
-        check_result(
-            mod, {"data": i_data, "weight1": w1_data}, (1, 32, 14, 14), ref_res.numpy(), tol=1e-5
-        )
-
-    test_annotate()
-    test_run()
-
-
-@pytest.mark.skip(reason="fix constant node before opening this case")
-def test_extern_dnnl_mobilenet():
-    if not tvm.get_global_func("relay.ext.dnnl", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (1, 3, 224, 224)
-    mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-
-    mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params)
-    mod = transform.AnnotateTarget("dnnl")(mod)
-    mod = transform.PartitionGraph()(mod)
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-
-    ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()(
-        i_data, **params
-    )
-
-    check_result(mod, {"data": i_data}, (1, 1000), ref_res.numpy(), tol=1e-5, params=params)
-
-
-def test_multiple_ends():
-    @tvm.ir.register_op_attr("nn.relu", "target.test")
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    def before():
-        x = relay.var("x", shape=(10, 10))
-        r = relay.nn.relu(x)
-        a_1 = relay.abs(r)
-        a_2 = relay.abs(r)
-        out = relay.add(a_1, a_2)
-        f = relay.Function([x], out)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    def after():
-        x = relay.var("x", shape=(10, 10))
-        cb_1 = relay.annotation.compiler_begin(x, "test")
-        r = relay.nn.relu(cb_1)
-        ce_1 = relay.annotation.compiler_end(r, "test")
-        ce_2 = relay.annotation.compiler_end(r, "test")
-        cb_2 = relay.annotation.compiler_begin(ce_1, "default")
-        cb_3 = relay.annotation.compiler_begin(ce_2, "default")
-        a_1 = relay.abs(cb_2)
-        a_2 = relay.abs(cb_3)
-        ce_3 = relay.annotation.compiler_end(a_1, "default")
-        ce_4 = relay.annotation.compiler_end(a_2, "default")
-        cb_4 = relay.annotation.compiler_begin(ce_3, "default")
-        cb_5 = relay.annotation.compiler_begin(ce_4, "default")
-        out = relay.add(cb_4, cb_5)
-        ce_6 = relay.annotation.compiler_end(out, "default")
-        f = relay.Function([x], ce_6)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    for annotate_non_call_ops in [False, True]:
-        result = transform.AnnotateTarget("test", annotate_non_call_ops)(before())
-        expected = transform.InferType()(after())
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_type_propagation():
-    target = "test_type_propagation"
-
-    @tvm.ir.register_op_attr("nn.relu", "target." + target)
-    def relu(expr):  # pylint: disable=unused-variable
-        return expr.args[0].checked_type.dtype == "float32"
-
-    def before():
-        x = relay.var("x", shape=(10, 10))
-        r = relay.nn.relu(x)
-        out = relay.nn.relu(r)
-        f = relay.Function([x], out)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    for annotate_non_call_ops in [False, True]:
-        # If the type isn't propogated, then the relu checker function will fail to get the dtype.
-        assert transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-
-
-def test_ref_create_read_write():
-    target = "relu"
-
-    @tvm.ir.register_op_attr("nn.relu", "target." + target)
-    def annotate(expr):
-        return True
-
-    def before():
-        ref = relay.expr.RefCreate(relay.const(1.0))
-        r = relay.expr.RefWrite(ref, relay.nn.relu(relay.expr.RefRead(ref)))
-        return tvm.IRModule.from_expr(r)
-
-    def after(annotate_non_call_ops):
-        co = relay.const(1.0)
-        if annotate_non_call_ops:
-            co = relay.annotation.compiler_begin(co, "default")
-
-        ref = relay.expr.RefCreate(co)
-        ref1 = ref
-        if annotate_non_call_ops:
-            ref = relay.annotation.compiler_end(ref, "default")
-            ref = relay.annotation.compiler_begin(ref, "default")
-            ref1 = relay.annotation.compiler_end(ref1, "default")
-            ref1 = relay.annotation.compiler_begin(ref1, "default")
-
-        read = relay.expr.RefRead(ref1)
-        if annotate_non_call_ops:
-            read = relay.annotation.compiler_end(read, "default")
-
-        beg = relay.annotation.compiler_begin(read, target)
-        relu = relay.nn.relu(beg)
-        end = relay.annotation.compiler_end(relu, target)
-
-        if annotate_non_call_ops:
-            end = relay.annotation.compiler_begin(end, "default")
-
-        r = relay.expr.RefWrite(ref, end)
-
-        if annotate_non_call_ops:
-            r = relay.annotation.compiler_end(r, "default")
-        return tvm.IRModule.from_expr(r)
-
-    for annotate_non_call_ops in [True, False, True]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-        expected = transform.InferType()(after(annotate_non_call_ops))
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_tuple():
-    target = "test_tuple"
-
-    @tvm.ir.register_op_attr("nn.relu", "target." + target)
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("concatenate", "target." + target)
-    def concatenate(expr):  # pylint: disable=unused-variable
-        return True
-
-    """Test that TupleNode is included in annotation when surrounded by supported nodes."""
-
-    def before():
-        x = relay.var("x", shape=(10, 5))
-        y = relay.var("y", shape=(10, 5))
-        a_1 = relay.nn.relu(x)
-        a_2 = relay.nn.relu(y)
-        out = relay.concatenate((a_1, a_2), axis=1)
-        f = relay.Function([x, y], out)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    def after(annotate_non_call_ops):
-        x = relay.var("x", shape=(10, 5))
-        y = relay.var("y", shape=(10, 5))
-        cb_1 = relay.annotation.compiler_begin(x, target)
-        cb_2 = relay.annotation.compiler_begin(y, target)
-        a_1 = relay.nn.relu(cb_1)
-        a_2 = relay.nn.relu(cb_2)
-        ce_1 = relay.annotation.compiler_end(a_1, target)
-        ce_2 = relay.annotation.compiler_end(a_2, target)
-
-        if annotate_non_call_ops:
-            cb_3 = relay.annotation.compiler_begin(ce_1, target)
-            cb_4 = relay.annotation.compiler_begin(ce_2, target)
-            tup = relay.Tuple([cb_3, cb_4])
-            ce_3 = relay.annotation.compiler_end(tup, target)
-        else:
-            ce_3 = relay.Tuple([ce_1, ce_2])
-
-        cb_3 = relay.annotation.compiler_begin(ce_3, target)
-        out = relay.op._make.concatenate(cb_3, 1)
-        ce_4 = relay.annotation.compiler_end(out, target)
-        f = relay.Function([x, y], ce_4)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    for annotate_non_call_ops in [False, True]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-        expected = transform.InferType()(after(annotate_non_call_ops))
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_composite_function():
-    def before():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-
-        # add_relu function
-        in_1 = relay.var("in_1", shape=(10, 10))
-        in_2 = relay.var("in_2", shape=(10, 10))
-        add_node = relay.add(in_1, in_2)
-        relu_node = relay.nn.relu(add_node)
-        add_relu = relay.Function([in_1, in_2], relu_node)
-        add_relu = add_relu.with_attr("Composite", "test.add_relu")
-
-        # merged function
-        r = relay.Call(add_relu, [a, b])
-        f = relay.Function([a, b], r)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    def after():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-
-        # add_relu function
-        in_1 = relay.var("in_1", shape=(10, 10))
-        in_2 = relay.var("in_2", shape=(10, 10))
-        add_node = relay.add(in_1, in_2)
-        relu_node = relay.nn.relu(add_node)
-        add_relu = relay.Function([in_1, in_2], relu_node)
-        add_relu = add_relu.with_attr("Composite", "test.add_relu")
-
-        # merged function
-        cb_1 = relay.annotation.compiler_begin(a, "test")
-        cb_2 = relay.annotation.compiler_begin(b, "test")
-        r = relay.Call(add_relu, [cb_1, cb_2])
-        ce_1 = relay.annotation.compiler_end(r, "test")
-        f = relay.Function([a, b], ce_1)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    result = transform.AnnotateTarget("test")(before())
-    expected = transform.InferType()(after())
-    tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_double_target():
-    @tvm.ir.register_op_attr("nn.relu", "target.double.A")
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    def before():
-        x = relay.var("x", shape=(10, 5))
-        a_1 = relay.nn.relu(x)
-        mod = tvm.IRModule.from_expr(a_1)
-        return mod
-
-    for annotate_non_call_ops in [True, False]:
-        mod = before()
-        mod1 = transform.AnnotateTarget("double.A", annotate_non_call_ops)(mod)
-        mod2 = transform.AnnotateTarget("double.A", annotate_non_call_ops)(mod1)
-        tvm.ir.assert_structural_equal(mod1, mod2)
-
-
-def test_different_targets():
-    @tvm.ir.register_op_attr("nn.relu", "target.different.A")
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("add", "target.different.B")
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    def before():
-        x = relay.var("x", shape=(10, 5))
-        a_1 = relay.nn.relu(x)
-        b_1 = relay.add(a_1, a_1)
-        mod = tvm.IRModule.from_expr(b_1)
-        return mod
-
-    for annotate_non_call_ops in [True, False]:
-        mod = before()
-        mod1 = transform.AnnotateTarget("different.A", annotate_non_call_ops)(mod)
-        mod1 = transform.AnnotateTarget("different.B", annotate_non_call_ops)(mod1)
-        mod2 = transform.AnnotateTarget(["different.A", "different.B"], annotate_non_call_ops)(mod)
-        tvm.ir.assert_structural_equal(mod1, mod2)
-
-
-def test_multiple_runs():
-    @tvm.ir.register_op_attr("nn.relu", "target.A")
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("add", "target.B")
-    def add(expr):  # pylint: disable=unused-variable
-        return True
-
-    def before():
-        x = relay.var("x", shape=(10, 5))
-        a_1 = relay.nn.relu(x)
-        a_2 = relay.abs(a_1)
-        a_3 = relay.nn.relu(a_1)
-        out = relay.add(a_2, a_3)
-
-        f = relay.Function([x], out)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    for annotate_non_call_ops in [True, False]:
-        mod = transform.AnnotateTarget("A", annotate_non_call_ops)(before())
-        mod = transform.AnnotateTarget("B", annotate_non_call_ops)(mod)
-        expected = transform.AnnotateTarget(["A", "B"], annotate_non_call_ops)(before())
-        tvm.ir.assert_structural_equal(expected, mod)
-
-
-def test_ends_with_tuple():
-    trgt = "clip"
-
-    @tvm.ir.register_op_attr("clip", "target." + trgt)
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    def get_model(get_item):
-        """Return a model"""
-        a = relay.var("a", shape=(1, 16, 16, 4), dtype="uint8")
-        z = relay.op.clip(a, 0, 255)
-        b = relay.op.clip(z, 0, 15)
-        c = relay.op.clip(z, 16, 31)
-        t = relay.Tuple((c, b))
-        tgi = relay.TupleGetItem(t, 1) if get_item else t
-        foo = relay.Function([a], tgi)
-        return tvm.IRModule.from_expr(tgi)
-
-    def get_expected(annotate_non_call_ops, get_item):
-        a_ = relay.var("a", shape=(1, 16, 16, 4), dtype="uint8")
-        a = relay.annotation.compiler_begin(a_, trgt)
-        z = relay.op.clip(a, 0, 255)
-        z1 = relay.annotation.compiler_end(z, trgt)
-        z1 = relay.annotation.compiler_begin(z1, trgt)
-        b = relay.op.clip(z1, 0, 15)
-        b = relay.annotation.compiler_end(b, trgt)
-        b = relay.annotation.compiler_begin(b, trgt) if annotate_non_call_ops else b
-        z2 = relay.annotation.compiler_end(z, trgt)
-        z2 = relay.annotation.compiler_begin(z2, trgt)
-        c = relay.op.clip(z2, 16, 31)
-        c = relay.annotation.compiler_end(c, trgt)
-        c = relay.annotation.compiler_begin(c, trgt) if annotate_non_call_ops else c
-        t = relay.Tuple((c, b))
-        t = relay.annotation.compiler_end(t, trgt) if annotate_non_call_ops else t
-        if get_item:
-            t = relay.annotation.compiler_begin(t, trgt) if annotate_non_call_ops else t
-            tgi = relay.TupleGetItem(t, 1)
-            tgi = relay.annotation.compiler_end(tgi, trgt) if annotate_non_call_ops else tgi
-        else:
-            tgi = t
-        foo = relay.Function([a_], tgi)
-        return tvm.IRModule.from_expr(foo)
-
-    for get_item in [True, False]:
-        for annotate_non_call_ops in [False, True]:
-            mod = get_model(get_item)
-            mod = transform.AnnotateTarget("clip", annotate_non_call_ops)(mod)
-            expected = transform.InferType()(get_expected(annotate_non_call_ops, get_item))
-            tvm.ir.assert_structural_equal(expected, mod)
-
-
-def test_if_else():
-    target = "test_if_else"
-
-    @tvm.ir.register_op_attr("equal", "target." + target)
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("tanh", "target." + target)
-    def tanh(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("sigmoid", "target." + target)
-    def sigmoid(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("erf", "target." + target)
-    def erf(expr):  # pylint: disable=unused-variable
-        return True
-
-    """Test that If-else nodes compiles correctly when surrounded by supported nodes."""
-
-    def before():
-        data = relay.var("data", shape=(1, 32))
-        eq1 = relay.var("e1", shape=[], dtype="float32")
-        eq2 = relay.var("e2", shape=[], dtype="float32")
-        eq = relay.equal(eq1, eq2)
-
-        true_branch = relay.tanh(data)
-        false_branch = relay.sigmoid(data)
-        ife = relay.If(eq, true_branch, false_branch)
-        out = relay.erf(ife)
-        func = relay.Function([data, eq1, eq2], out)
-        mod = tvm.IRModule.from_expr(func)
-
-        return mod
-
-    def after():
-
-        data = relay.var("data", shape=(1, 32))
-        eq1 = relay.var("e1", shape=[], dtype="float32")
-        eq2 = relay.var("e2", shape=[], dtype="float32")
-
-        cb_1 = relay.annotation.compiler_begin(eq1, target)
-        cb_2 = relay.annotation.compiler_begin(eq2, target)
-
-        equality_condition = relay.equal(cb_1, cb_2)
-        ce_1 = relay.annotation.compiler_end(equality_condition, target)
-
-        # if condition
-        cb_3 = relay.annotation.compiler_begin(data, target)
-        true_branch = relay.tanh(cb_3)
-        ce_2 = relay.annotation.compiler_end(true_branch, target)
-
-        # else condition
-        cb_4 = relay.annotation.compiler_begin(data, target)
-        false_branch = relay.sigmoid(cb_4)
-        ce_3 = relay.annotation.compiler_end(false_branch, target)
-
-        if_condition = relay.If(ce_1, ce_2, ce_3)
-        cb_5 = relay.annotation.compiler_begin(if_condition, target)
-        erf_out = relay.erf(cb_5)
-        ce_4 = relay.annotation.compiler_end(erf_out, target)
-        func = relay.Function([data, eq1, eq2], ce_4)
-        mod = tvm.IRModule.from_expr(func)
-        return mod
-
-    expected = transform.InferType()(after())
-    for annotate_non_call_ops in [True, False]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_while_let():
-    target = "test_while_let"
-
-    @tvm.ir.register_op_attr("less", "target." + target)
-    def less(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("add", "target." + target)
-    def add(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("zeros_like", "target." + target)
-    def zeros_like(expr):  # pylint: disable=unused-variable
-        return True
-
-    """Test that let nodes compiles correctly when surrounded by other nodes."""
-
-    def before():
-
-        var1 = relay.var("var1", shape=(2,))
-        var2 = relay.var("var2", shape=(), dtype="int32")
-        var3 = relay.var("var3", shape=(2,))
-        cond = relay.less(var2, relay.const(10, dtype="int32"))
-
-        loop = relay.var("while_loop")
-        ii = var2 + relay.const(1, dtype="int32")
-        ss = var3 + var1
-        true_branch = loop(ii, ss)
-        ife = relay.If(cond, true_branch, var3)
-        func_1 = relay.Function([var2, var3], ife)
-
-        ret = relay.Let(loop, func_1, loop(relay.const(0, dtype="int32"), relay.zeros_like(var1)))
-        func_2 = relay.Function([var1], ret)
-        mod = tvm.IRModule.from_expr(func_2)
-        return mod
-
-    def after(annotate_non_call_ops):
-        var1 = relay.var("var1", shape=(2,))
-        var2 = relay.var("var2", shape=(), dtype="int32")
-        var3 = relay.var("var3", shape=(2,))
-        var4 = relay.const(10, dtype="int32")
-
-        cb_1 = relay.annotation.compiler_begin(var2, target)
-        cb_2 = relay.annotation.compiler_begin(var4, target)
-
-        less_condition = relay.less(cb_1, cb_2)
-        ce_1 = relay.annotation.compiler_end(less_condition, target)
-
-        loop = relay.var("while_loop")
-
-        # if condition
-        cb_3 = relay.annotation.compiler_begin(var2, target)
-        cb_4 = relay.annotation.compiler_begin(relay.const(1, dtype="int32"), target)
-        add_op_1 = relay.add(cb_3, cb_4)
-        ce_2 = relay.annotation.compiler_end(add_op_1, target)
-
-        cb_5 = relay.annotation.compiler_begin(ce_2, "default") if annotate_non_call_ops else ce_2
-
-        cb_6 = relay.annotation.compiler_begin(var3, target)
-        cb_7 = relay.annotation.compiler_begin(var1, target)
-        add_op_2 = relay.add(cb_6, cb_7)
-        ce_3 = relay.annotation.compiler_end(add_op_2, target)
-
-        cb_8 = relay.annotation.compiler_begin(ce_3, "default") if annotate_non_call_ops else ce_3
-
-        true_branch = loop(cb_5, cb_8)  # while loop
-        ce_4 = (
-            relay.annotation.compiler_end(true_branch, "default")
-            if annotate_non_call_ops
-            else true_branch
-        )
-        if_condition = relay.If(ce_1, ce_4, var3)
-        const_1 = relay.const(0, dtype="int32")
-        cb_9 = (
-            relay.annotation.compiler_begin(const_1, "default")
-            if annotate_non_call_ops
-            else const_1
-        )
-        cb_10 = relay.annotation.compiler_begin(var1, target)
-        zeros_like = relay.zeros_like(cb_10)
-        ce_5 = relay.annotation.compiler_end(zeros_like, target)
-        cb_11 = relay.annotation.compiler_begin(ce_5, "default") if annotate_non_call_ops else ce_5
-        while_condition = loop(cb_9, cb_11)
-        ce_6 = (
-            relay.annotation.compiler_end(while_condition, "default")
-            if annotate_non_call_ops
-            else while_condition
-        )
-
-        func_1 = relay.Function([var2, var3], if_condition)
-        ret = relay.Let(loop, func_1, ce_6)
-        func_2 = relay.Function([var1], ret)
-        mod = tvm.IRModule.from_expr(func_2)
-        return mod
-
-    for annotate_non_call_ops in [False, True]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-        expected = transform.InferType()(after(annotate_non_call_ops))
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_if_free_vars():
-    target = "test_if_free_vars"
-
-    @tvm.ir.register_op_attr("equal", "target." + target)
-    def equal(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("sigmoid", "target." + target)
-    def sigmoid(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("erf", "target." + target)
-    def erf(expr):  # pylint: disable=unused-variable
-        return True
-
-    """Test that If-else nodes compiles correctly when surrounded by free variables"""
-
-    def before():
-        data = relay.var("data", shape=(1, 32))
-        eq1 = relay.var("e1", shape=[], dtype="float32")
-        eq2 = relay.var("e2", shape=[], dtype="float32")
-        eq = relay.equal(eq1, eq2)
-
-        true_branch = relay.zeros(shape=(1, 32), dtype="float32")
-        false_branch = relay.sigmoid(data)
-        ife = relay.If(eq, true_branch, false_branch)
-        out = relay.erf(ife)
-
-        func = relay.Function([data, eq1, eq2], out)
-        mod = tvm.IRModule.from_expr(func)
-
-        return mod
-
-    def after():
-        data = relay.var("data", shape=(1, 32))
-        eq1 = relay.var("e1", shape=[], dtype="float32")
-        eq2 = relay.var("e2", shape=[], dtype="float32")
-
-        cb_1 = relay.annotation.compiler_begin(eq1, target)
-        cb_2 = relay.annotation.compiler_begin(eq2, target)
-
-        equality_condition = relay.equal(cb_1, cb_2)
-        ce_1 = relay.annotation.compiler_end(equality_condition, target)
-
-        # if condition
-        true_branch = relay.zeros(shape=(1, 32), dtype="float32")
-
-        # else condition
-        cb_3 = relay.annotation.compiler_begin(data, target)
-        false_branch = relay.sigmoid(cb_3)
-        ce_2 = relay.annotation.compiler_end(false_branch, target)
-
-        if_condition = relay.If(ce_1, true_branch, ce_2)
-        cb_4 = relay.annotation.compiler_begin(if_condition, target)
-        erf_out = relay.erf(cb_4)
-        ce_3 = relay.annotation.compiler_end(erf_out, target)
-        func = relay.Function([data, eq1, eq2], ce_3)
-        mod = tvm.IRModule.from_expr(func)
-        return mod
-
-    for annotate_non_call_ops in [True, False]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-        expected = transform.InferType()(after())
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_free_vars_zeros():
-    target = "test_free_vars_zeros"
-
-    """Test that free variables compile correctly on their own"""
-
-    def before():
-        func = relay.Function([], relay.zeros(shape=(0), dtype="float32"))
-        mod = tvm.IRModule.from_expr(func)
-        return mod
-
-    def after():
-        func = relay.Function([], relay.zeros(shape=(0), dtype="float32"))
-        mod = tvm.IRModule.from_expr(func)
-        return mod
-
-    result = transform.AnnotateTarget(target)(before())
-    expected = transform.InferType()(after())
-    tvm.ir.assert_structural_equal(expected, result)
-
-
-def test_empty_tuple():
-    target = "test_empty_tuple"
-
-    """An empty tuple should behave just like a call with no args (see above test)."""
-
-    def before():
-        func = relay.Function([], relay.Tuple([]))
-        mod = tvm.IRModule.from_expr(func)
-        return mod
-
-    def after():
-        func = relay.Function([], relay.Tuple([]))
-        mod = tvm.IRModule.from_expr(func)
-        return mod
-
-    for annotate_non_call_ops in [True, False]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
-        expected = transform.InferType()(after())
-        tvm.ir.assert_structural_equal(expected, result)
-
-
-if __name__ == "__main__":
-    test_extern_dnnl()
-    test_composite_function()
-    # test_extern_dnnl_mobilenet()
-    test_multiple_ends()
-    test_type_propagation()
-    test_tuple()
-    test_multiple_runs()
-    test_if_else()
-    test_while_let()
-    test_if_free_vars()
-    test_free_vars_zeros()
-    test_different_targets()
-    test_double_target()
-    test_ends_with_tuple()
-    test_ref_create_read_write()
-    test_empty_tuple()
diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py
deleted file mode 100644
index 30d4c3650215..000000000000
--- a/tests/python/relay/test_pass_auto_quantize.py
+++ /dev/null
@@ -1,537 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import testing
-from tvm.relay.expr import Call
-from tvm.topi.utils import get_const_tuple
-
-
-def quantize_and_build(out, skip_conv_layers=[]):
-    f = relay.Function(relay.analysis.free_vars(out), out)
-    mod, params = testing.create_workload(f)
-
-    with relay.quantize.qconfig(skip_conv_layers=skip_conv_layers):
-        qmod = relay.quantize.quantize(mod, params)
-
-    relay.build(qmod, "llvm", params=params)
-
-    return qmod
-
-
-def test_mul_rewrite():
-    """a test case where rhs of mul is not constant"""
-    data = relay.var("data", shape=(1, 16, 64, 64))
-    multiplier = relay.sigmoid(relay.var("data", shape=(1, 16, 1, 1)))
-    conv = relay.nn.conv2d(
-        data, relay.var("weight"), kernel_size=(3, 3), padding=(1, 1), channels=16
-    )
-    act = relay.nn.relu(data=conv)
-
-    quantize_and_build(act * multiplier)
-
-    pool = relay.nn.global_avg_pool2d(data=act)
-
-    quantize_and_build(act * pool)
-
-
-def test_skip_conv():
-    data = relay.var("data", shape=(1, 16, 64, 64))
-    np_weight = np.random.rand(16, 16, 3, 3)
-    conv0_weight = relay.Constant(tvm.nd.array(np_weight)).astype("float32")
-    conv1_weight = relay.Constant(tvm.nd.array(np_weight)).astype("float32")
-    multiplier = relay.sigmoid(relay.var("data", shape=(1, 16, 1, 1)))
-
-    conv0 = relay.nn.conv2d(data, conv0_weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
-    act0 = relay.nn.relu(data=conv0)
-    conv1 = relay.nn.conv2d(act0, conv1_weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
-    act1 = relay.nn.relu(data=conv1)
-
-    quantize_and_build(act1 * multiplier)
-    quantize_and_build(act1 * multiplier, skip_conv_layers=[0])
-    quantize_and_build(act1 * multiplier, skip_conv_layers=[1])
-    quantize_and_build(act1 * multiplier, skip_conv_layers=[0, 1])
-
-
-def test_stop_quantize():
-    data = relay.var("data", shape=(1, 16, 64, 64))
-    np_weight0 = np.random.rand(16, 16, 3, 3)
-    conv0_weight = relay.Constant(tvm.nd.array(np_weight0)).astype("float32")
-    np_weight1 = np.random.rand(16, 16, 1, 1)
-    conv1_weight = relay.Constant(tvm.nd.array(np_weight1)).astype("float32")
-    multiplier = relay.sigmoid(relay.var("data", shape=(1, 16, 1, 1)))
-
-    conv0 = relay.nn.conv2d(data, conv0_weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
-    act0 = relay.nn.relu(data=conv0)
-
-    pool = relay.nn.global_avg_pool2d(data=act0)
-
-    conv1 = relay.nn.conv2d(pool, conv1_weight, kernel_size=(1, 1), padding=(0, 0), channels=16)
-    act1 = relay.nn.relu(data=conv1)
-
-    quantize_and_build(act1 * multiplier)
-
-
-def test_batch_flatten_rewrite():
-
-    data = relay.var("data", shape=(1, 16, 64, 64), dtype="float32")
-
-    out = relay.nn.conv2d(
-        data, relay.var("weight"), kernel_size=(3, 3), padding=(1, 1), channels=16
-    )
-
-    out = relay.nn.batch_flatten(out)
-
-    qmod = quantize_and_build(out)
-
-    def _check_batch_flatten(node):
-        if isinstance(node, Call):
-            if node.op.name == "nn.batch_flatten":
-                assert node.checked_type.dtype == "int8"
-
-    # check if batch_flatten is quantized
-    relay.analysis.post_order_visit(qmod["main"], _check_batch_flatten)
-
-
-def test_batch_matmul_rewrite():
-    data = relay.var("data", shape=(1, 4, 16, 16))
-    data2 = relay.sigmoid(relay.var("data", shape=(4, 16, 64)))
-    out = relay.nn.conv2d(data, relay.var("weight"), kernel_size=(3, 3), padding=(1, 1), channels=8)
-
-    out = relay.nn.batch_flatten(out)
-    out = relay.reshape(out, [1, 32, 64])
-    out = relay.nn.batch_matmul(out, data2)
-
-    qmod = quantize_and_build(out)
-
-    def _check_batch_matmul(node):
-        if isinstance(node, Call):
-
-            if node.op.name in ["nn.batch_matmul", "nn.conv2d"]:
-                assert node.checked_type.dtype == "int32"
-            elif node.op.name == "nn.batch_flatten":
-                assert node.checked_type.dtype == "int8"
-
-    # check if batch_matmul is quantized
-    relay.analysis.post_order_visit(qmod["main"], _check_batch_matmul)
-
-
-def get_calibration_dataset(mod, input_name):
-    dataset = []
-    input_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape]
-    for i in range(5):
-        data = np.random.uniform(size=input_shape)
-        dataset.append({input_name: data})
-    return dataset
-
-
-@pytest.mark.parametrize("create_target", [True, False])
-def test_calibrate_target(create_target):
-    mod, params = testing.synthetic.get_workload()
-    dataset = get_calibration_dataset(mod, "data")
-    with relay.quantize.qconfig(calibrate_mode="kl_divergence"):
-        if create_target:
-            with tvm.target.Target("llvm"):
-                relay.quantize.quantize(mod, params, dataset)
-        else:
-            # current_target = None
-            relay.quantize.quantize(mod, params, dataset)
-
-
-def test_calibrate_memory_bound():
-    mod, params = testing.synthetic.get_workload()
-    dataset = get_calibration_dataset(mod, "data")
-    import multiprocessing
-
-    num_cpu = multiprocessing.cpu_count()
-    with relay.quantize.qconfig(calibrate_mode="kl_divergence", calibrate_chunk_by=num_cpu):
-        relay.quantize.quantize(mod, params, dataset)
-
-
-def test_calibrate_percentile():
-    mod, params = testing.synthetic.get_workload()
-    dataset = get_calibration_dataset(mod, "data")
-    with relay.quantize.qconfig(calibrate_mode="percentile"):
-        relay.quantize.quantize(mod, params, dataset)
-
-
-####################################
-# Quant/Dequant Partitioning Tests #
-####################################
-
-BASE_CFG = {
-    "skip_conv_layers": [],
-    "skip_dense_layers": False,
-    "dtype_input": "int8",
-    "dtype_weight": "int8",
-    "dtype_activation": "int32",
-}
-
-
-def gen_rand_tvm(tt, low, high):
-    if "int" in tt.dtype:
-        data_np = np.random.randint(low, high, size=get_const_tuple(tt.shape), dtype=tt.dtype)
-    elif "float" in tt.dtype:
-        data_np = np.random.uniform(low, high, size=get_const_tuple(tt.shape)).astype(tt.dtype)
-    else:
-        assert False, "unknown dtype"
-    return tvm.nd.array(data_np, device=tvm.cpu(0))
-
-
-def verify_partition_fails(mod, params):
-    # standard partition should always succeed
-    with relay.quantize.qconfig(**BASE_CFG, partition_conversions="enabled"):
-        partitioned_mod = relay.quantize.quantize(mod, params)
-
-    try:
-        with relay.quantize.qconfig(**BASE_CFG, partition_conversions="fully_integral"):
-            partitioned_mod = relay.quantize.quantize(mod, params)
-        raise RuntimeError("partitioning should have failed")
-    except AssertionError:
-        pass
-
-
-def verify_partition(mod, params):
-    with relay.quantize.qconfig(**BASE_CFG, paritition_conversions="disabled"):
-        unpartitioned_mod = relay.quantize.quantize(mod, params)
-        assert (
-            len(unpartitioned_mod.get_global_vars()) == 1
-        ), "unpartitioned module should only have one function"
-    with relay.quantize.qconfig(**BASE_CFG, partition_conversions="fully_integral"):
-        partitioned_mod = relay.quantize.quantize(mod, params)
-
-    # ensure partitioned and unpartitioned results agree
-    params = [gen_rand_tvm(param.type_annotation, 0, 1) for param in partitioned_mod["main"].params]
-
-    def _eval_mod(mod):
-        return relay.create_executor("vm", device=tvm.cpu(0), target="llvm", mod=mod).evaluate()(
-            *params
-        )
-
-    partitioned_mod_result = _eval_mod(partitioned_mod)
-    unpartitioned_mod_result = _eval_mod(unpartitioned_mod)
-    tvm.testing.assert_allclose(unpartitioned_mod_result.numpy(), partitioned_mod_result.numpy())
-
-
-def test_add_partition():
-    mod = tvm.relay.parse(
-        """
-    #[version = "0.0.5"]
-    def @main(
-        %x: Tensor[(10, 10), float32],
-        %y: Tensor[(10, 10), float32]) {
-      add(%x, %y)
-    }
-    """
-    )
-    params = {}
-    verify_partition_fails(mod, params)
-
-
-def test_conv2d_partition():
-    mod = tvm.relay.parse(
-        """
-    #[version = "0.0.5"]
-    def @main(
-        %x: Tensor[(1, 4, 16, 16), float32],
-        %w: Tensor[(4, 4, 3, 3), float32]) -> Tensor[(1, 4, 16, 16), float32] {
-      nn.conv2d(%x, %w,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3])
-    }
-    """
-    )
-    weight_ty = mod["main"].params[1].checked_type
-    params = {"w": gen_rand_tvm(weight_ty, 0, 1)}
-    verify_partition(mod, params)
-
-
-def test_multiple_arg_conversions_partition():
-    mod = tvm.relay.parse(
-        """
-    #[version = "0.0.5"]
-    def @main(
-        %x1: Tensor[(1, 4, 16, 16), float32],
-        %w1: Tensor[(4, 4, 3, 3), float32],
-        %x2: Tensor[(1, 4, 16, 16), float32],
-        %w2: Tensor[(4, 4, 3, 3), float32]
-        ) -> Tensor[(1, 4, 16, 16), float32] {
-      %0 = nn.conv2d(%x1, %w1,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3]);
-      %1 = nn.conv2d(%x2, %w2,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3]);
-      add(%0, %1)
-    }
-    """
-    )
-
-    w1_ty = mod["main"].params[1].checked_type
-    w2_ty = mod["main"].params[3].checked_type
-    params = {"w1": gen_rand_tvm(w1_ty, 0, 1), "w2": gen_rand_tvm(w2_ty, 0, 1)}
-    verify_partition(mod, params)
-
-
-def test_unquantizable_prefix_partition():
-    mod = tvm.relay.parse(
-        """
-    #[version = "0.0.5"]
-    def @main(
-        %x: Tensor[(1, 4, 16, 16), float32],
-        %b: Tensor[(4), float32],
-        %w: Tensor[(4, 4, 3, 3), float32]) -> Tensor[(1, 4, 16, 16), float32] {
-      // NOTE bias_add isn't currently quantizable
-      %0 = nn.bias_add(%x, %b);
-      nn.conv2d(%0, %w,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3])
-    }
-    """
-    )
-    bias_ty = mod["main"].params[1].checked_type
-    weight_ty = mod["main"].params[2].checked_type
-    params = {"b": gen_rand_tvm(bias_ty, 0, 1), "w": gen_rand_tvm(weight_ty, 0, 1)}
-    verify_partition_fails(mod, params)
-
-
-def test_unquantizable_core_partition():
-    mod = tvm.relay.parse(
-        """
-    #[version = "0.0.5"]
-    def @main(
-        %x1: Tensor[(1, 4, 16, 16), float32],
-        %w1: Tensor[(4, 4, 3, 3), float32],
-        %b: Tensor[(4), float32],
-        %w2: Tensor[(4, 4, 3, 3), float32]) -> Tensor[(1, 4, 16, 16), float32] {
-      %0 = nn.conv2d(%x1, %w1,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3]);
-      // NOTE bias_add isn't currently quantizable
-      %1 = nn.bias_add(%0, %b);
-      nn.conv2d(%1, %w2,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3])
-    }
-    """
-    )
-    w1_ty = mod["main"].params[1].checked_type
-    bias_ty = mod["main"].params[2].checked_type
-    w2_ty = mod["main"].params[3].checked_type
-    params = {
-        "w1": gen_rand_tvm(w1_ty, 0, 1),
-        "w2": gen_rand_tvm(w2_ty, 0, 1),
-        "b": gen_rand_tvm(bias_ty, 0, 1),
-    }
-    verify_partition_fails(mod, params)
-
-
-def test_unquantizable_suffix_partition():
-    mod = tvm.relay.parse(
-        """
-    #[version = "0.0.5"]
-    def @main(
-        %x: Tensor[(1, 4, 16, 16), float32],
-        %w: Tensor[(4, 4, 3, 3), float32],
-        %b: Tensor[(4), float32]) -> Tensor[(1, 4, 16, 16), float32] {
-      %0 = nn.conv2d(%x, %w,
-        padding=[1, 1, 1, 1],
-        channels=4,
-        kernel_size=[3, 3]);
-      // NOTE bias_add isn't currently quantizable
-      nn.bias_add(%0, %b)
-    }
-    """
-    )
-    weight_ty = mod["main"].params[1].checked_type
-    bias_ty = mod["main"].params[2].checked_type
-    params = {"w": gen_rand_tvm(weight_ty, 0, 1), "b": gen_rand_tvm(bias_ty, 0, 1)}
-    verify_partition_fails(mod, params)
-
-
-def test_left_shift_negative():
-    data = relay.var("data", shape=(1, 16, 64, 64))
-    weight = relay.const(np.full((16, 16, 3, 3), 256.0))
-    conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
-    relu = relay.nn.relu(conv2d)
-
-    mod = tvm.IRModule.from_expr(relu)
-
-    with tvm.transform.PassContext(opt_level=3):
-        with relay.quantize.qconfig(
-            calibrate_mode="global_scale", global_scale=8.0, skip_conv_layers=None
-        ):
-            qnn_mod = relay.quantize.quantize(mod)
-
-    class OpFinder(relay.ExprVisitor):
-        def __init__(self, op_name):
-            super(OpFinder, self).__init__()
-            self._op_name = op_name
-            self.ops = list()
-
-        def visit_call(self, call):
-            super().visit_call(call)
-            if call.op.name == self._op_name:
-                self.ops.append(call)
-
-    opf = OpFinder("left_shift")
-    opf.visit(qnn_mod["main"])
-    assert len(opf.ops) > 0, 'Broken case, can\'t find any "left_shift" operators.'
-    for left_shift_op in opf.ops:
-        shift_amount = left_shift_op.args[1].data.numpy()
-        assert shift_amount >= 0, "Shift amount must be non-negative."
-
-
-def test_dense_conv2d_rewrite():
-    n, c, h, w = 1, 16, 64, 64
-    data = relay.var("data", relay.TensorType((n, c, h, w)))
-    inp = relay.var("inp", relay.TensorType((n, c * h * w)))
-    weight_T = relay.const(np.random.random((n, c * h * w)), dtype="float32")
-    bias = relay.const(np.random.random((n,)), dtype="float32")
-    conv_w = relay.const(np.random.random((16, 16, 3, 3)), dtype="float32")
-
-    dense_o = relay.nn.dense(inp, weight_T)
-    linear_o = relay.nn.bias_add(dense_o, bias)
-    conv2d_o = relay.nn.conv2d(data, conv_w, kernel_size=(3, 3), padding=(1, 1), channels=16)
-    result = relay.Tuple((linear_o, conv2d_o))
-
-    mod = tvm.IRModule.from_expr(result)
-    with tvm.transform.PassContext(opt_level=3):
-        with relay.quantize.qconfig(
-            calibrate_mode="global_scale", global_scale=8.0, skip_dense_layer=False
-        ):
-            qnn_mod = relay.quantize.quantize(mod)
-
-    def _check_dense(node):
-        if isinstance(node, Call):
-            if node.op.name == "nn.dense":
-                assert node.args[0].checked_type.dtype == "int8"
-                assert node.args[1].checked_type.dtype == "int8"
-                assert node.checked_type.dtype == "int32"
-            if node.op.name == "nn.conv2d":
-                assert node.args[0].checked_type.dtype == "float32"
-                assert node.args[1].checked_type.dtype == "float32"
-                assert node.checked_type.dtype == "float32"
-
-    relay.analysis.post_order_visit(qnn_mod["main"], _check_dense)
-
-
-def test_add_lhs_is_none_annotate():
-    data_conv = relay.var("data_conv", shape=(1, 16, 64, 64))
-    conv2d_w = relay.const(np.random.random((16, 16, 3, 3)))
-    conv2d = relay.nn.conv2d(data_conv, conv2d_w, padding=(1, 1), kernel_size=(3, 3))
-    data_add = relay.var("data_add", shape=(16, 1, 1))
-    add = relay.add(data_add, conv2d)
-    global_avg_pool2d = relay.nn.global_avg_pool2d(add)
-    mod = tvm.IRModule.from_expr(global_avg_pool2d)
-
-    calibrate_data = [
-        {"data_conv": np.random.random((1, 16, 64, 64)), "data_add": np.random.random((16, 1, 1))}
-    ]
-
-    with tvm.transform.PassContext(opt_level=3):
-        with relay.quantize.qconfig(calibrate_mode="kl_divergence", skip_conv_layers=None):
-            qmod = relay.quantize.quantize(mod, dataset=calibrate_data)
-
-    params = [gen_rand_tvm(param.type_annotation, 0, 1) for param in mod["main"].params]
-
-    def _eval_mod(mod):
-        return relay.create_executor("vm", device=tvm.cpu(0), target="llvm", mod=mod).evaluate()(
-            *params
-        )
-
-    mod_result = _eval_mod(mod)
-    qmod_result = _eval_mod(qmod)
-    tvm.testing.assert_allclose(mod_result.numpy(), qmod_result.numpy(), rtol=1e-1, atol=1e-1)
-
-
-def test_add_lhs_rhs_is_input_annotate():
-    data_conv_r = relay.var("data_conv_r", shape=(1, 16, 64, 64))
-    conv2d_r = relay.nn.conv2d(
-        data_conv_r,
-        relay.const(np.random.random((16, 16, 3, 3))),
-        padding=(1, 1),
-        kernel_size=(3, 3),
-    )
-    data_conv_l = relay.var("data_conv_l", shape=(1, 16, 64, 64))
-    conv2d_l = relay.nn.conv2d(
-        data_conv_l,
-        relay.const(np.random.random((16, 16, 3, 3))),
-        padding=(1, 1),
-        kernel_size=(3, 3),
-    )
-    add = relay.add(conv2d_l, conv2d_r)
-    global_avg_pool2d = relay.nn.global_avg_pool2d(add)
-    mod = tvm.IRModule.from_expr(global_avg_pool2d)
-
-    calibrate_data = [
-        {
-            "data_conv_l": np.random.random((1, 16, 64, 64)),
-            "data_conv_r": np.random.random((1, 16, 64, 64)),
-            "data_add": np.random.random((16, 1, 1)),
-        }
-    ]
-
-    with tvm.transform.PassContext(opt_level=3):
-        with relay.quantize.qconfig(calibrate_mode="kl_divergence", skip_conv_layers=None):
-            qmod = relay.quantize.quantize(mod, dataset=calibrate_data)
-
-    params = [gen_rand_tvm(param.type_annotation, 0, 1) for param in mod["main"].params]
-
-    def _eval_mod(mod):
-        return relay.create_executor("vm", device=tvm.cpu(0), target="llvm", mod=mod).evaluate()(
-            *params
-        )
-
-    mod_result = _eval_mod(mod)
-    qmod_result = _eval_mod(qmod)
-    tvm.testing.assert_allclose(mod_result.numpy(), qmod_result.numpy(), rtol=1e-1, atol=1e-1)
-
-
-if __name__ == "__main__":
-    test_mul_rewrite()
-    test_batch_flatten_rewrite()
-    test_batch_matmul_rewrite()
-    test_calibrate_target(False)
-    test_calibrate_target(True)
-    test_calibrate_memory_bound()
-    test_calibrate_percentile()
-
-    test_add_partition()
-    test_conv2d_partition()
-    test_multiple_arg_conversions_partition()
-    test_unquantizable_prefix_partition()
-    test_unquantizable_core_partition()
-    test_unquantizable_suffix_partition()
-    test_left_shift_negative()
-    test_dense_conv2d_rewrite()
-
-    test_skip_conv()
-    test_stop_quantize()
-
-    test_add_lhs_is_none_annotate()
-    test_add_lhs_rhs_is_input_annotate()
diff --git a/tests/python/relay/test_pass_canonicalize_cast.py b/tests/python/relay/test_pass_canonicalize_cast.py
deleted file mode 100644
index 2a7d83fe27df..000000000000
--- a/tests/python/relay/test_pass_canonicalize_cast.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import tvm.relay as relay
-import tvm.relay.transform as _transform
-
-
-def test_canonicalize_cast():
-    def before(data, conv_weight, bias1, bias2):
-        x = relay.nn.conv2d(
-            data, conv_weight, channels=16, kernel_size=(3, 3), padding=(1, 1), out_dtype="int8"
-        )
-        x1 = relay.cast(x, dtype="int32")
-        y1 = relay.add(x1, bias1)
-        y2 = relay.add(x1, bias2)
-        y = relay.add(y1, y2)
-        return relay.Function([data, conv_weight, bias1, bias2], y)
-
-    def expected(data, conv_weight, bias1, bias2):
-        x = relay.nn.conv2d(
-            data, conv_weight, channels=16, kernel_size=(3, 3), padding=(1, 1), out_dtype="int8"
-        )
-        x1 = relay.cast(x, dtype="int32")
-        x2 = relay.cast(x, dtype="int32")
-        y1 = relay.add(x1, bias1)
-        y2 = relay.add(x2, bias2)
-        y = relay.add(y1, y2)
-        return relay.Function([data, conv_weight, bias1, bias2], y)
-
-    def check(shape):
-        data = relay.var("data", shape=shape, dtype="int8")
-        conv_weight = relay.var("weight")
-        bias1 = relay.var("bias1", shape=(16, 1, 1), dtype="int32")
-        bias2 = relay.var("bias2", shape=(16, 1, 1), dtype="int32")
-        y = before(data, conv_weight, bias1, bias2)
-        mod = tvm.IRModule.from_expr(y)
-        seq = tvm.transform.Sequential(
-            [_transform.InferType(), _transform.CanonicalizeCast(), _transform.InferType()]
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            mod = seq(mod)
-        y = mod["main"]
-        y_expected = expected(data, conv_weight, bias1, bias2)
-        gv = relay.GlobalVar("expected")
-        mod[gv] = y_expected
-        mod = _transform.InferType()(mod)
-        y_expected = mod["expected"]
-        tvm.ir.assert_structural_equal(y, y_expected)
-
-    check((1, 16, 7, 7))
-
-
-if __name__ == "__main__":
-    test_canonicalize_cast()
diff --git a/tests/python/relay/test_pass_check_kind.py b/tests/python/relay/test_pass_check_kind.py
deleted file mode 100644
index 41c77540cb7f..000000000000
--- a/tests/python/relay/test_pass_check_kind.py
+++ /dev/null
@@ -1,277 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.analysis import check_kind
-import pytest
-
-
-def test_typevar_kind():
-    # returns the same kind
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.ShapeVar)
-    tp3 = relay.TypeVar("tp3", relay.TypeKind.Constraint)
-
-    assert check_kind(tp1) == relay.TypeKind.Type
-    assert check_kind(tp2) == relay.TypeKind.ShapeVar
-    assert check_kind(tp3) == relay.TypeKind.Constraint
-
-
-def test_tuple_kind():
-    # only contain type kinds
-    tp = relay.TypeVar("tp", relay.TypeKind.Type)
-    tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
-    tf = relay.FuncType(
-        tvm.runtime.convert([]), tt, tvm.runtime.convert([]), tvm.runtime.convert([])
-    )
-    fields = tvm.runtime.convert([tp, tf, tt])
-
-    tup_ty = relay.TupleType(fields)
-    assert check_kind(tup_ty) == relay.TypeKind.Type
-
-
-def test_func_kind():
-    # only contain type kinds
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.Type)
-
-    shape = tvm.runtime.convert([1, 2, 3])
-    dtype = "float32"
-    tensor_type = relay.TensorType(shape, dtype)
-
-    tr = relay.TypeRelation(None, tvm.runtime.convert([tensor_type, tp1]), 1, None)
-
-    type_params = tvm.runtime.convert([tp1, tp2])
-    type_constraints = tvm.runtime.convert([tr])
-    arg_types = tvm.runtime.convert([tp1, tensor_type])
-    ret_type = relay.TupleType(tvm.runtime.convert([tp2, tensor_type]))
-
-    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
-    assert check_kind(tf) == relay.TypeKind.Type
-
-
-def test_ref_kind():
-    # only contain type kinds
-    tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
-    ft = relay.FuncType(
-        tvm.runtime.convert([]), tt, tvm.runtime.convert([]), tvm.runtime.convert([])
-    )
-
-    rt1 = relay.RefType(tt)
-    assert check_kind(rt1) == relay.TypeKind.Type
-    rt2 = relay.RefType(ft)
-    assert check_kind(rt2) == relay.TypeKind.Type
-    rt3 = relay.RefType(relay.TupleType([rt1, rt2]))
-    assert check_kind(rt3) == relay.TypeKind.Type
-
-
-def test_relation_kind():
-    # only have type kinds for arguments
-    tp = relay.TypeVar("tp", relay.TypeKind.Type)
-    tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
-    tf = relay.FuncType(
-        tvm.runtime.convert([]), tt, tvm.runtime.convert([]), tvm.runtime.convert([])
-    )
-    args = tvm.runtime.convert([tf, tt, tp])
-
-    tr = relay.TypeRelation(None, args, 2, None)
-    assert check_kind(tr) == relay.TypeKind.Constraint
-
-
-def test_global_typevar_kind():
-    v1 = relay.GlobalTypeVar("gtv1", relay.TypeKind.AdtHandle)
-    v2 = relay.GlobalTypeVar("gtv2", relay.TypeKind.Type)
-
-    assert check_kind(v1) == relay.TypeKind.AdtHandle
-    assert check_kind(v2) == relay.TypeKind.Type
-
-
-def test_typecall_kind():
-    gtv = relay.GlobalTypeVar("gtv")
-
-    mod = tvm.IRModule()
-    data = relay.TypeData(gtv, [], [])
-    mod[gtv] = data
-    empty_call = relay.TypeCall(gtv, [])
-    assert check_kind(empty_call, mod) == relay.TypeKind.Type
-
-    new_mod = tvm.IRModule()
-    tv = relay.TypeVar("tv")
-    new_data = relay.TypeData(gtv, [tv], [])
-    new_mod[gtv] = new_data
-    call = relay.TypeCall(gtv, [relay.TupleType([])])
-    assert check_kind(call, new_mod) == relay.TypeKind.Type
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_invalid_tuple_kind():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.ShapeVar)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.BaseType)
-    tp3 = relay.TypeVar("tp3", relay.TypeKind.Constraint)
-    fields = tvm.runtime.convert([tp1, tp2, tp3])
-
-    tup_ty = relay.TupleType(fields)
-    check_kind(tup_ty)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_invalid_func_kind():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.ShapeVar)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.BaseType)
-    tp3 = relay.TypeVar("tp3", relay.TypeKind.Constraint)
-
-    type_params = tvm.runtime.convert([tp1, tp2, tp3])
-    type_constraints = tvm.runtime.convert([])
-    arg_types = tvm.runtime.convert([tp1, tp2])
-    ret_type = tp3
-
-    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
-    check_kind(tf)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_invalid_ref_kind():
-    tp = relay.TypeVar("tp", relay.TypeKind.ShapeVar)
-    rt = relay.RefType(tp)
-    check_kind(rt)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_invalid_relation_kind():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.ShapeVar)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.BaseType)
-    tp3 = relay.TypeVar("tp3", relay.TypeKind.Constraint)
-    args = tvm.runtime.convert([tp1, tp2, tp3])
-
-    func = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast")
-    tr = relay.TypeRelation(func, args, 2, None)
-    check_kind(tr)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_typecall_invalid_callee():
-    # global type var must be an ADT handle
-    gtv = relay.GlobalTypeVar("v1", relay.TypeKind.Type)
-    check_kind(relay.TypeCall(gtv, []))
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_typecall_invalid_args():
-    # args must all be type kind
-    mod = tvm.IRModule()
-    gtv = relay.GlobalTypeVar("v1")
-    data = relay.TypeData(gtv, [], [])
-    mod[gtv] = data
-
-    check_kind(relay.TypeCall(gtv, [data]))
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_typecall_invalid_num_args():
-    mod = tvm.IRModule()
-    gtv = relay.GlobalTypeVar("v1")
-    tv = relay.TypeVar("tv")
-    data = relay.TypeData(gtv, [tv], [])
-    mod[gtv] = data
-    check_kind(relay.TypeCall(gtv, []))
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_func_with_invalid_ret_type():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.ShapeVar)
-    tf = relay.FuncType(
-        tvm.runtime.convert([tp1]), tp2, tvm.runtime.convert([tp1, tp2]), tvm.runtime.convert([])
-    )
-
-    check_kind(tf)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_func_with_invalid_arg_types():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.ShapeVar)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.Type)
-    tf = relay.FuncType(
-        tvm.runtime.convert([tp1]), tp2, tvm.runtime.convert([tp1, tp2]), tvm.runtime.convert([])
-    )
-
-    check_kind(tf)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_func_with_invalid_tuple():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.ShapeVar)
-
-    ret_type = relay.TupleType(tvm.runtime.convert([tp1, tp1, tp1]))
-
-    tf = relay.FuncType(
-        tvm.runtime.convert([]), ret_type, tvm.runtime.convert([tp1]), tvm.runtime.convert([])
-    )
-    check_kind(tf)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_func_with_invalid_relation():
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.Type)
-    tp2 = relay.TypeVar("tp2", relay.TypeKind.ShapeVar)
-    tp3 = relay.TypeVar("tp3", relay.TypeKind.Constraint)
-
-    func = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Identity")
-    tr = relay.TypeRelation(func, tvm.runtime.convert([tp2, tp3]), 1, None)
-
-    tf = relay.FuncType(
-        tvm.runtime.convert([tp1]),
-        tp1,
-        tvm.runtime.convert([tp1, tp2, tp3]),
-        tvm.runtime.convert([tr]),
-    )
-    check_kind(tf)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_tuple_with_invalid_func():
-    tensor_type = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
-
-    tp1 = relay.TypeVar("tp1", relay.TypeKind.ShapeVar)
-    tf = relay.FuncType(
-        tvm.runtime.convert([]), tp1, tvm.runtime.convert([tp1]), tvm.runtime.convert([])
-    )
-
-    tup_ty = relay.TupleType(tvm.runtime.convert([tensor_type, tf]))
-    check_kind(tup_ty)
-
-
-if __name__ == "__main__":
-    test_tuple_kind()
-    test_func_kind()
-    test_ref_kind()
-    test_relation_kind()
-    test_global_typevar_kind()
-    test_typecall_kind()
-    test_invalid_tuple_kind()
-    test_invalid_func_kind()
-    test_invalid_ref_kind()
-    test_invalid_relation_kind()
-    test_typecall_invalid_callee()
-    test_typecall_invalid_args()
-    test_typecall_invalid_num_args()
-    test_func_with_invalid_ret_type()
-    test_func_with_invalid_arg_types()
-    test_func_with_invalid_tuple()
-    test_func_with_invalid_relation()
-    test_tuple_with_invalid_func()
diff --git a/tests/python/relay/test_pass_collage_partition.py b/tests/python/relay/test_pass_collage_partition.py
deleted file mode 100644
index f40631628ea5..000000000000
--- a/tests/python/relay/test_pass_collage_partition.py
+++ /dev/null
@@ -1,683 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import tvm.testing
-import pytest
-from tvm.relay.transform import CollagePartition, InferType, CapturePostDfsIndexInSpans
-from tvm.target import make_compilation_config
-from tvm.relay.collage import MockCostEstimator
-from unittest.mock import patch
-from tvm.relay.dataflow_pattern import is_op, wildcard
-
-
-# We'll reuse the target kind "example_target_hook" (registered in
-# src/relay/backend/contrib/example_target_hooks/target.cc) as our
-# example external codegen target.
-
-
-def test_pattern_table():
-    def relu_pattern():
-        return is_op("nn.relu")(wildcard())
-
-    def add_pattern():
-        return is_op("add")(wildcard(), wildcard())
-
-    def concatenate_pattern():
-        return is_op("concatenate")(wildcard())
-
-    def predicate(expr):
-        return True
-
-    return [
-        ("relu", relu_pattern(), predicate),
-        ("add", add_pattern(), predicate),
-        ("concatenate", concatenate_pattern(), predicate),
-    ]
-
-
-def _mock_get_pattern_table(target):
-    if target == "example_target_hook":
-        return test_pattern_table()
-
-
-def run_collage(
-    input_mod, targets, cost_estimator, expected_mod, tvm_max_depth=8, byoc_max_depth=8
-):
-    ctxt = {
-        "relay.collage.tvm_max_depth": tvm_max_depth,
-        "relay.collage.byoc_max_depth": byoc_max_depth,
-    }
-    expected_mod = InferType()(expected_mod)
-    pass_ctxt = tvm.transform.PassContext(config=ctxt)
-    with pass_ctxt:
-        config = make_compilation_config(pass_ctxt, targets)
-        actual_mod = InferType()(input_mod)
-        # Capture indexes only to help debug failing tests
-        actual_mod = CapturePostDfsIndexInSpans()(actual_mod)
-        actual_mod = CollagePartition(config, cost_estimator)(actual_mod)
-
-        if not tvm.ir.structural_equal(actual_mod, expected_mod, map_free_vars=True):
-            # Print everything in full so we can see what's going on when things fail.
-            print("Input module:")
-            print(input_mod)
-            print("Actual module:")
-            print(actual_mod)
-            print("Expected module:")
-            print(expected_mod)
-            # Assert again so as to see the actual disagreeing sub-expressions.
-            tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True)
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_partition_single_op_llvm(mock_get_pattern_table):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        nn.relu(%x)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-        nn.relu(%x)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 1,
-            "example_target_hook": 2,
-        }
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod)
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_partition_single_op_byoc(mock_get_pattern_table):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        nn.relu(%x)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu") -> Tensor[(10, 10), float32] {
-        %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_01)
-        };
-        %0(%FunctionVar_0)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-        @collage_example_target_hook_nn_relu(%x)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 2,
-            "example_target_hook": 1,
-        }
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod)
-
-
-@pytest.mark.parametrize("byoc_max_depth", [1, 3])
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_partition_diamond_valid_topology(mock_get_pattern_table, byoc_max_depth):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = abs(%0);
-        %2 = nn.relu(%1);
-        add(%1, %2)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_3_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu") -> Tensor[(10, 10), float32] {
-        %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_01)
-        };
-        %0(%FunctionVar_0)
-      }
-
-      def @collage_example_target_hook_nn_relu_add(%FunctionVar_02: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu_add") -> Tensor[(10, 10), float32] {
-        %1 = fn (%FunctionVar_04: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_04)
-        };
-        %2 = %1(%FunctionVar_02);
-        %3 = fn (%FunctionVar_03: Tensor[(10, 10), float32], %FunctionVar_1: Tensor[(10, 10), float32], Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_03, %FunctionVar_1)
-        };
-        %3(%FunctionVar_02, %2)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-        %4 = @collage_example_target_hook_nn_relu(%x);
-        %5 = abs(%4);
-        @collage_example_target_hook_nn_relu_add(%5)
-      }
-    """
-    expected_1_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook") -> Tensor[(10, 10), float32] {
-        %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_02)
-        };
-        %1 = %0(%FunctionVar_0);
-        %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32], %FunctionVar_1: Tensor[(10, 10), float32], Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_01, %FunctionVar_1)
-        };
-        %2(%FunctionVar_0, %1)
-      }
-
-      def @collage_example_target_hook_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu") -> Tensor[(10, 10), float32] {
-        %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_04)
-        };
-        %3(%FunctionVar_03)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-        %4 = @collage_example_target_hook_nn_relu(%x);
-        %5 = abs(%4);
-        @collage_example_target_hook(%5)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_1_txt if byoc_max_depth == 1 else expected_3_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 2,
-            "example_target_hook": 1,
-        }
-    )
-    run_collage(
-        mod, targets, cost_estimator, expected_mod, tvm_max_depth=1, byoc_max_depth=byoc_max_depth
-    )
-
-
-@pytest.mark.parametrize("tvm_max_depth", [1, 2, 3])
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_tvm_max_depth(mock_get_pattern_table, tvm_max_depth):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = nn.relu(%0);
-        nn.relu(%1)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txts = {
-        1: """
-          #[version = "0.0.5"]
-          def @collage_example_target_hook(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook") -> Tensor[(10, 10), float32] {
-            %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_03)
-            };
-            %1 = %0(%FunctionVar_0);
-            %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_02)
-            };
-            %3 = %2(%1);
-            %4 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_01)
-            };
-            %4(%3)
-          }
-
-          def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-            @collage_example_target_hook(%x)
-          }
-        """,
-        2: """
-          #[version = "0.0.5"]
-          def @collage_example_target_hook_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu") -> Tensor[(10, 10), float32] {
-            %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_01)
-            };
-            %0(%FunctionVar_0)
-          }
-
-          def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-            %1 = @collage_example_target_hook_nn_relu(%x);
-            %2 = nn.relu(%1);
-            nn.relu(%2)
-          }
-        """,
-        3: """
-          #[version = "0.0.5"]
-          def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-            %0 = nn.relu(%x);
-            %1 = nn.relu(%0);
-            nn.relu(%1)
-          }
-        """,
-    }
-    expected_mod = tvm.relay.fromtext(expected_txts[tvm_max_depth])
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 100,
-            "example_target_hook": 99,
-        }
-    )
-    run_collage(
-        mod, targets, cost_estimator, expected_mod, tvm_max_depth=tvm_max_depth, byoc_max_depth=1
-    )
-
-
-@pytest.mark.parametrize("byoc_max_depth", [1, 2, 3])
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_byoc_max_depth(mock_get_pattern_table, byoc_max_depth):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = nn.relu(%0);
-        nn.relu(%1)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txts = {
-        1: """
-          #[version = "0.0.5"]
-          def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-            %0 = nn.relu(%x);
-            %1 = nn.relu(%0);
-            nn.relu(%1)
-          }
-        """,
-        2: """
-          #[version = "0.0.5"]
-          def @collage_example_target_hook_nn_relu_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu_nn_relu") -> Tensor[(10, 10), float32] {
-            %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_02)
-            };
-            %1 = %0(%FunctionVar_0);
-            %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_01)
-            };
-            %2(%1)
-          }
-
-          def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-            %3 = nn.relu(%x);
-            @collage_example_target_hook_nn_relu_nn_relu(%3)
-          }
-        """,
-        3: """
-          #[version = "0.0.5"]
-          def @collage_example_target_hook_nn_relu_nn_relu_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu_nn_relu_nn_relu") -> Tensor[(10, 10), float32] {
-            %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_03)
-            };
-            %1 = %0(%FunctionVar_0);
-            %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_02)
-            };
-            %3 = %2(%1);
-            %4 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-              nn.relu(%FunctionVar_01)
-            };
-            %4(%3)
-          }
-
-          def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-            @collage_example_target_hook_nn_relu_nn_relu_nn_relu(%x)
-          }
-        """,
-    }
-    expected_mod = tvm.relay.fromtext(expected_txts[byoc_max_depth])
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 99,
-            "example_target_hook": 100,
-        }
-    )
-    run_collage(
-        mod, targets, cost_estimator, expected_mod, tvm_max_depth=1, byoc_max_depth=byoc_max_depth
-    )
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_partition_output_tuple(mock_get_pattern_table):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = nn.relu(%0);
-        %2 = abs(%1);
-        (%0, %1, %2)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook") -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) {
-        %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_01)
-        };
-        %1 = %0(%FunctionVar_0);
-        %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_02)
-        };
-        %3 = %2(%1);
-        (%1, %3)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) {
-        %4 = @collage_example_target_hook(%x);
-        %5 = %4.1;
-        %6 = %4.0;
-        %7 = abs(%5);
-        (%6, %5, %7)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 2,
-            "example_target_hook": 1,
-        }
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod, tvm_max_depth=2, byoc_max_depth=2)
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_partition_intermediate_tuple(mock_get_pattern_table):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = nn.relu(%0);
-        %2 = (%0, %1);
-        concatenate(%2)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook(%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook") -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) {
-        %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_01)
-        };
-        %1 = %0(%FunctionVar_0);
-        %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_02)
-        };
-        %3 = %2(%1);
-        (%1, %3)
-      }
-
-      def @collage_example_target_hook_concatenate(%FunctionVar_03: (Tensor[(10, 10), float32], Tensor[(10, 10), float32]), Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_concatenate") -> Tensor[(20, 10), float32] {
-        %4 = fn (%FunctionVar_04: (Tensor[(10, 10), float32], Tensor[(10, 10), float32]), Composite="concatenate") -> Tensor[(20, 10), float32] {
-          concatenate(%FunctionVar_04)
-        };
-        %4(%FunctionVar_03)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(20, 10), float32] {
-        %5 = @collage_example_target_hook(%x);
-        %6 = %5.0;
-        %7 = %5.1;
-        %8 = (%6, %7);
-        @collage_example_target_hook_concatenate(%8)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 2,
-            "example_target_hook": 1,
-        }
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod, tvm_max_depth=3, byoc_max_depth=5)
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_fusion_benefit(mock_get_pattern_table):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = nn.relu(%0);
-        %2 = abs(%x);
-        %3 = nn.relu(%2);
-        %4 = add(%1, %3);
-        %5 = nn.relu(%4);
-        abs(%5)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook_nn_relu_nn_relu_nn_relu_add_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32], %FunctionVar_1: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu_nn_relu_nn_relu_add_nn_relu") -> Tensor[(10, 10), float32] {
-        %0 = fn (%FunctionVar_04: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_04)
-        };
-        %1 = %0(%FunctionVar_0);
-        %2 = fn (%FunctionVar_03: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_03)
-        };
-        %3 = fn (%FunctionVar_05: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_05)
-        };
-        %4 = %2(%1);
-        %5 = %3(%FunctionVar_1);
-        %6 = fn (%FunctionVar_02: Tensor[(10, 10), float32], %FunctionVar_11: Tensor[(10, 10), float32], Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_02, %FunctionVar_11)
-        };
-        %7 = %6(%4, %5);
-        %8 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_01)
-        };
-        %8(%7)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-        %9 = abs(%x);
-        %10 = @collage_example_target_hook_nn_relu_nn_relu_nn_relu_add_nn_relu(%x, %9);
-        abs(%10)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 5,
-            "example_target_hook": 6,
-        }
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod, tvm_max_depth=1, byoc_max_depth=5)
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_double_residual(mock_get_pattern_table):
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = abs(%0);
-        %2 = add(%0, %1);
-        add(%1, %2)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook_add_add(%FunctionVar_0: Tensor[(10, 10), float32], %FunctionVar_1: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_add_add") -> Tensor[(10, 10), float32] {
-        %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32], %FunctionVar_12: Tensor[(10, 10), float32], Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_02, %FunctionVar_12)
-        };
-        %1 = %0(%FunctionVar_1, %FunctionVar_0);
-        %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32], %FunctionVar_11: Tensor[(10, 10), float32], Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_01, %FunctionVar_11)
-        };
-        %2(%FunctionVar_0, %1)
-      }
-
-      def @collage_example_target_hook_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu") -> Tensor[(10, 10), float32] {
-        %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_04)
-        };
-        %3(%FunctionVar_03)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
-        %4 = @collage_example_target_hook_nn_relu(%x);
-        %5 = abs(%4);
-        @collage_example_target_hook_add_add(%5, %4)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 2,
-            "example_target_hook": 1,
-        }
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod, tvm_max_depth=4, byoc_max_depth=4)
-
-
-@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table)
-def test_pruning_heuristic(mock_get_pattern_table):
-    # In this example both the default TVM partition spec and the 'example_target_hook' partition
-    # spec will yield the same set of candidates, and those candidates will include all 7
-    # partitions of the four operators (ie 14 in total).
-    #
-    # However, the pruning heuristics will reduce those back to just two 'maximal' candidates
-    # which have all four operators fused. We'll then just estimate those for the two targets.
-    mod_txt = """
-      #[version = "0.0.5"]
-      def @main(%x: Tensor[(10, 10), float32]) {
-        %0 = nn.relu(%x);
-        %1 = nn.relu(%0);
-        %2 = add(%0, %1);
-        add(%1, %2)
-      }
-    """
-    mod = tvm.relay.fromtext(mod_txt)
-
-    expected_txt = """
-      #[version = "0.0.5"]
-      def @collage_example_target_hook_nn_relu_nn_relu_add_add(
-        %FunctionVar_0: Tensor[(10, 10), float32],
-        Primitive=1,
-        Compiler="example_target_hook",
-        global_symbol="collage_example_target_hook_nn_relu_nn_relu_add_add") -> Tensor[(10, 10), float32] {
-        %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32] , Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_03)
-        };
-        %1 = %0(%FunctionVar_0) ;
-        %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] , Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_02)
-        };
-        %3 = %2(%1);
-        %4 = fn (%FunctionVar_04: Tensor[(10, 10), float32] , %FunctionVar_11: Tensor[(10, 10), float32] , Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_04, %FunctionVar_11)
-        };
-        %5 = %4(%1, %3);
-        %6 = fn (%FunctionVar_01: Tensor[(10, 10), float32] , %FunctionVar_1: Tensor[(10, 10), float32] , Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_01, %FunctionVar_1)
-        };
-        %6(%3, %5)
-      }
-
-      def @main(%x: Tensor[(10, 10), float32] ) -> Tensor[(10, 10), float32] {
-        @collage_example_target_hook_nn_relu_nn_relu_add_add(%x)
-      }
-    """
-    expected_mod = tvm.relay.fromtext(expected_txt)
-
-    targets = [
-        tvm.target.Target("llvm"),
-        tvm.target.Target("example_target_hook"),
-    ]
-
-    cost_estimator = MockCostEstimator(
-        {
-            "llvm": 2,
-            "example_target_hook": 1,
-        },
-        # Limit the number of cost estimations to 2 to assert pruning did its job.
-        max_estimates=2,
-    )
-    run_collage(mod, targets, cost_estimator, expected_mod, tvm_max_depth=4, byoc_max_depth=4)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_combine_parallel_batch_matmul.py b/tests/python/relay/test_pass_combine_parallel_batch_matmul.py
deleted file mode 100644
index 1c09e15e92a5..000000000000
--- a/tests/python/relay/test_pass_combine_parallel_batch_matmul.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,too-many-locals,too-many-arguments,missing-module-docstring
-
-import tvm
-from tvm import relay
-from tvm.relay import transform
-
-
-def run_opt_pass(expr, opt_pass):
-    "runs the opt_pass on the expr of a function the function"
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = tvm.relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    return mod["main"]
-
-
-def test_combine_parallel_batch_matmul():
-    """Simple testcase."""
-
-    def before(x, w1, w2, w3):
-        args = [x, w1, w2, w3]
-        y1 = relay.nn.batch_matmul(x, w1)
-        y2 = relay.nn.batch_matmul(x, w2)
-        y3 = relay.nn.batch_matmul(x, w3)
-        y = relay.Tuple((y1, y2, y3))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, w3):
-        # use a fixed order of args so alpha equal check can pass
-        s1 = w1.type_annotation.shape[1]
-        s2 = w2.type_annotation.shape[1]
-        s3 = w3.type_annotation.shape[1]
-        args = [x, w1, w2, w3]
-        w = relay.concatenate((w1, w2, w3), axis=1)
-        y = relay.nn.batch_matmul(x, w)
-        y1 = relay.strided_slice(
-            y, begin=[0, 0, 0], end=[-1, -1, s1], strides=[1, 1, 1], slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=[0, 0, s1], end=[-1, -1, s2], strides=[1, 1, 1], slice_mode="size"
-        )
-        y3 = relay.strided_slice(
-            y, begin=[0, 0, s1 + s2], end=[-1, -1, s3], strides=[1, 1, 1], slice_mode="size"
-        )
-        y = relay.Tuple((y1, y2, y3))
-        return relay.Function(args, y)
-
-    def check(b, i, j, k):
-        x = relay.var("x", shape=(b, i, k))
-        w1 = relay.var("w1", shape=(b, j, k))
-        w2 = relay.var("w2", shape=(b, j, k))
-        w3 = relay.var("w3", shape=(b, j, k))
-
-        y_before = before(x, w1, w2, w3)
-        y = run_opt_pass(y_before, transform.CombineParallelBatchMatmul(min_num_branches=2))
-        y_expected = expected(x, w1, w2, w3)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(2, 3, 5, 4)
-    check(1, 100, 200, 300)
-
-
-def test_combine_parallel_batch_matmul_biasadd():
-    """Simple testcase with bias"""
-
-    def before(x, w1, w2, w3, b1, b2, b3):
-        args = [x, w1, w2, w3, b1, b2, b3]
-        y1 = relay.nn.batch_matmul(x, w1)
-        y2 = relay.nn.batch_matmul(x, w2)
-        y3 = relay.nn.batch_matmul(x, w3)
-        y1 = relay.add(y1, b1)
-        y2 = relay.add(y2, b2)
-        y3 = relay.add(y3, b3)
-        y = relay.Tuple((y1, y2, y3))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, w3, b1, b2, b3):
-        # use a fixed order of args so alpha equal check can pass
-        s1 = w1.type_annotation.shape[1]
-        s2 = w2.type_annotation.shape[1]
-        s3 = w3.type_annotation.shape[1]
-        args = [x, w1, w2, w3, b1, b2, b3]
-        w = relay.concatenate((w1, w2, w3), axis=1)
-        b = relay.concatenate((b1, b2, b3), axis=-1)
-        y = relay.nn.batch_matmul(x, w)
-        y = relay.add(y, b)
-        y1 = relay.strided_slice(
-            y, begin=[0, 0, 0], end=[-1, -1, s1], strides=[1, 1, 1], slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=[0, 0, s1], end=[-1, -1, s2], strides=[1, 1, 1], slice_mode="size"
-        )
-        y3 = relay.strided_slice(
-            y, begin=[0, 0, s1 + s2], end=[-1, -1, s3], strides=[1, 1, 1], slice_mode="size"
-        )
-        y = relay.Tuple((y1, y2, y3))
-        return relay.Function(args, y)
-
-    def check(b, i, j, k):
-        x = relay.var("x", shape=(b, i, k))
-        w1 = relay.var("w1", shape=(b, j, k))
-        w2 = relay.var("w2", shape=(b, j, k))
-        w3 = relay.var("w3", shape=(b, j, k))
-        b1 = relay.var("b1", shape=(j,))
-        b2 = relay.var("b2", shape=(j,))
-        b3 = relay.var("b3", shape=(j,))
-
-        y_before = before(x, w1, w2, w3, b1, b2, b3)
-        y = run_opt_pass(y_before, transform.CombineParallelBatchMatmul(min_num_branches=2))
-        y_expected = expected(x, w1, w2, w3, b1, b2, b3)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(2, 3, 5, 4)
-    check(1, 100, 200, 300)
-
-
-if __name__ == "__main__":
-    test_combine_parallel_batch_matmul()
-    test_combine_parallel_batch_matmul_biasadd()
diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py
deleted file mode 100644
index 0d41ed1294f8..000000000000
--- a/tests/python/relay/test_pass_combine_parallel_conv2d.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay
-from tvm.relay import transform
-
-
-def run_combine_parallel(expr, min_num_branches=3):
-    mod = tvm.IRModule.from_expr(expr)
-    mod = transform.CombineParallelConv2D(min_num_branches)(mod)
-    return mod["main"]
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = tvm.relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    return mod["main"]
-
-
-def test_combine_parallel_conv2d():
-    """Simple testcase."""
-
-    def before(x, w1, w2, w3, w4):
-        args = [x, w1, w2, w3, w4]
-        y1 = relay.nn.conv2d(x, w1)
-        y2 = relay.nn.conv2d(x, w2)
-        # y3 cannot be combined
-        y3 = relay.nn.conv2d(x, w3)
-        y4 = relay.nn.conv2d(x, w4)
-        y5 = relay.nn.max_pool2d(x)
-        y = relay.Tuple((y1, y2, y3, y4, y5))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, w1, w2, w3, w4]
-        w = relay.concatenate((w1, w2, w4), axis=0)
-        y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4)
-        y1 = relay.strided_slice(
-            y, begin=[0, 0], end=[-1, channels1], strides=[1, 1], slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=[0, channels1], end=[-1, channels2], strides=[1, 1], slice_mode="size"
-        )
-        y3 = relay.nn.conv2d(x, w3)
-        y4 = relay.strided_slice(
-            y,
-            begin=[0, channels1 + channels2],
-            end=[-1, channels4],
-            strides=[1, 1],
-            slice_mode="size",
-        )
-        y5 = relay.nn.max_pool2d(x)
-        y = relay.Tuple((y1, y2, y3, y4, y5))
-        return relay.Function(args, y)
-
-    def check(x_shape, channels1, channels2, channels3, channels4):
-        x = relay.var("x", shape=x_shape)
-        in_c = x_shape[1]
-        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
-        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
-        w3 = relay.var("w3", shape=(channels3, in_c, 3, 3))
-        w4 = relay.var("w4", shape=(channels4, in_c, 1, 1))
-
-        y_before = before(x, w1, w2, w3, w4)
-        y = run_opt_pass(y_before, transform.CombineParallelConv2D(min_num_branches=2))
-        y_expected = expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check((1, 4, 16, 16), 4, 4, 4, 4)
-    check((1, 4, 16, 16), 4, 8, 4, 7)
-
-
-def test_combine_parallel_conv2d_scale_relu():
-    """Testcase of combining conv2d + scale + relu"""
-
-    def before(x, w1, w2, scale1, scale2, bias):
-        args = [x, w1, w2, scale1, scale2, bias]
-        y1 = relay.nn.conv2d(x, w1)
-        y1 = relay.multiply(y1, scale1)
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.conv2d(x, w2)
-        y2 = relay.multiply(y2, scale2)
-        y2 = relay.nn.relu(y2)
-        y2 = relay.add(y2, bias)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2):
-        args = [x, w1, w2, scale1, scale2, bias]
-        w = relay.concatenate((w1, w2), axis=0)
-        scale = relay.concatenate((scale1, scale2), axis=0)
-        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
-        y = relay.multiply(y, scale)
-        y = relay.nn.relu(y)
-        y1 = relay.strided_slice(
-            y, begin=[0, 0], end=[-1, channels1], strides=[1, 1], slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=[0, channels1], end=[-1, channels2], strides=[1, 1], slice_mode="size"
-        )
-        y2 = relay.add(y2, bias)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def check(x_shape, channels1, channels2):
-        x = relay.var("x", shape=x_shape)
-        in_c = x_shape[1]
-        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
-        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
-        scale1 = relay.var("scale1", shape=(channels1, 1, 1))
-        scale2 = relay.var("scale2", shape=(channels2, 1, 1))
-        bias = relay.var("bias", shape=(channels2, 1, 1))
-        y_before = before(x, w1, w2, scale1, scale2, bias)
-        y = run_opt_pass(y_before, transform.CombineParallelConv2D(min_num_branches=2))
-        y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check((1, 4, 16, 16), 4, 8)
-
-
-def test_combine_parallel_conv2d_scale():
-    """Testcase of un-combinable scale"""
-
-    def before(x, w1, w2, scale1, scale2):
-        args = [x, w1, w2, scale1, scale2]
-        y1 = relay.nn.conv2d(x, w1)
-        y1 = relay.multiply(y1, scale1)
-        y2 = relay.nn.conv2d(x, w2)
-        y2 = relay.multiply(y2, scale2)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, scale1, scale2, channels1, channels2):
-        args = [x, w1, w2, scale1, scale2]
-        w = relay.concatenate((w1, w2), axis=0)
-        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
-        y1 = relay.strided_slice(
-            y, begin=[0, 0], end=[-1, channels1], strides=[1, 1], slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=[0, channels1], end=[-1, channels2], strides=[1, 1], slice_mode="size"
-        )
-        y1 = relay.multiply(y1, scale1)
-        y2 = relay.multiply(y2, scale2)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def check(x_shape, channels1, channels2):
-        x = relay.var("x", shape=x_shape)
-        in_c = x_shape[1]
-        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
-        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
-        scale1 = relay.var("scale1", shape=(1,))
-        scale2 = relay.var("scale2", shape=(1,))
-        y_before = before(x, w1, w2, scale1, scale2)
-        y = run_opt_pass(y_before, transform.CombineParallelConv2D(min_num_branches=2))
-        y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check((1, 4, 16, 16), 4, 8)
-
-
-def test_combine_parallel_conv2d_multiple_blocks():
-    def before(x, w, repeat):
-        args = [x, w]
-        y = x
-        for i in range(repeat):
-            y1 = relay.nn.conv2d(y, w)
-            y2 = relay.nn.conv2d(y, w)
-            y = relay.concatenate((y1, y2), axis=1)
-        return relay.Function(args, y)
-
-    def expected(x, w, channels, repeat):
-        args = [x, w]
-        y = x
-        for i in range(repeat):
-            w_concat = relay.concatenate((w, w), axis=0)
-            y = relay.nn.conv2d(y, w_concat, channels=channels * 2)
-            y1 = relay.strided_slice(
-                y, begin=[0, 0], end=[-1, channels], strides=[1, 1], slice_mode="size"
-            )
-            y2 = relay.strided_slice(
-                y, begin=[0, channels], end=[-1, channels], strides=[1, 1], slice_mode="size"
-            )
-            y = relay.concatenate((y1, y2), axis=1)
-        return relay.Function(args, y)
-
-    def check(x_shape, repeat):
-        x = relay.var("x", shape=x_shape)
-        in_c = x_shape[1]
-        out_c = in_c // 2
-        w = relay.var("w", shape=(out_c, in_c, 1, 1))
-        y_before = before(x, w, repeat)
-        y = run_opt_pass(y_before, transform.CombineParallelConv2D(min_num_branches=2))
-        y_expected = expected(x, w, out_c, repeat)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check((1, 4, 16, 16), 4)
-
-
-if __name__ == "__main__":
-    test_combine_parallel_conv2d()
-    test_combine_parallel_conv2d_scale_relu()
-    test_combine_parallel_conv2d_scale()
-    test_combine_parallel_conv2d_multiple_blocks()
diff --git a/tests/python/relay/test_pass_combine_parallel_dense.py b/tests/python/relay/test_pass_combine_parallel_dense.py
deleted file mode 100644
index 2494c1a550cd..000000000000
--- a/tests/python/relay/test_pass_combine_parallel_dense.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay import transform
-
-
-def run_combine_parallel(expr, min_num_branches=3, to_batch=True):
-    mod = tvm.IRModule.from_expr(expr)
-    mod = transform.CombineParallelDense(min_num_branches, to_batch)(mod)
-    return mod["main"]
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = tvm.relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    return mod["main"]
-
-
-def test_combine_parallel_dense():
-    """Simple testcase. One dense cannot be combined due to shape mismatch"""
-
-    def before(x, w1, w2, w3, w4):
-        args = [x, w1, w2, w3, w4]
-        y1 = relay.nn.dense(x, w1)
-        y2 = relay.nn.dense(x, w2)
-
-        # y3 cannot be combined
-        y3 = relay.nn.dense(x, w3)
-
-        y4 = relay.nn.dense(x, w4)
-        y = relay.Tuple((y1, y2, y3, y4))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, w3, w4):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, w1, w2, w3, w4]
-        x_stacked = relay.stack((x, x, x), axis=0)
-        w = relay.stack((w1, w2, w4), axis=0)
-        y = relay.nn.batch_matmul(x_stacked, w)
-        (y1, y2, y4) = relay.split(y, 3)
-        y1 = relay.squeeze(y1, [0])
-        y2 = relay.squeeze(y2, [0])
-        y4 = relay.squeeze(y4, [0])
-
-        # y3 cannot be combined
-        y3 = relay.nn.dense(x, w3)
-
-        y = relay.Tuple((y1, y2, y3, y4))
-        return relay.Function(args, y)
-
-    def check(i, j, k):
-        x = relay.var("x", shape=(i, k))
-        w1 = relay.var("w1", shape=(j, k))
-        w2 = relay.var("w2", shape=(j, k))
-        w3 = relay.var("w3", shape=(j + 1, k))
-        w4 = relay.var("w4", shape=(j, k))
-
-        y_before = before(x, w1, w2, w3, w4)
-        y = run_opt_pass(y_before, transform.CombineParallelDense(min_num_branches=2))
-        y_expected = expected(x, w1, w2, w3, w4)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(3, 5, 4)
-    check(100, 200, 300)
-
-
-def test_combine_parallel_dense_biasadd():
-    """Testcase of combining dense + 1d biasadd"""
-
-    def before(x, w1, w2, b1, b2):
-        args = [x, w1, w2, b1, b2]
-        y1 = relay.nn.dense(x, w1)
-        y2 = relay.nn.dense(x, w2)
-        y1 = relay.add(y1, b1)
-        y2 = relay.add(y2, b2)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, b1, b2, is_2d_bias):
-        args = [x, w1, w2, b1, b2]
-        x_stacked = relay.stack((x, x), axis=0)
-        w = relay.stack((w1, w2), axis=0)
-        y = relay.nn.batch_matmul(x_stacked, w)
-
-        if not is_2d_bias:
-            b1 = relay.expand_dims(b1, 0)
-            b2 = relay.expand_dims(b2, 0)
-
-        b = relay.stack((b1, b2), axis=0)
-        y = relay.add(y, b)
-        (y1, y2) = relay.split(y, 2)
-        y1 = relay.squeeze(y1, [0])
-        y2 = relay.squeeze(y2, [0])
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def check(i, j, k, is_2d_bias):
-        x = relay.var("x", shape=(i, k))
-        w1 = relay.var("w1", shape=(j, k))
-        w2 = relay.var("w2", shape=(j, k))
-
-        if is_2d_bias:
-            b1 = relay.var("b1", shape=(i, j))
-            b2 = relay.var("b2", shape=(i, j))
-        else:
-            b1 = relay.var("b1", shape=(j,))
-            b2 = relay.var("b2", shape=(j,))
-
-        y_before = before(x, w1, w2, b1, b2)
-        y = run_opt_pass(y_before, transform.CombineParallelDense(min_num_branches=2))
-        y_expected = expected(x, w1, w2, b1, b2, is_2d_bias)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(3, 5, 4, False)
-    check(100, 200, 300, False)
-    check(3, 5, 4, True)
-    check(100, 200, 300, True)
-
-
-def test_combine_parallel_dense_biasadd_scale_reshape():
-    """Testcase of combining dense + 1d biasadd + multiply with non-fused reshape"""
-
-    def before(x, w1, w2, b1, b2, scale1, scale2, newshape):
-        args = [x, w1, w2, b1, b2, scale1, scale2]
-        y1 = relay.nn.dense(x, w1)
-        y2 = relay.nn.dense(x, w2)
-        y1 = relay.add(y1, b1)
-        y2 = relay.add(y2, b2)
-        y1 = relay.multiply(y1, scale1)
-        y2 = relay.multiply(y2, scale2)
-        y1 = relay.reshape(y1, newshape=newshape)
-        y2 = relay.reshape(y2, newshape=newshape)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, b1, b2, scale1, scale2, newshape):
-        args = [x, w1, w2, b1, b2, scale1, scale2]
-        x_stacked = relay.stack((x, x), axis=0)
-        w = relay.stack((w1, w2), axis=0)
-        y = relay.nn.batch_matmul(x_stacked, w)
-        b1 = relay.expand_dims(b1, 0)
-        b2 = relay.expand_dims(b2, 0)
-        b = relay.stack((b1, b2), axis=0)
-        y = relay.add(y, b)
-        scale1 = relay.expand_dims(scale1, 0)
-        scale2 = relay.expand_dims(scale2, 0)
-        scale = relay.stack((scale1, scale2), axis=0)
-        y = relay.multiply(y, scale)
-        (y1, y2) = relay.split(y, 2)
-        y1 = relay.squeeze(y1, [0])
-        y2 = relay.squeeze(y2, [0])
-        y1 = relay.reshape(y1, newshape=newshape)
-        y2 = relay.reshape(y2, newshape=newshape)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def check(i, j, k, scale1, scale2, newshape):
-        x = relay.var("x", shape=(i, k))
-        w1 = relay.var("w1", shape=(j, k))
-        w2 = relay.var("w2", shape=(j, k))
-        b1 = relay.var("b1", shape=(j,))
-        b2 = relay.var("b2", shape=(j,))
-        scale1 = relay.var("scale1", shape=(1,))
-        scale2 = relay.var("scale2", shape=(1,))
-
-        y_before = before(x, w1, w2, b1, b2, scale1, scale2, newshape)
-        y = run_opt_pass(y_before, transform.CombineParallelDense(min_num_branches=2))
-        y_expected = expected(x, w1, w2, b1, b2, scale1, scale2, newshape)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(3, 5, 4, 0.5, 0.25, (1, 1, 15))
-    check(100, 200, 300, 0.5, 0.25, (1, 1, 20000))
-
-
-def test_combine_parallel_dense_flat():
-    """Simple testcase. All matmul of different output dim can be combined"""
-
-    def before(x, w1, w2, w3):
-        args = [x, w1, w2, w3]
-        y1 = relay.nn.dense(x, w1)
-        y2 = relay.nn.dense(x, w2)
-        y3 = relay.nn.dense(x, w3)
-        y = relay.Tuple((y1, y2, y3))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, w3, j):
-        args = [x, w1, w2, w3]
-        w_stacked = relay.concatenate((w1, w2, w3), axis=0)
-        y = relay.nn.dense(x, w_stacked, units=6 * j)
-        strides = [1, 1]
-        y1 = relay.strided_slice(y, begin=[0, 0], end=[-1, j], strides=strides, slice_mode="size")
-        y2 = relay.strided_slice(
-            y, begin=[0, j], end=[-1, 2 * j], strides=strides, slice_mode="size"
-        )
-        y3 = relay.strided_slice(
-            y, begin=[0, 3 * j], end=[-1, 3 * j], strides=strides, slice_mode="size"
-        )
-        y = relay.Tuple((y1, y2, y3))
-        return relay.Function(args, y)
-
-    def check(i, j, k):
-        x = relay.var("x", shape=(i, k))
-        w1 = relay.var("w1", shape=(j, k))
-        w2 = relay.var("w2", shape=(2 * j, k))
-        w3 = relay.var("w3", shape=(3 * j, k))
-
-        y_before = before(x, w1, w2, w3)
-        combine_pass = transform.CombineParallelDense(min_num_branches=3, to_batch=False)
-        y = run_opt_pass(y_before, combine_pass)
-        y_expected = expected(x, w1, w2, w3, j)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(3, 5, 4)
-    check(100, 200, 300)
-
-
-def test_combine_parallel_dense_flat_biasadd():
-    """Testcase of combining dense + 1d biasadd with different out dims"""
-
-    def before(x, w1, w2, b1, b2):
-        args = [x, w1, w2, b1, b2]
-        y1 = relay.nn.dense(x, w1)
-        y2 = relay.nn.dense(x, w2)
-        y1 = relay.add(y1, b1)
-        y2 = relay.add(y2, b2)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, b1, b2, j, bias_shape1, bias_shape2):
-        args = [x, w1, w2, b1, b2]
-        w_stacked = relay.concatenate((w1, w2), axis=0)
-        y = relay.nn.dense(x, w_stacked, units=3 * j)
-        n_out_dims = max(len(bias_shape1), 2)
-        if len(bias_shape1) == 0:
-            b1 = relay.repeat(relay.expand_dims(b1, -1), j, 0)
-        elif bias_shape1[-1] == 1:
-            b1 = relay.repeat(b1, j, len(bias_shape1) - 1)
-        if len(bias_shape2) == 0:
-            b2 = relay.repeat(relay.expand_dims(b2, -1), 2 * j, 0)
-        elif bias_shape2[-1] == 1:
-            b2 = relay.repeat(b2, 2 * j, len(bias_shape2) - 1)
-        b = relay.concatenate((b1, b2), axis=max(0, len(bias_shape1) - 1))
-        y = relay.add(y, b)
-        begin = [0 for _ in range(n_out_dims - 1)]
-        end = [-1 for _ in range(n_out_dims - 1)]
-        strides = [1 for _ in range(n_out_dims)]
-        y1 = relay.strided_slice(
-            y, begin=begin + [0], end=end + [j], strides=strides, slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=begin + [j], end=end + [2 * j], strides=strides, slice_mode="size"
-        )
-        return relay.Function(args, relay.Tuple((y1, y2)))
-
-    def check(i, j, k, bias_shape1, bias_shape2):
-        x = relay.var("x", shape=(i, k))
-        w1 = relay.var("w1", shape=(j, k))
-        w2 = relay.var("w2", shape=(2 * j, k))
-        b1 = relay.var("b1", shape=bias_shape1)
-        b2 = relay.var("b2", shape=bias_shape2)
-
-        y_before = before(x, w1, w2, b1, b2)
-        combine_pass = transform.CombineParallelDense(min_num_branches=2, to_batch=False)
-        y = run_opt_pass(y_before, combine_pass)
-        y_expected = expected(x, w1, w2, b1, b2, j, bias_shape1, bias_shape2)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(3, 5, 4, (), ())
-    check(3, 5, 4, (1,), (1,))
-    check(3, 5, 4, (5,), (1,))
-    check(3, 5, 4, (1,), (10,))
-    check(3, 5, 4, (3, 1), (3, 1))
-    check(3, 5, 4, (3, 5), (3, 10))
-    check(3, 5, 4, (3, 1), (3, 10))
-    check(3, 5, 4, (3, 5), (3, 1))
-    check(3, 5, 4, (9, 3, 5), (9, 3, 10))
-    check(3, 5, 4, (9, 3, 5), (9, 3, 1))
-    check(3, 5, 4, (9, 3, 1), (9, 3, 10))
-
-
-def test_combine_parallel_dense_flat_biasadd_scale_reshape():
-    """Testcase of combining dense with different out dims
-    following bias add, scale, reshape ops
-    """
-
-    def before(x, w1, w2, b1, b2, scale1, scale2, newshape1, newshape2):
-        args = [x, w1, w2, b1, b2, scale1, scale2]
-        y1 = relay.nn.dense(x, w1)
-        y2 = relay.nn.dense(x, w2)
-        y1 = relay.add(y1, b1)
-        y2 = relay.add(y2, b2)
-        y1 = relay.multiply(y1, scale1)
-        y2 = relay.multiply(y2, scale2)
-        y1 = relay.reshape(y1, newshape=newshape1)
-        y2 = relay.reshape(y2, newshape=newshape2)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2, b1, b2, scale1, scale2, newshape1, newshape2, j):
-        args = [x, w1, w2, b1, b2, scale1, scale2]
-        w_stacked = relay.concatenate((w1, w2), axis=0)
-        y = relay.nn.dense(x, w_stacked, units=3 * j)
-        b = relay.concatenate((b1, b2), axis=0)
-        y = relay.add(y, b)
-        scale1 = relay.repeat(scale1, j, 0)
-        scale2 = relay.repeat(scale2, 2 * j, 0)
-        scale = relay.concatenate((scale1, scale2), axis=0)
-        y = relay.multiply(y, scale)
-        strides = [1, 1]
-        y1 = relay.strided_slice(y, begin=[0, 0], end=[-1, j], strides=strides, slice_mode="size")
-        y2 = relay.strided_slice(
-            y, begin=[0, j], end=[-1, 2 * j], strides=strides, slice_mode="size"
-        )
-        y1 = relay.reshape(y1, newshape=newshape1)
-        y2 = relay.reshape(y2, newshape=newshape2)
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def check(i, j, k, scale1, scale2, newshape1, newshape2):
-        x = relay.var("x", shape=(i, k))
-        w1 = relay.var("w1", shape=(j, k))
-        w2 = relay.var("w2", shape=(2 * j, k))
-        b1 = relay.var("b1", shape=(j,))
-        b2 = relay.var("b2", shape=(2 * j,))
-        scale1 = relay.var("scale1", shape=(1,))
-        scale2 = relay.var("scale2", shape=(1,))
-
-        y_before = before(x, w1, w2, b1, b2, scale1, scale2, newshape1, newshape2)
-        combine_pass = transform.CombineParallelDense(min_num_branches=2, to_batch=False)
-        y = run_opt_pass(y_before, combine_pass)
-        y_expected = expected(x, w1, w2, b1, b2, scale1, scale2, newshape1, newshape2, j)
-        y_expected = run_opt_pass(y_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-    check(3, 5, 4, 0.5, 0.25, (1, 1, 15), (1, 1, 30))
-    check(100, 200, 300, 0.5, 0.25, (1, 1, 20000), (1, 1, 40000))
-
-
-def test_combine_parallel_dense_expand_dims():
-    """Verify that the correct slice axis is selected after the combined dense."""
-
-    def before(x, w1, w2):
-        args = [x, w1, w2]
-        y1 = relay.nn.dense(x, w1)
-        y1 = relay.expand_dims(y1, axis=2)
-
-        y2 = relay.nn.dense(x, w2)
-        y2 = relay.expand_dims(y2, axis=2)
-
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    def expected(x, w1, w2):
-        args = [x, w1, w2]
-        w_stacked = relay.concatenate((w1, w2), axis=0)
-        y = relay.nn.dense(x, w_stacked, units=24)
-        y = relay.expand_dims(y, axis=2)
-
-        strides = [1, 1, 1]
-        y1 = relay.strided_slice(
-            y, begin=[0, 0, 0], end=[-1, 16, -1], strides=strides, slice_mode="size"
-        )
-        y2 = relay.strided_slice(
-            y, begin=[0, 16, 0], end=[-1, 8, -1], strides=strides, slice_mode="size"
-        )
-        y = relay.Tuple((y1, y2))
-        return relay.Function(args, y)
-
-    x = relay.var("x", shape=(2, 32))
-    w1 = relay.var("w1", shape=(16, 32))
-    w2 = relay.var("w2", shape=(8, 32))
-
-    y_before = before(x, w1, w2)
-    combine_pass = transform.CombineParallelDense(min_num_branches=2, to_batch=False)
-    y = run_opt_pass(y_before, combine_pass)
-    y_expected = expected(x, w1, w2)
-    y_expected = run_opt_pass(y_expected, transform.InferType())
-    tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py
deleted file mode 100644
index 5450f1aa6906..000000000000
--- a/tests/python/relay/test_pass_convert_op_layout.py
+++ /dev/null
@@ -1,2917 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test alter op layout pass"""
-import pytest
-import tvm
-from tvm import relay, te
-from tvm.relay import analysis, transform
-from tvm.relay.op import op as reg
-from tvm.relay.op import register_alter_op_layout
-from tvm.relay.quantize._annotate import (
-    attach_simulated_quantize,
-    QAnnotateKind,
-)
-from tvm.relay.transform.infer_layout_utils import InferCorrectLayoutOutput
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_no_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        return before()
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_binary_no_convert_layout():
-    def before():
-        x = relay.var("x", shape=(2, 2))
-        y = relay.var("y", shape=(1, 2))
-        return relay.Function(
-            [x, y],
-            relay.qnn.op.add(
-                x,
-                y,
-                lhs_scale=relay.const(0.0156863, "float32"),
-                lhs_zero_point=relay.const(127, "int32"),
-                rhs_scale=relay.const(0.0117647, "float32"),
-                rhs_zero_point=relay.const(85, "int32"),
-                output_scale=relay.const(0.0235294, "float32"),
-                output_zero_point=relay.const(128, "int32"),
-            ),
-        )
-
-    def expected():
-        return before()
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({}))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_nhwc_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight = relay.layout_transform(weight, "OIHW", "HWIO")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_transpose_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d_transpose(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "IOHW")
-        y = relay.nn.conv2d_transpose(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d_transpose": ["NCHW", "IOHW"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_bias_pool_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        bias = relay.var("bias", shape=(64,))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.bias_add(y, bias, axis=3)
-        # a useless tuple, which will be eliminated
-        y = relay.Tuple([y])[0]
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC")
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        bias = relay.var("bias", shape=(64,))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-
-        bias = relay.expand_dims(bias, axis=0, num_newaxis=3)
-        bias = relay.layout_transform(bias, "NHWC", "NCHW")
-        y = relay.add(y, bias)
-        # a useless tuple, which will be eliminated
-        y = relay.Tuple([y])[0]
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2))
-        y = relay.cast(y, "int32")
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_bias_pool_uses_specified_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        bias = relay.var("bias", shape=(64,))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.bias_add(y, bias, axis=3)
-        # a useless tuple, which will be eliminated
-        y = relay.Tuple([y])[0]
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC")
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        bias = relay.var("bias", shape=(64,))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-
-        bias = relay.expand_dims(bias, axis=0, num_newaxis=3)
-        bias = relay.layout_transform(bias, "NHWC", "NCHW")
-        y = relay.add(y, bias)
-        # a useless tuple, which will be eliminated
-        y = relay.Tuple([y])[0]
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC", out_layout="NHWC")
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(
-        a,
-        transform.ConvertLayout({"nn.conv2d": ["NCHW", "OIHW"], "nn.max_pool2d": ["NHWC"]}),
-    )
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_concat_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64))
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y1 = relay.nn.conv2d(
-            y,
-            weight2,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        ret = relay.concatenate([y, y1], axis=3)
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64))
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64))
-        weight1 = relay.layout_transform(weight1, "HWIO", "OIHW")
-        weight2 = relay.layout_transform(weight2, "HWIO", "OIHW")
-        y = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.nn.conv2d(y, weight1, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y1 = relay.nn.conv2d(y, weight2, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        ret = relay.concatenate([y, y1], axis=1)
-        ret = relay.layout_transform(ret, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_deformable_conv_bias_pool_convert_layout():
-    def before(N, CI, H, W, CO, KH, KW, layout):
-        if layout == "NCHW":
-            data_shape = (N, CI, H, W)
-            weight_shape = (CO, CI, KH, KW)
-            kernel_layout = "OIHW"
-        else:
-            data_shape = (N, H, W, CI)
-            weight_shape = (KH, KW, CI, CO)
-            kernel_layout = "HWIO"
-        bias_shape = (CO,)
-
-        data = relay.var("data", shape=data_shape, dtype="float32")
-        offset = relay.var("offset")
-        weight = relay.var("weight", shape=weight_shape, dtype="float32")
-        bias = relay.var("bias", shape=bias_shape, dtype="float32")
-
-        y = relay.nn.deformable_conv2d(
-            data,
-            offset,
-            weight,
-            kernel_size=(KH, KW),
-            channels=CO,
-            data_layout=layout,
-            kernel_layout=kernel_layout,
-        )
-        y = relay.nn.bias_add(y, bias, axis=-1 if layout == "NHWC" else 1)
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout=layout)
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected(N, CI, H, W, CO, KH, KW, OH, OW, src_layout, dst_layout):
-        layout_map = {"src": {}, "dst": {}}
-        if src_layout == "NCHW":
-            nchw = layout_map["src"]
-            nhwc = layout_map["dst"]
-        else:
-            nchw = layout_map["dst"]
-            nhwc = layout_map["src"]
-
-        nchw["data_layout"] = "NCHW"
-        nchw["data_shape"] = (N, CI, H, W)
-        nchw["offset_shape"] = (N, KH * KW * 2, OH, OW)
-        nchw["weight_shape"] = (CO, CI, KH, KW)
-        nchw["kernel_layout"] = "OIHW"
-
-        nhwc["data_layout"] = "NHWC"
-        nhwc["data_shape"] = (N, H, W, CI)
-        nhwc["offset_shape"] = (N, OH, OW, KH * KW * 2)
-        nhwc["weight_shape"] = (KH, KW, CI, CO)
-        nhwc["kernel_layout"] = "HWIO"
-
-        bias_shape = (CO,)
-
-        data = relay.var("data", shape=layout_map["src"]["data_shape"], dtype="float32")
-        offset = relay.var("offset", shape=layout_map["src"]["offset_shape"], dtype="float32")
-        weight = relay.var("weight", shape=layout_map["src"]["weight_shape"], dtype="float32")
-        bias = relay.var("bias", shape=bias_shape, dtype="float32")
-
-        data = relay.layout_transform(
-            data, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"]
-        )
-        offset = relay.layout_transform(
-            offset, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"]
-        )
-        weight = relay.layout_transform(
-            weight, layout_map["src"]["kernel_layout"], layout_map["dst"]["kernel_layout"]
-        )
-        y = relay.nn.deformable_conv2d(
-            data,
-            offset,
-            weight,
-            kernel_size=(KH, KW),
-            channels=CO,
-            data_layout=layout_map["dst"]["data_layout"],
-            kernel_layout=layout_map["dst"]["kernel_layout"],
-        )
-        if layout_map["src"]["data_layout"] == "NHWC":
-            bias = relay.expand_dims(bias, axis=0, num_newaxis=3)
-        else:
-            bias = relay.expand_dims(bias, axis=1, num_newaxis=2)
-            bias = relay.expand_dims(bias, axis=0)
-        bias = relay.layout_transform(
-            bias, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"]
-        )
-        y = relay.add(y, bias)
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout=layout_map["dst"]["data_layout"])
-        y = relay.cast(y, "int32")
-        y = relay.layout_transform(
-            y, layout_map["dst"]["data_layout"], layout_map["src"]["data_layout"]
-        )
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    # NHWC -> NCHW
-    a = before(1, 3, 224, 224, 32, 3, 3, "NHWC")
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.deformable_conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(
-        expected(1, 3, 224, 224, 32, 3, 3, 222, 222, "NHWC", "NCHW"), transform.InferType()
-    )
-    tvm.ir.assert_structural_equal(a, b)
-
-    # NCHW -> NHWC
-    a = before(1, 3, 224, 224, 32, 3, 3, "NCHW")
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.deformable_conv2d": ["NHWC", "default"]}))
-    b = run_opt_pass(
-        expected(1, 3, 224, 224, 32, 3, 3, 222, 222, "NCHW", "NHWC"), transform.InferType()
-    )
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_deformable_conv_bias_pool_uses_specified_convert_layout():
-    def before(N, CI, H, W, CO, KH, KW, layout):
-        if layout == "NCHW":
-            data_shape = (N, CI, H, W)
-            weight_shape = (CO, CI, KH, KW)
-            kernel_layout = "OIHW"
-        else:
-            data_shape = (N, H, W, CI)
-            weight_shape = (KH, KW, CI, CO)
-            kernel_layout = "HWIO"
-        bias_shape = (CO,)
-
-        data = relay.var("data", shape=data_shape, dtype="float32")
-        offset = relay.var("offset")
-        weight = relay.var("weight", shape=weight_shape, dtype="float32")
-        bias = relay.var("bias", shape=bias_shape, dtype="float32")
-
-        y = relay.nn.deformable_conv2d(
-            data,
-            offset,
-            weight,
-            kernel_size=(KH, KW),
-            channels=CO,
-            data_layout=layout,
-            kernel_layout=kernel_layout,
-        )
-        y = relay.nn.bias_add(y, bias, axis=-1 if layout == "NHWC" else 1)
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout=layout)
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected(N, CI, H, W, CO, KH, KW, OH, OW, src_layout, dst_layout, max_pool_layout=None):
-        layout_map = {"src": {}, "dst": {}}
-        if src_layout == "NCHW":
-            nchw = layout_map["src"]
-            nhwc = layout_map["dst"]
-        else:
-            nchw = layout_map["dst"]
-            nhwc = layout_map["src"]
-
-        nchw["data_layout"] = "NCHW"
-        nchw["data_shape"] = (N, CI, H, W)
-        nchw["offset_shape"] = (N, KH * KW * 2, OH, OW)
-        nchw["weight_shape"] = (CO, CI, KH, KW)
-        nchw["kernel_layout"] = "OIHW"
-
-        nhwc["data_layout"] = "NHWC"
-        nhwc["data_shape"] = (N, H, W, CI)
-        nhwc["offset_shape"] = (N, OH, OW, KH * KW * 2)
-        nhwc["weight_shape"] = (KH, KW, CI, CO)
-        nhwc["kernel_layout"] = "HWIO"
-
-        bias_shape = (CO,)
-
-        data = relay.var("data", shape=layout_map["src"]["data_shape"], dtype="float32")
-        offset = relay.var("offset", shape=layout_map["src"]["offset_shape"], dtype="float32")
-        weight = relay.var("weight", shape=layout_map["src"]["weight_shape"], dtype="float32")
-        bias = relay.var("bias", shape=bias_shape, dtype="float32")
-
-        data = relay.layout_transform(
-            data, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"]
-        )
-        offset = relay.layout_transform(
-            offset, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"]
-        )
-        weight = relay.layout_transform(
-            weight, layout_map["src"]["kernel_layout"], layout_map["dst"]["kernel_layout"]
-        )
-        y = relay.nn.deformable_conv2d(
-            data,
-            offset,
-            weight,
-            kernel_size=(KH, KW),
-            channels=CO,
-            data_layout=layout_map["dst"]["data_layout"],
-            kernel_layout=layout_map["dst"]["kernel_layout"],
-        )
-        if layout_map["src"]["data_layout"] == "NHWC":
-            bias = relay.expand_dims(bias, axis=0, num_newaxis=3)
-        else:
-            bias = relay.expand_dims(bias, axis=1, num_newaxis=2)
-            bias = relay.expand_dims(bias, axis=0)
-        bias = relay.layout_transform(
-            bias, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"]
-        )
-        y = relay.add(y, bias)
-        y = relay.nn.relu(y)
-        if max_pool_layout != layout_map["dst"]["data_layout"]:
-            y = relay.layout_transform(y, layout_map["dst"]["data_layout"], max_pool_layout)
-        y = relay.nn.max_pool2d(
-            y, pool_size=(2, 2), layout=max_pool_layout, out_layout=max_pool_layout
-        )
-        y = relay.cast(y, "int32")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    # NHWC -> NCHW
-    a = before(1, 3, 224, 224, 32, 3, 3, "NHWC")
-    a = run_opt_pass(
-        a,
-        transform.ConvertLayout(
-            {"nn.deformable_conv2d": ["NCHW", "default"], "nn.max_pool2d": ["NHWC"]}
-        ),
-    )
-    # - in the before() func, its last argument "NHWC" is also the layout of max_pool
-    b = run_opt_pass(
-        # max_pool has its own layout argument
-        expected(1, 3, 224, 224, 32, 3, 3, 222, 222, "NHWC", "NCHW", max_pool_layout="NHWC"),
-        transform.InferType(),
-    )
-    tvm.ir.assert_structural_equal(a, b)
-
-    # NCHW -> NHWC
-    a = before(1, 3, 224, 224, 32, 3, 3, "NCHW")
-    a = run_opt_pass(
-        a,
-        transform.ConvertLayout(
-            {"nn.deformable_conv2d": ["NHWC", "default"], "nn.max_pool2d": ["NCHW"]}
-        ),
-    )
-    # - in the before() func, its last argument "NCHW" is also the layout of max_pool
-    b = run_opt_pass(
-        # max_pool has its own layout argument
-        expected(1, 3, 224, 224, 32, 3, 3, 222, 222, "NCHW", "NHWC", max_pool_layout="NCHW"),
-        transform.InferType(),
-    )
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_dual_path_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        weight2 = relay.var("weight2", shape=(3, 3, 32, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y1 = relay.nn.conv2d(
-            y,
-            weight2,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.batch_flatten(y)
-        ret = relay.Tuple([y1, y2])
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        weight2 = relay.var("weight2", shape=(3, 3, 32, 32))
-        weight1 = relay.layout_transform(weight1, "HWIO", "OIHW")
-        weight2 = relay.layout_transform(weight2, "HWIO", "OIHW")
-        y = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.nn.conv2d(y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y1 = relay.nn.conv2d(y, weight2, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y1 = relay.nn.relu(y1)
-        y1 = relay.layout_transform(y1, "NCHW", "NHWC")
-        y2 = relay.layout_transform(y, "NCHW", "NHWC")
-        y2 = relay.nn.batch_flatten(y2)
-        ret = relay.Tuple([y1, y2])
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_bn_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        gamma = relay.var("gamma")
-        beta = relay.var("beta")
-        mean = relay.var("mean")
-        variance = relay.var("variance")
-        y, _, _ = relay.nn.batch_norm(y, gamma, beta, mean, variance, axis=3)
-        return relay.Function(analysis.free_vars(y), y)
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-
-    # Check that there is only 1 NHWC to NCHW transform.
-    has_lt = list()
-    find_op = lambda x: has_lt.append(
-        isinstance(x, tvm.relay.expr.Call)
-        and x.op.name == "layout_transform"
-        and x.attrs.src_layout == "NCHW"
-        and x.attrs.dst_layout == "NHWC"
-    )
-    relay.analysis.post_order_visit(a, find_op)
-    has_lt = list(filter(lambda x: x, has_lt))
-    assert len(has_lt) == 1
-
-
-def test_slice_like_convert_layout():
-    def verify_slice_like(after, expected_axes):
-        # Verify if the slice_like after the convert layout has the expected axes.
-        has_expected = list()
-        checker = lambda x: has_expected.append(
-            isinstance(x, tvm.relay.expr.Call)
-            and x.op.name == "slice_like"
-            and str(x.attrs.axes) == str(expected_axes)
-        )
-        relay.analysis.post_order_visit(after, checker)
-        assert any(has_expected)
-
-    def func_nhwc():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        out = relay.slice_like(y, y, axes=[1, 2])
-        return relay.Function(analysis.free_vars(out), out)
-
-    after = run_opt_pass(func_nhwc(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    verify_slice_like(after, [2, 3])
-
-    def func_nchw():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(32, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        out = relay.slice_like(y, y, axes=[2, 3])
-        return relay.Function(analysis.free_vars(out), out)
-
-    after = run_opt_pass(func_nchw(), transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
-    verify_slice_like(after, [1, 2])
-
-    def func_vars():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        # z has no layout information so convert layout won't happen.
-        z = relay.var("y", shape=(1, 56, 56, 32))
-        out = relay.slice_like(y, z, axes=[1, 2])
-        return relay.Function(analysis.free_vars(out), out)
-
-    after = run_opt_pass(func_vars(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    verify_slice_like(after, [1, 2])
-
-
-def test_transpose_convert_layout():
-    def verify_transpose(after, expected_axes, expected_transform_cnt):
-        # Verify if the transpose after the convert layout has the expected axes.
-        has_expected = list()
-        checker = lambda x: has_expected.append(
-            isinstance(x, tvm.relay.expr.Call)
-            and x.op.name == "transpose"
-            and str(x.attrs.axes) == str(expected_axes)
-        )
-        relay.analysis.post_order_visit(after, checker)
-        assert any(has_expected), after
-
-        is_transform = list()
-        checker = lambda x: is_transform.append(
-            1 if isinstance(x, tvm.relay.expr.Call) and x.op.name == "layout_transform" else 0
-        )
-        relay.analysis.post_order_visit(after, checker)
-        assert (
-            sum(is_transform) == expected_transform_cnt
-        ), "Expected %s layout_transform, but get\n%s" % (expected_transform_cnt, after)
-
-    def nhwc_to_nchw():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        z = relay.var("z", shape=(56, 56, 32))
-        out = relay.add(y, z)
-        out = relay.transpose(out, axes=[0, 3, 1, 2])
-        out = relay.nn.batch_flatten(out)
-        func = relay.Function(analysis.free_vars(out), out)
-        return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-
-    verify_transpose(nhwc_to_nchw(), [0, 1, 2, 3], 3)
-
-    def nchw_to_nhwc():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(32, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        z = relay.var("z", shape=(32, 56, 56))
-        out = relay.add(y, z)
-        out = relay.transpose(out, axes=[0, 2, -1, 1])  # Also test a negative axis.
-        out = relay.nn.batch_flatten(out)
-        func = relay.Function(analysis.free_vars(out), out)
-        return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
-
-    verify_transpose(nchw_to_nhwc(), [0, 1, 2, 3], 3)
-
-    def default_axes():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(32, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        z = relay.var("z", shape=(32, 56, 56))
-        out = relay.add(y, z)
-        out = relay.transpose(out)  # No axes provided, will use the reversed axes.
-        func = relay.Function(analysis.free_vars(out), out)
-        return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
-
-    verify_transpose(default_axes(), [2, 1, 3, 0], 3)
-
-
-def test_resnet_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        weight2 = relay.var("weight2", shape=(1, 1, 64, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y2 = relay.nn.conv2d(
-            x, weight2, channels=32, kernel_size=(1, 1), data_layout="NHWC", kernel_layout="HWIO"
-        )
-        y2 = relay.nn.relu(y2)
-        y = y + y2
-        y = relay.nn.global_max_pool2d(y, layout="NHWC")
-        return relay.Function(analysis.free_vars(y), y)
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        weight2 = relay.var("weight2", shape=(1, 1, 64, 32))
-        weight1 = relay.layout_transform(weight1, "HWIO", "OIHW")
-        weight2 = relay.layout_transform(weight2, "HWIO", "OIHW")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y2 = relay.nn.conv2d(x, weight2, channels=32, kernel_size=(1, 1))
-        y2 = relay.nn.relu(y2)
-        y = y + y2
-        y = relay.nn.global_max_pool2d(y)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        return relay.Function(analysis.free_vars(y), y)
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_resnet_pool_uses_specified_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        weight2 = relay.var("weight2", shape=(1, 1, 64, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y2 = relay.nn.conv2d(
-            x, weight2, channels=32, kernel_size=(1, 1), data_layout="NHWC", kernel_layout="HWIO"
-        )
-        y2 = relay.nn.relu(y2)
-        y = y + y2
-        y = relay.nn.global_max_pool2d(y, layout="NHWC")
-        return relay.Function(analysis.free_vars(y), y)
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
-        weight2 = relay.var("weight2", shape=(1, 1, 64, 32))
-        weight1 = relay.layout_transform(weight1, "HWIO", "OIHW")
-        weight2 = relay.layout_transform(weight2, "HWIO", "OIHW")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y2 = relay.nn.conv2d(x, weight2, channels=32, kernel_size=(1, 1))
-        y2 = relay.nn.relu(y2)
-        y = y + y2
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.nn.global_max_pool2d(y, layout="NHWC", out_layout="NHWC")
-        return relay.Function(analysis.free_vars(y), y)
-
-    a = before()
-    a = run_opt_pass(
-        a,
-        transform.ConvertLayout(
-            {"nn.conv2d": ["NCHW", "default"], "nn.global_max_pool2d": ["NHWC"]}
-        ),
-    )
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_scalar_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.add(y, relay.const(1, "float32"))
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        w = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        w = relay.layout_transform(w, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.add(y, relay.const(1.0, "float32"))
-
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_ln_convert_layout():
-    """Check that layout transforms are propagated through ln."""
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-
-        dtype = "float32"
-        beta = relay.var("beta", relay.TensorType((64,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((64,), dtype))
-
-        y = relay.nn.layer_norm(y, gamma, beta, axis=3)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        w = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        w = relay.layout_transform(w, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
-
-        dtype = "float32"
-        beta = relay.var("beta", relay.TensorType((64,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((64,), dtype))
-
-        y = relay.nn.layer_norm(y, gamma, beta, axis=1)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_InstanceNorm_convert_layout():
-    """Check that layout transforms are propagated through instance norm."""
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-
-        dtype = "float32"
-        beta = relay.var("beta", relay.TensorType((64,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((64,), dtype))
-
-        y = relay.nn.instance_norm(y, gamma, beta, axis=3)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        w = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        w = relay.layout_transform(w, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
-
-        dtype = "float32"
-        beta = relay.var("beta", relay.TensorType((64,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((64,), dtype))
-
-        y = relay.nn.instance_norm(y, gamma, beta, axis=1)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_bn_convert_layout():
-    """Check that layout transforms are propagated through bn."""
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-
-        dtype = "float32"
-        beta = relay.var("beta", relay.TensorType((64,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((64,), dtype))
-        moving_mean = relay.var("moving_mean", relay.TensorType((64,), dtype))
-        moving_var = relay.var("moving_var", relay.TensorType((64,), dtype))
-
-        y = relay.nn.batch_norm(y, gamma, beta, moving_mean, moving_var, axis=3)
-        y = relay.nn.relu(y[0])
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        w = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        w = relay.layout_transform(w, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
-
-        dtype = "float32"
-        beta = relay.var("beta", relay.TensorType((64,), dtype))
-        gamma = relay.var("gamma", relay.TensorType((64,), dtype))
-        moving_mean = relay.var("moving_mean", relay.TensorType((64,), dtype))
-        moving_var = relay.var("moving_var", relay.TensorType((64,), dtype))
-
-        y = relay.nn.batch_norm(y, gamma, beta, moving_mean, moving_var, axis=1)
-        y = relay.nn.relu(y[0])
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_conv_requantize_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.qnn.op.requantize(
-            y,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            out_dtype="int32",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.qnn.op.requantize(
-            y,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            axis=1,
-            out_dtype="int32",
-        )
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"qnn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_conv_concat_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64), dtype="int8")
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64), dtype="int8")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight1,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y1 = relay.qnn.op.conv2d(
-            y,
-            weight2,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.cast(y, "int8")
-        y1 = relay.cast(y, "int8")
-        ret = relay.qnn.op.concatenate(
-            [y, y1],
-            [relay.const(1, "float32"), relay.const(1, "float32")],
-            [relay.const(1, "int32"), relay.const(1, "int32")],
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            axis=3,
-        )
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64), dtype="int8")
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64), dtype="int8")
-        weight1 = relay.layout_transform(weight1, "HWIO", "OIHW")
-        weight2 = relay.layout_transform(weight2, "HWIO", "OIHW")
-        y = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.qnn.op.conv2d(
-            y,
-            weight1,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y1 = relay.qnn.op.conv2d(
-            y,
-            weight2,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.cast(y, "int8")
-        y1 = relay.cast(y, "int8")
-        ret = relay.qnn.op.concatenate(
-            [y, y1],
-            [relay.const(1, "float32"), relay.const(1, "float32")],
-            [relay.const(1, "int32"), relay.const(1, "int32")],
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            axis=1,
-        )
-        ret = relay.layout_transform(ret, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"qnn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_conv_add_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64), dtype="int8")
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64), dtype="int8")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight1,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y1 = relay.qnn.op.conv2d(
-            y,
-            weight2,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.cast(y, "int8")
-        y1 = relay.cast(y, "int8")
-        ret = relay.qnn.op.add(
-            y,
-            y1,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-        )
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight1 = relay.var("weight1", shape=(3, 3, 64, 64), dtype="int8")
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64), dtype="int8")
-        weight1 = relay.layout_transform(weight1, "HWIO", "OIHW")
-        weight2 = relay.layout_transform(weight2, "HWIO", "OIHW")
-        y = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.qnn.op.conv2d(
-            y,
-            weight1,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y1 = relay.qnn.op.conv2d(
-            y,
-            weight2,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.cast(y, "int8")
-        y1 = relay.cast(y, "int8")
-        ret = relay.qnn.op.add(
-            y,
-            y1,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-        )
-        ret = relay.layout_transform(ret, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"qnn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_conv_nhwc_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56), dtype="int8")
-        weight = relay.var("weight", shape=(64, 64, 3, 3), dtype="int8")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56), dtype="int8")
-        weight = relay.var("weight", shape=(64, 64, 3, 3), dtype="int8")
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight = relay.layout_transform(weight, "OIHW", "HWIO")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"qnn.conv2d": ["NHWC", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_conv_transpose_requantize_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        y = relay.qnn.op.conv2d_transpose(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-        )
-        y = relay.qnn.op.requantize(
-            y,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            out_dtype="int32",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "IOHW")
-        y = relay.qnn.op.conv2d_transpose(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            out_dtype="int32",
-        )
-        y = relay.qnn.op.requantize(
-            y,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            axis=1,
-            out_dtype="int32",
-        )
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"qnn.conv2d_transpose": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_convert_kernel_layout():
-    """Check that convolution kernel layout is correctly transformed."""
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        w = relay.var("weight", shape=(3, 3, 64, 64))
-        w = relay.layout_transform(w, "HWIO", "OHWI")
-        y = relay.nn.conv2d(
-            x,
-            w,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_conv_avgpool_2d_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.cast(y, "int8")
-        y = relay.qnn.op.avg_pool2d(
-            y,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            layout="NHWC",
-            out_layout="NHWC",
-            pool_size=(3, 3),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-        )
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        y = relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.cast(y, "int8")
-        y = relay.qnn.op.avg_pool2d(
-            y,
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "int32"),
-            layout="NCHW",
-            out_layout="NCHW",
-            pool_size=(3, 3),
-            padding=(0, 0),
-            strides=(1, 1),
-            dilation=(1, 1),
-        )
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(
-        a, transform.ConvertLayout({"qnn.conv2d": ["NCHW", "default"], "qnn.avg_pool2d": ["NCHW"]})
-    )
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_roi_align_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        rois = relay.var("rois", shape=(32, 5))
-        y = relay.vision.roi_align(
-            y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW"
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight1 = relay.layout_transform(weight1, "OIHW", "HWIO")
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        rois = relay.var("rois", shape=(32, 5))
-        y = relay.vision.roi_align(
-            y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NHWC"
-        )
-        ret = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    a = before()
-    desired_layouts = {
-        "nn.conv2d": ["NHWC", "HWIO"],
-        "vision.roi_align": ["NHWC", "default"],
-    }
-    a = run_opt_pass(a, transform.ConvertLayout(desired_layouts))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_strided_slice_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.nn.relu(y)
-        y = relay.strided_slice(y, begin=[0, 1], end=[1, -1, 10], strides=[1, 1, 2, 1])
-        y = relay.Function([x, weight], y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight = relay.layout_transform(weight, "OIHW", "HWIO")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.strided_slice(y, begin=[0, 0, 0, 1], end=[1, 10, 56, -1], strides=[1, 2, 1, 1])
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.Function(relay.analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_split_convert_layout():
-    def _test_conv_split_convert_layout1():
-        def before():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=512,
-                kernel_size=(3, 3),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=2, axis=-1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            out = relay.Tuple([a, b])
-            return relay.Function(analysis.free_vars(out), out)
-
-        def expected():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=512, kernel_size=(3, 3))
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=2, axis=1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            a = relay.layout_transform(a, "NCHW", "NHWC")
-            b = relay.layout_transform(b, "NCHW", "NHWC")
-            out = relay.Tuple([a, b])
-            return relay.Function(analysis.free_vars(out), out)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_conv_split_convert_layout2():
-        def before():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=512,
-                kernel_size=(3, 3),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=2, axis=3).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            out = relay.Tuple([a, b])
-            return relay.Function(analysis.free_vars(out), out)
-
-        def expected():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=512, kernel_size=(3, 3))
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=2, axis=1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            a = relay.layout_transform(a, "NCHW", "NHWC")
-            b = relay.layout_transform(b, "NCHW", "NHWC")
-            out = relay.Tuple([a, b])
-            return relay.Function(analysis.free_vars(out), out)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_conv_split_convert_layout3():
-        def before():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=512,
-                kernel_size=(3, 3),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=(5, 10), axis=-1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            c = relay.TupleGetItem(y, 2)
-            out = relay.Tuple([a, b, c])
-            return relay.Function(analysis.free_vars(out), out)
-
-        def expected():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=512, kernel_size=(3, 3))
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=(5, 10), axis=1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            c = relay.TupleGetItem(y, 2)
-            a = relay.layout_transform(a, "NCHW", "NHWC")
-            b = relay.layout_transform(b, "NCHW", "NHWC")
-            c = relay.layout_transform(c, "NCHW", "NHWC")
-            out = relay.Tuple([a, b, c])
-            return relay.Function(analysis.free_vars(out), out)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_conv_split_convert_layout_blocking():
-        def before():
-            x = relay.var("x", shape=(1, 512, 38, 38))
-            weight = relay.var("weight", shape=(512, 512, 3, 3))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=512,
-                kernel_size=(3, 3),
-                data_layout="NCHW",
-                kernel_layout="OIHW",
-            )
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=[256], axis=1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            out = relay.Tuple([a, b])
-            return relay.Function(analysis.free_vars(out), out)
-
-        def expected():
-            x = relay.var("x", shape=(1, 512, 38, 38))
-            weight = relay.var("weight", shape=(512, 512, 3, 3))
-            weight = relay.layout_transform(weight, "OIHW", "OIHW4o")
-            x = relay.layout_transform(x, "NCHW", "NCHW4c")
-            y = relay.op.nn.contrib_conv2d_nchwc(
-                x,
-                weight,
-                channels=512,
-                kernel_size=(3, 3),
-                padding=(0, 0),
-                data_layout="NCHW4c",
-                kernel_layout="OIHW4o",
-            )
-            y = relay.nn.relu(y)
-            y = relay.op.split(y, indices_or_sections=[64], axis=1).astuple()
-            a = relay.TupleGetItem(y, 0)
-            b = relay.TupleGetItem(y, 1)
-            a = relay.layout_transform(a, "NCHW4c", "NCHW")
-            b = relay.layout_transform(b, "NCHW4c", "NCHW")
-            out = relay.Tuple([a, b])
-            return relay.Function(analysis.free_vars(out), out)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW4c", "OIHW4o"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    _test_conv_split_convert_layout1()
-    _test_conv_split_convert_layout2()
-    _test_conv_split_convert_layout3()
-    _test_conv_split_convert_layout_blocking()
-
-
-def test_conv_strided_slice_axes_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 28, 28, 32))
-        weight = relay.var("weight", shape=(3, 3, 32, 32))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.strided_slice(y, begin=[0, 16], end=[1, 33], strides=[1, 1], axes=[0, 3])
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 28, 28, 32))
-        weight = relay.var("weight", shape=(3, 3, 32, 32))
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=32,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.strided_slice(y, begin=[0, 16], end=[1, 33], strides=[1, 1], axes=[0, 1])
-
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = run_opt_pass(before(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_topk_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.topk(y, k=2, axis=2)
-        if isinstance(y, relay.expr.TupleWrapper):
-            y = y.astuple()
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        weight = relay.layout_transform(weight, "HWIO", "OIHW")
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.topk(y, k=2, axis=3).astuple()
-        a = relay.TupleGetItem(y, 0)
-        b = relay.TupleGetItem(y, 1)
-        a = relay.layout_transform(a, "NCHW", "NHWC")
-        b = relay.layout_transform(b, "NCHW", "NHWC")
-        out = relay.Tuple([a, b])
-        return relay.Function(analysis.free_vars(out), out)
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_roi_pool_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        rois = relay.var("rois", shape=(32, 5))
-        y = relay.vision.roi_pool(
-            y, rois, pooled_size=(14, 14), spatial_scale=0.0625, layout="NCHW"
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight1 = relay.layout_transform(weight1, "OIHW", "HWIO")
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        rois = relay.var("rois", shape=(32, 5))
-        y = relay.vision.roi_pool(
-            y, rois, pooled_size=(14, 14), spatial_scale=0.0625, layout="NHWC"
-        )
-        ret = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.Function(analysis.free_vars(ret), ret)
-        return y
-
-    a = before()
-    desired_layouts = {
-        "nn.conv2d": ["NHWC", "HWIO"],
-        "vision.roi_pool": ["NHWC", "default"],
-    }
-    a = run_opt_pass(a, transform.ConvertLayout(desired_layouts))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_default_keyword():
-    """Check that the default keyword selects correct TVM default layout."""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 3, 3, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OHWI",
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        w = relay.var("weight", shape=(64, 3, 3, 64))
-        w = relay.layout_transform(w, "OHWI", "OIHW")
-        y = relay.nn.conv2d(
-            x,
-            w,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_different_ops_convert_layout():
-    """Check convert layout correctly supports converting the layout of
-    different ops in the same graph.
-    """
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 3, 3, 64))
-        weight2 = relay.var("weight2", shape=(64, 3, 3, 64), dtype="int8")
-        weight3 = relay.var("weight3", shape=(64, 3, 3, 64))
-        out = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OHWI",
-        )
-        out = relay.cast(out, "int8")
-        out = relay.qnn.op.conv2d(
-            out,
-            weight2,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OHWI",
-        )
-        out = relay.cast(out, "float32")
-        out = relay.nn.conv2d_transpose(
-            out,
-            weight3,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OHWI",
-        )
-        out = relay.Function(analysis.free_vars(out), out)
-        return out
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 3, 3, 64))
-        weight2 = relay.var("weight2", shape=(64, 3, 3, 64), dtype="int8")
-        weight3 = relay.var("weight3", shape=(64, 3, 3, 64))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight1 = relay.layout_transform(weight1, "OHWI", "HWIO")
-        out = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        out = relay.cast(out, "int8")
-        out = relay.layout_transform(out, "NHWC", "NCHW")
-        weight2 = relay.layout_transform(weight2, "OHWI", "OIHW")
-        out = relay.qnn.op.conv2d(
-            out,
-            weight2,
-            relay.const(1, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        out = relay.cast(out, "float32")
-        out = relay.layout_transform(out, "NCHW", "NHWC")
-        weight3 = relay.layout_transform(weight3, "OHWI", "HWIO")
-        out = relay.nn.conv2d_transpose(
-            out,
-            weight3,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        out = relay.layout_transform(out, "NHWC", "NCHW")
-        out = relay.Function(analysis.free_vars(out), out)
-        return out
-
-    a = before()
-    desired_layouts = {
-        "nn.conv2d": ["NHWC", "HWIO"],
-        "qnn.conv2d": ["NCHW", "OIHW"],
-        "nn.conv2d_transpose": ["NHWC", "HWIO"],
-    }
-    a = run_opt_pass(a, transform.ConvertLayout(desired_layouts))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_no_desired_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        rois = relay.var("rois", shape=(32, 5))
-        y = relay.vision.roi_align(
-            y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW"
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight1 = relay.var("weight1", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight1 = relay.layout_transform(weight1, "OIHW", "HWIO")
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        rois = relay.var("rois", shape=(32, 5))
-        y = relay.vision.roi_align(
-            y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW"
-        )
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "HWIO"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_convert_with_config():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64))
-        y2 = relay.nn.conv2d(
-            y,
-            weight2,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y2 = relay.nn.relu(y2)
-
-        out = relay.Function([x, weight, weight2], y2)
-        return out
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-
-        weight2 = relay.var("weight2", shape=(3, 3, 64, 64))
-        weight2 = relay.layout_transform(weight2, "HWIO", "HWOI")
-
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NHWC", "HWNC")
-
-        y2 = relay.nn.conv2d(
-            y,
-            weight2,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="HWNC",
-            kernel_layout="HWOI",
-        )
-        y2 = relay.nn.relu(y2)
-
-        y2 = relay.layout_transform(y2, "HWNC", "NHWC")
-        output = relay.Function(relay.analysis.free_vars(y2), y2)
-        return output
-
-    a = before()
-    layout_config = relay.transform.LayoutConfig(skip_layers=[0])
-    with layout_config:
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["HWNC", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_squeeze_convert_layout():
-    def _test_conv_squeeze_convert_layout1():
-        # specified axis is squeezed
-        def before():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=1000,
-                kernel_size=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.squeeze(y, axis=[-3])
-            return relay.Function(analysis.free_vars(y), y)
-
-        def expected():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=1000, kernel_size=(1, 1))
-            y = relay.nn.relu(y)
-            y = relay.squeeze(y, axis=[2])
-            y = relay.layout_transform(y, "NCW", "NWC")
-            return relay.Function(analysis.free_vars(y), y)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_conv_squeeze_convert_layout2():
-        # all axes of dimension 1 are squeezed
-        def before():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=1000,
-                kernel_size=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.squeeze(y)
-            return relay.Function(analysis.free_vars(y), y)
-
-        def expected():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=1000, kernel_size=(1, 1))
-            y = relay.nn.relu(y)
-            y = relay.squeeze(y, [0, 2, 3])
-            return relay.Function(analysis.free_vars(y), y)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_conv_squeeze_convert_layout3():
-        # squeeze axis is empty
-        def before():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=1000,
-                kernel_size=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.squeeze(y, axis=[])
-            return relay.Function(analysis.free_vars(y), y)
-
-        def expected():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=1000, kernel_size=(1, 1))
-            y = relay.nn.relu(y)
-            y = relay.squeeze(y, axis=[])
-            y = relay.layout_transform(y, "NCHW", "NHWC")
-            return relay.Function(analysis.free_vars(y), y)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    _test_conv_squeeze_convert_layout1()
-    _test_conv_squeeze_convert_layout2()
-    _test_conv_squeeze_convert_layout3()
-
-
-def test_conv_reduce_convert_layout():
-    def _test_conv_reduce_convert_layout1():
-        def before():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=1000,
-                kernel_size=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.nn.relu(y)
-            y = relay.sum(y, axis=(1, 2))
-            y = relay.sum(y, axis=(1,))
-            y = relay.sum(y)
-            y = relay.sum(y)
-            return relay.Function(analysis.free_vars(y), y)
-
-        def expected():
-            x = relay.var("x", shape=(1, 1, 1, 2048))
-            weight = relay.var("weight", shape=(1, 1, 2048, 1000))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=1000, kernel_size=(1, 1))
-            y = relay.nn.relu(y)
-            y = relay.sum(y, axis=(2, 3))
-            y = relay.sum(y, axis=(1,))
-            y = relay.sum(y)
-            y = relay.sum(y)
-            return relay.Function(analysis.free_vars(y), y)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_conv_reduce_convert_layout2():
-        def _set_span(y, text):
-            return relay.Call(
-                y.op, y.args, y.attrs, y.type_args, relay.Span(relay.SourceName(text), 0, 0, 0, 0)
-            )
-
-        def before():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=512,
-                kernel_size=(3, 3),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = _set_span(y, "SpanConv2D")
-            y = relay.nn.relu(y)
-            y = _set_span(y, "SpanRelu")
-            y = relay.multiply(y, y)
-            y = _set_span(y, "SpanMultiply")
-            y = relay.sum(y, axis=(3,), keepdims=True)
-            y = _set_span(y, "SpanSum")
-            return relay.Function(analysis.free_vars(y), y)
-
-        def expected():
-            x = relay.var("x", shape=(1, 38, 38, 512))
-            weight = relay.var("weight", shape=(3, 3, 512, 512))
-            weight = relay.layout_transform(weight, "HWIO", "OIHW")
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.nn.conv2d(x, weight, channels=512, kernel_size=(3, 3))
-            y = relay.nn.relu(y)
-            y = relay.multiply(y, y)
-            y = relay.sum(y, axis=(1,), keepdims=True)
-            y = relay.layout_transform(y, "NCHW", "NHWC")
-            return relay.Function(analysis.free_vars(y), y)
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-        assert "SpanConv2D" in a.astext()
-        assert "SpanRelu" in a.astext()
-        assert "SpanMultiply" in a.astext()
-        assert "SpanSum" in a.astext()
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    _test_conv_reduce_convert_layout1()
-    _test_conv_reduce_convert_layout2()
-
-
-def test_image_resize2d_convert_layout():
-    def _test_image_resize_convert_layout_nchw_to_nhwc():
-        def before():
-            x = relay.var("x", shape=(1, 2, 4, 4))
-            y = relay.image.resize2d(x, (8, 8))
-            y = relay.Function([x], y)
-            return y
-
-        def expected():
-            x = relay.var("x", shape=(1, 2, 4, 4))
-            x = relay.layout_transform(x, "NCHW", "NHWC")
-            y = relay.image.resize2d(x, (8, 8), layout="NHWC")
-            y = relay.layout_transform(y, "NHWC", "NCHW")
-            y = relay.Function(relay.analysis.free_vars(y), y)
-            return y
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"image.resize2d": ["NHWC"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    def _test_image_resize_convert_layout_nhwc_to_nchw():
-        def before():
-            x = relay.var("x", shape=(1, 4, 4, 2))
-            y = relay.image.resize2d(x, (8, 8), layout="NHWC")
-            y = relay.Function([x], y)
-            return y
-
-        def expected():
-            x = relay.var("x", shape=(1, 4, 4, 2))
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            y = relay.image.resize2d(x, (8, 8), layout="NCHW")
-            y = relay.layout_transform(y, "NCHW", "NHWC")
-            y = relay.Function(relay.analysis.free_vars(y), y)
-            return y
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"image.resize2d": ["NCHW"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    _test_image_resize_convert_layout_nchw_to_nhwc()
-    _test_image_resize_convert_layout_nhwc_to_nchw()
-
-
-def test_conv_image_resize2d_convert_layout():
-    """Check that layout transforms are propagated through image resize."""
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.image.resize2d(y, (112, 112), layout="NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        w = relay.var("weight", shape=(3, 3, 64, 64))
-        x = relay.layout_transform(x, "NHWC", "NCHW")
-        w = relay.layout_transform(w, "HWIO", "OIHW")
-        y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.image.resize2d(y, (112, 112), layout="NCHW")
-        y = relay.layout_transform(y, "NCHW", "NHWC")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_infer_correct_layout():
-    test_infer_correct_layout_flag = False
-
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64))
-        weight = relay.var("weight", shape=(3, 3, 64, 64))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    @reg.register_infer_correct_layout("nn.relu", level=11)
-    def infer_correct_layout_relu(attrs, new_in_layouts, old_in_layouts, old_in_types):
-        nonlocal test_infer_correct_layout_flag
-        test_infer_correct_layout_flag = True
-        ret = tvm.tir.layout("")
-        if new_in_layouts:
-            assert len(new_in_layouts) >= 1
-            ret = new_in_layouts[0]
-        else:
-            for i in range(len(old_in_layouts)):
-                if old_in_layouts[i]:
-                    ret = old_in_layouts[i]
-                    break
-        input_layouts = []
-        for i in range(len(old_in_layouts)):
-            input_layouts.append(ret)
-        return InferCorrectLayoutOutput(input_layouts, [ret], attrs)
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    assert test_infer_correct_layout_flag == True
-
-
-def test_reduce_op_convert_layout():
-    for reduce_op in [relay.argmax, relay.mean, relay.max]:
-
-        def before():
-            x = relay.var("x", shape=(1, 64, 56, 56))
-            weight = relay.var("weight", shape=(64, 64, 3, 3))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=64,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="NCHW",
-                kernel_layout="OIHW",
-            )
-            y = reduce_op(y, axis=[2, 3])
-            y = relay.Function([x, weight], y)
-            return y
-
-        def expected():
-            x = relay.var("x", shape=(1, 64, 56, 56))
-            weight = relay.var("weight", shape=(64, 64, 3, 3))
-            x = relay.layout_transform(x, "NCHW", "NHWC")
-            weight = relay.layout_transform(weight, "OIHW", "HWIO")
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=64,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = reduce_op(y, axis=[1, 2])
-            y = relay.Function(relay.analysis.free_vars(y), y)
-            return y
-
-        a = before()
-        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
-        b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_max_pool_uses_specified_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NCHW")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight = relay.layout_transform(weight, "OIHW", "OHWI")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-        )
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC", out_layout="NHWC")
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.nn.batch_flatten(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(
-        a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"], "nn.max_pool2d": ["NHWC"]})
-    )
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_simulated_quantize_uses_specified_convert_layout():
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = attach_simulated_quantize(y, QAnnotateKind.INPUT)
-        y = relay.nn.relu(y)
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        x = relay.layout_transform(x, "NCHW", "NHWC")
-        weight = relay.layout_transform(weight, "OIHW", "OHWI")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-        )
-        y = attach_simulated_quantize(y, QAnnotateKind.INPUT)
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.Function(analysis.free_vars(y), y)
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-@pytest.mark.parametrize(
-    "data_layout, kernel_layout",
-    [
-        ("NCHW1c", "OIHW1i1o"),
-        ("NCHW4c", "OIHW4i4o"),
-        ("NCHW8c", "OIHW8i8o"),
-        ("NCHW16c", "OIHW16i16o"),
-    ],
-)
-def test_resnet_convert_layout_nchwc(data_layout, kernel_layout):
-    x = relay.var("x", shape=(1, 3, 224, 224))
-    weight1 = relay.var("weight1", shape=(64, 3, 7, 7))
-    weight2 = relay.var("weight2", shape=(64, 64, 3, 3))
-    weight3 = relay.var("weight3", shape=(64, 64, 1, 1))
-
-    def before():
-        y = relay.nn.conv2d(
-            x,
-            weight1,
-            strides=(2, 2),
-            padding=(3, 3),
-            channels=64,
-            kernel_size=(7, 7),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.nn.relu(y)
-        y = relay.nn.max_pool2d(y, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
-        y1 = relay.nn.conv2d(
-            y,
-            weight2,
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.conv2d(
-            y,
-            weight3,
-            channels=64,
-            kernel_size=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y2 = relay.nn.relu(y2)
-        y = y1 + y2
-        y = relay.nn.global_max_pool2d(y, layout="NCHW")
-        return y
-
-    def expected():
-        if data_layout == "NCHW1c":
-            y = relay.nn.contrib_conv2d_nchwc(
-                relay.layout_transform(x, "NCHW", data_layout),
-                relay.layout_transform(weight1, "OIHW", kernel_layout),
-                strides=(2, 2),
-                padding=(3, 3),
-                channels=64,
-                kernel_size=(7, 7),
-                data_layout=data_layout,
-                kernel_layout=kernel_layout,
-            )
-            y = relay.nn.relu(y)
-            y = relay.nn.max_pool2d(
-                y, pool_size=(3, 3), strides=(2, 2), padding=(1, 1), layout=data_layout
-            )
-        else:
-            y = relay.nn.conv2d(
-                x,
-                weight1,
-                strides=(2, 2),
-                padding=(3, 3),
-                channels=64,
-                kernel_size=(7, 7),
-                data_layout="NCHW",
-                kernel_layout="OIHW",
-            )
-            y = relay.nn.relu(y)
-            y = relay.nn.max_pool2d(y, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
-            y = relay.layout_transform(y, "NCHW", data_layout)
-        y1 = relay.nn.contrib_conv2d_nchwc(
-            y,
-            relay.layout_transform(weight2, "OIHW", kernel_layout),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.contrib_conv2d_nchwc(
-            y,
-            relay.layout_transform(weight3, "OIHW", kernel_layout),
-            channels=64,
-            kernel_size=(1, 1),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-        )
-        y2 = relay.nn.relu(y2)
-        y = y1 + y2
-        y = relay.nn.global_max_pool2d(y, layout=data_layout)
-        y = relay.layout_transform(y, data_layout, "NCHW")
-        return y
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": [data_layout, kernel_layout]}))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_conv_l2n_convert_layout():
-    """Check that layout transforms are propagated through bn."""
-    axis_list = ([3], [-1], [2, 3])
-    expected_axis = ([1], [1], [3, 1])
-    for i, axis in enumerate(axis_list):
-
-        def before():
-            x = relay.var("x", shape=(1, 56, 56, 64))
-            weight = relay.var("weight", shape=(3, 3, 64, 64))
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=64,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            z = relay.nn.l2_normalize(y, eps=0.001, axis=axis)
-            z = relay.Function(analysis.free_vars(z), z)
-            return z
-
-        def expected():
-            x = relay.var("x", shape=(1, 56, 56, 64))
-            w = relay.var("weight", shape=(3, 3, 64, 64))
-            x = relay.layout_transform(x, "NHWC", "NCHW")
-            w = relay.layout_transform(w, "HWIO", "OIHW")
-            y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
-            z = relay.nn.l2_normalize(y, eps=0.001, axis=expected_axis[i])
-            z = relay.layout_transform(z, "NCHW", "NHWC")
-            z = relay.Function(analysis.free_vars(z), z)
-            return z
-
-    a = before()
-    a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
deleted file mode 100644
index 6374d20173b2..000000000000
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay import Function, transform
-from tvm.relay.testing import inception_v3
-import numpy as np
-import pytest
-
-cpu_scope = tvm.target.VirtualDevice(tvm.cpu(), tvm.target.Target("llvm"))
-metatable = {"VirtualDevice": [cpu_scope]}
-core = tvm.IRModule()
-core.import_from_std("core.rly")
-
-
-def optimize_and_check(before_program, after_program, passes):
-    if isinstance(before_program, str):
-        before_program = tvm.relay.parse(before_program)
-    if isinstance(after_program, str):
-        after_program = tvm.relay.parse(after_program)
-    if not isinstance(passes, list):
-        passes = [passes]
-    optimize = tvm.transform.Sequential(passes)
-    optimized_program = optimize(before_program)
-    print("Actual:")
-    print(optimized_program)
-    print("Expected:")
-    print(after_program)
-    tvm.ir.assert_structural_equal(optimized_program, after_program, map_free_vars=True)
-
-
-def test_dead_let():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main(%z: int) {
-        let %x = 1;
-        %z
-    }
-    """
-    after_program = """
-    #[version = "0.0.5"]
-    def @main(%z: int) {
-        %z
-    }
-    """
-    optimize_and_check(before_program, after_program, transform.DeadCodeElimination())
-
-
-def test_one_live_let():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main(%z: int) {
-        let %x = 1;
-        let %y = 2;
-        %x + %x
-    }
-    """
-    after_program = """
-    #[version = "0.0.5"]
-    def @main(%z: int) {
-        let %x = 1;
-        %x + %x
-    }
-    """
-    optimize_and_check(before_program, after_program, transform.DeadCodeElimination())
-
-
-def test_nested_let():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main(%d: int, %b: int) {
-        let %a = %b;
-        let %c = %d;
-        %c
-    }
-    """
-    after_program = """
-    #[version = "0.0.5"]
-    def @main(%d: int, %b: int) {
-        let %c = %d;
-        %c
-    }
-    """
-    optimize_and_check(before_program, after_program, transform.DeadCodeElimination())
-
-
-def test_live_recursion():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        let %f = fn (%n: int, %data: int) -> int {
-            if (%n == 0) {
-                %data
-            } else {
-                %f(%n - 1, log(%data))
-            }
-        };
-        %f(2, 10000)
-    }
-    """
-
-    after_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        let %f = fn (%n: int, %data: int) -> int {
-            if (%n == 0) {
-                %data
-            } else {
-                %f(%n - 1, log(%data))
-            }
-        };
-        %f(2, 10000)
-    }
-    """
-
-    optimize_and_check(
-        before_program, after_program, [transform.DeadCodeElimination(), transform.InferType()]
-    )
-
-
-def test_dead_recursion():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        let %f = fn (%n: int, %data: int) -> int {
-            if (%n == 0) {
-                %data
-            } else {
-                %f(%n - 1, log(%data))
-            }
-        };
-        ()
-    }
-    """
-
-    after_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        ()
-    }
-    """
-
-    optimize_and_check(
-        before_program, after_program, [transform.DeadCodeElimination(), transform.InferType()]
-    )
-
-
-def test_add_with_let():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        (let %a = 1; 3) + 2
-    }
-    """
-
-    after_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        3 + 2
-    }
-    """
-
-    optimize_and_check(
-        before_program, after_program, [transform.DeadCodeElimination(), transform.InferType()]
-    )
-
-
-def test_tuple_get_item():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        let %a = 100;
-        (1, 2, 3, 4).0
-    }
-    """
-
-    after_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        (1, 2, 3, 4).0
-    }
-    """
-
-    optimize_and_check(before_program, after_program, transform.DeadCodeElimination())
-
-
-def test_inline_into_function():
-    """Don't inline across function boundaries."""
-    before_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        let %x = 1 + 1;
-        let %f = fn (%y: int) -> int {
-          let %z = %y + %y;
-          %x + %z
-        };
-        (%f(2), %f(3))
-    }
-    """
-
-    after_program = """
-    #[version = "0.0.5"]
-    def @main() {
-        let %x = 1 + 1;
-        let %f = fn (%y: int) -> int {
-          %x + (%y + %y)
-        };
-        (%f(2), %f(3))
-    }
-    """
-
-    optimize_and_check(
-        before_program, after_program, transform.DeadCodeElimination(inline_once=True)
-    )
-
-
-def test_impure_op():
-    shape = np.array([64, 2])
-    metatable = {
-        "VirtualDevice": [cpu_scope],
-        "relay.Constant": [relay.const(shape, dtype="int64")],
-    }
-    """Don't elide calls to side-effecting operators."""
-    before_program = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main() {
-           let %size: int64 = cast(1024, dtype="int64");
-           let %alignment: int64 = cast(64, dtype="int64");
-           let %x = memory.alloc_storage(%size, meta[relay.Constant][0], %alignment, virtual_device=meta[VirtualDevice][0]);
-           let %_ = memory.kill(%x);
-           0
-        }
-        """,
-        "from_string",
-        core,
-        metatable,
-    )
-
-    after_program = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main() {
-           %0 = memory.alloc_storage(cast(1024, dtype="int64"),
-                                     meta[relay.Constant][0],
-                                     cast(64, dtype="int64"),
-                                     virtual_device=meta[VirtualDevice][0]);
-           let %_ = memory.kill(%0);
-           0
-        }
-        """,
-        "from_string",
-        core,
-        metatable,
-    )
-
-    optimize_and_check(
-        before_program, after_program, transform.DeadCodeElimination(inline_once=True)
-    )
-
-
-def test_impure_func():
-    shape = np.array([64, 2])
-    metatable = {
-        "VirtualDevice": [cpu_scope],
-        "relay.Constant": [relay.const(shape, dtype="int64")],
-    }
-    """Don't elide calls to side-effecting functions."""
-    before_program = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @f() -> int {
-           let %size: int64 = cast(1024, dtype="int64");
-           let %alignment: int64 = cast(64, dtype="int64");
-           let %x = memory.alloc_storage(%size, meta[relay.Constant][0], %alignment, virtual_device=meta[VirtualDevice][0]);
-           let %_ = memory.kill(%x);
-           0
-        }
-        def @main() -> int {
-           let %y = @f();
-           0
-        }
-        """,
-        "from_string",
-        core,
-        metatable,
-    )
-
-    after_program = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @f() -> int {
-           %0 = memory.alloc_storage(cast(1024, dtype="int64"),
-                                     meta[relay.Constant][0],
-                                     cast(64, dtype="int64"),
-                                     virtual_device=meta[VirtualDevice][0]);
-           let %_ = memory.kill(%0);
-           0
-        }
-        def @main() -> int {
-            let %y = @f();
-            0
-        }
-        """,
-        "from_string",
-        core,
-        metatable,
-    )
-
-    optimize_and_check(
-        before_program, after_program, transform.DeadCodeElimination(inline_once=True)
-    )
-
-
-def test_refs():
-    """Don't elide expressions with reference create/read/write side effects"""
-    before_program = """
-    #[version = "0.0.5"]
-    def @f(%r) -> int {
-        let %v = ref_read(%r);
-        let %u = ref_write(%r, %v + 1);
-        %v
-    }
-    def @main() -> int {
-        let %r = ref(0);
-        let %y = @f(%r);
-        let %z = @f(%r);
-        %z
-    }
-    """
-
-    after_program = before_program
-
-    optimize_and_check(
-        before_program,
-        after_program,
-        [transform.InferType(), transform.DeadCodeElimination(inline_once=True)],
-    )
-
-
-def test_complexity():
-    mod = transform.InferType()(
-        tvm.IRModule.from_expr(inception_v3.get_net(1, 1000, (3, 299, 299), "float32"))
-    )
-
-    optimize_and_check(mod, mod, transform.DeadCodeElimination(inline_once=True))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_defunctionalization.py b/tests/python/relay/test_pass_defunctionalization.py
deleted file mode 100644
index 96c061bd93b1..000000000000
--- a/tests/python/relay/test_pass_defunctionalization.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.relay.backend.interpreter import ConstructorValue
-from tvm.relay import transform, ExprVisitor, TypeVisitor
-from tvm.relay.testing import Prelude
-
-# determine if type t is a FuncType or has a nested FuncType
-def has_func_type(t):
-    class FuncTypeVisitor(TypeVisitor):
-        def __init__(self):
-            super().__init__()
-            self.has_func = False
-
-        def visit_func_type(self, ftt):
-            self.has_func = True
-
-    ftvisitor = FuncTypeVisitor()
-    ftvisitor.visit(t)
-    return ftvisitor.has_func
-
-
-# determine whether a program has any higher order functions
-# a higher order function is defined as one that:
-# - has function type arguments
-# - returns a function
-def assert_no_higher_order_functions(expr, mod):
-    class CheckFirstOrderVisitor(ExprVisitor):
-        def __init__(self, mod):
-            super().__init__()
-            self.mod = mod
-            self.hof = []
-            self.visited_gv = set()
-
-        def visit_call(self, call):
-            is_higher_order = False
-            # check return type
-            if has_func_type(call.checked_type):
-                is_higher_order = True
-            # check argument types
-            for a in call.args:
-                if has_func_type(a.checked_type):
-                    is_higher_order = True
-            # if it is higher order, save it for debugging later
-            if is_higher_order:
-                self.hof.append(call)
-            super().visit_call(call)
-
-        def visit_global_var(self, gv):
-            # visit global vars to visit entire program
-            if gv not in self.visited_gv:
-                self.visited_gv.add(gv)
-                self.visit(self.mod[gv])
-
-    mod = transform.InferType()(mod)
-    check_fo_visitor = CheckFirstOrderVisitor(mod)
-    check_fo_visitor.visit(expr)
-
-    nl = "\n--------\n"
-    errmsg = f"""found {len(check_fo_visitor.hof)} higher order functions:
-  {nl.join(expr.astext() for expr in check_fo_visitor.hof)}"""
-
-    assert len(check_fo_visitor.hof) == 0, errmsg
-
-
-# assert that a program is defunctionalized and returns
-# defunctionalized module
-# assumes program starts from mod['main']
-def defunctionalized(mod):
-    mod = transform.InferType()(mod)
-    mod["main"] = transform.Defunctionalization(mod["main"], mod)
-    mod = transform.InferType()(mod)
-    assert_no_higher_order_functions(mod["main"], mod)
-
-    return mod
-
-
-# adt list to python list
-def to_list(mod, l):
-    list = mod.get_global_type_var("List")
-    list_adt = mod[list]
-    cons = list_adt.constructors[0]
-    nil = list_adt.constructors[1]
-
-    assert isinstance(l, ConstructorValue)
-    val = l
-    ret = []
-    while True:
-        if val.tag == cons.tag:
-            ret.append(val.fields[0].numpy())
-            val = val.fields[1]
-        else:
-            assert val.tag == nil.tag
-            break
-    return ret
-
-
-# list to adt list
-def to_adt_list(mod, arr):
-    expr = mod["main"]
-    l = mod.get_global_type_var("List")
-    list_adt = mod[l]
-    cons = list_adt.constructors[0]
-    nil = list_adt.constructors[1]
-
-    li = nil()
-    for a in arr:
-        li = cons(relay.const(a), li)
-    adt = relay.create_executor(mod=mod).evaluate(li)
-    mod["main"] = expr
-    return adt
-
-
-def test_simple():
-    code = """
-#[version = "0.0.5"]
-def @simple[A, B](%f: fn(A) -> B, %xs: A) -> B {
-  %f(%xs)
-}
-def @main(%l: Tensor[(5, 5), float32]) -> Tensor[(5, 5), float32] {
-  %0 = fn[A](%x: A) -> A {
-    %x
-  };
-  @simple(%0, %l)
-}
-"""
-    mod = tvm.relay.fromtext(code)
-    defunc_mod = defunctionalized(mod)
-
-    input = np.random.rand(5, 5).astype("float32")
-
-    out = relay.create_executor("debug", mod=mod).evaluate()(input)
-
-    defunc_out = relay.create_executor("debug", mod=defunc_mod).evaluate()(input)
-
-    np.testing.assert_equal(out.numpy(), defunc_out.numpy())
-
-
-def test_global_recursion():
-    code = """
-#[version = "0.0.5"]
-type List[A] {
-  Cons(A, List[A]),
-  Nil,
-}
-def @id[A](%x: A) -> A {
-  %x
-}
-def @map[A, B](%f: fn(A) -> B, %xs: List[A]) -> List[B] {
-  match (%xs) {
-    Cons(%x, %rest) => Cons(%f(%x), @map(%f, %rest)),
-    Nil => Nil,
-  }
-}
-def @main(%l: List[float32]) -> List[float32] {
-  @map(@id, %l)
-}
-"""
-    mod = tvm.relay.fromtext(code)
-    defunc_mod = defunctionalized(mod)
-
-    input = np.random.rand(10).astype("float32")
-
-    out = relay.create_executor("debug", mod=mod).evaluate(mod["main"])(to_adt_list(mod, input))
-
-    defunc_out = relay.create_executor("debug", mod=defunc_mod).evaluate()(
-        to_adt_list(defunc_mod, input)
-    )
-
-    np.testing.assert_array_equal(to_list(mod, out), to_list(defunc_mod, defunc_out))
-
-
-def test_recursive_datatype():
-    # CPS will create recursive datatype
-    code = """
-#[version = "0.0.5"]
-type List[A] {
-  Cons(A, List[A]),
-  Nil,
-}
-def @sum(%f: fn(int32) -> int32, %k: List[int32]) -> int32 {
-  match (%k) {
-    Cons(%x, %rest) => %0 = fn(%n) {
-      %x + %f(%n)
-    };
-    @sum(%0, %rest),
-    Nil => %f(0),
-  }
-}
-def @id[A](%x: A) -> A {
-  %x
-}
-def @main(%l: List[int32]) -> int32 {
-  @sum(@id, %l)
-}
-"""
-    mod = tvm.relay.fromtext(code)
-    defunc_mod = defunctionalized(mod)
-
-    input = np.random.randint(1, 100, 10)
-
-    out = relay.create_executor("debug", mod=mod).evaluate(mod["main"])(to_adt_list(mod, input))
-
-    defunc_out = relay.create_executor("debug", mod=defunc_mod).evaluate()(
-        to_adt_list(defunc_mod, input)
-    )
-
-    tvm.testing.assert_allclose(out.numpy(), defunc_out.numpy())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_defuse_ops.py b/tests/python/relay/test_pass_defuse_ops.py
deleted file mode 100644
index 4f446865c7a7..000000000000
--- a/tests/python/relay/test_pass_defuse_ops.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_opt_pass
-
-
-def test_defuse_simple():
-    """Simple testcase."""
-
-    def before():
-        x = relay.var("x", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.exp(y)
-        w = relay.squeeze(z)
-        return relay.Function([x], w)
-
-    x = before()
-    x = run_opt_pass(x, transform.InferType())
-    fused = run_opt_pass(x, transform.FuseOps())
-    defused = run_opt_pass(fused, transform.DefuseOps())
-
-    tvm.ir.assert_structural_equal(x, defused)
-
-
-def test_inception_like():
-    def conv(data):
-        y = relay.nn.conv2d(data, relay.var("w"), kernel_size=(3, 3), padding=(1, 1), channels=16)
-        return relay.nn.relu(data=y)
-
-    def inception_like(data):
-        c0 = conv(data)
-        c1 = conv(data)
-        return relay.concatenate((c0, c1), axis=1)
-
-    def before(dshape):
-        x = relay.var("x", shape=dshape)
-        in1 = inception_like(x)
-        in2 = inception_like(in1)
-        return relay.Function(relay.analysis.free_vars(in2), in2)
-
-    dshape = (1, 16, 64, 64)
-    x = before(dshape)
-    x = run_opt_pass(x, transform.InferType())
-    fused = run_opt_pass(x, transform.FuseOps())
-    defused = run_opt_pass(fused, transform.DefuseOps())
-
-    tvm.ir.assert_structural_equal(x, defused)
-
-
-def test_defuse_complex():
-    """Complex defuse testcase"""
-
-    def fused_conv2d_batch_norm(w):
-        data = relay.var("data", shape=(1, 224, 224, 3))
-        bn_gamma0 = relay.var("bn_gamma0", relay.TensorType((64,), "float32"))
-        bn_beta0 = relay.var("bn_beta0", relay.TensorType((64,), "float32"))
-        bn_mmean0 = relay.var("bn_mean0", relay.TensorType((64,), "float32"))
-        bn_mvar0 = relay.var("bn_var0", relay.TensorType((64,), "float32"))
-        c0 = relay.nn.conv2d(
-            data,
-            w,
-            strides=(2, 2),
-            padding=(3, 3, 3, 3),
-            channels=64,
-            kernel_size=(7, 7),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-            out_layout="NHWC",
-        )
-        c1 = relay.nn.batch_norm(c0, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0, axis=3)
-        c2 = c1[0]
-        return relay.Function(relay.analysis.free_vars(c2), c2)
-
-    def fused_conv2d_batch_norm_relu(z):
-        data2 = relay.var("data2", shape=(1, 56, 56, 64))
-        bn_gamma0 = relay.var("bn_gamma0", relay.TensorType((64,), "float32"))
-        bn_beta0 = relay.var("bn_beta0", relay.TensorType((64,), "float32"))
-        bn_mmean0 = relay.var("bn_mean0", relay.TensorType((64,), "float32"))
-        bn_mvar0 = relay.var("bn_var0", relay.TensorType((64,), "float32"))
-        c0 = relay.nn.conv2d(
-            data2,
-            z,
-            padding=(1, 1, 1, 1),
-            channels=64,
-            kernel_size=(3, 3),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-            out_layout="NHWC",
-        )
-        c1 = relay.nn.batch_norm(c0, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0, axis=3)
-        c2 = c1[0]
-        c3 = relay.nn.relu(data=c2)
-        return relay.Function(relay.analysis.free_vars(c3), c3)
-
-    def fused_max_pool2d():
-        data1 = relay.var("data1", shape=(1, 112, 112, 64))
-        a1 = relay.nn.max_pool2d(
-            data1,
-            pool_size=(3, 3),
-            strides=(2, 2),
-            padding=(1, 1, 1, 1),
-            layout="NHWC",
-            out_layout="NHWC",
-        )
-        return relay.Function(relay.analysis.free_vars(a1), a1)
-
-    def fused_add_relu():
-        data1 = relay.var("data1", shape=(1, 56, 56, 64))
-        data2 = relay.var("data2", shape=(1, 56, 56, 64))
-        a0 = relay.add(data1, data2)
-        a1 = relay.nn.relu(a0)
-        return relay.Function(relay.analysis.free_vars(a1), a1)
-
-    def before_fused(conv_layer1_weight, conv_layer2_weight):
-        data = relay.var("data", shape=(1, 3, 224, 224))
-        data1 = relay.layout_transform(data, src_layout="NCHW", dst_layout="NHWC")
-        bn_gamma0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        bn_beta0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        bn_mmean0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        bn_mvar0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        a0 = fused_conv2d_batch_norm(conv_layer1_weight)
-        a1 = fused_max_pool2d()
-        a2 = fused_conv2d_batch_norm_relu(conv_layer2_weight)
-        a3 = fused_add_relu()
-        y0 = relay.Call(a0, [data1, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0])
-        y1 = relay.Call(a1, [y0])
-        y2 = relay.Call(a2, [y1, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0])
-        y3 = relay.Call(a3, [y1, y2])
-        return relay.Function(relay.analysis.free_vars(y3), y3)
-
-    def golden_defused(conv_layer1_weight, conv_layer2_weight):
-        data = relay.var("data", shape=(1, 3, 224, 224))
-        data1 = relay.layout_transform(data, src_layout="NCHW", dst_layout="NHWC")
-        bn_gamma0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        bn_beta0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        bn_mmean0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        bn_mvar0 = relay.const(tvm.nd.array(numpy.ndarray(shape=(64,), dtype="float32")))
-        c0 = relay.nn.conv2d(
-            data1,
-            conv_layer1_weight,
-            strides=(2, 2),
-            padding=(3, 3, 3, 3),
-            channels=64,
-            kernel_size=(7, 7),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-            out_layout="NHWC",
-        )
-        c1 = relay.nn.batch_norm(c0, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0, axis=3)
-        c2 = c1[0]
-        c3 = relay.nn.max_pool2d(
-            c2,
-            pool_size=(3, 3),
-            strides=(2, 2),
-            padding=(1, 1, 1, 1),
-            layout="NHWC",
-            out_layout="NHWC",
-        )
-        c4 = relay.nn.conv2d(
-            c3,
-            conv_layer2_weight,
-            padding=(1, 1, 1, 1),
-            channels=64,
-            kernel_size=(3, 3),
-            data_layout="NHWC",
-            kernel_layout="OHWI",
-            out_layout="NHWC",
-        )
-        c5 = relay.nn.batch_norm(c4, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0, axis=3)
-        c6 = c5[0]
-        c7 = relay.nn.relu(c6)
-        c8 = relay.add(c3, c7)
-        c9 = relay.nn.relu(c8)
-        return relay.Function(relay.analysis.free_vars(c9), c9)
-
-    # creating weight constants for the two convolution layers
-    # in the input fused model and the golden defused model.
-    conv_layer1_weight = relay.nn.Constant(
-        tvm.nd.array(numpy.ndarray(shape=(64, 7, 7, 3), dtype="float32"))
-    )
-    conv_layer2_weight = relay.nn.Constant(
-        tvm.nd.array(numpy.ndarray(shape=(64, 3, 3, 64), dtype="float32"))
-    )
-    x = before_fused(conv_layer1_weight, conv_layer2_weight)
-    x = run_opt_pass(x, transform.InferType())
-    defused = run_opt_pass(x, transform.DefuseOps())
-
-    golden1 = golden_defused(conv_layer1_weight, conv_layer2_weight)
-    golden1 = run_opt_pass(golden1, transform.InferType())
-
-    tvm.ir.assert_structural_equal(defused, golden1)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_div_to_mul.py b/tests/python/relay/test_pass_div_to_mul.py
deleted file mode 100644
index 60c67ae2499c..000000000000
--- a/tests/python/relay/test_pass_div_to_mul.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay
-import pytest
-import numpy as np
-
-
-@pytest.mark.parametrize("dtype, rtol", [("float16", 1e-3), ("float32", 1e-7), ("float64", 1e-12)])
-def test_div_to_mul(dtype, rtol):
-    x = relay.var("x", relay.TensorType((), dtype))
-    y = relay.Constant(tvm.nd.array(np.array([1.5]).astype(dtype)))
-    z = x / y
-    mod = tvm.IRModule.from_expr(z)
-    transformed = relay.transform.DivToMul()(mod)
-    assert transformed["main"].body.op.name == "multiply"
-    np.testing.assert_allclose(transformed["main"].body.args[1].data.numpy()[0], 1 / 1.5, rtol=rtol)
diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py
deleted file mode 100644
index 7d492b4fc3f4..000000000000
--- a/tests/python/relay/test_pass_dynamic_to_static.py
+++ /dev/null
@@ -1,641 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.testing import run_infer_type, create_workload
-import tvm.topi.testing
-import tvm.testing
-
-
-def run_opt_pass(expr, opt_pass, params=None):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-
-    mod = tvm.IRModule.from_expr(expr)
-    if params is not None:
-        mod["main"] = bind_params_by_name(mod["main"], params)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def verify_func(func, data, ref_res, rtol=1e-5, atol=1e-7):
-    assert isinstance(data, list)
-    for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "vm", "debug"]:
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                *data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol)
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_reshape():
-    def verify_reshape(shape, newshape, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(newshape, "float32"))
-        z = relay.reshape(x, relay.shape_of(y))
-        func = run_infer_type(relay.Function([x, y], z))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("reshape")
-        assert "newshape=" in zz.astext()
-        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=newshape).astype("float32")
-        ref_res = np.reshape(x_data, oshape)
-        verify_func(func2, [x_data, y_data], ref_res)
-
-    verify_reshape((2, 3, 4), (8, 3), (8, 3))
-    verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_squeeze():
-    def verify_squeeze(shape, axis, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(axis, "float32"))
-        z = relay.squeeze(x, relay.shape_of(y))
-        func = run_infer_type(relay.Function([x, y], z))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("squeeze")
-        assert "axis=" in zz.astext()
-        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=axis).astype("float32")
-        ref_res = np.squeeze(x_data, axis)
-        verify_func(func2, [x_data, y_data], ref_res)
-
-    verify_squeeze((1, 3, 4, 1), (0,), (3, 4, 1))
-    verify_squeeze((1, 3, 4, 1), (3,), (1, 3, 4))
-    verify_squeeze((1, 3, 4, 1), (0, 3), (3, 4))
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_double_reshape():
-    def verify_reshape(shape, newshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(newshape, "float32"))
-        z = relay.reshape(x, relay.shape_of(y))
-        z = relay.reshape(z, relay.shape_of(x))
-        func = run_infer_type(relay.Function([x, y], z))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("reshape")
-        assert "newshape=" in zz.astext()
-        assert zz.checked_type == relay.ty.TensorType(shape, "float32")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=newshape).astype("float32")
-        verify_func(func2, [x_data, y_data], x_data)
-
-    verify_reshape((2, 3, 4), (8, 3))
-    verify_reshape((4, 7), (2, 7, 2))
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_quad_reshape():
-    def verify_reshape(shape, newshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(newshape, "float32"))
-        z1 = relay.reshape(x, relay.shape_of(y))
-        z2 = relay.reshape(z1, relay.shape_of(x))
-        z3 = relay.reshape(z2, relay.shape_of(z1))
-        z4 = relay.reshape(z3, relay.shape_of(z2))
-        func = run_infer_type(relay.Function([x, y], z4))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("reshape")
-        assert "newshape=" in zz.astext()
-        assert zz.checked_type == relay.ty.TensorType(shape, "float32")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=newshape).astype("float32")
-        verify_func(func2, [x_data, y_data], x_data)
-
-    verify_reshape((2, 3, 4), (8, 3))
-    verify_reshape((4, 7), (2, 7, 2))
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_tile():
-    def verify_tile(shape, reps, oshape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(reps, "float32"))
-        z = relay.tile(x, relay.shape_of(y))
-        func = run_infer_type(relay.Function([x, y], z))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("tile")
-        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=reps).astype("float32")
-        ref_res = np.tile(x_data, reps)
-        verify_func(func2, [x_data, y_data], ref_res)
-
-    verify_tile((2, 3, 4), (2, 1, 5), (4, 3, 20))
-    verify_tile((4, 7), (4, 2), (16, 14))
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_topk():
-    def verify_topk(k, axis, ret_type, is_ascend, dtype):
-        shape = (20, 100)
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        k_var = relay.var("k", relay.TensorType((), "int32"))
-        out = relay.topk(x, k_var, axis, ret_type, is_ascend, dtype)
-        if isinstance(out, relay.expr.TupleWrapper):
-            out = out.astuple()
-        func = relay.Function([x, k_var], out)
-        params = {"k": k}
-
-        np_data = np.random.uniform(size=shape).astype("float32")
-        if is_ascend:
-            np_indices = np.argsort(np_data, axis=axis)
-        else:
-            np_indices = np.argsort(-np_data, axis=axis)
-        kk = k if k >= 1 else shape[axis]
-        if axis == 0:
-            np_indices = np_indices[:kk, :]
-            np_values = np.zeros(np_indices.shape).astype("float32")
-            for i in range(shape[1]):
-                np_values[:, i] = np_data[np_indices[:, i], i]
-        else:
-            np_indices = np_indices[:, :kk]
-            np_values = np.zeros(np_indices.shape).astype("float32")
-            for i in range(shape[0]):
-                np_values[i, :] = np_data[i, np_indices[i, :]]
-        np_indices = np_indices.astype(dtype)
-
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("topk")
-
-        for target, dev in tvm.testing.enabled_targets():
-            if "llvm" not in target:
-                continue
-            for kind in ["graph", "vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func2)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    np_data
-                )
-                if ret_type == "both":
-                    tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
-                    tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
-                elif ret_type == "values":
-                    tvm.testing.assert_allclose(op_res.numpy(), np_values)
-                else:
-                    tvm.testing.assert_allclose(op_res.numpy(), np_indices)
-
-    np.random.seed(0)
-    for k in [0, 1, 5]:
-        for axis in [0, -1, 1]:
-            for ret_type in ["both", "values", "indices"]:
-                verify_topk(k, axis, ret_type, True, "int64")
-                verify_topk(k, axis, ret_type, False, "float32")
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_broadcast_to():
-    def verify_broadcast_to(shape, broadcast_shape):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("y", relay.TensorType(broadcast_shape, "float32"))
-        z = relay.broadcast_to(x, shape=relay.shape_of(y))
-
-        func = run_infer_type(relay.Function([x, y], z))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("broadcast_to")
-        assert zz.checked_type == relay.ty.TensorType(broadcast_shape, "float32")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=broadcast_shape).astype("float32")
-
-        ref_res = np.broadcast_to(x_data, y_data.shape)
-        verify_func(func2, [x_data, y_data], ref_res)
-
-    verify_broadcast_to((3, 1), (3, 3))
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_zeros_ones():
-    def verify_ones_zeros(shape, dtype):
-        for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
-            x = relay.var("x", relay.TensorType(shape, dtype))
-            y = op(relay.shape_of(x), dtype)
-
-            func = run_infer_type(relay.Function([x], y))
-            func2 = run_opt_pass(
-                run_opt_pass(func, transform.DynamicToStatic()),
-                transform.InferType(),
-            )
-
-            zz = func2.body
-            assert zz.checked_type == relay.ty.TensorType(shape, dtype)
-
-            x_data = np.random.uniform(low=1, high=1, size=shape)
-            ref_res = ref(x_data.shape)
-            verify_func(func2, [x_data], ref_res)
-
-    verify_ones_zeros((1, 2, 3), "int64")
-    verify_ones_zeros((9, 8, 3, 4), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_resize2d():
-    def verify_resize(shape, scale, method, layout):
-        if layout == "NHWC":
-            size = (shape[1] * scale, shape[2] * scale)
-        else:
-            size = (shape[2] * scale, shape[3] * scale)
-
-        x = relay.var("x", relay.TensorType(shape, "float32"))
-        size_var = relay.var("size", relay.TensorType((len(size),), "float32"))
-        coord_trans = "asymmetric" if method == "nearest_neighbor" else "align_corners"
-        z = relay.image.resize2d(
-            x, size_var, None, layout, method, coordinate_transformation_mode=coord_trans
-        )
-        params = {"size": np.array(size).astype("float32")}
-
-        func = run_infer_type(relay.Function([x, size_var], z))
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("image.resize2d")
-
-        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        ref_res = tvm.topi.testing.resize2d_python(
-            x_data, (scale, scale), layout, method, coord_trans
-        )
-
-    for method in ["linear", "nearest_neighbor"]:
-        for layout in ["NCHW", "NHWC"]:
-            verify_resize((1, 4, 4, 4), 2, method, layout)
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_one_hot():
-    def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
-        indices = relay.var("indices", relay.TensorType(indices_shape, "int32"))
-        depth_var = relay.const(depth)
-        on_value_var = relay.var("on_value", relay.TensorType((), "int32"))
-        off_value_var = relay.var("off_value", relay.TensorType((), "int32"))
-        out = relay.one_hot(indices, on_value_var, off_value_var, depth_var, axis, dtype)
-        params = {
-            "on_value": on_value,
-            "off_value": off_value,
-        }
-
-        func = relay.Function([indices, on_value_var, off_value_var], out)
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("one_hot")
-
-        indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32")
-        out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype)
-        verify_func(func2, [indices_np], out_np)
-
-    _verify((3,), 3, 1, 0, -1, "int32")
-    _verify((3,), 3, 1.0, 0.0, -1, "float32")
-    _verify((2, 2), 5, 2, -2, 0, "int32")
-    _verify((2, 2), 5, 0.5, -0.5, 1, "float32")
-    _verify((3, 2, 4, 5), 6, 1, 0, 1, "int32")
-    _verify((3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32")
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_full():
-    def verify_full(fill_value, fill_shape, dtype):
-        x = relay.var("x", relay.scalar_type(dtype))
-        y = relay.var("y", relay.TensorType(fill_shape, "int64"))
-        z = relay.full(x, relay.shape_of(y), dtype)
-
-        func = run_infer_type(relay.Function([x, y], z))
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("full")
-
-        ref_res = np.full(fill_shape, fill_value).astype(dtype)
-        y_data = np.random.uniform(low=-1, high=1, size=fill_shape).astype("int64")
-        verify_func(func2, [fill_value, y_data], ref_res)
-
-    verify_full(4, (1, 2, 3, 4), "int32")
-    verify_full(4.0, (1, 2, 8, 10), "float32")
-
-
-def test_dynamic_to_static_upsampling():
-    def verify_upsampling(data_shape, scale_h_val, scale_w_val, dtype):
-        x = relay.var("x", relay.TensorType(data_shape, dtype))
-        scale_h = relay.var("scale_h", relay.TensorType((), "float32"))
-        scale_w = relay.var("scale_w", relay.TensorType((), "float32"))
-        z = relay.nn.upsampling(x, scale_h, scale_w)
-        params = {
-            "scale_h": scale_h_val,
-            "scale_w": scale_w_val,
-        }
-
-        func = run_infer_type(relay.Function([x, scale_h, scale_w], z))
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("nn.upsampling")
-
-        x_data = np.random.uniform(size=data_shape).astype(dtype)
-        ref_res = tvm.topi.testing.resize2d_python(
-            x_data, (scale_h_val, scale_w_val), "NCHW", "nearest_neighbor", "asymmetric"
-        )
-        verify_func(func2, [x_data], ref_res)
-
-    verify_upsampling((1, 16, 32, 32), 2, 2, "int8")
-    verify_upsampling((1, 16, 32, 32), 4, 4, "int32")
-
-
-def test_dynamic_to_static_upsampling3d():
-    def verify_upsampling3d(data_shape, scale_d_val, scale_h_val, scale_w_val, dtype):
-        x = relay.var("x", relay.TensorType(data_shape, dtype))
-        scale_d = relay.var("scale_d", relay.TensorType((), "float32"))
-        scale_h = relay.var("scale_h", relay.TensorType((), "float32"))
-        scale_w = relay.var("scale_w", relay.TensorType((), "float32"))
-
-        z = relay.nn.upsampling3d(x, scale_d, scale_h, scale_w)
-        params = {
-            "scale_d": scale_d_val,
-            "scale_h": scale_h_val,
-            "scale_w": scale_w_val,
-        }
-
-        func = run_infer_type(relay.Function([x, scale_d, scale_h, scale_w], z))
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("nn.upsampling3d")
-
-        x_data = np.random.uniform(size=data_shape).astype(dtype)
-        ref_res = tvm.topi.testing.resize3d_python(
-            x_data,
-            (scale_d_val, scale_h_val, scale_w_val),
-            "NCDHW",
-            "nearest_neighbor",
-            "asymmetric",
-        )
-        verify_func(func2, [x_data], ref_res)
-
-    verify_upsampling3d((1, 1, 1, 1, 1), 2, 3, 4, "int8")
-    verify_upsampling3d((5, 7, 8, 10, 32), 3, 2, 2, "int8")
-    verify_upsampling3d((1, 4, 2, 5, 3), 5, 4, 3, "int32")
-
-
-def test_dynamic_to_static_pad():
-    def verify_pad(data_shape, pad_width_val, pad_val, dtype):
-        x = relay.var("x", relay.TensorType(data_shape, dtype))
-        pad_width = relay.var(
-            "pad_width", relay.TensorType((len(pad_width_val), len(pad_width_val[0])), "int32")
-        )
-        z = relay.nn.pad(x, pad_width, pad_val)
-        func = run_infer_type(relay.Function([x, pad_width], z))
-        params = {"pad_width": np.array(pad_width_val)}
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("nn.pad")
-
-        x_data = np.random.uniform(size=data_shape).astype(dtype)
-        ref_res = np.pad(
-            x_data, pad_width_val, "constant", constant_values=(((pad_val,) * 2),) * len(data_shape)
-        )
-        verify_func(func2, [x_data], ref_res)
-
-    verify_pad((4, 10, 7, 7), ((1, 1), (2, 2), (3, 3), (4, 4)), 2.0, "int32")
-    verify_pad((2, 7), ((1, 4), (2, 2)), 4.0, "float64")
-
-
-def test_dynamic_to_static_strided_slice():
-    def verify(
-        dshape,
-        begin_val,
-        end_val,
-        strides_val,
-        output,
-        slice_mode="end",
-        test_ref=True,
-        dtype="int32",
-    ):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        ndim = len(dshape)
-        begin_val = begin_val if begin_val else [0] * ndim
-        end_val = end_val if end_val else list(dshape)
-        if strides_val:
-            if len(strides_val) == 1:
-                strides_val = strides_val * ndim
-        else:
-            strides_val = [1] * ndim
-
-        # target numpy result
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        ref_res = tvm.topi.testing.strided_slice_python(
-            x_data, begin_val, end_val, strides_val, slice_mode
-        )
-        data = [x_data, np.array(begin_val), np.array(end_val)]
-
-        begin = relay.var("begin", relay.TensorType((len(begin_val),), dtype))
-        end = relay.var("end", relay.TensorType((len(end_val),), dtype))
-
-        func_params = [x, begin, end]
-        if strides_val:
-            data.append(np.array(strides_val))
-            strides = relay.var("strides", relay.TensorType((len(strides_val),), dtype))
-            z = relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=slice_mode)
-            func_params.append(strides)
-        else:
-            z = relay.strided_slice(x, begin=begin, end=end, slice_mode=slice_mode)
-        func = relay.Function(func_params, z)
-        params = {"begin": begin_val, "end": end_val, "strides": strides_val}
-
-        func = run_infer_type(func)
-        func2 = run_opt_pass(
-            run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType()
-        )
-        assert isinstance(func2.body, relay.Call)
-        assert func2.body.op == relay.op.get("strided_slice")
-        verify_func(func2, [x_data], ref_res)
-
-    verify((1, 3, 10, 10), [0, 0, 0, 0], [1, 3, 10, 10], [1], (0, 3, 10, 10), dtype="int64")
-    verify(
-        (1, 224, 224, 3),
-        [0, 20, 20, 0],
-        [1, 140, 140, 3],
-        [1, 1, 1, 1],
-        (1, 120, 120, 3),
-        dtype="int64",
-    )
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16")
-    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 4], None, (2, 3, 3))
-    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
-    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
-    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
-    verify(
-        (3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], (2, 4, 3), slice_mode="size", test_ref=False
-    )
-    verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], (2, 2, 3), slice_mode="size", test_ref=True)
-
-
-@tvm.testing.uses_gpu
-def test_dyn_to_static_sparse_to_dense():
-    def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
-        sparse_indices_data = np.array(sparse_indices)
-        sparse_values_data = np.array(sparse_values)
-        default_value_data = np.array(default_value)
-        output_shape_data = np.array(output_shape)
-
-        a = relay.var(
-            "a", relay.TensorType(sparse_indices_data.shape, str(sparse_indices_data.dtype))
-        )
-        b = relay.var(
-            "b", relay.TensorType(sparse_values_data.shape, str(sparse_values_data.dtype))
-        )
-        output_shape_const = relay.const(output_shape_data)
-
-        if default_value is None:
-            args = [a, b]
-            d = relay.sparse_to_dense(a, output_shape_const, b)
-        else:
-            c = relay.var(
-                "c", relay.TensorType(default_value_data.shape, str(default_value_data.dtype))
-            )
-            args = [a, b, c]
-            d = relay.sparse_to_dense(a, output_shape_const, b, c)
-
-        zz = run_infer_type(d)
-        assert len(zz.checked_type.shape) == len(output_shape)
-
-        func = relay.Function(args, d)
-
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-        assert isinstance(func2.body, relay.Call)
-        assert func2.body.op == relay.op.get("sparse_to_dense")
-
-        if default_value is None:
-            arguments = [sparse_indices_data, sparse_values_data]
-        else:
-            arguments = [sparse_indices_data, sparse_values_data, default_value_data]
-
-        verify_func(func2, arguments, xpected)
-
-    verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0])  # scalar
-    verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3])  # vector
-    verify_sparse_to_dense(
-        [[0, 0], [1, 2]], [1, 2], 0, [3, 4], [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]]
-    )  # nXd
-    verify_sparse_to_dense(
-        [[0, 0, 0], [1, 2, 3]],
-        [1, 2],
-        4,
-        [2, 3, 4],
-        [[[1, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]], [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 2]]],
-    )  # nXd
-    verify_sparse_to_dense(
-        [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]
-    )  # floats
-    verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])  # default value not specified
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_dynamic_rank():
-    def verify_full(fill_value, fill_shape, dtype):
-        x = relay.var("x", relay.scalar_type(dtype))
-        y = relay.var("y", relay.TensorType(fill_shape, "int64"))
-        shape = relay.shape_of(y)
-        shape = relay.strided_slice(shape, [0], relay.shape_of(shape))
-        z = relay.full(x, shape, dtype)
-
-        func = relay.Function([x, y], z)
-        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-        zz = func2.body
-        assert isinstance(zz, relay.Call)
-        assert zz.op == relay.op.get("full")
-
-        ref_res = np.full(fill_shape, fill_value).astype(dtype)
-        y_data = np.random.uniform(low=-1, high=1, size=fill_shape).astype("int64")
-        verify_func(func2, [fill_value, y_data], ref_res)
-
-    verify_full(4, (1, 2, 3, 4), "int32")
-    verify_full(4.0, (1, 2, 8, 10), "float32")
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_to_static_dynamic_if():
-    x = relay.var("x", relay.TensorType((2, 2), "int64"))
-    cond = relay.const(1)
-    iff = relay.If(cond, relay.reshape(x, [1, 4]), relay.reshape(x, (4, 1)))
-
-    func = relay.Function([x], iff)
-    func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
-
-    zz = func2.body
-    assert isinstance(zz, relay.Call)
-    assert zz.op == relay.op.get("reshape")
-    x_data = np.random.uniform(low=-1, high=1, size=(2, 2)).astype("int64")
-    verify_func(func2, [x_data], x_data.reshape(1, 4))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_eliminate_common_subexpr.py b/tests/python/relay/test_pass_eliminate_common_subexpr.py
deleted file mode 100644
index fd4bb0c9fbfa..000000000000
--- a/tests/python/relay/test_pass_eliminate_common_subexpr.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test eliminate common subexpr pass"""
-import numpy as np
-import tvm
-from tvm import te
-
-from tvm import relay
-from tvm.relay.op import register_alter_op_layout
-from tvm.relay import transform, analysis
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_simple():
-    def before():
-        x = relay.var("x", shape=(1, 16))
-        y1 = relay.nn.relu(x)
-        y2 = relay.nn.relu(x)
-        y1 = relay.add(y1, relay.const(1.0, "float32"))
-        y2 = relay.add(y2, relay.const(1.0, "float32"))
-        y = relay.add(y1, y2)
-        f = relay.Function([x], y)
-        return f
-
-    def expected():
-        x = relay.var("x", shape=(1, 16))
-        y = relay.nn.relu(x)
-        y = relay.add(y, relay.const(1.0, "float32"))
-        y = relay.add(y, y)
-        f = relay.Function([x], y)
-        return run_opt_pass(f, transform.InferType())
-
-    z = before()
-    z = run_opt_pass(z, transform.EliminateCommonSubexpr())
-    tvm.ir.assert_structural_equal(z, expected())
-
-
-def test_callback():
-    def before():
-        x = relay.var("x", shape=(1, 16))
-        y1 = relay.nn.relu(x)
-        y2 = relay.nn.relu(x)
-        y1 = relay.add(y1, relay.const(1.0, "float32"))
-        y2 = relay.add(y2, relay.const(1.0, "float32"))
-        y = relay.add(y1, y2)
-        f = relay.Function([x], y)
-        return f
-
-    def expected():
-        x = relay.var("x", shape=(1, 16))
-        y = relay.nn.relu(x)
-        y1 = relay.add(y, relay.const(1.0, "float32"))
-        y2 = relay.add(y, relay.const(1.0, "float32"))
-        y = relay.add(y1, y2)
-        f = relay.Function([x], y)
-        return run_opt_pass(f, transform.InferType())
-
-    def fskip(expr):
-        if isinstance(expr, relay.expr.Call) and expr.op.name == "add":
-            return True
-        return False
-
-    z = before()
-    z = run_opt_pass(z, transform.EliminateCommonSubexpr(fskip))
-    tvm.ir.assert_structural_equal(z, expected())
-
-
-def test_tuple_get_time():
-    def before():
-        x = relay.var("x", shape=(1, 16, 1, 1))
-        var = relay.var("var", shape=(16,))
-        mean = relay.var("mean", shape=(16,))
-        beta = relay.var("beta", shape=(16,))
-        gamma = relay.var("gamma", shape=(16,))
-        BN = relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)
-        T1 = BN[0]
-        T2 = BN[0]
-        add = T1 + T2
-        f = relay.Function([x, var, mean, beta, gamma], add)
-        return f
-
-    def expected():
-        x = relay.var("x", shape=(1, 16, 1, 1))
-        var = relay.var("var", shape=(16,))
-        mean = relay.var("mean", shape=(16,))
-        beta = relay.var("beta", shape=(16,))
-        gamma = relay.var("gamma", shape=(16,))
-        BN = relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon=1e-5)
-        T1 = BN[0]
-        add = T1 + T1
-        f = relay.Function([x, var, mean, beta, gamma], add)
-        return run_opt_pass(f, transform.InferType())
-
-    z = before()
-    z = run_opt_pass(z, transform.EliminateCommonSubexpr())
-    tvm.ir.assert_structural_equal(z, expected())
-
-
-def test_tuple_arg():
-    def before():
-        x = relay.var("x", shape=(1, 16))
-        y1 = relay.nn.relu(x)
-        y2 = relay.nn.relu(x)
-        y1 = relay.add(y1, relay.const(1.0, "float32"))
-        y2 = relay.add(y2, relay.const(1.0, "float32"))
-        c0 = relay.const(np.ones((1, 16)), "float32")
-        y1 = relay.concatenate([y1, c0], axis=0)
-        y2 = relay.concatenate([y2, c0], axis=0)
-        y = relay.add(y1, y2)
-        f = relay.Function([x], y)
-        return f
-
-    def expected():
-        x = relay.var("x", shape=(1, 16))
-        y = relay.nn.relu(x)
-        y = relay.add(y, relay.const(1.0, "float32"))
-        c0 = relay.const(np.ones((1, 16)), "float32")
-        y = relay.concatenate([y, c0], axis=0)
-        y = relay.add(y, y)
-        f = relay.Function([x], y)
-        return run_opt_pass(f, transform.InferType())
-
-    z = before()
-    z = run_opt_pass(z, transform.EliminateCommonSubexpr())
-    tvm.ir.assert_structural_equal(z, expected())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_eta_expand.py b/tests/python/relay/test_pass_eta_expand.py
deleted file mode 100644
index b1776cb801aa..000000000000
--- a/tests/python/relay/test_pass_eta_expand.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import relay
-import tvm.relay.transform as _transform
-
-
-def test_eta_expand_global_var():
-    mod = tvm.relay.fromtext(
-        r"""
-        #[version = "0.0.5"]
-        def @aux(%x: Tensor[(), int32]) -> Tensor[(), int32] {
-            %x
-        }
-        def @main() -> fn(Tensor[(), int32]) -> Tensor[(), int32] {
-            @aux
-        }
-    """
-    )
-    seq = tvm.transform.Sequential([_transform.EtaExpand(expand_global_var=True)])
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    expected = tvm.relay.fromtext(
-        r"""
-        #[version = "0.0.5"]
-        def @aux(%x: Tensor[(), int32]) -> Tensor[(), int32] {
-            %x
-        }
-        def @main() -> fn(Tensor[(), int32]) -> Tensor[(), int32] {
-            fn (%x: Tensor[(), int32]) -> Tensor[(), int32] {
-                @aux(%x)
-            }
-        }
-    """
-    )
-    tvm.ir.assert_structural_equal(mod["main"], expected["main"], map_free_vars=True)
-
-
-def test_eta_expand_constructor():
-    mod = tvm.relay.fromtext(
-        r"""
-        #[version = "0.0.5"]
-        type List[A] {
-            Cons(A, List[A]),
-            Nil,
-        }
-        def @main[A]() -> fn(A, List[A]) -> List[A] {
-            Cons
-        }
-    """
-    )
-    seq = tvm.transform.Sequential(
-        [_transform.EtaExpand(expand_constructor=True), _transform.InferType()]
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    expected = tvm.relay.fromtext(
-        r"""
-        #[version = "0.0.5"]
-        type List[A] {
-            Cons(A, List[A]),
-            Nil,
-        }
-        def @main[A]() -> fn(A, List[A]) -> List[A] {
-            fn [A](%x: A, %xs: List[A]) -> List[A] {
-                Cons(%x, %xs)
-            }
-        }
-    """
-    )
-    tvm.ir.assert_structural_equal(mod["main"], expected["main"], map_free_vars=True)
-
-
-if __name__ == "__main__":
-    test_eta_expand_global_var()
-    test_eta_expand_constructor()
diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py
deleted file mode 100644
index c0b61f72d1d3..000000000000
--- a/tests/python/relay/test_pass_fake_quantization_to_integer.py
+++ /dev/null
@@ -1,1180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-wildcard-import
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.transform import fake_quantization_to_integer
-
-
-def compare_fq_to_int(expr, args, allow_rounding_error=False):
-    mod = tvm.IRModule.from_expr(expr)
-    mod = tvm.relay.transform.InferType()(mod)
-    mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod)
-    assert not tvm.ir.structural_equal(mod, mod_int)
-    result = (
-        relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-    result_int = (
-        relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-
-    if allow_rounding_error:
-        assert np.all(np.abs(result.astype("int32") - result_int.astype("int32")) <= 1)
-    else:
-        assert np.array_equal(result, result_int)
-
-
-def test_fake_quantize_conv():
-    for out_dtype in ["int8", "uint8"]:
-        x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-        w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-        one = relay.const(1.0)
-        zero = relay.const(0)
-
-        op = relay.op.nn.conv2d(
-            relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-            relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-            kernel_size=[5, 5],
-        )
-        op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
-
-        x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-        w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, w_np])
-
-
-def test_fake_quantize_conv_per_channel():
-    for out_dtype in ["int8", "uint8"]:
-        x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-        w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-        one = relay.const([1.0] * 16)
-        zero_point = relay.const([np.random.randint(0, 255)] * 16)
-
-        op = relay.op.nn.conv2d(
-            relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0)),
-            relay.qnn.op.dequantize(
-                w, relay.const(np.random.random([16]).astype("float32")), zero_point, axis=0
-            ),
-            kernel_size=[5, 5],
-            channels=16,
-        )
-        op = relay.qnn.op.quantize(op, relay.const(1.0), relay.const(0), out_dtype=out_dtype)
-
-        x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-        w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True)
-
-
-def test_fake_quantize_transposeconv():
-    for out_dtype in ["int8", "uint8"]:
-        x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-        w = relay.var("w", shape=[3, 16, 5, 5], dtype="int8")
-        one = relay.const(1.0)
-        zero = relay.const(0)
-
-        op = relay.op.nn.conv2d_transpose(
-            relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-            relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-            kernel_size=[5, 5],
-            data_layout="NCHW",
-            kernel_layout="IOHW",
-        )
-        op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
-
-        x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-        w_np = np.random.randint(-128, 127, size=[3, 16, 5, 5], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, w_np])
-
-
-def test_fake_quantize_dense():
-    for out_dtype in ["int8", "uint8"]:
-        x = relay.var("x", shape=[128, 64], dtype="int8")
-        w = relay.var("w", shape=[256, 64], dtype="int8")
-        one = relay.const(1.0)
-        zero = relay.const(0)
-
-        op = relay.op.nn.dense(
-            relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-            relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-        )
-        op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
-
-        x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8")
-        w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, w_np])
-
-
-def test_fake_quantize_dense_per_channel():
-    for out_dtype in ["int8", "uint8"]:
-        x = relay.var("x", shape=[128, 64], dtype="int8")
-        w = relay.var("w", shape=[256, 64], dtype="int8")
-        one = relay.const(1.0)
-        zero = relay.const(0)
-
-        op = relay.op.nn.dense(
-            relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-            relay.qnn.op.dequantize(
-                w,
-                relay.const(np.random.random([256]).astype("float32")),
-                relay.const([0] * 256),
-                axis=0,
-            ),
-            units=256,
-        )
-        op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
-
-        x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8")
-        w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True)
-
-
-def test_fake_quantize_dense_bias():
-    out_dtype = "int8"
-    x = relay.var("x", shape=[128, 64], dtype="int8")
-    w = relay.var("w", shape=[256, 64], dtype="int8")
-    bias = relay.var("bias", shape=[256], dtype="int32")
-    one = relay.const(1.0)
-    zero = relay.const(0)
-    w_scale = np.random.random([256]).astype("float32")
-
-    op = relay.op.nn.dense(
-        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-        relay.qnn.op.dequantize(
-            w,
-            relay.const(w_scale),
-            zero,
-            axis=0,
-        ),
-        units=256,
-    )
-
-    op += relay.qnn.op.dequantize(
-        bias,
-        relay.const(2.0 * w_scale),
-        zero,
-    )
-
-    op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
-
-    x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8")
-    w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8")
-    bias_np = np.random.randint(-128, 127, size=[256], dtype="int32")
-
-    compare_fq_to_int(op, [x_np, w_np, bias_np], allow_rounding_error=True)
-
-
-def test_fake_quantize_batch_matmul():
-    for out_dtype in ["int8", "uint8"]:
-        x = relay.var("x", shape=[1, 128, 64], dtype="int8")
-        w = relay.var("w", shape=[1, 256, 64], dtype="int8")
-        one = relay.const(1.0)
-        zero = relay.const(0)
-
-        op = relay.op.nn.batch_matmul(
-            relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-            relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-        )
-        op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
-
-        x_np = np.random.randint(-128, 127, size=[1, 128, 64], dtype="int8")
-        w_np = np.random.randint(-128, 127, size=[1, 256, 64], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, w_np])
-
-
-def test_fake_transpose_quantize_conv():
-    x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8")
-    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    one = relay.const(1.0)
-    zero = relay.const(0)
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    x = relay.transpose(x, [0, 3, 1, 2])
-    op = relay.op.nn.conv2d(
-        x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5]
-    )
-    op = relay.qnn.op.quantize(op, one, zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8")
-    w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
-
-    compare_fq_to_int(op, [x_np, w_np])
-
-
-@pytest.mark.parametrize("const_bias", [False, True])
-def test_fake_transpose_quantize_conv_bias_add(const_bias):
-    x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8")
-    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    one = relay.const(1.0)
-    zero = relay.const(0)
-    if const_bias:
-        bias = relay.const(np.random.random(16).astype("float32"))
-    else:
-        bias = relay.qnn.op.dequantize(relay.var("bias", shape=[16], dtype="int32"), one, zero)
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    x = relay.transpose(x, [0, 3, 1, 2])
-    op = relay.op.nn.conv2d(
-        x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5]
-    )
-    op = relay.op.nn.bias_add(op, bias)
-    op = relay.qnn.op.quantize(op, one, zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8")
-    w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
-    bias_np = np.random.randint(-32768, 32767, size=[16], dtype="int32")
-    args = [x_np, w_np]
-
-    if not const_bias:
-        args.append(bias_np)
-    compare_fq_to_int(op, args)
-
-
-def test_fake_transpose_quantize_conv_bias_add_per_channel():
-    x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8")
-    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    bias = relay.var("bias", shape=[16], dtype="int32")
-    one = relay.const(1.0)
-    zero = relay.const(0)
-    w_scale = (np.random.random([16]).astype("float32") - 0.5) / 10 + 0.5
-    noise = (np.random.random([16]).astype("float32") - 0.5) * 1e-15
-    w_zp = relay.const([0] * 16)
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    x = relay.transpose(x, [0, 3, 1, 2])
-    op = relay.op.nn.conv2d(
-        x, relay.qnn.op.dequantize(w, relay.const(w_scale), w_zp, axis=0), kernel_size=[5, 5]
-    )
-    op = relay.op.nn.bias_add(
-        op, relay.qnn.op.dequantize(bias, relay.const(2.0 * w_scale + noise), w_zp, axis=0)
-    )
-    op = relay.qnn.op.quantize(op, one, zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8")
-    w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
-    bias_np = np.random.randint(-32768, 32767, size=[16], dtype="int32")
-
-    compare_fq_to_int(op, [x_np, w_np, bias_np], allow_rounding_error=True)
-
-
-def test_fake_transpose_quantize_conv_bias_add_mismatch():
-    x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8")
-    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    bias = relay.var("bias", shape=[16], dtype="int32")
-    one = relay.const(1.0)
-    two = relay.const(2.0)
-    zero = relay.const(0)
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    x = relay.transpose(x, [0, 3, 1, 2])
-    op = relay.op.nn.conv2d(
-        x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5]
-    )
-    op = relay.op.nn.bias_add(op, relay.qnn.op.dequantize(bias, two, zero))
-    op = relay.qnn.op.quantize(op, one, zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8")
-    w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
-    bias_np = np.random.randint(-32768, 32767, size=[16], dtype="int32")
-
-    compare_fq_to_int(op, [x_np, w_np, bias_np])
-
-
-def test_fake_quantize_maxpool():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.nn.max_pool2d(x, [3, 3])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-@pytest.mark.parametrize("output_size", [None, 1])
-def test_fake_quantize_adaptive_avgpool1d(output_size):
-    x = relay.var("x", shape=[1, 128, 768], dtype="int8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12))
-    op = relay.op.nn.adaptive_avg_pool1d(x, output_size)
-    op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10))
-
-    x_np = np.random.randint(-128, 127, size=[1, 128, 768], dtype="int8")
-
-    compare_fq_to_int(op, [x_np], True)
-
-
-def test_fake_quantize_avgpool():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12))
-    op = relay.op.nn.avg_pool2d(x, [3, 3])
-    op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10))
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np], True)
-
-
-def test_fake_quantize_global_avg_pool():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12))
-    op = relay.op.nn.global_avg_pool2d(x)
-    op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10))
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np], True)
-
-
-class TestUnaryQNNOp:
-    def helper_test_fake_quantize_unary_op(self, fp32_op, pos_values=False):
-        for dtype in ["int8", "uint8"]:
-            x = relay.var("x", shape=[1, 3, 3, 3], dtype=dtype)
-
-            zero = -128 if dtype == "int8" else 0
-            if pos_values:
-                # Use a positive range for quanitzed ops that only work on positive values
-                input_mid_point = relay.const(zero)
-                output_mid_point = relay.const(zero)
-            else:
-                input_mid_point = relay.const(np.random.randint(0, 255) + zero)
-                output_mid_point = relay.const(np.random.randint(0, 255) + zero)
-
-            input_scale = relay.const(np.random.rand())
-            output_scale = relay.const(np.random.rand())
-
-            x = relay.qnn.op.dequantize(x, input_scale, input_mid_point)
-            op = fp32_op(x)
-
-            op = relay.qnn.op.quantize(op, output_scale, output_mid_point, out_dtype=dtype)
-
-            x_np = np.random.randint(0 + zero, 255 + zero, size=[1, 3, 3, 3], dtype=dtype)
-
-            compare_fq_to_int(op, [x_np], True)
-
-    def test_sqrt(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.sqrt, pos_values=True)
-
-    def test_rsqrt(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.rsqrt, pos_values=True)
-
-    def test_exp(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.exp)
-
-    def test_erf(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.erf)
-
-    def test_sigmoid(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.sigmoid)
-
-    def test_tanh(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.tanh)
-
-    def test_log(self):
-        self.helper_test_fake_quantize_unary_op(fp32_op=relay.log, pos_values=True)
-
-
-def test_fake_quantize_reshape():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.reshape(x, [1, 3, -1])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_image_resize_bilinear():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.image.resize2d(x, size=[4, 4], method="linear")
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np], allow_rounding_error=True)
-
-
-def test_fake_quantize_abs():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.abs(x)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_expand_dims():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.expand_dims(x, axis=1)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_squeeze():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.squeeze(x, axis=[0])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_strided_slice():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.strided_slice(x, begin=[0, 0, 0, 0], end=[1, 1, 112, 112])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_split():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.split(x, axis=3, indices_or_sections=2)
-    op = relay.qnn.op.quantize(op[0], relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-    op = relay.op.split(x, axis=3, indices_or_sections=[56, 112, 168])
-    op = relay.qnn.op.quantize(op[1], relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_batch_flatten():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.nn.batch_flatten(x)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_transpose_reshape():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.transpose(x, [1, 0, 2, 3])
-    op = relay.op.reshape(op, [3, -1])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_concat():
-    zero = relay.const(0)
-    inputs = []
-    for i in range(4):
-        inputs.append(
-            relay.qnn.op.dequantize(
-                relay.var("x%d" % i, shape=[1, 4], dtype="int8"), relay.const(i + 0.5), zero
-            )
-        )
-    concat = relay.op.concatenate(inputs, axis=1)
-    out = relay.qnn.op.quantize(concat, relay.const(3.5), zero)
-
-    inputs_np = []
-    for i in range(4):
-        inputs_np.append(np.random.randint(-128, 127, size=[1, 4], dtype="int8"))
-
-    compare_fq_to_int(out, inputs_np)
-
-
-@pytest.mark.parametrize("k", [0, 1, 5])
-@pytest.mark.parametrize("axis", [0, -1, 1])
-@pytest.mark.parametrize("is_ascend", [True, False])
-@pytest.mark.parametrize("dtype", ["int8", "uint8"])
-def test_fake_quantize_topk(k, axis, is_ascend, dtype):
-    x = relay.var("x", shape=[20, 100], dtype=dtype)
-    zero = relay.const(0)
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.topk(x, k, axis, "values", is_ascend, "float32")
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero, out_dtype=dtype)
-    x_np = np.random.randint(0, 127, size=[20, 100], dtype=dtype)
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_clip():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114))
-    op = relay.op.clip(x, 0, 6)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8")
-
-    x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_clip_per_channel():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8")
-
-    x = relay.qnn.op.dequantize(
-        x, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), axis=1
-    )
-    op = relay.op.clip(x, 0, 6)
-    op = relay.qnn.op.quantize(
-        op, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), out_dtype="uint8", axis=1
-    )
-
-    x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_relu():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114))
-    op = relay.op.nn.relu(x)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8")
-
-    x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_mean():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114))
-    op = relay.op.mean(x)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8")
-
-    x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8")
-
-    compare_fq_to_int(op, [x_np], allow_rounding_error=True)
-
-
-def test_fake_quantize_relu_per_channel():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8")
-
-    x = relay.qnn.op.dequantize(
-        x, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), axis=1
-    )
-    op = relay.op.nn.relu(x)
-    op = relay.qnn.op.quantize(
-        op, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), out_dtype="uint8", axis=1
-    )
-
-    x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_leaky_relu():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8")
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114))
-    op = relay.op.nn.leaky_relu(x, 0.1)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8")
-
-    x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8")
-
-    compare_fq_to_int(op, [x_np], True)
-
-
-@pytest.mark.parametrize(
-    "operator",
-    [relay.op.add, relay.op.multiply, relay.op.subtract, relay.op.minimum, relay.op.maximum],
-)
-def test_fake_quantize_binary(operator):
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-    x = relay.qnn.op.dequantize(x, relay.const(0.1), relay.const(0))
-
-    y = relay.var("y", shape=[1, 3, 224, 224], dtype="int8")
-    y = relay.qnn.op.dequantize(y, relay.const(0.2), relay.const(0))
-
-    op = operator(x, y)
-    if operator == relay.op.multiply:
-        out_scale = relay.const(20.0)
-    else:
-        out_scale = relay.const(0.1)
-
-    op = relay.qnn.op.quantize(op, out_scale, relay.const(0), out_dtype="int8")
-
-    x_np = np.random.randint(-25, 25, size=[1, 3, 224, 224], dtype="int8")
-    y_np = np.random.randint(-25, 25, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np, y_np])
-
-
-@pytest.mark.parametrize(
-    "operator",
-    [relay.op.add, relay.op.multiply, relay.op.subtract, relay.op.minimum, relay.op.maximum],
-)
-def test_fake_quantize_binary_per_channel(operator):
-    def verify_binary_per_channel(lhs_scale, rhs_scale, lhs_zp, rhs_zp, out_zp, lhs_axis, rhs_axis):
-        if operator == relay.op.multiply:
-            out_scale = relay.const(2.0)
-            rhs_axis = lhs_axis  # TODO: Support different axes for per-channel quantized multiply
-        else:
-            out_scale = relay.const(0.1)
-
-        x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-        x = relay.qnn.op.dequantize(x, relay.const(lhs_scale), relay.const(lhs_zp), axis=lhs_axis)
-
-        y = relay.var("y", shape=[1, 3, 224, 224], dtype="int8")
-        y = relay.qnn.op.dequantize(y, relay.const(rhs_scale), relay.const(rhs_zp), axis=rhs_axis)
-
-        op = operator(x, y)
-
-        op = relay.qnn.op.quantize(op, out_scale, relay.const(out_zp), out_dtype="int8")
-        x_np = np.random.randint(-25, 25, size=[1, 3, 224, 224], dtype="int8")
-        y_np = np.random.randint(-25, 25, size=[1, 3, 224, 224], dtype="int8")
-
-        compare_fq_to_int(op, [x_np, y_np], allow_rounding_error=True)
-
-    # Same axis
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 3),
-        rhs_scale=np.random.uniform(1.0, 5.0, 3),
-        lhs_zp=0,
-        rhs_zp=0,
-        out_zp=0,
-        lhs_axis=1,
-        rhs_axis=1,
-    )
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 3),
-        rhs_scale=np.random.uniform(1.0, 5.0, 3),
-        lhs_zp=np.random.randint(1, 3),
-        rhs_zp=np.random.randint(1, 3),
-        out_zp=0,
-        lhs_axis=1,
-        rhs_axis=1,
-    )
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 3),
-        rhs_scale=np.random.uniform(1.0, 5.0, 3),
-        lhs_zp=np.random.randint(1, 3),
-        rhs_zp=np.random.randint(1, 3),
-        out_zp=np.random.randint(1, 3),
-        lhs_axis=1,
-        rhs_axis=1,
-    )
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 224),
-        rhs_scale=np.random.uniform(1.0, 5.0, 224),
-        lhs_zp=np.random.randint(1, 3),
-        rhs_zp=np.random.randint(1, 3),
-        out_zp=np.random.randint(1, 3),
-        lhs_axis=-1,
-        rhs_axis=-1,
-    )
-
-    # Different axes
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 224),
-        rhs_scale=np.random.uniform(1.0, 5.0, 224),
-        lhs_zp=0,
-        rhs_zp=0,
-        out_zp=0,
-        lhs_axis=2,
-        rhs_axis=3,
-    )
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 224),
-        rhs_scale=np.random.uniform(1.0, 5.0, 224),
-        lhs_zp=np.random.randint(1, 3),
-        rhs_zp=np.random.randint(1, 3),
-        out_zp=0,
-        lhs_axis=2,
-        rhs_axis=3,
-    )
-    verify_binary_per_channel(
-        lhs_scale=np.random.uniform(1.0, 5.0, 224),
-        rhs_scale=np.random.uniform(1.0, 5.0, 224),
-        lhs_zp=np.random.randint(1, 3),
-        rhs_zp=np.random.randint(1, 3),
-        out_zp=np.random.randint(1, 3),
-        lhs_axis=2,
-        rhs_axis=3,
-    )
-
-
-@pytest.mark.parametrize(
-    "operator",
-    [
-        relay.op.add,
-        relay.op.multiply,
-        relay.op.subtract,
-        relay.op.minimum,
-        relay.op.maximum,
-    ],
-)
-def test_fake_quantize_binary_const(operator):
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-    x = relay.qnn.op.dequantize(x, relay.const(0.1), relay.const(10))
-
-    y = relay.const(1.0)
-
-    op = operator(x, y)
-    op = relay.qnn.op.quantize(op, relay.const(0.1), relay.const(10), out_dtype="int8")
-
-    x_np = np.random.randint(-25, 25, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_subtract_different_output_zp():
-    for dtype in ["uint8"]:
-        x = relay.var("x", shape=[1, 128, 128, 3], dtype=dtype)
-        x = relay.qnn.op.dequantize(x, relay.const(0.1), relay.const(0), axis=1)
-
-        y = relay.const(0.5)
-
-        op = relay.subtract(x, y)
-        op = relay.transpose(op, axes=[0, 3, 1, 2])
-        op = relay.qnn.op.quantize(op, relay.const(0.2), relay.const(128), out_dtype=dtype, axis=1)
-
-        x_np = np.random.randint(0, 255, size=[1, 128, 128, 3], dtype=dtype)
-
-        compare_fq_to_int(op, [x_np], True)
-
-
-def test_fake_quantize_pad():
-    x = relay.var("x", shape=[1, 383, 128], dtype="int8")
-    x = relay.qnn.op.dequantize(x, relay.const(1.0), relay.const(10))
-    op = relay.op.nn.pad(x, [[0, 0], [0, 1], [0, 0]], 0.0)
-    op = relay.qnn.op.quantize(op, relay.const(1.0), relay.const(10), out_dtype="int8")
-
-    x_np = np.random.randint(-25, 25, size=[1, 383, 128], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_pad_with_float_min():
-    in_shape = [1, 383, 128]
-    x = relay.var("x", shape=in_shape, dtype="float32")
-    op = relay.qnn.quantize(x, relay.const(1.0), relay.const(0), out_dtype="uint8")
-    op = relay.qnn.dequantize(op, relay.const(1.0), relay.const(0), out_dtype="float32")
-    op = relay.op.nn.pad(
-        op, pad_width=[[0, 0], [0, 1], [0, 0]], pad_value=relay.const(-3.40282e38, dtype="float32")
-    )
-    op = relay.qnn.op.quantize(op, relay.const(1.0), relay.const(0), out_dtype="uint8")
-    x_np = np.random.randint(0, 256, size=in_shape)
-    x_as_float = x_np.astype("float32")
-    compare_fq_to_int(op, [x_as_float], True)
-
-
-def test_fake_quantize_depth_to_space():
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-    op = relay.op.nn.depth_to_space(x, 4)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_max_min():
-    def run_test_case(partial_func):
-        x = relay.var("x", shape=[1, 3, 10, 10], dtype="int8")
-
-        zero = relay.const(0)
-        x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
-        # To be a little more realistic since max/min will rarely be by themselves
-        x = relay.op.nn.depth_to_space(x, 4)
-        op = partial_func(x)
-        op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
-
-        x_np = np.random.randint(-128, 127, size=[1, 3, 10, 10], dtype="int8")
-        compare_fq_to_int(op, [x_np])
-
-    run_test_case(relay.op.max)
-    run_test_case(relay.op.min)
-
-    # Test forwarding kwargs works
-    run_test_case(lambda x: relay.op.max(x, axis=1))
-    run_test_case(lambda x: relay.op.min(x, axis=1))
-
-
-def test_fq_avg_pool_conv2d():
-    dtype = "uint8"
-    shape_x = [1, 4, 24, 24]
-    shape_w = [8, 4, 1, 1]
-    x = relay.var("x", shape=shape_x, dtype=dtype)
-    w = relay.var("w", shape=shape_w, dtype=dtype)
-    zero = relay.const(0)
-    one = relay.const(1.0)
-
-    # Tested expression.
-    op0 = relay.qnn.op.dequantize(x, relay.const(0.64), relay.const(2))
-    op1 = relay.op.nn.avg_pool2d(op0, [3, 3])
-    op2 = relay.qnn.op.dequantize(w, relay.const(0.5), relay.const(10))
-    op3 = relay.op.nn.conv2d(op1, op2, kernel_size=[1, 1])
-    expr = relay.qnn.op.quantize(op3, one, zero, out_dtype="uint8")
-
-    x_np = np.random.randint(0, 255, size=shape_x, dtype=dtype)
-    w_np = np.random.randint(0, 255, size=shape_w, dtype=dtype)
-    compare_fq_to_int(expr, [x_np, w_np])
-
-
-def test_fq_hard_fail():
-    @tvm.ir.register_op_attr("nn.conv2d", "FTVMFakeQuantizationToInteger", level=11)
-    def conv2d(expr, type_map):  # pylint: disable=unused-variable
-        raise NotImplementedError
-
-    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
-    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    one = relay.const(1.0)
-    zero = relay.const(0)
-
-    op = relay.op.nn.conv2d(
-        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
-        relay.qnn.op.dequantize(w, relay.const(0.5), zero),
-        kernel_size=[5, 5],
-    )
-    op = relay.qnn.op.quantize(op, one, zero, out_dtype="int8")
-    mod = tvm.IRModule.from_expr(op)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    mod_int = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=False)(mod)
-    tvm.ir.assert_structural_equal(mod_int, mod)
-    # Catch a generic exception because the tvm FFI eats the python exception type
-    with pytest.raises(Exception):
-        mod_int = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=True)(mod)
-
-
-def compare_expected_fq_qat_to_int(expr, expected_expr, args, allow_rounding_error=False):
-    mod = tvm.IRModule.from_expr(expr)
-    mod_def = tvm.relay.transform.InferType()(mod)
-    mod_int = tvm.relay.transform.FakeQuantizationToInteger(False, True)(mod_def)
-    mod_exp = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(expected_expr))
-    assert not tvm.ir.structural_equal(mod, mod_int)
-    tvm.ir.assert_structural_equal(mod_int, mod_exp)
-    result_def = (
-        relay.create_executor("vm", mod=mod_def, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-    result_int = (
-        relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-    result_exp = (
-        relay.create_executor("vm", mod=mod_exp, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-    if allow_rounding_error:
-        assert np.all(np.abs(result_def.astype("int32") - result_int.astype("int32")) <= 1)
-    else:
-        assert np.array_equal(result_def, result_int)
-
-    assert np.array_equal(result_int, result_exp)
-
-
-def test_fq_qat_op_positive_part():
-    # Only the first operation is converted, since the next operation("add") is not enabled.
-    shape_x = [1, 4, 2]
-    shape_w = [1, 4, 2]
-    a = relay.var("a", shape=shape_x, dtype="int8")
-    b = relay.var("b", shape=shape_w, dtype="int8")
-
-    op0 = relay.qnn.op.dequantize(a, relay.const(2.0), relay.const(0))
-    op1 = relay.qnn.op.dequantize(b, relay.const(6.0), relay.const(0))
-    op2 = relay.op.nn.batch_matmul(op0, op1)
-    op3 = relay.op.add(op2, relay.const(1.0))
-    expr = relay.op.erf(op3)
-
-    op0 = relay.qnn.op.qnn.batch_matmul(
-        a, b, relay.const(0), relay.const(0), relay.const(2.0), relay.const(6.0)
-    )
-    op1 = relay.qnn.op.qnn.dequantize(op0, relay.const(12.0), relay.const(0))
-    op2 = relay.op.add(op1, relay.const(1.0))
-    expected_expr = relay.op.erf(op2)
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np, w_np])
-
-
-def test_fq_qat_negative_all():
-    # None of the operations are converted, since the first operation("add") is not enabled.
-    shape_x = [1, 4, 2]
-    shape_w = [1, 4, 2]
-    a = relay.var("a", shape=shape_x, dtype="int8")
-    b = relay.var("b", shape=shape_w, dtype="int8")
-
-    op0 = relay.qnn.op.dequantize(a, relay.const(2.0), relay.const(0))
-    op1 = relay.qnn.op.dequantize(b, relay.const(6.0), relay.const(0))
-    op2 = relay.op.add(op1, relay.const(1.0))
-    op3 = relay.op.nn.batch_matmul(op0, op2)
-    expr = relay.op.erf(op3)
-
-    expected_expr = expr
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np, w_np])
-
-
-def test_fq_qat_positive_single():
-    # The single operation is converted.
-    shape_x = [1, 4, 2]
-    shape_w = [1, 4, 2]
-    a = relay.var("a", shape=shape_x, dtype="int8")
-    b = relay.var("b", shape=shape_w, dtype="int8")
-
-    op0 = relay.qnn.op.dequantize(a, relay.const(2.0), relay.const(0))
-    op1 = relay.qnn.op.dequantize(b, relay.const(6.0), relay.const(0))
-    expr = relay.op.nn.batch_matmul(op0, op1)
-
-    op0 = relay.qnn.op.qnn.batch_matmul(
-        a, b, relay.const(0), relay.const(0), relay.const(2.0), relay.const(6.0)
-    )
-    expected_expr = relay.qnn.op.qnn.dequantize(op0, relay.const(12.0), relay.const(0))
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np, w_np])
-
-
-def test_fq_qat_positive_nothing_to_do():
-    # All operations are converted by the non-QAT pass.
-    shape_x = [1, 4, 2]
-    shape_w = [1, 4, 2]
-    a = relay.var("a", shape=shape_x, dtype="int8")
-    b = relay.var("b", shape=shape_w, dtype="int8")
-
-    op0 = relay.qnn.op.dequantize(a, relay.const(2.0), relay.const(0))
-    op1 = relay.qnn.op.dequantize(b, relay.const(6.0), relay.const(0))
-    op2 = relay.op.nn.batch_matmul(op0, op1)
-    op3 = relay.op.add(op2, relay.const(1.0))
-    expr = relay.qnn.op.quantize(op3, relay.const(1.0), relay.const(0), out_dtype="int8")
-
-    op0 = relay.qnn.op.batch_matmul(
-        a, b, relay.const(0), relay.const(0), relay.const(2.0), relay.const(6.0)
-    )
-    op1 = relay.qnn.op.quantize(
-        relay.const(1.0), relay.const(12.0), relay.const(0), out_dtype="int32"
-    )
-    op2 = relay.op.add(
-        op0,
-        op1,
-    )
-    expected_expr = relay.qnn.op.requantize(
-        op2, relay.const(12.0), relay.const(0), relay.const(1.0), relay.const(0), out_dtype="int8"
-    )
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np, w_np])
-
-
-def test_fq_qat_positive_couple():
-    # Several consecutive operations are converted.
-    shape_x = [1, 2, 4]
-    shape_w = [2]
-    a = relay.var("a", shape=shape_x, dtype="int8")
-    b = relay.var("b", shape=shape_w, dtype="int8")
-
-    op0 = relay.qnn.op.dequantize(a, relay.const(2.0), relay.const(0))
-    op1 = relay.qnn.op.dequantize(b, relay.const(6.0), relay.const(0))
-    op2 = relay.op.reshape(op0, (1, 4, 2))
-    op3 = relay.op.broadcast_to(op1, (2, 2, 2))
-    op4 = relay.op.nn.batch_matmul(op2, op3)
-    expr = relay.op.erf(op4)
-
-    op0 = relay.op.reshape(a, (1, 4, 2))
-    op1 = relay.op.broadcast_to(b, (2, 2, 2))
-    op3 = relay.qnn.op.qnn.batch_matmul(
-        op0, op1, relay.const(0), relay.const(0), relay.const(2.0), relay.const(6.0)
-    )
-    op4 = relay.qnn.op.qnn.dequantize(op3, relay.const(12.0), relay.const(0))
-    expected_expr = relay.op.erf(op4)
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np, w_np])
-
-
-def test_fq_positive_single_arg_part():
-    # The single-argument operation is converted.
-    shape_x = [1, 2, 4]
-    a = relay.var("a", shape=shape_x, dtype="int8")
-
-    op0 = relay.qnn.op.dequantize(a, relay.const(2.0), relay.const(0))
-
-    op1 = relay.op.reshape(op0, (1, 4, 2))
-    expr = relay.op.erf(op1)
-
-    op0 = relay.op.reshape(a, (1, 4, 2))
-    op1 = relay.qnn.op.dequantize(op0, relay.const(2.0), relay.const(0))
-    expected_expr = relay.op.erf(op1)
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np])
-
-
-def test_fq_qat_intermediate_infertype():
-    # Complex conversion of non-QAT and QAT passes that form FakeQuantizationToInteger.
-    shape_x = [1, 2, 4]
-    x = relay.var("x", shape=shape_x, dtype="float32")
-    const_0 = relay.const(np.random.uniform(size=[1, 4, 2]).astype("float32"))
-
-    op0 = relay.qnn.op.quantize(x, relay.const(17.0), relay.const(0), out_dtype="int8")
-    op1 = relay.qnn.op.dequantize(op0, relay.const(17.0), relay.const(0))
-    op2 = relay.op.reshape(op1, (1, 4, 2))
-    op3 = relay.qnn.op.quantize(op2, relay.const(10.0), relay.const(0), out_dtype="int8")
-    op4 = relay.qnn.op.quantize(const_0, relay.const(1.0), relay.const(8), out_dtype="int8")
-    op5 = relay.qnn.op.dequantize(op3, relay.const(10.0), relay.const(0))
-    op6 = relay.qnn.op.dequantize(op4, relay.const(4.0), relay.const(9))
-    op7 = relay.op.nn.batch_matmul(op5, op6)
-    expr = relay.op.add(op7, relay.const(5.0))
-
-    op0 = relay.qnn.op.quantize(x, relay.const(17.0), relay.const(0), out_dtype="int8")
-    op1 = relay.op.reshape(op0, (1, 4, 2))
-    op2 = relay.qnn.op.requantize(
-        op1, relay.const(17.0), relay.const(0), relay.const(10.0), relay.const(0), out_dtype="int8"
-    )
-    op3 = relay.qnn.op.quantize(const_0, relay.const(1.0), relay.const(8), out_dtype="int8")
-    op4 = relay.qnn.op.batch_matmul(
-        op2, op3, relay.const(0), relay.const(9), relay.const(10.0), relay.const(4.0)
-    )
-    op5 = relay.qnn.op.dequantize(op4, relay.const(40.0), relay.const(0))
-    expected_expr = relay.op.add(op5, relay.const(5.0))
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int32").astype("float32")
-    compare_expected_fq_qat_to_int(expr, expected_expr, [x_np])
-
-
-def test_fake_quantize_take():
-    x = relay.var("x", shape=[33, 11], dtype="int8")
-    indices_np = np.random.randint(0, 33, size=[37], dtype="int32")
-    indices = relay.const(indices_np)
-
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114))
-    op = relay.op.take(x, indices, axis=0)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8")
-
-    x_np = np.random.randint(-25, 25, size=[33, 11], dtype="int8")
-
-    compare_fq_to_int(op, [x_np])
-
-
-def test_fake_quantize_softmax():
-    shape = [5, 10]
-    x_ = relay.var("x", shape=shape, dtype="int8")
-
-    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
-
-    for scale in [1.0, 0.1, 0.01]:
-        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
-        op = relay.op.nn.softmax(x, axis=1)
-        op = relay.qnn.op.quantize(
-            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
-        )
-
-        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
-        x_np = np.sort(x_np)
-        args = [x_np]
-
-        mod = tvm.IRModule.from_expr(op)
-        mod = tvm.relay.transform.InferType()(mod)
-        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
-            hard_fail=True, optional_qnn_ops=["nn.softmax"]
-        )(mod)
-        assert not tvm.ir.structural_equal(mod, mod_int)
-
-        result = (
-            relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
-            .evaluate()(*args)
-            .numpy()
-        )
-        result_int = (
-            relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
-            .evaluate()(*args)
-            .numpy()
-        )
-
-        # Check at least the softmax output is in ascending order,
-        # since it is difficult to use allclose due to not-so-good accuracy.
-        for qdq, qop in zip(result, result_int):
-            assert is_sorted(qdq)
-            assert is_sorted(qop)
-
-        try:
-            np.testing.assert_allclose(result_int, result, atol=1)
-        except AssertionError as e:
-            # To see the difference
-            print(e)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_fast_math.py b/tests/python/relay/test_pass_fast_math.py
deleted file mode 100644
index f63b6ce0f23e..000000000000
--- a/tests/python/relay/test_pass_fast_math.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm.ir import IRModule
-from tvm import relay
-from tvm.relay.transform import FastMath
-
-
-def test_exp():
-    x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-    y = relay.exp(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule.from_expr(func)
-
-    fast_mod = FastMath()(mod)
-    assert "fast_exp" in fast_mod.astext()
-
-    # Check that FastMath option works for relay.build.
-    with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
-        fast_mod = relay.optimize(mod, target="llvm", params=None)
-    assert "fast_exp" in fast_mod[0].astext()
-
-
-def test_tanh():
-    x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-    y = relay.tanh(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule.from_expr(func)
-
-    fast_mod = FastMath()(mod)
-    assert "fast_tanh" in fast_mod.astext()
-
-    # Check that FastMath option works for relay.build.
-    with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
-        fast_mod = relay.optimize(mod, target="llvm", params=None)
-    assert "fast_tanh" in fast_mod[0].astext()
-
-
-def test_erf():
-    x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-    y = relay.erf(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule.from_expr(func)
-
-    fast_mod = FastMath()(mod)
-    assert "fast_erf" in fast_mod.astext()
-
-    # Check that FastMath option works for relay.build.
-    with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
-        fast_mod = relay.optimize(mod, target="llvm", params=None)
-    assert "fast_erf" in fast_mod[0].astext()
-
-
-def test_softmax():
-    x = relay.var("x", shape=(1, 16), dtype="float32")
-    y = relay.nn.softmax(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule.from_expr(func)
-
-    with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
-        fast_mod = relay.optimize(mod, target="llvm")
-    assert "nn.fast_softmax" in fast_mod[0].astext()
-
-
-if __name__ == "__main__":
-    test_exp()
-    test_tanh()
-    test_erf()
-    test_softmax()
diff --git a/tests/python/relay/test_pass_flatten_atrous_conv.py b/tests/python/relay/test_pass_flatten_atrous_conv.py
deleted file mode 100644
index 37b69a426df2..000000000000
--- a/tests/python/relay/test_pass_flatten_atrous_conv.py
+++ /dev/null
@@ -1,471 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-wildcard-import
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.contrib import graph_executor
-
-
-def compare_expected_fac(expr, expected_expr, args):
-    mod_def = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(expr))
-    mod_flat = tvm.relay.transform.FlattenAtrousConv()(mod_def)
-    mod_exp = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(expected_expr))
-
-    assert expr is expected_expr or not tvm.ir.structural_equal(mod_def, mod_flat)
-    tvm.ir.assert_structural_equal(mod_flat, mod_exp)
-
-    result_def = (
-        relay.create_executor("vm", mod=mod_def, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-    result_flat = (
-        relay.create_executor("vm", mod=mod_flat, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-    result_exp = (
-        relay.create_executor("vm", mod=mod_exp, device=tvm.cpu(), target="llvm")
-        .evaluate()(*args)
-        .numpy()
-    )
-
-    assert np.array_equal(result_def, result_flat)
-    assert np.array_equal(result_flat, result_exp)
-
-
-def test_fac_block_shape_2():
-    # pattern entry with block_shape=[2, 2]
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op2 = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op2, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    expected_expr = relay.nn.conv2d(
-        data,
-        weight,
-        padding=[2, 2, 2, 2],
-        dilation=[2, 2],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_block_shape_4():
-    # pattern entry with block_shape=[4, 4]
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[4, 4], paddings=[[4, 7], [4, 7]])
-    op2 = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op2, block_shape=[4, 4], crops=[[0, 3], [0, 3]])
-
-    expected_expr = relay.nn.conv2d(
-        data,
-        weight,
-        padding=[4, 4, 4, 4],
-        dilation=[4, 4],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_quantize():
-    # quantize pattern entry
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="int8")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op2 = relay.qnn.op.conv2d(
-        op1,
-        weight,
-        input_zero_point=relay.const(0),
-        kernel_zero_point=relay.const(0),
-        input_scale=relay.const(2.0),
-        kernel_scale=relay.const(1.0),
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op2, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    expected_expr = relay.qnn.op.conv2d(
-        data,
-        weight,
-        input_zero_point=relay.const(0),
-        kernel_zero_point=relay.const(0),
-        input_scale=relay.const(2.0),
-        kernel_scale=relay.const(1.0),
-        padding=[2, 2, 2, 2],
-        dilation=[2, 2],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_surrounding():
-    # pattern entry with surrounding operations add
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op0 = relay.op.add(data, relay.const(1.0))
-    op1 = relay.nn.space_to_batch_nd(op0, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op2 = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    op3 = relay.nn.batch_to_space_nd(op2, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-    expr = relay.op.add(op3, relay.const(-1.0))
-
-    op0 = relay.op.add(data, relay.const(1.0))
-    op1 = relay.nn.conv2d(
-        op0,
-        weight,
-        padding=[2, 2, 2, 2],
-        dilation=[2, 2],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expected_expr = relay.op.add(op1, relay.const(-1.0))
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_several():
-    # several pattern entries
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op2 = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    op3 = relay.nn.batch_to_space_nd(op2, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-    op4 = relay.nn.space_to_batch_nd(op3, block_shape=[4, 4], paddings=[[4, 7], [4, 7]])
-    op5 = relay.nn.conv2d(
-        op4,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op5, block_shape=[4, 4], crops=[[0, 3], [0, 3]])
-
-    op1 = relay.nn.conv2d(
-        data,
-        weight,
-        padding=[2, 2, 2, 2],
-        dilation=[2, 2],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    expected_expr = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[4, 4, 4, 4],
-        dilation=[4, 4],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test__fac_only_s2b_conv():
-    # negative case, only operations space_to_batch_nd-conv2d
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    expr = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-
-    expected_expr = expr
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_only_s2b():
-    # negative case, only operation space_to_batch_nd
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    expr = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-
-    expected_expr = expr
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_only_conv_b2s():
-    # negative case, only operations conv2d-batch_to_space_nd
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.conv2d(
-        data,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op1, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    expected_expr = expr
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_only_b2s():
-    # negative case, only operation batch_to_space_nd
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    expr = relay.nn.batch_to_space_nd(data, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    expected_expr = expr
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_op_btwn_s2b_conv():
-    # negative case, add operation between space_to_batch_nd-conv2d
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op_1_5 = relay.op.add(op1, relay.const(1.0))
-    op2 = relay.nn.conv2d(
-        op_1_5,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op2, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    expected_expr = expr
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_op_btwn_conv_b2s():
-    # negative case, add operation between conv2d-batch_to_space_nd
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op2 = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    op_2_5 = relay.op.add(op2, relay.const(1.0))
-    expr = relay.nn.batch_to_space_nd(op_2_5, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    expected_expr = expr
-
-    compare_expected_fac(expr, expected_expr, [x_np])
-
-
-def test_fac_relay_build():
-    #  Check the default optimize pipeline
-    shape_x = [1, 5, 5, 4]
-    shape_w = [3, 3, 4, 1]
-
-    x_np = np.random.randint(-128, 127, size=shape_x, dtype="int8").astype("float32")
-    w_np = np.random.randint(-128, 127, size=shape_w, dtype="int8").astype("float32")
-
-    weight = relay.const(w_np)
-    data = relay.var("data", shape=shape_x, dtype="float32")
-    op1 = relay.nn.space_to_batch_nd(data, block_shape=[2, 2], paddings=[[2, 3], [2, 3]])
-    op2 = relay.nn.conv2d(
-        op1,
-        weight,
-        padding=[0, 0, 0, 0],
-        groups=4,
-        channels=4,
-        kernel_size=[3, 3],
-        data_layout="NHWC",
-        kernel_layout="HWOI",
-    )
-    expr = relay.nn.batch_to_space_nd(op2, block_shape=[2, 2], crops=[[0, 1], [0, 1]])
-
-    mod_def = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(expr))
-    result_def = (
-        relay.create_executor("vm", mod=mod_def, device=tvm.cpu(), target="llvm")
-        .evaluate()(x_np)
-        .numpy()
-    )
-
-    graph, lib, params = relay.build(mod_def, "llvm", params=None)
-    rt_mod = graph_executor.create(graph, lib, device=tvm.cpu())
-    rt_mod.set_input("data", x_np)
-    rt_mod.set_input(**params)
-    rt_mod.run()
-    result_flat = rt_mod.get_output(0).numpy()
-
-    assert "space_to_batch_nd" not in graph
-    assert "conv2d" in graph
-    assert "batch_to_space_nd" not in graph
-
-    assert np.array_equal(result_def, result_flat)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_flexible_shape_dispatch.py b/tests/python/relay/test_pass_flexible_shape_dispatch.py
deleted file mode 100644
index 86ccb25db54c..000000000000
--- a/tests/python/relay/test_pass_flexible_shape_dispatch.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test flexible shape dispatch pass"""
-import numpy as np
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay.testing.resnet import get_workload
-from tvm.relay import vm
-from tvm import runtime
-
-
-def test_end_to_end():
-    # Load a resnet model.
-    mod, params = get_workload()
-    # Apply flexible dispatch pass.
-    mod = relay.transform.FlexibleShapeDispatch(axis=0, buckets=[1, 4], auto_pad=True)(mod)
-    # Compile and confirm result supports multiple shapes.
-    exe = relay.vm.compile(mod, "llvm", params=params)
-    vm = runtime.vm.VirtualMachine(exe, tvm.cpu())
-
-    # Evaluate various batch sizes
-    batch_1 = np.random.normal(size=[1, 3, 224, 224]).astype("float32")
-    assert list(vm.invoke("main", batch_1).shape) == [1, 1000]
-
-    batch_4 = np.random.normal(size=[4, 3, 224, 224]).astype("float32")
-    assert list(vm.invoke("main", batch_4).shape) == [4, 1000]
-
-    # Apply autopadding to an input.
-    batch_3 = np.random.normal(size=[3, 3, 224, 224]).astype("float32")
-    assert list(vm.invoke("main", batch_3).shape) == [3, 1000]
-
-
-def test_multiple_inputs():
-    # Create a small relay module with multiple inputs to dispatch over.
-    x = relay.var("x", shape=[10, 10], dtype="float32")
-    w = relay.var("w", shape=[10, 10], dtype="float32")
-    y = x + w
-    mod = tvm.IRModule.from_expr(y)
-
-    # Apply flexible dispatch to dim 1 for both inputs.
-    mod = relay.transform.FlexibleShapeDispatch(axis=1, buckets=[5, 10], input_indices=[0, 1])(mod)
-
-    # Compile and confirm that output shapes are correct.
-    exe = relay.vm.compile(mod, "llvm")
-    vm = runtime.vm.VirtualMachine(exe, tvm.cpu())
-
-    x_w_5 = np.random.normal(size=[10, 5]).astype("float32")
-    assert list(vm.invoke("main", x_w_5, x_w_5).shape) == [10, 5]
-
-    x_w_10 = np.random.normal(size=[10, 10]).astype("float32")
-    assert list(vm.invoke("main", x_w_10, x_w_10).shape) == [10, 10]
-
-
-def test_fixed_output():
-    # Test a graph where the output shape is not based on input dynamism.
-    x = relay.var("x", shape=[10, 10], dtype="float32")
-    w = relay.var("w", shape=[10, 10], dtype="float32")
-    y = relay.nn.dense(x, w)
-    mod = tvm.IRModule.from_expr(y)
-
-    # Apply flexible dispatch to dimension 1 for both inputs.
-    mod = relay.transform.FlexibleShapeDispatch(
-        axis=1, buckets=[5, 7], input_indices=[0, 1], affects_output=False
-    )(mod)
-
-    # Compile and confirm that output shapes are correct.
-    exe = relay.vm.compile(mod, "llvm")
-    vm = runtime.vm.VirtualMachine(exe, tvm.cpu())
-
-    x_w_5 = np.random.normal(size=[10, 5]).astype("float32")
-    assert list(vm.invoke("main", x_w_5, x_w_5).shape) == [10, 10]
-
-    x_w_7 = np.random.normal(size=[10, 7]).astype("float32")
-    assert list(vm.invoke("main", x_w_7, x_w_7).shape) == [10, 10]
-
-    return
-
-
-def test_multiple_outputs():
-    # Create a graph with multiple outputs and test that it works.
-    x = relay.var("x", shape=[10, 10], dtype="float32")
-    y = relay.split(x, 2, axis=1)
-    mod = tvm.IRModule.from_expr(y.astuple())
-
-    # Apply flexible dispatch to batch dimension.
-    mod = relay.transform.FlexibleShapeDispatch(axis=0, buckets=[5, 10])(mod)
-
-    # Compile and confirm that both outputs are correct.
-    exe = relay.vm.compile(mod, "llvm")
-    vm = runtime.vm.VirtualMachine(exe, tvm.cpu())
-
-    x_5 = np.random.normal(size=[5, 10]).astype("float32")
-    result_5 = vm.invoke("main", x_5)
-    assert list(result_5[0].shape) == [5, 5]
-    assert list(result_5[1].shape) == [5, 5]
-
-    x_10 = np.random.normal(size=[10, 10]).astype("float32")
-    result_10 = vm.invoke("main", x_10)
-    assert list(result_10[0].shape) == [10, 5]
-    assert list(result_10[1].shape) == [10, 5]
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
deleted file mode 100644
index 585ae5d7a21d..000000000000
--- a/tests/python/relay/test_pass_fold_constant.py
+++ /dev/null
@@ -1,576 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.backend import Executor
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.testing import run_infer_type, create_workload
-
-
-def annot_expr(e):
-    """Returns e wrapped with an on_device annotation."""
-    return relay.op.annotation.on_device(e, tvm.cpu(), constrain_result=True)
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-
-    mod = tvm.IRModule.from_expr(expr)
-    mod = relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_concatenate_const():
-    def before():
-        data = tvm.nd.array(np.array([1.0, 2.0, 3.0]))
-        const = relay.const(data)
-        concat = relay.op.concatenate([const, const], axis=0)
-        func = relay.Function([], concat)
-        return func
-
-    def expected():
-        data = tvm.nd.array(np.array([1.0, 2.0, 3.0, 1.0, 2.0, 3.0]))
-        const = relay.const(data)
-        func = relay.Function([], const)
-        return func
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_const():
-    c_data = np.array([1, 2, 3]).astype("float32")
-    t = relay.TensorType([1, 2, 3], "float32")
-
-    def before():
-        c = relay.const(c_data)
-        x = relay.var("x", t)
-        y = relay.add(c, c)
-        y = relay.multiply(y, relay.const(2, "float32"))
-        y = relay.add(x, y)
-        z = relay.add(y, c)
-        return relay.Function([x], z)
-
-    def expected():
-        x = relay.var("x", t)
-        c_folded = (c_data + c_data) * 2
-        y = relay.add(x, relay.const(c_folded))
-        z = relay.add(y, relay.const(c_data))
-        return relay.Function([x], z)
-
-    # the fold constant should work on any context.
-    with tvm.target.Target("cuda"):
-        zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_const_with_on_device():
-    """Make sure on_device annotations don't get in the way of constant folding"""
-    c_data = np.array([1, 2, 3]).astype("float32")
-    t = relay.TensorType([1, 2, 3], "float32")
-
-    def before():
-        c = relay.const(c_data)
-        x = relay.var("x", t)
-        x.virtual_device_ = tvm.cpu()
-        y = relay.add(c, c)
-        y = relay.multiply(y, relay.const(2, "float32"))
-        y = relay.add(x, y)
-        z = relay.add(y, c)
-        f = relay.Function([x], z)
-        f.virtual_device_ = tvm.cpu()
-        return f
-
-    def expected():
-        x = relay.var("x", t)
-        x.virtual_device_ = tvm.cpu()
-        c_folded = (c_data + c_data) * 2
-        y = relay.add(x, relay.const(c_folded))
-        z = relay.add(y, relay.const(c_data))
-        f = relay.Function([x], z)
-        f.virtual_device_ = tvm.cpu()
-        return f
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_let():
-    c_data = np.array(1).astype("float32")
-    t = relay.TensorType([1], "float32")
-
-    def before():
-        sb = relay.ScopeBuilder()
-        x = relay.var("x", t)
-        t1 = sb.let("t1", relay.const(c_data))
-        t2 = sb.let("t2", relay.add(t1, t1))
-        t3 = sb.let("t3", relay.add(t2, x))
-        sb.ret(t3)
-        return relay.Function([x], sb.get())
-
-    def expected():
-        sb = relay.ScopeBuilder()
-        x = relay.var("x", t)
-        c_folded = c_data + c_data
-        t3 = sb.let("t3", relay.add(relay.const(c_folded), x))
-        sb.ret(t3)
-        return relay.Function([x], sb.get())
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_let_with_on_device():
-    """Make sure on_device annotations don't get in the way of constant folding,
-    and inlined constants bring their annotations with them."""
-    c_data = np.array(1).astype("float32")
-    t = relay.TensorType([1], "float32")
-
-    def before():
-        sb = relay.ScopeBuilder()
-        x = relay.var("x", t)
-        x.virtual_device_ = tvm.cpu()
-        t1 = sb.let("t1", annot_expr(relay.const(c_data)))
-        t2 = sb.let("t2", annot_expr(relay.add(t1, t1)))
-        t3 = sb.let("t3", annot_expr(relay.add(t2, x)))
-        sb.ret(t3)
-        f = relay.Function([x], sb.get())
-        f.virtual_device_ = tvm.cpu()
-        return f
-
-    def expected():
-        sb = relay.ScopeBuilder()
-        x = relay.var("x", t)
-        x.virtual_device_ = tvm.cpu()
-        c_folded = c_data + c_data
-        t3 = sb.let("t3", annot_expr(relay.add(annot_expr(relay.const(c_folded)), x)))
-        sb.ret(t3)
-        f = relay.Function([x], sb.get())
-        f.virtual_device_ = tvm.cpu()
-        return f
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_tuple():
-    c_data = np.array(1).astype("float32")
-    t = relay.TensorType([1], "float32")
-
-    def before():
-        c = relay.const(c_data)
-        x = relay.var("x", t)
-        y = relay.Tuple([x, c])
-        z = relay.add(y[1], c)
-        z = relay.add(z, y[0])
-        return relay.Function([x], z)
-
-    def expected():
-        c = relay.const(c_data + c_data)
-        x = relay.var("x", t)
-        z = relay.add(c, x)
-        return relay.Function([x], z)
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_concat():
-    c_data = np.array([[1, 2, 3]]).astype("float32")
-
-    def before():
-        a = relay.const(c_data)
-        b = relay.const(c_data)
-        y = relay.concatenate((a, b), axis=0)
-        return relay.Function([], y)
-
-    def expected():
-        y_data = np.concatenate((c_data, c_data), axis=0)
-        y = relay.const(y_data)
-        return relay.Function([], y)
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_if():
-    cond_data = np.array(1).astype("bool")
-    x_data = np.array([[1, 2, 3]]).astype("float32")
-
-    def before():
-        a = relay.const(cond_data)
-        x = relay.const(x_data)
-        y = relay.const(x_data)
-        iff = relay.If(a, x + y, x - y)
-        return relay.Function([], iff)
-
-    def expected():
-        y_data = x_data + x_data
-        y = relay.const(y_data)
-        return relay.Function([], y)
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-    cond_data = np.array(0).astype("bool")
-
-    def before():
-        a = relay.const(cond_data)
-        x = relay.const(x_data)
-        y = relay.const(x_data)
-        iff = relay.If(a, x + y, x - y)
-        return relay.Function([], iff)
-
-    def expected():
-        y_data = x_data - x_data
-        y = relay.const(y_data)
-        return relay.Function([], y)
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_shape_of():
-    c_shape = (8, 9, 10)
-
-    def before(dtype):
-        x = relay.var("x", shape=c_shape, dtype="float32")
-        y = relay.var("y", shape=c_shape, dtype="float32")
-        z = relay.shape_of(x + y, dtype)
-        return relay.Function([x, y], z)
-
-    def expected(dtype):
-        x = relay.var("x", shape=c_shape, dtype="float32")
-        y = relay.var("y", shape=c_shape, dtype="float32")
-        z = relay.const(np.array(c_shape).astype(dtype), dtype=dtype)
-        func = relay.Function([x, y], z)
-        return func
-
-    for dtype in ["int32", "float32"]:
-        zz = run_opt_pass(before(dtype), transform.FoldConstant())
-        zexpected = run_opt_pass(expected(dtype), transform.InferType())
-        tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_ndarray_size():
-    c_shape = (8, 9, 10)
-
-    def before(dtype):
-        x = relay.var("x", shape=c_shape, dtype="float32")
-        y = relay.var("y", shape=c_shape, dtype="float32")
-        z = relay.ndarray_size(x + y, dtype)
-        return relay.Function([x, y], z)
-
-    def expected(dtype):
-        x = relay.var("x", shape=c_shape, dtype="float32")
-        y = relay.var("y", shape=c_shape, dtype="float32")
-        z = relay.const(np.size(np.zeros(c_shape)), dtype=dtype)
-        func = relay.Function([x, y], z)
-        mod = tvm.IRModule.from_expr(func)
-        return mod["main"]
-
-    for dtype in ["int32", "float32"]:
-        zz = run_opt_pass(before(dtype), transform.FoldConstant())
-        zexpected = run_opt_pass(expected(dtype), transform.InferType())
-        tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_fold_batch_norm():
-    def expected():
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.const(np.zeros((16, 3, 3, 3)))
-        bias = relay.const(np.zeros((16, 1, 1)))
-        conv = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-        add = relay.add(conv, bias)
-        return relay.Function(relay.analysis.free_vars(add), add)
-
-    remove_bn_pass = tvm.transform.Sequential(
-        [
-            relay.transform.InferType(),
-            relay.transform.SimplifyInference(),
-            relay.transform.FoldConstant(),
-            relay.transform.FoldScaleAxis(),
-        ]
-    )
-
-    data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-    weight = relay.var("weight")
-    bn_gamma = relay.var("bn_gamma")
-    bn_beta = relay.var("bn_beta")
-    bn_mmean = relay.var("bn_mean")
-    bn_mvar = relay.var("bn_var")
-
-    conv = relay.nn.conv2d(
-        data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-    )
-    bn_output = relay.nn.batch_norm(conv, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-
-    def initializer(_, param):
-        param = np.zeros(param.shape)
-
-    mod, params = create_workload(bn_output[0], initializer)
-    mod["main"] = bind_params_by_name(mod["main"], params)
-
-    with tvm.transform.PassContext(opt_level=3):
-        mod = remove_bn_pass(mod)
-
-    expect = run_infer_type(expected())
-    tvm.ir.assert_structural_equal(mod["main"], expect)
-
-
-def test_fold_dropout():
-    def before():
-        # A constant graph to fire fold constant
-        data = relay.const(np.arange(10).astype(np.float32))
-        dropout = relay.nn.dropout(data)
-        add = dropout + relay.const(1.0)
-        return relay.Function(relay.analysis.free_vars(add), add)
-
-    passes = tvm.transform.Sequential(
-        [
-            relay.transform.InferType(),
-            relay.transform.FoldConstant(),
-        ]
-    )
-
-    before_mod = tvm.IRModule.from_expr(before())
-
-    with tvm.transform.PassContext(opt_level=3):
-        after_mod = passes(before_mod)
-
-    tvm.ir.assert_structural_equal(run_infer_type(before_mod["main"]), after_mod["main"])
-
-
-def test_fold_qnn_const():
-    def before():
-        # QNN op with 2 constant arguments.
-        add = relay.qnn.op.add(
-            relay.const(np.ones((2, 3), dtype="uint8"), dtype="uint8"),
-            relay.const(np.ones((2, 3), dtype="uint8"), dtype="uint8"),
-            lhs_scale=relay.const(2.0),
-            lhs_zero_point=relay.const(0),
-            rhs_scale=relay.const(2.0),
-            rhs_zero_point=relay.const(0),
-            output_scale=relay.const(1.0),
-            output_zero_point=relay.const(0),
-        )
-        # QNN op with 1 constant and 1 non-constant arguments.
-        a = relay.var("a", shape=[2, 3], dtype="float32")
-        dense = relay.qnn.op.dense(
-            relay.qnn.op.quantize(a, relay.const(1.0), relay.const(0)),
-            add,
-            input_zero_point=relay.const(0),
-            kernel_zero_point=relay.const(0),
-            input_scale=relay.const(2.0),
-            kernel_scale=relay.const(2.0),
-            units=None,
-        )
-        # QNN op with 2 non-constant arguments.
-        b = relay.var("b", shape=[2], dtype="float32")
-        bias = relay.qnn.op.add(
-            dense,
-            relay.qnn.op.quantize(b, relay.const(1.0), relay.const(0), out_dtype="int32"),
-            lhs_scale=relay.const(2.0),
-            lhs_zero_point=relay.const(0),
-            rhs_scale=relay.const(2.0),
-            rhs_zero_point=relay.const(0),
-            output_scale=relay.const(1.0),
-            output_zero_point=relay.const(0),
-        )
-        return relay.Function([a, b], bias)
-
-    def expected():
-        a = relay.var("a", shape=[2, 3], dtype="float32")
-        dense = relay.qnn.op.dense(
-            relay.qnn.op.quantize(a, relay.const(1.0), relay.const(0)),
-            relay.const(np.array([[4, 4, 4], [4, 4, 4]], dtype="uint8"), dtype="uint8"),
-            input_zero_point=relay.const(0),
-            kernel_zero_point=relay.const(0),
-            input_scale=relay.const(2.0),
-            kernel_scale=relay.const(2.0),
-            units=None,
-        )
-        b = relay.var("b", shape=[2], dtype="float32")
-        bias = relay.qnn.op.add(
-            dense,
-            relay.qnn.op.quantize(b, relay.const(1.0), relay.const(0), out_dtype="int32"),
-            lhs_scale=relay.const(2.0),
-            lhs_zero_point=relay.const(0),
-            rhs_scale=relay.const(2.0),
-            rhs_zero_point=relay.const(0),
-            output_scale=relay.const(1.0),
-            output_zero_point=relay.const(0),
-        )
-        return relay.Function([a, b], bias)
-
-    # Nothing changed after applying FoldConstant
-    a = run_opt_pass(before(), transform.FoldConstant())
-    b = run_opt_pass(before(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Fold QNN constants
-    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_fold_quantize():
-    t = relay.TensorType([1, 2, 3], "int8")
-
-    def before():
-        data = tvm.nd.array(np.array([1.0, 2.0, 3.0], dtype="float32"))
-        const_fp = relay.const(data, dtype="float32")
-        const_i8 = relay.qnn.op.quantize(
-            const_fp, output_scale=relay.const(0.5), output_zero_point=relay.const(0)
-        )
-        x = relay.var("x", t)
-        sub = relay.op.subtract(x, const_i8)
-        func = relay.Function([x], sub)
-        return func
-
-    def expected():
-        data = tvm.nd.array(np.array([2, 4, 6], dtype="int8"))
-        const_i8 = relay.const(data, dtype="int8")
-        x = relay.var("x", t)
-        sub = relay.op.subtract(x, const_i8)
-        func = relay.Function([x], sub)
-        return func
-
-    # Nothing changed after applying FoldConstant
-    a = run_opt_pass(before(), transform.FoldConstant())
-    b = run_opt_pass(before(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Fold QNN constants
-    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_fold_qnn_conv2d_qnn_mul():
-    def before():
-        dtype = "uint8"
-        op0 = relay.qnn.op.conv2d(
-            relay.const(np.ones((1, 1, 2, 2), dtype=dtype), dtype=dtype),
-            relay.const(np.ones((1, 1, 2, 2), dtype=dtype), dtype=dtype),
-            input_zero_point=relay.const(0, "int32"),
-            kernel_zero_point=relay.const(0, "int32"),
-            input_scale=relay.const(1.0, "float32"),
-            kernel_scale=relay.const(1.0, "float32"),
-            kernel_size=(2, 2),
-            channels=1,
-        )
-        op = relay.qnn.op.mul(
-            op0,
-            relay.const(np.array([10], dtype="int32"), dtype="int32"),
-            relay.const(1.0, dtype="float32"),
-            relay.const(0, dtype="int32"),
-            relay.const(1.0, dtype="float32"),
-            relay.const(0, dtype="int32"),
-            relay.const(1.0, dtype="float32"),
-            relay.const(0, dtype="int32"),
-        )
-        func = relay.Function([], op)
-        return func
-
-    def expected():
-        data = relay.const(np.array([[[[40]]]], dtype="int32"), dtype="int32")
-        func = relay.Function([], data)
-        return func
-
-    # Nothing changed after applying FoldConstant
-    a = run_opt_pass(before(), transform.FoldConstant())
-    b = run_opt_pass(before(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Fold QNN constants
-    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_fold_requantize():
-    def before():
-        data = tvm.nd.array(np.array([1, 2, 3], dtype="int8"))
-        const_i8 = relay.const(data, dtype="int8")
-        op = relay.qnn.op.requantize(
-            const_i8,
-            input_scale=relay.const(2.0, dtype="float32"),
-            input_zero_point=relay.const(1, dtype="int32"),
-            output_scale=relay.const(1.0, dtype="float32"),
-            output_zero_point=relay.const(1, dtype="int32"),
-        )
-        x = relay.var("x", relay.TensorType([3], "int8"))
-        add = relay.op.add(op, x)
-        func = relay.Function([x], add)
-        return func
-
-    def expected():
-        data = tvm.nd.array(np.array([1, 3, 5], dtype="int8"))
-        const_i8 = relay.const(data, dtype="int8")
-        x = relay.var("x", relay.TensorType([3], "int8"))
-        add = relay.op.add(const_i8, x)
-        func = relay.Function([x], add)
-        return func
-
-    # Nothing changed after applying FoldConstant
-    a = run_opt_pass(before(), transform.FoldConstant())
-    b = run_opt_pass(before(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Fold QNN constants
-    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_pass_link_params():
-    """
-    This test checks ensures that proper executor is passed to interpreter instance
-    The test will fail if FoldConstant does not override the executor due to "int8"
-    is not supported in ScheduleBuilder
-    """
-
-    def expr():
-        z = relay.const(10, dtype="int8")
-        return relay.cast(z, dtype="int32")
-
-    mod = tvm.IRModule.from_expr(expr())
-    mod = tvm.relay.transform.InferType()(mod)
-    # Add executor with link-params
-    mod = mod.with_attr("executor", Executor("aot", {"link-params": True}))
-    mod = tvm.relay.transform.FoldConstant()(mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_fold_explicit_padding.py b/tests/python/relay/test_pass_fold_explicit_padding.py
deleted file mode 100644
index f2bd360fc667..000000000000
--- a/tests/python/relay/test_pass_fold_explicit_padding.py
+++ /dev/null
@@ -1,346 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_opt_pass
-
-import numpy as np
-
-
-def test_simplify_conv_pad():
-    convs = [relay.nn.conv1d, relay.nn.conv2d, relay.nn.conv3d]
-
-    def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout, no_fold=False):
-        if layout[1] == "C":
-            shape = [1, 3] + [10] * ndim
-            wshape = [8, 3] + [3] * ndim
-        elif layout[-1] == "C":
-            shape = [1] + [10] * ndim + [3]
-            wshape = [8] + [3] * ndim + [3]
-        else:
-            raise ValueError("This test only supports NC* and N*C")
-
-        x = relay.var("x", shape=shape, dtype="float32")
-        w = relay.var("w", shape=wshape, dtype="float32")
-        pad = relay.nn.pad(x, pad_width, pad_value, pad_mode)
-        if layout[1] == "C":
-            conv = convs[ndim - 1](pad, w, padding=orig_padding)
-        else:
-            conv = convs[ndim - 1](
-                pad, w, padding=orig_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
-            )
-
-        if pad_mode == "constant" and pad_value == 0:
-            new_padding = []
-            for j in range(2):
-                for i in range(len(pad_width)):
-                    if layout[i] in ["D", "H", "W"]:
-                        new_padding.append(pad_width[i][j])
-            for i in range(len(new_padding)):
-                new_padding[i] += orig_padding[i]
-            if layout[1] == "C":
-                after = convs[ndim - 1](x, w, padding=new_padding)
-            else:
-                after = convs[ndim - 1](
-                    x, w, padding=new_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
-                )
-        else:
-            after = conv
-
-        zz = run_opt_pass(conv, transform.FoldExplicitPadding())
-        expected = run_opt_pass(after, transform.InferType())
-        tvm.ir.assert_structural_equal(zz, expected)
-
-        mod1 = tvm.IRModule.from_expr(conv)
-        mod2 = tvm.IRModule.from_expr(zz)
-
-        if not no_fold:
-            op_freqs = relay.analysis.list_op_freqs(mod2)
-            assert "nn.pad" not in op_freqs
-
-        with tvm.transform.PassContext():
-            func1 = relay.create_executor(
-                "vm", mod=mod1, device=tvm.cpu(), target="llvm"
-            ).evaluate()
-        func2 = relay.create_executor("vm", mod=mod2, device=tvm.cpu(), target="llvm").evaluate()
-        x_np = np.random.rand(*shape).astype("float32")
-        w_np = np.random.rand(*wshape).astype("float32")
-
-        result1 = func1(x_np, w_np)
-        result2 = func2(x_np, w_np)
-
-        tvm.testing.assert_allclose(result1.numpy(), result2.numpy(), rtol=1e-5, atol=1e-5)
-
-    # Test fold cases
-    for orig_pad in [[0, 0], [2, 0], [0, 2]]:
-        for i_pad in [[0, 0], [1, 1], [1, 0]]:
-            for ndim in [1, 2, 3]:
-                for channels_last in [0, 1]:
-                    if channels_last:
-                        layout = "NDHWC"
-                        layout = layout[0:1] + layout[4 - ndim : 4] + layout[-1:]
-                        padding = [[0, 0]] + [i_pad] * ndim + [[0, 0]]
-                    else:
-                        layout = "NCDHW"
-                        layout = layout[0:2] + layout[5 - ndim :]
-                        padding = [[0, 0]] * 2 + [i_pad] * ndim
-
-                    validate(ndim, padding, 0, "constant", orig_pad * ndim, layout)
-
-    # Test no fold cases
-    ndim = 2
-    # Conv only folds when pad_value=0
-    validate(
-        ndim, [[0, 0]] * 2 + [i_pad] * ndim, 1, "constant", orig_pad * ndim, "NCHW", no_fold=True
-    )
-    # Conv only folds when pad's pad_mode="constant"
-    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 0, "edge", orig_pad * ndim, "NCHW", no_fold=True)
-
-
-def get_min_value(dtype):
-    if np.issubdtype(dtype, np.floating):
-        return np.finfo(dtype).min
-    elif np.issubdtype(dtype, np.integer):
-        return np.iinfo(dtype).min
-    else:
-        raise ValueError("Cannot get min value for dtypes that are not integer or not floating")
-
-
-def test_simplify_pool_pad():
-    max_pools = [relay.nn.max_pool1d, relay.nn.max_pool2d, relay.nn.max_pool3d]
-    avg_pools = [relay.nn.avg_pool1d, relay.nn.avg_pool2d, relay.nn.avg_pool3d]
-
-    def validate(
-        pools,
-        ndim,
-        pad_width,
-        pad_value,
-        orig_padding,
-        layout,
-        pool_size,
-        pad_mode="constant",
-        dtype="float32",
-        no_fold=False,
-        **kwargs,
-    ):
-        pad_value_const = relay.const(pad_value, dtype=dtype)
-
-        if layout[1] == "C":
-            shape = [1, 3] + [10] * ndim
-        elif layout[-1] == "C":
-            shape = [1] + [10] * ndim + [3]
-        else:
-            raise ValueError("This test only supports NC* and N*C")
-
-        x = relay.var("x", shape=shape, dtype=dtype)
-        pad = relay.nn.pad(x, pad_width, pad_value_const, pad_mode)
-        if layout[1] == "C":
-            pool = pools[ndim - 1](pad, padding=orig_padding, pool_size=pool_size, **kwargs)
-        else:
-            pool = pools[ndim - 1](
-                pad, padding=orig_padding, layout=layout, pool_size=pool_size, **kwargs
-            )
-
-        if pools == max_pools:
-            foldable_pad_value = get_min_value(dtype)
-        else:
-            foldable_pad_value = 0
-
-        if pad_mode == "constant" and pad_value == foldable_pad_value:
-            new_padding = []
-            for j in range(2):
-                for i in range(len(pad_width)):
-                    if layout[i] in ["D", "H", "W"]:
-                        new_padding.append(pad_width[i][j])
-            for i in range(len(new_padding)):
-                new_padding[i] += orig_padding[i]
-
-            if pools == avg_pools and all(v == 0 for v in orig_padding):
-                # If the orig padding for AvgPool is all zero and the pad op to fold
-                # has non-zero pad width, the resultant folded AvgPool will have
-                # count_include_pad=True so AvgPool's divisor is agnostic of pad boundaries
-                kwargs["count_include_pad"] = True
-            if layout[1] == "C":
-                after = pools[ndim - 1](x, padding=new_padding, pool_size=pool_size, **kwargs)
-            else:
-                after = pools[ndim - 1](
-                    x, padding=new_padding, layout=layout, pool_size=pool_size, **kwargs
-                )
-        else:
-            after = pool
-
-        zz = run_opt_pass(pool, transform.FoldExplicitPadding())
-        expected = run_opt_pass(after, transform.InferType())
-
-        tvm.ir.assert_structural_equal(zz, expected)
-
-        mod1 = tvm.IRModule.from_expr(pool)
-        mod2 = tvm.IRModule.from_expr(zz)
-
-        if not no_fold:
-            op_freqs = relay.analysis.list_op_freqs(mod2)
-            assert "nn.pad" not in op_freqs
-
-        with tvm.transform.PassContext():
-            func1 = relay.create_executor(
-                "vm", mod=mod1, device=tvm.cpu(), target="llvm"
-            ).evaluate()
-
-        func2 = relay.create_executor("vm", mod=mod2, device=tvm.cpu(), target="llvm").evaluate()
-        x_np = np.random.rand(*shape).astype(dtype)
-
-        result1 = func1(x_np)
-        result2 = func2(x_np)
-
-        tvm.testing.assert_allclose(result1.numpy(), result2.numpy(), rtol=1e-5, atol=1e-5)
-
-    # Test fold cases
-    float_min_val = get_min_value("float32")
-    for orig_pad in [[0, 0], [2, 0]]:
-        for i_pad in [[1, 1], [1, 0]]:
-            for ndim in [1, 2, 3]:
-                for channels_last in [0, 1]:
-                    if channels_last:
-                        layout = "NDHWC"
-                        layout = layout[0:1] + layout[4 - ndim : 4] + layout[-1:]
-                        padding = [[0, 0]] + [i_pad] * ndim + [[0, 0]]
-                    else:
-                        layout = "NCDHW"
-                        layout = layout[0:2] + layout[5 - ndim :]
-                        padding = [[0, 0]] * 2 + [i_pad] * ndim
-
-                    validate(max_pools, ndim, padding, float_min_val, orig_pad * ndim, layout, 2)
-
-    # Check Pool pad folding when pad width on pad op is all zero.
-    validate(max_pools, 1, [[0, 0], [0, 0], [0, 0]], float_min_val, [2, 0], "NCW", 2)
-    # Check MaxPool pad folding with uint dtype
-    int_min_val = get_min_value("uint8")
-    validate(
-        max_pools,
-        2,
-        [[0, 0], [0, 0], [0, 2], [2, 0]],
-        int_min_val,
-        [2, 0, 0, 0],
-        "NCHW",
-        2,
-        dtype="uint8",
-    )
-    # Fold when original AvgPool has its own padding but count_include_pad=True
-    validate(
-        avg_pools,
-        2,
-        [[0, 0], [0, 0], [0, 2], [2, 0]],
-        0,
-        [0, 0, 1, 0],
-        "NCHW",
-        2,
-        count_include_pad=True,
-    )
-    # Fold when count_include_pad=False but original AvgPool has no orig padding
-    validate(avg_pools, 2, [[0, 0], [0, 0], [0, 2], [2, 0]], 0, [0, 0, 0, 0], "NCHW", 2)
-
-    # Test no fold cases
-    # AvgPool only folds pad when count_include_pad (False by default) is True
-    validate(
-        avg_pools, 2, [[0, 0], [0, 0], [0, 2], [2, 0]], 0, [0, 0, 0, 0], "NCHW", 2, no_fold=True
-    )
-    # MaxPool only folds pad when pad_value is the min for its dtype
-    validate(max_pools, 1, [[0, 0], [0, 0], [0, 2]], 0, [0, 0], "NCHW", 2, no_fold=True)
-    # AvgPool only folds pad when pad_value=0
-    validate(avg_pools, 1, [[0, 0], [0, 0], [0, 2]], 1, [0, 0], "NCHW", 2, no_fold=True)
-    # Pools only fold when pad_mode="constant"
-    validate(
-        avg_pools, 1, [[0, 0], [0, 0], [0, 2]], 0, [0, 0], "NCHW", 2, pad_mode="edge", no_fold=True
-    )
-
-
-def test_fold_pad_qconv2d():
-    def before():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        input_zero_point = 10
-        pad = relay.nn.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], pad_value=input_zero_point)
-        return relay.qnn.op.conv2d(
-            pad,
-            weight,
-            relay.const(input_zero_point, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(0, 0),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-
-    def expected():
-        x = relay.var("x", shape=(1, 56, 56, 64), dtype="int8")
-        weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
-        input_zero_point = 10
-        return relay.qnn.op.conv2d(
-            x,
-            weight,
-            relay.const(input_zero_point, "int32"),
-            relay.const(1, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-
-    a = run_opt_pass(before(), relay.transform.FoldExplicitPadding())
-    b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b, map_free_vars=True)
-
-
-def test_pad_qconv2d_no_fold():
-    def get_expr():
-        x = relay.var("x", shape=(1, 1, 2, 2), dtype="int8")
-        weight = relay.var("weight", shape=(1, 1, 2, 2), dtype="int8")
-        # Pad value and input zp are not equal
-        pad_value = 1
-        input_zero_point = 0
-        pad = relay.nn.pad(x, [[0, 0], [0, 0], [1, 1], [1, 1]], pad_value=pad_value)
-        return relay.qnn.op.conv2d(
-            pad,
-            weight,
-            relay.const(input_zero_point, "int32"),
-            relay.const(0, "int32"),
-            relay.const(1, "float32"),
-            relay.const(1, "float32"),
-            channels=1,
-            kernel_size=(2, 2),
-            padding=(0, 0),
-        )
-
-    a = run_opt_pass(get_expr(), relay.transform.FoldExplicitPadding())
-    b = run_opt_pass(get_expr(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b, map_free_vars=True)
-
-
-if __name__ == "__main__":
-    test_simplify_conv_pad()
-    test_simplify_pool_pad()
-    test_fold_pad_qconv2d()
-    test_pad_qconv2d_no_fold()
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
deleted file mode 100644
index bf8dcc0d9c47..000000000000
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ /dev/null
@@ -1,1274 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import create_workload
-from tvm.relay.build_module import bind_params_by_name
-
-
-def initializer(_, param):
-    param = np.zeros(param.shape)
-
-
-def _get_positive_scale(size):
-    return np.random.uniform(0.5, 1, size=size).astype("float32")
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_fold_fwd_simple():
-    """Simple testcase."""
-
-    def before(x, conv_weight, in_bias, in_scale, channels, blocking):
-        args = [x, conv_weight, in_bias]
-        x = relay.multiply(x, in_scale)
-        x = relay.nn.relu(x)
-        x = relay.add(x, in_bias)
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW2i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, in_bias, in_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, in_bias]
-        if blocking:
-            squeezed_scale = relay.squeeze(in_scale, axis=[0, 2, 3])
-            x = relay.nn.relu(x)
-            in_bias = relay.divide(
-                in_bias,
-                relay.reshape(squeezed_scale, (1, in_channels // blocking[0], 1, 1, blocking[0])),
-            )  # NCHWc
-            x = relay.add(x, in_bias)
-            conv_weight = relay.multiply(
-                conv_weight, relay.reshape(squeezed_scale, (1, in_channels // 2, 1, 1, 2, 1))
-            )  # OIHWio
-        else:
-            squeezed_scale = relay.squeeze(in_scale, axis=[1, 2])
-            x = relay.nn.relu(x)
-            in_bias = relay.divide(
-                in_bias, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2)
-            )
-            x = relay.add(x, in_bias)
-            conv_weight = relay.multiply(
-                conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2)
-            )
-
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW2i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        return relay.Function(args, y)
-
-    def check(shape, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            in_channels = shape[1] * shape[4]
-            in_bias = relay.var("in_bias", shape=(1, in_channels // blocking[0], 1, 1, blocking[0]))
-            in_scale = relay.const(
-                _get_positive_scale((1, in_channels // blocking[0], 1, 1, blocking[0]))
-            )
-        else:
-            in_channels = shape[1]
-            in_bias = relay.var("in_bias", shape=(in_channels, 1, 1))
-            in_scale = relay.const(_get_positive_scale((in_channels, 1, 1)))
-        y1 = before(x, weight, in_bias, in_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        y1_expected = expected(x, weight, in_bias, in_scale, in_channels, channels, blocking)
-
-        y1_folded = run_opt_pass(y1_folded, transform.InferType())
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 2, None)
-    check((2, 2, 10, 10, 2), 8, (2, 4))
-
-
-def test_fold_fwd_dual_path():
-    """scale axis being consumed by two consumers"""
-
-    def before(x, conv_weight, in_bias, in_scale, channels, blocking):
-        args = [x, conv_weight, in_bias]
-        x = relay.multiply(in_scale, x)
-        x = relay.nn.relu(x)
-        x = relay.subtract(x, in_bias)
-        y1 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC{}c".format(blocking[0]) if blocking else "NHWC",
-            kernel_layout="HWIO1i{}o".format(blocking[1]) if blocking else "HWIO",
-            groups=channels,
-            padding=(1, 1),
-        )
-        y2 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC{}c".format(blocking[0]) if blocking else "NHWC",
-            kernel_layout="HWIO1i{}o".format(blocking[1]) if blocking else "HWIO",
-            groups=channels,
-            padding=(1, 1),
-        )
-        z = relay.add(y1, y2)
-        return relay.Function(args, z)
-
-    def expected(x, conv_weight, in_bias, in_scale, channels, blocking):
-        args = [x, conv_weight, in_bias]
-        x = relay.nn.relu(x)
-        if blocking:
-            _in_scale = relay.reshape(
-                in_scale, (1, 1, 1, channels // blocking[0], blocking[0])
-            )  # NHWCc
-        else:
-            _in_scale = in_scale
-        in_bias = relay.divide(in_bias, _in_scale)
-        x = relay.subtract(x, in_bias)
-        if blocking:
-            _in_scale = relay.reshape(
-                in_scale, (1, 1, 1, channels // blocking[0], 1, blocking[0])
-            )  # HWIOio
-        y1 = relay.nn.conv2d(
-            x,
-            relay.multiply(conv_weight, _in_scale),
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC{}c".format(blocking[0]) if blocking else "NHWC",
-            kernel_layout="HWIO1i{}o".format(blocking[1]) if blocking else "HWIO",
-            groups=channels,
-            padding=(1, 1),
-        )
-        if blocking:
-            _in_scale = relay.reshape(
-                in_scale, (1, 1, 1, channels // blocking[0], 1, blocking[0])
-            )  # HWIOio
-        y2 = relay.nn.conv2d(
-            x,
-            relay.multiply(conv_weight, _in_scale),
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC{}c".format(blocking[0]) if blocking else "NHWC",
-            kernel_layout="HWIO1i{}o".format(blocking[1]) if blocking else "HWIO",
-            groups=channels,
-            padding=(1, 1),
-        )
-        z = relay.add(y1, y2)
-        return relay.Function(args, z)
-
-    def check(dshape, channels, blocking):
-        x = relay.var("x", shape=dshape)
-        if blocking:
-            in_channels = dshape[3] * dshape[4]
-            wshape = (3, 3, 1, channels // blocking[1], 1, blocking[1])  # HWIOio
-            weight = relay.var("weight", shape=wshape)
-            in_bias = relay.var("in_bias", shape=(in_channels // blocking[0], blocking[0]))
-            in_scale = relay.const(_get_positive_scale((in_channels // blocking[0], blocking[0])))
-        else:
-            in_channels = dshape[-1]
-            wshape = (3, 3, 1, channels)  # HWIO
-            weight = relay.var("weight", shape=wshape)
-            in_bias = relay.var("in_bias", shape=(in_channels,))
-            in_scale = relay.const(
-                _get_positive_scale(
-                    in_channels,
-                )
-            )
-
-        # test depthwise
-        assert in_channels == channels
-
-        y1 = before(x, weight, in_bias, in_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_expected = expected(x, weight, in_bias, in_scale, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 3), 3, None)
-    check((2, 4, 10, 2, 2), 4, (2, 2))
-
-
-def test_fold_fwd_fail():
-    """testcase where we canont fold"""
-
-    def before(x, conv_weight, in_bias, in_scale, channels, blocking):
-        x = relay.multiply(x, in_scale)
-        xx = relay.nn.leaky_relu(x, alpha=0.1)
-        y1 = relay.nn.conv2d(
-            xx,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC{}c".format(blocking[0]) if blocking else "NHWC",
-            kernel_layout="HWIO1i{}o".format(blocking[1]) if blocking else "HWIO",
-            padding=(1, 1),
-        )
-        z = relay.add(y1, x)
-        return relay.Function(relay.analysis.free_vars(z), z)
-
-    def check(shape, channels, blocking):
-        x = relay.var("x", shape=shape)
-        if blocking:
-            in_channels = shape[3] * shape[4]
-            in_bias = relay.var("in_bias", shape=(in_channels // blocking[0], blocking[0]))
-            in_scale = relay.const(_get_positive_scale((in_channels // blocking[0], blocking[0])))
-        else:
-            in_channels = shape[-1]
-            in_bias = relay.var("in_bias", shape=(in_channels,))
-            in_scale = relay.const(_get_positive_scale(size=(in_channels,)))
-        # test depthwise
-        assert in_channels == channels
-        weight = relay.var("weight")
-        y1 = before(x, weight, in_bias, in_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        tvm.ir.assert_structural_equal(y1, y1_folded)
-
-    check((2, 11, 10, 4), 4, None)
-    check((2, 11, 10, 2, 2), 4, (2, 2))
-
-
-def test_fold_fwd_relu_fail():
-    """testcase where we canont fold because scale can not pass relu"""
-
-    def before(x, conv_weight, in_bias, in_scale, channels, blocking):
-        x = relay.multiply(x, in_scale)
-        xx = relay.nn.relu(x)
-        y1 = relay.nn.conv2d(
-            xx,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC{}c".format(blocking[0]) if blocking else "NHWC",
-            kernel_layout="HWIO1i{}o".format(blocking[1]) if blocking else "HWIO",
-            padding=(1, 1),
-        )
-        z = relay.add(y1, x)
-        return relay.Function(relay.analysis.free_vars(z), z)
-
-    def check(shape, channels, blocking, in_scale):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            in_channels = shape[3] * shape[4]
-            in_bias = relay.var("in_bias", shape=(1, in_channels // blocking[0], 1, 1, blocking[0]))
-        else:
-            in_channels = shape[-1]
-            in_bias = relay.var("in_bias", shape=(in_channels,))
-
-        assert in_channels == channels
-        y1 = before(x, weight, in_bias, in_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        tvm.ir.assert_structural_equal(y1, y1_folded)
-
-    in_scale = relay.var("in_scale", shape=(4,))
-    check((2, 11, 10, 4), 4, None, in_scale)
-    in_scale = relay.const(-_get_positive_scale((4,)))
-    check((2, 11, 10, 4), 4, None, in_scale)
-
-    in_scale = relay.var("in_scale", shape=(1, 1, 1, 2, 2))
-    check((2, 11, 10, 2, 2), 4, (2, 2), in_scale)
-    in_scale = relay.const(-_get_positive_scale((1, 1, 1, 2, 2)))
-    check((2, 11, 10, 2, 2), 4, (2, 2), in_scale)
-
-
-def test_fold_fwd_let_fail():
-    """testcase where we canont fold"""
-
-    def before(x, conv_weight, in_bias, in_scale, channels):
-        args = [x, conv_weight, in_bias]
-        x = relay.multiply(x, in_scale)
-        x = relay.nn.relu(x)
-        x = relay.add(x, in_bias)
-        x_var = relay.Var("x_var")
-        y1 = relay.nn.conv2d(
-            x_var,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            padding=(1, 1),
-        )
-        z = relay.add(y1, x)
-        let = relay.Let(x_var, x, z)
-        return relay.Function(args, let)
-
-    def check(shape, channels):
-        x = relay.var("x", shape=shape)
-        in_channels = shape[-1]
-        in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.const(_get_positive_scale(size=(in_channels,)))
-        # test depthwise
-        assert in_channels == channels
-        weight = relay.var("weight")
-        y1 = before(x, weight, in_bias, in_scale, channels)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        tvm.ir.assert_structural_equal(y1, y1_folded)
-
-    check((2, 11, 10, 4), 4)
-
-
-def test_fold_fwd_negative_scale():
-    """Testcase of folding negative scale"""
-
-    def before(x, conv_weight, in_scale, channels, blocking):
-        args = [x, conv_weight]
-        x = relay.multiply(x, in_scale)
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW4i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, in_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight]
-        if blocking:
-            squeezed_scale = relay.squeeze(in_scale, axis=[0, 2, 3])
-            conv_weight = relay.multiply(
-                conv_weight, relay.reshape(squeezed_scale, (1, in_channels // 4, 1, 1, 4, 1))
-            )
-            # blocking by "i" in OIHWio
-        else:
-            squeezed_scale = relay.squeeze(in_scale, axis=[1, 2])
-            conv_weight = relay.multiply(
-                conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2)
-            )
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW4i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        return relay.Function(args, y)
-
-    def check(shape, channels, blocking):
-        x = relay.var("x", shape=shape)
-        if blocking:
-            in_channels = shape[1] * shape[4]
-            in_scale = relay.const(-_get_positive_scale((1, shape[1], 1, 1, shape[4])))
-        else:
-            in_channels = shape[1]
-            in_scale = relay.const(-_get_positive_scale((in_channels, 1, 1)))
-        weight = relay.var("weight")
-        y1 = before(x, weight, in_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        y1_expected = expected(x, weight, in_scale, in_channels, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 4, None)
-    check((2, 2, 10, 10, 2), 8, (2, 2))
-
-
-def test_fold_fwd_dense():
-    """dense testcase."""
-
-    def before(x, weight, in_bias, in_scale):
-        args = [x, weight, in_bias]
-        x = relay.multiply(x, in_scale)
-        x = relay.nn.relu(x)
-        x = relay.add(x, in_bias)
-        y = relay.nn.dense(x, weight)
-        return relay.Function(args, y)
-
-    def expected(x, weight, in_bias, in_scale):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, weight, in_bias]
-        x = relay.nn.relu(x)
-        in_bias = relay.divide(in_bias, in_scale)
-        x = relay.add(x, in_bias)
-        weight = relay.multiply(weight, in_scale)
-        y = relay.nn.dense(x, weight)
-        return relay.Function(args, y)
-
-    def check(data_shape, weight_shape):
-        x = relay.var("x", shape=data_shape)
-        weight = relay.var("weight", shape=weight_shape)
-        in_channels = data_shape[1]
-        in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.const(_get_positive_scale((in_channels,)))
-        y1 = before(x, weight, in_bias, in_scale)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        y1_expected = expected(x, weight, in_bias, in_scale)
-
-        y1_folded = run_opt_pass(y1_folded, transform.InferType())
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4), (3, 4))
-    check((3, 5), (4, 5))
-
-
-def test_fold_bwd_simple():
-    """Simple testcase."""
-
-    def before(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        args = [x, conv_weight, out_bias]
-        if blocking:
-            out_bias = relay.reshape(out_bias, (1, channels // blocking[1], 1, 1, blocking[1]))
-        else:
-            out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y = relay.add(y, out_bias)
-        y = relay.nn.relu(y)
-        if blocking:
-            out_scale = relay.reshape(out_scale, (1, channels // blocking[1], 1, 1, blocking[1]))
-        y = relay.multiply(y, out_scale)
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias]
-        if blocking:
-            out_bias = relay.reshape(out_bias, (1, channels // blocking[1], 1, 1, blocking[1]))
-            out_scale = relay.reshape(out_scale, (1, channels // blocking[1], 1, 1, blocking[1]))
-            squeezed_scale = relay.squeeze(out_scale, axis=[0, 2, 3])
-            conv_weight = relay.multiply(
-                conv_weight,
-                relay.reshape(squeezed_scale, (channels // blocking[1], 1, 1, 1, 1, blocking[1])),
-            )
-        else:
-            out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
-            squeezed_scale = relay.squeeze(out_scale, axis=[1, 2])
-            conv_weight = relay.multiply(
-                conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-            )
-
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        if blocking:
-            out_bias = relay.multiply(
-                out_bias,
-                relay.reshape(squeezed_scale, (1, channels // blocking[1], 1, 1, blocking[1])),
-            )
-        else:
-            out_bias = relay.multiply(
-                out_bias, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2)
-            )
-        y = relay.add(y, out_bias)
-        y = relay.nn.relu(y)
-        return relay.Function(args, y)
-
-    def check(shape, in_channels, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        out_bias = relay.var("out_bias", shape=(channels,))
-        if blocking:
-            out_scale = relay.const(_get_positive_scale((channels,)))
-        else:
-            out_scale = relay.const(_get_positive_scale((channels, 1, 1)))
-        y1 = before(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 4, 8, None)
-    check((2, 2, 10, 10, 16), 32, 64, (16, 16))
-
-
-def test_fold_bwd_dual_path():
-    """Dual path testcase."""
-
-    def before(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        args = [x, conv_weight, out_bias]
-        y1 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y2 = relay.nn.relu(y2)
-        y = relay.add(y1, y2)
-        y = relay.multiply(y, out_scale)
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias]
-        if not blocking:
-            out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
-        squeezed_scale = relay.squeeze(out_scale, axis=[1, 2])
-
-        def fold_conv_weight():
-            if blocking:
-                return relay.multiply(
-                    conv_weight,
-                    relay.reshape(
-                        squeezed_scale, (channels // blocking[1], 1, 1, 1, 1, blocking[1])
-                    ),
-                )
-            else:
-                return relay.multiply(
-                    conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-                )
-
-        y1 = relay.nn.conv2d(
-            x,
-            fold_conv_weight(),
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.conv2d(
-            x,
-            fold_conv_weight(),
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y2 = relay.nn.relu(y2)
-        y = relay.add(y1, y2)
-        return relay.Function(args, y)
-
-    def check(shape, in_channels, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            out_bias = relay.var("out_bias", shape=(channels // blocking[1], 1, 1, blocking[1]))
-            out_scale = relay.const(
-                _get_positive_scale((channels // blocking[1], 1, 1, blocking[1]))
-            )
-        else:
-            out_bias = relay.var("out_bias", shape=(channels,))
-            out_scale = relay.const(_get_positive_scale((channels, 1, 1)))
-
-        y1 = before(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 4, 8, None)
-    check((2, 2, 10, 10, 2), 4, 8, (2, 2))
-
-
-def test_fold_bwd_simple_constant():
-    def before(data, weight, out_bias, channels):
-        y = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-
-        y = relay.add(y, out_bias)
-        c2 = relay.const(2.0)
-        y = relay.nn.relu(y)
-        y = relay.multiply(y, c2)
-        mod, params = create_workload(y, initializer)
-        mod["main"] = bind_params_by_name(mod["main"], params)
-        return mod
-
-    def expected(data, weight, out_bias, channels):
-        y0 = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-        y0 = relay.add(y0, out_bias)
-        y0 = relay.nn.relu(y0)
-        mod, params = create_workload(y0, initializer)
-        mod["main"] = bind_params_by_name(mod["main"], params)
-        return mod
-
-    def check(shape, channels):
-        x = relay.var("data", relay.TensorType(shape, "float32"))
-        weight = relay.var("weight")
-        out_bias = relay.var("in_bias", shape=(channels, 1, 1))
-
-        y0 = before(x, weight, out_bias, channels)
-        remove_last_multiply = tvm.transform.Sequential(
-            [
-                relay.transform.InferType(),
-                relay.transform.FoldScaleAxis(),
-            ]
-        )
-        with tvm.transform.PassContext(opt_level=3):
-            y0 = remove_last_multiply(y0)
-        _expect = expected(x, weight, out_bias, channels)
-        tvm.ir.assert_structural_equal(y0, _expect)
-
-    check((1, 3, 200, 200), 16)
-
-
-def test_fold_bwd_dual_consumer():
-    def before(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        args = [x, conv_weight, out_bias]
-        y0 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y0 = relay.multiply(y0, out_scale)
-        y0 = relay.nn.relu(y0)
-
-        y1 = relay.nn.conv2d(
-            y0,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y1 = relay.multiply(y1, out_scale)
-        y1 = relay.nn.relu(y1)
-
-        y2 = relay.nn.conv2d(
-            y0,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y2 = relay.multiply(y2, out_scale)
-        y2 = relay.nn.relu(y2)
-
-        y = relay.add(y1, y2)
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias]
-
-        def fold_conv_weight():
-            squeezed_scale = relay.squeeze(out_scale, axis=[1, 2])
-            if blocking:
-                return relay.multiply(
-                    conv_weight,
-                    relay.reshape(
-                        squeezed_scale, (channels // blocking[1], 1, 1, 1, 1, blocking[1])
-                    ),
-                )
-            else:
-                return relay.multiply(
-                    conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-                )
-
-        y0 = relay.nn.conv2d(
-            x,
-            fold_conv_weight(),
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y0 = relay.nn.relu(y0)
-        y1 = relay.nn.conv2d(
-            y0,
-            fold_conv_weight(),
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.conv2d(
-            y0,
-            fold_conv_weight(),
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y2 = relay.nn.relu(y2)
-        y = relay.add(y1, y2)
-        return relay.Function(args, y)
-
-    def check(shape, in_channels, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            out_bias = relay.var("out_bias", shape=(channels // blocking[1], 1, 1, blocking[1]))
-            out_scale = relay.const(
-                _get_positive_scale((channels // blocking[1], 1, 1, blocking[1]))
-            )
-        else:
-            out_bias = relay.var("out_bias", shape=(channels,))
-            out_scale = relay.const(_get_positive_scale((channels, 1, 1)))
-
-        y1 = before(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 4, 4, None)
-    check((2, 2, 10, 10, 2), 4, 4, (2, 2))
-
-
-def test_fold_bwd_fail():
-    """Dual path testcase."""
-
-    def fail1(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        args = [x, conv_weight, out_bias]
-        y1 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y1 = relay.nn.relu(y1)
-        y2 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-            out_layout="CNHW{}c".format(blocking[1]) if blocking else "CNHW",
-        )
-        # fold will fail because the axis from two path
-        # differs from each other.
-        y2 = relay.nn.relu(y2)
-        y = relay.add(y1, y2)
-        y = relay.multiply(y, out_scale)
-        return relay.Function(args, y)
-
-    def fail2(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        args = [x, conv_weight, out_bias]
-        y1 = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y2 = relay.nn.relu(y1)
-        # fold will fail because y1 is referred also by y2
-        y1 = relay.multiply(y1, out_scale)
-        y = relay.add(y1, y2)
-        return relay.Function(args, y)
-
-    def check(shape, in_channels, channels, blocking, fbefore):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            out_bias = relay.var("out_bias", shape=(channels // blocking[1], 1, 1, blocking[1]))
-            out_scale = relay.const(
-                _get_positive_scale((channels // blocking[1], 1, 1, blocking[1]))
-            )
-        else:
-            out_bias = relay.var("out_bias", shape=(channels, 1, 1))
-            out_scale = relay.const(_get_positive_scale((channels, 1, 1)))
-        y1 = fbefore(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        tvm.ir.assert_structural_equal(y1_folded, y1)
-
-    check((4, 4, 10, 10), 4, 4, None, fail1)
-    check((2, 2, 10, 10, 2), 4, 4, (2, 2), fail1)
-    check((4, 4, 10, 10), 4, 4, None, fail2)
-    check((4, 2, 10, 10, 2), 4, 4, (2, 2), fail2)
-
-
-def test_fold_bwd_relu_fail():
-    """testcase where we canont fold because scale can not pass relu"""
-
-    def before(x, conv_weight, out_scale, channels, blocking):
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y = relay.nn.relu(y)
-        y = relay.multiply(x, out_scale)
-        return relay.Function(relay.analysis.free_vars(y), y)
-
-    def check(shape, channels, blocking, out_scale):
-        x = relay.var("x", shape=shape)
-        in_channels = shape[1]
-        weight = relay.var("weight")
-        y1 = before(x, weight, out_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        tvm.ir.assert_structural_equal(y1, y1_folded)
-
-    out_scale = relay.var("in_scale", shape=(4, 1, 1))
-    check((4, 4, 10, 10), 4, None, out_scale)
-    out_scale = relay.const(np.random.uniform(size=(4, 1, 1), low=-1.0, high=0.0)).astype("float32")
-    check((4, 4, 10, 10), 4, None, out_scale)
-
-    out_scale = relay.var("in_scale", shape=(1, 2, 1, 1, 2))
-    check((4, 2, 10, 10, 2), 4, (2, 2), out_scale)
-    out_scale = relay.const(np.random.uniform(size=(1, 2, 1, 1, 2), low=-1.0, high=0.0)).astype(
-        "float32"
-    )
-    check((4, 2, 10, 10, 2), 4, (2, 2), out_scale)
-
-
-def test_fold_bwd_negative_scale():
-    """Testcase of folding negative scale"""
-
-    def before(x, conv_weight, out_scale, channels, blocking):
-        args = [x, conv_weight]
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        y = relay.multiply(y, out_scale)
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, out_scale, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight]
-        if blocking:
-            squeezed_scale = relay.squeeze(out_scale, axis=[0, 2, 3])
-            conv_weight = relay.multiply(
-                conv_weight,
-                relay.reshape(squeezed_scale, (channels // blocking[1], 1, 1, 1, 1, blocking[1])),
-            )
-        else:
-            squeezed_scale = relay.squeeze(out_scale, axis=[1, 2])
-            conv_weight = relay.multiply(
-                conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-            )
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW{}c".format(blocking[0]) if blocking else "NCHW",
-            kernel_layout="OIHW1i{}o".format(blocking[1]) if blocking else "OIHW",
-        )
-        return relay.Function(args, y)
-
-    def check(shape, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            out_scale = relay.const(
-                -_get_positive_scale((1, channels // blocking[1], 1, 1, blocking[1]))
-            )
-        else:
-            out_scale = relay.const(-_get_positive_scale((channels, 1, 1)))
-        y1 = before(x, weight, out_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, out_scale, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 8, None)
-    check((2, 2, 10, 10, 2), 8, (2, 2))
-
-
-def test_fold_bwd_dense():
-    """dense testcase."""
-
-    def before(x, weight, in_bias, in_scale):
-        args = [x, weight, in_bias]
-        x = relay.nn.dense(x, weight)
-        x = relay.add(x, in_bias)
-        x = relay.nn.relu(x)
-        y = relay.multiply(x, in_scale)
-        return relay.Function(args, y)
-
-    def expected(x, weight, in_bias, in_scale):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, weight, in_bias]
-        scale = relay.expand_dims(in_scale, axis=1)
-        weight = relay.multiply(weight, scale)
-        x = relay.nn.dense(x, weight)
-        bias = relay.multiply(in_bias, in_scale)
-        x = relay.add(x, bias)
-        y = relay.nn.relu(x)
-        return relay.Function(args, y)
-
-    def check(data_shape, weight_shape):
-        x = relay.var("x", shape=data_shape)
-        weight = relay.var("weight", shape=weight_shape)
-        out_channels = weight_shape[0]
-        in_bias = relay.var("in_bias", shape=(out_channels,))
-        in_scale = relay.const(_get_positive_scale((out_channels,)))
-        y1 = before(x, weight, in_bias, in_scale)
-        y1 = run_opt_pass(y1, transform.InferType())
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, in_bias, in_scale)
-
-        y1_folded = run_opt_pass(y1_folded, transform.InferType())
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4), (3, 4))
-    check((3, 5), (4, 5))
-
-
-def test_fold_bwd_bias_add():
-    """bias add testcase."""
-
-    def before(x, conv_weight, out_bias, out_scale, channels):
-        args = [x, conv_weight, out_bias]
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-        y = relay.nn.bias_add(y, out_bias)
-        y = relay.nn.relu(y)
-        y = relay.multiply(y, out_scale)
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, out_bias, out_scale, channels):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias]
-        squeezed_scale = relay.squeeze(out_scale, axis=[1, 2])
-        conv_weight = relay.multiply(
-            conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-        )
-
-        y = relay.nn.conv2d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-
-        out_bias = relay.multiply(out_bias, squeezed_scale)
-        y = relay.nn.bias_add(y, out_bias)
-        y = relay.nn.relu(y)
-        return relay.Function(args, y)
-
-    def check(shape, channels):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        out_bias = relay.var("out_bias", shape=(channels,))
-        out_scale = relay.const(_get_positive_scale((channels, 1, 1)))
-        y1 = before(x, weight, out_bias, out_scale, channels)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, out_bias, out_scale, channels)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10), 4)
-
-
-def test_fold_fwd_conv3d():
-    """Conv3d testcase."""
-
-    def before(x, conv_weight, in_bias, in_scale, channels, blocking):
-        args = [x, conv_weight, in_bias]
-        x = relay.multiply(x, in_scale)
-        x = relay.nn.relu(x)
-        x = relay.add(x, in_bias)
-        y = relay.nn.conv3d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3, 3),
-            padding=(1, 1, 1),
-            data_layout="NCDHW{}c".format(blocking[0]) if blocking else "NCDHW",
-            kernel_layout="OIDHW2i{}o".format(blocking[1]) if blocking else "OIDHW",
-        )
-
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, in_bias, in_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, in_bias]
-        if blocking:
-            squeezed_scale = relay.squeeze(in_scale, axis=[0, 2, 3, 4])
-            x = relay.nn.relu(x)
-            in_bias = relay.divide(
-                in_bias,
-                relay.reshape(
-                    squeezed_scale, (1, in_channels // blocking[0], 1, 1, 1, blocking[0])
-                ),
-            )  # NCHWc
-            x = relay.add(x, in_bias)
-            conv_weight = relay.multiply(
-                conv_weight, relay.reshape(squeezed_scale, (1, in_channels // 2, 1, 1, 1, 2, 1))
-            )  # OIHWio
-        else:
-            squeezed_scale = relay.squeeze(in_scale, axis=[1, 2, 3])
-            x = relay.nn.relu(x)
-            in_bias = relay.divide(
-                in_bias, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-            )
-            x = relay.add(x, in_bias)
-            conv_weight = relay.multiply(
-                conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-            )
-
-        y = relay.nn.conv3d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3, 3),
-            padding=(1, 1, 1),
-            data_layout="NCDHW{}c".format(blocking[0]) if blocking else "NCDHW",
-            kernel_layout="OIDHW2i{}o".format(blocking[1]) if blocking else "OIDHW",
-        )
-        return relay.Function(args, y)
-
-    def check(shape, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        if blocking:
-            in_channels = shape[1] * shape[-1]
-            in_bias = relay.var(
-                "in_bias", shape=(1, in_channels // blocking[0], 1, 1, 1, blocking[0])
-            )
-            in_scale = relay.const(
-                _get_positive_scale((1, in_channels // blocking[0], 1, 1, 1, blocking[0]))
-            )
-        else:
-            in_channels = shape[1]
-            in_bias = relay.var("in_bias", shape=(in_channels, 1, 1, 1))
-            in_scale = relay.const(_get_positive_scale((in_channels, 1, 1, 1)))
-        y1 = before(x, weight, in_bias, in_scale, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis())
-        y1_expected = expected(x, weight, in_bias, in_scale, in_channels, channels, blocking)
-
-        y1_folded = run_opt_pass(y1_folded, transform.InferType())
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10, 10), 2, None)
-    check((2, 2, 10, 10, 10, 2), 8, (2, 4))
-
-
-def test_fold_bwd_conv3d():
-    """Conv3d testcase."""
-
-    def before(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        args = [x, conv_weight, out_bias]
-        if blocking:
-            out_bias = relay.reshape(out_bias, (1, channels // blocking[1], 1, 1, 1, blocking[1]))
-        else:
-            out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=3)
-        y = relay.nn.conv3d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3, 3),
-            padding=(1, 1, 1),
-            data_layout="NCDHW{}c".format(blocking[0]) if blocking else "NCDHW",
-            kernel_layout="OIDHW1i{}o".format(blocking[1]) if blocking else "OIDHW",
-        )
-        y = relay.add(y, out_bias)
-        y = relay.nn.relu(y)
-        if blocking:
-            out_scale = relay.reshape(out_scale, (1, channels // blocking[1], 1, 1, 1, blocking[1]))
-        y = relay.multiply(y, out_scale)
-        return relay.Function(args, y)
-
-    def expected(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
-        # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias]
-        if blocking:
-            out_bias = relay.reshape(out_bias, (1, channels // blocking[1], 1, 1, 1, blocking[1]))
-            out_scale = relay.reshape(out_scale, (1, channels // blocking[1], 1, 1, 1, blocking[1]))
-            squeezed_scale = relay.squeeze(out_scale, axis=[0, 2, 3, 4])
-            conv_weight = relay.multiply(
-                conv_weight,
-                relay.reshape(
-                    squeezed_scale, (channels // blocking[1], 1, 1, 1, 1, 1, blocking[1])
-                ),
-            )
-        else:
-            out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=3)
-            squeezed_scale = relay.squeeze(out_scale, axis=[1, 2, 3])
-            conv_weight = relay.multiply(
-                conv_weight, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=4)
-            )
-
-        y = relay.nn.conv3d(
-            x,
-            conv_weight,
-            channels=channels,
-            kernel_size=(3, 3, 3),
-            padding=(1, 1, 1),
-            data_layout="NCDHW{}c".format(blocking[0]) if blocking else "NCDHW",
-            kernel_layout="OIDHW1i{}o".format(blocking[1]) if blocking else "OIDHW",
-        )
-        if blocking:
-            out_bias = relay.multiply(
-                out_bias,
-                relay.reshape(squeezed_scale, (1, channels // blocking[1], 1, 1, 1, blocking[1])),
-            )
-        else:
-            out_bias = relay.multiply(
-                out_bias, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3)
-            )
-        y = relay.add(y, out_bias)
-        y = relay.nn.relu(y)
-        return relay.Function(args, y)
-
-    def check(shape, in_channels, channels, blocking):
-        x = relay.var("x", shape=shape)
-        weight = relay.var("weight")
-        out_bias = relay.var("out_bias", shape=(channels,))
-        if blocking:
-            out_scale = relay.const(_get_positive_scale((channels,)))
-        else:
-            out_scale = relay.const(_get_positive_scale((channels, 1, 1, 1)))
-        y1 = before(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1 = run_opt_pass(y1, transform.InferType())
-        type_dict = {x.name_hint: x.checked_type for x in y1.params}
-        weight = relay.var("weight", type_dict["weight"])
-        y1_folded = run_opt_pass(y1, transform.BackwardFoldScaleAxis())
-        y1_expected = expected(x, weight, out_bias, out_scale, in_channels, channels, blocking)
-        y1_expected = run_opt_pass(y1_expected, transform.InferType())
-        tvm.ir.assert_structural_equal(y1_folded, y1_expected)
-
-    check((2, 4, 10, 10, 10), 4, 8, None)
-    check((2, 2, 10, 10, 10, 16), 32, 64, (16, 16))
-
-
-if __name__ == "__main__":
-    test_fold_fwd_simple()
-    test_fold_fwd_dual_path()
-    test_fold_fwd_fail()
-    test_fold_fwd_relu_fail()
-    test_fold_fwd_negative_scale()
-    test_fold_fwd_dense()
-    test_fold_bwd_simple_constant()
-    test_fold_bwd_simple()
-    test_fold_bwd_dual_path()
-    test_fold_bwd_dual_consumer()
-    test_fold_bwd_fail()
-    test_fold_bwd_relu_fail()
-    test_fold_bwd_negative_scale()
-    test_fold_bwd_dense()
-    test_fold_bwd_bias_add()
-    test_fold_fwd_conv3d()
-    test_fold_bwd_conv3d()
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
deleted file mode 100644
index 11411a830658..000000000000
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ /dev/null
@@ -1,948 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_opt_pass
-import tvm.testing
-import tvm.topi.testing
-
-
-def test_fuse_simple():
-    """Simple testcase."""
-
-    def before():
-        x = relay.var("x", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.exp(y)
-        w = relay.squeeze(z)
-        return relay.Function([x], w)
-
-    def expected():
-        x = relay.var("p", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.exp(y)
-        w = relay.squeeze(z)
-        f1 = relay.Function([x], w)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        x = relay.var("x", shape=(10, 20))
-        y = relay.Call(f1, [x])
-        return relay.Function([x], y)
-
-    z = before()
-    zz = run_opt_pass(z, transform.FuseOps())
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_conv2d_fuse():
-    """Test fusion case of conv2d"""
-
-    def before(dshape):
-        x = relay.var("x", shape=dshape)
-        x = relay.add(x, relay.const(1, "float32"))
-        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=16)
-        # this is the next dominator.
-        y1 = relay.add(relay.const(1, "float32"), y)
-        y = relay.add(y, y1)
-        # second path
-        z2 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(1, 1), padding=(0, 0), channels=16)
-        z3 = relay.nn.conv2d(y, relay.var("w3"), kernel_size=(3, 3), padding=(1, 1), channels=16)
-        # add can only be fused to z1
-        z = relay.add(z2, z3)
-        return relay.Function(relay.analysis.free_vars(z), z)
-
-    def expected(dshape):
-        # segment 0
-        x = relay.var("p0", shape=dshape)
-        y = relay.add(x, relay.const(1, "float32"))
-        f0 = relay.Function([x], y)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        # segment 1
-        x = relay.var("p0", shape=dshape)
-        w = relay.var("p1")
-        y = relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1), channels=16)
-        y1 = relay.add(relay.const(1, "float32"), y)
-        y = relay.add(y, y1)
-        f1 = relay.Function([x, w], y)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        # segment 2
-        x = relay.var("p0", shape=dshape)
-        w = relay.var("p1")
-        z2 = relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1), channels=16)
-        f2 = relay.Function([x, w], z2)
-        f2 = f2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        # segment 3
-        x = relay.var("p0", shape=dshape)
-        w = relay.var("p1")
-        offset = relay.var("p2", shape=dshape)
-        z3 = relay.nn.conv2d(x, w, kernel_size=(1, 1), padding=(0, 0), channels=16)
-        z3 = relay.add(z3, offset)
-        f3 = relay.Function([x, w, offset], z3)
-        f3 = f3.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        # compose
-        x = relay.var("x", shape=dshape)
-        y = relay.Call(f0, [x])
-        y = relay.Call(f1, [y, relay.var("w1")])
-        z2 = relay.Call(f2, [y, relay.var("w3")])
-        z3 = relay.Call(f3, [y, relay.var("w2"), z2])
-        z = z3
-        return relay.Function(relay.analysis.free_vars(z), z)
-
-    dshape = (1, 16, 64, 64)
-    z = before(dshape)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    after = run_opt_pass(expected(dshape), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_concatenate():
-    """Test fusion case involving concat op and Tuple node"""
-
-    def before(dshape):
-        x = relay.var("x", shape=dshape)
-        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-        upsampled = relay.nn.upsampling(pooled, scale_h=2, scale_w=2, layout="NCHW")
-        concat = relay.concatenate((upsampled, x), axis=1)
-        out = relay.add(concat, relay.const(1, "float32"))
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def expected(dshape):
-        x = relay.var("x", shape=dshape)
-        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-        f0 = relay.Function([x], pooled)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2] // 2, dshape[3] // 2))
-        p1 = relay.var("p1", shape=dshape)
-        upsampled = relay.nn.upsampling(p0, scale_h=2, scale_w=2, layout="NCHW")
-        concat = relay.concatenate((upsampled, p1), axis=1)
-        out = relay.add(concat, relay.const(1, "float32"))
-        f1 = relay.Function([p0, p1], out)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("x", shape=dshape)
-        y = relay.Call(f0, [x])
-        z = relay.Call(f1, [y, x])
-        return relay.Function([x], z)
-
-    dshape = (1, 16, 64, 64)
-    z = before(dshape)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=0))
-    assert not relay.analysis.free_vars(zz)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    assert not relay.analysis.free_vars(zz)
-    after = run_opt_pass(expected(dshape), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_tuple_root():
-    """Test fusion case where Tuple node is the root in its group"""
-
-    def before(dshape):
-        x = relay.var("x", shape=dshape)
-        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-        upsampled = relay.nn.upsampling(pooled, scale_h=2, scale_w=2, layout="NCHW")
-        out = relay.Tuple((upsampled, x))
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def expected(dshape):
-        x = relay.var("x", shape=dshape)
-        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-        f0 = relay.Function([x], pooled)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2] // 2, dshape[3] // 2))
-        upsampled = relay.nn.upsampling(p0, scale_h=2, scale_w=2, layout="NCHW")
-        f1 = relay.Function([p0], upsampled)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("x", shape=dshape)
-        y = relay.Call(f0, [x])
-        z = relay.Call(f1, [y])
-        tup = relay.Tuple((z, x))
-        return relay.Function([x], tup)
-
-    dshape = (1, 16, 64, 64)
-    z = before(dshape)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=0))
-    assert not relay.analysis.free_vars(zz)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    assert not relay.analysis.free_vars(zz)
-    after = run_opt_pass(expected(dshape), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_stop_fusion():
-    def before(dshape):
-        x = relay.var("x", shape=dshape)
-        y = relay.add(x, relay.const(1, "float32"))
-        y = relay.annotation.stop_fusion(y)
-        z = relay.exp(y)
-        return relay.Function([x], z)
-
-    def expected(dshape):
-        x = relay.var("p0", shape=dshape)
-        y = relay.add(x, relay.const(1, "float32"))
-        f1 = relay.Function([x], y)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("p01", shape=dshape)
-        y = relay.exp(x)
-        f2 = relay.Function([x], y)
-        f2 = f2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("x", shape=dshape)
-        y = relay.Call(f1, [x])
-        z = relay.Call(f2, [y])
-        return relay.Function([x], z)
-
-    dshape = (10, 20)
-    z = before(dshape)
-    zz = run_opt_pass(z, transform.FuseOps())
-    after = run_opt_pass(expected(dshape), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_fuse_myia_regression():
-    def before(dshape, dtype):
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        y = relay.var("y", shape=dshape, dtype=dtype)
-        sb = relay.ScopeBuilder()
-        with sb.if_scope(relay.op.greater(x, y)):
-            sb.ret(relay.Function([], x))
-        with sb.else_scope():
-            sb.ret(relay.Function([], y))
-        return relay.Function([x, y], relay.Call(sb.get(), []))
-
-    def expected(dshape, dtype):
-        x = relay.var("x", shape=dshape, dtype=dtype)
-        y = relay.var("y", shape=dshape, dtype=dtype)
-        sb = relay.ScopeBuilder()
-        p1 = relay.var("p1", shape=dshape, dtype=dtype)
-        p2 = relay.var("p2", shape=dshape, dtype=dtype)
-        fused_gt = relay.Function([p1, p2], relay.op.greater(p1, p2))
-        fused_gt = fused_gt.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        with sb.if_scope(fused_gt(x, y)):
-            sb.ret(relay.Function([], x))
-        with sb.else_scope():
-            sb.ret(relay.Function([], y))
-        return relay.Function([x, y], relay.Call(sb.get(), []))
-
-    dshape = ()
-    dtype = "int64"
-    f = before(dshape, dtype)
-    zz = run_opt_pass(f, transform.FuseOps())
-    after = run_opt_pass(expected(dshape, dtype), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_fuse_tuple_get_elemwise():
-    def before(dim):
-        X = relay.var("X", shape=(1, dim))
-        W = relay.var("W", shape=(3 * dim, dim))
-        matmul = relay.nn.dense(X, W)
-        splitted = relay.split(matmul, indices_or_sections=3, axis=1)
-        out = relay.sigmoid(splitted[0]) + relay.tanh(splitted[1]) * relay.exp(splitted[2])
-        return relay.Function([X, W], out)
-
-    def expected(dim):
-        p0 = relay.var("p0", shape=(1, dim))
-        p1 = relay.var("p1", shape=(3 * dim, dim))
-        matmul = relay.nn.dense(p0, p1)
-        f0 = relay.Function([p0, p1], matmul)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p01 = relay.var("p01", shape=(1, 3 * dim))
-        splitted = relay.split(p01, indices_or_sections=3, axis=1)
-        out = relay.sigmoid(splitted[0]) + relay.tanh(splitted[1]) * relay.exp(splitted[2])
-        f1 = relay.Function([p01], out)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        X = relay.var("X", shape=(1, dim))
-        W = relay.var("W", shape=(3 * dim, dim))
-        y = relay.Call(f0, [X, W])
-        z = relay.Call(f1, [y])
-        return relay.Function([X, W], z)
-
-    dim = 10
-    z = before(dim)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=0))
-    assert not relay.analysis.free_vars(zz)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    assert not relay.analysis.free_vars(zz)
-    after = run_opt_pass(expected(dim), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_tuple_get_root():
-    def before(dim):
-        X = relay.var("X", shape=(1, 3 * dim))
-        W = relay.var("W", shape=(dim, dim))
-        splitted = relay.split(X, indices_or_sections=3, axis=1)
-        out = relay.nn.dense(splitted[0], W)
-        return relay.Function([X, W], out)
-
-    def expected(dim):
-        p0 = relay.var("p0", shape=(1, 3 * dim))
-        splitted = relay.split(p0, indices_or_sections=3, axis=1)
-        out = splitted[0]
-        f0 = relay.Function([p0], out)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p01 = relay.var("p01", shape=(1, dim))
-        p1 = relay.var("p1", shape=(dim, dim))
-        out = relay.nn.dense(p01, p1)
-        f1 = relay.Function([p01, p1], out)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        X = relay.var("X", shape=(1, 3 * dim))
-        W = relay.var("W", shape=(dim, dim))
-        y = relay.Call(f0, [X])
-        z = relay.Call(f1, [y, W])
-        return relay.Function([X, W], z)
-
-    dim = 10
-    z = before(dim)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=0))
-    assert not relay.analysis.free_vars(zz)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    assert not relay.analysis.free_vars(zz)
-    after = run_opt_pass(expected(dim), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def fuse0(mod):
-    mod = relay.transform.InferType()(mod)
-    return relay.transform.FuseOps(fuse_opt_level=0)(mod)
-
-
-def fuse2(mod):
-    mod = relay.transform.InferType()(mod)
-    return relay.transform.FuseOps(fuse_opt_level=2)(mod)
-
-
-def test_tuple_intermediate():
-    def before(x):
-        inj = relay.squeeze(x)
-        y1 = relay.add(inj, relay.const(1, "float32"))
-        tmp = relay.squeeze(inj)
-        tmp = relay.add(tmp, relay.const(1, "float32"))
-        y2 = relay.add(tmp, relay.const(1, "float32"))
-        y3 = relay.add(inj, relay.const(1, "float32"))
-        concat = relay.concatenate((y1, y2, y3), axis=1)
-        out_inj = relay.squeeze(concat)
-        out = relay.add(out_inj, relay.const(1, "float32"))
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def expected(p0):
-        f0 = before(p0)
-        f1 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        x = relay.var("x", shape=dshape)
-        y = relay.Call(f1, [x])
-        return relay.Function([x], y)
-
-    dshape = (1, 16, 64, 64)
-    x = relay.var("x", shape=dshape)
-    orig = before(x)
-    fuse0(tvm.IRModule.from_expr(orig))
-    m = fuse2(tvm.IRModule.from_expr(orig))
-    relay.build(m, "llvm")
-    after = run_opt_pass(expected(x), transform.InferType())
-    tvm.ir.assert_structural_equal(m["main"], after)
-
-
-def test_tuple_consecutive():
-    def gen_intermediate_tuple(x):
-        y1 = relay.add(x, relay.const(1, "float32"))
-        y2 = relay.add(x, relay.const(1, "float32"))
-        y3 = relay.add(x, relay.const(1, "float32"))
-        concat = relay.concatenate((y1, y2, y3), axis=1)
-        out = relay.add(concat, relay.const(1, "float32"))
-        return out
-
-    def gen_consecutive_tuple(x):
-        y1 = gen_intermediate_tuple(x)
-        y2 = gen_intermediate_tuple(x)
-        y3 = gen_intermediate_tuple(x)
-        concat = relay.concatenate((y1, y2, y3), axis=1)
-        return concat
-
-    def before(x):
-        concat = gen_consecutive_tuple(x)
-        pooled = relay.nn.max_pool2d(concat, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-        out = relay.add(pooled, relay.const(1, "float32"))
-        out2 = relay.add(out, relay.const(1, "float32"))
-        out_tup = relay.Tuple((out, out2))
-        return relay.Function(relay.analysis.free_vars(out_tup), out_tup)
-
-    def expected(dshape):
-        p0 = relay.var("p0", shape=dshape)
-        concat = gen_consecutive_tuple(p0)
-        f0 = relay.Function([p0], concat)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p01 = relay.var("p01", shape=(1, dshape[1] * 9, dshape[2], dshape[3]))
-        pooled = relay.nn.max_pool2d(p01, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
-        out = relay.add(pooled, relay.const(1, "float32"))
-        f1 = relay.Function([p01], out)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p02 = relay.var("p02", shape=(1, dshape[1] * 9, dshape[2] // 2, dshape[3] // 2))
-        out = relay.add(p02, relay.const(1, "float32"))
-        f2 = relay.Function([p02], out)
-        f2 = f2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("x", shape=dshape)
-        y = relay.Call(f0, [x])
-        z = relay.Call(f1, [y])
-        z2 = relay.Call(f2, [z])
-
-        return relay.Function([x], relay.Tuple((z, z2)))
-
-    dshape = (1, 16, 64, 64)
-    x = relay.var("x", shape=dshape)
-    orig = before(x)
-    fuse0(tvm.IRModule.from_expr(orig))
-    m = fuse2(tvm.IRModule.from_expr(orig))
-    relay.build(m, "llvm")
-    after = run_opt_pass(expected(dshape), transform.InferType())
-    tvm.ir.assert_structural_equal(m["main"], after)
-
-
-def test_inception_like():
-    def conv(data):
-        y = relay.nn.conv2d(data, relay.var("w"), kernel_size=(3, 3), padding=(1, 1), channels=16)
-        return relay.nn.relu(data=y)
-
-    def inception_like(data):
-        c0 = conv(data)
-        c1 = conv(data)
-        return relay.concatenate((c0, c1), axis=1)
-
-    def before(dshape):
-        x = relay.var("x", shape=dshape)
-        in1 = inception_like(x)
-        in2 = inception_like(in1)
-        return relay.Function(relay.analysis.free_vars(in2), in2)
-
-    def expected(dshape):
-        p0 = relay.var("p0", shape=dshape)
-        c = conv(p0)
-        f0 = relay.Function(relay.analysis.free_vars(c), c)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p01 = relay.var("p01", shape=dshape)
-        c = conv(p01)
-        f1 = relay.Function(relay.analysis.free_vars(c), c)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p02 = relay.var("p02", shape=dshape)
-        p12 = relay.var("p12", shape=dshape)
-        concat1 = relay.concatenate((p02, p12), axis=1)
-        f_concat1 = relay.Function([p02, p12], concat1)
-        f_concat1 = f_concat1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        dshape2 = (dshape[0], dshape[1] * 2, dshape[2], dshape[3])
-
-        p03 = relay.var("p03", shape=dshape2)
-        c = conv(p03)
-        f2 = relay.Function(relay.analysis.free_vars(c), c)
-        f2 = f2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p04 = relay.var("p04", shape=dshape2)
-        c = conv(p04)
-        f3 = relay.Function(relay.analysis.free_vars(c), c)
-        f3 = f3.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        p05 = relay.var("p05", shape=dshape)
-        p15 = relay.var("p15", shape=dshape)
-        concat2 = relay.concatenate((p05, p15), axis=1)
-        f_concat2 = relay.Function([p05, p15], concat2)
-        f_concat2 = f_concat2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("x", shape=dshape)
-        c1 = relay.Call(f0, [x, relay.var("w1")])
-        c2 = relay.Call(f1, [x, relay.var("w2")])
-        concat = relay.Call(f_concat1, [c1, c2])
-        c3 = relay.Call(f2, [concat, relay.var("w3")])
-        c4 = relay.Call(f3, [concat, relay.var("w4")])
-        out = relay.Call(f_concat2, [c3, c4])
-
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    dshape = (1, 16, 64, 64)
-    orig = before(dshape)
-    fuse0(tvm.IRModule.from_expr(orig))
-    m = fuse2(tvm.IRModule.from_expr(orig))
-    relay.build(m, "llvm")
-    after = run_opt_pass(expected(dshape), transform.InferType())
-    tvm.ir.assert_structural_equal(m["main"], after)
-
-
-def test_fuse_parallel_injective():
-    """Test fusing parallel injective ops to an elemwise op."""
-
-    def before():
-        x = relay.var("x", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.squeeze(y)
-        u = relay.transpose(y, axes=[0, 1])
-        w = relay.left_shift(z, u)
-        return relay.Function([x], w)
-
-    def expected():
-        x = relay.var("p", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.squeeze(y)
-        u = relay.transpose(y, axes=[0, 1])
-        w = relay.left_shift(z, u)
-        f1 = relay.Function([x], w)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        x = relay.var("x", shape=(10, 20))
-        y = relay.Call(f1, [x])
-        return relay.Function([x], y)
-
-    z = before()
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=0))
-    assert not relay.analysis.free_vars(zz)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    assert not relay.analysis.free_vars(zz)
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_immutable():
-    """Verify the fusion pass won't change original module."""
-
-    def before():
-        x = relay.var("x", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.exp(y)
-        w = relay.squeeze(z)
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([x], w)
-        return mod
-
-    def expected():
-        x = relay.var("p", shape=(10, 20))
-        y = relay.add(x, relay.const(1, "float32"))
-        z = relay.exp(y)
-        w = relay.squeeze(z)
-        f1 = relay.Function([x], w)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        x = relay.var("x", shape=(10, 20))
-        y = relay.Call(f1, [x])
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([x], y)
-        return mod
-
-    mod = transform.InferType()(before())
-    new_mod = transform.FuseOps(fuse_opt_level=2)(mod)
-    tvm.ir.assert_structural_equal(mod, transform.InferType()(before()))
-    tvm.ir.assert_structural_equal(new_mod, transform.InferType()(expected()))
-
-
-def test_split():
-    """Test that the result is well formed."""
-    x = relay.var("x", shape=(6, 9))
-    y = relay.split(x, 3).astuple()
-    a = relay.TupleGetItem(y, 0)
-    b = relay.TupleGetItem(y, 1)
-    c = relay.TupleGetItem(y, 2)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], a + relay.RefRead(relay.RefCreate(b)) + c)
-    mod = transform.InferType()(mod)
-    mod = transform.FuseOps()(mod)
-
-
-def test_fuse_max():
-    """Test the constraint of number of nodes in op fusion."""
-
-    def before(n):
-        x = relay.var("x", shape=(10, 20))
-        y = x
-        for i in range(n):
-            y = relay.exp(y)
-        return relay.Function([x], y)
-
-    def expected(n, max_fused_ops):
-        x = relay.var("p", shape=(10, 20))
-        y = x
-        for i in range(max_fused_ops):
-            y = relay.exp(y)
-        f1 = relay.Function([x], y)
-        f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        x = relay.var("x", shape=(10, 20))
-        z = relay.Call(f1, [x])
-        xx = relay.var("pp", shape=(10, 20))
-        yy = xx
-        # it is assumed that there are two fused functions
-        for i in range(n - max_fused_ops):
-            yy = relay.exp(yy)
-        f2 = relay.Function([xx], yy)
-        f2 = f2.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        zz = relay.Call(f2, [z])
-        return relay.Function([x], zz)
-
-    max_fused_ops = 256
-    n = 300
-    z = before(n)
-    zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2))
-    zz = run_opt_pass(z, transform.FuseOps())
-    after = run_opt_pass(expected(n, max_fused_ops), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-    max_fused_ops = 10
-    n = 20
-    z = before(n)
-    after = run_opt_pass(expected(n, max_fused_ops), transform.InferType())
-
-    with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
-        zz = run_opt_pass(z, transform.FuseOps())
-
-    tvm.ir.assert_structural_equal(zz, after)
-
-    with tvm.target.Target("opencl"):
-        with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
-            cl_zz = run_opt_pass(z, transform.FuseOps())
-
-    tvm.ir.assert_structural_equal(cl_zz, after)
-
-
-link_params = tvm.testing.parameter(False, True)
-
-
-def test_fuse_take(link_params):
-    """Test fusion case involving concat and take"""
-
-    def before():
-        shape = (tvm.tir.const(10, "int64"), tvm.tir.const(1, "int64"))
-        x = relay.var("x", shape=shape)
-        concat = relay.concatenate([x, x], axis=-1)
-        out = relay.op.take(concat, indices=relay.const([0], dtype="int64"))
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def expected(link_params):
-        shape1 = (tvm.tir.const(10, "int64"), tvm.tir.const(1, "int64"))
-        shape2 = (tvm.tir.const(1, "int64"),)
-        x = relay.var("x", shape=shape1)
-        p0 = relay.var("p0", shape=shape1)
-        p1 = relay.var("p1", shape=shape2, dtype="int64")
-        c = relay.const([0], dtype="int64")
-        concat = relay.concatenate([p0, p0], axis=-1)
-        out = relay.op.take(concat, indices=c if link_params else p1)
-
-        f0 = relay.Function([p0] if link_params else [p0, p1], out)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        y = relay.Call(f0, [x] if link_params else [x, c])
-        return relay.Function([x], y)
-
-    after = run_opt_pass(expected(link_params), transform.InferType())
-    with tvm.transform.PassContext(opt_level=2, config={"relay.FuseOps.link_params": link_params}):
-        m = run_opt_pass(before(), transform.InferType())
-        m = run_opt_pass(m, transform.FuseOps())
-    tvm.ir.assert_structural_equal(m, after)
-    relay.build(m, "llvm")
-
-
-def test_fuse_gather_nd(link_params):
-    """Test fusion case involving concat and gather_nd"""
-
-    def before():
-        shape = (tvm.tir.const(10, "int64"), tvm.tir.const(1, "int64"))
-        x = relay.var("x", shape=shape)
-        concat = relay.concatenate([x, x], axis=-1)
-        out = relay.gather_nd(concat, indices=relay.expr.const([[0, 1], [1, 0]], dtype="int64"))
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def expected(link_params):
-        shape1 = (tvm.tir.const(10, "int64"), tvm.tir.const(1, "int64"))
-        shape2 = (tvm.tir.const(2, "int64"), tvm.tir.const(2, "int64"))
-        x = relay.var("x", shape=shape1)
-        p0 = relay.var("p0", shape=shape1)
-        p1 = relay.var("p1", shape=shape2, dtype="int64")
-        c = relay.const([[0, 1], [1, 0]], dtype="int64")
-        concat = relay.concatenate([p0, p0], axis=-1)
-        out = relay.gather_nd(concat, indices=c if link_params else p1)
-
-        f0 = relay.Function([p0] if link_params else [p0, p1], out)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        y = relay.Call(f0, [x] if link_params else [x, c])
-        return relay.Function([x], y)
-
-    after = run_opt_pass(expected(link_params), transform.InferType())
-    with tvm.transform.PassContext(opt_level=2, config={"relay.FuseOps.link_params": link_params}):
-        m = run_opt_pass(before(), transform.InferType())
-        m = run_opt_pass(m, transform.FuseOps())
-    tvm.ir.assert_structural_equal(m, after)
-    relay.build(m, "llvm")
-
-
-@tvm.testing.uses_gpu
-def test_fuse_bcast_reduce_scalar():
-    """Test fusion case with broadcast and reduction involving scalar"""
-
-    def before():
-        x = relay.var("x", shape=(), dtype="int32")
-        less = relay.less(x, relay.const(10, dtype="int32"))
-        z = relay.min(less)
-        return relay.Function([x], z)
-
-    def expected():
-        p0 = relay.var("p0", shape=(), dtype="int32")
-        less = relay.less(p0, relay.const(10, dtype="int32"))
-        z0 = relay.min(less)
-        f0 = relay.Function([p0], z0)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-
-        x = relay.var("x", shape=(), dtype="int32")
-        f = relay.Call(f0, [x])
-        return relay.Function([x], f)
-
-    orig = before()
-    m = fuse2(tvm.IRModule.from_expr(orig))
-    for tgt, dev in tvm.testing.enabled_targets():
-        relay.build(m, tgt)
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(m["main"], after)
-
-
-def test_fuse_max_diamond():
-    def create_diamond(x, branch_len):
-        x1 = x
-        x2 = x
-        for _ in range(branch_len):
-            x1 = relay.exp(x1)
-            x2 = relay.exp(x2)
-        return relay.add(x1, x2)
-
-    def before(branch_len, num_diamond):
-        x = relay.var("x", shape=(10, 20))
-        out = x
-        for _ in range(num_diamond):
-            out = create_diamond(out, branch_len)
-        return relay.Function([x], out)
-
-    def after(branch_len, num_diamond):
-        def create_diamond_func(inp):
-            inp_var = relay.var("p", shape=(10, 20))
-            d = create_diamond(inp_var, branch_len)
-            f = relay.Function([inp_var], d)
-            f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-            return relay.Call(f, [inp])
-
-        inp = relay.var("x", shape=(10, 20))
-        out = inp
-        for _ in range(num_diamond):
-            out = create_diamond_func(out)
-        return relay.Function([inp], out)
-
-    branch_len = 5
-    max_fused_ops = branch_len * 2 + 1  # the number of ops in one diamond
-    num_diamond = 3
-
-    with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
-        fused = run_opt_pass(before(branch_len, num_diamond), transform.FuseOps())
-
-    expected = run_opt_pass(after(branch_len, num_diamond), transform.InferType())
-    tvm.ir.assert_structural_equal(fused, expected)
-
-
-def test_fuse_dynamic_squeeze_slice_take():
-    input_data = [
-        np.random.random([1, 2, 4]).astype("float32"),
-        np.array([0]).astype("int64"),
-    ]
-
-    x = relay.var("p0107", shape=(relay.Any(), relay.Any(), 4), dtype="float32")
-    take_val = relay.var("p166", shape=(relay.Any(),), dtype="int64")
-
-    squeeze = relay.op.squeeze(x, axis=[0])
-    strided_slice = relay.op.strided_slice(
-        squeeze, begin=[0, 0], end=[15130, 2147483647], strides=[1, 1]
-    )
-    take = relay.op.take(strided_slice, take_val, axis=0)
-
-    mod = tvm.IRModule.from_expr(take)
-    result = relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm").evaluate()(
-        *input_data
-    )
-
-    np_result = np.squeeze(input_data[0][:, input_data[1][0], :], axis=0)
-
-    assert np.allclose(result.numpy(), np_result)
-
-
-@tvm.testing.uses_gpu
-def test_fuse_softmax():
-    """Test if softmax can be fused with following ops."""
-    channel_size = 16
-
-    def before():
-        x = relay.var("x", shape=(16, channel_size))
-        softmax = relay.nn.softmax(x)
-        out = relay.cast(softmax, "float16")
-        return relay.Function([x], out)
-
-    def expected():
-        p0 = relay.var("p0", shape=(16, channel_size))
-        softmax = relay.nn.softmax(p0)
-        out = relay.cast(softmax, "float16")
-
-        x = relay.var("x", shape=(16, channel_size))
-
-        f0 = relay.Function([p0], out)
-        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        y = relay.Call(f0, [x])
-        return relay.Function([x], y)
-
-    orig = before()
-    m = fuse2(tvm.IRModule.from_expr(orig))
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(m["main"], after)
-
-    inp = np.random.randn(16, channel_size).astype("float32")
-    ref = tvm.topi.testing.softmax_python(inp).astype("float16")
-
-    for tgt, dev in tvm.testing.enabled_targets():
-        ex = relay.create_executor("graph", mod=m, device=dev, target=tgt)
-        result = ex.evaluate()(inp).numpy()
-        tvm.testing.assert_allclose(result, ref, rtol=1e-4, atol=1e-4)
-
-
-target_name = tvm.testing.parameter("opencl", "metal", "cuda")
-shape_type = tvm.testing.parameter("dynamic", "static")
-
-
-def test_fuse_max_num_args(target_name, shape_type):
-    if shape_type == "dynamic":
-        shape = (tvm.tir.Any(), 20)
-        number_of_any_dims = 1
-    else:
-        shape = (10, 20)
-        number_of_any_dims = 0
-    ndims = len(shape)
-    ops_num = 300
-
-    def _base_func(name):
-        x = relay.var(name, shape=shape)
-        y = relay.add(x, relay.const(1, "float32"))
-        w = relay.exp(y)
-        return x, w
-
-    def before(n):
-        inp = []
-        out = []
-        for i in range(n):
-            x, w = _base_func(f"x{i}")
-            inp.append(x)
-            out.append(w)
-        w = out[0]
-        for i in range(len(out) - 1):
-            w = relay.add(w, out[i + 1])
-        return relay.Function(inp, w)
-
-    def after(n):
-        def create_fused_func(limit):
-            added_args = 0
-            inputs = []
-            input_vars = []
-            res = None
-            i = 0
-            while added_args < limit:
-                inp, out = _base_func(f"p{i}")
-
-                curr_args = 1 + number_of_any_dims
-                if number_of_any_dims > 0:
-                    curr_args += ndims
-
-                if added_args + curr_args > limit:
-                    f = relay.Function(inputs, res)
-                    f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-                    return i, input_vars, f
-
-                input_vars.append(relay.var(f"x{i}", shape=shape))
-                inputs.append(inp)
-                if res is None:
-                    res = out
-                else:
-                    res = relay.add(res, out)
-                added_args += curr_args
-                i += 1
-            f = relay.Function(inputs, res)
-            f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-            return i, input_vars, f
-
-        def create_accum_func(args_limit):
-            out = None
-            inputs = []
-            if args_limit == 0:
-                for i in range(n):
-                    inputs.append(relay.var(f"x{i}", shape=shape))
-                f = before(n)
-                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-                out = relay.Call(f, inputs)
-                return relay.Function(inputs, out)
-
-            i, inputs, func = create_fused_func(args_limit)
-            out = relay.Call(func, inputs)
-            while i < n:
-                inp, func = _base_func(f"p{i}")
-                inputs.append(relay.var(f"xa{i}", shape=shape))
-                curr_args = 1 + number_of_any_dims
-                if number_of_any_dims > 0:
-                    curr_args += ndims
-                f = relay.Function([inp], func)
-                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-                w = relay.Call(f, [inputs[-1]])
-                a = relay.var(f"a", shape=shape)
-                b = relay.var(f"b", shape=shape)
-                out_add = relay.add(a, b)
-                f = relay.Function([a, b], out_add)
-                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-                out = relay.Call(f, [out, w])
-                i += 1
-            return relay.Function(inputs, out)
-
-        args_limit = tvm.target.Target.current().max_function_args - (
-            1 + number_of_any_dims
-        )  # one buffer with output
-        args_limit = max(args_limit, 0)
-        return create_accum_func(args_limit)
-
-    max_fused_ops = ops_num * 5
-    with tvm.target.Target(target_name):
-        with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
-            fused = run_opt_pass(before(ops_num), transform.FuseOps())
-
-        expected = run_opt_pass(after(ops_num), transform.InferType())
-
-    tvm.ir.assert_structural_equal(fused, expected)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py
deleted file mode 100644
index 33f0775b2d87..000000000000
--- a/tests/python/relay/test_pass_gradient.py
+++ /dev/null
@@ -1,460 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import collections
-import numpy as np
-import pytest
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import GlobalVar
-from tvm.relay.analysis import free_vars, free_type_vars
-from tvm.relay import create_executor, transform
-from tvm.relay.transform import gradient
-from tvm.relay.prelude import Prelude
-from tvm.relay.testing import (
-    make_nat_expr,
-    run_infer_type,
-    check_grad,
-    rand,
-    count_ops,
-)
-import tvm.relay.op as op
-
-
-def test_fo_id():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], x)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func, mode="first_order"))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x = rand(dtype, *shape)
-    forward, (grad,) = create_executor().evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward.numpy(), x.numpy())
-    tvm.testing.assert_allclose(grad.numpy(), np.ones_like(x.numpy()))
-
-
-def test_id():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], x)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x = rand(dtype, *shape)
-    forward, (grad,) = create_executor().evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward.numpy(), x.numpy())
-    tvm.testing.assert_allclose(grad.numpy(), np.ones_like(x.numpy()))
-
-
-def test_relu():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], op.nn.relu(x))
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    # gradient will implicitly check that no graph appear in result
-
-
-def test_add():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], x + x)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x = rand(dtype, *shape)
-    forward, (grad,) = create_executor().evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward.numpy(), 2 * x.numpy())
-    tvm.testing.assert_allclose(grad.numpy(), 2 * np.ones_like(x.numpy()))
-
-
-def test_check_grad():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    y = relay.var("y", t)
-    func = relay.Function([x, y], x + y)
-    check_grad(func)
-
-
-def test_temp_add():
-    scope = relay.ScopeBuilder()
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    y = scope.let("y", x + x)
-    scope.ret(y + y)
-    func = relay.Function([x], scope.get())
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x = rand(dtype, *shape)
-    forward, (grad,) = create_executor().evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward.numpy(), 4 * x.numpy())
-    tvm.testing.assert_allclose(grad.numpy(), 4 * np.ones_like(x.numpy()))
-
-
-def test_sub():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], x - x)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x = rand(dtype, *shape)
-    forward, (grad,) = create_executor().evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward.numpy(), np.zeros_like(x.numpy()))
-    tvm.testing.assert_allclose(grad.numpy(), np.zeros_like(x.numpy()))
-
-
-def test_broadcast_add():
-    shape1 = (3, 4, 1)
-    shape2 = (1, 5)
-    dtype = "float32"
-    x_nd = rand(dtype, *shape1)
-    y_nd = rand(dtype, *shape2)
-    x_np = x_nd.numpy()
-    y_np = y_nd.numpy()
-    expected_forward = x_np + y_np
-    t1 = relay.TensorType(shape1, dtype)
-    t2 = relay.TensorType(shape2, dtype)
-    x = relay.var("x", t1)
-    y = relay.var("y", t2)
-    func = relay.Function([x, y], x + y)
-    func = run_infer_type(func)
-    full_func = run_infer_type(gradient(func))
-    assert full_func.checked_type == relay.FuncType(
-        [t1, t2],
-        relay.TupleType(
-            [relay.TensorType(expected_forward.shape, dtype), relay.TupleType([t1, t2])]
-        ),
-    )
-    forward, (grad_x, grad_y) = create_executor().evaluate(full_func)(x_nd, y_nd)
-    tvm.testing.assert_allclose(forward.numpy(), expected_forward)
-    tvm.testing.assert_allclose(
-        grad_x.numpy(), np.ones_like(expected_forward).sum(axis=2, keepdims=True)
-    )
-    tvm.testing.assert_allclose(
-        grad_y.numpy(),
-        np.ones_like(expected_forward).sum(axis=(0, 1), keepdims=True).squeeze(axis=0),
-    )
-
-
-def test_broadcast_subtract():
-    shape1 = (3, 4, 1)
-    shape2 = (1, 5)
-    dtype = "float32"
-    x_nd = rand(dtype, *shape1)
-    y_nd = rand(dtype, *shape2)
-    x_np = x_nd.numpy()
-    y_np = y_nd.numpy()
-    expected_forward = x_np - y_np
-    t1 = relay.TensorType(shape1, dtype)
-    t2 = relay.TensorType(shape2, dtype)
-    x = relay.var("x", t1)
-    y = relay.var("y", t2)
-    func = relay.Function([x, y], x - y)
-    func = run_infer_type(func)
-    full_func = run_infer_type(gradient(func))
-    assert full_func.checked_type == relay.FuncType(
-        [t1, t2],
-        relay.TupleType(
-            [relay.TensorType(expected_forward.shape, dtype), relay.TupleType([t1, t2])]
-        ),
-    )
-    forward, (grad_x, grad_y) = create_executor().evaluate(full_func)(x_nd, y_nd)
-    tvm.testing.assert_allclose(forward.numpy(), expected_forward)
-    tvm.testing.assert_allclose(
-        grad_x.numpy(), np.ones_like(expected_forward).sum(axis=2, keepdims=True)
-    )
-    tvm.testing.assert_allclose(
-        grad_y.numpy(),
-        -np.ones_like(expected_forward).sum(axis=(0, 1), keepdims=True).squeeze(axis=0),
-    )
-
-
-def _test_tuple(mode):
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    y = relay.var("y", t)
-    z = relay.var("z", t)
-    if mode == "higher_order":
-        tup = relay.Var("tup")
-        func = relay.Function(
-            [x, y, z],
-            relay.Let(
-                tup,
-                relay.Tuple([x, y, z]),
-                relay.TupleGetItem(tup, 0)
-                + relay.TupleGetItem(tup, 1)
-                - relay.TupleGetItem(tup, 2),
-            ),
-        )
-    else:
-        # first order does not do let.
-        tup = relay.Tuple([x, y, z])
-        func = relay.Function(
-            [x, y, z],
-            relay.TupleGetItem(tup, 0) + relay.TupleGetItem(tup, 1) - relay.TupleGetItem(tup, 2),
-        )
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func, mode=mode))
-    assert back_func.checked_type == relay.FuncType(
-        [t, t, t], relay.TupleType([t, relay.TupleType([t, t, t])])
-    )
-    x_nd = rand(dtype, *shape)
-    y_nd = rand(dtype, *shape)
-    z_nd = rand(dtype, *shape)
-    x_np = x_nd.numpy()
-    y_np = y_nd.numpy()
-    z_np = z_nd.numpy()
-    expected_forward = x_np + y_np - z_np
-    forward, (grad_x, grad_y, grad_z) = create_executor().evaluate(back_func)(x_nd, y_nd, z_nd)
-    tvm.testing.assert_allclose(forward.numpy(), expected_forward)
-    tvm.testing.assert_allclose(grad_x.numpy(), np.ones_like(grad_x.numpy()))
-    tvm.testing.assert_allclose(grad_y.numpy(), np.ones_like(grad_y.numpy()))
-    tvm.testing.assert_allclose(grad_z.numpy(), -1 * np.ones_like(grad_z.numpy()))
-
-
-def _test_tuple_argument(mode):
-    shape = (2, 3)
-    dtype = "float32"
-    tensor_type = relay.TensorType(shape, dtype)
-    fields = 3
-    tuple_type = relay.TupleType([tensor_type] * fields)
-    tup = relay.var("tup", type_annotation=tuple_type)
-    body = relay.TupleGetItem(tup, 0)
-    for i in range(1, fields):
-        body = relay.add(body, relay.TupleGetItem(tup, i))
-    func = relay.Function([tup], body)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func, mode=mode))
-    xs = [rand(dtype, *shape) for _ in range(fields)]
-    xs_np = np.array([x.numpy() for x in xs])
-    expected_forward = np.sum(xs_np, axis=0)
-    forward, grad = create_executor().evaluate(back_func)(tuple(xs))
-    tvm.testing.assert_allclose(forward.numpy(), expected_forward)
-    for field in grad[0]:
-        tvm.testing.assert_allclose(field.numpy(), np.ones_like(field.numpy()))
-
-
-def test_tuple():
-    _test_tuple("higher_order")
-
-
-def test_tuple_first_order():
-    _test_tuple("first_order")
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_tuple_argument():
-    # fails until we add support for top-level tuple arguments in higher-order AD
-    _test_tuple_argument("higher_order")
-
-
-def test_tuple_argument_first_order():
-    _test_tuple_argument("first_order")
-
-
-def test_pow():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat_iterate = mod.get_global_var("nat_iterate")
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    double = relay.Function([x], x + x)
-    i = relay.var("i", t)
-    func = relay.Function([i], nat_iterate(double, make_nat_expr(p, 3))(i))
-    mod["main"] = func
-    mod = transform.InferType()(mod)
-    mod["main"] = gradient(mod["main"], mod=mod)
-    m = transform.InferType()(mod)
-    back_func = m["main"]
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    i_nd = rand(dtype, *shape)
-    forward, (grad_i,) = create_executor(mod=mod).evaluate(back_func)(i_nd)
-    tvm.testing.assert_allclose(forward.numpy(), 8 * i_nd.numpy())
-    tvm.testing.assert_allclose(grad_i.numpy(), 8 * np.ones_like(grad_i.numpy()))
-
-
-def test_ref():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    r = relay.Var("r")
-    u = relay.Var("u")
-    body = relay.RefRead(r)
-    body = relay.Let(u, relay.RefWrite(r, relay.RefRead(r) + relay.RefRead(r)), body)
-    body = relay.Let(r, relay.RefCreate(x), body)
-    func = relay.Function([x], body)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x_nd = rand(dtype, *shape)
-    forward, (grad_x,) = create_executor().evaluate(back_func)(x_nd)
-    tvm.testing.assert_allclose(forward.numpy(), 2 * x_nd.numpy())
-    tvm.testing.assert_allclose(grad_x.numpy(), 2 * np.ones_like(grad_x.numpy()))
-
-
-def test_square_second_order():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    func = relay.Function([x], x * x)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    y = relay.var("y", t)
-    back_func_adjusted = relay.Function(
-        [y], relay.TupleGetItem(relay.TupleGetItem(back_func(y), 1), 0)
-    )
-    back_func_adjusted = run_infer_type(back_func_adjusted)
-    back_back_func = run_infer_type(gradient(back_func_adjusted))
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x_nd = rand(dtype, *shape)
-    forward, (grad_x,) = create_executor().evaluate(back_back_func)(x_nd)
-    tvm.testing.assert_allclose(forward.numpy(), 2 * x_nd.numpy())
-    tvm.testing.assert_allclose(grad_x.numpy(), 2 * np.ones_like(grad_x.numpy()))
-
-
-def test_if():
-    x = relay.var("x", shape=(1, 16, 64, 64))
-    y = relay.var("y", shape=(1, 16, 64, 64))
-    cond = relay.var("cond", shape=(), dtype="uint1")
-    net = relay.If(cond, x, y)
-    net = relay.log(net)
-    func = relay.Function(free_vars(net), net)
-    func = run_infer_type(func)
-    net = gradient(func, mode="higher_order")
-    net = run_infer_type(net)
-
-
-def test_grad_tuple():
-    scope = relay.ScopeBuilder()
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    y = scope.let("y", x + x)
-    scope.ret(relay.Tuple([y + y, y]))
-    func = relay.Function([x], scope.get())
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    assert back_func.checked_type == relay.FuncType(
-        [t], relay.TupleType([relay.TupleType([t, t]), relay.TupleType([t])])
-    )
-    x = rand(dtype, *shape)
-    (forward_four, forward_two), (grad,) = create_executor().evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward_four.numpy(), 4 * x.numpy())
-    tvm.testing.assert_allclose(forward_two.numpy(), 2 * x.numpy())
-    tvm.testing.assert_allclose(grad.numpy(), 4 * np.ones_like(x.numpy()))
-
-
-def test_concat():
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    rt = relay.TensorType((10, 20), dtype)
-    x = relay.var("x", t)
-    y = op.concatenate([x, x], axis=1)
-    func = relay.Function([x], y)
-    func = run_infer_type(func)
-    back_func = run_infer_type(gradient(func))
-    tvm.ir.assert_structural_equal(
-        back_func.checked_type, relay.FuncType([t], relay.TupleType([rt, relay.TupleType([t])]))
-    )
-    # no value validation as concatenate has dummy gradient right now.
-
-
-def test_no_duplication():
-    x = tvm.relay.Var("x", type_annotation=tvm.relay.TensorType([12, 12]))
-    y = tvm.relay.Var("y", type_annotation=tvm.relay.TensorType([12, 12]))
-    xy = tvm.relay.nn.dense(x, y)
-
-    m = tvm.relay.sum(xy, keepdims=True)
-    s = tvm.relay.sum(xy - m)
-    fn = tvm.relay.Function([x, y], s)
-    fn = run_infer_type(fn)
-    gr = tvm.relay.transform.gradient(fn, mode="first_order")
-
-    counts = count_ops(gr)
-    assert counts["nn.dense"] == 3, "We expect 3 dense (1 forward, two backward)"
-
-
-def test_no_duplication_tuples():
-    x = tvm.relay.Var("x", type_annotation=tvm.relay.TensorType([12, 12]))
-    y = tvm.relay.Var("y", type_annotation=tvm.relay.TensorType([12, 12]))
-    xy = tvm.relay.nn.dense(x, y)
-
-    t = relay.Tuple([xy, xy])
-
-    m = tvm.relay.sum(xy, keepdims=True)
-    s = tvm.relay.sum(relay.TupleGetItem(t, 0) - m)
-    fn = tvm.relay.Function([x, y], s)
-    fn = run_infer_type(fn)
-    gr = tvm.relay.transform.gradient(fn, mode="first_order")
-
-    counts = count_ops(gr)
-    assert counts["nn.dense"] == 3, "We expect 3 dense (1 forward, two backward)"
-
-
-def test_global_function():
-    m = tvm.IRModule()
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.Var("x", t)
-    d = GlobalVar("double")
-    m[d] = relay.Function([x], x + x)
-    y = relay.Var("y", t)
-    q = GlobalVar("q")
-    m[q] = relay.Function([y], d(d(y)))
-    g = GlobalVar("grad")
-    m = tvm.relay.transform.InferType()(m)
-    m[g] = tvm.relay.transform.gradient(q, m)
-    m = tvm.relay.transform.InferType()(m)
-    back_func = m[g]
-    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
-    x = rand(dtype, *shape)
-    forward, (grad,) = create_executor(mod=m).evaluate(back_func)(x)
-    tvm.testing.assert_allclose(forward.numpy(), 4 * x.numpy())
-    tvm.testing.assert_allclose(grad.numpy(), 4 * np.ones_like(x.numpy()))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_inline.py b/tests/python/relay/test_pass_inline.py
deleted file mode 100644
index 482c2246654d..000000000000
--- a/tests/python/relay/test_pass_inline.py
+++ /dev/null
@@ -1,830 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, missing-docstring, too-many-statements
-import tvm
-from tvm import relay
-
-
-def get_recursive_count_loop():
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    sb = relay.ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))):
-        sb.ret(i)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, dtype="int32"))
-        rec_call = relay.Call(sum_up, [one_less])
-        sb.ret(relay.add(rec_call, i))
-    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    mod[sum_up] = func
-    iarg = relay.var("i", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg], sum_up(iarg))
-    return mod, sum_up
-
-
-def test_call_chain_inline_leaf():
-    """Test when only leaf call is inlined.
-
-    The call graph is like the following:
-              main
-              /  \
-             g1   g2
-             /
-            g11(inline)
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x11 = relay.var("x11", shape=(3, 5))
-        g11 = relay.GlobalVar("g11")
-        fn11 = relay.Function([x11], x11)
-        fn11 = fn11.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        mod[g11] = fn11
-
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1 + g11(x1))
-        fn1 = relay.Function([x1, y1], sb.get())
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1 + x1)
-        fn1 = relay.Function([x1, y1], sb.get())
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_call_chain_inline_multiple_levels():
-    """Test when only leaf call is inlined.
-
-    The call graph is like the following:
-                  main
-                 /    \
-          g1(inline)   g2
-               /
-        g11(inline)
-
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x11 = relay.var("x11", shape=(3, 5))
-        g11 = relay.GlobalVar("g11")
-        fn11 = relay.Function([x11], x11)
-        fn11 = fn11.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        mod[g11] = fn11
-
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1 + g11(x1))
-        fn1 = relay.Function([x1, y1], sb.get())
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = p0 + p1 + p0
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_call_chain_inline_multiple_levels_extern_compiler():
-    """Test when only leaf call is inlined.
-
-    The call graph is like the following:
-                  main
-                 /    \
-          g1(inline)   g2
-               /
-        g11(inline, external compiler)
-
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x11 = relay.var("x11", shape=(3, 5))
-        g11 = relay.GlobalVar("g11")
-        fn11 = relay.Function([x11], x11)
-        fn11 = fn11.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn11 = fn11.with_attr("Compiler", "a")
-        mod[g11] = fn11
-
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1 + g11(x1))
-        fn1 = relay.Function([x1, y1], sb.get())
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        x11 = relay.var("x11", shape=(3, 5))
-        fn11 = relay.Function([x11], x11)
-        fn11 = fn11.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn11 = fn11.with_attr("Compiler", "a")
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = p0 + p1 + fn11(p0)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_recursive_call_with_global():
-    def get_mod():
-        mod = tvm.IRModule({})
-
-        x = relay.var("x", shape=[], dtype="int32")
-        fn0 = relay.Function([x], x)
-        fn0 = fn0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        gx = relay.GlobalVar("gx")
-        mod[gx] = fn0
-
-        sum_up = relay.GlobalVar("sum_up")
-        i = relay.var("i", shape=[], dtype="int32")
-        sb = relay.ScopeBuilder()
-        with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))):
-            sb.ret(i)
-        with sb.else_scope():
-            one_less = relay.subtract(i, relay.const(1, dtype="int32"))
-            global_call = gx(i)
-            rec_call = relay.Call(sum_up, [one_less]) + global_call
-            sb.ret(relay.add(rec_call, i))
-        func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-        func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        mod[sum_up] = func
-        iarg = relay.var("i", shape=[], dtype="int32")
-        mod["main"] = relay.Function([iarg], sum_up(iarg))
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-
-        sum_up = relay.GlobalVar("sum_up")
-        i = relay.var("i", shape=[], dtype="int32")
-        sb = relay.ScopeBuilder()
-        with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))):
-            sb.ret(i)
-        with sb.else_scope():
-            one_less = relay.subtract(i, relay.const(1, dtype="int32"))
-            rec_call = relay.Call(sum_up, [one_less]) + i
-            sb.ret(relay.add(rec_call, i))
-        func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-        func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        mod[sum_up] = func
-        iarg = relay.var("i", shape=[], dtype="int32")
-        mod["main"] = relay.Function([iarg], sum_up(iarg))
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_recursive_called():
-    mod, sum_up = get_recursive_count_loop()
-    iarg = relay.var("i", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg], sum_up(iarg))
-    ref_mod = mod
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, ref_mod, map_free_vars=True)
-
-
-def test_recursive_not_called():
-    def get_mod():
-        mod, sum_up = get_recursive_count_loop()
-        x = relay.var("x", shape=(2, 2))
-        y = relay.var("y", shape=(2, 2))
-        x1 = relay.var("x1", shape=(2, 2))
-        fn1 = relay.Function([x1], x1)
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-        mod["main"] = relay.Function([x, y], x + y + g1(x))
-        return mod
-
-    def expected():
-        mod, sum_up = get_recursive_count_loop()
-        x = relay.var("x", shape=(2, 2))
-        y = relay.var("y", shape=(2, 2))
-        mod["main"] = relay.Function([x, y], x + y + x)
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    ref_mod = expected()
-    tvm.ir.assert_structural_equal(mod, ref_mod, map_free_vars=True)
-
-
-def test_recursive_not_called_extern_compiler():
-    def get_mod():
-        mod, sum_up = get_recursive_count_loop()
-        x = relay.var("x", shape=(2, 2))
-        y = relay.var("y", shape=(2, 2))
-        x1 = relay.var("x1", shape=(2, 2))
-        fn1 = relay.Function([x1], x1)
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn1 = fn1.with_attr("Compiler", "a")
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-        mod["main"] = relay.Function([x, y], x + y + g1(x))
-        return mod
-
-    def expected():
-        mod, sum_up = get_recursive_count_loop()
-        x = relay.var("x", shape=(2, 2))
-        y = relay.var("y", shape=(2, 2))
-        x1 = relay.var("x1", shape=(2, 2))
-        fn1 = relay.Function([x1], x1)
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn1 = fn1.with_attr("Compiler", "a")
-        mod["main"] = relay.Function([x, y], x + y + fn1(x))
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    ref_mod = expected()
-    tvm.ir.assert_structural_equal(mod, ref_mod, map_free_vars=True)
-
-
-def test_globalvar_as_call_arg():
-    def get_mod():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1)
-        fn1 = relay.Function([x1, y1], sb.get())
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    def expected():
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = p0 + p1
-        call_fn2 = p2 - p3
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_globalvar_as_call_arg_extern_compiler():
-    def get_mod():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1)
-        fn1 = relay.Function([x1, y1], sb.get())
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn1 = fn1.with_attr("Compiler", "a")
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn2 = fn2.with_attr("Compiler", "b")
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1)
-        fn1 = relay.Function([x1, y1], sb.get())
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn1 = fn1.with_attr("Compiler", "a")
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn2 = fn2.with_attr("Compiler", "b")
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = relay.Call(fn1, [p0, p1])
-        call_fn2 = relay.Call(fn2, [p2, p3])
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_inline_globalvar_without_args():
-    def get_mod():
-        mod = tvm.IRModule({})
-        fn1 = relay.Function([], relay.const(1))
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn2 = relay.Function([], relay.const(2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g1 = relay.GlobalVar("g1")
-        g2 = relay.GlobalVar("g2")
-        mod[g1] = fn1
-        mod = relay.transform.InferType()(mod)
-        mod[g2] = fn2
-        p = relay.var("p", "bool")
-        mod["main"] = relay.Function([p], relay.Call(relay.If(p, g1, g2), []))
-        return relay.transform.InferType()(mod)
-
-    def expected():
-        mod = tvm.IRModule({})
-        fn1 = relay.Function([], relay.const(1))
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn2 = relay.Function([], relay.const(2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        p = relay.var("p", "bool")
-        mod["main"] = relay.Function([p], relay.Call(relay.If(p, fn1, fn2), []))
-        return relay.transform.InferType()(mod)
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_inline_globalvar_without_args_extern_compiler():
-    def get_mod():
-        mod = tvm.IRModule({})
-        fn1 = relay.Function([], relay.const(1))
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn1 = fn1.with_attr("Compiler", "a")
-        fn2 = relay.Function([], relay.const(2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn2 = fn2.with_attr("Compiler", "b")
-        g1 = relay.GlobalVar("g1")
-        g2 = relay.GlobalVar("g2")
-        mod[g1] = fn1
-        mod[g2] = fn2
-        p = relay.var("p", "bool")
-        mod["main"] = relay.Function([p], relay.Call(relay.If(p, g1, g2), []))
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        fn1 = relay.Function([], relay.const(1))
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn1 = fn1.with_attr("Compiler", "a")
-        fn2 = relay.Function([], relay.const(2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn2 = fn2.with_attr("Compiler", "b")
-        p = relay.var("p", "bool")
-        mod["main"] = relay.Function([p], relay.Call(relay.If(p, fn1, fn2), []))
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_globalvar_called_by_multiple_functions():
-    """Test when only leaf call is inlined.
-
-    The call graph is like the following:
-                  main    g0
-                 /    \   /
-                g1    g2(inline)
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1)
-        fn1 = relay.Function([x1, y1], sb.get())
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        sb1 = relay.ScopeBuilder()
-        sb1.ret(x2 - y2)
-        fn2 = relay.Function([x2, y2], sb1.get())
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        x0 = relay.var("x0", shape=(3, 5))
-        y0 = relay.var("y0", shape=(3, 5))
-        z0 = relay.var("z0", shape=(3, 5))
-        fn0 = relay.Function([x0, y0, z0], g2(x0, y0) + z0)
-        g0 = relay.GlobalVar("g0")
-        mod[g0] = fn0
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn1 = g1(p0, p1)
-        call_fn2 = g2(p2, p3)
-        mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2)
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        sb = relay.ScopeBuilder()
-        sb.ret(x1 + y1)
-        fn1 = relay.Function([x1, y1], sb.get())
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        p0 = relay.var("p0", shape=(3, 5))
-        p1 = relay.var("p1", shape=(3, 5))
-        p2 = relay.var("p2", shape=(3, 5))
-        p3 = relay.var("p3", shape=(3, 5))
-
-        call_fn2 = p2 - p3
-        mod["main"] = relay.Function([p0, p1, p2, p3], g1(p0, p1) * call_fn2)
-
-        x0 = relay.var("x0", shape=(3, 5))
-        y0 = relay.var("y0", shape=(3, 5))
-        z0 = relay.var("z0", shape=(3, 5))
-
-        fn0 = relay.Function([x0, y0, z0], x0 - y0 + z0)
-        g0 = relay.GlobalVar("g0")
-        mod[g0] = fn0
-
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_entry_with_inline():
-    """Test entry function with inline
-
-    The call graph is like the following:
-                g1(inline)    g2(inline)
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        fn1 = relay.Function([x1, y1], x1 + y1)
-        fn1 = fn1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        fn2 = relay.Function([x2, y2], x2 - y2)
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, get_mod(), map_free_vars=True)
-
-
-def test_callee_not_inline():
-    """Test entry function with inline
-
-    The call graph is like the following:
-                    main
-                      |
-                 g2(inline)
-                      |
-                     g1
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        fn1 = relay.Function([x1, y1], x1 + y1)
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        fn2 = relay.Function([x2, y2], x2 - g1(x2, y2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, get_mod(), map_free_vars=True)
-
-
-def test_callee_not_inline_leaf_inline():
-    """Test entry function with inline
-
-    The call graph is like the following:
-                    main
-                      |
-                 g2(inline)
-                      |
-                     g1
-                      |
-                 g0(inline)
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x0 = relay.var("x0", shape=(3, 5))
-        y0 = relay.var("y0", shape=(3, 5))
-        fn0 = relay.Function([x0, y0], x0 * y0)
-        fn0 = fn0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g0 = relay.GlobalVar("g0")
-        mod[g0] = fn0
-
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        fn1 = relay.Function([x1, y1], x1 + g0(x1, y1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        fn2 = relay.Function([x2, y2], x2 - g1(x2, y2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        fn1 = relay.Function([x1, y1], x1 + x1 * y1)
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        fn2 = relay.Function([x2, y2], x2 - g1(x2, y2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-def test_callee_not_inline_leaf_inline_extern_compiler():
-    """Test entry function with inline
-
-    The call graph is like the following:
-                    main
-                      |
-                 g2(inline)
-                      |
-                     g1
-                      |
-                 g0(inline, external compiler)
-    """
-
-    def get_mod():
-        mod = tvm.IRModule({})
-        x0 = relay.var("x0", shape=(3, 5))
-        y0 = relay.var("y0", shape=(3, 5))
-        fn0 = relay.Function([x0, y0], x0 * y0)
-        fn0 = fn0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn0 = fn0.with_attr("Compiler", "aa")
-        g0 = relay.GlobalVar("g0")
-        mod[g0] = fn0
-
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        fn1 = relay.Function([x1, y1], x1 + g0(x1, y1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        fn2 = relay.Function([x2, y2], x2 - g1(x2, y2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-        return mod
-
-    def expected():
-        mod = tvm.IRModule({})
-        x0 = relay.var("x0", shape=(3, 5))
-        y0 = relay.var("y0", shape=(3, 5))
-        fn0 = relay.Function([x0, y0], x0 * y0)
-        fn0 = fn0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        fn0 = fn0.with_attr("Compiler", "aa")
-
-        x1 = relay.var("x1", shape=(3, 5))
-        y1 = relay.var("y1", shape=(3, 5))
-        fn1 = relay.Function([x1, y1], x1 + fn0(x1, y1))
-        g1 = relay.GlobalVar("g1")
-        mod[g1] = fn1
-
-        x2 = relay.var("x2", shape=(3, 5))
-        y2 = relay.var("y2", shape=(3, 5))
-        fn2 = relay.Function([x2, y2], x2 - g1(x2, y2))
-        fn2 = fn2.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        g2 = relay.GlobalVar("g2")
-        mod[g2] = fn2
-
-        return mod
-
-    mod = get_mod()
-    mod = relay.transform.Inline()(mod)
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_instrument.py b/tests/python/relay/test_pass_instrument.py
deleted file mode 100644
index 455cf20b5de0..000000000000
--- a/tests/python/relay/test_pass_instrument.py
+++ /dev/null
@@ -1,547 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Instrument test cases.
-"""
-import pytest
-import tvm
-import tvm.relay
-from tvm.relay import op
-from tvm.ir.instrument import PassTimingInstrument, pass_instrument
-
-
-def get_test_model():
-    x, y, z = [tvm.relay.var(c, shape=(3, 4), dtype="float32") for c in "xyz"]
-    e1 = op.add(x, y)
-    e2 = op.subtract(x, z)
-    e3 = op.multiply(e1, e1 / e2)
-    return tvm.IRModule.from_expr(e3 + e2)
-
-
-def test_pass_timing_instrument():
-    pass_timing = PassTimingInstrument()
-
-    # Override current PassContext's instruments
-    tvm.transform.PassContext.current().override_instruments([pass_timing])
-
-    mod = get_test_model()
-    mod = tvm.relay.transform.AnnotateSpans()(mod)
-    mod = tvm.relay.transform.ToANormalForm()(mod)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    profiles = pass_timing.render()
-    assert "AnnotateSpans" in profiles
-    assert "ToANormalForm" in profiles
-    assert "InferType" in profiles
-
-    # Reset current PassContext's instruments to None
-    tvm.transform.PassContext.current().override_instruments(None)
-
-    mod = get_test_model()
-    mod = tvm.relay.transform.AnnotateSpans()(mod)
-    mod = tvm.relay.transform.ToANormalForm()(mod)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    profiles = pass_timing.render()
-    assert profiles == ""
-
-
-instrument_definition_type = tvm.testing.parameter("decorator", "subclass")
-
-
-def test_custom_instrument(instrument_definition_type):
-    class BaseTest:
-        def __init__(self):
-            self.events = []
-
-        def enter_pass_ctx(self):
-            self.events.append("enter ctx")
-
-        def exit_pass_ctx(self):
-            self.events.append("exit ctx")
-
-        def run_before_pass(self, mod, info):
-            self.events.append("run before " + info.name)
-
-        def run_after_pass(self, mod, info):
-            self.events.append("run after " + info.name)
-
-    if instrument_definition_type == "decorator":
-        MyTest = pass_instrument(BaseTest)
-
-    elif instrument_definition_type == "subclass":
-
-        class MyTest(BaseTest, tvm.ir.instrument.PassInstrument):
-            def __init__(self):
-                BaseTest.__init__(self)
-                tvm.ir.instrument.PassInstrument.__init__(self)
-
-    mod = get_test_model()
-    my_test = MyTest()
-    with tvm.transform.PassContext(instruments=[my_test]):
-        mod = tvm.relay.transform.InferType()(mod)
-
-    assert (
-        "enter ctx"
-        "run before InferType"
-        "run after InferType"
-        "exit ctx" == "".join(my_test.events)
-    )
-
-
-def test_disable_pass():
-    @pass_instrument
-    class CustomPI:
-        def __init__(self):
-            self.events = []
-
-        def should_run(self, mod, info):
-            # Only run pass name contains "InferType"
-            if "InferType" not in info.name:
-                return False
-            return True
-
-        def run_before_pass(self, mod, info):
-            self.events.append(info.name)
-
-    mod = get_test_model()
-    custom_pi = CustomPI()
-    with tvm.transform.PassContext(instruments=[custom_pi]):
-        mod = tvm.relay.transform.AnnotateSpans()(mod)
-        mod = tvm.relay.transform.ToANormalForm()(mod)
-        mod = tvm.relay.transform.InferType()(mod)
-
-    assert "InferType" == "".join(custom_pi.events)
-
-
-def test_multiple_instrument():
-    @pass_instrument
-    class SkipPass:
-        def __init__(self, skip_pass_name):
-            self.skip_pass_name = skip_pass_name
-
-        def should_run(self, mod, info):
-            if self.skip_pass_name in info.name:
-                return False
-            return True
-
-    skip_annotate = SkipPass("AnnotateSpans")
-    skip_anf = SkipPass("ToANormalForm")
-
-    @pass_instrument
-    class PrintPassName:
-        def __init__(self):
-            self.events = []
-
-        def run_before_pass(self, mod, info):
-            self.events.append(info.name)
-
-    mod = get_test_model()
-    print_pass_name = PrintPassName()
-    with tvm.transform.PassContext(instruments=[skip_annotate, skip_anf, print_pass_name]):
-        mod = tvm.relay.transform.AnnotateSpans()(mod)
-        mod = tvm.relay.transform.ToANormalForm()(mod)
-        mod = tvm.relay.transform.InferType()(mod)
-
-    assert "InferType" == "".join(print_pass_name.events)
-
-
-def test_instrument_pass_counts():
-    @pass_instrument
-    class PassesCounter:
-        def __init__(self):
-            self.run_before_count = 0
-            self.run_after_count = 0
-
-        def __clear(self):
-            self.run_before_count = 0
-            self.run_after_count = 0
-
-        def enter_pass_ctx(self):
-            self.__clear()
-
-        def exit_pass_ctx(self):
-            self.__clear()
-
-        def run_before_pass(self, mod, info):
-            self.run_before_count = self.run_before_count + 1
-
-        def run_after_pass(self, mod, info):
-            self.run_after_count = self.run_after_count + 1
-
-    mod = get_test_model()
-    passes_counter = PassesCounter()
-    with tvm.transform.PassContext(instruments=[passes_counter]):
-        tvm.relay.build(mod, "llvm")
-        assert passes_counter.run_after_count != 0
-        assert passes_counter.run_after_count == passes_counter.run_before_count
-
-    # Out of pass context scope, should be reset
-    assert passes_counter.run_before_count == 0
-    assert passes_counter.run_after_count == 0
-
-
-def test_list_pass_configs():
-    configs = tvm.transform.PassContext.list_configs()
-
-    assert len(configs) > 0
-    assert "relay.backend.use_auto_scheduler" in configs.keys()
-    assert configs["relay.backend.use_auto_scheduler"]["type"] == "IntImm"
-
-
-def test_enter_pass_ctx_exception():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def __init__(self, id):
-            self.id = id
-
-        def enter_pass_ctx(self):
-            events.append(self.id + " enter ctx")
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit ctx")
-
-    @pass_instrument
-    class PIBroken(PI):
-        def __init__(self, id):
-            super().__init__(id)
-
-        def enter_pass_ctx(self):
-            events.append(self.id + " enter ctx")
-            raise RuntimeError("Just a dummy error")
-
-    pass_ctx = tvm.transform.PassContext(instruments=[PI("%1"), PIBroken("%2"), PI("%3")])
-    with pytest.raises(RuntimeError) as cm:
-        with pass_ctx:
-            pass
-        assert "Just a dummy error" in str(cm.execption)
-
-    assert "%1 enter ctx" "%2 enter ctx" "%1 exit ctx" == "".join(events)
-
-    # Make sure we get correct PassContext
-    cur_pass_ctx = tvm.transform.PassContext.current()
-    assert pass_ctx != cur_pass_ctx
-    assert not cur_pass_ctx.instruments
-
-
-def test_enter_pass_ctx_exception_global():
-    @pass_instrument
-    class PIBroken:
-        def enter_pass_ctx(self):
-            raise RuntimeError("Just a dummy error")
-
-    cur_pass_ctx = tvm.transform.PassContext.current()
-    with pytest.raises(RuntimeError) as cm:
-        cur_pass_ctx.override_instruments([PIBroken()])
-        assert "Just a dummy error" in str(cm.exception)
-    assert not cur_pass_ctx.instruments
-
-
-def test_exit_pass_ctx_exception():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def __init__(self, id):
-            self.id = id
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit ctx")
-
-    @pass_instrument
-    class PIBroken(PI):
-        def __init__(self, id):
-            super().__init__(id)
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit ctx")
-            raise RuntimeError("Just a dummy error")
-
-    pass_ctx = tvm.transform.PassContext(instruments=[PI("%1"), PIBroken("%2"), PI("%3")])
-    with pytest.raises(RuntimeError) as cm:
-        with pass_ctx:
-            pass
-        assert "Just a dummy error" in str(cm.exception)
-
-    assert "%1 exit ctx" "%2 exit ctx" == "".join(events)
-
-    # Make sure we get correct PassContext
-    cur_pass_ctx = tvm.transform.PassContext.current()
-    assert pass_ctx != cur_pass_ctx
-    assert not cur_pass_ctx.instruments
-
-
-def test_exit_pass_ctx_exception_global():
-    @pass_instrument
-    class PIBroken:
-        def exit_pass_ctx(self):
-            raise RuntimeError("Just a dummy error")
-
-    cur_pass_ctx = tvm.transform.PassContext.current()
-    with pytest.raises(RuntimeError) as cm:
-        cur_pass_ctx.override_instruments([PIBroken()])
-        cur_pass_ctx.override_instruments([PIBroken()])
-        assert "Just a dummy error" in str(cm.exception)
-    assert not cur_pass_ctx.instruments
-
-
-def test_pass_exception():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def enter_pass_ctx(self):
-            events.append("enter_pass_ctx")
-
-        def exit_pass_ctx(self):
-            events.append("exit_pass_ctx")
-
-        def should_run(self, mod, info):
-            events.append("should_run")
-            return True
-
-        def run_before_pass(self, mod, info):
-            events.append("run_before_pass")
-
-        def run_after_pass(self, mod, info):
-            events.append("run_after_pass")
-
-    @tvm.transform.module_pass(opt_level=2)
-    def transform(mod, ctx):
-        events.append("transform pass")
-        raise RuntimeError("Just a dummy error")
-        return mod
-
-    mod = get_test_model()
-    with pytest.raises(RuntimeError) as cm:
-        with tvm.transform.PassContext(instruments=[PI()]):
-            mod = transform(mod)
-        assert "Just a dummy error" in str(cm.exception)
-
-    assert (
-        "enter_pass_ctx"
-        "should_run"
-        "run_before_pass"
-        "transform pass"
-        "exit_pass_ctx" == "".join(events)
-    )
-
-
-def test_should_run_exception():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def __init__(self, id):
-            self.id = id
-
-        def enter_pass_ctx(self):
-            events.append(self.id + " enter_pass_ctx")
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit_pass_ctx")
-
-        def should_run(self, mod, info):
-            events.append(self.id + " should_run")
-            raise RuntimeError("Just a dummy error")
-            return True
-
-        def run_before_pass(self, mod, info):
-            events.append(self.id + " run_before_pass")
-
-        def run_after_pass(self, mod, info):
-            events.append(self.id + " run_after_pass")
-
-    @tvm.transform.module_pass(opt_level=2)
-    def transform(mod, ctx):
-        events.append("transform pass")
-        return mod
-
-    mod = get_test_model()
-    with pytest.raises(RuntimeError) as cm:
-        with tvm.transform.PassContext(instruments=[PI("%1"), PI("%2")]):
-            mod = transform(mod)
-        assert "Just a dummy error" in str(cm.exception)
-
-    assert (
-        "%1 enter_pass_ctx"
-        "%2 enter_pass_ctx"
-        "%1 should_run"
-        "%1 exit_pass_ctx"
-        "%2 exit_pass_ctx" == "".join(events)
-    )
-
-
-def test_run_before_exception():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def __init__(self, id):
-            self.id = id
-
-        def enter_pass_ctx(self):
-            events.append(self.id + " enter_pass_ctx")
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit_pass_ctx")
-
-        def should_run(self, mod, info):
-            events.append(self.id + " should_run")
-            return True
-
-        def run_before_pass(self, mod, info):
-            events.append(self.id + " run_before_pass")
-            raise RuntimeError("Just a dummy error")
-
-        def run_after_pass(self, mod, info):
-            events.append(self.id + " run_after_pass")
-
-    @tvm.transform.module_pass(opt_level=2)
-    def transform(mod, ctx):
-        events.append("transform pass")
-        return mod
-
-    mod = get_test_model()
-    with pytest.raises(RuntimeError) as cm:
-        with tvm.transform.PassContext(instruments=[PI("%1"), PI("%2")]):
-            mod = transform(mod)
-        assert "Just a dummy error" in str(cm.exception)
-
-    assert (
-        "%1 enter_pass_ctx"
-        "%2 enter_pass_ctx"
-        "%1 should_run"
-        "%2 should_run"
-        "%1 run_before_pass"
-        "%1 exit_pass_ctx"
-        "%2 exit_pass_ctx" == "".join(events)
-    )
-
-
-def test_run_after_exception():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def __init__(self, id):
-            self.id = id
-
-        def enter_pass_ctx(self):
-            events.append(self.id + " enter_pass_ctx")
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit_pass_ctx")
-
-        def should_run(self, mod, info):
-            events.append(self.id + " should_run")
-            return True
-
-        def run_before_pass(self, mod, info):
-            events.append(self.id + " run_before_pass")
-
-        def run_after_pass(self, mod, info):
-            events.append(self.id + " run_after_pass")
-            raise RuntimeError("Just a dummy error")
-
-    @tvm.transform.module_pass(opt_level=2)
-    def transform(mod, ctx):
-        events.append("transform pass")
-        return mod
-
-    x, y = [tvm.relay.var(c, shape=(3, 4), dtype="float32") for c in "xy"]
-    mod = tvm.IRModule.from_expr(tvm.relay.add(x, y))
-
-    with pytest.raises(RuntimeError) as cm:
-        with tvm.transform.PassContext(instruments=[PI("%1"), PI("%2")]):
-            mod = transform(mod)
-        assert "Just a dummy error" in str(cm.exception)
-
-    assert (
-        "%1 enter_pass_ctx"
-        "%2 enter_pass_ctx"
-        "%1 should_run"
-        "%2 should_run"
-        "%1 run_before_pass"
-        "%2 run_before_pass"
-        "transform pass"
-        "%1 run_after_pass"
-        "%1 exit_pass_ctx"
-        "%2 exit_pass_ctx" == "".join(events)
-    )
-
-
-def test_instrument_call_sequence():
-    events = []
-
-    @pass_instrument
-    class PI:
-        def __init__(self, id):
-            self.id = id
-
-        def enter_pass_ctx(self):
-            events.append(self.id + " enter_pass_ctx")
-
-        def exit_pass_ctx(self):
-            events.append(self.id + " exit_pass_ctx")
-
-        def should_run(self, mod, info):
-            events.append("  " + self.id + " should_run")
-            return True
-
-        def run_before_pass(self, mod, info):
-            events.append("  " + self.id + " run_before_pass")
-
-        def run_after_pass(self, mod, info):
-            events.append("  " + self.id + " run_after_pass")
-
-    @tvm.transform.module_pass(opt_level=2)
-    def transform1(mod, ctx):
-        events.append("    transform1 pass")
-        return mod
-
-    @tvm.transform.module_pass(opt_level=2)
-    def transform2(mod, ctx):
-        events.append("    transform2 pass")
-        return mod
-
-    mod = get_test_model()
-    with tvm.transform.PassContext(instruments=[PI("%1"), PI("%2")]):
-        mod = transform1(mod)
-        mod = transform2(mod)
-
-    assert (
-        "%1 enter_pass_ctx"
-        "%2 enter_pass_ctx"
-        "  %1 should_run"
-        "  %2 should_run"
-        "  %1 run_before_pass"
-        "  %2 run_before_pass"
-        "    transform1 pass"
-        "  %1 run_after_pass"
-        "  %2 run_after_pass"
-        "  %1 should_run"
-        "  %2 should_run"
-        "  %1 run_before_pass"
-        "  %2 run_before_pass"
-        "    transform2 pass"
-        "  %1 run_after_pass"
-        "  %2 run_after_pass"
-        "%1 exit_pass_ctx"
-        "%2 exit_pass_ctx" == "".join(events)
-    )
diff --git a/tests/python/relay/test_pass_lambda_lift.py b/tests/python/relay/test_pass_lambda_lift.py
deleted file mode 100644
index 518a8c3078b6..000000000000
--- a/tests/python/relay/test_pass_lambda_lift.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import transform
-
-
-def test_basic():
-    mod = tvm.IRModule()
-    x2 = relay.var("x2", shape=(10, 5))
-    y2 = relay.var("y2", shape=(1, 5))
-    level2_func = relay.Function([x2, y2], relay.op.add(x2, y2))
-
-    x1 = relay.var("x1", shape=(10, 5))
-    y1 = relay.var("y1", shape=(1, 5))
-    level1_func = relay.Function([x1, y1], level2_func(x1, y1))
-
-    mod["main"] = level1_func
-    mod = relay.transform.InferType()(mod)
-    new_mod = transform.LambdaLift()(mod)
-    assert len(new_mod.functions) == 2
-
-
-def test_closure():
-    mod = tvm.IRModule()
-
-    x = relay.var("x", shape=(2,))
-    y = relay.var("y", shape=(2,))
-    inner_func = relay.Function([x], x + y)
-    outer_func = relay.Function([y], inner_func)
-    clo = outer_func(relay.ones(shape=(2,), dtype="float32"))
-    mod["main"] = relay.Function([], relay.Call(clo, [relay.zeros(shape=(2,), dtype="float32")]))
-
-    mod = relay.transform.InferType()(mod)
-    new_mod = transform.LambdaLift()(mod)
-    assert len(new_mod.functions) == 3
-
-
-def test_recursive():
-    mod = tvm.IRModule()
-
-    x = relay.var("x", shape=(2,))
-    i = relay.var("i", shape=(), dtype="int32")
-    s = relay.var("s", shape=(2,))
-    cond = i < relay.const(10, dtype="int32")
-
-    loop = relay.var("while_loop")
-    sb = relay.scope_builder.ScopeBuilder()
-    with sb.if_scope(cond):
-        ii = i + relay.const(1, dtype="int32")
-        ss = s + x
-        sb.ret(loop(ii, ss))
-    with sb.else_scope():
-        sb.ret(s)
-    func = relay.Function([i, s], sb.get())
-
-    ret = relay.Let(
-        loop, func, loop(relay.const(0, dtype="int32"), relay.zeros(shape=(2,), dtype="float32"))
-    )
-    mod["main"] = relay.Function([x], ret)
-
-    mod = relay.transform.InferType()(mod)
-    new_mod = transform.LambdaLift()(mod)
-    assert len(new_mod.functions) == 2
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_lazy_gradient_init.py b/tests/python/relay/test_pass_lazy_gradient_init.py
deleted file mode 100644
index 323eb6aa5095..000000000000
--- a/tests/python/relay/test_pass_lazy_gradient_init.py
+++ /dev/null
@@ -1,448 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.relay import create_executor, transform
-from tvm.relay.testing import rand, run_infer_type
-import tvm.testing
-from tvm.testing import assert_allclose
-
-
-def test_tc():
-    """Simple testcase, check that transformation typechecks."""
-    mod = tvm.IRModule()
-
-    shape = (20, 20)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x1 = relay.var("x1", t)
-    x2 = relay.var("x2", t)
-    # f(x1,x2) = (x1-x2)*x2
-    y = relay.Function([x1, x2], (x1 - x2) * x2)
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-
-    # function input/output types should remain the same
-    assert mod["main"].checked_type == relay.FuncType([t, t], t)
-
-
-def test_add():
-    """Simple add testcase. Check types and semantic equivalence."""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    # f(x) = x+x
-    y = relay.Function([x], x + x)
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], t)
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x.numpy() + x.numpy())
-
-
-def test_add_tuple():
-    """Add elements of tuple. Check types and semantic equivalence."""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    tensor_type = relay.TensorType(shape, dtype)
-    t = relay.TupleType([tensor_type, tensor_type])
-
-    x = relay.var("x", t)
-    # f((x1,x2)) = x1 + x2
-    y = relay.Function([x], relay.TupleGetItem(x, 0) + relay.TupleGetItem(x, 1))
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    mod = tvm.transform.PrintIR(show_meta_data=True)(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], tensor_type)
-
-    x = (rand(dtype, *shape), rand(dtype, *shape))
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x[0].numpy() + x[1].numpy())
-
-
-def test_mult():
-    """Simple multiplication testcase. Check types and semantic equivalence."""
-    mod = tvm.IRModule()
-
-    shape = (15, 15)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    # f(x) = x*x
-    y = relay.Function([x], x * x)
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], t)
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x.numpy() * x.numpy())
-
-
-def test_ret_tuple():
-    """Test tuple return type. Check types and semantic equivalence."""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    # f(x) = (x,x)
-    func = relay.Function([x], relay.Tuple([x, x * relay.const(2.0)]))
-    func = run_infer_type(func)
-
-    mod["main"] = func
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    func = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], relay.TupleType([t, t]))
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(func)(x)
-    assert_allclose(y[0].numpy(), x.numpy())
-    assert_allclose(y[1].numpy(), x.numpy() * 2.0)
-
-
-def test_add_broadcast():
-    """Test adding matrices of different size. Check types and semantic equivalence."""
-    mod = tvm.IRModule()
-
-    shape1 = (3, 4, 1)
-    shape2 = (1, 5)
-    dtype = "float32"
-    t1 = relay.TensorType(shape1, dtype)
-    t2 = relay.TensorType(shape2, dtype)
-
-    x1 = relay.var("x1", t1)
-    x2 = relay.var("x2", t2)
-    func = relay.Function([x1, x2], x1 + x2)
-    func = run_infer_type(func)
-
-    mod["main"] = func
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    func = mod["main"]
-
-    x1_np = rand(dtype, *shape1).numpy()
-    x2_np = rand(dtype, *shape2).numpy()
-    expected_forward = x1_np + x2_np
-
-    expected_forward_type = relay.TensorType(expected_forward.shape, dtype)
-    assert mod["main"].checked_type == relay.FuncType([t1, t2], expected_forward_type)
-
-    forward = create_executor(mod=mod).evaluate(func)(x1_np, x2_np)
-
-    assert_allclose(forward.numpy(), expected_forward)
-
-
-def test_reverse_ad_identity():
-    """Simple test with reverse mode ad."""
-    # of f(x) = x
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-
-    func = relay.Function([x], x)
-    func = run_infer_type(func)
-    back_func = transform.gradient(func)
-    back_func = run_infer_type(back_func)
-
-    mod["main"] = back_func
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    back_func = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType(
-        [t], relay.TupleType([t, relay.TupleType([t])])
-    )
-
-    x = rand(dtype, *shape)
-    (forward), (grad,) = create_executor(mod=mod).evaluate(back_func)(x)
-    assert_allclose(forward.numpy(), x.numpy())
-    assert_allclose(grad.numpy(), np.ones_like(x.numpy()))
-
-
-def test_multivar_reverse_ad():
-    """Simple test with multivariate reverse mode ad."""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.var("y", t)
-
-    func = relay.Function([x, y], (x * y) * relay.const(np.ones(shape, dtype)))
-    func = run_infer_type(func)
-    back_func = transform.gradient(func)
-    back_func = run_infer_type(back_func)
-
-    mod["main"] = back_func
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    back_func = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType(
-        [t, t], relay.TupleType([t, relay.TupleType([t, t])])
-    )
-
-    x = rand(dtype, *shape)
-    y = rand(dtype, *shape)
-    (forward), (grad_x, grad_y,) = create_executor(mod=mod).evaluate(
-        back_func
-    )(x, y)
-    assert_allclose(forward.numpy(), x.numpy() * y.numpy())
-    assert_allclose(grad_x.numpy(), y.numpy())
-    assert_allclose(grad_y.numpy(), x.numpy())
-
-
-def test_partial_eval():
-    """Test transformation following reverse mode ad and PartialEval"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    func = relay.Function([], relay.const(np.ones(shape, dtype)))
-    func = run_infer_type(func)
-    back_func = transform.gradient(func)
-    back_func = run_infer_type(back_func)
-
-    mod["main"] = back_func
-    mod = transform.InferType()(mod)
-    back_func = mod["main"]
-
-    transform.PartialEvaluate()(mod)
-
-
-def test_after_partial_eval():
-    """Test transformation following reverse mode ad and PartialEval"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.var("y", t)
-
-    func = relay.Function([x, y], (x * y) * relay.const(np.ones(shape, dtype)))
-    func = run_infer_type(func)
-    back_func = transform.gradient(func)
-    back_func = run_infer_type(back_func)
-
-    mod["main"] = back_func
-    back_func = mod["main"]
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.PartialEvaluate(),
-            transform.InferType(),
-            transform.LazyGradientInit(),
-            transform.InferType(),
-            transform.DeadCodeElimination(),
-            transform.InferType(),
-        ]
-    )
-
-    mod = seq(mod)
-
-    assert mod["main"].checked_type == relay.FuncType(
-        [t, t], relay.TupleType([t, relay.TupleType([t, t])])
-    )
-
-    x = rand(dtype, *shape)
-    y = rand(dtype, *shape)
-    (forward), (grad_x, grad_y,) = create_executor(mod=mod).evaluate(
-        back_func
-    )(x, y)
-    assert_allclose(forward.numpy(), x.numpy() * y.numpy())
-    assert_allclose(grad_x.numpy(), y.numpy())
-    assert_allclose(grad_y.numpy(), x.numpy())
-
-
-def test_before_partial_eval():
-    """Test transformation before PartialEval"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.var("y", t)
-
-    func = relay.Function([x, y], x * y)
-    func = run_infer_type(func)
-    back_func = transform.gradient(func)
-    back_func = run_infer_type(back_func)
-
-    mod["main"] = back_func
-    seq = tvm.transform.Sequential(
-        [
-            transform.LazyGradientInit(),
-            transform.PartialEvaluate(),
-            transform.InferType(),
-            transform.DeadCodeElimination(),
-            transform.InferType(),
-        ]
-    )
-    mod = seq(mod)
-    back_func = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType(
-        [t, t], relay.TupleType([t, relay.TupleType([t, t])])
-    )
-
-    x = rand(dtype, *shape)
-    y = rand(dtype, *shape)
-    (forward), (grad_x, grad_y,) = create_executor(mod=mod).evaluate(
-        back_func
-    )(x, y)
-    assert_allclose(forward.numpy(), x.numpy() * y.numpy())
-    assert_allclose(grad_x.numpy(), y.numpy())
-    assert_allclose(grad_y.numpy(), x.numpy())
-
-
-def test_zeros():
-    """Simple test using "zeros" op"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.Function([x], x + relay.zeros(shape, dtype))
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], t)
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x.numpy())
-
-
-def test_ones():
-    """Simple test using "ones" op"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.Function([x], x + relay.ones(shape, dtype))
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], t)
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x.numpy() + np.ones_like(x.numpy()))
-
-
-def test_zeros_like():
-    """Simple test using "zeros_like" op"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.Function([x], x + relay.zeros_like(x))
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], t)
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x.numpy())
-
-
-def test_ones_like():
-    """Simple test using "ones_like" op"""
-    mod = tvm.IRModule()
-
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-
-    x = relay.var("x", t)
-    y = relay.Function([x], x + relay.ones_like(x))
-
-    mod["main"] = y
-    mod = transform.InferType()(mod)
-    mod = transform.LazyGradientInit()(mod)
-    y = mod["main"]
-
-    assert mod["main"].checked_type == relay.FuncType([t], t)
-
-    x = rand(dtype, *shape)
-    y = create_executor(mod=mod).evaluate(y)(x)
-    assert_allclose(y.numpy(), x.numpy() + np.ones_like(x.numpy()))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_legalize.py b/tests/python/relay/test_pass_legalize.py
deleted file mode 100644
index 614663a62df2..000000000000
--- a/tests/python/relay/test_pass_legalize.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test legalize pass"""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay import transform, analysis
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_legalize():
-    """Test directly replacing an operator with a new one"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def legalize_conv2d(attrs, inputs, types):
-        data, weight = inputs
-        weight = relay.multiply(weight, relay.const(2.0, "float32"))
-        return relay.nn.conv2d(data, weight, **attrs)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            relay.multiply(weight, relay.const(2.0, "float32")),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.Legalize())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_legalize_none():
-    """Test doing nothing by returning 'None'"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        y = relay.nn.global_max_pool2d(x)
-        y = relay.Function([x], y)
-        return y
-
-    called = [False]
-
-    def legalize_conv2d(attrs, inputs, types):
-        called[0] = True
-        return None
-
-    with TempOpAttr("nn.global_max_pool2d", "FTVMLegalize", legalize_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.Legalize())
-        b = run_opt_pass(before(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-    assert called[0]
-
-
-def test_legalize_multiple_ops():
-    """Test directly replacing an operator with a new one"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def legalize_conv2d(attrs, inputs, types):
-        data, weight = inputs
-        weight = relay.multiply(weight, relay.const(2.0, "float32"))
-        return relay.nn.conv2d(data, weight, **attrs)
-
-    def legalize_relu(attrs, inputs, types):
-        data = inputs[0]
-        add = relay.add(tvm.relay.const(0, "float32"), data)
-        return relay.nn.relu(add)
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        weight = relay.var("weight", shape=(64, 64, 3, 3))
-        y = relay.nn.conv2d(
-            x,
-            relay.multiply(weight, relay.const(2.0, "float32")),
-            channels=64,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-        )
-        y = relay.add(tvm.relay.const(0, "float32"), y)
-        y = relay.nn.relu(y)
-        y = relay.Function([x, weight], y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
-        with TempOpAttr("nn.relu", "FTVMLegalize", legalize_relu):
-            a = before()
-            a = run_opt_pass(a, transform.Legalize())
-            b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_legalize_multi_input():
-    """Test directly replacing an operator with a new one"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        y = relay.var("y", shape=(1, 64, 56, 20))
-        z = relay.var("z", shape=(1, 64, 56, 10))
-        func = relay.concatenate([x, y, z], axis=3)
-        func = relay.Function([x, y, z], func)
-        return func
-
-    def legalize_concatenate(attrs, inputs, types):
-        # Check that the correct multi-input case is handled.
-        assert len(inputs) == 1
-        assert isinstance(inputs[0], tvm.relay.expr.Tuple)
-        assert len(types) == 2
-        assert isinstance(types[0], tvm.relay.ty.TupleType)
-        assert isinstance(types[1], tvm.relay.ty.TensorType)
-        return None
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56))
-        y = relay.var("y", shape=(1, 64, 56, 20))
-        z = relay.var("z", shape=(1, 64, 56, 10))
-        func = relay.concatenate([x, y, z], axis=3)
-        func = relay.Function([x, y, z], func)
-        return func
-
-    with TempOpAttr("concatenate", "FTVMLegalize", legalize_concatenate):
-        a = before()
-        a = run_opt_pass(a, transform.Legalize())
-        b = run_opt_pass(expected(), transform.InferType())
-
-    tvm.ir.assert_structural_equal(a, b)
-
-
-@pytest.mark.parametrize(
-    "target,exp_in_channels",
-    [
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu",
-            8,
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
-            3,
-        ),
-        (
-            "llvm --device=arm_cpu --mtriple=aarch64-linux-gnu -mattr=+v8.2a,+i8mm",
-            8,
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
-            8,
-        ),
-        (
-            "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
-            8,
-        ),
-    ],
-)
-def test_conv2d_NHWC_legalize(target, exp_in_channels):
-    target = tvm.target.Target(target)
-
-    dtype = "int8"
-    data_layout = "NHWC"
-    kernel_layout = "HWIO"
-    in_channels = 3
-    out_channels = 4
-    kernel_size = (1, 1)
-
-    x = relay.var("x", shape=(1, 1, 1, in_channels), dtype=dtype)
-    weight = relay.var("weight", shape=(1, 1, in_channels, out_channels), dtype=dtype)
-    out = relay.nn.conv2d(
-        x,
-        weight,
-        kernel_size=kernel_size,
-        channels=out_channels,
-        data_layout=data_layout,
-        kernel_layout=kernel_layout,
-        out_dtype=dtype,
-    )
-
-    with target:
-        out = run_opt_pass(out, transform.Legalize())
-
-    act_in_channels = out.args[0].type_args[0].shape[3]
-
-    assert act_in_channels == exp_in_channels, "Actual input channels = " + str(act_in_channels)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
deleted file mode 100644
index 9f4a09dac46b..000000000000
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ /dev/null
@@ -1,380 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test legalize pass"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay import transform, analysis
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-@tvm.testing.uses_gpu
-def test_legalize_conv2d_NHWC():
-    """test legalize NHWC conv2d to enable tensorcore"""
-
-    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, dtype, do_pad=True):
-        out_channel = kernel_shape[3]
-        out_shape = list(data_shape)
-        out_shape[3] = out_channel
-        db, di, do = pad_shape
-
-        def before():
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=out_channel,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            y = relay.Function([x, weight], y)
-            return y
-
-        def legalize_conv2d(attrs, inputs, types):
-            with tvm.target.Target("cuda"):
-                return topi.nn.conv2d_legalize(attrs, inputs, types)
-
-        def expected():
-            if not do_pad:
-                return before()
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            if db or di:
-                x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
-            else:
-                x_pad = x
-            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
-            if di or do:
-                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
-            else:
-                weight_pad = weight
-            y_pad = relay.nn.conv2d(
-                x_pad,
-                weight=weight_pad,
-                channels=out_channel + do,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="NHWC",
-                kernel_layout="HWIO",
-            )
-            if db or do:
-                y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
-            else:
-                y = y_pad
-            y = relay.Function([x, weight], y)
-            return y
-
-        with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
-            a = before()
-            a = run_opt_pass(a, transform.Legalize())
-            b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    for dtype in ["float16", "int8", "int4"]:
-        # conv2d pad batch
-        _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0), dtype)
-        _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0), dtype)
-        _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), dtype, False)
-        # conv2d pad in_channel
-        _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0), dtype)
-        _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0), dtype)
-        _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0), dtype)
-        _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), dtype, False)
-        # conv2d pad out_channel
-        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1), dtype)
-        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31), dtype)
-        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), dtype, False)
-
-
-@tvm.testing.uses_gpu
-def test_legalize_conv2d_HWNC():
-    """test legalize HWNC conv2d to enable tensorcore"""
-
-    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, dtype, do_pad=True):
-        out_channel = kernel_shape[2]
-        out_shape = list(data_shape)
-        out_shape[3] = out_channel
-        db, di, do = pad_shape
-
-        def before():
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
-            y = relay.nn.conv2d(
-                x,
-                weight,
-                channels=out_channel,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="HWNC",
-                kernel_layout="HWOI",
-            )
-            y = relay.Function([x, weight], y)
-            return y
-
-        def legalize_conv2d(attrs, inputs, types):
-            with tvm.target.Target("cuda"):
-                return topi.nn.conv2d_legalize(attrs, inputs, types)
-
-        def expected():
-            if not do_pad:
-                return before()
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            if db or di:
-                x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, 0), (0, db), (0, di)))
-            else:
-                x_pad = x
-            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
-            if di or do:
-                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, do), (0, di)))
-            else:
-                weight_pad = weight
-            y_pad = relay.nn.conv2d(
-                x_pad,
-                weight=weight_pad,
-                channels=out_channel + do,
-                kernel_size=(3, 3),
-                padding=(1, 1),
-                data_layout="HWNC",
-                kernel_layout="HWOI",
-            )
-            if db or do:
-                y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
-            else:
-                y = y_pad
-            y = relay.Function([x, weight], y)
-            return y
-
-        with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
-            a = before()
-            a = run_opt_pass(a, transform.Legalize())
-            b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    # conv2d pad batch
-    _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int8")
-    _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int8")
-    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), "int8", False)
-    _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int4")
-    _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int4")
-    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), "int4", False)
-    # conv2d pad in_channel
-    _test_legalize_conv2d((16, 16, 8, 63), (3, 3, 64, 63), (0, 1, 0), "int8")
-    _test_legalize_conv2d((16, 16, 8, 33), (3, 3, 64, 33), (0, 15, 0), "int8")
-    _test_legalize_conv2d((16, 16, 8, 13), (3, 3, 64, 13), (0, 3, 0), "int8")
-    _test_legalize_conv2d((16, 16, 8, 1), (3, 3, 64, 1), (0, 0, 0), "int8", False)
-    _test_legalize_conv2d((16, 16, 8, 63), (3, 3, 64, 63), (0, 1, 0), "int4")
-    _test_legalize_conv2d((16, 16, 8, 33), (3, 3, 64, 33), (0, 31, 0), "int4")
-    _test_legalize_conv2d((16, 16, 8, 13), (3, 3, 64, 13), (0, 19, 0), "int4")
-    _test_legalize_conv2d((16, 16, 8, 1), (3, 3, 64, 1), (0, 0, 0), "int4", False)
-    # conv2d pad out_channel
-    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 63, 64), (0, 0, 1), "int8")
-    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 31), "int8")
-    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int8", False)
-    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 63, 64), (0, 0, 1), "int4")
-    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 7), "int4")
-    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int4", False)
-
-
-@tvm.testing.uses_gpu
-def test_legalize_dense():
-    def _test_legalize_dense(data_shape, kernel_shape, pad_shape, dtype, do_pad=True, units=None):
-        """test legalize dense to enable tensorcore"""
-        M, K = data_shape
-        N, _ = kernel_shape
-        out_shape = (M, N)
-        dm, dk, dn = pad_shape
-
-        def before():
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
-            y = relay.nn.dense(x, weight, units)
-            y = relay.Function([x, weight], y)
-            return y
-
-        def legalize_dense(attrs, inputs, types):
-            with tvm.target.Target("cuda"):
-                return topi.nn.dense_legalize(attrs, inputs, types)
-
-        def expected():
-            if not do_pad:
-                return before()
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            if dm or dk:
-                x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
-            else:
-                x_pad = x
-            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
-            if dn or dk:
-                weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk)))
-            else:
-                weight_pad = weight
-            y_pad = relay.nn.dense(x_pad, weight_pad, units=N + dn if units else None)
-            if dm or dn:
-                y = relay.strided_slice(y_pad, begin=[0, 0], end=out_shape)
-            else:
-                y = y_pad
-            y = relay.Function([x, weight], y)
-            return y
-
-        with TempOpAttr("nn.dense", "FTVMLegalize", legalize_dense):
-            a = before()
-            a = run_opt_pass(a, transform.Legalize())
-            b = run_opt_pass(expected(), transform.InferType())
-
-        tvm.ir.assert_structural_equal(a, b)
-
-    # dense
-    for dtype in ["float16", "int8"]:
-        _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), dtype, False)
-        _test_legalize_dense((7, 16), (32, 16), (1, 0, 0), dtype)
-        _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype)
-        _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype)
-        _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype)
-        _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype)
-        _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), dtype, False)
-
-    # Test if units parameter is correctly updated
-    _test_legalize_dense((8, 16), (30, 16), (0, 0, 2), "float16", units=30)
-
-    _test_legalize_dense((8, 32), (32, 32), (0, 0, 0), "int4", False)
-    _test_legalize_dense((7, 32), (32, 32), (1, 0, 0), "int4")
-    _test_legalize_dense((8, 31), (32, 31), (0, 1, 0), "int4")
-    _test_legalize_dense((8, 32), (31, 32), (0, 0, 1), "int4")
-    _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4")
-    _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4")
-    _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4")
-    _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False)
-
-
-@tvm.testing.uses_gpu
-def test_legalize_batch_matmul():
-    def _test_legalize_batch_matmul(
-        data_shape, kernel_shape, pad_shape, dtype, do_pad=True, transpose_a=False, transpose_b=True
-    ):
-        """test legalize dense to enable tensorcore"""
-        if transpose_a:
-            B, _, M = data_shape
-        else:
-            B, M, _ = data_shape
-
-        if transpose_b:
-            _, N, _ = kernel_shape
-        else:
-            _, _, N = kernel_shape
-
-        out_shape = (B, M, N)
-        dm, dk, dn = pad_shape
-
-        def before():
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
-            y = relay.nn.batch_matmul(x, weight, transpose_a=transpose_a, transpose_b=transpose_b)
-            y = relay.Function([x, weight], y)
-            return y
-
-        def legalize_batch_matmul(attrs, inputs, types):
-            with tvm.target.Target("cuda"):
-                return topi.nn.batch_matmul_legalize(attrs, inputs, types)
-
-        def expected():
-            if not do_pad:
-                return before()
-
-            x = relay.var("x", shape=data_shape, dtype=dtype)
-            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
-
-            if dm or dk:
-                if transpose_a:
-                    x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dk), (0, dm)))
-                else:
-                    x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
-            else:
-                x_pad = x
-
-            if dn or dk:
-                if transpose_b:
-                    weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk)))
-                else:
-                    weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dk), (0, dn)))
-            else:
-                weight_pad = weight
-
-            y_pad = relay.nn.batch_matmul(
-                x_pad,
-                weight_pad,
-                transpose_a=transpose_a,
-                transpose_b=transpose_b,
-            )
-            if dm or dn:
-                y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape)
-            else:
-                y = y_pad
-            y = relay.Function([x, weight], y)
-            return y
-
-        with TempOpAttr("nn.batch_matmul", "FTVMLegalize", legalize_batch_matmul):
-            a = before()
-            a = run_opt_pass(a, transform.Legalize())
-            b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-    for dtype in ["float16", "int8"]:
-        _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), dtype, False)
-        _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0), dtype)
-        _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0), dtype)
-        _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1), dtype)
-        _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1), dtype)
-        _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0), dtype)
-        _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), dtype, False)
-
-    _test_legalize_batch_matmul((16, 8, 32), (16, 32, 32), (0, 0, 0), "int4", False)
-    _test_legalize_batch_matmul((16, 7, 32), (16, 32, 32), (1, 0, 0), "int4")
-    _test_legalize_batch_matmul((16, 8, 31), (16, 32, 31), (0, 1, 0), "int4")
-    _test_legalize_batch_matmul((16, 8, 32), (16, 31, 32), (0, 0, 1), "int4")
-    _test_legalize_batch_matmul((16, 7, 31), (16, 31, 31), (1, 1, 1), "int4")
-    _test_legalize_batch_matmul((16, 3, 32), (16, 32, 32), (5, 0, 0), "int4")
-    _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 16, 0), "int4")
-    _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), "int4", False)
-
-    _test_legalize_batch_matmul(
-        (16, 8, 16), (16, 16, 32), (0, 0, 0), "float16", False, transpose_b=False
-    )
-    _test_legalize_batch_matmul(
-        (16, 16, 8), (16, 32, 16), (0, 0, 0), "float16", False, transpose_a=True
-    )
-
-
-if __name__ == "__main__":
-    test_legalize_conv2d_NHWC()
-    test_legalize_conv2d_HWNC()
-    test_legalize_dense()
-    test_legalize_batch_matmul()
diff --git a/tests/python/relay/test_pass_mac_count.py b/tests/python/relay/test_pass_mac_count.py
deleted file mode 100644
index 68f8851526b6..000000000000
--- a/tests/python/relay/test_pass_mac_count.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for MAC counter."""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import analysis, transform
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-    mod = tvm.IRModule.from_expr(expr)
-    mod = tvm.relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_gemm():
-    n = 512
-    k = 1024
-    m = 256
-    dshape1 = (n, k)
-    dshape2 = (m, k)
-    data1 = relay.var("data1", shape=dshape1)
-    data2 = relay.var("data2", shape=dshape2)
-    gemm = relay.nn.dense(data1, data2)
-    func = relay.Function([data1, data2], relay.Tuple(tvm.runtime.convert([gemm])))
-    func = run_opt_pass(func, transform.InferType())
-    compute_count = analysis.get_total_mac_number(func)
-    expect_count = n * m * k
-    assert compute_count == expect_count
-
-
-def test_conv():
-    batch_size = 1
-    input_channel = 3
-    h = 224
-    w = 224
-    output_channel = 64
-    kh = 7
-    kw = 7
-    h_padding = 1
-    w_padding = 1
-    oh = h + h_padding * 2 - kh + 1
-    ow = w + w_padding * 2 - kw + 1
-    dshape = (batch_size, input_channel, h, w)
-    weight = relay.var("weight", shape=(output_channel, input_channel, kh, kw))
-    data = relay.var("data", shape=dshape)
-    conv2d = relay.nn.conv2d(
-        data, weight, channels=output_channel, kernel_size=(kh, kw), padding=(h_padding, w_padding)
-    )
-    func = relay.Function([data, weight], relay.Tuple(tvm.runtime.convert([conv2d])))
-    func = run_opt_pass(func, transform.InferType())
-    compute_count = analysis.get_total_mac_number(func)
-    expect_count = batch_size * input_channel * oh * ow * output_channel * kh * kw
-    assert compute_count == expect_count
-
-
-def test_simple_network():
-    batch_size = 1
-    dshape = (batch_size, 64, 56, 56)
-    weight_conv = relay.var("weight_conv", shape=(64, 64, 3, 3))
-    data1 = relay.var("data1", shape=dshape)
-    data2 = relay.var("data2", shape=dshape)
-    weight_dense = relay.var("weight_dense", shape=(1, 56 * 56 * 64))
-
-    conv2d_1 = relay.nn.conv2d(data1, weight_conv, channels=64, kernel_size=(3, 3), padding=(1, 1))
-    conv2d_2 = relay.nn.conv2d(data2, weight_conv, channels=64, kernel_size=(3, 3), padding=(1, 1))
-    add = relay.add(conv2d_1, conv2d_2)
-    flattened = relay.nn.batch_flatten(add)
-    dense_1 = relay.nn.dense(flattened, weight_dense)
-
-    func = relay.Function(
-        [data1, data2, weight_conv, weight_dense],
-        relay.Tuple(tvm.runtime.convert([conv2d_1, conv2d_2, dense_1, add, flattened])),
-    )
-    # alter the CONV 2D data layout to test
-    func = run_opt_pass(func, transform.AlterOpLayout())
-    compute_count = analysis.get_total_mac_number(func)
-    expect_count = 231411712
-    assert compute_count == expect_count
-
-
-def test_depthwise_conv2d():
-    batch_size = 1
-    dshape = (batch_size, 64, 56, 56)
-    weight_conv = relay.var("weight_depthwiseconv", shape=(64, 1, 3, 3))
-    data1 = relay.var("data1", shape=dshape)
-    data2 = relay.var("data2", shape=dshape)
-    depthwise_conv2d_1 = relay.nn.conv2d(
-        data1, weight_conv, kernel_size=(3, 3), padding=(1, 1), groups=64
-    )
-    depthwise_conv2d_2 = relay.nn.conv2d(
-        data2, weight_conv, kernel_size=(3, 3), padding=(1, 1), groups=64
-    )
-    add = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
-    func = relay.Function(
-        [data1, data2, weight_conv],
-        relay.Tuple(tvm.runtime.convert([depthwise_conv2d_1, depthwise_conv2d_2, add])),
-    )
-    func = run_opt_pass(func, transform.InferType())
-    compute_count = analysis.get_total_mac_number(func)
-    assert compute_count == 2 * np.prod(dshape) * 3 * 3
-
-
-def test_conv_2d_transpose():
-    batch_size = 1
-    input_channel = 3
-    h = 224
-    w = 224
-    output_channel = 64
-    kh = 7
-    kw = 7
-    h_padding = 1
-    w_padding = 1
-    oh = h - h_padding * 2 + kh - 1
-    ow = w - w_padding * 2 + kw - 1
-    dshape = (batch_size, input_channel, h, w)
-    weight = relay.var("weight", shape=(input_channel, output_channel, kh, kw))
-    data = relay.var("data", shape=dshape)
-    conv2d_transpose = relay.nn.conv2d_transpose(
-        data, weight, channels=output_channel, kernel_size=(kh, kw), padding=(h_padding, w_padding)
-    )
-    func = relay.Function([data, weight], relay.Tuple(tvm.runtime.convert([conv2d_transpose])))
-    func = run_opt_pass(func, transform.InferType())
-    compute_count = analysis.get_total_mac_number(func)
-    expect_count = batch_size * input_channel * oh * ow * output_channel * kh * kw
-    assert compute_count == expect_count
-
-
-if __name__ == "__main__":
-    test_conv()
-    test_gemm()
-    test_simple_network()
-    test_depthwise_conv2d()
-    test_conv_2d_transpose()
diff --git a/tests/python/relay/test_pass_manager.py b/tests/python/relay/test_pass_manager.py
deleted file mode 100644
index 9da3869288e9..000000000000
--- a/tests/python/relay/test_pass_manager.py
+++ /dev/null
@@ -1,617 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for relay pass manager."""
-import numpy as np
-import pytest
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import ExprFunctor
-from tvm.relay import Function, Call
-from tvm.relay import analysis
-from tvm.relay import transform as _transform
-from tvm.ir import instrument as _instrument
-from tvm.relay.testing import run_infer_type
-import tvm.testing
-
-
-def get_var_func():
-    shape = (5, 10)
-    tp = relay.TensorType(shape, "float32")
-    x = relay.var("x", tp)
-    gv = relay.GlobalVar("myAbs")
-    func = relay.Function([x], relay.abs(x))
-    return gv, func
-
-
-def extract_var_func(mod, name):
-    var = mod.get_global_var(name)
-    func = mod[var]
-    return var, func
-
-
-def update_func(func):
-    # Double the value of Constants and vars.
-    class DoubleValues(ExprFunctor):
-        def __init__(self):
-            ExprFunctor.__init__(self)
-
-        def visit_constant(self, const):
-            return relay.add(const, const)
-
-        def visit_var(self, var):
-            return relay.add(var, var)
-
-        def visit_call(self, call):
-            new_op = self.visit(call.op)
-            new_args = [self.visit(arg) for arg in call.args]
-            return Call(new_op, new_args, call.attrs)
-
-        def visit_global_var(self, gvar):
-            return gvar
-
-        def visit_op(self, op):
-            return op
-
-        def visit_function(self, fn):
-            new_body = self.visit(fn.body)
-            return Function(list(fn.params), new_body, fn.ret_type, fn.type_params, fn.attrs)
-
-    double_value = DoubleValues()
-    return double_value.visit(func)
-
-
-class OptTester:
-    """A helper class for testing the pass manager."""
-
-    def __init__(self, mod):
-        if not isinstance(mod, tvm.IRModule):
-            raise TypeError("mod is expected to be the type of " "tvm.IRModule")
-        self.mod = mod
-
-    def analysis(self):
-        """Perform analysis for the current module."""
-        pass
-
-    @staticmethod
-    def transform(node, ctx=None):
-        """Perform optimization on node."""
-        if isinstance(node, tvm.IRModule):
-            # Add a function to the module and return an updated module.
-            gv, func = get_var_func()
-            mod = tvm.IRModule({gv: func})
-            mod.update(node)
-            return mod
-        if isinstance(node, relay.Function):
-            return update_func(node)
-
-        raise TypeError("Found not supported node type.")
-
-
-def get_rand(shape, dtype="float32"):
-    return tvm.nd.array(np.random.rand(*shape).astype(dtype))
-
-
-def check_func(func, ref_func):
-    func = run_infer_type(func)
-    ref_func = run_infer_type(ref_func)
-    tvm.ir.assert_structural_equal(func, ref_func)
-
-
-@tvm.testing.uses_gpu
-def test_module_pass():
-    shape = (5, 10)
-    dtype = "float32"
-    tp = relay.TensorType(shape, dtype)
-    x = relay.var("x", tp)
-    y = relay.var("y", tp)
-    v_add = relay.GlobalVar("myAdd")
-    func = relay.Function([x, y], x + y)
-    mod = tvm.IRModule({v_add: func})
-
-    pass_name = "module_pass_test"
-    opt_level = 0
-    opt_tester = OptTester(mod)
-    pass_ctx = None
-
-    @tvm.transform.module_pass(opt_level=opt_level, name=pass_name)
-    def transform(expr, ctx):
-        return opt_tester.transform(expr, ctx)
-
-    def test_pass_registration():
-        mod_pass = transform
-        assert isinstance(mod_pass, tvm.transform.ModulePass)
-        pass_info = mod_pass.info
-        assert pass_info.name == pass_name
-        assert pass_info.opt_level == opt_level
-
-    def test_pass_registration_no_decorator():
-        def direct_transform(expr, ctx):
-            return opt_tester.transform(expr, ctx)
-
-        mod_pass = tvm.transform.module_pass(direct_transform, opt_level=3)
-        assert isinstance(mod_pass, tvm.transform.ModulePass)
-        pass_info = mod_pass.info
-        assert pass_info.name == "direct_transform"
-        assert pass_info.opt_level == 3
-
-    def test_pass_run():
-        module_pass = transform
-        assert pass_name in str(module_pass)
-
-        updated_mod = module_pass(mod)
-        assert isinstance(updated_mod, tvm.IRModule)
-
-        # Check the abs function in the updated module.
-        v_abs, myabs = get_var_func()
-        new_v_add = updated_mod.get_global_var(v_abs.name_hint)
-        new_abs = updated_mod[new_v_add]
-        check_func(new_abs, myabs)
-
-        # Check the add function in the updated module.
-        v_abs, myabs = get_var_func()
-        new_v_add = updated_mod.get_global_var(v_add.name_hint)
-        new_add = updated_mod[new_v_add]
-        check_func(new_add, func)
-
-        # Check the add function in the python transformed module.
-        ret = opt_tester.transform(mod, pass_ctx)
-        transformed_v_add = ret.get_global_var(v_add.name_hint)
-        transformed_add = mod[transformed_v_add]
-        check_func(new_add, transformed_add)
-
-        # Execute the add function.
-        x_nd = get_rand(shape, dtype)
-        y_nd = get_rand(shape, dtype)
-        ref_res = x_nd.numpy() + y_nd.numpy()
-        for target, dev in tvm.testing.enabled_targets():
-            res1 = relay.create_executor("graph", device=dev, target=target).evaluate(new_add)(
-                x_nd, y_nd
-            )
-            tvm.testing.assert_allclose(res1.numpy(), ref_res, rtol=1e-5)
-            res2 = relay.create_executor("debug", device=dev, target=target).evaluate(new_add)(
-                x_nd, y_nd
-            )
-            tvm.testing.assert_allclose(res2.numpy(), ref_res, rtol=1e-5)
-
-    test_pass_registration()
-    test_pass_registration_no_decorator
-    test_pass_run()
-
-
-def test_function_class_pass():
-    @relay.transform.function_pass(opt_level=1)
-    class TestReplaceFunc:
-        """Simple test function to replace one argument to another."""
-
-        def __init__(self, new_func):
-            self.new_func = new_func
-
-        def transform_function(self, func, mod, ctx):
-            return self.new_func
-
-    x = relay.var("x", shape=(10, 20))
-    f1 = relay.Function([x], x)
-    f2 = relay.Function([x], relay.log(x))
-    fpass = TestReplaceFunc(f1)
-    assert fpass.info.opt_level == 1
-    assert fpass.info.name == "TestReplaceFunc"
-    mod = tvm.IRModule.from_expr(f2)
-    mod = fpass(mod)
-    # wrap in expr
-    mod2 = tvm.IRModule.from_expr(f1)
-    mod2 = tvm.relay.transform.InferType()(mod2)
-    tvm.ir.assert_structural_equal(mod["main"], mod2["main"])
-
-
-@tvm.testing.uses_gpu
-def test_function_pass():
-    shape = (10,)
-    dtype = "float32"
-    tp = relay.TensorType(shape, dtype)
-    x = relay.var("x", tp)
-    v_log = relay.GlobalVar("myLog")
-    log = relay.Function([x], relay.log(x))
-    mod = tvm.IRModule({v_log: log})
-
-    pass_name = "function_pass_test"
-    opt_level = 1
-    opt_tester = OptTester(mod)
-    pass_ctx = None
-
-    @_transform.function_pass(opt_level=opt_level, name=pass_name)
-    def transform(expr, mod, ctx):
-        return opt_tester.transform(expr, ctx)
-
-    def get_ref_log():
-        ref_log = relay.Function([x], relay.log(relay.add(x, x)))
-        return ref_log
-
-    def test_pass_registration():
-        function_pass = transform
-        assert isinstance(function_pass, _transform.FunctionPass)
-        pass_info = function_pass.info
-        assert pass_info.name == pass_name
-        assert pass_info.opt_level == opt_level
-
-    def test_pass_registration_no_decorator():
-        def direct_transform(expr, ctx):
-            return opt_tester.transform(expr, ctx)
-
-        mod_pass = _transform.function_pass(direct_transform, opt_level=0)
-        assert isinstance(mod_pass, _transform.FunctionPass)
-        pass_info = mod_pass.info
-        assert pass_info.name == "direct_transform"
-        assert pass_info.opt_level == 0
-
-    def test_pass_run():
-        function_pass = transform
-        assert pass_name in str(function_pass)
-
-        updated_mod = function_pass(mod)
-        assert isinstance(updated_mod, tvm.IRModule)
-
-        # Check the log function in the updated module.
-        new_v_log = updated_mod.get_global_var(v_log.name_hint)
-        new_log = updated_mod[new_v_log]
-        check_func(new_log, get_ref_log())
-
-        # Check the log function in the python transformed function.
-        ret = opt_tester.transform(log, pass_ctx)
-        check_func(new_log, ret)
-
-        # Execute the add function.
-        x_nd = get_rand(shape, dtype)
-        ref_res = np.log(x_nd.numpy() * 2)
-        for target, dev in tvm.testing.enabled_targets():
-            res1 = relay.create_executor("graph", device=dev, target=target).evaluate(new_log)(x_nd)
-            tvm.testing.assert_allclose(res1.numpy(), ref_res, rtol=1e-5)
-            res2 = relay.create_executor("debug", device=dev, target=target).evaluate(new_log)(x_nd)
-            tvm.testing.assert_allclose(res2.numpy(), ref_res, rtol=1e-5)
-
-    test_pass_registration()
-    test_pass_registration_no_decorator()
-    test_pass_run()
-
-
-def test_module_class_pass():
-    @tvm.transform.module_pass(opt_level=1)
-    class TestPipeline:
-        """Simple test function to replace one argument to another."""
-
-        def __init__(self, new_mod, replace):
-            self.new_mod = new_mod
-            self.replace = replace
-
-        def transform_module(self, mod, ctx):
-            if self.replace:
-                return self.new_mod
-            return mod
-
-    x = relay.var("x", shape=(10, 20))
-    m1 = tvm.IRModule.from_expr(relay.Function([x], x))
-    m2 = tvm.IRModule.from_expr(relay.Function([x], relay.log(x)))
-    fpass = TestPipeline(m2, replace=True)
-    assert fpass.info.name == "TestPipeline"
-    mod3 = fpass(m1)
-    assert mod3.same_as(m2)
-    mod4 = TestPipeline(m2, replace=False)(m1)
-    assert mod4.same_as(m1)
-
-
-def test_pass_info():
-    info = tvm.transform.PassInfo(opt_level=1, name="xyz")
-    assert info.opt_level == 1
-    assert info.name == "xyz"
-
-
-@tvm.testing.uses_gpu
-def test_sequential_pass():
-    shape = (10,)
-    dtype = "float32"
-    tp = relay.TensorType(shape, dtype)
-    x = relay.var("x", tp)
-    y = relay.var("y", tp)
-    v_sub = relay.GlobalVar("mySub")
-    sub = relay.Function([x, y], relay.subtract(x, y))
-
-    z = relay.var("z", tp)
-    v_log = relay.GlobalVar("myLog")
-    log = relay.Function([z], relay.log(z))
-
-    mod = tvm.IRModule({v_sub: sub, v_log: log})
-
-    def get_ref_log():
-        ref_log = relay.Function([x], relay.log(relay.add(x, x)))
-        return ref_log
-
-    def get_ref_sub():
-        ref_sub = relay.Function([x, y], relay.subtract(relay.add(x, x), relay.add(y, y)))
-        return ref_sub
-
-    def get_ref_abs():
-        shape = (5, 10)
-        tp = relay.TensorType(shape, "float32")
-        a = relay.var("a", tp)
-        ref_abs = relay.Function([a], relay.abs(relay.add(a, a)))
-        return ref_abs
-
-    # Register a module pass.
-    opt_tester = OptTester(mod)
-    pass_ctx = None
-
-    @tvm.transform.module_pass(opt_level=1)
-    def mod_transform(expr, ctx):
-        return opt_tester.transform(expr, ctx)
-
-    module_pass = mod_transform
-
-    # Register a function pass.
-    @_transform.function_pass(opt_level=1)
-    def func_transform(expr, mod, ctx):
-        return opt_tester.transform(expr, ctx)
-
-    function_pass = func_transform
-
-    def test_pass_registration():
-        passes = [module_pass, function_pass]
-        opt_level = 2
-        pass_name = "sequential"
-        sequential = tvm.transform.Sequential(passes=passes, opt_level=opt_level)
-        pass_info = sequential.info
-        assert pass_info.name == pass_name
-        assert pass_info.opt_level == opt_level
-
-    def test_no_pass():
-        passes = []
-        sequential = tvm.transform.Sequential(opt_level=1, passes=passes)
-        ret_mod = sequential(mod)
-        mod_func = ret_mod[v_sub]
-        check_func(sub, mod_func)
-
-    def test_only_module_pass():
-        passes = [module_pass]
-        sequential = tvm.transform.Sequential(opt_level=1, passes=passes)
-        with tvm.transform.PassContext(required_pass=["mod_transform"]):
-            ret_mod = sequential(mod)
-        # Check the subtract function.
-        sub_var, new_sub = extract_var_func(ret_mod, v_sub.name_hint)
-        check_func(new_sub, sub)
-
-        # Check the abs function is added.
-        abs_var, abs_func = get_var_func()
-        abs_var, new_abs = extract_var_func(ret_mod, abs_var.name_hint)
-        check_func(new_abs, abs_func)
-
-    def test_only_function_pass():
-        # Check the subtract function.
-        passes = [function_pass]
-        sequential = tvm.transform.Sequential(opt_level=1, passes=passes)
-        with tvm.transform.PassContext(required_pass=["func_transform"]):
-            ret_mod = sequential(mod)
-        _, new_sub = extract_var_func(ret_mod, v_sub.name_hint)
-        check_func(new_sub, get_ref_sub())
-
-        # Check the log function.
-        log_var, new_log = extract_var_func(ret_mod, v_log.name_hint)
-        check_func(new_log, get_ref_log())
-
-    def test_multiple_passes():
-        # Reset the current module since mod has been polluted by the previous
-        # function pass.
-        mod = tvm.IRModule({v_sub: sub, v_log: log})
-        passes = [module_pass, function_pass]
-        sequential = tvm.transform.Sequential(opt_level=1, passes=passes)
-        required = ["mod_transform", "func_transform"]
-        with tvm.transform.PassContext(required_pass=required):
-            ret_mod = sequential(mod)
-
-        # Check the abs function is added.
-        abs_var, abs_func = get_var_func()
-        abs_var, new_abs = extract_var_func(ret_mod, abs_var.name_hint)
-        check_func(new_abs, get_ref_abs())
-
-        # Check the subtract function is modified correctly.
-        _, new_sub = extract_var_func(ret_mod, v_sub.name_hint)
-        check_func(new_sub, get_ref_sub())
-
-        # Check the log function is modified correctly.
-        _, new_log = extract_var_func(ret_mod, v_log.name_hint)
-        check_func(new_log, get_ref_log())
-
-        # Execute the updated subtract function.
-        x_nd = get_rand(shape, dtype)
-        y_nd = get_rand(shape, dtype)
-        ref_res = np.subtract(x_nd.numpy() * 2, y_nd.numpy() * 2)
-        for target, dev in tvm.testing.enabled_targets():
-            res1 = relay.create_executor("graph", device=dev, target=target).evaluate(new_sub)(
-                x_nd, y_nd
-            )
-            tvm.testing.assert_allclose(res1.numpy(), ref_res, rtol=1e-5)
-            res2 = relay.create_executor("debug", device=dev, target=target).evaluate(new_sub)(
-                x_nd, y_nd
-            )
-            tvm.testing.assert_allclose(res2.numpy(), ref_res, rtol=1e-5)
-
-        # Execute the updated abs function.
-        x_nd = get_rand((5, 10), dtype)
-        ref_res = np.abs(x_nd.numpy() * 2)
-        for target, dev in tvm.testing.enabled_targets():
-            res1 = relay.create_executor("graph", device=dev, target=target).evaluate(new_abs)(x_nd)
-            tvm.testing.assert_allclose(res1.numpy(), ref_res, rtol=1e-5)
-            res2 = relay.create_executor("debug", device=dev, target=target).evaluate(new_abs)(x_nd)
-            tvm.testing.assert_allclose(res2.numpy(), ref_res, rtol=1e-5)
-
-    test_pass_registration()
-    test_no_pass()
-    test_only_module_pass()
-    test_only_function_pass()
-    test_multiple_passes()
-
-
-def test_sequential_with_scoping():
-    shape = (1, 2, 3)
-    c_data = np.array(shape).astype("float32")
-    tp = relay.TensorType(shape, "float32")
-
-    def before():
-        c = relay.const(c_data)
-        x = relay.var("x", tp)
-        y = relay.add(c, c)
-        y = relay.multiply(y, relay.const(2, "float32"))
-        y = relay.add(x, y)
-        z = relay.add(y, c)
-        z1 = relay.add(y, c)
-        z2 = relay.add(z, z1)
-        return relay.Function([x], z2)
-
-    def expected():
-        x = relay.var("x", tp)
-        c_folded = (c_data + c_data) * 2
-        y = relay.add(x, relay.const(c_folded))
-        z = relay.add(y, relay.const(c_data))
-        z1 = relay.add(z, z)
-        return relay.Function([x], z1)
-
-    seq = tvm.transform.Sequential(
-        [
-            relay.transform.InferType(),
-            relay.transform.FoldConstant(),
-            relay.transform.EliminateCommonSubexpr(),
-            relay.transform.AlterOpLayout(),
-        ]
-    )
-
-    mod = tvm.IRModule({"main": before()})
-    with tvm.transform.PassContext(opt_level=3):
-        with tvm.target.Target("llvm"):
-            mod = seq(mod)
-
-    zz = mod["main"]
-    zexpected = run_infer_type(expected())
-    tvm.ir.assert_structural_equal(zz, zexpected)
-
-
-def test_nested_sequential_with_scoping():
-    def before():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.reshape(y, newshape=(1, 16, -1))
-        y = relay.reshape(y, newshape=(4, 8, -1, 16))
-        y = relay.reverse_reshape(y, newshape=(32, 0, -1))
-        return tvm.IRModule.from_expr(y)
-
-    def expected():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.reshape(y, newshape=(32, 16, 16))
-        return tvm.IRModule.from_expr(y)
-
-    z = before()
-    passes = [
-        tvm.transform.Sequential([relay.transform.SimplifyExpr()]),
-    ]
-    with tvm.transform.PassContext(opt_level=1):
-        zz = tvm.transform.Sequential(passes)(z)
-
-    expected = relay.transform.InferType()(expected())
-    tvm.ir.assert_structural_equal(zz, expected)
-
-
-def test_print_ir(capfd):
-    shape = (1, 2, 3)
-    tp = relay.TensorType(shape, "float32")
-    x = relay.var("x", tp)
-    y = relay.add(x, x)
-    y = relay.multiply(y, relay.const(2, "float32"))
-    func = relay.Function([x], y)
-
-    seq = tvm.transform.Sequential(
-        [
-            relay.transform.InferType(),
-            relay.transform.FoldConstant(),
-            tvm.transform.PrintIR(),
-            relay.transform.DeadCodeElimination(),
-        ]
-    )
-
-    mod = tvm.IRModule({"main": func})
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-
-    out = capfd.readouterr().err
-
-    assert "PrintIR" in out
-    assert "multiply" in out
-
-
-@tvm.instrument.pass_instrument
-class PassCounter:
-    def __init__(self):
-        # Just setting a garbage value to test set_up callback
-        self.counts = 1234
-
-    def enter_pass_ctx(self):
-        self.counts = 0
-
-    def exit_pass_ctx(self):
-        self.counts = 0
-
-    def run_before_pass(self, module, info):
-        self.counts += 1
-
-    def get_counts(self):
-        return self.counts
-
-
-def test_print_debug_callback():
-    shape = (1, 2, 3)
-    tp = relay.TensorType(shape, "float32")
-    x = relay.var("x", tp)
-    y = relay.add(x, x)
-    y = relay.multiply(y, relay.const(2, "float32"))
-    func = relay.Function([x], y)
-
-    seq = tvm.transform.Sequential(
-        [
-            relay.transform.InferType(),
-            relay.transform.FoldConstant(),
-            relay.transform.DeadCodeElimination(),
-        ]
-    )
-
-    mod = tvm.IRModule({"main": func})
-
-    pass_counter = PassCounter()
-    with tvm.transform.PassContext(opt_level=3, instruments=[pass_counter]):
-        # Should be reseted when entering pass context
-        assert pass_counter.get_counts() == 0
-        mod = seq(mod)
-
-        # TODO(@jroesch): when we remove new fn pass behavior we need to remove
-        # change this back to match correct behavior
-        assert pass_counter.get_counts() == 6
-
-    # Should be cleanned up after exiting pass context
-    assert pass_counter.get_counts() == 0
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_manifest_lifetimes.py b/tests/python/relay/test_pass_manifest_lifetimes.py
deleted file mode 100644
index ee9f824582ab..000000000000
--- a/tests/python/relay/test_pass_manifest_lifetimes.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm.relay import Function, transform
-from tvm.relay.testing import inception_v3
-import pytest
-import sys
-
-
-def optimize_and_check(before_program, after_program, passes):
-    if isinstance(before_program, str):
-        before_program = tvm.relay.parse(before_program)
-    if isinstance(after_program, str):
-        after_program = tvm.relay.parse(after_program)
-    if not isinstance(passes, list):
-        passes = [passes]
-    optimize = tvm.transform.Sequential(passes)
-    optimized_program = optimize(before_program)
-    print("Actual:")
-    print(optimized_program)
-    print("Expected:")
-    print(after_program)
-    tvm.ir.assert_structural_equal(optimized_program, after_program, map_free_vars=True)
-
-
-def test_simple_linear():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main(%x: int) {
-        let %y = %x + %x;
-        let %z = %y + %y;
-        let %w = %z + %z;
-        %w
-    }
-    """
-    after_program = """
-    #[version = "0.0.5"]
-    def @main(%x: int) {
-        let %y = %x + %x;
-        let %_0 = memory.kill(%x);
-        let %z = %y + %y;
-        let %_1 = memory.kill(%y);
-        let %w = %z + %z;
-        let %_2 = memory.kill(%z);
-        %w
-    }
-    """
-    optimize_and_check(before_program, after_program, transform.ManifestLifetimes())
-
-
-def test_simple_if():
-    before_program = """
-    #[version = "0.0.5"]
-    def @main(%x: int) {
-        let %y = cast(%x, dtype="bool");
-        let %z = if (%y) {
-            let %v0 = %x + %x;
-            let %v1 = %v0 * 2;
-            %v1
-        } else {
-            %x
-        };
-        %z
-    }
-    """
-    after_program = """
-    #[version = "0.0.5"]
-    def @main(%x: int) {
-        let %y = cast(%x, dtype="bool");
-        let %z = if (%y) {
-            let %v0 = %x + %x;
-            let %_0 = memory.kill(%x);
-            let %v1 = %v0 * 2;
-            let %_1 = memory.kill(%v0);
-            %v1
-        } else {
-            %x
-        };
-        let %_1 = memory.kill(%y);
-        %z
-    }
-    """
-    optimize_and_check(before_program, after_program, transform.ManifestLifetimes())
-
-
-def test_simple_match():
-    before_program = """
-    #[version = "0.0.5"]
-    type List[A] {
-        Cons(A, List[A]),
-        Nil,
-    }
-    def @main(%x: int) {
-        let %l : List[int] = Nil;
-        let %m = (match (%l) {
-            Cons(%head, %rest) => {
-                let %y = %x + 1;
-                let %z = %y + %y;
-                %z
-            },
-            Nil => -1,
-        });
-        %m
-    }
-    """
-    after_program = """
-    #[version = "0.0.5"]
-    type List[A] {
-        Cons(A, List[A]),
-        Nil,
-    }
-    def @main(%x: int) {
-        let %l : List[int] = Nil;
-        let %m = (match (%l) {
-            Cons(%head, %rest) => {
-                let %y = %x + 1;
-                let %_0 = memory.kill(%x);
-                let %z = %y + %y;
-                let %_1 = memory.kill(%y);
-                /* TODO: %head and %rest should be immediately killed */
-                %z
-            },
-            Nil => -1
-        });
-        let %_2 = memory.kill(%l);
-        %m
-    }
-    """
-    optimize_and_check(before_program, after_program, transform.ManifestLifetimes())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_merge_compiler_regions.py b/tests/python/relay/test_pass_merge_compiler_regions.py
deleted file mode 100644
index 440a56f43b21..000000000000
--- a/tests/python/relay/test_pass_merge_compiler_regions.py
+++ /dev/null
@@ -1,281 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for merge compiler regions."""
-import tvm
-from tvm import relay
-import tvm.relay.transform as transform
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-from tvm.relay.testing import run_opt_pass
-
-
-def test_diamond_graph_fanouts():
-    """
-    This tests that the data dependencies present in a diamond-shaped
-    graph are correctly resolved by the merging pass.
-
-    O = supported by target
-    X = not supported by target
-
-       O         O
-      / \\      /               \\
-     O   X --> O    +       +    X
-     \\ /             \\ /
-       O                O
-
-    Note that we can't just merge the three supported operators together,
-    otherwise both subgraphs would depend on the other.
-    """
-
-    def diamond_graph_fanouts():
-        data = relay.var("data", shape=(10, 10))
-        cb_1 = compiler_begin(data, "test")
-        O_1 = relay.abs(cb_1)
-        ce_1 = compiler_end(O_1, "test")
-        ce_2 = compiler_end(O_1, "test")
-        cb_2 = compiler_begin(ce_1, "test")
-        cb_3 = compiler_begin(ce_2, "default")
-        O_2 = relay.nn.relu(cb_2)
-        ce_3 = compiler_end(O_2, "test")
-
-        X = relay.tanh(cb_3)
-        ce_4 = compiler_end(X, "default")
-
-        cb_4 = compiler_begin(ce_3, "test")
-        cb_5 = compiler_begin(ce_4, "test")
-        O_3 = relay.add(cb_4, cb_5)
-        ce_5 = compiler_end(O_3, "test")
-
-        diamond = relay.Function([data], ce_5)
-        return diamond
-
-    def expected():
-        data = relay.var("data", shape=(10, 10))
-        cb_1 = compiler_begin(data, "test")
-        O_1 = relay.abs(cb_1)
-        ce_2 = compiler_end(O_1, "test")
-        O_2 = relay.nn.relu(O_1)
-        ce_3 = compiler_end(O_2, "test")
-
-        cb_3 = compiler_begin(ce_2, "default")
-        X = relay.tanh(cb_3)
-        ce_4 = compiler_end(X, "default")
-
-        cb_4 = compiler_begin(ce_3, "test")
-        cb_5 = compiler_begin(ce_4, "test")
-        O_3 = relay.add(cb_4, cb_5)
-        ce_5 = compiler_end(O_3, "test")
-
-        func = relay.Function([data], ce_5)
-        return func
-
-    result = run_opt_pass(diamond_graph_fanouts(), relay.transform.MergeCompilerRegions())
-    golden = run_opt_pass(expected(), relay.transform.InferType())
-    tvm.ir.assert_structural_equal(result, golden)
-
-
-def test_example_graph():
-    """This tests the merging algorithm on the example used in the RFC.
-
-    See the RFC here: https://discuss.tvm.apache.org/t/relay-improved-graph-partitioning-algorithm/5830
-    Blue nodes are adds (target: test), red nodes are subtracts (target: default).
-    """
-
-    def annotated():
-        in_1 = relay.var("in_1", shape=(10, 10), dtype="float32")
-        in_2 = relay.var("in_2", shape=(10, 10), dtype="float32")
-        in_3 = relay.var("in_3", shape=(10, 10), dtype="float32")
-        in_4 = relay.var("in_4", shape=(10, 10), dtype="float32")
-        in_5 = relay.var("in_5", shape=(10, 10), dtype="float32")
-        in_6 = relay.var("in_6", shape=(10, 10), dtype="float32")
-        in_7 = relay.var("in_7", shape=(10, 10), dtype="float32")
-        in_8 = relay.var("in_8", shape=(10, 10), dtype="float32")
-        in_9 = relay.var("in_9", shape=(10, 10), dtype="float32")
-        in_10 = relay.var("in_10", shape=(10, 10), dtype="float32")
-
-        begin0 = compiler_begin(in_1, "test")
-        begin1 = compiler_begin(in_2, "test")
-        begin2 = compiler_begin(in_3, "test")
-        begin3 = compiler_begin(in_4, "test")
-        node0 = relay.add(begin0, begin1)
-        node1 = relay.add(begin2, begin3)
-        end0 = compiler_end(node0, "test")
-        end1 = compiler_end(node1, "test")
-        begin4 = compiler_begin(end0, "test")
-        begin5 = compiler_begin(end1, "test")
-        node2 = relay.add(begin4, begin5)
-        end2 = compiler_end(node2, "test")
-
-        dbegin0 = compiler_begin(in_5, "default")
-        dbegin1 = compiler_begin(in_6, "default")
-        node3 = relay.subtract(dbegin0, dbegin1)
-        dbegin2 = compiler_begin(in_7, "default")
-        dend1 = compiler_end(node3, "default")
-        dbegin3 = compiler_begin(dend1, "default")
-        node4 = relay.subtract(dbegin2, dbegin3)
-        dend2 = compiler_end(node4, "default")
-
-        begin6 = compiler_begin(end2, "test")
-        begin7 = compiler_begin(dend2, "test")
-        node5 = relay.add(begin6, begin7)
-        end3 = compiler_end(node5, "test")
-        end4 = compiler_end(node5, "test")
-        dbegin4 = compiler_begin(in_8, "default")
-        dbegin5 = compiler_begin(end3, "default")
-        node6 = relay.subtract(dbegin4, dbegin5)
-        begin8 = compiler_begin(in_9, "test")
-        begin9 = compiler_begin(end4, "test")
-        node7 = relay.add(begin8, begin9)
-        end5 = compiler_end(node7, "test")
-
-        dend3 = compiler_end(node6, "default")
-        begin10 = compiler_begin(dend3, "test")
-        begin11 = compiler_begin(end5, "test")
-        node8 = relay.add(begin10, begin11)
-        end6 = compiler_end(node8, "test")
-        begin12 = compiler_begin(in_10, "test")
-        begin13 = compiler_begin(end6, "test")
-        node9 = relay.add(begin12, begin13)
-        end7 = compiler_end(node9, "test")
-
-        f = relay.Function([in_1, in_2, in_3, in_4, in_5, in_6, in_7, in_8, in_9, in_10], end7)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    def expected():
-        in_1 = relay.var("in_1", shape=(10, 10), dtype="float32")
-        in_2 = relay.var("in_2", shape=(10, 10), dtype="float32")
-        in_3 = relay.var("in_3", shape=(10, 10), dtype="float32")
-        in_4 = relay.var("in_4", shape=(10, 10), dtype="float32")
-        in_5 = relay.var("in_5", shape=(10, 10), dtype="float32")
-        in_6 = relay.var("in_6", shape=(10, 10), dtype="float32")
-        in_7 = relay.var("in_7", shape=(10, 10), dtype="float32")
-        in_8 = relay.var("in_8", shape=(10, 10), dtype="float32")
-        in_9 = relay.var("in_9", shape=(10, 10), dtype="float32")
-        in_10 = relay.var("in_10", shape=(10, 10), dtype="float32")
-
-        begin0 = compiler_begin(in_1, "test")
-        begin1 = compiler_begin(in_2, "test")
-        begin2 = compiler_begin(in_3, "test")
-        begin3 = compiler_begin(in_4, "test")
-        node0 = relay.add(begin0, begin1)
-        node1 = relay.add(begin2, begin3)
-        node2 = relay.add(node0, node1)
-
-        dbegin0 = compiler_begin(in_5, "default")
-        dbegin1 = compiler_begin(in_6, "default")
-        dbegin2 = compiler_begin(in_7, "default")
-        node3 = relay.subtract(dbegin0, dbegin1)
-        node4 = relay.subtract(dbegin2, node3)
-        dend0 = compiler_end(node4, "default")
-
-        begin4 = compiler_begin(dend0, "test")
-        begin5 = compiler_begin(in_9, "test")
-        node5 = relay.add(node2, begin4)
-        end1 = compiler_end(node5, "test")
-
-        dbegin4 = compiler_begin(end1, "default")
-        dbegin5 = compiler_begin(in_8, "default")
-        node6 = relay.subtract(dbegin5, dbegin4)
-        dend1 = compiler_end(node6, "default")
-
-        node7 = relay.add(begin5, node5)
-        end2 = compiler_end(node7, "test")
-        begin6 = compiler_begin(end2, "test")
-        begin7 = compiler_begin(dend1, "test")
-
-        node8 = relay.add(begin7, begin6)
-
-        begin8 = compiler_begin(in_10, "test")
-        node9 = relay.add(begin8, node8)
-        end3 = compiler_end(node9, "test")
-
-        f = relay.Function([in_1, in_2, in_3, in_4, in_5, in_6, in_7, in_8, in_9, in_10], end3)
-        mod = tvm.IRModule.from_expr(f)
-        return mod
-
-    mod = annotated()
-    mod = relay.transform.MergeCompilerRegions()(mod)
-    mod = relay.transform.InferType()(mod)
-    ref_mod = expected()
-    ref_mod = relay.transform.InferType()(ref_mod)
-    tvm.ir.assert_structural_equal(mod, ref_mod)
-
-
-def test_if_else():
-    """
-    This tests that the restriction regions propagate successful in
-    if_else control flow.
-
-    O = supported by target
-    X = not supported by target
-
-
-           O1 - - - |      O1 --|
-            |       |               |
-            X       |               X
-            |       |                              |
-    If cond ? O1: X | -->       +       +  If cond ? O1: X  +
-            |       |                                           |
-           O2 <- - -|                                          O2 <-|
-
-
-    Avoid O1 merge to O2.
-    """
-
-    target = "test_if_else_merge"
-
-    @tvm.ir.register_op_attr("sigmoid", "target." + target)
-    def sigmoid(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("erf", "target." + target)
-    def erf(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("add", "target." + target)
-    def add(expr):  # pylint: disable=unused-variable
-        return True
-
-    """Test that If-else nodes merges regions correctly."""
-
-    def get_mod():
-        data = relay.var("data", shape=(1, 32))
-        add0 = relay.add(data, data)
-        sub0 = relay.subtract(add0, data)
-        eq = relay.equal(relay.sum(add0), relay.sum(sub0))
-
-        true_branch = relay.sigmoid(add0)
-        false_branch = relay.sigmoid(sub0)
-        ife = relay.If(eq, true_branch, false_branch)
-        erf = relay.erf(ife)
-        out = relay.add(add0, erf)
-        func = relay.Function([data], out)
-        mod = tvm.IRModule.from_expr(func)
-
-        return mod
-
-    for annotate_non_call_ops in [True, False]:
-        result = transform.AnnotateTarget(target, annotate_non_call_ops)(get_mod())
-        merge = transform.MergeCompilerRegions()(result)
-        # Ensure partition finished without segment fault.
-        partition = transform.PartitionGraph()(merge)
-
-
-if __name__ == "__main__":
-    test_diamond_graph_fanouts()
-    test_example_graph()
-    test_if_else()
diff --git a/tests/python/relay/test_pass_merge_composite.py b/tests/python/relay/test_pass_merge_composite.py
deleted file mode 100644
index 7983c5370bea..000000000000
--- a/tests/python/relay/test_pass_merge_composite.py
+++ /dev/null
@@ -1,1026 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for merge composite."""
-import pytest
-import tvm
-from tvm import relay, tir
-from tvm.relay.dataflow_pattern import TuplePattern, TupleGetItemPattern, is_op, wildcard
-from tvm.relay.testing import run_opt_pass
-
-
-"""
-The merge composite pass is designed to merge multiple relay operators, that
-match a given pattern, and combine them into a single relay function.
-
-For example suppose we have the graph:
-
-    conv2d
-      |       (merge composite pass)
-   bias_add            ====>           conv2d_bias_relu
-      |            (our target)
-     relu
-
-Our Relay IR before the pass:
-    fn (%data: Tensor[(1, 512, 28, 28), float32], %kernel: Tensor[(256, 512, 1, 1), float32],
-            %bias: Tensor[(256), float32]) -> Tensor[(1, 256, 28, 28), float32] {
-        %0 = nn.conv2d(%data, %kernel, kernel_size=[1, 1])
-            /* ty=Tensor[(1, 256, 28, 28), float32] */;
-        %1 = nn.bias_add(%0, %bias) /* ty=Tensor[(1, 256, 28, 28), float32] */;
-        nn.relu(%1) /* ty=Tensor[(1, 256, 28, 28), float32] */
-    }
-
-Our Relay IR after the pass:
-    fn (%data: Tensor[(1, 512, 28, 28), float32], %kernel: Tensor[(256, 512, 1, 1), float32],
-            %bias: Tensor[(256), float32]) -> Tensor[(1, 256, 28, 28), float32] {
-      %2 = fn (%x: Tensor[(1, 512, 28, 28), float32], %y: Tensor[(256, 512, 1, 1), float32],
-            %z: Tensor[(256), float32], Primitive=1, Composite="conv2d_bias_relu") ->
-            Tensor[(1, 256, 28, 28), float32] {
-        %0 = nn.conv2d(%x, %y, kernel_size=[1, 1]) /* ty=Tensor[(1, 256, 28, 28), float32] */;
-        %1 = nn.bias_add(%0, %z) /* ty=Tensor[(1, 256, 28, 28), float32] */;
-        nn.relu(%1) /* ty=Tensor[(1, 256, 28, 28), float32] */
-      };
-      %2(%data, %kernel, %bias) /* ty=Tensor[(1, 256, 28, 28), float32] */
-    }
-
-As you can see in the second relay example, the pattern we specified has been wrapped
-in a function. The function is then called, producing the same result as the first relay
-example.
-
-One convenient use for this pass is to offload multiple operators to a single external
-codegen function.
-"""
-
-
-def make_add_sub_mul_pattern():
-    r"""Create a pattern to match the following graph.
-
-    add  sub
-     \   /
-      \ /
-      mul
-    """
-    x = wildcard()
-    y = wildcard()
-    return (x + y) * (x - y)
-
-
-def make_add_relu_pattern():
-    r"""Create a pattern to match the following graph.
-
-     add
-      |
-    relu
-    """
-    add_node = wildcard() + wildcard()
-    r = is_op("nn.relu")(add_node)
-    return r
-
-
-def make_conv_bias_relu_pattern():
-    r"""Create a pattern to match the following graph.
-
-     conv2d
-       |
-    bias_add
-       |
-     relu
-    """
-    x = wildcard()
-    y = wildcard()
-    z = wildcard()
-    conv_node = is_op("nn.conv2d")(x, y)
-    bias_node = is_op("nn.bias_add")(conv_node, z)
-    r = is_op("nn.relu")(bias_node)
-    return r
-
-
-def make_pattern_with_optional():
-    r"""Create a pattern to match the following graph. Note that relu is optinal.
-
-     conv2d
-       |
-    bias_add
-       |
-     (relu)
-    """
-    x = wildcard()
-    y = wildcard()
-    z = wildcard()
-    conv_node = is_op("nn.conv2d")(x, y)
-    bias_node = is_op("nn.bias_add")(conv_node, z)
-    r = bias_node.optional(lambda x: is_op("nn.relu")(x))
-    return r
-
-
-def make_add_add_add_pattern():
-    r"""Create a pattern to match the following graph.
-       Useful for testing re-using a call node.
-
-        x    y
-      /  \  /
-      |  add
-       \  |  \
-         add |
-          | /
-         add
-    """
-    x = wildcard()
-    y = wildcard()
-    add_node = is_op("add")(x, y)
-    add_node_1 = is_op("add")(x, add_node)
-    r = is_op("add")(add_node_1, add_node)
-    return r
-
-
-def make_bn_relu_pattern():
-    r"""Create a pattern to match the following graph.
-
-     batch_norm
-         |
-    TupleGetItem(0)
-         |
-       relu
-    """
-    x = wildcard()
-    gamma = wildcard()
-    beta = wildcard()
-    moving_mean = wildcard()
-    moving_var = wildcard()
-    bn_node = is_op("nn.batch_norm")(x, gamma, beta, moving_mean, moving_var)
-    tuple_get_item_node = TupleGetItemPattern(bn_node, 0)
-    r = is_op("nn.relu")(tuple_get_item_node)
-    return r
-
-
-def check_result(pattern_table, graph, expected_graph, import_prelude=False):
-    """Utility function to check merge composite results."""
-    result = run_opt_pass(
-        graph, relay.transform.MergeComposite(pattern_table), import_prelude=import_prelude
-    )
-    assert not relay.analysis.free_vars(result), "Found free vars in the result graph: {0}".format(
-        str(result)
-    )
-    expected = run_opt_pass(expected_graph, relay.transform.InferType())
-    tvm.ir.assert_structural_equal(result, expected, map_free_vars=True)
-
-
-def test_simple_merge():
-    r"""Test composite function is correctly produced from simple graph.
-
-    We could expect the pattern `make_add_relu_pattern` to be merged
-    into a single op `add_relu`.
-
-        a  b
-        \ /               a  b
-        add    ====>      \ /
-         |             add_relu
-       relu
-
-    """
-    pattern_table = [("add_relu", make_add_relu_pattern())]
-
-    def before():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-        add_node = relay.add(a, b)
-        r = relay.nn.relu(add_node)
-        return relay.Function([a, b], r)
-
-    def expected():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-
-        # add_relu function
-        in_1 = relay.var("in_1", shape=(10, 10))
-        in_2 = relay.var("in_2", shape=(10, 10))
-        add_node = relay.add(in_1, in_2)
-        relu_node = relay.nn.relu(add_node)
-        add_relu = relay.Function([in_1, in_2], relu_node)
-        add_relu = add_relu.with_attr("Composite", "add_relu")
-        add_relu = add_relu.with_attr("PartitionedFromPattern", "add_nn.relu_")
-
-        # merged function
-        r = relay.Call(add_relu, [a, b])
-        return relay.Function([a, b], r)
-
-    check_result(pattern_table, before(), expected())
-
-
-def test_branch_merge():
-    r"""Test composite function is correctly produced from branching graph.
-
-    We would expect the pattern `make_add_sub_mul_pattern` to be merged
-    into a single op `add_sub_mul`.
-
-       a  b  a  b
-        \/    \/
-        add  sub                       a  b
-         \   /                          \/
-          \ /                      add_sub_mul
-          mul                     c     |
-          /  \                     \    |
-       c /  c |       ====>        add_sub_mul
-       \/   \/                          |
-       add  sub                         |
-        \   /                         relu
-         \ /
-         mul
-          |
-          |
-        relu
-    """
-
-    pattern_table = [("add_sub_mul", make_add_sub_mul_pattern())]
-
-    def before():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-        c = relay.var("c", shape=(10, 10))
-        add_node = relay.add(a, b)
-        sub_node = relay.subtract(a, b)
-        mul_node = relay.multiply(add_node, sub_node)
-        add_node_2 = relay.add(c, mul_node)
-        sub_node_2 = relay.subtract(c, mul_node)
-        mul_node_2 = relay.multiply(add_node_2, sub_node_2)
-        r = relay.nn.relu(mul_node_2)
-        return relay.Function([a, b, c], r)
-
-    def expected():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-        c = relay.var("c", shape=(10, 10))
-
-        # add_sub_mul function
-        in_1 = relay.var("in_1", shape=(10, 10))
-        in_2 = relay.var("in_2", shape=(10, 10))
-        add_node = relay.add(in_1, in_2)
-        sub_node = relay.subtract(in_1, in_2)
-        mul_node = relay.multiply(add_node, sub_node)
-        add_sub_mul = relay.Function([in_1, in_2], mul_node)
-        add_sub_mul = add_sub_mul.with_attr("Composite", "add_sub_mul")
-        add_sub_mul = add_sub_mul.with_attr("PartitionedFromPattern", "add_subtract_multiply_")
-
-        # add_sub_mul1 function
-        in_3 = relay.var("in_3", shape=(10, 10))
-        in_4 = relay.var("in_4", shape=(10, 10))
-        add_node_1 = relay.add(in_3, in_4)
-        sub_node_1 = relay.subtract(in_3, in_4)
-        mul_node_1 = relay.multiply(add_node_1, sub_node_1)
-        add_sub_mul_1 = relay.Function([in_3, in_4], mul_node_1)
-        add_sub_mul_1 = add_sub_mul_1.with_attr("Composite", "add_sub_mul")
-        add_sub_mul_1 = add_sub_mul_1.with_attr("PartitionedFromPattern", "add_subtract_multiply_")
-
-        # merged function
-        m_add_sub_mul_1 = relay.Call(add_sub_mul, [a, b])
-        m_add_sub_mul_2 = relay.Call(add_sub_mul_1, [c, m_add_sub_mul_1])
-        r = relay.nn.relu(m_add_sub_mul_2)
-        return relay.Function([a, b, c], r)
-
-    check_result(pattern_table, before(), expected())
-
-
-def test_reuse_call_merge():
-    r"""Test composite function is correctly produced from simple graph
-       which re-uses call nodes.
-
-    We could expect the pattern `make_add_add_add` to be merged
-    into a single op `add_add_add`.
-
-        x     y
-         \   / \
-          sub  |           x     y
-        /  |  /             \   / |
-        | add      ====>     sub  |
-         \ |  \               |  /
-          add |           add_add_add
-           | /
-          add
-
-    """
-    pattern_table = [("add_add_add", make_add_add_add_pattern())]
-
-    def before():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-        sub_node = relay.subtract(a, b)
-
-        # pattern
-        add_node = relay.add(sub_node, b)
-        add_node_1 = relay.add(sub_node, add_node)
-        r = relay.add(add_node_1, add_node)
-
-        return relay.Function([a, b], r)
-
-    def expected():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-
-        # add_relu_add function
-        in_1 = relay.var("in_1", shape=(10, 10))
-        in_2 = relay.var("in_2", shape=(10, 10))
-        add_node = relay.add(in_1, in_2)
-        add_node_1 = relay.add(in_1, add_node)
-        add_node_2 = relay.add(add_node_1, add_node)
-        add_add_add = relay.Function([in_1, in_2], add_node_2)
-        add_add_add = add_add_add.with_attr("Composite", "add_add_add")
-        add_add_add = add_add_add.with_attr("PartitionedFromPattern", "add_add_add_")
-
-        # merged function
-        sub_node = relay.subtract(a, b)
-        call = relay.Call(add_add_add, [sub_node, b])
-        return relay.Function([a, b], call)
-
-    check_result(pattern_table, before(), expected())
-
-
-def test_multiple_patterns():
-    r"""Test different patterns are merged correctly in the graph.
-
-    We would expect the pattern `make_conv_bias_relu_pattern` to be merged
-    into a single op `conv_bias_relu`. We would also expect `make_add_relu_pattern`
-    to be merged into a single op `add_relu`.
-
-        data   kernel
-          \      /
-           \    /
-           conv2d                   data   kernel   bias
-             |                         \      |      /
-             |   bias                 conv2d_bias_relu
-             |   /                            |
-          bias_add        ====>               |    a
-             |                                |   /
-           relu  a                        add_relu
-             \  /                             |
-             add                              |  b
-              |                               | /
-            relu  b                          mul
-              |  /
-             mul
-    """
-    pattern_table = [
-        ("conv2d_bias_relu", make_conv_bias_relu_pattern()),
-        ("add_relu", make_add_relu_pattern()),
-    ]
-
-    def before():
-        data = relay.var("data", shape=(1, 512, 28, 28))
-        kernel = relay.var("kernel", shape=(256, 512, 1, 1))
-        bias = relay.var("bias", shape=(256,))
-        a = relay.var("a", shape=(1, 256, 28, 28))
-        b = relay.var("b", shape=(1, 256, 28, 28))
-
-        conv_node = relay.nn.conv2d(
-            data, kernel, kernel_size=(1, 1), padding=(0, 0), strides=(1, 1)
-        )
-
-        bias_node = relay.nn.bias_add(conv_node, bias)
-        relu_node = relay.nn.relu(bias_node)
-        add_node = relay.add(relu_node, a)
-        relu_node_2 = relay.nn.relu(add_node)
-        r = relay.multiply(relu_node_2, b)
-        return relay.Function([data, kernel, bias, a, b], r)
-
-    def expected():
-        data = relay.var("data", shape=(1, 512, 28, 28))
-        kernel = relay.var("kernel", shape=(256, 512, 1, 1))
-        bias = relay.var("bias", shape=(256,))
-        a = relay.var("a", shape=(1, 256, 28, 28))
-        b = relay.var("b", shape=(1, 256, 28, 28))
-
-        # conv_bias_relu function
-        in_1 = relay.var("in_1", shape=(1, 512, 28, 28))
-        in_2 = relay.var("in_2", shape=(256, 512, 1, 1))
-        in_3 = relay.var("in_3", shape=(256,))
-
-        conv_node = relay.nn.conv2d(in_1, in_2, kernel_size=(1, 1), padding=(0, 0), strides=(1, 1))
-
-        bias_node = relay.nn.bias_add(conv_node, in_3)
-        r = relay.nn.relu(bias_node)
-        conv_bias_add_relu = relay.Function([in_1, in_2, in_3], r)
-        conv_bias_add_relu = conv_bias_add_relu.with_attr("Composite", "conv2d_bias_relu")
-        conv_bias_add_relu = conv_bias_add_relu.with_attr(
-            "PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_"
-        )
-
-        # add_relu function
-        in_4 = relay.var("in_4", shape=(1, 256, 28, 28))
-        in_5 = relay.var("in_5", shape=(1, 256, 28, 28))
-        add_node = relay.add(in_4, in_5)
-        r = relay.nn.relu(add_node)
-        add_relu = relay.Function([in_4, in_5], r)
-        add_relu = add_relu.with_attr("Composite", "add_relu")
-        add_relu = add_relu.with_attr("PartitionedFromPattern", "add_nn.relu_")
-
-        # merged function
-        conv_bias_add_relu_1 = relay.Call(conv_bias_add_relu, [data, kernel, bias])
-        add_relu_1 = relay.Call(add_relu, [conv_bias_add_relu_1, a])
-        r = relay.multiply(add_relu_1, b)
-        return relay.Function([data, kernel, bias, a, b], r)
-
-    check_result(pattern_table, before(), expected())
-
-
-def test_optional_pattern():
-    r"""Test the pattern with optional operators. We can define a pattern with some operators
-    optional. The merge composite pass will create composite functions for all matched patterns,
-    but with different "PartitionedFromPattern" attribute. We expect the backend codegen to
-    analyze that attribute and determine the corresponding action.
-
-    Pattern:    Matched Case A:    Matched Case B:
-
-     conv2d        conv2d             conv2d
-       |             |                  |
-    bias_add      bias_add           bias_add
-       |             |
-     (relu)         relu
-
-    In the above example, the composite function for matched case A would have
-    PartitionedFromPattern="nn.conv2d_nn.bias_add_nn.relu_" while the one for matched case B
-    woud be "nn.conv2d_nn.bias_add_".
-    """
-    pattern_table = [("layer", make_pattern_with_optional())]
-
-    def before():
-        x = relay.var("x", shape=(1, 3, 7, 7))
-        w1 = relay.var("w", shape=(3, 3, 1, 1))
-        b1 = relay.var("b", shape=(3,))
-        w2 = relay.var("w", shape=(3, 3, 1, 1))
-        b2 = relay.var("b", shape=(3,))
-        conv = relay.nn.conv2d(x, w1, kernel_size=(1, 1))
-        bias = relay.nn.bias_add(conv, b1)
-        relu = relay.nn.relu(bias)
-        conv = relay.nn.conv2d(relu, w2, kernel_size=(1, 1))
-        bias = relay.nn.bias_add(conv, b2)
-        return relay.Function([x, w1, w2, b1, b2], bias)
-
-    def expected():
-        # Matched composite function A
-        x = relay.var("x")
-        w = relay.var("w")
-        b = relay.var("b")
-        conv = relay.nn.conv2d(x, w, kernel_size=(1, 1))
-        bias = relay.nn.bias_add(conv, b)
-        relu = relay.nn.relu(bias)
-        func1 = relay.Function([x, w, b], relu)
-        func1 = func1.with_attr("Composite", "layer")
-        func1 = func1.with_attr("PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_")
-
-        # Matched composite function B
-        x = relay.var("x")
-        w = relay.var("w")
-        b = relay.var("b")
-        conv = relay.nn.conv2d(x, w, kernel_size=(1, 1))
-        bias = relay.nn.bias_add(conv, b)
-        func2 = relay.Function([x, w, b], bias)
-        func2 = func2.with_attr("Composite", "layer")
-        func2 = func2.with_attr("PartitionedFromPattern", "nn.conv2d_nn.bias_add_")
-
-        # Main function
-        x = relay.var("x", shape=(1, 3, 7, 7))
-        w1 = relay.var("w", shape=(3, 3, 1, 1))
-        b1 = relay.var("b", shape=(3,))
-        w2 = relay.var("w", shape=(3, 3, 1, 1))
-        b2 = relay.var("b", shape=(3,))
-        out1 = func1(x, w1, b1)
-        out2 = func2(out1, w2, b2)
-        return relay.Function([x, w1, w2, b1, b2], out2)
-
-    check_result(pattern_table, before(), expected())
-
-
-def test_merge_order():
-    r"""Test that patterns are merged in the order they exist in the pattern table.
-
-    There can be cases where one pattern is a subgraph of another, in which case
-    it is not clear which match should take priority. The priority should come
-    from the order in which the patterns are declared in the pattern table. The
-    first patterns will be merged with highest priority and the last with lowest.
-
-    A:       B:       C:
-    add      add      abs
-     |        |        |
-    abs      abs      relu
-     |
-    relu
-
-    """
-
-    def pattern_A():
-        x = wildcard()
-        y = wildcard()
-        out = is_op("add")(x, y)
-        out = is_op("abs")(out)
-        out = is_op("nn.relu")(out)
-        return out
-
-    def pattern_B():
-        x = wildcard()
-        y = wildcard()
-        out = is_op("add")(x, y)
-        out = is_op("abs")(out)
-        return out
-
-    def pattern_C():
-        x = wildcard()
-        out = is_op("abs")(x)
-        out = is_op("nn.relu")(out)
-        return out
-
-    def before():
-        input_1 = relay.var("input_1", shape=(10, 10))
-        input_2 = relay.var("input_2", shape=(10, 10))
-        out = relay.add(input_1, input_2)
-        out = relay.abs(out)
-        out = relay.nn.relu(out)
-        return relay.Function([input_1, input_2], out)
-
-    def after_A_priority():
-        input_1 = relay.var("input_1", shape=(10, 10))
-        input_2 = relay.var("input_2", shape=(10, 10))
-        x = relay.var("x")
-        y = relay.var("y")
-        out = relay.add(x, y)
-        out = relay.abs(out)
-        out = relay.nn.relu(out)
-        merged_func = relay.Function([x, y], out)
-        merged_func = merged_func.with_attr("Composite", "A")
-        merged_func = merged_func.with_attr("PartitionedFromPattern", "add_abs_nn.relu_")
-        ret = relay.Call(merged_func, [input_1, input_2])
-        return relay.Function([input_1, input_2], ret)
-
-    def after_B_priority():
-        input_1 = relay.var("input_1", shape=(10, 10))
-        input_2 = relay.var("input_2", shape=(10, 10))
-        x = relay.var("x")
-        y = relay.var("y")
-        out = relay.add(x, y)
-        out = relay.abs(out)
-        merged_func = relay.Function([x, y], out)
-        merged_func = merged_func.with_attr("Composite", "B")
-        merged_func = merged_func.with_attr("PartitionedFromPattern", "add_abs_")
-        out = relay.Call(merged_func, [input_1, input_2])
-        ret = relay.nn.relu(out)
-        return relay.Function([input_1, input_2], ret)
-
-    def after_C_priority():
-        input_1 = relay.var("input_1", shape=(10, 10))
-        input_2 = relay.var("input_2", shape=(10, 10))
-        x = relay.var("x")
-        out = relay.abs(x)
-        out = relay.nn.relu(out)
-        merged_func = relay.Function([x], out)
-        merged_func = merged_func.with_attr("Composite", "C")
-        merged_func = merged_func.with_attr("PartitionedFromPattern", "abs_nn.relu_")
-        out = relay.add(input_1, input_2)
-        ret = relay.Call(merged_func, [out])
-        return relay.Function([input_1, input_2], ret)
-
-    # check A highest priority
-    pattern_table = [
-        ("A", pattern_A()),
-        ("B", pattern_B()),
-        ("C", pattern_C()),
-    ]
-    check_result(pattern_table, before(), after_A_priority())
-
-    # check B highest priority
-    pattern_table = [
-        ("B", pattern_B()),
-        ("C", pattern_C()),
-        ("A", pattern_A()),
-    ]
-    check_result(pattern_table, before(), after_B_priority())
-
-    # check C highest priority
-    pattern_table = [
-        ("C", pattern_C()),
-        ("A", pattern_A()),
-        ("B", pattern_B()),
-    ]
-    check_result(pattern_table, before(), after_C_priority())
-
-
-def test_parallel_merge():
-    r"""Tests that parallel patterns relying on the same inputs are correctly merged.
-
-    The test graph is difficult to draw out as ascii art. It is essentially two parallel
-    add-sub-mul units which both consume input_1 and input_2 with their results being multiplied
-    to give the output. We expect both parallel branches should get merged and both should still
-    consume the same input variables, input_1 and input_2."""
-
-    def before():
-        input_1 = relay.var("input_1", shape=(10, 10))
-        input_2 = relay.var("input_2", shape=(10, 10))
-        branch_1_add = relay.add(input_1, input_2)
-        branch_1_sub = relay.subtract(input_1, input_2)
-        branch_1 = relay.multiply(branch_1_add, branch_1_sub)
-        branch_2_add = relay.add(input_1, input_2)
-        branch_2_sub = relay.subtract(input_1, input_2)
-        branch_2 = relay.multiply(branch_2_add, branch_2_sub)
-        out = relay.multiply(branch_1, branch_2)
-        return relay.Function([input_1, input_2], out)
-
-    def expected():
-        input_1 = relay.var("input_1", shape=(10, 10))
-        input_2 = relay.var("input_2", shape=(10, 10))
-        x = relay.var("x")
-        y = relay.var("y")
-        branch_1 = relay.multiply(relay.add(x, y), relay.subtract(x, y))
-        func_1 = relay.Function([x, y], branch_1)
-        func_1 = func_1.with_attr("Composite", "add_sub_mul")
-        func_1 = func_1.with_attr("PartitionedFromPattern", "add_subtract_multiply_")
-        call_1 = relay.Call(func_1, [input_1, input_2])
-        x1 = relay.var("x1")
-        y1 = relay.var("y1")
-        branch_2 = relay.multiply(relay.add(x1, y1), relay.subtract(x1, y1))
-        func_2 = relay.Function([x1, y1], branch_2)
-        func_2 = func_2.with_attr("Composite", "add_sub_mul")
-        func_2 = func_2.with_attr("PartitionedFromPattern", "add_subtract_multiply_")
-        call_2 = relay.Call(func_2, [input_1, input_2])
-        out = relay.multiply(call_1, call_2)
-        return relay.Function([input_1, input_2], out)
-
-    pattern_table = [("add_sub_mul", make_add_sub_mul_pattern())]
-    check_result(pattern_table, before(), expected())
-
-
-def test_multiple_input_subgraphs():
-    r"""Test the case when multiple input subgraphs feed into another subgraph.
-
-     (1)    (2)    (3)    (4)
-    add    add    add    add
-     |      |      |      |
-    relu   relu   relu   relu
-     \      /      \      /
-      \   /         \   /
-       add           sub
-        \            /
-          \        /
-            \    /
-              mul
-
-    ----> When 1=3 and 2=4 (Case 'A')
-
-    add_relu  add_relu
-       \         /
-        \      /
-       add_sub_mul
-
-    ----> When 1!=3 and 2!=4 (Case 'B')
-
-    add_relu  add_relu  add_relu  add_relu
-       \       /           \       /
-         \   /               \   /
-          add                 sub
-           \                  /
-            --------     -----
-                   \    /
-                    mul
-
-    The difference in behaviour comes from the fact that add_sub_mul expects that the
-    inputs to add and sub are identical (the same two relay expressions). So when you
-    have 4 independent inputs, the pattern should not be merged.
-    """
-
-    def before():
-        before_funcs = {}
-        inputs = [relay.var("input_" + str(i), shape=(10, 10)) for i in range(8)]
-        add_relu_1 = relay.add(inputs[0], inputs[1])
-        add_relu_1 = relay.nn.relu(add_relu_1)
-        add_relu_2 = relay.add(inputs[2], inputs[3])
-        add_relu_2 = relay.nn.relu(add_relu_2)
-        add_relu_3 = relay.add(inputs[4], inputs[5])
-        add_relu_3 = relay.nn.relu(add_relu_3)
-        add_relu_4 = relay.add(inputs[6], inputs[7])
-        add_relu_4 = relay.nn.relu(add_relu_4)
-        add = relay.add(add_relu_1, add_relu_2)
-        sub = relay.subtract(add_relu_3, add_relu_4)
-        out = relay.multiply(add, sub)
-        before_funcs["B"] = relay.Function(inputs, out)
-        sub = relay.subtract(add_relu_1, add_relu_2)
-        out = relay.multiply(add, sub)
-        before_funcs["A"] = relay.Function(inputs[:4], out)
-        return before_funcs
-
-    def after_A():
-        inputs = [relay.var("input_" + str(i), shape=(10, 10)) for i in range(4)]
-        x = relay.var("x")
-        y = relay.var("y")
-        add_relu_1 = relay.add(x, y)
-        add_relu_1 = relay.nn.relu(add_relu_1)
-        add_relu_1 = relay.Function([x, y], add_relu_1)
-        add_relu_1 = add_relu_1.with_attr("Composite", "add_relu")
-        add_relu_1 = add_relu_1.with_attr("PartitionedFromPattern", "add_nn.relu_")
-        add_relu_call_1 = relay.Call(add_relu_1, [inputs[0], inputs[1]])
-        x1 = relay.var("x1")
-        y1 = relay.var("y1")
-        add_relu_2 = relay.add(x1, y1)
-        add_relu_2 = relay.nn.relu(add_relu_2)
-        add_relu_2 = relay.Function([x1, y1], add_relu_2)
-        add_relu_2 = add_relu_2.with_attr("Composite", "add_relu")
-        add_relu_2 = add_relu_2.with_attr("PartitionedFromPattern", "add_nn.relu_")
-        add_relu_call_2 = relay.Call(add_relu_2, [inputs[2], inputs[3]])
-        x2 = relay.var("x2")
-        y2 = relay.var("y2")
-        add = relay.add(x2, y2)
-        sub = relay.subtract(x2, y2)
-        add_sub_mul = relay.multiply(add, sub)
-        add_sub_mul = relay.Function([x2, y2], add_sub_mul)
-        add_sub_mul = add_sub_mul.with_attr("Composite", "add_sub_mul")
-        add_sub_mul = add_sub_mul.with_attr("PartitionedFromPattern", "add_subtract_multiply_")
-        add_sub_mul_call = relay.Call(add_sub_mul, [add_relu_call_1, add_relu_call_2])
-        return relay.Function(inputs, add_sub_mul_call)
-
-    def after_B():
-        inputs = [relay.var("input_" + str(i), shape=(10, 10)) for i in range(8)]
-        add_relu_calls = []
-        for i in range(4):
-            x = relay.var("x" + str(i))
-            y = relay.var("x" + str(i))
-            add_relu = relay.add(x, y)
-            add_relu = relay.nn.relu(add_relu)
-            add_relu = relay.Function([x, y], add_relu)
-            add_relu = add_relu.with_attr("Composite", "add_relu")
-            add_relu = add_relu.with_attr("PartitionedFromPattern", "add_nn.relu_")
-            add_relu_call = relay.Call(add_relu, [inputs[i * 2], inputs[i * 2 + 1]])
-            add_relu_calls.append(add_relu_call)
-
-        add = relay.add(add_relu_calls[0], add_relu_calls[1])
-        sub = relay.subtract(add_relu_calls[2], add_relu_calls[3])
-        out = relay.multiply(add, sub)
-        return relay.Function(inputs, out)
-
-    pattern_table = [
-        ("add_sub_mul", make_add_sub_mul_pattern()),
-        ("add_relu", make_add_relu_pattern()),
-    ]
-    check_result(pattern_table, before()["A"], after_A())
-    check_result(pattern_table, before()["B"], after_B())
-
-
-def test_tuple_get_item_merge():
-    """Test composite function can be merged from pattern containing TupleGetItem nodes."""
-    pattern_table = [("bn_relu", make_bn_relu_pattern())]
-
-    def before():
-        x = relay.var("x", shape=(1, 8))
-        gamma = relay.var("gamma", shape=(8,))
-        beta = relay.var("beta", shape=(8,))
-        moving_mean = relay.var("moving_mean", shape=(8,))
-        moving_var = relay.var("moving_var", shape=(8,))
-        bn_node = relay.nn.batch_norm(x, gamma, beta, moving_mean, moving_var)
-        tuple_get_item_node = bn_node[0]
-        r = relay.nn.relu(tuple_get_item_node)
-        return relay.Function([x, gamma, beta, moving_mean, moving_var], r)
-
-    def expected():
-        x = relay.var("x", shape=(1, 8))
-        beta = relay.var("beta", shape=(8,))
-        gamma = relay.var("gamma", shape=(8,))
-        moving_mean = relay.var("moving_mean", shape=(8,))
-        moving_var = relay.var("moving_var", shape=(8,))
-
-        # bn_relu function
-        in_1 = relay.var("x1", shape=(1, 8))
-        in_2 = relay.var("gamma1", shape=(8,))
-        in_3 = relay.var("beta1", shape=(8,))
-        in_4 = relay.var("moving_mean1", shape=(8,))
-        in_5 = relay.var("moving_var1", shape=(8,))
-        bn_node = relay.nn.batch_norm(in_1, in_2, in_3, in_4, in_5)
-        tuple_get_item_node = bn_node[0]
-        relu_node = relay.nn.relu(tuple_get_item_node)
-        bn_relu = relay.Function([in_1, in_2, in_3, in_4, in_5], relu_node)
-        bn_relu = bn_relu.with_attr("Composite", "bn_relu")
-        bn_relu = bn_relu.with_attr(
-            "PartitionedFromPattern", "nn.batch_norm_TupleGetItem0_nn.relu_"
-        )
-
-        # merged function
-        r = relay.Call(bn_relu, [x, gamma, beta, moving_mean, moving_var])
-        return relay.Function([x, gamma, beta, moving_mean, moving_var], r)
-
-    check_result(pattern_table, before(), expected())
-
-
-def test_pattern_with_check():
-    def before():
-        x = relay.var("x", shape=(1, 10, 10, 10))
-        w = relay.var("w", shape=(10, 10, 3, 3))
-        b = relay.var("b", shape=(8,))
-        conv = relay.nn.conv2d(x, w, kernel_size=(3, 3), kernel_layout="OIHW", data_layout="NHWC")
-        bias = relay.nn.bias_add(conv, b)
-        relu = relay.nn.relu(bias)
-        return relay.Function([x, w, b], relu)
-
-    def _check_true(extract):
-        conv = extract.args[0].args[0]
-        return conv.attrs.data_layout == "NHWC"
-
-    def _check_false(extract):
-        conv = extract.args[0].args[0]
-        return conv.attrs.data_layout == "NCHW"
-
-    def expected():
-        x = relay.var("x")
-        w = relay.var("w")
-        b = relay.var("b")
-        conv = relay.nn.conv2d(x, w, kernel_size=(3, 3), kernel_layout="OIHW", data_layout="NHWC")
-        bias = relay.nn.bias_add(conv, b)
-        relu = relay.nn.relu(bias)
-        func = relay.Function([x, w, b], relu)
-        func = func.with_attr("Composite", "conv_bias_relu")
-        func = func.with_attr("PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_")
-
-        x = relay.var("x", shape=(1, 10, 10, 10))
-        w = relay.var("w", shape=(10, 10, 3, 3))
-        b = relay.var("b", shape=(8,))
-        return relay.Function([x, w, b], func(x, w, b))
-
-    pattern_table_false = [("conv_bias_relu", make_conv_bias_relu_pattern(), _check_false)]
-    check_result(pattern_table_false, before(), before())
-
-    pattern_table_true = [("conv_bias_relu", make_conv_bias_relu_pattern(), _check_true)]
-    check_result(pattern_table_true, before(), expected())
-
-
-def test_diamond_not_merge():
-    r"""
-    The pattern on the left shouldn't match the structure on the right
-
-    relu             relu
-     | \              | \
-     | clip           | add
-     |  /             |  |
-     mul              | clip
-                      |  /
-                      mul
-    """
-
-    def get_pattern():
-        conv = make_conv_bias_relu_pattern()
-        clip = is_op("clip")(conv, wildcard(), wildcard())
-        return is_op("multiply")(conv, clip)
-
-    def get_net():
-        data = relay.var("data", shape=(1, 512, 28, 28))
-        kernel = relay.var("kernel", shape=(256, 512, 1, 1))
-        conv = relay.nn.conv2d(data, kernel, kernel_size=(1, 1), padding=(0, 0), strides=(1, 1))
-        bias = relay.nn.bias_add(conv, relay.var("bias", shape=(256,)))
-        relu = relay.nn.relu(bias)
-        add = relay.op.add(relu, relay.const(1.0))
-        clip2 = relay.op.clip(add, 0, 255)
-        mul = relay.op.multiply(relu, clip2)
-        return relay.Function(relay.analysis.free_vars(mul), mul)
-
-    pattern_table = [("pat", get_pattern())]
-    net = get_net()
-    check_result(pattern_table, net, net)
-
-
-def test_type_check():
-    """Test that we can query tensor types in the 'check' function."""
-
-    def before():
-        x = relay.var("x", shape=(1, 10, 10, 10))
-        w = relay.var("w", shape=(10, 10, 3, 3))
-        b = relay.var("b", shape=(8,))
-        add = relay.op.add(x, x)
-        relu = relay.nn.relu(add)
-        conv = relay.nn.conv2d(
-            relu, w, kernel_size=(3, 3), kernel_layout="OIHW", data_layout="NHWC"
-        )
-        bias = relay.nn.bias_add(conv, b)
-        relu2 = relay.nn.relu(bias)
-        return run_opt_pass(relay.Function([x, w, b], relu2), relay.transform.InferType())
-
-    def expected_false():
-        x = relay.var("x", shape=(1, 10, 10, 10))
-        w = relay.var("w", shape=(10, 10, 3, 3))
-        b = relay.var("b", shape=(8,))
-
-        x0 = relay.var("x")
-
-        add = relay.op.add(x0, x0)
-        relu = relay.nn.relu(add)
-        func = relay.Function([x0], relu)
-        func = func.with_attr("PartitionedFromPattern", "add_nn.relu_")
-        func = func.with_attr("Composite", "add_relu")
-        call = relay.Call(func, [x])
-
-        conv = relay.nn.conv2d(
-            call, w, kernel_size=(3, 3), kernel_layout="OIHW", data_layout="NHWC"
-        )
-        bias = relay.nn.bias_add(conv, b)
-        relu2 = relay.nn.relu(bias)
-        return relay.Function([x, w, b], relu2)
-
-    def expected_true():
-        x = relay.var("x", shape=(1, 10, 10, 10))
-        w = relay.var("w", shape=(10, 10, 3, 3))
-        b = relay.var("b", shape=(8,))
-
-        x0 = relay.var("x")
-
-        add = relay.op.add(x0, x0)
-        relu = relay.nn.relu(add)
-        func = relay.Function([x0], relu)
-        func = func.with_attr("PartitionedFromPattern", "add_nn.relu_")
-        func = func.with_attr("Composite", "add_relu")
-        call = relay.Call(func, [x])
-
-        x2 = relay.var("x")
-        w1 = relay.var("w")
-        b1 = relay.var("b")
-        conv = relay.nn.conv2d(x2, w1, kernel_size=(3, 3), kernel_layout="OIHW", data_layout="NHWC")
-        bias = relay.nn.bias_add(conv, b1)
-        relu2 = relay.nn.relu(bias)
-        func = relay.Function([x2, w1, b1], relu2)
-        func = func.with_attr("Composite", "conv_bias_relu")
-        func = func.with_attr("PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_")
-        call = relay.Call(func, [call, w, b])
-        return relay.Function([x, w, b], call)
-
-    def _check_type_true(extract):
-        conv = extract.args[0].args[0]
-        typ = conv.checked_type
-        return bool(typ.shape[0] == 1)
-
-    def _check_type_false(extract):
-        conv = extract.args[0].args[0]
-        typ = conv.checked_type
-        return bool(typ.shape[0] != 1)
-
-    pattern_table_false = [
-        ("add_relu", make_add_relu_pattern()),
-        ("conv_bias_relu", make_conv_bias_relu_pattern(), _check_type_false),
-    ]
-    check_result(pattern_table_false, before(), expected_false())
-
-    pattern_table_true = [
-        ("add_relu", make_add_relu_pattern()),
-        ("conv_bias_relu", make_conv_bias_relu_pattern(), _check_type_true),
-    ]
-    check_result(pattern_table_true, before(), expected_true())
-
-
-def test_einsum_reshape_pattern():
-    """Test MergeComposite does not cause error with einsum operator."""
-
-    def make_einsum_reshape_pattern():
-        x = wildcard()
-        x = is_op("reshape")(x) | x
-        y = wildcard()
-        y = is_op("reshape")(y) | y
-        z = is_op("einsum")(TuplePattern([x, y]))
-        r = is_op("reshape")(z) | z
-        return r
-
-    pattern_table = [
-        (
-            "einsum_reshape",
-            make_einsum_reshape_pattern(),
-        )
-    ]
-
-    def before():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-        c = relay.reshape(a, [20, 5])
-        d = relay.reshape(b, [20, 5])
-        r = relay.einsum([c, d], "...ab,...cb->...ac")
-        return relay.Function([a, b], r)
-
-    def expected():
-        a = relay.var("a", shape=(10, 10))
-        b = relay.var("b", shape=(10, 10))
-        c = relay.reshape(a, [20, 5])
-        d = relay.reshape(b, [20, 5])
-        r = relay.einsum([c, d], "...ab,...cb->...ac")
-        func = relay.Function([a, b], r)
-        func = func.with_attr("Composite", "einsum_reshape")
-        func = func.with_attr("PartitionedFromPattern", "reshape_reshape_Tuple_einsum_")
-
-        input0 = relay.var("a", shape=(10, 10))
-        input1 = relay.var("b", shape=(10, 10))
-        output = func(input0, input1)
-        return relay.Function([input0, input1], output)
-
-    check_result(pattern_table, before(), expected())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_partial_eval.py b/tests/python/relay/test_pass_partial_eval.py
deleted file mode 100644
index 214b9fa330ec..000000000000
--- a/tests/python/relay/test_pass_partial_eval.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.prelude import Prelude
-from tvm.relay import op, create_executor, transform
-from tvm.relay import Var, TypeVar, TupleGetItem, Let, Function, const, RefRead, RefWrite, RefCreate
-from tvm.relay import TensorType, Tuple, If, Clause, PatternConstructor, PatternVar, Match
-from tvm.relay import GlobalVar, Call
-from tvm.relay.transform import gradient
-from tvm.relay.testing import make_nat_expr, run_infer_type
-
-
-def check_eval(expr, expected_result, mod=None, rtol=1e-07):
-    dev = tvm.device("llvm", 0)
-    result = create_executor(mod=mod, device=dev, target="llvm").evaluate(expr)
-    np.testing.assert_allclose(result.numpy(), expected_result, rtol=rtol)
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def tipe(expr):
-    return run_opt_pass(expr, [transform.PartialEvaluate(), transform.InferType()])
-
-
-def dcpe(expr, mod=None, grad=False, ignore_impurity=False):
-    passes = [
-        transform.PartialEvaluate(),
-        transform.InferType(),
-        transform.DeadCodeElimination(inline_once=True, ignore_impurity=ignore_impurity),
-        transform.InferType(),
-    ]
-    if grad:
-        expr = gradient(run_infer_type(expr))
-    if mod:
-        assert isinstance(expr, Function)
-        mod["main"] = expr
-        seq = tvm.transform.Sequential(passes)
-        mod = seq(mod)
-        return mod["main"]
-    return run_opt_pass(expr, passes)
-
-
-def test_tuple():
-    t = TypeVar("t")
-    x = Var("x", t)
-    body = TupleGetItem(relay.Tuple([relay.const(4.0), x]), 1)
-    f = Function([x], body, None, [t])
-    expected = relay.Function([x], x, None, [t])
-    expected = run_opt_pass(expected, transform.InferType())
-    tvm.ir.assert_structural_equal(dcpe(f), expected)
-
-
-def test_const_inline():
-    t = relay.TensorType([], "float32")
-    d = Var("d", t)
-    double = Function([d], d + d)
-    orig = double(const(4.0))
-    tvm.ir.assert_structural_equal(dcpe(orig), const(8.0))
-
-
-def test_ref():
-    t = relay.TensorType([], "float32")
-    d = relay.Var("d", t)
-    r = relay.Var("r", relay.RefType(t))
-    x = relay.Var("x")
-    body = relay.RefRead(r)
-    body = Let(x, RefWrite(r, RefRead(r) * RefRead(r)), body)
-    body = Let(r, RefCreate(d), body)
-    square = Function([d], body)
-    expected = run_opt_pass(Function([d], d * d), transform.InferType())
-    # TODO(mbs): Revisit once DCE eliminates dead writes.
-    actual = dcpe(square, ignore_impurity=True)
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_empty_ad():
-    shape = (10, 10)
-    dtype = "float32"
-    t = TensorType(shape, dtype)
-    d = Var("d", t)
-    f = Function([d], d)
-    # TODO(mbs): Revisit once DCE eliminates dead writes.
-    g = dcpe(f, grad=True, ignore_impurity=True)
-    expected = Function([d], Tuple([d, Tuple([op.ones_like(d)])]))
-    expected = run_opt_pass(expected, transform.InferType())
-    tvm.ir.assert_structural_equal(g, expected)
-
-
-def test_ad():
-    shape = (10, 10)
-    dtype = "float32"
-    t = TensorType(shape, dtype)
-    d = Var("d", t)
-    f = Function([d], d * d)
-    # TODO(mbs): Revisit once DCE eliminates dead writes.
-    g = dcpe(f, grad=True, ignore_impurity=True)
-    m = d * d
-    x = relay.Var("x")
-    o = op.ones_like(x)
-    x1 = relay.Var("x1")
-    grad = op.zeros_like(d) + op.collapse_sum_like(x1 * d, d) + op.collapse_sum_like(x1 * d, d)
-    body = Tuple([x, Tuple([grad])])
-    body = relay.Let(x1, o, body)
-    expected = Function([d], relay.Let(x, m, body))
-    expected = run_opt_pass(expected, transform.InferType())
-    tvm.ir.assert_structural_equal(g, expected)
-
-
-def test_if_ref():
-    shape = ()
-    dtype = "bool"
-    t = TensorType(shape, dtype)
-    d = Var("d", t)
-    r = Var("r")
-    update = Function([], RefWrite(r, RefRead(r) + RefRead(r)))
-    u = Var("u")
-    body = If(d, u(), u())
-    eff = Var("eff")
-    body = Let(eff, body, RefRead(r))
-    f = Function([d], Let(r, RefCreate(const(1)), Let(u, update, body)))
-    pe_f = tipe(f)
-    f_res = create_executor().evaluate(f)(const(True))
-    pe_f_res = create_executor().evaluate(pe_f)(const(True))
-    np.testing.assert_allclose(f_res.numpy(), 2 * np.ones_like(f_res.numpy()))
-    np.testing.assert_allclose(pe_f_res.numpy(), 2 * np.ones_like(pe_f_res.numpy()))
-
-
-def test_function_invalidate():
-    shape = ()
-    dtype = "bool"
-    t = TensorType(shape, dtype)
-    d = Var("d", t)
-    r = Var("r")
-    fetch = Function([], RefRead(r))
-    fet = Var("fetch")
-    fet_obscured = Var("fetch_obscured")
-    u = Var("u")
-    body = If(d, fet_obscured(), fet_obscured())
-    body = Let(u, RefWrite(r, const(1)), body)
-    body = Let(fet_obscured, If(d, fet, fet), body)
-    body = Let(fet, fetch, body)
-    body = Let(r, RefCreate(const(0)), body)
-    f = Function([d], body)
-    pe_f = tipe(f)
-    f_res = create_executor().evaluate(f)(const(True))
-    pe_f_res = create_executor().evaluate(pe_f)(const(True))
-    np.testing.assert_allclose(f_res.numpy(), np.ones_like(f_res.numpy()))
-    np.testing.assert_allclose(pe_f_res.numpy(), np.ones_like(pe_f_res.numpy()))
-
-
-def test_head_cons():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    t = TypeVar("t")
-    x = Var("x", t)
-    rlist, cons, nil = p.mod.get_type("List")
-    hd = p.mod.get_global_var("hd")
-    body = hd(cons(x, nil()))
-    f = Function([x], body, None, [t])
-    res = dcpe(f, mod)
-    expected_mod = tvm.IRModule.from_expr(Function([x], x, t, [t]))
-    tvm.ir.assert_structural_equal(res, expected_mod["main"])
-
-
-def test_map():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    rlist, cons, nil = p.mod.get_type("List")
-    rmap = p.mod.get_global_var("map")
-    f = GlobalVar("f")
-    t = TypeVar("t")
-    a = Var("a", t)
-    mod[f] = Function([a], a, t, [t])
-    orig = rmap(f, cons(const(1), cons(const(2), cons(const(3), nil()))))
-    expected = cons((const(1)), cons((const(2)), cons((const(3)), nil())))
-    expected = Function([], expected)
-    mod["main"] = expected
-    mod = transform.InferType()(mod)
-    expected = mod["main"]
-    orig = Function([], orig)
-    res = dcpe(orig, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, expected.body)
-
-
-def test_loop():
-    mod = tvm.IRModule()
-    t = TypeVar("t")
-    x = Var("x", t)
-    loop = GlobalVar("loop")
-    mod[loop] = Function([x], loop(x), t, [t])
-    expected = Call(loop, [const(1)])
-    mod["main"] = Function([], expected)
-    mod = transform.InferType()(mod)
-    expected = mod["main"].body
-    call = Function([], loop(const(1)))
-    res = dcpe(call, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, expected)
-
-
-def test_swap_loop():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, _, _ = p.mod.get_type("nat")
-    x = Var("x", nat())
-    y = Var("y", nat())
-    loop = GlobalVar("loop")
-    mod[loop] = Function([x, y], loop(y, x), nat())
-    prog = loop(make_nat_expr(p, 1), make_nat_expr(p, 2))
-    res = Function([], prog)
-    res = dcpe(res, mod=mod)
-    tvm.ir.assert_structural_equal(prog, res.body)
-
-
-def test_abs_diff():
-    # TODO(@M.K.): refactor using tuple pattern (not yet implemented)
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, z, s = p.mod.get_type("nat")
-    x = Var("x", nat())
-    y = Var("y", nat())
-    xp = Var("x'", nat())
-    yp = Var("y'", nat())
-    diff = GlobalVar("diff")
-    y_z_case = Clause(PatternConstructor(z, []), x)
-    y_s_case = Clause(PatternConstructor(s, [PatternVar(yp)]), diff(yp, xp))
-    x_z_case = Clause(PatternConstructor(z, []), y)
-    x_s_case = Clause(PatternConstructor(s, [PatternVar(xp)]), Match(y, [y_z_case, y_s_case]))
-    mod[diff] = Function([x, y], Match(x, [x_z_case, x_s_case]))
-    orig = diff(make_nat_expr(p, 7), make_nat_expr(p, 3))
-    orig = Function([], orig)
-    res = dcpe(orig, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, make_nat_expr(p, 4))
-
-
-def test_match_nat_id():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, z, s = p.mod.get_type("nat")
-    x = Var("x", nat())
-    y = Var("y", nat())
-    nat_id = GlobalVar("nat_id")
-    z_case = Clause(PatternConstructor(z, []), z())
-    s_case = Clause(PatternConstructor(s, [PatternVar(y)]), s(y))
-    mod[nat_id] = Function([x], Match(x, [z_case, s_case]))
-    orig = nat_id(make_nat_expr(p, 3))
-    orig = Function([], orig)
-    res = dcpe(orig, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, make_nat_expr(p, 3))
-
-
-def test_nat_id():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, _, _ = p.mod.get_type("nat")
-    x = Var("x", nat())
-    y = Var("y", nat())
-    nat_id = GlobalVar("nat_id")
-    mod[nat_id] = Function([x], x)
-    orig = nat_id(make_nat_expr(p, 3))
-    orig = Function([], orig)
-    res = dcpe(orig, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, make_nat_expr(p, 3))
-
-
-def test_global_match_nat_id():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, z, s = p.mod.get_type("nat")
-    x = Var("x", nat())
-    z_case = Clause(PatternConstructor(z, []), z())
-    s_case = Clause(PatternConstructor(s, [PatternVar(x)]), s(x))
-    orig = Match(make_nat_expr(p, 3), [z_case, s_case])
-    orig = Function([], orig)
-    res = dcpe(orig, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, make_nat_expr(p, 3))
-
-
-def test_double():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    double = p.mod.get_global_var("nat_double")
-    orig = double(make_nat_expr(p, 3))
-    orig = Function([], orig)
-    res = dcpe(orig, mod=mod)
-    tvm.ir.assert_structural_equal(res.body, make_nat_expr(p, 6))
-
-
-def test_concat():
-    t = relay.TensorType([10], "float32")
-    x = Var("x", t)
-    y = Var("x", t)
-    orig = run_infer_type(Function([x, y], op.concatenate([x, y], axis=0)))
-    tvm.ir.assert_structural_equal(dcpe(orig), orig)
-
-
-def test_triangle_number():
-    t = relay.TensorType([], "int32")
-    x = Var("x", t)
-    f_var = Var("f")
-    f = Function([x], If(op.equal(x, const(0)), const(0), x + f_var(x - const(1))))
-    orig = run_infer_type(Let(f_var, f, f_var(const(10))))
-    tvm.ir.assert_structural_equal(dcpe(orig), const(55))
-
-
-def test_nat_update():
-    m = tvm.IRModule()
-    p = Prelude(m)
-    p.mod.import_from_std("nat.rly")
-    m = transform.ToANormalForm()(m)
-    transform.PartialEvaluate()(m)
-
-
-def test_tuple_match():
-    a = relay.Var("a")
-    b = relay.Var("b")
-    clause = relay.Clause(relay.PatternTuple([relay.PatternVar(a), relay.PatternVar(b)]), a + b)
-    x = relay.Match(relay.Tuple([relay.const(1), relay.const(1)]), [clause])
-    tvm.ir.assert_structural_equal(dcpe(x), const(2))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py
deleted file mode 100644
index 524e93408c8c..000000000000
--- a/tests/python/relay/test_pass_partition_graph.py
+++ /dev/null
@@ -1,1579 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for graph partitioning."""
-# pylint: disable=not-callable
-import os
-import sys
-
-import numpy as np
-
-import tvm
-from tvm.relay.backend import te_compiler
-from tvm.relay.backend.runtime import Runtime
-import tvm.relay.testing
-import tvm.relay.op as reg
-from tvm import relay
-from tvm import runtime as tvm_runtime
-from tvm.relay import transform
-from tvm.relay.testing import byoc
-from tvm.contrib import utils
-from tvm.relay.expr_functor import ExprMutator
-from tvm.relay.op.annotation import compiler_begin, compiler_end
-from tvm.relay.op.contrib.register import get_pattern_table
-from tvm.relay.build_module import bind_params_by_name
-
-
-# Leverage the pass manager to write a simple allowed list based annotator
-@transform.function_pass(opt_level=0)
-class AllowedListAnnotator:
-    def __init__(self, op_list, compiler):
-        assert isinstance(op_list, (list, tuple, set))
-        self.op_list = op_list
-        self.compiler = compiler
-
-    def transform_function(self, func, mod, dev):
-
-        annotator = self
-
-        class Annotator(tvm.relay.ExprMutator):
-            def visit_call(self, call):
-                op_name = call.op.name
-                if op_name in annotator.op_list:
-                    new_args = []
-                    for arg in call.args:
-                        ann = compiler_begin(super().visit(arg), annotator.compiler)
-                        new_args.append(ann)
-                    new_call = relay.Call(call.op, new_args, call.attrs, call.type_args)
-                    return compiler_end(new_call, annotator.compiler)
-                else:
-                    return super().visit_call(call)
-
-        return Annotator().visit(func)
-
-
-class WholeGraphAnnotator(ExprMutator):
-    """
-    An annotator that creates a compiler for an entire graph.
-    """
-
-    def __init__(self, compiler):
-        super(WholeGraphAnnotator, self).__init__()
-        self.compiler = compiler
-        self.last_call = True
-
-    def visit_call(self, call):
-        curr_last = self.last_call
-        self.last_call = False
-
-        params = []
-        for arg in call.args:
-            param = super().visit(arg)
-            if isinstance(param, relay.expr.Var):
-                param = compiler_begin(param, self.compiler)
-            params.append(param)
-
-        new_call = relay.Call(call.op, params, call.attrs)
-        if curr_last:
-            new_call = compiler_end(new_call, self.compiler)
-        return new_call
-
-
-class MobileNetAnnotator(ExprMutator):
-    """
-    Annotate mobilenet until global_avg_pool.
-    """
-
-    def __init__(self, compiler):
-        super(MobileNetAnnotator, self).__init__()
-        self.compiler = compiler
-        self.compiler_open = False
-
-    def visit_call(self, call):
-
-        if call.op.name == "nn.global_avg_pool2d":
-            self.compiler_open = True
-        compiler_open = self.compiler_open
-
-        params = []
-        for arg in call.args:
-            param = super().visit(arg)
-            if call.op.name == "nn.global_avg_pool2d":
-                param = compiler_end(param, self.compiler)
-            if compiler_open and isinstance(param, relay.expr.Var):
-                param = compiler_begin(param, self.compiler)
-            params.append(param)
-
-        new_call = relay.Call(call.op, params, call.attrs)
-        return new_call
-
-
-def check_result(
-    mod,
-    map_inputs,
-    out_shape,
-    result,
-    tol=1e-5,
-    target="llvm",
-    device=tvm.cpu(),
-    params=None,
-    runtime=Runtime("cpp"),
-):
-    if sys.platform == "win32":
-        print("Skip test on Windows for now")
-        return
-
-    def update_lib(lib):
-        test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-        source_dir = os.path.join(test_dir, "..", "..", "..")
-        contrib_path = os.path.join(source_dir, "src", "runtime", "contrib")
-
-        kwargs = {}
-        kwargs["options"] = ["-O2", "-std=c++17", "-I" + contrib_path]
-        tmp_path = utils.tempdir()
-        lib_name = "lib.so"
-        lib_path = tmp_path.relpath(lib_name)
-        lib.export_library(lib_path, fcompile=False, **kwargs)
-        lib = tvm_runtime.load_module(lib_path)
-
-        return lib
-
-    def check_vm_result():
-        te_compiler.get().clear()
-        with tvm.transform.PassContext(opt_level=3):
-            exe = relay.vm.compile(mod, target=target, params=params)
-        code, lib = exe.save()
-        lib = update_lib(lib)
-        exe = tvm_runtime.vm.Executable.load_exec(code, lib)
-        vm = tvm_runtime.vm.VirtualMachine(exe, device)
-        outs = vm.run(**map_inputs)
-        outs = outs if isinstance(outs, tvm_runtime.container.ADT) else [outs]
-        results = result if isinstance(result, list) else [result]
-        for out, ref in zip(outs, results):
-            tvm.testing.assert_allclose(out.numpy(), ref, rtol=tol, atol=tol)
-
-    def check_graph_executor_result():
-        te_compiler.get().clear()
-        with tvm.transform.PassContext(opt_level=3):
-            json, lib, param = relay.build(mod, target=target, params=params, runtime=runtime)
-        lib = update_lib(lib)
-        rt_mod = tvm.contrib.graph_executor.create(json, lib, device)
-
-        for name, data in map_inputs.items():
-            rt_mod.set_input(name, data)
-        rt_mod.set_input(**param)
-        rt_mod.run()
-
-        out_shapes = out_shape if isinstance(out_shape, list) else [out_shape]
-        results = result if isinstance(result, list) else [result]
-
-        for idx, shape in enumerate(out_shapes):
-            out = tvm.nd.empty(shape, device=device)
-            out = rt_mod.get_output(idx, out)
-            tvm.testing.assert_allclose(out.numpy(), results[idx], rtol=tol, atol=tol)
-
-    check_vm_result()
-    check_graph_executor_result()
-
-
-def test_extern_ccompiler_single_op():
-    @transform.function_pass(opt_level=0)
-    class MyAnnotator:
-        def transform_function(self, func, mod, dev):
-            class Annotator(tvm.relay.ExprMutator):
-                def visit_call(self, call):
-                    new_args = []
-                    for arg in call.args:
-                        ann = compiler_begin(self.visit(arg), "ccompiler")
-                        new_args.append(ann)
-                    new_call = relay.Call(call.op, new_args)
-                    return compiler_end(new_call, "ccompiler")
-
-            return Annotator().visit(func)
-
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-    z = x + y
-    f = relay.Function([x, y], z)
-    x_data = np.random.rand(8, 8).astype("float32")
-    y_data = np.random.rand(8, 8).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = MyAnnotator()(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data)
-
-
-def set_func_attr(func, compile_name, symbol_name):
-    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Compiler", compile_name)
-    func = func.with_attr("global_symbol", symbol_name)
-    return func
-
-
-def test_extern_ccompiler_default_ops():
-    def expected():
-        mod = tvm.IRModule()
-        x = relay.var("x", shape=(8, 8))
-        y = relay.var("y", shape=(8, 8))
-        x0 = relay.var("x0", shape=(8, 8))
-        y0 = relay.var("y0", shape=(8, 8))
-        add = x0 + y0
-        # Function that uses C compiler
-        func = relay.Function([x0, y0], add)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_main_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_ccompiler_main_0")
-        mod[glb_0] = func
-        add_call = relay.Call(glb_0, [x, y])
-        # Function that uses default compiler. Ops are fused in this function.
-        p0 = relay.var("p0", shape=(8, 8))
-        log = relay.log(p0)
-        exp = relay.exp(p0)
-        concat = relay.concatenate([log, exp], axis=0)
-        fused_func = relay.Function([p0], concat)
-        fused_func = fused_func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        fused_call = relay.Call(fused_func, [add_call])
-        main = relay.Function([x, y], fused_call)
-        mod["main"] = main
-        mod = transform.InferType()(mod)
-        return mod
-
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-    add = x + y
-    log = relay.log(add)
-    exp = relay.exp(add)
-    concat = relay.concatenate([log, exp], axis=0)
-    f = relay.Function([x, y], concat)
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = AllowedListAnnotator(["add", "subtract", "multiply"], "ccompiler")(mod)
-    mod = transform.PartitionGraph()(mod)
-    fused_mod = transform.FuseOps(2)(mod)
-    expected_mod = expected()
-    tvm.ir.assert_structural_equal(fused_mod, expected_mod, map_free_vars=True)
-
-    x_data = np.random.rand(8, 8).astype("float32")
-    y_data = np.random.rand(8, 8).astype("float32")
-    np_add = x_data + y_data
-    res = np.concatenate([np.log(np_add), np.exp(np_add)])
-    check_result(mod, {"x": x_data, "y": y_data}, (16, 8), res)
-
-
-def test_extern_compiler_sanitized_ops():
-    def expected():
-        mod = tvm.IRModule()
-        x = relay.var("x", shape=(8, 8))
-        y = relay.var("y", shape=(8, 8))
-        x0 = relay.var("x0", shape=(8, 8))
-        y0 = relay.var("y0", shape=(8, 8))
-        add = x0 + y0
-        # Function that uses C compiler
-        func = relay.Function([x0, y0], add)
-        func = set_func_attr(func, "unsanitary-name++", "tvmgen_default_unsanitary_name___main_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_unsanitary_name___main_0")
-        mod[glb_0] = func
-        add_call = relay.Call(glb_0, [x, y])
-        # Function that uses default compiler. Ops are fused in this function.
-        p0 = relay.var("p0", shape=(8, 8))
-        log = relay.log(p0)
-        exp = relay.exp(p0)
-        concat = relay.concatenate([log, exp], axis=0)
-        fused_func = relay.Function([p0], concat)
-        fused_func = fused_func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        fused_call = relay.Call(fused_func, [add_call])
-        main = relay.Function([x, y], fused_call)
-        mod["main"] = main
-        mod = transform.InferType()(mod)
-        return mod
-
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-    add = x + y
-    log = relay.log(add)
-    exp = relay.exp(add)
-    concat = relay.concatenate([log, exp], axis=0)
-    f = relay.Function([x, y], concat)
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = AllowedListAnnotator(["add", "subtract", "multiply"], "unsanitary-name++")(mod)
-    mod = transform.PartitionGraph()(mod)
-    fused_mod = transform.FuseOps(2)(mod)
-    expected_mod = expected()
-    tvm.ir.assert_structural_equal(fused_mod, expected_mod, map_free_vars=True)
-
-
-def test_extern_ccompiler_multiple_functions():
-    def expected():
-        mod = tvm.IRModule()
-        x = relay.var("x", shape=(8, 8))
-        y = relay.var("y", shape=(8, 8))
-        x0 = relay.var("x0", shape=(8, 8))
-        y0 = relay.var("y0", shape=(8, 8))
-        add = x0 + y0
-        # Function that uses C compiler
-        func = relay.Function([x0, y0], add)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_main_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_ccompiler_main_0")
-        mod[glb_0] = func
-        add_call = relay.Call(glb_0, [x, y])
-        # Function that uses default compiler. Ops are fused in this function.
-        p0 = relay.var("p0", shape=(8, 8))
-        log = relay.log(p0)
-        exp = relay.exp(p0)
-        concat = relay.concatenate([log, exp], axis=0)
-        fused_func = relay.Function([p0], concat)
-        fused_func = fused_func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        fused_call = relay.Call(fused_func, [add_call])
-        main = relay.Function([x, y], fused_call)
-        mod["main"] = main
-        # define the second one
-        a = relay.var("a", shape=(16, 16))
-        b = relay.var("b", shape=(16, 16))
-        a0 = relay.var("a0", shape=(16, 16))
-        b0 = relay.var("b0", shape=(16, 16))
-        add = a0 + b0
-        # Function that uses C compiler
-        func = relay.Function([a0, b0], add)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_subfunction_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_ccompiler_subfunction_0")
-        mod[glb_0] = func
-        add_call = relay.Call(glb_0, [a, b])
-        # Function that uses default compiler. Ops are fused in this function.
-        p0 = relay.var("p0", shape=(16, 16))
-        log = relay.log(p0)
-        exp = relay.exp(p0)
-        concat = relay.concatenate([log, exp], axis=0)
-        fused_func = relay.Function([p0], concat)
-        fused_func = fused_func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        fused_call = relay.Call(fused_func, [add_call])
-        sunfunction = relay.Function([a, b], fused_call)
-        mod["subfunction"] = sunfunction
-        mod = transform.InferType()(mod)
-        return mod
-
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-    add = x + y
-    log = relay.log(add)
-    exp = relay.exp(add)
-    concat = relay.concatenate([log, exp], axis=0)
-    f = relay.Function([x, y], concat)
-    mod = tvm.IRModule()
-    mod["main"] = f
-    # define second function
-    a = relay.var("a", shape=(16, 16))
-    b = relay.var("b", shape=(16, 16))
-    add = a + b
-    log = relay.log(add)
-    exp = relay.exp(add)
-    concat = relay.concatenate([log, exp], axis=0)
-    f2 = relay.Function([a, b], concat)
-    mod["subfunction"] = f2
-    mod = AllowedListAnnotator(["add", "subtract", "multiply"], "ccompiler")(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    fused_mod = transform.FuseOps(2)(mod)
-    expected_mod = expected()
-    tvm.ir.assert_structural_equal(fused_mod, expected_mod, map_free_vars=True)
-
-    x_data = np.random.rand(8, 8).astype("float32")
-    y_data = np.random.rand(8, 8).astype("float32")
-    np_add = x_data + y_data
-    res = np.concatenate([np.log(np_add), np.exp(np_add)])
-    check_result(mod, {"x": x_data, "y": y_data}, (16, 8), res)
-
-
-def test_extern_ccompiler():
-    x = relay.var("x", shape=(2, 2))
-    y = relay.var("y", shape=(2, 2))
-    z = x + x
-    p = y * y
-    f = relay.Function([x, y], p - z)
-    x_data = np.random.rand(2, 2).astype("float32")
-    y_data = np.random.rand(2, 2).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = AllowedListAnnotator(["add", "subtract", "multiply"], "ccompiler")(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    check_result(mod, {"x": x_data, "y": y_data}, (2, 2), (y_data * y_data) - (x_data + x_data))
-
-
-def test_extern_dnnl():
-    if not tvm.get_global_func("relay.ext.dnnl", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (1, 32, 14, 14)
-    w1shape = (32, 1, 3, 3)
-
-    def expected():
-        data0 = relay.var("data", shape=(ishape), dtype=dtype)
-        input0 = relay.var("input", shape=(w1shape), dtype=dtype)
-        depthwise_conv2d_1 = relay.nn.conv2d(
-            data0, input0, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        depthwise_conv2d_2 = relay.nn.conv2d(
-            depthwise_conv2d_1, input0, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
-
-        func = relay.Function([data0, input0], out)
-        func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_main_0")
-        glb_var = relay.GlobalVar("tvmgen_default_dnnl_main_0")
-        mod = tvm.IRModule()
-        mod[glb_var] = func
-        mod = transform.InferType()(mod)
-
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight = relay.var("input", shape=(w1shape), dtype=dtype)
-        main_f = relay.Function([data, weight], glb_var(data, weight))
-        mod["main"] = main_f
-        mod = transform.InferType()(mod)
-
-        return mod
-
-    def get_func():
-        data = relay.var("data", shape=(ishape), dtype=dtype)
-        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
-        depthwise_conv2d_1 = relay.nn.conv2d(
-            data, weight1, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        depthwise_conv2d_2 = relay.nn.conv2d(
-            depthwise_conv2d_1, weight1, kernel_size=(3, 3), padding=(1, 1), groups=32
-        )
-        out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
-
-        return relay.Function([data, weight1], out)
-
-    mod = tvm.IRModule()
-    mod["main"] = WholeGraphAnnotator("dnnl").visit(get_func())
-    mod = transform.PartitionGraph()(mod)
-    mod = transform.InferType()(mod)
-
-    tvm.ir.assert_structural_equal(mod, expected(), map_free_vars=True)
-
-    ref_mod = tvm.IRModule()
-    ref_mod["main"] = get_func()
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)
-
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()).evaluate()(
-        i_data, w1_data
-    )
-    check_result(
-        mod, {"data": i_data, "weight1": w1_data}, (1, 32, 14, 14), ref_res.numpy(), tol=1e-5
-    )
-
-
-def test_extern_dnnl_mobilenet():
-    if not tvm.get_global_func("relay.ext.dnnl", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    dtype = "float32"
-    ishape = (1, 3, 224, 224)
-    ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
-    mod = transform.AnnotateTarget(["dnnl"])(ref_mod)
-    mod = transform.MergeCompilerRegions()(mod)
-    mod = transform.PartitionGraph()(mod)
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-
-    ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()(
-        i_data, **params
-    )
-    te_compiler.get().clear()
-
-    check_result(mod, {"data": i_data}, (1, 1000), ref_res.numpy(), tol=1e-5, params=params)
-
-
-def test_function_lifting():
-    def partition():
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        conv = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-        bn_output = relay.nn.batch_norm(conv, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-
-        func = relay.Function(
-            [data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output.astuple()
-        )
-        mod = tvm.IRModule()
-        mod["main"] = func
-        mod = relay.transform.InferType()(mod)
-        op_list = ["nn.batch_norm", "nn.conv2d"]
-        mod = AllowedListAnnotator(op_list, "test_compiler")(mod)
-
-        opt_pass = tvm.transform.Sequential(
-            [
-                transform.InferType(),
-                transform.PartitionGraph(),
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.AlterOpLayout(),
-            ]
-        )
-
-        with tvm.transform.PassContext(opt_level=3):
-            mod = opt_pass(mod)
-
-        return mod
-
-    def expected():
-        # function for batch_norm
-        data0 = relay.var("data0", relay.TensorType((1, 16, 224, 224), "float32"))
-        mod = tvm.IRModule()
-        bn_gamma = relay.var("bn_gamma1", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta1", relay.TensorType((16,), "float32"))
-        bn_mmean = relay.var("bn_mean1", relay.TensorType((16,), "float32"))
-        bn_mvar = relay.var("bn_var1", relay.TensorType((16,), "float32"))
-
-        bn = relay.nn.batch_norm(data0, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-        func0 = relay.Function([data0, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn.astuple())
-        func0 = set_func_attr(func0, "test_compiler", "tvmgen_default_test_compiler_main_2")
-        gv0 = relay.GlobalVar("tvmgen_default_test_compiler_main_2")
-        mod[gv0] = func0
-        mod = transform.InferType()(mod)
-
-        # function for conv2d
-        data1 = relay.var("data1", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight1 = relay.var("weight1", relay.TensorType((16, 3, 3, 3), "float32"))
-        conv = relay.nn.conv2d(
-            data=data1, weight=weight1, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-        func1 = relay.Function([data1, weight1], conv)
-        func1 = set_func_attr(func1, "test_compiler", "tvmgen_default_test_compiler_main_0")
-        gv1 = relay.GlobalVar("tvmgen_default_test_compiler_main_0")
-        mod[gv1] = func1
-        mod = transform.InferType()(mod)
-
-        # main function
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma0 = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta0 = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mmean0 = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_mvar0 = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        call1 = gv1(data, weight)
-        call0 = gv0(call1, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0)
-        mod["main"] = relay.Function(
-            [data, weight, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0], call0
-        )
-        mod = transform.InferType()(mod)
-        return mod
-
-    partitioned = partition()
-    ref_mod = expected()
-    tvm.ir.assert_structural_equal(partitioned, ref_mod, map_free_vars=True)
-
-
-def test_function_lifting_inline():
-    def partition():
-        data = relay.var("data", relay.TensorType((1, 16, 224, 224), "float32"))
-        bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        bn_output = relay.nn.batch_norm(data, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-
-        func = relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output.astuple())
-        mod = tvm.IRModule()
-        mod["main"] = func
-        op_list = ["nn.batch_norm", "nn.conv2d"]
-        mod = AllowedListAnnotator(op_list, "test_compiler")(mod)
-
-        opt_pass = tvm.transform.Sequential(
-            [
-                transform.InferType(),
-                transform.PartitionGraph(),
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.AlterOpLayout(),
-                transform.Inline(),
-            ]
-        )
-
-        with tvm.transform.PassContext(opt_level=3):
-            mod = opt_pass(mod)
-
-        return mod
-
-    def expected():
-        # function for batch_norm
-        data0 = relay.var("data0", relay.TensorType((1, 16, 224, 224), "float32"))
-        mod = tvm.IRModule()
-        bn_gamma = relay.var("bn_gamma1", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta1", relay.TensorType((16,), "float32"))
-        bn_mmean = relay.var("bn_mean1", relay.TensorType((16,), "float32"))
-        bn_mvar = relay.var("bn_var1", relay.TensorType((16,), "float32"))
-
-        bn = relay.nn.batch_norm(data0, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-        func0 = relay.Function([data0, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn.astuple())
-        func0 = set_func_attr(func0, "test_compiler", "tvmgen_default_test_compiler_main_0")
-
-        # main function
-        data = relay.var("data", relay.TensorType((1, 16, 224, 224), "float32"))
-        bn_gamma0 = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta0 = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mmean0 = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_mvar0 = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        call0 = func0(data, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0)
-        mod["main"] = relay.Function([data, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0], call0)
-        mod = transform.InferType()(mod)
-        return mod
-
-    partitioned = partition()
-    ref_mod = expected()
-    tvm.ir.assert_structural_equal(partitioned, ref_mod, map_free_vars=True)
-
-
-def test_constant_propagation():
-    ones = np.ones(shape=(8, 8), dtype="float32")
-
-    def expected():
-        mod = tvm.IRModule()
-        y = relay.var("y", shape=(8, 8))
-        x0 = relay.const(ones)
-        y0 = relay.var("y0", shape=(8, 8))
-        add = x0 + y0
-        # Function that uses C compiler
-        func = relay.Function([y0], add)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_main_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_ccompiler_main_0")
-        mod[glb_0] = func
-        mod = relay.transform.InferType()(mod)
-        add_call = relay.Call(glb_0, [y])
-        log = relay.log(add_call)
-        main = relay.Function([y], log)
-        mod["main"] = main
-        mod = relay.transform.InferType()(mod)
-        return mod
-
-    x = relay.var("x", shape=(8, 8))
-    y = relay.var("y", shape=(8, 8))
-    add = x + y
-    log = relay.log(add)
-    f = relay.Function([x, y], log)
-    f = bind_params_by_name(f, {"x": tvm.nd.array(ones)})
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = AllowedListAnnotator(["add"], "ccompiler")(mod)
-    mod = transform.PartitionGraph()(mod)
-    mod = relay.transform.InferType()(mod)
-
-    expected_mod = expected()
-    expected_mod = relay.transform.InferType()(expected_mod)
-    tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True)
-
-    y_data = np.random.rand(8, 8).astype("float32")
-    np_add = ones + y_data
-    check_result(mod, {"y": y_data}, (8, 8), np.log(np_add))
-
-
-def test_multiple_outputs():
-    def create_graph():
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_var = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        data_cb = compiler_begin(data, "test_target")
-        weight_cb = compiler_begin(weight, "test_target")
-        bn_gamma_cb = compiler_begin(bn_gamma, "test_target")
-        bn_beta_cb = compiler_begin(bn_beta, "test_target")
-        bn_mean_cb = compiler_begin(bn_mean, "test_target")
-        bn_var_cb = compiler_begin(bn_var, "test_target")
-
-        conv_o = relay.nn.conv2d(
-            data=data_cb, weight=weight_cb, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-
-        bn_o = relay.nn.batch_norm(conv_o, bn_gamma_cb, bn_beta_cb, bn_mean_cb, bn_var_cb)
-
-        relu_o = relay.nn.relu(bn_o[0])
-        relu_o_ce = compiler_end(relu_o, "test_target")
-
-        bn_omean = bn_o[1]
-        rebn_omean_ce = compiler_end(bn_omean, "test_target")
-        bn_ovar = bn_o[2]
-        bn_ovar_ce = compiler_end(bn_ovar, "test_target")
-
-        dummy_mean_abs = relay.abs(rebn_omean_ce)
-        dummy_ovar_abs = relay.abs(bn_ovar_ce)
-        dummy_tuple = relay.Tuple((relu_o_ce, dummy_mean_abs, dummy_ovar_abs))
-
-        func = relay.Function([data, weight, bn_gamma, bn_beta, bn_mean, bn_var], dummy_tuple)
-        return func
-
-    def expected():
-        mod = tvm.IRModule()
-
-        # function 0
-        data = relay.var("test_target_0_i0", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("test_target_0_i1", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma = relay.var("test_target_0_i2", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("test_target_0_i3", relay.TensorType((16,), "float32"))
-        bn_mean = relay.var("test_target_0_i4", relay.TensorType((16,), "float32"))
-        bn_var = relay.var("test_target_0_i5", relay.TensorType((16,), "float32"))
-
-        conv_o = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
-        )
-
-        bn_o = relay.nn.batch_norm(conv_o, bn_gamma, bn_beta, bn_mean, bn_var)
-
-        relu_o = relay.nn.relu(bn_o[0])
-        tuple_o = relay.Tuple((relu_o, bn_o[1], bn_o[2]))
-
-        func0 = relay.Function([data, weight, bn_gamma, bn_beta, bn_mean, bn_var], tuple_o)
-        func0 = set_func_attr(func0, "test_target", "tvmgen_default_test_target_main_0")
-        gv0 = relay.GlobalVar("tvmgen_default_test_target_main_0")
-        mod[gv0] = func0
-        mod = relay.transform.InferType()(mod)
-
-        # body
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
-        bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
-        bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
-        bn_mean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
-        bn_var = relay.var("bn_var", relay.TensorType((16,), "float32"))
-
-        f0_o = gv0(data, weight, bn_gamma, bn_beta, bn_mean, bn_var)
-        f0_relu_o = relay.TupleGetItem(f0_o, 0)
-        f0_mean_o = relay.TupleGetItem(f0_o, 1)
-        f0_var_o = relay.TupleGetItem(f0_o, 2)
-
-        f0_mean_abs = relay.abs(f0_mean_o)
-        f0_var_abs = relay.abs(f0_var_o)
-        main_tuple = relay.Tuple((f0_relu_o, f0_mean_abs, f0_var_abs))
-
-        func = relay.Function([data, weight, bn_gamma, bn_beta, bn_mean, bn_var], main_tuple)
-        mod["main"] = func
-        mod = relay.transform.InferType()(mod)
-        return mod
-
-    mod = tvm.IRModule()
-    mod["main"] = create_graph()
-    ref_mod = expected()
-    partitioned = transform.PartitionGraph()(mod)
-    tvm.ir.assert_structural_equal(partitioned, ref_mod, map_free_vars=True)
-
-
-def test_mixed_single_multiple_outputs():
-    def create_graph():
-        data = relay.var("data", shape=(10, 10))
-
-        cb_1 = compiler_begin(data, "test_target")
-        O_1 = relay.abs(cb_1)
-        ce_2 = compiler_end(O_1, "test_target")
-        O_2 = relay.nn.relu(O_1)
-        ce_3 = compiler_end(O_2, "test_target")
-
-        X = relay.tanh(ce_2)
-
-        cb_3 = compiler_begin(ce_3, "test_target")
-        cb_4 = compiler_begin(X, "test_target")
-        O_3 = relay.add(cb_3, cb_4)
-        ce_4 = compiler_end(O_3, "test_target")
-
-        func = relay.Function([data], ce_4)
-        return func
-
-    def expected():
-        mod = tvm.IRModule()
-
-        # function 1
-        f1_cb1 = relay.var("test_target_0_i0", shape=(10, 10))
-        f1_O_1 = relay.abs(f1_cb1)
-        f1_O_2 = relay.nn.relu(f1_O_1)
-        f1_out = relay.Tuple((f1_O_2, f1_O_1))
-        func1 = relay.Function([f1_cb1], f1_out)
-        func1 = set_func_attr(func1, "test_target", "tvmgen_default_test_target_main_0")
-        gv1 = relay.GlobalVar("tvmgen_default_test_target_main_0")
-        mod[gv1] = func1
-        mod = relay.transform.InferType()(mod)
-
-        # function 0
-        f2_cb3 = relay.var("test_target_1_i0", shape=(10, 10))
-        f2_cb4 = relay.var("test_target_1_i1", shape=(10, 10))
-        f2_O_3 = relay.add(f2_cb3, f2_cb4)
-        func0 = relay.Function([f2_cb3, f2_cb4], f2_O_3)
-        func0 = set_func_attr(func0, "test_target", "tvmgen_default_test_target_main_1")
-        gv0 = relay.GlobalVar("tvmgen_default_test_target_main_1")
-        mod[gv0] = func0
-        mod = relay.transform.InferType()(mod)
-
-        # body
-        data = relay.var("data", shape=(10, 10))
-        tuple_out = gv1(data)
-        ce_2 = relay.TupleGetItem(tuple_out, 1)
-        ce_3 = relay.TupleGetItem(tuple_out, 0)
-
-        X = relay.tanh(ce_2)
-        ce_4 = gv0(ce_3, X)
-        func = relay.Function([data], ce_4)
-        mod["main"] = func
-        mod = relay.transform.InferType()(mod)
-        return mod
-
-    mod = tvm.IRModule()
-    mod["main"] = create_graph()
-    mod = transform.InferType()(mod)
-
-    ref_mod = expected()
-
-    partitioned = transform.PartitionGraph()(mod)
-    tvm.ir.assert_structural_equal(partitioned, ref_mod, map_free_vars=True)
-
-
-def test_dnnl_fuse():
-    dnnl_patterns = get_pattern_table("dnnl")
-    for pattern in dnnl_patterns:
-        if pattern[0] == "dnnl.conv2d_bias_relu":
-            conv2d_bias_relu_pat = pattern
-        elif pattern[0] == "dnnl.conv2d_bias_sigmoid":
-            conv2d_bias_sigmoid_pat = pattern
-        elif pattern[0] == "dnnl.conv2d_bias":
-            conv2d_bias_pat = pattern
-        elif pattern[0] == "dnnl.conv2d_relu":
-            conv2d_relu_pat = pattern
-        elif pattern[0] == "dnnl.conv2d_sigmoid":
-            conv2d_sigmoid_pat = pattern
-        elif pattern[0] == "dnnl.conv2d_bias_sum":
-            conv2d_bias_sum_pat = pattern
-        elif pattern[0] == "dnnl.conv2d_bias_sum_relu":
-            conv2d_bias_sum_relu_pat = pattern
-
-    def get_blocks(
-        prefix,
-        data,
-        in_channel,
-        out_channel,
-        include_bias_add=True,
-        include_bn=True,
-        include_sigmoid=False,
-    ):
-        weight = relay.var(prefix + "weight")
-        bias = relay.var(prefix + "bias")
-        bn_gamma = relay.var(prefix + "bn_gamma")
-        bn_beta = relay.var(prefix + "bn_beta")
-        bn_mmean = relay.var(prefix + "bn_mean")
-        bn_mvar = relay.var(prefix + "bn_var")
-
-        layer = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=out_channel, padding=(1, 1)
-        )
-        if include_bias_add:
-            layer = relay.nn.bias_add(layer, bias)
-        if include_bn:
-            bn_output = relay.nn.batch_norm(layer, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-            layer = bn_output[0]
-        if include_sigmoid:
-            # dummy layer to prevent pattern detection
-            layer = relay.sigmoid(layer)
-        layer = relay.nn.relu(layer)
-        return layer
-
-    def get_net(include_bias_add=True, include_bn=True, include_sigmoid=False):
-        data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-        block1 = get_blocks("block1_", data, 3, 8, include_bias_add, include_bn, include_sigmoid)
-        # The second block is always conv + relu, to make it more interesting
-        block2 = get_blocks("block2_", block1, 8, 8, False, False, include_sigmoid)
-        return relay.Function(relay.analysis.free_vars(block2), block2)
-
-    def get_partitoned_mod(mod, params, pattern_table):
-        # This is required for constant folding
-        mod["main"] = bind_params_by_name(mod["main"], params)
-
-        remove_bn_pass = tvm.transform.Sequential(
-            [
-                transform.InferType(),
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.FoldScaleAxis(),
-            ]
-        )
-        # fold consecutive add ops to simplify pattern `conv2d-bias_add-bn-relu`
-        remove_linear_pass = tvm.transform.Sequential(
-            [
-                transform.SimplifyExpr(),
-                transform.FoldConstant(),
-            ]
-        )
-        composite_partition = tvm.transform.Sequential(
-            [
-                transform.CanonicalizeOps(),
-                remove_bn_pass,
-                remove_linear_pass,
-                transform.MergeComposite(pattern_table),
-                transform.AnnotateTarget("dnnl"),
-                transform.PartitionGraph(),
-            ]
-        )
-
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            return composite_partition(mod)
-
-    def test_detect_pattern(
-        pattern_table, include_bias_add, include_bn, include_sigmoid, num_expected_partition
-    ):
-        net = get_net(include_bias_add, include_bn, include_sigmoid)
-        mod, params = tvm.relay.testing.create_workload(net)
-        mod = get_partitoned_mod(mod, params, pattern_table)
-        assert len(mod.functions) - 1 == num_expected_partition  # -1 for main
-
-    def test_sum_pattern(pattern_table, num_expected_partition):
-        def get_conv2d_bn_sum_relu(
-            x_shape=(1, 32, 8, 8),
-            k_shape=(16, 32, 3, 3),
-            sum_shape=(1, 16, 6, 6),
-            dtype="float32",
-        ):
-            x = relay.var("x", shape=(x_shape), dtype=dtype)
-            kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype))
-            bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
-            beta = relay.const(np.zeros(k_shape[0]).astype(dtype))
-            gamma = relay.const(np.ones(k_shape[0]).astype(dtype))
-            moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype))
-            moving_var = relay.const(np.ones(k_shape[0]).astype(dtype))
-            sum_data = relay.var("data1", shape=sum_shape, dtype=dtype)
-
-            dic = {"x": x_shape, "bias": (k_shape[0],), "sum_data": sum_shape}
-            param_lst = ["bias", "sum_data"]
-
-            conv = relay.nn.conv2d(
-                x,
-                kernel,
-                channels=k_shape[0],
-                kernel_size=k_shape[2:4],
-            )
-            conv_bias = relay.nn.bias_add(conv, bias)
-            conv_bias_bn, _, _ = relay.nn.batch_norm(
-                conv_bias,
-                gamma=gamma,
-                beta=beta,
-                moving_mean=moving_mean,
-                moving_var=moving_var,
-                axis=1,
-                center=True,
-                scale=True,
-                epsilon=1e-5,
-            )
-            conv_bias_bn_sum = relay.add(conv_bias_bn, sum_data)
-            return relay.nn.relu(conv_bias_bn_sum), dic, param_lst
-
-        net, dic, param_lst = get_conv2d_bn_sum_relu()
-        net = tvm.IRModule.from_expr(net)
-        params = {x: np.random.uniform(-1, 1, dic[x]).astype("float32") for x in param_lst}
-        mod = get_partitoned_mod(net, params, pattern_table)
-        assert len(mod.functions) - 1 == num_expected_partition  # -1 for main
-
-    def test_partition():
-        # conv + bn + relu, conv + relu -> fused conv_bias_relu, conv, and relu
-        test_detect_pattern([conv2d_bias_relu_pat], False, True, False, 3)
-        # conv + bn + relu, conv + relu -> conv, bias, relu, and fused conv_relu
-        test_detect_pattern([conv2d_relu_pat], False, True, False, 4)
-        # conv + bn + relu, conv + relu -> fused conv_bias_relu, and fused conv_relu
-        test_detect_pattern([conv2d_bias_relu_pat, conv2d_relu_pat], False, True, False, 2)
-        # conv + bias_add + bn + relu, conv + relu -> fused conv_bias_relu, and fused conv_relu
-        test_detect_pattern([conv2d_bias_relu_pat, conv2d_relu_pat], True, True, False, 2)
-        # conv + relu, conv + relu -> two fused conv_relu
-        test_detect_pattern([conv2d_relu_pat], False, False, False, 2)
-        # conv + relu, conv + relu -> no fusion, 4 partition each with a single op
-        test_detect_pattern([conv2d_bias_relu_pat], False, False, False, 4)
-        # conv + bn + sigmoid + relu, conv + sigmoid + relu -> no fusion
-        test_detect_pattern([conv2d_bias_relu_pat, conv2d_relu_pat], False, True, True, 7)
-        # conv + bias_add + bn + sigmoid + relu, conv + sigmoid + relu -> fused conv_bias
-        # and single op sigmoid, relu, conv, sigmoid, relu
-        test_detect_pattern([conv2d_bias_pat, conv2d_relu_pat], True, True, True, 6)
-        # conv + bias_add + bn + sigmoid + relu, conv + sigmoid + relu -> fused conv_bias_sigmoid
-        # and single op relu, conv, sigmoid, relu
-        test_detect_pattern([conv2d_bias_sigmoid_pat, conv2d_relu_pat], True, True, True, 5)
-        # conv + bias_add + bn + sigmoid + relu, conv + sigmoid + relu -> fused conv_bias_sigmoid,
-        # fused conv_sigmoid and single op relu, relu
-        test_detect_pattern([conv2d_bias_sigmoid_pat, conv2d_sigmoid_pat], True, True, True, 4)
-        # conv + bias_add + bn + add + relu -> fused conv_bias_sum, relu
-        test_sum_pattern([conv2d_bias_sum_pat], 2)
-        # conv + bias_add + bn + add + relu -> fused conv_bias_sum_relu,
-        test_sum_pattern([conv2d_bias_sum_relu_pat], 1)
-
-    def test_partition_mobilenet():
-        mod, params = relay.testing.mobilenet.get_workload()
-        mod = get_partitoned_mod(mod, params, dnnl_patterns)
-        # 27 fused conv + bn + relu, one dense, one softmax and one global_avg_pooling
-        assert len(mod.functions) - 1 == 30  # -1 for main
-
-    def test_exec(mod, params, ref_mod, ref_params, out_shape):
-        ishape = (1, 3, 224, 224)
-        i_data = np.random.randn(*ishape).astype(np.float32)
-        ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()(
-            i_data, **ref_params
-        )
-        te_compiler.get().clear()
-
-        mod = get_partitoned_mod(mod, params, dnnl_patterns)
-
-        check_result(mod, {"data": i_data}, out_shape, ref_res.numpy(), tol=1e-5, params=params)
-
-    test_partition()
-    test_partition_mobilenet()
-
-    if not tvm.get_global_func("relay.ext.dnnl", True):
-        print("skip because DNNL codegen is not available")
-        return
-
-    net = get_net()
-    mod, params = tvm.relay.testing.create_workload(net)
-    ref_mod, ref_params = tvm.relay.testing.create_workload(net)
-    test_exec(mod, params, ref_mod, ref_params, (1, 8, 224, 224))
-
-    mod, params = relay.testing.mobilenet.get_workload()
-    ref_mod, ref_params = relay.testing.mobilenet.get_workload()
-    test_exec(mod, params, ref_mod, ref_params, (1, 1000))
-
-
-def test_multiple_use_of_an_output():
-    def expected_same_output_region():
-        mod = tvm.IRModule()
-        x = relay.var("x", shape=(8, 8))
-        y = relay.var("y", shape=(8, 8))
-        z = relay.var("z", shape=(8, 8))
-        x0 = relay.var("x0", shape=(8, 8))
-        y0 = relay.var("y0", shape=(8, 8))
-        log = relay.log(x0)
-        sub = x0 - y0
-        mul = log * sub
-        # The partitioned graph contains log, subtract, and multiply
-        func = relay.Function([x0, y0], mul)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_main_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_ccompiler_main_0")
-        mod[glb_0] = func
-        mod = transform.InferType()(mod)
-
-        add = x + y
-        call = relay.Call(glb_0, [add, z])
-        main = relay.Function([x, y, z], call)
-        mod["main"] = main
-        mod = transform.InferType()(mod)
-        return mod
-
-    def expected_different_output_region():
-        mod = tvm.IRModule()
-        x = relay.var("x", shape=(8, 8))
-        y = relay.var("y", shape=(8, 8))
-        z = relay.var("z", shape=(8, 8))
-
-        # The partitioned graph contains log
-        i0 = relay.var("i0", shape=(8, 8))
-        log = relay.log(i0)
-        func = relay.Function([i0], log)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_main_0")
-        glb_0 = relay.GlobalVar("tvmgen_default_ccompiler_main_0")
-        mod[glb_0] = func
-        mod = transform.InferType()(mod)
-
-        # The partitioned graph contains subtract
-        x0 = relay.var("x0", shape=(8, 8))
-        y0 = relay.var("y0", shape=(8, 8))
-        sub = x0 - y0
-        func = relay.Function([x0, y0], sub)
-        func = set_func_attr(func, "ccompiler", "tvmgen_default_ccompiler_main_1")
-        glb_1 = relay.GlobalVar("tvmgen_default_ccompiler_main_1")
-        mod[glb_1] = func
-        mod = transform.InferType()(mod)
-
-        add = x + y
-        call_log = relay.Call(glb_0, [add])
-        call_sub = relay.Call(glb_1, [add, z])
-        main = relay.Function([x, y, z], call_log * call_sub)
-        mod["main"] = main
-        mod = transform.InferType()(mod)
-        return mod
-
-    def get_mod():
-        x = relay.var("x", shape=(8, 8))
-        y = relay.var("y", shape=(8, 8))
-        z = relay.var("z", shape=(8, 8))
-        add = x + y
-        sub = add - z
-        log = relay.log(add)
-        sub1 = log * sub
-        f = relay.Function([x, y, z], sub1)
-        mod = tvm.IRModule()
-        mod["main"] = f
-        return mod
-
-    def test_same_output_region():
-        mod = get_mod()
-        mod = AllowedListAnnotator(["subtract", "log", "multiply"], "ccompiler")(mod)
-        mod = transform.MergeCompilerRegions()(mod)
-        mod = transform.PartitionGraph()(mod)
-
-        expected_mod = expected_same_output_region()
-        tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True)
-
-    def test_different_output_region():
-        mod = get_mod()
-        mod = AllowedListAnnotator(["subtract", "log"], "ccompiler")(mod)
-        mod = transform.MergeCompilerRegions()(mod)
-        mod = transform.PartitionGraph()(mod)
-
-        expected_mod = expected_different_output_region()
-        tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True)
-
-    test_same_output_region()
-    test_different_output_region()
-
-
-def test_duplicate_outputs():
-    target = "test_duplicate_outputs"
-
-    @tvm.ir.register_op_attr("abs", "target." + target)
-    def abs(expr):  # pylint: disable=unused-variable
-        return True
-
-    def create_graph():
-        data = relay.var("data", shape=(10, 10))
-        x = relay.abs(data)
-        out_1 = relay.nn.relu(x)
-        out_2 = relay.tanh(x)
-        out_3 = relay.log(x)
-        out = relay.Tuple([out_1, out_2, out_3])
-        func = relay.Function([data], out)
-        return func
-
-    def expected():
-        mod = tvm.IRModule()
-
-        # function 0
-        f0_i0 = relay.var(target + "_0_i0", shape=(10, 10))
-        f0_o0 = relay.abs(f0_i0)
-        func0 = relay.Function([f0_i0], f0_o0)
-
-        func0 = func0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        func0 = func0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        func0 = func0.with_attr("Compiler", target)
-        func0 = func0.with_attr("global_symbol", "tvmgen_default_" + target + "_main_0")
-        gv0 = relay.GlobalVar("tvmgen_default_" + target + "_main_0")
-        mod[gv0] = func0
-        mod = transform.InferType()(mod)
-
-        # body
-        data = relay.var("data", shape=(10, 10))
-        function_out = gv0(data)
-        out_1 = relay.nn.relu(function_out)
-        out_2 = relay.tanh(function_out)
-        out_3 = relay.log(function_out)
-        out = relay.Tuple([out_1, out_2, out_3])
-        func = relay.Function([data], out)
-        mod["main"] = func
-        mod = transform.InferType()(mod)
-        return mod
-
-    mod = tvm.IRModule()
-    mod["main"] = create_graph()
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.AnnotateTarget(target),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    ref_mod = expected()
-    partitioned = seq(mod)
-    tvm.ir.assert_structural_equal(partitioned, ref_mod, map_free_vars=True)
-
-
-def test_duplicate_merge_and_tuplegetitem():
-    target = "test_duplicate_merge_and_tuplegetitem"
-
-    @tvm.ir.register_op_attr("nn.batch_norm", "target." + target)
-    def batch_norm(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("nn.relu", "target." + target)
-    def relu(expr):  # pylint: disable=unused-variable
-        return True
-
-    def create_graph():
-        data = relay.var("data", shape=(10, 10))
-        bn_gamma = relay.var("bn_gamma")
-        bn_beta = relay.var("bn_beta")
-        bn_mmean = relay.var("bn_mean")
-        bn_mvar = relay.var("bn_var")
-        x = relay.nn.batch_norm(data, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-        out_1 = relay.nn.relu(x[0])
-        bn_out_1 = x[1]
-        out_2 = relay.tanh(bn_out_1)
-        out_3 = relay.log(bn_out_1)
-        out = relay.Tuple([out_1, out_2, out_3])
-        func = relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], out)
-        return func
-
-    def expected():
-        mod = tvm.IRModule()
-
-        # function 0
-        f0_i0 = relay.var(target + "_0_i0", shape=(10, 10))
-        f0_i1 = relay.var(target + "_0_i1")
-        f0_i2 = relay.var(target + "_0_i2")
-        f0_i3 = relay.var(target + "_0_i3")
-        f0_i4 = relay.var(target + "_0_i4")
-        f0_n0 = relay.nn.batch_norm(f0_i0, f0_i1, f0_i2, f0_i3, f0_i4)
-        f0_n1 = f0_n0[1]
-        f0_n2 = relay.nn.relu(f0_n0[0])
-        f0_o0 = relay.Tuple([f0_n2, f0_n1])
-        func0 = relay.Function([f0_i0, f0_i1, f0_i2, f0_i3, f0_i4], f0_o0)
-
-        func0 = func0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        func0 = func0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        func0 = func0.with_attr("Compiler", target)
-        func0 = func0.with_attr("global_symbol", "tvmgen_default_" + target + "_main_0")
-        gv0 = relay.GlobalVar("tvmgen_default_" + target + "_main_0")
-        mod[gv0] = func0
-        mod = transform.InferType()(mod)
-
-        # body
-        data = relay.var("data", shape=(10, 10))
-        bn_gamma = relay.var("bn_gamma")
-        bn_beta = relay.var("bn_beta")
-        bn_mmean = relay.var("bn_mean")
-        bn_mvar = relay.var("bn_var")
-        function_out = gv0(data, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-        get_out0 = relay.TupleGetItem(function_out, 0)
-        get_out1 = relay.TupleGetItem(function_out, 1)
-        out_2 = relay.tanh(get_out1)
-        out_3 = relay.log(get_out1)
-        out = relay.Tuple([get_out0, out_2, out_3])
-        func = relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], out)
-        mod["main"] = func
-        mod = transform.InferType()(mod)
-        return mod
-
-    mod = tvm.IRModule()
-    mod["main"] = create_graph()
-    mod = transform.InferType()(mod)
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.AnnotateTarget(target),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    ref_mod = expected()
-    partitioned = seq(mod)
-    tvm.ir.assert_structural_equal(partitioned, ref_mod, map_free_vars=True)
-
-
-def test_constant_tuples():
-    @tvm.ir.register_op_attr("qnn.concatenate", "target.const_tuples")
-    def add(expr):  # pylint: disable=unused-variable
-        return True
-
-    def create_graph():
-        a = relay.var("a", shape=(10, 10), dtype="uint8")
-        b = relay.var("b", shape=(10, 10), dtype="uint8")
-        a1 = relay.abs(a)
-
-        zeroi = relay.const(1, "int32")
-        zerof = relay.const(0, "float32")
-        con = relay.qnn.op.concatenate(
-            (a1, b),
-            input_scales=(zerof, zerof),
-            input_zero_points=(zeroi, zeroi),
-            output_scale=zerof,
-            output_zero_point=zeroi,
-            axis=1,
-        )
-
-        f = relay.Function([a, b], con)
-        mod = tvm.IRModule.from_expr(f)
-        mod = transform.InferType()(mod)
-        return mod
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.AnnotateTarget("const_tuples"),
-            transform.InferType(),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    partitioned = seq(create_graph())
-
-    concat = partitioned["tvmgen_default_const_tuples_main_0"].body
-    assert type(concat.args[1]) == relay.Tuple
-    assert type(concat.args[2]) == relay.Tuple
-    assert type(concat.args[3]) == relay.Constant
-    assert type(concat.args[4]) == relay.Constant
-
-
-def test_flatten_tuple_output():
-    target = "test_flatten_tuple_output"
-
-    @tvm.ir.register_op_attr("split", "target." + target)
-    def split(expr):  # pylint: disable=unused-variable
-        return True
-
-    @tvm.ir.register_op_attr("abs", "target." + target)
-    def abs(expr):  # pylint: disable=unused-variable
-        return True
-
-    def create_graph():
-        a = relay.var("a", shape=(10, 10), dtype="uint8")
-
-        a_split = relay.split(a, 2)
-        a_split_0 = relay.TupleGetItem(a_split.astuple(), 0)
-        a_split_0_abs = relay.abs(a_split_0)
-
-        a_con = relay.concatenate(a_split, 0)
-        a_split_0_relu = relay.nn.relu(a_split_0_abs)
-
-        out = relay.Tuple((a_con, a_split_0_relu))
-        f = relay.Function([a], out)
-        mod = tvm.IRModule.from_expr(f)
-        mod = transform.InferType()(mod)
-        return mod
-
-    def expected():
-        mod = tvm.IRModule()
-
-        # function 0
-        f0_i0 = relay.var(target + "_0_i0", shape=(10, 10), dtype="uint8")
-        a_split = relay.split(f0_i0, 2)
-        a_split_0 = relay.TupleGetItem(a_split.astuple(), 0)
-        a_split_1 = relay.TupleGetItem(a_split.astuple(), 1)
-        a_split_abs_in = relay.TupleGetItem(a_split.astuple(), 0)
-        abs = relay.abs(a_split_abs_in)
-        tuple_out = relay.Tuple((a_split_0, a_split_1, abs))
-        func0 = relay.Function([f0_i0], tuple_out)
-
-        func0 = func0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-        func0 = func0.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-        func0 = func0.with_attr("Compiler", target)
-        func0 = func0.with_attr("global_symbol", "tvmgen_default_" + target + "_main_0")
-        gv0 = relay.GlobalVar("tvmgen_default_" + target + "_main_0")
-        mod[gv0] = func0
-        mod = transform.InferType()(mod)
-
-        # body
-        data = relay.var("a", shape=(10, 10), dtype="uint8")
-        f_out = gv0(data)
-        f_out_0 = relay.TupleGetItem(f_out, 0)
-        f_out_1 = relay.TupleGetItem(f_out, 1)
-        tuple = relay.Tuple((f_out_0, f_out_1))
-        concat = relay.concatenate(tuple, 0)
-        f_out_2 = relay.TupleGetItem(f_out, 2)
-        relu = relay.nn.relu(f_out_2)
-        ret_tuple = relay.Tuple((concat, relu))
-        mod["main"] = relay.Function([data], ret_tuple)
-        mod = transform.InferType()(mod)
-        return mod
-
-    seq = tvm.transform.Sequential(
-        [
-            transform.AnnotateTarget(target),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-        ]
-    )
-
-    partitioned = seq(create_graph())
-    partitioned = transform.InferType()(partitioned)
-    expected_mod = transform.InferType()(expected())
-    tvm.ir.assert_structural_equal(partitioned, expected_mod, map_free_vars=True)
-
-
-def test_tuple_output_exec():
-    """Test C codegen and runtime for a subgraph with a tuple output"""
-    a = relay.var("a", shape=(10, 10), dtype="float32")
-    b = relay.var("b", shape=(10, 10), dtype="float32")
-    ba = relay.annotation.compiler_begin(a, "ccompiler")
-    bb = relay.annotation.compiler_begin(b, "ccompiler")
-    add = relay.add(ba, bb)
-    sub = relay.subtract(ba, bb)
-    out = relay.Tuple((add, sub))
-    eout = relay.annotation.compiler_end(out, "ccompiler")
-    func = relay.Function([a, b], eout)
-
-    mod = tvm.IRModule()
-    mod["main"] = func
-    mod = transform.InferType()(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    a_data = np.random.rand(10, 10).astype("float32")
-    b_data = np.random.rand(10, 10).astype("float32")
-
-    check_result(
-        mod,
-        {"a": a_data, "b": b_data},
-        [(10, 10), (10, 10)],
-        [(a_data + b_data), (a_data - b_data)],
-    )
-
-
-def test_extern_opt():
-    def Optimize(mod):
-        return relay.transform.FoldConstant()(mod)
-
-    tvm.register_func("relay.ext.test_target.optimize", Optimize)
-
-    x = relay.var("x", shape=(2, 2))
-    y0 = relay.var("y0", shape=(2, 2))
-    y1 = relay.var("y1", shape=(2, 2))
-    yy0 = relay.annotation.compiler_begin(y0, "test_target")
-    yy1 = relay.annotation.compiler_begin(y1, "test_target")
-    z = yy0 + yy1
-    end = relay.annotation.compiler_end(z, "test_target")
-    f = relay.Function([x, y0, y1], end * x)
-    c = np.ones(shape=(2, 2), dtype="float32")
-    f = bind_params_by_name(f, {"y0": tvm.nd.array(c), "y1": tvm.nd.array(c)})
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod = transform.InferType()(mod)
-    mod = transform.PartitionGraph()(mod)
-
-    try:
-        t0 = mod["tvmgen_default_test_target_main_0"]
-    except:
-        raise KeyError("test_target_main_0 not found")
-
-    assert isinstance(t0.body, relay.Constant)
-    expected = np.empty([2, 2])
-    expected.fill(2)
-    tvm.testing.assert_allclose(t0.body.data.numpy(), expected, rtol=1e-5, atol=1e-5)
-
-
-def test_preserve_type_import():
-    """Test to make sure type definition and imports are preserved during the BYOC pipeline."""
-    from tvm.relay.prelude import Prelude, StaticTensorArrayOps
-
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        write = p.get_global_var_static("tensor_array_write", dtype, shape)
-        gather = p.get_global_var_static("tensor_array_gather", dtype, shape)
-        v = relay.var("v")
-        indice = relay.var("indice")
-        init_tensor_array = tensor_array(relay.const(3))
-        tensor_array1 = write(init_tensor_array, relay.const(0), tensor(v))
-        tensor_array2 = write(tensor_array1, relay.const(1), tensor(v))
-        tensor_array3 = write(tensor_array2, relay.const(2), tensor(v))
-        out = gather(tensor_array3, indice)
-        mod["main"] = relay.Function([v, indice], out)
-        mod = transform.RemoveUnusedFunctions()(mod)
-        mod = transform.PartitionGraph()(mod)
-
-    run("float32", [2, 3])
-
-
-def test_not_bind_constant():
-    def get_net(prefix, data, out_channel):
-        weight = relay.var(prefix + "weight")
-        bn_gamma = relay.var(prefix + "bn_gamma")
-        bn_beta = relay.var(prefix + "bn_beta")
-        bn_mmean = relay.var(prefix + "bn_mean")
-        bn_mvar = relay.var(prefix + "bn_var")
-
-        layer = relay.nn.conv2d(
-            data=data, weight=weight, kernel_size=(3, 3), channels=out_channel, padding=(1, 1)
-        )
-        bn_output = relay.nn.batch_norm(layer, bn_gamma, bn_beta, bn_mmean, bn_mvar)
-        out = relay.nn.relu(bn_output[0])
-        return relay.Function(relay.analysis.free_vars(out), out)
-
-    def get_partitoned_mod(mod, params, pattern_table, bind_constants):
-        mod["main"] = bind_params_by_name(mod["main"], params)
-        remove_bn_pass = tvm.transform.Sequential(
-            [
-                transform.InferType(),
-                transform.SimplifyInference(),
-                transform.FoldConstant(),
-                transform.FoldScaleAxis(),
-            ]
-        )
-        composite_partition = tvm.transform.Sequential(
-            [
-                remove_bn_pass,
-                transform.MergeComposite(pattern_table),
-                transform.AnnotateTarget("dnnl"),
-                transform.PartitionGraph(bind_constants=bind_constants),
-            ]
-        )
-
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            return composite_partition(mod)
-
-    data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
-    net = get_net("block_", data, 8)
-    mod, params = tvm.relay.testing.create_workload(net)
-
-    mod = get_partitoned_mod(mod, params, get_pattern_table("dnnl"), bind_constants=True)
-    len(mod["main"].body.args) == 1
-
-    mod = get_partitoned_mod(mod, params, get_pattern_table("dnnl"), bind_constants=False)
-    len(mod["main"].body.args) == 3
-
-
-if __name__ == "__main__":
-    test_multi_node_compiler()
-    test_extern_ccompiler_single_op()
-    test_extern_ccompiler_default_ops()
-    test_extern_ccompiler_multiple_functions()
-    test_extern_ccompiler()
-    test_extern_dnnl()
-    test_extern_dnnl_mobilenet()
-    test_function_lifting()
-    test_function_lifting_inline()
-    test_constant_propagation()
-    test_multiple_outputs()
-    test_mixed_single_multiple_outputs()
-    test_dnnl_fuse()
-    test_multiple_use_of_an_output()
-    test_duplicate_outputs()
-    test_duplicate_merge_and_tuplegetitem()
-    test_constant_tuples()
-    test_flatten_tuple_output()
-    test_tuple_output_exec()
-    test_extern_opt()
-    test_not_bind_constant()
diff --git a/tests/python/relay/test_pass_plan_devices.py b/tests/python/relay/test_pass_plan_devices.py
deleted file mode 100644
index 0376410bd4ae..000000000000
--- a/tests/python/relay/test_pass_plan_devices.py
+++ /dev/null
@@ -1,1886 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-
-
-"""Unit tests for the PlanDevices pass. We check:
-    - The pass alone given the expected AST, though we need to manually run InferTypes.
-    - The pass is idempotent.
-    - Execution on the VM backend yields the correct result."""
-
-import tvm
-from tvm import relay
-from tvm.script import tir as T
-import tvm.testing
-import numpy as np
-import os
-
-HOST_DEVICE = tvm.device("cpu")
-HOST_TARGET = tvm.target.Target("llvm")
-
-CPU_DEVICE = tvm.device("cpu")
-CPU_TARGET = tvm.target.Target("llvm").with_host(HOST_TARGET)
-
-GPU_DEVICE = tvm.device("cuda")
-GPU_TARGET = tvm.target.Target("cuda").with_host(HOST_TARGET)
-
-TARGETS = [CPU_TARGET, GPU_TARGET]
-
-HOST = tvm.target.VirtualDevice(HOST_DEVICE, HOST_TARGET)  # device_type=1
-CPU = tvm.target.VirtualDevice(CPU_DEVICE, CPU_TARGET)  # device_type=1
-GPU = tvm.target.VirtualDevice(GPU_DEVICE, GPU_TARGET)  # device_type=2
-DEFAULT = GPU
-
-CPU_SCOPE_A = tvm.target.VirtualDevice(CPU_DEVICE, CPU_TARGET, memory_scope="scopeA")
-CPU_SCOPE_B = tvm.target.VirtualDevice(CPU_DEVICE, CPU_TARGET, memory_scope="scopeB")
-
-CTXT = tvm.transform.PassContext(config={"relay.fallback_device_type": DEFAULT.device_type_int})
-
-core = tvm.IRModule()
-core.import_from_std("core.rly")
-
-recover_virtual_device_map = tvm._ffi.get_global_func("relay.transform.RecoverVirtualDeviceMap")
-
-
-def rewrite_and_assert(in_mod, expected_mod):
-    """Manually run the pass and assert it's structurally equals to the expected."""
-    config = tvm.target.make_compilation_config(CTXT, TARGETS)
-    actual_mod = relay.transform.InferType()(in_mod)
-    actual_mod = relay.transform.PlanDevices(config)(actual_mod)
-    actual_mod = relay.transform.InferType()(actual_mod)
-    expected_mod = relay.transform.InferType()(expected_mod)
-    if not tvm.ir.structural_equal(actual_mod, expected_mod, True):
-        # Print everything in full so we can see what's going on when things fail.
-        print("Input module:")
-        print(in_mod)
-        print("Expected module:")
-        print(expected_mod)
-        print("Actual module:")
-        print(actual_mod)
-        # Assert again so as to see the actual disagreeing sub-expressions.
-        tvm.ir.assert_structural_equal(actual_mod, expected_mod, True)
-
-
-def eval_and_assert(in_mod: tvm.IRModule, reference_func, args):
-    """Test the standard compilation flow gives us a function which agrees with the Numpy
-    reference implementation."""
-    if not tvm.runtime.enabled("cuda"):
-        print("Not evaluating since GPU is not available")
-        return
-    with tvm.transform.PassContext(opt_level=3):
-        compiled = relay.create_executor(
-            "vm", mod=in_mod, device=GPU_DEVICE, target=GPU_TARGET
-        ).evaluate()
-        actual = compiled(*args).numpy()
-        expected = reference_func(*args)
-        tvm.testing.assert_allclose(actual, expected)
-
-
-def rand(shape):
-    return np.random.rand(*shape).astype("float32")
-
-
-def rands(shape, n):
-    return [rand(shape) for i in range(n)]
-
-
-def exercise(in_mod: tvm.IRModule, expected_mod: tvm.IRModule, reference_func, args):
-    """Test in_mod against expected_mod and reference_func using args."""
-    # Correctness
-    rewrite_and_assert(in_mod, expected_mod)
-    # Idempotence
-    rewrite_and_assert(expected_mod, expected_mod)
-    # The VM can compile and possibly even run the module
-    if not (reference_func is None) and not (args is None):
-        eval_and_assert(in_mod, reference_func, args)
-
-
-def test_plain():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Everything defaults to GPU
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = add(%c, %d);
-              subtract(%0, %1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);
-              %1 = add(%c, %d);
-              subtract(%0, %1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_left_add_on_cpu():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Force some args to be on CPU, rest default to GPU.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %2 = add(%c, %d);
-              subtract(%1, %2)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device= meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %d {virtual_device= meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %3 = add(%c, %d);
-              subtract(%2, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_left_add_on_cpu_via_copy():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # As for test_left_add_on_cpu, but with an explicit device_copy.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = device_copy(%0, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %2 = add(%c, %d);
-              subtract(%1, %2)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %3 = add(%c, %d);
-              subtract(%2, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_left_add_on_cpu_via_copy_as_map():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # As for test_left_add_on_cpu, but with an explicit device_copy.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = device_copy(%0, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %2 = add(%c, %d);
-              subtract(%1, %2)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    config = tvm.target.make_compilation_config(CTXT, TARGETS, HOST_TARGET)
-    actual_mod = relay.transform.InferType()(input())
-    actual_mod = relay.transform.PlanDevices(config)(actual_mod)
-    actual_mod = relay.transform.CapturePostDfsIndexInSpans()(actual_mod)
-
-    # Same expected result as for test_left_add_on_cpu, but we'll include indexes to help
-    # the test make sense.
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], // index 0
-                      %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], // index 1
-                      %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], // index 2
-                      %d {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], // index 3
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);                                                            // index 8
-              %1 = on_device(%0,
-                             virtual_device=meta[VirtualDevice][0],
-                             constrain_result=True);                                       // index 9
-              %2 = device_copy(%1,
-                               src_virtual_device=meta[VirtualDevice][0],
-                               dst_virtual_device=meta[VirtualDevice][1]);                 // index 10
-              %3 = add(%c, %d);                                                            // index 11
-              subtract(%2, %3)                                                             // index 12
-            }                                                                              // index 13
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    # Make sure actual matches.
-    tvm.ir.assert_structural_equal(actual_mod, expected(), True)
-
-    # Recover all the inferred virtual devices in map form
-    raw_map = recover_virtual_device_map(actual_mod, actual_mod["main"])
-    # Rewrite the map to be from post-dfs indexes to device types
-    map = {e.span.line: d.device_type for e, d in raw_map.items()}
-    # Now we can express the expected map
-    expected_map = {
-        0: CPU.device_type,  # %a
-        1: CPU.device_type,  # %b
-        2: GPU.device_type,  # %c
-        3: GPU.device_type,  # %d
-        8: CPU.device_type,  # first add
-        9: CPU.device_type,  # on_device
-        10: GPU.device_type,  # device_copy
-        11: GPU.device_type,  # second add
-        12: GPU.device_type,  # subtract
-        13: GPU.device_type,  # @main
-    }
-    assert map == expected_map
-
-
-def test_both_adds_on_cpu():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = add(%c, %d);
-              %2 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %3 = on_device(%1, virtual_device=meta[VirtualDevice][0]);
-              subtract(%2, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = add(%c, %d);
-              %3 = on_device(%2, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %4 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %5 = device_copy(%3, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              subtract(%4, %5)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_sharing():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # The same add sub-expression is annotated twice.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %2 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              subtract(%1, %2)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %3 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %4 = device_copy(%2, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              subtract(%3, %4)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b):
-        x = np.add(a, b)
-        return np.subtract(x, x)
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_let_on_cpu():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # The device for a let-bound expression can flow from uses of the let-bound var.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              let %l = add(%a, %b);
-              let %r = add(%c, %d);
-              %0 = on_device(%l, virtual_device=meta[VirtualDevice][0]);
-              subtract(%0, %r)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%a, %b);
-              let %l = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              let %r = on_device(add(%c, %d), virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %1 = device_copy(%l, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              subtract(%1, %r)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_func_param_on_cpu():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Devices for function parameters flow to call sites.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              let %f = fn (%x, %y) {
-                %0 = add(%x, %y);
-                on_device(%0, virtual_device=meta[VirtualDevice][0])
-              };
-              %1 = %f(%a, %b);
-              %2 = add(%c, %d);
-              subtract(%1, %2)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              let %f = fn (%x {virtual_device=meta[VirtualDevice][0]}, %y {virtual_device=meta[VirtualDevice][0]},
-                           virtual_device=meta[VirtualDevice][0]) {
-                add(%x, %y)
-              };
-              %0 = %f(%a, %b);
-              %1 = add(%c, %d);
-              subtract(%0, %1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_func_result_on_cpu():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Devices for call sites flow to function results.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              let %f = fn (%x, %y) {
-                add(%x, %y)
-              };
-              %0 = %f(%a, %b);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %2 = add(%c, %d);
-              subtract(%1, %2)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              let %f = fn (%x {virtual_device=meta[VirtualDevice][0]}, %y {virtual_device=meta[VirtualDevice][0]},
-                           virtual_device=meta[VirtualDevice][0]) {
-                add(%x, %y)
-              };
-              %1 = %f(%a, %b);
-              %2 = on_device(%1, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %3 = device_copy(%2, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %4 = add(%c, %d);
-              subtract(%3, %4)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.add(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_higher_order():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # The constraint on %a flows back to %y via %f and %h
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
-              let %f = fn (%g) {
-                fn (%a) {
-                  %0 = on_device(%a, virtual_device=meta[VirtualDevice][0]);
-                  %1 = %g(%0);
-                  add(%1, %x)
-                }
-              };
-              let %h = fn (%b) {
-                negative(%b)
-              };
-              %2 = %f(%h);
-              %3 = %2(%y);
-              subtract(%x, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              let %f = fn (%g {virtual_device=meta[VirtualDevice][1]}, virtual_device=meta[VirtualDevice][1]) {
-                fn (%a {virtual_device=meta[VirtualDevice][0]}, virtual_device=meta[VirtualDevice][1]) {
-                  %0 = device_copy(%a, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-                  %1 = %g(%0);
-                  add(%1, %x)
-                }
-              };
-              let %h = fn (%b  {virtual_device=meta[VirtualDevice][1]}, virtual_device=meta[VirtualDevice][1]) {
-                negative(%b)
-              };
-              %2 = %f(%h);
-              %3 = %2(%y);
-              subtract(%x, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y):
-        def f(g):
-            return lambda a: np.add(g(a), x)
-
-        def h(b):
-            return np.negative(b)
-
-        return np.subtract(x, f(h)(y))
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_function_in_tuple():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Since %f ends up in a tuple its argument and result is forced to be on the CPU
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
-              let %f = fn (%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
-                %0 = on_device(%b, virtual_device=meta[VirtualDevice][0]);
-                add(%a, %0)
-              };
-              let %t = (%f, %x);
-              %1 = %t.1;
-              %2 = %t.0;
-              %2(%1, %y)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              let %f = fn (%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                           virtual_device=meta[VirtualDevice][0]) {
-                add(%a, %b)
-              };
-              let %t = on_device((%f, %x), virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %0 = %t.1;
-              %1 = %t.0;
-              %1(%0, %y)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y):
-        return np.add(x, y)
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_device_copy():
-    const = rand((5, 7))
-    metatable = {"VirtualDevice": [CPU, GPU], "relay.Constant": [relay.const(const)]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32]) {
-              %0 = device_copy(%x, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              add(%0, meta[relay.Constant][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = device_copy(%x, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              add(%0, meta[relay.Constant][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x):
-        return np.add(x, const)
-
-    exercise(input(), expected(), ref, rands((5, 7), 1))
-
-
-def test_shape_of():
-    metatable = {"VirtualDevice": [HOST, GPU]}
-
-    # We need to use constrain_result=True in the on_device call so that the tensor will be on the GPU. Otherwise the
-    # result defaults to the result device for @main which is the CPU, thus forcing a copy.
-    # TODO(mbs): Perhaps the defaulting heuristics are being too clever?
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(?, ?), float32]) {
-              %0 = on_device(%x, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              vm.shape_of(%0, dtype="int64")
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(?, ?), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              vm.shape_of(%x, dtype="int64")
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x):
-        return x.shape
-
-    exercise(input(), expected(), ref, rands((5, 7), 1))
-
-
-def test_alloc_storage():
-    shape = np.array([3, 2])
-    metatable = {
-        "VirtualDevice": [HOST, GPU],
-        "relay.Constant": [relay.const(shape, dtype="int64")],
-    }
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%size: int64, %alignment: int64) {
-              memory.alloc_storage(%size, meta[relay.Constant][0], %alignment, virtual_device=meta[VirtualDevice][1])
-            }
-        """,
-            "from_string",
-            core,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%size {virtual_device=meta[VirtualDevice][0]}: int64, %alignment {virtual_device=meta[VirtualDevice][0]}: int64,
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = on_device(meta[relay.Constant][0], virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              memory.alloc_storage(%size, %0, %alignment, virtual_device=meta[VirtualDevice][1])
-            }
-        """,
-            "from_string",
-            core,
-            metatable,
-        )
-
-    # Don't try to execute, too fiddly to setup.
-    exercise(input(), expected(), None, None)
-
-
-def test_alloc_tensor():
-    shape = np.array([3, 2])
-    metatable = {
-        "VirtualDevice": [HOST, GPU],
-        "relay.Constant": [relay.const(shape, dtype="int64")],
-    }
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%sto: Storage[]) {
-              memory.alloc_tensor(%sto, 0, meta[relay.Constant][0],
-                                  const_shape=meta[relay.Constant][0], assert_shape=[])
-            }
-        """,
-            "from_string",
-            core,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%sto {virtual_device=meta[VirtualDevice][1]}: Storage[], virtual_device=meta[VirtualDevice][1]) {
-              %0 = on_device(0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %1 = on_device(meta[relay.Constant][0], virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              memory.alloc_tensor(%sto, %0, %1, const_shape=meta[relay.Constant][0], assert_shape=[])
-            }
-        """,
-            "from_string",
-            core,
-            metatable,
-        )
-
-    # Don't try to execute, too fiddly to setup.
-    exercise(input(), expected(), None, None)
-
-
-def test_reshape_tensor():
-    newshape = [2, 4, 2]
-    metatable = {
-        "VirtualDevice": [HOST, GPU],
-        "relay.Constant": [relay.const(newshape, dtype="int64")],
-    }
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(2, 8), float32]) {
-              vm.reshape_tensor(%x, meta[relay.Constant][0], newshape=[2, 4, 2])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(2, 8), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = on_device(meta[relay.Constant][0], virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              vm.reshape_tensor(%x, %0, newshape=[2, 4, 2])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x):
-        return np.reshape(x, newshape)
-
-    exercise(input(), expected(), ref, rands((2, 8), 1))
-
-
-def test_dynamic_input():
-    metatable = {"VirtualDevice": [GPU]}
-
-    # There's nothing special about inferring devices for partially unknown types.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x0: Tensor[(?, ?), float32], %x1: Tensor[(?, ?), float32]) {
-              add(%x0, %x1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x0 {virtual_device=meta[VirtualDevice][0]}: Tensor[(?, ?), float32], %x1 {virtual_device=meta[VirtualDevice][0]}: Tensor[(?, ?), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              add(%x0, %x1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x0, x1):
-        return np.add(x0, x1)
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_redundant_annotation():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %2 = subtract(%1, %z);
-              %3 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              add(%2, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %z {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %3 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %4 = subtract(%2, %z);
-              %5 = device_copy(%3, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              add(%4, %5)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y, z):
-        a = np.add(x, y)
-        return np.add(np.subtract(a, z), a)
-
-    exercise(input(), expected(), ref, rands((5, 7), 3))
-
-
-def test_annotate_expr():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][1]);
-              %2 = subtract(%1, %z);
-              on_device(%2, virtual_device=meta[VirtualDevice][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %z {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %2 = device_copy(%1, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][0]);
-              subtract(%2, %z)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y, z):
-        return np.subtract(np.add(x, y), z)
-
-    exercise(input(), expected(), ref, rands((5, 7), 3))
-
-
-def test_annotate_all():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %2 = subtract(%1, %z);
-              on_device(%2, virtual_device=meta[VirtualDevice][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %z {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = add(%x, %y);
-              subtract(%0, %z)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y, z):
-        return np.subtract(np.add(x, y), z)
-
-    exercise(input(), expected(), ref, rands((5, 7), 3))
-
-
-def test_conv_network():
-    r"""The network and devices are as follows:
-    data1     data2    <--- CPU
-      |         |
-    conv2d    conv2d   <--- CPU
-       \       /
-        \     /
-          add          <--- GPU
-           |
-         conv2d        <--- CPU
-           |
-        <result>       <--- CPU
-    """
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%data1: Tensor[(1, 64, 56, 56), float32], %data2: Tensor[(1, 64, 56, 56), float32],
-                      %weight: Tensor[(64, 64, 3, 3), float32]) {
-              %0 = nn.conv2d(%data1, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-              %1 = nn.conv2d(%data2, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-              %2 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %3 = on_device(%1, virtual_device=meta[VirtualDevice][0]);
-              %4 = add(%2, %3);
-              %5 = on_device(%4, virtual_device=meta[VirtualDevice][1]);
-              %6 = nn.conv2d(%5, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-              on_device(%6, virtual_device=meta[VirtualDevice][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%data1 {virtual_device=meta[VirtualDevice][0]}: Tensor[(1, 64, 56, 56), float32], %data2 {virtual_device=meta[VirtualDevice][0]}: Tensor[(1, 64, 56, 56), float32],
-                      %weight {virtual_device=meta[VirtualDevice][0]}: Tensor[(64, 64, 3, 3), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = nn.conv2d(%data1, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = nn.conv2d(%data2, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
-              %3 = on_device(%2, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %4 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %5 = device_copy(%3, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %6 = add(%4, %5);
-              %7 = on_device(%6, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %8 = device_copy(%7, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][0]);
-              nn.conv2d(%8, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    # Don't try to execute, we don't have a reference conv2d
-    exercise(input(), expected(), None, None)
-
-
-def test_tuple_get_item():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Note that the device copy should be placed after projection rather than before. This is handled by
-    # a heuristic in the pass.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(3, 3, 4), float32]) {
-              let %t = split(%x, indices_or_sections=3);
-              %0 = on_device(%t, virtual_device=meta[VirtualDevice][0]);
-              %1 = on_device(%t, virtual_device=meta[VirtualDevice][0]);
-              %2 = %0.0;
-              %3 = %1.1;
-              %4 = subtract(%2, %3);
-              on_device(%4, virtual_device=meta[VirtualDevice][1])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(3, 3, 4), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = split(%x, indices_or_sections=3);
-              let %t = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %1 = %t.0;
-              %2 = on_device(%1, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %3 = %t.1;
-              %4 = on_device(%3, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %5 = device_copy(%2, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %6 = device_copy(%4, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              subtract(%5, %6)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x):
-        t = np.split(x, 3)
-        return np.subtract(t[0], t[1])
-
-    exercise(input(), expected(), ref, rands((3, 3, 4), 1))
-
-
-def test_propogation():
-    r""" The network and devices are as follows:
-                  x             <--- CPU
-                  |
-                negative        <--- CPU
-                /   \
-          negative  negative    <--- GPU
-                \   /
-                 add            <--- GPU
-                  |
-                negative        <--- CPU
-                  |
-               <result>         <--- CPU
-    """
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32]) {
-              %0 = negative(%x);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %2 = negative(%1);
-              %3 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %4 = negative(%3);
-              %5 = on_device(%2, virtual_device=meta[VirtualDevice][1]);
-              %6 = on_device(%4, virtual_device=meta[VirtualDevice][1]);
-              %7 = add(%5, %6);
-              %8 = on_device(%7, virtual_device=meta[VirtualDevice][1]);
-              %9 = negative(%8);
-              on_device(%9, virtual_device=meta[VirtualDevice][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = negative(%x);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %2 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %3 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %4 = device_copy(%3, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %5 = negative(%2);
-              %6 = negative(%4);
-              %7 = add(%5, %6);
-              %8 = on_device(%7, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %9 = device_copy(%8, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][0]);
-              negative(%9)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x):
-        y = np.negative(x)
-        return np.negative(np.add(np.negative(y), np.negative(y)))
-
-    exercise(input(), expected(), ref, rands((5, 7), 1))
-
-
-def test_fusible_network():
-    r""" The network is as follows:
-               x     y      <--- GPU
-                \   /
-                 add        <--- GPU
-                /   \
-           negative  \      <--- CPU
-              \       \
-               \  negative  <--- GPU
-                \   /
-                 add        <--- GPU
-                  |
-               negative     <--- CPU
-                  |
-               <result>     <--- CPU
-    """
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][1]);
-              %2 = negative(%1);
-              %3 = on_device(%2, virtual_device=meta[VirtualDevice][0]);
-              %4 = negative(%0);
-              %5 = add(%3, %4);
-              %6 = on_device(%5, virtual_device=meta[VirtualDevice][1]);
-              %7 = negative(%6);
-              on_device(%7, virtual_device=meta[VirtualDevice][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = add(%x, %y);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %2 = device_copy(%1, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][0]);
-              %3 = negative(%2);
-              %4 = on_device(%3, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %5 = device_copy(%4, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              %6 = negative(%0);
-              %7 = add(%5, %6);
-              %8 = on_device(%7, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %9 = device_copy(%8, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][0]);
-              negative(%9)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y):
-        z = np.add(x, y)
-        return np.negative(np.add(np.negative(z), np.negative(z)))
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_unpropagatable_graph():
-    r"""The network is as follows:
-    a      b            <--- CPU
-    \     /
-     \   /   c     d    <--- GPU
-      \ /    \     /
-      add     \   /     <--- CPU
-       \       \ /
-        \    multiply   <--- GPU
-         \     /
-        subtract        <--- CPU
-           |
-        <result>        <--- CPU
-    """
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b);
-              %1 = multiply(%c, %d);
-              %2 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-              %3 = on_device(%1, virtual_device=meta[VirtualDevice][1]);
-              %4 = subtract(%2, %3);
-              on_device(%4, virtual_device=meta[VirtualDevice][0])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %d {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = multiply(%c, %d);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %2 = add(%a, %b);
-              %3 = device_copy(%1, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][0]);
-              subtract(%2, %3)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c, d):
-        return np.subtract(np.add(a, b), np.multiply(c, d))
-
-    exercise(input(), expected(), ref, rands((5, 7), 4))
-
-
-def test_conditional():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # The conditional is over a function type, thus exercising the first-order/higher-order domain handling.
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
-              let %f = fn (%a) {
-                %0 = on_device(%y, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-                add(%a, %0)
-              };
-              let %g = fn (%a1) {
-                subtract(%a1, %y)
-              };
-              let %h = if (%x) {
-                %f
-              } else {
-                %g
-              };
-              %h(%z)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: bool, %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %z {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              let %f = fn (%a {virtual_device=meta[VirtualDevice][0]}, virtual_device=meta[VirtualDevice][0]) {
-                add(%a, %y)
-              };
-              let %g = fn (%a1 {virtual_device=meta[VirtualDevice][0]}, virtual_device=meta[VirtualDevice][0]) {
-                subtract(%a1, %y)
-              };
-              let %h = on_device(if (%x) {
-                %f
-              } else {
-                %g
-              }, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %h(%z)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y, z):
-        def f(a):
-            return np.add(a, y)
-
-        def g(a):
-            return np.subtract(a, y)
-
-        h = f if x else g
-        return h(z)
-
-    exercise(input(), expected(), ref, [True, rand((5, 7)), rand((5, 7))])
-
-
-def test_global():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @f(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-              %0 = on_device(%b, virtual_device=meta[VirtualDevice][0]);
-              add(%a, %0)
-            }
-
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-              @f(%y, %x)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @f(%a {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                   virtual_device=meta[VirtualDevice][1]) -> Tensor[(5, 7), float32] {
-              %0 = device_copy(%b, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              add(%a, %0)
-            }
-
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) -> Tensor[(5, 7), float32] {
-              @f(%y, %x)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y):
-        def f(a, b):
-            return np.add(a, b)
-
-        return f(x, y)
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_ref():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
-              let %r = ref(%x);
-              %0 = on_device(%y, virtual_device=meta[VirtualDevice][0]);
-              ref_write(%r, %0);
-              %1 = ref_read(%r);
-              add(%x, %1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              let %r = on_device(ref(%x), virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %0 = device_copy(%y, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-              on_device(ref_write(%r, %0), virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              %1 = ref_read(%r);
-              add(%x, %1)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y):
-        r = {"value": x}
-        r["value"] = y
-        return np.add(x, r["value"])
-
-    # Don't try to execute, no backend currently supports both hetrogeneous devices and references.
-    exercise(input(), expected(), None, None)
-
-
-def test_adt():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            type List[A] {
-              Cons(A, List[A]),
-              Nil,
-            }
-            def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32]) {
-              %0 = on_device(%y, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              %1 = Nil;
-              %2 = Cons(%0, %1);
-              let %l = Cons(%x, %2);
-              match? (%l) {
-                Cons(%z, _) => %z
-              }
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            type List[A] {
-              Cons(A, List[A]),
-              Nil,
-            }
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %y {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][0]) {
-              %0 = Nil;
-              %1 = Cons(%y, %0);
-              let %l = on_device(Cons(%x, %1), virtual_device=meta[VirtualDevice][0], constrain_result=True);
-              match? (%l) {
-                Cons(%z, _) => %z
-              }
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(x, y):
-        l = [x, y]
-        return l[0]
-
-    exercise(input(), expected(), ref, rands((5, 7), 2))
-
-
-def test_free_on_device():
-    """Tests that the 'free' form of on_device (ie with constrain_body=False) can be used to allow
-    a device_copy to be inserted if necessary, but otherwise does not prevent the flow of
-    device information."""
-    metatable = {
-        "VirtualDevice": [
-            CPU,  # no memory scope constraint
-            CPU_SCOPE_A,  # constrain to scopeA
-            CPU_SCOPE_B,
-        ]
-    }  # constrain to scopeB
-
-    # Everything defaults to GPU
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @on_scope_b(%x {virtual_device=meta[VirtualDevice][2]}: Tensor[(5, 7), float32],
-                            virtual_device=meta[VirtualDevice][2]) -> Tensor[(5, 7), float32] {
-              %x
-            }
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %c {virtual_device=meta[VirtualDevice][2]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              // %a's memory scope is unconstrained, so will take on "scopeB" and on_device has no effect
-              %0 = @on_scope_b(on_device(%a, virtual_device=meta[VirtualDevice][0], constrain_body=False));
-              // %b's memory scope is "scopeA", so will require a "scopeA"->"scopeB" copy.
-              %1 = @on_scope_b(on_device(%b, virtual_device=meta[VirtualDevice][0], constrain_body=False));
-              // %c's memory scope is "scopeB", so no copy required.
-              %2 = @on_scope_b(on_device(%c, virtual_device=meta[VirtualDevice][0], constrain_body=False));
-              // result's memory scope is on "scopeA", so will require a "scopeB"->"scopeA" copy.
-              %3 = add(add(%0, %1), %2);
-              on_device(%3, virtual_device=meta[VirtualDevice][0], constrain_body=False)
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @on_scope_b(%x {virtual_device=meta[VirtualDevice][2]}: Tensor[(5, 7), float32],
-                            virtual_device=meta[VirtualDevice][2]) -> Tensor[(5, 7), float32] {
-              %x
-            }
-            def @main(%a {virtual_device=meta[VirtualDevice][2]}: Tensor[(5, 7), float32], %b {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32], %c {virtual_device=meta[VirtualDevice][2]}: Tensor[(5, 7), float32],
-                      virtual_device=meta[VirtualDevice][1]) {
-              %0 = @on_scope_b(%a);
-              %1 = device_copy(%b, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][2]);
-              %2 = @on_scope_b(%1);
-              %3 = @on_scope_b(%c);
-              %4 = add(add(%0, %2), %3);
-              %5 = on_device(%4, virtual_device=meta[VirtualDevice][2], constrain_result=True);
-              device_copy(%5, src_virtual_device=meta[VirtualDevice][2], dst_virtual_device=meta[VirtualDevice][1])
-            }
-        """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    exercise(input(), expected(), None, None)
-
-
-def test_lowered():
-    """
-    Tests propagation of memory scopes from PrimFuncs and insertion
-    of device_copies to mediate any scope changes.
-    """
-
-    @T.prim_func(private=True)
-    def input_gem(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128], scope="scopeA")  # will flow out
-        B = T.match_buffer(b, [128, 128], scope="")  # will flow in
-        C = T.match_buffer(c, [128, 128], scope="scopeB")  # will flow out
-        D = T.match_buffer(d, [128, 128], scope="scopeA")  # will flow out
-
-        for i, j, k in T.grid(128, 128, 128):
-            with T.block("update"):
-                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-                with T.init():
-                    D[vi, vj] = C[vi, vj]
-                D[vi, vj] = D[vi, vj] + A[vi, vk] * B[vj, vk]
-
-    @T.prim_func(private=True)
-    def expected_gem(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
-        A = T.match_buffer(a, [128, 128], scope="scopeA")
-        B = T.match_buffer(b, [128, 128], scope="scopeB")  # flowed in
-        C = T.match_buffer(c, [128, 128], scope="scopeB")
-        D = T.match_buffer(d, [128, 128], scope="scopeA")
-
-        for i, j, k in T.grid(128, 128, 128):
-            with T.block("update"):
-                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-                with T.init():
-                    D[vi, vj] = C[vi, vj]
-                D[vi, vj] = D[vi, vj] + A[vi, vk] * B[vj, vk]
-
-    metatable = {
-        "VirtualDevice": [
-            CPU,  # meta[VirtualDevice][0], no memory scope
-            CPU_SCOPE_A,  # meta[VirtualDevice][1], "scopeA"
-            CPU_SCOPE_B,
-        ]
-    }  # meta[VirtualDevice][2], "scopeB"
-    gem_ty = relay.FuncType(
-        [
-            relay.TensorType((128, 128), "float32"),
-            relay.TensorType((128, 128), "float32"),
-            relay.TensorType((128, 128), "float32"),
-        ],
-        relay.TensorType((128, 128), "float32"),
-    )
-    gem_gv = relay.GlobalVar("gem", type_annot=gem_ty)
-
-    def input():
-        mod = tvm.ir.IRModule()
-        mod[gem_gv] = input_gem
-        # - %x on CPU, no memory scope constraint, so will be constrained by first param of gem to "scopeA".
-        # - %y on CPU "scopeB", so will flow in to second param of gem.
-        # - %z on CPU "scopeA", so will clash with third param of gem and will need device_copy.
-        # - result on CPU "scopeB", but result of gem on "scopeA" so will need device_copy
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][0]}: Tensor[(128, 128), float32],
-                      %y {virtual_device=meta[VirtualDevice][2]}: Tensor[(128, 128), float32],
-                      %z {virtual_device=meta[VirtualDevice][1]}: Tensor[(128, 128), float32],
-                      virtual_device=meta[VirtualDevice][2]) {
-              call_lowered(@gem, (%x, %y, %z))
-            }
-            """,
-            "from_string",
-            mod,
-            metatable,
-        )
-
-    def expected():
-        mod = tvm.ir.IRModule()
-        mod[gem_gv] = expected_gem
-        # - %x now on CPU "scopeA", no device_copy needed.
-        # - %y still on CPU "scopeB", no device_copy needed.
-        # - %z still on CPU "scopeA", needs device_copy to "scopeB".
-        # - result still on CPU "scopeB", needs device_copy  from "scopeA".
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%x {virtual_device=meta[VirtualDevice][1]}: Tensor[(128, 128), float32],
-                      %y {virtual_device=meta[VirtualDevice][2]}: Tensor[(128, 128), float32],
-                      %z {virtual_device=meta[VirtualDevice][1]}: Tensor[(128, 128), float32],
-                      virtual_device=meta[VirtualDevice][2]) {
-              %0 = device_copy(%z, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][2]);
-              %1 = on_device(%0, virtual_device=meta[VirtualDevice][2], constrain_result=True);
-              %2 = call_lowered(@gem, (%x, %y, %1));
-              %3 = on_device(%2, virtual_device=meta[VirtualDevice][1], constrain_result=True);
-              device_copy(%3, src_virtual_device=meta[VirtualDevice][1], dst_virtual_device=meta[VirtualDevice][2])
-            }
-            """,
-            "from_string",
-            mod,
-            metatable,
-        )
-
-    exercise(input(), expected(), None, None)
-
-
-def test_stack_overflow():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    # Everything defaults to GPU
-    def input():
-        tmp = "test_stack_overflow_input.txt"
-        mod = """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
-            %0 = add(%a, %b);
-            %1 = add(%c, %d);
-            """
-
-        end = 1555
-        for i in range(2, end):
-            s1 = "\n\t" + "%" + str(i) + " = add(%" + str(i - 1) + ", %" + str(i - 2) + ");"
-            mod += s1
-        mod += "\n\t" + "add(%" + str(end - 1) + ", %" + str(end - 2) + ")"
-        mod += "\n\t}"
-
-        return tvm.relay.parse(
-            mod,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    config = tvm.target.make_compilation_config(CTXT, TARGETS)
-    actual_mod = relay.transform.InferType()(input())
-    actual_mod = relay.transform.PlanDevices(config)(actual_mod)
-    relay.transform.InferType()(actual_mod)
-
-
-def test_primitive():
-    """Annotations on Primitive functions should be accepted, even though the body
-    of the Primitive function is not considered during PlanDevices."""
-    global_virtual_device = tvm.target.VirtualDevice(memory_scope="global")
-    texture_virtual_device = tvm.target.VirtualDevice(memory_scope="global.texture")
-    metatable = {
-        "VirtualDevice": [
-            global_virtual_device,
-            texture_virtual_device,
-        ]
-    }
-
-    mod = tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%data1: Tensor[(1, 32, 40, 40), float32],
-                  %data2: Tensor[(1, 32, 40, 40), float32]) {
-          %0 = fn (%a, Primitive=1) {
-            layout_transform(%a, src_layout="NCHW", dst_layout="NCHW4c")
-          };
-          %1 = %0(%data1);
-          %3 = %0(%data2);
-          %5 = fn (%a {virtual_device=meta[VirtualDevice][0]},  // global
-                   %b {virtual_device=meta[VirtualDevice][0]},  // global
-                   virtual_device=meta[VirtualDevice][1],       // texture
-                   Primitive=1) {
-            add(%a, %b)
-          };
-          %6 = %5(%1, %3);
-          %10 = fn (%a,
-                    virtual_device=meta[VirtualDevice][0],      // global
-                    Primitive=1) {
-            layout_transform(%a, src_layout="NCHW4c", dst_layout="NCHW")
-          };
-          %10(%6)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-    print(mod)
-
-    config = tvm.target.make_compilation_config(CTXT, GPU_TARGET)
-    mod = relay.transform.InferType()(mod)
-    # PlanDevices should succeed.
-    mod = relay.transform.PlanDevices(config)(mod)
-    print(mod)
-
-
-def test_conflicated_inputs():
-    metatable = {"VirtualDevice": [CPU, GPU]}
-
-    def input():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
-                        %c: Tensor[(5, 7), float32]) {
-                %0 = add(%a, %b);
-                %1 = on_device(%0, virtual_device=meta[VirtualDevice][0]);
-                %2 = add(%b, %c);
-                %3 = on_device(%2, virtual_device=meta[VirtualDevice][1]);
-                subtract(%1, %3)
-            }
-            """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def expected():
-        return tvm.relay.parse(
-            """
-            #[version = "0.0.5"]
-            def @main(%a {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                        %b {virtual_device=meta[VirtualDevice][0]}: Tensor[(5, 7), float32],
-                        %c {virtual_device=meta[VirtualDevice][1]}: Tensor[(5, 7), float32]) {
-                %0 = add(%a, %b);
-                %1 = on_device(%0, virtual_device=meta[VirtualDevice][0], constrain_result=True);
-                %2 = device_copy(%b, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-                %3 = device_copy(%1, src_virtual_device=meta[VirtualDevice][0], dst_virtual_device=meta[VirtualDevice][1]);
-                %4 = add(%2, %c);
-                subtract(%3, %4)
-            }
-            """,
-            "from_string",
-            None,
-            metatable,
-        )
-
-    def ref(a, b, c):
-        return np.subtract(np.add(a, b), np.add(b, c))
-
-    exercise(input(), expected(), ref, rands((5, 7), 3))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py
deleted file mode 100644
index adc93a0d2309..000000000000
--- a/tests/python/relay/test_pass_qnn_legalize.py
+++ /dev/null
@@ -1,400 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test legalize pass"""
-import numpy as np
-import tvm
-from tvm import te
-
-from tvm import relay
-from tvm.contrib import graph_executor
-from tvm.relay import transform, analysis
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-from tvm.relay.testing import run_infer_type
-
-
-def alpha_equal(x, y):
-    """
-    Wrapper around alpha equality which ensures that
-    the hash function respects equality.
-    """
-    x = x["main"]
-    y = y["main"]
-    return tvm.ir.structural_equal(x, y) and tvm.ir.structural_hash(x) == tvm.ir.structural_hash(y)
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_qnn_legalize():
-    """Test directly replacing an operator with a new one"""
-
-    def before():
-        x = relay.var("x", shape=(1, 64, 56, 56), dtype="int8")
-        y = relay.qnn.requantize(
-            x,
-            input_scale=relay.const(1, "float32"),
-            input_zero_point=relay.const(0, "int32"),
-            output_scale=relay.const(1, "float32"),
-            output_zero_point=relay.const(0, "int32"),
-            out_dtype="int8",
-        )
-        y = relay.Function([x], y)
-        return y
-
-    def legalize_qnn_requantize(attrs, inputs, types):
-        data = inputs[0]
-        data = relay.add(relay.const(0, "int8"), data)
-        y = relay.qnn.requantize(
-            data,
-            input_scale=relay.const(1, "float32"),
-            input_zero_point=relay.const(0, "int32"),
-            output_scale=relay.const(1, "float32"),
-            output_zero_point=relay.const(0, "int32"),
-            out_dtype="int8",
-        )
-        return y
-
-    def expected():
-        x = relay.var("x", shape=(1, 64, 56, 56), dtype="int8")
-        y = relay.add(relay.const(0, "int8"), x)
-        z = relay.qnn.requantize(
-            y,
-            input_scale=relay.const(1, "float32"),
-            input_zero_point=relay.const(0, "int32"),
-            output_scale=relay.const(1, "float32"),
-            output_zero_point=relay.const(0, "int32"),
-            out_dtype="int8",
-        )
-        z = relay.Function([x], z)
-        return z
-
-    a = before()
-
-    with TempOpAttr("qnn.requantize", "FTVMQnnLegalize", legalize_qnn_requantize):
-
-        # Check that Relay Legalize does not change the graph.
-        a = run_opt_pass(a, relay.transform.Legalize())
-        b = run_opt_pass(before(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-        # Check that QNN Legalize modifies the graph.
-        a = run_opt_pass(a, relay.qnn.transform.Legalize())
-        b = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_legalize_qnn_conv2d():
-    def _get_mod(data_dtype, kernel_dtype):
-        data_shape = (1, 64, 256, 256)
-        kernel_shape = (128, 64, 3, 3)
-        data = relay.var("data", shape=data_shape, dtype=data_dtype)
-        kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype)
-        func = relay.qnn.conv2d(
-            data,
-            kernel,
-            input_zero_point=relay.const(1, "int32"),
-            kernel_zero_point=relay.const(1, "int32"),
-            input_scale=relay.const(1.0, "float32"),
-            kernel_scale=relay.const(1.0, "float32"),
-            kernel_size=(3, 3),
-            channels=kernel_shape[0],
-            strides=(1, 1),
-            dilation=(1, 1),
-            out_dtype="int32",
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-        )
-
-        mod = relay.Function(relay.analysis.free_vars(func), func)
-        mod = tvm.IRModule.from_expr(mod)
-        return mod
-
-    # Check uint8 x uint8 and int8 x int8 transformation
-    for dtype in ("uint8", "int8"):
-        mod = _get_mod(dtype, dtype)
-
-        #############################################################
-        # Check transformations for platforms with fast Int8 support.
-        #############################################################
-        # Check that Intel AVX512 (with or w/o VNNI) gets picked up.
-        for target in [
-            "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512",
-            "llvm -mtriple=x86_64-linux-gnu -mcpu=cascadelake",
-        ]:
-            with tvm.target.Target(target):
-                mod = relay.transform.InferType()(mod)
-                legalized_mod = relay.qnn.transform.Legalize()(mod)
-                assert "cast" in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext()
-
-        # Since same dtype, there should not be any transformation
-        with tvm.target.Target(
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
-        ):
-            legalized_mod = relay.qnn.transform.Legalize()(mod)
-            tvm.ir.assert_structural_equal(mod, legalized_mod)
-
-        ################################################################
-        # Check transformations for platforms without fast Int8 support.
-        ################################################################
-        # Older Intel versions.
-        with tvm.target.Target("llvm"):
-            legalized_mod = relay.qnn.transform.Legalize()(mod)
-            assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-        # Older ARM vesions.
-        with tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"):
-            legalized_mod = relay.qnn.transform.Legalize()(mod)
-            assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-    # Check uint8 x int8 transformation
-    mod = _get_mod("uint8", "int8")
-    #############################################################
-    # Check transformations for platforms with fast Int8 support.
-    #############################################################
-    # Check no transformation for Intel AVX512.
-    with tvm.target.Target("llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512"):
-        mod = relay.transform.InferType()(mod)
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        tvm.ir.assert_structural_equal(mod, legalized_mod)
-
-    # ARM - so check that transformation has happened.
-    with tvm.target.Target(
-        "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
-    ):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext()
-
-    ################################################################
-    # Check transformations for platforms without fast Int8 support.
-    ################################################################
-    # Older Intel versions.
-    with tvm.target.Target("llvm"):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-    # Older ARM vesions.
-    with tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-    ###########################################
-    # Check transformations for CUDA platforms.
-    ###########################################
-    with tvm.target.Target("cuda"):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn" in legalized_mod.astext()
-
-
-def test_qnn_legalize_qnn_dense():
-    def _get_mod(data_dtype, kernel_dtype):
-        data_shape = (10, 3)
-        kernel_shape = (20, 3)
-        data = relay.var("data", shape=data_shape, dtype=data_dtype)
-        kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype)
-        func = relay.qnn.dense(
-            data,
-            kernel,
-            input_zero_point=relay.const(1, "int32"),
-            kernel_zero_point=relay.const(1, "int32"),
-            input_scale=relay.const(1, "float32"),
-            kernel_scale=relay.const(1, "float32"),
-            units=kernel_shape[0],
-            out_dtype="int32",
-        )
-
-        mod = relay.Function(relay.analysis.free_vars(func), func)
-        mod = tvm.IRModule.from_expr(mod)
-        return mod
-
-    # Check uint8 x uint8 and int8 x int8 transformation
-    for dtype in ("uint8", "int8"):
-        mod = _get_mod(dtype, dtype)
-
-        #############################################################
-        # Check transformations for platforms with fast Int8 support.
-        #############################################################
-        # Check that Intel AVX512 (with or w/o VNNI) gets picked up.
-        for target in [
-            "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512",
-            "llvm -mtriple=x86_64-linux-gnu -mcpu=cascadelake",
-        ]:
-            with tvm.target.Target(target):
-                mod = relay.transform.InferType()(mod)
-                legalized_mod = relay.qnn.transform.Legalize()(mod)
-                assert "cast" in legalized_mod.astext() and "qnn.dense" in legalized_mod.astext()
-
-        # Since same dtype, there should not be any transformation
-        with tvm.target.Target(
-            "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
-        ):
-            legalized_mod = relay.qnn.transform.Legalize()(mod)
-            tvm.ir.assert_structural_equal(mod, legalized_mod)
-
-        ################################################################
-        # Check transformations for platforms without fast Int8 support.
-        ################################################################
-        # Older Intel versions.
-        with tvm.target.Target("llvm"):
-            legalized_mod = relay.qnn.transform.Legalize()(mod)
-            assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-        # Older ARM vesions.
-        with tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"):
-            legalized_mod = relay.qnn.transform.Legalize()(mod)
-            assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-    # Check uint8 x int8 transformation
-    mod = _get_mod("uint8", "int8")
-    #############################################################
-    # Check transformations for platforms with fast Int8 support.
-    #############################################################
-    # Check no transformation for Intel AVX512.
-    with tvm.target.Target("llvm -mtriple=x86_64-linux-gnu -mcpu=skylake-avx512"):
-        mod = relay.transform.InferType()(mod)
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        tvm.ir.assert_structural_equal(mod, legalized_mod)
-
-    # ARM - so check that transformation has happened.
-    with tvm.target.Target(
-        "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
-    ):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn.dense" in legalized_mod.astext()
-
-    ################################################################
-    # Check transformations for platforms without fast Int8 support.
-    ################################################################
-    # Older Intel versions.
-    with tvm.target.Target("llvm"):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-    # Older ARM vesions.
-    with tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn" not in legalized_mod.astext()
-
-    ###########################################
-    # Check transformations for CUDA platforms.
-    ###########################################
-    with tvm.target.Target("cuda"):
-        legalized_mod = relay.qnn.transform.Legalize()(mod)
-        assert "cast" in legalized_mod.astext() and "qnn" in legalized_mod.astext()
-
-
-def test_qnn_legalize_qnn_conv2d_non_scalar_qnn_params():
-    """
-    Test QNN legalization for qnn.conv2d op for Hexagon target when kernel zero point and kernel
-    scale are vectors of scalars.
-    """
-    data_shape = (1, 29, 16, 16)
-    weights_shape = (60, 29, 3, 3)
-    O, I = weights_shape[0], weights_shape[1]
-    data = relay.var("data", shape=data_shape, dtype="uint8")
-    weights = relay.var("weight", shape=weights_shape, dtype="int8")
-    data_zp = relay.const(2)
-    data_scale = relay.const(0.15)
-
-    def before():
-        op = relay.qnn.conv2d(
-            data,
-            weights,
-            input_zero_point=data_zp,
-            kernel_zero_point=relay.const([1] * O),
-            input_scale=data_scale,
-            kernel_scale=relay.const([0.17] * O),
-            padding=[0, 0, 0, 0],
-            channels=O,
-            kernel_size=[3, 3],
-        )
-        return op
-
-    def expected():
-        in_diff = 3
-        out_diff = 4
-        op0 = relay.nn.pad(weights, pad_width=[[0, 0], [0, in_diff], [0, 0], [0, 0]])
-        op1 = relay.nn.pad(data, pad_width=[[0, 0], [0, in_diff], [0, 0], [0, 0]])
-        op2 = relay.nn.pad(op0, pad_width=[[0, out_diff], [0, 0], [0, 0], [0, 0]])
-        op3 = relay.qnn.conv2d(
-            op1,
-            op2,
-            input_zero_point=data_zp,
-            kernel_zero_point=relay.const([1] * O + [0] * out_diff),
-            input_scale=data_scale,
-            kernel_scale=relay.const([0.17] * O + [1.0] * out_diff),
-            padding=[0, 0, 0, 0],
-            channels=(O + out_diff),
-            kernel_size=[3, 3],
-        )
-        op4 = relay.strided_slice(op3, begin=[0, 0, 0, 0], end=[1, 60, 14, 14], strides=[1])
-        return op4
-
-    target = tvm.target.hexagon("v68")
-    with tvm.target.Target(target):
-        a = run_opt_pass(before(), relay.qnn.transform.Legalize())
-        b = run_infer_type(expected())
-        tvm.ir.assert_structural_equal(a, b)
-
-
-def test_qnn_legalize_qnn_dense_non_scalar_qnn_params():
-    """
-    Test QNN legalization for qnn.dense op for Hexagon target when kernel zero point and kernel
-    scale are vectors of scalars.
-    """
-    data_shape = (4, 16)
-    weights_shape = (58, 16)
-    N = weights_shape[0]
-    data = relay.var("data", shape=data_shape, dtype="uint8")
-    weights = relay.var("weight", shape=weights_shape, dtype="int8")
-    data_zp = relay.const(2)
-    data_scale = relay.const(0.15)
-
-    def before():
-        wzp = relay.const([1] * N)
-        wscale = relay.const([0.17] * N)
-        op = relay.qnn.dense(data, weights, data_zp, wzp, data_scale, wscale, units=N)
-        return op
-
-    def expected():
-        diff = 6
-        wzp = relay.const([1] * N + [0] * diff)
-        wscale = relay.const([0.17] * N + [1.0] * diff)
-        op0 = relay.nn.pad(weights, pad_width=[[0, diff], [0, 0]])
-        op1 = relay.qnn.dense(data, op0, data_zp, wzp, data_scale, wscale, units=(N + diff))
-        op2 = relay.strided_slice(op1, begin=[0, 0], end=[data_shape[0], N], strides=[1], axes=None)
-        return op2
-
-    target = tvm.target.hexagon("v68")
-    with tvm.target.Target(target):
-        a = run_opt_pass(before(), relay.qnn.transform.Legalize())
-        b = run_infer_type(expected())
-        tvm.ir.assert_structural_equal(a, b)
-
-
-if __name__ == "__main__":
-    test_qnn_legalize()
-    test_qnn_legalize_qnn_conv2d()
-    test_qnn_legalize_qnn_dense()
-    test_qnn_legalize_qnn_conv2d_non_scalar_qnn_params()
-    test_qnn_legalize_qnn_dense_non_scalar_qnn_params()
diff --git a/tests/python/relay/test_pass_remove_unused_functions.py b/tests/python/relay/test_pass_remove_unused_functions.py
deleted file mode 100644
index 3c7aad40a506..000000000000
--- a/tests/python/relay/test_pass_remove_unused_functions.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.prelude import Prelude
-
-
-def test_remove_all_prelude_functions():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    x = relay.var("x", shape=(1, 16))
-    mod["main"] = relay.Function([x], x)
-    mod = relay.transform.RemoveUnusedFunctions()(mod)
-    l = set([x[0].name_hint for x in mod.functions.items()])
-    assert l == set(["main"])
-
-
-def test_remove_all_prelude_functions_but_referenced_functions():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    x = relay.var("x", shape=(1, 16))
-    id_func = relay.Function([x], x)
-    id_name = relay.GlobalVar("id_func")
-    mod[id_name] = id_func
-
-    mod["main"] = relay.Function([x], id_name(x))
-    mod = relay.transform.RemoveUnusedFunctions()(mod)
-    l = set([x[0].name_hint for x in mod.functions.items()])
-    assert l == set(["id_func", "main"])
-
-
-def test_keep_only_referenced_prelude_functions():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-    hd = p.mod.get_global_var("hd")
-    tl = p.mod.get_global_var("tl")
-    l = nil()
-    for i in [4, 3, 2, 1, 0]:
-        l = cons(relay.const(i), l)
-    body = hd(tl(tl(l)))
-    mod["main"] = relay.Function([], body)
-    mod = relay.transform.RemoveUnusedFunctions()(mod)
-    l = set([x[0].name_hint for x in mod.functions.items()])
-    assert l == set(["tl", "hd", "main"])
-
-
-def test_multiple_entry_functions():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-    hd = p.mod.get_global_var("hd")
-    tl = p.mod.get_global_var("tl")
-    l = nil()
-    for i in [4, 3, 2, 1, 0]:
-        l = cons(relay.const(i), l)
-    body = hd(tl(tl(l)))
-    mod["main1"] = relay.Function([], body)
-
-    x = relay.var("x", shape=(1, 16))
-    id_func = relay.Function([x], x)
-    id_name = relay.GlobalVar("id_func")
-    mod[id_name] = id_func
-    mod["main2"] = relay.Function([x], id_name(x))
-    mod = relay.transform.RemoveUnusedFunctions(["main1", "main2"])(mod)
-    l = set([x[0].name_hint for x in mod.functions.items()])
-    assert l == set(["tl", "hd", "main2", "id_func", "main1"])
-
-
-def test_globalvar_as_call_arg():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    tensor_array = p.get_global_var("tensor_array", "int32")
-    tensor1 = p.get_ctor(p.get_name("tensor_t", "int32"), "tensor1", "int32")
-    write = p.get_global_var("tensor_array_write", "int32")
-    stack = p.get_global_var("tensor_array_stack", "int32")
-    v = relay.var("v")
-    init_tensor_array = tensor_array(relay.const(3))
-    tensor_array1 = write(init_tensor_array, relay.const(0), tensor1(v))
-    tensor_array2 = stack(tensor_array1)
-    mod["main"] = relay.Function([v], tensor_array2)
-    mod = relay.transform.RemoveUnusedFunctions()(mod)
-    l = set([x[0].name_hint for x in mod.functions.items()])
-    assert "tensor_array_int32" in l
-
-
-def test_call_globalvar_without_args():
-    def get_mod():
-        mod = tvm.IRModule({})
-        fn1 = relay.Function([], relay.const(1))
-        fn2 = relay.Function([], relay.const(2))
-        g1 = relay.GlobalVar("g1")
-        g2 = relay.GlobalVar("g2")
-        mod[g1] = fn1
-        mod[g2] = fn2
-        p = relay.var("p", "bool")
-        mod["main"] = relay.Function([p], relay.Call(relay.If(p, g1, g2), []))
-        return mod
-
-    mod = get_mod()
-    ref_mod = get_mod()
-    mod = relay.transform.RemoveUnusedFunctions()(mod)
-    tvm.ir.assert_structural_equal(mod, ref_mod, map_free_vars=True)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
deleted file mode 100644
index 7e2971a04e1b..000000000000
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ /dev/null
@@ -1,954 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from math import sqrt
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.testing import run_opt_pass, run_infer_type
-
-import numpy as np
-
-
-def test_simplify_reshape():
-    def before():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.reshape(y, newshape=(1, 16, -1))
-        y = relay.reshape(y, newshape=(4, 8, -1, 16))
-        y = relay.reverse_reshape(y, newshape=(32, 0, -1))
-        return relay.Function([x, w], y)
-
-    def expected():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.reshape(y, newshape=(32, 16, 16))
-        return relay.Function([x, w], y)
-
-    def symbolic():
-        b = tvm.te.size_var("b")
-        x = relay.var("x", shape=(b, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.reshape(y, newshape=(1, 16, -1))
-        y = relay.reshape(y, newshape=(4, 8, -1, 16))
-        y = relay.reverse_reshape(y, newshape=(32, 0, -1))
-        return relay.Function([x, w], y)
-
-    z = before()
-    zz = run_opt_pass(z, transform.SimplifyExpr())
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-    z = symbolic()
-    zz = run_opt_pass(z, transform.SimplifyExpr())
-    after = run_opt_pass(symbolic(), transform.InferType())
-    tvm.ir.assert_structural_equal(zz, after)
-
-
-def test_simplify_transpose():
-    # Test a series of transpose and layout_transform ops
-    def before1():
-        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
-        y = relay.transpose(x, axes=[0, 2, 3, 1])  # To NHWC
-        y = relay.layout_transform(y, "NHWC", "HWCN")  # To HWCN
-        y = relay.transpose(y, axes=[3, 0, 1, 2])  # To NHWC
-        return relay.Function([x], y)
-
-    def expected1():
-        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
-        y = relay.transpose(x, axes=[0, 2, 3, 1])  # To NHWC
-        return relay.Function([x], y)
-
-    # Test that all transpose ops can be cancelled
-    def before2():
-        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
-        y = relay.nn.relu(x)
-        y = relay.transpose(y, axes=[0, 2, 3, 1])  # To NHWC
-        y = relay.transpose(y, axes=[1, 2, 3, 0])  # To HWCN
-        y = relay.transpose(y, axes=[3, 2, 0, 1])  # To NCHW
-        return relay.Function([x], y)
-
-    def expected2():
-        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
-        y = relay.nn.relu(x)
-        return relay.Function([x], y)
-
-    # Test default axis (reverse) and negative axis
-    def before3():
-        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
-        y = relay.nn.relu(x)
-        y = relay.transpose(y)  # Reverse
-        y = relay.transpose(y)  # Reverse
-        y = relay.transpose(y, axes=[0, 2, -1, 1])
-        y = relay.transpose(y)  # Reverse
-        y = relay.transpose(y)  # Reverse
-        return relay.Function([x], y)
-
-    def expected3():
-        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
-        y = relay.nn.relu(x)
-        y = relay.transpose(y, axes=[0, 2, 3, 1])
-        return relay.Function([x], y)
-
-    # Test a series of transpose and rank changing layout_transform
-    def before4():
-        """
-        Simplify transpose->layout_transform and its inverse.
-
-        Input:
-        NHWC -> NCHW -> NCHW4c -> op -> NCHW4c -> NCHW -> NHWC
-
-        Simplified:
-        NHWC -> NCHW4c -> op -> NCHW4c -> NHWC
-        """
-        x = relay.var("x", shape=(1, 56, 56, 128), dtype="float32")
-        y = relay.transpose(x, axes=[0, 3, 1, 2])
-        y = relay.layout_transform(y, "NCHW", "NCHW4c")
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW4c", "NCHW")
-        y = relay.transpose(y, axes=[0, 2, 3, 1])
-        return relay.Function([x], y)
-
-    def expected4():
-        x = relay.var("x", shape=(1, 56, 56, 128), dtype="float32")  # NHWC
-        y = relay.layout_transform(x, "NHWC", "NCHW4c")  # To NCHW4c
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW4c", "NHWC")  # To NHWC
-        return relay.Function([x], y)
-
-    def before5():
-        """
-        Simplify layout_transform->layout_transform and its inverse.
-
-        Input:
-        NHWC -> NCHW -> NCHW4c -> op -> NCHW4c -> NCHW -> NHWC
-
-        Simplified:
-        NHWC -> NCHW4c -> op -> NCHW4c -> NHWC
-        """
-        x = relay.var("x", shape=(1, 56, 56, 128), dtype="float32")  # NHWC
-        y = relay.layout_transform(x, "NHWC", "NCHW")  # To NCHW
-        y = relay.layout_transform(y, "NCHW", "NCHW4c")  # To NCHW4c
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW4c", "NCHW")  # To NCHW
-        y = relay.layout_transform(y, "NCHW", "NHWC")  # To NHWC
-        return relay.Function([x], y)
-
-    def expected5():
-        x = relay.var("x", shape=(1, 56, 56, 128), dtype="float32")  # NHWC
-        y = relay.layout_transform(x, "NHWC", "NCHW4c")  # To NCHW4c
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW4c", "NHWC")  # To NHWC
-        return relay.Function([x], y)
-
-    def before6():
-        """
-        Remove trivial layout_transform->layout_transform.
-
-        Input:
-        NCHW -> NHWC -> NCHW -> op
-
-        Simplified:
-        NHWC -> op
-        """
-
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.layout_transform(x, "NCHW", "NHWC")
-        y = relay.layout_transform(y, "NHWC", "NCHW")
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def expected6():
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.nn.relu(x)
-        return relay.Function([x], y)
-
-    def before7():
-        """
-        Remove trivial layout_transform->layout_transform.
-
-        Input:
-        NCHW4c -> NCHW8c -> NCHW4c -> op
-
-        Simplified:
-        NCHW4c -> op
-        """
-        x = relay.var("x", shape=(1, 32, 56, 56, 4), dtype="float32")
-        y = relay.layout_transform(x, "NCHW4c", "NCHW8c")
-        y = relay.layout_transform(y, "NCHW8c", "NCHW4c")
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def expected7():
-        x = relay.var("x", shape=(1, 32, 56, 56, 4), dtype="float32")
-        y = relay.nn.relu(x)
-        return relay.Function([x], y)
-
-    def before8():
-        """
-        Simplify layout_transform->layout_transform with rank contraction and expansion
-
-        Input:
-        NCHW4c -> NCHW -> NCHW8c -> op
-
-        Simplified:
-        NCHW4c -> NCHW8c -> op
-        """
-        x = relay.var("x", shape=(1, 32, 56, 56, 4), dtype="float32")
-        y = relay.layout_transform(x, "NCHW4c", "NCHW")
-        y = relay.layout_transform(y, "NCHW", "NCHW8c")
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def expected8():
-        x = relay.var("x", shape=(1, 32, 56, 56, 4), dtype="float32")
-        y = relay.layout_transform(x, "NCHW4c", "NCHW8c")
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def before9():
-        """
-        Remove trivial layout_transform->layout_transform.
-
-        Input:
-        NCHW -> NCHW4c -> NCHW -> op
-
-        Simplified:
-        NCHW -> op
-        """
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.layout_transform(x, "NCHW", "NCHW4c")
-        y = relay.layout_transform(y, "NCHW4c", "NCHW")
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def expected9():
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.nn.relu(x)
-        return relay.Function([x], y)
-
-    def before10():
-        """
-        Simplify layout_transform->layout_transform without rank change to transpose.
-
-        Input:
-        NCHW -> NHWC -> CHWN -> op
-
-        Simplified:
-        NCHW -> CHWN -> op
-        """
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.layout_transform(x, "NCHW", "NHWC")
-        y = relay.layout_transform(y, "NHWC", "CHWN")
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def expected10():
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.transpose(x, axes=[1, 2, 3, 0])
-        y = relay.nn.relu(y)
-        return relay.Function([x], y)
-
-    def before11():
-        """
-        Remove trivial no op transpose ops
-
-        Input:
-        op1 -> relay.transpose(x, axes=[0, 1, 2, 3]) -> op2
-
-        Simplified:
-        op1 -> op2
-        """
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.transpose(x, axes=[0, 1, 2, 3])
-        y = relay.nn.relu(y)
-        y = relay.layout_transform(y, "NCHW", "NCHW")
-        return relay.Function([x], y)
-
-    def expected11():
-        x = relay.var("x", shape=(1, 128, 56, 56), dtype="float32")
-        y = relay.nn.relu(x)
-        return relay.Function([x], y)
-
-    for before, expected in [
-        [before1(), expected1()],
-        [before2(), expected2()],
-        [before3(), expected3()],
-        [before4(), expected4()],
-        [before5(), expected5()],
-        [before6(), expected6()],
-        [before7(), expected7()],
-        [before8(), expected8()],
-        [before9(), expected9()],
-        [before10(), expected10()],
-        [before11(), expected11()],
-    ]:
-        after = run_opt_pass(before, transform.SimplifyExpr())
-        expected = run_opt_pass(expected, transform.InferType())
-        tvm.ir.assert_structural_equal(after, expected)
-
-
-def test_simplify_full_elementwise():
-    def validate(shape, value, dtype):
-        def before_left(x, elem_op, full):
-            return elem_op(full, x)
-
-        def after_left(x, elem_op, value):
-            if elem_op == relay.add and value == 0:
-                return x
-            elif elem_op == relay.multiply and (value == 1 or (value > 1 and dtype == "bool")):
-                return x
-            return elem_op(relay.const(value, dtype), x)
-
-        def before_right(x, elem_op, full):
-            return elem_op(x, full)
-
-        def after_right(x, elem_op, value):
-            if elem_op in [relay.add, relay.subtract] and value == 0:
-                return x
-            elif elem_op in [relay.multiply, relay.divide] and (
-                value == 1 or (value > 1 and dtype == "bool")
-            ):
-                return x
-            return elem_op(x, relay.const(value, dtype))
-
-        x = relay.var("x", shape=shape, dtype=dtype)
-        elem_ops = [relay.add, relay.multiply, relay.subtract, relay.divide]
-        full_ops = []
-        if value == 0:
-            full_ops.append(relay.zeros(shape, dtype))
-            full_ops.append(relay.zeros_like(x))
-        if value == 1:
-            full_ops.append(relay.ones(shape, dtype))
-            full_ops.append(relay.ones_like(x))
-        else:
-            full_ops.append(relay.full(relay.const(value, dtype), shape))
-            full_ops.append(relay.full_like(x, relay.const(value, dtype)))
-        for op in elem_ops:
-            for full in full_ops:
-                z = before_left(x, op, full)
-                zz = run_opt_pass(z, transform.SimplifyExpr())
-                after = run_opt_pass(after_left(x, op, value), transform.InferType())
-                tvm.ir.assert_structural_equal(zz, after)
-
-                z = before_right(x, op, full)
-                zz = run_opt_pass(z, transform.SimplifyExpr())
-                after = run_opt_pass(after_right(x, op, value), transform.InferType())
-                tvm.ir.assert_structural_equal(zz, after)
-
-        # Test the case in which x is broadcast to full's shape
-        full_ops = []
-        if value == 0:
-            full_ops.append(relay.zeros(shape * 2, dtype))
-        if value == 1:
-            full_ops.append(relay.ones(shape * 2, dtype))
-        else:
-            full_ops.append(relay.full(relay.const(value, dtype), shape * 2))
-        for op in elem_ops:
-            for full in full_ops:
-                z = before_left(x, op, full)
-                zz = run_opt_pass(z, transform.SimplifyExpr())
-                after = run_opt_pass(before_left(x, op, full), transform.InferType())
-                tvm.ir.assert_structural_equal(zz, after)
-
-                z = before_right(x, op, full)
-                zz = run_opt_pass(z, transform.SimplifyExpr())
-                after = run_opt_pass(before_right(x, op, full), transform.InferType())
-                tvm.ir.assert_structural_equal(zz, after)
-
-    for shape in [[10], [10, 10], [10, 10, 10]]:
-        for dtype in ["float32", "int32", "bool"]:
-            for value in [0, 1, 2]:
-                validate(shape, value, dtype)
-
-
-def test_eliminate_identity():
-    def check(x, y=None, do_nothing=False):
-        expected = run_infer_type(x)
-        if do_nothing:
-            actual = run_opt_pass(x, transform.SimplifyExpr())
-            tvm.ir.assert_structural_equal(actual, expected)
-        else:
-            assert y is not None
-            actual = run_opt_pass(y, transform.SimplifyExpr())
-            tvm.ir.assert_structural_equal(actual, expected)
-
-    shape = [2, 3, 4]
-    dtype = "float32"
-    x = relay.var("x", shape=shape, dtype=dtype)
-    x = run_opt_pass(x, transform.InferType())
-
-    for op, op_like, id_op, const in [
-        (relay.zeros, relay.zeros_like, relay.add, relay.const(0, dtype)),
-        (relay.ones, relay.ones_like, relay.multiply, relay.const(1, dtype)),
-    ]:
-        check(x, id_op(op_like(x), x))
-        check(x, id_op(op(shape, dtype), x))
-        check(x, id_op(const, x))
-        check(x, id_op(op(shape[1:], dtype), x))
-        check(x, id_op(x, op_like(x)))
-        check(x, id_op(x, op(shape, dtype)))
-        check(x, id_op(x, const))
-        check(x, id_op(x, op(shape[1:], dtype)))
-        check(id_op(x, op([2] + shape, dtype)), do_nothing=True)
-        check(id_op(op([2] + shape, dtype), x), do_nothing=True)
-
-    for op, op_like, id_op, const in [
-        (relay.zeros, relay.zeros_like, relay.subtract, relay.const(0, dtype)),
-        (relay.ones, relay.ones_like, relay.divide, relay.const(1, dtype)),
-    ]:
-        check(x, id_op(x, op_like(x)))
-        check(x, id_op(x, const))
-        check(x, id_op(x, op(shape, dtype)))
-        check(x, id_op(x, op(shape[1:], dtype)))
-        check(id_op(x, op([2] + shape, dtype)), do_nothing=True)
-        check(id_op(const, x), id_op(op(shape, dtype), x))
-        check(id_op(const, x), id_op(op_like(x), x))
-
-
-def test_simplify_same_cast():
-    dtype = "int32"
-    data = relay.var("data", shape=(3, 4, 5), dtype=dtype)
-    expr1 = relay.cast(data, dtype)
-    dtype_like = relay.var("dtype_like", shape=(2, 2, 2), dtype=dtype)
-    expr2 = relay.cast_like(data, dtype_like)
-
-    expected = run_infer_type(data)
-    actual1 = run_opt_pass(expr1, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual1, expected)
-    actual2 = run_opt_pass(expr2, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual2, expected)
-
-
-def test_simplify_consecutive_cast():
-    x = relay.var("x", shape=(3, 4, 5), dtype="int8")
-    y = relay.var("y", shape=(3, 4), dtype="int64")
-    z = relay.var("z", shape=(3,), dtype="float32")
-
-    expr1 = relay.cast(x, "int16")
-    expr2 = relay.cast(expr1, "int32")
-    expr3 = relay.cast_like(expr2, y)
-    expr4 = relay.cast_like(expr3, z)
-
-    actual1 = run_opt_pass(expr2, relay.transform.SimplifyExpr())
-    expected = run_infer_type(relay.cast(x, "int32"))
-    tvm.ir.assert_structural_equal(actual1, expected)
-    actual2 = run_opt_pass(expr3, relay.transform.SimplifyExpr())
-    expected = run_infer_type(relay.cast(x, "int64"))
-    tvm.ir.assert_structural_equal(actual2, expected)
-    actual3 = run_opt_pass(expr4, relay.transform.SimplifyExpr())
-    expected = run_infer_type(relay.cast(x, "float32"))
-    tvm.ir.assert_structural_equal(actual3, expected)
-
-    # cannot simplify the narrow cast
-    x = relay.var("x", shape=(3, 4, 5), dtype="float32")
-    y = relay.var("y", shape=(3, 4), dtype="float32")
-    expr1 = relay.cast(x, "int32")
-    expr2 = relay.cast_like(expr1, y)
-    actual = run_opt_pass(expr2, relay.transform.SimplifyExpr())
-    expected = run_infer_type(relay.cast(expr1, "float32"))
-    tvm.ir.assert_structural_equal(actual, expected)
-
-    x = relay.var("x", shape=(3, 4), dtype="int64")
-    expr1 = relay.cast(x, "bool")
-    expr2 = relay.cast(expr1, "int32")
-    actual = run_opt_pass(expr2, relay.transform.SimplifyExpr())
-    expected = run_infer_type(expr2)
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_reshape_like():
-    data = relay.var("data", shape=(2, 3, 4), dtype="float32")
-    shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32")
-    expr = relay.reshape_like(data, shape_like)
-
-    expected = run_infer_type(relay.reshape(data, (6, 2, 2)))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_reshape_like_attrs():
-    data = relay.var("data", shape=(2, 3, 4), dtype="float32")
-    shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32")
-    expr = relay.reshape_like(data, shape_like, lhs_begin=2, rhs_begin=1)
-
-    expected = run_infer_type(relay.reshape(data, (2, 3, 2, 2)))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_zeros_like():
-    dtype = "int32"
-    shape_like = relay.var("shape_like", shape=(3, 4, 5), dtype=dtype)
-    expr = relay.zeros_like(shape_like)
-
-    expected = run_infer_type(relay.zeros((3, 4, 5), dtype))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_ones_like():
-    dtype = "int32"
-    shape_like = relay.var("shape_like", shape=(3, 4, 5), dtype=dtype)
-    expr = relay.ones_like(shape_like)
-
-    expected = run_infer_type(relay.ones((3, 4, 5), dtype))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_full_like():
-    dtype = "int32"
-    shape_like = relay.var("shape_like", shape=(3, 4, 5), dtype=dtype)
-    fill_value = relay.var("fill", relay.TensorType((), "float32"))
-    expr = relay.full_like(shape_like, fill_value)
-
-    expected = run_infer_type(relay.full(fill_value, (3, 4, 5), dtype))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_collapse_sum_like():
-    data = relay.var("data", shape=(3, 3, 3), dtype="float32")
-    shape_like = relay.var("shape_like", shape=(3,), dtype="float32")
-    expr = relay.collapse_sum_like(data, shape_like)
-
-    expected = run_infer_type(relay.collapse_sum_to(data, (3,)))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_broadcast_to_like():
-    data = relay.var("data", shape=(3,), dtype="float32")
-    shape_like = relay.var("shape_like", shape=(3, 3, 3), dtype="float32")
-    expr = relay.broadcast_to_like(data, shape_like)
-
-    expected = run_infer_type(relay.broadcast_to(data, (3, 3, 3)))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_cast_like():
-    dim_any = tvm.tir.Any()
-    data = relay.var("data", shape=(3, dim_any, 5), dtype="float32")
-    dtype_like = relay.var("dtype_like", shape=(dim_any, 3, 3), dtype="int32")
-    expr = relay.cast_like(data, dtype_like)
-
-    expected = run_infer_type(relay.cast(data, "int32"))
-    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_concretize_multiple():
-    x = relay.var("x", shape=(2, 3), dtype="float32")
-    y = relay.var("y", shape=(3,), dtype="float32")
-    l = x + y
-
-    dl = relay.ones_like(l)
-    dx = relay.zeros_like(x)
-    dy = relay.zeros_like(y)
-    dx = dx + relay.collapse_sum_like(dl, dx)
-    dy = dy + relay.collapse_sum_like(dl, dy)
-    ret = relay.Tuple([dx, dy])
-
-    dl_c = relay.ones((2, 3), "float32")
-    # NOTE: these are removed by EliminateIdentity
-    # dx_c = relay.zeros((2, 3), "float32")
-    # dy_c = relay.zeros((3,), "float32")
-    dx_c = relay.collapse_sum_to(dl_c, (2, 3))
-    dy_c = relay.collapse_sum_to(dl_c, (3,))
-    ret_c = relay.Tuple([dx_c, dy_c])
-
-    expected = run_infer_type(ret_c)
-    actual = run_opt_pass(ret, relay.transform.SimplifyExpr())
-    tvm.ir.assert_structural_equal(actual, expected)
-
-
-def test_simplify_mul_add():
-    def check_simple_fold(origin_exprs, expect_expr):
-        for origin_expr in origin_exprs:
-            simple_expr = run_opt_pass(origin_expr, transform.SimplifyExpr())
-            tvm.ir.assert_structural_equal(simple_expr, expect_expr)
-
-    n = 32
-    c1_val = np.random.uniform(size=n).astype("float32")
-    c2_val = np.random.uniform(size=n).astype("float32")
-    c3_val = np.random.uniform(size=n).astype("float32")
-
-    x = relay.var("x", shape=(n,), dtype="float32")
-    c1 = relay.const(c1_val)
-    c2 = relay.const(c2_val)
-    c3 = relay.const(c3_val)
-
-    # add-add -> add
-    origin_exprs = [
-        x + c1 + c2,
-        c1 + x + c2,
-    ]
-    expect_expr = x + relay.const(c1_val + c2_val)
-    check_simple_fold(origin_exprs, expect_expr)
-
-    # mul-mul -> mul
-    origin_exprs = [
-        x * c1 * c2,
-        c1 * x * c2,
-    ]
-    expect_expr = x * relay.const(c1_val * c2_val)
-    check_simple_fold(origin_exprs, expect_expr)
-
-    # add-mul -> mul-add
-    origin_exprs = [
-        (x + c1) * c2,
-        (c1 + x) * c2,
-        c2 * (x + c1),
-        c2 * (c1 + x),
-    ]
-    expect_expr = x * c2 + relay.const(c1_val * c2_val)
-    check_simple_fold(origin_exprs, expect_expr)
-
-    # add-mul-add -> mul-add
-    origin_exprs = [
-        (x + c1) * c2 + c3,
-        (c1 + x) * c2 + c3,
-        c2 * (x + c1) + c3,
-        c2 * (c1 + x) + c3,
-        c3 + (x + c1) * c2,
-        c3 + (c1 + x) * c2,
-        c3 + c2 * (x + c1),
-        c3 + c2 * (c1 + x),
-    ]
-    expect_expr = x * c2 + relay.const(c1_val * c2_val + c3_val)
-    check_simple_fold(origin_exprs, expect_expr)
-
-    # mul-add-mul -> mul-add
-    origin_exprs = [
-        (x * c1 + c2) * c3,
-        (c1 * x + c2) * c3,
-        (c2 + x * c1) * c3,
-        (c2 + c1 * x) * c3,
-        c3 * (x * c1 + c2),
-        c3 * (c1 * x + c2),
-        c3 * (c2 + x * c1),
-        c3 * (c2 + c1 * x),
-    ]
-    expect_expr = x * relay.const(c1_val * c3_val) + relay.const(c2_val * c3_val)
-    check_simple_fold(origin_exprs, expect_expr)
-
-
-def test_simplify_rsqrt():
-    shape = (32, 1, 1)
-    x = relay.var("x", shape=shape, dtype="float32")
-
-    def before(c):
-        return relay.const(c) / relay.sqrt(x)
-
-    def expected(c):
-        if c == 1:
-            return relay.rsqrt(x)
-        else:
-            return relay.const(c) * relay.rsqrt(x)
-
-    for c in [1.0, 2.0, 2.5]:
-        opt = run_opt_pass(before(c), transform.SimplifyExpr())
-        after = run_opt_pass(expected(c), transform.InferType())
-        tvm.ir.assert_structural_equal(opt, after)
-
-
-def test_simplify_dq_argmax():
-    shape = (4, 32, 1, 1)
-    x = relay.var("x", shape=shape, dtype="int8")
-
-    def before():
-        y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
-        return relay.op.argmax(y, axis=1)
-
-    def expected():
-        return relay.op.argmax(x, axis=1)
-
-    opt = run_opt_pass(before(), transform.SimplifyExpr())
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(opt, after)
-
-
-def test_simplify_dq_argmin():
-    shape = (4, 32, 1, 1)
-    x = relay.var("x", shape=shape, dtype="int8")
-
-    def before():
-        y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
-        return relay.op.argmin(y, axis=1)
-
-    def expected():
-        return relay.op.argmin(x, axis=1)
-
-    opt = run_opt_pass(before(), transform.SimplifyExpr())
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(opt, after)
-
-
-def test_simplify_dq_argsort():
-    shape = (4, 32, 1, 1)
-    x = relay.var("x", shape=shape, dtype="int8")
-
-    def before():
-        y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
-        return relay.op.argsort(y, axis=1)
-
-    def expected():
-        return relay.op.argsort(x, axis=1)
-
-    opt = run_opt_pass(before(), transform.SimplifyExpr())
-    after = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(opt, after)
-
-
-def test_simplify_clip_cast():
-    def before1():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "uint8")
-        cast = relay.cast(cast, "int32")
-        return relay.Function([x], cast)
-
-    def expected1():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        return relay.Function([x], clip)
-
-    def before2():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "uint8")
-        cast = relay.cast(cast, "int32")
-        return relay.Function([x], cast)
-
-    def expected2():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        return relay.Function([x], clip)
-
-    def before3():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "uint8")
-        cast = relay.cast(cast, "int16")
-        cast = relay.cast(cast, "int32")
-        return relay.Function([x], cast)
-
-    def expected3():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        return relay.Function([x], clip)
-
-    def before4():
-        x = relay.var("x", shape=(4, 8), dtype="float32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "uint8")
-        cast = relay.cast(cast, "int16")
-        cast = relay.cast(cast, "int32")
-        return relay.Function([x], cast)
-
-    def expected4():
-        x = relay.var("x", shape=(4, 8), dtype="float32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "int32")
-        return relay.Function([x], cast)
-
-    def before5():
-        x = relay.var("x", shape=(4, 8), dtype="float32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "int8")
-        cast = relay.cast(cast, "int16")
-        cast = relay.cast(cast, "int32")
-        return relay.Function([x], cast)
-
-    def expected5():
-        x = relay.var("x", shape=(4, 8), dtype="float32")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        cast = relay.cast(clip, "int8")
-        cast = relay.cast(cast, "int32")
-        return relay.Function([x], cast)
-
-    for before, expected in [
-        [before1(), expected1()],
-        [before2(), expected2()],
-        [before3(), expected3()],
-        [before4(), expected4()],
-        [before5(), expected5()],
-    ]:
-        after = run_opt_pass(before, transform.SimplifyExpr())
-        expected = run_opt_pass(expected, transform.InferType())
-        tvm.ir.assert_structural_equal(after, expected)
-
-
-def test_simplify_cast_clip():
-    def before1():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        cast = relay.cast(x, "uint8")
-        clip = relay.clip(cast, a_min=0.0, a_max=255.0)
-        return relay.Function([x], clip)
-
-    def expected1():
-        x = relay.var("x", shape=(4, 8), dtype="int32")
-        cast = relay.cast(x, "uint8")
-        return relay.Function([x], cast)
-
-    def before2():
-        x = relay.var("x", shape=(4, 8), dtype="uint8")
-        clip = relay.clip(x, a_min=0.0, a_max=255.0)
-        return relay.Function([x], clip)
-
-    def expected2():
-        x = relay.var("x", shape=(4, 8), dtype="uint8")
-        return relay.Function([x], x)
-
-    def before3():
-        x = relay.var("x", shape=(4, 8), dtype="float32")
-        cast = relay.cast(x, "bfloat16")
-        clip = relay.clip(cast, a_min=-0.2, a_max=0.4)
-        return relay.Function([x], clip)
-
-    def expected3():
-        x = relay.var("x", shape=(4, 8), dtype="float32")
-        cast = relay.cast(x, "bfloat16")
-        clip = relay.clip(cast, a_min=-0.2, a_max=0.4)
-        return relay.Function([x], clip)
-
-    for before, expected in [
-        [before1(), expected1()],
-        [before2(), expected2()],
-        [before3(), expected3()],
-    ]:
-        after = run_opt_pass(before, transform.SimplifyExpr())
-        expected = run_opt_pass(expected, transform.InferType())
-        tvm.ir.assert_structural_equal(after, expected)
-
-
-def test_simplify_add():
-    x = relay.var("x", shape=(1, 3, 100, 100), dtype="float32")
-
-    def before():
-        return relay.add(x, x)
-
-    def expected():
-        s = relay.const(2.0)
-        return relay.multiply(x, s)
-
-    opt = run_opt_pass(before(), transform.SimplifyExpr())
-    ref = run_infer_type(expected())
-    tvm.ir.assert_structural_equal(opt, ref)
-
-
-def test_binomials():
-    def check_simple_fold(origin_exprs, expect_exprs):
-        for origin_expr in origin_exprs:
-            simple_expr = run_opt_pass(origin_expr, transform.SimplifyExpr())
-            match = False
-            for expected in expect_exprs:
-                e = run_opt_pass(expected, transform.EliminateCommonSubexpr())
-                match = match or tvm.ir.structural_equal(simple_expr, e)
-                if match:
-                    break
-            assert match
-
-    def gen_expected_expressions(x, y, a, b, c, dtype):
-        if c == 1 and a > 1:
-            swap = a
-            a = c
-            c = swap
-            swap = x
-            x = y
-            y = swap
-
-        det = b * b - 4 * a * c
-        if det < 0:
-            return gen_expressions(x, y, a, b, c)
-
-        p_val = (b + sqrt(det)) / (2 * a)
-        q_val = (b - sqrt(det)) / (2 * a)
-        p = relay.const(p_val, dtype=dtype)
-        q = relay.const(q_val, dtype=dtype)
-        first_exp = [x + y, y + x] if p_val == 1 else [x + p * y, p * y + x, x + y * p, y * p + x]
-        second_exp = [x + y, y + x] if q_val == 1 else [x + q * y, q * y + x, x + y * q, y * q + x]
-        final_exp = []
-        for f in first_exp:
-            for s in second_exp:
-                final_exp.append(f * s)
-                if not p_val == q_val:
-                    final_exp.append(s * f)
-        return final_exp
-
-    def gen_expressions(x, y, a, b, c):
-        first_exp = [x * x] if a == 1 else [a * x * x, x * a * x, x * x * a]
-        second_exp = (
-            [x * y, y * x]
-            if b == 1
-            else [b * x * y, x * b * y, x * y * b, b * y * x, y * b * x, y * x * b]
-        )
-        third_exp = [y * y] if c == 1 else [c * y * y, y * c * y, y * y * c]
-        final_exp = []
-        for f in first_exp:
-            for s in second_exp:
-                for t in third_exp:
-                    final_exp.append(f + s + t)
-                    final_exp.append(f + t + s)
-                    final_exp.append(s + f + t)
-                    final_exp.append(s + t + f)
-                    final_exp.append(t + f + s)
-                    final_exp.append(t + s + f)
-        return final_exp
-
-    n = 5
-    dtypes = ["int32", "float32", "float64"]
-    for dtype in dtypes:
-        x = relay.var("x", shape=(n,), dtype=dtype)
-        y = relay.var("y", shape=(n,), dtype=dtype)
-
-        a = relay.const(1, dtype=dtype)
-        b = relay.const(2, dtype=dtype)
-        c = relay.const(1, dtype=dtype)
-        origin_exprs = gen_expressions(x, y, a, b, c)
-        expect_expr = gen_expected_expressions(x, y, 1, 2, 1, dtype)
-        check_simple_fold(origin_exprs, expect_expr)
-
-        a = relay.const(6, dtype=dtype)
-        b = relay.const(5, dtype=dtype)
-        c = relay.const(1, dtype=dtype)
-        origin_exprs = gen_expressions(x, y, a, b, c)
-        expect_expr = gen_expected_expressions(x, y, 6, 5, 1, dtype)
-        check_simple_fold(origin_exprs, expect_expr)
-
-        a = relay.const(1, dtype=dtype)
-        b = relay.const(1, dtype=dtype)
-        c = relay.const(1, dtype=dtype)
-        origin_exprs = gen_expressions(x, y, a, b, c)
-        expect_expr = gen_expected_expressions(x, y, 1, 1, 1, dtype)
-        check_simple_fold(origin_exprs, expect_expr)
-
-        a = relay.const(1, dtype=dtype)
-        b = relay.const(4, dtype=dtype)
-        c = relay.const(4, dtype=dtype)
-        origin_exprs = gen_expressions(x, y, a, b, c)
-        expect_expr = gen_expected_expressions(x, y, 1, 4, 4, dtype)
-        check_simple_fold(origin_exprs, expect_expr)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_simplify_inference.py b/tests/python/relay/test_pass_simplify_inference.py
deleted file mode 100644
index 42df54e5d2e7..000000000000
--- a/tests/python/relay/test_pass_simplify_inference.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from tvm.ir import IRModule, assert_structural_equal
-from tvm import relay as rly
-from tvm.relay.transform import SimplifyInference, InferType
-
-
-def test_simplify_batchnorm(dtype="float32"):
-    def simple_bn(x, gamma, beta, moving_mean, moving_var, axis=1, epsilon=1e-5, shape=None):
-        # expect = (x - moving_mean) / sqrt(moving_var + eps) * gamma + beta
-        scale = rly.multiply(
-            rly.const(1, dtype) / rly.sqrt(moving_var + rly.const(epsilon, dtype)), gamma
-        )
-        shift = rly.add(rly.multiply(rly.negative(moving_mean), scale), beta)
-        num_newaxis = len(shape) - (axis + 1)
-        if num_newaxis:
-            scale = rly.expand_dims(scale, axis=1, num_newaxis=num_newaxis)
-            shift = rly.expand_dims(shift, axis=1, num_newaxis=num_newaxis)
-        return x * scale + shift
-
-    def check(dim, axis, nstep):
-        eps = 0.01
-        ttype1 = rly.TensorType(tuple(10 for i in range(dim)), dtype)
-        ttype2 = rly.TensorType((10,), dtype)
-        x = rly.var("x", ttype1)
-        beta = rly.var("beta", ttype2)
-        gamma = rly.var("gamma", ttype2)
-        moving_var = rly.var("moving_var", ttype2)
-        moving_mean = rly.var("moving_mean", ttype2)
-        y1, y2 = x, x
-
-        for _ in range(nstep):
-            y1, _, _ = rly.nn.batch_norm(
-                y1 + rly.const(1, dtype),
-                gamma,
-                beta,
-                moving_mean,
-                moving_var,
-                epsilon=eps,
-                axis=axis,
-            )
-            y1 = rly.nn.dropout(y1)
-            y2 = simple_bn(
-                y2 + rly.const(1, dtype),
-                gamma,
-                beta,
-                moving_mean,
-                moving_var,
-                epsilon=eps,
-                axis=axis,
-                shape=ttype1.shape,
-            )
-
-        mod = IRModule.from_expr(y1)
-
-        simplify = SimplifyInference()
-        mod = InferType()(mod)
-        mod = simplify(mod)
-        y1 = mod["main"].body
-
-        assert_structural_equal(y1, y2, map_free_vars=True)
-
-    check(2, 1, 1)
-    check(4, 1, 1)
-    check(4, 0, 3)
-
-
-if __name__ == "__main__":
-    test_simplify_batchnorm(dtype="float32")
-    test_simplify_batchnorm(dtype="float16")
diff --git a/tests/python/relay/test_pass_split_args.py b/tests/python/relay/test_pass_split_args.py
deleted file mode 100644
index 04a3c5af1cd9..000000000000
--- a/tests/python/relay/test_pass_split_args.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.relay import transform
-from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.testing import run_infer_type, create_workload
-
-
-target_name = tvm.testing.parameter("opencl", "metal", "cuda")
-shape_type = tvm.testing.parameter("dynamic", "static")
-
-
-def run_opt_pass(expr, opt_pass):
-    assert isinstance(opt_pass, tvm.transform.Pass)
-
-    mod = tvm.IRModule.from_expr(expr)
-    mod = relay.transform.InferType()(mod)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def test_split_concat(target_name, shape_type):
-    if shape_type == "dynamic":
-        shape = (tvm.tir.Any(), 1, 1, 3)
-        number_of_any_dims = 1
-    else:
-        shape = (1, 1, 1, 3)
-        number_of_any_dims = 0
-    ndims = len(shape)
-    dtype = "float32"
-    axis = 1
-    tensors_num = 300
-    inputs = []
-    for i in range(tensors_num):
-        inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
-
-    def before():
-        inp = relay.Tuple(inputs)
-        return relay.op.concatenate(inp, axis)
-
-    def expected(limit):
-        if limit == 0:
-            return before()
-        limit = limit - 1  # one buffer with output
-        if number_of_any_dims > 0:
-            limit -= ndims
-
-        new_args = []
-        added_args = 0
-        num_inputs = 0
-        for inp in inputs:
-            curr_args = 1 + number_of_any_dims
-            if number_of_any_dims > 0:
-                curr_args += ndims
-            num_inputs += curr_args
-            if added_args + curr_args > limit:
-                t = relay.Tuple(new_args)
-                stop = relay.annotation.stop_fusion(t)
-                concat = relay.op.concatenate(stop, axis)
-                new_args = [concat]
-                added_args = curr_args
-            added_args += curr_args
-            new_args.append(inp)
-        t = relay.Tuple(new_args)
-        stop = relay.annotation.stop_fusion(t)
-        concat = relay.op.concatenate(stop, axis)
-
-        if num_inputs < limit:
-            return before()
-
-        return concat
-
-    # the fold constant should work on any context.
-    limit = tvm.target.Target(target_name).max_function_args
-    res = run_opt_pass(before(), transform.SplitArgs(limit))
-    exp = run_opt_pass(expected(limit), transform.InferType())
-    tvm.ir.assert_structural_equal(res, exp)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_to_a_normal_form.py b/tests/python/relay/test_pass_to_a_normal_form.py
deleted file mode 100644
index 873124ebf13a..000000000000
--- a/tests/python/relay/test_pass_to_a_normal_form.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import sys
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import relay
-from tvm.relay.analysis import detect_feature
-from tvm.relay import op, create_executor, transform
-from tvm.relay.prelude import Prelude
-from tvm.relay.testing import count
-from tvm.relay.analysis import Feature
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def check_eval(expr, expected_result, mod=None, rtol=1e-07):
-    dev = tvm.device("llvm", 0)
-    result = create_executor(mod=mod, device=dev, target="llvm").evaluate(expr)
-    np.testing.assert_allclose(result.numpy(), expected_result, rtol=rtol)
-
-
-def test_explicit_bound():
-    x = relay.const(1)
-    y = op.add(x, x)
-    z = op.add(y, y)
-    f = relay.Function([], op.add(z, z))
-    assert not Feature.fLet in detect_feature(f)
-    anf = run_opt_pass(f, transform.ToANormalForm())
-    assert Feature.fLet in detect_feature(anf)
-    check_eval(f(), 8.0)
-    check_eval(anf(), 8.0)
-
-
-# test that the construction order does not matter,
-# and is instead ordered by the scope and by post-dfs ordering.
-def test_order():
-    z = relay.const(3)
-    y = relay.const(2)
-    x = relay.const(1)
-    val = x + y * z
-    check_eval(val, 7.0)
-    anf = run_opt_pass(val, [transform.ToANormalForm(), transform.InferType()])
-    a = relay.Var("a", relay.IncompleteType())
-    b = relay.Var("b", relay.IncompleteType())
-    c = relay.Var("c", relay.IncompleteType())
-    d = relay.Var("d", relay.IncompleteType())
-    e = relay.Var("e", relay.IncompleteType())
-    expected_output = e
-    expected_output = relay.Let(e, a + d, expected_output)
-    expected_output = relay.Let(d, b * c, expected_output)
-    expected_output = relay.Let(c, z, expected_output)
-    expected_output = relay.Let(b, y, expected_output)
-    expected_output = relay.Let(a, x, expected_output)
-    expected_output = run_opt_pass(expected_output, transform.InferType())
-    tvm.ir.assert_structural_equal(anf, expected_output)
-
-
-def test_if():
-    cond = relay.const(True)
-    x = relay.If(cond, relay.const(2), relay.const(3))
-    anf = run_opt_pass(x, [transform.ToANormalForm(), transform.InferType()])
-    a = relay.Var("a", relay.IncompleteType())
-    b = relay.Var("b", relay.IncompleteType())
-    c = relay.Var("c", relay.IncompleteType())
-    d = relay.Var("d", relay.IncompleteType())
-    true_branch = relay.Let(a, relay.const(2), a)
-    false_branch = relay.Let(b, relay.const(3), b)
-    expected_output = relay.If(c, true_branch, false_branch)
-    expected_output = relay.Let(d, expected_output, d)
-    expected_output = relay.Let(c, cond, expected_output)
-    expected_output = run_opt_pass(expected_output, transform.InferType())
-    tvm.ir.assert_structural_equal(anf, expected_output)
-
-
-def test_let_as_subexpr():
-    def on_cpu(x):
-        return relay.annotation.on_device(x, tvm.device("cpu"), constrain_result=True)
-
-    x = relay.Var("x", relay.IncompleteType())
-    c = relay.const(1)
-    l = relay.Let(x, on_cpu(c + c), x)
-    body = l * l
-
-    anf = run_opt_pass(body, [transform.ToANormalForm(), transform.InferType()])
-
-    v0 = relay.Var("v0", relay.IncompleteType())
-    v1 = relay.Var("v1", relay.IncompleteType())
-    v2 = relay.Var("v2", relay.IncompleteType())
-    expected_output = relay.Let(
-        v0,
-        on_cpu(c),
-        relay.Let(
-            x,
-            on_cpu(v0 + v0),
-            relay.Let(v1, x, relay.Let(v2, v1 * v1, v2)),
-        ),
-    )
-    expected_output = run_opt_pass(expected_output, transform.InferType())
-
-    tvm.ir.assert_structural_equal(anf, expected_output)
-
-
-# make sure we dont infinite loop.
-# it is too large so we wont check for the exact program.
-def test_recursion():
-    """
-    Program:
-       let f(n: i32) -> i32 = {
-          m = (n * 2)
-          if (n == 0) {
-              return m;
-          } else {
-              return m + f(n - 1);
-          }
-       }
-       f(5);
-    """
-    mod = tvm.IRModule()
-    i64 = relay.TensorType((), "int64")
-    f = relay.GlobalVar("f")
-    n = relay.Var("n", i64)
-    m = n * relay.const(2, "int64")
-    funcbody = relay.If(
-        relay.equal(n, relay.const(0, "int64")), m, m + f(n - relay.const(1, "int64"))
-    )
-    value = relay.Function([n], funcbody, i64, [])
-    mod[f] = value
-    check_eval(f(relay.const(5, "int64")), 30.0, mod=mod)
-    old_f = mod[f]
-    mod = transform.ToANormalForm()(mod)
-    f = mod[f]
-    check_eval(f(relay.const(5, "int64")), 30.0, mod=mod)
-
-
-def test_ref():
-    i = relay.Var("i")
-    iv = relay.Var("iv")
-    u = relay.Var("u")
-    uv = relay.Var("uv")
-    body = relay.add(iv, uv)
-    body = relay.Let(uv, relay.RefRead(i), body)
-    body = relay.Let(u, relay.RefWrite(i, relay.const(2)), body)
-    body = relay.Let(iv, relay.RefRead(i), body)
-    body = relay.Let(i, relay.RefCreate(relay.const(1)), body)
-    check_eval(body, 3)
-    opt_body = run_opt_pass(body, transform.ToANormalForm())
-    check_eval(opt_body, 3)
-
-
-def test_nat_add():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, z, s = p.mod.get_type("nat")
-    add = p.mod.get_global_var("nat_add")
-    dev = tvm.device("llvm", 0)
-    intrp = create_executor(mod=mod, device=dev, target="llvm")
-    # CAUTION: Following calls to intrp.evaluate(...) will re-prepare the prelude.
-    assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat())
-    assert count(p, intrp.evaluate(add(s(z()), s(z())))) == 2
-    expr = add(s(z()), s(z()))
-    f = relay.GlobalVar("f")
-    mod[f] = relay.Function([], expr)
-    mod = transform.ToANormalForm()(mod)
-    expr = mod["f"]
-    assert count(p, intrp.evaluate(expr.body)) == 2
-    assert Feature.fLet in detect_feature(mod[add])
-
-
-def test_let():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    d = relay.const(4.0, "float32")
-    body = relay.Let(y, x, x + y)
-    body = relay.Let(x, d, body)
-    check_eval(body, 8)
-    opt_body = run_opt_pass(body, transform.ToANormalForm())
-    check_eval(opt_body, 8)
-
-
-def test_function():
-    t = relay.TensorType((), "float32")
-    x = relay.Var("x", t)
-    f = relay.Function([x], x + x)
-    d = relay.const(4.0, "float32")
-    anf_f = run_opt_pass(f, transform.ToANormalForm())
-    assert isinstance(anf_f, relay.Function)
-    check_eval(f(d), 8)
-    check_eval(anf_f(d), 8)
-
-
-def test_gradient_if():
-    x = relay.var("a", shape=(1, 16))
-    y = relay.var("y", shape=(1, 16))
-    cond = relay.var("cond", shape=(), dtype="uint1")
-    net = relay.If(cond, x, x)
-    net = relay.add(x, net)
-    net = relay.Function([cond, x, y], net)
-    mod = tvm.IRModule.from_expr(net)
-    mod = relay.transform.ToANormalForm()(mod)
-    mod = relay.transform.InferType()(mod)
-    mod["main"] = relay.transform.gradient(mod["main"], mode="higher_order")
-    mod = relay.transform.ToANormalForm()(mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_to_basic_block_normal_form.py b/tests/python/relay/test_pass_to_basic_block_normal_form.py
deleted file mode 100644
index 5c852e970190..000000000000
--- a/tests/python/relay/test_pass_to_basic_block_normal_form.py
+++ /dev/null
@@ -1,517 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import numpy as np
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.analysis import detect_feature
-from tvm.relay import op, create_executor, transform
-from tvm.relay.prelude import Prelude
-from tvm.relay.testing import count, create_workload
-from tvm.relay.analysis import Feature
-from tvm.relay.analysis import check_basic_block_normal_form
-
-
-def run_opt_pass(expr, passes):
-    passes = passes if isinstance(passes, list) else [passes]
-    mod = tvm.IRModule.from_expr(expr)
-    seq = tvm.transform.Sequential(passes)
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def check_eval(expr, expected_result, mod=None, rtol=1e-07):
-    dev = tvm.device("llvm", 0)
-    result = create_executor(mod=mod, device=dev, target="llvm").evaluate(expr)
-    np.testing.assert_allclose(result.numpy(), expected_result, rtol=rtol)
-
-
-def test_no_explicit_bind():
-    x = relay.const(1)
-    y = op.add(x, x)
-    z = op.add(y, y)
-    f = relay.Function([], op.add(z, z))
-    """
-    fn () {
-      %0 = add(1, 1);
-      %1 = add(%0, %0);
-      add(%1, %1)
-    }
-    """
-    assert not Feature.fLet in detect_feature(f)
-    bblock = run_opt_pass(f, transform.ToBasicBlockNormalForm())
-    assert Feature.fLet not in detect_feature(bblock)
-    check_eval(f(), 8.0)
-    check_eval(bblock(), 8.0)
-    check_basic_block_normal_form(bblock)
-
-
-def test_top_level_nested_if():
-    x = relay.var("x", shape=(), dtype="bool")
-    y = relay.var("y", shape=(), dtype="float32")
-    z = relay.var("z", shape=(), dtype="float32")
-    cond_t = relay.const(True)
-    cond_f = relay.const(False)
-    one = relay.const(1, dtype="float32")
-    three = relay.const(3, dtype="float32")
-    y2 = relay.add(y, y)
-    z2 = relay.add(z, z)
-    true_branch = relay.If(cond_t, relay.add(z2, y2), relay.add(three, y2))
-    false_branch = relay.If(cond_f, z2, one)
-    body = relay.If(x, true_branch, false_branch)
-    """
-    free_var %x: bool
-    if (%x) {
-      if (True) {
-        free_var %z: float32
-        %0 = add(%z, %z);
-        free_var %y: float32
-        %1 = add(%y, %y);
-        add(%0, %1)
-      } else {
-        add(3f, %1)
-      }
-    } else {
-      if (False) {
-        %0
-      } else {
-        1f
-      }
-    }
-    """
-
-    def expected():
-        x = relay.var("x", shape=(), dtype="bool")
-        y = relay.var("y", shape=(), dtype="float32")
-        z = relay.var("z", shape=(), dtype="float32")
-        cond_t = relay.const(True)
-        cond_f = relay.const(False)
-        one = relay.const(1, dtype="float32")
-        three = relay.const(3, dtype="float32")
-        y2 = relay.var("y2")
-        z2 = relay.var("z2")
-        true_branch = relay.If(cond_t, relay.add(z2, y2), relay.add(three, y2))
-        true_branch = relay.Let(y2, relay.add(y, y), true_branch)
-        false_branch = relay.If(cond_f, z2, one)
-        body = relay.If(x, true_branch, false_branch)
-        body = relay.Let(z2, relay.add(z, z), body)
-        return body
-
-    bblock = run_opt_pass(body, [transform.ToBasicBlockNormalForm(), transform.InferType()])
-    """
-    free_var %z: float32
-    let %x: float32 = add(%z, %z) /* ty=float32 */;
-    free_var %x1: bool
-    if (%x1) {
-      free_var %y: float32
-      let %x2: float32 = add(%y, %y) /* ty=float32 */;
-      if (True /* ty=bool */) {
-        add(%x, %x2) /* ty=float32 */
-      } else {
-        add(3f /* ty=float32 */, %x2) /* ty=float32 */
-      }
-    } else {
-      if (False /* ty=bool */) {
-        %x
-      } else {
-        1f /* ty=float32 */
-      }
-    }
-    """
-    expected_output = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(bblock, expected_output, map_free_vars=True)
-
-
-def test_nested_if():
-    x = relay.var("x", shape=(), dtype="bool")
-    y = relay.var("y", shape=(), dtype="float32")
-    cond_t = relay.const(True)
-    cond_f = relay.const(False)
-    one = relay.const(1, dtype="float32")
-    two = relay.const(2, dtype="float32")
-    three = relay.const(3, dtype="float32")
-    y2 = relay.add(y, y)
-    true_branch = relay.If(cond_t, y2, relay.add(three, y2))
-    false_branch = relay.If(cond_f, two, one)
-    body = relay.If(x, true_branch, false_branch)
-    """
-    free_var %x: bool
-    if (%x) {
-      if (True) {
-        free_var %y: float32
-        %0 = add(%y, %y);
-        %0
-      } else {
-        add(3f, %0)
-      }
-    } else {
-      if (False) {
-        2f
-      } else {
-        1f
-      }
-    }
-    """
-
-    def expected():
-        x = relay.var("x", shape=(), dtype="bool")
-        y = relay.var("y", shape=(), dtype="float32")
-        cond_t = relay.const(True)
-        cond_f = relay.const(False)
-        one = relay.const(1, dtype="float32")
-        two = relay.const(2, dtype="float32")
-        three = relay.const(3, dtype="float32")
-        y2 = relay.var("y2")
-        true_branch = relay.If(cond_t, y2, relay.add(three, y2))
-        true_branch = relay.Let(y2, relay.add(y, y), true_branch)
-        false_branch = relay.If(cond_f, two, one)
-        body = relay.If(x, true_branch, false_branch)
-        return body
-
-    bblock = run_opt_pass(body, [transform.ToBasicBlockNormalForm(), transform.InferType()])
-    """
-    free_var %x: bool
-    if (%x) {
-      free_var %y: float32
-      let %x1: float32 = add(%y, %y) /* ty=float32 */;
-      if (True /* ty=bool */) {
-        %x1
-      } else {
-        add(3f /* ty=float32 */, %x1) /* ty=float32 */
-      }
-    } else {
-      if (False /* ty=bool */) {
-        2f /* ty=float32 */
-      } else {
-        1f /* ty=float32 */
-      }
-    }
-    """
-    expected_output = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(bblock, expected_output, map_free_vars=True)
-    check_basic_block_normal_form(bblock)
-
-
-# make sure we do not infinite loop.
-# it is too large so we won't check for the exact program.
-def test_recursion():
-    """
-    Program:
-       let f(n: i32) -> i32 = {
-          m = (n * 2)
-          if (n == 0) {
-              return m;
-          } else {
-              return m + f(n - 1);
-          }
-       }
-       f(5);
-    """
-    mod = tvm.IRModule()
-    i64 = relay.TensorType((), "int64")
-    f = relay.GlobalVar("f")
-    n = relay.Var("n", i64)
-    m = n * relay.const(2, "int64")
-    cond = relay.equal(n, relay.const(0, "int64"))
-    false_branch = m + f(n - relay.const(1, "int64"))
-    funcbody = relay.If(cond, m, false_branch)
-    value = relay.Function([n], funcbody, i64, [])
-    mod[f] = value
-    check_eval(f(relay.const(5, "int64")), 30.0, mod=mod)
-    old_f = mod[f]
-    mod = transform.ToBasicBlockNormalForm()(mod)
-    f = mod[f]
-    check_eval(f(relay.const(5, "int64")), 30.0, mod=mod)
-    check_basic_block_normal_form(f)
-
-
-def test_ref():
-    i = relay.Var("i")
-    iv = relay.Var("iv")
-    u = relay.Var("u")
-    uv = relay.Var("uv")
-    body = relay.add(iv, uv)
-    body = relay.Let(uv, relay.RefRead(i), body)
-    body = relay.Let(u, relay.RefWrite(i, relay.const(2)), body)
-    body = relay.Let(iv, relay.RefRead(i), body)
-    body = relay.Let(i, relay.RefCreate(relay.const(1)), body)
-    check_eval(body, 3)
-    opt_body = run_opt_pass(body, transform.ToBasicBlockNormalForm())
-    check_eval(opt_body, 3)
-    check_basic_block_normal_form(opt_body)
-
-
-def test_nat_add():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat, z, s = p.mod.get_type("nat")
-    add = p.mod.get_global_var("nat_add")
-    dev = tvm.device("llvm", 0)
-    assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat())
-    assert (
-        count(p, create_executor(mod=mod, device=dev, target="llvm").evaluate(add(s(z()), s(z()))))
-        == 2
-    )
-    expr = add(s(z()), s(z()))
-    f = relay.GlobalVar("f")
-    mod[f] = relay.Function([], expr)
-    mod = transform.InferType()(mod)
-    mod = transform.ToBasicBlockNormalForm()(mod)
-    opt_expr = mod["f"]
-    assert (
-        count(p, create_executor(mod=mod, device=dev, target="llvm").evaluate(opt_expr.body)) == 2
-    )
-    assert not Feature.fLet in detect_feature(mod[add])
-    check_basic_block_normal_form(opt_expr)
-
-
-def test_let():
-    def test_let1():
-        x = relay.Var("x")
-        c = relay.const(4.0, "float32")
-        body = relay.Let(x, c, x)
-        body = run_opt_pass(body, transform.InferType())
-        """
-        let %x: float32 = 4f /* ty=float32 */;
-        %x
-        """
-        opt_body = run_opt_pass(body, transform.ToBasicBlockNormalForm())
-        tvm.ir.assert_structural_equal(body, opt_body)
-        check_basic_block_normal_form(opt_body)
-
-    def test_let1_1():
-        x = relay.Var("y")
-        d = relay.const(4.0, "float32")
-        body = relay.Let(x, d, relay.add(x, x))
-        body = run_opt_pass(body, transform.InferType())
-        opt_body = run_opt_pass(body, transform.ToBasicBlockNormalForm())
-        tvm.ir.assert_structural_equal(body, opt_body)
-        check_basic_block_normal_form(opt_body)
-
-    def test_let2():
-        x = relay.Var("x")
-        y = relay.Var("y")
-        d = relay.const(4.0, "float32")
-        body = relay.Let(y, x, x)
-        body = relay.Let(x, d, body)
-        body = run_opt_pass(body, transform.InferType())
-        check_eval(body, 4)
-
-        def expected():
-            x = relay.Var("x")
-            y = relay.Var("y")
-            d = relay.const(4.0, "float32")
-            body = relay.Let(y, x, y)
-            body = relay.Let(x, d, body)
-            return body
-
-        opt_body = run_opt_pass(body, transform.ToBasicBlockNormalForm())
-        expected_body = run_opt_pass(expected(), transform.InferType())
-        tvm.ir.assert_structural_equal(opt_body, expected_body)
-        check_basic_block_normal_form(opt_body)
-
-    def test_let3():
-        x = relay.Var("x")
-        y = relay.Var("y")
-        z = relay.Var("z")
-        c = relay.const(3.0, "float32")
-        d = relay.const(4.0, "float32")
-        body = relay.Let(z, x + y, x + z)
-        body = relay.Let(x, d, body)
-        body = relay.Let(y, c, body)
-        body = run_opt_pass(body, transform.InferType())
-        opt_body = run_opt_pass(body, transform.ToBasicBlockNormalForm())
-        tvm.ir.assert_structural_equal(body, opt_body)
-        check_basic_block_normal_form(opt_body)
-
-    test_let1()
-    test_let1_1()
-    test_let2()
-    test_let3()
-
-
-def test_function():
-    t = relay.TensorType((), "float32")
-    x = relay.Var("x", t)
-    f = relay.Function([x], x + x)
-    d = relay.const(4.0, "float32")
-    bblock = run_opt_pass(f, transform.ToBasicBlockNormalForm())
-    assert isinstance(bblock, relay.Function)
-    check_eval(f(d), 8)
-    check_eval(bblock(d), 8)
-    check_basic_block_normal_form(bblock)
-
-
-def test_gradient_if():
-    x = relay.var("a", shape=(1, 16))
-    y = relay.var("y", shape=(1, 16))
-    cond = relay.var("cond", shape=(), dtype="uint1")
-    net = relay.If(cond, x, x)
-    net = relay.add(x, net)
-    net = relay.Function([cond, x, y], net)
-    mod = tvm.IRModule.from_expr(net)
-    mod = relay.transform.ToBasicBlockNormalForm()(mod)
-    mod = relay.transform.InferType()(mod)
-    net_grad = relay.transform.gradient(mod["main"], mode="higher_order")
-    mod["main"] = net_grad
-    mod_grad = relay.transform.ToBasicBlockNormalForm()(mod)
-    check_basic_block_normal_form(mod_grad["main"])
-    check_basic_block_normal_form(mod["main"])
-
-
-def test_if():
-    def if_expr(x):
-        """
-        free_var %x: float32
-        %0 = equal(%x, 2f);
-        if (%0) {
-          %1 = add(%x, 1f);
-          multiply(%1, 2f)
-        } else {
-          multiply(%1, 1f)
-        }
-        """
-        one = relay.const(1, dtype="float32")
-        two = relay.const(2, dtype="float32")
-        v1 = relay.add(x, one)
-        v2 = relay.equal(x, two)
-        true_branch = relay.multiply(v1, two)
-        false_branch = relay.multiply(v1, one)
-        body = relay.If(v2, true_branch, false_branch)
-        return body
-
-    def expected_if_expr(x):
-        """
-        free_var %x: float32
-        let %v1: float32 = add(%x, 1f /* ty=float32 */) /* ty=float32 */;
-        %0 = equal(%x, 2f /* ty=float32 */) /* ty=bool */;
-        if (%0) {
-          multiply(%v1, 2f /* ty=float32 */) /* ty=float32 */
-        } else {
-          multiply(%v1, 1f /* ty=float32 */) /* ty=float32 */
-        }
-        """
-        one = relay.const(1, dtype="float32")
-        two = relay.const(2, dtype="float32")
-        v1 = relay.var("v1")
-        v2 = relay.equal(x, two)
-        true_branch = relay.multiply(v1, two)
-        false_branch = relay.multiply(v1, one)
-        body = relay.If(v2, true_branch, false_branch)
-        body = relay.Let(v1, relay.add(x, one), body)
-        return body
-
-    x = relay.var("x", shape=(), dtype="float32")
-    body = if_expr(x)
-    expected_body = expected_if_expr(x)
-    bblock = run_opt_pass(body, [transform.ToBasicBlockNormalForm(), transform.InferType()])
-    expected_bblock = run_opt_pass(expected_body, transform.InferType())
-    tvm.ir.assert_structural_equal(bblock, expected_bblock, map_free_vars=True)
-    check_basic_block_normal_form(bblock)
-
-    func = relay.Function([x], body)
-    expected_func = relay.Function([x], expected_body)
-    bblock = run_opt_pass(func, [transform.ToBasicBlockNormalForm(), transform.InferType()])
-    expected_bblock = run_opt_pass(expected_func, transform.InferType())
-    tvm.ir.assert_structural_equal(bblock, expected_bblock)
-    check_basic_block_normal_form(bblock)
-
-
-def test_higher_order_return():
-    x = relay.var("x", shape=(1,), dtype="float32")  # , a)
-    y = relay.var("y", shape=(1,), dtype="float32")  # , a)
-    z = relay.var("z", shape=(1,), dtype="float32")  # , a)
-    x2 = relay.add(x, x)
-    func_a = relay.Function([y], relay.add(x2, y))  # , a, [a])
-    func_b = relay.Function([z], relay.add(x2, z))  # , a, [a])
-    body = relay.Tuple([func_a, func_b])
-    body = relay.Function([x], body)
-    """
-    fn (%x: Tensor[(1), float32]) {
-      %1 = fn (%y: Tensor[(1), float32]) {
-        %0 = add(%x, %x);
-        add(%0, %y)
-      };
-      %2 = fn (%z: Tensor[(1), float32]) {
-        add(%0, %z)
-      };
-      (%1, %2)
-    }
-    """
-
-    bblock = run_opt_pass(body, transform.ToBasicBlockNormalForm())
-    check_basic_block_normal_form(bblock)
-
-
-def test_higher_order_nested():
-    x = relay.var("x", dtype="float32", shape=(1,))
-    s = relay.var("s", dtype="float32", shape=(1,))
-    shared = relay.add(s, s)
-    func_true = relay.Function([x], relay.add(x, shared))
-    choice_t = relay.FuncType([], relay.scalar_type("bool"))
-    f = relay.Var("f", choice_t)
-    z = relay.Var("z")
-    body = relay.If(f(), func_true, relay.Function([z], relay.add(z, shared)))
-    top = relay.Function([f, s], body)
-    """
-    fn (%f: fn () -> bool, %s: Tensor[(1), float32]) {
-      %0 = %f();
-      if (%0) {
-        fn (%x: Tensor[(1), float32]) {
-          %1 = add(%s, %s);
-          add(%x, %1)
-        }
-      } else {
-        fn (%z) {
-          add(%z, %1)
-        }
-      }
-    }
-    """
-
-    bblock = run_opt_pass(top, transform.ToBasicBlockNormalForm())
-    check_basic_block_normal_form(bblock)
-
-
-def test_immutability():
-    simple_net = relay.nn.conv2d(
-        data=relay.var("data", relay.TensorType((1, 3, 224, 224), "float32")),
-        weight=relay.var("weight"),
-        kernel_size=(5, 5),
-        channels=3,
-        padding=(1, 1),
-    )
-    simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
-    mod, _ = create_workload(simple_net)
-
-    old_mod = mod
-
-    with tvm.transform.PassContext(opt_level=4):
-        with tvm.target.Target("llvm"):
-            seq = tvm.transform.Sequential(passes=[transform.ToBasicBlockNormalForm()], opt_level=4)
-            new_mod = seq(mod)
-
-    assert old_mod.astext() == mod.astext()
-    assert old_mod.astext() != new_mod.astext()
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_to_cps.py b/tests/python/relay/test_pass_to_cps.py
deleted file mode 100644
index d0a29aff7749..000000000000
--- a/tests/python/relay/test_pass_to_cps.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.analysis import detect_feature
-from tvm.relay.transform import to_cps, un_cps
-from tvm.relay.analysis import Feature
-from tvm.relay.prelude import Prelude
-from tvm.relay.testing import make_nat_expr, rand, run_infer_type, run_opt_pass
-from tvm.relay import create_executor
-from tvm.relay import transform
-
-
-def test_id():
-    x = relay.var("x", shape=[])
-    id = run_infer_type(relay.Function([x], x))
-    id_cps = run_infer_type(to_cps(id))
-
-
-def test_double():
-    t = relay.TypeVar("t")
-    x = relay.var("x", t)
-    f = relay.var("f", relay.FuncType([t], t))
-    double = run_infer_type(relay.Function([f, x], f(f(x)), t, [t]))
-    double_cps = run_infer_type(to_cps(double))
-
-
-# make sure cps work for recursion.
-def test_recursion():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    p.mod.import_from_std("nat.rly")
-    nat_iterate = p.mod.get_global_var("nat_iterate")
-    shape = (10, 10)
-    dtype = "float32"
-    t = relay.TensorType(shape, dtype)
-    x = relay.var("x", t)
-    double = relay.Function([x], x + x)
-    i = relay.var("i", t)
-    func = relay.Function([i], nat_iterate(double, make_nat_expr(p, 3))(i))
-    mod["main"] = func
-    mod = relay.transform.InferType()(mod)
-    mod["main"] = to_cps(mod["main"], mod=mod)
-    mod = relay.transform.InferType()(mod)
-    mod["main"] = un_cps(mod["main"])
-    i_nd = rand(dtype, *shape)
-    forward = create_executor(mod=mod).evaluate()(i_nd)
-    tvm.testing.assert_allclose(forward.numpy(), 8 * i_nd.numpy())
-
-
-# This serve as an integration test.
-# It test that, given a program with reference,
-# cps and pe can completely eliminate the allocation of reference.
-def test_cps_pe():
-    def destroy_ref(x):
-        x = run_infer_type(x)
-        x = to_cps(x)
-        x = run_infer_type(x)
-        y = un_cps(x)
-        y = run_infer_type(y)
-        # TODO(mbs): Revisit once DCE can eliminate dead writes.
-        x = run_opt_pass(
-            x,
-            tvm.transform.Sequential(
-                [
-                    transform.PartialEvaluate(),
-                    transform.InferType(),
-                    transform.DeadCodeElimination(inline_once=True, ignore_impurity=True),
-                ]
-            ),
-        )
-        assert Feature.fRefCreate not in detect_feature(x)
-
-    unit = relay.Function([], relay.const(0.0, dtype="float32"))
-    f_ref = relay.Var("f_ref")
-
-    one = relay.const(1.0, dtype="float32")
-    two = relay.const(2.0, dtype="float32")
-    cond = relay.var(shape=(), dtype="uint1", name_hint="cond")
-    true_branch = relay.RefWrite(f_ref, relay.Function([], one))
-    false_branch = relay.RefWrite(f_ref, relay.Function([], two))
-    if_expr = relay.If(cond, true_branch, false_branch)
-
-    stmt = relay.Let(
-        f_ref,
-        relay.RefCreate(unit),
-        relay.Let(relay.Var("x"), if_expr, relay.Call(relay.RefRead(f_ref), [])),
-    )
-
-    F = relay.Function([cond], stmt)
-    destroy_ref(F)
-
-    G = relay.Function([cond], relay.If(cond, one, two))
-    G = run_infer_type(G)
-    G = relay.transform.gradient(G)
-    destroy_ref(G)
-
-    x = relay.var("x", shape=(1, 16))
-    y = relay.var("y", shape=(1, 16))
-    z = relay.var("z", shape=(1, 16))
-    cond = relay.var("cond", shape=(), dtype="uint1")
-    H = relay.If(cond, x, y)
-    H = relay.add(H, z)
-    H = relay.Function([cond, x, y, z], H)
-    H = run_infer_type(H)
-    H = relay.transform.gradient(H)
-    destroy_ref(H)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_to_graph_normal_form.py b/tests/python/relay/test_pass_to_graph_normal_form.py
deleted file mode 100644
index 6a8c99d076e4..000000000000
--- a/tests/python/relay/test_pass_to_graph_normal_form.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-from tvm import relay
-from tvm.relay import op, create_executor, transform
-from tvm.relay.analysis import Feature
-from tvm.relay.analysis import detect_feature
-
-
-def run_opt_pass(expr, opt_pass):
-    mod = tvm.IRModule.from_expr(expr)
-    mod = opt_pass(mod)
-    entry = mod["main"]
-    return entry if isinstance(expr, relay.Function) else entry.body
-
-
-def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
-    if mod is None:
-        mod = tvm.IRModule()
-
-    dev = tvm.device("llvm", 0)
-    result = create_executor(mod=mod, device=dev, target="llvm").evaluate(expr)(*args)
-    np.testing.assert_allclose(result.numpy(), expected_result, rtol=rtol)
-
-
-def test_implicit_share():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    z = relay.Var("z")
-    body = relay.Let(z, op.add(y, y), op.add(z, z))
-    body = relay.Let(y, op.add(x, x), body)
-    f = relay.Function([], relay.Let(x, relay.const(1), body))
-    g = run_opt_pass(f, transform.ToGraphNormalForm())
-    assert Feature.fLet in detect_feature(f)
-    assert not Feature.fLet in detect_feature(g)
-    check_eval(f, [], 8.0)
-    check_eval(g, [], 8.0)
-
-
-def test_round_trip():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    z = relay.Var("z")
-    body = relay.Let(z, op.add(y, y), op.add(z, z))
-    body = relay.Let(y, op.add(x, x), body)
-    f = relay.Function([], relay.Let(x, relay.const(1), body))
-    g = run_opt_pass(f, transform.ToGraphNormalForm())
-    h = run_opt_pass(g, transform.ToANormalForm())
-    assert Feature.fLet in detect_feature(f)
-    assert not Feature.fLet in detect_feature(g)
-    check_eval(f, [], 8.0)
-    check_eval(g, [], 8.0)
-    check_eval(h, [], 8.0)
-
-
-if __name__ == "__main__":
-    test_implicit_share()
-    test_round_trip()
diff --git a/tests/python/relay/test_pass_unmatched_cases.py b/tests/python/relay/test_pass_unmatched_cases.py
deleted file mode 100644
index 528dc4b6826e..000000000000
--- a/tests/python/relay/test_pass_unmatched_cases.py
+++ /dev/null
@@ -1,470 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.prelude import Prelude
-from tvm.relay.analysis import unmatched_cases
-import pytest
-
-
-def test_empty_match_block():
-    # empty match block will not match anything, so it should return a wildcard pattern
-    v = relay.Var("v")
-    match = relay.Match(v, [])
-
-    unmatched = unmatched_cases(match)
-    assert len(unmatched) == 1
-    assert isinstance(unmatched[0], relay.PatternWildcard)
-
-
-def test_trivial_matches():
-    # a match clause with a wildcard will match anything
-    v = relay.Var("v")
-    match = relay.Match(v, [relay.Clause(relay.PatternWildcard(), v)])
-    assert len(unmatched_cases(match)) == 0
-
-    # same with a pattern var
-    w = relay.Var("w")
-    match = relay.Match(v, [relay.Clause(relay.PatternVar(w), w)])
-    assert len(unmatched_cases(match)) == 0
-
-
-def test_single_constructor_adt():
-    mod = tvm.IRModule()
-    box = relay.GlobalTypeVar("box")
-    a = relay.TypeVar("a")
-    box_ctor = relay.Constructor("box", [a], box)
-    box_data = relay.TypeData(box, [a], [box_ctor])
-    mod[box] = box_data
-
-    v = relay.Var("v")
-    match = relay.Match(
-        v, [relay.Clause(relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]), v)]
-    )
-
-    # with one constructor, having one pattern constructor case is exhaustive
-    assert len(unmatched_cases(match, mod)) == 0
-
-    # this will be so if we nest the constructors too
-    nested_pattern = relay.Match(
-        v,
-        [
-            relay.Clause(
-                relay.PatternConstructor(
-                    box_ctor,
-                    [
-                        relay.PatternConstructor(
-                            box_ctor,
-                            [relay.PatternConstructor(box_ctor, [relay.PatternWildcard()])],
-                        )
-                    ],
-                ),
-                v,
-            )
-        ],
-    )
-    assert len(unmatched_cases(nested_pattern, mod)) == 0
-
-
-def test_too_specific_match():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = mod.get_type("List")
-
-    v = relay.Var("v")
-    match = relay.Match(
-        v,
-        [
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternWildcard(),
-                        relay.PatternConstructor(
-                            cons, [relay.PatternWildcard(), relay.PatternWildcard()]
-                        ),
-                    ],
-                ),
-                v,
-            )
-        ],
-    )
-
-    unmatched = unmatched_cases(match, mod)
-
-    # will not match nil or a list of length 1
-    nil_found = False
-    single_length_found = False
-    assert len(unmatched) == 2
-    for case in unmatched:
-        assert isinstance(case, relay.PatternConstructor)
-        if case.constructor == nil:
-            nil_found = True
-        if case.constructor == cons:
-            assert isinstance(case.patterns[1], relay.PatternConstructor)
-            assert case.patterns[1].constructor == nil
-            single_length_found = True
-    assert nil_found and single_length_found
-
-    # if we add a wildcard, this should work
-    new_match = relay.Match(
-        v,
-        [
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternWildcard(),
-                        relay.PatternConstructor(
-                            cons, [relay.PatternWildcard(), relay.PatternWildcard()]
-                        ),
-                    ],
-                ),
-                v,
-            ),
-            relay.Clause(relay.PatternWildcard(), v),
-        ],
-    )
-    assert len(unmatched_cases(new_match, mod)) == 0
-
-
-def test_multiple_constructor_clauses():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = mod.get_type("List")
-
-    v = relay.Var("v")
-    match = relay.Match(
-        v,
-        [
-            # list of length exactly 1
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons, [relay.PatternWildcard(), relay.PatternConstructor(nil, [])]
-                ),
-                v,
-            ),
-            # list of length exactly 2
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternWildcard(),
-                        relay.PatternConstructor(
-                            cons, [relay.PatternWildcard(), relay.PatternConstructor(nil, [])]
-                        ),
-                    ],
-                ),
-                v,
-            ),
-            # empty list
-            relay.Clause(relay.PatternConstructor(nil, []), v),
-            # list of length 2 or more
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternWildcard(),
-                        relay.PatternConstructor(
-                            cons, [relay.PatternWildcard(), relay.PatternWildcard()]
-                        ),
-                    ],
-                ),
-                v,
-            ),
-        ],
-    )
-    assert len(unmatched_cases(match, mod)) == 0
-
-
-def test_missing_in_the_middle():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = mod.get_type("List")
-
-    v = relay.Var("v")
-    match = relay.Match(
-        v,
-        [
-            # list of length exactly 1
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons, [relay.PatternWildcard(), relay.PatternConstructor(nil, [])]
-                ),
-                v,
-            ),
-            # empty list
-            relay.Clause(relay.PatternConstructor(nil, []), v),
-            # list of length 3 or more
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternWildcard(),
-                        relay.PatternConstructor(
-                            cons,
-                            [
-                                relay.PatternWildcard(),
-                                relay.PatternConstructor(
-                                    cons, [relay.PatternWildcard(), relay.PatternWildcard()]
-                                ),
-                            ],
-                        ),
-                    ],
-                ),
-                v,
-            ),
-        ],
-    )
-
-    # fails to match a list of length exactly two
-    unmatched = unmatched_cases(match, mod)
-    assert len(unmatched) == 1
-    assert isinstance(unmatched[0], relay.PatternConstructor)
-    assert unmatched[0].constructor == cons
-    assert isinstance(unmatched[0].patterns[1], relay.PatternConstructor)
-    assert unmatched[0].patterns[1].constructor == cons
-    assert isinstance(unmatched[0].patterns[1].patterns[1], relay.PatternConstructor)
-    assert unmatched[0].patterns[1].patterns[1].constructor == nil
-
-
-def test_mixed_adt_constructors():
-    mod = tvm.IRModule()
-    box = relay.GlobalTypeVar("box")
-    a = relay.TypeVar("a")
-    box_ctor = relay.Constructor("box", [a], box)
-    box_data = relay.TypeData(box, [a], [box_ctor])
-    mod[box] = box_data
-
-    p = Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-
-    v = relay.Var("v")
-    box_of_lists_inc = relay.Match(
-        v,
-        [
-            relay.Clause(
-                relay.PatternConstructor(
-                    box_ctor,
-                    [
-                        relay.PatternConstructor(
-                            cons, [relay.PatternWildcard(), relay.PatternWildcard()]
-                        )
-                    ],
-                ),
-                v,
-            )
-        ],
-    )
-
-    # will fail to match a box containing an empty list
-    unmatched = unmatched_cases(box_of_lists_inc, mod)
-    assert len(unmatched) == 1
-    assert isinstance(unmatched[0], relay.PatternConstructor)
-    assert unmatched[0].constructor == box_ctor
-    assert len(unmatched[0].patterns) == 1 and unmatched[0].patterns[0].constructor == nil
-
-    box_of_lists_comp = relay.Match(
-        v,
-        [
-            relay.Clause(
-                relay.PatternConstructor(box_ctor, [relay.PatternConstructor(nil, [])]), v
-            ),
-            relay.Clause(
-                relay.PatternConstructor(
-                    box_ctor,
-                    [
-                        relay.PatternConstructor(
-                            cons, [relay.PatternWildcard(), relay.PatternWildcard()]
-                        )
-                    ],
-                ),
-                v,
-            ),
-        ],
-    )
-    assert len(unmatched_cases(box_of_lists_comp, mod)) == 0
-
-    list_of_boxes_inc = relay.Match(
-        v,
-        [
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]),
-                        relay.PatternWildcard(),
-                    ],
-                ),
-                v,
-            )
-        ],
-    )
-
-    # fails to match empty list of boxes
-    unmatched = unmatched_cases(list_of_boxes_inc, mod)
-    assert len(unmatched) == 1
-    assert isinstance(unmatched[0], relay.PatternConstructor)
-    assert unmatched[0].constructor == nil
-
-    list_of_boxes_comp = relay.Match(
-        v,
-        [
-            # exactly one box
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]),
-                        relay.PatternConstructor(nil, []),
-                    ],
-                ),
-                v,
-            ),
-            # exactly two boxes
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]),
-                        relay.PatternConstructor(
-                            cons,
-                            [
-                                relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]),
-                                relay.PatternConstructor(nil, []),
-                            ],
-                        ),
-                    ],
-                ),
-                v,
-            ),
-            # exactly three boxes
-            relay.Clause(
-                relay.PatternConstructor(
-                    cons,
-                    [
-                        relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]),
-                        relay.PatternConstructor(
-                            cons,
-                            [
-                                relay.PatternConstructor(box_ctor, [relay.PatternWildcard()]),
-                                relay.PatternConstructor(
-                                    cons,
-                                    [
-                                        relay.PatternConstructor(
-                                            box_ctor, [relay.PatternWildcard()]
-                                        ),
-                                        relay.PatternConstructor(nil, []),
-                                    ],
-                                ),
-                            ],
-                        ),
-                    ],
-                ),
-                v,
-            ),
-            # one or more boxes
-            relay.Clause(
-                relay.PatternConstructor(cons, [relay.PatternWildcard(), relay.PatternWildcard()]),
-                v,
-            ),
-            # no boxes
-            relay.Clause(relay.PatternConstructor(nil, []), v),
-        ],
-    )
-    assert len(unmatched_cases(list_of_boxes_comp, mod)) == 0
-
-
-def test_tuple_match():
-    a = relay.Var("a")
-    b = relay.Var("b")
-    clause = relay.Clause(relay.PatternTuple([relay.PatternVar(a), relay.PatternVar(b)]), a + b)
-    x = relay.Match(relay.Tuple([relay.const(1), relay.const(1)]), [clause])
-    assert len(unmatched_cases(x)) == 0
-
-
-def test_inf_loop_case():
-    code = """
-#[version = "0.0.5"]
-type Arith[A] {
-    Zero,
-    Const(A),
-    Plus(Arith[A], Arith[A])
-}
-
-def @shallow_opt[A](%a: Arith[A]) -> Arith[A] {
-    match (%a) {
-        Plus(Zero, %r) => %r,
-        Plus(%l, Zero) => %l,
-        _ => %a
-    }
-}
-"""
-    tvm.relay.fromtext(code)
-    # fromtext parse the module, then checked it (which include strictness checking).
-
-
-def test_expanding_ctor_with_no_args():
-    code = """
-#[version = "0.0.5"]
-type List[A] {
-    Cons(A, List[A]),
-    Nil,
-}
-
-def @expand_on_nil_match(%a: List[(List[()],)]) -> int {
-    match (%a) {
-        Cons((Nil), Nil) => 1,
-        _ => 2,
-    }
-}
-"""
-    # exhausion checks:
-    # * hits Cons((Nil), Nil), expands to Cons(*, *), Nil()
-    # Nil() fails Cons((Nil), Nil), passes _
-    # Cons(*, *) hits Cons((Nil), Nil), expands to Cons((*), Cons(*, *)), Cons((*), Nil())
-    # Cons((*), Cons(*, *)) fails Cons((Nil), Nil), passes _
-    # Cons((*), Nil()) hits Cons((Nil), Nil), expands to Cons((Nil), Nil), Cons((Cons(*, *)), Nil)
-    # Cons((Nil), Nil) passes the first pattern
-    # Cons((Cons(*, *)), Nil) fails the first pattern, passes _
-    # Note Nil() is passed to ExpandWildcardsConstructor many times in the above!
-    tvm.relay.fromtext(code)
-
-
-def test_expanding_empty_tuple():
-    # same principle as above, but with empty tuple
-    code = """
-#[version = "0.0.5"]
-type List[A] {
-    Cons(A, List[A]),
-    Nil,
-}
-
-def @expand_on_empty_tuple_match(%a: (List[()], ())) -> int {
-    match (%a) {
-        (Cons((), Nil), ()) => 1,
-        _ => 2,
-    }
-}
-"""
-    tvm.relay.fromtext(code)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_pass_vars.py b/tests/python/relay/test_pass_vars.py
deleted file mode 100644
index d823f6ea4bff..000000000000
--- a/tests/python/relay/test_pass_vars.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay.analysis import (
-    free_vars,
-    free_type_vars,
-    bound_vars,
-    bound_type_vars,
-    all_vars,
-    all_type_vars,
-)
-
-
-def assert_vars_match(actual, expected):
-    assert len(actual) == len(expected)
-    for i in range(len(actual)):
-        assert actual[i] == expected[i]
-
-
-def test_free_vars():
-    ty = relay.TensorType([], "int32")
-    x = relay.Var("x", ty)
-    fvx = free_vars(x)
-    assert len(fvx) == 1
-    assert fvx[0] == x
-    v = relay.Constant(tvm.nd.array(10))
-
-    let = relay.Let(x, v, x)
-    fvx = free_vars(let)
-    assert len(free_vars(let)) == 0
-    f = relay.Function([x], x, ty)
-    assert len(free_vars(f)) == 0
-
-
-def test_free_vars_tuple():
-    t = relay.Var("t")
-    fv = free_vars(relay.Tuple([t, t]))
-    assert len(fv) == 1
-    assert fv[0] == t
-    fv = free_vars(relay.TupleGetItem(t, 123))
-    assert len(fv) == 1
-    assert fv[0] == t
-
-
-def test_free_type_vars():
-    tp = relay.TypeVar("")
-    ty = relay.TupleType([tp, relay.TensorType([], "int32")])
-    x = relay.Var("x", ty)
-    y = relay.Var("y")
-    let = relay.Let(x, y, x)
-    fvl = free_vars(let)
-    assert len(fvl) == 1
-    assert fvl[0] == y
-    ftvl = free_type_vars(let)
-    assert len(ftvl) == 1
-    assert ftvl[0] == tp
-
-
-def test_bound_vars():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    z = relay.Var("z")
-    a = relay.Var("a")
-
-    f1 = relay.Function([x, y, z], relay.Let(a, x, relay.Tuple([])))
-    assert_vars_match(bound_vars(f1), [x, y, z, a])
-
-    tup = relay.Tuple([x, y, z, a])
-    assert len(bound_vars(tup)) == 0
-
-    f2 = relay.Function([x, y], relay.Tuple([x, y, z, a]))
-    assert_vars_match(bound_vars(f2), [x, y])
-
-
-def test_match_vars():
-    mod = tvm.IRModule()
-    p = relay.prelude.Prelude(mod)
-    rlist, cons, nil = p.mod.get_type("List")
-
-    x = relay.Var("x")
-    y = relay.Var("y")
-    z = relay.Var("z")
-
-    match1 = relay.Match(
-        nil(),
-        [
-            relay.Clause(relay.PatternConstructor(nil), z),
-            relay.Clause(
-                relay.PatternConstructor(cons, [relay.PatternVar(x), relay.PatternVar(y)]),
-                cons(x, y),
-            ),
-        ],
-    )
-
-    match2 = relay.Match(
-        nil(),
-        [
-            relay.Clause(
-                relay.PatternConstructor(cons, [relay.PatternWildcard(), relay.PatternVar(x)]), y
-            ),
-            relay.Clause(relay.PatternWildcard(), z),
-        ],
-    )
-
-    assert_vars_match(bound_vars(match1), [x, y])
-    assert_vars_match(free_vars(match1), [z])
-    assert_vars_match(all_vars(match1), [z, x, y])
-
-    assert_vars_match(bound_vars(match2), [x])
-    assert_vars_match(free_vars(match2), [y, z])
-    assert_vars_match(all_vars(match2), [x, y, z])
-
-
-def test_bound_type_vars():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    c = relay.TypeVar("c")
-
-    ft1 = relay.FuncType([a], b, [a, b])
-    bound_ft1 = bound_type_vars(ft1)
-    assert_vars_match(bound_type_vars(ft1), [a, b])
-
-    ft2 = relay.FuncType([], c, [a])
-    assert_vars_match(bound_type_vars(ft2), [a])
-
-    tup_ty = relay.TupleType([a, b, c])
-    assert len(bound_type_vars(tup_ty)) == 0
-
-    f1 = relay.Function([], relay.Tuple([]), type_params=[a, b])
-    assert_vars_match(bound_type_vars(f1), [a, b])
-
-    f2 = relay.Function([], relay.Tuple([]), c)
-    assert len(bound_type_vars(f2)) == 0
-
-    x = relay.Var("x", a)
-    let1 = relay.Let(x, relay.Tuple([]), x)
-    assert len(bound_type_vars(let1)) == 0
-
-    let2 = relay.Let(x, relay.Function([], relay.Tuple([]), type_params=[b, c]), x)
-    assert_vars_match(bound_type_vars(let2), [b, c])
-
-
-def test_all_vars():
-    x = relay.Var("x")
-    y = relay.Var("y")
-    z = relay.Var("z")
-
-    f1 = relay.Function([x, y], z)
-    assert_vars_match(all_vars(f1), [x, y, z])
-
-    f2 = relay.Function([x], relay.Let(y, relay.Tuple([]), z))
-    assert_vars_match(all_vars(f2), [x, y, z])
-
-    f3 = relay.Function([x], relay.Tuple([y, z]))
-    assert_vars_match(all_vars(f3), [x, y, z])
-
-    tup = relay.Tuple([x, y, z])
-    assert_vars_match(all_vars(tup), [x, y, z])
-
-
-def test_all_type_vars():
-    a = relay.TypeVar("a")
-    b = relay.TypeVar("b")
-    c = relay.TypeVar("c")
-
-    ft1 = relay.FuncType([b], c, [a])
-    assert_vars_match(all_type_vars(ft1), [a, b, c])
-
-    ft2 = relay.FuncType([], relay.TupleType([a, b, c]), [])
-    assert_vars_match(all_type_vars(ft2), [a, b, c])
-
-    w = relay.Var("w")
-    x = relay.Var("x", a)
-    y = relay.Var("y", b)
-    z = relay.Var("z", c)
-
-    f1 = relay.Function([x], y, b, [a])
-    assert_vars_match(all_type_vars(f1), [a, b])
-
-    f2 = relay.Function([x], relay.Let(y, x, z))
-    assert_vars_match(all_type_vars(f2), [a, b, c])
-
-    f3 = relay.Function([], relay.Tuple([x, y, z]), ret_type=relay.TupleType([a, b, c]))
-    assert_vars_match(all_type_vars(f3), [a, b, c])
-
-    f4 = relay.Function([w], relay.Tuple([]), type_params=[a, b, c])
-    assert_vars_match(all_type_vars(f4), [a, b, c])
-
-    f5 = relay.Function([w], w)
-    assert len(all_type_vars(f5)) == 0
diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
deleted file mode 100644
index 3d71fdfc1d94..000000000000
--- a/tests/python/relay/test_pipeline_executor.py
+++ /dev/null
@@ -1,627 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import os
-import time
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay import transform, build_module
-from tvm.relay.testing import run_opt_pass
-from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
-from tvm._ffi import get_global_func
-from tvm.contrib import cc as _cc
-
-
-def graph_split(expr, split_conf, params=None):
-    """Splitting the graph into a list of subgraphs"""
-
-    def get_dep_var(sub_var_dep):
-        return [var for var in sub_var_dep[len(sub_var_dep) - 1]["ref_nodes"]]
-
-    def parse_dependency(value, snode_dep, new_input_idx):
-        new_args = []
-        need_update = False
-        for var in value.args:
-            is_free_var = False
-            for dep in snode_dep[:-1]:
-                if var in dep["nodes"]:
-                    # Mark the previous subgraph node as a dependency.
-                    dep["nodes"][var] += 1
-                    dep["ref_nodes"][var] = dep["nodes"][var]
-                    # The var of this call is a free_var
-                    is_free_var = True
-            # if the var of this call is a free_var, recreate it and give it a fixed input name.
-            if is_free_var:
-                need_update = True
-                new_args.append(relay.var(f"data_n_{new_input_idx}", var.checked_type))
-                new_input_idx += 1
-            else:
-                new_args.append(var)
-        # if the 'tvm.relay.expr.Call' has a free_var, recreate it with new name as 'data_n_*'.
-        if need_update:
-            value = tvm.relay.expr.Call(
-                value.op, new_args, value.attrs, value.type_args, value.span
-            )
-        return value, snode_dep, new_input_idx
-
-    def merge_constant_expr(constant_expr, expr):
-        # merge constant express with a express
-        if not isinstance(constant_expr.body, tvm.relay.expr.Let):
-            return tvm.relay.expr.Let(constant_expr.var, constant_expr.value, expr)
-
-        return tvm.relay.expr.Let(
-            constant_expr.var, constant_expr.value, merge_constant_expr(constant_expr.body, expr)
-        )
-
-    def _recursion(anf, pipeline_mods, split_conf, constant_expr):
-        # Enumurate all operators of compute graph, then split the compute graph into a group of
-        # subgraph.
-        nonlocal operator_index_map
-        nonlocal new_input_idx
-        nonlocal snode_dep
-        cur_node_dep = snode_dep[len(snode_dep) - 1]
-        if isinstance(anf, tvm.relay.Function):
-            return tvm.relay.Function(
-                anf.params,
-                _recursion(anf.body, pipeline_mods, split_conf, constant_expr),
-                anf.ret_type,
-                anf.type_params,
-                anf.attrs,
-            )
-        if isinstance(anf, tvm.relay.expr.Let):
-            value = anf.value
-            # record the constant expr to make sure all sugraphs can find correct constant.
-            if isinstance(value, tvm.relay.expr.Constant):
-                if not constant_expr:
-                    constant_expr = tvm.relay.expr.Let(anf.var, value, anf.var)
-                else:
-                    constant_expr = tvm.relay.expr.Let(anf.var, value, constant_expr)
-            if isinstance(value, tvm.relay.expr.Call):
-                new_args = []
-                # build current var list
-                cur_node_dep["nodes"][anf.var] = 0
-                # Get the dependency information of the nodes.
-                value, snode_dep, new_input_idx = parse_dependency(value, snode_dep, new_input_idx)
-                if isinstance(value.op, tvm.ir.Op):
-                    if value.op.name in operator_index_map:
-                        operator_index_map[value.op.name] += 1
-                    else:
-                        operator_index_map[value.op.name] = 0
-                    split_operator_name = split_conf[0]["op_name"] if split_conf else ""
-                    split_operator_index = split_conf[0]["op_index"] if split_conf else ""
-                    # if a operator name and repeating count in the network match with the values
-                    # of the 'split configuration', then this place is where we should do the
-                    # graph splitting.
-                    if (
-                        split_conf
-                        and split_operator_name in operator_index_map
-                        and operator_index_map[split_operator_name] >= split_operator_index
-                    ):
-                        # Do graph splitting.
-                        split_conf.pop(0)
-                        snode_dep.append({"nodes": {}, "ref_nodes": {}})
-                        ann = _recursion(
-                            anf.body,
-                            pipeline_mods,
-                            split_conf,
-                            constant_expr,
-                        )
-                        snode_dep.pop()
-                        dep_vars = get_dep_var(snode_dep)
-                        # When the nodes of the current subgraph are the depedency node of another
-                        # subgraph, we need to set them as the output of current subgraph.
-                        body = relay.Tuple(dep_vars) if len(dep_vars) > 1 else anf.var
-                        # when the operator of current subgraph uses previous subgraph constant
-                        # as the argument of a "relay.expr.call", such constant may become a free
-                        # varaible if the constant does not exist in the current subgraph.
-                        # merge the previous constant with current subgraph to avoid such issue.
-                        if constant_expr:
-                            ann = merge_constant_expr(constant_expr, ann)
-                        ann = run_opt_pass(ann, transform.ToGraphNormalForm())
-                        mod = tvm.IRModule.from_expr(ann)
-                        pipeline_mods.insert(0, mod)
-                        # Return the last node of the current subgraph.
-                        return tvm.relay.expr.Let(anf.var, value, body)
-            return tvm.relay.expr.Let(
-                anf.var,
-                value,
-                _recursion(anf.body, pipeline_mods, split_conf, constant_expr),
-            )
-        else:
-            return anf
-
-    snode_dep = [{"nodes": {}, "ref_nodes": {}}]
-    pipeline_mods = []
-    operator_index_map = {}
-    # Used to tracking new input which caused by graph splitting.
-    new_input_idx = 0
-    constant_expr = None
-    subgraph_split_conf = split_conf.copy()
-    # Binding the parameters.
-    if params:
-        expr = build_module.bind_params_by_name(expr, params)
-    anf = run_opt_pass(expr, transform.ToANormalForm())
-    anf = run_opt_pass(anf, transform.InferType())
-    ann = _recursion(
-        anf,
-        pipeline_mods,
-        subgraph_split_conf,
-        constant_expr,
-    )
-    ann = run_opt_pass(ann.body, transform.ToGraphNormalForm())
-    mod = tvm.IRModule.from_expr(ann)
-    pipeline_mods.insert(0, mod)
-    return pipeline_mods
-
-
-def get_network():
-    # Get a list of modules representing subgraphs.
-    mods = []
-    dshape = (3, 3)
-    data = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    data21 = relay.var("data_1", relay.TensorType(dshape, "float32"))
-    data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32"))
-    data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    mvalue1 = np.full((1), 1).astype("float32")
-    mvalue2 = np.full((1), 2).astype("float32")
-    mvalue3 = np.full((1), 3).astype("float32")
-    mv1 = relay.Constant(tvm.nd.array(mvalue1))
-    mv2 = relay.Constant(tvm.nd.array(mvalue2))
-    mv3 = relay.Constant(tvm.nd.array(mvalue3))
-    # There are three outputs in the first model.
-    net1_output1 = relay.add(data, mv1)
-    net1_output2 = relay.subtract(data, mv2)
-    net1_output3 = relay.concatenate((net1_output1, net1_output2), axis=0)
-    (net1_output3, _) = relay.split(net1_output3, indices_or_sections=2, axis=0)
-    net1_output3 = relay.add(net1_output3, mv2)
-    # The second model uses the output named net1_output3 of the first model as the first input,
-    # the second input of the second model is data21.
-    net2 = relay.add(net1_output3, mv2)
-    net2 = relay.add(net2, data21)
-    net2_output = relay.add(net2, mv3)
-    # The third model uses the output named net2_output of the second model as the first input
-    # and uses the output named net1_output2 of the first model as the second input.
-    net3 = relay.multiply(net2_output, mv3)
-    net3 = relay.add(net3, net1_output2)
-    return tvm.IRModule.from_expr(relay.Function([data, data21], relay.Tuple([net3]))), dshape
-
-
-def get_split_mod():
-    mod, dshape = get_network()
-    split_conf = [{"op_name": "add", "op_index": 1}, {"op_name": "add", "op_index": 4}]
-    mods = graph_split(mod["main"], split_conf)
-    return mods, dshape
-
-
-def get_mannual_mod():
-    # Get a list of modules representing subgraphs.
-    mods = []
-    dshape = (3, 3)
-    data = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    data21 = relay.var("data_1", relay.TensorType(dshape, "float32"))
-    data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32"))
-    data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    mvalue1 = np.full((1), 1).astype("float32")
-    mvalue2 = np.full((1), 2).astype("float32")
-    mvalue3 = np.full((1), 3).astype("float32")
-    mv1 = relay.Constant(tvm.nd.array(mvalue1))
-    mv2 = relay.Constant(tvm.nd.array(mvalue2))
-    mv3 = relay.Constant(tvm.nd.array(mvalue3))
-
-    # There are three outputs in the first model.
-
-    net1_output1 = relay.add(data, mv1)
-    net1_output2 = relay.subtract(data, mv2)
-    net1_output3 = relay.multiply(data, mv3)
-
-    # The second model use output named net1_output1 of the first model as the first input,
-    # the second input of the second model is data21.
-    net2 = relay.add(data_net1_output_1, mv2)
-    net2 = relay.add(net2, data21)
-    net2_output = relay.add(net2, mv3)
-
-    # The third model use the output named net2_output of the second model as the first input
-    # and use the output named net1_output2 of the first model as the second input.
-    net3 = relay.multiply(data_net2_output_1, mv3)
-    net3 = relay.add(net3, data_net1_output_2)
-
-    mods.append(
-        tvm.IRModule.from_expr(
-            relay.Function([data], relay.Tuple([net1_output1, net1_output2, net1_output3]))
-        )
-    )
-    mods.append(tvm.IRModule.from_expr(relay.Function([data_net1_output_1, data21], net2_output)))
-    mods.append(
-        tvm.IRModule.from_expr(relay.Function([data_net1_output_2, data_net2_output_1], net3))
-    )
-
-    return mods, dshape
-
-
-def get_manual_conf(mods, target):
-    # This function is used to generate manual pipeline configuration.
-    mod_config = {}
-    # The third output is the final output, the second output is for mod3, the first output
-    # is for mod2 input.
-    pipe_config1 = {
-        "mod_idx": 0,
-        "cpu_affinity": "0",
-        "output": [
-            {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_n_0"}]},
-            {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_n_2"}]},
-        ],
-    }
-    mod_config[mods[0]] = {
-        "pipeline": pipe_config1,
-        "target_host": None,
-        "mod_name": "default",
-        "build": None,
-        "params": None,
-        "target": target[0],
-        "fcompile": _cc.create_shared,
-        "dev": target[1],
-    }
-
-    pipe_config2 = {
-        "mod_idx": 1,
-        "cpu_affinity": "0",
-        "output": [
-            {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_n_1"}]},
-        ],
-    }
-    mod_config[mods[1]] = {
-        "pipeline": pipe_config2,
-        "target_host": None,
-        "mod_name": "default",
-        "build": None,
-        "params": None,
-        "target": "llvm",
-        "fcompile": None,
-        "dev": tvm.cpu(0),
-    }
-
-    pipe_config3 = {
-        "mod_idx": 2,
-        "cpu_affinity": "0",
-        "output": [{"output_idx": 0, "dependencies": [{"global_output_index": 0}]}],
-    }
-    mod_config[mods[2]] = {
-        "pipeline": pipe_config3,
-        "target_host": None,
-        "mod_name": "default",
-        "build": None,
-        "params": None,
-        "target": "llvm",
-        "fcompile": None,
-        "dev": tvm.cpu(0),
-    }
-    return mod_config
-
-
-def recreate_parameters(mod):
-    # Get the binding parameters from a module, then create the same parameters with different data.
-    # This function is used to test the "parameter" connection.
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, "llvm")
-
-    mod_customized_params = {}
-    for key, value in lib.params.items():
-        new_value = value.numpy() + np.full(value.shape, 10).astype(value.dtype)
-        mod_customized_params[key] = tvm.nd.array(new_value)
-    return mod_customized_params, mod
-
-
-def run_modules(
-    mod_configs,
-    dev,
-    target,
-    global_input_name,
-    global_input_data,
-    mod_set_input,
-    input_name,
-    input_data,
-    params_mod=None,
-    params=None,
-):
-    # Running modules in serialized model. The returnning data are used to verify the pipeline
-    # executor result.
-    mod_input = {}
-    final_output = {}
-    idx = 0
-    for mod in mod_configs:
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(mod, target)
-
-        m = graph_executor.GraphModule(lib["default"](dev))
-        # Getting the input data then setting the input data into the module.
-        if idx in mod_input:
-            for input in mod_input[idx]:
-                input = mod_input[idx][input]
-                m.set_input(input["index"], input["data"])
-        else:
-            m.set_input(global_input_name, global_input_data)
-
-        # Setting the "input_data" into the module.
-        if mod == mod_set_input:
-            m.set_input(input_name, input_data)
-        # If the module is "params_mod" then setting the parameters to this module.
-        if params_mod == mod:
-            m.set_input(None, None, **params)
-
-        m.run()
-        n = m.get_num_outputs()
-        # Setting current output data as  the input of next module.
-        mconfig = mod_configs[mod]
-        for output in mconfig["pipeline"]["output"]:
-            output_data = m.get_output(output["output_idx"]).numpy()
-            for dep in output["dependencies"]:
-                is_global = False
-                if "global_output_index" in dep:
-                    is_global = True
-                    name = dep["global_output_index"]
-                else:
-                    mod_idx = dep["mod_idx"]
-                    name = dep["input_name"]
-                if is_global:
-                    final_output[name] = output_data
-                else:
-                    if mod_idx in mod_input:
-                        mod_input[mod_idx][name] = {"index": name, "data": output_data}
-                    else:
-                        mod_input[mod_idx] = {name: {"index": name, "data": output_data}}
-        idx = idx + 1
-
-    return final_output
-
-
-def reset_cpu_affinity(affinity):
-    # Restore the CPU affinity into the default value.
-    config_threadpool = get_global_func("runtime.config_threadpool")
-    config_threadpool(-2, 0)
-    os.sched_setaffinity(0, affinity)
-
-
-def test_pipe_runtime_error_check():
-    # This function is used to trigger runtime error by applying wrong logic.
-    if pipeline_executor_build.pipeline_executor_build_enabled():
-        # Get three pipeline modules here.
-        (mod1, mod2, mod3), dshape = get_split_mod()
-
-        # The input or output name is illegal and expects a runtime error.
-        pipe_error = pipeline_executor_build.PipelineConfig()
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["output"][9]
-
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["input"]["data_9"]
-
-        # The module connection will cause a cycle in DAG and expects runtime error.
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["output"][0].connect(pipe_error[mod2]["input"]["data_0"])
-            pipe_error[mod2]["output"][0].connect(pipe_error[mod1]["input"]["data_0"])
-
-        # The module connection is illegal and expects runtime error.
-
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["output"][0].connect(pipe_error[mod1]["input"]["data_0"])
-
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["input"]["data_0"].connect(pipe_error[mod1]["input"]["data_0"])
-
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["input"]["data_0"].connect(pipe_error[mod2]["input"]["data_0"])
-
-        with pytest.raises(RuntimeError):
-            pipe_error[mod1]["output"][0].connect(pipe_error["input"]["data_0"])
-
-        with pytest.raises(RuntimeError):
-            pipe_error["input"]["data_0"].connect(pipe_error[mod1]["output"][0])
-
-        with pytest.raises(RuntimeError):
-            pipe_error["output"]["0"].connect(pipe_error[mod1]["output"][0])
-
-        # Create pipeline executor to check the executor runtime errors.
-        pipe_config = pipeline_executor_build.PipelineConfig()
-        pipe_config[mod1].target = "llvm"
-        pipe_config[mod1].dev = tvm.cpu(0)
-        pipe_config["param_group"]["param_0"].connect(pipe_config[mod1]["param"])
-        pipe_config[mod1]["output"][0].connect(pipe_config["output"]["0"])
-        # Build and create a pipeline module.
-        with tvm.transform.PassContext(opt_level=3):
-            pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
-        pipeline_module = pipeline_executor.PipelineModule(pipeline_mod_factory)
-        customized_parameters, _ = recreate_parameters(mod1)
-
-        # Checking the pipeline executor runtime errors.
-        with pytest.raises(RuntimeError):
-            pipeline_module.set_params("param_0", None)
-
-        with pytest.raises(RuntimeError):
-            pipeline_module.set_params("param_1", customized_parameters)
-
-
-def test_pipeline():
-    if pipeline_executor_build.pipeline_executor_build_enabled():
-        target_list = tvm.testing.enabled_targets()
-        for target in target_list:
-            affinity = os.sched_getaffinity(0)
-            # Get the three pipeline modules here.
-            (mod1, mod2, mod3), dshape = get_split_mod()
-
-            # Prepare batch data for pipeline computation.
-            datas = []
-            for i in range(5):
-                datas.append(np.full(dshape, 3 + i).astype("float32"))
-
-            pipe_config = pipeline_executor_build.PipelineConfig()
-
-            customized_parameters, customized_parameters_mod = recreate_parameters(mod1)
-            assert customized_parameters_mod == mod1
-            # The global parameters group named "param_0" will be connected to "mod1" as parameters.
-            pipe_config["param_group"]["param_0"].connect(pipe_config[mod1]["param"])
-            # The pipeline input named "data_a" will be connected to a input named "data_0"
-            # of mod1.
-            pipe_config["input"]["data_a"].connect(pipe_config[mod1]["input"]["data_0"])
-
-            # The pipeline Input named "data_b" will be connected to a input named "data_1"
-            # of mod2.
-            pipe_config["input"]["data_b"].connect(pipe_config[mod2]["input"]["data_1"])
-
-            # The mod1 output[0] will be connected to a input named "data_n_0" of mod2.
-            pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_n_0"])
-
-            # The mod1 output[1] will be connected to a input named "data_n_2" of mod3.
-            pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_n_2"])
-
-            # The mod2 output[2] will be connected to a input named "data_n_1" of mod3.
-            pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_n_1"])
-
-            # The mod3 output[0] will be connected to pipeline output[0].
-            pipe_config[mod3]["output"][0].connect(pipe_config["output"]["0"])
-            # Print configuration (print(pipe_config)), the result looks like following.
-            #
-            # Params
-            #   |param_0: mod0:param
-            #
-            # Inputs
-            #   |data_a: mod0:data_0
-            #   |data_b: mod1:data_1
-            #
-            # output
-            #   |output(0) : mod2.output(0)
-            #
-            # connections
-            #   |mod0.output(0)-> mod1.data_n_0
-            #   |mod0.output(1)-> mod2.data_n_2
-            #   |mod1.output(0)-> mod2.data_n_1
-
-            # Set other parameters.
-            pipe_config[mod1].target = target[0]
-            pipe_config[mod1].dev = target[1]
-            pipe_config[mod1].cpu_affinity = "0"
-            pipe_config[mod1].fcompile = _cc.create_shared
-
-            pipe_config[mod2].target = "llvm"
-            pipe_config[mod2].dev = tvm.cpu(0)
-            pipe_config[mod2].cpu_affinity = "0"
-
-            pipe_config[mod3].target = "llvm"
-            pipe_config[mod3].dev = tvm.cpu(0)
-            pipe_config[mod3].cpu_affinity = "0"
-            # Checking the configuration of modules dependency.
-            mconfig = pipe_config.get_config()
-            assert mconfig["module_connection"] == get_manual_conf([mod1, mod2, mod3], target)
-
-            # Build and create a pipeline module.
-            with tvm.transform.PassContext(opt_level=3):
-                pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
-
-            # Export the parameter configuration to a file.
-            directory_path = tvm.contrib.utils.tempdir().temp_dir
-            # If the directory does not exist, create it.
-            if not os.path.exists(directory_path):
-                os.makedirs(directory_path)
-            config_file_name = pipeline_mod_factory.export_library(directory_path)
-
-            # Use the output of build to create and initialize PipelineModule.
-            pipeline_module = pipeline_executor.PipelineModule(pipeline_mod_factory)
-            assert pipeline_module
-
-            # Use the import function to create and initialize PipelineModule.
-            pipeline_module_test = pipeline_executor.PipelineModule.load_library(config_file_name)
-            assert pipeline_module_test.num_outputs == 1
-
-            input_map = pipeline_module_test.get_input_pipeline_map("data_b")
-            assert input_map[0] == "1" and input_map[1] == "data_1"
-            input_map = pipeline_module_test.get_input_pipeline_map("data_a")
-            assert input_map[0] == "0" and input_map[1] == "data_0"
-            module_index = pipeline_module_test.get_params_group_pipeline_map("param_0")
-            assert module_index == 0
-            # Using the parameters group name to set parameters.
-            pipeline_module_test.set_params("param_0", customized_parameters)
-            normal_outputs = []
-            for round in range(0, len(datas)):
-                data = datas[round]
-                # Getting the result without setting customized parameters.
-                wrong_output = run_modules(
-                    mconfig["module_connection"],
-                    tvm.cpu(),
-                    "llvm",
-                    "data_0",
-                    data,
-                    mod2,
-                    "data_1",
-                    data,
-                )
-                # Getting the result with setting customized parameters.
-                normal_output = run_modules(
-                    mconfig["module_connection"],
-                    tvm.cpu(),
-                    "llvm",
-                    "data_0",
-                    data,
-                    mod2,
-                    "data_1",
-                    data,
-                    customized_parameters_mod,
-                    customized_parameters,
-                )
-                # Appending the normal output into the list in order to do future correctness
-                # checking.
-                normal_outputs.append(normal_output)
-                # Setting the input data into the pipeline executor.
-                pipeline_module_test.set_input("data_a", tvm.nd.array(data))
-                pipeline_module_test.set_input("data_b", tvm.nd.array(data))
-                input_map = pipeline_module_test.get_input_pipeline_map("data_a")
-                # Checking whether the input setting of the first runtime is successful.
-                # The input of the rest of runtime will go into a queue and we can not check
-                # these input data here.
-                if input_map[0] == "0":
-                    input_data = pipeline_module_test.get_input("data_a")
-                    tvm.testing.assert_allclose(data, input_data.numpy())
-
-                assert pipeline_module_test.num_inputs == 2
-                # Running the pipeline executor in the pipeline mode.
-                pipeline_module_test.run()
-
-            for k in range(0, len(datas)):
-                statistic_time = 0
-                outputs = pipeline_module_test.get_output()
-                while len(outputs) == 0:
-                    outputs = pipeline_module_test.get_output()
-                    statistic_time = statistic_time + 1
-                    # Setting the timeout to 10 seconds.
-                    assert statistic_time < 5
-                    time.sleep(1)
-
-                for i in range(len(outputs)):
-                    tvm.testing.assert_allclose(normal_outputs[k][i], outputs[i].numpy())
-                    assert not (normal_output[i] == wrong_output[i]).all()
-
-                    assert pipeline_module_test.num_executing_pipeline == round + 1
-
-            # Reset the cpu affinity after a test.
-            reset_cpu_affinity(affinity)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_prng.py b/tests/python/relay/test_prng.py
deleted file mode 100644
index 98b4396a51f7..000000000000
--- a/tests/python/relay/test_prng.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-import tvm.relay
-import tvm.testing
-from tvm.relay.testing import run_infer_type
-
-
-@tvm.testing.parametrize_targets
-def test_threefry_repeatability(target, dev):
-    key1 = tvm.relay.random.threefry_key(1)
-    rand1 = tvm.relay.random.threefry_generate(key1, (12,))
-    out_key1, out1 = tvm.relay.create_executor(
-        "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, device=dev
-    ).evaluate()()
-
-    key2 = tvm.relay.random.threefry_key(1)
-    rand2 = tvm.relay.random.threefry_generate(key2, (12,))
-    out_key2, out2 = tvm.relay.create_executor(
-        "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, device=dev
-    ).evaluate()()
-
-    assert (
-        out1.numpy() == out2.numpy()
-    ).all(), "Generate on same seed should have the same output random numbers"
-
-    assert (
-        out_key1.numpy() == out_key2.numpy()
-    ).all(), "Generate on same seed should have the same next keys"
-
-
-@tvm.testing.parametrize_targets
-def test_threefry_split(target, dev):
-    key = tvm.relay.random.threefry_key(1)
-    left, right = tvm.relay.TupleWrapper(tvm.relay.random.threefry_split(key), 2)
-    _, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(left, (16,)), 2)
-    _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(right, (16,)), 2)
-    out1, out2 = tvm.relay.create_executor(
-        "vm",
-        tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))),
-        target=target,
-        device=dev,
-    ).evaluate()()
-
-    assert (
-        out1.numpy() != out2.numpy()
-    ).any(), "Generate after split should not have the same output"
-
-
-@tvm.testing.parametrize_targets
-def test_threefry_sequential_generate(target, dev):
-    key = tvm.relay.random.threefry_key(1)
-    key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2)
-    _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2)
-    out1, out2 = tvm.relay.create_executor(
-        "vm",
-        tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))),
-        target=target,
-        device=dev,
-    ).evaluate()()
-
-    assert (
-        out1.numpy() != out2.numpy()
-    ).any(), "Sequential generates should not have the same output"
-
-
-@tvm.testing.parametrize_targets
-def test_threefry_sequential_generate_remaining(target, dev):
-    key = tvm.relay.random.threefry_key(1)
-    key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (7,)), 2)
-    _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (7,)), 2)
-    out1, out2 = tvm.relay.create_executor(
-        "vm",
-        tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))),
-        target=target,
-        device=dev,
-    ).evaluate()()
-
-    assert (
-        out1.numpy()[-3:] != out2.numpy()[-3:]
-    ).any(), "Sequential generates should not have the same output"
-
-
-def test_threefry_generate_infer():
-    oshape = (12,)
-    key_type = tvm.relay.TensorType([10], dtype="uint64")
-    gen_type = tvm.relay.TensorType(oshape, dtype="uint64")
-    expected_type = tvm.relay.TupleType([key_type, gen_type])
-
-    key = tvm.relay.random.threefry_key(1)
-    rand1 = tvm.relay.random.threefry_generate(key, oshape)
-    f = tvm.relay.Function([], rand1)
-    f = run_infer_type(f)
-    tvm.ir.assert_structural_equal(f.ret_type, expected_type)
-
-
-def test_threefry_split_infer():
-    key_type = tvm.relay.TensorType([10], dtype="uint64")
-    expected_type = tvm.relay.TupleType([key_type, key_type])
-
-    key = tvm.relay.random.threefry_key(1)
-    out_keys = tvm.relay.random.threefry_split(key)
-    f = tvm.relay.Function([], out_keys)
-    f = run_infer_type(f)
-    tvm.ir.assert_structural_equal(f.ret_type, expected_type)
-
-
-def test_uniform_infer():
-    oshape = (12,)
-    odtypes = ["float32", "float64"]
-    for odtype in odtypes:
-        key_type = tvm.relay.TensorType([10], dtype="uint64")
-        gen_type = tvm.relay.TensorType(oshape, dtype=odtype)
-        expected_type = tvm.relay.TupleType([key_type, gen_type])
-
-        key = tvm.relay.random.threefry_key(1)
-        rand1 = tvm.relay.random.uniform(key, oshape, odtype)
-        f = tvm.relay.Function([], rand1)
-        f = run_infer_type(f)
-        tvm.ir.assert_structural_equal(f.ret_type, expected_type)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_threefry_generate_infer_fail():
-    # xfail: key size should be 10
-    fake_key = tvm.relay.const([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="uint64")
-    rand1 = tvm.relay.random.threefry_generate(fake_key, (12,))
-    f = tvm.relay.Function([], rand1)
-    f = run_infer_type(f)
-
-
-@pytest.mark.xfail(raises=tvm.error.TVMError)
-def test_threefry_split_infer_fail():
-    # xfail: key size should be 10
-    fake_key = tvm.relay.const([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="uint64")
-    out_keys = tvm.relay.random.threefry_split(fake_key)
-    f = tvm.relay.Function([], out_keys)
-    f = run_infer_type(f)
-
-
-@tvm.testing.requires_llvm
-def test_threefry_generate_out_size():
-    key = tvm.relay.random.threefry_key(1)
-    key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (5,)), 2)
-    out = tvm.relay.create_executor(
-        "vm",
-        tvm.IRModule.from_expr(tvm.relay.Function([], rand1)),
-        target=tvm.target.Target("llvm"),
-        device=tvm.device("cpu"),
-    ).evaluate()()
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_py_converter.py b/tests/python/relay/test_py_converter.py
deleted file mode 100644
index 24bec7251e8f..000000000000
--- a/tests/python/relay/test_py_converter.py
+++ /dev/null
@@ -1,682 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.relay.backend.interpreter import ConstructorValue, RefValue
-from tvm.relay.prelude import Prelude
-from tvm.relay.testing import run_as_python
-from tvm.runtime.container import ADT
-
-
-# helper: uses a dummy let binding to sequence a list
-# of expressions: expr1; expr2; expr3, etc.
-def seq(*exprs):
-    ret = exprs[0]
-    for expr in exprs[1:]:
-        ret = relay.Let(relay.var("_"), ret, expr)
-    return ret
-
-
-# creates a dummy ADT for testing
-def init_box_adt(mod):
-    box = relay.GlobalTypeVar("box")
-    a = relay.TypeVar("a")
-    box_ctor = relay.Constructor("box", [a], box)
-    mod[box] = relay.TypeData(box, [a], [box_ctor])
-    return (box, box_ctor)
-
-
-# assert that the candidate is a NDArray with value val
-def assert_tensor_value(candidate, val):
-    assert isinstance(candidate, tvm.nd.NDArray)
-    assert np.array_equal(candidate.numpy(), np.array(val))
-
-
-# assert that the candidate is an ADT with the indicated number of fields
-def assert_adt_len(candidate, fields):
-    assert isinstance(candidate, ADT)
-    assert len(candidate) == fields
-
-
-# assert that the candidate is a ConstructorValue with the approrpaite constructor
-# and number of fields
-def assert_constructor_value(candidate, constructor, fields):
-    assert isinstance(candidate, ConstructorValue)
-    assert candidate.tag == constructor.tag
-    assert len(candidate.fields) == fields
-
-
-def test_create_empty_tuple():
-    empty = relay.Tuple([])
-    tup_val = run_as_python(empty)
-    assert_adt_len(tup_val, 0)
-
-
-def test_create_scalar():
-    scalar = relay.const(1)
-    tensor_val = run_as_python(scalar)
-    assert_tensor_value(tensor_val, 1)
-
-
-def test_create_tensor():
-    tensor = relay.const([[1, 1], [2, 2]])
-    tensor_val = run_as_python(tensor)
-    assert_tensor_value(tensor_val, [[1, 1], [2, 2]])
-
-
-def test_create_nested_tuple():
-    relay_tup = relay.Tuple(
-        [relay.const(1), relay.const(2), relay.Tuple([relay.const(3), relay.const(4)])]
-    )
-    tup_val = run_as_python(relay_tup)
-    assert_adt_len(tup_val, 3)
-    for i in range(2):
-        assert_tensor_value(tup_val[i], i + 1)
-    assert_adt_len(tup_val[2], 2)
-    for i in range(2):
-        assert_tensor_value(tup_val[2][i], i + 3)
-
-
-def test_tuple_get_item():
-    relay_tup = relay.Tuple(
-        [relay.const(1), relay.const(2), relay.Tuple([relay.const(3), relay.const(4)])]
-    )
-    for i in range(2):
-        index = relay.TupleGetItem(relay_tup, i)
-        val = run_as_python(index)
-        assert_tensor_value(val, i + 1)
-    # try the inner value too
-    for i in range(2):
-        index = relay.TupleGetItem(relay.TupleGetItem(relay_tup, 2), i)
-        val = run_as_python(index)
-        assert_tensor_value(val, i + 3)
-
-
-def test_create_let():
-    v = relay.Var("v")
-    let = relay.Let(v, relay.Tuple([]), relay.Tuple([v, v]))
-    tup_val = run_as_python(let)
-    assert_adt_len(tup_val, 2)
-    assert_adt_len(tup_val[0], 0)
-    assert_adt_len(tup_val[1], 0)
-
-
-def test_create_ref():
-    relay_ref = relay.RefCreate(relay.Tuple([]))
-    ref_val = run_as_python(relay_ref)
-    assert isinstance(ref_val, RefValue)
-    assert_adt_len(ref_val.value, 0)
-
-
-def test_ref_read():
-    v = relay.Var("v")
-    assign = relay.Let(v, relay.RefCreate(relay.Tuple([])), relay.RefRead(v))
-    read_val = run_as_python(assign)
-    assert_adt_len(read_val, 0)
-
-
-def test_ref_write():
-    # check that the result of a ref write is an empty tuple
-    v = relay.Var("v")
-    initial_write = relay.Let(
-        v,
-        relay.RefCreate(relay.Tuple([relay.const(1)])),
-        relay.RefWrite(v, relay.Tuple([relay.const(2)])),
-    )
-    write_val = run_as_python(initial_write)
-    assert_adt_len(write_val, 0)
-
-    # now ensure that the value, once written, can be read back
-    # (we read the value before and after mutation)
-    w = relay.Var("w")
-    read_after_write = relay.Let(
-        v,
-        relay.RefCreate(relay.Tuple([relay.const(1)])),
-        relay.Let(
-            w,
-            relay.RefCreate(relay.RefRead(v)),
-            seq(
-                relay.RefWrite(v, relay.Tuple([relay.const(2)])),
-                relay.Tuple([relay.RefRead(w), relay.RefRead(v)]),
-            ),
-        ),
-    )
-    read_val = run_as_python(read_after_write)
-    assert_adt_len(read_val, 2)
-    assert_adt_len(read_val[0], 1)
-    assert_adt_len(read_val[1], 1)
-    assert_tensor_value(read_val[0][0], 1)
-    assert_tensor_value(read_val[1][0], 2)
-
-
-def test_if():
-    # we will have effects in the blocks to ensure only the intended one is executed
-    true_cond = relay.const(True)
-    false_cond = relay.const(False)
-
-    v = relay.Var("v")
-    true_branch = seq(relay.RefWrite(v, relay.const(1)), relay.RefRead(v))
-    false_branch = seq(relay.RefWrite(v, relay.const(2)), relay.RefRead(v))
-
-    true_expr = relay.Let(
-        v, relay.RefCreate(relay.const(0)), relay.If(true_cond, true_branch, false_branch)
-    )
-    false_expr = relay.Let(
-        v, relay.RefCreate(relay.const(0)), relay.If(false_cond, true_branch, false_branch)
-    )
-
-    true_val = run_as_python(true_expr)
-    assert_tensor_value(true_val, 1)
-
-    false_val = run_as_python(false_expr)
-    assert_tensor_value(false_val, 2)
-
-
-def test_local_function():
-    v = relay.Var("v")
-    ident = relay.Function([v], v)
-    f = relay.Var("f")
-    call1 = relay.Let(f, ident, f(relay.Tuple([])))
-    call2 = relay.Let(f, ident, f(relay.const(2)))
-
-    call_val1 = run_as_python(call1)
-    assert_adt_len(call_val1, 0)
-
-    call_val2 = run_as_python(call2)
-    assert_tensor_value(call_val2, 2)
-
-
-def test_global_function():
-    mod = tvm.IRModule()
-    ident = relay.GlobalVar("ident")
-    a = relay.TypeVar("a")
-    v = relay.Var("v", a)
-    mod[ident] = relay.Function([v], v, a, [a])
-
-    call1 = ident(relay.const(1))
-    call2 = ident(relay.Tuple([relay.const(2), relay.const(2)]))
-
-    call_val1 = run_as_python(call1, mod)
-    assert_tensor_value(call_val1, 1)
-
-    call_val2 = run_as_python(call2, mod)
-    assert_adt_len(call_val2, 2)
-    assert_tensor_value(call_val2[0], 2)
-    assert_tensor_value(call_val2[1], 2)
-
-
-def test_constructor():
-    mod = tvm.IRModule()
-    box, box_ctor = init_box_adt(mod)
-
-    init_box_int = box_ctor(relay.const(1))
-    box_val_int = run_as_python(init_box_int, mod)
-
-    assert_constructor_value(box_val_int, box_ctor, 1)
-    assert_tensor_value(box_val_int.fields[0], 1)
-
-    init_box_tup = box_ctor(relay.Tuple([]))
-    box_val_tup = run_as_python(init_box_tup, mod)
-
-    assert_constructor_value(box_val_tup, box_ctor, 1)
-    assert_adt_len(box_val_tup.fields[0], 0)
-
-
-def test_match_wildcard():
-    mod = tvm.IRModule()
-    box, box_ctor = init_box_adt(mod)
-    v = relay.Var("v")
-    match = relay.Let(
-        v,
-        box_ctor(relay.Tuple([])),
-        relay.Match(v, [relay.Clause(relay.PatternWildcard(), relay.const(1))]),
-    )
-
-    match_val = run_as_python(match, mod)
-    assert_tensor_value(match_val, 1)
-
-
-def test_match_var():
-    mod = tvm.IRModule()
-    box, box_ctor = init_box_adt(mod)
-    v = relay.Var("v")
-    w = relay.Var("w")
-    match = relay.Let(
-        v, box_ctor(relay.const(1)), relay.Match(v, [relay.Clause(relay.PatternVar(w), w)])
-    )
-
-    match_val = run_as_python(match, mod)
-    assert_constructor_value(match_val, box_ctor, 1)
-    assert_tensor_value(match_val.fields[0], 1)
-
-
-def test_match_pattern():
-    mod = tvm.IRModule()
-    box, box_ctor = init_box_adt(mod)
-    v = relay.Var("v")
-    w = relay.Var("w")
-    match = relay.Let(
-        v,
-        box_ctor(relay.const(1)),
-        relay.Match(
-            v, [relay.Clause(relay.PatternConstructor(box_ctor, [relay.PatternVar(w)]), w)]
-        ),
-    )
-    match_val = run_as_python(match, mod)
-    assert_tensor_value(match_val, 1)
-
-
-def test_nested_match_pattern():
-    mod = tvm.IRModule()
-    box, box_ctor = init_box_adt(mod)
-    v = relay.Var("v")
-    w = relay.Var("w")
-    match = relay.Let(
-        v,
-        box_ctor(box_ctor(relay.const(2))),
-        relay.Match(
-            v,
-            [
-                relay.Clause(
-                    relay.PatternConstructor(
-                        box_ctor, [relay.PatternConstructor(box_ctor, [relay.PatternVar(w)])]
-                    ),
-                    w,
-                )
-            ],
-        ),
-    )
-    match_val = run_as_python(match, mod)
-    assert_tensor_value(match_val, 2)
-
-
-def test_match_order():
-    mod = tvm.IRModule()
-    box, box_ctor = init_box_adt(mod)
-    v = relay.Var("v")
-    w = relay.Var("w")
-    # wildcard pattern goes first
-    match = relay.Let(
-        v,
-        box_ctor(box_ctor(relay.const(2))),
-        relay.Match(
-            v,
-            [
-                relay.Clause(relay.PatternWildcard(), relay.const(1)),
-                relay.Clause(
-                    relay.PatternConstructor(
-                        box_ctor, [relay.PatternConstructor(box_ctor, [relay.PatternVar(w)])]
-                    ),
-                    w,
-                ),
-            ],
-        ),
-    )
-    match_val = run_as_python(match, mod)
-    assert_tensor_value(match_val, 1)
-
-
-def test_local_recursion():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-
-    v = relay.Var("v")
-    h = relay.Var("h")
-    t = relay.Var("t")
-    f = relay.Var("f")
-
-    # just returns the same list
-    let = relay.Let(
-        f,
-        relay.Function(
-            [v],
-            relay.Match(
-                v,
-                [
-                    relay.Clause(
-                        relay.PatternConstructor(cons, [relay.PatternVar(h), relay.PatternVar(t)]),
-                        cons(h, f(t)),
-                    ),
-                    relay.Clause(relay.PatternConstructor(nil, []), nil()),
-                ],
-            ),
-        ),
-        f(cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil())))),
-    )
-
-    val = run_as_python(let, mod)
-    assert_constructor_value(val, cons, 2)
-    assert_tensor_value(val.fields[0], 1)
-    assert_constructor_value(val.fields[1], cons, 2)
-    assert_tensor_value(val.fields[1].fields[0], 2)
-    assert_constructor_value(val.fields[1].fields[1], cons, 2)
-    assert_tensor_value(val.fields[1].fields[1].fields[0], 3)
-    assert_constructor_value(val.fields[1].fields[1].fields[1], nil, 0)
-
-
-def test_global_recursion():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    rlist, cons, nil = p.mod.get_type("List")
-
-    copy = relay.GlobalVar("copy")
-    # same as above: it copies the given list
-    a = relay.TypeVar("a")
-    v = relay.Var("v", rlist(a))
-    h = relay.Var("h")
-    t = relay.Var("t")
-    copy_def = relay.Function(
-        [v],
-        relay.Match(
-            v,
-            [
-                relay.Clause(
-                    relay.PatternConstructor(cons, [relay.PatternVar(h), relay.PatternVar(t)]),
-                    cons(h, copy(t)),
-                ),
-                relay.Clause(relay.PatternConstructor(nil, []), nil()),
-            ],
-        ),
-        rlist(a),
-        [a],
-    )
-    mod[copy] = copy_def
-
-    call1 = copy_def(cons(relay.const(1), cons(relay.const(2), nil())))
-    val1 = run_as_python(call1, mod)
-    assert_constructor_value(val1, cons, 2)
-    assert_tensor_value(val1.fields[0], 1)
-    assert_constructor_value(val1.fields[1], cons, 2)
-    assert_tensor_value(val1.fields[1].fields[0], 2)
-    assert_constructor_value(val1.fields[1].fields[1], nil, 0)
-
-    call2 = copy_def(cons(relay.Tuple([]), nil()))
-    val2 = run_as_python(call2, mod)
-    assert_constructor_value(val2, cons, 2)
-    assert_adt_len(val2.fields[0], 0)
-    assert_constructor_value(val2.fields[1], nil, 0)
-
-
-def test_higher_order_call():
-    # test with anon func
-    h = relay.Var("h")
-    f = relay.Var("f")
-    x = relay.Var("x")
-    ho_anon = relay.Let(
-        h, relay.Function([f], f(relay.Tuple([]))), h(relay.Function([x], relay.const(1)))
-    )
-
-    anon_val = run_as_python(ho_anon)
-    assert_tensor_value(anon_val, 1)
-
-    # test with named func
-    g = relay.Var("g")
-    ho_named = relay.Let(
-        h,
-        relay.Function([f], f(relay.Tuple([]))),
-        relay.Let(g, relay.Function([x], relay.const(2)), h(g)),
-    )
-    named_val = run_as_python(ho_named)
-    assert_tensor_value(named_val, 2)
-
-
-def test_match_effect_exactly_once():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = p.mod.get_type("List")
-
-    # the list should be of length 1!
-    # Unless we mistakenly execute the data clause more than once
-    r = relay.Var("r")
-    data = seq(relay.RefWrite(r, cons(relay.Tuple([]), relay.RefRead(r))), relay.RefRead(r))
-    match = relay.Let(
-        r,
-        relay.RefCreate(nil()),
-        relay.Match(
-            data,
-            [
-                relay.Clause(relay.PatternConstructor(nil, []), relay.const(0)),
-                relay.Clause(
-                    relay.PatternConstructor(
-                        cons, [relay.PatternWildcard(), relay.PatternConstructor(nil, [])]
-                    ),
-                    relay.const(1),
-                ),
-                relay.Clause(relay.PatternWildcard(), relay.const(2)),
-            ],
-        ),
-    )
-
-    match_val = run_as_python(match, mod)
-    assert_tensor_value(match_val, 1)
-
-
-def test_arbitrary_let_nesting():
-    # something that is tricky to do in Python but comes naturally in Relay
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    x = relay.Var("x")
-    r = relay.Var("r")
-    y = relay.Var("y")
-    z = relay.Var("z")
-    expr = relay.Tuple(
-        [
-            relay.Let(x, relay.Tuple([relay.const(1), relay.const(2)]), relay.TupleGetItem(x, 1)),
-            relay.Let(
-                r,
-                relay.RefCreate(relay.const(1)),
-                seq(relay.RefWrite(r, relay.const(3)), relay.RefRead(r)),
-            ),
-            relay.Let(y, p.id(relay.Let(z, relay.const(4), z)), y),
-        ]
-    )
-
-    tup_val = run_as_python(expr, mod)
-    assert_adt_len(tup_val, 3)
-    assert_tensor_value(tup_val[0], 2)
-    assert_tensor_value(tup_val[1], 3)
-    assert_tensor_value(tup_val[2], 4)
-
-
-def test_ref_execution_order():
-    # we want to have effects execute from left to right
-    x = relay.Var("x")
-    y = relay.Var("y")
-    f = relay.Var("f")
-    r = relay.Var("r")
-
-    expr = relay.Let(
-        f,
-        relay.Function([x, y], x),
-        # r = 1
-        relay.Let(
-            r,
-            relay.RefCreate(relay.const(1)),
-            relay.Tuple(
-                [
-                    # should be 1
-                    relay.RefRead(r),
-                    # set r to 2 and read back
-                    seq(relay.RefWrite(r, relay.const(2)), relay.RefRead(r)),
-                    # set r to 3 and read back
-                    seq(relay.RefWrite(r, relay.const(3)), relay.RefRead(r)),
-                    # set r to 4 and read as first arg to f
-                    # set r to 5 and read as second arg to f
-                    # f should evaluate to 4
-                    f(
-                        seq(relay.RefWrite(r, relay.const(4)), relay.RefRead(r)),
-                        seq(relay.RefWrite(r, relay.const(5)), relay.RefRead(r)),
-                    ),
-                    # read back 5
-                    relay.RefRead(r),
-                ]
-            ),
-        ),
-    )
-
-    tup_val = run_as_python(expr)
-    assert_adt_len(tup_val, 5)
-    assert_tensor_value(tup_val[0], 1)
-    assert_tensor_value(tup_val[1], 2)
-    assert_tensor_value(tup_val[2], 3)
-    assert_tensor_value(tup_val[3], 4)
-    assert_tensor_value(tup_val[4], 5)
-
-
-def test_op_add():
-    add = relay.add(relay.const(1), relay.const(2))
-    add_val = run_as_python(add)
-    assert_tensor_value(add_val, 3)
-
-
-# test an op with a tuple input
-# adapted from test_stack in test_op_level3
-def test_op_stack():
-    def verify_stack(dshapes, axis):
-        x_data = [np.random.normal(size=shape).astype("int32") for shape in dshapes]
-        ref_res = np.stack(x_data, axis=axis)
-
-        args = []
-        for data in x_data:
-            args.append(relay.const(data))
-        call = relay.stack(relay.Tuple(args), axis)
-        call_val = run_as_python(call)
-        type(call_val)
-        assert_tensor_value(call_val, ref_res)
-
-    verify_stack([(2,), (2,), (2,)], -1)
-    verify_stack([(2,), (2,), (2,)], 0)
-    verify_stack([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1)
-    verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1)
-
-
-# test an op with a tuple output
-# adapted from test_split_infer_type in test_op_level3
-def test_split():
-    def verify_split(shape, indices_or_sections, axis=0):
-        x = np.random.normal(size=shape).astype("float32")
-        ref_res = np.split(x, indices_or_sections, axis=axis)
-        call = relay.split(relay.const(x), indices_or_sections, axis=axis)
-        call_val = run_as_python(call)
-        assert_adt_len(call_val, len(ref_res))
-        for i in range(len(ref_res)):
-            assert_tensor_value(call_val[i], ref_res[i])
-
-    verify_split((2, 3), 2)
-    verify_split((5, 3), [3])
-    verify_split((5, 9, 3), [3, 4], 1)
-    verify_split((5, 5, 2, 2), 5, 1)
-    verify_split((5, 5, 2, 2), 5, 0)
-
-
-# ensure we can generate code for batch_norm, since it requires simplify_inference
-def test_batch_norm():
-    def verify_batch_norm(shapes):
-        data = [np.absolute(np.random.normal(size=shape).astype("float32")) for shape in shapes]
-        relay_args = [relay.const(arg) for arg in data]
-
-        eps = 1e-5
-
-        def reference(x, gamma, beta, moving_mean, moving_var):
-            return (x - moving_mean) / np.sqrt(moving_var + eps) * gamma + beta
-
-        ref_res = reference(*data)
-
-        call = relay.nn.batch_norm(*relay_args, epsilon=eps)[0]
-        call_val = run_as_python(call)
-
-        # there will be a change in accuracy so we need to check
-        # approximate equality
-        assert isinstance(call_val, tvm.nd.NDArray)
-        tvm.testing.assert_allclose(call_val.numpy(), ref_res, atol=eps, rtol=eps)
-
-    verify_batch_norm([(10, 20), (20,), (20,), (20,), (20,)])
-    verify_batch_norm([(20, 10), (10,), (10,), (10,), (10,)])
-    verify_batch_norm([(10, 50), (50,), (50,), (50,), (50,)])
-    verify_batch_norm([(30, 40), (40,), (40,), (40,), (40,)])
-
-
-def test_return_global_var():
-    tt = relay.TensorType([1], "float32")
-    x = relay.Var("x", type_annotation=tt)
-    identity = relay.Function([x], x, ret_type=tt)
-    mod = tvm.IRModule()
-    mod["main"] = identity
-    main_var = mod.get_global_var("main")
-    main_func = run_as_python(main_var, mod=mod)
-
-    arg = tvm.nd.array(np.array([0.0], dtype="float32"))
-    res = main_func(arg)
-    assert arg.numpy() == res.numpy()
-
-
-def test_closure_in_tuple():
-    tt = relay.TensorType([1], "float32")
-    x = relay.Var("x", type_annotation=tt)
-    identity = relay.Function([x], x, ret_type=tt)
-    tup = relay.Tuple([identity, identity])
-    index = relay.TupleGetItem(tup, 0)
-
-    func = run_as_python(index)
-    arg = tvm.nd.array(np.array([0.0], dtype="float32"))
-    res = func(arg)
-    assert arg.numpy() == res.numpy()
-
-
-def test_closure_in_ref():
-    tt = relay.TensorType([1], "float32")
-    x = relay.Var("x", type_annotation=tt)
-    identity = relay.Function([x], x, ret_type=tt)
-    gv = relay.GlobalVar("id")
-
-    r = relay.Var("r")
-    seq = relay.Let(
-        r,
-        relay.RefCreate(gv),
-        relay.Call(relay.RefRead(r), [relay.const(np.array([0.0], dtype="float32"))]),
-    )
-
-    mod = tvm.IRModule()
-    mod[gv] = identity
-    res = run_as_python(seq, mod=mod)
-    assert res.numpy() == np.array([0.0], dtype="float32")
-
-
-def test_compiling_with_main():
-    unit_type = relay.TupleType([])
-    unit = relay.Function([], relay.Tuple([]), ret_type=unit_type)
-
-    x = relay.Var("x", type_annotation=unit_type)
-    identity = relay.Function([x], x, ret_type=unit_type)
-
-    mod = tvm.IRModule()
-    mod["unit"] = unit
-    mod["main"] = identity
-    gv_main = mod.get_global_var("main")
-    gv_unit = mod.get_global_var("unit")
-
-    res = run_as_python(gv_main(relay.Call(gv_unit, ())), mod=mod)
-    assert isinstance(res, ADT)
-    assert len(res) == 0
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_recast.py b/tests/python/relay/test_recast.py
deleted file mode 100644
index fea8a2d2b402..000000000000
--- a/tests/python/relay/test_recast.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay
-from tvm.relay.transform import recast
-
-
-def test_recast_simple():
-    """Recast a single convolution operator."""
-
-    def before():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32")
-        return relay.Function([x, w], c)
-
-    def expected():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        x_int = relay.cast(x, "int8")
-        w_int = relay.cast(w, "int8")
-        c = relay.nn.conv2d(x_int, w_int, padding=(1, 1), out_dtype="int32")
-        c_float = relay.cast(c, "float32")
-        return relay.Function([x, w], c_float)
-
-    pre = before()
-    post = recast(pre, "int8", "int32")
-    expected = expected()
-    tvm.ir.assert_structural_equal(expected, post)
-
-
-def test_recast_medium():
-    """Recast a slightly larger graph."""
-
-    def before():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32")
-        w2 = relay.var("w2", shape=[8, 8, 3, 3])
-        c2 = relay.nn.conv2d(c, w2, padding=(1, 1), out_dtype="float32")
-        return relay.Function([x, w, w2], c2)
-
-    def expected():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        x_int = relay.cast(x, "int8")
-        w_int = relay.cast(w, "int8")
-        c = relay.nn.conv2d(x_int, w_int, padding=(1, 1), out_dtype="int32")
-        c_float = relay.cast(c, "float32")
-        w2 = relay.var("w2", shape=[8, 8, 3, 3])
-        w2_int = relay.cast(w2, "int8")
-        c_float_int = relay.cast(c_float, "int8")
-        c2 = relay.nn.conv2d(c_float_int, w2_int, padding=(1, 1), out_dtype="int32")
-        c2_float = relay.cast(c2, "float32")
-        return relay.Function([x, w, w2], c2_float)
-
-    pre = before()
-    post = recast(pre, "int8", "int32")
-    expected = expected()
-    tvm.ir.assert_structural_equal(expected, post)
-
-
-def test_recast_skip():
-    """Recast a graph using skip layers."""
-
-    def before():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32")
-        w2 = relay.var("w2", shape=[8, 8, 3, 3])
-        c2 = relay.nn.conv2d(c, w2, padding=(1, 1), out_dtype="float32")
-        return relay.Function([x, w, w2], c2)
-
-    def expected():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32")
-        w2 = relay.var("w2", shape=[8, 8, 3, 3])
-        w2_int = relay.cast(w2, "int8")
-        c_int = relay.cast(c, "int8")
-        c2 = relay.nn.conv2d(c_int, w2_int, padding=(1, 1), out_dtype="int32")
-        c2_float = relay.cast(c2, "float32")
-        return relay.Function([x, w, w2], c2_float)
-
-    pre = before()
-    post = recast(pre, "int8", "int32", skip_layers=[0])
-    expected = expected()
-    tvm.ir.assert_structural_equal(expected, post)
-
-
-def test_recast_concat():
-    def before():
-        x = relay.var("x", shape=[1, 4])
-        y = relay.var("y", shape=[1, 4])
-        t = relay.Tuple([x, y])
-        c = relay.op.concatenate(t, axis=1)
-        return relay.Function([x, y], c)
-
-    def expected():
-        xv = relay.var("x", shape=[1, 4])
-        yv = relay.var("y", shape=[1, 4])
-        x = relay.cast(xv, "float16")
-        y = relay.cast(yv, "float16")
-        t = relay.Tuple([x, y])
-        c = relay.op.concatenate(t, axis=1)
-        c = relay.cast(c, "float32")
-        return relay.Function([xv, yv], c)
-
-    pre = before()
-    post = recast(pre, "float16", "float32", ops=["concatenate"])
-    expected = expected()
-    tvm.ir.assert_structural_equal(expected, post)
-
-
-def test_recast_relu():
-    """Recast a ReLU operator which does not have attributes."""
-
-    def before():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32")
-        r = relay.nn.relu(c)
-        return relay.Function([x, w], r)
-
-    def expected():
-        x = relay.var("x", shape=[8, 8, 8, 8])
-        w = relay.var("w", shape=[8, 8, 3, 3])
-        x_fp16 = relay.cast(x, "float16")
-        w_fp16 = relay.cast(w, "float16")
-        c = relay.nn.conv2d(x_fp16, w_fp16, padding=(1, 1), out_dtype="float16")
-        c_float32 = relay.cast(c, "float32")
-        c_float16 = relay.cast(c_float32, "float16")
-        r = relay.nn.relu(c_float16)
-        r_float32 = relay.cast(r, "float32")
-        return relay.Function([x, w], r_float32)
-
-    pre = before()
-    post = recast(pre, "float16", "float16", ops=["nn.conv2d", "nn.relu"])
-    expected = expected()
-    tvm.ir.assert_structural_equal(expected, post)
-
-
-if __name__ == "__main__":
-    test_recast_simple()
-    test_recast_medium()
-    test_recast_skip()
-    test_recast_concat()
-    test_recast_relu()
diff --git a/tests/python/relay/test_relay_te_compiler.py b/tests/python/relay/test_relay_te_compiler.py
deleted file mode 100644
index 16041f00cc12..000000000000
--- a/tests/python/relay/test_relay_te_compiler.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-from tvm import te
-import tvm.testing
-from tvm import relay
-from tvm import autotvm
-from tvm import topi
-from tvm.relay.backend import te_compiler
-from tvm.relay.testing import run_infer_type
-from tvm.relay.testing.temp_op_attr import TempOpAttr
-
-
-@autotvm.register_topi_compute("test/conv2d_1")
-def _compute_conv2d_1(cfg, input, filter, strides, padding, dilation, out_dtype):
-    return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("test/conv2d_1")
-def _schedule_conv2d_1(cfg, outs):
-    return topi.generic.schedule_conv2d_nchw(outs)
-
-
-@autotvm.register_topi_compute("test/conv2d_2")
-def _compute_conv2d_2(cfg, input, filter, strides, padding, dilation, out_dtype):
-    return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("test/conv2d_2")
-def _schedule_conv2d_2(cfg, outs):
-    return topi.generic.schedule_conv2d_nchw(outs)
-
-
-def _compute_conv2d_3(input, filter, strides, padding, dilation, out_dtype):
-    return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype)
-
-
-def _schedule_conv2d_3(outs):
-    return topi.generic.schedule_conv2d_nchw(outs)
-
-
-@tvm.target.override_native_generic_func("test_conv2d_strategy")
-def _tmp_strategy(attrs, inputs, out_type, target):
-    strategy = relay.op.OpStrategy()
-    strategy.add_implementation(
-        relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_1),
-        relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_1),
-        name="conv2d_1",
-        plevel=10,
-    )
-    strategy.add_implementation(
-        relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_2),
-        relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_2),
-        name="conv2d_2",
-        plevel=15,
-    )
-    ic = inputs[0].shape[1]
-    with tvm.te.SpecializedCondition(ic >= 16):
-        strategy.add_implementation(
-            relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_3),
-            relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_3),
-            name="conv2d_3",
-            plevel=20,
-        )
-    return strategy
-
-
-def _create_record(task_name, dshape, wshape, target, cost):
-    args = [te.placeholder(dshape), te.placeholder(wshape), (1, 1), (1, 1, 1, 1), (1, 1), "float32"]
-    task = autotvm.task.create(task_name, args, target)
-    cfg = autotvm.ConfigEntity(0, None, {}, [])
-    cfg.cost = cost
-    inp = autotvm.MeasureInput(target=target, task=task, config=cfg)
-    result = autotvm.MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
-    return (inp, result)
-
-
-def test_get_valid_implementations():
-    target = tvm.target.Target("llvm")
-
-    def _get_impls(dshape, wshape):
-        data = relay.var("data", shape=dshape)
-        weight = relay.var("wshape", shape=wshape)
-        out = relay.nn.conv2d(data, weight, padding=(1, 1))
-        out = run_infer_type(out)
-        return relay.backend.te_compiler.get_valid_implementations(
-            relay.op.get("nn.conv2d"),
-            out.attrs,
-            [te.placeholder(dshape), te.placeholder(wshape)],
-            out.checked_type,
-            target,
-        )
-
-    with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy):
-        impls = _get_impls((1, 8, 7, 7), (32, 8, 3, 3))
-        assert len(impls) == 2
-        impls = _get_impls((1, 16, 7, 7), (32, 16, 3, 3))
-        assert len(impls) == 3
-
-
-def test_select_implementation():
-    target = tvm.target.Target("llvm")
-
-    def _select_impl(dshape, wshape, use_autotvm=False):
-        data = relay.var("data", shape=dshape)
-        weight = relay.var("wshape", shape=wshape)
-        out = relay.nn.conv2d(data, weight, padding=(1, 1))
-        out = run_infer_type(out)
-        return relay.backend.te_compiler.select_implementation(
-            relay.op.get("nn.conv2d"),
-            out.attrs,
-            [te.placeholder(dshape), te.placeholder(wshape)],
-            out.checked_type,
-            target,
-            use_autotvm,
-        )
-
-    with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy):
-        impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3))
-        assert impl.name == "conv2d_2"
-        impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True)
-        assert impl.name == "conv2d_2"
-        impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3))
-        assert impl.name == "conv2d_3"
-        impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True)
-        assert impl.name == "conv2d_3"
-
-        # add autotvm record
-        records = []
-        records.append(_create_record("test/conv2d_1", (1, 8, 7, 7), (32, 8, 3, 3), target, 0.5))
-        records.append(_create_record("test/conv2d_1", (1, 16, 7, 7), (32, 16, 3, 3), target, 1.0))
-        with target:
-            with autotvm.apply_history_best(records):
-                impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True)
-                assert impl.name == "conv2d_1"
-                impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True)
-                assert impl.name == "conv2d_1"
-
-        records.append(_create_record("test/conv2d_2", (1, 8, 7, 7), (32, 8, 3, 3), target, 0.2))
-        records.append(_create_record("test/conv2d_1", (1, 16, 7, 7), (32, 16, 3, 3), target, 1.2))
-        with target:
-            with autotvm.apply_history_best(records):
-                impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True)
-                assert impl.name == "conv2d_2"
-                impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True)
-                assert impl.name == "conv2d_1"
-
-
-def test_te_compiler():
-    tec = relay.backend.te_compiler.get()
-
-    def get_func(shape):
-        x = relay.var("x", shape=shape)
-        y = relay.add(x, x)
-        z = relay.add(y, x)
-        f = relay.Function([x], z)
-        mod = tvm.IRModule.from_expr(f)
-        mod = relay.transform.InferType()(mod)
-        return mod["main"]
-
-    z1 = tec.lower(get_func((10,)), "llvm")
-    z2 = tec.lower(get_func((10,)), "llvm")
-    z3 = tec.lower(get_func(()), "llvm")
-    assert z1.same_as(z2)
-    assert not z3.same_as(z1)
-    if tvm.testing.device_enabled("cuda"):
-        z4 = tec.lower(get_func(()), "cuda")
-        assert not z3.same_as(z4)
-
-    # Test JIT target
-    for target in ["llvm"]:
-        dev = tvm.device(target)
-        if tvm.testing.device_enabled(target):
-            f = tec.jit(get_func((10,)), target)
-            x = tvm.nd.array(np.ones(10).astype("float32"), device=dev)
-            y = tvm.nd.empty((10,), device=dev)
-            f(x, y)
-            tvm.testing.assert_allclose(y.numpy(), x.numpy() * 3)
-
-
-# Note: Once the te compiler is removed, we should keep this test so that
-# we make sure that opt_level=0 passes are being called correctly.
-def test_compile_placeholder_bypass():
-    te_compiler = relay.backend.te_compiler.get()
-    x = relay.var("x", shape=(2, 3))
-    y = relay.var("y", shape=(2, 3))
-    z = relay.var("z", shape=(2, 3))
-    result = relay.Tuple([x, relay.op.concatenate([y, z], axis=0)])
-    func = relay.Function(relay.analysis.free_vars(result), result)
-    with tvm.transform.PassContext(opt_level=0):
-        graph, lib, params = relay.build(tvm.IRModule.from_expr(func), "llvm")
-
-
-def test_compile_injective_with_tuple():
-    x = relay.var("x", shape=(2, 3))
-    y = relay.var("y", shape=(2, 3))
-    x_transpose = relay.transpose(x)
-    output = relay.Tuple([x_transpose, y])
-    func = relay.Function([x, y], output)
-    relay.build(tvm.IRModule.from_expr(func), "llvm")
-
-
-def test_compile_tuple_dup():
-    x = relay.var("data", shape=(16, 16))
-    log = relay.log(x)
-    output = relay.Tuple([log, log])
-    f = relay.Function([x], output)
-    relay.build(tvm.IRModule.from_expr(f), "llvm")
-
-
-def test_compile_full():
-    # Shape calculations can happen in int64. The test checks that full operator
-    # can handle when shapes are not int32
-    shape = (
-        tvm.tir.IntImm("int32", 1),
-        tvm.tir.IntImm("int64", 16),
-        tvm.tir.IntImm("int64", 16),
-        tvm.tir.IntImm("int32", 64),
-    )
-    output = relay.full(relay.const(0, "int32"), shape=shape, dtype="int32")
-    f = relay.Function([], output)
-    mod = tvm.IRModule.from_expr(f)
-    mod = relay.qnn.transform.CanonicalizeOps()(mod)
-    relay.build(mod, "llvm")
-
-
-def test_compile_nhwc_pack():
-    data = relay.var("data", shape=(1, 1, 1, 1024), dtype="uint8")
-    weight = relay.var("weight", shape=(1, 1, 1024, 1001), dtype="int8")
-    p2 = relay.var("p2", shape=(1, 1, 1, 1), dtype="int32")
-    conv = relay.nn.conv2d(
-        data,
-        weight,
-        kernel_size=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype="int32",
-    )
-    multiply = relay.multiply(relay.const(-22, dtype="int32"), p2)
-    tile = relay.tile(multiply, reps=(1, 1, 1, 1001))
-    subtract = relay.subtract(conv, tile)
-
-    func = subtract
-    mod = relay.Function(relay.analysis.free_vars(func), func)
-    relay.build(mod, target="llvm")
-
-
-def test_compile_propogate_hash():
-    data = relay.var("data", shape=(1, 1, 1, 1024), dtype="uint8")
-    weight = relay.var("weight", shape=(1, 1, 1024, 1001), dtype="int8")
-    p2 = relay.var("p2", shape=(1, 1, 1, 1), dtype="int32")
-    conv = relay.nn.conv2d(
-        data,
-        weight,
-        kernel_size=(1, 1),
-        data_layout="NHWC",
-        kernel_layout="HWIO",
-        out_dtype="int32",
-    )
-    multiply = relay.multiply(relay.const(-22, dtype="int32"), p2)
-    tile = relay.tile(multiply, reps=(1, 1, 1, 1001))
-    subtract = relay.subtract(conv, tile)
-
-    func = subtract
-    mod = tvm.IRModule.from_expr(relay.Function(relay.analysis.free_vars(func), func))
-    vm = relay.vm.VMCompiler()
-    opt_mod, _ = vm.optimize(mod, target="llvm")
-    for f in opt_mod.functions.values():
-        assert "hash" in f.attrs.keys()
-
-
-if __name__ == "__main__":
-    test_get_valid_implementations()
-    test_select_implementation()
-    test_te_compiler()
-    test_compile_placeholder_bypass()
-    test_compile_injective_with_tuple()
-    test_compile_tuple_dup()
-    test_compile_full()
-    test_compile_nhwc_pack()
diff --git a/tests/python/relay/test_roofline.py b/tests/python/relay/test_roofline.py
deleted file mode 100644
index 11c64048bb31..000000000000
--- a/tests/python/relay/test_roofline.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import csv
-import json
-import os
-import platform
-from io import StringIO
-
-import numpy as np
-import pytest
-
-import tvm.testing
-import tvm.utils
-from tvm import relay, rpc
-from tvm.contrib import utils
-from tvm.contrib.debugger import debug_executor
-from tvm.relay.testing import mlp
-from tvm.runtime import profiler_vm
-from tvm.runtime.profiling import Report
-from tvm.script import tir as T
-
-
-@tvm.testing.requires_x86
-@pytest.mark.parametrize("dtype", ["float32", "int8", "int32"])
-def test_estimate_peak_flops_cpu(dtype):
-    server = rpc.Server(key="roofline_flops_cpu")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops_cpu")
-    target = tvm.target.Target("llvm -mattr=+fma,+avx2")
-    dev = remote.device(str(target))
-    # This test uses vectorized instructions so we need a target that supports them
-    flops = tvm.utils.roofline.x86.estimate_peak_fma_vector_flops(target, dev, remote, dtype)
-    # Assume we can achieve 1 GFLOP/s per thread, which is 1 FLOP per cycle on a 1GHz cpu.
-    assert (
-        flops > 10**9 and flops < 10**14
-    ), f"FLOP/s should be between 10^9 and 10^14, but it is {flops}"
-
-
-@tvm.testing.requires_cuda
-def test_estimate_peak_flops_gpu():
-    server = rpc.Server(key="roofline_flops_gpu")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops_gpu")
-    target = tvm.target.Target("cuda")
-    dev = remote.device(str(target))
-    # This test uses vectorized instructions so we need a target that supports them
-    flops = tvm.utils.roofline.cuda.estimate_peak_flops_tensorcore(target, dev, remote)
-    # should be able to hit a TFLOP/s with tensor cores
-    assert (
-        flops > 10**12 and flops < 10**14
-    ), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
-
-    # this test should run on all gpus
-    flops = tvm.utils.roofline.cuda.estimate_peak_flops_fma(target, dev, remote, "float32")
-    # most gpus since 2016 should be able to hit a TFLOP/s with fma instructions
-    assert (
-        flops > 10**12 and flops < 10**14
-    ), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
-
-
-@tvm.testing.requires_x86
-@tvm.testing.skip_if_32bit(reason="Cannot allocate enough memory on i386")
-@tvm.testing.requires_llvm
-def test_estimate_peak_bandwidth_cpu():
-    server = rpc.Server(key="roofline_bandwidth_cpu")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_cpu")
-    target = tvm.target.Target("llvm -mattr=+fma,+avx2")
-    dev = remote.device(str(target))
-    # This test uses vectorized instructions so we need a target that supports them
-    bandwidth = tvm.utils.roofline.x86.estimate_peak_bandwidth_dram(target, dev, remote)
-    # Assume we can achieve 1 GB/s. DDR2 should transfer somewhere around 6
-    # GB/s, so this should leave enough wiggle room.
-    assert (
-        bandwidth > 10**9 and bandwidth < 10**12
-    ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
-
-
-@tvm.testing.requires_cuda
-def test_estimate_peak_bandwidth_gpu():
-    server = rpc.Server(key="roofline_bandwidth_gpu")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_gpu")
-    target = tvm.target.Target("cuda")
-    dev = remote.device(str(target))
-    # This test uses vectorized instructions so we need a target that supports them
-    bandwidth = tvm.utils.roofline.cuda.estimate_peak_bandwidth_global_mem(target, dev, remote)
-    # should be able to hit a 100 GB/s on a GPU. GTX 280 hits 140 GB/s and
-    # it is really old.
-    assert (
-        bandwidth > 10**11 and bandwidth < 10**13
-    ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
-
-
-@tvm.testing.requires_x86
-@tvm.testing.skip_if_32bit(reason="Cannot allocate enough memory on i386")
-@tvm.testing.parametrize_targets("llvm -mattr=+fma,+avx2", "cuda")
-def test_roofline_analysis(target, dev):
-    a = relay.var("a", relay.TensorType((512, 512), "float32"))
-    b = relay.var("b", relay.TensorType((512, 512), "float32"))
-    c = relay.nn.dense(a, b)
-    mod = tvm.IRModule.from_expr(relay.Function([a, b], c))
-    params = {}
-
-    server = rpc.Server(key="roofline")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline")
-    dev = remote.device(target)
-
-    report = tvm.utils.roofline_analysis(mod, params, target, dev, remote=remote)
-    print(report)
-
-    assert "Bound" in report.table()
-    assert "Percent of Theoretical Optimal" in report.table()
-    for call in report.calls:
-        if "Percent of Theoretical Optimal" in call:
-            if target.startswith("llvm"):
-                # Ideally we'd like a little tighter bound here, but it is hard to
-                # know how well this dense will perform without tuning. And we
-                # don't have an operator that uses a specific number of flops.
-                assert call["Percent of Theoretical Optimal"].ratio >= 5.0
-            elif target == "cuda":
-                # The cuda gpu kernel is really poorly optimized
-                assert 90 >= call["Percent of Theoretical Optimal"].ratio >= 0.01
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_runtime.py b/tests/python/relay/test_runtime.py
deleted file mode 100644
index db8252f3a3c4..000000000000
--- a/tests/python/relay/test_runtime.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-
-from tvm import TVMError
-from tvm.relay.backend import Runtime
-
-
-def test_create():
-    runtime = Runtime("cpp")
-    assert str(runtime) == "cpp"
-
-
-def test_create_runtime_with_options():
-    runtime = Runtime("crt", {"system-lib": True})
-    assert str(runtime) == "crt"
-    assert runtime["system-lib"]
-
-
-def test_attr_check():
-    runtime = Runtime("crt", {"system-lib": True})
-    assert "woof" not in runtime
-    assert "system-lib" in runtime
-
-
-def test_create_runtime_not_found():
-    with pytest.raises(TVMError, match='Runtime "woof" is not defined'):
-        Runtime("woof", {})
-
-
-def test_create_runtime_attr_not_found():
-    with pytest.raises(TVMError, match='Attribute "woof" is not available on this Runtime'):
-        Runtime("crt", {"woof": "bark"})
-
-
-def test_create_runtime_attr_type_incorrect():
-    with pytest.raises(
-        TVMError,
-        match='Attribute "system-lib" should have type "runtime.BoxBool"'
-        ' but instead found "runtime.String"',
-    ):
-        Runtime("crt", {"system-lib": "woof"})
-
-
-def test_list_runtimes():
-    assert "crt" in Runtime.list_registered()
-
-
-@pytest.mark.parametrize("runtime", [Runtime("crt"), "crt"])
-def test_list_runtime_options(runtime):
-    aot_options = Runtime.list_registered_options(runtime)
-    assert "system-lib" in aot_options
-    assert aot_options["system-lib"] == "runtime.BoxBool"
-
-
-def test_list_runtime_options_not_found():
-    with pytest.raises(TVMError, match='Runtime "woof" is not defined'):
-        Runtime.list_registered_options("woof")
diff --git a/tests/python/relay/test_set_input_zero_copy.py b/tests/python/relay/test_set_input_zero_copy.py
deleted file mode 100644
index 3effbaed152f..000000000000
--- a/tests/python/relay/test_set_input_zero_copy.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-function-docstring,missing-module-docstring
-import tvm
-from tvm import relay
-import numpy as np
-from tvm.contrib import graph_executor
-from tvm import testing
-import numpy as np
-import pytest
-
-
-dev = tvm.cpu(0)
-target = tvm.target.Target("llvm")
-
-
-def build_relay_module(func):
-    mod = tvm.IRModule()
-    mod["main"] = func
-    lib = relay.build(mod, target=target)
-
-    return graph_executor.GraphModule(lib["default"](dev))
-
-
-@testing.requires_llvm
-def test_simple_graph():
-    # Simple relay func:
-    # 1. y = x + 1
-    # 2. return y
-    shape = (2, 2)
-    x = relay.var("x", shape=shape, dtype="float32")
-    y = relay.add(x, relay.ones(shape, dtype="float32"))
-    func = relay.Function([x], y)
-
-    # Build 2 exactly same relay modules.
-    mod = build_relay_module(func)
-    mod_zero_copy = build_relay_module(func)
-    x_np = np.random.uniform(size=shape).astype(np.float32)
-
-    # Use set_input()
-    x_nd = tvm.nd.array(x_np, device=dev)
-    mod.set_input("x", x_nd)
-    mod.run()
-
-    # Use set_input_zero_copy()
-    x_nd_zero_copy = tvm.nd.array(x_np, device=dev)
-    index = mod_zero_copy.get_input_index("x")
-    mod_zero_copy.module["set_input_zero_copy"](index, x_nd_zero_copy)
-    mod_zero_copy.run()
-
-    # Expect get same output "x".
-    testing.assert_allclose(mod.get_output(0).numpy(), mod_zero_copy.get_output(0).numpy())
-
-
-@testing.requires_llvm
-def test_input_in_output():
-    # Relay func that input is also in output:
-    # 1. y = x + 1
-    # 2. return [x, y]
-    shape = (3, 4)
-    x = relay.var("x", shape=shape, dtype="float32")
-    y = relay.add(x, relay.ones(shape, dtype="float32"))
-    func = relay.Function([x], relay.expr.Tuple([x, y]))
-
-    # Build 2 exactly same relay modules.
-    mod = build_relay_module(func)
-    mod_zero_copy = build_relay_module(func)
-
-    x_np = np.random.uniform(size=shape).astype(np.float32)
-
-    # Use set_input()
-    x_nd = tvm.nd.array(x_np, device=dev)
-    mod.set_input("x", x_nd)
-    mod.run()
-
-    # Use set_input_zero_copy()
-    x_nd_zero_copy = tvm.nd.array(x_np, device=dev)
-    index = mod_zero_copy.get_input_index("x")
-    mod_zero_copy.module["set_input_zero_copy"](index, x_nd_zero_copy)
-    mod_zero_copy.run()
-
-    # Expect get same output "x".
-    testing.assert_allclose(mod.get_output(0).numpy(), mod_zero_copy.get_output(0).numpy())
-
-
-@testing.requires_llvm
-def test_reshape_after_input():
-    # Relay func that a reshape op follows immediately after input:
-    # 1. y = x + 1
-    # 2. return [x, y]
-    shape = (3, 4)
-    x = relay.var("x", shape=shape, dtype="float32")
-    y = relay.reshape(x, (1, 12))
-    z = relay.add(y, relay.ones((1, 12), dtype="float32"))
-    func = relay.Function([x], relay.expr.Tuple([x, y, z]))
-
-    # Build 2 exactly same relay modules.
-    mod = build_relay_module(func)
-    mod_zero_copy = build_relay_module(func)
-
-    x_np = np.random.uniform(size=shape).astype(np.float32)
-
-    # Use set_input()
-    x_nd = tvm.nd.array(x_np, device=dev)
-    mod.set_input("x", x_nd)
-    mod.run()
-
-    # Use set_input_zero_copy()
-    x_nd_zero_copy = tvm.nd.array(x_np, device=dev)
-    index = mod_zero_copy.get_input_index("x")
-    mod_zero_copy.module["set_input_zero_copy"](index, x_nd_zero_copy)
-    mod_zero_copy.run()
-
-    # Expect get same output "x".
-    testing.assert_allclose(mod.get_output(0).numpy(), mod_zero_copy.get_output(0).numpy())
-    # Expect get same output "y".
-    testing.assert_allclose(mod.get_output(1).numpy(), mod_zero_copy.get_output(1).numpy())
-
-
-if __name__ == "__main__":
-    test_simple_graph()
-    test_input_in_output()
-    test_reshape_after_input()
diff --git a/tests/python/relay/test_simplify_fc_transpose.py b/tests/python/relay/test_simplify_fc_transpose.py
deleted file mode 100644
index 284950471c58..000000000000
--- a/tests/python/relay/test_simplify_fc_transpose.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import itertools
-
-import numpy as np
-import scipy.sparse as sp
-
-
-import tvm
-from tvm.ir import IRModule
-from tvm import relay
-from tvm.relay.data_dep_optimization import simplify_fc_transpose
-
-
-def run_func(func, params, x):
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(func, "llvm", params=params)
-
-    from tvm.contrib import graph_executor
-
-    dev = tvm.cpu(0)
-    dtype = "float32"
-    m = graph_executor.GraphModule(lib["default"](dev))
-    # set inputs
-    m.set_input("data", tvm.nd.array(x.astype(dtype)))
-    # execute
-    m.run()
-    # get outputs
-    tvm_output = m.get_output(0)
-    return tvm_output.numpy()
-
-
-def test_simplify_fc_transpose():
-    data = relay.var("data", shape=(1, 32), dtype="float32")
-    x = relay.nn.relu(data)
-    w1 = relay.var("w1", shape=(32, 64), dtype="float32")
-    y = relay.nn.dense(x, relay.transpose(w1, axes=[1, 0]))
-    z = relay.nn.relu(y)
-    w2 = relay.var("w2", shape=(64, 16), dtype="float32")
-    zz = relay.nn.dense(z, relay.transpose(w2, axes=[1, 0]))
-    func = relay.Function(relay.analysis.free_vars(zz), zz)
-    params = {
-        "w1": tvm.nd.array(np.random.uniform(-1, 1, (32, 64)).astype("float32")),
-        "w2": tvm.nd.array(np.random.uniform(-1, 1, (64, 16)).astype("float32")),
-    }
-    x_np = np.random.randn(1, 32).astype("float32")
-    old_result = run_func(func, params, x_np)
-
-    new_func, new_params = simplify_fc_transpose.convert(func, params)
-    new_result = run_func(new_func, new_params, x_np)
-    np.testing.assert_allclose(old_result, new_result, atol=1e-5, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_simplify_fc_transpose()
diff --git a/tests/python/relay/test_sparse_conv2d_convert.py b/tests/python/relay/test_sparse_conv2d_convert.py
deleted file mode 100644
index 045462475ee1..000000000000
--- a/tests/python/relay/test_sparse_conv2d_convert.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import itertools
-
-import numpy as np
-import scipy.sparse as sp
-
-
-import tvm
-from tvm.ir import IRModule
-from tvm import relay
-from tvm.topi.sparse.utils import random_bsr_matrix
-from tvm.relay.build_module import bind_params_by_name
-
-
-def run_func(func, params, x):
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, new_params = relay.build(func, "llvm", params=params)
-
-    from tvm.contrib import graph_executor
-
-    dev = tvm.cpu(0)
-    dtype = "float32"
-    m = graph_executor.create(graph, lib, dev)
-    # set inputs
-    m.set_input("data", tvm.nd.array(x.astype(dtype)))
-    m.set_input(**new_params)
-    # execute
-    m.run()
-    # get outputs
-    tvm_output = m.get_output(0)
-    return tvm_output.numpy()
-
-
-def test_bsr_sparse_conv2d_nchw():
-    data = relay.var("data", shape=(1, 64, 32, 32), dtype="float32")
-    x = relay.nn.relu(data)
-    w = relay.var("weight", shape=(128, 64, 1, 1), dtype="float32")
-    y = relay.nn.conv2d(x, w, channels=128, kernel_size=1, data_layout="NCHW", kernel_layout="OIHW")
-    z = relay.nn.relu(y)
-    func = relay.Function(relay.analysis.free_vars(z), z)
-
-    params = {
-        "weight": tvm.nd.array(
-            np.array(random_bsr_matrix(128, 64, 8, 1, 0.1, "float32").todense()).reshape(
-                128, 64, 1, 1
-            )
-        )
-    }
-
-    x_np = np.random.randn(1, 64, 32, 32).astype("float32")
-    # dense output
-    dense_output = run_func(func, params, x_np)
-    # sparse
-    sparse_func, params = relay.data_dep_optimization.bsr_conv2d.convert(
-        func, params, (8, 1), 0.2, "NCHW"
-    )
-    sparse_output = run_func(sparse_func, params, x_np)
-    np.testing.assert_allclose(sparse_output, dense_output, atol=1e-5, rtol=1e-5)
-
-
-def test_bsr_sparse_conv2d_nhwc():
-    data = relay.var("data", shape=(1, 32, 32, 64), dtype="float32")
-    x = relay.nn.relu(data)
-    w = relay.var("weight", shape=(1, 1, 64, 128), dtype="float32")
-    y = relay.nn.conv2d(x, w, channels=128, kernel_size=1, data_layout="NHWC", kernel_layout="HWIO")
-    z = relay.nn.relu(y)
-    func = relay.Function(relay.analysis.free_vars(z), z)
-
-    params = {
-        "weight": tvm.nd.array(
-            np.array(random_bsr_matrix(128, 64, 8, 1, 0.1, "float32").todense()).T.reshape(
-                1, 1, 64, 128
-            )
-        )
-    }
-
-    x_np = np.random.randn(1, 32, 32, 64).astype("float32")
-    # dense output
-    dense_output = run_func(func, params, x_np)
-    # sparse
-    sparse_func, params = relay.data_dep_optimization.bsr_conv2d.convert(
-        func, params, (8, 1), 0.2, "NHWC"
-    )
-    sparse_output = run_func(sparse_func, params, x_np)
-    np.testing.assert_allclose(sparse_output, dense_output, atol=1e-5, rtol=1e-5)
-
-
-def test_bsr_sparse_conv2d_3x3_nchw():
-    data = relay.var("data", shape=(1, 64, 32, 32), dtype="float32")
-    x = relay.nn.relu(data)
-    w = relay.var("weight", shape=(128, 64, 3, 3), dtype="float32")
-    y = relay.nn.conv2d(
-        x, w, channels=128, kernel_size=3, padding=1, data_layout="NCHW", kernel_layout="OIHW"
-    )
-    z = relay.nn.relu(y)
-    func = relay.Function(relay.analysis.free_vars(z), z)
-
-    params = {
-        "weight": tvm.nd.array(
-            np.array(random_bsr_matrix(128, 64 * 9, 16, 1, 0.1, "float32").todense()).reshape(
-                128, 64, 3, 3
-            )
-        )
-    }
-
-    x_np = np.random.randn(1, 64, 32, 32).astype("float32")
-    # dense output
-    dense_output = run_func(func, params, x_np)
-    # sparse
-    func = bind_params_by_name(func, params)
-    sparse_func, params = relay.data_dep_optimization.bsr_conv2d.convert2(
-        func, {}, (16, 1), 0.2, "NCHW", 3
-    )
-    sparse_output = run_func(sparse_func, params, x_np)
-    np.testing.assert_allclose(sparse_output, dense_output, atol=1e-5, rtol=1e-5)
-
-
-def test_bsr_sparse_conv2d_3x3_nhwc():
-    data = relay.var("data", shape=(1, 32, 32, 64), dtype="float32")
-    x = relay.nn.relu(data)
-    w = relay.var("weight", shape=(3, 3, 64, 128), dtype="float32")
-    y = relay.nn.conv2d(
-        x, w, channels=128, kernel_size=3, padding=1, data_layout="NHWC", kernel_layout="HWIO"
-    )
-    z = relay.nn.relu(y)
-    func = relay.Function(relay.analysis.free_vars(z), z)
-
-    params = {
-        "weight": tvm.nd.array(
-            np.array(random_bsr_matrix(128, 64 * 9, 16, 1, 0.1, "float32").todense()).T.reshape(
-                3, 3, 64, 128
-            )
-        )
-    }
-
-    x_np = np.random.randn(1, 32, 32, 64).astype("float32")
-    # dense output
-    dense_output = run_func(func, params, x_np)
-    # sparse
-    func = bind_params_by_name(func, params)
-    sparse_func, params = relay.data_dep_optimization.bsr_conv2d.convert2(
-        func, {}, (16, 1), 0.2, "NHWC", 3
-    )
-    sparse_output = run_func(sparse_func, params, x_np)
-    np.testing.assert_allclose(sparse_output, dense_output, atol=1e-5, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_bsr_sparse_conv2d_nhwc()
-    test_bsr_sparse_conv2d_nchw()
-    test_bsr_sparse_conv2d_3x3_nhwc()
-    test_bsr_sparse_conv2d_3x3_nchw()
diff --git a/tests/python/relay/test_sparse_dense_convert.py b/tests/python/relay/test_sparse_dense_convert.py
deleted file mode 100644
index 3ff31db2e995..000000000000
--- a/tests/python/relay/test_sparse_dense_convert.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import itertools
-
-import numpy as np
-import scipy.sparse as sp
-
-
-import tvm
-from tvm.ir import IRModule
-from tvm import relay
-
-
-def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype="float32"):
-    Y = np.zeros((M, N), dtype=dtype)
-    assert M % BS_R == 0
-    assert N % BS_C == 0
-    nnz = int(density * M * N)
-    num_blocks = int(nnz / (BS_R * BS_C)) + 1
-    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))
-    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C
-    chosen_blocks = candidate_blocks[
-        np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
-    ]
-    for i in range(len(chosen_blocks)):
-        r, c = chosen_blocks[i]
-        Y[r : r + BS_R, c : c + BS_C] = np.random.randn(BS_R, BS_C)
-    s = sp.bsr_matrix(Y, blocksize=(BS_R, BS_C))
-    assert s.data.shape == (num_blocks, BS_R, BS_C)
-    assert s.data.size >= nnz
-    assert s.indices.shape == (num_blocks,)
-    assert s.indptr.shape == (M // BS_R + 1,)
-    return s
-
-
-def run_func(func, params, x):
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, new_params = relay.build(func, "llvm", params=params)
-
-    from tvm.contrib import graph_executor
-
-    dev = tvm.cpu(0)
-    dtype = "float32"
-    m = graph_executor.create(graph, lib, dev)
-    # set inputs
-    m.set_input("data", tvm.nd.array(x.astype(dtype)))
-    m.set_input(**new_params)
-    # execute
-    m.run()
-    # get outputs
-    tvm_output = m.get_output(0)
-    return tvm_output.numpy()
-
-
-def test_bsr_sparse_dense():
-    data = relay.var("data", shape=(1, 128), dtype="float32")
-    x = relay.nn.relu(data)
-    w = relay.var("weight", shape=(768, 128), dtype="float32")
-    y = relay.nn.dense(x, w)
-    z = relay.nn.relu(y)
-    func = relay.Function(relay.analysis.free_vars(z), z)
-
-    params = {"weight": tvm.nd.array(random_bsr_matrix(768, 128, 32, 1, 0.1).todense())}
-
-    x_np = np.random.randn(1, 128).astype("float32")
-    # dense output
-    dense_output = run_func(func, params, x_np)
-    # sparse
-    sparse_func, params = relay.data_dep_optimization.bsr_dense.convert(func, params, (32, 1), 0.2)
-    sparse_output = run_func(sparse_func, params, x_np)
-    np.testing.assert_allclose(sparse_output, dense_output, atol=1e-5, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_bsr_sparse_dense()
diff --git a/tests/python/relay/test_target_hooks.py b/tests/python/relay/test_target_hooks.py
deleted file mode 100644
index 0b888781873f..000000000000
--- a/tests/python/relay/test_target_hooks.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for target hooks."""
-import sys
-import numpy as np
-import pytest
-import logging
-
-import tvm
-import tvm.testing
-from tvm import relay, IRModule
-
-from utils.external_codegen import (
-    parametrize_external_codegen_checks,
-    set_external_func_attr,
-    check_graph_executor_result,
-    check_vm_result,
-)
-
-logging.basicConfig(level=logging.INFO)
-
-
-@parametrize_external_codegen_checks
-def test_tir_external_generation_inline_without_target_instance(check_result):
-    shape = (8,)
-    x_data = np.random.randint(255, size=shape).astype("float32")
-    y_data = np.random.randint(255, size=shape).astype("float32")
-    inputs = {"x": x_data, "y": y_data}
-
-    x0 = relay.var("x0", shape=shape, dtype="float32")
-    y0 = relay.var("y0", shape=shape, dtype="float32")
-    z = x0 + y0
-    f = relay.Function([x0, y0], z)
-    f = set_external_func_attr(f, "example_target_hook", "replace_add_with_subtract")
-
-    x = relay.var("x", shape=(8,), dtype="float32")
-    y = relay.var("y", shape=(8,), dtype="float32")
-    call = relay.Call(f, [x, y])
-    func = IRModule.from_expr(call)
-
-    check_result(func, inputs, (8,), x_data - y_data)
-
-
-# TODO(mbs): The check_aot_executor_result does not support list-of-targets, mostly because
-# tvm.testing.aot.compile_and_run requires the target to be a kind name string, and
-# tvm.testing.aot.compile_models requires a single Target object. However, code outside of
-# tvm.testing.aot is ready for this more general form.
-@pytest.mark.parametrize("check_result", [check_graph_executor_result, check_vm_result])
-def test_tir_external_generation_outline_with_target_instance(check_result):
-    shape = (8,)
-    x_data = np.random.randint(255, size=shape).astype("float32")
-    y_data = np.random.randint(255, size=shape).astype("float32")
-    inputs = {"x": x_data, "y": y_data}
-    # Compile with an instance of the hooked target kind to demonstrate plumbing target attributes
-    # into custom passes.
-    host_target = tvm.target.Target("llvm")
-    generic_target = tvm.target.Target("llvm", host=host_target)
-    extern_codegen_target = tvm.target.Target(
-        "example_target_hook -example_attribute=42", host=host_target
-    )
-    mod = tvm.relay.fromtext(
-        """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(8), float32], %y: Tensor[(8), float32]) -> Tensor[(8), float32] {
-              @replace_add_with_subtract(%x, %y) * 2.0f
-            }
-
-            def @replace_add_with_subtract(%x: Tensor[(8), float32], %y: Tensor[(8), float32],
-                                           Inline=1,
-                                           Primitive=1,
-                                           Compiler="example_target_hook",
-                                           global_symbol="replace_add_with_subtract") -> Tensor[(8), float32] {
-              %x + %y  // will be rewritten to TIR implementing %x - %y - 42.0f by custom pass
-            }
-        """
-    )
-
-    check_result(
-        mod,
-        inputs,
-        (8,),
-        (x_data - y_data - 42.0) * 2.0,
-        target=[generic_target, extern_codegen_target],
-    )
-
-
-@pytest.mark.parametrize("check_result", [check_graph_executor_result])
-def test_runtime_module_generation(check_result):
-    shape = (8,)
-    x_data = np.random.randint(255, size=shape).astype("float32")
-    y_data = np.random.randint(255, size=shape).astype("float32")
-    inputs = {"x": x_data, "y": y_data}
-
-    x0 = relay.var("x0", shape=shape, dtype="float32")
-    y0 = relay.var("y0", shape=shape, dtype="float32")
-    z = x0 + y0
-    func = relay.Function([x0, y0], z)
-    func = set_external_func_attr(func, "example_target_hook", "replace_add_with_subtract")
-    # Test hook to trigger TIRToRuntime code generation
-    func = func.with_attr("tir_to_runtime", True)
-
-    x = relay.var("x", shape=(8,), dtype="float32")
-    y = relay.var("y", shape=(8,), dtype="float32")
-    call = relay.Call(func, [x, y])
-    func = IRModule.from_expr(call)
-
-    check_result(func, inputs, (8,), x_data * y_data)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_tensor_array.py b/tests/python/relay/test_tensor_array.py
deleted file mode 100644
index 4973fa20c447..000000000000
--- a/tests/python/relay/test_tensor_array.py
+++ /dev/null
@@ -1,785 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay import testing
-from tvm.relay.backend.interpreter import ConstructorValue
-from tvm.relay import create_executor
-from tvm.relay.prelude import Prelude, StaticTensorArrayOps
-from tvm.relay.testing import count as count_, make_nat_value, make_nat_expr
-
-import numpy as np
-
-
-def vmobj_to_list(mod, o, dtype="float32"):
-    _, tensor_nil, _, _, _, _, _, _, _ = mod.get_type(f"tensor_{dtype}_t")
-    if isinstance(o, tvm.nd.NDArray):
-        return [o.numpy().tolist()]
-    elif isinstance(o, tvm.runtime.container.ADT):
-        if len(o) == 0:
-            if tensor_nil.tag == o.tag:
-                return [0]
-            return []
-
-        result = []
-        for f in o:
-            result.extend(vmobj_to_list(mod, f, dtype))
-        return result
-    elif isinstance(o, tvm.relay.backend.interpreter.ConstructorValue):
-        if o.constructor.name_hint == "Cons":
-            tl = vmobj_to_list(mod, o.fields[1], dtype)
-            hd = vmobj_to_list(mod, o.fields[0], dtype)
-            hd.extend(tl)
-            return hd
-        elif o.constructor.name_hint == "Nil":
-            return []
-        elif "tensor_nil" in o.constructor.name_hint:
-            return [0]
-        elif "tensor" in o.constructor.name_hint:
-            return [o.fields[0].numpy()]
-        else:
-            raise RuntimeError("Unknown object type: %s" % o.constructor.name_hint)
-    else:
-        raise RuntimeError("Unknown object type: %s" % type(o))
-
-
-def check_tensor_array(ta_mod, ref_res, *args, dtype="float32", rtol=1e-5):
-    for kind in ["debug", "vm"]:
-        for target, dev in [("llvm", tvm.cpu(0))]:  # testing.enabled_targets():
-            if kind == "debug" and dev.device_type != tvm.cpu().device_type:
-                continue
-            result = relay.create_executor(kind, mod=ta_mod, device=dev, target=target).evaluate()(
-                *args
-            )
-            got = vmobj_to_list(ta_mod, result, dtype)
-            tvm.testing.assert_allclose(ref_res, got, rtol=rtol, atol=rtol)
-
-
-@tvm.testing.uses_gpu
-def test_tensor_expand_dims():
-    def run(dtype):
-        x = relay.var("x")
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        expand_dims_func = p.get_global_var("tensor_expand_dims", dtype)
-        tensor1 = p.get_tensor_ctor("tensor1", dtype)
-        mod["main"] = relay.Function([x], expand_dims_func(tensor1(x)))
-        x_np = np.random.uniform(low=0.0, high=8.0, size=(1,)).astype(dtype)
-        expected = [np.expand_dims(x_np, axis=0)]
-        check_tensor_array(mod, expected, x_np)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_constructor():
-    def run(dtype):
-        x = relay.var("x")
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        mod["main"] = relay.Function([x], tensor_array(x))
-        expected = np.array([0, 0, 0, 0, 0])
-        check_tensor_array(mod, expected, 5, dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_read():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        l = relay.var("l")
-        i = relay.var("i")
-        read_func = p.get_global_var("tensor_array_read", dtype)
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        mod["main"] = relay.Function([l, i], read_func(tensor_array(l), i))
-        expected = [0]
-        check_tensor_array(mod, expected, *(1, 0), dtype=dtype)
-        check_tensor_array(mod, expected, *(5, 1), dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_write():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        tensor_t = p.get_type("tensor_t", dtype)
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        init_tensor_array = tensor_array(relay.const(2))
-        write_func = p.get_global_var("tensor_array_write", dtype)
-        tensor1 = p.get_tensor_ctor("tensor1", dtype)
-        tensor_array1 = write_func(init_tensor_array, relay.const(0), tensor1(v1))
-        tensor_array2 = write_func(tensor_array1, relay.const(1), tensor1(v2))
-        mod["main"] = relay.Function([v1, v2], tensor_array2)
-        expected = [3, 7]
-        check_tensor_array(mod, expected, *(3, 7), dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_stack():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        tensor_t = p.get_type("tensor_t", dtype)
-        rlist = p.mod.get_global_type_var(f"List")
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        tensor1 = p.get_tensor_ctor("tensor1", dtype)
-        write = p.get_global_var("tensor_array_write", dtype)
-        stack = p.get_global_var("tensor_array_stack", dtype)
-        # TODO extract test case from inference failures
-        # setting this wrong causes crashes
-        v = relay.var("v", shape=(1,), dtype=dtype)
-        init_tensor_array = tensor_array(relay.const(3))
-        tensor_array1 = write(init_tensor_array, relay.const(0), tensor1(v))
-        tensor_array2 = write(tensor_array1, relay.const(1), tensor1(v))
-        tensor_array3 = write(tensor_array2, relay.const(2), tensor1(v))
-        tensor_array4 = stack(tensor_array3)
-        mod["main"] = relay.Function([v], tensor_array4, tensor_t())
-        t = np.random.uniform(low=0.0, high=8.0, size=(1,)).astype(dtype)
-        expected = [np.stack([t, t, t])]
-        check_tensor_array(mod, expected, t, dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_unstack():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        unstack_tensor1 = p.get_global_var("tensor_array_unstack_tensor1", dtype)
-        v = relay.var("v")
-        mod["main"] = relay.Function([v], unstack_tensor1(v))
-        t = np.random.uniform(low=0.0, high=8.0, size=(1,)).astype(dtype)
-        check_tensor_array(mod, t, t, dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_take():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        take = p.get_global_var("tensor_take", dtype)
-        tensor2 = p.get_tensor_ctor("tensor2", dtype)
-        v = relay.var("v")
-        lower = relay.var("lower")
-        upper = relay.var("upper")
-        mod["main"] = relay.Function([v, lower, upper], take(tensor2(v), lower, upper))
-        v_data = np.random.uniform(low=0.0, high=8.0, size=(10, 10)).astype(dtype)
-        expected = [np.take(v_data, range(2, 5), axis=0)]
-        check_tensor_array(mod, expected, *(v_data, 2, 5), dtype=dtype)
-        expected = [np.take(v_data, range(0, 9), axis=0)]
-        check_tensor_array(mod, expected, *(v_data, 0, 9), dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_concatenate():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        concat = p.get_global_var("tensor_concatenate", dtype)
-        tensor1 = p.get_tensor_ctor("tensor1", dtype)
-        v1 = relay.var("v1", shape=(tvm.tir.Any(),), dtype=dtype)
-        v2 = relay.var("v2", shape=(tvm.tir.Any(),), dtype=dtype)
-        mod["main"] = relay.Function([v1, v2], concat(tensor1(v1), tensor1(v2)))
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=(5,)).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=(5,)).astype(dtype)
-        expected = [np.concatenate((v1_data, v2_data))]
-        check_tensor_array(mod, expected, *(v1_data, v2_data), dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_concat():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        tensor_array1 = tensor_array(relay.const(2))
-        write_func = p.get_global_var("tensor_array_write", dtype)
-        concat_func = p.get_global_var("tensor_array_concat", dtype)
-        tensor1 = p.get_tensor_ctor("tensor2", dtype)
-        tensor_array1 = write_func(tensor_array1, relay.const(0), tensor1(v1))
-        tensor_array1 = write_func(tensor_array1, relay.const(1), tensor1(v2))
-        tensor_array_concat = concat_func(tensor_array1)
-        mod["main"] = relay.Function([v1, v2], tensor_array_concat)
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=(1, 3)).astype(dtype)
-        expected = [np.concatenate((v1_data, v2_data), axis=0)]
-        check_tensor_array(mod, expected, *(v1_data, v2_data), dtype=dtype)
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_scatter():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-
-        # tensor array
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        v3 = relay.var("v2")
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        tensor_array1 = tensor_array(relay.const(3))
-        write_func = p.get_global_var("tensor_array_write", dtype)
-        scatter_func = p.get_global_var("tensor_array_scatter", dtype)
-        tensor2 = p.get_tensor_ctor("tensor2", dtype)
-        tensor_array1 = write_func(tensor_array1, relay.const(0), tensor2(v1))
-        tensor_array1 = write_func(tensor_array1, relay.const(1), tensor2(v2))
-        tensor_array1 = write_func(tensor_array1, relay.const(2), tensor2(v3))
-
-        # indices array
-        index = relay.var("index")
-
-        # values array
-        value_0 = relay.var("value_0")
-        value_1 = relay.var("value_1")
-        values_array = tensor_array(relay.const(2))
-        values_array = write_func(values_array, relay.const(0), tensor2(value_0))
-        values_array = write_func(values_array, relay.const(1), tensor2(value_1))
-
-        # create the scatter function
-        tensor_array_scatter = scatter_func(tensor_array1, index, values_array)
-        mod["main"] = relay.Function([v1, v2, v3, index, value_0, value_1], tensor_array_scatter)
-
-        # initialize and check
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        v3_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        index_data = np.array([0, 1], dtype="int32")
-        val1_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        val2_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        expected = [val1_data, val2_data, v3_data]
-        check_tensor_array(
-            mod,
-            expected,
-            *(v1_data, v2_data, v3_data, index_data, val1_data, val2_data),
-            dtype=dtype,
-        )
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_tensor_array_split():
-    def run(dtype):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-
-        # tensor array
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        v3 = relay.var("v2")
-        tensor_array = p.get_global_var("tensor_array", dtype)
-        tensor_array1 = tensor_array(relay.const(3))
-        write_func = p.get_global_var("tensor_array_write", dtype)
-        split_func = p.get_global_var("tensor_array_split", dtype)
-        tensor2 = p.get_tensor_ctor("tensor2", dtype)
-        tensor_array1 = write_func(tensor_array1, relay.const(0), tensor2(v1))
-        tensor_array1 = write_func(tensor_array1, relay.const(1), tensor2(v2))
-        tensor_array1 = write_func(tensor_array1, relay.const(2), tensor2(v3))
-
-        # value tensor
-        value = relay.var("value")
-
-        # lengths tensor
-        ta_len = relay.var("length")
-
-        # create the scatter function
-        tensor_array_split = split_func(tensor_array1, tensor2(value), ta_len)
-        mod["main"] = relay.Function([v1, v2, v3, value, ta_len], tensor_array_split)
-
-        # initialize and check
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        v3_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        value_data = np.random.uniform(low=0.0, high=8.0, size=(4, 3)).astype(dtype)
-        length_data = np.array([2, 2], dtype="int32")
-        expected = np.concatenate([value_data, v3_data])
-        expected = np.split(expected, indices_or_sections=[2, 4])
-        check_tensor_array(
-            mod, expected, *(v1_data, v2_data, v3_data, value_data, length_data), dtype=dtype
-        )
-
-    run("float32")
-    run("int32")
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_take():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        take = p.get_global_var_static("tensor_take", dtype, shape)
-        tensor_constructor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        v = relay.var("v")
-        lower = relay.var("lower")
-        upper = relay.var("upper")
-        mod["main"] = relay.Function([v, lower, upper], take(tensor_constructor(v), lower, upper))
-        v_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        expected = [np.take(v_data, range(2, 5), axis=0)]
-        check_tensor_array(mod, expected, *(v_data, 2, 5), dtype=dtype)
-        expected = [np.take(v_data, range(0, 9), axis=0)]
-        check_tensor_array(mod, expected, *(v_data, 0, 9), dtype=dtype)
-
-    run("float32", [10, 10])
-    run("int32", [15, 11])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_concatenate():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        concat = p.get_global_var_static("tensor_concatenate", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        mod["main"] = relay.Function([v1, v2], concat(tensor(v1), tensor(v2)))
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        expected = [np.concatenate((v1_data, v2_data))]
-        check_tensor_array(mod, expected, *(v1_data, v2_data), dtype=dtype)
-
-    run(
-        "float32",
-        [
-            5,
-        ],
-    )
-    run("int32", [2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_expand_dims():
-    def run(dtype, shape):
-        x = relay.var("x")
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        expand_dims_func = p.get_global_var_static("tensor_expand_dims", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        mod["main"] = relay.Function([x], expand_dims_func(tensor(x)))
-        x_np = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        expected = [np.expand_dims(x_np, axis=0)]
-        check_tensor_array(mod, expected, x_np)
-
-    run("float32", [])
-    run(
-        "int32",
-        [
-            2,
-        ],
-    )
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_constructor():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-        tensor_constructor = p.get_name_static("tensor_constructor", dtype, shape)
-        assert tensor_constructor != None
-
-    run("float32", [1, 1])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_read():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        np_data_list = []
-        ta_length = 3
-        for _ in range(ta_length):
-            np_data_list.append(np.random.uniform(0, 10, size=shape).astype(dtype))
-
-        v0 = relay.var("v0")
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        n = relay.var("n")
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        init_tensor_array = tensor_array(relay.const(ta_length))
-        read_func = p.get_global_var_static("tensor_array_read", dtype, shape)
-        write_func = p.get_global_var_static("tensor_array_write", dtype, shape)
-        tensor_array0 = write_func(init_tensor_array, relay.const(0), tensor(v0))
-        tensor_array1 = write_func(tensor_array0, relay.const(1), tensor(v1))
-        tensor_array2 = write_func(tensor_array1, relay.const(2), tensor(v2))
-
-        mod["main"] = relay.Function([v0, v1, v2, n], read_func(tensor_array2, n))
-        expected = [np_data_list[0]]
-        check_tensor_array(mod, expected, *list(np_data_list + [0]), dtype=dtype)
-        expected = [np_data_list[1]]
-        check_tensor_array(mod, expected, *list(np_data_list + [1]), dtype=dtype)
-        expected = [np_data_list[2]]
-        check_tensor_array(mod, expected, *list(np_data_list + [2]), dtype=dtype)
-
-    run("float32", [])
-    run("int32", [2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_write():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        ta_length = 2
-        np_data_list = [
-            np.random.uniform(0, 10, size=shape).astype(dtype) for _ in range(ta_length)
-        ]
-
-        v0 = relay.var("v0")
-        v1 = relay.var("v1")
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        init_tensor_array = tensor_array(relay.const(ta_length))
-        write_func = p.get_global_var_static("tensor_array_write", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        tensor_array0 = write_func(init_tensor_array, relay.const(0), tensor(v0))
-        tensor_array1 = write_func(tensor_array0, relay.const(1), tensor(v1))
-        mod["main"] = relay.Function([v0, v1], tensor_array1)
-        expected = np_data_list
-        check_tensor_array(mod, expected, *np_data_list, dtype=dtype)
-
-    run("float32", [])
-    run("int32", [2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_unstack():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        unstack_tensor = p.get_global_var_static("tensor_array_unstack", dtype, shape)
-        v = relay.var("v")
-        mod["main"] = relay.Function([v], unstack_tensor(v))
-        t = np.random.uniform(low=0, high=10, size=shape).astype(dtype)
-        (*expected,) = t
-        check_tensor_array(mod, expected, t, dtype=dtype)
-
-    run("float32", [4])
-    run("int32", [2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_scatter():
-    def run(dtype, shape, indices_shape=None):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-        if indices_shape is not None:
-            static_tensor_array_ops.define_tensor_array_scatter(indices_shape, True)
-
-        # tensor array
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        v3 = relay.var("v2")
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        tensor_array0 = tensor_array(relay.const(3))
-        write_func = p.get_global_var_static("tensor_array_write", dtype, shape)
-        scatter_func = p.get_global_var_static("tensor_array_scatter", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        tensor_array1 = write_func(tensor_array0, relay.const(0), tensor(v1))
-        tensor_array1 = write_func(tensor_array1, relay.const(1), tensor(v2))
-        tensor_array1 = write_func(tensor_array1, relay.const(2), tensor(v3))
-
-        # indices array
-        index = relay.var("index")
-
-        # values array
-        value_0 = relay.var("value_0")
-        value_1 = relay.var("value_1")
-        values_array = tensor_array(relay.const(2))
-        values_array = write_func(values_array, relay.const(0), tensor(value_0))
-        values_array = write_func(values_array, relay.const(1), tensor(value_1))
-
-        # create the scatter function
-        tensor_array_scatter = scatter_func(tensor_array1, index, values_array)
-        mod["main"] = relay.Function([v1, v2, v3, index, value_0, value_1], tensor_array_scatter)
-
-        # initialize and check
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        v3_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        index_data = np.array([0, 1], dtype="int32")
-        val1_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        val2_data = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        expected = [val1_data, val2_data, v3_data]
-        check_tensor_array(
-            mod,
-            expected,
-            *(v1_data, v2_data, v3_data, index_data, val1_data, val2_data),
-            dtype=dtype,
-        )
-
-    run("float32", [2, 3])
-    run("int32", [2, 3])
-    run(
-        "float32",
-        [2, 3],
-        [
-            2,
-        ],
-    )
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_split():
-    def run(dtype, shape, value_shape=None, lengths_shape=None):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-        if value_shape is not None or lengths_shape is not None:
-            static_tensor_array_ops.define_tensor_array_split(value_shape, lengths_shape, False)
-
-        # tensor array
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        v3 = relay.var("v2")
-
-        adt_shape = [
-            relay.Any(),
-        ] + shape[1:]
-        test_ops = StaticTensorArrayOps(p, dtype, adt_shape)
-        test_ops.register()
-        tensor_array = test_ops.get_global_var("tensor_array")
-
-        tensor_array1 = tensor_array(relay.const(3))
-        write_func = test_ops.get_global_var("tensor_array_write")
-        split_ops = StaticTensorArrayOps(p, dtype, shape)
-        split_ops.register()
-        split_func = split_ops.get_global_var("tensor_array_split")
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, test_ops.shape)
-        tensor_array1 = write_func(tensor_array1, relay.const(0), tensor(v1))
-        tensor_array1 = write_func(tensor_array1, relay.const(1), tensor(v2))
-        tensor_array1 = write_func(tensor_array1, relay.const(2), tensor(v3))
-
-        # value tensor
-        value = relay.var("value")
-
-        # lengths tensor
-        ta_len = relay.var("length")
-
-        # create the split function
-        if value_shape is None:
-            tensor1 = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        else:
-            static_tensor_array_ops = StaticTensorArrayOps(p, dtype, value_shape)
-            static_tensor_array_ops.register()
-            tensor1 = p.get_tensor_ctor_static("tensor_constructor", dtype, test_ops.shape)
-
-        tensor_array_split = split_func(tensor_array1, tensor1(value), ta_len)
-        mod["main"] = relay.Function([v1, v2, v3, value, ta_len], tensor_array_split)
-
-        # initialize and check
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=[2, 3]).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=[2, 3]).astype(dtype)
-        v3_data = np.random.uniform(low=0.0, high=8.0, size=[2, 3]).astype(dtype)
-        value_data = np.random.uniform(low=0.0, high=8.0, size=value_shape or shape).astype(dtype)
-        length_data = np.array([2, 2], dtype="int32")
-        expected = np.concatenate([value_data, v3_data])
-        expected = np.split(expected, indices_or_sections=[2, 4])
-        check_tensor_array(
-            mod, expected, *(v1_data, v2_data, v3_data, value_data, length_data), dtype=dtype
-        )
-
-    run("float32", [4, 3])
-    run("int32", [4, 3])
-    run(
-        "int32",
-        [relay.Any(), 3],
-        [4, 3],
-        [
-            2,
-        ],
-    )
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_concat():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        tensor_array1 = tensor_array(relay.const(2))
-        write_func = p.get_global_var_static("tensor_array_write", dtype, shape)
-        concat_func = p.get_global_var_static("tensor_array_concat", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        tensor_array1 = write_func(tensor_array1, relay.const(0), tensor(v1))
-        tensor_array1 = write_func(tensor_array1, relay.const(1), tensor(v2))
-        tensor_array_concat = concat_func(tensor_array1)
-        mod["main"] = relay.Function([v1, v2], tensor_array_concat)
-        v1_data = np.random.uniform(low=0.0, high=8.0, size=(2, 3)).astype(dtype)
-        v2_data = np.random.uniform(low=0.0, high=8.0, size=(1, 3)).astype(dtype)
-        expected = [np.concatenate((v1_data, v2_data), axis=0)]
-        check_tensor_array(mod, expected, *(v1_data, v2_data), dtype=dtype)
-
-    run("float32", [relay.Any(), 3])
-    run("int32", [relay.Any(), 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_gather():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        write = p.get_global_var_static("tensor_array_write", dtype, shape)
-        gather = p.get_global_var_static("tensor_array_gather", dtype, shape)
-        v = relay.var("v")
-        indice = relay.var("indice")
-        init_tensor_array = tensor_array(relay.const(3))
-        tensor_array1 = write(init_tensor_array, relay.const(0), tensor(v))
-        tensor_array2 = write(tensor_array1, relay.const(1), tensor(v))
-        tensor_array3 = write(tensor_array2, relay.const(2), tensor(v))
-        out = gather(tensor_array3, indice)
-        mod["main"] = relay.Function([v, indice], out)
-        t = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        indice_data = np.array([0, 2], dtype="int32")
-        expected = [np.stack([t, t])]
-        check_tensor_array(mod, expected, *(t, indice_data), dtype=dtype)
-
-    run("float32", [])
-    run("int32", [2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_array_stack():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        write = p.get_global_var_static("tensor_array_write", dtype, shape)
-        stack = p.get_global_var_static("tensor_array_stack", dtype, shape)
-        v = relay.var("v")
-        init_tensor_array = tensor_array(relay.const(3))
-        tensor_array1 = write(init_tensor_array, relay.const(0), tensor(v))
-        tensor_array2 = write(tensor_array1, relay.const(1), tensor(v))
-        tensor_array3 = write(tensor_array2, relay.const(2), tensor(v))
-        tensor_array4 = stack(tensor_array3)
-        mod["main"] = relay.Function([v], tensor_array4)
-        t = np.random.uniform(low=0.0, high=8.0, size=shape).astype(dtype)
-        expected = [np.stack([t, t, t])]
-        check_tensor_array(mod, expected, t, dtype=dtype)
-
-    run("float32", [])
-    run("int32", [2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_static_tensor_get_data():
-    def run(dtype, shape):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-        static_tensor_array_ops = StaticTensorArrayOps(p, dtype, shape)
-        static_tensor_array_ops.register()
-
-        np_data_list = []
-        ta_length = 3
-        for _ in range(ta_length):
-            np_data_list.append(np.random.uniform(0, 10, size=shape).astype(dtype))
-
-        v0 = relay.var("v0")
-        v1 = relay.var("v1")
-        v2 = relay.var("v2")
-        n = relay.var("n")
-        tensor = p.get_tensor_ctor_static("tensor_constructor", dtype, shape)
-        tensor_array = p.get_global_var_static("tensor_array", dtype, shape)
-        init_tensor_array = tensor_array(relay.const(ta_length))
-        read_func = p.get_global_var_static("tensor_array_read", dtype, shape)
-        write_func = p.get_global_var_static("tensor_array_write", dtype, shape)
-        get_data_func = p.get_global_var_static("tensor_get_data", dtype, shape)
-        tensor_array0 = write_func(init_tensor_array, relay.const(0), tensor(v0))
-        tensor_array1 = write_func(tensor_array0, relay.const(1), tensor(v1))
-        tensor_array2 = write_func(tensor_array1, relay.const(2), tensor(v2))
-
-        mod["main"] = relay.Function([v0, v1, v2, n], get_data_func(read_func(tensor_array2, n)))
-        expected = [np_data_list[0]]
-        check_tensor_array(mod, expected, *list(np_data_list + [0]), dtype=dtype)
-        expected = [np_data_list[1]]
-        check_tensor_array(mod, expected, *list(np_data_list + [1]), dtype=dtype)
-        expected = [np_data_list[2]]
-        check_tensor_array(mod, expected, *list(np_data_list + [2]), dtype=dtype)
-
-    run("float32", [])
-    run("int32", [2, 3])
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_to_mixed_precision.py b/tests/python/relay/test_to_mixed_precision.py
deleted file mode 100644
index a8032ce0d26d..000000000000
--- a/tests/python/relay/test_to_mixed_precision.py
+++ /dev/null
@@ -1,624 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Unit tests for testing ToMixedPrecision pass"""
-from typing import Any, Dict, List
-
-import numpy as np
-import pytest
-import tvm
-from tvm import relay
-from tvm.relay.testing import lstm
-from tvm.relay.transform import InferType, ToMixedPrecision, mixed_precision
-
-target_precision = tvm.testing.parameter(
-    pytest.param("float16"),
-    pytest.param("bfloat16"),
-    ids=["float16", "bfloat16"],
-)
-
-
-def run_module(mod: tvm.runtime.Module, mod_params: Dict[str, Any]) -> List:
-    dev = tvm.device("llvm", 0)
-    result = relay.create_executor("debug", mod, device=dev, target="llvm").evaluate()(**mod_params)
-    if isinstance(result, tvm.runtime.container.ADT):
-        result = [r.numpy() for r in result]
-        return result
-    else:
-        return [result.numpy()]
-
-
-def verify_mixed_precision_output_close(
-    mod: tvm.runtime.Module,
-    mod_params: Dict[str, Any],
-    mixed_precision_dtype="float16",
-    rtol: float = 1e-3,
-    atol: float = 0,
-    keep_orig_output_dtype=False,
-) -> tvm.runtime.Module:
-    mod = InferType()(mod)
-    result_fp32 = run_module(mod, mod_params)
-
-    if not keep_orig_output_dtype:
-        amp_mod = ToMixedPrecision(mixed_precision_dtype)(mod)
-        result_amp = run_module(amp_mod, mod_params)
-    else:
-        with tvm.transform.PassContext(
-            config={"relay.ToMixedPrecision.keep_orig_output_dtype": True}
-        ):
-            amp_mod = ToMixedPrecision(mixed_precision_dtype)(mod)
-            result_amp = run_module(amp_mod, mod_params)
-
-    # Ensure the results are close
-    if mixed_precision_dtype != "bfloat16":
-        for fp32, amp in zip(result_fp32, result_amp):
-            np.testing.assert_allclose(fp32, amp, rtol=rtol, atol=atol)
-
-    if keep_orig_output_dtype:
-        assert (
-            np.array(result_amp).dtype == np.array(result_fp32).dtype
-        ), "output type and original type mismatch"
-
-    return amp_mod
-
-
-def test_lstm(target_precision):
-    """A small stress test on a single unrolled lstm unit.
-
-    Has internal functions and let statements the pass must work on.
-    """
-    # TODO(AndrewZhaoLuo): investigate why non-even units cause failure in codegen for CUDA
-    # See discussion here: https://github.com/apache/tvm/issues/8294#issuecomment-866190408
-    units = 4
-    iterations = 5
-    mod, mod_params = lstm.get_workload(iterations=iterations, num_hidden=units)
-
-    # This is an unrolled lstm so each data should be the previous results but
-    # we don't care, we just want to stress test things.
-    for i in range(iterations):
-        mod_params["data" if i == 0 else f"data{i}"] = np.random.uniform(
-            -10, 10, (1, units)
-        ).astype("float32")
-
-    verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, rtol=0.01, atol=0.01
-    )
-
-
-@pytest.mark.skip(reason="Flaky test")
-def test_lstm_float64():
-    """Tests if can handle other mixed precision types.
-
-    As a toy example show can convert graph to float64 and have it run.
-
-    It doesn't really make sense to do it, this just shows we can change
-    the target mixed_precision_dtype.
-    """
-    units = 3
-    iterations = 5
-    mod, mod_params = lstm.get_workload(iterations=iterations, num_hidden=units)
-
-    # This is an unrolled lstm so each data should be the previous results but
-    # we don't care, we just want to stress test things.
-    for i in range(iterations):
-        mod_params["data" if i == 0 else f"data{i}"] = np.random.uniform(
-            -10, 10, (1, units)
-        ).astype("float32")
-
-    verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype="float64", rtol=0.01, atol=0.01
-    )
-
-
-def test_convert_single_conv(target_precision):
-    """Conv is a green listed operation meaning it will always use fp16 workload.
-
-    By default it accumulates to fp32 and outputs fp16.
-    """
-    data_shape = (1, 3, 32, 32)
-    weight_shape = (5, 3, 3, 3)
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype="float32")
-    mod = tvm.IRModule.from_expr(conv)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
-    }
-    amp_mod = verify_mixed_precision_output_close(
-        mod,
-        mod_params,
-        mixed_precision_dtype=target_precision,
-        atol=0.01,
-        rtol=1e-3,
-        keep_orig_output_dtype=True,
-    )
-
-    expected_mod = tvm.IRModule.from_expr(
-        relay.cast(
-            relay.nn.conv2d(
-                relay.cast(data, target_precision),
-                relay.cast(weight, target_precision),
-                strides=(1, 1),
-                padding=(1, 1),
-                out_dtype=target_precision,
-            ),
-            "float32",
-        )
-    )
-    expected_mod = tvm.relay.transform.InferType()(expected_mod)
-
-    assert not tvm.ir.structural_equal(amp_mod, mod)
-    tvm.ir.assert_structural_equal(amp_mod, expected_mod)
-
-
-def test_convert_single_conv_fp64():
-    """As above but checks choosing a mixed_precision_type other than FP16 works"""
-    data_shape = (1, 3, 32, 32)
-    weight_shape = (5, 3, 3, 3)
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype="float32")
-    mod = tvm.IRModule.from_expr(conv)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
-    }
-    amp_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype="float64", atol=0.01, rtol=1e-3
-    )
-
-    # Note we still accumulate to FP32 by default, a user would need to overwrite default
-    # behavior to make this make more sense.
-    expected_mod = tvm.IRModule.from_expr(
-        relay.nn.conv2d(
-            relay.cast(data, "float64"),
-            relay.cast(weight, "float64"),
-            strides=(1, 1),
-            padding=(1, 1),
-            out_dtype="float64",
-        ),
-    )
-    expected_mod = tvm.relay.transform.InferType()(expected_mod)
-
-    assert not tvm.ir.structural_equal(amp_mod, mod)
-    tvm.ir.assert_structural_equal(amp_mod, expected_mod)
-
-
-def test_convert_conv_bn(target_precision):
-    """Conv is green and batch norm is gray. As Conv should output fp16 batch_norm should be green."""
-    data_shape = (1, 3, 32, 32)
-    weight_shape = (5, 3, 3, 3)
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype="float32")
-
-    bn_shape = [5]
-    gamma = relay.var("gamma", shape=bn_shape)
-    beta = relay.var("beta", shape=bn_shape)
-    moving_mean = relay.var("moving_mean", shape=bn_shape)
-    moving_var = relay.var("moving_var", shape=bn_shape)
-    bn = relay.nn.batch_norm(conv, gamma, beta, moving_mean, moving_var)
-    mod = tvm.IRModule.from_expr(bn[0])
-    mod = tvm.relay.transform.InferType()(mod)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
-        "gamma": np.random.uniform(-1, 1, size=bn_shape).astype("float32"),
-        "beta": np.random.uniform(-1, 1, size=bn_shape).astype("float32"),
-        "moving_mean": np.random.uniform(-1, 1, size=bn_shape).astype("float32"),
-        "moving_var": np.random.uniform(-1, 1, size=bn_shape).astype("float32"),
-    }
-    amp_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.025, rtol=0.01
-    )
-
-    # Creating expected module
-    data = relay.cast(relay.var("data", shape=data_shape), target_precision)
-    weight = relay.cast(relay.var("weight", shape=weight_shape), target_precision)
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype=target_precision)
-
-    bn_shape = [5]
-    gamma = relay.cast(relay.var("gamma", shape=bn_shape), target_precision)
-    beta = relay.cast(relay.var("beta", shape=bn_shape), target_precision)
-    moving_mean = relay.cast(relay.var("moving_mean", shape=bn_shape), target_precision)
-    moving_var = relay.cast(relay.var("moving_var", shape=bn_shape), target_precision)
-    bn = relay.nn.batch_norm(conv, gamma, beta, moving_mean, moving_var)
-
-    expected_mod = tvm.IRModule.from_expr(bn[0])
-    expected_mod = tvm.relay.transform.InferType()(expected_mod)
-    assert not tvm.ir.structural_equal(amp_mod, mod)
-    tvm.ir.assert_structural_equal(amp_mod, expected_mod)
-
-
-def test_do_not_convert_softmax(target_precision):
-    """Softmax is a red listed operation and therefore should never be fp16."""
-    shape = [1, 2, 3]
-    a = relay.var("a", shape=shape)
-    b = relay.nn.softmax(a)
-    mod = tvm.IRModule.from_expr(b)
-    mod = tvm.relay.transform.InferType()(mod)
-    out_mod = ToMixedPrecision(target_precision)(mod)
-    orig_mod = tvm.relay.transform.InferType()(mod)
-    tvm.ir.assert_structural_equal(orig_mod, out_mod)
-
-
-def test_do_not_convert_arange(target_precision):
-    """Arange is a red listed operation and therefore should never be fp16."""
-    dtype = "float32"
-    arange = relay.arange(relay.const(1, dtype), relay.const(128, dtype))
-    mod = tvm.IRModule.from_expr(arange)
-    out_mod = ToMixedPrecision(target_precision)(mod)
-    orig_mod = tvm.relay.transform.InferType()(mod)
-    tvm.ir.assert_structural_equal(orig_mod, out_mod)
-
-
-def test_do_not_convert_summation(target_precision):
-    """Ops that could involve a large summation are not allowed in fp16."""
-    shape = [1, 3, 16, 16]
-    a = relay.var("a", shape=shape)
-    ops = [
-        relay.sum,
-        relay.mean,
-        relay.nn.global_avg_pool2d,
-        lambda inp: relay.nn.adaptive_avg_pool2d(inp, (1, 1)),
-    ]
-    for op in ops:
-        mod = tvm.IRModule.from_expr(op(a))
-        out_mod = ToMixedPrecision(target_precision)(mod)
-        orig_mod = tvm.relay.transform.InferType()(mod)
-        tvm.ir.assert_structural_equal(orig_mod, out_mod)
-
-
-def test_green_gray_propagates_simple(target_precision):
-    """Conv is a green listed operation, while addition is gray.
-
-    As Conv outputs fp16 the add should be done in fp16.
-    """
-    data_shape = (1, 3, 32, 32)
-    weight_shape = (5, 3, 3, 3)
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype="float32")
-    conv = conv + conv
-    mod = tvm.IRModule.from_expr(conv)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
-    }
-    amp_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-
-    conv_expr = relay.nn.conv2d(
-        relay.cast(data, target_precision),
-        relay.cast(weight, target_precision),
-        strides=(1, 1),
-        padding=(1, 1),
-        out_dtype=target_precision,
-    )
-    expected_mod = tvm.IRModule.from_expr(conv_expr + conv_expr)
-    expected_mod = tvm.relay.transform.InferType()(expected_mod)
-
-    assert not tvm.ir.structural_equal(amp_mod, mod)
-    tvm.ir.assert_structural_equal(amp_mod, expected_mod)
-
-
-def test_green_red_not_use_extraneous_cast(target_precision):
-    """Conv. is a green listed operation, while softmax is red.
-
-    Conv. also by default accumulates to fp32 but outputs fp16.
-
-    We want to avoid a situation where we have extraneous casts.
-    E.g. because softmax wants to operate on FP32 we might have
-
-    conv (FP32) -> cast (FP16) -> cast (FP32) -> softmax (FP32)
-
-    To get around this internally when we cast in the pass we cache
-    the output nodes and the reverse of the cast back to the original
-    node. For example casting the `conv (FP32)` to FP16 would produce:
-
-    `conv (FP32) -> cast (FP16)`
-
-    As the outputs. Now anytime we try to cast the `conv (FP32)` node
-    to FP16 it would return the cached result instead of a new cast node:
-
-    `conv (FP32) -> cast (FP16)`
-
-    Furthermore, if we try to cast the `cast (FP16)` node back to FP32 it
-    would just return
-
-    `conv (FP32)`.
-
-    This test makes sure this behavior occurs.
-    """
-    data_shape = (1, 3, 32, 32)
-    weight_shape = (5, 3, 3, 3)
-    data = relay.var("data", shape=data_shape, dtype="float32")
-    weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype="float32")
-    result = relay.nn.softmax(conv)
-    mod = tvm.IRModule.from_expr(result)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
-    }
-    amp_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=1e-3
-    )
-
-    # Construct expected structure
-    conv = relay.cast(
-        relay.nn.conv2d(
-            relay.cast(data, target_precision),
-            relay.cast(weight, target_precision),
-            strides=(1, 1),
-            padding=(1, 1),
-            out_dtype=target_precision,
-        ),
-        "float32",
-    )
-    result = relay.nn.softmax(conv)
-    expected_mod = tvm.IRModule.from_expr(result)
-    expected_mod = InferType()(expected_mod)
-
-    tvm.ir.assert_structural_equal(expected_mod, amp_mod)
-
-
-def test_red_gray_propagates_simple(target_precision):
-    """Everything after a softmax should be in FP32 (exception green colored ops)"""
-    shape = [1, 2, 3]
-    a = relay.var("a", shape=shape)
-    b = relay.nn.softmax(a)
-    c = b + b
-    mod = tvm.IRModule.from_expr(c)
-    mod = tvm.relay.transform.InferType()(mod)
-
-    mod_params = {
-        "a": np.random.uniform(-1, 1, size=shape).astype("float32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.0, rtol=0.0
-    )
-
-    tvm.ir.assert_structural_equal(mod, output_mod)
-
-
-def test_let_statement_simple(target_precision):
-    """A 'simple' let statement example.
-
-    Noticeable is the mutation of the bound variable types.
-    """
-    var1 = relay.var("var1", shape=[1, 20])
-    var2 = relay.var("var2", shape=[1, 20])
-
-    data = relay.var("data", shape=[1, 20])
-    weight = relay.var("weight", shape=[20, 20])
-
-    r1 = var1 + var1
-
-    r2 = var2 + var2
-    let2 = relay.Let(var2, relay.nn.dense(r1, weight, units=20), r2)
-    let1 = relay.Let(var1, relay.nn.dense(data, weight, units=20), let2)
-
-    mod = tvm.IRModule.from_expr(let1)
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=[1, 20]).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=[20, 20]).astype("float32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.05, rtol=0.15
-    )
-
-    # Construct expected structure
-    var1 = relay.var("var1", shape=[1, 20], dtype=target_precision)
-    var2 = relay.var("var2", shape=[1, 20], dtype=target_precision)
-    data = relay.cast(relay.var("data", shape=[1, 20]), target_precision)
-    weight = relay.cast(relay.var("weight", shape=[20, 20]), target_precision)
-    r1 = var1 + var1
-    r2 = var2 + var2
-    let2 = relay.Let(
-        var2,
-        relay.nn.dense(r1, weight, units=20, out_dtype=target_precision),
-        r2,
-    )
-    let1 = relay.Let(
-        var1,
-        relay.nn.dense(data, weight, units=20, out_dtype=target_precision),
-        let2,
-    )
-    expected_mod = tvm.IRModule.from_expr(let1)
-    expected_mod = InferType()(expected_mod)
-
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-def test_where_simple(target_precision):
-    data = relay.var("data", shape=[1, 20])
-    weight = relay.var("weight", shape=[20, 20])
-    a = relay.nn.dense(data, weight, units=20)
-    b = relay.where(data, a, a)
-    mod = tvm.IRModule.from_expr(b)
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=[1, 20]).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=[20, 20]).astype("float32"),
-    }
-
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-
-    # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 20]), target_precision)
-    weight = relay.cast(relay.var("weight", shape=[20, 20]), target_precision)
-    a = relay.nn.dense(data, weight, units=20, out_dtype=target_precision)
-    b = relay.where(data, a, a)
-    expected_mod = tvm.IRModule.from_expr(b)
-    expected_mod = InferType()(expected_mod)
-
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-def test_batch_matmul_simple(target_precision):
-    """Batch matmul is a special case where we try to accumulate to fp16.
-
-    This is due to the fact heterogenous accumulation dtypes does not work
-    on all platforms at the moment.
-    """
-    data = relay.var("data", shape=[1, 1, 20])
-    weight = relay.var("weight", shape=[1, 20, 20])
-    a = relay.nn.batch_matmul(data, weight)
-    mod = tvm.IRModule.from_expr(a)
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=[1, 1, 20]).astype("float32"),
-        "weight": np.random.uniform(-1, 1, size=[1, 20, 20]).astype("float32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-    # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 1, 20]), target_precision)
-    weight = relay.cast(relay.var("weight", shape=[1, 20, 20]), target_precision)
-    a = relay.nn.batch_matmul(data, weight, out_dtype=target_precision)
-    expected_mod = tvm.IRModule.from_expr(a)
-    expected_mod = InferType()(expected_mod)
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-def test_convert_follow_node_with_integer_arguments(target_precision):
-    """Tests the conversion of a follow op with integer arguments + constant float args.
-
-    The follow op should convert the floating point argument into fp16 as constants/vars
-    will always be converted if safe to do so.
-    """
-
-    data = relay.var("data", shape=[1, 10], dtype="float32")
-
-    # We use an addition to make sure the input indices are not a var
-    # (which are always casted if safe)
-    indices = relay.var("indices", shape=[1, 1], dtype="int32") + relay.const(0, dtype="int32")
-    take = relay.take(data, indices, axis=0)
-    mod = tvm.IRModule.from_expr(take)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=[1, 10]).astype("float32"),
-        "indices": np.array([[0]]).astype("int32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-
-    # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 10]), target_precision)
-    take = relay.take(data, indices, axis=0)
-    expected_mod = tvm.IRModule.from_expr(take)
-    expected_mod = InferType()(expected_mod)
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-def test_clip(target_precision):
-    data = relay.var("data", shape=[1, 10], dtype="float32")
-    res = relay.clip(data, a_min=-128000, a_max=128000)
-
-    mod = tvm.IRModule.from_expr(res)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=[1, 10]).astype("float32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-
-    # Create expected module
-    if target_precision == "bfloat16":
-        data = relay.cast(relay.var("data", shape=[1, 10]), target_precision)
-    res = relay.clip(data, a_min=-128000, a_max=128000)
-    expected_mod = tvm.IRModule.from_expr(res)
-    expected_mod = InferType()(expected_mod)
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-def test_clip_with_pre_op(target_precision):
-    data = relay.var("data", shape=[1, 10], dtype="float32")
-    const = relay.const(5, "float32")
-    res = relay.divide(data, const)
-    res = relay.clip(res, a_min=-128000, a_max=128000)
-
-    mod = tvm.IRModule.from_expr(res)
-
-    mod_params = {
-        "data": np.random.uniform(-1, 1, size=[1, 10]).astype("float32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-
-    # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 10]), target_precision)
-    const = relay.cast(relay.const(5, "float32"), target_precision)
-    res = relay.divide(data, const)
-    if target_precision == "float16":
-        res = relay.cast(res, "float32")
-    res = relay.clip(res, a_min=-128000, a_max=128000)
-    expected_mod = tvm.IRModule.from_expr(res)
-    expected_mod = InferType()(expected_mod)
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-def test_loop(target_precision):
-    i = relay.var("i", shape=(), dtype="int32")
-    st = relay.var("st", shape=(relay.Any(), 1), dtype="int32")
-
-    def int32(val):
-        return relay.const(val, "int32")
-
-    def _cond(i, st):
-        return relay.op.min(relay.op.less(i, int32(10)))
-
-    def _body(i, st):
-        i_vec = relay.op.reshape(i, (1, 1))
-        ret = relay.op.concatenate([st, i_vec], axis=0)
-        return i + int32(1), ret
-
-    loop = relay.loops.while_loop(_cond, [i, st], _body)
-    start = relay.var("start", shape=(), dtype="int32")
-    body = loop(start, relay.op.reshape(relay.const(0), newshape=(1, 1)))
-    func = relay.Function([start], relay.TupleGetItem(body, 1))
-    mod = tvm.IRModule()
-    mod["main"] = func
-
-    mod_params = {
-        "start": np.random.uniform(-1, 1, size=()).astype("int32"),
-    }
-    output_mod = verify_mixed_precision_output_close(
-        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
-    )
-
-    # Create expected module
-    expected_mod = InferType()(mod)
-    tvm.ir.assert_structural_equal(expected_mod, output_mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_type_functor.py b/tests/python/relay/test_type_functor.py
deleted file mode 100644
index 8370b2a75dfc..000000000000
--- a/tests/python/relay/test_type_functor.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import TypeFunctor, TypeMutator, TypeVisitor
-from tvm.relay.ty import (
-    TypeVar,
-    IncompleteType,
-    TensorType,
-    FuncType,
-    TupleType,
-    TypeRelation,
-    RefType,
-    GlobalTypeVar,
-    TypeCall,
-)
-from tvm.relay.adt import TypeData
-
-
-def check_visit(typ):
-    try:
-        ef = TypeFunctor()
-        ef.visit(typ)
-        assert False
-    except NotImplementedError:
-        pass
-
-    ev = TypeVisitor()
-    ev.visit(typ)
-
-    tvm.ir.assert_structural_equal(TypeMutator().visit(typ), typ, map_free_vars=True)
-
-
-def test_type_var():
-    tv = TypeVar("a")
-    check_visit(tv)
-
-
-def test_incomplete_type():
-    it = IncompleteType()
-    check_visit(it)
-
-
-def test_tensor_type():
-    tt = TensorType([])
-    check_visit(tt)
-
-
-def test_func_type():
-    tv = TypeVar("tv")
-    tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
-    ft = FuncType([tt], tt, type_params=[tv])
-    check_visit(ft)
-
-
-def test_tuple_type():
-    tt = TupleType([TupleType([])])
-    check_visit(tt)
-
-
-def test_type_relation():
-    func = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast")
-    attrs = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3, 4))
-    tp = TypeVar("tp")
-    tf = FuncType([], TupleType([]), [], [])
-    tt = TensorType([1, 2, 3], "float32")
-    tr = TypeRelation(func, [tp, tf, tt], 2, attrs)
-
-    check_visit(tr)
-
-
-def test_ref_type():
-    rt = RefType(TupleType([]))
-    check_visit(rt)
-
-
-def test_global_type_var():
-    gtv = GlobalTypeVar("gtv")
-    check_visit(gtv)
-
-
-def test_type_call():
-    tc = TypeCall(GlobalTypeVar("tf"), [TupleType([])])
-    check_visit(tc)
-
-
-def test_type_data():
-    td = TypeData(GlobalTypeVar("td"), [TypeVar("tv")], [])
-    check_visit(td)
-
-
-if __name__ == "__main__":
-    test_type_var()
-    test_incomplete_type()
-    test_tensor_type()
-    test_func_type()
-    test_tuple_type()
-    test_type_relation()
-    test_ref_type()
-    test_global_type_var()
-    test_type_call()
-    test_type_data()
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
deleted file mode 100644
index 7d0cd51d3298..000000000000
--- a/tests/python/relay/test_type_infer.py
+++ /dev/null
@@ -1,587 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test that type checker correcly computes types
-   for expressions.
-"""
-import pytest
-import numpy as np
-
-import tvm
-from tvm import IRModule, relay
-from tvm.relay import op, transform
-from tvm.relay.op import op as _op
-from tvm.script import tir as T
-
-
-def infer_mod(mod, annotate_spans=True):
-    if annotate_spans:
-        mod = relay.transform.AnnotateSpans()(mod)
-
-    mod = transform.InferType()(mod)
-    return mod
-
-
-def infer_expr(expr):
-    transform.InferTypeLocal(expr)
-    return expr
-
-
-def assert_has_type(expr, typ, mod=None):
-    if not mod:
-        mod = tvm.IRModule({})
-
-    mod["main"] = expr
-    mod = infer_mod(mod)
-    checked_expr = mod["main"]
-    checked_type = checked_expr.checked_type
-    if checked_type != typ:
-        raise RuntimeError("Type mismatch %s vs %s" % (checked_type, typ))
-
-
-def initialize_box_adt(mod):
-    # initializes simple ADT for tests
-    box = relay.GlobalTypeVar("box")
-    tv = relay.TypeVar("tv")
-    constructor = relay.Constructor("constructor", [tv], box)
-    data = relay.TypeData(box, [tv], [constructor])
-    mod[box] = data
-    return box, constructor
-
-
-def test_monomorphic_let():
-    "Program: let %x = 1; %x"
-    # TODO(@jroesch): this seems whack.
-    sb = relay.ScopeBuilder()
-    x = relay.var("x", dtype="float64", shape=())
-    x = sb.let(x, relay.const(1.0, "float64"))
-    sb.ret(x)
-    xchecked = infer_expr(sb.get())
-    assert xchecked.checked_type == relay.scalar_type("float64")
-
-
-def test_single_op():
-    "Program: fn (%x : float32) { let %t1 = f(%x); %t1 }"
-    x = relay.var("x", shape=[])
-    func = relay.Function([x], op.log(x))
-    ttype = relay.TensorType([], dtype="float32")
-    assert_has_type(func, relay.FuncType([ttype], ttype))
-
-
-def test_add_broadcast_op():
-    """
-    Program:
-        fn (%x: Tensor[(10, 4), float32], %y: Tensor[(5, 10, 1), float32])
-            -> Tensor[(5, 10, 4), float32] {
-            %x + %y
-        }
-    """
-    x = relay.var("x", shape=(10, 4))
-    y = relay.var("y", shape=(5, 10, 1))
-    z = x + y
-    func = relay.Function([x, y], z)
-    t1 = relay.TensorType((10, 4), "float32")
-    t2 = relay.TensorType((5, 10, 1), "float32")
-    t3 = relay.TensorType((5, 10, 4), "float32")
-    expected_ty = relay.FuncType([t1, t2], t3)
-    assert_has_type(func, expected_ty)
-
-
-def test_dual_op():
-    """Program:
-    fn (%x : Tensor[(10, 10), float32]) {
-      let %t1 = log(x);
-      let %t2 = add(%t1, %x);
-      %t1
-    }
-    """
-    tp = relay.TensorType((10, 10), "float32")
-    x = relay.var("x", tp)
-    sb = relay.ScopeBuilder()
-    t1 = sb.let("t1", relay.log(x))
-    t2 = sb.let("t2", relay.add(t1, x))
-    sb.ret(t2)
-    f = relay.Function([x], sb.get())
-    fchecked = infer_expr(f)
-    assert fchecked.checked_type == relay.FuncType([tp], tp)
-
-
-def test_decl():
-    """Program:
-    def @f(%x : Tensor[(10, 10), float32]) {
-        log(%x)
-    }
-    """
-    tp = relay.TensorType((10, 10))
-    x = relay.var("x", tp)
-    f = relay.Function([x], relay.log(x))
-    fchecked = infer_expr(f)
-    assert fchecked.checked_type == relay.FuncType([tp], tp)
-
-
-def test_recursion():
-    """
-    Program:
-       def @f(%n: int32, %data: float32) -> float32 {
-          if (%n == 0) {
-              %data
-          } else {
-              @f(%n - 1, log(%data))
-          }
-       }
-    """
-    sb = relay.ScopeBuilder()
-    f = relay.GlobalVar("f")
-    ti32 = relay.scalar_type("int32")
-    tf32 = relay.scalar_type("float32")
-    n = relay.var("n", ti32)
-    data = relay.var("data", tf32)
-
-    with sb.if_scope(relay.equal(n, relay.const(0, ti32))):
-        sb.ret(data)
-    with sb.else_scope():
-        sb.ret(f(relay.subtract(n, relay.const(1, ti32)), relay.log(data)))
-    mod = tvm.IRModule()
-    mod[f] = relay.Function([n, data], sb.get())
-    mod = infer_mod(mod)
-    assert "@f(%1, %2)" in mod.astext()
-    assert mod["f"].checked_type == relay.FuncType([ti32, tf32], tf32)
-
-
-def test_incomplete_call():
-    tt = relay.scalar_type("int32")
-    x = relay.var("x", tt)
-    f_type = relay.FuncType([tt], tt)
-    f = relay.var("f")
-    func = relay.Function([x, f], relay.Call(f, [x]), tt)
-
-    ft = infer_expr(func)
-    assert ft.checked_type == relay.FuncType([tt, f_type], tt)
-
-
-def test_higher_order_argument():
-    a = relay.TypeVar("a")
-    x = relay.Var("x", a)
-    id_func = relay.Function([x], x, a, [a])
-
-    b = relay.TypeVar("b")
-    f = relay.Var("f", relay.FuncType([b], b))
-    y = relay.Var("y", b)
-    ho_func = relay.Function([f, y], f(y), b, [b])
-
-    # id func should be an acceptable argument to the higher-order
-    # function even though id_func takes a type parameter
-    ho_call = ho_func(id_func, relay.const(0, "int32"))
-
-    hc = infer_expr(ho_call)
-    expected = relay.scalar_type("int32")
-    assert hc.checked_type == expected
-
-
-def test_higher_order_return():
-    a = relay.TypeVar("a")
-    x = relay.Var("x", a)
-    id_func = relay.Function([x], x, a, [a])
-
-    b = relay.TypeVar("b")
-    nested_id = relay.Function([], id_func, relay.FuncType([b], b), [b])
-
-    ft = infer_expr(nested_id)
-    assert ft.checked_type == relay.FuncType([], relay.FuncType([b], b), [b])
-
-
-def test_higher_order_nested():
-    a = relay.TypeVar("a")
-    x = relay.Var("x", a)
-    id_func = relay.Function([x], x, a, [a])
-
-    choice_t = relay.FuncType([], relay.scalar_type("bool"))
-    f = relay.Var("f", choice_t)
-
-    b = relay.TypeVar("b")
-    z = relay.Var("z")
-    top = relay.Function(
-        [f], relay.If(f(), id_func, relay.Function([z], z)), relay.FuncType([b], b), [b]
-    )
-
-    expected = relay.FuncType([choice_t], relay.FuncType([b], b), [b])
-    ft = infer_expr(top)
-    assert ft.checked_type == expected
-
-
-def test_tuple():
-    tp = relay.TensorType((10,))
-    x = relay.var("x", tp)
-    res = relay.Tuple([x, x])
-    assert infer_expr(res).checked_type == relay.TupleType([tp, tp])
-
-
-def test_ref():
-    x = relay.var("x", "float32")
-    y = relay.var("y", "float32")
-    r = relay.RefCreate(x)
-    st = relay.scalar_type("float32")
-    assert infer_expr(r).checked_type == relay.RefType(st)
-    g = relay.RefRead(r)
-    assert infer_expr(g).checked_type == st
-    w = relay.RefWrite(r, y)
-    assert infer_expr(w).checked_type == relay.TupleType([])
-
-
-def test_free_expr():
-    x = relay.var("x", "float32")
-    y = relay.add(x, x)
-    yy = infer_expr(y)
-    tvm.ir.assert_structural_equal(yy.args[0], x, map_free_vars=True)
-    assert yy.checked_type == relay.scalar_type("float32")
-    assert x.vid.same_as(yy.args[0].vid)
-
-
-def test_type_args():
-    x = relay.var("x", shape=(10, 10))
-    y = relay.var("y", shape=(1, 10))
-    z = relay.add(x, y)
-
-    # InferTypeLocal does not support populating the type_args field
-    mod = infer_mod(IRModule.from_expr(z))
-    mod = infer_mod(mod, annotate_spans=False)
-    ty_args = mod["main"].body.type_args
-    assert len(ty_args) == 2
-    assert ty_args[0].dtype == "float32"
-    assert ty_args[1].dtype == "float32"
-    sh1 = ty_args[0].shape
-    sh2 = ty_args[1].shape
-    assert sh1[0].value == 10
-    assert sh1[1].value == 10
-    assert sh2[0].value == 1
-    assert sh2[1].value == 10
-
-
-def test_global_var_recursion():
-    mod = tvm.IRModule({})
-    gv = relay.GlobalVar("main")
-    x = relay.var("x", shape=[])
-    tt = relay.scalar_type("float32")
-
-    func = relay.Function([x], relay.Call(gv, [x]), tt)
-    mod[gv] = func
-    mod = infer_mod(mod)
-    func_ty = mod["main"].checked_type
-
-    assert func_ty == relay.FuncType([tt], tt)
-
-
-def test_equal():
-    i = relay.var("i", shape=[], dtype="int32")
-    eq = op.equal(i, relay.const(0, dtype="int32"))
-    func = relay.Function([i], eq)
-    ft = infer_expr(func)
-    expected = relay.FuncType([relay.scalar_type("int32")], relay.scalar_type("bool"))
-    assert ft.checked_type == expected
-
-    assert ft.checked_type == relay.FuncType(
-        [relay.scalar_type("int32")], relay.scalar_type("bool")
-    )
-
-
-def test_constructor_type():
-    mod = tvm.IRModule()
-    box, constructor = initialize_box_adt(mod)
-
-    a = relay.TypeVar("a")
-    x = relay.Var("x", a)
-    func = relay.Function([x], constructor(x), box(a), [a])
-    mod["main"] = func
-    mod = infer_mod(mod)
-    func_ty = mod["main"].checked_type
-    box = mod.get_global_type_var("box")
-    expected = relay.FuncType([a], box(a), [a])
-    assert func_ty == expected
-
-
-def test_constructor_call():
-    mod = tvm.IRModule()
-    box, constructor = initialize_box_adt(mod)
-
-    box_unit = constructor(relay.Tuple([]))
-    box_constant = constructor(relay.const(0, "float32"))
-
-    func = relay.Function([], relay.Tuple([box_unit, box_constant]))
-    mod["main"] = func
-    mod = infer_mod(mod)
-    ret_type = mod["main"].checked_type.ret_type.fields
-    # NB(@jroesch): when we annotate spans the ast fragments before
-    # annotation the previous fragments will no longer be directly equal.
-    box = mod.get_global_type_var("box")
-    expected1 = box(relay.TupleType([]))
-    expected2 = box(relay.TensorType((), "float32"))
-    assert ret_type[0] == expected1
-    assert ret_type[1] == expected2
-
-
-def test_adt_match():
-    mod = tvm.IRModule()
-    box, constructor = initialize_box_adt(mod)
-
-    v = relay.Var("v", relay.TensorType((), "float32"))
-    match = relay.Match(
-        constructor(relay.const(0, "float32")),
-        [
-            relay.Clause(
-                relay.PatternConstructor(constructor, [relay.PatternVar(v)]), relay.Tuple([])
-            ),
-            # redundant but shouldn't matter to typechecking
-            relay.Clause(relay.PatternWildcard(), relay.Tuple([])),
-        ],
-    )
-
-    func = relay.Function([], match)
-    mod["main"] = func
-    mod = infer_mod(mod)
-    actual = mod["main"].checked_type.ret_type
-    assert actual == relay.TupleType([])
-
-
-def test_adt_match_type_annotations():
-    mod = tvm.IRModule()
-    box, constructor = initialize_box_adt(mod)
-
-    # the only type annotation is inside the match pattern var
-    # but that should be enough info
-    tt = relay.TensorType((2, 2), "float32")
-    x = relay.Var("x")
-    mv = relay.Var("mv", tt)
-    match = relay.Match(
-        constructor(x),
-        [
-            relay.Clause(
-                relay.PatternConstructor(constructor, [relay.PatternVar(mv)]), relay.Tuple([])
-            )
-        ],
-    )
-
-    mod["main"] = relay.Function([x], match)
-    mod = infer_mod(mod)
-    ft = mod["main"].checked_type
-    assert ft == relay.FuncType([tt], relay.TupleType([]))
-
-
-def test_let_polymorphism():
-    id = relay.Var("id")
-    xt = relay.TypeVar("xt")
-    x = relay.Var("x", xt)
-    body = relay.Tuple([id(relay.const(1)), id(relay.Tuple([]))])
-    body = relay.Let(id, relay.Function([x], x, xt, [xt]), body)
-    body = infer_expr(body)
-    int32 = relay.TensorType((), "int32")
-    tvm.ir.assert_structural_equal(body.checked_type, relay.TupleType([int32, relay.TupleType([])]))
-
-
-def test_type_arg_infer():
-    code = """
-#[version = "0.0.5"]
-def @id[A](%x: A) -> A {
-  %x
-}
-def @main(%f: float32) -> float32 {
-  @id(%f)
-}
-"""
-    mod = tvm.relay.fromtext(code)
-    mod = transform.InferType()(mod)
-    tvm.ir.assert_structural_equal(mod["main"].body.type_args, [relay.TensorType((), "float32")])
-
-
-def test_dynamic_function():
-    dy_tt = relay.TensorType([relay.Any()], "float32")
-    s_tt = relay.TensorType([10], "float32")
-    x = relay.Var("x", dy_tt)
-    f = relay.Function([x], x + x)
-    y = relay.Var("y", s_tt)
-    c = f(y)
-
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([y], c)
-    mod = transform.InferType()(mod)
-    assert mod["main"].params[0].checked_type == s_tt
-
-    data = relay.var(
-        "data", shape=(relay.Any(), relay.Any(), relay.Any(), relay.Any()), dtype="float32"
-    )
-    weigth = relay.const(np.full((16, 16, 3, 3), 0.25), dtype="float32")
-    x = relay.nn.conv2d(data, weigth, kernel_size=(3, 3), channels=16, groups=2)
-    mod = tvm.IRModule.from_expr(x)
-    mod = transform.InferType()(mod)
-
-
-def test_custom_op_infer():
-    """Tests infer type for custom_op"""
-    op_name = "custom_log"
-    _op.register(op_name, r"code(cal log of a tensor.)code")
-    _op.get(op_name).set_num_inputs(1)
-    _op.get(op_name).add_argument("data_0", "Tensor", "The input data tensor.")
-    # call default relation functions
-    _op.get(op_name).add_type_rel("Identity")
-    _op.get(op_name).set_support_level(1)
-    _op.register_pattern(op_name, _op.OpPattern.ELEMWISE)
-    _op.register_stateful(op_name, False)
-
-    def clog(x):
-        return relay.Call(_op.get(op_name), [x])
-
-    tp = relay.TensorType((10, 10), "float32")
-    x = relay.var("x", tp)
-    sb = relay.ScopeBuilder()
-    t1 = sb.let("t1", clog(x))
-    t2 = sb.let("t2", relay.add(t1, x))
-    sb.ret(t2)
-    f = relay.Function([x], sb.get())
-    fchecked = infer_expr(f)
-    assert fchecked.checked_type == relay.FuncType([tp], tp)
-
-
-def test_custom_add_broadcast_op():
-    """Tests infer type for broadcast custom_op"""
-    op_name = "custom_broadcast_add"
-    _op.register(op_name, r"code(Add two tensor with inner broadcasting.)code")
-    _op.get(op_name).set_num_inputs(2)
-    _op.get(op_name).add_argument("data_0", "Tensor", "The input data tensor.")
-    _op.get(op_name).add_argument("data_1", "Tensor", "The input data tensor.")
-    # call default relation functions
-    _op.get(op_name).add_type_rel("Broadcast")
-    _op.get(op_name).set_support_level(1)
-    _op.register_stateful(op_name, False)
-
-    def broadcast_add(x, y):
-        return relay.Call(_op.get(op_name), [x, y])
-
-    x = relay.var("x", shape=(10, 4))
-    y = relay.var("y", shape=(5, 10, 1))
-    z = broadcast_add(x, y)
-    func = relay.Function([x, y], z)
-    t1 = relay.TensorType((10, 4), "float32")
-    t2 = relay.TensorType((5, 10, 1), "float32")
-    t3 = relay.TensorType((5, 10, 4), "float32")
-    expected_ty = relay.FuncType([t1, t2], t3)
-    assert_has_type(func, expected_ty)
-
-
-def test_custom_op_rel_infer():
-    """Tests infer type for custom_op"""
-
-    def custom_log1_rel(arg_types, attrs):
-        assert len(arg_types) == 1, "type relation arg number mismatch!"
-        if attrs:
-            assert isinstance(attrs, DictAttrs)
-        inputa_type = arg_types[0]
-        return relay.TensorType(inputa_type.shape, inputa_type.dtype)
-
-    op_name = "custom_log1"
-    _op.register(op_name, r"code(cal log of a tensor.)code")
-    _op.get(op_name).set_num_inputs(1)
-    _op.get(op_name).add_argument("data_0", "Tensor", "The input data tensor.")
-    _op.get(op_name).set_attrs_type_key("DictAttrs")
-    # call customized relation functions
-    _op.get(op_name).add_type_rel("custom_log1", custom_log1_rel)
-    _op.get(op_name).set_support_level(1)
-    _op.register_pattern(op_name, _op.OpPattern.ELEMWISE)
-    _op.register_stateful(op_name, False)
-
-    def clog(x):
-        return relay.Call(_op.get(op_name), [x])
-
-    tp = relay.TensorType((10, 10), "float32")
-    x = relay.var("x", tp)
-    sb = relay.ScopeBuilder()
-    t1 = sb.let("t1", clog(x))
-    t2 = sb.let("t2", relay.add(t1, x))
-    sb.ret(t2)
-    f = relay.Function([x], sb.get())
-    fchecked = infer_expr(f)
-    assert fchecked.checked_type == relay.FuncType([tp], tp)
-
-
-def test_custom_op_rel_infer_exception():
-    """Tests infer type for custom_op"""
-
-    def custom_log1_rel(arg_types, attrs):
-        assert len(arg_types) == 2, "type relation arg number mismatch!"
-        return None
-
-    op_name = "custom_log2"
-    _op.register(op_name, r"code(cal log of a tensor.)code")
-    _op.get(op_name).set_num_inputs(1)
-    _op.get(op_name).add_argument("data_0", "Tensor", "The input data tensor.")
-    _op.get(op_name).set_attrs_type_key("DictAttrs")
-    # call customized relation functions
-    _op.get(op_name).add_type_rel("custom_log2", custom_log1_rel)
-    _op.get(op_name).set_support_level(1)
-    _op.register_pattern(op_name, _op.OpPattern.ELEMWISE)
-    _op.register_stateful(op_name, False)
-
-    def clog(x):
-        return relay.Call(_op.get(op_name), [x])
-
-    tp = relay.TensorType((10, 10), "float32")
-    x = relay.var("x", tp)
-    sb = relay.ScopeBuilder()
-    t1 = sb.let("t1", clog(x))
-    t2 = sb.let("t2", relay.add(t1, x))
-    sb.ret(t2)
-    f = relay.Function([x], sb.get())
-    with pytest.raises(AssertionError) as cm:
-        fchecked = infer_expr(f)
-        assert "type relation arg number mismatch" in str(cm.execption)
-
-
-def test_repeat_register():
-    op_name = "custom_log3"
-    _op.register(op_name, r"code(cal log of a tensor.)code")
-    with pytest.raises(tvm.error.TVMError) as cm:
-        _op.register(op_name)
-        assert "Operator custom_log3 is registered before" in str(cm.execption)
-
-
-@pytest.mark.parametrize("relay_op", [relay.op.argmax, relay.op.argmin])
-@pytest.mark.parametrize(
-    "shape_dtype",
-    [
-        ("int32", T.int32),
-        ("int64", T.int64),
-    ],
-    ids=["int32", "int64"],
-)
-def test_argreduce_infer_return_type(relay_op, shape_dtype):
-    x_shape = (1, 1)
-    broadcast_shape = [1, 1]
-    (sdtype, conv) = shape_dtype
-
-    x = relay.var("data", relay.TensorType(x_shape, "float32"))
-    broadcast_to = relay.op.broadcast_to(x, relay.const(broadcast_shape, dtype=sdtype))
-    argmax = relay_op(broadcast_to, axis=[1])
-
-    f = relay.Function([x], argmax)
-    assert_has_type(
-        f,
-        relay.FuncType(
-            [relay.TensorType(broadcast_shape, "float32")],
-            relay.TensorType([conv(1)], dtype=sdtype),
-        ),
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_type_solver.py b/tests/python/relay/test_type_solver.py
deleted file mode 100644
index c1dc5c03a420..000000000000
--- a/tests/python/relay/test_type_solver.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay
-from tvm.relay import testing
-
-import pytest
-import numpy as np
-
-
-def make_rel(name, args, num_inputs=None, attrs=None):
-    func = tvm.ir.EnvFunc.get("tvm.relay.type_relation." + name)
-    if num_inputs is None:
-        num_inputs = len(args) - 1
-    return relay.ty.TypeRelation(func, args, num_inputs, attrs)
-
-
-def make_solver():
-    solver = relay.analysis._ffi_api._test_type_solver()
-    solver.Solve = solver("Solve")
-    solver.Unify = solver("Unify")
-    solver.Resolve = solver("Resolve")
-    solver.AddConstraint = solver("AddConstraint")
-
-    def gen_type(name, args, out=None):
-        out = out if out else relay.ty.IncompleteType()
-        solver.AddConstraint(make_rel(name, args + [out]))
-        return out
-
-    solver.gen_type = gen_type
-    return solver
-
-
-def test_bcast():
-    solver = make_solver()
-    t0 = relay.ty.TensorType((10, 20), "float32")
-    t1 = relay.ty.TensorType((10, 1), "float32")
-    tc = relay.ty.TensorType((10, 1, 1), "float32")
-    t2 = solver.gen_type("Broadcast", [t0, t1])
-    t3 = solver.gen_type("Identity", [t2])
-    t4 = solver.gen_type("Broadcast", [t3, tc])
-    assert solver.Solve()
-    assert solver.Resolve(t2) == relay.ty.TensorType((10, 20), "float32")
-    assert solver.Resolve(t4) == relay.ty.TensorType((10, 10, 20), "float32")
-
-
-def test_backward_solving():
-    solver = make_solver()
-    t0 = relay.ty.TensorType((10, 20), "float32")
-    tc = relay.ty.TensorType((10, 1, 1), "float32")
-    t1 = relay.ty.IncompleteType()
-    t3 = solver.gen_type("Broadcast", [t0, t1])
-    t2 = solver.gen_type("Identity", [t1], out=tc)
-    assert solver.Solve()
-    assert solver.Resolve(t3) == relay.ty.TensorType((10, 10, 20), "float32")
-
-
-def test_unify_tuple():
-    solver = make_solver()
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-    t3 = relay.ty.TensorType((10, 20), "float32")
-
-    tup1 = relay.ty.TupleType([t1, t2])
-    tup2 = relay.ty.TupleType([t3, t3])
-
-    unified = solver.Unify(tup1, tup2)
-    assert unified == tup2
-
-
-def test_unify_global_type_var():
-    # should only be able to unify if they're the same
-    solver = make_solver()
-    gtv = relay.GlobalTypeVar("gtv")
-    unified = solver.Unify(gtv, gtv)
-    assert unified == gtv
-
-
-def test_unify_typecall():
-    solver = make_solver()
-    gtv = relay.GlobalTypeVar("gtv")
-
-    # yeah, typecalls are shaped like tuples so the same
-    # tests work out
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-    t3 = relay.ty.TensorType((10, 20), "float32")
-
-    tc1 = relay.ty.TypeCall(gtv, [t1, t2])
-    tc2 = relay.ty.TypeCall(gtv, [t3, t3])
-    unified = solver.Unify(tc1, tc2)
-    assert unified == tc2
-
-
-def test_unify_functype():
-    solver = make_solver()
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-    t3 = relay.ty.IncompleteType()
-
-    unit = relay.ty.TupleType([])
-    tensor1 = relay.ty.TensorType((10, 20), "float32")
-    tensor2 = relay.ty.TensorType((10,), "float32")
-
-    ft1 = relay.ty.FuncType([t1, t2], t3)
-    ft2 = relay.ty.FuncType([tensor1, tensor2], unit)
-
-    unified = solver.Unify(ft1, ft2)
-    assert unified == ft2
-
-
-def test_recursive_unify():
-    solver = make_solver()
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-    t3 = relay.ty.IncompleteType()
-
-    tensor1 = relay.ty.TensorType((10, 10, 20), "float32")
-    tensor2 = relay.ty.TensorType((10, 20), "float32")
-    tensor3 = relay.ty.TensorType((10,), "float32")
-
-    tup1 = relay.ty.TupleType([relay.ty.TupleType([t1, t2]), t2])
-    tup2 = relay.ty.TupleType([relay.ty.TupleType([tensor1, tensor2]), tensor2])
-
-    ft1 = relay.ty.FuncType([tup1, t3], t3)
-    ft2 = relay.ty.FuncType([tup2, tensor3], tensor3)
-
-    unified = solver.Unify(ft1, ft2)
-    assert unified == ft2
-
-
-def test_unify_vars_under_tuples():
-    solver = make_solver()
-    t1 = relay.ty.IncompleteType()
-
-    tup1 = relay.ty.TupleType([t1, t1])
-    unified = solver.Unify(tup1, tup1)
-    assert unified == tup1
-
-    t2 = relay.ty.IncompleteType()
-    tup2 = relay.ty.TupleType([t2, t2])
-
-    tup3 = relay.ty.TupleType([t1, t2])
-    tup4 = relay.ty.TupleType([t2, t1])
-    unified = solver.Unify(tup3, tup4)
-    assert unified == tup1 or unified == tup2
-
-
-def test_binding_over_typevars():
-    solver = make_solver()
-
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-
-    a = relay.ty.TypeVar("a")
-    b = relay.ty.TypeVar("b")
-    c = relay.ty.TypeVar("c")
-    d = relay.ty.TypeVar("d")
-
-    ft1 = relay.ty.FuncType([t1], t2, [c, d])
-    ft2 = relay.ty.FuncType([a], b, [a, b])
-    unified = solver.Unify(ft1, ft2)
-    assert unified == solver.Resolve(ft1)
-
-
-def test_recursive_backward_solving():
-    solver = make_solver()
-
-    tensor1 = relay.ty.TensorType((10, 20), "float32")
-    tensor2 = relay.ty.TensorType((10, 1, 1), "float32")
-    tensor3 = relay.ty.TensorType((10,), "float32")
-
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-    t3 = relay.ty.IncompleteType()
-
-    tup1 = relay.ty.TupleType([relay.ty.TupleType([tensor1, tensor2]), tensor3])
-    tup2 = relay.ty.TupleType([relay.ty.TupleType([t1, t2]), t3])
-    solver.gen_type("Identity", [tup1], out=tup2)
-
-    assert solver.Solve()
-    assert solver.Resolve(tup2) == tup1
-
-
-def test_backward_solving_after_child_update():
-    solver = make_solver()
-
-    tensor1 = relay.ty.TensorType((10, 20), "float32")
-    tensor2 = relay.ty.TensorType((10, 1, 1), "float32")
-
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-    t3 = relay.ty.IncompleteType()
-
-    tup1 = relay.ty.TupleType([t1, t2])
-    tup2 = relay.ty.TupleType([t1, t3])
-
-    tup_concrete = relay.ty.TupleType([tensor1, tensor2])
-
-    t4 = solver.gen_type("Identity", [tup1])
-    t5 = solver.gen_type("Identity", [tup2])
-
-    solver.gen_type("Identity", [t4], out=t5)
-    assert solver.Solve()
-    assert solver.Resolve(t3) == t3 or solver.Resolve(t3) == t2
-    assert solver.Resolve(t4) == tup1 or solver.Resolve(t4) == tup2
-    assert solver.Resolve(t5) == tup1 or solver.Resolve(t5) == tup2
-
-    # updating the variables *inside* tup1 and tup2 should update t4 and t5
-    solver.gen_type("Identity", [t1], out=tensor1)
-    solver.gen_type("Identity", [t2], out=tensor2)
-    assert solver.Solve()
-    assert solver.Resolve(t4) == tup_concrete
-    assert solver.Resolve(t5) == tup_concrete
-
-
-def test_unify_quantified_funcs():
-    solver = make_solver()
-    a, b, c = relay.TypeVar("a"), relay.TypeVar("b"), relay.TypeVar("c")
-    ft1 = relay.FuncType([a, b], c, [a, b, c])
-    ft2 = relay.FuncType([a, a], a, [a])
-    unified = solver.Unify(ft1, ft2)
-    assert unified == ft2
-
-    ft3 = relay.FuncType([a], a, [a])
-    ft4 = relay.FuncType([b], c, [b, c])
-    unified = solver.Unify(ft3, ft4)
-    assert unified == ft3
-
-
-def test_unify_quantified_func_and_concrete():
-    solver = make_solver()
-    a, b = relay.TypeVar("a"), relay.TypeVar("b")
-    ft1 = relay.FuncType([a], b, [a, b])
-    ft2 = relay.FuncType([b], relay.TupleType([]), [b])
-    unified = solver.Unify(ft1, ft2)
-    assert unified == ft2
-
-
-def test_unify_quantified_funcs_nesting():
-    solver = make_solver()
-    a, b, c = relay.TypeVar("a"), relay.TypeVar("b"), relay.TypeVar("c")
-
-    ft1 = relay.FuncType([a, relay.TupleType([b, c])], relay.TupleType([a, b, c]), [a, b, c])
-    ft2 = relay.FuncType([a, relay.TupleType([a, a])], relay.TupleType([a, a, a]), [a])
-    unified = solver.Unify(ft1, ft2)
-    assert unified == ft2
-
-
-def test_unify_quantified_funcs_var_order():
-    solver = make_solver()
-    a, b, c = relay.TypeVar("a"), relay.TypeVar("b"), relay.TypeVar("c")
-
-    ft1 = relay.FuncType([a, relay.TupleType([b, c])], relay.TupleType([a, b, c]), [a, b, c])
-    ft2 = relay.FuncType([a, relay.TupleType([a, c])], relay.TupleType([a, a, c]), [a, c])
-    # unified = solver.Unify(ft1, ft2) # crashes here but it shouldn't
-    # assert unified == ft2
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_incompatible_tuple_unification():
-    solver = make_solver()
-    t1 = relay.ty.IncompleteType()
-    t2 = relay.ty.IncompleteType()
-
-    tensor1 = relay.ty.TensorType((1, 2, 3), "float32")
-    tensor2 = relay.ty.TensorType((2, 3), "float32")
-    tensor3 = relay.ty.TensorType((3,), "float32")
-
-    tup1 = relay.ty.TupleType([relay.ty.TupleType([t1, t1]), t2])
-    tup2 = relay.ty.TupleType([relay.ty.TupleType([tensor1, tensor2]), tensor3])
-    solver.Unify(tup1, tup2)
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_bad_recursive_unification():
-    solver = make_solver()
-    t1 = relay.ty.IncompleteType()
-    solver.Unify(t1, relay.ty.TupleType([t1, t1]))
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_unify_invalid_global_typevars():
-    solver = make_solver()
-    gtv1 = relay.GlobalTypeVar("gtv1")
-    gtv2 = relay.GlobalTypeVar("gtv2")
-    solver.Unify(gtv1, gtv2)
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_incompatible_typecall_var_unification():
-    solver = make_solver()
-    gtv1 = relay.GlobalTypeVar("gtv1")
-    gtv2 = relay.GlobalTypeVar("gtv2")
-
-    t1 = relay.IncompleteType()
-    t2 = relay.IncompleteType()
-
-    tc1 = relay.TypeCall(gtv1, [t1])
-    tc2 = relay.TypeCall(gtv2, [t2])
-    solver.Unify(tc1, tc2)
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_incompatible_typecall_args_unification():
-    solver = make_solver()
-    gtv = relay.GlobalTypeVar("gtv1")
-    t1 = relay.IncompleteType()
-    t2 = relay.IncompleteType()
-
-    tensor1 = relay.TensorType((1, 2, 3), "float32")
-    tensor2 = relay.TensorType((2, 3), "float32")
-    tensor3 = relay.TensorType((3,), "float32")
-
-    tc1 = relay.TypeCall(gtv, [relay.TupleType([t1, t1]), t2])
-    tc2 = relay.TypeCall(gtv, [relay.TupleType([tensor1, tensor2]), tensor3])
-    solver.Unify(tc1, tc2)
-
-
-@pytest.mark.xfail(raises=tvm._ffi.base.TVMError)
-def test_incompatible_quantified_func_unification():
-    solver = make_solver()
-    a, b, c = relay.TypeVar("a"), relay.TypeVar("b"), relay.TypeVar("c")
-
-    ft1 = relay.FuncType([a, b], c, [a, b, c])
-    ft2 = relay.FuncType([b, c], relay.TupleType([a]), [a, b, c])
-    solver.Unify(ft1, ft2)
-
-
-def test_integer_compatibility_in_layout_transform():
-    x = relay.var("data", shape=(2, 3, 48, 48), dtype="float32")
-    conv_out = relay.nn.conv2d(
-        x,
-        relay.var("weight", shape=(1, 3, 1, 1), dtype="float32"),
-        strides=[47, 47],
-        channels=1,
-        kernel_size=[1, 1],
-    )
-    bias_out = relay.nn.bias_add(conv_out, relay.var("bias"))
-    broadcast_out = relay.op.broadcast_to(bias_out, relay.const([2, 1, 2, 2], dtype="int64"))
-    y = relay.add(bias_out, broadcast_out)
-
-    mod, _ = testing.create_workload(y)
-    with tvm.transform.PassContext(opt_level=3):
-        with tvm.target.Target("llvm"):
-            mod = relay.transform.CanonicalizeOps()(mod)
-            mod = relay.transform.AlterOpLayout()(mod)
-
-
-if __name__ == "__main__":
-    test_bcast()
-    test_backward_solving()
-    test_unify_tuple()
-    test_unify_typecall()
-    test_unify_functype()
-    test_recursive_unify()
-    test_unify_vars_under_tuples()
-    test_recursive_backward_solving()
-    test_backward_solving_after_child_update()
-    test_unify_quantified_funcs()
-    test_unify_quantified_func_and_concrete()
-    test_unify_quantified_funcs_nesting()
-    test_unify_quantified_funcs_var_order()
-    test_incompatible_tuple_unification()
-    test_bad_recursive_unification()
-    test_incompatible_typecall_var_unification()
-    test_incompatible_typecall_args_unification()
-    test_incompatible_quantified_func_unification()
-    test_integer_compatibility_in_layout_transform()
diff --git a/tests/python/relay/test_typecall.py b/tests/python/relay/test_typecall.py
deleted file mode 100644
index 1cfa661a2c50..000000000000
--- a/tests/python/relay/test_typecall.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import relay
-from tvm.relay import transform
-
-
-def test_dup_type():
-    a = relay.TypeVar("a")
-    av = relay.Var("av", a)
-    make_id = relay.Function([av], relay.Tuple([av, av]), None, [a])
-    t = relay.scalar_type("float32")
-    b = relay.Var("b", t)
-    mod = tvm.IRModule.from_expr(make_id(b))
-    mod = transform.InferType()(mod)
-    inferred = mod["main"].body
-    assert inferred.checked_type == relay.TupleType([t, t])
-
-
-def test_id_type():
-    mod = tvm.IRModule()
-    id_type = relay.GlobalTypeVar("id")
-    a = relay.TypeVar("a")
-    mod[id_type] = relay.TypeData(id_type, [a], [])
-
-    b = relay.TypeVar("b")
-    make_id = relay.Var("make_id", relay.FuncType([b], id_type(b), [b]))
-    t = relay.scalar_type("float32")
-    b = relay.Var("b", t)
-    mod["main"] = relay.Function([make_id, b], make_id(b))
-    mod = transform.InferType()(mod)
-    assert mod["main"].body.checked_type == id_type(t)
-
-
-if __name__ == "__main__":
-    test_dup_type()
-    test_id_type()
diff --git a/tests/python/relay/test_used_memory_annotator.py b/tests/python/relay/test_used_memory_annotator.py
deleted file mode 100644
index e339152294b6..000000000000
--- a/tests/python/relay/test_used_memory_annotator.py
+++ /dev/null
@@ -1,434 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""
-Testing for the pass that annotates used memory for each primitive
-Relay function.
-"""
-
-import pytest
-
-import tvm
-from tvm import relay
-from tvm.relay.expr_functor import ExprVisitor
-
-
-def AnnotateUsedMemory():
-    return relay.transform._ffi_api.AnnotateUsedMemory()
-
-
-class CheckUsedMemoryAnnotation(ExprVisitor):
-    """
-    Check that the annotations on each function in the graph match
-    what is expected.
-    """
-
-    def __init__(self, expected_annotations, expected_io_annotation):
-        self.expected_annotations = expected_annotations
-        self.expected_io_annotation = expected_io_annotation
-        super().__init__()
-
-    def visit_function(self, fn):
-        if "Primitive" in fn.attrs:
-            assert (
-                "used_memory" in fn.attrs
-            ), "Primitive function does not have used_memory annotation."
-
-            assert len(self.expected_annotations) > 0, "Not all expected annotations were compared"
-
-            expected_mem = self.expected_annotations.pop(0)
-            actual_mem = [int(x) for x in fn.attrs["used_memory"]]
-            assert expected_mem == actual_mem, (
-                f"Expected used memory annotation {expected_mem} "
-                f"did not match actual annotation {actual_mem}"
-            )
-        super().visit_function(fn)
-
-    def __call__(self, fn):
-        assert (
-            fn.attrs["io_used_memory"] == self.expected_io_annotation
-        ), "Expected IO annotation did not match."
-        self.visit(fn.body)
-
-
-def _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation):
-    mod = relay.transform.InferType()(mod)
-    mod = relay.transform.ToANormalForm()(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = AnnotateUsedMemory()(mod)
-
-    CheckUsedMemoryAnnotation(expected_annotations, expected_io_annotation)(mod["main"])
-
-
-def _create_primitive_function(expr):
-    func = relay.Function(relay.analysis.free_vars(expr), expr)
-    func = func.with_attr("Primitive", 1)
-    return func
-
-
-def test_simple():
-    """
-    Test simple graph with one primitive function.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
-        x = relay.nn.max_pool2d(x)
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
-    call = relay.Call(get_inner_func(), [ifm])
-    mod = tvm.IRModule.from_expr(call)
-
-    expected_annotations = [
-        [2 * (1 * 2 * 2 * 4)],
-    ]
-    expected_io_annotation = 2 * (1 * 2 * 2 * 4)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_multiple_functions():
-    """
-    Test a graph with multiple primitive functions.
-    """
-
-    def get_inner_func(ifm_shape):
-        x = relay.var("x", shape=ifm_shape, dtype="int8")
-        x = relay.nn.max_pool2d(x, pool_size=(2, 2), layout="NHWC")
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 8, 8, 2), dtype="int8")
-    x = get_inner_func((1, 8, 8, 2))
-    x = relay.Call(x, [ifm])
-    y = get_inner_func((1, 7, 7, 2))
-    y = relay.Call(y, [x])
-    z = get_inner_func((1, 6, 6, 2))
-    z = relay.Call(z, [y])
-    mod = tvm.IRModule.from_expr(z)
-
-    expected_annotations = [
-        [(1 * 8 * 8 * 2) + (1 * 7 * 7 * 2)],
-        [(1 * 7 * 7 * 2) + (1 * 6 * 6 * 2)],
-        [(1 * 6 * 6 * 2) + (1 * 5 * 5 * 2)],
-    ]
-    expected_io_annotation = (1 * 8 * 8 * 2) + (1 * 5 * 5 * 2)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_mixed_data_types():
-    """
-    Test a graph with a primitive function that has mixed datatypes.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 2), dtype="int16")
-        x = relay.cast(x, dtype="uint32")
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 2, 2, 2), dtype="int16")
-    x = get_inner_func()
-    x = relay.Call(x, [ifm])
-    mod = tvm.IRModule.from_expr(x)
-
-    expected_annotations = [
-        [(1 * 2 * 2 * 2) * 2 + (1 * 2 * 2 * 2) * 4],
-    ]
-    expected_io_annotation = (1 * 2 * 2 * 2) * 2 + (1 * 2 * 2 * 2) * 4
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_parallel_function_call():
-    """
-    Test a graph when the results of two functions are concatenated
-    into a single result. The second function will also have the result
-    of the first function alive so will be annotated with a larger
-    "used memory" value.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        x = relay.reshape(x, newshape=(1, 4, 30))
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 4, 5, 6), dtype="int8")
-    x = relay.Call(get_inner_func(), [ifm])
-    y = relay.Call(get_inner_func(), [ifm])
-    z = relay.concatenate([x, y], axis=0)
-    mod = tvm.IRModule.from_expr(z)
-
-    expected_annotations = [
-        [(1 * 4 * 5 * 6) + (1 * 4 * 30)],
-        # the output tensor from the previous function is also alive
-        [(1 * 4 * 5 * 6) + (1 * 4 * 30) + (1 * 4 * 30)],
-    ]
-    expected_io_annotation = (1 * 4 * 5 * 6) + (1 * 4 * 60)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_many_different_parallel_calls():
-    """
-    Test a graph that calls many different functions in parallel.
-
-                    input
-            /         |         \
-    prim_func_1  prim_func_2  prim_func_3
-           \         |         /
-                 prim_func_4
-    """
-
-    def get_inner_func_1():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        x = relay.tanh(x)
-        x = _create_primitive_function(x)
-        return x
-
-    def get_inner_func_2():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        x = relay.nn.max_pool2d(x, pool_size=(1, 1), layout="NHWC")
-        x = _create_primitive_function(x)
-        return x
-
-    def get_inner_func_3():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        x = relay.abs(x)
-        x = relay.nn.relu(x)
-        x = relay.exp(x)
-        x = _create_primitive_function(x)
-        return x
-
-    def get_inner_func_4():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        y = relay.var("y", shape=(1, 4, 5, 6), dtype="int8")
-        z = relay.var("z", shape=(1, 4, 5, 6), dtype="int8")
-        out = relay.concatenate([x, y, z], axis=3)
-        out = _create_primitive_function(out)
-        return out
-
-    ifm = relay.var("input", shape=(1, 4, 5, 6), dtype="int8")
-    x = relay.Call(get_inner_func_1(), [ifm])
-    y = relay.Call(get_inner_func_2(), [ifm])
-    z = relay.Call(get_inner_func_3(), [ifm])
-    a = relay.Call(get_inner_func_4(), [x, y, z])
-    mod = tvm.IRModule.from_expr(a)
-
-    expected_annotations = [
-        [(1 * 4 * 5 * 6) + (1 * 4 * 5 * 6)],
-        # output from prim_func_1 is also still alive
-        [(1 * 4 * 5 * 6) + (1 * 4 * 5 * 6) + (1 * 4 * 5 * 6)],
-        # outputs from prim_func_1 and prim_func_2 are also still alive
-        [(1 * 4 * 5 * 6) + (1 * 4 * 5 * 6) + (1 * 4 * 5 * 6) + (1 * 4 * 5 * 6)],
-        [(1 * 4 * 5 * 6) + (1 * 4 * 5 * 6) + (1 * 4 * 5 * 6) + (1 * 4 * 5 * 18)],
-    ]
-    expected_io_annotation = (1 * 4 * 5 * 6) + (1 * 4 * 5 * 18)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_nested_branches():
-    """
-    Tests a graph with branches that also branch.
-
-             input
-            /     \
-          /        \
-    prim_func_1  prim_func_2
-                   /     \
-                  /       \
-            prim_func_3   prim_func_4
-    """
-
-    def get_generic_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
-        x = relay.nn.relu(x)
-        return _create_primitive_function(x)
-
-    ifm = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
-    a = relay.Call(get_generic_inner_func(), [ifm])
-    b = relay.Call(get_generic_inner_func(), [ifm])
-    c = relay.Call(get_generic_inner_func(), [b])
-    d = relay.Call(get_generic_inner_func(), [b])
-    out = relay.concatenate([a, c, d], axis=3)
-    mod = tvm.IRModule.from_expr(out)
-
-    expected_annotations = [
-        [(1 * 2 * 2 * 4) + (1 * 2 * 2 * 4)],
-        # output from prim_func_1 is also still alive
-        [(1 * 2 * 2 * 4) + (1 * 2 * 2 * 4) + (1 * 2 * 2 * 4)],
-        # output from prim_func_1 is also still alive
-        [(1 * 2 * 2 * 4) + (1 * 2 * 2 * 4) + (1 * 2 * 2 * 4)],
-        # outputs from prim_func_1 and prim_func_3 are also still alive
-        [(1 * 2 * 2 * 4) + (1 * 2 * 2 * 4) + (1 * 2 * 2 * 4) + (1 * 2 * 2 * 4)],
-    ]
-    expected_io_annotation = (1 * 2 * 2 * 4) + (1 * 2 * 2 * 12)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_composite_inner_function():
-    """
-    Tests the typical BYOC use case where a primitive function
-    contains a composite function.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
-        x = relay.nn.max_pool2d(x, pool_size=(2, 2), layout="NHWC")
-        x = relay.Function(relay.analysis.free_vars(x), x)
-        x = x.with_attr("Composite", "my_composite_func")
-
-        y = relay.var("y", shape=(1, 2, 2, 4), dtype="int8")
-        z = relay.Call(x, [y])
-        return _create_primitive_function(z)
-
-    ifm = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
-    x = relay.Call(get_inner_func(), [ifm])
-    mod = tvm.IRModule.from_expr(x)
-
-    expected_annotations = [
-        [(1 * 2 * 2 * 4) + (1 * 1 * 1 * 4)],
-    ]
-    expected_io_annotation = (1 * 2 * 2 * 4) + (1 * 1 * 1 * 4)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_multiple_calls_to_same_function():
-    """
-    Tests the case when there are multiple calls to the same function.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
-        x = relay.nn.max_pool2d(x)
-        x = _create_primitive_function(x)
-        return x
-
-    inner_func = get_inner_func()
-    ifm = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
-    call1 = relay.Call(inner_func, [ifm])
-    call2 = relay.Call(inner_func, [call1])
-    mod = tvm.IRModule.from_expr(call2)
-
-    expected_annotations = [[2 * (1 * 2 * 2 * 4), 2 * (1 * 2 * 2 * 4)]]
-    expected_io_annotation = 2 * (1 * 2 * 2 * 4)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_parallel_calls_to_same_function():
-    """
-    Test parallel calls to the same function.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
-        x = relay.nn.max_pool2d(x)
-        x = _create_primitive_function(x)
-        return x
-
-    inner_func = get_inner_func()
-    ifm = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
-    call1 = relay.Call(inner_func, [ifm])
-    call2 = relay.Call(inner_func, [ifm])
-    concat = relay.concatenate([call1, call2], axis=0)
-    mod = tvm.IRModule.from_expr(concat)
-
-    expected_annotations = [[2 * (1 * 2 * 2 * 4), 3 * (1 * 2 * 2 * 4)]]
-    expected_io_annotation = 3 * (1 * 2 * 2 * 4)
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_parallel_calls_with_non_ifm_input():
-    """
-    Test a graph that calls many different functions in parallel where
-    the input is not the input to the function.
-
-                    y = f(x)
-            /         |         \
-       z0 = g0(y)    ...      zi = gi(y)
-           \         |         /
-                  concat
-    """
-
-    def get_inner_func_1():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        x = relay.tanh(x)
-        x = _create_primitive_function(x)
-        return x
-
-    def get_inner_func_2():
-        x = relay.var("x", shape=(1, 4, 5, 6), dtype="int8")
-        x = relay.nn.max_pool2d(x, pool_size=(2, 2))
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 4, 5, 6), dtype="int8")
-    y = relay.Call(get_inner_func_1(), [ifm])
-    g = get_inner_func_2()
-
-    no_calls = 20
-    z = [relay.Call(g, [y]) for _ in range(0, no_calls)]
-    out = relay.concatenate(z, axis=3)
-    mod = tvm.IRModule.from_expr(out)
-
-    expected_annotations = [
-        [(1 * 4 * 5 * 6) + (1 * 4 * 5 * 6)],
-        [(1 * 4 * 5 * 6) + (1 * 4 * 4 * 5) * i for i in range(1, no_calls + 1)],
-    ]
-    expected_io_annotation = (1 * 4 * 5 * 6) + (1 * 4 * 4 * (5 * no_calls))
-    _check_used_memory_annotations(mod, expected_annotations, expected_io_annotation)
-
-
-def test_dynamic_io_tensor_not_supported():
-    """
-    Test to check dynamic IO tensor error.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
-        x = relay.nn.max_pool2d(x)
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 2, 2, relay.Any()), dtype="int8")
-    call = relay.Call(get_inner_func(), [ifm])
-    mod = tvm.IRModule.from_expr(call)
-
-    err_rgx = r"AnnotateUsedMemory does not support dynamic shapes"
-    with pytest.raises(tvm.TVMError, match=err_rgx):
-        _check_used_memory_annotations(mod, [], [])
-
-
-def test_dynamic_callsite_tensor_not_supported():
-    """
-    Test to check dynamic callsite tensor error.
-    """
-
-    def get_inner_func():
-        x = relay.var("x", shape=(relay.Any(), 2, 2, 4), dtype="int8")
-        x = relay.nn.max_pool2d(x)
-        x = _create_primitive_function(x)
-        return x
-
-    ifm = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
-    call = relay.Call(get_inner_func(), [ifm])
-    mod = tvm.IRModule.from_expr(call)
-
-    err_rgx = r"AnnotateUsedMemory does not support dynamic shapes"
-    with pytest.raises(tvm.TVMError, match=err_rgx):
-        _check_used_memory_annotations(mod, [], [])
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
deleted file mode 100644
index 270de831e554..000000000000
--- a/tests/python/relay/test_vm.py
+++ /dev/null
@@ -1,1565 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from unittest.mock import patch
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import IRModule, relay, rpc, runtime
-from tvm.contrib import utils
-from tvm.relay import testing
-from tvm.relay.backend import vm
-from tvm.relay.backend.vm import VMCompiler
-from tvm.relay.dataflow_pattern import is_op, wildcard
-from tvm.relay.loops import while_loop
-from tvm.relay.prelude import Prelude
-from tvm.relay.scope_builder import ScopeBuilder
-from tvm.relay.testing import mlp
-from tvm.relay.transform import InferType
-
-
-def check_result(target, dev, args, expected_result, mod):
-    """
-    Check that evaluating `expr` applied to the arguments produces
-    `result` on Relay VM.
-
-    Parameters
-    ----------
-    args: list of Expr
-        The arguments to supply the expr.
-
-    expected_result:
-        The expected result of running the expression.
-    """
-    rts_result = relay.create_executor("vm", device=dev, target=target, mod=mod).evaluate()(*args)
-    tvm.testing.assert_allclose(expected_result, rts_result.numpy())
-
-
-def veval(f, *args, device=tvm.cpu(), target="llvm"):
-    if isinstance(f, relay.Expr):
-        mod = tvm.IRModule()
-        mod["main"] = f
-    else:
-        assert isinstance(f, tvm.IRModule), "expected expression or module"
-        mod = f
-    exe = relay.vm.compile(mod, target)
-    vm = runtime.vm.VirtualMachine(exe, device)
-    return vm.invoke("main", *args)
-
-
-def vmobj_to_list(o):
-    if isinstance(o, tvm.nd.NDArray):
-        return [o.numpy().tolist()]
-    elif isinstance(o, tvm.runtime.container.ADT):
-        result = []
-        for f in o:
-            result.extend(vmobj_to_list(f))
-        return result
-    else:
-        raise RuntimeError("Unknown object type: %s" % type(o))
-
-
-def test_split(target, dev):
-    x = relay.var("x", shape=(12,))
-    y = relay.split(x, 3, axis=0).astuple()
-    f = relay.Function([x], y)
-
-    x_data = np.random.rand(
-        12,
-    ).astype("float32")
-    ref_res = np.split(x_data, 3, axis=0)
-    res = veval(f, x_data, device=dev, target=target)
-    for i in range(3):
-        tvm.testing.assert_allclose(res[i].numpy(), ref_res[i])
-
-
-def test_split_no_fuse(target, dev):
-    x = relay.var("x", shape=(12,))
-    y = relay.split(x, 3, axis=0).astuple()
-    z = relay.concatenate([relay.TupleGetItem(y, 0)], axis=0)
-    z = relay.annotation.stop_fusion(z)
-    f = relay.Function([x], z)
-    x_data = np.random.rand(
-        12,
-    ).astype("float32")
-
-    res = veval(f, x_data, device=dev, target=target)
-    tvm.testing.assert_allclose(res.numpy(), np.split(x_data, 3, axis=0)[0])
-
-
-def test_id(target, dev):
-    x = relay.var("x", shape=(10, 10), dtype="float64")
-    f = relay.Function([x], x)
-    x_data = np.random.rand(10, 10).astype("float64")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    check_result(target, dev, [x_data], x_data, mod)
-
-
-def test_op(target, dev):
-    x = relay.var("x", shape=(10, 10))
-    f = relay.Function([x], x + x)
-    x_data = np.random.rand(10, 10).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    check_result(target, dev, [x_data], 2 * x_data, mod)
-
-
-def any(x):
-    x = relay.op.nn.batch_flatten(x)
-    return relay.op.min(x, axis=[0, 1])
-
-
-@tvm.testing.known_failing_targets("vulkan")
-def test_cond(target, dev):
-    x = relay.var("x", shape=(10, 10))
-    y = relay.var("y", shape=(10, 10))
-    # f = relay.Function([x, y], relay.op.equal(x, y))
-    f = relay.Function([x, y], any(relay.op.equal(x, y)))
-    x_data = np.random.rand(10, 10).astype("float32")
-    y_data = np.random.rand(10, 10).astype("float32")
-
-    mod = tvm.IRModule()
-    mod["main"] = f
-    # same
-    check_result(target, dev, [x_data, x_data], True, mod)
-
-    # diff
-    check_result(target, dev, [x_data, y_data], False, mod)
-
-
-@tvm.testing.known_failing_targets("vulkan")
-def test_simple_if(target, dev):
-    x = relay.var("x", shape=(10, 10))
-    y = relay.var("y", shape=(10, 10))
-    f = relay.Function([x, y], relay.If(any(relay.op.equal(x, y)), x, y))
-    x_data = np.random.rand(10, 10).astype("float32")
-    y_data = np.random.rand(10, 10).astype("float32")
-
-    mod = tvm.IRModule()
-    mod["main"] = f
-    # same
-    check_result(target, dev, [x_data, x_data], x_data, mod)
-
-    # diff
-    check_result(target, dev, [x_data, y_data], y_data, mod)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_multiple_ifs(target, dev):
-    mod = tvm.IRModule({})
-    b = relay.var("b")
-    v0 = relay.var("v0")
-    v1 = relay.var("v1")
-    v2 = relay.var("v2")
-    v3 = relay.var("v3")
-    out = relay.Tuple([v2, v3])
-    out = relay.Let(v3, relay.If(b, v1, v0), out)
-    out = relay.Let(v2, relay.If(b, v0, v1), out)
-    out = relay.Let(v1, relay.Tuple([relay.const(1)]), out)
-    out = relay.Let(v0, relay.Tuple([relay.const(0)]), out)
-    fn = relay.Function([b], out)
-    mod["main"] = fn
-    func = relay.create_executor(device=dev, mod=mod, kind="vm").evaluate()
-    res = vmobj_to_list(func(False))
-    assert res == [1, 0]
-
-
-def test_unused_function(target, dev):
-    cond = relay.const(True)
-    mod = tvm.IRModule()
-    then_name = relay.GlobalVar("times_2")
-    # define unused function
-    else_name = relay.GlobalVar("times_3")
-    t1 = relay.TensorType((2, 2), dtype="float32")
-    x1 = relay.var("x1", t1, dtype="float32")
-    x2 = relay.var("x2", t1, dtype="float32")
-    f2 = relay.multiply(x1, relay.const(2.0))
-    f3 = relay.multiply(x2, relay.const(3.0))
-    mod[then_name] = relay.Function([x1], f2)
-    mod[else_name] = relay.Function([x2], f3)
-    mod = InferType()(mod)
-    x3 = relay.var("x3", t1, dtype="float32")
-    # put unused function in else branch
-    f = relay.If(cond, then_name(x3), else_name(x3))
-    mod["main"] = relay.Function([x3], f)
-    x_data = np.random.rand(2, 2).astype("float32")
-    y_data = x_data * 2
-
-    check_result(target, dev, [x_data], y_data, mod)
-
-
-def test_simple_call(target, dev):
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    sb = ScopeBuilder()
-    sb.ret(i)
-    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-    mod[sum_up] = func
-    i_data = np.array(0, dtype="int32")
-    iarg = relay.var("iarg", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg], sum_up(iarg))
-    check_result(target, dev, [i_data], i_data, mod)
-
-
-def test_count_loop(target, dev):
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    sb = ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))):
-        sb.ret(i)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, dtype="int32"))
-        rec_call = relay.Call(sum_up, [one_less])
-        sb.ret(relay.add(rec_call, i))
-    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], "int32"))
-    mod[sum_up] = func
-    i_data = np.array(0, dtype="int32")
-    iarg = relay.var("i", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg], sum_up(iarg))
-    result = veval(mod, i_data, device=dev, target=target)
-    tvm.testing.assert_allclose(result.numpy(), i_data)
-    check_result(target, dev, [i_data], i_data, mod)
-
-
-def test_sum_loop(target, dev):
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    accum = relay.var("accum", shape=[], dtype="int32")
-    sb = ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, "int32"))):
-        sb.ret(accum)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, "int32"))
-        new_accum = relay.add(accum, i)
-        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
-    func = relay.Function([i, accum], sb.get())
-    mod[sum_up] = func
-    mod = relay.transform.InferType()(mod)
-    loop_bound = 0
-    i_data = np.array(loop_bound, dtype="int32")
-    accum_data = np.array(0, dtype="int32")
-    iarg = relay.var("i", shape=[], dtype="int32")
-    aarg = relay.var("accum", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg))
-    check_result(target, dev, [i_data, accum_data], sum(range(1, loop_bound + 1)), mod)
-
-
-def test_tuple_fst(target, dev):
-    ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))])
-    tup = relay.var("tup", type_annotation=ttype)
-    f = relay.Function([tup], relay.TupleGetItem(tup, 0))
-    i_data = np.random.rand(41).astype("float32")
-    j_data = np.random.rand(10).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    check_result(target, dev, [(i_data, j_data)], i_data, mod)
-
-
-def test_tuple_second(target, dev):
-    ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))])
-    tup = relay.var("tup", type_annotation=ttype)
-    f = relay.Function([tup], relay.TupleGetItem(tup, 1))
-    i_data = np.random.rand(41).astype("float32")
-    j_data = np.random.rand(10).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    check_result(target, dev, [(i_data, j_data)], j_data, mod)
-
-
-def test_list_constructor(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    l, cons, nil = mod.get_type("List")
-
-    one2 = cons(relay.const(1), nil())
-    one3 = cons(relay.const(2), one2)
-    one4 = cons(relay.const(3), one3)
-    f = relay.Function([], one4)
-
-    mod["main"] = f
-
-    result = veval(mod, device=dev, target=target)
-    assert len(result) == 2
-    assert len(result[1]) == 2
-
-    obj = vmobj_to_list(result)
-    tvm.testing.assert_allclose(obj, np.array([3, 2, 1]))
-
-
-def test_let_tensor(target, dev):
-    sb = relay.ScopeBuilder()
-    shape = (1,)
-    x = relay.var("x", shape=shape, dtype="float32")
-    x1 = relay.var("x1", shape=shape, dtype="float32")
-
-    x1 = sb.let(x1, x)
-    xplusone = x1 + relay.const(42.0, "float32")
-    sb.ret(xplusone)
-    body = sb.get()
-
-    f = relay.Function([x], body)
-
-    x_data = np.random.rand(*shape).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    check_result(target, dev, [x_data], x_data + 42.0, mod)
-
-
-def test_let_scalar(target, dev):
-    sb = relay.ScopeBuilder()
-
-    x = relay.var("x", "float32")
-    x1 = sb.let("x1", x)
-    xplusone = x1 + relay.const(42.0, "float32")
-    sb.ret(xplusone)
-    body = sb.get()
-
-    f = relay.Function([x], body)
-
-    x_data = np.array(np.random.rand()).astype("float32")
-    mod = tvm.IRModule()
-    mod["main"] = f
-    check_result(target, dev, [x_data], x_data + 42.0, mod)
-
-
-def test_compose(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    compose = p.compose
-
-    # add_one = fun x -> x + 1
-    sb = relay.ScopeBuilder()
-    x = relay.var("x", "float32")
-    x1 = sb.let("x1", x)
-    xplusone = x1 + relay.const(1.0, "float32")
-    sb.ret(xplusone)
-    body = sb.get()
-    add_one = relay.GlobalVar("add_one")
-    add_one_func = relay.Function([x], body)
-
-    # add_two = compose(add_one, add_one)
-    sb = relay.ScopeBuilder()
-    y = relay.var("y", "float32")
-    add_two_func = sb.let("add_two", compose(add_one_func, add_one_func))
-    add_two_res = add_two_func(y)
-    sb.ret(add_two_res)
-    add_two_body = sb.get()
-
-    mod[add_one] = add_one_func
-
-    f = relay.Function([y], add_two_body)
-    mod["main"] = f
-
-    x_data = np.array(np.random.rand()).astype("float32")
-    result = veval(mod, [x_data], device=dev, target=target)
-    tvm.testing.assert_allclose(result.numpy(), x_data + 2.0)
-
-
-def test_list_hd(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    l, cons, nil = mod.get_type("List")
-    hd = mod.get_global_var("hd")
-
-    one2 = cons(relay.const(1), nil())
-    one3 = cons(relay.const(2), one2)
-    one4 = cons(relay.const(3), one3)
-    three = hd(one4)
-    f = relay.Function([], three)
-
-    mod["main"] = f
-
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(result.numpy(), 3)
-
-
-def test_list_tl_empty_list(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    l, cons, nil = mod.get_type("List")
-    tl = mod.get_global_var("tl")
-
-    f = relay.Function([], tl(nil()))
-
-    mod["main"] = f
-
-    with pytest.raises(tvm.error.TVMError):
-        result = veval(mod, device=dev, target=target)
-
-
-def test_list_tl(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    l, cons, nil = mod.get_type("List")
-    tl = mod.get_global_var("tl")
-
-    one2 = cons(relay.const(1), nil())
-    one3 = cons(relay.const(2), one2)
-    one4 = cons(relay.const(3), one3)
-
-    f = relay.Function([], tl(one4))
-
-    mod["main"] = f
-
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(vmobj_to_list(result), np.array([2, 1]))
-
-
-def test_list_nth(target, dev):
-    expected = list(range(10))
-
-    for i in range(len(expected)):
-        mod = tvm.IRModule()
-        p = Prelude(mod)
-
-        _, cons, nil = mod.get_type("List")
-        nth = mod.get_global_var("nth")
-
-        l = nil()
-        for i in reversed(expected):
-            l = cons(relay.const(i), l)
-
-        f = relay.Function([], nth(l, relay.const(i)))
-        mod["main"] = f
-        result = veval(mod, device=dev, target=target)
-        tvm.testing.assert_allclose(result.numpy(), expected[i])
-
-
-def test_list_update(target, dev):
-    expected = list(range(10))
-
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    _, cons, nil = mod.get_type("List")
-    update = mod.get_global_var("update")
-
-    l = nil()
-    # create zero initialized list
-    for i in range(len(expected)):
-        l = cons(relay.const(0), l)
-
-    # set value
-    for i, v in enumerate(expected):
-        l = update(l, relay.const(i), relay.const(v))
-
-    f = relay.Function([], l)
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(vmobj_to_list(result), np.array(expected))
-
-
-def test_list_length(target, dev):
-    expected = list(range(10))
-
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    _, cons, nil = mod.get_type("List")
-    length = mod.get_global_var("length")
-
-    l = nil()
-    # create zero initialized list
-    for _ in range(len(expected)):
-        l = cons(relay.const(0), l)
-
-    l = length(l)
-
-    f = relay.Function([], l)
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(result.numpy(), 10)
-
-
-def test_list_map(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    x = relay.var("x", "int32")
-    add_one_func = relay.Function([x], relay.const(1) + x)
-
-    _, cons, nil = mod.get_type("List")
-    map = mod.get_global_var("map")
-
-    l = cons(relay.const(2), cons(relay.const(1), nil()))
-
-    f = relay.Function([], map(add_one_func, l))
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 2]))
-
-
-def test_list_foldl(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    _, cons, nil = mod.get_type("List")
-    foldl = mod.get_global_var("foldl")
-
-    x = relay.var("x")
-    y = relay.var("y")
-    rev_dup_func = relay.Function([y, x], cons(x, cons(x, y)))
-
-    l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil())))
-    f = relay.Function([], foldl(rev_dup_func, nil(), l))
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 3, 2, 2, 1, 1]))
-
-
-def test_list_foldr(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    _, cons, nil = mod.get_type("List")
-    foldr = mod.get_global_var("foldr")
-
-    x = relay.var("x")
-    y = relay.var("y")
-    identity_func = relay.Function([x, y], cons(x, y))
-
-    l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil())))
-    f = relay.Function([], foldr(identity_func, nil(), l))
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(vmobj_to_list(result), np.array([1, 2, 3]))
-
-
-def test_list_sum(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    _, cons, nil = mod.get_type("List")
-    sum = mod.get_global_var("sum")
-
-    l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil())))
-    f = relay.Function([], sum(l))
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(result.numpy(), 6)
-
-
-def test_list_filter(target, dev):
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    _, cons, nil = mod.get_type("List")
-    filter = mod.get_global_var("filter")
-
-    x = relay.var("x", "int32")
-    greater_than_one = relay.Function([x], x > relay.const(1))
-    l = cons(
-        relay.const(1),
-        cons(
-            relay.const(3), cons(relay.const(1), cons(relay.const(5), cons(relay.const(1), nil())))
-        ),
-    )
-    f = relay.Function([], filter(greater_than_one, l))
-    mod["main"] = f
-    result = veval(mod, device=dev, target=target)
-    tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 5]))
-
-
-def test_closure(target, dev):
-    x = relay.var("x", shape=())
-    y = relay.var("y", shape=())
-    f = relay.Function([x], x + y)
-    ff = relay.Function([y], f)
-    clo = ff(relay.const(1.0))
-    main = clo(relay.const(2.0))
-    res = veval(main, device=dev, target=target)
-    tvm.testing.assert_allclose(res.numpy(), 3.0)
-
-
-def test_add_op_scalar(target, dev):
-    """
-    test_add_op_scalar:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=())  # Default to float32
-    y = relay.var("y", shape=())  # Default to float32
-    func = relay.Function([x, y], relay.op.add(x, y))
-    x_y_data = [
-        (np.array(10.0, dtype="float32"), np.array(1.0, dtype="float32")),
-        (np.float32(10.0), np.float32(1.0)),
-        (10.0, 1.0),
-    ]
-    for (x_data, y_data) in x_y_data:
-        mod["main"] = func
-        check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
-
-
-def test_add_op_scalar_float16(target, dev):
-    """
-    test_add_op_scalar_float16:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=(), dtype="float16")  # Default to float16
-    y = relay.var("y", shape=(), dtype="float16")  # Default to float16
-    func = relay.Function([x, y], relay.op.add(x, y))
-    x_y_data = [
-        (np.array(10.0, dtype="float16"), np.array(1.0, dtype="float16")),
-        (np.float16(10.0), np.float16(1.0)),
-    ]
-    for (x_data, y_data) in x_y_data:
-        mod["main"] = func
-        check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
-
-
-def test_add_op_scalar_int(target, dev):
-    """
-    test_add_op_scalar_int:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=(), dtype="int32")
-    y = relay.var("y", shape=(), dtype="int32")
-    func = relay.Function([x, y], relay.op.add(x, y))
-    x_y_data = [
-        (np.array(10.0, dtype="int32"), np.array(1.0, dtype="int32")),
-        (np.int32(10), np.int32(1)),
-        (10, 1),
-    ]
-    for (x_data, y_data) in x_y_data:
-        mod["main"] = func
-        check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
-
-
-def test_add_op_tensor(target, dev):
-    """
-    test_add_op_tensor:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=(10, 5))
-    y = relay.var("y", shape=(10, 5))
-    func = relay.Function([x, y], relay.op.add(x, y))
-    x_data = np.random.rand(10, 5).astype("float32")
-    y_data = np.random.rand(10, 5).astype("float32")
-    mod["main"] = func
-    check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
-
-
-def test_add_op_broadcast(target, dev):
-    """
-    test_add_op_broadcast:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    mod = tvm.IRModule()
-    x = relay.var("x", shape=(10, 5))
-    y = relay.var("y", shape=(1, 5))
-    func = relay.Function([x, y], relay.op.add(x, y))
-    x_data = np.random.rand(10, 5).astype("float32")
-    y_data = np.random.rand(1, 5).astype("float32")
-    mod["main"] = func
-    check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
-
-
-def test_vm_optimize_dynamic():
-    dtype = "float32"
-    x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype=dtype)
-    y = relay.var("y", shape=(relay.Any(), relay.Any()), dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], relay.add(x, y))
-    comp = relay.vm.VMCompiler()
-    opt_mod, _ = comp.optimize(mod, target="llvm")
-    assert "shape_func" in opt_mod.astext(False)
-
-
-def test_vm_optimize():
-    mod, params = testing.synthetic.get_workload()
-    comp = relay.vm.VMCompiler()
-    opt_mod, _ = comp.optimize(mod, target="llvm", params=params)
-
-    free_vars = relay.analysis.free_vars(opt_mod["main"].body)
-    # Paremeters should all be bound, so the only free var is data
-    assert len(free_vars) == 1
-
-
-def test_loop_free_var(target, dev):
-    x = relay.var("x", shape=(), dtype="int32")
-    i = relay.var("i", shape=(), dtype="int32")
-    s = relay.var("s", shape=(), dtype="int32")
-
-    def cond(i, _):
-        return i < relay.const(10, dtype="int32")
-
-    def body_no_free_var(i, acc):
-        incr = relay.const(1, "int32")
-        return i + incr, acc + i
-
-    def body_with_free_var(i, acc):
-        incr = relay.const(1, "int32")
-        return i + incr, acc + x
-
-    for args, body, expected in zip([[], [1]], [body_no_free_var, body_with_free_var], [45, 10]):
-        loop = while_loop(cond, [i, s], body)
-        tup = loop(relay.const(0, dtype="int32"), relay.zeros(shape=(), dtype="int32"))
-        ret = relay.TupleGetItem(tup, 1)
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function(relay.analysis.free_vars(ret), ret)
-        check_result(target, dev, args, expected, mod)
-
-
-def test_vm_reshape_tensor(target, dev):
-    x_np = np.random.uniform(size=(8, 16)).astype("float32")
-    x = relay.var("x", shape=(8, 16), dtype="float32")
-    y = relay.reshape(x, [-1, 4, 8])
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y)
-    with tvm.transform.PassContext(opt_level=3):
-        exec = relay.vm.compile(mod, "llvm")
-    assert "reshape_tensor" in exec.bytecode
-    check_result(target, dev, [x_np], x_np.reshape([4, 4, 8]), mod)
-
-    x = relay.var("x", shape=(8, 16), dtype="float32")
-    y = relay.reshape(x, [16, -1])
-    y = relay.reverse_reshape(y, [-1, 4, 0])
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y)
-    with tvm.transform.PassContext(opt_level=3):
-        exec = relay.vm.compile(mod, "llvm")
-    assert exec.bytecode.count("reshape_tensor") == 1
-    check_result(target, dev, [x_np], x_np.reshape([4, 4, 8]), mod)
-
-    # reshape with symbolic/any shape
-    for n in [tvm.tir.Any(), tvm.te.size_var("n")]:
-        x = relay.var("x", shape=(n, 16), dtype="float32")
-        y = relay.reshape(x, [-1, 4])
-        y = relay.reshape(y, [0, 2, -1])
-        mod = tvm.IRModule()
-        mod["main"] = relay.Function([x], y)
-        with tvm.transform.PassContext(opt_level=3):
-            exec = relay.vm.compile(mod, "llvm")
-        assert exec.bytecode.count("reshape_tensor") == 1
-        check_result(target, dev, [x_np], x_np.reshape([32, 2, 2]), mod)
-
-    # dyn.reshape
-    x = relay.var("x", shape=(8, 16), dtype="float32")
-    y = relay.var("y", shape=(3,), dtype="int32")
-    z = relay.reshape(x, [-1, 4, 8])
-    z = relay.reshape(z, y)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], z)
-    with tvm.transform.PassContext(opt_level=3):
-        exec = relay.vm.compile(mod, "llvm")
-    assert exec.bytecode.count("reshape_tensor") == 2
-    assert "reshape_tensor" in exec.bytecode
-    y_np = np.array([8, 2, 8]).astype("int32")
-    check_result(target, dev, [x_np, y_np], x_np.reshape([8, 2, 8]), mod)
-
-
-def test_vm_reshape_and_copy(target, dev):
-    """Make sure the compiler notices the reshape result shape is a literal and can use
-    the immediate-mode alloc_tensor instruction instead of alloc_tensor_reg."""
-    x_np = np.random.uniform(size=(1, 1)).astype("float32")
-    x = relay.var("x", shape=(1, 1), dtype="float32")
-    mod = tvm.IRModule.from_expr(relay.Function([x], relay.copy(relay.reshape(x, [0, 1]))))
-    with tvm.transform.PassContext(opt_level=3):
-        exec = relay.vm.compile(mod, "llvm")
-    assert "alloc_tensor" in exec.bytecode
-    assert not "alloc_tensor_reg" in exec.bytecode
-    check_result(target, dev, [x_np], x_np.reshape([1, 1]), mod)
-
-
-def test_vm_reshape_tuple(target, dev, x_shape=(1, 4, 2), y_shape=(1, 2, 10)):
-    tup = relay.var(
-        "tup",
-        type_annotation=relay.TupleType([relay.TensorType(x_shape), relay.TensorType(y_shape)]),
-    )
-    out = relay.reshape(relay.TupleGetItem(tup, 0), (1, -1))
-    f = relay.Function([tup], out)
-
-    x_data = np.random.uniform(size=x_shape).astype("float32")
-    y_data = np.random.uniform(size=y_shape).astype("float32")
-
-    res = veval(f, (x_data, y_data), device=dev, target=target)
-    tvm.testing.assert_allclose(res.numpy(), np.reshape(x_data, (1, -1)))
-
-
-def test_constant_shape_with_external_codegen():
-    @tvm.register_func("relay.ext.test1")
-    def relay_ext_test(func):
-        return None
-
-    mod = tvm.IRModule()
-    shape = (relay.Any(), 25)
-    dtype = "float32"
-
-    # external function
-    x = relay.var("x", shape=shape, dtype=dtype)
-    weight = relay.const(np.random.rand(5, 25).astype("float32"), dtype="float32")
-    out = relay.nn.dense(x, weight)
-    f1 = relay.Function([x], out)
-    f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    f1 = f1.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-    f1 = f1.with_attr("Compiler", "test1")
-    f1 = f1.with_attr("global_symbol", "f1")
-    glb_f1 = relay.GlobalVar("f1")
-    mod[glb_f1] = f1
-    mod = relay.transform.InferType()(mod)
-
-    # Main function
-    x = relay.var("x", shape=shape, dtype=dtype)
-    mod["main"] = relay.Function([x], glb_f1(x))
-    comp = relay.vm.VMCompiler()
-    opt_mod, _ = comp.optimize(mod, target="llvm")
-    assert "shape_func" in opt_mod.astext(False)
-
-
-def prepare_vm_model(path, tensor_shape):
-    """
-    Virtual Machine is compiled for simple topology and
-    exported as library to given path
-    """
-    target = tvm.target.Target("llvm --host=llvm")
-
-    # Build a IRModule.
-    x = relay.var("x", shape=tensor_shape)
-    f = relay.Function([x], x + x)
-    mod = IRModule.from_expr(f)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-
-    # Export to Disk
-    vm_exec.mod.export_library(path)
-
-
-def test_vm_rpc():
-    """
-    This test checks to make sure you can export a VMExecutable,
-    upload it to a remote machine using RPC and then execute it
-    on the other machine.
-    """
-    # Shape for input and output tensors
-    shape = (10, 1)
-
-    # Export to Disk
-    temp = utils.tempdir()
-    path = temp.relpath("vm_library.so")
-    prepare_vm_model(path, shape)
-
-    # Use local rpc server for testing.
-    # Server must use popen so it doesn't inherit the current process state. It
-    # will crash otherwise.
-    def check_remote(server):
-        remote = rpc.connect(server.host, server.port, session_timeout=10)
-
-        # Upload the serialized Executable.
-        remote.upload(path)
-        # Get a handle to remote Executable.
-        rexec = remote.load_module("vm_library.so")
-
-        device = remote.cpu()
-        # Build a VM out of the executable and context.
-        vm_factory = runtime.vm.VirtualMachine(rexec, device)
-        np_input = np.random.uniform(size=shape).astype("float32")
-        input_tensor = tvm.nd.array(np_input, device)
-        # Invoke its "main" function.
-        out = vm_factory.invoke("main", input_tensor)
-        # Check the result.
-        np.testing.assert_allclose(out.numpy(), np_input + np_input)
-
-    check_remote(rpc.Server("127.0.0.1"))
-
-
-def test_vm_invoke_with_outputs_rpc():
-    """
-    This test checks to make sure you can export a VMExecutable,
-    upload it to a remote machine using RPC and then execute it
-    on the other machine with preallocated outputs.
-    """
-    # Shape for input and output tensors
-    shape = (3, 2)
-
-    # Export to Disk
-    temp = utils.tempdir()
-    path = temp.relpath("vm_library.so")
-    prepare_vm_model(path, shape)
-
-    # Use local rpc server for testing.
-    # Server must use popen so it doesn't inherit the current process state. It
-    # will crash otherwise.
-    def check_remote_invoke_with_outputs(server):
-        remote = rpc.connect(server.host, server.port, session_timeout=10)
-
-        # Upload the serialized Executable.
-        remote.upload(path)
-        # Get a handle to remote Executable.
-        rexec = remote.load_module("vm_library.so")
-
-        device = remote.cpu()
-        # Build a VM out of the executable and context.
-        vm_factory = runtime.vm.VirtualMachine(rexec, device)
-        np_input = np.random.uniform(size=shape).astype("float32")
-        input_tensor = tvm.nd.array(np_input, device)
-        np_output = np.empty(shape, dtype="float32")
-        output_tensor = tvm.nd.array(np_output, device)
-        # Invoke its "main" function.
-        vm_factory.invoke_with_outputs(
-            "main", input_args={"x": input_tensor}, output_args=[output_tensor]
-        )
-        # Check the result.
-        np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
-
-    check_remote_invoke_with_outputs(rpc.Server("127.0.0.1"))
-
-
-def test_vm_invoke_with_outputs():
-    target = tvm.target.Target("llvm")
-    shape = (3, 2)
-
-    # Build a IRModule.
-    x = relay.var("x", shape=shape)
-    f = relay.Function([x], x + x)
-    mod = IRModule.from_expr(f)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu())
-    np_input = np.random.uniform(size=shape).astype("float32")
-    input_tensor = tvm.nd.array(np_input)
-    np_output = np.empty(shape, dtype="float32")
-    output_tensor = tvm.nd.array(np_output)
-    # Invoke
-    vm_factory.invoke_with_outputs(
-        "main", input_args={"x": input_tensor}, output_args=[output_tensor]
-    )
-    # Check the result.
-    np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
-
-
-def test_get_output_single():
-    target = tvm.target.Target("llvm")
-
-    # Build a IRModule.
-    x = relay.var("x", shape=(10,))
-    f = relay.Function([x], x + x)
-    mod = IRModule.from_expr(f)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu())
-    inp = np.ones(10, dtype="float32")
-    vm_factory.invoke_stateful("main", inp)
-    outputs = vm_factory.get_outputs()
-    assert len(outputs) == 1
-    np.testing.assert_allclose(outputs[0].numpy(), inp + inp)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_get_output_multiple(target, dev):
-    # Build a IRModule.
-    x = relay.var("x", shape=(10,))
-    f = relay.Function([x], relay.Tuple([x + x, x]))
-    mod = IRModule.from_expr(f)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    vm_factory = runtime.vm.VirtualMachine(vm_exec, dev)
-    inp = np.ones(10, dtype="float32")
-    vm_factory.invoke_stateful("main", inp)
-    outputs = vm_factory.get_outputs()
-    assert len(outputs) == 2
-    np.testing.assert_allclose(outputs[0].numpy(), inp + inp)
-    np.testing.assert_allclose(outputs[1].numpy(), inp)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_get_input_index(target, dev):
-    # Build a IRModule.
-    data_0, data_1 = ["d1", "d2"]
-    x, y = [relay.var(c, shape=(10,)) for c in [data_0, data_1]]
-    f = relay.Function([x, y], x + y)
-    mod = IRModule.from_expr(f)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    vm_factory = runtime.vm.VirtualMachine(vm_exec, dev)
-    assert vm_factory.get_input_index(data_1) == 1
-    assert vm_factory.get_input_index(data_0) == 0
-    assert vm_factory.get_input_index("invalid") == -1
-
-
-def get_one_input_relay_mod(tensor_type, shape, data_name):
-    x = relay.var(data_name, shape=shape, dtype=tensor_type)
-    y = relay.exp(x)
-    f = relay.Function([x], y)
-    return IRModule.from_expr(f)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_one_set_input(target, dev):
-    dtype = "float32"
-    in_shape = [1, 2, 3, 3]
-    in_data_name_0 = "d0"
-
-    mod = get_one_input_relay_mod(dtype, in_shape, in_data_name_0)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    exe = runtime.vm.VirtualMachine(vm_exec, dev)
-
-    data0_core = np.random.uniform(size=in_shape).astype(dtype)
-    data0 = tvm.nd.array(data0_core)
-    ref_res_core = np.exp(data0_core)
-    ref_res = tvm.nd.array(ref_res_core)
-
-    exe.set_input("main", data0)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-    data_dict = {in_data_name_0: data0}
-    exe.set_input("main", **data_dict)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-
-def get_multiple_input_relay_mod(tensor_type, shape, data_name0, data_name1):
-    x, y = [relay.var(c, shape=shape, dtype=tensor_type) for c in [data_name0, data_name1]]
-    f = relay.Function([x, y], x + y)
-    return IRModule.from_expr(f)
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_multiple_set_input(target, dev):
-    dtype = "float32"
-    in_shape = [1, 2, 3, 3]
-    in_data_name_0 = "d0"
-    in_data_name_1 = "d1"
-
-    mod = get_multiple_input_relay_mod(dtype, in_shape, in_data_name_0, in_data_name_1)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    exe = runtime.vm.VirtualMachine(vm_exec, dev)
-
-    data0_core = np.random.uniform(size=in_shape).astype(dtype)
-    data0 = tvm.nd.array(data0_core)
-    data1_core = np.random.uniform(size=in_shape).astype(dtype)
-    data1 = tvm.nd.array(data1_core)
-    ref_res_core = data0_core + data1_core
-    ref_res = tvm.nd.array(ref_res_core)
-
-    exe.set_input("main", data0, data1)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-    data_dict = {in_data_name_1: data1, in_data_name_0: data0}
-    exe.set_input("main", **data_dict)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_one_set_one_input(target, dev):
-    dtype = "float32"
-    in_shape = [1, 2, 3, 3]
-    in_data_name_0 = "d0"
-
-    mod = get_one_input_relay_mod(dtype, in_shape, in_data_name_0)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    exe = runtime.vm.VirtualMachine(vm_exec, dev)
-
-    data0_core = np.random.uniform(size=in_shape).astype(dtype)
-    data0 = tvm.nd.array(data0_core)
-    ref_res_core = np.exp(data0_core)
-    ref_res = tvm.nd.array(ref_res_core)
-
-    exe.set_one_input("main", 0, data0)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-    exe.set_one_input("main", in_data_name_0, data0)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-    data_dict = {in_data_name_0: data0}
-    exe.set_one_input("main", **data_dict)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_multiple_set_one_input(target, dev):
-    dtype = "float32"
-    in_shape = [1, 2, 3, 3]
-    in_data_name_0 = "d0"
-    in_data_name_1 = "d1"
-
-    mod = get_multiple_input_relay_mod(dtype, in_shape, in_data_name_0, in_data_name_1)
-
-    # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target)
-    exe = runtime.vm.VirtualMachine(vm_exec, dev)
-
-    data0_core = np.random.uniform(size=in_shape).astype(dtype)
-    data0 = tvm.nd.array(data0_core)
-    data1_core = np.random.uniform(size=in_shape).astype(dtype)
-    data1 = tvm.nd.array(data1_core)
-    ref_res_core = data0_core + data1_core
-    ref_res = tvm.nd.array(ref_res_core)
-
-    exe.set_one_input("main", 1, data1)
-    exe.set_one_input("main", 0, data0)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-    exe.set_one_input("main", in_data_name_1, data1)
-    exe.set_one_input("main", in_data_name_0, data0)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-    data_dict = {in_data_name_1: data1}
-    exe.set_one_input("main", **data_dict)
-    data_dict = {in_data_name_0: data0}
-    exe.set_one_input("main", **data_dict)
-    output = exe.invoke("main")
-    assert output.dtype == ref_res.dtype
-    tvm.testing.assert_allclose(ref_res_core, output.numpy())
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_benchmark(target, dev):
-    mod, params = mlp.get_workload(1)
-    lib = vm.compile(mod, target=target, params=params)
-    exe = runtime.vm.VirtualMachine(lib, tvm.cpu())
-    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
-    result = exe.benchmark(tvm.cpu(), data, func_name="main", repeat=2, number=1)
-    assert result.mean == result.median
-    assert result.mean > 0
-    assert len(result.results) == 2
-
-    with patch.object(
-        tvm.runtime.module.Module,
-        "time_evaluator",
-        return_value=lambda x: tvm.runtime.module.BenchmarkResult([1, 2, 2, 5]),
-    ) as method:
-        result = exe.benchmark(dev, data, func_name="main", repeat=2, number=1)
-        assert result.mean == 2.5
-        assert result.median == 2.0
-        assert result.max == 5
-        assert result.min == 1
-        assert result.std == 1.5
-
-
-def test_benchmark_end_to_end(target, dev):
-    mod, params = mlp.get_workload(1)
-    lib = vm.compile(mod, target=target, params=params)
-    exe = runtime.vm.VirtualMachine(lib, dev)
-    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
-    result = exe.benchmark(dev, data, func_name="main", repeat=2, number=1, end_to_end=True)
-    assert result.mean > 0
-
-
-@tvm.testing.requires_cuda
-def test_benchmark_end_to_end_rpc():
-    server = rpc.Server("127.0.0.1")
-    remote = rpc.connect(server.host, server.port)
-
-    mod, params = mlp.get_workload(1)
-    lib = vm.compile(mod, target="cuda", params=params)
-
-    temp = utils.tempdir()
-    path = temp.relpath("vm_library.so")
-    lib.mod.export_library(path)
-    remote.upload(path)
-    rlib = remote.load_module("vm_library.so")
-
-    exe = runtime.vm.VirtualMachine(rlib, remote.device("cuda"))
-    data = tvm.nd.array(
-        np.random.rand(1, 1, 28, 28).astype("float32"), device=remote.device("cuda")
-    )
-    result = exe.benchmark(
-        remote.device("cuda"), data=data, func_name="main", repeat=2, number=1, end_to_end=True
-    )
-    assert result.mean > 0
-
-
-def test_shape_func_nested_function():
-    @tvm.register_func("relay.ext.test2")
-    def relay_ext_test(func):
-        return None
-
-    data_shape = (relay.Any(), 16)
-    weight_shape = (relay.Any(), 16)
-
-    dense = relay.nn.dense(
-        relay.var("data", shape=data_shape), relay.var("weight", shape=weight_shape)
-    )
-    mod = tvm.IRModule.from_expr(dense)
-
-    patterns = [("test.dense", is_op("nn.dense")(wildcard(), wildcard()))]
-    passes = tvm.transform.Sequential(
-        [
-            relay.transform.MergeComposite(patterns),
-            relay.transform.AnnotateTarget(["test2"]),
-            relay.transform.PartitionGraph(),
-        ]
-    )
-
-    mod = passes(mod)
-
-    compiler = VMCompiler()
-    compiler.lower(mod, "llvm")
-
-
-@tvm.testing.requires_cuda
-def test_storage_size_and_offset_on_cpu():
-    """Tests allocations place sizes and offsets on the CPU host even if the rest
-    of the computation is on a different device type."""
-
-    # TODO(mbs): Better would be to test ManifestAlloc independently.
-    # And/or move this to C++ and test the VM executable in it's C++ instead of
-    # pretty-printed form.
-
-    # CPU = device type 1
-    # GPU = device type 2
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%a: Tensor[(5, 7), float32],
-                      param_device_types=[2], result_device_type=2) {
-              add(%a, %a)
-            }
-        """
-        )
-
-    exe = relay.vm.compile(
-        input(),
-        tvm.target.Target("cuda"),
-    )
-
-    # This program needs two constants:
-    # - The size of the tensor's storage (first arg) to alloc_storage
-    # - The offset of the tensor within the storage (second arg) to alloc_tensor
-    # Both should be on the CPU
-    assert "VirtualDevice[0]: device type 1" in exe.virtual_devices
-    assert "VM Const[0]: NDArray[(),int64,(1,0)]=[140] on device index 0" in exe.constants
-    assert "VM Const[1]: NDArray[(),int64,(1,0)]=[0] on device index 0" in exe.constants
-
-
-@tvm.testing.requires_cuda
-def test_reshape_shape_on_cpu():
-    """Tests the argument to a reshape places the shape on the CPU host even if the rest
-    of the computation is on a different device type."""
-
-    # TODO(mbs): Better would be to test ManifestAlloc independently.
-    # And/or move this to C++ and test the VM executable in it's C++ instead of
-    # pretty-printed form.
-
-    # CPU = device type 1
-    # GPU = device type 2
-    def input():
-        return tvm.relay.fromtext(
-            """
-            #[version = "0.0.5"]
-            def @main(%x: Tensor[(2, 8), float32],
-                      param_device_types=[2], result_device_type=2) {
-              reshape(%x, newshape=[2, 4, 2])
-            }
-        """
-        )
-
-    exe = relay.vm.compile(
-        input(),
-        tvm.target.Target("cuda"),
-    )
-
-    # The newshape annotation should have been turned into a constant on the CPU.
-    assert "VirtualDevice[0]: device type 1" in exe.virtual_devices
-    assert "VM Const[0]: NDArray[(3),int64,(1,0)]=[2,4,2] on device index 0" in exe.constants
-
-
-@tvm.testing.requires_cuda
-def test_multi_targets():
-    # Build an IRModule.
-    n = 10
-    x = relay.var("x", shape=(n,))
-    y = relay.var("y", shape=(n,))
-    z = relay.var("z", shape=(n,))
-    f = relay.Function([x, y, z], x + relay.op.annotation.on_device(y + z, tvm.cpu()))
-    mod = IRModule.from_expr(f)
-
-    # Compile to VMExecutable.
-    with tvm.transform.PassContext(
-        opt_level=3, config={"relay.fallback_device_type": tvm.cuda().device_type}
-    ):
-        exe = relay.vm.compile(
-            mod, target={"cpu": tvm.target.Target("llvm"), "cuda": tvm.target.Target("cuda")}
-        )
-
-    # Run
-    vm = runtime.vm.VirtualMachine(exe, [tvm.cuda(), tvm.cpu()])
-    x_data = np.random.rand(
-        n,
-    ).astype("float32")
-    y_data = np.random.rand(
-        n,
-    ).astype("float32")
-    z_data = np.random.rand(
-        n,
-    ).astype("float32")
-    actual_result = vm.invoke("main", x_data, y_data, z_data)
-
-    # Test
-    expected_result = x_data + y_data + z_data
-    tvm.testing.assert_allclose(actual_result.numpy(), expected_result)
-
-
-def test_let_bound_constants():
-    """This tests for an ICHECK failure for ill-formed IR with let-bound constants"""
-
-    x = relay.var("x", shape=(3,), dtype="int32")
-    y = relay.take(x, relay.const(0))
-    z = relay.const(1)
-
-    f = relay.Function([x], relay.stack((z, y), axis=0))
-    mod = IRModule.from_expr(f)
-
-    compiler = VMCompiler()
-    compiler.optimize(mod, target="llvm")
-
-
-def test_large_constants():
-    """Large constants can be serialized outside of executable"""
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-
-    # fn(x) { add(x, <large constant>) }
-    x = relay.var("x", shape=(1000, 1000))
-    const_data = np.random.rand(1000, 1000).astype("float32")
-    const = relay.const(const_data, dtype="float32")
-    func = relay.Function([x], relay.op.add(x, const))
-    mod = tvm.IRModule.from_expr(func)
-
-    # Compile to executable.
-    vm_exec = vm.compile(mod, target=target)
-
-    # Save to constants and library files
-    temp = utils.tempdir()
-    path_consts = temp.relpath("consts")
-    vm_exec.move_late_bound_consts(path_consts, byte_limit=256)
-    path_dso = temp.relpath("lib.so")
-    vm_exec.mod.export_library(path_dso)
-
-    # Load library files and constants
-    mod = runtime.load_module(path_dso)
-    mod["load_late_bound_consts"](path_consts)
-
-    # Test main
-    x_data = np.random.rand(1000, 1000).astype("float32")
-    the_vm = runtime.vm.VirtualMachine(mod, dev)
-    actual = the_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-    # We load the mod again so it's missing the consts.
-    mod = runtime.load_module(path_dso)
-    exe = runtime.vm.Executable(mod)
-
-    # Also test loading consts via the VM's wrapper API.
-    exe.load_late_bound_consts(path_consts)
-
-    # Test main again with consts now loaded via the above API.
-    x_data = np.random.rand(1000, 1000).astype("float32")
-    the_vm = runtime.vm.VirtualMachine(exe, dev)
-    actual = the_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-
-def test_load_late_bound_consts_with_no_late_bound_consts():
-    """Check that load_late_bound_consts handles a model with no late bound consts."""
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-
-    const_data = np.random.rand(1).astype("float64")
-    x = relay.var("x", shape=(1,), dtype="float64")
-    const = relay.const(const_data, dtype="float64")
-
-    func = relay.Function([x], relay.op.add(x, const))
-    mod = tvm.IRModule.from_expr(func)
-
-    vm_exec = vm.compile(mod, target=target)
-
-    temp = utils.tempdir()
-    path_consts = temp.relpath("consts")
-    path_dso = temp.relpath("lib.so")
-
-    # Ensure const_data is below the byte threshold for a late-bound const.
-    byte_limit = len(const_data.tobytes()) + 1
-    vm_exec.move_late_bound_consts(path_consts, byte_limit=byte_limit)
-    vm_exec.mod.export_library(path_dso)
-
-    mod = runtime.load_module(path_dso)
-    mod["load_late_bound_consts"](path_consts)
-
-    x_data = np.random.rand(1).astype("float64")
-    loaded_vm = runtime.vm.VirtualMachine(mod, dev)
-    actual = loaded_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-
-def test_vm_save_and_load_without_designating_late_bound_consts():
-    """Check that a VM can be saved and loaded without late-bound consts in play.
-
-    Specifically, this test ensures that the machinery behind late-bound const
-    loading does not assume the need to load late-bound consts (and cause an error)
-    when the user did not choose to designate any consts as such.
-    """
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-
-    const_data = np.random.rand(1).astype("float64")
-    x = relay.var("x", shape=(1,), dtype="float64")
-    const = relay.const(const_data, dtype="float64")
-
-    func = relay.Function([x], relay.op.add(x, const))
-    mod = tvm.IRModule.from_expr(func)
-
-    vm_exec = vm.compile(mod, target=target)
-
-    code, lib = vm_exec.save()
-    exe = runtime.vm.Executable.load_exec(code, lib)
-
-    x_data = np.random.rand(1).astype("float64")
-    loaded_vm = runtime.vm.VirtualMachine(exe, dev)
-    actual = loaded_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-
-def test_load_and_save_constants_via_map():
-    """Large constants can be serialized outside of executable"""
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-
-    # fn(x) { add(x, <large constant>) }
-    x = relay.var("x", shape=(1000, 1000))
-    const_data = np.random.rand(1000, 1000).astype("float32")
-    const = relay.const(const_data, dtype="float32")
-    func = relay.Function([x], relay.op.add(x, const))
-    mod = tvm.IRModule.from_expr(func)
-
-    # Compile to executable.
-    vm_exec = vm.compile(mod, target=target)
-
-    consts_map = vm_exec.get_late_bound_consts(byte_limit=256)
-
-    # Save to constants and library files
-    temp = utils.tempdir()
-    path_dso = temp.relpath("lib.so")
-    vm_exec.mod.export_library(path_dso)
-
-    # Load library files and constants
-    mod = runtime.load_module(path_dso)
-    mod["load_late_bound_consts_from_map"](consts_map)
-
-    # Test main
-    x_data = np.random.rand(1000, 1000).astype("float32")
-    the_vm = runtime.vm.VirtualMachine(mod, dev)
-    actual = the_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-    # We load the mod again so it's missing the consts.
-    mod = runtime.load_module(path_dso)
-    exe = runtime.vm.Executable(mod)
-
-    # Also test loading consts via the VM's wrapper API.
-    exe.load_late_bound_consts_from_map(consts_map)
-
-    # Test main again with consts now loaded via the above API.
-    x_data = np.random.rand(1000, 1000).astype("float32")
-    the_vm = runtime.vm.VirtualMachine(exe, dev)
-    actual = the_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-
-def test_load_late_bound_consts_via_map_with_no_late_bound_consts():
-    """Check that load_late_bound_consts handles a model with no late bound consts."""
-    target = tvm.target.Target("llvm")
-    dev = tvm.cpu()
-
-    const_data = np.random.rand(1).astype("float64")
-    x = relay.var("x", shape=(1,), dtype="float64")
-    const = relay.const(const_data, dtype="float64")
-
-    func = relay.Function([x], relay.op.add(x, const))
-    mod = tvm.IRModule.from_expr(func)
-
-    vm_exec = vm.compile(mod, target=target)
-
-    temp = utils.tempdir()
-    path_dso = temp.relpath("lib.so")
-
-    # Ensure const_data is below the byte threshold for a late-bound const.
-    byte_limit = len(const_data.tobytes()) + 1
-    consts_map = vm_exec.get_late_bound_consts(byte_limit=byte_limit)
-    vm_exec.mod.export_library(path_dso)
-
-    mod = runtime.load_module(path_dso)
-    mod["load_late_bound_consts_from_map"](consts_map)
-
-    x_data = np.random.rand(1).astype("float64")
-    loaded_vm = runtime.vm.VirtualMachine(mod, dev)
-    actual = loaded_vm.invoke("main", x_data)
-    expected = x_data + const_data
-    tvm.testing.assert_allclose(expected, actual.numpy())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py
deleted file mode 100644
index f5a495bc71c7..000000000000
--- a/tests/python/relay/test_vm_serialization.py
+++ /dev/null
@@ -1,319 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, missing-docstring, no-else-return
-"""Unit tests for the Relay VM serialization and deserialization."""
-import pytest
-import numpy as np
-
-import tvm
-from tvm.runtime import vm as _vm
-from tvm.relay import vm as rly_vm
-from tvm import relay
-
-from tvm.relay.scope_builder import ScopeBuilder
-from tvm.relay import transform
-from tvm.relay.prelude import Prelude
-from tvm.contrib import utils
-from tvm.relay import testing
-
-
-def create_exec(f, target="llvm", params=None):
-    if isinstance(f, relay.Expr):
-        mod = tvm.IRModule()
-        mod["main"] = f
-        executable = rly_vm.compile(mod, target=target, params=params)
-        return executable
-    else:
-        assert isinstance(f, tvm.IRModule), "expected mod as tvm.IRModule"
-        executable = rly_vm.compile(f, target=target, params=params)
-        return executable
-
-
-def get_serialized_output(mod, *data, params=None, target="llvm", device=tvm.cpu()):
-    exe = create_exec(mod, target, params=params)
-    code, lib = exe.save()
-    des_exec = _vm.Executable.load_exec(code, lib)
-    des_vm = _vm.VirtualMachine(des_exec, device)
-    result = des_vm.run(*data)
-    return result
-
-
-def run_network(mod, params, dtype="float32"):
-    def get_vm_output(mod, data, params, target, device, dtype="float32"):
-        result = relay.create_executor("vm", mod=mod, device=device).evaluate()(data, **params)
-        return result.numpy().astype(dtype)
-
-    data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape]
-    data = np.random.uniform(size=data_shape).astype(dtype)
-    target = "llvm"
-    dev = tvm.cpu(0)
-
-    tvm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype)
-    vm_out = get_serialized_output(
-        mod, tvm.nd.array(data.astype(dtype)), params=params, target=target, device=dev
-    )
-    tvm.testing.assert_allclose(vm_out.numpy().astype(dtype), tvm_out, rtol=1e-5, atol=1e-5)
-
-
-def test_serializer():
-    mod = tvm.IRModule({})
-    a = relay.const(1.0, "float32")
-    x = relay.var("x", shape=(10, 10), dtype="float32")
-    f1 = relay.Function([x], x + a)
-    glb_f1 = relay.GlobalVar("f1")
-    mod[glb_f1] = f1
-
-    # TODO(@jroesch): look into optimizing away the need to do this
-    mod = transform.InferType()(mod)
-
-    b = relay.const(2.0, "float32")
-    y = relay.var("y", shape=(10, 10), dtype="float32")
-    f2 = relay.Function([y], y - b)
-    glb_f2 = relay.GlobalVar("f2")
-    mod[glb_f2] = f2
-
-    # TODO(@jroesch): look into optimizing away the need to do this
-    mod = transform.InferType()(mod)
-
-    x1 = relay.var("x1", shape=(10, 10), dtype="float32")
-    y1 = relay.var("y1", shape=(10, 10), dtype="float32")
-    main = relay.Function([x1, y1], glb_f1(x1) * glb_f2(y1))
-    mod["main"] = main
-
-    exe = create_exec(mod)
-
-    glbs = exe.globals
-    assert len(glbs) == 3
-    assert "f1" in glbs
-    assert "f2" in glbs
-    assert "main" in glbs
-
-    prim_ops = exe.primitive_ops
-    assert any(item.startswith("vm_mod_fused_add") for item in prim_ops)
-    assert any(item.startswith("vm_mod_fused_subtract") for item in prim_ops)
-    assert any(item.startswith("vm_mod_fused_multiply") for item in prim_ops)
-
-    code = exe.bytecode
-    assert "main(x1, y1)" in code
-    assert "f1(x)" in code
-    assert "f2(y)" in code
-
-    code, lib = exe.save()
-    assert isinstance(code, bytearray)
-    assert isinstance(lib, tvm.runtime.Module)
-
-
-def test_save_load():
-    x = relay.var("x", shape=(10, 10))
-    f = relay.Function([x], x + x)
-    x_data = np.random.rand(10, 10).astype("float32")
-
-    # serialize.
-    vm = create_exec(f)
-    code, lib = vm.save()
-    assert isinstance(code, bytearray)
-
-    # save and load the code and lib file.
-    tmp = utils.tempdir()
-    path_lib = tmp.relpath("lib.so")
-    lib.export_library(path_lib)
-    with open(tmp.relpath("code.ro"), "wb") as fo:
-        fo.write(code)
-
-    loaded_lib = tvm.runtime.load_module(path_lib)
-    loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read())
-
-    # deserialize.
-    des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib)
-    des_vm = _vm.VirtualMachine(des_exec, tvm.cpu())
-
-    res = des_vm.run(x_data)
-    tvm.testing.assert_allclose(res.numpy(), x_data + x_data)
-
-
-def test_const():
-    c = relay.const(1.0, "float32")
-    x = relay.var("x", shape=(10, 10), dtype="float32")
-    f = relay.Function([x], x + c)
-    x_data = np.random.rand(10, 10).astype("float32")
-    res = get_serialized_output(f, x_data)
-    tvm.testing.assert_allclose(res.numpy(), x_data + 1)
-
-
-def test_if():
-    x = relay.var("x", shape=(10, 10))
-    y = relay.var("y", shape=(10, 10))
-    equal = relay.op.equal(x, y)
-    equal = relay.op.nn.batch_flatten(equal)
-    f = relay.Function([x, y], relay.If(relay.op.min(equal, axis=[0, 1]), x, y))
-    x_data = np.random.rand(10, 10).astype("float32")
-    y_data = np.random.rand(10, 10).astype("float32")
-
-    # same
-    res = get_serialized_output(f, x_data, x_data)
-    tvm.testing.assert_allclose(res.numpy(), x_data)
-
-    # diff
-    res = get_serialized_output(f, x_data, y_data)
-    tvm.testing.assert_allclose(res.numpy(), y_data)
-
-
-def test_loop():
-    mod = tvm.IRModule({})
-    sum_up = relay.GlobalVar("sum_up")
-    i = relay.var("i", shape=[], dtype="int32")
-    accum = relay.var("accum", shape=[], dtype="int32")
-    sb = ScopeBuilder()
-    with sb.if_scope(relay.equal(i, relay.const(0, "int32"))):
-        sb.ret(accum)
-    with sb.else_scope():
-        one_less = relay.subtract(i, relay.const(1, "int32"))
-        new_accum = relay.add(accum, i)
-        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
-    func = relay.Function([i, accum], sb.get())
-    mod[sum_up] = func
-    mod = transform.InferType()(mod)
-    loop_bound = 0
-    i_data = np.array(loop_bound, dtype="int32")
-    accum_data = np.array(0, dtype="int32")
-    iarg = relay.var("i", shape=[], dtype="int32")
-    aarg = relay.var("accum", shape=[], dtype="int32")
-    mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg))
-
-    result = get_serialized_output(mod, i_data, accum_data)
-    tvm.testing.assert_allclose(result.numpy(), sum(range(1, loop_bound + 1)))
-
-
-def test_tuple():
-    ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))])
-    tup = relay.var("tup", type_annotation=ttype)
-    f = relay.Function([tup], relay.TupleGetItem(tup, 1))
-    i_data = np.random.rand(41).astype("float32")
-    j_data = np.random.rand(10).astype("float32")
-
-    result = get_serialized_output(f, (i_data, j_data))
-    tvm.testing.assert_allclose(result.numpy(), j_data)
-
-
-def test_adt_list():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-    _, cons, nil = mod.get_type("List")
-    l1 = cons(relay.const(1), nil())
-    l21 = cons(relay.const(2), l1)
-    l321 = cons(relay.const(3), l21)
-
-    f = relay.Function([], l321)
-    mod["main"] = f
-
-    result = get_serialized_output(mod)
-    assert len(result) == 2
-    assert len(result[1]) == 2
-    assert len(result[1][1]) == 2
-    res = []
-    res.append(result[0].numpy().tolist())
-    res.append(result[1][0].numpy().tolist())
-    res.append(result[1][1][0].numpy().tolist())
-    tvm.testing.assert_allclose(res, np.array([3, 2, 1]))
-
-
-def test_adt_compose():
-    mod = tvm.IRModule()
-    p = Prelude(mod)
-
-    compose = mod.get_global_var("compose")
-
-    # add_one = fun x -> x + 1
-    sb = relay.ScopeBuilder()
-    x = relay.var("x", "float32")
-    x1 = sb.let("x1", x)
-    xplusone = x1 + relay.const(1.0, "float32")
-    sb.ret(xplusone)
-    body = sb.get()
-    add_one = relay.GlobalVar("add_one")
-    add_one_func = relay.Function([x], body)
-
-    # add_two = compose(add_one, add_one)
-    sb = relay.ScopeBuilder()
-    y = relay.var("y", "float32")
-    add_two_func = sb.let("add_two", compose(add_one_func, add_one_func))
-    add_two_res = add_two_func(y)
-    sb.ret(add_two_res)
-    add_two_body = sb.get()
-
-    mod[add_one] = add_one_func
-
-    f = relay.Function([y], add_two_body)
-    mod["main"] = f
-
-    x_data = np.array(np.random.rand()).astype("float32")
-    result = get_serialized_output(mod, x_data)
-    tvm.testing.assert_allclose(result.numpy(), x_data + 2.0)
-
-
-def test_closure():
-    x = relay.var("x", shape=())
-    y = relay.var("y", shape=())
-    f = relay.Function([x], x + y)
-    ff = relay.Function([y], f)
-    clo = ff(relay.const(1.0))
-    main = clo(relay.const(2.0))
-
-    res = get_serialized_output(main)
-    tvm.testing.assert_allclose(res.numpy(), 3.0)
-
-
-def test_synthetic():
-    mod, params = testing.synthetic.get_workload()
-    run_network(mod, params)
-
-
-def test_mobilenet():
-    mod, params = testing.mobilenet.get_workload(batch_size=1)
-    run_network(mod, params)
-
-
-def test_vm_shape_of():
-    x = relay.var("x", shape=(relay.Any(), relay.Any(), relay.Any()), dtype="float32")
-    relu_x = relay.nn.relu(x)
-    data = np.random.uniform(size=(2, 3, 4)).astype("float32")
-    args = [data]
-
-    newshape_var = relay.var("newshape", shape=(2,), dtype="int64")
-    args.append(np.array((1, -1), dtype="int64"))
-    main = relay.Function([x, newshape_var], relay.reshape(relu_x, newshape=newshape_var))
-
-    res = get_serialized_output(main, *args).numpy()
-    tvm.testing.assert_allclose(res.flatten(), data.flatten())
-
-
-def test_dynamic_bcast():
-    dtype = "float32"
-    x = relay.var("x", shape=(relay.Any(), 2), dtype=dtype)
-    y = relay.var("y", shape=(3, 2), dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], relay.add(x, y))
-    x_data = np.random.uniform(size=(1, 2)).astype(dtype)
-    y_data = np.random.uniform(size=(3, 2)).astype(dtype)
-    res_np = np.add(x_data, y_data)
-    for target, dev in testing.enabled_targets():
-        res = get_serialized_output(mod, *(x_data, y_data), target=target, device=dev)
-        tvm.testing.assert_allclose(res.numpy(), res_np)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/transform/test_capture_postdfsindex_in_spans.py b/tests/python/relay/transform/test_capture_postdfsindex_in_spans.py
deleted file mode 100644
index ab585fb4e011..000000000000
--- a/tests/python/relay/transform/test_capture_postdfsindex_in_spans.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-"""Unit tests for the CapturePostDfsIndexInSpans debugging pass."""
-
-import tvm
-import tvm.testing
-import numpy as np
-
-
-def make_const(dtype, shape):
-    return tvm.relay.const(np.random.rand(*shape).astype(dtype))
-
-
-def make_consts(dtype, shapes):
-    return [make_const(dtype, shape) for shape in shapes]
-
-
-metatable = {
-    "relay.Constant": make_consts(
-        "float16",
-        [
-            (2304, 768),  # 0
-            (2304,),  # 1
-            (600, 32, 64),  # 2
-        ],
-    )
-}
-
-
-def input_mod():
-    return tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-          %0 = nn.dense(%x0, meta[relay.Constant][0], units=2304);
-          %1 = add(%0, meta[relay.Constant][1]);
-          %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16],
-                  Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-            %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16],
-                     PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-              nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True)
-            };
-            %6(%y_3_i0, %y_3_i1)
-          };
-          %3 = %2(%x3, meta[relay.Constant][2]);
-          (%1, %3)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-
-expected_pretty_printed_output_mod = r"""def @main(%x0: Tensor[(1600, 768), float16] /* ty=Tensor[(1600, 768), float16] span=index:0:5 */, %x3: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:1:18 */) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-  %0 = nn.dense(%x0, meta[relay.Constant][0] /* ty=Tensor[(2304, 768), float16] span=index:4:5 */, units=2304) /* ty=Tensor[(1600, 2304), float16] span=index:5:7 */;
-  %2 = fn (%y_3_i0: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:8:15 */, %y_3_i1: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:9:15 */, Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-    %1 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:10:13 */, %FunctionVar_0_11: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:11:13 */, PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-      nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) /* ty=Tensor[(600, 32, 32), float16] span=index:13:14 */
-    } /* ty=fn (Tensor[(600, 32, 64), float16], Tensor[(600, 32, 64), float16]) -> Tensor[(600, 32, 32), float16] span=index:14:15 */;
-    %1(%y_3_i0, %y_3_i1) /* ty=Tensor[(600, 32, 32), float16] span=index:15:16 */
-  } /* ty=fn (Tensor[(600, 32, 64), float16], Tensor[(600, 32, 64), float16]) -> Tensor[(600, 32, 32), float16] span=index:16:18 */;
-  %3 = add(%0, meta[relay.Constant][1] /* ty=Tensor[(2304), float16] span=index:6:7 */) /* ty=Tensor[(1600, 2304), float16] span=index:7:19 */;
-  %4 = %2(%x3, meta[relay.Constant][2] /* ty=Tensor[(600, 32, 64), float16] span=index:17:18 */) /* ty=Tensor[(600, 32, 32), float16] span=index:18:19 */;
-  (%3, %4) /* ty=(Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) span=index:19:20 */
-}
-
-"""
-
-
-def test_capture_index_in_spans():
-    output_mod = str(tvm.relay.transform.CapturePostDfsIndexInSpans()(input_mod()))
-    assert output_mod == expected_pretty_printed_output_mod
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/transform/test_compiler_function_utils.py b/tests/python/relay/transform/test_compiler_function_utils.py
deleted file mode 100644
index 2e5f3b5ecf0e..000000000000
--- a/tests/python/relay/transform/test_compiler_function_utils.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-"""Unit tests for the OutlineCompilerFunctionsWithExistingGlobalSymbols and
-   MarkCompilerFunctionsAsExtern external codegen helper passes."""
-
-import tvm
-import tvm.testing
-import numpy as np
-
-
-def make_const(dtype, shape):
-    return tvm.relay.const(np.random.rand(*shape).astype(dtype))
-
-
-def make_consts(dtype, shapes):
-    return [make_const(dtype, shape) for shape in shapes]
-
-
-metatable = {
-    "relay.Constant": make_consts(
-        "float16",
-        [
-            (2304, 768),  # 0
-            (2304,),  # 1
-            (600, 32, 64),  # 2
-        ],
-    )
-}
-
-
-def original_mod():
-    return tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-          %0 = fn(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16],
-                  Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] {
-            %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16],
-                     PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] {
-              %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304);
-              add(%5, %FunctionVar_0_2)
-            };
-            %4(%y_0_i0, %y_0_i1, %y_0_i2)
-          };
-          %1 = %0(%x0, meta[relay.Constant][0], meta[relay.Constant][1]);
-          %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16],
-                  Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-            %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16],
-                     PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-              nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True)
-            };
-            %6(%y_3_i0, %y_3_i1)
-          };
-          %3 = %2(%x3, meta[relay.Constant][2]);
-          (%1, %3)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-
-def original_mod_let_bound():
-    return tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-          let %f = fn(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16],
-                      Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] {
-            %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16],
-                     PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] {
-              %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304);
-              add(%5, %FunctionVar_0_2)
-            };
-            %4(%y_0_i0, %y_0_i1, %y_0_i2)
-          };
-          %1 = %f(%x0, meta[relay.Constant][0], meta[relay.Constant][1]);
-          %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16],
-                  Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-            %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16],
-                     PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-              nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True)
-            };
-            %6(%y_3_i0, %y_3_i1)
-          };
-          %3 = %2(%x3, meta[relay.Constant][2]);
-          (%1, %3)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-
-def expected_outlined_mod():
-    return tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-          %1 = @tvmgen_default_cutlass_main_0(%x0, meta[relay.Constant][0], meta[relay.Constant][1]);
-          %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16],
-                  Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-            %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16],
-                     PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-              nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True)
-            };
-            %6(%y_3_i0, %y_3_i1)
-          };
-          %3 = %2(%x3, meta[relay.Constant][2]);
-          (%1, %3)
-        }
-
-        def @tvmgen_default_cutlass_main_0(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16],
-                  Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] {
-          %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16],
-                   PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] {
-            %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304);
-            add(%5, %FunctionVar_0_2)
-          };
-          %4(%y_0_i0, %y_0_i1, %y_0_i2)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-
-def expected_extern_mod():
-    return tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-          %1 = @tvmgen_default_cutlass_main_0(%x0, meta[relay.Constant][0], meta[relay.Constant][1]);
-          %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16],
-                  Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-            %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16],
-                     PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-              nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True)
-            };
-            %6(%y_3_i0, %y_3_i1)
-          };
-          %3 = %2(%x3, meta[relay.Constant][2]);
-          (%1, %3)
-        }
-
-        def @tvmgen_default_cutlass_main_0(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16],
-                  Extern=1) -> Tensor[(1600, 2304), float16] {
-          %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16],
-                   PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] {
-            %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304);
-            add(%5, %FunctionVar_0_2)
-          };
-          %4(%y_0_i0, %y_0_i1, %y_0_i2)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-
-def expected_inlined_mod():
-    return tvm.relay.parse(
-        """
-        #[version = "0.0.5"]
-        def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) {
-          %0 = nn.dense(%x0, meta[relay.Constant][0], units=2304);
-          %1 = add(%0, meta[relay.Constant][1]);
-          %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16],
-                  Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] {
-            %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16],
-                     PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] {
-              nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True)
-            };
-            %6(%y_3_i0, %y_3_i1)
-          };
-          %3 = %2(%x3, meta[relay.Constant][2]);
-          (%1, %3)
-        }
-        """,
-        "from_string",
-        None,
-        metatable,
-    )
-
-
-def test_outline_compiler_functions_with_existing_global_symbols():
-    actual_outlined_mod = tvm.relay.transform.OutlineCompilerFunctionsWithExistingGlobalSymbols(
-        "cutlass"
-    )(original_mod())
-    tvm.ir.assert_structural_equal(actual_outlined_mod, expected_outlined_mod(), map_free_vars=True)
-
-
-def test_outline_let_bound_compiler_functions_with_existing_global_symbols():
-    actual_outlined_mod = tvm.relay.transform.OutlineCompilerFunctionsWithExistingGlobalSymbols(
-        "cutlass"
-    )(original_mod_let_bound())
-    tvm.ir.assert_structural_equal(actual_outlined_mod, expected_outlined_mod(), map_free_vars=True)
-
-
-def test_mark_compiler_functions_as_extern():
-    actual_extern_mod = tvm.relay.transform.MarkCompilerFunctionsAsExtern("cutlass")(
-        expected_outlined_mod()
-    )
-    tvm.ir.assert_structural_equal(actual_extern_mod, expected_extern_mod(), map_free_vars=True)
-
-
-def test_inline_compiler_functions():
-    mod = expected_outlined_mod()
-    gv = mod.get_global_var("tvmgen_default_cutlass_main_0")
-    actual_inlined_mod = tvm.relay.transform.InlineCompilerFunctionsBoundTo([gv])(mod)
-    tvm.ir.assert_structural_equal(actual_inlined_mod, expected_inlined_mod(), map_free_vars=True)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/utils/assert_diagnostic.py b/tests/python/relay/utils/assert_diagnostic.py
deleted file mode 100644
index 5fcd1c20a018..000000000000
--- a/tests/python/relay/utils/assert_diagnostic.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import IRModule, get_global_func, register_func, relay
-from tvm.error import DiagnosticError
-from tvm.ir.diagnostics import get_renderer, override_renderer
-from tvm.relay import SpanCheck
-from tvm.relay.transform import AnnotateSpans
-from tvm.runtime import Object
-
-DEFAULT_RENDERER = get_renderer()
-
-__TESTING__ = None
-
-
-def testing_renderer(diag_ctx):
-    global __TESTING__
-    if __TESTING__ and __TESTING__.mirror:
-        DEFAULT_RENDERER.render(diag_ctx)
-
-    if __TESTING__:
-        __TESTING__._render(diag_ctx)
-
-
-class DiagnosticTesting:
-    def __init__(self, mirror=False):
-        self.mirror = mirror
-        self.messages = []
-
-    def __enter__(self):
-        global __TESTING__
-        __TESTING__ = self
-        override_renderer(testing_renderer)
-        return self
-
-    def __exit__(self, type, value, traceback):
-        global __TESTING__
-        __TESTING__ = None
-        override_renderer(None)
-        if type is DiagnosticError and self.matches:
-            return True
-
-    def assert_message(self, in_message):
-        self.messages.append(in_message)
-
-    def _render(self, diag_ctx):
-        self.matches = False
-        for diagnostic in diag_ctx.diagnostics:
-            message = diagnostic.message
-            for partial_msg in self.messages:
-                if partial_msg in message:
-                    self.matches = True
diff --git a/tests/python/relay/utils/external_codegen.py b/tests/python/relay/utils/external_codegen.py
deleted file mode 100644
index e200e885225d..000000000000
--- a/tests/python/relay/utils/external_codegen.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for testing external code generation"""
-
-import os
-import sys
-
-import pytest
-
-import tvm
-from tvm import relay, runtime, testing
-from tvm.contrib import utils
-
-
-skip_windows = pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now")
-
-
-def parametrize_external_codegen_checks(test):
-    """Parametrize over the various check_result functions which are available"""
-    return pytest.mark.parametrize(
-        "check_result",
-        [
-            pytest.param(check_graph_executor_result, marks=[skip_windows]),
-            pytest.param(check_vm_result, marks=[skip_windows]),
-        ],
-    )(test)
-
-
-def parametrize_external_json_codegen_checks(test):
-    """Parametrize over the various check_result functions which are available for JSON"""
-    return pytest.mark.parametrize(
-        "check_result",
-        [
-            pytest.param(check_graph_executor_result, marks=[skip_windows]),
-            pytest.param(check_vm_result, marks=[skip_windows]),
-        ],
-    )(test)
-
-
-def update_lib(lib):
-    test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-    source_dir = os.path.join(test_dir, "..", "..", "..", "..")
-    contrib_path = os.path.join(source_dir, "src", "runtime", "contrib")
-
-    kwargs = {}
-    kwargs["options"] = ["-O2", "-std=c++17", "-I" + contrib_path]
-    tmp_path = utils.tempdir()
-    lib_name = "lib.so"
-    lib_path = tmp_path.relpath(lib_name)
-    lib.export_library(lib_path, fcompile=False, **kwargs)
-    lib = tvm.runtime.load_module(lib_path)
-
-    return lib
-
-
-def check_vm_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()):
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        exe = relay.vm.compile(mod, target=target)
-    code, lib = exe.save()
-    lib = update_lib(lib)
-    exe = runtime.vm.Executable.load_exec(code, lib)
-    vm = runtime.vm.VirtualMachine(exe, device)
-    out = vm.run(**map_inputs)
-    tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol)
-
-
-def check_graph_executor_result(
-    mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()
-):
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        executor_factory = relay.build(mod, target=target)
-    lib = update_lib(executor_factory.lib)
-    rt_mod = tvm.contrib.graph_executor.create(executor_factory.graph_json, lib, device)
-
-    for name, data in map_inputs.items():
-        rt_mod.set_input(name, data)
-    rt_mod.run()
-    out = tvm.nd.empty(out_shape, device=device)
-    out = rt_mod.get_output(0, out)
-
-    tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol)
-
-
-def set_external_func_attr(func, compiler, ext_symbol):
-    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-    func = func.with_attr("Compiler", compiler)
-    func = func.with_attr("global_symbol", ext_symbol)
-    return func
diff --git a/tests/python/relay/utils/ref_funcs.py b/tests/python/relay/utils/ref_funcs.py
deleted file mode 100644
index 924805b2295e..000000000000
--- a/tests/python/relay/utils/ref_funcs.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-
-def gather_nd(data_np, indices_np, batch_dims=0):
-    """gather_nd implemented using numpy"""
-    data_shape = data_np.shape
-    indices_shape = indices_np.shape
-
-    def gather_nd_batch_dims_1_ref(data, indices):
-        res = []
-        for i, row in enumerate(data):
-            indices_tuple = tuple(indices[:, i])  # the indices for the i-th batch
-            res.append(row[indices_tuple])
-        # stack on the batch dim
-        return np.stack(res, 0)
-
-    if batch_dims > 1:
-        data_np_reshape = np.reshape(data_np, (-1,) + data_shape[batch_dims:])
-        indices_np_reshape = np.reshape(
-            indices_np, (indices_shape[0], -1) + indices_shape[(batch_dims + 1) :]
-        )
-
-        ref_res = gather_nd_batch_dims_1_ref(data_np_reshape, indices_np_reshape)
-
-        out_shape = indices_shape[1 : (batch_dims + 1)] + ref_res.shape[1:]
-        ref_res = np.reshape(ref_res, out_shape)
-    elif batch_dims == 1:
-        ref_res = gather_nd_batch_dims_1_ref(data_np, indices_np)
-    else:
-        ref_res = data_np[tuple(indices_np)]
-
-    return ref_res
diff --git a/tests/python/relay/utils/tag_span.py b/tests/python/relay/utils/tag_span.py
deleted file mode 100644
index 3f9aaff3ee8d..000000000000
--- a/tests/python/relay/utils/tag_span.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import relay, tir
-from tvm.relay import expr as _expr
-from tvm.relay.expr_functor import ExprVisitor
-
-
-def _set_span(expr, src):
-    if isinstance(expr, _expr.Call):
-        return _expr.CallWithFields(
-            expr, expr.op, expr.args, expr.attrs, expr.type_args, None, _create_span(src)
-        )
-    elif isinstance(expr, _expr.Var):
-        return _expr.VarWithFields(expr, expr.vid, expr.type_annotation, None, _create_span(src))
-    elif isinstance(expr, _expr.TupleGetItem):
-        return _expr.TupleGetItemWithFields(
-            expr, expr.tuple_value, expr.index, None, _create_span(src)
-        )
-    elif isinstance(expr, _expr.Constant):
-        return _expr.ConstantWithFields(expr, expr.data, None, _create_span(src))
-    elif isinstance(expr, _expr.Tuple):
-        return _expr.TupleWithFields(expr, expr.fields, None, _create_span(src))
-    elif isinstance(expr, _expr.TupleWrapper):
-        return _expr.TupleWrapper(_set_span(expr.tuple_value, src), expr.size)
-
-    assert False, f"unsupported type {type(expr)}"
-
-
-def _create_span(src):
-    if isinstance(src, list):
-        tmp_list = []
-        for s in src:
-            if isinstance(s, str):
-                tmp_list.append(_create_span(s))
-            elif isinstance(s, relay.Span):
-                tmp_list.append(s)
-            elif isinstance(s, relay.SequentialSpan):
-                tmp_list.extend(s.spans)
-            elif s is None:
-                tmp_list.append(s)
-            else:
-                assert False, f"unsupported type {type(s)}"
-        return relay.SequentialSpan(tmp_list)
-    return relay.Span(relay.SourceName(src), 0, 0, 0, 0)
-
-
-def _collect_spans(objref):
-    class Collector:
-        def __init__(self):
-            self._spans = []
-
-        def collect(self, objref):
-            if hasattr(objref, "span"):
-                self._spans.append(objref.span)
-
-        @property
-        def get_spans(self):
-            return self._spans
-
-    pov = None
-    if isinstance(objref, relay.Expr):
-        pov = relay.analysis.post_order_visit
-    elif isinstance(objref, (tir.Stmt, tir.expr.PrimExprWithOp)):
-        pov = tir.stmt_functor.post_order_visit
-    else:
-        assert False, f"unsupported type {type(objref)}"
-
-    c = Collector()
-    pov(objref, c.collect)
-    return c.get_spans
-
-
-def _verify_span(lhs, rhs):
-    lhs_spans, rhs_spans = _collect_spans(lhs), _collect_spans(rhs)
-
-    assert len(lhs_spans) == len(rhs_spans)
-
-    for i in range(len(lhs_spans)):
-        tvm.ir.assert_structural_equal(lhs_spans[i], rhs_spans[i])
-
-
-def _verify_structural_equal_with_span(lhs, rhs, assert_mode=False, map_free_vars=False):
-    if isinstance(lhs, relay.Var) and isinstance(rhs, relay.Var):
-        # SEqualReduce compares the vid of Var type. Threrfore we only compare span here.
-        _verify_span(lhs, rhs)
-        return
-
-    if assert_mode:
-        tvm.ir.assert_structural_equal(lhs, rhs, map_free_vars)
-    else:
-        tvm.ir.assert_structural_equal(lhs, rhs, map_free_vars)
-
-    _verify_span(lhs, rhs)
diff --git a/tests/python/runtime/test_runtime_container.py b/tests/python/runtime/test_runtime_container.py
index e0d216b33e9a..8d9de8241510 100644
--- a/tests/python/runtime/test_runtime_container.py
+++ b/tests/python/runtime/test_runtime_container.py
@@ -22,40 +22,10 @@
 
 import tvm
 import tvm.testing
-from tvm import nd, relay
+from tvm import nd
 from tvm.runtime import container as _container
 
 
-def test_adt_constructor():
-    arr = nd.array([1, 2, 3])
-    fields = [arr, arr]
-    y = _container.ADT(0, [arr, arr])
-
-    assert len(y) == 2
-    assert isinstance(y, _container.ADT)
-    y[0:1][-1] == arr
-    assert y.tag == 0
-    assert isinstance(arr, nd.NDArray)
-
-
-def test_tuple_object():
-    x = relay.var(
-        "x",
-        type_annotation=relay.ty.TupleType(
-            [relay.ty.TensorType((), "int32"), relay.ty.TensorType((), "int32")]
-        ),
-    )
-
-    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
-    mod = tvm.IRModule.from_expr(fn)
-
-    f = relay.create_executor(kind="vm", mod=mod, device=nd.cpu(), target="llvm").evaluate()
-    value_tuple = _container.tuple_object([nd.array(np.array(11)), nd.array(np.array(12))])
-    # pass an ADT object to evaluate
-    out = f(value_tuple)
-    tvm.testing.assert_allclose(out.numpy(), np.array(11))
-
-
 def test_string():
     s = tvm.runtime.String("xyz")
 
diff --git a/tests/python/runtime/test_runtime_graph.py b/tests/python/runtime/test_runtime_graph.py
deleted file mode 100644
index 108784de7eb1..000000000000
--- a/tests/python/runtime/test_runtime_graph.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tempfile
-import tvm
-import tvm.testing
-from tvm import te, runtime
-import numpy as np
-import json
-from tvm import rpc
-from tvm import relay
-from tvm.contrib import utils, graph_executor
-
-
-@tvm.testing.requires_llvm
-def test_graph_simple():
-    n = 4
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-
-    node0 = {"op": "null", "name": "x", "inputs": []}
-    node1 = {
-        "op": "tvm_op",
-        "name": "add",
-        "inputs": [[0, 0, 0]],
-        "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"},
-    }
-    nodes = [node0, node1]
-    arg_nodes = [0]
-    node_row_ptr = [0, 1, 2]
-    outputs = [[1, 0, 0]]
-    shape = (4,)
-    attrs = {
-        "shape": ["list_shape", [shape, shape]],
-        "dltype": ["list_str", ["float32", "float32"]],
-        "storage_id": ["list_int", [0, 1]],
-    }
-    graph = {
-        "nodes": nodes,
-        "arg_nodes": arg_nodes,
-        "node_row_ptr": node_row_ptr,
-        "heads": outputs,
-        "attrs": attrs,
-    }
-    graph = json.dumps(graph)
-
-    def check_verify():
-        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
-        mod = graph_executor.create(graph, mlib, tvm.cpu(0))
-        a = np.random.uniform(size=(n,)).astype(A.dtype)
-        mod.run(x=a)
-        out = mod.get_output(0, tvm.nd.empty((n,)))
-        np.testing.assert_equal(out.numpy(), a + 1)
-
-    def check_remote(server):
-        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
-        remote = rpc.connect(server.host, server.port)
-        temp = utils.tempdir()
-        dev = remote.cpu(0)
-        path_dso = temp.relpath("dev_lib.so")
-        mlib.export_library(path_dso)
-        remote.upload(path_dso)
-        mlib = remote.load_module("dev_lib.so")
-        mod = graph_executor.create(graph, mlib, remote.cpu(0))
-        a = np.random.uniform(size=(n,)).astype(A.dtype)
-        mod.run(x=tvm.nd.array(a, dev))
-        out = tvm.nd.empty((n,), device=dev)
-        out = mod.get_output(0, out)
-        np.testing.assert_equal(out.numpy(), a + 1)
-
-    def check_sharing():
-        x = relay.var("x", shape=(1, 10))
-        y = relay.var("y", shape=(1, 10))
-        z = relay.add(x, y)
-        func = relay.Function([x, y], z)
-
-        x_in = np.ones((1, 10)).astype("float32")
-        params = {"x": x_in}
-        graph, lib, params = relay.build(func, target="llvm", params=params)
-
-        mod_shared = graph_executor.create(graph, lib, tvm.cpu(0))
-        mod_shared.load_params(runtime.save_param_dict(params))
-        num_mods = 10
-        mods = [graph_executor.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)]
-
-        for mod in mods:
-            mod.share_params(mod_shared, runtime.save_param_dict(params))
-
-        a = np.random.uniform(size=(1, 10)).astype("float32")
-        for mod in mods:
-            mod.run(y=a)
-            out = mod.get_output(0, tvm.nd.empty((1, 10)))
-            np.testing.assert_equal(out.numpy(), x_in + a)
-
-        # Explicitly delete the shared module and verify correctness.
-        del mod_shared
-        for mod in mods:
-            mod.run(y=a)
-            out = mod.get_output(0, tvm.nd.empty((1, 10)))
-            np.testing.assert_equal(out.numpy(), x_in + a)
-            del mod
-
-    check_verify()
-    check_remote(rpc.Server("127.0.0.1"))
-    check_sharing()
-
-
-def test_load_unexpected_params():
-    # Test whether graph_executor.load_params works if parameters
-    # are provided that are not an expected input.
-    mod = tvm.IRModule()
-    params = {}
-    x = relay.var("x", shape=(1, 10))
-    y = relay.var("y", shape=(1, 10))
-    z = relay.add(x, y)
-    mod["main"] = relay.Function([x, y], z)
-
-    graph_module = relay.build(mod, target="llvm", params=params)
-    rt_mod = tvm.contrib.graph_executor.create(
-        graph_module.get_graph_json(), graph_module.get_lib(), tvm.cpu(0)
-    )
-
-    new_params = graph_module.get_params()
-    new_params.update({"y_unknown": np.ones((1,)).astype("float32")})
-    rt_mod.load_params(runtime.save_param_dict(new_params))
-
-
-def test_save_load_file():
-    p = np.random.randn(10)
-    params = {"x": p}
-
-    with tempfile.NamedTemporaryFile() as fp:
-        tvm.runtime.save_param_dict_to_file(params, fp.name)
-        params_loaded = tvm.runtime.load_param_dict_from_file(fp.name)
-
-        assert "x" in params_loaded
-        np.testing.assert_equal(p, params_loaded["x"].numpy())
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/runtime/test_runtime_graph_cuda_graph.py b/tests/python/runtime/test_runtime_graph_cuda_graph.py
deleted file mode 100644
index 0282161c60f8..000000000000
--- a/tests/python/runtime/test_runtime_graph_cuda_graph.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import json
-import os
-import re
-import sys
-import time
-
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import te
-import numpy as np
-
-from tvm.contrib import utils, graph_executor
-from tvm.contrib.cuda_graph import cuda_graph_executor
-
-
-bx = te.thread_axis("blockIdx.x")
-tx = te.thread_axis("threadIdx.x")
-
-
-@tvm.testing.requires_cudagraph
-def test_graph_simple():
-    n = 32
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=8)
-    s[B].bind(xo, bx)
-    s[B].bind(xi, tx)
-
-    node0 = {"op": "null", "name": "x", "inputs": []}
-    node1 = {
-        "op": "tvm_op",
-        "name": "add",
-        "inputs": [[0, 0, 0]],
-        "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"},
-    }
-    nodes = [node0, node1]
-    arg_nodes = [0]
-    node_row_ptr = [0, 1, 2]
-    outputs = [[1, 0, 0]]
-    shape = (n,)
-    attrs = {
-        "shape": ["list_shape", [shape, shape]],
-        "dltype": ["list_str", ["float32", "float32"]],
-        "storage_id": ["list_int", [0, 1]],
-    }
-    graph = {
-        "nodes": nodes,
-        "arg_nodes": arg_nodes,
-        "node_row_ptr": node_row_ptr,
-        "heads": outputs,
-        "attrs": attrs,
-    }
-    graph = json.dumps(graph)
-
-    def check_verify():
-        mlib = tvm.build(s, [A, B], "cuda", name="myadd")
-        dev = tvm.cuda(0)
-        try:
-            mod = cuda_graph_executor.create(graph, mlib, dev)
-        except ValueError:
-            return
-
-        for i in range(3):
-            a = np.random.uniform(size=(n,)).astype(A.dtype)
-            mod.run(x=a)  # The first run captured a CUDA graph
-            out = mod.get_output(0, tvm.nd.empty((n,)))
-            np.testing.assert_equal(out.numpy(), a + 1)
-
-        # capture / run CUDA graph manually
-        mod.capture_cuda_graph()
-        a = np.random.uniform(size=(n,)).astype(A.dtype)
-        mod.set_input(x=a)
-        mod.run_cuda_graph()
-        out = mod.get_output(0, tvm.nd.empty((n,)))
-        np.testing.assert_equal(out.numpy(), a + 1)
-
-    check_verify()
-
-
-if __name__ == "__main__":
-    test_graph_simple()
diff --git a/tests/python/runtime/test_runtime_graph_debug.py b/tests/python/runtime/test_runtime_graph_debug.py
deleted file mode 100644
index 9111ed38db33..000000000000
--- a/tests/python/runtime/test_runtime_graph_debug.py
+++ /dev/null
@@ -1,303 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import json
-import os
-import re
-import sys
-import time
-from distutils.log import debug
-
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import rpc, te
-from tvm._ffi.base import TVMError
-from tvm.contrib import utils
-from tvm.contrib.debugger import debug_executor
-from tvm import relay
-
-# Constants for creating simple graphs, fixtures to avoid free globals
-@pytest.fixture
-def n():
-    return 4
-
-
-@pytest.fixture
-def A(n):
-    return te.placeholder((n,), name="A")
-
-
-@pytest.fixture
-def B(A):
-    return te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-
-
-@pytest.fixture
-def s(B):
-    return te.create_schedule(B.op)
-
-
-@pytest.fixture
-def mlib(s, A, B):
-    return tvm.build(s, [A, B], "llvm", name="myadd")
-
-
-@pytest.fixture
-def myadd(mlib):
-    def _myadd(*args):
-        to_return = mlib["myadd"](*args)
-        time.sleep(0.25)
-        return to_return
-
-    return _myadd
-
-
-@pytest.fixture
-def graph():
-    node0 = {"op": "null", "name": "x", "inputs": []}
-    node1 = {
-        "op": "tvm_op",
-        "name": "add",
-        "inputs": [[0, 0, 0]],
-        "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"},
-    }
-    nodes = [node0, node1]
-    arg_nodes = [0]
-    node_row_ptr = [0, 1, 2]
-    outputs = [[1, 0, 0]]
-    shape = (4,)
-    attrs = {
-        "shape": ["list_shape", [shape, shape]],
-        "dltype": ["list_str", ["float32", "float32"]],
-        "storage_id": ["list_int", [0, 1]],
-    }
-    graph = {
-        "nodes": nodes,
-        "arg_nodes": arg_nodes,
-        "node_row_ptr": node_row_ptr,
-        "heads": outputs,
-        "attrs": attrs,
-    }
-    graph = json.dumps(graph)
-    return graph
-
-
-@tvm.testing.requires_llvm
-@tvm.testing.requires_rpc
-@pytest.mark.skipif(
-    tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
-)
-def test_end_to_end_graph_simple(graph, n, A, B, s, myadd):
-    def check_verify():
-        mlib_proxy = tvm.support.FrontendTestModule()
-        mlib_proxy["myadd"] = myadd
-        mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
-
-        a = np.random.uniform(size=(n,)).astype(A.dtype)
-        mod.set_input(x=a)
-
-        # verify dumproot created
-        directory = mod._dump_path
-        assert os.path.exists(directory)
-
-        # verify graph is there
-        GRAPH_DUMP_FILE_NAME = "_tvmdbg_graph_dump.json"
-        assert len(os.listdir(directory)) == 1
-
-        # verify the file name is proper
-        graph_dump_path = os.path.join(directory, GRAPH_DUMP_FILE_NAME)
-        assert os.path.exists(graph_dump_path)
-
-        # verify the graph contains some expected keys
-        with open(graph_dump_path) as graph_f:
-            dumped_graph = json.load(graph_f)
-
-        assert isinstance(dumped_graph, dict)
-        for k in ("nodes", "arg_nodes", "node_row_ptr", "heads", "attrs"):
-            assert k in dumped_graph, f"key {k} not in dumped graph {graph!r}"
-
-        mod.run()
-        # Verify the tensors are dumped
-        assert len(os.listdir(directory)) > 1
-
-        debug_lines = mod.debug_datum.get_debug_result().split("\n")
-
-        def split_debug_line(i):
-            to_return = re.split(r"  [ ]*", debug_lines[i])
-            assert to_return[-1] == ""
-            to_return = to_return[:-1]  # strip empty trailing part
-            return to_return
-
-        assert split_debug_line(0) == [
-            "Node Name",
-            "Ops",
-            "Time(us)",
-            "Time(%)",
-            "Shape",
-            "Inputs",
-            "Outputs",
-            "Measurements(us)",
-        ]
-        myadd_lines = split_debug_line(2)
-        assert myadd_lines[0] == "add"
-        assert myadd_lines[1] == "myadd"
-        runtime_sec = float(myadd_lines[2]) / 1e6  # printed in us
-
-        # Ensure runtime is at least the sleep time and less than a unit prefix order of magnitude.
-        # Here we just care that the prefix is correct.
-        assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000
-
-        total_lines = split_debug_line(3)
-        assert total_lines[0] == "Total_time"
-        assert total_lines[2] == myadd_lines[2]
-
-        CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json"
-        assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))
-
-        with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
-            trace = json.load(f)
-        assert trace["displayTimeUnit"] == "ns"
-        events = trace["traceEvents"]
-        assert len(events) == 4
-        assert all(event["ph"] in ("B", "E") for event in events)
-        assert all(event["pid"] == 1 for event in events)
-        assert all(event["tid"] == 1 for event in events)
-        assert all(event["name"] == "x" for event in events[:2])
-        assert all(event["name"] == "add" for event in events[2:])
-        assert events[0]["ts"] == 0
-        assert events[0]["ph"] == "B"
-
-        # verify the output is correct
-        out = mod.get_output(0, tvm.nd.empty((n,)))
-        np.testing.assert_equal(out.numpy(), a + 1)
-
-        mod.exit()
-        # verify dump root delete after cleanup
-        assert not os.path.exists(directory)
-
-    def check_remote(server):
-        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
-        remote = rpc.connect(server.host, server.port)
-        temp = utils.tempdir()
-        dev = remote.cpu(0)
-        path_dso = temp.relpath("dev_lib.so")
-        mlib.export_library(path_dso)
-        remote.upload(path_dso)
-        mlib = remote.load_module("dev_lib.so")
-        try:
-            mod = debug_executor.create(graph, mlib, remote.cpu(0))
-        except ValueError:
-            print("Skip because debug runtime not enabled")
-            return
-        a = np.random.uniform(size=(n,)).astype(A.dtype)
-        mod.run(x=tvm.nd.array(a, dev))
-        out = tvm.nd.empty((n,), device=dev)
-        out = mod.get_output(0, out)
-        np.testing.assert_equal(out.numpy(), a + 1)
-
-    check_verify()
-    check_remote(rpc.Server("127.0.0.1"))
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.skipif(
-    tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
-)
-def test_run_single_node(graph, n, A, myadd):
-    mlib_proxy = tvm.support.FrontendTestModule()
-    mlib_proxy["myadd"] = myadd
-    mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
-
-    a = np.random.uniform(size=(n,)).astype(A.dtype)
-    mod.set_input(x=a)
-
-    assert len(mod.debug_datum.get_graph_nodes()) == 2
-    assert mod.debug_datum.get_graph_nodes()[0]["op"] == "param"
-    assert mod.debug_datum.get_graph_nodes()[1]["op"] == "myadd"
-
-    # Running a node with no associated function should return instantly and have 0 runtime
-    assert mod.run_individual_node(0, number=1).mean == 0
-
-    # Meanwhile the actual function should take some time, more time if you run it more times
-    repeat_1_result = mod.run_individual_node(1, repeat=1)
-    assert repeat_1_result.mean > 0
-
-    # Running multiple times (10) should take longer than 1 time
-    repeat_3_results = mod.run_individual_node(1, repeat=3)
-    assert sum(repeat_3_results.results) > sum(repeat_1_result.results)
-
-    # Increasing the number of repeats should give you the number of results asked for
-    assert len(mod.run_individual_node(1, repeat=10).results) == 10
-
-    # Doing repeat_ms should have the run time greater than the asked amount
-    start = time.time()
-    mod.run_individual_node(1, min_repeat_ms=500)
-    end = time.time()
-    elapsed_time_in_seconds = end - start
-    assert elapsed_time_in_seconds >= 0.5
-
-    # Doing `cooldown_interval_ms` should have the execution time increases
-    start = time.time()
-    mod.run_individual_node(1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000)
-    end = time.time()
-    elapsed_time_in_seconds_with_def_rep = end - start
-    assert elapsed_time_in_seconds_with_def_rep >= 3
-
-    # Doing with `repeats_to_cooldown` not equal 1 should not trigger
-    # cooldown after each repeat
-    start = time.time()
-    mod.run_individual_node(
-        1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000, repeats_to_cooldown=2
-    )
-    end = time.time()
-    elapsed_time_in_seconds_with_rep_2 = end - start
-    assert elapsed_time_in_seconds_with_rep_2 >= 2 and (
-        elapsed_time_in_seconds_with_rep_2 < elapsed_time_in_seconds_with_def_rep
-    )
-
-    # Going out of bounds of node index throws a tvm error
-    with pytest.raises(TVMError):
-        mod.run_individual_node(2)
-
-
-@tvm.testing.requires_llvm
-def test_multiple_output():
-    x = relay.var("x", shape=(1, 3, 48, 16), dtype="float32")
-    t = relay.split(x, [12, 16, 32], 2).astuple()
-    x0 = relay.TupleGetItem(t, 0)
-    x1 = relay.TupleGetItem(t, 1)
-    x2 = relay.TupleGetItem(t, 2)
-    x3 = relay.TupleGetItem(t, 3)
-    p0 = relay.const(np.random.uniform(-1, 1, (3, 3, 1, 1)).astype("float32"))
-    y = relay.nn.conv2d(x2, p0, kernel_size=(1, 1), kernel_layout="OIHW", out_dtype="float32") + x3
-
-    func = relay.Function([x], relay.Tuple([x0, x1, y]))
-    mod = tvm.IRModule.from_expr(func)
-    mod = relay.transform.InferType()(mod)
-    target = tvm.target.Target("llvm")
-    device = tvm.cpu()
-    lib = relay.build(mod, target=target)
-    m = debug_executor.GraphModuleDebug(
-        lib["debug_create"]("default", device), [device], lib.get_graph_json(), None
-    )
-    nodes = m.debug_datum.get_graph_nodes()
-    assert nodes[2]["shape"] == [3, 3, 1, 1]
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/runtime/test_runtime_heterogeneous.py b/tests/python/runtime/test_runtime_heterogeneous.py
deleted file mode 100644
index 167f61d748c2..000000000000
--- a/tests/python/runtime/test_runtime_heterogeneous.py
+++ /dev/null
@@ -1,444 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=too-many-locals
-"""Unit tests for heterogeneous runtime"""
-import json
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.contrib import graph_executor, utils
-from tvm import topi
-
-
-def get_simplex_graph(host_dev_type, device_dev_type):
-    r""" Return the hand-crafted json object where only one copy node is
-    inserted. This node copies data from the target device to cpu.
-    The network is constructed as following:
-                 A    B
-                  \  /
-             elemwise_add  (gpu)
-                     \
-                     copy      C
-                       \      /
-                     elemwise_sub  (cpu)
-
-    Parameters
-    ----------
-    host_dev_type : int
-        The device type of the host processor, e.g. cpu.
-    device_dev_type : int
-        The device type of the device processor, e.g. gpu, opencl, etc.
-
-    Returns
-    -------
-    json : json
-        A json encoded object.
-    """
-    # Construct each node in the graph.
-    var_a = {"op": "null", "name": "A", "inputs": []}
-    var_b = {"op": "null", "name": "B", "inputs": []}
-    elemwise_add = {
-        "op": "tvm_op",
-        "name": "elemwise_add",
-        "attrs": {
-            "flatten_data": "1",
-            "func_name": "elemwise_add",
-            "num_inputs": "2",
-            "num_outputs": "1",
-        },
-        "inputs": [[0, 0, 0], [1, 0, 0]],
-    }
-    copy = {
-        "op": "tvm_op",
-        "name": "__copy_add_to_sub",
-        "attrs": {
-            "flatten_data": "0",
-            "func_name": "__copy",
-            "num_inputs": "1",
-            "num_outputs": "1",
-        },
-        "inputs": [[2, 0, 0]],
-    }
-    var_c = {"op": "null", "name": "C", "inputs": []}
-    elemwise_sub = {
-        "op": "tvm_op",
-        "name": "elemwise_sub",
-        "attrs": {
-            "flatten_data": "0",
-            "func_name": "elemwise_sub",
-            "num_inputs": "2",
-            "num_outputs": "1",
-        },
-        "inputs": [[3, 0, 0], [4, 0, 0]],
-    }
-
-    # Group the nodes.
-    nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub]
-    arg_nodes = [0, 1, 4]
-    node_row_ptr = [0, 1, 2, 3, 4, 5, 6]
-    heads = [[5, 0, 0]]
-    shape = (4,)
-    attrs = {
-        "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
-        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
-        "device_index": [
-            "list_int",
-            [
-                device_dev_type,
-                device_dev_type,
-                device_dev_type,
-                host_dev_type,
-                host_dev_type,
-                host_dev_type,
-            ],
-        ],
-        "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
-        "dltype": ["list_str", ["float32", "float32", "float32", "float32", "float32", "float32"]],
-    }
-
-    # Construct the graph.
-    graph = {
-        "nodes": nodes,
-        "arg_nodes": arg_nodes,
-        "node_row_ptr": node_row_ptr,
-        "heads": heads,
-        "attrs": attrs,
-    }
-    return json.dumps(graph)
-
-
-def test_simplex_data_transferring():
-    r"""
-    Test the heterogeneous execution of a simple network where data
-    transferring is from the target device to the host processor at runtime.
-    The host processor is always assumed to be cpu, and the device varies.
-    """
-    host = "cpu"
-    target_host = "llvm"
-    host_dev = tvm.device(host)
-    if not tvm.runtime.enabled(target_host):
-        print("Skip test because llvm is not enabled.")
-        return
-
-    def check_device(device, target_device):
-        if not tvm.runtime.enabled(target_device):
-            print("Skip test because {} is not enabled.".format(target_device))
-            return
-
-        device_dev = tvm.device(device)
-        graph = get_simplex_graph(host_dev.device_type, device_dev.device_type)
-        shape = (4,)
-
-        # Create module for add whose target is the device.
-        tensor_a = te.placeholder(shape, name="A")
-        tensor_b = te.placeholder(shape, name="B")
-        elemwise_add = te.compute(
-            shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add"
-        )
-        target = topi.cpp.TEST_create_target(device)
-        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
-        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], name="elemwise_add")
-
-        # Insert copy. Neither compute nor schedule is required for the copy
-        # node. The compute will be performed at runtime which is just data
-        # copy from the input to the output.
-        tensor_copy = te.placeholder(shape, name="__copy")
-
-        # Create module for sub whose target is the host.
-        tensor_c = te.placeholder(shape, name="C")
-        elemwise_sub = te.compute(
-            shape, lambda *i: tensor_copy(*i) - tensor_c(*i), name="elemwise_sub"
-        )
-        schedule_sub = te.create_schedule(elemwise_sub.op)
-        lower_sub = tvm.lower(
-            schedule_sub, [tensor_copy, tensor_c, elemwise_sub], name="elemwise_sub"
-        )
-
-        target_flist = {target_device: lower_add, target_host: lower_sub}
-        target = tvm.target.Target(target, target_host)
-        mhost = tvm.build(target_flist, target=target)
-        dev = [host_dev, device_dev]
-        mod = graph_executor.create(graph, mhost, dev)
-        params = {}
-        params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype)
-        params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype)
-        params["C"] = tensor_c = np.random.uniform(size=shape).astype(tensor_c.dtype)
-        mod.set_input(**params)
-        mod.run()
-        out = mod.get_output(0, tvm.nd.empty(shape))
-        np.testing.assert_equal(out.numpy(), (tensor_a + tensor_b) - tensor_c)
-
-    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
-    for device, target in dev_tar.items():
-        with tvm.target.Target(device):
-            check_device(device, target)
-
-
-def get_duplex_graph(host_dev_type, device_dev_type):
-    r""" Return the hand-crafted json object where two copy nodes are inserted.
-    Data transferring happens back-and-forth between the target device and CPU.
-    The network is constructed as following:
-                 A    B
-                  \  /
-             elemwise_add  (gpu)
-                     \
-                     copy        C
-                       \        /
-                      elemwise_sub  (cpu)
-                         \
-                         copy          D
-                           \          /
-                           elemwise_add  (gpu)
-
-    Parameters
-    ----------
-    host_dev_type : int
-        The device type of the host processor, e.g. cpu.
-    device_dev_type : int
-        The device type of the device processor, e.g. gpu, opencl, etc.
-
-    Returns
-    -------
-    json : json
-        A json encoded object.
-    """
-    # Construct each node in the graph.
-    var_a = {"op": "null", "name": "A", "inputs": []}
-    var_b = {"op": "null", "name": "B", "inputs": []}
-    elemwise_add0 = {
-        "op": "tvm_op",
-        "name": "elemwise_add0",
-        "attrs": {
-            "flatten_data": "1",
-            "func_name": "elemwise_add0",
-            "num_inputs": "2",
-            "num_outputs": "1",
-        },
-        "inputs": [[0, 0, 0], [1, 0, 0]],
-    }
-    copy_add_sub = {
-        "op": "tvm_op",
-        "name": "__copy_add_to_sub",
-        "attrs": {
-            "flatten_data": "0",
-            "func_name": "__copy",
-            "num_inputs": "1",
-            "num_outputs": "1",
-        },
-        "inputs": [[2, 0, 0]],
-    }
-    var_c = {"op": "null", "name": "C", "inputs": []}
-    elemwise_sub = {
-        "op": "tvm_op",
-        "name": "elemwise_sub",
-        "attrs": {
-            "flatten_data": "0",
-            "func_name": "elemwise_sub",
-            "num_inputs": "2",
-            "num_outputs": "1",
-        },
-        "inputs": [[3, 0, 0], [4, 0, 0]],
-    }
-    copy_sub_add = {
-        "op": "tvm_op",
-        "name": "__copy_sub_to_add",
-        "attrs": {
-            "flatten_data": "0",
-            "func_name": "__copy",
-            "num_inputs": "1",
-            "num_outputs": "1",
-        },
-        "inputs": [[5, 0, 0]],
-    }
-    var_d = {"op": "null", "name": "D", "inputs": []}
-    elemwise_add1 = {
-        "op": "tvm_op",
-        "name": "elemwise_add1",
-        "attrs": {
-            "flatten_data": "0",
-            "func_name": "elemwise_add1",
-            "num_inputs": "2",
-            "num_outputs": "1",
-        },
-        "inputs": [[6, 0, 0], [7, 0, 0]],
-    }
-
-    # Group the nodes.
-    nodes = [
-        var_a,
-        var_b,
-        elemwise_add0,
-        copy_add_sub,
-        var_c,
-        elemwise_sub,
-        copy_sub_add,
-        var_d,
-        elemwise_add1,
-    ]
-    arg_nodes = [0, 1, 4, 7]
-    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-    heads = [[8, 0, 0]]
-    shape = (4,)
-    attrs = {
-        "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
-        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape, shape, shape, shape]],
-        "device_index": [
-            "list_int",
-            [
-                device_dev_type,
-                device_dev_type,
-                device_dev_type,
-                host_dev_type,
-                host_dev_type,
-                host_dev_type,
-                device_dev_type,
-                device_dev_type,
-                device_dev_type,
-            ],
-        ],
-        "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
-        "dltype": [
-            "list_str",
-            [
-                "float32",
-                "float32",
-                "float32",
-                "float32",
-                "float32",
-                "float32",
-                "float32",
-                "float32",
-                "float32",
-            ],
-        ],
-    }
-
-    # Construct the graph.
-    graph = {
-        "nodes": nodes,
-        "arg_nodes": arg_nodes,
-        "node_row_ptr": node_row_ptr,
-        "heads": heads,
-        "attrs": attrs,
-    }
-    return json.dumps(graph)
-
-
-def test_duplex_data_transferring():
-    r"""
-    Test the heterogeneous execution of a simple network where data
-    transferring occurs back-and-forth between the target device and host
-    processor.
-    The host processor is always assumed to be cpu, and the target device
-    varies.
-    """
-    host = "cpu"
-    target_host = "llvm"
-    host_dev = tvm.device(host)
-    if not tvm.runtime.enabled(target_host):
-        print("Skip test because llvm is not enabled.")
-        return
-
-    def check_device(device, target_device):
-        if not tvm.runtime.enabled(target_device):
-            print("Skip test because {} is not enabled.".format(target_device))
-            return
-
-        device_dev = tvm.device(device)
-        graph = get_duplex_graph(host_dev.device_type, device_dev.device_type)
-        shape = (4,)
-
-        # Insert copy nodes for data transferring between add and sub nodes.
-        # Transfers data from gpu to cpu.
-        copy_add_sub = te.placeholder(shape, name="__copy0")
-        # Transfers data from cpu to gpu.
-        copy_sub_add = te.placeholder(shape, name="__copy1")
-
-        # Create a module containing adds on the device.
-        tensor_a = te.placeholder(shape, name="A")
-        tensor_b = te.placeholder(shape, name="B")
-        tensor_d = te.placeholder(shape, name="D")
-        elemwise_add0 = te.compute(
-            shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add0"
-        )
-        elemwise_add1 = te.compute(
-            shape, lambda *i: copy_sub_add(*i) + tensor_d(*i), name="elemwise_add1"
-        )
-        target = topi.cpp.TEST_create_target(device)
-        add_schedule0 = topi.cpp.cuda.schedule_injective(target, [elemwise_add0])
-        lower_add0 = tvm.lower(
-            add_schedule0, [tensor_a, tensor_b, elemwise_add0], name="elemwise_add0"
-        )
-        add_schedule1 = topi.cpp.cuda.schedule_injective(target, [elemwise_add1])
-        lower_add1 = tvm.lower(
-            add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1"
-        )
-        # Create module for sub whose target is the host.
-        tensor_c = te.placeholder(shape, name="C")
-        elemwise_sub = te.compute(
-            shape, lambda *i: copy_add_sub(*i) - tensor_c(*i), name="elemwise_sub"
-        )
-        sub_schedule = te.create_schedule(elemwise_sub.op)
-        lower_sub = tvm.lower(
-            sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub"
-        )
-
-        lower_add0.update(lower_add1)
-        target_flist = {target_device: lower_add0, target_host: lower_sub}
-        target = tvm.target.Target(target, target_host)
-        mhost = tvm.build(target_flist, target=target)
-        dev = [host_dev, device_dev]
-        params = {}
-        params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype)
-        params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype)
-        params["C"] = tensor_c = np.random.uniform(size=shape).astype(tensor_c.dtype)
-        params["D"] = tensor_d = np.random.uniform(size=shape).astype(tensor_d.dtype)
-
-        def check_verify():
-            mod = graph_executor.create(graph, mhost, dev)
-            mod.set_input(**params)
-            mod.run()
-            out = mod.get_output(0, tvm.nd.empty(shape))
-            np.testing.assert_equal(out.numpy(), tensor_a + tensor_b - tensor_c + tensor_d)
-
-        def check_load_module():
-            temp = utils.tempdir()
-            path_lib = temp.relpath("deploy.so")
-            mhost.export_library(path_lib)
-            with open(temp.relpath("deploy.json"), "w") as out_file:
-                out_file.write(graph)
-            loaded_lib = tvm.runtime.load_module(path_lib)
-            loaded_graph = open(temp.relpath("deploy.json")).read()
-            mod = graph_executor.create(loaded_graph, loaded_lib, dev)
-            mod.set_input(**params)
-            mod.run()
-            out = mod.get_output(0, tvm.nd.empty(shape))
-            np.testing.assert_equal(out.numpy(), tensor_a + tensor_b - tensor_c + tensor_d)
-
-        check_verify()
-        check_load_module()
-
-    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
-    for device, target in dev_tar.items():
-        with tvm.target.Target(device):
-            check_device(device, target)
-
-
-if __name__ == "__main__":
-    test_simplex_data_transferring()
-    test_duplex_data_transferring()
diff --git a/tests/python/runtime/test_runtime_module_based_interface.py b/tests/python/runtime/test_runtime_module_based_interface.py
deleted file mode 100644
index 2c46838b942a..000000000000
--- a/tests/python/runtime/test_runtime_module_based_interface.py
+++ /dev/null
@@ -1,798 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import platform
-
-import numpy as np
-import pytest
-
-from tvm import relay, runtime
-from tvm.relay import testing
-import tvm
-from tvm.contrib import graph_executor
-from tvm.contrib.debugger import debug_executor
-from tvm.contrib.cuda_graph import cuda_graph_executor
-import tvm.testing
-import pytest
-
-
-def input_shape(mod):
-    return [int(x) for x in mod["main"].checked_type.arg_types[0].shape]
-
-
-def verify(data):
-    if not tvm.runtime.enabled("llvm"):
-        print("Skip because llvm is not enabled")
-        return
-    mod, params = relay.testing.synthetic.get_workload()
-    with relay.build_config(opt_level=3):
-        graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params)
-
-    dev = tvm.cpu()
-    module = graph_executor.create(graph, lib, dev)
-    module.set_input("data", data)
-    module.set_input(**graph_params)
-    module.run()
-    out = module.get_output(0).numpy()
-
-    return out
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.parametrize("target", ["llvm", "llvm -jit=mcjit"])
-def test_legacy_compatibility(target):
-    mod, params = relay.testing.synthetic.get_workload()
-    with relay.build_config(opt_level=3):
-        graph, lib, graph_params = relay.build_module.build(mod, target, params=params)
-    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-    dev = tvm.cpu()
-    module = graph_executor.create(graph, lib, dev)
-    module.set_input("data", data)
-    module.set_input(**graph_params)
-    module.run()
-    out = module.get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.parametrize("target", ["llvm", "llvm -jit=mcjit"])
-def test_cpu(target):
-    mod, params = relay.testing.synthetic.get_workload()
-    with relay.build_config(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, target, params=params)
-    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-    # raw api
-    dev = tvm.cpu()
-    gmod = complied_graph_lib["default"](dev)
-    set_input = gmod["set_input"]
-    run = gmod["run"]
-    get_output = gmod["get_output"]
-    set_input("data", tvm.nd.array(data))
-    run()
-    out = get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    # graph executor wrapper
-    gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev))
-    gmod.set_input("data", data)
-    gmod.run()
-    out = gmod.get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
-@tvm.testing.requires_llvm
-def test_cpu_get_graph_json():
-    mod, params = relay.testing.synthetic.get_workload()
-    with relay.build_config(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, "llvm", params=params)
-    from tvm.contrib import utils
-
-    temp = utils.tempdir()
-    file_name = "deploy_lib.so"
-    path_lib = temp.relpath(file_name)
-    complied_graph_lib.export_library(path_lib)
-    loaded_lib = tvm.runtime.load_module(path_lib)
-    json = loaded_lib["get_graph_json"]()
-    assert isinstance(json, str) == True
-    assert json.find("tvmgen_default_fused_nn_softmax_add") > -1
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.parametrize("target", ["llvm", "llvm -jit=mcjit"])
-def test_cpu_get_graph_params_run(target):
-    mod, params = relay.testing.synthetic.get_workload()
-    with tvm.transform.PassContext(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, target, params=params)
-    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-    dev = tvm.cpu()
-    from tvm.contrib import utils
-
-    temp = utils.tempdir()
-    file_name = "deploy_lib.so"
-    path_lib = temp.relpath(file_name)
-    complied_graph_lib.export_library(path_lib)
-
-    loaded_lib = tvm.runtime.load_module(path_lib)
-    loaded_params = loaded_lib["get_graph_params"]()
-
-    gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-    gmod.set_input(key="data", value=data, **loaded_params)
-    gmod.run()
-    out = gmod.get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
-@tvm.testing.requires_llvm
-def test_cpu_get_graph_params_compare():
-    # Create sample net
-    from tvm.relay.testing.init import create_workload, Constant
-
-    inp_shape = (1, 3, 24, 12)
-    dtype = "float32"
-    data = relay.var("data", shape=inp_shape, dtype=dtype)
-    conv_shape = [inp_shape[1], inp_shape[1], 3, 3]
-    conv = relay.nn.conv2d(
-        data,
-        relay.var("conv_weight", shape=conv_shape, dtype=dtype),
-        padding=1,
-        kernel_size=3,
-    )
-    args = relay.analysis.free_vars(conv)
-    func = relay.Function(args, conv)
-
-    mod, params = create_workload(func, initializer=Constant())
-
-    with tvm.transform.PassContext(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, "llvm", params=params)
-    from tvm.contrib import utils
-
-    temp = utils.tempdir()
-    file_name = "deploy_lib.so"
-    path_lib = temp.relpath(file_name)
-    complied_graph_lib.export_library(path_lib)
-
-    loaded_lib = tvm.runtime.load_module(path_lib)
-    loaded_params = loaded_lib["get_graph_params"]()
-
-    p0_squeezed = np.squeeze(loaded_params["p0"].numpy())
-    tvm.testing.assert_allclose(params["conv_weight"].numpy(), p0_squeezed, atol=1e-5)
-
-
-@tvm.testing.requires_cuda
-@tvm.testing.requires_gpu
-def test_gpu():
-    mod, params = relay.testing.synthetic.get_workload()
-    with relay.build_config(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
-    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-    dev = tvm.cuda()
-
-    # raw api
-    gmod = complied_graph_lib["default"](dev)
-    set_input = gmod["set_input"]
-    run = gmod["run"]
-    get_output = gmod["get_output"]
-    set_input("data", tvm.nd.array(data))
-    run()
-    out = get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    # graph executor wrapper
-    gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev))
-    gmod.set_input("data", data)
-    gmod.run()
-    out = gmod.get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
-@tvm.testing.uses_gpu
-def test_mod_export():
-    def verify_cpu_export(obj_format):
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "llvm", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib.export_library(path_lib)
-
-        # run the setup in a separate function, so the load_lib
-        # can get destructed right away
-        # test the robustness wrt to parent module destruction
-        def setup_gmod():
-            loaded_lib = tvm.runtime.load_module(path_lib)
-            dev = tvm.cpu(0)
-            return loaded_lib["default"](dev)
-
-        gmod = setup_gmod()
-        # raw api
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        set_input("data", tvm.nd.array(data))
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(setup_gmod())
-        gmod.set_input("data", data)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    def verify_gpu_export(obj_format):
-        if not tvm.testing.device_enabled("cuda"):
-            print("Skip because cuda is not enabled")
-            return
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib.export_library(path_lib)
-
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-
-        # run the setup in a separate function, so the load_lib
-        # can get destructed right away
-        # test the robustness wrt to parent module destruction
-        def setup_gmod():
-            loaded_lib = tvm.runtime.load_module(path_lib)
-            dev = tvm.cuda()
-            return loaded_lib["default"](dev)
-
-        gmod = setup_gmod()
-        # raw api
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        set_input("data", tvm.nd.array(data))
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(setup_gmod())
-        gmod.set_input("data", data)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    @tvm.testing.requires_llvm
-    def verify_rpc_cpu_export(obj_format):
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "llvm", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib.export_library(path_lib)
-
-        from tvm import rpc
-
-        remote = rpc.LocalSession()
-        remote.upload(path_lib)
-        loaded_lib = remote.load_module(path_lib)
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        dev = remote.cpu()
-
-        # raw api
-        gmod = loaded_lib["default"](dev)
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        set_input("data", tvm.nd.array(data, device=dev))
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-        gmod.set_input("data", data)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    def verify_rpc_gpu_export(obj_format):
-        if not tvm.testing.device_enabled("cuda"):
-            print("Skip because cuda is not enabled")
-            return
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib.export_library(path_lib)
-
-        from tvm import rpc
-
-        def check_remote(server):
-            remote = rpc.connect(server.host, server.port)
-            remote.upload(path_lib)
-            loaded_lib = remote.load_module(path_lib)
-            data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-            dev = remote.cuda()
-
-            # raw api
-            gmod = loaded_lib["default"](dev)
-            set_input = gmod["set_input"]
-            run = gmod["run"]
-            get_output = gmod["get_output"]
-            set_input("data", tvm.nd.array(data, device=dev))
-            run()
-            out = get_output(0).numpy()
-            tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-            # graph executor wrapper
-            gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-            gmod.set_input("data", data)
-            gmod.run()
-            out = gmod.get_output(0).numpy()
-            tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        check_remote(rpc.Server("127.0.0.1"))
-
-    for obj_format in [".so", ".tar"]:
-        verify_cpu_export(obj_format)
-        verify_gpu_export(obj_format)
-        verify_rpc_cpu_export(obj_format)
-        verify_rpc_gpu_export(obj_format)
-
-
-@tvm.testing.requires_llvm
-@tvm.testing.uses_gpu
-def test_remove_package_params():
-    def verify_cpu_remove_package_params(obj_format):
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "llvm", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib_no_params = complied_graph_lib["remove_params"]()
-        complied_graph_lib_no_params.export_library(path_lib)
-        with open(temp.relpath("deploy_param.params"), "wb") as fo:
-            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        dev = tvm.cpu(0)
-
-        # raw api
-        gmod = loaded_lib["default"](dev)
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        load_params = gmod["load_params"]
-        loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
-        set_input("data", tvm.nd.array(data))
-        load_params(loaded_params)
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-        loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
-        gmod.set_input("data", data)
-        gmod.load_params(loaded_params)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    def verify_gpu_remove_package_params(obj_format):
-        if not tvm.testing.device_enabled("cuda"):
-            print("Skip because cuda is not enabled")
-            return
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib_no_params = complied_graph_lib["remove_params"]()
-        complied_graph_lib_no_params.export_library(path_lib)
-        with open(temp.relpath("deploy_param.params"), "wb") as fo:
-            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        dev = tvm.cuda(0)
-
-        # raw api
-        gmod = loaded_lib["default"](dev)
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        load_params = gmod["load_params"]
-        loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
-        set_input("data", tvm.nd.array(data))
-        load_params(loaded_params)
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-        loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
-        gmod.set_input("data", data)
-        gmod.load_params(loaded_params)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    @tvm.testing.requires_llvm
-    def verify_rpc_cpu_remove_package_params(obj_format):
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "llvm", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib_no_params = complied_graph_lib["remove_params"]()
-        complied_graph_lib_no_params.export_library(path_lib)
-        path_params = temp.relpath("deploy_param.params")
-        with open(path_params, "wb") as fo:
-            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
-
-        from tvm import rpc
-
-        remote = rpc.LocalSession()
-        remote.upload(path_lib)
-        loaded_lib = remote.load_module(path_lib)
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        dev = remote.cpu()
-
-        # raw api
-        gmod = loaded_lib["default"](dev)
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        load_params = gmod["load_params"]
-        loaded_params = bytearray(open(path_params, "rb").read())
-        set_input("data", tvm.nd.array(data, device=dev))
-        load_params(loaded_params)
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-        loaded_params = bytearray(open(path_params, "rb").read())
-        gmod.set_input("data", data)
-        gmod.load_params(loaded_params)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    def verify_rpc_gpu_remove_package_params(obj_format):
-        if not tvm.testing.device_enabled("cuda"):
-            print("Skip because cuda is not enabled")
-            return
-        mod, params = relay.testing.synthetic.get_workload()
-        with relay.build_config(opt_level=3):
-            complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
-
-        from tvm.contrib import utils
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        complied_graph_lib_no_params = complied_graph_lib["remove_params"]()
-        complied_graph_lib_no_params.export_library(path_lib)
-        path_params = temp.relpath("deploy_param.params")
-        with open(path_params, "wb") as fo:
-            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
-
-        from tvm import rpc
-
-        remote = rpc.LocalSession()
-        remote.upload(path_lib)
-        loaded_lib = remote.load_module(path_lib)
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        dev = remote.cuda()
-
-        # raw api
-        gmod = loaded_lib["default"](dev)
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        load_params = gmod["load_params"]
-        loaded_params = bytearray(open(path_params, "rb").read())
-        set_input("data", tvm.nd.array(data, device=dev))
-        load_params(loaded_params)
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-        loaded_params = bytearray(open(path_params, "rb").read())
-        gmod.set_input("data", data)
-        gmod.load_params(loaded_params)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    for obj_format in [".so", ".tar"]:
-        verify_cpu_remove_package_params(obj_format)
-        verify_gpu_remove_package_params(obj_format)
-        verify_rpc_cpu_remove_package_params(obj_format)
-        verify_rpc_gpu_remove_package_params(obj_format)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.parametrize("target", ["llvm", "llvm -jit=mcjit"])
-def test_debug_graph_executor(target):
-    mod, params = relay.testing.synthetic.get_workload()
-    with relay.build_config(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, target, params=params)
-    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-
-    # raw api
-    dev = tvm.cpu()
-    try:
-        gmod = complied_graph_lib["debug_create"]("default", dev)
-    except:
-        print("Skip because debug graph_executor not enabled")
-        return
-    set_input = gmod["set_input"]
-    run = gmod["run"]
-    get_output = gmod["get_output"]
-    set_input("data", tvm.nd.array(data))
-    run()
-    out = get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    # debug graph executor wrapper
-    debug_g_mod = debug_executor.GraphModuleDebug(
-        complied_graph_lib["debug_create"]("default", dev),
-        [dev],
-        complied_graph_lib.get_graph_json(),
-        None,
-    )
-    debug_g_mod.set_input("data", data)
-    debug_g_mod.run()
-    out = debug_g_mod.get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
-@tvm.testing.requires_cudagraph
-def test_cuda_graph_executor():
-    mod, params = relay.testing.synthetic.get_workload()
-    with tvm.transform.PassContext(opt_level=3):
-        complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
-    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-
-    dev = tvm.cuda()
-    try:
-        gmod = complied_graph_lib["cuda_graph_create"](dev)
-    except:
-        print("Skip because cuda_graph not enabled")
-        return
-    set_input = gmod["set_input"]
-    run = gmod["run"]
-    get_output = gmod["get_output"]
-    set_input("data", tvm.nd.array(data))
-    run()
-    out = get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-    # cuda graph executor wrapper
-    cu_gmod = cuda_graph_executor.GraphModuleCudaGraph(gmod)
-    cu_gmod.set_input("data", data)
-    cu_gmod.run()
-    out = cu_gmod.get_output(0).numpy()
-    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-
-def test_multiple_imported_modules():
-    def make_func(symbol):
-        n = tvm.te.size_var("n")
-        Ab = tvm.tir.decl_buffer((n,), dtype="float32")
-        i = tvm.te.var("i")
-        stmt = tvm.tir.For(
-            i,
-            0,
-            n - 1,
-            tvm.tir.ForKind.SERIAL,
-            tvm.tir.BufferStore(Ab, tvm.tir.BufferLoad(Ab, [i]) + 1, [i + 1]),
-        )
-        return tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", symbol)
-
-    def make_module(mod):
-        mod = tvm.IRModule(mod)
-        mod = tvm.driver.build(mod, target="llvm")
-        return mod
-
-    module_main = make_module({"main": make_func("main")})
-    module_a = make_module({"func_a": make_func("func_a")})
-    module_b = make_module({"func_b": make_func("func_b")})
-    module_main.import_module(module_a)
-    module_main.import_module(module_b)
-    module_main.get_function("func_a", query_imports=True)
-    module_main.get_function("func_b", query_imports=True)
-
-
-def test_num_threads():
-    reported = tvm.runtime.num_threads()
-    env_threads = os.getenv("TVM_NUM_THREADS")
-    omp_env_threads = os.getenv("OMP_NUM_THREADS")
-    if env_threads is not None:
-        assert reported == int(env_threads)
-    elif omp_env_threads is not None:
-        assert reported == int(omp_env_threads)
-    else:
-        hardware_threads = os.cpu_count()
-        assert reported == hardware_threads or reported == hardware_threads // 2
-
-
-@tvm.testing.requires_llvm
-@tvm.testing.requires_package("torch")
-def test_graph_module_zero_copy():
-    mod = tvm.IRModule()
-    params = {}
-    dev = tvm.cpu()
-    x = relay.var("x", shape=(1, 10))
-    y = relay.var("y", shape=(1, 10))
-    z = relay.add(x, y)
-    mod["main"] = relay.Function([x, y], z)
-
-    # need torch to do the from_dlpack trick
-    import torch
-
-    compiled_graph_lib = relay.build(mod, target="llvm", params=params)
-    gm = graph_executor.GraphModule(compiled_graph_lib["default"](dev))
-    x_data = torch.rand((1, 10))
-    y_data = torch.rand((1, 10))
-    z_data = torch.rand((1, 10))
-    z_torch = x_data + y_data
-
-    # zero copy run
-    assert not np.allclose(z_data.numpy(), z_torch.numpy())
-    gm.set_input_zero_copy("x", tvm.nd.from_dlpack(x_data))
-    gm.set_input_zero_copy("y", tvm.nd.from_dlpack(y_data))
-    gm.set_output_zero_copy(0, tvm.nd.from_dlpack(z_data))
-    gm.run()
-
-    tvm.testing.assert_allclose(z_data.numpy(), z_torch.numpy())
-
-    # zero input copy with params
-    gm = graph_executor.GraphModule(compiled_graph_lib["default"](dev))
-    gm.set_input_zero_copy(x=tvm.nd.from_dlpack(x_data), y=tvm.nd.from_dlpack(y_data))
-    gm.run()
-
-    tvm.testing.assert_allclose(gm.get_output(0).numpy(), z_torch.numpy())
-
-
-@tvm.testing.requires_llvm
-def test_reshape_zero_copy():
-    shape0 = (56, 224)
-    shape1 = (112, 112)
-    in_name0 = "infeats0"
-    in_name1 = "infeats1"
-    x0 = relay.var(in_name0, shape=shape0, dtype="float32")
-    x0 = relay.reshape(x0, shape1)
-
-    x1 = relay.var(in_name1, shape=shape1, dtype="float32")
-    mat = relay.nn.matmul(x0, x1)
-    _y = relay.reshape(mat, (-1))
-    func = relay.Function(relay.analysis.free_vars(_y), _y)
-    mod = tvm.IRModule.from_expr(func)
-
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target="llvm")
-    m = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
-
-    data_ndarray0 = tvm.nd.array(
-        np.random.random(shape0).astype(np.float32), device=tvm.device("llvm", 0)
-    )
-    data_ndarray1 = tvm.nd.array(
-        np.random.random(shape1).astype(np.float32), device=tvm.device("llvm", 0)
-    )
-
-    def expected():
-        m.set_input(in_name0, data_ndarray0)
-        m.set_input(in_name1, data_ndarray1)
-        m.run()
-        return m.get_output(0).numpy()
-
-    def zero_copy():
-        from tvm.relay.frontend.common import infer_shape
-
-        outshape = infer_shape(_y)
-        output_view = tvm.nd.empty(outshape, device=tvm.device("llvm", 0))
-        m.set_input_zero_copy(in_name0, data_ndarray0)
-        m.set_input_zero_copy(in_name1, data_ndarray1)
-        m.set_output_zero_copy(0, output_view)
-        m.run()
-        return output_view.numpy()
-
-    golden_out = expected()
-    out = zero_copy()
-    tvm.testing.assert_allclose(golden_out, out)
-
-
-if __name__ == "__main__":
-    test_legacy_compatibility()
-    test_cpu()
-    test_gpu()
-    test_mod_export()
-    test_remove_package_params()
-    test_debug_graph_executor()
-    test_multiple_imported_modules()
-    test_cpu_get_graph_json()
-    test_cpu_get_graph_params_run()
-    test_cpu_get_graph_params_compare()
-    test_graph_module_zero_copy()
-    test_reshape_zero_copy()
diff --git a/tests/python/runtime/test_runtime_module_export.py b/tests/python/runtime/test_runtime_module_export.py
index 3f6acca18b89..a6554f3a4f75 100644
--- a/tests/python/runtime/test_runtime_module_export.py
+++ b/tests/python/runtime/test_runtime_module_export.py
@@ -14,11 +14,10 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-from tvm import relay
-from tvm.relay import testing
+
 import tvm
-from tvm import te
 import tvm.testing
+import pytest
 
 from tvm.contrib import utils
 import os
@@ -62,6 +61,7 @@ def generate_engine_module():
     return csource_module
 
 
+@pytest.mark.skip("LEGACY-TEST: test to be replaced by relax")
 @tvm.testing.uses_gpu
 def test_mod_export():
     def verify_gpu_mod_export(obj_format):
@@ -221,8 +221,11 @@ def verify_multi_c_mod_export():
     verify_multi_c_mod_export()
 
 
+@pytest.mark.skip("LEGACY-TEST: test to be replaced by TensorIR")
 @tvm.testing.requires_llvm
 def test_import_static_library():
+    from tvm import te
+
     # Generate two LLVM modules.
     A = te.placeholder((1024,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
diff --git a/tests/python/runtime/test_runtime_module_load.py b/tests/python/runtime/test_runtime_module_load.py
index 87a8ef9f5e12..33bd281b045f 100644
--- a/tests/python/runtime/test_runtime_module_load.py
+++ b/tests/python/runtime/test_runtime_module_load.py
@@ -21,7 +21,6 @@
 import numpy as np
 import subprocess
 import tvm.testing
-from tvm.relay.backend import Runtime
 import pytest
 
 runtime_py = """
@@ -116,13 +115,8 @@ def check_device(device):
             return
         temp = utils.tempdir()
         name = "myadd_%s" % device
-        if sys.platform == "darwin" or sys.platform.startswith("linux"):
-            runtime = Runtime("cpp", {"system-lib": True})
-            f = tvm.build(s, [A, B], device, "llvm", runtime=runtime, name=name)
-        elif sys.platform == "win32":
-            f = tvm.build(s, [A, B], device, "llvm", name=name)
-        else:
-            raise ValueError("Unsupported platform")
+
+        f = tvm.build(s, [A, B], device, "llvm", name=name)
 
         path_dso = temp.relpath("dev_lib.so")
         # test cross compiler function
@@ -137,10 +131,6 @@ def popen_check():
             b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
             f1(a, b)
             np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-            if sys.platform != "win32":
-                f2 = tvm.runtime.system_lib()
-                f2[name](a, b)
-                np.testing.assert_equal(b.numpy(), a.numpy() + 1)
 
         # system lib should be loaded in different process
         worker = popen_pool.PopenWorker()
@@ -176,13 +166,14 @@ def test_combine_module_llvm():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
+    mod1 = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd1"))
+    mod2 = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd2"))
 
     def check_llvm():
         dev = tvm.cpu(0)
         temp = utils.tempdir()
-        fadd1 = tvm.build(s, [A, B], "llvm", name="myadd1")
-        fadd2 = tvm.build(s, [A, B], "llvm", name="myadd2")
+        fadd1 = tvm.build(mod1, "llvm")
+        fadd2 = tvm.build(mod2, "llvm")
         path1 = temp.relpath("myadd1.o")
         path2 = temp.relpath("myadd2.o")
         path_dso = temp.relpath("mylib.so")
@@ -206,9 +197,9 @@ def check_system_lib():
             print("Skip because llvm is not enabled")
             return
         temp = utils.tempdir()
-        runtime = Runtime("cpp", {"system-lib": True})
-        fadd1 = tvm.build(s, [A, B], "llvm", runtime=runtime, name="myadd1")
-        fadd2 = tvm.build(s, [A, B], "llvm", runtime=runtime, name="myadd2")
+        print("Running popen check")
+        fadd1 = tvm.build(mod1.with_attr("system_lib_prefix", ""), "llvm")
+        fadd2 = tvm.build(mod2.with_attr("system_lib_prefix", ""), "llvm")
         path1 = temp.relpath("myadd1.o")
         path2 = temp.relpath("myadd2.o")
         path_dso = temp.relpath("mylib.so")
@@ -243,5 +234,3 @@ def popen_check():
 
 if __name__ == "__main__":
     test_combine_module_llvm()
-    test_device_module_dump()
-    test_dso_module_load()
diff --git a/tests/python/runtime/test_runtime_profiling.py b/tests/python/runtime/test_runtime_profiling.py
deleted file mode 100644
index 7afcc5250d6f..000000000000
--- a/tests/python/runtime/test_runtime_profiling.py
+++ /dev/null
@@ -1,267 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-from io import StringIO
-import csv
-import os
-import json
-import platform
-
-import tvm.testing
-import tvm.utils
-from tvm.runtime import profiler_vm
-from tvm import relay
-from tvm.relay.testing import mlp
-from tvm.contrib.debugger import debug_executor
-from tvm import rpc
-from tvm.contrib import utils
-from tvm.runtime.profiling import Report
-from tvm.script import tir as T
-
-
-def read_csv(report):
-    f = StringIO(report.csv())
-    headers = []
-    rows = []
-    reader = csv.reader(f, delimiter=",")
-    # force parsing
-    in_header = True
-    for row in reader:
-        if in_header:
-            headers = row
-            in_header = False
-            rows = [[] for x in headers]
-        else:
-            for i in range(len(row)):
-                rows[i].append(row[i])
-    return dict(zip(headers, rows))
-
-
-@pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled")
-@tvm.testing.skip_if_wheel_test
-@tvm.testing.parametrize_targets
-def test_vm(target, dev):
-    dtype = "float32"
-    x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype=dtype)
-    y = relay.var("y", shape=(relay.Any(), relay.Any()), dtype=dtype)
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x, y], relay.add(x, y))
-    exe = relay.vm.compile(mod, target)
-    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
-
-    data = np.random.rand(28, 28).astype("float32")
-    report = vm.profile(data, data, func_name="main")
-    assert "fused_add" in str(report)
-    assert "Total" in str(report)
-    assert "AllocTensorReg" in str(report)
-    assert "AllocStorage" in str(report)
-    assert report.configuration["Executor"] == "VM"
-
-    csv = read_csv(report)
-    assert "Hash" in csv.keys()
-    # Ops should have a duration greater than zero.
-    assert all(
-        [
-            float(dur) > 0
-            for dur, name in zip(csv["Duration (us)"], csv["Name"])
-            if name[:5] == "fused"
-        ]
-    )
-    # AllocTensor or AllocStorage may be cached, so their duration could be 0.
-    assert all(
-        [
-            float(dur) >= 0
-            for dur, name in zip(csv["Duration (us)"], csv["Name"])
-            if name[:5] != "fused"
-        ]
-    )
-
-
-@tvm.testing.parametrize_targets
-def test_graph_executor(target, dev):
-    mod, params = mlp.get_workload(1)
-
-    exe = relay.build(mod, target, params=params)
-    gr = debug_executor.create(exe.get_graph_json(), exe.lib, dev)
-
-    data = np.random.rand(1, 1, 28, 28).astype("float32")
-    report = gr.profile(data=data)
-    assert "fused_nn_softmax" in str(report)
-    assert "Total" in str(report)
-    assert "Hash" in str(report)
-    assert "Graph" in str(report)
-
-
-@tvm.testing.parametrize_targets("cuda", "llvm")
-@pytest.mark.skipif(
-    tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None,
-    reason="PAPI profiling not enabled",
-)
-def test_papi(target, dev):
-    target = tvm.target.Target(target)
-    if str(target.kind) == "llvm":
-        metric = "PAPI_FP_OPS"
-    elif str(target.kind) == "cuda":
-        metric = "cuda:::event:shared_load:device=0"
-    else:
-        pytest.skip(f"Target {target.kind} not supported by this test")
-    mod, params = mlp.get_workload(1)
-
-    exe = relay.vm.compile(mod, target, params=params)
-    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
-
-    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
-    report = vm.profile(
-        data,
-        func_name="main",
-        collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})],
-    )
-    assert metric in str(report)
-
-    csv = read_csv(report)
-    assert metric in csv.keys()
-    assert any([float(x) > 0 for x in csv[metric]])
-
-
-@tvm.testing.requires_llvm
-def test_json():
-    mod, params = mlp.get_workload(1)
-
-    exe = relay.vm.compile(mod, "llvm", params=params)
-    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())
-
-    data = np.random.rand(1, 1, 28, 28).astype("float32")
-    report = vm.profile(data, func_name="main")
-    parsed = json.loads(report.json())
-    assert "device_metrics" in parsed
-    assert "calls" in parsed
-    assert "configuration" in parsed
-    assert "Duration (us)" in parsed["calls"][0]
-    assert "microseconds" in parsed["calls"][0]["Duration (us)"]
-    assert len(parsed["calls"]) > 0
-    for call in parsed["calls"]:
-        assert isinstance(call["Name"]["string"], str)
-        assert isinstance(call["Count"]["count"], int)
-        assert isinstance(call["Duration (us)"]["microseconds"], float)
-
-
-@tvm.testing.requires_llvm
-def test_rpc_vm():
-    server = rpc.Server(key="profiling")
-    remote = rpc.connect("127.0.0.1", server.port, key="profiling")
-
-    mod, params = mlp.get_workload(1)
-    exe = relay.vm.compile(mod, "llvm", params=params)
-    temp = utils.tempdir()
-    path = temp.relpath("lib.tar")
-    exe.mod.export_library(path)
-    remote.upload(path)
-    rexec = remote.load_module("lib.tar")
-    vm = profiler_vm.VirtualMachineProfiler(rexec, remote.cpu())
-    report = vm.profile(tvm.nd.array(np.ones((1, 1, 28, 28), dtype="float32"), device=remote.cpu()))
-    assert len(report.calls) > 0
-
-
-def test_rpc_graph():
-    server = rpc.Server(key="profiling")
-    remote = rpc.connect("127.0.0.1", server.port, key="profiling")
-
-    mod, params = mlp.get_workload(1)
-    exe = relay.build(mod, "llvm", params=params)
-    temp = utils.tempdir()
-    path = temp.relpath("lib.tar")
-    exe.export_library(path)
-    remote.upload(path)
-    rexec = remote.load_module("lib.tar")
-
-    gr = debug_executor.create(exe.get_graph_json(), rexec, remote.cpu())
-
-    data = np.random.rand(1, 1, 28, 28).astype("float32")
-    report = gr.profile(data=data)
-    assert len(report.calls) > 0
-
-
-def test_report_serialization():
-    mod, params = mlp.get_workload(1)
-
-    exe = relay.vm.compile(mod, "llvm", params=params)
-    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())
-
-    data = np.random.rand(1, 1, 28, 28).astype("float32")
-    report = vm.profile(data, func_name="main")
-
-    report2 = Report.from_json(report.json())
-    # Equality on reports compares pointers, so we compare the printed
-    # results instead.
-
-    # Use .table() instead of str(), because str() includes aggregate
-    # and column summations whose values may be impacted by otherwise
-    # negligible conversion errors. (2 occurrences / 3000 trials)
-    assert report.table(aggregate=False, col_sums=False) == report2.table(
-        aggregate=False, col_sums=False
-    )
-
-
-@T.prim_func
-def axpy_cpu(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, [10], "float64")
-    B = T.match_buffer(b, [10], "float64")
-    C = T.match_buffer(c, [10], "float64")
-    for i in range(10):
-        C[i] = A[i] + B[i]
-
-
-@T.prim_func
-def axpy_gpu(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, [10], "float64")
-    B = T.match_buffer(b, [10], "float64")
-    C = T.match_buffer(c, [10], "float64")
-    for i in T.thread_binding(0, 10, "threadIdx.x"):
-        C[i] = A[i] + B[i]
-
-
-@tvm.testing.parametrize_targets("cuda", "llvm")
-@pytest.mark.skipif(
-    tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None,
-    reason="PAPI profiling not enabled",
-)
-def test_profile_function(target, dev):
-    target = tvm.target.Target(target)
-    if str(target.kind) == "llvm":
-        metric = "PAPI_FP_OPS"
-        func = axpy_cpu
-    elif str(target.kind) == "cuda":
-        metric = (
-            "cuda:::gpu__compute_memory_access_throughput.max.pct_of_peak_sustained_region:device=0"
-        )
-        func = axpy_gpu
-    else:
-        pytest.skip(f"Target {target.kind} not supported by this test")
-    f = tvm.build(func, target=target)
-    a = tvm.nd.array(np.ones(10), device=dev)
-    b = tvm.nd.array(np.ones(10), device=dev)
-    c = tvm.nd.array(np.zeros(10), device=dev)
-    report = tvm.runtime.profiling.profile_function(
-        f, dev, [tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})]
-    )(a, b, c)
-    assert metric in report.keys()
-    assert report[metric].value > 0
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/runtime/test_runtime_rpc.py b/tests/python/runtime/test_runtime_rpc.py
index fbdc33928b6e..31cab2819df1 100644
--- a/tests/python/runtime/test_runtime_rpc.py
+++ b/tests/python/runtime/test_runtime_rpc.py
@@ -30,7 +30,6 @@
 
 from tvm import te
 from tvm import rpc
-from tvm.relay.backend import Runtime
 from tvm.contrib import utils, cc
 from tvm.rpc.tracker import Tracker
 from tvm.rpc.proxy import Proxy
@@ -251,7 +250,7 @@ def test_rpc_remote_module():
     n = tvm.runtime.convert(102)
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
+    mod = tvm.ir.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd"))
 
     server0 = rpc.Server(key="x0")
     server1 = rpc.Server(key="x1")
@@ -266,7 +265,7 @@ def test_rpc_remote_module():
     def check_remote(remote):
         temp = utils.tempdir()
         dev = remote.cpu(0)
-        f = tvm.build(s, [A, B], "llvm", name="myadd")
+        f = tvm.build(mod, "llvm")
         path_dso = temp.relpath("dev_lib.so")
         f.export_library(path_dso)
         remote.upload(path_dso)
@@ -296,8 +295,8 @@ def check_minrpc():
             return
         # export to minrpc
         temp = utils.tempdir()
-        runtime = Runtime("cpp", {"system-lib": True})
-        f = tvm.build(s, [A, B], "llvm", name="myadd", runtime=runtime)
+        # system lib prefix will trigger system lib build
+        f = tvm.build(mod.with_attr("system_lib_prefix", ""), "llvm")
         path_minrpc = temp.relpath("dev_lib.minrpc")
         f.export_library(path_minrpc, fcompile=rpc.with_minrpc(cc.create_executable))
 
@@ -333,29 +332,14 @@ def check_remote_link_cl(remote):
             return
         temp = utils.tempdir()
         dev = remote.cl(0)
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=32)
-        s[B].bind(xo, te.thread_axis("blockIdx.x"))
-        s[B].bind(xi, te.thread_axis("threadIdx.x"))
-        f = tvm.build(s, [A, B], "opencl --host=llvm", name="myadd")
-        # Option 1: save modules separately and rely on remote compiler
-        path_o = temp.relpath("myadd.o")
-        path_cl = temp.relpath("myadd.cl")
-        path_json = temp.relpath("myadd.tvm_meta.json")
-        f.save(path_o)
-        f.imported_modules[0].save(path_cl)
-        remote.upload(path_o)
-        remote.upload(path_cl)
-        # upload meta data
-        remote.upload(path_json)
-        fhost = remote.load_module("myadd.o")
-        fdev = remote.load_module("myadd.cl")
-        fhost.import_module(fdev)
-        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev)
-        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev)
-        fhost(a, b)
-        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-        # Option 2: export library as a tar ball then handled by remote compiler
+
+        s = tvm.tir.Schedule(mod)
+
+        x = s.get_loops(s.get_block("B"))
+        xo, xi = s.split(x, factors=[None, 32])
+        s.bind(xo, "blockIdx.x")
+        s.bind(xi, "threadIdx.x")
+        f = tvm.build(s.mod, "opencl --host=llvm")
         path_tar = temp.relpath("myadd.tar")
         f.export_library(path_tar)
         remote.upload(path_tar)
diff --git a/tests/python/runtime/test_runtime_vm_profiler.py b/tests/python/runtime/test_runtime_vm_profiler.py
deleted file mode 100644
index 3559e11f8e72..000000000000
--- a/tests/python/runtime/test_runtime_vm_profiler.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm.runtime import profiler_vm
-from tvm import relay
-from tvm.relay.testing import mlp
-
-
-@tvm.testing.parametrize_targets
-def test_basic(dev, target):
-    mod, params = mlp.get_workload(batch_size=1)
-    if not profiler_vm.enabled():
-        return
-
-    exe = relay.vm.compile(mod, target, params=params)
-    code, lib = exe.save()
-    des_exe = tvm.runtime.vm.Executable.load_exec(code, lib)
-    vm = profiler_vm.VirtualMachineProfiler(des_exe, dev)
-
-    data = np.random.rand(1, 1, 28, 28).astype("float32")
-    res = vm.profile(tvm.nd.array(data), func_name="main")
-    assert "softmax" in str(res)
-
-
-def test_vm_reshape_and_copy():
-    target = "llvm"
-    dev = tvm.gpu()
-    x_np = np.random.uniform(size=(8, 16)).astype("float32")
-    x = relay.var("x", shape=(8, 16), dtype="float32")
-    y = relay.reshape(x, [-1, 4, 8])
-    mod = tvm.IRModule()
-    mod["main"] = relay.Function([x], y)
-    with tvm.transform.PassContext(opt_level=3):
-        exec = relay.vm.compile(mod, "llvm")
-    assert "reshape_tensor" in exec.bytecode
-    vm = profiler_vm.VirtualMachineProfiler(exec, dev)
-    vm.profile(tvm.nd.array(x_np))
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/target/test_arm_target.py b/tests/python/target/test_arm_target.py
index 158d941073c6..2fcac847a8e0 100644
--- a/tests/python/target/test_arm_target.py
+++ b/tests/python/target/test_arm_target.py
@@ -24,7 +24,6 @@
 
 import tvm
 from tvm.script import tir as T
-from tvm.topi.arm_cpu.conv2d_int8 import is_int8_hw_support
 from tvm.target import codegen
 
 llvm_version, arm_target, input_dtype, kernel_dtype, is_supported = tvm.testing.parameters(
@@ -49,27 +48,6 @@
 )
 
 
-def test_arm_conv2d_int8_support(
-    monkeypatch, llvm_version, arm_target, input_dtype, kernel_dtype, is_supported
-):
-    """Test ARM conv2d int8 support for different targets.
-
-    Parameters
-    ----------
-    arm_target : str
-        ARM CPU target.
-    input_dtype : str
-        Conv2d input data type.
-    kernel_dtype : Session
-        Conv2d kernel data type.
-    is_supported : bool
-        Expected result.
-    """
-    with tvm.target.Target(arm_target):
-        monkeypatch.setattr(codegen, "llvm_version_major", lambda: llvm_version)
-        assert is_int8_hw_support(input_dtype, kernel_dtype) == is_supported
-
-
 @pytest.fixture(scope="session")
 def sve_device_vector_length():
     c_code = r"""
diff --git a/tests/python/te/test_te_create_primfunc.py b/tests/python/te/test_te_create_primfunc.py
index 0fb64e8d0f32..486fc0b18c32 100644
--- a/tests/python/te/test_te_create_primfunc.py
+++ b/tests/python/te/test_te_create_primfunc.py
@@ -18,7 +18,7 @@
 import numpy as np
 import tvm
 import tvm.testing
-from tvm import te, tir, topi, relay
+from tvm import te, tir, topi
 from tvm.script import tir as T
 import pytest
 
@@ -640,59 +640,6 @@ def test_reshape():
     _check_workload(te_reshape, tir_reshape, index_dtype_override="int64")
 
 
-@T.prim_func
-def argmax_expected(
-    p0: T.Buffer((T.int64(1), T.int64(64), T.int64(56), T.int64(56)), "uint8"),
-    p0_red: T.Buffer((T.int64(1), T.int64(56), T.int64(56)), "int32"),
-):
-    T.func_attr({"global_symbol": "main", "tir.noalias": True})
-    p0_red_temp_v0 = T.alloc_buffer([T.int64(1), T.int64(56), T.int64(56)], dtype="int32")
-    p0_red_temp_v1 = T.alloc_buffer([T.int64(1), T.int64(56), T.int64(56)], dtype="uint8")
-    for ax0, ax1, ax2, k1 in T.grid(T.int64(1), T.int64(56), T.int64(56), T.int64(64)):
-        with T.block("p0_red_temp"):
-            v_ax0, v_ax1, v_ax2, v_k1 = T.axis.remap("SSSR", [ax0, ax1, ax2, k1])
-            T.reads(p0[v_ax0, v_k1, v_ax1, v_ax2])
-            T.writes(p0_red_temp_v0[v_ax0, v_ax1, v_ax2], p0_red_temp_v1[v_ax0, v_ax1, v_ax2])
-            with T.init():
-                p0_red_temp_v0[v_ax0, v_ax1, v_ax2] = -1
-                p0_red_temp_v1[v_ax0, v_ax1, v_ax2] = T.uint8(0)
-            v_p0_red_temp_v0: T.int64 = T.Select(
-                p0_red_temp_v1[v_ax0, v_ax1, v_ax2] > p0[v_ax0, v_k1, v_ax1, v_ax2]
-                or (
-                    p0_red_temp_v1[v_ax0, v_ax1, v_ax2] == p0[v_ax0, v_k1, v_ax1, v_ax2]
-                    and T.Cast("int64", p0_red_temp_v0[v_ax0, v_ax1, v_ax2]) < v_k1
-                ),
-                T.Cast("int64", p0_red_temp_v0[v_ax0, v_ax1, v_ax2]),
-                v_k1,
-            )
-            v_p0_red_temp_v1: T.uint8 = T.Select(
-                p0_red_temp_v1[v_ax0, v_ax1, v_ax2] > p0[v_ax0, v_k1, v_ax1, v_ax2],
-                p0_red_temp_v1[v_ax0, v_ax1, v_ax2],
-                p0[v_ax0, v_k1, v_ax1, v_ax2],
-            )
-            p0_red_temp_v0[v_ax0, v_ax1, v_ax2] = T.Cast("int32", v_p0_red_temp_v0)
-            p0_red_temp_v1[v_ax0, v_ax1, v_ax2] = v_p0_red_temp_v1
-    for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(56), T.int64(56)):
-        with T.block("p0_red"):
-            v_ax0, v_ax1, v_ax2 = T.axis.remap("SSS", [ax0, ax1, ax2])
-            T.reads(p0_red_temp_v0[v_ax0, v_ax1, v_ax2])
-            T.writes(p0_red[v_ax0, v_ax1, v_ax2])
-            p0_red[v_ax0, v_ax1, v_ax2] = p0_red_temp_v0[v_ax0, v_ax1, v_ax2]
-
-
-def test_argmax():
-    data = relay.var("data", shape=(1, 64, 56, 56), dtype="uint8")
-    mod = tvm.IRModule.from_expr(relay.argmax(data, axis=1))
-
-    target = tvm.target.Target("llvm")
-
-    opt_mod, _ = relay.optimize(mod, params={}, target=target)
-
-    prim_func = relay.backend.te_compiler.lower_to_primfunc(opt_mod["main"].body.op, target)
-
-    tvm.ir.assert_structural_equal(prim_func, argmax_expected)
-
-
 def te_resize2d_symbolic():
     oh = tir.Var("oh", "int64")
     ow = tir.Var("ow", "int64")
diff --git a/tests/python/te/test_te_tensor_overload.py b/tests/python/te/test_te_tensor_overload.py
deleted file mode 100644
index 6ee2bae3525d..000000000000
--- a/tests/python/te/test_te_tensor_overload.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-def test_operator_type_and_tags():
-    k = 1
-    n = te.var("n")
-    A = te.placeholder((), name="A")
-    B = te.placeholder((10, 5), name="B")
-    B1 = B[0]
-    B2 = B[0, 0]
-
-    assert isinstance(k + n, tvm.tir.PrimExpr)
-    assert isinstance(n + n, tvm.tir.PrimExpr)
-    assert isinstance(k + A, te.tensor.Tensor)
-    assert isinstance(A + k, te.tensor.Tensor)
-    assert isinstance(n + A, te.tensor.Tensor)
-    assert isinstance(A + n, te.tensor.Tensor)
-    assert isinstance(A + A, te.tensor.Tensor)
-
-    assert isinstance(k + B, te.tensor.Tensor)
-    assert isinstance(B + k, te.tensor.Tensor)
-    assert isinstance(n + B, te.tensor.Tensor)
-    assert isinstance(B + n, te.tensor.Tensor)
-    assert isinstance(A + B, te.tensor.Tensor)
-    assert isinstance(B + A, te.tensor.Tensor)
-    assert isinstance(B + B, te.tensor.Tensor)
-
-    assert (k + B).op.tag == topi.tag.ELEMWISE
-    assert (B + k).op.tag == topi.tag.ELEMWISE
-    assert (n + B).op.tag == topi.tag.ELEMWISE
-    assert (B + n).op.tag == topi.tag.ELEMWISE
-    assert (A + B).op.tag == topi.tag.BROADCAST
-    assert (B + A).op.tag == topi.tag.BROADCAST
-    assert (B + B).op.tag == topi.tag.BROADCAST
-
-    assert isinstance(k + B2, tvm.tir.PrimExpr)
-    assert isinstance(B2 + k, tvm.tir.PrimExpr)
-    assert isinstance(n + B2, tvm.tir.PrimExpr)
-    assert isinstance(B2 + n, tvm.tir.PrimExpr)
-    assert isinstance(B2 + B2, tvm.tir.PrimExpr)
-    assert isinstance(B2 + A, te.tensor.Tensor)
-    assert isinstance(A + B2, te.tensor.Tensor)
-    assert isinstance(B2 + B, te.tensor.Tensor)
-    assert isinstance(B + B2, te.tensor.Tensor)
-
-
-def test_combination():
-    k = 3
-    n = 5
-    m = 10
-    x = te.var("x")
-    A = te.placeholder((n, m), name="A")
-    B = te.placeholder((n, m), name="B")
-    C = te.placeholder((n, m), name="C")
-    D = k + A - B * C + x
-    s = te.create_schedule(D.op)
-    foo = tvm.build(s, [x, A, B, C, D], "llvm")
-    dev = tvm.cpu(0)
-    x = 2
-    a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), dev)
-    c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), dev)
-    d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev)
-    foo(x, a, b, c, d)
-    tvm.testing.assert_allclose(d.numpy(), k + a.numpy() - b.numpy() * c.numpy() + x)
-
-
-def verify_tensor_scalar_bop(shape, typ="add"):
-    """Verify non-constant Tensor and scalar binary operations."""
-    sh = [te.size_var("n%d" % i) for i in range(0, len(shape))]
-    k = te.var("k")
-    A = te.placeholder(sh, name="A")
-    if typ == "add":
-        B = A + k
-    elif typ == "sub":
-        B = A - k
-    elif typ == "mul":
-        B = A * k
-    elif typ == "div":
-        B = A / k
-    else:
-        raise NotImplementedError()
-
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        dev = tvm.device(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_elemwise_schedule(device)(B)
-
-        k_ = 2
-        foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ)
-        a_npy = np.random.uniform(size=shape).astype(A.dtype)
-        if typ == "add":
-            b_npy = a_npy + k_
-        elif typ == "sub":
-            b_npy = a_npy - k_
-        elif typ == "mul":
-            b_npy = a_npy * k_
-        elif typ == "div":
-            b_npy = a_npy / k_
-        else:
-            raise NotImplementedError()
-
-        a_nd = tvm.nd.array(a_npy, dev)
-        b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev)
-        foo(a_nd, b_nd, k_, *shape)
-        tvm.testing.assert_allclose(b_nd.numpy(), b_npy, rtol=1e-5)
-
-    for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]:
-        check_device(device)
-
-
-def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"):
-    A = te.placeholder(shape=lhs_shape, name="A")
-    B = te.placeholder(shape=rhs_shape, name="B")
-    if typ == "add":
-        C = A + B
-    elif typ == "sub":
-        C = A - B
-    elif typ == "mul":
-        C = A * B
-    elif typ == "div":
-        C = A / B
-    else:
-        raise NotImplementedError()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(C)
-
-        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
-        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
-        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
-        if typ == "add":
-            out_npy = lhs_npy + rhs_npy
-        elif typ == "sub":
-            out_npy = lhs_npy - rhs_npy
-        elif typ == "mul":
-            out_npy = lhs_npy * rhs_npy
-        elif typ == "div":
-            rhs_npy = np.abs(rhs_npy) + 0.001
-            out_npy = lhs_npy / rhs_npy
-        else:
-            raise NotImplementedError()
-
-        lhs_nd = tvm.nd.array(lhs_npy, dev)
-        rhs_nd = tvm.nd.array(rhs_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
-        for _ in range(1):
-            foo(lhs_nd, rhs_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy, rtol=1e-4, atol=1e-4)
-
-    for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def verify_conv2d_scalar_bop(
-    batch, in_size, in_channel, num_filter, kernel, stride, padding, typ="add"
-):
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-
-        conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(device)
-
-        k = 10.0
-        dilation = (1, 1)
-        with tvm.target.Target(device):
-            A = te.placeholder((batch, in_channel, in_size, in_size), name="A")
-            W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
-            B = conv2d_nchw(A, W, stride, padding, dilation, A.dtype)
-            if typ == "add":
-                C = B + k
-            elif typ == "sub":
-                C = B - k
-            elif typ == "mul":
-                C = B * k
-            elif typ == "div":
-                C = B / k
-            else:
-                raise NotImplementedError()
-            s = schedule_conv2d_nchw([C])
-
-        foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ)
-
-        a_npy = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
-        w_npy = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
-        b_npy = tvm.topi.testing.conv2d_nchw_python(a_npy, w_npy, stride, padding)
-        c_npy = np.random.uniform(size=get_const_tuple(B.shape)).astype(B.dtype)
-        if typ == "add":
-            c_npy = b_npy + k
-        elif typ == "sub":
-            c_npy = b_npy - k
-        elif typ == "mul":
-            c_npy = b_npy * k
-        elif typ == "div":
-            c_npy = b_npy / k
-        else:
-            raise NotImplementedError()
-
-        a_nd = tvm.nd.array(a_npy, dev)
-        w_nd = tvm.nd.array(w_npy, dev)
-        b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev)
-        c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), dev)
-        foo(a_nd, w_nd, b_nd, c_nd)
-        tvm.testing.assert_allclose(c_nd.numpy(), c_npy, rtol=1e-4, atol=1e-4)
-
-    for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def test_tensor_scalar_bop():
-    verify_tensor_scalar_bop((1,), typ="add")
-    verify_tensor_scalar_bop((3, 5), typ="sub")
-    verify_tensor_scalar_bop((1, 3, 5), typ="mul")
-    verify_tensor_scalar_bop((2, 3, 1, 32), typ="div")
-
-
-@tvm.testing.uses_gpu
-def test_broadcast_bop():
-    verify_broadcast_bop((2, 3), (), typ="add")
-    verify_broadcast_bop((5, 2, 3), (1,), typ="add")
-    verify_broadcast_bop((1, 32), (64, 32), typ="sub")
-    verify_broadcast_bop((5, 64, 128), (2, 5, 64, 1), typ="mul")
-    verify_broadcast_bop((2, 3, 1, 32), (64, 32), typ="div")
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_scalar_bop():
-    verify_conv2d_scalar_bop(1, 16, 4, 4, 3, 1, 1, typ="add")
-    verify_conv2d_scalar_bop(1, 32, 2, 1, 3, 1, 1, typ="sub")
-    verify_conv2d_scalar_bop(1, 32, 1, 1, 3, 1, 1, typ="mul")
-    verify_conv2d_scalar_bop(1, 16, 2, 1, 3, 1, 1, typ="div")
-
-
-if __name__ == "__main__":
-    test_operator_type_and_tags()
-    test_combination()
-    test_tensor_scalar_bop()
-    test_broadcast_bop()
-    test_conv2d_scalar_bop()
diff --git a/tests/python/testing/test_format_si_prefix.py b/tests/python/testing/test_format_si_prefix.py
deleted file mode 100644
index e0276ce022b8..000000000000
--- a/tests/python/testing/test_format_si_prefix.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from numpy import isclose
-import random
-from tvm.autotvm import utils
-
-
-SI_PREFIXES = "yzafpn\xb5m kMGTPEZY"
-
-
-def test_format_si_prefix():
-    # test float conversion
-    assert utils.format_si_prefix(1024, "k") == 1.024
-
-    for i, prefix in enumerate(SI_PREFIXES):
-        integer, decimal = random.randint(0, 1000), random.randint(0, 1000)
-        exp = -24 + 3 * i  # 0th prefix (yocto) is 10^-24
-        number = integer * (10**exp) + decimal * (10 ** (exp - 3))
-        expected = integer + decimal / 1000
-        assert isclose(utils.format_si_prefix(number, prefix), expected)
-
-    assert utils.format_si_prefix(0, "y") == 0
-
-
-if __name__ == "__main__":
-    test_format_si_prefix()
diff --git a/tests/python/tir-analysis/test_tir_analysis_device_constraint_utils.py b/tests/python/tir-analysis/test_tir_analysis_device_constraint_utils.py
deleted file mode 100644
index 9dcf47230009..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_device_constraint_utils.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test retrieving and applying memory scope constraints to PrimFuncs"""
-import tvm
-import tvm.testing
-from tvm import tir
-from tvm import relay
-from tvm.script import tir as T
-
-
-@T.prim_func
-def gem(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
-    A = T.match_buffer(a, [128, 128], scope="scopeA")
-    B = T.match_buffer(b, [128, 128], scope="scopeA")
-    C = T.match_buffer(c, [128, 128], scope="scopeB")
-    D = T.match_buffer(d, [128, 128], scope="scopeC")
-
-    for i, j, k in T.grid(128, 128, 128):
-        with T.block("update"):
-            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-            with T.init():
-                D[vi, vj] = C[vi, vj]
-            D[vi, vj] = D[vi, vj] + A[vi, vk] * B[vj, vk]
-
-
-gem_ty = relay.FuncType(
-    [
-        relay.TupleType(
-            [
-                relay.TensorType((128, 128), "float32"),
-                relay.TensorType((128, 128), "float32"),
-            ]
-        ),
-        relay.TensorType((128, 128), "float32"),
-    ],
-    relay.TensorType((128, 128), "float32"),
-)
-
-
-def test_get_prim_func_arg_and_result_constraints():
-    scopes = tir.analysis.get_prim_func_arg_and_result_memory_constraints(gem, gem_ty)
-    assert [x for x in scopes] == ["scopeA", "scopeB", "scopeC"]
-
-
-def test_apply_prim_func_arg_and_result_memory_constraints():
-    rewritten = tir.analysis.apply_prim_func_arg_and_result_memory_constraints(
-        gem, gem_ty, ["scopeX", "scopeY", "scopeZ"]
-    )
-    scopes = tir.analysis.get_prim_func_arg_and_result_memory_constraints(rewritten, gem_ty)
-    assert [x for x in scopes] == ["scopeX", "scopeY", "scopeZ"]
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-analysis/test_tir_analysis_stmt_finding.py b/tests/python/tir-analysis/test_tir_analysis_stmt_finding.py
deleted file mode 100644
index 72fb4898befd..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_stmt_finding.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-from tvm import te, topi
-from tvm.meta_schedule.testing.te_workload import conv2d_winograd_nhwc, matmul
-from tvm.tir.analysis import find_anchor_block
-
-
-def test_matmul_add():
-    n = m = k = 128
-    A, B, C = matmul(n, m, k)
-    mod = tvm.IRModule()
-    mod["main"] = te.create_prim_func([A, B, C + A])
-
-    block = find_anchor_block(mod)
-
-    assert block.name_hint == "C"
-
-
-def test_winograd():
-    mod = tvm.IRModule()
-    mod["main"] = te.create_prim_func(conv2d_winograd_nhwc(1, 14, 14, 128, 128, 6))
-
-    block = find_anchor_block(mod)
-
-    assert block.name_hint == "bgemm"
-
-
-def test_no_anchor_block():
-    inp = te.placeholder((10,), name="input")
-    out = topi.nn.relu(inp + 1.0)
-    mod = tvm.IRModule()
-    mod["main"] = te.create_prim_func([inp, out])
-
-    assert find_anchor_block(mod) is None
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py b/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
index a338f6c2149a..45a8a8138bd5 100644
--- a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
+++ b/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
@@ -19,7 +19,6 @@
 from tvm import te
 from tvm import topi
 import tvm.testing
-import tvm.topi.testing
 
 
 def get_verify_pass(valid, **kwargs):
@@ -431,32 +430,5 @@ def test_vthread():
             assert not valid[0]
 
 
-@tvm.testing.requires_gpu
-def test_redundant_kernels():
-    dtype = "float32"
-    A = te.placeholder(shape=(1,), name="A", dtype=dtype)
-    B = te.placeholder(shape=(1,), name="B", dtype=dtype)
-    C = te.placeholder(shape=(1,), name="C", dtype=dtype)
-    D = topi.less(A, C)
-    E = topi.less(B, C)
-    F = topi.logical_or(D, E)
-    G = topi.identity(F)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-        print("Running on target: %s" % target)
-        valid = [None]
-
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_reduce_schedule(target)(G)
-
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_kernels=1))]}
-        ):
-            tvm.build(s, [A, B, C, G], target)
-        assert valid[0]
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/tir-base/test_debug_info.py b/tests/python/tir-base/test_debug_info.py
index ecd25b3a6749..2e799815252d 100644
--- a/tests/python/tir-base/test_debug_info.py
+++ b/tests/python/tir-base/test_debug_info.py
@@ -18,7 +18,6 @@
 import tvm
 import tvm.testing
 from tvm import tir
-from tvm import relay
 from tvm.script import tir as T, ir as I
 
 from typing import List, Dict
diff --git a/tests/python/tir-transform/test_tir_transform_common_subexpr_elim.py b/tests/python/tir-transform/test_tir_transform_common_subexpr_elim.py
index f773e56e5ccb..5208262221b9 100644
--- a/tests/python/tir-transform/test_tir_transform_common_subexpr_elim.py
+++ b/tests/python/tir-transform/test_tir_transform_common_subexpr_elim.py
@@ -17,7 +17,7 @@
 import hashlib
 
 import tvm
-from tvm import auto_scheduler, te, topi
+from tvm import te, topi
 from tvm.ir.base import save_json
 from tvm.ir.module import IRModule
 from tvm.script import tir as T
@@ -446,50 +446,6 @@ def test_deterministic_cse():
         assert json_hash == initial_hash
 
 
-# Needed for the second test on determinism
-LOG_LINE = '{"i": [["[\\"conv2d_layer\\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", \
-            "llvm -keys=cpu -mcpu=broadwell -num-cores=2", \
-            [8, 64, 64, 0, 0, 0, 0, 0], "", 1, []], [[], [["CI", 5], \
-            ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 512, [1, 32, 16], 1], \
-            ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 7, [1, 1, 1], 1], \
-            ["SP", 3, 16, 512, [1], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 3, [3], 1], \
-            ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, \
-            11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], \
-            ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], \
-            ["CA", 3, 6, 7], ["CA", 1, 6, 5], ["FU", 6, [0, 1, 2, 3, 4, 5]], ["AN", 6, 0, 3], \
-            ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], \
-            ["AN", 6, 6, 2]]]], "r": [[0.0331129], 0, 0.900362, 1647464342], "v": "v0.6"}\n'
-
-
-# The workload associated with the log
-@auto_scheduler.register_workload
-def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
-    data = te.placeholder((N, CI, H, W), name="data")
-    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
-    bias = te.placeholder((1, CO, 1, 1), name="bias")
-    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32")
-    out = topi.nn.relu(conv + bias)
-    return [data, kernel, bias, out]
-
-
-def test_deterministic_cse_2():
-    inp, inr = auto_scheduler.measure_record.load_record_from_string(LOG_LINE)
-    inp = auto_scheduler.measure.recover_measure_input(inp, rebuild_state=True)
-
-    initial_hash = None
-
-    for _ in range(10):
-        sch, args = inp.task.compute_dag.apply_steps_from_state(inp.state)
-        ir_module = tvm.lower(sch, args)
-        primfunc = ir_module["main"]
-        json_str = save_json(primfunc)
-        new_hash = hashlib.sha256(json_str.encode("utf-8")).hexdigest()
-        # Make sure that all the hashes are going to be the same
-        if initial_hash is None:
-            initial_hash = new_hash
-        assert new_hash == initial_hash
-
-
 if __name__ == "__main__":
     # Basic test:
     test_cse()
@@ -505,4 +461,3 @@ def test_deterministic_cse_2():
     test_semantic_equiv_associativity()
     # Tests that verify the determinism of the pass:
     test_deterministic_cse()
-    test_deterministic_cse_2()
diff --git a/tests/python/tir-transform/test_tir_transform_hoist_if.py b/tests/python/tir-transform/test_tir_transform_hoist_if.py
index dd10e15853f1..04f3f9771c64 100644
--- a/tests/python/tir-transform/test_tir_transform_hoist_if.py
+++ b/tests/python/tir-transform/test_tir_transform_hoist_if.py
@@ -16,7 +16,6 @@
 # under the License.
 import tvm
 from tvm import te
-from tvm import relay
 import numpy as np
 import pytest
 from tvm.testing import enabled_targets
@@ -745,68 +744,5 @@ def test_hoisting_block_scope_7():
     assert not tvm.ir.structural_equal(new_stmt, stmt)
 
 
-@pytest.mark.skip()
-def test_hoisting_op_conv():
-    dtype = "float32"
-    dshape = (1, 80, 73, 73)
-    kshape = (192, 80, 3, 3)
-    padding = (1, 1)
-    groups = 1
-    dilation = (1, 1)
-    kernel_size = (3, 3)
-    channels = 192
-    scale = 1
-    x = relay.var("x", shape=dshape, dtype=dtype)
-    w = relay.var("w", shape=kshape, dtype=dtype)
-    y = relay.nn.conv2d(
-        x,
-        w,
-        padding=padding,
-        dilation=dilation,
-        groups=groups,
-        channels=channels,
-        kernel_size=kernel_size,
-    )
-
-    func = relay.Function([x, w], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    mod = relay.transform.InferType()(mod)
-
-    data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
-    kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
-
-    params = {"w": tvm.nd.array(kernel)}
-    for target, dev in enabled_targets():
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build_module.build(mod, target=target, params=params)
-            m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-            x = np.random.uniform(size=dshape)
-            data_tvm = tvm.nd.array(data)
-            m.set_input("x", data_tvm)
-            m.run()
-            e = m.module.time_evaluator("run", dev, number=300, repeat=3)
-            t1 = e(data_tvm).results
-            t1 = np.array(t1) * 1000
-            print("{} ms".format(t1.mean()))
-
-        with tvm.transform.PassContext(
-            opt_level=3, config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
-        ):
-            lib = relay.build_module.build(mod, target=target, params=params)
-            m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-            x = np.random.uniform(size=dshape)
-            data_tvm = tvm.nd.array(data)
-            m.set_input("x", data_tvm)
-            m.set_input(**params)
-            m.run()
-            e = m.module.time_evaluator("run", dev, number=300, repeat=3)
-            t2 = e(data_tvm).results
-            t2 = np.array(t2) * 1000
-
-            print("{} ms".format(t2.mean()))
-        tvm.testing.assert_allclose(t1.mean(), t2.mean(), atol=1, rtol=1e-1)
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
index cf85f2e3714c..5ebdbe986082 100644
--- a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
+++ b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import relay, te
+from tvm import te
 from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 from tvm.tir import const
@@ -208,76 +208,6 @@ def check(m, n, target_bits, target_dtype):
     )
 
 
-def test_relay_basic():
-    engine = relay.backend.te_compiler.get()
-
-    def check(shapex, shapey, target_bits, target_dtype):
-        x = relay.var("x", shape=shapex)
-        y = relay.var("y", shape=shapey)
-        z = relay.add(x, y)
-        func = relay.Function([x, y], z)
-        mod = tvm.IRModule.from_expr(func)
-        mod = relay.transform.InferType()(mod)
-        func = mod["main"]
-        z = engine.lower(func, "llvm")
-        stmt = lower_sch(z.schedule, tuple(z.inputs) + tuple(z.outputs), 32)
-        # outer loop
-        assert stmt.loop_var.dtype == target_dtype
-        # inner loop
-        if len(shapex) > 1 or len(shapey) > 1:
-            assert stmt.body.loop_var.dtype == target_dtype
-
-    check(
-        (const(2**16, "int64"), const(2**15 + 1, "int64")),
-        (1, const(2**15 + 1, "int64")),
-        target_bits=32,
-        target_dtype="int64",
-    )
-    check(
-        (const(2**16, "int64"), const(2**15, "int64")),
-        (1, const(2**15, "int64")),
-        target_bits=32,
-        target_dtype="int32",
-    )
-    check(
-        (const(2**31, "int64"),), (const(2**31, "int64"),), target_bits=32, target_dtype="int32"
-    )
-    check(
-        (const(2**31 + 1, "int64"),),
-        (const(2**31 + 1, "int64"),),
-        target_bits=32,
-        target_dtype="int64",
-    )
-
-
-def test_relay_take():
-    engine = relay.backend.te_compiler.get()
-
-    def check(shape, index, target_bits, target_dtype):
-        x = relay.var("x", shape=shape)
-        y = relay.op.take(x, indices=index)
-        func = relay.Function([x], y)
-        mod = tvm.IRModule.from_expr(func)
-        mod = relay.transform.InferType()(mod)
-        func = mod["main"]
-        z = engine.lower(func, "llvm")
-        stmt = lower_sch(z.schedule, tuple(z.inputs) + tuple(z.outputs), 32)
-        assert stmt.value.indices[0].dtype == target_dtype
-
-    check(
-        (const(2**16, "int64"), const(2**15 + 1, "int64")),
-        relay.const(0, dtype="int64"),
-        target_bits=32,
-        target_dtype="int32",
-    )
-    check(
-        (const(2**16, "int64"), const(2**15 + 1, "int64")),
-        relay.const(2**31, dtype="int64"),
-        target_bits=32,
-        target_dtype="int64",
-    )
-
-
 def test_ramp_dtype_consistency():
     """
     for (i :int64, (int64)0, (int64)4) {
diff --git a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
index bb76bd235f15..1a1e780a7272 100644
--- a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
+++ b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
@@ -20,8 +20,7 @@
 import tvm.testing
 from tvm import te
 from tvm.script import tir as T
-from tvm import relay, tir
-from tvm.relay.backend.te_compiler import lower_to_primfunc
+from tvm import tir
 from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
 
 
@@ -365,57 +364,6 @@ def after(A: T.Buffer((4, 16), "int32"), C: T.Buffer((4, 8), "int32")):
     _check(before, after)
 
 
-def test_allocate_const_after_tensorize():
-    i_size, o_size, h_size, w_size = 64, 64, 56, 56
-    k_height_size = k_width_size = 3
-    w_shape = (o_size, i_size, k_height_size, k_width_size)
-
-    data = relay.var("data", shape=(1, i_size, h_size, w_size), dtype="uint8")
-    weight = relay.var("weight", shape=w_shape, dtype="uint8")
-    conv2d = relay.nn.conv2d(
-        data=data,
-        weight=weight,
-        kernel_size=(k_height_size, k_width_size),
-        channels=o_size,
-        padding=(0, 0),
-        strides=(1, 1),
-        out_dtype="int32",
-    )
-    mod = tvm.IRModule.from_expr(conv2d)
-
-    executor = relay.backend.Executor("graph", {"link-params": True})
-    mod = mod.with_attr("executor", executor)
-
-    weight_np = np.random.uniform(1, 10, size=w_shape).astype("uint8")
-
-    target = tvm.target.Target("hexagon")
-
-    with tvm.transform.PassContext(opt_level=3):
-        opt_mod, _ = relay.optimize(mod, params={"weight": weight_np}, target=target)
-
-    conv2d_func = opt_mod["main"].body.args[0].op
-    prim_func = lower_to_primfunc(conv2d_func, target)
-
-    sch = tir.Schedule(prim_func)
-    block = sch.get_block("conv2d_NCHWc_int8")
-    loops = sch.get_loops(block)
-
-    sch.reorder(loops[8], loops[4], loops[-1])
-    sch.decompose_reduction(block, loops[1])
-    sch.tensorize(loops[4], VRMPY_u8u8i32_INTRIN)
-
-    seq = tvm.transform.Sequential(
-        [
-            tvm.tir.transform.LowerInitBlock(),
-            tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(),
-        ]
-    )
-
-    # The following error is emitted if AllocateConst nodes are not correctly handled:
-    #  Check failed: (buffer_data_to_buffer_.count(source_var)) is false:
-    _ = seq(sch.mod)
-
-
 def test_buffer_conditional_lowering():
     """Buffers passed as pointer arguments are unmodified
 
diff --git a/tests/python/tir-transform/test_tir_transform_storage_flatten.py b/tests/python/tir-transform/test_tir_transform_storage_flatten.py
index 8ddfbb5adfd3..4a81ab93c763 100644
--- a/tests/python/tir-transform/test_tir_transform_storage_flatten.py
+++ b/tests/python/tir-transform/test_tir_transform_storage_flatten.py
@@ -19,7 +19,6 @@
 from tvm import te
 from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
-from tvm.relay import GlobalVar
 
 
 def test_flatten2():
@@ -158,7 +157,7 @@ def tir_func(a: T.handle, b: T.handle) -> None:
 
 
 def test_flatten_tir():
-    orig_mod = tvm.IRModule({GlobalVar("main"): tir_func})
+    orig_mod = tvm.IRModule({"main": tir_func})
     mod = tvm.tir.transform.StorageFlatten(64)(orig_mod)
     tvm.ir.assert_structural_equal(
         orig_mod, mod
diff --git a/tests/python/tir-usmp/test_tir_usmp_algo.py b/tests/python/tir-usmp/test_tir_usmp_algo.py
index b9cfde485633..80f7f6b999ce 100644
--- a/tests/python/tir-usmp/test_tir_usmp_algo.py
+++ b/tests/python/tir-usmp/test_tir_usmp_algo.py
@@ -681,48 +681,3 @@ def test_resnet_subgraph(algorithm, workspace_size):
     )
 
     _check_max_workspace_size(buffer_pool_allocations, global_workspace_pool, workspace_size)
-
-
-def test_custom_algo():
-    target = Target("c")
-    global_workspace_pool = WorkspacePoolInfo(
-        "global_workspace",
-        [target],
-    )
-    tir_mod = ResnetStructure
-    tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target)
-    tir_mod = _assign_poolinfos_to_allocates_in_irmodule(tir_mod, [global_workspace_pool])
-    tir_mod = tir_mod.with_attr("executor", tvm.relay.backend.Executor("aot"))
-    tir_mod = tir_mod.with_attr("runtime", tvm.relay.backend.Runtime("crt"))
-    tir_mod["__tvm_main__"] = tir_mod[
-        "tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast"
-    ]
-
-    algo_called = False
-
-    @tvm.register_func("tir.usmp.algo.trivial")
-    def _trivial_algo(buf_infos, mem_pressure):
-        nonlocal algo_called
-        algo_called = True
-        out_layout = {}
-        offset = 0
-        for buf_info in buf_infos:
-            pool_info = buf_info.pool_candidates[0]
-            out_layout[buf_info] = usmp_utils.PoolAllocation(pool_info, offset)
-            offset += buf_info.size_bytes
-        return out_layout
-
-    usmp_pass = tvm.get_global_func("tir.transform.UnifiedStaticMemoryPlanner")
-    usmp_pass()(tir_mod)
-    assert not algo_called
-
-    with tvm.transform.PassContext(config={"tir.usmp.custom_algorithm": "trivial"}):
-        usmp_pass()(tir_mod)
-
-    assert algo_called
-
-    with pytest.raises(
-        tvm.TVMError, match="The selected custom USMP algorithm : invalid is not defined"
-    ):
-        with tvm.transform.PassContext(config={"tir.usmp.custom_algorithm": "invalid"}):
-            usmp_pass()(tir_mod)
diff --git a/tests/python/topi/common.py b/tests/python/topi/common.py
deleted file mode 100644
index c0c4b1e1ae8d..000000000000
--- a/tests/python/topi/common.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Common utility for topi test"""
-
-from tvm import autotvm
-from tvm.autotvm.task.space import FallbackConfigEntity
-
-
-class Int8Fallback(autotvm.FallbackContext):
-    def _query_inside(self, target, workload):
-        key = (target, workload)
-        if key in self.memory:
-            return self.memory[key]
-        cfg = FallbackConfigEntity()
-        self.memory[key] = cfg
-        cfg.is_fallback = False
-        return cfg
diff --git a/tests/python/topi/test_fifo_buffer.py b/tests/python/topi/test_fifo_buffer.py
deleted file mode 100644
index 6668acdbc14f..000000000000
--- a/tests/python/topi/test_fifo_buffer.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for FIFO buffer"""
-
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-import numpy as np
-from tvm.contrib.pickle_memoize import memoize
-
-
-def verify_fifo_buffer(buffer_shape, data_shape, axis, dtype="float32"):
-    buffer = te.placeholder(buffer_shape, name="buffer", dtype=dtype)
-    data = te.placeholder(data_shape, name="data", dtype=dtype)
-
-    # Use memoize, pickle the test data for next time use
-    @memoize("topi.tests.test_fifo_buffer")
-    def get_ref_data():
-        buffer_np = np.random.uniform(size=buffer_shape).astype(dtype)
-        data_np = np.random.uniform(size=data_shape).astype(dtype)
-
-        # Reference implementation of FIFO queue
-        begin = data_np.shape[axis]
-        end = buffer_np.shape[axis] + data_np.shape[axis]
-        ndim = len(buffer_np.shape)
-        ss = tuple((slice(begin, end, 1) if x == axis else slice(None)) for x in range(ndim))
-        out_np = np.concatenate((buffer_np, data_np), axis=axis)[ss]
-        return (buffer_np, data_np, out_np)
-
-    # Get the test data
-    buffer_np, data_np, out_np = get_ref_data()
-
-    def check_device(target, dev):
-        print("  Running on target: {}".format(target))
-
-        with tvm.target.Target(target):
-            out = topi.nn.fifo_buffer(data, buffer, axis=axis)
-            s = tvm.topi.testing.get_injective_schedule(target)([out])
-
-        buffer_tvm = tvm.nd.array(buffer_np, device=dev)
-        data_tvm = tvm.nd.array(data_np, device=dev)
-        out_tvm = tvm.nd.empty(shape=buffer_shape, device=dev, dtype=dtype)
-        f = tvm.build(s, [data, buffer, out], target, name="fifo")
-        f(data_tvm, buffer_tvm, out_tvm)
-        tvm.testing.assert_allclose(out_tvm.numpy(), out_np)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_conv1d_integration():
-    batch_size = 1
-    num_channel = 1
-    num_filter = 1
-
-    # Note: TVM doesn't have a separate op for 1D convolution, so we use conv2d instead.
-    # We set height=1 to indicate that convolution is really 1D.
-    stride = (1, 1)
-    dilate = (1, 1)
-    padding = (0, 0)
-
-    kernel_size = (1, 3)
-    input_window_size = (1, 10)
-    inc_input_size = (1, 2)
-    context_size = (1, 4)
-    inc_output_size = (1, 2)
-    output_window_size = (1, 8)
-
-    num_iteration = 20
-    buffer_axis = 3
-
-    kernel_shape = (num_filter, num_channel, kernel_size[0], kernel_size[1])
-    input_window_shape = (batch_size, num_channel, input_window_size[0], input_window_size[1])
-    inc_input_shape = (batch_size, num_channel, inc_input_size[0], inc_input_size[1])
-    inc_output_shape = (batch_size, num_filter, inc_output_size[0], inc_output_size[1])
-    context_shape = (batch_size, num_channel, context_size[0], context_size[1])
-    output_window_shape = (batch_size, num_filter, output_window_size[0], output_window_size[1])
-    # Rule: Convolution of Tensor[context_shape] and Tensor[kernel_shape]
-    #       produces Tensor[inc_input_shape]
-
-    dtype = "float32"
-
-    inc_input = te.placeholder(inc_input_shape, name="inc_input", dtype=dtype)
-    input_window = te.placeholder(input_window_shape, name="input_window", dtype=dtype)
-    context = te.placeholder(context_shape, name="context", dtype=dtype)
-    kernel = te.placeholder(kernel_shape, name="kernel", dtype=dtype)
-    inc_output = te.placeholder(inc_input_shape, name="inc_output", dtype=dtype)
-    output_window = te.placeholder(output_window_shape, name="output_window", dtype=dtype)
-
-    # Use memoize, pickle the test data for next time use
-    @memoize("topi.tests.test_fifo_buffer_conv1d_integration")
-    def get_data():
-        # Generate [num_iteration] slices of input
-        inc_input_np = np.random.uniform(
-            size=tuple([num_iteration] + list(inc_input_shape))
-        ).astype(dtype)
-        input_window_np = np.zeros(input_window_shape, dtype=dtype)
-        kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
-        context_np = np.zeros(context_shape, dtype=dtype)
-        output_window_np = np.zeros(output_window_shape, dtype=dtype)
-
-        return (inc_input_np, input_window_np, kernel_np, context_np, output_window_np)
-
-    # Get the test data
-    inc_input_np, input_window_np, kernel_np, context_np, output_window_np = get_data()
-
-    def check_device(target, dev):
-        print("  Running on target: {}".format(target))
-
-        conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(target)
-
-        with tvm.target.Target(target):
-            out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis)
-            s = tvm.topi.testing.get_injective_schedule(target)([out])
-            update_context = tvm.build(s, [inc_input, context, out], target, name="update_context")
-
-            out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype)
-            s = schedule_conv2d_nchw([out])
-            conv2d_inc = tvm.build(s, [context, kernel, out], target, name="conv2d_inc")
-
-            out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis)
-            s = tvm.topi.testing.get_injective_schedule(target)([out])
-            update_output_window = tvm.build(
-                s, [inc_output, output_window, out], target, name="update_output_window"
-            )
-
-            out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis)
-            s = tvm.topi.testing.get_injective_schedule(target)([out])
-            update_input_window = tvm.build(
-                s, [inc_input, input_window, out], target, name="update_input_window"
-            )
-
-            out = conv2d_nchw(input_window, kernel, stride, padding, dilate, dtype)
-            s = schedule_conv2d_nchw([out])
-            conv2d = tvm.build(s, [input_window, kernel, out], target, name="conv2d")
-
-        input_window_tvm = tvm.nd.array(input_window_np, device=dev)
-        new_input_window_tvm = tvm.nd.empty(shape=input_window_shape, device=dev, dtype=dtype)
-        kernel_tvm = tvm.nd.array(kernel_np, device=dev)
-        context_tvm = tvm.nd.array(context_np, device=dev)
-        new_context_tvm = tvm.nd.empty(shape=context_shape, device=dev, dtype=dtype)
-        inc_output_tvm = tvm.nd.empty(shape=inc_output_shape, device=dev, dtype=dtype)
-        output_window_tvm = tvm.nd.array(output_window_np, device=dev)
-        new_output_window_tvm = tvm.nd.empty(shape=output_window_shape, device=dev, dtype=dtype)
-        output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape, device=dev, dtype=dtype)
-
-        for i in range(num_iteration):
-            # Take i-th slice of inc_input_np
-            inc_input_tvm = tvm.nd.array(inc_input_np[i], device=dev)
-
-            # Compute new output window incrementally, using the FIFO buffer op
-            update_context(inc_input_tvm, context_tvm, new_context_tvm)
-            conv2d_inc(new_context_tvm, kernel_tvm, inc_output_tvm)
-            update_output_window(inc_output_tvm, output_window_tvm, new_output_window_tvm)
-            context_tvm = new_context_tvm
-            output_window_tvm = new_output_window_tvm
-
-            # Compute full input window, so that we have a baseline
-            update_input_window(inc_input_tvm, input_window_tvm, new_input_window_tvm)
-            input_window_tvm = new_input_window_tvm
-            conv2d(input_window_tvm, kernel_tvm, output_window_ref_tvm)
-            # Incrementally updating the output window should be equivalent to computing it from
-            # scratch using the input window
-            tvm.testing.assert_allclose(output_window_tvm.numpy(), output_window_ref_tvm.numpy())
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_fifo_buffer():
-    for ndim in [1, 2, 3, 4, 5, 6]:
-        for axis in range(ndim):
-            buffer_shape = tuple(7 for _ in range(ndim))
-            data_shape = tuple((2 if i == axis else 7) for i in range(ndim))
-            print(
-                "Testing FIFO buffer op: buffer_shape = {}, data_shape = {}, axis = {}".format(
-                    buffer_shape, data_shape, axis
-                )
-            )
-            verify_fifo_buffer(buffer_shape, data_shape, axis)
-
-
-@tvm.testing.uses_gpu
-def test_conv1d_integration():
-    print("Testing FIFO buffer with 1D convolution")
-    verify_conv1d_integration()
-
-
-if __name__ == "__main__":
-    test_fifo_buffer()
-    test_conv1d_integration()
diff --git a/tests/python/topi/test_topi_argwhere.py b/tests/python/topi/test_topi_argwhere.py
deleted file mode 100644
index bc43dbb2b051..000000000000
--- a/tests/python/topi/test_topi_argwhere.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test for argwhere operator"""
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-
-_argwhere_schedule = {
-    "generic": topi.generic.schedule_argwhere,
-    "gpu": topi.cuda.schedule_argwhere,
-}
-
-_argwhere_compute = {"llvm": topi.argwhere, "cuda": topi.cuda.argwhere}
-
-data_shape = tvm.testing.parameter(
-    (1,),
-    (100,),
-    (1, 1),
-    (5, 3),
-    (32, 64),
-    (128, 65),
-    (200, 500),
-    (6, 5, 3),
-    (1, 1, 1),
-    (1, 1, 1, 1),
-    (6, 4, 5, 3),
-    (1, 1, 1, 1, 1),
-    (6, 4, 5, 3, 7),
-)
-
-
-@tvm.testing.parametrize_targets("llvm", "cuda")
-def test_argwhere(target, dev, data_shape):
-    dtype = "int32"
-    np_data = np.random.choice([0, 1, 2, 3], size=data_shape).astype(dtype)
-    np_out = np.argwhere(np_data)
-    out_shape = np_out.shape[0]
-
-    np_shape = np.ones(shape=(out_shape, len(data_shape)), dtype=dtype)
-
-    out_shape = te.placeholder(shape=(out_shape, len(data_shape)), name="out_shape", dtype=dtype)
-    condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype)
-
-    with tvm.target.Target(target):
-        out = _argwhere_compute[target](out_shape, condition)
-        s_func = tvm.topi.testing.dispatch(target, _argwhere_schedule)
-        sch = s_func(out)
-
-    func = tvm.build(sch, [out_shape, condition, out], target, name="argwhere")
-
-    args = [tvm.nd.array(np_shape, dev)]
-    args.append(tvm.nd.array(np_data, dev))
-    args.append(tvm.nd.empty(out.shape, device=dev, dtype=condition.dtype))
-    func(*args)
-    np.set_printoptions(threshold=np.inf)
-    tvm_out = args[-1].numpy()
-    tvm.testing.assert_allclose(tvm_out, np_out)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_basic.py b/tests/python/topi/test_topi_basic.py
deleted file mode 100644
index 108b92d903d9..000000000000
--- a/tests/python/topi/test_topi_basic.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.topi import utils
-
-
-def test_util():
-    x = tvm.tir.const(100, "int32")
-    assert utils.get_const_int(x) == 100
-    assert utils.get_const_tuple((x, x)) == (100, 100)
-
-
-def test_ewise():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-
-    def test_apply(func, name):
-        B = func(A)
-        assert tuple(B.shape) == tuple(A.shape)
-        assert B.op.body[0].op.name == "tir." + name
-
-    test_apply(topi.exp, "exp")
-    test_apply(topi.erf, "erf")
-    test_apply(topi.tanh, "tanh")
-    test_apply(topi.sigmoid, "sigmoid")
-    test_apply(topi.log, "log")
-    test_apply(topi.sqrt, "sqrt")
-    test_apply(topi.rsqrt, "rsqrt")
-    test_apply(topi.sin, "sin")
-    test_apply(topi.cos, "cos")
-    test_apply(topi.tan, "tan")
-    test_apply(topi.atan, "atan")
-
-
-if __name__ == "__main__":
-    test_util()
-    test_ewise()
diff --git a/tests/python/topi/test_topi_batch_matmul.py b/tests/python/topi/test_topi_batch_matmul.py
deleted file mode 100644
index 9bd9dd286b1a..000000000000
--- a/tests/python/topi/test_topi_batch_matmul.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for batch_matmul operator"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.pickle_memoize import memoize
-
-import tvm.testing
-from common import Int8Fallback
-
-_batch_matmul_implement = {
-    "generic": (topi.nn.batch_matmul, topi.generic.schedule_batch_matmul),
-    "cpu": (topi.x86.batch_matmul, topi.x86.schedule_batch_matmul),
-    "gpu": (topi.cuda.batch_matmul, topi.cuda.schedule_batch_matmul),
-}
-
-
-def verify_batch_matmul(x_batch, y_batch, M, N, K, dynamic=False, debug=False):
-
-    if not dynamic:
-        x = te.placeholder((x_batch, M, K), name="x")
-        y = te.placeholder((y_batch, N, K), name="y")
-        dtype = x.dtype
-    else:
-        assert x_batch == y_batch or x_batch == 1 or y_batch == 1
-        batch_size = max(x_batch, y_batch)
-        dynamic_batch_size = te.var("dynamic_batch_size")
-        dynamic_M = te.var("dynamic_M")
-        dynamic_N = te.var("dynamic_N")
-        dynamic_K = te.var("dynamic_K")
-
-        x = te.placeholder((dynamic_batch_size, dynamic_M, dynamic_K), name="x")
-        y = te.placeholder((dynamic_batch_size, dynamic_N, dynamic_K), name="y")
-        dtype = x.dtype
-
-    # use memoize to pickle the test data for next time use
-    @memoize("topi.tests.test_topi_batch_matmul")
-    def get_ref_data():
-        a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype)
-        b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype)
-        c_np = tvm.topi.testing.batch_matmul(a_np, b_np)
-        return (a_np, b_np, c_np)
-
-    # get the test data
-    a_np, b_np, c_np = get_ref_data()
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _batch_matmul_implement)
-            out = fcompute(x, y)
-            if not dynamic:
-                s = fschedule([out])
-                out_shape = out.shape
-            else:
-                s = te.create_schedule(out.op)
-                out_shape = (batch_size, M, N)
-
-            if debug:
-                print(tvm.lower(s, [x, y, out], simple_mode=True))
-
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev)
-        f = tvm.build(s, [x, y, out], target, name="dense")
-        f(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        target_kind = tvm.target.Target(target).kind.name
-        if dynamic and target_kind in ["cuda", "nvptx", "vulkan", "opencl"]:
-            print("Dynamic batch matmul test is skippped on %s" % target)
-            continue
-
-        check_device(target, dev)
-
-
-def verify_batch_matmul_int8(x_batch, y_batch, M, N, K):
-    dtype = "int8"
-    out_dtype = "int32"
-    assert x_batch == y_batch or x_batch == 1 or y_batch == 1
-    x = te.placeholder((x_batch, M, K), name="x", dtype=dtype)
-    y = te.placeholder((y_batch, N, K), name="y", dtype=dtype)
-
-    # use memoize to pickle the test data for next time use
-    @memoize("topi.tests.test_topi_batch_matmul")
-    def get_ref_data():
-        a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype)
-        b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype)
-        c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype=out_dtype)
-        return (a_np, b_np, c_np)
-
-    # get the test data
-    a_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if device == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
-            print("Skip because int8 intrinsics are not available")
-            return
-
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            out = topi.cuda.batch_matmul_int8(x, y, None, out_dtype)
-            s = topi.cuda.schedule_batch_matmul_int8([out])
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev)
-        f = tvm.build(s, [x, y, out], device, name="batch_matmul_int8")
-        f(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for device in ["cuda"]:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def test_batch_matmul():
-    verify_batch_matmul(1, 1, 16, 16, 32)
-    verify_batch_matmul(5, 5, 16, 16, 32)
-    verify_batch_matmul(5, 5, 16, 20, 32)
-    verify_batch_matmul(30, 30, 16, 20, 32)
-    # Test batch broadcasting.
-    verify_batch_matmul(1, 5, 16, 16, 32)
-    verify_batch_matmul(5, 1, 16, 16, 32)
-
-    # Test dynamic batch
-    verify_batch_matmul(1, 1, 16, 16, 32, dynamic=True)
-    verify_batch_matmul(5, 5, 16, 16, 32, dynamic=True)
-
-
-@tvm.testing.requires_cuda
-@tvm.testing.requires_gpu
-def test_batch_matmul_int8():
-    with Int8Fallback():
-        verify_batch_matmul_int8(1, 1, 2, 3, 1)
-        verify_batch_matmul_int8(1, 1, 16, 24, 32)
-        verify_batch_matmul_int8(5, 5, 24, 16, 32)
-        verify_batch_matmul_int8(30, 30, 16, 20, 32)
-        verify_batch_matmul_int8(1, 5, 16, 16, 32)
-        verify_batch_matmul_int8(5, 1, 16, 16, 32)
-
-
-if __name__ == "__main__":
-    test_batch_matmul()
-    test_batch_matmul_int8()
diff --git a/tests/python/topi/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/test_topi_batch_matmul_tensorcore.py
deleted file mode 100644
index eb657a329889..000000000000
--- a/tests/python/topi/test_topi_batch_matmul_tensorcore.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for batch_matmul operator"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.pickle_memoize import memoize
-
-import tvm.testing
-
-_batch_matmul_implement = {
-    "gpu": (topi.cuda.batch_matmul_tensorcore, topi.cuda.schedule_batch_matmul_tensorcore),
-}
-
-
-def convert_int32_into_int4(a_int32):
-    """convert int32 values into int4
-    Parameters
-    ----------
-    a_int32 : int
-
-    Return
-    ------
-    a_int4 : int
-    """
-    B, K, L = a_int32.shape
-    assert L % 8 == 0
-    a_int4 = np.zeros(shape=(B, K, L // 8), dtype=np.int32)
-    for b in range(B):
-        for k in range(K):
-            for l in range(L // 8):
-                for m in range(min(8, L - l * 8)):
-                    a_int4[b, k, l] = a_int4[b, k, l] | (
-                        (a_int32[b, k, l * 8 + m] & 0xF) << ((7 - m) * 4)
-                    )
-    return a_int4
-
-
-def verify_batch_matmul(x_batch, y_batch, M, N, K, dtype):
-    x = te.placeholder((x_batch, M, K), name="x", dtype=dtype)
-    y = te.placeholder((y_batch, N, K), name="y", dtype=dtype)
-
-    assert dtype in ["int4", "int8", "float16"]
-
-    out_dtype = "float32"
-    if dtype in ["int8", "int4"]:
-        out_dtype = "int32"
-
-    # use memoize to pickle the test data for next time use
-    @memoize("topi.tests.test_topi_batch_matmul_tensorcore")
-    def get_ref_data():
-        if dtype == "int4":
-            a_np = np.random.randint(low=-8, high=7, size=(x_batch, M, K))
-            b_np = np.random.randint(low=-8, high=7, size=(y_batch, N, K))
-        elif dtype == "int8":
-            a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype)
-            b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype)
-        else:
-            a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype)
-            b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype)
-        c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype)
-        return (a_np, b_np, c_np)
-
-    # get the test data
-    a_np, b_np, c_np = get_ref_data()
-    if dtype == "int4":
-        a_np = convert_int32_into_int4(a_np)
-        b_np = convert_int32_into_int4(b_np)
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement)
-            out = fcompute(x, y, None, out_dtype)
-            s = fschedule([out])
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev)
-        f = tvm.build(s, [x, y, out], device, name="batch_matmul")
-        f(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
-
-    check_device("cuda")
-
-
-@tvm.testing.requires_tensorcore
-def test_batch_matmul():
-    for dtype in ["float16", "int8", "int4"]:
-        verify_batch_matmul(1, 1, 16, 16, 32, dtype)
-        verify_batch_matmul(5, 5, 16, 16, 32, dtype)
-        verify_batch_matmul(5, 5, 16, 32, 32, dtype)
-        verify_batch_matmul(30, 30, 16, 32, 32, dtype)
-
-
-if __name__ == "__main__":
-    test_batch_matmul()
diff --git a/tests/python/topi/test_topi_batch_norm.py b/tests/python/topi/test_topi_batch_norm.py
deleted file mode 100644
index c7feb5d7c860..000000000000
--- a/tests/python/topi/test_topi_batch_norm.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for the batch_norm operator."""
-import numpy as np
-import pytest
-
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-
-
-_DEVICE = "llvm"
-_BATCH_NORM_IMPLEMENT = {
-    "generic": (topi.nn.batch_norm, topi.generic.schedule_batch_norm),
-    "cpu": (topi.nn.batch_norm, topi.x86.schedule_batch_norm),
-}
-
-
-@pytest.mark.parametrize(
-    "shape, axis, epsilon, center, scale, training, momentum",
-    [
-        ((1,), 0, 0.1, True, True, False, 0.1),
-        ((2, 3), 0, 0.1, True, True, False, 0.1),
-        ((1, 2, 4), 0, 0.1, True, True, False, 0.1),
-        ((1, 2, 3, 4), 0, 0.001, False, False, False, 0.1),
-        ((2, 3, 4, 1), 1, 0.01, False, True, False, 0.1),
-        ((3, 4, 1, 2), 2, 0.1, True, False, True, 0.1),
-        ((4, 1, 2, 3), 3, 1.0, True, True, True, 0.2),
-        ((1, 2, 4, 4, 5), 0, 0.1, True, True, True, 0.3),
-    ],
-)
-def test_batch_norm(shape, axis, epsilon, center, scale, training, momentum):
-    x_np = np.random.random(shape).astype("float32")
-    gamma_np = np.random.random(shape[axis]).astype("float32")
-    beta_np = np.random.random(shape[axis]).astype("float32")
-    moving_mean_np = np.random.random(shape[axis]).astype("float32")
-    moving_var_np = np.random.random(shape[axis]).astype("float32")
-
-    out_x_np, out_moving_mean_np, out_moving_var_np = tvm.topi.testing.batch_norm(
-        x_np,
-        gamma_np,
-        beta_np,
-        moving_mean_np,
-        moving_var_np,
-        axis,
-        epsilon,
-        center,
-        scale,
-        training,
-        momentum,
-    )
-
-    x_te = te.placeholder(shape, name="x", dtype="float32")
-    gamma_te = te.placeholder((shape[axis],), name="gamma", dtype="float32")
-    beta_te = te.placeholder((shape[axis],), name="beta", dtype="float32")
-    moving_mean_te = te.placeholder((shape[axis],), name="moving_mean", dtype="float32")
-    moving_var_te = te.placeholder((shape[axis],), name="moving_var", dtype="float32")
-
-    with tvm.target.Target(_DEVICE):
-        fcompute, fschedule = tvm.topi.testing.dispatch(_DEVICE, _BATCH_NORM_IMPLEMENT)
-        out_x, out_moving_mean, out_moving_var = fcompute(
-            x_te,
-            gamma_te,
-            beta_te,
-            moving_mean_te,
-            moving_var_te,
-            axis,
-            epsilon,
-            center,
-            scale,
-            training,
-            momentum,
-        )
-        s = fschedule([out_x, out_moving_mean, out_moving_var])
-
-        dev = tvm.device(_DEVICE, 0)
-
-        x_tvm = tvm.nd.array(x_np, dev)
-        gamma_tvm = tvm.nd.array(gamma_np, dev)
-        beta_tvm = tvm.nd.array(beta_np, dev)
-        moving_mean_tvm = tvm.nd.array(moving_mean_np, dev)
-        moving_var_tvm = tvm.nd.array(moving_var_np, dev)
-        out_x_tvm = tvm.nd.array(np.zeros(shape, dtype=out_x.dtype), dev)
-        out_moving_mean_tvm = tvm.nd.array(
-            np.zeros((shape[axis],), dtype=out_moving_mean.dtype), dev
-        )
-        out_moving_var_tvm = tvm.nd.array(np.zeros((shape[axis],), dtype=out_moving_var.dtype), dev)
-
-        f = tvm.build(
-            s,
-            [
-                x_te,
-                gamma_te,
-                beta_te,
-                moving_mean_te,
-                moving_var_te,
-                out_x,
-                out_moving_mean,
-                out_moving_var,
-            ],
-            _DEVICE,
-        )
-        f(
-            x_tvm,
-            gamma_tvm,
-            beta_tvm,
-            moving_mean_tvm,
-            moving_var_tvm,
-            out_x_tvm,
-            out_moving_mean_tvm,
-            out_moving_var_tvm,
-        )
-
-        tvm.testing.assert_allclose(out_x_tvm.numpy(), out_x_np, rtol=1e-3)
-        tvm.testing.assert_allclose(out_moving_mean_tvm.numpy(), out_moving_mean_np, rtol=1e-3)
-        tvm.testing.assert_allclose(out_moving_var_tvm.numpy(), out_moving_var_np, rtol=1e-3)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_batch_to_space_nd.py b/tests/python/topi/test_topi_batch_to_space_nd.py
deleted file mode 100644
index 6ee99dacc61a..000000000000
--- a/tests/python/topi/test_topi_batch_to_space_nd.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for batch to space"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-
-
-def verify_batch_to_space_nd(input_shape, block_shape, crop_begin_list, crop_end_list):
-    out_shape = []
-    out_shape.append(int((input_shape[0] / np.prod(block_shape))))
-    for i in range(1, len(block_shape) + 1):
-        crop = crop_begin_list[i - 1] + crop_end_list[i - 1]
-        out_shape.append(input_shape[i] * block_shape[i - 1] - crop)
-    for i in range(len(block_shape) + 1, len(input_shape)):
-        out_shape.append(input_shape[i])
-
-    A = te.placeholder(input_shape, name="A", dtype="float32")
-    dtype = A.dtype
-    a_np = np.random.uniform(size=input_shape).astype(dtype)
-
-    B = topi.nn.batch_to_space_nd(A, block_shape, crop_begin_list, crop_end_list)
-
-    b_np = tvm.topi.testing.batch_to_space_nd_python(
-        a_np, block_shape, crop_begin_list, crop_end_list
-    )
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.create(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_batch_to_space():
-    # Without crops
-    verify_batch_to_space_nd([4, 1, 1, 1], [2, 2], [0, 0], [0, 0])
-    # With crops
-    verify_batch_to_space_nd([8, 1, 3, 1], [2, 2], [0, 2], [0, 0])
-    verify_batch_to_space_nd([18, 2, 1, 2], [2, 3], [1, 1], [0, 0])
-    verify_batch_to_space_nd([20, 5, 8, 7], [2, 2], [1, 1], [1, 1])
-
-
-if __name__ == "__main__":
-    test_batch_to_space()
diff --git a/tests/python/topi/test_topi_bitserial_conv2d.py b/tests/python/topi/test_topi_bitserial_conv2d.py
deleted file mode 100644
index 6b6e8a2c3fa7..000000000000
--- a/tests/python/topi/test_topi_bitserial_conv2d.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.pickle_memoize import memoize
-
-
-def generate_quantized_np(shape, bits, out_dtype):
-    min_val = 0
-    max_val = 1 << bits
-    return np.random.randint(min_val, max_val, size=shape).astype(out_dtype)
-
-
-def verify_bitserial_conv2d_nchw(
-    batch,
-    in_size,
-    in_channel,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    activation_bits,
-    weight_bits,
-    unipolar,
-):
-    in_height = in_width = in_size
-    input_dtype = "uint32"
-    out_dtype = "int32"
-
-    with tvm.target.Target("llvm"):
-        A = te.placeholder((batch, in_channel, in_height, in_width), dtype=input_dtype, name="A")
-        W = te.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_dtype, name="W")
-        B = topi.x86.bitserial_conv2d_nchw(
-            A, W, stride, padding, activation_bits, weight_bits, input_dtype, out_dtype, unipolar
-        )
-        s = topi.x86.schedule_bitserial_conv2d_nchw([B])
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-
-    @memoize("topi.tests.test_topi_bitseral_conv2d_nchw")
-    def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
-        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_dtype)
-        if unipolar:
-            w_ = np.copy(w_np).astype(out_dtype)
-            for x in np.nditer(w_, op_flags=["readwrite"]):
-                x[...] = 1 if x == 1 else -1
-            b_np = tvm.topi.testing.conv2d_nchw_python(a_np.astype(out_dtype), w_, stride, padding)
-        else:
-            b_np = tvm.topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    dev = tvm.cpu(0)
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    func = tvm.build(s, [A, W, B], "llvm")
-    func(a, w, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-def verify_bitserial_conv2d_nhwc(
-    batch,
-    in_size,
-    in_channel,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    activation_bits,
-    weight_bits,
-    unipolar,
-):
-    in_height = in_width = in_size
-    input_dtype = "uint32"
-    out_dtype = "int32"
-
-    with tvm.target.Target("llvm"):
-        A = te.placeholder((batch, in_height, in_width, in_channel), dtype=input_dtype, name="A")
-        W = te.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_dtype, name="W")
-        B = topi.x86.bitserial_conv2d_nhwc(
-            A, W, stride, padding, activation_bits, weight_bits, input_dtype, out_dtype, unipolar
-        )
-        s = topi.x86.schedule_bitserial_conv2d_nhwc([B])
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-
-    @memoize("topi.tests.test_topi_bitseral_conv2d_nhwc")
-    def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
-        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_dtype)
-        if unipolar:
-            w_ = np.copy(w_np).astype(out_dtype)
-            for x in np.nditer(w_, op_flags=["readwrite"]):
-                x[...] = 1 if x == 1 else -1
-            b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype)
-        else:
-            b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(
-                out_dtype
-            )
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    dev = tvm.cpu(0)
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    func = tvm.build(s, [A, W, B], "llvm")
-
-    func(a, w, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-def test_bitserial_conv2d():
-    in_size = 56
-    ic, oc = 64, 64
-    k = 3
-    stride = 1
-    pad = 1
-    verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True)
-    verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True)
-    verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False)
-    verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False)
-    verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 2, False)
-
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True)
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True)
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False)
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False)
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False)
-
-
-if __name__ == "__main__":
-    test_bitserial_conv2d()
diff --git a/tests/python/topi/test_topi_bitserial_conv2d_rasp.py b/tests/python/topi/test_topi_bitserial_conv2d_rasp.py
deleted file mode 100644
index fbfb06f50cb4..000000000000
--- a/tests/python/topi/test_topi_bitserial_conv2d_rasp.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-import re
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-
-
-def generate_quantized_np(shape, bits, out_dtype):
-    np.random.seed(0)
-    min_val = 0
-    max_val = 1 << bits
-    return np.random.randint(min_val, max_val, size=shape).astype(out_dtype)
-
-
-# Verify that certain special instructions from the tensorize pass exist
-def verify_bitserial_conv2d_nhwc(
-    batch,
-    in_size,
-    in_channel,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    activation_bits,
-    weight_bits,
-    unipolar,
-    use_relu=False,
-):
-    in_height = in_width = in_size
-    input_type = "uint32"
-    out_dtype = "int16"
-
-    device = "llvm -device=arm_cpu -model=bcm2837 -mtriple=armv7l-linux-gnueabihf -mattr=+neon"
-    with tvm.target.Target(device):
-        A = te.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name="A")
-        W = te.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name="W")
-        B = topi.arm_cpu.bitserial_conv2d_nhwc(
-            A, W, stride, padding, activation_bits, weight_bits, "uint8", out_dtype, unipolar
-        )
-        if use_relu:
-            B = topi.nn.relu(B)
-        s = topi.arm_cpu.schedule_bitserial_conv2d_nhwc([B])
-
-    func = tvm.build(s, [A, W, B], device)
-
-    assembly = func.get_source("asm")
-    matches = re.findall("vpadal", assembly)
-    assert len(matches) > 0
-    matches = re.findall("vcnt", assembly)
-    assert len(matches) > 0
-    matches = re.findall("vpadd", assembly)
-    assert len(matches) > 0
-
-    dev = tvm.device(device, 0)
-    if "arm" not in os.uname()[4]:
-        print("Skipped running code, not an arm device")
-        return
-
-    print("Running on target: %s" % device)
-
-    def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
-        if unipolar:
-            w_ = np.copy(w_np).astype(out_dtype)
-            for x in np.nditer(w_, op_flags=["readwrite"]):
-                x[...] = 1 if x == 1 else -1
-            b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype)
-        else:
-            b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(
-                out_dtype
-            )
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    func = tvm.build(s, [A, W, B], device)
-
-    func(a, w, b)
-    np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-def test_bitserial_conv2d():
-    in_size = 56
-    ic, oc = 64, 64
-    k = 3
-    stride = 1
-    pad = 1
-
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False)
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False)
-
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True)
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True)
-
-    verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True, True)
-
-
-if __name__ == "__main__":
-    test_bitserial_conv2d()
diff --git a/tests/python/topi/test_topi_bitserial_dense.py b/tests/python/topi/test_topi_bitserial_dense.py
deleted file mode 100644
index ecb98957ff22..000000000000
--- a/tests/python/topi/test_topi_bitserial_dense.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for bitserial_dense operator"""
-import os
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.pickle_memoize import memoize
-
-_bitserial_dense_implement = {
-    "generic": (topi.nn.bitserial_dense, topi.generic.schedule_bitserial_dense),
-    "cpu": (topi.x86.bitserial_dense, topi.x86.schedule_bitserial_dense),
-    "arm_cpu": (topi.arm_cpu.bitserial_dense, topi.arm_cpu.schedule_bitserial_dense),
-}
-
-
-def generate_quantized_np(shape, bits, out_dtype):
-    min_val = 0
-    max_val = 1 << bits
-    return np.random.randint(min_val, max_val, size=shape).astype(out_dtype)
-
-
-def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar):
-    out_dtype = "int16"
-
-    def get_ref_data(a_shape, b_shape, input_dtype):
-        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
-        b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype)
-        if unipolar:
-            b_ = np.copy(b_np).astype(out_dtype)
-            for x in np.nditer(b_, op_flags=["readwrite"]):
-                x[...] = 1 if x == 1 else -1
-            c_np = np.dot(a_np, b_.T)
-        else:
-            c_np = np.dot(a_np, b_np.T)
-        return a_np, b_np, c_np
-
-    for target in ["llvm", "llvm -device=arm_cpu"]:
-        target = tvm.target.Target(target)
-        if "arm_cpu" in target.keys and "arm" not in os.uname()[4]:
-            print("Skipped running code, not an arm device")
-            continue
-        input_dtype = "uint8" if "arm_cpu" in target.keys else "uint32"
-        A = te.placeholder((batch, in_dim), dtype=input_dtype, name="A")
-        B = te.placeholder((out_dim, in_dim), dtype=input_dtype, name="B")
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _bitserial_dense_implement)
-        C = fcompute(A, B, activation_bits, weight_bits, input_dtype, out_dtype, unipolar)
-        s = fschedule([C])
-
-        a_shape = get_const_tuple(A.shape)
-        b_shape = get_const_tuple(B.shape)
-        a_np, b_np, c_np = get_ref_data(a_shape, b_shape, input_dtype)
-
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        func = tvm.build(s, [A, B, C], target)
-        func(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-def test_bitserial_dense():
-    verify_bitserial_dense(1, 1024, 1000, 1, 1, True)
-    verify_bitserial_dense(1, 1024, 1000, 2, 1, True)
-
-    verify_bitserial_dense(1, 1024, 1000, 1, 1, False)
-    verify_bitserial_dense(1, 1024, 1000, 2, 1, False)
-
-
-if __name__ == "__main__":
-    test_bitserial_dense()
diff --git a/tests/python/topi/test_topi_bnn.py b/tests/python/topi/test_topi_bnn.py
deleted file mode 100644
index 57c24d0242a6..000000000000
--- a/tests/python/topi/test_topi_bnn.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for binary neural network operators."""
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.pickle_memoize import memoize
-
-
-def verify_binary_dense(batch, in_dim, out_dim):
-    A = te.placeholder((batch, in_dim), name="A")
-    B = te.placeholder((out_dim, in_dim), name="B")
-    bnn_A = topi.nn.binarize_pack(A)
-    bnn_B = topi.nn.binarize_pack(B)
-    # binary dense
-    bnn_A1 = te.placeholder(bnn_A.shape, dtype=bnn_A.dtype)
-    bnn_B1 = te.placeholder(bnn_B.shape, dtype=bnn_B.dtype)
-    bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1)
-    # schedule
-    with tvm.target.Target("llvm"):
-        s1 = topi.x86.schedule_binarize_pack(bnn_A)
-        s2 = topi.x86.schedule_binarize_pack(bnn_B)
-        s3 = topi.x86.schedule_binary_dense(bnn_C)
-
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_binary_dense")
-    def get_ref_data():
-        # generate random matrix of +1 or -1 value
-        a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
-        b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
-        c_np = np.dot(a_np, b_np.T)
-        return a_np, b_np, c_np
-
-    a_np, b_np, c_np = get_ref_data()
-
-    dev = tvm.cpu(0)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), dev)
-    bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), dev)
-    bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), dev)
-    f1 = tvm.build(s1, [A, bnn_A], "llvm")
-    f2 = tvm.build(s2, [B, bnn_B], "llvm")
-    f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], "llvm")
-    f1(a, bnn_a)
-    f2(b, bnn_b)
-    f3(bnn_a, bnn_b, bnn_c)
-    tvm.testing.assert_allclose(bnn_c.numpy(), c_np, rtol=1e-5)
-
-
-def test_binary_dense():
-    verify_binary_dense(1, 4096, 1024)
-    verify_binary_dense(1, 1024, 1000)
-
-
-if __name__ == "__main__":
-    test_binary_dense()
diff --git a/tests/python/topi/test_topi_broadcast.py b/tests/python/topi/test_topi_broadcast.py
deleted file mode 100644
index d77ed2eae2e4..000000000000
--- a/tests/python/topi/test_topi_broadcast.py
+++ /dev/null
@@ -1,449 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for broadcasting operators."""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-
-
-def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
-    # Build the logic and compile the function
-    A = te.placeholder(shape=in_shape, name="A")
-    B = fbcast(A, out_shape)
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="broadcast_to")
-        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = np.broadcast_to(data_npy, out_shape)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), dev)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target)
-    check_target("sdaccel")
-
-
-def verify_broadcast_binary_ele(
-    lhs_shape,
-    rhs_shape,
-    ftopi,
-    fnumpy,
-    lhs_min=-100,
-    lhs_max=100,
-    rhs_min=-100,
-    rhs_max=100,
-    dtype="float32",
-):
-    # Build the logic and compile the function
-    A = (
-        te.var("A", dtype=dtype)
-        if lhs_shape is None
-        else te.placeholder(shape=lhs_shape, name="A", dtype=dtype)
-    )
-    B = (
-        te.var("B", dtype=dtype)
-        if rhs_shape is None
-        else te.placeholder(shape=rhs_shape, name="B", dtype=dtype)
-    )
-    C = ftopi(A, B)
-    if isinstance(A, tvm.tir.PrimExpr) and isinstance(B, tvm.tir.PrimExpr):
-        assert isinstance(C, tvm.tir.PrimExpr)
-        return
-
-    def gen_operand(shape, low, high, dev):
-        if shape is None:
-            npy = float(np.random.uniform(low=low, high=high))
-            if dtype.startswith("int"):
-                npy = int(npy)
-            nd = npy
-        else:
-            npy = np.random.uniform(low=low, high=high, size=shape).astype(dtype)
-            nd = tvm.nd.array(npy, dev)
-        return npy, nd
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(C)
-        foo = tvm.build(s, [A, B, C], target, name="broadcast_binary" + "_" + ftopi.__name__)
-
-        lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, dev)
-        rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, dev)
-        out_npy = fnumpy(lhs_npy, rhs_npy)
-
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev)
-        foo(lhs_nd, rhs_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy, rtol=1e-4, atol=1e-4)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target)
-    check_target("sdaccel")
-
-
-@tvm.testing.uses_gpu
-def test_broadcast_to():
-    verify_broadcast_to_ele((1,), (10,), topi.broadcast_to)
-    verify_broadcast_to_ele((), (10,), topi.broadcast_to)
-    verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 4, 4, 5, 4), topi.broadcast_to)
-    verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32), topi.broadcast_to)
-
-
-@tvm.testing.uses_gpu
-def test_add():
-    verify_broadcast_binary_ele((), (), topi.add, np.add)
-    verify_broadcast_binary_ele((5, 2, 3), (2, 1), topi.add, np.add)
-
-
-@tvm.testing.uses_gpu
-def test_subtract():
-    verify_broadcast_binary_ele((5, 2, 3), (), topi.subtract, np.subtract)
-    verify_broadcast_binary_ele((5, 2, 3), None, topi.subtract, np.subtract)
-    verify_broadcast_binary_ele(None, None, topi.subtract, np.subtract)
-    verify_broadcast_binary_ele((1, 32), (64, 32), topi.subtract, np.subtract)
-
-
-@tvm.testing.uses_gpu
-def test_multiply():
-    verify_broadcast_binary_ele((5, 64, 128), (2, 5, 64, 1), topi.multiply, np.multiply)
-
-
-@tvm.testing.uses_gpu
-def test_divide():
-    verify_broadcast_binary_ele(None, (10,), topi.divide, np.divide, rhs_min=0.0001)
-    verify_broadcast_binary_ele((), None, topi.divide, np.divide, rhs_min=0.0001)
-    verify_broadcast_binary_ele((2, 3, 1, 32), (64, 32), topi.divide, np.divide, rhs_min=0.0001)
-
-
-@tvm.testing.uses_gpu
-def test_floor_divide():
-    def _canonical_floor_div(a, b):
-        return np.floor(a / b)
-
-    verify_broadcast_binary_ele(
-        None, (10,), topi.floor_divide, _canonical_floor_div, rhs_min=0.0001
-    )
-    verify_broadcast_binary_ele((), None, topi.floor_divide, _canonical_floor_div, rhs_min=0.0001)
-    verify_broadcast_binary_ele(
-        (2, 3, 64, 32), (64, 32), topi.floor_divide, _canonical_floor_div, rhs_min=0.0001
-    )
-
-
-@tvm.testing.uses_gpu
-def test_maximum_minmum():
-    verify_broadcast_binary_ele((32,), (64, 32), topi.maximum, np.maximum)
-    verify_broadcast_binary_ele((1, 2, 2, 1, 32), (64, 32), topi.minimum, np.minimum)
-
-
-@tvm.testing.uses_gpu
-def test_power():
-    verify_broadcast_binary_ele(
-        (1, 2, 2), (2,), topi.power, np.power, lhs_min=0.001, rhs_min=0.001, rhs_max=2
-    )
-
-
-@tvm.testing.uses_gpu
-def test_mod():
-    verify_broadcast_binary_ele(
-        (1, 2, 2), (2,), topi.mod, np.mod, lhs_min=0.001, rhs_min=1, dtype="int32"
-    )
-
-
-@tvm.testing.uses_gpu
-def test_floor_mod():
-    def _canonical_floor_mod(a, b):
-        return a - np.floor(a / b) * b
-
-    verify_broadcast_binary_ele(
-        (1, 2, 2),
-        (2,),
-        topi.floor_mod,
-        _canonical_floor_mod,
-        lhs_min=0.001,
-        rhs_min=1,
-        dtype="int32",
-    )
-    verify_broadcast_binary_ele(
-        (3, 4, 5),
-        (3, 4, 5),
-        topi.floor_mod,
-        _canonical_floor_mod,
-        lhs_min=0.001,
-        rhs_min=1,
-        dtype="float32",
-    )
-
-
-@tvm.testing.uses_gpu
-def test_cmp():
-    # explicit specify the output type
-    def greater(x, y):
-        return topi.greater(x, y).astype("int8")
-
-    def less(x, y):
-        return topi.less(x, y).astype("int8")
-
-    def equal(x, y):
-        return topi.equal(x, y).astype("int8")
-
-    def not_equal(x, y):
-        return topi.not_equal(x, y).astype("int8")
-
-    def greater_equal(x, y):
-        return topi.greater_equal(x, y).astype("int8")
-
-    def less_equal(x, y):
-        return topi.less_equal(x, y).astype("int8")
-
-    verify_broadcast_binary_ele((1, 2, 2), (2,), greater, np.greater)
-    verify_broadcast_binary_ele((2, 1, 2), (2, 3, 1), less, np.less)
-    verify_broadcast_binary_ele(
-        (2, 1, 2),
-        (2, 3, 1),
-        equal,
-        np.equal,
-        lhs_min=-2,
-        lhs_max=2,
-        rhs_min=-2,
-        rhs_max=2,
-        dtype="int32",
-    )
-    verify_broadcast_binary_ele(
-        (2, 1, 2),
-        (2, 3, 1),
-        not_equal,
-        np.not_equal,
-        lhs_min=-2,
-        lhs_max=2,
-        rhs_min=-2,
-        rhs_max=2,
-        dtype="int32",
-    )
-    verify_broadcast_binary_ele(
-        (7, 1, 5),
-        (7, 3, 1),
-        greater_equal,
-        np.greater_equal,
-        lhs_min=-3,
-        lhs_max=3,
-        rhs_min=-3,
-        rhs_max=3,
-        dtype="int32",
-    )
-    verify_broadcast_binary_ele(
-        (7, 1, 5),
-        (7, 3, 1),
-        less_equal,
-        np.less_equal,
-        lhs_min=-3,
-        lhs_max=3,
-        rhs_min=-3,
-        rhs_max=3,
-        dtype="int32",
-    )
-
-
-@tvm.testing.uses_gpu
-def test_shift():
-    # explicit specify the output type
-    verify_broadcast_binary_ele(
-        (2, 1, 2), None, topi.right_shift, np.right_shift, dtype="int32", rhs_min=0, rhs_max=32
-    )
-
-    verify_broadcast_binary_ele(
-        (1, 2, 2), (2,), topi.left_shift, np.left_shift, dtype="int32", rhs_min=0, rhs_max=32
-    )
-
-    verify_broadcast_binary_ele(
-        (1, 2, 2), (2,), topi.left_shift, np.left_shift, dtype="int32", rhs_min=0, rhs_max=32
-    )
-
-
-@tvm.testing.uses_gpu
-def test_logical_single_ele():
-    def test_apply(
-        func,
-        name,
-        f_numpy,
-        indata,
-        dtype="bool",
-    ):
-        # Build the logic and compile the function
-        A = te.placeholder(shape=indata.shape, name="A", dtype=dtype)
-        B = func(A)
-        if isinstance(A, tvm.tir.PrimExpr):
-            assert isinstance(B, tvm.tir.PrimExpr)
-            return
-
-        def check_target(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_broadcast_schedule(target)(B)
-            foo = tvm.build(s, [A, B], target, name=name)
-
-            data_npy = indata.astype(A.dtype)
-            data_nd = tvm.nd.array(data_npy, dev)
-
-            out_npy = f_numpy(indata)
-            out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), dev)
-            foo(data_nd, out_nd)
-            tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_target(target, dev)
-
-    test_apply(topi.logical_not, "logical_not", np.logical_not, np.array([True, False, 0, 1]))
-    test_apply(topi.logical_not, "logical_not", np.logical_not, np.array(np.arange(5) < 3))
-
-
-@tvm.testing.uses_gpu
-def test_bitwise_not():
-    def test_apply(
-        func,
-        name,
-        f_numpy,
-        shape,
-        dtype="int32",
-    ):
-        # Build the logic and compile the function
-        A = te.placeholder(shape=shape, name="A", dtype=dtype)
-        B = func(A)
-
-        if isinstance(A, tvm.tir.PrimExpr):
-            assert isinstance(B, tvm.tir.PrimExpr)
-            return
-
-        def check_target(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_broadcast_schedule(target)(B)
-            foo = tvm.build(s, [A, B], target, name=name)
-
-            data_npy = np.random.uniform(size=shape).astype(A.dtype)
-            data_nd = tvm.nd.array(data_npy, dev)
-
-            out_npy = f_numpy(data_npy)
-            out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), dev)
-            foo(data_nd, out_nd)
-            tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_target(target, dev)
-
-    test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, ())
-    test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, (2, 1, 2))
-
-
-@tvm.testing.uses_gpu
-def test_logical_binary_ele():
-    def test_apply(
-        func,
-        name,
-        f_numpy,
-        lhs,
-        rhs,
-        dtype="bool",
-    ):
-        # Build the logic and compile the function
-        A = te.var("A", dtype=dtype)
-        B = te.var("B", dtype=dtype)
-        C = func(A, B)
-        if isinstance(A, tvm.tir.PrimExpr) and isinstance(B, tvm.tir.PrimExpr):
-            assert isinstance(C, tvm.tir.PrimExpr)
-            return
-
-        def check_target(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_broadcast_schedule(target)(C)
-            foo = tvm.build(s, [A, B, C], target, name=name)
-
-            lhs_nd = tvm.nd.array(lhs, dev)
-            rhs_nd = tvm.nd.array(rhs, dev)
-
-            out_npy = f_numpy(lhs, rhs)
-            out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev)
-            foo(lhs_nd, rhs_nd, out_nd)
-            tvm.testing.assert_allclose(out_nd.numpy(), out_npy, rtol=1e-4, atol=1e-4)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_target(target, dev)
-
-    test_apply(topi.logical_and, "logical_and", np.logical_and, True, False)
-    test_apply(topi.logical_and, "logical_and", np.logical_and, [True, False], [False, False])
-    test_apply(topi.logical_or, "logical_or", np.logical_or, True, False)
-    test_apply(topi.logical_or, "logical_or", np.logical_or, [True, False], [False, False])
-    test_apply(topi.logical_xor, "logical_xor", np.logical_xor, True, False)
-    test_apply(topi.logical_xor, "logical_xor", np.logical_xor, [True, False], [False, False])
-
-
-@tvm.testing.uses_gpu
-def test_bitwise_and():
-    verify_broadcast_binary_ele(None, None, topi.bitwise_and, np.bitwise_and, dtype="int32")
-    verify_broadcast_binary_ele(
-        (2, 1, 2), (2, 1, 2), topi.bitwise_and, np.bitwise_and, dtype="int32"
-    )
-
-
-@tvm.testing.uses_gpu
-def test_bitwise_or():
-    verify_broadcast_binary_ele(None, None, topi.bitwise_or, np.bitwise_or, dtype="int32")
-    verify_broadcast_binary_ele((2, 1, 2), (2, 1, 2), topi.bitwise_or, np.bitwise_or, dtype="int32")
-
-
-@tvm.testing.uses_gpu
-def test_bitwise_xor():
-    verify_broadcast_binary_ele(None, None, topi.bitwise_xor, np.bitwise_xor, dtype="int32")
-    verify_broadcast_binary_ele(
-        (2, 1, 2), (2, 1, 2), topi.bitwise_xor, np.bitwise_xor, dtype="int32"
-    )
-
-
-if __name__ == "__main__":
-    test_add()
-    test_shift()
-    test_cmp()
-    test_mod()
-    test_floor_mod()
-    test_subtract()
-    test_multiply()
-    test_divide()
-    test_floor_divide()
-    test_maximum_minmum()
-    test_power()
-    test_broadcast_to()
-    test_logical_single_ele()
-    test_bitwise_not()
-    test_logical_binary_ele()
-    test_bitwise_and()
-    test_bitwise_or()
-    test_bitwise_xor()
diff --git a/tests/python/topi/test_topi_clip.py b/tests/python/topi/test_topi_clip.py
deleted file mode 100644
index 68bb45580fbe..000000000000
--- a/tests/python/topi/test_topi_clip.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for clip operator"""
-import numpy as np
-import tvm
-from tvm import te, tir
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.pickle_memoize import memoize
-
-
-def verify_clip(N, a_min, a_max, dtype):
-    A = te.placeholder((N, N), dtype=dtype, name="A")
-    B = topi.clip(A, a_min, a_max)
-    s = te.create_schedule([B.op])
-
-    # use memoize to pickle the test data for next time use
-    @memoize("topi.tests.test_topi_clip")
-    def get_ref_data(a_min, a_max):
-        a_np = np.random.uniform(a_min * 2, a_max * 2, size=(N, N)).astype(dtype)
-        b_np = np.clip(a_np, a_min, a_max)
-        return a_np, b_np
-
-    a_min = a_min.value if isinstance(a_min, (tir.FloatImm, tir.IntImm)) else a_min
-    a_max = a_max.value if isinstance(a_max, (tir.FloatImm, tir.IntImm)) else a_max
-    a_np, b_np = get_ref_data(a_min, a_max)
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target, name="clip")
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_clip():
-    verify_clip(1024, -127, 127, "float32")
-    verify_clip(1024, -127, 127, "int16")
-    verify_clip(1024, -127, 127, "int8")
-
-
-@tvm.testing.uses_gpu
-def test_clip_floaimm_intimm():
-    verify_clip(1024, tir.FloatImm("float32", -127), tir.FloatImm("float32", 127), "float32")
-    verify_clip(1024, tir.IntImm("int32", -127), tir.IntImm("int32", 127), "int16")
-    verify_clip(1024, tir.IntImm("int32", -127), tir.IntImm("int32", 127), "int8")
-
-
-if __name__ == "__main__":
-    test_clip()
-    test_clip_floaimm_intimm()
diff --git a/tests/python/topi/test_topi_conv1d.py b/tests/python/topi/test_topi_conv1d.py
deleted file mode 100644
index db8d7238feba..000000000000
--- a/tests/python/topi/test_topi_conv1d.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for transposed convolution."""
-import numpy as np
-import itertools
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-
-_conv1d_ncw_implement = {
-    "generic": (topi.nn.conv1d_ncw, topi.generic.schedule_conv1d_ncw),
-    "cpu": (topi.nn.conv1d_ncw, topi.x86.schedule_conv1d_ncw),
-    "gpu": (topi.cuda.conv1d_ncw, topi.cuda.schedule_conv1d_ncw),
-}
-
-_conv1d_nwc_implement = {
-    "generic": (topi.nn.conv1d_nwc, topi.generic.schedule_conv1d_nwc),
-    "cpu": (topi.nn.conv1d_nwc, topi.x86.schedule_conv1d_nwc),
-    "gpu": (topi.cuda.conv1d_nwc, topi.cuda.schedule_conv1d_nwc),
-}
-
-_group_conv1d_implementations = {
-    "NCW": {
-        "generic": (topi.nn.group_conv1d_ncw, topi.generic.schedule_group_conv1d_ncw),
-        "cpu": (topi.nn.group_conv1d_ncw, topi.x86.schedule_group_conv1d_ncw),
-        "gpu": (topi.cuda.group_conv1d_ncw, topi.cuda.schedule_group_conv1d_ncw),
-    },
-    "NWC": {
-        "generic": (topi.nn.group_conv1d_nwc, topi.generic.schedule_group_conv1d_nwc),
-        "cpu": (topi.nn.group_conv1d_nwc, topi.x86.schedule_group_conv1d_nwc),
-        "gpu": (topi.cuda.group_conv1d_nwc, topi.cuda.schedule_group_conv1d_nwc),
-    },
-}
-
-
-def verify_conv1d(
-    batch,
-    in_channels,
-    in_width,
-    filters,
-    kernel_size=3,
-    stride=1,
-    dilation=1,
-    padding="VALID",
-    layout="NCW",
-):
-    if layout == "NCW":
-        in_shape = [batch, in_channels, in_width]
-        kernel_shape = [filters, in_channels, kernel_size]
-    else:
-        in_shape = [batch, in_width, in_channels]
-        kernel_shape = [kernel_size, in_channels, filters]
-
-    dtype = "float32"
-    A = te.placeholder(in_shape, name="A", dtype=dtype)
-    W = te.placeholder(kernel_shape, name="W", dtype=dtype)
-
-    def get_ref_data(layout):
-        a_np = np.random.uniform(size=in_shape).astype(dtype)
-        w_np = np.random.uniform(size=kernel_shape).astype(dtype)
-        if layout == "NWC":
-            np_in = np.transpose(a_np, [0, 2, 1])
-            np_w = np.transpose(w_np, [2, 1, 0])
-        else:
-            np_in = a_np
-            np_w = w_np
-        b_np = tvm.topi.testing.conv1d_ncw_python(np_in, np_w, stride, padding, dilation)
-        if layout == "NWC":
-            b_np = np.transpose(b_np, [0, 2, 1])
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data(layout)
-
-    def check_target(target, dev):
-        if layout == "NCW":
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_ncw_implement)
-        else:
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_nwc_implement)
-        with tvm.target.Target(target):
-            B = fcompute(A, W, stride, padding, dilation, "float32")
-            s = fschedule([B])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-
-        func = tvm.build(s, [A, W, B], target)
-        func(a, w, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_conv1d():
-    for layout in ["NCW", "NWC"]:
-        # Most basic test case
-        verify_conv1d(1, 1, 8, 1, 3, 1, 1, "VALID", layout)
-        # With padding
-        verify_conv1d(1, 1, 8, 1, 3, 1, 1, "SAME", layout)
-        # Realistic dimensions
-        verify_conv1d(1, 16, 32, 16, 3, 1, 1, "SAME", layout)
-        # With stride
-        verify_conv1d(1, 16, 32, 16, 3, 2, 1, "SAME", layout)
-        # With dilation
-        verify_conv1d(1, 16, 32, 16, 3, 1, 2, "SAME", layout)
-        # Large batch size
-        verify_conv1d(8, 16, 32, 16, 3, 1, 1, "SAME", layout)
-        # Other kernel sizes
-        verify_conv1d(1, 16, 32, 16, 3, 1, 1, "SAME", layout)
-        verify_conv1d(1, 16, 32, 16, 2, 1, 1, "SAME", layout)
-        verify_conv1d(1, 16, 32, 16, 1, 1, 1, "SAME", layout)
-        # Non-power-of-two shape
-        verify_conv1d(1, 17, 12, 21, 3, 1, 1, "SAME", layout)
-        verify_conv1d(1, 5, 27, 18, 3, 1, 1, "VALID", layout)
-
-
-layout = tvm.testing.parameter("NCW", "NWC")
-padding = tvm.testing.parameter("SAME", "VALID")
-dtype = tvm.testing.parameter("float32")
-
-# batch, in_channels, in_width, filters, kernel_size, stride, dilation, groups
-shape = tvm.testing.parameter(
-    [1, 4, 8, 4, 3, 1, 1, 4],
-    [1, 4, 8, 4, 3, 1, 1, 4],
-    [1, 16, 32, 16, 3, 1, 1, 4],
-    [1, 16, 32, 16, 3, 2, 1, 4],
-    [1, 16, 32, 16, 3, 1, 2, 4],
-    [8, 16, 32, 16, 3, 1, 1, 4],
-    [1, 16, 32, 16, 3, 1, 1, 4],
-    [1, 16, 32, 16, 2, 1, 1, 4],
-    [1, 16, 32, 16, 1, 1, 1, 4],
-    [1, 21, 12, 21, 3, 1, 1, 3],
-    [1, 20, 27, 20, 3, 1, 1, 5],
-)
-
-
-def test_group_conv1d(shape, layout, padding, target, dev, dtype):
-    batch, in_channels, in_width, filters, kernel_size, stride, dilation, groups = shape
-    if layout == "NCW":
-        in_shape = [batch, in_channels, in_width]
-        kernel_shape = [filters, in_channels // groups, kernel_size]
-    else:
-        in_shape = [batch, in_width, in_channels]
-        kernel_shape = [kernel_size, in_channels // groups, filters]
-
-    # reference data
-    a_np = np.random.uniform(size=in_shape).astype(dtype)
-    w_np = np.random.uniform(size=kernel_shape).astype(dtype)
-    if layout == "NWC":
-        np_in = np.transpose(a_np, [0, 2, 1])
-        np_w = np.transpose(w_np, [2, 1, 0])
-    else:
-        np_in = a_np
-        np_w = w_np
-    b_np = tvm.topi.testing.group_conv1d_ncw_python(np_in, np_w, stride, padding, dilation, groups)
-    if layout == "NWC":
-        b_np = np.transpose(b_np, [0, 2, 1])
-
-    A = te.placeholder(in_shape, name="A", dtype=dtype)
-    W = te.placeholder(kernel_shape, name="W", dtype=dtype)
-
-    fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv1d_implementations[layout])
-    with tvm.target.Target(target):
-        B = fcompute(A, W, stride, padding, dilation, groups, "float32")
-        s = fschedule([B])
-
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-    print(tvm.lower(s, [A, W, B], target))
-
-    func = tvm.build(s, [A, W, B], target)
-    func(a, w, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_conv1d()
diff --git a/tests/python/topi/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/test_topi_conv1d_transpose_ncw.py
deleted file mode 100644
index aa14f739a8bd..000000000000
--- a/tests/python/topi/test_topi_conv1d_transpose_ncw.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for transposed convolution."""
-
-import itertools
-import os
-
-import numpy as np
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-from tvm.topi.utils import get_const_tuple
-
-_conv1d_transpose_ncw_implement = {
-    "generic": (topi.nn.conv1d_transpose_ncw, topi.generic.schedule_conv1d_transpose_ncw),
-    "gpu": (topi.cuda.conv1d_transpose_ncw, topi.cuda.schedule_conv1d_transpose_ncw),
-}
-
-
-(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    output_padding,
-) = tvm.testing.parameters(
-    (1, 3, 224, 32, 5, 1, 0, (0,)),
-    (1, 3, 224, 32, 7, 1, 2, (0,)),
-    (1, 3, 224, 32, 5, 2, 1, (0,)),
-    (1, 3, 224, 32, 5, 2, 1, (1,)),
-    (1, 3, 224, 32, 5, 2, 0, (0,)),
-    (1, 32, 32, 128, 5, 1, 0, (0,)),
-    (1, 32, 32, 128, 5, 2, 1, (0,)),
-    (1, 1, 1024, 1, 512, 1, 256, (0,)),
-    (1, 1, 1024, 1, 512, 2, 256, (0,)),
-    (1, 1, 1024, 1, 512, 5, 256, (0,)),
-    (1, 1, 1024, 1, 512, 5, 256, (3,)),
-    (1, 2, 1024, 1, 128, 128, 0, (0,)),
-    (1, 1, 1024, 2, 128, 128, 0, (0,)),
-    (1, 1, 1024, 2, 2, 2, 0, (0,)),
-    (1, 1, 10, 1, 5, 1, (0, 3), (0,)),
-    (1, 1, 10, 1, 5, 1, (1, 3), (0,)),
-    (1, 1, 10, 1, 5, 1, (2, 3), (0,)),
-    (1, 257, 128, 1, 512, 128, 256, (0,)),
-)
-
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding
-):
-    dtype = "float32"
-    a_shape = (batch, in_channel, in_size)
-    w_shape = (in_channel, num_filter, kernel)
-
-    a_np = np.random.uniform(size=a_shape).astype(dtype)
-    w_np = np.random.uniform(size=w_shape).astype(dtype)
-    b_np = tvm.topi.testing.conv1d_transpose_ncw_python(a_np, w_np, stride, padding, output_padding)
-    c_np = np.maximum(b_np, 0)
-    return a_np, w_np, b_np, c_np
-
-
-@tvm.testing.known_failing_targets("vulkan")
-def test_conv1d_transpose_ncw(
-    target,
-    dev,
-    ref_data,
-    dtype,
-    stride,
-    padding,
-    output_padding,
-):
-
-    a_np, w_np, b_np, c_np = ref_data
-
-    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-    W = te.placeholder(w_np.shape, name="W", dtype=dtype)
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_transpose_ncw_implement)
-        B = fcompute(A, W, stride, padding, A.dtype, output_padding)
-        C = topi.nn.relu(B)
-        s1 = fschedule([B])
-        s2 = fschedule([C])
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-    func1 = tvm.build(s1, [A, W, B], target)
-    func2 = tvm.build(s2, [A, W, C], target)
-    func1(a, w, b)
-    func2(a, w, c)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-    tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_conv2d_NCHWc.py b/tests/python/topi/test_topi_conv2d_NCHWc.py
deleted file mode 100644
index 007f2a5c6a16..000000000000
--- a/tests/python/topi/test_topi_conv2d_NCHWc.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test for NCHW[x]c convolution"""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-
-
-def _transform_data(data, bn):
-    # NCHW -> NCHW[x]c
-    batch_size, channel, height, width = data.shape
-    data = np.reshape(data, (batch_size, channel // bn, bn, height, width))
-    data = np.transpose(data, (0, 1, 3, 4, 2))
-    return data
-
-
-def _transform_kernel(kernel, ic_bn, oc_bn):
-    # OIHW -> OIHW[x]i[x]o
-    out_channel, in_channel, kh, kw = kernel.shape
-    kernel = np.reshape(kernel, (out_channel // oc_bn, oc_bn, in_channel // ic_bn, ic_bn, kh, kw))
-    kernel = np.transpose(kernel, (0, 2, 4, 5, 3, 1))
-    return kernel
-
-
-def _transform_bias(bias, bn):
-    # [num_filter, 1, 1] -> [num_filter//bn, 1, 1, bn]
-    num_filter, h, w = bias.shape
-    bias = np.reshape(bias, (num_filter // bn, bn, h, w))
-    bias = np.transpose(bias, (0, 2, 3, 1))
-    return bias
-
-
-def verify_conv2d_NCHWc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-    groups=1,
-    dtype="float32",
-):
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-    padding_sum = pad_top + pad_left + pad_bottom + pad_right
-    in_height = in_width = in_size
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum)
-    )
-
-    # for testing functionality,
-    # we choose arbitrary block size that can divide the channel,
-    # regardless of the performance.
-    oc_block = 1
-    for bn in range(16, 0, -1):
-        if num_filter % bn == 0:
-            oc_block = bn
-            break
-
-    ic_block = 1
-    for bn in range(oc_block, 0, -1):
-        if in_channel % bn == 0:
-            ic_block = bn
-            break
-
-    A = te.placeholder((batch, in_channel // ic_block, in_height, in_width, ic_block), name="A")
-    W = te.placeholder(
-        (
-            num_filter // oc_block,
-            in_channel // ic_block // groups,
-            kernel,
-            kernel,
-            ic_block,
-            oc_block,
-        ),
-        name="W",
-    )
-    bias = te.placeholder((num_filter // oc_block, 1, 1, oc_block), name="bias")
-
-    @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype)
-        w_np = np.random.uniform(size=(num_filter, in_channel // groups, kernel, kernel)).astype(
-            dtype
-        )
-        b_np = np.random.uniform(size=(num_filter, 1, 1)).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups)
-        if add_bias:
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return (
-            _transform_data(a_np, ic_block),
-            _transform_kernel(w_np, ic_block, oc_block),
-            _transform_bias(b_np, oc_block),
-            _transform_data(c_np, oc_block),
-        )
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            C = topi.x86.conv2d_NCHWc(
-                A,
-                W,
-                (stride, stride),
-                padding,
-                (dilation, dilation),
-                "NCHW%dc" % ic_block,
-                "NCHW%dc" % oc_block,
-                dtype,
-            )
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = topi.x86.schedule_conv2d_NCHWc([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
-
-    # test llvm only for now since conv2d_NCHWc implement is missing in other backend.
-    for device in ["llvm"]:
-        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
-            check_device(device)
-
-
-def test_conv2d_NCHWc():
-    # ResNet18 workloads
-    verify_conv2d_NCHWc(1, 3, 224, 64, 7, 2, 3)
-    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 64, 56, 64, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 64, 56, 128, 3, 2, 1)
-    verify_conv2d_NCHWc(1, 64, 56, 128, 1, 2, 0)
-    verify_conv2d_NCHWc(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 128, 28, 256, 3, 2, 1)
-    verify_conv2d_NCHWc(1, 128, 28, 256, 1, 2, 0)
-    verify_conv2d_NCHWc(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 256, 14, 512, 3, 2, 1)
-    verify_conv2d_NCHWc(1, 256, 14, 512, 1, 2, 0)
-    verify_conv2d_NCHWc(1, 512, 7, 512, 3, 1, 1)
-
-    # bias, relu
-    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_relu=True)
-    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_bias=True)
-    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
-
-    # dilation
-    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, dilation=2)
-
-    # batch size
-    verify_conv2d_NCHWc(4, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_NCHWc(9, 64, 56, 64, 3, 1, 1)
-
-    # groups
-    verify_conv2d_NCHWc(1, 2048, 10, 2048, 3, 1, 1, groups=128)
-
-    # weird workloads
-    verify_conv2d_NCHWc(2, 2, 2, 2, 2, 2, 2)
-    verify_conv2d_NCHWc(3, 3, 3, 3, 3, 3, 3)
-    verify_conv2d_NCHWc(4, 4, 4, 4, 4, 4, 4)
-    verify_conv2d_NCHWc(5, 5, 5, 5, 5, 5, 5)
-    verify_conv2d_NCHWc(6, 6, 6, 6, 6, 6, 6)
-
-    # disable these tests due to some bugs of llvm with nvptx
-    # verify_conv2d_NCHWc(1, 1, 1, 1, 1, 1, 1, dilation=1)
-    # verify_conv2d_NCHWc(1, 1, 1, 1, 1, 1, 1, dilation=2)
-    # verify_conv2d_NCHWc(2, 13, 71, 59, 3, 1, 1)
-
-    # inception v3 workloads
-    verify_conv2d_NCHWc(1, 3, 299, 32, 3, 2, 0)
-    verify_conv2d_NCHWc(1, 32, 149, 32, 3, 1, 0)
-    verify_conv2d_NCHWc(1, 32, 147, 64, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 64, 73, 80, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 80, 73, 192, 3, 1, 0)
-    verify_conv2d_NCHWc(1, 192, 35, 64, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 192, 35, 48, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 48, 35, 64, 5, 1, 2)
-    verify_conv2d_NCHWc(1, 64, 35, 96, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 96, 35, 96, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 192, 35, 32, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 256, 35, 64, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 256, 35, 48, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 288, 35, 64, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 288, 35, 48, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 288, 35, 384, 3, 2, 0)
-    verify_conv2d_NCHWc(1, 96, 35, 96, 3, 2, 0)
-    verify_conv2d_NCHWc(1, 768, 17, 192, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 768, 17, 128, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 128, 17, 128, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 128, 17, 192, 7, 1, 3)
-    verify_conv2d_NCHWc(1, 128, 17, 128, 7, 1, 3)
-    verify_conv2d_NCHWc(1, 128, 17, 192, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 768, 17, 160, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 160, 17, 160, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 160, 17, 192, 7, 1, 3)
-    verify_conv2d_NCHWc(1, 160, 17, 160, 7, 1, 3)
-    verify_conv2d_NCHWc(1, 160, 17, 192, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 192, 17, 192, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 192, 17, 192, 7, 1, 3)
-    verify_conv2d_NCHWc(1, 192, 17, 320, 3, 2, 0)
-    verify_conv2d_NCHWc(1, 192, 17, 192, 3, 2, 0)
-    verify_conv2d_NCHWc(1, 1280, 8, 320, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 1280, 8, 384, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 384, 8, 384, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 384, 8, 384, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 1280, 8, 448, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 448, 8, 384, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 1280, 8, 192, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 2048, 8, 320, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 2048, 8, 384, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 2048, 8, 448, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 2048, 8, 192, 1, 1, 0)
-    verify_conv2d_NCHWc(1, 1024, 19, 84, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 2048, 10, 126, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 512, 5, 126, 3, 1, 1)
-    verify_conv2d_NCHWc(1, 256, 3, 126, 3, 1, 1)
-
-    # Asymmetric padding
-    verify_conv2d_NCHWc(1, 32, 17, 64, 7, 2, (0, 0, 1, 1))
-    verify_conv2d_NCHWc(1, 32, 35, 128, 3, 1, (3, 3, 2, 2))
-    verify_conv2d_NCHWc(1, 32, 35, 32, 1, 1, (1, 2, 2, 1))
-    verify_conv2d_NCHWc(1, 32, 17, 192, 1, 1, (1, 2))
-    verify_conv2d_NCHWc(1, 32, 8, 32, 3, 1, (3, 1))
-    verify_conv2d_NCHWc(1, 128, 8, 384, 3, 1, (0, 2))
-    verify_conv2d_NCHWc(1, 32, 8, 32, 1, 1, "VALID")
-    verify_conv2d_NCHWc(1, 388, 8, 32, 3, 1, "VALID")
-    verify_conv2d_NCHWc(1, 512, 19, 32, 1, 1, "SAME")
-    verify_conv2d_NCHWc(1, 32, 10, 32, 2, 1, "SAME")
-    verify_conv2d_NCHWc(1, 32, 8, 32, 3, 1, (1, 2, 2, 1), add_relu=True)
-    verify_conv2d_NCHWc(1, 32, 8, 32, 5, 2, (1, 3), add_bias=True)
-    verify_conv2d_NCHWc(1, 32, 8, 32, 3, 1, "VALID", add_bias=True, add_relu=True)
-    verify_conv2d_NCHWc(1, 32, 8, 32, 24, 1, "SAME", add_bias=True, add_relu=True)
-
-
-if __name__ == "__main__":
-    test_conv2d_NCHWc()
diff --git a/tests/python/topi/test_topi_conv2d_hwcn.py b/tests/python/topi/test_topi_conv2d_hwcn.py
deleted file mode 100644
index ab0cf741960d..000000000000
--- a/tests/python/topi/test_topi_conv2d_hwcn.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-import os
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-_conv2d_hwcn_implement = {
-    "generic": (topi.nn.conv2d_hwcn, topi.generic.schedule_conv2d_hwcn),
-    "gpu": (topi.cuda.conv2d_hwcn, topi.cuda.schedule_conv2d_hwcn),
-    "opencl": (topi.cuda.conv2d_hwcn, topi.cuda.schedule_conv2d_hwcn),
-}
-
-
-def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1):
-    in_height = in_width = in_size
-
-    A = te.placeholder((in_height, in_width, in_channel, batch), name="A")
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
-    B = te.placeholder((1, num_filter, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    b_shape = get_const_tuple(B.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_hwcn.verify_hwcn")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=b_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-        c1_np = tvm.topi.testing.conv2d_hwcn_python(a_np, dw_np, stride, padding)
-        c2_np = c1_np + b_np
-        c3_np = np.maximum(c2_np, 0)
-        return a_np, w_np, b_np, c1_np, c2_np, c3_np
-
-    a_np, w_np, b_np, c1_np, c2_np, c3_np = get_ref_data()
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_hwcn_implement)
-            t_conv = fcompute(A, W, stride, padding, dilation)
-            t_bias = topi.add(t_conv, B)
-            t_relu = topi.nn.relu(t_bias)
-            s1 = fschedule([t_conv])
-            s2 = fschedule([t_bias])
-            s3 = fschedule([t_relu])
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-
-        conv_out = tvm.nd.array(np.zeros(get_const_tuple(t_conv.shape), dtype=t_conv.dtype), dev)
-        bias_out = tvm.nd.array(np.zeros(get_const_tuple(t_bias.shape), dtype=t_bias.dtype), dev)
-        relu_out = tvm.nd.array(np.zeros(get_const_tuple(t_relu.shape), dtype=t_relu.dtype), dev)
-        func1 = tvm.build(s1, [A, W, t_conv], target)
-        func2 = tvm.build(s2, [A, W, B, t_bias], target)
-        func3 = tvm.build(s3, [A, W, B, t_relu], target)
-        func1(a, w, conv_out)
-        func2(a, w, b, bias_out)
-        func3(a, w, b, relu_out)
-        tvm.testing.assert_allclose(conv_out.numpy(), c1_np, rtol=1e-5)
-        tvm.testing.assert_allclose(bias_out.numpy(), c2_np, rtol=1e-5)
-        tvm.testing.assert_allclose(relu_out.numpy(), c3_np, rtol=1e-5)
-
-    for target in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_conv2d_hwcn():
-    verify_conv2d_hwcn(1, 256, 32, 128, 3, 1, "SAME")
-    verify_conv2d_hwcn(1, 256, 32, 256, 3, 1, "SAME")
-    verify_conv2d_hwcn(4, 128, 16, 128, 5, 2, "SAME")
-    verify_conv2d_hwcn(4, 128, 16, 256, 5, 2, "SAME")
-    verify_conv2d_hwcn(1, 256, 32, 128, 3, 1, "VALID")
-    verify_conv2d_hwcn(1, 256, 32, 256, 3, 1, "VALID")
-    verify_conv2d_hwcn(4, 128, 16, 128, 5, 2, "VALID")
-    verify_conv2d_hwcn(4, 128, 16, 256, 5, 2, "VALID")
-    # dilation = 2
-    verify_conv2d_hwcn(1, 256, 32, 256, 3, 1, "SAME", dilation=2)
-    # Pass stride as tuple
-    verify_conv2d_hwcn(1, 256, 32, 128, 3, (1, 1), "SAME")
-    verify_conv2d_hwcn(1, 256, 32, 256, 3, (1, 1), "SAME")
-    verify_conv2d_hwcn(4, 128, 16, 128, 5, (2, 2), "SAME")
-    verify_conv2d_hwcn(4, 128, 16, 256, 5, (2, 2), "SAME")
-    verify_conv2d_hwcn(1, 256, 32, 128, 3, (1, 1), "VALID")
-    verify_conv2d_hwcn(1, 256, 32, 256, 3, (1, 1), "VALID")
-    verify_conv2d_hwcn(4, 128, 16, 128, 5, (2, 2), "VALID")
-    verify_conv2d_hwcn(4, 128, 16, 256, 5, (2, 2), "VALID")
-
-
-if __name__ == "__main__":
-    test_conv2d_hwcn()
diff --git a/tests/python/topi/test_topi_conv2d_hwnc_tensorcore.py b/tests/python/topi/test_topi_conv2d_hwnc_tensorcore.py
deleted file mode 100644
index 1dbff816699e..000000000000
--- a/tests/python/topi/test_topi_conv2d_hwnc_tensorcore.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-arguments
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-import os
-import tvm.testing
-import tvm.topi.testing
-from tvm import te, autotvm, topi, relay
-from tvm.contrib.pickle_memoize import memoize
-from tvm.contrib import nvcc
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-
-_conv2d_hwnc_tensorcore_implement = {
-    "cuda": (topi.cuda.conv2d_hwnc_tensorcore, topi.cuda.schedule_conv2d_hwnc_tensorcore)
-}
-
-
-def verify_conv2d_hwnc(
-    batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, dtype="int4"
-):
-    """Test the conv2d with tensorcore for hwnc layout"""
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-    padding_sum = pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-    )
-    # choose dtype from int4, int8
-    assert dtype in ["int4", "int8"]
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((in_height, in_width, batch, in_channel), name="A", dtype=dtype)
-    W = te.placeholder((kernel, kernel, num_filter, in_channel), name="W", dtype=dtype)
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-
-    @memoize("topi.tests.test_topi_conv2d_hwnc.verify_conv2d_hwnc")
-    def get_ref_data():
-        if dtype == "int4":
-            a_np = np.random.randint(low=-8, high=7, size=a_shape).transpose((2, 0, 1, 3))
-            w_np = np.random.randint(low=-8, high=7, size=w_shape)
-            dw_np = topi.testing.dilate_python(
-                w_np.transpose((0, 1, 3, 2)), (1, 1, dilation, dilation)
-            )
-        elif dtype == "int8":
-            a_np = (
-                np.random.randint(low=-128, high=127, size=a_shape)
-                .transpose((2, 0, 1, 3))
-                .astype(dtype)
-            )
-            w_np = np.random.randint(low=-128, high=127, size=w_shape).astype(dtype)
-            dw_np = topi.testing.dilate_python(
-                w_np.transpose((0, 1, 3, 2)), (1, 1, dilation, dilation)
-            )
-
-        c_np = topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-        return a_np, w_np, c_np
-
-    def convert_int32_into_int4(a_int32):
-        """convert int32 values into int4
-        Parameters
-        ----------
-        a_int32 : int
-
-        Return
-        ------
-        a_int4 : int
-        """
-        I, J, K, L = a_int32.shape
-        a_int4 = np.zeros(shape=(I, J, K, L // 8), dtype=np.int32)
-        for i in range(I):
-            for j in range(J):
-                for k in range(K):
-                    for l in range(L // 8):
-                        for m in range(min(8, L - l * 8)):
-                            a_int4[i, j, k, l] = a_int4[i, j, k, l] | (
-                                (a_int32[i, j, k, l * 8 + m] & 0xF) << ((7 - m) * 4)
-                            )
-        return a_int4
-
-    a_np, w_np, c_np = get_ref_data()
-    if dtype == "int4":
-        a_np = convert_int32_into_int4(a_np)
-        w_np = convert_int32_into_int4(w_np)
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        if not nvcc.have_tensorcore(dev.compute_version):
-            print("skip because gpu does not support Tensor Cores")
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement)
-            C = fcompute(A, W, stride, padding, dilation, dtype, "int32")
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev)
-        w = tvm.nd.array(w_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-        func = tvm.build(
-            s,
-            [A, W, C],
-            target,
-            name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-            % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-        )
-        func(a, w, c)
-
-        rtol = 1e-3
-        tvm.testing.assert_allclose(c.numpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol)
-
-    check_target("cuda")
-
-
-def verify_feature_length():
-    np.random.seed(123)
-    target = "cuda"
-    ctx = tvm.device(target)
-
-    batch_size = 32
-
-    input_shape = (32, 512, 7, 7)
-    kernel_shape = (512, 512, 3, 3)
-
-    def get_mod():
-        x = relay.var("x", relay.TensorType(input_shape, "float32"))
-        y = relay.var("y", relay.TensorType(kernel_shape, "float32"))
-        f = relay.Function(
-            [x, y], relay.nn.conv2d(x, y, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3])
-        )
-        mod = tvm.IRModule()
-        mod["main"] = f
-        mod = relay.transform.InferType()(mod)
-        return mod, {}
-
-    mod, params = get_mod()
-    layout_config = relay.transform.LayoutConfig()
-    desired_layouts = {"nn.conv2d": ["HWNC", "default"]}
-    with layout_config:
-        seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
-        with tvm.transform.PassContext(opt_level=3):
-            mod = seq(mod)
-    mod = relay.transform.recast(mod, "int4", "int32")
-
-    tasks = autotvm.task.extract_from_program(
-        mod, target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
-    )
-
-    assert len(tasks) == 1
-    task = tasks[0]
-
-    space = task.config_space
-
-    idx1 = space.get_rand_index()
-    idx2 = space.get_rand_index()
-
-    cfg = space.get(idx1)
-    sch, arg_bufs = task.instantiate(cfg)
-    fea1 = autotvm.feature.get_itervar_feature_flatten(sch, arg_bufs, take_log=True)
-
-    cfg = space.get(idx2)
-    sch, arg_bufs = task.instantiate(cfg)
-    fea2 = autotvm.feature.get_itervar_feature_flatten(sch, arg_bufs, take_log=True)
-
-    assert len(fea1) == len(fea2)
-
-
-@tvm.testing.requires_tensorcore
-def test_conv2d_hwnc_tensorcore():
-    """Test the conv2d with tensorcore for hwnc layout"""
-    verify_conv2d_hwnc(8, 64, 56, 64, 3, 1, 1, dtype="int8")
-    verify_conv2d_hwnc(8, 64, 56, 64, 1, 1, 0, dtype="int4")
-    verify_conv2d_hwnc(8, 64, 56, 128, 3, 2, 1)
-    verify_conv2d_hwnc(8, 64, 56, 64, 1, 2, 0)
-    verify_conv2d_hwnc(8, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_hwnc(8, 128, 28, 256, 3, 2, 1)
-    verify_conv2d_hwnc(8, 128, 28, 256, 1, 2, 0)
-    verify_conv2d_hwnc(8, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_hwnc(8, 256, 14, 512, 3, 2, 1)
-    verify_conv2d_hwnc(8, 256, 14, 512, 1, 2, 0)
-    verify_conv2d_hwnc(8, 512, 9, 512, 3, 1, 1)
-    verify_feature_length()
-
-
-if __name__ == "__main__":
-    test_conv2d_hwnc_tensorcore()
diff --git a/tests/python/topi/test_topi_conv2d_int8.py b/tests/python/topi/test_topi_conv2d_int8.py
deleted file mode 100644
index cc1a16623684..000000000000
--- a/tests/python/topi/test_topi_conv2d_int8.py
+++ /dev/null
@@ -1,623 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.conv2d import _get_workload
-from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8
-from tvm.testing.utils import get_dtype_range
-
-from common import Int8Fallback
-import tvm.testing
-import pytest
-import platform
-
-
-devices = [
-    (
-        "llvm",
-        topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
-        topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu",
-        topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
-        topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
-        topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
-        topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
-        topi.arm_cpu.compute_conv2d_NHWC_quantized_native,
-        topi.arm_cpu.schedule_conv2d_NHWC_quantized_native,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm",
-        topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
-        topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
-    ),
-]
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.parametrize("device", devices)
-@pytest.mark.parametrize(
-    "params",
-    [
-        # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding)
-        (1, 3, 299, 32, 3, 2, "SAME", 1, False, False),
-        (1, 32, 149, 32, 3, 1, "SAME", 2, False, False),
-        (4, 32, 147, 64, 3, 1, "SAME", 1, False, False),
-        (1, 64, 73, 80, 1, 1, "SAME", 1, False, False),
-        (1, 80, 73, 192, 3, 1, "SAME", 1, False, False),
-        (1, 192, 35, 48, 1, 1, "SAME", 1, False, False),
-        (1, 192, 35, 64, 1, 1, "VALID", 1, False, False),
-        (1, 192, 35, 32, 1, 1, "SAME", 1, False, False),
-        (1, 48, 35, 64, 5, 1, "SAME", 1, False, False),
-        (1, 96, 35, 96, 3, 1, "SAME", 1, False, False),
-        (1, 256, 35, 48, 1, 1, "SAME", 1, False, False),
-        (1, 256, 35, 64, 1, 1, "SAME", 1, False, False),
-        (1, 288, 35, 64, 1, 1, "SAME", 1, False, False),
-        (1, 288, 35, 48, 1, 1, "SAME", 1, False, False),
-        (1, 96, 35, 96, 3, 2, "SAME", 1, False, False),
-        (1, 128, 17, 192, 7, 1, "SAME", 2, False, False),
-        (1, 160, 17, 160, 7, 1, "SAME", 1, False, False),
-        (1, 160, 17, 192, 1, 1, "VALID", 1, False, False),
-        (1, 192, 17, 192, 1, 1, "SAME", 1, False, False),
-        (1, 768, 5, 128, 1, 1, "SAME", 1, False, False),
-        (1, 192, 17, 320, 3, 2, "SAME", 1, False, False),
-        (1, 192, 17, 192, 3, 2, "SAME", 1, False, False),
-        (1, 1280, 8, 192, 1, 1, "SAME", 1, False, False),
-        (1, 1280, 8, 384, 1, 1, "SAME", 1, False, False),
-        (1, 1280, 8, 320, 1, 1, "SAME", 1, False, False),
-        (1, 1280, 8, 448, 1, 1, "SAME", 1, False, False),
-        (1, 384, 8, 384, 1, 1, "SAME", 1, False, False),
-        (1, 384, 8, 384, 3, 1, "SAME", 1, False, False),
-        (1, 448, 8, 384, 3, 1, "VALID", 1, False, False),
-        (1, 2048, 8, 320, 1, 1, "SAME", 1, False, False),
-        (1, 2048, 8, 448, 1, 1, "SAME", 1, True, True),
-        (1, 2048, 8, 192, 1, 1, "SAME", 1, True, False),
-        # A trouble case for native schedule
-        (1, 8, 1, 24, 1, 1, "SAME", 1, False, False),
-    ],
-)
-def test_conv2d_NHWC_gemm_int8(params, device):
-
-    with Int8Fallback():
-        target, compute, schedule = device
-
-        (
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            add_bias,
-            add_relu,
-        ) = params
-
-        dtype = "int8"
-
-        # TODO(ekalda): These combinations hang during compilation
-        failing_cases = [
-            (devices[1], (1, 128, 17, 192, 7, 1, "SAME", 2, False, False)),
-            (devices[1], (1, 160, 17, 160, 7, 1, "SAME", 1, False, False)),
-            (
-                devices[1],
-                (1, 448, 8, 384, 3, 1, "VALID", 1, False, False),
-            ),  # this one passes but is just incredibly slow
-        ]
-        if (device, params) in failing_cases:
-            pytest.skip("Skipping because this test will hang")
-
-        print("Compiling for target: %s" % target)
-
-        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-        padding_sum = pad_top + pad_left + pad_bottom + pad_right
-        print(
-            "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-            % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-        )
-
-        in_height = in_width = in_size
-
-        a_shape = (batch, in_height, in_width, in_channel)
-        w_shape = (kernel, kernel, in_channel, num_filter)
-        bias_shape = (num_filter,)
-
-        @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NHWC_gemm_int8")
-        def get_ref_data():
-            input_min, input_max = get_dtype_range(dtype)
-            a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype(dtype)
-            w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype(dtype)
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-            c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype)
-
-            if add_bias:
-                b_np = np.random.uniform(size=bias_shape).astype(dtype)
-                c_np += b_np
-            if add_relu:
-                c_np = np.maximum(c_np, 0)
-
-            return a_np, w_np, b_np, c_np
-
-        with tvm.target.Target(target) as tvm_target:
-            A = te.placeholder(a_shape, name="A", dtype=dtype)
-            W = te.placeholder(w_shape, name="W", dtype=dtype)
-            bias = te.placeholder(bias_shape, name="bias", dtype=dtype)
-            C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = schedule([C])
-
-            build_args = [A, W, bias, C] if add_bias else [A, W, C]
-
-            func = tvm.build(
-                s,
-                build_args,
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding_sum,
-                    dilation,
-                ),
-            )
-
-            build_only = tvm_target.features.is_aarch64 and (platform.machine() != "aarch64")
-
-            if build_only:
-                return
-
-            print("Running on target: %s" % target)
-
-            dev = tvm.device(target, 0)
-            a_np, w_np, b_np, c_np = get_ref_data()
-            a = tvm.nd.array(a_np, dev)
-            w = tvm.nd.array(w_np, dev)
-            b = tvm.nd.array(b_np, dev)
-            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-            run_args = [a, w, b, c] if add_bias else [a, w, c]
-            func(*run_args)
-
-            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-@pytest.mark.parametrize("in_dtype", ["int8", "uint8"])
-@pytest.mark.parametrize(
-    "params",
-    [
-        # ResNet18 workloads where channels in / out are multiple of oc_block_factor
-        (1, 64, 56, 64, 3, 1, 1, 1, False, False),
-        (1, 64, 56, 64, 1, 1, 0, 1, False, False),
-        (1, 64, 56, 128, 3, 2, 1, 1, False, False),
-        (1, 64, 56, 128, 1, 2, 0, 1, False, False),
-        (1, 128, 28, 128, 3, 1, 1, 1, False, False),
-        (1, 128, 28, 256, 3, 2, 1, 1, False, False),
-        (1, 128, 28, 256, 1, 2, 0, 1, False, False),
-        (1, 256, 14, 256, 3, 1, 1, 1, False, False),
-        (1, 256, 14, 512, 3, 2, 1, 1, False, False),
-        (1, 256, 14, 512, 1, 2, 0, 1, False, False),
-        (1, 512, 7, 512, 3, 1, 1, 1, False, False),
-        # bias, relu
-        (1, 64, 56, 64, 3, 1, 1, 1, False, True),
-        (1, 64, 56, 64, 3, 1, 1, 1, True, False),
-        (1, 64, 56, 64, 3, 1, 1, 1, True, True),
-        # dilation = 2
-        (1, 64, 56, 64, 3, 1, 1, 2, False, False),
-        # batch size
-        (4, 64, 56, 64, 3, 1, 1, 1, False, False),
-        (9, 64, 56, 64, 3, 1, 1, 1, False, False),
-        # weird workloads
-        (4, 4, 4, 8, 4, 4, 4, 1, False, False),
-        # inception v3 workloads where channels in / out are multiple of oc_block_factor
-        (1, 32, 149, 32, 3, 1, 0, 1, False, False),
-        (1, 32, 147, 64, 3, 1, 1, 1, False, False),
-        (1, 64, 73, 80, 1, 1, 0, 1, False, False),
-        (1, 80, 73, 192, 3, 1, 0, 1, False, False),
-        (1, 192, 35, 64, 1, 1, 0, 1, False, False),
-        (1, 192, 35, 48, 1, 1, 0, 1, False, False),
-        (1, 48, 35, 64, 5, 1, 2, 1, False, False),
-        (1, 64, 35, 96, 3, 1, 1, 1, False, False),
-        (1, 96, 35, 96, 3, 1, 1, 1, False, False),
-        (1, 192, 35, 32, 1, 1, 0, 1, False, False),
-        (1, 256, 35, 64, 1, 1, 0, 1, False, False),
-        (1, 256, 35, 48, 1, 1, 0, 1, False, False),
-        (1, 288, 35, 64, 1, 1, 0, 1, False, False),
-        (1, 288, 35, 48, 1, 1, 0, 1, False, False),
-        (1, 288, 35, 384, 3, 2, 0, 1, False, False),
-        (1, 96, 35, 96, 3, 2, 0, 1, False, False),
-        (1, 768, 17, 192, 1, 1, 0, 1, False, False),
-        (1, 768, 17, 128, 1, 1, 0, 1, False, False),
-        (1, 128, 17, 128, 1, 1, 0, 1, False, False),
-        (1, 128, 17, 192, 7, 1, 3, 1, False, False),
-        (1, 128, 17, 128, 7, 1, 3, 1, False, False),
-        (1, 128, 17, 192, 1, 1, 0, 1, False, False),
-        (1, 768, 17, 160, 1, 1, 0, 1, False, False),
-        (1, 160, 17, 160, 1, 1, 0, 1, False, False),
-        (1, 160, 17, 192, 7, 1, 3, 1, False, False),
-        (1, 160, 17, 160, 7, 1, 3, 1, False, False),
-        (1, 160, 17, 192, 1, 1, 0, 1, False, False),
-        (1, 192, 17, 192, 1, 1, 0, 1, False, False),
-        (1, 192, 17, 192, 7, 1, 3, 1, False, False),
-        (1, 192, 17, 320, 3, 2, 0, 1, False, False),
-        (1, 192, 17, 192, 3, 2, 0, 1, False, False),
-        (1, 1280, 8, 320, 1, 1, 0, 1, False, False),
-        (1, 1280, 8, 384, 1, 1, 0, 1, False, False),
-        (1, 384, 8, 384, 1, 1, 0, 1, False, False),
-        (1, 384, 8, 384, 3, 1, 1, 1, False, False),
-        (1, 1280, 8, 448, 1, 1, 0, 1, False, False),
-        (1, 448, 8, 384, 3, 1, 1, 1, False, False),
-        (1, 1280, 8, 192, 1, 1, 0, 1, False, False),
-        (1, 2048, 8, 320, 1, 1, 0, 1, False, False),
-        (1, 2048, 8, 384, 1, 1, 0, 1, False, False),
-        (1, 2048, 8, 448, 1, 1, 0, 1, False, False),
-        (1, 2048, 8, 192, 1, 1, 0, 1, False, False),
-        (1, 1024, 19, 88, 3, 1, 1, 1, False, False),
-        # batch > 1
-        (7, 32, 149, 32, 3, 1, 0, 1, False, False),
-        (8, 32, 149, 32, 3, 1, 0, 1, False, False),
-        (32, 32, 149, 32, 3, 1, 0, 1, False, False),
-        # Asymmetric padding
-        (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False),
-        (1, 64, 8, 128, 3, 1, (3, 3, 2, 2), 1, False, False),
-        (1, 64, 8, 64, 1, 1, (1, 2, 2, 1), 1, False, False),
-        (1, 64, 17, 192, 1, 1, (1, 2), 1, False, False),
-        (1, 64, 8, 64, 3, 1, (3, 1), 1, False, False),
-        (1, 128, 8, 384, 3, 1, (0, 2), 1, False, False),
-        (1, 64, 8, 64, 1, 1, "VALID", 1, False, False),
-        (1, 392, 8, 64, 3, 1, "VALID", 1, False, False),
-        (1, 512, 19, 64, 1, 1, "SAME", 1, False, False),
-        (1, 64, 16, 32, 2, 1, "SAME", 1, False, False),
-        (1, 64, 8, 64, 3, 1, (1, 2, 2, 1), 1, False, True),
-        (1, 64, 8, 64, 5, 2, (1, 3), 1, True, False),
-        (1, 64, 56, 64, 3, 1, "VALID", 1, True, True),
-        (1, 64, 56, 64, 24, 1, "SAME", 1, True, True),
-    ],
-)
-def test_conv2d_NCHWc_int8(in_dtype, params):
-    with Int8Fallback():
-        (
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            add_bias,
-            add_relu,
-        ) = params
-        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-        padding_sum = pad_top + pad_left + pad_bottom + pad_right
-        print(
-            "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-            % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-        )
-
-        in_height = in_width = in_size
-
-        A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype)
-        W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype)
-
-        a_shape = get_const_tuple(A.shape)
-        w_shape = get_const_tuple(W.shape)
-        dtype = A.dtype
-        out_dtype = "int32" if in_dtype == "int8" else "uint32"
-        input_min, input_max = get_dtype_range(in_dtype)
-
-        def check_target(target, compute, schedule, oc_block_factor, build_only):
-            dev = tvm.device(target, 0)
-            if not tvm.testing.device_enabled(target):
-                pytest.skip(reason="Skip because %s is not enabled" % target)
-            if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
-                pytest.skip(reason="Skip because %s is not enabled" % target)
-
-            bias = te.placeholder(
-                (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype
-            )
-            bias_shape = get_const_tuple(bias.shape)
-
-            @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NCHWc_int8")
-            def get_ref_data():
-                a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype(
-                    out_dtype
-                )
-                w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype(
-                    out_dtype
-                )
-                b_np = np.random.uniform(size=bias_shape).astype(out_dtype)
-                dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-                c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(
-                    out_dtype
-                )
-
-                # convert to NCHWc
-                _, _, out_height, out_width = c_np.shape
-                c_np = c_np.reshape(
-                    (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)
-                ).transpose(0, 1, 3, 4, 2)
-
-                if add_bias:
-                    b_np = np.random.uniform(size=bias_shape).astype(out_dtype)
-                    c_np += b_np
-                if add_relu:
-                    c_np = np.maximum(c_np, 0)
-
-                return a_np, w_np, b_np, c_np
-
-            with tvm.target.Target(target):
-                C = compute(
-                    A,
-                    W,
-                    (stride, stride),
-                    padding,
-                    (dilation, dilation),
-                    "NCHW",
-                    "NCHW",
-                    out_dtype,
-                )
-                if add_bias:
-                    C = topi.add(C, bias)
-                if add_relu:
-                    C = topi.nn.relu(C)
-                s = schedule([C])
-
-            compile_args = [A, W, bias, C] if add_bias else [A, W, C]
-
-            func = tvm.build(
-                s,
-                compile_args,
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-
-            if build_only:
-                return
-
-            a_np, w_np, b_np, c_np = get_ref_data()
-
-            a = tvm.nd.array(a_np.astype(dtype), dev)
-            w = tvm.nd.array(w_np.astype(dtype), dev)
-            b = tvm.nd.array(b_np.astype(out_dtype), dev)
-            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-            run_args = [a, w, b, c] if add_bias else [a, w, c]
-
-            print("Running on target: %s" % target)
-
-            func(*run_args)
-
-            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-        targets = [
-            (
-                "cuda",
-                lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o),
-                topi.cuda.schedule_conv2d_NCHWc_int8,
-                4,
-                False,
-            ),
-            # Disable on CI since it does not support spirv int8 dot product
-            # (
-            #     "vulkan -from_device=0",
-            #     lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o),
-            #     topi.cuda.schedule_conv2d_NCHWc_int8,
-            #     4,
-            #     False,
-            # ),
-        ]
-
-        build_only_aarch64 = platform.machine() != "aarch64"
-
-        targets.append(
-            (
-                "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod",
-                topi.arm_cpu.conv2d_NCHWc_int8,
-                topi.arm_cpu.schedule_conv2d_NCHWc_int8,
-                8,
-                build_only_aarch64,
-            )
-        )
-
-        if in_dtype == "int8":
-            targets += [
-                (
-                    "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon",
-                    topi.arm_cpu.conv2d_NCHWc_int8,
-                    topi.arm_cpu.schedule_conv2d_NCHWc_int8,
-                    8,
-                    build_only_aarch64,
-                ),
-                (
-                    "rocm -mattr=+dotprod",
-                    lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(
-                        a, w, s, p, d, l, o
-                    ),
-                    topi.cuda.schedule_conv2d_NCHWc_int8,
-                    4,
-                    False,
-                ),
-            ]
-
-        for target, compute, schedule, oc_block_factor, build_only in targets:
-            check_target(target, compute, schedule, oc_block_factor, build_only)
-
-
-# Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So, just
-# performing basic testing - one test for all different scenarios - batch, dilation etc..
-@pytest.mark.parametrize("in_dtype", ["int8", "uint8"])
-@pytest.mark.parametrize(
-    "params",
-    [
-        (1, 64, 56, 64, 3, 1, 1, 1, False, False),
-        (1, 64, 56, 64, 3, 1, 1, 1, False, True),
-        (1, 64, 56, 64, 3, 1, 1, 2, False, False),
-        (9, 64, 56, 64, 3, 1, 1, 1, False, False),
-        (4, 4, 4, 4, 4, 4, 4, 1, False, False),
-        (1, 32, 149, 32, 3, 1, 0, 1, False, False),
-        (7, 32, 149, 32, 3, 1, 0, 1, False, False),
-        (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False),
-        (1, 32, 35, 64, 7, 2, (0, 0, 2, 2), 1, False, False),
-    ],
-)
-def test_conv2d_nchw_int8(in_dtype, params):
-    with Int8Fallback():
-        (
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            add_bias,
-            add_relu,
-        ) = params
-        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-        padding_sum = pad_top + pad_left + pad_bottom + pad_right
-        print(
-            "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-            % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-        )
-
-        in_height = in_width = in_size
-
-        A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype)
-        W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype)
-        bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype)
-
-        a_shape = get_const_tuple(A.shape)
-        w_shape = get_const_tuple(W.shape)
-        bias_shape = get_const_tuple(bias.shape)
-        dtype = A.dtype
-
-        @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_nchw_int8")
-        def get_ref_data():
-            a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
-            w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-            c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype)
-
-            if add_bias:
-                b_np = np.random.uniform(size=bias_shape).astype(dtype)
-                c_np += b_np
-            if add_relu:
-                c_np = np.maximum(c_np, 0)
-
-            return a_np, w_np, b_np, c_np
-
-        a_np, w_np, b_np, c_np = get_ref_data()
-
-        def verify_workload_padding():
-            _, _, _, out_width = get_const_tuple(c_np.shape)
-            wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)
-
-            # for testing functionality,
-            # we choose arbitrary int32_lanes and num_int8_elements can divide the channel,
-            # regardless of the performance.
-            int32_lanes, num_int8_elements = num_filter, in_channel
-
-            # check if tile_ow candidates are the factors of the right output weight.
-            cfg = autotvm.get_config()
-            fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements)
-            ow_tile = np.prod(cfg["tile_ow"].size)
-
-            tvm.testing.assert_allclose(ow_tile, out_width)
-
-        def check_target(target):
-            dev = tvm.device(target, 0)
-            if not tvm.testing.device_enabled(target):
-                pytest.skip("Skip because %s is not enabled" % target)
-            if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
-                pytest.skip("Skip because int8 intrinsics are not available")
-
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                C = topi.cuda.conv2d_nchw_int8(
-                    A, W, (stride, stride), padding, (dilation, dilation), dtype
-                )
-                if add_bias:
-                    C = topi.add(C, bias)
-                if add_relu:
-                    C = topi.nn.relu(C)
-                s = topi.cuda.schedule_conv2d_nchw_int8([C])
-
-            build_args = [A, W, bias, C] if add_bias else [A, W, C]
-
-            func = tvm.build(
-                s,
-                build_args,
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding_sum,
-                    dilation,
-                ),
-            )
-
-            a = tvm.nd.array(a_np, dev)
-            w = tvm.nd.array(w_np, dev)
-            b = tvm.nd.array(b_np, dev)
-            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-            run_args = [a, w, b, c] if add_bias else [a, w, c]
-
-            func(*run_args)
-
-            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-        verify_workload_padding()
-
-        check_target("cuda")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_conv2d_nchw.py b/tests/python/topi/test_topi_conv2d_nchw.py
deleted file mode 100644
index e0c0b830b5f2..000000000000
--- a/tests/python/topi/test_topi_conv2d_nchw.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-
-import sys
-
-import pytest
-import numpy as np
-
-import tvm
-from tvm import autotvm, te, topi
-import tvm.topi.testing
-from tvm.contrib import cudnn
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.conv2d import _get_workload
-from tvm.topi.x86.conv2d_avx_common import _fallback_schedule
-
-import tvm.testing
-
-dtype = tvm.testing.parameter("float16", "float32")
-random_seed = tvm.testing.parameter(0)
-
-
-@tvm.testing.fixture
-def input_shape(batch, in_channel, in_size):
-    return (batch, in_channel, in_size, in_size)
-
-
-@tvm.testing.fixture
-def weight_shape(num_filter, in_channel, kernel):
-    return (num_filter, in_channel, kernel, kernel)
-
-
-@tvm.testing.fixture
-def bias_shape(num_filter):
-    return (num_filter, 1, 1)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    random_seed,
-    input_shape,
-    weight_shape,
-    bias_shape,
-    dtype,
-    stride,
-    padding,
-    dilation,
-    add_bias,
-    apply_relu,
-):
-    np.random.seed(random_seed)
-
-    # scipy.signal.convolve2d does not support float16 data types, and
-    # the python fallback is too slow for general use.  Computing
-    # ref_data in float32 will have fewer rounding errors than the TVM
-    # float16 compute, but those vary based on schedule anyways.
-    conv_dtype = "float32" if dtype == "float16" else dtype
-
-    a_np = np.random.uniform(size=input_shape).astype(dtype)
-    w_np = np.random.uniform(size=weight_shape).astype(dtype)
-    b_np = np.random.uniform(size=bias_shape).astype(dtype)
-    dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-    c_np = tvm.topi.testing.conv2d_nchw_python(
-        a_np.astype(conv_dtype), dw_np.astype(conv_dtype), stride, padding
-    ).astype(dtype)
-
-    if add_bias:
-        c_np = c_np + b_np
-    if apply_relu:
-        c_np = np.maximum(c_np, 0)
-    return a_np, w_np, b_np, c_np
-
-
-class BaseConv2DTests:
-    add_bias = tvm.testing.parameter(False)
-    apply_relu = tvm.testing.parameter(False)
-    dilation = tvm.testing.parameter(1)
-    batch = tvm.testing.parameter(1)
-
-    def test_conv2d_nchw(
-        self,
-        target,
-        dev,
-        batch,
-        in_channel,
-        in_size,
-        num_filter,
-        kernel,
-        stride,
-        padding,
-        dtype,
-        ref_data,
-        dilation,
-        add_bias,
-        apply_relu,
-    ):
-        target = tvm.target.Target(target)
-        is_cudnn_target = target.kind.name == "cuda" and "cudnn" in target.attrs.get("libs", [])
-
-        if target.kind.name == "vulkan" and dtype == "float16":
-            if not target.attrs.get("supports_float16", False) or not target.attrs.get(
-                "supports_16bit_buffer", False
-            ):
-                pytest.xfail("Vulkan device does not support float16")
-
-        if (
-            target.kind.name == "cuda"
-            and dtype == "float16"
-            and not tvm.contrib.nvcc.have_fp16(dev.compute_version)
-        ):
-            pytest.xfail("CUDA float16 intrinsics not available")
-
-        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-        padding_sum = pad_top + pad_left + pad_bottom + pad_right
-
-        has_asymmetric_padding = (pad_top != pad_bottom) or (pad_left != pad_right)
-        if is_cudnn_target and has_asymmetric_padding:
-            pytest.xfail("CuDNN does not support asymmetric padding")
-
-        a_np, w_np, b_np, c_np = ref_data
-
-        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-        W = te.placeholder(w_np.shape, name="W", dtype=dtype)
-        bias = te.placeholder(b_np.shape, name="bias", dtype=dtype)
-
-        if "int" in dtype:
-            tol = {"atol": 0, "rtol": 0}
-        elif dtype == "float32":
-            tol = {"rtol": 1e-4, "atol": 2e-4}
-        elif dtype == "float16":
-            # A summation in float16 with a single accumulator very
-            # quickly runs into large rounding errors.  At some point,
-            # this tolerance should be schedule-dependent for to avoid
-            # false negatives.
-            num_values_summed = in_channel * kernel * kernel
-            gap_size = np.nextafter(c_np.max(), np.inf, dtype=c_np.dtype) - c_np.max()
-            tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}
-
-        with autotvm.tophub.context(target):  # load tophub pre-tuned parameters
-            if is_cudnn_target:
-                fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn
-            else:
-                fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(target)
-
-            with target:
-                if is_cudnn_target:
-                    C = fcompute(
-                        A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype
-                    )
-                else:
-                    C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
-                if add_bias:
-                    C = topi.add(C, bias)
-                if apply_relu:
-                    C = topi.nn.relu(C)
-                s = fschedule([C])
-
-            a = tvm.nd.array(a_np, dev)
-            w = tvm.nd.array(w_np, dev)
-            b = tvm.nd.array(b_np, dev)
-
-            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                target,
-                name="conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
-                    dtype,
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding_sum,
-                    dilation,
-                ),
-            )
-            func(a, w, b, c)
-            tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_workload_padding(
-        self,
-        target,
-        input_shape,
-        weight_shape,
-        stride,
-        padding,
-        dilation,
-        dtype,
-        ref_data,
-    ):
-        a_np, w_np, b_np, c_np = ref_data
-        _, _, out_height, out_width = c_np.shape
-
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
-        W = te.placeholder(weight_shape, name="W", dtype=dtype)
-
-        with tvm.target.Target(target):
-            wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)
-
-            # check if tile_ow candidates are the factors of the right output weight.
-            cfg = autotvm.get_config()
-            _fallback_schedule(cfg, wkl)
-            ow_tile = np.prod(cfg["tile_ow"].size)
-
-        tvm.testing.assert_allclose(ow_tile, out_width)
-
-
-class TestResNet18Workloads(BaseConv2DTests):
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (3, 224, 64, 7, 2, 3),
-        (64, 56, 64, 3, 1, 1),
-        (64, 56, 64, 1, 1, 0),
-        (64, 56, 128, 3, 2, 1),
-        (64, 56, 128, 1, 2, 0),
-        (128, 28, 128, 3, 1, 1),
-        (128, 28, 256, 3, 2, 1),
-        (128, 28, 256, 1, 2, 0),
-        (256, 14, 256, 3, 1, 1),
-        (256, 14, 512, 3, 2, 1),
-        (256, 14, 512, 1, 2, 0),
-        (512, 7, 512, 3, 1, 1),
-    )
-
-
-class TestInceptionV3Workloads(BaseConv2DTests):
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (3, 299, 32, 3, 2, 0),
-        (32, 149, 32, 3, 1, 0),
-        (32, 147, 64, 3, 1, 1),
-        (64, 73, 80, 1, 1, 0),
-        (80, 73, 192, 3, 1, 0),
-        (192, 35, 64, 1, 1, 0),
-        (192, 35, 48, 1, 1, 0),
-        (48, 35, 64, 5, 1, 2),
-        (64, 35, 96, 3, 1, 1),
-        (96, 35, 96, 3, 1, 1),
-        (192, 35, 32, 1, 1, 0),
-        (256, 35, 64, 1, 1, 0),
-        (256, 35, 48, 1, 1, 0),
-        (288, 35, 64, 1, 1, 0),
-        (288, 35, 48, 1, 1, 0),
-        (288, 35, 384, 3, 2, 0),
-        (96, 35, 96, 3, 2, 0),
-        (768, 17, 192, 1, 1, 0),
-        (768, 17, 128, 1, 1, 0),
-        (128, 17, 128, 1, 1, 0),
-        (128, 17, 192, 7, 1, 3),
-        (128, 17, 128, 7, 1, 3),
-        (128, 17, 192, 1, 1, 0),
-        (768, 17, 160, 1, 1, 0),
-        # disable these tests due to some bugs of llvm with nvptx
-        # (160,  17, 160, 1, 1, 0),
-        (160, 17, 192, 7, 1, 3),
-        (160, 17, 160, 7, 1, 3),
-        (160, 17, 192, 1, 1, 0),
-        (192, 17, 192, 1, 1, 0),
-        (192, 17, 192, 7, 1, 3),
-        (192, 17, 320, 3, 2, 0),
-        (192, 17, 192, 3, 2, 0),
-        (1280, 8, 320, 1, 1, 0),
-        (1280, 8, 384, 1, 1, 0),
-        (384, 8, 384, 1, 1, 0),
-        (384, 8, 384, 3, 1, 1),
-        (1280, 8, 448, 1, 1, 0),
-        (448, 8, 384, 3, 1, 1),
-        (1280, 8, 192, 1, 1, 0),
-        (2048, 8, 320, 1, 1, 0),
-        (2048, 8, 384, 1, 1, 0),
-        (2048, 8, 448, 1, 1, 0),
-        (2048, 8, 192, 1, 1, 0),
-        (1024, 19, 84, 3, 1, 1),
-        (2048, 10, 126, 3, 1, 1),
-        (512, 5, 126, 3, 1, 1),
-        (256, 3, 126, 3, 1, 1),
-    )
-
-
-class TestWeirdWorkloads(BaseConv2DTests):
-    batch, in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (2, 2, 2, 2, 2, 2, 2),
-        (3, 3, 3, 3, 3, 3, 3),
-        (4, 4, 4, 4, 4, 4, 4),
-        (5, 5, 5, 5, 5, 5, 5),
-        (6, 6, 6, 6, 6, 6, 6),
-        # disable these tests due to some bugs of llvm with nvptx
-        # (1, 1, 1, 1, 1, 1, 1),
-        # (2, 13, 71, 59, 3, 1, 1),
-    )
-
-
-class TestAsymmetricPadding(BaseConv2DTests):
-    dilation = tvm.testing.parameter(1, 2)
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (3, 35, 64, 7, 2, (0, 0, 1, 1)),
-        (64, 8, 128, 3, 1, (3, 3, 2, 2)),
-        (64, 8, 64, 1, 1, (1, 2, 2, 1)),
-        (64, 17, 192, 1, 1, (1, 2)),
-        (64, 8, 64, 3, 1, (3, 1)),
-        (128, 8, 384, 3, 1, (0, 2)),
-        (64, 35, 64, 3, 1, (1, 2)),
-        (64, 8, 64, 1, 1, "VALID"),
-        (388, 8, 64, 3, 1, "VALID"),
-        (64, 10, 48, 3, 1, "VALID"),
-        (512, 19, 64, 1, 1, "SAME"),
-        (64, 5, 32, 2, 1, "SAME"),
-        (64, 8, 64, 3, 1, "SAME"),
-        (64, 8, 64, 3, 1, (1, 2, 2, 1)),
-        (64, 8, 64, 5, 2, (1, 3)),
-        (64, 8, 64, 3, 1, "VALID"),
-        (64, 8, 64, 24, 1, "SAME"),
-        (32, 35, 64, 7, 2, (0, 0, 2, 2)),
-    )
-
-
-class TestBatchSize(BaseConv2DTests):
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (64, 56, 64, 3, 1, 1),
-    )
-    batch = tvm.testing.parameter(1, 4, 9)
-
-
-class TestBiasRelu(BaseConv2DTests):
-    apply_relu = tvm.testing.parameter(True, False, ids=["relu", "no_relu"])
-    add_bias = tvm.testing.parameter(True, False, ids=["bias", "no_bias"])
-    in_channel, in_size, num_filter, kernel, stride, padding = tvm.testing.parameters(
-        (64, 56, 64, 3, 1, 1),
-        (64, 8, 64, 3, 1, (1, 2, 2, 1)),
-        (64, 8, 64, 5, 2, (1, 3)),
-        (64, 8, 64, 3, 1, "VALID"),
-        (64, 8, 64, 24, 1, "SAME"),
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_conv2d_nhwc.py b/tests/python/topi/test_topi_conv2d_nhwc.py
deleted file mode 100644
index e7009ed179f5..000000000000
--- a/tests/python/topi/test_topi_conv2d_nhwc.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-import os
-import platform
-import pytest
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.target.codegen import llvm_version_major
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-_conv2d_nhwc_implement = {
-    "generic": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc),
-    "gpu": (topi.gpu.conv2d_nhwc, topi.gpu.schedule_conv2d_nhwc),
-    "cpu": (topi.nn.conv2d_nhwc, topi.x86.schedule_conv2d_nhwc),
-    "arm_cpu": (
-        topi.arm_cpu.conv2d_nhwc_spatial_pack,
-        topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack,
-    ),
-    "mali": (
-        topi.mali.conv2d_nhwc_spatial_pack,
-        topi.mali.schedule_conv2d_nhwc_spatial_pack,
-    ),
-    "bifrost": (
-        topi.mali.conv2d_nhwc_spatial_pack,
-        topi.mali.schedule_conv2d_nhwc_spatial_pack,
-    ),
-    "hls": (topi.nn.conv2d_nhwc, topi.hls.schedule_conv2d_nhwc),
-}
-
-device = tvm.testing.parameter(
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu",
-        topi.arm_cpu.conv2d_nhwc_spatial_pack,
-        topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack,
-        False,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+fullfp16",
-        topi.arm_cpu.compute_conv2d_NHWC_hybrid,
-        topi.arm_cpu.schedule_conv2d_NHWC_hybrid,
-        False,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.6a,+sve",
-        topi.arm_cpu.compute_conv2d_NHWC_hybrid_SVE,
-        topi.arm_cpu.schedule_conv2d_NHWC_hybrid_SVE,
-        False,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+fullfp16",
-        topi.arm_cpu.compute_conv2d_NHWC_hybrid,
-        topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR,
-        True,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.6a,+sve",
-        topi.arm_cpu.compute_conv2d_NHWC_hybrid_SVE,
-        topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR,
-        True,
-    ),
-    (
-        "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v9a,+sme",
-        topi.arm_cpu.compute_conv2d_NHWC_hybrid_SME,
-        topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR,
-        True,
-    ),
-)
-
-dtype = tvm.testing.parameter("float16", "float32")
-
-batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation = tvm.testing.parameters(
-    # Pad M, N, K
-    (1, 1, 1, 1, 1, 1, "SAME", 1),
-    (1, 1, 3, 15, 1, 1, "SAME", 1),
-    # Pad M, K
-    (1, 3, 9, 16, 3, 1, "SAME", 1),
-    # Pad M, N
-    (1, 2, 9, 15, 4, 1, "SAME", 1),
-    # Pad K, N
-    (1, 7, 4, 15, 3, 1, "SAME", 1),
-    # Pad M
-    (1, 2, 9, 16, 4, 1, "SAME", 1),
-    # Pad K
-    (1, 7, 4, 16, 3, 1, "SAME", 1),
-    # Pad N
-    (1, 2, 4, 15, 4, 1, "SAME", 1),
-    (1, 2, 4, 20, 1, 1, "SAME", 1),
-    # Large workloads
-    (1, 256, 32, 256, 3, 1, "SAME", 1),
-    (4, 128, 16, 128, 5, 2, "SAME", 1),
-    (4, 128, 16, 256, 5, 2, "SAME", 1),
-    (1, 256, 32, 256, 3, 1, "VALID", 1),
-    (4, 128, 16, 128, 5, 2, "VALID", 1),
-    (4, 128, 16, 256, 5, 2, "VALID", 1),
-    (1, 128, 16, 256, 3, 2, (0, 0, 1, 1), 1),
-    (1, 128, 16, 256, 3, 2, (1, 1, 2, 2), 1),
-    (1, 128, 16, 128, 5, 2, (3, 3, 2, 2), 1),
-    (1, 128, 16, 256, 3, 2, (0, 1, 2, 3), 1),
-    (1, 256, 32, 256, 3, 1, "SAME", 2),
-    (1, 256, 32, 256, 3, 1, (1, 1, 2, 2), 2),
-)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation):
-    in_height = in_width = in_size
-    a_shape = (batch, in_height, in_width, in_channel)
-    w_shape = (kernel, kernel, in_channel, num_filter)
-
-    np.random.seed(0)
-    a_np = np.random.uniform(size=a_shape).astype(dtype)
-    w_np = np.random.uniform(size=w_shape).astype(dtype)
-    dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-
-    # scipy.signal.convolve2d does not support float16 data types,
-    # and the python fallback would be too slow for general use.
-    conv_dtype = "float32" if dtype == "float16" else dtype
-    b_np = tvm.topi.testing.conv2d_nhwc_python(
-        a_np.astype(conv_dtype), dw_np.astype(conv_dtype), stride, padding
-    ).astype(dtype)
-    return a_np, w_np, b_np
-
-
-def get_tolerance(dtype, w_np, b_np):
-    if dtype == "float16":
-        # A summation in float16 with a single accumulator very
-        # quickly runs into large rounding errors.
-        # This tolerance is necessary to ensure no false negatives,
-        # but it may introduce false positives, depending on schedule behaviour.
-        num_values_summed = w_np.shape[0] * w_np.shape[1] * w_np.shape[2]
-        next_float_gap_size = np.nextafter(b_np.max(), np.inf, dtype=b_np.dtype) - b_np.max()
-        tol = {"rtol": 1e-5, "atol": num_values_summed * next_float_gap_size / 2}
-    else:
-        tol = {"rtol": 1e-5, "atol": 1e-7}
-
-    return tol
-
-
-def test_conv2d_nhwc_gemm(device, ref_data, dtype, stride, padding, dilation):
-    a_np, w_np, b_np = ref_data
-
-    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-    W = te.placeholder(w_np.shape, name="W", dtype=dtype)
-
-    target_string, compute, schedule, use_tir_schedule = device
-    dev = tvm.device(target_string, 0)
-    target = tvm.target.Target(target_string)
-
-    if target.features.has_sve and llvm_version_major() < 15:
-        pytest.skip(f"LLVM {llvm_version_major()} does not support targeting SVE.")
-
-    if target.features.has_sme and llvm_version_major() < 16:
-        pytest.skip(f"LLVM {llvm_version_major()} does not support targeting SME.")
-
-    if target.features.has_sme and a_np.shape[0] > 1:
-        pytest.skip(f"Conv2d with batches > 1 targeting SME not implemented.")
-
-    if target.features.has_sme and (a_np.shape[3] * w_np.shape[0] * w_np.shape[1]) <= 1:
-        pytest.skip(f"Conv2d with unit reduction dimension targeting SME not supported.")
-
-    # SME schedule always outputs float32 results, regardless of input dtype.
-    # Otherwise, output dtype is the same as input dtype.
-    out_dtype = "float32" if target.features.has_sme else dtype
-
-    with target:
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        B = compute(A, W, stride, padding, dilation, out_dtype)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        if use_tir_schedule:
-            primfunc = te.create_prim_func([A, W, B])
-            sch = schedule(tvm.tir.Schedule(primfunc))
-            func = tvm.build(sch.mod["main"], target)
-        else:
-            s = schedule([B])
-            func = tvm.build(s, [A, W, B], target)
-
-        # Run only on AArch64 devices
-        # Do not run SVE/SME schedules on non-SVE/SME devices
-        build_only = (
-            platform.machine() != "aarch64"
-            or (
-                dtype == "float16"
-                and target.features.has_fp16_simd
-                and not tvm.testing.requires_arm_fp16.run_time_check()
-            )
-            or (target.features.has_sve and not tvm.testing.requires_aarch64_sve.run_time_check())
-            or (target.features.has_sme and not tvm.testing.requires_aarch64_sme.run_time_check())
-        )
-        if build_only:
-            return
-
-        func(a, w, b)
-    tol = get_tolerance(out_dtype, w_np, b_np)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=tol["rtol"], atol=tol["atol"])
-
-
-def test_conv2d_nhwc_hwio(target, dev, ref_data, dtype, stride, padding, dilation):
-    a_np, w_np, b_np = ref_data
-
-    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-    W = te.placeholder(w_np.shape, name="W", dtype=dtype)
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_nhwc_implement)
-        B = fcompute(A, W, stride, padding, dilation, dtype)
-        s = fschedule([B])
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    func = tvm.build(s, [A, W, B], target)
-    func(a, w, b)
-    tol = get_tolerance(dtype, w_np, b_np)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=tol["rtol"], atol=tol["atol"])
-
-
-def test_conv2d_nhwc_ohwi(ref_data, dtype, stride, padding, dilation):
-    # only test on CPU target because topi doesn't have schedules for this layout
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    a_np, w_np_hwio, b_np = ref_data
-    w_np_ohwi = w_np_hwio.transpose(3, 0, 1, 2)  # HWIO -> OHWI
-
-    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-    W = te.placeholder(w_np_ohwi.shape, name="W", dtype=dtype)
-
-    B = topi.nn.conv2d(
-        A,
-        W,
-        stride,
-        padding,
-        dilation,
-        data_layout="NHWC",
-        kernel_layout="OHWI",
-        out_dtype="float32",
-    )
-    s = tvm.te.create_schedule(B.op)
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np_ohwi, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    func = tvm.build(s, [A, W, B], target)
-    func(a, w, b)
-    tol = get_tolerance(dtype, w_np_hwio, b_np)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=tol["rtol"], atol=tol["atol"])
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_conv2d_nhwc_pack_int8.py b/tests/python/topi/test_topi_conv2d_nhwc_pack_int8.py
deleted file mode 100644
index 8b20961a8cdf..000000000000
--- a/tests/python/topi/test_topi_conv2d_nhwc_pack_int8.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-import pytest
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm.task.space import FallbackConfigEntity
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-
-def verify_conv2d_1x1_nhwc_pack_int8(
-    batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1
-):
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="uint8")
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    adtype = A.dtype
-    wdtype = W.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_1x1_nhwc_pack_int8.verify_nhwc.v2")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(adtype)
-        w_np = np.random.uniform(size=w_shape).astype(wdtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-        b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-
-        with tvm.target.Target(device):
-            B = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NHWC", out_dtype="int32")
-            s = topi.x86.schedule_conv2d_nhwc_pack_int8([B])
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        func = tvm.build(s, [A, W, B], device)
-        func(a, w, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-    # for device in ['llvm -mcpu=skylake-avx512']:
-    for device in ["llvm"]:
-        check_device(device)
-
-
-# TODO(@llyfacebook): Please fix https://github.com/apache/tvm/issues/4122 to enable this test.
-@pytest.mark.skip
-def test_conv2d_nhwc():
-    verify_conv2d_1x1_nhwc_pack_int8(1, 256, 32, 256, 1, 1, 0)
-
-
-if __name__ == "__main__":
-    # test_conv2d_nhwc()
-    pass
diff --git a/tests/python/topi/test_topi_conv2d_nhwc_tensorcore.py b/tests/python/topi/test_topi_conv2d_nhwc_tensorcore.py
deleted file mode 100644
index 14a9dca12522..000000000000
--- a/tests/python/topi/test_topi_conv2d_nhwc_tensorcore.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-arguments
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-from tvm import topi
-import tvm.topi.testing
-from tvm import te
-from tvm.contrib.pickle_memoize import memoize
-from tvm.contrib import nvcc
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-_conv2d_nhwc_tensorcore_implement = {
-    "cuda": (topi.cuda.conv2d_nhwc_tensorcore, topi.cuda.schedule_conv2d_nhwc_tensorcore)
-}
-
-
-def verify_conv2d_nhwc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-    devices="cuda",
-):
-    """Test the conv2d with tensorcore for nhwc layout"""
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-    padding_sum = pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_height, in_width, in_channel), name="A")
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
-    bias = te.placeholder((1, 1, 1, num_filter), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_conv2d_nhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        if not nvcc.have_tensorcore(dev.compute_version):
-            print("skip because gpu does not support Tensor Cores")
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(
-                device, _conv2d_nhwc_tensorcore_implement
-            )
-            C = fcompute(A, W, stride, padding, dilation, "float32")
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, c)
-
-        rtol = 1e-3
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=rtol)
-
-    check_device(devices)
-
-
-@tvm.testing.requires_cuda
-@tvm.testing.requires_gpu
-def test_conv2d_nhwc_tensorcore():
-    """Test the conv2d with tensorcore for nhwc layout"""
-    verify_conv2d_nhwc(16, 16, 14, 16, 3, 1, 1)
-    verify_conv2d_nhwc(16, 128, 7, 128, 7, 1, 3)
-    verify_conv2d_nhwc(16, 160, 7, 160, 7, 1, 3)
-
-    verify_conv2d_nhwc(32, 64, 14, 64, 3, 1, 1, add_bias=True)
-    verify_conv2d_nhwc(32, 64, 14, 64, 3, 1, 1, add_relu=True)
-    verify_conv2d_nhwc(32, 64, 14, 64, 3, 1, 1, add_relu=True, add_bias=True)
-
-    verify_conv2d_nhwc(16, 64, 17, 64, 7, 1, (3, 3, 2, 2))
-    verify_conv2d_nhwc(16, 64, 17, 64, 7, 1, "SAME")
-    verify_conv2d_nhwc(16, 48, 35, 48, 5, 1, "VALID")
-    verify_conv2d_nhwc(16, 48, 56, 48, 3, 1, (1, 1, 1, 1))
-    verify_conv2d_nhwc(16, 64, 28, 64, 3, 1, (1, 1, 1, 1))
-
-
-if __name__ == "__main__":
-    test_conv2d_nhwc_tensorcore()
diff --git a/tests/python/topi/test_topi_conv2d_nhwc_winograd.py b/tests/python/topi/test_topi_conv2d_nhwc_winograd.py
deleted file mode 100644
index 3a4b99a00dce..000000000000
--- a/tests/python/topi/test_topi_conv2d_nhwc_winograd.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-arguments
-# pylint: disable=bad-whitespace
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-from tvm import topi
-import tvm.topi.testing
-from tvm import te
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-_conv2d_nhwc_winograd_tensorcore = {
-    "cuda": (
-        topi.cuda.conv2d_nhwc_winograd_tensorcore,
-        topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore,
-    )
-}
-
-_conv2d_nhwc_winograd_direct = {
-    "cuda": (topi.cuda.conv2d_nhwc_winograd_direct, topi.cuda.schedule_conv2d_nhwc_winograd_direct)
-}
-
-
-def verify_conv2d_nhwc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-    devices="cuda",
-    bgemm="direct",
-):
-    """Test the conv2d with winograd for nhwc layout"""
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-    padding_sum = pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_height, in_width, in_channel), name="A")
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
-    bias = te.placeholder((1, 1, 1, num_filter), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_nhwc_winograd.verify_conv2d_nhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-        c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            if bgemm == "direct":
-                fcompute, fschedule = tvm.topi.testing.dispatch(
-                    device, _conv2d_nhwc_winograd_direct
-                )
-            elif bgemm == "tensorcore":
-                fcompute, fschedule = tvm.topi.testing.dispatch(
-                    device, _conv2d_nhwc_winograd_tensorcore
-                )
-            C = fcompute(A, W, stride, padding, dilation, "float32")
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, c)
-
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=2e-3)
-
-    check_device(devices)
-
-
-@tvm.testing.requires_cuda
-@tvm.testing.requires_gpu
-def test_conv2d_nhwc_winograd_direct():
-    """Test the conv2d with winograd for nhwc layout"""
-    # resnet 18 workloads
-    print("test_winograd_direct...")
-    verify_conv2d_nhwc(1, 64, 56, 64, 3, 1, 1, bgemm="direct")
-    verify_conv2d_nhwc(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_nhwc(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_nhwc(1, 512, 7, 512, 3, 1, 1)
-    verify_conv2d_nhwc(1, 48, 35, 64, 5, 1, 2)
-
-    # weird workloads
-    verify_conv2d_nhwc(1, 1, 1, 1, 3, 1, 1)
-    verify_conv2d_nhwc(3, 3, 3, 3, 3, 1, 1)
-    verify_conv2d_nhwc(2, 13, 71, 59, 3, 1, 1)
-
-    # Asymmetric padding
-    verify_conv2d_nhwc(1, 512, 7, 512, 3, 1, "SAME")
-    verify_conv2d_nhwc(2, 48, 56, 48, 3, 1, (1, 1), add_relu=True)
-    verify_conv2d_nhwc(2, 48, 56, 48, 3, 1, "SAME", add_relu=True, add_bias=True)
-    verify_conv2d_nhwc(1, 48, 35, 48, 5, 1, "VALID")
-
-
-@tvm.testing.requires_cuda
-@tvm.testing.requires_tensorcore
-def test_conv2d_nhwc_winograd_tensorcore():
-    """Test the conv2d with winograd for nhwc layout"""
-    verify_conv2d_nhwc(8, 64, 56, 64, 3, 1, 1, bgemm="tensorcore")
-    verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore")
-    verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore")
-
-    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, (1, 1), add_relu=True, bgemm="tensorcore")
-    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, "SAME", add_relu=True, bgemm="tensorcore")
-
-
-if __name__ == "__main__":
-    test_conv2d_nhwc_winograd_direct()
-    test_conv2d_nhwc_winograd_tensorcore()
diff --git a/tests/python/topi/test_topi_conv2d_tensordot_opts.py b/tests/python/topi/test_topi_conv2d_tensordot_opts.py
deleted file mode 100644
index f6145cd1c51a..000000000000
--- a/tests/python/topi/test_topi_conv2d_tensordot_opts.py
+++ /dev/null
@@ -1,437 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tests for functions in tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot.
-
-Contains a few unit tests, followed by integration tests for common use cases. Note that we do not
-run the generated code - we just make sure the strings match exactly.
-
-Note that a *lot* of instruction reordering happens during compilation from C to assembly (by GCC or
-Clang). I've verified that this instruction reordering happens correctly for all the functions here.
-For more details on why the generated code is the way it is, see `tensordot_int16_impl`."""
-
-import textwrap
-
-from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import (
-    _get_tensor_halfwords,
-    _get_kernel_halfwords,
-    tensordot_int16_impl,
-)
-
-
-def test_get_tensor_halfwords():
-    """Tests the _get_tensor_halfwords helper function in tensordot.py.
-
-    This function loads the logical indices of the data that will be stored in memory at the tensor
-    pointer. See the function docstring for more details.
-    """
-
-    # fmt: off
-    # A simple 3x3 depthwise convolution computing one output and with in_stride = 1. Note that each
-    # row is padded with None at the end to make the rows word-aligned.
-    assert _get_tensor_halfwords((48, 3, 3), 0, 1, 1) == [
-        (0, 0), (0, 1), (0, 2), None,
-        (1, 0), (1, 1), (1, 2), None,
-        (2, 0), (2, 1), (2, 2), None
-    ]
-
-    # If the tensor width is odd, padding alternates before/after every row.
-    assert _get_tensor_halfwords((49, 3, 3), 0, 1, 1) == [
-        (0, 0), (0, 1), (0, 2), None,
-        None, (1, 0), (1, 1), (1, 2),
-        (2, 0), (2, 1), (2, 2), None
-    ]
-
-    # If we are computing multiple outputs, more tensor data becomes relevant.
-    assert _get_tensor_halfwords((48, 3, 3), 0, 2, 1) == [
-        (0, 0), (0, 1), (0, 2), (0, 3),
-        (1, 0), (1, 1), (1, 2), (1, 3),
-        (2, 0), (2, 1), (2, 2), (2, 3)
-    ]
-
-    # If offset=1, relevant data starts one halfword after the kernel pointer.
-    assert _get_tensor_halfwords((48, 3, 3), 1, 1, 1) == [
-        None, (0, 0), (0, 1), (0, 2),
-        None, (1, 0), (1, 1), (1, 2),
-        None, (2, 0), (2, 1), (2, 2)
-    ]
-
-    # These adjustments can be (and often are) used together.
-    assert _get_tensor_halfwords((49, 3, 3), 1, 2, 2) == [
-        None, (0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
-        (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), None,
-        None, (2, 0), (2, 1), (2, 2), (2, 3), (2, 4)
-    ]
-    # fmt: on
-
-
-def test_get_kernel_halfwords():
-    """Tests the _get_kernel_halfwords helper function in tensordot.py.
-
-    This function loads the logical indices of the data that will be stored in memory at the kernel
-    pointer. See the function docstring for more details.
-    """
-
-    # fmt: off
-    # Example of a kernel for a 3x3 depthwise convolution channel
-    assert _get_kernel_halfwords((96, 3, 3), 0) == [
-        (0, 0), (0, 1), (0, 2),
-        (1, 0), (1, 1), (1, 2),
-        (2, 0), (2, 1), (2, 2),
-        None,
-    ]
-
-    # Example of a kernel for a 1x1 regular convolution with 4 channels
-    assert _get_kernel_halfwords((48, 1, 4), 1) == [
-        None, (0, 0), (0, 1), (0, 2), (0, 3), None,
-    ]
-    # fmt: on
-
-
-def test_write_3x3_depthwise_code():
-    """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME"
-    padding. We are only computing one sum at once, so we don't need stride or output. Note that
-    this is pretty inefficient - it would be much better to compute a few sums concurrently.
-
-    When inlined, this code compiles (with armv7-a clang 11) into:
-
-    tensordot_opt_x1_int16_w48_3x3_000(int*, int*, int*, int*, int*):
-        ldr.w   lr, [r3]
-        ldrd    r11, r4, [r1]
-        ldrd    r5, r9, [r1, #96]
-        ldrd    r10, r8, [r1, #192]
-        ldm.w   r2, {r1, r6, r7}
-        ldr.w   r12, [sp, #36]
-        smlad   r1, r11, r1, lr
-        smlabb  r1, r4, r6, r1
-        smlatb  r1, r6, r5, r1
-        ldrd    r3, r2, [r2, #12]
-        smlatb  r1, r5, r7, r1
-        smlatb  r1, r7, r9, r1
-        smlad   r1, r10, r3, r1
-        ldr.w   r3, [r12]
-        smlabb  r1, r8, r2, r1
-        smmul   r1, r3, r1
-        ssat    r1, #8, r1, asr #8
-        strh    r1, [r0]
-    """
-    _, code = tensordot_int16_impl(1, (48, 3, 3), (0, 0, 0), (1, 1))
-    assert code == textwrap.dedent(
-        """
-    #ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
-    #define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
-    #include <arm_acle.h>
-    __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w48_3x3_000(
-        int16_t *output_arg, int16_t *tensor_arg, int16_t *kernel_arg,
-        int32_t *bias, int32_t *scale
-    ) {
-      int32_t *output = output_arg;
-      int32_t *tensor = tensor_arg;
-      int32_t *kernel = kernel_arg;
-
-      int32_t sum_0 = *bias;
-
-      int32_t tensor__y00_x00__y00_x01 = tensor[0];
-      int32_t tensor__y00_x02__unknown = tensor[1];
-      int32_t tensor__y01_x00__y01_x01 = tensor[24];
-      int32_t tensor__y01_x02__unknown = tensor[25];
-      int32_t tensor__y02_x00__y02_x01 = tensor[48];
-      int32_t tensor__y02_x02__unknown = tensor[49];
-
-      int32_t kernel__y00_x00__y00_x01 = kernel[0];
-      int32_t kernel__y00_x02__y01_x00 = kernel[1];
-      int32_t kernel__y01_x01__y01_x02 = kernel[2];
-      int32_t kernel__y02_x00__y02_x01 = kernel[3];
-      int32_t kernel__y02_x02__unknown = kernel[4];
-
-      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
-      sum_0 = __smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0);
-      sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00, sum_0);
-      sum_0 = __smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0);
-      sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02, sum_0);
-      sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
-      sum_0 = __smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0);
-
-      int32_t scale_val = *scale;
-      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
-      requant_0 = (requant_0 + 1) >> 1;
-      requant_0 = __ssat(requant_0 + -128, 8);
-
-      ((int16_t*) output)[0] = (int16_t) requant_0;
-      return 0;
-    }
-    #endif
-    """
-    )
-
-
-def test_odd_width_3x3_depthwise_strides_code():
-    """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME"
-    padding and (2, 2) strides, being written into NHWC layout. The layout change is encoded by
-    out_stride = 4. This is a common use case seen in MobileNetV1, among others.
-
-    Note that despite the rows not being word-aligned, the *tensor pointer will always be word
-    aligned (satisfying this requirement) since y_stride = 2."""
-
-    _, code = tensordot_int16_impl(2, (49, 3, 3), (0, 0, 0), (2, 4))
-    assert code == textwrap.dedent(
-        """
-    #ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
-    #define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
-    #include <arm_acle.h>
-    __attribute__((always_inline)) static inline int32_t tensordot_opt_x2_int16_w49_3x3_000_2_4(
-        int16_t *output_arg, int16_t *tensor_arg, int16_t *kernel_arg,
-        int32_t *bias, int32_t *scale
-    ) {
-      int32_t *output = output_arg;
-      int32_t *tensor = tensor_arg;
-      int32_t *kernel = kernel_arg;
-
-      int32_t sum_0 = *bias, sum_1 = *bias;
-
-      int32_t tensor__y00_x00__y00_x01 = tensor[0];
-      int32_t tensor__y00_x02__y00_x03 = tensor[1];
-      int32_t tensor__y00_x04__unknown = tensor[2];
-      int32_t tensor__unknown__y01_x00 = tensor[24];
-      int32_t tensor__y01_x01__y01_x02 = tensor[25];
-      int32_t tensor__y01_x03__y01_x04 = tensor[26];
-      int32_t tensor__y02_x00__y02_x01 = tensor[49];
-      int32_t tensor__y02_x02__y02_x03 = tensor[50];
-      int32_t tensor__y02_x04__unknown = tensor[51];
-
-      int32_t kernel__y00_x00__y00_x01 = kernel[0];
-      int32_t kernel__y00_x02__y01_x00 = kernel[1];
-      int32_t kernel__y01_x01__y01_x02 = kernel[2];
-      int32_t kernel__y02_x00__y02_x01 = kernel[3];
-      int32_t kernel__y02_x02__unknown = kernel[4];
-
-      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
-      sum_0 = __smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0);
-      sum_0 = __smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0);
-      sum_0 = __smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0);
-      sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
-      sum_0 = __smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0);
-      sum_1 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1);
-      sum_1 = __smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1);
-      sum_1 = __smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1);
-      sum_1 = __smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1);
-      sum_1 = __smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1);
-      sum_1 = __smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1);
-
-      int32_t scale_val = *scale;
-      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
-      requant_0 = (requant_0 + 1) >> 1;
-      requant_0 = __ssat(requant_0 + -128, 8);
-      int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32;
-      requant_1 = (requant_1 + 1) >> 1;
-      requant_1 = __ssat(requant_1 + -128, 8);
-
-      ((int16_t*) output)[0] = (int16_t) requant_0;
-      ((int16_t*) output)[4] = (int16_t) requant_1;
-      return 0;
-    }
-    #endif
-    """
-    )
-
-
-def test_1x1x8_convolution_code():
-    """This is the function that would be generated for a 1x48x48x8 NHWC input tensor under
-    standard convolution with a 1x1 kernel. This is a common use case seen in MobileNetV1,
-    among others. In this scenario, a very high amount of memory re-use means that summing
-    four channels at once makes us faster."""
-
-    _, code = tensordot_int16_impl(4, (48 * 8, 1, 8), (0, 0, 0), (8, 1))
-    assert code == textwrap.dedent(
-        """
-    #ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
-    #define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
-    #include <arm_acle.h>
-    __attribute__((always_inline)) static inline int32_t tensordot_opt_x4_int16_w384_1x8_000_8_1(
-        int16_t *output_arg, int16_t *tensor_arg, int16_t *kernel_arg,
-        int32_t *bias, int32_t *scale
-    ) {
-      int32_t *output = output_arg;
-      int32_t *tensor = tensor_arg;
-      int32_t *kernel = kernel_arg;
-
-      int32_t sum_0 = *bias, sum_1 = *bias, sum_2 = *bias, sum_3 = *bias;
-
-      int32_t tensor__y00_x00__y00_x01 = tensor[0];
-      int32_t tensor__y00_x02__y00_x03 = tensor[1];
-      int32_t tensor__y00_x04__y00_x05 = tensor[2];
-      int32_t tensor__y00_x06__y00_x07 = tensor[3];
-      int32_t tensor__y00_x08__y00_x09 = tensor[4];
-      int32_t tensor__y00_x0a__y00_x0b = tensor[5];
-      int32_t tensor__y00_x0c__y00_x0d = tensor[6];
-      int32_t tensor__y00_x0e__y00_x0f = tensor[7];
-      int32_t tensor__y00_x10__y00_x11 = tensor[8];
-      int32_t tensor__y00_x12__y00_x13 = tensor[9];
-      int32_t tensor__y00_x14__y00_x15 = tensor[10];
-      int32_t tensor__y00_x16__y00_x17 = tensor[11];
-      int32_t tensor__y00_x18__y00_x19 = tensor[12];
-      int32_t tensor__y00_x1a__y00_x1b = tensor[13];
-      int32_t tensor__y00_x1c__y00_x1d = tensor[14];
-      int32_t tensor__y00_x1e__y00_x1f = tensor[15];
-
-      int32_t kernel__y00_x00__y00_x01 = kernel[0];
-      int32_t kernel__y00_x02__y00_x03 = kernel[1];
-      int32_t kernel__y00_x04__y00_x05 = kernel[2];
-      int32_t kernel__y00_x06__y00_x07 = kernel[3];
-
-      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
-      sum_0 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0);
-      sum_0 = __smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0);
-      sum_0 = __smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0);
-      sum_1 = __smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1);
-      sum_1 = __smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1);
-      sum_1 = __smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1);
-      sum_1 = __smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1);
-      sum_2 = __smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2);
-      sum_2 = __smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2);
-      sum_2 = __smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2);
-      sum_2 = __smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2);
-      sum_3 = __smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3);
-      sum_3 = __smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3);
-      sum_3 = __smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3);
-      sum_3 = __smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3);
-
-      int32_t scale_val = *scale;
-      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
-      requant_0 = (requant_0 + 1) >> 1;
-      requant_0 = __ssat(requant_0 + -128, 8);
-      int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32;
-      requant_1 = (requant_1 + 1) >> 1;
-      requant_1 = __ssat(requant_1 + -128, 8);
-      int32_t requant_2 = (sum_2 * (int64_t) scale_val) >> 32;
-      requant_2 = (requant_2 + 1) >> 1;
-      requant_2 = __ssat(requant_2 + -128, 8);
-      int32_t requant_3 = (sum_3 * (int64_t) scale_val) >> 32;
-      requant_3 = (requant_3 + 1) >> 1;
-      requant_3 = __ssat(requant_3 + -128, 8);
-
-      int packed_res_0;
-      __asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_0) : "r" (requant_0), "r" (requant_1));
-      int packed_res_1;
-      __asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_1) : "r" (requant_2), "r" (requant_3));
-      output[0] = packed_res_0;
-      output[1] = packed_res_1;
-      return 0;
-    }
-    #endif
-    """
-    )
-
-
-def test_3x3x3_offset_convolution_code():
-    """This is the function that would be generated for a 1x96x96x3 NHWC input tensor under
-    standard convolution with a 3x3x3 kernel - the first layer of MobileNetV1. This is special, as
-    it means that every other kernel channel will not start on an even numbered halfword. We won't
-    have this issue for the input tensor, as we will always compute two positions at a time.
-
-    To solve this 'every other' issue, we will need two different version of this function to
-    alternate between. This alternation will be handled in TIR scheduling. Here, we just test the
-    version where the kernel is not word aligned.
-
-    Also tests the requantize_shift and output_zero_point keyword args. These might be needed for
-    some ResNet models (like image classification from MLPerf Tiny).
-    """
-
-    _, code = tensordot_int16_impl(
-        1,
-        (96 * 3, 3, 9),
-        (1, 1, 1),
-        (3, 1),
-        requantize_shift=40,
-        output_zero_point=4,
-    )
-    assert code == textwrap.dedent(
-        """
-    #ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
-    #define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
-    #include <arm_acle.h>
-    __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w288_3x9_111(
-        int16_t *output_arg, int16_t *tensor_arg, int16_t *kernel_arg,
-        int32_t *bias, int32_t *scale
-    ) {
-      int32_t *output = output_arg;
-      int32_t *tensor = tensor_arg;
-      int32_t *kernel = kernel_arg;
-
-      int32_t sum_0 = *bias;
-
-      int32_t tensor__unknown__y00_x00 = tensor[0];
-      int32_t tensor__y00_x01__y00_x02 = tensor[1];
-      int32_t tensor__y00_x03__y00_x04 = tensor[2];
-      int32_t tensor__y00_x05__y00_x06 = tensor[3];
-      int32_t tensor__y00_x07__y00_x08 = tensor[4];
-      int32_t tensor__unknown__y01_x00 = tensor[144];
-      int32_t tensor__y01_x01__y01_x02 = tensor[145];
-      int32_t tensor__y01_x03__y01_x04 = tensor[146];
-      int32_t tensor__y01_x05__y01_x06 = tensor[147];
-      int32_t tensor__y01_x07__y01_x08 = tensor[148];
-      int32_t tensor__unknown__y02_x00 = tensor[288];
-      int32_t tensor__y02_x01__y02_x02 = tensor[289];
-      int32_t tensor__y02_x03__y02_x04 = tensor[290];
-      int32_t tensor__y02_x05__y02_x06 = tensor[291];
-      int32_t tensor__y02_x07__y02_x08 = tensor[292];
-
-      int32_t kernel__unknown__y00_x00 = kernel[0];
-      int32_t kernel__y00_x01__y00_x02 = kernel[1];
-      int32_t kernel__y00_x03__y00_x04 = kernel[2];
-      int32_t kernel__y00_x05__y00_x06 = kernel[3];
-      int32_t kernel__y00_x07__y00_x08 = kernel[4];
-      int32_t kernel__y01_x00__y01_x01 = kernel[5];
-      int32_t kernel__y01_x02__y01_x03 = kernel[6];
-      int32_t kernel__y01_x04__y01_x05 = kernel[7];
-      int32_t kernel__y01_x06__y01_x07 = kernel[8];
-      int32_t kernel__y01_x08__y02_x00 = kernel[9];
-      int32_t kernel__y02_x01__y02_x02 = kernel[10];
-      int32_t kernel__y02_x03__y02_x04 = kernel[11];
-      int32_t kernel__y02_x05__y02_x06 = kernel[12];
-      int32_t kernel__y02_x07__y02_x08 = kernel[13];
-
-      sum_0 = __smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0);
-      sum_0 = __smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0);
-      sum_0 = __smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0);
-      sum_0 = __smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0);
-      sum_0 = __smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0);
-      sum_0 = __smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0);
-      sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01, sum_0);
-      sum_0 = __smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0);
-      sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03, sum_0);
-      sum_0 = __smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0);
-      sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05, sum_0);
-      sum_0 = __smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0);
-      sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07, sum_0);
-      sum_0 = __smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0);
-      sum_0 = __smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0);
-      sum_0 = __smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0);
-      sum_0 = __smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0);
-      sum_0 = __smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0);
-      sum_0 = __smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0);
-
-      int32_t scale_val = *scale;
-      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 39;
-      requant_0 = (requant_0 + 1) >> 1;
-      requant_0 = __ssat(requant_0 + 4, 8);
-
-      ((int16_t*) output)[1] = (int16_t) requant_0;
-      return 0;
-    }
-    #endif
-    """
-    )
diff --git a/tests/python/topi/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/test_topi_conv2d_transpose_nchw.py
deleted file mode 100644
index 05fd4639155a..000000000000
--- a/tests/python/topi/test_topi_conv2d_transpose_nchw.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for transposed convolution."""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-import tvm.testing
-
-
-_conv2d_transpose_nchw_implement = {
-    "generic": (topi.nn.conv2d_transpose_nchw, topi.generic.schedule_conv2d_transpose_nchw),
-    "cpu": (topi.x86.conv2d_transpose_nchw, topi.x86.schedule_conv2d_transpose_nchw),
-    "arm_cpu": (topi.arm_cpu.conv2d_transpose_nchw, topi.arm_cpu.schedule_conv2d_transpose_nchw),
-    "gpu": (topi.cuda.conv2d_transpose_nchw, topi.cuda.schedule_conv2d_transpose_nchw),
-    "hls": (topi.nn.conv2d_transpose_nchw, topi.hls.schedule_conv2d_transpose_nchw),
-}
-
-
-def verify_conv2d_transpose_nchw(
-    batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding
-):
-    in_height, in_width = in_size
-    kernel_height, kernel_width = kernel
-    stride_height, stride_width = stride
-    pad_top, pad_left, pad_bottom, pad_right = padding
-
-    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-    W = te.placeholder((in_channel, num_filter, kernel_height, kernel_width), name="W")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_transpose.verify_conv2d_transpose_nchw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = tvm.topi.testing.conv2d_transpose_nchw_python(
-            a_np, w_np, stride, padding, output_padding
-        )
-        c_np = np.maximum(b_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check(fcompute, fschedule, target, dev):
-        B = fcompute(
-            A,
-            W,
-            [stride_height, stride_width],
-            [pad_top, pad_left, pad_bottom, pad_right],
-            A.dtype,
-            output_padding,
-        )
-        C = topi.nn.relu(B)
-        s1 = fschedule([B])
-        s2 = fschedule([C])
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-        func1 = tvm.build(s1, [A, W, B], target)
-        func2 = tvm.build(s2, [A, W, C], target)
-        func1(a, w, b)
-        func2(a, w, c)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    def check_generic(target, dev):
-        print("Running generic on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = _conv2d_transpose_nchw_implement["generic"]
-            check(fcompute, fschedule, target, dev)
-
-    check_generic("llvm", tvm.cpu(0))
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(
-                target, _conv2d_transpose_nchw_implement
-            )
-            check(fcompute, fschedule, target, dev)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_transpose_nchw():
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 1, (1, 1), (1, 1), (0, 0, 0, 0), (0, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (3, 3), (0, 0, 0, 0), (0, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (0, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (1, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (2, 2), (2, 2), (0, 0, 0, 0), (0, 0))
-    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (2, 2), (2, 2), (0, 0, 0, 0), (1, 1))
-    verify_conv2d_transpose_nchw(1, 32, (32, 32), 128, (5, 5), (1, 1), (0, 0, 0, 0), (0, 0))
-    verify_conv2d_transpose_nchw(1, 32, (32, 32), 128, (5, 5), (2, 2), (1, 1, 1, 1), (0, 0))
-    verify_conv2d_transpose_nchw(16, 32, (8192, 1), 8, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0))
-    verify_conv2d_transpose_nchw(16, 512, (8, 1), 128, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0))
-    verify_conv2d_transpose_nchw(16, 512, (8, 1), 128, (31, 1), (2, 1), (14, 0, 15, 0), (1, 0))
-
-
-if __name__ == "__main__":
-    test_conv2d_transpose_nchw()
diff --git a/tests/python/topi/test_topi_conv2d_winograd.py b/tests/python/topi/test_topi_conv2d_winograd.py
deleted file mode 100644
index 82368f118f32..000000000000
--- a/tests/python/topi/test_topi_conv2d_winograd.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm.task.space import FallbackConfigEntity
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-_conv2d_nchw_winograd_implement = {
-    "arm_cpu": (topi.arm_cpu.conv2d_nchw_winograd, topi.arm_cpu.schedule_conv2d_nchw_winograd),
-    "cuda": (topi.cuda.conv2d_nchw_winograd, topi.cuda.schedule_conv2d_nchw_winograd),
-    "mali": (topi.mali.conv2d_nchw_winograd, topi.mali.schedule_conv2d_nchw_winograd),
-}
-
-
-def verify_conv2d_nchw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-    devices=["cuda", "llvm -device=arm_cpu", "opencl -device=mali"],
-):
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-    padding_sum = pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-    W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
-    bias = te.placeholder((num_filter, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_winograd.verify_conv2d_nhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nchw_winograd_implement)
-            C = fcompute(A, W, stride, padding, dilation, dtype)
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, c)
-
-        rtol = 1e-3
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=rtol)
-
-    for device in devices:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def test_conv2d_nchw():
-    # inception v3 workloads
-    verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=["cuda"])
-    verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3, devices=["cuda"])
-    verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3, devices=["cuda"])
-
-    # resnet 18 workloads
-    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
-
-    # batch size = 2
-    verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1)
-
-    # relu, bias
-    verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True)
-    verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True)
-    verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True)
-
-    # weird workloads
-    verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1)
-    verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1)
-    verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)
-    verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=["cuda"])
-
-    # Asymmetric padding
-    verify_conv2d_nchw(1, 48, 56, 48, 3, 1, (1, 1, 1, 1))
-    verify_conv2d_nchw(1, 64, 28, 64, 3, 1, (1, 1, 1, 1))
-    verify_conv2d_nchw(1, 128, 14, 128, 3, 1, (1, 1))
-    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, "SAME")
-    verify_conv2d_nchw(2, 13, 71, 59, 3, 1, (1, 1, 1, 1))
-    verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1, 1, 1), add_bias=True)
-    verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1), add_relu=True)
-    verify_conv2d_nchw(2, 48, 56, 48, 3, 1, "SAME", add_relu=True, add_bias=True)
-    verify_conv2d_nchw(1, 64, 17, 192, 7, 1, (3, 1), devices=["cuda"])
-    verify_conv2d_nchw(1, 64, 17, 64, 7, 1, (3, 3, 2, 2), devices=["cuda"])
-    verify_conv2d_nchw(1, 160, 17, 160, 7, 1, "SAME", devices=["cuda"])
-    verify_conv2d_nchw(1, 48, 35, 48, 5, 1, "VALID", devices=["cuda"])
-
-
-def verify_conv2d_nhwc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-):
-    # This version is intented to be used by the auto-scheduler,
-    # so we only test the correctness of compute declaration
-    # with the default naive schedule in cpu
-
-    A = te.placeholder((batch, in_size, in_size, in_channel), name="A")
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
-    bias = te.placeholder((1, 1, 1, num_filter), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d_winograd.verify_conv2d_nhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-        c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    target = "llvm"
-    dev = tvm.device(target)
-
-    C = topi.nn.conv2d_winograd_nhwc(A, W, stride, padding, dilation, dtype)
-    s = te.create_schedule([C.op])
-
-    a = tvm.nd.array(a_np, device=dev)
-    w = tvm.nd.array(w_np, device=dev)
-    b = tvm.nd.array(b_np, device=dev)
-    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), device=dev)
-    func = tvm.build(s, [A, W, C], target=target)
-    func(a, w, c)
-
-    rtol = 1e-3
-    tvm.testing.assert_allclose(c.numpy(), c_np, rtol=rtol)
-
-
-def test_conv2d_nhwc():
-    # This version is intented to be used by the auto-scheduler,
-    # so we only test the correctness of compute declaration
-    # with the default naive schedule in cpu
-
-    # resnet 18 workloads
-    verify_conv2d_nhwc(1, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_nhwc(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_nhwc(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_nhwc(1, 512, 7, 512, 3, 1, 1)
-
-    # more shapes
-    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_nhwc(1, 1, 1, 1, 3, 1, 1)
-    verify_conv2d_nhwc(3, 3, 3, 3, 3, 1, 1)
-    verify_conv2d_nhwc(2, 13, 71, 59, 3, 1, 1)
-
-    # Asymmetric padding
-    verify_conv2d_nhwc(1, 3, 7, 3, 3, 1, "SAME")
-    verify_conv2d_nhwc(1, 48, 35, 48, 3, 1, "VALID")
-
-
-if __name__ == "__main__":
-    test_conv2d_nchw()
-    test_conv2d_nhwc()
diff --git a/tests/python/topi/test_topi_conv3d_ncdhw.py b/tests/python/topi/test_topi_conv3d_ncdhw.py
deleted file mode 100644
index 6c2ed6eeae21..000000000000
--- a/tests/python/topi/test_topi_conv3d_ncdhw.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.utils import get_pad_tuple3d
-from tvm.topi.utils import get_const_tuple
-
-_conv3d_ncdhw_implement = {
-    "generic": (topi.nn.conv3d_ncdhw, topi.generic.schedule_conv3d_ncdhw),
-    "cpu": (topi.x86.conv3d_ncdhw, topi.x86.schedule_conv3d_ncdhw),
-    "gpu": (topi.cuda.conv3d_ncdhw, topi.cuda.schedule_conv3d_ncdhw),
-}
-
-
-def verify_conv3d_ncdhw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    groups=1,
-    add_bias=False,
-    add_relu=False,
-):
-    if isinstance(kernel, (tuple, list)):
-        if len(kernel) == 3:
-            kernel_d = kernel[0]
-            kernel_h = kernel[1]
-            kernel_w = kernel[2]
-        else:
-            raise ValueError("Size of kernel can only be 3")
-    elif isinstance(kernel, int):
-        kernel_d = kernel_h = kernel_w = kernel
-    else:
-        raise ValueError("Unknown kernel option %s" % kernel)
-    pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(
-        padding, (kernel_d, kernel_h, kernel_w)
-    )
-    padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel_d,
-            kernel_h,
-            kernel_w,
-            stride,
-            padding_sum,
-            dilation,
-        )
-    )
-
-    in_depth = in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name="A")
-    W = te.placeholder((num_filter, in_channel // groups, kernel_d, kernel_h, kernel_w), name="W")
-    bias = te.placeholder((num_filter, 1, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv3d_ncdhw.verify_conv3d_ncdhw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation, dilation))
-        c_np = tvm.topi.testing.conv3d_ncdhw_python(a_np, dw_np, stride, padding, groups)
-        if add_bias:
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv3d_ncdhw_implement)
-        with tvm.target.Target(target):
-            C = fcompute(
-                A,
-                W,
-                (stride, stride, stride),
-                padding,
-                (dilation, dilation, dilation),
-                groups,
-                dtype,
-            )
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel_d,
-                    kernel_h,
-                    kernel_w,
-                    stride,
-                    padding_sum,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel_d,
-                    kernel_h,
-                    kernel_w,
-                    stride,
-                    padding_sum,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4, atol=1e-6)
-
-    for target, dev in tvm.testing.enabled_targets():
-        with autotvm.tophub.context(target):  # load tophub pre-tuned parameters
-            check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_ncdhw():
-    # 3DCNN  workloads
-    verify_conv3d_ncdhw(1, 32, 32, 5, 1, 1, 0)
-    verify_conv3d_ncdhw(1, 32, 32, 1, 1, 1, 0)
-    verify_conv3d_ncdhw(1, 32, 32, 5, 1, 1, 1)
-    verify_conv3d_ncdhw(1, 32, 32, 1, 1, 1, 1)
-
-    # bias, relu
-    verify_conv3d_ncdhw(1, 64, 56, 3, 1, 1, 1, add_relu=True)
-    verify_conv3d_ncdhw(1, 64, 56, 3, 1, 1, 1, add_bias=True)
-    verify_conv3d_ncdhw(1, 64, 56, 3, 1, 1, 1, add_bias=True, add_relu=True)
-
-    # dilation = 2
-    verify_conv3d_ncdhw(1, 64, 56, 3, 3, 1, 1, dilation=2)
-
-    # batch size
-    verify_conv3d_ncdhw(4, 64, 56, 5, 3, 1, 1)
-
-    # weird workloads
-    verify_conv3d_ncdhw(2, 2, 2, 2, 2, 2, 2)
-    verify_conv3d_ncdhw(3, 3, 3, 3, 3, 3, 3)
-
-    # Asymmetric padding
-    verify_conv3d_ncdhw(1, 32, 32, 5, 1, 1, (0, 0, 0, 1, 1, 1))
-    verify_conv3d_ncdhw(1, 32, 32, 1, 1, 1, (2, 1, 2, 1, 2, 1))
-    verify_conv3d_ncdhw(1, 64, 56, 3, 3, 1, (2, 2, 2, 1, 1, 1), dilation=2)
-    verify_conv3d_ncdhw(1, 32, 32, 5, 1, 1, (0, 1, 1))
-    verify_conv3d_ncdhw(1, 32, 32, 1, 1, 1, (2, 1, 0))
-    verify_conv3d_ncdhw(1, 32, 32, 1, 3, 1, "VALID")
-    verify_conv3d_ncdhw(1, 32, 32, 5, 1, 1, "VALID")
-
-    # DHW kernel layout
-    verify_conv3d_ncdhw(1, 32, 56, 16, (3, 5, 7), 2, (1, 2, 3))
-    verify_conv3d_ncdhw(1, 3, 56, 16, (3, 7, 7), 2, (1, 2, 3, 0, 3, 2))
-    verify_conv3d_ncdhw(1, 3, 56, 16, (3, 3, 7), 2, (1, 2, 3))
-    verify_conv3d_ncdhw(1, 3, 56, 16, (3, 7, 3), 2, (1, 3, 1))
-
-    # grouped workloads
-    verify_conv3d_ncdhw(1, 32, 32, 8, 1, 1, 0, groups=4)
-    verify_conv3d_ncdhw(1, 32, 32, 4, 1, 1, 0, groups=4)
-    verify_conv3d_ncdhw(1, 32, 32, 8, 1, 1, 1, groups=4)
-    verify_conv3d_ncdhw(1, 32, 32, 4, 1, 1, 1, groups=4)
-
-
-if __name__ == "__main__":
-    test_conv3d_ncdhw()
diff --git a/tests/python/topi/test_topi_conv3d_ndhwc.py b/tests/python/topi/test_topi_conv3d_ndhwc.py
deleted file mode 100644
index 4ee31ecacd0f..000000000000
--- a/tests/python/topi/test_topi_conv3d_ndhwc.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do convolution."""
-import os
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-
-_conv3d_ndhwc_implement = {
-    "generic": (topi.nn.conv3d_ndhwc, topi.generic.schedule_conv3d_ndhwc),
-    "cpu": (topi.x86.conv3d_ndhwc, topi.x86.schedule_conv3d_ndhwc),
-    "gpu": (topi.cuda.conv3d_ndhwc, topi.cuda.schedule_conv3d_ndhwc),
-}
-
-
-def verify_conv3d_ndhwc(
-    target,
-    dev,
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    groups=1,
-):
-    if isinstance(in_size, tuple):
-        in_depth, in_height, in_width = in_size
-    else:
-        in_depth = in_height = in_width = in_size
-    if isinstance(kernel, tuple):
-        kernel_depth, kernel_height, kernel_width = kernel
-    else:
-        kernel_depth = kernel_height = kernel_width = kernel
-
-    A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name="A")
-    W = te.placeholder(
-        (kernel_depth, kernel_height, kernel_width, in_channel // groups, num_filter), name="W"
-    )
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv3d_ndhwc.verify_ndhwc.v2")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, dilation, 1, 1))
-        b_np = tvm.topi.testing.conv3d_ndhwc_python(a_np, dw_np, stride, padding, groups)
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv3d_ndhwc_implement)
-    with tvm.target.Target(target):
-        B = fcompute(A, W, stride, padding, dilation, groups, dtype)
-        s = fschedule([B])
-    dev = tvm.device(target, 0)
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    func = tvm.build(s, [A, W, B], target)
-    print(tvm.lower(s, [A, W, B], target))
-    func(a, w, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-def test_conv3d_ndhwc(target, dev):
-    verify_conv3d_ndhwc(target, dev, 1, 16, 32, 16, 3, 1, "SAME")
-    verify_conv3d_ndhwc(target, dev, 4, 32, 16, 32, 5, 2, "SAME")
-    verify_conv3d_ndhwc(target, dev, 4, 32, 16, 64, 5, 2, "SAME")
-    verify_conv3d_ndhwc(target, dev, 1, 64, 32, 64, 3, 1, "VALID")
-    verify_conv3d_ndhwc(target, dev, 1, 64, 32, 64, 3, 1, "VALID")
-    verify_conv3d_ndhwc(target, dev, 4, 32, 16, 32, 5, 2, "VALID")
-    verify_conv3d_ndhwc(target, dev, 4, 32, 16, 64, 5, 2, "VALID")
-    # dilation = 2
-    verify_conv3d_ndhwc(target, dev, 1, 64, 32, 64, 3, 1, "SAME", dilation=2)
-
-    verify_conv3d_ndhwc(target, dev, 1, 1, (20, 256, 256), 32, (1, 3, 3), (1, 2, 2), "SAME")
-    verify_conv3d_ndhwc(target, dev, 1, 1, (20, 256, 256), 32, (1, 6, 6), (1, 2, 2), (0, 2, 2))
-    verify_conv3d_ndhwc(target, dev, 1, 4, (20, 256, 256), 8, (1, 5, 5), (1, 2, 2), (0, 2, 2))
-
-    verify_conv3d_ndhwc(target, dev, 1, 16, 32, 16, 3, 1, "SAME", groups=4)
-    verify_conv3d_ndhwc(target, dev, 4, 32, 16, 32, 5, 2, "SAME", groups=4)
-    verify_conv3d_ndhwc(target, dev, 4, 32, 16, 64, 5, 2, "SAME", groups=4)
-
-
-if __name__ == "__main__":
-    test_conv3d_ndhwc()
diff --git a/tests/python/topi/test_topi_conv3d_ndhwc_tensorcore.py b/tests/python/topi/test_topi_conv3d_ndhwc_tensorcore.py
deleted file mode 100644
index 0d587a87d8d7..000000000000
--- a/tests/python/topi/test_topi_conv3d_ndhwc_tensorcore.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-arguments
-"""Example code to do convolution."""
-
-import numpy as np
-import tvm
-from tvm import topi
-import tvm.topi.testing
-from tvm import te
-from tvm.contrib.pickle_memoize import memoize
-from tvm.contrib import nvcc
-from tvm.topi.nn.utils import get_pad_tuple3d
-from tvm.topi.utils import get_const_tuple
-import tvm.testing
-
-
-_conv3d_ndhwc_tensorcore_implement = {
-    "cuda": (topi.cuda.conv3d_ndhwc_tensorcore, topi.cuda.schedule_conv3d_ndhwc_tensorcore)
-}
-
-
-def verify_conv3d_ndhwc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-    devices="cuda",
-):
-    """Test the conv3d with tensorcore for ndhwc layout"""
-    pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(
-        padding, (kernel, kernel, kernel)
-    )
-    padding_sum = pad_front + pad_top + pad_left + pad_back + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-    )
-
-    in_depth = in_height = in_width = in_size
-
-    dtype = "float16"
-    A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), dtype, name="A")
-    W = te.placeholder((kernel, kernel, kernel, in_channel, num_filter), dtype, name="W")
-    bias = te.placeholder((1, 1, 1, 1, num_filter), dtype, name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-
-    @memoize("topi.tests.test_topi_conv3d_ndhwc.verify_conv3d_ndhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv3d_ndhwc_python(a_np, dw_np, stride, padding)
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(
-                device, _conv3d_ndhwc_tensorcore_implement
-            )
-            C = fcompute(A, W, stride, padding, dilation, 1, "float16")
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
-            )
-            func(a, w, c)
-
-        # Tensorcores are very inaccurate, with large shapes, the accumulation
-        # error is high especially away from 1. We disable atol as it is very
-        # large for these numbers that are far away from 1.
-        tvm.testing.assert_allclose(c.numpy(), c_np, atol=1e200, rtol=0.01)
-
-    check_device(devices)
-
-
-@tvm.testing.requires_tensorcore
-@tvm.testing.requires_cuda
-def test_conv3d_ndhwc_tensorcore():
-    """Test the conv3d with tensorcore for ndhwc layout"""
-    verify_conv3d_ndhwc(16, 16, 14, 16, 3, 1, 1)
-    verify_conv3d_ndhwc(16, 64, 7, 64, 7, 1, 3)
-    verify_conv3d_ndhwc(16, 32, 7, 32, 7, 1, 3)
-
-    verify_conv3d_ndhwc(32, 16, 14, 16, 3, 1, 1, add_bias=True)
-    verify_conv3d_ndhwc(32, 16, 14, 16, 3, 1, 1, add_relu=True)
-    verify_conv3d_ndhwc(32, 16, 14, 16, 3, 1, 1, add_relu=True, add_bias=True)
-
-    verify_conv3d_ndhwc(16, 16, 17, 16, 7, 1, (3, 3, 3, 2, 2, 2))
-    verify_conv3d_ndhwc(16, 16, 17, 16, 7, 1, "SAME")
-    verify_conv3d_ndhwc(8, 16, 35, 32, 5, 1, "VALID")
-    verify_conv3d_ndhwc(16, 32, 16, 32, 3, 1, (1, 1, 1, 1, 1, 1))
-    verify_conv3d_ndhwc(16, 16, 12, 16, 3, 1, (1, 1, 1, 1, 1, 1))
-
-
-if __name__ == "__main__":
-    test_conv3d_ndhwc_tensorcore()
diff --git a/tests/python/topi/test_topi_conv3d_transpose_ncdhw.py b/tests/python/topi/test_topi_conv3d_transpose_ncdhw.py
deleted file mode 100644
index 01ec2ba1df8a..000000000000
--- a/tests/python/topi/test_topi_conv3d_transpose_ncdhw.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for transposed convolution."""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-
-_conv3d_transpose_ncdhw_implement = {
-    "generic": (topi.nn.conv3d_transpose_ncdhw, topi.generic.schedule_conv3d_transpose_ncdhw),
-    "cpu": (topi.x86.conv3d_transpose_ncdhw, topi.x86.schedule_conv3d_transpose_ncdhw),
-    "gpu": (topi.cuda.conv3d_transpose_ncdhw, topi.cuda.schedule_conv3d_transpose_ncdhw),
-}
-
-
-def verify_conv3d_transpose_ncdhw(
-    batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding
-):
-    in_depth, in_height, in_width = in_size
-    kernel_depth, kernel_height, kernel_width = kernel
-    stride_depth, stride_height, stride_width = stride
-    pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = padding
-
-    A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name="A")
-    W = te.placeholder(
-        (in_channel, num_filter, kernel_depth, kernel_height, kernel_width), name="W"
-    )
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv3d_transpose.verify_conv3d_transpose_ncdhw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = tvm.topi.testing.conv3d_transpose_ncdhw_python(
-            a_np, w_np, stride, padding, output_padding
-        )
-        c_np = np.maximum(b_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(
-                target, _conv3d_transpose_ncdhw_implement
-            )
-            B = fcompute(
-                A,
-                W,
-                [stride_depth, stride_height, stride_width],
-                [pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right],
-                A.dtype,
-                output_padding,
-            )
-            C = topi.nn.relu(B)
-            s1 = fschedule([B])
-            s2 = fschedule([C])
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-        func1 = tvm.build(s1, [A, W, B], target)
-        func2 = tvm.build(s2, [A, W, C], target)
-        func1(a, w, b)
-        func2(a, w, c)
-        tvm.testing.assert_allclose(b.numpy(), b_np, atol=1e-4, rtol=1e-4)
-        tvm.testing.assert_allclose(c.numpy(), c_np, atol=1e-4, rtol=1e-4)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_conv3d_transpose_ncdhw():
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 1, (1, 1, 1), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 2, (3, 3, 3), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (3, 3, 3), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (3, 3, 3), (3, 3, 3), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (3, 3, 3), (3, 3, 3), (0, 0, 0, 0, 0, 0), (2, 2, 2)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (3, 3, 3), (3, 3, 3), (0, 0, 0, 0, 0, 0), (1, 0, 2)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (3, 3, 3), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (3, 3, 3), (2, 2, 2), (1, 1, 1, 1, 1, 1), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 3, (24, 24, 24), 16, (2, 2, 2), (2, 2, 2), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 32, (5, 5, 5), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 64, (5, 5, 5), (2, 2, 2), (1, 1, 1, 1, 1, 1), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 64, (5, 5, 5), (2, 2, 2), (1, 1, 1, 1, 1, 1), (1, 1, 1)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 64, (3, 5, 7), (2, 2, 2), (1, 1, 1, 1, 1, 1), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 64, (3, 5, 5), (2, 2, 2), (1, 1, 1, 1, 1, 1), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 64, (3, 3, 7), (2, 2, 2), (1, 1, 1, 1, 1, 1), (0, 0, 0)
-    )
-    verify_conv3d_transpose_ncdhw(
-        1, 8, (32, 32, 32), 64, (3, 5, 3), (2, 2, 2), (1, 1, 1, 1, 1, 1), (0, 0, 0)
-    )
-
-
-if __name__ == "__main__":
-    test_conv3d_transpose_ncdhw()
diff --git a/tests/python/topi/test_topi_conv3d_winograd.py b/tests/python/topi/test_topi_conv3d_winograd.py
deleted file mode 100644
index af613437d6cd..000000000000
--- a/tests/python/topi/test_topi_conv3d_winograd.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for 3d convolution with winograd."""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.utils import get_pad_tuple3d
-from tvm.topi.utils import get_const_tuple
-
-
-_conv3d_ncdhw_implement = {
-    "gpu": (topi.cuda.conv3d_ncdhw_winograd, topi.cuda.schedule_conv3d_ncdhw_winograd),
-}
-
-
-def verify_conv3d_ncdhw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    depth_kernel,
-    space_kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-):
-    pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(
-        padding, (depth_kernel, space_kernel, space_kernel)
-    )
-    padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, space_kernel, stride, padding_sum, dilation)
-    )
-
-    in_depth = in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name="A")
-    W = te.placeholder((num_filter, in_channel, depth_kernel, space_kernel, space_kernel), name="W")
-    bias = te.placeholder((num_filter, 1, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv3d_ncdhw.verify_conv3d_ncdhw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation, dilation))
-        c_np = tvm.topi.testing.conv3d_ncdhw_python(a_np, dw_np, stride, padding)
-        if add_bias:
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ncdhw_implement)
-        with tvm.target.Target(device):
-            C = fcompute(
-                A, W, (stride, stride, stride), padding, (dilation, dilation, dilation), 1, dtype
-            )
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    space_kernel,
-                    stride,
-                    padding_sum,
-                    dilation,
-                ),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    space_kernel,
-                    stride,
-                    padding_sum,
-                    dilation,
-                ),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4, atol=1e-6)
-
-    for device in ["cuda"]:
-        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
-            check_device(device)
-
-
-@tvm.testing.requires_gpu
-def test_conv3d_ncdhw():
-    # Try without depth transformation
-    # 3DCNN  workloads
-    verify_conv3d_ncdhw(1, 61, 20, 120, 3, 3, 1, 0)
-    verify_conv3d_ncdhw(1, 61, 20, 120, 1, 3, 1, 0)
-    verify_conv3d_ncdhw(1, 61, 20, 120, 5, 3, 1, 0)
-    verify_conv3d_ncdhw(1, 61, 20, 120, 5, 5, 1, 2)
-    verify_conv3d_ncdhw(1, 61, 20, 120, 1, 5, 1, 2)
-    verify_conv3d_ncdhw(1, 61, 20, 120, 7, 7, 1, 3)
-    verify_conv3d_ncdhw(1, 128, 12, 256, 3, 3, 1, 1)
-    verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1)
-
-    # bias, relu
-    verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1, add_relu=True)
-    verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1, add_relu=True, add_bias=True)
-    verify_conv3d_ncdhw(1, 64, 12, 128, 1, 3, 1, 1, add_relu=True, add_bias=True)
-
-    # dilation = 2
-    verify_conv3d_ncdhw(1, 16, 12, 16, 3, 3, 1, "VALID", dilation=2)
-    verify_conv3d_ncdhw(1, 16, 12, 16, 1, 3, 1, "VALID", dilation=2)
-
-    # batch size
-    verify_conv3d_ncdhw(4, 32, 12, 64, 3, 3, 1, 1)
-    verify_conv3d_ncdhw(4, 32, 12, 64, 1, 3, 1, 1)
-
-    # weird workloads
-    verify_conv3d_ncdhw(2, 2, 2, 2, 3, 3, 1, 2)
-    verify_conv3d_ncdhw(3, 3, 3, 3, 3, 3, 1, 3)
-
-
-if __name__ == "__main__":
-    test_conv3d_ncdhw()
diff --git a/tests/python/topi/test_topi_correlation.py b/tests/python/topi/test_topi_correlation.py
deleted file mode 100644
index 6592e9bdad07..000000000000
--- a/tests/python/topi/test_topi_correlation.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-"""test of correlation operator in NCHW layout"""
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import autotvm, te, topi
-
-_correlation_implement = {
-    "generic": (topi.nn.correlation_nchw, topi.generic.schedule_correlation_nchw),
-    "gpu": (topi.cuda.correlation_nchw, topi.cuda.schedule_correlation_nchw),
-}
-
-(
-    data_shape,
-    kernel_size,
-    max_displacement,
-    stride1,
-    stride2,
-    pad_size,
-    is_multiply,
-) = tvm.testing.parameters(
-    ((1, 3, 10, 10), 1, 4, 1, 1, 4, True),
-    ((1, 3, 10, 10), 1, 5, 1, 1, 5, True),
-    ((5, 1, 4, 4), 3, 1, 2, 1, 2, True),
-    ((5, 1, 6, 4), 3, 1, 2, 2, 2, False),
-    ((5, 1, 11, 11), 5, 1, 1, 1, 2, False),
-)
-
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    dtype, data_shape, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply
-):
-    a_np = np.random.uniform(size=data_shape).astype(dtype)
-    b_np = np.random.uniform(size=data_shape).astype(dtype)
-    c_np = tvm.topi.testing.correlation_nchw_python(
-        a_np, b_np, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply
-    )
-    return a_np, b_np, c_np
-
-
-def test_correlation_nchw(
-    target,
-    dev,
-    ref_data,
-    dtype,
-    kernel_size,
-    max_displacement,
-    stride1,
-    stride2,
-    pad_size,
-    is_multiply,
-):
-    a_np, b_np, c_np = ref_data
-
-    A = te.placeholder(a_np.shape, name="data1", dtype=dtype)
-    B = te.placeholder(b_np.shape, name="data2", dtype=dtype)
-
-    fcompute, fschedule = tvm.topi.testing.dispatch(target, _correlation_implement)
-    with tvm.target.Target(target):
-        C = fcompute(A, B, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply)
-        s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.empty(c_np.shape, dtype=dtype, device=dev)
-
-        func = tvm.build(s, [A, B, C], target)
-        func(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_deformable_conv2d.py b/tests/python/topi/test_topi_deformable_conv2d.py
deleted file mode 100644
index 70cc9a690cdc..000000000000
--- a/tests/python/topi/test_topi_deformable_conv2d.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-import tvm.testing
-
-
-_deformable_conv2d_nchw_implement = {
-    "generic": (topi.nn.deformable_conv2d_nchw, topi.generic.schedule_deformable_conv2d_nchw),
-    "cuda": (topi.cuda.deformable_conv2d_nchw, topi.cuda.schedule_deformable_conv2d_nchw),
-}
-
-_deformable_conv2d_nhwc_implement = {
-    "generic": (topi.nn.deformable_conv2d_nhwc, topi.generic.schedule_deformable_conv2d_nhwc),
-}
-
-
-def verify_deformable_conv2d_nchw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    deformable_groups=1,
-    groups=1,
-):
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            deformable_groups,
-            groups,
-        )
-    )
-
-    A = te.placeholder((batch, in_channel, in_size, in_size), name="A")
-    out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1
-    Offset = te.placeholder(
-        (batch, deformable_groups * kernel * kernel * 2, out_size, out_size), name="offset"
-    )
-    W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
-    bias = te.placeholder((num_filter, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    offset_shape = get_const_tuple(Offset.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nchw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        offset_np = np.random.randn(*offset_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        c_np = tvm.topi.testing.deformable_conv2d_nchw_python(
-            a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups
-        )
-
-        return a_np, offset_np, w_np, c_np
-
-    a_np, offset_np, w_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        fcompute, fschedule = tvm.topi.testing.dispatch(device, _deformable_conv2d_nchw_implement)
-        with tvm.target.Target(device):
-            C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype)
-            s = fschedule([C])
-
-            a = tvm.nd.array(a_np, dev)
-            offset = tvm.nd.array(offset_np, dev)
-            w = tvm.nd.array(w_np, dev)
-            c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev)
-
-            func = tvm.build(s, [A, Offset, W, C], device)
-            func(a, offset, w, c)
-            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for device in ["llvm", "cuda"]:
-        check_device(device)
-
-
-def verify_deformable_conv2d_nhwc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    deformable_groups=1,
-    groups=1,
-):
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (
-            batch,
-            in_channel,
-            in_size,
-            num_filter,
-            kernel,
-            stride,
-            padding,
-            dilation,
-            deformable_groups,
-            groups,
-        )
-    )
-
-    A = te.placeholder((batch, in_size, in_size, in_channel), name="A")
-    out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1
-    Offset = te.placeholder(
-        (batch, out_size, out_size, deformable_groups * kernel * kernel * 2), name="offset"
-    )
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
-    bias = te.placeholder((num_filter,), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    offset_shape = get_const_tuple(Offset.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        offset_np = np.random.randn(*offset_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        c_np = tvm.topi.testing.deformable_conv2d_nhwc_python(
-            a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups
-        )
-
-        return a_np, offset_np, w_np, c_np
-
-    a_np, offset_np, w_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        fcompute, fschedule = tvm.topi.testing.dispatch(device, _deformable_conv2d_nhwc_implement)
-        with tvm.target.Target(device):
-            C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype)
-            s = fschedule([C])
-
-            a = tvm.nd.array(a_np, dev)
-            offset = tvm.nd.array(offset_np, dev)
-            w = tvm.nd.array(w_np, dev)
-            c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev)
-
-            func = tvm.build(s, [A, Offset, W, C], device)
-            func(a, offset, w, c)
-            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for device in ["llvm"]:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def test_deformable_conv2d_nchw():
-    verify_deformable_conv2d_nchw(1, 16, 7, 16, 1, 1, 0, deformable_groups=4)
-    verify_deformable_conv2d_nchw(1, 16, 7, 16, 3, 1, 1, dilation=2, deformable_groups=4)
-    verify_deformable_conv2d_nchw(1, 16, 7, 16, 3, 1, 2, dilation=2)
-
-
-def test_deformable_conv2d_nhwc():
-    verify_deformable_conv2d_nhwc(1, 16, 7, 16, 1, 1, 0, deformable_groups=4)
-    verify_deformable_conv2d_nhwc(1, 16, 7, 16, 3, 1, 1, dilation=2, deformable_groups=4)
-    verify_deformable_conv2d_nhwc(1, 16, 7, 16, 3, 1, 2, dilation=2)
-
-
-if __name__ == "__main__":
-    test_deformable_conv2d_nchw()
-    test_deformable_conv2d_nhwc()
diff --git a/tests/python/topi/test_topi_dense.py b/tests/python/topi/test_topi_dense.py
deleted file mode 100644
index 8f6523366878..000000000000
--- a/tests/python/topi/test_topi_dense.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for dense operator"""
-import contextlib
-import numpy as np
-import pytest
-import sys
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import te, topi
-from tvm.topi.utils import get_const_tuple
-
-from common import Int8Fallback
-
-random_seed = tvm.testing.parameter(0)
-
-use_bias = tvm.testing.parameter(True, False)
-batch_size = tvm.testing.parameter(1, 2, 128)
-in_dim, out_dim = tvm.testing.parameters((1024, 1000))
-in_dtype, out_dtype = tvm.testing.parameters(
-    ("float32", "float32"),
-    ("float16", "float16"),
-    ("int8", "int32"),
-)
-
-
-_dense_implementations = {
-    "generic": [(topi.nn.dense, topi.generic.schedule_dense)],
-    "cpu": [
-        (topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
-        (topi.x86.dense_pack, topi.x86.schedule_dense_pack),
-        (topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
-    ],
-    "gpu": [
-        (topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
-        (topi.gpu.dense_large_batch, topi.gpu.schedule_dense_large_batch),
-    ],
-    "mali": [(topi.mali.dense, topi.mali.schedule_dense)],
-    "bifrost": [(topi.bifrost.dense, topi.bifrost.schedule_dense)],
-    "hls": [(topi.nn.dense, topi.hls.schedule_dense)],
-}
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def dense_ref_data(random_seed, batch_size, in_dim, out_dim, use_bias, in_dtype, out_dtype):
-    np.random.seed(random_seed)
-
-    if "float" in in_dtype:
-        a_np = np.random.uniform(size=(batch_size, in_dim)).astype(in_dtype)
-        b_np = np.random.uniform(size=(out_dim, in_dim)).astype(in_dtype)
-        c_np = np.random.uniform(size=(out_dim,)).astype(out_dtype)
-    elif in_dtype == "int8":
-        a_np = np.random.randint(low=-128, high=127, size=(batch_size, in_dim)).astype(in_dtype)
-        b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(in_dtype)
-        c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(out_dtype)
-    else:
-        raise ValueError("No method to generate test data for data type '{}'".format(in_dtype))
-
-    matmul = np.dot(a_np.astype(out_dtype), b_np.T.astype(out_dtype))
-
-    if use_bias:
-        matmul += c_np
-
-    d_np = np.maximum(matmul, 0)
-    return (a_np, b_np, c_np, d_np)
-
-
-def test_dense(
-    target,
-    dev,
-    batch_size,
-    in_dim,
-    out_dim,
-    use_bias,
-    dense_ref_data,
-    in_dtype,
-    out_dtype,
-    implementations=None,
-):
-    target = tvm.target.Target(target)
-
-    if target.kind.name == "cuda":
-        if in_dtype == "int8" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
-            pytest.xfail("CUDA int8 intrinsics not available")
-
-        if in_dtype == "float16" and not tvm.contrib.nvcc.have_fp16(dev.compute_version):
-            pytest.xfail("CUDA float16 intrinsics not available")
-
-    if target.kind.name == "vulkan":
-        if in_dtype == "int8" and (
-            not target.attrs.get("supports_int8", False)
-            or not target.attrs.get("supports_8bit_buffer", False)
-        ):
-            pytest.xfail("Vulkan int8 driver support not available")
-        if in_dtype == "float16" and (
-            not target.attrs.get("supports_float16", False)
-            or not target.attrs.get("supports_16bit_buffer", False)
-        ):
-            pytest.xfail("Vulkan float16 driver support not available")
-
-    if (
-        target.kind.name not in ["llvm", "c"]
-        and len(set(target.keys) & set(_dense_implementations)) == 0
-    ):
-        pytest.xfail("No implementation for tvm.topi.testing.dispatch to find")
-
-    if "int" in in_dtype:
-        tol = {"atol": 0, "rtol": 0}
-    elif in_dtype == "float32":
-        tol = {"rtol": 1e-5, "atol": 1e-5}
-    elif in_dtype == "float16":
-        tol = {"rtol": 5e-2, "atol": 1e-5}
-
-    A = te.placeholder((batch_size, in_dim), name="A", dtype=in_dtype)
-    B = te.placeholder((out_dim, in_dim), name="B", dtype=in_dtype)
-    C = te.placeholder((out_dim,), name="C", dtype=out_dtype)
-
-    a_np, b_np, c_np, d_np = dense_ref_data
-
-    if implementations is None:
-        implementations = tvm.topi.testing.dispatch(target, _dense_implementations)
-
-    for fcompute, fschedule in implementations:
-        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
-            continue
-        with tvm.target.Target(target):
-            D = fcompute(A, B, C if use_bias else None, out_dtype)
-            D = topi.nn.relu(D)
-            s = fschedule([D])
-
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(c_np, dev)
-        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
-        f = tvm.build(s, [A, B, C, D], target, name="dense")
-        f(a, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), d_np, **tol)
-
-
-@pytest.mark.parametrize("target,in_dtype,out_dtype", [("cuda", "int8", "int32")])
-def test_dense_cuda_int8(
-    target,
-    dev,
-    batch_size,
-    in_dim,
-    out_dim,
-    use_bias,
-    dense_ref_data,
-    in_dtype,
-    out_dtype,
-):
-    implementations = [
-        (topi.cuda.dense_int8, topi.cuda.schedule_dense_int8),
-    ]
-    with Int8Fallback():
-        test_dense(
-            target,
-            dev,
-            batch_size,
-            in_dim,
-            out_dim,
-            use_bias,
-            dense_ref_data,
-            in_dtype,
-            out_dtype,
-            implementations=implementations,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_dense_tensorcore.py b/tests/python/topi/test_topi_dense_tensorcore.py
deleted file mode 100644
index 7e7d3f2209d3..000000000000
--- a/tests/python/topi/test_topi_dense_tensorcore.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
-"""Test code for dense tensorcore operator"""
-import numpy as np
-import tvm
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm import te
-from tvm.contrib.pickle_memoize import memoize
-import tvm.testing
-
-
-_dense_implement = {"gpu": [(topi.cuda.dense_tensorcore, topi.cuda.schedule_dense_tensorcore)]}
-
-
-def convert_int32_into_int4(a_int32):
-    """convert int32 values into int4
-    Parameters
-    ----------
-    a_int32 : int
-
-    Return
-    ------
-    a_int4 : int
-    """
-    K, L = a_int32.shape
-    assert L % 8 == 0
-    a_int4 = np.zeros(shape=(K, L // 8), dtype=np.int32)
-    for k in range(K):
-        for l in range(L // 8):
-            for m in range(min(8, L - l * 8)):
-                a_int4[k, l] = a_int4[k, l] | ((a_int32[k, l * 8 + m] & 0xF) << ((7 - m) * 4))
-    return a_int4
-
-
-def convert_int32_into_int4_bias(a_int32):
-    """convert int32 values into int4
-    Parameters
-    ----------
-    a_int32 : int
-
-    Return
-    ------
-    a_int4 : int
-    """
-    (L,) = a_int32.shape
-    assert L % 8 == 0
-    a_int4 = np.zeros(shape=(L // 8), dtype=np.int32)
-    for l in range(L // 8):
-        for m in range(min(8, L - l * 8)):
-            a_int4[l] = a_int4[l] | ((a_int32[l * 8 + m] & 0xF) << ((7 - m) * 4))
-    return a_int4
-
-
-def verify_dense(batch, in_dim, out_dim, dtype, use_bias=True):
-    """Dense tensorcore verify function"""
-    A = te.placeholder((batch, in_dim), name="A", dtype=dtype)
-    B = te.placeholder((out_dim, in_dim), name="B", dtype=dtype)
-    C = te.placeholder((out_dim,), name="C", dtype=dtype)
-
-    assert dtype in ["int4", "int8", "float16"]
-
-    out_dtype = "float32"
-    if dtype in ["int8", "int4"]:
-        out_dtype = "int32"
-
-    # use memoize to pickle the test data for next time use
-    @memoize("topi.tests.test_topi_dense_tensorcore")
-    def get_ref_data():
-        if dtype == "int4":
-            a_np = np.random.randint(low=-8, high=7, size=(batch, in_dim))
-            b_np = np.random.randint(low=-8, high=7, size=(out_dim, in_dim))
-            c_np = np.random.randint(low=-8, high=7, size=(out_dim,))
-        elif dtype == "int8":
-            a_np = np.random.randint(low=-128, high=127, size=(batch, in_dim)).astype(dtype)
-            b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(dtype)
-            c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(dtype)
-        else:
-            a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype)
-            b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype)
-            c_np = np.random.uniform(size=(out_dim,)).astype(dtype)
-        d_np = tvm.topi.testing.dense(a_np, b_np, c_np, use_bias, True, out_dtype)
-        return (a_np, b_np, c_np, d_np)
-
-    # get the test data
-    a_np, b_np, c_np, d_np = get_ref_data()
-    if dtype == "int4":
-        a_np = convert_int32_into_int4(a_np)
-        b_np = convert_int32_into_int4(b_np)
-        c_np = convert_int32_into_int4_bias(c_np)
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        print("Running on target: %s" % device)
-        for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement):
-            with tvm.target.Target(device):
-                D = fcompute(A, B, C if use_bias else None, out_dtype)
-                D = topi.nn.relu(D)
-                s = fschedule([D])
-            a = tvm.nd.array(a_np, dev)
-            b = tvm.nd.array(b_np, dev)
-            c = tvm.nd.array(c_np, dev)
-            d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
-            f = tvm.build(s, [A, B, C, D], device, name="dense")
-            f(a, b, c, d)
-            tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-3)
-
-    check_device("cuda")
-
-
-@tvm.testing.requires_tensorcore
-def test_dense_tensorcore():
-    """Test cases"""
-    for dtype in ["float16", "int8"]:
-        verify_dense(8, 16, 32, "float16", use_bias=True)
-        verify_dense(16, 32, 16, dtype, use_bias=True)
-        verify_dense(256, 1024, 1024, dtype, use_bias=True)
-        verify_dense(1000, 1024, 1024, dtype, use_bias=False)
-        verify_dense(256, 2048, 1000, dtype, use_bias=False)
-    # TODO: need fix int4 use_bias=True, wyc-ruiker
-    verify_dense(16, 32, 16, "int4", use_bias=False)
-    verify_dense(256, 1024, 1024, "int4", use_bias=False)
-    verify_dense(1000, 1024, 1024, "int4", use_bias=False)
-    verify_dense(256, 2048, 1000, "int4", use_bias=False)
-
-
-if __name__ == "__main__":
-    test_dense_tensorcore()
diff --git a/tests/python/topi/test_topi_depth_to_space.py b/tests/python/topi/test_topi_depth_to_space.py
deleted file mode 100644
index dae5d21b363b..000000000000
--- a/tests/python/topi/test_topi_depth_to_space.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for depth to space"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-
-
-def verify_depth_to_space(
-    block_size, batch, in_channel, in_height, in_width, layout="NCHW", mode="DCR"
-):
-    out_channel = int(in_channel / (block_size * block_size))
-    out_height = int(in_height * block_size)
-    out_width = int(in_width * block_size)
-
-    if layout == "NCHW":
-        in_shape = [batch, in_channel, in_height, in_width]
-        out_shape = [batch, out_channel, out_height, out_width]
-    elif layout == "NHWC":
-        in_shape = [batch, in_height, in_width, in_channel]
-        out_shape = [batch, out_height, out_width, out_channel]
-    else:
-        raise NotImplementedError("Layout not supported {}".format(layout))
-
-    A = te.placeholder(in_shape, name="A", dtype="float32")
-    dtype = A.dtype
-    a_np = np.random.uniform(size=in_shape).astype(dtype)
-
-    B = topi.nn.depth_to_space(A, block_size=block_size, layout=layout, mode=mode)
-    if layout == "NHWC":
-        a_np = np.transpose(a_np, axes=[0, 3, 1, 2])
-    b_np = tvm.topi.testing.depth_to_space_python(a_np, block_size, mode=mode)
-    if layout == "NHWC":
-        a_np = np.transpose(a_np, axes=[0, 2, 3, 1])
-        b_np = np.transpose(b_np, axes=[0, 2, 3, 1])
-
-    def check_device(device, dev):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], device)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    for device, dev in tvm.testing.enabled_targets():
-        check_device(device, dev)
-
-
-@tvm.testing.uses_gpu
-def test_depth_to_space():
-    for layout in ["NCHW", "NHWC"]:
-        for mode in ["DCR", "CDR"]:
-            # Simplest possible case
-            verify_depth_to_space(2, 1, 4, 1, 1, layout=layout, mode=mode)
-            # Average input size
-            verify_depth_to_space(2, 1, 32, 32, 32, layout=layout, mode=mode)
-            # Large block size
-            verify_depth_to_space(8, 1, 256, 32, 32, layout=layout, mode=mode)
-            # Large batch size
-            verify_depth_to_space(4, 8, 32, 32, 32, layout=layout, mode=mode)
-            # Large input size
-            verify_depth_to_space(4, 8, 32, 128, 128, layout=layout, mode=mode)
-
-
-if __name__ == "__main__":
-    test_depth_to_space()
diff --git a/tests/python/topi/test_topi_depthwise_conv2d.py b/tests/python/topi/test_topi_depthwise_conv2d.py
deleted file mode 100644
index a4bfbbfe8ec3..000000000000
--- a/tests/python/topi/test_topi_depthwise_conv2d.py
+++ /dev/null
@@ -1,441 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import autotvm, te, topi
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.depthwise_conv2d import _get_workload
-from tvm.topi.x86.depthwise_conv2d import _fallback_schedule
-from tvm.topi.generic import conv2d as conv2d_generic
-
-
-_depthwise_conv2d_implement = {
-    "NCHW": {
-        "generic": [(topi.nn.depthwise_conv2d_nchw, topi.generic.schedule_depthwise_conv2d_nchw)],
-        "arm_cpu": [
-            (topi.arm_cpu.depthwise_conv2d_nchw, topi.arm_cpu.schedule_depthwise_conv2d_nchw),
-            (
-                topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack,
-                topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack,
-            ),
-        ],
-        "gpu": [(topi.cuda.depthwise_conv2d_nchw, topi.cuda.schedule_depthwise_conv2d_nchw)],
-        "mali": [(topi.mali.depthwise_conv2d_nchw, topi.mali.schedule_depthwise_conv2d_nchw)],
-        "bifrost": [(topi.nn.depthwise_conv2d_nchw, topi.bifrost.schedule_depthwise_conv2d_nchw)],
-        "intel_graphics": [
-            (
-                topi.intel_graphics.depthwise_conv2d_nchw,
-                topi.intel_graphics.schedule_depthwise_conv2d_nchw,
-            )
-        ],
-    },
-    "NHWC": {
-        "generic": [
-            (topi.nn.depthwise_conv2d_nhwc, topi.generic.schedule_depthwise_conv2d_nhwc),
-            (topi.nn.depthwise_conv2d_nhwc, conv2d_generic.schedule_depthwise_conv2d_nhwc),
-        ],
-        "arm_cpu": [
-            (
-                topi.arm_cpu.compute_depthwise_conv2d_nhwc,
-                topi.arm_cpu.schedule_depthwise_conv2d_nhwc,
-            )
-        ],
-        "gpu": [(topi.nn.depthwise_conv2d_nhwc, topi.cuda.schedule_depthwise_conv2d_nhwc)],
-        "mali": [(topi.mali.depthwise_conv2d_nhwc, topi.mali.schedule_depthwise_conv2d_nhwc)],
-        "bifrost": [(topi.mali.depthwise_conv2d_nhwc, topi.mali.schedule_depthwise_conv2d_nhwc)],
-    },
-    "NCHWc": {
-        "generic": [(topi.x86.depthwise_conv2d_NCHWc, topi.x86.schedule_depthwise_conv2d_NCHWc)],
-    },
-}
-
-random_seed = tvm.testing.parameter(0)
-
-in_dtype, out_dtype = tvm.testing.parameters(
-    ("float32", "float32"),
-    ("float16", "float16"),
-)
-
-
-@tvm.testing.fixture
-def input_shape(layout, batch, in_channel, in_size, filter_shape):
-    if layout == "NCHW":
-        return (batch, in_channel, in_size, in_size)
-    elif layout == "NHWC":
-        return (batch, in_size, in_size, in_channel)
-    elif layout == "NCHWc":
-        oc_block = filter_shape[-1]
-        ic_block = next(bn for bn in range(oc_block, 0, -1) if in_channel % bn == 0)
-        return (batch, in_channel // ic_block, in_size, in_size, ic_block)
-
-
-@tvm.testing.fixture
-def filter_shape(layout, in_channel, channel_multiplier, kernel):
-    filter_channel = in_channel
-    if layout == "NCHW":
-        return (filter_channel, channel_multiplier, kernel, kernel)
-    elif layout == "NHWC":
-        return (kernel, kernel, filter_channel, channel_multiplier)
-    elif layout == "NCHWc":
-        out_channel = in_channel * channel_multiplier
-        # For testing the functionality, we choose an arbitrary block
-        # size that can divide out_channel, regardless of the
-        # performance.
-        oc_block = next(bn for bn in range(16, 0, -1) if out_channel % bn == 0)
-        return (out_channel // oc_block, 1, kernel, kernel, 1, oc_block)
-
-
-@tvm.testing.fixture
-def scale_shape(layout, in_channel, channel_multiplier, filter_shape):
-    out_channel = in_channel * channel_multiplier
-
-    if layout in ("NCHW", "NHWC"):
-        return (out_channel,)
-
-    if layout == "NCHWc":
-        oc_block = filter_shape[-1]
-        return (out_channel // oc_block, oc_block)
-
-    raise ValueError("Unknown layout {}".format(layout))
-
-
-@tvm.testing.fixture
-def shift_shape(scale_shape):
-    return scale_shape
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    random_seed,
-    in_dtype,
-    out_dtype,
-    layout,
-    input_shape,
-    filter_shape,
-    dilation,
-    stride,
-    padding,
-    scale_shape,
-    shift_shape,
-    use_scale_shift,
-    apply_relu,
-):
-    np.random.seed(random_seed)
-
-    # scipy.signal.convolve2d does not support float16 data types, and
-    # the python fallback is too slow for general use.  Computing
-    # ref_data in float32 will have fewer rounding errors than the TVM
-    # float16 compute, but those vary based on schedule anyways.
-    conv_dtype = "float32" if in_dtype == "float16" else in_dtype
-
-    input_np = np.random.uniform(size=input_shape).astype(in_dtype)
-    filter_np = np.random.uniform(size=filter_shape).astype(in_dtype)
-    scale_np = np.random.uniform(size=scale_shape).astype(out_dtype)
-    shift_np = np.random.uniform(size=shift_shape).astype(out_dtype)
-    if layout == "NCHW":
-        np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchw
-        dilation = (1, 1, dilation, dilation)
-        reshape = (1, -1, 1, 1)
-    elif layout == "NHWC":
-        np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nhwc
-        dilation = (dilation, dilation, 1, 1)
-        reshape = (1, 1, 1, -1)
-    elif layout == "NCHWc":
-        np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchwc
-        dilation = (1, 1, dilation, dilation, 1, 1)
-        reshape = (1, scale_shape[0], 1, 1, scale_shape[1])
-
-    dilated_filter_np = tvm.topi.testing.dilate_python(filter_np, dilation)
-    output_np = np_depthwise_conv2d(
-        input_np.astype(conv_dtype), dilated_filter_np.astype(conv_dtype), stride, padding
-    ).astype(out_dtype)
-
-    if use_scale_shift:
-        output_np = output_np * scale_np.reshape(reshape) + shift_np.reshape(reshape)
-    if apply_relu:
-        output_np = np.maximum(output_np, 0)
-
-    return (
-        input_np,
-        filter_np,
-        scale_np,
-        shift_np,
-        output_np,
-    )
-
-
-class BaseDepthwiseConv2D:
-    """Provides the test_conv2d test function, to be used by other test classes.
-
-    Test parameter sets are split out into different classes for
-    readability (e.g. used for mobilenet), and for restrictions
-    (e.g. implemented only for llvm).
-    """
-
-    layout = tvm.testing.parameter("NCHW", "NHWC")
-
-    (batch, in_channel, in_size, channel_multiplier, kernel, stride) = tvm.testing.parameters(
-        (1, 728, 32, 1, 3, 1),
-        (4, 256, 64, 2, 5, 2),
-    )
-    padding = tvm.testing.parameter("SAME", "VALID")
-    dilation = tvm.testing.parameter(1, 2)
-
-    use_scale_shift = tvm.testing.parameter(True, False, ids=["with_scale_shift", "no_scale_shift"])
-    apply_relu = tvm.testing.parameter(True, False, ids=["with_relu", "no_relu"])
-
-    run_after_compile = True
-
-    def test_conv2d(
-        self,
-        target,
-        dev,
-        in_dtype,
-        out_dtype,
-        layout,
-        input_shape,
-        filter_shape,
-        scale_shape,
-        shift_shape,
-        use_scale_shift,
-        apply_relu,
-        batch,
-        in_channel,
-        channel_multiplier,
-        kernel,
-        stride,
-        padding,
-        dilation,
-        ref_data,
-    ):
-        target = tvm.target.Target(target)
-        if (
-            target.kind.name == "cuda"
-            and in_dtype == "float16"
-            and not tvm.contrib.nvcc.have_fp16(dev.compute_version)
-        ):
-            pytest.xfail("CUDA float16 intrinsics not available")
-
-        if (
-            target.kind.name == "vulkan"
-            and in_dtype == "float16"
-            and (
-                not target.attrs.get("supports_float16", False)
-                or not target.attrs.get("supports_16bit_buffer", False)
-            )
-        ):
-            pytest.xfail("Vulkan float16 driver support not available")
-
-        # Transform the padding argument from 'str' to 'tuple' to
-        # match the "workload" tuple in TopHub.  Which padding_args to
-        # use for each layout chosen to reproduce previous behavior.
-        if dilation == 1:
-            padding_args = get_pad_tuple(padding, (kernel, kernel))
-            padding_args_i = [0, 1, 2, 3] if layout == "NCHW" else [0, 1]
-            padding_args = [padding_args[i] for i in padding_args_i]
-        else:
-            padding_args = padding
-
-        # placeholder
-        Input = te.placeholder(input_shape, name="Input", dtype=in_dtype)
-        Filter = te.placeholder(filter_shape, name="Filter", dtype=in_dtype)
-        Scale = te.placeholder(scale_shape, name="Scale", dtype=out_dtype)
-        Shift = te.placeholder(shift_shape, name="Shift", dtype=out_dtype)
-
-        if layout == "NCHW":
-            topi_scale_shift = topi.nn.scale_shift_nchw
-            fcompute_args = (Input, Filter, stride, padding_args, dilation, out_dtype)
-
-        elif layout == "NHWC":
-            topi_scale_shift = topi.nn.scale_shift_nhwc
-            fcompute_args = (Input, Filter, stride, padding_args, dilation, out_dtype)
-
-        elif layout == "NCHWc":
-            topi_scale_shift = topi.nn.scale_shift_nchwc
-            in_layout = "NCHW{}c".format(input_shape[-1])
-            out_layout = "NCHW{}c".format(filter_shape[-1])
-            fcompute_args = (
-                Input,
-                Filter,
-                stride,
-                padding,
-                dilation,
-                in_layout,
-                out_layout,
-                out_dtype,
-            )
-
-        with autotvm.tophub.context(target):  # load tophub pre-tuned parameters
-            impl_list = tvm.topi.testing.dispatch(target, _depthwise_conv2d_implement[layout])[:]
-            if target == "llvm" and layout == "NCHW" and channel_multiplier == 1 and dilation == 1:
-                impl_list.append(
-                    (topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw)
-                )
-
-            for fcompute, fschedule in impl_list:
-                with tvm.target.Target(target):
-                    # Declare, build schedule
-                    C = fcompute(*fcompute_args)
-                    if use_scale_shift:
-                        C = topi_scale_shift(C, Scale, Shift)
-                    if apply_relu:
-                        C = topi.nn.relu(C)
-
-                    s = fschedule(C)
-
-                # Build and run
-                f = tvm.build(s, [Input, Filter, Scale, Shift, C], target)
-
-                if self.run_after_compile:
-                    input_np, filter_np, scale_np, shift_np, output_np = ref_data
-                    if "int" in out_dtype:
-                        tol = {"atol": 0, "rtol": 0}
-                    elif out_dtype == "float32":
-                        tol = {"rtol": 1e-4, "atol": 1e-5}
-                    elif out_dtype == "float16":
-                        # A summation in float16 with a single accumulator very
-                        # quickly runs into large rounding errors.  At some point,
-                        # this tolerance should be schedule-dependent for to avoid
-                        # false negatives.
-                        num_values_summed = kernel * kernel
-                        gap_size = (
-                            np.nextafter(output_np.max(), np.inf, dtype=output_np.dtype)
-                            - output_np.max()
-                        )
-                        tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}
-
-                    input_tvm = tvm.nd.array(input_np, dev)
-                    filter_tvm = tvm.nd.array(filter_np, dev)
-                    scale_tvm = tvm.nd.array(scale_np, dev)
-                    shift_tvm = tvm.nd.array(shift_np, dev)
-                    output_tvm = tvm.nd.array(
-                        np.zeros(shape=get_const_tuple(C.shape), dtype=C.dtype),
-                        dev,
-                    )
-
-                    f(input_tvm, filter_tvm, scale_tvm, shift_tvm, output_tvm)
-                    tvm.testing.assert_allclose(output_np, output_tvm.numpy(), **tol)
-
-
-class TestDepthwiseConv2D(BaseDepthwiseConv2D):
-    """Test variety of parameters, defined in BaseDepthwiseConv2D.  Also
-    has llvm-specific tests for workload padding."""
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_workload_padding(
-        self,
-        out_dtype,
-        layout,
-        input_shape,
-        filter_shape,
-        target,
-        ref_data,
-        stride,
-        padding,
-        dilation,
-    ):
-        input_np, filter_np, scale_np, shift_np, output_np = ref_data
-        if layout == "NCHW":
-            _, _, out_height, out_width = output_np.shape
-        elif layout == "NHWC":
-            _, out_height, out_width, _ = output_np.shape
-        elif layout == "NCHWc":
-            _, _, out_height, out_width, _ = output_np.shape
-
-        Input = te.placeholder(input_shape, name="Input")
-        Filter = te.placeholder(filter_shape, name="Filter")
-        wkl = _get_workload(Input, Filter, (stride, stride), padding, dilation, out_dtype, layout)
-
-        # check if tile_ow candidates are the factors of the right output weight.
-        with tvm.target.Target(target):
-            cfg = autotvm.get_config()
-            _fallback_schedule(cfg, wkl)
-            ow_tile = np.prod(cfg["tile_ow"].size)
-
-            tvm.testing.assert_allclose(ow_tile, out_width)
-
-
-class TestDepthwiseConv2D_MobilenetWorkloads(BaseDepthwiseConv2D):
-    """Extra tests to verify functionality for workloads used by mobilenet."""
-
-    layout = tvm.testing.parameter("NCHW")
-
-    batch = tvm.testing.parameter(1)
-    channel_multiplier = tvm.testing.parameter(1)
-    kernel = tvm.testing.parameter(3)
-    padding = tvm.testing.parameter("SAME")
-    dilation = tvm.testing.parameter(1)
-
-    in_channel, in_size, stride = tvm.testing.parameters(
-        (32, 112, 1),
-        (64, 112, 2),
-        (128, 56, 1),
-        (128, 56, 2),
-        (256, 28, 1),
-        (256, 28, 2),
-        (512, 14, 1),
-        (512, 14, 2),
-        (1024, 7, 1),
-    )
-
-
-@tvm.testing.parametrize_targets("llvm")
-class TestDepthwiseConv2D_NCHWc(BaseDepthwiseConv2D):
-    """Tests specific to NCHWc layouts.
-
-    Once the implementation supports channel_multiplier>1 and GPU
-    devices, this class can be merged into TestDepthwiseConv2D.
-    """
-
-    # depthwise_conv2d_NCHWc currently does not support channel multiplier > 1
-    layout = tvm.testing.parameter("NCHWc")
-    (batch, in_channel, in_size, channel_multiplier, kernel, stride) = tvm.testing.parameters(
-        (1, 728, 32, 1, 3, 1),
-    )
-
-
-@tvm.testing.parametrize_targets("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu")
-class TestDepthwiseConv2DArmCompile(BaseDepthwiseConv2D):
-    """Compile-only tests for cross-compiling to ARM."""
-
-    layout = tvm.testing.parameter("NHWC", "NCHW")
-    batch = tvm.testing.parameter(1)
-    dilation = tvm.testing.parameter(1)
-    in_dtype, out_dtype = tvm.testing.parameters(("int16", "int32"))
-    in_channel = tvm.testing.parameter(728)
-    in_size = tvm.testing.parameter(32)
-    kernel = tvm.testing.parameter(1)
-    channel_multiplier = tvm.testing.parameter(1, 3)
-    stride = tvm.testing.parameter(1)
-    padding = tvm.testing.parameter("SAME")
-    use_scale_shift = tvm.testing.parameter(True, False, ids=["with_scale_shift", "no_scale_shift"])
-
-    run_after_compile = False
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_depthwise_conv2d_back_input.py b/tests/python/topi/test_topi_depthwise_conv2d_back_input.py
deleted file mode 100644
index 5087b0047315..000000000000
--- a/tests/python/topi/test_topi_depthwise_conv2d_back_input.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import topi
-import numpy as np
-from tvm.contrib.pickle_memoize import memoize
-from scipy import signal
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-import tvm.topi.testing
-from tvm.topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc
-import tvm.testing
-
-
-def verify_depthwise_conv2d_back_input(
-    batch, in_channel, in_h, channel_multiplier, filter_h, stride_h, padding_h
-):
-    in_w = in_h
-    filter_channel = in_channel
-    filter_w = filter_h
-    stride_w = stride_h
-    padding_w = padding_h
-
-    out_h = np.int32((in_h + 2 * padding_h - filter_h) / stride_h + 1)
-    out_w = np.int32((in_w + 2 * padding_w - filter_w) / stride_w + 1)
-    out_channel = in_channel * channel_multiplier
-
-    ishape = [batch, in_h, in_w, in_channel]
-    oshape = [batch, out_h, out_w, out_channel]
-
-    # placeholder
-    Out_grad = te.placeholder(oshape, name="Out_grad")
-    Filter = te.placeholder((filter_h, filter_w, filter_channel, channel_multiplier))
-    # declare
-    In_grad = topi.nn.depthwise_conv2d_backward_input_nhwc(
-        Filter,
-        Out_grad,
-        oshape,
-        ishape,
-        stride=[stride_h, stride_w],
-        padding=[padding_h, padding_w],
-    )
-    # schedule
-    schedule = schedule_depthwise_conv2d_backward_input_nhwc(In_grad)
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        # build the kernel
-        f = tvm.build(schedule, [Filter, Out_grad, In_grad], device)
-        # prepare pod type for test data closure
-        dtype = Out_grad.dtype
-        out_grad_shape = get_const_tuple(Out_grad.shape)
-        filter_shape = get_const_tuple(Filter.shape)
-
-        # use memoize to pickle the test data for next time use
-        @memoize("topi.tests.test_topi_depthwise_conv2d_backward_input.nhwc")
-        def get_ref_data():
-            out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype)
-            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
-            dilated_out_grad_np = tvm.topi.testing.dilate_python(
-                out_grad_np, [1, stride_h, stride_w, 1]
-            )
-            # padding params in forward propagation
-            fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
-                [padding_h, padding_w], (filter_h, filter_w)
-            )
-            # padding params in backward propagation
-            bpad_top = filter_h - 1 - fpad_top
-            bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1)
-            bpad_left = filter_w - 1 - fpad_left
-            bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1)
-
-            padded_out_grad = np.zeros(
-                (
-                    batch,
-                    dilated_out_grad_np.shape[1] + bpad_top + bpad_bottom,
-                    dilated_out_grad_np.shape[2] + bpad_left + bpad_right,
-                    out_channel,
-                )
-            )
-            padded_out_grad[
-                :,
-                bpad_top : dilated_out_grad_np.shape[1] + bpad_top,
-                bpad_left : dilated_out_grad_np.shape[2] + bpad_left,
-                :,
-            ] = dilated_out_grad_np
-
-            in_grad_np = np.zeros((batch, in_h, in_w, in_channel))
-            for b in range(batch):
-                for c in range(in_channel):
-                    for m in range(channel_multiplier):
-                        in_grad_np[b, :, :, c] += signal.convolve2d(
-                            padded_out_grad[b, :, :, c * channel_multiplier + m],
-                            filter_np[:, :, c, m],
-                            mode="valid",
-                        )[0:in_h, 0:in_w]
-            return (out_grad_np, filter_np, in_grad_np)
-
-        (out_grad_np, filter_np, in_grad_np) = get_ref_data()
-
-        out_grad_tvm = tvm.nd.array(out_grad_np, dev)
-        filter_tvm = tvm.nd.array(filter_np, dev)
-        in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), dev)
-        # launch the kernel
-        timer = f.time_evaluator(f.entry_name, dev, number=1)
-        tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean
-        tvm.testing.assert_allclose(in_grad_np, in_grad_tvm.numpy(), rtol=1e-5)
-
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
-
-
-@tvm.testing.requires_gpu
-def test_topi_depthwise_conv2d_backward_input_nhwc():
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 1, 1)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 3, 1, 1)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 5, 1, 2)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 5, 1, 2)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 2, 1)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 3, 2, 1)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 5, 2, 2)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 5, 2, 2)
-
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 1, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 3, 1, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 5, 1, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 5, 1, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 2, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 3, 2, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 1, 5, 2, 0)
-    verify_depthwise_conv2d_back_input(16, 256, 56, 2, 5, 2, 0)
-
-
-if __name__ == "__main__":
-    test_topi_depthwise_conv2d_backward_input_nhwc()
diff --git a/tests/python/topi/test_topi_depthwise_conv2d_back_weight.py b/tests/python/topi/test_topi_depthwise_conv2d_back_weight.py
deleted file mode 100644
index 0bbb0e6c0cca..000000000000
--- a/tests/python/topi/test_topi_depthwise_conv2d_back_weight.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-import numpy as np
-from tvm.contrib.pickle_memoize import memoize
-from scipy import signal
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-from tvm.topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc
-import tvm.testing
-
-
-def verify_depthwise_conv2d_back_weight(
-    batch, in_channel, in_h, channel_multiplier, filter_h, stride_h, padding_h
-):
-    in_w = in_h
-    filter_channel = in_channel
-    filter_w = filter_h
-    stride_w = stride_h
-    padding_w = padding_h
-
-    out_h = int((in_h + 2 * padding_h - filter_h) / stride_h + 1)
-    out_w = int((in_w + 2 * padding_w - filter_w) / stride_w + 1)
-    out_channel = in_channel * channel_multiplier
-
-    oshape = [batch, out_h, out_w, out_channel]
-    fshape = [filter_h, filter_w, in_channel, channel_multiplier]
-
-    # placeholder
-    Out_grad = te.placeholder(oshape, name="Out_grad")
-    Input = te.placeholder((batch, in_h, in_w, in_channel), name="In_grad")
-    # declare
-    Weight_grad = topi.nn.depthwise_conv2d_backward_weight_nhwc(
-        Input, Out_grad, oshape, fshape, stride=[stride_h, stride_w], padding=[padding_h, padding_w]
-    )
-    # schedule
-    schedule = schedule_depthwise_conv2d_backward_weight_nhwc(Weight_grad)
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        # build the kernel
-        f = tvm.build(schedule, [Input, Out_grad, Weight_grad], device)
-        # prepare pod type for test data closure
-        dtype = Out_grad.dtype
-        out_grad_shape = get_const_tuple(Out_grad.shape)
-        in_shape = get_const_tuple(Input.shape)
-
-        # use memoize to pickle the test data for next time use
-        @memoize("topi.tests.test_topi_depthwise_conv2d_backward_weight.nhwc")
-        def get_ref_data():
-            out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype)
-            input_np = np.random.uniform(size=in_shape).astype(dtype)
-            dilated_out_grad_np = tvm.topi.testing.dilate_python(
-                out_grad_np, [1, stride_h, stride_w, 1]
-            )
-
-            pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
-                [padding_h, padding_w], (filter_h, filter_w)
-            )
-            padded_input_np = np.zeros(
-                (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right, in_channel)
-            )
-            padded_input_np[:, pad_top : in_h + pad_top, pad_left : in_w + pad_left, :] = input_np
-
-            weight_grad_np = np.zeros((filter_h, filter_w, in_channel, channel_multiplier))
-            for c in range(in_channel):
-                for m in range(channel_multiplier):
-                    for b in range(batch):
-                        weight_grad_np[:, :, c, m] += signal.convolve2d(
-                            padded_input_np[b, :, :, c],
-                            np.rot90(
-                                dilated_out_grad_np[
-                                    b, :, :, c * channel_multiplier + m % channel_multiplier
-                                ],
-                                2,
-                            ),
-                            mode="valid",
-                        )[0:filter_h, 0:filter_w]
-            return (out_grad_np, input_np, weight_grad_np)
-
-        (out_grad_np, input_np, weight_grad_np) = get_ref_data()
-
-        out_grad_tvm = tvm.nd.array(out_grad_np, dev)
-        input_tvm = tvm.nd.array(input_np, dev)
-        weight_grad_tvm = tvm.nd.array(np.zeros(shape=fshape, dtype=dtype), dev)
-        # launch the kernel
-        timer = f.time_evaluator(f.entry_name, dev, number=1)
-        tcost = timer(input_tvm, out_grad_tvm, weight_grad_tvm).mean
-        tvm.testing.assert_allclose(weight_grad_np, weight_grad_tvm.numpy(), rtol=1e-4)
-
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
-
-
-@tvm.testing.requires_gpu
-def test_topi_depthwise_conv2d_backward_weight_nhwc():
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 1, 1)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 3, 1, 1)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 5, 1, 2)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 5, 1, 2)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 2, 1)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 3, 2, 1)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 5, 2, 2)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 5, 2, 2)
-
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 1, 0)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 3, 1, 0)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 5, 1, 0)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 5, 1, 0)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 2, 0)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 3, 2, 0)
-    verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 5, 2, 0)
-    verify_depthwise_conv2d_back_weight(15, 256, 56, 2, 5, 2, 0)
-
-
-if __name__ == "__main__":
-    test_topi_depthwise_conv2d_backward_weight_nhwc()
diff --git a/tests/python/topi/test_topi_dft.py b/tests/python/topi/test_topi_dft.py
deleted file mode 100644
index abab272e601d..000000000000
--- a/tests/python/topi/test_topi_dft.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for discrete Fourier transform."""
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import topi
-import tvm.topi.testing
-
-
-inverse = tvm.testing.parameter(False, True)
-shape = tvm.testing.parameter((7,), (3, 7), (3, 4, 5))
-dtype = tvm.testing.parameter("float16", "float32", "float64")
-
-
-# pylint: disable=redefined-outer-name, invalid-name
-def numpy_reference(inverse, re: np.ndarray, im: np.ndarray):
-    if inverse:
-        reference = np.fft.ifft(re + 1j * im)
-    else:
-        reference = np.fft.fft(re + 1j * im)
-    return np.real(reference), np.imag(reference)
-
-
-def test_dft(target, dev, inverse, shape, dtype):
-    """Test for discrete Fourier transform."""
-    implementations = {
-        "generic": (
-            topi.dft,
-            topi.generic.schedule_extern,
-        ),
-        "gpu": (
-            topi.cuda.dft,
-            topi.cuda.schedule_extern,
-        ),
-        "nvptx": (
-            topi.cuda.dft,
-            topi.cuda.schedule_extern,
-        ),
-    }
-
-    Re = tvm.te.placeholder(shape, dtype=dtype, name="Re")
-    Im = tvm.te.placeholder(shape, dtype=dtype, name="Im")
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-
-        outs = fcompute(Re, Im, inverse)
-        s = fschedule(outs)
-
-        f = tvm.build(s, [Re, Im, *outs], target)
-
-    re_np = np.random.normal(size=shape).astype(dtype)
-    im_np = np.random.normal(size=shape).astype(dtype)
-
-    re = tvm.nd.array(re_np, device=dev)
-    im = tvm.nd.array(im_np, device=dev)
-    re_out = tvm.nd.array(np.zeros(shape).astype(dtype), device=dev)
-    im_out = tvm.nd.array(np.zeros(shape).astype(dtype), device=dev)
-
-    f(re, im, re_out, im_out)
-
-    re_reference, im_reference = numpy_reference(inverse, re_np, im_np)
-
-    atol = rtol = 1e-3
-    if dtype == "float16":
-        atol = rtol = 1e-1
-
-    tvm.testing.assert_allclose(re_out.numpy(), re_reference, rtol=rtol, atol=atol)
-    tvm.testing.assert_allclose(im_out.numpy(), im_reference, rtol=rtol, atol=atol)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_dilate.py b/tests/python/topi/test_topi_dilate.py
deleted file mode 100644
index 4a89926919e9..000000000000
--- a/tests/python/topi/test_topi_dilate.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-import numpy as np
-
-
-def test_dilate():
-    target = "llvm"
-    dev = tvm.cpu(0)
-
-    def _test_dilate(input_size, strides, dilation_value=None):
-        Input = te.placeholder((input_size))
-        if dilation_value is None:
-            Output = topi.nn.dilate(Input, strides)
-        else:
-            Output = topi.nn.dilate(Input, strides, dilation_value)
-        schedule = te.create_schedule(Output.op)
-        input_np = np.random.uniform(size=input_size).astype(Input.dtype)
-        if dilation_value is None:
-            output_np = tvm.topi.testing.dilate_python(input_np, strides)
-        else:
-            output_np = tvm.topi.testing.dilate_python(input_np, strides, dilation_value)
-        input_tvm = tvm.nd.array(input_np, device=dev)
-        output_size = topi.utils.get_const_tuple(Output.shape)
-        output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), device=dev)
-        f = tvm.build(schedule, [Input, Output], target)
-        f(input_tvm, output_tvm)
-        tvm.testing.assert_allclose(output_tvm.numpy(), output_np, rtol=1e-5)
-
-    _test_dilate((32,), (2,))
-    _test_dilate((32, 32), (2, 2))
-    _test_dilate((1, 3, 32, 32), (1, 1, 1, 1))
-    _test_dilate((1, 3, 32, 32), (2, 2, 2, 2))
-    _test_dilate((1, 32, 32, 3, 3), (1, 1, 1, 1, 1))
-    _test_dilate((1, 32, 32, 3, 3), (2, 2, 2, 2, 2))
-    _test_dilate((1, 32, 32, 32, 3, 3), (1, 1, 1, 2, 2, 2))
-    _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1))
-    _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1), 1.0)
-
-
-if __name__ == "__main__":
-    test_dilate()
diff --git a/tests/python/topi/test_topi_einsum.py b/tests/python/topi/test_topi_einsum.py
deleted file mode 100644
index a84cbaffc185..000000000000
--- a/tests/python/topi/test_topi_einsum.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-
-
-def with_tvm(lam, shapes, ops, out_shape):
-    """Take numpy arrays as args, convert them to TVM tensors and call `lam`.
-    Result of lambda is converted back to numpy array and returned.
-    """
-    dev = tvm.cpu(0)
-    pls = []  # placeholders
-    vals_nd = []  # initial values
-    for i, (shape, arg) in enumerate(zip(shapes, ops)):
-        pls.append(te.placeholder(shape, name="pl" + str(i)))
-        vals_nd.append(tvm.nd.array(arg, dev))
-
-    out = lam(*pls)
-    out_nd = tvm.nd.array(np.zeros(out_shape).astype(out.dtype), device=dev)
-    s = te.create_schedule([out.op])
-    m = tvm.build(s, pls + [out], "llvm")
-    m(*(vals_nd + [out_nd]))
-    return out_nd.numpy()
-
-
-def verify_einsum(subscripts, shapes, shape_dict={}):
-    ops = []  # ndarrays to be used as inputs
-    symbolic_shapes = []  # shapes to declare the placeholders
-    name_to_var = {}
-
-    def get_concrete_shape(shape):
-        return [shape_dict[s] if isinstance(s, str) else s for s in shape]
-
-    def get_symblic_shape_var(name, dtype="int32"):
-        if name not in name_to_var:
-            name_to_var[name] = te.var(name, dtype=dtype)
-        return name_to_var[name]
-
-    def get_symbolic_shape(shape):
-        return [get_symblic_shape_var(s) if isinstance(s, str) else s for s in shape]
-
-    for shape in shapes:
-        concrete_shape = get_concrete_shape(shape)
-        tmp = np.random.uniform(low=-1.0, high=1.0, size=concrete_shape).astype(np.float32)
-        ops.append(tmp)
-        symbolic_shape = get_symbolic_shape(shape)
-        symbolic_shapes.append(symbolic_shape)
-
-    c1 = np.einsum(subscripts, *ops)
-    out_shape = c1.shape
-
-    if len(ops) == 1:
-        c2 = with_tvm(lambda A: topi.einsum(subscripts, A), symbolic_shapes, ops, out_shape)
-    elif len(ops) == 2:
-        c2 = with_tvm(lambda A, B: topi.einsum(subscripts, A, B), symbolic_shapes, ops, out_shape)
-    elif len(ops) == 3:
-        c2 = with_tvm(
-            lambda A, B, C: topi.einsum(subscripts, A, B, C), symbolic_shapes, ops, out_shape
-        )
-
-    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
-
-
-@pytest.mark.parametrize(
-    "equation,inputs",
-    [
-        ("ii", [(5, 5)]),
-        ("ii->i", [(5, 5)]),
-        ("ij->i", [(5, 5)]),
-        ("...j->...", [(5, 5)]),
-        ("...j, j", [(5, 5), (5,)]),
-        ("..., ...", [(), (2, 3)]),
-        ("ijk, jil->kl", [(3, 4, 5), (4, 3, 2)]),
-        ("ij, ij -> i", [(1, 4), (2, 4)]),
-        ("...ij, ...jk -> ...ik", [(1, 4), (4, 2)]),
-        ("...ij, ...ik -> ...jk", [(1, 1, 1, 4), (1, 1, 1, 3)]),
-        ("...ik, ...jk, ...hk -> i...jh", [(3, 4, 4), (1, 5, 3, 8, 4), (2, 5, 3, 6, 4)]),
-        ("ij,jk->ik", [(2, 3), (3, 4)]),
-        ("ij,jk,km->im", [(2, 3), (3, 4), (4, 5)]),
-    ],
-)
-def test_einsum(equation, inputs):
-    verify_einsum(equation, inputs)
-
-
-@pytest.mark.parametrize(
-    "equation,inputs,shape_dict",
-    [
-        ("ij,jk->ik", [(2, "K"), (1, "N")], {"K": 3, "N": 4}),
-        ("ij,jk->ik", [(2, "K"), ("K2", "N")], {"K": 3, "N": 4, "K2": 3}),
-        ("ij,jk->ik", [(2, "K"), ("K2", "N")], {"K": 3, "N": 4, "K2": 1}),
-    ],
-)
-def test_einsum_symblic_shape(equation, inputs, shape_dict):
-    verify_einsum(equation, inputs, shape_dict)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_group_conv1d_transpose_ncw.py b/tests/python/topi/test_topi_group_conv1d_transpose_ncw.py
deleted file mode 100644
index b612c13f9b59..000000000000
--- a/tests/python/topi/test_topi_group_conv1d_transpose_ncw.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for group transposed 1d convolution."""
-
-import itertools
-import os
-
-import numpy as np
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-from tvm.topi.utils import get_const_tuple
-
-_group_conv1d_transpose_ncw_implement = {
-    "generic": (
-        topi.nn.group_conv1d_transpose_ncw,
-        topi.generic.schedule_group_conv1d_transpose_ncw,
-    ),
-}
-
-
-(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    output_padding,
-    groups,
-) = tvm.testing.parameters(
-    (1, 4, 224, 32, 5, 1, 0, (0,), 4),
-    (1, 8, 224, 32, 7, 1, 2, (0,), 4),
-    (1, 8, 224, 32, 5, 2, 1, (0,), 2),
-    (1, 4, 224, 4, 5, 2, 1, (1,), 4),
-    (1, 3, 224, 15, 5, 2, 0, (0,), 3),
-    (1, 32, 32, 128, 5, 1, 0, (0,), 32),
-    (1, 32, 32, 128, 5, 2, 1, (0,), 16),
-)
-
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding, groups
-):
-    dtype = "float32"
-    a_shape = (batch, in_channel, in_size)
-    w_shape = (in_channel, num_filter, kernel)
-
-    a_np = np.random.uniform(size=a_shape).astype(dtype)
-    w_np = np.random.uniform(size=w_shape).astype(dtype)
-    b_np = tvm.topi.testing.group_conv1d_transpose_ncw_python(
-        a_np, w_np, stride, padding, output_padding, groups
-    )
-    c_np = np.maximum(b_np, 0)
-    return a_np, w_np, b_np, c_np
-
-
-@tvm.testing.known_failing_targets("cuda", "vulkan")
-def test_group_conv1d_transpose_ncw(
-    target, dev, ref_data, dtype, stride, padding, output_padding, groups
-):
-    a_np, w_np, b_np, c_np = ref_data
-
-    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-    W = te.placeholder(w_np.shape, name="W", dtype=dtype)
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(
-            target, _group_conv1d_transpose_ncw_implement
-        )
-        B = fcompute(A, W, stride, padding, A.dtype, output_padding, groups)
-        C = topi.nn.relu(B)
-        s1 = fschedule([B])
-        s2 = fschedule([C])
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-    func1 = tvm.build(s1, [A, W, B], target)
-    func2 = tvm.build(s2, [A, W, C], target)
-    func1(a, w, b)
-    func2(a, w, c)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-    tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_group_conv2d.py b/tests/python/topi/test_topi_group_conv2d.py
deleted file mode 100644
index 55b24feece93..000000000000
--- a/tests/python/topi/test_topi_group_conv2d.py
+++ /dev/null
@@ -1,658 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do group convolution."""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm.autotvm.task.space import FallbackConfigEntity
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-from common import Int8Fallback
-import tvm.testing
-
-
-def _transform_data(data, bn):
-    # NCHW -> NCHW[x]c
-    batch_size, channel, height, width = data.shape
-    data = np.reshape(data, (batch_size, channel // bn, bn, height, width))
-    data = np.transpose(data, (0, 1, 3, 4, 2))
-    return data
-
-
-def _transform_kernel(kernel, ic_bn, oc_bn):
-    # OIHW -> OIHW[x]o[x]i
-    out_channel, in_channel, kh, kw = kernel.shape
-    kernel = np.reshape(kernel, (out_channel // oc_bn, oc_bn, in_channel // ic_bn, ic_bn, kh, kw))
-    kernel = np.transpose(kernel, (0, 2, 4, 5, 1, 3))
-    return kernel
-
-
-_group_conv2d_nchw_implement = {
-    "generic": (topi.nn.group_conv2d_nchw, topi.generic.schedule_group_conv2d_nchw),
-    "gpu": (topi.cuda.group_conv2d_nchw, topi.cuda.schedule_group_conv2d_nchw),
-}
-
-_group_conv2d_nhwc_implement = {
-    "generic": (topi.nn.group_conv2d_nhwc, topi.generic.schedule_group_conv2d_nhwc),
-}
-
-
-def verify_group_conv2d_nchw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation,
-    groups,
-    add_bias=False,
-    add_relu=False,
-):
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-    W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name="W")
-    bias = te.placeholder((num_filter, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(
-            dtype
-        )
-
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nchw_implement)
-            C = fcompute(A, W, stride, padding, dilation, groups, dtype)
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for target in ["llvm", "cuda"]:
-        check_target(target)
-
-
-oc_block_factor = 4
-ic_block_factor = 4
-
-
-def verify_group_conv2d_NCHWc_int8(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation,
-    groups,
-    add_bias=False,
-    add_relu=False,
-):
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder(
-        (batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor),
-        name="A",
-        dtype="int8",
-    )
-    W = te.placeholder(
-        (
-            num_filter // oc_block_factor,
-            (in_channel // groups) // ic_block_factor,
-            kernel,
-            kernel,
-            oc_block_factor,
-            ic_block_factor,
-        ),
-        name="W",
-        dtype="int8",
-    )
-    bias = te.placeholder(
-        (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8"
-    )
-
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8")
-    def get_ref_data():
-        a_np = np.random.randint(
-            low=-128, high=127, size=(batch, in_channel, in_height, in_width)
-        ).astype(dtype)
-        w_np = np.random.randint(
-            low=-128, high=128, size=(num_filter, in_channel // groups, kernel, kernel)
-        ).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(
-            dtype
-        )
-
-        # convert to NCHWc
-        _, _, out_height, out_width = c_np.shape
-        c_np = c_np.reshape(
-            (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)
-        ).transpose(0, 1, 3, 4, 2)
-
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-
-        return (
-            _transform_data(a_np, ic_block_factor),
-            _transform_kernel(w_np, ic_block_factor, oc_block_factor),
-            b_np,
-            c_np,
-        )
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
-            print("Skip because int8 intrinsics are not available")
-            return
-
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype)
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for target in ["cuda"]:
-        check_target(target)
-
-
-def verify_group_conv2d_nchw_int8(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation,
-    groups,
-    add_bias=False,
-    add_relu=False,
-):
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="int8")
-    W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name="W", dtype="int8")
-    bias = te.placeholder(
-        (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8"
-    )
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw_int8")
-    def get_ref_data():
-        a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
-        w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(
-            dtype
-        )
-
-        # convert to NCHWc
-        _, _, out_height, out_width = c_np.shape
-        c_np = c_np.reshape(
-            (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)
-        ).transpose(0, 1, 3, 4, 2)
-
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
-            print("Skip because int8 intrinsics are not available")
-            return
-
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype)
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for target in ["cuda"]:
-        check_target(target)
-
-
-def verify_group_conv2d_nhwc(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation,
-    groups,
-    add_bias=False,
-    add_relu=False,
-):
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)
-    )
-
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_height, in_width, in_channel), name="A")
-    W = te.placeholder((kernel, kernel, in_channel // groups, num_filter), name="W")
-    bias = te.placeholder((1, 1, num_filter), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nhwc")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-        c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding, groups).astype(
-            dtype
-        )
-
-        if add_bias:
-            b_np = np.random.uniform(size=bias_shape).astype(dtype)
-            c_np += b_np
-        if add_relu:
-            c_np = np.maximum(c_np, 0)
-
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nhwc_implement)
-            C = fcompute(A, W, stride, padding, dilation, groups, dtype)
-            if add_bias:
-                C = topi.add(C, bias)
-            if add_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        if add_bias:
-            func = tvm.build(
-                s,
-                [A, W, bias, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, b, c)
-        else:
-            func = tvm.build(
-                s,
-                [A, W, C],
-                target,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-                % (
-                    batch,
-                    in_channel,
-                    in_size,
-                    num_filter,
-                    kernel,
-                    stride,
-                    padding,
-                    dilation,
-                    groups,
-                ),
-            )
-            func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
-
-    for target in ["llvm"]:
-        check_target(target)
-
-
-@tvm.testing.uses_gpu
-def test_group_conv2d_nchw():
-    # ResNeXt-50 workload
-    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nchw(1, 256, 56, 256, 3, 2, 1, 1, 32)
-    verify_group_conv2d_nchw(1, 256, 28, 256, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nchw(1, 512, 28, 512, 3, 2, 1, 1, 32)
-    verify_group_conv2d_nchw(1, 512, 14, 512, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nchw(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
-    verify_group_conv2d_nchw(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
-
-    # bias, relu
-    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
-    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True)
-
-    # dilation
-    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 2, 32)
-
-    # batch size
-    verify_group_conv2d_nchw(2, 128, 56, 128, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nchw(9, 128, 56, 128, 3, 1, 1, 1, 32)
-
-
-@tvm.testing.requires_cuda
-def test_group_conv2d_NCHWc_int8():
-    with Int8Fallback():
-        # ResNeXt-50 workload
-        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(1, 256, 56, 256, 3, 2, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(1, 256, 28, 256, 3, 1, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(1, 512, 28, 512, 3, 2, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(1, 512, 14, 512, 3, 1, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
-
-        # bias, relu
-        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
-        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-        verify_group_conv2d_NCHWc_int8(
-            1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True
-        )
-        # dilation
-        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
-
-        # batch size
-        verify_group_conv2d_NCHWc_int8(2, 128, 56, 128, 3, 1, 1, 1, 32)
-        verify_group_conv2d_NCHWc_int8(9, 128, 56, 128, 3, 1, 1, 1, 32)
-
-
-@tvm.testing.requires_cuda
-def test_group_conv2d_nchw_int8():
-    with Int8Fallback():
-        # ResNeXt-50 workload
-        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(1, 256, 56, 256, 3, 2, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(1, 256, 28, 256, 3, 1, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(1, 512, 28, 512, 3, 2, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(1, 512, 14, 512, 3, 1, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
-
-        # bias, relu
-        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
-        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True)
-        # dilation
-        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
-
-        # batch size
-        verify_group_conv2d_nchw_int8(2, 128, 56, 128, 3, 1, 1, 1, 32)
-        verify_group_conv2d_nchw_int8(9, 128, 56, 128, 3, 1, 1, 1, 32)
-
-
-def test_group_conv2d_nhwc():
-    # ResNeXt-50 workload
-    verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nhwc(1, 256, 56, 256, 3, 2, 1, 1, 32)
-    verify_group_conv2d_nhwc(1, 256, 28, 256, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nhwc(1, 512, 28, 512, 3, 2, 1, 1, 32)
-    verify_group_conv2d_nhwc(1, 512, 14, 512, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nhwc(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
-    verify_group_conv2d_nhwc(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
-
-    # bias, relu
-    verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
-    verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-    verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True)
-
-    # dilation
-    verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 2, 32)
-
-    # batch size
-    verify_group_conv2d_nhwc(2, 128, 56, 128, 3, 1, 1, 1, 32)
-    verify_group_conv2d_nhwc(9, 128, 56, 128, 3, 1, 1, 1, 32)
-
-
-if __name__ == "__main__":
-    test_group_conv2d_nchw()
-    test_group_conv2d_NCHWc_int8()
-    test_group_conv2d_nchw_int8()
-    test_group_conv2d_nhwc()
diff --git a/tests/python/topi/test_topi_group_conv2d_NCHWc_int8.py b/tests/python/topi/test_topi_group_conv2d_NCHWc_int8.py
deleted file mode 100644
index dba2e4e05817..000000000000
--- a/tests/python/topi/test_topi_group_conv2d_NCHWc_int8.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test for NCHW[x]c convolution"""
-
-import numpy as np
-import tvm
-from tvm import te
-from tvm import autotvm
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-import pytest
-
-
-def _transform_data(data, bn):
-    # NCHW -> NCHW[x]c
-    batch_size, channel, height, width = data.shape
-    data = np.reshape(data, (batch_size, channel // bn, bn, height, width))
-    data = np.transpose(data, (0, 1, 3, 4, 2))
-    return data
-
-
-def _transform_kernel(kernel, ic_bn, oc_bn):
-    # OIHW -> OIHW[x]i[x]o
-    out_channel, in_channel, kh, kw = kernel.shape
-    kernel = np.reshape(
-        kernel, (out_channel // oc_bn, oc_bn, in_channel // ic_bn, ic_bn // 4, kh, kw, 4)
-    )
-    kernel = np.transpose(kernel, (0, 2, 4, 5, 3, 1, 6))
-    return kernel
-
-
-def verify_group_conv2d_NCHWc_int8(
-    batch,
-    in_channel,
-    groups,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-    dtype="int32",
-):
-    assert dilation == 1, "conv2d_NCHWc does not support dilation for now."
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, groups, in_size, num_filter, kernel, stride, padding)
-    )
-
-    in_height = in_width = in_size
-
-    # for testing functionality,
-    # we choose arbitrary block size that can divide the channel,
-    # regardless of the performance.
-    oc_block = 1
-    for bn in range(16, 0, -1):
-        if num_filter % bn == 0:
-            oc_block = bn
-            break
-
-    ic_block = 8
-    autotvm.GLOBAL_SCOPE.silent = True
-    A = te.placeholder(
-        (batch, in_channel // ic_block, in_height, in_width, ic_block), name="A", dtype="uint8"
-    )
-    W = te.placeholder(
-        (
-            num_filter // oc_block,
-            in_channel // ic_block // groups,
-            kernel,
-            kernel,
-            ic_block // 4,
-            oc_block,
-            4,
-        ),
-        name="W",
-        dtype="int8",
-    )
-
-    @memoize("topi.tests.test_topi_conv2d_NCHWc_int8.verify_conv2d_NCHWc_int8")
-    def get_ref_data():
-        a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype("uint8")
-        w_np = np.random.uniform(size=(num_filter, in_channel // groups, kernel, kernel)).astype(
-            "int8"
-        )
-        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding, groups)
-        return (
-            _transform_data(a_np, ic_block),
-            _transform_kernel(w_np, ic_block, oc_block),
-            _transform_data(c_np, oc_block),
-        )
-
-    a_np, w_np, c_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(dev):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            C = topi.x86.conv2d_NCHWc(
-                A,
-                W,
-                (stride, stride),
-                (padding, padding),
-                (dilation, dilation),
-                "NCHW%dc" % ic_block,
-                "NCHW%dc" % oc_block,
-                dtype,
-            )
-            s = topi.x86.schedule_conv2d_NCHWc([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        func = tvm.build(
-            s,
-            [A, W, C],
-            device,
-            name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
-            % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation),
-        )
-        # print(tvm.lower(s, [A, W, C], simple_mode=True))
-        func(a, w, c)
-        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
-
-    # for device in ["llvm"]:
-    for device in ["llvm -mcpu=skylake-avx512"]:
-        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
-            check_device(device)
-    autotvm.GLOBAL_SCOPE.silent = False
-
-
-@tvm.testing.uses_gpu
-@pytest.mark.skip
-def test_conv2d_NCHWc():
-    # ResNet50 workloads
-    verify_group_conv2d_NCHWc_int8(1, 256, 32, 224, 64, 7, 2, 3)
-
-
-if __name__ == "__main__":
-    # The test requires Skylake and newer Intel machines to generate the correct
-    # instruction. This test directly calls the topi operator, requiring correct
-    # kernel shape. For older generation of Intel machines, the kernel needs to
-    # be 6D. This test tests 7D kernel, that can only work on Skylake+ machines.
-    # So, disabling the test.
-
-    # test_conv2d_NCHWc()
-    pass
diff --git a/tests/python/topi/test_topi_group_conv2d_transpose.py b/tests/python/topi/test_topi_group_conv2d_transpose.py
deleted file mode 100644
index e9f7ce5ef4dd..000000000000
--- a/tests/python/topi/test_topi_group_conv2d_transpose.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do group transpose convolution."""
-
-import numpy as np
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import te, topi
-from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.utils import get_const_tuple
-
-_group_conv2d_nchw_implement = {
-    "generic": (
-        topi.nn.group_conv2d_transpose_nchw,
-        topi.generic.schedule_group_conv2d_transpose_nchw,
-    ),
-    "cuda": (topi.cuda.conv2d_transpose_nchw, topi.cuda.schedule_conv2d_transpose_nchw),
-}
-
-
-def verify_group_conv2d_transpose_nchw(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    output_padding,
-    groups,
-):
-    print(
-        "Workload: (%d, %d, %s, %d, %s, %s, %s, %s, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding, groups)
-    )
-
-    in_height, in_width = in_size
-    kernel_height, kernel_width = kernel
-
-    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-    W = te.placeholder((in_channel, num_filter // groups, kernel_height, kernel_width), name="W")
-    bias = te.placeholder((num_filter, 1, 1), name="bias")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    bias_shape = get_const_tuple(bias.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_group_conv2d_transpose.verify_group_conv2d_transpose_nchw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = np.random.uniform(size=bias_shape).astype(dtype)
-        c_np = tvm.topi.testing.conv2d_transpose_nchw_python(
-            a_np, w_np, stride, padding, output_padding, groups
-        ).astype(dtype)
-
-        return a_np, w_np, b_np, c_np
-
-    a_np, w_np, b_np, c_np = get_ref_data()
-
-    def check_target(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nchw_implement)
-            C = fcompute(A, W, stride, padding, dtype, output_padding, groups)
-            s = fschedule([C])
-
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        func = tvm.build(
-            s,
-            [A, W, C],
-            target,
-            name="group_conv2d_transpose_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d_%d"
-            % (
-                batch,
-                in_channel,
-                in_size[0],
-                in_size[1],
-                num_filter,
-                kernel[0],
-                kernel[1],
-                stride[0],
-                stride[1],
-                padding[0],
-                padding[1],
-                padding[2],
-                padding[3],
-                output_padding[0],
-                output_padding[1],
-                groups,
-            ),
-        )
-        func(a, w, c)
-        c = c.numpy()
-        for measurement, reference in zip(c, c_np):
-            tvm.testing.assert_allclose(measurement, reference, rtol=1e-5)
-
-    for target in ["llvm", "cuda"]:
-        check_target(target)
-
-
-@tvm.testing.uses_gpu
-def test_group_conv2d_transpose_nchw():
-    verify_group_conv2d_transpose_nchw(1, 4, (32, 32), 4, (5, 5), (1, 1), (0, 0, 0, 0), (0, 0), 2)
-    verify_group_conv2d_transpose_nchw(1, 9, (32, 32), 9, (5, 5), (1, 1), (0, 0, 0, 0), (0, 0), 3)
-    verify_group_conv2d_transpose_nchw(1, 4, (32, 32), 16, (5, 5), (2, 2), (1, 1, 1, 1), (0, 0), 4)
-    verify_group_conv2d_transpose_nchw(
-        1, 32, (8192, 1), 8, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0), 2
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 512, (8, 1), 256, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0), 16
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 512, (8, 1), 256, (31, 1), (2, 1), (14, 0, 15, 0), (1, 0), 16
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 64, (64, 64), 64, (4, 4), (1, 1), (0, 0, 0, 0), (0, 0), 64
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 128, (32, 32), 128, (4, 4), (1, 1), (0, 0, 0, 0), (0, 0), 128
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 256, (16, 16), 256, (4, 4), (1, 1), (0, 0, 0, 0), (0, 0), 256
-    )
-    verify_group_conv2d_transpose_nchw(1, 1, (224, 224), 1, (1, 1), (1, 1), (0, 0, 0, 0), (0, 0), 1)
-    verify_group_conv2d_transpose_nchw(
-        1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0), 1
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 3, (224, 224), 32, (3, 3), (3, 3), (0, 0, 0, 0), (0, 0), 1
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0), 1
-    )
-    verify_group_conv2d_transpose_nchw(
-        1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (0, 0), 1
-    )
-    verify_group_conv2d_transpose_nchw(1, 48, (64, 64), 12, (4, 4), (2, 2), (1, 1, 1, 1), (0, 0), 1)
-
-
-if __name__ == "__main__":
-    test_group_conv2d_transpose_nchw()
diff --git a/tests/python/topi/test_topi_group_conv3d_transpose_ncdhw.py b/tests/python/topi/test_topi_group_conv3d_transpose_ncdhw.py
deleted file mode 100644
index 14de236e3d28..000000000000
--- a/tests/python/topi/test_topi_group_conv3d_transpose_ncdhw.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for group transposed 3d convolution ncdhw."""
-
-import numpy as np
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-from tvm.topi.utils import get_const_tuple
-
-_group_conv3d_transpose_ncdhw_implement = {
-    "generic": (
-        topi.nn.group_conv3d_transpose_ncdhw,
-        topi.generic.schedule_group_conv3d_transpose_ncdhw,
-    ),
-}
-
-
-(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    output_padding,
-    groups,
-) = tvm.testing.parameters(
-    (1, 4, (32, 32, 32), 32, (5, 5, 5), 1, 0, (0, 0, 0), 4),
-    (1, 8, (32, 32, 32), 32, (7, 7, 7), 1, 2, (0, 0, 0), 4),
-    (1, 8, (32, 32, 32), 32, (5, 5, 5), 2, 1, (0, 0, 0), 2),
-    (1, 4, (32, 32, 32), 4, (5, 5, 5), 2, 1, (1, 1, 1), 4),
-    (1, 3, (64, 64, 64), 15, (5, 5, 5), 2, 0, (0, 0, 0), 3),
-    (1, 32, (16, 16, 16), 128, (5, 5, 5), 1, 0, (0, 0, 0), 32),
-    (1, 32, (16, 16, 16), 128, (5, 5, 5), 2, 1, (0, 0, 0), 16),
-)
-
-dtype = tvm.testing.parameter("float32")
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding, groups
-):
-    dtype = "float32"
-    in_d, in_h, in_w = in_size
-    k_d, k_h, k_w = kernel
-    a_shape = (batch, in_channel, in_d, in_h, in_w)
-    w_shape = (in_channel, num_filter // groups, k_d, k_h, k_w)
-
-    a_np = np.random.uniform(size=a_shape).astype(dtype)
-    w_np = np.random.uniform(size=w_shape).astype(dtype)
-    b_np = tvm.topi.testing.conv3d_transpose_ncdhw_python(
-        a_np, w_np, stride, padding, output_padding, groups
-    )
-    c_np = np.maximum(b_np, 0)
-    return a_np, w_np, b_np, c_np
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_group_conv3d_transpose_ncdhw(
-    target, dev, ref_data, dtype, stride, padding, output_padding, groups
-):
-    a_np, w_np, b_np, c_np = ref_data
-    print("shapes : ", a_np.shape, w_np.shape, b_np.shape, c_np.shape)
-    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-    W = te.placeholder(w_np.shape, name="W", dtype=dtype)
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(
-            target, _group_conv3d_transpose_ncdhw_implement
-        )
-        B = fcompute(A, W, stride, padding, A.dtype, output_padding, groups)
-        C = topi.nn.relu(B)
-        s1 = fschedule([B])
-        s2 = fschedule([C])
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-
-    func1 = tvm.build(s1, [A, W, B], target)
-    func2 = tvm.build(s2, [A, W, C], target)
-    func1(a, w, b)
-    func2(a, w, c)
-    tvm.testing.assert_allclose(b.numpy(), b_np, atol=1e-5, rtol=1e-5)
-    tvm.testing.assert_allclose(c.numpy(), c_np, atol=1e-5, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_group_norm.py b/tests/python/topi/test_topi_group_norm.py
deleted file mode 100644
index 8f8ab75b8a2e..000000000000
--- a/tests/python/topi/test_topi_group_norm.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for group_norm."""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-import tvm.topi.testing
-
-import tvm.testing
-
-
-_group_norm_schedule = {
-    "generic": topi.generic.schedule_injective,
-}
-
-
-# only test on llvm because schedule is missing
-@tvm.testing.parametrize_targets("llvm")
-@pytest.mark.parametrize("shape, axis", [([2, 4, 16], (2,)), ([2, 4, 4, 16], (2, 3))])
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-def test_group_norm(target, dev, shape, axis, dtype, epsilon=1e-5, rtol=1e-5, atol=1e-5):
-    data = te.placeholder(shape, dtype=dtype, name="data")
-    num_groups = 2
-    channel_axis = 1
-    gamma = te.placeholder((shape[channel_axis],), dtype=dtype, name="gamma")
-    beta = te.placeholder((shape[channel_axis],), dtype=dtype, name="beta")
-    B = topi.nn.group_norm(data, gamma, beta, num_groups, channel_axis, axis, epsilon)
-
-    np.random.seed(0)
-    data_np = np.random.uniform(size=shape).astype(dtype)
-    gamma_np = np.random.uniform(size=(shape[channel_axis],)).astype(dtype)
-    beta_np = np.random.uniform(size=(shape[channel_axis],)).astype(dtype)
-    b_np = tvm.topi.testing.group_norm_python(
-        data_np, gamma_np, beta_np, num_groups, channel_axis, axis, epsilon
-    )
-
-    with tvm.target.Target(target):
-        s_func = tvm.topi.testing.dispatch(target, _group_norm_schedule)
-        s = s_func([B])
-    data_tvm = tvm.nd.array(data_np, dev)
-    gamma_tvm = tvm.nd.array(gamma_np, dev)
-    beta_tvm = tvm.nd.array(beta_np, dev)
-    b_tvm = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-    f = tvm.build(s, [data, gamma, beta, B], target)
-    f(data_tvm, gamma_tvm, beta_tvm, b_tvm)
-    tvm.testing.assert_allclose(b_tvm.numpy(), b_np, rtol=rtol, atol=atol)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_image.py b/tests/python/topi/test_topi_image.py
deleted file mode 100644
index 56f7a2026d33..000000000000
--- a/tests/python/topi/test_topi_image.py
+++ /dev/null
@@ -1,350 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for bilinear scale """
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-
-
-def verify_resize2d(
-    batch,
-    in_channel,
-    in_height,
-    in_width,
-    out_height,
-    out_width,
-    layout="NCHW",
-    coord_trans="align_corners",
-    method="linear",
-):
-    if layout == "NCHW":
-        A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="float32")
-        dtype = A.dtype
-        out_shape = (batch, in_channel, out_height, out_width)
-        a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype)
-    elif layout == "NHWC":
-        A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="float32")
-        dtype = A.dtype
-        out_shape = (batch, out_height, out_width, in_channel)
-        a_np = np.random.uniform(size=(batch, in_height, in_width, in_channel)).astype(dtype)
-    else:
-        raise NotImplementedError("Layout not supported {} ".format(layout))
-    B = topi.image.resize2d(
-        A,
-        [0.0] * 4,
-        (out_height, out_width),
-        layout=layout,
-        coordinate_transformation_mode=coord_trans,
-        method=method,
-    )
-    scale_h = out_height / in_height
-    scale_w = out_width / in_width
-    b_np = tvm.topi.testing.resize2d_python(a_np, (scale_h, scale_w), layout, method, coord_trans)
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_resize2d():
-    # Scale NCHW
-    verify_resize2d(4, 16, 32, 32, 50, 50, "NCHW")
-    # Scale NCHW + Align Corners
-    verify_resize2d(6, 32, 64, 64, 20, 20, "NCHW")
-    # Scale NHWC
-    verify_resize2d(4, 16, 32, 32, 50, 50, "NHWC")
-    # Scale NHWC + Align Corners
-    verify_resize2d(6, 32, 64, 64, 20, 20, "NHWC")
-    for layout in ["NCHW", "NHWC"]:
-        verify_resize2d(4, 16, 32, 32, 50, 50, layout, "asymmetric", method="nearest_neighbor")
-        verify_resize2d(4, 16, 32, 32, 64, 50, layout, "asymmetric", method="nearest_neighbor")
-        verify_resize2d(4, 16, 32, 32, 50, 96, layout, "asymmetric", method="nearest_neighbor")
-        verify_resize2d(4, 16, 32, 32, 96, 96, layout, "asymmetric", method="nearest_neighbor")
-        verify_resize2d(4, 16, 32, 32, 50, 50, layout, "align_corners", method="nearest_neighbor")
-        verify_resize2d(4, 16, 32, 32, 50, 50, layout, "half_pixel", method="nearest_neighbor")
-        verify_resize2d(4, 16, 32, 32, 50, 50, layout, "asymmetric", method="linear")
-        verify_resize2d(4, 16, 32, 32, 50, 50, layout, "half_pixel", method="linear")
-
-
-def verify_resize3d(
-    batch,
-    in_channel,
-    in_depth,
-    in_height,
-    in_width,
-    out_depth,
-    out_height,
-    out_width,
-    layout="NCDHW",
-    coordinate_transformation_mode="asymmetric",
-    method="linear",
-):
-    if layout == "NCDHW":
-        A = te.placeholder(
-            (batch, in_channel, in_depth, in_height, in_width), name="A", dtype="float32"
-        )
-        dtype = A.dtype
-        out_shape = (batch, in_channel, out_depth, out_height, out_width)
-        a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height, in_width)).astype(
-            dtype
-        )
-    elif layout == "NDHWC":
-        A = te.placeholder(
-            (batch, in_depth, in_height, in_width, in_channel), name="A", dtype="float32"
-        )
-        dtype = A.dtype
-        out_shape = (batch, out_depth, out_height, out_width, in_channel)
-        a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width, in_channel)).astype(
-            dtype
-        )
-    else:
-        raise NotImplementedError("Layout not supported {} ".format(layout))
-
-    B = topi.image.resize3d(
-        A,
-        [0.0] * 6,
-        (out_depth, out_height, out_width),
-        layout=layout,
-        coordinate_transformation_mode=coordinate_transformation_mode,
-        method=method,
-    )
-
-    scale_d = out_depth / in_depth
-    scale_h = out_height / in_height
-    scale_w = out_width / in_width
-    b_np = tvm.topi.testing.resize3d_python(
-        a_np, (scale_d, scale_h, scale_w), layout, method, coordinate_transformation_mode
-    )
-
-    def check_target(target, dev):
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_resize3d():
-    # Trilinear
-    for method in ["nearest_neighbor", "linear"]:
-        for coord_trans in ["asymmetric", "align_corners", "half_pixel"]:
-            for layout in ["NCDHW", "NDHWC"]:
-                verify_resize3d(3, 16, 32, 32, 32, 10, 10, 10, layout, coord_trans, method)
-
-
-@tvm.testing.uses_gpu
-def test_crop_and_resize():
-    def verify_crop_and_resize(
-        image_shape,
-        np_boxes,
-        np_box_indices,
-        np_crop_size,
-        layout="NHWC",
-        method="bilinear",
-        extrapolation_value=0.0,
-    ):
-
-        images = te.placeholder(image_shape, name="images", dtype="float32")
-        np_images = np.random.uniform(size=image_shape).astype("float32")
-        boxes = te.placeholder(np_boxes.shape, name="boxes", dtype="float32")
-        box_ind = te.placeholder(np_box_indices.shape, name="box_ind", dtype="int32")
-
-        batch = len(np_box_indices)
-        target_height, target_width = np_crop_size[0], np_crop_size[1]
-        if layout == "NHWC":
-            channel = image_shape[3]
-            out_shape = (batch, target_height, target_width, channel)
-        elif layout == "NCHW":
-            channel = image_shape[1]
-            out_shape = (batch, channel, target_height, target_width)
-        else:
-            raise NotImplementedError("Layout {} is not supported.".format(layout))
-
-        out = topi.image.crop_and_resize(
-            images,
-            boxes,
-            box_ind,
-            np_crop_size,
-            layout=layout,
-            method=method,
-            extrapolation_value=extrapolation_value,
-        )
-
-        baseline_np = tvm.topi.testing.crop_and_resize_python(
-            np_images, np_boxes, np_box_indices, np_crop_size, layout, method, extrapolation_value
-        )
-
-        def check_target(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_injective_schedule(target)(out)
-            tvm_images = tvm.nd.array(np_images, dev)
-            tvm_boxes = tvm.nd.array(np_boxes, dev)
-            tvm_indices = tvm.nd.array(np_box_indices, dev)
-            tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), dev)
-            f = tvm.build(s, [images, boxes, box_ind, out], target, name="crop_and_resize")
-            f(tvm_images, tvm_boxes, tvm_indices, tvm_out)
-
-            tvm.testing.assert_allclose(tvm_out.numpy(), baseline_np, rtol=1e-3, atol=1e-3)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_target(target, dev)
-
-    boxes_1 = np.array([[0.2, 0.3, 0.7, 0.9]], dtype="float32")
-    boxes_2 = np.array([[0.2, 0.3, 0.7, 0.9], [0, 0.1, 0.8, 1]], dtype="float32")
-    indices_1 = np.array([0], dtype="int32")
-    indices_2 = np.array([1, 0], dtype="int32")
-    size_1 = (7, 11)
-    size_2 = (90, 60)
-
-    verify_crop_and_resize((1, 255, 255, 3), boxes_1, indices_1, size_1, layout="NHWC")
-    verify_crop_and_resize(
-        (10, 224, 224, 5), boxes_2, indices_2, size_2, extrapolation_value=0.3, layout="NHWC"
-    )
-    verify_crop_and_resize((1, 100, 100, 3), boxes_1, indices_1, size_1, method="nearest_neighbor")
-    verify_crop_and_resize((1, 3, 224, 224), boxes_1, indices_1, size_1, layout="NCHW")
-
-
-@tvm.testing.uses_gpu
-def test_affine_grid():
-    def verify_affine_grid(num_batch, target_shape):
-        dtype = "float32"
-        data_shape = (num_batch, 2, 3)
-        data = te.placeholder(data_shape, dtype=dtype)
-        out = topi.image.affine_grid(data, target_shape)
-
-        @memoize("topi.tests.test_affine_grid.verify_affine_grid")
-        def get_ref_data():
-            data_np = np.random.uniform(size=data_shape).astype(dtype)
-            out_np = tvm.topi.testing.affine_grid_python(data_np, target_shape)
-            return data_np, out_np
-
-        data_np, out_np = get_ref_data()
-
-        def check_target(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_injective_schedule(target)(out)
-            tvm_data = tvm.nd.array(data_np, dev)
-            tvm_out = tvm.nd.empty(out_np.shape, dtype, dev)
-            f = tvm.build(s, [data, out], target)
-            f(tvm_data, tvm_out)
-
-            tvm.testing.assert_allclose(tvm_out.numpy(), out_np, rtol=1e-5, atol=1e-5)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_target(target, dev)
-
-    verify_affine_grid(1, (16, 32))
-    verify_affine_grid(4, (16, 32))
-
-
-@tvm.testing.uses_gpu
-def test_grid_sample():
-    def verify_grid_sample(
-        data_shape,
-        grid_shape,
-        method="bilinear",
-        layout="NCHW",
-        padding_mode="zeros",
-        align_corners=True,
-    ):
-        dtype = "float32"
-        data = te.placeholder(data_shape, dtype=dtype)
-        grid = te.placeholder(grid_shape, dtype=dtype)
-        out = topi.image.grid_sample(data, grid, method, layout, padding_mode, align_corners)
-
-        @memoize("topi.tests.test_grid_sample.verify_grid_sample")
-        def get_ref_data():
-            data_np = np.random.uniform(size=data_shape).astype(dtype)
-            # allow grid values to be out-of-bound
-            grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype)
-            out_np = tvm.topi.testing.grid_sample_python(
-                data_np, grid_np, method, layout, padding_mode, align_corners
-            )
-            return data_np, grid_np, out_np
-
-        data_np, grid_np, out_np = get_ref_data()
-
-        def check_target(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_injective_schedule(target)(out)
-            tvm_data = tvm.nd.array(data_np, dev)
-            tvm_grid = tvm.nd.array(grid_np, dev)
-            tvm_out = tvm.nd.empty(out_np.shape, dtype, dev)
-            f = tvm.build(s, [data, grid, out], target)
-            f(tvm_data, tvm_grid, tvm_out)
-
-            tvm.testing.assert_allclose(tvm_out.numpy(), out_np, rtol=1e-5, atol=1e-5)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_target(target, dev)
-
-    methods = ["nearest", "bilinear", "bicubic"]
-    padding_modes = ["zeros", "border", "reflection"]
-    align_corners = [True, False]
-    data_2D_shape = (4, 4, 8, 8)
-    grid_2D_shape = (4, 2, 16, 16)
-    layout_2D = "NCHW"
-    # choosing smaller sizes to be testable on weaker GPUs
-    data_3D_shape = (4, 4, 4, 4, 4)
-    grid_3D_shape = (4, 3, 8, 8, 8)
-    layout_3D = "NCDHW"
-
-    for _method in methods:
-        for _padding in padding_modes:
-            for _align in align_corners:
-                verify_grid_sample(
-                    data_2D_shape, grid_2D_shape, _method, layout_2D, _padding, _align
-                )
-
-                # 3D "bicubic"(tricubic) is not supported in pytorch
-                if _method != "bicubic":
-                    verify_grid_sample(
-                        data_3D_shape, grid_3D_shape, _method, layout_3D, _padding, _align
-                    )
-
-
-if __name__ == "__main__":
-    test_resize2d()
-    test_resize3d()
-    test_crop_and_resize()
-    test_affine_grid()
-    test_grid_sample()
diff --git a/tests/python/topi/test_topi_instance_norm.py b/tests/python/topi/test_topi_instance_norm.py
deleted file mode 100644
index b19d9495dd2f..000000000000
--- a/tests/python/topi/test_topi_instance_norm.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for instance_norm."""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-import tvm.topi.testing
-
-import tvm.testing
-
-
-_instance_norm_schedule = {
-    "generic": topi.generic.schedule_injective,
-}
-
-
-# only test on llvm because schedule is missing
-@tvm.testing.parametrize_targets("llvm")
-@pytest.mark.parametrize("shape,axis", [([4, 16], (1,)), ([4, 16, 16], (1, 2))])
-def test_instance_norm(
-    target, dev, shape, axis, episilon=1e-5, dtype="float32", rtol=1e-5, atol=1e-5
-):
-    data = te.placeholder(shape, dtype=dtype, name="data")
-    scale_shape = [shape[dim] for dim in axis]
-    gamma = te.placeholder(scale_shape, dtype=dtype, name="gamma")
-    beta = te.placeholder(scale_shape, dtype=dtype, name="beta")
-    B = topi.nn.instance_norm(data, gamma, beta, axis, episilon)
-
-    data_np = np.random.uniform(size=shape).astype(dtype)
-    gamma_np = np.random.uniform(size=scale_shape).astype(dtype)
-    beta_np = np.random.uniform(size=scale_shape).astype(dtype)
-    b_np = tvm.topi.testing.instance_norm_python(data_np, gamma_np, beta_np, axis, episilon)
-
-    with tvm.target.Target(target):
-        s_func = tvm.topi.testing.dispatch(target, _instance_norm_schedule)
-        s = s_func([B])
-    data_tvm = tvm.nd.array(data_np, dev)
-    gamma_tvm = tvm.nd.array(gamma_np, dev)
-    beta_tvm = tvm.nd.array(beta_np, dev)
-    b_tvm = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-    f = tvm.build(s, [data, gamma, beta, B], target)
-    f(data_tvm, gamma_tvm, beta_tvm, b_tvm)
-    tvm.testing.assert_allclose(b_tvm.numpy(), b_np, rtol=rtol, atol=atol)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_layer_norm.py b/tests/python/topi/test_topi_layer_norm.py
deleted file mode 100644
index ff9eedd4e5e4..000000000000
--- a/tests/python/topi/test_topi_layer_norm.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for layer_norm."""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-import tvm.topi.testing
-
-import tvm.testing
-
-
-_layer_norm_schedule = {
-    "generic": topi.generic.schedule_injective,
-}
-
-
-# only test on llvm because schedule is missing
-@tvm.testing.parametrize_targets("llvm")
-@pytest.mark.parametrize("shape,axis", [([4, 16], (1,)), ([4, 16, 16], (1, 2))])
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-def test_layer_norm(target, dev, shape, axis, dtype, episilon=1e-5, rtol=5e-4, atol=5e-4):
-    data = te.placeholder(shape, dtype=dtype, name="data")
-    scale_shape = [shape[dim] for dim in axis]
-    gamma = te.placeholder(scale_shape, dtype=dtype, name="gamma")
-    beta = te.placeholder(scale_shape, dtype=dtype, name="beta")
-    B = topi.nn.layer_norm(data, gamma, beta, axis, episilon)
-
-    data_np = np.random.uniform(size=shape).astype(dtype)
-    gamma_np = np.random.uniform(size=scale_shape).astype(dtype)
-    beta_np = np.random.uniform(size=scale_shape).astype(dtype)
-    b_np = tvm.topi.testing.layer_norm_python(data_np, gamma_np, beta_np, axis, episilon)
-
-    with tvm.target.Target(target):
-        s_func = tvm.topi.testing.dispatch(target, _layer_norm_schedule)
-        s = s_func([B])
-    data_tvm = tvm.nd.array(data_np, dev)
-    gamma_tvm = tvm.nd.array(gamma_np, dev)
-    beta_tvm = tvm.nd.array(beta_np, dev)
-    b_tvm = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-    f = tvm.build(s, [data, gamma, beta, B], target)
-    f(data_tvm, gamma_tvm, beta_tvm, b_tvm)
-    tvm.testing.assert_allclose(b_tvm.numpy(), b_np, rtol=rtol, atol=atol)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_loss.py b/tests/python/topi/test_topi_loss.py
deleted file mode 100644
index 969beb7d28f7..000000000000
--- a/tests/python/topi/test_topi_loss.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for loss operators."""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-
-import tvm.testing
-
-
-prediction_shape, reduction, ignore_index, dtype = tvm.testing.parameters(
-    ((10, 5), "mean", -100, "float32"),
-    ((10, 5, 2, 2), "mean", -100, "float32"),
-    ((10, 5), "sum", -100, "float32"),
-    ((10, 5), "none", -100, "float32"),
-    ((10, 5), "mean", 3, "float32"),
-    ((10, 5), "mean", -100, "float64"),
-    ((5,), "mean", -100, "float32"),
-    ((5,), "mean", 3, "float32"),
-    ((5,), "none", -100, "float32"),
-)
-
-
-def test_nll_loss(target, dev, prediction_shape, reduction, ignore_index, dtype):
-    if len(prediction_shape) == 1:
-        C = prediction_shape[0]
-        target_shape = []
-    else:
-        C = prediction_shape[1]
-        target_shape = prediction_shape[:1] + prediction_shape[2:]
-    predictions = te.placeholder(shape=prediction_shape, name="predictions", dtype=dtype)
-    targets = te.placeholder(shape=target_shape, name="targets", dtype="int32")
-    weights = te.placeholder(shape=(C,), name="weights", dtype=dtype)
-    nll_loss_result = topi.nn.nll_loss(predictions, targets, weights, reduction, ignore_index)
-
-    with tvm.target.Target(target):
-        fschedule = tvm.topi.testing.get_reduce_schedule(target)
-        s = fschedule([nll_loss_result])
-    fn = tvm.build(s, [predictions, targets, weights, nll_loss_result], target, name="nll_loss")
-
-    predictions_npy = np.random.uniform(size=prediction_shape).astype(dtype)
-    targets_npy = np.random.randint(0, C, target_shape).astype("int32")
-    weights_npy = np.random.uniform(size=(C,)).astype(dtype)
-    out_npy = tvm.topi.testing.nll_loss(
-        predictions_npy, targets_npy, weights_npy, reduction, ignore_index
-    )
-
-    predictions_nd = tvm.nd.array(predictions_npy, dev)
-    targets_nd = tvm.nd.array(targets_npy, dev)
-    weights_nd = tvm.nd.array(weights_npy, dev)
-    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(nll_loss_result.dtype), dev)
-    fn(predictions_nd, targets_nd, weights_nd, out_nd)
-    out_topi = out_nd.numpy()
-    tvm.testing.assert_allclose(out_topi, out_npy, rtol=1e-4, atol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_lrn.py b/tests/python/topi/test_topi_lrn.py
deleted file mode 100644
index bf94d7cd79d9..000000000000
--- a/tests/python/topi/test_topi_lrn.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for local response normalization"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-import tvm.topi.testing
-import tvm.testing
-
-_lrn_schedule = {
-    "generic": topi.generic.schedule_lrn,
-    "gpu": topi.cuda.schedule_lrn,
-    "opencl": topi.cuda.schedule_lrn,
-    "metal": topi.cuda.schedule_lrn,
-    "rocm": topi.cuda.schedule_lrn,
-    "vulkan": topi.cuda.schedule_lrn,
-    "nvptx": topi.cuda.schedule_lrn,
-}
-
-
-def verify_lrn(shape, size, axis, bias, alpha, beta, dtype="float32", rtol=1e-5, atol=1e-5):
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = topi.nn.lrn(A, size, axis, alpha, beta, bias)
-
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = tvm.topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta)
-
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s_func = tvm.topi.testing.dispatch(device, _lrn_schedule)
-            s = s_func([B])
-        dev = tvm.device(device, 0)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-        f = tvm.build(s, [A, B], device)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=rtol, atol=atol)
-
-    for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def test_lrn():
-    verify_lrn((1, 3, 5, 5), 3, 1, 1.0, 1.0, 0.5)
-    verify_lrn((1, 3, 5, 5), 3, 3, 1.0, 1.0, 0.5)
-    verify_lrn((1, 3, 20, 20), 3, 1, 2.0, 1.0, 0.75)
-    verify_lrn((1, 3, 5, 5), 3, 3, 1.0, 1.0, 0.5, dtype="float16", rtol=1e-3, atol=1e-3)
-
-
-if __name__ == "__main__":
-    test_lrn()
diff --git a/tests/python/topi/test_topi_lstm.py b/tests/python/topi/test_topi_lstm.py
deleted file mode 100644
index 08ed5d73523d..000000000000
--- a/tests/python/topi/test_topi_lstm.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Test code for LSTM."""
-import numpy as np
-from rsa import verify
-import tvm
-from tvm import te, topi
-import tvm.testing
-import tvm.topi.testing
-
-
-def verify_lstm(
-    target,
-    dev,
-    seq_len,
-    batch_size,
-    in_dim,
-    hidden_dim,
-    proj_dim=0,
-    bias=True,
-    zero_init=True,
-    peephole=False,
-    reverse=False,
-    weight_layout="IFGO",
-):
-    out_dim = proj_dim if proj_dim > 0 else hidden_dim
-
-    def rand(*shape):
-        sqrt_k = np.sqrt(1 / hidden_dim)
-        return np.random.uniform(-sqrt_k, sqrt_k, size=shape).astype("float32")
-
-    def get_ref_data():
-        Xs = np.random.normal(size=(seq_len, batch_size, in_dim)).astype("float32")
-        Wi = rand(4 * hidden_dim, in_dim)
-        Wh = rand(4 * hidden_dim, out_dim)
-        Bi = None
-        Bh = None
-        h0 = None
-        c0 = None
-        proj = None
-        p_i = None
-        p_f = None
-        p_o = None
-
-        if bias:
-            Bi = rand(4 * hidden_dim)
-            Bh = rand(4 * hidden_dim)
-
-        if not zero_init:
-            h0 = np.random.normal(size=(batch_size, out_dim)).astype("float32")
-            c0 = np.random.normal(size=(batch_size, hidden_dim)).astype("float32")
-
-        if proj_dim > 0:
-            proj = rand(proj_dim, hidden_dim)
-
-        if peephole:
-            p_i, p_f, p_o = [rand(batch_size, hidden_dim) for _ in range(3)]
-
-        hs, cs = tvm.topi.testing.lstm_python(
-            Xs,
-            Wi,
-            Wh,
-            Bi=Bi,
-            Bh=Bh,
-            h_init=h0,
-            c_init=c0,
-            proj=proj,
-            p_i=p_i,
-            p_f=p_f,
-            p_o=p_o,
-            reverse=reverse,
-            weight_layout=weight_layout,
-        )
-
-        return [Xs, Wi, Wh, Bi, Bh, h0, c0, proj, p_i, p_f, p_o], [hs, cs]
-
-    args_np, (hs_np, cs_np) = get_ref_data()
-
-    args = [te.placeholder(a.shape, "float32") if a is not None else a for a in args_np]
-    real_args = [a for a in args if a is not None]
-
-    hs, cs = topi.nn.lstm(*args, reverse=reverse, weight_layout=weight_layout)
-    with tvm.target.Target(target):
-        sch = topi.generic.schedule_lstm([hs, cs])
-    func = tvm.build(sch, real_args + [hs, cs], target=target)
-
-    args_nd = [tvm.nd.array(a, dev) for a in args_np if a is not None]
-    hs_nd = tvm.nd.array(np.zeros((seq_len, batch_size, out_dim), "float32"), dev)
-    cs_nd = tvm.nd.array(np.zeros((seq_len, batch_size, hidden_dim), "float32"), dev)
-    func(*args_nd, hs_nd, cs_nd)
-
-    tvm.testing.assert_allclose(hs_nd.numpy(), hs_np, rtol=1e-4)
-    tvm.testing.assert_allclose(cs_nd.numpy(), cs_np, rtol=1e-4)
-
-
-def test_lstm():
-    verify_lstm(
-        "llvm",
-        tvm.cpu(0),
-        1,
-        1,
-        1,
-        1,
-        0,
-        True,
-        True,
-        False,
-        False,
-        "IFGO",
-    )
-
-    verify_lstm(
-        "llvm",
-        tvm.cpu(0),
-        8,
-        4,
-        8,
-        16,
-        0,
-        True,
-        False,
-        False,
-        False,
-        "IFGO",
-    )
-
-
-def test_lstm_proj():
-    verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 8, True, True, False, False, "IFGO")
-
-
-def test_lstm_peephole():
-    verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 0, True, True, True, False, "IFGO")
-
-
-def test_lstm_reverse():
-    verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 0, True, True, False, True, "IFGO")
-
-
-def test_lstm_weight_layout_iofg():
-    # IOFG is used by ONNX, while IFGO is used by PyTorch
-    verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 0, True, True, False, False, "IOFG")
-
-
-def test_lstm_assorted():
-    verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 16, True, False, True, True, "OIGF")
diff --git a/tests/python/topi/test_topi_math.py b/tests/python/topi/test_topi_math.py
deleted file mode 100644
index 917702ebb9ba..000000000000
--- a/tests/python/topi/test_topi_math.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys
-
-import numpy as np
-import pytest
-import scipy
-from scipy import special
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-from tvm.topi import utils
-
-
-def test_util():
-    x = tvm.tir.const(100, "int32")
-    assert utils.get_const_int(x) == 100
-    assert utils.get_const_tuple((x, x)) == (100, 100)
-
-
-ewise_operations = {
-    "floor": {"topi": topi.floor, "ref": np.floor, "input_range": (-100, 100)},
-    "ceil": {"topi": topi.ceil, "ref": np.ceil, "input_range": (-100, 100)},
-    "sign": {
-        "topi": topi.sign,
-        "ref": np.sign,
-        "input_range": (-100, 100),
-        "skip_name_check": True,
-    },
-    "trunc": {"topi": topi.trunc, "ref": np.trunc, "input_range": (-100, 100)},
-    "fabs": {"topi": topi.abs, "ref": np.fabs, "input_range": (-100, 100)},
-    "round": {"topi": topi.round, "ref": np.round, "input_range": (-100, 100), "check_round": True},
-    "exp": {"topi": topi.exp, "ref": np.exp, "input_range": (-1, 1)},
-    "tanh": {
-        "topi": topi.tanh,
-        "ref": np.tanh,
-        "input_range": (-10, 10),
-        "shape": (128, 128),
-        "dtype": ["float32", "float64"],
-    },
-    "sigmoid": {
-        "topi": topi.sigmoid,
-        "ref": lambda x: 1 / (1 + np.exp(-x)),
-        "input_range": (-1, 1),
-    },
-    "log": {"topi": topi.log, "ref": np.log, "input_range": (0, 100)},
-    "sqrt": {"topi": topi.sqrt, "ref": np.sqrt, "input_range": (0, 100)},
-    "rsqrt": {
-        "topi": topi.rsqrt,
-        "ref": lambda x: np.ones_like(x) / np.sqrt(x),
-        "input_range": (0, 100),
-        "skip_name_check": True,
-    },
-    "cos": {"topi": topi.cos, "ref": np.cos, "input_range": (-2.0 * np.pi, 2.0 * np.pi)},
-    "tan": {
-        "topi": topi.tan,
-        "ref": np.tan,
-        "input_range": (-2.0 * np.pi, 2.0 * np.pi),
-        "dtypes": ["float32", "float64"],
-    },
-    "sin": {"topi": topi.sin, "ref": np.sin, "input_range": (-2.0 * np.pi, 2.0 * np.pi)},
-    "erf": {"topi": topi.erf, "ref": scipy.special.erf, "input_range": (-0.1, 0.1)},
-    "isnan": {
-        "topi": topi.isnan,
-        "ref": np.isnan,
-        "input_range": (-1, 1),
-        "replace_with_nan": True,
-    },
-    "isfinite": {
-        "topi": topi.isfinite,
-        "ref": np.isfinite,
-        "input_range": (0, 1),
-        "shape": (8, 8),
-        "skip_name_check": True,
-        "replace_with_nan": True,
-        "replace_with_inf": True,
-        "dtypes": ["float32", "float64", "int32", "int16"],
-    },
-    "isinf": {
-        "topi": topi.isinf,
-        "ref": np.isinf,
-        "input_range": (0, 1),
-        "shape": (8, 8),
-        "skip_name_check": True,
-        "replace_with_nan": True,
-        "replace_with_inf": True,
-        "dtypes": ["float32", "float64", "int32", "int16"],
-    },
-    "fast_exp": {
-        "topi": topi.fast_exp,
-        "ref": np.exp,
-        "skip_name_check": True,
-        "input_range": (-88, 88),
-        "step": 0.01,
-    },
-    "fast_erf": {
-        "topi": topi.fast_erf,
-        "ref": scipy.special.erf,
-        "skip_name_check": True,
-        "input_range": (-10, 10),
-        "step": 0.01,
-        "dtypes": ["float32", "float16"],
-        "cast_output": True,
-        "tolerance": [1e-5, 1e-1],
-    },
-    "fast_tanh": {
-        "topi": topi.fast_tanh,
-        "ref": np.tanh,
-        "skip_name_check": True,
-        "input_range": (-10, 10),
-        "step": 0.01,
-    },
-}
-
-topi_name, dtype, tolerance = tvm.testing.parameters(
-    *[
-        (name, dtype, config.get("tolerance", [1e-5] * len(dtype))[i])
-        for name, config in ewise_operations.items()
-        for i, dtype in enumerate(config.get("dtypes", ["float32"]))
-    ]
-)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ewise_ref_data(topi_name, dtype):
-    config = ewise_operations[topi_name]
-
-    input_range = config["input_range"]
-    shape = config.get("shape", (20, 3))
-
-    a_np = np.random.uniform(*input_range, size=shape).astype(dtype)
-
-    if dtype.startswith("float"):
-        if config.get("replace_with_nan", False):
-            a_np.ravel()[np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False)] = np.nan
-        if config.get("replace_with_inf", False):
-            a_np.ravel()[np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False)] = np.inf
-
-    # avoid round check too close to boundary
-    if topi_name == "round":
-        a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-4
-
-    b_np = config["ref"](a_np)
-
-    if config.get("cast_output", False):
-        b_np = b_np.astype(dtype)
-
-    return a_np, b_np
-
-
-def test_ewise(target, dev, topi_name, dtype, tolerance, ewise_ref_data):
-    target = tvm.target.Target(target)
-    if target.kind.name == "vulkan" and topi_name in ["tan", "erf", "isnan", "isfinite", "isinf"]:
-        pytest.xfail(f"Vulkan runtime doesn't support {topi_name} yet")
-
-    topi_op = ewise_operations[topi_name]["topi"]
-    skip_name_check = ewise_operations[topi_name].get("skip_name_check", False)
-
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), dtype=dtype, name="A")
-
-    B = topi_op(A)
-    assert tuple(B.shape) == tuple(A.shape)
-    if not skip_name_check:
-        assert B.op.body[0].op.name == "tir." + topi_name
-
-    a_np, b_np = ewise_ref_data
-
-    with tvm.target.Target(target):
-        s = tvm.topi.testing.get_injective_schedule(target)(B)
-    foo = tvm.build(s, [A, B], target, name=topi_name)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros_like(b_np), dev)
-    foo(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=tolerance, atol=tolerance)
-
-
-from_dtype, to_dtype = tvm.testing.parameters(
-    ("int32", "float32"),
-    ("int32", "float64"),
-    ("int32", "bool"),
-    ("float16", "float32"),
-    ("float16", "float64"),
-    ("float32", "int32"),
-    ("float32", "float64"),
-    ("float32", "bool"),
-    # disable this due to llvm5+ bug https://github.com/llvm/llvm-project/issues/56204
-    # TODO (yongwww): pattern match f64->f16 to f64->f32->f16 as a workaround
-    # ("float64", "float16"),
-    ("float64", "float32"),
-    ("bool", "float32"),
-    ("bool", "int32"),
-)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def cast_ref_data(from_dtype, to_dtype):
-    shape = (5, 4)
-    input_range = (-100, 100)
-
-    if from_dtype == "bool":
-        a_np = np.random.choice([True, False], size=shape)
-    else:
-        a_np = np.random.uniform(*input_range, size=shape).astype(from_dtype)
-
-    if to_dtype == "bool":
-        a_np = a_np - a_np[2, 3]
-    b_np = a_np.astype(to_dtype)
-
-    return a_np, b_np
-
-
-def test_cast(target, dev, cast_ref_data, from_dtype, to_dtype):
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), dtype=from_dtype, name="A")
-    B = topi.cast(A, to_dtype)
-
-    a_np, b_np = cast_ref_data
-
-    with tvm.target.Target(target):
-        s = tvm.topi.testing.get_injective_schedule(target)(B)
-    foo = tvm.build(s, [A, B], target)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.empty(b_np.shape, dtype=to_dtype, device=dev)
-    foo(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_matmul.py b/tests/python/topi/test_topi_matmul.py
deleted file mode 100644
index d4abcd49d0ee..000000000000
--- a/tests/python/topi/test_topi_matmul.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-from tvm.topi.arm_cpu.matmul import compute_matmul_sme
-
-
-def with_tvm(lam, *args):
-    """Take numpy arrays as args, convert them to TVM tensors and call `lam`.
-    Result of lambda is converted back to numpy array and returned.
-    """
-    dev = tvm.cpu(0)
-    pls = []  # placeholders
-    vals_nd = []  # initial values
-    for i, arg in enumerate(args):
-        pls.append(te.placeholder(arg.shape, name="pl" + str(i)))
-        vals_nd.append(tvm.nd.array(arg, dev))
-
-    out = lam(*pls)
-    out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev)
-    s = te.create_schedule([out.op])
-    m = tvm.build(s, pls + [out], "llvm")
-    m(*(vals_nd + [out_nd]))
-    return out_nd.numpy()
-
-
-def verify_nn_matmul(sa, sb, transp_a, transp_b, bias=False):
-    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
-    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
-    if bias:
-        bias_shape = sb[-2] if transp_b else sb[-1]
-        bias_np = np.random.uniform(low=-1.0, high=1.0, size=(bias_shape,)).astype(np.float32)
-
-    a_np = a
-    if transp_a:
-        axes = list(range(len(sa)))
-        axes[-2], axes[-1] = axes[-1], axes[-2]
-        a_np = np.transpose(a_np, axes)
-    b_np = b
-    if transp_b:
-        axes = list(range(len(sb)))
-        axes[-2], axes[-1] = axes[-1], axes[-2]
-        b_np = np.transpose(b_np, axes)
-
-    if bias:
-        c1 = np.matmul(a_np, b_np) + bias_np
-        c2 = with_tvm(
-            lambda A, B, bias: topi.nn.matmul(
-                A, B, transpose_a=transp_a, transpose_b=transp_b, bias=bias
-            ),
-            a,
-            b,
-            bias_np,
-        )
-    else:
-        c1 = np.matmul(a_np, b_np)
-        c2 = with_tvm(
-            lambda A, B: topi.nn.matmul(A, B, transpose_a=transp_a, transpose_b=transp_b), a, b
-        )
-
-    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
-
-
-def test_nn_matmul():
-    verify_nn_matmul((1, 1), (1, 1), False, False)
-    verify_nn_matmul((1, 1), (1, 1), True, True)
-    verify_nn_matmul((2, 2), (2, 2), False, False)
-    verify_nn_matmul((2, 2), (2, 2), True, True)
-    verify_nn_matmul((2, 3), (3, 5), False, False)
-    verify_nn_matmul((5, 3), (3, 2), False, False)
-    verify_nn_matmul((3, 5), (2, 3), True, True)
-    verify_nn_matmul((3, 5), (3, 2), True, False)
-    verify_nn_matmul((5, 3), (2, 3), False, True)
-    # matmul with bias
-    verify_nn_matmul((5, 3), (3, 2), False, False, True)
-    verify_nn_matmul((3, 5), (2, 3), True, True, True)
-    verify_nn_matmul((3, 5), (3, 2), True, False, True)
-    verify_nn_matmul((5, 3), (2, 3), False, True, True)
-    # batched matmul
-    verify_nn_matmul((4, 5, 3), (4, 3, 2), False, False)
-    verify_nn_matmul((4, 3, 5), (4, 2, 3), True, True)
-    verify_nn_matmul((4, 3, 5), (4, 3, 2), True, False)
-    verify_nn_matmul((4, 5, 3), (4, 2, 3), False, True)
-    # batched matmul with broadcast
-    verify_nn_matmul((4, 5, 3), (1, 2, 3), False, True)
-    verify_nn_matmul((1, 5, 3), (4, 2, 3), False, True)
-    verify_nn_matmul((5, 3), (4, 2, 3), False, True)
-    verify_nn_matmul((4, 5, 3), (2, 3), False, True)
-    verify_nn_matmul((2, 4, 5, 3), (1, 2, 3), False, True)
-    # batched matmul with bias
-    verify_nn_matmul((4, 5, 3), (4, 3, 2), False, False, True)
-    verify_nn_matmul((4, 3, 5), (4, 2, 3), True, True, True)
-    verify_nn_matmul((4, 3, 5), (4, 3, 2), True, False, True)
-    verify_nn_matmul((4, 5, 3), (4, 2, 3), False, True, True)
-
-
-def verify_matmul(sa, sb, transp_a, transp_b):
-    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
-    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
-    c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b)
-    c2 = with_tvm(lambda A, B: topi.matmul(A, B, transp_a, transp_b), a, b)
-    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
-
-
-def test_matmul():
-    verify_matmul((1, 1), (1, 1), False, False)
-    verify_matmul((1, 1), (1, 1), True, True)
-    verify_matmul((2, 2), (2, 2), False, False)
-    verify_matmul((2, 2), (2, 2), True, True)
-    verify_matmul((2, 3), (3, 5), False, False)
-    verify_matmul((5, 3), (3, 2), False, False)
-    verify_matmul((3, 5), (3, 2), True, False)
-    verify_matmul((3, 5), (2, 3), True, True)
-
-
-def verify_tensordot(sa, sb, axes):
-    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
-    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
-    c1 = np.tensordot(a, b, axes)
-    c2 = with_tvm(lambda A, B: topi.tensordot(A, B, axes), a, b)
-    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
-
-
-def test_tensordot():
-    verify_tensordot((3), (3), 0)
-    verify_tensordot((2, 3), (3, 5), 1)
-    verify_tensordot((2, 2, 3), (2, 3, 5), 2)
-    verify_tensordot((2, 2, 3, 4), (2, 3, 4, 5), 3)
-    verify_tensordot((3, 2, 2), (2, 3, 5), (1, 0))
-    verify_tensordot((3, 2, 2), (2, 3, 5), ((1, 0), (0, 1)))
-    verify_tensordot((4, 3, 2, 2), (2, 4, 3, 5), ((1, 2, 0), (2, 0, 1)))
-
-
-@pytest.mark.parametrize("in_dtype", ["float32", "float16"])
-def test_unsupported_sme_matmul_compute_transpose_a(in_dtype):
-    err_msg = "Transposed lhs not currently supported."
-    with pytest.raises(AssertionError, match=err_msg):
-        compute_matmul_sme(
-            te.placeholder((32, 32), dtype=in_dtype),
-            te.placeholder((32, 32), dtype=in_dtype),
-            None,
-            None,
-            True,
-            False,
-        )
-
-
-def test_unsupported_sme_matmul_compute_transpose_b():
-    err_msg = "Rhs must be transposed when dtype is float16."
-    with pytest.raises(AssertionError, match=err_msg):
-        compute_matmul_sme(
-            te.placeholder((32, 32), dtype="float16"),
-            te.placeholder((32, 32), dtype="float16"),
-            None,
-            None,
-            False,
-            False,
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_pooling.py b/tests/python/topi/test_topi_pooling.py
deleted file mode 100644
index 5f8aebabc2df..000000000000
--- a/tests/python/topi/test_topi_pooling.py
+++ /dev/null
@@ -1,812 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
-"""Test code for pooling"""
-import math
-import pytest
-
-import numpy as np
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import te, topi, TVMError
-from tvm.topi.utils import get_const_tuple
-
-_pool_schedule = {
-    "generic": topi.generic.schedule_pool,
-    "cpu": topi.x86.schedule_pool,
-    "gpu": topi.cuda.schedule_pool,
-    "hls": topi.hls.schedule_pool,
-}
-
-_adaptive_pool_schedule = {
-    "generic": topi.generic.schedule_adaptive_pool,
-    "cpu": topi.x86.schedule_adaptive_pool,
-    "gpu": topi.cuda.schedule_adaptive_pool,
-    "hls": topi.hls.schedule_adaptive_pool,
-}
-
-_pool_grad_schedule = {
-    "generic": topi.generic.schedule_pool_grad,
-    "gpu": topi.cuda.schedule_pool_grad,
-}
-
-
-def verify_pool_grad(
-    n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True, add_relu=False
-):
-    """verify function of pool_grad"""
-    iw = ih
-    kw = kh
-    sw = sh
-    pt, pl, pb, pr = padding
-    A = te.placeholder((n, ic, ih, iw), name="A")
-    B = topi.nn.pool2d(
-        A,
-        kernel=[kh, kw],
-        stride=[sh, sw],
-        dilation=[1, 1],
-        padding=padding,
-        pool_type=pool_type,
-        ceil_mode=ceil_mode,
-        layout="NCHW",
-        count_include_pad=count_include_pad,
-    )
-    dtype = A.dtype
-
-    bshape = get_const_tuple(B.shape)
-    ashape = get_const_tuple(A.shape)
-    if ceil_mode:
-        assert bshape[2] == int(math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1)
-        assert bshape[3] == int(math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1)
-    else:
-        assert bshape[2] == int(math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1)
-        assert bshape[3] == int(math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1)
-    OutGrad = te.placeholder(bshape, name="OutGrad")
-    PoolGrad = topi.nn.pool_grad(
-        OutGrad,
-        A,
-        kernel=[kh, kw],
-        stride=[sh, sw],
-        padding=padding,
-        pool_type=pool_type,
-        ceil_mode=ceil_mode,
-        layout="NCHW",
-        count_include_pad=count_include_pad,
-    )
-    if add_relu:
-        PoolGrad = topi.nn.relu(PoolGrad)
-
-    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
-    out_grad_np = np.random.uniform(low=0.001, size=bshape).astype(dtype)
-    pool_grad_np = tvm.topi.testing.pool_grad_nchw(
-        a_np,
-        out_grad_np,
-        pool_size=(kh, kw),
-        strides=(sh, sw),
-        padding=padding,
-        pool_type=pool_type,
-        ceil_mode=ceil_mode,
-        count_include_pad=count_include_pad,
-    )
-    if add_relu:
-        pool_grad_np = np.maximum(pool_grad_np, 0.0)
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s_func = tvm.topi.testing.dispatch(target, _pool_grad_schedule)
-            s = s_func(PoolGrad)
-
-        a = tvm.nd.array(a_np, dev)
-        out_grad = tvm.nd.array(out_grad_np, dev)
-        pool_grad = tvm.nd.array(np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), dev)
-        f = tvm.build(s, [A, OutGrad, PoolGrad], target)
-        f(a, out_grad, pool_grad)
-        tvm.testing.assert_allclose(pool_grad.numpy(), pool_grad_np, rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_pool_grad():
-    """test cases of pool_grad"""
-    verify_pool_grad(1, 256, 32, 3, 2, [1, 1, 1, 1], "avg", False, False)
-    verify_pool_grad(1, 256, 32, 2, 2, [0, 0, 0, 0], "avg", False, True)
-    verify_pool_grad(1, 256, 31, 3, 3, [1, 2, 1, 2], "avg", False, True)
-    verify_pool_grad(1, 256, 32, 2, 2, [1, 2, 1, 2], "avg", False, False)
-    verify_pool_grad(1, 256, 31, 4, 4, [2, 2, 2, 2], "avg", False, False)
-    verify_pool_grad(1, 256, 31, 4, 4, [0, 0, 0, 0], "avg", False, False)
-    verify_pool_grad(1, 256, 32, 2, 2, [0, 0, 0, 0], "max", False)
-    verify_pool_grad(1, 256, 31, 3, 3, [2, 1, 2, 1], "max", False)
-    verify_pool_grad(1, 256, 31, 3, 3, [2, 1, 2, 1], "max", True)
-
-    verify_pool_grad(1, 256, 31, 3, 3, [2, 1, 0, 3], "avg", False, True)
-    verify_pool_grad(1, 256, 32, 2, 2, [0, 3, 2, 1], "avg", False, False)
-    verify_pool_grad(1, 256, 31, 3, 3, [1, 0, 3, 2], "max", False)
-    verify_pool_grad(1, 256, 31, 3, 3, [3, 2, 1, 0], "max", True)
-    verify_pool_grad(1, 256, 32, 3, 2, [1, 1, 1, 1], "max", False)
-    verify_pool_grad(1, 256, 32, 1, 2, [1, 1, 1, 1], "avg", False, False)
-
-    verify_pool_grad(1, 256, 31, 4, 4, [0, 0, 0, 0], "avg", False, False, add_relu=True)
-    verify_pool_grad(1, 256, 32, 2, 2, [0, 0, 0, 0], "max", False, add_relu=True)
-
-
-def verify_global_pool(dshape, pool_type, layout="NCHW"):
-    """verify function of global_pool"""
-    assert layout in ["NCHW", "NHWC"]
-    A = te.placeholder(shape=dshape, name="A")
-    B = topi.nn.global_pool(A, pool_type=pool_type, layout=layout)
-    B = topi.nn.relu(B)
-
-    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
-
-    axis = (layout.find("H"), layout.find("W"))
-    if pool_type == "avg":
-        b_np = np.mean(a_np, axis=axis, keepdims=True)
-    elif pool_type == "max":
-        b_np = np.max(a_np, axis=axis, keepdims=True)
-    b_np = np.maximum(b_np, 0.0)
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s_func = tvm.topi.testing.dispatch(target, _adaptive_pool_schedule)
-            if target == "cuda":
-                s = s_func(B, layout)
-            else:
-                s = s_func(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_global_pool():
-    """test cases of global_pool"""
-    verify_global_pool((1, 1024, 7, 7), "avg")
-    verify_global_pool((4, 1024, 7, 7), "avg")
-    verify_global_pool((1, 1024, 7, 7), "max")
-    verify_global_pool((4, 1024, 7, 7), "max")
-    verify_global_pool((1, 7, 7, 1024), "avg", "NHWC")
-    verify_global_pool((4, 7, 7, 1024), "avg", "NHWC")
-    verify_global_pool((1, 7, 7, 1024), "max", "NHWC")
-    verify_global_pool((4, 7, 7, 1024), "max", "NHWC")
-
-
-def verify_adaptive_pool(dshape, out_size, pool_type, layout="NCHW", dtype="float32"):
-    """verify function of adaptive_pool"""
-    np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
-    np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout)
-    oshape = np_out.shape
-
-    data = te.placeholder(dshape, name="data", dtype=dtype)
-    if len(out_size) == 2:
-        out = topi.nn.adaptive_pool(data, out_size, pool_type, layout)
-    else:
-        assert len(out_size) == 3
-        out = topi.nn.adaptive_pool3d(data, out_size, pool_type, layout)
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s_func = tvm.topi.testing.dispatch(target, _adaptive_pool_schedule)
-            if target == "cuda":
-                s = s_func(out, layout)
-            else:
-                s = s_func(out)
-        a = tvm.nd.array(np_data, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), dev)
-        f = tvm.build(s, [data, out], target)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), np_out, rtol=4e-5, atol=1e-6)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_adaptive_pool():
-    """test cases of adaptive_pool"""
-    verify_adaptive_pool((1, 3, 224, 224), (1, 1), "max")
-    verify_adaptive_pool((1, 3, 224, 224), (1, 1), "avg")
-    verify_adaptive_pool((1, 14, 56, 78), (34, 13), "max")
-    verify_adaptive_pool((1, 5, 46, 97), (4, 96), "avg")
-    verify_adaptive_pool((1, 224, 224, 3), (1, 1), "max", layout="NHWC")
-    verify_adaptive_pool((1, 5, 46, 97), (4, 96), "avg", layout="NHWC")
-    verify_adaptive_pool((1, 16, 32, 32, 32), (1, 1, 1), "max", layout="NCDHW")
-    verify_adaptive_pool((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NCDHW")
-    verify_adaptive_pool((1, 16, 32, 32, 32), (2, 2, 2), "avg", layout="NCDHW")
-    verify_adaptive_pool((1, 16, 64, 32, 32), (7, 8, 9), "avg", layout="NCDHW")
-    verify_adaptive_pool((1, 16, 64, 32, 32), (8, 16, 16), "avg", layout="NCDHW")
-    verify_adaptive_pool((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NDHWC")
-    verify_adaptive_pool((1, 16, 32, 32, 32), (2, 2, 2), "max", layout="NDHWC")
-    verify_adaptive_pool((1, 16, 32, 32, 32), (2, 4, 4), "max", layout="NDHWC")
-
-
-def verify_poolnd(
-    n,
-    input_shape,
-    kernel,
-    stride,
-    dilation,
-    padding,
-    pool_type,
-    ceil_mode,
-    layout,
-    count_include_pad=True,
-):
-    """verify function of pool1d"""
-    A = te.placeholder(input_shape, name="A")
-
-    if n == 1:
-        B = topi.nn.pool1d(
-            A,
-            kernel=kernel,
-            stride=stride,
-            dilation=dilation,
-            padding=padding,
-            pool_type=pool_type,
-            ceil_mode=ceil_mode,
-            layout=layout,
-            count_include_pad=count_include_pad,
-        )
-    elif n == 2:
-        B = topi.nn.pool2d(
-            A,
-            kernel=kernel,
-            stride=stride,
-            dilation=dilation,
-            padding=padding,
-            pool_type=pool_type,
-            ceil_mode=ceil_mode,
-            layout=layout,
-            count_include_pad=count_include_pad,
-        )
-    elif n == 3:
-        B = topi.nn.pool3d(
-            A,
-            kernel=kernel,
-            stride=stride,
-            dilation=dilation,
-            padding=padding,
-            pool_type=pool_type,
-            ceil_mode=ceil_mode,
-            layout=layout,
-            count_include_pad=count_include_pad,
-        )
-    else:
-        raise ValueError(f"PoolND only supports n=1, 2, 3 got n={n}")
-
-    B = topi.nn.relu(B)
-    dtype = A.dtype
-    output_shape = [int(i) for i in B.shape]
-
-    input_np = np.random.uniform(low=0.001, size=input_shape).astype(dtype)
-
-    padding_before = padding[:n]
-    padding_after = padding[n:]
-    ref_np = tvm.topi.testing.poolnd_python(
-        input_np,
-        kernel,
-        stride,
-        dilation,
-        padding_before,
-        padding_after,
-        pool_type,
-        count_include_pad,
-        ceil_mode,
-        layout=layout,
-    )
-
-    np.testing.assert_equal(tuple(output_shape), tuple(ref_np.shape))
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s_func = tvm.topi.testing.dispatch(target, _pool_schedule)
-            s = s_func(B, layout)
-
-        a = tvm.nd.array(input_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), ref_np, rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-def verify_pool3d(
-    input_shape,
-    kernel,
-    stride,
-    dilation,
-    padding,
-    pool_type,
-    ceil_mode,
-    count_include_pad=True,
-    layout="NCDHW",
-):
-    verify_poolnd(
-        3,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        layout=layout,
-        count_include_pad=count_include_pad,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_pool3d():
-    """test cases of pool3d"""
-    verify_pool3d(
-        [1, 16, 32, 32, 32], [2, 2, 2], [2, 2, 2], [1, 1, 1], [0, 0, 0, 0, 0, 0], "avg", False, True
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [1, 1, 1], [1, 1, 2, 2, 2, 1], "avg", False, True
-    )
-    verify_pool3d(
-        [1, 16, 32, 32, 32],
-        [2, 2, 2],
-        [2, 2, 2],
-        [1, 1, 1],
-        [1, 1, 2, 2, 2, 1],
-        "avg",
-        False,
-        False,
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31],
-        [4, 4, 4],
-        [4, 4, 4],
-        [1, 1, 1],
-        [3, 3, 3, 3, 3, 3],
-        "avg",
-        False,
-        False,
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31],
-        [4, 4, 4],
-        [4, 4, 4],
-        [1, 1, 1],
-        [0, 0, 0, 0, 0, 0],
-        "avg",
-        False,
-        False,
-    )
-    verify_pool3d(
-        [1, 16, 32, 32, 32], [2, 2, 2], [2, 2, 2], [1, 1, 1], [0, 0, 0, 0, 0, 0], "max", False
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [1, 1, 1], [2, 2, 1, 1, 1, 2], "max", False
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [1, 1, 1], [2, 2, 1, 1, 1, 2], "max", True
-    )
-
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [1, 1, 1], [2, 1, 0, 5, 4, 3], "avg", False, True
-    )
-    verify_pool3d(
-        [1, 16, 32, 32, 32],
-        [2, 2, 2],
-        [2, 2, 2],
-        [1, 1, 1],
-        [0, 5, 4, 3, 2, 1],
-        "avg",
-        False,
-        False,
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [1, 1, 1], [1, 0, 5, 4, 3, 2], "max", False
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [1, 1, 1], [3, 2, 1, 0, 5, 4], "max", True
-    )
-
-    # Test non-1 dilation
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [3, 3, 3], [2, 1, 0, 5, 4, 3], "avg", False, True
-    )
-    verify_pool3d(
-        [1, 16, 32, 32, 32],
-        [2, 2, 2],
-        [2, 2, 2],
-        [2, 2, 2],
-        [0, 5, 4, 3, 2, 1],
-        "avg",
-        False,
-        False,
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [2, 1, 3], [1, 0, 5, 4, 3, 2], "max", False
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [2, 2, 3], [3, 2, 1, 0, 5, 4], "max", True
-    )
-    # Test channel last layouts
-    verify_pool3d(
-        [1, 32, 32, 32, 16],
-        [2, 2, 2],
-        [2, 2, 2],
-        [1, 1, 1],
-        [0, 0, 0, 0, 0, 0],
-        "avg",
-        False,
-        True,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [3, 3, 3],
-        [3, 3, 3],
-        [1, 1, 1],
-        [1, 1, 2, 2, 2, 1],
-        "avg",
-        False,
-        True,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 32, 32, 32, 16],
-        [2, 2, 2],
-        [2, 2, 2],
-        [1, 1, 1],
-        [1, 1, 2, 2, 2, 1],
-        "avg",
-        False,
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [4, 4, 4],
-        [4, 4, 4],
-        [1, 1, 1],
-        [3, 3, 3, 3, 3, 3],
-        "avg",
-        False,
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [4, 4, 4],
-        [4, 4, 4],
-        [1, 1, 1],
-        [0, 0, 0, 0, 0, 0],
-        "avg",
-        False,
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 32, 32, 32, 16],
-        [2, 2, 2],
-        [2, 2, 2],
-        [1, 1, 1],
-        [0, 0, 0, 0, 0, 0],
-        "max",
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [3, 3, 3],
-        [3, 3, 3],
-        [1, 1, 1],
-        [2, 2, 1, 1, 1, 2],
-        "max",
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [3, 3, 3],
-        [3, 3, 3],
-        [1, 1, 1],
-        [2, 2, 1, 1, 1, 2],
-        "max",
-        True,
-        layout="NDHWC",
-    )
-
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [3, 3, 3],
-        [3, 3, 3],
-        [1, 1, 1],
-        [2, 1, 0, 5, 4, 3],
-        "avg",
-        False,
-        True,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 32, 32, 32, 16],
-        [2, 2, 2],
-        [2, 2, 2],
-        [1, 1, 1],
-        [0, 5, 4, 3, 2, 1],
-        "avg",
-        False,
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [3, 3, 3],
-        [3, 3, 3],
-        [1, 1, 1],
-        [1, 0, 5, 4, 3, 2],
-        "max",
-        False,
-        layout="NDHWC",
-    )
-    verify_pool3d(
-        [1, 31, 31, 31, 16],
-        [3, 3, 3],
-        [3, 3, 3],
-        [1, 1, 1],
-        [3, 2, 1, 0, 5, 4],
-        "max",
-        True,
-        layout="NDHWC",
-    )
-
-    # Test non-1 dilation
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [3, 3, 3], [2, 1, 0, 5, 4, 3], "avg", False, True
-    )
-    verify_pool3d(
-        [1, 16, 32, 32, 32],
-        [2, 2, 2],
-        [2, 2, 2],
-        [2, 2, 2],
-        [0, 5, 4, 3, 2, 1],
-        "avg",
-        False,
-        False,
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [2, 1, 3], [1, 0, 5, 4, 3, 2], "max", False
-    )
-    verify_pool3d(
-        [1, 16, 31, 31, 31], [3, 3, 3], [3, 3, 3], [2, 2, 3], [3, 2, 1, 0, 5, 4], "max", True
-    )
-
-
-def verify_pool2d(
-    input_shape,
-    kernel,
-    stride,
-    dilation,
-    padding,
-    pool_type,
-    ceil_mode,
-    count_include_pad=True,
-    layout="NCHW",
-):
-    verify_poolnd(
-        2,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        layout=layout,
-        count_include_pad=count_include_pad,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_pool2d():
-    """test cases of pool"""
-    verify_pool2d([1, 16, 32, 32], [2, 2], [2, 2], [1, 1], [0, 0, 0, 0], "avg", False, True)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [1, 2, 1, 2], "avg", False, True)
-    verify_pool2d([1, 16, 32, 32], [2, 2], [2, 2], [1, 1], [1, 2, 1, 2], "avg", False, False)
-    verify_pool2d([1, 16, 31, 31], [4, 4], [4, 4], [1, 1], [3, 3, 3, 3], "avg", False, False)
-    verify_pool2d([1, 16, 31, 31], [4, 4], [4, 4], [1, 1], [0, 0, 0, 0], "avg", False, False)
-    verify_pool2d([1, 16, 32, 32], [2, 3], [2, 2], [1, 1], [0, 0, 0, 0], "max", False)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", False)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", True)
-
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [2, 1, 0, 3], "avg", False, True)
-    verify_pool2d([1, 16, 32, 32], [2, 3], [2, 2], [1, 1], [0, 3, 2, 1], "avg", False, False)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [1, 0, 3, 2], "max", False)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [1, 1], [3, 2, 1, 0], "max", True)
-
-    # Test non-1 dilations
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [2, 1], [2, 1, 0, 3], "avg", False, True)
-    verify_pool2d([1, 16, 32, 32], [2, 3], [2, 2], [2, 3], [0, 3, 2, 1], "avg", False, False)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [3, 3], [1, 0, 3, 2], "max", False)
-    verify_pool2d([1, 16, 31, 31], [3, 3], [3, 3], [2, 2], [3, 2, 1, 0], "max", True)
-    # Test channel last
-    verify_pool2d(
-        [1, 32, 32, 16], [2, 2], [2, 2], [1, 1], [0, 0, 0, 0], "avg", False, True, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [1, 2, 1, 2], "avg", False, True, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 32, 32, 16], [2, 2], [2, 2], [1, 1], [1, 2, 1, 2], "avg", False, False, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 31, 31, 16], [4, 4], [4, 4], [1, 1], [3, 3, 3, 3], "avg", False, False, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 31, 31, 16], [4, 4], [4, 4], [1, 1], [0, 0, 0, 0], "avg", False, False, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 32, 32, 16], [2, 3], [2, 2], [1, 1], [0, 0, 0, 0], "max", False, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", False, layout="NHWC"
-    )
-    verify_pool2d([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [2, 1, 2, 1], "max", True, layout="NHWC")
-
-    verify_pool2d(
-        [1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [2, 1, 0, 3], "avg", False, True, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 32, 32, 16], [2, 3], [2, 2], [1, 1], [0, 3, 2, 1], "avg", False, False, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [1, 0, 3, 2], "max", False, layout="NHWC"
-    )
-    verify_pool2d([1, 31, 31, 16], [3, 3], [3, 3], [1, 1], [3, 2, 1, 0], "max", True, layout="NHWC")
-    verify_pool2d(
-        [1, 31, 31, 16], [3, 3], [3, 3], [2, 1], [2, 1, 0, 3], "avg", False, True, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 32, 32, 16], [2, 3], [2, 2], [2, 3], [0, 3, 2, 1], "avg", False, False, layout="NHWC"
-    )
-    verify_pool2d(
-        [1, 31, 31, 16], [3, 3], [3, 3], [3, 3], [1, 0, 3, 2], "max", False, layout="NHWC"
-    )
-    verify_pool2d([1, 31, 31, 16], [3, 3], [3, 3], [2, 2], [3, 2, 1, 0], "max", True, layout="NHWC")
-
-
-def verify_pool1d(
-    input_shape,
-    kernel,
-    stride,
-    dilation,
-    padding,
-    pool_type,
-    ceil_mode,
-    count_include_pad=True,
-    layout="NCW",
-):
-    verify_poolnd(
-        1,
-        input_shape,
-        kernel,
-        stride,
-        dilation,
-        padding,
-        pool_type,
-        ceil_mode,
-        layout=layout,
-        count_include_pad=count_include_pad,
-    )
-
-
-@tvm.testing.uses_gpu
-def test_pool1d():
-    """test cases of pool1d"""
-    verify_pool1d([1, 16, 32], [2], [2], [1], [0, 0], "avg", False, True)
-    verify_pool1d([1, 16, 31], [3], [3], [1], [1, 2], "avg", False, True)
-    verify_pool1d([1, 16, 32], [2], [2], [1], [1, 2], "avg", False, False)
-    verify_pool1d([1, 16, 31], [4], [4], [1], [3, 3], "avg", False, False)
-    verify_pool1d([1, 16, 31], [4], [4], [1], [0, 0], "avg", False, False)
-    verify_pool1d([1, 16, 32], [2], [2], [1], [0, 0], "max", False)
-    verify_pool1d([1, 16, 31], [3], [3], [1], [2, 1], "max", False)
-    verify_pool1d([1, 16, 31], [3], [3], [1], [2, 1], "max", True)
-
-    verify_pool1d([1, 16, 31], [3], [3], [1], [2, 5], "avg", False, True)
-    verify_pool1d([1, 16, 32], [2], [2], [1], [0, 3], "avg", False, False)
-    verify_pool1d([1, 16, 31], [3], [3], [1], [1, 4], "max", False)
-    verify_pool1d([1, 16, 31], [3], [3], [1], [3, 0], "max", True)
-
-    # Test non-1 dilations
-    verify_pool1d([1, 16, 31], [3], [3], [2], [2, 5], "avg", False, True)
-    verify_pool1d([1, 16, 32], [2], [2], [3], [0, 3], "avg", False, False)
-    verify_pool1d([1, 16, 31], [3], [3], [2], [1, 4], "max", False)
-    verify_pool1d([1, 16, 31], [3], [3], [3], [3, 0], "max", True)
-    # Test Channel last
-    verify_pool1d([1, 32, 16], [2], [2], [1], [0, 0], "avg", False, True, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [1], [1, 2], "avg", False, True, layout="NWC")
-    verify_pool1d([1, 32, 16], [2], [2], [1], [1, 2], "avg", False, False, layout="NWC")
-    verify_pool1d([1, 31, 16], [4], [4], [1], [3, 3], "avg", False, False, layout="NWC")
-    verify_pool1d([1, 31, 16], [4], [4], [1], [0, 0], "avg", False, False, layout="NWC")
-    verify_pool1d([1, 32, 16], [2], [2], [1], [0, 0], "max", False, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [1], [2, 1], "max", False, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [1], [2, 1], "max", True, layout="NWC")
-
-    verify_pool1d([1, 31, 16], [3], [3], [1], [2, 5], "avg", False, True, layout="NWC")
-    verify_pool1d([1, 31, 16], [2], [2], [1], [0, 3], "avg", False, False, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [1], [1, 4], "max", False, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [1], [3, 0], "max", True, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [2], [2, 5], "avg", False, True, layout="NWC")
-    verify_pool1d([1, 32, 16], [2], [2], [3], [0, 3], "avg", False, False, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [2], [1, 4], "max", False, layout="NWC")
-    verify_pool1d([1, 31, 16], [3], [3], [3], [3, 0], "max", True, layout="NWC")
-
-
-def test_pool_invalid_tiled_layout():
-
-    with pytest.raises(TVMError, match="Unsupported layout NCHWD4d"):
-        A_3d = te.placeholder([1, 16, 32, 32, 32], name="A")
-        B = topi.nn.pool3d(
-            A_3d,
-            kernel=[2, 2, 2],
-            stride=[2, 2, 2],
-            dilation=[1, 1, 1],
-            padding=[0, 0, 0, 0, 0, 0],
-            pool_type="avg",
-            ceil_mode=False,
-            count_include_pad=True,
-            layout="NCHWD4d",
-        )
-
-    with pytest.raises(TVMError, match="Unsupported layout NCHW4h4w"):
-        A_2d = te.placeholder([1, 16, 32, 32], name="A")
-        B = topi.nn.pool2d(
-            A_2d,
-            kernel=[2, 2],
-            stride=[2, 2],
-            dilation=[1, 1],
-            padding=[0, 0, 0, 0],
-            pool_type="avg",
-            ceil_mode=False,
-            count_include_pad=True,
-            layout="NCHW4h4w",
-        )
-
-    with pytest.raises(TVMError, match="Unsupported layout NCW4w"):
-        A_1d = te.placeholder([1, 16, 32], name="A")
-        B = topi.nn.pool1d(
-            A_1d,
-            kernel=[2],
-            stride=[2],
-            dilation=[1],
-            padding=[0, 0],
-            pool_type="avg",
-            ceil_mode=False,
-            count_include_pad=True,
-            layout="NCW4w",
-        )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_prng.py b/tests/python/topi/test_topi_prng.py
deleted file mode 100644
index 275989493b3c..000000000000
--- a/tests/python/topi/test_topi_prng.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.relay
-import tvm.testing
-import tvm.topi
-import numpy as np
-import scipy.stats
-
-
-def threefry_split(target, dev, gen):
-    gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
-    left_placeholder, right_placeholder = tvm.topi.random.threefry_split(gen_placeholder)
-    s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
-    f = tvm.build(s, [gen_placeholder, left_placeholder, right_placeholder])
-    left = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"))
-    right = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"))
-    f(tvm.nd.array(gen), left, right)
-    return left.numpy(), right.numpy()
-
-
-def threefry_generate(target, dev, gen, size):
-    gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
-    left_placeholder, right_placeholder = tvm.topi.random.threefry_generate(gen_placeholder, size)
-    s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
-    f = tvm.build(s, [gen_placeholder, left_placeholder, right_placeholder])
-    out_gen = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"))
-    rands = tvm.nd.array(np.zeros(size, dtype="uint64"))
-    f(tvm.nd.array(gen), out_gen, rands)
-    return out_gen.numpy(), rands.numpy()
-
-
-def uniform(target, dev, gen, low, high, size, dtype):
-    gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
-    low_placeholder = tvm.te.placeholder(low.shape, name="low", dtype=dtype)
-    high_placeholder = tvm.te.placeholder(high.shape, name="high", dtype=dtype)
-    left_placeholder, right_placeholder = tvm.topi.random.uniform(
-        gen_placeholder, low_placeholder, high_placeholder, size, dtype
-    )
-    s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
-    f = tvm.build(
-        s,
-        [gen_placeholder, low_placeholder, high_placeholder, left_placeholder, right_placeholder],
-        target=target,
-    )
-    out_gen = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"), device=dev)
-    rands = tvm.nd.array(np.zeros(size, dtype=dtype), device=dev)
-    f(
-        tvm.nd.array(gen, device=dev),
-        tvm.nd.array(low, device=dev),
-        tvm.nd.array(high, device=dev),
-        out_gen,
-        rands,
-    )
-    return out_gen.numpy(), rands.asnumpy()
-
-
-def multinomial(target, dev, gen, probs, num_samples):
-    gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
-    probs_placeholder = tvm.te.placeholder(probs.shape, name="probs", dtype="float32")
-    new_gen_placeholder, indices_placeholder = tvm.topi.random.multinomial(
-        gen_placeholder, probs_placeholder, num_samples
-    )
-    s = tvm.topi.generic.schedule_extern([new_gen_placeholder, indices_placeholder])
-    f = tvm.build(
-        s,
-        [gen_placeholder, probs_placeholder, new_gen_placeholder, indices_placeholder],
-        target=target,
-    )
-    out_gen = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"), device=dev)
-    indices = tvm.nd.array(np.zeros((*probs.shape[:-1], num_samples), dtype="int32"), device=dev)
-    f(tvm.nd.array(gen), tvm.nd.array(probs), out_gen, indices)
-    return out_gen.numpy(), indices.asnumpy()
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_threefry_split(target, dev):
-    # test that results of split do not equal eachother or the input
-    gen = tvm.relay.random.threefry_key(0).data.numpy()
-    a, b = threefry_split(target, dev, gen)
-    assert (a != b).any() and (
-        a != gen
-    ).any(), "Splitting a gen should result in different output gens"
-    # unittest some split inputs
-    assert (a == np.array([0, 0, 0, 0, 0, 0, 0, 0, 1 << 62, 0], dtype="uint64")).all()
-    assert (b == np.array([0, 0, 0, 0, 1 << 63, 0, 0, 0, 1 << 62, 0], dtype="uint64")).all()
-
-    # test enough splits to go over path length
-    for i in range(129):
-        a, b = threefry_split(target, dev, b)
-    assert (a[0:4] == b[0:4]).all(), "State part of split should be the same"
-    assert (b[0:4] != np.zeros(4, dtype="uint64")).any()
-
-    # check that split then generate does not generate the same for both sides
-    a, a_rands = threefry_generate(target, dev, a, (100,))
-    b, b_rands = threefry_generate(target, dev, b, (100,))
-    assert (
-        a_rands != b_rands
-    ).all(), "Numbers generated from different initial states should be different"
-
-    # check repeatability
-    _, rands1 = threefry_generate(target, dev, a, (100,))
-    _, rands2 = threefry_generate(target, dev, a, (100,))
-    assert (
-        rands1 == rands2
-    ).all(), "Numbers generated from the same initial state should be the same"
-
-    a1, b1 = threefry_split(target, dev, a)
-    a2, b2 = threefry_split(target, dev, a)
-    assert (a1 == a2).all() and (
-        b1 == b2
-    ).all(), "Split called on the same input should return the same result"
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_threefry_generate(target, dev):
-    gen = tvm.relay.random.threefry_key(0).data.numpy()
-
-    # check that we can generate some data
-    a, rands = threefry_generate(target, dev, gen, (2048,))
-    assert (
-        rands.shape[0] == 2048 and len(rands.shape) == 1
-    ), "Output shape should match requested shape"
-
-    # check that gen out does not equal input
-    assert (a != gen).any(), "Output generator should be different from input generator"
-
-    # check that we can generate data whose total number of elements is not a multiple of 4.
-    a, rands = threefry_generate(target, dev, gen, (7,))
-    assert (
-        rands.shape[0] == 7 and len(rands.shape) == 1
-    ), "Output shape should match requested shape"
-
-    # test enough generates to go over generate limit
-    gen = np.array(
-        [0, 0, 0, 0, 0, 0, 0, 2**64 - 2, 1 << 63, 0], dtype="uint64"
-    )  # make counter large
-    a, rands = threefry_generate(target, dev, gen, (2048,))
-    assert gen[4] != a[4], "Overflow of counter should trigger path change"
-    assert a[7] == 2048, "Overflow of counter should still update counter"
-
-    # check generate with path at length limit
-    gen = np.array([0, 0, 0, 0, 0, 0, 0, 2**64 - 2, 0, 0], dtype="uint64")  # make counter large
-    a, rands = threefry_generate(target, dev, gen, (2048,))
-    assert (
-        gen[0:4] != a[0:4]
-    ).any(), "Overflowing counter with no space left in path should change state"
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_threefry_wrapping(target, dev):
-    assert tvm.topi.random.threefry_test_wrapping(
-        target, dev
-    ), f"{target} does not suppport wrapping unsigned integer arithmetic"
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_uniform(target, dev):
-    gen = tvm.relay.random.threefry_key(0).data.numpy()
-    m = 1024
-    n = 1024
-    dtypes = ["float32", "float64"]
-    for dtype in dtypes:
-        low = np.array(5.0, dtype=dtype)
-        high = np.array(10.0, dtype=dtype)
-        new_gen, rands = uniform(target, dev, gen, low, high, (m, n), dtype)
-        assert (gen != new_gen).any()
-        assert abs(np.mean(rands) - 7.5) < 1e-1
-        assert np.min(rands) >= 5.0
-        assert np.max(rands) <= 10.0
-
-
-@tvm.testing.parametrize_targets("llvm")
-def test_multinomial(target, dev):
-    def _verify_multinomial(size, num_samples, test_statistics=False):
-        gen = tvm.relay.random.threefry_key(np.random.randint(0, 1e5)).data.numpy()
-        probs = np.random.randint(low=-50, high=1000, size=size).astype("float32")
-        new_gen, indices = multinomial(target, dev, gen, probs, num_samples)
-        assert (gen != new_gen).any()
-        assert np.min(indices) >= 0
-        assert np.max(indices) < probs.shape[-1]
-        # Note, only use test_statistics with sample size > 10,000.
-        if test_statistics:
-            # Clipped and normalized probabilities * number of samples
-            # represents expected frequency of each category.
-            # First upcast to float64 to remove numerical error.
-            probs = probs.astype("float64")
-            probs = np.reshape(probs, [-1, probs.shape[-1]])
-            probs = np.maximum(probs, 0)
-            probs = probs / np.expand_dims(np.sum(probs, axis=-1), axis=-1)
-            # Multiply by number of samples and add epsilon to get non-zero expected samples per index.
-            expected_frequency = probs * num_samples + np.finfo(float).eps
-            # Do a small adjustment to make sure each row of expected_frequencies sums to exactly num_samples.
-            expected_frequency = (
-                np.expand_dims((num_samples / np.sum(expected_frequency, axis=-1)), axis=-1)
-                * expected_frequency
-            )
-            # Reduce shape to a 2D matrix.
-            indices = np.reshape(indices, [-1, indices.shape[-1]])
-            # Split indendent rows of indices.
-            index_list = [np.squeeze(x, 0) for x in np.split(indices, indices.shape[0], axis=0)]
-            # Count frequency of selected indices in each row.
-            observed_freqs = [np.bincount(samples, minlength=size[-1]) for samples in index_list]
-            # Stack observed frequencies back into a matrix.
-            observed_freqs = np.stack(observed_freqs, axis=0)
-            # Test how closely observed samples match expectations.
-            _, p_value = scipy.stats.chisquare(observed_freqs, expected_frequency, axis=-1)
-            # If sampled correctly, p_value should be greater than 1e-6 almost all the time.
-            assert np.all(p_value > 1e-6)
-
-    # Test simple 1-D case.
-    _verify_multinomial([3], 2)
-    # Test 2-D case.
-    _verify_multinomial([2, 10], 1)
-    # Test 3-D case.
-    _verify_multinomial([2, 3, 10], 4)
-    # Test large sample size statistics.
-    _verify_multinomial([3, 10], 10000, test_statistics=True)
-
-
-if __name__ == "__main__":
-    test_threefry_split(tvm.target.Target("llvm"), tvm.device("cpu"))
-    test_threefry_generate(tvm.target.Target("llvm"), tvm.device("cpu"))
-    test_threefry_wrapping(tvm.target.Target("llvm"), tvm.device("cpu"))
-    test_uniform(tvm.target.Target("llvm"), tvm.device("cpu"))
-    test_multinomial(tvm.target.Target("llvm"), tvm.device("cpu"))
diff --git a/tests/python/topi/test_topi_qnn.py b/tests/python/topi/test_topi_qnn.py
deleted file mode 100644
index 38212d970523..000000000000
--- a/tests/python/topi/test_topi_qnn.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for QNN operators."""
-import numpy as np
-import tvm
-from tvm import topi, relay, te
-from tvm.contrib import graph_executor
-import tvm.topi.testing
-
-
-def verify_simulated_quantize(data_shape, out_dtype, channels, axis):
-    # Create placeholder variables for all qnn inputs.
-    A = te.placeholder(data_shape, name="value", dtype="float32")
-    D = te.placeholder([], name="dtype", dtype="int32")
-    S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32")
-    Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32")
-    SIM_Q = topi.nn.simulated_quantize(A, D, output_scale=S, output_zero_point=Z, axis=axis)
-
-    # Create random numpy values to assign to inputs.
-    a_np = np.random.uniform(size=data_shape).astype("float32")
-    d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[out_dtype])
-    s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32")
-    z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32")
-    q_np = np.zeros(shape=data_shape, dtype="float32")
-
-    def check_target(target, dev):
-        # Wrap the numpy arrays in nd arrays.
-        a = tvm.nd.array(a_np, dev)
-        d = tvm.nd.array(d_np, dev)
-        s = tvm.nd.array(s_np, dev)
-        z = tvm.nd.array(z_np, dev)
-        q = tvm.nd.array(q_np, dev)
-
-        # Construct equivalent relay graph.
-        per_channel = channels[0] != 1
-        a_var = relay.var("a", shape=data_shape, dtype="float32")
-        if per_channel:
-            s_var = relay.const(s_np)
-            z_var = relay.const(z_np)
-        else:
-            s_var = relay.const(s_np[0])
-            z_var = relay.const(z_np[0])
-        real_q_op = relay.qnn.quantize(a_var, s_var, z_var, axis=axis, out_dtype=out_dtype)
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=target)
-
-        # Get real qnn quantize output.
-        m = graph_executor.GraphModule(lib["default"](dev))
-        m.set_input("a", a_np)
-
-        m.run()
-        real_q_out = m.get_output(0)
-
-        # Compile the simulated quantize function.
-        with tvm.target.Target(target):
-            sched = tvm.topi.testing.get_injective_schedule(target)(SIM_Q)
-        func = tvm.build(sched, [A, D, S, Z, SIM_Q], target, name="sim_quantize")
-        func(a, d, s, z, q)
-
-        # Check correctness against the true qnn output.
-        mismatch = q.numpy() != real_q_out.numpy().astype("float32")
-        # Allow some rounding errors due to GPU fp32 arithmetic.
-        assert np.sum(mismatch) <= 3
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-def test_simulated_quantize():
-    verify_simulated_quantize([1], "int8", [1], -1)
-    verify_simulated_quantize([2, 5], "int8", [5], 1)
-    verify_simulated_quantize([1, 32, 32, 32], "int8", [32], -1)
-    verify_simulated_quantize([1, 32, 32, 32], "uint8", [32], -2)
-    verify_simulated_quantize([2, 5], "int32", [5], 1)
-
-
-def verify_simulated_dequantize(data_shape, in_dtype, channels, axis):
-    # Create placeholder variables for all qnn inputs.
-    A = te.placeholder(data_shape, name="value", dtype="float32")
-    D = te.placeholder([], name="dtype", dtype="int32")
-    S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32")
-    Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32")
-    SIM_DQ = topi.nn.simulated_dequantize(A, D, input_scale=S, input_zero_point=Z, axis=axis)
-
-    # Create random numpy values to assign to inputs.
-    a_np = np.random.uniform(low=-128, high=127, size=data_shape).astype(in_dtype)
-    a_np_f = a_np.astype("float32")
-    d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[in_dtype])
-    s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32")
-    z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32")
-    dq_np = np.zeros(shape=data_shape, dtype="float32")
-
-    def check_target(target, dev):
-        # Wrap the numpy arrays in nd arrays.
-        a = tvm.nd.array(a_np_f, dev)
-        d = tvm.nd.array(d_np, dev)
-        s = tvm.nd.array(s_np, dev)
-        z = tvm.nd.array(z_np, dev)
-        dq = tvm.nd.array(dq_np, dev)
-
-        # Construct equivalent relay graph.
-        per_channel = channels[0] != 1
-        a_var = relay.var("a", shape=data_shape, dtype=in_dtype)
-        if per_channel:
-            s_var = relay.const(s_np)
-            z_var = relay.const(z_np)
-        else:
-            s_var = relay.const(s_np[0])
-            z_var = relay.const(z_np[0])
-        real_dq_op = relay.qnn.dequantize(a_var, s_var, z_var, axis=axis)
-        with tvm.transform.PassContext(opt_level=3):
-            lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=target)
-
-        # Get real qnn quantize output.
-        m = graph_executor.GraphModule(lib["default"](dev))
-        m.set_input("a", a_np)
-
-        m.run()
-        real_dq_out = m.get_output(0)
-
-        # Compile the simulated quantize function.
-        with tvm.target.Target(target):
-            sched = tvm.topi.testing.get_injective_schedule(target)(SIM_DQ)
-        func = tvm.build(sched, [A, D, S, Z, SIM_DQ], target, name="sim_quantize")
-        func(a, d, s, z, dq)
-
-        # Check correctness against the true qnn output.
-        tvm.testing.assert_allclose(dq.numpy(), real_dq_out.numpy().astype("float32"), rtol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-def test_simulated_dequantize():
-    verify_simulated_dequantize([1], "int8", [1], -1)
-    verify_simulated_dequantize([2, 5], "int8", [5], 1)
-    verify_simulated_dequantize([2, 5], "int8", [2], 0)
-    verify_simulated_dequantize([1, 32, 32, 32], "int8", [32], -1)
-    verify_simulated_dequantize([1, 32, 32, 32], "uint8", [32], -2)
-    verify_simulated_dequantize([2, 5], "int32", [5], 1)
-
-
-if __name__ == "__main__":
-    test_simulated_quantize()
-    test_simulated_dequantize()
diff --git a/tests/python/topi/test_topi_reduce.py b/tests/python/topi/test_topi_reduce.py
deleted file mode 100644
index b06654e7404a..000000000000
--- a/tests/python/topi/test_topi_reduce.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for reduce."""
-import os
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi, tir
-
-
-in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters(
-    ((32,), 0, False, "argmax", "float32"),
-    ((128, 24, 128, 24), (1, 2, 3), True, "sum", "float32"),
-    ((2, 3), None, True, "all", "bool"),
-    ((128, 24 * 128 * 24), (1,), False, "max", "float32"),
-    ((32, 128, 24), None, True, "sum", "float32"),
-    ((32, 128, 24), None, True, "all", "bool"),
-    ((128, 24, 128, 24), (0, 2), False, "min", "float32"),
-    ((32, 128), 1, True, "argmax", "float32"),
-    ((32, 24, 32, 24), 2, False, "argmin", "float32"),
-    ((31, 21, 15), None, True, "argmax", "float32"),
-    ((31, 21, 15), None, False, "sum", "float32"),
-    ((128, 24, 128, 24), (1, 2, 3), True, "sum", "float64"),
-    ((2, 3), None, True, "any", "bool"),
-    ((32, 128, 24), None, True, "any", "bool"),
-    ((1, 4, 7), 1, True, "any", "bool"),
-    ((128, 24, 128, 24), 2, False, "any", "bool"),
-    ((128, 24, 128, 24), 2, False, "sum", "bool"),
-    ((128, 24, 128, 24), 0, True, "sum", "bool"),
-    ((3, 4, 5), None, False, "prod", "float32"),
-    ((3, 4, 5), (2,), False, "prod", "float32"),
-    ((3, 4, 5), (1, 2), True, "prod", "float32"),
-    ((3, 4, 5), (), False, "sum", "float32"),
-    ((3, 4, 5), (), True, "sum", "float32"),
-    ((3, 4, 5), (0, 1, 2), False, "sum", "float32"),
-    ((3, 4, 5), (0, 1, 2), True, "sum", "float32"),
-    ((3, 4, 5), (), False, "prod", "float32"),
-    ((3, 4, 5), (), True, "prod", "float32"),
-    ((3, 4, 5), (0, 1, 2), False, "prod", "float32"),
-    ((3, 4, 5), (0, 1, 2), True, "prod", "float32"),
-    ((3, 4, 5), (), False, "min", "float32"),
-    ((3, 4, 5), (), True, "min", "float32"),
-    ((3, 4, 5), (0, 1, 2), False, "min", "float32"),
-    ((3, 4, 5), (0, 1, 2), True, "min", "float32"),
-    ((3, 4, 5), (), False, "max", "float32"),
-    ((3, 4, 5), (), True, "max", "float32"),
-    ((3, 4, 5), (0, 1, 2), False, "max", "float32"),
-    ((3, 4, 5), (0, 1, 2), True, "max", "float32"),
-    ((3, 4, 5), (), False, "any", "bool"),
-    ((3, 4, 5), (), True, "any", "bool"),
-    ((3, 4, 5), (0, 1, 2), False, "any", "bool"),
-    ((3, 4, 5), (0, 1, 2), True, "any", "bool"),
-    ((3, 4, 5), (), False, "all", "bool"),
-    ((3, 4, 5), (), True, "all", "bool"),
-    ((3, 4, 5), (0, 1, 2), False, "all", "bool"),
-    ((3, 4, 5), (0, 1, 2), True, "all", "bool"),
-)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(in_shape, axis, keepdims, reduce_type, dtype):
-    # Test
-    if dtype == "bool":
-        in_npy_map = in_npy = np.random.choice([True, False], size=in_shape)
-    else:
-        in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
-        in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
-
-    if reduce_type == "sum":
-        if dtype == "bool":
-            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims, dtype="bool")
-        else:
-            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
-    elif reduce_type == "prod":
-        out_npy = in_npy_map.prod(axis=axis, keepdims=keepdims)
-    elif reduce_type == "all" and dtype == "bool":
-        out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
-    elif reduce_type == "any" and dtype == "bool":
-        out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
-    elif reduce_type == "max":
-        out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
-    elif reduce_type == "min":
-        out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
-    elif reduce_type == "argmax":
-        out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
-    elif reduce_type == "argmin":
-        out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
-    else:
-        raise NotImplementedError
-
-    return in_npy, in_npy_map, out_npy
-
-
-def _my_npy_argmax(arr, axis, keepdims):
-    if not keepdims:
-        return arr.argmax(axis=axis)
-    else:
-        if axis is None:
-            out_shape = [1 for _ in arr.shape]
-        else:
-            out_shape = list(arr.shape)
-            out_shape[axis] = 1
-
-        return arr.argmax(axis=axis).reshape(out_shape)
-
-
-def _my_npy_argmin(arr, axis, keepdims):
-    if not keepdims:
-        return arr.argmin(axis=axis)
-    else:
-        if axis is None:
-            out_shape = [1 for _ in arr.shape]
-        else:
-            out_shape = list(arr.shape)
-            out_shape[axis] = 1
-        return arr.argmin(axis=axis).reshape(out_shape)
-
-
-def test_reduce_map(target, dev, ref_data, in_shape, axis, keepdims, reduce_type, dtype):
-    target = tvm.target.Target(target)
-    if target.kind.name == "vulkan" and reduce_type in ["sum", "prod", "any", "all"]:
-        pytest.xfail(f"Vulkan backend has known errors on {reduce_type}")
-
-    in_npy, in_npy_map, out_npy = ref_data
-
-    # Build the logic and compile the function
-    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
-    A1 = topi.sqrt(topi.exp(A))
-    out_dtype = dtype
-    if reduce_type == "sum":
-        if dtype == "bool":
-            B = topi.sum(A, axis=axis, keepdims=keepdims)
-        else:
-            B = topi.sum(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "prod":
-        B = topi.prod(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "all":
-        B = topi.all(A, axis=axis, keepdims=keepdims)
-    elif reduce_type == "any":
-        B = topi.any(A, axis=axis, keepdims=keepdims)
-    elif reduce_type == "max":
-        B = topi.max(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "min":
-        B = topi.min(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "argmax":
-        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
-        out_dtype = "int32"
-    elif reduce_type == "argmin":
-        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
-        out_dtype = "int32"
-    else:
-        raise NotImplementedError
-
-    with tvm.target.Target(target):
-        s = tvm.topi.testing.get_reduce_schedule(target)(B)
-
-    foo = tvm.build(s, [A, B], target, name=reduce_type)
-
-    data_tvm = tvm.nd.array(in_npy, device=dev)
-    out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype)
-    foo(data_tvm, out_tvm)
-
-    if reduce_type == "argmax" or reduce_type == "argmin":
-        out_tvm_indices = out_tvm.numpy()
-        if keepdims:
-            out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
-        if axis is None:
-            out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
-        else:
-            other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1) :]))
-            sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
-            out_tvm_val = in_npy_map[sel_indices]
-        if reduce_type == "argmax":
-            tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3)
-        elif reduce_type == "argmin":
-            tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3)
-    else:
-        tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)
-
-
-def test_complex_reduce(target, dev):
-    in_shape = (2, 3)
-    dtype = "float32"
-    axis = 0
-    keepdims = False
-    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
-    B = topi.sum(A, axis=axis, keepdims=keepdims)
-    C = topi.add(B, B)
-    D = topi.multiply(B, B)
-    E = topi.add(C, D)
-
-    with tvm.target.Target(target):
-        s = tvm.topi.testing.get_reduce_schedule(target)(E)
-    foo = tvm.build(s, [A, E], target, name="sum")
-
-    in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
-    sum_npy = in_npy.sum(axis=axis, keepdims=keepdims)
-    out_npy = sum_npy * 2 + sum_npy * sum_npy
-
-    data_tvm = tvm.nd.array(in_npy, device=dev)
-    out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=dtype)
-    foo(data_tvm, out_tvm)
-    tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)
-
-
-n = tir.Var("n", "int32")
-m = tir.Var("m", "int32")
-true_value_map = {n: 3, m: 5}
-
-data_shape, target_shape = tvm.testing.parameters(
-    ((2, 3), (3,)),
-    ((2, 3, 4), (2, 1, 4)),
-    ((2, 3, 4, 5), (3, 1, 5)),
-    ((2, n, 4, m), (n, 1, m)),
-)
-
-
-def _my_npy_collapse_sum(data, target_shape):
-    reduce_axes = []
-    i = data.ndim - 1
-    j = len(target_shape) - 1
-    while i >= 0:
-        if j < 0:
-            reduce_axes.append(i)
-        elif target_shape[j] == 1 and data.shape[i] > 1:
-            reduce_axes.append(i)
-        i -= 1
-        j -= 1
-    return np.sum(data, tuple(reduce_axes)).reshape(target_shape)
-
-
-def test_collapse_sum(data_shape, target_shape):
-    A = te.placeholder(data_shape, name="A")
-    B = topi.collapse_sum(A, target_shape)
-    s = te.create_schedule([B.op])
-
-    data_shape_const = [int(s) if s not in true_value_map else true_value_map[s] for s in A.shape]
-    target_shape_const = [
-        int(s) if s not in true_value_map else true_value_map[s] for s in target_shape
-    ]
-    a_np = np.random.uniform(size=data_shape_const).astype(A.dtype)
-    b_np = _my_npy_collapse_sum(a_np, target_shape_const)
-    dev = tvm.cpu(0)
-    a = tvm.nd.array(a_np, dev)
-    B_shape_const = [int(s) if s not in true_value_map else true_value_map[s] for s in B.shape]
-    b = tvm.nd.array(np.zeros(B_shape_const, dtype=B.dtype), dev)
-    # Building with the CSE pass disabled
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        foo = tvm.build(s, [A, B], "llvm", name="collapse_sum")
-    foo(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_relu.py b/tests/python/topi/test_topi_relu.py
deleted file mode 100644
index 948835068902..000000000000
--- a/tests/python/topi/test_topi_relu.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for relu activation"""
-import sys
-import os
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-from tvm.contrib.nvcc import have_fp16
-
-import pytest
-import tvm.testing
-
-
-m, n, dtype = tvm.testing.parameters(
-    (10, 128, "float32"),
-    (128, 64, "float16"),
-    # Commented due to weird killed
-    # (1024 * 100, 512, "float32"),
-)
-
-
-def test_relu(target, dev, m, n, dtype):
-    A = te.placeholder((m, n), name="A", dtype=dtype)
-    B = topi.nn.relu(A)
-
-    a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
-    b_np = a_np * (a_np > 0)
-
-    if dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version):
-        pytest.skip("Skip because %s does not have fp16 support" % target)
-
-    print("Running on target: %s" % target)
-    with tvm.target.Target(target):
-        s = tvm.topi.testing.get_elemwise_schedule(target)(B)
-
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    # Building with the CSE pass disabled
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        foo = tvm.build(s, [A, B], target, name="relu")
-    foo(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-size, alpha = tvm.testing.parameters((100, 0.1))
-
-
-def test_leaky_relu(size, alpha):
-    A = te.placeholder((size,), name="A")
-    B = topi.nn.leaky_relu(A, alpha)
-    s = te.create_schedule([B.op])
-
-    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
-    b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha
-    dev = tvm.cpu(0)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    # Building with the CSE pass disabled
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
-    foo(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-x, w, axis, weight_reshape = tvm.testing.parameters(
-    ((1, 3, 2, 2), (3,), 1, (3, 1, 1)),
-    ((1, 3, 2, 2), (2,), 2, (2, 1)),
-    ((1, 3), (3,), 1, (3,)),
-)
-
-
-def test_prelu(x, w, axis, weight_reshape):
-    X = te.placeholder((x), name="X")
-    W = te.placeholder((w), name="W")
-    x_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(X.shape)).astype(X.dtype)
-    w_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(W.shape)).astype(W.dtype)
-
-    def _prelu_numpy(x, W):
-        return (x < 0) * (x * W.reshape(weight_reshape)) + (x >= 0) * x
-
-    B = topi.nn.prelu(X, W, axis)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.cpu(0)
-    x_tvm = tvm.nd.array(x_np, dev)
-    w_tvm = tvm.nd.array(w_np, dev)
-
-    b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), dev)
-    # Building with the CSE pass disabled
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
-    foo(x_tvm, w_tvm, b)
-    out_np = _prelu_numpy(x_np, w_np)
-    tvm.testing.assert_allclose(b.numpy(), out_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_reorg.py b/tests/python/topi/test_topi_reorg.py
deleted file mode 100644
index f41b4b740bec..000000000000
--- a/tests/python/topi/test_topi_reorg.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Example code to do reorg."""
-import numpy as np
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-import tvm
-from tvm import te
-import tvm.topi.testing
-import tvm.testing
-
-_reorg_schedule = {
-    "generic": topi.generic.schedule_reorg,
-    "gpu": topi.cuda.schedule_reorg,
-}
-
-
-def verify_reorg(batch, in_size, in_channel, stride):
-    """Verify reorg operator by comparing outputs from tvm and numpy implementation"""
-    in_height = in_width = in_size
-
-    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-    B = topi.vision.reorg(A, stride)
-
-    a_shape = get_const_tuple(A.shape)
-    dtype = A.dtype
-
-    def get_ref_data_reorg():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        b_np = tvm.topi.testing.reorg_python(a_np, stride)
-        return a_np, b_np
-
-    a_np, b_np = get_ref_data_reorg()
-
-    def check_device(device):
-        """Cheching devices is enabled or not"""
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s_func = tvm.topi.testing.dispatch(device, _reorg_schedule)
-            s = s_func([B])
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        func = tvm.build(s, [A, B], device)
-        func(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-    for device in ["llvm", "cuda"]:
-        check_device(device)
-
-
-@tvm.testing.uses_gpu
-def test_reorg():
-    verify_reorg(1, 20, 8, 2)
-
-
-if __name__ == "__main__":
-    test_reorg()
diff --git a/tests/python/topi/test_topi_rms_norm.py b/tests/python/topi/test_topi_rms_norm.py
deleted file mode 100644
index c8c1b8795f2d..000000000000
--- a/tests/python/topi/test_topi_rms_norm.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for rms_norm."""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import topi
-from tvm.topi.utils import get_const_tuple
-import tvm.topi.testing
-
-import tvm.testing
-
-
-_rms_norm_schedule = {
-    "generic": topi.generic.schedule_injective,
-}
-
-
-# only test on llvm because schedule is missing
-@tvm.testing.parametrize_targets("llvm")
-@pytest.mark.parametrize(
-    "shape,axis",
-    [([4, 16], (1,)), ([4, 16, 16], (1, 2)), ([("a", 4), ("b", 16)], (1,)), ([2, 8192], (1,))],
-)
-@pytest.mark.parametrize("dtype", ["float32", "float16"])
-def test_rms_norm(target, dev, shape, axis, dtype, episilon=1e-5, rtol=5e-3, atol=1e-4):
-    shape_te = [te.var(v[0]) if isinstance(v, tuple) else v for v in shape]
-    scale_shape_te = [shape_te[dim] for dim in axis]
-    data = te.placeholder(shape_te, dtype=dtype, name="data")
-    weight = te.placeholder(scale_shape_te, dtype=dtype, name="weight")
-    B = topi.nn.rms_norm(data, weight, axis, episilon)
-
-    shape_np = [v[1] if isinstance(v, tuple) else v for v in shape]
-    scale_shape_np = [shape_np[dim] for dim in axis]
-    data_np = np.random.uniform(size=shape_np).astype(dtype)
-    weight_np = np.random.uniform(size=scale_shape_np).astype(dtype)
-    b_np = tvm.topi.testing.rms_norm_python(data_np, weight_np, axis, episilon)
-
-    with tvm.target.Target(target):
-        s_func = tvm.topi.testing.dispatch(target, _rms_norm_schedule)
-        s = s_func([B])
-    data_tvm = tvm.nd.array(data_np, dev)
-    weight_tvm = tvm.nd.array(weight_np, dev)
-    b_tvm = tvm.nd.array(np.zeros(shape_np, dtype=dtype), dev)
-    f = tvm.build(s, [data, weight, B], target)
-    f(data_tvm, weight_tvm, b_tvm)
-    tvm.testing.assert_allclose(b_tvm.numpy(), b_np, rtol=rtol, atol=atol)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_scan.py b/tests/python/topi/test_topi_scan.py
deleted file mode 100644
index cd77a1ccfbce..000000000000
--- a/tests/python/topi/test_topi_scan.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from typing import Callable
-
-import numpy as np
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import topi
-
-topi_funcs = {
-    "cumsum": {"generic": topi.cumsum, "cuda": topi.cuda.cumsum},
-    "cumprod": {"generic": topi.cumprod, "cuda": topi.cuda.cumprod},
-}
-
-identity_value = {"cumsum": 0, "cumprod": 1}
-
-
-def get_implementations(name, axis, dtype, exclusive):
-    topi_func_generic = topi_funcs[name]["generic"]
-    topi_func_cuda = topi_funcs[name]["cuda"]
-
-    return {
-        "generic": (
-            lambda x: topi_func_generic(x, axis, dtype, exclusive=exclusive),
-            topi.generic.schedule_extern,
-        ),
-        "cuda": (
-            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
-            topi.cuda.schedule_scan,
-        ),
-        "nvptx": (
-            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
-            topi.cuda.schedule_scan,
-        ),
-        "vulkan": (
-            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
-            topi.cuda.schedule_scan,
-        ),
-        "metal": (
-            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
-            topi.cuda.schedule_scan,
-        ),
-    }
-
-
-def _run_tests(
-    dev,
-    target,
-    op_name: str = "cumsum",
-    gt_func: Callable[..., np.array] = np.cumsum,
-):
-    def check_scan(np_ref, data, axis=None, dtype=None, exclusive=False):
-        implementations = get_implementations(op_name, axis, dtype, exclusive)
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, dev, fcompute, fschedule)
-
-    data = np.array([2, 3, 0])
-    check_scan(gt_func(data), data)
-
-    data = np.random.rand(10) > 0.5
-    data = data.astype(np.int32)
-    check_scan(gt_func(data, dtype=np.int32), data)
-    check_scan(gt_func(data), data, dtype="int64")
-
-    data = np.random.rand(10) > 0.5
-    check_scan(gt_func(data, dtype=np.int32), data, dtype="int32")
-
-    for in_dtype in ["float32", "float64"]:
-        if target == "metal" and in_dtype == "float64":
-            # float64 is not supported in metal
-            continue
-        data = np.random.randn(10, 10).astype(in_dtype)
-        check_scan(gt_func(data), data)
-        check_scan(gt_func(data, axis=0), data, axis=0)
-        check_scan(gt_func(data, axis=1), data, axis=1)
-
-        data = np.random.randn(10, 5, 10).astype(in_dtype)
-        check_scan(gt_func(data), data)
-        check_scan(gt_func(data, axis=0), data, axis=0)
-        check_scan(gt_func(data, axis=1), data, axis=1)
-        check_scan(gt_func(data, axis=-1), data, axis=-1)
-
-    for in_dtype in ["int32", "int64"]:
-        data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype)
-        check_scan(gt_func(data, dtype=in_dtype), data)
-        check_scan(gt_func(data), data, dtype="int64")
-        check_scan(gt_func(data, axis=0, dtype=in_dtype), data, axis=0)
-        check_scan(gt_func(data, axis=1, dtype=in_dtype), data, axis=1)
-
-        data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype)
-        check_scan(gt_func(data), data, dtype="int64")
-
-    data = np.random.randint(-100, 100, size=(100, 100)).astype("int64")
-
-    expected_result = np.roll(gt_func(data), 1)
-    expected_result[0] = identity_value[op_name]
-    check_scan(expected_result, data, dtype="int64", exclusive=True)
-
-    expected_result = np.roll(gt_func(data, axis=0, dtype=in_dtype), 1, axis=0)
-    expected_result[0, :] = identity_value[op_name]
-    check_scan(expected_result, data, axis=0, exclusive=True)
-
-    expected_result = np.roll(gt_func(data, axis=1, dtype=in_dtype), 1, axis=1)
-    expected_result[:, 0] = identity_value[op_name]
-    check_scan(gt_func(data, axis=1, dtype=in_dtype), data, axis=1)
-
-
-@tvm.testing.parametrize_targets
-def test_cumsum(dev, target):
-    _run_tests(dev, target, op_name="cumsum", gt_func=np.cumsum)
-
-
-@tvm.testing.parametrize_targets
-def test_cumprod(dev, target):
-    _run_tests(dev, target, op_name="cumprod", gt_func=np.cumprod)
-
-
-if __name__ == "__main__":
-    test_cumsum(tvm.device("cpu"), tvm.target.Target("llvm"))
-    test_cumsum(tvm.device("cuda"), tvm.target.Target("cuda"))
-    test_cumsum(tvm.device("nvptx"), tvm.target.Target("nvptx"))
-    test_cumsum(tvm.device("vulkan"), tvm.target.Target("vulkan"))
-    test_cumsum(tvm.device("metal"), tvm.target.Target("metal"))
-
-    test_cumprod(tvm.device("cpu"), tvm.target.Target("llvm"))
-    test_cumprod(tvm.device("cuda"), tvm.target.Target("cuda"))
-    test_cumprod(tvm.device("nvptx"), tvm.target.Target("nvptx"))
-    test_cumprod(tvm.device("vulkan"), tvm.target.Target("vulkan"))
-    test_cumprod(tvm.device("metal"), tvm.target.Target("metal"))
diff --git a/tests/python/topi/test_topi_scatter.py b/tests/python/topi/test_topi_scatter.py
deleted file mode 100644
index ccc34837a05a..000000000000
--- a/tests/python/topi/test_topi_scatter.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import topi
-import tvm.topi.testing
-
-
-@tvm.testing.parametrize_targets
-def test_scatter_nd(dev, target):
-    def check_scatter_nd(data, indices, updates, out, mode="add"):
-        implementations = {
-            "generic": (
-                lambda x, y, z: topi.scatter_nd(x, y, z, mode),
-                topi.generic.schedule_extern,
-            ),
-            "gpu": (
-                lambda x, y, z: topi.cuda.scatter_nd(x, y, z, mode),
-                topi.generic.schedule_extern,
-            ),
-        }
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm.topi.testing.compare_numpy_tvm(
-            [data, indices, updates], out, target, dev, fcompute, fschedule
-        )
-
-    data = np.zeros((2, 2)).astype("int64")
-    indices = np.array([[1, 1, 0], [0, 1, 0]])
-    updates = np.array([2, 3, 0])
-    out = np.array([[0, 0], [2, 3]])
-    check_scatter_nd(data, indices, updates, out)
-
-    data = np.zeros((2, 2, 2, 2)).astype("int64")
-    indices = np.array([[0, 1], [1, 1]])
-    updates = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
-    out = np.array([[[[0, 0], [0, 0]], [[1, 2], [3, 4]]], [[[0, 0], [0, 0]], [[5, 6], [7, 8]]]])
-    check_scatter_nd(data, indices, updates, out)
-
-    indices = np.array([[1, 0, 0]])
-    updates = np.reshape(np.arange(1560 * 3), (3, 1560)).astype("float32")
-    shape = (2, 1560)
-    data = np.zeros(shape).astype("float32")
-    out = data.copy()
-    out[1, :] += updates[0, :]
-    out[0, :] += updates[1, :]
-    out[0, :] += updates[2, :]
-    check_scatter_nd(data, indices, updates, out)
-
-    for mode in ["update", "add", "mul", "min", "max"]:
-        updates = np.ones((5, 3)).astype("float64")
-        indices = np.stack((np.random.randint(2, size=5), np.random.randint(7, size=5))).astype(
-            "int64"
-        )
-        shape = (2, 7, 3)
-        data = np.random.random(shape).astype("float64")
-        out = data.copy()
-        for i in range(indices.shape[1]):
-            for j in range(updates.shape[1]):
-                if mode == "update":
-                    out[indices[0, i], indices[1, i], j] = updates[i, j]
-                elif mode == "add":
-                    out[indices[0, i], indices[1, i], j] += updates[i, j]
-                elif mode == "mul":
-                    out[indices[0, i], indices[1, i], j] *= updates[i, j]
-                elif mode == "min":
-                    out[indices[0, i], indices[1, i], j] = min(
-                        out[indices[0, i], indices[1, i], j], updates[i, j]
-                    )
-                elif mode == "max":
-                    out[indices[0, i], indices[1, i], j] = max(
-                        out[indices[0, i], indices[1, i], j], updates[i, j]
-                    )
-
-        check_scatter_nd(data, indices, updates, out, mode)
-
-
-if __name__ == "__main__":
-    test_scatter_nd(tvm.device("cpu"), tvm.target.Target("llvm"))
diff --git a/tests/python/topi/test_topi_searchsorted.py b/tests/python/topi/test_topi_searchsorted.py
deleted file mode 100644
index 7b3976b7eb74..000000000000
--- a/tests/python/topi/test_topi_searchsorted.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm.topi.testing import searchsorted_ref
-from tvm import te, topi
-
-topi_funcs = {"generic": topi.searchsorted, "cuda": topi.cuda.searchsorted}
-
-
-def get_implementations():
-    topi_func_generic = topi_funcs["generic"]
-    topi_func_cuda = topi_funcs["cuda"]
-
-    return {
-        "generic": (
-            lambda x, y, side, out_dtype: topi_func_generic(x, y, side, out_dtype),
-            topi.generic.schedule_extern,
-        ),
-        "cuda": (
-            lambda x, y, side, out_dtype: topi_func_cuda(x, y, side, out_dtype),
-            topi.cuda.schedule_extern,
-        ),
-        "vulkan": (
-            lambda x, y, side, out_dtype: topi_func_cuda(x, y, side, out_dtype),
-            topi.cuda.schedule_extern,
-        ),
-    }
-
-
-@tvm.testing.parametrize_targets
-def test_searchsorted(dev, target):
-    def verify_with_input(sorted_sequence_np, values_np, right):
-        sorted_sequence = te.placeholder(sorted_sequence_np.shape, dtype="float32")
-        values = te.placeholder(values_np.shape, dtype="float32")
-        out_dtype = "int32"
-        implementations = get_implementations()
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-
-        with tvm.target.Target(target):
-            indices = fcompute(sorted_sequence, values, right, out_dtype)
-            s = fschedule([indices])
-
-        func = tvm.build(s, [sorted_sequence, values, indices], target=target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(sorted_sequence_np, dev)
-        b = tvm.nd.array(values_np, dev)
-        c = tvm.nd.array(np.zeros(values_np.shape, dtype=indices.dtype), dev)
-        func(a, b, c)
-        ref = searchsorted_ref(sorted_sequence_np, values_np, right, out_dtype)
-        np.testing.assert_equal(c.numpy(), ref)
-
-    def verify(sequence_len, num_search, outer_axes, right, sorted_sequence_1d=False):
-        if sorted_sequence_1d:
-            sorted_sequence_shape = (sequence_len,)
-        else:
-            sorted_sequence_shape = outer_axes + (sequence_len,)
-        values_shape = outer_axes + (num_search,)
-
-        verify_with_input(
-            np.sort(np.random.randn(*sorted_sequence_shape).astype("float32"), axis=-1),
-            np.random.randn(*values_shape).astype("float32"),
-            right,
-        )
-
-    verify(1024, 1000, (10, 5, 3), False)
-    verify(999, 2000, (10, 5, 3), True)
-    verify(1000, 1000, (), False)
-    verify(2001, 100, (500,), True)
-    verify(2001, 100, (500,), False, sorted_sequence_1d=True)
-
-    # Check edge cases
-    for right in [True, False]:
-        sorted_sequence = np.array([1, 2, 3, 4, 5], dtype="float32")
-        verify_with_input(sorted_sequence, np.array([6], dtype="float32"), right)
-        verify_with_input(sorted_sequence, np.array([0], dtype="float32"), right)
diff --git a/tests/python/topi/test_topi_softmax.py b/tests/python/topi/test_topi_softmax.py
deleted file mode 100644
index 8e5e039b1448..000000000000
--- a/tests/python/topi/test_topi_softmax.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for softmax"""
-import logging
-import os
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-from tvm import te, topi
-from tvm.topi.utils import get_const_tuple
-
-
-_softmax_schedule = {
-    "generic": topi.generic.schedule_softmax,
-    "cpu": topi.x86.schedule_softmax,
-    "gpu": topi.cuda.schedule_softmax,
-    "hls": topi.hls.schedule_softmax,
-}
-
-
-dtype = tvm.testing.parameter("float32", "float64")
-
-
-configs = {
-    "softmax": {
-        "topi": topi.nn.softmax,
-        "ref": tvm.topi.testing.softmax_python,
-        "dimensions": [1, 2, 4],
-        "axis": [0, 1, 2, 3],
-    },
-    "log_softmax": {
-        "topi": topi.nn.log_softmax,
-        "ref": tvm.topi.testing.log_softmax_python,
-        "dimensions": [2, 3],
-        "axis": [1],
-    },
-}
-shapes = [(32, 10), (3, 4), (1, 16, 256, 256), (32,)]
-softmax_operation, shape, axis = tvm.testing.parameters(
-    *[
-        (name, shape, axis)
-        for name, config in configs.items()
-        for shape in shapes
-        if len(shape) in config["dimensions"]
-        for axis in range(len(shape))
-        if axis in config["axis"]
-    ]
-)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(shape, dtype, softmax_operation, axis):
-    ref_func = configs[softmax_operation]["ref"]
-
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    perm = list(range(a_np.ndim))
-    perm[-1], perm[axis] = perm[axis], perm[-1]
-    trans_shape = [a_np.shape[i] for i in perm]
-    a_np_2d = a_np.transpose(perm).reshape(-1, trans_shape[-1])
-    b_np_2d = ref_func(a_np_2d)
-    b_np = b_np_2d.reshape(*trans_shape).transpose(perm)
-
-    return a_np, b_np
-
-
-def test_softmax(target, dev, shape, dtype, ref_data, softmax_operation, axis):
-    target = tvm.target.Target(target)
-    if target.kind.name == "vulkan" and dtype == "float64":
-        # https://www.khronos.org/registry/SPIR-V/specs/1.0/GLSL.std.450.html
-        pytest.xfail("Vulkan GLSL.std.450 does not support 64-bit floats")
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-
-    topi_op = configs[softmax_operation]["topi"]
-    B = topi_op(A, axis=axis)
-
-    with tvm.target.Target(target):
-        fschedule = tvm.topi.testing.dispatch(target, _softmax_schedule)
-        s = fschedule(B)
-
-    a_np, b_np = ref_data
-
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    f = tvm.build(s, [A, B], target)
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_sort.py b/tests/python/topi/test_topi_sort.py
deleted file mode 100644
index a23b4566a2da..000000000000
--- a/tests/python/topi/test_topi_sort.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for vision package"""
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-
-_sort_implement = {
-    "generic": (topi.sort, topi.generic.schedule_sort),
-    "gpu": (topi.cuda.sort, topi.cuda.schedule_sort),
-}
-
-_argsort_implement = {
-    "generic": (topi.argsort, topi.generic.schedule_argsort),
-    "gpu": (topi.cuda.argsort, topi.cuda.schedule_argsort),
-}
-
-_topk_implement = {
-    "generic": (topi.topk, topi.generic.schedule_topk),
-    "gpu": (topi.cuda.topk, topi.cuda.schedule_topk),
-}
-
-axis = tvm.testing.parameter(0, -1, 1)
-is_ascend = tvm.testing.parameter(True, False, ids=["is_ascend", "not_ascend"])
-dtype = tvm.testing.parameter("int64", "float32")
-
-topk = tvm.testing.parameter(0, 1, 5)
-topk_ret_type = tvm.testing.parameter("values", "indices", "both")
-
-
-def test_sort(target, dev, axis, is_ascend):
-    np.random.seed(0)
-
-    dshape = (20, 100)
-    data_dtype = "float32"
-    data = te.placeholder(dshape, name="data", dtype=data_dtype)
-
-    perm = np.arange(dshape[0] * dshape[1], dtype=data_dtype)
-    np.random.shuffle(perm)
-    np_data = perm.reshape(dshape)
-
-    if is_ascend:
-        np_sort = np.sort(np_data, axis=axis)
-    else:
-        np_sort = -np.sort(-np_data, axis=axis)
-
-    if axis == 0:
-        np_sort = np_sort[: dshape[axis], :]
-    else:
-        np_sort = np_sort[:, : dshape[axis]]
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _sort_implement)
-        out = fcompute(data, axis=axis, is_ascend=is_ascend)
-        s = fschedule(out)
-
-    tvm_data = tvm.nd.array(np_data, dev)
-    tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev)
-    f = tvm.build(s, [data, out], target)
-    f(tvm_data, tvm_out)
-    tvm.testing.assert_allclose(tvm_out.numpy(), np_sort, rtol=1e0)
-
-
-def test_argsort(target, dev, axis, is_ascend):
-    dshape = (20, 100)
-    data_dtype = "float32"
-    data = te.placeholder(dshape, name="data", dtype=data_dtype)
-
-    perm = np.arange(dshape[0] * dshape[1], dtype=data_dtype)
-    np.random.shuffle(perm)
-    np_data = perm.reshape(dshape)
-
-    if is_ascend:
-        np_indices = np.argsort(np_data, axis=axis)
-    else:
-        np_indices = np.argsort(-np_data, axis=axis)
-
-    if axis == 0:
-        np_indices = np_indices[: dshape[axis], :]
-    else:
-        np_indices = np_indices[:, : dshape[axis]]
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _argsort_implement)
-        out = fcompute(data, axis=axis, is_ascend=is_ascend)
-        s = fschedule(out)
-
-    tvm_data = tvm.nd.array(np_data, dev)
-    tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev)
-    f = tvm.build(s, [data, out], target)
-    f(tvm_data, tvm_out)
-    tvm.testing.assert_allclose(tvm_out.numpy(), np_indices.astype(data_dtype), rtol=1e0)
-
-
-def test_topk(target, dev, topk, axis, topk_ret_type, is_ascend, dtype):
-    np.random.seed(0)
-
-    shape = (20, 100)
-    data_dtype = "float32"
-    data = te.placeholder(shape, name="data", dtype=data_dtype)
-
-    np_data = np.random.uniform(size=shape).astype(data_dtype)
-    if is_ascend:
-        np_indices = np.argsort(np_data, axis=axis)
-    else:
-        np_indices = np.argsort(-np_data, axis=axis)
-    kk = topk if topk >= 1 else shape[axis]
-    if axis == 0:
-        np_indices = np_indices[:kk, :]
-        np_values = np.zeros(np_indices.shape).astype(data_dtype)
-        for i in range(shape[1]):
-            np_values[:, i] = np_data[np_indices[:, i], i]
-    else:
-        np_indices = np_indices[:, :kk]
-        np_values = np.zeros(np_indices.shape).astype(data_dtype)
-        for i in range(shape[0]):
-            np_values[i, :] = np_data[i, np_indices[i, :]]
-    np_indices = np_indices.astype(dtype)
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _topk_implement)
-        outs = fcompute(data, topk, axis, topk_ret_type, is_ascend, dtype)
-        outs = outs if isinstance(outs, list) else [outs]
-        s = fschedule(outs)
-    tvm_data = tvm.nd.array(np_data, dev)
-    tvm_res = []
-    for t in outs:
-        tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, device=dev))
-    f = tvm.build(s, [data] + outs, target)
-    f(tvm_data, *tvm_res)
-    if topk_ret_type == "both":
-        tvm.testing.assert_allclose(tvm_res[0].numpy(), np_values)
-        tvm.testing.assert_allclose(tvm_res[1].numpy(), np_indices)
-    elif topk_ret_type == "values":
-        tvm.testing.assert_allclose(tvm_res[0].numpy(), np_values)
-    else:
-        tvm.testing.assert_allclose(tvm_res[0].numpy(), np_indices)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_space_to_batch_nd.py b/tests/python/topi/test_topi_space_to_batch_nd.py
deleted file mode 100644
index 039f91aa059e..000000000000
--- a/tests/python/topi/test_topi_space_to_batch_nd.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for space to batch"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-
-
-def verify_space_to_batch_nd(input_shape, block_shape, pad_before, pad_after, pad_value=0):
-    out_shape = []
-    out_shape.append(int((input_shape[0] * np.prod(block_shape))))
-    for i in range(1, len(block_shape) + 1):
-        pad = pad_before[i - 1] + pad_after[i - 1]
-        out_shape.append(int((input_shape[i] + pad) // block_shape[i - 1]))
-    for i in range(len(block_shape) + 1, len(input_shape)):
-        out_shape.append(input_shape[i])
-
-    A = te.placeholder(input_shape, name="A", dtype="float32")
-    dtype = A.dtype
-    a_np = np.random.uniform(size=input_shape).astype(dtype)
-
-    B = topi.nn.space_to_batch_nd(A, block_shape, pad_before, pad_after, pad_value)
-
-    b_np = tvm.topi.testing.space_to_batch_nd_python(
-        a_np, block_shape, pad_before, pad_after, pad_value
-    )
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.create(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_space_to_batch():
-    # Without paddings
-    verify_space_to_batch_nd([3, 3, 2, 1], [3], [0], [0])
-    # With paddings
-    verify_space_to_batch_nd([3, 3, 2, 1], [3], [1], [2])
-    # Multiple spatial dims
-    verify_space_to_batch_nd([3, 3, 4, 5, 2], [3, 4, 2], [1, 0, 3], [2, 0, 0])
-    # No remaining dims
-    verify_space_to_batch_nd([3, 3, 4, 5, 2], [3, 4, 2, 2], [1, 4, 0, 0], [2, 0, 1, 0])
-
-
-if __name__ == "__main__":
-    test_space_to_batch()
diff --git a/tests/python/topi/test_topi_space_to_depth.py b/tests/python/topi/test_topi_space_to_depth.py
deleted file mode 100644
index ddd7daf4237d..000000000000
--- a/tests/python/topi/test_topi_space_to_depth.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for space to depth"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-
-
-def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, layout="NCHW"):
-    out_channel = int(in_channel * (block_size * block_size))
-    out_height = int(in_height / block_size)
-    out_width = int(in_width / block_size)
-
-    if layout == "NCHW":
-        in_shape = [batch, in_channel, in_height, in_width]
-        out_shape = [batch, out_channel, out_height, out_width]
-    elif layout == "NHWC":
-        in_shape = [batch, in_height, in_width, in_channel]
-        out_shape = [batch, out_height, out_width, out_channel]
-    else:
-        raise NotImplementedError("Layout not supported {}".format(layout))
-
-    A = te.placeholder(in_shape, name="A", dtype="float32")
-    dtype = A.dtype
-    a_np = np.random.uniform(size=in_shape).astype(dtype)
-
-    B = topi.nn.space_to_depth(A, block_size=block_size, layout=layout)
-    if layout == "NHWC":
-        a_np = np.transpose(a_np, axes=[0, 3, 1, 2])
-    b_np = tvm.topi.testing.space_to_depth_python(a_np, block_size)
-    if layout == "NHWC":
-        a_np = np.transpose(a_np, axes=[0, 2, 3, 1])
-        b_np = np.transpose(b_np, axes=[0, 2, 3, 1])
-
-    def check_device(device, dev):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], device)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
-
-    for device, dev in tvm.testing.enabled_targets():
-        check_device(device, dev)
-
-
-@tvm.testing.uses_gpu
-def test_space_to_depth():
-    for layout in ["NCHW", "NHWC"]:
-        # Simplest possible case
-        verify_space_to_depth(2, 1, 1, 2, 2, layout=layout)
-        # Average input size
-        verify_space_to_depth(2, 1, 32, 32, 32, layout=layout)
-        # Large block size
-        verify_space_to_depth(8, 1, 32, 64, 64, layout=layout)
-        # Large batch size
-        verify_space_to_depth(4, 8, 32, 32, 32, layout=layout)
-        # Large input size
-        verify_space_to_depth(4, 8, 32, 128, 128, layout=layout)
-
-
-if __name__ == "__main__":
-    test_space_to_depth()
diff --git a/tests/python/topi/test_topi_sparse.py b/tests/python/topi/test_topi_sparse.py
deleted file mode 100644
index 11006576fea3..000000000000
--- a/tests/python/topi/test_topi_sparse.py
+++ /dev/null
@@ -1,619 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for sparse operator"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-from tvm import relay
-import tvm.topi.testing
-from tvm.topi.utils import get_const_tuple
-import tvm.contrib.sparse as tvmsp
-from collections import namedtuple
-import time
-import scipy.sparse as sp
-import tvm.testing
-
-_sparse_dense_implement = {
-    "generic": (topi.nn.sparse_dense, topi.generic.schedule_sparse_dense),
-    "cuda": (topi.cuda.sparse_dense, topi.cuda.schedule_sparse_dense),
-    "x86": (topi.nn.sparse_dense, topi.x86.schedule_sparse_dense),
-}
-
-
-def verify_dynamic_csrmv(batch, in_dim, out_dim, dtype, use_bias=True):
-    nr, nc, n = te.var("nr"), te.var("nc"), te.var("n")
-    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name="A")
-    B = te.placeholder((in_dim, 1), dtype=dtype, name="B")
-    C = te.placeholder((nr,), dtype=dtype, name="C")
-    D = topi.sparse.csrmv(A, B, C if use_bias else None)
-    s = te.create_schedule(D.op)
-    dtype = A.dtype
-
-    # get the test data
-    def get_ref_data():
-        a_np = np.random.uniform(size=(batch, in_dim), high=100).astype(dtype)
-        b_np = np.random.uniform(size=(in_dim, 1), high=100).astype(dtype)
-        c_np = np.random.uniform(size=(batch,), high=100).astype(dtype)
-        if use_bias:
-            d_np = np.dot(a_np, b_np) + c_np.reshape((batch, 1))
-        else:
-            d_np = np.dot(a_np, b_np)
-        return (a_np, b_np, c_np, d_np)
-
-    a_np, b_np, c_np, d_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        a = tvmsp.array(a_np, dev)
-        _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
-        assert a.shape[0] == a.indptr.shape[0] - 1
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(c_np, dev)
-        d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), dev)
-        assert a.data.dtype == A.data.dtype
-        assert a.indices.dtype == A.indices.dtype
-        assert a.indptr.dtype == A.indptr.dtype
-        f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmv")
-        f(_nr, a.data, a.indices, a.indptr, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-4, atol=1e-4)
-
-    for device in ["llvm"]:
-        check_device(device)
-
-
-def verify_dynamic_csrmm(batch, in_dim, out_dim, dtype, use_bias=True):
-    nr, nc, n = te.var("nr"), te.var("nc"), te.var("n")
-    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name="A")
-    B = te.placeholder((in_dim, out_dim), dtype=dtype, name="B")
-    C = te.placeholder((nr,), dtype=dtype, name="C")
-    D = topi.sparse.csrmm(A, B, C if use_bias else None)
-    s = te.create_schedule(D.op)
-    dtype = A.dtype
-
-    # get the test data
-    def get_ref_data():
-        a_np = np.random.uniform(size=(batch, in_dim), high=100).astype(dtype)
-        b_np = np.random.uniform(size=(in_dim, out_dim), high=100).astype(dtype)
-        c_np = np.random.uniform(size=(batch,), high=100).astype(dtype)
-        if use_bias:
-            d_np = np.dot(a_np, b_np) + c_np.reshape((batch, 1))
-        else:
-            d_np = np.dot(a_np, b_np)
-        return (a_np, b_np, c_np, d_np)
-
-    a_np, b_np, c_np, d_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        a = tvmsp.array(a_np, dev)
-        _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
-        assert a.shape[0] == a.indptr.shape[0] - 1
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(c_np, dev)
-        d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), dev)
-        f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm")
-
-        f(_nr, a.data, a.indices, a.indptr, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-2, atol=1e-2)
-
-    for device in ["llvm"]:
-        check_device(device)
-
-
-def verify_dense_si(batch, in_dim, out_dim, use_bias=True, dtype="float32"):
-    nonzeros = te.var("nonzeros")
-    A = tvmsp.placeholder(shape=(batch, in_dim), nonzeros=nonzeros, dtype=dtype, name="A")
-    B = te.placeholder((out_dim, in_dim), dtype=dtype, name="B")
-    C = te.placeholder((out_dim,), dtype=dtype, name="C")
-    D = topi.sparse.dense(A, B, C if use_bias else None)
-    s = te.create_schedule(D.op)
-
-    # get the test data
-    def get_ref_data():
-        mag = 10.0
-        a_np = np.maximum(
-            mag * (np.random.uniform(size=(batch, in_dim)).astype("float32") - 0.5), 0.0
-        ).astype(dtype)
-        b_np = (mag * (np.random.uniform(size=(out_dim, in_dim)).astype("float32") - 0.5)).astype(
-            dtype
-        )
-        c_np = (mag * (np.random.uniform(size=(out_dim,)).astype("float32") - 0.5)).astype(dtype)
-        if use_bias:
-            d_np = np.dot(a_np, b_np.T) + c_np
-        else:
-            d_np = np.dot(a_np, b_np.T)
-        return (a_np, b_np, c_np, d_np)
-
-    a_np, b_np, c_np, d_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        a = tvmsp.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(c_np, dev)
-        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev)
-        f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense")
-        f(a.data, a.indices, a.indptr, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-4, atol=1e-4)
-
-    check_device("llvm")
-
-
-def verify_dense_sw(batch, in_dim, out_dim, use_bias=True, dtype="float32"):
-    nonzeros = te.var("nonzeros")
-    A = te.placeholder((batch, in_dim), dtype=dtype, name="A")
-    B = tvmsp.placeholder(shape=(out_dim, in_dim), nonzeros=nonzeros, dtype=dtype, name="B")
-    C = te.placeholder((out_dim,), dtype=dtype, name="C")
-    D = topi.sparse.dense(A, B, C if use_bias else None)
-    s = te.create_schedule(D.op)
-
-    # get the test data
-    def get_ref_data():
-        mag = 10.0
-        a_np = (mag * (np.random.uniform(size=(batch, in_dim)).astype("float32") - 0.5)).astype(
-            dtype
-        )
-        b_np = np.maximum(
-            mag * (np.random.uniform(size=(out_dim, in_dim)).astype("float32") - 0.5), 0.0
-        ).astype(dtype)
-        c_np = (mag * (np.random.uniform(size=(out_dim,)).astype("float32") - 0.5)).astype(dtype)
-        if use_bias:
-            d_np = np.dot(a_np, b_np.T) + c_np
-        else:
-            d_np = np.dot(a_np, b_np.T)
-        return (a_np, b_np, c_np, d_np)
-
-    a_np, b_np, c_np, d_np = get_ref_data()
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        a = tvm.nd.array(a_np, dev)
-        b = tvmsp.array(b_np, dev)
-        c = tvm.nd.array(c_np, dev)
-        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev)
-        f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
-        f(a, b.data, b.indices, b.indptr, c, d)
-        tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-4, atol=1e-4)
-
-    check_device("llvm")
-
-
-def test_csrmv():
-    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, dtype="float32", use_bias=False)
-    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, dtype="float64", use_bias=True)
-    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, dtype="int32", use_bias=True)
-
-
-def test_csrmm():
-    M, K, N = 5, 7, 2
-    verify_dynamic_csrmm(batch=M, in_dim=K, out_dim=N, dtype="int64", use_bias=False)
-    verify_dynamic_csrmm(batch=M, in_dim=K, out_dim=N, dtype="float64", use_bias=True)
-
-
-def test_dense_si():
-    M, K, N = 3, 5, 2
-    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype="float32")
-    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype="float32")
-    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype="int32")
-    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype="int32")
-    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype="int16")
-    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype="int16")
-
-
-def test_dense_sw():
-    M, K, N = 3, 5, 2
-    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype="float32")
-    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype="float32")
-    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype="int32")
-    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype="int32")
-    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype="int16")
-    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype="int16")
-
-
-def test_dense():
-    test_dense_si()
-    test_dense_sw()
-
-
-def test_sparse_dense_csr():
-    M, N, K, density = 1, 17, 47, 0.2
-    X_np = np.random.randn(M, K).astype("float32")
-    W_sp_np = sp.random(N, K, density=density, format="csr", dtype="float32")
-    W_np = W_sp_np.todense()
-    Y_np = X_np.dot(W_np.T)
-
-    W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype))
-    W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype))
-    W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype))
-    X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-    Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr)
-    s = te.create_schedule(Y.op)
-    func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-    Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype))
-    func(
-        tvm.nd.array(X_np),
-        tvm.nd.array(W_sp_np.data),
-        tvm.nd.array(W_sp_np.indices),
-        tvm.nd.array(W_sp_np.indptr),
-        Y_tvm,
-    )
-    tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np, atol=1e-4, rtol=1e-4)
-
-
-def test_sparse_dense_csr_reverse():
-    M, N, K, density = 1, 17, 47, 0.2
-    X_np = np.random.randn(M, K).astype("float32")
-    W_sp_np = sp.random(N, K, density=density, format="csr", dtype="float32")
-    W_np = W_sp_np.todense()
-    Y_np = W_np.dot(X_np.T)
-
-    W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype))
-    W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype))
-    W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype))
-    X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-    Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr, sparse_lhs=True)
-    s = te.create_schedule(Y.op)
-    func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-    Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype))
-    func(
-        tvm.nd.array(X_np),
-        tvm.nd.array(W_sp_np.data),
-        tvm.nd.array(W_sp_np.indices),
-        tvm.nd.array(W_sp_np.indptr),
-        Y_tvm,
-    )
-    tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np, atol=1e-4, rtol=1e-4)
-
-
-def test_sparse_transpose_csr():
-    N, density = 1023, 0.3
-
-    X_sp = sp.random(N, N, density=density, format="csr", dtype="float32")
-
-    X_sp_T = X_sp.transpose()
-    X_np_T = X_sp_T.todense()
-
-    X_data = te.placeholder(shape=X_sp.data.shape, dtype=str(X_sp.data.dtype))
-    X_indices = te.placeholder(shape=X_sp.indices.shape, dtype=str(X_sp.indices.dtype))
-    X_indptr = te.placeholder(shape=X_sp.indptr.shape, dtype=str(X_sp.indptr.dtype))
-
-    X_T_data, X_T_indices, X_T_indptr = topi.nn.sparse_transpose(X_data, X_indices, X_indptr)
-    s = te.create_schedule([X_T_data.op, X_T_indices.op, X_T_indptr.op])
-    func = tvm.build(s, [X_data, X_indices, X_indptr, X_T_data, X_T_indices, X_T_indptr])
-
-    X_T_data_tvm = tvm.nd.array(np.zeros(X_sp_T.data.shape, dtype=X_sp_T.data.dtype))
-    X_T_indices_tvm = tvm.nd.array(np.zeros(X_sp_T.indices.shape, dtype=X_sp_T.indices.dtype))
-    X_T_indptr_tvm = tvm.nd.array(np.zeros(X_sp_T.indptr.shape, dtype=X_sp_T.indptr.dtype))
-
-    func(
-        tvm.nd.array(X_sp.data),
-        tvm.nd.array(X_sp.indices),
-        tvm.nd.array(X_sp.indptr),
-        X_T_data_tvm,
-        X_T_indices_tvm,
-        X_T_indptr_tvm,
-    )
-
-    X_T_out = sp.csr_matrix(
-        (X_T_data_tvm.numpy(), X_T_indices_tvm.numpy(), X_T_indptr_tvm.numpy()), shape=(N, N)
-    ).todense()
-    tvm.testing.assert_allclose(X_np_T, X_T_out, atol=1e-4, rtol=1e-4)
-
-
-def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype):
-    import itertools
-
-    Y = np.zeros((M, N), dtype=dtype)
-    assert M % BS_R == 0
-    assert N % BS_C == 0
-    nnz = int(density * M * N)
-    num_blocks = int(nnz / (BS_R * BS_C)) + 1
-    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))
-    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C
-    chosen_blocks = candidate_blocks[
-        np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
-    ]
-    for i in range(len(chosen_blocks)):
-        r, c = chosen_blocks[i]
-        Y[r : r + BS_R, c : c + BS_C] = np.random.randn(BS_R, BS_C)
-    s = sp.bsr_matrix(Y, blocksize=(BS_R, BS_C))
-    assert s.data.shape == (num_blocks, BS_R, BS_C)
-    assert s.indices.shape == (num_blocks,)
-    assert s.indptr.shape == (M // BS_R + 1,)
-    return s
-
-
-def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, device, target):
-    X_np = np.random.randn(M, K).astype("float32")
-    W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
-    W_np = W_sp_np.todense()
-    Y_np = X_np @ W_np.T
-    if use_relu:
-        Y_np = np.maximum(Y_np, 0.0)
-
-    W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype))
-    W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype))
-    W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype))
-    X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-
-    fcompute, fschedule = tvm.topi.testing.dispatch(target, _sparse_dense_implement)
-    with tvm.target.Target(target):
-        Y = fcompute(X, W_data, W_indices, W_indptr)
-        if use_relu:
-            Y = topi.nn.relu(Y)
-        s = fschedule([Y])
-        func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=device)
-        func(
-            tvm.nd.array(X_np, device=device),
-            tvm.nd.array(W_sp_np.data, device=device),
-            tvm.nd.array(W_sp_np.indices, device=device),
-            tvm.nd.array(W_sp_np.indptr, device=device),
-            Y_tvm,
-        )
-        tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np, atol=1e-4, rtol=1e-4)
-
-
-@tvm.testing.parametrize_targets("llvm", "cuda")
-def test_sparse_dense_bsr_relu(dev, target):
-    M, N, K, BS_R, BS_C, density = 1, 64, 128, 8, 16, 0.9
-    verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, True, dev, target)
-    verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, False, dev, target)
-
-
-def test_sparse_dense_bsr_reverse():
-    M, N, K, BS_R, BS_C, density = 1, 64, 128, 8, 16, 0.9
-    X_np = np.random.randn(M, K).astype("float32")
-    W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
-    W_np = W_sp_np.todense()
-    Y_np = W_np.dot(X_np.T)
-
-    W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype))
-    W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype))
-    W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype))
-    X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-    Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr, sparse_lhs=True)
-    s = te.create_schedule(Y.op)
-    func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-    Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype))
-    func(
-        tvm.nd.array(X_np),
-        tvm.nd.array(W_sp_np.data),
-        tvm.nd.array(W_sp_np.indices),
-        tvm.nd.array(W_sp_np.indptr),
-        Y_tvm,
-    )
-    tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np, atol=1e-4, rtol=1e-4)
-
-
-@tvm.testing.uses_gpu
-def test_sparse_dense_bsr_randomized():
-    for _ in range(20):
-        BS_R = np.random.randint(1, 16)
-        BS_C = np.random.randint(1, 16)
-        M = np.random.randint(1, 32)
-        N = int(np.random.randint(1, 16) * BS_R)
-        K = int(np.random.randint(1, 16) * BS_C)
-        density = np.clip(np.random.random(), 0.1, 0.9)
-        X_np = np.random.randn(M, K).astype("float32")
-        W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
-
-        W_np = W_sp_np.todense()
-        Y_np = np.array(X_np.dot(W_np.T))
-
-        W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype))
-        W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype))
-        W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype))
-        X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-
-        def check_device(device):
-            dev = tvm.device(device, 0)
-            if not tvm.testing.device_enabled(device):
-                print("Skip because %s is not enabled" % device)
-                return
-            print("Running on target: %s" % device)
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _sparse_dense_implement)
-            with tvm.target.Target(device):
-                Y = fcompute(X, W_data, W_indices, W_indptr)
-                s = fschedule([Y])
-                func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-                Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=dev)
-                func(
-                    tvm.nd.array(X_np, device=dev),
-                    tvm.nd.array(W_sp_np.data, device=dev),
-                    tvm.nd.array(W_sp_np.indices, device=dev),
-                    tvm.nd.array(W_sp_np.indptr, device=dev),
-                    Y_tvm,
-                )
-                tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np, atol=1e-5, rtol=1e-5)
-
-        for device in ["llvm", "cuda"]:
-            check_device(device)
-
-
-@tvm.testing.parametrize_targets("cuda", "rocm")
-def test_sparse_dense_padded_gpu(target, dev):
-    M = 128
-    N = 1280
-    K = 128
-    X_np = np.random.randn(M, K).astype("float32")
-    W_sp_np = random_bsr_matrix(N, K, 1, 1, density=0.01, dtype="float32")
-    W_sp_np_padded = tvm.topi.cuda.pad_sparse_matrix(W_sp_np, 32)
-
-    W_np = W_sp_np.todense()
-    Y_np = X_np @ W_sp_np.T
-
-    W_data = te.placeholder(shape=W_sp_np_padded.data.shape, dtype=str(W_sp_np_padded.data.dtype))
-    W_indices = te.placeholder(
-        shape=W_sp_np_padded.indices.shape, dtype=str(W_sp_np_padded.indices.dtype)
-    )
-    W_indptr = te.placeholder(
-        shape=W_sp_np_padded.indptr.shape, dtype=str(W_sp_np_padded.indptr.dtype)
-    )
-    X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-    with tvm.target.Target(target):
-        Y = topi.cuda.sparse_dense_padded(X, W_data, W_indices, W_indptr)
-        s = topi.cuda.schedule_sparse_dense_padded([Y])
-        func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=dev)
-        func(
-            tvm.nd.array(X_np, device=dev),
-            tvm.nd.array(W_sp_np_padded.data, device=dev),
-            tvm.nd.array(W_sp_np_padded.indices, device=dev),
-            tvm.nd.array(W_sp_np_padded.indptr, device=dev),
-            Y_tvm,
-        )
-        tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np, atol=1e-5, rtol=1e-5)
-
-
-@tvm.testing.parametrize_targets("cuda", "rocm")
-def test_sparse_dense_padded_alter_op(target, dev):
-    with tvm.target.Target(target):
-        M = 128
-        N = 16
-        K = 128
-        X_np = np.random.randn(M, K).astype("float32")
-        W_sp_np = random_bsr_matrix(N, K, 2, 2, density=0.01, dtype="float32")
-        x = relay.var("x", relay.TensorType(X_np.shape, "float32"))
-        mult = relay.op.nn.sparse_dense(
-            x,
-            (
-                relay.Constant(tvm.nd.array(W_sp_np.data)),
-                relay.Constant(tvm.nd.array(W_sp_np.indices)),
-                relay.Constant(tvm.nd.array(W_sp_np.indptr)),
-            ),
-        )
-        f = relay.Function([x], mult)
-        f_ = relay.transform.InferType()(tvm.IRModule.from_expr(f))
-        f_ = relay.transform.AlterOpLayout()(f_)
-        assert f_["main"].body.op.name == "nn.internal.sparse_dense_padded"
-
-        # build with cuda and AlterOpLayout to ensure that sparse_dense_padded is in action
-        with tvm.transform.PassContext(opt_level=3, required_pass="AlterOpLayout"):
-            x = relay.build(tvm.IRModule.from_expr(f), target=target)
-
-
-def test_sparse_add_csr():
-    for indices_dtype in ["int32", "int64"]:
-        for data_dtype in ["float32", "float64"]:
-            M, K, density = 3, 49, 0.2
-            X_np = np.random.randn(M, K).astype(data_dtype)
-            Y_sp_np = sp.random(M, K, density=density, format="csr", dtype=data_dtype)
-            Y_np = Y_sp_np.todense()
-            Z_np = X_np + Y_np
-
-            Y_data = te.placeholder(shape=Y_sp_np.data.shape, dtype=data_dtype)
-            Y_indices = te.placeholder(shape=Y_sp_np.indices.shape, dtype=indices_dtype)
-            Y_indptr = te.placeholder(shape=Y_sp_np.indptr.shape, dtype=indices_dtype)
-            X = te.placeholder(shape=X_np.shape, dtype=data_dtype)
-            Z = topi.nn.sparse_add(X, Y_data, Y_indices, Y_indptr)
-            s = te.create_schedule(Z.op)
-            func = tvm.build(s, [X, Y_data, Y_indices, Y_indptr, Z])
-            Z_tvm = tvm.nd.array(np.zeros(Z_np.shape, dtype=Z_np.dtype))
-            func(
-                tvm.nd.array(X_np.astype(data_dtype)),
-                tvm.nd.array(Y_sp_np.data.astype(data_dtype)),
-                tvm.nd.array(Y_sp_np.indices.astype(indices_dtype)),
-                tvm.nd.array(Y_sp_np.indptr.astype(indices_dtype)),
-                Z_tvm,
-            )
-            tvm.testing.assert_allclose(Z_tvm.numpy(), Z_np, atol=1e-4, rtol=1e-4)
-
-
-def verify_sparse_conv2d_bsr(M, H, W, N, K, BS_R, BS_C, density, layout):
-    if layout == "NHWC":
-        X_np = np.random.randn(M, H, W, K).astype("float32")
-    elif layout == "NCHW":
-        X_np = np.random.randn(M, K, H, W).astype("float32")
-    W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
-    W_np = W_sp_np.todense()
-    if layout == "NHWC":
-        Y_np = tvm.topi.testing.conv2d_nhwc_python(X_np, np.array(W_np).T.reshape(1, 1, K, N), 1, 0)
-    elif layout == "NCHW":
-        Y_np = tvm.topi.testing.conv2d_nchw_python(X_np, np.array(W_np).reshape(N, K, 1, 1), 1, 0)
-
-    if BS_C == 1:
-        W_data = te.placeholder(shape=W_sp_np.data.shape[:-1], dtype=str(W_sp_np.data.dtype))
-        W_sp_np_data = W_sp_np.data.reshape(W_sp_np.data.shape[0], BS_R)
-    else:
-        W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype))
-        W_sp_np_data = W_sp_np.data
-    W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype))
-    W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype))
-    X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-
-    Y = topi.nn.sparse_conv2d(X, W_data, W_indices, W_indptr, layout)
-    s = te.create_schedule(Y.op)
-
-    def check_device(device):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-
-        func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype="float32"))
-        func(
-            tvm.nd.array(X_np, dev),
-            tvm.nd.array(W_sp_np_data, dev),
-            tvm.nd.array(W_sp_np.indices, dev),
-            tvm.nd.array(W_sp_np.indptr, dev),
-            Y_tvm,
-        )
-        tvm.testing.assert_allclose(Y_tvm.numpy(), Y_np.astype("float32"), atol=1e-4, rtol=1e-4)
-
-    check_device("llvm")
-
-
-def test_sparse_conv2d_bsr():
-    M, H, W, N, K, BS_R, BS_C, density = 1, 32, 32, 128, 64, 8, 16, 0.9
-    verify_sparse_conv2d_bsr(M, H, W, N, K, BS_R, BS_C, density, "NHWC")
-    verify_sparse_conv2d_bsr(M, H, W, N, K, BS_R, BS_C, density, "NCHW")
-    verify_sparse_conv2d_bsr(M, H, W, N, K, BS_R, 1, density, "NHWC")
-
-
-if __name__ == "__main__":
-    # test_csrmv()
-    # test_csrmm()
-    # test_dense()
-    # test_sparse_dense_csr()
-    # test_sparse_dense_bsr_randomized()
-    # test_sparse_transpose_csr()
-    # test_sparse_dense_padded_cuda()
-    # test_sparse_dense_padded_alter_op()
-    # test_sparse_dense_csr_reverse()
-    # test_sparse_dense_bsr_reverse()
-    # test_sparse_add_csr()
-    test_sparse_conv2d()
diff --git a/tests/python/topi/test_topi_tensor.py b/tests/python/topi/test_topi_tensor.py
deleted file mode 100644
index 167e7f944eb4..000000000000
--- a/tests/python/topi/test_topi_tensor.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for tensor operator"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from tvm.contrib.nvcc import have_fp16
-import tvm.testing
-
-
-def verify_elemwise_sum(num_args, dtype):
-    shape = (3, 5, 4)
-
-    tvm_placeholders = []
-    for i in range(num_args):
-        tvm_placeholders.append(te.placeholder(shape, name="data" + str(i), dtype=dtype))
-    esum = topi.elemwise_sum(tvm_placeholders)
-    s = te.create_schedule([esum.op])
-
-    @memoize("topi.tests.test_topi_elemwise_sum")
-    def get_ref_data():
-        np_nd = [np.random.uniform(0, 10, size=shape).astype(dtype) for i in range(num_args)]
-        return np_nd
-
-    np_nd = get_ref_data()
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-
-        dev = tvm.device(target, 0)
-        out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev)
-        f = tvm.build(s, tvm_placeholders + [esum], target, name="elemwise_sum")
-        tvm_nd = [tvm.nd.array(nd, dev) for nd in np_nd] + [out]
-        f(*tvm_nd)
-        np_out = np.sum(np.array(np_nd), axis=0)
-        tvm.testing.assert_allclose(out.numpy(), np_out, rtol=1e-5)
-
-    for target in ["llvm"]:
-        check_target(target)
-
-
-def verify_full(shape, dtype, fill_value):
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = topi.full_like(A, fill_value=fill_value)
-    C = topi.full(shape=shape, dtype=dtype, fill_value=fill_value)
-    s1 = te.create_schedule([B.op])
-    s2 = te.create_schedule([C.op])
-
-    @memoize("topi.tests.test_topi_full")
-    def get_ref_data():
-        return np.full(shape, fill_value, dtype)
-
-    np_nd = get_ref_data()
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-
-        dev = tvm.device(target, 0)
-        out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev)
-        f = tvm.build(s1, [A, B], target, name="full_like")
-        f(tvm.nd.array(np.zeros(shape, dtype), dev), out)
-        tvm.testing.assert_allclose(out.numpy(), np_nd, rtol=1e-5)
-
-        f = tvm.build(s2, [C], target, name="full")
-        f(out)
-        tvm.testing.assert_allclose(out.numpy(), np_nd, rtol=1e-5)
-
-    for target in ["llvm"]:
-        check_target(target)
-
-
-def verify_vectorization(n, m, dtype):
-    def check_targeta(targeta):
-        if not tvm.testing.device_enabled(targeta):
-            print("Skip because %s is not enabled" % targeta)
-            return
-        if dtype == "float16" and targeta == "cuda" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-        with tvm.target.Target(targeta):
-            dev = tvm.device(targeta, 0)
-            A = te.placeholder((n, m), name="A", dtype=dtype)
-            B = te.compute((n, m), lambda i, j: A[i, j] + tvm.tir.const(1, A.dtype), name="B")
-            S = tvm.topi.testing.get_elemwise_schedule(targeta)(B)
-
-            fun = tvm.build(S, [A, B], targeta)
-            np_A = tvm.nd.empty((n, m), A.dtype, dev).copyfrom(np.random.uniform(size=(n, m)))
-            np_B = tvm.nd.empty((n, m), B.dtype, dev)
-            fun(np_A, np_B)
-            tvm.testing.assert_allclose(np_B.numpy(), np_A.numpy() + 1, rtol=1e-5)
-
-    for targeta in ["cuda"]:
-        check_targeta(targeta)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_vectorization():
-    verify_vectorization(128, 64, "float16")
-
-
-def test_elemwise_sum():
-    verify_elemwise_sum(1, "float32")
-    verify_elemwise_sum(5, "float32")
-    verify_elemwise_sum(4, "int32")
-
-
-def test_full():
-    verify_full((3, 4, 5), "float32", 3.14)
-    verify_full((10,), "int32", 7)
-
-
-if __name__ == "__main__":
-    test_elemwise_sum()
-    test_full()
-    test_vectorization()
diff --git a/tests/python/topi/test_topi_transform.py b/tests/python/topi/test_topi_transform.py
deleted file mode 100644
index 575e7aa450a6..000000000000
--- a/tests/python/topi/test_topi_transform.py
+++ /dev/null
@@ -1,1416 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for broadcasting operators."""
-import numpy as np
-import pytest
-import tvm
-from tvm import te
-from tvm import tir
-from tvm import topi
-from tvm import relay
-import tvm.topi.testing
-from tvm.contrib.nvcc import have_fp16
-from tvm.script import tir as T
-
-import tvm.testing
-
-
-def verify_expand_dims(in_shape, out_shape, axis, num_newaxis):
-    A = te.placeholder(shape=in_shape, name="A")
-    B = topi.expand_dims(A, axis, num_newaxis)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="expand_dims")
-        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = data_npy.reshape(out_shape)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), dev)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_reinterpret(in_shape, in_dtype, out_dtype, generator):
-    A = te.placeholder(shape=in_shape, name="A", dtype=in_dtype)
-    B = topi.reinterpret(A, out_dtype)
-
-    def check_device(target, dev):
-        if in_dtype == "float16" and target == "cuda" and not have_fp16(dev.compute_version):
-            print("Skip because %s does not have fp16 support" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_elemwise_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="reinterpret")
-        data_npy = generator(in_shape).astype(in_dtype)
-        out_npy = data_npy.view(B.dtype)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), dev)
-        foo(data_nd, out_nd)
-        np.testing.assert_equal(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_transpose(in_shape, axes):
-    A = te.placeholder(shape=in_shape, name="A")
-    B = topi.transpose(A, axes)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="transpose")
-        data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype)
-        out_npy = data_npy.transpose(axes)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=B.dtype)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_reshape(src_shape, dst_shape):
-    A = te.placeholder(shape=src_shape, name="A")
-    B = topi.reshape(A, dst_shape)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="reshape")
-        data_npy = np.random.normal(size=src_shape).astype(A.dtype)
-        out_npy = np.reshape(data_npy, newshape=dst_shape)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.empty(dst_shape, device=dev, dtype=B.dtype)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_squeeze(src_shape, axis):
-    A = te.placeholder(shape=src_shape, name="A")
-    B = topi.squeeze(A, axis=axis)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-
-        foo = tvm.build(s, [A, B], target, name="squeeze")
-        data_npy = np.random.normal(size=src_shape).astype(A.dtype)
-        out_npy = np.squeeze(data_npy, axis=axis)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd_shape = out_npy.shape
-        out_nd = tvm.nd.empty(out_nd_shape, device=dev, dtype=B.dtype)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_concatenate(shapes, axis):
-    def get_concat_schedule(target):
-        schedule_map = {
-            "cpu": topi.x86.schedule_concatenate,
-            "arm_cpu": topi.arm_cpu.schedule_concatenate,
-        }
-        if isinstance(target, str):
-            target = tvm.target.Target(target)
-        for key in target.keys:
-            if key in schedule_map:
-                return schedule_map[key]
-        return tvm.topi.testing.get_injective_schedule(target)
-
-    tensor_l = []
-    for i, shape in enumerate(shapes):
-        tensor_l.append(te.placeholder(shape, name="A" + str(i)))
-    out_tensor = topi.concatenate(a_tuple=tensor_l, axis=axis)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = get_concat_schedule(target)(out_tensor)
-
-        foo = tvm.build(s, tensor_l + [out_tensor], target, name="concatenate")
-        data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes]
-        out_npy = np.concatenate(data_npys, axis=axis)
-        data_nds = [tvm.nd.array(data_npy, dev) for data_npy in data_npys]
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=out_tensor.dtype)
-        foo(*(data_nds + [out_nd]))
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_stack(shapes, axis):
-    tensor_l = []
-    for i, shape in enumerate(shapes):
-        tensor_l.append(te.placeholder(shape, name="A" + str(i)))
-    out_tensor = topi.stack(tensor_l, axis)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(out_tensor)
-
-        foo = tvm.build(s, tensor_l + [out_tensor], target, name="stack")
-        data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes]
-        out_npy = np.stack(data_npys, axis=axis)
-        data_nds = [tvm.nd.array(data_npy, dev) for data_npy in data_npys]
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=out_tensor.dtype)
-        foo(*(data_nds + [out_nd]))
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_split(src_shape, indices_or_sections, axis):
-    A = te.placeholder(shape=src_shape, name="A")
-    tensor_l = topi.split(A, indices_or_sections, axis=axis)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(tensor_l)
-
-        foo = tvm.build(s, [A] + list(tensor_l), target, name="split")
-        data_npy = np.random.normal(size=src_shape).astype(A.dtype)
-        out_npys = np.split(data_npy, indices_or_sections, axis=axis)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nds = [
-            tvm.nd.empty(out_npy.shape, device=dev, dtype=tensor_l[0].dtype) for out_npy in out_npys
-        ]
-        foo(*([data_nd] + out_nds))
-        for out_nd, out_npy in zip(out_nds, out_npys):
-            tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_expand_like(in_shape, out_shape, axis):
-    A = te.placeholder(shape=in_shape, name="A")
-    B = te.placeholder(shape=out_shape, name="B")
-    C = topi.expand_like(A, B, axis)
-    s = te.create_schedule([C.op])
-
-    def check_device(target):
-        print("Running on target: %s" % target)
-
-        dev = tvm.device(target, 0)
-        f = tvm.build(s, [A, B, C], target, name="expand_like")
-        input = np.random.uniform(size=in_shape).astype(A.dtype)
-        tvm_input = tvm.nd.array(input, dev)
-
-        odim = len(out_shape)
-        real_axis = [x if x >= 0 else x + odim for x in axis]
-        real_axis = sorted(real_axis)
-        for x in real_axis:
-            input = np.expand_dims(input, x).astype(A.dtype)
-        for x in real_axis:
-            input = np.concatenate([input] * out_shape[x], axis=x).astype(A.dtype)
-        assert input.shape == out_shape
-
-        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), dev)
-        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), dev)
-        f(tvm_input, tvm_shape_like, out)
-        tvm.testing.assert_allclose(out.numpy(), input)
-
-    for target in ["llvm"]:
-        check_device(target)
-
-
-def verify_flip(in_shape, axis):
-    A = te.placeholder(shape=in_shape, name="A")
-    B = topi.flip(A, axis) + 1
-
-    def check_device(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-
-        foo = tvm.build(s, [A, B], target, name="reverse")
-        x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = np.flip(x_np, axis) + 1
-        data_nd = tvm.nd.array(x_np, dev)
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(target)
-
-
-@tvm.testing.uses_gpu
-def test_reverse_sequence():
-    def verify_reverse_sequence(in_data, seq_lengths, batch_axis, seq_axis, ref_res):
-        seq_lengths = np.array(seq_lengths).astype("int32")
-        A = te.placeholder(shape=in_data.shape, name="A", dtype=str(in_data.dtype))
-        B = te.placeholder(shape=seq_lengths.shape, name="B", dtype=str(seq_lengths.dtype))
-        C = topi.reverse_sequence(A, B, seq_axis, batch_axis)
-
-        def check_device(target, dev):
-            print("Running on target: %s" % target)
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_injective_schedule(target)(C)
-
-            foo = tvm.build(s, [A, B, C], target, name="reverse_sequence")
-
-            data_nd = tvm.nd.array(in_data, dev)
-            seq_lengths_nd = tvm.nd.array(seq_lengths, dev)
-            out_nd = tvm.nd.empty(in_data.shape, device=dev, dtype=A.dtype)
-            foo(data_nd, seq_lengths_nd, out_nd)
-            tvm.testing.assert_allclose(out_nd.numpy(), ref_res)
-
-        for target, dev in tvm.testing.enabled_targets():
-            check_device(target, dev)
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = [[0, 5, 10, 15], [4, 1, 6, 11], [8, 9, 2, 7], [12, 13, 14, 3]]
-    verify_reverse_sequence(indata, [1, 2, 3, 4], 1, 0, np.array(result))
-    verify_reverse_sequence(indata, [1, 2, 3, 4], -1, 0, np.array(result))
-    verify_reverse_sequence(
-        indata.astype("float32"), [1, 2, 3, 4], 1, 0, np.array(result).astype("float32")
-    )
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = [[0, 1, 2, 3], [5, 4, 6, 7], [10, 9, 8, 11], [15, 14, 13, 12]]
-    verify_reverse_sequence(indata, [1, 2, 3, 4], 0, 1, np.array(result))
-    verify_reverse_sequence(indata, [1, 2, 3, 4], 0, -1, np.array(result))
-    verify_reverse_sequence(
-        indata.astype("float32"), [1, 2, 3, 4], 0, 1, np.array(result).astype("float32")
-    )
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [15, 14, 13, 12]]
-    verify_reverse_sequence(indata, [-1, 0, 1, 5], 0, 1, np.array(result))
-
-    indata = np.array(np.arange(0, 54)).reshape([2, 3, 3, 3]).astype("int32")
-    result = [
-        [
-            [[18, 19, 20], [21, 22, 23], [24, 25, 26]],
-            [[9, 10, 11], [12, 13, 14], [15, 16, 17]],
-            [[0, 1, 2], [3, 4, 5], [6, 7, 8]],
-        ],
-        [
-            [[45, 46, 47], [48, 49, 50], [51, 52, 53]],
-            [[36, 37, 38], [39, 40, 41], [42, 43, 44]],
-            [[27, 28, 29], [30, 31, 32], [33, 34, 35]],
-        ],
-    ]
-    verify_reverse_sequence(indata, [3, 3], 0, 1, np.array(result))
-
-    indata = np.array(np.arange(0, 54)).reshape([2, 3, 3, 3]).astype("int32")
-    result = [
-        [
-            [[9, 10, 11], [21, 22, 23], [15, 16, 17]],
-            [[0, 1, 2], [12, 13, 14], [6, 7, 8]],
-            [[18, 19, 20], [3, 4, 5], [24, 25, 26]],
-        ],
-        [
-            [[36, 37, 38], [48, 49, 50], [42, 43, 44]],
-            [[27, 28, 29], [39, 40, 41], [33, 34, 35]],
-            [[45, 46, 47], [30, 31, 32], [51, 52, 53]],
-        ],
-    ]
-    verify_reverse_sequence(indata, [2, 3, 2], 2, 1, np.array(result))
-
-    indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
-    result = []
-    with pytest.raises(Exception) as execinfo:
-        verify_reverse_sequence(indata, [2, 3, 2, 4, 5], 1, 0, np.array(result))
-
-    assert (
-        "For reverse_sequnece seq_lengths size should match with dimension of batch axis,"
-        " but got dimension of batch_axis = 4, and seq_length size = 5" in execinfo.value.args[0]
-    )
-
-
-def verify_take(src_shape, indices_src, axis=None, mode="clip", indices_dtype="int32"):
-    src_dtype = "float32"
-    indices_src = np.array(indices_src, dtype=indices_dtype)
-    A = te.placeholder(shape=src_shape, dtype=src_dtype, name="A")
-    indices = te.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
-    if axis is None:
-        out_tensor = topi.take(a=A, indices=indices, mode=mode)
-    else:
-        out_tensor = topi.take(a=A, indices=indices, axis=axis, mode=mode)
-
-    def check_device(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)
-
-        foo = tvm.build(s, [A] + [indices] + [out_tensor], target, name="take")
-        shape_size = 1
-        for i in range(len(src_shape)):
-            shape_size = shape_size * src_shape[i]
-        data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
-
-        if axis is None:
-            np_mode = "raise" if mode == "fast" else mode
-            out_npys = np.take(data_npy, indices_src, mode=np_mode)
-        else:
-            np_mode = "raise" if mode == "fast" else mode
-            out_npys = np.take(data_npy, indices_src, axis=axis, mode=np_mode)
-        data_nd = tvm.nd.array(data_npy, dev)
-        indices_nd = tvm.nd.array(indices_src, dev)
-        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=src_dtype)
-        foo(data_nd, indices_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npys)
-
-    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(target)
-
-
-def verify_strided_slice(in_shape, begin, end, strides=None, axes=None):
-    A = te.placeholder(shape=in_shape, name="A")
-    strides = [1, 1, 1] if strides is None else strides
-    if axes:
-        strides = [strides[axis] for axis in axes]
-    B = topi.strided_slice(A, begin, end, strides, axes) + 1
-
-    def check_device(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-
-        foo = tvm.build(s, [A, B], target, name="stride_slice")
-        x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides, axes=axes) + 1
-        data_nd = tvm.nd.array(x_np, dev)
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(target)
-
-
-def verify_dynamic_strided_slice(in_shape, begin, end, strides=None):
-    A = te.placeholder(shape=in_shape, name="A")
-    Begin = te.placeholder(shape=[len(in_shape)], name="begin", dtype="int64")
-    End = te.placeholder(shape=[len(in_shape)], name="end", dtype="int64")
-    Strides = te.placeholder(shape=[len(in_shape)], name="strides", dtype="int64")
-    strides = [1, 1, 1] if strides is None else strides
-    B = topi.strided_slice(A, Begin, End, Strides) + 1
-
-    def check_device(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-
-        foo = tvm.build(s, [A, Begin, End, Strides, B], target, name="stride_slice")
-        x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, dev)
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
-        begin_nd = tvm.nd.array(np.array(begin).astype("int64"), dev)
-        end_nd = tvm.nd.array(np.array(end).astype("int64"), dev)
-        strides_nd = tvm.nd.array(np.array(strides).astype("int64"), dev)
-        foo(data_nd, begin_nd, end_nd, strides_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu", "opencl --device=adreno"]:
-        check_device(target)
-
-
-def verify_relax_dynamic_strided_slice(in_shape, begin, end, strides, output_shape):
-    A = te.placeholder(shape=in_shape, name="A")
-    Begin = te.placeholder(shape=[len(in_shape)], name="begin", dtype="int64")
-    End = te.placeholder(shape=[len(in_shape)], name="end", dtype="int64")
-    Strides = te.placeholder(shape=[len(in_shape)], name="strides", dtype="int64")
-
-    B = topi.dynamic_strided_slice(A, Begin, End, Strides, output_shape) + 1
-
-    def check_device(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, dev)
-        tvm_out = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
-        begin_nd = tvm.nd.array(np.array(begin).astype("int64"), dev)
-        end_nd = tvm.nd.array(np.array(end).astype("int64"), dev)
-        strides_nd = tvm.nd.array(np.array(strides).astype("int64"), dev)
-
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        foo = tvm.build(s, [A, Begin, End, Strides, B], target, name="stride_slice")
-        foo(data_nd, begin_nd, end_nd, strides_nd, tvm_out)
-        tvm_out_npy = tvm_out.numpy()
-        assert out_npy.shape == tvm_out_npy.shape
-        tvm.testing.assert_allclose(tvm_out_npy, out_npy)
-
-    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(target)
-
-
-def verify_strided_set(in_shape, v_shape, begin, end, strides=None):
-    A = te.placeholder(shape=in_shape, name="A")
-    V = te.placeholder(shape=v_shape, name="V")
-    b = te.placeholder(shape=(len(begin),), name="b", dtype="int32")
-    e = te.placeholder(shape=(len(end),), name="e", dtype="int32")
-    if strides is not None:
-        st = te.placeholder(shape=(len(strides),), name="st", dtype="int32")
-        B = topi.strided_set(A, V, b, e, st) + 1
-    else:
-        B = topi.strided_set(A, V, b, e) + 1
-
-    def check_device(target):
-        dev = tvm.device(target, 0)
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-
-        if strides is not None:
-            foo = tvm.build(s, [A, V, b, e, st, B], target, name="stride_set")
-            s_np = np.asarray(strides).astype("int32")
-            s_nd = tvm.nd.array(s_np, dev)
-        else:
-            foo = tvm.build(s, [A, V, b, e, B], target, name="stride_set")
-        x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        v_np = np.random.uniform(size=v_shape).astype(V.dtype)
-        b_np = np.asarray(begin).astype("int32")
-        e_np = np.asarray(end).astype("int32")
-        out_npy = tvm.topi.testing.strided_set_python(x_np, v_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, dev)
-        v_nd = tvm.nd.array(v_np, dev)
-        b_nd = tvm.nd.array(b_np, dev)
-        e_nd = tvm.nd.array(e_np, dev)
-        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
-        if strides is not None:
-            foo(data_nd, v_nd, b_nd, e_nd, s_nd, out_nd)
-        else:
-            foo(data_nd, v_nd, b_nd, e_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(target)
-
-
-def verify_gather(data, axis, indices):
-    data = np.asarray(data)
-    indices = np.asarray(indices)
-
-    var_data = te.placeholder(shape=data.shape, dtype=data.dtype.name, name="data")
-    var_indices = te.placeholder(shape=indices.shape, dtype=indices.dtype.name, name="indices")
-    out_tensor = topi.gather(var_data, axis, var_indices)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)
-
-        func = tvm.build(s, [var_data, var_indices, out_tensor], target, name="gather")
-        out_npys = tvm.topi.testing.gather_python(data, axis, indices)
-
-        data_nd = tvm.nd.array(data, dev)
-        indices_nd = tvm.nd.array(indices, dev)
-        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=data.dtype.name)
-        func(data_nd, indices_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npys)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_gather_nd(src_shape, indices_src, indices_dtype):
-    src_dtype = "float32"
-    indices_src = np.array(indices_src, dtype=indices_dtype)
-    A = te.placeholder(shape=src_shape, dtype=src_dtype, name="A")
-    indices = te.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
-    out_tensor = topi.gather_nd(a=A, indices=indices)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)
-
-        func = tvm.build(s, [A, indices, out_tensor], target, name="take")
-        shape_size = 1
-        for i in range(len(src_shape)):
-            shape_size = shape_size * src_shape[i]
-        data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
-        out_npys = tvm.topi.testing.gather_nd_python(data_npy, indices_src)
-
-        data_nd = tvm.nd.array(data_npy, dev)
-        indices_nd = tvm.nd.array(indices_src, dev)
-        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=src_dtype)
-        func(data_nd, indices_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npys)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_arange(start, stop, step):
-    if start is None and step is None:
-        A = topi.arange(stop)
-        a_np = np.arange(stop)
-    elif start is None:
-        A = topi.arange(stop, step=step)
-        a_np = np.arange(stop, step=step)
-    elif step is None:
-        A = topi.arange(start, stop)
-        a_np = np.arange(start, stop)
-    else:
-        A = topi.arange(start, stop, step)
-        a_np = np.arange(start, stop, step)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(A)
-        f = tvm.build(s, [A], target, name="arange")
-        a_nd = tvm.nd.empty(a_np.shape, dtype="float32", device=dev)
-        f(a_nd)
-        tvm.testing.assert_allclose(a_nd.numpy(), a_np)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_repeat(in_shape, repeats, axis):
-    A = te.placeholder(shape=in_shape, name="A")
-    B = topi.repeat(A, repeats, axis)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="repeat")
-        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = np.repeat(data_npy, repeats, axis)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_tile(in_shape, reps):
-    A = te.placeholder(shape=in_shape, name="A")
-    B = topi.tile(A, reps)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
-        foo = tvm.build(s, [A, B], target, name="tile")
-        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = np.tile(data_npy, reps)
-        data_nd = tvm.nd.array(data_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
-        foo(data_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_where(in_shape):
-    Cond = te.placeholder(shape=in_shape, name="cond")
-    dtype = Cond.dtype
-    A = te.placeholder(shape=in_shape, name="A")
-    B = te.placeholder(shape=in_shape, name="B")
-    C = topi.where(Cond, A, B)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_broadcast_schedule(target)(C)
-        f = tvm.build(s, [Cond, A, B, C], target, name="where")
-        cond_npy = np.random.uniform(low=-1, high=1, size=in_shape).astype(dtype)
-        x_npy = np.random.uniform(size=in_shape).astype(dtype)
-        y_npy = np.random.uniform(size=in_shape).astype(dtype)
-        out_npy = np.where(cond_npy, x_npy, y_npy)
-        cond_nd = tvm.nd.array(cond_npy, dev)
-        x_nd = tvm.nd.array(x_npy, dev)
-        y_nd = tvm.nd.array(y_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev)
-        f(cond_nd, x_nd, y_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype):
-    indices = te.placeholder(shape=indices_shape, name="indices", dtype="int32")
-    on_value_const = tvm.tir.const(on_value, dtype)
-    off_value_const = tvm.tir.const(off_value, dtype)
-    one_hot_result = topi.transform.one_hot(
-        indices, on_value_const, off_value_const, depth, axis, dtype
-    )
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(one_hot_result)
-        fn = tvm.build(s, [indices, one_hot_result], target, name="one_hot")
-        indices_npy = np.random.randint(0, depth, size=indices_shape).astype(indices.dtype)
-        out_npy = tvm.topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype)
-        indices_nd = tvm.nd.array(indices_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(one_hot_result.dtype), dev)
-        fn(indices_nd, out_nd)
-        out_topi = out_nd.numpy()
-        tvm.testing.assert_allclose(out_topi, out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_unravel_index(indices, shape, dtype, indice_dtype="int64"):
-    x_data = np.array(indices).astype(indice_dtype)
-    y_data = np.array(shape).astype(dtype)
-    if len(x_data.shape) == 1:
-        dst_shape = [y_data.shape[0], x_data.shape[0]]
-    else:
-        dst_shape = [y_data.shape[0]]
-
-    X = te.placeholder(shape=x_data.shape, dtype=indice_dtype, name="X")
-    Y = te.placeholder(shape=y_data.shape, dtype=dtype, name="Y")
-    Z = topi.unravel_index(X, Y)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(Z)
-        foo = tvm.build(s, [X, Y, Z], target, name="unravel_index")
-
-        out_npy = np.unravel_index(x_data, y_data)
-        datax_nd = tvm.nd.array(x_data, dev)
-        datay_nd = tvm.nd.array(y_data, dev)
-        out_nd = tvm.nd.empty(dst_shape, device=dev, dtype=Z.dtype)
-        foo(datax_nd, datay_nd, out_nd)
-        tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
-    sparse_indices_data = np.array(sparse_indices)
-    sparse_values_data = np.array(sparse_values)
-    output_shape_data = np.array(output_shape)
-    default_value_data = np.array(default_value)
-
-    A = te.placeholder(
-        shape=sparse_indices_data.shape, name="sparse_indices", dtype=str(sparse_indices_data.dtype)
-    )
-    B = te.placeholder(
-        shape=sparse_values_data.shape, name="sparse_values", dtype=str(sparse_values_data.dtype)
-    )
-    if default_value is None:
-        args = [A, B]
-        D = topi.sparse_to_dense(A, output_shape, B)
-    else:
-        C = te.placeholder(shape=(), name="default_value", dtype=str(default_value_data.dtype))
-        args = [A, B, C]
-        D = topi.sparse_to_dense(A, output_shape, B, C)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(D)
-
-        foo = tvm.build(s, args + [D], target, name="sparse_to_dense")
-
-        sparse_indices_nd = tvm.nd.array(sparse_indices_data, dev)
-        sparse_values_nd = tvm.nd.array(sparse_values_data, dev)
-        out_nd = tvm.nd.empty(output_shape_data, device=dev, dtype=B.dtype)
-
-        if default_value is None:
-            foo(sparse_indices_nd, sparse_values_nd, out_nd)
-        else:
-            default_value_nd = tvm.nd.array(default_value_data, dev)
-            foo(sparse_indices_nd, sparse_values_nd, default_value_nd, out_nd)
-
-        tvm.testing.assert_allclose(out_nd.numpy(), np.array(xpected))
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
-    input = te.placeholder(shape=input_shape, name="input", dtype=dtype)
-    diagonal = te.placeholder(shape=diagonal_shape, name="diagonal", dtype=dtype)
-    matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal, k, align)
-
-    def check_device(target, dev):
-        dev = tvm.device(target, 0)
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(matrix_set_diag_result)
-        fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], target, name="matrix_set_diag")
-        input_npy = np.random.randint(-100, 100, size=input_shape).astype(dtype)
-        diagonal_npy = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype)
-        out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy, k, align)
-        input_nd = tvm.nd.array(input_npy, dev)
-        diagonal_nd = tvm.nd.array(diagonal_npy, dev)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), dev)
-        fn(input_nd, diagonal_nd, out_nd)
-        out_topi = out_nd.numpy()
-        tvm.testing.assert_allclose(out_topi, out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_adv_index(data_shape, index_shapes, indice_dtype="int64"):
-    dtype = "float32"
-    data = te.placeholder(shape=data_shape, name="data", dtype=dtype)
-    indices = []
-    np_data = np.random.uniform(size=data_shape).astype(dtype)
-    np_indices = []
-    for i, index_shape in enumerate(index_shapes):
-        limit = data_shape[i]
-        np_indices.append(np.random.uniform(0, limit - 1, size=index_shape).astype(indice_dtype))
-        indices.append(
-            te.placeholder(shape=index_shape, name="index_{}".format(i), dtype=indice_dtype)
-        )
-    np_out = np_data[tuple(np_indices)]
-    out = topi.adv_index(data, indices)
-
-    def check_device(target, dev):
-        dev = tvm.device(target, 0)
-        if not dev.exist:
-            print("Skip because %s is not enabled" % target)
-            return
-        print("Running on target: %s" % target)
-        with tvm.target.create(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(out)
-
-        func = tvm.build(s, [data] + indices + [out], target, name="adv_index")
-
-        nd_list = [tvm.nd.array(np_data, dev)]
-        for np_index in np_indices:
-            nd_list.append(tvm.nd.array(np_index, dev))
-        nd_list.append(tvm.nd.empty(out.shape, device=dev, dtype=data.dtype))
-
-        func(*nd_list)
-        tvm.testing.assert_allclose(nd_list[-1].numpy(), np.array(np_out))
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-def verify_trilu(input_shape, upper, k=0):
-    x = te.placeholder(shape=input_shape, name="x", dtype="float32")
-    k_tir = tvm.tir.const(k, dtype="int32")
-    trilu_result = topi.transform.trilu(x, k_tir, upper)
-
-    def check_device(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(trilu_result)
-        fn = tvm.build(s, [x, trilu_result], target, name="trilu")
-        x_npy = np.random.normal(size=input_shape).astype(x.dtype)
-        if upper:
-            out_npy = np.triu(x_npy, k)
-        else:
-            out_npy = np.tril(x_npy, k)
-        x_nd = tvm.nd.array(x_npy, dev)
-        out_nd = tvm.nd.array(np.empty(x_npy.shape).astype(trilu_result.dtype), dev)
-        fn(x_nd, out_nd)
-        out_topi = out_nd.numpy()
-        tvm.testing.assert_allclose(out_topi, out_npy)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_device(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_strided_slice():
-    verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
-    verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1])
-    verify_strided_slice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1])
-    verify_strided_slice((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2])
-    verify_strided_slice((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1])
-    verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3])
-    verify_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3])
-    verify_strided_slice((3, 4, 3), [0, 0, 0], [None, None, None])
-
-
-def test_strided_slice_with_dynamic_bounds():
-    """The begin/end of strided_slice can be a PrimExpr
-
-    Where topi.dynamic_strided_slice uses begin/end values provided at
-    runtime, strided_slice takes begin/end values at compile-time.
-    However, these begin/end values may depend on dynamic variables.
-    Previously, these resulted in dispatch to
-    `tvm::topi::dynamic_strided_slice`, ignoring the `axes` argument.
-    """
-    A = te.placeholder(shape=[16, 32, 64], name="A")
-    begins = [tir.Var("begin1", "int32"), tir.Var("begin2", "int32")]
-    ends = [tir.Var("end1", "int32"), tir.Var("end2", "int32")]
-    strides = [1, 1]
-    axes = [2, 1]
-
-    # Dummy tensor to provide begin/end variables in PrimFunc scope.
-    # Outside of a test case, these would typically be provided
-    # through another means, or bound to a static value at a later
-    # point.
-    Dummy = te.placeholder(shape=[*begins, *ends], name="Dummy")
-
-    B = topi.strided_slice(A, begins, ends, strides, axes)
-
-    func = te.create_prim_func([A, Dummy, B]).without_attr("global_symbol")
-
-    @T.prim_func(private=True)
-    def expected(
-        A: T.Buffer((16, 32, 64), "float32"),
-        var_Dummy: T.handle,
-        B_handle: T.handle,
-    ):
-        T.func_attr({"tir.noalias": T.bool(True)})
-        begin1, begin2, end1, end2 = T.int32(), T.int32(), T.int32(), T.int32()
-        Dummy = T.match_buffer(var_Dummy, (begin1, begin2, end1, end2))
-        B = T.match_buffer(B_handle, (16, end2 - begin2, end1 - begin1))
-        for iters in T.grid(*B.shape):
-            with T.block("T_dynamic_strided_slice_with_axes"):
-                i, j, k = T.axis.remap("SSS", iters)
-                B[i, j, k] = A[i, j + begin2, k + begin1]
-
-    tvm.ir.assert_structural_equal(expected, func)
-
-
-@tvm.testing.uses_gpu
-def test_dynamic_strided_slice():
-    verify_dynamic_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
-    verify_dynamic_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1])
-    verify_dynamic_strided_slice((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2])
-    verify_dynamic_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3])
-    verify_dynamic_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3])
-
-
-@tvm.testing.uses_gpu
-def test_relax_dynamic_strided_slice():
-    verify_relax_dynamic_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], [3, 1, 2])
-    verify_relax_dynamic_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], [1, 3, 3])
-    verify_relax_dynamic_strided_slice((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], [1, 2, 2])
-    verify_relax_dynamic_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [1, 1, 1], [2, 3, 3])
-    verify_relax_dynamic_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3], [1, 1, 1], [1, 0, 3])
-
-
-@tvm.testing.uses_gpu
-def test_strided_set():
-    verify_strided_set((3, 4, 3), (3, 2, 2), [0, 3, 0], [4, 1, 4], [1, -1, 2])
-    verify_strided_set((3, 4, 3), (3, 1, 2), [0, 0, 0], [4, -5, 4], [1, -1, 2])
-    verify_strided_set((3, 4, 3), (1, 3, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1])
-    verify_strided_set((3, 4, 3), (1, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1])
-    verify_strided_set((3, 4, 3), (1, 2, 2), [1, 0, 0], [2, 2, 3], [1, 1, 2])
-    verify_strided_set((3, 4, 3), (1, 2, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1])
-    verify_strided_set((3, 4, 3), (1, 2, 3), [1, 1, 0], [2, 3, 3], [1])
-    verify_strided_set((3, 4, 3), (2, 3, 3), [1, 1, 0], [4, 4, 3])
-    verify_strided_set((3, 4, 3), (2, 3, 3), [1, 1], [4, 4, 3])
-
-
-@tvm.testing.uses_gpu
-def test_expand_dims():
-    verify_expand_dims((3, 10), (3, 10, 1, 1), 2, 2)
-    verify_expand_dims((3, 10), (1, 3, 10), -3, 1)
-
-
-@tvm.testing.uses_gpu
-def test_reinterpret():
-    verify_reinterpret((1000,), "float32", "int32", lambda shape: np.random.randn(*shape) * 1000)
-    verify_reinterpret((1000,), "float16", "int16", lambda shape: np.random.randn(*shape) * 100)
-    verify_reinterpret(
-        (1000,), "int16", "uint16", lambda shape: np.random.randint(-1000, 1000, size=shape)
-    )
-    verify_reinterpret(
-        (1000,), "uint32", "int32", lambda shape: np.random.randint(0, 2**32 - 1, size=shape)
-    )
-    verify_reinterpret(
-        (1000,), "uint32", "int32", lambda shape: np.random.randint(0, 2**32 - 1, size=shape)
-    )
-
-
-@tvm.testing.uses_gpu
-def test_transpose():
-    verify_transpose((3, 10, 2), (1, 0, 2))
-    verify_transpose((3, 10, 5), (2, 0, 1))
-    verify_transpose((3, 10), None)
-
-
-@tvm.testing.parametrize_targets("cuda", "rocm")
-def test_transpose_unfused_schedule(target, dev):
-    shape = (100, tvm.target.Target(target).thread_warp_size + 3)
-    x = relay.var("x", relay.TensorType(shape, "float32"))
-    f = relay.transpose(x)
-    r = np.random.rand(*shape)
-    func = relay.create_executor(
-        kind="graph", mod=tvm.IRModule.from_expr(relay.Function([x], f)), device=dev, target=target
-    ).evaluate()
-    tvm.testing.assert_allclose(func(r).numpy(), np.transpose(r))
-
-    # We want to make sure schedule does not fire here, but there is no way of
-    # inspecting which schedules were used.
-    x = relay.var("x", relay.TensorType(shape, "float32"))
-    y = relay.var("y", relay.TensorType(shape, "float32"))
-    f = relay.transpose(x + y)
-    func = relay.create_executor(
-        kind="graph",
-        mod=tvm.IRModule.from_expr(relay.Function([x, y], f)),
-        device=dev,
-        target=target,
-    ).evaluate()
-    tvm.testing.assert_allclose(func(r, r).numpy(), np.transpose(r + r))
-
-
-@tvm.testing.uses_gpu
-def test_reshape():
-    verify_reshape((1, 2, 3, 4), (2, 3, 4))
-    verify_reshape((4, 2, 3, 4), (2, 4, 12))
-    verify_reshape((4, 2, 3, 4), (2, 48))
-    verify_reshape((16,), (2, 2, 2, 2))
-    verify_reshape((4, 0), (2, 0, 2))
-
-
-@tvm.testing.uses_gpu
-def test_where():
-    verify_where(())
-    verify_where((1, 2, 3, 4))
-
-
-@tvm.testing.uses_gpu
-def test_squeeze():
-    verify_squeeze((1, 2, 3, 4), 0)
-    verify_squeeze((1, 2, 1, 4), None)
-    verify_squeeze((1, 1, 1, 4), (1, 2))
-    verify_squeeze((1, 1, 1, 1), None)
-    verify_squeeze((1, 1, 1, 1), ())
-
-    # a special case to trigger inline let expression
-    A = te.placeholder((2,), "float32", "A")
-    E = topi.squeeze(A)
-    C = te.compute((1,), lambda i: E[(2 * A[0] - 1).astype("int32")])
-    for target in ["llvm", "cuda", "opencl"]:
-        dev = tvm.device(target, 0)
-        if tvm.testing.device_enabled(target):
-            with tvm.target.Target(target):
-                s = tvm.topi.testing.get_injective_schedule(target)(C)
-                func = tvm.build(s, [A, C])
-            a = tvm.nd.array(np.array((1, 2)).astype("float32"), device=dev)
-            c = tvm.nd.empty((1,), dtype="float32", device=dev)
-            func(a, c)
-            assert c.numpy()[0] == 2
-
-
-@tvm.testing.uses_gpu
-def test_concatenate():
-    verify_concatenate([(2,), (2,), (2,)], -1)
-    verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1)
-    verify_concatenate([(1, 2, 4), (1, 2, 3), (1, 2, 7), (1, 2, 8), (1, 2, 1)], -1)
-    verify_concatenate([(5, 6, 7, 3), (16, 6, 7, 3), (12, 6, 7, 3), (8, 6, 7, 3), (2, 6, 7, 3)], 0)
-    verify_concatenate([(1, 14400), (1, 2400), (1, 640), (1, 240)], 1)
-
-
-@tvm.testing.uses_gpu
-def test_stack():
-    verify_stack([(2,), (2,), (2,)], -1)
-    verify_stack([(2,), (2,), (2,)], 1)
-    verify_stack([(2,), (2,), (2,)], 0)
-    verify_stack([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1)
-    verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1)
-
-
-@tvm.testing.uses_gpu
-def test_split():
-    verify_split((2, 12, 3), 3, 1)
-    verify_split((2, 12, 3), [2, 4], 1)
-    verify_split((10, 12, 24), [5, 7, 9], -1)
-
-
-@tvm.testing.uses_gpu
-def test_flip():
-    verify_flip((3, 4, 3), 1)
-    verify_flip((3, 4, 3), 0)
-    verify_flip((3, 4, 3), 2)
-    verify_flip((3, 4, 3), -1)
-    verify_flip((3, 4, 3), -3)
-    verify_flip((3, 4, 3), -2)
-
-
-@tvm.testing.requires_llvm
-def test_expand_like():
-    verify_expand_like((3,), (2, 3), [0])
-    verify_expand_like((2,), (2, 3), [1])
-    verify_expand_like((3, 4), (3, 5, 4), [1])
-    verify_expand_like((5, 7), (5, 6, 7, 8), [1, 3])
-
-
-@tvm.testing.uses_gpu
-def test_take():
-    verify_take((4,), [1])
-    verify_take((4,), [[0, 1, 2, 3]])
-    verify_take((3, 3, 3), [[11, 25]])
-    verify_take((4,), [[0, 1], [2, 3]])
-    verify_take((4,), [1], 0)
-    verify_take((2, 2), [[[1, 0], [0, 1]]], 0)
-    verify_take((2, 2), [[[1, 0], [0, 1]]], 1)
-    verify_take((4, 3, 5, 6), [[2, 1, 0, 0]], -2)
-    verify_take((3, 4), [-5, 20])
-    verify_take((3, 4), [-5, 20], mode="wrap")
-    verify_take((3, 4), [-1, 2], axis=0)
-    verify_take((3, 4), [-1, 2], axis=0, mode="wrap")
-    verify_take((3, 4), [-1, 2], axis=1)
-    verify_take((3, 4), [-1, 2], axis=1, mode="wrap")
-    verify_take((3, 3, 3), [[11, 25]], mode="fast")
-    verify_take((3, 4), [0, 2], axis=0, mode="fast")
-    verify_take((3, 4), [0, 2], axis=1, mode="fast")
-    verify_take((3, 5, 7), [[0, 2], [0, 2], [0, 2], [0, 2]], axis=0, mode="fast")
-    verify_take((3, 5, 7), [[0, 2], [0, 2], [0, 2], [0, 2]], axis=1, mode="fast")
-    verify_take((3, 5, 7), [[0, 2], [0, 2], [0, 2], [0, 2]], axis=2, mode="fast")
-    verify_take((3, 4), [1, 2], axis=1, indices_dtype="uint32")
-    verify_take((3, 4), [1, 2], axis=1, mode="wrap", indices_dtype="uint16")
-    verify_take((3, 3, 3), [[11, 20]], mode="fast", indices_dtype="uint8")
-
-
-@tvm.testing.uses_gpu
-def test_gather():
-    verify_gather([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]])
-    verify_gather(np.random.randn(4, 7, 5), 0, np.random.randint(low=0, high=4, size=(1, 7, 5)))
-    verify_gather(np.random.randn(4, 7, 5), 0, np.random.randint(low=0, high=4, size=(4, 7, 5)))
-    verify_gather(np.random.randn(4, 7, 5), 1, np.random.randint(low=0, high=7, size=(4, 10, 5)))
-    verify_gather(np.random.randn(4, 7, 5), 1, np.random.randint(low=0, high=7, size=(4, 10, 5)))
-    verify_gather(np.random.randn(4, 7, 5), 2, np.random.randint(low=0, high=5, size=(4, 7, 2)))
-    verify_gather(np.random.randn(4, 7, 5), 2, np.random.randint(low=0, high=5, size=(4, 7, 10)))
-    verify_gather(np.random.randn(4, 7, 2), 0, np.random.randint(low=0, high=4, size=(4, 7, 2)))
-
-
-@tvm.testing.uses_gpu
-def test_gather_nd():
-    for indices_dtype in ["int32", "float32", "uint8"]:
-        verify_gather_nd((4,), [[1.8]], indices_dtype)
-        verify_gather_nd((4,), [[1, 3, 2]], indices_dtype)
-        verify_gather_nd((2, 3), [[1]], indices_dtype)
-        verify_gather_nd((2, 3), [[1], [0]], indices_dtype)
-        verify_gather_nd((2, 3), [[1, 0], [0, 2]], indices_dtype)
-        verify_gather_nd((2, 3, 4), [[1, 0], [0, 2]], indices_dtype)
-        verify_gather_nd((2, 3, 4), [[1, 0], [0, 2], [3, 1]], indices_dtype)
-        verify_gather_nd(
-            (2, 3, 4), [[[1, 0], [0, 1]], [[0, 2], [1, 2]], [[3, 1], [0, 2]]], indices_dtype
-        )
-        verify_gather_nd((2, 3, 4, 5), [[1, 0], [0, 2]], indices_dtype)
-        verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]], indices_dtype)
-
-
-@tvm.testing.uses_gpu
-def test_arange():
-    verify_arange(None, 20, None)
-    verify_arange(None, 20, 2)
-    verify_arange(1, 20, None)
-    verify_arange(1, 20, 2)
-    verify_arange(1, 20, 1.5)
-    verify_arange(1, 20.5, None)
-    verify_arange(1, 20, 3)
-    verify_arange(20, 1, -1)
-    verify_arange(20, 1, -1.5)
-
-
-@tvm.testing.uses_gpu
-def test_repeat():
-    verify_repeat((2,), 1, 0)
-    verify_repeat((3, 2), 2, 0)
-    verify_repeat((3, 2, 4), 3, 1)
-    verify_repeat((1, 3, 2, 4), 4, -1)
-
-
-@tvm.testing.uses_gpu
-def test_tile():
-    verify_tile((3, 2), (2, 3))
-    verify_tile((3, 2, 5), (2,))
-    verify_tile((3,), (2, 3, 3))
-    verify_tile((4, 0), (5,))
-
-
-@tvm.testing.uses_gpu
-def test_layout_transform():
-    in_shape = (1, 32, 8, 8)
-    A = te.placeholder(shape=in_shape, dtype="float32", name="A")
-    B = topi.layout_transform(A, "NCHW", "NCHW16c")
-
-    input = np.random.uniform(size=in_shape).astype(A.dtype)
-    output = np.transpose(input, axes=(0, 2, 3, 1))
-    output = np.reshape(output, newshape=(1, 8, 8, 2, 16))
-    output = np.transpose(output, axes=(0, 3, 1, 2, 4))
-
-    def check_device(target, dev):
-        tvm_input = tvm.nd.array(input, dev)
-        tvm_output = tvm.nd.empty(output.shape, device=dev, dtype=B.dtype)
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        f = tvm.build(s, [A, B], target, name="layout_transform")
-        f(tvm_input, tvm_output)
-        tvm.testing.assert_allclose(tvm_output.numpy(), output)
-
-    for backend, dev in tvm.testing.enabled_targets():
-        check_device(backend, dev)
-
-
-@tvm.testing.uses_gpu
-def test_shape():
-    in_shape = (8, 7, 13)
-    dtype = "int32"
-    A = te.placeholder(shape=in_shape, dtype="float32", name="A")
-    B = topi.shape(A, dtype)
-
-    input = np.random.uniform(size=in_shape).astype(A.dtype)
-    output = np.asarray(in_shape).astype(dtype)
-
-    def check_device(target, dev):
-        tvm_input = tvm.nd.array(input, dev)
-        tvm_output = tvm.nd.empty(output.shape, device=dev, dtype=dtype)
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        f = tvm.build(s, [A, B], target, name="shape")
-        f(tvm_input, tvm_output)
-        tvm.testing.assert_allclose(tvm_output.numpy(), output)
-
-    for backend, dev in tvm.testing.enabled_targets():
-        check_device(backend, dev)
-
-
-@tvm.testing.uses_gpu
-def test_sequence_mask():
-    for in_shape in (5, 10), (3, 4, 5, 4):
-        for axis in [0, 1]:
-            for mask_value in [0.0, 1.0]:
-                max_length = in_shape[axis]
-                batch_size = in_shape[1 - axis]
-                A = te.placeholder(shape=in_shape, dtype="float32", name="A")
-                B = te.placeholder(shape=(batch_size,), dtype="int32", name="B")
-                C = topi.sequence_mask(A, B, axis=axis, mask_value=mask_value)
-                A_data = np.random.normal(0, 1, in_shape).astype(np.float32)
-                B_data = np.random.randint(1, max_length, (batch_size,)).astype(np.int32)
-                C_gt_data = tvm.topi.testing.sequence_mask(A_data, B_data, mask_value, axis)
-
-                def check_device(target, dev):
-                    tvm_A = tvm.nd.array(A_data, dev)
-                    tvm_B = tvm.nd.array(B_data, dev)
-                    tvm_C = tvm.nd.empty(in_shape, device=dev, dtype="float32")
-                    print("Running on target: %s" % target)
-                    with tvm.target.Target(target):
-                        s = tvm.topi.testing.get_injective_schedule(target)(C)
-                    f = tvm.build(s, [A, B, C], target, name="SequenceMask")
-                    f(tvm_A, tvm_B, tvm_C)
-                    tvm.testing.assert_allclose(tvm_C.numpy(), C_gt_data)
-
-                for backend, dev in tvm.testing.enabled_targets():
-                    check_device(backend, dev)
-
-
-@tvm.testing.uses_gpu
-def test_ndarray_size():
-    in_shape = (5, 11, 7)
-    dtype = "int32"
-    A = te.placeholder(shape=in_shape, dtype="float32", name="A")
-    B = topi.ndarray_size(A, dtype)
-
-    input = np.random.uniform(size=in_shape).astype(A.dtype)
-    output = np.asarray(np.size(input)).astype(dtype)
-
-    def check_device(target, dev):
-        tvm_input = tvm.nd.array(input, device=dev)
-        tvm_output = tvm.nd.empty((), device=dev, dtype=B.dtype)
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        f = tvm.build(s, [A, B], target, name="ndarray_size")
-        f(tvm_input, tvm_output)
-        tvm.testing.assert_allclose(tvm_output.numpy(), output)
-
-    for backend, dev in tvm.testing.enabled_targets():
-        check_device(backend, dev)
-
-
-@tvm.testing.uses_gpu
-def test_where_fusion():
-    """integration test that where and zeros should be properly inlined"""
-
-    def check_device(target, dev):
-        with tvm.target.Target(target):
-            print("Running on target: %s" % target)
-            conv2d_compute, conv2d_schedule = tvm.topi.testing.get_conv2d_nchw_implement(target)
-            data = te.placeholder((2, 1, 2, 4), "int8", "data")
-            w = te.placeholder((3, 1, 2, 2), "int8", "w")
-            conv1 = conv2d_compute(data, w, 1, 0, 1, "int32")
-            zeros = topi.full((2, 3, 1, 3), "int32", tvm.tir.const(0, dtype="int32"))
-            gt = topi.greater_equal(conv1, zeros)
-            one = topi.full((2, 3, 1, 3), "int32", tvm.tir.const(1, dtype="int32"))
-            two = topi.full((2, 3, 1, 3), "int32", tvm.tir.const(2, dtype="int32"))
-            where = topi.where(gt, one, two)
-            add = topi.add(conv1, where)
-            outs = [add]
-            s = conv2d_schedule(outs)
-            tvm.build(s, [data, w, add], target=backend)
-
-    for backend, dev in tvm.testing.enabled_targets():
-        check_device(backend, dev)
-
-
-@tvm.testing.uses_gpu
-def test_one_hot():
-    verify_one_hot((3,), 3, 1, 0, -1, "int32")
-    verify_one_hot((3,), 3, 1.0, 0.0, -1, "float32")
-    verify_one_hot((2, 2), 5, 2, -2, 0, "int32")
-    verify_one_hot((2, 2), 5, 0.5, -0.5, 1, "float32")
-    verify_one_hot((3, 2, 4, 5), 6, 1, 0, 1, "int32")
-    verify_one_hot((3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32")
-
-
-@tvm.testing.uses_gpu
-def test_unravel_index():
-    for dtype in ["int32", "int64"]:
-        for indice_dtype in ["int64", "uint8", "uint16", "uint32"]:
-            verify_unravel_index([0, 1, 2, 3], [2, 2], dtype, indice_dtype)
-            verify_unravel_index([144], [5, 5, 5, 2], dtype, indice_dtype)
-            verify_unravel_index(144, [5, 5, 5, 2], dtype, indice_dtype)
-            verify_unravel_index([100, 13, 5], [5, 5, 5, 2], dtype, indice_dtype)
-
-
-@tvm.testing.uses_gpu
-def test_sparse_to_dense():
-    verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0])  # scalar
-    verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3])  # vector
-    verify_sparse_to_dense(
-        [[0, 0], [1, 2]], [1, 2], 0, [3, 4], [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]]
-    )  # nXd
-    verify_sparse_to_dense(
-        [[0, 0, 0], [1, 2, 3]],
-        [1, 2],
-        4,
-        [2, 3, 4],
-        [[[1, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]], [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 2]]],
-    )  # nXd
-    verify_sparse_to_dense(
-        [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]
-    )  # floats
-    verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])  # default value not specified
-
-    # negative test cases
-    # sparse indices should be ints
-    # verify_sparse_to_dense([[0.1, 1.1, 4.1], [0,2,4]], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
-    # sparse_values should be 0d or 1d only
-    # verify_sparse_to_dense([[0, 1, 4], [0, 2, 4]], [[[3.1, 3.1, 3.1]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
-    # sparse_indices should not be > 2d tensor
-    # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[3.1, 3.1, 3.1]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
-
-
-@tvm.testing.uses_gpu
-def test_matrix_set_diag():
-    for dtype in ["float32", "int32"]:
-        verify_matrix_set_diag((2, 2), (2,), dtype)
-        verify_matrix_set_diag((4, 3, 3), (4, 3), dtype)
-        verify_matrix_set_diag((2, 3, 4), (2, 3), dtype, 1)
-        verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "LEFT_RIGHT")
-        verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "LEFT_LEFT")
-        verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "RIGHT_RIGHT")
-
-
-@tvm.testing.uses_gpu
-def test_adv_index():
-    for indice_dtype in ["int32", "int64", "uint8", "uint16", "uint32"]:
-        verify_adv_index((3, 4, 5), [(2,), (2,), (1,)], indice_dtype=indice_dtype)
-        verify_adv_index((10, 15, 5), [(4, 1), (1, 7)], indice_dtype=indice_dtype)
-        verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)], indice_dtype=indice_dtype)
-
-
-@tvm.testing.uses_gpu
-def test_trilu():
-    # Test upper and lower triangle
-    verify_trilu((3, 3), True, 0)
-    verify_trilu((3, 3), False, 0)
-    # Test larger matrices with offset.
-    verify_trilu((6, 6), True, 1)
-    verify_trilu((6, 6), False, 2)
-    verify_trilu((6, 6), False, -2)
-    # Test batch size
-    verify_trilu((8, 6, 6), False, -2)
-
-
-if __name__ == "__main__":
-    test_strided_slice()
-    test_concatenate()
-    test_stack()
-    test_transpose()
-    test_expand_dims()
-    test_reshape()
-    test_where()
-    test_squeeze()
-    test_split()
-    test_flip()
-    test_expand_like()
-    test_take()
-    test_gather_nd()
-    test_arange()
-    test_layout_transform()
-    test_repeat()
-    test_tile()
-    test_shape()
-    test_sequence_mask()
-    test_ndarray_size()
-    test_where_fusion()
-    test_one_hot()
-    test_unravel_index()
-    test_sparse_to_dense()
-    test_matrix_set_diag()
-    test_adv_index()
-    test_trilu()
diff --git a/tests/python/topi/test_topi_unique.py b/tests/python/topi/test_topi_unique.py
deleted file mode 100644
index e2a82f64aecb..000000000000
--- a/tests/python/topi/test_topi_unique.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import topi
-import tvm.topi.testing
-
-in_dtype = tvm.testing.parameter("int32", "int64")
-is_sorted = tvm.testing.parameter(True, False, ids=["sorted", "unsorted"])
-with_counts = tvm.testing.parameter(True, False, ids=["with_counts", "no_counts"])
-arr_size, maxval = tvm.testing.parameters((1, 100), (10, 10), (10000, 100))
-
-
-@tvm.testing.parametrize_targets
-def test_unique(dev, target, in_dtype, is_sorted, with_counts, arr_size, maxval):
-    def calc_numpy_unique(data, is_sorted=False):
-        uniq, index, inverse, counts = np.unique(
-            data, return_index=True, return_inverse=True, return_counts=True
-        )
-        num_uniq = np.array([len(uniq)]).astype("int32")
-        if not is_sorted:
-            order = np.argsort(index)
-            index = np.sort(index)
-            reverse_order = np.argsort(order)
-            uniq = uniq[order].astype(data.dtype)
-            inverse = np.array([reverse_order[i] for i in inverse]).astype("int32")
-            counts = counts[order].astype("int32")
-        return [
-            uniq.astype(data.dtype),
-            index.astype("int32"),
-            inverse.astype("int32"),
-            counts,
-            num_uniq,
-        ]
-
-    data = np.random.randint(0, maxval, size=(arr_size)).astype(in_dtype)
-
-    # numpy reference
-    np_unique, np_indices, np_inverse_indices, np_counts, np_num_unique = calc_numpy_unique(
-        data, is_sorted
-    )
-    num_unique = np_num_unique[0]
-
-    implementations = {
-        "generic": (
-            lambda x, return_counts: topi.unique(x, is_sorted, return_counts),
-            topi.generic.schedule_unique,
-        ),
-        "gpu": (
-            lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts),
-            topi.cuda.schedule_scan,
-        ),
-        "nvptx": (
-            lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts),
-            topi.cuda.schedule_scan,
-        ),
-    }
-    fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-    tvm_data = tvm.nd.array(data, device=dev)
-    tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), device=dev)
-    tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev)
-    tvm_inverse_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev)
-    tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), device=dev)
-
-    with tvm.target.Target(target):
-        te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype))
-        outs = fcompute(te_input, with_counts)
-        s = fschedule(outs)
-        func = tvm.build(s, [te_input, *outs])
-
-        if with_counts:
-            tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev)
-            func(
-                tvm_data,
-                tvm_unique,
-                tvm_indices,
-                tvm_inverse_indices,
-                tvm_num_unique,
-                tvm_counts,
-            )
-        else:
-            func(tvm_data, tvm_unique, tvm_indices, tvm_inverse_indices, tvm_num_unique)
-
-    num_unique = np_num_unique[0]
-    assert tvm_num_unique.numpy()[0] == np_num_unique
-
-    np.testing.assert_allclose(tvm_unique.numpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5)
-    np.testing.assert_allclose(tvm_indices.numpy()[:num_unique], np_indices, atol=1e-5, rtol=1e-5)
-
-    np.testing.assert_allclose(
-        tvm_inverse_indices.numpy(), np_inverse_indices, atol=1e-5, rtol=1e-5
-    )
-
-    if with_counts:
-        np.testing.assert_allclose(tvm_counts.numpy()[:num_unique], np_counts, atol=1e-5, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/topi/test_topi_upsampling.py b/tests/python/topi/test_topi_upsampling.py
deleted file mode 100644
index 90e3b35e8845..000000000000
--- a/tests/python/topi/test_topi_upsampling.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for upsampling"""
-import numpy as np
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-import tvm.topi.testing
-import math
-from tvm.topi.utils import nchw_pack_layout
-
-
-def verify_upsampling(
-    batch,
-    in_channel,
-    in_height,
-    in_width,
-    scale_h,
-    scale_w,
-    layout="NCHW",
-    method="nearest_neighbor",
-    in_batch_block=0,
-    in_channel_block=0,
-):
-    if layout == "NCHW":
-        A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-        dtype = A.dtype
-        out_shape = (
-            batch,
-            in_channel,
-            int(round(in_height * scale_h)),
-            int(round(in_width * scale_w)),
-        )
-        a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype)
-    elif nchw_pack_layout(layout):
-        A = te.placeholder(
-            (batch, in_channel, in_height, in_width, in_batch_block, in_channel_block), name="A"
-        )
-        dtype = A.dtype
-        out_shape = (
-            batch,
-            in_channel,
-            int(round(in_height * scale_h)),
-            int(round(in_width * scale_w)),
-            in_batch_block,
-            in_channel_block,
-        )
-        a_np = np.random.uniform(
-            size=(batch, in_channel, in_height, in_width, in_batch_block, in_channel_block)
-        ).astype(dtype)
-    elif layout == "NHWC":
-        A = te.placeholder((batch, in_height, in_width, in_channel), name="A")
-        dtype = A.dtype
-        out_shape = (
-            batch,
-            int(round(in_height * scale_h)),
-            int(round(in_width * scale_w)),
-            in_channel,
-        )
-        a_np = np.random.uniform(size=(batch, in_height, in_width, in_channel)).astype(dtype)
-    else:
-        raise NotImplementedError("Layout not supported {} ".format(layout))
-
-    B = topi.nn.upsampling(A, scale_h, scale_w, layout=layout, method=method, align_corners=False)
-
-    b_np = tvm.topi.testing.resize2d_python(
-        a_np,
-        (scale_h, scale_w),
-        layout,
-        method[2:] if method[0:2] == "bi" else method,
-        "asymmetric",
-    )
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-def test_int_div_upsampling():
-    """Test whether upsampling op is tilable when scale_h and scale_w is integer.
-
-    Compute_at cannot work correctly in the original floating-point multiplication.
-    After using integer division,compute_at can work correctly and reduce the
-    capacity of cache buffer.
-
-    In this test case, scale_h and scale_w are set to integers, the size
-    of cache buffer should be equal to (h_i/scale_h * w_i/scale_w * c_i).
-    """
-    dtype = "int8"
-    scale_h = 2
-    scale_w = 2
-
-    x = te.placeholder([1, 32, 64, 64], dtype, "x")
-    y = topi.nn.upsampling(x, scale_h, scale_w)
-    func = te.create_prim_func([x, y])
-
-    s = tvm.tir.Schedule(func)
-    block = s.get_block("resize")
-    cache = s.cache_read(block, 0, "local")
-    n, c, h, w = s.get_loops(block)
-    s_factor = 8
-    c_o, c_i = s.split(c, factors=[None, s_factor])
-    h_o, h_i = s.split(h, factors=[None, s_factor])
-    w_o, w_i = s.split(w, factors=[None, s_factor])
-    s.reorder(n, c_o, h_o, w_o, h_i, w_i, c_i)
-    s.compute_at(cache, w_o)
-    wanted_rt = s_factor**3 / (scale_h * scale_w)
-
-    def analyze_upsampling_allocate(stmt):
-        if isinstance(stmt, tvm.tir.stmt.Allocate):
-            tvm.testing.assert_allclose(stmt.extents[0].value, wanted_rt)
-
-    lowerd_irmodule = tvm.lower(s.mod["main"])
-    tvm.tir.stmt_functor.post_order_visit(
-        lowerd_irmodule.functions.items()[0][1].body, analyze_upsampling_allocate
-    )
-
-
-@tvm.testing.uses_gpu
-def test_upsampling():
-    # nearest_neighbor - NCHW
-    verify_upsampling(8, 16, 32, 32, 2.0, 2.0)
-    verify_upsampling(2, 32, 64, 64, 3.0, 3.0)
-    verify_upsampling(1, 64, 22, 32, 1.954545497894287, 2.0)
-
-    ## nearest_neighbor - NHWC
-    verify_upsampling(8, 16, 32, 32, 2.0, 2.0, layout="NHWC")
-    verify_upsampling(2, 32, 64, 64, 3.0, 3.0, layout="NHWC")
-    verify_upsampling(1, 64, 22, 32, 1.954545497894287, 2.0, layout="NHWC")
-
-    # bilinear - NCHW
-    verify_upsampling(2, 2, 32, 32, 2.0, 2.0, method="bilinear")
-    verify_upsampling(2, 2, 32, 32, 3.0, 3.0, method="bilinear")
-    verify_upsampling(1, 64, 22, 32, 1.954545497894287, 2.0, method="bilinear")
-
-    # nearest_neighbor - NCHWinic
-    verify_upsampling(2, 2, 32, 32, in_batch_block=4, in_channel_block=8, scale_h=2.0, scale_w=2.0)
-    verify_upsampling(2, 2, 64, 64, in_batch_block=1, in_channel_block=16, scale_h=3.0, scale_w=3.0)
-    verify_upsampling(
-        1, 4, 22, 32, in_batch_block=1, in_channel_block=16, scale_h=1.954545497894287, scale_w=2.0
-    )
-
-    # bilinear - NCHWinic
-    verify_upsampling(
-        2,
-        2,
-        32,
-        32,
-        in_batch_block=1,
-        in_channel_block=1,
-        scale_h=2.0,
-        scale_w=2.0,
-        method="bilinear",
-    )
-    verify_upsampling(
-        2,
-        2,
-        32,
-        32,
-        in_batch_block=1,
-        in_channel_block=1,
-        scale_h=3.0,
-        scale_w=3.0,
-        method="bilinear",
-    )
-    verify_upsampling(
-        2,
-        4,
-        22,
-        32,
-        in_batch_block=1,
-        in_channel_block=16,
-        scale_h=1.954545497894287,
-        scale_w=2.0,
-        layout="NCHW1n16c",
-        method="bilinear",
-    )
-
-    # bilinear - NHWC
-    verify_upsampling(2, 2, 32, 32, 2.0, 2.0, layout="NHWC", method="bilinear")
-    verify_upsampling(2, 2, 32, 32, 3.0, 3.0, layout="NHWC", method="bilinear")
-    verify_upsampling(1, 64, 22, 32, 3.0, 3.0, layout="NHWC", method="bilinear")
-
-
-def verify_upsampling3d(
-    batch,
-    in_channel,
-    in_depth,
-    in_height,
-    in_width,
-    scale_d,
-    scale_h,
-    scale_w,
-    layout="NCDHW",
-    method="nearest_neighbor",
-):
-    if layout == "NCDHW":
-        A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name="A")
-        dtype = A.dtype
-        out_shape = (
-            batch,
-            in_channel,
-            int(round(in_depth * scale_d)),
-            int(round(in_height * scale_h)),
-            int(round(in_width * scale_w)),
-        )
-        a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height, in_width)).astype(
-            dtype
-        )
-    elif layout == "NDHWC":
-        A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name="A")
-        dtype = A.dtype
-        out_shape = (
-            batch,
-            int(round(in_depth * scale_d)),
-            int(round(in_height * scale_h)),
-            int(round(in_width * scale_w)),
-            in_channel,
-        )
-        a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width, in_channel)).astype(
-            dtype
-        )
-    else:
-        raise NotImplementedError("Layout not supported {} ".format(layout))
-
-    B = topi.nn.upsampling3d(
-        A,
-        scale_d,
-        scale_h,
-        scale_w,
-        layout=layout,
-        method=method,
-        coordinate_transformation_mode="asymmetric",
-    )
-
-    b_np = tvm.topi.testing.resize3d_python(
-        a_np,
-        (scale_d, scale_h, scale_w),
-        layout,
-        method[3:] if method[0:3] == "tri" else method,
-        "asymmetric",
-    )
-
-    def check_target(target, dev):
-        print("Running on target: %s" % target)
-        with tvm.target.Target(target):
-            s = tvm.topi.testing.get_injective_schedule(target)(B)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
-        f = tvm.build(s, [A, B], target)
-        f(a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5)
-
-    for target, dev in tvm.testing.enabled_targets():
-        check_target(target, dev)
-
-
-@tvm.testing.uses_gpu
-def test_upsampling3d():
-    # nearest_neighbor - NCDHW
-    verify_upsampling3d(8, 8, 16, 16, 16, 2.0, 2.0, 2.0)
-    verify_upsampling3d(2, 16, 32, 32, 32, 3.0, 3.0, 3.0)
-    verify_upsampling3d(1, 8, 11, 16, 6, 1.954545497894287, 2.0, 1.5)
-
-    ## nearest_neighbor - NDHWC
-    verify_upsampling3d(8, 8, 16, 16, 16, 2.0, 2.0, 2.0, layout="NDHWC")
-    verify_upsampling3d(2, 16, 32, 32, 32, 3.0, 3.0, 3.0, layout="NDHWC")
-    verify_upsampling3d(1, 8, 11, 16, 6, 1.954545497894287, 2.0, 1.5, layout="NDHWC")
-
-    # trilinear - NCDHW
-    verify_upsampling3d(2, 2, 16, 16, 16, 2.0, 2.0, 2.0, method="trilinear")
-    verify_upsampling3d(2, 2, 32, 32, 32, 3.0, 3.0, 3.0, method="trilinear")
-    verify_upsampling3d(1, 2, 11, 16, 6, 1.954545497894287, 2.0, 1.5, method="trilinear")
-
-    # trilinear - NDHWC
-    verify_upsampling3d(2, 2, 16, 16, 16, 2.0, 2.0, 2.0, layout="NDHWC", method="trilinear")
-    verify_upsampling3d(2, 2, 32, 32, 32, 3.0, 3.0, 3.0, layout="NDHWC", method="trilinear")
-    verify_upsampling3d(
-        1, 2, 11, 16, 6, 1.954545497894287, 2.0, 1.5, layout="NDHWC", method="trilinear"
-    )
-
-
-if __name__ == "__main__":
-    test_upsampling()
-    test_upsampling3d()
-    test_int_div_upsampling()
diff --git a/tests/python/topi/test_topi_util.py b/tests/python/topi/test_topi_util.py
deleted file mode 100644
index bd7585e56302..000000000000
--- a/tests/python/topi/test_topi_util.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for util"""
-
-from tvm import topi
-
-
-def verify_get_shape(src_shape, src_layout, dst_layout, expect_shape):
-    dst_shape = topi.utils.get_shape(src_shape, src_layout, dst_layout)
-    assert dst_shape == expect_shape, "Shape mismatch: expecting %s but got %s" % (
-        expect_shape,
-        dst_shape,
-    )
-
-
-def test_get_shape():
-    verify_get_shape((1, 3, 224, 224), "NCHW", "NCHW", (1, 3, 224, 224))
-    verify_get_shape((1, 3, 224, 224), "NCHW", "NHWC", (1, 224, 224, 3))
-    verify_get_shape((3, 2, 32, 48, 16), "NCHW16c", "NC16cWH", (3, 2, 16, 48, 32))
-    verify_get_shape((2, 3, 32, 32, 16, 8), "OIHW16i8o", "HWO8oI16i", (32, 32, 2, 8, 3, 16))
-
-
-if __name__ == "__main__":
-    test_get_shape()
diff --git a/tests/python/topi/test_topi_vision.py b/tests/python/topi/test_topi_vision.py
deleted file mode 100644
index 86594ab1241b..000000000000
--- a/tests/python/topi/test_topi_vision.py
+++ /dev/null
@@ -1,736 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test code for vision package"""
-import math
-import sys
-
-import numpy as np
-import pytest
-
-import tvm
-import tvm.testing
-import tvm.topi.testing
-
-from tvm import te, topi
-from tvm.topi.vision import ssd, non_max_suppression, get_valid_counts
-
-_get_valid_counts_implement = {
-    "generic": (topi.vision.get_valid_counts, topi.generic.schedule_get_valid_counts),
-    "gpu": (topi.cuda.get_valid_counts, topi.cuda.schedule_get_valid_counts),
-}
-
-_nms_implement = {
-    "generic": (topi.vision.non_max_suppression, topi.generic.schedule_nms),
-    "gpu": (topi.cuda.non_max_suppression, topi.cuda.schedule_nms),
-}
-
-_multibox_prior_implement = {
-    "generic": (topi.vision.ssd.multibox_prior, topi.generic.schedule_multibox_prior),
-    "gpu": (topi.cuda.multibox_prior, topi.cuda.schedule_multibox_prior),
-}
-
-_multibox_detection_implement = {
-    "generic": (topi.vision.ssd.multibox_detection, topi.generic.schedule_multibox_detection),
-    "gpu": (topi.cuda.multibox_detection, topi.cuda.schedule_multibox_detection),
-}
-
-_roi_align_implement = {
-    "generic": (topi.vision.roi_align_nchw, topi.generic.schedule_roi_align),
-    "cpu": (topi.x86.roi_align_nchw, topi.generic.schedule_roi_align),
-    "gpu": (topi.vision.roi_align_nchw, topi.cuda.schedule_roi_align),
-}
-
-_roi_pool_schedule = {
-    "generic": topi.generic.schedule_roi_pool,
-    "gpu": topi.cuda.schedule_roi_pool,
-}
-
-_proposal_implement = {
-    "generic": (topi.vision.rcnn.proposal, topi.generic.schedule_proposal),
-    "gpu": (topi.cuda.proposal, topi.cuda.schedule_proposal),
-}
-
-_all_class_nms_implement = {
-    "generic": (topi.vision.all_class_non_max_suppression, topi.generic.schedule_nms),
-    "gpu": (topi.cuda.all_class_non_max_suppression, topi.cuda.schedule_nms),
-}
-
-
-class TestValidCounts:
-    dshape, score_threshold, id_index, score_index = tvm.testing.parameters(
-        ((1, 1000, 5), 0.5, -1, 0),
-        ((1, 2500, 6), 0, 0, 1),
-        ((1, 2500, 5), -1, -1, 0),
-        ((3, 1000, 6), 0.55, 1, 0),
-        ((16, 500, 5), 0.95, -1, 1),
-    )
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(self, dtype, dshape, score_threshold, id_index, score_index):
-        batch_size, num_anchor, elem_length = dshape
-        np_data = np.random.uniform(low=-2, high=2, size=dshape).astype(dtype)
-        np_out1 = np.zeros(shape=(batch_size,))
-        np_out2 = np.zeros(shape=dshape).astype(dtype)
-        np_out3 = np.zeros(shape=(batch_size, num_anchor))
-        for i in range(batch_size):
-            np_out1[i] = 0
-            inter_idx = 0
-            for j in range(num_anchor):
-                score = np_data[i, j, score_index]
-                if score > score_threshold and (id_index < 0 or np_data[i, j, id_index] >= 0):
-                    for k in range(elem_length):
-                        np_out2[i, inter_idx, k] = np_data[i, j, k]
-                    np_out1[i] += 1
-                    np_out3[i, inter_idx] = j
-                    inter_idx += 1
-                if j >= np_out1[i]:
-                    for k in range(elem_length):
-                        np_out2[i, j, k] = -1.0
-                    np_out3[i, j] = -1
-
-        return np_data, np_out1, np_out2, np_out3
-
-    def test_get_valid_counts(
-        self, target, dev, ref_data, dtype, dshape, score_threshold, id_index, score_index
-    ):
-        np_data, np_out1, np_out2, np_out3 = ref_data
-
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _get_valid_counts_implement)
-            data = te.placeholder(dshape, name="data", dtype=dtype)
-            outs = fcompute(data, score_threshold, id_index, score_index)
-            s = fschedule(outs)
-
-        tvm_input_data = tvm.nd.array(np_data, dev)
-        tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), dev)
-        tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), dev)
-        tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), dev)
-
-        f = tvm.build(s, [data, outs[0], outs[1], outs[2]], target)
-        f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3)
-        tvm.testing.assert_allclose(tvm_out1.numpy(), np_out1, rtol=1e-3)
-        tvm.testing.assert_allclose(tvm_out2.numpy(), np_out2, rtol=1e-3)
-        tvm.testing.assert_allclose(tvm_out3.numpy(), np_out3, rtol=1e-3)
-
-
-def verify_non_max_suppression(
-    target,
-    dev,
-    np_data,
-    np_valid_count,
-    np_indices,
-    np_result,
-    np_indices_result,
-    max_output_size,
-    iou_threshold,
-    force_suppress,
-    top_k,
-    coord_start,
-    score_index,
-    id_index,
-):
-    dshape = np_data.shape
-    batch, num_anchors, _ = dshape
-    indices_dshape = (batch, num_anchors)
-    data = te.placeholder(dshape, name="data")
-    valid_count = te.placeholder((batch,), dtype="int32", name="valid_count")
-    indices = te.placeholder((batch, num_anchors), dtype="int32", name="indices")
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _nms_implement)
-        out = fcompute(
-            data,
-            valid_count,
-            indices,
-            max_output_size,
-            iou_threshold,
-            force_suppress,
-            top_k,
-            coord_start=coord_start,
-            score_index=score_index,
-            id_index=id_index,
-            return_indices=False,
-        )
-        indices_out = fcompute(
-            data,
-            valid_count,
-            indices,
-            max_output_size,
-            iou_threshold,
-            force_suppress,
-            top_k,
-            coord_start=coord_start,
-            score_index=score_index,
-            id_index=id_index,
-            return_indices=True,
-        )
-        s = fschedule(out)
-        indices_s = fschedule(indices_out)
-
-    tvm_data = tvm.nd.array(np_data, dev)
-    tvm_valid_count = tvm.nd.array(np_valid_count, dev)
-    tvm_indices = tvm.nd.array(np_indices, dev)
-
-    tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
-    f = tvm.build(s, [data, valid_count, indices, out], target)
-    f(tvm_data, tvm_valid_count, tvm_indices, tvm_out)
-    tvm.testing.assert_allclose(tvm_out.numpy(), np_result, rtol=1e-4)
-
-    tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), dev)
-    f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], target)
-    f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
-    tvm.testing.assert_allclose(tvm_indices_out.numpy(), np_indices_result, rtol=1e-4)
-
-
-def test_non_max_suppression(target, dev):
-    np_data = np.array(
-        [
-            [
-                [0, 0.8, 1, 20, 25, 45],
-                [1, 0.7, 30, 60, 50, 80],
-                [0, 0.4, 4, 21, 19, 40],
-                [2, 0.9, 35, 61, 52, 79],
-                [1, 0.5, 100, 60, 70, 110],
-            ]
-        ]
-    ).astype("float32")
-    np_valid_count = np.array([4]).astype("int32")
-    np_indices = np.array([[0, 1, 2, 3, 4]]).astype("int32")
-    max_output_size = -1
-    np_result = np.array(
-        [
-            [
-                [2, 0.9, 35, 61, 52, 79],
-                [0, 0.8, 1, 20, 25, 45],
-                [-1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1, -1],
-            ]
-        ]
-    )
-    np_indices_result = np.array([[3, 0, -1, -1, -1]])
-
-    verify_non_max_suppression(
-        target,
-        dev,
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_result,
-        np_indices_result,
-        max_output_size,
-        0.7,
-        True,
-        2,
-        2,
-        1,
-        0,
-    )
-
-    np_data = np.array(
-        [
-            [
-                [0.8, 1, 20, 25, 45],
-                [0.7, 30, 60, 50, 80],
-                [0.4, 4, 21, 19, 40],
-                [0.9, 35, 61, 52, 79],
-                [0.5, 100, 60, 70, 110],
-            ]
-        ]
-    ).astype("float32")
-    np_valid_count = np.array([4]).astype("int32")
-    np_indices = np.array([[0, 1, 2, 3, 4]]).astype("int32")
-    max_output_size = 2
-    np_result = np.array(
-        [
-            [
-                [0.9, 35, 61, 52, 79],
-                [0.8, 1, 20, 25, 45],
-                [-1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1],
-                [-1, -1, -1, -1, -1],
-            ]
-        ]
-    )
-    np_indices_result = np.array([[3, 0, -1, -1, -1]])
-    verify_non_max_suppression(
-        target,
-        dev,
-        np_data,
-        np_valid_count,
-        np_indices,
-        np_result,
-        np_indices_result,
-        max_output_size,
-        0.7,
-        False,
-        2,
-        1,
-        0,
-        -1,
-    )
-
-
-class TestMultiboxPrior:
-    dshape, sizes, ratios, steps, offsets, clip = tvm.testing.parameters(
-        ((1, 3, 50, 50), (1,), (1,), (-1, -1), (0.5, 0.5), False),
-        ((1, 3, 224, 224), (0.5, 0.25, 0.1), (1, 2, 0.5), (-1, -1), (0.5, 0.5), False),
-        ((1, 32, 32, 32), (0.5, 0.25), (1, 2), (2, 2), (0.5, 0.5), True),
-    )
-
-    dtype = tvm.testing.parameter("float32")
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(self, dtype, dshape, sizes, ratios, offsets, steps, clip):
-        in_height = dshape[2]
-        in_width = dshape[3]
-        num_sizes = len(sizes)
-        num_ratios = len(ratios)
-        size_ratio_concat = sizes + ratios
-        steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
-        steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
-        offset_h = offsets[0]
-        offset_w = offsets[1]
-
-        out_shape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4)
-
-        np_in = np.random.uniform(size=dshape).astype(dtype)
-        np_out = np.zeros(out_shape).astype(dtype)
-
-        for i in range(in_height):
-            center_h = (i + offset_h) * steps_h
-            for j in range(in_width):
-                center_w = (j + offset_w) * steps_w
-                for k in range(num_sizes + num_ratios - 1):
-                    w = (
-                        size_ratio_concat[k] * in_height / in_width / 2.0
-                        if k < num_sizes
-                        else size_ratio_concat[0]
-                        * in_height
-                        / in_width
-                        * math.sqrt(size_ratio_concat[k + 1])
-                        / 2.0
-                    )
-                    h = (
-                        size_ratio_concat[k] / 2.0
-                        if k < num_sizes
-                        else size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0
-                    )
-                    count = (
-                        i * in_width * (num_sizes + num_ratios - 1)
-                        + j * (num_sizes + num_ratios - 1)
-                        + k
-                    )
-                    np_out[0][count][0] = center_w - w
-                    np_out[0][count][1] = center_h - h
-                    np_out[0][count][2] = center_w + w
-                    np_out[0][count][3] = center_h + h
-        if clip:
-            np_out = np.clip(np_out, 0, 1)
-
-        return np_in, np_out
-
-    def test_multibox_prior(
-        self, target, dev, dtype, dshape, ref_data, sizes, ratios, steps, offsets, clip
-    ):
-        np_in, np_out = ref_data
-
-        data = te.placeholder(dshape, name="data", dtype=dtype)
-
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_prior_implement)
-        with tvm.target.Target(target):
-            out = fcompute(data, sizes, ratios, steps, offsets, clip)
-            s = fschedule(out)
-
-        tvm_input_data = tvm.nd.array(np_in, dev)
-        tvm_out = tvm.nd.array(np.zeros(np_out.shape, dtype=dtype), dev)
-        f = tvm.build(s, [data, out], target)
-        f(tvm_input_data, tvm_out)
-        tvm.testing.assert_allclose(tvm_out.numpy(), np_out, rtol=1e-3)
-
-
-class TestMultiboxDetection:
-    (batch_size,) = tvm.testing.parameters((1,), (6,))
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(
-        self,
-        batch_size,
-    ):
-        # Manually create test case
-        np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]] * batch_size)
-        np_loc_preds = np.array(
-            [[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4, -0.8]] * batch_size
-        )
-        np_anchors = np.array(
-            [[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2], [1.2, 1.2, 1.5, 1.5]]] * batch_size
-        )
-        expected_np_out = np.array(
-            [
-                [
-                    [1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
-                    [0, 0.44999999, 1, 1, 1, 1],
-                    [0, 0.30000001, 0, 0, 0.22903419, 0.20435292],
-                ]
-            ]
-            * batch_size
-        )
-        return np_cls_prob, np_loc_preds, np_anchors, expected_np_out
-
-    def test_multibox_detection(self, target, dev, ref_data):
-
-        np_cls_prob, np_loc_preds, np_anchors, expected_np_out = ref_data
-
-        batch_size = np_cls_prob.shape[0]
-        num_anchors = 3
-        num_classes = 3
-        cls_prob = te.placeholder((batch_size, num_anchors, num_classes), name="cls_prob")
-        loc_preds = te.placeholder((batch_size, num_anchors * 4), name="loc_preds")
-        anchors = te.placeholder((batch_size, num_anchors, 4), name="anchors")
-
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_detection_implement)
-        with tvm.target.Target(target):
-            out = fcompute(cls_prob, loc_preds, anchors)
-            s = fschedule(out)
-
-        tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), dev)
-        tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), dev)
-        tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), dev)
-        tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), dev)
-        f = tvm.build(s, [cls_prob, loc_preds, anchors, out], target)
-        f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out)
-        tvm.testing.assert_allclose(tvm_out.numpy(), expected_np_out, rtol=1e-4)
-
-
-class TestRoiAlign:
-    (
-        batch,
-        in_channel,
-        in_size,
-        num_roi,
-        pooled_size,
-        spatial_scale,
-        sample_ratio,
-        mode,
-    ) = tvm.testing.parameters(
-        (1, 16, 32, 64, 7, 1.0, -1, 0),
-        (4, 16, 32, 64, 7, 0.5, 2, 0),
-        (1, 32, 32, 80, 8, 0.0625, 2, 0),
-        (1, 32, 500, 80, 8, 0.0625, 2, 0),
-        (1, 16, 32, 64, 7, 1.0, -1, 1),
-        (4, 16, 32, 64, 7, 0.5, 2, 1),
-        (1, 32, 32, 80, 8, 0.0625, 2, 1),
-        (1, 32, 500, 80, 8, 0.0625, 2, 1),
-    )
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(
-        self,
-        batch,
-        in_channel,
-        in_size,
-        num_roi,
-        pooled_size,
-        spatial_scale,
-        sample_ratio,
-        mode,
-    ):
-        a_shape = (batch, in_channel, in_size, in_size)
-        rois_shape = (num_roi, 5)
-
-        a_np = np.random.uniform(-1, 1, size=a_shape).astype("float32")
-        rois_np = np.random.uniform(-1, 1, size=rois_shape).astype("float32") * in_size
-        rois_np[:, 0] = np.random.randint(low=0, high=batch, size=num_roi)
-        b_np = tvm.topi.testing.roi_align_nchw_python(
-            a_np,
-            rois_np,
-            pooled_size=pooled_size,
-            spatial_scale=spatial_scale,
-            sample_ratio=sample_ratio,
-            mode=mode,
-        )
-
-        return a_np, rois_np, b_np
-
-    def test_roi_align(
-        self,
-        target,
-        dev,
-        ref_data,
-        pooled_size,
-        spatial_scale,
-        sample_ratio,
-        mode,
-    ):
-        # For mode, 0 = avg, 1 = max
-        a_np, rois_np, b_np = ref_data
-
-        a = te.placeholder(a_np.shape)
-        rois = te.placeholder(rois_np.shape)
-
-        with tvm.target.Target(target):
-            fcompute, fschedule = tvm.topi.testing.dispatch(target, _roi_align_implement)
-            b = fcompute(
-                a,
-                rois,
-                pooled_size=pooled_size,
-                spatial_scale=spatial_scale,
-                sample_ratio=sample_ratio,
-                mode=mode,
-            )
-            s = fschedule(b)
-
-        tvm_a = tvm.nd.array(a_np, dev)
-        tvm_rois = tvm.nd.array(rois_np, dev)
-        tvm_b = tvm.nd.array(np.zeros(b_np.shape, dtype=b.dtype), device=dev)
-        f = tvm.build(s, [a, rois, b], target)
-        f(tvm_a, tvm_rois, tvm_b)
-        tvm_val = tvm_b.numpy()
-        tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4)
-
-
-class TestRoiPool:
-    batch, in_channel, in_size, num_roi, pooled_size, spatial_scale = tvm.testing.parameters(
-        (1, 4, 16, 32, 7, 1.0),
-        (4, 4, 16, 32, 7, 0.5),
-    )
-
-    @tvm.testing.fixture(cache_return_value=True)
-    def ref_data(self, batch, in_channel, in_size, num_roi, pooled_size, spatial_scale):
-        a_shape = (batch, in_channel, in_size, in_size)
-        rois_shape = (num_roi, 5)
-
-        a_np = np.random.uniform(size=a_shape).astype("float32")
-        rois_np = np.random.uniform(size=rois_shape).astype("float32") * in_size
-        rois_np[:, 0] = np.random.randint(low=0, high=batch, size=num_roi).astype("float32")
-
-        b_np = tvm.topi.testing.roi_pool_nchw_python(
-            a_np, rois_np, pooled_size=pooled_size, spatial_scale=spatial_scale
-        )
-        return a_np, rois_np, b_np
-
-    def test_roi_pool(self, target, dev, ref_data, pooled_size, spatial_scale):
-        a_np, rois_np, b_np = ref_data
-
-        a = te.placeholder(a_np.shape)
-        rois = te.placeholder(rois_np.shape)
-
-        with tvm.target.Target(target):
-            b = topi.vision.rcnn.roi_pool_nchw(
-                a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale
-            )
-            s_func = tvm.topi.testing.dispatch(target, _roi_pool_schedule)
-            s = s_func(b)
-
-        tvm_a = tvm.nd.array(a_np, dev)
-        tvm_rois = tvm.nd.array(rois_np, dev)
-        tvm_b = tvm.nd.array(np.zeros(b_np.shape, dtype=b.dtype), device=dev)
-        f = tvm.build(s, [a, rois, b], target)
-        f(tvm_a, tvm_rois, tvm_b)
-        tvm.testing.assert_allclose(tvm_b.numpy(), b_np, rtol=1e-4)
-
-
-def verify_proposal(target, dev, np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs):
-    cls_prob = te.placeholder(np_cls_prob.shape)
-    bbox_pred = te.placeholder(np_bbox_pred.shape)
-    im_info = te.placeholder(np_im_info.shape)
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _proposal_implement)
-        out = fcompute(cls_prob, bbox_pred, im_info, **attrs)
-        s = fschedule(out)
-        f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], target)
-        tvm_cls_prob = tvm.nd.array(np_cls_prob, device=dev)
-        tvm_bbox_pred = tvm.nd.array(np_bbox_pred, device=dev)
-        tvm_im_info = tvm.nd.array(np_im_info, device=dev)
-        tvm_out = tvm.nd.empty(device=dev, shape=out.shape, dtype=out.dtype)
-        f(tvm_cls_prob, tvm_bbox_pred, tvm_im_info, tvm_out)
-        tvm.testing.assert_allclose(tvm_out.numpy(), np_out, rtol=1e-4)
-
-
-@tvm.testing.known_failing_targets("vulkan")
-def test_proposal(target, dev):
-    attrs = {
-        "scales": (0.5,),
-        "ratios": (0.5,),
-        "feature_stride": 16,
-        "iou_loss": False,
-        "rpn_min_size": 16,
-        "threshold": 0.7,
-        "rpn_pre_nms_top_n": 200,
-        "rpn_post_nms_top_n": 4,
-    }
-    np_cls_prob = np.array(
-        [
-            [
-                [[0.3, 0.6, 0.2], [0.4, 0.7, 0.5], [0.1, 0.4, 0.3]],
-                [[0.7, 0.5, 0.3], [0.6, 0.4, 0.8], [0.9, 0.2, 0.5]],
-            ]
-        ],
-        dtype="float32",
-    )
-    np_bbox_pred = np.array(
-        [
-            [
-                [[0.5, 1.0, 0.6], [0.8, 1.2, 2.0], [0.9, 1.0, 0.8]],
-                [[0.5, 1.0, 0.7], [0.8, 1.2, 1.6], [2.1, 1.5, 0.7]],
-                [[1.0, 0.5, 0.7], [1.5, 0.9, 1.6], [1.4, 1.5, 0.8]],
-                [[1.0, 0.5, 0.6], [1.5, 0.9, 2.0], [1.8, 1.0, 0.9]],
-            ]
-        ],
-        dtype="float32",
-    )
-    np_im_info = np.array([[48.0, 48.0, 1.0]], dtype="float32")
-    np_out = np.array(
-        [
-            [0.0, 0.0, 2.8451548, 28.38012, 18.154846],
-            [0.0, 0.0, 15.354933, 41.96971, 41.245064],
-            [0.0, 18.019852, 1.0538368, 51.98015, 25.946163],
-            [0.0, 27.320923, -1.266357, 55.0, 24.666357],
-        ],
-        dtype="float32",
-    )
-
-    verify_proposal(target, dev, np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs)
-
-    np_out = np.array(
-        [
-            [0.0, -5.25, -2.5, 21.75, 19.0],
-            [0.0, 11.25, -2.0, 37.25, 18.5],
-            [0.0, 26.849998, -2.3000002, 53.45, 18.6],
-            [0.0, -4.95, 13.799999, 22.25, 35.5],
-        ],
-        dtype="float32",
-    )
-
-    attrs["iou_loss"] = True
-    verify_proposal(target, dev, np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs)
-
-
-def verify_all_class_non_max_suppression(
-    target,
-    dev,
-    boxes_np,
-    scores_np,
-    max_output_boxes_per_class,
-    iou_threshold,
-    score_threshold,
-    expected_indices,
-):
-    dshape = boxes_np.shape
-    batch, num_boxes, _ = dshape
-    _, num_class, _ = scores_np.shape
-    boxes = te.placeholder(dshape, name="boxes")
-    scores = te.placeholder(scores_np.shape, dtype="float32", name="scores")
-
-    with tvm.target.Target(target):
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, _all_class_nms_implement)
-        out = fcompute(boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold)
-        s = fschedule(out)
-
-    tvm_boxes = tvm.nd.array(boxes_np, dev)
-    tvm_scores = tvm.nd.array(scores_np, dev)
-    selected_indices = tvm.nd.array(np.zeros((batch * num_class * num_boxes, 3), "int64"), dev)
-    num_detections = tvm.nd.array(np.zeros((1,), "int64"), dev)
-
-    f = tvm.build(s, [boxes, scores, out[0], out[1]], target)
-    f(tvm_boxes, tvm_scores, selected_indices, num_detections)
-
-    tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]]
-    np.testing.assert_equal(tvm_res, expected_indices)
-
-
-def test_all_class_non_max_suppression(target, dev):
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 0.3, 0.3],
-                [0.0, 0.0, 0.4, 0.4],
-                [0.0, 0.0, 0.5, 0.5],
-                [0.5, 0.5, 0.9, 0.9],
-                [0.5, 0.5, 1.0, 1.0],
-            ],
-            [
-                [0.0, 0.0, 0.3, 0.3],
-                [0.0, 0.0, 0.4, 0.4],
-                [0.5, 0.5, 0.95, 0.95],
-                [0.5, 0.5, 0.96, 0.96],
-                [0.5, 0.5, 1.0, 1.0],
-            ],
-        ]
-    ).astype("float32")
-
-    scores = np.array(
-        [
-            [[0.1, 0.2, 0.6, 0.3, 0.9], [0.1, 0.2, 0.6, 0.3, 0.9]],
-            [[0.1, 0.2, 0.6, 0.3, 0.9], [0.1, 0.2, 0.6, 0.3, 0.9]],
-        ]
-    ).astype("float32")
-
-    max_output_boxes_per_class = 2
-    iou_threshold = 0.8
-    score_threshold = 0.0
-
-    expected = np.array(
-        [[0, 0, 4], [0, 0, 2], [0, 1, 4], [0, 1, 2], [1, 0, 4], [1, 0, 1], [1, 1, 4], [1, 1, 1]]
-    )
-
-    verify_all_class_non_max_suppression(
-        target,
-        dev,
-        boxes,
-        scores,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        expected,
-    )
-
-    boxes = np.array(
-        [
-            [
-                [0.0, 0.0, 1.0, 1.0],
-                [0.0, 0.1, 1.0, 1.1],
-                [0.0, -0.1, 1.0, 0.9],
-                [0.0, 10.0, 1.0, 11.0],
-                [0.0, 10.1, 1.0, 11.1],
-                [0.0, 100.0, 1.0, 101.0],
-            ]
-        ]
-    ).astype(np.float32)
-    scores = np.array([[[0.9, 0.75, 0.6, 0.95, 0.5, 0.3]]]).astype(np.float32)
-    max_output_boxes_per_class = 3
-    iou_threshold = 0.5
-    score_threshold = 0.4
-
-    expected = np.array([[0, 0, 3], [0, 0, 0]])
-
-    verify_all_class_non_max_suppression(
-        target,
-        dev,
-        boxes,
-        scores,
-        max_output_boxes_per_class,
-        iou_threshold,
-        score_threshold,
-        expected,
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tvmscript/test_tvmscript_printer_highlight.py b/tests/python/tvmscript/test_tvmscript_printer_highlight.py
index 4c33b435f053..5d4173640b48 100644
--- a/tests/python/tvmscript/test_tvmscript_printer_highlight.py
+++ b/tests/python/tvmscript/test_tvmscript_printer_highlight.py
@@ -19,7 +19,6 @@
 
 import tvm
 import tvm.testing
-from tvm import relay
 from tvm.script import tir as T
 from tvm.script.highlight import cprint, _format
 
@@ -51,28 +50,5 @@ def main(  # type: ignore
     Module["main"].show(style="ansi")
 
 
-def test_cprint():
-    # Print string
-    cprint("a + 1")
-
-    # Print nodes with `script` method, e.g. PrimExpr
-    cprint(tvm.tir.Var("v", "int32") + 1)
-
-    # Cannot print non-Python-style codes when using the black
-    # formatter.  This error comes from `_format`, used internally by
-    # `cprint`, and doesn't occur when using the `ruff` formatter.
-    try:
-        import black
-
-        with pytest.raises(ValueError):
-            _format("if (a == 1) { a +=1; }", formatter="black")
-    except ImportError:
-        pass
-
-    # Cannot print unsupported nodes (nodes without `script` method)
-    with pytest.raises(TypeError):
-        cprint(relay.const(1))
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/tvmscript/test_tvmscript_printer_structural_equal.py b/tests/python/tvmscript/test_tvmscript_printer_structural_equal.py
index 5c587354cc3f..6f67733a2827 100644
--- a/tests/python/tvmscript/test_tvmscript_printer_structural_equal.py
+++ b/tests/python/tvmscript/test_tvmscript_printer_structural_equal.py
@@ -19,7 +19,6 @@
 
 import tvm
 from tvm.ir import assert_structural_equal
-from tvm.relay.op.transform import split
 from tvm.runtime import ObjectPath
 from tvm.script import ir as I, tir as T
 
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 344c103850d7..0bd97e4ee048 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -594,7 +594,6 @@ def add_subparser(
         help="Run GPU build and test(s)",
         options={
             "cpp": CPP_UNITTEST,
-            "topi": ("run topi tests", ["./tests/scripts/task_python_topi.sh"]),
             "unittest": (
                 "run unit tests",
                 [
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 6007b68f577e..9e195de9bc17 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -49,4 +49,5 @@ echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
-echo set\(USE_MSC ON\) >> config.cmake
+# Temporarily disable MSC
+# echo set\(USE_MSC ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index e3599695a969..e411ee2c5e87 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -47,5 +47,6 @@ echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
 echo set\(USE_CUTLASS ON\) >> config.cmake
-echo set\(USE_MSC ON\) >> config.cmake
+# Temporarily disable MSC
+# echo set\(USE_MSC ON\) >> config.cmake
 echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake
diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh
index ef13e7963851..a35b023ad0df 100755
--- a/tests/scripts/task_java_unittest.sh
+++ b/tests/scripts/task_java_unittest.sh
@@ -37,7 +37,6 @@ trap cleanup 0
 
 python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR"
 python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR"
-python3 "$SCRIPT_DIR"/test_graph_executor.py "$TEMP_DIR"
 
 # Skip the Java RPC Unittests, see https://github.com/apache/tvm/issues/13168
 # # start rpc proxy server
diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh
index a435d588aefb..684a63e77fae 100755
--- a/tests/scripts/task_python_adreno.sh
+++ b/tests/scripts/task_python_adreno.sh
@@ -60,17 +60,11 @@ find . -type f -path "*.pyc" | xargs rm -f
 # setup cython
 cd python; python3 setup.py build_ext --inplace; cd ..
 
+exit 0
+
 # The RPC to remote Android device has issue of hang after few tests with in CI environments.
 # Lets run them individually on fresh rpc session.
 # OpenCL texture test on Adreno
-TEXTURE_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder tests/python/relay/opencl_texture)
-i=0
-IFS=$'\n'
-for node_id in $TEXTURE_TESTS; do
-    echo "$node_id"
-    run_pytest "$TVM_INTEGRATION_TESTSUITE_NAME-opencl-texture-$i" "$node_id" --reruns=0
-    i=$((i+1))
-done
 
 # Adreno CLML test
 CLML_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder tests/python/contrib/test_clml)
diff --git a/tests/scripts/task_python_arm_compute_library.sh b/tests/scripts/task_python_arm_compute_library.sh
index 520abb58cdc2..1423fb198543 100755
--- a/tests/scripts/task_python_arm_compute_library.sh
+++ b/tests/scripts/task_python_arm_compute_library.sh
@@ -25,5 +25,3 @@ find . -type f -path "*.pyc" | xargs rm -f
 
 # setup cython
 cd python; python3 setup.py build_ext --inplace; cd ..
-
-run_pytest python-arm_compute_lib tests/python/contrib/test_arm_compute_lib
diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh
index 11e4e1827421..fd53007a37ce 100755
--- a/tests/scripts/task_python_hexagon.sh
+++ b/tests/scripts/task_python_hexagon.sh
@@ -30,6 +30,9 @@ source tests/scripts/setup-pytest-env.sh
 # setup cython
 cd python; python3 setup.py build_ext --inplace; cd ..
 
+# disable hexagon tests for now
+exit 0
+
 
 if [[ "${device_serial}" == "simulator" ]]; then
     export TVM_TRACKER_PORT=9190
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 3202839e50ed..326743394d2a 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -35,23 +35,3 @@ find . -type f -path "*.pyc" | xargs rm -f
 
 # setup cython
 cd python; python3 setup.py build_ext --inplace; cd ..
-
-run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-integration tests/python/integration
-
-# forked is needed because the global registry gets contaminated
-TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \
-    run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay --ignore=tests/python/relay/aot
-
-# OpenCL texture test. Deselected specific tests that fails  in CI
-TEXTURE_TESTS=$(ls tests/python/relay/opencl_texture/test_*)
-i=0
-for TEST in $TEXTURE_TESTS; do
-    TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \
-        run_pytest "${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture-$i" "$TEST"
-    i=$((i+1))
-done
-# Command line driver test
-run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
-
-# Target test
-run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-target tests/python/target
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index 97030d6948f9..9cb1fd476e6c 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -25,11 +25,3 @@ export TVM_TEST_TARGETS="llvm; cuda"
 # to avoid CI thread throttling.
 export TVM_BIND_THREADS=0
 export OMP_NUM_THREADS=1
-
-# setup cython
-cd python; python3 setup.py build_ext --inplace; cd ..
-
-# cleanup pycache
-find . -type f -path "*.pyc" | xargs rm -f
-
-run_pytest python-topi tests/python/topi/
diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh
index b84694cfd5e2..b074af314786 100755
--- a/tests/scripts/task_python_unittest.sh
+++ b/tests/scripts/task_python_unittest.sh
@@ -37,12 +37,9 @@ run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all
 # Then run all unittests on both ctypes and cython.
 TEST_FILES=(
   "arith"
-  "auto_scheduler"
-  "autotvm"
   "codegen"
   "ir"
   "meta_schedule"
-  "micro"
   "runtime"
   "te"
   "testing"
@@ -54,6 +51,7 @@ TEST_FILES=(
   "tvmscript"
   "usmp"
   "ci"
+  "target"
 )
 
 for TEST_FILE in ${TEST_FILES[@]}; do
diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh
index f31c703abd5a..442b9d771e84 100755
--- a/tests/scripts/task_rust.sh
+++ b/tests/scripts/task_rust.sh
@@ -56,60 +56,3 @@ cargo test --features dynamic-linking --tests
 cd $RUST_DIR/tvm-rt
 # Build and run the tests.
 cargo test
-
-# Next we test the graph executor crate.
-cd $RUST_DIR/tvm-graph-rt
-
-# We first we compile a model using the Python bindings then run the tests.
-python3 tests/build_model.py
-cargo test --tests
-
-# Run some more tests involving the graph executor API.
-cd tests/test_tvm_basic
-cargo run
-cd -
-
-cd tests/test_tvm_dso
-cargo run
-cd -
-
-# run wasm32 test
-# cd tests/test_wasm32
-# cargo build
-# wasmtime $RUST_DIR/target/wasm32-wasi/debug/test-wasm32.wasm
-# cd -
-
-# Disabled, see https://github.com/apache/tvm/issues/11419
-# # run nn graph test
-# cd tests/test_nn
-# cargo run
-# cd -
-
-# Finally we test the TVM crate which provides both runtime
-# and compiler bindings.
-cd $RUST_DIR/tvm
-
-cargo test
-
-# run basic tests on cpu
-cd tests/basics
-cargo run --features cpu
-# uncomment when have more CI resources
-# cargo build --features gpu
-# cargo run --features gpu
-# fi
-cd -
-
-# TODO(@jroesch): I believe this is no longer true, refactor in follow up PR.
-# run callback tests separately: https://discuss.tvm.ai/t/are-global-functions-need-to-be-accessed-in-separate-processes/1075
-cd tests/callback
-cargo build
-cargo run --bin int
-cargo run --bin float
-cargo run --bin array
-cargo run --bin string
-cd -
-
-cd examples/resnet
-cargo run
-cd -
diff --git a/tests/scripts/task_web_wasm.sh b/tests/scripts/task_web_wasm.sh
index 8a08c1ecb58d..91bbbac52300 100755
--- a/tests/scripts/task_web_wasm.sh
+++ b/tests/scripts/task_web_wasm.sh
@@ -25,8 +25,9 @@ cd web
 make clean
 npm install
 npm run lint
-npm run prepwasm
-npm run bundle
-npm run test
-npm run typedoc
+# TODO(@tqchen, @siyuan): re-enable the following tests
+# npm run prepwasm
+# npm run bundle
+# npm run test
+# npm run typedoc
 cd ..
diff --git a/tests/scripts/unity/task_python_relax.sh b/tests/scripts/unity/task_python_relax.sh
index 28dd78bf6bd2..688812b35d32 100755
--- a/tests/scripts/unity/task_python_relax.sh
+++ b/tests/scripts/unity/task_python_relax.sh
@@ -38,4 +38,4 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm}" pytest tests/python/dlight
 # python3 ./apps/relax_examples/resnet.py
 
 # Test for MSC
-pytest tests/python/contrib/test_msc
+# pytest tests/python/contrib/test_msc
diff --git a/web/tests/python/prepare_test_libs.py b/web/tests/python/prepare_test_libs.py
index ed40e1fcccc7..7198c163417e 100644
--- a/web/tests/python/prepare_test_libs.py
+++ b/web/tests/python/prepare_test_libs.py
@@ -16,13 +16,12 @@
 # under the License.
 # Prepare test library for standalone wasm runtime test.
 
+import os
 import tvm
 from tvm import te
 from tvm.contrib import tvmjs
-from tvm.relay.backend import Runtime
 from tvm import relax
 from tvm.script import relax as R
-import os
 
 
 def prepare_relax_lib(base_path):
@@ -44,16 +43,17 @@ def main(x: R.Tensor(["n"], "float32"), y: R.Tensor(["n"], "float32")):
 
 
 def prepare_tir_lib(base_path):
-    runtime = Runtime("cpp", {"system-lib": True})
     target = "llvm -mtriple=wasm32-unknown-unknown-wasm"
     if not tvm.runtime.enabled(target):
         raise RuntimeError("Target %s is not enbaled" % target)
     n = te.var("n")
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-    fadd = tvm.build(s, [A, B], target, runtime=runtime, name="add_one")
+    mod = tvm.IRModule.from_expr(
+        te.create_prim_func([A, B]).with_attr("global_symbol", "add_one")
+    ).with_attr("system_lib_prefix", "")
 
+    fadd = tvm.build(mod, target)
     wasm_path = os.path.join(base_path, "test_addone.wasm")
     fadd.export_library(wasm_path, fcompile=tvmjs.create_tvmjs_wasm)